diff --git a/CMakeLists.txt b/CMakeLists.txt index 27e16114..f5506c21 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,6 +36,7 @@ else() set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation") endif() + # Build options option(BUILD_TEST "Build tests (requires googletest)" OFF) option(DEPENDENCIES_FORCE_DOWNLOAD "Download dependencies and do not search for packages" OFF) diff --git a/benchmark/benchmark_block_adjacent_difference.cpp b/benchmark/benchmark_block_adjacent_difference.cpp index 63af7c7d..1cc70798 100644 --- a/benchmark/benchmark_block_adjacent_difference.cpp +++ b/benchmark/benchmark_block_adjacent_difference.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -32,16 +32,12 @@ const size_t DEFAULT_N = 1024 * 1024 * 128; #endif -template < - class Benchmark, - unsigned int BlockSize, - unsigned int ItemsPerThread, - bool WithTile, - typename... Args -> -__global__ -__launch_bounds__(BlockSize) -void kernel(Args ...args) +template +__global__ __launch_bounds__(BlockSize) void kernel(Args... 
args) { Benchmark::template run(args...); } @@ -49,8 +45,7 @@ void kernel(Args ...args) template struct minus { - HIPCUB_HOST_DEVICE inline - constexpr T operator()(const T& a, const T& b) const + HIPCUB_HOST_DEVICE inline constexpr T operator()(const T& a, const T& b) const { return a - b; } @@ -58,10 +53,10 @@ struct minus struct subtract_left { - template + template __device__ static void run(const T* d_input, T* d_output, unsigned int trials) { - const unsigned int lid = threadIdx.x; + const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; @@ -69,15 +64,14 @@ struct subtract_left hipcub::BlockAdjacentDifference adjacent_difference; - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < trials; trial++) { T output[ItemsPerThread]; if(WithTile) { adjacent_difference.SubtractLeft(input, output, minus{}, T(123)); - } - else + } else { adjacent_difference.SubtractLeft(input, output, minus{}); } @@ -86,7 +80,7 @@ struct subtract_left { input[i] += output[i]; } - + __syncthreads(); } @@ -96,10 +90,11 @@ struct subtract_left struct subtract_left_partial_tile { - template - __device__ static void run(const T* d_input, const int* tile_sizes, T* d_output, unsigned int trials) + template + __device__ static void + run(const T* d_input, const int* tile_sizes, T* d_output, unsigned int trials) { - const unsigned int lid = threadIdx.x; + const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; @@ -112,7 +107,7 @@ struct subtract_left_partial_tile // Try to evenly distribute the length of tile_sizes between all the trials const auto tile_size_diff = (BlockSize * ItemsPerThread) / trials + 1; - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < trials; trial++) { T output[ItemsPerThread]; @@ -124,8 +119,7 @@ struct subtract_left_partial_tile minus{}, tile_size, T(123)); - } - 
else + } else { adjacent_difference.SubtractLeftPartialTile(input, output, minus{}, tile_size); } @@ -134,7 +128,7 @@ struct subtract_left_partial_tile { input[i] += output[i]; } - + // Change the tile_size to even out the distribution tile_size = (tile_size + tile_size_diff) % (BlockSize * ItemsPerThread); __syncthreads(); @@ -146,10 +140,10 @@ struct subtract_left_partial_tile struct subtract_right { - template + template __device__ static void run(const T* d_input, T* d_output, unsigned int trials) { - const unsigned int lid = threadIdx.x; + const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; @@ -157,15 +151,14 @@ struct subtract_right hipcub::BlockAdjacentDifference adjacent_difference; - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < trials; trial++) { T output[ItemsPerThread]; if(WithTile) { adjacent_difference.SubtractRight(input, output, minus{}, T(123)); - } - else + } else { adjacent_difference.SubtractRight(input, output, minus{}); } @@ -174,7 +167,7 @@ struct subtract_right { input[i] += output[i]; } - + __syncthreads(); } @@ -184,10 +177,11 @@ struct subtract_right struct subtract_right_partial_tile { - template - __device__ static void run(const T* d_input, const int* tile_sizes, T* d_output, unsigned int trials) + template + __device__ static void + run(const T* d_input, const int* tile_sizes, T* d_output, unsigned int trials) { - const unsigned int lid = threadIdx.x; + const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; @@ -200,7 +194,7 @@ struct subtract_right_partial_tile // Try to evenly distribute the length of tile_sizes between all the trials const auto tile_size_diff = (BlockSize * ItemsPerThread) / trials + 1; - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < trials; trial++) { T output[ItemsPerThread]; @@ -211,7 +205,7 @@ 
struct subtract_right_partial_tile { input[i] += output[i]; } - + // Change the tile_size to even out the distribution tile_size = (tile_size + tile_size_diff) % (BlockSize * ItemsPerThread); __syncthreads(); @@ -221,49 +215,47 @@ struct subtract_right_partial_tile } }; -template +template auto run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) -> std::enable_if_t::value && !std::is_same::value> { constexpr auto items_per_block = BlockSize * ItemsPerThread; - const auto num_blocks = (N + items_per_block - 1) / items_per_block; + const auto num_blocks = (N + items_per_block - 1) / items_per_block; // Round up size to the next multiple of items_per_block const auto size = num_blocks * items_per_block; const std::vector input = benchmark_utils::get_random_data(size, T(0), T(10)); - T* d_input; - T* d_output; + T* d_input; + T* d_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(input[0]))); HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); HIP_CHECK( - hipMemcpy( - d_input, input.data(), - input.size() * sizeof(input[0]), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(d_input, input.data(), input.size() * sizeof(input[0]), hipMemcpyHostToDevice)); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel), - dim3(num_blocks), dim3(BlockSize), 0, stream, - d_input, d_output, Trials - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), + dim3(num_blocks), + dim3(BlockSize), + 0, + stream, + d_input, + d_output, + Trials); HIP_CHECK(hipGetLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); @@ -273,61 +265,57 @@ auto run_benchmark(benchmark::State& state, hipStream_t 
stream, size_t N) HIP_CHECK(hipFree(d_output)); } -template +template auto run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) -> std::enable_if_t::value || std::is_same::value> { constexpr auto items_per_block = BlockSize * ItemsPerThread; - const auto num_blocks = (N + items_per_block - 1) / items_per_block; + const auto num_blocks = (N + items_per_block - 1) / items_per_block; // Round up size to the next multiple of items_per_block const auto size = num_blocks * items_per_block; - const std::vector input = benchmark_utils::get_random_data(size, T(0), T(10)); + const std::vector input = benchmark_utils::get_random_data(size, T(0), T(10)); const std::vector tile_sizes = benchmark_utils::get_random_data(num_blocks, 0, items_per_block); - - T* d_input; + + T* d_input; int* d_tile_sizes; - T* d_output; + T* d_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(input[0]))); HIP_CHECK(hipMalloc(&d_tile_sizes, tile_sizes.size() * sizeof(tile_sizes[0]))); HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); HIP_CHECK( - hipMemcpy( - d_input, input.data(), - input.size() * sizeof(input[0]), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK( - hipMemcpy( - d_tile_sizes, tile_sizes.data(), - tile_sizes.size() * sizeof(tile_sizes[0]), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(d_input, input.data(), input.size() * sizeof(input[0]), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_tile_sizes, + tile_sizes.data(), + tile_sizes.size() * sizeof(tile_sizes[0]), + hipMemcpyHostToDevice)); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel), - dim3(num_blocks), dim3(BlockSize), 0, stream, - d_input, d_tile_sizes, d_output, Trials - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), + dim3(num_blocks), + dim3(BlockSize), + 0, + stream, + d_input, + d_tile_sizes, + d_output, + Trials); HIP_CHECK(hipGetLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = 
std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); @@ -338,51 +326,47 @@ auto run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \ -benchmark::RegisterBenchmark( \ - (std::string("block_adjacent_difference<" #T ", " #BS ">.") + name + ("<" #IPT ", " #WITH_TILE ">")).c_str(), \ - &run_benchmark, \ - stream, size \ -) - -#define BENCHMARK_TYPE(type, block, with_tile) \ - CREATE_BENCHMARK(type, block, 1, with_tile), \ - CREATE_BENCHMARK(type, block, 3, with_tile), \ - CREATE_BENCHMARK(type, block, 4, with_tile), \ - CREATE_BENCHMARK(type, block, 8, with_tile), \ - CREATE_BENCHMARK(type, block, 16, with_tile), \ - CREATE_BENCHMARK(type, block, 32, with_tile) +#define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \ + benchmark::RegisterBenchmark( \ + std::string("block_adjacent_difference.sub_algorithm_name:" \ + + name + "") \ + .c_str(), \ + &run_benchmark, \ + stream, \ + size) + +#define BENCHMARK_TYPE(type, block, with_tile) \ + CREATE_BENCHMARK(type, block, 1, with_tile), CREATE_BENCHMARK(type, block, 3, with_tile), \ + CREATE_BENCHMARK(type, block, 4, with_tile), CREATE_BENCHMARK(type, block, 8, with_tile), \ + CREATE_BENCHMARK(type, block, 16, with_tile), CREATE_BENCHMARK(type, block, 32, with_tile) template -void add_benchmarks(const std::string& name, +void add_benchmarks(const std::string& name, std::vector& benchmarks, - hipStream_t stream, - size_t size) + hipStream_t stream, + size_t size) { - std::vector bs = - { - BENCHMARK_TYPE(int, 256, false), - BENCHMARK_TYPE(float, 256, false), - BENCHMARK_TYPE(int8_t, 256, false), - BENCHMARK_TYPE(long long, 256, false), - BENCHMARK_TYPE(double, 256, false) - }; + 
std::vector bs = {BENCHMARK_TYPE(int, 256, false), + BENCHMARK_TYPE(float, 256, false), + BENCHMARK_TYPE(int8_t, 256, false), + BENCHMARK_TYPE(long long, 256, false), + BENCHMARK_TYPE(double, 256, false)}; if(!std::is_same::value) { - bs.insert(bs.end(), { - BENCHMARK_TYPE(int, 256, true), - BENCHMARK_TYPE(float, 256, true), - BENCHMARK_TYPE(int8_t, 256, true), - BENCHMARK_TYPE(long long, 256, true), - BENCHMARK_TYPE(double, 256, true) - }); + bs.insert(bs.end(), + {BENCHMARK_TYPE(int, 256, true), + BENCHMARK_TYPE(float, 256, true), + BENCHMARK_TYPE(int8_t, 256, true), + BENCHMARK_TYPE(long long, 256, true), + BENCHMARK_TYPE(double, 256, true)}); } benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -391,23 +375,28 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + + std::cout << "benchmark_block_adjacent_difference" << std::endl; std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; - add_benchmarks("SubtractLeft", benchmarks, stream, size); - add_benchmarks("SubtractRight", benchmarks, stream, size); - add_benchmarks("SubtractLeftPartialTile", benchmarks, stream, size); - add_benchmarks("SubtractRightPartialTile", benchmarks, stream, size); + add_benchmarks("subtract_left", benchmarks, stream, size); + add_benchmarks("subtract_right", benchmarks, stream, size); + 
add_benchmarks("subtract_left_partial_tile", benchmarks, stream, size); + add_benchmarks("subtract_right_partial_tile", + benchmarks, + stream, + size); // Use manual timing for(auto& b : benchmarks) diff --git a/benchmark/benchmark_block_discontinuity.cpp b/benchmark/benchmark_block_discontinuity.cpp index 72d925ec..24446c9a 100644 --- a/benchmark/benchmark_block_discontinuity.cpp +++ b/benchmark/benchmark_block_discontinuity.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -25,10 +25,9 @@ // HIP API #include "hipcub/block/block_discontinuity.hpp" -#include "hipcub/thread/thread_operators.hpp" //to use hipcub::Equality #include "hipcub/block/block_load.hpp" #include "hipcub/block/block_store.hpp" - +#include "hipcub/thread/thread_operators.hpp" //to use hipcub::Equality #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; @@ -44,49 +43,41 @@ struct custom_flag_op1 } }; -template< - class Runner, - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - bool WithTile, - unsigned int Trials -> -__global__ -__launch_bounds__(BlockSize) -void kernel(const T * d_input, T * d_output) +template +__global__ __launch_bounds__(BlockSize) void kernel(const T* d_input, T* d_output) { Runner::template run(d_input, d_output); } struct flag_heads { - template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - bool WithTile, - unsigned int Trials - > - __device__ - static void run(const T * d_input, T * d_output) 
+ template + __device__ static void run(const T* d_input, T* d_output) { - const unsigned int lid = hipThreadIdx_x; + const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; hipcub::LoadDirectStriped(lid, d_input + block_offset, input); - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockDiscontinuity bdiscontinuity; - bool head_flags[ItemsPerThread]; + bool head_flags[ItemsPerThread]; if(WithTile) { bdiscontinuity.FlagHeads(head_flags, input, hipcub::Equality(), T(123)); - } - else + } else { bdiscontinuity.FlagHeads(head_flags, input, hipcub::Equality()); } @@ -103,32 +94,28 @@ struct flag_heads struct flag_tails { - template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - bool WithTile, - unsigned int Trials - > - __device__ - static void run(const T * d_input, T * d_output) + template + __device__ static void run(const T* d_input, T* d_output) { - const unsigned int lid = hipThreadIdx_x; + const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; hipcub::LoadDirectStriped(lid, d_input + block_offset, input); - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockDiscontinuity bdiscontinuity; - bool tail_flags[ItemsPerThread]; + bool tail_flags[ItemsPerThread]; if(WithTile) { bdiscontinuity.FlagTails(tail_flags, input, hipcub::Equality(), T(123)); - } - else + } else { bdiscontinuity.FlagTails(tail_flags, input, hipcub::Equality()); } @@ -145,33 +132,34 @@ struct flag_tails struct flag_heads_and_tails { - template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - bool WithTile, - unsigned int Trials - > - __device__ - static void run(const T * d_input, T * d_output) + template + __device__ static void run(const T* d_input, T* d_output) { - const 
unsigned int lid = hipThreadIdx_x; + const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; hipcub::LoadDirectStriped(lid, d_input + block_offset, input); - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockDiscontinuity bdiscontinuity; - bool head_flags[ItemsPerThread]; - bool tail_flags[ItemsPerThread]; + bool head_flags[ItemsPerThread]; + bool tail_flags[ItemsPerThread]; if(WithTile) { - bdiscontinuity.FlagHeadsAndTails(head_flags, T(123), tail_flags, T(234), input, hipcub::Equality()); - } - else + bdiscontinuity.FlagHeadsAndTails(head_flags, + T(123), + tail_flags, + T(234), + input, + hipcub::Equality()); + } else { bdiscontinuity.FlagHeadsAndTails(head_flags, tail_flags, input, hipcub::Equality()); } @@ -187,31 +175,23 @@ struct flag_heads_and_tails } }; -template< - class Benchmark, - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - bool WithTile, - unsigned int Trials = 100 -> +template void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { constexpr auto items_per_block = BlockSize * ItemsPerThread; - const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); + const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); std::vector input = benchmark_utils::get_random_data(size, T(0), T(10)); - T * d_input; - T * d_output; + T* d_input; + T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) @@ -220,15 +200,18 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel), - 
dim3(size/items_per_block), dim3(BlockSize), 0, stream, - d_input, d_output - ); + dim3(size / items_per_block), + dim3(BlockSize), + 0, + stream, + d_input, + d_output); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); @@ -238,29 +221,27 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \ -benchmark::RegisterBenchmark( \ - (std::string("block_discontinuity.SubAlgorithm Name:") + name + ("")).c_str(), \ - &run_benchmark, \ - stream, size \ -) - -#define BENCHMARK_TYPE(type, block, bool) \ - CREATE_BENCHMARK(type, block, 1, bool), \ - CREATE_BENCHMARK(type, block, 2, bool), \ - CREATE_BENCHMARK(type, block, 3, bool), \ - CREATE_BENCHMARK(type, block, 4, bool), \ - CREATE_BENCHMARK(type, block, 8, bool) +#define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \ + benchmark::RegisterBenchmark( \ + std::string("block_discontinuity.sub_algorithm_name:" \ + + name + ".") \ + .c_str(), \ + &run_benchmark, \ + stream, \ + size) +#define BENCHMARK_TYPE(type, block, bool) \ + CREATE_BENCHMARK(type, block, 1, bool), CREATE_BENCHMARK(type, block, 2, bool), \ + CREATE_BENCHMARK(type, block, 3, bool), CREATE_BENCHMARK(type, block, 4, bool), \ + CREATE_BENCHMARK(type, block, 8, bool) template -void add_benchmarks(const std::string& name, +void add_benchmarks(const std::string& name, std::vector& benchmarks, - hipStream_t stream, - size_t size) + hipStream_t stream, + size_t size) { - std::vector bs = - { + std::vector bs = { BENCHMARK_TYPE(int, 256, false), BENCHMARK_TYPE(int, 256, true), BENCHMARK_TYPE(int8_t, 256, false), @@ -274,7 +255,7 @@ void 
add_benchmarks(const std::string& name, benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -283,15 +264,15 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); std::cout << "benchmark_block_discontinuity" << std::endl; // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; diff --git a/benchmark/benchmark_block_exchange.cpp b/benchmark/benchmark_block_exchange.cpp index 278a6190..a36d041a 100644 --- a/benchmark/benchmark_block_exchange.cpp +++ b/benchmark/benchmark_block_exchange.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -27,49 +27,41 @@ #include "hipcub/block/block_load.hpp" #include "hipcub/block/block_store.hpp" - #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -template< - class Runner, - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Trials -> -__global__ -__launch_bounds__(BlockSize) -void kernel(const T * d_input, const unsigned int * d_ranks, T * d_output) +template +__global__ __launch_bounds__(BlockSize) void kernel(const T* d_input, + const unsigned int* d_ranks, + T* d_output) { Runner::template run(d_input, d_ranks, d_output); } struct blocked_to_striped { - template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Trials - > - __device__ - static void run(const T * d_input, const unsigned int *, T * d_output) + template + __device__ static void run(const T* d_input, const unsigned int*, T* d_output) { - const unsigned int lid = hipThreadIdx_x; + const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; hipcub::LoadDirectBlocked(lid, d_input + block_offset, input); - - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockExchange exchange; exchange.BlockedToStriped(input, input); - __syncthreads(); // extra sync needed because of loop. In normal usage sync with be cared for by the load and store functions (outside the loop). + __syncthreads(); // extra sync needed because of loop. In normal usage + // sync with be cared for by the load and store functions + // (outside the loop). 
} hipcub::StoreDirectStriped(lid, d_output + block_offset, input); } @@ -77,27 +69,23 @@ struct blocked_to_striped struct striped_to_blocked { - template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Trials - > - __device__ - static void run(const T * d_input, const unsigned int *, T * d_output) + template + __device__ static void run(const T* d_input, const unsigned int*, T* d_output) { - const unsigned int lid = hipThreadIdx_x; + const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; hipcub::LoadDirectStriped(lid, d_input + block_offset, input); - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockExchange exchange; exchange.StripedToBlocked(input, input); - __syncthreads();// extra sync needed because of loop. In normal usage sync with be cared for by the load and store functions (outside the loop). + __syncthreads(); // extra sync needed because of loop. In normal usage + // sync with be cared for by the load and store functions + // (outside the loop). 
} hipcub::StoreDirectBlocked(lid, d_output + block_offset, input); } @@ -105,27 +93,23 @@ struct striped_to_blocked struct blocked_to_warp_striped { - template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Trials - > - __device__ - static void run(const T * d_input, const unsigned int *, T * d_output) + template + __device__ static void run(const T* d_input, const unsigned int*, T* d_output) { - const unsigned int lid = hipThreadIdx_x; + const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; hipcub::LoadDirectBlocked(lid, d_input + block_offset, input); - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockExchange exchange; exchange.BlockedToWarpStriped(input, input); - __syncthreads();// extra sync needed because of loop. In normal usage sync with be cared for by the load and store functions (outside the loop). + __syncthreads(); // extra sync needed because of loop. In normal usage + // sync with be cared for by the load and store functions + // (outside the loop). 
} hipcub::StoreDirectWarpStriped(lid, d_output + block_offset, input); } @@ -133,27 +117,23 @@ struct blocked_to_warp_striped struct warp_striped_to_blocked { - template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Trials - > - __device__ - static void run(const T * d_input, const unsigned int *, T * d_output) + template + __device__ static void run(const T* d_input, const unsigned int*, T* d_output) { - const unsigned int lid = hipThreadIdx_x; + const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; hipcub::LoadDirectWarpStriped(lid, d_input + block_offset, input); - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockExchange exchange; exchange.WarpStripedToBlocked(input, input); - __syncthreads(); // extra sync needed because of loop. In normal usage sync with be cared for by the load and store functions (outside the loop). + __syncthreads(); // extra sync needed because of loop. In normal usage + // sync with be cared for by the load and store functions + // (outside the loop). 
} hipcub::StoreDirectBlocked(lid, d_output + block_offset, input); } @@ -161,29 +141,25 @@ struct warp_striped_to_blocked struct scatter_to_blocked { - template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Trials - > - __device__ - static void run(const T * d_input, const unsigned int * d_ranks, T * d_output) + template + __device__ static void run(const T* d_input, const unsigned int* d_ranks, T* d_output) { - const unsigned int lid = hipThreadIdx_x; + const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; - T input[ItemsPerThread]; + T input[ItemsPerThread]; unsigned int ranks[ItemsPerThread]; hipcub::LoadDirectStriped(lid, d_input + block_offset, input); hipcub::LoadDirectStriped(lid, d_ranks + block_offset, ranks); - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockExchange exchange; exchange.ScatterToBlocked(input, input, ranks); - __syncthreads();// extra sync needed because of loop. In normal usage sync with be cared for by the load and store functions (outside the loop). + __syncthreads(); // extra sync needed because of loop. In normal usage + // sync with be cared for by the load and store functions + // (outside the loop). 
} hipcub::StoreDirectBlocked(lid, d_output + block_offset, input); } @@ -191,45 +167,39 @@ struct scatter_to_blocked struct scatter_to_striped { - template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Trials - > - __device__ - static void run(const T * d_input, const unsigned int * d_ranks, T * d_output) + template + __device__ static void run(const T* d_input, const unsigned int* d_ranks, T* d_output) { - const unsigned int lid = hipThreadIdx_x; + const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; - T input[ItemsPerThread]; + T input[ItemsPerThread]; unsigned int ranks[ItemsPerThread]; hipcub::LoadDirectStriped(lid, d_input + block_offset, input); hipcub::LoadDirectStriped(lid, d_ranks + block_offset, ranks); - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockExchange exchange; exchange.ScatterToStriped(input, input, ranks); - __syncthreads(); // extra sync needed because of loop. In normal usage sync with be cared for by the load and store functions (outside the loop). + __syncthreads(); // extra sync needed because of loop. In normal usage + // sync with be cared for by the load and store functions + // (outside the loop). 
} hipcub::StoreDirectStriped(lid, d_output + block_offset, input); } }; -template< - class Benchmark, - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Trials = 100 -> +template void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { constexpr auto items_per_block = BlockSize * ItemsPerThread; - const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); + const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); std::vector input(size); // Fill input @@ -246,43 +216,34 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) std::iota(block_ranks, block_ranks + items_per_block, 0); std::shuffle(block_ranks, block_ranks + items_per_block, gen); } - T * d_input; - unsigned int * d_ranks; - T * d_output; + T* d_input; + unsigned int* d_ranks; + T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_ranks, size * sizeof(unsigned int))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK( - hipMemcpy( - d_ranks, ranks.data(), - size * sizeof(unsigned int), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_ranks, ranks.data(), size * sizeof(unsigned int), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel), - dim3(size/items_per_block), dim3(BlockSize), 0, stream, - d_input, d_ranks, d_output - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), + dim3(size / items_per_block), + dim3(BlockSize), + 0, + stream, + d_input, + d_ranks, + d_output); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds 
= - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); @@ -293,32 +254,30 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(T, BS, IPT) \ -benchmark::RegisterBenchmark( \ - (std::string("block_exchange.SubAlgorithm Name:") + name).c_str(), \ - &run_benchmark, \ - stream, size \ -) - -#define BENCHMARK_TYPE(type, block) \ - CREATE_BENCHMARK(type, block, 1), \ - CREATE_BENCHMARK(type, block, 2), \ - CREATE_BENCHMARK(type, block, 3), \ - CREATE_BENCHMARK(type, block, 4), \ - CREATE_BENCHMARK(type, block, 7), \ - CREATE_BENCHMARK(type, block, 8) +#define CREATE_BENCHMARK(T, BS, IPT) \ + benchmark::RegisterBenchmark(std::string("block_exchange.sub_algorithm_name:" \ + + name) \ + .c_str(), \ + &run_benchmark, \ + stream, \ + size) + +#define BENCHMARK_TYPE(type, block) \ + CREATE_BENCHMARK(type, block, 1), CREATE_BENCHMARK(type, block, 2), \ + CREATE_BENCHMARK(type, block, 3), CREATE_BENCHMARK(type, block, 4), \ + CREATE_BENCHMARK(type, block, 7), CREATE_BENCHMARK(type, block, 8) template -void add_benchmarks(const std::string& name, +void add_benchmarks(const std::string& name, std::vector& benchmarks, - hipStream_t stream, - size_t size) + hipStream_t stream, + size_t size) { - using custom_float2 = benchmark_utils::custom_type; + using custom_float2 = benchmark_utils::custom_type; using custom_double2 = benchmark_utils::custom_type; - std::vector bs = - { + std::vector bs = { BENCHMARK_TYPE(int, 256), BENCHMARK_TYPE(int8_t, 256), BENCHMARK_TYPE(long long, 256), @@ -329,7 +288,7 @@ void add_benchmarks(const std::string& name, benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); 
parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -338,15 +297,15 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); std::cout << "benchmark_block_exchange" << std::endl; // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; diff --git a/benchmark/benchmark_block_histogram.cpp b/benchmark/benchmark_block_histogram.cpp index e247a13b..122ccc36 100644 --- a/benchmark/benchmark_block_histogram.cpp +++ b/benchmark/benchmark_block_histogram.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -25,22 +25,17 @@ // HIP API #include "hipcub/block/block_histogram.hpp" - #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; #endif -template< - class Runner, - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int BinSize, - unsigned int Trials -> -__global__ -__launch_bounds__(BlockSize) -void kernel(const T* input, T* output) +template +__global__ __launch_bounds__(BlockSize) void kernel(const T* input, T* output) { Runner::template run(input, output); } @@ -48,18 +43,15 @@ void kernel(const T* input, T* output) template struct histogram { - template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int BinSize, - unsigned int Trials - > - __device__ - static void run(const T* input, T* output) + template + __device__ static void run(const T* input, T* output) { const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; - unsigned int global_offset = hipBlockIdx_x * BinSize; + unsigned int global_offset = hipBlockIdx_x * BinSize; T values[ItemsPerThread]; for(unsigned int k = 0; k < ItemsPerThread; k++) @@ -67,18 +59,19 @@ struct histogram values[k] = input[index + k]; } - using bhistogram_t = hipcub::BlockHistogram; - __shared__ T histogram[BinSize]; + using bhistogram_t + = hipcub::BlockHistogram; + __shared__ T histogram[BinSize]; __shared__ typename bhistogram_t::TempStorage storage; - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { bhistogram_t(storage).Histogram(values, histogram); } - #pragma unroll - for (unsigned int offset = 0; offset < BinSize; offset += BlockSize) +#pragma unroll + for(unsigned int offset = 0; offset < BinSize; offset += BlockSize) { if(offset + hipThreadIdx_x < BinSize) { @@ -89,49 +82,44 @@ struct histogram } }; -template< - class 
Benchmark, - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int BinSize = BlockSize, - unsigned int Trials = 100 -> +template void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { // Make sure size is a multiple of BlockSize constexpr auto items_per_block = BlockSize * ItemsPerThread; - const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); - const auto bin_size = BinSize * ((N + items_per_block - 1)/items_per_block); + const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); + const auto bin_size = BinSize * ((N + items_per_block - 1) / items_per_block); // Allocate and fill memory std::vector input(size, 0.0f); - T * d_input; - T * d_output; + T* d_input; + T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, bin_size * sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel), - dim3(size/items_per_block), dim3(BlockSize), 0, stream, - d_input, d_output - ); + dim3(size / items_per_block), + dim3(BlockSize), + 0, + stream, + d_input, + d_output); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } @@ -143,41 +131,38 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) } // IPT - items per thread -#define CREATE_BENCHMARK(T, BS, IPT) \ - benchmark::RegisterBenchmark( \ - (std::string("block_histogram.Method Name:") + 
method_name).c_str(), \ - &run_benchmark, \ - stream, size \ - ) - -#define BENCHMARK_TYPE(type, block) \ - CREATE_BENCHMARK(type, block, 1), \ - CREATE_BENCHMARK(type, block, 2), \ - CREATE_BENCHMARK(type, block, 3), \ - CREATE_BENCHMARK(type, block, 4), \ - CREATE_BENCHMARK(type, block, 8), \ - CREATE_BENCHMARK(type, block, 16) +#define CREATE_BENCHMARK(T, BS, IPT) \ + benchmark::RegisterBenchmark(std::string("block_histogram.method_name:" + method_name) \ + .c_str(), \ + &run_benchmark, \ + stream, \ + size) + +#define BENCHMARK_TYPE(type, block) \ + CREATE_BENCHMARK(type, block, 1), CREATE_BENCHMARK(type, block, 2), \ + CREATE_BENCHMARK(type, block, 3), CREATE_BENCHMARK(type, block, 4), \ + CREATE_BENCHMARK(type, block, 8), CREATE_BENCHMARK(type, block, 16) template void add_benchmarks(std::vector& benchmarks, - const std::string& method_name, - const std::string& algorithm_name, - hipStream_t stream, - size_t size) + const std::string& method_name, + const std::string& algorithm_name, + hipStream_t stream, + size_t size) { - std::vector new_benchmarks = - { - BENCHMARK_TYPE(int, 256), - BENCHMARK_TYPE(int, 320), - BENCHMARK_TYPE(int, 512), + std::vector new_benchmarks + = {BENCHMARK_TYPE(int, 256), + BENCHMARK_TYPE(int, 320), + BENCHMARK_TYPE(int, 512), - BENCHMARK_TYPE(unsigned long long, 256), - BENCHMARK_TYPE(unsigned long long, 320) - }; + BENCHMARK_TYPE(unsigned long long, 256), + BENCHMARK_TYPE(unsigned long long, 320)}; benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end()); } -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -186,15 +171,15 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); std::cout << 
"benchmark_block_histogram" << std::endl; // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; @@ -203,14 +188,10 @@ int main(int argc, char *argv[]) std::vector benchmarks; // using_atomic using histogram_a_t = histogram; - add_benchmarks( - benchmarks, "histogram", "using_atomic", stream, size - ); + add_benchmarks(benchmarks, "histogram", "using_atomic", stream, size); // using_sort using histogram_s_t = histogram; - add_benchmarks( - benchmarks, "histogram", "using_sort", stream, size - ); + add_benchmarks(benchmarks, "histogram", "using_sort", stream, size); // Use manual timing for(auto& b : benchmarks) diff --git a/benchmark/benchmark_block_merge_sort.cpp b/benchmark/benchmark_block_merge_sort.cpp index 14407a62..62ffbdfa 100644 --- a/benchmark/benchmark_block_merge_sort.cpp +++ b/benchmark/benchmark_block_merge_sort.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -24,11 +24,10 @@ #include "../test/hipcub/test_utils_sort_comparator.hpp" // HIP API -#include "hipcub/block/block_merge_sort.hpp" #include "hipcub/block/block_load.hpp" +#include "hipcub/block/block_merge_sort.hpp" #include "hipcub/block/block_store.hpp" - #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; #endif @@ -39,24 +38,22 @@ enum class benchmark_kinds sort_pairs }; -template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - class CompareOp, - unsigned int Trials -> -__global__ -__launch_bounds__(BlockSize) -void sort_keys_kernel(const T * input, T * output, CompareOp compare_op) +template +__global__ __launch_bounds__(BlockSize) void sort_keys_kernel(const T* input, + T* output, + CompareOp compare_op) { - const unsigned int lid = hipThreadIdx_x; + const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T keys[ItemsPerThread]; hipcub::LoadDirectStriped(lid, input + block_offset, keys); - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockMergeSort sort; @@ -66,18 +63,16 @@ void sort_keys_kernel(const T * input, T * output, CompareOp compare_op) hipcub::StoreDirectStriped(lid, output + block_offset, keys); } -template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - class CompareOp, - unsigned int Trials -> -__global__ -__launch_bounds__(BlockSize) -void sort_pairs_kernel(const T * input, T * output, CompareOp compare_op) +template +__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel(const T* input, + T* output, + CompareOp compare_op) { - const unsigned int lid = hipThreadIdx_x; + const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T keys[ItemsPerThread]; 
@@ -89,7 +84,7 @@ void sort_pairs_kernel(const T * input, T * output, CompareOp compare_op) values[i] = keys[i] + T(1); } - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockMergeSort sort; @@ -101,45 +96,36 @@ void sort_pairs_kernel(const T * input, T * output, CompareOp compare_op) keys[i] += values[i]; } hipcub::StoreDirectStriped(lid, output + block_offset, keys); - } -template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - class CompareOp = test_utils::less, - unsigned int Trials = 10 -> -void run_benchmark(benchmark::State& state, benchmark_kinds benchmark_kind, hipStream_t stream, size_t N) +template +void run_benchmark(benchmark::State& state, + benchmark_kinds benchmark_kind, + hipStream_t stream, + size_t N) { constexpr auto items_per_block = BlockSize * ItemsPerThread; - const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); + const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); std::vector input; if(std::is_floating_point::value) { - input = benchmark_utils::get_random_data(size, (T)-1000, (T)+1000); - } - else + input = benchmark_utils::get_random_data(size, (T)-1000, (T) + 1000); + } else { - input = benchmark_utils::get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max() - ); + input = benchmark_utils::get_random_data(size, + std::numeric_limits::min(), + std::numeric_limits::max()); } - T * d_input; - T * d_output; + T* d_input; + T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) @@ -150,24 +136,31 @@ void run_benchmark(benchmark::State& state, benchmark_kinds benchmark_kind, hipS { 
hipLaunchKernelGGL( HIP_KERNEL_NAME(sort_keys_kernel), - dim3(size/items_per_block), dim3(BlockSize), 0, stream, - d_input, d_output, CompareOp() - ); - } - else if(benchmark_kind == benchmark_kinds::sort_pairs) + dim3(size / items_per_block), + dim3(BlockSize), + 0, + stream, + d_input, + d_output, + CompareOp()); + } else if(benchmark_kind == benchmark_kinds::sort_pairs) { hipLaunchKernelGGL( HIP_KERNEL_NAME(sort_pairs_kernel), - dim3(size/items_per_block), dim3(BlockSize), 0, stream, - d_input, d_output, CompareOp() - ); + dim3(size / items_per_block), + dim3(BlockSize), + 0, + stream, + d_input, + d_output, + CompareOp()); } HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); @@ -177,53 +170,51 @@ void run_benchmark(benchmark::State& state, benchmark_kinds benchmark_kind, hipS HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(T, BS, IPT) \ -benchmark::RegisterBenchmark( \ - (std::string("block_merge_sort.SubAlgorithm Name:") + name).c_str(), \ - &run_benchmark, \ - benchmark_kind, stream, size \ -) - -#define BENCHMARK_TYPE(type, block) \ - CREATE_BENCHMARK(type, block, 1), \ - CREATE_BENCHMARK(type, block, 2), \ - CREATE_BENCHMARK(type, block, 3), \ - CREATE_BENCHMARK(type, block, 4), \ - CREATE_BENCHMARK(type, block, 8) - -void add_benchmarks(benchmark_kinds benchmark_kind, - const std::string& name, +#define CREATE_BENCHMARK(T, BS, IPT) \ + benchmark::RegisterBenchmark(std::string("block_merge_sort.sub_algorithm_name:" \ + + name) \ + .c_str(), \ + &run_benchmark, \ + benchmark_kind, \ + stream, \ + size) + +#define BENCHMARK_TYPE(type, block) \ + CREATE_BENCHMARK(type, block, 1), CREATE_BENCHMARK(type, block, 2), \ + 
CREATE_BENCHMARK(type, block, 3), CREATE_BENCHMARK(type, block, 4), \ + CREATE_BENCHMARK(type, block, 8) + +void add_benchmarks(benchmark_kinds benchmark_kind, + const std::string& name, std::vector& benchmarks, - hipStream_t stream, - size_t size) + hipStream_t stream, + size_t size) { - std::vector bs = - { - BENCHMARK_TYPE(int, 64), - BENCHMARK_TYPE(int, 128), - BENCHMARK_TYPE(int, 256), - BENCHMARK_TYPE(int, 512), - - BENCHMARK_TYPE(int8_t, 64), - BENCHMARK_TYPE(int8_t, 128), - BENCHMARK_TYPE(int8_t, 256), - BENCHMARK_TYPE(int8_t, 512), - - BENCHMARK_TYPE(uint8_t, 64), - BENCHMARK_TYPE(uint8_t, 128), - BENCHMARK_TYPE(uint8_t, 256), - BENCHMARK_TYPE(uint8_t, 512), - - BENCHMARK_TYPE(long long, 64), - BENCHMARK_TYPE(long long, 128), - BENCHMARK_TYPE(long long, 256), - BENCHMARK_TYPE(long long, 512) - }; + std::vector bs = {BENCHMARK_TYPE(int, 64), + BENCHMARK_TYPE(int, 128), + BENCHMARK_TYPE(int, 256), + BENCHMARK_TYPE(int, 512), + + BENCHMARK_TYPE(int8_t, 64), + BENCHMARK_TYPE(int8_t, 128), + BENCHMARK_TYPE(int8_t, 256), + BENCHMARK_TYPE(int8_t, 512), + + BENCHMARK_TYPE(uint8_t, 64), + BENCHMARK_TYPE(uint8_t, 128), + BENCHMARK_TYPE(uint8_t, 256), + BENCHMARK_TYPE(uint8_t, 512), + + BENCHMARK_TYPE(long long, 64), + BENCHMARK_TYPE(long long, 128), + BENCHMARK_TYPE(long long, 256), + BENCHMARK_TYPE(long long, 512)}; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -232,15 +223,15 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); std::cout << "benchmark_block_merge_sort" << std::endl; // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t 
devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; diff --git a/benchmark/benchmark_block_radix_rank.cpp b/benchmark/benchmark_block_radix_rank.cpp index ffecb5aa..8578b75c 100644 --- a/benchmark/benchmark_block_radix_rank.cpp +++ b/benchmark/benchmark_block_radix_rank.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -115,8 +115,7 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) input = benchmark_utils::get_random_data(size, static_cast(-1000), static_cast(1000)); - } - else + } else { input = benchmark_utils::get_random_data(size, std::numeric_limits::min(), @@ -157,12 +156,14 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(T, KIND, BS, IPT) \ - benchmark::RegisterBenchmark( \ - (std::string("block_radix_rank<" #T ", " #KIND ", " #BS ", " #IPT ">.") + name).c_str(), \ - &run_benchmark, \ - stream, \ - size) +#define CREATE_BENCHMARK(T, KIND, BS, IPT) \ + benchmark::RegisterBenchmark(std::string("block_radix_rank." 
\ + + name) \ + .c_str(), \ + &run_benchmark, \ + stream, \ + size) // clang-format off #define CREATE_BENCHMARK_KINDS(type, block, ipt) \ @@ -218,6 +219,8 @@ int main(int argc, char* argv[]) int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + + std::cout << "benchmark_block_radix_rank" << std::endl; std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks diff --git a/benchmark/benchmark_block_radix_sort.cpp b/benchmark/benchmark_block_radix_sort.cpp index dbd13fea..7413214e 100644 --- a/benchmark/benchmark_block_radix_sort.cpp +++ b/benchmark/benchmark_block_radix_sort.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -23,11 +23,10 @@ #include "common_benchmark_header.hpp" // HIP API -#include "hipcub/block/block_radix_sort.hpp" #include "hipcub/block/block_load.hpp" +#include "hipcub/block/block_radix_sort.hpp" #include "hipcub/block/block_store.hpp" - #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; #endif @@ -135,7 +134,7 @@ __global__ __launch_bounds__(BlockSize) void sort_keys_kernel(const T* input, T* T keys[ItemsPerThread]; Helper::template load(lid, input + block_offset, keys); - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { Helper::template sort(keys); @@ -163,7 +162,7 @@ __global__ __launch_bounds__(BlockSize) void sort_pairs_kernel(const T* input, T values[i] = keys[i] + T(1); } - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { Helper::template sort(keys, values); @@ -188,32 +187,23 @@ void run_benchmark(benchmark::State& state, size_t N) { constexpr auto items_per_block = BlockSize * ItemsPerThread; - const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); + const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); std::vector input; if(std::is_floating_point::value) { - input = benchmark_utils::get_random_data(size, (T)-1000, (T)+1000); - } - else + input = benchmark_utils::get_random_data(size, (T)-1000, (T) + 1000); + } else { - input = benchmark_utils::get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max() - ); + input = benchmark_utils::get_random_data(size, + std::numeric_limits::min(), + std::numeric_limits::max()); } - T * d_input; - T * d_output; + T* d_input; + T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - 
size * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) @@ -224,8 +214,7 @@ void run_benchmark(benchmark::State& state, { sort_keys_kernel <<>>(d_input, d_output); - } - else if(benchmark_kind == benchmark_kinds::sort_pairs) + } else if(benchmark_kind == benchmark_kinds::sort_pairs) { sort_pairs_kernel <<>>(d_input, d_output); @@ -234,8 +223,8 @@ void run_benchmark(benchmark::State& state, HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); @@ -246,9 +235,9 @@ void run_benchmark(benchmark::State& state, } #define CREATE_BENCHMARK(T, BS, IPT) \ - benchmark::RegisterBenchmark((std::string("block_radix_sort.SubAlgorithm Name:") \ - + name) \ + benchmark::RegisterBenchmark(std::string("block_radix_sort.sub_algorithm_name:" \ + + name) \ .c_str(), \ &run_benchmark, \ benchmark_kind, \ @@ -293,7 +282,7 @@ void add_benchmarks(benchmark_kinds benchmark_kind benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -302,15 +291,15 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); std::cout << "benchmark_block_radix_sort" << std::endl; // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int 
device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; diff --git a/benchmark/benchmark_block_reduce.cpp b/benchmark/benchmark_block_reduce.cpp index a9a33909..bdb089e7 100644 --- a/benchmark/benchmark_block_reduce.cpp +++ b/benchmark/benchmark_block_reduce.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -26,21 +26,16 @@ #include "hipcub/block/block_reduce.hpp" #include "hipcub/thread/thread_operators.hpp" - #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -template< - class Runner, - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Trials -> -__global__ -__launch_bounds__(BlockSize) -void kernel(const T* input, T* output) +template +__global__ __launch_bounds__(BlockSize) void kernel(const T* input, T* output) { Runner::template run(input, output); } @@ -48,14 +43,8 @@ void kernel(const T* input, T* output) template struct reduce { - template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Trials - > - __device__ - static void run(const T* input, T* output) + template + __device__ static void run(const T* input, T* output) { const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; @@ -69,11 +58,11 @@ struct reduce using breduce_t = hipcub::BlockReduce; __shared__ typename breduce_t::TempStorage storage; - #pragma nounroll 
+#pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { reduced_value = breduce_t(storage).Reduce(values, hipcub::Sum()); - values[0] = reduced_value; + values[0] = reduced_value; } if(hipThreadIdx_x == 0) @@ -83,47 +72,41 @@ struct reduce } }; -template< - class Benchmark, - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Trials = 100 -> +template void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { // Make sure size is a multiple of BlockSize constexpr auto items_per_block = BlockSize * ItemsPerThread; - const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); + const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); // Allocate and fill memory std::vector input(size, T(1)); - T * d_input; - T * d_output; + T* d_input; + T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel), - dim3(size/items_per_block), dim3(BlockSize), 0, stream, - d_input, d_output - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), + dim3(size / items_per_block), + dim3(BlockSize), + 0, + stream, + d_input, + d_output); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } @@ -135,32 +118,30 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) } // IPT - items per 
thread -#define CREATE_BENCHMARK(T, BS, IPT) \ - benchmark::RegisterBenchmark( \ - (std::string("block_reduce.Method Name:") + method_name).c_str(), \ - &run_benchmark, \ - stream, size \ - ) - -#define BENCHMARK_TYPE(type, block) \ - CREATE_BENCHMARK(type, block, 1), \ - CREATE_BENCHMARK(type, block, 2), \ - CREATE_BENCHMARK(type, block, 3), \ - CREATE_BENCHMARK(type, block, 4), \ - CREATE_BENCHMARK(type, block, 8), \ - CREATE_BENCHMARK(type, block, 11), \ - CREATE_BENCHMARK(type, block, 16) +#define CREATE_BENCHMARK(T, BS, IPT) \ + benchmark::RegisterBenchmark(std::string("block_reduce.method_name:" + method_name) \ + .c_str(), \ + &run_benchmark, \ + stream, \ + size) + +#define BENCHMARK_TYPE(type, block) \ + CREATE_BENCHMARK(type, block, 1), CREATE_BENCHMARK(type, block, 2), \ + CREATE_BENCHMARK(type, block, 3), CREATE_BENCHMARK(type, block, 4), \ + CREATE_BENCHMARK(type, block, 8), CREATE_BENCHMARK(type, block, 11), \ + CREATE_BENCHMARK(type, block, 16) template void add_benchmarks(std::vector& benchmarks, - const std::string& method_name, - const std::string& algorithm_name, - hipStream_t stream, - size_t size) + const std::string& method_name, + const std::string& algorithm_name, + hipStream_t stream, + size_t size) { - std::vector new_benchmarks = - { + std::vector new_benchmarks = { // When block size is less than or equal to warp size BENCHMARK_TYPE(int, 64), BENCHMARK_TYPE(float, 64), @@ -177,7 +158,7 @@ void add_benchmarks(std::vector& benchmarks, benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end()); } -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -186,15 +167,15 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = 
parser.get("trials"); std::cout << "benchmark_block_reduce" << std::endl; // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; @@ -203,19 +184,22 @@ int main(int argc, char *argv[]) std::vector benchmarks; // using_warp_scan using reduce_uwr_t = reduce; - add_benchmarks( - benchmarks, "reduce", "BLOCK_REDUCE_WARP_REDUCTIONS", stream, size - ); + add_benchmarks(benchmarks, + "reduce", + "BLOCK_REDUCE_WARP_REDUCTIONS", + stream, + size); // raking reduce using reduce_rr_t = reduce; - add_benchmarks( - benchmarks, "reduce", "BLOCK_REDUCE_RAKING", stream, size - ); + add_benchmarks(benchmarks, "reduce", "BLOCK_REDUCE_RAKING", stream, size); // raking reduce commutative only - using reduce_rrco_t = reduce; - add_benchmarks( - benchmarks, "reduce", "BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY", stream, size - ); + using reduce_rrco_t + = reduce; + add_benchmarks(benchmarks, + "reduce", + "BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY", + stream, + size); // Use manual timing for(auto& b : benchmarks) diff --git a/benchmark/benchmark_block_run_length_decode.cpp b/benchmark/benchmark_block_run_length_decode.cpp index 8ef5def9..6769fd47 100644 --- a/benchmark/benchmark_block_run_length_decode.cpp +++ b/benchmark/benchmark_block_run_length_decode.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -30,57 +30,48 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -template< - class ItemT, - class OffsetT, - unsigned BlockSize, - unsigned RunsPerThread, - unsigned DecodedItemsPerThread, - unsigned Trials -> +template __global__ -__launch_bounds__(BlockSize) -void block_run_length_decode_kernel( - const ItemT * d_run_items, - const OffsetT * d_run_offsets, - ItemT * d_decoded_items, - bool enable_store = false) + __launch_bounds__(BlockSize) void block_run_length_decode_kernel(const ItemT* d_run_items, + const OffsetT* d_run_offsets, + ItemT* d_decoded_items, + bool enable_store = false) { - using BlockRunLengthDecodeT = hipcub::BlockRunLengthDecode< - ItemT, - BlockSize, - RunsPerThread, - DecodedItemsPerThread - >; - - ItemT run_items[RunsPerThread]; + using BlockRunLengthDecodeT + = hipcub::BlockRunLengthDecode; + + ItemT run_items[RunsPerThread]; OffsetT run_offsets[RunsPerThread]; const unsigned global_thread_idx = BlockSize * hipBlockIdx_x + hipThreadIdx_x; hipcub::LoadDirectBlocked(global_thread_idx, d_run_items, run_items); hipcub::LoadDirectBlocked(global_thread_idx, d_run_offsets, run_offsets); - BlockRunLengthDecodeT block_run_length_decode( - run_items, - run_offsets - ); + BlockRunLengthDecodeT block_run_length_decode(run_items, run_offsets); - const OffsetT total_decoded_size = - d_run_offsets[(hipBlockIdx_x + 1) * BlockSize * RunsPerThread] - - d_run_offsets[hipBlockIdx_x * BlockSize * RunsPerThread]; + const OffsetT total_decoded_size + = d_run_offsets[(hipBlockIdx_x + 1) * BlockSize * RunsPerThread] + - d_run_offsets[hipBlockIdx_x * BlockSize * RunsPerThread]; - #pragma nounroll - for (unsigned i = 0; i < Trials; ++i) +#pragma nounroll + for(unsigned i = 0; i < Trials; ++i) { OffsetT decoded_window_offset = 0; - while (decoded_window_offset < total_decoded_size) + 
while(decoded_window_offset < total_decoded_size) { ItemT decoded_items[DecodedItemsPerThread]; block_run_length_decode.RunLengthDecode(decoded_items, decoded_window_offset); - if (enable_store) + if(enable_store) { - hipcub::StoreDirectBlocked(global_thread_idx, d_decoded_items + decoded_window_offset, decoded_items); + hipcub::StoreDirectBlocked(global_thread_idx, + d_decoded_items + decoded_window_offset, + decoded_items); } decoded_window_offset += BlockSize * DecodedItemsPerThread; @@ -88,91 +79,81 @@ void block_run_length_decode_kernel( } } -template< - class ItemT, - class OffsetT, - unsigned MinRunLength, - unsigned MaxRunLength, - unsigned BlockSize, - unsigned RunsPerThread, - unsigned DecodedItemsPerThread, - unsigned Trials = 100 -> +template void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { - constexpr auto runs_per_block = BlockSize * RunsPerThread; - const auto target_num_runs = 2 * N / (MinRunLength + MaxRunLength); - const auto num_runs = runs_per_block * ((target_num_runs + runs_per_block - 1)/runs_per_block); + constexpr auto runs_per_block = BlockSize * RunsPerThread; + const auto target_num_runs = 2 * N / (MinRunLength + MaxRunLength); + const auto num_runs + = runs_per_block * ((target_num_runs + runs_per_block - 1) / runs_per_block); - std::vector run_items(num_runs); + std::vector run_items(num_runs); std::vector run_offsets(num_runs + 1); std::default_random_engine prng(std::random_device{}()); - using ItemDistribution = std::conditional_t< - std::is_integral::value, - std::uniform_int_distribution, - std::uniform_real_distribution - >; - ItemDistribution run_item_dist(0, 100); + using ItemDistribution = std::conditional_t::value, + std::uniform_int_distribution, + std::uniform_real_distribution>; + ItemDistribution run_item_dist(0, 100); std::uniform_int_distribution run_length_dist(MinRunLength, MaxRunLength); - for (size_t i = 0; i < num_runs; ++i) + for(size_t i = 0; i < num_runs; ++i) { run_items[i] = 
run_item_dist(prng); } - for (size_t i = 1; i < num_runs + 1; ++i) + for(size_t i = 1; i < num_runs + 1; ++i) { const OffsetT next_run_length = run_length_dist(prng); - run_offsets[i] = run_offsets[i - 1] + next_run_length; + run_offsets[i] = run_offsets[i - 1] + next_run_length; } const OffsetT output_length = run_offsets.back(); - ItemT * d_run_items{}; + ItemT* d_run_items{}; HIP_CHECK(hipMalloc(&d_run_items, run_items.size() * sizeof(ItemT))); - HIP_CHECK( - hipMemcpy( - d_run_items, run_items.data(), - run_items.size() * sizeof(ItemT), - hipMemcpyHostToDevice - ) - ); - - OffsetT * d_run_offsets{}; + HIP_CHECK(hipMemcpy(d_run_items, + run_items.data(), + run_items.size() * sizeof(ItemT), + hipMemcpyHostToDevice)); + + OffsetT* d_run_offsets{}; HIP_CHECK(hipMalloc(&d_run_offsets, run_offsets.size() * sizeof(OffsetT))); - HIP_CHECK( - hipMemcpy( - d_run_offsets, run_offsets.data(), - run_offsets.size() * sizeof(OffsetT), - hipMemcpyHostToDevice - ) - ); - - ItemT * d_output{}; + HIP_CHECK(hipMemcpy(d_run_offsets, + run_offsets.data(), + run_offsets.size() * sizeof(OffsetT), + hipMemcpyHostToDevice)); + + ItemT* d_output{}; HIP_CHECK(hipMalloc(&d_output, output_length * sizeof(ItemT))); - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - block_run_length_decode_kernel< - ItemT, - OffsetT, - BlockSize, - RunsPerThread, - DecodedItemsPerThread, - Trials - > - ), - dim3(num_runs/runs_per_block), dim3(BlockSize), 0, stream, - d_run_items, d_run_offsets, d_output - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(block_run_length_decode_kernel), + dim3(num_runs / runs_per_block), + dim3(BlockSize), + 0, + stream, + d_run_items, + d_run_offsets, + d_output); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = 
std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } @@ -184,14 +165,17 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(IT, OT, MINRL, MAXRL, BS, RPT, DIPT) \ - benchmark::RegisterBenchmark( \ - "block_run_length_decode", \ - &run_benchmark, \ - stream, size \ - ) - -int main(int argc, char *argv[]) +#define CREATE_BENCHMARK(IT, OT, MINRL, MAXRL, BS, RPT, DIPT) \ + benchmark::RegisterBenchmark( \ + std::string("block_run_length_decode.") \ + .c_str(), \ + &run_benchmark, \ + stream, \ + size) + +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -200,22 +184,21 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); std::cout << "benchmark_block_run_length_decode" << std::endl; // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks - std::vector benchmarks - { + std::vector benchmarks{ CREATE_BENCHMARK(int, int, 1, 5, 128, 2, 4), CREATE_BENCHMARK(int, int, 1, 10, 128, 2, 4), CREATE_BENCHMARK(int, int, 1, 50, 128, 2, 4), @@ -230,8 +213,7 @@ int main(int argc, char *argv[]) CREATE_BENCHMARK(double, long long, 1, 100, 128, 2, 4), CREATE_BENCHMARK(double, long long, 1, 500, 128, 2, 4), CREATE_BENCHMARK(double, long long, 1, 1000, 128, 2, 4), - CREATE_BENCHMARK(double, long long, 1, 5000, 128, 2, 4) - }; + CREATE_BENCHMARK(double, long long, 1, 5000, 128, 2, 4)}; // Use manual timing for(auto& b : 
benchmarks) diff --git a/benchmark/benchmark_block_scan.cpp b/benchmark/benchmark_block_scan.cpp index f45d3862..340d3b4e 100644 --- a/benchmark/benchmark_block_scan.cpp +++ b/benchmark/benchmark_block_scan.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -25,7 +25,6 @@ // hipCUB API #include "hipcub/block/block_scan.hpp" - #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif @@ -58,7 +57,7 @@ struct inclusive_scan using bscan_t = hipcub::BlockScan; __shared__ typename bscan_t::TempStorage storage; - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { bscan_t(storage).InclusiveScan(values, values, hipcub::Sum()); @@ -110,23 +109,17 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { // Make sure size is a multiple of BlockSize constexpr auto items_per_block = BlockSize * ItemsPerThread; - const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); + const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); // Allocate and fill memory std::vector input(size, T(1)); - T * d_input; - T * d_output; + T* d_input; + T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); 
HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), @@ -141,8 +134,8 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } @@ -154,12 +147,14 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) } // IPT - items per thread -#define CREATE_BENCHMARK(T, BS, IPT) \ - benchmark::RegisterBenchmark( \ - (std::string("block_scan.Method Name:") + method_name).c_str(), \ - &run_benchmark, \ - stream, size \ - ) +#define CREATE_BENCHMARK(T, BS, IPT) \ + benchmark::RegisterBenchmark(std::string("block_scan.method_name:" + method_name) \ + .c_str(), \ + &run_benchmark, \ + stream, \ + size) // clang-format off #define BENCHMARK_TYPE(type, block) \ @@ -178,7 +173,7 @@ void add_benchmarks(std::vector& benchmarks, hipStream_t stream, size_t size) { - using custom_float2 = benchmark_utils::custom_type; + using custom_float2 = benchmark_utils::custom_type; using custom_double2 = benchmark_utils::custom_type; std::vector new_benchmarks = { @@ -204,7 +199,7 @@ void add_benchmarks(std::vector& benchmarks, benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end()); } -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -213,15 +208,15 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); std::cout << 
"benchmark_block_scan" << std::endl; // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; diff --git a/benchmark/benchmark_block_shuffle.cpp b/benchmark/benchmark_block_shuffle.cpp index 2f0d8cb5..4ba9fb0e 100644 --- a/benchmark/benchmark_block_shuffle.cpp +++ b/benchmark/benchmark_block_shuffle.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -214,22 +214,21 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK_IPT(BS, IPT) \ - benchmark::RegisterBenchmark( \ - (std::string("block_shuffle.SubAlgorithm Name:") \ - + name) \ - .c_str(), \ - &run_benchmark, \ - stream, \ +#define CREATE_BENCHMARK_IPT(BS, IPT) \ + benchmark::RegisterBenchmark( \ + ("block_shuffle.sub_algorithm_name:" + name) \ + .c_str(), \ + &run_benchmark, \ + stream, \ size) -#define CREATE_BENCHMARK(BS) \ - benchmark::RegisterBenchmark((std::string("block_shuffle.SubAlgorithm Name:") + name) \ - .c_str(), \ - &run_benchmark, \ - stream, \ +#define CREATE_BENCHMARK(BS) \ + benchmark::RegisterBenchmark(("block_shuffle.sub_algorithm_name:" + name) \ + .c_str(), \ + &run_benchmark, \ + stream, \ size) template = true> @@ -303,6 +302,7 @@ int 
main(int argc, char* argv[]) hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; diff --git a/benchmark/benchmark_device_adjacent_difference.cpp b/benchmark/benchmark_device_adjacent_difference.cpp index f42ceb76..e0788f0b 100644 --- a/benchmark/benchmark_device_adjacent_difference.cpp +++ b/benchmark/benchmark_device_adjacent_difference.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -49,7 +49,7 @@ constexpr std::size_t DEFAULT_N = 1024 * 1024 * 128; constexpr unsigned int batch_size = 10; constexpr unsigned int warmup_size = 5; -template +template auto dispatch_adjacent_difference(std::true_type /*left*/, std::true_type /*copy*/, void* const temporary_storage, @@ -58,11 +58,14 @@ auto dispatch_adjacent_difference(std::true_type /*left*/, const OutputIt output, Args&&... 
args) { - return ::hipcub::DeviceAdjacentDifference::SubtractLeftCopy( - temporary_storage, storage_size, input, output, std::forward(args)...); + return ::hipcub::DeviceAdjacentDifference::SubtractLeftCopy(temporary_storage, + storage_size, + input, + output, + std::forward(args)...); } -template +template auto dispatch_adjacent_difference(std::false_type /*left*/, std::true_type /*copy*/, void* const temporary_storage, @@ -71,11 +74,14 @@ auto dispatch_adjacent_difference(std::false_type /*left*/, const OutputIt output, Args&&... args) { - return ::hipcub::DeviceAdjacentDifference::SubtractRightCopy( - temporary_storage, storage_size, input, output, std::forward(args)...); + return ::hipcub::DeviceAdjacentDifference::SubtractRightCopy(temporary_storage, + storage_size, + input, + output, + std::forward(args)...); } -template +template auto dispatch_adjacent_difference(std::true_type /*left*/, std::false_type /*copy*/, void* const temporary_storage, @@ -84,11 +90,13 @@ auto dispatch_adjacent_difference(std::true_type /*left*/, const OutputIt /*output*/, Args&&... args) { - return ::hipcub::DeviceAdjacentDifference::SubtractLeft( - temporary_storage, storage_size, input, std::forward(args)...); + return ::hipcub::DeviceAdjacentDifference::SubtractLeft(temporary_storage, + storage_size, + input, + std::forward(args)...); } -template +template auto dispatch_adjacent_difference(std::false_type /*left*/, std::false_type /*copy*/, void* const temporary_storage, @@ -97,11 +105,13 @@ auto dispatch_adjacent_difference(std::false_type /*left*/, const OutputIt /*output*/, Args&&... 
args) { - return ::hipcub::DeviceAdjacentDifference::SubtractRight( - temporary_storage, storage_size, input, std::forward(args)...); + return ::hipcub::DeviceAdjacentDifference::SubtractRight(temporary_storage, + storage_size, + input, + std::forward(args)...); } -template +template void run_benchmark(benchmark::State& state, const std::size_t size, const hipStream_t stream) { using output_type = T; @@ -180,12 +190,15 @@ void run_benchmark(benchmark::State& state, const std::size_t size, const hipStr using namespace std::string_literals; -#define CREATE_BENCHMARK(T, left, copy) \ - benchmark::RegisterBenchmark(("Subtract" + (left ? "Left"s : "Right"s) \ - + (copy ? "Copy"s : ""s) + "<" #T ">") \ - .c_str(), \ - &run_benchmark, \ - size, \ +#define CREATE_BENCHMARK(T, left, copy) \ + benchmark::RegisterBenchmark(std::string("device_adjacent_difference" \ + "." \ + "sub_algorithm_name:subtract_" \ + + std::string(left ? "left" : "right") \ + + std::string(copy ? "_copy" : "")) \ + .c_str(), \ + &run_benchmark, \ + size, \ stream) // clang-format off @@ -214,6 +227,8 @@ int main(int argc, char* argv[]) int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + + std::cout << "benchmark_device_adjacent_difference" << std::endl; std::cout << "[HIP] Device name: " << devProp.name << std::endl; using custom_float2 = benchmark_utils::custom_type; diff --git a/benchmark/benchmark_device_batch_copy.cpp b/benchmark/benchmark_device_batch_copy.cpp index 5a29f19f..feca312e 100644 --- a/benchmark/benchmark_device_batch_copy.cpp +++ b/benchmark/benchmark_device_batch_copy.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. 
+// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -53,7 +53,8 @@ constexpr int32_t blev_min_size = 1024; // have source and destinations mappings not be the identity function: // // batch_copy( -// [&a0 , &b0 , &c0 , &d0 ], // from (note the order is still just a, b, c, d!) +// [&a0 , &b0 , &c0 , &d0 ], // from (note the order is still just a, b, c, +// d!) // [&a0', &b0', &c0', &d0'], // to (order is the same as above too!) // [3 , 2 , 1 , 2 ]) // size // @@ -327,15 +328,20 @@ void run_benchmark(benchmark::State& state, HIP_CHECK(hipFree(d_temp_storage)); } -#define CREATE_BENCHMARK(item_size, item_alignment, size_type, num_tlev, num_wlev, num_blev) \ - benchmark::RegisterBenchmark( \ - "{lvl:device,item_size:" #item_size ",item_alignment:" #item_alignment \ - ",size_type:" #size_type ",algo:batch_memcpy,num_tlev:" #num_tlev ",num_wlev:" #num_wlev \ - ",num_blev:" #num_blev ",cfg:default_config}", \ - [=](benchmark::State& state) \ - { \ - run_benchmark, \ - size_type>(state, stream, num_tlev, num_wlev, num_blev); \ +#define CREATE_BENCHMARK(IS, IA, T, num_tlev, num_wlev, num_blev) \ + benchmark::RegisterBenchmark( \ + std::string("device_batch_copy" \ + ".") \ + .c_str(), \ + [=](benchmark::State& state) \ + { \ + run_benchmark, T>(state, \ + stream, \ + num_tlev, \ + num_wlev, \ + num_blev); \ }) #define BENCHMARK_TYPE(item_size, item_alignment) \ @@ -364,6 +370,15 @@ int32_t main(int32_t argc, char* argv[]) // HIP hipStream_t stream = hipStreamDefault; // default + hipDeviceProp_t devProp; + int device_id = 0; + + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + + std::cout << "benchmark_device_batch_copy" << std::endl; + std::cout << "[HIP] Device name: " << devProp.name 
<< std::endl; + // Benchmark info benchmark::AddCustomContext("size", std::to_string(size)); @@ -378,6 +393,8 @@ int32_t main(int32_t argc, char* argv[]) BENCHMARK_TYPE(4, 4), BENCHMARK_TYPE(8, 8)}; + + // Use manual timing for(auto& b : benchmarks) { diff --git a/benchmark/benchmark_device_batch_memcpy.cpp b/benchmark/benchmark_device_batch_memcpy.cpp index 3d72e349..f0f38be2 100644 --- a/benchmark/benchmark_device_batch_memcpy.cpp +++ b/benchmark/benchmark_device_batch_memcpy.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -56,7 +56,8 @@ constexpr int32_t blev_min_size = 1024; // have source and destinations mappings not be the identity function: // // batch_memcpy( -// [&a0 , &b0 , &c0 , &d0 ], // from (note the order is still just a, b, c, d!) +// [&a0 , &b0 , &c0 , &d0 ], // from (note the order is still just a, b, c, +// d!) // [&a0', &b0', &c0', &d0'], // to (order is the same as above too!) 
// [3 , 2 , 1 , 2 ]) // size // @@ -337,15 +338,19 @@ void run_benchmark(benchmark::State& state, HIP_CHECK(hipFree(d_temp_storage)); } -#define CREATE_BENCHMARK(item_size, item_alignment, size_type, num_tlev, num_wlev, num_blev) \ +#define CREATE_BENCHMARK(IS, IA, T, num_tlev, num_wlev, num_blev) \ benchmark::RegisterBenchmark( \ - "{lvl:device,item_size:" #item_size ",item_alignment:" #item_alignment \ - ",size_type:" #size_type ",algo:batch_memcpy,num_tlev:" #num_tlev ",num_wlev:" #num_wlev \ - ",num_blev:" #num_blev ",cfg:default_config}", \ + std::string("device_batch_memcpy.") \ + .c_str(), \ [=](benchmark::State& state) \ { \ - run_benchmark, \ - size_type>(state, stream, num_tlev, num_wlev, num_blev); \ + run_benchmark, T>(state, \ + stream, \ + num_tlev, \ + num_wlev, \ + num_blev); \ }) #define BENCHMARK_TYPE(item_size, item_alignment) \ @@ -371,6 +376,14 @@ int32_t main(int32_t argc, char* argv[]) const size_t size = parser.get("size"); const int32_t trials = parser.get("trials"); + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + + std::cout << "benchmark_device_batch_memcpy" << std::endl; + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + // HIP hipStream_t stream = hipStreamDefault; // default diff --git a/benchmark/benchmark_device_histogram.cpp b/benchmark/benchmark_device_histogram.cpp index cfde99f9..a5019e4b 100644 --- a/benchmark/benchmark_device_histogram.cpp +++ b/benchmark/benchmark_device_histogram.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software.
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -36,7 +36,7 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -const unsigned int batch_size = 10; +const unsigned int batch_size = 10; const unsigned int warmup_size = 5; template @@ -50,9 +50,9 @@ std::vector const size_t max_random_size = 1024 * 1024; - std::random_device rd; + std::random_device rd; std::default_random_engine gen(rd()); - std::vector data(size); + std::vector data(size); std::generate(data.begin(), data.begin() + std::min(size, max_random_size), [&]() @@ -87,15 +87,15 @@ int get_entropy_percents(int entropy_reduction) } } -const int entropy_reductions[] = { 0, 2, 4, 6 }; +const int entropy_reductions[] = {0, 2, 4, 6}; template void run_even_benchmark(benchmark::State& state, - size_t bins, - size_t scale, - int entropy_reduction, - hipStream_t stream, - size_t size) + size_t bins, + size_t scale, + int entropy_reduction, + hipStream_t stream, + size_t size) { using counter_type = unsigned int; @@ -107,19 +107,13 @@ void run_even_benchmark(benchmark::State& state, // Generate data std::vector input = generate(size, entropy_reduction, lower_level, upper_level); - T * d_input; - counter_type * d_histogram; + T* d_input; + counter_type* d_histogram; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_histogram, size * sizeof(counter_type))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); - - void * d_temporary_storage = nullptr; + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); + + void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(hipcub::DeviceHistogram::HistogramEven(d_temporary_storage, temporary_storage_bytes, @@ -149,7 +143,7 @@ void run_even_benchmark(benchmark::State& state, } HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) + 
for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); @@ -168,8 +162,8 @@ void run_even_benchmark(benchmark::State& state, HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); @@ -182,11 +176,11 @@ void run_even_benchmark(benchmark::State& state, template void run_multi_even_benchmark(benchmark::State& state, - size_t bins, - size_t scale, - int entropy_reduction, - hipStream_t stream, - size_t size) + size_t bins, + size_t scale, + int entropy_reduction, + hipStream_t stream, + size_t size) { using counter_type = unsigned int; @@ -197,28 +191,23 @@ void run_multi_even_benchmark(benchmark::State& state, { lower_level[channel] = 0; upper_level[channel] = bins * scale; - num_levels[channel] = bins + 1; + num_levels[channel] = bins + 1; } // Generate data - std::vector input = generate(size * Channels, entropy_reduction, lower_level[0], upper_level[0]); + std::vector input + = generate(size * Channels, entropy_reduction, lower_level[0], upper_level[0]); - T * d_input; - counter_type * d_histogram[ActiveChannels]; + T* d_input; + counter_type* d_histogram[ActiveChannels]; HIP_CHECK(hipMalloc(&d_input, size * Channels * sizeof(T))); for(unsigned int channel = 0; channel < ActiveChannels; channel++) { HIP_CHECK(hipMalloc(&d_histogram[channel], bins * sizeof(counter_type))); } - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * Channels * sizeof(T), - hipMemcpyHostToDevice - ) - ); - - void * d_temporary_storage = nullptr; + HIP_CHECK(hipMemcpy(d_input, input.data(), size * Channels * sizeof(T), hipMemcpyHostToDevice)); + + void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; 
HIP_CHECK((hipcub::DeviceHistogram::MultiHistogramEven( d_temporary_storage, @@ -250,7 +239,7 @@ void run_multi_even_benchmark(benchmark::State& state, } HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); @@ -270,8 +259,8 @@ void run_multi_even_benchmark(benchmark::State& state, HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * Channels * sizeof(T)); @@ -296,28 +285,16 @@ void run_range_benchmark(benchmark::State& state, size_t bins, hipStream_t strea std::vector levels(bins + 1); std::iota(levels.begin(), levels.end(), static_cast(0)); - T * d_input; - T * d_levels; - counter_type * d_histogram; + T* d_input; + T* d_levels; + counter_type* d_histogram; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_levels, (bins + 1) * sizeof(T))); HIP_CHECK(hipMalloc(&d_histogram, size * sizeof(counter_type))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK( - hipMemcpy( - d_levels, levels.data(), - (bins + 1) * sizeof(T), - hipMemcpyHostToDevice - ) - ); - - void * d_temporary_storage = nullptr; + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_levels, levels.data(), (bins + 1) * sizeof(T), hipMemcpyHostToDevice)); + + void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(hipcub::DeviceHistogram::HistogramRange(d_temporary_storage, temporary_storage_bytes, @@ -345,7 +322,7 @@ void run_range_benchmark(benchmark::State& state, size_t bins, hipStream_t strea } HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) + 
for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); @@ -363,8 +340,8 @@ void run_range_benchmark(benchmark::State& state, size_t bins, hipStream_t strea HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); @@ -377,15 +354,18 @@ void run_range_benchmark(benchmark::State& state, size_t bins, hipStream_t strea } template -void run_multi_range_benchmark(benchmark::State& state, size_t bins, hipStream_t stream, size_t size) +void run_multi_range_benchmark(benchmark::State& state, + size_t bins, + hipStream_t stream, + size_t size) { using counter_type = unsigned int; // Number of levels for a single channel - const int num_levels_channel = bins + 1; - int num_levels[ActiveChannels]; + const int num_levels_channel = bins + 1; + int num_levels[ActiveChannels]; std::vector levels[ActiveChannels]; - for (unsigned int channel = 0; channel < ActiveChannels; channel++) + for(unsigned int channel = 0; channel < ActiveChannels; channel++) { levels[channel].resize(num_levels_channel); std::iota(levels[channel].begin(), levels[channel].end(), static_cast(0)); @@ -395,9 +375,9 @@ void run_multi_range_benchmark(benchmark::State& state, size_t bins, hipStream_t // Generate data std::vector input = benchmark_utils::get_random_data(size * Channels, 0, bins); - T * d_input; - T * d_levels[ActiveChannels]; - counter_type * d_histogram[ActiveChannels]; + T* d_input; + T* d_levels[ActiveChannels]; + counter_type* d_histogram[ActiveChannels]; HIP_CHECK(hipMalloc(&d_input, size * Channels * sizeof(T))); for(unsigned int channel = 0; channel < ActiveChannels; channel++) { @@ -405,25 +385,16 @@ void run_multi_range_benchmark(benchmark::State& state, size_t bins, 
hipStream_t HIP_CHECK(hipMalloc(&d_histogram[channel], size * sizeof(counter_type))); } - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * Channels * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * Channels * sizeof(T), hipMemcpyHostToDevice)); for(unsigned int channel = 0; channel < ActiveChannels; channel++) { - HIP_CHECK( - hipMemcpy( - d_levels[channel], levels[channel].data(), - num_levels_channel * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_levels[channel], + levels[channel].data(), + num_levels_channel * sizeof(T), + hipMemcpyHostToDevice)); } - void * d_temporary_storage = nullptr; + void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK((hipcub::DeviceHistogram::MultiHistogramRange( d_temporary_storage, @@ -453,7 +424,7 @@ void run_multi_range_benchmark(benchmark::State& state, size_t bins, hipStream_t } HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); @@ -472,8 +443,8 @@ void run_multi_range_benchmark(benchmark::State& state, size_t bins, hipStream_t HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * Channels * sizeof(T)); @@ -510,9 +481,11 @@ struct num_limits<__half> if(num_limits::max() > BINS * SCALE) \ { \ VECTOR.push_back(benchmark::RegisterBenchmark( \ - (std::string("histogram_even") + "" + "(Entropy Percent:" \ - + std::to_string(get_entropy_percents(entropy_reduction)) + "%,Bin Count:" \ - + std::to_string(BINS) + " bins)") \ + std::string("device_histogram_even" \ + "." 
\ + "(entropy_percent:" \ + + std::to_string(get_entropy_percents(entropy_reduction)) \ + + "%,bin_count:" + std::to_string(BINS) + " bins)") \ .c_str(), \ [=](benchmark::State& state) \ { run_even_benchmark(state, BINS, SCALE, entropy_reduction, stream, size); })); \ @@ -527,8 +500,8 @@ struct num_limits<__half> CREATE_EVEN_BENCHMARK(VECTOR, T, 65536, 1) void add_even_benchmarks(std::vector& benchmarks, - hipStream_t stream, - size_t size) + hipStream_t stream, + size_t size) { for(int entropy_reduction : entropy_reductions) { @@ -538,34 +511,40 @@ void add_even_benchmarks(std::vector& benchmark BENCHMARK_TYPE(benchmarks, uint8_t); BENCHMARK_TYPE(benchmarks, double); BENCHMARK_TYPE(benchmarks, float); - //this limitation can be removed once https://github.com/NVIDIA/cub/issues/484 is fixed + // this limitation can be removed once + // https://github.com/NVIDIA/cub/issues/484 is fixed #ifdef __HIP_PLATFORM_AMD__ BENCHMARK_TYPE(benchmarks, __half); #endif }; } -#define CREATE_MULTI_EVEN_BENCHMARK(CHANNELS, ACTIVE_CHANNELS, T, BINS, SCALE) \ -benchmark::RegisterBenchmark( \ - (std::string("multi_histogram_even") + "" + \ - "(Entropy Percent:" + std::to_string(get_entropy_percents(entropy_reduction)) + "%,Bin Count:" + \ - std::to_string(BINS) + " bins)" \ - ).c_str(), \ - [=](benchmark::State& state) { \ - run_multi_even_benchmark( \ - state, BINS, SCALE, entropy_reduction, stream, size \ - ); \ - } \ -) +#define CREATE_MULTI_EVEN_BENCHMARK(CHANNELS, ACTIVE_CHANNELS, T, BINS, SCALE) \ + benchmark::RegisterBenchmark( \ + std::string("device_multi_histogram_even" \ + "." 
\ + "(entropy_percent:" \ + + std::to_string(get_entropy_percents(entropy_reduction)) \ + + "%,bin_count:" + std::to_string(BINS) + " bins)") \ + .c_str(), \ + [=](benchmark::State& state) \ + { \ + run_multi_even_benchmark(state, \ + BINS, \ + SCALE, \ + entropy_reduction, \ + stream, \ + size); \ + }) void add_multi_even_benchmarks(std::vector& benchmarks, - hipStream_t stream, - size_t size) + hipStream_t stream, + size_t size) { for(int entropy_reduction : entropy_reductions) { - std::vector bs = - { + std::vector bs = { CREATE_MULTI_EVEN_BENCHMARK(4, 3, int, 10, 1234), CREATE_MULTI_EVEN_BENCHMARK(4, 3, int, 100, 1234), @@ -580,13 +559,14 @@ void add_multi_even_benchmarks(std::vector& ben }; } -#define CREATE_RANGE_BENCHMARK(T, BINS) \ -benchmark::RegisterBenchmark( \ - (std::string("histogram_range") + "" + \ - "(Bin Count:" + std::to_string(BINS) + " bins)" \ - ).c_str(), \ - [=](benchmark::State& state) { run_range_benchmark(state, BINS, stream, size); } \ -) +#define CREATE_RANGE_BENCHMARK(T, BINS) \ + benchmark::RegisterBenchmark(std::string("device_histogram_range" \ + "." 
\ + "(bin_count:" \ + + std::to_string(BINS) + " bins)") \ + .c_str(), \ + [=](benchmark::State& state) \ + { run_range_benchmark(state, BINS, stream, size); }) #define BENCHMARK_RANGE_TYPE(T) \ CREATE_RANGE_BENCHMARK(T, 10), CREATE_RANGE_BENCHMARK(T, 100), \ @@ -594,32 +574,29 @@ benchmark::RegisterBenchmark( \ CREATE_RANGE_BENCHMARK(T, 100000), CREATE_RANGE_BENCHMARK(T, 1000000) void add_range_benchmarks(std::vector& benchmarks, - hipStream_t stream, - size_t size) + hipStream_t stream, + size_t size) { std::vector bs = {BENCHMARK_RANGE_TYPE(float), BENCHMARK_RANGE_TYPE(double)}; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -#define CREATE_MULTI_RANGE_BENCHMARK(CHANNELS, ACTIVE_CHANNELS, T, BINS) \ -benchmark::RegisterBenchmark( \ - (std::string("multi_histogram_range") + "<" #CHANNELS ", " #ACTIVE_CHANNELS ", " #T ">" + \ - "(" + std::to_string(BINS) + " bins)" \ - ).c_str(), \ - [=](benchmark::State& state) { \ - run_multi_range_benchmark( \ - state, BINS, stream, size \ - ); \ - } \ -) +#define CREATE_MULTI_RANGE_BENCHMARK(CHANNELS, ACTIVE_CHANNELS, T, BINS) \ + benchmark::RegisterBenchmark( \ + std::string("device_multi_histogram_range" \ + ".(bin_count:" \ + + std::to_string(BINS) + " bins)") \ + .c_str(), \ + [=](benchmark::State& state) \ + { run_multi_range_benchmark(state, BINS, stream, size); }) void add_multi_range_benchmarks(std::vector& benchmarks, - hipStream_t stream, - size_t size) + hipStream_t stream, + size_t size) { - std::vector bs = - { + std::vector bs = { CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 10), CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 100), CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 1000), @@ -630,7 +607,7 @@ void add_multi_range_benchmarks(std::vector& be benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -639,15 +616,15 @@ int main(int 
argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); std::cout << "benchmark_device_histogram" << std::endl; // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; diff --git a/benchmark/benchmark_device_memory.cpp b/benchmark/benchmark_device_memory.cpp index 8659cedf..5a879210 100644 --- a/benchmark/benchmark_device_memory.cpp +++ b/benchmark/benchmark_device_memory.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -272,8 +272,7 @@ void run_benchmark(benchmark::State& state, size_t size, const hipStream_t strea if(std::is_floating_point::value) { input = benchmark_utils::get_random_data(size, (T)-1000, (T) + 1000); - } - else + } else { input = benchmark_utils::get_random_data(size, std::numeric_limits::min(), @@ -350,9 +349,9 @@ template void run_benchmark_memcpy(benchmark::State& state, size_t size, const hipStream_t stream) { // Allocate device buffers - // Note: since this benchmark only tests memcpy performance between device buffers, - // we don't really need to copy data into these from the host - whatever happens - // to be in memory will suffice. + // Note: since this benchmark only tests memcpy performance between device + // buffers, we don't really need to copy data into these from the host - + // whatever happens to be in memory will suffice. 
T* d_input; T* d_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); @@ -401,20 +400,18 @@ void run_benchmark_memcpy(benchmark::State& state, size_t size, const hipStream_ HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK_IPT(METHOD, OPERATION, T, SIZE, BLOCK_SIZE, IPT) \ - { \ - benchmarks.push_back(benchmark::RegisterBenchmark( \ - #METHOD "_" #OPERATION "<" #T "," #SIZE ",BS:" #BLOCK_SIZE ",IPT:" #IPT ">", \ - [=](benchmark::State& state) \ - { run_benchmark(state, SIZE, stream); })); \ - } - -#define CREATE_BENCHMARK_MEMCPY(T, SIZE) \ - { \ - benchmarks.push_back(benchmark::RegisterBenchmark( \ - "Memcpy<" #T "," #SIZE ">", \ - [=](benchmark::State& state) { run_benchmark_memcpy(state, SIZE, stream); })); \ - } +#define CREATE_BENCHMARK_IPT(METHOD, OPERATION, T, SIZE, BS, IPT) \ + benchmarks.push_back(benchmark::RegisterBenchmark( \ + std::string("device_memory.") \ + .c_str(), \ + [=](benchmark::State& state) \ + { run_benchmark(state, SIZE, stream); })); + +#define CREATE_BENCHMARK_MEMCPY(T, SIZE) \ + benchmarks.push_back(benchmark::RegisterBenchmark( \ + std::string("device_memory_memcpy.").c_str(), \ + [=](benchmark::State& state) { run_benchmark_memcpy(state, SIZE, stream); })); // clang-format off #define CREATE_BENCHMARK_BLOCK_SIZE(MEM_OP, OP, TYPE, SIZE, BLOCK_SIZE) \ diff --git a/benchmark/benchmark_device_merge_sort.cpp b/benchmark/benchmark_device_merge_sort.cpp index fbfd35f7..506a8c04 100644 --- a/benchmark/benchmark_device_merge_sort.cpp +++ b/benchmark/benchmark_device_merge_sort.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -23,14 +23,14 @@ #include "common_benchmark_header.hpp" // HIP API -#include "hipcub/hipcub.hpp" #include "hipcub/device/device_merge_sort.hpp" +#include "hipcub/hipcub.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 32 << 20; #endif -const unsigned int batch_size = 10; +const unsigned int batch_size = 10; const unsigned int warmup_size = 5; template @@ -40,50 +40,43 @@ std::vector generate_keys(size_t size) if(std::is_floating_point::value) { - return benchmark_utils::get_random_data(size, static_cast(-1000), static_cast(1000), size); - } - else + return benchmark_utils::get_random_data(size, + static_cast(-1000), + static_cast(1000), + size); + } else { - return benchmark_utils::get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max(), - size - ); + return benchmark_utils::get_random_data(size, + std::numeric_limits::min(), + std::numeric_limits::max(), + size); } } template -void run_sort_keys_benchmark(benchmark::State& state, - hipStream_t stream, - size_t size) +void run_sort_keys_benchmark(benchmark::State& state, hipStream_t stream, size_t size) { - using key_type = Key; - auto compare_function = [] __device__ (const key_type & a, const key_type & b) { return a < b; }; + using key_type = Key; + auto compare_function = [] __device__(const key_type& a, const key_type& b) { return a < b; }; auto keys_input = generate_keys(size); - key_type * d_keys_input; - key_type * d_keys_output; + key_type* d_keys_input; + key_type* d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); HIP_CHECK( - hipMemcpy( - d_keys_input, keys_input.data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - void * d_temporary_storage = nullptr; + hipMemcpy(d_keys_input, keys_input.data(), size * 
sizeof(key_type), hipMemcpyHostToDevice)); + + void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; - HIP_CHECK( - hipcub::DeviceMergeSort::SortKeysCopy( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, size, - compare_function, stream - ) - ); + HIP_CHECK(hipcub::DeviceMergeSort::SortKeysCopy(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + size, + compare_function, + stream)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); @@ -91,35 +84,35 @@ void run_sort_keys_benchmark(benchmark::State& state, // Warm-up for(size_t i = 0; i < warmup_size; i++) { - HIP_CHECK( - hipcub::DeviceMergeSort::SortKeysCopy( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, size, - compare_function, stream - ) - ); + HIP_CHECK(hipcub::DeviceMergeSort::SortKeysCopy(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + size, + compare_function, + stream)); } HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { - HIP_CHECK( - hipcub::DeviceMergeSort::SortKeysCopy( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, size, - compare_function, stream - ) - ); + HIP_CHECK(hipcub::DeviceMergeSort::SortKeysCopy(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + size, + compare_function, + stream)); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); @@ -131,54 +124,46 @@ void 
run_sort_keys_benchmark(benchmark::State& state, } template -void run_sort_pairs_benchmark(benchmark::State& state, - hipStream_t stream, - size_t size) +void run_sort_pairs_benchmark(benchmark::State& state, hipStream_t stream, size_t size) { - using key_type = Key; - using value_type = Value; - auto compare_function = [] __device__ (const key_type & a, const key_type & b) { return a < b; }; + using key_type = Key; + using value_type = Value; + auto compare_function = [] __device__(const key_type& a, const key_type& b) { return a < b; }; - auto keys_input = generate_keys(size); + auto keys_input = generate_keys(size); std::vector values_input(size); for(size_t i = 0; i < size; i++) { values_input[i] = value_type(i); } - key_type * d_keys_input; - key_type * d_keys_output; + key_type* d_keys_input; + key_type* d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); HIP_CHECK( - hipMemcpy( - d_keys_input, keys_input.data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - value_type * d_values_input; - value_type * d_values_output; + hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); + + value_type* d_values_input; + value_type* d_values_output; HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); - HIP_CHECK( - hipMemcpy( - d_values_input, values_input.data(), - size * sizeof(value_type), - hipMemcpyHostToDevice - ) - ); - - void * d_temporary_storage = nullptr; + HIP_CHECK(hipMemcpy(d_values_input, + values_input.data(), + size * sizeof(value_type), + hipMemcpyHostToDevice)); + + void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; - HIP_CHECK( - hipcub::DeviceMergeSort::SortPairsCopy( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_values_input, d_keys_output, d_values_output, size, - compare_function, stream 
- ) - ); + HIP_CHECK(hipcub::DeviceMergeSort::SortPairsCopy(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_values_input, + d_keys_output, + d_values_output, + size, + compare_function, + stream)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); @@ -186,40 +171,43 @@ void run_sort_pairs_benchmark(benchmark::State& state, // Warm-up for(size_t i = 0; i < warmup_size; i++) { - HIP_CHECK( - hipcub::DeviceMergeSort::SortPairsCopy( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_values_input, d_keys_output, d_values_output, size, - compare_function, stream - ) - ); + HIP_CHECK(hipcub::DeviceMergeSort::SortPairsCopy(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_values_input, + d_keys_output, + d_values_output, + size, + compare_function, + stream)); } HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { - HIP_CHECK( - hipcub::DeviceMergeSort::SortPairsCopy( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_values_input, d_keys_output, d_values_output, size, - compare_function, stream - ) - ); + HIP_CHECK(hipcub::DeviceMergeSort::SortPairsCopy(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_values_input, + d_keys_output, + d_values_output, + size, + compare_function, + stream)); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } - state.SetBytesProcessed( - state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type)) - ); + state.SetBytesProcessed(state.iterations() * batch_size * size + * (sizeof(key_type) + sizeof(value_type))); 
state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); @@ -229,31 +217,23 @@ void run_sort_pairs_benchmark(benchmark::State& state, HIP_CHECK(hipFree(d_values_output)); } +#define CREATE_SORT_KEYS_BENCHMARK(T) \ + benchmarks.push_back(benchmark::RegisterBenchmark( \ + std::string("device_merge_sort_sort_keys" \ + ".") \ + .c_str(), \ + [=](benchmark::State& state) { run_sort_keys_benchmark(state, stream, size); })); -#define CREATE_SORT_KEYS_BENCHMARK(Key) \ - { \ - benchmarks.push_back( \ - benchmark::RegisterBenchmark( \ - (std::string("sort_keys") + "<" #Key ">").c_str(), \ - [=](benchmark::State& state) { run_sort_keys_benchmark(state, stream, size); } \ - ) \ - ); \ - } - -#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value) \ - { \ - benchmarks.push_back( \ - benchmark::RegisterBenchmark( \ - (std::string("sort_pairs") + "<" #Key ", " #Value">").c_str(), \ - [=](benchmark::State& state) { run_sort_pairs_benchmark(state, stream, size); } \ - ) \ - ); \ - } - +#define CREATE_SORT_PAIRS_BENCHMARK(T, V) \ + benchmarks.push_back(benchmark::RegisterBenchmark( \ + std::string("device_merge_sort_sort_pairs<" \ + ",key_data_type:" #T ",value_data_type:" #V ">.") \ + .c_str(), \ + [=](benchmark::State& state) { run_sort_pairs_benchmark(state, stream, size); })); void add_sort_keys_benchmarks(std::vector& benchmarks, - hipStream_t stream, - size_t size) + hipStream_t stream, + size_t size) { CREATE_SORT_KEYS_BENCHMARK(int) CREATE_SORT_KEYS_BENCHMARK(long long) @@ -263,11 +243,11 @@ void add_sort_keys_benchmarks(std::vector& benc } void add_sort_pairs_benchmarks(std::vector& benchmarks, - hipStream_t stream, - size_t size) + hipStream_t stream, + size_t size) { - using custom_float2 = benchmark_utils::custom_type; - using custom_double2 = benchmark_utils::custom_type; + using custom_float2 = benchmark_utils::custom_type; + using custom_double2 = benchmark_utils::custom_type; using custom_char_double = 
benchmark_utils::custom_type; using custom_double_char = benchmark_utils::custom_type; @@ -289,7 +269,7 @@ void add_sort_pairs_benchmarks(std::vector& ben CREATE_SORT_PAIRS_BENCHMARK(uint8_t, uint8_t) } -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -298,15 +278,17 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + + std::cout << "benchmark_device_merge_sort" << std::endl; std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks diff --git a/benchmark/benchmark_device_partition.cpp b/benchmark/benchmark_device_partition.cpp index 26c7739a..786fe139 100644 --- a/benchmark/benchmark_device_partition.cpp +++ b/benchmark/benchmark_device_partition.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -32,43 +32,45 @@ constexpr size_t DEFAULT_N = 1024 * 1024 * 32; #endif -constexpr unsigned int batch_size = 10; +constexpr unsigned int batch_size = 10; constexpr unsigned int warmup_size = 5; -namespace { -template -struct LessOp { - HIPCUB_HOST_DEVICE LessOp(const T& pivot) - : pivot_{pivot} - { - } +namespace +{ +template +struct LessOp +{ + HIPCUB_HOST_DEVICE LessOp(const T& pivot) : pivot_{pivot} {} - HIPCUB_HOST_DEVICE bool operator()(const T& val) const { + HIPCUB_HOST_DEVICE bool operator()(const T& val) const + { return val < pivot_; } + private: T pivot_; }; -} +} // namespace -template +template void run_flagged(benchmark::State& state, const hipStream_t stream, - const T threshold, - const size_t size) + const T threshold, + const size_t size) { - const auto select_op = LessOp{threshold}; - const auto input = - benchmark_utils::get_random_data(size, static_cast(0), static_cast(100)); + const auto select_op = LessOp{threshold}; + const auto input + = benchmark_utils::get_random_data(size, static_cast(0), static_cast(100)); std::vector flags(size); - for(unsigned int i = 0; i < size; i++) { + for(unsigned int i = 0; i < size; i++) + { flags[i] = static_cast(select_op(input[i])); } - T* d_input = nullptr; - F* d_flags = nullptr; - T* d_output = nullptr; + T* d_input = nullptr; + F* d_flags = nullptr; + T* d_output = nullptr; unsigned int* d_num_selected_output = nullptr; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_flags, input.size() * sizeof(F))); @@ -78,63 +80,54 @@ void run_flagged(benchmark::State& state, // Allocate temporary storage void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; - HIP_CHECK( - hipcub::DevicePartition::Flagged( - nullptr, - temp_storage_bytes, - d_input, - d_flags, - d_output, - d_num_selected_output, - 
static_cast(input.size()), - stream - ) - ); + HIP_CHECK(hipcub::DevicePartition::Flagged(nullptr, + temp_storage_bytes, + d_input, + d_flags, + d_output, + d_num_selected_output, + static_cast(input.size()), + stream)); HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); // Warm-up HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipMemcpy(d_flags, flags.data(), flags.size() * sizeof(F), hipMemcpyHostToDevice)); - for(unsigned int i = 0; i < warmup_size; ++i) { - HIP_CHECK( - hipcub::DevicePartition::Flagged( - d_temp_storage, - temp_storage_bytes, - d_input, - d_flags, - d_output, - d_num_selected_output, - static_cast(input.size()), - stream - ) - ); + for(unsigned int i = 0; i < warmup_size; ++i) + { + HIP_CHECK(hipcub::DevicePartition::Flagged(d_temp_storage, + temp_storage_bytes, + d_input, + d_flags, + d_output, + d_num_selected_output, + static_cast(input.size()), + stream)); } HIP_CHECK(hipDeviceSynchronize()); // Run benchmark - for(auto _ : state) { + for(auto _ : state) + { namespace chrono = std::chrono; - using clock = chrono::high_resolution_clock; + using clock = chrono::high_resolution_clock; const auto start = clock::now(); - for (unsigned int i = 0; i < batch_size; ++i) { - HIP_CHECK( - hipcub::DevicePartition::Flagged( - d_temp_storage, - temp_storage_bytes, - d_input, - d_flags, - d_output, - d_num_selected_output, - static_cast(input.size()), - stream - ) - ); + for(unsigned int i = 0; i < batch_size; ++i) + { + HIP_CHECK(hipcub::DevicePartition::Flagged(d_temp_storage, + temp_storage_bytes, + d_input, + d_flags, + d_output, + d_num_selected_output, + static_cast(input.size()), + stream)); } HIP_CHECK(hipDeviceSynchronize()); - const auto end = clock::now(); - using seconds_d = chrono::duration; + const auto end = clock::now(); + using seconds_d = chrono::duration; const auto elapsed_seconds = chrono::duration_cast(end - start); state.SetIterationTime(elapsed_seconds.count()); @@ 
-151,83 +144,74 @@ void run_flagged(benchmark::State& state, HIP_CHECK(hipFree(d_input)); } -template +template void run_predicate(benchmark::State& state, const hipStream_t stream, - const T threshold, - const size_t size) + const T threshold, + const size_t size) { - const auto input = - benchmark_utils::get_random_data(size, static_cast(0), static_cast(100)); + const auto input + = benchmark_utils::get_random_data(size, static_cast(0), static_cast(100)); - T* d_input = nullptr; - T* d_output = nullptr; + T* d_input = nullptr; + T* d_output = nullptr; unsigned int* d_num_selected_output = nullptr; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_num_selected_output, sizeof(unsigned int))); - const auto select_op = LessOp{threshold}; + const auto select_op = LessOp{threshold}; // Allocate temporary storage void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; - HIP_CHECK( - hipcub::DevicePartition::If( - nullptr, - temp_storage_bytes, - d_input, - d_output, - d_num_selected_output, - static_cast(input.size()), - select_op, - stream - ) - ); + HIP_CHECK(hipcub::DevicePartition::If(nullptr, + temp_storage_bytes, + d_input, + d_output, + d_num_selected_output, + static_cast(input.size()), + select_op, + stream)); HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); // Warm-up HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); - for(unsigned int i = 0; i < warmup_size; ++i) { - HIP_CHECK( - hipcub::DevicePartition::If( - d_temp_storage, - temp_storage_bytes, - d_input, - d_output, - d_num_selected_output, - static_cast(input.size()), - select_op, - stream - ) - ); + for(unsigned int i = 0; i < warmup_size; ++i) + { + HIP_CHECK(hipcub::DevicePartition::If(d_temp_storage, + temp_storage_bytes, + d_input, + d_output, + d_num_selected_output, + static_cast(input.size()), + select_op, + stream)); } 
HIP_CHECK(hipDeviceSynchronize()); // Run benchmark - for(auto _ : state) { + for(auto _ : state) + { namespace chrono = std::chrono; - using clock = chrono::high_resolution_clock; + using clock = chrono::high_resolution_clock; const auto start = clock::now(); - for (unsigned int i = 0; i < batch_size; ++i) { - HIP_CHECK( - hipcub::DevicePartition::If( - d_temp_storage, - temp_storage_bytes, - d_input, - d_output, - d_num_selected_output, - static_cast(input.size()), - select_op, - stream - ) - ); + for(unsigned int i = 0; i < batch_size; ++i) + { + HIP_CHECK(hipcub::DevicePartition::If(d_temp_storage, + temp_storage_bytes, + d_input, + d_output, + d_num_selected_output, + static_cast(input.size()), + select_op, + stream)); } HIP_CHECK(hipDeviceSynchronize()); - const auto end = clock::now(); - using seconds_d = chrono::duration; + const auto end = clock::now(); + using seconds_d = chrono::duration; const auto elapsed_seconds = chrono::duration_cast(end - start); state.SetIterationTime(elapsed_seconds.count()); @@ -243,20 +227,20 @@ void run_predicate(benchmark::State& state, HIP_CHECK(hipFree(d_num_selected_output)); } -template +template void run_threeway(benchmark::State& state, const hipStream_t stream, - const T small_threshold, - const T large_threshold, - const size_t size) + const T small_threshold, + const T large_threshold, + const size_t size) { - const auto input = - benchmark_utils::get_random_data(size, static_cast(0), static_cast(100)); + const auto input + = benchmark_utils::get_random_data(size, static_cast(0), static_cast(100)); - T* d_input = nullptr; - T* d_first_output = nullptr; - T* d_second_output = nullptr; - T* d_unselected_output = nullptr; + T* d_input = nullptr; + T* d_first_output = nullptr; + T* d_second_output = nullptr; + T* d_unselected_output = nullptr; unsigned int* d_num_selected_output = nullptr; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_first_output, input.size() * sizeof(T))); @@ -270,71 
+254,62 @@ void run_threeway(benchmark::State& state, // Allocate temporary storage void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; - HIP_CHECK( - hipcub::DevicePartition::If( - nullptr, - temp_storage_bytes, - d_input, - d_first_output, - d_second_output, - d_unselected_output, - d_num_selected_output, - static_cast(input.size()), - select_first_part_op, - select_second_part_op, - stream - ) - ); + HIP_CHECK(hipcub::DevicePartition::If(nullptr, + temp_storage_bytes, + d_input, + d_first_output, + d_second_output, + d_unselected_output, + d_num_selected_output, + static_cast(input.size()), + select_first_part_op, + select_second_part_op, + stream)); HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); // Warm-up HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); - for(unsigned int i = 0; i < warmup_size; ++i) { - HIP_CHECK( - hipcub::DevicePartition::If( - d_temp_storage, - temp_storage_bytes, - d_input, - d_first_output, - d_second_output, - d_unselected_output, - d_num_selected_output, - static_cast(input.size()), - select_first_part_op, - select_second_part_op, - stream - ) - ); + for(unsigned int i = 0; i < warmup_size; ++i) + { + HIP_CHECK(hipcub::DevicePartition::If(d_temp_storage, + temp_storage_bytes, + d_input, + d_first_output, + d_second_output, + d_unselected_output, + d_num_selected_output, + static_cast(input.size()), + select_first_part_op, + select_second_part_op, + stream)); } HIP_CHECK(hipDeviceSynchronize()); // Run benchmark - for(auto _ : state) { + for(auto _ : state) + { namespace chrono = std::chrono; - using clock = chrono::high_resolution_clock; + using clock = chrono::high_resolution_clock; const auto start = clock::now(); - for (unsigned int i = 0; i < batch_size; ++i) { - HIP_CHECK( - hipcub::DevicePartition::If( - d_temp_storage, - temp_storage_bytes, - d_input, - d_first_output, - d_second_output, - d_unselected_output, - d_num_selected_output, - 
static_cast(input.size()), - select_first_part_op, - select_second_part_op, - stream - ) - ); + for(unsigned int i = 0; i < batch_size; ++i) + { + HIP_CHECK(hipcub::DevicePartition::If(d_temp_storage, + temp_storage_bytes, + d_input, + d_first_output, + d_second_output, + d_unselected_output, + d_num_selected_output, + static_cast(input.size()), + select_first_part_op, + select_second_part_op, + stream)); } HIP_CHECK(hipDeviceSynchronize()); - const auto end = clock::now(); - using seconds_d = chrono::duration; + const auto end = clock::now(); + using seconds_d = chrono::duration; const auto elapsed_seconds = chrono::duration_cast(end - start); state.SetIterationTime(elapsed_seconds.count()); @@ -352,43 +327,50 @@ void run_threeway(benchmark::State& state, HIP_CHECK(hipFree(d_num_selected_output)); } -#define CREATE_BENCHMARK_FLAGGED(T, T_FLAG, SPLIT_T) \ -benchmark::RegisterBenchmark( \ - "parition_flagged<" #T ", " #T_FLAG ">(" #SPLIT_T "%)", \ - &run_flagged, stream, static_cast(SPLIT_T), size \ -) - -#define CREATE_BENCHMARK_PREDICATE(T, SPLIT_T) \ -benchmark::RegisterBenchmark( \ - "parition_predicate<" #T ">(" #SPLIT_T "%)", \ - &run_predicate, stream, static_cast(SPLIT_T), size \ -) - -#define CREATE_BENCHMARK_THREEWAY(T, SMALL_T, LARGE_T) \ -benchmark::RegisterBenchmark( \ - "parition_three_way(Small Threshold:" #SMALL_T "%,Large Threshold:" #LARGE_T "%)", \ - &run_threeway, stream, static_cast(SMALL_T), static_cast(LARGE_T), size \ -) - -#define BENCHMARK_FLAGGED_TYPE(type, flag_type) \ - CREATE_BENCHMARK_FLAGGED(type, flag_type, 33), \ - CREATE_BENCHMARK_FLAGGED(type, flag_type, 50), \ - CREATE_BENCHMARK_FLAGGED(type, flag_type, 60), \ - CREATE_BENCHMARK_FLAGGED(type, flag_type, 90) - -#define BENCHMARK_PREDICATE_TYPE(type) \ - CREATE_BENCHMARK_PREDICATE(type, 33), \ - CREATE_BENCHMARK_PREDICATE(type, 50), \ - CREATE_BENCHMARK_PREDICATE(type, 60), \ - CREATE_BENCHMARK_PREDICATE(type, 90) - -#define BENCHMARK_THREEWAY_TYPE(type) \ - 
CREATE_BENCHMARK_THREEWAY(type, 33, 66), \ - CREATE_BENCHMARK_THREEWAY(type, 10, 66), \ - CREATE_BENCHMARK_THREEWAY(type, 50, 60), \ - CREATE_BENCHMARK_THREEWAY(type, 50, 90) - -int main(int argc, char *argv[]) +#define CREATE_BENCHMARK_FLAGGED(T, T_FLAG, SPLIT_T) \ + benchmark::RegisterBenchmark(std::string("device_parition_flagged.(split_threshold:" #SPLIT_T \ + "%)") \ + .c_str(), \ + &run_flagged, \ + stream, \ + static_cast(SPLIT_T), \ + size) + +#define CREATE_BENCHMARK_PREDICATE(T, SPLIT_T) \ + benchmark::RegisterBenchmark( \ + std::string("device_parition_predicate.(split_threshold:" #SPLIT_T "%)") \ + .c_str(), \ + &run_predicate, \ + stream, \ + static_cast(SPLIT_T), \ + size) + +#define CREATE_BENCHMARK_THREEWAY(T, SMALL_T, LARGE_T) \ + benchmark::RegisterBenchmark(std::string("device_parition_three_way" \ + ".(small_threshold:" #SMALL_T \ + "%,large_threshold:" #LARGE_T "%)") \ + .c_str(), \ + &run_threeway, \ + stream, \ + static_cast(SMALL_T), \ + static_cast(LARGE_T), \ + size) + +#define BENCHMARK_FLAGGED_TYPE(type, flag_type) \ + CREATE_BENCHMARK_FLAGGED(type, flag_type, 33), CREATE_BENCHMARK_FLAGGED(type, flag_type, 50), \ + CREATE_BENCHMARK_FLAGGED(type, flag_type, 60), \ + CREATE_BENCHMARK_FLAGGED(type, flag_type, 90) + +#define BENCHMARK_PREDICATE_TYPE(type) \ + CREATE_BENCHMARK_PREDICATE(type, 33), CREATE_BENCHMARK_PREDICATE(type, 50), \ + CREATE_BENCHMARK_PREDICATE(type, 60), CREATE_BENCHMARK_PREDICATE(type, 90) + +#define BENCHMARK_THREEWAY_TYPE(type) \ + CREATE_BENCHMARK_THREEWAY(type, 33, 66), CREATE_BENCHMARK_THREEWAY(type, 10, 66), \ + CREATE_BENCHMARK_THREEWAY(type, 50, 60), CREATE_BENCHMARK_THREEWAY(type, 50, 90) + +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -397,8 +379,8 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + 
const size_t size = parser.get("size"); + const int trials = parser.get("trials"); std::cout << "benchmark_device_partition" << std::endl; @@ -406,18 +388,17 @@ int main(int argc, char *argv[]) const hipStream_t stream = 0; // default { hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; } - using custom_float2 = benchmark_utils::custom_type; + using custom_float2 = benchmark_utils::custom_type; using custom_double2 = benchmark_utils::custom_type; // Add benchmarks - std::vector benchmarks = - { + std::vector benchmarks = { BENCHMARK_FLAGGED_TYPE(int8_t, unsigned char), BENCHMARK_FLAGGED_TYPE(int, unsigned char), BENCHMARK_FLAGGED_TYPE(float, unsigned char), diff --git a/benchmark/benchmark_device_radix_sort.cpp b/benchmark/benchmark_device_radix_sort.cpp index 386281ab..366e62d9 100644 --- a/benchmark/benchmark_device_radix_sort.cpp +++ b/benchmark/benchmark_device_radix_sort.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -28,12 +28,11 @@ // HIP API #include "hipcub/device/device_radix_sort.hpp" - #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -const unsigned int batch_size = 10; +const unsigned int batch_size = 10; const unsigned int warmup_size = 5; template @@ -43,16 +42,16 @@ std::vector generate_keys(size_t size) if(std::is_floating_point::value) { - return benchmark_utils::get_random_data(size, (key_type)-1000, (key_type)+1000, size); - } - else + return benchmark_utils::get_random_data(size, + (key_type)-1000, + (key_type) + 1000, + size); + } else { - return benchmark_utils::get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max(), - size - ); + return benchmark_utils::get_random_data(size, + std::numeric_limits::min(), + std::numeric_limits::max(), + size); } } @@ -132,25 +131,22 @@ auto invoke_sort_keys(void* d_temp_storage, } template -void run_sort_keys_benchmark(benchmark::State& state, - hipStream_t stream, - size_t size, +void run_sort_keys_benchmark(benchmark::State& state, + hipStream_t stream, + size_t size, std::shared_ptr> keys_input) { using key_type = Key; - key_type * d_keys_input; - key_type * d_keys_output; + key_type* d_keys_input; + key_type* d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); - HIP_CHECK( - hipMemcpy( - d_keys_input, keys_input->data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - void * d_temporary_storage = nullptr; + HIP_CHECK(hipMemcpy(d_keys_input, + keys_input->data(), + size * sizeof(key_type), + hipMemcpyHostToDevice)); + + void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(invoke_sort_keys(d_temporary_storage, temporary_storage_bytes, @@ -174,7 +170,7 @@ void 
run_sort_keys_benchmark(benchmark::State& state, } HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); @@ -190,8 +186,8 @@ void run_sort_keys_benchmark(benchmark::State& state, HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); @@ -294,12 +290,12 @@ auto invoke_sort_pairs(void* d_temp_storage, } template -void run_sort_pairs_benchmark(benchmark::State& state, - hipStream_t stream, - size_t size, +void run_sort_pairs_benchmark(benchmark::State& state, + hipStream_t stream, + size_t size, std::shared_ptr> keys_input) { - using key_type = Key; + using key_type = Key; using value_type = Value; std::vector values_input(size); for(size_t i = 0; i < size; i++) @@ -307,31 +303,25 @@ void run_sort_pairs_benchmark(benchmark::State& state, values_input[i] = value_type(i); } - key_type * d_keys_input; - key_type * d_keys_output; + key_type* d_keys_input; + key_type* d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); - HIP_CHECK( - hipMemcpy( - d_keys_input, keys_input->data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - value_type * d_values_input; - value_type * d_values_output; + HIP_CHECK(hipMemcpy(d_keys_input, + keys_input->data(), + size * sizeof(key_type), + hipMemcpyHostToDevice)); + + value_type* d_values_input; + value_type* d_values_output; HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); - HIP_CHECK( - hipMemcpy( - d_values_input, values_input.data(), - size * sizeof(value_type), - 
hipMemcpyHostToDevice - ) - ); - - void * d_temporary_storage = nullptr; + HIP_CHECK(hipMemcpy(d_values_input, + values_input.data(), + size * sizeof(value_type), + hipMemcpyHostToDevice)); + + void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(invoke_sort_pairs(d_temporary_storage, temporary_storage_bytes, @@ -359,7 +349,7 @@ void run_sort_pairs_benchmark(benchmark::State& state, } HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); @@ -377,13 +367,12 @@ void run_sort_pairs_benchmark(benchmark::State& state, HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } - state.SetBytesProcessed( - state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type)) - ); + state.SetBytesProcessed(state.iterations() * batch_size * size + * (sizeof(key_type) + sizeof(value_type))); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); @@ -393,45 +382,43 @@ void run_sort_pairs_benchmark(benchmark::State& state, HIP_CHECK(hipFree(d_values_output)); } - -#define CREATE_SORT_KEYS_BENCHMARK(Key) \ - { \ +#define CREATE_SORT_KEYS_BENCHMARK(Key) \ + { \ auto keys_input = std::make_shared>(generate_keys(size)); \ - benchmarks.push_back( \ - benchmark::RegisterBenchmark( \ - (std::string("sort_keys") + "").c_str(), \ - [=](benchmark::State& state) { run_sort_keys_benchmark(state, stream, size, keys_input); } \ - ) \ - ); \ - benchmarks.push_back( \ - benchmark::RegisterBenchmark( \ - (std::string("sort_keys") + "<" #Key ">, descending").c_str(), \ - [=](benchmark::State& state) { run_sort_keys_benchmark(state, stream, size, keys_input); } \ - ) \ - ); \ + 
benchmarks.push_back(benchmark::RegisterBenchmark( \ + std::string("device_radix_sort_keys_ascending" \ + ".") \ + .c_str(), \ + [=](benchmark::State& state) \ + { run_sort_keys_benchmark(state, stream, size, keys_input); })); \ + benchmarks.push_back(benchmark::RegisterBenchmark( \ + std::string("device_radix_sort_keys_descending" \ + ".") \ + .c_str(), \ + [=](benchmark::State& state) \ + { run_sort_keys_benchmark(state, stream, size, keys_input); })); \ } -#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value) \ - { \ - auto keys_input = std::make_shared>(generate_keys(size)); \ - benchmarks.push_back( \ - benchmark::RegisterBenchmark( \ - (std::string("sort_pairs") + "").c_str(), \ - [=](benchmark::State& state) { run_sort_pairs_benchmark(state, stream, size, keys_input); } \ - ) \ - ); \ - benchmarks.push_back( \ - benchmark::RegisterBenchmark( \ - (std::string("sort_pairs") + "<" #Key ", " #Value">, descending").c_str(), \ - [=](benchmark::State& state) { run_sort_pairs_benchmark(state, stream, size, keys_input); } \ - ) \ - ); \ +#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value) \ + { \ + auto keys_input = std::make_shared>(generate_keys(size)); \ + benchmarks.push_back(benchmark::RegisterBenchmark( \ + std::string("device_radix_sort_pairs_ascending" \ + ".") \ + .c_str(), \ + [=](benchmark::State& state) \ + { run_sort_pairs_benchmark(state, stream, size, keys_input); })); \ + benchmarks.push_back(benchmark::RegisterBenchmark( \ + std::string("device_radix_sort_pairs_descending" \ + ".") \ + .c_str(), \ + [=](benchmark::State& state) \ + { run_sort_pairs_benchmark(state, stream, size, keys_input); })); \ } - void add_sort_keys_benchmarks(std::vector& benchmarks, - hipStream_t stream, - size_t size) + hipStream_t stream, + size_t size) { using custom_int_t = benchmark_utils::custom_type; CREATE_SORT_KEYS_BENCHMARK(int) @@ -443,11 +430,11 @@ void add_sort_keys_benchmarks(std::vector& benc } void add_sort_pairs_benchmarks(std::vector& benchmarks, - hipStream_t stream, 
- size_t size) + hipStream_t stream, + size_t size) { - using custom_float2 = benchmark_utils::custom_type; - using custom_double2 = benchmark_utils::custom_type; + using custom_float2 = benchmark_utils::custom_type; + using custom_double2 = benchmark_utils::custom_type; using custom_char_double = benchmark_utils::custom_type; using custom_double_char = benchmark_utils::custom_type; using custom_int_t = benchmark_utils::custom_type; @@ -472,7 +459,7 @@ void add_sort_pairs_benchmarks(std::vector& ben CREATE_SORT_PAIRS_BENCHMARK(custom_int_t, float) } -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -481,15 +468,15 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); std::cout << "benchmark_device_radix_sort" << std::endl; // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; diff --git a/benchmark/benchmark_device_reduce.cpp b/benchmark/benchmark_device_reduce.cpp index a58ea8dc..2a4d9df7 100644 --- a/benchmark/benchmark_device_reduce.cpp +++ b/benchmark/benchmark_device_reduce.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -25,45 +25,34 @@ // HIP API #include "hipcub/device/device_reduce.hpp" - #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; #endif -const unsigned int batch_size = 10; +const unsigned int batch_size = 10; const unsigned int warmup_size = 5; -template< - class T, - class OutputT, - class ReduceKernel -> +template void run_benchmark(benchmark::State& state, - size_t size, + size_t size, const hipStream_t stream, - ReduceKernel reduce) + ReduceKernel reduce) { std::vector input = benchmark_utils::get_random_data(size, T(0), T(1000)); - T * d_input; - OutputT * d_output; + T* d_input; + OutputT* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, sizeof(OutputT))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // Allocate temporary storage memory size_t temp_storage_size_bytes = 0; - void * d_temp_storage = nullptr; + void* d_temp_storage = nullptr; // Get size of d_temp_storage HIP_CHECK(reduce(d_temp_storage, temp_storage_size_bytes, d_input, d_output, size, stream)); - HIP_CHECK(hipMalloc(&d_temp_storage,temp_storage_size_bytes)); + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); for(size_t i = 0; i < warmup_size; i++) { @@ -83,8 +72,8 @@ void run_benchmark(benchmark::State& state, HIP_CHECK(hipStreamSynchronize(stream)); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * 
sizeof(T)); @@ -99,7 +88,8 @@ template struct Benchmark; template -struct Benchmark { +struct Benchmark +{ static void run(benchmark::State& state, size_t size, const hipStream_t stream) { hipError_t (*ptr_to_sum)(void*, size_t&, T*, T*, int, hipStream_t) @@ -109,7 +99,8 @@ struct Benchmark { }; template -struct Benchmark { +struct Benchmark +{ static void run(benchmark::State& state, size_t size, const hipStream_t stream) { hipError_t (*ptr_to_min)(void*, size_t&, T*, T*, int, hipStream_t) @@ -119,10 +110,11 @@ struct Benchmark { }; template -struct Benchmark { +struct Benchmark +{ using Difference = int; - using Iterator = typename hipcub::ArgIndexInputIterator; - using KeyValue = typename Iterator::value_type; + using Iterator = typename hipcub::ArgIndexInputIterator; + using KeyValue = typename Iterator::value_type; static void run(benchmark::State& state, size_t size, const hipStream_t stream) { @@ -132,20 +124,20 @@ struct Benchmark { } }; -#define CREATE_BENCHMARK(T, REDUCE_OP) \ -benchmark::RegisterBenchmark( \ - ("reduce"), \ - &Benchmark::run, size, stream \ -) +#define CREATE_BENCHMARK(T, REDUCE_OP) \ + benchmark::RegisterBenchmark(std::string("device_reduce" \ + ".") \ + .c_str(), \ + &Benchmark::run, \ + size, \ + stream) -#define CREATE_BENCHMARKS(REDUCE_OP) \ - CREATE_BENCHMARK(int, REDUCE_OP), \ - CREATE_BENCHMARK(long long, REDUCE_OP), \ - CREATE_BENCHMARK(float, REDUCE_OP), \ - CREATE_BENCHMARK(double, REDUCE_OP), \ - CREATE_BENCHMARK(int8_t, REDUCE_OP) +#define CREATE_BENCHMARKS(REDUCE_OP) \ + CREATE_BENCHMARK(int, REDUCE_OP), CREATE_BENCHMARK(long long, REDUCE_OP), \ + CREATE_BENCHMARK(float, REDUCE_OP), CREATE_BENCHMARK(double, REDUCE_OP), \ + CREATE_BENCHMARK(int8_t, REDUCE_OP) -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -154,15 +146,15 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, 
argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); std::cout << "benchmark_device_reduce" << std::endl; // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; @@ -170,18 +162,17 @@ int main(int argc, char *argv[]) using custom_double2 = benchmark_utils::custom_type; // Add benchmarks - std::vector benchmarks = - { + std::vector benchmarks = { CREATE_BENCHMARKS(hipcub::Sum), CREATE_BENCHMARK(custom_double2, hipcub::Sum), CREATE_BENCHMARKS(hipcub::Min), - #ifdef HIPCUB_ROCPRIM_API +#ifdef HIPCUB_ROCPRIM_API CREATE_BENCHMARK(custom_double2, hipcub::Min), - #endif +#endif CREATE_BENCHMARKS(hipcub::ArgMin), - #ifdef HIPCUB_ROCPRIM_API +#ifdef HIPCUB_ROCPRIM_API CREATE_BENCHMARK(custom_double2, hipcub::ArgMin), - #endif +#endif }; // Use manual timing diff --git a/benchmark/benchmark_device_reduce_by_key.cpp b/benchmark/benchmark_device_reduce_by_key.cpp index 7437383f..54209e65 100644 --- a/benchmark/benchmark_device_reduce_by_key.cpp +++ b/benchmark/benchmark_device_reduce_by_key.cpp @@ -2,15 +2,15 @@ // // Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. 
// - // Permission is hereby granted, free of charge, to any person obtaining a copy +// Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -20,8 +20,8 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
-// CUB's implementation of single_pass_scan_operators has maybe uninitialized parameters, -// disable the warning because all warnings are threated as errors: +// CUB's implementation of single_pass_scan_operators has maybe uninitialized +// parameters, disable the warning because all warnings are threated as errors: #ifdef __HIP_PLATFORM_NVIDIA__ #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif @@ -35,25 +35,30 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -const unsigned int batch_size = 10; +const unsigned int batch_size = 10; const unsigned int warmup_size = 5; template -void run_benchmark(benchmark::State& state, size_t max_length, hipStream_t stream, size_t size, BinaryFunction reduce_op) +void run_benchmark(benchmark::State& state, + size_t max_length, + hipStream_t stream, + size_t size, + BinaryFunction reduce_op) { - using key_type = Key; + using key_type = Key; using value_type = Value; // Generate data std::vector keys_input(size); - unsigned int unique_count = 0; - std::vector key_counts = benchmark_utils::get_random_data(100000, 1, max_length); + unsigned int unique_count = 0; + std::vector key_counts + = benchmark_utils::get_random_data(100000, 1, max_length); size_t offset = 0; while(offset < size) { const size_t key_count = key_counts[unique_count % key_counts.size()]; - const size_t end = std::min(size, offset + key_count); + const size_t end = std::min(size, offset + key_count); for(size_t i = offset; i < end; i++) { keys_input[i] = unique_count; @@ -66,46 +71,38 @@ void run_benchmark(benchmark::State& state, size_t max_length, hipStream_t strea std::vector values_input(size); std::iota(values_input.begin(), values_input.end(), 0); - key_type * d_keys_input; + key_type* d_keys_input; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK( - hipMemcpy( - d_keys_input, keys_input.data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - value_type * d_values_input; + hipMemcpy(d_keys_input, 
keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); + + value_type* d_values_input; HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); - HIP_CHECK( - hipMemcpy( - d_values_input, values_input.data(), - size * sizeof(value_type), - hipMemcpyHostToDevice - ) - ); - - key_type * d_unique_output; - value_type * d_aggregates_output; - unsigned int * d_unique_count_output; + HIP_CHECK(hipMemcpy(d_values_input, + values_input.data(), + size * sizeof(value_type), + hipMemcpyHostToDevice)); + + key_type* d_unique_output; + value_type* d_aggregates_output; + unsigned int* d_unique_count_output; HIP_CHECK(hipMalloc(&d_unique_output, unique_count * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_aggregates_output, unique_count * sizeof(value_type))); HIP_CHECK(hipMalloc(&d_unique_count_output, sizeof(unsigned int))); - void * d_temporary_storage = nullptr; + void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; - HIP_CHECK( - hipcub::DeviceReduce::ReduceByKey( - nullptr, temporary_storage_bytes, - d_keys_input, d_unique_output, d_values_input, - d_aggregates_output, - d_unique_count_output, - reduce_op, size, - stream - ) - ); + HIP_CHECK(hipcub::DeviceReduce::ReduceByKey(nullptr, + temporary_storage_bytes, + d_keys_input, + d_unique_output, + d_values_input, + d_aggregates_output, + d_unique_count_output, + reduce_op, + size, + stream)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); @@ -113,44 +110,45 @@ void run_benchmark(benchmark::State& state, size_t max_length, hipStream_t strea // Warm-up for(size_t i = 0; i < warmup_size; i++) { - HIP_CHECK( - hipcub::DeviceReduce::ReduceByKey( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, - d_unique_output, d_values_input, d_aggregates_output, - d_unique_count_output, - reduce_op, size, - stream - ) - ); + HIP_CHECK(hipcub::DeviceReduce::ReduceByKey(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + 
d_unique_output, + d_values_input, + d_aggregates_output, + d_unique_count_output, + reduce_op, + size, + stream)); } HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { - HIP_CHECK( - hipcub::DeviceReduce::ReduceByKey( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, - d_unique_output, d_values_input, d_aggregates_output, - d_unique_count_output, - reduce_op, size, - stream - ) - ); + HIP_CHECK(hipcub::DeviceReduce::ReduceByKey(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_unique_output, + d_values_input, + d_aggregates_output, + d_unique_count_output, + reduce_op, + size, + stream)); } HIP_CHECK(hipStreamSynchronize(stream)); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } - state.SetBytesProcessed(state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type))); + state.SetBytesProcessed(state.iterations() * batch_size * size + * (sizeof(key_type) + sizeof(value_type))); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); @@ -161,44 +159,46 @@ void run_benchmark(benchmark::State& state, size_t max_length, hipStream_t strea HIP_CHECK(hipFree(d_unique_count_output)); } -#define CREATE_BENCHMARK(Key, Value, REDUCE_OP) \ -benchmark::RegisterBenchmark( \ - (std::string("reduce_by_key") + "" + \ - "(Random Number Range:[1, " + std::to_string(max_length) + "])" \ - ).c_str(), \ - &run_benchmark, \ - max_length, stream, size, REDUCE_OP() \ -) - -#define CREATE_BENCHMARKS(REDUCE_OP) \ - CREATE_BENCHMARK(int, float, REDUCE_OP), \ - CREATE_BENCHMARK(int, double, REDUCE_OP), \ - CREATE_BENCHMARK(int, custom_double2, REDUCE_OP), \ - 
CREATE_BENCHMARK(int8_t, int8_t, REDUCE_OP), \ - CREATE_BENCHMARK(long long, float, REDUCE_OP), \ - CREATE_BENCHMARK(long long, double, REDUCE_OP) - -void add_benchmarks(size_t max_length, +#define CREATE_BENCHMARK(Key, Value, REDUCE_OP) \ + benchmark::RegisterBenchmark(std::string("device_reduce_by_key" \ + "." \ + "(random_number_range:[1, " \ + + std::to_string(max_length) + "])") \ + .c_str(), \ + &run_benchmark, \ + max_length, \ + stream, \ + size, \ + REDUCE_OP()) + +#define CREATE_BENCHMARKS(REDUCE_OP) \ + CREATE_BENCHMARK(int, float, REDUCE_OP), CREATE_BENCHMARK(int, double, REDUCE_OP), \ + CREATE_BENCHMARK(int, custom_double2, REDUCE_OP), \ + CREATE_BENCHMARK(int8_t, int8_t, REDUCE_OP), \ + CREATE_BENCHMARK(long long, float, REDUCE_OP), \ + CREATE_BENCHMARK(long long, double, REDUCE_OP) + +void add_benchmarks(size_t max_length, std::vector& benchmarks, - hipStream_t stream, - size_t size) + hipStream_t stream, + size_t size) { using custom_double2 = benchmark_utils::custom_type; - std::vector bs = - { + std::vector bs = { CREATE_BENCHMARKS(hipcub::Sum), CREATE_BENCHMARK(long long, custom_double2, hipcub::Sum), CREATE_BENCHMARKS(hipcub::Min), - #ifdef HIPCUB_ROCPRIM_API +#ifdef HIPCUB_ROCPRIM_API CREATE_BENCHMARK(long long, custom_double2, hipcub::Min), - #endif +#endif }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -207,15 +207,15 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); std::cout << "benchmark_device_reduce_by_key" << std::endl; // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + 
int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; diff --git a/benchmark/benchmark_device_run_length_encode.cpp b/benchmark/benchmark_device_run_length_encode.cpp index 8a20433d..267185c7 100644 --- a/benchmark/benchmark_device_run_length_encode.cpp +++ b/benchmark/benchmark_device_run_length_encode.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -31,27 +31,30 @@ // HIP API #include "hipcub/device/device_run_length_encode.hpp" - #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif template -void run_encode_benchmark(benchmark::State& state, size_t max_length, hipStream_t stream, size_t size) +void run_encode_benchmark(benchmark::State& state, + size_t max_length, + hipStream_t stream, + size_t size) { - using key_type = T; + using key_type = T; using count_type = unsigned int; // Generate data std::vector input(size); - unsigned int runs_count = 0; - std::vector key_counts = benchmark_utils::get_random_data(100000, 1, max_length); + unsigned int runs_count = 0; + std::vector key_counts + = benchmark_utils::get_random_data(100000, 1, max_length); size_t offset = 0; while(offset < size) { const size_t key_count = key_counts[runs_count % key_counts.size()]; - const size_t end = std::min(size, offset + key_count); + const size_t end = std::min(size, offset + key_count); for(size_t i = offset; i < end; 
i++) { input[i] = runs_count; @@ -61,24 +64,18 @@ void run_encode_benchmark(benchmark::State& state, size_t max_length, hipStream_ offset += key_count; } - key_type * d_input; + key_type* d_input; HIP_CHECK(hipMalloc(&d_input, size * sizeof(key_type))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - key_type * d_unique_output; - count_type * d_counts_output; - count_type * d_runs_count_output; + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); + + key_type* d_unique_output; + count_type* d_counts_output; + count_type* d_runs_count_output; HIP_CHECK(hipMalloc(&d_unique_output, runs_count * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_counts_output, runs_count * sizeof(count_type))); HIP_CHECK(hipMalloc(&d_runs_count_output, sizeof(count_type))); - void * d_temporary_storage = nullptr; + void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(hipcub::DeviceRunLengthEncode::Encode(nullptr, @@ -108,7 +105,7 @@ void run_encode_benchmark(benchmark::State& state, size_t max_length, hipStream_ HIP_CHECK(hipDeviceSynchronize()); const unsigned int batch_size = 10; - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); @@ -126,8 +123,8 @@ void run_encode_benchmark(benchmark::State& state, size_t max_length, hipStream_ HIP_CHECK(hipStreamSynchronize(stream)); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); @@ -141,22 +138,26 @@ void run_encode_benchmark(benchmark::State& state, size_t max_length, hipStream_ } template -void run_non_trivial_runs_benchmark(benchmark::State& state, size_t max_length, hipStream_t stream, 
size_t size) +void run_non_trivial_runs_benchmark(benchmark::State& state, + size_t max_length, + hipStream_t stream, + size_t size) { - using key_type = T; + using key_type = T; using offset_type = unsigned int; - using count_type = unsigned int; + using count_type = unsigned int; // Generate data std::vector input(size); - unsigned int runs_count = 0; - std::vector key_counts = benchmark_utils::get_random_data(100000, 1, max_length); + unsigned int runs_count = 0; + std::vector key_counts + = benchmark_utils::get_random_data(100000, 1, max_length); size_t offset = 0; while(offset < size) { const size_t key_count = key_counts[runs_count % key_counts.size()]; - const size_t end = std::min(size, offset + key_count); + const size_t end = std::min(size, offset + key_count); for(size_t i = offset; i < end; i++) { input[i] = runs_count; @@ -166,24 +167,18 @@ void run_non_trivial_runs_benchmark(benchmark::State& state, size_t max_length, offset += key_count; } - key_type * d_input; + key_type* d_input; HIP_CHECK(hipMalloc(&d_input, size * sizeof(key_type))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - offset_type * d_offsets_output; - count_type * d_counts_output; - count_type * d_runs_count_output; + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); + + offset_type* d_offsets_output; + count_type* d_counts_output; + count_type* d_runs_count_output; HIP_CHECK(hipMalloc(&d_offsets_output, runs_count * sizeof(offset_type))); HIP_CHECK(hipMalloc(&d_counts_output, runs_count * sizeof(count_type))); HIP_CHECK(hipMalloc(&d_runs_count_output, sizeof(count_type))); - void * d_temporary_storage = nullptr; + void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(hipcub::DeviceRunLengthEncode::NonTrivialRuns(nullptr, @@ -213,7 +208,7 @@ void run_non_trivial_runs_benchmark(benchmark::State& state, size_t max_length, 
HIP_CHECK(hipDeviceSynchronize()); const unsigned int batch_size = 10; - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); @@ -231,8 +226,8 @@ void run_non_trivial_runs_benchmark(benchmark::State& state, size_t max_length, HIP_CHECK(hipStreamSynchronize(stream)); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); @@ -245,25 +240,26 @@ void run_non_trivial_runs_benchmark(benchmark::State& state, size_t max_length, HIP_CHECK(hipFree(d_runs_count_output)); } -#define CREATE_ENCODE_BENCHMARK(T) \ -benchmark::RegisterBenchmark( \ - (std::string("run_length_encode") + "" + \ - "(Random Number Range:[1, " + std::to_string(max_length) + "])" \ - ).c_str(), \ - &run_encode_benchmark, \ - max_length, stream, size \ -) - -void add_encode_benchmarks(size_t max_length, +#define CREATE_ENCODE_BENCHMARK(T) \ + benchmark::RegisterBenchmark(std::string("device_run_length_encode" \ + "." 
\ + "(random_number_range:[1, " \ + + std::to_string(max_length) + "])") \ + .c_str(), \ + &run_encode_benchmark, \ + max_length, \ + stream, \ + size) + +void add_encode_benchmarks(size_t max_length, std::vector& benchmarks, - hipStream_t stream, - size_t size) + hipStream_t stream, + size_t size) { - using custom_float2 = benchmark_utils::custom_type; + using custom_float2 = benchmark_utils::custom_type; using custom_double2 = benchmark_utils::custom_type; - std::vector bs = - { + std::vector bs = { CREATE_ENCODE_BENCHMARK(int), CREATE_ENCODE_BENCHMARK(long long), @@ -277,25 +273,26 @@ void add_encode_benchmarks(size_t max_length, benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -#define CREATE_NON_TRIVIAL_RUNS_BENCHMARK(T) \ -benchmark::RegisterBenchmark( \ - (std::string("run_length_encode_non_trivial_runs") + "" + \ - "(Random Number Range:[1, " + std::to_string(max_length) + "])" \ - ).c_str(), \ - &run_non_trivial_runs_benchmark, \ - max_length, stream, size \ -) - -void add_non_trivial_runs_benchmarks(size_t max_length, +#define CREATE_NON_TRIVIAL_RUNS_BENCHMARK(T) \ + benchmark::RegisterBenchmark(std::string("run_length_encode_non_trivial_runs" \ + "" \ + "(random_number_range:[1, " \ + + std::to_string(max_length) + "])") \ + .c_str(), \ + &run_non_trivial_runs_benchmark, \ + max_length, \ + stream, \ + size) + +void add_non_trivial_runs_benchmarks(size_t max_length, std::vector& benchmarks, - hipStream_t stream, - size_t size) + hipStream_t stream, + size_t size) { - using custom_float2 = benchmark_utils::custom_type; + using custom_float2 = benchmark_utils::custom_type; using custom_double2 = benchmark_utils::custom_type; - std::vector bs = - { + std::vector bs = { CREATE_NON_TRIVIAL_RUNS_BENCHMARK(int), CREATE_NON_TRIVIAL_RUNS_BENCHMARK(long long), @@ -309,7 +306,7 @@ void add_non_trivial_runs_benchmarks(size_t max_length, benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) +int main(int argc, 
char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -318,15 +315,15 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); std::cout << "benchmark_device_run_length_encode" << std::endl; // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; diff --git a/benchmark/benchmark_device_scan.cpp b/benchmark/benchmark_device_scan.cpp index 897f5eec..dbfdda6a 100644 --- a/benchmark/benchmark_device_scan.cpp +++ b/benchmark/benchmark_device_scan.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -20,8 +20,8 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE -// CUB's implementation of single_pass_scan_operators has maybe uninitialized parameters, -// disable the warning because all warnings are threated as errors: +// CUB's implementation of single_pass_scan_operators has maybe uninitialized +// parameters, disable the warning because all warnings are threated as errors: #ifdef __HIP_PLATFORM_NVIDIA__ #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif @@ -31,7 +31,6 @@ // HIP API #include "hipcub/device/device_scan.hpp" - #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif @@ -68,7 +67,7 @@ auto run_device_scan(void* temporary_storage, const hipStream_t stream) -> typename std::enable_if::type { - (void) initial_value; + (void)initial_value; return hipcub::DeviceScan::InclusiveScan(temporary_storage, storage_size, input, @@ -125,55 +124,47 @@ auto run_device_scan_by_key(void* temporary_storage, stream); } -template< - bool Exclusive, - class T, - class BinaryFunction -> +template void run_benchmark(benchmark::State& state, - size_t size, + size_t size, const hipStream_t stream, - BinaryFunction scan_op) + BinaryFunction scan_op) { - std::vector input = benchmark_utils::get_random_data(size, T(0), T(1000)); - T initial_value = T(123); - T * d_input; - T * d_output; + std::vector input = benchmark_utils::get_random_data(size, T(0), T(1000)); + T initial_value = T(123); + T* d_input; + T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // Allocate 
temporary storage memory size_t temp_storage_size_bytes = 0; - void * d_temp_storage = nullptr; + void* d_temp_storage = nullptr; // Get size of d_temp_storage - HIP_CHECK(( - run_device_scan( - d_temp_storage, temp_storage_size_bytes, - d_input, d_output, initial_value, size, - scan_op, stream - ) - )); - HIP_CHECK(hipMalloc(&d_temp_storage,temp_storage_size_bytes)); + HIP_CHECK((run_device_scan(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + initial_value, + size, + scan_op, + stream))); + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < 5; i++) { - HIP_CHECK(( - run_device_scan( - d_temp_storage, temp_storage_size_bytes, - d_input, d_output, initial_value, size, - scan_op, stream - ) - )); + HIP_CHECK((run_device_scan(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + initial_value, + size, + scan_op, + stream))); } HIP_CHECK(hipDeviceSynchronize()); @@ -183,19 +174,20 @@ void run_benchmark(benchmark::State& state, auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { - HIP_CHECK(( - run_device_scan( - d_temp_storage, temp_storage_size_bytes, - d_input, d_output, initial_value, size, - scan_op, stream - ) - )); + HIP_CHECK((run_device_scan(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + initial_value, + size, + scan_op, + stream))); } HIP_CHECK(hipStreamSynchronize(stream)); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); @@ -206,70 +198,59 @@ void run_benchmark(benchmark::State& state, HIP_CHECK(hipFree(d_temp_storage)); } -template< - bool Exclusive, - class T, - class BinaryFunction -> +template void 
run_benchmark_by_key(benchmark::State& state, - size_t size, + size_t size, const hipStream_t stream, - BinaryFunction scan_op) + BinaryFunction scan_op) { - using key_type = int; + using key_type = int; constexpr size_t max_segment_length = 100; - const std::vector keys = benchmark_utils::get_random_segments( - size, max_segment_length, std::random_device{}() - ); - const std::vector input = benchmark_utils::get_random_data(size, T(0), T(1000)); - const T initial_value = T(123); - key_type * d_keys; - T * d_input; - T * d_output; + const std::vector keys + = benchmark_utils::get_random_segments(size, + max_segment_length, + std::random_device{}()); + const std::vector input = benchmark_utils::get_random_data(size, T(0), T(1000)); + const T initial_value = T(123); + key_type* d_keys; + T* d_input; + T* d_output; HIP_CHECK(hipMalloc(&d_keys, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_keys, keys.data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_keys, keys.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // Allocate temporary storage memory size_t temp_storage_size_bytes = 0; - void * d_temp_storage = nullptr; + void* d_temp_storage = nullptr; // Get size of d_temp_storage - HIP_CHECK(( - run_device_scan_by_key( - d_temp_storage, temp_storage_size_bytes, - d_keys, d_input, d_output, initial_value, - size, scan_op, stream - ) - )); - HIP_CHECK(hipMalloc(&d_temp_storage,temp_storage_size_bytes)); + HIP_CHECK((run_device_scan_by_key(d_temp_storage, + temp_storage_size_bytes, + d_keys, + d_input, + d_output, + initial_value, + size, + scan_op, + stream))); + 
HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < 5; i++) { - HIP_CHECK(( - run_device_scan_by_key( - d_temp_storage, temp_storage_size_bytes, - d_keys, d_input, d_output, initial_value, - size, scan_op, stream - ) - )); + HIP_CHECK((run_device_scan_by_key(d_temp_storage, + temp_storage_size_bytes, + d_keys, + d_input, + d_output, + initial_value, + size, + scan_op, + stream))); } HIP_CHECK(hipDeviceSynchronize()); @@ -279,19 +260,21 @@ void run_benchmark_by_key(benchmark::State& state, auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { - HIP_CHECK(( - run_device_scan_by_key( - d_temp_storage, temp_storage_size_bytes, - d_keys, d_input, d_output, initial_value, - size, scan_op, stream - ) - )); + HIP_CHECK((run_device_scan_by_key(d_temp_storage, + temp_storage_size_bytes, + d_keys, + d_input, + d_output, + initial_value, + size, + scan_op, + stream))); } HIP_CHECK(hipStreamSynchronize(stream)); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); @@ -303,37 +286,38 @@ void run_benchmark_by_key(benchmark::State& state, HIP_CHECK(hipFree(d_temp_storage)); } -#define CREATE_BENCHMARK(EXCL, T, SCAN_OP) \ -benchmark::RegisterBenchmark( \ - (std::string(EXCL ? "exclusive_scan" : "inclusive_scan") + \ - ("")).c_str(), \ - &run_benchmark, size, stream, SCAN_OP() \ -), \ -benchmark::RegisterBenchmark( \ - (std::string(EXCL ? 
"exclusive_scan_by_key" : "inclusive_scan_by_key") + \ - ("")).c_str(), \ - &run_benchmark_by_key, size, stream, SCAN_OP() \ -) - -#define CREATE_BENCHMARKS(SCAN_OP) \ - CREATE_BENCHMARK(false, int, SCAN_OP), \ - CREATE_BENCHMARK(true, int, SCAN_OP), \ - CREATE_BENCHMARK(false, float, SCAN_OP), \ - CREATE_BENCHMARK(true, float, SCAN_OP), \ - CREATE_BENCHMARK(false, double, SCAN_OP), \ - CREATE_BENCHMARK(true, double, SCAN_OP), \ - CREATE_BENCHMARK(false, long long, SCAN_OP), \ - CREATE_BENCHMARK(true, long long, SCAN_OP), \ - CREATE_BENCHMARK(false, custom_float2, SCAN_OP), \ - CREATE_BENCHMARK(true, custom_float2, SCAN_OP), \ - CREATE_BENCHMARK(false, custom_double2, SCAN_OP), \ - CREATE_BENCHMARK(true, custom_double2, SCAN_OP), \ - CREATE_BENCHMARK(false, int8_t, SCAN_OP), \ - CREATE_BENCHMARK(true, int8_t, SCAN_OP), \ - CREATE_BENCHMARK(false, uint8_t, SCAN_OP), \ - CREATE_BENCHMARK(true, uint8_t, SCAN_OP) - -int main(int argc, char *argv[]) +#define CREATE_BENCHMARK(EXCL, T, SCAN_OP) \ + benchmark::RegisterBenchmark( \ + std::string(std::string(EXCL ? "device_exclusive_scan" : "device_inclusive_scan") \ + + ".") \ + .c_str(), \ + &run_benchmark, \ + size, \ + stream, \ + SCAN_OP()), \ + benchmark::RegisterBenchmark( \ + std::string(std::string(EXCL ? 
"device_exclusive_scan_by_key" \ + : "device_inclusive_scan_by_key") \ + + ".") \ + .c_str(), \ + &run_benchmark_by_key, \ + size, \ + stream, \ + SCAN_OP()) + +#define CREATE_BENCHMARKS(SCAN_OP) \ + CREATE_BENCHMARK(false, int, SCAN_OP), CREATE_BENCHMARK(true, int, SCAN_OP), \ + CREATE_BENCHMARK(false, float, SCAN_OP), CREATE_BENCHMARK(true, float, SCAN_OP), \ + CREATE_BENCHMARK(false, double, SCAN_OP), CREATE_BENCHMARK(true, double, SCAN_OP), \ + CREATE_BENCHMARK(false, long long, SCAN_OP), CREATE_BENCHMARK(true, long long, SCAN_OP), \ + CREATE_BENCHMARK(false, custom_float2, SCAN_OP), \ + CREATE_BENCHMARK(true, custom_float2, SCAN_OP), \ + CREATE_BENCHMARK(false, custom_double2, SCAN_OP), \ + CREATE_BENCHMARK(true, custom_double2, SCAN_OP), CREATE_BENCHMARK(false, int8_t, SCAN_OP), \ + CREATE_BENCHMARK(true, int8_t, SCAN_OP), CREATE_BENCHMARK(false, uint8_t, SCAN_OP), \ + CREATE_BENCHMARK(true, uint8_t, SCAN_OP) + +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -342,29 +326,29 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); std::cout << "benchmark_device_scan" << std::endl; // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; using custom_double2 = benchmark_utils::custom_type; - using custom_float2 = benchmark_utils::custom_type; + using custom_float2 = benchmark_utils::custom_type; - // Compilation may never finish, if the compiler needs to compile too many kernels, - // it is recommended to compile benchmarks only for 
1-2 types when BENCHMARK_CONFIG_TUNING is used - // (all other CREATE_*_BENCHMARK should be commented/removed). + // Compilation may never finish, if the compiler needs to compile too many + // kernels, it is recommended to compile benchmarks only for 1-2 types when + // BENCHMARK_CONFIG_TUNING is used (all other CREATE_*_BENCHMARK should be + // commented/removed). // Add benchmarks - std::vector benchmarks = - { + std::vector benchmarks = { CREATE_BENCHMARKS(hipcub::Sum), CREATE_BENCHMARKS(hipcub::Min), }; diff --git a/benchmark/benchmark_device_segmented_radix_sort.cpp b/benchmark/benchmark_device_segmented_radix_sort.cpp index 65b8d116..ad7f3075 100644 --- a/benchmark/benchmark_device_segmented_radix_sort.cpp +++ b/benchmark/benchmark_device_segmented_radix_sort.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -25,26 +25,25 @@ // HIP API #include "hipcub/hipcub.hpp" - #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -const unsigned int batch_size = 4; +const unsigned int batch_size = 4; const unsigned int warmup_size = 2; -constexpr bool Ascending = false; +constexpr bool Ascending = false; constexpr bool Descending = true; template void run_sort_keys_benchmark(benchmark::State& state, - size_t desired_segments, - hipStream_t stream, - size_t size, - bool descending = false) + size_t desired_segments, + hipStream_t stream, + size_t size, + bool descending = false) { using offset_type = int; - using key_type = Key; + using key_type = Key; typedef hipError_t (*sort_func)(void*, size_t&, const key_type*, @@ -57,11 +56,10 @@ void run_sort_keys_benchmark(benchmark::State& state, int, hipStream_t); - sort_func func_ascending = &hipcub::DeviceSegmentedRadixSort::SortKeys - ; - sort_func func_descending = &hipcub::DeviceSegmentedRadixSort::SortKeysDescending - ; - + sort_func func_ascending = &hipcub::DeviceSegmentedRadixSort::SortKeys; + sort_func func_descending + = &hipcub::DeviceSegmentedRadixSort::SortKeysDescending; + sort_func sorting = descending ? 
func_descending : func_ascending; // Generate data @@ -69,13 +67,13 @@ void run_sort_keys_benchmark(benchmark::State& state, const double avg_segment_length = static_cast(size) / desired_segments; - const unsigned int seed = 123; + const unsigned int seed = 123; std::default_random_engine gen(seed); std::uniform_real_distribution segment_length_dis(0, avg_segment_length * 2); unsigned int segments_count = 0; - size_t offset = 0; + size_t offset = 0; while(offset < size) { const size_t segment_length = std::round(segment_length_dis(gen)); @@ -88,41 +86,31 @@ void run_sort_keys_benchmark(benchmark::State& state, std::vector keys_input; if(std::is_floating_point::value) { - keys_input = benchmark_utils::get_random_data( - size, (key_type)-1000, (key_type)+1000); - } - else + keys_input + = benchmark_utils::get_random_data(size, (key_type)-1000, (key_type) + 1000); + } else { - keys_input = benchmark_utils::get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max() - ); + keys_input + = benchmark_utils::get_random_data(size, + std::numeric_limits::min(), + std::numeric_limits::max()); } - offset_type * d_offsets; + offset_type* d_offsets; HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); - HIP_CHECK( - hipMemcpy( - d_offsets, offsets.data(), - (segments_count + 1) * sizeof(offset_type), - hipMemcpyHostToDevice - ) - ); - - key_type * d_keys_input; - key_type * d_keys_output; + HIP_CHECK(hipMemcpy(d_offsets, + offsets.data(), + (segments_count + 1) * sizeof(offset_type), + hipMemcpyHostToDevice)); + + key_type* d_keys_input; + key_type* d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); HIP_CHECK( - hipMemcpy( - d_keys_input, keys_input.data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - void * d_temporary_storage = nullptr; + hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), 
hipMemcpyHostToDevice)); + + void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, @@ -156,7 +144,7 @@ void run_sort_keys_benchmark(benchmark::State& state, } HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); @@ -177,8 +165,8 @@ void run_sort_keys_benchmark(benchmark::State& state, HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); @@ -192,14 +180,14 @@ void run_sort_keys_benchmark(benchmark::State& state, template void run_sort_pairs_benchmark(benchmark::State& state, - size_t desired_segments, - hipStream_t stream, - size_t size, - bool descending = false) + size_t desired_segments, + hipStream_t stream, + size_t size, + bool descending = false) { using offset_type = int; - using key_type = Key; - using value_type = Value; + using key_type = Key; + using value_type = Value; typedef hipError_t (*sort_func)(void*, size_t&, const key_type*, @@ -214,10 +202,10 @@ void run_sort_pairs_benchmark(benchmark::State& state, int, hipStream_t); - sort_func func_ascending = &hipcub::DeviceSegmentedRadixSort::SortPairs - ; - sort_func func_descending = &hipcub::DeviceSegmentedRadixSort::SortPairsDescending - ; + sort_func func_ascending + = &hipcub::DeviceSegmentedRadixSort::SortPairs; + sort_func func_descending = &hipcub::DeviceSegmentedRadixSort:: + SortPairsDescending; sort_func sorting = descending ? 
func_descending : func_ascending; @@ -226,13 +214,13 @@ void run_sort_pairs_benchmark(benchmark::State& state, const double avg_segment_length = static_cast(size) / desired_segments; - const unsigned int seed = 123; + const unsigned int seed = 123; std::default_random_engine gen(seed); std::uniform_real_distribution segment_length_dis(0, avg_segment_length * 2); unsigned int segments_count = 0; - size_t offset = 0; + size_t offset = 0; while(offset < size) { const size_t segment_length = std::round(segment_length_dis(gen)); @@ -245,56 +233,43 @@ void run_sort_pairs_benchmark(benchmark::State& state, std::vector keys_input; if(std::is_floating_point::value) { - keys_input = benchmark_utils::get_random_data( - size, (key_type)-1000, (key_type)+1000); - } - else + keys_input + = benchmark_utils::get_random_data(size, (key_type)-1000, (key_type) + 1000); + } else { - keys_input = benchmark_utils::get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max() - ); + keys_input + = benchmark_utils::get_random_data(size, + std::numeric_limits::min(), + std::numeric_limits::max()); } std::vector values_input(size); std::iota(values_input.begin(), values_input.end(), 0); - offset_type * d_offsets; + offset_type* d_offsets; HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); - HIP_CHECK( - hipMemcpy( - d_offsets, offsets.data(), - (segments_count + 1) * sizeof(offset_type), - hipMemcpyHostToDevice - ) - ); - - key_type * d_keys_input; - key_type * d_keys_output; + HIP_CHECK(hipMemcpy(d_offsets, + offsets.data(), + (segments_count + 1) * sizeof(offset_type), + hipMemcpyHostToDevice)); + + key_type* d_keys_input; + key_type* d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); HIP_CHECK( - hipMemcpy( - d_keys_input, keys_input.data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - value_type * d_values_input; - value_type * 
d_values_output; + hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); + + value_type* d_values_input; + value_type* d_values_output; HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); - HIP_CHECK( - hipMemcpy( - d_values_input, values_input.data(), - size * sizeof(value_type), - hipMemcpyHostToDevice - ) - ); - - void * d_temporary_storage = nullptr; + HIP_CHECK(hipMemcpy(d_values_input, + values_input.data(), + size * sizeof(value_type), + hipMemcpyHostToDevice)); + + void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, @@ -332,7 +307,7 @@ void run_sort_pairs_benchmark(benchmark::State& state, } HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); @@ -355,13 +330,12 @@ void run_sort_pairs_benchmark(benchmark::State& state, HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } - state.SetBytesProcessed( - state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type)) - ); + state.SetBytesProcessed(state.iterations() * batch_size * size + * (sizeof(key_type) + sizeof(value_type))); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); @@ -372,41 +346,40 @@ void run_sort_pairs_benchmark(benchmark::State& state, HIP_CHECK(hipFree(d_values_output)); } -#define CREATE_SORT_KEYS_BENCHMARK(Key, SEGMENTS) \ -benchmark::RegisterBenchmark( \ - (std::string("sort_keys") + "" + \ - "(Segments:~" + std::to_string(SEGMENTS) + " segments)" \ - ).c_str(), \ - [=](benchmark::State& state) { 
run_sort_keys_benchmark(state, SEGMENTS, stream, size, Ascending); } \ -) - -#define CREATE_SORT_KEYS_DESCENDING_BENCHMARK(Key, SEGMENTS) \ -benchmark::RegisterBenchmark( \ - (std::string("sort_keys") + "<" #Key ">" + \ - "(~" + std::to_string(SEGMENTS) + " segments), descending" \ - ).c_str(), \ - [=](benchmark::State& state) { run_sort_keys_benchmark(state, SEGMENTS, stream, size, Descending); } \ -) - -#define BENCHMARK_KEY_TYPE(type) \ - CREATE_SORT_KEYS_BENCHMARK(type, 1), \ - CREATE_SORT_KEYS_BENCHMARK(type, 10), \ - CREATE_SORT_KEYS_BENCHMARK(type, 100), \ - CREATE_SORT_KEYS_BENCHMARK(type, 1000), \ - CREATE_SORT_KEYS_BENCHMARK(type, 10000), \ - CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 1), \ - CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 10), \ - CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 100), \ - CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 1000), \ - CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 10000) - +#define CREATE_SORT_KEYS_BENCHMARK(Key, SEGMENTS) \ + benchmark::RegisterBenchmark( \ + std::string("device_segmented_radix_sort_keys" \ + "." \ + "(segments:~" \ + + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + [=](benchmark::State& state) \ + { run_sort_keys_benchmark(state, SEGMENTS, stream, size, Ascending); }) + +#define CREATE_SORT_KEYS_DESCENDING_BENCHMARK(Key, SEGMENTS) \ + benchmark::RegisterBenchmark( \ + std::string("device_segmented_radix_sort_keys" \ + "." 
\ + "(segments:~" \ + + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + [=](benchmark::State& state) \ + { run_sort_keys_benchmark(state, SEGMENTS, stream, size, Descending); }) + +#define BENCHMARK_KEY_TYPE(type) \ + CREATE_SORT_KEYS_BENCHMARK(type, 1), CREATE_SORT_KEYS_BENCHMARK(type, 10), \ + CREATE_SORT_KEYS_BENCHMARK(type, 100), CREATE_SORT_KEYS_BENCHMARK(type, 1000), \ + CREATE_SORT_KEYS_BENCHMARK(type, 10000), CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 1), \ + CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 10), \ + CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 100), \ + CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 1000), \ + CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 10000) void add_sort_keys_benchmarks(std::vector& benchmarks, - hipStream_t stream, - size_t size) + hipStream_t stream, + size_t size) { - std::vector bs = - { + std::vector bs = { BENCHMARK_KEY_TYPE(float), BENCHMARK_KEY_TYPE(double), BENCHMARK_KEY_TYPE(int8_t), @@ -416,45 +389,45 @@ void add_sort_keys_benchmarks(std::vector& benc benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value, SEGMENTS) \ -benchmark::RegisterBenchmark( \ - (std::string("sort_pairs") + "" + \ - "(Segments:~" + std::to_string(SEGMENTS) + " segments)" \ - ).c_str(), \ - [=](benchmark::State& state) { \ - run_sort_pairs_benchmark(state, SEGMENTS, stream, size, Ascending); } \ -) - -#define CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(Key, Value, SEGMENTS) \ -benchmark::RegisterBenchmark( \ - (std::string("sort_pairs") + "<" #Key ", " #Value ">" + \ - "(~" + std::to_string(SEGMENTS) + " segments), descending" \ - ).c_str(), \ - [=](benchmark::State& state) { \ - run_sort_pairs_benchmark(state, SEGMENTS, stream, size, Descending); } \ -) - -#define BENCHMARK_PAIR_TYPE(type, value) \ - CREATE_SORT_PAIRS_BENCHMARK(type, value, 1), \ - CREATE_SORT_PAIRS_BENCHMARK(type, value, 10), \ - CREATE_SORT_PAIRS_BENCHMARK(type, value, 100), \ - CREATE_SORT_PAIRS_BENCHMARK(type, 
value, 1000), \ - CREATE_SORT_PAIRS_BENCHMARK(type, value, 10000), \ - CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 1), \ - CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 10), \ - CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 100), \ - CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 1000), \ - CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 10000) +#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value, SEGMENTS) \ + benchmark::RegisterBenchmark( \ + std::string("device_segmented_radix_sort_pairs" \ + "." \ + "(segments:~" \ + + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + [=](benchmark::State& state) \ + { run_sort_pairs_benchmark(state, SEGMENTS, stream, size, Ascending); }) + +#define CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(Key, Value, SEGMENTS) \ + benchmark::RegisterBenchmark( \ + std::string("device_segmented_radix_sort_pairs" \ + "." \ + "(segments:~" \ + + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + [=](benchmark::State& state) \ + { run_sort_pairs_benchmark(state, SEGMENTS, stream, size, Descending); }) + +#define BENCHMARK_PAIR_TYPE(type, value) \ + CREATE_SORT_PAIRS_BENCHMARK(type, value, 1), CREATE_SORT_PAIRS_BENCHMARK(type, value, 10), \ + CREATE_SORT_PAIRS_BENCHMARK(type, value, 100), \ + CREATE_SORT_PAIRS_BENCHMARK(type, value, 1000), \ + CREATE_SORT_PAIRS_BENCHMARK(type, value, 10000), \ + CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 1), \ + CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 10), \ + CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 100), \ + CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 1000), \ + CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 10000) void add_sort_pairs_benchmarks(std::vector& benchmarks, - hipStream_t stream, - size_t size) + hipStream_t stream, + size_t size) { - using custom_float2 = benchmark_utils::custom_type; + using custom_float2 = benchmark_utils::custom_type; using custom_double2 = benchmark_utils::custom_type; - std::vector bs = - { + 
std::vector bs = { BENCHMARK_PAIR_TYPE(int, float), BENCHMARK_PAIR_TYPE(long long, double), BENCHMARK_PAIR_TYPE(int8_t, int8_t), @@ -465,7 +438,7 @@ void add_sort_pairs_benchmarks(std::vector& ben benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -474,15 +447,15 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); std::cout << "benchmark_device_segmented_radix_sort" << std::endl; // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; diff --git a/benchmark/benchmark_device_segmented_reduce.cpp b/benchmark/benchmark_device_segmented_reduce.cpp index ca7cb950..d1e40c67 100644 --- a/benchmark/benchmark_device_segmented_reduce.cpp +++ b/benchmark/benchmark_device_segmented_reduce.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -25,36 +25,34 @@ // HIP API #include "hipcub/device/device_segmented_reduce.hpp" - #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif - -const unsigned int batch_size = 10; +const unsigned int batch_size = 10; const unsigned int warmup_size = 5; using OffsetType = int; template -void run_benchmark(benchmark::State& state, - size_t desired_segments, - hipStream_t stream, - size_t size, +void run_benchmark(benchmark::State& state, + size_t desired_segments, + hipStream_t stream, + size_t size, SegmentedReduceKernel segmented_reduce) { using value_type = T; // Generate data - const unsigned int seed = 123; + const unsigned int seed = 123; std::default_random_engine gen(seed); const double avg_segment_length = static_cast(size) / desired_segments; std::uniform_real_distribution segment_length_dis(0, avg_segment_length * 2); std::vector offsets; - unsigned int segments_count = 0; - size_t offset = 0; + unsigned int segments_count = 0; + size_t offset = 0; while(offset < size) { const size_t segment_length = std::round(segment_length_dis(gen)); @@ -67,30 +65,24 @@ void run_benchmark(benchmark::State& state, std::vector values_input(size); std::iota(values_input.begin(), values_input.end(), 0); - OffsetType * d_offsets; + OffsetType* d_offsets; HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(OffsetType))); - HIP_CHECK( - hipMemcpy( - d_offsets, offsets.data(), - (segments_count + 1) * sizeof(OffsetType), - hipMemcpyHostToDevice - ) - ); - - value_type * d_values_input; + HIP_CHECK(hipMemcpy(d_offsets, + offsets.data(), + (segments_count + 1) * sizeof(OffsetType), + hipMemcpyHostToDevice)); + + value_type* d_values_input; HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); - HIP_CHECK( - hipMemcpy( - d_values_input, values_input.data(), - size * sizeof(value_type), - 
hipMemcpyHostToDevice - ) - ); - - OutputT * d_aggregates_output; + HIP_CHECK(hipMemcpy(d_values_input, + values_input.data(), + size * sizeof(value_type), + hipMemcpyHostToDevice)); + + OutputT* d_aggregates_output; HIP_CHECK(hipMalloc(&d_aggregates_output, segments_count * sizeof(OutputT))); - void * d_temporary_storage = nullptr; + void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(segmented_reduce(d_temporary_storage, @@ -119,7 +111,7 @@ void run_benchmark(benchmark::State& state, } HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); @@ -137,8 +129,8 @@ void run_benchmark(benchmark::State& state, HIP_CHECK(hipStreamSynchronize(stream)); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(value_type)); @@ -154,8 +146,10 @@ template struct Benchmark; template -struct Benchmark { - static void run(benchmark::State& state, size_t desired_segments, const hipStream_t stream, size_t size) +struct Benchmark +{ + static void + run(benchmark::State& state, size_t desired_segments, const hipStream_t stream, size_t size) { hipError_t (*ptr_to_sum)(void*, size_t&, T*, T*, int, OffsetType*, OffsetType*, hipStream_t) = &hipcub::DeviceSegmentedReduce::Sum; @@ -164,8 +158,10 @@ struct Benchmark { }; template -struct Benchmark { - static void run(benchmark::State& state, size_t desired_segments, const hipStream_t stream, size_t size) +struct Benchmark +{ + static void + run(benchmark::State& state, size_t desired_segments, const hipStream_t stream, size_t size) { hipError_t (*ptr_to_min)(void*, size_t&, T*, T*, int, OffsetType*, OffsetType*, hipStream_t) = &hipcub::DeviceSegmentedReduce::Min; @@ -174,12 
+170,14 @@ struct Benchmark { }; template -struct Benchmark { +struct Benchmark +{ using Difference = OffsetType; - using Iterator = typename hipcub::ArgIndexInputIterator; - using KeyValue = typename Iterator::value_type; + using Iterator = typename hipcub::ArgIndexInputIterator; + using KeyValue = typename Iterator::value_type; - static void run(benchmark::State& state, size_t desired_segments, const hipStream_t stream, size_t size) + static void + run(benchmark::State& state, size_t desired_segments, const hipStream_t stream, size_t size) { hipError_t (*ptr_to_argmin)(void*, size_t&, @@ -194,50 +192,48 @@ struct Benchmark { } }; -#define CREATE_BENCHMARK(T, SEGMENTS, REDUCE_OP) \ -benchmark::RegisterBenchmark( \ - (std::string("segmented_reduce") + "" + \ - "(Number of segments:~" + std::to_string(SEGMENTS) + " segments)" \ - ).c_str(), \ - &Benchmark::run, \ - SEGMENTS, stream, size \ -) - -#define BENCHMARK_TYPE(type, REDUCE_OP) \ - CREATE_BENCHMARK(type, 1, REDUCE_OP), \ - CREATE_BENCHMARK(type, 100, REDUCE_OP), \ - CREATE_BENCHMARK(type, 10000, REDUCE_OP) - -#define CREATE_BENCHMARKS(REDUCE_OP) \ - BENCHMARK_TYPE(float, REDUCE_OP), \ - BENCHMARK_TYPE(double, REDUCE_OP), \ - BENCHMARK_TYPE(int8_t, REDUCE_OP), \ - BENCHMARK_TYPE(int, REDUCE_OP) +#define CREATE_BENCHMARK(T, SEGMENTS, REDUCE_OP) \ + benchmark::RegisterBenchmark(std::string("device_segmented_reduce" \ + "." 
\ + "(number_of_segments:~" \ + + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + &Benchmark::run, \ + SEGMENTS, \ + stream, \ + size) + +#define BENCHMARK_TYPE(type, REDUCE_OP) \ + CREATE_BENCHMARK(type, 1, REDUCE_OP), CREATE_BENCHMARK(type, 100, REDUCE_OP), \ + CREATE_BENCHMARK(type, 10000, REDUCE_OP) + +#define CREATE_BENCHMARKS(REDUCE_OP) \ + BENCHMARK_TYPE(float, REDUCE_OP), BENCHMARK_TYPE(double, REDUCE_OP), \ + BENCHMARK_TYPE(int8_t, REDUCE_OP), BENCHMARK_TYPE(int, REDUCE_OP) void add_benchmarks(std::vector& benchmarks, - hipStream_t stream, - size_t size) + hipStream_t stream, + size_t size) { using custom_double2 = benchmark_utils::custom_type; - std::vector bs = - { + std::vector bs = { CREATE_BENCHMARKS(hipcub::Sum), BENCHMARK_TYPE(custom_double2, hipcub::Sum), CREATE_BENCHMARKS(hipcub::Min), - #ifdef HIPCUB_ROCPRIM_API +#ifdef HIPCUB_ROCPRIM_API BENCHMARK_TYPE(custom_double2, hipcub::Min), - #endif +#endif CREATE_BENCHMARKS(hipcub::ArgMin), - #ifdef HIPCUB_ROCPRIM_API +#ifdef HIPCUB_ROCPRIM_API BENCHMARK_TYPE(custom_double2, hipcub::ArgMin), - #endif +#endif }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -246,15 +242,15 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); std::cout << "benchmark_device_segmented_reduce" << std::endl; // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; diff --git 
a/benchmark/benchmark_device_segmented_sort.cpp b/benchmark/benchmark_device_segmented_sort.cpp index e2b2a6a2..d98c7f42 100644 --- a/benchmark/benchmark_device_segmented_sort.cpp +++ b/benchmark/benchmark_device_segmented_sort.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -29,19 +29,19 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -const unsigned int batch_size = 4; +const unsigned int batch_size = 4; const unsigned int warmup_size = 2; -template -void run_sort_keys_benchmark(benchmark::State &state, - size_t desired_segments, - hipStream_t stream, - size_t size, - bool Descending = false, - bool Stable = false) +template +void run_sort_keys_benchmark(benchmark::State& state, + size_t desired_segments, + hipStream_t stream, + size_t size, + bool Descending = false, + bool Stable = false) { using offset_type = int; - using key_type = Key; + using key_type = Key; typedef hipError_t (*sort_func)(void*, size_t&, const key_type*, @@ -52,31 +52,29 @@ void run_sort_keys_benchmark(benchmark::State &state, offset_type*, hipStream_t); - sort_func func_ascending = &hipcub::DeviceSegmentedSort::SortKeys - ; - sort_func func_descending = &hipcub::DeviceSegmentedSort::SortKeysDescending - ; - sort_func func_ascending_stable = &hipcub::DeviceSegmentedSort::StableSortKeys - ; - sort_func func_descending_stable = &hipcub::DeviceSegmentedSort::StableSortKeysDescending - ; + sort_func func_ascending = &hipcub::DeviceSegmentedSort::SortKeys; + 
sort_func func_descending + = &hipcub::DeviceSegmentedSort::SortKeysDescending; + sort_func func_ascending_stable + = &hipcub::DeviceSegmentedSort::StableSortKeys; + sort_func func_descending_stable + = &hipcub::DeviceSegmentedSort::StableSortKeysDescending; - sort_func sorting = Descending ? - (Stable ? func_descending_stable : func_descending) : - (Stable ? func_ascending_stable : func_ascending); + sort_func sorting = Descending ? (Stable ? func_descending_stable : func_descending) + : (Stable ? func_ascending_stable : func_ascending); std::vector offsets; const double avg_segment_length = static_cast(size) / desired_segments; - std::random_device rd; + std::random_device rd; std::default_random_engine gen(rd()); std::uniform_real_distribution segment_length_dis(0, avg_segment_length * 2); unsigned int segments_count = 0; - size_t offset = 0; - while (offset < size) + size_t offset = 0; + while(offset < size) { const size_t segment_length = std::round(segment_length_dis(gen)); offsets.push_back(offset); @@ -86,46 +84,34 @@ void run_sort_keys_benchmark(benchmark::State &state, offsets.push_back(size); std::vector keys_input; - if (std::is_floating_point::value) + if(std::is_floating_point::value) { - keys_input = benchmark_utils::get_random_data( - size, - static_cast(-1000), - static_cast(1000) - ); - } - else + keys_input = benchmark_utils::get_random_data(size, + static_cast(-1000), + static_cast(1000)); + } else { - keys_input = benchmark_utils::get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max() - ); + keys_input + = benchmark_utils::get_random_data(size, + std::numeric_limits::min(), + std::numeric_limits::max()); } - offset_type * d_offsets; + offset_type* d_offsets; HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); - HIP_CHECK( - hipMemcpy( - d_offsets, offsets.data(), - (segments_count + 1) * sizeof(offset_type), - hipMemcpyHostToDevice - ) - ); - - key_type * d_keys_input; - key_type * 
d_keys_output; + HIP_CHECK(hipMemcpy(d_offsets, + offsets.data(), + (segments_count + 1) * sizeof(offset_type), + hipMemcpyHostToDevice)); + + key_type* d_keys_input; + key_type* d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); HIP_CHECK( - hipMemcpy( - d_keys_input, keys_input.data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - void * d_temporary_storage = nullptr; + hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); + + void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, @@ -141,7 +127,7 @@ void run_sort_keys_benchmark(benchmark::State &state, HIP_CHECK(hipDeviceSynchronize()); // Warm-up - for (size_t i = 0; i < warmup_size; ++i) + for(size_t i = 0; i < warmup_size; ++i) { HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, @@ -155,11 +141,11 @@ void run_sort_keys_benchmark(benchmark::State &state, } HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); - for (size_t i = 0; i < batch_size; ++i) + for(size_t i = 0; i < batch_size; ++i) { HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, @@ -174,8 +160,8 @@ void run_sort_keys_benchmark(benchmark::State &state, HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); @@ -187,17 +173,17 @@ void run_sort_keys_benchmark(benchmark::State &state, HIP_CHECK(hipFree(d_keys_output)); } -template -void run_sort_pairs_benchmark(benchmark::State &state, - size_t desired_segments, - 
hipStream_t stream, - size_t size, - bool Descending = false, - bool Stable = false) +template +void run_sort_pairs_benchmark(benchmark::State& state, + size_t desired_segments, + hipStream_t stream, + size_t size, + bool Descending = false, + bool Stable = false) { using offset_type = int; - using key_type = Key; - using value_type = Value; + using key_type = Key; + using value_type = Value; typedef hipError_t (*sort_func)(void*, size_t&, const key_type*, @@ -210,31 +196,31 @@ void run_sort_pairs_benchmark(benchmark::State &state, offset_type*, hipStream_t); - sort_func func_ascending = &hipcub::DeviceSegmentedSort::SortPairs - ; - sort_func func_descending = &hipcub::DeviceSegmentedSort::SortPairsDescending - ; - sort_func func_ascending_stable = &hipcub::DeviceSegmentedSort::StableSortPairs - ; - sort_func func_descending_stable = &hipcub::DeviceSegmentedSort::StableSortPairsDescending - ; + sort_func func_ascending + = &hipcub::DeviceSegmentedSort::SortPairs; + sort_func func_descending + = &hipcub::DeviceSegmentedSort::SortPairsDescending; + sort_func func_ascending_stable + = &hipcub::DeviceSegmentedSort::StableSortPairs; + sort_func func_descending_stable + = &hipcub::DeviceSegmentedSort:: + StableSortPairsDescending; - sort_func sorting = Descending ? - (Stable ? func_descending_stable : func_descending) : - (Stable ? func_ascending_stable : func_ascending); + sort_func sorting = Descending ? (Stable ? func_descending_stable : func_descending) + : (Stable ? 
func_ascending_stable : func_ascending); std::vector offsets; const double avg_segment_length = static_cast(size) / desired_segments; - std::random_device rd; + std::random_device rd; std::default_random_engine gen(rd()); std::uniform_real_distribution segment_length_dis(0, avg_segment_length * 2); unsigned int segments_count = 0; - size_t offset = 0; - while (offset < size) + size_t offset = 0; + while(offset < size) { const size_t segment_length = std::round(segment_length_dis(gen)); offsets.push_back(offset); @@ -244,61 +230,46 @@ void run_sort_pairs_benchmark(benchmark::State &state, offsets.push_back(size); std::vector keys_input; - if (std::is_floating_point::value) + if(std::is_floating_point::value) { - keys_input = benchmark_utils::get_random_data( - size, - static_cast(-1000), - static_cast(1000) - ); - } - else + keys_input = benchmark_utils::get_random_data(size, + static_cast(-1000), + static_cast(1000)); + } else { - keys_input = benchmark_utils::get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max() - ); + keys_input + = benchmark_utils::get_random_data(size, + std::numeric_limits::min(), + std::numeric_limits::max()); } std::vector values_input(size); std::iota(values_input.begin(), values_input.end(), 0); - offset_type * d_offsets; + offset_type* d_offsets; HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); - HIP_CHECK( - hipMemcpy( - d_offsets, offsets.data(), - (segments_count + 1) * sizeof(offset_type), - hipMemcpyHostToDevice - ) - ); - - key_type * d_keys_input; - key_type * d_keys_output; + HIP_CHECK(hipMemcpy(d_offsets, + offsets.data(), + (segments_count + 1) * sizeof(offset_type), + hipMemcpyHostToDevice)); + + key_type* d_keys_input; + key_type* d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); HIP_CHECK( - hipMemcpy( - d_keys_input, keys_input.data(), - size * sizeof(key_type), - 
hipMemcpyHostToDevice - ) - ); - - value_type * d_values_input; - value_type * d_values_output; + hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); + + value_type* d_values_input; + value_type* d_values_output; HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); - HIP_CHECK( - hipMemcpy( - d_values_input, values_input.data(), - size * sizeof(value_type), - hipMemcpyHostToDevice - ) - ); - - void * d_temporary_storage = nullptr; + HIP_CHECK(hipMemcpy(d_values_input, + values_input.data(), + size * sizeof(value_type), + hipMemcpyHostToDevice)); + + void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, @@ -316,7 +287,7 @@ void run_sort_pairs_benchmark(benchmark::State &state, HIP_CHECK(hipDeviceSynchronize()); // Warm-up - for (size_t i = 0; i < warmup_size; i++) + for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, @@ -332,11 +303,11 @@ void run_sort_pairs_benchmark(benchmark::State &state, } HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); - for (size_t i = 0; i < batch_size; i++) + for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(sorting(d_temporary_storage, temporary_storage_bytes, @@ -353,12 +324,12 @@ void run_sort_pairs_benchmark(benchmark::State &state, HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } - state.SetBytesProcessed( - state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type))); + state.SetBytesProcessed(state.iterations() * batch_size * size + * 
(sizeof(key_type) + sizeof(value_type))); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); @@ -369,96 +340,123 @@ void run_sort_pairs_benchmark(benchmark::State &state, HIP_CHECK(hipFree(d_values_output)); } -#define CREATE_SORT_KEYS_BENCHMARK(Key, SEGMENTS) \ - benchmark::RegisterBenchmark( \ - (std::string("sort_keys") + "" + \ - "(Number of segments:~" + std::to_string(SEGMENTS) + " segments)") \ - .c_str(), \ - [=](benchmark::State &state) { run_sort_keys_benchmark(state, SEGMENTS, stream, size); }), \ - benchmark::RegisterBenchmark( \ - (std::string("sort_keys") + "<" #Key ">" + \ - "(~" + std::to_string(SEGMENTS) + " segments), descending") \ - .c_str(), \ - [=](benchmark::State &state) { run_sort_keys_benchmark(state, SEGMENTS, stream, size, true); }), \ - benchmark::RegisterBenchmark( \ - (std::string("sort_keys") + "<" #Key ">" + \ - "(~" + std::to_string(SEGMENTS) + " segments), stable") \ - .c_str(), \ - [=](benchmark::State &state) { run_sort_keys_benchmark(state, SEGMENTS, stream, size, false, true); }), \ - benchmark::RegisterBenchmark( \ - (std::string("sort_keys") + "<" #Key ">" + \ - "(~" + std::to_string(SEGMENTS) + " segments), descending, stable") \ - .c_str(), \ - [=](benchmark::State &state) { run_sort_keys_benchmark(state, SEGMENTS, stream, size, true, true); }) - -#define BENCHMARK_KEY_TYPE(type) \ - CREATE_SORT_KEYS_BENCHMARK(type, 10), \ - CREATE_SORT_KEYS_BENCHMARK(type, 100), \ - CREATE_SORT_KEYS_BENCHMARK(type, 1000), \ - CREATE_SORT_KEYS_BENCHMARK(type, 10000) - -void add_sort_keys_benchmarks(std::vector &benchmarks, - hipStream_t stream, - size_t size) +#define CREATE_SORT_KEYS_BENCHMARK(Key, SEGMENTS) \ + benchmark::RegisterBenchmark(std::string("device_segmented_sort_keys" \ + "." 
\ + "(number_of_segments:~" \ + + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + [=](benchmark::State& state) { \ + run_sort_keys_benchmark(state, SEGMENTS, stream, size); \ + }), \ + benchmark::RegisterBenchmark( \ + std::string("device_segmented_sort_keys" \ + "." \ + "(number_of_segments:~" \ + + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + [=](benchmark::State& state) \ + { run_sort_keys_benchmark(state, SEGMENTS, stream, size, true); }), \ + benchmark::RegisterBenchmark( \ + std::string("device_segmented_sort_keys" \ + "." \ + "(number_of_segments:~" \ + + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + [=](benchmark::State& state) \ + { run_sort_keys_benchmark(state, SEGMENTS, stream, size, false, true); }), \ + benchmark::RegisterBenchmark( \ + std::string("device_segmented_sort_keys" \ + "." \ + "(number_of_segments:~" \ + + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + [=](benchmark::State& state) \ + { run_sort_keys_benchmark(state, SEGMENTS, stream, size, true, true); }) + +#define BENCHMARK_KEY_TYPE(type) \ + CREATE_SORT_KEYS_BENCHMARK(type, 10), CREATE_SORT_KEYS_BENCHMARK(type, 100), \ + CREATE_SORT_KEYS_BENCHMARK(type, 1000), CREATE_SORT_KEYS_BENCHMARK(type, 10000) + +void add_sort_keys_benchmarks(std::vector& benchmarks, + hipStream_t stream, + size_t size) { - std::vector bs = - { - BENCHMARK_KEY_TYPE(float), - BENCHMARK_KEY_TYPE(double), - BENCHMARK_KEY_TYPE(int8_t), - BENCHMARK_KEY_TYPE(uint8_t), - BENCHMARK_KEY_TYPE(int), - }; + std::vector bs = { + BENCHMARK_KEY_TYPE(float), + BENCHMARK_KEY_TYPE(double), + BENCHMARK_KEY_TYPE(int8_t), + BENCHMARK_KEY_TYPE(uint8_t), + BENCHMARK_KEY_TYPE(int), + }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value, SEGMENTS) \ - benchmark::RegisterBenchmark( \ - (std::string("sort_pairs") + "" + \ - "(Number of segments:~" + std::to_string(SEGMENTS) + " segments)") \ - .c_str(), \ - [=](benchmark::State 
&state) { run_sort_pairs_benchmark(state, SEGMENTS, stream, size); }), \ - benchmark::RegisterBenchmark( \ - (std::string("sort_pairs") + "<" #Key ", " #Value ">" + \ - "(~" + std::to_string(SEGMENTS) + " segments), descending") \ - .c_str(), \ - [=](benchmark::State &state) { run_sort_pairs_benchmark(state, SEGMENTS, stream, size, true); }), \ - benchmark::RegisterBenchmark( \ - (std::string("sort_pairs") + "<" #Key ", " #Value ">" + \ - "(~" + std::to_string(SEGMENTS) + " segments), stable") \ - .c_str(), \ - [=](benchmark::State &state) { run_sort_pairs_benchmark(state, SEGMENTS, stream, size, false, true); }), \ - benchmark::RegisterBenchmark( \ - (std::string("sort_pairs") + "<" #Key ", " #Value ">" + \ - "(~" + std::to_string(SEGMENTS) + " segments), descending, stable") \ - .c_str(), \ - [=](benchmark::State &state) { run_sort_pairs_benchmark(state, SEGMENTS, stream, size, true, true); }) - -#define BENCHMARK_PAIR_TYPE(type, value) \ - CREATE_SORT_PAIRS_BENCHMARK(type, value, 10), \ - CREATE_SORT_PAIRS_BENCHMARK(type, value, 100), \ +#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value, SEGMENTS) \ + benchmark::RegisterBenchmark( \ + std::string("device_segmented_sort_pairs" \ + "." \ + "(number_of_segments:~" \ + + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + [=](benchmark::State& state) \ + { run_sort_pairs_benchmark(state, SEGMENTS, stream, size); }), \ + benchmark::RegisterBenchmark( \ + std::string("device_segmented_sort_pairs" \ + "." \ + "(number_of_segments:~" \ + + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + [=](benchmark::State& state) \ + { run_sort_pairs_benchmark(state, SEGMENTS, stream, size, true); }), \ + benchmark::RegisterBenchmark( \ + std::string("device_segmented_sort_pairs" \ + "." 
\ + "(number_of_segments:~" \ + + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + [=](benchmark::State& state) { \ + run_sort_pairs_benchmark(state, SEGMENTS, stream, size, false, true); \ + }), \ + benchmark::RegisterBenchmark( \ + std::string("device_segmented_sort_pairs" \ + "." \ + "(number_of_segments:~" \ + + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + [=](benchmark::State& state) \ + { run_sort_pairs_benchmark(state, SEGMENTS, stream, size, true, true); }) +#define BENCHMARK_PAIR_TYPE(type, value) \ + CREATE_SORT_PAIRS_BENCHMARK(type, value, 10), CREATE_SORT_PAIRS_BENCHMARK(type, value, 100), \ CREATE_SORT_PAIRS_BENCHMARK(type, value, 10000) -void add_sort_pairs_benchmarks(std::vector &benchmarks, - hipStream_t stream, - size_t size) +void add_sort_pairs_benchmarks(std::vector& benchmarks, + hipStream_t stream, + size_t size) { - using custom_float2 = benchmark_utils::custom_type; + using custom_float2 = benchmark_utils::custom_type; using custom_double2 = benchmark_utils::custom_type; - std::vector bs = - { - BENCHMARK_PAIR_TYPE(int, float), - BENCHMARK_PAIR_TYPE(long long, double), - BENCHMARK_PAIR_TYPE(int8_t, int8_t), - BENCHMARK_PAIR_TYPE(uint8_t, uint8_t), - BENCHMARK_PAIR_TYPE(int, custom_float2), - BENCHMARK_PAIR_TYPE(long long, custom_double2), - }; + std::vector bs = { + BENCHMARK_PAIR_TYPE(int, float), + BENCHMARK_PAIR_TYPE(long long, double), + BENCHMARK_PAIR_TYPE(int8_t, int8_t), + BENCHMARK_PAIR_TYPE(uint8_t, uint8_t), + BENCHMARK_PAIR_TYPE(int, custom_float2), + BENCHMARK_PAIR_TYPE(long long, custom_double2), + }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -467,35 +465,35 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = 
parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); std::cout << "benchmark_device_segmented_sort" << std::endl; // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks - std::vector benchmarks; + std::vector benchmarks; add_sort_keys_benchmarks(benchmarks, stream, size); add_sort_pairs_benchmarks(benchmarks, stream, size); // Use manual timing - for (auto &b : benchmarks) + for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations - if (trials > 0) + if(trials > 0) { - for (auto &b : benchmarks) + for(auto& b : benchmarks) { b->Iterations(trials); } diff --git a/benchmark/benchmark_device_select.cpp b/benchmark/benchmark_device_select.cpp index d1617c79..c0921d54 100644 --- a/benchmark/benchmark_device_select.cpp +++ b/benchmark/benchmark_device_select.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -25,93 +25,71 @@ // HIP API #include "hipcub/device/device_select.hpp" - #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif template void run_flagged_benchmark(benchmark::State& state, - size_t size, + size_t size, const hipStream_t stream, - float true_probability) + float true_probability) { - std::vector input; - std::vector flags = benchmark_utils::get_random_data01(size, true_probability); + std::vector input; + std::vector flags + = benchmark_utils::get_random_data01(size, true_probability); if(std::is_floating_point::value) { input = benchmark_utils::get_random_data(size, T(-1000), T(1000)); - } - else + } else { - input = benchmark_utils::get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max() - ); + input = benchmark_utils::get_random_data(size, + std::numeric_limits::min(), + std::numeric_limits::max()); } - T * d_input; - FlagType * d_flags; - T * d_output; - unsigned int * d_selected_count_output; + T* d_input; + FlagType* d_flags; + T* d_output; + unsigned int* d_selected_count_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_flags, flags.size() * sizeof(FlagType))); HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); + HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK( - hipMemcpy( - d_input, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK( - hipMemcpy( - d_flags, flags.data(), - flags.size() * sizeof(FlagType), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(d_flags, flags.data(), flags.size() * sizeof(FlagType), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // Allocate temporary storage memory size_t temp_storage_size_bytes = 0; // 
Get size of d_temp_storage - HIP_CHECK( - hipcub::DeviceSelect::Flagged( - nullptr, - temp_storage_size_bytes, - d_input, - d_flags, - d_output, - d_selected_count_output, - input.size(), - stream - ) - ); + HIP_CHECK(hipcub::DeviceSelect::Flagged(nullptr, + temp_storage_size_bytes, + d_input, + d_flags, + d_output, + d_selected_count_output, + input.size(), + stream)); HIP_CHECK(hipDeviceSynchronize()); // allocate temporary storage - void * d_temp_storage = nullptr; + void* d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < 10; i++) { - HIP_CHECK( - hipcub::DeviceSelect::Flagged( - d_temp_storage, - temp_storage_size_bytes, - d_input, - d_flags, - d_output, - d_selected_count_output, - input.size(), - stream - ) - ); + HIP_CHECK(hipcub::DeviceSelect::Flagged(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_flags, + d_output, + d_selected_count_output, + input.size(), + stream)); } HIP_CHECK(hipDeviceSynchronize()); @@ -121,24 +99,20 @@ void run_flagged_benchmark(benchmark::State& state, auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { - HIP_CHECK( - hipcub::DeviceSelect::Flagged( - d_temp_storage, - temp_storage_size_bytes, - d_input, - d_flags, - d_output, - d_selected_count_output, - input.size(), - stream - ) - ); + HIP_CHECK(hipcub::DeviceSelect::Flagged(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_flags, + d_output, + d_selected_count_output, + input.size(), + stream)); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); @@ -154,71 +128,58 @@ void 
run_flagged_benchmark(benchmark::State& state, template void run_selectop_benchmark(benchmark::State& state, - size_t size, + size_t size, const hipStream_t stream, - float true_probability) + float true_probability) { std::vector input = benchmark_utils::get_random_data(size, T(0), T(1000)); - auto select_op = [true_probability] __device__ (const T& value) -> bool + auto select_op = [true_probability] __device__(const T& value) -> bool { - if(value < T(1000 * true_probability)) return true; + if(value < T(1000 * true_probability)) + return true; return false; }; - T * d_input; - T * d_output; - unsigned int * d_selected_count_output; + T* d_input; + T* d_output; + unsigned int* d_selected_count_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // Allocate temporary storage memory size_t temp_storage_size_bytes; // Get size of d_temp_storage - HIP_CHECK( - hipcub::DeviceSelect::If( - nullptr, - temp_storage_size_bytes, - d_input, - d_output, - d_selected_count_output, - input.size(), - select_op, - stream - ) - ); + HIP_CHECK(hipcub::DeviceSelect::If(nullptr, + temp_storage_size_bytes, + d_input, + d_output, + d_selected_count_output, + input.size(), + select_op, + stream)); HIP_CHECK(hipDeviceSynchronize()); // allocate temporary storage - void * d_temp_storage = nullptr; + void* d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < 10; i++) { - HIP_CHECK( - hipcub::DeviceSelect::If( - d_temp_storage, - temp_storage_size_bytes, - d_input, - d_output, - d_selected_count_output, - 
input.size(), - select_op, - stream - ) - ); + HIP_CHECK(hipcub::DeviceSelect::If(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + d_selected_count_output, + input.size(), + select_op, + stream)); } HIP_CHECK(hipDeviceSynchronize()); @@ -228,24 +189,20 @@ void run_selectop_benchmark(benchmark::State& state, auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { - HIP_CHECK( - hipcub::DeviceSelect::If( - d_temp_storage, - temp_storage_size_bytes, - d_input, - d_output, - d_selected_count_output, - input.size(), - select_op, - stream - ) - ); + HIP_CHECK(hipcub::DeviceSelect::If(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + d_selected_count_output, + input.size(), + select_op, + stream)); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); @@ -260,74 +217,60 @@ void run_selectop_benchmark(benchmark::State& state, template void run_unique_benchmark(benchmark::State& state, - size_t size, + size_t size, const hipStream_t stream, - float discontinuity_probability) + float discontinuity_probability) { hipcub::Sum op; std::vector input(size); { auto input01 = benchmark_utils::get_random_data01(size, discontinuity_probability); - auto acc = input01[0]; - input[0] = acc; + auto acc = input01[0]; + input[0] = acc; for(size_t i = 1; i < input01.size(); i++) { input[i] = op(acc, input01[i]); } } - T * d_input; - T * d_output; - unsigned int * d_selected_count_output; + T* d_input; + T* d_output; + unsigned int* d_selected_count_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_selected_count_output, 
sizeof(unsigned int))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // Allocate temporary storage memory size_t temp_storage_size_bytes; // Get size of d_temp_storage - HIP_CHECK( - hipcub::DeviceSelect::Unique( - nullptr, - temp_storage_size_bytes, - d_input, - d_output, - d_selected_count_output, - input.size(), - stream - ) - ); + HIP_CHECK(hipcub::DeviceSelect::Unique(nullptr, + temp_storage_size_bytes, + d_input, + d_output, + d_selected_count_output, + input.size(), + stream)); HIP_CHECK(hipDeviceSynchronize()); // allocate temporary storage - void * d_temp_storage = nullptr; + void* d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < 10; i++) { - HIP_CHECK( - hipcub::DeviceSelect::Unique( - d_temp_storage, - temp_storage_size_bytes, - d_input, - d_output, - d_selected_count_output, - input.size(), - stream - ) - ); + HIP_CHECK(hipcub::DeviceSelect::Unique(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + d_selected_count_output, + input.size(), + stream)); } HIP_CHECK(hipDeviceSynchronize()); @@ -337,23 +280,19 @@ void run_unique_benchmark(benchmark::State& state, auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { - HIP_CHECK( - hipcub::DeviceSelect::Unique( - d_temp_storage, - temp_storage_size_bytes, - d_input, - d_output, - d_selected_count_output, - input.size(), - stream - ) - ); + HIP_CHECK(hipcub::DeviceSelect::Unique(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + d_selected_count_output, + input.size(), + stream)); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - 
start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); @@ -367,20 +306,20 @@ void run_unique_benchmark(benchmark::State& state, template void run_unique_by_key_benchmark(benchmark::State& state, - size_t size, + size_t size, const hipStream_t stream, - float discontinuity_probability) + float discontinuity_probability) { hipcub::Sum op; std::vector input_keys(size); { auto input01 = benchmark_utils::get_random_data01(size, discontinuity_probability); - auto acc = input01[0]; + auto acc = input01[0]; input_keys[0] = acc; - for (size_t i = 1; i < input01.size(); i++) + for(size_t i = 1; i < input01.size(); i++) { input_keys[i] = op(acc, input01[i]); } @@ -389,10 +328,10 @@ void run_unique_by_key_benchmark(benchmark::State& state, const auto input_values = benchmark_utils::get_random_data(size, ValueT(-1000), ValueT(1000)); - KeyT* d_keys_input; - ValueT* d_values_input; - KeyT* d_keys_output; - ValueT* d_values_output; + KeyT* d_keys_input; + ValueT* d_values_input; + KeyT* d_keys_output; + ValueT* d_values_output; unsigned int* d_selected_count_output; HIP_CHECK(hipMalloc(&d_keys_input, input_keys.size() * sizeof(input_keys[0]))); @@ -401,40 +340,28 @@ void run_unique_by_key_benchmark(benchmark::State& state, HIP_CHECK(hipMalloc(&d_values_output, input_values.size() * sizeof(input_values[0]))); HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(*d_selected_count_output))); - HIP_CHECK( - hipMemcpy( - d_keys_input, - input_keys.data(), - input_keys.size() * sizeof(input_keys[0]), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK( - hipMemcpy( - d_values_input, - input_values.data(), - input_values.size() * sizeof(input_values[0]), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_keys_input, + input_keys.data(), + input_keys.size() * sizeof(input_keys[0]), + hipMemcpyHostToDevice)); + 
HIP_CHECK(hipMemcpy(d_values_input, + input_values.data(), + input_values.size() * sizeof(input_values[0]), + hipMemcpyHostToDevice)); // Allocate temporary storage memory size_t temp_storage_size_bytes; // Get size of d_temp_storage - HIP_CHECK( - hipcub::DeviceSelect::UniqueByKey( - nullptr, - temp_storage_size_bytes, - d_keys_input, - d_values_input, - d_keys_output, - d_values_output, - d_selected_count_output, - input_keys.size(), - stream - ) - ); + HIP_CHECK(hipcub::DeviceSelect::UniqueByKey(nullptr, + temp_storage_size_bytes, + d_keys_input, + d_values_input, + d_keys_output, + d_values_output, + d_selected_count_output, + input_keys.size(), + stream)); HIP_CHECK(hipDeviceSynchronize()); // allocate temporary storage @@ -443,51 +370,45 @@ void run_unique_by_key_benchmark(benchmark::State& state, HIP_CHECK(hipDeviceSynchronize()); // Warm-up - for (size_t i = 0; i < 10; i++) + for(size_t i = 0; i < 10; i++) { - HIP_CHECK( - hipcub::DeviceSelect::UniqueByKey( - d_temp_storage, - temp_storage_size_bytes, - d_keys_input, - d_values_input, - d_keys_output, - d_values_output, - d_selected_count_output, - input_keys.size(), - stream - ) - ); + HIP_CHECK(hipcub::DeviceSelect::UniqueByKey(d_temp_storage, + temp_storage_size_bytes, + d_keys_input, + d_values_input, + d_keys_output, + d_values_output, + d_selected_count_output, + input_keys.size(), + stream)); } HIP_CHECK(hipDeviceSynchronize()); const unsigned int batch_size = 10; - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); - for (size_t i = 0; i < batch_size; i++) + for(size_t i = 0; i < batch_size; i++) { - HIP_CHECK( - hipcub::DeviceSelect::UniqueByKey( - d_temp_storage, - temp_storage_size_bytes, - d_keys_input, - d_values_input, - d_keys_output, - d_values_output, - d_selected_count_output, - input_keys.size(), - stream - ) - ); + HIP_CHECK(hipcub::DeviceSelect::UniqueByKey(d_temp_storage, + temp_storage_size_bytes, + d_keys_input, + d_values_input, + 
d_keys_output, + d_values_output, + d_selected_count_output, + input_keys.size(), + stream)); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } - state.SetBytesProcessed(state.iterations() * batch_size * size * (sizeof(KeyT) + sizeof(ValueT))); + state.SetBytesProcessed(state.iterations() * batch_size * size + * (sizeof(KeyT) + sizeof(ValueT))); state.SetItemsProcessed(state.iterations() * batch_size * size); hipFree(d_keys_input); @@ -498,55 +419,67 @@ void run_unique_by_key_benchmark(benchmark::State& state, hipFree(d_temp_storage); } -#define CREATE_SELECT_FLAGGED_BENCHMARK(T, F, p) \ -benchmark::RegisterBenchmark( \ - ("select_flagged(Probability:" #p")"), \ - &run_flagged_benchmark, size, stream, p \ -) - -#define CREATE_SELECT_IF_BENCHMARK(T, p) \ -benchmark::RegisterBenchmark( \ - ("select_if(Probability:" #p")"), \ - &run_selectop_benchmark, size, stream, p \ -) - -#define CREATE_UNIQUE_BENCHMARK(T, p) \ -benchmark::RegisterBenchmark( \ - ("unique(Probability:" #p")"), \ - &run_unique_benchmark, size, stream, p \ -) - -#define CREATE_UNIQUE_BY_KEY_BENCHMARK(K, V, p) \ -benchmark::RegisterBenchmark( \ - ("unique_by_key<" #K ", "#V", unsigned int>(p = " #p")"), \ - &run_unique_by_key_benchmark, size, stream, p \ -) - -#define BENCHMARK_FLAGGED_TYPE(type, value) \ - CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.05f), \ - CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.25f), \ - CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.5f), \ - CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.75f) - -#define BENCHMARK_IF_TYPE(type) \ - CREATE_SELECT_IF_BENCHMARK(type, 0.05f), \ - CREATE_SELECT_IF_BENCHMARK(type, 0.25f), \ - CREATE_SELECT_IF_BENCHMARK(type, 0.5f), \ - CREATE_SELECT_IF_BENCHMARK(type, 0.75f) - -#define BENCHMARK_UNIQUE_TYPE(type) \ - 
CREATE_UNIQUE_BENCHMARK(type, 0.05f), \ - CREATE_UNIQUE_BENCHMARK(type, 0.25f), \ - CREATE_UNIQUE_BENCHMARK(type, 0.5f), \ - CREATE_UNIQUE_BENCHMARK(type, 0.75f) - -#define BENCHMARK_UNIQUE_BY_KEY_TYPE(key_type, value_type) \ - CREATE_UNIQUE_BY_KEY_BENCHMARK(key_type, value_type, 0.05f), \ - CREATE_UNIQUE_BY_KEY_BENCHMARK(key_type, value_type, 0.25f), \ - CREATE_UNIQUE_BY_KEY_BENCHMARK(key_type, value_type, 0.5f), \ - CREATE_UNIQUE_BY_KEY_BENCHMARK(key_type, value_type, 0.75f) - -int main(int argc, char *argv[]) +#define CREATE_SELECT_FLAGGED_BENCHMARK(T, F, p) \ + benchmark::RegisterBenchmark( \ + std::string("device_select_flagged.(probability:" #p ")") \ + .c_str(), \ + &run_flagged_benchmark, \ + size, \ + stream, \ + p) + +#define CREATE_SELECT_IF_BENCHMARK(T, p) \ + benchmark::RegisterBenchmark( \ + std::string("device_select_if.(probability:" #p ")") \ + .c_str(), \ + &run_selectop_benchmark, \ + size, \ + stream, \ + p) + +#define CREATE_UNIQUE_BENCHMARK(T, p) \ + benchmark::RegisterBenchmark( \ + std::string("device_select_unique.(probability:" #p ")") \ + .c_str(), \ + &run_unique_benchmark, \ + size, \ + stream, \ + p) + +#define CREATE_UNIQUE_BY_KEY_BENCHMARK(K, V, p) \ + benchmark::RegisterBenchmark( \ + std::string("device_select_unique_by_key.(probability:" #p ")") \ + .c_str(), \ + &run_unique_by_key_benchmark, \ + size, \ + stream, \ + p) + +#define BENCHMARK_FLAGGED_TYPE(type, value) \ + CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.05f), \ + CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.25f), \ + CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.5f), \ + CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.75f) + +#define BENCHMARK_IF_TYPE(type) \ + CREATE_SELECT_IF_BENCHMARK(type, 0.05f), CREATE_SELECT_IF_BENCHMARK(type, 0.25f), \ + CREATE_SELECT_IF_BENCHMARK(type, 0.5f), CREATE_SELECT_IF_BENCHMARK(type, 0.75f) + +#define BENCHMARK_UNIQUE_TYPE(type) \ + CREATE_UNIQUE_BENCHMARK(type, 0.05f), CREATE_UNIQUE_BENCHMARK(type, 0.25f), \ + 
CREATE_UNIQUE_BENCHMARK(type, 0.5f), CREATE_UNIQUE_BENCHMARK(type, 0.75f) + +#define BENCHMARK_UNIQUE_BY_KEY_TYPE(key_type, value_type) \ + CREATE_UNIQUE_BY_KEY_BENCHMARK(key_type, value_type, 0.05f), \ + CREATE_UNIQUE_BY_KEY_BENCHMARK(key_type, value_type, 0.25f), \ + CREATE_UNIQUE_BY_KEY_BENCHMARK(key_type, value_type, 0.5f), \ + CREATE_UNIQUE_BY_KEY_BENCHMARK(key_type, value_type, 0.75f) + +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -555,53 +488,51 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); std::cout << "benchmark_device_select" << std::endl; // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; - using custom_double2 = benchmark_utils::custom_type; + using custom_double2 = benchmark_utils::custom_type; using custom_int_double = benchmark_utils::custom_type; // Add benchmarks - std::vector benchmarks = - { - BENCHMARK_FLAGGED_TYPE(int, unsigned char), - BENCHMARK_FLAGGED_TYPE(float, unsigned char), - BENCHMARK_FLAGGED_TYPE(double, unsigned char), - BENCHMARK_FLAGGED_TYPE(uint8_t, uint8_t), - BENCHMARK_FLAGGED_TYPE(int8_t, int8_t), - BENCHMARK_FLAGGED_TYPE(custom_double2, unsigned char), - - BENCHMARK_IF_TYPE(int), - BENCHMARK_IF_TYPE(float), - BENCHMARK_IF_TYPE(double), - BENCHMARK_IF_TYPE(uint8_t), - BENCHMARK_IF_TYPE(int8_t), - BENCHMARK_IF_TYPE(custom_int_double), - - BENCHMARK_UNIQUE_TYPE(int), - BENCHMARK_UNIQUE_TYPE(float), - BENCHMARK_UNIQUE_TYPE(double), - BENCHMARK_UNIQUE_TYPE(uint8_t), - 
BENCHMARK_UNIQUE_TYPE(int8_t), - BENCHMARK_UNIQUE_TYPE(custom_int_double), - - BENCHMARK_UNIQUE_BY_KEY_TYPE(int, int), - BENCHMARK_UNIQUE_BY_KEY_TYPE(float, double), - BENCHMARK_UNIQUE_BY_KEY_TYPE(double, custom_double2), - BENCHMARK_UNIQUE_BY_KEY_TYPE(uint8_t, uint8_t), - BENCHMARK_UNIQUE_BY_KEY_TYPE(int8_t, double), - BENCHMARK_UNIQUE_BY_KEY_TYPE(custom_int_double, custom_int_double) - }; + std::vector benchmarks + = {BENCHMARK_FLAGGED_TYPE(int, unsigned char), + BENCHMARK_FLAGGED_TYPE(float, unsigned char), + BENCHMARK_FLAGGED_TYPE(double, unsigned char), + BENCHMARK_FLAGGED_TYPE(uint8_t, uint8_t), + BENCHMARK_FLAGGED_TYPE(int8_t, int8_t), + BENCHMARK_FLAGGED_TYPE(custom_double2, unsigned char), + + BENCHMARK_IF_TYPE(int), + BENCHMARK_IF_TYPE(float), + BENCHMARK_IF_TYPE(double), + BENCHMARK_IF_TYPE(uint8_t), + BENCHMARK_IF_TYPE(int8_t), + BENCHMARK_IF_TYPE(custom_int_double), + + BENCHMARK_UNIQUE_TYPE(int), + BENCHMARK_UNIQUE_TYPE(float), + BENCHMARK_UNIQUE_TYPE(double), + BENCHMARK_UNIQUE_TYPE(uint8_t), + BENCHMARK_UNIQUE_TYPE(int8_t), + BENCHMARK_UNIQUE_TYPE(custom_int_double), + + BENCHMARK_UNIQUE_BY_KEY_TYPE(int, int), + BENCHMARK_UNIQUE_BY_KEY_TYPE(float, double), + BENCHMARK_UNIQUE_BY_KEY_TYPE(double, custom_double2), + BENCHMARK_UNIQUE_BY_KEY_TYPE(uint8_t, uint8_t), + BENCHMARK_UNIQUE_BY_KEY_TYPE(int8_t, double), + BENCHMARK_UNIQUE_BY_KEY_TYPE(custom_int_double, custom_int_double)}; // Use manual timing for(auto& b : benchmarks) diff --git a/benchmark/benchmark_device_spmv.cpp b/benchmark/benchmark_device_spmv.cpp index e884f361..37d119ee 100644 --- a/benchmark/benchmark_device_spmv.cpp +++ b/benchmark/benchmark_device_spmv.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. 
+// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -29,26 +29,30 @@ const size_t DEFAULT_N = 1024 * 32; #endif -const unsigned int batch_size = 10; +const unsigned int batch_size = 10; const unsigned int warmup_size = 5; template void run_benchmark(benchmark::State& state, - size_t size, + size_t size, const hipStream_t stream, - float probability) + float probability) { const T rand_min = T(1); const T rand_max = T(10); // generate a lexicograhically sorted list of (row, column) index tuples // number of nonzeroes cannot be guaranteed as duplicates may exist - const int num_nonzeroes_attempt = static_cast(std::min( - static_cast(INT_MAX), static_cast(probability * static_cast(size * size)))); + const int num_nonzeroes_attempt = static_cast( + std::min(static_cast(INT_MAX), + static_cast(probability * static_cast(size * size)))); std::vector> indices(num_nonzeroes_attempt); { - std::vector flat_indices = benchmark_utils::get_random_data( - 2 * num_nonzeroes_attempt, 0, size - 1, 2 * num_nonzeroes_attempt); + std::vector flat_indices + = benchmark_utils::get_random_data(2 * num_nonzeroes_attempt, + 0, + size - 1, + 2 * num_nonzeroes_attempt); for(int i = 0; i < num_nonzeroes_attempt; i++) { indices[i] = std::make_pair(flat_indices[2 * i], flat_indices[2 * i + 1]); @@ -57,16 +61,17 @@ void run_benchmark(benchmark::State& state, } // generate the compressed sparse rows matrix - std::pair prev_cell = std::make_pair(-1, -1); - int num_nonzeroes = 0; - std::vector row_offsets(size + 1); - // this vector might be too large, but doing the allocation now eliminates a scan - std::vector column_indices(num_nonzeroes_attempt); - row_offsets[0] = 0; + std::pair prev_cell = std::make_pair(-1, -1); + int num_nonzeroes = 0; + std::vector 
row_offsets(size + 1); + // this vector might be too large, but doing the allocation now eliminates a + // scan + std::vector column_indices(num_nonzeroes_attempt); + row_offsets[0] = 0; int last_row_written = 0; for(int i = 0; i < num_nonzeroes_attempt; i++) { - if(indices[i] != prev_cell) + if(indices[i] != prev_cell) { // update the row offets if we go to the next row (or skip some) if(indices[i].first != last_row_written) @@ -94,67 +99,90 @@ void run_benchmark(benchmark::State& state, std::vector vector_x = benchmark_utils::get_random_data(size, rand_min, rand_max); - T * d_values; - int * d_row_offsets; - int * d_column_indices; - T * d_vector_x; - T * d_vector_y; - HIP_CHECK(hipMalloc(&d_values, values.size() * sizeof(T))); + T* d_values; + int* d_row_offsets; + int* d_column_indices; + T* d_vector_x; + T* d_vector_y; + HIP_CHECK(hipMalloc(&d_values, values.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_row_offsets, row_offsets.size() * sizeof(int))); HIP_CHECK(hipMalloc(&d_column_indices, num_nonzeroes * sizeof(int))); HIP_CHECK(hipMalloc(&d_vector_x, vector_x.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_vector_y, size * sizeof(T))); - HIP_CHECK(hipMemcpy( - d_values, values.data(), values.size() * sizeof(T), - hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy( - d_row_offsets, row_offsets.data(), row_offsets.size() * sizeof(int), - hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy( - d_column_indices, column_indices.data(), num_nonzeroes * sizeof(int), - hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy( - d_vector_x, vector_x.data(), vector_x.size() * sizeof(T), - hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_values, values.data(), values.size() * sizeof(T), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_row_offsets, + row_offsets.data(), + row_offsets.size() * sizeof(int), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_column_indices, + column_indices.data(), + num_nonzeroes * sizeof(int), + hipMemcpyHostToDevice)); + HIP_CHECK( + hipMemcpy(d_vector_x, 
vector_x.data(), vector_x.size() * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // Allocate temporary storage memory size_t temp_storage_size_bytes; // Get size of d_temp_storage - HIP_CHECK(hipcub::DeviceSpmv::CsrMV( - nullptr, temp_storage_size_bytes, d_values, d_row_offsets, - d_column_indices, d_vector_x, d_vector_y, size, size, num_nonzeroes, stream)); + HIP_CHECK(hipcub::DeviceSpmv::CsrMV(nullptr, + temp_storage_size_bytes, + d_values, + d_row_offsets, + d_column_indices, + d_vector_x, + d_vector_y, + size, + size, + num_nonzeroes, + stream)); HIP_CHECK(hipDeviceSynchronize()); // allocate temporary storage - void * d_temp_storage = nullptr; + void* d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up - for(size_t i = 0; i < warmup_size; i++) + for(size_t i = 0; i < warmup_size; i++) { - HIP_CHECK(hipcub::DeviceSpmv::CsrMV( - d_temp_storage, temp_storage_size_bytes, d_values, d_row_offsets, - d_column_indices, d_vector_x, d_vector_y, size, size, num_nonzeroes, stream)); + HIP_CHECK(hipcub::DeviceSpmv::CsrMV(d_temp_storage, + temp_storage_size_bytes, + d_values, + d_row_offsets, + d_column_indices, + d_vector_x, + d_vector_y, + size, + size, + num_nonzeroes, + stream)); } HIP_CHECK(hipDeviceSynchronize()); - for(auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); - for(size_t i = 0; i < batch_size; i++) + for(size_t i = 0; i < batch_size; i++) { - HIP_CHECK(hipcub::DeviceSpmv::CsrMV( - d_temp_storage, temp_storage_size_bytes, d_values, d_row_offsets, - d_column_indices, d_vector_x, d_vector_y, size, size, num_nonzeroes, stream)); + HIP_CHECK(hipcub::DeviceSpmv::CsrMV(d_temp_storage, + temp_storage_size_bytes, + d_values, + d_row_offsets, + d_column_indices, + d_vector_x, + d_vector_y, + size, + size, + num_nonzeroes, + stream)); } HIP_CHECK(hipDeviceSynchronize()); auto end = 
std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * (num_nonzeroes + size) * sizeof(T)); @@ -169,20 +197,20 @@ void run_benchmark(benchmark::State& state, HIP_CHECK(hipDeviceSynchronize()); } -#define CREATE_BENCHMARK(T, p) \ -benchmark::RegisterBenchmark( \ - ("CsrMV<" #T ">(p = " #p")"), \ - &run_benchmark, size, stream, p \ -) +#define CREATE_BENCHMARK(T, p) \ + benchmark::RegisterBenchmark( \ + std::string("device_spmv_CsrMV.").c_str(), \ + &run_benchmark, \ + size, \ + stream, \ + p) -#define BENCHMARK_TYPE(type) \ - CREATE_BENCHMARK(type, 1.0e-6f), \ - CREATE_BENCHMARK(type, 1.0e-5f), \ - CREATE_BENCHMARK(type, 1.0e-4f), \ - CREATE_BENCHMARK(type, 1.0e-3f), \ - CREATE_BENCHMARK(type, 1.0e-2f) +#define BENCHMARK_TYPE(type) \ + CREATE_BENCHMARK(type, 1.0e-6f), CREATE_BENCHMARK(type, 1.0e-5f), \ + CREATE_BENCHMARK(type, 1.0e-4f), CREATE_BENCHMARK(type, 1.0e-3f), \ + CREATE_BENCHMARK(type, 1.0e-2f) -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -191,20 +219,21 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + + std::cout << "benchmark_device_spmv" << std::endl; std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks - std::vector 
benchmarks = - { + std::vector benchmarks = { BENCHMARK_TYPE(int), BENCHMARK_TYPE(unsigned int), BENCHMARK_TYPE(float), diff --git a/benchmark/benchmark_utils.hpp b/benchmark/benchmark_utils.hpp index 6e2f9793..fa3da901 100644 --- a/benchmark/benchmark_utils.hpp +++ b/benchmark/benchmark_utils.hpp @@ -38,9 +38,9 @@ #include "hipcub/tuple.hpp" #ifndef HIPCUB_CUB_API -#define HIPCUB_WARP_THREADS_MACRO warpSize + #define HIPCUB_WARP_THREADS_MACRO warpSize #else -#define HIPCUB_WARP_THREADS_MACRO CUB_PTX_WARP_THREADS + #define HIPCUB_WARP_THREADS_MACRO CUB_PTX_WARP_THREADS #endif namespace benchmark_utils @@ -49,18 +49,18 @@ const size_t default_max_random_size = 1024 * 1024; // get_random_data() generates only part of sequence and replicates it, // because benchmarks usually do not need "true" random sequence. template -inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = default_max_random_size) - -> typename std::enable_if::value, std::vector>::type +inline auto + get_random_data(size_t size, T min, T max, size_t max_random_size = default_max_random_size) -> + typename std::enable_if::value, std::vector>::type { - std::random_device rd; + std::random_device rd; std::default_random_engine gen(rd()); - using distribution_type = typename std::conditional<(sizeof(T)==1), short, T>::type; + using distribution_type = typename std::conditional<(sizeof(T) == 1), short, T>::type; std::uniform_int_distribution distribution(min, max); - std::vector data(size); - std::generate( - data.begin(), data.begin() + std::min(size, max_random_size), - [&]() { return distribution(gen); } - ); + std::vector data(size); + std::generate(data.begin(), + data.begin() + std::min(size, max_random_size), + [&]() { return distribution(gen); }); for(size_t i = max_random_size; i < size; i += max_random_size) { std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); @@ -69,17 +69,17 @@ inline auto get_random_data(size_t size, T min, T max, size_t 
max_random_size = } template -inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = default_max_random_size) - -> typename std::enable_if::value, std::vector>::type +inline auto + get_random_data(size_t size, T min, T max, size_t max_random_size = default_max_random_size) -> + typename std::enable_if::value, std::vector>::type { - std::random_device rd; - std::default_random_engine gen(rd()); + std::random_device rd; + std::default_random_engine gen(rd()); std::uniform_real_distribution distribution(min, max); - std::vector data(size); - std::generate( - data.begin(), data.begin() + std::min(size, max_random_size), - [&]() { return distribution(gen); } - ); + std::vector data(size); + std::generate(data.begin(), + data.begin() + std::min(size, max_random_size), + [&]() { return distribution(gen); }); for(size_t i = max_random_size; i < size; i += max_random_size) { std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); @@ -88,16 +88,16 @@ inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = } template -inline std::vector get_random_data01(size_t size, float p, size_t max_random_size = default_max_random_size) +inline std::vector + get_random_data01(size_t size, float p, size_t max_random_size = default_max_random_size) { - std::random_device rd; - std::default_random_engine gen(rd()); + std::random_device rd; + std::default_random_engine gen(rd()); std::bernoulli_distribution distribution(p); - std::vector data(size); - std::generate( - data.begin(), data.begin() + std::min(size, max_random_size), - [&]() { return distribution(gen); } - ); + std::vector data(size); + std::generate(data.begin(), + data.begin() + std::min(size, max_random_size), + [&]() { return distribution(gen); }); for(size_t i = max_random_size; i < size; i += max_random_size) { std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); @@ -111,84 +111,87 @@ inline T get_random_value(T min, T 
max) return get_random_data(1, min, max)[0]; } - // Can't use std::prefix_sum for inclusive/exclusive scan, because // it does not handle short[] -> int(int a, int b) { a + b; } -> int[] // they way we expect. That's because sum in std::prefix_sum's implementation // is of type typename std::iterator_traits::value_type (short) template -OutputIt host_inclusive_scan(InputIt first, InputIt last, - OutputIt d_first, BinaryOperation op) +OutputIt host_inclusive_scan(InputIt first, InputIt last, OutputIt d_first, BinaryOperation op) { - using input_type = typename std::iterator_traits::value_type; + using input_type = typename std::iterator_traits::value_type; using output_type = typename std::iterator_traits::value_type; using result_type = - typename std::conditional< - std::is_void::value, input_type, output_type - >::type; + typename std::conditional::value, input_type, output_type>::type; - if (first == last) return d_first; + if(first == last) + return d_first; result_type sum = *first; - *d_first = sum; + *d_first = sum; - while (++first != last) { - sum = op(sum, static_cast(*first)); - *++d_first = sum; + while(++first != last) + { + sum = op(sum, static_cast(*first)); + *++d_first = sum; } return ++d_first; } template -OutputIt host_exclusive_scan(InputIt first, InputIt last, - T initial_value, OutputIt d_first, - BinaryOperation op) +OutputIt host_exclusive_scan( + InputIt first, InputIt last, T initial_value, OutputIt d_first, BinaryOperation op) { - using input_type = typename std::iterator_traits::value_type; + using input_type = typename std::iterator_traits::value_type; using output_type = typename std::iterator_traits::value_type; using result_type = - typename std::conditional< - std::is_void::value, input_type, output_type - >::type; + typename std::conditional::value, input_type, output_type>::type; - if (first == last) return d_first; + if(first == last) + return d_first; result_type sum = initial_value; - *d_first = initial_value; + *d_first = 
initial_value; - while ((first+1) != last) + while((first + 1) != last) { - sum = op(sum, static_cast(*first)); - *++d_first = sum; - first++; + sum = op(sum, static_cast(*first)); + *++d_first = sum; + first++; } return ++d_first; } -template -OutputIt host_exclusive_scan_by_key(InputIt first, InputIt last, KeyIt k_first, - T initial_value, OutputIt d_first, - BinaryOperation op, KeyCompare key_compare_op) +template +OutputIt host_exclusive_scan_by_key(InputIt first, + InputIt last, + KeyIt k_first, + T initial_value, + OutputIt d_first, + BinaryOperation op, + KeyCompare key_compare_op) { - using input_type = typename std::iterator_traits::value_type; + using input_type = typename std::iterator_traits::value_type; using output_type = typename std::iterator_traits::value_type; using result_type = - typename std::conditional< - std::is_void::value, input_type, output_type - >::type; + typename std::conditional::value, input_type, output_type>::type; - if (first == last) return d_first; + if(first == last) + return d_first; result_type sum = initial_value; - *d_first = initial_value; + *d_first = initial_value; - while ((first+1) != last) + while((first + 1) != last) { if(key_compare_op(*k_first, *++k_first)) { sum = op(sum, static_cast(*first)); - } - else + } else { sum = initial_value; } @@ -201,120 +204,106 @@ OutputIt host_exclusive_scan_by_key(InputIt first, InputIt last, KeyIt k_first, template struct custom_type { - using first_type = T; + using first_type = T; using second_type = U; T x; U y; - HIPCUB_HOST_DEVICE inline - constexpr custom_type() : x(T()), y(U()) {} + HIPCUB_HOST_DEVICE inline constexpr custom_type() : x(T()), y(U()) {} - HIPCUB_HOST_DEVICE inline - constexpr custom_type(T xx, U yy) : x(xx), y(yy) - { - } + HIPCUB_HOST_DEVICE inline constexpr custom_type(T xx, U yy) : x(xx), y(yy) {} - HIPCUB_HOST_DEVICE inline - constexpr custom_type(T xy) : x(xy), y(xy) - { - } + HIPCUB_HOST_DEVICE inline constexpr custom_type(T xy) : x(xy), y(xy) {} 
template - HIPCUB_HOST_DEVICE inline - custom_type(const custom_type& other) : x(other.x), y(other.y) - { - } + HIPCUB_HOST_DEVICE inline custom_type(const custom_type& other) : x(other.x), y(other.y) + {} - #ifndef HIPCUB_CUB_API - HIPCUB_HOST_DEVICE inline - ~custom_type() = default; - #endif +#ifndef HIPCUB_CUB_API + HIPCUB_HOST_DEVICE inline ~custom_type() = default; +#endif - HIPCUB_HOST_DEVICE inline - custom_type& operator=(const custom_type& other) + HIPCUB_HOST_DEVICE inline custom_type& operator=(const custom_type& other) { x = other.x; y = other.y; return *this; } - HIPCUB_HOST_DEVICE inline - custom_type operator+(const custom_type& rhs) const + HIPCUB_HOST_DEVICE inline custom_type operator+(const custom_type& rhs) const { return custom_type(x + rhs.x, y + rhs.y); } - HIPCUB_HOST_DEVICE inline - custom_type operator-(const custom_type& other) const + HIPCUB_HOST_DEVICE inline custom_type operator-(const custom_type& other) const { return custom_type(x - other.x, y - other.y); } - HIPCUB_HOST_DEVICE inline - bool operator<(const custom_type& rhs) const + HIPCUB_HOST_DEVICE inline bool operator<(const custom_type& rhs) const { // intentionally suboptimal choice for short-circuting, // required to generate more performant device code return ((x == rhs.x && y < rhs.y) || x < rhs.x); } - HIPCUB_HOST_DEVICE inline - bool operator>(const custom_type& other) const + HIPCUB_HOST_DEVICE inline bool operator>(const custom_type& other) const { return (x > other.x || (x == other.x && y > other.y)); } - HIPCUB_HOST_DEVICE inline - bool operator==(const custom_type& rhs) const + HIPCUB_HOST_DEVICE inline bool operator==(const custom_type& rhs) const { return x == rhs.x && y == rhs.y; } - HIPCUB_HOST_DEVICE inline - bool operator!=(const custom_type& other) const + HIPCUB_HOST_DEVICE inline bool operator!=(const custom_type& other) const { - return !(*this == other); + return !(*this == other); } HIPCUB_HOST_DEVICE custom_type& operator+=(const custom_type& rhs) { - 
this->x += rhs.x; - this->y += rhs.y; - return *this; + this->x += rhs.x; + this->y += rhs.y; + return *this; } }; template -struct is_custom_type : std::false_type {}; +struct is_custom_type : std::false_type +{}; template -struct is_custom_type> : std::true_type {}; +struct is_custom_type> : std::true_type +{}; template struct custom_type_decomposer { static_assert(is_custom_type::value, - "custom_type_decomposer can only be used with instantiations of custom_type"); + "custom_type_decomposer can only be used with instantiations " + "of custom_type"); using T = typename CustomType::first_type; using U = typename CustomType::second_type; HIPCUB_HOST_DEVICE ::hipcub::tuple operator()(CustomType& key) const { - return ::hipcub::tuple{key.x, key.y}; + return ::hipcub::tuple{key.x, key.y}; } }; template -inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = 1024 * 1024) - -> typename std::enable_if::value, std::vector>::type +inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = 1024 * 1024) -> + typename std::enable_if::value, std::vector>::type { - using first_type = typename T::first_type; + using first_type = typename T::first_type; using second_type = typename T::second_type; std::vector data(size); - auto fdata = get_random_data(size, min.x, max.x, max_random_size); - auto sdata = get_random_data(size, min.y, max.y, max_random_size); + auto fdata = get_random_data(size, min.x, max.x, max_random_size); + auto sdata = get_random_data(size, min.y, max.y, max_random_size); for(size_t i = 0; i < size; i++) { data[i] = T(fdata[i], sdata[i]); @@ -323,13 +312,15 @@ inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = } template -inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = 1024 * 1024) - -> typename std::enable_if::value && !std::is_same::value, std::vector>::type +inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = 1024 * 1024) 
-> + typename std::enable_if::value + && !std::is_same::value, + std::vector>::type { using field_type = decltype(max.x); std::vector data(size); - auto field_data = get_random_data(size, min.x, max.x, max_random_size); + auto field_data = get_random_data(size, min.x, max.x, max_random_size); for(size_t i = 0; i < size; i++) { data[i] = T(field_data[i]); @@ -338,33 +329,28 @@ inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = } template -std::vector get_random_segments(const size_t size, - const size_t max_segment_length, - const int seed_value) +std::vector + get_random_segments(const size_t size, const size_t max_segment_length, const int seed_value) { static_assert(std::is_arithmetic::value, "Key type must be arithmetic"); - std::default_random_engine prng(seed_value); + std::default_random_engine prng(seed_value); std::uniform_int_distribution segment_length_distribution(max_segment_length); - using key_distribution_type = std::conditional_t< - std::is_integral::value, - std::uniform_int_distribution, - std::uniform_real_distribution - >; + using key_distribution_type = std::conditional_t::value, + std::uniform_int_distribution, + std::uniform_real_distribution>; key_distribution_type key_distribution(std::numeric_limits::max()); - std::vector keys(size); + std::vector keys(size); size_t keys_start_index = 0; - while (keys_start_index < size) + while(keys_start_index < size) { const size_t new_segment_length = segment_length_distribution(prng); - const size_t new_segment_end = std::min(size, keys_start_index + new_segment_length); - const T key = key_distribution(prng); - std::fill( - std::next(keys.begin(), keys_start_index), - std::next(keys.begin(), new_segment_end), - key - ); + const size_t new_segment_end = std::min(size, keys_start_index + new_segment_length); + const T key = key_distribution(prng); + std::fill(std::next(keys.begin(), keys_start_index), + std::next(keys.begin(), new_segment_end), + key); keys_start_index += 
new_segment_length; } return keys; @@ -437,54 +423,54 @@ inline constexpr auto ceiling_div(const T a, const U b) return a / b + (a % b > 0 ? 1 : 0); } -} // end benchmark_util namespace +} // namespace benchmark_utils // Need for hipcub::DeviceReduce::Min/Max etc. namespace std { - template<> - class numeric_limits> - { - using T = typename benchmark_utils::custom_type; +template<> +class numeric_limits> +{ + using T = typename benchmark_utils::custom_type; - public: - static constexpr inline T min() - { +public: + static constexpr inline T min() + { return std::numeric_limits::min(); - } - - static constexpr inline T max() - { - return std::numeric_limits::max(); - } + } - static constexpr inline T lowest() - { - return std::numeric_limits::lowest(); - } - }; + static constexpr inline T max() + { + return std::numeric_limits::max(); + } - template<> - class numeric_limits> + static constexpr inline T lowest() { - using T = typename benchmark_utils::custom_type; + return std::numeric_limits::lowest(); + } +}; - public: - static constexpr inline T min() - { - return std::numeric_limits::min(); - } +template<> +class numeric_limits> +{ + using T = typename benchmark_utils::custom_type; - static constexpr inline T max() - { - return std::numeric_limits::max(); - } +public: + static constexpr inline T min() + { + return std::numeric_limits::min(); + } - static constexpr inline T lowest() - { - return std::numeric_limits::lowest(); - } - }; -} + static constexpr inline T max() + { + return std::numeric_limits::max(); + } + + static constexpr inline T lowest() + { + return std::numeric_limits::lowest(); + } +}; +} // namespace std #endif // HIPCUB_BENCHMARK_UTILS_HPP_ diff --git a/benchmark/benchmark_warp_exchange.cpp b/benchmark/benchmark_warp_exchange.cpp index be5ce636..598df954 100644 --- a/benchmark/benchmark_warp_exchange.cpp +++ b/benchmark/benchmark_warp_exchange.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // 
furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -41,8 +41,8 @@ __device__ auto warp_exchange_benchmark(T* d_output) -> std::enable_if_t> { T thread_data[ItemsPerThread]; - #pragma unroll - for (unsigned i = 0; i < ItemsPerThread; ++i) +#pragma unroll + for(unsigned i = 0; i < ItemsPerThread; ++i) { thread_data[i] = static_cast(i); } @@ -52,18 +52,18 @@ __device__ auto warp_exchange_benchmark(T* d_output) LogicalWarpSize, 1, // ARCH Algorithm>; - constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize; + constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize; __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_in_block]; const unsigned warp_id = threadIdx.x / LogicalWarpSize; WarpExchangeT warp_exchange(temp_storage[warp_id]); Op{}(warp_exchange, thread_data); - #pragma unroll - for (unsigned i = 0; i < ItemsPerThread; ++i) +#pragma unroll + for(unsigned i = 0; i < ItemsPerThread; ++i) { const unsigned global_idx = (BlockSize * blockIdx.x + threadIdx.x) * ItemsPerThread + i; - d_output[global_idx] = thread_data[i]; + d_output[global_idx] = thread_data[i]; } } @@ -97,23 +97,23 @@ __device__ auto warp_exchange_scatter_to_striped_benchmark(T* d_output) -> std::enable_if_t> { const unsigned warp_id = threadIdx.x / LogicalWarpSize; - T thread_data[ItemsPerThread]; - OffsetT thread_ranks[ItemsPerThread]; - #pragma unroll - for (unsigned i = 0; i < ItemsPerThread; ++i) + T thread_data[ItemsPerThread]; + OffsetT thread_ranks[ItemsPerThread]; +#pragma unroll + for(unsigned i = 0; i < ItemsPerThread; ++i) { - 
thread_data[i] = static_cast(i); + thread_data[i] = static_cast(i); thread_ranks[i] = static_cast(LogicalWarpSize - warp_id * ItemsPerThread - i - 1); } using WarpExchangeT = ::hipcub::WarpExchange; - constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize; + constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize; __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_in_block]; WarpExchangeT(temp_storage[warp_id]).ScatterToStriped(thread_data, thread_ranks); - #pragma unroll - for (unsigned i = 0; i < ItemsPerThread; ++i) +#pragma unroll + for(unsigned i = 0; i < ItemsPerThread; ++i) { const unsigned striped_global_idx = BlockSize * ItemsPerThread * blockIdx.x + BlockSize * i + threadIdx.x; @@ -149,18 +149,18 @@ template void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { - constexpr unsigned trials = 100; + constexpr unsigned trials = 100; constexpr unsigned items_per_block = BlockSize * ItemsPerThread; - const unsigned size = items_per_block * ((N + items_per_block - 1) / items_per_block); + const unsigned size = items_per_block * ((N + items_per_block - 1) / items_per_block); - T * d_output; + T* d_output; HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); - for (size_t i = 0; i < trials; ++i) + for(size_t i = 0; i < trials; ++i) { warp_exchange_kernel <<>>(d_output); @@ -169,8 +169,8 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) HIP_CHECK(hipPeekAtLastError()) HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * trials * size * sizeof(T)); @@ -179,27 +179,25 @@ void run_benchmark(benchmark::State& state, 
hipStream_t stream, size_t N) HIP_CHECK(hipFree(d_output)); } -template< - class T, - class OffsetT, - unsigned BlockSize, - unsigned ItemsPerThread, - unsigned LogicalWarpSize -> +template void run_benchmark_scatter_to_striped(benchmark::State& state, hipStream_t stream, size_t N) { - constexpr unsigned trials = 100; + constexpr unsigned trials = 100; constexpr unsigned items_per_block = BlockSize * ItemsPerThread; - const unsigned size = items_per_block * ((N + items_per_block - 1) / items_per_block); + const unsigned size = items_per_block * ((N + items_per_block - 1) / items_per_block); - T * d_output; + T* d_output; HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); - for (size_t i = 0; i < trials; ++i) + for(size_t i = 0; i < trials; ++i) { warp_exchange_scatter_to_striped_kernel>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * trials * size * sizeof(T)); @@ -239,30 +237,34 @@ struct BlockedToStripedOp } }; -#define CREATE_BENCHMARK_STRIPED_TO_BLOCKED(T, BS, IT, WS, ALG) \ - benchmark::RegisterBenchmark("warp_exchange_striped_to_blocked.", \ - &run_benchmark, \ - stream, \ +#define CREATE_BENCHMARK_STRIPED_TO_BLOCKED(T, BS, IT, WS, ALG) \ + benchmark::RegisterBenchmark(std::string("warp_exchange_striped_to_blocked.") \ + .c_str(), \ + &run_benchmark, \ + stream, \ size) -#define CREATE_BENCHMARK_BLOCKED_TO_STRIPED(T, BS, IT, WS, ALG) \ - benchmark::RegisterBenchmark("warp_exchange_blocked_to_striped.", \ - &run_benchmark, \ - stream, \ +#define CREATE_BENCHMARK_BLOCKED_TO_STRIPED(T, BS, IT, WS, ALG) \ + benchmark::RegisterBenchmark(std::string("warp_exchange_blocked_to_striped.") \ + .c_str(), \ + &run_benchmark, \ + stream, \ size) -#define CREATE_BENCHMARK_SCATTER_TO_STRIPED(T, OFFSET_T, BS, IT, WS) \ 
-benchmark::RegisterBenchmark( \ - "warp_exchange_scatter_to_striped.", \ - &run_benchmark_scatter_to_striped, \ - stream, size \ -) +#define CREATE_BENCHMARK_SCATTER_TO_STRIPED(T, OFFSET_T, BS, IT, WS) \ + benchmark::RegisterBenchmark(std::string("warp_exchange_scatter_to_striped.") \ + .c_str(), \ + &run_benchmark_scatter_to_striped, \ + stream, \ + size) -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -271,81 +273,79 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); std::cout << "benchmark_warp_exchange" << std::endl; // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks{ - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 16, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 16, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 16, 16, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 16, 16, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 32, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 32, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 32, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 32, WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 16, ::hipcub::WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 16, ::hipcub::WARP_EXCHANGE_SMEM), + 
CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 16, 16, ::hipcub::WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 16, 16, ::hipcub::WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 32, ::hipcub::WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 32, ::hipcub::WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 32, ::hipcub::WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 32, ::hipcub::WARP_EXCHANGE_SMEM), CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 128, 4, 16), CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 128, 4, 32), CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 256, 4, 32), - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 16, 16, WARP_EXCHANGE_SHUFFLE), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 16, 16, WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 16, 16, ::hipcub::WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 16, 16, ::hipcub::WARP_EXCHANGE_SHUFFLE), // CUB requires WS == IPT for WARP_EXCHANGE_SHUFFLE #ifdef HIPCUB_ROCPRIM_API - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 16, WARP_EXCHANGE_SHUFFLE), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 16, WARP_EXCHANGE_SHUFFLE), - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 32, WARP_EXCHANGE_SHUFFLE), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 32, WARP_EXCHANGE_SHUFFLE), - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 32, WARP_EXCHANGE_SHUFFLE), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 32, WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 16, ::hipcub::WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 16, ::hipcub::WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 32, ::hipcub::WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 32, ::hipcub::WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 32, 
::hipcub::WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 32, ::hipcub::WARP_EXCHANGE_SHUFFLE), #endif }; #ifdef HIPCUB_ROCPRIM_API - if (::benchmark_utils::is_warp_size_supported(64)) + if(::benchmark_utils::is_warp_size_supported(64)) { std::vector additional_benchmarks{ - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 64, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 64, WARP_EXCHANGE_SHUFFLE), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 64, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 64, WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 64, ::hipcub::WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 64, ::hipcub::WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 64, ::hipcub::WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 64, ::hipcub::WARP_EXCHANGE_SHUFFLE), CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 128, 4, 64), - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 64, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 64, WARP_EXCHANGE_SHUFFLE), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 64, WARP_EXCHANGE_SMEM), - CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 64, WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 64, ::hipcub::WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 64, ::hipcub::WARP_EXCHANGE_SHUFFLE), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 64, ::hipcub::WARP_EXCHANGE_SMEM), + CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 64, ::hipcub::WARP_EXCHANGE_SHUFFLE), CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 256, 4, 64)}; - benchmarks.insert( - benchmarks.end(), - additional_benchmarks.begin(), - additional_benchmarks.end() - ); + benchmarks.insert(benchmarks.end(), + additional_benchmarks.begin(), + additional_benchmarks.end()); } #endif // Use manual timing - for (auto& b : 
benchmarks) + for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations - if (trials > 0) + if(trials > 0) { - for (auto& b : benchmarks) + for(auto& b : benchmarks) { b->Iterations(trials); } diff --git a/benchmark/benchmark_warp_load.cpp b/benchmark/benchmark_warp_load.cpp index 50bc0a19..4298db66 100644 --- a/benchmark/benchmark_warp_load.cpp +++ b/benchmark/benchmark_warp_load.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -41,17 +41,17 @@ __device__ auto warp_load_benchmark(T* d_input, T* d_output) { using WarpLoadT = ::hipcub::WarpLoad; constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize; - constexpr int tile_size = ItemsPerThread * LogicalWarpSize; + constexpr int tile_size = ItemsPerThread * LogicalWarpSize; const unsigned warp_id = threadIdx.x / LogicalWarpSize; const unsigned global_warp_id = blockIdx.x * warps_in_block + warp_id; __shared__ typename WarpLoadT::TempStorage temp_storage[warps_in_block]; - T thread_data[ItemsPerThread]; + T thread_data[ItemsPerThread]; WarpLoadT(temp_storage[warp_id]).Load(d_input + global_warp_id * tile_size, thread_data); - #pragma unroll - for (unsigned i = 0; i < ItemsPerThread; ++i) +#pragma unroll + for(unsigned i = 0; i < ItemsPerThread; ++i) { const unsigned striped_global_idx = BlockSize * ItemsPerThread * blockIdx.x + BlockSize * i + threadIdx.x; @@ -78,37 +78,29 @@ __global__ __launch_bounds__(BlockSize) void 
warp_load_kernel(T* d_input, T* d_o warp_load_benchmark(d_input, d_output); } -template< - class T, - unsigned BlockSize, - unsigned ItemsPerThread, - unsigned LogicalWarpSize, - ::hipcub::WarpLoadAlgorithm Algorithm, - unsigned Trials = 100 -> +template void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { constexpr unsigned items_per_block = BlockSize * ItemsPerThread; - const unsigned size = items_per_block * ((N + items_per_block - 1) / items_per_block); + const unsigned size = items_per_block * ((N + items_per_block - 1) / items_per_block); std::vector input = benchmark_utils::get_random_data(size, T(0), T(10)); - T * d_input; - T * d_output; + T* d_input; + T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); - for (size_t i = 0; i < Trials; i++) + for(size_t i = 0; i < Trials; i++) { warp_load_kernel <<>>(d_input, d_output); @@ -116,8 +108,8 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) HIP_CHECK(hipPeekAtLastError()) HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); @@ -127,14 +119,16 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(T, BS, IT, WS, ALG) \ -benchmark::RegisterBenchmark( \ - "warp_load.", \ - &run_benchmark, \ - stream, size \ -) +#define CREATE_BENCHMARK(T, BS, 
IT, WS, ALG) \ + benchmark::RegisterBenchmark(std::string("warp_load.") \ + .c_str(), \ + &run_benchmark, \ + stream, \ + size) -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -143,15 +137,17 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + + std::cout << "benchmark_warp_load" << std::endl; std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks @@ -199,7 +195,7 @@ int main(int argc, char *argv[]) // CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_LOAD_TRANSPOSE) }; - if (::benchmark_utils::is_warp_size_supported(64)) + if(::benchmark_utils::is_warp_size_supported(64)) { std::vector additional_benchmarks{ CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_LOAD_DIRECT), @@ -245,24 +241,22 @@ int main(int argc, char *argv[]) // WARP_LOAD_TRANSPOSE removed because of shared memory limit // CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_LOAD_TRANSPOSE) }; - benchmarks.insert( - benchmarks.end(), - additional_benchmarks.begin(), - additional_benchmarks.end() - ); + benchmarks.insert(benchmarks.end(), + additional_benchmarks.begin(), + additional_benchmarks.end()); } // Use manual timing - for (auto& b : benchmarks) + for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations - if (trials > 0) + if(trials > 0) { - for (auto& b : benchmarks) + for(auto& b : benchmarks) { b->Iterations(trials); } diff --git 
a/benchmark/benchmark_warp_merge_sort.cpp b/benchmark/benchmark_warp_merge_sort.cpp index 9a0e4fd8..e31f68eb 100644 --- a/benchmark/benchmark_warp_merge_sort.cpp +++ b/benchmark/benchmark_warp_merge_sort.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -53,7 +53,7 @@ __device__ auto sort_keys_benchmark(const T* input, T* output, Compare compare_o const unsigned int flat_tid = threadIdx.x; const unsigned int block_offset = blockIdx.x * items_per_block; - T keys[ItemsPerThread]; + T keys[ItemsPerThread]; hipcub::LoadDirectBlocked(flat_tid, input + block_offset, keys); constexpr unsigned int warps_per_block = BlockSize / LogicalWarpSize; @@ -100,8 +100,8 @@ __device__ auto sort_pairs_benchmark(const T* input, T* output, Compare compare_ const unsigned int flat_tid = threadIdx.x; const unsigned int block_offset = blockIdx.x * items_per_block; - T keys[ItemsPerThread]; - T values[ItemsPerThread]; + T keys[ItemsPerThread]; + T values[ItemsPerThread]; hipcub::LoadDirectBlocked(flat_tid, input + block_offset, keys); for(unsigned int i = 0; i < ItemsPerThread; ++i) @@ -146,8 +146,9 @@ __global__ sort_pairs_benchmark(input, output, compare_op); } -template -struct max_value { +template +struct max_value +{ static constexpr T value = std::numeric_limits::max(); }; @@ -162,20 +163,20 @@ __device__ auto sort_keys_segmented_benchmark(const T* input, Compare compare) -> std::enable_if_t> { - constexpr unsigned int max_segment_size = LogicalWarpSize * 
ItemsPerThread; + constexpr unsigned int max_segment_size = LogicalWarpSize * ItemsPerThread; constexpr unsigned int segments_per_block = BlockSize / LogicalWarpSize; using warp_merge_sort = hipcub::WarpMergeSort; __shared__ typename warp_merge_sort::TempStorage storage[segments_per_block]; const unsigned int warp_id = threadIdx.x / LogicalWarpSize; - warp_merge_sort wsort{storage[warp_id]}; + warp_merge_sort wsort{storage[warp_id]}; const unsigned int segment_id = blockIdx.x * segments_per_block + warp_id; const unsigned int segment_size = segment_sizes[segment_id]; - const unsigned int warp_offset = segment_id * max_segment_size; - T keys[ItemsPerThread]; + const unsigned int warp_offset = segment_id * max_segment_size; + T keys[ItemsPerThread]; const unsigned int flat_tid = wsort.get_linear_tid(); hipcub::LoadDirectBlocked(flat_tid, input + warp_offset, keys, segment_size); @@ -225,27 +226,29 @@ __device__ auto sort_pairs_segmented_benchmark(const T* input, Compare compare) -> std::enable_if_t> { - constexpr unsigned int max_segment_size = LogicalWarpSize * ItemsPerThread; + constexpr unsigned int max_segment_size = LogicalWarpSize * ItemsPerThread; constexpr unsigned int segments_per_block = BlockSize / LogicalWarpSize; using warp_merge_sort = hipcub::WarpMergeSort; __shared__ typename warp_merge_sort::TempStorage storage[segments_per_block]; const unsigned int warp_id = threadIdx.x / LogicalWarpSize; - warp_merge_sort wsort{storage[warp_id]}; + warp_merge_sort wsort{storage[warp_id]}; const unsigned int segment_id = blockIdx.x * segments_per_block + warp_id; const unsigned int segment_size = segment_sizes[segment_id]; - const unsigned int warp_offset = segment_id * max_segment_size; - T keys[ItemsPerThread]; - T values[ItemsPerThread]; + const unsigned int warp_offset = segment_id * max_segment_size; + T keys[ItemsPerThread]; + T values[ItemsPerThread]; const unsigned int flat_tid = wsort.get_linear_tid(); hipcub::LoadDirectBlocked(flat_tid, input + 
warp_offset, keys, segment_size); - for(unsigned int i = 0; i < ItemsPerThread; ++i) { - if(flat_tid * ItemsPerThread + i < segment_size) { + for(unsigned int i = 0; i < ItemsPerThread; ++i) + { + if(flat_tid * ItemsPerThread + i < segment_size) + { values[i] = keys[i] + T(1); } } @@ -253,8 +256,10 @@ __device__ auto sort_pairs_segmented_benchmark(const T* input, const T oob_default = max_value::value; wsort.Sort(keys, values, compare, segment_size, oob_default); - for(unsigned int i = 0; i < ItemsPerThread; ++i) { - if(flat_tid * ItemsPerThread + i < segment_size) { + for(unsigned int i = 0; i < ItemsPerThread; ++i) + { + if(flat_tid * ItemsPerThread + i < segment_size) + { keys[i] += values[i]; } } @@ -290,38 +295,33 @@ __global__ __launch_bounds__(BlockSize) void sort_pairs_segmented(const T* compare); } -template< - class T, - unsigned int BlockSize, - unsigned int LogicalWarpSize, - unsigned int ItemsPerThread, - class CompareOp = test_utils::less, - unsigned int Trials = 10 -> -void run_benchmark(benchmark::State& state, const benchmark_kinds benchmark_kind, const hipStream_t stream, const size_t N) +template +void run_benchmark(benchmark::State& state, + const benchmark_kinds benchmark_kind, + const hipStream_t stream, + const size_t N) { constexpr auto items_per_block = BlockSize * ItemsPerThread; - const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); + const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); - const auto input = std::is_floating_point::value ? - benchmark_utils::get_random_data(size, static_cast(-1000), static_cast(1000)) : - benchmark_utils::get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max() - ); + const auto input = std::is_floating_point::value + ? 
benchmark_utils::get_random_data(size, + static_cast(-1000), + static_cast(1000)) + : benchmark_utils::get_random_data(size, + std::numeric_limits::min(), + std::numeric_limits::max()); T* d_input = nullptr; T* d_output = nullptr; HIP_CHECK(hipMalloc(&d_input, size * sizeof(input[0]))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(input[0]))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); for(auto _ : state) { @@ -329,16 +329,17 @@ void run_benchmark(benchmark::State& state, const benchmark_kinds benchmark_kind if(benchmark_kind == benchmark_kinds::sort_keys) { - for(unsigned int i = 0; i < Trials; ++i) { + for(unsigned int i = 0; i < Trials; ++i) + { sort_keys <<>>(d_input, d_output, CompareOp{}); } - } - else if(benchmark_kind == benchmark_kinds::sort_pairs) + } else if(benchmark_kind == benchmark_kinds::sort_pairs) { - for(unsigned int i = 0; i < Trials; ++i) { + for(unsigned int i = 0; i < Trials; ++i) + { sort_pairs <<>>(d_input, d_output, @@ -349,8 +350,8 @@ void run_benchmark(benchmark::State& state, const benchmark_kinds benchmark_kind HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); @@ -360,49 +361,45 @@ void run_benchmark(benchmark::State& state, const benchmark_kinds benchmark_kind HIP_CHECK(hipFree(d_output)); } -template< - class T, - unsigned int BlockSize, - unsigned int LogicalWarpSize, - unsigned int ItemsPerThread, - class CompareOp = test_utils::less, - unsigned int Trials = 10 -> -void run_segmented_benchmark(benchmark::State& state, const benchmark_kinds benchmark_kind, const hipStream_t stream, const 
size_t N) +template +void run_segmented_benchmark(benchmark::State& state, + const benchmark_kinds benchmark_kind, + const hipStream_t stream, + const size_t N) { - constexpr auto max_segment_size = LogicalWarpSize * ItemsPerThread; + constexpr auto max_segment_size = LogicalWarpSize * ItemsPerThread; constexpr auto segments_per_block = BlockSize / LogicalWarpSize; - constexpr auto items_per_block = BlockSize * ItemsPerThread; + constexpr auto items_per_block = BlockSize * ItemsPerThread; - const auto num_blocks = (N + items_per_block - 1) / items_per_block; + const auto num_blocks = (N + items_per_block - 1) / items_per_block; const auto num_segments = num_blocks * segments_per_block; - const auto size = num_blocks * items_per_block; + const auto size = num_blocks * items_per_block; - const auto input = std::is_floating_point::value ? - benchmark_utils::get_random_data(size, static_cast(-1000), static_cast(1000)) : - benchmark_utils::get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max() - ); + const auto input = std::is_floating_point::value + ? 
benchmark_utils::get_random_data(size, + static_cast(-1000), + static_cast(1000)) + : benchmark_utils::get_random_data(size, + std::numeric_limits::min(), + std::numeric_limits::max()); - const auto segment_sizes = benchmark_utils::get_random_data( - num_segments, 0, max_segment_size); + const auto segment_sizes + = benchmark_utils::get_random_data(num_segments, 0, max_segment_size); - T* d_input = nullptr; - T* d_output = nullptr; + T* d_input = nullptr; + T* d_output = nullptr; unsigned int* d_segment_sizes = nullptr; HIP_CHECK(hipMalloc(&d_input, size * sizeof(input[0]))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(input[0]))); HIP_CHECK(hipMalloc(&d_segment_sizes, num_segments * sizeof(segment_sizes[0]))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK(hipMemcpy(d_segment_sizes, segment_sizes.data(), + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_segment_sizes, + segment_sizes.data(), num_segments * sizeof(segment_sizes[0]), hipMemcpyHostToDevice)); @@ -420,8 +417,7 @@ void run_segmented_benchmark(benchmark::State& state, const benchmark_kinds benc d_segment_sizes, CompareOp{}); } - } - else if(benchmark_kind == benchmark_kinds::sort_pairs) + } else if(benchmark_kind == benchmark_kinds::sort_pairs) { for(unsigned int i = 0; i < Trials; ++i) { @@ -436,8 +432,8 @@ void run_segmented_benchmark(benchmark::State& state, const benchmark_kinds benc HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); @@ -449,15 +445,18 @@ void run_segmented_benchmark(benchmark::State& state, const benchmark_kinds benc } #define CREATE_BENCHMARK(T, BS, WS, 
IPT) \ -do { \ - const auto benchmark_name = \ - std::string{"warp_merge_sort.SubAlgorithm Name:"} + name; \ - if(WS <= device_warp_size) { \ - benchmarks.push_back(benchmark::RegisterBenchmark(benchmark_name.c_str(), \ + if(WS <= device_warp_size) \ + { \ + benchmarks.push_back(benchmark::RegisterBenchmark( \ + std::string("warp_merge_sort.sub_algorithm_name:" \ + + name) \ + .c_str(), \ segmented ? &run_benchmark : &run_segmented_benchmark, \ - benchmark_kind, stream, size)); \ - } \ -} while(false) + benchmark_kind, \ + stream, \ + size)); \ + } #define BENCHMARK_TYPE_WS(type, block, warp) \ CREATE_BENCHMARK(type, block, warp, 1); \ @@ -470,13 +469,13 @@ do { BENCHMARK_TYPE_WS(type, block, 32); \ BENCHMARK_TYPE_WS(type, block, 64) -void add_benchmarks(const benchmark_kinds benchmark_kind, - const std::string& name, +void add_benchmarks(const benchmark_kinds benchmark_kind, + const std::string& name, std::vector& benchmarks, - const hipStream_t stream, - const size_t size, - const bool segmented, - const unsigned int device_warp_size) + const hipStream_t stream, + const size_t size, + const bool segmented, + const unsigned int device_warp_size) { BENCHMARK_TYPE(int, 256); BENCHMARK_TYPE(int8_t, 256); @@ -484,7 +483,7 @@ void add_benchmarks(const benchmark_kinds benchmark_kind, BENCHMARK_TYPE(long long, 256); } -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -493,24 +492,27 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); std::cout << "benchmark_warp_merge_sort" << std::endl; // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; 
HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; - const auto device_warp_size = [] { + const auto device_warp_size = [] + { const int result = HIPCUB_HOST_WARP_THREADS; - if(result > 0) { + if(result > 0) + { std::cout << "[HIP] Device warp size: " << result << std::endl; - } else { + } else + { std::cerr << "Failed to get device warp size! Aborting.\n"; std::exit(1); } @@ -519,14 +521,34 @@ int main(int argc, char *argv[]) // Add benchmarks std::vector benchmarks; - add_benchmarks(benchmark_kinds::sort_keys, "sort(keys)", benchmarks, stream, - size, false, device_warp_size); - add_benchmarks(benchmark_kinds::sort_pairs, "sort(keys, values)", - benchmarks, stream, size, false, device_warp_size); - add_benchmarks(benchmark_kinds::sort_keys, "segmented_sort(keys)", - benchmarks, stream, size, true, device_warp_size); - add_benchmarks(benchmark_kinds::sort_pairs, "segmented_sort(keys, values)", - benchmarks, stream, size, true, device_warp_size); + add_benchmarks(benchmark_kinds::sort_keys, + "sort(keys)", + benchmarks, + stream, + size, + false, + device_warp_size); + add_benchmarks(benchmark_kinds::sort_pairs, + "sort(keys, values)", + benchmarks, + stream, + size, + false, + device_warp_size); + add_benchmarks(benchmark_kinds::sort_keys, + "segmented_sort(keys)", + benchmarks, + stream, + size, + true, + device_warp_size); + add_benchmarks(benchmark_kinds::sort_pairs, + "segmented_sort(keys, values)", + benchmarks, + stream, + size, + true, + device_warp_size); // Use manual timing for(auto& b : benchmarks) diff --git a/benchmark/benchmark_warp_reduce.cpp b/benchmark/benchmark_warp_reduce.cpp index 39716261..f72c268d 100644 --- a/benchmark/benchmark_warp_reduce.cpp +++ b/benchmark/benchmark_warp_reduce.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // 
-// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -25,19 +25,12 @@ // HIP API #include "hipcub/warp/warp_reduce.hpp" - #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -template< - class T, - unsigned int WarpSize, - unsigned int Trials -> -__global__ -__launch_bounds__(64) -void warp_reduce_kernel(const T * d_input, T * d_output) +template +__global__ __launch_bounds__(64) void warp_reduce_kernel(const T* d_input, T* d_output) { const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; @@ -45,8 +38,8 @@ void warp_reduce_kernel(const T * d_input, T * d_output) using wreduce_t = hipcub::WarpReduce; __shared__ typename wreduce_t::TempStorage storage; - auto reduce_op = hipcub::Sum(); - #pragma nounroll + auto reduce_op = hipcub::Sum(); +#pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { value = wreduce_t(storage).Reduce(value, reduce_op); @@ -55,24 +48,19 @@ void warp_reduce_kernel(const T * d_input, T * d_output) d_output[i] = value; } -template< - class T, - class Flag, - unsigned int WarpSize, - unsigned int Trials -> -__global__ -__launch_bounds__(64) -void segmented_warp_reduce_kernel(const T* d_input, Flag* d_flags, T* d_output) +template +__global__ __launch_bounds__(64) void segmented_warp_reduce_kernel(const T* d_input, + Flag* d_flags, + T* d_output) { const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; auto value = d_input[i]; - auto flag = d_flags[i]; + auto flag = d_flags[i]; using wreduce_t = hipcub::WarpReduce; __shared__ typename wreduce_t::TempStorage storage; - #pragma nounroll +#pragma nounroll 
for(unsigned int trial = 0; trial < Trials; trial++) { value = wreduce_t(storage).HeadSegmentedSum(value, flag); @@ -81,96 +69,83 @@ void segmented_warp_reduce_kernel(const T* d_input, Flag* d_flags, T* d_output) d_output[i] = value; } -template< - bool Segmented, - unsigned int WarpSize, - unsigned int BlockSize, - unsigned int Trials, - class T, - class Flag -> -inline -auto execute_warp_reduce_kernel(T* input, T* output, Flag* /* flags */, - size_t size, hipStream_t stream) - -> typename std::enable_if::type +template +inline auto execute_warp_reduce_kernel( + T* input, T* output, Flag* /* flags */, size_t size, hipStream_t stream) -> + typename std::enable_if::type { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(warp_reduce_kernel), - dim3(size/BlockSize), dim3(BlockSize), 0, stream, - input, output - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(warp_reduce_kernel), + dim3(size / BlockSize), + dim3(BlockSize), + 0, + stream, + input, + output); HIP_CHECK(hipPeekAtLastError()); } -template< - bool Segmented, - unsigned int WarpSize, - unsigned int BlockSize, - unsigned int Trials, - class T, - class Flag -> -inline -auto execute_warp_reduce_kernel(T* input, T* output, Flag* flags, - size_t size, hipStream_t stream) - -> typename std::enable_if::type +template +inline auto + execute_warp_reduce_kernel(T* input, T* output, Flag* flags, size_t size, hipStream_t stream) -> + typename std::enable_if::type { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(segmented_warp_reduce_kernel), - dim3(size/BlockSize), dim3(BlockSize), 0, stream, - input, flags, output - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(segmented_warp_reduce_kernel), + dim3(size / BlockSize), + dim3(BlockSize), + 0, + stream, + input, + flags, + output); HIP_CHECK(hipPeekAtLastError()); } -template< - bool Segmented, - class T, - unsigned int WarpSize, - unsigned int BlockSize, - unsigned int Trials = 100 -> +template void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { using flag_type = unsigned 
char; - const auto size = BlockSize * ((N + BlockSize - 1)/BlockSize); + const auto size = BlockSize * ((N + BlockSize - 1) / BlockSize); - std::vector input = benchmark_utils::get_random_data(size, T(0), T(10)); + std::vector input = benchmark_utils::get_random_data(size, T(0), T(10)); std::vector flags = benchmark_utils::get_random_data(size, 0, 1); - T * d_input; - flag_type * d_flags; - T * d_output; + T* d_input; + flag_type* d_flags; + T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_flags, size * sizeof(flag_type))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK( - hipMemcpy( - d_flags, flags.data(), - size * sizeof(flag_type), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_flags, flags.data(), size * sizeof(flag_type), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); - execute_warp_reduce_kernel( - d_input, d_output, d_flags, size, stream - ); + execute_warp_reduce_kernel(d_input, + d_output, + d_flags, + size, + stream); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); @@ -181,44 +156,35 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) HIP_CHECK(hipFree(d_flags)); } -#define CREATE_BENCHMARK(T, WS, BS) \ -benchmark::RegisterBenchmark( \ - (std::string("warp_reduce.SubAlgorithm Name:") + name).c_str(), \ - &run_benchmark, \ - stream, size \ -) - +#define CREATE_BENCHMARK(T, WS, BS) \ + 
benchmark::RegisterBenchmark(std::string("warp_reduce.sub_algorithm_name:" \ + + name) \ + .c_str(), \ + &run_benchmark, \ + stream, \ + size) // If warp size limit is 16 -#define BENCHMARK_TYPE_WS16(type) \ - CREATE_BENCHMARK(type, 15, 32), \ - CREATE_BENCHMARK(type, 16, 32) - +#define BENCHMARK_TYPE_WS16(type) CREATE_BENCHMARK(type, 15, 32), CREATE_BENCHMARK(type, 16, 32) // If warp size limit is 32 -#define BENCHMARK_TYPE_WS32(type) \ - BENCHMARK_TYPE_WS16(type), \ - CREATE_BENCHMARK(type, 31, 32), \ - CREATE_BENCHMARK(type, 32, 32), \ - CREATE_BENCHMARK(type, 32, 64) - +#define BENCHMARK_TYPE_WS32(type) \ + BENCHMARK_TYPE_WS16(type), CREATE_BENCHMARK(type, 31, 32), CREATE_BENCHMARK(type, 32, 32), \ + CREATE_BENCHMARK(type, 32, 64) // If warp size limit is 64 -#define BENCHMARK_TYPE_WS64(type) \ - BENCHMARK_TYPE_WS32(type), \ - CREATE_BENCHMARK(type, 37, 64), \ - CREATE_BENCHMARK(type, 61, 64), \ - CREATE_BENCHMARK(type, 64, 64) - +#define BENCHMARK_TYPE_WS64(type) \ + BENCHMARK_TYPE_WS32(type), CREATE_BENCHMARK(type, 37, 64), CREATE_BENCHMARK(type, 61, 64), \ + CREATE_BENCHMARK(type, 64, 64) template -void add_benchmarks(const std::string& name, +void add_benchmarks(const std::string& name, std::vector& benchmarks, - hipStream_t stream, - size_t size) + hipStream_t stream, + size_t size) { - std::vector bs = - { + std::vector bs = { #if HIPCUB_WARP_THREADS_MACRO == 16 BENCHMARK_TYPE_WS16(int), BENCHMARK_TYPE_WS16(float), @@ -242,7 +208,7 @@ void add_benchmarks(const std::string& name, benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -251,15 +217,15 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = 
parser.get("trials"); std::cout << "benchmark_warp_reduce" << std::endl; // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; diff --git a/benchmark/benchmark_warp_scan.cpp b/benchmark/benchmark_warp_scan.cpp index c66b31aa..c38defdf 100644 --- a/benchmark/benchmark_warp_scan.cpp +++ b/benchmark/benchmark_warp_scan.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -116,23 +116,17 @@ template input(size, 1.0f); - T * d_input; - T * d_output; + T* d_input; + T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), @@ -147,8 +141,8 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t size) HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto 
elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } @@ -160,9 +154,9 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t size) } #define CREATE_BENCHMARK_IMPL(T, BS, WS, OP) \ - benchmark::RegisterBenchmark((std::string("warp_scan.Method Name:") \ - + method_name) \ + benchmark::RegisterBenchmark(std::string("warp_scan.sub_algorithm_name:" \ + + method_name) \ .c_str(), \ &run_benchmark, \ stream, \ @@ -199,7 +193,7 @@ void add_benchmarks(std::vector& benchmarks, hipStream_t stream, size_t size) { - using custom_double2 = benchmark_utils::custom_type; + using custom_double2 = benchmark_utils::custom_type; using custom_int_double = benchmark_utils::custom_type; std::vector new_benchmarks = { @@ -229,7 +223,7 @@ void add_benchmarks(std::vector& benchmarks, benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end()); } -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -238,15 +232,15 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); std::cout << "benchmark_warp_scan" << std::endl; // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; diff --git a/benchmark/benchmark_warp_store.cpp b/benchmark/benchmark_warp_store.cpp index a331f16b..8e88661c 100644 --- a/benchmark/benchmark_warp_store.cpp +++ b/benchmark/benchmark_warp_store.cpp @@ -9,8 +9,8 @@ // copies of the Software, and to permit persons to 
whom the Software is // furnished to do so, subject to the following conditions: // -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -40,15 +40,15 @@ __device__ auto warp_store_benchmark(T* d_output) -> std::enable_if_t> { T thread_data[ItemsPerThread]; - #pragma unroll - for (unsigned i = 0; i < ItemsPerThread; ++i) +#pragma unroll + for(unsigned i = 0; i < ItemsPerThread; ++i) { thread_data[i] = static_cast(i); } using WarpStoreT = ::hipcub::WarpStore; - constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize; - constexpr int tile_size = ItemsPerThread * LogicalWarpSize; + constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize; + constexpr int tile_size = ItemsPerThread * LogicalWarpSize; __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block]; const unsigned warp_id = threadIdx.x / LogicalWarpSize; const unsigned global_warp_id = blockIdx.x * warps_in_block + warp_id; @@ -75,27 +75,25 @@ __global__ __launch_bounds__(BlockSize) void warp_store_kernel(T* d_output) warp_store_benchmark(d_output); } -template< - class T, - unsigned BlockSize, - unsigned ItemsPerThread, - unsigned LogicalWarpSize, - ::hipcub::WarpStoreAlgorithm Algorithm, - unsigned Trials = 100 -> +template void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { constexpr unsigned items_per_block = BlockSize * ItemsPerThread; - const unsigned size = items_per_block * ((N + items_per_block - 1) / items_per_block); + const unsigned size = items_per_block * ((N + items_per_block - 1) / items_per_block); - T * d_output; + T* d_output; HIP_CHECK(hipMalloc(&d_output, size * 
sizeof(T))); - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); - for (size_t i = 0; i < Trials; ++i) + for(size_t i = 0; i < Trials; ++i) { warp_store_kernel <<>>(d_output); @@ -103,8 +101,8 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) HIP_CHECK(hipPeekAtLastError()) HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); @@ -113,14 +111,16 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(T, BS, IT, WS, ALG) \ -benchmark::RegisterBenchmark( \ - "warp_store.", \ - &run_benchmark, \ - stream, size \ -) +#define CREATE_BENCHMARK(T, BS, IT, WS, ALG) \ + benchmark::RegisterBenchmark(std::string("warp_store.") \ + .c_str(), \ + &run_benchmark, \ + stream, \ + size) -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -129,15 +129,15 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); std::cout << "benchmark_warp_store" << std::endl; // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; @@ -187,7 +187,7 @@ int main(int argc, char *argv[]) // 
CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_STORE_TRANSPOSE) }; - if (::benchmark_utils::is_warp_size_supported(64)) + if(::benchmark_utils::is_warp_size_supported(64)) { std::vector additional_benchmarks{ CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_STORE_DIRECT), @@ -221,36 +221,36 @@ int main(int argc, char *argv[]) CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_STORE_VECTORIZE), // WARP_STORE_TRANSPOSE removed because of shared memory limit - // CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_STORE_TRANSPOSE), + // CREATE_BENCHMARK(double, 256, 16, 64, + // ::hipcub::WARP_STORE_TRANSPOSE), CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_STORE_VECTORIZE), // WARP_STORE_TRANSPOSE removed because of shared memory limit - // CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_STORE_TRANSPOSE), + // CREATE_BENCHMARK(double, 256, 32, 64, + // ::hipcub::WARP_STORE_TRANSPOSE), CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_STORE_VECTORIZE) // WARP_STORE_TRANSPOSE removed because of shared memory limit // CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_STORE_TRANSPOSE) }; - benchmarks.insert( - benchmarks.end(), - additional_benchmarks.begin(), - additional_benchmarks.end() - ); + benchmarks.insert(benchmarks.end(), + additional_benchmarks.begin(), + additional_benchmarks.end()); } // Use manual timing - for (auto& b : benchmarks) + for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations - if (trials > 0) + if(trials > 0) { - for (auto& b : benchmarks) + for(auto& b : benchmarks) { b->Iterations(trials); }