diff --git a/CMakeLists.txt b/CMakeLists.txt
index 27e16114..f5506c21 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -36,6 +36,7 @@ else()
   set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation")
 endif()
 
+
 # Build options
 option(BUILD_TEST "Build tests (requires googletest)" OFF)
 option(DEPENDENCIES_FORCE_DOWNLOAD "Download dependencies and do not search for packages" OFF)
diff --git a/benchmark/benchmark_block_adjacent_difference.cpp b/benchmark/benchmark_block_adjacent_difference.cpp
index 63af7c7d..1cc70798 100644
--- a/benchmark/benchmark_block_adjacent_difference.cpp
+++ b/benchmark/benchmark_block_adjacent_difference.cpp
@@ -9,8 +9,8 @@
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -32,16 +32,12 @@
 const size_t DEFAULT_N = 1024 * 1024 * 128;
 #endif
 
-template <
-    class Benchmark,
-    unsigned int BlockSize,
-    unsigned int ItemsPerThread,
-    bool         WithTile,
-    typename... Args
->
-__global__ 
-__launch_bounds__(BlockSize) 
-void kernel(Args ...args)
+template<class Benchmark,
+         unsigned int BlockSize,
+         unsigned int ItemsPerThread,
+         bool         WithTile,
+         typename... Args>
+__global__ __launch_bounds__(BlockSize) void kernel(Args... args)
 {
     Benchmark::template run<BlockSize, ItemsPerThread, WithTile>(args...);
 }
@@ -49,8 +45,7 @@ void kernel(Args ...args)
 template<class T>
 struct minus
 {
-    HIPCUB_HOST_DEVICE inline
-    constexpr T operator()(const T& a, const T& b) const
+    HIPCUB_HOST_DEVICE inline constexpr T operator()(const T& a, const T& b) const
     {
         return a - b;
     }
@@ -58,10 +53,10 @@ struct minus
 
 struct subtract_left
 {
-    template <unsigned int BlockSize, unsigned int ItemsPerThread, bool WithTile, typename T>
+    template<unsigned int BlockSize, unsigned int ItemsPerThread, bool WithTile, typename T>
     __device__ static void run(const T* d_input, T* d_output, unsigned int trials)
     {
-        const unsigned int lid = threadIdx.x;
+        const unsigned int lid          = threadIdx.x;
         const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize;
 
         T input[ItemsPerThread];
@@ -69,15 +64,14 @@ struct subtract_left
 
         hipcub::BlockAdjacentDifference<T, BlockSize> adjacent_difference;
 
-        #pragma nounroll
+#pragma nounroll
         for(unsigned int trial = 0; trial < trials; trial++)
         {
             T output[ItemsPerThread];
             if(WithTile)
             {
                 adjacent_difference.SubtractLeft(input, output, minus<T>{}, T(123));
-            }
-            else
+            } else
             {
                 adjacent_difference.SubtractLeft(input, output, minus<T>{});
             }
@@ -86,7 +80,7 @@ struct subtract_left
             {
                 input[i] += output[i];
             }
-            
+
             __syncthreads();
         }
 
@@ -96,10 +90,11 @@ struct subtract_left
 
 struct subtract_left_partial_tile
 {
-    template <unsigned int BlockSize, unsigned int ItemsPerThread, bool WithTile, typename T>
-    __device__ static void run(const T* d_input, const int* tile_sizes, T* d_output, unsigned int trials)
+    template<unsigned int BlockSize, unsigned int ItemsPerThread, bool WithTile, typename T>
+    __device__ static void
+        run(const T* d_input, const int* tile_sizes, T* d_output, unsigned int trials)
     {
-        const unsigned int lid = threadIdx.x;
+        const unsigned int lid          = threadIdx.x;
         const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize;
 
         T input[ItemsPerThread];
@@ -112,7 +107,7 @@ struct subtract_left_partial_tile
         // Try to evenly distribute the length of tile_sizes between all the trials
         const auto tile_size_diff = (BlockSize * ItemsPerThread) / trials + 1;
 
-        #pragma nounroll
+#pragma nounroll
         for(unsigned int trial = 0; trial < trials; trial++)
         {
             T output[ItemsPerThread];
@@ -124,8 +119,7 @@ struct subtract_left_partial_tile
                                                             minus<T>{},
                                                             tile_size,
                                                             T(123));
-            }
-            else
+            } else
             {
                 adjacent_difference.SubtractLeftPartialTile(input, output, minus<T>{}, tile_size);
             }
@@ -134,7 +128,7 @@ struct subtract_left_partial_tile
             {
                 input[i] += output[i];
             }
-            
+
             // Change the tile_size to even out the distribution
             tile_size = (tile_size + tile_size_diff) % (BlockSize * ItemsPerThread);
             __syncthreads();
@@ -146,10 +140,10 @@ struct subtract_left_partial_tile
 
 struct subtract_right
 {
-    template <unsigned int BlockSize, unsigned int ItemsPerThread, bool WithTile, typename T>
+    template<unsigned int BlockSize, unsigned int ItemsPerThread, bool WithTile, typename T>
     __device__ static void run(const T* d_input, T* d_output, unsigned int trials)
     {
-        const unsigned int lid = threadIdx.x;
+        const unsigned int lid          = threadIdx.x;
         const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize;
 
         T input[ItemsPerThread];
@@ -157,15 +151,14 @@ struct subtract_right
 
         hipcub::BlockAdjacentDifference<T, BlockSize> adjacent_difference;
 
-        #pragma nounroll
+#pragma nounroll
         for(unsigned int trial = 0; trial < trials; trial++)
         {
             T output[ItemsPerThread];
             if(WithTile)
             {
                 adjacent_difference.SubtractRight(input, output, minus<T>{}, T(123));
-            }
-            else
+            } else
             {
                 adjacent_difference.SubtractRight(input, output, minus<T>{});
             }
@@ -174,7 +167,7 @@ struct subtract_right
             {
                 input[i] += output[i];
             }
-            
+
             __syncthreads();
         }
 
@@ -184,10 +177,11 @@ struct subtract_right
 
 struct subtract_right_partial_tile
 {
-    template <unsigned int BlockSize, unsigned int ItemsPerThread, bool WithTile, typename T>
-    __device__ static void run(const T* d_input, const int* tile_sizes, T* d_output, unsigned int trials)
+    template<unsigned int BlockSize, unsigned int ItemsPerThread, bool WithTile, typename T>
+    __device__ static void
+        run(const T* d_input, const int* tile_sizes, T* d_output, unsigned int trials)
     {
-        const unsigned int lid = threadIdx.x;
+        const unsigned int lid          = threadIdx.x;
         const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize;
 
         T input[ItemsPerThread];
@@ -200,7 +194,7 @@ struct subtract_right_partial_tile
         // Try to evenly distribute the length of tile_sizes between all the trials
         const auto tile_size_diff = (BlockSize * ItemsPerThread) / trials + 1;
 
-        #pragma nounroll
+#pragma nounroll
         for(unsigned int trial = 0; trial < trials; trial++)
         {
             T output[ItemsPerThread];
@@ -211,7 +205,7 @@ struct subtract_right_partial_tile
             {
                 input[i] += output[i];
             }
-            
+
             // Change the tile_size to even out the distribution
             tile_size = (tile_size + tile_size_diff) % (BlockSize * ItemsPerThread);
             __syncthreads();
@@ -221,49 +215,47 @@ struct subtract_right_partial_tile
     }
 };
 
-template <class Benchmark,
-          class T,
-          unsigned int BlockSize,
-          unsigned int ItemsPerThread,
-          bool         WithTile,
-          unsigned int Trials = 100>
+template<class Benchmark,
+         class T,
+         unsigned int BlockSize,
+         unsigned int ItemsPerThread,
+         bool         WithTile,
+         unsigned int Trials = 100>
 auto run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
     -> std::enable_if_t<!std::is_same<Benchmark, subtract_left_partial_tile>::value
                         && !std::is_same<Benchmark, subtract_right_partial_tile>::value>
 {
     constexpr auto items_per_block = BlockSize * ItemsPerThread;
-    const auto num_blocks = (N + items_per_block - 1) / items_per_block;
+    const auto     num_blocks      = (N + items_per_block - 1) / items_per_block;
     // Round up size to the next multiple of items_per_block
     const auto size = num_blocks * items_per_block;
 
     const std::vector<T> input = benchmark_utils::get_random_data<T>(size, T(0), T(10));
-    T* d_input;
-    T* d_output;
+    T*                   d_input;
+    T*                   d_output;
     HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(input[0])));
     HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T)));
     HIP_CHECK(
-        hipMemcpy(
-            d_input, input.data(),
-            input.size() * sizeof(input[0]),
-            hipMemcpyHostToDevice
-        )
-    );
+        hipMemcpy(d_input, input.data(), input.size() * sizeof(input[0]), hipMemcpyHostToDevice));
 
     for(auto _ : state)
     {
         auto start = std::chrono::high_resolution_clock::now();
 
-        hipLaunchKernelGGL(
-            HIP_KERNEL_NAME(kernel<Benchmark, BlockSize, ItemsPerThread, WithTile>),
-            dim3(num_blocks), dim3(BlockSize), 0, stream,
-            d_input, d_output, Trials
-        );
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel<Benchmark, BlockSize, ItemsPerThread, WithTile>),
+                           dim3(num_blocks),
+                           dim3(BlockSize),
+                           0,
+                           stream,
+                           d_input,
+                           d_output,
+                           Trials);
         HIP_CHECK(hipGetLastError());
         HIP_CHECK(hipDeviceSynchronize());
 
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
         state.SetIterationTime(elapsed_seconds.count());
     }
     state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T));
@@ -273,61 +265,57 @@ auto run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
     HIP_CHECK(hipFree(d_output));
 }
 
-template <class Benchmark,
-          class T,
-          unsigned int BlockSize,
-          unsigned int ItemsPerThread,
-          bool         WithTile,
-          unsigned int Trials = 100>
+template<class Benchmark,
+         class T,
+         unsigned int BlockSize,
+         unsigned int ItemsPerThread,
+         bool         WithTile,
+         unsigned int Trials = 100>
 auto run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
     -> std::enable_if_t<std::is_same<Benchmark, subtract_left_partial_tile>::value
                         || std::is_same<Benchmark, subtract_right_partial_tile>::value>
 {
     constexpr auto items_per_block = BlockSize * ItemsPerThread;
-    const auto num_blocks = (N + items_per_block - 1) / items_per_block;
+    const auto     num_blocks      = (N + items_per_block - 1) / items_per_block;
     // Round up size to the next multiple of items_per_block
     const auto size = num_blocks * items_per_block;
 
-    const std::vector<T> input = benchmark_utils::get_random_data<T>(size, T(0), T(10));
+    const std::vector<T>   input = benchmark_utils::get_random_data<T>(size, T(0), T(10));
     const std::vector<int> tile_sizes
         = benchmark_utils::get_random_data<int>(num_blocks, 0, items_per_block);
-    
-    T* d_input;
+
+    T*   d_input;
     int* d_tile_sizes;
-    T* d_output;
+    T*   d_output;
     HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(input[0])));
     HIP_CHECK(hipMalloc(&d_tile_sizes, tile_sizes.size() * sizeof(tile_sizes[0])));
     HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T)));
     HIP_CHECK(
-        hipMemcpy(
-            d_input, input.data(),
-            input.size() * sizeof(input[0]),
-            hipMemcpyHostToDevice
-        )
-    );
-    HIP_CHECK(
-        hipMemcpy(
-            d_tile_sizes, tile_sizes.data(),
-            tile_sizes.size() * sizeof(tile_sizes[0]),
-            hipMemcpyHostToDevice
-        )
-    );
+        hipMemcpy(d_input, input.data(), input.size() * sizeof(input[0]), hipMemcpyHostToDevice));
+    HIP_CHECK(hipMemcpy(d_tile_sizes,
+                        tile_sizes.data(),
+                        tile_sizes.size() * sizeof(tile_sizes[0]),
+                        hipMemcpyHostToDevice));
 
     for(auto _ : state)
     {
         auto start = std::chrono::high_resolution_clock::now();
 
-        hipLaunchKernelGGL(
-            HIP_KERNEL_NAME(kernel<Benchmark, BlockSize, ItemsPerThread, WithTile>),
-            dim3(num_blocks), dim3(BlockSize), 0, stream,
-            d_input, d_tile_sizes, d_output, Trials
-        );
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel<Benchmark, BlockSize, ItemsPerThread, WithTile>),
+                           dim3(num_blocks),
+                           dim3(BlockSize),
+                           0,
+                           stream,
+                           d_input,
+                           d_tile_sizes,
+                           d_output,
+                           Trials);
         HIP_CHECK(hipGetLastError());
         HIP_CHECK(hipDeviceSynchronize());
 
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
         state.SetIterationTime(elapsed_seconds.count());
     }
     state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T));
@@ -338,51 +326,47 @@ auto run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
     HIP_CHECK(hipFree(d_output));
 }
 
-#define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \
-benchmark::RegisterBenchmark( \
-    (std::string("block_adjacent_difference<" #T ", " #BS ">.") + name + ("<" #IPT ", " #WITH_TILE ">")).c_str(), \
-    &run_benchmark<Benchmark, T, BS, IPT, WITH_TILE>, \
-    stream, size \
-)
-
-#define BENCHMARK_TYPE(type, block, with_tile)    \
-    CREATE_BENCHMARK(type, block, 1,  with_tile), \
-    CREATE_BENCHMARK(type, block, 3,  with_tile), \
-    CREATE_BENCHMARK(type, block, 4,  with_tile), \
-    CREATE_BENCHMARK(type, block, 8,  with_tile), \
-    CREATE_BENCHMARK(type, block, 16, with_tile), \
-    CREATE_BENCHMARK(type, block, 32, with_tile)
+#define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE)                                      \
+    benchmark::RegisterBenchmark(                                                    \
+        std::string("block_adjacent_difference<data_type:" #T ",block_size:" #BS     \
+                    ">.sub_algorithm_name:"                                          \
+                    + name + "<items_per_thread:" #IPT ",with_tile:" #WITH_TILE ">") \
+            .c_str(),                                                                \
+        &run_benchmark<Benchmark, T, BS, IPT, WITH_TILE>,                            \
+        stream,                                                                      \
+        size)
+
+#define BENCHMARK_TYPE(type, block, with_tile)                                                    \
+    CREATE_BENCHMARK(type, block, 1, with_tile), CREATE_BENCHMARK(type, block, 3, with_tile),     \
+        CREATE_BENCHMARK(type, block, 4, with_tile), CREATE_BENCHMARK(type, block, 8, with_tile), \
+        CREATE_BENCHMARK(type, block, 16, with_tile), CREATE_BENCHMARK(type, block, 32, with_tile)
 
 template<class Benchmark>
-void add_benchmarks(const std::string& name,
+void add_benchmarks(const std::string&                            name,
                     std::vector<benchmark::internal::Benchmark*>& benchmarks,
-                    hipStream_t stream,
-                    size_t size)
+                    hipStream_t                                   stream,
+                    size_t                                        size)
 {
-    std::vector<benchmark::internal::Benchmark*> bs =
-    {
-        BENCHMARK_TYPE(int, 256, false),
-        BENCHMARK_TYPE(float, 256, false),
-        BENCHMARK_TYPE(int8_t, 256, false),
-        BENCHMARK_TYPE(long long, 256, false),
-        BENCHMARK_TYPE(double, 256, false)
-    };
+    std::vector<benchmark::internal::Benchmark*> bs = {BENCHMARK_TYPE(int, 256, false),
+                                                       BENCHMARK_TYPE(float, 256, false),
+                                                       BENCHMARK_TYPE(int8_t, 256, false),
+                                                       BENCHMARK_TYPE(long long, 256, false),
+                                                       BENCHMARK_TYPE(double, 256, false)};
 
     if(!std::is_same<Benchmark, subtract_right_partial_tile>::value)
     {
-        bs.insert(bs.end(), {
-            BENCHMARK_TYPE(int, 256, true),
-            BENCHMARK_TYPE(float, 256, true),
-            BENCHMARK_TYPE(int8_t, 256, true),
-            BENCHMARK_TYPE(long long, 256, true),
-            BENCHMARK_TYPE(double, 256, true)
-        });
+        bs.insert(bs.end(),
+                  {BENCHMARK_TYPE(int, 256, true),
+                   BENCHMARK_TYPE(float, 256, true),
+                   BENCHMARK_TYPE(int8_t, 256, true),
+                   BENCHMARK_TYPE(long long, 256, true),
+                   BENCHMARK_TYPE(double, 256, true)});
     }
 
     benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
 }
 
-int main(int argc, char *argv[])
+int main(int argc, char* argv[])
 {
     cli::Parser parser(argc, argv);
     parser.set_optional<size_t>("size", "size", DEFAULT_N, "number of values");
@@ -391,23 +375,28 @@ int main(int argc, char *argv[])
 
     // Parse argv
     benchmark::Initialize(&argc, argv);
-    const size_t size = parser.get<size_t>("size");
-    const int trials = parser.get<int>("trials");
+    const size_t size   = parser.get<size_t>("size");
+    const int    trials = parser.get<int>("trials");
 
     // HIP
-    hipStream_t stream = 0; // default
+    hipStream_t     stream = 0; // default
     hipDeviceProp_t devProp;
-    int device_id = 0;
+    int             device_id = 0;
     HIP_CHECK(hipGetDevice(&device_id));
     HIP_CHECK(hipGetDeviceProperties(&devProp, device_id));
+
+    std::cout << "benchmark_block_adjacent_difference" << std::endl;
     std::cout << "[HIP] Device name: " << devProp.name << std::endl;
 
     // Add benchmarks
     std::vector<benchmark::internal::Benchmark*> benchmarks;
-    add_benchmarks<subtract_left>("SubtractLeft", benchmarks, stream, size);
-    add_benchmarks<subtract_right>("SubtractRight", benchmarks, stream, size);
-    add_benchmarks<subtract_left_partial_tile>("SubtractLeftPartialTile", benchmarks, stream, size);
-    add_benchmarks<subtract_right_partial_tile>("SubtractRightPartialTile", benchmarks, stream, size);
+    add_benchmarks<subtract_left>("subtract_left", benchmarks, stream, size);
+    add_benchmarks<subtract_right>("subtract_right", benchmarks, stream, size);
+    add_benchmarks<subtract_left_partial_tile>("subtract_left_partial_tile", benchmarks, stream, size);
+    add_benchmarks<subtract_right_partial_tile>("subtract_right_partial_tile",
+                                                benchmarks,
+                                                stream,
+                                                size);
 
     // Use manual timing
     for(auto& b : benchmarks)
diff --git a/benchmark/benchmark_block_discontinuity.cpp b/benchmark/benchmark_block_discontinuity.cpp
index 72d925ec..24446c9a 100644
--- a/benchmark/benchmark_block_discontinuity.cpp
+++ b/benchmark/benchmark_block_discontinuity.cpp
@@ -9,8 +9,8 @@
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -25,10 +25,9 @@
 // HIP API
 #include "hipcub/block/block_discontinuity.hpp"
 
-#include "hipcub/thread/thread_operators.hpp" //to use hipcub::Equality
 #include "hipcub/block/block_load.hpp"
 #include "hipcub/block/block_store.hpp"
-
+#include "hipcub/thread/thread_operators.hpp" //to use hipcub::Equality
 
 #ifndef DEFAULT_N
 const size_t DEFAULT_N = 1024 * 1024 * 128;
@@ -44,49 +43,41 @@ struct custom_flag_op1
     }
 };
 
-template<
-    class Runner,
-    class T,
-    unsigned int BlockSize,
-    unsigned int ItemsPerThread,
-    bool WithTile,
-    unsigned int Trials
->
-__global__
-__launch_bounds__(BlockSize)
-void kernel(const T * d_input, T * d_output)
+template<class Runner,
+         class T,
+         unsigned int BlockSize,
+         unsigned int ItemsPerThread,
+         bool         WithTile,
+         unsigned int Trials>
+__global__ __launch_bounds__(BlockSize) void kernel(const T* d_input, T* d_output)
 {
     Runner::template run<T, BlockSize, ItemsPerThread, WithTile, Trials>(d_input, d_output);
 }
 
 struct flag_heads
 {
-    template<
-        class T,
-        unsigned int BlockSize,
-        unsigned int ItemsPerThread,
-        bool WithTile,
-        unsigned int Trials
-    >
-    __device__
-    static void run(const T * d_input, T * d_output)
+    template<class T,
+             unsigned int BlockSize,
+             unsigned int ItemsPerThread,
+             bool         WithTile,
+             unsigned int Trials>
+    __device__ static void run(const T* d_input, T* d_output)
     {
-        const unsigned int lid = hipThreadIdx_x;
+        const unsigned int lid          = hipThreadIdx_x;
         const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize;
 
         T input[ItemsPerThread];
         hipcub::LoadDirectStriped<BlockSize>(lid, d_input + block_offset, input);
 
-        #pragma nounroll
+#pragma nounroll
         for(unsigned int trial = 0; trial < Trials; trial++)
         {
             hipcub::BlockDiscontinuity<T, BlockSize> bdiscontinuity;
-            bool head_flags[ItemsPerThread];
+            bool                                     head_flags[ItemsPerThread];
             if(WithTile)
             {
                 bdiscontinuity.FlagHeads(head_flags, input, hipcub::Equality(), T(123));
-            }
-            else
+            } else
             {
                 bdiscontinuity.FlagHeads(head_flags, input, hipcub::Equality());
             }
@@ -103,32 +94,28 @@ struct flag_heads
 
 struct flag_tails
 {
-    template<
-        class T,
-        unsigned int BlockSize,
-        unsigned int ItemsPerThread,
-        bool WithTile,
-        unsigned int Trials
-    >
-    __device__
-    static void run(const T * d_input, T * d_output)
+    template<class T,
+             unsigned int BlockSize,
+             unsigned int ItemsPerThread,
+             bool         WithTile,
+             unsigned int Trials>
+    __device__ static void run(const T* d_input, T* d_output)
     {
-        const unsigned int lid = hipThreadIdx_x;
+        const unsigned int lid          = hipThreadIdx_x;
         const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize;
 
         T input[ItemsPerThread];
         hipcub::LoadDirectStriped<BlockSize>(lid, d_input + block_offset, input);
 
-        #pragma nounroll
+#pragma nounroll
         for(unsigned int trial = 0; trial < Trials; trial++)
         {
             hipcub::BlockDiscontinuity<T, BlockSize> bdiscontinuity;
-            bool tail_flags[ItemsPerThread];
+            bool                                     tail_flags[ItemsPerThread];
             if(WithTile)
             {
                 bdiscontinuity.FlagTails(tail_flags, input, hipcub::Equality(), T(123));
-            }
-            else
+            } else
             {
                 bdiscontinuity.FlagTails(tail_flags, input, hipcub::Equality());
             }
@@ -145,33 +132,34 @@ struct flag_tails
 
 struct flag_heads_and_tails
 {
-    template<
-        class T,
-        unsigned int BlockSize,
-        unsigned int ItemsPerThread,
-        bool WithTile,
-        unsigned int Trials
-    >
-    __device__
-    static void run(const T * d_input, T * d_output)
+    template<class T,
+             unsigned int BlockSize,
+             unsigned int ItemsPerThread,
+             bool         WithTile,
+             unsigned int Trials>
+    __device__ static void run(const T* d_input, T* d_output)
     {
-        const unsigned int lid = hipThreadIdx_x;
+        const unsigned int lid          = hipThreadIdx_x;
         const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize;
 
         T input[ItemsPerThread];
         hipcub::LoadDirectStriped<BlockSize>(lid, d_input + block_offset, input);
 
-        #pragma nounroll
+#pragma nounroll
         for(unsigned int trial = 0; trial < Trials; trial++)
         {
             hipcub::BlockDiscontinuity<T, BlockSize> bdiscontinuity;
-            bool head_flags[ItemsPerThread];
-            bool tail_flags[ItemsPerThread];
+            bool                                     head_flags[ItemsPerThread];
+            bool                                     tail_flags[ItemsPerThread];
             if(WithTile)
             {
-                bdiscontinuity.FlagHeadsAndTails(head_flags, T(123), tail_flags, T(234), input, hipcub::Equality());
-            }
-            else
+                bdiscontinuity.FlagHeadsAndTails(head_flags,
+                                                 T(123),
+                                                 tail_flags,
+                                                 T(234),
+                                                 input,
+                                                 hipcub::Equality());
+            } else
             {
                 bdiscontinuity.FlagHeadsAndTails(head_flags, tail_flags, input, hipcub::Equality());
             }
@@ -187,31 +175,23 @@ struct flag_heads_and_tails
     }
 };
 
-template<
-    class Benchmark,
-    class T,
-    unsigned int BlockSize,
-    unsigned int ItemsPerThread,
-    bool WithTile,
-    unsigned int Trials = 100
->
+template<class Benchmark,
+         class T,
+         unsigned int BlockSize,
+         unsigned int ItemsPerThread,
+         bool         WithTile,
+         unsigned int Trials = 100>
 void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
 {
     constexpr auto items_per_block = BlockSize * ItemsPerThread;
-    const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block);
+    const auto     size = items_per_block * ((N + items_per_block - 1) / items_per_block);
 
     std::vector<T> input = benchmark_utils::get_random_data<T>(size, T(0), T(10));
-    T * d_input;
-    T * d_output;
+    T*             d_input;
+    T*             d_output;
     HIP_CHECK(hipMalloc(&d_input, size * sizeof(T)));
     HIP_CHECK(hipMalloc(&d_output, size * sizeof(T)));
-    HIP_CHECK(
-        hipMemcpy(
-            d_input, input.data(),
-            size * sizeof(T),
-            hipMemcpyHostToDevice
-        )
-    );
+    HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice));
     HIP_CHECK(hipDeviceSynchronize());
 
     for(auto _ : state)
@@ -220,15 +200,18 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
 
         hipLaunchKernelGGL(
             HIP_KERNEL_NAME(kernel<Benchmark, T, BlockSize, ItemsPerThread, WithTile, Trials>),
-            dim3(size/items_per_block), dim3(BlockSize), 0, stream,
-            d_input, d_output
-        );
+            dim3(size / items_per_block),
+            dim3(BlockSize),
+            0,
+            stream,
+            d_input,
+            d_output);
         HIP_CHECK(hipPeekAtLastError());
         HIP_CHECK(hipDeviceSynchronize());
 
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
         state.SetIterationTime(elapsed_seconds.count());
     }
     state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T));
@@ -238,29 +221,27 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
     HIP_CHECK(hipFree(d_output));
 }
 
-#define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \
-benchmark::RegisterBenchmark( \
-    (std::string("block_discontinuity<Datatype:" #T ",Block Size:" #BS ">.SubAlgorithm Name:") + name + ("<Items Per Thread:" #IPT ",With Tile:" #WITH_TILE ">")).c_str(), \
-    &run_benchmark<Benchmark, T, BS, IPT, WITH_TILE>, \
-    stream, size \
-)
-
-#define BENCHMARK_TYPE(type, block, bool) \
-    CREATE_BENCHMARK(type, block, 1, bool), \
-    CREATE_BENCHMARK(type, block, 2, bool), \
-    CREATE_BENCHMARK(type, block, 3, bool), \
-    CREATE_BENCHMARK(type, block, 4, bool), \
-    CREATE_BENCHMARK(type, block, 8, bool)
+#define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE)                                                    \
+    benchmark::RegisterBenchmark(                                                                  \
+        std::string("block_discontinuity<data_type:" #T ",block_size:" #BS ">.sub_algorithm_name:" \
+                    + name + "<items_per_thread:" #IPT ",with_tile:" #WITH_TILE ">")               \
+            .c_str(),                                                                              \
+        &run_benchmark<Benchmark, T, BS, IPT, WITH_TILE>,                                          \
+        stream,                                                                                    \
+        size)
 
+#define BENCHMARK_TYPE(type, block, bool)                                               \
+    CREATE_BENCHMARK(type, block, 1, bool), CREATE_BENCHMARK(type, block, 2, bool),     \
+        CREATE_BENCHMARK(type, block, 3, bool), CREATE_BENCHMARK(type, block, 4, bool), \
+        CREATE_BENCHMARK(type, block, 8, bool)
 
 template<class Benchmark>
-void add_benchmarks(const std::string& name,
+void add_benchmarks(const std::string&                            name,
                     std::vector<benchmark::internal::Benchmark*>& benchmarks,
-                    hipStream_t stream,
-                    size_t size)
+                    hipStream_t                                   stream,
+                    size_t                                        size)
 {
-    std::vector<benchmark::internal::Benchmark*> bs =
-    {
+    std::vector<benchmark::internal::Benchmark*> bs = {
         BENCHMARK_TYPE(int, 256, false),
         BENCHMARK_TYPE(int, 256, true),
         BENCHMARK_TYPE(int8_t, 256, false),
@@ -274,7 +255,7 @@ void add_benchmarks(const std::string& name,
     benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
 }
 
-int main(int argc, char *argv[])
+int main(int argc, char* argv[])
 {
     cli::Parser parser(argc, argv);
     parser.set_optional<size_t>("size", "size", DEFAULT_N, "number of values");
@@ -283,15 +264,15 @@ int main(int argc, char *argv[])
 
     // Parse argv
     benchmark::Initialize(&argc, argv);
-    const size_t size = parser.get<size_t>("size");
-    const int trials = parser.get<int>("trials");
+    const size_t size   = parser.get<size_t>("size");
+    const int    trials = parser.get<int>("trials");
 
     std::cout << "benchmark_block_discontinuity" << std::endl;
 
     // HIP
-    hipStream_t stream = 0; // default
+    hipStream_t     stream = 0; // default
     hipDeviceProp_t devProp;
-    int device_id = 0;
+    int             device_id = 0;
     HIP_CHECK(hipGetDevice(&device_id));
     HIP_CHECK(hipGetDeviceProperties(&devProp, device_id));
     std::cout << "[HIP] Device name: " << devProp.name << std::endl;
diff --git a/benchmark/benchmark_block_exchange.cpp b/benchmark/benchmark_block_exchange.cpp
index 278a6190..a36d041a 100644
--- a/benchmark/benchmark_block_exchange.cpp
+++ b/benchmark/benchmark_block_exchange.cpp
@@ -9,8 +9,8 @@
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -27,49 +27,41 @@
 #include "hipcub/block/block_load.hpp"
 #include "hipcub/block/block_store.hpp"
 
-
 #ifndef DEFAULT_N
 const size_t DEFAULT_N = 1024 * 1024 * 32;
 #endif
 
-template<
-    class Runner,
-    class T,
-    unsigned int BlockSize,
-    unsigned int ItemsPerThread,
-    unsigned int Trials
->
-__global__
-__launch_bounds__(BlockSize)
-void kernel(const T * d_input, const unsigned int * d_ranks, T * d_output)
+template<class Runner,
+         class T,
+         unsigned int BlockSize,
+         unsigned int ItemsPerThread,
+         unsigned int Trials>
+__global__ __launch_bounds__(BlockSize) void kernel(const T*            d_input,
+                                                    const unsigned int* d_ranks,
+                                                    T*                  d_output)
 {
     Runner::template run<T, BlockSize, ItemsPerThread, Trials>(d_input, d_ranks, d_output);
 }
 
 struct blocked_to_striped
 {
-    template<
-        class T,
-        unsigned int BlockSize,
-        unsigned int ItemsPerThread,
-        unsigned int Trials
-    >
-    __device__
-    static void run(const T * d_input, const unsigned int *, T * d_output)
+    template<class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials>
+    __device__ static void run(const T* d_input, const unsigned int*, T* d_output)
     {
-        const unsigned int lid = hipThreadIdx_x;
+        const unsigned int lid          = hipThreadIdx_x;
         const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize;
 
         T input[ItemsPerThread];
         hipcub::LoadDirectBlocked(lid, d_input + block_offset, input);
 
-
-        #pragma nounroll
+#pragma nounroll
         for(unsigned int trial = 0; trial < Trials; trial++)
         {
             hipcub::BlockExchange<T, BlockSize, ItemsPerThread> exchange;
             exchange.BlockedToStriped(input, input);
-            __syncthreads(); // extra sync needed because of loop. In normal usage sync with be cared for by the load and store functions (outside the loop).
+            __syncthreads(); // extra sync needed because of loop. In normal usage
+                // sync will be taken care of by the load and store functions
+                // (outside the loop).
         }
         hipcub::StoreDirectStriped<BlockSize>(lid, d_output + block_offset, input);
     }
@@ -77,27 +69,23 @@ struct blocked_to_striped
 
 struct striped_to_blocked
 {
-    template<
-        class T,
-        unsigned int BlockSize,
-        unsigned int ItemsPerThread,
-        unsigned int Trials
-    >
-    __device__
-    static void run(const T * d_input, const unsigned int *, T * d_output)
+    template<class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials>
+    __device__ static void run(const T* d_input, const unsigned int*, T* d_output)
     {
-        const unsigned int lid = hipThreadIdx_x;
+        const unsigned int lid          = hipThreadIdx_x;
         const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize;
 
         T input[ItemsPerThread];
         hipcub::LoadDirectStriped<BlockSize>(lid, d_input + block_offset, input);
 
-        #pragma nounroll
+#pragma nounroll
         for(unsigned int trial = 0; trial < Trials; trial++)
         {
             hipcub::BlockExchange<T, BlockSize, ItemsPerThread> exchange;
             exchange.StripedToBlocked(input, input);
-            __syncthreads();// extra sync needed because of loop. In normal usage sync with be cared for by the load and store functions (outside the loop).
+            __syncthreads(); // extra sync needed because of loop. In normal usage
+                // sync will be taken care of by the load and store functions
+                // (outside the loop).
         }
         hipcub::StoreDirectBlocked(lid, d_output + block_offset, input);
     }
@@ -105,27 +93,23 @@ struct striped_to_blocked
 
 struct blocked_to_warp_striped
 {
-    template<
-        class T,
-        unsigned int BlockSize,
-        unsigned int ItemsPerThread,
-        unsigned int Trials
-    >
-    __device__
-    static void run(const T * d_input, const unsigned int *, T * d_output)
+    template<class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials>
+    __device__ static void run(const T* d_input, const unsigned int*, T* d_output)
     {
-        const unsigned int lid = hipThreadIdx_x;
+        const unsigned int lid          = hipThreadIdx_x;
         const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize;
 
         T input[ItemsPerThread];
         hipcub::LoadDirectBlocked(lid, d_input + block_offset, input);
 
-        #pragma nounroll
+#pragma nounroll
         for(unsigned int trial = 0; trial < Trials; trial++)
         {
             hipcub::BlockExchange<T, BlockSize, ItemsPerThread> exchange;
             exchange.BlockedToWarpStriped(input, input);
-            __syncthreads();// extra sync needed because of loop. In normal usage sync with be cared for by the load and store functions (outside the loop).
+            __syncthreads(); // extra sync needed because of loop. In normal usage
+                // sync will be taken care of by the load and store functions
+                // (outside the loop).
         }
         hipcub::StoreDirectWarpStriped(lid, d_output + block_offset, input);
     }
@@ -133,27 +117,23 @@ struct blocked_to_warp_striped
 
 struct warp_striped_to_blocked
 {
-    template<
-        class T,
-        unsigned int BlockSize,
-        unsigned int ItemsPerThread,
-        unsigned int Trials
-    >
-    __device__
-    static void run(const T * d_input, const unsigned int *, T * d_output)
+    template<class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials>
+    __device__ static void run(const T* d_input, const unsigned int*, T* d_output)
     {
-        const unsigned int lid = hipThreadIdx_x;
+        const unsigned int lid          = hipThreadIdx_x;
         const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize;
 
         T input[ItemsPerThread];
         hipcub::LoadDirectWarpStriped(lid, d_input + block_offset, input);
 
-        #pragma nounroll
+#pragma nounroll
         for(unsigned int trial = 0; trial < Trials; trial++)
         {
             hipcub::BlockExchange<T, BlockSize, ItemsPerThread> exchange;
             exchange.WarpStripedToBlocked(input, input);
-            __syncthreads(); // extra sync needed because of loop. In normal usage sync with be cared for by the load and store functions (outside the loop).
+            __syncthreads(); // extra sync needed because of loop. In normal usage
+                // sync will be taken care of by the load and store functions
+                // (outside the loop).
         }
         hipcub::StoreDirectBlocked(lid, d_output + block_offset, input);
     }
@@ -161,29 +141,25 @@ struct warp_striped_to_blocked
 
 struct scatter_to_blocked
 {
-    template<
-        class T,
-        unsigned int BlockSize,
-        unsigned int ItemsPerThread,
-        unsigned int Trials
-    >
-    __device__
-    static void run(const T * d_input, const unsigned int * d_ranks, T * d_output)
+    template<class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials>
+    __device__ static void run(const T* d_input, const unsigned int* d_ranks, T* d_output)
     {
-        const unsigned int lid = hipThreadIdx_x;
+        const unsigned int lid          = hipThreadIdx_x;
         const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize;
 
-        T input[ItemsPerThread];
+        T            input[ItemsPerThread];
         unsigned int ranks[ItemsPerThread];
         hipcub::LoadDirectStriped<BlockSize>(lid, d_input + block_offset, input);
         hipcub::LoadDirectStriped<BlockSize>(lid, d_ranks + block_offset, ranks);
 
-        #pragma nounroll
+#pragma nounroll
         for(unsigned int trial = 0; trial < Trials; trial++)
         {
             hipcub::BlockExchange<T, BlockSize, ItemsPerThread> exchange;
             exchange.ScatterToBlocked(input, input, ranks);
-            __syncthreads();// extra sync needed because of loop. In normal usage sync with be cared for by the load and store functions (outside the loop).
+            __syncthreads(); // extra sync needed because of loop. In normal usage
+                // sync will be taken care of by the load and store functions
+                // (outside the loop).
         }
         hipcub::StoreDirectBlocked(lid, d_output + block_offset, input);
     }
@@ -191,45 +167,39 @@ struct scatter_to_blocked
 
 struct scatter_to_striped
 {
-    template<
-        class T,
-        unsigned int BlockSize,
-        unsigned int ItemsPerThread,
-        unsigned int Trials
-    >
-    __device__
-    static void run(const T * d_input, const unsigned int * d_ranks, T * d_output)
+    template<class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials>
+    __device__ static void run(const T* d_input, const unsigned int* d_ranks, T* d_output)
     {
-        const unsigned int lid = hipThreadIdx_x;
+        const unsigned int lid          = hipThreadIdx_x;
         const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize;
 
-        T input[ItemsPerThread];
+        T            input[ItemsPerThread];
         unsigned int ranks[ItemsPerThread];
         hipcub::LoadDirectStriped<BlockSize>(lid, d_input + block_offset, input);
         hipcub::LoadDirectStriped<BlockSize>(lid, d_ranks + block_offset, ranks);
 
-        #pragma nounroll
+#pragma nounroll
         for(unsigned int trial = 0; trial < Trials; trial++)
         {
             hipcub::BlockExchange<T, BlockSize, ItemsPerThread> exchange;
             exchange.ScatterToStriped(input, input, ranks);
-            __syncthreads(); // extra sync needed because of loop. In normal usage sync with be cared for by the load and store functions (outside the loop).
+            __syncthreads(); // extra sync needed because of loop. In normal usage
+                // sync will be taken care of by the load and store functions
+                // (outside the loop).
         }
         hipcub::StoreDirectStriped<BlockSize>(lid, d_output + block_offset, input);
     }
 };
 
-template<
-    class Benchmark,
-    class T,
-    unsigned int BlockSize,
-    unsigned int ItemsPerThread,
-    unsigned int Trials = 100
->
+template<class Benchmark,
+         class T,
+         unsigned int BlockSize,
+         unsigned int ItemsPerThread,
+         unsigned int Trials = 100>
 void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
 {
     constexpr auto items_per_block = BlockSize * ItemsPerThread;
-    const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block);
+    const auto     size = items_per_block * ((N + items_per_block - 1) / items_per_block);
 
     std::vector<T> input(size);
     // Fill input
@@ -246,43 +216,34 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
         std::iota(block_ranks, block_ranks + items_per_block, 0);
         std::shuffle(block_ranks, block_ranks + items_per_block, gen);
     }
-    T * d_input;
-    unsigned int * d_ranks;
-    T * d_output;
+    T*            d_input;
+    unsigned int* d_ranks;
+    T*            d_output;
     HIP_CHECK(hipMalloc(&d_input, size * sizeof(T)));
     HIP_CHECK(hipMalloc(&d_ranks, size * sizeof(unsigned int)));
     HIP_CHECK(hipMalloc(&d_output, size * sizeof(T)));
-    HIP_CHECK(
-        hipMemcpy(
-            d_input, input.data(),
-            size * sizeof(T),
-            hipMemcpyHostToDevice
-        )
-    );
-    HIP_CHECK(
-        hipMemcpy(
-            d_ranks, ranks.data(),
-            size * sizeof(unsigned int),
-            hipMemcpyHostToDevice
-        )
-    );
+    HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice));
+    HIP_CHECK(hipMemcpy(d_ranks, ranks.data(), size * sizeof(unsigned int), hipMemcpyHostToDevice));
     HIP_CHECK(hipDeviceSynchronize());
 
     for(auto _ : state)
     {
         auto start = std::chrono::high_resolution_clock::now();
 
-        hipLaunchKernelGGL(
-            HIP_KERNEL_NAME(kernel<Benchmark, T, BlockSize, ItemsPerThread, Trials>),
-            dim3(size/items_per_block), dim3(BlockSize), 0, stream,
-            d_input, d_ranks, d_output
-        );
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel<Benchmark, T, BlockSize, ItemsPerThread, Trials>),
+                           dim3(size / items_per_block),
+                           dim3(BlockSize),
+                           0,
+                           stream,
+                           d_input,
+                           d_ranks,
+                           d_output);
         HIP_CHECK(hipPeekAtLastError());
         HIP_CHECK(hipDeviceSynchronize());
 
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
         state.SetIterationTime(elapsed_seconds.count());
     }
     state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T));
@@ -293,32 +254,30 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
     HIP_CHECK(hipFree(d_output));
 }
 
-#define CREATE_BENCHMARK(T, BS, IPT) \
-benchmark::RegisterBenchmark( \
-    (std::string("block_exchange<Datatype:" #T ",Block Size:" #BS ",Items Per Thread:" #IPT ">.SubAlgorithm Name:") + name).c_str(), \
-    &run_benchmark<Benchmark, T, BS, IPT>, \
-    stream, size \
-)
-
-#define BENCHMARK_TYPE(type, block) \
-    CREATE_BENCHMARK(type, block, 1), \
-    CREATE_BENCHMARK(type, block, 2), \
-    CREATE_BENCHMARK(type, block, 3), \
-    CREATE_BENCHMARK(type, block, 4), \
-    CREATE_BENCHMARK(type, block, 7), \
-    CREATE_BENCHMARK(type, block, 8)
+#define CREATE_BENCHMARK(T, BS, IPT)                                                           \
+    benchmark::RegisterBenchmark(std::string("block_exchange<data_type:" #T ",block_size:" #BS \
+                                             ",items_per_thread:" #IPT ">.sub_algorithm_name:" \
+                                             + name)                                           \
+                                     .c_str(),                                                 \
+                                 &run_benchmark<Benchmark, T, BS, IPT>,                        \
+                                 stream,                                                       \
+                                 size)
+
+#define BENCHMARK_TYPE(type, block)                                         \
+    CREATE_BENCHMARK(type, block, 1), CREATE_BENCHMARK(type, block, 2),     \
+        CREATE_BENCHMARK(type, block, 3), CREATE_BENCHMARK(type, block, 4), \
+        CREATE_BENCHMARK(type, block, 7), CREATE_BENCHMARK(type, block, 8)
 
 template<class Benchmark>
-void add_benchmarks(const std::string& name,
+void add_benchmarks(const std::string&                            name,
                     std::vector<benchmark::internal::Benchmark*>& benchmarks,
-                    hipStream_t stream,
-                    size_t size)
+                    hipStream_t                                   stream,
+                    size_t                                        size)
 {
-    using custom_float2 = benchmark_utils::custom_type<float, float>;
+    using custom_float2  = benchmark_utils::custom_type<float, float>;
     using custom_double2 = benchmark_utils::custom_type<double, double>;
 
-    std::vector<benchmark::internal::Benchmark*> bs =
-    {
+    std::vector<benchmark::internal::Benchmark*> bs = {
         BENCHMARK_TYPE(int, 256),
         BENCHMARK_TYPE(int8_t, 256),
         BENCHMARK_TYPE(long long, 256),
@@ -329,7 +288,7 @@ void add_benchmarks(const std::string& name,
     benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
 }
 
-int main(int argc, char *argv[])
+int main(int argc, char* argv[])
 {
     cli::Parser parser(argc, argv);
     parser.set_optional<size_t>("size", "size", DEFAULT_N, "number of values");
@@ -338,15 +297,15 @@ int main(int argc, char *argv[])
 
     // Parse argv
     benchmark::Initialize(&argc, argv);
-    const size_t size = parser.get<size_t>("size");
-    const int trials = parser.get<int>("trials");
+    const size_t size   = parser.get<size_t>("size");
+    const int    trials = parser.get<int>("trials");
 
     std::cout << "benchmark_block_exchange" << std::endl;
 
     // HIP
-    hipStream_t stream = 0; // default
+    hipStream_t     stream = 0; // default
     hipDeviceProp_t devProp;
-    int device_id = 0;
+    int             device_id = 0;
     HIP_CHECK(hipGetDevice(&device_id));
     HIP_CHECK(hipGetDeviceProperties(&devProp, device_id));
     std::cout << "[HIP] Device name: " << devProp.name << std::endl;
diff --git a/benchmark/benchmark_block_histogram.cpp b/benchmark/benchmark_block_histogram.cpp
index e247a13b..122ccc36 100644
--- a/benchmark/benchmark_block_histogram.cpp
+++ b/benchmark/benchmark_block_histogram.cpp
@@ -9,8 +9,8 @@
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -25,22 +25,17 @@
 // HIP API
 #include "hipcub/block/block_histogram.hpp"
 
-
 #ifndef DEFAULT_N
 const size_t DEFAULT_N = 1024 * 1024 * 128;
 #endif
 
-template<
-    class Runner,
-    class T,
-    unsigned int BlockSize,
-    unsigned int ItemsPerThread,
-    unsigned int BinSize,
-    unsigned int Trials
->
-__global__
-__launch_bounds__(BlockSize)
-void kernel(const T* input, T* output)
+template<class Runner,
+         class T,
+         unsigned int BlockSize,
+         unsigned int ItemsPerThread,
+         unsigned int BinSize,
+         unsigned int Trials>
+__global__ __launch_bounds__(BlockSize) void kernel(const T* input, T* output)
 {
     Runner::template run<T, BlockSize, ItemsPerThread, BinSize, Trials>(input, output);
 }
@@ -48,18 +43,15 @@ void kernel(const T* input, T* output)
 template<hipcub::BlockHistogramAlgorithm algorithm>
 struct histogram
 {
-    template<
-        class T,
-        unsigned int BlockSize,
-        unsigned int ItemsPerThread,
-        unsigned int BinSize,
-        unsigned int Trials
-    >
-    __device__
-    static void run(const T* input, T* output)
+    template<class T,
+             unsigned int BlockSize,
+             unsigned int ItemsPerThread,
+             unsigned int BinSize,
+             unsigned int Trials>
+    __device__ static void run(const T* input, T* output)
     {
         const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread;
-        unsigned int global_offset = hipBlockIdx_x * BinSize;
+        unsigned int       global_offset = hipBlockIdx_x * BinSize;
 
         T values[ItemsPerThread];
         for(unsigned int k = 0; k < ItemsPerThread; k++)
@@ -67,18 +59,19 @@ struct histogram
             values[k] = input[index + k];
         }
 
-        using bhistogram_t =  hipcub::BlockHistogram<T, BlockSize, ItemsPerThread, BinSize, algorithm>;
-        __shared__ T histogram[BinSize];
+        using bhistogram_t
+            = hipcub::BlockHistogram<T, BlockSize, ItemsPerThread, BinSize, algorithm>;
+        __shared__ T                                  histogram[BinSize];
         __shared__ typename bhistogram_t::TempStorage storage;
 
-        #pragma nounroll
+#pragma nounroll
         for(unsigned int trial = 0; trial < Trials; trial++)
         {
             bhistogram_t(storage).Histogram(values, histogram);
         }
 
-        #pragma unroll
-        for (unsigned int offset = 0; offset < BinSize; offset += BlockSize)
+#pragma unroll
+        for(unsigned int offset = 0; offset < BinSize; offset += BlockSize)
         {
             if(offset + hipThreadIdx_x < BinSize)
             {
@@ -89,49 +82,44 @@ struct histogram
     }
 };
 
-template<
-    class Benchmark,
-    class T,
-    unsigned int BlockSize,
-    unsigned int ItemsPerThread,
-    unsigned int BinSize = BlockSize,
-    unsigned int Trials = 100
->
+template<class Benchmark,
+         class T,
+         unsigned int BlockSize,
+         unsigned int ItemsPerThread,
+         unsigned int BinSize = BlockSize,
+         unsigned int Trials  = 100>
 void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
 {
     // Make sure size is a multiple of BlockSize
     constexpr auto items_per_block = BlockSize * ItemsPerThread;
-    const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block);
-    const auto bin_size = BinSize * ((N + items_per_block - 1)/items_per_block);
+    const auto     size     = items_per_block * ((N + items_per_block - 1) / items_per_block);
+    const auto     bin_size = BinSize * ((N + items_per_block - 1) / items_per_block);
     // Allocate and fill memory
     std::vector<T> input(size, 0.0f);
-    T * d_input;
-    T * d_output;
+    T*             d_input;
+    T*             d_output;
     HIP_CHECK(hipMalloc(&d_input, size * sizeof(T)));
     HIP_CHECK(hipMalloc(&d_output, bin_size * sizeof(T)));
-    HIP_CHECK(
-        hipMemcpy(
-            d_input, input.data(),
-            size * sizeof(T),
-            hipMemcpyHostToDevice
-        )
-    );
+    HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice));
     HIP_CHECK(hipDeviceSynchronize());
 
-    for (auto _ : state)
+    for(auto _ : state)
     {
         auto start = std::chrono::high_resolution_clock::now();
         hipLaunchKernelGGL(
             HIP_KERNEL_NAME(kernel<Benchmark, T, BlockSize, ItemsPerThread, BinSize, Trials>),
-            dim3(size/items_per_block), dim3(BlockSize), 0, stream,
-            d_input, d_output
-        );
+            dim3(size / items_per_block),
+            dim3(BlockSize),
+            0,
+            stream,
+            d_input,
+            d_output);
         HIP_CHECK(hipPeekAtLastError());
         HIP_CHECK(hipDeviceSynchronize());
 
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
 
         state.SetIterationTime(elapsed_seconds.count());
     }
@@ -143,41 +131,38 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
 }
 
 // IPT - items per thread
-#define CREATE_BENCHMARK(T, BS, IPT) \
-    benchmark::RegisterBenchmark( \
-        (std::string("block_histogram<Datatype:"#T",Block Size:"#BS",Items Per Thread:"#IPT",SubAlgorithm Name:" + algorithm_name + ">.Method Name:") + method_name).c_str(), \
-        &run_benchmark<Benchmark, T, BS, IPT>, \
-        stream, size \
-    )
-
-#define BENCHMARK_TYPE(type, block) \
-    CREATE_BENCHMARK(type, block, 1), \
-    CREATE_BENCHMARK(type, block, 2), \
-    CREATE_BENCHMARK(type, block, 3), \
-    CREATE_BENCHMARK(type, block, 4), \
-    CREATE_BENCHMARK(type, block, 8), \
-    CREATE_BENCHMARK(type, block, 16)
+#define CREATE_BENCHMARK(T, BS, IPT)                                                            \
+    benchmark::RegisterBenchmark(std::string("block_histogram<data_type:" #T ",block_size:" #BS \
+                                             ",items_per_thread:" #IPT ",sub_algorithm_name:"   \
+                                             + algorithm_name + ">.method_name:" + method_name) \
+                                     .c_str(),                                                  \
+                                 &run_benchmark<Benchmark, T, BS, IPT>,                         \
+                                 stream,                                                        \
+                                 size)
+
+#define BENCHMARK_TYPE(type, block)                                         \
+    CREATE_BENCHMARK(type, block, 1), CREATE_BENCHMARK(type, block, 2),     \
+        CREATE_BENCHMARK(type, block, 3), CREATE_BENCHMARK(type, block, 4), \
+        CREATE_BENCHMARK(type, block, 8), CREATE_BENCHMARK(type, block, 16)
 
 template<class Benchmark>
 void add_benchmarks(std::vector<benchmark::internal::Benchmark*>& benchmarks,
-                    const std::string& method_name,
-                    const std::string& algorithm_name,
-                    hipStream_t stream,
-                    size_t size)
+                    const std::string&                            method_name,
+                    const std::string&                            algorithm_name,
+                    hipStream_t                                   stream,
+                    size_t                                        size)
 {
-    std::vector<benchmark::internal::Benchmark*> new_benchmarks =
-    {
-        BENCHMARK_TYPE(int, 256),
-        BENCHMARK_TYPE(int, 320),
-        BENCHMARK_TYPE(int, 512),
+    std::vector<benchmark::internal::Benchmark*> new_benchmarks
+        = {BENCHMARK_TYPE(int, 256),
+           BENCHMARK_TYPE(int, 320),
+           BENCHMARK_TYPE(int, 512),
 
-        BENCHMARK_TYPE(unsigned long long, 256),
-        BENCHMARK_TYPE(unsigned long long, 320)
-    };
+           BENCHMARK_TYPE(unsigned long long, 256),
+           BENCHMARK_TYPE(unsigned long long, 320)};
     benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end());
 }
 
-int main(int argc, char *argv[])
+int main(int argc, char* argv[])
 {
     cli::Parser parser(argc, argv);
     parser.set_optional<size_t>("size", "size", DEFAULT_N, "number of values");
@@ -186,15 +171,15 @@ int main(int argc, char *argv[])
 
     // Parse argv
     benchmark::Initialize(&argc, argv);
-    const size_t size = parser.get<size_t>("size");
-    const int trials = parser.get<int>("trials");
+    const size_t size   = parser.get<size_t>("size");
+    const int    trials = parser.get<int>("trials");
 
     std::cout << "benchmark_block_histogram" << std::endl;
 
     // HIP
-    hipStream_t stream = 0; // default
+    hipStream_t     stream = 0; // default
     hipDeviceProp_t devProp;
-    int device_id = 0;
+    int             device_id = 0;
     HIP_CHECK(hipGetDevice(&device_id));
     HIP_CHECK(hipGetDeviceProperties(&devProp, device_id));
     std::cout << "[HIP] Device name: " << devProp.name << std::endl;
@@ -203,14 +188,10 @@ int main(int argc, char *argv[])
     std::vector<benchmark::internal::Benchmark*> benchmarks;
     // using_atomic
     using histogram_a_t = histogram<hipcub::BlockHistogramAlgorithm::BLOCK_HISTO_ATOMIC>;
-    add_benchmarks<histogram_a_t>(
-        benchmarks, "histogram", "using_atomic", stream, size
-    );
+    add_benchmarks<histogram_a_t>(benchmarks, "histogram", "using_atomic", stream, size);
     // using_sort
     using histogram_s_t = histogram<hipcub::BlockHistogramAlgorithm::BLOCK_HISTO_SORT>;
-    add_benchmarks<histogram_s_t>(
-        benchmarks, "histogram", "using_sort", stream, size
-    );
+    add_benchmarks<histogram_s_t>(benchmarks, "histogram", "using_sort", stream, size);
 
     // Use manual timing
     for(auto& b : benchmarks)
diff --git a/benchmark/benchmark_block_merge_sort.cpp b/benchmark/benchmark_block_merge_sort.cpp
index 14407a62..62ffbdfa 100644
--- a/benchmark/benchmark_block_merge_sort.cpp
+++ b/benchmark/benchmark_block_merge_sort.cpp
@@ -9,8 +9,8 @@
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -24,11 +24,10 @@
 
 #include "../test/hipcub/test_utils_sort_comparator.hpp"
 // HIP API
-#include "hipcub/block/block_merge_sort.hpp"
 #include "hipcub/block/block_load.hpp"
+#include "hipcub/block/block_merge_sort.hpp"
 #include "hipcub/block/block_store.hpp"
 
-
 #ifndef DEFAULT_N
 const size_t DEFAULT_N = 1024 * 1024 * 128;
 #endif
@@ -39,24 +38,22 @@ enum class benchmark_kinds
     sort_pairs
 };
 
-template<
-    class T,
-    unsigned int BlockSize,
-    unsigned int ItemsPerThread,
-    class CompareOp,
-    unsigned int Trials
->
-__global__
-__launch_bounds__(BlockSize)
-void sort_keys_kernel(const T * input, T * output, CompareOp compare_op)
+template<class T,
+         unsigned int BlockSize,
+         unsigned int ItemsPerThread,
+         class CompareOp,
+         unsigned int Trials>
+__global__ __launch_bounds__(BlockSize) void sort_keys_kernel(const T*  input,
+                                                              T*        output,
+                                                              CompareOp compare_op)
 {
-    const unsigned int lid = hipThreadIdx_x;
+    const unsigned int lid          = hipThreadIdx_x;
     const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize;
 
     T keys[ItemsPerThread];
     hipcub::LoadDirectStriped<BlockSize>(lid, input + block_offset, keys);
 
-    #pragma nounroll
+#pragma nounroll
     for(unsigned int trial = 0; trial < Trials; trial++)
     {
         hipcub::BlockMergeSort<T, BlockSize, ItemsPerThread> sort;
@@ -66,18 +63,16 @@ void sort_keys_kernel(const T * input, T * output, CompareOp compare_op)
     hipcub::StoreDirectStriped<BlockSize>(lid, output + block_offset, keys);
 }
 
-template<
-    class T,
-    unsigned int BlockSize,
-    unsigned int ItemsPerThread,
-    class CompareOp,
-    unsigned int Trials
->
-__global__
-__launch_bounds__(BlockSize)
-void sort_pairs_kernel(const T * input, T * output, CompareOp compare_op)
+template<class T,
+         unsigned int BlockSize,
+         unsigned int ItemsPerThread,
+         class CompareOp,
+         unsigned int Trials>
+__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel(const T*  input,
+                                                               T*        output,
+                                                               CompareOp compare_op)
 {
-    const unsigned int lid = hipThreadIdx_x;
+    const unsigned int lid          = hipThreadIdx_x;
     const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize;
 
     T keys[ItemsPerThread];
@@ -89,7 +84,7 @@ void sort_pairs_kernel(const T * input, T * output, CompareOp compare_op)
         values[i] = keys[i] + T(1);
     }
 
-    #pragma nounroll
+#pragma nounroll
     for(unsigned int trial = 0; trial < Trials; trial++)
     {
         hipcub::BlockMergeSort<T, BlockSize, ItemsPerThread, T> sort;
@@ -101,45 +96,36 @@ void sort_pairs_kernel(const T * input, T * output, CompareOp compare_op)
         keys[i] += values[i];
     }
     hipcub::StoreDirectStriped<BlockSize>(lid, output + block_offset, keys);
-
 }
 
-template<
-    class T,
-    unsigned int BlockSize,
-    unsigned int ItemsPerThread,
-    class CompareOp = test_utils::less,
-    unsigned int Trials = 10
->
-void run_benchmark(benchmark::State& state, benchmark_kinds benchmark_kind, hipStream_t stream, size_t N)
+template<class T,
+         unsigned int BlockSize,
+         unsigned int ItemsPerThread,
+         class CompareOp     = test_utils::less,
+         unsigned int Trials = 10>
+void run_benchmark(benchmark::State& state,
+                   benchmark_kinds   benchmark_kind,
+                   hipStream_t       stream,
+                   size_t            N)
 {
     constexpr auto items_per_block = BlockSize * ItemsPerThread;
-    const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block);
+    const auto     size = items_per_block * ((N + items_per_block - 1) / items_per_block);
 
     std::vector<T> input;
     if(std::is_floating_point<T>::value)
     {
-        input = benchmark_utils::get_random_data<T>(size, (T)-1000, (T)+1000);
-    }
-    else
+        input = benchmark_utils::get_random_data<T>(size, (T)-1000, (T) + 1000);
+    } else
     {
-        input = benchmark_utils::get_random_data<T>(
-            size,
-            std::numeric_limits<T>::min(),
-            std::numeric_limits<T>::max()
-        );
+        input = benchmark_utils::get_random_data<T>(size,
+                                                    std::numeric_limits<T>::min(),
+                                                    std::numeric_limits<T>::max());
     }
-    T * d_input;
-    T * d_output;
+    T* d_input;
+    T* d_output;
     HIP_CHECK(hipMalloc(&d_input, size * sizeof(T)));
     HIP_CHECK(hipMalloc(&d_output, size * sizeof(T)));
-    HIP_CHECK(
-        hipMemcpy(
-            d_input, input.data(),
-            size * sizeof(T),
-            hipMemcpyHostToDevice
-        )
-    );
+    HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice));
     HIP_CHECK(hipDeviceSynchronize());
 
     for(auto _ : state)
@@ -150,24 +136,31 @@ void run_benchmark(benchmark::State& state, benchmark_kinds benchmark_kind, hipS
         {
             hipLaunchKernelGGL(
                 HIP_KERNEL_NAME(sort_keys_kernel<T, BlockSize, ItemsPerThread, CompareOp, Trials>),
-                dim3(size/items_per_block), dim3(BlockSize), 0, stream,
-                d_input, d_output, CompareOp()
-            );
-        }
-        else if(benchmark_kind == benchmark_kinds::sort_pairs)
+                dim3(size / items_per_block),
+                dim3(BlockSize),
+                0,
+                stream,
+                d_input,
+                d_output,
+                CompareOp());
+        } else if(benchmark_kind == benchmark_kinds::sort_pairs)
         {
             hipLaunchKernelGGL(
                 HIP_KERNEL_NAME(sort_pairs_kernel<T, BlockSize, ItemsPerThread, CompareOp, Trials>),
-                dim3(size/items_per_block), dim3(BlockSize), 0, stream,
-                d_input, d_output, CompareOp()
-            );
+                dim3(size / items_per_block),
+                dim3(BlockSize),
+                0,
+                stream,
+                d_input,
+                d_output,
+                CompareOp());
         }
         HIP_CHECK(hipPeekAtLastError());
         HIP_CHECK(hipDeviceSynchronize());
 
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
         state.SetIterationTime(elapsed_seconds.count());
     }
     state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T));
@@ -177,53 +170,51 @@ void run_benchmark(benchmark::State& state, benchmark_kinds benchmark_kind, hipS
     HIP_CHECK(hipFree(d_output));
 }
 
-#define CREATE_BENCHMARK(T, BS, IPT) \
-benchmark::RegisterBenchmark( \
-    (std::string("block_merge_sort<Datatype:" #T ",Block Size:" #BS ",Items Per Thread:" #IPT ">.SubAlgorithm Name:") + name).c_str(), \
-    &run_benchmark<T, BS, IPT>, \
-    benchmark_kind, stream, size \
-)
-
-#define BENCHMARK_TYPE(type, block) \
-    CREATE_BENCHMARK(type, block, 1), \
-    CREATE_BENCHMARK(type, block, 2), \
-    CREATE_BENCHMARK(type, block, 3), \
-    CREATE_BENCHMARK(type, block, 4), \
-    CREATE_BENCHMARK(type, block, 8)
-
-void add_benchmarks(benchmark_kinds benchmark_kind,
-                    const std::string& name,
+#define CREATE_BENCHMARK(T, BS, IPT)                                                             \
+    benchmark::RegisterBenchmark(std::string("block_merge_sort<data_type:" #T ",block_size:" #BS \
+                                             ",items_per_thread:" #IPT ">.sub_algorithm_name:"   \
+                                             + name)                                             \
+                                     .c_str(),                                                   \
+                                 &run_benchmark<T, BS, IPT>,                                     \
+                                 benchmark_kind,                                                 \
+                                 stream,                                                         \
+                                 size)
+
+#define BENCHMARK_TYPE(type, block)                                         \
+    CREATE_BENCHMARK(type, block, 1), CREATE_BENCHMARK(type, block, 2),     \
+        CREATE_BENCHMARK(type, block, 3), CREATE_BENCHMARK(type, block, 4), \
+        CREATE_BENCHMARK(type, block, 8)
+
+void add_benchmarks(benchmark_kinds                               benchmark_kind,
+                    const std::string&                            name,
                     std::vector<benchmark::internal::Benchmark*>& benchmarks,
-                    hipStream_t stream,
-                    size_t size)
+                    hipStream_t                                   stream,
+                    size_t                                        size)
 {
-    std::vector<benchmark::internal::Benchmark*> bs =
-    {
-        BENCHMARK_TYPE(int, 64),
-        BENCHMARK_TYPE(int, 128),
-        BENCHMARK_TYPE(int, 256),
-        BENCHMARK_TYPE(int, 512),
-
-        BENCHMARK_TYPE(int8_t, 64),
-        BENCHMARK_TYPE(int8_t, 128),
-        BENCHMARK_TYPE(int8_t, 256),
-        BENCHMARK_TYPE(int8_t, 512),
-
-        BENCHMARK_TYPE(uint8_t, 64),
-        BENCHMARK_TYPE(uint8_t, 128),
-        BENCHMARK_TYPE(uint8_t, 256),
-        BENCHMARK_TYPE(uint8_t, 512),
-
-        BENCHMARK_TYPE(long long, 64),
-        BENCHMARK_TYPE(long long, 128),
-        BENCHMARK_TYPE(long long, 256),
-        BENCHMARK_TYPE(long long, 512)
-    };
+    std::vector<benchmark::internal::Benchmark*> bs = {BENCHMARK_TYPE(int, 64),
+                                                       BENCHMARK_TYPE(int, 128),
+                                                       BENCHMARK_TYPE(int, 256),
+                                                       BENCHMARK_TYPE(int, 512),
+
+                                                       BENCHMARK_TYPE(int8_t, 64),
+                                                       BENCHMARK_TYPE(int8_t, 128),
+                                                       BENCHMARK_TYPE(int8_t, 256),
+                                                       BENCHMARK_TYPE(int8_t, 512),
+
+                                                       BENCHMARK_TYPE(uint8_t, 64),
+                                                       BENCHMARK_TYPE(uint8_t, 128),
+                                                       BENCHMARK_TYPE(uint8_t, 256),
+                                                       BENCHMARK_TYPE(uint8_t, 512),
+
+                                                       BENCHMARK_TYPE(long long, 64),
+                                                       BENCHMARK_TYPE(long long, 128),
+                                                       BENCHMARK_TYPE(long long, 256),
+                                                       BENCHMARK_TYPE(long long, 512)};
 
     benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
 }
 
-int main(int argc, char *argv[])
+int main(int argc, char* argv[])
 {
     cli::Parser parser(argc, argv);
     parser.set_optional<size_t>("size", "size", DEFAULT_N, "number of values");
@@ -232,15 +223,15 @@ int main(int argc, char *argv[])
 
     // Parse argv
     benchmark::Initialize(&argc, argv);
-    const size_t size = parser.get<size_t>("size");
-    const int trials = parser.get<int>("trials");
+    const size_t size   = parser.get<size_t>("size");
+    const int    trials = parser.get<int>("trials");
 
     std::cout << "benchmark_block_merge_sort" << std::endl;
 
     // HIP
-    hipStream_t stream = 0; // default
+    hipStream_t     stream = 0; // default
     hipDeviceProp_t devProp;
-    int device_id = 0;
+    int             device_id = 0;
     HIP_CHECK(hipGetDevice(&device_id));
     HIP_CHECK(hipGetDeviceProperties(&devProp, device_id));
     std::cout << "[HIP] Device name: " << devProp.name << std::endl;
diff --git a/benchmark/benchmark_block_radix_rank.cpp b/benchmark/benchmark_block_radix_rank.cpp
index ffecb5aa..8578b75c 100644
--- a/benchmark/benchmark_block_radix_rank.cpp
+++ b/benchmark/benchmark_block_radix_rank.cpp
@@ -9,8 +9,8 @@
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -115,8 +115,7 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
         input = benchmark_utils::get_random_data<T>(size,
                                                     static_cast<T>(-1000),
                                                     static_cast<T>(1000));
-    }
-    else
+    } else
     {
         input = benchmark_utils::get_random_data<T>(size,
                                                     std::numeric_limits<T>::min(),
@@ -157,12 +156,14 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
     HIP_CHECK(hipFree(d_output));
 }
 
-#define CREATE_BENCHMARK(T, KIND, BS, IPT)                                                       \
-    benchmark::RegisterBenchmark(                                                                \
-        (std::string("block_radix_rank<" #T ", " #KIND ", " #BS ", " #IPT ">.") + name).c_str(), \
-        &run_benchmark<T, KIND, BS, IPT>,                                                        \
-        stream,                                                                                  \
-        size)
+#define CREATE_BENCHMARK(T, KIND, BS, IPT)                                                     \
+    benchmark::RegisterBenchmark(std::string("block_radix_rank<data_type:" #T ",kind:" #KIND   \
+                                             ",block_size:" #BS ",items_per_thread:" #IPT ">." \
+                                             + name)                                           \
+                                     .c_str(),                                                 \
+                                 &run_benchmark<T, KIND, BS, IPT>,                             \
+                                 stream,                                                       \
+                                 size)
 
 // clang-format off
 #define CREATE_BENCHMARK_KINDS(type, block, ipt)                                \
@@ -218,6 +219,8 @@ int main(int argc, char* argv[])
     int             device_id = 0;
     HIP_CHECK(hipGetDevice(&device_id));
     HIP_CHECK(hipGetDeviceProperties(&devProp, device_id));
+
+    std::cout << "benchmark_block_radix_rank" << std::endl;
     std::cout << "[HIP] Device name: " << devProp.name << std::endl;
 
     // Add benchmarks
diff --git a/benchmark/benchmark_block_radix_sort.cpp b/benchmark/benchmark_block_radix_sort.cpp
index dbd13fea..7413214e 100644
--- a/benchmark/benchmark_block_radix_sort.cpp
+++ b/benchmark/benchmark_block_radix_sort.cpp
@@ -9,8 +9,8 @@
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -23,11 +23,10 @@
 #include "common_benchmark_header.hpp"
 
 // HIP API
-#include "hipcub/block/block_radix_sort.hpp"
 #include "hipcub/block/block_load.hpp"
+#include "hipcub/block/block_radix_sort.hpp"
 #include "hipcub/block/block_store.hpp"
 
-
 #ifndef DEFAULT_N
 const size_t DEFAULT_N = 1024 * 1024 * 128;
 #endif
@@ -135,7 +134,7 @@ __global__ __launch_bounds__(BlockSize) void sort_keys_kernel(const T* input, T*
     T keys[ItemsPerThread];
     Helper::template load<BlockSize>(lid, input + block_offset, keys);
 
-    #pragma nounroll
+#pragma nounroll
     for(unsigned int trial = 0; trial < Trials; trial++)
     {
         Helper::template sort<BlockSize>(keys);
@@ -163,7 +162,7 @@ __global__ __launch_bounds__(BlockSize) void sort_pairs_kernel(const T* input, T
         values[i] = keys[i] + T(1);
     }
 
-    #pragma nounroll
+#pragma nounroll
     for(unsigned int trial = 0; trial < Trials; trial++)
     {
         Helper::template sort<BlockSize>(keys, values);
@@ -188,32 +187,23 @@ void run_benchmark(benchmark::State& state,
                    size_t            N)
 {
     constexpr auto items_per_block = BlockSize * ItemsPerThread;
-    const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block);
+    const auto     size = items_per_block * ((N + items_per_block - 1) / items_per_block);
 
     std::vector<T> input;
     if(std::is_floating_point<T>::value)
     {
-        input = benchmark_utils::get_random_data<T>(size, (T)-1000, (T)+1000);
-    }
-    else
+        input = benchmark_utils::get_random_data<T>(size, (T)-1000, (T) + 1000);
+    } else
     {
-        input = benchmark_utils::get_random_data<T>(
-            size,
-            std::numeric_limits<T>::min(),
-            std::numeric_limits<T>::max()
-        );
+        input = benchmark_utils::get_random_data<T>(size,
+                                                    std::numeric_limits<T>::min(),
+                                                    std::numeric_limits<T>::max());
     }
-    T * d_input;
-    T * d_output;
+    T* d_input;
+    T* d_output;
     HIP_CHECK(hipMalloc(&d_input, size * sizeof(T)));
     HIP_CHECK(hipMalloc(&d_output, size * sizeof(T)));
-    HIP_CHECK(
-        hipMemcpy(
-            d_input, input.data(),
-            size * sizeof(T),
-            hipMemcpyHostToDevice
-        )
-    );
+    HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice));
     HIP_CHECK(hipDeviceSynchronize());
 
     for(auto _ : state)
@@ -224,8 +214,7 @@ void run_benchmark(benchmark::State& state,
         {
             sort_keys_kernel<Helper, T, BlockSize, ItemsPerThread, Trials>
                 <<<dim3(size / items_per_block), dim3(BlockSize), 0, stream>>>(d_input, d_output);
-        }
-        else if(benchmark_kind == benchmark_kinds::sort_pairs)
+        } else if(benchmark_kind == benchmark_kinds::sort_pairs)
         {
             sort_pairs_kernel<Helper, T, BlockSize, ItemsPerThread, Trials>
                 <<<dim3(size / items_per_block), dim3(BlockSize), 0, stream>>>(d_input, d_output);
@@ -234,8 +223,8 @@ void run_benchmark(benchmark::State& state,
         HIP_CHECK(hipDeviceSynchronize());
 
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
         state.SetIterationTime(elapsed_seconds.count());
     }
     state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T));
@@ -246,9 +235,9 @@ void run_benchmark(benchmark::State& state,
 }
 
 #define CREATE_BENCHMARK(T, BS, IPT)                                                             \
-    benchmark::RegisterBenchmark((std::string("block_radix_sort<Datatype:" #T ",Block Size:" #BS \
-                                              ",Items Per Thread:" #IPT ">.SubAlgorithm Name:")  \
-                                  + name)                                                        \
+    benchmark::RegisterBenchmark(std::string("block_radix_sort<data_type:" #T ",block_size:" #BS \
+                                             ",items_per_thread:" #IPT ">.sub_algorithm_name:"   \
+                                             + name)                                             \
                                      .c_str(),                                                   \
                                  &run_benchmark<Helper, T, BS, IPT>,                             \
                                  benchmark_kind,                                                 \
@@ -293,7 +282,7 @@ void add_benchmarks(benchmark_kinds                               benchmark_kind
     benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
 }
 
-int main(int argc, char *argv[])
+int main(int argc, char* argv[])
 {
     cli::Parser parser(argc, argv);
     parser.set_optional<size_t>("size", "size", DEFAULT_N, "number of values");
@@ -302,15 +291,15 @@ int main(int argc, char *argv[])
 
     // Parse argv
     benchmark::Initialize(&argc, argv);
-    const size_t size = parser.get<size_t>("size");
-    const int trials = parser.get<int>("trials");
+    const size_t size   = parser.get<size_t>("size");
+    const int    trials = parser.get<int>("trials");
 
     std::cout << "benchmark_block_radix_sort" << std::endl;
 
     // HIP
-    hipStream_t stream = 0; // default
+    hipStream_t     stream = 0; // default
     hipDeviceProp_t devProp;
-    int device_id = 0;
+    int             device_id = 0;
     HIP_CHECK(hipGetDevice(&device_id));
     HIP_CHECK(hipGetDeviceProperties(&devProp, device_id));
     std::cout << "[HIP] Device name: " << devProp.name << std::endl;
diff --git a/benchmark/benchmark_block_reduce.cpp b/benchmark/benchmark_block_reduce.cpp
index a9a33909..bdb089e7 100644
--- a/benchmark/benchmark_block_reduce.cpp
+++ b/benchmark/benchmark_block_reduce.cpp
@@ -9,8 +9,8 @@
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -26,21 +26,16 @@
 #include "hipcub/block/block_reduce.hpp"
 #include "hipcub/thread/thread_operators.hpp"
 
-
 #ifndef DEFAULT_N
 const size_t DEFAULT_N = 1024 * 1024 * 32;
 #endif
 
-template<
-    class Runner,
-    class T,
-    unsigned int BlockSize,
-    unsigned int ItemsPerThread,
-    unsigned int Trials
->
-__global__
-__launch_bounds__(BlockSize)
-void kernel(const T* input, T* output)
+template<class Runner,
+         class T,
+         unsigned int BlockSize,
+         unsigned int ItemsPerThread,
+         unsigned int Trials>
+__global__ __launch_bounds__(BlockSize) void kernel(const T* input, T* output)
 {
     Runner::template run<T, BlockSize, ItemsPerThread, Trials>(input, output);
 }
@@ -48,14 +43,8 @@ void kernel(const T* input, T* output)
 template<hipcub::BlockReduceAlgorithm algorithm>
 struct reduce
 {
-    template<
-        class T,
-        unsigned int BlockSize,
-        unsigned int ItemsPerThread,
-        unsigned int Trials
-    >
-    __device__
-    static void run(const T* input, T* output)
+    template<class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials>
+    __device__ static void run(const T* input, T* output)
     {
         const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
 
@@ -69,11 +58,11 @@ struct reduce
         using breduce_t = hipcub::BlockReduce<T, BlockSize, algorithm>;
         __shared__ typename breduce_t::TempStorage storage;
 
-        #pragma nounroll
+#pragma nounroll
         for(unsigned int trial = 0; trial < Trials; trial++)
         {
             reduced_value = breduce_t(storage).Reduce(values, hipcub::Sum());
-            values[0] = reduced_value;
+            values[0]     = reduced_value;
         }
 
         if(hipThreadIdx_x == 0)
@@ -83,47 +72,41 @@ struct reduce
     }
 };
 
-template<
-    class Benchmark,
-    class T,
-    unsigned int BlockSize,
-    unsigned int ItemsPerThread,
-    unsigned int Trials = 100
->
+template<class Benchmark,
+         class T,
+         unsigned int BlockSize,
+         unsigned int ItemsPerThread,
+         unsigned int Trials = 100>
 void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
 {
     // Make sure size is a multiple of BlockSize
     constexpr auto items_per_block = BlockSize * ItemsPerThread;
-    const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block);
+    const auto     size = items_per_block * ((N + items_per_block - 1) / items_per_block);
     // Allocate and fill memory
     std::vector<T> input(size, T(1));
-    T * d_input;
-    T * d_output;
+    T*             d_input;
+    T*             d_output;
     HIP_CHECK(hipMalloc(&d_input, size * sizeof(T)));
     HIP_CHECK(hipMalloc(&d_output, size * sizeof(T)));
-    HIP_CHECK(
-        hipMemcpy(
-            d_input, input.data(),
-            size * sizeof(T),
-            hipMemcpyHostToDevice
-        )
-    );
+    HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice));
     HIP_CHECK(hipDeviceSynchronize());
 
-    for (auto _ : state)
+    for(auto _ : state)
     {
         auto start = std::chrono::high_resolution_clock::now();
-        hipLaunchKernelGGL(
-            HIP_KERNEL_NAME(kernel<Benchmark, T, BlockSize, ItemsPerThread, Trials>),
-            dim3(size/items_per_block), dim3(BlockSize), 0, stream,
-            d_input, d_output
-        );
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel<Benchmark, T, BlockSize, ItemsPerThread, Trials>),
+                           dim3(size / items_per_block),
+                           dim3(BlockSize),
+                           0,
+                           stream,
+                           d_input,
+                           d_output);
         HIP_CHECK(hipPeekAtLastError());
         HIP_CHECK(hipDeviceSynchronize());
 
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
 
         state.SetIterationTime(elapsed_seconds.count());
     }
@@ -135,32 +118,30 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
 }
 
 // IPT - items per thread
-#define CREATE_BENCHMARK(T, BS, IPT) \
-    benchmark::RegisterBenchmark( \
-        (std::string("block_reduce<Datatype:"#T",Block Size:"#BS",Items Per Thread:"#IPT",SubAlgorithm Name:" + algorithm_name + ">.Method Name:") + method_name).c_str(), \
-        &run_benchmark<Benchmark, T, BS, IPT>, \
-        stream, size \
-    )
-
-#define BENCHMARK_TYPE(type, block) \
-    CREATE_BENCHMARK(type, block, 1), \
-    CREATE_BENCHMARK(type, block, 2), \
-    CREATE_BENCHMARK(type, block, 3), \
-    CREATE_BENCHMARK(type, block, 4), \
-    CREATE_BENCHMARK(type, block, 8), \
-    CREATE_BENCHMARK(type, block, 11), \
-    CREATE_BENCHMARK(type, block, 16)
+#define CREATE_BENCHMARK(T, BS, IPT)                                                            \
+    benchmark::RegisterBenchmark(std::string("block_reduce<data_type:" #T ",block_size:" #BS    \
+                                             ",items_per_thread:" #IPT ",sub_algorithm_name:"   \
+                                             + algorithm_name + ">.method_name:" + method_name) \
+                                     .c_str(),                                                  \
+                                 &run_benchmark<Benchmark, T, BS, IPT>,                         \
+                                 stream,                                                        \
+                                 size)
+
+#define BENCHMARK_TYPE(type, block)                                          \
+    CREATE_BENCHMARK(type, block, 1), CREATE_BENCHMARK(type, block, 2),      \
+        CREATE_BENCHMARK(type, block, 3), CREATE_BENCHMARK(type, block, 4),  \
+        CREATE_BENCHMARK(type, block, 8), CREATE_BENCHMARK(type, block, 11), \
+        CREATE_BENCHMARK(type, block, 16)
 
 template<class Benchmark>
 void add_benchmarks(std::vector<benchmark::internal::Benchmark*>& benchmarks,
-                    const std::string& method_name,
-                    const std::string& algorithm_name,
-                    hipStream_t stream,
-                    size_t size)
+                    const std::string&                            method_name,
+                    const std::string&                            algorithm_name,
+                    hipStream_t                                   stream,
+                    size_t                                        size)
 {
 
-    std::vector<benchmark::internal::Benchmark*> new_benchmarks =
-    {
+    std::vector<benchmark::internal::Benchmark*> new_benchmarks = {
         // When block size is less than or equal to warp size
         BENCHMARK_TYPE(int, 64),
         BENCHMARK_TYPE(float, 64),
@@ -177,7 +158,7 @@ void add_benchmarks(std::vector<benchmark::internal::Benchmark*>& benchmarks,
     benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end());
 }
 
-int main(int argc, char *argv[])
+int main(int argc, char* argv[])
 {
     cli::Parser parser(argc, argv);
     parser.set_optional<size_t>("size", "size", DEFAULT_N, "number of values");
@@ -186,15 +167,15 @@ int main(int argc, char *argv[])
 
     // Parse argv
     benchmark::Initialize(&argc, argv);
-    const size_t size = parser.get<size_t>("size");
-    const int trials = parser.get<int>("trials");
+    const size_t size   = parser.get<size_t>("size");
+    const int    trials = parser.get<int>("trials");
 
     std::cout << "benchmark_block_reduce" << std::endl;
 
     // HIP
-    hipStream_t stream = 0; // default
+    hipStream_t     stream = 0; // default
     hipDeviceProp_t devProp;
-    int device_id = 0;
+    int             device_id = 0;
     HIP_CHECK(hipGetDevice(&device_id));
     HIP_CHECK(hipGetDeviceProperties(&devProp, device_id));
     std::cout << "[HIP] Device name: " << devProp.name << std::endl;
@@ -203,19 +184,22 @@ int main(int argc, char *argv[])
     std::vector<benchmark::internal::Benchmark*> benchmarks;
     // using_warp_scan
     using reduce_uwr_t = reduce<hipcub::BlockReduceAlgorithm::BLOCK_REDUCE_WARP_REDUCTIONS>;
-    add_benchmarks<reduce_uwr_t>(
-        benchmarks, "reduce", "BLOCK_REDUCE_WARP_REDUCTIONS", stream, size
-    );
+    add_benchmarks<reduce_uwr_t>(benchmarks,
+                                 "reduce",
+                                 "BLOCK_REDUCE_WARP_REDUCTIONS",
+                                 stream,
+                                 size);
     // raking reduce
     using reduce_rr_t = reduce<hipcub::BlockReduceAlgorithm::BLOCK_REDUCE_RAKING>;
-    add_benchmarks<reduce_rr_t>(
-        benchmarks, "reduce", "BLOCK_REDUCE_RAKING", stream, size
-    );
+    add_benchmarks<reduce_rr_t>(benchmarks, "reduce", "BLOCK_REDUCE_RAKING", stream, size);
     // raking reduce commutative only
-    using reduce_rrco_t = reduce<hipcub::BlockReduceAlgorithm::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY>;
-    add_benchmarks<reduce_rrco_t>(
-        benchmarks, "reduce", "BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY", stream, size
-    );
+    using reduce_rrco_t
+        = reduce<hipcub::BlockReduceAlgorithm::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY>;
+    add_benchmarks<reduce_rrco_t>(benchmarks,
+                                  "reduce",
+                                  "BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY",
+                                  stream,
+                                  size);
 
     // Use manual timing
     for(auto& b : benchmarks)
diff --git a/benchmark/benchmark_block_run_length_decode.cpp b/benchmark/benchmark_block_run_length_decode.cpp
index 8ef5def9..6769fd47 100644
--- a/benchmark/benchmark_block_run_length_decode.cpp
+++ b/benchmark/benchmark_block_run_length_decode.cpp
@@ -9,8 +9,8 @@
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -30,57 +30,48 @@
 const size_t DEFAULT_N = 1024 * 1024 * 32;
 #endif
 
-template<
-    class ItemT,
-    class OffsetT,
-    unsigned BlockSize,
-    unsigned RunsPerThread,
-    unsigned DecodedItemsPerThread,
-    unsigned Trials
->
+template<class ItemT,
+         class OffsetT,
+         unsigned BlockSize,
+         unsigned RunsPerThread,
+         unsigned DecodedItemsPerThread,
+         unsigned Trials>
 __global__
-__launch_bounds__(BlockSize)
-void block_run_length_decode_kernel(
-    const ItemT * d_run_items,
-    const OffsetT * d_run_offsets,
-    ItemT * d_decoded_items,
-    bool enable_store = false)
+    __launch_bounds__(BlockSize) void block_run_length_decode_kernel(const ItemT*   d_run_items,
+                                                                     const OffsetT* d_run_offsets,
+                                                                     ItemT*         d_decoded_items,
+                                                                     bool enable_store = false)
 {
-    using BlockRunLengthDecodeT = hipcub::BlockRunLengthDecode<
-        ItemT,
-        BlockSize,
-        RunsPerThread,
-        DecodedItemsPerThread
-    >;
-
-    ItemT run_items[RunsPerThread];
+    using BlockRunLengthDecodeT
+        = hipcub::BlockRunLengthDecode<ItemT, BlockSize, RunsPerThread, DecodedItemsPerThread>;
+
+    ItemT   run_items[RunsPerThread];
     OffsetT run_offsets[RunsPerThread];
 
     const unsigned global_thread_idx = BlockSize * hipBlockIdx_x + hipThreadIdx_x;
     hipcub::LoadDirectBlocked(global_thread_idx, d_run_items, run_items);
     hipcub::LoadDirectBlocked(global_thread_idx, d_run_offsets, run_offsets);
 
-    BlockRunLengthDecodeT block_run_length_decode(
-        run_items,
-        run_offsets
-    );
+    BlockRunLengthDecodeT block_run_length_decode(run_items, run_offsets);
 
-    const OffsetT total_decoded_size =
-        d_run_offsets[(hipBlockIdx_x + 1) * BlockSize * RunsPerThread]
-        - d_run_offsets[hipBlockIdx_x * BlockSize * RunsPerThread];
+    const OffsetT total_decoded_size
+        = d_run_offsets[(hipBlockIdx_x + 1) * BlockSize * RunsPerThread]
+          - d_run_offsets[hipBlockIdx_x * BlockSize * RunsPerThread];
 
-    #pragma nounroll
-    for (unsigned i = 0; i < Trials; ++i)
+#pragma nounroll
+    for(unsigned i = 0; i < Trials; ++i)
     {
         OffsetT decoded_window_offset = 0;
-        while (decoded_window_offset < total_decoded_size)
+        while(decoded_window_offset < total_decoded_size)
         {
             ItemT decoded_items[DecodedItemsPerThread];
             block_run_length_decode.RunLengthDecode(decoded_items, decoded_window_offset);
 
-            if (enable_store)
+            if(enable_store)
             {
-                hipcub::StoreDirectBlocked(global_thread_idx, d_decoded_items + decoded_window_offset, decoded_items);
+                hipcub::StoreDirectBlocked(global_thread_idx,
+                                           d_decoded_items + decoded_window_offset,
+                                           decoded_items);
             }
 
             decoded_window_offset += BlockSize * DecodedItemsPerThread;
@@ -88,91 +79,81 @@ void block_run_length_decode_kernel(
     }
 }
 
-template<
-    class ItemT,
-    class OffsetT,
-    unsigned MinRunLength,
-    unsigned MaxRunLength,
-    unsigned BlockSize,
-    unsigned RunsPerThread,
-    unsigned DecodedItemsPerThread,
-    unsigned Trials = 100
->
+template<class ItemT,
+         class OffsetT,
+         unsigned MinRunLength,
+         unsigned MaxRunLength,
+         unsigned BlockSize,
+         unsigned RunsPerThread,
+         unsigned DecodedItemsPerThread,
+         unsigned Trials = 100>
 void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
 {
-    constexpr auto runs_per_block = BlockSize * RunsPerThread;
-    const auto target_num_runs = 2 * N / (MinRunLength + MaxRunLength);
-    const auto num_runs = runs_per_block * ((target_num_runs + runs_per_block - 1)/runs_per_block);
+    constexpr auto runs_per_block  = BlockSize * RunsPerThread;
+    const auto     target_num_runs = 2 * N / (MinRunLength + MaxRunLength);
+    const auto     num_runs
+        = runs_per_block * ((target_num_runs + runs_per_block - 1) / runs_per_block);
 
-    std::vector<ItemT> run_items(num_runs);
+    std::vector<ItemT>   run_items(num_runs);
     std::vector<OffsetT> run_offsets(num_runs + 1);
 
     std::default_random_engine prng(std::random_device{}());
-        using ItemDistribution = std::conditional_t<
-        std::is_integral<ItemT>::value,
-        std::uniform_int_distribution<ItemT>,
-        std::uniform_real_distribution<ItemT>
-    >;
-    ItemDistribution run_item_dist(0, 100);
+    using ItemDistribution = std::conditional_t<std::is_integral<ItemT>::value,
+                                                std::uniform_int_distribution<ItemT>,
+                                                std::uniform_real_distribution<ItemT>>;
+    ItemDistribution                       run_item_dist(0, 100);
     std::uniform_int_distribution<OffsetT> run_length_dist(MinRunLength, MaxRunLength);
 
-    for (size_t i = 0; i < num_runs; ++i)
+    for(size_t i = 0; i < num_runs; ++i)
     {
         run_items[i] = run_item_dist(prng);
     }
-    for (size_t i = 1; i < num_runs + 1; ++i)
+    for(size_t i = 1; i < num_runs + 1; ++i)
     {
         const OffsetT next_run_length = run_length_dist(prng);
-        run_offsets[i] = run_offsets[i - 1] + next_run_length;
+        run_offsets[i]                = run_offsets[i - 1] + next_run_length;
     }
     const OffsetT output_length = run_offsets.back();
 
-    ItemT * d_run_items{};
+    ItemT* d_run_items{};
     HIP_CHECK(hipMalloc(&d_run_items, run_items.size() * sizeof(ItemT)));
-    HIP_CHECK(
-        hipMemcpy(
-            d_run_items, run_items.data(),
-            run_items.size() * sizeof(ItemT),
-            hipMemcpyHostToDevice
-        )
-    );
-
-    OffsetT * d_run_offsets{};
+    HIP_CHECK(hipMemcpy(d_run_items,
+                        run_items.data(),
+                        run_items.size() * sizeof(ItemT),
+                        hipMemcpyHostToDevice));
+
+    OffsetT* d_run_offsets{};
     HIP_CHECK(hipMalloc(&d_run_offsets, run_offsets.size() * sizeof(OffsetT)));
-    HIP_CHECK(
-        hipMemcpy(
-            d_run_offsets, run_offsets.data(),
-            run_offsets.size() * sizeof(OffsetT),
-            hipMemcpyHostToDevice
-        )
-    );
-
-    ItemT * d_output{};
+    HIP_CHECK(hipMemcpy(d_run_offsets,
+                        run_offsets.data(),
+                        run_offsets.size() * sizeof(OffsetT),
+                        hipMemcpyHostToDevice));
+
+    ItemT* d_output{};
     HIP_CHECK(hipMalloc(&d_output, output_length * sizeof(ItemT)));
 
-    for (auto _ : state)
+    for(auto _ : state)
     {
         auto start = std::chrono::high_resolution_clock::now();
-        hipLaunchKernelGGL(
-            HIP_KERNEL_NAME(
-                block_run_length_decode_kernel<
-                    ItemT,
-                    OffsetT,
-                    BlockSize,
-                    RunsPerThread,
-                    DecodedItemsPerThread,
-                    Trials
-                >
-            ),
-            dim3(num_runs/runs_per_block), dim3(BlockSize), 0, stream,
-            d_run_items, d_run_offsets, d_output
-        );
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(block_run_length_decode_kernel<ItemT,
+                                                                          OffsetT,
+                                                                          BlockSize,
+                                                                          RunsPerThread,
+                                                                          DecodedItemsPerThread,
+                                                                          Trials>),
+                           dim3(num_runs / runs_per_block),
+                           dim3(BlockSize),
+                           0,
+                           stream,
+                           d_run_items,
+                           d_run_offsets,
+                           d_output);
         HIP_CHECK(hipPeekAtLastError());
         HIP_CHECK(hipDeviceSynchronize());
 
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
 
         state.SetIterationTime(elapsed_seconds.count());
     }
@@ -184,14 +165,17 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
     HIP_CHECK(hipFree(d_output));
 }
 
-#define CREATE_BENCHMARK(IT, OT, MINRL, MAXRL, BS, RPT, DIPT) \
-    benchmark::RegisterBenchmark( \
-        "block_run_length_decode<Item Type:"#IT",Offset Type:"#OT",Min RunLength:"#MINRL",Max RunLength:"#MAXRL",BlockSize: "#BS",Runs Per Thread:"#RPT",Decoded Items Per Thread:"#DIPT">", \
-        &run_benchmark<IT, OT, MINRL, MAXRL, BS, RPT, DIPT>, \
-        stream, size \
-    )
-
-int main(int argc, char *argv[])
+#define CREATE_BENCHMARK(IT, OT, MINRL, MAXRL, BS, RPT, DIPT)                               \
+    benchmark::RegisterBenchmark(                                                           \
+        std::string("block_run_length_decode<item_type:" #IT ",offset_type:" #OT            \
+                    ",min_run_length:" #MINRL ",max_run_length:" #MAXRL ",block_size:" #BS  \
+                    ",runs_per_thread:" #RPT ",decoded_items_per_thread:" #DIPT ">.")       \
+            .c_str(),                                                                       \
+        &run_benchmark<IT, OT, MINRL, MAXRL, BS, RPT, DIPT>,                                \
+        stream,                                                                             \
+        size)
+
+int main(int argc, char* argv[])
 {
     cli::Parser parser(argc, argv);
     parser.set_optional<size_t>("size", "size", DEFAULT_N, "number of values");
@@ -200,22 +184,21 @@ int main(int argc, char *argv[])
 
     // Parse argv
     benchmark::Initialize(&argc, argv);
-    const size_t size = parser.get<size_t>("size");
-    const int trials = parser.get<int>("trials");
+    const size_t size   = parser.get<size_t>("size");
+    const int    trials = parser.get<int>("trials");
 
     std::cout << "benchmark_block_run_length_decode" << std::endl;
 
     // HIP
-    hipStream_t stream = 0; // default
+    hipStream_t     stream = 0; // default
     hipDeviceProp_t devProp;
-    int device_id = 0;
+    int             device_id = 0;
     HIP_CHECK(hipGetDevice(&device_id));
     HIP_CHECK(hipGetDeviceProperties(&devProp, device_id));
     std::cout << "[HIP] Device name: " << devProp.name << std::endl;
 
     // Add benchmarks
-    std::vector<benchmark::internal::Benchmark*> benchmarks
-    {
+    std::vector<benchmark::internal::Benchmark*> benchmarks{
         CREATE_BENCHMARK(int, int, 1, 5, 128, 2, 4),
         CREATE_BENCHMARK(int, int, 1, 10, 128, 2, 4),
         CREATE_BENCHMARK(int, int, 1, 50, 128, 2, 4),
@@ -230,8 +213,7 @@ int main(int argc, char *argv[])
         CREATE_BENCHMARK(double, long long, 1, 100, 128, 2, 4),
         CREATE_BENCHMARK(double, long long, 1, 500, 128, 2, 4),
         CREATE_BENCHMARK(double, long long, 1, 1000, 128, 2, 4),
-        CREATE_BENCHMARK(double, long long, 1, 5000, 128, 2, 4)
-    };
+        CREATE_BENCHMARK(double, long long, 1, 5000, 128, 2, 4)};
 
     // Use manual timing
     for(auto& b : benchmarks)
diff --git a/benchmark/benchmark_block_scan.cpp b/benchmark/benchmark_block_scan.cpp
index f45d3862..340d3b4e 100644
--- a/benchmark/benchmark_block_scan.cpp
+++ b/benchmark/benchmark_block_scan.cpp
@@ -9,8 +9,8 @@
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -25,7 +25,6 @@
 // hipCUB API
 #include "hipcub/block/block_scan.hpp"
 
-
 #ifndef DEFAULT_N
 const size_t DEFAULT_N = 1024 * 1024 * 32;
 #endif
@@ -58,7 +57,7 @@ struct inclusive_scan
         using bscan_t = hipcub::BlockScan<T, BlockSize, Algorithm>;
         __shared__ typename bscan_t::TempStorage storage;
 
-        #pragma nounroll
+#pragma nounroll
         for(unsigned int trial = 0; trial < Trials; trial++)
         {
             bscan_t(storage).InclusiveScan(values, values, hipcub::Sum());
@@ -110,23 +109,17 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
 {
     // Make sure size is a multiple of BlockSize
     constexpr auto items_per_block = BlockSize * ItemsPerThread;
-    const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block);
+    const auto     size = items_per_block * ((N + items_per_block - 1) / items_per_block);
     // Allocate and fill memory
     std::vector<T> input(size, T(1));
-    T * d_input;
-    T * d_output;
+    T*             d_input;
+    T*             d_output;
     HIP_CHECK(hipMalloc(&d_input, size * sizeof(T)));
     HIP_CHECK(hipMalloc(&d_output, size * sizeof(T)));
-    HIP_CHECK(
-        hipMemcpy(
-            d_input, input.data(),
-            size * sizeof(T),
-            hipMemcpyHostToDevice
-        )
-    );
+    HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice));
     HIP_CHECK(hipDeviceSynchronize());
 
-    for (auto _ : state)
+    for(auto _ : state)
     {
         auto start = std::chrono::high_resolution_clock::now();
         hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel<Benchmark, T, BlockSize, ItemsPerThread, Trials>),
@@ -141,8 +134,8 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
         HIP_CHECK(hipDeviceSynchronize());
 
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
 
         state.SetIterationTime(elapsed_seconds.count());
     }
@@ -154,12 +147,14 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
 }
 
 // IPT - items per thread
-#define CREATE_BENCHMARK(T, BS, IPT) \
-    benchmark::RegisterBenchmark( \
-        (std::string("block_scan<Datatype:"#T",Block Size:"#BS",Items Per Thread:"#IPT",SubAlgorithm Name:" + algorithm_name + ">.Method Name:") + method_name).c_str(), \
-        &run_benchmark<Benchmark, T, BS, IPT>, \
-        stream, size \
-    )
+#define CREATE_BENCHMARK(T, BS, IPT)                                                            \
+    benchmark::RegisterBenchmark(std::string("block_scan<data_type:" #T ",block_size:" #BS      \
+                                             ",items_per_thread:" #IPT ",sub_algorithm_name:"   \
+                                             + algorithm_name + ">.method_name:" + method_name) \
+                                     .c_str(),                                                  \
+                                 &run_benchmark<Benchmark, T, BS, IPT>,                         \
+                                 stream,                                                        \
+                                 size)
 
 // clang-format off
 #define BENCHMARK_TYPE(type, block)    \
@@ -178,7 +173,7 @@ void add_benchmarks(std::vector<benchmark::internal::Benchmark*>& benchmarks,
                     hipStream_t                                   stream,
                     size_t                                        size)
 {
-    using custom_float2 = benchmark_utils::custom_type<float, float>;
+    using custom_float2  = benchmark_utils::custom_type<float, float>;
     using custom_double2 = benchmark_utils::custom_type<double, double>;
 
     std::vector<benchmark::internal::Benchmark*> new_benchmarks = {
@@ -204,7 +199,7 @@ void add_benchmarks(std::vector<benchmark::internal::Benchmark*>& benchmarks,
     benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end());
 }
 
-int main(int argc, char *argv[])
+int main(int argc, char* argv[])
 {
     cli::Parser parser(argc, argv);
     parser.set_optional<size_t>("size", "size", DEFAULT_N, "number of values");
@@ -213,15 +208,15 @@ int main(int argc, char *argv[])
 
     // Parse argv
     benchmark::Initialize(&argc, argv);
-    const size_t size = parser.get<size_t>("size");
-    const int trials = parser.get<int>("trials");
+    const size_t size   = parser.get<size_t>("size");
+    const int    trials = parser.get<int>("trials");
 
     std::cout << "benchmark_block_scan" << std::endl;
 
     // HIP
-    hipStream_t stream = 0; // default
+    hipStream_t     stream = 0; // default
     hipDeviceProp_t devProp;
-    int device_id = 0;
+    int             device_id = 0;
     HIP_CHECK(hipGetDevice(&device_id));
     HIP_CHECK(hipGetDeviceProperties(&devProp, device_id));
     std::cout << "[HIP] Device name: " << devProp.name << std::endl;
diff --git a/benchmark/benchmark_block_shuffle.cpp b/benchmark/benchmark_block_shuffle.cpp
index 2f0d8cb5..4ba9fb0e 100644
--- a/benchmark/benchmark_block_shuffle.cpp
+++ b/benchmark/benchmark_block_shuffle.cpp
@@ -9,8 +9,8 @@
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -214,22 +214,21 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
     HIP_CHECK(hipFree(d_output));
 }
 
-#define CREATE_BENCHMARK_IPT(BS, IPT)                                                       \
-    benchmark::RegisterBenchmark(                                                           \
-        (std::string("block_shuffle<Datatype:") + type_name                                 \
-         + std::string(",Block Size:" #BS ",Items Per Thread:" #IPT ">.SubAlgorithm Name:") \
-         + name)                                                                            \
-            .c_str(),                                                                       \
-        &run_benchmark<Benchmark, T, BS, IPT>,                                              \
-        stream,                                                                             \
+#define CREATE_BENCHMARK_IPT(BS, IPT)                                                   \
+    benchmark::RegisterBenchmark(                                                       \
+        ("block_shuffle<data_type:" + type_name                                         \
+         + ",block_size:" #BS ",items_per_thread:" #IPT ">.sub_algorithm_name:" + name) \
+            .c_str(),                                                                   \
+        &run_benchmark<Benchmark, T, BS, IPT>,                                          \
+        stream,                                                                         \
         size)
 
-#define CREATE_BENCHMARK(BS)                                                                       \
-    benchmark::RegisterBenchmark((std::string("block_shuffle<Datatype:") + type_name               \
-                                  + std::string(",Block Size:" #BS ">.SubAlgorithm Name:") + name) \
-                                     .c_str(),                                                     \
-                                 &run_benchmark<Benchmark, T, BS>,                                 \
-                                 stream,                                                           \
+#define CREATE_BENCHMARK(BS)                                                           \
+    benchmark::RegisterBenchmark(("block_shuffle<data_type:" + type_name               \
+                                  + ",block_size:" #BS ">.sub_algorithm_name:" + name) \
+                                     .c_str(),                                         \
+                                 &run_benchmark<Benchmark, T, BS>,                     \
+                                 stream,                                               \
                                  size)
 
 template<class Benchmark, class T, std::enable_if_t<Benchmark::uses_ipt, bool> = true>
@@ -303,6 +302,7 @@ int main(int argc, char* argv[])
     hipStream_t     stream = 0; // default
     hipDeviceProp_t devProp;
     int             device_id = 0;
+
     HIP_CHECK(hipGetDevice(&device_id));
     HIP_CHECK(hipGetDeviceProperties(&devProp, device_id));
     std::cout << "[HIP] Device name: " << devProp.name << std::endl;
diff --git a/benchmark/benchmark_device_adjacent_difference.cpp b/benchmark/benchmark_device_adjacent_difference.cpp
index f42ceb76..e0788f0b 100644
--- a/benchmark/benchmark_device_adjacent_difference.cpp
+++ b/benchmark/benchmark_device_adjacent_difference.cpp
@@ -9,8 +9,8 @@
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -49,7 +49,7 @@ constexpr std::size_t DEFAULT_N = 1024 * 1024 * 128;
 constexpr unsigned int batch_size  = 10;
 constexpr unsigned int warmup_size = 5;
 
-template <typename InputIt, typename OutputIt, typename... Args>
+template<typename InputIt, typename OutputIt, typename... Args>
 auto dispatch_adjacent_difference(std::true_type /*left*/,
                                   std::true_type /*copy*/,
                                   void* const    temporary_storage,
@@ -58,11 +58,14 @@ auto dispatch_adjacent_difference(std::true_type /*left*/,
                                   const OutputIt output,
                                   Args&&... args)
 {
-    return ::hipcub::DeviceAdjacentDifference::SubtractLeftCopy(
-        temporary_storage, storage_size, input, output, std::forward<Args>(args)...);
+    return ::hipcub::DeviceAdjacentDifference::SubtractLeftCopy(temporary_storage,
+                                                                storage_size,
+                                                                input,
+                                                                output,
+                                                                std::forward<Args>(args)...);
 }
 
-template <typename InputIt, typename OutputIt, typename... Args>
+template<typename InputIt, typename OutputIt, typename... Args>
 auto dispatch_adjacent_difference(std::false_type /*left*/,
                                   std::true_type /*copy*/,
                                   void* const    temporary_storage,
@@ -71,11 +74,14 @@ auto dispatch_adjacent_difference(std::false_type /*left*/,
                                   const OutputIt output,
                                   Args&&... args)
 {
-    return ::hipcub::DeviceAdjacentDifference::SubtractRightCopy(
-        temporary_storage, storage_size, input, output, std::forward<Args>(args)...);
+    return ::hipcub::DeviceAdjacentDifference::SubtractRightCopy(temporary_storage,
+                                                                 storage_size,
+                                                                 input,
+                                                                 output,
+                                                                 std::forward<Args>(args)...);
 }
 
-template <typename InputIt, typename OutputIt, typename... Args>
+template<typename InputIt, typename OutputIt, typename... Args>
 auto dispatch_adjacent_difference(std::true_type /*left*/,
                                   std::false_type /*copy*/,
                                   void* const   temporary_storage,
@@ -84,11 +90,13 @@ auto dispatch_adjacent_difference(std::true_type /*left*/,
                                   const OutputIt /*output*/,
                                   Args&&... args)
 {
-    return ::hipcub::DeviceAdjacentDifference::SubtractLeft(
-        temporary_storage, storage_size, input, std::forward<Args>(args)...);
+    return ::hipcub::DeviceAdjacentDifference::SubtractLeft(temporary_storage,
+                                                            storage_size,
+                                                            input,
+                                                            std::forward<Args>(args)...);
 }
 
-template <typename InputIt, typename OutputIt, typename... Args>
+template<typename InputIt, typename OutputIt, typename... Args>
 auto dispatch_adjacent_difference(std::false_type /*left*/,
                                   std::false_type /*copy*/,
                                   void* const   temporary_storage,
@@ -97,11 +105,13 @@ auto dispatch_adjacent_difference(std::false_type /*left*/,
                                   const OutputIt /*output*/,
                                   Args&&... args)
 {
-    return ::hipcub::DeviceAdjacentDifference::SubtractRight(
-        temporary_storage, storage_size, input, std::forward<Args>(args)...);
+    return ::hipcub::DeviceAdjacentDifference::SubtractRight(temporary_storage,
+                                                             storage_size,
+                                                             input,
+                                                             std::forward<Args>(args)...);
 }
 
-template <typename T, bool left, bool copy>
+template<typename T, bool left, bool copy>
 void run_benchmark(benchmark::State& state, const std::size_t size, const hipStream_t stream)
 {
     using output_type = T;
@@ -180,12 +190,15 @@ void run_benchmark(benchmark::State& state, const std::size_t size, const hipStr
 
 using namespace std::string_literals;
 
-#define CREATE_BENCHMARK(T, left, copy)                                    \
-    benchmark::RegisterBenchmark(("Subtract" + (left ? "Left"s : "Right"s) \
-                                  + (copy ? "Copy"s : ""s) + "<" #T ">")   \
-                                     .c_str(),                             \
-                                 &run_benchmark<T, left, copy>,            \
-                                 size,                                     \
+#define CREATE_BENCHMARK(T, left, copy)                                             \
+    benchmark::RegisterBenchmark(std::string("device_adjacent_difference"           \
+                                             "<data_type:" #T ">."                  \
+                                             "sub_algorithm_name:subtract_"         \
+                                             + std::string(left ? "left" : "right") \
+                                             + std::string(copy ? "_copy" : ""))    \
+                                     .c_str(),                                      \
+                                 &run_benchmark<T, left, copy>,                     \
+                                 size,                                              \
                                  stream)
 
 // clang-format off
@@ -214,6 +227,8 @@ int main(int argc, char* argv[])
     int               device_id = 0;
     HIP_CHECK(hipGetDevice(&device_id));
     HIP_CHECK(hipGetDeviceProperties(&devProp, device_id));
+
+    std::cout << "benchmark_device_adjacent_difference" << std::endl;
     std::cout << "[HIP] Device name: " << devProp.name << std::endl;
 
     using custom_float2  = benchmark_utils::custom_type<float, float>;
diff --git a/benchmark/benchmark_device_batch_copy.cpp b/benchmark/benchmark_device_batch_copy.cpp
index 5a29f19f..feca312e 100644
--- a/benchmark/benchmark_device_batch_copy.cpp
+++ b/benchmark/benchmark_device_batch_copy.cpp
@@ -9,8 +9,8 @@
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -53,7 +53,8 @@ constexpr int32_t blev_min_size = 1024;
 // have source and destinations mappings not be the identity function:
 //
 //  batch_copy(
-//    [&a0 , &b0 , &c0 , &d0 ], // from (note the order is still just a, b, c, d!)
+//    [&a0 , &b0 , &c0 , &d0 ], // from (note the order is still
+//                              //       just a, b, c, d!)
 //    [&a0', &b0', &c0', &d0'], // to   (order is the same as above too!)
 //    [3   , 2   , 1   , 2   ]) // size
 //
@@ -327,15 +328,20 @@ void run_benchmark(benchmark::State& state,
     HIP_CHECK(hipFree(d_temp_storage));
 }
 
-#define CREATE_BENCHMARK(item_size, item_alignment, size_type, num_tlev, num_wlev, num_blev)     \
-    benchmark::RegisterBenchmark(                                                                \
-        "{lvl:device,item_size:" #item_size ",item_alignment:" #item_alignment                   \
-        ",size_type:" #size_type ",algo:batch_memcpy,num_tlev:" #num_tlev ",num_wlev:" #num_wlev \
-        ",num_blev:" #num_blev ",cfg:default_config}",                                           \
-        [=](benchmark::State& state)                                                             \
-        {                                                                                        \
-            run_benchmark<benchmark_utils::custom_aligned_type<item_size, item_alignment>,       \
-                          size_type>(state, stream, num_tlev, num_wlev, num_blev);               \
+#define CREATE_BENCHMARK(IS, IA, T, num_tlev, num_wlev, num_blev)                     \
+    benchmark::RegisterBenchmark(                                                     \
+        std::string("device_batch_copy"                                               \
+                    "<data_type:" #T ",item_size:" #IS ",item_alignment:" #IA         \
+                    ",number_of_tlev:" #num_tlev ",number_of_wlev:" #num_wlev         \
+                    ",number_of_blev:" #num_blev ">.")                                \
+            .c_str(),                                                                 \
+        [=](benchmark::State& state)                                                  \
+        {                                                                             \
+            run_benchmark<benchmark_utils::custom_aligned_type<IS, IA>, T>(state,     \
+                                                                           stream,    \
+                                                                           num_tlev,  \
+                                                                           num_wlev,  \
+                                                                           num_blev); \
         })
 
 #define BENCHMARK_TYPE(item_size, item_alignment)                            \
@@ -364,6 +370,15 @@ int32_t main(int32_t argc, char* argv[])
     // HIP
     hipStream_t stream = hipStreamDefault; // default
 
+    hipDeviceProp_t devProp;
+    int             device_id = 0;
+    // Look up the active device so its name can be reported below.
+    HIP_CHECK(hipGetDevice(&device_id));
+    HIP_CHECK(hipGetDeviceProperties(&devProp, device_id));
+
+    std::cout << "benchmark_device_batch_copy" << std::endl;
+    std::cout << "[HIP] Device name: " << devProp.name << std::endl;
+
     // Benchmark info
     benchmark::AddCustomContext("size", std::to_string(size));
 
@@ -378,6 +393,8 @@ int32_t main(int32_t argc, char* argv[])
                   BENCHMARK_TYPE(4, 4),
                   BENCHMARK_TYPE(8, 8)};
 
+
+
     // Use manual timing
     for(auto& b : benchmarks)
     {
diff --git a/benchmark/benchmark_device_batch_memcpy.cpp b/benchmark/benchmark_device_batch_memcpy.cpp
index 3d72e349..f0f38be2 100644
--- a/benchmark/benchmark_device_batch_memcpy.cpp
+++ b/benchmark/benchmark_device_batch_memcpy.cpp
@@ -9,8 +9,8 @@
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -56,7 +56,8 @@ constexpr int32_t blev_min_size = 1024;
 // have source and destinations mappings not be the identity function:
 //
 //  batch_memcpy(
-//    [&a0 , &b0 , &c0 , &d0 ], // from (note the order is still just a, b, c, d!)
+//    [&a0 , &b0 , &c0 , &d0 ], // from (note the order is still just
+//                              // a, b, c, d!)
 //    [&a0', &b0', &c0', &d0'], // to   (order is the same as above too!)
 //    [3   , 2   , 1   , 2   ]) // size
 //
@@ -337,15 +338,19 @@ void run_benchmark(benchmark::State& state,
     HIP_CHECK(hipFree(d_temp_storage));
 }
 
-#define CREATE_BENCHMARK(item_size, item_alignment, size_type, num_tlev, num_wlev, num_blev)     \
+#define CREATE_BENCHMARK(IS, IA, T, num_tlev, num_wlev, num_blev)                                \
     benchmark::RegisterBenchmark(                                                                \
-        "{lvl:device,item_size:" #item_size ",item_alignment:" #item_alignment                   \
-        ",size_type:" #size_type ",algo:batch_memcpy,num_tlev:" #num_tlev ",num_wlev:" #num_wlev \
-        ",num_blev:" #num_blev ",cfg:default_config}",                                           \
+        std::string("device_batch_memcpy<data_type:" #T ",item_size:" #IS ",item_alignment:" #IA \
+                    ",number_of_tlev:" #num_tlev ",number_of_wlev:" #num_wlev                    \
+                    ",number_of_blev:" #num_blev ">.")                                           \
+            .c_str(),                                                                            \
         [=](benchmark::State& state)                                                             \
         {                                                                                        \
-            run_benchmark<benchmark_utils::custom_aligned_type<item_size, item_alignment>,       \
-                          size_type>(state, stream, num_tlev, num_wlev, num_blev);               \
+            run_benchmark<benchmark_utils::custom_aligned_type<IS, IA>, T>(state,                \
+                                                                           stream,               \
+                                                                           num_tlev,             \
+                                                                           num_wlev,             \
+                                                                           num_blev);            \
         })
 
 #define BENCHMARK_TYPE(item_size, item_alignment)                            \
@@ -371,6 +376,14 @@ int32_t main(int32_t argc, char* argv[])
     const size_t  size   = parser.get<size_t>("size");
     const int32_t trials = parser.get<int>("trials");
 
+    hipDeviceProp_t devProp;
+    int             device_id = 0;
+    HIP_CHECK(hipGetDevice(&device_id));
+    HIP_CHECK(hipGetDeviceProperties(&devProp, device_id));
+    // Startup banner: this benchmark's own name, then the device in use.
+    std::cout << "benchmark_device_batch_memcpy" << std::endl;
+    std::cout << "[HIP] Device name: " << devProp.name << std::endl;
+
     // HIP
     hipStream_t stream = hipStreamDefault; // default
 
diff --git a/benchmark/benchmark_device_histogram.cpp b/benchmark/benchmark_device_histogram.cpp
index cfde99f9..a5019e4b 100644
--- a/benchmark/benchmark_device_histogram.cpp
+++ b/benchmark/benchmark_device_histogram.cpp
@@ -9,8 +9,8 @@
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -36,7 +36,7 @@
 const size_t DEFAULT_N = 1024 * 1024 * 32;
 #endif
 
-const unsigned int batch_size = 10;
+const unsigned int batch_size  = 10;
 const unsigned int warmup_size = 5;
 
 template<class T>
@@ -50,9 +50,9 @@ std::vector<T>
 
     const size_t max_random_size = 1024 * 1024;
 
-    std::random_device rd;
+    std::random_device         rd;
     std::default_random_engine gen(rd());
-    std::vector<T> data(size);
+    std::vector<T>             data(size);
     std::generate(data.begin(),
                   data.begin() + std::min(size, max_random_size),
                   [&]()
@@ -87,15 +87,15 @@ int get_entropy_percents(int entropy_reduction)
     }
 }
 
-const int entropy_reductions[] = { 0, 2, 4, 6 };
+const int entropy_reductions[] = {0, 2, 4, 6};
 
 template<class T>
 void run_even_benchmark(benchmark::State& state,
-                        size_t bins,
-                        size_t scale,
-                        int entropy_reduction,
-                        hipStream_t stream,
-                        size_t size)
+                        size_t            bins,
+                        size_t            scale,
+                        int               entropy_reduction,
+                        hipStream_t       stream,
+                        size_t            size)
 {
     using counter_type = unsigned int;
 
@@ -107,19 +107,13 @@ void run_even_benchmark(benchmark::State& state,
     // Generate data
     std::vector<T> input = generate<T>(size, entropy_reduction, lower_level, upper_level);
 
-    T * d_input;
-    counter_type * d_histogram;
+    T*            d_input;
+    counter_type* d_histogram;
     HIP_CHECK(hipMalloc(&d_input, size * sizeof(T)));
     HIP_CHECK(hipMalloc(&d_histogram, size * sizeof(counter_type)));
-    HIP_CHECK(
-        hipMemcpy(
-            d_input, input.data(),
-            size * sizeof(T),
-            hipMemcpyHostToDevice
-        )
-    );
-
-    void * d_temporary_storage = nullptr;
+    HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice));
+
+    void*  d_temporary_storage     = nullptr;
     size_t temporary_storage_bytes = 0;
     HIP_CHECK(hipcub::DeviceHistogram::HistogramEven(d_temporary_storage,
                                                      temporary_storage_bytes,
@@ -149,7 +143,7 @@ void run_even_benchmark(benchmark::State& state,
     }
     HIP_CHECK(hipDeviceSynchronize());
 
-    for (auto _ : state)
+    for(auto _ : state)
     {
         auto start = std::chrono::high_resolution_clock::now();
 
@@ -168,8 +162,8 @@ void run_even_benchmark(benchmark::State& state,
         HIP_CHECK(hipDeviceSynchronize());
 
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
         state.SetIterationTime(elapsed_seconds.count());
     }
     state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T));
@@ -182,11 +176,11 @@ void run_even_benchmark(benchmark::State& state,
 
 template<class T, unsigned int Channels, unsigned int ActiveChannels>
 void run_multi_even_benchmark(benchmark::State& state,
-                              size_t bins,
-                              size_t scale,
-                              int entropy_reduction,
-                              hipStream_t stream,
-                              size_t size)
+                              size_t            bins,
+                              size_t            scale,
+                              int               entropy_reduction,
+                              hipStream_t       stream,
+                              size_t            size)
 {
     using counter_type = unsigned int;
 
@@ -197,28 +191,23 @@ void run_multi_even_benchmark(benchmark::State& state,
     {
         lower_level[channel] = 0;
         upper_level[channel] = bins * scale;
-        num_levels[channel] = bins + 1;
+        num_levels[channel]  = bins + 1;
     }
 
     // Generate data
-    std::vector<T> input = generate<T>(size * Channels, entropy_reduction, lower_level[0], upper_level[0]);
+    std::vector<T> input
+        = generate<T>(size * Channels, entropy_reduction, lower_level[0], upper_level[0]);
 
-    T * d_input;
-    counter_type * d_histogram[ActiveChannels];
+    T*            d_input;
+    counter_type* d_histogram[ActiveChannels];
     HIP_CHECK(hipMalloc(&d_input, size * Channels * sizeof(T)));
     for(unsigned int channel = 0; channel < ActiveChannels; channel++)
     {
         HIP_CHECK(hipMalloc(&d_histogram[channel], bins * sizeof(counter_type)));
     }
-    HIP_CHECK(
-        hipMemcpy(
-            d_input, input.data(),
-            size * Channels * sizeof(T),
-            hipMemcpyHostToDevice
-        )
-    );
-
-    void * d_temporary_storage = nullptr;
+    HIP_CHECK(hipMemcpy(d_input, input.data(), size * Channels * sizeof(T), hipMemcpyHostToDevice));
+
+    void*  d_temporary_storage     = nullptr;
     size_t temporary_storage_bytes = 0;
     HIP_CHECK((hipcub::DeviceHistogram::MultiHistogramEven<Channels, ActiveChannels>(
         d_temporary_storage,
@@ -250,7 +239,7 @@ void run_multi_even_benchmark(benchmark::State& state,
     }
     HIP_CHECK(hipDeviceSynchronize());
 
-    for (auto _ : state)
+    for(auto _ : state)
     {
         auto start = std::chrono::high_resolution_clock::now();
 
@@ -270,8 +259,8 @@ void run_multi_even_benchmark(benchmark::State& state,
         HIP_CHECK(hipDeviceSynchronize());
 
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
         state.SetIterationTime(elapsed_seconds.count());
     }
     state.SetBytesProcessed(state.iterations() * batch_size * size * Channels * sizeof(T));
@@ -296,28 +285,16 @@ void run_range_benchmark(benchmark::State& state, size_t bins, hipStream_t strea
     std::vector<T> levels(bins + 1);
     std::iota(levels.begin(), levels.end(), static_cast<T>(0));
 
-    T * d_input;
-    T * d_levels;
-    counter_type * d_histogram;
+    T*            d_input;
+    T*            d_levels;
+    counter_type* d_histogram;
     HIP_CHECK(hipMalloc(&d_input, size * sizeof(T)));
     HIP_CHECK(hipMalloc(&d_levels, (bins + 1) * sizeof(T)));
     HIP_CHECK(hipMalloc(&d_histogram, size * sizeof(counter_type)));
-    HIP_CHECK(
-        hipMemcpy(
-            d_input, input.data(),
-            size * sizeof(T),
-            hipMemcpyHostToDevice
-        )
-    );
-    HIP_CHECK(
-        hipMemcpy(
-            d_levels, levels.data(),
-            (bins + 1) * sizeof(T),
-            hipMemcpyHostToDevice
-        )
-    );
-
-    void * d_temporary_storage = nullptr;
+    HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice));
+    HIP_CHECK(hipMemcpy(d_levels, levels.data(), (bins + 1) * sizeof(T), hipMemcpyHostToDevice));
+
+    void*  d_temporary_storage     = nullptr;
     size_t temporary_storage_bytes = 0;
     HIP_CHECK(hipcub::DeviceHistogram::HistogramRange(d_temporary_storage,
                                                       temporary_storage_bytes,
@@ -345,7 +322,7 @@ void run_range_benchmark(benchmark::State& state, size_t bins, hipStream_t strea
     }
     HIP_CHECK(hipDeviceSynchronize());
 
-    for (auto _ : state)
+    for(auto _ : state)
     {
         auto start = std::chrono::high_resolution_clock::now();
 
@@ -363,8 +340,8 @@ void run_range_benchmark(benchmark::State& state, size_t bins, hipStream_t strea
         HIP_CHECK(hipDeviceSynchronize());
 
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
         state.SetIterationTime(elapsed_seconds.count());
     }
     state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T));
@@ -377,15 +354,18 @@ void run_range_benchmark(benchmark::State& state, size_t bins, hipStream_t strea
 }
 
 template<class T, unsigned int Channels, unsigned int ActiveChannels>
-void run_multi_range_benchmark(benchmark::State& state, size_t bins, hipStream_t stream, size_t size)
+void run_multi_range_benchmark(benchmark::State& state,
+                               size_t            bins,
+                               hipStream_t       stream,
+                               size_t            size)
 {
     using counter_type = unsigned int;
 
     // Number of levels for a single channel
-    const int num_levels_channel = bins + 1;
-    int num_levels[ActiveChannels];
+    const int      num_levels_channel = bins + 1;
+    int            num_levels[ActiveChannels];
     std::vector<T> levels[ActiveChannels];
-    for (unsigned int channel = 0; channel < ActiveChannels; channel++)
+    for(unsigned int channel = 0; channel < ActiveChannels; channel++)
     {
         levels[channel].resize(num_levels_channel);
         std::iota(levels[channel].begin(), levels[channel].end(), static_cast<T>(0));
@@ -395,9 +375,9 @@ void run_multi_range_benchmark(benchmark::State& state, size_t bins, hipStream_t
     // Generate data
     std::vector<T> input = benchmark_utils::get_random_data<T>(size * Channels, 0, bins);
 
-    T * d_input;
-    T * d_levels[ActiveChannels];
-    counter_type * d_histogram[ActiveChannels];
+    T*            d_input;
+    T*            d_levels[ActiveChannels];
+    counter_type* d_histogram[ActiveChannels];
     HIP_CHECK(hipMalloc(&d_input, size * Channels * sizeof(T)));
     for(unsigned int channel = 0; channel < ActiveChannels; channel++)
     {
@@ -405,25 +385,16 @@ void run_multi_range_benchmark(benchmark::State& state, size_t bins, hipStream_t
         HIP_CHECK(hipMalloc(&d_histogram[channel], size * sizeof(counter_type)));
     }
 
-    HIP_CHECK(
-        hipMemcpy(
-            d_input, input.data(),
-            size * Channels * sizeof(T),
-            hipMemcpyHostToDevice
-        )
-    );
+    HIP_CHECK(hipMemcpy(d_input, input.data(), size * Channels * sizeof(T), hipMemcpyHostToDevice));
     for(unsigned int channel = 0; channel < ActiveChannels; channel++)
     {
-        HIP_CHECK(
-            hipMemcpy(
-                d_levels[channel], levels[channel].data(),
-                num_levels_channel * sizeof(T),
-                hipMemcpyHostToDevice
-            )
-        );
+        HIP_CHECK(hipMemcpy(d_levels[channel],
+                            levels[channel].data(),
+                            num_levels_channel * sizeof(T),
+                            hipMemcpyHostToDevice));
     }
 
-    void * d_temporary_storage = nullptr;
+    void*  d_temporary_storage     = nullptr;
     size_t temporary_storage_bytes = 0;
     HIP_CHECK((hipcub::DeviceHistogram::MultiHistogramRange<Channels, ActiveChannels>(
         d_temporary_storage,
@@ -453,7 +424,7 @@ void run_multi_range_benchmark(benchmark::State& state, size_t bins, hipStream_t
     }
     HIP_CHECK(hipDeviceSynchronize());
 
-    for (auto _ : state)
+    for(auto _ : state)
     {
         auto start = std::chrono::high_resolution_clock::now();
 
@@ -472,8 +443,8 @@ void run_multi_range_benchmark(benchmark::State& state, size_t bins, hipStream_t
         HIP_CHECK(hipDeviceSynchronize());
 
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
         state.SetIterationTime(elapsed_seconds.count());
     }
     state.SetBytesProcessed(state.iterations() * batch_size * size * Channels * sizeof(T));
@@ -510,9 +481,11 @@ struct num_limits<__half>
     if(num_limits<T>::max() > BINS * SCALE)                                                    \
     {                                                                                          \
         VECTOR.push_back(benchmark::RegisterBenchmark(                                         \
-            (std::string("histogram_even") + "<Datatype:" #T ">" + "(Entropy Percent:"         \
-             + std::to_string(get_entropy_percents(entropy_reduction)) + "%,Bin Count:"        \
-             + std::to_string(BINS) + " bins)")                                                \
+            std::string("device_histogram_even"                                                \
+                        "<data_type:" #T ">."                                                  \
+                        "(entropy_percent:"                                                    \
+                        + std::to_string(get_entropy_percents(entropy_reduction))              \
+                        + "%,bin_count:" + std::to_string(BINS) + " bins)")                    \
                 .c_str(),                                                                      \
             [=](benchmark::State& state)                                                       \
             { run_even_benchmark<T>(state, BINS, SCALE, entropy_reduction, stream, size); })); \
@@ -527,8 +500,8 @@ struct num_limits<__half>
     CREATE_EVEN_BENCHMARK(VECTOR, T, 65536, 1)
 
 void add_even_benchmarks(std::vector<benchmark::internal::Benchmark*>& benchmarks,
-                         hipStream_t stream,
-                         size_t size)
+                         hipStream_t                                   stream,
+                         size_t                                        size)
 {
     for(int entropy_reduction : entropy_reductions)
     {
@@ -538,34 +511,40 @@ void add_even_benchmarks(std::vector<benchmark::internal::Benchmark*>& benchmark
         BENCHMARK_TYPE(benchmarks, uint8_t);
         BENCHMARK_TYPE(benchmarks, double);
         BENCHMARK_TYPE(benchmarks, float);
-        //this limitation can be removed once https://github.com/NVIDIA/cub/issues/484 is fixed
+        // this limitation can be removed once
+        // https://github.com/NVIDIA/cub/issues/484 is fixed
 #ifdef __HIP_PLATFORM_AMD__
         BENCHMARK_TYPE(benchmarks, __half);
 #endif
     };
 }
 
-#define CREATE_MULTI_EVEN_BENCHMARK(CHANNELS, ACTIVE_CHANNELS, T, BINS, SCALE) \
-benchmark::RegisterBenchmark( \
-    (std::string("multi_histogram_even") + "<Channels:" #CHANNELS ",Active Channels:" #ACTIVE_CHANNELS ",Datatype:" #T ">" + \
-        "(Entropy Percent:" + std::to_string(get_entropy_percents(entropy_reduction)) + "%,Bin Count:" + \
-        std::to_string(BINS) + " bins)" \
-    ).c_str(), \
-    [=](benchmark::State& state) { \
-        run_multi_even_benchmark<T, CHANNELS, ACTIVE_CHANNELS>( \
-            state, BINS, SCALE, entropy_reduction, stream, size \
-        ); \
-    } \
-)
+#define CREATE_MULTI_EVEN_BENCHMARK(CHANNELS, ACTIVE_CHANNELS, T, BINS, SCALE)                   \
+    benchmark::RegisterBenchmark(                                                                \
+        std::string("device_multi_histogram_even"                                                \
+                    "<channels:" #CHANNELS ",active_channels:" #ACTIVE_CHANNELS ",data_type:" #T \
+                    ">."                                                                         \
+                    "(entropy_percent:"                                                          \
+                    + std::to_string(get_entropy_percents(entropy_reduction))                    \
+                    + "%,bin_count:" + std::to_string(BINS) + " bins)")                          \
+            .c_str(),                                                                            \
+        [=](benchmark::State& state)                                                             \
+        {                                                                                        \
+            run_multi_even_benchmark<T, CHANNELS, ACTIVE_CHANNELS>(state,                        \
+                                                                   BINS,                         \
+                                                                   SCALE,                        \
+                                                                   entropy_reduction,            \
+                                                                   stream,                       \
+                                                                   size);                        \
+        })
 
 void add_multi_even_benchmarks(std::vector<benchmark::internal::Benchmark*>& benchmarks,
-                               hipStream_t stream,
-                               size_t size)
+                               hipStream_t                                   stream,
+                               size_t                                        size)
 {
     for(int entropy_reduction : entropy_reductions)
     {
-        std::vector<benchmark::internal::Benchmark*> bs =
-        {
+        std::vector<benchmark::internal::Benchmark*> bs = {
             CREATE_MULTI_EVEN_BENCHMARK(4, 3, int, 10, 1234),
             CREATE_MULTI_EVEN_BENCHMARK(4, 3, int, 100, 1234),
 
@@ -580,13 +559,14 @@ void add_multi_even_benchmarks(std::vector<benchmark::internal::Benchmark*>& ben
     };
 }
 
-#define CREATE_RANGE_BENCHMARK(T, BINS) \
-benchmark::RegisterBenchmark( \
-    (std::string("histogram_range") + "<Datatype:" #T ">" + \
-        "(Bin Count:" + std::to_string(BINS) + " bins)" \
-    ).c_str(), \
-    [=](benchmark::State& state) { run_range_benchmark<T>(state, BINS, stream, size); } \
-)
+#define CREATE_RANGE_BENCHMARK(T, BINS)                                         \
+    benchmark::RegisterBenchmark(std::string("device_histogram_range"           \
+                                             "<data_type:" #T ">."              \
+                                             "(bin_count:"                      \
+                                             + std::to_string(BINS) + " bins)") \
+                                     .c_str(),                                  \
+                                 [=](benchmark::State& state)                   \
+                                 { run_range_benchmark<T>(state, BINS, stream, size); })
 
 #define BENCHMARK_RANGE_TYPE(T)                                            \
     CREATE_RANGE_BENCHMARK(T, 10), CREATE_RANGE_BENCHMARK(T, 100),         \
@@ -594,32 +574,29 @@ benchmark::RegisterBenchmark( \
         CREATE_RANGE_BENCHMARK(T, 100000), CREATE_RANGE_BENCHMARK(T, 1000000)
 
 void add_range_benchmarks(std::vector<benchmark::internal::Benchmark*>& benchmarks,
-                          hipStream_t stream,
-                          size_t size)
+                          hipStream_t                                   stream,
+                          size_t                                        size)
 {
     std::vector<benchmark::internal::Benchmark*> bs
         = {BENCHMARK_RANGE_TYPE(float), BENCHMARK_RANGE_TYPE(double)};
     benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
 }
 
-#define CREATE_MULTI_RANGE_BENCHMARK(CHANNELS, ACTIVE_CHANNELS, T, BINS) \
-benchmark::RegisterBenchmark( \
-    (std::string("multi_histogram_range") + "<" #CHANNELS ", " #ACTIVE_CHANNELS ", " #T ">" + \
-        "(" + std::to_string(BINS) + " bins)" \
-    ).c_str(), \
-    [=](benchmark::State& state) { \
-        run_multi_range_benchmark<T, CHANNELS, ACTIVE_CHANNELS>( \
-            state, BINS, stream, size \
-        ); \
-    } \
-)
+#define CREATE_MULTI_RANGE_BENCHMARK(CHANNELS, ACTIVE_CHANNELS, T, BINS)                         \
+    benchmark::RegisterBenchmark(                                                                \
+        std::string("device_multi_histogram_range"                                               \
+                    "<channels:" #CHANNELS ",active_channels:" #ACTIVE_CHANNELS ",data_type:" #T \
+                    ">.(bin_count:"                                                              \
+                    + std::to_string(BINS) + " bins)")                                           \
+            .c_str(),                                                                            \
+        [=](benchmark::State& state)                                                             \
+        { run_multi_range_benchmark<T, CHANNELS, ACTIVE_CHANNELS>(state, BINS, stream, size); })
 
 void add_multi_range_benchmarks(std::vector<benchmark::internal::Benchmark*>& benchmarks,
-                                hipStream_t stream,
-                                size_t size)
+                                hipStream_t                                   stream,
+                                size_t                                        size)
 {
-    std::vector<benchmark::internal::Benchmark*> bs =
-    {
+    std::vector<benchmark::internal::Benchmark*> bs = {
         CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 10),
         CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 100),
         CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 1000),
@@ -630,7 +607,7 @@ void add_multi_range_benchmarks(std::vector<benchmark::internal::Benchmark*>& be
     benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
 }
 
-int main(int argc, char *argv[])
+int main(int argc, char* argv[])
 {
     cli::Parser parser(argc, argv);
     parser.set_optional<size_t>("size", "size", DEFAULT_N, "number of values");
@@ -639,15 +616,15 @@ int main(int argc, char *argv[])
 
     // Parse argv
     benchmark::Initialize(&argc, argv);
-    const size_t size = parser.get<size_t>("size");
-    const int trials = parser.get<int>("trials");
+    const size_t size   = parser.get<size_t>("size");
+    const int    trials = parser.get<int>("trials");
 
     std::cout << "benchmark_device_histogram" << std::endl;
 
     // HIP
-    hipStream_t stream = 0; // default
+    hipStream_t     stream = 0; // default
     hipDeviceProp_t devProp;
-    int device_id = 0;
+    int             device_id = 0;
     HIP_CHECK(hipGetDevice(&device_id));
     HIP_CHECK(hipGetDeviceProperties(&devProp, device_id));
     std::cout << "[HIP] Device name: " << devProp.name << std::endl;
diff --git a/benchmark/benchmark_device_memory.cpp b/benchmark/benchmark_device_memory.cpp
index 8659cedf..5a879210 100644
--- a/benchmark/benchmark_device_memory.cpp
+++ b/benchmark/benchmark_device_memory.cpp
@@ -9,8 +9,8 @@
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -272,8 +272,7 @@ void run_benchmark(benchmark::State& state, size_t size, const hipStream_t strea
     if(std::is_floating_point<T>::value)
     {
         input = benchmark_utils::get_random_data<T>(size, (T)-1000, (T) + 1000);
-    }
-    else
+    } else
     {
         input = benchmark_utils::get_random_data<T>(size,
                                                     std::numeric_limits<T>::min(),
@@ -350,9 +349,9 @@ template<typename T>
 void run_benchmark_memcpy(benchmark::State& state, size_t size, const hipStream_t stream)
 {
     // Allocate device buffers
-    // Note: since this benchmark only tests memcpy performance between device buffers,
-    // we don't really need to copy data into these from the host - whatever happens
-    // to be in memory will suffice.
+    // Note: since this benchmark only tests memcpy performance between device
+    // buffers, we don't really need to copy data into these from the host -
+    // whatever happens to be in memory will suffice.
     T* d_input;
     T* d_output;
     HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&d_input), size * sizeof(T)));
@@ -401,20 +400,18 @@ void run_benchmark_memcpy(benchmark::State& state, size_t size, const hipStream_
     HIP_CHECK(hipFree(d_output));
 }
 
-#define CREATE_BENCHMARK_IPT(METHOD, OPERATION, T, SIZE, BLOCK_SIZE, IPT)                     \
-    {                                                                                         \
-        benchmarks.push_back(benchmark::RegisterBenchmark(                                    \
-            #METHOD "_" #OPERATION "<" #T "," #SIZE ",BS:" #BLOCK_SIZE ",IPT:" #IPT ">",      \
-            [=](benchmark::State& state)                                                      \
-            { run_benchmark<T, BLOCK_SIZE, IPT, METHOD, OPERATION>(state, SIZE, stream); })); \
-    }
-
-#define CREATE_BENCHMARK_MEMCPY(T, SIZE)                                                      \
-    {                                                                                         \
-        benchmarks.push_back(benchmark::RegisterBenchmark(                                    \
-            "Memcpy<" #T "," #SIZE ">",                                                       \
-            [=](benchmark::State& state) { run_benchmark_memcpy<T>(state, SIZE, stream); })); \
-    }
+#define CREATE_BENCHMARK_IPT(METHOD, OPERATION, T, SIZE, BS, IPT)                             \
+    benchmarks.push_back(benchmark::RegisterBenchmark(                                        \
+        std::string("device_memory<method:" #METHOD ",operation:" #OPERATION ",data_type:" #T \
+                    ",size:" #SIZE ",block_size:" #BS ",items_per_thread:" #IPT ">.")         \
+            .c_str(),                                                                         \
+        [=](benchmark::State& state)                                                          \
+        { run_benchmark<T, BS, IPT, METHOD, OPERATION>(state, SIZE, stream); }));
+
+#define CREATE_BENCHMARK_MEMCPY(T, SIZE)                                               \
+    benchmarks.push_back(benchmark::RegisterBenchmark(                                 \
+        std::string("device_memory_memcpy<data_type:" #T ",size:" #SIZE ">.").c_str(), \
+        [=](benchmark::State& state) { run_benchmark_memcpy<T>(state, SIZE, stream); }));
 
 // clang-format off
 #define CREATE_BENCHMARK_BLOCK_SIZE(MEM_OP, OP, TYPE, SIZE, BLOCK_SIZE) \
diff --git a/benchmark/benchmark_device_merge_sort.cpp b/benchmark/benchmark_device_merge_sort.cpp
index fbfd35f7..506a8c04 100644
--- a/benchmark/benchmark_device_merge_sort.cpp
+++ b/benchmark/benchmark_device_merge_sort.cpp
@@ -9,8 +9,8 @@
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -23,14 +23,14 @@
 #include "common_benchmark_header.hpp"
 
 // HIP API
-#include "hipcub/hipcub.hpp"
 #include "hipcub/device/device_merge_sort.hpp"
+#include "hipcub/hipcub.hpp"
 
 #ifndef DEFAULT_N
 const size_t DEFAULT_N = 32 << 20;
 #endif
 
-const unsigned int batch_size = 10;
+const unsigned int batch_size  = 10;
 const unsigned int warmup_size = 5;
 
 template<class Key>
@@ -40,50 +40,43 @@ std::vector<Key> generate_keys(size_t size)
 
     if(std::is_floating_point<key_type>::value)
     {
-        return benchmark_utils::get_random_data<key_type>(size, static_cast<key_type>(-1000), static_cast<key_type>(1000), size);
-    }
-    else
+        return benchmark_utils::get_random_data<key_type>(size,
+                                                          static_cast<key_type>(-1000),
+                                                          static_cast<key_type>(1000),
+                                                          size);
+    } else
     {
-        return benchmark_utils::get_random_data<key_type>(
-            size,
-            std::numeric_limits<key_type>::min(),
-            std::numeric_limits<key_type>::max(),
-            size
-        );
+        return benchmark_utils::get_random_data<key_type>(size,
+                                                          std::numeric_limits<key_type>::min(),
+                                                          std::numeric_limits<key_type>::max(),
+                                                          size);
     }
 }
 
 template<class Key>
-void run_sort_keys_benchmark(benchmark::State& state,
-                             hipStream_t stream,
-                             size_t size)
+void run_sort_keys_benchmark(benchmark::State& state, hipStream_t stream, size_t size)
 {
-    using key_type = Key;
-    auto compare_function = [] __device__ (const key_type & a, const key_type & b) { return a < b; };
+    using key_type        = Key;
+    auto compare_function = [] __device__(const key_type& a, const key_type& b) { return a < b; };
 
     auto keys_input = generate_keys<Key>(size);
 
-    key_type * d_keys_input;
-    key_type * d_keys_output;
+    key_type* d_keys_input;
+    key_type* d_keys_output;
     HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type)));
     HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type)));
     HIP_CHECK(
-        hipMemcpy(
-            d_keys_input, keys_input.data(),
-            size * sizeof(key_type),
-            hipMemcpyHostToDevice
-        )
-    );
-
-    void * d_temporary_storage = nullptr;
+        hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice));
+
+    void*  d_temporary_storage     = nullptr;
     size_t temporary_storage_bytes = 0;
-    HIP_CHECK(
-        hipcub::DeviceMergeSort::SortKeysCopy(
-            d_temporary_storage, temporary_storage_bytes,
-            d_keys_input, d_keys_output, size,
-            compare_function, stream
-        )
-    );
+    HIP_CHECK(hipcub::DeviceMergeSort::SortKeysCopy(d_temporary_storage,
+                                                    temporary_storage_bytes,
+                                                    d_keys_input,
+                                                    d_keys_output,
+                                                    size,
+                                                    compare_function,
+                                                    stream));
 
     HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes));
     HIP_CHECK(hipDeviceSynchronize());
@@ -91,35 +84,35 @@ void run_sort_keys_benchmark(benchmark::State& state,
     // Warm-up
     for(size_t i = 0; i < warmup_size; i++)
     {
-        HIP_CHECK(
-            hipcub::DeviceMergeSort::SortKeysCopy(
-                d_temporary_storage, temporary_storage_bytes,
-                d_keys_input, d_keys_output, size,
-                compare_function, stream
-            )
-        );
+        HIP_CHECK(hipcub::DeviceMergeSort::SortKeysCopy(d_temporary_storage,
+                                                        temporary_storage_bytes,
+                                                        d_keys_input,
+                                                        d_keys_output,
+                                                        size,
+                                                        compare_function,
+                                                        stream));
     }
     HIP_CHECK(hipDeviceSynchronize());
 
-    for (auto _ : state)
+    for(auto _ : state)
     {
         auto start = std::chrono::high_resolution_clock::now();
 
         for(size_t i = 0; i < batch_size; i++)
         {
-            HIP_CHECK(
-                hipcub::DeviceMergeSort::SortKeysCopy(
-                    d_temporary_storage, temporary_storage_bytes,
-                    d_keys_input, d_keys_output, size,
-                    compare_function, stream
-                )
-            );
+            HIP_CHECK(hipcub::DeviceMergeSort::SortKeysCopy(d_temporary_storage,
+                                                            temporary_storage_bytes,
+                                                            d_keys_input,
+                                                            d_keys_output,
+                                                            size,
+                                                            compare_function,
+                                                            stream));
         }
         HIP_CHECK(hipDeviceSynchronize());
 
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
         state.SetIterationTime(elapsed_seconds.count());
     }
     state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type));
@@ -131,54 +124,46 @@ void run_sort_keys_benchmark(benchmark::State& state,
 }
 
 template<class Key, class Value>
-void run_sort_pairs_benchmark(benchmark::State& state,
-                              hipStream_t stream,
-                              size_t size)
+void run_sort_pairs_benchmark(benchmark::State& state, hipStream_t stream, size_t size)
 {
-    using key_type = Key;
-    using value_type = Value;
-    auto compare_function = [] __device__ (const key_type & a, const key_type & b) { return a < b; };
+    using key_type        = Key;
+    using value_type      = Value;
+    auto compare_function = [] __device__(const key_type& a, const key_type& b) { return a < b; };
 
-    auto keys_input = generate_keys<Key>(size);
+    auto                    keys_input = generate_keys<Key>(size);
     std::vector<value_type> values_input(size);
     for(size_t i = 0; i < size; i++)
     {
         values_input[i] = value_type(i);
     }
 
-    key_type * d_keys_input;
-    key_type * d_keys_output;
+    key_type* d_keys_input;
+    key_type* d_keys_output;
     HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type)));
     HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type)));
     HIP_CHECK(
-        hipMemcpy(
-            d_keys_input, keys_input.data(),
-            size * sizeof(key_type),
-            hipMemcpyHostToDevice
-        )
-    );
-
-    value_type * d_values_input;
-    value_type * d_values_output;
+        hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice));
+
+    value_type* d_values_input;
+    value_type* d_values_output;
     HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type)));
     HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type)));
-    HIP_CHECK(
-        hipMemcpy(
-            d_values_input, values_input.data(),
-            size * sizeof(value_type),
-            hipMemcpyHostToDevice
-        )
-    );
-
-    void * d_temporary_storage = nullptr;
+    HIP_CHECK(hipMemcpy(d_values_input,
+                        values_input.data(),
+                        size * sizeof(value_type),
+                        hipMemcpyHostToDevice));
+
+    void*  d_temporary_storage     = nullptr;
     size_t temporary_storage_bytes = 0;
-    HIP_CHECK(
-        hipcub::DeviceMergeSort::SortPairsCopy(
-            d_temporary_storage, temporary_storage_bytes,
-            d_keys_input, d_values_input, d_keys_output, d_values_output, size,
-            compare_function, stream
-        )
-    );
+    HIP_CHECK(hipcub::DeviceMergeSort::SortPairsCopy(d_temporary_storage,
+                                                     temporary_storage_bytes,
+                                                     d_keys_input,
+                                                     d_values_input,
+                                                     d_keys_output,
+                                                     d_values_output,
+                                                     size,
+                                                     compare_function,
+                                                     stream));
 
     HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes));
     HIP_CHECK(hipDeviceSynchronize());
@@ -186,40 +171,43 @@ void run_sort_pairs_benchmark(benchmark::State& state,
     // Warm-up
     for(size_t i = 0; i < warmup_size; i++)
     {
-        HIP_CHECK(
-            hipcub::DeviceMergeSort::SortPairsCopy(
-                d_temporary_storage, temporary_storage_bytes,
-                d_keys_input, d_values_input, d_keys_output, d_values_output, size,
-                compare_function, stream
-            )
-        );
+        HIP_CHECK(hipcub::DeviceMergeSort::SortPairsCopy(d_temporary_storage,
+                                                         temporary_storage_bytes,
+                                                         d_keys_input,
+                                                         d_values_input,
+                                                         d_keys_output,
+                                                         d_values_output,
+                                                         size,
+                                                         compare_function,
+                                                         stream));
     }
     HIP_CHECK(hipDeviceSynchronize());
 
-    for (auto _ : state)
+    for(auto _ : state)
     {
         auto start = std::chrono::high_resolution_clock::now();
 
         for(size_t i = 0; i < batch_size; i++)
         {
-            HIP_CHECK(
-                hipcub::DeviceMergeSort::SortPairsCopy(
-                    d_temporary_storage, temporary_storage_bytes,
-                    d_keys_input, d_values_input, d_keys_output, d_values_output, size,
-                    compare_function, stream
-                )
-            );
+            HIP_CHECK(hipcub::DeviceMergeSort::SortPairsCopy(d_temporary_storage,
+                                                             temporary_storage_bytes,
+                                                             d_keys_input,
+                                                             d_values_input,
+                                                             d_keys_output,
+                                                             d_values_output,
+                                                             size,
+                                                             compare_function,
+                                                             stream));
         }
         HIP_CHECK(hipDeviceSynchronize());
 
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
         state.SetIterationTime(elapsed_seconds.count());
     }
-    state.SetBytesProcessed(
-        state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type))
-    );
+    state.SetBytesProcessed(state.iterations() * batch_size * size
+                            * (sizeof(key_type) + sizeof(value_type)));
     state.SetItemsProcessed(state.iterations() * batch_size * size);
 
     HIP_CHECK(hipFree(d_temporary_storage));
@@ -229,31 +217,23 @@ void run_sort_pairs_benchmark(benchmark::State& state,
     HIP_CHECK(hipFree(d_values_output));
 }
 
+#define CREATE_SORT_KEYS_BENCHMARK(T)                  \
+    benchmarks.push_back(benchmark::RegisterBenchmark( \
+        std::string("device_merge_sort_sort_keys"      \
+                    "<key_data_type:" #T ">.")         \
+            .c_str(),                                  \
+        [=](benchmark::State& state) { run_sort_keys_benchmark<T>(state, stream, size); }));
 
-#define CREATE_SORT_KEYS_BENCHMARK(Key) \
-    { \
-        benchmarks.push_back( \
-            benchmark::RegisterBenchmark( \
-                (std::string("sort_keys") + "<" #Key ">").c_str(), \
-                [=](benchmark::State& state) { run_sort_keys_benchmark<Key>(state, stream, size); } \
-            ) \
-        ); \
-    }
-
-#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value) \
-    { \
-        benchmarks.push_back( \
-            benchmark::RegisterBenchmark( \
-                (std::string("sort_pairs") + "<" #Key ", " #Value">").c_str(), \
-                [=](benchmark::State& state) { run_sort_pairs_benchmark<Key, Value>(state, stream, size); } \
-            ) \
-        ); \
-    }
-
+#define CREATE_SORT_PAIRS_BENCHMARK(T, V)                             \
+    benchmarks.push_back(benchmark::RegisterBenchmark(                \
+        std::string("device_merge_sort_sort_pairs"                    \
+                    "<key_data_type:" #T ",value_data_type:" #V ">.") \
+            .c_str(),                                                 \
+        [=](benchmark::State& state) { run_sort_pairs_benchmark<T, V>(state, stream, size); }));
 
 void add_sort_keys_benchmarks(std::vector<benchmark::internal::Benchmark*>& benchmarks,
-                              hipStream_t stream,
-                              size_t size)
+                              hipStream_t                                   stream,
+                              size_t                                        size)
 {
     CREATE_SORT_KEYS_BENCHMARK(int)
     CREATE_SORT_KEYS_BENCHMARK(long long)
@@ -263,11 +243,11 @@ void add_sort_keys_benchmarks(std::vector<benchmark::internal::Benchmark*>& benc
 }
 
 void add_sort_pairs_benchmarks(std::vector<benchmark::internal::Benchmark*>& benchmarks,
-                               hipStream_t stream,
-                               size_t size)
+                               hipStream_t                                   stream,
+                               size_t                                        size)
 {
-    using custom_float2 = benchmark_utils::custom_type<float, float>;
-    using custom_double2 = benchmark_utils::custom_type<double, double>;
+    using custom_float2      = benchmark_utils::custom_type<float, float>;
+    using custom_double2     = benchmark_utils::custom_type<double, double>;
     using custom_char_double = benchmark_utils::custom_type<char, double>;
     using custom_double_char = benchmark_utils::custom_type<double, char>;
 
@@ -289,7 +269,7 @@ void add_sort_pairs_benchmarks(std::vector<benchmark::internal::Benchmark*>& ben
     CREATE_SORT_PAIRS_BENCHMARK(uint8_t, uint8_t)
 }
 
-int main(int argc, char *argv[])
+int main(int argc, char* argv[])
 {
     cli::Parser parser(argc, argv);
     parser.set_optional<size_t>("size", "size", DEFAULT_N, "number of values");
@@ -298,15 +278,17 @@ int main(int argc, char *argv[])
 
     // Parse argv
     benchmark::Initialize(&argc, argv);
-    const size_t size = parser.get<size_t>("size");
-    const int trials = parser.get<int>("trials");
+    const size_t size   = parser.get<size_t>("size");
+    const int    trials = parser.get<int>("trials");
 
     // HIP
-    hipStream_t stream = 0; // default
+    hipStream_t     stream = 0; // default
     hipDeviceProp_t devProp;
-    int device_id = 0;
+    int             device_id = 0;
     HIP_CHECK(hipGetDevice(&device_id));
     HIP_CHECK(hipGetDeviceProperties(&devProp, device_id));
+
+    std::cout << "benchmark_device_merge_sort" << std::endl;
     std::cout << "[HIP] Device name: " << devProp.name << std::endl;
 
     // Add benchmarks
diff --git a/benchmark/benchmark_device_partition.cpp b/benchmark/benchmark_device_partition.cpp
index 26c7739a..786fe139 100644
--- a/benchmark/benchmark_device_partition.cpp
+++ b/benchmark/benchmark_device_partition.cpp
@@ -9,8 +9,8 @@
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -32,43 +32,45 @@
 constexpr size_t DEFAULT_N = 1024 * 1024 * 32;
 #endif
 
-constexpr unsigned int batch_size = 10;
+constexpr unsigned int batch_size  = 10;
 constexpr unsigned int warmup_size = 5;
 
-namespace {
-template <typename T>
-struct LessOp {
-    HIPCUB_HOST_DEVICE LessOp(const T& pivot)
-        : pivot_{pivot}
-    {
-    }
+namespace
+{
+template<typename T>
+struct LessOp
+{
+    HIPCUB_HOST_DEVICE LessOp(const T& pivot) : pivot_{pivot} {}
 
-    HIPCUB_HOST_DEVICE bool operator()(const T& val) const {
+    HIPCUB_HOST_DEVICE bool operator()(const T& val) const
+    {
         return val < pivot_;
     }
+
 private:
     T pivot_;
 };
-}
+} // namespace
 
-template <typename T, typename F>
+template<typename T, typename F>
 void run_flagged(benchmark::State& state,
                  const hipStream_t stream,
-                 const T threshold,
-                 const size_t size)
+                 const T           threshold,
+                 const size_t      size)
 {
-    const auto select_op  = LessOp<T>{threshold};
-    const auto input =
-        benchmark_utils::get_random_data<T>(size, static_cast<T>(0), static_cast<T>(100));
+    const auto select_op = LessOp<T>{threshold};
+    const auto input
+        = benchmark_utils::get_random_data<T>(size, static_cast<T>(0), static_cast<T>(100));
 
     std::vector<F> flags(size);
-    for(unsigned int i = 0; i < size; i++) {
+    for(unsigned int i = 0; i < size; i++)
+    {
         flags[i] = static_cast<F>(select_op(input[i]));
     }
 
-    T* d_input                          = nullptr;
-    F* d_flags                          = nullptr;
-    T* d_output                         = nullptr;
+    T*            d_input               = nullptr;
+    F*            d_flags               = nullptr;
+    T*            d_output              = nullptr;
     unsigned int* d_num_selected_output = nullptr;
     HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T)));
     HIP_CHECK(hipMalloc(&d_flags, input.size() * sizeof(F)));
@@ -78,63 +80,54 @@ void run_flagged(benchmark::State& state,
     // Allocate temporary storage
     void*  d_temp_storage     = nullptr;
     size_t temp_storage_bytes = 0;
-    HIP_CHECK(
-        hipcub::DevicePartition::Flagged(
-            nullptr,
-            temp_storage_bytes,
-            d_input,
-            d_flags,
-            d_output,
-            d_num_selected_output,
-            static_cast<int>(input.size()),
-            stream
-        )
-    );
+    HIP_CHECK(hipcub::DevicePartition::Flagged(nullptr,
+                                               temp_storage_bytes,
+                                               d_input,
+                                               d_flags,
+                                               d_output,
+                                               d_num_selected_output,
+                                               static_cast<int>(input.size()),
+                                               stream));
     HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes));
 
     // Warm-up
     HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice));
     HIP_CHECK(hipMemcpy(d_flags, flags.data(), flags.size() * sizeof(F), hipMemcpyHostToDevice));
-    for(unsigned int i = 0; i < warmup_size; ++i) {
-        HIP_CHECK(
-            hipcub::DevicePartition::Flagged(
-                d_temp_storage,
-                temp_storage_bytes,
-                d_input,
-                d_flags,
-                d_output,
-                d_num_selected_output,
-                static_cast<int>(input.size()),
-                stream
-            )
-        );
+    for(unsigned int i = 0; i < warmup_size; ++i)
+    {
+        HIP_CHECK(hipcub::DevicePartition::Flagged(d_temp_storage,
+                                                   temp_storage_bytes,
+                                                   d_input,
+                                                   d_flags,
+                                                   d_output,
+                                                   d_num_selected_output,
+                                                   static_cast<int>(input.size()),
+                                                   stream));
     }
     HIP_CHECK(hipDeviceSynchronize());
 
     // Run benchmark
-    for(auto _ : state) {
+    for(auto _ : state)
+    {
         namespace chrono = std::chrono;
-        using clock  = chrono::high_resolution_clock;
+        using clock      = chrono::high_resolution_clock;
 
         const auto start = clock::now();
-        for (unsigned int i = 0; i < batch_size; ++i) {
-            HIP_CHECK(
-                hipcub::DevicePartition::Flagged(
-                    d_temp_storage,
-                    temp_storage_bytes,
-                    d_input,
-                    d_flags,
-                    d_output,
-                    d_num_selected_output,
-                    static_cast<int>(input.size()),
-                    stream
-                )
-            );
+        for(unsigned int i = 0; i < batch_size; ++i)
+        {
+            HIP_CHECK(hipcub::DevicePartition::Flagged(d_temp_storage,
+                                                       temp_storage_bytes,
+                                                       d_input,
+                                                       d_flags,
+                                                       d_output,
+                                                       d_num_selected_output,
+                                                       static_cast<int>(input.size()),
+                                                       stream));
         }
         HIP_CHECK(hipDeviceSynchronize());
 
-        const auto end = clock::now();
-        using seconds_d = chrono::duration<double>;
+        const auto end             = clock::now();
+        using seconds_d            = chrono::duration<double>;
         const auto elapsed_seconds = chrono::duration_cast<seconds_d>(end - start);
 
         state.SetIterationTime(elapsed_seconds.count());
@@ -151,83 +144,74 @@ void run_flagged(benchmark::State& state,
     HIP_CHECK(hipFree(d_input));
 }
 
-template <typename T>
+template<typename T>
 void run_predicate(benchmark::State& state,
                    const hipStream_t stream,
-                   const T threshold,
-                   const size_t size)
+                   const T           threshold,
+                   const size_t      size)
 {
-    const auto input =
-        benchmark_utils::get_random_data<T>(size, static_cast<T>(0), static_cast<T>(100));
+    const auto input
+        = benchmark_utils::get_random_data<T>(size, static_cast<T>(0), static_cast<T>(100));
 
-    T* d_input                          = nullptr;
-    T* d_output                         = nullptr;
+    T*            d_input               = nullptr;
+    T*            d_output              = nullptr;
     unsigned int* d_num_selected_output = nullptr;
     HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T)));
     HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T)));
     HIP_CHECK(hipMalloc(&d_num_selected_output, sizeof(unsigned int)));
 
-    const auto select_op  = LessOp<T>{threshold};
+    const auto select_op = LessOp<T>{threshold};
 
     // Allocate temporary storage
     void*  d_temp_storage     = nullptr;
     size_t temp_storage_bytes = 0;
-    HIP_CHECK(
-        hipcub::DevicePartition::If(
-            nullptr,
-            temp_storage_bytes,
-            d_input,
-            d_output,
-            d_num_selected_output,
-            static_cast<int>(input.size()),
-            select_op,
-            stream
-        )
-    );
+    HIP_CHECK(hipcub::DevicePartition::If(nullptr,
+                                          temp_storage_bytes,
+                                          d_input,
+                                          d_output,
+                                          d_num_selected_output,
+                                          static_cast<int>(input.size()),
+                                          select_op,
+                                          stream));
     HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes));
 
     // Warm-up
     HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice));
-    for(unsigned int i = 0; i < warmup_size; ++i) {
-        HIP_CHECK(
-            hipcub::DevicePartition::If(
-                d_temp_storage,
-                temp_storage_bytes,
-                d_input,
-                d_output,
-                d_num_selected_output,
-                static_cast<int>(input.size()),
-                select_op,
-                stream
-            )
-        );
+    for(unsigned int i = 0; i < warmup_size; ++i)
+    {
+        HIP_CHECK(hipcub::DevicePartition::If(d_temp_storage,
+                                              temp_storage_bytes,
+                                              d_input,
+                                              d_output,
+                                              d_num_selected_output,
+                                              static_cast<int>(input.size()),
+                                              select_op,
+                                              stream));
     }
     HIP_CHECK(hipDeviceSynchronize());
 
     // Run benchmark
-    for(auto _ : state) {
+    for(auto _ : state)
+    {
         namespace chrono = std::chrono;
-        using clock  = chrono::high_resolution_clock;
+        using clock      = chrono::high_resolution_clock;
 
         const auto start = clock::now();
-        for (unsigned int i = 0; i < batch_size; ++i) {
-            HIP_CHECK(
-                hipcub::DevicePartition::If(
-                    d_temp_storage,
-                    temp_storage_bytes,
-                    d_input,
-                    d_output,
-                    d_num_selected_output,
-                    static_cast<int>(input.size()),
-                    select_op,
-                    stream
-                )
-            );
+        for(unsigned int i = 0; i < batch_size; ++i)
+        {
+            HIP_CHECK(hipcub::DevicePartition::If(d_temp_storage,
+                                                  temp_storage_bytes,
+                                                  d_input,
+                                                  d_output,
+                                                  d_num_selected_output,
+                                                  static_cast<int>(input.size()),
+                                                  select_op,
+                                                  stream));
         }
         HIP_CHECK(hipDeviceSynchronize());
 
-        const auto end = clock::now();
-        using seconds_d = chrono::duration<double>;
+        const auto end             = clock::now();
+        using seconds_d            = chrono::duration<double>;
         const auto elapsed_seconds = chrono::duration_cast<seconds_d>(end - start);
 
         state.SetIterationTime(elapsed_seconds.count());
@@ -243,20 +227,20 @@ void run_predicate(benchmark::State& state,
     HIP_CHECK(hipFree(d_num_selected_output));
 }
 
-template <typename T>
+template<typename T>
 void run_threeway(benchmark::State& state,
                   const hipStream_t stream,
-                  const T small_threshold,
-                  const T large_threshold,
-                  const size_t size)
+                  const T           small_threshold,
+                  const T           large_threshold,
+                  const size_t      size)
 {
-    const auto input =
-        benchmark_utils::get_random_data<T>(size, static_cast<T>(0), static_cast<T>(100));
+    const auto input
+        = benchmark_utils::get_random_data<T>(size, static_cast<T>(0), static_cast<T>(100));
 
-    T* d_input             = nullptr;
-    T* d_first_output      = nullptr;
-    T* d_second_output     = nullptr;
-    T* d_unselected_output = nullptr;
+    T*            d_input               = nullptr;
+    T*            d_first_output        = nullptr;
+    T*            d_second_output       = nullptr;
+    T*            d_unselected_output   = nullptr;
     unsigned int* d_num_selected_output = nullptr;
     HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T)));
     HIP_CHECK(hipMalloc(&d_first_output, input.size() * sizeof(T)));
@@ -270,71 +254,62 @@ void run_threeway(benchmark::State& state,
     // Allocate temporary storage
     void*  d_temp_storage     = nullptr;
     size_t temp_storage_bytes = 0;
-    HIP_CHECK(
-        hipcub::DevicePartition::If(
-            nullptr,
-            temp_storage_bytes,
-            d_input,
-            d_first_output,
-            d_second_output,
-            d_unselected_output,
-            d_num_selected_output,
-            static_cast<int>(input.size()),
-            select_first_part_op,
-            select_second_part_op,
-            stream
-        )
-    );
+    HIP_CHECK(hipcub::DevicePartition::If(nullptr,
+                                          temp_storage_bytes,
+                                          d_input,
+                                          d_first_output,
+                                          d_second_output,
+                                          d_unselected_output,
+                                          d_num_selected_output,
+                                          static_cast<int>(input.size()),
+                                          select_first_part_op,
+                                          select_second_part_op,
+                                          stream));
     HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes));
 
     // Warm-up
     HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice));
-    for(unsigned int i = 0; i < warmup_size; ++i) {
-        HIP_CHECK(
-            hipcub::DevicePartition::If(
-                d_temp_storage,
-                temp_storage_bytes,
-                d_input,
-                d_first_output,
-                d_second_output,
-                d_unselected_output,
-                d_num_selected_output,
-                static_cast<int>(input.size()),
-                select_first_part_op,
-                select_second_part_op,
-                stream
-            )
-        );
+    for(unsigned int i = 0; i < warmup_size; ++i)
+    {
+        HIP_CHECK(hipcub::DevicePartition::If(d_temp_storage,
+                                              temp_storage_bytes,
+                                              d_input,
+                                              d_first_output,
+                                              d_second_output,
+                                              d_unselected_output,
+                                              d_num_selected_output,
+                                              static_cast<int>(input.size()),
+                                              select_first_part_op,
+                                              select_second_part_op,
+                                              stream));
     }
     HIP_CHECK(hipDeviceSynchronize());
 
     // Run benchmark
-    for(auto _ : state) {
+    for(auto _ : state)
+    {
         namespace chrono = std::chrono;
-        using clock  = chrono::high_resolution_clock;
+        using clock      = chrono::high_resolution_clock;
 
         const auto start = clock::now();
-        for (unsigned int i = 0; i < batch_size; ++i) {
-            HIP_CHECK(
-                hipcub::DevicePartition::If(
-                    d_temp_storage,
-                    temp_storage_bytes,
-                    d_input,
-                    d_first_output,
-                    d_second_output,
-                    d_unselected_output,
-                    d_num_selected_output,
-                    static_cast<int>(input.size()),
-                    select_first_part_op,
-                    select_second_part_op,
-                    stream
-                )
-            );
+        for(unsigned int i = 0; i < batch_size; ++i)
+        {
+            HIP_CHECK(hipcub::DevicePartition::If(d_temp_storage,
+                                                  temp_storage_bytes,
+                                                  d_input,
+                                                  d_first_output,
+                                                  d_second_output,
+                                                  d_unselected_output,
+                                                  d_num_selected_output,
+                                                  static_cast<int>(input.size()),
+                                                  select_first_part_op,
+                                                  select_second_part_op,
+                                                  stream));
         }
         HIP_CHECK(hipDeviceSynchronize());
 
-        const auto end = clock::now();
-        using seconds_d = chrono::duration<double>;
+        const auto end             = clock::now();
+        using seconds_d            = chrono::duration<double>;
         const auto elapsed_seconds = chrono::duration_cast<seconds_d>(end - start);
 
         state.SetIterationTime(elapsed_seconds.count());
@@ -352,43 +327,50 @@ void run_threeway(benchmark::State& state,
     HIP_CHECK(hipFree(d_num_selected_output));
 }
 
-#define CREATE_BENCHMARK_FLAGGED(T, T_FLAG, SPLIT_T)               \
-benchmark::RegisterBenchmark(                                      \
-    "parition_flagged<" #T ", " #T_FLAG ">(" #SPLIT_T "%)",        \
-    &run_flagged<T, T_FLAG>, stream, static_cast<T>(SPLIT_T), size \
-)
-
-#define CREATE_BENCHMARK_PREDICATE(T, SPLIT_T)               \
-benchmark::RegisterBenchmark(                                \
-    "parition_predicate<" #T ">(" #SPLIT_T "%)",             \
-    &run_predicate<T>, stream, static_cast<T>(SPLIT_T), size \
-)
-
-#define CREATE_BENCHMARK_THREEWAY(T, SMALL_T, LARGE_T)                               \
-benchmark::RegisterBenchmark(                                                        \
-    "parition_three_way<Datatype:" #T ">(Small Threshold:" #SMALL_T "%,Large Threshold:" #LARGE_T "%)",                       \
-    &run_threeway<T>, stream, static_cast<T>(SMALL_T), static_cast<T>(LARGE_T), size \
-)
-
-#define BENCHMARK_FLAGGED_TYPE(type, flag_type)    \
-    CREATE_BENCHMARK_FLAGGED(type, flag_type, 33), \
-    CREATE_BENCHMARK_FLAGGED(type, flag_type, 50), \
-    CREATE_BENCHMARK_FLAGGED(type, flag_type, 60), \
-    CREATE_BENCHMARK_FLAGGED(type, flag_type, 90)
-
-#define BENCHMARK_PREDICATE_TYPE(type)    \
-    CREATE_BENCHMARK_PREDICATE(type, 33), \
-    CREATE_BENCHMARK_PREDICATE(type, 50), \
-    CREATE_BENCHMARK_PREDICATE(type, 60), \
-    CREATE_BENCHMARK_PREDICATE(type, 90)
-
-#define BENCHMARK_THREEWAY_TYPE(type)        \
-    CREATE_BENCHMARK_THREEWAY(type, 33, 66), \
-    CREATE_BENCHMARK_THREEWAY(type, 10, 66), \
-    CREATE_BENCHMARK_THREEWAY(type, 50, 60), \
-    CREATE_BENCHMARK_THREEWAY(type, 50, 90)
-
-int main(int argc, char *argv[])
+#define CREATE_BENCHMARK_FLAGGED(T, T_FLAG, SPLIT_T)                                              \
+    benchmark::RegisterBenchmark(std::string("device_partition_flagged<data_type:" #T             \
+                                             ",flag_type:" #T_FLAG ">.(split_threshold:" #SPLIT_T \
+                                             "%)")                                                \
+                                     .c_str(),                                                    \
+                                 &run_flagged<T, T_FLAG>,                                         \
+                                 stream,                                                          \
+                                 static_cast<T>(SPLIT_T),                                         \
+                                 size)
+
+#define CREATE_BENCHMARK_PREDICATE(T, SPLIT_T)                                          \
+    benchmark::RegisterBenchmark(std::string("device_partition_predicate<data_type:" #T \
+                                             ">.(split_threshold:" #SPLIT_T "%)")       \
+                                     .c_str(),                                          \
+                                 &run_predicate<T>,                                     \
+                                 stream,                                                \
+                                 static_cast<T>(SPLIT_T),                               \
+                                 size)
+
+#define CREATE_BENCHMARK_THREEWAY(T, SMALL_T, LARGE_T)                                       \
+    benchmark::RegisterBenchmark(std::string("device_partition_three_way"                    \
+                                             "<data_type:" #T ">.(small_threshold:" #SMALL_T \
+                                             "%,large_threshold:" #LARGE_T "%)")             \
+                                     .c_str(),                                               \
+                                 &run_threeway<T>,                                           \
+                                 stream,                                                     \
+                                 static_cast<T>(SMALL_T),                                    \
+                                 static_cast<T>(LARGE_T),                                    \
+                                 size)
+
+#define BENCHMARK_FLAGGED_TYPE(type, flag_type)                                                   \
+    CREATE_BENCHMARK_FLAGGED(type, flag_type, 33), CREATE_BENCHMARK_FLAGGED(type, flag_type, 50), \
+        CREATE_BENCHMARK_FLAGGED(type, flag_type, 60),                                            \
+        CREATE_BENCHMARK_FLAGGED(type, flag_type, 90)
+
+#define BENCHMARK_PREDICATE_TYPE(type)                                          \
+    CREATE_BENCHMARK_PREDICATE(type, 33), CREATE_BENCHMARK_PREDICATE(type, 50), \
+        CREATE_BENCHMARK_PREDICATE(type, 60), CREATE_BENCHMARK_PREDICATE(type, 90)
+
+#define BENCHMARK_THREEWAY_TYPE(type)                                                 \
+    CREATE_BENCHMARK_THREEWAY(type, 33, 66), CREATE_BENCHMARK_THREEWAY(type, 10, 66), \
+        CREATE_BENCHMARK_THREEWAY(type, 50, 60), CREATE_BENCHMARK_THREEWAY(type, 50, 90)
+
+int main(int argc, char* argv[])
 {
     cli::Parser parser(argc, argv);
     parser.set_optional<size_t>("size", "size", DEFAULT_N, "number of values");
@@ -397,8 +379,8 @@ int main(int argc, char *argv[])
 
     // Parse argv
     benchmark::Initialize(&argc, argv);
-    const size_t size = parser.get<size_t>("size");
-    const int trials = parser.get<int>("trials");
+    const size_t size   = parser.get<size_t>("size");
+    const int    trials = parser.get<int>("trials");
 
     std::cout << "benchmark_device_partition" << std::endl;
 
@@ -406,18 +388,17 @@ int main(int argc, char *argv[])
     const hipStream_t stream = 0; // default
     {
         hipDeviceProp_t devProp;
-        int device_id = 0;
+        int             device_id = 0;
         HIP_CHECK(hipGetDevice(&device_id));
         HIP_CHECK(hipGetDeviceProperties(&devProp, device_id));
         std::cout << "[HIP] Device name: " << devProp.name << std::endl;
     }
 
-    using custom_float2 = benchmark_utils::custom_type<float, float>;
+    using custom_float2  = benchmark_utils::custom_type<float, float>;
     using custom_double2 = benchmark_utils::custom_type<double, double>;
 
     // Add benchmarks
-    std::vector<benchmark::internal::Benchmark*> benchmarks = 
-    {
+    std::vector<benchmark::internal::Benchmark*> benchmarks = {
         BENCHMARK_FLAGGED_TYPE(int8_t, unsigned char),
         BENCHMARK_FLAGGED_TYPE(int, unsigned char),
         BENCHMARK_FLAGGED_TYPE(float, unsigned char),
diff --git a/benchmark/benchmark_device_radix_sort.cpp b/benchmark/benchmark_device_radix_sort.cpp
index 386281ab..366e62d9 100644
--- a/benchmark/benchmark_device_radix_sort.cpp
+++ b/benchmark/benchmark_device_radix_sort.cpp
@@ -9,8 +9,8 @@
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -28,12 +28,11 @@
 // HIP API
 #include "hipcub/device/device_radix_sort.hpp"
 
-
 #ifndef DEFAULT_N
 const size_t DEFAULT_N = 1024 * 1024 * 32;
 #endif
 
-const unsigned int batch_size = 10;
+const unsigned int batch_size  = 10;
 const unsigned int warmup_size = 5;
 
 template<class Key>
@@ -43,16 +42,16 @@ std::vector<Key> generate_keys(size_t size)
 
     if(std::is_floating_point<key_type>::value)
     {
-        return benchmark_utils::get_random_data<key_type>(size, (key_type)-1000, (key_type)+1000, size);
-    }
-    else
+        return benchmark_utils::get_random_data<key_type>(size,
+                                                          (key_type)-1000,
+                                                          (key_type) + 1000,
+                                                          size);
+    } else
     {
-        return benchmark_utils::get_random_data<key_type>(
-            size,
-            std::numeric_limits<key_type>::min(),
-            std::numeric_limits<key_type>::max(),
-            size
-        );
+        return benchmark_utils::get_random_data<key_type>(size,
+                                                          std::numeric_limits<key_type>::min(),
+                                                          std::numeric_limits<key_type>::max(),
+                                                          size);
     }
 }
 
@@ -132,25 +131,22 @@ auto invoke_sort_keys(void*       d_temp_storage,
 }
 
 template<class Key, bool Descending = false>
-void run_sort_keys_benchmark(benchmark::State& state,
-                             hipStream_t stream,
-                             size_t size,
+void run_sort_keys_benchmark(benchmark::State&                 state,
+                             hipStream_t                       stream,
+                             size_t                            size,
                              std::shared_ptr<std::vector<Key>> keys_input)
 {
     using key_type = Key;
-    key_type * d_keys_input;
-    key_type * d_keys_output;
+    key_type* d_keys_input;
+    key_type* d_keys_output;
     HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type)));
     HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type)));
-    HIP_CHECK(
-        hipMemcpy(
-            d_keys_input, keys_input->data(),
-            size * sizeof(key_type),
-            hipMemcpyHostToDevice
-        )
-    );
-
-    void * d_temporary_storage = nullptr;
+    HIP_CHECK(hipMemcpy(d_keys_input,
+                        keys_input->data(),
+                        size * sizeof(key_type),
+                        hipMemcpyHostToDevice));
+
+    void*  d_temporary_storage     = nullptr;
     size_t temporary_storage_bytes = 0;
     HIP_CHECK(invoke_sort_keys<Descending>(d_temporary_storage,
                                            temporary_storage_bytes,
@@ -174,7 +170,7 @@ void run_sort_keys_benchmark(benchmark::State& state,
     }
     HIP_CHECK(hipDeviceSynchronize());
 
-    for (auto _ : state)
+    for(auto _ : state)
     {
         auto start = std::chrono::high_resolution_clock::now();
 
@@ -190,8 +186,8 @@ void run_sort_keys_benchmark(benchmark::State& state,
         HIP_CHECK(hipDeviceSynchronize());
 
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
         state.SetIterationTime(elapsed_seconds.count());
     }
     state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type));
@@ -294,12 +290,12 @@ auto invoke_sort_pairs(void*       d_temp_storage,
 }
 
 template<class Key, class Value, bool Descending = false>
-void run_sort_pairs_benchmark(benchmark::State& state,
-                              hipStream_t stream,
-                              size_t size,
+void run_sort_pairs_benchmark(benchmark::State&                 state,
+                              hipStream_t                       stream,
+                              size_t                            size,
                               std::shared_ptr<std::vector<Key>> keys_input)
 {
-    using key_type = Key;
+    using key_type   = Key;
     using value_type = Value;
     std::vector<value_type> values_input(size);
     for(size_t i = 0; i < size; i++)
@@ -307,31 +303,25 @@ void run_sort_pairs_benchmark(benchmark::State& state,
         values_input[i] = value_type(i);
     }
 
-    key_type * d_keys_input;
-    key_type * d_keys_output;
+    key_type* d_keys_input;
+    key_type* d_keys_output;
     HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type)));
     HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type)));
-    HIP_CHECK(
-        hipMemcpy(
-            d_keys_input, keys_input->data(),
-            size * sizeof(key_type),
-            hipMemcpyHostToDevice
-        )
-    );
-
-    value_type * d_values_input;
-    value_type * d_values_output;
+    HIP_CHECK(hipMemcpy(d_keys_input,
+                        keys_input->data(),
+                        size * sizeof(key_type),
+                        hipMemcpyHostToDevice));
+
+    value_type* d_values_input;
+    value_type* d_values_output;
     HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type)));
     HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type)));
-    HIP_CHECK(
-        hipMemcpy(
-            d_values_input, values_input.data(),
-            size * sizeof(value_type),
-            hipMemcpyHostToDevice
-        )
-    );
-
-    void * d_temporary_storage = nullptr;
+    HIP_CHECK(hipMemcpy(d_values_input,
+                        values_input.data(),
+                        size * sizeof(value_type),
+                        hipMemcpyHostToDevice));
+
+    void*  d_temporary_storage     = nullptr;
     size_t temporary_storage_bytes = 0;
     HIP_CHECK(invoke_sort_pairs<Descending>(d_temporary_storage,
                                             temporary_storage_bytes,
@@ -359,7 +349,7 @@ void run_sort_pairs_benchmark(benchmark::State& state,
     }
     HIP_CHECK(hipDeviceSynchronize());
 
-    for (auto _ : state)
+    for(auto _ : state)
     {
         auto start = std::chrono::high_resolution_clock::now();
 
@@ -377,13 +367,12 @@ void run_sort_pairs_benchmark(benchmark::State& state,
         HIP_CHECK(hipDeviceSynchronize());
 
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
         state.SetIterationTime(elapsed_seconds.count());
     }
-    state.SetBytesProcessed(
-        state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type))
-    );
+    state.SetBytesProcessed(state.iterations() * batch_size * size
+                            * (sizeof(key_type) + sizeof(value_type)));
     state.SetItemsProcessed(state.iterations() * batch_size * size);
 
     HIP_CHECK(hipFree(d_temporary_storage));
@@ -393,45 +382,43 @@ void run_sort_pairs_benchmark(benchmark::State& state,
     HIP_CHECK(hipFree(d_values_output));
 }
 
-
-#define CREATE_SORT_KEYS_BENCHMARK(Key) \
-    { \
+#define CREATE_SORT_KEYS_BENCHMARK(Key)                                                 \
+    {                                                                                   \
         auto keys_input = std::make_shared<std::vector<Key>>(generate_keys<Key>(size)); \
-        benchmarks.push_back( \
-            benchmark::RegisterBenchmark( \
-                (std::string("sort_keys") + "<Key Type:" #Key ">").c_str(), \
-                [=](benchmark::State& state) { run_sort_keys_benchmark<Key>(state, stream, size, keys_input); } \
-            ) \
-        ); \
-        benchmarks.push_back( \
-            benchmark::RegisterBenchmark( \
-                (std::string("sort_keys") + "<" #Key ">, descending").c_str(), \
-                [=](benchmark::State& state) { run_sort_keys_benchmark<Key, true>(state, stream, size, keys_input); } \
-            ) \
-        ); \
+        benchmarks.push_back(benchmark::RegisterBenchmark(                              \
+            std::string("device_radix_sort_keys_ascending"                              \
+                        "<key_data_type:" #Key ">.")                                    \
+                .c_str(),                                                               \
+            [=](benchmark::State& state)                                                \
+            { run_sort_keys_benchmark<Key>(state, stream, size, keys_input); }));       \
+        benchmarks.push_back(benchmark::RegisterBenchmark(                              \
+            std::string("device_radix_sort_keys_descending"                             \
+                        "<key_data_type:" #Key ">.")                                    \
+                .c_str(),                                                               \
+            [=](benchmark::State& state)                                                \
+            { run_sort_keys_benchmark<Key, true>(state, stream, size, keys_input); })); \
     }
 
-#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value) \
-    { \
-        auto keys_input = std::make_shared<std::vector<Key>>(generate_keys<Key>(size)); \
-        benchmarks.push_back( \
-            benchmark::RegisterBenchmark( \
-                (std::string("sort_pairs") + "<Key Type:" #Key ",Value Type:" #Value">").c_str(), \
-                [=](benchmark::State& state) { run_sort_pairs_benchmark<Key, Value>(state, stream, size, keys_input); } \
-            ) \
-        ); \
-        benchmarks.push_back( \
-            benchmark::RegisterBenchmark( \
-                (std::string("sort_pairs") + "<" #Key ", " #Value">, descending").c_str(), \
-                [=](benchmark::State& state) { run_sort_pairs_benchmark<Key, Value, true>(state, stream, size, keys_input); } \
-            ) \
-        ); \
+#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value)                                                 \
+    {                                                                                           \
+        auto keys_input = std::make_shared<std::vector<Key>>(generate_keys<Key>(size));         \
+        benchmarks.push_back(benchmark::RegisterBenchmark(                                      \
+            std::string("device_radix_sort_pairs_ascending"                                     \
+                        "<key_data_type:" #Key ",value_data_type:" #Value ">.")                 \
+                .c_str(),                                                                       \
+            [=](benchmark::State& state)                                                        \
+            { run_sort_pairs_benchmark<Key, Value>(state, stream, size, keys_input); }));       \
+        benchmarks.push_back(benchmark::RegisterBenchmark(                                      \
+            std::string("device_radix_sort_pairs_descending"                                    \
+                        "<key_data_type:" #Key ",value_data_type:" #Value ">.")                 \
+                .c_str(),                                                                       \
+            [=](benchmark::State& state)                                                        \
+            { run_sort_pairs_benchmark<Key, Value, true>(state, stream, size, keys_input); })); \
     }
 
-
 void add_sort_keys_benchmarks(std::vector<benchmark::internal::Benchmark*>& benchmarks,
-                              hipStream_t stream,
-                              size_t size)
+                              hipStream_t                                   stream,
+                              size_t                                        size)
 {
     using custom_int_t = benchmark_utils::custom_type<int>;
     CREATE_SORT_KEYS_BENCHMARK(int)
@@ -443,11 +430,11 @@ void add_sort_keys_benchmarks(std::vector<benchmark::internal::Benchmark*>& benc
 }
 
 void add_sort_pairs_benchmarks(std::vector<benchmark::internal::Benchmark*>& benchmarks,
-                               hipStream_t stream,
-                               size_t size)
+                               hipStream_t                                   stream,
+                               size_t                                        size)
 {
-    using custom_float2 = benchmark_utils::custom_type<float, float>;
-    using custom_double2 = benchmark_utils::custom_type<double, double>;
+    using custom_float2      = benchmark_utils::custom_type<float, float>;
+    using custom_double2     = benchmark_utils::custom_type<double, double>;
     using custom_char_double = benchmark_utils::custom_type<char, double>;
     using custom_double_char = benchmark_utils::custom_type<double, char>;
     using custom_int_t       = benchmark_utils::custom_type<int>;
@@ -472,7 +459,7 @@ void add_sort_pairs_benchmarks(std::vector<benchmark::internal::Benchmark*>& ben
     CREATE_SORT_PAIRS_BENCHMARK(custom_int_t, float)
 }
 
-int main(int argc, char *argv[])
+int main(int argc, char* argv[])
 {
     cli::Parser parser(argc, argv);
     parser.set_optional<size_t>("size", "size", DEFAULT_N, "number of values");
@@ -481,15 +468,15 @@ int main(int argc, char *argv[])
 
     // Parse argv
     benchmark::Initialize(&argc, argv);
-    const size_t size = parser.get<size_t>("size");
-    const int trials = parser.get<int>("trials");
+    const size_t size   = parser.get<size_t>("size");
+    const int    trials = parser.get<int>("trials");
 
     std::cout << "benchmark_device_radix_sort" << std::endl;
 
     // HIP
-    hipStream_t stream = 0; // default
+    hipStream_t     stream = 0; // default
     hipDeviceProp_t devProp;
-    int device_id = 0;
+    int             device_id = 0;
     HIP_CHECK(hipGetDevice(&device_id));
     HIP_CHECK(hipGetDeviceProperties(&devProp, device_id));
     std::cout << "[HIP] Device name: " << devProp.name << std::endl;
diff --git a/benchmark/benchmark_device_reduce.cpp b/benchmark/benchmark_device_reduce.cpp
index a58ea8dc..2a4d9df7 100644
--- a/benchmark/benchmark_device_reduce.cpp
+++ b/benchmark/benchmark_device_reduce.cpp
@@ -9,8 +9,8 @@
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -25,45 +25,34 @@
 // HIP API
 #include "hipcub/device/device_reduce.hpp"
 
-
 #ifndef DEFAULT_N
 const size_t DEFAULT_N = 1024 * 1024 * 128;
 #endif
 
-const unsigned int batch_size = 10;
+const unsigned int batch_size  = 10;
 const unsigned int warmup_size = 5;
 
-template<
-    class T,
-    class OutputT,
-    class ReduceKernel
->
+template<class T, class OutputT, class ReduceKernel>
 void run_benchmark(benchmark::State& state,
-                   size_t size,
+                   size_t            size,
                    const hipStream_t stream,
-                   ReduceKernel reduce)
+                   ReduceKernel      reduce)
 {
     std::vector<T> input = benchmark_utils::get_random_data<T>(size, T(0), T(1000));
 
-    T * d_input;
-    OutputT * d_output;
+    T*       d_input;
+    OutputT* d_output;
     HIP_CHECK(hipMalloc(&d_input, size * sizeof(T)));
     HIP_CHECK(hipMalloc(&d_output, sizeof(OutputT)));
-    HIP_CHECK(
-        hipMemcpy(
-            d_input, input.data(),
-            size * sizeof(T),
-            hipMemcpyHostToDevice
-        )
-    );
+    HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice));
     HIP_CHECK(hipDeviceSynchronize());
 
     // Allocate temporary storage memory
     size_t temp_storage_size_bytes = 0;
-    void * d_temp_storage = nullptr;
+    void*  d_temp_storage          = nullptr;
     // Get size of d_temp_storage
     HIP_CHECK(reduce(d_temp_storage, temp_storage_size_bytes, d_input, d_output, size, stream));
-    HIP_CHECK(hipMalloc(&d_temp_storage,temp_storage_size_bytes));
+    HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes));
     HIP_CHECK(hipDeviceSynchronize());
     for(size_t i = 0; i < warmup_size; i++)
     {
@@ -83,8 +72,8 @@ void run_benchmark(benchmark::State& state,
         HIP_CHECK(hipStreamSynchronize(stream));
 
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
         state.SetIterationTime(elapsed_seconds.count());
     }
     state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T));
@@ -99,7 +88,8 @@ template<typename T, typename Op>
 struct Benchmark;
 
 template<typename T>
-struct Benchmark<T, hipcub::Sum> {
+struct Benchmark<T, hipcub::Sum>
+{
     static void run(benchmark::State& state, size_t size, const hipStream_t stream)
     {
         hipError_t (*ptr_to_sum)(void*, size_t&, T*, T*, int, hipStream_t)
@@ -109,7 +99,8 @@ struct Benchmark<T, hipcub::Sum> {
 };
 
 template<typename T>
-struct Benchmark<T, hipcub::Min> {
+struct Benchmark<T, hipcub::Min>
+{
     static void run(benchmark::State& state, size_t size, const hipStream_t stream)
     {
         hipError_t (*ptr_to_min)(void*, size_t&, T*, T*, int, hipStream_t)
@@ -119,10 +110,11 @@ struct Benchmark<T, hipcub::Min> {
 };
 
 template<typename T>
-struct Benchmark<T, hipcub::ArgMin> {
+struct Benchmark<T, hipcub::ArgMin>
+{
     using Difference = int;
-    using Iterator = typename hipcub::ArgIndexInputIterator<T*, Difference>;
-    using KeyValue = typename Iterator::value_type;
+    using Iterator   = typename hipcub::ArgIndexInputIterator<T*, Difference>;
+    using KeyValue   = typename Iterator::value_type;
 
     static void run(benchmark::State& state, size_t size, const hipStream_t stream)
     {
@@ -132,20 +124,20 @@ struct Benchmark<T, hipcub::ArgMin> {
     }
 };
 
-#define CREATE_BENCHMARK(T, REDUCE_OP) \
-benchmark::RegisterBenchmark( \
-    ("reduce<Datatype:" #T ",Op:" #REDUCE_OP ">"), \
-    &Benchmark<T, REDUCE_OP>::run, size, stream \
-)
+#define CREATE_BENCHMARK(T, REDUCE_OP)                                                \
+    benchmark::RegisterBenchmark(std::string("device_reduce"                          \
+                                             "<data_type:" #T ",op:" #REDUCE_OP ">.") \
+                                     .c_str(),                                        \
+                                 &Benchmark<T, REDUCE_OP>::run,                       \
+                                 size,                                                \
+                                 stream)
 
-#define CREATE_BENCHMARKS(REDUCE_OP) \
-    CREATE_BENCHMARK(int, REDUCE_OP), \
-    CREATE_BENCHMARK(long long, REDUCE_OP), \
-    CREATE_BENCHMARK(float, REDUCE_OP), \
-    CREATE_BENCHMARK(double, REDUCE_OP), \
-    CREATE_BENCHMARK(int8_t, REDUCE_OP)
+#define CREATE_BENCHMARKS(REDUCE_OP)                                             \
+    CREATE_BENCHMARK(int, REDUCE_OP), CREATE_BENCHMARK(long long, REDUCE_OP),    \
+        CREATE_BENCHMARK(float, REDUCE_OP), CREATE_BENCHMARK(double, REDUCE_OP), \
+        CREATE_BENCHMARK(int8_t, REDUCE_OP)
 
-int main(int argc, char *argv[])
+int main(int argc, char* argv[])
 {
     cli::Parser parser(argc, argv);
     parser.set_optional<size_t>("size", "size", DEFAULT_N, "number of values");
@@ -154,15 +146,15 @@ int main(int argc, char *argv[])
 
     // Parse argv
     benchmark::Initialize(&argc, argv);
-    const size_t size = parser.get<size_t>("size");
-    const int trials = parser.get<int>("trials");
+    const size_t size   = parser.get<size_t>("size");
+    const int    trials = parser.get<int>("trials");
 
     std::cout << "benchmark_device_reduce" << std::endl;
 
     // HIP
-    hipStream_t stream = 0; // default
+    hipStream_t     stream = 0; // default
     hipDeviceProp_t devProp;
-    int device_id = 0;
+    int             device_id = 0;
     HIP_CHECK(hipGetDevice(&device_id));
     HIP_CHECK(hipGetDeviceProperties(&devProp, device_id));
     std::cout << "[HIP] Device name: " << devProp.name << std::endl;
@@ -170,18 +162,17 @@ int main(int argc, char *argv[])
     using custom_double2 = benchmark_utils::custom_type<double, double>;
 
     // Add benchmarks
-    std::vector<benchmark::internal::Benchmark*> benchmarks =
-    {
+    std::vector<benchmark::internal::Benchmark*> benchmarks = {
         CREATE_BENCHMARKS(hipcub::Sum),
         CREATE_BENCHMARK(custom_double2, hipcub::Sum),
         CREATE_BENCHMARKS(hipcub::Min),
-        #ifdef HIPCUB_ROCPRIM_API
+#ifdef HIPCUB_ROCPRIM_API
         CREATE_BENCHMARK(custom_double2, hipcub::Min),
-        #endif
+#endif
         CREATE_BENCHMARKS(hipcub::ArgMin),
-        #ifdef HIPCUB_ROCPRIM_API
+#ifdef HIPCUB_ROCPRIM_API
         CREATE_BENCHMARK(custom_double2, hipcub::ArgMin),
-        #endif
+#endif
     };
 
     // Use manual timing
diff --git a/benchmark/benchmark_device_reduce_by_key.cpp b/benchmark/benchmark_device_reduce_by_key.cpp
index 7437383f..54209e65 100644
--- a/benchmark/benchmark_device_reduce_by_key.cpp
+++ b/benchmark/benchmark_device_reduce_by_key.cpp
@@ -2,15 +2,15 @@
 //
 // Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
 //
-    // Permission is hereby granted, free of charge, to any person obtaining a copy
+// Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
 // in the Software without restriction, including without limitation the rights
 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -20,8 +20,8 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 // SOFTWARE.
 
-// CUB's implementation of single_pass_scan_operators has maybe uninitialized parameters,
-// disable the warning because all warnings are threated as errors:
+// CUB's implementation of single_pass_scan_operators has maybe uninitialized
+// parameters, disable the warning because all warnings are treated as errors:
 #ifdef __HIP_PLATFORM_NVIDIA__
     #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #endif
@@ -35,25 +35,30 @@
 const size_t DEFAULT_N = 1024 * 1024 * 32;
 #endif
 
-const unsigned int batch_size = 10;
+const unsigned int batch_size  = 10;
 const unsigned int warmup_size = 5;
 
 template<class Key, class Value, class BinaryFunction>
-void run_benchmark(benchmark::State& state, size_t max_length, hipStream_t stream, size_t size, BinaryFunction reduce_op)
+void run_benchmark(benchmark::State& state,
+                   size_t            max_length,
+                   hipStream_t       stream,
+                   size_t            size,
+                   BinaryFunction    reduce_op)
 {
-    using key_type = Key;
+    using key_type   = Key;
     using value_type = Value;
 
     // Generate data
     std::vector<key_type> keys_input(size);
 
-    unsigned int unique_count = 0;
-    std::vector<size_t> key_counts = benchmark_utils::get_random_data<size_t>(100000, 1, max_length);
+    unsigned int        unique_count = 0;
+    std::vector<size_t> key_counts
+        = benchmark_utils::get_random_data<size_t>(100000, 1, max_length);
     size_t offset = 0;
     while(offset < size)
     {
         const size_t key_count = key_counts[unique_count % key_counts.size()];
-        const size_t end = std::min(size, offset + key_count);
+        const size_t end       = std::min(size, offset + key_count);
         for(size_t i = offset; i < end; i++)
         {
             keys_input[i] = unique_count;
@@ -66,46 +71,38 @@ void run_benchmark(benchmark::State& state, size_t max_length, hipStream_t strea
     std::vector<value_type> values_input(size);
     std::iota(values_input.begin(), values_input.end(), 0);
 
-    key_type * d_keys_input;
+    key_type* d_keys_input;
     HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type)));
     HIP_CHECK(
-        hipMemcpy(
-            d_keys_input, keys_input.data(),
-            size * sizeof(key_type),
-            hipMemcpyHostToDevice
-        )
-    );
-
-    value_type * d_values_input;
+        hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice));
+
+    value_type* d_values_input;
     HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type)));
-    HIP_CHECK(
-        hipMemcpy(
-            d_values_input, values_input.data(),
-            size * sizeof(value_type),
-            hipMemcpyHostToDevice
-        )
-    );
-
-    key_type * d_unique_output;
-    value_type * d_aggregates_output;
-    unsigned int * d_unique_count_output;
+    HIP_CHECK(hipMemcpy(d_values_input,
+                        values_input.data(),
+                        size * sizeof(value_type),
+                        hipMemcpyHostToDevice));
+
+    key_type*     d_unique_output;
+    value_type*   d_aggregates_output;
+    unsigned int* d_unique_count_output;
     HIP_CHECK(hipMalloc(&d_unique_output, unique_count * sizeof(key_type)));
     HIP_CHECK(hipMalloc(&d_aggregates_output, unique_count * sizeof(value_type)));
     HIP_CHECK(hipMalloc(&d_unique_count_output, sizeof(unsigned int)));
 
-    void * d_temporary_storage = nullptr;
+    void*  d_temporary_storage     = nullptr;
     size_t temporary_storage_bytes = 0;
 
-    HIP_CHECK(
-        hipcub::DeviceReduce::ReduceByKey(
-            nullptr, temporary_storage_bytes,
-            d_keys_input, d_unique_output, d_values_input,
-            d_aggregates_output,
-            d_unique_count_output,
-            reduce_op, size,
-            stream
-        )
-    );
+    HIP_CHECK(hipcub::DeviceReduce::ReduceByKey(nullptr,
+                                                temporary_storage_bytes,
+                                                d_keys_input,
+                                                d_unique_output,
+                                                d_values_input,
+                                                d_aggregates_output,
+                                                d_unique_count_output,
+                                                reduce_op,
+                                                size,
+                                                stream));
 
     HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes));
     HIP_CHECK(hipDeviceSynchronize());
@@ -113,44 +110,45 @@ void run_benchmark(benchmark::State& state, size_t max_length, hipStream_t strea
     // Warm-up
     for(size_t i = 0; i < warmup_size; i++)
     {
-        HIP_CHECK(
-            hipcub::DeviceReduce::ReduceByKey(
-                d_temporary_storage, temporary_storage_bytes,
-                d_keys_input,
-                d_unique_output, d_values_input, d_aggregates_output,
-                d_unique_count_output,
-                reduce_op, size,
-                stream
-            )
-        );
+        HIP_CHECK(hipcub::DeviceReduce::ReduceByKey(d_temporary_storage,
+                                                    temporary_storage_bytes,
+                                                    d_keys_input,
+                                                    d_unique_output,
+                                                    d_values_input,
+                                                    d_aggregates_output,
+                                                    d_unique_count_output,
+                                                    reduce_op,
+                                                    size,
+                                                    stream));
     }
     HIP_CHECK(hipDeviceSynchronize());
 
-    for (auto _ : state)
+    for(auto _ : state)
     {
         auto start = std::chrono::high_resolution_clock::now();
 
         for(size_t i = 0; i < batch_size; i++)
         {
-            HIP_CHECK(
-                hipcub::DeviceReduce::ReduceByKey(
-                    d_temporary_storage, temporary_storage_bytes,
-                    d_keys_input,
-                    d_unique_output, d_values_input, d_aggregates_output,
-                    d_unique_count_output,
-                    reduce_op, size,
-                    stream
-                )
-            );
+            HIP_CHECK(hipcub::DeviceReduce::ReduceByKey(d_temporary_storage,
+                                                        temporary_storage_bytes,
+                                                        d_keys_input,
+                                                        d_unique_output,
+                                                        d_values_input,
+                                                        d_aggregates_output,
+                                                        d_unique_count_output,
+                                                        reduce_op,
+                                                        size,
+                                                        stream));
         }
         HIP_CHECK(hipStreamSynchronize(stream));
 
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
         state.SetIterationTime(elapsed_seconds.count());
     }
-    state.SetBytesProcessed(state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type)));
+    state.SetBytesProcessed(state.iterations() * batch_size * size
+                            * (sizeof(key_type) + sizeof(value_type)));
     state.SetItemsProcessed(state.iterations() * batch_size * size);
 
     HIP_CHECK(hipFree(d_temporary_storage));
@@ -161,44 +159,46 @@ void run_benchmark(benchmark::State& state, size_t max_length, hipStream_t strea
     HIP_CHECK(hipFree(d_unique_count_output));
 }
 
-#define CREATE_BENCHMARK(Key, Value, REDUCE_OP) \
-benchmark::RegisterBenchmark( \
-    (std::string("reduce_by_key") + "<Key Type:" #Key ", Value:Type" #Value ", ReduceOp:" #REDUCE_OP ">" + \
-        "(Random Number Range:[1, " + std::to_string(max_length) + "])" \
-    ).c_str(), \
-    &run_benchmark<Key, Value, REDUCE_OP>, \
-    max_length, stream, size, REDUCE_OP() \
-)
-
-#define CREATE_BENCHMARKS(REDUCE_OP) \
-    CREATE_BENCHMARK(int, float, REDUCE_OP), \
-    CREATE_BENCHMARK(int, double, REDUCE_OP), \
-    CREATE_BENCHMARK(int, custom_double2, REDUCE_OP), \
-    CREATE_BENCHMARK(int8_t, int8_t, REDUCE_OP), \
-    CREATE_BENCHMARK(long long, float, REDUCE_OP), \
-    CREATE_BENCHMARK(long long, double, REDUCE_OP)
-
-void add_benchmarks(size_t max_length,
+#define CREATE_BENCHMARK(Key, Value, REDUCE_OP)                                                \
+    benchmark::RegisterBenchmark(std::string("device_reduce_by_key"                            \
+                                             "<key_data_type:" #Key ",value_data_type:" #Value \
+                                             ",reduce_op:" #REDUCE_OP ">."                     \
+                                             "(random_number_range:[1, "                       \
+                                             + std::to_string(max_length) + "])")              \
+                                     .c_str(),                                                 \
+                                 &run_benchmark<Key, Value, REDUCE_OP>,                        \
+                                 max_length,                                                   \
+                                 stream,                                                       \
+                                 size,                                                         \
+                                 REDUCE_OP())
+
+#define CREATE_BENCHMARKS(REDUCE_OP)                                                   \
+    CREATE_BENCHMARK(int, float, REDUCE_OP), CREATE_BENCHMARK(int, double, REDUCE_OP), \
+        CREATE_BENCHMARK(int, custom_double2, REDUCE_OP),                              \
+        CREATE_BENCHMARK(int8_t, int8_t, REDUCE_OP),                                   \
+        CREATE_BENCHMARK(long long, float, REDUCE_OP),                                 \
+        CREATE_BENCHMARK(long long, double, REDUCE_OP)
+
+void add_benchmarks(size_t                                        max_length,
                     std::vector<benchmark::internal::Benchmark*>& benchmarks,
-                    hipStream_t stream,
-                    size_t size)
+                    hipStream_t                                   stream,
+                    size_t                                        size)
 {
     using custom_double2 = benchmark_utils::custom_type<double, double>;
 
-    std::vector<benchmark::internal::Benchmark*> bs =
-    {
+    std::vector<benchmark::internal::Benchmark*> bs = {
         CREATE_BENCHMARKS(hipcub::Sum),
         CREATE_BENCHMARK(long long, custom_double2, hipcub::Sum),
         CREATE_BENCHMARKS(hipcub::Min),
-        #ifdef HIPCUB_ROCPRIM_API
+#ifdef HIPCUB_ROCPRIM_API
         CREATE_BENCHMARK(long long, custom_double2, hipcub::Min),
-        #endif
+#endif
     };
 
     benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
 }
 
-int main(int argc, char *argv[])
+int main(int argc, char* argv[])
 {
     cli::Parser parser(argc, argv);
     parser.set_optional<size_t>("size", "size", DEFAULT_N, "number of values");
@@ -207,15 +207,15 @@ int main(int argc, char *argv[])
 
     // Parse argv
     benchmark::Initialize(&argc, argv);
-    const size_t size = parser.get<size_t>("size");
-    const int trials = parser.get<int>("trials");
+    const size_t size   = parser.get<size_t>("size");
+    const int    trials = parser.get<int>("trials");
 
     std::cout << "benchmark_device_reduce_by_key" << std::endl;
 
     // HIP
-    hipStream_t stream = 0; // default
+    hipStream_t     stream = 0; // default
     hipDeviceProp_t devProp;
-    int device_id = 0;
+    int             device_id = 0;
     HIP_CHECK(hipGetDevice(&device_id));
     HIP_CHECK(hipGetDeviceProperties(&devProp, device_id));
     std::cout << "[HIP] Device name: " << devProp.name << std::endl;
diff --git a/benchmark/benchmark_device_run_length_encode.cpp b/benchmark/benchmark_device_run_length_encode.cpp
index 8a20433d..267185c7 100644
--- a/benchmark/benchmark_device_run_length_encode.cpp
+++ b/benchmark/benchmark_device_run_length_encode.cpp
@@ -9,8 +9,8 @@
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -31,27 +31,30 @@
 // HIP API
 #include "hipcub/device/device_run_length_encode.hpp"
 
-
 #ifndef DEFAULT_N
 const size_t DEFAULT_N = 1024 * 1024 * 32;
 #endif
 
 template<class T>
-void run_encode_benchmark(benchmark::State& state, size_t max_length, hipStream_t stream, size_t size)
+void run_encode_benchmark(benchmark::State& state,
+                          size_t            max_length,
+                          hipStream_t       stream,
+                          size_t            size)
 {
-    using key_type = T;
+    using key_type   = T;
     using count_type = unsigned int;
 
     // Generate data
     std::vector<key_type> input(size);
 
-    unsigned int runs_count = 0;
-    std::vector<size_t> key_counts = benchmark_utils::get_random_data<size_t>(100000, 1, max_length);
+    unsigned int        runs_count = 0;
+    std::vector<size_t> key_counts
+        = benchmark_utils::get_random_data<size_t>(100000, 1, max_length);
     size_t offset = 0;
     while(offset < size)
     {
         const size_t key_count = key_counts[runs_count % key_counts.size()];
-        const size_t end = std::min(size, offset + key_count);
+        const size_t end       = std::min(size, offset + key_count);
         for(size_t i = offset; i < end; i++)
         {
             input[i] = runs_count;
@@ -61,24 +64,18 @@ void run_encode_benchmark(benchmark::State& state, size_t max_length, hipStream_
         offset += key_count;
     }
 
-    key_type * d_input;
+    key_type* d_input;
     HIP_CHECK(hipMalloc(&d_input, size * sizeof(key_type)));
-    HIP_CHECK(
-        hipMemcpy(
-            d_input, input.data(),
-            size * sizeof(key_type),
-            hipMemcpyHostToDevice
-        )
-    );
-
-    key_type * d_unique_output;
-    count_type * d_counts_output;
-    count_type * d_runs_count_output;
+    HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(key_type), hipMemcpyHostToDevice));
+
+    key_type*   d_unique_output;
+    count_type* d_counts_output;
+    count_type* d_runs_count_output;
     HIP_CHECK(hipMalloc(&d_unique_output, runs_count * sizeof(key_type)));
     HIP_CHECK(hipMalloc(&d_counts_output, runs_count * sizeof(count_type)));
     HIP_CHECK(hipMalloc(&d_runs_count_output, sizeof(count_type)));
 
-    void * d_temporary_storage = nullptr;
+    void*  d_temporary_storage     = nullptr;
     size_t temporary_storage_bytes = 0;
 
     HIP_CHECK(hipcub::DeviceRunLengthEncode::Encode(nullptr,
@@ -108,7 +105,7 @@ void run_encode_benchmark(benchmark::State& state, size_t max_length, hipStream_
     HIP_CHECK(hipDeviceSynchronize());
 
     const unsigned int batch_size = 10;
-    for (auto _ : state)
+    for(auto _ : state)
     {
         auto start = std::chrono::high_resolution_clock::now();
 
@@ -126,8 +123,8 @@ void run_encode_benchmark(benchmark::State& state, size_t max_length, hipStream_
         HIP_CHECK(hipStreamSynchronize(stream));
 
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
         state.SetIterationTime(elapsed_seconds.count());
     }
     state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type));
@@ -141,22 +138,26 @@ void run_encode_benchmark(benchmark::State& state, size_t max_length, hipStream_
 }
 
 template<class T>
-void run_non_trivial_runs_benchmark(benchmark::State& state, size_t max_length, hipStream_t stream, size_t size)
+void run_non_trivial_runs_benchmark(benchmark::State& state,
+                                    size_t            max_length,
+                                    hipStream_t       stream,
+                                    size_t            size)
 {
-    using key_type = T;
+    using key_type    = T;
     using offset_type = unsigned int;
-    using count_type = unsigned int;
+    using count_type  = unsigned int;
 
     // Generate data
     std::vector<key_type> input(size);
 
-    unsigned int runs_count = 0;
-    std::vector<size_t> key_counts = benchmark_utils::get_random_data<size_t>(100000, 1, max_length);
+    unsigned int        runs_count = 0;
+    std::vector<size_t> key_counts
+        = benchmark_utils::get_random_data<size_t>(100000, 1, max_length);
     size_t offset = 0;
     while(offset < size)
     {
         const size_t key_count = key_counts[runs_count % key_counts.size()];
-        const size_t end = std::min(size, offset + key_count);
+        const size_t end       = std::min(size, offset + key_count);
         for(size_t i = offset; i < end; i++)
         {
             input[i] = runs_count;
@@ -166,24 +167,18 @@ void run_non_trivial_runs_benchmark(benchmark::State& state, size_t max_length,
         offset += key_count;
     }
 
-    key_type * d_input;
+    key_type* d_input;
     HIP_CHECK(hipMalloc(&d_input, size * sizeof(key_type)));
-    HIP_CHECK(
-        hipMemcpy(
-            d_input, input.data(),
-            size * sizeof(key_type),
-            hipMemcpyHostToDevice
-        )
-    );
-
-    offset_type * d_offsets_output;
-    count_type * d_counts_output;
-    count_type * d_runs_count_output;
+    HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(key_type), hipMemcpyHostToDevice));
+
+    offset_type* d_offsets_output;
+    count_type*  d_counts_output;
+    count_type*  d_runs_count_output;
     HIP_CHECK(hipMalloc(&d_offsets_output, runs_count * sizeof(offset_type)));
     HIP_CHECK(hipMalloc(&d_counts_output, runs_count * sizeof(count_type)));
     HIP_CHECK(hipMalloc(&d_runs_count_output, sizeof(count_type)));
 
-    void * d_temporary_storage = nullptr;
+    void*  d_temporary_storage     = nullptr;
     size_t temporary_storage_bytes = 0;
 
     HIP_CHECK(hipcub::DeviceRunLengthEncode::NonTrivialRuns(nullptr,
@@ -213,7 +208,7 @@ void run_non_trivial_runs_benchmark(benchmark::State& state, size_t max_length,
     HIP_CHECK(hipDeviceSynchronize());
 
     const unsigned int batch_size = 10;
-    for (auto _ : state)
+    for(auto _ : state)
     {
         auto start = std::chrono::high_resolution_clock::now();
 
@@ -231,8 +226,8 @@ void run_non_trivial_runs_benchmark(benchmark::State& state, size_t max_length,
         HIP_CHECK(hipStreamSynchronize(stream));
 
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
         state.SetIterationTime(elapsed_seconds.count());
     }
     state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type));
@@ -245,25 +240,26 @@ void run_non_trivial_runs_benchmark(benchmark::State& state, size_t max_length,
     HIP_CHECK(hipFree(d_runs_count_output));
 }
 
-#define CREATE_ENCODE_BENCHMARK(T) \
-benchmark::RegisterBenchmark( \
-    (std::string("run_length_encode") + "<Datatype:" #T ">" + \
-        "(Random Number Range:[1, " + std::to_string(max_length) + "])" \
-    ).c_str(), \
-    &run_encode_benchmark<T>, \
-    max_length, stream, size \
-)
-
-void add_encode_benchmarks(size_t max_length,
+#define CREATE_ENCODE_BENCHMARK(T)                                                \
+    benchmark::RegisterBenchmark(std::string("device_run_length_encode"           \
+                                             "<data_type:" #T ">."                \
+                                             "(random_number_range:[1, "          \
+                                             + std::to_string(max_length) + "])") \
+                                     .c_str(),                                    \
+                                 &run_encode_benchmark<T>,                        \
+                                 max_length,                                      \
+                                 stream,                                          \
+                                 size)
+
+void add_encode_benchmarks(size_t                                        max_length,
                            std::vector<benchmark::internal::Benchmark*>& benchmarks,
-                           hipStream_t stream,
-                           size_t size)
+                           hipStream_t                                   stream,
+                           size_t                                        size)
 {
-    using custom_float2 = benchmark_utils::custom_type<float, float>;
+    using custom_float2  = benchmark_utils::custom_type<float, float>;
     using custom_double2 = benchmark_utils::custom_type<double, double>;
 
-    std::vector<benchmark::internal::Benchmark*> bs =
-    {
+    std::vector<benchmark::internal::Benchmark*> bs = {
         CREATE_ENCODE_BENCHMARK(int),
         CREATE_ENCODE_BENCHMARK(long long),
 
@@ -277,25 +273,26 @@ void add_encode_benchmarks(size_t max_length,
     benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
 }
 
-#define CREATE_NON_TRIVIAL_RUNS_BENCHMARK(T) \
-benchmark::RegisterBenchmark( \
-    (std::string("run_length_encode_non_trivial_runs") + "<Datatype:" #T ">" + \
-        "(Random Number Range:[1, " + std::to_string(max_length) + "])" \
-    ).c_str(), \
-    &run_non_trivial_runs_benchmark<T>, \
-    max_length, stream, size \
-)
-
-void add_non_trivial_runs_benchmarks(size_t max_length,
+#define CREATE_NON_TRIVIAL_RUNS_BENCHMARK(T)                                             \
+    benchmark::RegisterBenchmark(std::string("device_run_length_encode_non_trivial_runs" \
+                                             "<data_type:" #T ">."                       \
+                                             "(random_number_range:[1, "                 \
+                                             + std::to_string(max_length) + "])")        \
+                                     .c_str(),                                           \
+                                 &run_non_trivial_runs_benchmark<T>,                     \
+                                 max_length,                                             \
+                                 stream,                                                 \
+                                 size)
+
+void add_non_trivial_runs_benchmarks(size_t                                        max_length,
                                      std::vector<benchmark::internal::Benchmark*>& benchmarks,
-                                     hipStream_t stream,
-                                     size_t size)
+                                     hipStream_t                                   stream,
+                                     size_t                                        size)
 {
-    using custom_float2 = benchmark_utils::custom_type<float, float>;
+    using custom_float2  = benchmark_utils::custom_type<float, float>;
     using custom_double2 = benchmark_utils::custom_type<double, double>;
 
-    std::vector<benchmark::internal::Benchmark*> bs =
-    {
+    std::vector<benchmark::internal::Benchmark*> bs = {
         CREATE_NON_TRIVIAL_RUNS_BENCHMARK(int),
         CREATE_NON_TRIVIAL_RUNS_BENCHMARK(long long),
 
@@ -309,7 +306,7 @@ void add_non_trivial_runs_benchmarks(size_t max_length,
     benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
 }
 
-int main(int argc, char *argv[])
+int main(int argc, char* argv[])
 {
     cli::Parser parser(argc, argv);
     parser.set_optional<size_t>("size", "size", DEFAULT_N, "number of values");
@@ -318,15 +315,15 @@ int main(int argc, char *argv[])
 
     // Parse argv
     benchmark::Initialize(&argc, argv);
-    const size_t size = parser.get<size_t>("size");
-    const int trials = parser.get<int>("trials");
+    const size_t size   = parser.get<size_t>("size");
+    const int    trials = parser.get<int>("trials");
 
     std::cout << "benchmark_device_run_length_encode" << std::endl;
 
     // HIP
-    hipStream_t stream = 0; // default
+    hipStream_t     stream = 0; // default
     hipDeviceProp_t devProp;
-    int device_id = 0;
+    int             device_id = 0;
     HIP_CHECK(hipGetDevice(&device_id));
     HIP_CHECK(hipGetDeviceProperties(&devProp, device_id));
     std::cout << "[HIP] Device name: " << devProp.name << std::endl;
diff --git a/benchmark/benchmark_device_scan.cpp b/benchmark/benchmark_device_scan.cpp
index 897f5eec..dbfdda6a 100644
--- a/benchmark/benchmark_device_scan.cpp
+++ b/benchmark/benchmark_device_scan.cpp
@@ -9,8 +9,8 @@
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -20,8 +20,8 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 // SOFTWARE
 
-// CUB's implementation of single_pass_scan_operators has maybe uninitialized parameters,
-// disable the warning because all warnings are threated as errors:
+// CUB's implementation of single_pass_scan_operators has maybe uninitialized
+// parameters, disable the warning because all warnings are treated as errors:
 #ifdef __HIP_PLATFORM_NVIDIA__
     #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #endif
@@ -31,7 +31,6 @@
 // HIP API
 #include "hipcub/device/device_scan.hpp"
 
-
 #ifndef DEFAULT_N
 const size_t DEFAULT_N = 1024 * 1024 * 32;
 #endif
@@ -68,7 +67,7 @@ auto run_device_scan(void*             temporary_storage,
                      const hipStream_t stream) ->
     typename std::enable_if<!Exclusive, hipError_t>::type
 {
-    (void) initial_value;
+    (void)initial_value;
     return hipcub::DeviceScan::InclusiveScan(temporary_storage,
                                              storage_size,
                                              input,
@@ -125,55 +124,47 @@ auto run_device_scan_by_key(void*   temporary_storage,
                                                   stream);
 }
 
-template<
-    bool Exclusive,
-    class T,
-    class BinaryFunction
->
+template<bool Exclusive, class T, class BinaryFunction>
 void run_benchmark(benchmark::State& state,
-                   size_t size,
+                   size_t            size,
                    const hipStream_t stream,
-                   BinaryFunction scan_op)
+                   BinaryFunction    scan_op)
 {
-    std::vector<T> input = benchmark_utils::get_random_data<T>(size, T(0), T(1000));
-    T initial_value = T(123);
-    T * d_input;
-    T * d_output;
+    std::vector<T> input         = benchmark_utils::get_random_data<T>(size, T(0), T(1000));
+    T              initial_value = T(123);
+    T*             d_input;
+    T*             d_output;
     HIP_CHECK(hipMalloc(&d_input, size * sizeof(T)));
     HIP_CHECK(hipMalloc(&d_output, size * sizeof(T)));
-    HIP_CHECK(
-        hipMemcpy(
-            d_input, input.data(),
-            size * sizeof(T),
-            hipMemcpyHostToDevice
-        )
-    );
+    HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice));
     HIP_CHECK(hipDeviceSynchronize());
 
     // Allocate temporary storage memory
     size_t temp_storage_size_bytes = 0;
-    void * d_temp_storage = nullptr;
+    void*  d_temp_storage          = nullptr;
     // Get size of d_temp_storage
-    HIP_CHECK((
-        run_device_scan<Exclusive>(
-            d_temp_storage, temp_storage_size_bytes,
-            d_input, d_output, initial_value, size,
-            scan_op, stream
-        )
-    ));
-    HIP_CHECK(hipMalloc(&d_temp_storage,temp_storage_size_bytes));
+    HIP_CHECK((run_device_scan<Exclusive>(d_temp_storage,
+                                          temp_storage_size_bytes,
+                                          d_input,
+                                          d_output,
+                                          initial_value,
+                                          size,
+                                          scan_op,
+                                          stream)));
+    HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes));
     HIP_CHECK(hipDeviceSynchronize());
 
     // Warm-up
     for(size_t i = 0; i < 5; i++)
     {
-        HIP_CHECK((
-            run_device_scan<Exclusive>(
-                d_temp_storage, temp_storage_size_bytes,
-                d_input, d_output, initial_value, size,
-                scan_op, stream
-            )
-        ));
+        HIP_CHECK((run_device_scan<Exclusive>(d_temp_storage,
+                                              temp_storage_size_bytes,
+                                              d_input,
+                                              d_output,
+                                              initial_value,
+                                              size,
+                                              scan_op,
+                                              stream)));
     }
     HIP_CHECK(hipDeviceSynchronize());
 
@@ -183,19 +174,20 @@ void run_benchmark(benchmark::State& state,
         auto start = std::chrono::high_resolution_clock::now();
         for(size_t i = 0; i < batch_size; i++)
         {
-            HIP_CHECK((
-                run_device_scan<Exclusive>(
-                    d_temp_storage, temp_storage_size_bytes,
-                    d_input, d_output, initial_value, size,
-                    scan_op, stream
-                )
-            ));
+            HIP_CHECK((run_device_scan<Exclusive>(d_temp_storage,
+                                                  temp_storage_size_bytes,
+                                                  d_input,
+                                                  d_output,
+                                                  initial_value,
+                                                  size,
+                                                  scan_op,
+                                                  stream)));
         }
         HIP_CHECK(hipStreamSynchronize(stream));
 
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
         state.SetIterationTime(elapsed_seconds.count());
     }
     state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T));
@@ -206,70 +198,59 @@ void run_benchmark(benchmark::State& state,
     HIP_CHECK(hipFree(d_temp_storage));
 }
 
-template<
-    bool Exclusive,
-    class T,
-    class BinaryFunction
->
+template<bool Exclusive, class T, class BinaryFunction>
 void run_benchmark_by_key(benchmark::State& state,
-                          size_t size,
+                          size_t            size,
                           const hipStream_t stream,
-                          BinaryFunction scan_op)
+                          BinaryFunction    scan_op)
 {
-    using key_type = int;
+    using key_type                      = int;
     constexpr size_t max_segment_length = 100;
 
-    const std::vector<key_type> keys = benchmark_utils::get_random_segments<key_type>(
-        size, max_segment_length, std::random_device{}()
-    );
-    const std::vector<T> input = benchmark_utils::get_random_data<T>(size, T(0), T(1000));
-    const T initial_value = T(123);
-    key_type * d_keys;
-    T * d_input;
-    T * d_output;
+    const std::vector<key_type> keys
+        = benchmark_utils::get_random_segments<key_type>(size,
+                                                         max_segment_length,
+                                                         std::random_device{}());
+    const std::vector<T> input         = benchmark_utils::get_random_data<T>(size, T(0), T(1000));
+    const T              initial_value = T(123);
+    key_type*            d_keys;
+    T*                   d_input;
+    T*                   d_output;
     HIP_CHECK(hipMalloc(&d_keys, size * sizeof(key_type)));
     HIP_CHECK(hipMalloc(&d_input, size * sizeof(T)));
     HIP_CHECK(hipMalloc(&d_output, size * sizeof(T)));
-    HIP_CHECK(
-        hipMemcpy(
-            d_keys, keys.data(),
-            size * sizeof(key_type),
-            hipMemcpyHostToDevice
-        )
-    );
-    HIP_CHECK(
-        hipMemcpy(
-            d_input, input.data(),
-            size * sizeof(T),
-            hipMemcpyHostToDevice
-        )
-    );
+    HIP_CHECK(hipMemcpy(d_keys, keys.data(), size * sizeof(key_type), hipMemcpyHostToDevice));
+    HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice));
     HIP_CHECK(hipDeviceSynchronize());
 
     // Allocate temporary storage memory
     size_t temp_storage_size_bytes = 0;
-    void * d_temp_storage = nullptr;
+    void*  d_temp_storage          = nullptr;
     // Get size of d_temp_storage
-    HIP_CHECK((
-        run_device_scan_by_key<Exclusive>(
-            d_temp_storage, temp_storage_size_bytes,
-            d_keys, d_input, d_output, initial_value,
-            size, scan_op, stream
-        )
-    ));
-    HIP_CHECK(hipMalloc(&d_temp_storage,temp_storage_size_bytes));
+    HIP_CHECK((run_device_scan_by_key<Exclusive>(d_temp_storage,
+                                                 temp_storage_size_bytes,
+                                                 d_keys,
+                                                 d_input,
+                                                 d_output,
+                                                 initial_value,
+                                                 size,
+                                                 scan_op,
+                                                 stream)));
+    HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes));
     HIP_CHECK(hipDeviceSynchronize());
 
     // Warm-up
     for(size_t i = 0; i < 5; i++)
     {
-        HIP_CHECK((
-            run_device_scan_by_key<Exclusive>(
-                d_temp_storage, temp_storage_size_bytes,
-                d_keys, d_input, d_output, initial_value,
-                size, scan_op, stream
-            )
-        ));
+        HIP_CHECK((run_device_scan_by_key<Exclusive>(d_temp_storage,
+                                                     temp_storage_size_bytes,
+                                                     d_keys,
+                                                     d_input,
+                                                     d_output,
+                                                     initial_value,
+                                                     size,
+                                                     scan_op,
+                                                     stream)));
     }
     HIP_CHECK(hipDeviceSynchronize());
 
@@ -279,19 +260,21 @@ void run_benchmark_by_key(benchmark::State& state,
         auto start = std::chrono::high_resolution_clock::now();
         for(size_t i = 0; i < batch_size; i++)
         {
-            HIP_CHECK((
-                run_device_scan_by_key<Exclusive>(
-                    d_temp_storage, temp_storage_size_bytes,
-                    d_keys, d_input, d_output, initial_value,
-                    size, scan_op, stream
-                )
-            ));
+            HIP_CHECK((run_device_scan_by_key<Exclusive>(d_temp_storage,
+                                                         temp_storage_size_bytes,
+                                                         d_keys,
+                                                         d_input,
+                                                         d_output,
+                                                         initial_value,
+                                                         size,
+                                                         scan_op,
+                                                         stream)));
         }
         HIP_CHECK(hipStreamSynchronize(stream));
 
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
         state.SetIterationTime(elapsed_seconds.count());
     }
     state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T));
@@ -303,37 +286,38 @@ void run_benchmark_by_key(benchmark::State& state,
     HIP_CHECK(hipFree(d_temp_storage));
 }
 
-#define CREATE_BENCHMARK(EXCL, T, SCAN_OP) \
-benchmark::RegisterBenchmark( \
-    (std::string(EXCL ? "exclusive_scan" : "inclusive_scan") + \
-    ("<Datatype:" #T ",Op:" #SCAN_OP ">")).c_str(), \
-    &run_benchmark<EXCL, T, SCAN_OP>, size, stream, SCAN_OP() \
-), \
-benchmark::RegisterBenchmark( \
-    (std::string(EXCL ? "exclusive_scan_by_key" : "inclusive_scan_by_key") + \
-    ("<Datatype:" #T ",Op:" #SCAN_OP ">")).c_str(), \
-    &run_benchmark_by_key<EXCL, T, SCAN_OP>, size, stream, SCAN_OP() \
-)
-
-#define CREATE_BENCHMARKS(SCAN_OP) \
-    CREATE_BENCHMARK(false, int, SCAN_OP), \
-    CREATE_BENCHMARK(true, int, SCAN_OP), \
-    CREATE_BENCHMARK(false, float, SCAN_OP), \
-    CREATE_BENCHMARK(true, float, SCAN_OP), \
-    CREATE_BENCHMARK(false, double, SCAN_OP), \
-    CREATE_BENCHMARK(true, double, SCAN_OP), \
-    CREATE_BENCHMARK(false, long long, SCAN_OP), \
-    CREATE_BENCHMARK(true, long long, SCAN_OP), \
-    CREATE_BENCHMARK(false, custom_float2, SCAN_OP), \
-    CREATE_BENCHMARK(true, custom_float2, SCAN_OP), \
-    CREATE_BENCHMARK(false, custom_double2, SCAN_OP), \
-    CREATE_BENCHMARK(true, custom_double2, SCAN_OP), \
-    CREATE_BENCHMARK(false, int8_t, SCAN_OP), \
-    CREATE_BENCHMARK(true, int8_t, SCAN_OP), \
-    CREATE_BENCHMARK(false, uint8_t, SCAN_OP), \
-    CREATE_BENCHMARK(true, uint8_t, SCAN_OP)
-
-int main(int argc, char *argv[])
+#define CREATE_BENCHMARK(EXCL, T, SCAN_OP)                                                \
+    benchmark::RegisterBenchmark(                                                         \
+        std::string(std::string(EXCL ? "device_exclusive_scan" : "device_inclusive_scan") \
+                    + "<data_type:" #T ",op:" #SCAN_OP ">.")                              \
+            .c_str(),                                                                     \
+        &run_benchmark<EXCL, T, SCAN_OP>,                                                 \
+        size,                                                                             \
+        stream,                                                                           \
+        SCAN_OP()),                                                                       \
+        benchmark::RegisterBenchmark(                                                     \
+            std::string(std::string(EXCL ? "device_exclusive_scan_by_key"                 \
+                                         : "device_inclusive_scan_by_key")                \
+                        + "<data_type:" #T ",op:" #SCAN_OP ">.")                          \
+                .c_str(),                                                                 \
+            &run_benchmark_by_key<EXCL, T, SCAN_OP>,                                      \
+            size,                                                                         \
+            stream,                                                                       \
+            SCAN_OP())
+
+#define CREATE_BENCHMARKS(SCAN_OP)                                                                 \
+    CREATE_BENCHMARK(false, int, SCAN_OP), CREATE_BENCHMARK(true, int, SCAN_OP),                   \
+        CREATE_BENCHMARK(false, float, SCAN_OP), CREATE_BENCHMARK(true, float, SCAN_OP),           \
+        CREATE_BENCHMARK(false, double, SCAN_OP), CREATE_BENCHMARK(true, double, SCAN_OP),         \
+        CREATE_BENCHMARK(false, long long, SCAN_OP), CREATE_BENCHMARK(true, long long, SCAN_OP),   \
+        CREATE_BENCHMARK(false, custom_float2, SCAN_OP),                                           \
+        CREATE_BENCHMARK(true, custom_float2, SCAN_OP),                                            \
+        CREATE_BENCHMARK(false, custom_double2, SCAN_OP),                                          \
+        CREATE_BENCHMARK(true, custom_double2, SCAN_OP), CREATE_BENCHMARK(false, int8_t, SCAN_OP), \
+        CREATE_BENCHMARK(true, int8_t, SCAN_OP), CREATE_BENCHMARK(false, uint8_t, SCAN_OP),        \
+        CREATE_BENCHMARK(true, uint8_t, SCAN_OP)
+
+int main(int argc, char* argv[])
 {
     cli::Parser parser(argc, argv);
     parser.set_optional<size_t>("size", "size", DEFAULT_N, "number of values");
@@ -342,29 +326,29 @@ int main(int argc, char *argv[])
 
     // Parse argv
     benchmark::Initialize(&argc, argv);
-    const size_t size = parser.get<size_t>("size");
-    const int trials = parser.get<int>("trials");
+    const size_t size   = parser.get<size_t>("size");
+    const int    trials = parser.get<int>("trials");
 
     std::cout << "benchmark_device_scan" << std::endl;
 
     // HIP
-    hipStream_t stream = 0; // default
+    hipStream_t     stream = 0; // default
     hipDeviceProp_t devProp;
-    int device_id = 0;
+    int             device_id = 0;
     HIP_CHECK(hipGetDevice(&device_id));
     HIP_CHECK(hipGetDeviceProperties(&devProp, device_id));
     std::cout << "[HIP] Device name: " << devProp.name << std::endl;
 
     using custom_double2 = benchmark_utils::custom_type<double, double>;
-    using custom_float2 = benchmark_utils::custom_type<float, float>;
+    using custom_float2  = benchmark_utils::custom_type<float, float>;
 
-    // Compilation may never finish, if the compiler needs to compile too many kernels,
-    // it is recommended to compile benchmarks only for 1-2 types when BENCHMARK_CONFIG_TUNING is used
-    // (all other CREATE_*_BENCHMARK should be commented/removed).
+    // Compilation may never finish if the compiler needs to compile too many
+    // kernels. It is recommended to compile benchmarks only for 1-2 types when
+    // BENCHMARK_CONFIG_TUNING is used (all other CREATE_*_BENCHMARK entries
+    // should be commented out or removed).
 
     // Add benchmarks
-    std::vector<benchmark::internal::Benchmark*> benchmarks =
-    {
+    std::vector<benchmark::internal::Benchmark*> benchmarks = {
         CREATE_BENCHMARKS(hipcub::Sum),
         CREATE_BENCHMARKS(hipcub::Min),
     };
diff --git a/benchmark/benchmark_device_segmented_radix_sort.cpp b/benchmark/benchmark_device_segmented_radix_sort.cpp
index 65b8d116..ad7f3075 100644
--- a/benchmark/benchmark_device_segmented_radix_sort.cpp
+++ b/benchmark/benchmark_device_segmented_radix_sort.cpp
@@ -9,8 +9,8 @@
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -25,26 +25,25 @@
 // HIP API
 #include "hipcub/hipcub.hpp"
 
-
 #ifndef DEFAULT_N
 const size_t DEFAULT_N = 1024 * 1024 * 32;
 #endif
 
-const unsigned int batch_size = 4;
+const unsigned int batch_size  = 4;
 const unsigned int warmup_size = 2;
 
-constexpr bool Ascending = false;
+constexpr bool Ascending  = false;
 constexpr bool Descending = true;
 
 template<class Key>
 void run_sort_keys_benchmark(benchmark::State& state,
-                             size_t desired_segments,
-                             hipStream_t stream, 
-                             size_t size,
-                             bool descending = false)
+                             size_t            desired_segments,
+                             hipStream_t       stream,
+                             size_t            size,
+                             bool              descending = false)
 {
     using offset_type = int;
-    using key_type = Key;
+    using key_type    = Key;
     typedef hipError_t (*sort_func)(void*,
                                     size_t&,
                                     const key_type*,
@@ -57,11 +56,10 @@ void run_sort_keys_benchmark(benchmark::State& state,
                                     int,
                                     hipStream_t);
 
-    sort_func func_ascending  = &hipcub::DeviceSegmentedRadixSort::SortKeys
-        <key_type, offset_type *>;
-    sort_func func_descending = &hipcub::DeviceSegmentedRadixSort::SortKeysDescending
-        <key_type, offset_type *>;
-        
+    sort_func func_ascending = &hipcub::DeviceSegmentedRadixSort::SortKeys<key_type, offset_type*>;
+    sort_func func_descending
+        = &hipcub::DeviceSegmentedRadixSort::SortKeysDescending<key_type, offset_type*>;
+
     sort_func sorting = descending ? func_descending : func_ascending;
 
     // Generate data
@@ -69,13 +67,13 @@ void run_sort_keys_benchmark(benchmark::State& state,
 
     const double avg_segment_length = static_cast<double>(size) / desired_segments;
 
-    const unsigned int seed = 123;
+    const unsigned int         seed = 123;
     std::default_random_engine gen(seed);
 
     std::uniform_real_distribution<double> segment_length_dis(0, avg_segment_length * 2);
 
     unsigned int segments_count = 0;
-    size_t offset = 0;
+    size_t       offset         = 0;
     while(offset < size)
     {
         const size_t segment_length = std::round(segment_length_dis(gen));
@@ -88,41 +86,31 @@ void run_sort_keys_benchmark(benchmark::State& state,
     std::vector<key_type> keys_input;
     if(std::is_floating_point<key_type>::value)
     {
-        keys_input = benchmark_utils::get_random_data<key_type>(
-            size, (key_type)-1000, (key_type)+1000);
-    }
-    else
+        keys_input
+            = benchmark_utils::get_random_data<key_type>(size, (key_type)-1000, (key_type) + 1000);
+    } else
     {
-        keys_input = benchmark_utils::get_random_data<key_type>(
-            size,
-            std::numeric_limits<key_type>::min(),
-            std::numeric_limits<key_type>::max()
-        );
+        keys_input
+            = benchmark_utils::get_random_data<key_type>(size,
+                                                         std::numeric_limits<key_type>::min(),
+                                                         std::numeric_limits<key_type>::max());
     }
 
-    offset_type * d_offsets;
+    offset_type* d_offsets;
     HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type)));
-    HIP_CHECK(
-        hipMemcpy(
-            d_offsets, offsets.data(),
-            (segments_count + 1) * sizeof(offset_type),
-            hipMemcpyHostToDevice
-        )
-    );
-
-    key_type * d_keys_input;
-    key_type * d_keys_output;
+    HIP_CHECK(hipMemcpy(d_offsets,
+                        offsets.data(),
+                        (segments_count + 1) * sizeof(offset_type),
+                        hipMemcpyHostToDevice));
+
+    key_type* d_keys_input;
+    key_type* d_keys_output;
     HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type)));
     HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type)));
     HIP_CHECK(
-        hipMemcpy(
-            d_keys_input, keys_input.data(),
-            size * sizeof(key_type),
-            hipMemcpyHostToDevice
-        )
-    );
-
-    void * d_temporary_storage = nullptr;
+        hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice));
+
+    void*  d_temporary_storage     = nullptr;
     size_t temporary_storage_bytes = 0;
     HIP_CHECK(sorting(d_temporary_storage,
                       temporary_storage_bytes,
@@ -156,7 +144,7 @@ void run_sort_keys_benchmark(benchmark::State& state,
     }
     HIP_CHECK(hipDeviceSynchronize());
 
-    for (auto _ : state)
+    for(auto _ : state)
     {
         auto start = std::chrono::high_resolution_clock::now();
 
@@ -177,8 +165,8 @@ void run_sort_keys_benchmark(benchmark::State& state,
         HIP_CHECK(hipDeviceSynchronize());
 
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
         state.SetIterationTime(elapsed_seconds.count());
     }
     state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type));
@@ -192,14 +180,14 @@ void run_sort_keys_benchmark(benchmark::State& state,
 
 template<class Key, class Value>
 void run_sort_pairs_benchmark(benchmark::State& state,
-                              size_t desired_segments,
-                              hipStream_t stream,
-                              size_t size,
-                              bool descending = false)
+                              size_t            desired_segments,
+                              hipStream_t       stream,
+                              size_t            size,
+                              bool              descending = false)
 {
     using offset_type = int;
-    using key_type = Key;
-    using value_type = Value;
+    using key_type    = Key;
+    using value_type  = Value;
     typedef hipError_t (*sort_func)(void*,
                                     size_t&,
                                     const key_type*,
@@ -214,10 +202,10 @@ void run_sort_pairs_benchmark(benchmark::State& state,
                                     int,
                                     hipStream_t);
 
-    sort_func func_ascending  = &hipcub::DeviceSegmentedRadixSort::SortPairs
-        <key_type, value_type, offset_type *>;
-    sort_func func_descending = &hipcub::DeviceSegmentedRadixSort::SortPairsDescending
-        <key_type, value_type, offset_type *>;
+    sort_func func_ascending
+        = &hipcub::DeviceSegmentedRadixSort::SortPairs<key_type, value_type, offset_type*>;
+    sort_func func_descending = &hipcub::DeviceSegmentedRadixSort::
+                                    SortPairsDescending<key_type, value_type, offset_type*>;
 
     sort_func sorting = descending ? func_descending : func_ascending;
 
@@ -226,13 +214,13 @@ void run_sort_pairs_benchmark(benchmark::State& state,
 
     const double avg_segment_length = static_cast<double>(size) / desired_segments;
 
-    const unsigned int seed = 123;
+    const unsigned int         seed = 123;
     std::default_random_engine gen(seed);
 
     std::uniform_real_distribution<double> segment_length_dis(0, avg_segment_length * 2);
 
     unsigned int segments_count = 0;
-    size_t offset = 0;
+    size_t       offset         = 0;
     while(offset < size)
     {
         const size_t segment_length = std::round(segment_length_dis(gen));
@@ -245,56 +233,43 @@ void run_sort_pairs_benchmark(benchmark::State& state,
     std::vector<key_type> keys_input;
     if(std::is_floating_point<key_type>::value)
     {
-        keys_input = benchmark_utils::get_random_data<key_type>(
-            size, (key_type)-1000, (key_type)+1000);
-    }
-    else
+        keys_input
+            = benchmark_utils::get_random_data<key_type>(size, (key_type)-1000, (key_type) + 1000);
+    } else
     {
-        keys_input = benchmark_utils::get_random_data<key_type>(
-            size,
-            std::numeric_limits<key_type>::min(),
-            std::numeric_limits<key_type>::max()
-        );
+        keys_input
+            = benchmark_utils::get_random_data<key_type>(size,
+                                                         std::numeric_limits<key_type>::min(),
+                                                         std::numeric_limits<key_type>::max());
     }
 
     std::vector<value_type> values_input(size);
     std::iota(values_input.begin(), values_input.end(), 0);
 
-    offset_type * d_offsets;
+    offset_type* d_offsets;
     HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type)));
-    HIP_CHECK(
-        hipMemcpy(
-            d_offsets, offsets.data(),
-            (segments_count + 1) * sizeof(offset_type),
-            hipMemcpyHostToDevice
-        )
-    );
-
-    key_type * d_keys_input;
-    key_type * d_keys_output;
+    HIP_CHECK(hipMemcpy(d_offsets,
+                        offsets.data(),
+                        (segments_count + 1) * sizeof(offset_type),
+                        hipMemcpyHostToDevice));
+
+    key_type* d_keys_input;
+    key_type* d_keys_output;
     HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type)));
     HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type)));
     HIP_CHECK(
-        hipMemcpy(
-            d_keys_input, keys_input.data(),
-            size * sizeof(key_type),
-            hipMemcpyHostToDevice
-        )
-    );
-
-    value_type * d_values_input;
-    value_type * d_values_output;
+        hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice));
+
+    value_type* d_values_input;
+    value_type* d_values_output;
     HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type)));
     HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type)));
-    HIP_CHECK(
-        hipMemcpy(
-            d_values_input, values_input.data(),
-            size * sizeof(value_type),
-            hipMemcpyHostToDevice
-        )
-    );
-
-    void * d_temporary_storage = nullptr;
+    HIP_CHECK(hipMemcpy(d_values_input,
+                        values_input.data(),
+                        size * sizeof(value_type),
+                        hipMemcpyHostToDevice));
+
+    void*  d_temporary_storage     = nullptr;
     size_t temporary_storage_bytes = 0;
     HIP_CHECK(sorting(d_temporary_storage,
                       temporary_storage_bytes,
@@ -332,7 +307,7 @@ void run_sort_pairs_benchmark(benchmark::State& state,
     }
     HIP_CHECK(hipDeviceSynchronize());
 
-    for (auto _ : state)
+    for(auto _ : state)
     {
         auto start = std::chrono::high_resolution_clock::now();
 
@@ -355,13 +330,12 @@ void run_sort_pairs_benchmark(benchmark::State& state,
         HIP_CHECK(hipDeviceSynchronize());
 
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
         state.SetIterationTime(elapsed_seconds.count());
     }
-    state.SetBytesProcessed(
-        state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type))
-    );
+    state.SetBytesProcessed(state.iterations() * batch_size * size
+                            * (sizeof(key_type) + sizeof(value_type)));
     state.SetItemsProcessed(state.iterations() * batch_size * size);
 
     HIP_CHECK(hipFree(d_temporary_storage));
@@ -372,41 +346,40 @@ void run_sort_pairs_benchmark(benchmark::State& state,
     HIP_CHECK(hipFree(d_values_output));
 }
 
-#define CREATE_SORT_KEYS_BENCHMARK(Key, SEGMENTS) \
-benchmark::RegisterBenchmark( \
-    (std::string("sort_keys") + "<Key Datatype:" #Key ">" + \
-        "(Segments:~" + std::to_string(SEGMENTS) + " segments)" \
-    ).c_str(), \
-    [=](benchmark::State& state) { run_sort_keys_benchmark<Key>(state, SEGMENTS, stream, size, Ascending); } \
-)
-
-#define CREATE_SORT_KEYS_DESCENDING_BENCHMARK(Key, SEGMENTS) \
-benchmark::RegisterBenchmark( \
-    (std::string("sort_keys") + "<" #Key ">" + \
-        "(~" + std::to_string(SEGMENTS) + " segments), descending" \
-    ).c_str(), \
-    [=](benchmark::State& state) { run_sort_keys_benchmark<Key>(state, SEGMENTS, stream, size, Descending); } \
-)
-
-#define BENCHMARK_KEY_TYPE(type) \
-    CREATE_SORT_KEYS_BENCHMARK(type, 1), \
-    CREATE_SORT_KEYS_BENCHMARK(type, 10), \
-    CREATE_SORT_KEYS_BENCHMARK(type, 100), \
-    CREATE_SORT_KEYS_BENCHMARK(type, 1000), \
-    CREATE_SORT_KEYS_BENCHMARK(type, 10000), \
-    CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 1), \
-    CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 10), \
-    CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 100), \
-    CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 1000), \
-    CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 10000)
-
+#define CREATE_SORT_KEYS_BENCHMARK(Key, SEGMENTS)              \
+    benchmark::RegisterBenchmark(                              \
+        std::string("device_segmented_radix_sort_keys"         \
+                    "<key_data_type:" #Key ",ascending:true>." \
+                    "(segments:~"                              \
+                    + std::to_string(SEGMENTS) + " segments)") \
+            .c_str(),                                          \
+        [=](benchmark::State& state)                           \
+        { run_sort_keys_benchmark<Key>(state, SEGMENTS, stream, size, Ascending); })
+
+#define CREATE_SORT_KEYS_DESCENDING_BENCHMARK(Key, SEGMENTS)    \
+    benchmark::RegisterBenchmark(                               \
+        std::string("device_segmented_radix_sort_keys"          \
+                    "<key_data_type:" #Key ",ascending:false>." \
+                    "(segments:~"                               \
+                    + std::to_string(SEGMENTS) + " segments)")  \
+            .c_str(),                                           \
+        [=](benchmark::State& state)                            \
+        { run_sort_keys_benchmark<Key>(state, SEGMENTS, stream, size, Descending); })
+
+#define BENCHMARK_KEY_TYPE(type)                                                                 \
+    CREATE_SORT_KEYS_BENCHMARK(type, 1), CREATE_SORT_KEYS_BENCHMARK(type, 10),                   \
+        CREATE_SORT_KEYS_BENCHMARK(type, 100), CREATE_SORT_KEYS_BENCHMARK(type, 1000),           \
+        CREATE_SORT_KEYS_BENCHMARK(type, 10000), CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 1), \
+        CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 10),                                         \
+        CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 100),                                        \
+        CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 1000),                                       \
+        CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 10000)
 
 void add_sort_keys_benchmarks(std::vector<benchmark::internal::Benchmark*>& benchmarks,
-                              hipStream_t stream,
-                              size_t size)
+                              hipStream_t                                   stream,
+                              size_t                                        size)
 {
-    std::vector<benchmark::internal::Benchmark*> bs =
-    {
+    std::vector<benchmark::internal::Benchmark*> bs = {
         BENCHMARK_KEY_TYPE(float),
         BENCHMARK_KEY_TYPE(double),
         BENCHMARK_KEY_TYPE(int8_t),
@@ -416,45 +389,45 @@ void add_sort_keys_benchmarks(std::vector<benchmark::internal::Benchmark*>& benc
     benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
 }
 
-#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value, SEGMENTS) \
-benchmark::RegisterBenchmark( \
-    (std::string("sort_pairs") + "<Key Datatype:" #Key ",Value Datatype:" #Value ">" + \
-        "(Segments:~" + std::to_string(SEGMENTS) + " segments)" \
-    ).c_str(), \
-    [=](benchmark::State& state) { \
-        run_sort_pairs_benchmark<Key, Value>(state, SEGMENTS, stream, size, Ascending); } \
-)
-
-#define CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(Key, Value, SEGMENTS) \
-benchmark::RegisterBenchmark( \
-    (std::string("sort_pairs") + "<" #Key ", " #Value ">" + \
-        "(~" + std::to_string(SEGMENTS) + " segments), descending" \
-    ).c_str(), \
-    [=](benchmark::State& state) { \
-        run_sort_pairs_benchmark<Key, Value>(state, SEGMENTS, stream, size, Descending); } \
-)
-
-#define BENCHMARK_PAIR_TYPE(type, value) \
-    CREATE_SORT_PAIRS_BENCHMARK(type, value, 1), \
-    CREATE_SORT_PAIRS_BENCHMARK(type, value, 10), \
-    CREATE_SORT_PAIRS_BENCHMARK(type, value, 100), \
-    CREATE_SORT_PAIRS_BENCHMARK(type, value, 1000), \
-    CREATE_SORT_PAIRS_BENCHMARK(type, value, 10000), \
-    CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 1), \
-    CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 10), \
-    CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 100), \
-    CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 1000), \
-    CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 10000)
+#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value, SEGMENTS)                                 \
+    benchmark::RegisterBenchmark(                                                         \
+        std::string("device_segmented_radix_sort_pairs"                                   \
+                    "<key_data_type:" #Key ",value_data_type:" #Value ",ascending:true>." \
+                    "(segments:~"                                                         \
+                    + std::to_string(SEGMENTS) + " segments)")                            \
+            .c_str(),                                                                     \
+        [=](benchmark::State& state)                                                      \
+        { run_sort_pairs_benchmark<Key, Value>(state, SEGMENTS, stream, size, Ascending); })
+
+#define CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(Key, Value, SEGMENTS)                       \
+    benchmark::RegisterBenchmark(                                                          \
+        std::string("device_segmented_radix_sort_pairs"                                    \
+                    "<key_data_type:" #Key ",value_data_type:" #Value ",ascending:false>." \
+                    "(segments:~"                                                          \
+                    + std::to_string(SEGMENTS) + " segments)")                             \
+            .c_str(),                                                                      \
+        [=](benchmark::State& state)                                                       \
+        { run_sort_pairs_benchmark<Key, Value>(state, SEGMENTS, stream, size, Descending); })
+
+#define BENCHMARK_PAIR_TYPE(type, value)                                                       \
+    CREATE_SORT_PAIRS_BENCHMARK(type, value, 1), CREATE_SORT_PAIRS_BENCHMARK(type, value, 10), \
+        CREATE_SORT_PAIRS_BENCHMARK(type, value, 100),                                         \
+        CREATE_SORT_PAIRS_BENCHMARK(type, value, 1000),                                        \
+        CREATE_SORT_PAIRS_BENCHMARK(type, value, 10000),                                       \
+        CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 1),                                \
+        CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 10),                               \
+        CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 100),                              \
+        CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 1000),                             \
+        CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 10000)
 
 void add_sort_pairs_benchmarks(std::vector<benchmark::internal::Benchmark*>& benchmarks,
-                               hipStream_t stream,
-                               size_t size)
+                               hipStream_t                                   stream,
+                               size_t                                        size)
 {
-    using custom_float2 = benchmark_utils::custom_type<float, float>;
+    using custom_float2  = benchmark_utils::custom_type<float, float>;
     using custom_double2 = benchmark_utils::custom_type<double, double>;
 
-    std::vector<benchmark::internal::Benchmark*> bs =
-    {
+    std::vector<benchmark::internal::Benchmark*> bs = {
         BENCHMARK_PAIR_TYPE(int, float),
         BENCHMARK_PAIR_TYPE(long long, double),
         BENCHMARK_PAIR_TYPE(int8_t, int8_t),
@@ -465,7 +438,7 @@ void add_sort_pairs_benchmarks(std::vector<benchmark::internal::Benchmark*>& ben
     benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
 }
 
-int main(int argc, char *argv[])
+int main(int argc, char* argv[])
 {
     cli::Parser parser(argc, argv);
     parser.set_optional<size_t>("size", "size", DEFAULT_N, "number of values");
@@ -474,15 +447,15 @@ int main(int argc, char *argv[])
 
     // Parse argv
     benchmark::Initialize(&argc, argv);
-    const size_t size = parser.get<size_t>("size");
-    const int trials = parser.get<int>("trials");
+    const size_t size   = parser.get<size_t>("size");
+    const int    trials = parser.get<int>("trials");
 
     std::cout << "benchmark_device_segmented_radix_sort" << std::endl;
 
     // HIP
-    hipStream_t stream = 0; // default
+    hipStream_t     stream = 0; // default
     hipDeviceProp_t devProp;
-    int device_id = 0;
+    int             device_id = 0;
     HIP_CHECK(hipGetDevice(&device_id));
     HIP_CHECK(hipGetDeviceProperties(&devProp, device_id));
     std::cout << "[HIP] Device name: " << devProp.name << std::endl;
diff --git a/benchmark/benchmark_device_segmented_reduce.cpp b/benchmark/benchmark_device_segmented_reduce.cpp
index ca7cb950..d1e40c67 100644
--- a/benchmark/benchmark_device_segmented_reduce.cpp
+++ b/benchmark/benchmark_device_segmented_reduce.cpp
@@ -9,8 +9,8 @@
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -25,36 +25,34 @@
 // HIP API
 #include "hipcub/device/device_segmented_reduce.hpp"
 
-
 #ifndef DEFAULT_N
 const size_t DEFAULT_N = 1024 * 1024 * 32;
 #endif
 
-
-const unsigned int batch_size = 10;
+const unsigned int batch_size  = 10;
 const unsigned int warmup_size = 5;
 
 using OffsetType = int;
 
 template<class T, class OutputT, class SegmentedReduceKernel>
-void run_benchmark(benchmark::State& state,
-                   size_t desired_segments,
-                   hipStream_t stream,
-                   size_t size,
+void run_benchmark(benchmark::State&     state,
+                   size_t                desired_segments,
+                   hipStream_t           stream,
+                   size_t                size,
                    SegmentedReduceKernel segmented_reduce)
 {
     using value_type = T;
 
     // Generate data
-    const unsigned int seed = 123;
+    const unsigned int         seed = 123;
     std::default_random_engine gen(seed);
 
     const double avg_segment_length = static_cast<double>(size) / desired_segments;
     std::uniform_real_distribution<double> segment_length_dis(0, avg_segment_length * 2);
 
     std::vector<OffsetType> offsets;
-    unsigned int segments_count = 0;
-    size_t offset = 0;
+    unsigned int            segments_count = 0;
+    size_t                  offset         = 0;
     while(offset < size)
     {
         const size_t segment_length = std::round(segment_length_dis(gen));
@@ -67,30 +65,24 @@ void run_benchmark(benchmark::State& state,
     std::vector<value_type> values_input(size);
     std::iota(values_input.begin(), values_input.end(), 0);
 
-    OffsetType * d_offsets;
+    OffsetType* d_offsets;
     HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(OffsetType)));
-    HIP_CHECK(
-        hipMemcpy(
-            d_offsets, offsets.data(),
-            (segments_count + 1) * sizeof(OffsetType),
-            hipMemcpyHostToDevice
-        )
-    );
-
-    value_type * d_values_input;
+    HIP_CHECK(hipMemcpy(d_offsets,
+                        offsets.data(),
+                        (segments_count + 1) * sizeof(OffsetType),
+                        hipMemcpyHostToDevice));
+
+    value_type* d_values_input;
     HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type)));
-    HIP_CHECK(
-        hipMemcpy(
-            d_values_input, values_input.data(),
-            size * sizeof(value_type),
-            hipMemcpyHostToDevice
-        )
-    );
-
-    OutputT * d_aggregates_output;
+    HIP_CHECK(hipMemcpy(d_values_input,
+                        values_input.data(),
+                        size * sizeof(value_type),
+                        hipMemcpyHostToDevice));
+
+    OutputT* d_aggregates_output;
     HIP_CHECK(hipMalloc(&d_aggregates_output, segments_count * sizeof(OutputT)));
 
-    void * d_temporary_storage = nullptr;
+    void*  d_temporary_storage     = nullptr;
     size_t temporary_storage_bytes = 0;
 
     HIP_CHECK(segmented_reduce(d_temporary_storage,
@@ -119,7 +111,7 @@ void run_benchmark(benchmark::State& state,
     }
     HIP_CHECK(hipDeviceSynchronize());
 
-    for (auto _ : state)
+    for(auto _ : state)
     {
         auto start = std::chrono::high_resolution_clock::now();
 
@@ -137,8 +129,8 @@ void run_benchmark(benchmark::State& state,
         HIP_CHECK(hipStreamSynchronize(stream));
 
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
         state.SetIterationTime(elapsed_seconds.count());
     }
     state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(value_type));
@@ -154,8 +146,10 @@ template<typename T, typename Op>
 struct Benchmark;
 
 template<typename T>
-struct Benchmark<T, hipcub::Sum> {
-    static void run(benchmark::State& state, size_t desired_segments, const hipStream_t stream, size_t size)
+struct Benchmark<T, hipcub::Sum>
+{
+    static void
+        run(benchmark::State& state, size_t desired_segments, const hipStream_t stream, size_t size)
     {
         hipError_t (*ptr_to_sum)(void*, size_t&, T*, T*, int, OffsetType*, OffsetType*, hipStream_t)
             = &hipcub::DeviceSegmentedReduce::Sum;
@@ -164,8 +158,10 @@ struct Benchmark<T, hipcub::Sum> {
 };
 
 template<typename T>
-struct Benchmark<T, hipcub::Min> {
-    static void run(benchmark::State& state, size_t desired_segments, const hipStream_t stream, size_t size)
+struct Benchmark<T, hipcub::Min>
+{
+    static void
+        run(benchmark::State& state, size_t desired_segments, const hipStream_t stream, size_t size)
     {
         hipError_t (*ptr_to_min)(void*, size_t&, T*, T*, int, OffsetType*, OffsetType*, hipStream_t)
             = &hipcub::DeviceSegmentedReduce::Min;
@@ -174,12 +170,14 @@ struct Benchmark<T, hipcub::Min> {
 };
 
 template<typename T>
-struct Benchmark<T, hipcub::ArgMin> {
+struct Benchmark<T, hipcub::ArgMin>
+{
     using Difference = OffsetType;
-    using Iterator = typename hipcub::ArgIndexInputIterator<T*, Difference>;
-    using KeyValue = typename Iterator::value_type;
+    using Iterator   = typename hipcub::ArgIndexInputIterator<T*, Difference>;
+    using KeyValue   = typename Iterator::value_type;
 
-    static void run(benchmark::State& state, size_t desired_segments, const hipStream_t stream, size_t size)
+    static void
+        run(benchmark::State& state, size_t desired_segments, const hipStream_t stream, size_t size)
     {
         hipError_t (*ptr_to_argmin)(void*,
                                     size_t&,
@@ -194,50 +192,48 @@ struct Benchmark<T, hipcub::ArgMin> {
     }
 };
 
-#define CREATE_BENCHMARK(T, SEGMENTS, REDUCE_OP) \
-benchmark::RegisterBenchmark( \
-    (std::string("segmented_reduce") + "<Datatype:" #T ", ReduceOp:" #REDUCE_OP ">" + \
-        "(Number of segments:~" + std::to_string(SEGMENTS) + " segments)" \
-    ).c_str(), \
-    &Benchmark<T, REDUCE_OP>::run, \
-    SEGMENTS, stream, size \
-)
-
-#define BENCHMARK_TYPE(type, REDUCE_OP) \
-    CREATE_BENCHMARK(type, 1, REDUCE_OP), \
-    CREATE_BENCHMARK(type, 100, REDUCE_OP), \
-    CREATE_BENCHMARK(type, 10000, REDUCE_OP)
-
-#define CREATE_BENCHMARKS(REDUCE_OP) \
-    BENCHMARK_TYPE(float, REDUCE_OP), \
-    BENCHMARK_TYPE(double, REDUCE_OP), \
-    BENCHMARK_TYPE(int8_t, REDUCE_OP), \
-    BENCHMARK_TYPE(int, REDUCE_OP)
+#define CREATE_BENCHMARK(T, SEGMENTS, REDUCE_OP)                                            \
+    benchmark::RegisterBenchmark(std::string("device_segmented_reduce"                      \
+                                             "<data_type:" #T ",reduce_op:" #REDUCE_OP ">." \
+                                             "(number_of_segments:~"                        \
+                                             + std::to_string(SEGMENTS) + " segments)")     \
+                                     .c_str(),                                              \
+                                 &Benchmark<T, REDUCE_OP>::run,                             \
+                                 SEGMENTS,                                                  \
+                                 stream,                                                    \
+                                 size)
+
+#define BENCHMARK_TYPE(type, REDUCE_OP)                                           \
+    CREATE_BENCHMARK(type, 1, REDUCE_OP), CREATE_BENCHMARK(type, 100, REDUCE_OP), \
+        CREATE_BENCHMARK(type, 10000, REDUCE_OP)
+
+#define CREATE_BENCHMARKS(REDUCE_OP)                                     \
+    BENCHMARK_TYPE(float, REDUCE_OP), BENCHMARK_TYPE(double, REDUCE_OP), \
+        BENCHMARK_TYPE(int8_t, REDUCE_OP), BENCHMARK_TYPE(int, REDUCE_OP)
 
 void add_benchmarks(std::vector<benchmark::internal::Benchmark*>& benchmarks,
-                    hipStream_t stream,
-                    size_t size)
+                    hipStream_t                                   stream,
+                    size_t                                        size)
 {
     using custom_double2 = benchmark_utils::custom_type<double, double>;
 
-    std::vector<benchmark::internal::Benchmark*> bs =
-    {
+    std::vector<benchmark::internal::Benchmark*> bs = {
         CREATE_BENCHMARKS(hipcub::Sum),
         BENCHMARK_TYPE(custom_double2, hipcub::Sum),
         CREATE_BENCHMARKS(hipcub::Min),
-        #ifdef HIPCUB_ROCPRIM_API
+#ifdef HIPCUB_ROCPRIM_API
         BENCHMARK_TYPE(custom_double2, hipcub::Min),
-        #endif
+#endif
         CREATE_BENCHMARKS(hipcub::ArgMin),
-        #ifdef HIPCUB_ROCPRIM_API
+#ifdef HIPCUB_ROCPRIM_API
         BENCHMARK_TYPE(custom_double2, hipcub::ArgMin),
-        #endif
+#endif
     };
 
     benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
 }
 
-int main(int argc, char *argv[])
+int main(int argc, char* argv[])
 {
     cli::Parser parser(argc, argv);
     parser.set_optional<size_t>("size", "size", DEFAULT_N, "number of values");
@@ -246,15 +242,15 @@ int main(int argc, char *argv[])
 
     // Parse argv
     benchmark::Initialize(&argc, argv);
-    const size_t size = parser.get<size_t>("size");
-    const int trials = parser.get<int>("trials");
+    const size_t size   = parser.get<size_t>("size");
+    const int    trials = parser.get<int>("trials");
 
     std::cout << "benchmark_device_segmented_reduce" << std::endl;
 
     // HIP
-    hipStream_t stream = 0; // default
+    hipStream_t     stream = 0; // default
     hipDeviceProp_t devProp;
-    int device_id = 0;
+    int             device_id = 0;
     HIP_CHECK(hipGetDevice(&device_id));
     HIP_CHECK(hipGetDeviceProperties(&devProp, device_id));
     std::cout << "[HIP] Device name: " << devProp.name << std::endl;
diff --git a/benchmark/benchmark_device_segmented_sort.cpp b/benchmark/benchmark_device_segmented_sort.cpp
index e2b2a6a2..d98c7f42 100644
--- a/benchmark/benchmark_device_segmented_sort.cpp
+++ b/benchmark/benchmark_device_segmented_sort.cpp
@@ -9,8 +9,8 @@
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -29,19 +29,19 @@
 const size_t DEFAULT_N = 1024 * 1024 * 32;
 #endif
 
-const unsigned int batch_size = 4;
+const unsigned int batch_size  = 4;
 const unsigned int warmup_size = 2;
 
-template <class Key>
-void run_sort_keys_benchmark(benchmark::State &state,
-                             size_t desired_segments,
-                             hipStream_t stream,
-                             size_t size,
-                             bool Descending = false, 
-                             bool Stable = false)
+template<class Key>
+void run_sort_keys_benchmark(benchmark::State& state,
+                             size_t            desired_segments,
+                             hipStream_t       stream,
+                             size_t            size,
+                             bool              Descending = false,
+                             bool              Stable     = false)
 {
     using offset_type = int;
-    using key_type = Key;
+    using key_type    = Key;
     typedef hipError_t (*sort_func)(void*,
                                     size_t&,
                                     const key_type*,
@@ -52,31 +52,29 @@ void run_sort_keys_benchmark(benchmark::State &state,
                                     offset_type*,
                                     hipStream_t);
 
-    sort_func func_ascending  = &hipcub::DeviceSegmentedSort::SortKeys
-        <key_type, offset_type *>;
-    sort_func func_descending = &hipcub::DeviceSegmentedSort::SortKeysDescending
-        <key_type, offset_type *>;
-    sort_func func_ascending_stable  = &hipcub::DeviceSegmentedSort::StableSortKeys
-        <key_type, offset_type *>;
-    sort_func func_descending_stable = &hipcub::DeviceSegmentedSort::StableSortKeysDescending
-        <key_type, offset_type *>;
+    sort_func func_ascending = &hipcub::DeviceSegmentedSort::SortKeys<key_type, offset_type*>;
+    sort_func func_descending
+        = &hipcub::DeviceSegmentedSort::SortKeysDescending<key_type, offset_type*>;
+    sort_func func_ascending_stable
+        = &hipcub::DeviceSegmentedSort::StableSortKeys<key_type, offset_type*>;
+    sort_func func_descending_stable
+        = &hipcub::DeviceSegmentedSort::StableSortKeysDescending<key_type, offset_type*>;
 
-    sort_func sorting = Descending ? 
-        (Stable ? func_descending_stable : func_descending) : 
-        (Stable ? func_ascending_stable  : func_ascending);
+    sort_func sorting = Descending ? (Stable ? func_descending_stable : func_descending)
+                                   : (Stable ? func_ascending_stable : func_ascending);
 
     std::vector<offset_type> offsets;
 
     const double avg_segment_length = static_cast<double>(size) / desired_segments;
 
-    std::random_device rd;
+    std::random_device         rd;
     std::default_random_engine gen(rd());
 
     std::uniform_real_distribution<double> segment_length_dis(0, avg_segment_length * 2);
 
     unsigned int segments_count = 0;
-    size_t offset = 0;
-    while (offset < size)
+    size_t       offset         = 0;
+    while(offset < size)
     {
         const size_t segment_length = std::round(segment_length_dis(gen));
         offsets.push_back(offset);
@@ -86,46 +84,34 @@ void run_sort_keys_benchmark(benchmark::State &state,
     offsets.push_back(size);
 
     std::vector<key_type> keys_input;
-    if (std::is_floating_point<key_type>::value)
+    if(std::is_floating_point<key_type>::value)
     {
-        keys_input = benchmark_utils::get_random_data<key_type>(
-            size,
-            static_cast<key_type>(-1000),
-            static_cast<key_type>(1000)
-        );
-    }
-    else
+        keys_input = benchmark_utils::get_random_data<key_type>(size,
+                                                                static_cast<key_type>(-1000),
+                                                                static_cast<key_type>(1000));
+    } else
     {
-        keys_input = benchmark_utils::get_random_data<key_type>(
-            size,
-            std::numeric_limits<key_type>::min(),
-            std::numeric_limits<key_type>::max()
-        );
+        keys_input
+            = benchmark_utils::get_random_data<key_type>(size,
+                                                         std::numeric_limits<key_type>::min(),
+                                                         std::numeric_limits<key_type>::max());
     }
 
-    offset_type * d_offsets;
+    offset_type* d_offsets;
     HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type)));
-    HIP_CHECK(
-        hipMemcpy(
-            d_offsets, offsets.data(),
-            (segments_count + 1) * sizeof(offset_type),
-            hipMemcpyHostToDevice
-        )
-    );
-
-    key_type * d_keys_input;
-    key_type * d_keys_output;
+    HIP_CHECK(hipMemcpy(d_offsets,
+                        offsets.data(),
+                        (segments_count + 1) * sizeof(offset_type),
+                        hipMemcpyHostToDevice));
+
+    key_type* d_keys_input;
+    key_type* d_keys_output;
     HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type)));
     HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type)));
     HIP_CHECK(
-        hipMemcpy(
-            d_keys_input, keys_input.data(),
-            size * sizeof(key_type),
-            hipMemcpyHostToDevice
-        )
-    );
-
-    void * d_temporary_storage = nullptr;
+        hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice));
+
+    void*  d_temporary_storage     = nullptr;
     size_t temporary_storage_bytes = 0;
     HIP_CHECK(sorting(d_temporary_storage,
                       temporary_storage_bytes,
@@ -141,7 +127,7 @@ void run_sort_keys_benchmark(benchmark::State &state,
     HIP_CHECK(hipDeviceSynchronize());
 
     // Warm-up
-    for (size_t i = 0; i < warmup_size; ++i)
+    for(size_t i = 0; i < warmup_size; ++i)
     {
         HIP_CHECK(sorting(d_temporary_storage,
                           temporary_storage_bytes,
@@ -155,11 +141,11 @@ void run_sort_keys_benchmark(benchmark::State &state,
     }
     HIP_CHECK(hipDeviceSynchronize());
 
-    for (auto _ : state)
+    for(auto _ : state)
     {
         auto start = std::chrono::high_resolution_clock::now();
 
-        for (size_t i = 0; i < batch_size; ++i)
+        for(size_t i = 0; i < batch_size; ++i)
         {
             HIP_CHECK(sorting(d_temporary_storage,
                               temporary_storage_bytes,
@@ -174,8 +160,8 @@ void run_sort_keys_benchmark(benchmark::State &state,
         HIP_CHECK(hipDeviceSynchronize());
 
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
         state.SetIterationTime(elapsed_seconds.count());
     }
     state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type));
@@ -187,17 +173,17 @@ void run_sort_keys_benchmark(benchmark::State &state,
     HIP_CHECK(hipFree(d_keys_output));
 }
 
-template <class Key, class Value>
-void run_sort_pairs_benchmark(benchmark::State &state,
-                              size_t desired_segments,
-                              hipStream_t stream,
-                              size_t size,
-                              bool Descending = false, 
-                              bool Stable = false)
+template<class Key, class Value>
+void run_sort_pairs_benchmark(benchmark::State& state,
+                              size_t            desired_segments,
+                              hipStream_t       stream,
+                              size_t            size,
+                              bool              Descending = false,
+                              bool              Stable     = false)
 {
     using offset_type = int;
-    using key_type = Key;
-    using value_type = Value;
+    using key_type    = Key;
+    using value_type  = Value;
     typedef hipError_t (*sort_func)(void*,
                                     size_t&,
                                     const key_type*,
@@ -210,31 +196,31 @@ void run_sort_pairs_benchmark(benchmark::State &state,
                                     offset_type*,
                                     hipStream_t);
 
-    sort_func func_ascending  = &hipcub::DeviceSegmentedSort::SortPairs
-        <key_type, value_type, offset_type *>;
-    sort_func func_descending = &hipcub::DeviceSegmentedSort::SortPairsDescending
-        <key_type, value_type, offset_type *>;
-    sort_func func_ascending_stable  = &hipcub::DeviceSegmentedSort::StableSortPairs
-        <key_type, value_type, offset_type *>;
-    sort_func func_descending_stable = &hipcub::DeviceSegmentedSort::StableSortPairsDescending
-        <key_type, value_type, offset_type *>;
+    sort_func func_ascending
+        = &hipcub::DeviceSegmentedSort::SortPairs<key_type, value_type, offset_type*>;
+    sort_func func_descending
+        = &hipcub::DeviceSegmentedSort::SortPairsDescending<key_type, value_type, offset_type*>;
+    sort_func func_ascending_stable
+        = &hipcub::DeviceSegmentedSort::StableSortPairs<key_type, value_type, offset_type*>;
+    sort_func func_descending_stable
+        = &hipcub::DeviceSegmentedSort::
+              StableSortPairsDescending<key_type, value_type, offset_type*>;
 
-    sort_func sorting = Descending ? 
-        (Stable ? func_descending_stable : func_descending) : 
-        (Stable ? func_ascending_stable  : func_ascending);
+    sort_func sorting = Descending ? (Stable ? func_descending_stable : func_descending)
+                                   : (Stable ? func_ascending_stable : func_ascending);
 
     std::vector<offset_type> offsets;
 
     const double avg_segment_length = static_cast<double>(size) / desired_segments;
 
-    std::random_device rd;
+    std::random_device         rd;
     std::default_random_engine gen(rd());
 
     std::uniform_real_distribution<double> segment_length_dis(0, avg_segment_length * 2);
 
     unsigned int segments_count = 0;
-    size_t offset = 0;
-    while (offset < size)
+    size_t       offset         = 0;
+    while(offset < size)
     {
         const size_t segment_length = std::round(segment_length_dis(gen));
         offsets.push_back(offset);
@@ -244,61 +230,46 @@ void run_sort_pairs_benchmark(benchmark::State &state,
     offsets.push_back(size);
 
     std::vector<key_type> keys_input;
-    if (std::is_floating_point<key_type>::value)
+    if(std::is_floating_point<key_type>::value)
     {
-        keys_input = benchmark_utils::get_random_data<key_type>(
-            size,
-            static_cast<key_type>(-1000),
-            static_cast<key_type>(1000)
-        );
-    }
-    else
+        keys_input = benchmark_utils::get_random_data<key_type>(size,
+                                                                static_cast<key_type>(-1000),
+                                                                static_cast<key_type>(1000));
+    } else
     {
-        keys_input = benchmark_utils::get_random_data<key_type>(
-            size,
-            std::numeric_limits<key_type>::min(),
-            std::numeric_limits<key_type>::max()
-        );
+        keys_input
+            = benchmark_utils::get_random_data<key_type>(size,
+                                                         std::numeric_limits<key_type>::min(),
+                                                         std::numeric_limits<key_type>::max());
     }
 
     std::vector<value_type> values_input(size);
     std::iota(values_input.begin(), values_input.end(), 0);
 
-    offset_type * d_offsets;
+    offset_type* d_offsets;
     HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type)));
-    HIP_CHECK(
-        hipMemcpy(
-            d_offsets, offsets.data(),
-            (segments_count + 1) * sizeof(offset_type),
-            hipMemcpyHostToDevice
-        )
-    );
-
-    key_type * d_keys_input;
-    key_type * d_keys_output;
+    HIP_CHECK(hipMemcpy(d_offsets,
+                        offsets.data(),
+                        (segments_count + 1) * sizeof(offset_type),
+                        hipMemcpyHostToDevice));
+
+    key_type* d_keys_input;
+    key_type* d_keys_output;
     HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type)));
     HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type)));
     HIP_CHECK(
-        hipMemcpy(
-            d_keys_input, keys_input.data(),
-            size * sizeof(key_type),
-            hipMemcpyHostToDevice
-        )
-    );
-
-    value_type * d_values_input;
-    value_type * d_values_output;
+        hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice));
+
+    value_type* d_values_input;
+    value_type* d_values_output;
     HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type)));
     HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type)));
-    HIP_CHECK(
-        hipMemcpy(
-            d_values_input, values_input.data(),
-            size * sizeof(value_type),
-            hipMemcpyHostToDevice
-        )
-    );
-
-    void * d_temporary_storage = nullptr;
+    HIP_CHECK(hipMemcpy(d_values_input,
+                        values_input.data(),
+                        size * sizeof(value_type),
+                        hipMemcpyHostToDevice));
+
+    void*  d_temporary_storage     = nullptr;
     size_t temporary_storage_bytes = 0;
     HIP_CHECK(sorting(d_temporary_storage,
                       temporary_storage_bytes,
@@ -316,7 +287,7 @@ void run_sort_pairs_benchmark(benchmark::State &state,
     HIP_CHECK(hipDeviceSynchronize());
 
     // Warm-up
-    for (size_t i = 0; i < warmup_size; i++)
+    for(size_t i = 0; i < warmup_size; i++)
     {
         HIP_CHECK(sorting(d_temporary_storage,
                           temporary_storage_bytes,
@@ -332,11 +303,11 @@ void run_sort_pairs_benchmark(benchmark::State &state,
     }
     HIP_CHECK(hipDeviceSynchronize());
 
-    for (auto _ : state)
+    for(auto _ : state)
     {
         auto start = std::chrono::high_resolution_clock::now();
 
-        for (size_t i = 0; i < batch_size; i++)
+        for(size_t i = 0; i < batch_size; i++)
         {
             HIP_CHECK(sorting(d_temporary_storage,
                               temporary_storage_bytes,
@@ -353,12 +324,12 @@ void run_sort_pairs_benchmark(benchmark::State &state,
         HIP_CHECK(hipDeviceSynchronize());
 
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
         state.SetIterationTime(elapsed_seconds.count());
     }
-    state.SetBytesProcessed(
-        state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type)));
+    state.SetBytesProcessed(state.iterations() * batch_size * size
+                            * (sizeof(key_type) + sizeof(value_type)));
     state.SetItemsProcessed(state.iterations() * batch_size * size);
 
     HIP_CHECK(hipFree(d_temporary_storage));
@@ -369,96 +340,123 @@ void run_sort_pairs_benchmark(benchmark::State &state,
     HIP_CHECK(hipFree(d_values_output));
 }
 
-#define CREATE_SORT_KEYS_BENCHMARK(Key, SEGMENTS)        \
-    benchmark::RegisterBenchmark(                        \
-        (std::string("sort_keys") + "<Key Type:" #Key ">" +       \
-         "(Number of segments:~" + std::to_string(SEGMENTS) + " segments)") \
-            .c_str(),                                    \
-        [=](benchmark::State &state) { run_sort_keys_benchmark<Key>(state, SEGMENTS, stream, size); }), \
-    benchmark::RegisterBenchmark(                        \
-        (std::string("sort_keys") + "<" #Key ">" +       \
-         "(~" + std::to_string(SEGMENTS) + " segments), descending") \
-            .c_str(),                                    \
-        [=](benchmark::State &state) { run_sort_keys_benchmark<Key>(state, SEGMENTS, stream, size, true); }), \
-    benchmark::RegisterBenchmark(                        \
-        (std::string("sort_keys") + "<" #Key ">" +       \
-         "(~" + std::to_string(SEGMENTS) + " segments), stable") \
-            .c_str(),                                    \
-        [=](benchmark::State &state) { run_sort_keys_benchmark<Key>(state, SEGMENTS, stream, size, false, true); }), \
-    benchmark::RegisterBenchmark(                        \
-        (std::string("sort_keys") + "<" #Key ">" +       \
-         "(~" + std::to_string(SEGMENTS) + " segments), descending, stable") \
-            .c_str(),                                    \
-        [=](benchmark::State &state) { run_sort_keys_benchmark<Key>(state, SEGMENTS, stream, size, true, true); })
-
-#define BENCHMARK_KEY_TYPE(type)                \
-        CREATE_SORT_KEYS_BENCHMARK(type, 10),   \
-        CREATE_SORT_KEYS_BENCHMARK(type, 100),  \
-        CREATE_SORT_KEYS_BENCHMARK(type, 1000), \
-        CREATE_SORT_KEYS_BENCHMARK(type, 10000)
-
-void add_sort_keys_benchmarks(std::vector<benchmark::internal::Benchmark *> &benchmarks,
-                              hipStream_t stream,
-                              size_t size)
+#define CREATE_SORT_KEYS_BENCHMARK(Key, SEGMENTS)                                                 \
+    benchmark::RegisterBenchmark(std::string("device_segmented_sort_keys"                         \
+                                             "<key_data_type:" #Key ",ascending:true"             \
+                                             ",stable:false>."                                    \
+                                             "(number_of_segments:~"                              \
+                                             + std::to_string(SEGMENTS) + " segments)")           \
+                                     .c_str(),                                                    \
+                                 [=](benchmark::State& state) {                                   \
+                                     run_sort_keys_benchmark<Key>(state, SEGMENTS, stream, size); \
+                                 }),                                                              \
+        benchmark::RegisterBenchmark(                                                             \
+            std::string("device_segmented_sort_keys"                                              \
+                        "<key_data_type:" #Key ",ascending:false"                                 \
+                        ",stable:false>."                                                         \
+                        "(number_of_segments:~"                                                   \
+                        + std::to_string(SEGMENTS) + " segments)")                                \
+                .c_str(),                                                                         \
+            [=](benchmark::State& state)                                                          \
+            { run_sort_keys_benchmark<Key>(state, SEGMENTS, stream, size, true); }),              \
+        benchmark::RegisterBenchmark(                                                             \
+            std::string("device_segmented_sort_keys"                                              \
+                        "<key_data_type:" #Key ",ascending:true"                                  \
+                        ",stable:true>."                                                          \
+                        "(number_of_segments:~"                                                   \
+                        + std::to_string(SEGMENTS) + " segments)")                                \
+                .c_str(),                                                                         \
+            [=](benchmark::State& state)                                                          \
+            { run_sort_keys_benchmark<Key>(state, SEGMENTS, stream, size, false, true); }),       \
+        benchmark::RegisterBenchmark(                                                             \
+            std::string("device_segmented_sort_keys"                                              \
+                        "<key_data_type:" #Key ",ascending:false"                                 \
+                        ",stable:true>."                                                          \
+                        "(number_of_segments:~"                                                   \
+                        + std::to_string(SEGMENTS) + " segments)")                                \
+                .c_str(),                                                                         \
+            [=](benchmark::State& state)                                                          \
+            { run_sort_keys_benchmark<Key>(state, SEGMENTS, stream, size, true, true); })
+
+#define BENCHMARK_KEY_TYPE(type)                                                 \
+    CREATE_SORT_KEYS_BENCHMARK(type, 10), CREATE_SORT_KEYS_BENCHMARK(type, 100), \
+        CREATE_SORT_KEYS_BENCHMARK(type, 1000), CREATE_SORT_KEYS_BENCHMARK(type, 10000)
+
+void add_sort_keys_benchmarks(std::vector<benchmark::internal::Benchmark*>& benchmarks,
+                              hipStream_t                                   stream,
+                              size_t                                        size)
 {
-    std::vector<benchmark::internal::Benchmark *> bs =
-        {
-            BENCHMARK_KEY_TYPE(float),
-            BENCHMARK_KEY_TYPE(double),
-            BENCHMARK_KEY_TYPE(int8_t),
-            BENCHMARK_KEY_TYPE(uint8_t),
-            BENCHMARK_KEY_TYPE(int),
-        };
+    std::vector<benchmark::internal::Benchmark*> bs = {
+        BENCHMARK_KEY_TYPE(float),
+        BENCHMARK_KEY_TYPE(double),
+        BENCHMARK_KEY_TYPE(int8_t),
+        BENCHMARK_KEY_TYPE(uint8_t),
+        BENCHMARK_KEY_TYPE(int),
+    };
     benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
 }
 
-#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value, SEGMENTS)       \
-    benchmark::RegisterBenchmark(                               \
-        (std::string("sort_pairs") + "<Key Type:" #Key ",Value Type:" #Value ">" + \
-         "(Number of segments:~" + std::to_string(SEGMENTS) + " segments)")        \
-            .c_str(),                                           \
-        [=](benchmark::State &state) { run_sort_pairs_benchmark<Key, Value>(state, SEGMENTS, stream, size); }), \
-    benchmark::RegisterBenchmark(                               \
-        (std::string("sort_pairs") + "<" #Key ", " #Value ">" + \
-         "(~" + std::to_string(SEGMENTS) + " segments), descending")        \
-            .c_str(),                                           \
-        [=](benchmark::State &state) { run_sort_pairs_benchmark<Key, Value>(state, SEGMENTS, stream, size, true); }), \
-    benchmark::RegisterBenchmark(                               \
-        (std::string("sort_pairs") + "<" #Key ", " #Value ">" + \
-         "(~" + std::to_string(SEGMENTS) + " segments), stable")        \
-            .c_str(),                                           \
-        [=](benchmark::State &state) { run_sort_pairs_benchmark<Key, Value>(state, SEGMENTS, stream, size, false, true); }), \
-    benchmark::RegisterBenchmark(                               \
-        (std::string("sort_pairs") + "<" #Key ", " #Value ">" + \
-         "(~" + std::to_string(SEGMENTS) + " segments), descending, stable")        \
-            .c_str(),                                           \
-        [=](benchmark::State &state) { run_sort_pairs_benchmark<Key, Value>(state, SEGMENTS, stream, size, true, true); })
-
-#define BENCHMARK_PAIR_TYPE(type, value)                \
-        CREATE_SORT_PAIRS_BENCHMARK(type, value, 10),   \
-        CREATE_SORT_PAIRS_BENCHMARK(type, value, 100),  \
+#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value, SEGMENTS)                                         \
+    benchmark::RegisterBenchmark(                                                                 \
+        std::string("device_segmented_sort_pairs"                                                 \
+                    "<key_data_type:" #Key ",value_data_type:" #Value ",ascending:true"           \
+                    ",stable:false>."                                                             \
+                    "(number_of_segments:~"                                                       \
+                    + std::to_string(SEGMENTS) + " segments)")                                    \
+            .c_str(),                                                                             \
+        [=](benchmark::State& state)                                                              \
+        { run_sort_pairs_benchmark<Key, Value>(state, SEGMENTS, stream, size); }),                \
+        benchmark::RegisterBenchmark(                                                             \
+            std::string("device_segmented_sort_pairs"                                             \
+                        "<key_data_type:" #Key ",value_data_type:" #Value ",ascending:false"      \
+                        ",stable:false>."                                                         \
+                        "(number_of_segments:~"                                                   \
+                        + std::to_string(SEGMENTS) + " segments)")                                \
+                .c_str(),                                                                         \
+            [=](benchmark::State& state)                                                          \
+            { run_sort_pairs_benchmark<Key, Value>(state, SEGMENTS, stream, size, true); }),      \
+        benchmark::RegisterBenchmark(                                                             \
+            std::string("device_segmented_sort_pairs"                                             \
+                        "<key_data_type:" #Key ",value_data_type:" #Value ",ascending:true"       \
+                        ",stable:true>."                                                          \
+                        "(number_of_segments:~"                                                   \
+                        + std::to_string(SEGMENTS) + " segments)")                                \
+                .c_str(),                                                                         \
+            [=](benchmark::State& state) {                                                        \
+                run_sort_pairs_benchmark<Key, Value>(state, SEGMENTS, stream, size, false, true); \
+            }),                                                                                   \
+        benchmark::RegisterBenchmark(                                                             \
+            std::string("device_segmented_sort_pairs"                                             \
+                        "<key_data_type:" #Key ",value_data_type:" #Value ",ascending:false"      \
+                        ",stable:true>."                                                          \
+                        "(number_of_segments:~"                                                   \
+                        + std::to_string(SEGMENTS) + " segments)")                                \
+                .c_str(),                                                                         \
+            [=](benchmark::State& state)                                                          \
+            { run_sort_pairs_benchmark<Key, Value>(state, SEGMENTS, stream, size, true, true); })
+#define BENCHMARK_PAIR_TYPE(type, value)                                                         \
+    CREATE_SORT_PAIRS_BENCHMARK(type, value, 10), CREATE_SORT_PAIRS_BENCHMARK(type, value, 100), \
         CREATE_SORT_PAIRS_BENCHMARK(type, value, 10000)
 
-void add_sort_pairs_benchmarks(std::vector<benchmark::internal::Benchmark *> &benchmarks,
-                               hipStream_t stream,
-                               size_t size)
+void add_sort_pairs_benchmarks(std::vector<benchmark::internal::Benchmark*>& benchmarks,
+                               hipStream_t                                   stream,
+                               size_t                                        size)
 {
-    using custom_float2 = benchmark_utils::custom_type<float, float>;
+    using custom_float2  = benchmark_utils::custom_type<float, float>;
     using custom_double2 = benchmark_utils::custom_type<double, double>;
 
-    std::vector<benchmark::internal::Benchmark *> bs =
-        {
-            BENCHMARK_PAIR_TYPE(int, float),
-            BENCHMARK_PAIR_TYPE(long long, double),
-            BENCHMARK_PAIR_TYPE(int8_t, int8_t),
-            BENCHMARK_PAIR_TYPE(uint8_t, uint8_t),
-            BENCHMARK_PAIR_TYPE(int, custom_float2),
-            BENCHMARK_PAIR_TYPE(long long, custom_double2),
-        };
+    std::vector<benchmark::internal::Benchmark*> bs = {
+        BENCHMARK_PAIR_TYPE(int, float),
+        BENCHMARK_PAIR_TYPE(long long, double),
+        BENCHMARK_PAIR_TYPE(int8_t, int8_t),
+        BENCHMARK_PAIR_TYPE(uint8_t, uint8_t),
+        BENCHMARK_PAIR_TYPE(int, custom_float2),
+        BENCHMARK_PAIR_TYPE(long long, custom_double2),
+    };
     benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
 }
 
-int main(int argc, char *argv[])
+int main(int argc, char* argv[])
 {
     cli::Parser parser(argc, argv);
     parser.set_optional<size_t>("size", "size", DEFAULT_N, "number of values");
@@ -467,35 +465,35 @@ int main(int argc, char *argv[])
 
     // Parse argv
     benchmark::Initialize(&argc, argv);
-    const size_t size = parser.get<size_t>("size");
-    const int trials = parser.get<int>("trials");
+    const size_t size   = parser.get<size_t>("size");
+    const int    trials = parser.get<int>("trials");
 
     std::cout << "benchmark_device_segmented_sort" << std::endl;
 
     // HIP
-    hipStream_t stream = 0; // default
+    hipStream_t     stream = 0; // default
     hipDeviceProp_t devProp;
-    int device_id = 0;
+    int             device_id = 0;
     HIP_CHECK(hipGetDevice(&device_id));
     HIP_CHECK(hipGetDeviceProperties(&devProp, device_id));
     std::cout << "[HIP] Device name: " << devProp.name << std::endl;
 
     // Add benchmarks
-    std::vector<benchmark::internal::Benchmark *> benchmarks;
+    std::vector<benchmark::internal::Benchmark*> benchmarks;
     add_sort_keys_benchmarks(benchmarks, stream, size);
     add_sort_pairs_benchmarks(benchmarks, stream, size);
 
     // Use manual timing
-    for (auto &b : benchmarks)
+    for(auto& b : benchmarks)
     {
         b->UseManualTime();
         b->Unit(benchmark::kMillisecond);
     }
 
     // Force number of iterations
-    if (trials > 0)
+    if(trials > 0)
     {
-        for (auto &b : benchmarks)
+        for(auto& b : benchmarks)
         {
             b->Iterations(trials);
         }
diff --git a/benchmark/benchmark_device_select.cpp b/benchmark/benchmark_device_select.cpp
index d1617c79..c0921d54 100644
--- a/benchmark/benchmark_device_select.cpp
+++ b/benchmark/benchmark_device_select.cpp
@@ -9,8 +9,8 @@
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -25,93 +25,71 @@
 // HIP API
 #include "hipcub/device/device_select.hpp"
 
-
 #ifndef DEFAULT_N
 const size_t DEFAULT_N = 1024 * 1024 * 32;
 #endif
 
 template<class T, class FlagType>
 void run_flagged_benchmark(benchmark::State& state,
-                           size_t size,
+                           size_t            size,
                            const hipStream_t stream,
-                           float true_probability)
+                           float             true_probability)
 {
-    std::vector<T> input;
-    std::vector<FlagType> flags = benchmark_utils::get_random_data01<FlagType>(size, true_probability);
+    std::vector<T>        input;
+    std::vector<FlagType> flags
+        = benchmark_utils::get_random_data01<FlagType>(size, true_probability);
     if(std::is_floating_point<T>::value)
     {
         input = benchmark_utils::get_random_data<T>(size, T(-1000), T(1000));
-    }
-    else
+    } else
     {
-        input = benchmark_utils::get_random_data<T>(
-            size,
-            std::numeric_limits<T>::min(),
-            std::numeric_limits<T>::max()
-        );
+        input = benchmark_utils::get_random_data<T>(size,
+                                                    std::numeric_limits<T>::min(),
+                                                    std::numeric_limits<T>::max());
     }
 
-    T * d_input;
-    FlagType * d_flags;
-    T * d_output;
-    unsigned int * d_selected_count_output;
+    T*            d_input;
+    FlagType*     d_flags;
+    T*            d_output;
+    unsigned int* d_selected_count_output;
     HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T)));
     HIP_CHECK(hipMalloc(&d_flags, flags.size() * sizeof(FlagType)));
     HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T)));
     HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int)));
+    HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice));
     HIP_CHECK(
-        hipMemcpy(
-            d_input, input.data(),
-            input.size() * sizeof(T),
-            hipMemcpyHostToDevice
-        )
-    );
-    HIP_CHECK(
-        hipMemcpy(
-            d_flags, flags.data(),
-            flags.size() * sizeof(FlagType),
-            hipMemcpyHostToDevice
-        )
-    );
+        hipMemcpy(d_flags, flags.data(), flags.size() * sizeof(FlagType), hipMemcpyHostToDevice));
     HIP_CHECK(hipDeviceSynchronize());
     // Allocate temporary storage memory
     size_t temp_storage_size_bytes = 0;
 
     // Get size of d_temp_storage
-    HIP_CHECK(
-        hipcub::DeviceSelect::Flagged(
-            nullptr,
-            temp_storage_size_bytes,
-            d_input,
-            d_flags,
-            d_output,
-            d_selected_count_output,
-            input.size(),
-            stream
-        )
-    );
+    HIP_CHECK(hipcub::DeviceSelect::Flagged(nullptr,
+                                            temp_storage_size_bytes,
+                                            d_input,
+                                            d_flags,
+                                            d_output,
+                                            d_selected_count_output,
+                                            input.size(),
+                                            stream));
     HIP_CHECK(hipDeviceSynchronize());
 
     // allocate temporary storage
-    void * d_temp_storage = nullptr;
+    void* d_temp_storage = nullptr;
     HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes));
     HIP_CHECK(hipDeviceSynchronize());
 
     // Warm-up
     for(size_t i = 0; i < 10; i++)
     {
-        HIP_CHECK(
-            hipcub::DeviceSelect::Flagged(
-                d_temp_storage,
-                temp_storage_size_bytes,
-                d_input,
-                d_flags,
-                d_output,
-                d_selected_count_output,
-                input.size(),
-                stream
-            )
-        );
+        HIP_CHECK(hipcub::DeviceSelect::Flagged(d_temp_storage,
+                                                temp_storage_size_bytes,
+                                                d_input,
+                                                d_flags,
+                                                d_output,
+                                                d_selected_count_output,
+                                                input.size(),
+                                                stream));
     }
     HIP_CHECK(hipDeviceSynchronize());
 
@@ -121,24 +99,20 @@ void run_flagged_benchmark(benchmark::State& state,
         auto start = std::chrono::high_resolution_clock::now();
         for(size_t i = 0; i < batch_size; i++)
         {
-            HIP_CHECK(
-                hipcub::DeviceSelect::Flagged(
-                    d_temp_storage,
-                    temp_storage_size_bytes,
-                    d_input,
-                    d_flags,
-                    d_output,
-                    d_selected_count_output,
-                    input.size(),
-                    stream
-                )
-            );
+            HIP_CHECK(hipcub::DeviceSelect::Flagged(d_temp_storage,
+                                                    temp_storage_size_bytes,
+                                                    d_input,
+                                                    d_flags,
+                                                    d_output,
+                                                    d_selected_count_output,
+                                                    input.size(),
+                                                    stream));
         }
         HIP_CHECK(hipDeviceSynchronize());
 
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
         state.SetIterationTime(elapsed_seconds.count());
     }
     state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T));
@@ -154,71 +128,58 @@ void run_flagged_benchmark(benchmark::State& state,
 
 template<class T>
 void run_selectop_benchmark(benchmark::State& state,
-                            size_t size,
+                            size_t            size,
                             const hipStream_t stream,
-                            float true_probability)
+                            float             true_probability)
 {
     std::vector<T> input = benchmark_utils::get_random_data<T>(size, T(0), T(1000));
 
-    auto select_op = [true_probability] __device__ (const T& value) -> bool
+    auto select_op = [true_probability] __device__(const T& value) -> bool
     {
-        if(value < T(1000 * true_probability)) return true;
+        if(value < T(1000 * true_probability))
+            return true;
         return false;
     };
 
-    T * d_input;
-    T * d_output;
-    unsigned int * d_selected_count_output;
+    T*            d_input;
+    T*            d_output;
+    unsigned int* d_selected_count_output;
     HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T)));
     HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T)));
     HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int)));
-    HIP_CHECK(
-        hipMemcpy(
-            d_input, input.data(),
-            input.size() * sizeof(T),
-            hipMemcpyHostToDevice
-        )
-    );
+    HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice));
     HIP_CHECK(hipDeviceSynchronize());
 
     // Allocate temporary storage memory
     size_t temp_storage_size_bytes;
 
     // Get size of d_temp_storage
-    HIP_CHECK(
-        hipcub::DeviceSelect::If(
-          nullptr,
-          temp_storage_size_bytes,
-          d_input,
-          d_output,
-          d_selected_count_output,
-          input.size(),
-          select_op,
-          stream
-        )
-    );
+    HIP_CHECK(hipcub::DeviceSelect::If(nullptr,
+                                       temp_storage_size_bytes,
+                                       d_input,
+                                       d_output,
+                                       d_selected_count_output,
+                                       input.size(),
+                                       select_op,
+                                       stream));
     HIP_CHECK(hipDeviceSynchronize());
 
     // allocate temporary storage
-    void * d_temp_storage = nullptr;
+    void* d_temp_storage = nullptr;
     HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes));
     HIP_CHECK(hipDeviceSynchronize());
 
     // Warm-up
     for(size_t i = 0; i < 10; i++)
     {
-        HIP_CHECK(
-            hipcub::DeviceSelect::If(
-                d_temp_storage,
-                temp_storage_size_bytes,
-                d_input,
-                d_output,
-                d_selected_count_output,
-                input.size(),
-                select_op,
-                stream
-            )
-        );
+        HIP_CHECK(hipcub::DeviceSelect::If(d_temp_storage,
+                                           temp_storage_size_bytes,
+                                           d_input,
+                                           d_output,
+                                           d_selected_count_output,
+                                           input.size(),
+                                           select_op,
+                                           stream));
     }
     HIP_CHECK(hipDeviceSynchronize());
 
@@ -228,24 +189,20 @@ void run_selectop_benchmark(benchmark::State& state,
         auto start = std::chrono::high_resolution_clock::now();
         for(size_t i = 0; i < batch_size; i++)
         {
-            HIP_CHECK(
-                hipcub::DeviceSelect::If(
-                    d_temp_storage,
-                    temp_storage_size_bytes,
-                    d_input,
-                    d_output,
-                    d_selected_count_output,
-                    input.size(),
-                    select_op,
-                    stream
-                )
-            );
+            HIP_CHECK(hipcub::DeviceSelect::If(d_temp_storage,
+                                               temp_storage_size_bytes,
+                                               d_input,
+                                               d_output,
+                                               d_selected_count_output,
+                                               input.size(),
+                                               select_op,
+                                               stream));
         }
         HIP_CHECK(hipDeviceSynchronize());
 
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
         state.SetIterationTime(elapsed_seconds.count());
     }
     state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T));
@@ -260,74 +217,60 @@ void run_selectop_benchmark(benchmark::State& state,
 
 template<class T>
 void run_unique_benchmark(benchmark::State& state,
-                          size_t size,
+                          size_t            size,
                           const hipStream_t stream,
-                          float discontinuity_probability)
+                          float             discontinuity_probability)
 {
     hipcub::Sum op;
 
     std::vector<T> input(size);
     {
         auto input01 = benchmark_utils::get_random_data01<T>(size, discontinuity_probability);
-        auto acc = input01[0];
-        input[0] = acc;
+        auto acc     = input01[0];
+        input[0]     = acc;
         for(size_t i = 1; i < input01.size(); i++)
         {
             input[i] = op(acc, input01[i]);
         }
     }
 
-    T * d_input;
-    T * d_output;
-    unsigned int * d_selected_count_output;
+    T*            d_input;
+    T*            d_output;
+    unsigned int* d_selected_count_output;
     HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T)));
     HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T)));
     HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int)));
-    HIP_CHECK(
-        hipMemcpy(
-            d_input, input.data(),
-            input.size() * sizeof(T),
-            hipMemcpyHostToDevice
-        )
-    );
+    HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice));
     HIP_CHECK(hipDeviceSynchronize());
 
     // Allocate temporary storage memory
     size_t temp_storage_size_bytes;
 
     // Get size of d_temp_storage
-    HIP_CHECK(
-        hipcub::DeviceSelect::Unique(
-          nullptr,
-          temp_storage_size_bytes,
-          d_input,
-          d_output,
-          d_selected_count_output,
-          input.size(),
-          stream
-        )
-    );
+    HIP_CHECK(hipcub::DeviceSelect::Unique(nullptr,
+                                           temp_storage_size_bytes,
+                                           d_input,
+                                           d_output,
+                                           d_selected_count_output,
+                                           input.size(),
+                                           stream));
     HIP_CHECK(hipDeviceSynchronize());
 
     // allocate temporary storage
-    void * d_temp_storage = nullptr;
+    void* d_temp_storage = nullptr;
     HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes));
     HIP_CHECK(hipDeviceSynchronize());
 
     // Warm-up
     for(size_t i = 0; i < 10; i++)
     {
-        HIP_CHECK(
-            hipcub::DeviceSelect::Unique(
-                d_temp_storage,
-                temp_storage_size_bytes,
-                d_input,
-                d_output,
-                d_selected_count_output,
-                input.size(),
-                stream
-            )
-        );
+        HIP_CHECK(hipcub::DeviceSelect::Unique(d_temp_storage,
+                                               temp_storage_size_bytes,
+                                               d_input,
+                                               d_output,
+                                               d_selected_count_output,
+                                               input.size(),
+                                               stream));
     }
     HIP_CHECK(hipDeviceSynchronize());
 
@@ -337,23 +280,19 @@ void run_unique_benchmark(benchmark::State& state,
         auto start = std::chrono::high_resolution_clock::now();
         for(size_t i = 0; i < batch_size; i++)
         {
-            HIP_CHECK(
-                hipcub::DeviceSelect::Unique(
-                    d_temp_storage,
-                    temp_storage_size_bytes,
-                    d_input,
-                    d_output,
-                    d_selected_count_output,
-                    input.size(),
-                    stream
-                )
-            );
+            HIP_CHECK(hipcub::DeviceSelect::Unique(d_temp_storage,
+                                                   temp_storage_size_bytes,
+                                                   d_input,
+                                                   d_output,
+                                                   d_selected_count_output,
+                                                   input.size(),
+                                                   stream));
         }
         HIP_CHECK(hipDeviceSynchronize());
 
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
         state.SetIterationTime(elapsed_seconds.count());
     }
     state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T));
@@ -367,20 +306,20 @@ void run_unique_benchmark(benchmark::State& state,
 
 template<class KeyT, class ValueT>
 void run_unique_by_key_benchmark(benchmark::State& state,
-                                 size_t size,
+                                 size_t            size,
                                  const hipStream_t stream,
-                                 float discontinuity_probability)
+                                 float             discontinuity_probability)
 {
     hipcub::Sum op;
 
     std::vector<KeyT> input_keys(size);
     {
         auto input01 = benchmark_utils::get_random_data01<KeyT>(size, discontinuity_probability);
-        auto acc = input01[0];
+        auto acc     = input01[0];
 
         input_keys[0] = acc;
 
-        for (size_t i = 1; i < input01.size(); i++)
+        for(size_t i = 1; i < input01.size(); i++)
         {
             input_keys[i] = op(acc, input01[i]);
         }
@@ -389,10 +328,10 @@ void run_unique_by_key_benchmark(benchmark::State& state,
     const auto input_values
         = benchmark_utils::get_random_data<ValueT>(size, ValueT(-1000), ValueT(1000));
 
-    KeyT* d_keys_input;
-    ValueT* d_values_input;
-    KeyT* d_keys_output;
-    ValueT* d_values_output;
+    KeyT*         d_keys_input;
+    ValueT*       d_values_input;
+    KeyT*         d_keys_output;
+    ValueT*       d_values_output;
     unsigned int* d_selected_count_output;
 
     HIP_CHECK(hipMalloc(&d_keys_input, input_keys.size() * sizeof(input_keys[0])));
@@ -401,40 +340,28 @@ void run_unique_by_key_benchmark(benchmark::State& state,
     HIP_CHECK(hipMalloc(&d_values_output, input_values.size() * sizeof(input_values[0])));
     HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(*d_selected_count_output)));
 
-    HIP_CHECK(
-        hipMemcpy(
-            d_keys_input,
-            input_keys.data(),
-            input_keys.size() * sizeof(input_keys[0]),
-            hipMemcpyHostToDevice
-        )
-    );
-    HIP_CHECK(
-        hipMemcpy(
-            d_values_input,
-            input_values.data(),
-            input_values.size() * sizeof(input_values[0]),
-            hipMemcpyHostToDevice
-        )
-    );
+    HIP_CHECK(hipMemcpy(d_keys_input,
+                        input_keys.data(),
+                        input_keys.size() * sizeof(input_keys[0]),
+                        hipMemcpyHostToDevice));
+    HIP_CHECK(hipMemcpy(d_values_input,
+                        input_values.data(),
+                        input_values.size() * sizeof(input_values[0]),
+                        hipMemcpyHostToDevice));
 
     // Allocate temporary storage memory
     size_t temp_storage_size_bytes;
 
     // Get size of d_temp_storage
-    HIP_CHECK(
-        hipcub::DeviceSelect::UniqueByKey(
-            nullptr,
-            temp_storage_size_bytes,
-            d_keys_input,
-            d_values_input,
-            d_keys_output,
-            d_values_output,
-            d_selected_count_output,
-            input_keys.size(),
-            stream
-        )
-    );
+    HIP_CHECK(hipcub::DeviceSelect::UniqueByKey(nullptr,
+                                                temp_storage_size_bytes,
+                                                d_keys_input,
+                                                d_values_input,
+                                                d_keys_output,
+                                                d_values_output,
+                                                d_selected_count_output,
+                                                input_keys.size(),
+                                                stream));
     HIP_CHECK(hipDeviceSynchronize());
 
     // allocate temporary storage
@@ -443,51 +370,45 @@ void run_unique_by_key_benchmark(benchmark::State& state,
     HIP_CHECK(hipDeviceSynchronize());
 
     // Warm-up
-    for (size_t i = 0; i < 10; i++)
+    for(size_t i = 0; i < 10; i++)
     {
-        HIP_CHECK(
-            hipcub::DeviceSelect::UniqueByKey(
-                d_temp_storage,
-                temp_storage_size_bytes,
-                d_keys_input,
-                d_values_input,
-                d_keys_output,
-                d_values_output,
-                d_selected_count_output,
-                input_keys.size(),
-                stream
-            )
-        );
+        HIP_CHECK(hipcub::DeviceSelect::UniqueByKey(d_temp_storage,
+                                                    temp_storage_size_bytes,
+                                                    d_keys_input,
+                                                    d_values_input,
+                                                    d_keys_output,
+                                                    d_values_output,
+                                                    d_selected_count_output,
+                                                    input_keys.size(),
+                                                    stream));
     }
     HIP_CHECK(hipDeviceSynchronize());
 
     const unsigned int batch_size = 10;
-    for (auto _ : state)
+    for(auto _ : state)
     {
         auto start = std::chrono::high_resolution_clock::now();
-        for (size_t i = 0; i < batch_size; i++)
+        for(size_t i = 0; i < batch_size; i++)
         {
-            HIP_CHECK(
-                hipcub::DeviceSelect::UniqueByKey(
-                    d_temp_storage,
-                    temp_storage_size_bytes,
-                    d_keys_input,
-                    d_values_input,
-                    d_keys_output,
-                    d_values_output,
-                    d_selected_count_output,
-                    input_keys.size(),
-                    stream
-                )
-            );
+            HIP_CHECK(hipcub::DeviceSelect::UniqueByKey(d_temp_storage,
+                                                        temp_storage_size_bytes,
+                                                        d_keys_input,
+                                                        d_values_input,
+                                                        d_keys_output,
+                                                        d_values_output,
+                                                        d_selected_count_output,
+                                                        input_keys.size(),
+                                                        stream));
         }
         HIP_CHECK(hipDeviceSynchronize());
 
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
         state.SetIterationTime(elapsed_seconds.count());
     }
-    state.SetBytesProcessed(state.iterations() * batch_size * size * (sizeof(KeyT) + sizeof(ValueT)));
+    state.SetBytesProcessed(state.iterations() * batch_size * size
+                            * (sizeof(KeyT) + sizeof(ValueT)));
     state.SetItemsProcessed(state.iterations() * batch_size * size);
 
     hipFree(d_keys_input);
@@ -498,55 +419,67 @@ void run_unique_by_key_benchmark(benchmark::State& state,
     hipFree(d_temp_storage);
 }
 
-#define CREATE_SELECT_FLAGGED_BENCHMARK(T, F, p) \
-benchmark::RegisterBenchmark( \
-    ("select_flagged<Datatype:" #T ",Flag Type:" #F ",Output Datatype:"#T",Selected Output Datatype:unsigned int>(Probability:" #p")"), \
-    &run_flagged_benchmark<T, F>, size, stream, p \
-)
-
-#define CREATE_SELECT_IF_BENCHMARK(T, p) \
-benchmark::RegisterBenchmark( \
-    ("select_if<Datatype:" #T ",Output Datatype:"#T",Selected Output Datatype:unsigned int>(Probability:" #p")"), \
-    &run_selectop_benchmark<T>, size, stream, p \
-)
-
-#define CREATE_UNIQUE_BENCHMARK(T, p) \
-benchmark::RegisterBenchmark( \
-    ("unique<Datatype:" #T ",Output Datatype:"#T",Selected Output Datatype:unsigned int>(Probability:" #p")"), \
-    &run_unique_benchmark<T>, size, stream, p \
-)
-
-#define CREATE_UNIQUE_BY_KEY_BENCHMARK(K, V, p) \
-benchmark::RegisterBenchmark( \
-    ("unique_by_key<" #K ", "#V", unsigned int>(p = " #p")"), \
-    &run_unique_by_key_benchmark<K, V>, size, stream, p \
-)
-
-#define BENCHMARK_FLAGGED_TYPE(type, value) \
-    CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.05f), \
-    CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.25f), \
-    CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.5f), \
-    CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.75f)
-
-#define BENCHMARK_IF_TYPE(type) \
-    CREATE_SELECT_IF_BENCHMARK(type, 0.05f), \
-    CREATE_SELECT_IF_BENCHMARK(type, 0.25f), \
-    CREATE_SELECT_IF_BENCHMARK(type, 0.5f), \
-    CREATE_SELECT_IF_BENCHMARK(type, 0.75f)
-
-#define BENCHMARK_UNIQUE_TYPE(type) \
-    CREATE_UNIQUE_BENCHMARK(type, 0.05f), \
-    CREATE_UNIQUE_BENCHMARK(type, 0.25f), \
-    CREATE_UNIQUE_BENCHMARK(type, 0.5f), \
-    CREATE_UNIQUE_BENCHMARK(type, 0.75f)
-
-#define BENCHMARK_UNIQUE_BY_KEY_TYPE(key_type, value_type) \
-    CREATE_UNIQUE_BY_KEY_BENCHMARK(key_type, value_type, 0.05f), \
-    CREATE_UNIQUE_BY_KEY_BENCHMARK(key_type, value_type, 0.25f), \
-    CREATE_UNIQUE_BY_KEY_BENCHMARK(key_type, value_type, 0.5f), \
-    CREATE_UNIQUE_BY_KEY_BENCHMARK(key_type, value_type, 0.75f)
-
-int main(int argc, char *argv[])
+#define CREATE_SELECT_FLAGGED_BENCHMARK(T, F, p)                                                   \
+    benchmark::RegisterBenchmark(                                                                  \
+        std::string("device_select_flagged<data_type:" #T ",flag_type:" #F ",output_data_type:" #T \
+                    ",selected_output_data_type:unsigned int>.(probability:" #p ")")               \
+            .c_str(),                                                                              \
+        &run_flagged_benchmark<T, F>,                                                              \
+        size,                                                                                      \
+        stream,                                                                                    \
+        p)
+
+#define CREATE_SELECT_IF_BENCHMARK(T, p)                                             \
+    benchmark::RegisterBenchmark(                                                    \
+        std::string("device_select_if<data_type:" #T ",output_data_type:" #T         \
+                    ",selected_output_data_type:unsigned int>.(probability:" #p ")") \
+            .c_str(),                                                                \
+        &run_selectop_benchmark<T>,                                                  \
+        size,                                                                        \
+        stream,                                                                      \
+        p)
+
+#define CREATE_UNIQUE_BENCHMARK(T, p)                                                \
+    benchmark::RegisterBenchmark(                                                    \
+        std::string("device_select_unique<data_type:" #T ",output_data_type:" #T     \
+                    ",selected_output_data_type:unsigned int>.(probability:" #p ")") \
+            .c_str(),                                                                \
+        &run_unique_benchmark<T>,                                                    \
+        size,                                                                        \
+        stream,                                                                      \
+        p)
+
+#define CREATE_UNIQUE_BY_KEY_BENCHMARK(K, V, p)                                            \
+    benchmark::RegisterBenchmark(                                                          \
+        std::string("device_select_unique_by_key<Key data_type:" #K ",value_data_type:" #V \
+                    ",selected_output_data_type:unsigned int>.(probability:" #p ")")       \
+            .c_str(),                                                                      \
+        &run_unique_by_key_benchmark<K, V>,                                                \
+        size,                                                                              \
+        stream,                                                                            \
+        p)
+
+#define BENCHMARK_FLAGGED_TYPE(type, value)                  \
+    CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.05f),     \
+        CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.25f), \
+        CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.5f),  \
+        CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.75f)
+
+#define BENCHMARK_IF_TYPE(type)                                                       \
+    CREATE_SELECT_IF_BENCHMARK(type, 0.05f), CREATE_SELECT_IF_BENCHMARK(type, 0.25f), \
+        CREATE_SELECT_IF_BENCHMARK(type, 0.5f), CREATE_SELECT_IF_BENCHMARK(type, 0.75f)
+
+#define BENCHMARK_UNIQUE_TYPE(type)                                             \
+    CREATE_UNIQUE_BENCHMARK(type, 0.05f), CREATE_UNIQUE_BENCHMARK(type, 0.25f), \
+        CREATE_UNIQUE_BENCHMARK(type, 0.5f), CREATE_UNIQUE_BENCHMARK(type, 0.75f)
+
+#define BENCHMARK_UNIQUE_BY_KEY_TYPE(key_type, value_type)           \
+    CREATE_UNIQUE_BY_KEY_BENCHMARK(key_type, value_type, 0.05f),     \
+        CREATE_UNIQUE_BY_KEY_BENCHMARK(key_type, value_type, 0.25f), \
+        CREATE_UNIQUE_BY_KEY_BENCHMARK(key_type, value_type, 0.5f),  \
+        CREATE_UNIQUE_BY_KEY_BENCHMARK(key_type, value_type, 0.75f)
+
+int main(int argc, char* argv[])
 {
     cli::Parser parser(argc, argv);
     parser.set_optional<size_t>("size", "size", DEFAULT_N, "number of values");
@@ -555,53 +488,51 @@ int main(int argc, char *argv[])
 
     // Parse argv
     benchmark::Initialize(&argc, argv);
-    const size_t size = parser.get<size_t>("size");
-    const int trials = parser.get<int>("trials");
+    const size_t size   = parser.get<size_t>("size");
+    const int    trials = parser.get<int>("trials");
 
     std::cout << "benchmark_device_select" << std::endl;
 
     // HIP
-    hipStream_t stream = 0; // default
+    hipStream_t     stream = 0; // default
     hipDeviceProp_t devProp;
-    int device_id = 0;
+    int             device_id = 0;
     HIP_CHECK(hipGetDevice(&device_id));
     HIP_CHECK(hipGetDeviceProperties(&devProp, device_id));
     std::cout << "[HIP] Device name: " << devProp.name << std::endl;
 
-    using custom_double2 = benchmark_utils::custom_type<double, double>;
+    using custom_double2    = benchmark_utils::custom_type<double, double>;
     using custom_int_double = benchmark_utils::custom_type<int, double>;
 
     // Add benchmarks
-    std::vector<benchmark::internal::Benchmark*> benchmarks =
-    {
-        BENCHMARK_FLAGGED_TYPE(int, unsigned char),
-        BENCHMARK_FLAGGED_TYPE(float, unsigned char),
-        BENCHMARK_FLAGGED_TYPE(double, unsigned char),
-        BENCHMARK_FLAGGED_TYPE(uint8_t, uint8_t),
-        BENCHMARK_FLAGGED_TYPE(int8_t, int8_t),
-        BENCHMARK_FLAGGED_TYPE(custom_double2, unsigned char),
-
-        BENCHMARK_IF_TYPE(int),
-        BENCHMARK_IF_TYPE(float),
-        BENCHMARK_IF_TYPE(double),
-        BENCHMARK_IF_TYPE(uint8_t),
-        BENCHMARK_IF_TYPE(int8_t),
-        BENCHMARK_IF_TYPE(custom_int_double),
-
-        BENCHMARK_UNIQUE_TYPE(int),
-        BENCHMARK_UNIQUE_TYPE(float),
-        BENCHMARK_UNIQUE_TYPE(double),
-        BENCHMARK_UNIQUE_TYPE(uint8_t),
-        BENCHMARK_UNIQUE_TYPE(int8_t),
-        BENCHMARK_UNIQUE_TYPE(custom_int_double),
-
-        BENCHMARK_UNIQUE_BY_KEY_TYPE(int, int),
-        BENCHMARK_UNIQUE_BY_KEY_TYPE(float, double),
-        BENCHMARK_UNIQUE_BY_KEY_TYPE(double, custom_double2),
-        BENCHMARK_UNIQUE_BY_KEY_TYPE(uint8_t, uint8_t),
-        BENCHMARK_UNIQUE_BY_KEY_TYPE(int8_t, double),
-        BENCHMARK_UNIQUE_BY_KEY_TYPE(custom_int_double, custom_int_double)
-    };
+    std::vector<benchmark::internal::Benchmark*> benchmarks
+        = {BENCHMARK_FLAGGED_TYPE(int, unsigned char),
+           BENCHMARK_FLAGGED_TYPE(float, unsigned char),
+           BENCHMARK_FLAGGED_TYPE(double, unsigned char),
+           BENCHMARK_FLAGGED_TYPE(uint8_t, uint8_t),
+           BENCHMARK_FLAGGED_TYPE(int8_t, int8_t),
+           BENCHMARK_FLAGGED_TYPE(custom_double2, unsigned char),
+
+           BENCHMARK_IF_TYPE(int),
+           BENCHMARK_IF_TYPE(float),
+           BENCHMARK_IF_TYPE(double),
+           BENCHMARK_IF_TYPE(uint8_t),
+           BENCHMARK_IF_TYPE(int8_t),
+           BENCHMARK_IF_TYPE(custom_int_double),
+
+           BENCHMARK_UNIQUE_TYPE(int),
+           BENCHMARK_UNIQUE_TYPE(float),
+           BENCHMARK_UNIQUE_TYPE(double),
+           BENCHMARK_UNIQUE_TYPE(uint8_t),
+           BENCHMARK_UNIQUE_TYPE(int8_t),
+           BENCHMARK_UNIQUE_TYPE(custom_int_double),
+
+           BENCHMARK_UNIQUE_BY_KEY_TYPE(int, int),
+           BENCHMARK_UNIQUE_BY_KEY_TYPE(float, double),
+           BENCHMARK_UNIQUE_BY_KEY_TYPE(double, custom_double2),
+           BENCHMARK_UNIQUE_BY_KEY_TYPE(uint8_t, uint8_t),
+           BENCHMARK_UNIQUE_BY_KEY_TYPE(int8_t, double),
+           BENCHMARK_UNIQUE_BY_KEY_TYPE(custom_int_double, custom_int_double)};
 
     // Use manual timing
     for(auto& b : benchmarks)
diff --git a/benchmark/benchmark_device_spmv.cpp b/benchmark/benchmark_device_spmv.cpp
index e884f361..37d119ee 100644
--- a/benchmark/benchmark_device_spmv.cpp
+++ b/benchmark/benchmark_device_spmv.cpp
@@ -9,8 +9,8 @@
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -29,26 +29,30 @@
 const size_t DEFAULT_N = 1024 * 32;
 #endif
 
-const unsigned int batch_size = 10;
+const unsigned int batch_size  = 10;
 const unsigned int warmup_size = 5;
 
 template<class T>
 void run_benchmark(benchmark::State& state,
-                   size_t size,
+                   size_t            size,
                    const hipStream_t stream,
-                   float probability)
+                   float             probability)
 {
     const T rand_min = T(1);
     const T rand_max = T(10);
 
     // generate a lexicograhically sorted list of (row, column) index tuples
     // number of nonzeroes cannot be guaranteed as duplicates may exist
-    const int num_nonzeroes_attempt = static_cast<int>(std::min(
-        static_cast<size_t>(INT_MAX), static_cast<size_t>(probability * static_cast<float>(size * size))));
+    const int num_nonzeroes_attempt = static_cast<int>(
+        std::min(static_cast<size_t>(INT_MAX),
+                 static_cast<size_t>(probability * static_cast<float>(size * size))));
     std::vector<std::pair<int, int>> indices(num_nonzeroes_attempt);
     {
-        std::vector<int> flat_indices = benchmark_utils::get_random_data<int>(
-            2 * num_nonzeroes_attempt, 0, size - 1, 2 * num_nonzeroes_attempt);
+        std::vector<int> flat_indices
+            = benchmark_utils::get_random_data<int>(2 * num_nonzeroes_attempt,
+                                                    0,
+                                                    size - 1,
+                                                    2 * num_nonzeroes_attempt);
         for(int i = 0; i < num_nonzeroes_attempt; i++)
         {
             indices[i] = std::make_pair(flat_indices[2 * i], flat_indices[2 * i + 1]);
@@ -57,16 +61,17 @@ void run_benchmark(benchmark::State& state,
     }
 
     // generate the compressed sparse rows matrix
-    std::pair<int, int> prev_cell = std::make_pair(-1, -1);
-    int num_nonzeroes = 0;
-    std::vector<int> row_offsets(size + 1);
-    // this vector might be too large, but doing the allocation now eliminates a scan
-    std::vector<int> column_indices(num_nonzeroes_attempt); 
-    row_offsets[0] = 0;
+    std::pair<int, int> prev_cell     = std::make_pair(-1, -1);
+    int                 num_nonzeroes = 0;
+    std::vector<int>    row_offsets(size + 1);
+    // this vector might be too large, but doing the allocation now eliminates a
+    // scan
+    std::vector<int> column_indices(num_nonzeroes_attempt);
+    row_offsets[0]       = 0;
     int last_row_written = 0;
     for(int i = 0; i < num_nonzeroes_attempt; i++)
     {
-        if(indices[i] != prev_cell) 
+        if(indices[i] != prev_cell)
         {
             // update the row offets if we go to the next row (or skip some)
             if(indices[i].first != last_row_written)
@@ -94,67 +99,90 @@ void run_benchmark(benchmark::State& state,
 
     std::vector<T> vector_x = benchmark_utils::get_random_data<T>(size, rand_min, rand_max);
 
-    T * d_values;
-    int * d_row_offsets;
-    int * d_column_indices;
-    T * d_vector_x;
-    T * d_vector_y;
-    HIP_CHECK(hipMalloc(&d_values,  values.size() * sizeof(T)));
+    T*   d_values;
+    int* d_row_offsets;
+    int* d_column_indices;
+    T*   d_vector_x;
+    T*   d_vector_y;
+    HIP_CHECK(hipMalloc(&d_values, values.size() * sizeof(T)));
     HIP_CHECK(hipMalloc(&d_row_offsets, row_offsets.size() * sizeof(int)));
     HIP_CHECK(hipMalloc(&d_column_indices, num_nonzeroes * sizeof(int)));
     HIP_CHECK(hipMalloc(&d_vector_x, vector_x.size() * sizeof(T)));
     HIP_CHECK(hipMalloc(&d_vector_y, size * sizeof(T)));
-    HIP_CHECK(hipMemcpy(
-        d_values, values.data(), values.size() * sizeof(T), 
-        hipMemcpyHostToDevice));
-    HIP_CHECK(hipMemcpy(
-        d_row_offsets, row_offsets.data(), row_offsets.size() * sizeof(int), 
-        hipMemcpyHostToDevice));
-    HIP_CHECK(hipMemcpy(
-        d_column_indices, column_indices.data(), num_nonzeroes * sizeof(int), 
-        hipMemcpyHostToDevice));
-    HIP_CHECK(hipMemcpy(
-        d_vector_x, vector_x.data(), vector_x.size() * sizeof(T), 
-        hipMemcpyHostToDevice));
+    HIP_CHECK(hipMemcpy(d_values, values.data(), values.size() * sizeof(T), hipMemcpyHostToDevice));
+    HIP_CHECK(hipMemcpy(d_row_offsets,
+                        row_offsets.data(),
+                        row_offsets.size() * sizeof(int),
+                        hipMemcpyHostToDevice));
+    HIP_CHECK(hipMemcpy(d_column_indices,
+                        column_indices.data(),
+                        num_nonzeroes * sizeof(int),
+                        hipMemcpyHostToDevice));
+    HIP_CHECK(
+        hipMemcpy(d_vector_x, vector_x.data(), vector_x.size() * sizeof(T), hipMemcpyHostToDevice));
     HIP_CHECK(hipDeviceSynchronize());
 
     // Allocate temporary storage memory
     size_t temp_storage_size_bytes;
 
     // Get size of d_temp_storage
-    HIP_CHECK(hipcub::DeviceSpmv::CsrMV(
-          nullptr, temp_storage_size_bytes, d_values, d_row_offsets, 
-          d_column_indices, d_vector_x, d_vector_y, size, size, num_nonzeroes, stream));
+    HIP_CHECK(hipcub::DeviceSpmv::CsrMV(nullptr,
+                                        temp_storage_size_bytes,
+                                        d_values,
+                                        d_row_offsets,
+                                        d_column_indices,
+                                        d_vector_x,
+                                        d_vector_y,
+                                        size,
+                                        size,
+                                        num_nonzeroes,
+                                        stream));
     HIP_CHECK(hipDeviceSynchronize());
 
     // allocate temporary storage
-    void * d_temp_storage = nullptr;
+    void* d_temp_storage = nullptr;
     HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes));
     HIP_CHECK(hipDeviceSynchronize());
 
     // Warm-up
-    for(size_t i = 0; i < warmup_size; i++) 
+    for(size_t i = 0; i < warmup_size; i++)
     {
-        HIP_CHECK(hipcub::DeviceSpmv::CsrMV(
-            d_temp_storage, temp_storage_size_bytes, d_values, d_row_offsets, 
-            d_column_indices, d_vector_x, d_vector_y, size, size, num_nonzeroes, stream));
+        HIP_CHECK(hipcub::DeviceSpmv::CsrMV(d_temp_storage,
+                                            temp_storage_size_bytes,
+                                            d_values,
+                                            d_row_offsets,
+                                            d_column_indices,
+                                            d_vector_x,
+                                            d_vector_y,
+                                            size,
+                                            size,
+                                            num_nonzeroes,
+                                            stream));
     }
     HIP_CHECK(hipDeviceSynchronize());
 
-    for(auto _ : state) 
+    for(auto _ : state)
     {
         auto start = std::chrono::high_resolution_clock::now();
-        for(size_t i = 0; i < batch_size; i++) 
+        for(size_t i = 0; i < batch_size; i++)
         {
-            HIP_CHECK(hipcub::DeviceSpmv::CsrMV(
-                d_temp_storage, temp_storage_size_bytes, d_values, d_row_offsets,
-                d_column_indices, d_vector_x, d_vector_y, size, size, num_nonzeroes, stream));
+            HIP_CHECK(hipcub::DeviceSpmv::CsrMV(d_temp_storage,
+                                                temp_storage_size_bytes,
+                                                d_values,
+                                                d_row_offsets,
+                                                d_column_indices,
+                                                d_vector_x,
+                                                d_vector_y,
+                                                size,
+                                                size,
+                                                num_nonzeroes,
+                                                stream));
         }
         HIP_CHECK(hipDeviceSynchronize());
 
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
         state.SetIterationTime(elapsed_seconds.count());
     }
     state.SetBytesProcessed(state.iterations() * batch_size * (num_nonzeroes + size) * sizeof(T));
@@ -169,20 +197,20 @@ void run_benchmark(benchmark::State& state,
     HIP_CHECK(hipDeviceSynchronize());
 }
 
-#define CREATE_BENCHMARK(T, p)         \
-benchmark::RegisterBenchmark(          \
-    ("CsrMV<" #T ">(p = " #p")"),      \
-    &run_benchmark<T>, size, stream, p \
-)
+#define CREATE_BENCHMARK(T, p)                                                          \
+    benchmark::RegisterBenchmark(                                                       \
+        std::string("device_spmv_CsrMV<data_type:" #T ",probability:" #p ">.").c_str(), \
+        &run_benchmark<T>,                                                              \
+        size,                                                                           \
+        stream,                                                                         \
+        p)
 
-#define BENCHMARK_TYPE(type)         \
-    CREATE_BENCHMARK(type, 1.0e-6f), \
-    CREATE_BENCHMARK(type, 1.0e-5f), \
-    CREATE_BENCHMARK(type, 1.0e-4f), \
-    CREATE_BENCHMARK(type, 1.0e-3f), \
-    CREATE_BENCHMARK(type, 1.0e-2f)
+#define BENCHMARK_TYPE(type)                                              \
+    CREATE_BENCHMARK(type, 1.0e-6f), CREATE_BENCHMARK(type, 1.0e-5f),     \
+        CREATE_BENCHMARK(type, 1.0e-4f), CREATE_BENCHMARK(type, 1.0e-3f), \
+        CREATE_BENCHMARK(type, 1.0e-2f)
 
-int main(int argc, char *argv[])
+int main(int argc, char* argv[])
 {
     cli::Parser parser(argc, argv);
     parser.set_optional<size_t>("size", "size", DEFAULT_N, "number of values");
@@ -191,20 +219,21 @@ int main(int argc, char *argv[])
 
     // Parse argv
     benchmark::Initialize(&argc, argv);
-    const size_t size = parser.get<size_t>("size");
-    const int trials = parser.get<int>("trials");
+    const size_t size   = parser.get<size_t>("size");
+    const int    trials = parser.get<int>("trials");
 
     // HIP
-    hipStream_t stream = 0; // default
+    hipStream_t     stream = 0; // default
     hipDeviceProp_t devProp;
-    int device_id = 0;
+    int             device_id = 0;
     HIP_CHECK(hipGetDevice(&device_id));
     HIP_CHECK(hipGetDeviceProperties(&devProp, device_id));
+
+    std::cout << "benchmark_device_spmv" << std::endl;
     std::cout << "[HIP] Device name: " << devProp.name << std::endl;
 
     // Add benchmarks
-    std::vector<benchmark::internal::Benchmark*> benchmarks =
-    {
+    std::vector<benchmark::internal::Benchmark*> benchmarks = {
         BENCHMARK_TYPE(int),
         BENCHMARK_TYPE(unsigned int),
         BENCHMARK_TYPE(float),
diff --git a/benchmark/benchmark_utils.hpp b/benchmark/benchmark_utils.hpp
index 6e2f9793..fa3da901 100644
--- a/benchmark/benchmark_utils.hpp
+++ b/benchmark/benchmark_utils.hpp
@@ -38,9 +38,9 @@
 #include "hipcub/tuple.hpp"
 
 #ifndef HIPCUB_CUB_API
-#define HIPCUB_WARP_THREADS_MACRO warpSize
+    #define HIPCUB_WARP_THREADS_MACRO warpSize
 #else
-#define HIPCUB_WARP_THREADS_MACRO CUB_PTX_WARP_THREADS
+    #define HIPCUB_WARP_THREADS_MACRO CUB_PTX_WARP_THREADS
 #endif
 
 namespace benchmark_utils
@@ -49,18 +49,18 @@ const size_t default_max_random_size = 1024 * 1024;
 // get_random_data() generates only part of sequence and replicates it,
 // because benchmarks usually do not need "true" random sequence.
 template<class T>
-inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = default_max_random_size)
-    -> typename std::enable_if<std::is_integral<T>::value, std::vector<T>>::type
+inline auto
+    get_random_data(size_t size, T min, T max, size_t max_random_size = default_max_random_size) ->
+    typename std::enable_if<std::is_integral<T>::value, std::vector<T>>::type
 {
-    std::random_device rd;
+    std::random_device         rd;
     std::default_random_engine gen(rd());
-    using distribution_type = typename std::conditional<(sizeof(T)==1), short, T>::type;	
+    using distribution_type = typename std::conditional<(sizeof(T) == 1), short, T>::type;
     std::uniform_int_distribution<distribution_type> distribution(min, max);
-    std::vector<T> data(size);
-    std::generate(
-        data.begin(), data.begin() + std::min(size, max_random_size),
-        [&]() { return distribution(gen); }
-    );
+    std::vector<T>                                   data(size);
+    std::generate(data.begin(),
+                  data.begin() + std::min(size, max_random_size),
+                  [&]() { return distribution(gen); });
     for(size_t i = max_random_size; i < size; i += max_random_size)
     {
         std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i);
@@ -69,17 +69,17 @@ inline auto get_random_data(size_t size, T min, T max, size_t max_random_size =
 }
 
 template<class T>
-inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = default_max_random_size)
-    -> typename std::enable_if<std::is_floating_point<T>::value, std::vector<T>>::type
+inline auto
+    get_random_data(size_t size, T min, T max, size_t max_random_size = default_max_random_size) ->
+    typename std::enable_if<std::is_floating_point<T>::value, std::vector<T>>::type
 {
-    std::random_device rd;
-    std::default_random_engine gen(rd());
+    std::random_device                rd;
+    std::default_random_engine        gen(rd());
     std::uniform_real_distribution<T> distribution(min, max);
-    std::vector<T> data(size);
-    std::generate(
-        data.begin(), data.begin() + std::min(size, max_random_size),
-        [&]() { return distribution(gen); }
-    );
+    std::vector<T>                    data(size);
+    std::generate(data.begin(),
+                  data.begin() + std::min(size, max_random_size),
+                  [&]() { return distribution(gen); });
     for(size_t i = max_random_size; i < size; i += max_random_size)
     {
         std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i);
@@ -88,16 +88,16 @@ inline auto get_random_data(size_t size, T min, T max, size_t max_random_size =
 }
 
 template<class T>
-inline std::vector<T> get_random_data01(size_t size, float p, size_t max_random_size = default_max_random_size)
+inline std::vector<T>
+    get_random_data01(size_t size, float p, size_t max_random_size = default_max_random_size)
 {
-    std::random_device rd;
-    std::default_random_engine gen(rd());
+    std::random_device          rd;
+    std::default_random_engine  gen(rd());
     std::bernoulli_distribution distribution(p);
-    std::vector<T> data(size);
-    std::generate(
-        data.begin(), data.begin() + std::min(size, max_random_size),
-        [&]() { return distribution(gen); }
-    );
+    std::vector<T>              data(size);
+    std::generate(data.begin(),
+                  data.begin() + std::min(size, max_random_size),
+                  [&]() { return distribution(gen); });
     for(size_t i = max_random_size; i < size; i += max_random_size)
     {
         std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i);
@@ -111,84 +111,87 @@ inline T get_random_value(T min, T max)
     return get_random_data(1, min, max)[0];
 }
 
-
 // Can't use std::prefix_sum for inclusive/exclusive scan, because
 // it does not handle short[] -> int(int a, int b) { a + b; } -> int[]
 // they way we expect. That's because sum in std::prefix_sum's implementation
 // is of type typename std::iterator_traits<InputIt>::value_type (short)
 template<class InputIt, class OutputIt, class BinaryOperation>
-OutputIt host_inclusive_scan(InputIt first, InputIt last,
-                             OutputIt d_first, BinaryOperation op)
+OutputIt host_inclusive_scan(InputIt first, InputIt last, OutputIt d_first, BinaryOperation op)
 {
-    using input_type = typename std::iterator_traits<InputIt>::value_type;
+    using input_type  = typename std::iterator_traits<InputIt>::value_type;
     using output_type = typename std::iterator_traits<OutputIt>::value_type;
     using result_type =
-        typename std::conditional<
-            std::is_void<output_type>::value, input_type, output_type
-        >::type;
+        typename std::conditional<std::is_void<output_type>::value, input_type, output_type>::type;
 
-    if (first == last) return d_first;
+    if(first == last)
+        return d_first;
 
     result_type sum = *first;
-    *d_first = sum;
+    *d_first        = sum;
 
-    while (++first != last) {
-       sum = op(sum, static_cast<result_type>(*first));
-       *++d_first = sum;
+    while(++first != last)
+    {
+        sum        = op(sum, static_cast<result_type>(*first));
+        *++d_first = sum;
     }
     return ++d_first;
 }
 
 template<class InputIt, class T, class OutputIt, class BinaryOperation>
-OutputIt host_exclusive_scan(InputIt first, InputIt last,
-                             T initial_value, OutputIt d_first,
-                             BinaryOperation op)
+OutputIt host_exclusive_scan(
+    InputIt first, InputIt last, T initial_value, OutputIt d_first, BinaryOperation op)
 {
-    using input_type = typename std::iterator_traits<InputIt>::value_type;
+    using input_type  = typename std::iterator_traits<InputIt>::value_type;
     using output_type = typename std::iterator_traits<OutputIt>::value_type;
     using result_type =
-        typename std::conditional<
-            std::is_void<output_type>::value, input_type, output_type
-        >::type;
+        typename std::conditional<std::is_void<output_type>::value, input_type, output_type>::type;
 
-    if (first == last) return d_first;
+    if(first == last)
+        return d_first;
 
     result_type sum = initial_value;
-    *d_first = initial_value;
+    *d_first        = initial_value;
 
-    while ((first+1) != last)
+    while((first + 1) != last)
     {
-       sum = op(sum, static_cast<result_type>(*first));
-       *++d_first = sum;
-       first++;
+        sum        = op(sum, static_cast<result_type>(*first));
+        *++d_first = sum;
+        first++;
     }
     return ++d_first;
 }
 
-template<class InputIt, class KeyIt, class T, class OutputIt, class BinaryOperation, class KeyCompare>
-OutputIt host_exclusive_scan_by_key(InputIt first, InputIt last, KeyIt k_first,
-                                    T initial_value, OutputIt d_first,
-                                    BinaryOperation op, KeyCompare key_compare_op)
+template<class InputIt,
+         class KeyIt,
+         class T,
+         class OutputIt,
+         class BinaryOperation,
+         class KeyCompare>
+OutputIt host_exclusive_scan_by_key(InputIt         first,
+                                    InputIt         last,
+                                    KeyIt           k_first,
+                                    T               initial_value,
+                                    OutputIt        d_first,
+                                    BinaryOperation op,
+                                    KeyCompare      key_compare_op)
 {
-    using input_type = typename std::iterator_traits<InputIt>::value_type;
+    using input_type  = typename std::iterator_traits<InputIt>::value_type;
     using output_type = typename std::iterator_traits<OutputIt>::value_type;
     using result_type =
-        typename std::conditional<
-            std::is_void<output_type>::value, input_type, output_type
-        >::type;
+        typename std::conditional<std::is_void<output_type>::value, input_type, output_type>::type;
 
-    if (first == last) return d_first;
+    if(first == last)
+        return d_first;
 
     result_type sum = initial_value;
-    *d_first = initial_value;
+    *d_first        = initial_value;
 
-    while ((first+1) != last)
+    while((first + 1) != last)
     {
         if(key_compare_op(*k_first, *++k_first))
         {
             sum = op(sum, static_cast<result_type>(*first));
-        }
-        else
+        } else
         {
             sum = initial_value;
         }
@@ -201,120 +204,106 @@ OutputIt host_exclusive_scan_by_key(InputIt first, InputIt last, KeyIt k_first,
 template<class T, class U = T>
 struct custom_type
 {
-    using first_type = T;
+    using first_type  = T;
     using second_type = U;
 
     T x;
     U y;
 
-    HIPCUB_HOST_DEVICE inline
-    constexpr custom_type() : x(T()), y(U()) {}
+    HIPCUB_HOST_DEVICE inline constexpr custom_type() : x(T()), y(U()) {}
 
-    HIPCUB_HOST_DEVICE inline
-    constexpr custom_type(T xx, U yy) : x(xx), y(yy)
-    {
-    }
+    HIPCUB_HOST_DEVICE inline constexpr custom_type(T xx, U yy) : x(xx), y(yy) {}
 
-    HIPCUB_HOST_DEVICE inline
-    constexpr custom_type(T xy) : x(xy), y(xy)
-    {
-    }
+    HIPCUB_HOST_DEVICE inline constexpr custom_type(T xy) : x(xy), y(xy) {}
 
     template<class V, class W = V>
-    HIPCUB_HOST_DEVICE inline
-    custom_type(const custom_type<V,W>& other) : x(other.x), y(other.y)
-    {
-    }
+    HIPCUB_HOST_DEVICE inline custom_type(const custom_type<V, W>& other) : x(other.x), y(other.y)
+    {}
 
-    #ifndef HIPCUB_CUB_API
-        HIPCUB_HOST_DEVICE inline
-        ~custom_type() = default;
-    #endif
+#ifndef HIPCUB_CUB_API
+    HIPCUB_HOST_DEVICE inline ~custom_type() = default;
+#endif
 
-    HIPCUB_HOST_DEVICE inline
-    custom_type& operator=(const custom_type& other)
+    HIPCUB_HOST_DEVICE inline custom_type& operator=(const custom_type& other)
     {
         x = other.x;
         y = other.y;
         return *this;
     }
 
-    HIPCUB_HOST_DEVICE inline
-    custom_type operator+(const custom_type& rhs) const
+    HIPCUB_HOST_DEVICE inline custom_type operator+(const custom_type& rhs) const
     {
         return custom_type(x + rhs.x, y + rhs.y);
     }
 
-    HIPCUB_HOST_DEVICE inline
-    custom_type operator-(const custom_type& other) const
+    HIPCUB_HOST_DEVICE inline custom_type operator-(const custom_type& other) const
     {
         return custom_type(x - other.x, y - other.y);
     }
 
-    HIPCUB_HOST_DEVICE inline
-    bool operator<(const custom_type& rhs) const
+    HIPCUB_HOST_DEVICE inline bool operator<(const custom_type& rhs) const
     {
         // intentionally suboptimal choice for short-circuting,
         // required to generate more performant device code
         return ((x == rhs.x && y < rhs.y) || x < rhs.x);
     }
 
-    HIPCUB_HOST_DEVICE inline
-    bool operator>(const custom_type& other) const
+    HIPCUB_HOST_DEVICE inline bool operator>(const custom_type& other) const
     {
         return (x > other.x || (x == other.x && y > other.y));
     }
 
-    HIPCUB_HOST_DEVICE inline
-    bool operator==(const custom_type& rhs) const
+    HIPCUB_HOST_DEVICE inline bool operator==(const custom_type& rhs) const
     {
         return x == rhs.x && y == rhs.y;
     }
 
-    HIPCUB_HOST_DEVICE inline
-    bool operator!=(const custom_type& other) const
+    HIPCUB_HOST_DEVICE inline bool operator!=(const custom_type& other) const
     {
-       return !(*this == other);
+        return !(*this == other);
     }
 
     HIPCUB_HOST_DEVICE custom_type& operator+=(const custom_type& rhs)
     {
-       this->x += rhs.x;
-       this->y += rhs.y;
-       return *this;
+        this->x += rhs.x;
+        this->y += rhs.y;
+        return *this;
     }
 };
 
 template<typename>
-struct is_custom_type : std::false_type {};
+struct is_custom_type : std::false_type
+{};
 
 template<class T, class U>
-struct is_custom_type<custom_type<T,U>> : std::true_type {};
+struct is_custom_type<custom_type<T, U>> : std::true_type
+{};
 
 template<class CustomType>
 struct custom_type_decomposer
 {
     static_assert(is_custom_type<CustomType>::value,
-                  "custom_type_decomposer can only be used with instantiations of custom_type");
+                  "custom_type_decomposer can only be used with instantiations "
+                  "of custom_type");
 
     using T = typename CustomType::first_type;
     using U = typename CustomType::second_type;
 
     HIPCUB_HOST_DEVICE ::hipcub::tuple<T&, U&> operator()(CustomType& key) const
     {
-       return ::hipcub::tuple<T&, U&>{key.x, key.y};
+        return ::hipcub::tuple<T&, U&>{key.x, key.y};
     }
 };
 
 template<class T>
-inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = 1024 * 1024)
-    -> typename std::enable_if<is_custom_type<T>::value, std::vector<T>>::type
+inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = 1024 * 1024) ->
+    typename std::enable_if<is_custom_type<T>::value, std::vector<T>>::type
 {
-    using first_type = typename T::first_type;
+    using first_type  = typename T::first_type;
     using second_type = typename T::second_type;
     std::vector<T> data(size);
-    auto fdata = get_random_data<first_type>(size, min.x, max.x, max_random_size);
-    auto sdata = get_random_data<second_type>(size, min.y, max.y, max_random_size);
+    auto           fdata = get_random_data<first_type>(size, min.x, max.x, max_random_size);
+    auto           sdata = get_random_data<second_type>(size, min.y, max.y, max_random_size);
     for(size_t i = 0; i < size; i++)
     {
         data[i] = T(fdata[i], sdata[i]);
@@ -323,13 +312,15 @@ inline auto get_random_data(size_t size, T min, T max, size_t max_random_size =
 }
 
 template<class T>
-inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = 1024 * 1024)
-    -> typename std::enable_if<!is_custom_type<T>::value && !std::is_same<decltype(max.x), void>::value, std::vector<T>>::type
+inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = 1024 * 1024) ->
+    typename std::enable_if<!is_custom_type<T>::value
+                                && !std::is_same<decltype(max.x), void>::value,
+                            std::vector<T>>::type
 {
 
     using field_type = decltype(max.x);
     std::vector<T> data(size);
-    auto field_data = get_random_data<field_type>(size, min.x, max.x, max_random_size);
+    auto           field_data = get_random_data<field_type>(size, min.x, max.x, max_random_size);
     for(size_t i = 0; i < size; i++)
     {
         data[i] = T(field_data[i]);
@@ -338,33 +329,28 @@ inline auto get_random_data(size_t size, T min, T max, size_t max_random_size =
 }
 
 template<typename T>
-std::vector<T> get_random_segments(const size_t size,
-                                   const size_t max_segment_length,
-                                   const int seed_value)
+std::vector<T>
+    get_random_segments(const size_t size, const size_t max_segment_length, const int seed_value)
 {
     static_assert(std::is_arithmetic<T>::value, "Key type must be arithmetic");
 
-    std::default_random_engine prng(seed_value);
+    std::default_random_engine            prng(seed_value);
     std::uniform_int_distribution<size_t> segment_length_distribution(max_segment_length);
-    using key_distribution_type = std::conditional_t<
-        std::is_integral<T>::value,
-        std::uniform_int_distribution<T>,
-        std::uniform_real_distribution<T>
-    >;
+    using key_distribution_type = std::conditional_t<std::is_integral<T>::value,
+                                                     std::uniform_int_distribution<T>,
+                                                     std::uniform_real_distribution<T>>;
     key_distribution_type key_distribution(std::numeric_limits<T>::max());
-    std::vector<T> keys(size);
+    std::vector<T>        keys(size);
 
     size_t keys_start_index = 0;
-    while (keys_start_index < size)
+    while(keys_start_index < size)
     {
         const size_t new_segment_length = segment_length_distribution(prng);
-        const size_t new_segment_end = std::min(size, keys_start_index + new_segment_length);
-        const T key = key_distribution(prng);
-        std::fill(
-            std::next(keys.begin(), keys_start_index),
-            std::next(keys.begin(), new_segment_end),
-            key
-        );
+        const size_t new_segment_end    = std::min(size, keys_start_index + new_segment_length);
+        const T      key                = key_distribution(prng);
+        std::fill(std::next(keys.begin(), keys_start_index),
+                  std::next(keys.begin(), new_segment_end),
+                  key);
         keys_start_index += new_segment_length;
     }
     return keys;
@@ -437,54 +423,54 @@ inline constexpr auto ceiling_div(const T a, const U b)
     return a / b + (a % b > 0 ? 1 : 0);
 }
 
-} // end benchmark_util namespace
+} // namespace benchmark_utils
 
 // Need for hipcub::DeviceReduce::Min/Max etc.
 namespace std
 {
-    template<>
-    class numeric_limits<benchmark_utils::custom_type<int>>
-    {
-        using T = typename benchmark_utils::custom_type<int>;
+template<>
+class numeric_limits<benchmark_utils::custom_type<int>>
+{
+    using T = typename benchmark_utils::custom_type<int>;
 
-        public:
-            static constexpr inline T min()
-            {
+public:
+    static constexpr inline T min()
+    {
         return std::numeric_limits<typename T::first_type>::min();
-            }
-
-        static constexpr inline T max()
-        {
-            return std::numeric_limits<typename T::first_type>::max();
-        }
+    }
 
-        static constexpr inline T lowest()
-        {
-            return std::numeric_limits<typename T::first_type>::lowest();
-        }
-    };
+    static constexpr inline T max()
+    {
+        return std::numeric_limits<typename T::first_type>::max();
+    }
 
-    template<>
-    class numeric_limits<benchmark_utils::custom_type<float>>
+    static constexpr inline T lowest()
     {
-        using T = typename benchmark_utils::custom_type<float>;
+        return std::numeric_limits<typename T::first_type>::lowest();
+    }
+};
 
-        public:
-            static constexpr inline T min()
-            {
-            return std::numeric_limits<typename T::first_type>::min();
-            }
+template<>
+class numeric_limits<benchmark_utils::custom_type<float>>
+{
+    using T = typename benchmark_utils::custom_type<float>;
 
-        static constexpr inline T max()
-        {
-            return std::numeric_limits<typename T::first_type>::max();
-        }
+public:
+    static constexpr inline T min()
+    {
+        return std::numeric_limits<typename T::first_type>::min();
+    }
 
-        static constexpr inline T lowest()
-        {
-            return std::numeric_limits<typename T::first_type>::lowest();
-        }
-    };
-}
+    static constexpr inline T max()
+    {
+        return std::numeric_limits<typename T::first_type>::max();
+    }
+
+    static constexpr inline T lowest()
+    {
+        return std::numeric_limits<typename T::first_type>::lowest();
+    }
+};
+} // namespace std
 
 #endif // HIPCUB_BENCHMARK_UTILS_HPP_
diff --git a/benchmark/benchmark_warp_exchange.cpp b/benchmark/benchmark_warp_exchange.cpp
index be5ce636..598df954 100644
--- a/benchmark/benchmark_warp_exchange.cpp
+++ b/benchmark/benchmark_warp_exchange.cpp
@@ -9,8 +9,8 @@
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -41,8 +41,8 @@ __device__ auto warp_exchange_benchmark(T* d_output)
     -> std::enable_if_t<benchmark_utils::device_test_enabled_for_warp_size_v<LogicalWarpSize>>
 {
     T thread_data[ItemsPerThread];
-    #pragma unroll
-    for (unsigned i = 0; i < ItemsPerThread; ++i)
+#pragma unroll
+    for(unsigned i = 0; i < ItemsPerThread; ++i)
     {
         thread_data[i] = static_cast<T>(i);
     }
@@ -52,18 +52,18 @@ __device__ auto warp_exchange_benchmark(T* d_output)
                                                  LogicalWarpSize,
                                                  1, // ARCH
                                                  Algorithm>;
-    constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize;
+    constexpr unsigned                             warps_in_block = BlockSize / LogicalWarpSize;
     __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_in_block];
     const unsigned                                 warp_id = threadIdx.x / LogicalWarpSize;
 
     WarpExchangeT warp_exchange(temp_storage[warp_id]);
     Op{}(warp_exchange, thread_data);
 
-    #pragma unroll
-    for (unsigned i = 0; i < ItemsPerThread; ++i)
+#pragma unroll
+    for(unsigned i = 0; i < ItemsPerThread; ++i)
     {
         const unsigned global_idx = (BlockSize * blockIdx.x + threadIdx.x) * ItemsPerThread + i;
-        d_output[global_idx] = thread_data[i];
+        d_output[global_idx]      = thread_data[i];
     }
 }
 
@@ -97,23 +97,23 @@ __device__ auto warp_exchange_scatter_to_striped_benchmark(T* d_output)
     -> std::enable_if_t<benchmark_utils::device_test_enabled_for_warp_size_v<LogicalWarpSize>>
 {
     const unsigned warp_id = threadIdx.x / LogicalWarpSize;
-    T thread_data[ItemsPerThread];
-    OffsetT thread_ranks[ItemsPerThread];
-    #pragma unroll
-    for (unsigned i = 0; i < ItemsPerThread; ++i)
+    T              thread_data[ItemsPerThread];
+    OffsetT        thread_ranks[ItemsPerThread];
+#pragma unroll
+    for(unsigned i = 0; i < ItemsPerThread; ++i)
     {
-        thread_data[i] = static_cast<T>(i);
+        thread_data[i]  = static_cast<T>(i);
         thread_ranks[i] = static_cast<OffsetT>(LogicalWarpSize - warp_id * ItemsPerThread - i - 1);
     }
 
     using WarpExchangeT = ::hipcub::WarpExchange<T, ItemsPerThread, LogicalWarpSize>;
-    constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize;
+    constexpr unsigned                             warps_in_block = BlockSize / LogicalWarpSize;
     __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_in_block];
 
     WarpExchangeT(temp_storage[warp_id]).ScatterToStriped(thread_data, thread_ranks);
 
-    #pragma unroll
-    for (unsigned i = 0; i < ItemsPerThread; ++i)
+#pragma unroll
+    for(unsigned i = 0; i < ItemsPerThread; ++i)
     {
         const unsigned striped_global_idx
             = BlockSize * ItemsPerThread * blockIdx.x + BlockSize * i + threadIdx.x;
@@ -149,18 +149,18 @@ template<class T,
          class Op>
 void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
 {
-    constexpr unsigned trials = 100;
+    constexpr unsigned trials          = 100;
     constexpr unsigned items_per_block = BlockSize * ItemsPerThread;
-    const unsigned size = items_per_block * ((N + items_per_block - 1) / items_per_block);
+    const unsigned     size = items_per_block * ((N + items_per_block - 1) / items_per_block);
 
-    T * d_output;
+    T* d_output;
     HIP_CHECK(hipMalloc(&d_output, size * sizeof(T)));
 
-    for (auto _ : state)
+    for(auto _ : state)
     {
         auto start = std::chrono::high_resolution_clock::now();
 
-        for (size_t i = 0; i < trials; ++i)
+        for(size_t i = 0; i < trials; ++i)
         {
             warp_exchange_kernel<BlockSize, ItemsPerThread, LogicalWarpSize, Algorithm, Op>
                 <<<dim3(size / items_per_block), dim3(BlockSize), 0, stream>>>(d_output);
@@ -169,8 +169,8 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
         HIP_CHECK(hipPeekAtLastError())
         HIP_CHECK(hipDeviceSynchronize());
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
         state.SetIterationTime(elapsed_seconds.count());
     }
     state.SetBytesProcessed(state.iterations() * trials * size * sizeof(T));
@@ -179,27 +179,25 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
     HIP_CHECK(hipFree(d_output));
 }
 
-template<
-    class T,
-    class OffsetT,
-    unsigned BlockSize,
-    unsigned ItemsPerThread,
-    unsigned LogicalWarpSize
->
+template<class T,
+         class OffsetT,
+         unsigned BlockSize,
+         unsigned ItemsPerThread,
+         unsigned LogicalWarpSize>
 void run_benchmark_scatter_to_striped(benchmark::State& state, hipStream_t stream, size_t N)
 {
-    constexpr unsigned trials = 100;
+    constexpr unsigned trials          = 100;
     constexpr unsigned items_per_block = BlockSize * ItemsPerThread;
-    const unsigned size = items_per_block * ((N + items_per_block - 1) / items_per_block);
+    const unsigned     size = items_per_block * ((N + items_per_block - 1) / items_per_block);
 
-    T * d_output;
+    T* d_output;
     HIP_CHECK(hipMalloc(&d_output, size * sizeof(T)));
 
-    for (auto _ : state)
+    for(auto _ : state)
     {
         auto start = std::chrono::high_resolution_clock::now();
 
-        for (size_t i = 0; i < trials; ++i)
+        for(size_t i = 0; i < trials; ++i)
         {
             warp_exchange_scatter_to_striped_kernel<OffsetT,
                                                     BlockSize,
@@ -211,8 +209,8 @@ void run_benchmark_scatter_to_striped(benchmark::State& state, hipStream_t strea
         HIP_CHECK(hipPeekAtLastError())
         HIP_CHECK(hipDeviceSynchronize());
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
         state.SetIterationTime(elapsed_seconds.count());
     }
     state.SetBytesProcessed(state.iterations() * trials * size * sizeof(T));
@@ -239,30 +237,34 @@ struct BlockedToStripedOp
     }
 };
 
-#define CREATE_BENCHMARK_STRIPED_TO_BLOCKED(T, BS, IT, WS, ALG)                                    \
-    benchmark::RegisterBenchmark("warp_exchange_striped_to_blocked<Datatype:" #T                   \
-                                 ",Block Size:" #BS ",Items Per Thread:" #IT ",Warp Size:" #WS     \
-                                 ",Algorithm:" #ALG ">.",                                          \
-                                 &run_benchmark<T, BS, IT, WS, ::hipcub::ALG, StripedToBlockedOp>, \
-                                 stream,                                                           \
+#define CREATE_BENCHMARK_STRIPED_TO_BLOCKED(T, BS, IT, WS, ALG)                                  \
+    benchmark::RegisterBenchmark(std::string("warp_exchange_striped_to_blocked<data_type:" #T    \
+                                             ",block_size:" #BS ",items_per_thread:" #IT         \
+                                             ",warp_size:" #WS ",sub_algorithm_name:" #ALG ">.") \
+                                     .c_str(),                                                   \
+                                 &run_benchmark<T, BS, IT, WS, ALG, StripedToBlockedOp>,         \
+                                 stream,                                                         \
                                  size)
 
-#define CREATE_BENCHMARK_BLOCKED_TO_STRIPED(T, BS, IT, WS, ALG)                                    \
-    benchmark::RegisterBenchmark("warp_exchange_blocked_to_striped<Datatype:" #T                   \
-                                 ",Block Size:" #BS ",Items Per Thread:" #IT ",Warp Size:" #WS     \
-                                 ",Algorithm:" #ALG ">.",                                          \
-                                 &run_benchmark<T, BS, IT, WS, ::hipcub::ALG, BlockedToStripedOp>, \
-                                 stream,                                                           \
+#define CREATE_BENCHMARK_BLOCKED_TO_STRIPED(T, BS, IT, WS, ALG)                                  \
+    benchmark::RegisterBenchmark(std::string("warp_exchange_blocked_to_striped<data_type:" #T    \
+                                             ",block_size:" #BS ",items_per_thread:" #IT         \
+                                             ",warp_size:" #WS ",sub_algorithm_name:" #ALG ">.") \
+                                     .c_str(),                                                   \
+                                 &run_benchmark<T, BS, IT, WS, ALG, BlockedToStripedOp>,         \
+                                 stream,                                                         \
                                  size)
 
-#define CREATE_BENCHMARK_SCATTER_TO_STRIPED(T, OFFSET_T, BS, IT, WS) \
-benchmark::RegisterBenchmark( \
-    "warp_exchange_scatter_to_striped<Datatype:" #T ",Offset Type:" #OFFSET_T ",Block Size:" #BS ",Items Per Thread:" #IT ",Warp Size:" #WS ">.", \
-    &run_benchmark_scatter_to_striped<T, OFFSET_T, BS, IT, WS>, \
-    stream, size \
-)
+#define CREATE_BENCHMARK_SCATTER_TO_STRIPED(T, OFFSET_T, BS, IT, WS)                          \
+    benchmark::RegisterBenchmark(std::string("warp_exchange_scatter_to_striped<data_type:" #T \
+                                             ",offset_type:" #OFFSET_T ",block_size:" #BS     \
+                                             ",items_per_thread:" #IT ",warp_size:" #WS ">.") \
+                                     .c_str(),                                                \
+                                 &run_benchmark_scatter_to_striped<T, OFFSET_T, BS, IT, WS>,  \
+                                 stream,                                                      \
+                                 size)
 
-int main(int argc, char *argv[])
+int main(int argc, char* argv[])
 {
     cli::Parser parser(argc, argv);
     parser.set_optional<size_t>("size", "size", DEFAULT_N, "number of values");
@@ -271,81 +273,79 @@ int main(int argc, char *argv[])
 
     // Parse argv
     benchmark::Initialize(&argc, argv);
-    const size_t size = parser.get<size_t>("size");
-    const int trials = parser.get<int>("trials");
+    const size_t size   = parser.get<size_t>("size");
+    const int    trials = parser.get<int>("trials");
 
     std::cout << "benchmark_warp_exchange" << std::endl;
 
     // HIP
-    hipStream_t stream = 0; // default
+    hipStream_t     stream = 0; // default
     hipDeviceProp_t devProp;
-    int device_id = 0;
+    int             device_id = 0;
     HIP_CHECK(hipGetDevice(&device_id));
     HIP_CHECK(hipGetDeviceProperties(&devProp, device_id));
     std::cout << "[HIP] Device name: " << devProp.name << std::endl;
 
     // Add benchmarks
     std::vector<benchmark::internal::Benchmark*> benchmarks{
-        CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 16, WARP_EXCHANGE_SMEM),
-        CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 16, WARP_EXCHANGE_SMEM),
-        CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 16, 16, WARP_EXCHANGE_SMEM),
-        CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 16, 16, WARP_EXCHANGE_SMEM),
-        CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 32, WARP_EXCHANGE_SMEM),
-        CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 32, WARP_EXCHANGE_SMEM),
-        CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 32, WARP_EXCHANGE_SMEM),
-        CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 32, WARP_EXCHANGE_SMEM),
+        CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 16, ::hipcub::WARP_EXCHANGE_SMEM),
+        CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 16, ::hipcub::WARP_EXCHANGE_SMEM),
+        CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 16, 16, ::hipcub::WARP_EXCHANGE_SMEM),
+        CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 16, 16, ::hipcub::WARP_EXCHANGE_SMEM),
+        CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 32, ::hipcub::WARP_EXCHANGE_SMEM),
+        CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 32, ::hipcub::WARP_EXCHANGE_SMEM),
+        CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 32, ::hipcub::WARP_EXCHANGE_SMEM),
+        CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 32, ::hipcub::WARP_EXCHANGE_SMEM),
         CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 128, 4, 16),
         CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 128, 4, 32),
         CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 256, 4, 32),
 
-        CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 16, 16, WARP_EXCHANGE_SHUFFLE),
-        CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 16, 16, WARP_EXCHANGE_SHUFFLE),
+        CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 16, 16, ::hipcub::WARP_EXCHANGE_SHUFFLE),
+        CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 16, 16, ::hipcub::WARP_EXCHANGE_SHUFFLE),
 
 // CUB requires WS == IPT for WARP_EXCHANGE_SHUFFLE
 #ifdef HIPCUB_ROCPRIM_API
-        CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 16, WARP_EXCHANGE_SHUFFLE),
-        CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 16, WARP_EXCHANGE_SHUFFLE),
-        CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 32, WARP_EXCHANGE_SHUFFLE),
-        CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 32, WARP_EXCHANGE_SHUFFLE),
-        CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 32, WARP_EXCHANGE_SHUFFLE),
-        CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 32, WARP_EXCHANGE_SHUFFLE),
+        CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 16, ::hipcub::WARP_EXCHANGE_SHUFFLE),
+        CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 16, ::hipcub::WARP_EXCHANGE_SHUFFLE),
+        CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 32, ::hipcub::WARP_EXCHANGE_SHUFFLE),
+        CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 32, ::hipcub::WARP_EXCHANGE_SHUFFLE),
+        CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 32, ::hipcub::WARP_EXCHANGE_SHUFFLE),
+        CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 32, ::hipcub::WARP_EXCHANGE_SHUFFLE),
 #endif
     };
 
 #ifdef HIPCUB_ROCPRIM_API
-    if (::benchmark_utils::is_warp_size_supported(64))
+    if(::benchmark_utils::is_warp_size_supported(64))
     {
         std::vector<benchmark::internal::Benchmark*> additional_benchmarks{
-            CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 64, WARP_EXCHANGE_SMEM),
-            CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 64, WARP_EXCHANGE_SHUFFLE),
-            CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 64, WARP_EXCHANGE_SMEM),
-            CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 64, WARP_EXCHANGE_SHUFFLE),
+            CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 64, ::hipcub::WARP_EXCHANGE_SMEM),
+            CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 64, ::hipcub::WARP_EXCHANGE_SHUFFLE),
+            CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 64, ::hipcub::WARP_EXCHANGE_SMEM),
+            CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 64, ::hipcub::WARP_EXCHANGE_SHUFFLE),
             CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 128, 4, 64),
 
-            CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 64, WARP_EXCHANGE_SMEM),
-            CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 64, WARP_EXCHANGE_SHUFFLE),
-            CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 64, WARP_EXCHANGE_SMEM),
-            CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 64, WARP_EXCHANGE_SHUFFLE),
+            CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 64, ::hipcub::WARP_EXCHANGE_SMEM),
+            CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 64, ::hipcub::WARP_EXCHANGE_SHUFFLE),
+            CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 64, ::hipcub::WARP_EXCHANGE_SMEM),
+            CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 64, ::hipcub::WARP_EXCHANGE_SHUFFLE),
             CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 256, 4, 64)};
-        benchmarks.insert(
-            benchmarks.end(),
-            additional_benchmarks.begin(),
-            additional_benchmarks.end()
-        );
+        benchmarks.insert(benchmarks.end(),
+                          additional_benchmarks.begin(),
+                          additional_benchmarks.end());
     }
 #endif
 
     // Use manual timing
-    for (auto& b : benchmarks)
+    for(auto& b : benchmarks)
     {
         b->UseManualTime();
         b->Unit(benchmark::kMillisecond);
     }
 
     // Force number of iterations
-    if (trials > 0)
+    if(trials > 0)
     {
-        for (auto& b : benchmarks)
+        for(auto& b : benchmarks)
         {
             b->Iterations(trials);
         }
diff --git a/benchmark/benchmark_warp_load.cpp b/benchmark/benchmark_warp_load.cpp
index 50bc0a19..4298db66 100644
--- a/benchmark/benchmark_warp_load.cpp
+++ b/benchmark/benchmark_warp_load.cpp
@@ -9,8 +9,8 @@
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -41,17 +41,17 @@ __device__ auto warp_load_benchmark(T* d_input, T* d_output)
 {
     using WarpLoadT = ::hipcub::WarpLoad<T, ItemsPerThread, Algorithm, LogicalWarpSize>;
     constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize;
-    constexpr int tile_size = ItemsPerThread * LogicalWarpSize;
+    constexpr int      tile_size      = ItemsPerThread * LogicalWarpSize;
 
     const unsigned warp_id        = threadIdx.x / LogicalWarpSize;
     const unsigned global_warp_id = blockIdx.x * warps_in_block + warp_id;
     __shared__ typename WarpLoadT::TempStorage temp_storage[warps_in_block];
-    T thread_data[ItemsPerThread];
+    T                                          thread_data[ItemsPerThread];
 
     WarpLoadT(temp_storage[warp_id]).Load(d_input + global_warp_id * tile_size, thread_data);
 
-    #pragma unroll
-    for (unsigned i = 0; i < ItemsPerThread; ++i)
+#pragma unroll
+    for(unsigned i = 0; i < ItemsPerThread; ++i)
     {
         const unsigned striped_global_idx
             = BlockSize * ItemsPerThread * blockIdx.x + BlockSize * i + threadIdx.x;
@@ -78,37 +78,29 @@ __global__ __launch_bounds__(BlockSize) void warp_load_kernel(T* d_input, T* d_o
     warp_load_benchmark<BlockSize, ItemsPerThread, LogicalWarpSize, Algorithm>(d_input, d_output);
 }
 
-template<
-    class T,
-    unsigned BlockSize,
-    unsigned ItemsPerThread,
-    unsigned LogicalWarpSize,
-    ::hipcub::WarpLoadAlgorithm Algorithm,
-    unsigned Trials = 100
->
+template<class T,
+         unsigned                    BlockSize,
+         unsigned                    ItemsPerThread,
+         unsigned                    LogicalWarpSize,
+         ::hipcub::WarpLoadAlgorithm Algorithm,
+         unsigned                    Trials = 100>
 void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
 {
     constexpr unsigned items_per_block = BlockSize * ItemsPerThread;
-    const unsigned size = items_per_block * ((N + items_per_block - 1) / items_per_block);
+    const unsigned     size = items_per_block * ((N + items_per_block - 1) / items_per_block);
 
     std::vector<T> input = benchmark_utils::get_random_data<T>(size, T(0), T(10));
-    T * d_input;
-    T * d_output;
+    T*             d_input;
+    T*             d_output;
     HIP_CHECK(hipMalloc(&d_input, size * sizeof(T)));
     HIP_CHECK(hipMalloc(&d_output, size * sizeof(T)));
-    HIP_CHECK(
-        hipMemcpy(
-            d_input, input.data(),
-            size * sizeof(T),
-            hipMemcpyHostToDevice
-        )
-    );
+    HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice));
 
-    for (auto _ : state)
+    for(auto _ : state)
     {
         auto start = std::chrono::high_resolution_clock::now();
 
-        for (size_t i = 0; i < Trials; i++)
+        for(size_t i = 0; i < Trials; i++)
         {
             warp_load_kernel<BlockSize, ItemsPerThread, LogicalWarpSize, Algorithm>
                 <<<dim3(size / items_per_block), dim3(BlockSize), 0, stream>>>(d_input, d_output);
@@ -116,8 +108,8 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
         HIP_CHECK(hipPeekAtLastError())
         HIP_CHECK(hipDeviceSynchronize());
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
         state.SetIterationTime(elapsed_seconds.count());
     }
     state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T));
@@ -127,14 +119,16 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
     HIP_CHECK(hipFree(d_output));
 }
 
-#define CREATE_BENCHMARK(T, BS, IT, WS, ALG) \
-benchmark::RegisterBenchmark( \
-    "warp_load<Datatype:" #T ",Block Size:" #BS ",Items Per Thread:" #IT ",Warp Size:" #WS ",Warp Load Algorithm:" #ALG ">.", \
-    &run_benchmark<T, BS, IT, WS, ALG>, \
-    stream, size \
-)
+#define CREATE_BENCHMARK(T, BS, IT, WS, ALG)                                              \
+    benchmark::RegisterBenchmark(std::string("warp_load<data_type:" #T ",block_size:" #BS \
+                                             ",items_per_thread:" #IT ",warp_size:" #WS   \
+                                             ",sub_algorithm_name:" #ALG ">.")            \
+                                     .c_str(),                                            \
+                                 &run_benchmark<T, BS, IT, WS, ALG>,                      \
+                                 stream,                                                  \
+                                 size)
 
-int main(int argc, char *argv[])
+int main(int argc, char* argv[])
 {
     cli::Parser parser(argc, argv);
     parser.set_optional<size_t>("size", "size", DEFAULT_N, "number of values");
@@ -143,15 +137,17 @@ int main(int argc, char *argv[])
 
     // Parse argv
     benchmark::Initialize(&argc, argv);
-    const size_t size = parser.get<size_t>("size");
-    const int trials = parser.get<int>("trials");
+    const size_t size   = parser.get<size_t>("size");
+    const int    trials = parser.get<int>("trials");
 
     // HIP
-    hipStream_t stream = 0; // default
+    hipStream_t     stream = 0; // default
     hipDeviceProp_t devProp;
-    int device_id = 0;
+    int             device_id = 0;
     HIP_CHECK(hipGetDevice(&device_id));
     HIP_CHECK(hipGetDeviceProperties(&devProp, device_id));
+
+    std::cout << "benchmark_warp_load" << std::endl;
     std::cout << "[HIP] Device name: " << devProp.name << std::endl;
 
     // Add benchmarks
@@ -199,7 +195,7 @@ int main(int argc, char *argv[])
         // CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_LOAD_TRANSPOSE)
     };
 
-    if (::benchmark_utils::is_warp_size_supported(64))
+    if(::benchmark_utils::is_warp_size_supported(64))
     {
         std::vector<benchmark::internal::Benchmark*> additional_benchmarks{
             CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_LOAD_DIRECT),
@@ -245,24 +241,22 @@ int main(int argc, char *argv[])
             // WARP_LOAD_TRANSPOSE removed because of shared memory limit
             // CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_LOAD_TRANSPOSE)
         };
-        benchmarks.insert(
-            benchmarks.end(),
-            additional_benchmarks.begin(),
-            additional_benchmarks.end()
-        );
+        benchmarks.insert(benchmarks.end(),
+                          additional_benchmarks.begin(),
+                          additional_benchmarks.end());
     }
 
     // Use manual timing
-    for (auto& b : benchmarks)
+    for(auto& b : benchmarks)
     {
         b->UseManualTime();
         b->Unit(benchmark::kMillisecond);
     }
 
     // Force number of iterations
-    if (trials > 0)
+    if(trials > 0)
     {
-        for (auto& b : benchmarks)
+        for(auto& b : benchmarks)
         {
             b->Iterations(trials);
         }
diff --git a/benchmark/benchmark_warp_merge_sort.cpp b/benchmark/benchmark_warp_merge_sort.cpp
index 9a0e4fd8..e31f68eb 100644
--- a/benchmark/benchmark_warp_merge_sort.cpp
+++ b/benchmark/benchmark_warp_merge_sort.cpp
@@ -9,8 +9,8 @@
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -53,7 +53,7 @@ __device__ auto sort_keys_benchmark(const T* input, T* output, Compare compare_o
 
     const unsigned int flat_tid     = threadIdx.x;
     const unsigned int block_offset = blockIdx.x * items_per_block;
-    T keys[ItemsPerThread];
+    T                  keys[ItemsPerThread];
     hipcub::LoadDirectBlocked(flat_tid, input + block_offset, keys);
 
     constexpr unsigned int warps_per_block = BlockSize / LogicalWarpSize;
@@ -100,8 +100,8 @@ __device__ auto sort_pairs_benchmark(const T* input, T* output, Compare compare_
 
     const unsigned int flat_tid     = threadIdx.x;
     const unsigned int block_offset = blockIdx.x * items_per_block;
-    T keys[ItemsPerThread];
-    T values[ItemsPerThread];
+    T                  keys[ItemsPerThread];
+    T                  values[ItemsPerThread];
     hipcub::LoadDirectBlocked(flat_tid, input + block_offset, keys);
 
     for(unsigned int i = 0; i < ItemsPerThread; ++i)
@@ -146,8 +146,9 @@ __global__
     sort_pairs_benchmark<BlockSize, LogicalWarpSize, ItemsPerThread>(input, output, compare_op);
 }
 
-template <typename T>
-struct max_value {
+template<typename T>
+struct max_value
+{
     static constexpr T value = std::numeric_limits<T>::max();
 };
 
@@ -162,20 +163,20 @@ __device__ auto sort_keys_segmented_benchmark(const T*            input,
                                               Compare             compare)
     -> std::enable_if_t<benchmark_utils::device_test_enabled_for_warp_size_v<LogicalWarpSize>>
 {
-    constexpr unsigned int max_segment_size = LogicalWarpSize * ItemsPerThread;
+    constexpr unsigned int max_segment_size   = LogicalWarpSize * ItemsPerThread;
     constexpr unsigned int segments_per_block = BlockSize / LogicalWarpSize;
 
     using warp_merge_sort = hipcub::WarpMergeSort<T, ItemsPerThread, LogicalWarpSize>;
     __shared__ typename warp_merge_sort::TempStorage storage[segments_per_block];
 
     const unsigned int warp_id = threadIdx.x / LogicalWarpSize;
-    warp_merge_sort wsort{storage[warp_id]};
+    warp_merge_sort    wsort{storage[warp_id]};
 
     const unsigned int segment_id = blockIdx.x * segments_per_block + warp_id;
 
     const unsigned int segment_size = segment_sizes[segment_id];
-    const unsigned int warp_offset = segment_id * max_segment_size;
-    T keys[ItemsPerThread];
+    const unsigned int warp_offset  = segment_id * max_segment_size;
+    T                  keys[ItemsPerThread];
 
     const unsigned int flat_tid = wsort.get_linear_tid();
     hipcub::LoadDirectBlocked(flat_tid, input + warp_offset, keys, segment_size);
@@ -225,27 +226,29 @@ __device__ auto sort_pairs_segmented_benchmark(const T*            input,
                                                Compare             compare)
     -> std::enable_if_t<benchmark_utils::device_test_enabled_for_warp_size_v<LogicalWarpSize>>
 {
-    constexpr unsigned int max_segment_size = LogicalWarpSize * ItemsPerThread;
+    constexpr unsigned int max_segment_size   = LogicalWarpSize * ItemsPerThread;
     constexpr unsigned int segments_per_block = BlockSize / LogicalWarpSize;
 
     using warp_merge_sort = hipcub::WarpMergeSort<T, ItemsPerThread, LogicalWarpSize, T>;
     __shared__ typename warp_merge_sort::TempStorage storage[segments_per_block];
 
     const unsigned int warp_id = threadIdx.x / LogicalWarpSize;
-    warp_merge_sort wsort{storage[warp_id]};
+    warp_merge_sort    wsort{storage[warp_id]};
 
     const unsigned int segment_id = blockIdx.x * segments_per_block + warp_id;
 
     const unsigned int segment_size = segment_sizes[segment_id];
-    const unsigned int warp_offset = segment_id * max_segment_size;
-    T keys[ItemsPerThread];
-    T values[ItemsPerThread];
+    const unsigned int warp_offset  = segment_id * max_segment_size;
+    T                  keys[ItemsPerThread];
+    T                  values[ItemsPerThread];
 
     const unsigned int flat_tid = wsort.get_linear_tid();
     hipcub::LoadDirectBlocked(flat_tid, input + warp_offset, keys, segment_size);
 
-    for(unsigned int i = 0; i < ItemsPerThread; ++i) {
-        if(flat_tid * ItemsPerThread + i < segment_size) {
+    for(unsigned int i = 0; i < ItemsPerThread; ++i)
+    {
+        if(flat_tid * ItemsPerThread + i < segment_size)
+        {
             values[i] = keys[i] + T(1);
         }
     }
@@ -253,8 +256,10 @@ __device__ auto sort_pairs_segmented_benchmark(const T*            input,
     const T oob_default = max_value<T>::value;
     wsort.Sort(keys, values, compare, segment_size, oob_default);
 
-    for(unsigned int i = 0; i < ItemsPerThread; ++i) {
-        if(flat_tid * ItemsPerThread + i < segment_size) {
+    for(unsigned int i = 0; i < ItemsPerThread; ++i)
+    {
+        if(flat_tid * ItemsPerThread + i < segment_size)
+        {
             keys[i] += values[i];
         }
     }
@@ -290,38 +295,33 @@ __global__ __launch_bounds__(BlockSize) void sort_pairs_segmented(const T*
                                                                                compare);
 }
 
-template<
-    class T,
-    unsigned int BlockSize,
-    unsigned int LogicalWarpSize,
-    unsigned int ItemsPerThread,
-    class CompareOp = test_utils::less,
-    unsigned int Trials = 10
->
-void run_benchmark(benchmark::State& state, const benchmark_kinds benchmark_kind, const hipStream_t stream, const size_t N)
+template<class T,
+         unsigned int BlockSize,
+         unsigned int LogicalWarpSize,
+         unsigned int ItemsPerThread,
+         class CompareOp     = test_utils::less,
+         unsigned int Trials = 10>
+void run_benchmark(benchmark::State&     state,
+                   const benchmark_kinds benchmark_kind,
+                   const hipStream_t     stream,
+                   const size_t          N)
 {
     constexpr auto items_per_block = BlockSize * ItemsPerThread;
-    const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block);
+    const auto     size = items_per_block * ((N + items_per_block - 1) / items_per_block);
 
-    const auto input = std::is_floating_point<T>::value ?
-        benchmark_utils::get_random_data<T>(size, static_cast<T>(-1000), static_cast<T>(1000)) :
-        benchmark_utils::get_random_data<T>(
-            size,
-            std::numeric_limits<T>::min(),
-            std::numeric_limits<T>::max()
-        );
+    const auto input = std::is_floating_point<T>::value
+                           ? benchmark_utils::get_random_data<T>(size,
+                                                                 static_cast<T>(-1000),
+                                                                 static_cast<T>(1000))
+                           : benchmark_utils::get_random_data<T>(size,
+                                                                 std::numeric_limits<T>::min(),
+                                                                 std::numeric_limits<T>::max());
 
     T* d_input  = nullptr;
     T* d_output = nullptr;
     HIP_CHECK(hipMalloc(&d_input, size * sizeof(input[0])));
     HIP_CHECK(hipMalloc(&d_output, size * sizeof(input[0])));
-    HIP_CHECK(
-        hipMemcpy(
-            d_input, input.data(),
-            size * sizeof(T),
-            hipMemcpyHostToDevice
-        )
-    );
+    HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice));
 
     for(auto _ : state)
     {
@@ -329,16 +329,17 @@ void run_benchmark(benchmark::State& state, const benchmark_kinds benchmark_kind
 
         if(benchmark_kind == benchmark_kinds::sort_keys)
         {
-            for(unsigned int i = 0; i < Trials; ++i) {
+            for(unsigned int i = 0; i < Trials; ++i)
+            {
                 sort_keys<BlockSize, LogicalWarpSize, ItemsPerThread>
                     <<<dim3(size / items_per_block), dim3(BlockSize), 0, stream>>>(d_input,
                                                                                    d_output,
                                                                                    CompareOp{});
             }
-        }
-        else if(benchmark_kind == benchmark_kinds::sort_pairs)
+        } else if(benchmark_kind == benchmark_kinds::sort_pairs)
         {
-            for(unsigned int i = 0; i < Trials; ++i) {
+            for(unsigned int i = 0; i < Trials; ++i)
+            {
                 sort_pairs<BlockSize, LogicalWarpSize, ItemsPerThread>
                     <<<dim3(size / items_per_block), dim3(BlockSize), 0, stream>>>(d_input,
                                                                                    d_output,
@@ -349,8 +350,8 @@ void run_benchmark(benchmark::State& state, const benchmark_kinds benchmark_kind
         HIP_CHECK(hipDeviceSynchronize());
 
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
         state.SetIterationTime(elapsed_seconds.count());
     }
     state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T));
@@ -360,49 +361,45 @@ void run_benchmark(benchmark::State& state, const benchmark_kinds benchmark_kind
     HIP_CHECK(hipFree(d_output));
 }
 
-template<
-    class T,
-    unsigned int BlockSize,
-    unsigned int LogicalWarpSize,
-    unsigned int ItemsPerThread,
-    class CompareOp = test_utils::less,
-    unsigned int Trials = 10
->
-void run_segmented_benchmark(benchmark::State& state, const benchmark_kinds benchmark_kind, const hipStream_t stream, const size_t N)
+template<class T,
+         unsigned int BlockSize,
+         unsigned int LogicalWarpSize,
+         unsigned int ItemsPerThread,
+         class CompareOp     = test_utils::less,
+         unsigned int Trials = 10>
+void run_segmented_benchmark(benchmark::State&     state,
+                             const benchmark_kinds benchmark_kind,
+                             const hipStream_t     stream,
+                             const size_t          N)
 {
-    constexpr auto max_segment_size = LogicalWarpSize * ItemsPerThread;
+    constexpr auto max_segment_size   = LogicalWarpSize * ItemsPerThread;
     constexpr auto segments_per_block = BlockSize / LogicalWarpSize;
-    constexpr auto items_per_block = BlockSize * ItemsPerThread;
+    constexpr auto items_per_block    = BlockSize * ItemsPerThread;
 
-    const auto num_blocks = (N + items_per_block - 1) / items_per_block;
+    const auto num_blocks   = (N + items_per_block - 1) / items_per_block;
     const auto num_segments = num_blocks * segments_per_block;
-    const auto size = num_blocks * items_per_block;
+    const auto size         = num_blocks * items_per_block;
 
-    const auto input = std::is_floating_point<T>::value ?
-        benchmark_utils::get_random_data<T>(size, static_cast<T>(-1000), static_cast<T>(1000)) :
-        benchmark_utils::get_random_data<T>(
-            size,
-            std::numeric_limits<T>::min(),
-            std::numeric_limits<T>::max()
-        );
+    const auto input = std::is_floating_point<T>::value
+                           ? benchmark_utils::get_random_data<T>(size,
+                                                                 static_cast<T>(-1000),
+                                                                 static_cast<T>(1000))
+                           : benchmark_utils::get_random_data<T>(size,
+                                                                 std::numeric_limits<T>::min(),
+                                                                 std::numeric_limits<T>::max());
 
-    const auto segment_sizes = benchmark_utils::get_random_data<unsigned int>(
-        num_segments, 0, max_segment_size);
+    const auto segment_sizes
+        = benchmark_utils::get_random_data<unsigned int>(num_segments, 0, max_segment_size);
 
-    T* d_input  = nullptr;
-    T* d_output = nullptr;
+    T*            d_input         = nullptr;
+    T*            d_output        = nullptr;
     unsigned int* d_segment_sizes = nullptr;
     HIP_CHECK(hipMalloc(&d_input, size * sizeof(input[0])));
     HIP_CHECK(hipMalloc(&d_output, size * sizeof(input[0])));
     HIP_CHECK(hipMalloc(&d_segment_sizes, num_segments * sizeof(segment_sizes[0])));
-    HIP_CHECK(
-        hipMemcpy(
-            d_input, input.data(),
-            size * sizeof(T),
-            hipMemcpyHostToDevice
-        )
-    );
-    HIP_CHECK(hipMemcpy(d_segment_sizes, segment_sizes.data(),
+    HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice));
+    HIP_CHECK(hipMemcpy(d_segment_sizes,
+                        segment_sizes.data(),
                         num_segments * sizeof(segment_sizes[0]),
                         hipMemcpyHostToDevice));
 
@@ -420,8 +417,7 @@ void run_segmented_benchmark(benchmark::State& state, const benchmark_kinds benc
                                                                        d_segment_sizes,
                                                                        CompareOp{});
             }
-        }
-        else if(benchmark_kind == benchmark_kinds::sort_pairs)
+        } else if(benchmark_kind == benchmark_kinds::sort_pairs)
         {
             for(unsigned int i = 0; i < Trials; ++i)
             {
@@ -436,8 +432,8 @@ void run_segmented_benchmark(benchmark::State& state, const benchmark_kinds benc
         HIP_CHECK(hipDeviceSynchronize());
 
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
         state.SetIterationTime(elapsed_seconds.count());
     }
     state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T));
@@ -449,15 +445,18 @@ void run_segmented_benchmark(benchmark::State& state, const benchmark_kinds benc
 }
 
 #define CREATE_BENCHMARK(T, BS, WS, IPT)                                                           \
-do {                                                                                               \
-    const auto benchmark_name =                                                                    \
-        std::string{"warp_merge_sort<Datatype:" #T ",Block Size:" #BS ",Warp Size:" #WS ",Items Per Thread:" #IPT ">.SubAlgorithm Name:"} + name;                \
-    if(WS <= device_warp_size) {                                                                   \
-        benchmarks.push_back(benchmark::RegisterBenchmark(benchmark_name.c_str(),                  \
+    if(WS <= device_warp_size)                                                                     \
+    {                                                                                              \
+        benchmarks.push_back(benchmark::RegisterBenchmark(                                         \
+            std::string("warp_merge_sort<data_type:" #T ",block_size:" #BS ",warp_size:" #WS       \
+                        ",items_per_thread:" #IPT ">.sub_algorithm_name:"                          \
+                        + name)                                                                    \
+                .c_str(),                                                                          \
-            segmented ? &run_benchmark<T, BS, WS, IPT> : &run_segmented_benchmark<T, BS, WS, IPT>, \
+            segmented ? &run_segmented_benchmark<T, BS, WS, IPT> : &run_benchmark<T, BS, WS, IPT>, \
-            benchmark_kind, stream, size));                                                        \
-    }                                                                                              \
-} while(false)
+            benchmark_kind,                                                                        \
+            stream,                                                                                \
+            size));                                                                                \
+    }
 
 #define BENCHMARK_TYPE_WS(type, block, warp) \
     CREATE_BENCHMARK(type, block, warp, 1);  \
@@ -470,13 +469,13 @@ do {
     BENCHMARK_TYPE_WS(type, block, 32); \
     BENCHMARK_TYPE_WS(type, block, 64)
 
-void add_benchmarks(const benchmark_kinds benchmark_kind,
-                    const std::string& name,
+void add_benchmarks(const benchmark_kinds                         benchmark_kind,
+                    const std::string&                            name,
                     std::vector<benchmark::internal::Benchmark*>& benchmarks,
-                    const hipStream_t stream,
-                    const size_t size,
-                    const bool segmented,
-                    const unsigned int device_warp_size)
+                    const hipStream_t                             stream,
+                    const size_t                                  size,
+                    const bool                                    segmented,
+                    const unsigned int                            device_warp_size)
 {
     BENCHMARK_TYPE(int, 256);
     BENCHMARK_TYPE(int8_t, 256);
@@ -484,7 +483,7 @@ void add_benchmarks(const benchmark_kinds benchmark_kind,
     BENCHMARK_TYPE(long long, 256);
 }
 
-int main(int argc, char *argv[])
+int main(int argc, char* argv[])
 {
     cli::Parser parser(argc, argv);
     parser.set_optional<size_t>("size", "size", DEFAULT_N, "number of values");
@@ -493,24 +492,27 @@ int main(int argc, char *argv[])
 
     // Parse argv
     benchmark::Initialize(&argc, argv);
-    const size_t size = parser.get<size_t>("size");
-    const int trials = parser.get<int>("trials");
+    const size_t size   = parser.get<size_t>("size");
+    const int    trials = parser.get<int>("trials");
 
     std::cout << "benchmark_warp_merge_sort" << std::endl;
 
     // HIP
-    hipStream_t stream = 0; // default
+    hipStream_t     stream = 0; // default
     hipDeviceProp_t devProp;
-    int device_id = 0;
+    int             device_id = 0;
     HIP_CHECK(hipGetDevice(&device_id));
     HIP_CHECK(hipGetDeviceProperties(&devProp, device_id));
     std::cout << "[HIP] Device name: " << devProp.name << std::endl;
 
-    const auto device_warp_size = [] {
+    const auto device_warp_size = []
+    {
         const int result = HIPCUB_HOST_WARP_THREADS;
-        if(result > 0) {
+        if(result > 0)
+        {
             std::cout << "[HIP] Device warp size: " << result << std::endl;
-        } else {
+        } else
+        {
             std::cerr << "Failed to get device warp size! Aborting.\n";
             std::exit(1);
         }
@@ -519,14 +521,34 @@ int main(int argc, char *argv[])
 
     // Add benchmarks
     std::vector<benchmark::internal::Benchmark*> benchmarks;
-    add_benchmarks(benchmark_kinds::sort_keys, "sort(keys)", benchmarks, stream,
-                   size, false, device_warp_size);
-    add_benchmarks(benchmark_kinds::sort_pairs, "sort(keys, values)",
-                   benchmarks, stream, size, false, device_warp_size);
-    add_benchmarks(benchmark_kinds::sort_keys, "segmented_sort(keys)",
-                   benchmarks, stream, size, true, device_warp_size);
-    add_benchmarks(benchmark_kinds::sort_pairs, "segmented_sort(keys, values)",
-                   benchmarks, stream, size, true, device_warp_size);
+    add_benchmarks(benchmark_kinds::sort_keys,
+                   "sort(keys)",
+                   benchmarks,
+                   stream,
+                   size,
+                   false,
+                   device_warp_size);
+    add_benchmarks(benchmark_kinds::sort_pairs,
+                   "sort(keys, values)",
+                   benchmarks,
+                   stream,
+                   size,
+                   false,
+                   device_warp_size);
+    add_benchmarks(benchmark_kinds::sort_keys,
+                   "segmented_sort(keys)",
+                   benchmarks,
+                   stream,
+                   size,
+                   true,
+                   device_warp_size);
+    add_benchmarks(benchmark_kinds::sort_pairs,
+                   "segmented_sort(keys, values)",
+                   benchmarks,
+                   stream,
+                   size,
+                   true,
+                   device_warp_size);
 
     // Use manual timing
     for(auto& b : benchmarks)
diff --git a/benchmark/benchmark_warp_reduce.cpp b/benchmark/benchmark_warp_reduce.cpp
index 39716261..f72c268d 100644
--- a/benchmark/benchmark_warp_reduce.cpp
+++ b/benchmark/benchmark_warp_reduce.cpp
@@ -9,8 +9,8 @@
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -25,19 +25,12 @@
 // HIP API
 #include "hipcub/warp/warp_reduce.hpp"
 
-
 #ifndef DEFAULT_N
 const size_t DEFAULT_N = 1024 * 1024 * 32;
 #endif
 
-template<
-    class T,
-    unsigned int WarpSize,
-    unsigned int Trials
->
-__global__
-__launch_bounds__(64)
-void warp_reduce_kernel(const T * d_input, T * d_output)
+template<class T, unsigned int WarpSize, unsigned int Trials>
+__global__ __launch_bounds__(64) void warp_reduce_kernel(const T* d_input, T* d_output)
 {
     const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
 
@@ -45,8 +38,8 @@ void warp_reduce_kernel(const T * d_input, T * d_output)
 
     using wreduce_t = hipcub::WarpReduce<T, WarpSize>;
     __shared__ typename wreduce_t::TempStorage storage;
-    auto reduce_op = hipcub::Sum();
-    #pragma nounroll
+    auto                                       reduce_op = hipcub::Sum();
+#pragma nounroll
     for(unsigned int trial = 0; trial < Trials; trial++)
     {
         value = wreduce_t(storage).Reduce(value, reduce_op);
@@ -55,24 +48,19 @@ void warp_reduce_kernel(const T * d_input, T * d_output)
     d_output[i] = value;
 }
 
-template<
-    class T,
-    class Flag,
-    unsigned int WarpSize,
-    unsigned int Trials
->
-__global__
-__launch_bounds__(64)
-void segmented_warp_reduce_kernel(const T* d_input, Flag* d_flags, T* d_output)
+template<class T, class Flag, unsigned int WarpSize, unsigned int Trials>
+__global__ __launch_bounds__(64) void segmented_warp_reduce_kernel(const T* d_input,
+                                                                   Flag*    d_flags,
+                                                                   T*       d_output)
 {
     const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
 
     auto value = d_input[i];
-    auto flag = d_flags[i];
+    auto flag  = d_flags[i];
 
     using wreduce_t = hipcub::WarpReduce<T, WarpSize>;
     __shared__ typename wreduce_t::TempStorage storage;
-    #pragma nounroll
+#pragma nounroll
     for(unsigned int trial = 0; trial < Trials; trial++)
     {
         value = wreduce_t(storage).HeadSegmentedSum(value, flag);
@@ -81,96 +69,83 @@ void segmented_warp_reduce_kernel(const T* d_input, Flag* d_flags, T* d_output)
     d_output[i] = value;
 }
 
-template<
-    bool Segmented,
-    unsigned int WarpSize,
-    unsigned int BlockSize,
-    unsigned int Trials,
-    class T,
-    class Flag
->
-inline
-auto execute_warp_reduce_kernel(T* input, T* output, Flag* /* flags */,
-                                size_t size, hipStream_t stream)
-    -> typename std::enable_if<!Segmented>::type
+template<bool         Segmented,
+         unsigned int WarpSize,
+         unsigned int BlockSize,
+         unsigned int Trials,
+         class T,
+         class Flag>
+inline auto execute_warp_reduce_kernel(
+    T* input, T* output, Flag* /* flags */, size_t size, hipStream_t stream) ->
+    typename std::enable_if<!Segmented>::type
 {
-    hipLaunchKernelGGL(
-        HIP_KERNEL_NAME(warp_reduce_kernel<T, WarpSize, Trials>),
-        dim3(size/BlockSize), dim3(BlockSize), 0, stream,
-        input, output
-    );
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(warp_reduce_kernel<T, WarpSize, Trials>),
+                       dim3(size / BlockSize),
+                       dim3(BlockSize),
+                       0,
+                       stream,
+                       input,
+                       output);
     HIP_CHECK(hipPeekAtLastError());
 }
 
-template<
-    bool Segmented,
-    unsigned int WarpSize,
-    unsigned int BlockSize,
-    unsigned int Trials,
-    class T,
-    class Flag
->
-inline
-auto execute_warp_reduce_kernel(T* input, T* output, Flag* flags,
-                                size_t size, hipStream_t stream)
-    -> typename std::enable_if<Segmented>::type
+template<bool         Segmented,
+         unsigned int WarpSize,
+         unsigned int BlockSize,
+         unsigned int Trials,
+         class T,
+         class Flag>
+inline auto
+    execute_warp_reduce_kernel(T* input, T* output, Flag* flags, size_t size, hipStream_t stream) ->
+    typename std::enable_if<Segmented>::type
 {
-    hipLaunchKernelGGL(
-        HIP_KERNEL_NAME(segmented_warp_reduce_kernel<T, Flag, WarpSize, Trials>),
-        dim3(size/BlockSize), dim3(BlockSize), 0, stream,
-        input, flags, output
-    );
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(segmented_warp_reduce_kernel<T, Flag, WarpSize, Trials>),
+                       dim3(size / BlockSize),
+                       dim3(BlockSize),
+                       0,
+                       stream,
+                       input,
+                       flags,
+                       output);
     HIP_CHECK(hipPeekAtLastError());
 }
 
-template<
-    bool Segmented,
-    class T,
-    unsigned int WarpSize,
-    unsigned int BlockSize,
-    unsigned int Trials = 100
->
+template<bool Segmented,
+         class T,
+         unsigned int WarpSize,
+         unsigned int BlockSize,
+         unsigned int Trials = 100>
 void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
 {
     using flag_type = unsigned char;
 
-    const auto size = BlockSize * ((N + BlockSize - 1)/BlockSize);
+    const auto size = BlockSize * ((N + BlockSize - 1) / BlockSize);
 
-    std::vector<T> input = benchmark_utils::get_random_data<T>(size, T(0), T(10));
+    std::vector<T>         input = benchmark_utils::get_random_data<T>(size, T(0), T(10));
     std::vector<flag_type> flags = benchmark_utils::get_random_data<flag_type>(size, 0, 1);
-    T * d_input;
-    flag_type * d_flags;
-    T * d_output;
+    T*                     d_input;
+    flag_type*             d_flags;
+    T*                     d_output;
     HIP_CHECK(hipMalloc(&d_input, size * sizeof(T)));
     HIP_CHECK(hipMalloc(&d_flags, size * sizeof(flag_type)));
     HIP_CHECK(hipMalloc(&d_output, size * sizeof(T)));
-    HIP_CHECK(
-        hipMemcpy(
-            d_input, input.data(),
-            size * sizeof(T),
-            hipMemcpyHostToDevice
-        )
-    );
-    HIP_CHECK(
-        hipMemcpy(
-            d_flags, flags.data(),
-            size * sizeof(flag_type),
-            hipMemcpyHostToDevice
-        )
-    );
+    HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice));
+    HIP_CHECK(hipMemcpy(d_flags, flags.data(), size * sizeof(flag_type), hipMemcpyHostToDevice));
     HIP_CHECK(hipDeviceSynchronize());
 
     for(auto _ : state)
     {
         auto start = std::chrono::high_resolution_clock::now();
-        execute_warp_reduce_kernel<Segmented, WarpSize, BlockSize, Trials>(
-            d_input, d_output, d_flags, size, stream
-        );
+        execute_warp_reduce_kernel<Segmented, WarpSize, BlockSize, Trials>(d_input,
+                                                                           d_output,
+                                                                           d_flags,
+                                                                           size,
+                                                                           stream);
         HIP_CHECK(hipDeviceSynchronize());
 
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
         state.SetIterationTime(elapsed_seconds.count());
     }
     state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T));
@@ -181,44 +156,35 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
     HIP_CHECK(hipFree(d_flags));
 }
 
-#define CREATE_BENCHMARK(T, WS, BS) \
-benchmark::RegisterBenchmark( \
-    (std::string("warp_reduce<Datatype:" #T ",Warp Size:" #WS ",Block Size:" #BS ">.SubAlgorithm Name:") + name).c_str(), \
-    &run_benchmark<Segmented, T, WS, BS>, \
-    stream, size \
-)
-
+#define CREATE_BENCHMARK(T, WS, BS)                                                        \
+    benchmark::RegisterBenchmark(std::string("warp_reduce<data_type:" #T ",warp_size:" #WS \
+                                             ",block_size:" #BS ">.sub_algorithm_name:"    \
+                                             + name)                                       \
+                                     .c_str(),                                             \
+                                 &run_benchmark<Segmented, T, WS, BS>,                     \
+                                 stream,                                                   \
+                                 size)
 
 // If warp size limit is 16
-#define BENCHMARK_TYPE_WS16(type) \
-    CREATE_BENCHMARK(type, 15, 32), \
-    CREATE_BENCHMARK(type, 16, 32)
-
+#define BENCHMARK_TYPE_WS16(type) CREATE_BENCHMARK(type, 15, 32), CREATE_BENCHMARK(type, 16, 32)
 
 // If warp size limit is 32
-#define BENCHMARK_TYPE_WS32(type) \
-    BENCHMARK_TYPE_WS16(type), \
-    CREATE_BENCHMARK(type, 31, 32), \
-    CREATE_BENCHMARK(type, 32, 32), \
-    CREATE_BENCHMARK(type, 32, 64)
-
+#define BENCHMARK_TYPE_WS32(type)                                                              \
+    BENCHMARK_TYPE_WS16(type), CREATE_BENCHMARK(type, 31, 32), CREATE_BENCHMARK(type, 32, 32), \
+        CREATE_BENCHMARK(type, 32, 64)
 
 // If warp size limit is 64
-#define BENCHMARK_TYPE_WS64(type) \
-    BENCHMARK_TYPE_WS32(type), \
-    CREATE_BENCHMARK(type, 37, 64), \
-    CREATE_BENCHMARK(type, 61, 64), \
-    CREATE_BENCHMARK(type, 64, 64)
-
+#define BENCHMARK_TYPE_WS64(type)                                                              \
+    BENCHMARK_TYPE_WS32(type), CREATE_BENCHMARK(type, 37, 64), CREATE_BENCHMARK(type, 61, 64), \
+        CREATE_BENCHMARK(type, 64, 64)
 
 template<bool Segmented>
-void add_benchmarks(const std::string& name,
+void add_benchmarks(const std::string&                            name,
                     std::vector<benchmark::internal::Benchmark*>& benchmarks,
-                    hipStream_t stream,
-                    size_t size)
+                    hipStream_t                                   stream,
+                    size_t                                        size)
 {
-    std::vector<benchmark::internal::Benchmark*> bs =
-    {
+    std::vector<benchmark::internal::Benchmark*> bs = {
 #if HIPCUB_WARP_THREADS_MACRO == 16
         BENCHMARK_TYPE_WS16(int),
         BENCHMARK_TYPE_WS16(float),
@@ -242,7 +208,7 @@ void add_benchmarks(const std::string& name,
     benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
 }
 
-int main(int argc, char *argv[])
+int main(int argc, char* argv[])
 {
     cli::Parser parser(argc, argv);
     parser.set_optional<size_t>("size", "size", DEFAULT_N, "number of values");
@@ -251,15 +217,15 @@ int main(int argc, char *argv[])
 
     // Parse argv
     benchmark::Initialize(&argc, argv);
-    const size_t size = parser.get<size_t>("size");
-    const int trials = parser.get<int>("trials");
+    const size_t size   = parser.get<size_t>("size");
+    const int    trials = parser.get<int>("trials");
 
     std::cout << "benchmark_warp_reduce" << std::endl;
 
     // HIP
-    hipStream_t stream = 0; // default
+    hipStream_t     stream = 0; // default
     hipDeviceProp_t devProp;
-    int device_id = 0;
+    int             device_id = 0;
     HIP_CHECK(hipGetDevice(&device_id));
     HIP_CHECK(hipGetDeviceProperties(&devProp, device_id));
     std::cout << "[HIP] Device name: " << devProp.name << std::endl;
diff --git a/benchmark/benchmark_warp_scan.cpp b/benchmark/benchmark_warp_scan.cpp
index c66b31aa..c38defdf 100644
--- a/benchmark/benchmark_warp_scan.cpp
+++ b/benchmark/benchmark_warp_scan.cpp
@@ -9,8 +9,8 @@
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -116,23 +116,17 @@ template<class Benchmark,
 void run_benchmark(benchmark::State& state, hipStream_t stream, size_t size)
 {
     // Make sure size is a multiple of BlockSize
-    size = BlockSize * ((size + BlockSize - 1)/BlockSize);
+    size = BlockSize * ((size + BlockSize - 1) / BlockSize);
     // Allocate and fill memory
     std::vector<T> input(size, 1.0f);
-    T * d_input;
-    T * d_output;
+    T*             d_input;
+    T*             d_output;
     HIP_CHECK(hipMalloc(&d_input, size * sizeof(T)));
     HIP_CHECK(hipMalloc(&d_output, size * sizeof(T)));
-    HIP_CHECK(
-        hipMemcpy(
-            d_input, input.data(),
-            size * sizeof(T),
-            hipMemcpyHostToDevice
-        )
-    );
+    HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice));
     HIP_CHECK(hipDeviceSynchronize());
 
-    for (auto _ : state)
+    for(auto _ : state)
     {
         auto start = std::chrono::high_resolution_clock::now();
         hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel<Benchmark, T, BlockSize, WarpSize, Trials>),
@@ -147,8 +141,8 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t size)
         HIP_CHECK(hipDeviceSynchronize());
 
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
 
         state.SetIterationTime(elapsed_seconds.count());
     }
@@ -160,9 +154,9 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t size)
 }
 
 #define CREATE_BENCHMARK_IMPL(T, BS, WS, OP)                                              \
-    benchmark::RegisterBenchmark((std::string("warp_scan<Datatype:" #T ",Block Size:" #BS \
-                                              ",Warp Size:" #WS ">.Method Name:")         \
-                                  + method_name)                                          \
+    benchmark::RegisterBenchmark(std::string("warp_scan<data_type:" #T ",block_size:" #BS \
+                                             ",warp_size:" #WS ">.sub_algorithm_name:"    \
+                                             + method_name)                               \
                                      .c_str(),                                            \
                                  &run_benchmark<OP, T, BS, WS>,                           \
                                  stream,                                                  \
@@ -199,7 +193,7 @@ void add_benchmarks(std::vector<benchmark::internal::Benchmark*>& benchmarks,
                     hipStream_t                                   stream,
                     size_t                                        size)
 {
-    using custom_double2 = benchmark_utils::custom_type<double, double>;
+    using custom_double2    = benchmark_utils::custom_type<double, double>;
     using custom_int_double = benchmark_utils::custom_type<int, double>;
 
     std::vector<benchmark::internal::Benchmark*> new_benchmarks = {
@@ -229,7 +223,7 @@ void add_benchmarks(std::vector<benchmark::internal::Benchmark*>& benchmarks,
     benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end());
 }
 
-int main(int argc, char *argv[])
+int main(int argc, char* argv[])
 {
     cli::Parser parser(argc, argv);
     parser.set_optional<size_t>("size", "size", DEFAULT_N, "number of values");
@@ -238,15 +232,15 @@ int main(int argc, char *argv[])
 
     // Parse argv
     benchmark::Initialize(&argc, argv);
-    const size_t size = parser.get<size_t>("size");
-    const int trials = parser.get<int>("trials");
+    const size_t size   = parser.get<size_t>("size");
+    const int    trials = parser.get<int>("trials");
 
     std::cout << "benchmark_warp_scan" << std::endl;
 
     // HIP
-    hipStream_t stream = 0; // default
+    hipStream_t     stream = 0; // default
     hipDeviceProp_t devProp;
-    int device_id = 0;
+    int             device_id = 0;
     HIP_CHECK(hipGetDevice(&device_id));
     HIP_CHECK(hipGetDeviceProperties(&devProp, device_id));
     std::cout << "[HIP] Device name: " << devProp.name << std::endl;
diff --git a/benchmark/benchmark_warp_store.cpp b/benchmark/benchmark_warp_store.cpp
index a331f16b..8e88661c 100644
--- a/benchmark/benchmark_warp_store.cpp
+++ b/benchmark/benchmark_warp_store.cpp
@@ -9,8 +9,8 @@
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -40,15 +40,15 @@ __device__ auto warp_store_benchmark(T* d_output)
     -> std::enable_if_t<benchmark_utils::device_test_enabled_for_warp_size_v<LogicalWarpSize>>
 {
     T thread_data[ItemsPerThread];
-    #pragma unroll
-    for (unsigned i = 0; i < ItemsPerThread; ++i)
+#pragma unroll
+    for(unsigned i = 0; i < ItemsPerThread; ++i)
     {
         thread_data[i] = static_cast<T>(i);
     }
 
     using WarpStoreT = ::hipcub::WarpStore<T, ItemsPerThread, Algorithm, LogicalWarpSize>;
-    constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize;
-    constexpr int tile_size = ItemsPerThread * LogicalWarpSize;
+    constexpr unsigned                          warps_in_block = BlockSize / LogicalWarpSize;
+    constexpr int                               tile_size      = ItemsPerThread * LogicalWarpSize;
     __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block];
     const unsigned                              warp_id = threadIdx.x / LogicalWarpSize;
     const unsigned global_warp_id                       = blockIdx.x * warps_in_block + warp_id;
@@ -75,27 +75,25 @@ __global__ __launch_bounds__(BlockSize) void warp_store_kernel(T* d_output)
     warp_store_benchmark<BlockSize, ItemsPerThread, LogicalWarpSize, Algorithm>(d_output);
 }
 
-template<
-    class T,
-    unsigned BlockSize,
-    unsigned ItemsPerThread,
-    unsigned LogicalWarpSize,
-    ::hipcub::WarpStoreAlgorithm Algorithm,
-    unsigned Trials = 100
->
+template<class T,
+         unsigned                     BlockSize,
+         unsigned                     ItemsPerThread,
+         unsigned                     LogicalWarpSize,
+         ::hipcub::WarpStoreAlgorithm Algorithm,
+         unsigned                     Trials = 100>
 void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
 {
     constexpr unsigned items_per_block = BlockSize * ItemsPerThread;
-    const unsigned size = items_per_block * ((N + items_per_block - 1) / items_per_block);
+    const unsigned     size = items_per_block * ((N + items_per_block - 1) / items_per_block);
 
-    T * d_output;
+    T* d_output;
     HIP_CHECK(hipMalloc(&d_output, size * sizeof(T)));
 
-    for (auto _ : state)
+    for(auto _ : state)
     {
         auto start = std::chrono::high_resolution_clock::now();
 
-        for (size_t i = 0; i < Trials; ++i)
+        for(size_t i = 0; i < Trials; ++i)
         {
             warp_store_kernel<BlockSize, ItemsPerThread, LogicalWarpSize, Algorithm>
                 <<<dim3(size / items_per_block), dim3(BlockSize), 0, stream>>>(d_output);
@@ -103,8 +101,8 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
         HIP_CHECK(hipPeekAtLastError())
         HIP_CHECK(hipDeviceSynchronize());
         auto end = std::chrono::high_resolution_clock::now();
-        auto elapsed_seconds =
-            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
         state.SetIterationTime(elapsed_seconds.count());
     }
     state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T));
@@ -113,14 +111,16 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
     HIP_CHECK(hipFree(d_output));
 }
 
-#define CREATE_BENCHMARK(T, BS, IT, WS, ALG) \
-benchmark::RegisterBenchmark( \
-    "warp_store<Datatype:" #T ",Block Size:" #BS ",Items Per Thread:" #IT ",Warp Size:" #WS ",Store Algorithm:" #ALG ">.", \
-    &run_benchmark<T, BS, IT, WS, ALG>, \
-    stream, size \
-)
+#define CREATE_BENCHMARK(T, BS, IT, WS, ALG)                                               \
+    benchmark::RegisterBenchmark(std::string("warp_store<data_type:" #T ",block_size:" #BS \
+                                             ",items_per_thread:" #IT ",warp_size:" #WS    \
+                                             ",sub_algorithm_name:" #ALG ">.")             \
+                                     .c_str(),                                             \
+                                 &run_benchmark<T, BS, IT, WS, ALG>,                       \
+                                 stream,                                                   \
+                                 size)
 
-int main(int argc, char *argv[])
+int main(int argc, char* argv[])
 {
     cli::Parser parser(argc, argv);
     parser.set_optional<size_t>("size", "size", DEFAULT_N, "number of values");
@@ -129,15 +129,15 @@ int main(int argc, char *argv[])
 
     // Parse argv
     benchmark::Initialize(&argc, argv);
-    const size_t size = parser.get<size_t>("size");
-    const int trials = parser.get<int>("trials");
+    const size_t size   = parser.get<size_t>("size");
+    const int    trials = parser.get<int>("trials");
 
     std::cout << "benchmark_warp_store" << std::endl;
 
     // HIP
-    hipStream_t stream = 0; // default
+    hipStream_t     stream = 0; // default
     hipDeviceProp_t devProp;
-    int device_id = 0;
+    int             device_id = 0;
     HIP_CHECK(hipGetDevice(&device_id));
     HIP_CHECK(hipGetDeviceProperties(&devProp, device_id));
     std::cout << "[HIP] Device name: " << devProp.name << std::endl;
@@ -187,7 +187,7 @@ int main(int argc, char *argv[])
         // CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_STORE_TRANSPOSE)
     };
 
-    if (::benchmark_utils::is_warp_size_supported(64))
+    if(::benchmark_utils::is_warp_size_supported(64))
     {
         std::vector<benchmark::internal::Benchmark*> additional_benchmarks{
             CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_STORE_DIRECT),
@@ -221,36 +221,36 @@ int main(int argc, char *argv[])
             CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_STORE_STRIPED),
             CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_STORE_VECTORIZE),
             // WARP_STORE_TRANSPOSE removed because of shared memory limit
-            // CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_STORE_TRANSPOSE),
+            // CREATE_BENCHMARK(double, 256, 16, 64,
+            // ::hipcub::WARP_STORE_TRANSPOSE),
             CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_STORE_DIRECT),
             CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_STORE_STRIPED),
             CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_STORE_VECTORIZE),
             // WARP_STORE_TRANSPOSE removed because of shared memory limit
-            // CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_STORE_TRANSPOSE),
+            // CREATE_BENCHMARK(double, 256, 32, 64,
+            // ::hipcub::WARP_STORE_TRANSPOSE),
             CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_STORE_DIRECT),
             CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_STORE_STRIPED),
             CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_STORE_VECTORIZE)
             // WARP_STORE_TRANSPOSE removed because of shared memory limit
             // CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_STORE_TRANSPOSE)
         };
-        benchmarks.insert(
-            benchmarks.end(),
-            additional_benchmarks.begin(),
-            additional_benchmarks.end()
-        );
+        benchmarks.insert(benchmarks.end(),
+                          additional_benchmarks.begin(),
+                          additional_benchmarks.end());
     }
 
     // Use manual timing
-    for (auto& b : benchmarks)
+    for(auto& b : benchmarks)
     {
         b->UseManualTime();
         b->Unit(benchmark::kMillisecond);
     }
 
     // Force number of iterations
-    if (trials > 0)
+    if(trials > 0)
     {
-        for (auto& b : benchmarks)
+        for(auto& b : benchmarks)
         {
             b->Iterations(trials);
         }