diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 5623a7112..b5545517c 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -1,6 +1,7 @@ find_package(Boost REQUIRED COMPONENTS program_options) add_subdirectory(bvh_driver) +add_subdirectory(sort) if (ARBORX_ENABLE_MPI) add_subdirectory(distributed_tree_driver) endif() diff --git a/benchmarks/sort/CMakeLists.txt b/benchmarks/sort/CMakeLists.txt new file mode 100644 index 000000000..e615e713c --- /dev/null +++ b/benchmarks/sort/CMakeLists.txt @@ -0,0 +1,24 @@ +if(NOT PROJECT_NAME) + cmake_minimum_required(VERSION 3.12) + project(SortBenchmark CXX) + + find_package(Kokkos 3.0 REQUIRED QUIET) + if(Kokkos_ENABLE_CUDA) + kokkos_check(OPTIONS CUDA_LAMBDA) + endif() + + find_package(Boost REQUIRED COMPONENTS program_options) + + set(BENCHMARK_NAME SortBenchmark) +else() + set(BENCHMARK_NAME ArborX_SortBenchmark) +endif() + +# We require version 1.4.0 or higher but the format used by Google benchmark is +# wrong and thus, we cannot check the version during the configuration step. +find_package(benchmark REQUIRED) + +add_executable(${BENCHMARK_NAME}.exe sort_benchmark.cpp pss_common.hpp pss_parallel_stable_sort.hpp sort_benchmark_helpers.hpp) +target_compile_features(${BENCHMARK_NAME}.exe PUBLIC cxx_std_14) +target_link_libraries(${BENCHMARK_NAME}.exe Kokkos::kokkos benchmark::benchmark Boost::program_options) +add_test(NAME ${BENCHMARK_NAME} COMMAND ./${BENCHMARK_NAME}.exe --num-values 10000 --value-type float --benchmark_color=true) diff --git a/benchmarks/sort/pss_common.hpp b/benchmarks/sort/pss_common.hpp new file mode 100644 index 000000000..ba0703fe1 --- /dev/null +++ b/benchmarks/sort/pss_common.hpp @@ -0,0 +1,126 @@ +/* + Copyright (C) 2014 Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef PSS_COMMON_HPP +#define PSS_COMMON_HPP + +namespace pss { + +namespace internal { + +//! Destroy sequence [xs,xe) +template +void serial_destroy( RandomAccessIterator zs, RandomAccessIterator ze ) { + typedef typename std::iterator_traits::value_type T; + while( zs!=ze ) { + --ze; + (*ze).~T(); + } +} + +//! Merge sequences [xs,xe) and [ys,ye) to output sequence [zs,(xe-xs)+(ye-ys)), using std::move +template +void serial_move_merge(RandomAccessIterator1 xs, RandomAccessIterator1 xe, + RandomAccessIterator2 ys, RandomAccessIterator2 ye, + RandomAccessIterator3 zs, Compare comp) { + if( xs!=xe ) { + if( ys!=ye ) { + for(;;) { + if( comp(*ys,*xs) ) { + *zs = std::move(*ys); + ++zs; + if( ++ys==ye ) break; + } else { + *zs = std::move(*xs); + ++zs; + if( ++xs==xe ) goto movey; + } + } + } + ys = xs; + ye = xe; + } +movey: + std::move( ys, ye, zs ); +} + +template +void stable_sort_base_case(RandomAccessIterator1 xs, RandomAccessIterator1 xe, + RandomAccessIterator2 zs, + int inplace, Compare comp) { + std::stable_sort(xs, xe, comp); + if (inplace != 2) { + RandomAccessIterator2 ze = zs + (xe-xs); + typedef typename std::iterator_traits::value_type T; + if( inplace ) + // Initialize the temporary buffer + for(; zs +void parallel_stable_sort(RandomAccessIterator xs, RandomAccessIterator xe) { + typedef typename std::iterator_traits::value_type T; + parallel_stable_sort(xs, xe, std::less()); +} + +} // namespace pss + +#endif diff --git a/benchmarks/sort/pss_parallel_stable_sort.hpp b/benchmarks/sort/pss_parallel_stable_sort.hpp new file mode 100644 index 000000000..ae3a6a68c --- /dev/null +++ b/benchmarks/sort/pss_parallel_stable_sort.hpp @@ -0,0 +1,120 @@ +/* + Copyright (C) 2014 Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef PSS_HPP +#define PSS_HPP + +#include "pss_common.hpp" + +namespace pss { + +namespace internal { + +// Merge sequences [xs,xe) and [ys,ye) to output sequence [zs,zs+(xe-xs)+(ye-ys)) +// Destroy input sequence iff destroy==true +template +void parallel_move_merge(RandomAccessIterator1 xs, RandomAccessIterator1 xe, + RandomAccessIterator2 ys, RandomAccessIterator2 ye, + RandomAccessIterator3 zs, + bool destroy, Compare comp, + ssize_t cutoff) { + while( (xe-xs) + (ye-ys) > cutoff ) { + RandomAccessIterator1 xm; + RandomAccessIterator2 ym; + if( xe-xs < ye-ys ) { + ym = ys+(ye-ys)/2; + xm = std::upper_bound(xs,xe,*ym,comp); + } else { + xm = xs+(xe-xs)/2; + ym = std::lower_bound(ys,ye,*xm,comp); + } +#pragma omp task untied mergeable firstprivate(xs,xm,ys,ym,zs,destroy,comp) + parallel_move_merge( xs, xm, ys, ym, zs, destroy, comp, cutoff ); + zs += (xm-xs) + (ym-ys); + xs = xm; + ys = ym; + } + serial_move_merge( xs, xe, ys, ye, zs, comp ); + if( destroy ) { + serial_destroy( xs, xe ); + serial_destroy( ys, ye ); + } +#pragma omp taskwait +} + +// Sorts [xs,xe), where zs[0:xe-xs) is temporary buffer supplied by caller. +// Result is in [xs,xe) if inplace==true, otherwise in [zs,zs+(xe-xs)) +template +void parallel_stable_sort_aux(RandomAccessIterator1 xs, RandomAccessIterator1 xe, + RandomAccessIterator2 zs, + int inplace, Compare comp, + ssize_t cutoff) { + if((xe - xs) <= cutoff) { + stable_sort_base_case(xs, xe, zs, inplace, comp); + } else { + RandomAccessIterator1 xm = xs + (xe-xs)/2; + RandomAccessIterator2 zm = zs + (xm-xs); + RandomAccessIterator2 ze = zs + (xe-xs); +#pragma omp task + parallel_stable_sort_aux( xs, xm, zs, !inplace, comp, cutoff ); + parallel_stable_sort_aux( xm, xe, zm, !inplace, comp, cutoff ); +#pragma omp taskwait + if( inplace ) + parallel_move_merge( zs, zm, zm, ze, xs, inplace==2, comp, cutoff ); + else + parallel_move_merge( xs, xm, xm, xe, zs, false, comp, cutoff ); + } +} + +} // namespace internal + +template +void parallel_stable_sort(RandomAccessIterator xs, RandomAccessIterator xe, + Compare comp) { + auto n = xe - xs; + auto t = omp_get_max_threads(); + auto cutoff = n / t; + if (cutoff < 2) cutoff = 2; + typedef typename std::iterator_traits::value_type T; + internal::raw_buffer z(size_t(n) * sizeof(T)); +#pragma omp parallel +#pragma omp master + internal::parallel_stable_sort_aux( xs, xe, static_cast(z.get()), 2, comp, cutoff ); +} + +} // namespace pss + +#endif diff --git a/benchmarks/sort/sort_benchmark.cpp b/benchmarks/sort/sort_benchmark.cpp new file mode 100644 index 000000000..2439f9590 --- /dev/null +++ b/benchmarks/sort/sort_benchmark.cpp @@ -0,0 +1,383 @@ +/**************************************************************************** + * Copyright (c) 2012-2020 by the ArborX authors * + * All rights reserved. * + * * + * This file is part of the ArborX library. ArborX is * + * distributed under a BSD 3-clause license. For the licensing terms see * + * the LICENSE file in the top-level directory. * + * * + * SPDX-License-Identifier: BSD-3-Clause * + ****************************************************************************/ + +#include + +#include + +#include +#include + +#include "sort_benchmark_helpers.hpp" +#include + +template +void buildRandomData(ViewType data) +{ + using ValueType = typename ViewType::value_type; + std::conditional_t::value, + std::uniform_int_distribution, + std::uniform_real_distribution> + distribution(0, 10000); + std::default_random_engine generator; + auto random = [&distribution, &generator]() { + return distribution(generator); + }; + + auto data_host = Kokkos::create_mirror_view(Kokkos::HostSpace{}, data); + unsigned int const n = data.extent(0); + for (unsigned int i = 0; i < n; ++i) + data_host(i) = random(); + Kokkos::deep_copy(data, data_host); +} + +template +void sort(benchmark::State &state) +{ + using MemorySpace = typename SortAlgorithm::memory_space; + using ValueType = typename SortAlgorithm::value_type; + + int const n = state.range(0); + + // Construct random points + Kokkos::View data("data", n); + Kokkos::View data_copy("data_copy", n); + buildRandomData(data_copy); + + for (auto _ : state) + { + Kokkos::deep_copy(data, data_copy); + auto const start = std::chrono::high_resolution_clock::now(); + SortAlgorithm::sort(data); + auto const end = std::chrono::high_resolution_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + state.SetIterationTime(elapsed_seconds.count()); + } +} + +template +void sort_and_compute_permutation(benchmark::State &state) +{ + using MemorySpace = typename SortAlgorithm::memory_space; + using ValueType = typename SortAlgorithm::value_type; + + int const n = state.range(0); + + // Construct random points + Kokkos::View data("data", n); + Kokkos::View data_copy("data_copy", n); + buildRandomData(data_copy); + + for (auto _ : state) + { + Kokkos::deep_copy(data, data_copy); + auto const start = std::chrono::high_resolution_clock::now(); + auto permute = SortAlgorithm::sortAndComputePermutation(data); + std::ignore = permute; + auto const end = std::chrono::high_resolution_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + state.SetIterationTime(elapsed_seconds.count()); + } +} + +template +void compute_permutation(benchmark::State &state) +{ + using MemorySpace = typename SortAlgorithm::memory_space; + using ValueType = typename SortAlgorithm::value_type; + + int const n = state.range(0); + + // Construct random points + Kokkos::View data("data", n); + Kokkos::View data_copy("data_copy", n); + buildRandomData(data_copy); + + for (auto _ : state) + { + Kokkos::deep_copy(data, data_copy); + auto const start = std::chrono::high_resolution_clock::now(); + auto permute = SortAlgorithm::computePermutation(data); + auto const end = std::chrono::high_resolution_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + state.SetIterationTime(elapsed_seconds.count()); + } +} +template +void apply_permutation(benchmark::State &state) +{ + using MemorySpace = typename SortAlgorithm::memory_space; + using ValueType = typename SortAlgorithm::value_type; + + int const n = state.range(0); + + // Construct random points + Kokkos::View data("data", n); + Kokkos::View data_copy("data_copy", n); + buildRandomData(data_copy); + + auto permute = SortAlgorithm::sortAndComputePermutation(data_copy); + + for (auto _ : state) + { + auto const start = std::chrono::high_resolution_clock::now(); + SortAlgorithm::applyPermutation(permute, data_copy, data); + auto const end = std::chrono::high_resolution_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + state.SetIterationTime(elapsed_seconds.count()); + } +} + +template +void sort_compute_and_apply_permutation(benchmark::State &state) +{ + using MemorySpace = typename SortAlgorithm::memory_space; + using ValueType = typename SortAlgorithm::value_type; + + int const n = state.range(0); + + // Construct random points + Kokkos::View data_orig("data", n); + Kokkos::View data("data", n); + Kokkos::View data_copy("data_copy", n); + buildRandomData(data_orig); + + for (auto _ : state) + { + Kokkos::deep_copy(data, data_orig); + auto const start = std::chrono::high_resolution_clock::now(); + auto permute = SortAlgorithm::sortAndComputePermutation(data); + SortAlgorithm::applyPermutation(permute, data, data_copy); + auto const end = std::chrono::high_resolution_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + state.SetIterationTime(elapsed_seconds.count()); + } +} + +#define REGISTER_SORT_BENCHMARK(SortAlgorithm) \ + BENCHMARK_TEMPLATE(sort, SortAlgorithm) \ + ->Args({n}) \ + ->UseManualTime() \ + ->Unit(benchmark::kMicrosecond); + +#define REGISTER_SORT_AND_COMPUTE_PERMUTATION_BENCHMARK(SortAlgorithm) \ + BENCHMARK_TEMPLATE(sort_and_compute_permutation, SortAlgorithm) \ + ->Args({n}) \ + ->UseManualTime() \ + ->Unit(benchmark::kMicrosecond); + +#define REGISTER_COMPUTE_PERMUTATION_BENCHMARK(SortAlgorithm) \ + BENCHMARK_TEMPLATE(compute_permutation, SortAlgorithm) \ + ->Args({n}) \ + ->UseManualTime() \ + ->Unit(benchmark::kMicrosecond); + +#define REGISTER_APPLY_PERMUTATION_BENCHMARK(SortAlgorithm) \ + BENCHMARK_TEMPLATE(apply_permutation, SortAlgorithm) \ + ->Args({n}) \ + ->UseManualTime() \ + ->Unit(benchmark::kMicrosecond); + +#define REGISTER_SORT_COMPUTE_AND_APPLY_PERMUTATION_BENCHMARK(SortAlgorithm) \ + BENCHMARK_TEMPLATE(sort_compute_and_apply_permutation, SortAlgorithm) \ + ->Args({n}) \ + ->UseManualTime() \ + ->Unit(benchmark::kMicrosecond); + +// NOTE Motivation for this class that stores the argument count and values +// is I could not figure out how to make the parser consume arguments with +// Boost.Program_options +// Benchmark removes its own arguments from the command line arguments. This +// means, that by virtue of returning references to internal data members in +// argc() and argv() function, it will necessarily modify the members. It +// will decrease _argc, and "reduce" _argv data. Hence, we must keep a copy +// of _argv that is not modified from the outside to release memory in the +// destructor correctly. +class CmdLineArgs +{ +private: + int _argc; + std::vector _argv; + std::vector _owner_ptrs; + +public: + CmdLineArgs(std::vector const &args, char const *exe) + : _argc(args.size() + 1) + , _owner_ptrs{new char[std::strlen(exe) + 1]} + { + std::strcpy(_owner_ptrs[0], exe); + _owner_ptrs.reserve(_argc); + for (auto const &s : args) + { + _owner_ptrs.push_back(new char[s.size() + 1]); + std::strcpy(_owner_ptrs.back(), s.c_str()); + } + _argv = _owner_ptrs; + } + + ~CmdLineArgs() + { + for (auto p : _owner_ptrs) + { + delete[] p; + } + } + + int &argc() { return _argc; } + + char **argv() { return _argv.data(); } +}; + +template +void register_benchmarks(int const n) +{ + using Host = typename Kokkos::HostSpace::execution_space; +#if defined(KOKKOS_ENABLE_SERIAL) + using Serial = Kokkos::Serial; + using Kokkos_Serial = + KokkosHelper; + using StdSort_Serial = StdSortHelper; +#endif +#if defined(KOKKOS_ENABLE_OPENMP) + using OpenMP = Kokkos::OpenMP; + using Kokkos_OpenMP = + KokkosHelper; + using PSS_OpenMP = PSSHelper; +#ifdef ENABLE_GNU_PARALLEL + using GnuParallel_OpenMP = SortGnuParallel; +#endif +#endif +#if defined(KOKKOS_ENABLE_CUDA) + using Cuda = Kokkos::Cuda; + using CudaSpace = Kokkos::CudaSpace; + using Kokkos_Cuda = KokkosHelper; + using Kokkos_Cuda_Host = KokkosHelper; +#if defined(KOKKOS_ENABLE_SERIAL) + using Kokkos_Cuda_Serial = + KokkosHelper; +#endif + using Thrust_Cuda = ThrustHelper; +#endif + +#if defined(KOKKOS_ENABLE_SERIAL) + REGISTER_APPLY_PERMUTATION_BENCHMARK(Kokkos_Serial); + REGISTER_COMPUTE_PERMUTATION_BENCHMARK(StdSort_Serial); + REGISTER_SORT_AND_COMPUTE_PERMUTATION_BENCHMARK(Kokkos_Serial); + REGISTER_SORT_AND_COMPUTE_PERMUTATION_BENCHMARK(StdSort_Serial); + REGISTER_SORT_BENCHMARK(Kokkos_Serial); + REGISTER_SORT_BENCHMARK(StdSort_Serial); + REGISTER_SORT_COMPUTE_AND_APPLY_PERMUTATION_BENCHMARK(Kokkos_Serial); +#endif +#if defined(KOKKOS_ENABLE_OPENMP) + REGISTER_APPLY_PERMUTATION_BENCHMARK(Kokkos_OpenMP); + REGISTER_COMPUTE_PERMUTATION_BENCHMARK(PSS_OpenMP); + REGISTER_SORT_AND_COMPUTE_PERMUTATION_BENCHMARK(Kokkos_OpenMP); + REGISTER_SORT_AND_COMPUTE_PERMUTATION_BENCHMARK(PSS_OpenMP); + REGISTER_SORT_BENCHMARK(Kokkos_OpenMP); + REGISTER_SORT_BENCHMARK(PSS_OpenMP); + REGISTER_SORT_COMPUTE_AND_APPLY_PERMUTATION_BENCHMARK(Kokkos_OpenMP); +#ifdef ENABLE_GNU_PARALLEL + REGISTER_COMPUTE_PERMUTATION_BENCHMARK(GnuParallel_OpenMP); + REGISTER_SORT_AND_COMPUTE_PERMUTATION_BENCHMARK(GnuParallel_OpenMP); + REGISTER_SORT_BENCHMARK(GnuParallel_OpenMP); +#endif +#endif +#if defined(KOKKOS_ENABLE_CUDA) + REGISTER_APPLY_PERMUTATION_BENCHMARK(Kokkos_Cuda); + REGISTER_APPLY_PERMUTATION_BENCHMARK(Kokkos_Cuda_Host); + REGISTER_SORT_AND_COMPUTE_PERMUTATION_BENCHMARK(Kokkos_Cuda); + REGISTER_SORT_AND_COMPUTE_PERMUTATION_BENCHMARK(Kokkos_Cuda_Host); + REGISTER_SORT_AND_COMPUTE_PERMUTATION_BENCHMARK(Thrust_Cuda); + REGISTER_SORT_BENCHMARK(Kokkos_Cuda); + REGISTER_SORT_BENCHMARK(Kokkos_Cuda_Host); + REGISTER_SORT_BENCHMARK(Thrust_Cuda); + REGISTER_SORT_COMPUTE_AND_APPLY_PERMUTATION_BENCHMARK(Kokkos_Cuda); + REGISTER_SORT_COMPUTE_AND_APPLY_PERMUTATION_BENCHMARK(Kokkos_Cuda_Host); +#if defined(KOKKOS_ENABLE_SERIAL) + REGISTER_APPLY_PERMUTATION_BENCHMARK(Kokkos_Cuda_Serial); + REGISTER_SORT_AND_COMPUTE_PERMUTATION_BENCHMARK(Kokkos_Cuda_Serial); + REGISTER_SORT_BENCHMARK(Kokkos_Cuda_Serial); + REGISTER_SORT_COMPUTE_AND_APPLY_PERMUTATION_BENCHMARK(Kokkos_Cuda_Serial); +#endif +#endif +} + +int main(int argc, char *argv[]) +{ + Kokkos::ScopeGuard guard(argc, argv); + + namespace bpo = boost::program_options; + bpo::options_description desc("Allowed options"); + int n; + std::string value_type, size_type; + // clang-format off + desc.add_options() + ( "help", "produce help message" ) + ( "num-values,n", bpo::value(&n)->default_value(1000), "size" ) + ( "value-type", bpo::value(&value_type)->default_value("float"), "value type" ) + ( "size-type", bpo::value(&size_type)->default_value("unsigned int"), "size type" ) + ( "no-header", bpo::bool_switch(), "do not print version and hash" ) + ; + // clang-format on + bpo::variables_map vm; + bpo::parsed_options parsed = bpo::command_line_parser(argc, argv) + .options(desc) + .allow_unregistered() + .run(); + bpo::store(parsed, vm); + CmdLineArgs pass_further{ + bpo::collect_unrecognized(parsed.options, bpo::include_positional), + argv[0]}; + bpo::notify(vm); + + if (!vm["no-header"].as()) + { + std::cout << "value type : " << value_type << std::endl; + std::cout << "size type : " << size_type << std::endl; + } + + if (vm.count("help") > 0) + { + // Full list of options consists of Kokkos + Boost.Program_options + + // Google Benchmark and we still need to call benchmark::Initialize() to + // get those printed to the standard output. + std::cout << desc << "\n"; + int ac = 2; + char *av[] = {(char *)"ignored", (char *)"--help"}; + // benchmark::Initialize() calls exit(0) when `--help` so register + // Kokkos::finalize() to be called on normal program termination. + std::atexit(Kokkos::finalize); + benchmark::Initialize(&ac, av); + return 1; + } + + benchmark::Initialize(&pass_further.argc(), pass_further.argv()); + // Throw if some of the arguments have not been recognized. + std::ignore = + bpo::command_line_parser(pass_further.argc(), pass_further.argv()) + .options(bpo::options_description("")) + .run(); + + benchmark::Initialize(&argc, argv); + + // clang-format off + if (value_type == "float" && size_type == "unsigned int") register_benchmarks(n); + else if (value_type == "float" && size_type == "size_t") register_benchmarks(n); + else if (value_type == "double" && size_type == "unsigned int") register_benchmarks(n); + else if (value_type == "double" && size_type == "size_t") register_benchmarks(n); + else if (value_type == "int" && size_type == "unsigned int") register_benchmarks(n); + else if (value_type == "int" && size_type == "size_t") register_benchmarks(n); + // clang-format on + + benchmark::RunSpecifiedBenchmarks(); + + return EXIT_SUCCESS; +} diff --git a/benchmarks/sort/sort_benchmark_helpers.hpp b/benchmarks/sort/sort_benchmark_helpers.hpp new file mode 100644 index 000000000..2f527f21d --- /dev/null +++ b/benchmarks/sort/sort_benchmark_helpers.hpp @@ -0,0 +1,395 @@ +/**************************************************************************** + * Copyright (c) 2012-2020 by the ArborX authors * + * All rights reserved. * + * * + * This file is part of the ArborX library. ArborX is * + * distributed under a BSD 3-clause license. For the licensing terms see * + * the LICENSE file in the top-level directory. * + * * + * SPDX-License-Identifier: BSD-3-Clause * + ****************************************************************************/ + +#ifndef SORT_BENCHMARK_HELPERS_HPP +#define SORT_BENCHMARK_HELPERS_HPP + +#include +#include + +#include + +#include + +#if defined(KOKKOS_ENABLE_OPENMP) +#include "pss_parallel_stable_sort.hpp" +#endif + +#if defined(__GNUC__) && !defined(__CUDA_ARCH__) +#define ENABLE_GNU_PARALLEL +#endif + +#ifdef ENABLE_GNU_PARALLEL +#include // __gnu_parallel::sort, __gnu_parallel::transform +#endif + +// clang-format off +#if defined(KOKKOS_ENABLE_CUDA) +# if defined(KOKKOS_COMPILER_CLANG) && KOKKOS_COMPILER_CLANG < 900 +// Clang of version less than 9.0 cannot compile Thrust, failing with errors +// like this: +// /thrust/system/cuda/detail/core/agent_launcher.h:557:11: +// error: use of undeclared identifier 'va_printf' +// Defining _CubLog here allows us to avoid that code path, however disabling +// some debugging diagnostics. +// +# define _CubLog +# include +# include +# else // #if (KOKKOS_COMPILER_CLANG < 900) +# include +# include +# endif // #if (KOKKOS_COMPILER_CLANG < 900) +#endif // #if defined(KOKKOS_ENABLE_CUDA) +// clang-format on + +template +struct is_accessible_from : std::false_type +{ + static_assert(Kokkos::is_memory_space::value, ""); + static_assert(Kokkos::is_execution_space::value, ""); +}; + +template +struct is_accessible_from::accessible>::type> + : std::true_type +{ +}; + +template +void iota(ExecutionSpace exec_space, ViewType view) +{ + auto const n = view.extent(0); + Kokkos::parallel_for("iota", + Kokkos::RangePolicy(exec_space, 0, n), + KOKKOS_LAMBDA(int i) { view(i) = i; }); +} + +#if defined(KOKKOS_ENABLE_SERIAL) +template +struct StdSortHelper +{ + using value_type = ValueType; + using execution_space = Kokkos::Serial; + using memory_space = Kokkos::Serial::memory_space; + + static void sort(Kokkos::View view) + { + std::sort(view.data(), view.data() + view.extent(0)); + } + + static auto computePermutation(Kokkos::View view) + { + int const n = view.extent(0); + + Kokkos::View permute( + Kokkos::ViewAllocateWithoutInitializing("permute"), n); + for (int i = 0; i < n; ++i) + permute(i) = i; + + std::sort(permute.data(), permute.data() + n, + [&view](size_t const &a, size_t const &b) { + return view(a) < view(b); + }); + + return permute; + } + + static Kokkos::View + sortAndComputePermutation(Kokkos::View view) + { + int const n = view.extent(0); + + auto permute = computePermutation(view); + + std::vector view_copy(n); + memcpy(view_copy.data(), view.data(), n * sizeof(ValueType)); + for (int i = 0; i < n; ++i) + view(permute(i)) = view_copy[i]; + + return permute; + } +}; +#endif + +template +struct KokkosHelper; + +template +struct KokkosHelper< + ValueType, ExecutionSpace, MemorySpace, SizeType, + std::enable_if_t::value>> +{ + using value_type = ValueType; + using execution_space = ExecutionSpace; + using memory_space = MemorySpace; + + static auto + sortAndComputePermutation(Kokkos::View view) + { + int const n = view.extent(0); + + using ViewType = + Kokkos::View>; + using CompType = Kokkos::BinOp1D; + + Kokkos::MinMaxScalar result; + Kokkos::MinMax reducer(result); + Kokkos::parallel_reduce( + "min_max", Kokkos::RangePolicy(execution_space{}, 0, n), + Kokkos::Impl::min_max_functor(view), reducer); + + Kokkos::BinSort + bin_sort(view, CompType(n / 2, result.min_val, result.max_val), true); + bin_sort.create_permute_vector(); + bin_sort.sort(view); + + return bin_sort.get_permute_vector(); + } + + static void sort(Kokkos::View view) + { + auto permute = sortAndComputePermutation(view); + std::ignore = permute; + } + + static void applyPermutation(Kokkos::View permute, + Kokkos::View in, + Kokkos::View &out) + { + int const n = in.extent(0); + + Kokkos::parallel_for( + "apply_permutation", + Kokkos::RangePolicy(execution_space{}, 0, n), + KOKKOS_LAMBDA(int const i) { out(permute(i)) = in(i); }); + } +}; + +template +struct KokkosHelper< + ValueType, ExecutionSpace, MemorySpace, SizeType, + std::enable_if_t::value>> +{ + using value_type = ValueType; + using execution_space = ExecutionSpace; + using memory_space = MemorySpace; + + static auto + sortAndComputePermutation(Kokkos::View view) + { + int const n = view.extent(0); + + auto view_mirror = + Kokkos::create_mirror_view_and_copy(execution_space{}, view); + +#if 1 + using ViewType = decltype(view_mirror); + using CompType = Kokkos::BinOp1D; + + Kokkos::MinMaxScalar result; + Kokkos::MinMax reducer(result); + Kokkos::parallel_reduce( + "min_max", Kokkos::RangePolicy(execution_space{}, 0, n), + Kokkos::Impl::min_max_functor(view_mirror), reducer); + + Kokkos::BinSort + bin_sort(view_mirror, CompType(n / 2, result.min_val, result.max_val), + true); + bin_sort.create_permute_vector(); + bin_sort.sort(view_mirror); + + Kokkos::deep_copy(view, view_mirror); + + return bin_sort.get_permute_vector(); +#else + auto permute = + StdSortHelper::sortAndComputePermutation( + view_mirror); + Kokkos::deep_copy(view, view_mirror); + return permute; +#endif + } + + static void sort(Kokkos::View view) + { + auto permute = sortAndComputePermutation(view); + std::ignore = permute; + } + + template + static void + applyPermutation(Kokkos::View permute, + Kokkos::View in, + Kokkos::View &out) + { + int const n = in.extent(0); + + execution_space exec_space; + + auto in_mirror = Kokkos::create_mirror_view_and_copy(exec_space, in); + auto out_mirror = Kokkos::create_mirror_view(exec_space, out); + + Kokkos::parallel_for( + "apply_permutation", + Kokkos::RangePolicy(exec_space, 0, n), + KOKKOS_LAMBDA(int const i) { out_mirror(permute(i)) = in_mirror(i); }); + + Kokkos::deep_copy(out, out_mirror); + } +}; + +#if defined(KOKKOS_ENABLE_OPENMP) && defined(ENABLE_GNU_PARALLEL) +template +struct SortGnuParallel +{ + using value_type = ValueType; + using execution_space = Kokkos::OpenMP; + using memory_space = Kokkos::HostSpace; + + static void sort(Kokkos::View view) + { + int const n = view.extent(0); + __gnu_parallel::sort(view.data(), view.data() + n); + } + + static auto computePermutation(Kokkos::View view) + { + int const n = view.extent(0); + + Kokkos::View permute( + Kokkos::ViewAllocateWithoutInitializing("permute"), n); + iota(execution_space{}, permute); + + __gnu_parallel::sort(permute.data(), permute.data() + n, + [&view](size_t const &a, size_t const &b) { + return view(a) < view(b); + }); + + return permute; + } + + static Kokkos::View + sortAndComputePermutation(Kokkos::View view) + { + int const n = view.extent(0); + + auto permute = computePermutation(view); + + Kokkos::View view_copy("view_copy", n); + Kokkos::deep_copy(view_copy, view); + Kokkos::parallel_for( + "apply_permutation", + Kokkos::RangePolicy(execution_space{}, 0, n), + KOKKOS_LAMBDA(int i) { view(permute(i)) = view_copy(i); }); + + return permute; + } +}; +#endif + +#if defined(KOKKOS_ENABLE_OPENMP) +template +struct PSSHelper +{ + using value_type = ValueType; + using execution_space = Kokkos::OpenMP; + using memory_space = Kokkos::HostSpace; + + static void sort(Kokkos::View view) + { + int const n = view.extent(0); + pss::parallel_stable_sort(view.data(), view.data() + n, + std::less{}); + } + + static auto computePermutation(Kokkos::View view) + { + int const n = view.extent(0); + + Kokkos::View permute( + Kokkos::ViewAllocateWithoutInitializing("permute"), n); + iota(execution_space{}, permute); + + pss::parallel_stable_sort(permute.data(), permute.data() + n, + [&view](size_t const &a, size_t const &b) { + return view(a) < view(b); + }); + + return permute; + } + + static Kokkos::View + sortAndComputePermutation(Kokkos::View view) + { + int const n = view.extent(0); + + auto permute = computePermutation(view); + + Kokkos::View view_copy("view_copy", n); + Kokkos::deep_copy(view_copy, view); + Kokkos::parallel_for( + "apply_permutation", + Kokkos::RangePolicy(execution_space{}, 0, n), + KOKKOS_LAMBDA(int i) { view(permute(i)) = view_copy(i); }); + + return permute; + } +}; +#endif + +#if defined(KOKKOS_ENABLE_CUDA) +template +struct ThrustHelper +{ + using value_type = ValueType; + using execution_space = Kokkos::Cuda; + using memory_space = Kokkos::CudaSpace; + + static void sort(Kokkos::View view) + { + int const n = view.extent(0); + + auto begin_ptr = thrust::device_ptr(view.data()); + auto end_ptr = thrust::device_ptr(view.data() + n); + thrust::sort(begin_ptr, end_ptr); + } + + static auto + sortAndComputePermutation(Kokkos::View view) + { + int const n = view.extent(0); + + Kokkos::View permute( + Kokkos::ViewAllocateWithoutInitializing("permutation"), n); + + auto permute_begin = thrust::device_ptr(permute.data()); + auto permute_end = thrust::device_ptr(permute.data() + n); + auto view_begin = thrust::device_ptr(view.data()); + auto view_end = thrust::device_ptr(view.data() + n); + + thrust::sequence(permute_begin, permute_end, 0); + thrust::sort_by_key(view_begin, view_end, permute_begin); + + return permute; + } +}; +#endif + +#endif