Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add sort benchmark #278

Closed
wants to merge 23 commits into from
Closed
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
1eeacdd
Add sort benchmark
aprokop Apr 21, 2020
d62bebd
Allow sort benchmark to be built outside of ArborX
aprokop Apr 21, 2020
0005503
Added compute permutations benchmark
aprokop Apr 21, 2020
c61f29c
Restore data to the original state in the beginning of each loop
aprokop Apr 21, 2020
b88b670
Set cxx standard to 14 in standalone sort benchmark
aprokop Apr 24, 2020
c7becc2
Request C++14 for sort benchmark for all configurations
masterleinad Apr 24, 2020
122a08f
Add PSS tpl
aprokop Apr 24, 2020
2419905
Move header check in PSS
aprokop Apr 24, 2020
b5326b8
Introduced check_exec_space
aprokop Apr 24, 2020
c96965e
Add PSS to sort benchmark
aprokop Apr 24, 2020
44b58d7
Protect PSS behind KOKKOS_ENABLE_OPENMP
aprokop Apr 24, 2020
4c100f0
Fix permutation application
aprokop Apr 25, 2020
d7b994b
Remove unnecessary guards
aprokop Apr 25, 2020
e5213c5
Allow processing device data on host
aprokop Apr 25, 2020
4fd6b94
Get rid of most Kokkos things from StdSort and Thrust
aprokop Apr 26, 2020
23586d5
Use explicit execution space in serial sortAndComputePermutation
masterleinad Apr 30, 2020
46c0c8e
Split KokkosHelper into two to reduce aux var creation
aprokop Apr 30, 2020
cbb8fc1
Add CUDA/Serial option
aprokop Apr 30, 2020
083da0e
Move StdSort on top to allow being used by others
aprokop Apr 30, 2020
0b75c75
Keep permutation on host when migrating it there
aprokop Apr 30, 2020
b093d83
Add an option to use StdSort when migrating to host
aprokop Apr 30, 2020
f0ecae6
Add sort+compute+apply benchmark
aprokop Apr 30, 2020
81c0c0e
Split sort benchmark file into main and helpers
aprokop May 1, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Boost.Program_options must be available before any benchmark is configured;
# the sort benchmark links against Boost::program_options.
find_package(Boost REQUIRED COMPONENTS program_options)

# Benchmarks that are added unconditionally.
add_subdirectory(bvh_driver)
add_subdirectory(sort)
# The distributed tree driver is only added when ARBORX_ENABLE_MPI is set.
if (ARBORX_ENABLE_MPI)
add_subdirectory(distributed_tree_driver)
endif()
24 changes: 24 additions & 0 deletions benchmarks/sort/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# PROJECT_NAME is empty when this directory is configured on its own rather than
# through add_subdirectory() from an enclosing project. In that case this file
# declares its own project and locates Kokkos and Boost itself.
if(NOT PROJECT_NAME)
cmake_minimum_required(VERSION 3.12)
project(SortBenchmark CXX)

find_package(Kokkos 3.0 REQUIRED QUIET)
# NOTE(review): kokkos_check presumably aborts configuration when the listed option
# is missing from the Kokkos installation -- confirm against the Kokkos CMake docs.
if(Kokkos_ENABLE_CUDA)
kokkos_check(OPTIONS CUDA_LAMBDA)
endif()

find_package(Boost REQUIRED COMPONENTS program_options)
dalg24 marked this conversation as resolved.
Show resolved Hide resolved

set(BENCHMARK_NAME SortBenchmark)
else()
# Built as part of an enclosing project: use the ArborX_ prefix for the target name.
set(BENCHMARK_NAME ArborX_SortBenchmark)
endif()

# We require version 1.4.0 or higher but the format used by Google benchmark is
# wrong and thus, we cannot check the version during the configuration step.
find_package(benchmark REQUIRED)

# The PSS headers are listed as sources next to the single translation unit.
add_executable(${BENCHMARK_NAME}.exe sort_benchmark.cpp pss_common.hpp pss_parallel_stable_sort.hpp)
# C++14 is requested for every configuration, standalone or not.
target_compile_features(${BENCHMARK_NAME}.exe PUBLIC cxx_std_14)
target_link_libraries(${BENCHMARK_NAME}.exe Kokkos::kokkos benchmark::benchmark Boost::program_options)
# Registers one test that runs the benchmark executable on 10000 float values.
add_test(NAME ${BENCHMARK_NAME} COMMAND ./${BENCHMARK_NAME}.exe --num-values 10000 --value-type float --benchmark_color=true)
126 changes: 126 additions & 0 deletions benchmarks/sort/pss_common.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
/*
Copyright (C) 2014 Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef PSS_COMMON_HPP
#define PSS_COMMON_HPP

namespace pss {

namespace internal {

//! Run the destructor of every element in [zs,ze), last element first.
//! Only destructors are invoked; the storage the elements live in is not released.
template<class RandomAccessIterator>
void serial_destroy( RandomAccessIterator zs, RandomAccessIterator ze ) {
    using Element = typename std::iterator_traits<RandomAccessIterator>::value_type;
    for( RandomAccessIterator cursor = ze; cursor != zs; ) {
        --cursor;
        (*cursor).~Element();
    }
}

//! Merge the sorted sequences [xs,xe) and [ys,ye) into the output sequence
//! [zs,zs+(xe-xs)+(ye-ys)), using std::move.
//!
//! The merge is stable: an element of [ys,ye) is emitted before an element of [xs,xe)
//! only when comp(y,x) holds, so on ties the element from [xs,xe) comes first.
//! Output elements are move-assigned, so the destination range must already contain
//! constructed objects. The moved-from input elements are not destroyed here.
//!
//! The two input ranges may use different iterator types.
template <typename RandomAccessIterator1,
          typename RandomAccessIterator2,
          typename RandomAccessIterator3,
          typename Compare>
void serial_move_merge(RandomAccessIterator1 xs, RandomAccessIterator1 xe,
                       RandomAccessIterator2 ys, RandomAccessIterator2 ye,
                       RandomAccessIterator3 zs, Compare comp) {
    // Interleave while both inputs still have elements.
    while( xs!=xe && ys!=ye ) {
        if( comp(*ys,*xs) ) {
            *zs = std::move(*ys);
            ++ys;
        } else {
            *zs = std::move(*xs);
            ++xs;
        }
        ++zs;
    }
    // At most one input is non-empty now. Each tail is moved through iterators of its
    // own type, so no conversion between the two input iterator types is needed.
    zs = std::move( xs, xe, zs );
    std::move( ys, ye, zs );
}

//! Base case of the recursive sort. Stable-sorts [xs,xe) in place with std::stable_sort,
//! then prepares the scratch range [zs,zs+(xe-xs)) depending on inplace:
//!   inplace == 2 : the scratch range is not touched;
//!   inplace == 0 : every sorted key is move-constructed into the scratch range;
//!   otherwise    : every scratch element is default-initialized with placement new,
//!                  the sorted keys stay in [xs,xe).
template<typename RandomAccessIterator1,
         typename RandomAccessIterator2,
         typename Compare>
void stable_sort_base_case(RandomAccessIterator1 xs, RandomAccessIterator1 xe,
                           RandomAccessIterator2 zs,
                           int inplace, Compare comp) {
    std::stable_sort(xs, xe, comp);
    if (inplace == 2)
        return;

    using Scratch = typename std::iterator_traits<RandomAccessIterator2>::value_type;
    RandomAccessIterator2 scratch_end = zs + (xe-xs);
    if (inplace == 0) {
        // Result is wanted in the scratch range: move the sorted keys over.
        while (zs < scratch_end) {
            new(&*zs) Scratch(std::move(*xs));
            ++xs;
            ++zs;
        }
    } else {
        // Result stays in [xs,xe): only bring the scratch elements to life.
        while (zs < scratch_end) {
            new(&*zs) Scratch;
            ++zs;
        }
    }
}

//! Owner of a single untyped heap block that is released on destruction.
//! The block is requested through the non-throwing operator new, so a failed
//! allocation shows up as a null pointer from get(). Neither copyable nor movable.
class raw_buffer {
public:
    //! Request a block of the given size in bytes.
    raw_buffer(size_t bytes) : storage_(::operator new(bytes, std::nothrow)) {}

    raw_buffer(raw_buffer const&) = delete;
    raw_buffer(raw_buffer&&) = delete;
    raw_buffer& operator=(raw_buffer const&) = delete;
    raw_buffer& operator=(raw_buffer&&) = delete;

    //! Release the block.
    ~raw_buffer() {::operator delete(storage_);}

    //! Pointer to the block, or NULL if the block could not be obtained.
    void* get() const {return storage_;}

private:
    void* storage_;
};

} // namespace internal

//! Declaration of the comparator-taking overload. Its definition lives in
//! pss_parallel_stable_sort.hpp, after that header has included this file.
//! Declaring it here makes it visible to ordinary lookup from the wrapper below.
//! Without it the call could only be resolved by argument-dependent lookup, which
//! does not search namespace pss for arguments such as raw pointers, standard
//! iterators or std::less, so the wrapper would fail to instantiate.
template<typename RandomAccessIterator, typename Compare>
void parallel_stable_sort(RandomAccessIterator xs, RandomAccessIterator xe,
                          Compare comp);

//! Wrapper for sorting with default comparator.
//! Sorts [xs,xe) with std::less over the iterator's value type.
template<class RandomAccessIterator>
void parallel_stable_sort(RandomAccessIterator xs, RandomAccessIterator xe) {
    typedef typename std::iterator_traits<RandomAccessIterator>::value_type T;
    parallel_stable_sort(xs, xe, std::less<T>());
}

} // namespace pss

#endif
120 changes: 120 additions & 0 deletions benchmarks/sort/pss_parallel_stable_sort.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
/*
Copyright (C) 2014 Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef PSS_HPP
#define PSS_HPP

#include <algorithm>

#include "pss_common.hpp"

namespace pss {

namespace internal {

// Merge the sorted sequences [xs,xe) and [ys,ye) to output sequence
// [zs,zs+(xe-xs)+(ye-ys)), splitting the work into OpenMP tasks.
// Destroy input sequence iff destroy==true
//
// While the combined input length exceeds cutoff, both inputs are split in two: the
// left parts are merged by a child task and this invocation carries on with the right
// parts. Whatever remains (at most cutoff elements) is merged by serial_move_merge.
// NOTE(review): the loop only makes progress if every split hands a non-empty part to
// the child task; that looks guaranteed for cutoff >= 2, which parallel_stable_sort
// enforces by clamping -- confirm before calling this with a smaller cutoff.
template <typename RandomAccessIterator1,
typename RandomAccessIterator2,
typename RandomAccessIterator3,
typename Compare>
void parallel_move_merge(RandomAccessIterator1 xs, RandomAccessIterator1 xe,
RandomAccessIterator2 ys, RandomAccessIterator2 ye,
RandomAccessIterator3 zs,
bool destroy, Compare comp,
ssize_t cutoff) {
while( (xe-xs) + (ye-ys) > cutoff ) {
RandomAccessIterator1 xm;
RandomAccessIterator2 ym;
// Halve the longer input and binary-search the matching split point in the other one.
if( xe-xs < ye-ys ) {
ym = ys+(ye-ys)/2;
// upper_bound: x elements equivalent to *ym go to the left part, ahead of *ym.
xm = std::upper_bound(xs,xe,*ym,comp);
} else {
xm = xs+(xe-xs)/2;
// lower_bound: y elements equivalent to *xm stay in the right part together with *xm.
ym = std::lower_bound(ys,ye,*xm,comp);
}
// The child task merges the left parts [xs,xm) and [ys,ym), splitting further if needed.
#pragma omp task untied mergeable firstprivate(xs,xm,ys,ym,zs,destroy,comp)
parallel_move_merge( xs, xm, ys, ym, zs, destroy, comp, cutoff );
// Skip the output slots that belong to the child task, then continue with the right parts.
zs += (xm-xs) + (ym-ys);
xs = xm;
ys = ym;
}
// Remaining right parts are small enough to merge serially.
serial_move_merge( xs, xe, ys, ye, zs, comp );
if( destroy ) {
// Only the parts merged by this invocation; each child task destroys its own parts.
serial_destroy( xs, xe );
serial_destroy( ys, ye );
}
// Do not return before the child tasks spawned in the loop have completed.
#pragma omp taskwait
}

// Sorts [xs,xe), where zs[0:xe-xs) is temporary buffer supplied by caller.
// inplace selects where the sorted result ends up:
//   0 : in [zs,zs+(xe-xs))
//   1 : in [xs,xe)
//   2 : in [xs,xe); in addition the merge back into [xs,xe) destroys the buffer
//       elements (this is the value passed by parallel_stable_sort).
// Ranges of at most cutoff elements are handed to stable_sort_base_case.
template <typename RandomAccessIterator1,
typename RandomAccessIterator2,
typename Compare>
void parallel_stable_sort_aux(RandomAccessIterator1 xs, RandomAccessIterator1 xe,
RandomAccessIterator2 zs,
int inplace, Compare comp,
ssize_t cutoff) {
if((xe - xs) <= cutoff) {
stable_sort_base_case(xs, xe, zs, inplace, comp);
} else {
// Midpoint of the input and the matching positions in the buffer.
RandomAccessIterator1 xm = xs + (xe-xs)/2;
RandomAccessIterator2 zm = zs + (xm-xs);
RandomAccessIterator2 ze = zs + (xe-xs);
// Sort both halves with the opposite destination (!inplace maps 1 and 2 to 0, and 0
// to 1), so that the merge below moves them into the destination requested here.
// The first half runs as a child task, the second half on the current thread.
#pragma omp task
parallel_stable_sort_aux( xs, xm, zs, !inplace, comp, cutoff );
parallel_stable_sort_aux( xm, xe, zm, !inplace, comp, cutoff );
// Both halves must be complete before they are merged.
#pragma omp taskwait
if( inplace )
// Halves were sorted into the buffer: merge them back into [xs,xe).
parallel_move_merge( zs, zm, zm, ze, xs, inplace==2, comp, cutoff );
else
// Halves were sorted in place: merge them into the buffer.
parallel_move_merge( xs, xm, xm, xe, zs, false, comp, cutoff );
}
}

} // namespace internal

//! Stable-sort [xs,xe) according to comp, using OpenMP tasks.
//!
//! The range is recursively halved until a piece holds at most
//! max(2, n / omp_get_max_threads()) elements; pieces are sorted with std::stable_sort
//! and merged in parallel. A temporary buffer of n elements is needed for the merges.
//!
//! Falls back to a serial std::stable_sort when
//!   - the whole range already fits into one piece (always the case with one thread),
//!     which gives the same result without the buffer and the parallel region, or
//!   - the temporary buffer cannot be allocated (raw_buffer reports this with a null
//!     pointer, which must not be handed to the parallel sort).
template<typename RandomAccessIterator, typename Compare>
void parallel_stable_sort(RandomAccessIterator xs, RandomAccessIterator xe,
                          Compare comp) {
    auto n = xe - xs;
    auto t = omp_get_max_threads();
    auto cutoff = n / t;
    // The merge step needs a cutoff of at least 2 to keep making progress.
    if (cutoff < 2) cutoff = 2;
    if (n <= cutoff) {
        // Single base case: nothing to split, nothing to merge.
        std::stable_sort(xs, xe, comp);
        return;
    }
    typedef typename std::iterator_traits<RandomAccessIterator>::value_type T;
    internal::raw_buffer z(size_t(n) * sizeof(T));
    if (z.get() == nullptr) {
        // Not enough memory for the temporary buffer: sort serially instead.
        std::stable_sort(xs, xe, comp);
        return;
    }
    // One thread of the team seeds the task tree; the other threads execute the tasks.
#pragma omp parallel
#pragma omp master
    internal::parallel_stable_sort_aux( xs, xe, static_cast<T*>(z.get()), 2, comp, cutoff );
}

} // namespace pss

#endif
Loading