From 817d5fb4c65f2c7e796c9f95facdc80774c0759d Mon Sep 17 00:00:00 2001 From: Thorsten Hater <24411438+thorstenhater@users.noreply.github.com> Date: Mon, 17 Apr 2023 12:32:28 +0200 Subject: [PATCH 1/2] Add faster merge events, shuffle tourney tree to the background. --- arbor/CMakeLists.txt | 1 + arbor/merge_events.cpp | 197 ++++++++++---------------------- arbor/merge_events.hpp | 41 +------ arbor/simulation.cpp | 2 +- test/ubench/CMakeLists.txt | 1 + test/ubench/README.md | 6 + test/unit/test_merge_events.cpp | 5 +- 7 files changed, 79 insertions(+), 174 deletions(-) diff --git a/arbor/CMakeLists.txt b/arbor/CMakeLists.txt index 14827dd20d..5c46e96e22 100644 --- a/arbor/CMakeLists.txt +++ b/arbor/CMakeLists.txt @@ -61,6 +61,7 @@ set(arbor_sources tree.cpp util/dylib.cpp util/hostname.cpp + util/tourney_tree.cpp util/unwind.cpp version.cpp ) diff --git a/arbor/merge_events.cpp b/arbor/merge_events.cpp index 2cd9745d1b..061aa8c8b3 100644 --- a/arbor/merge_events.cpp +++ b/arbor/merge_events.cpp @@ -1,157 +1,86 @@ #include #include +#include +#include -#include #include -#include #include "io/trace.hpp" #include "merge_events.hpp" -#include "profile/profiler_macro.hpp" +#include "util/tourney_tree.hpp" namespace arb { -namespace impl { - -// A postsynaptic spike event that has delivery time set to -// terminal_time, used as a sentinel in `tourney_tree`. - -static constexpr spike_event terminal_pse{0, terminal_time, 0}; - - -// The tournament tree data structure is used to merge k sorted lists of events. -// See online for high-level information about tournament trees. -// -// This implementation maintains a heap-like data structure, with entries of type: -// std::pair -// where the unsigned ∈ [0, k-1] is the id of the list from which the event was -// drawn. The id is stored so that the operation of removing the most recent event -// knows which leaf node needs to be updated (i.e. the leaf node of the list from -// which the most recent event was drawn). -// -// unsigned is used for storing the index, because if drawing events from more -// event generators than can be counted using an unsigned a complete redesign -// will be needed. - -tourney_tree::tourney_tree(std::vector& input): - input_(input), - n_lanes_(input_.size()) -{ - // Must have at least 1 queue. - arb_assert(n_lanes_>=1u); - - leaves_ = math::next_pow2(n_lanes_); - - // Must be able to fit leaves in unsigned count. - arb_assert(leaves_>=n_lanes_); - nodes_ = 2*leaves_-1; - - // Allocate space for the tree nodes - heap_.resize(nodes_); - // Set the leaf nodes - for (auto i=0u; i& sources, pse_vector& out) { + // Consume all events. + for (;;) { + // Now find the minimum + auto mevt = spike_event{0, terminal_time, 0};; + auto midx = -1; + for (auto idx = 0ull; idx < sources.size(); ++idx) { + auto& source = sources[idx]; + if (!source.empty()) { + auto& evt = source.front(); + if (evt < mevt) { + mevt = evt; + midx = idx; + } + } } - out << "{" << tt.heap_[i].first << "," << tt.heap_[i].second << "}\n"; + if (midx == -1) break; + // Take event: bump chosen stream and stuff event into output. + sources[midx].left++; + out.emplace_back(mevt); } - return out; } -bool tourney_tree::empty() const { - return event(0).time == terminal_time; -} - -spike_event tourney_tree::head() const { - return event(0); -} - -// Remove the smallest (most recent) event from the tree, then update the -// tree so that head() returns the next event. -void tourney_tree::pop() { - unsigned lane = id(0); - unsigned i = leaf(lane); - - // draw the next event from the input lane - auto& in = input_[lane]; - - if (!in.empty()) { - ++in.left; +// priority-queue based merge. +void pqueue_merge_events(std::vector& sources, pse_vector& out) { + // Min heap tracking the minimum element from each span + using kv_type = std::pair; + std::priority_queue, std::greater<>> heap; + + // Add the first element from each sorted vector to the min heap + for (std::size_t ix = 0; ix < sources.size(); ++ix) { + auto& source = sources[ix]; + if (!source.empty()) { + heap.emplace(source.front(), ix); + source.left++; + } } - event(i) = in.empty()? terminal_pse: in.front(); - - // re-heapify the tree with a single walk from leaf to root - while ((i=parent(i))) { - merge_up(i); + // Merge by continually popping the minimum element from the min heap + while (!heap.empty()) { + auto [value, ix] = heap.top(); + heap.pop(); + out.emplace_back(value); + + // If the sorted vector from which the minimum element was taken still + // has elements, add the next smallest element to the heap + auto& source = sources[ix]; + if (!source.empty()) { + heap.emplace(source.front(), ix); + source.left++; + } } - merge_up(0); // handle the root -} - -void tourney_tree::setup(unsigned i) { - if (is_leaf(i)) return; - setup(left(i)); - setup(right(i)); - merge_up(i); -}; - -// Update the value at node i of the tree to be the smallest -// of its left and right children. -// The result is undefined for leaf nodes. -void tourney_tree::merge_up(unsigned i) { - const auto l = left(i); - const auto r = right(i); - heap_[i] = event(l)>1; -} -unsigned tourney_tree::left(unsigned i) const { - return (i<<1) + 1; } -unsigned tourney_tree::right(unsigned i) const { - return left(i)+1; -} -unsigned tourney_tree::leaf(unsigned i) const { - return i+leaves_-1; -} -bool tourney_tree::is_leaf(unsigned i) const { - return i>=leaves_-1; -} -const unsigned& tourney_tree::id(unsigned i) const { - return heap_[i].first; -} -spike_event& tourney_tree::event(unsigned i) { - return heap_[i].second; -} -const spike_event& tourney_tree::event(unsigned i) const { - return heap_[i].second; -} - -} // namespace impl -void tree_merge_events(std::vector& sources, pse_vector& out) { - impl::tourney_tree tree(sources); - while (!tree.empty()) { - out.push_back(tree.head()); - tree.pop(); +void merge_events(std::vector& sources, pse_vector &out) { + // Count events, bail if none; else allocate enough space to store them. + auto n_evts = std::accumulate(sources.begin(), sources.end(), + 0, + [] (auto acc, const auto& rng) { return acc + rng.size(); }); + out.reserve(out.size() + n_evts); + auto n_queues = sources.size(); + if (n_queues < 20) { // NOTE: MAGIC NUMBER, found by ubench/merge + linear_merge_events(sources, out); + } + else { + pqueue_merge_events(sources, out); } } } // namespace arb - diff --git a/arbor/merge_events.hpp b/arbor/merge_events.hpp index 9ce69241ea..c8c42634d1 100644 --- a/arbor/merge_events.hpp +++ b/arbor/merge_events.hpp @@ -16,42 +16,9 @@ namespace arb { using event_span = util::range; -void tree_merge_events(std::vector& sources, pse_vector& out); - -namespace impl { - // The tournament tree is used internally by the merge_events method, and - // it is not intended for use elsewhere. It is exposed here for unit testing - // of its functionality. - class ARB_ARBOR_API tourney_tree { - using key_val = std::pair; - - public: - tourney_tree(std::vector& input); - bool empty() const; - spike_event head() const; - void pop(); - friend std::ostream& operator<<(std::ostream&, const tourney_tree&); - - private: - void setup(unsigned i); - void merge_up(unsigned i); - void update_lane(unsigned lane); - unsigned parent(unsigned i) const; - unsigned left(unsigned i) const; - unsigned right(unsigned i) const; - unsigned leaf(unsigned i) const; - bool is_leaf(unsigned i) const; - const unsigned& id(unsigned i) const; - spike_event& event(unsigned i); - const spike_event& event(unsigned i) const; - unsigned next_power_2(unsigned x) const; - - std::vector heap_; - std::vector& input_; - unsigned leaves_; - unsigned nodes_; - unsigned n_lanes_; - }; -} +void linear_merge_events(std::vector& sources, pse_vector& out); +void pqueue_merge_events(std::vector& sources, pse_vector& out); + +void merge_events(std::vector& sources, pse_vector& out); } // namespace arb diff --git a/arbor/simulation.cpp b/arbor/simulation.cpp index 5e3d92b504..3440ef8e56 100644 --- a/arbor/simulation.cpp +++ b/arbor/simulation.cpp @@ -72,7 +72,7 @@ ARB_ARBOR_API void merge_cell_events( PL(); PE(communication:enqueue:tree); - tree_merge_events(spanbuf, new_events); + merge_events(spanbuf, new_events); PL(); old_events = old_split.second; diff --git a/test/ubench/CMakeLists.txt b/test/ubench/CMakeLists.txt index adea5647a5..8b40a245af 100644 --- a/test/ubench/CMakeLists.txt +++ b/test/ubench/CMakeLists.txt @@ -10,6 +10,7 @@ set(bench_sources fvm_discretize.cpp mech_vec.cpp task_system.cpp + merge.cpp ) if(ARB_WITH_GPU) diff --git a/test/ubench/README.md b/test/ubench/README.md index 01bec9ed0b..849b913d7c 100644 --- a/test/ubench/README.md +++ b/test/ubench/README.md @@ -32,6 +32,12 @@ become otherwise unwieldy. ## Benchmarks +### Merge Events + +During simulation, we need to collate incoming spike events, events from local generators, +and events that have yet to be delivered. There are multiple options for doing so: heap/tree +merge, linear k-merge, or iterative two-way merge. + ### `accumulate_functor_values` #### Motivation diff --git a/test/unit/test_merge_events.cpp b/test/unit/test_merge_events.cpp index 32cfe0e080..7b17c4094b 100644 --- a/test/unit/test_merge_events.cpp +++ b/test/unit/test_merge_events.cpp @@ -7,6 +7,7 @@ #include "merge_events.hpp" #include "util/rangeutil.hpp" +#include "util/tourney_tree.hpp" namespace arb { void merge_cell_events( @@ -209,7 +210,7 @@ TEST(merge_events, tourney_seq) std::vector spans = {g1.events(0, terminal_time), g2.events(0, terminal_time)}; - impl::tourney_tree tree(spans); + tourney_tree tree(spans); pse_vector lf; while (!tree.empty()) { @@ -265,7 +266,7 @@ TEST(merge_events, tourney_poisson) for (auto& gen: generators) { spans.emplace_back(gen.events(t0, tfinal)); } - impl::tourney_tree tree(spans); + tourney_tree tree(spans); pse_vector lf; while (!tree.empty()) { lf.push_back(tree.head()); From 941f49b36ff8ab3bb072b79c1b91a48be12e5ece Mon Sep 17 00:00:00 2001 From: Thorsten Hater <24411438+thorstenhater@users.noreply.github.com> Date: Mon, 17 Apr 2023 13:27:11 +0200 Subject: [PATCH 2/2] Add attachments. --- arbor/util/tourney_tree.cpp | 146 ++++++++++++++++++++++++++++++++++++ arbor/util/tourney_tree.hpp | 46 ++++++++++++ test/ubench/merge.cpp | 117 +++++++++++++++++++++++++++++ 3 files changed, 309 insertions(+) create mode 100644 arbor/util/tourney_tree.cpp create mode 100644 arbor/util/tourney_tree.hpp create mode 100644 test/ubench/merge.cpp diff --git a/arbor/util/tourney_tree.cpp b/arbor/util/tourney_tree.cpp new file mode 100644 index 0000000000..e26416816f --- /dev/null +++ b/arbor/util/tourney_tree.cpp @@ -0,0 +1,146 @@ +#include "tourney_tree.hpp" + +#include +#include + +#include +#include + +namespace arb { + +using event_span = util::range; + +static constexpr spike_event terminal_pse{0, terminal_time, 0}; + +// The tournament tree data structure is used to merge k sorted lists of events. +// See online for high-level information about tournament trees. +// +// This implementation maintains a heap-like data structure, with entries of type: +// std::pair +// where the unsigned ∈ [0, k-1] is the id of the list from which the event was +// drawn. The id is stored so that the operation of removing the most recent event +// knows which leaf node needs to be updated (i.e. the leaf node of the list from +// which the most recent event was drawn). +// +// unsigned is used for storing the index, because if drawing events from more +// event generators than can be counted using an unsigned a complete redesign +// will be needed. + +tourney_tree::tourney_tree(std::vector& input): + input_(input), + n_lanes_(input_.size()) +{ + // Must have at least 1 queue. + arb_assert(n_lanes_>=1u); + + leaves_ = math::next_pow2(n_lanes_); + + // Must be able to fit leaves in unsigned count. + arb_assert(leaves_>=n_lanes_); + nodes_ = 2*leaves_-1; + + // Allocate space for the tree nodes + heap_.resize(nodes_); + // Set the leaf nodes + for (auto i=0u; i>1; +} +unsigned tourney_tree::left(unsigned i) const { + return (i<<1) + 1; +} +unsigned tourney_tree::right(unsigned i) const { + return left(i)+1; +} +unsigned tourney_tree::leaf(unsigned i) const { + return i+leaves_-1; +} +bool tourney_tree::is_leaf(unsigned i) const { + return i>=leaves_-1; +} +const unsigned& tourney_tree::id(unsigned i) const { + return heap_[i].first; +} +spike_event& tourney_tree::event(unsigned i) { + return heap_[i].second; +} +const spike_event& tourney_tree::event(unsigned i) const { + return heap_[i].second; +} + +void tree_merge_events(std::vector& sources, pse_vector& out) { + tourney_tree tree(sources); + while (!tree.empty()) { + out.push_back(tree.head()); + tree.pop(); + } +} +} diff --git a/arbor/util/tourney_tree.hpp b/arbor/util/tourney_tree.hpp new file mode 100644 index 0000000000..85c3109137 --- /dev/null +++ b/arbor/util/tourney_tree.hpp @@ -0,0 +1,46 @@ +#pragma once + +#include +#include + +#include "util/range.hpp" + +namespace arb { + +using event_span = util::range; + +// The tournament tree is used internally by the merge_events method, and +// it is not intended for use elsewhere. It is exposed here for unit testing +// of its functionality. +class ARB_ARBOR_API tourney_tree { + using key_val = std::pair; + +public: + tourney_tree(std::vector& input); + bool empty() const; + spike_event head() const; + void pop(); + friend std::ostream& operator<<(std::ostream&, const tourney_tree&); + +private: + void setup(unsigned i); + void merge_up(unsigned i); + unsigned parent(unsigned i) const; + unsigned left(unsigned i) const; + unsigned right(unsigned i) const; + unsigned leaf(unsigned i) const; + bool is_leaf(unsigned i) const; + const unsigned& id(unsigned i) const; + spike_event& event(unsigned i); + const spike_event& event(unsigned i) const; + + std::vector heap_; + std::vector& input_; + unsigned leaves_; + unsigned nodes_; + unsigned n_lanes_; +}; + +void tree_merge_events(std::vector& sources, pse_vector& out); + +} diff --git a/test/ubench/merge.cpp b/test/ubench/merge.cpp new file mode 100644 index 0000000000..1097fa7341 --- /dev/null +++ b/test/ubench/merge.cpp @@ -0,0 +1,117 @@ +#include +#include +#include + +#include + +#include "util/tourney_tree.hpp" +#include "merge_events.hpp" + +#include +#include + +constexpr auto T = 1000.0; // ms + +using rndgen = std::mt19937_64; + +struct payload { + payload(std::size_t ncells, std::size_t ev_per_cell) { + auto dt = T/ev_per_cell; + for(auto cell = 0ull; cell < ncells; ++cell) { + auto gen = arb::poisson_schedule(1/dt, rndgen{cell}); + auto times = gen.events(0, T); + evts.emplace_back(); + auto& evt = evts.back(); + for (auto t: arb::util::make_range(times)) { + evt.emplace_back(arb::spike_event{42, t, 0.23}); + ++size; + } + span.emplace_back(arb::util::make_range(evt.data(), evt.data() + evt.size())); + } + } + + std::vector> evts; + std::vector span; + std::size_t size = 0; +}; + +static void BM_tree(benchmark::State& state) { + const std::size_t ncells = state.range(0); + const std::size_t ev_per_cell = state.range(1); + + const payload data{ncells, ev_per_cell}; + + while (state.KeepRunning()) { + arb::pse_vector out; + // Need to do this here, normally the wrapper does this + out.reserve(data.size); + auto tmp = data.span; + arb::tree_merge_events(tmp, out); + benchmark::ClobberMemory(); + } +} + +static void BM_linear(benchmark::State& state) { + const std::size_t ncells = state.range(0); + const std::size_t ev_per_cell = state.range(1); + + const payload data{ncells, ev_per_cell}; + + while (state.KeepRunning()) { + arb::pse_vector out; + // Need to do this here, normally the wrapper does this + out.reserve(data.size); + // Clone the input, merge clobbers it. + auto tmp = data.span; + arb::linear_merge_events(tmp, out); + benchmark::ClobberMemory(); + } +} + +static void BM_queue(benchmark::State& state) { + const std::size_t ncells = state.range(0); + const std::size_t ev_per_cell = state.range(1); + + const payload data{ncells, ev_per_cell}; + + while (state.KeepRunning()) { + arb::pse_vector out; + // Need to do this here, normally the wrapper does this + out.reserve(data.size); + // Clone the input, merge clobbers it. + auto tmp = data.span; + arb::pqueue_merge_events(tmp, out); + benchmark::ClobberMemory(); + } +} + +static void BM_default(benchmark::State& state) { + const std::size_t ncells = state.range(0); + const std::size_t ev_per_cell = state.range(1); + + const payload data{ncells, ev_per_cell}; + + while (state.KeepRunning()) { + arb::pse_vector out; + // NOTE: This wrapper _does_ do the allocation. + // Clone the input, merge clobbers it. + auto tmp = data.span; + arb::merge_events(tmp, out); + benchmark::ClobberMemory(); + } +} + +void run_custom_arguments(benchmark::internal::Benchmark* b) { + for (auto ncells: {5, 13, 23, 41, 53}) { + for (auto ev_per_cell: {8, 32, 256, 1024}) { + b->Args({ncells, ev_per_cell}); + } + } +} + +BENCHMARK(BM_tree)->Apply(run_custom_arguments); +BENCHMARK(BM_linear)->Apply(run_custom_arguments); +BENCHMARK(BM_queue)->Apply(run_custom_arguments); +BENCHMARK(BM_default)->Apply(run_custom_arguments); + +BENCHMARK_MAIN();