Skip to content

Commit

Permalink
Use busy waiting barrier in reduction to band (#864)
Browse files Browse the repository at this point in the history
  • Loading branch information
msimberg authored Jun 12, 2023
1 parent 54a7357 commit 97db8c8
Show file tree
Hide file tree
Showing 8 changed files with 85 additions and 15 deletions.
2 changes: 1 addition & 1 deletion ci/.gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ stages:
timeout: 6 hours
variables:
GIT_SUBMODULE_STRATEGY: recursive
SPACK_SHA: b027f64a7f175b0e7a18388d0aa2484599efd12d
SPACK_SHA: b313b28e64c15761be0d45a16c922c25b2786f76
SPACK_DLAF_REPO: ./spack
before_script:
- podman login -u $CSCS_REGISTRY_USER -p $CSCS_REGISTRY_PASSWORD $CSCS_REGISTRY
Expand Down
22 changes: 22 additions & 0 deletions include/dlaf/eigensolver/internal/get_red2band_barrier_busy_wait.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
//
// Distributed Linear Algebra with Future (DLAF)
//
// Copyright (c) 2018-2023, ETH Zurich
// All rights reserved.
//
// Please, refer to the LICENSE file in the root directory.
// SPDX-License-Identifier: BSD-3-Clause
//
#pragma once

#include <chrono>

#include "dlaf/tune.h"

namespace dlaf::eigensolver::internal {

/// Returns the busy-wait duration to pass to barriers in the reduction to band algorithm.
///
/// The value is taken from the red2band_barrier_busy_wait_us tune parameter, which holds a
/// count of microseconds, and is converted to a floating-point duration in seconds.
inline std::chrono::duration<double> getReductionToBandBarrierBusyWait() noexcept {
  const std::chrono::microseconds busy_wait_us(getTuneParameters().red2band_barrier_busy_wait_us);
  return busy_wait_us;
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
//
// Distributed Linear Algebra with Future (DLAF)
//
// Copyright (c) 2018-2023, ETH Zurich
// All rights reserved.
//
// Please, refer to the LICENSE file in the root directory.
// SPDX-License-Identifier: BSD-3-Clause
//
#pragma once

#include <chrono>

#include "dlaf/tune.h"

namespace dlaf::eigensolver::internal {

/// Returns the busy-wait duration to pass to barriers when computing the rank1 problem
/// solution in the tridiagonal solver algorithm.
///
/// The value is taken from the tridiag_rank1_barrier_busy_wait_us tune parameter, which holds
/// a count of microseconds, and is converted to a floating-point duration in seconds.
inline std::chrono::duration<double> getTridiagRank1BarrierBusyWait() noexcept {
  // Read the tridiagonal-solver parameter, not red2band_barrier_busy_wait_us: the two settings
  // are configured independently and have different defaults.
  return std::chrono::microseconds(getTuneParameters().tridiag_rank1_barrier_busy_wait_us);
}

}
19 changes: 11 additions & 8 deletions include/dlaf/eigensolver/reduction_to_band/impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
#include <dlaf/communication/kernels/all_reduce.h>
#include <dlaf/communication/kernels/reduce.h>
#include <dlaf/communication/rdma.h>
#include <dlaf/eigensolver/internal/get_red2band_barrier_busy_wait.h>
#include <dlaf/eigensolver/internal/get_red2band_panel_nworkers.h>
#include <dlaf/eigensolver/reduction_to_band/api.h>
#include <dlaf/factorization/qr.h>
Expand Down Expand Up @@ -299,6 +300,7 @@ auto computePanelReflectors(MatrixLike& mat_a, const matrix::SubPanelView& panel
ex::bulk(nthreads,
[nthreads, nrefls, cols = panel_view.cols()](
const std::size_t index, auto& barrier_ptr, auto& w, auto& taus, auto& tiles) {
const auto barrier_busy_wait = getReductionToBandBarrierBusyWait();
const std::size_t batch_size = util::ceilDiv(tiles.size(), nthreads);
const std::size_t begin = index * batch_size;
const std::size_t end = std::min(index * batch_size + batch_size, tiles.size());
Expand All @@ -312,7 +314,7 @@ auto computePanelReflectors(MatrixLike& mat_a, const matrix::SubPanelView& panel
// STEP1: compute tau and reflector (single-thread)
if (index == 0)
taus.emplace_back(computeReflector(tiles, j));
barrier_ptr->arrive_and_wait();
barrier_ptr->arrive_and_wait(barrier_busy_wait);

// STEP2a: compute w (multi-threaded)
const SizeType pt_cols = cols - (j + 1);
Expand All @@ -322,16 +324,16 @@ auto computePanelReflectors(MatrixLike& mat_a, const matrix::SubPanelView& panel

w[index] = common::internal::vector<T>(pt_cols, 0);
computeWTrailingPanel(has_head, tiles, w[index], j, pt_cols, begin, end);
barrier_ptr->arrive_and_wait();
barrier_ptr->arrive_and_wait(barrier_busy_wait);

// STEP2b: reduce w results (single-threaded)
if (index == 0)
dlaf::eigensolver::internal::reduceColumnVectors(w);
barrier_ptr->arrive_and_wait();
barrier_ptr->arrive_and_wait(barrier_busy_wait);

// STEP3: update trailing panel (multi-threaded)
updateTrailingPanel(has_head, tiles, j, w[0], taus.back(), begin, end);
barrier_ptr->arrive_and_wait();
barrier_ptr->arrive_and_wait(barrier_busy_wait);
}
}) |
ex::then([](auto barrier_ptr, auto w, auto taus, auto tiles) {
Expand Down Expand Up @@ -618,6 +620,7 @@ auto computePanelReflectors(TriggerSender&& trigger, comm::IndexT_MPI rank_v0,
auto& taus, auto& tiles, auto&& pcomm) {
const bool rankHasHead = rank_v0 == pcomm.get().rank();

const auto barrier_busy_wait = getReductionToBandBarrierBusyWait();
const std::size_t batch_size = util::ceilDiv(tiles.size(), nthreads);
const std::size_t begin = index * batch_size;
const std::size_t end = std::min(index * batch_size + batch_size, tiles.size());
Expand All @@ -633,7 +636,7 @@ auto computePanelReflectors(TriggerSender&& trigger, comm::IndexT_MPI rank_v0,
const bool has_head = rankHasHead;
taus.emplace_back(computeReflector(has_head, pcomm.get(), tiles, j));
}
barrier_ptr->arrive_and_wait();
barrier_ptr->arrive_and_wait(barrier_busy_wait);

// STEP2a: compute w (multi-threaded)
const SizeType pt_cols = cols - (j + 1);
Expand All @@ -644,19 +647,19 @@ auto computePanelReflectors(TriggerSender&& trigger, comm::IndexT_MPI rank_v0,

w[index] = common::internal::vector<T>(pt_cols, 0);
computeWTrailingPanel(has_head, tiles, w[index], j, pt_cols, begin, end);
barrier_ptr->arrive_and_wait();
barrier_ptr->arrive_and_wait(barrier_busy_wait);

// STEP2b: reduce w results (single-threaded)
if (index == 0) {
dlaf::eigensolver::internal::reduceColumnVectors(w);
comm::sync::allReduceInPlace(pcomm.get(), MPI_SUM,
common::make_data(w[0].data(), pt_cols));
}
barrier_ptr->arrive_and_wait();
barrier_ptr->arrive_and_wait(barrier_busy_wait);

// STEP3: update trailing panel (multi-threaded)
updateTrailingPanel(has_head, tiles, j, w[0], taus.back(), begin, end);
barrier_ptr->arrive_and_wait();
barrier_ptr->arrive_and_wait(barrier_busy_wait);
}
}) |
ex::then([](auto barrier_ptr, auto w, auto taus, auto tiles, auto pcomm) {
Expand Down
10 changes: 6 additions & 4 deletions include/dlaf/eigensolver/tridiag_solver/merge.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <dlaf/common/range2d.h>
#include <dlaf/common/single_threaded_blas.h>
#include <dlaf/communication/kernels.h>
#include <dlaf/eigensolver/internal/get_tridiag_rank1_barrier_busy_wait.h>
#include <dlaf/eigensolver/internal/get_tridiag_rank1_nworkers.h>
#include <dlaf/eigensolver/tridiag_solver/coltype.h>
#include <dlaf/eigensolver/tridiag_solver/index_manipulation.h>
Expand Down Expand Up @@ -475,6 +476,7 @@ void solveRank1Problem(const SizeType i_begin, const SizeType i_end, KSender&& k
auto& evec_tiles, auto& ws_vecs) {
const matrix::Distribution distr(LocalElementSize(n, n), TileElementSize(nb, nb));

const auto barrier_busy_wait = getTridiagRank1BarrierBusyWait();
const std::size_t batch_size = util::ceilDiv(to_sizet(k), nthreads);
const std::size_t begin = thread_idx * batch_size;
const std::size_t end = std::min(thread_idx * batch_size + batch_size, to_sizet(k));
Expand All @@ -486,7 +488,7 @@ void solveRank1Problem(const SizeType i_begin, const SizeType i_end, KSender&& k
ws_vecs.emplace_back(to_sizet(k));
}

barrier_ptr->arrive_and_wait();
barrier_ptr->arrive_and_wait(barrier_busy_wait);

// STEP 1: LAED4 (multi-thread)
const T* d_ptr = d_tiles_futs[0].get().ptr();
Expand All @@ -511,7 +513,7 @@ void solveRank1Problem(const SizeType i_begin, const SizeType i_end, KSender&& k
return;
}

barrier_ptr->arrive_and_wait();
barrier_ptr->arrive_and_wait(barrier_busy_wait);

// STEP 2a Compute weights (multi-thread)
auto& q = evec_tiles;
Expand Down Expand Up @@ -550,7 +552,7 @@ void solveRank1Problem(const SizeType i_begin, const SizeType i_end, KSender&& k
compute_w({i, j});
}

barrier_ptr->arrive_and_wait();
barrier_ptr->arrive_and_wait(barrier_busy_wait);

// STEP 2B: reduce, then finalize computation with sign and square root (single-thread)
if (thread_idx == 0) {
Expand All @@ -563,7 +565,7 @@ void solveRank1Problem(const SizeType i_begin, const SizeType i_end, KSender&& k
}
}

barrier_ptr->arrive_and_wait();
barrier_ptr->arrive_and_wait(barrier_busy_wait);

// STEP 3: Compute eigenvectors of the modified rank-1 modification (normalize) (multi-thread)
{
Expand Down
9 changes: 9 additions & 0 deletions include/dlaf/tune.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,16 @@ namespace dlaf {
/// - red2band_panel_nworkers:
/// The maximum number of threads to use for computing the panel in the reduction to band algorithm.
/// Set with --dlaf:red2band-panel-nworkers or env variable DLAF_RED2BAND_PANEL_NWORKERS.
/// - red2band_barrier_busy_wait_us:
/// The duration in microseconds to busy-wait in barriers in the reduction to band algorithm.
/// Set with --dlaf:red2band-barrier-busy-wait-us or env variable DLAF_RED2BAND_BARRIER_BUSY_WAIT_US.
/// - tridiag_rank1_nworkers:
/// The maximum number of threads to use for computing rank1 problem solution in tridiagonal solver
/// algorithm. Set with --dlaf:tridiag-rank1-nworkers or env variable DLAF_TRIDIAG_RANK1_NWORKERS.
/// - tridiag_rank1_barrier_busy_wait_us:
/// The duration in microseconds to busy-wait in barriers when computing rank1 problem solution in
/// the tridiagonal solver algorithm. Set with --dlaf:tridiag-rank1-barrier-busy-wait-us or env
/// variable DLAF_TRIDIAG_RANK1_BARRIER_BUSY_WAIT_US.
/// - eigensolver_min_band:
/// The minimum value to start looking for a divisor of the block size.
/// Set with --dlaf:eigensolver-min-band or env variable DLAF_EIGENSOLVER_MIN_BAND.
Expand All @@ -44,8 +51,10 @@ namespace dlaf {
struct TuneParameters {
std::size_t red2band_panel_nworkers =
std::max<std::size_t>(1, pika::resource::get_thread_pool("default").get_os_thread_count() / 2);
std::size_t red2band_barrier_busy_wait_us = 1000;
std::size_t tridiag_rank1_nworkers =
std::max<std::size_t>(1, pika::resource::get_thread_pool("default").get_os_thread_count() / 2);
std::size_t tridiag_rank1_barrier_busy_wait_us = 0;

SizeType eigensolver_min_band = 100;
SizeType band_to_tridiag_1d_block_size_base = 8192;
Expand Down
2 changes: 1 addition & 1 deletion spack/packages/dla-future/package.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ class DlaFuture(CMakePackage, CudaPackage, ROCmPackage):
depends_on("umpire+rocm~shared", when="+rocm")
depends_on("[email protected]:")

depends_on("pika@0.15.1:")
depends_on("pika@0.16:")
depends_on("[email protected]:")
depends_on("pika +mpi")
depends_on("pika +cuda", when="+cuda")
Expand Down
14 changes: 13 additions & 1 deletion src/init.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,9 @@ void updateConfiguration(const pika::program_options::variables_map& vm, configu
updateConfigurationValue(vm, param.red2band_panel_nworkers, "RED2BAND_PANEL_NWORKERS",
"red2band-panel-nworkers");

updateConfigurationValue(vm, param.red2band_barrier_busy_wait_us, "RED2BAND_BARRIER_BUSY_WAIT_US",
"red2band-barrier-busy-wait-us");

updateConfigurationValue(vm, param.eigensolver_min_band, "EIGENSOLVER_MIN_BAND",
"eigensolver-min-band");

Expand All @@ -171,6 +174,9 @@ void updateConfiguration(const pika::program_options::variables_map& vm, configu
updateConfigurationValue(vm, param.tridiag_rank1_nworkers, "TRIDIAG_RANK1_NWORKERS",
"tridiag-rank1-nworkers");

// Store the tridiag rank1 barrier option in its own tune parameter; writing it into
// red2band_barrier_busy_wait_us would override the reduction to band setting instead.
updateConfigurationValue(vm, param.tridiag_rank1_barrier_busy_wait_us,
                         "TRIDIAG_RANK1_BARRIER_BUSY_WAIT_US", "tridiag-rank1-barrier-busy-wait-us");

updateConfigurationValue(vm, param.bt_band_to_tridiag_hh_apply_group_size,
"DLAF_BT_BAND_TO_TRIDIAG_HH_APPLY_GROUP_SIZE",
"bt-band-to-tridiag-hh-apply-group-size");
Expand Down Expand Up @@ -202,7 +208,10 @@ pika::program_options::options_description getOptionsDescription() {
// Tune parameters command line options
desc.add_options()(
"dlaf:red2band-panel-nworkers", pika::program_options::value<std::size_t>(),
"Maximum number of threads to use for computing the panel in the reduction to band algorithm.");
"The maximum number of threads to use for computing the panel in the reduction to band algorithm.");
desc.add_options()(
"dlaf:red2band-barrier-busy-wait-us", pika::program_options::value<std::size_t>(),
"The duration in microseconds to busy-wait in barriers in the reduction to band algorithm.");
desc.add_options()(
"dlaf:eigensolver-min-band", pika::program_options::value<SizeType>(),
"The minimum value to start looking for a divisor of the block size. When larger than the block size, the block size will be used instead.");
Expand All @@ -212,6 +221,9 @@ pika::program_options::options_description getOptionsDescription() {
desc.add_options()(
"dlaf:tridiag-rank1-nworkers", pika::program_options::value<std::size_t>(),
"The maximum number of threads to use for computing rank1 problem solution in tridiagonal solver algorithm.");
desc.add_options()(
"dlaf:tridiag-rank1-barrier-busy-wait-us", pika::program_options::value<std::size_t>(),
"The duration in microseconds to busy-wait in barriers when computing rank1 problem solution in the tridiagonal solver algorithm.");
desc.add_options()(
"dlaf:bt-band-to-tridiag-hh-apply-group-size", pika::program_options::value<SizeType>(),
"The application of the HH reflector is splitted in smaller applications of group size reflectors.");
Expand Down

0 comments on commit 97db8c8

Please sign in to comment.