Skip to content

Commit

Permalink
Add config option to set busy wait timeout in tridiagonal solver rank…
Browse files Browse the repository at this point in the history
…1 problem
  • Loading branch information
msimberg committed Jun 12, 2023
1 parent 802dee7 commit a5ff63e
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 4 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
//
// Distributed Linear Algebra with Future (DLAF)
//
// Copyright (c) 2018-2023, ETH Zurich
// All rights reserved.
//
// Please, refer to the LICENSE file in the root directory.
// SPDX-License-Identifier: BSD-3-Clause
//
#pragma once

#include <chrono>

#include "dlaf/tune.h"

namespace dlaf::eigensolver::internal {

/// Returns the busy-wait duration to use in the barriers of the tridiagonal solver rank-1 problem.
///
/// The value is read from the tune parameter @p tridiag_rank1_barrier_busy_wait_us (expressed in
/// microseconds) and converted to a floating point std::chrono duration.
///
/// @return the configured busy-wait duration.
inline std::chrono::duration<double> getTridiagRank1BarrierBusyWait() noexcept {
  // Use the rank-1 specific parameter, not the red2band one: the two barriers are tuned separately.
  return std::chrono::microseconds(getTuneParameters().tridiag_rank1_barrier_busy_wait_us);
}

}
10 changes: 6 additions & 4 deletions include/dlaf/eigensolver/tridiag_solver/merge.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <dlaf/common/range2d.h>
#include <dlaf/common/single_threaded_blas.h>
#include <dlaf/communication/kernels.h>
#include <dlaf/eigensolver/internal/get_tridiag_rank1_barrier_busy_wait.h>
#include <dlaf/eigensolver/internal/get_tridiag_rank1_nworkers.h>
#include <dlaf/eigensolver/tridiag_solver/coltype.h>
#include <dlaf/eigensolver/tridiag_solver/index_manipulation.h>
Expand Down Expand Up @@ -475,6 +476,7 @@ void solveRank1Problem(const SizeType i_begin, const SizeType i_end, KSender&& k
auto& evec_tiles, auto& ws_vecs) {
const matrix::Distribution distr(LocalElementSize(n, n), TileElementSize(nb, nb));

const auto barrier_busy_wait = getTridiagRank1BarrierBusyWait();
const std::size_t batch_size = util::ceilDiv(to_sizet(k), nthreads);
const std::size_t begin = thread_idx * batch_size;
const std::size_t end = std::min(thread_idx * batch_size + batch_size, to_sizet(k));
Expand All @@ -486,7 +488,7 @@ void solveRank1Problem(const SizeType i_begin, const SizeType i_end, KSender&& k
ws_vecs.emplace_back(to_sizet(k));
}

barrier_ptr->arrive_and_wait();
barrier_ptr->arrive_and_wait(barrier_busy_wait);

// STEP 1: LAED4 (multi-thread)
const T* d_ptr = d_tiles_futs[0].get().ptr();
Expand All @@ -511,7 +513,7 @@ void solveRank1Problem(const SizeType i_begin, const SizeType i_end, KSender&& k
return;
}

barrier_ptr->arrive_and_wait();
barrier_ptr->arrive_and_wait(barrier_busy_wait);

// STEP 2a Compute weights (multi-thread)
auto& q = evec_tiles;
Expand Down Expand Up @@ -550,7 +552,7 @@ void solveRank1Problem(const SizeType i_begin, const SizeType i_end, KSender&& k
compute_w({i, j});
}

barrier_ptr->arrive_and_wait();
barrier_ptr->arrive_and_wait(barrier_busy_wait);

// STEP 2B: reduce, then finalize computation with sign and square root (single-thread)
if (thread_idx == 0) {
Expand All @@ -563,7 +565,7 @@ void solveRank1Problem(const SizeType i_begin, const SizeType i_end, KSender&& k
}
}

barrier_ptr->arrive_and_wait();
barrier_ptr->arrive_and_wait(barrier_busy_wait);

// STEP 3: Compute eigenvectors of the modified rank-1 modification (normalize) (multi-thread)
{
Expand Down
5 changes: 5 additions & 0 deletions include/dlaf/tune.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@ namespace dlaf {
/// - tridiag_rank1_nworkers:
/// The maximum number of threads to use for computing rank1 problem solution in tridiagonal solver
/// algorithm. Set with --dlaf:tridiag-rank1-nworkers or env variable DLAF_TRIDIAG_RANK1_NWORKERS.
/// - tridiag_rank1_barrier_busy_wait_us:
/// The duration in microseconds to busy-wait in barriers when computing rank1 problem solution in
/// the tridiagonal solver algorithm. Set with --dlaf:tridiag-rank1-barrier-busy-wait-us or env
/// variable DLAF_TRIDIAG_RANK1_BARRIER_BUSY_WAIT_US.
/// - eigensolver_min_band:
/// The minimum value to start looking for a divisor of the block size.
/// Set with --dlaf:eigensolver-min-band or env variable DLAF_EIGENSOLVER_MIN_BAND.
Expand All @@ -50,6 +54,7 @@ struct TuneParameters {
std::size_t red2band_barrier_busy_wait_us = 1000;
std::size_t tridiag_rank1_nworkers =
std::max<std::size_t>(1, pika::resource::get_thread_pool("default").get_os_thread_count() / 2);
std::size_t tridiag_rank1_barrier_busy_wait_us = 0;

SizeType eigensolver_min_band = 100;
SizeType band_to_tridiag_1d_block_size_base = 8192;
Expand Down
6 changes: 6 additions & 0 deletions src/init.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,9 @@ void updateConfiguration(const pika::program_options::variables_map& vm, configu
updateConfigurationValue(vm, param.tridiag_rank1_nworkers, "TRIDIAG_RANK1_NWORKERS",
"tridiag-rank1-nworkers");

// Store the option in its own tune parameter; writing it to red2band_barrier_busy_wait_us would
// silently override the reduction-to-band setting and leave the rank-1 barrier timeout untouched.
updateConfigurationValue(vm, param.tridiag_rank1_barrier_busy_wait_us,
"TRIDIAG_RANK1_BARRIER_BUSY_WAIT_US", "tridiag-rank1-barrier-busy-wait-us");

updateConfigurationValue(vm, param.bt_band_to_tridiag_hh_apply_group_size,
"DLAF_BT_BAND_TO_TRIDIAG_HH_APPLY_GROUP_SIZE",
"bt-band-to-tridiag-hh-apply-group-size");
Expand Down Expand Up @@ -218,6 +221,9 @@ pika::program_options::options_description getOptionsDescription() {
desc.add_options()(
"dlaf:tridiag-rank1-nworkers", pika::program_options::value<std::size_t>(),
"The maximum number of threads to use for computing rank1 problem solution in tridiagonal solver algorithm.");
desc.add_options()(
"dlaf:tridiag-rank1-barrier-busy-wait-us", pika::program_options::value<std::size_t>(),
"The duration in microseconds to busy-wait in barriers when computing rank1 problem solution in the tridiagonal solver algorithm.");
desc.add_options()(
"dlaf:bt-band-to-tridiag-hh-apply-group-size", pika::program_options::value<SizeType>(),
"The application of the HH reflector is splitted in smaller applications of group size reflectors.");
Expand Down

0 comments on commit a5ff63e

Please sign in to comment.