Use busy waiting barrier in reduction to band (#864)

eth-cscs · Jun 12, 2023 · 97db8c8 · 97db8c8
1 parent 54a7357
commit 97db8c8
Show file tree

Hide file tree

Showing 8 changed files with 85 additions and 15 deletions.
diff --git a/ci/.gitlab-ci.yml b/ci/.gitlab-ci.yml
@@ -17,7 +17,7 @@ stages:
   timeout: 6 hours
   variables:
     GIT_SUBMODULE_STRATEGY: recursive
-    SPACK_SHA: b027f64a7f175b0e7a18388d0aa2484599efd12d
+    SPACK_SHA: b313b28e64c15761be0d45a16c922c25b2786f76
     SPACK_DLAF_REPO: ./spack
   before_script:
     - podman login -u $CSCS_REGISTRY_USER -p $CSCS_REGISTRY_PASSWORD $CSCS_REGISTRY

diff --git a/include/dlaf/eigensolver/internal/get_red2band_barrier_busy_wait.h b/include/dlaf/eigensolver/internal/get_red2band_barrier_busy_wait.h
@@ -0,0 +1,22 @@
+//
+// Distributed Linear Algebra with Future (DLAF)
+//
+// Copyright (c) 2018-2023, ETH Zurich
+// All rights reserved.
+//
+// Please, refer to the LICENSE file in the root directory.
+// SPDX-License-Identifier: BSD-3-Clause
+//
+#pragma once
+
+#include <chrono>
+
+#include "dlaf/tune.h"
+
+namespace dlaf::eigensolver::internal {
+
+/// Returns how long barriers in the reduction to band algorithm should busy-wait.
+///
+/// The value is read from the red2band_barrier_busy_wait_us tune parameter, which is expressed in
+/// microseconds, and is returned as a floating-point duration in seconds.
+inline std::chrono::duration<double> getReductionToBandBarrierBusyWait() noexcept {
+  const auto busy_wait_us = getTuneParameters().red2band_barrier_busy_wait_us;
+  return std::chrono::microseconds(busy_wait_us);
+}
+
+}
diff --git a/include/dlaf/eigensolver/internal/get_tridiag_rank1_barrier_busy_wait.h b/include/dlaf/eigensolver/internal/get_tridiag_rank1_barrier_busy_wait.h
@@ -0,0 +1,22 @@
+//
+// Distributed Linear Algebra with Future (DLAF)
+//
+// Copyright (c) 2018-2023, ETH Zurich
+// All rights reserved.
+//
+// Please, refer to the LICENSE file in the root directory.
+// SPDX-License-Identifier: BSD-3-Clause
+//
+#pragma once
+
+#include <chrono>
+
+#include "dlaf/tune.h"
+
+namespace dlaf::eigensolver::internal {
+
+/// Returns how long barriers should busy-wait when computing the rank1 problem solution in the
+/// tridiagonal solver algorithm.
+///
+/// The value is read from the tridiag_rank1_barrier_busy_wait_us tune parameter, which is expressed
+/// in microseconds, and is returned as a floating-point duration in seconds.
+inline std::chrono::duration<double> getTridiagRank1BarrierBusyWait() noexcept {
+  return std::chrono::microseconds(getTuneParameters().tridiag_rank1_barrier_busy_wait_us);
+}
+
+}
diff --git a/include/dlaf/eigensolver/reduction_to_band/impl.h b/include/dlaf/eigensolver/reduction_to_band/impl.h
@@ -32,6 +32,7 @@
 #include <dlaf/communication/kernels/all_reduce.h>
 #include <dlaf/communication/kernels/reduce.h>
 #include <dlaf/communication/rdma.h>
+#include <dlaf/eigensolver/internal/get_red2band_barrier_busy_wait.h>
 #include <dlaf/eigensolver/internal/get_red2band_panel_nworkers.h>
 #include <dlaf/eigensolver/reduction_to_band/api.h>
 #include <dlaf/factorization/qr.h>
@@ -299,6 +300,7 @@ auto computePanelReflectors(MatrixLike& mat_a, const matrix::SubPanelView& panel
          ex::bulk(nthreads,
                   [nthreads, nrefls, cols = panel_view.cols()](
                       const std::size_t index, auto& barrier_ptr, auto& w, auto& taus, auto& tiles) {
+                    const auto barrier_busy_wait = getReductionToBandBarrierBusyWait();
                     const std::size_t batch_size = util::ceilDiv(tiles.size(), nthreads);
                     const std::size_t begin = index * batch_size;
                     const std::size_t end = std::min(index * batch_size + batch_size, tiles.size());
@@ -312,7 +314,7 @@ auto computePanelReflectors(MatrixLike& mat_a, const matrix::SubPanelView& panel
                       // STEP1: compute tau and reflector (single-thread)
                       if (index == 0)
                         taus.emplace_back(computeReflector(tiles, j));
-                      barrier_ptr->arrive_and_wait();
+                      barrier_ptr->arrive_and_wait(barrier_busy_wait);
 
                       // STEP2a: compute w (multi-threaded)
                       const SizeType pt_cols = cols - (j + 1);
@@ -322,16 +324,16 @@ auto computePanelReflectors(MatrixLike& mat_a, const matrix::SubPanelView& panel
 
                       w[index] = common::internal::vector<T>(pt_cols, 0);
                       computeWTrailingPanel(has_head, tiles, w[index], j, pt_cols, begin, end);
-                      barrier_ptr->arrive_and_wait();
+                      barrier_ptr->arrive_and_wait(barrier_busy_wait);
 
                       // STEP2b: reduce w results (single-threaded)
                       if (index == 0)
                         dlaf::eigensolver::internal::reduceColumnVectors(w);
-                      barrier_ptr->arrive_and_wait();
+                      barrier_ptr->arrive_and_wait(barrier_busy_wait);
 
                       // STEP3: update trailing panel (multi-threaded)
                       updateTrailingPanel(has_head, tiles, j, w[0], taus.back(), begin, end);
-                      barrier_ptr->arrive_and_wait();
+                      barrier_ptr->arrive_and_wait(barrier_busy_wait);
                     }
                   }) |
          ex::then([](auto barrier_ptr, auto w, auto taus, auto tiles) {
@@ -618,6 +620,7 @@ auto computePanelReflectors(TriggerSender&& trigger, comm::IndexT_MPI rank_v0,
                                              auto& taus, auto& tiles, auto&& pcomm) {
                     const bool rankHasHead = rank_v0 == pcomm.get().rank();
 
+                    const auto barrier_busy_wait = getReductionToBandBarrierBusyWait();
                     const std::size_t batch_size = util::ceilDiv(tiles.size(), nthreads);
                     const std::size_t begin = index * batch_size;
                     const std::size_t end = std::min(index * batch_size + batch_size, tiles.size());
@@ -633,7 +636,7 @@ auto computePanelReflectors(TriggerSender&& trigger, comm::IndexT_MPI rank_v0,
                         const bool has_head = rankHasHead;
                         taus.emplace_back(computeReflector(has_head, pcomm.get(), tiles, j));
                       }
-                      barrier_ptr->arrive_and_wait();
+                      barrier_ptr->arrive_and_wait(barrier_busy_wait);
 
                       // STEP2a: compute w (multi-threaded)
                       const SizeType pt_cols = cols - (j + 1);
@@ -644,19 +647,19 @@ auto computePanelReflectors(TriggerSender&& trigger, comm::IndexT_MPI rank_v0,
 
                       w[index] = common::internal::vector<T>(pt_cols, 0);
                       computeWTrailingPanel(has_head, tiles, w[index], j, pt_cols, begin, end);
-                      barrier_ptr->arrive_and_wait();
+                      barrier_ptr->arrive_and_wait(barrier_busy_wait);
 
                       // STEP2b: reduce w results (single-threaded)
                       if (index == 0) {
                         dlaf::eigensolver::internal::reduceColumnVectors(w);
                         comm::sync::allReduceInPlace(pcomm.get(), MPI_SUM,
                                                      common::make_data(w[0].data(), pt_cols));
                       }
-                      barrier_ptr->arrive_and_wait();
+                      barrier_ptr->arrive_and_wait(barrier_busy_wait);
 
                       // STEP3: update trailing panel (multi-threaded)
                       updateTrailingPanel(has_head, tiles, j, w[0], taus.back(), begin, end);
-                      barrier_ptr->arrive_and_wait();
+                      barrier_ptr->arrive_and_wait(barrier_busy_wait);
                     }
                   }) |
          ex::then([](auto barrier_ptr, auto w, auto taus, auto tiles, auto pcomm) {

diff --git a/include/dlaf/eigensolver/tridiag_solver/merge.h b/include/dlaf/eigensolver/tridiag_solver/merge.h
@@ -17,6 +17,7 @@
 #include <dlaf/common/range2d.h>
 #include <dlaf/common/single_threaded_blas.h>
 #include <dlaf/communication/kernels.h>
+#include <dlaf/eigensolver/internal/get_tridiag_rank1_barrier_busy_wait.h>
 #include <dlaf/eigensolver/internal/get_tridiag_rank1_nworkers.h>
 #include <dlaf/eigensolver/tridiag_solver/coltype.h>
 #include <dlaf/eigensolver/tridiag_solver/index_manipulation.h>
@@ -475,6 +476,7 @@ void solveRank1Problem(const SizeType i_begin, const SizeType i_end, KSender&& k
                                            auto& evec_tiles, auto& ws_vecs) {
         const matrix::Distribution distr(LocalElementSize(n, n), TileElementSize(nb, nb));
 
+        const auto barrier_busy_wait = getTridiagRank1BarrierBusyWait();
         const std::size_t batch_size = util::ceilDiv(to_sizet(k), nthreads);
         const std::size_t begin = thread_idx * batch_size;
         const std::size_t end = std::min(thread_idx * batch_size + batch_size, to_sizet(k));
@@ -486,7 +488,7 @@ void solveRank1Problem(const SizeType i_begin, const SizeType i_end, KSender&& k
             ws_vecs.emplace_back(to_sizet(k));
         }
 
-        barrier_ptr->arrive_and_wait();
+        barrier_ptr->arrive_and_wait(barrier_busy_wait);
 
         // STEP 1: LAED4 (multi-thread)
         const T* d_ptr = d_tiles_futs[0].get().ptr();
@@ -511,7 +513,7 @@ void solveRank1Problem(const SizeType i_begin, const SizeType i_end, KSender&& k
             return;
         }
 
-        barrier_ptr->arrive_and_wait();
+        barrier_ptr->arrive_and_wait(barrier_busy_wait);
 
         // STEP 2a Compute weights (multi-thread)
         auto& q = evec_tiles;
@@ -550,7 +552,7 @@ void solveRank1Problem(const SizeType i_begin, const SizeType i_end, KSender&& k
             compute_w({i, j});
         }
 
-        barrier_ptr->arrive_and_wait();
+        barrier_ptr->arrive_and_wait(barrier_busy_wait);
 
         // STEP 2B: reduce, then finalize computation with sign and square root (single-thread)
         if (thread_idx == 0) {
@@ -563,7 +565,7 @@ void solveRank1Problem(const SizeType i_begin, const SizeType i_end, KSender&& k
           }
         }
 
-        barrier_ptr->arrive_and_wait();
+        barrier_ptr->arrive_and_wait(barrier_busy_wait);
 
         // STEP 3: Compute eigenvectors of the modified rank-1 modification (normalize) (multi-thread)
         {

diff --git a/include/dlaf/tune.h b/include/dlaf/tune.h
@@ -24,9 +24,16 @@ namespace dlaf {
 /// - red2band_panel_nworkers:
 ///     The maximum number of threads to use for computing the panel in the reduction to band algorithm.
 ///     Set with --dlaf:red2band-panel-nworkers or env variable DLAF_RED2BAND_PANEL_NWORKERS.
+/// - red2band_barrier_busy_wait_us:
+///     The duration in microseconds to busy-wait in barriers in the reduction to band algorithm.
+///     Set with --dlaf:red2band-barrier-busy-wait-us or env variable DLAF_RED2BAND_BARRIER_BUSY_WAIT_US.
 /// - tridiag_rank1_nworkers:
 ///     The maximum number of threads to use for computing rank1 problem solution in tridiagonal solver
 ///     algorithm. Set with --dlaf:tridiag-rank1-nworkers or env variable DLAF_TRIDIAG_RANK1_NWORKERS.
+/// - tridiag_rank1_barrier_busy_wait_us:
+///     The duration in microseconds to busy-wait in barriers when computing rank1 problem solution in
+///     the tridiagonal solver algorithm. Set with --dlaf:tridiag-rank1-barrier-busy-wait-us or env
+///     variable DLAF_TRIDIAG_RANK1_BARRIER_BUSY_WAIT_US.
 /// - eigensolver_min_band:
 ///     The minimum value to start looking for a divisor of the block size.
 ///     Set with --dlaf:eigensolver-min-band or env variable DLAF_EIGENSOLVER_MIN_BAND.
@@ -44,8 +51,10 @@ namespace dlaf {
 struct TuneParameters {
   std::size_t red2band_panel_nworkers =
       std::max<std::size_t>(1, pika::resource::get_thread_pool("default").get_os_thread_count() / 2);
+  std::size_t red2band_barrier_busy_wait_us = 1000;
   std::size_t tridiag_rank1_nworkers =
       std::max<std::size_t>(1, pika::resource::get_thread_pool("default").get_os_thread_count() / 2);
+  std::size_t tridiag_rank1_barrier_busy_wait_us = 0;
 
   SizeType eigensolver_min_band = 100;
   SizeType band_to_tridiag_1d_block_size_base = 8192;

diff --git a/spack/packages/dla-future/package.py b/spack/packages/dla-future/package.py
@@ -47,7 +47,7 @@ class DlaFuture(CMakePackage, CudaPackage, ROCmPackage):
     depends_on("umpire+rocm~shared", when="+rocm")
     depends_on("[email protected]:")
 
-    depends_on("pika@0.15.1:")
+    depends_on("pika@0.16:")
     depends_on("[email protected]:")
     depends_on("pika +mpi")
     depends_on("pika +cuda", when="+cuda")

diff --git a/src/init.cpp b/src/init.cpp
@@ -162,6 +162,9 @@ void updateConfiguration(const pika::program_options::variables_map& vm, configu
   updateConfigurationValue(vm, param.red2band_panel_nworkers, "RED2BAND_PANEL_NWORKERS",
                            "red2band-panel-nworkers");
 
+  updateConfigurationValue(vm, param.red2band_barrier_busy_wait_us, "RED2BAND_BARRIER_BUSY_WAIT_US",
+                           "red2band-barrier-busy-wait-us");
+
   updateConfigurationValue(vm, param.eigensolver_min_band, "EIGENSOLVER_MIN_BAND",
                            "eigensolver-min-band");
 
@@ -171,6 +174,9 @@ void updateConfiguration(const pika::program_options::variables_map& vm, configu
   updateConfigurationValue(vm, param.tridiag_rank1_nworkers, "TRIDIAG_RANK1_NWORKERS",
                            "tridiag-rank1-nworkers");
 
+  // Busy-wait duration (in microseconds) for barriers in the tridiagonal solver rank1 problem.
+  updateConfigurationValue(vm, param.tridiag_rank1_barrier_busy_wait_us,
+                           "TRIDIAG_RANK1_BARRIER_BUSY_WAIT_US", "tridiag-rank1-barrier-busy-wait-us");
+
   updateConfigurationValue(vm, param.bt_band_to_tridiag_hh_apply_group_size,
                            "DLAF_BT_BAND_TO_TRIDIAG_HH_APPLY_GROUP_SIZE",
                            "bt-band-to-tridiag-hh-apply-group-size");
@@ -202,7 +208,10 @@ pika::program_options::options_description getOptionsDescription() {
   // Tune parameters command line options
   desc.add_options()(
       "dlaf:red2band-panel-nworkers", pika::program_options::value<std::size_t>(),
-      "Maximum number of threads to use for computing the panel in the reduction to band algorithm.");
+      "The maximum number of threads to use for computing the panel in the reduction to band algorithm.");
+  desc.add_options()(
+      "dlaf:red2band-barrier-busy-wait-us", pika::program_options::value<std::size_t>(),
+      "The duration in microseconds to busy-wait in barriers in the reduction to band algorithm.");
   desc.add_options()(
       "dlaf:eigensolver-min-band", pika::program_options::value<SizeType>(),
       "The minimum value to start looking for a divisor of the block size. When larger than the block size, the block size will be used instead.");
@@ -212,6 +221,9 @@ pika::program_options::options_description getOptionsDescription() {
   desc.add_options()(
       "dlaf:tridiag-rank1-nworkers", pika::program_options::value<std::size_t>(),
       "The maximum number of threads to use for computing rank1 problem solution in tridiagonal solver algorithm.");
+  desc.add_options()(
+      "dlaf:tridiag-rank1-barrier-busy-wait-us", pika::program_options::value<std::size_t>(),
+      "The duration in microseconds to busy-wait in barriers when computing rank1 problem solution in the tridiagonal solver algorithm.");
   desc.add_options()(
       "dlaf:bt-band-to-tridiag-hh-apply-group-size", pika::program_options::value<SizeType>(),
       "The application of the HH reflector is splitted in smaller applications of group size reflectors.");