Format files

eth-cscs · Jan 24, 2022 · 5092573 · 5092573
1 parent 730980c
commit 5092573
Show file tree

Hide file tree

Showing 29 changed files with 112 additions and 106 deletions.
diff --git a/include/dlaf/auxiliary/norm/mc.h b/include/dlaf/auxiliary/norm/mc.h
@@ -87,11 +87,11 @@ dlaf::BaseType<T> Norm<Backend::MC, Device::CPU, T>::max_L(comm::CommunicatorGri
 
   // TODO unwrapping can be skipped for optimization reasons
   NormT local_max_value = pika::dataflow(unwrapping([](const auto&& values) {
-                                          if (values.size() == 0)
-                                            return std::numeric_limits<NormT>::min();
-                                          return *std::max_element(values.begin(), values.end());
-                                        }),
-                                        tiles_max)
+                                           if (values.size() == 0)
+                                             return std::numeric_limits<NormT>::min();
+                                           return *std::max_element(values.begin(), values.end());
+                                         }),
+                                         tiles_max)
                               .get();
   NormT max_value;
   dlaf::comm::sync::reduce(comm_grid.rankFullCommunicator(rank), comm_grid.fullCommunicator(), MPI_MAX,

diff --git a/include/dlaf/common/pipeline.h b/include/dlaf/common/pipeline.h
@@ -53,7 +53,7 @@ class PromiseGuard {
   };
 
 private:
-  T object_;                              /// the object owned by the wrapper.
+  T object_;                               /// the object owned by the wrapper.
   pika::lcos::local::promise<T> promise_;  /// the shared state that will unlock the next user.
 };
 

diff --git a/include/dlaf/cublas/executor.h b/include/dlaf/cublas/executor.h
@@ -24,10 +24,10 @@
 #include <pika/execution.hpp>
 #include <pika/functional.hpp>
 #include <pika/future.hpp>
+#include <pika/modules/async_cuda.hpp>
 #include <pika/mutex.hpp>
 #include <pika/tuple.hpp>
 #include <pika/type_traits.hpp>
-#include <pika/modules/async_cuda.hpp>
 
 #include "dlaf/common/assert.h"
 #include "dlaf/cublas/error.h"
@@ -60,8 +60,8 @@ inline constexpr bool isAsyncCublasCallable_v = isAsyncCublasCallable<F, Ts...>:
 template <typename F, typename Futures>
 struct isDataflowCublasCallable
     : pika::is_invocable<pika::util::functional::invoke_fused, F,
-                        decltype(pika::tuple_cat(pika::tie(std::declval<cublasHandle_t&>()),
-                                                std::declval<Futures>()))> {};
+                         decltype(pika::tuple_cat(pika::tie(std::declval<cublasHandle_t&>()),
+                                                  std::declval<Futures>()))> {};
 template <typename F, typename Futures>
 inline constexpr bool isDataflowCublasCallable_v = isDataflowCublasCallable<F, Futures>::value;
 }
@@ -122,14 +122,14 @@ class Executor : public cuda::Executor {
     cudaStream_t stream = stream_pool_.getNextStream();
     cublasHandle_t handle = handle_pool_.getNextHandle(stream);
     auto r = pika::invoke_fused(std::forward<F>(f),
-                               pika::tuple_cat(pika::tie(handle), std::forward<Futures>(futures)));
+                                pika::tuple_cat(pika::tie(handle), std::forward<Futures>(futures)));
     pika::future<void> fut = pika::cuda::experimental::detail::get_future_with_event(stream);
 
     // The handle and stream pools are captured by value to ensure that the
     // streams live at least until the event has completed.
     fut.then(pika::launch::sync, [r = std::move(r), frame_p = std::move(frame_p),
-                                 stream_pool = stream_pool_, handle_pool = handle_pool_](
-                                    pika::future<void>&&) mutable { frame_p->set_data(std::move(r)); });
+                                  stream_pool = stream_pool_, handle_pool = handle_pool_](
+                                     pika::future<void>&&) mutable { frame_p->set_data(std::move(r)); });
   }
 };
 }

diff --git a/include/dlaf/cuda/executor.h b/include/dlaf/cuda/executor.h
@@ -66,7 +66,7 @@ class Executor {
     // The stream pool is captured by value to ensure that the streams live at
     // least until the event has completed.
     return fut.then(pika::launch::sync, [r = std::move(r), stream_pool = stream_pool_](
-                                           pika::future<void>&&) mutable { return std::move(r); });
+                                            pika::future<void>&&) mutable { return std::move(r); });
   }
 
   template <class Frame, class F, class Futures>
@@ -77,7 +77,7 @@ class Executor {
 
     cudaStream_t stream = stream_pool_.getNextStream();
     auto r = pika::invoke_fused(std::forward<F>(f),
-                               pika::tuple_cat(std::forward<Futures>(futures), pika::tie(stream)));
+                                pika::tuple_cat(std::forward<Futures>(futures), pika::tie(stream)));
     pika::future<void> fut = pika::cuda::experimental::detail::get_future_with_event(stream);
 
     // The stream pool is captured by value to ensure that the streams live at

diff --git a/include/dlaf/cusolver/executor.h b/include/dlaf/cusolver/executor.h
@@ -24,10 +24,10 @@
 #include <pika/execution.hpp>
 #include <pika/functional.hpp>
 #include <pika/future.hpp>
+#include <pika/modules/async_cuda.hpp>
 #include <pika/mutex.hpp>
 #include <pika/tuple.hpp>
 #include <pika/type_traits.hpp>
-#include <pika/modules/async_cuda.hpp>
 
 #include "dlaf/common/assert.h"
 #include "dlaf/cublas/executor.h"
@@ -60,8 +60,8 @@ inline constexpr bool isAsyncCusolverCallable_v = isAsyncCusolverCallable<F, Ts.
 template <typename F, typename Futures>
 struct isDataflowCusolverCallable
     : pika::is_invocable<pika::util::functional::invoke_fused, F,
-                        decltype(pika::tuple_cat(pika::tie(std::declval<cusolverDnHandle_t&>()),
-                                                std::declval<Futures>()))> {};
+                         decltype(pika::tuple_cat(pika::tie(std::declval<cusolverDnHandle_t&>()),
+                                                  std::declval<Futures>()))> {};
 
 template <typename F, typename Futures>
 inline constexpr bool isDataflowCusolverCallable_v = isDataflowCusolverCallable<F, Futures>::value;
@@ -123,14 +123,14 @@ class Executor : public cublas::Executor {
     cudaStream_t stream = stream_pool_.getNextStream();
     cusolverDnHandle_t handle = handle_pool_.getNextHandle(stream);
     auto r = pika::invoke_fused(std::forward<F>(f),
-                               pika::tuple_cat(pika::tie(handle), std::forward<Futures>(futures)));
+                                pika::tuple_cat(pika::tie(handle), std::forward<Futures>(futures)));
     pika::future<void> fut = pika::cuda::experimental::detail::get_future_with_event(stream);
 
     // The handle and stream pools are captured by value to ensure that the
     // streams live at least until the event has completed.
     fut.then(pika::launch::sync, [r = std::move(r), frame_p = std::move(frame_p),
-                                 stream_pool = stream_pool_, handle_pool = handle_pool_](
-                                    pika::future<void>&&) mutable { frame_p->set_data(std::move(r)); });
+                                  stream_pool = stream_pool_, handle_pool = handle_pool_](
+                                     pika::future<void>&&) mutable { frame_p->set_data(std::move(r)); });
   }
 
   template <typename F, typename... Ts>

diff --git a/include/dlaf/eigensolver/band_to_tridiag/mc.h b/include/dlaf/eigensolver/band_to_tridiag/mc.h
@@ -299,7 +299,7 @@ struct BandToTridiag<Backend::MC, Device::CPU, T> {
           deps.push_back(sf);
         }
         sf = pika::dataflow(executor_hp, unwrapping(copy_offdiag), k * nb,
-                           mat_a.read(GlobalTileIndex{k + 1, k}), sf);
+                            mat_a.read(GlobalTileIndex{k + 1, k}), sf);
         deps.push_back(sf);
       }
       else {
@@ -342,7 +342,7 @@ struct BandToTridiag<Backend::MC, Device::CPU, T> {
         const auto tile_index = sweep / nb;
         const auto start = tile_index * nb;
         pika::dataflow(executor_hp, unwrapping(copy_tridiag_task), start, std::min(nb, size - start),
-                      std::min(nb, size - 1 - start), mat_trid(GlobalTileIndex{0, tile_index}), dep);
+                       std::min(nb, size - 1 - start), mat_trid(GlobalTileIndex{0, tile_index}), dep);
       }
     };
 
@@ -362,7 +362,7 @@ struct BandToTridiag<Backend::MC, Device::CPU, T> {
         const GlobalElementIndex index_v((sweep / b + step) * b, sweep);
 
         pika::dataflow(pika::launch::sync, unwrapping(store_tau_v), w_pipeline(),
-                      mat_v(dist_v.globalTileIndex(index_v)), dist_v.tileElementIndex(index_v));
+                       mat_v(dist_v.globalTileIndex(index_v)), dist_v.tileElementIndex(index_v));
         deps[step] = pika::dataflow(executor_hp, unwrapping(cont_sweep), w_pipeline(), deps[dep_index]);
       }
 

diff --git a/include/dlaf/eigensolver/reduction_to_band.h b/include/dlaf/eigensolver/reduction_to_band.h
@@ -76,8 +76,8 @@ std::vector<pika::shared_future<common::internal::vector<T>>> reductionToBand(Ma
 /// @pre mat_a has a square block size
 /// @pre mat_a is distributed according to @p grid
 template <Backend backend, Device device, class T>
-std::vector<pika::shared_future<common::internal::vector<T>>> reductionToBand(comm::CommunicatorGrid grid,
-                                                                             Matrix<T, device>& mat_a) {
+std::vector<pika::shared_future<common::internal::vector<T>>> reductionToBand(
+    comm::CommunicatorGrid grid, Matrix<T, device>& mat_a) {
   DLAF_ASSERT(matrix::square_size(mat_a), mat_a);
   DLAF_ASSERT(matrix::square_blocksize(mat_a), mat_a);
   DLAF_ASSERT(matrix::equal_process_grid(mat_a, grid), mat_a, grid);

diff --git a/include/dlaf/eigensolver/reduction_to_band/mc.h b/include/dlaf/eigensolver/reduction_to_band/mc.h
@@ -46,7 +46,8 @@ namespace internal {
 
 template <class T>
 struct ReductionToBand<Backend::MC, Device::CPU, T> {
-  static std::vector<pika::shared_future<common::internal::vector<T>>> call(Matrix<T, Device::CPU>& mat_a);
+  static std::vector<pika::shared_future<common::internal::vector<T>>> call(
+      Matrix<T, Device::CPU>& mat_a);
   static std::vector<pika::shared_future<common::internal::vector<T>>> call(
       comm::CommunicatorGrid grid, Matrix<T, Device::CPU>& mat_a);
 };
@@ -217,15 +218,15 @@ template <class Executor, class T>
 void hemmDiag(const Executor& ex, pika::shared_future<TileT<const T>> tile_a,
               pika::shared_future<TileT<const T>> tile_w, pika::future<TileT<T>> tile_x) {
   pika::dataflow(ex, matrix::unwrapExtendTiles(tile::internal::hemm_o), blas::Side::Left,
-                blas::Uplo::Lower, T(1), std::move(tile_a), std::move(tile_w), T(1), std::move(tile_x));
+                 blas::Uplo::Lower, T(1), std::move(tile_a), std::move(tile_w), T(1), std::move(tile_x));
 }
 
 // X += op(A) * W
 template <class Executor, class T>
 void hemmOffDiag(const Executor& ex, blas::Op op, pika::shared_future<TileT<const T>> tile_a,
                  pika::shared_future<TileT<const T>> tile_w, pika::future<TileT<T>> tile_x) {
   pika::dataflow(ex, matrix::unwrapExtendTiles(tile::internal::gemm_o), op, blas::Op::NoTrans, T(1),
-                std::move(tile_a), std::move(tile_w), T(1), std::move(tile_x));
+                 std::move(tile_a), std::move(tile_w), T(1), std::move(tile_x));
 }
 
 template <class Executor, class T>
@@ -357,7 +358,7 @@ void gemmUpdateX(PanelT<Coord::Col, T>& x, ConstMatrixT<T>& w2, MatrixLikeT& v)
   // GEMM X = X - 0.5 . V . W2
   for (const auto& index_i : v.iteratorLocal())
     pika::dataflow(ex, unwrapExtendTiles(gemm_o), blas::Op::NoTrans, blas::Op::NoTrans, T(-0.5),
-                  v.read(index_i), w2.read(LocalTileIndex(0, 0)), T(1), x(index_i));
+                   v.read(index_i), w2.read(LocalTileIndex(0, 0)), T(1), x(index_i));
 }
 
 template <class T>
@@ -426,7 +427,7 @@ void gemmComputeW2(MatrixT<T>& w2, ConstPanelT<Coord::Col, T>& w, ConstPanelT<Co
   // GEMM W2 = W* . X
   for (const auto& index_tile : w.iteratorLocal())
     pika::dataflow(ex, unwrapExtendTiles(gemm_o), blas::Op::ConjTrans, blas::Op::NoTrans, T(1),
-                  w.read(index_tile), x.read(index_tile), T(1), w2(LocalTileIndex(0, 0)));
+                   w.read(index_tile), x.read(index_tile), T(1), w2(LocalTileIndex(0, 0)));
 }
 
 template <class T>
@@ -532,7 +533,7 @@ pika::shared_future<common::internal::vector<T>> computePanelReflectors(
   auto panel_tiles = pika::when_all(matrix::select(mat_a, ai_panel_range));
 
   return pika::dataflow(getHpExecutor<Backend::MC>(), std::move(panel_task), std::move(panel_tiles),
-                       mpi_col_chain_panel, std::move(trigger));
+                        mpi_col_chain_panel, std::move(trigger));
 }
 
 template <class T>

diff --git a/include/dlaf/init.h b/include/dlaf/init.h
@@ -92,7 +92,7 @@ struct [[nodiscard]] ScopedInitializer {
   ScopedInitializer(int argc, const char* const argv[], configuration const& user_cfg = {});
   ~ScopedInitializer();
 
-  ScopedInitializer(ScopedInitializer&&) = delete;
+  ScopedInitializer(ScopedInitializer &&) = delete;
   ScopedInitializer(ScopedInitializer const&) = delete;
   ScopedInitializer& operator=(ScopedInitializer&&) = delete;
   ScopedInitializer& operator=(ScopedInitializer const&) = delete;

diff --git a/include/dlaf/matrix/copy.h b/include/dlaf/matrix/copy.h
@@ -42,8 +42,8 @@ void copy(Matrix<const T, Source>& source, Matrix<T, Destination>& dest) {
   for (SizeType j = 0; j < local_tile_cols; ++j) {
     for (SizeType i = 0; i < local_tile_rows; ++i) {
       pika::dataflow(dlaf::getCopyExecutor<Source, Destination>(),
-                    unwrapExtendTiles(dlaf::matrix::internal::copy_o), source.read(LocalTileIndex(i, j)),
-                    dest(LocalTileIndex(i, j)));
+                     unwrapExtendTiles(dlaf::matrix::internal::copy_o),
+                     source.read(LocalTileIndex(i, j)), dest(LocalTileIndex(i, j)));
     }
   }
 }

diff --git a/include/dlaf/matrix/copy_tile.h b/include/dlaf/matrix/copy_tile.h
@@ -232,8 +232,8 @@ void copyIfNeeded(FutureS<Tile<U, Source>> tile_from, FutureD<Tile<T, Destinatio
                   pika::future<void> wait_for_me = pika::make_ready_future<void>()) {
   if constexpr (Destination != Source)
     pika::dataflow(dlaf::getCopyExecutor<Source, Destination>(),
-                  matrix::unwrapExtendTiles(internal::copy_o), wait_for_me, std::move(tile_from),
-                  std::move(tile_to));
+                   matrix::unwrapExtendTiles(internal::copy_o), wait_for_me, std::move(tile_from),
+                   std::move(tile_to));
 }
 }
 }
diff --git a/include/dlaf/matrix/internal/tile_future_manager.h b/include/dlaf/matrix/internal/tile_future_manager.h
@@ -25,7 +25,8 @@ pika::future<ReturnTileType> setPromiseTileFuture(
   using NonConstTileType = typename ReturnTileType::TileType;
 
   DLAF_ASSERT_HEAVY(old_future.valid(), "");
-  return old_future.then(pika::launch::sync, [p = std::move(p)](pika::future<TileDataType>&& fut) mutable {
+  return old_future.then(pika::launch::sync, [p = std::move(p)](
+                                                 pika::future<TileDataType>&& fut) mutable {
     std::exception_ptr current_exception_ptr;
 
     try {

diff --git a/include/dlaf/matrix/tile.h b/include/dlaf/matrix/tile.h
@@ -129,7 +129,7 @@ pika::shared_future<Tile<T, D>> splitTileInsertFutureInChain(pika::future<Tile<T
 
 template <class T, Device D>
 pika::future<Tile<T, D>> createSubTile(const pika::shared_future<Tile<T, D>>& tile,
-                                      const SubTileSpec& spec);
+                                       const SubTileSpec& spec);
 }
 
 /// The Tile object aims to provide an effective way to access the memory as a two dimensional
@@ -361,7 +361,7 @@ auto create_data(const Tile<T, device>& tile) {
 namespace internal {
 template <class T, Device D>
 pika::future<Tile<T, D>> createSubTile(const pika::shared_future<Tile<T, D>>& tile,
-                                      const SubTileSpec& spec) {
+                                       const SubTileSpec& spec) {
   return pika::dataflow(
       pika::launch::sync, [](auto tile, auto spec) { return Tile<T, D>(tile, spec); }, tile, spec);
 }
@@ -394,7 +394,8 @@ pika::shared_future<Tile<T, D>> splitTileInsertFutureInChain(pika::future<Tile<T
 
     return pika::make_tuple(std::move(tile), std::move(dep_tracker));
   };
-  auto tmp = pika::split_future(tile.then(pika::launch::sync, pika::unwrapping(std::move(swap_promise))));
+  auto tmp =
+      pika::split_future(tile.then(pika::launch::sync, pika::unwrapping(std::move(swap_promise))));
   // old_tile = F1(PN) and will be used to create the subtiles
   pika::shared_future<TileType> old_tile = std::move(pika::get<0>(tmp));
   // 3. Set P2 or SF(P2) into FN to restore the chain:  F1(PN)  FN(*) ...
@@ -405,7 +406,7 @@ pika::shared_future<Tile<T, D>> splitTileInsertFutureInChain(pika::future<Tile<T
   };
   // tile = FN(*) (out argument) can be used to access the full tile after the subtiles tasks completed.
   tile = pika::dataflow(pika::launch::sync, pika::unwrapping(set_promise_or_shfuture), tmp_tile,
-                       std::move(pika::get<1>(tmp)));
+                        std::move(pika::get<1>(tmp)));
 
   return old_tile;
 }
@@ -418,7 +419,7 @@ pika::shared_future<Tile<T, D>> splitTileInsertFutureInChain(pika::future<Tile<T
 /// and the returned subtile go out of scope.
 template <class T, Device D>
 pika::shared_future<Tile<const T, D>> splitTile(const pika::shared_future<Tile<const T, D>>& tile,
-                                               const SubTileSpec& spec) {
+                                                const SubTileSpec& spec) {
   return internal::createSubTile(tile, spec);
 }
 
@@ -463,7 +464,7 @@ pika::future<Tile<T, D>> splitTile(pika::future<Tile<T, D>>& tile, const SubTile
 ///      (i.e. two different subtile cannot access the same element).
 template <class T, Device D>
 std::vector<pika::future<Tile<T, D>>> splitTileDisjoint(pika::future<Tile<T, D>>& tile,
-                                                       const std::vector<SubTileSpec>& specs) {
+                                                        const std::vector<SubTileSpec>& specs) {
   if (specs.size() == 0)
     return {};
 

diff --git a/include/dlaf/multiplication/triangular/impl.h b/include/dlaf/multiplication/triangular/impl.h
@@ -38,8 +38,8 @@ namespace internal {
 
 namespace triangular_lln {
 template <Backend backend, class T, typename InSender, typename OutSender>
-void trmmBPanelTile(pika::threads::thread_priority priority, blas::Diag diag, T alpha, InSender&& in_tile,
-                    OutSender&& out_tile) {
+void trmmBPanelTile(pika::threads::thread_priority priority, blas::Diag diag, T alpha,
+                    InSender&& in_tile, OutSender&& out_tile) {
   dlaf::internal::whenAllLift(blas::Side::Left, blas::Uplo::Lower, blas::Op::NoTrans, diag, alpha,
                               std::forward<InSender>(in_tile), std::forward<OutSender>(out_tile)) |
       tile::trmm(dlaf::internal::Policy<backend>(priority)) |
@@ -78,8 +78,8 @@ void gemmTrailingMatrixTile(pika::threads::thread_priority priority, blas::Op op
 
 namespace triangular_lun {
 template <Backend backend, class T, typename InSender, typename OutSender>
-void trmmBPanelTile(pika::threads::thread_priority priority, blas::Diag diag, T alpha, InSender&& in_tile,
-                    OutSender&& out_tile) {
+void trmmBPanelTile(pika::threads::thread_priority priority, blas::Diag diag, T alpha,
+                    InSender&& in_tile, OutSender&& out_tile) {
   dlaf::internal::whenAllLift(blas::Side::Left, blas::Uplo::Upper, blas::Op::NoTrans, diag, alpha,
                               std::forward<InSender>(in_tile), std::forward<OutSender>(out_tile)) |
       tile::trmm(dlaf::internal::Policy<backend>(priority)) |
@@ -118,8 +118,8 @@ void gemmTrailingMatrixTile(pika::threads::thread_priority priority, blas::Op op
 
 namespace triangular_rln {
 template <Backend backend, class T, typename InSender, typename OutSender>
-void trmmBPanelTile(pika::threads::thread_priority priority, blas::Diag diag, T alpha, InSender&& in_tile,
-                    OutSender&& out_tile) {
+void trmmBPanelTile(pika::threads::thread_priority priority, blas::Diag diag, T alpha,
+                    InSender&& in_tile, OutSender&& out_tile) {
   dlaf::internal::whenAllLift(blas::Side::Right, blas::Uplo::Lower, blas::Op::NoTrans, diag, alpha,
                               std::forward<InSender>(in_tile), std::forward<OutSender>(out_tile)) |
       tile::trmm(dlaf::internal::Policy<backend>(priority)) |
@@ -158,8 +158,8 @@ void gemmTrailingMatrixTile(pika::threads::thread_priority priority, blas::Op op
 
 namespace triangular_run {
 template <Backend backend, class T, typename InSender, typename OutSender>
-void trmmBPanelTile(pika::threads::thread_priority priority, blas::Diag diag, T alpha, InSender&& in_tile,
-                    OutSender&& out_tile) {
+void trmmBPanelTile(pika::threads::thread_priority priority, blas::Diag diag, T alpha,
+                    InSender&& in_tile, OutSender&& out_tile) {
   dlaf::internal::whenAllLift(blas::Side::Right, blas::Uplo::Upper, blas::Op::NoTrans, diag, alpha,
                               std::forward<InSender>(in_tile), std::forward<OutSender>(out_tile)) |
       tile::trmm(dlaf::internal::Policy<backend>(priority)) |