algo: triangular solver LLT (distributed) (#409)
Implementation of the distributed Triangular Solver, Left Lower Transposed (LLT) case.
Note:
keep_future is problematic with transform<GPU>: it releases the future during
submission of the work, but it does not keep the future alive until the end of
the asynchronous operation. Not using keep_future means the task receives the
contained value instead, which is kept alive until the end of the asynchronous
operation (the argument is not passed wrapped in a future).
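The lifetime issue the note describes can be illustrated outside DLA-Future with the plain CUDA runtime: whatever owns the arguments of an asynchronously submitted operation has to stay alive until the operation completes on the stream, not just until the submission call returns. A minimal standalone sketch of that requirement (this is not transform<GPU>; the buffer names and the callback are made up for illustration):

#include <memory>
#include <vector>
#include <cuda_runtime.h>

// Called by the CUDA runtime once the stream reaches this point, i.e. after the
// asynchronous copy has completed: only now is it safe to drop the owning reference.
void release_owner(void* user_data) {
  delete static_cast<std::shared_ptr<std::vector<double>>*>(user_data);
}

int main() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  auto host_buf = std::make_shared<std::vector<double>>(1024, 1.0);
  double* dev_buf = nullptr;
  cudaMalloc(&dev_buf, host_buf->size() * sizeof(double));

  // Submission returns immediately; the copy may still be in flight afterwards.
  cudaMemcpyAsync(dev_buf, host_buf->data(), host_buf->size() * sizeof(double),
                  cudaMemcpyHostToDevice, stream);

  // Keep the argument alive until the end of the async operation by handing an owning
  // reference to a host callback enqueued behind it on the same stream.
  cudaLaunchHostFunc(stream, release_owner,
                     new std::shared_ptr<std::vector<double>>(host_buf));

  host_buf.reset();  // safe: the callback still holds a reference

  cudaStreamSynchronize(stream);
  cudaFree(dev_buf);
  cudaStreamDestroy(stream);
}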
albestro authored Dec 1, 2021
1 parent 41b876d commit 3df633e
Showing 23 changed files with 378 additions and 150 deletions.
9 changes: 7 additions & 2 deletions include/dlaf/blas/tile.h
@@ -9,7 +9,7 @@
//
#pragma once

#include "blas.hh"
#include <blas.hh>

#include "dlaf/common/callable_object.h"
#include "dlaf/matrix/tile.h"
@@ -22,7 +22,6 @@

#ifdef DLAF_WITH_CUDA
#include <cublas_v2.h>
#include <blas.hh>

#include "dlaf/cublas/error.h"
#include "dlaf/util_cublas.h"
@@ -238,6 +237,12 @@ void trsm(const blas::Side side, const blas::Uplo uplo, const blas::Op op, const
} \
}

DLAF_DECLARE_CUBLAS_OP(Axpy);
DLAF_DEFINE_CUBLAS_OP(Axpy, float, Saxpy);
DLAF_DEFINE_CUBLAS_OP(Axpy, double, Daxpy);
DLAF_DEFINE_CUBLAS_OP(Axpy, std::complex<float>, Caxpy);
DLAF_DEFINE_CUBLAS_OP(Axpy, std::complex<double>, Zaxpy);

DLAF_DECLARE_CUBLAS_OP(Gemm);
DLAF_DEFINE_CUBLAS_OP(Gemm, float, Sgemm);
DLAF_DEFINE_CUBLAS_OP(Gemm, double, Dgemm);
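For context, the DLAF_DECLARE_CUBLAS_OP/DLAF_DEFINE_CUBLAS_OP pair maps an element type to the matching cuBLAS routine, so CublasAxpy<T>::call resolves to the right precision at compile time. A hand-written sketch of that kind of dispatch for axpy (CublasAxpySketch is an illustrative name; the actual macro expansion in DLA-Future may differ):

#include <utility>

#include <cublas_v2.h>

template <class T>
struct CublasAxpySketch;

template <>
struct CublasAxpySketch<float> {
  template <class... Args>
  static cublasStatus_t call(Args&&... args) {
    return cublasSaxpy(std::forward<Args>(args)...);
  }
};

template <>
struct CublasAxpySketch<double> {
  template <class... Args>
  static cublasStatus_t call(Args&&... args) {
    return cublasDaxpy(std::forward<Args>(args)...);
  }
};

// The complex specializations forward to cublasCaxpy/cublasZaxpy in the same way, with
// std::complex pointers cast to cuComplex/cuDoubleComplex (which is what
// util::blasToCublasCast does at the call site).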
86 changes: 86 additions & 0 deletions include/dlaf/blas/tile_extensions.h
@@ -0,0 +1,86 @@
//
// Distributed Linear Algebra with Future (DLAF)
//
// Copyright (c) 2018-2021, ETH Zurich
// All rights reserved.
//
// Please, refer to the LICENSE file in the root directory.
// SPDX-License-Identifier: BSD-3-Clause
//
#pragma once

#include <blas.hh>

#include "dlaf/common/callable_object.h"
#include "dlaf/matrix/tile.h"
#include "dlaf/sender/make_sender_algorithm_overloads.h"
#include "dlaf/sender/partial_transform.h"
#include "dlaf/sender/policy.h"
#include "dlaf/sender/transform.h"

#ifdef DLAF_WITH_CUDA
#include <cublas_v2.h>

#include "dlaf/cublas/error.h"
#include "dlaf/util_cublas.h"
#endif

namespace dlaf {
namespace tile {
using matrix::Tile;

#ifdef DLAF_DOXYGEN

/// Computes A += alpha * B
///
/// This overload blocks until completion of the algorithm.
template <Backend B, class T, Device D>
void add(T alpha, const matrix::Tile<const T, D>& tile_b, const matrix::Tile<T, D>& tile_a);

/// \overload add
///
/// This overload takes a policy argument and a sender which must send all required arguments for the
/// algorithm. Returns a sender which signals a connected receiver when the algorithm is done.
template <Backend B, typename Sender,
typename = std::enable_if_t<hpx::execution::experimental::is_sender_v<Sender>>>
auto add(const dlaf::internal::Policy<B>& p, Sender&& s);

/// \overload add
///
/// This overload partially applies the algorithm with a policy for later use with operator| with a
/// sender on the left-hand side.
template <Backend B>
auto add(const dlaf::internal::Policy<B>& p);

#else

namespace internal {

template <class T>
void add(T alpha, const matrix::Tile<const T, Device::CPU>& tile_b,
const matrix::Tile<T, Device::CPU>& tile_a) {
DLAF_ASSERT(equal_size(tile_a, tile_b), tile_a, tile_b);
for (auto j = 0; j < tile_a.size().cols(); ++j)
blas::axpy(tile_a.size().rows(), alpha, tile_b.ptr({0, j}), 1, tile_a.ptr({0, j}), 1);
}

#ifdef DLAF_WITH_CUDA
template <class T>
void add(cublasHandle_t handle, T alpha, const matrix::Tile<const T, Device::GPU>& tile_b,
const matrix::Tile<T, Device::GPU>& tile_a) {
DLAF_ASSERT(equal_size(tile_a, tile_b), tile_a, tile_b);
for (auto j = 0; j < tile_a.size().cols(); ++j)
tile::internal::CublasAxpy<T>::call(handle, tile_a.size().rows(), util::blasToCublasCast(&alpha),
util::blasToCublasCast(tile_b.ptr({0, j})), 1,
util::blasToCublasCast(tile_a.ptr({0, j})), 1);
}
#endif

DLAF_MAKE_CALLABLE_OBJECT(add);
}

DLAF_MAKE_SENDER_ALGORITHM_OVERLOADS(add, internal::add_o)

#endif
}
}
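The CPU path of add above performs one blas::axpy per column of a column-major tile. A self-contained sketch of the same operation with plain loops (function and variable names here are illustrative, not DLAF's):

#include <cassert>
#include <cstddef>
#include <vector>

// a(i, j) += alpha * b(i, j), column by column, for column-major storage with leading
// dimensions lda and ldb (the tile pointers in DLAF play the same role).
template <class T>
void add_columnwise(T alpha, const T* b, std::ptrdiff_t ldb, T* a, std::ptrdiff_t lda,
                    std::ptrdiff_t rows, std::ptrdiff_t cols) {
  assert(lda >= rows && ldb >= rows);
  for (std::ptrdiff_t j = 0; j < cols; ++j)
    for (std::ptrdiff_t i = 0; i < rows; ++i)
      a[i + j * lda] += alpha * b[i + j * ldb];
}

int main() {
  std::vector<double> a(4 * 3, 1.0), b(4 * 3, 2.0);
  add_columnwise(0.5, b.data(), 4, a.data(), 4, 4, 3);  // every entry of a becomes 2.0
}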
15 changes: 8 additions & 7 deletions include/dlaf/common/contiguous_buffer_holder.h
@@ -32,6 +32,10 @@ template <class T>
struct ContiguousBufferHolder {
common::Buffer<std::remove_const_t<T>> buffer;
common::DataDescriptor<T> descriptor;

bool isAllocated() const {
return static_cast<bool>(buffer);
}
};

/// It returns a ContiguousBufferHolder starting from a Tile, where the ContiguousBufferHolder will be either:
@@ -53,14 +57,11 @@ auto makeItContiguous(const matrix::Tile<T, Device::CPU>& tile) {

DLAF_MAKE_CALLABLE_OBJECT(makeItContiguous);

/// It returns @p tile, ensuring that if the given @p bag owns a temporary buffer, it copies data from
/// this latter one to @p tile before returning it. Otherwise it is a no-op.
/// Copy from bag to tile, but only if the bag owns a temporary buffer. Otherwise it is a no-op.
template <class T>
auto copyBack(matrix::Tile<T, Device::CPU> tile, ContiguousBufferHolder<T> bag) {
auto buffer_used = std::move(bag.buffer);
if (buffer_used)
common::copy(buffer_used, common::make_data(tile));
return tile;
void copyBack(const ContiguousBufferHolder<T>& bag, const matrix::Tile<T, Device::CPU>& tile) {
if (bag.isAllocated())
common::copy(bag.buffer, common::make_data(tile));
}

DLAF_MAKE_CALLABLE_OBJECT(copyBack);
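makeItContiguous/copyBack implement a pack/unpack pattern: when the tile's columns are not stored contiguously, the data is staged into a contiguous buffer for the MPI call, and copied back only if that temporary was actually allocated (the new isAllocated() check above). A standalone sketch of the idea on raw column-major storage (ContiguousView, make_contiguous and copy_back are illustrative names, not DLAF's):

#include <cstddef>
#include <vector>

struct ContiguousView {
  std::vector<double> temp;   // empty if the tile was already contiguous
  double* ptr = nullptr;      // points into the tile or into temp
  bool allocated() const { return !temp.empty(); }
};

ContiguousView make_contiguous(double* tile, std::ptrdiff_t ld, std::ptrdiff_t rows,
                               std::ptrdiff_t cols) {
  ContiguousView v;
  if (ld == rows) {           // already contiguous: use the tile memory directly
    v.ptr = tile;
    return v;
  }
  v.temp.resize(static_cast<std::size_t>(rows * cols));
  for (std::ptrdiff_t j = 0; j < cols; ++j)
    for (std::ptrdiff_t i = 0; i < rows; ++i)
      v.temp[static_cast<std::size_t>(i + j * rows)] = tile[i + j * ld];
  v.ptr = v.temp.data();
  return v;
}

void copy_back(const ContiguousView& v, double* tile, std::ptrdiff_t ld,
               std::ptrdiff_t rows, std::ptrdiff_t cols) {
  if (!v.allocated())
    return;                   // no temporary was used: nothing to do
  for (std::ptrdiff_t j = 0; j < cols; ++j)
    for (std::ptrdiff_t i = 0; i < rows; ++i)
      tile[i + j * ld] = v.temp[static_cast<std::size_t>(i + j * rows)];
}

int main() {
  std::vector<double> tile(8 * 3, 1.0);              // a 5x3 tile stored with ld = 8
  ContiguousView v = make_contiguous(tile.data(), 8, 5, 3);
  // ... hand v.ptr (5 * 3 contiguous doubles) to the communication call ...
  copy_back(v, tile.data(), 8, 5, 3);
}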
7 changes: 5 additions & 2 deletions include/dlaf/communication/kernels/all_reduce.h
@@ -101,7 +101,7 @@ void scheduleAllReduce(const comm::Executor& ex,
std::move(pcomm), reduce_op, std::move(cont_buf_in),
std::move(cont_buf_out), tile_in));

dataflow(ex_copy, unwrapping(copyBack_o), std::move(tile_out), std::move(cont_buf_out));
dataflow(ex_copy, unwrapping(copyBack_o), std::move(cont_buf_out), std::move(tile_out));
}

template <class T>
@@ -138,7 +138,10 @@ hpx::future<matrix::Tile<T, Device::CPU>> scheduleAllReduceInPlace(
cont_buf = dataflow(ex, unwrapping(internal::allReduceInPlace_o), std::move(pcomm), reduce_op,
std::move(cont_buf));

return dataflow(ex_copy, unwrapping(copyBack_o), std::move(tile), std::move(cont_buf));
// Note:
// This extracts the tile given as argument to copyBack, not the return value.
return hpx::get<1>(hpx::split_future(
dataflow(ex_copy, matrix::unwrapExtendTiles(copyBack_o), std::move(cont_buf), std::move(tile))));
}
}
}
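The split_future change above is needed because unwrapExtendTiles makes the dataflow return a future of a tuple (return value, argument tiles), while the caller wants a future of the tile alone: hpx::split_future turns the future<tuple> into per-element futures so hpx::get<1> can select the tile. A rough standalone analogue with std::future rather than HPX (get_element is a made-up helper for illustration):

#include <cstddef>
#include <future>
#include <string>
#include <tuple>
#include <utility>

// Turn a future<tuple<...>> into a future of its I-th element (the role played above
// by hpx::split_future plus hpx::get<I>, here emulated with a shared_future and a
// deferred task).
template <std::size_t I, class Tuple>
auto get_element(std::future<Tuple> f) {
  auto shared = f.share();
  return std::async(std::launch::deferred, [shared] { return std::get<I>(shared.get()); });
}

int main() {
  // Stand-in for dataflow(..., unwrapExtendTiles(copyBack_o), ...): the copyBack return
  // value is modelled as an int, the tile argument as a std::string.
  std::future<std::tuple<int, std::string>> result =
      std::async([] { return std::make_tuple(0, std::string("tile")); });
  std::future<std::string> tile_future = get_element<1>(std::move(result));
  tile_future.get();  // the "tile", independent of the discarded return value
}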
100 changes: 70 additions & 30 deletions include/dlaf/communication/kernels/reduce.h
@@ -10,6 +10,10 @@

#pragma once

#ifdef DLAF_WITH_CUDA_RDMA
#warning "Reduce is not using CUDA_RDMA."
#endif

/// @file

#include <mpi.h>
@@ -45,10 +49,10 @@ auto reduceRecvInPlace(common::PromiseGuard<comm::Communicator> pcomm, MPI_Op re

DLAF_MAKE_CALLABLE_OBJECT(reduceRecvInPlace);

template <class T>
template <class T, Device D>
auto reduceSend(comm::IndexT_MPI rank_root, common::PromiseGuard<comm::Communicator> pcomm,
MPI_Op reduce_op, common::internal::ContiguousBufferHolder<const T> cont_buf,
matrix::Tile<const T, Device::CPU> const&, MPI_Request* req) {
const matrix::Tile<const T, D>&, MPI_Request* req) {
auto msg = comm::make_message(cont_buf.descriptor);
auto& comm = pcomm.ref();

@@ -60,64 +64,100 @@ DLAF_MAKE_CALLABLE_OBJECT(reduceSend);

}

template <class T>
template <class T, Device D>
void scheduleReduceRecvInPlace(const comm::Executor& ex,
hpx::future<common::PromiseGuard<comm::Communicator>> pcomm,
MPI_Op reduce_op, hpx::future<matrix::Tile<T, Device::CPU>> tile) {
MPI_Op reduce_op, hpx::future<matrix::Tile<T, D>> tile) {
// Note:
//
// GPU -> duplicateIfNeeded ---------------------> cCPU -> MPI -------------> copyIfNeeded --> GPU
// CPU ----------------------> makeItContiguous -> cCPU -> MPI -> copyBack ------------------> CPU
// cCPU ------------------------------------------> cCPU -> MPI ------------------------------> cCPU
//
// where: cCPU = contiguous CPU

using hpx::dataflow;
using hpx::unwrapping;

using common::internal::copyBack_o;
using common::internal::makeItContiguous_o;
using internal::reduceRecvInPlace_o;
using matrix::Tile;
using matrix::duplicateIfNeeded;
using matrix::getUnwrapReturnValue;
using matrix::unwrapExtendTiles;

const auto& ex_copy = getHpExecutor<Backend::MC>();

// Note:
//
// TILE ---> makeContiguous --+--> CONT_BUF ----> mpi_call ---> CONT_BUF --+
// | |
// +-------------------> TILE ------------------+-> copyBack
// TILE_ORIG can be on CPU or GPU
//
// TILE_CPU ----> duplicateIfNeeded<CPU> ----> TILE_CPU (no-op)
// TILE_GPU ----> duplicateIfNeeded<CPU> ----> TILE_CPU

auto ex_copy = getHpExecutor<Backend::MC>();
hpx::shared_future<Tile<T, D>> tile_orig = tile.share();
hpx::shared_future<Tile<T, Device::CPU>> tile_cpu = duplicateIfNeeded<Device::CPU>(tile_orig);

hpx::future<common::internal::ContiguousBufferHolder<T>> cont_buf;
{
auto wrapped = getUnwrapRetValAndArgs(
dataflow(ex_copy, unwrapExtendTiles(makeItContiguous_o), std::move(tile)));
// Note:
//
// TILE_CPU -+-> makeContiguous --> CONT_BUF --> mpi --> CONT_BUF -+-> copyBack --> CHECKPOINT -->
// | |
// +------------------------> TILE_CPU ------------------+----------------------------->
//
// CHECKPOINT ensures that the copyBack has finished, which is useful for the next step.

cont_buf = std::move(wrapped.first);
tile = std::move(hpx::get<0>(wrapped.second));
}
auto cont_buf = dataflow(ex_copy, unwrapping(makeItContiguous_o), tile_cpu);
cont_buf =
dataflow(ex, unwrapping(reduceRecvInPlace_o), std::move(pcomm), reduce_op, std::move(cont_buf));
auto checkpoint = dataflow(ex_copy, unwrapExtendTiles(copyBack_o), std::move(cont_buf), tile_cpu);

cont_buf = dataflow(ex, unwrapping(internal::reduceRecvInPlace_o), std::move(pcomm), reduce_op,
std::move(cont_buf));
// Note:
//
// In case the original tile was on CPU, the previous step already managed to copy the result in place
// and nothing else has to be done.
// Otherwise, if it was on GPU, a temporary tile on CPU has been used, so this step handles the needed
// copy from CPU to GPU.
//
// TILE_CPU => no-op
// TILE_GPU => copy back from CPU to GPU

dataflow(ex_copy, unwrapping(copyBack_o), std::move(tile), std::move(cont_buf));
copyIfNeeded(tile_cpu, tile_orig, std::move(checkpoint));
}

template <class T>
template <class T, Device D, template <class> class Future>
void scheduleReduceSend(const comm::Executor& ex, comm::IndexT_MPI rank_root,
hpx::future<common::PromiseGuard<comm::Communicator>> pcomm, MPI_Op reduce_op,
hpx::shared_future<matrix::Tile<const T, Device::CPU>> tile) {
Future<matrix::Tile<T, D>> tile) {
using hpx::dataflow;
using hpx::unwrapping;

using common::internal::makeItContiguous_o;
using matrix::Tile;
using matrix::duplicateIfNeeded;
using internal::reduceSend_o;
using matrix::unwrapExtendTiles;

const auto& ex_copy = getHpExecutor<Backend::MC>();

// Note:
//
// TILE -+-> makeContiguous --+--> CONT_BUF ---+--> mpi_call
// | |
// +--------------> TILE ----------------+

auto ex_copy = getHpExecutor<Backend::MC>();
// TILE can be on CPU or GPU
//
// TILE_CPU ----> duplicateIfNeeded<CPU> ----> TILE_CPU (no-op)
// TILE_GPU ----> duplicateIfNeeded<CPU> ----> TILE_CPU
hpx::shared_future<Tile<const T, Device::CPU>> tile_cpu =
duplicateIfNeeded<Device::CPU>(std::move(tile));

// TODO shared_future<Tile> as assumption, it requires changes for future<Tile>
hpx::future<common::internal::ContiguousBufferHolder<const T>> cont_buf =
dataflow(ex_copy, unwrapping(makeItContiguous_o), tile);
// Note:
//
// TILE_CPU -+-> makeContiguous ---> CONT_BUF ---+--> mpi_call--+
// | | |
// +-----------> TILE_CPU -------------+--------------+
auto cont_buf = dataflow(ex_copy, unwrapping(makeItContiguous_o), tile_cpu);

dataflow(ex, unwrapExtendTiles(internal::reduceSend_o), rank_root, std::move(pcomm), reduce_op,
std::move(cont_buf), tile);
dataflow(ex, unwrapExtendTiles(reduceSend_o), rank_root, std::move(pcomm), reduce_op,
std::move(cont_buf), tile_cpu);
}
}
}
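At the end of these pipelines, both scheduleReduceRecvInPlace and scheduleReduceSend hand a contiguous CPU buffer to MPI: the root reduces into its own buffer in place, the other ranks only contribute theirs. Stripped of executors and futures, the communication step looks roughly like this (blocking MPI_Reduce here, instead of the non-blocking call DLAF issues through its MPI executor):

#include <vector>

#include <mpi.h>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  constexpr int root = 0;
  std::vector<double> cont_buf(16, rank + 1.0);  // the contiguous "cCPU" buffer

  if (rank == root)
    // Root receives the reduction into its own buffer ("reduce recv in place").
    MPI_Reduce(MPI_IN_PLACE, cont_buf.data(), static_cast<int>(cont_buf.size()),
               MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);
  else
    // Non-root ranks only send their contribution ("reduce send").
    MPI_Reduce(cont_buf.data(), nullptr, static_cast<int>(cont_buf.size()),
               MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);

  MPI_Finalize();
}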
6 changes: 2 additions & 4 deletions include/dlaf/eigensolver/backtransformation/mc.h
@@ -82,7 +82,6 @@ template <class T>
void BackTransformation<Backend::MC, Device::CPU, T>::call_FC(
Matrix<T, Device::CPU>& mat_c, Matrix<const T, Device::CPU>& mat_v,
common::internal::vector<hpx::shared_future<common::internal::vector<T>>> taus) {
auto executor_hp = dlaf::getHpExecutor<Backend::MC>();
auto executor_np = dlaf::getNpExecutor<Backend::MC>();

const SizeType m = mat_c.nrTiles().rows();
@@ -160,7 +159,7 @@ void BackTransformation<Backend::MC, Device::CPU, T>::call_FC(
}

// W2 = W C
matrix::util::set0(executor_hp, panelW2);
matrix::util::set0<Backend::MC>(hpx::threads::thread_priority::high, panelW2);
LocalTileIndex c_start{k + 1, 0};
LocalTileIndex c_end{m, n};
auto c_k = iterate_range2d(c_start, c_end);
@@ -188,7 +187,6 @@ template <class T>
void BackTransformation<Backend::MC, Device::CPU, T>::call_FC(
comm::CommunicatorGrid grid, Matrix<T, Device::CPU>& mat_c, Matrix<const T, Device::CPU>& mat_v,
common::internal::vector<hpx::shared_future<common::internal::vector<T>>> taus) {
auto executor_hp = dlaf::getHpExecutor<Backend::MC>();
auto executor_np = dlaf::getNpExecutor<Backend::MC>();

// Set up MPI
@@ -285,7 +283,7 @@ void BackTransformation<Backend::MC, Device::CPU, T>::call_FC(
}
}

matrix::util::set0(executor_hp, panelW2);
matrix::util::set0<Backend::MC>(hpx::threads::thread_priority::high, panelW2);

broadcast(executor_mpi, k_rank_col, panelW, mpi_row_task_chain);

9 changes: 6 additions & 3 deletions include/dlaf/eigensolver/reduction_to_band/mc.h
@@ -364,6 +364,7 @@ template <class T>
void hemmComputeX(PanelT<Coord::Col, T>& x, const LocalTileSize at_offset, ConstMatrixT<T>& a,
ConstPanelT<Coord::Col, T>& w) {
const auto ex = getHpExecutor<Backend::MC>();
const auto priority = hpx::threads::thread_priority::high;

const auto dist = a.distribution();

@@ -373,7 +374,7 @@
// result.
//
// TODO set0 can be "embedded" in the logic but currently it will be a bit cumbersome.
matrix::util::set0(ex, x);
matrix::util::set0<Backend::MC>(priority, x);

for (SizeType i = at_offset.rows(); i < dist.localNrTiles().rows(); ++i) {
const auto limit = i + 1;
@@ -540,6 +541,8 @@ void hemmComputeX(comm::IndexT_MPI reducer_col, PanelT<Coord::Col, T>& x, PanelT
ConstPanelT<Coord::Row, T>& wt, common::Pipeline<comm::Communicator>& mpi_row_chain,
common::Pipeline<comm::Communicator>& mpi_col_chain) {
const auto ex = getHpExecutor<Backend::MC>();
const auto priority = hpx::threads::thread_priority::high;

const auto ex_mpi = getMPIExecutor<Backend::MC>();

const auto dist = a.distribution();
@@ -551,8 +554,8 @@
// result.
//
// TODO set0 can be "embedded" in the logic but currently it will be a bit cumbersome.
matrix::util::set0(ex, x);
matrix::util::set0(ex, xt);
matrix::util::set0<Backend::MC>(priority, x);
matrix::util::set0<Backend::MC>(priority, xt);

for (SizeType i = at_offset.rows(); i < dist.localNrTiles().rows(); ++i) {
const auto limit = dist.template nextLocalTileFromGlobalTile<Coord::Col>(
(Diffs for the remaining 16 changed files are not shown.)
