algo: triangular solver LLT (distributed) (#409)
Implementation of the distributed Triangular Solver, Left Lower Transposed (LLT) case.
Note:
keep_future is problematic with transform<GPU>: it releases the future during
submission of the work, but it does not keep the future alive until the end of
the asynchronous operation. Not using keep_future means the task receives the
contained value instead, which is kept alive until the end of the asynchronous
operation (the argument is not passed wrapped in a future).
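The lifetime issue the note describes can be illustrated outside DLA-Future with the plain CUDA runtime: whatever owns the arguments of an asynchronously submitted operation has to stay alive until the operation completes on the stream, not just until the submission call returns. A minimal standalone sketch of that requirement (this is not transform<GPU>; the buffer names and the callback are made up for illustration):

#include <memory>
#include <vector>
#include <cuda_runtime.h>

// Called by the CUDA runtime once the stream reaches this point, i.e. after the
// asynchronous copy has completed: only now is it safe to drop the owning reference.
void release_owner(void* user_data) {
  delete static_cast<std::shared_ptr<std::vector<double>>*>(user_data);
}

int main() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  auto host_buf = std::make_shared<std::vector<double>>(1024, 1.0);
  double* dev_buf = nullptr;
  cudaMalloc(&dev_buf, host_buf->size() * sizeof(double));

  // Submission returns immediately; the copy may still be in flight afterwards.
  cudaMemcpyAsync(dev_buf, host_buf->data(), host_buf->size() * sizeof(double),
                  cudaMemcpyHostToDevice, stream);

  // Keep the argument alive until the end of the async operation by handing an owning
  // reference to a host callback enqueued behind it on the same stream.
  cudaLaunchHostFunc(stream, release_owner,
                     new std::shared_ptr<std::vector<double>>(host_buf));

  host_buf.reset();  // safe: the callback still holds a reference

  cudaStreamSynchronize(stream);
  cudaFree(dev_buf);
  cudaStreamDestroy(stream);
}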
albestro authored Dec 1, 2021
1 parent 41b876d commit 3df633e
Showing 23 changed files with 378 additions and 150 deletions.
9 changes: 7 additions & 2 deletions include/dlaf/blas/tile.h
@@ -9,7 +9,7 @@
//
#pragma once

#include "blas.hh"
#include <blas.hh>

#include "dlaf/common/callable_object.h"
#include "dlaf/matrix/tile.h"
@@ -22,7 +22,6 @@

#ifdef DLAF_WITH_CUDA
#include <cublas_v2.h>
#include <blas.hh>

#include "dlaf/cublas/error.h"
#include "dlaf/util_cublas.h"
@@ -238,6 +237,12 @@ void trsm(const blas::Side side, const blas::Uplo uplo, const blas::Op op, const
} \
}

DLAF_DECLARE_CUBLAS_OP(Axpy);
DLAF_DEFINE_CUBLAS_OP(Axpy, float, Saxpy);
DLAF_DEFINE_CUBLAS_OP(Axpy, double, Daxpy);
DLAF_DEFINE_CUBLAS_OP(Axpy, std::complex<float>, Caxpy);
DLAF_DEFINE_CUBLAS_OP(Axpy, std::complex<double>, Zaxpy);

DLAF_DECLARE_CUBLAS_OP(Gemm);
DLAF_DEFINE_CUBLAS_OP(Gemm, float, Sgemm);
DLAF_DEFINE_CUBLAS_OP(Gemm, double, Dgemm);
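For context, the DLAF_DECLARE_CUBLAS_OP/DLAF_DEFINE_CUBLAS_OP pair maps an element type to the matching cuBLAS routine, so CublasAxpy<T>::call resolves to the right precision at compile time. A hand-written sketch of that kind of dispatch for axpy (CublasAxpySketch is an illustrative name; the actual macro expansion in DLA-Future may differ):

#include <utility>

#include <cublas_v2.h>

template <class T>
struct CublasAxpySketch;

template <>
struct CublasAxpySketch<float> {
  template <class... Args>
  static cublasStatus_t call(Args&&... args) {
    return cublasSaxpy(std::forward<Args>(args)...);
  }
};

template <>
struct CublasAxpySketch<double> {
  template <class... Args>
  static cublasStatus_t call(Args&&... args) {
    return cublasDaxpy(std::forward<Args>(args)...);
  }
};

// The complex specializations forward to cublasCaxpy/cublasZaxpy in the same way, with
// std::complex pointers cast to cuComplex/cuDoubleComplex (which is what
// util::blasToCublasCast does at the call site).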
86 changes: 86 additions & 0 deletions include/dlaf/blas/tile_extensions.h
@@ -0,0 +1,86 @@
//
// Distributed Linear Algebra with Future (DLAF)
//
// Copyright (c) 2018-2021, ETH Zurich
// All rights reserved.
//
// Please, refer to the LICENSE file in the root directory.
// SPDX-License-Identifier: BSD-3-Clause
//
#pragma once

#include <blas.hh>

#include "dlaf/common/callable_object.h"
#include "dlaf/matrix/tile.h"
#include "dlaf/sender/make_sender_algorithm_overloads.h"
#include "dlaf/sender/partial_transform.h"
#include "dlaf/sender/policy.h"
#include "dlaf/sender/transform.h"

#ifdef DLAF_WITH_CUDA
#include <cublas_v2.h>

#include "dlaf/cublas/error.h"
#include "dlaf/util_cublas.h"
#endif

namespace dlaf {
namespace tile {
using matrix::Tile;

#ifdef DLAF_DOXYGEN

/// Computes A += alpha * B
///
/// This overload blocks until completion of the algorithm.
template <Backend B, class T, Device D>
void add(T alpha, const matrix::Tile<const T, D>& tile_b, const matrix::Tile<T, D>& tile_a);

/// \overload add
///
/// This overload takes a policy argument and a sender which must send all required arguments for the
/// algorithm. Returns a sender which signals a connected receiver when the algorithm is done.
template <Backend B, typename Sender,
typename = std::enable_if_t<hpx::execution::experimental::is_sender_v<Sender>>>
auto add(const dlaf::internal::Policy<B>& p, Sender&& s);

/// \overload add
///
/// This overload partially applies the algorithm with a policy for later use with operator| with a
/// sender on the left-hand side.
template <Backend B>
auto add(const dlaf::internal::Policy<B>& p);

#else

namespace internal {

template <class T>
void add(T alpha, const matrix::Tile<const T, Device::CPU>& tile_b,
const matrix::Tile<T, Device::CPU>& tile_a) {
DLAF_ASSERT(equal_size(tile_a, tile_b), tile_a, tile_b);
for (auto j = 0; j < tile_a.size().cols(); ++j)
blas::axpy(tile_a.size().rows(), alpha, tile_b.ptr({0, j}), 1, tile_a.ptr({0, j}), 1);
}

#ifdef DLAF_WITH_CUDA
template <class T>
void add(cublasHandle_t handle, T alpha, const matrix::Tile<const T, Device::GPU>& tile_b,
const matrix::Tile<T, Device::GPU>& tile_a) {
DLAF_ASSERT(equal_size(tile_a, tile_b), tile_a, tile_b);
for (auto j = 0; j < tile_a.size().cols(); ++j)
tile::internal::CublasAxpy<T>::call(handle, tile_a.size().rows(), util::blasToCublasCast(&alpha),
util::blasToCublasCast(tile_b.ptr({0, j})), 1,
util::blasToCublasCast(tile_a.ptr({0, j})), 1);
}
#endif

DLAF_MAKE_CALLABLE_OBJECT(add);
}

DLAF_MAKE_SENDER_ALGORITHM_OVERLOADS(add, internal::add_o)

#endif
}
}
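The CPU path of add above performs one blas::axpy per column of a column-major tile. A self-contained sketch of the same operation with plain loops (function and variable names here are illustrative, not DLAF's):

#include <cassert>
#include <cstddef>
#include <vector>

// a(i, j) += alpha * b(i, j), column by column, for column-major storage with leading
// dimensions lda and ldb (the tile pointers in DLAF play the same role).
template <class T>
void add_columnwise(T alpha, const T* b, std::ptrdiff_t ldb, T* a, std::ptrdiff_t lda,
                    std::ptrdiff_t rows, std::ptrdiff_t cols) {
  assert(lda >= rows && ldb >= rows);
  for (std::ptrdiff_t j = 0; j < cols; ++j)
    for (std::ptrdiff_t i = 0; i < rows; ++i)
      a[i + j * lda] += alpha * b[i + j * ldb];
}

int main() {
  std::vector<double> a(4 * 3, 1.0), b(4 * 3, 2.0);
  add_columnwise(0.5, b.data(), 4, a.data(), 4, 4, 3);  // every entry of a becomes 2.0
}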
15 changes: 8 additions & 7 deletions include/dlaf/common/contiguous_buffer_holder.h
@@ -32,6 +32,10 @@ template <class T>
struct ContiguousBufferHolder {
common::Buffer<std::remove_const_t<T>> buffer;
common::DataDescriptor<T> descriptor;

bool isAllocated() const {
return static_cast<bool>(buffer);
}
};

/// It returns a ContiguousBufferHolder starting from a Tile, where the ContiguousBufferHolder will be either:
@@ -53,14 +57,11 @@ auto makeItContiguous(const matrix::Tile<T, Device::CPU>& tile) {

DLAF_MAKE_CALLABLE_OBJECT(makeItContiguous);

/// It returns @p tile, ensuring that if the given @p bag owns a temporary buffer, it copies data from
/// this latter one to @p tile before returning it. Otherwise it is a no-op.
/// Copy from bag to tile, but only if the bag owns a temporary buffer. Otherwise it is a no-op.
template <class T>
auto copyBack(matrix::Tile<T, Device::CPU> tile, ContiguousBufferHolder<T> bag) {
auto buffer_used = std::move(bag.buffer);
if (buffer_used)
common::copy(buffer_used, common::make_data(tile));
return tile;
void copyBack(const ContiguousBufferHolder<T>& bag, const matrix::Tile<T, Device::CPU>& tile) {
if (bag.isAllocated())
common::copy(bag.buffer, common::make_data(tile));
}

DLAF_MAKE_CALLABLE_OBJECT(copyBack);
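makeItContiguous/copyBack implement a pack/unpack pattern: when the tile's columns are not stored contiguously, the data is staged into a contiguous buffer for the MPI call, and copied back only if that temporary was actually allocated (the new isAllocated() check above). A standalone sketch of the idea on raw column-major storage (ContiguousView, make_contiguous and copy_back are illustrative names, not DLAF's):

#include <cstddef>
#include <vector>

struct ContiguousView {
  std::vector<double> temp;   // empty if the tile was already contiguous
  double* ptr = nullptr;      // points into the tile or into temp
  bool allocated() const { return !temp.empty(); }
};

ContiguousView make_contiguous(double* tile, std::ptrdiff_t ld, std::ptrdiff_t rows,
                               std::ptrdiff_t cols) {
  ContiguousView v;
  if (ld == rows) {           // already contiguous: use the tile memory directly
    v.ptr = tile;
    return v;
  }
  v.temp.resize(static_cast<std::size_t>(rows * cols));
  for (std::ptrdiff_t j = 0; j < cols; ++j)
    for (std::ptrdiff_t i = 0; i < rows; ++i)
      v.temp[static_cast<std::size_t>(i + j * rows)] = tile[i + j * ld];
  v.ptr = v.temp.data();
  return v;
}

void copy_back(const ContiguousView& v, double* tile, std::ptrdiff_t ld,
               std::ptrdiff_t rows, std::ptrdiff_t cols) {
  if (!v.allocated())
    return;                   // no temporary was used: nothing to do
  for (std::ptrdiff_t j = 0; j < cols; ++j)
    for (std::ptrdiff_t i = 0; i < rows; ++i)
      tile[i + j * ld] = v.temp[static_cast<std::size_t>(i + j * rows)];
}

int main() {
  std::vector<double> tile(8 * 3, 1.0);              // a 5x3 tile stored with ld = 8
  ContiguousView v = make_contiguous(tile.data(), 8, 5, 3);
  // ... hand v.ptr (5 * 3 contiguous doubles) to the communication call ...
  copy_back(v, tile.data(), 8, 5, 3);
}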
7 changes: 5 additions & 2 deletions include/dlaf/communication/kernels/all_reduce.h
@@ -101,7 +101,7 @@ void scheduleAllReduce(const comm::Executor& ex,
std::move(pcomm), reduce_op, std::move(cont_buf_in),
std::move(cont_buf_out), tile_in));

dataflow(ex_copy, unwrapping(copyBack_o), std::move(tile_out), std::move(cont_buf_out));
dataflow(ex_copy, unwrapping(copyBack_o), std::move(cont_buf_out), std::move(tile_out));
}

template <class T>
@@ -138,7 +138,10 @@ hpx::future<matrix::Tile<T, Device::CPU>> scheduleAllReduceInPlace(
cont_buf = dataflow(ex, unwrapping(internal::allReduceInPlace_o), std::move(pcomm), reduce_op,
std::move(cont_buf));

return dataflow(ex_copy, unwrapping(copyBack_o), std::move(tile), std::move(cont_buf));
// Note:
// This extracts the tile given as argument to copyBack, not the return value.
return hpx::get<1>(hpx::split_future(
dataflow(ex_copy, matrix::unwrapExtendTiles(copyBack_o), std::move(cont_buf), std::move(tile))));
}
}
}
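The split_future change above is needed because unwrapExtendTiles makes the dataflow return a future of a tuple (return value, argument tiles), while the caller wants a future of the tile alone: hpx::split_future turns the future<tuple> into per-element futures so hpx::get<1> can select the tile. A rough standalone analogue with std::future rather than HPX (get_element is a made-up helper for illustration):

#include <cstddef>
#include <future>
#include <string>
#include <tuple>
#include <utility>

// Turn a future<tuple<...>> into a future of its I-th element (the role played above
// by hpx::split_future plus hpx::get<I>, here emulated with a shared_future and a
// deferred task).
template <std::size_t I, class Tuple>
auto get_element(std::future<Tuple> f) {
  auto shared = f.share();
  return std::async(std::launch::deferred, [shared] { return std::get<I>(shared.get()); });
}

int main() {
  // Stand-in for dataflow(..., unwrapExtendTiles(copyBack_o), ...): the copyBack return
  // value is modelled as an int, the tile argument as a std::string.
  std::future<std::tuple<int, std::string>> result =
      std::async([] { return std::make_tuple(0, std::string("tile")); });
  std::future<std::string> tile_future = get_element<1>(std::move(result));
  tile_future.get();  // the "tile", independent of the discarded return value
}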
100 changes: 70 additions & 30 deletions include/dlaf/communication/kernels/reduce.h
@@ -10,6 +10,10 @@

#pragma once

#ifdef DLAF_WITH_CUDA_RDMA
#warning "Reduce is not using CUDA_RDMA."
#endif

/// @file

#include <mpi.h>
@@ -45,10 +49,10 @@ auto reduceRecvInPlace(common::PromiseGuard<comm::Communicator> pcomm, MPI_Op re

DLAF_MAKE_CALLABLE_OBJECT(reduceRecvInPlace);

template <class T>
template <class T, Device D>
auto reduceSend(comm::IndexT_MPI rank_root, common::PromiseGuard<comm::Communicator> pcomm,
MPI_Op reduce_op, common::internal::ContiguousBufferHolder<const T> cont_buf,
matrix::Tile<const T, Device::CPU> const&, MPI_Request* req) {
const matrix::Tile<const T, D>&, MPI_Request* req) {
auto msg = comm::make_message(cont_buf.descriptor);
auto& comm = pcomm.ref();

@@ -60,64 +64,100 @@ DLAF_MAKE_CALLABLE_OBJECT(reduceSend);

}

template <class T>
template <class T, Device D>
void scheduleReduceRecvInPlace(const comm::Executor& ex,
hpx::future<common::PromiseGuard<comm::Communicator>> pcomm,
MPI_Op reduce_op, hpx::future<matrix::Tile<T, Device::CPU>> tile) {
MPI_Op reduce_op, hpx::future<matrix::Tile<T, D>> tile) {
// Note:
//
// GPU -> duplicateIfNeeded ---------------------> cCPU -> MPI -------------> copyIfNeeded --> GPU
// CPU ----------------------> makeItContiguous -> cCPU -> MPI -> copyBack ------------------> CPU
// cCPU ------------------------------------------> cCPU -> MPI ------------------------------> cCPU
//
// where: cCPU = contiguous CPU

using hpx::dataflow;
using hpx::unwrapping;

using common::internal::copyBack_o;
using common::internal::makeItContiguous_o;
using internal::reduceRecvInPlace_o;
using matrix::Tile;
using matrix::duplicateIfNeeded;
using matrix::getUnwrapReturnValue;
using matrix::unwrapExtendTiles;

const auto& ex_copy = getHpExecutor<Backend::MC>();

// Note:
//
// TILE ---> makeContiguous --+--> CONT_BUF ----> mpi_call ---> CONT_BUF --+
// | |
// +-------------------> TILE ------------------+-> copyBack
// TILE_ORIG can be on CPU or GPU
//
// TILE_CPU ----> duplicateIfNeeded<CPU> ----> TILE_CPU (no-op)
// TILE_GPU ----> duplicateIfNeeded<CPU> ----> TILE_CPU

auto ex_copy = getHpExecutor<Backend::MC>();
hpx::shared_future<Tile<T, D>> tile_orig = tile.share();
hpx::shared_future<Tile<T, Device::CPU>> tile_cpu = duplicateIfNeeded<Device::CPU>(tile_orig);

hpx::future<common::internal::ContiguousBufferHolder<T>> cont_buf;
{
auto wrapped = getUnwrapRetValAndArgs(
dataflow(ex_copy, unwrapExtendTiles(makeItContiguous_o), std::move(tile)));
// Note:
//
// TILE_CPU -+-> makeContiguous --> CONT_BUF --> mpi --> CONT_BUF -+-> copyBack --> CHECKPOINT -->
// | |
// +------------------------> TILE_CPU ------------------+----------------------------->
//
// CHECKPOINT ensures that the copyBack has finished, which is useful for the next step.

cont_buf = std::move(wrapped.first);
tile = std::move(hpx::get<0>(wrapped.second));
}
auto cont_buf = dataflow(ex_copy, unwrapping(makeItContiguous_o), tile_cpu);
cont_buf =
dataflow(ex, unwrapping(reduceRecvInPlace_o), std::move(pcomm), reduce_op, std::move(cont_buf));
auto checkpoint = dataflow(ex_copy, unwrapExtendTiles(copyBack_o), std::move(cont_buf), tile_cpu);

cont_buf = dataflow(ex, unwrapping(internal::reduceRecvInPlace_o), std::move(pcomm), reduce_op,
std::move(cont_buf));
// Note:
//
// In case the original tile was on CPU, the previous step already managed to copy the result in place
// and nothing else has to be done.
// Otherwise, if it was on GPU, a temporary tile on CPU has been used, so this step handles the needed
// copy from CPU to GPU.
//
// TILE_CPU => no-op
// TILE_GPU => copy back from CPU to GPU

dataflow(ex_copy, unwrapping(copyBack_o), std::move(tile), std::move(cont_buf));
copyIfNeeded(tile_cpu, tile_orig, std::move(checkpoint));
}

template <class T>
template <class T, Device D, template <class> class Future>
void scheduleReduceSend(const comm::Executor& ex, comm::IndexT_MPI rank_root,
hpx::future<common::PromiseGuard<comm::Communicator>> pcomm, MPI_Op reduce_op,
hpx::shared_future<matrix::Tile<const T, Device::CPU>> tile) {
Future<matrix::Tile<T, D>> tile) {
using hpx::dataflow;
using hpx::unwrapping;

using common::internal::makeItContiguous_o;
using matrix::Tile;
using matrix::duplicateIfNeeded;
using internal::reduceSend_o;
using matrix::unwrapExtendTiles;

const auto& ex_copy = getHpExecutor<Backend::MC>();

// Note:
//
// TILE -+-> makeContiguous --+--> CONT_BUF ---+--> mpi_call
// | |
// +--------------> TILE ----------------+

auto ex_copy = getHpExecutor<Backend::MC>();
// TILE can be on CPU or GPU
//
// TILE_CPU ----> duplicateIfNeeded<CPU> ----> TILE_CPU (no-op)
// TILE_GPU ----> duplicateIfNeeded<CPU> ----> TILE_CPU
hpx::shared_future<Tile<const T, Device::CPU>> tile_cpu =
duplicateIfNeeded<Device::CPU>(std::move(tile));

// TODO shared_future<Tile> as assumption, it requires changes for future<Tile>
hpx::future<common::internal::ContiguousBufferHolder<const T>> cont_buf =
dataflow(ex_copy, unwrapping(makeItContiguous_o), tile);
// Note:
//
// TILE_CPU -+-> makeContiguous ---> CONT_BUF ---+--> mpi_call--+
// | | |
// +-----------> TILE_CPU -------------+--------------+
auto cont_buf = dataflow(ex_copy, unwrapping(makeItContiguous_o), tile_cpu);

dataflow(ex, unwrapExtendTiles(internal::reduceSend_o), rank_root, std::move(pcomm), reduce_op,
std::move(cont_buf), tile);
dataflow(ex, unwrapExtendTiles(reduceSend_o), rank_root, std::move(pcomm), reduce_op,
std::move(cont_buf), tile_cpu);
}
}
}
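At the end of these pipelines, both scheduleReduceRecvInPlace and scheduleReduceSend hand a contiguous CPU buffer to MPI: the root reduces into its own buffer in place, the other ranks only contribute theirs. Stripped of executors and futures, the communication step looks roughly like this (blocking MPI_Reduce here, instead of the non-blocking call DLAF issues through its MPI executor):

#include <vector>

#include <mpi.h>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  constexpr int root = 0;
  std::vector<double> cont_buf(16, rank + 1.0);  // the contiguous "cCPU" buffer

  if (rank == root)
    // Root receives the reduction into its own buffer ("reduce recv in place").
    MPI_Reduce(MPI_IN_PLACE, cont_buf.data(), static_cast<int>(cont_buf.size()),
               MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);
  else
    // Non-root ranks only send their contribution ("reduce send").
    MPI_Reduce(cont_buf.data(), nullptr, static_cast<int>(cont_buf.size()),
               MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);

  MPI_Finalize();
}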
6 changes: 2 additions & 4 deletions include/dlaf/eigensolver/backtransformation/mc.h
@@ -82,7 +82,6 @@ template <class T>
void BackTransformation<Backend::MC, Device::CPU, T>::call_FC(
Matrix<T, Device::CPU>& mat_c, Matrix<const T, Device::CPU>& mat_v,
common::internal::vector<hpx::shared_future<common::internal::vector<T>>> taus) {
auto executor_hp = dlaf::getHpExecutor<Backend::MC>();
auto executor_np = dlaf::getNpExecutor<Backend::MC>();

const SizeType m = mat_c.nrTiles().rows();
@@ -160,7 +159,7 @@ void BackTransformation<Backend::MC, Device::CPU, T>::call_FC(
}

// W2 = W C
matrix::util::set0(executor_hp, panelW2);
matrix::util::set0<Backend::MC>(hpx::threads::thread_priority::high, panelW2);
LocalTileIndex c_start{k + 1, 0};
LocalTileIndex c_end{m, n};
auto c_k = iterate_range2d(c_start, c_end);
@@ -188,7 +187,6 @@ template <class T>
void BackTransformation<Backend::MC, Device::CPU, T>::call_FC(
comm::CommunicatorGrid grid, Matrix<T, Device::CPU>& mat_c, Matrix<const T, Device::CPU>& mat_v,
common::internal::vector<hpx::shared_future<common::internal::vector<T>>> taus) {
auto executor_hp = dlaf::getHpExecutor<Backend::MC>();
auto executor_np = dlaf::getNpExecutor<Backend::MC>();

// Set up MPI
@@ -285,7 +283,7 @@ void BackTransformation<Backend::MC, Device::CPU, T>::call_FC(
}
}

matrix::util::set0(executor_hp, panelW2);
matrix::util::set0<Backend::MC>(hpx::threads::thread_priority::high, panelW2);

broadcast(executor_mpi, k_rank_col, panelW, mpi_row_task_chain);

9 changes: 6 additions & 3 deletions include/dlaf/eigensolver/reduction_to_band/mc.h
@@ -364,6 +364,7 @@ template <class T>
void hemmComputeX(PanelT<Coord::Col, T>& x, const LocalTileSize at_offset, ConstMatrixT<T>& a,
ConstPanelT<Coord::Col, T>& w) {
const auto ex = getHpExecutor<Backend::MC>();
const auto priority = hpx::threads::thread_priority::high;

const auto dist = a.distribution();

@@ -373,7 +374,7 @@
// result.
//
// TODO set0 can be "embedded" in the logic but currently it will be a bit cumbersome.
matrix::util::set0(ex, x);
matrix::util::set0<Backend::MC>(priority, x);

for (SizeType i = at_offset.rows(); i < dist.localNrTiles().rows(); ++i) {
const auto limit = i + 1;
@@ -540,6 +541,8 @@ void hemmComputeX(comm::IndexT_MPI reducer_col, PanelT<Coord::Col, T>& x, PanelT
ConstPanelT<Coord::Row, T>& wt, common::Pipeline<comm::Communicator>& mpi_row_chain,
common::Pipeline<comm::Communicator>& mpi_col_chain) {
const auto ex = getHpExecutor<Backend::MC>();
const auto priority = hpx::threads::thread_priority::high;

const auto ex_mpi = getMPIExecutor<Backend::MC>();

const auto dist = a.distribution();
@@ -551,8 +554,8 @@
// result.
//
// TODO set0 can be "embedded" in the logic but currently it will be a bit cumbersome.
matrix::util::set0(ex, x);
matrix::util::set0(ex, xt);
matrix::util::set0<Backend::MC>(priority, x);
matrix::util::set0<Backend::MC>(priority, xt);

for (SizeType i = at_offset.rows(); i < dist.localNrTiles().rows(); ++i) {
const auto limit = dist.template nextLocalTileFromGlobalTile<Coord::Col>(
(Diffs for the remaining 16 changed files are not shown.)
