TridiagSolver (local): embed row permutation in rank1 solver #936

Merged · 6 commits · Jul 19, 2023
Changes from 5 commits
37 changes: 0 additions & 37 deletions include/dlaf/eigensolver/tridiag_solver/kernels.h
@@ -273,43 +273,6 @@ void initIndexTileAsync(SizeType tile_row, TileSender&& tile) {
di::transformDetach(di::Policy<DefaultBackend_v<D>>(), initIndexTile_o, std::move(sender));
}

template <class T>
void setUnitDiagonal(const SizeType& k, const SizeType& tile_begin,
const matrix::Tile<T, Device::CPU>& tile);

#define DLAF_CPU_SET_UNIT_DIAGONAL_ETI(kword, Type) \
kword template void setUnitDiagonal(const SizeType& k, const SizeType& tile_begin, \
const matrix::Tile<Type, Device::CPU>& tile)

DLAF_CPU_SET_UNIT_DIAGONAL_ETI(extern, float);
DLAF_CPU_SET_UNIT_DIAGONAL_ETI(extern, double);

#ifdef DLAF_WITH_GPU
template <class T>
void setUnitDiagonal(const SizeType& k, const SizeType& tile_begin,
const matrix::Tile<T, Device::GPU>& tile, whip::stream_t stream);

#define DLAF_GPU_SET_UNIT_DIAGONAL_ETI(kword, Type) \
kword template void setUnitDiagonal(const SizeType& k, const SizeType& tile_begin, \
const matrix::Tile<Type, Device::GPU>& tile, \
whip::stream_t stream)

DLAF_GPU_SET_UNIT_DIAGONAL_ETI(extern, float);
DLAF_GPU_SET_UNIT_DIAGONAL_ETI(extern, double);

#endif

DLAF_MAKE_CALLABLE_OBJECT(setUnitDiagonal);

template <Device D, class KSender, class TileSender>
void setUnitDiagonalAsync(KSender&& k, SizeType tile_begin, TileSender&& tile) {
namespace di = dlaf::internal;
auto sender = di::whenAllLift(std::forward<KSender>(k), tile_begin, std::forward<TileSender>(tile));
di::transformDetach(di::Policy<DefaultBackend_v<D>>(), setUnitDiagonal_o, std::move(sender));
}

// ---------------------------

#ifdef DLAF_WITH_GPU

// Returns the number of non-deflated entries
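The declarations above (and the setUnitDiagonalAsync wrapper) are deleted because the rank-1 solver now writes the ones for the deflated eigenvectors itself while applying the row permutation (STEP 0a in merge.h below). A minimal standalone sketch of the equivalent fill, assuming column-major storage and hypothetical names (the real code addresses tiles through matrix::Distribution):

#include <cstdint>
#include <vector>

using SizeType = std::int64_t;

// Write a 1 at (i, i2[i]) for every deflated column (i2[i] >= k). With the
// identity permutation i2[i] == i this reproduces the unit diagonal that the
// removed setUnitDiagonal kernel used to set; with a non-trivial i2 it also
// applies the row permutation in the same pass.
void fill_deflated_columns(SizeType n, SizeType k, const SizeType* i2,
                           std::vector<double>& evecs) {  // n x n, column-major
  for (SizeType i = 0; i < n; ++i) {
    const SizeType j = i2[i];
    if (j >= k)
      evecs[static_cast<std::size_t>(i + j * n)] = 1.0;
  }
}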
85 changes: 54 additions & 31 deletions include/dlaf/eigensolver/tridiag_solver/merge.h
@@ -460,7 +460,8 @@ auto applyDeflation(const SizeType i_begin, const SizeType i_end, RhoSender&& rh
template <class T, class KSender, class RhoSender>
void solveRank1Problem(const SizeType i_begin, const SizeType i_end, KSender&& k, RhoSender&& rho,
Matrix<const T, Device::CPU>& d, Matrix<T, Device::CPU>& z,
Matrix<T, Device::CPU>& evals, Matrix<T, Device::CPU>& evecs) {
Matrix<T, Device::CPU>& evals, Matrix<const SizeType, Device::CPU>& i2,
Matrix<T, Device::CPU>& evecs) {
namespace ex = pika::execution::experimental;
namespace di = dlaf::internal;

@@ -474,20 +475,41 @@ void solveRank1Problem(const SizeType i_begin, const SizeType i_end, KSender&& k
ex::when_all(ex::just(std::make_unique<pika::barrier<>>(nthreads)), std::forward<KSender>(k),
std::forward<RhoSender>(rho), ex::when_all_vector(tc.read(d)),
ex::when_all_vector(tc.readwrite(z)), ex::when_all_vector(tc.readwrite(evals)),
ex::when_all_vector(tc.readwrite(evecs)),
ex::when_all_vector(tc.read(i2)), ex::when_all_vector(tc.readwrite(evecs)),
ex::just(std::vector<memory::MemoryView<T, Device::CPU>>())) |
ex::transfer(di::getBackendScheduler<Backend::MC>(pika::execution::thread_priority::high)) |
ex::bulk(nthreads, [nthreads, n, nb](std::size_t thread_idx, auto& barrier_ptr, auto& k, auto& rho,
auto& d_tiles_futs, auto& z_tiles, auto& eval_tiles,
auto& evec_tiles, auto& ws_vecs) {
const auto& i2_tile_arr, auto& evec_tiles, auto& ws_vecs) {
const matrix::Distribution distr(LocalElementSize(n, n), TileElementSize(nb, nb));

const SizeType* i2_perm = i2_tile_arr[0].get().ptr();

const auto barrier_busy_wait = getTridiagRank1BarrierBusyWait();
const std::size_t batch_size = util::ceilDiv(to_sizet(k), nthreads);
const std::size_t begin = thread_idx * batch_size;
const std::size_t end = std::min(thread_idx * batch_size + batch_size, to_sizet(k));

// STEP 0: Initialize workspaces (single-thread)
// STEP 0a: Fill ones for deflated eigenvectors (single-thread)
// Note: this step is completely independent of the rest, but it is small and it is going
// to be dropped soon.
// Note: use the last thread, which in principle should have the least work to do
if (thread_idx == nthreads - 1) {
for (auto i = 0; i < n; ++i) {
const SizeType j = i2_perm[to_sizet(i)];

// if it is deflated
if (j >= k) {
const GlobalElementIndex ij(i, j);
const auto linear_ij = distr.globalTileLinearIndex(ij);
const auto ij_el = distr.tileElementIndex(ij);

evec_tiles[to_sizet(linear_ij)](ij_el) = 1;
}
}
}

// STEP 0b: Initialize workspaces (single-thread)
if (thread_idx == 0) {
ws_vecs.reserve(nthreads);
for (std::size_t i = 0; i < nthreads; ++i)
@@ -515,13 +537,30 @@ void solveRank1Problem(const SizeType i_begin, const SizeType i_end, KSender&& k
lapack::laed4(to_int(k), to_int(i), d_ptr, z_ptr, delta, rho, &eigenval);
}

// Note: for the in-place row permutation implementation, the rows should be permuted for the k=2 case as well.

// Note: laed4 handles k <= 2 cases differently
if (k <= 2)
if (k <= 2) {
// Note: The rows should be permuted for the k=2 case as well.
if (k == 2) {
T* ws = ws_vecs[thread_idx]();
for (int j = to_SizeType(begin); j < to_SizeType(end); ++j) {
const SizeType j_tile = distr.globalTileLinearIndex(GlobalElementIndex(0, j));
const SizeType j_col = distr.tileElementFromGlobalElement<Coord::Col>(j);
T* evec = evec_tiles[to_sizet(j_tile)].ptr(TileElementIndex(0, j_col));

std::copy(evec, evec + k, ws);
std::fill_n(evec, k, 0); // by default "deflated"
for (int i = 0; i < n; ++i) {
const SizeType ii = i2_perm[i];
if (ii < k)
evec[i] = ws[ii];
}
}
}
return;
}
}

// Note: This barrier ensures that LAED4 finished, so from now on values are available
barrier_ptr->arrive_and_wait(barrier_busy_wait);

// STEP 2a Compute weights (multi-thread)
@@ -593,28 +632,21 @@ void solveRank1Problem(const SizeType i_begin, const SizeType i_end, KSender&& k

const T vec_norm = blas::nrm2(k, s, 1);

for (auto i = 0; i < k; ++i) {
for (auto i = 0; i < n; ++i) {
const SizeType ii = i2_perm[i];
const auto q_tile = distr.globalTileLinearIndex({i, j});
const auto q_ij = distr.tileElementIndex({i, j});

q[to_sizet(q_tile)](q_ij) = s[i] / vec_norm;
if (ii < k)
q[to_sizet(q_tile)](q_ij) = s[ii] / vec_norm;
else
q[to_sizet(q_tile)](q_ij) = 0;
}
}
}
}));
}
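In the normalization loop above, the row permutation is applied while each eigenvector column is written: row i receives entry i2_perm[i] of the locally computed vector, and rows that map to deflated entries (i2_perm[i] >= k) are zeroed instead of being fixed up by a separate permutation pass. A minimal standalone sketch of that write for one column, assuming plain arrays and hypothetical names (the real code works tile by tile through matrix::Distribution):

#include <cmath>
#include <cstdint>

using SizeType = std::int64_t;

// Write one normalized eigenvector column: row i receives entry i2[i] of the
// unnormalized rank-1 solution s (k entries), rows mapping to deflated entries
// (i2[i] >= k) become zero, and the column is scaled by 1 / ||s||_2.
void write_normalized_column(SizeType n, SizeType k, const SizeType* i2,
                             const double* s, double* q_col) {
  double norm = 0.0;
  for (SizeType i = 0; i < k; ++i)
    norm += s[i] * s[i];
  norm = std::sqrt(norm);

  for (SizeType i = 0; i < n; ++i) {
    const SizeType ii = i2[i];
    q_col[i] = (ii < k) ? s[ii] / norm : 0.0;
  }
}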

template <class T, Device D, class KSender>
void setUnitDiag(const SizeType i_begin, const SizeType i_end, KSender&& k, Matrix<T, D>& mat) {
// Iterate over diagonal tiles
const matrix::Distribution& distr = mat.distribution();
for (SizeType i_tile = i_begin; i_tile < i_end; ++i_tile) {
const SizeType tile_begin = distr.globalTileElementDistance<Coord::Row>(i_begin, i_tile);

setUnitDiagonalAsync<D>(k, tile_begin, mat.readwrite(GlobalTileIndex(i_tile, i_tile)));
}
}

template <Backend B, Device D, class T, class RhoSender>
void mergeSubproblems(const SizeType i_begin, const SizeType i_split, const SizeType i_end,
RhoSender&& rho, WorkSpace<T, D>& ws, WorkSpaceHost<T>& ws_h,
@@ -697,22 +729,13 @@ void mergeSubproblems(const SizeType i_begin, const SizeType i_split, const Size
// The input is not required to be zero for solveRank1Problem.
matrix::util::set0<Backend::MC>(pika::execution::thread_priority::normal, idx_loc_begin, sz_loc_tiles,
ws_hm.e2);
solveRank1Problem(i_begin, i_end, k, scaled_rho, ws_hm.d1, ws_hm.z1, ws_h.d0, ws_hm.e2);

solveRank1Problem(i_begin, i_end, k, scaled_rho, ws_hm.d1, ws_hm.z1, ws_h.d0, ws_hm.i2, ws_hm.e2);
copy(idx_loc_begin, sz_loc_tiles, ws_hm.e2, ws.e2);

setUnitDiag(i_begin, i_end, k, ws.e2);

// Step #3: Eigenvectors of the tridiagonal system: Q * U
//
// The eigenvectors resulting from the multiplication are already in the order of the eigenvalues as
// prepared for the deflated system.
//
copy(idx_begin_tiles_vec, sz_tiles_vec, ws_hm.i2, ws.i2);
// The following permutation will be removed in the future.
// (The copy is needed to simplify the removal)
dlaf::permutations::permute<B, D, T, Coord::Row>(i_begin, i_end, ws.i2, ws.e2, ws.e0);
copy(idx_loc_begin, sz_loc_tiles, ws.e0, ws.e2);
dlaf::multiplication::generalSubMatrix<B, D, T>(i_begin, i_end, blas::Op::NoTrans, blas::Op::NoTrans,
T(1), ws.e1, ws.e2, T(0), ws.e0);

@@ -904,7 +927,7 @@ void solveRank1ProblemDist(CommSender&& row_comm, CommSender&& col_comm, const S
// Note:
// Considering that
// - LAED4 requires working on k elements
- // - Weight computaiton requires working on m_subm_el_lc
+ // - Weight computation requires working on m_subm_el_lc
//
// and they are needed at two steps that cannot happen in parallel, we opted for allocating the
// workspace with the higher memory requirement and reusing it for both steps.
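A minimal sketch of the sizing rule described in this note, with hypothetical names (the real code keeps memory::MemoryView buffers in ws_vecs, one per thread):

#include <algorithm>
#include <cstdint>
#include <vector>

using SizeType = std::int64_t;

// One workspace per thread, sized for the more demanding of the two phases:
// LAED4 uses the first k elements, and the weight computation later reuses the
// same allocation for m_subm_el_lc elements. A barrier separates the phases,
// so the reuse is safe.
template <class T>
std::vector<T> make_shared_workspace(SizeType k, SizeType m_subm_el_lc) {
  return std::vector<T>(static_cast<std::size_t>(std::max(k, m_subm_el_lc)));
}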
16 changes: 0 additions & 16 deletions src/eigensolver/tridiag_solver/kernels.cpp
@@ -93,20 +93,4 @@ void initIndexTile(SizeType offset, const matrix::Tile<SizeType, Device::CPU>& t
}
}

template <class T>
void setUnitDiagonal(const SizeType& k, const SizeType& tile_begin,
const matrix::Tile<T, Device::CPU>& tile) {
// If all elements of the tile are after the `k` index, reset the offset
SizeType tile_offset = k - tile_begin;
if (tile_offset < 0)
tile_offset = 0;

// Set all diagonal elements of the tile to 1.
for (SizeType i = tile_offset; i < tile.size().rows(); ++i) {
tile(TileElementIndex(i, i)) = 1;
}
}

DLAF_CPU_SET_UNIT_DIAGONAL_ETI(, float);
DLAF_CPU_SET_UNIT_DIAGONAL_ETI(, double);
}
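For reference, the clamping logic of the removed kernel in isolation: a standalone sketch on a plain column-major buffer, with hypothetical names:

#include <algorithm>
#include <cstdint>

using SizeType = std::int64_t;

// The tile holds global diagonal indices [tile_begin, tile_begin + rows); only
// entries with global index >= k are set to 1, i.e. local diagonal indices from
// max(0, k - tile_begin) onwards. When the whole tile lies past k, the clamp
// makes the loop start at 0 and the full diagonal is set.
template <class T>
void set_unit_diagonal_tail(SizeType k, SizeType tile_begin, SizeType rows,
                            SizeType ld, T* tile) {  // column-major tile data
  const SizeType first = std::max<SizeType>(0, k - tile_begin);
  for (SizeType i = first; i < rows; ++i)
    tile[i + i * ld] = T(1);
}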
54 changes: 0 additions & 54 deletions src/eigensolver/tridiag_solver/kernels.cu
@@ -185,60 +185,6 @@ void initIndexTile(SizeType offset, const matrix::Tile<SizeType, Device::GPU>& t
initIndexTile<<<nr_blocks, nr_threads, 0, stream>>>(offset, len, index_arr);
}

struct StrideOp {
SizeType ld;
SizeType offset;

__host__ __device__ __forceinline__ SizeType operator()(const SizeType i) const {
return offset + i * ld;
}
};

template <class T>
struct Row2ColMajor {
SizeType ld;
SizeType ncols;
T* data;

__host__ __device__ __forceinline__ T operator()(const SizeType idx) const {
SizeType i = idx / ncols;
SizeType j = idx - i * ncols;
return data[i + j * ld];
}
};

constexpr unsigned set_diag_kernel_sz = 256;

template <class T>
__global__ void setUnitDiagTileOnDevice(SizeType len, SizeType ld, T* tile) {
const SizeType i = blockIdx.x * set_diag_kernel_sz + threadIdx.x;
if (i >= len)
return;

tile[i + i * ld] = T(1);
}

template <class T>
void setUnitDiagonal(const SizeType& k, const SizeType& tile_begin,
const matrix::Tile<T, Device::GPU>& tile, whip::stream_t stream) {
SizeType tile_offset = k - tile_begin;
if (tile_offset < 0)
tile_offset = 0;
else if (tile_offset >= tile.size().rows())
return;

SizeType len = tile.size().rows() - tile_offset;
SizeType ld = tile.ld();
T* tile_ptr = tile.ptr(TileElementIndex(tile_offset, tile_offset));

dim3 nr_threads(set_diag_kernel_sz);
dim3 nr_blocks(util::ceilDiv(to_uint(len), set_diag_kernel_sz));
setUnitDiagTileOnDevice<<<nr_blocks, nr_threads, 0, stream>>>(len, ld, tile_ptr);
}

DLAF_GPU_SET_UNIT_DIAGONAL_ETI(, float);
DLAF_GPU_SET_UNIT_DIAGONAL_ETI(, double);

// -----------------------------------------
// This is a separate struct with a call operator instead of a lambda, because
// nvcc does not compile the file with a lambda.