diff --git a/include/dlaf/eigensolver/tridiag_solver/kernels.h b/include/dlaf/eigensolver/tridiag_solver/kernels.h
index bdea083853..c2af5b998c 100644
--- a/include/dlaf/eigensolver/tridiag_solver/kernels.h
+++ b/include/dlaf/eigensolver/tridiag_solver/kernels.h
@@ -273,43 +273,6 @@ void initIndexTileAsync(SizeType tile_row, TileSender&& tile) {
   di::transformDetach(di::Policy<B>(), initIndexTile_o, std::move(sender));
 }
 
-template <class T>
-void setUnitDiagonal(const SizeType& k, const SizeType& tile_begin,
-                     const matrix::Tile<T, Device::CPU>& tile);
-
-#define DLAF_CPU_SET_UNIT_DIAGONAL_ETI(kword, Type)                                  \
-  kword template void setUnitDiagonal(const SizeType& k, const SizeType& tile_begin, \
-                                      const matrix::Tile<Type, Device::CPU>& tile)
-
-DLAF_CPU_SET_UNIT_DIAGONAL_ETI(extern, float);
-DLAF_CPU_SET_UNIT_DIAGONAL_ETI(extern, double);
-
-#ifdef DLAF_WITH_GPU
-template <class T>
-void setUnitDiagonal(const SizeType& k, const SizeType& tile_begin,
-                     const matrix::Tile<T, Device::GPU>& tile, whip::stream_t stream);
-
-#define DLAF_GPU_SET_UNIT_DIAGONAL_ETI(kword, Type)                                  \
-  kword template void setUnitDiagonal(const SizeType& k, const SizeType& tile_begin, \
-                                      const matrix::Tile<Type, Device::GPU>& tile,   \
-                                      whip::stream_t stream)
-
-DLAF_GPU_SET_UNIT_DIAGONAL_ETI(extern, float);
-DLAF_GPU_SET_UNIT_DIAGONAL_ETI(extern, double);
-
-#endif
-
-DLAF_MAKE_CALLABLE_OBJECT(setUnitDiagonal);
-
-template <Backend B, class KSender, class TileSender>
-void setUnitDiagonalAsync(KSender&& k, SizeType tile_begin, TileSender&& tile) {
-  namespace di = dlaf::internal;
-  auto sender = di::whenAllLift(std::forward<KSender>(k), tile_begin, std::forward<TileSender>(tile));
-  di::transformDetach(di::Policy<B>(), setUnitDiagonal_o, std::move(sender));
-}
-
-// ---------------------------
-
 #ifdef DLAF_WITH_GPU
 
 // Returns the number of non-deflated entries
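Context for the removal above: after deflation, the eigenvector belonging to a deflated eigenvalue is simply a unit vector, and the rewritten rank-1 solver in merge.h (below) writes those entries directly through the deflation permutation `i2` while it assembles the eigenvector matrix, so the separate pass that set the unit diagonal (and the later row permutation) disappears. A minimal sketch of the idea on plain column-major storage; `fill_deflated_columns`, `perm` and `q` are illustrative names, not DLA-Future API:

```cpp
#include <cstddef>
#include <vector>

// For every row i whose deflation index perm[i] (the analogue of i2 below)
// points past k, the merged problem already has that eigenvector: column
// perm[i] of q is the i-th unit vector, so a single 1 is written at
// (i, perm[i]). q is an n x n column-major matrix assumed zero-initialized.
void fill_deflated_columns(std::size_t n, std::size_t k,
                           const std::vector<std::size_t>& perm, std::vector<double>& q) {
  for (std::size_t i = 0; i < n; ++i) {
    const std::size_t j = perm[i];
    if (j >= k)            // deflated entry
      q[i + j * n] = 1.0;  // column j becomes the i-th unit vector
  }
}
```

This is exactly the role of STEP 0a in the new `solveRank1Problem` below, just without tiles and the pika bulk machinery.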
diff --git a/include/dlaf/eigensolver/tridiag_solver/merge.h b/include/dlaf/eigensolver/tridiag_solver/merge.h
index bec63530e2..7c5705320d 100644
--- a/include/dlaf/eigensolver/tridiag_solver/merge.h
+++ b/include/dlaf/eigensolver/tridiag_solver/merge.h
@@ -460,7 +460,8 @@ auto applyDeflation(const SizeType i_begin, const SizeType i_end, RhoSender&& rh
 template <class T, class KSender, class RhoSender>
 void solveRank1Problem(const SizeType i_begin, const SizeType i_end, KSender&& k, RhoSender&& rho,
                        Matrix<T, Device::CPU>& d, Matrix<T, Device::CPU>& z,
-                       Matrix<T, Device::CPU>& evals, Matrix<T, Device::CPU>& evecs) {
+                       Matrix<T, Device::CPU>& evals, Matrix<const SizeType, Device::CPU>& i2,
+                       Matrix<T, Device::CPU>& evecs) {
   namespace ex = pika::execution::experimental;
   namespace di = dlaf::internal;
 
@@ -474,20 +475,41 @@ void solveRank1Problem(const SizeType i_begin, const SizeType i_end, KSender&& k
       ex::when_all(ex::just(std::make_unique<pika::barrier<>>(nthreads)), std::forward<KSender>(k),
                    std::forward<RhoSender>(rho), ex::when_all_vector(tc.read(d)),
                    ex::when_all_vector(tc.readwrite(z)), ex::when_all_vector(tc.readwrite(evals)),
-                   ex::when_all_vector(tc.readwrite(evecs)),
+                   ex::when_all_vector(tc.read(i2)), ex::when_all_vector(tc.readwrite(evecs)),
                    ex::just(std::vector<memory::MemoryView<T, Device::CPU>>())) |
       ex::transfer(di::getBackendScheduler<Backend::MC>(pika::execution::thread_priority::high)) |
       ex::bulk(nthreads, [nthreads, n, nb](std::size_t thread_idx, auto& barrier_ptr, auto& k,
                                            auto& rho, auto& d_tiles_futs, auto& z_tiles, auto& eval_tiles,
-                                           auto& evec_tiles, auto& ws_vecs) {
+                                           const auto& i2_tile_arr, auto& evec_tiles, auto& ws_vecs) {
         const matrix::Distribution distr(LocalElementSize(n, n), TileElementSize(nb, nb));
 
+        const SizeType* i2_perm = i2_tile_arr[0].get().ptr();
+
         const auto barrier_busy_wait = getTridiagRank1BarrierBusyWait();
         const std::size_t batch_size = util::ceilDiv(to_sizet(k), nthreads);
         const std::size_t begin = thread_idx * batch_size;
         const std::size_t end = std::min(thread_idx * batch_size + batch_size, to_sizet(k));
 
-        // STEP 0: Initialize workspaces (single-thread)
+        // STEP 0a: Fill ones for deflated eigenvectors (single-thread).
+        // Note: this step is completely independent of the rest, but it is small and it is going
+        //       to be dropped soon.
+        // Note: use the last thread, which in principle should have less work to do.
+        if (thread_idx == nthreads - 1) {
+          for (SizeType i = 0; i < n; ++i) {
+            const SizeType j = i2_perm[to_sizet(i)];
+
+            // if it is deflated
+            if (j >= k) {
+              const GlobalElementIndex ij(i, j);
+              const auto linear_ij = distr.globalTileLinearIndex(ij);
+              const auto ij_el = distr.tileElementIndex(ij);
+
+              evec_tiles[to_sizet(linear_ij)](ij_el) = 1;
+            }
+          }
+        }
+
+        // STEP 0b: Initialize workspaces (single-thread)
         if (thread_idx == 0) {
           ws_vecs.reserve(nthreads);
           for (std::size_t i = 0; i < nthreads; ++i)
@@ -515,13 +537,30 @@ void solveRank1Problem(const SizeType i_begin, const SizeType i_end, KSender&& k
             lapack::laed4(to_int(k), to_int(i), d_ptr, z_ptr, delta, rho, &eigenval);
           }
 
-          // Note: for in-place row permutation implementation: The rows should be permuted for the k=2 case as well.
-          // Note: laed4 handles k <= 2 cases differently
-          if (k <= 2)
+          if (k <= 2) {
+            // Note: The rows should be permuted for the k=2 case as well.
+            if (k == 2) {
+              T* ws = ws_vecs[thread_idx]();
+              for (SizeType j = to_SizeType(begin); j < to_SizeType(end); ++j) {
+                const SizeType j_tile = distr.globalTileLinearIndex(GlobalElementIndex(0, j));
+                const SizeType j_col = distr.tileElementFromGlobalElement<Coord::Col>(j);
+                T* evec = evec_tiles[to_sizet(j_tile)].ptr(TileElementIndex(0, j_col));
+
+                std::copy(evec, evec + k, ws);
+                std::fill_n(evec, k, 0);  // by default "deflated"
+                for (SizeType i = 0; i < n; ++i) {
+                  const SizeType ii = i2_perm[i];
+                  if (ii < k)
+                    evec[i] = ws[ii];
+                }
+              }
+            }
             return;
+          }
         }
 
+        // Note: This barrier ensures that LAED4 has finished, so from now on its values are available.
         barrier_ptr->arrive_and_wait(barrier_busy_wait);
 
         // STEP 2a Compute weights (multi-thread)
 
@@ -530,7 +569,7 @@ void solveRank1Problem(const SizeType i_begin, const SizeType i_end, KSender&& k
 
         // - copy diagonal from q -> w (or just initialize with 1)
         if (thread_idx == 0) {
-          for (auto i = 0; i < k; ++i) {
+          for (SizeType i = 0; i < k; ++i) {
             const GlobalElementIndex kk(i, i);
             const auto diag_tile = distr.globalTileLinearIndex(kk);
             const auto diag_element = distr.tileElementIndex(kk);
@@ -553,11 +592,11 @@ void solveRank1Problem(const SizeType i_begin, const SizeType i_end, KSender&& k
             w[i] *= q[to_sizet(q_tile)](q_ij) / (d_ptr[to_sizet(i)] - d_ptr[to_sizet(j)]);
           };
 
-          for (auto j = to_SizeType(begin); j < to_SizeType(end); ++j) {
-            for (auto i = 0; i < j; ++i)
+          for (SizeType j = to_SizeType(begin); j < to_SizeType(end); ++j) {
+            for (SizeType i = 0; i < j; ++i)
               compute_w({i, j});
 
-            for (auto i = j + 1; i < k; ++i)
+            for (SizeType i = j + 1; i < k; ++i)
               compute_w({i, j});
           }
 
@@ -565,7 +604,7 @@ void solveRank1Problem(const SizeType i_begin, const SizeType i_end, KSender&& k
 
         // STEP 2B: reduce, then finalize computation with sign and square root (single-thread)
         if (thread_idx == 0) {
-          for (int i = 0; i < k; ++i) {
+          for (SizeType i = 0; i < k; ++i) {
            for (std::size_t tidx = 1; tidx < nthreads; ++tidx) {
              const T* w_partial = ws_vecs[tidx]();
              w[i] *= w_partial[i];
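For reference, the math behind STEP 1 and STEPs 2a/2b above, mapped onto the code names (this is standard background on the rank-1 update, not text from the patch): `laed4` returns, one per call, the eigenvalues λ_j of the deflated problem D + ρ·zzᵀ with D = diag(d_1 < … < d_k), i.e. the roots of the secular equation, and the accumulated weights implement Gu and Eisenstat's stable recomputation of z, with the sign and the square root applied at the end of STEP 2B:

```latex
% Secular equation solved by laed4 (one eigenvalue \lambda_j per call):
f(\lambda) \,=\, 1 + \rho \sum_{i=1}^{k} \frac{z_i^2}{d_i - \lambda} \,=\, 0

% Stable recomputation of the z vector (the "weights" of STEPs 2a/2b),
% with the sign of the original z_i and a square root taken at the end:
\lvert \hat z_i \rvert^2 \,=\, \frac{\prod_{j=1}^{k} (\lambda_j - d_i)}{\prod_{j \neq i} (d_j - d_i)}
```

Each factor |λ_j − d_i| is read from the `delta` values that `laed4` stored in the columns of `q`, which is why the weight accumulator divides q(i, j) by (d_i − d_j).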
@@ -583,8 +622,8 @@ void solveRank1Problem(const SizeType i_begin, const SizeType i_end, KSender&& k
           const T* w = z_ptr;
           T* s = ws_vecs[thread_idx]();
 
-          for (auto j = to_SizeType(begin); j < to_SizeType(end); ++j) {
-            for (int i = 0; i < k; ++i) {
+          for (SizeType j = to_SizeType(begin); j < to_SizeType(end); ++j) {
+            for (SizeType i = 0; i < k; ++i) {
               const auto q_tile = distr.globalTileLinearIndex({i, j});
               const auto q_ij = distr.tileElementIndex({i, j});
 
@@ -593,28 +632,21 @@ void solveRank1Problem(const SizeType i_begin, const SizeType i_end, KSender&& k
 
             const T vec_norm = blas::nrm2(k, s, 1);
 
-            for (auto i = 0; i < k; ++i) {
+            for (SizeType i = 0; i < n; ++i) {
+              const SizeType ii = i2_perm[i];
               const auto q_tile = distr.globalTileLinearIndex({i, j});
               const auto q_ij = distr.tileElementIndex({i, j});
 
-              q[to_sizet(q_tile)](q_ij) = s[i] / vec_norm;
+              if (ii < k)
+                q[to_sizet(q_tile)](q_ij) = s[ii] / vec_norm;
+              else
+                q[to_sizet(q_tile)](q_ij) = 0;
             }
           }
         }
       }));
 }
 
-template <Backend B, Device D, class T, class KSender>
-void setUnitDiag(const SizeType i_begin, const SizeType i_end, KSender&& k, Matrix<T, D>& mat) {
-  // Iterate over diagonal tiles
-  const matrix::Distribution& distr = mat.distribution();
-  for (SizeType i_tile = i_begin; i_tile < i_end; ++i_tile) {
-    const SizeType tile_begin = distr.globalTileElementDistance<Coord::Row>(i_begin, i_tile);
-
-    setUnitDiagonalAsync<B>(k, tile_begin, mat.readwrite(GlobalTileIndex(i_tile, i_tile)));
-  }
-}
-
 template <Backend B, Device D, class T, class RhoSender>
 void mergeSubproblems(const SizeType i_begin, const SizeType i_split, const SizeType i_end,
                       RhoSender&& rho, WorkSpace<T, D>& ws, WorkSpaceHost<T>& ws_h,
@@ -697,22 +729,13 @@ void mergeSubproblems(const SizeType i_begin, const SizeType i_split, const Size
     // The input is not required to be zero for solveRank1Problem.
     matrix::util::set0(pika::execution::thread_priority::normal, idx_loc_begin, sz_loc_tiles,
                        ws_hm.e2);
-    solveRank1Problem(i_begin, i_end, k, scaled_rho, ws_hm.d1, ws_hm.z1, ws_h.d0, ws_hm.e2);
-
+    solveRank1Problem(i_begin, i_end, k, scaled_rho, ws_hm.d1, ws_hm.z1, ws_h.d0, ws_hm.i2, ws_hm.e2);
     copy(idx_loc_begin, sz_loc_tiles, ws_hm.e2, ws.e2);
-    setUnitDiag(i_begin, i_end, k, ws.e2);
-
     // Step #3: Eigenvectors of the tridiagonal system: Q * U
     //
     // The eigenvectors resulting from the multiplication are already in the order of the eigenvalues as
     // prepared for the deflated system.
-    //
-    copy(idx_begin_tiles_vec, sz_tiles_vec, ws_hm.i2, ws.i2);
-    // The following permutation will be removed in the future.
-    // (The copy is needed to simplify the removal)
-    dlaf::permutations::permute<B, D, T, Coord::Row>(i_begin, i_end, ws.i2, ws.e2, ws.e0);
-    copy(idx_loc_begin, sz_loc_tiles, ws.e0, ws.e2);
     dlaf::multiplication::generalSubMatrix<B, D, T>(i_begin, i_end, blas::Op::NoTrans,
                                                     blas::Op::NoTrans, T(1), ws.e1, ws.e2, T(0),
                                                     ws.e0);
@@ -904,7 +927,7 @@ void solveRank1ProblemDist(CommSender&& row_comm, CommSender&& col_comm, const S
       // Note:
       // Considering that
       // - LAED4 requires working on k elements
-      // - Weight computaiton requires working on m_subm_el_lc
+      // - Weight computation requires working on m_subm_el_lc
       //
       // and they are needed at two steps that cannot happen in parallel, we opted for allocating the
       // workspace with the highest requirement of memory, and reuse them for both steps.
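An aside before the distributed-solver hunks: the first hunk of this file's changes (STEP 3 of `solveRank1Problem`, shown above) assembles each non-deflated column as v_j(i) = ẑ_i / (d_i − λ_j), normalizes it, and scatters it through `i2_perm`, zeroing the rows that map to deflated entries. A self-contained recap on plain arrays; `write_column`, `perm` and `col` are illustrative stand-ins for the tile-based code, which uses `blas::nrm2`:

```cpp
#include <cmath>
#include <cstddef>
#include <vector>

// s holds the unnormalized eigenvector of the k x k deflated problem,
// s[i] = zhat_i / (d_i - lambda_j). The normalized values are scattered into
// the n rows of the output column through perm (analogue of i2_perm): row i
// reads s[perm[i]] if perm[i] < k and is zero otherwise (deflated row).
void write_column(std::size_t n, std::size_t k, const std::vector<std::size_t>& perm,
                  const std::vector<double>& s, std::vector<double>& col) {
  double norm = 0.0;
  for (std::size_t i = 0; i < k; ++i)
    norm += s[i] * s[i];
  norm = std::sqrt(norm);

  for (std::size_t i = 0; i < n; ++i) {
    const std::size_t ii = perm[i];
    col[i] = (ii < k) ? s[ii] / norm : 0.0;
  }
}
```

Together with the unit columns of STEP 0a this yields the full eigenvector matrix of the merged system in one pass, which is what makes the `setUnitDiag` and `permute` calls removed from `mergeSubproblems` above redundant.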
@@ -1083,7 +1106,7 @@ void solveRank1ProblemDist(CommSender&& row_comm, CommSender&& col_comm, const S
         // STEP 2c: reduce, then finalize computation with sign and square root (single-thread)
         if (thread_idx == 0) {
           // local reduction from all bulk workers
-          for (int i = 0; i < m_subm_el_lc; ++i) {
+          for (SizeType i = 0; i < m_subm_el_lc; ++i) {
             for (std::size_t tidx = 1; tidx < nthreads; ++tidx) {
               const T* w_partial = ws_cols[tidx]();
               w[i] *= w_partial[i];
@@ -1095,7 +1118,7 @@ void solveRank1ProblemDist(CommSender&& row_comm, CommSender&& col_comm, const S
                        transformMPI(all_reduce_in_place));
 
         T* weights = ws_cols[nthreads]();
-        for (int i_subm_el_lc = 0; i_subm_el_lc < m_subm_el_lc; ++i_subm_el_lc) {
+        for (SizeType i_subm_el_lc = 0; i_subm_el_lc < m_subm_el_lc; ++i_subm_el_lc) {
           const SizeType i_subm_lc = i_subm_el_lc / dist.blockSize().rows();
           const SizeType i_lc = ij_begin_lc.row() + i_subm_lc;
           const SizeType i = dist.globalTileFromLocalTile<Coord::Row>(i_lc);
diff --git a/src/eigensolver/tridiag_solver/kernels.cpp b/src/eigensolver/tridiag_solver/kernels.cpp
index 5770f54300..044f46a862 100644
--- a/src/eigensolver/tridiag_solver/kernels.cpp
+++ b/src/eigensolver/tridiag_solver/kernels.cpp
@@ -93,20 +93,4 @@ void initIndexTile(SizeType offset, const matrix::Tile<SizeType, Device::CPU>& t
   }
 }
 
-template <class T>
-void setUnitDiagonal(const SizeType& k, const SizeType& tile_begin,
-                     const matrix::Tile<T, Device::CPU>& tile) {
-  // If all elements of the tile are after the `k` index reset the offset
-  SizeType tile_offset = k - tile_begin;
-  if (tile_offset < 0)
-    tile_offset = 0;
-
-  // Set all diagonal elements of the tile to 1.
-  for (SizeType i = tile_offset; i < tile.size().rows(); ++i) {
-    tile(TileElementIndex(i, i)) = 1;
-  }
-}
-
-DLAF_CPU_SET_UNIT_DIAGONAL_ETI(, float);
-DLAF_CPU_SET_UNIT_DIAGONAL_ETI(, double);
 }
diff --git a/src/eigensolver/tridiag_solver/kernels.cu b/src/eigensolver/tridiag_solver/kernels.cu
index 1ed3260dfa..240d750437 100644
--- a/src/eigensolver/tridiag_solver/kernels.cu
+++ b/src/eigensolver/tridiag_solver/kernels.cu
@@ -185,60 +185,6 @@ void initIndexTile(SizeType offset, const matrix::Tile<SizeType, Device::GPU>& t
   initIndexTile<<<nr_blocks, nr_threads, 0, stream>>>(offset, len, index_arr);
 }
-struct StrideOp {
-  SizeType ld;
-  SizeType offset;
-
-  __host__ __device__ __forceinline__ SizeType operator()(const SizeType i) const {
-    return offset + i * ld;
-  }
-};
-
-template <class T>
-struct Row2ColMajor {
-  SizeType ld;
-  SizeType ncols;
-  T* data;
-
-  __host__ __device__ __forceinline__ T operator()(const SizeType idx) const {
-    SizeType i = idx / ncols;
-    SizeType j = idx - i * ncols;
-    return data[i + j * ld];
-  }
-};
-
-constexpr unsigned set_diag_kernel_sz = 256;
-
-template <class T>
-__global__ void setUnitDiagTileOnDevice(SizeType len, SizeType ld, T* tile) {
-  const SizeType i = blockIdx.x * set_diag_kernel_sz + threadIdx.x;
-  if (i >= len)
-    return;
-
-  tile[i + i * ld] = T(1);
-}
-
-template <class T>
-void setUnitDiagonal(const SizeType& k, const SizeType& tile_begin,
-                     const matrix::Tile<T, Device::GPU>& tile, whip::stream_t stream) {
-  SizeType tile_offset = k - tile_begin;
-  if (tile_offset < 0)
-    tile_offset = 0;
-  else if (tile_offset >= tile.size().rows())
-    return;
-
-  SizeType len = tile.size().rows() - tile_offset;
-  SizeType ld = tile.ld();
-  T* tile_ptr = tile.ptr(TileElementIndex(tile_offset, tile_offset));
-
-  dim3 nr_threads(set_diag_kernel_sz);
-  dim3 nr_blocks(util::ceilDiv(to_uint(len), set_diag_kernel_sz));
-  setUnitDiagTileOnDevice<<<nr_blocks, nr_threads, 0, stream>>>(len, ld, tile_ptr);
-}
-
-DLAF_GPU_SET_UNIT_DIAGONAL_ETI(, float);
-DLAF_GPU_SET_UNIT_DIAGONAL_ETI(, double);
-
 // -----------------------------------------
 
 // This is a separate struct with a call operator instead of a lambda, because
 // nvcc does not compile the file with a lambda.
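Finally, for readers who want to experiment with the `laed4` call at the core of STEP 1: it wraps LAPACK's DLAED4, which returns one eigenvalue of D + ρ·zzᵀ (and the corresponding d_i − λ differences in `delta`) per call. A minimal, self-contained sketch against the raw Fortran symbol; DLA-Future itself goes through the C++ wrapper `lapack::laed4` seen in merge.h, and the 32-bit LAPACK integer ABI below is an assumption:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Reference-LAPACK prototype of DLAED4(N, I, D, Z, DELTA, RHO, DLAM, INFO).
extern "C" void dlaed4_(const int* n, const int* i, const double* d, const double* z,
                        double* delta, const double* rho, double* dlam, int* info);

int main() {
  const int k = 3;
  const std::vector<double> d{0.1, 0.5, 0.9};              // sorted, pairwise distinct
  const std::vector<double> z{0.6, 0.6, std::sqrt(0.28)};  // ||z||_2 == 1
  const double rho = 1.0;
  std::vector<double> delta(3);  // on exit: d[i] - lambda_j for each i

  for (int j = 1; j <= k; ++j) {  // DLAED4 indexes eigenvalues starting from 1
    double lambda = 0.0;
    int info = 0;
    dlaed4_(&k, &j, d.data(), z.data(), delta.data(), &rho, &lambda, &info);
    std::printf("lambda_%d = %.6f (info = %d)\n", j, lambda, info);
  }
}
```

Compile with e.g. `g++ example.cpp -llapack`; each λ_j lands in the interval (d_j, d_{j+1}), with the last one above d_k, matching the interlacing property the weight formulas rely on.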