diff --git a/include/dlaf/eigensolver/tridiag_solver/impl.h b/include/dlaf/eigensolver/tridiag_solver/impl.h
index 4a49b93f01..ddc2db5b8c 100644
--- a/include/dlaf/eigensolver/tridiag_solver/impl.h
+++ b/include/dlaf/eigensolver/tridiag_solver/impl.h
@@ -70,9 +70,9 @@ inline std::vector<std::tuple<SizeType, SizeType>> generateSubproblemI
 template <class T>
 auto cuppensDecomposition(Matrix<T, Device::CPU>& tridiag) {
   namespace ex = pika::execution::experimental;
-  using sender_type = decltype(
-      ex::split(cuppensDecompAsync(tridiag.readwrite_sender(std::declval<LocalTileIndex>()),
-                                   tridiag.readwrite_sender(std::declval<LocalTileIndex>()))));
+  using sender_type = decltype(ex::split(ex::ensure_started(
+      cuppensDecompAsync(tridiag.readwrite_sender(std::declval<LocalTileIndex>()),
+                         tridiag.readwrite_sender(std::declval<LocalTileIndex>())))));
   using vector_type = std::vector<sender_type>;
 
   if (tridiag.nrTiles().rows() == 0)
@@ -83,9 +83,9 @@ auto cuppensDecomposition(Matrix<T, Device::CPU>& tridiag) {
   offdiag_vals.reserve(to_sizet(i_end));
 
   for (SizeType i_split = 0; i_split < i_end; ++i_split) {
-    offdiag_vals.push_back(
-        ex::split(cuppensDecompAsync(tridiag.readwrite_sender(LocalTileIndex(i_split, 0)),
-                                     tridiag.readwrite_sender(LocalTileIndex(i_split + 1, 0)))));
+    offdiag_vals.push_back(ex::split(ex::ensure_started(
+        cuppensDecompAsync(tridiag.readwrite_sender(LocalTileIndex(i_split, 0)),
+                           tridiag.readwrite_sender(LocalTileIndex(i_split + 1, 0)))))));
   }
   return offdiag_vals;
 }
@@ -329,20 +329,21 @@ void TridiagSolver<B, D, T>::call(comm::CommunicatorGrid grid, Matrix<T, Device:
-  DistWorkSpace<T, D> ws{Matrix<T, D>(dist_evecs),   // mat1
-                         Matrix<T, D>(dist_evecs),   // mat2
-                         Matrix<T, D>(dist_evals),   // z
-                         Matrix<T, D>(dist_evals)};  // ztmp
+  WorkSpace<T, D> ws{Matrix<T, D>(dist_evecs),          // mat1
+                     Matrix<T, D>(dist_evecs),          // mat2
+                     Matrix<T, D>(dist_evals),          // z
+                     Matrix<T, D>(dist_evals),          // ztmp
+                     Matrix<SizeType, D>(dist_evals)};  // i2
 
-  DistWorkSpaceHost<T> ws_h{Matrix<T, Device::CPU>(dist_evals),         // dtmp
-                            Matrix<SizeType, Device::CPU>(dist_evals),  // i1
-                            Matrix<SizeType, Device::CPU>(dist_evals),  // i2
-                            Matrix<SizeType, Device::CPU>(dist_evals),  // i3
-                            Matrix<ColType, Device::CPU>(dist_evals)};  // c
+  WorkSpaceHost<T> ws_h{Matrix<T, Device::CPU>(dist_evals),         // dtmp
+                        Matrix<SizeType, Device::CPU>(dist_evals),  // i1
+                        Matrix<SizeType, Device::CPU>(dist_evals),  // i3
+                        Matrix<ColType, Device::CPU>(dist_evals)};  // c
 
   DistWorkSpaceHostMirror<T, D> ws_hm{initMirrorMatrix(evals), initMirrorMatrix(evecs),
                                       initMirrorMatrix(ws.mat1), initMirrorMatrix(ws.mat2),
-                                      initMirrorMatrix(ws.z), initMirrorMatrix(ws.ztmp)};
+                                      initMirrorMatrix(ws.z), initMirrorMatrix(ws.ztmp),
+                                      initMirrorMatrix(ws.i2)};
 
   // Set `evecs` to `zero` (needed for Givens rotation to make sure no random values are picked up)
   matrix::util::set0(pika::execution::thread_priority::normal, evecs);
diff --git a/include/dlaf/eigensolver/tridiag_solver/kernels.h b/include/dlaf/eigensolver/tridiag_solver/kernels.h
index 483a30d533..b12b6482dd 100644
--- a/include/dlaf/eigensolver/tridiag_solver/kernels.h
+++ b/include/dlaf/eigensolver/tridiag_solver/kernels.h
@@ -277,15 +277,16 @@ void initIndexTileAsync(SizeType tile_row, TileSender&& tile) {
 }
 
 template <class T>
-void divideEvecsByDiagonal(const SizeType& k, const SizeType& i_subm_el, const SizeType& j_subm_el,
+void divideEvecsByDiagonal(const SizeType& k_row, const SizeType& k_col, const SizeType& i_subm_el,
+                           const SizeType& j_subm_el,
                            const matrix::Tile<const T, Device::CPU>& diag_rows,
                            const matrix::Tile<const T, Device::CPU>& diag_cols,
                            const matrix::Tile<const T, Device::CPU>& evecs_tile,
                            const matrix::Tile<T, Device::CPU>& ws_tile);
 
 #define DLAF_CPU_DIVIDE_EVECS_BY_DIAGONAL_ETI(kword, Type)                                            \
-  kword template void divideEvecsByDiagonal(const SizeType& k, const SizeType& i_subm_el,             \
-                                            const SizeType& j_subm_el,                                \
+  kword template void divideEvecsByDiagonal(const SizeType& k_row, const SizeType& k_col,             \
+                                            const SizeType& i_subm_el, const SizeType& j_subm_el,     \
                                             const matrix::Tile<const Type, Device::CPU>& diag_rows,   \
                                             const matrix::Tile<const Type, Device::CPU>& diag_cols,   \
                                             const matrix::Tile<const Type, Device::CPU>& evecs_tile,  \
@@ -296,15 +297,16 @@ DLAF_CPU_DIVIDE_EVECS_BY_DIAGONAL_ETI(extern, double);
 
 #ifdef DLAF_WITH_GPU
 template <class T>
-void divideEvecsByDiagonal(const SizeType& k, const SizeType& i_subm_el, const SizeType& j_subm_el,
+void divideEvecsByDiagonal(const SizeType& k_row, const SizeType& k_col, const SizeType& i_subm_el,
+                           const SizeType& j_subm_el,
                            const matrix::Tile<const T, Device::GPU>& diag_rows,
                            const matrix::Tile<const T, Device::GPU>& diag_cols,
                            const matrix::Tile<const T, Device::GPU>& evecs_tile,
                            const matrix::Tile<T, Device::GPU>& ws_tile, whip::stream_t stream);
 
 #define DLAF_GPU_DIVIDE_EVECS_BY_DIAGONAL_ETI(kword, Type)                                            \
-  kword template void divideEvecsByDiagonal(const SizeType& k, const SizeType& i_subm_el,             \
-                                            const SizeType& j_subm_el,                                \
+  kword template void divideEvecsByDiagonal(const SizeType& k_row, const SizeType& k_col,             \
+                                            const SizeType& i_subm_el, const SizeType& j_subm_el,     \
                                             const matrix::Tile<const Type, Device::GPU>& diag_rows,   \
                                             const matrix::Tile<const Type, Device::GPU>& diag_cols,   \
                                             const matrix::Tile<const Type, Device::GPU>& evecs_tile,  \
@@ -317,14 +319,15 @@ DLAF_GPU_DIVIDE_EVECS_BY_DIAGONAL_ETI(extern, double);
 
 DLAF_MAKE_CALLABLE_OBJECT(divideEvecsByDiagonal);
 
-template <Device D, class KSender, class DiagRowsTileSender, class DiagColsTileSender,
-          class EvecsTileSender, class TempTileSender>
-void divideEvecsByDiagonalAsync(KSender&& k, SizeType i_subm_el, SizeType j_subm_el,
-                                DiagRowsTileSender&& diag_rows, DiagColsTileSender&& diag_cols,
-                                EvecsTileSender&& evecs, TempTileSender&& temp) {
+template <Device D, class KRSender, class KCSender, class DiagRowsTileSender,
+          class DiagColsTileSender, class EvecsTileSender, class TempTileSender>
+void divideEvecsByDiagonalAsync(KRSender&& k_row, KCSender&& k_col, SizeType i_subm_el,
+                                SizeType j_subm_el, DiagRowsTileSender&& diag_rows,
+                                DiagColsTileSender&& diag_cols, EvecsTileSender&& evecs,
+                                TempTileSender&& temp) {
   namespace di = dlaf::internal;
 
   auto sender =
-      di::whenAllLift(std::forward<KSender>(k), i_subm_el, j_subm_el,
+      di::whenAllLift(std::forward<KRSender>(k_row), std::forward<KCSender>(k_col), i_subm_el, j_subm_el,
                       std::forward<DiagRowsTileSender>(diag_rows),
                       std::forward<DiagColsTileSender>(diag_cols),
                       std::forward<EvecsTileSender>(evecs), std::forward<TempTileSender>(temp));
@@ -332,13 +335,14 @@ void divideEvecsByDiagonalAsync(KSender&& k, SizeType i_subm_el, SizeType j_subm
 }
 
 template <class T>
-void multiplyFirstColumns(const SizeType& k, const SizeType& row, const SizeType& col,
-                          const matrix::Tile<const T, Device::CPU>& in,
+void multiplyFirstColumns(const SizeType& k_row, const SizeType& k_col, const SizeType& row,
+                          const SizeType& col, const matrix::Tile<const T, Device::CPU>& in,
                           const matrix::Tile<T, Device::CPU>& out);
 
-#define DLAF_CPU_MULTIPLY_FIRST_COLUMNS_ETI(kword, Type)                                                 \
-  kword template void multiplyFirstColumns(const SizeType& k, const SizeType& row, const SizeType& col, \
-                                           const matrix::Tile<const Type, Device::CPU>& in,             \
+#define DLAF_CPU_MULTIPLY_FIRST_COLUMNS_ETI(kword, Type)                                                 \
+  kword template void multiplyFirstColumns(const SizeType& k_row, const SizeType& k_col,                 \
+                                           const SizeType& row, const SizeType& col,                     \
+                                           const matrix::Tile<const Type, Device::CPU>& in,              \
                                            const matrix::Tile<Type, Device::CPU>& out)
 
 DLAF_CPU_MULTIPLY_FIRST_COLUMNS_ETI(extern, float);
@@ -346,14 +350,15 @@ DLAF_CPU_MULTIPLY_FIRST_COLUMNS_ETI(extern, double);
 
 #ifdef DLAF_WITH_GPU
 template <class T>
-void multiplyFirstColumns(const SizeType& k, const SizeType& row, const SizeType& col,
-                          const matrix::Tile<const T, Device::GPU>& in,
+void multiplyFirstColumns(const SizeType& k_row, const SizeType& k_col, const SizeType& row,
+                          const SizeType& col, const matrix::Tile<const T, Device::GPU>& in,
                           const matrix::Tile<T, Device::GPU>& out, whip::stream_t stream);
 
-#define DLAF_GPU_MULTIPLY_FIRST_COLUMNS_ETI(kword, Type)                                                 \
-  kword template void multiplyFirstColumns(const SizeType& k, const SizeType& row, const SizeType& col, \
-                                           const matrix::Tile<const Type, Device::GPU>& in,             \
-                                           const matrix::Tile<Type, Device::GPU>& out,                  \
+#define DLAF_GPU_MULTIPLY_FIRST_COLUMNS_ETI(kword, Type)                                                 \
+  kword template void multiplyFirstColumns(const SizeType& k_row, const SizeType& k_col,                 \
+                                           const SizeType& row, const SizeType& col,                     \
+                                           const matrix::Tile<const Type, Device::GPU>& in,              \
+                                           const matrix::Tile<Type, Device::GPU>& out,                   \
                                            whip::stream_t stream)
 
 DLAF_GPU_MULTIPLY_FIRST_COLUMNS_ETI(extern, float);
@@ -362,24 +367,24 @@ DLAF_GPU_MULTIPLY_FIRST_COLUMNS_ETI(extern, double);
 
 DLAF_MAKE_CALLABLE_OBJECT(multiplyFirstColumns);
 
-template <Device D, class KSender, class InTileSender, class OutTileSender>
-void multiplyFirstColumnsAsync(KSender&& k, SizeType row, SizeType col, InTileSender&& in,
-                               OutTileSender&& out) {
+template <Device D, class KRSender, class KCSender, class InTileSender, class OutTileSender>
+void multiplyFirstColumnsAsync(KRSender&& k_row, KCSender&& k_col, SizeType row, SizeType col,
+                               InTileSender&& in, OutTileSender&& out) {
   namespace di = dlaf::internal;
-  auto sender = di::whenAllLift(std::forward<KSender>(k), row, col, std::forward<InTileSender>(in),
+  auto sender = di::whenAllLift(k_row, k_col, row, col, std::forward<InTileSender>(in),
                                 std::forward<OutTileSender>(out));
   di::transformDetach(di::Policy<DefaultBackend_v<D>>(), multiplyFirstColumns_o, std::move(sender));
 }
 
 template <class T>
-void calcEvecsFromWeightVec(const SizeType& k, const SizeType& row, const SizeType& col,
-                            const matrix::Tile<const T, Device::CPU>& z_tile,
+void calcEvecsFromWeightVec(const SizeType& k_row, const SizeType& k_col, const SizeType& row,
+                            const SizeType& col, const matrix::Tile<const T, Device::CPU>& z_tile,
                             const matrix::Tile<const T, Device::CPU>& ws_tile,
                             const matrix::Tile<T, Device::CPU>& evecs_tile);
 
 #define DLAF_CPU_CALC_EVECS_FROM_WEIGHT_VEC_ETI(kword, Type)                                         \
-  kword template void calcEvecsFromWeightVec(const SizeType& k, const SizeType& row,                \
-                                             const SizeType& col,                                   \
+  kword template void calcEvecsFromWeightVec(const SizeType& k_row, const SizeType& k_col,          \
+                                             const SizeType& row, const SizeType& col,              \
                                              const matrix::Tile<const Type, Device::CPU>& z_tile,   \
                                              const matrix::Tile<const Type, Device::CPU>& ws_tile,  \
                                              const matrix::Tile<Type, Device::CPU>& evecs_tile)
@@ -389,14 +394,14 @@ DLAF_CPU_CALC_EVECS_FROM_WEIGHT_VEC_ETI(extern, double);
 
 #ifdef DLAF_WITH_GPU
 template <class T>
-void calcEvecsFromWeightVec(const SizeType& k, const SizeType& row, const SizeType& col,
-                            const matrix::Tile<const T, Device::GPU>& z_tile,
+void calcEvecsFromWeightVec(const SizeType& k_row, const SizeType& k_col, const SizeType& row,
+                            const SizeType& col, const matrix::Tile<const T, Device::GPU>& z_tile,
                             const matrix::Tile<const T, Device::GPU>& ws_tile,
                             const matrix::Tile<T, Device::GPU>& evecs_tile, whip::stream_t stream);
 
 #define DLAF_GPU_CALC_EVECS_FROM_WEIGHT_VEC_ETI(kword, Type)                                         \
-  kword template void calcEvecsFromWeightVec(const SizeType& k, const SizeType& row,                \
-                                             const SizeType& col,                                   \
+  kword template void calcEvecsFromWeightVec(const SizeType& k_row, const SizeType& k_col,          \
+                                             const SizeType& row, const SizeType& col,              \
                                              const matrix::Tile<const Type, Device::GPU>& z_tile,   \
                                              const matrix::Tile<const Type, Device::GPU>& ws_tile,  \
                                              const matrix::Tile<Type, Device::GPU>& evecs_tile,     \
@@ -408,25 +413,28 @@ DLAF_GPU_CALC_EVECS_FROM_WEIGHT_VEC_ETI(extern, double);
 
 DLAF_MAKE_CALLABLE_OBJECT(calcEvecsFromWeightVec);
 
-template <Device D, class KSender, class Rank1TileSender, class TempTileSender, class EvecsTileSender>
-void calcEvecsFromWeightVecAsync(KSender&& k, SizeType row, SizeType col, Rank1TileSender&& rank1,
-                                 TempTileSender&& temp, EvecsTileSender&& evecs) {
+template <Device D, class KRSender, class KCSender, class Rank1TileSender, class TempTileSender,
+          class EvecsTileSender>
+void calcEvecsFromWeightVecAsync(KRSender&& k_row, KCSender&& k_col, SizeType row, SizeType col,
+                                 Rank1TileSender&& rank1, TempTileSender&& temp,
+                                 EvecsTileSender&& evecs) {
   namespace di = dlaf::internal;
 
   auto sender =
-      di::whenAllLift(std::forward<KSender>(k), row, col, std::forward<Rank1TileSender>(rank1),
+      di::whenAllLift(k_row, k_col, row, col, std::forward<Rank1TileSender>(rank1),
                       std::forward<TempTileSender>(temp), std::forward<EvecsTileSender>(evecs));
   di::transformDetach(di::Policy<DefaultBackend_v<D>>(), calcEvecsFromWeightVec_o, std::move(sender));
 }
 
 template <class T>
-void sumsqCols(const SizeType& k, const SizeType& row, const SizeType& col,
+void sumsqCols(const SizeType& k_row, const SizeType& k_col, const SizeType& row, const SizeType& col,
               const matrix::Tile<const T, Device::CPU>& evecs_tile,
               const matrix::Tile<T, Device::CPU>& ws_tile);
 
-#define DLAF_CPU_SUMSQ_COLS_ETI(kword, Type)                                                        \
-  kword template void sumsqCols(const SizeType& k, const SizeType& row, const SizeType& col,        \
-                                const matrix::Tile<const Type, Device::CPU>& evecs_tile,            \
+#define DLAF_CPU_SUMSQ_COLS_ETI(kword, Type)                                                        \
+  kword template void sumsqCols(const SizeType& k_row, const SizeType& k_col, const SizeType& row,  \
+                                const SizeType& col,                                                \
+                                const matrix::Tile<const Type, Device::CPU>& evecs_tile,            \
                                 const matrix::Tile<Type, Device::CPU>& ws_tile)
 
 DLAF_CPU_SUMSQ_COLS_ETI(extern, float);
@@ -434,13 +442,14 @@ DLAF_CPU_SUMSQ_COLS_ETI(extern, double);
 
 #ifdef DLAF_WITH_GPU
 template <class T>
-void sumsqCols(const SizeType& k, const SizeType& row, const SizeType& col,
+void sumsqCols(const SizeType& k_row, const SizeType& k_col, const SizeType& row, const SizeType& col,
               const matrix::Tile<const T, Device::GPU>& evecs_tile,
               const matrix::Tile<T, Device::GPU>& ws_tile, whip::stream_t stream);
 
-#define DLAF_GPU_SUMSQ_COLS_ETI(kword, Type)                                                        \
-  kword template void sumsqCols(const SizeType& k, const SizeType& row, const SizeType& col,        \
-                                const matrix::Tile<const Type, Device::GPU>& evecs_tile,            \
+#define DLAF_GPU_SUMSQ_COLS_ETI(kword, Type)                                                        \
+  kword template void sumsqCols(const SizeType& k_row, const SizeType& k_col, const SizeType& row,  \
+                                const SizeType& col,                                                \
+                                const matrix::Tile<const Type, Device::GPU>& evecs_tile,            \
                                 const matrix::Tile<Type, Device::GPU>& ws_tile, whip::stream_t stream)
 
 DLAF_GPU_SUMSQ_COLS_ETI(extern, float);
@@ -449,23 +458,25 @@ DLAF_GPU_SUMSQ_COLS_ETI(extern, double);
 
 DLAF_MAKE_CALLABLE_OBJECT(sumsqCols);
 
-template <Device D, class KSender, class EvecsTileSender, class TempTileSender>
-void sumsqColsAsync(KSender&& k, SizeType row, SizeType col, EvecsTileSender&& evecs,
-                    TempTileSender&& temp) {
+template <Device D, class KRSender, class KCSender, class EvecsTileSender, class TempTileSender>
+void sumsqColsAsync(KRSender&& k_row, KCSender&& k_col, SizeType row, SizeType col,
+                    EvecsTileSender&& evecs, TempTileSender&& temp) {
   namespace di = dlaf::internal;
-  auto sender = di::whenAllLift(std::forward<KSender>(k), row, col, std::forward<EvecsTileSender>(evecs),
-                                std::forward<TempTileSender>(temp));
+  auto sender =
+      di::whenAllLift(std::forward<KRSender>(k_row), std::forward<KCSender>(k_col), row, col,
+                      std::forward<EvecsTileSender>(evecs), std::forward<TempTileSender>(temp));
   di::transformDetach(di::Policy<DefaultBackend_v<D>>(), sumsqCols_o, std::move(sender));
 }
 
 template <class T>
-void addFirstRows(const SizeType& k, const SizeType& row, const SizeType& col,
+void addFirstRows(const SizeType& k_row, const SizeType& k_col, const SizeType& row, const SizeType& col,
                   const matrix::Tile<const T, Device::CPU>& in,
                   const matrix::Tile<T, Device::CPU>& out);
 
-#define DLAF_CPU_ADD_FIRST_ROWS_ETI(kword, Type)                                                       \
-  kword template void addFirstRows(const SizeType& k, const SizeType& row, const SizeType& col,        \
-                                   const matrix::Tile<const Type, Device::CPU>& in,                    \
+#define DLAF_CPU_ADD_FIRST_ROWS_ETI(kword, Type)                                                       \
+  kword template void addFirstRows(const SizeType& k_row, const SizeType& k_col, const SizeType& row,  \
+                                   const SizeType& col,                                                \
+                                   const matrix::Tile<const Type, Device::CPU>& in,                    \
                                    const matrix::Tile<Type, Device::CPU>& out)
 
 DLAF_CPU_ADD_FIRST_ROWS_ETI(extern, float);
@@ -473,13 +484,14 @@ DLAF_CPU_ADD_FIRST_ROWS_ETI(extern, double);
 
 #ifdef DLAF_WITH_GPU
 template <class T>
-void addFirstRows(const SizeType& k, const SizeType& row, const SizeType& col,
+void addFirstRows(const SizeType& k_row, const SizeType& k_col, const SizeType& row, const SizeType& col,
                   const matrix::Tile<const T, Device::GPU>& evecs_tile,
                   const matrix::Tile<T, Device::GPU>& ws_tile, whip::stream_t stream);
 
-#define DLAF_GPU_ADD_FIRST_ROWS_ETI(kword, Type)                                                       \
-  kword template void addFirstRows(const SizeType& k, const SizeType& row, const SizeType& col,        \
-                                   const matrix::Tile<const Type, Device::GPU>& in,                    \
+#define DLAF_GPU_ADD_FIRST_ROWS_ETI(kword, Type)                                                       \
+  kword template void addFirstRows(const SizeType& k_row, const SizeType& k_col, const SizeType& row,  \
+                                   const SizeType& col,                                                \
+                                   const matrix::Tile<const Type, Device::GPU>& in,                    \
                                    const matrix::Tile<Type, Device::GPU>& out, whip::stream_t stream)
 
 DLAF_GPU_ADD_FIRST_ROWS_ETI(extern, float);
@@ -488,23 +500,25 @@ DLAF_GPU_ADD_FIRST_ROWS_ETI(extern, double);
 
 DLAF_MAKE_CALLABLE_OBJECT(addFirstRows);
 
-template <Device D, class KSender, class InTileSender, class OutTileSender>
-void addFirstRowsAsync(KSender&& k, SizeType row, SizeType col, InTileSender&& in, OutTileSender&& out) {
+template <Device D, class KRSender, class KCSender, class InTileSender, class OutTileSender>
+void addFirstRowsAsync(KRSender&& k_row, KCSender&& k_col, SizeType row, SizeType col, InTileSender&& in,
+                       OutTileSender&& out) {
   namespace di = dlaf::internal;
-  auto sender = di::whenAllLift(std::forward<KSender>(k), row, col, std::forward<InTileSender>(in),
-                                std::forward<OutTileSender>(out));
+  auto sender = di::whenAllLift(std::forward<KRSender>(k_row), std::forward<KCSender>(k_col), row, col,
+                                std::forward<InTileSender>(in), std::forward<OutTileSender>(out));
   di::transformDetach(di::Policy<DefaultBackend_v<D>>(), addFirstRows_o, std::move(sender));
 }
 
 template <class T>
-void divideColsByFirstRow(const SizeType& k, const SizeType& row, const SizeType& col,
-                          const matrix::Tile<const T, Device::CPU>& in,
+void divideColsByFirstRow(const SizeType& k_row, const SizeType& k_col, const SizeType& row,
+                          const SizeType& col, const matrix::Tile<const T, Device::CPU>& in,
                           const matrix::Tile<T, Device::CPU>& out);
 
-#define DLAF_CPU_DIVIDE_COLS_BY_FIRST_ROW_ETI(kword, Type)                                               \
-  kword template void divideColsByFirstRow(const SizeType& k, const SizeType& row, const SizeType& col, \
-                                           const matrix::Tile<const Type, Device::CPU>& in,             \
+#define DLAF_CPU_DIVIDE_COLS_BY_FIRST_ROW_ETI(kword, Type)                                               \
+  kword template void divideColsByFirstRow(const SizeType& k_row, const SizeType& k_col,                 \
+                                           const SizeType& row, const SizeType& col,                     \
+                                           const matrix::Tile<const Type, Device::CPU>& in,              \
                                            const matrix::Tile<Type, Device::CPU>& out)
 
 DLAF_CPU_DIVIDE_COLS_BY_FIRST_ROW_ETI(extern, float);
@@ -512,14 +526,15 @@ DLAF_CPU_DIVIDE_COLS_BY_FIRST_ROW_ETI(extern, double);
 
 #ifdef DLAF_WITH_GPU
 template <class T>
-void divideColsByFirstRow(const SizeType& k, const SizeType& row, const SizeType& col,
-                          const matrix::Tile<const T, Device::GPU>& evecs_tile,
+void divideColsByFirstRow(const SizeType& k_row, const SizeType& k_col, const SizeType& row,
+                          const SizeType& col, const matrix::Tile<const T, Device::GPU>& evecs_tile,
                           const matrix::Tile<T, Device::GPU>& ws_tile, whip::stream_t stream);
 
-#define DLAF_GPU_DIVIDE_COLS_BY_FIRST_ROW_ETI(kword, Type)                                               \
-  kword template void divideColsByFirstRow(const SizeType& k, const SizeType& row, const SizeType& col, \
-                                           const matrix::Tile<const Type, Device::GPU>& in,             \
-                                           const matrix::Tile<Type, Device::GPU>& out,                  \
+#define DLAF_GPU_DIVIDE_COLS_BY_FIRST_ROW_ETI(kword, Type)                                               \
+  kword template void divideColsByFirstRow(const SizeType& k_row, const SizeType& k_col,                 \
+                                           const SizeType& row, const SizeType& col,                     \
+                                           const matrix::Tile<const Type, Device::GPU>& in,              \
+                                           const matrix::Tile<Type, Device::GPU>& out,                   \
                                            whip::stream_t stream)
 
 DLAF_GPU_DIVIDE_COLS_BY_FIRST_ROW_ETI(extern, float);
@@ -528,13 +543,13 @@ DLAF_GPU_DIVIDE_COLS_BY_FIRST_ROW_ETI(extern, double);
 
 DLAF_MAKE_CALLABLE_OBJECT(divideColsByFirstRow);
 
-template <Device D, class KSender, class InTileSender, class OutTileSender>
-void divideColsByFirstRowAsync(KSender&& k, SizeType row, SizeType col, InTileSender&& in,
-                               OutTileSender&& out) {
+template <Device D, class KRSender, class KCSender, class InTileSender, class OutTileSender>
+void divideColsByFirstRowAsync(KRSender&& k_row, KCSender&& k_col, SizeType row, SizeType col,
+                               InTileSender&& in, OutTileSender&& out) {
   namespace di = dlaf::internal;
-  auto sender = di::whenAllLift(std::forward<KSender>(k), row, col, std::forward<InTileSender>(in),
-                                std::forward<OutTileSender>(out));
+  auto sender = di::whenAllLift(std::forward<KRSender>(k_row), std::forward<KCSender>(k_col), row, col,
+                                std::forward<InTileSender>(in), std::forward<OutTileSender>(out));
   di::transformDetach(di::Policy<DefaultBackend_v<D>>(), divideColsByFirstRow_o, std::move(sender));
 }
 
@@ -577,14 +592,14 @@ void setUnitDiagonalAsync(KSender&& k, SizeType tile_begin, TileSender&& tile) {
 // out_coord) of @p out_tile that is inside a square defined by the global element index @p k where both
 // tiles begin at global element index (@p row, @p col).
 template <class T>
-void copy1D(const SizeType& k, const SizeType& row, const SizeType& col, const Coord& in_coord,
-            const matrix::Tile<const T, Device::CPU>& in_tile, const Coord& out_coord,
-            const matrix::Tile<T, Device::CPU>& out_tile);
-
-#define DLAF_CPU_COPY_1D_ETI(kword, Type)                                                         \
-  kword template void copy1D(const SizeType& k, const SizeType& row, const SizeType& col,         \
-                             const Coord& in_coord,                                               \
-                             const matrix::Tile<const Type, Device::CPU>& in_tile,                \
+void copy1D(const SizeType& k_row, const SizeType& k_col, const SizeType& row, const SizeType& col,
+            const Coord& in_coord, const matrix::Tile<const T, Device::CPU>& in_tile,
+            const Coord& out_coord, const matrix::Tile<T, Device::CPU>& out_tile);
+
+#define DLAF_CPU_COPY_1D_ETI(kword, Type)                                                         \
+  kword template void copy1D(const SizeType& k_row, const SizeType& k_col, const SizeType& row,   \
+                             const SizeType& col, const Coord& in_coord,                          \
+                             const matrix::Tile<const Type, Device::CPU>& in_tile,                \
                              const Coord& out_coord, const matrix::Tile<Type, Device::CPU>& out_tile)
 
 DLAF_CPU_COPY_1D_ETI(extern, float);
@@ -593,14 +608,15 @@ DLAF_CPU_COPY_1D_ETI(extern, double);
 
 #ifdef DLAF_WITH_GPU
 template <class T>
-void copy1D(cublasHandle_t handle, const SizeType& k, const SizeType& row, const SizeType& col,
-            const Coord& in_coord, const matrix::Tile<const T, Device::GPU>& in_tile,
-            const Coord& out_coord, const matrix::Tile<T, Device::GPU>& out_tile);
-
-#define DLAF_GPU_COPY_1D_ETI(kword, Type)                                                         \
-  kword template void copy1D(cublasHandle_t handle, const SizeType& k, const SizeType& row,       \
-                             const SizeType& col, const Coord& in_coord,                          \
-                             const matrix::Tile<const Type, Device::GPU>& in_tile,                \
+void copy1D(cublasHandle_t handle, const SizeType& k_row, const SizeType& k_col, const SizeType& row,
+            const SizeType& col, const Coord& in_coord,
+            const matrix::Tile<const T, Device::GPU>& in_tile, const Coord& out_coord,
+            const matrix::Tile<T, Device::GPU>& out_tile);
+
+#define DLAF_GPU_COPY_1D_ETI(kword, Type)                                                         \
+  kword template void copy1D(cublasHandle_t handle, const SizeType& k_row, const SizeType& k_col, \
+                             const SizeType& row, const SizeType& col, const Coord& in_coord,     \
+                             const matrix::Tile<const Type, Device::GPU>& in_tile,                \
                              const Coord& out_coord, const matrix::Tile<Type, Device::GPU>& out_tile)
 
 DLAF_GPU_COPY_1D_ETI(extern, float);
@@ -610,14 +626,14 @@ DLAF_GPU_COPY_1D_ETI(extern, double);
 
 DLAF_MAKE_CALLABLE_OBJECT(copy1D);
 
-template <Device D, class KSender, class InTileSender, class OutTileSender>
-void copy1DAsync(KSender&& k, SizeType row, SizeType col, Coord in_coord, InTileSender&& in,
-                 Coord out_coord, OutTileSender&& out) {
+template <Device D, class KRSender, class KCSender, class InTileSender, class OutTileSender>
+void copy1DAsync(KRSender&& k_row, KCSender&& k_col, SizeType row, SizeType col, Coord in_coord,
                 InTileSender&& in, Coord out_coord, OutTileSender&& out) {
   namespace di = dlaf::internal;
  namespace ex = pika::execution::experimental;
 
   auto sender =
-      di::whenAllLift(std::forward<KSender>(k), row, col, in_coord, std::forward<InTileSender>(in),
-                      out_coord, std::forward<OutTileSender>(out));
+      di::whenAllLift(std::forward<KRSender>(k_row), std::forward<KCSender>(k_col), row, col, in_coord,
+                      std::forward<InTileSender>(in), out_coord, std::forward<OutTileSender>(out));
 
   ex::start_detached(di::transform(di::Policy<DefaultBackend_v<D>>(), copy1D_o, std::move(sender)));
 }
diff --git a/include/dlaf/eigensolver/tridiag_solver/merge.h b/include/dlaf/eigensolver/tridiag_solver/merge.h
index 5ac1ed57ad..ceb6fb52f4 100644
--- a/include/dlaf/eigensolver/tridiag_solver/merge.h
+++ b/include/dlaf/eigensolver/tridiag_solver/merge.h
@@ -133,26 +133,6 @@ struct WorkSpaceHostMirror {
   Matrix<SizeType, Device::CPU>& i2;
 };
 
-template <class T, Device D>
-struct DistWorkSpace {
-  Matrix<T, D> mat1;
-  Matrix<T, D> mat2;
-
-  Matrix<T, D> z;
-  Matrix<T, D> ztmp;
-};
-
-template <class T>
-struct DistWorkSpaceHost {
-  Matrix<T, Device::CPU> dtmp;
-
-  Matrix<SizeType, Device::CPU> i1;
-  Matrix<SizeType, Device::CPU> i2;
-  Matrix<SizeType, Device::CPU> i3;
-
-  Matrix<ColType, Device::CPU> c;
-};
-
 // forward declaration : Device::GPU - unused
 template <class T, Device D>
 struct DistWorkSpaceHostMirror {
@@ -163,6 +143,8 @@ struct DistWorkSpaceHostMirror {
 
   Matrix<T, Device::CPU> z;
   Matrix<T, Device::CPU> ztmp;
+
+  Matrix<SizeType, Device::CPU> i2;
 };
 
 template <class T>
@@ -174,6 +156,8 @@ struct DistWorkSpaceHostMirror<T, Device::CPU> {
 
   Matrix<T, Device::CPU>& z;
   Matrix<T, Device::CPU>& ztmp;
+
+  Matrix<SizeType, Device::CPU>& i2;
 };
 
 template <class T>
@@ -685,10 +669,11 @@ void solveRank1Problem(SizeType i_begin, SizeType i_end, KSender&& k, RhoSender&
 // - lapack 3.10.0, dlaed3.f, line 293
 // - LAPACK Working Notes: lawn132, Parallelizing the Divide and Conquer Algorithm for the Symmetric
 //   Tridiagonal Eigenvalue Problem on Distributed Memory Architectures, 4.2 Orthogonality
-template <class T, Device D, class KSender>
+template <class T, Device D, class KRSender, class KCSender>
 void initWeightVector(GlobalTileIndex idx_gl_begin, LocalTileIndex idx_loc_begin,
-                      LocalTileSize sz_loc_tiles, KSender&& k, Matrix<T, D>& diag,
-                      Matrix<T, D>& evecs, Matrix<T, D>& ws) {
+                      LocalTileSize sz_loc_tiles, KRSender&& k_row, KCSender&& k_col,
+                      Matrix<T, D>& diag, Matrix<T, D>& diag_i2, Matrix<T, D>& evecs,
+                      Matrix<T, D>& ws) {
   const matrix::Distribution& dist = evecs.distribution();
 
   // Reduce by multiplication into the first local column of each tile of the workspace matrix `ws`
@@ -697,8 +682,8 @@ void initWeightVector(GlobalTileIndex idx_gl_begin, LocalTileIndex idx_loc_begin
     auto sz_gl_el = dist.globalTileElementDistance(idx_gl_begin, idx_gl_tile);
     // Divide the eigenvectors of the rank1 update problem `evecs` by its diagonal matrix `diag` and
     // reduce multiply into the first column of each tile of the workspace matrix `ws`
-    divideEvecsByDiagonalAsync<D>(k, sz_gl_el.rows(), sz_gl_el.cols(),
-                                  diag.read_sender(GlobalTileIndex(idx_gl_tile.row(), 0)),
+    divideEvecsByDiagonalAsync<D>(k_row, k_col, sz_gl_el.rows(), sz_gl_el.cols(),
+                                  diag_i2.read_sender(GlobalTileIndex(idx_gl_tile.row(), 0)),
                                   diag.read_sender(GlobalTileIndex(idx_gl_tile.col(), 0)),
                                   evecs.read_sender(idx_loc_tile), ws.readwrite_sender(idx_loc_tile));
@@ -709,7 +694,8 @@ void initWeightVector(GlobalTileIndex idx_gl_begin, LocalTileIndex idx_loc_begin
     // reduce-multiply the first column of each local tile of the workspace matrix into the first local
     // column of the matrix
     LocalTileIndex idx_ws_first_col_tile(idx_loc_tile.row(), idx_loc_begin.col());
-    multiplyFirstColumnsAsync<D>(k, sz_gl_el.rows(), sz_gl_el.cols(), ws.read_sender(idx_loc_tile),
+    multiplyFirstColumnsAsync<D>(k_row, k_col, sz_gl_el.rows(), sz_gl_el.cols(),
+                                 ws.read_sender(idx_loc_tile),
                                  ws.readwrite_sender(idx_ws_first_col_tile));
   }
 }
@@ -718,10 +704,10 @@ void initWeightVector(GlobalTileIndex idx_gl_begin, LocalTileIndex idx_loc_begin
 // - lapack 3.10.0, dlaed3.f, line 293
 // - LAPACK Working Notes: lawn132, Parallelizing the Divide and Conquer Algorithm for the Symmetric
 //   Tridiagonal Eigenvalue Problem on Distributed Memory Architectures, 4.2 Orthogonality
-template <class T, Device D, class KSender>
+template <class T, Device D, class KRSender, class KCSender>
 void formEvecsUsingWeightVec(GlobalTileIndex idx_gl_begin, LocalTileIndex idx_loc_begin,
-                             LocalTileSize sz_loc_tiles, KSender&& k, Matrix<T, D>& z,
-                             Matrix<T, D>& ws, Matrix<T, D>& evecs) {
+                             LocalTileSize sz_loc_tiles, KRSender&& k_row, KCSender&& k_col,
+                             Matrix<T, D>& z, Matrix<T, D>& ws, Matrix<T, D>& evecs) {
   const matrix::Distribution& dist = evecs.distribution();
 
   for (auto idx_loc_tile : common::iterate_range2d(idx_loc_begin, sz_loc_tiles)) {
@@ -729,7 +715,7 @@ void formEvecsUsingWeightVec(GlobalTileIndex idx_gl_begin, LocalTileIndex idx_lo
     auto sz_gl_el = dist.globalTileElementDistance(idx_gl_begin, idx_gl_tile);
     LocalTileIndex idx_ws_first_local_column(idx_loc_tile.row(), idx_loc_begin.col());
 
-    calcEvecsFromWeightVecAsync<D>(k, sz_gl_el.rows(), sz_gl_el.cols(),
+    calcEvecsFromWeightVecAsync<D>(k_row, k_col, sz_gl_el.rows(), sz_gl_el.cols(),
                                    z.read_sender(GlobalTileIndex(idx_gl_tile.row(), 0)),
                                    ws.read_sender(idx_ws_first_local_column),
                                    evecs.readwrite_sender(idx_loc_tile));
@@ -737,15 +723,15 @@ void formEvecsUsingWeightVec(GlobalTileIndex idx_gl_begin, LocalTileIndex idx_lo
 }
 
 // Sum of squares of columns of @p evecs into the first row of @p ws
-template <class T, Device D, class KSender>
+template <class T, Device D, class KRSender, class KCSender>
 void sumsqEvecs(GlobalTileIndex idx_gl_begin, LocalTileIndex idx_loc_begin, LocalTileSize sz_loc_tiles,
-                KSender&& k, Matrix<T, D>& evecs, Matrix<T, D>& ws) {
+                KRSender&& k_row, KCSender&& k_col, Matrix<T, D>& evecs, Matrix<T, D>& ws) {
   const matrix::Distribution& dist = evecs.distribution();
 
   for (auto idx_loc_tile : common::iterate_range2d(idx_loc_begin, sz_loc_tiles)) {
     auto idx_gl_tile = dist.globalTileIndex(idx_loc_tile);
     auto sz_gl_el = dist.globalTileElementDistance(idx_gl_begin, idx_gl_tile);
-    sumsqColsAsync<D>(k, sz_gl_el.rows(), sz_gl_el.cols(), evecs.read_sender(idx_loc_tile),
+    sumsqColsAsync<D>(k_row, k_col, sz_gl_el.rows(), sz_gl_el.cols(), evecs.read_sender(idx_loc_tile),
                       ws.readwrite_sender(idx_loc_tile));
 
     // skip the first local row
@@ -753,23 +739,23 @@ void sumsqEvecs(GlobalTileIndex idx_gl_begin, LocalTileIndex idx_loc_begin, Loca
       continue;
 
     LocalTileIndex idx_ws_first_row_tile(idx_loc_begin.row(), idx_loc_tile.col());
-    addFirstRowsAsync<D>(k, sz_gl_el.rows(), sz_gl_el.cols(), ws.read_sender(idx_loc_tile),
+    addFirstRowsAsync<D>(k_row, k_col, sz_gl_el.rows(), sz_gl_el.cols(), ws.read_sender(idx_loc_tile),
                          ws.readwrite_sender(idx_ws_first_row_tile));
   }
 }
 
 // Normalize column vectors
-template <class T, Device D, class KSender>
+template <class T, Device D, class KRSender, class KCSender>
 void normalizeEvecs(GlobalTileIndex idx_gl_begin, LocalTileIndex idx_loc_begin,
-                    LocalTileSize sz_loc_tiles, KSender&& k, Matrix<T, D>& ws,
-                    Matrix<T, D>& evecs) {
+                    LocalTileSize sz_loc_tiles, KRSender&& k_row, KCSender&& k_col,
+                    Matrix<T, D>& ws, Matrix<T, D>& evecs) {
   const matrix::Distribution& dist = evecs.distribution();
 
   for (auto idx_loc_tile : common::iterate_range2d(idx_loc_begin, sz_loc_tiles)) {
     auto idx_gl_tile = dist.globalTileIndex(idx_loc_tile);
     auto sz_gl_el = dist.globalTileElementDistance(idx_gl_begin, idx_gl_tile);
     LocalTileIndex idx_ws_first_local_row(idx_loc_begin.row(), idx_loc_tile.col());
-    divideColsByFirstRowAsync<D>(k, sz_gl_el.rows(), sz_gl_el.cols(),
+    divideColsByFirstRowAsync<D>(k_row, k_col, sz_gl_el.rows(), sz_gl_el.cols(),
                                  ws.read_sender(idx_ws_first_local_row),
                                  evecs.readwrite_sender(idx_loc_tile));
   }
@@ -845,6 +831,7 @@ void mergeSubproblems(SizeType i_begin, SizeType i_split, SizeType i_end, RhoSen
   //   - set deflated diagonal entries of `U` to 1 (temporary solution until optimized GEMM is implemented)
   //
   auto k = stablePartitionIndexForDeflation(i_begin, i_end, ws_h.c, ws_hm.i2, ws_h.i3) | ex::split();
+
   applyIndex(i_begin, i_end, ws_h.i3, ws_hm.evals, ws_h.dtmp);
   applyIndex(i_begin, i_end, ws_h.i3, ws_hm.z, ws_hm.ztmp);
   copy(idx_begin_tiles_vec, sz_tiles_vec, ws_h.dtmp, ws_hm.evals);
@@ -859,10 +846,10 @@ void mergeSubproblems(SizeType i_begin, SizeType i_split, SizeType i_end, RhoSen
   // ---
 
   // formEvecs(i_begin, i_end, k, evals, ws.ztmp, ws.mat2, ws.mat1);
-  initWeightVector(idx_gl_begin, idx_loc_begin, sz_loc_tiles, k, evals, ws.mat1, ws.mat2);
-  formEvecsUsingWeightVec(idx_gl_begin, idx_loc_begin, sz_loc_tiles, k, ws.ztmp, ws.mat2, ws.mat1);
-  sumsqEvecs(idx_gl_begin, idx_loc_begin, sz_loc_tiles, k, ws.mat1, ws.mat2);
-  normalizeEvecs(idx_gl_begin, idx_loc_begin, sz_loc_tiles, k, ws.mat2, ws.mat1);
+  initWeightVector(idx_gl_begin, idx_loc_begin, sz_loc_tiles, k, k, evals, evals, ws.mat1, ws.mat2);
+  formEvecsUsingWeightVec(idx_gl_begin, idx_loc_begin, sz_loc_tiles, k, k, ws.ztmp, ws.mat2, ws.mat1);
+  sumsqEvecs(idx_gl_begin, idx_loc_begin, sz_loc_tiles, k, k, ws.mat1, ws.mat2);
+  normalizeEvecs(idx_gl_begin, idx_loc_begin, sz_loc_tiles, k, k, ws.mat2, ws.mat1);
   setUnitDiag(i_begin, i_end, k, ws.mat1);
 
   // Step #3: Eigenvectors of the tridiagonal system: Q * U
@@ -953,10 +940,11 @@ void setUnitDiagDist(SizeType i_begin, SizeType i_end, KSender&& k, Matrix<T, D>
 
 // (All)Reduce-multiply row-wise the first local columns of the distributed matrix @p mat of size (n, n).
 // The local column on each rank is first offloaded to a local communication buffer @p comm_vec of size (n, 1).
-template <class T, Device D, class KSender>
+template <class T, Device D, class KRSender, class KCSender>
 void reduceMultiplyWeightVector(common::Pipeline<comm::Communicator>& row_task_chain, SizeType i_begin,
                                 SizeType i_end, LocalTileIndex idx_loc_begin, LocalTileSize sz_loc_tiles,
-                                KSender&& k, Matrix<T, D>& mat, Matrix<T, D>& comm_vec) {
+                                KRSender&& k_row, KCSender&& k_col, Matrix<T, D>& mat,
+                                Matrix<T, D>& comm_vec) {
   namespace ex = pika::execution::experimental;
   namespace di = dlaf::internal;
@@ -994,8 +982,8 @@ void reduceMultiplyWeightVector(common::Pipeline<comm::Communicator>& row_task_c
                      tile::laset(di::Policy<DefaultBackend_v<D>>()));
 
     // copy the first column of the matrix tile into the column tile of the buffer
-    copy1DAsync<D>(k, sz_subm.rows(), sz_subm.cols(), Coord::Col, mat.read_sender(idx_loc_tile),
-                   Coord::Col, comm_vec.readwrite_sender(idx_gl_comm));
+    copy1DAsync<D>(k_row, k_col, sz_subm.rows(), sz_subm.cols(), Coord::Col,
+                   mat.read_sender(idx_loc_tile), Coord::Col, comm_vec.readwrite_sender(idx_gl_comm));
 
     ex::start_detached(comm::scheduleAllReduceInPlace(ex::make_unique_any_sender(row_task_chain()),
                                                       MPI_PROD,
@@ -1003,17 +991,18 @@ void reduceMultiplyWeightVector(common::Pipeline<comm::Communicator>& row_task_c
                                                       ex::make_unique_any_sender(
                                                           comm_vec.readwrite_sender(idx_gl_comm))));
 
     // copy the column tile of the buffer into the first column of the matrix tile
-    copy1DAsync<D>(k, sz_subm.rows(), sz_subm.cols(), Coord::Col, comm_vec.read_sender(idx_gl_comm),
-                   Coord::Col, mat.readwrite_sender(idx_loc_tile));
+    copy1DAsync<D>(k_row, k_col, sz_subm.rows(), sz_subm.cols(), Coord::Col,
+                   comm_vec.read_sender(idx_gl_comm), Coord::Col, mat.readwrite_sender(idx_loc_tile));
   }
 }
 
 // (All)Reduce-sum column-wise the first local rows of the distributed matrix @p mat of size (n, n).
 // The local row on each rank is first offloaded to a local communication buffer @p comm_vec of size (n, 1).
-template <class T, Device D, class KSender>
+template <class T, Device D, class KRSender, class KCSender>
 void reduceSumScalingVector(common::Pipeline<comm::Communicator>& col_task_chain, SizeType i_begin,
                             SizeType i_end, LocalTileIndex idx_loc_begin, LocalTileSize sz_loc_tiles,
-                            KSender&& k, Matrix<T, D>& mat, Matrix<T, D>& comm_vec) {
+                            KRSender&& k_row, KCSender&& k_col, Matrix<T, D>& mat,
+                            Matrix<T, D>& comm_vec) {
   namespace ex = pika::execution::experimental;
   namespace di = dlaf::internal;
@@ -1049,8 +1038,8 @@ void reduceSumScalingVector(common::Pipeline<comm::Communicator>& col_task_chain
                      tile::set0(di::Policy<DefaultBackend_v<D>>()));
 
     // copy the first row of the matrix tile into the column tile of the buffer
-    copy1DAsync<D>(k, sz_subm.rows(), sz_subm.cols(), Coord::Row, mat.read_sender(idx_loc_tile),
-                   Coord::Col, comm_vec.readwrite_sender(idx_gl_comm));
+    copy1DAsync<D>(k_row, k_col, sz_subm.rows(), sz_subm.cols(), Coord::Row,
+                   mat.read_sender(idx_loc_tile), Coord::Col, comm_vec.readwrite_sender(idx_gl_comm));
 
     ex::start_detached(comm::scheduleAllReduceInPlace(ex::make_unique_any_sender(col_task_chain()),
                                                       MPI_SUM,
@@ -1058,8 +1047,8 @@ void reduceSumScalingVector(common::Pipeline<comm::Communicator>& col_task_chain
                                                       ex::make_unique_any_sender(
                                                           comm_vec.readwrite_sender(idx_gl_comm))));
 
     // copy the column tile of the buffer into the first column of the matrix tile
-    copy1DAsync<D>(k, sz_subm.rows(), sz_subm.cols(), Coord::Col, comm_vec.read_sender(idx_gl_comm),
-                   Coord::Row, mat.readwrite_sender(idx_loc_tile));
+    copy1DAsync<D>(k_row, k_col, sz_subm.rows(), sz_subm.cols(), Coord::Col,
+                   comm_vec.read_sender(idx_gl_comm), Coord::Row, mat.readwrite_sender(idx_loc_tile));
  }
 }
 
@@ -1068,19 +1057,21 @@ void solveRank1ProblemDist(SizeType i_begin, SizeType i_end, LocalTileIndex idx_
                            LocalTileSize sz_loc_tiles, KSender&& k, RhoSender&& rho,
                            Matrix<const T, Device::CPU>& d, Matrix<const T, Device::CPU>& z,
                            Matrix<T, Device::CPU>& evals, Matrix<T, Device::CPU>& delta,
-                           Matrix<T, Device::CPU>& evecs) {
+                           Matrix<SizeType, Device::CPU>& i2, Matrix<T, Device::CPU>& evecs) {
   namespace ex = pika::execution::experimental;
   namespace di = dlaf::internal;
 
   const matrix::Distribution& dist = evecs.distribution();
 
-  auto rank1_fn = [i_begin, idx_loc_begin, sz_loc_tiles,
+  auto rank1_fn = [i_begin, i_end, idx_loc_begin, sz_loc_tiles,
                    dist](const auto& k, const auto& rho, const auto& d_sfut_tile_arr,
                          const auto& z_sfut_tile_arr, const auto& eval_tiles, const auto& delta_tile_arr,
-                         const auto& evec_tile_arr) {
+                         const auto& i2_tile_arr, const auto& evec_tile_arr) {
+    const SizeType n = problemSize(i_begin, i_end, dist);
     const T* d_ptr = d_sfut_tile_arr[0].get().ptr();
     const T* z_ptr = z_sfut_tile_arr[0].get().ptr();
     T* eval_ptr = eval_tiles[0].ptr();
     T* delta_ptr = delta_tile_arr[0].ptr();
+    SizeType* i2_ptr = i2_tile_arr[0].ptr();
 
     // Iterate over the columns of the local submatrix tile grid
     for (SizeType j_loc_subm_tile = 0; j_loc_subm_tile < sz_loc_tiles.cols(); ++j_loc_subm_tile) {
@@ -1094,7 +1085,7 @@ void solveRank1ProblemDist(SizeType i_begin, SizeType i_end, LocalTileIndex idx_
 
       // Skip columns that are in the deflation zone
       if (j_gl_subm_el >= k)
-        return;
+        break;
 
       // Iterate over the elements of the column tile
       SizeType ncols = std::min(dist.tileSize<Coord::Col>(j_gl_tile), k - j_gl_subm_el);
@@ -1119,18 +1110,37 @@ void solveRank1ProblemDist(SizeType i_begin, SizeType i_end, LocalTileIndex idx_
         // global matrix tile grid
         SizeType i_gl_subm_el = dist.globalTileElementDistance<Coord::Row>(i_begin, i_gl_tile);
 
-        // Skip rows that are in the deflation zone
-        if (i_gl_subm_el >= k)
-          break;
-
         // The tile row in the global submatrix tile grid
-        SizeType i_subm_delta_arr = dist.globalTileFromLocalTile<Coord::Row>(i_loc_tile) - i_begin;
-        auto& delta_tile = delta_tile_arr[to_sizet(i_subm_delta_arr)];
+        SizeType i_subm_i2_arr = dist.globalTileFromLocalTile<Coord::Row>(i_loc_tile) - i_begin;
+        auto& i2_tile = i2_tile_arr[to_sizet(i_subm_i2_arr)];
+
+        SizeType nrows = std::min(dist.tileSize<Coord::Row>(i_gl_tile), n - i_gl_subm_el);
+        for (int i = 0; i < nrows; ++i) {
+          SizeType ii = i2_tile({i, 0});
+          if (ii < k)
+            evec_tile({i, j_tile_el}) = delta_ptr[ii];
+        }
+      }
+    }
 
-        // Copy from delta buffer into local matrix tile
-        SizeType nrows = std::min(dist.tileSize<Coord::Row>(i_gl_tile), k - i_gl_subm_el);
-        tile::lacpy(TileElementSize(nrows, 1), TileElementIndex(0, 0), delta_tile,
-                    TileElementIndex(0, j_tile_el), evec_tile);
+    // Fill ones for deflated Eigenvectors.
+    // Quick return if there is none.
+    if (n == k)
+      return;
+
+    GlobalElementIndex origin{i_begin * dist.blockSize().rows(), i_begin * dist.blockSize().cols()};
+    for (SizeType i = 0; i < n; ++i) {
+      SizeType j = i2_ptr[i];
+      if (j >= k) {
+        GlobalElementIndex i_g{i + origin.row(), j + origin.col()};
+        GlobalTileIndex i_tile = dist.globalTileIndex(i_g);
+        if (dist.rankIndex() == dist.rankGlobalTile(i_tile)) {
+          LocalTileIndex i_tile_l = dist.localTileIndex(i_tile);
+          SizeType i_subm_evec_arr = i_tile_l.row() - idx_loc_begin.row() +
+                                     (i_tile_l.col() - idx_loc_begin.col()) * sz_loc_tiles.rows();
+          TileElementIndex i_el = dist.tileElementIndex(i_g);
+          evec_tile_arr[to_sizet(i_subm_evec_arr)](i_el) = T{1};
         }
       }
     }
@@ -1142,7 +1152,7 @@ void solveRank1ProblemDist(SizeType i_begin, SizeType i_end, LocalTileIndex idx_
   auto sender =
       ex::when_all(std::forward<KSender>(k), std::forward<RhoSender>(rho),
                    ex::when_all_vector(tc.read(d)), ex::when_all_vector(tc.read(z)),
                    ex::when_all_vector(tc.readwrite(evals)), ex::when_all_vector(tc.readwrite(delta)),
-                   ex::when_all_vector(tc.readwrite(evecs)));
+                   ex::when_all_vector(tc.readwrite(i2)), ex::when_all_vector(tc.readwrite(evecs)));
 
   ex::start_detached(di::transform(di::Policy<Backend::MC>(), std::move(rank1_fn), std::move(sender)));
 }
@@ -1177,8 +1187,8 @@ void mergeDistSubproblems(comm::CommunicatorGrid grid,
                           common::Pipeline<comm::Communicator>& full_task_chain,
                           common::Pipeline<comm::Communicator>& row_task_chain,
                           common::Pipeline<comm::Communicator>& col_task_chain, SizeType i_begin,
-                          SizeType i_split, SizeType i_end, RhoSender&& rho, DistWorkSpace<T, D>& ws,
-                          DistWorkSpaceHost<T>& ws_h, DistWorkSpaceHostMirror<T, D>& ws_hm,
+                          SizeType i_split, SizeType i_end, RhoSender&& rho, WorkSpace<T, D>& ws,
+                          WorkSpaceHost<T>& ws_h, DistWorkSpaceHostMirror<T, D>& ws_hm,
                           Matrix<T, D>& evals, Matrix<T, D>& evecs) {
   namespace ex = pika::execution::experimental;
@@ -1220,10 +1230,10 @@ void mergeDistSubproblems(comm::CommunicatorGrid grid,
   // - apply Givens rotations to `Q` - `evecs`
   //
   initIndex(i_begin, i_end, ws_h.i1);
-  sortIndex(i_begin, i_end, ex::just(n1), ws_hm.evals, ws_h.i1, ws_h.i2);
+  sortIndex(i_begin, i_end, ex::just(n1), ws_hm.evals, ws_h.i1, ws_hm.i2);
 
   auto rots =
-      applyDeflation(i_begin, i_end, scaled_rho, std::move(tol), ws_h.i2, ws_hm.evals, ws_hm.z, ws_h.c);
+      applyDeflation(i_begin, i_end, scaled_rho, std::move(tol), ws_hm.i2, ws_hm.evals, ws_hm.z, ws_h.c);
 
   // ---
 
@@ -1244,52 +1254,52 @@ void mergeDistSubproblems(comm::CommunicatorGrid grid,
   // - solve the rank-1 problem and save eigenvalues in `dtmp` and eigenvectors in `mat1`.
   // - set deflated diagonal entries of `U` to 1 (temporary solution until optimized GEMM is implemented)
   //
-  auto k = stablePartitionIndexForDeflation(i_begin, i_end, ws_h.c, ws_h.i2, ws_h.i3) | ex::split();
+  auto k = stablePartitionIndexForDeflation(i_begin, i_end, ws_h.c, ws_hm.i2, ws_h.i3) | ex::split();
   applyIndex(i_begin, i_end, ws_h.i3, ws_hm.evals, ws_h.dtmp);
   applyIndex(i_begin, i_end, ws_h.i3, ws_hm.z, ws_hm.ztmp);
   copy(idx_begin_tiles_vec, sz_tiles_vec, ws_h.dtmp, ws_hm.evals);
 
+  //
+  //   i3 (in)  : initial <--- deflated
+  //   i2 (out) : initial ---> deflated
+  //
+  invertIndex(i_begin, i_end, ws_h.i3, ws_hm.i2);
+
   // Note: here ws_hm.z is used as a contiguous buffer for the laed4 call
   matrix::util::set0(pika::execution::thread_priority::normal, idx_loc_begin, sz_loc_tiles,
                      ws_hm.mat1);
   solveRank1ProblemDist(i_begin, i_end, idx_loc_begin, sz_loc_tiles, k, std::move(scaled_rho),
-                        ws_hm.evals, ws_hm.ztmp, ws_h.dtmp, ws_hm.z, ws_hm.mat1);
+                        ws_hm.evals, ws_hm.ztmp, ws_h.dtmp, ws_hm.z, ws_hm.i2, ws_hm.mat1);
   copy(idx_loc_begin, sz_loc_tiles, ws_hm.mat1, ws.mat1);
   copy(idx_begin_tiles_vec, sz_tiles_vec, ws_hm.ztmp, ws.ztmp);
   copy(idx_begin_tiles_vec, sz_tiles_vec, ws_hm.evals, evals);
+  copy(idx_begin_tiles_vec, sz_tiles_vec, ws_hm.i2, ws.i2);
 
   // ---
 
   assembleDistEvalsVec(row_task_chain, i_begin, i_end, dist_evecs, ws_h.dtmp);
 
+  auto n = ex::just(problemSize(i_begin, i_end, evecs.distribution()));
   // Eigenvector formation: `ws.mat1` stores the eigenvectors, `ws.mat2` is used as an additional workspace
-  initWeightVector(idx_gl_begin, idx_loc_begin, sz_loc_tiles, k, evals, ws.mat1, ws.mat2);
-  reduceMultiplyWeightVector(row_task_chain, i_begin, i_end, idx_loc_begin, sz_loc_tiles, k, ws.mat2,
+  // Note ws.z is used as contiguous buffer workspace to store permuted values of evals and ws.ztmp
+  applyIndex(i_begin, i_end, ws.i2, evals, ws.z);
+  initWeightVector(idx_gl_begin, idx_loc_begin, sz_loc_tiles, n, k, evals, ws.z, ws.mat1, ws.mat2);
+  reduceMultiplyWeightVector(row_task_chain, i_begin, i_end, idx_loc_begin, sz_loc_tiles, n, k, ws.mat2,
                              ws.z);
-  formEvecsUsingWeightVec(idx_gl_begin, idx_loc_begin, sz_loc_tiles, k, ws.ztmp, ws.mat2, ws.mat1);
-  sumsqEvecs(idx_gl_begin, idx_loc_begin, sz_loc_tiles, k, ws.mat1, ws.mat2);
-  reduceSumScalingVector(col_task_chain, i_begin, i_end, idx_loc_begin, sz_loc_tiles, k, ws.mat2, ws.z);
-  normalizeEvecs(idx_gl_begin, idx_loc_begin, sz_loc_tiles, k, ws.mat2, ws.mat1);
-  setUnitDiagDist(i_begin, i_end, k, ws.mat1);
+  applyIndex(i_begin, i_end, ws.i2, ws.ztmp, ws.z);
+  formEvecsUsingWeightVec(idx_gl_begin, idx_loc_begin, sz_loc_tiles, n, k, ws.z, ws.mat2, ws.mat1);
+  sumsqEvecs(idx_gl_begin, idx_loc_begin, sz_loc_tiles, n, k, ws.mat1, ws.mat2);
+  reduceSumScalingVector(col_task_chain, i_begin, i_end, idx_loc_begin, sz_loc_tiles, n, k, ws.mat2,
+                         ws.z);
+  normalizeEvecs(idx_gl_begin, idx_loc_begin, sz_loc_tiles, n, k, ws.mat2, ws.mat1);
 
   // Step #3: Eigenvectors of the tridiagonal system: Q * U
-  //
-  //   i3 (in)  : initial <--- deflated
-  //   i2 (out) : initial ---> deflated
-  //
   // The eigenvectors resulting from the multiplication are already in the order of the eigenvalues as
   // prepared for the deflated system.
   //
-  invertIndex(i_begin, i_end, ws_h.i3, ws_h.i2);
-
-  copy(idx_loc_begin, sz_loc_tiles, ws.mat1, ws_hm.mat1);
-  dlaf::permutations::permute<Backend::MC, Device::CPU, T, Coord::Row>(grid, col_task_chain, i_begin,
-                                                                       i_end, ws_h.i2, ws_hm.mat1,
-                                                                       ws_hm.mat2);
-  copy(idx_loc_begin, sz_loc_tiles, ws_hm.mat2, ws.mat2);
   dlaf::multiplication::generalSubMatrix<B, D, T>(grid, row_task_chain, col_task_chain, i_begin, i_end,
-                                                  T(1), evecs, ws.mat2, T(0), ws.mat1);
+                                                  T(1), evecs, ws.mat1, T(0), ws.mat2);
 
   // Step #4: Final sorting of eigenvalues and eigenvectors
   //
@@ -1300,12 +1310,12 @@ void mergeDistSubproblems(comm::CommunicatorGrid grid,
   //   ascending order
   // - reorder columns in `evecs` using `i2` such that eigenvectors match eigenvalues
   //
-  sortIndex(i_begin, i_end, std::move(k), ws_h.dtmp, ws_h.i1, ws_h.i2);
-  applyIndex(i_begin, i_end, ws_h.i2, ws_h.dtmp, ws_hm.evals);
+  sortIndex(i_begin, i_end, std::move(k), ws_h.dtmp, ws_h.i1, ws_hm.i2);
+  applyIndex(i_begin, i_end, ws_hm.i2, ws_h.dtmp, ws_hm.evals);
 
-  copy(idx_loc_begin, sz_loc_tiles, ws.mat1, ws_hm.mat1);
+  copy(idx_loc_begin, sz_loc_tiles, ws.mat2, ws_hm.mat2);
   dlaf::permutations::permute<Backend::MC, Device::CPU, T, Coord::Col>(grid, row_task_chain, i_begin,
-                                                                       i_end, ws_h.i2, ws_hm.mat1,
+                                                                       i_end, ws_hm.i2, ws_hm.mat2,
                                                                        ws_hm.evecs);
   copy(idx_loc_begin, sz_loc_tiles, ws_hm.evecs, evecs);
 }
diff --git a/src/eigensolver/tridiag_solver/kernels.cpp b/src/eigensolver/tridiag_solver/kernels.cpp
index 6bc0d006c5..98c5274775 100644
--- a/src/eigensolver/tridiag_solver/kernels.cpp
+++ b/src/eigensolver/tridiag_solver/kernels.cpp
@@ -93,16 +93,17 @@ void initIndexTile(SizeType offset, const matrix::Tile<SizeType, Device::CPU>& t
 }
 
 template <class T>
-void divideEvecsByDiagonal(const SizeType& k, const SizeType& i_subm_el, const SizeType& j_subm_el,
+void divideEvecsByDiagonal(const SizeType& k_row, const SizeType& k_col, const SizeType& i_subm_el,
+                           const SizeType& j_subm_el,
                            const matrix::Tile<const T, Device::CPU>& diag_rows,
                            const matrix::Tile<const T, Device::CPU>& diag_cols,
                            const matrix::Tile<const T, Device::CPU>& evecs_tile,
                           const matrix::Tile<T, Device::CPU>& ws_tile) {
-  if (i_subm_el >= k || j_subm_el >= k)
+  if (i_subm_el >= k_row || j_subm_el >= k_col)
     return;
 
-  SizeType nrows = std::min(k - i_subm_el, evecs_tile.size().rows());
-  SizeType ncols = std::min(k - j_subm_el, evecs_tile.size().cols());
+  SizeType nrows = std::min(k_row - i_subm_el, evecs_tile.size().rows());
+  SizeType ncols = std::min(k_col - j_subm_el, evecs_tile.size().cols());
 
   for (SizeType j = 0; j < ncols; ++j) {
     for (SizeType i = 0; i < nrows; ++i) {
@@ -122,13 +123,13 @@ DLAF_CPU_DIVIDE_EVECS_BY_DIAGONAL_ETI(, float);
 DLAF_CPU_DIVIDE_EVECS_BY_DIAGONAL_ETI(, double);
 
 template <class T>
-void multiplyFirstColumns(const SizeType& k, const SizeType& row, const SizeType& col,
-                          const matrix::Tile<const T, Device::CPU>& in,
+void multiplyFirstColumns(const SizeType& k_row, const SizeType& k_col, const SizeType& row,
+                          const SizeType& col, const matrix::Tile<const T, Device::CPU>& in,
                           const matrix::Tile<T, Device::CPU>& out) {
-  if (row >= k || col >= k)
+  if (row >= k_row || col >= k_col)
     return;
 
-  SizeType nrows = std::min(k - row, in.size().rows());
+  SizeType nrows = std::min(k_row - row, in.size().rows());
 
   for (SizeType i = 0; i < nrows; ++i) {
     TileElementIndex idx(i, 0);
@@ -140,22 +141,25 @@ DLAF_CPU_MULTIPLY_FIRST_COLUMNS_ETI(, float);
 DLAF_CPU_MULTIPLY_FIRST_COLUMNS_ETI(, double);
 
 template <class T>
-void calcEvecsFromWeightVec(const SizeType& k, const SizeType& row, const SizeType& col,
-                            const matrix::Tile<const T, Device::CPU>& z_tile,
+void calcEvecsFromWeightVec(const SizeType& k_row, const SizeType& k_col, const SizeType& row,
+                            const SizeType& col, const matrix::Tile<const T, Device::CPU>& z_tile,
                             const matrix::Tile<const T, Device::CPU>& ws_tile,
                             const matrix::Tile<T, Device::CPU>& evecs_tile) {
-  if (row >= k || col >= k)
+  if (row >= k_row || col >= k_col)
     return;
 
-  SizeType nrows = std::min(k - row, evecs_tile.size().rows());
-  SizeType ncols = std::min(k - col, evecs_tile.size().cols());
+  SizeType nrows = std::min(k_row - row, evecs_tile.size().rows());
+  SizeType ncols = std::min(k_col - col, evecs_tile.size().cols());
 
   for (SizeType j = 0; j < ncols; ++j) {
     for (SizeType i = 0; i < nrows; ++i) {
       T ws_el = ws_tile(TileElementIndex(i, 0));
       T z_el = z_tile(TileElementIndex(i, 0));
       T& el_evec = evecs_tile(TileElementIndex(i, j));
-      el_evec = std::copysign(std::sqrt(std::abs(ws_el)), z_el) / el_evec;
+      // Avoid NaN generated by deflated vectors.
+      // It is a temporary workaround until deflated vectors are moved at the end of the local matrix.
+      if (el_evec != 0)
+        el_evec = std::copysign(std::sqrt(std::abs(ws_el)), z_el) / el_evec;
     }
   }
 }
@@ -164,14 +168,14 @@ DLAF_CPU_CALC_EVECS_FROM_WEIGHT_VEC_ETI(, float);
 DLAF_CPU_CALC_EVECS_FROM_WEIGHT_VEC_ETI(, double);
 
 template <class T>
-void sumsqCols(const SizeType& k, const SizeType& row, const SizeType& col,
+void sumsqCols(const SizeType& k_row, const SizeType& k_col, const SizeType& row, const SizeType& col,
               const matrix::Tile<const T, Device::CPU>& evecs_tile,
               const matrix::Tile<T, Device::CPU>& ws_tile) {
-  if (row >= k || col >= k)
+  if (row >= k_row || col >= k_col)
     return;
 
-  SizeType nrows = std::min(k - row, evecs_tile.size().rows());
-  SizeType ncols = std::min(k - col, evecs_tile.size().cols());
+  SizeType nrows = std::min(k_row - row, evecs_tile.size().rows());
+  SizeType ncols = std::min(k_col - col, evecs_tile.size().cols());
 
   for (SizeType j = 0; j < ncols; ++j) {
     T loc_norm = 0;
@@ -187,13 +191,13 @@ DLAF_CPU_SUMSQ_COLS_ETI(, float);
 DLAF_CPU_SUMSQ_COLS_ETI(, double);
 
 template <class T>
-void addFirstRows(const SizeType& k, const SizeType& row, const SizeType& col,
+void addFirstRows(const SizeType& k_row, const SizeType& k_col, const SizeType& row, const SizeType& col,
                   const matrix::Tile<const T, Device::CPU>& in,
                   const matrix::Tile<T, Device::CPU>& out) {
-  if (row >= k || col >= k)
+  if (row >= k_row || col >= k_col)
     return;
 
-  SizeType ncols = std::min(k - col, in.size().cols());
+  SizeType ncols = std::min(k_col - col, in.size().cols());
 
   for (SizeType j = 0; j < ncols; ++j) {
     out(TileElementIndex(0, j)) += in(TileElementIndex(0, j));
@@ -204,14 +208,14 @@ DLAF_CPU_ADD_FIRST_ROWS_ETI(, float);
 DLAF_CPU_ADD_FIRST_ROWS_ETI(, double);
 
 template <class T>
-void divideColsByFirstRow(const SizeType& k, const SizeType& row, const SizeType& col,
-                          const matrix::Tile<const T, Device::CPU>& in,
+void divideColsByFirstRow(const SizeType& k_row, const SizeType& k_col, const SizeType& row,
+                          const SizeType& col, const matrix::Tile<const T, Device::CPU>& in,
                           const matrix::Tile<T, Device::CPU>& out) {
-  if (row >= k || col >= k)
+  if (row >= k_row || col >= k_col)
     return;
 
-  SizeType nrows = std::min(k - row, out.size().rows());
-  SizeType ncols = std::min(k - col, out.size().cols());
+  SizeType nrows = std::min(k_row - row, out.size().rows());
+  SizeType ncols = std::min(k_col - col, out.size().cols());
 
   for (SizeType j = 0; j < ncols; ++j) {
     for (SizeType i = 0; i < nrows; ++i) {
@@ -242,10 +246,10 @@ DLAF_CPU_SET_UNIT_DIAGONAL_ETI(, float);
 DLAF_CPU_SET_UNIT_DIAGONAL_ETI(, double);
 
 template <class T>
-void copy1D(const SizeType& k, const SizeType& row, const SizeType& col, const Coord& in_coord,
-            const matrix::Tile<const T, Device::CPU>& in_tile, const Coord& out_coord,
-            const matrix::Tile<T, Device::CPU>& out_tile) {
-  if (row >= k || col >= k)
+void copy1D(const SizeType& k_row, const SizeType& k_col, const SizeType& row, const SizeType& col,
+            const Coord& in_coord, const matrix::Tile<const T, Device::CPU>& in_tile,
+            const Coord& out_coord, const matrix::Tile<T, Device::CPU>& out_tile) {
+  if (row >= k_row || col >= k_col)
     return;
 
   const T* in_ptr = in_tile.ptr();
@@ -255,12 +259,12 @@ void copy1D(const SizeType& k, const SizeType& row, const SizeType& col, const C
   SizeType out_ld = (out_coord == Coord::Col) ? 1 : out_tile.ld();
 
   // if `in_tile` is the column buffer
-  SizeType len = (out_coord == Coord::Col) ? std::min(out_tile.size().rows(), k - row)
-                                           : std::min(out_tile.size().cols(), k - col);
+  SizeType len = (out_coord == Coord::Col) ? std::min(out_tile.size().rows(), k_row - row)
+                                           : std::min(out_tile.size().cols(), k_col - col);
   // if out_tile is the column buffer
   if (out_tile.size().cols() == 1) {
-    len = (in_coord == Coord::Col) ? std::min(in_tile.size().rows(), k - row)
-                                   : std::min(in_tile.size().cols(), k - col);
+    len = (in_coord == Coord::Col) ? std::min(in_tile.size().rows(), k_row - row)
+                                   : std::min(in_tile.size().cols(), k_col - col);
   }
 
   blas::copy(len, in_ptr, in_ld, out_ptr, out_ld);
diff --git a/src/eigensolver/tridiag_solver/kernels.cu b/src/eigensolver/tridiag_solver/kernels.cu
index 5f3d3c0004..bd8a11a396 100644
--- a/src/eigensolver/tridiag_solver/kernels.cu
+++ b/src/eigensolver/tridiag_solver/kernels.cu
@@ -227,16 +227,17 @@ struct Row2ColMajor {
 };
 
 template <class T>
-void divideEvecsByDiagonal(const SizeType& k, const SizeType& i_subm_el, const SizeType& j_subm_el,
+void divideEvecsByDiagonal(const SizeType& k_row, const SizeType& k_col, const SizeType& i_subm_el,
+                           const SizeType& j_subm_el,
                            const matrix::Tile<const T, Device::GPU>& diag_rows,
                            const matrix::Tile<const T, Device::GPU>& diag_cols,
                           const matrix::Tile<const T, Device::GPU>& evecs_tile,
                           const matrix::Tile<T, Device::GPU>& ws_tile, whip::stream_t stream) {
-  if (i_subm_el >= k || j_subm_el >= k)
+  if (i_subm_el >= k_row || j_subm_el >= k_col)
     return;
 
-  SizeType nrows = std::min(k - i_subm_el, evecs_tile.size().rows());
-  SizeType ncols = std::min(k - j_subm_el, evecs_tile.size().cols());
+  SizeType nrows = std::min(k_row - i_subm_el, evecs_tile.size().rows());
+  SizeType ncols = std::min(k_col - j_subm_el, evecs_tile.size().cols());
 
   SizeType ld = evecs_tile.ld();
   const T* d_rows = diag_rows.ptr();
@@ -300,13 +301,13 @@ __global__ void multiplyColumns(SizeType len, const T* in, T* out) {
 }
 
 template <class T>
-void multiplyFirstColumns(const SizeType& k, const SizeType& row, const SizeType& col,
-                          const matrix::Tile<const T, Device::GPU>& in,
+void multiplyFirstColumns(const SizeType& k_row, const SizeType& k_col, const SizeType& row,
+                          const SizeType& col, const matrix::Tile<const T, Device::GPU>& in,
                           const matrix::Tile<T, Device::GPU>& out, whip::stream_t stream) {
-  if (row >= k || col >= k)
+  if (row >= k_row || col >= k_col)
     return;
 
-  SizeType nrows = std::min(k - row, in.size().rows());
+  SizeType nrows = std::min(k_row - row, in.size().rows());
 
   const T* in_ptr = in.ptr();
   T* out_ptr = out.ptr();
@@ -334,24 +335,26 @@ __global__ void calcEvecsFromWeightVec(SizeType nrows, SizeType ncols, SizeType
     T z_el = rank1_vec[i];
     T& el_evec = evecs[i + j * ld];
 
-    if constexpr (std::is_same<T, float>::value) {
-      el_evec = copysignf(sqrtf(fabsf(ws_el)), z_el) / el_evec;
-    }
-    else {
-      el_evec = copysign(sqrt(fabs(ws_el)), z_el) / el_evec;
+    if (el_evec != 0) {
+      if constexpr (std::is_same<T, float>::value) {
+        el_evec = copysignf(sqrtf(fabsf(ws_el)), z_el) / el_evec;
+      }
+      else {
+        el_evec = copysign(sqrt(fabs(ws_el)), z_el) / el_evec;
+      }
     }
 }
 
 template <class T>
-void calcEvecsFromWeightVec(const SizeType& k, const SizeType& row, const SizeType& col,
-                            const matrix::Tile<const T, Device::GPU>& z_tile,
+void calcEvecsFromWeightVec(const SizeType& k_row, const SizeType& k_col, const SizeType& row,
+                            const SizeType& col, const matrix::Tile<const T, Device::GPU>& z_tile,
                             const matrix::Tile<const T, Device::GPU>& ws_tile,
                             const matrix::Tile<T, Device::GPU>& evecs_tile, whip::stream_t stream) {
-  if (row >= k || col >= k)
+  if (row >= k_row || col >= k_col)
     return;
 
-  SizeType nrows = std::min(k - row, evecs_tile.size().rows());
-  SizeType ncols = std::min(k - col, evecs_tile.size().cols());
+  SizeType nrows = std::min(k_row - row, evecs_tile.size().rows());
+  SizeType ncols = std::min(k_col - col, evecs_tile.size().cols());
 
   SizeType ld = evecs_tile.ld();
   const T* rank1_vec = z_tile.ptr();
@@ -385,14 +388,14 @@ __global__ void sqTile(SizeType nrows, SizeType ncols, SizeType ld, const T* in,
 }
 
 template <class T>
-void sumsqCols(const SizeType& k, const SizeType& row, const SizeType& col,
+void sumsqCols(const SizeType& k_row, const SizeType& k_col, const SizeType& row, const SizeType& col,
               const matrix::Tile<const T, Device::GPU>& evecs_tile,
               const matrix::Tile<T, Device::GPU>& ws_tile, whip::stream_t stream) {
-  if (row >= k || col >= k)
+  if (row >= k_row || col >= k_col)
     return;
 
-  SizeType nrows = std::min(k - row, evecs_tile.size().rows());
-  SizeType ncols = std::min(k - col, evecs_tile.size().cols());
+  SizeType nrows = std::min(k_row - row, evecs_tile.size().rows());
+  SizeType ncols = std::min(k_col - col, evecs_tile.size().cols());
 
   SizeType ld = evecs_tile.ld();
   const T* in = evecs_tile.ptr();
@@ -441,13 +444,13 @@ __global__ void addFirstRows(SizeType len, SizeType ld, const T* in, T* out) {
 }
 
 template <class T>
-void addFirstRows(const SizeType& k, const SizeType& row, const SizeType& col,
+void addFirstRows(const SizeType& k_row, const SizeType& k_col, const SizeType& row, const SizeType& col,
                   const matrix::Tile<const T, Device::GPU>& in,
                   const matrix::Tile<T, Device::GPU>& out, whip::stream_t stream) {
-  if (row >= k || col >= k)
+  if (row >= k_row || col >= k_col)
     return;
 
-  SizeType ncols = std::min(k - col, in.size().cols());
+  SizeType ncols = std::min(k_col - col, in.size().cols());
 
   SizeType ld = in.ld();
   const T* in_ptr = in.ptr();
@@ -484,14 +487,14 @@ __global__ void scaleTileWithRow(SizeType nrows, SizeType ncols, SizeType in_ld,
 }
 
 template <class T>
-void divideColsByFirstRow(const SizeType& k, const SizeType& row, const SizeType& col,
-                          const matrix::Tile<const T, Device::GPU>& in,
+void divideColsByFirstRow(const SizeType& k_row, const SizeType& k_col, const SizeType& row,
+                          const SizeType& col, const matrix::Tile<const T, Device::GPU>& in,
                           const matrix::Tile<T, Device::GPU>& out, whip::stream_t stream) {
-  if (row >= k || col >= k)
+  if (row >= k_row || col >= k_col)
     return;
 
-  SizeType nrows = std::min(k - row, out.size().rows());
-  SizeType ncols = std::min(k - col, out.size().cols());
+  SizeType nrows = std::min(k_row - row, out.size().rows());
+  SizeType ncols = std::min(k_col - col, out.size().cols());
 
   SizeType in_ld = in.ld();
   const T* in_ptr = in.ptr();
@@ -543,10 +546,11 @@ DLAF_GPU_SET_UNIT_DIAGONAL_ETI(, double);
 
 // Reference to CUBLAS 1D copy(): https://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-copy
 template <class T>
-void copy1D(cublasHandle_t handle, const SizeType& k, const SizeType& row, const SizeType& col,
-            const Coord& in_coord, const matrix::Tile<const T, Device::GPU>& in_tile,
-            const Coord& out_coord, const matrix::Tile<T, Device::GPU>& out_tile) {
-  if (row >= k || col >= k)
+void copy1D(cublasHandle_t handle, const SizeType& k_row, const SizeType& k_col, const SizeType& row,
+            const SizeType& col, const Coord& in_coord,
+            const matrix::Tile<const T, Device::GPU>& in_tile, const Coord& out_coord,
+            const matrix::Tile<T, Device::GPU>& out_tile) {
+  if (row >= k_row || col >= k_col)
     return;
 
   const T* in_ptr = in_tile.ptr();
@@ -556,12 +560,12 @@ void copy1D(cublasHandle_t handle, const SizeType& k, const SizeType& row, const
   int out_ld = (out_coord == Coord::Col) ? 1 : to_int(out_tile.ld());
 
   // if `in_tile` is the column buffer
-  SizeType len = (out_coord == Coord::Col) ? std::min(out_tile.size().rows(), k - row)
-                                           : std::min(out_tile.size().cols(), k - col);
+  SizeType len = (out_coord == Coord::Col) ? std::min(out_tile.size().rows(), k_row - row)
+                                           : std::min(out_tile.size().cols(), k_col - col);
   // if out_tile is the column buffer
   if (out_tile.size().cols() == 1) {
-    len = (in_coord == Coord::Col) ? std::min(in_tile.size().rows(), k - row)
-                                   : std::min(in_tile.size().cols(), k - col);
+    len = (in_coord == Coord::Col) ? std::min(in_tile.size().rows(), k_row - row)
+                                   : std::min(in_tile.size().cols(), k_col - col);
   }
 
   if constexpr (std::is_same<T, float>::value) {
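
For context on the `ensure_started` change in `cuppensDecomposition`: `ex::split` only makes a sender shareable by several consumers, the wrapped work itself still starts lazily when some consumer eventually connects to it. Wrapping the sender in `ex::ensure_started` submits the work eagerly, exactly once, before it is shared. A minimal sketch of the pattern with plain pika senders (the computation and the consumer lambdas are placeholders, not DLA-Future code; to be run inside an initialized pika runtime):

#include <pika/execution.hpp>

#include <utility>

namespace ex = pika::execution::experimental;

void cuppens_like_pattern() {
  // A stand-in for one Cuppen decomposition step expressed as a sender.
  auto work = ex::just(41) | ex::then([](int x) { return x + 1; });

  // ensure_started(): the work is submitted eagerly, exactly once.
  // split(): several consumers can then observe the single shared result.
  auto shared = ex::split(ex::ensure_started(std::move(work)));

  // Two independent continuations of the same eagerly started result.
  ex::start_detached(shared | ex::then([](int) { /* first consumer */ }));
  ex::start_detached(std::move(shared) | ex::then([](int) { /* second consumer */ }));
}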
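The recurring `k` to `k_row`/`k_col` edit in the tile kernels follows a single clamping convention: a tile whose top-left element sits at offset (row, col) of the merged submatrix only processes the part that falls inside the non-deflated block, which is now a `k_row` x `k_col` rectangle instead of a `k` x `k` square. A standalone restatement of that convention (hypothetical helper, not part of DLA-Future):

#include <algorithm>
#include <cstdint>

using SizeType = std::int64_t;

// Clamp a tile of size (tile_rows, tile_cols) located at submatrix offset
// (row, col) against the non-deflated block of size (k_row, k_col).
// Returns false when the tile lies entirely in the deflated zone.
bool clampToNonDeflated(SizeType k_row, SizeType k_col, SizeType row, SizeType col,
                        SizeType tile_rows, SizeType tile_cols,
                        SizeType& nrows, SizeType& ncols) {
  if (row >= k_row || col >= k_col)
    return false;
  nrows = std::min(k_row - row, tile_rows);
  ncols = std::min(k_col - col, tile_cols);
  return true;
}

Every kernel changed in kernels.cpp and kernels.cu above opens with exactly this guard and clamp before touching tile elements.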
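The new `i2`-driven fill-in in `solveRank1ProblemDist` writes the `k` non-deflated eigenvector columns through the permutation and gives every deflated entry a unit-vector column. This is also why `calcEvecsFromWeightVec` now skips zero entries instead of dividing: the division would turn the untouched zeros of deflated columns into NaNs. A simplified sketch of the idea on a dense column-major matrix instead of distributed tiles (hypothetical single-rank function, not the DLA-Future implementation):

#include <cstdint>
#include <vector>

using SizeType = std::int64_t;

// n x n zero-initialized column-major matrix; k non-deflated eigenpairs.
// i2 maps the initial ordering to the deflated ordering; i2[i] >= k marks a
// deflated entry, whose eigenvector is the unit vector e_i.
void fillDeflatedColumns(SizeType n, SizeType k, const std::vector<SizeType>& i2,
                         std::vector<double>& evecs) {
  for (SizeType i = 0; i < n; ++i) {
    SizeType j = i2[i];
    if (j >= k)
      evecs[static_cast<std::size_t>(i + j * n)] = 1.0;  // column j becomes e_i
  }
}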