diff --git a/include/dlaf/eigensolver/tridiag_solver/impl.h b/include/dlaf/eigensolver/tridiag_solver/impl.h
index 4a49b93f01..ddc2db5b8c 100644
--- a/include/dlaf/eigensolver/tridiag_solver/impl.h
+++ b/include/dlaf/eigensolver/tridiag_solver/impl.h
@@ -70,9 +70,9 @@ inline std::vector<std::tuple<SizeType, SizeType>> generateSubproblemI
 template <class T>
 auto cuppensDecomposition(Matrix<T, Device::CPU>& tridiag) {
   namespace ex = pika::execution::experimental;
-  using sender_type = decltype(
-      ex::split(cuppensDecompAsync(tridiag.readwrite_sender(std::declval<LocalTileIndex>()),
-                                   tridiag.readwrite_sender(std::declval<LocalTileIndex>()))));
+  using sender_type = decltype(ex::split(ex::ensure_started(
+      cuppensDecompAsync(tridiag.readwrite_sender(std::declval<LocalTileIndex>()),
+                         tridiag.readwrite_sender(std::declval<LocalTileIndex>())))));
   using vector_type = std::vector<sender_type>;
 
   if (tridiag.nrTiles().rows() == 0)
@@ -83,9 +83,9 @@ auto cuppensDecomposition(Matrix<T, Device::CPU>& tridiag) {
   offdiag_vals.reserve(to_sizet(i_end));
 
   for (SizeType i_split = 0; i_split < i_end; ++i_split) {
-    offdiag_vals.push_back(
-        ex::split(cuppensDecompAsync(tridiag.readwrite_sender(LocalTileIndex(i_split, 0)),
-                                     tridiag.readwrite_sender(LocalTileIndex(i_split + 1, 0)))));
+    offdiag_vals.push_back(ex::split(ex::ensure_started(
+        cuppensDecompAsync(tridiag.readwrite_sender(LocalTileIndex(i_split, 0)),
+                           tridiag.readwrite_sender(LocalTileIndex(i_split + 1, 0)))))));
   }
   return offdiag_vals;
 }
@@ -329,20 +329,21 @@ void TridiagSolver<B, D, T>::call(comm::CommunicatorGrid grid, Matrix<T, Device:
-  DistWorkSpace<T, D> ws{Matrix<T, D>(dist_evecs),   // mat1
-                         Matrix<T, D>(dist_evecs),   // mat2
-                         Matrix<T, D>(dist_evals),   // z
-                         Matrix<T, D>(dist_evals)};  // ztmp
+  WorkSpace<T, D> ws{Matrix<T, D>(dist_evecs),          // mat1
+                     Matrix<T, D>(dist_evecs),          // mat2
+                     Matrix<T, D>(dist_evals),          // z
+                     Matrix<T, D>(dist_evals),          // ztmp
+                     Matrix<SizeType, D>(dist_evals)};  // i2
 
-  DistWorkSpaceHost<T> ws_h{Matrix<T, Device::CPU>(dist_evals),         // dtmp
-                            Matrix<SizeType, Device::CPU>(dist_evals),  // i1
-                            Matrix<SizeType, Device::CPU>(dist_evals),  // i2
-                            Matrix<SizeType, Device::CPU>(dist_evals),  // i3
-                            Matrix<ColType, Device::CPU>(dist_evals)};  // c
+  WorkSpaceHost<T> ws_h{Matrix<T, Device::CPU>(dist_evals),         // dtmp
+                        Matrix<SizeType, Device::CPU>(dist_evals),  // i1
+                        Matrix<SizeType, Device::CPU>(dist_evals),  // i3
+                        Matrix<ColType, Device::CPU>(dist_evals)};  // c
 
   DistWorkSpaceHostMirror<T, D> ws_hm{initMirrorMatrix(evals), initMirrorMatrix(evecs),
                                       initMirrorMatrix(ws.mat1), initMirrorMatrix(ws.mat2),
-                                      initMirrorMatrix(ws.z), initMirrorMatrix(ws.ztmp)};
+                                      initMirrorMatrix(ws.z), initMirrorMatrix(ws.ztmp),
+                                      initMirrorMatrix(ws.i2)};
 
   // Set `evecs` to `zero` (needed for Givens rotation to make sure no random values are picked up)
   matrix::util::set0(pika::execution::thread_priority::normal, evecs);
diff --git a/include/dlaf/eigensolver/tridiag_solver/kernels.h b/include/dlaf/eigensolver/tridiag_solver/kernels.h
index 483a30d533..b12b6482dd 100644
--- a/include/dlaf/eigensolver/tridiag_solver/kernels.h
+++ b/include/dlaf/eigensolver/tridiag_solver/kernels.h
@@ -277,15 +277,16 @@ void initIndexTileAsync(SizeType tile_row, TileSender&& tile) {
 }
 
 template <class T>
-void divideEvecsByDiagonal(const SizeType& k, const SizeType& i_subm_el, const SizeType& j_subm_el,
+void divideEvecsByDiagonal(const SizeType& k_row, const SizeType& k_col, const SizeType& i_subm_el,
+                           const SizeType& j_subm_el,
                            const matrix::Tile<const T, Device::CPU>& diag_rows,
                            const matrix::Tile<const T, Device::CPU>& diag_cols,
                            const matrix::Tile<const T, Device::CPU>& evecs_tile,
                            const matrix::Tile<T, Device::CPU>& ws_tile);
 
 #define DLAF_CPU_DIVIDE_EVECS_BY_DIAGONAL_ETI(kword, Type)                                            \
-  kword template void divideEvecsByDiagonal(const SizeType& k, const SizeType& i_subm_el,             \
-                                            const SizeType& j_subm_el,                                \
+  kword template void divideEvecsByDiagonal(const SizeType& k_row, const SizeType& k_col,             \
+                                            const SizeType& i_subm_el, const SizeType& j_subm_el,     \
                                             const matrix::Tile<const Type, Device::CPU>& diag_rows,   \
                                             const matrix::Tile<const Type, Device::CPU>& diag_cols,   \
                                             const matrix::Tile<const Type, Device::CPU>& evecs_tile,  \
@@ -296,15 +297,16 @@ DLAF_CPU_DIVIDE_EVECS_BY_DIAGONAL_ETI(extern, double);
 
 #ifdef DLAF_WITH_GPU
 template <class T>
-void divideEvecsByDiagonal(const SizeType& k, const SizeType& i_subm_el, const SizeType& j_subm_el,
+void divideEvecsByDiagonal(const SizeType& k_row, const SizeType& k_col, const SizeType& i_subm_el,
+                           const SizeType& j_subm_el,
                            const matrix::Tile<const T, Device::GPU>& diag_rows,
                            const matrix::Tile<const T, Device::GPU>& diag_cols,
                            const matrix::Tile<const T, Device::GPU>& evecs_tile,
                            const matrix::Tile<T, Device::GPU>& ws_tile, whip::stream_t stream);
 
 #define DLAF_GPU_DIVIDE_EVECS_BY_DIAGONAL_ETI(kword, Type)                                            \
-  kword template void divideEvecsByDiagonal(const SizeType& k, const SizeType& i_subm_el,             \
-                                            const SizeType& j_subm_el,                                \
+  kword template void divideEvecsByDiagonal(const SizeType& k_row, const SizeType& k_col,             \
+                                            const SizeType& i_subm_el, const SizeType& j_subm_el,     \
                                             const matrix::Tile<const Type, Device::GPU>& diag_rows,   \
                                             const matrix::Tile<const Type, Device::GPU>& diag_cols,   \
                                             const matrix::Tile<const Type, Device::GPU>& evecs_tile,  \
@@ -317,14 +319,15 @@ DLAF_GPU_DIVIDE_EVECS_BY_DIAGONAL_ETI(extern, double);
 
 DLAF_MAKE_CALLABLE_OBJECT(divideEvecsByDiagonal);
 
-template <Device D, class KSender, class DiagRowsTileSender, class DiagColsTileSender,
-          class EvecsTileSender, class TempTileSender>
-void divideEvecsByDiagonalAsync(KSender&& k, SizeType i_subm_el, SizeType j_subm_el,
-                                DiagRowsTileSender&& diag_rows, DiagColsTileSender&& diag_cols,
-                                EvecsTileSender&& evecs, TempTileSender&& temp) {
+template <Device D, class KRSender, class KCSender, class DiagRowsTileSender,
+          class DiagColsTileSender, class EvecsTileSender, class TempTileSender>
+void divideEvecsByDiagonalAsync(KRSender&& k_row, KCSender&& k_col, SizeType i_subm_el,
+                                SizeType j_subm_el, DiagRowsTileSender&& diag_rows,
+                                DiagColsTileSender&& diag_cols, EvecsTileSender&& evecs,
+                                TempTileSender&& temp) {
   namespace di = dlaf::internal;
 
   auto sender =
-      di::whenAllLift(std::forward<KSender>(k), i_subm_el, j_subm_el,
+      di::whenAllLift(std::forward<KRSender>(k_row), std::forward<KCSender>(k_col), i_subm_el, j_subm_el,
                       std::forward<DiagRowsTileSender>(diag_rows),
                       std::forward<DiagColsTileSender>(diag_cols),
                       std::forward<EvecsTileSender>(evecs), std::forward<TempTileSender>(temp));
@@ -332,13 +335,14 @@ void divideEvecsByDiagonalAsync(KSender&& k, SizeType i_subm_el, SizeType j_subm
 }
 
 template <class T>
-void multiplyFirstColumns(const SizeType& k, const SizeType& row, const SizeType& col,
-                          const matrix::Tile<const T, Device::CPU>& in,
+void multiplyFirstColumns(const SizeType& k_row, const SizeType& k_col, const SizeType& row,
+                          const SizeType& col, const matrix::Tile<const T, Device::CPU>& in,
                           const matrix::Tile<T, Device::CPU>& out);
 
-#define DLAF_CPU_MULTIPLY_FIRST_COLUMNS_ETI(kword, Type)                                                 \
-  kword template void multiplyFirstColumns(const SizeType& k, const SizeType& row, const SizeType& col, \
-                                           const matrix::Tile<const Type, Device::CPU>& in,             \
+#define DLAF_CPU_MULTIPLY_FIRST_COLUMNS_ETI(kword, Type)                                                 \
+  kword template void multiplyFirstColumns(const SizeType& k_row, const SizeType& k_col,                 \
+                                           const SizeType& row, const SizeType& col,                     \
+                                           const matrix::Tile<const Type, Device::CPU>& in,              \
                                            const matrix::Tile<Type, Device::CPU>& out)
 
 DLAF_CPU_MULTIPLY_FIRST_COLUMNS_ETI(extern, float);
@@ -346,14 +350,15 @@ DLAF_CPU_MULTIPLY_FIRST_COLUMNS_ETI(extern, double);
 
 #ifdef DLAF_WITH_GPU
 template <class T>
-void multiplyFirstColumns(const SizeType& k, const SizeType& row, const SizeType& col,
-                          const matrix::Tile<const T, Device::GPU>& in,
+void multiplyFirstColumns(const SizeType& k_row, const SizeType& k_col, const SizeType& row,
+                          const SizeType& col, const matrix::Tile<const T, Device::GPU>& in,
                           const matrix::Tile<T, Device::GPU>& out, whip::stream_t stream);
 
-#define DLAF_GPU_MULTIPLY_FIRST_COLUMNS_ETI(kword, Type)                                                 \
-  kword template void multiplyFirstColumns(const SizeType& k, const SizeType& row, const SizeType& col, \
-                                           const matrix::Tile<const Type, Device::GPU>& in,             \
-                                           const matrix::Tile<Type, Device::GPU>& out,                  \
+#define DLAF_GPU_MULTIPLY_FIRST_COLUMNS_ETI(kword, Type)                                                 \
+  kword template void multiplyFirstColumns(const SizeType& k_row, const SizeType& k_col,                 \
+                                           const SizeType& row, const SizeType& col,                     \
+                                           const matrix::Tile<const Type, Device::GPU>& in,              \
+                                           const matrix::Tile<Type, Device::GPU>& out,                   \
                                            whip::stream_t stream)
 
 DLAF_GPU_MULTIPLY_FIRST_COLUMNS_ETI(extern, float);
@@ -362,24 +367,24 @@ DLAF_GPU_MULTIPLY_FIRST_COLUMNS_ETI(extern, double);
 
 DLAF_MAKE_CALLABLE_OBJECT(multiplyFirstColumns);
 
-template <Device D, class KSender, class InTileSender, class OutTileSender>
-void multiplyFirstColumnsAsync(KSender&& k, SizeType row, SizeType col, InTileSender&& in,
-                               OutTileSender&& out) {
+template <Device D, class KRSender, class KCSender, class InTileSender, class OutTileSender>
+void multiplyFirstColumnsAsync(KRSender&& k_row, KCSender&& k_col, SizeType row, SizeType col,
+                               InTileSender&& in, OutTileSender&& out) {
   namespace di = dlaf::internal;
-  auto sender = di::whenAllLift(std::forward<KSender>(k), row, col, std::forward<InTileSender>(in),
+  auto sender = di::whenAllLift(k_row, k_col, row, col, std::forward<InTileSender>(in),
                                 std::forward<OutTileSender>(out));
   di::transformDetach(di::Policy<DefaultBackend_v<D>>(), multiplyFirstColumns_o, std::move(sender));
 }
 
 template <class T>
-void calcEvecsFromWeightVec(const SizeType& k, const SizeType& row, const SizeType& col,
-                            const matrix::Tile<const T, Device::CPU>& z_tile,
+void calcEvecsFromWeightVec(const SizeType& k_row, const SizeType& k_col, const SizeType& row,
+                            const SizeType& col, const matrix::Tile<const T, Device::CPU>& z_tile,
                             const matrix::Tile<const T, Device::CPU>& ws_tile,
                             const matrix::Tile<T, Device::CPU>& evecs_tile);
 
 #define DLAF_CPU_CALC_EVECS_FROM_WEIGHT_VEC_ETI(kword, Type)                                         \
-  kword template void calcEvecsFromWeightVec(const SizeType& k, const SizeType& row,                \
-                                             const SizeType& col,                                   \
+  kword template void calcEvecsFromWeightVec(const SizeType& k_row, const SizeType& k_col,          \
+                                             const SizeType& row, const SizeType& col,              \
                                              const matrix::Tile<const Type, Device::CPU>& z_tile,   \
                                              const matrix::Tile<const Type, Device::CPU>& ws_tile,  \
                                              const matrix::Tile<Type, Device::CPU>& evecs_tile)
@@ -389,14 +394,14 @@ DLAF_CPU_CALC_EVECS_FROM_WEIGHT_VEC_ETI(extern, double);
 
 #ifdef DLAF_WITH_GPU
 template <class T>
-void calcEvecsFromWeightVec(const SizeType& k, const SizeType& row, const SizeType& col,
-                            const matrix::Tile<const T, Device::GPU>& z_tile,
+void calcEvecsFromWeightVec(const SizeType& k_row, const SizeType& k_col, const SizeType& row,
+                            const SizeType& col, const matrix::Tile<const T, Device::GPU>& z_tile,
                             const matrix::Tile<const T, Device::GPU>& ws_tile,
                             const matrix::Tile<T, Device::GPU>& evecs_tile, whip::stream_t stream);
 
 #define DLAF_GPU_CALC_EVECS_FROM_WEIGHT_VEC_ETI(kword, Type)                                         \
-  kword template void calcEvecsFromWeightVec(const SizeType& k, const SizeType& row,                \
-                                             const SizeType& col,                                   \
+  kword template void calcEvecsFromWeightVec(const SizeType& k_row, const SizeType& k_col,          \
+                                             const SizeType& row, const SizeType& col,              \
                                              const matrix::Tile<const Type, Device::GPU>& z_tile,   \
                                              const matrix::Tile<const Type, Device::GPU>& ws_tile,  \
                                              const matrix::Tile<Type, Device::GPU>& evecs_tile,     \
@@ -408,25 +413,28 @@ DLAF_GPU_CALC_EVECS_FROM_WEIGHT_VEC_ETI(extern, double);
 
 DLAF_MAKE_CALLABLE_OBJECT(calcEvecsFromWeightVec);
 
-template <Device D, class KSender, class Rank1TileSender, class TempTileSender, class EvecsTileSender>
-void calcEvecsFromWeightVecAsync(KSender&& k, SizeType row, SizeType col, Rank1TileSender&& rank1,
-                                 TempTileSender&& temp, EvecsTileSender&& evecs) {
+template <Device D, class KRSender, class KCSender, class Rank1TileSender, class TempTileSender,
+          class EvecsTileSender>
+void calcEvecsFromWeightVecAsync(KRSender&& k_row, KCSender&& k_col, SizeType row, SizeType col,
+                                 Rank1TileSender&& rank1, TempTileSender&& temp,
+                                 EvecsTileSender&& evecs) {
   namespace di = dlaf::internal;
 
   auto sender =
-      di::whenAllLift(std::forward<KSender>(k), row, col, std::forward<Rank1TileSender>(rank1),
+      di::whenAllLift(k_row, k_col, row, col, std::forward<Rank1TileSender>(rank1),
                       std::forward<TempTileSender>(temp), std::forward<EvecsTileSender>(evecs));
   di::transformDetach(di::Policy<DefaultBackend_v<D>>(), calcEvecsFromWeightVec_o, std::move(sender));
 }
 
 template <class T>
-void sumsqCols(const SizeType& k, const SizeType& row, const SizeType& col,
+void sumsqCols(const SizeType& k_row, const SizeType& k_col, const SizeType& row, const SizeType& col,
               const matrix::Tile<const T, Device::CPU>& evecs_tile,
               const matrix::Tile<T, Device::CPU>& ws_tile);
 
-#define DLAF_CPU_SUMSQ_COLS_ETI(kword, Type)                                                        \
-  kword template void sumsqCols(const SizeType& k, const SizeType& row, const SizeType& col,        \
-                                const matrix::Tile<const Type, Device::CPU>& evecs_tile,            \
+#define DLAF_CPU_SUMSQ_COLS_ETI(kword, Type)                                                        \
+  kword template void sumsqCols(const SizeType& k_row, const SizeType& k_col, const SizeType& row,  \
+                                const SizeType& col,                                                \
+                                const matrix::Tile<const Type, Device::CPU>& evecs_tile,            \
                                 const matrix::Tile<Type, Device::CPU>& ws_tile)
 
 DLAF_CPU_SUMSQ_COLS_ETI(extern, float);
@@ -434,13 +442,14 @@ DLAF_CPU_SUMSQ_COLS_ETI(extern, double);
 
 #ifdef DLAF_WITH_GPU
 template <class T>
-void sumsqCols(const SizeType& k, const SizeType& row, const SizeType& col,
+void sumsqCols(const SizeType& k_row, const SizeType& k_col, const SizeType& row, const SizeType& col,
               const matrix::Tile<const T, Device::GPU>& evecs_tile,
               const matrix::Tile<T, Device::GPU>& ws_tile, whip::stream_t stream);
 
-#define DLAF_GPU_SUMSQ_COLS_ETI(kword, Type)                                                        \
-  kword template void sumsqCols(const SizeType& k, const SizeType& row, const SizeType& col,        \
-                                const matrix::Tile<const Type, Device::GPU>& evecs_tile,            \
+#define DLAF_GPU_SUMSQ_COLS_ETI(kword, Type)                                                        \
+  kword template void sumsqCols(const SizeType& k_row, const SizeType& k_col, const SizeType& row,  \
+                                const SizeType& col,                                                \
+                                const matrix::Tile<const Type, Device::GPU>& evecs_tile,            \
                                 const matrix::Tile<Type, Device::GPU>& ws_tile, whip::stream_t stream)
 
 DLAF_GPU_SUMSQ_COLS_ETI(extern, float);
@@ -449,23 +458,25 @@ DLAF_GPU_SUMSQ_COLS_ETI(extern, double);
 
 DLAF_MAKE_CALLABLE_OBJECT(sumsqCols);
 
-template <Device D, class KSender, class EvecsTileSender, class TempTileSender>
-void sumsqColsAsync(KSender&& k, SizeType row, SizeType col, EvecsTileSender&& evecs,
-                    TempTileSender&& temp) {
+template <Device D, class KRSender, class KCSender, class EvecsTileSender, class TempTileSender>
+void sumsqColsAsync(KRSender&& k_row, KCSender&& k_col, SizeType row, SizeType col,
+                    EvecsTileSender&& evecs, TempTileSender&& temp) {
   namespace di = dlaf::internal;
-  auto sender = di::whenAllLift(std::forward<KSender>(k), row, col, std::forward<EvecsTileSender>(evecs),
-                                std::forward<TempTileSender>(temp));
+  auto sender =
+      di::whenAllLift(std::forward<KRSender>(k_row), std::forward<KCSender>(k_col), row, col,
+                      std::forward<EvecsTileSender>(evecs), std::forward<TempTileSender>(temp));
   di::transformDetach(di::Policy<DefaultBackend_v<D>>(), sumsqCols_o, std::move(sender));
 }
 
 template <class T>
-void addFirstRows(const SizeType& k, const SizeType& row, const SizeType& col,
+void addFirstRows(const SizeType& k_row, const SizeType& k_col, const SizeType& row, const SizeType& col,
                   const matrix::Tile<const T, Device::CPU>& in,
                   const matrix::Tile<T, Device::CPU>& out);
 
-#define DLAF_CPU_ADD_FIRST_ROWS_ETI(kword, Type)                                                       \
-  kword template void addFirstRows(const SizeType& k, const SizeType& row, const SizeType& col,        \
-                                   const matrix::Tile<const Type, Device::CPU>& in,                    \
+#define DLAF_CPU_ADD_FIRST_ROWS_ETI(kword, Type)                                                       \
+  kword template void addFirstRows(const SizeType& k_row, const SizeType& k_col, const SizeType& row,  \
+                                   const SizeType& col,                                                \
+                                   const matrix::Tile<const Type, Device::CPU>& in,                    \
                                    const matrix::Tile<Type, Device::CPU>& out)
 
 DLAF_CPU_ADD_FIRST_ROWS_ETI(extern, float);
@@ -473,13 +484,14 @@ DLAF_CPU_ADD_FIRST_ROWS_ETI(extern, double);
 
 #ifdef DLAF_WITH_GPU
 template <class T>
-void addFirstRows(const SizeType& k, const SizeType& row, const SizeType& col,
+void addFirstRows(const SizeType& k_row, const SizeType& k_col, const SizeType& row, const SizeType& col,
                   const matrix::Tile<const T, Device::GPU>& evecs_tile,
                   const matrix::Tile<T, Device::GPU>& ws_tile, whip::stream_t stream);
 
-#define DLAF_GPU_ADD_FIRST_ROWS_ETI(kword, Type)                                                       \
-  kword template void addFirstRows(const SizeType& k, const SizeType& row, const SizeType& col,        \
-                                   const matrix::Tile<const Type, Device::GPU>& in,                    \
+#define DLAF_GPU_ADD_FIRST_ROWS_ETI(kword, Type)                                                       \
+  kword template void addFirstRows(const SizeType& k_row, const SizeType& k_col, const SizeType& row,  \
+                                   const SizeType& col,                                                \
+                                   const matrix::Tile<const Type, Device::GPU>& in,                    \
                                    const matrix::Tile<Type, Device::GPU>& out, whip::stream_t stream)
 
 DLAF_GPU_ADD_FIRST_ROWS_ETI(extern, float);
@@ -488,23 +500,25 @@ DLAF_GPU_ADD_FIRST_ROWS_ETI(extern, double);
 
 DLAF_MAKE_CALLABLE_OBJECT(addFirstRows);
 
-template <Device D, class KSender, class InTileSender, class OutTileSender>
-void addFirstRowsAsync(KSender&& k, SizeType row, SizeType col, InTileSender&& in, OutTileSender&& out) {
+template <Device D, class KRSender, class KCSender, class InTileSender, class OutTileSender>
+void addFirstRowsAsync(KRSender&& k_row, KCSender&& k_col, SizeType row, SizeType col, InTileSender&& in,
+                       OutTileSender&& out) {
   namespace di = dlaf::internal;
-  auto sender = di::whenAllLift(std::forward<KSender>(k), row, col, std::forward<InTileSender>(in),
-                                std::forward<OutTileSender>(out));
+  auto sender = di::whenAllLift(std::forward<KRSender>(k_row), std::forward<KCSender>(k_col), row, col,
+                                std::forward<InTileSender>(in), std::forward<OutTileSender>(out));
   di::transformDetach(di::Policy<DefaultBackend_v<D>>(), addFirstRows_o, std::move(sender));
 }
 
 template <class T>
-void divideColsByFirstRow(const SizeType& k, const SizeType& row, const SizeType& col,
-                          const matrix::Tile<const T, Device::CPU>& in,
+void divideColsByFirstRow(const SizeType& k_row, const SizeType& k_col, const SizeType& row,
+                          const SizeType& col, const matrix::Tile<const T, Device::CPU>& in,
                           const matrix::Tile<T, Device::CPU>& out);
 
-#define DLAF_CPU_DIVIDE_COLS_BY_FIRST_ROW_ETI(kword, Type)                                               \
-  kword template void divideColsByFirstRow(const SizeType& k, const SizeType& row, const SizeType& col, \
-                                           const matrix::Tile<const Type, Device::CPU>& in,             \
+#define DLAF_CPU_DIVIDE_COLS_BY_FIRST_ROW_ETI(kword, Type)                                               \
+  kword template void divideColsByFirstRow(const SizeType& k_row, const SizeType& k_col,                 \
+                                           const SizeType& row, const SizeType& col,                     \
+                                           const matrix::Tile<const Type, Device::CPU>& in,              \
                                            const matrix::Tile<Type, Device::CPU>& out)
 
 DLAF_CPU_DIVIDE_COLS_BY_FIRST_ROW_ETI(extern, float);
@@ -512,14 +526,15 @@ DLAF_CPU_DIVIDE_COLS_BY_FIRST_ROW_ETI(extern, double);
 
 #ifdef DLAF_WITH_GPU
 template <class T>
-void divideColsByFirstRow(const SizeType& k, const SizeType& row, const SizeType& col,
-                          const matrix::Tile<const T, Device::GPU>& evecs_tile,
+void divideColsByFirstRow(const SizeType& k_row, const SizeType& k_col, const SizeType& row,
+                          const SizeType& col, const matrix::Tile<const T, Device::GPU>& evecs_tile,
                           const matrix::Tile<T, Device::GPU>& ws_tile, whip::stream_t stream);
 
-#define DLAF_GPU_DIVIDE_COLS_BY_FIRST_ROW_ETI(kword, Type)                                               \
-  kword template void divideColsByFirstRow(const SizeType& k, const SizeType& row, const SizeType& col, \
-                                           const matrix::Tile<const Type, Device::GPU>& in,             \
-                                           const matrix::Tile<Type, Device::GPU>& out,                  \
+#define DLAF_GPU_DIVIDE_COLS_BY_FIRST_ROW_ETI(kword, Type)                                               \
+  kword template void divideColsByFirstRow(const SizeType& k_row, const SizeType& k_col,                 \
+                                           const SizeType& row, const SizeType& col,                     \
+                                           const matrix::Tile<const Type, Device::GPU>& in,              \
+                                           const matrix::Tile<Type, Device::GPU>& out,                   \
                                            whip::stream_t stream)
 
 DLAF_GPU_DIVIDE_COLS_BY_FIRST_ROW_ETI(extern, float);
@@ -528,13 +543,13 @@ DLAF_GPU_DIVIDE_COLS_BY_FIRST_ROW_ETI(extern, double);
 
 DLAF_MAKE_CALLABLE_OBJECT(divideColsByFirstRow);
 
-template <Device D, class KSender, class InTileSender, class OutTileSender>
-void divideColsByFirstRowAsync(KSender&& k, SizeType row, SizeType col, InTileSender&& in,
-                               OutTileSender&& out) {
+template <Device D, class KRSender, class KCSender, class InTileSender, class OutTileSender>
+void divideColsByFirstRowAsync(KRSender&& k_row, KCSender&& k_col, SizeType row, SizeType col,
+                               InTileSender&& in, OutTileSender&& out) {
   namespace di = dlaf::internal;
-  auto sender = di::whenAllLift(std::forward<KSender>(k), row, col, std::forward<InTileSender>(in),
-                                std::forward<OutTileSender>(out));
+  auto sender = di::whenAllLift(std::forward<KRSender>(k_row), std::forward<KCSender>(k_col), row, col,
+                                std::forward<InTileSender>(in), std::forward<OutTileSender>(out));
   di::transformDetach(di::Policy<DefaultBackend_v<D>>(), divideColsByFirstRow_o, std::move(sender));
 }
 
@@ -577,14 +592,14 @@ void setUnitDiagonalAsync(KSender&& k, SizeType tile_begin, TileSender&& tile) {
 // out_coord) of @p out_tile that is inside a square defined by the global element index @p k where both
 // tiles begin at global element index (@p row, @p col).
 template <class T>
-void copy1D(const SizeType& k, const SizeType& row, const SizeType& col, const Coord& in_coord,
-            const matrix::Tile<const T, Device::CPU>& in_tile, const Coord& out_coord,
-            const matrix::Tile<T, Device::CPU>& out_tile);
-
-#define DLAF_CPU_COPY_1D_ETI(kword, Type)                                                         \
-  kword template void copy1D(const SizeType& k, const SizeType& row, const SizeType& col,         \
-                             const Coord& in_coord,                                               \
-                             const matrix::Tile<const Type, Device::CPU>& in_tile,                \
+void copy1D(const SizeType& k_row, const SizeType& k_col, const SizeType& row, const SizeType& col,
+            const Coord& in_coord, const matrix::Tile<const T, Device::CPU>& in_tile,
+            const Coord& out_coord, const matrix::Tile<T, Device::CPU>& out_tile);
+
+#define DLAF_CPU_COPY_1D_ETI(kword, Type)                                                         \
+  kword template void copy1D(const SizeType& k_row, const SizeType& k_col, const SizeType& row,   \
+                             const SizeType& col, const Coord& in_coord,                          \
+                             const matrix::Tile<const Type, Device::CPU>& in_tile,                \
                              const Coord& out_coord, const matrix::Tile<Type, Device::CPU>& out_tile)
 
 DLAF_CPU_COPY_1D_ETI(extern, float);
@@ -593,14 +608,15 @@ DLAF_CPU_COPY_1D_ETI(extern, double);
 
 #ifdef DLAF_WITH_GPU
 template <class T>
-void copy1D(cublasHandle_t handle, const SizeType& k, const SizeType& row, const SizeType& col,
-            const Coord& in_coord, const matrix::Tile<const T, Device::GPU>& in_tile,
-            const Coord& out_coord, const matrix::Tile<T, Device::GPU>& out_tile);
-
-#define DLAF_GPU_COPY_1D_ETI(kword, Type)                                                         \
-  kword template void copy1D(cublasHandle_t handle, const SizeType& k, const SizeType& row,       \
-                             const SizeType& col, const Coord& in_coord,                          \
-                             const matrix::Tile<const Type, Device::GPU>& in_tile,                \
+void copy1D(cublasHandle_t handle, const SizeType& k_row, const SizeType& k_col, const SizeType& row,
+            const SizeType& col, const Coord& in_coord,
+            const matrix::Tile<const T, Device::GPU>& in_tile, const Coord& out_coord,
+            const matrix::Tile<T, Device::GPU>& out_tile);
+
+#define DLAF_GPU_COPY_1D_ETI(kword, Type)                                                         \
+  kword template void copy1D(cublasHandle_t handle, const SizeType& k_row, const SizeType& k_col, \
+                             const SizeType& row, const SizeType& col, const Coord& in_coord,     \
+                             const matrix::Tile<const Type, Device::GPU>& in_tile,                \
                              const Coord& out_coord, const matrix::Tile<Type, Device::GPU>& out_tile)
 
 DLAF_GPU_COPY_1D_ETI(extern, float);
@@ -610,14 +626,14 @@ DLAF_GPU_COPY_1D_ETI(extern, double);
 
 DLAF_MAKE_CALLABLE_OBJECT(copy1D);
 
-template <Device D, class KSender, class InTileSender, class OutTileSender>
-void copy1DAsync(KSender&& k, SizeType row, SizeType col, Coord in_coord, InTileSender&& in,
-                 Coord out_coord, OutTileSender&& out) {
+template <Device D, class KRSender, class KCSender, class InTileSender, class OutTileSender>
+void copy1DAsync(KRSender&& k_row, KCSender&& k_col, SizeType row, SizeType col, Coord in_coord,
                 InTileSender&& in, Coord out_coord, OutTileSender&& out) {
   namespace di = dlaf::internal;
  namespace ex = pika::execution::experimental;
 
   auto sender =
-      di::whenAllLift(std::forward<KSender>(k), row, col, in_coord, std::forward<InTileSender>(in),
-                      out_coord, std::forward<OutTileSender>(out));
+      di::whenAllLift(std::forward<KRSender>(k_row), std::forward<KCSender>(k_col), row, col, in_coord,
+                      std::forward<InTileSender>(in), out_coord, std::forward<OutTileSender>(out));
 
   ex::start_detached(di::transform(di::Policy<DefaultBackend_v<D>>(), copy1D_o, std::move(sender)));
 }
diff --git a/include/dlaf/eigensolver/tridiag_solver/merge.h b/include/dlaf/eigensolver/tridiag_solver/merge.h
index 5ac1ed57ad..ceb6fb52f4 100644
--- a/include/dlaf/eigensolver/tridiag_solver/merge.h
+++ b/include/dlaf/eigensolver/tridiag_solver/merge.h
@@ -133,26 +133,6 @@ struct WorkSpaceHostMirror {
   Matrix<SizeType, Device::CPU>& i2;
 };
 
-template <class T, Device D>
-struct DistWorkSpace {
-  Matrix<T, D> mat1;
-  Matrix<T, D> mat2;
-
-  Matrix<T, D> z;
-  Matrix<T, D> ztmp;
-};
-
-template <class T>
-struct DistWorkSpaceHost {
-  Matrix<T, Device::CPU> dtmp;
-
-  Matrix<SizeType, Device::CPU> i1;
-  Matrix<SizeType, Device::CPU> i2;
-  Matrix<SizeType, Device::CPU> i3;
-
-  Matrix<ColType, Device::CPU> c;
-};
-
 // forward declaration : Device::GPU - unused
 template <class T, Device D>
 struct DistWorkSpaceHostMirror {
@@ -163,6 +143,8 @@ struct DistWorkSpaceHostMirror {
 
   Matrix<T, Device::CPU> z;
   Matrix<T, Device::CPU> ztmp;
+
+  Matrix<SizeType, Device::CPU> i2;
 };
 
 template <class T>
@@ -174,6 +156,8 @@ struct DistWorkSpaceHostMirror<T, Device::CPU> {
 
   Matrix<T, Device::CPU>& z;
   Matrix<T, Device::CPU>& ztmp;
+
+  Matrix<SizeType, Device::CPU>& i2;
 };
 
 template <class T>
@@ -685,10 +669,11 @@ void solveRank1Problem(SizeType i_begin, SizeType i_end, KSender&& k, RhoSender&
 // - lapack 3.10.0, dlaed3.f, line 293
 // - LAPACK Working Notes: lawn132, Parallelizing the Divide and Conquer Algorithm for the Symmetric
 //   Tridiagonal Eigenvalue Problem on Distributed Memory Architectures, 4.2 Orthogonality
-template <class T, Device D, class KSender>
+template <class T, Device D, class KRSender, class KCSender>
 void initWeightVector(GlobalTileIndex idx_gl_begin, LocalTileIndex idx_loc_begin,
-                      LocalTileSize sz_loc_tiles, KSender&& k, Matrix<T, D>& diag,
-                      Matrix<T, D>& evecs, Matrix<T, D>& ws) {
+                      LocalTileSize sz_loc_tiles, KRSender&& k_row, KCSender&& k_col,
+                      Matrix<T, D>& diag, Matrix<T, D>& diag_i2, Matrix<T, D>& evecs,
+                      Matrix<T, D>& ws) {
   const matrix::Distribution& dist = evecs.distribution();
 
   // Reduce by multiplication into the first local column of each tile of the workspace matrix `ws`
@@ -697,8 +682,8 @@ void initWeightVector(GlobalTileIndex idx_gl_begin, LocalTileIndex idx_loc_begin
     auto sz_gl_el = dist.globalTileElementDistance(idx_gl_begin, idx_gl_tile);
     // Divide the eigenvectors of the rank1 update problem `evecs` by its diagonal matrix `diag` and
     // reduce multiply into the first column of each tile of the workspace matrix `ws`
-    divideEvecsByDiagonalAsync<D>(k, sz_gl_el.rows(), sz_gl_el.cols(),
-                                  diag.read_sender(GlobalTileIndex(idx_gl_tile.row(), 0)),
+    divideEvecsByDiagonalAsync<D>(k_row, k_col, sz_gl_el.rows(), sz_gl_el.cols(),
+                                  diag_i2.read_sender(GlobalTileIndex(idx_gl_tile.row(), 0)),
                                   diag.read_sender(GlobalTileIndex(idx_gl_tile.col(), 0)),
                                   evecs.read_sender(idx_loc_tile), ws.readwrite_sender(idx_loc_tile));
@@ -709,7 +694,8 @@ void initWeightVector(GlobalTileIndex idx_gl_begin, LocalTileIndex idx_loc_begin
     // reduce-multiply the first column of each local tile of the workspace matrix into the first local
     // column of the matrix
     LocalTileIndex idx_ws_first_col_tile(idx_loc_tile.row(), idx_loc_begin.col());
-    multiplyFirstColumnsAsync<D>(k, sz_gl_el.rows(), sz_gl_el.cols(), ws.read_sender(idx_loc_tile),
+    multiplyFirstColumnsAsync<D>(k_row, k_col, sz_gl_el.rows(), sz_gl_el.cols(),
+                                 ws.read_sender(idx_loc_tile),
                                  ws.readwrite_sender(idx_ws_first_col_tile));
   }
 }
@@ -718,10 +704,10 @@ void initWeightVector(GlobalTileIndex idx_gl_begin, LocalTileIndex idx_loc_begin
 // - lapack 3.10.0, dlaed3.f, line 293
 // - LAPACK Working Notes: lawn132, Parallelizing the Divide and Conquer Algorithm for the Symmetric
 //   Tridiagonal Eigenvalue Problem on Distributed Memory Architectures, 4.2 Orthogonality
-template <class T, Device D, class KSender>
+template <class T, Device D, class KRSender, class KCSender>
 void formEvecsUsingWeightVec(GlobalTileIndex idx_gl_begin, LocalTileIndex idx_loc_begin,
-                             LocalTileSize sz_loc_tiles, KSender&& k, Matrix<T, D>& z,
-                             Matrix<T, D>& ws, Matrix<T, D>& evecs) {
+                             LocalTileSize sz_loc_tiles, KRSender&& k_row, KCSender&& k_col,
+                             Matrix<T, D>& z, Matrix<T, D>& ws, Matrix<T, D>& evecs) {
   const matrix::Distribution& dist = evecs.distribution();
 
   for (auto idx_loc_tile : common::iterate_range2d(idx_loc_begin, sz_loc_tiles)) {
@@ -729,7 +715,7 @@ void formEvecsUsingWeightVec(GlobalTileIndex idx_gl_begin, LocalTileIndex idx_lo
     auto sz_gl_el = dist.globalTileElementDistance(idx_gl_begin, idx_gl_tile);
     LocalTileIndex idx_ws_first_local_column(idx_loc_tile.row(), idx_loc_begin.col());
 
-    calcEvecsFromWeightVecAsync<D>(k, sz_gl_el.rows(), sz_gl_el.cols(),
+    calcEvecsFromWeightVecAsync<D>(k_row, k_col, sz_gl_el.rows(), sz_gl_el.cols(),
                                    z.read_sender(GlobalTileIndex(idx_gl_tile.row(), 0)),
                                    ws.read_sender(idx_ws_first_local_column),
                                    evecs.readwrite_sender(idx_loc_tile));
@@ -737,15 +723,15 @@ void formEvecsUsingWeightVec(GlobalTileIndex idx_gl_begin, LocalTileIndex idx_lo
 }
 
 // Sum of squares of columns of @p evecs into the first row of @p ws
-template <class T, Device D, class KSender>
+template <class T, Device D, class KRSender, class KCSender>
 void sumsqEvecs(GlobalTileIndex idx_gl_begin, LocalTileIndex idx_loc_begin, LocalTileSize sz_loc_tiles,
-                KSender&& k, Matrix<T, D>& evecs, Matrix<T, D>& ws) {
+                KRSender&& k_row, KCSender&& k_col, Matrix<T, D>& evecs, Matrix<T, D>& ws) {
   const matrix::Distribution& dist = evecs.distribution();
 
   for (auto idx_loc_tile : common::iterate_range2d(idx_loc_begin, sz_loc_tiles)) {
     auto idx_gl_tile = dist.globalTileIndex(idx_loc_tile);
     auto sz_gl_el = dist.globalTileElementDistance(idx_gl_begin, idx_gl_tile);
-    sumsqColsAsync<D>(k, sz_gl_el.rows(), sz_gl_el.cols(), evecs.read_sender(idx_loc_tile),
+    sumsqColsAsync<D>(k_row, k_col, sz_gl_el.rows(), sz_gl_el.cols(), evecs.read_sender(idx_loc_tile),
                       ws.readwrite_sender(idx_loc_tile));
 
     // skip the first local row
@@ -753,23 +739,23 @@ void sumsqEvecs(GlobalTileIndex idx_gl_begin, LocalTileIndex idx_loc_begin, Loca
       continue;
 
     LocalTileIndex idx_ws_first_row_tile(idx_loc_begin.row(), idx_loc_tile.col());
-    addFirstRowsAsync<D>(k, sz_gl_el.rows(), sz_gl_el.cols(), ws.read_sender(idx_loc_tile),
+    addFirstRowsAsync<D>(k_row, k_col, sz_gl_el.rows(), sz_gl_el.cols(), ws.read_sender(idx_loc_tile),
                          ws.readwrite_sender(idx_ws_first_row_tile));
   }
 }
 
 // Normalize column vectors
-template <class T, Device D, class KSender>
+template <class T, Device D, class KRSender, class KCSender>
 void normalizeEvecs(GlobalTileIndex idx_gl_begin, LocalTileIndex idx_loc_begin,
-                    LocalTileSize sz_loc_tiles, KSender&& k, Matrix<T, D>& ws,
-                    Matrix<T, D>& evecs) {
+                    LocalTileSize sz_loc_tiles, KRSender&& k_row, KCSender&& k_col,
+                    Matrix<T, D>& ws, Matrix<T, D>& evecs) {
   const matrix::Distribution& dist = evecs.distribution();
 
   for (auto idx_loc_tile : common::iterate_range2d(idx_loc_begin, sz_loc_tiles)) {
     auto idx_gl_tile = dist.globalTileIndex(idx_loc_tile);
     auto sz_gl_el = dist.globalTileElementDistance(idx_gl_begin, idx_gl_tile);
     LocalTileIndex idx_ws_first_local_row(idx_loc_begin.row(), idx_loc_tile.col());
-    divideColsByFirstRowAsync<D>(k, sz_gl_el.rows(), sz_gl_el.cols(),
+    divideColsByFirstRowAsync<D>(k_row, k_col, sz_gl_el.rows(), sz_gl_el.cols(),
                                  ws.read_sender(idx_ws_first_local_row),
                                  evecs.readwrite_sender(idx_loc_tile));
   }
@@ -845,6 +831,7 @@ void mergeSubproblems(SizeType i_begin, SizeType i_split, SizeType i_end, RhoSen
   //   - set deflated diagonal entries of `U` to 1 (temporary solution until optimized GEMM is implemented)
   //
   auto k = stablePartitionIndexForDeflation(i_begin, i_end, ws_h.c, ws_hm.i2, ws_h.i3) | ex::split();
+
   applyIndex(i_begin, i_end, ws_h.i3, ws_hm.evals, ws_h.dtmp);
   applyIndex(i_begin, i_end, ws_h.i3, ws_hm.z, ws_hm.ztmp);
   copy(idx_begin_tiles_vec, sz_tiles_vec, ws_h.dtmp, ws_hm.evals);
@@ -859,10 +846,10 @@ void mergeSubproblems(SizeType i_begin, SizeType i_split, SizeType i_end, RhoSen
   // ---
 
   // formEvecs(i_begin, i_end, k, evals, ws.ztmp, ws.mat2, ws.mat1);
-  initWeightVector(idx_gl_begin, idx_loc_begin, sz_loc_tiles, k, evals, ws.mat1, ws.mat2);
-  formEvecsUsingWeightVec(idx_gl_begin, idx_loc_begin, sz_loc_tiles, k, ws.ztmp, ws.mat2, ws.mat1);
-  sumsqEvecs(idx_gl_begin, idx_loc_begin, sz_loc_tiles, k, ws.mat1, ws.mat2);
-  normalizeEvecs(idx_gl_begin, idx_loc_begin, sz_loc_tiles, k, ws.mat2, ws.mat1);
+  initWeightVector(idx_gl_begin, idx_loc_begin, sz_loc_tiles, k, k, evals, evals, ws.mat1, ws.mat2);
+  formEvecsUsingWeightVec(idx_gl_begin, idx_loc_begin, sz_loc_tiles, k, k, ws.ztmp, ws.mat2, ws.mat1);
+  sumsqEvecs(idx_gl_begin, idx_loc_begin, sz_loc_tiles, k, k, ws.mat1, ws.mat2);
+  normalizeEvecs(idx_gl_begin, idx_loc_begin, sz_loc_tiles, k, k, ws.mat2, ws.mat1);
   setUnitDiag(i_begin, i_end, k, ws.mat1);
 
   // Step #3: Eigenvectors of the tridiagonal system: Q * U
@@ -953,10 +940,11 @@ void setUnitDiagDist(SizeType i_begin, SizeType i_end, KSender&& k, Matrix<T, D>
 
 // (All)Reduce-multiply row-wise the first local columns of the distributed matrix @p mat of size (n, n).
 // The local column on each rank is first offloaded to a local communication buffer @p comm_vec of size (n, 1).
-template <class T, Device D, class KSender>
+template <class T, Device D, class KRSender, class KCSender>
 void reduceMultiplyWeightVector(common::Pipeline<comm::Communicator>& row_task_chain, SizeType i_begin,
                                 SizeType i_end, LocalTileIndex idx_loc_begin, LocalTileSize sz_loc_tiles,
-                                KSender&& k, Matrix<T, D>& mat, Matrix<T, D>& comm_vec) {
+                                KRSender&& k_row, KCSender&& k_col, Matrix<T, D>& mat,
+                                Matrix<T, D>& comm_vec) {
   namespace ex = pika::execution::experimental;
   namespace di = dlaf::internal;
@@ -994,8 +982,8 @@ void reduceMultiplyWeightVector(common::Pipeline<comm::Communicator>& row_task_c
                      tile::laset(di::Policy<DefaultBackend_v<D>>()));
 
     // copy the first column of the matrix tile into the column tile of the buffer
-    copy1DAsync<D>(k, sz_subm.rows(), sz_subm.cols(), Coord::Col, mat.read_sender(idx_loc_tile),
-                   Coord::Col, comm_vec.readwrite_sender(idx_gl_comm));
+    copy1DAsync<D>(k_row, k_col, sz_subm.rows(), sz_subm.cols(), Coord::Col,
+                   mat.read_sender(idx_loc_tile), Coord::Col, comm_vec.readwrite_sender(idx_gl_comm));
 
     ex::start_detached(comm::scheduleAllReduceInPlace(ex::make_unique_any_sender(row_task_chain()),
                                                       MPI_PROD,
@@ -1003,17 +991,18 @@ void reduceMultiplyWeightVector(common::Pipeline<comm::Communicator>& row_task_c
                                                       ex::make_unique_any_sender(
                                                           comm_vec.readwrite_sender(idx_gl_comm))));
 
     // copy the column tile of the buffer into the first column of the matrix tile
-    copy1DAsync<D>(k, sz_subm.rows(), sz_subm.cols(), Coord::Col, comm_vec.read_sender(idx_gl_comm),
-                   Coord::Col, mat.readwrite_sender(idx_loc_tile));
+    copy1DAsync<D>(k_row, k_col, sz_subm.rows(), sz_subm.cols(), Coord::Col,
+                   comm_vec.read_sender(idx_gl_comm), Coord::Col, mat.readwrite_sender(idx_loc_tile));
   }
 }
 
 // (All)Reduce-sum column-wise the first local rows of the distributed matrix @p mat of size (n, n).
 // The local row on each rank is first offloaded to a local communication buffer @p comm_vec of size (n, 1).
-template <class T, Device D, class KSender>
+template <class T, Device D, class KRSender, class KCSender>
 void reduceSumScalingVector(common::Pipeline<comm::Communicator>& col_task_chain, SizeType i_begin,
                             SizeType i_end, LocalTileIndex idx_loc_begin, LocalTileSize sz_loc_tiles,
-                            KSender&& k, Matrix<T, D>& mat, Matrix<T, D>& comm_vec) {
+                            KRSender&& k_row, KCSender&& k_col, Matrix<T, D>& mat,
+                            Matrix<T, D>& comm_vec) {
   namespace ex = pika::execution::experimental;
   namespace di = dlaf::internal;
@@ -1049,8 +1038,8 @@ void reduceSumScalingVector(common::Pipeline<comm::Communicator>& col_task_chain
                      tile::set0(di::Policy<DefaultBackend_v<D>>()));
 
     // copy the first row of the matrix tile into the column tile of the buffer
-    copy1DAsync<D>(k, sz_subm.rows(), sz_subm.cols(), Coord::Row, mat.read_sender(idx_loc_tile),
-                   Coord::Col, comm_vec.readwrite_sender(idx_gl_comm));
+    copy1DAsync<D>(k_row, k_col, sz_subm.rows(), sz_subm.cols(), Coord::Row,
+                   mat.read_sender(idx_loc_tile), Coord::Col, comm_vec.readwrite_sender(idx_gl_comm));
 
     ex::start_detached(comm::scheduleAllReduceInPlace(ex::make_unique_any_sender(col_task_chain()),
                                                       MPI_SUM,
@@ -1058,8 +1047,8 @@ void reduceSumScalingVector(common::Pipeline<comm::Communicator>& col_task_chain
                                                       ex::make_unique_any_sender(
                                                           comm_vec.readwrite_sender(idx_gl_comm))));
 
     // copy the column tile of the buffer into the first column of the matrix tile
-    copy1DAsync<D>(k, sz_subm.rows(), sz_subm.cols(), Coord::Col, comm_vec.read_sender(idx_gl_comm),
-                   Coord::Row, mat.readwrite_sender(idx_loc_tile));
+    copy1DAsync<D>(k_row, k_col, sz_subm.rows(), sz_subm.cols(), Coord::Col,
+                   comm_vec.read_sender(idx_gl_comm), Coord::Row, mat.readwrite_sender(idx_loc_tile));
  }
 }
 
@@ -1068,19 +1057,21 @@ void solveRank1ProblemDist(SizeType i_begin, SizeType i_end, LocalTileIndex idx_
                            LocalTileSize sz_loc_tiles, KSender&& k, RhoSender&& rho,
                            Matrix<const T, Device::CPU>& d, Matrix<const T, Device::CPU>& z,
                            Matrix<T, Device::CPU>& evals, Matrix<T, Device::CPU>& delta,
-                           Matrix<T, Device::CPU>& evecs) {
+                           Matrix<SizeType, Device::CPU>& i2, Matrix<T, Device::CPU>& evecs) {
   namespace ex = pika::execution::experimental;
   namespace di = dlaf::internal;
 
   const matrix::Distribution& dist = evecs.distribution();
 
-  auto rank1_fn = [i_begin, idx_loc_begin, sz_loc_tiles,
+  auto rank1_fn = [i_begin, i_end, idx_loc_begin, sz_loc_tiles,
                    dist](const auto& k, const auto& rho, const auto& d_sfut_tile_arr,
                          const auto& z_sfut_tile_arr, const auto& eval_tiles, const auto& delta_tile_arr,
-                         const auto& evec_tile_arr) {
+                         const auto& i2_tile_arr, const auto& evec_tile_arr) {
+    const SizeType n = problemSize(i_begin, i_end, dist);
     const T* d_ptr = d_sfut_tile_arr[0].get().ptr();
     const T* z_ptr = z_sfut_tile_arr[0].get().ptr();
     T* eval_ptr = eval_tiles[0].ptr();
     T* delta_ptr = delta_tile_arr[0].ptr();
+    SizeType* i2_ptr = i2_tile_arr[0].ptr();
 
     // Iterate over the columns of the local submatrix tile grid
     for (SizeType j_loc_subm_tile = 0; j_loc_subm_tile < sz_loc_tiles.cols(); ++j_loc_subm_tile) {
@@ -1094,7 +1085,7 @@ void solveRank1ProblemDist(SizeType i_begin, SizeType i_end, LocalTileIndex idx_
 
       // Skip columns that are in the deflation zone
       if (j_gl_subm_el >= k)
-        return;
+        break;
 
       // Iterate over the elements of the column tile
       SizeType ncols = std::min(dist.tileSize<Coord::Col>(j_gl_tile), k - j_gl_subm_el);
@@ -1119,18 +1110,37 @@ void solveRank1ProblemDist(SizeType i_begin, SizeType i_end, LocalTileIndex idx_
         // global matrix tile grid
         SizeType i_gl_subm_el = dist.globalTileElementDistance<Coord::Row>(i_begin, i_gl_tile);
 
-        // Skip rows that are in the deflation zone
-        if (i_gl_subm_el >= k)
-          break;
-
         // The tile row in the global submatrix tile grid
-        SizeType i_subm_delta_arr = dist.globalTileFromLocalTile<Coord::Row>(i_loc_tile) - i_begin;
-        auto& delta_tile = delta_tile_arr[to_sizet(i_subm_delta_arr)];
+        SizeType i_subm_i2_arr = dist.globalTileFromLocalTile<Coord::Row>(i_loc_tile) - i_begin;
+        auto& i2_tile = i2_tile_arr[to_sizet(i_subm_i2_arr)];
+
+        SizeType nrows = std::min(dist.tileSize<Coord::Row>(i_gl_tile), n - i_gl_subm_el);
+        for (int i = 0; i < nrows; ++i) {
+          SizeType ii = i2_tile({i, 0});
+          if (ii < k)
+            evec_tile({i, j_tile_el}) = delta_ptr[ii];
+        }
+      }
+    }
 
-        // Copy from delta buffer into local matrix tile
-        SizeType nrows = std::min(dist.tileSize<Coord::Row>(i_gl_tile), k - i_gl_subm_el);
-        tile::lacpy(TileElementSize(nrows, 1), TileElementIndex(0, 0), delta_tile,
-                    TileElementIndex(0, j_tile_el), evec_tile);
+    // Fill ones for deflated Eigenvectors.
+    // Quick return if there is none.
+    if (n == k)
+      return;
+
+    GlobalElementIndex origin{i_begin * dist.blockSize().rows(), i_begin * dist.blockSize().cols()};
+    for (SizeType i = 0; i < n; ++i) {
+      SizeType j = i2_ptr[i];
+      if (j >= k) {
+        GlobalElementIndex i_g{i + origin.row(), j + origin.col()};
+        GlobalTileIndex i_tile = dist.globalTileIndex(i_g);
+        if (dist.rankIndex() == dist.rankGlobalTile(i_tile)) {
+          LocalTileIndex i_tile_l = dist.localTileIndex(i_tile);
+          SizeType i_subm_evec_arr = i_tile_l.row() - idx_loc_begin.row() +
+                                     (i_tile_l.col() - idx_loc_begin.col()) * sz_loc_tiles.rows();
+          TileElementIndex i_el = dist.tileElementIndex(i_g);
+          evec_tile_arr[to_sizet(i_subm_evec_arr)](i_el) = T{1};
         }
       }
     }
@@ -1142,7 +1152,7 @@ void solveRank1ProblemDist(SizeType i_begin, SizeType i_end, LocalTileIndex idx_
   auto sender =
       ex::when_all(std::forward<KSender>(k), std::forward<RhoSender>(rho),
                    ex::when_all_vector(tc.read(d)), ex::when_all_vector(tc.read(z)),
                    ex::when_all_vector(tc.readwrite(evals)), ex::when_all_vector(tc.readwrite(delta)),
-                   ex::when_all_vector(tc.readwrite(evecs)));
+                   ex::when_all_vector(tc.readwrite(i2)), ex::when_all_vector(tc.readwrite(evecs)));
 
   ex::start_detached(di::transform(di::Policy<Backend::MC>(), std::move(rank1_fn), std::move(sender)));
 }
@@ -1177,8 +1187,8 @@ void mergeDistSubproblems(comm::CommunicatorGrid grid,
                           common::Pipeline<comm::Communicator>& full_task_chain,
                           common::Pipeline<comm::Communicator>& row_task_chain,
                           common::Pipeline<comm::Communicator>& col_task_chain, SizeType i_begin,
-                          SizeType i_split, SizeType i_end, RhoSender&& rho, DistWorkSpace<T, D>& ws,
-                          DistWorkSpaceHost<T>& ws_h, DistWorkSpaceHostMirror<T, D>& ws_hm,
+                          SizeType i_split, SizeType i_end, RhoSender&& rho, WorkSpace<T, D>& ws,
+                          WorkSpaceHost<T>& ws_h, DistWorkSpaceHostMirror<T, D>& ws_hm,
                           Matrix<T, D>& evals, Matrix<T, D>& evecs) {
   namespace ex = pika::execution::experimental;
@@ -1220,10 +1230,10 @@ void mergeDistSubproblems(comm::CommunicatorGrid grid,
   // - apply Givens rotations to `Q` - `evecs`
   //
   initIndex(i_begin, i_end, ws_h.i1);
-  sortIndex(i_begin, i_end, ex::just(n1), ws_hm.evals, ws_h.i1, ws_h.i2);
+  sortIndex(i_begin, i_end, ex::just(n1), ws_hm.evals, ws_h.i1, ws_hm.i2);
 
   auto rots =
-      applyDeflation(i_begin, i_end, scaled_rho, std::move(tol), ws_h.i2, ws_hm.evals, ws_hm.z, ws_h.c);
+      applyDeflation(i_begin, i_end, scaled_rho, std::move(tol), ws_hm.i2, ws_hm.evals, ws_hm.z, ws_h.c);
 
   // ---
 
@@ -1244,52 +1254,52 @@ void mergeDistSubproblems(comm::CommunicatorGrid grid,
   // - solve the rank-1 problem and save eigenvalues in `dtmp` and eigenvectors in `mat1`.
   // - set deflated diagonal entries of `U` to 1 (temporary solution until optimized GEMM is implemented)
   //
-  auto k = stablePartitionIndexForDeflation(i_begin, i_end, ws_h.c, ws_h.i2, ws_h.i3) | ex::split();
+  auto k = stablePartitionIndexForDeflation(i_begin, i_end, ws_h.c, ws_hm.i2, ws_h.i3) | ex::split();
   applyIndex(i_begin, i_end, ws_h.i3, ws_hm.evals, ws_h.dtmp);
   applyIndex(i_begin, i_end, ws_h.i3, ws_hm.z, ws_hm.ztmp);
   copy(idx_begin_tiles_vec, sz_tiles_vec, ws_h.dtmp, ws_hm.evals);
 
+  //
+  //   i3 (in)  : initial <--- deflated
+  //   i2 (out) : initial ---> deflated
+  //
+  invertIndex(i_begin, i_end, ws_h.i3, ws_hm.i2);
+
   // Note: here ws_hm.z is used as a contiguous buffer for the laed4 call
   matrix::util::set0(pika::execution::thread_priority::normal, idx_loc_begin, sz_loc_tiles,
                      ws_hm.mat1);
   solveRank1ProblemDist(i_begin, i_end, idx_loc_begin, sz_loc_tiles, k, std::move(scaled_rho),
-                        ws_hm.evals, ws_hm.ztmp, ws_h.dtmp, ws_hm.z, ws_hm.mat1);
+                        ws_hm.evals, ws_hm.ztmp, ws_h.dtmp, ws_hm.z, ws_hm.i2, ws_hm.mat1);
   copy(idx_loc_begin, sz_loc_tiles, ws_hm.mat1, ws.mat1);
   copy(idx_begin_tiles_vec, sz_tiles_vec, ws_hm.ztmp, ws.ztmp);
   copy(idx_begin_tiles_vec, sz_tiles_vec, ws_hm.evals, evals);
+  copy(idx_begin_tiles_vec, sz_tiles_vec, ws_hm.i2, ws.i2);
 
   // ---
 
   assembleDistEvalsVec(row_task_chain, i_begin, i_end, dist_evecs, ws_h.dtmp);
 
+  auto n = ex::just(problemSize(i_begin, i_end, evecs.distribution()));
   // Eigenvector formation: `ws.mat1` stores the eigenvectors, `ws.mat2` is used as an additional workspace
-  initWeightVector(idx_gl_begin, idx_loc_begin, sz_loc_tiles, k, evals, ws.mat1, ws.mat2);
-  reduceMultiplyWeightVector(row_task_chain, i_begin, i_end, idx_loc_begin, sz_loc_tiles, k, ws.mat2,
+  // Note ws.z is used as contiguous buffer workspace to store permuted values of evals and ws.ztmp
+  applyIndex(i_begin, i_end, ws.i2, evals, ws.z);
+  initWeightVector(idx_gl_begin, idx_loc_begin, sz_loc_tiles, n, k, evals, ws.z, ws.mat1, ws.mat2);
+  reduceMultiplyWeightVector(row_task_chain, i_begin, i_end, idx_loc_begin, sz_loc_tiles, n, k, ws.mat2,
                              ws.z);
-  formEvecsUsingWeightVec(idx_gl_begin, idx_loc_begin, sz_loc_tiles, k, ws.ztmp, ws.mat2, ws.mat1);
-  sumsqEvecs(idx_gl_begin, idx_loc_begin, sz_loc_tiles, k, ws.mat1, ws.mat2);
-  reduceSumScalingVector(col_task_chain, i_begin, i_end, idx_loc_begin, sz_loc_tiles, k, ws.mat2, ws.z);
-  normalizeEvecs(idx_gl_begin, idx_loc_begin, sz_loc_tiles, k, ws.mat2, ws.mat1);
-  setUnitDiagDist(i_begin, i_end, k, ws.mat1);
+  applyIndex(i_begin, i_end, ws.i2, ws.ztmp, ws.z);
+  formEvecsUsingWeightVec(idx_gl_begin, idx_loc_begin, sz_loc_tiles, n, k, ws.z, ws.mat2, ws.mat1);
+  sumsqEvecs(idx_gl_begin, idx_loc_begin, sz_loc_tiles, n, k, ws.mat1, ws.mat2);
+  reduceSumScalingVector(col_task_chain, i_begin, i_end, idx_loc_begin, sz_loc_tiles, n, k, ws.mat2,
+                         ws.z);
+  normalizeEvecs(idx_gl_begin, idx_loc_begin, sz_loc_tiles, n, k, ws.mat2, ws.mat1);
 
   // Step #3: Eigenvectors of the tridiagonal system: Q * U
-  //
-  //   i3 (in)  : initial <--- deflated
-  //   i2 (out) : initial ---> deflated
-  //
   // The eigenvectors resulting from the multiplication are already in the order of the eigenvalues as
   // prepared for the deflated system.
   //
-  invertIndex(i_begin, i_end, ws_h.i3, ws_h.i2);
-
-  copy(idx_loc_begin, sz_loc_tiles, ws.mat1, ws_hm.mat1);
-  dlaf::permutations::permute<Backend::MC, Device::CPU, T, Coord::Row>(grid, col_task_chain, i_begin,
-                                                                       i_end, ws_h.i2, ws_hm.mat1,
-                                                                       ws_hm.mat2);
-  copy(idx_loc_begin, sz_loc_tiles, ws_hm.mat2, ws.mat2);
   dlaf::multiplication::generalSubMatrix<B, D, T>(grid, row_task_chain, col_task_chain, i_begin, i_end,
-                                                  T(1), evecs, ws.mat2, T(0), ws.mat1);
+                                                  T(1), evecs, ws.mat1, T(0), ws.mat2);
 
   // Step #4: Final sorting of eigenvalues and eigenvectors
   //
@@ -1300,12 +1310,12 @@ void mergeDistSubproblems(comm::CommunicatorGrid grid,
   //   ascending order
   // - reorder columns in `evecs` using `i2` such that eigenvectors match eigenvalues
   //
-  sortIndex(i_begin, i_end, std::move(k), ws_h.dtmp, ws_h.i1, ws_h.i2);
-  applyIndex(i_begin, i_end, ws_h.i2, ws_h.dtmp, ws_hm.evals);
+  sortIndex(i_begin, i_end, std::move(k), ws_h.dtmp, ws_h.i1, ws_hm.i2);
+  applyIndex(i_begin, i_end, ws_hm.i2, ws_h.dtmp, ws_hm.evals);
 
-  copy(idx_loc_begin, sz_loc_tiles, ws.mat1, ws_hm.mat1);
+  copy(idx_loc_begin, sz_loc_tiles, ws.mat2, ws_hm.mat2);
   dlaf::permutations::permute<Backend::MC, Device::CPU, T, Coord::Col>(grid, row_task_chain, i_begin,
-                                                                       i_end, ws_h.i2, ws_hm.mat1,
+                                                                       i_end, ws_hm.i2, ws_hm.mat2,
                                                                        ws_hm.evecs);
   copy(idx_loc_begin, sz_loc_tiles, ws_hm.evecs, evecs);
 }
diff --git a/src/eigensolver/tridiag_solver/kernels.cpp b/src/eigensolver/tridiag_solver/kernels.cpp
index 6bc0d006c5..98c5274775 100644
--- a/src/eigensolver/tridiag_solver/kernels.cpp
+++ b/src/eigensolver/tridiag_solver/kernels.cpp
@@ -93,16 +93,17 @@ void initIndexTile(SizeType offset, const matrix::Tile<SizeType, Device::CPU>& t
 }
 
 template <class T>
-void divideEvecsByDiagonal(const SizeType& k, const SizeType& i_subm_el, const SizeType& j_subm_el,
+void divideEvecsByDiagonal(const SizeType& k_row, const SizeType& k_col, const SizeType& i_subm_el,
+                           const SizeType& j_subm_el,
                            const matrix::Tile<const T, Device::CPU>& diag_rows,
                            const matrix::Tile<const T, Device::CPU>& diag_cols,
                            const matrix::Tile<const T, Device::CPU>& evecs_tile,
                           const matrix::Tile<T, Device::CPU>& ws_tile) {
-  if (i_subm_el >= k || j_subm_el >= k)
+  if (i_subm_el >= k_row || j_subm_el >= k_col)
     return;
 
-  SizeType nrows = std::min(k - i_subm_el, evecs_tile.size().rows());
-  SizeType ncols = std::min(k - j_subm_el, evecs_tile.size().cols());
+  SizeType nrows = std::min(k_row - i_subm_el, evecs_tile.size().rows());
+  SizeType ncols = std::min(k_col - j_subm_el, evecs_tile.size().cols());
 
   for (SizeType j = 0; j < ncols; ++j) {
     for (SizeType i = 0; i < nrows; ++i) {
@@ -122,13 +123,13 @@ DLAF_CPU_DIVIDE_EVECS_BY_DIAGONAL_ETI(, float);
 DLAF_CPU_DIVIDE_EVECS_BY_DIAGONAL_ETI(, double);
 
 template <class T>
-void multiplyFirstColumns(const SizeType& k, const SizeType& row, const SizeType& col,
-                          const matrix::Tile<const T, Device::CPU>& in,
+void multiplyFirstColumns(const SizeType& k_row, const SizeType& k_col, const SizeType& row,
+                          const SizeType& col, const matrix::Tile<const T, Device::CPU>& in,
                           const matrix::Tile<T, Device::CPU>& out) {
-  if (row >= k || col >= k)
+  if (row >= k_row || col >= k_col)
     return;
 
-  SizeType nrows = std::min(k - row, in.size().rows());
+  SizeType nrows = std::min(k_row - row, in.size().rows());
 
   for (SizeType i = 0; i < nrows; ++i) {
     TileElementIndex idx(i, 0);
@@ -140,22 +141,25 @@ DLAF_CPU_MULTIPLY_FIRST_COLUMNS_ETI(, float);
 DLAF_CPU_MULTIPLY_FIRST_COLUMNS_ETI(, double);
 
 template <class T>
-void calcEvecsFromWeightVec(const SizeType& k, const SizeType& row, const SizeType& col,
-                            const matrix::Tile<const T, Device::CPU>& z_tile,
+void calcEvecsFromWeightVec(const SizeType& k_row, const SizeType& k_col, const SizeType& row,
+                            const SizeType& col, const matrix::Tile<const T, Device::CPU>& z_tile,
                             const matrix::Tile<const T, Device::CPU>& ws_tile,
                             const matrix::Tile<T, Device::CPU>& evecs_tile) {
-  if (row >= k || col >= k)
+  if (row >= k_row || col >= k_col)
     return;
 
-  SizeType nrows = std::min(k - row, evecs_tile.size().rows());
-  SizeType ncols = std::min(k - col, evecs_tile.size().cols());
+  SizeType nrows = std::min(k_row - row, evecs_tile.size().rows());
+  SizeType ncols = std::min(k_col - col, evecs_tile.size().cols());
 
   for (SizeType j = 0; j < ncols; ++j) {
     for (SizeType i = 0; i < nrows; ++i) {
       T ws_el = ws_tile(TileElementIndex(i, 0));
       T z_el = z_tile(TileElementIndex(i, 0));
       T& el_evec = evecs_tile(TileElementIndex(i, j));
-      el_evec = std::copysign(std::sqrt(std::abs(ws_el)), z_el) / el_evec;
+      // Avoid NaN generated by deflated vectors.
+      // It is a temporary workaround until deflated vectors are moved at the end of the local matrix.
+      if (el_evec != 0)
+        el_evec = std::copysign(std::sqrt(std::abs(ws_el)), z_el) / el_evec;
     }
   }
 }
@@ -164,14 +168,14 @@ DLAF_CPU_CALC_EVECS_FROM_WEIGHT_VEC_ETI(, float);
 DLAF_CPU_CALC_EVECS_FROM_WEIGHT_VEC_ETI(, double);
 
 template <class T>
-void sumsqCols(const SizeType& k, const SizeType& row, const SizeType& col,
+void sumsqCols(const SizeType& k_row, const SizeType& k_col, const SizeType& row, const SizeType& col,
               const matrix::Tile<const T, Device::CPU>& evecs_tile,
               const matrix::Tile<T, Device::CPU>& ws_tile) {
-  if (row >= k || col >= k)
+  if (row >= k_row || col >= k_col)
     return;
 
-  SizeType nrows = std::min(k - row, evecs_tile.size().rows());
-  SizeType ncols = std::min(k - col, evecs_tile.size().cols());
+  SizeType nrows = std::min(k_row - row, evecs_tile.size().rows());
+  SizeType ncols = std::min(k_col - col, evecs_tile.size().cols());
 
   for (SizeType j = 0; j < ncols; ++j) {
     T loc_norm = 0;
@@ -187,13 +191,13 @@ DLAF_CPU_SUMSQ_COLS_ETI(, float);
 DLAF_CPU_SUMSQ_COLS_ETI(, double);
 
 template <class T>
-void addFirstRows(const SizeType& k, const SizeType& row, const SizeType& col,
+void addFirstRows(const SizeType& k_row, const SizeType& k_col, const SizeType& row, const SizeType& col,
                   const matrix::Tile<const T, Device::CPU>& in,
                   const matrix::Tile<T, Device::CPU>& out) {
-  if (row >= k || col >= k)
+  if (row >= k_row || col >= k_col)
     return;
 
-  SizeType ncols = std::min(k - col, in.size().cols());
+  SizeType ncols = std::min(k_col - col, in.size().cols());
 
   for (SizeType j = 0; j < ncols; ++j) {
     out(TileElementIndex(0, j)) += in(TileElementIndex(0, j));
@@ -204,14 +208,14 @@ DLAF_CPU_ADD_FIRST_ROWS_ETI(, float);
 DLAF_CPU_ADD_FIRST_ROWS_ETI(, double);
 
 template <class T>
-void divideColsByFirstRow(const SizeType& k, const SizeType& row, const SizeType& col,
-                          const matrix::Tile<const T, Device::CPU>& in,
+void divideColsByFirstRow(const SizeType& k_row, const SizeType& k_col, const SizeType& row,
+                          const SizeType& col, const matrix::Tile<const T, Device::CPU>& in,
                           const matrix::Tile<T, Device::CPU>& out) {
-  if (row >= k || col >= k)
+  if (row >= k_row || col >= k_col)
     return;
 
-  SizeType nrows = std::min(k - row, out.size().rows());
-  SizeType ncols = std::min(k - col, out.size().cols());
+  SizeType nrows = std::min(k_row - row, out.size().rows());
+  SizeType ncols = std::min(k_col - col, out.size().cols());
 
   for (SizeType j = 0; j < ncols; ++j) {
     for (SizeType i = 0; i < nrows; ++i) {
@@ -242,10 +246,10 @@ DLAF_CPU_SET_UNIT_DIAGONAL_ETI(, float);
 DLAF_CPU_SET_UNIT_DIAGONAL_ETI(, double);
 
 template <class T>
-void copy1D(const SizeType& k, const SizeType& row, const SizeType& col, const Coord& in_coord,
-            const matrix::Tile<const T, Device::CPU>& in_tile, const Coord& out_coord,
-            const matrix::Tile<T, Device::CPU>& out_tile) {
-  if (row >= k || col >= k)
+void copy1D(const SizeType& k_row, const SizeType& k_col, const SizeType& row, const SizeType& col,
+            const Coord& in_coord, const matrix::Tile<const T, Device::CPU>& in_tile,
+            const Coord& out_coord, const matrix::Tile<T, Device::CPU>& out_tile) {
+  if (row >= k_row || col >= k_col)
     return;
 
   const T* in_ptr = in_tile.ptr();
@@ -255,12 +259,12 @@ void copy1D(const SizeType& k, const SizeType& row, const SizeType& col, const C
   SizeType out_ld = (out_coord == Coord::Col) ? 1 : out_tile.ld();
 
   // if `in_tile` is the column buffer
-  SizeType len = (out_coord == Coord::Col) ? std::min(out_tile.size().rows(), k - row)
-                                           : std::min(out_tile.size().cols(), k - col);
+  SizeType len = (out_coord == Coord::Col) ? std::min(out_tile.size().rows(), k_row - row)
+                                           : std::min(out_tile.size().cols(), k_col - col);
   // if out_tile is the column buffer
   if (out_tile.size().cols() == 1) {
-    len = (in_coord == Coord::Col) ? std::min(in_tile.size().rows(), k - row)
-                                   : std::min(in_tile.size().cols(), k - col);
+    len = (in_coord == Coord::Col) ? std::min(in_tile.size().rows(), k_row - row)
+                                   : std::min(in_tile.size().cols(), k_col - col);
   }
 
   blas::copy(len, in_ptr, in_ld, out_ptr, out_ld);
diff --git a/src/eigensolver/tridiag_solver/kernels.cu b/src/eigensolver/tridiag_solver/kernels.cu
index 5f3d3c0004..bd8a11a396 100644
--- a/src/eigensolver/tridiag_solver/kernels.cu
+++ b/src/eigensolver/tridiag_solver/kernels.cu
@@ -227,16 +227,17 @@ struct Row2ColMajor {
 };
 
 template <class T>
-void divideEvecsByDiagonal(const SizeType& k, const SizeType& i_subm_el, const SizeType& j_subm_el,
+void divideEvecsByDiagonal(const SizeType& k_row, const SizeType& k_col, const SizeType& i_subm_el,
+                           const SizeType& j_subm_el,
                            const matrix::Tile<const T, Device::GPU>& diag_rows,
                            const matrix::Tile<const T, Device::GPU>& diag_cols,
                           const matrix::Tile<const T, Device::GPU>& evecs_tile,
                           const matrix::Tile<T, Device::GPU>& ws_tile, whip::stream_t stream) {
-  if (i_subm_el >= k || j_subm_el >= k)
+  if (i_subm_el >= k_row || j_subm_el >= k_col)
     return;
 
-  SizeType nrows = std::min(k - i_subm_el, evecs_tile.size().rows());
-  SizeType ncols = std::min(k - j_subm_el, evecs_tile.size().cols());
+  SizeType nrows = std::min(k_row - i_subm_el, evecs_tile.size().rows());
+  SizeType ncols = std::min(k_col - j_subm_el, evecs_tile.size().cols());
 
   SizeType ld = evecs_tile.ld();
   const T* d_rows = diag_rows.ptr();
@@ -300,13 +301,13 @@ __global__ void multiplyColumns(SizeType len, const T* in, T* out) {
 }
 
 template <class T>
-void multiplyFirstColumns(const SizeType& k, const SizeType& row, const SizeType& col,
-                          const matrix::Tile<const T, Device::GPU>& in,
+void multiplyFirstColumns(const SizeType& k_row, const SizeType& k_col, const SizeType& row,
+                          const SizeType& col, const matrix::Tile<const T, Device::GPU>& in,
                           const matrix::Tile<T, Device::GPU>& out, whip::stream_t stream) {
-  if (row >= k || col >= k)
+  if (row >= k_row || col >= k_col)
     return;
 
-  SizeType nrows = std::min(k - row, in.size().rows());
+  SizeType nrows = std::min(k_row - row, in.size().rows());
 
   const T* in_ptr = in.ptr();
   T* out_ptr = out.ptr();
@@ -334,24 +335,26 @@ __global__ void calcEvecsFromWeightVec(SizeType nrows, SizeType ncols, SizeType
     T z_el = rank1_vec[i];
     T& el_evec = evecs[i + j * ld];
 
-    if constexpr (std::is_same<T, float>::value) {
-      el_evec = copysignf(sqrtf(fabsf(ws_el)), z_el) / el_evec;
-    }
-    else {
-      el_evec = copysign(sqrt(fabs(ws_el)), z_el) / el_evec;
+    if (el_evec != 0) {
+      if constexpr (std::is_same<T, float>::value) {
+        el_evec = copysignf(sqrtf(fabsf(ws_el)), z_el) / el_evec;
+      }
+      else {
+        el_evec = copysign(sqrt(fabs(ws_el)), z_el) / el_evec;
+      }
     }
 }
 
 template <class T>
-void calcEvecsFromWeightVec(const SizeType& k, const SizeType& row, const SizeType& col,
-                            const matrix::Tile<const T, Device::GPU>& z_tile,
+void calcEvecsFromWeightVec(const SizeType& k_row, const SizeType& k_col, const SizeType& row,
+                            const SizeType& col, const matrix::Tile<const T, Device::GPU>& z_tile,
                             const matrix::Tile<const T, Device::GPU>& ws_tile,
                             const matrix::Tile<T, Device::GPU>& evecs_tile, whip::stream_t stream) {
-  if (row >= k || col >= k)
+  if (row >= k_row || col >= k_col)
     return;
 
-  SizeType nrows = std::min(k - row, evecs_tile.size().rows());
-  SizeType ncols = std::min(k - col, evecs_tile.size().cols());
+  SizeType nrows = std::min(k_row - row, evecs_tile.size().rows());
+  SizeType ncols = std::min(k_col - col, evecs_tile.size().cols());
 
   SizeType ld = evecs_tile.ld();
   const T* rank1_vec = z_tile.ptr();
@@ -385,14 +388,14 @@ __global__ void sqTile(SizeType nrows, SizeType ncols, SizeType ld, const T* in,
 }
 
 template <class T>
-void sumsqCols(const SizeType& k, const SizeType& row, const SizeType& col,
+void sumsqCols(const SizeType& k_row, const SizeType& k_col, const SizeType& row, const SizeType& col,
               const matrix::Tile<const T, Device::GPU>& evecs_tile,
               const matrix::Tile<T, Device::GPU>& ws_tile, whip::stream_t stream) {
-  if (row >= k || col >= k)
+  if (row >= k_row || col >= k_col)
     return;
 
-  SizeType nrows = std::min(k - row, evecs_tile.size().rows());
-  SizeType ncols = std::min(k - col, evecs_tile.size().cols());
+  SizeType nrows = std::min(k_row - row, evecs_tile.size().rows());
+  SizeType ncols = std::min(k_col - col, evecs_tile.size().cols());
 
   SizeType ld = evecs_tile.ld();
   const T* in = evecs_tile.ptr();
@@ -441,13 +444,13 @@ __global__ void addFirstRows(SizeType len, SizeType ld, const T* in, T* out) {
 }
 
 template <class T>
-void addFirstRows(const SizeType& k, const SizeType& row, const SizeType& col,
+void addFirstRows(const SizeType& k_row, const SizeType& k_col, const SizeType& row, const SizeType& col,
                   const matrix::Tile<const T, Device::GPU>& in,
                   const matrix::Tile<T, Device::GPU>& out, whip::stream_t stream) {
-  if (row >= k || col >= k)
+  if (row >= k_row || col >= k_col)
     return;
 
-  SizeType ncols = std::min(k - col, in.size().cols());
+  SizeType ncols = std::min(k_col - col, in.size().cols());
 
   SizeType ld = in.ld();
   const T* in_ptr = in.ptr();
@@ -484,14 +487,14 @@ __global__ void scaleTileWithRow(SizeType nrows, SizeType ncols, SizeType in_ld,
 }
 
 template <class T>
-void divideColsByFirstRow(const SizeType& k, const SizeType& row, const SizeType& col,
-                          const matrix::Tile<const T, Device::GPU>& in,
+void divideColsByFirstRow(const SizeType& k_row, const SizeType& k_col, const SizeType& row,
+                          const SizeType& col, const matrix::Tile<const T, Device::GPU>& in,
                           const matrix::Tile<T, Device::GPU>& out, whip::stream_t stream) {
-  if (row >= k || col >= k)
+  if (row >= k_row || col >= k_col)
     return;
 
-  SizeType nrows = std::min(k - row, out.size().rows());
-  SizeType ncols = std::min(k - col, out.size().cols());
+  SizeType nrows = std::min(k_row - row, out.size().rows());
+  SizeType ncols = std::min(k_col - col, out.size().cols());
 
   SizeType in_ld = in.ld();
   const T* in_ptr = in.ptr();
@@ -543,10 +546,11 @@ DLAF_GPU_SET_UNIT_DIAGONAL_ETI(, double);
 
 // Reference to CUBLAS 1D copy(): https://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-copy
 template <class T>
-void copy1D(cublasHandle_t handle, const SizeType& k, const SizeType& row, const SizeType& col,
-            const Coord& in_coord, const matrix::Tile<const T, Device::GPU>& in_tile,
-            const Coord& out_coord, const matrix::Tile<T, Device::GPU>& out_tile) {
-  if (row >= k || col >= k)
+void copy1D(cublasHandle_t handle, const SizeType& k_row, const SizeType& k_col, const SizeType& row,
+            const SizeType& col, const Coord& in_coord,
+            const matrix::Tile<const T, Device::GPU>& in_tile, const Coord& out_coord,
+            const matrix::Tile<T, Device::GPU>& out_tile) {
+  if (row >= k_row || col >= k_col)
     return;
 
   const T* in_ptr = in_tile.ptr();
@@ -556,12 +560,12 @@ void copy1D(cublasHandle_t handle, const SizeType& k, const SizeType& row, const
   int out_ld = (out_coord == Coord::Col) ? 1 : to_int(out_tile.ld());
 
   // if `in_tile` is the column buffer
-  SizeType len = (out_coord == Coord::Col) ? std::min(out_tile.size().rows(), k - row)
-                                           : std::min(out_tile.size().cols(), k - col);
+  SizeType len = (out_coord == Coord::Col) ? std::min(out_tile.size().rows(), k_row - row)
+                                           : std::min(out_tile.size().cols(), k_col - col);
   // if out_tile is the column buffer
   if (out_tile.size().cols() == 1) {
-    len = (in_coord == Coord::Col) ? std::min(in_tile.size().rows(), k - row)
-                                   : std::min(in_tile.size().cols(), k - col);
+    len = (in_coord == Coord::Col) ? std::min(in_tile.size().rows(), k_row - row)
+                                   : std::min(in_tile.size().cols(), k_col - col);
   }
 
   if constexpr (std::is_same<T, float>::value) {
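
For context on the `ensure_started` change in `cuppensDecomposition`: `ex::split` only makes a sender shareable by several consumers, the wrapped work itself still starts lazily when some consumer eventually connects to it. Wrapping the sender in `ex::ensure_started` submits the work eagerly, exactly once, before it is shared. A minimal sketch of the pattern with plain pika senders (the computation and the consumer lambdas are placeholders, not DLA-Future code; to be run inside an initialized pika runtime):

#include <pika/execution.hpp>

#include <utility>

namespace ex = pika::execution::experimental;

void cuppens_like_pattern() {
  // A stand-in for one Cuppen decomposition step expressed as a sender.
  auto work = ex::just(41) | ex::then([](int x) { return x + 1; });

  // ensure_started(): the work is submitted eagerly, exactly once.
  // split(): several consumers can then observe the single shared result.
  auto shared = ex::split(ex::ensure_started(std::move(work)));

  // Two independent continuations of the same eagerly started result.
  ex::start_detached(shared | ex::then([](int) { /* first consumer */ }));
  ex::start_detached(std::move(shared) | ex::then([](int) { /* second consumer */ }));
}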
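The recurring `k` to `k_row`/`k_col` edit in the tile kernels follows a single clamping convention: a tile whose top-left element sits at offset (row, col) of the merged submatrix only processes the part that falls inside the non-deflated block, which is now a `k_row` x `k_col` rectangle instead of a `k` x `k` square. A standalone restatement of that convention (hypothetical helper, not part of DLA-Future):

#include <algorithm>
#include <cstdint>

using SizeType = std::int64_t;

// Clamp a tile of size (tile_rows, tile_cols) located at submatrix offset
// (row, col) against the non-deflated block of size (k_row, k_col).
// Returns false when the tile lies entirely in the deflated zone.
bool clampToNonDeflated(SizeType k_row, SizeType k_col, SizeType row, SizeType col,
                        SizeType tile_rows, SizeType tile_cols,
                        SizeType& nrows, SizeType& ncols) {
  if (row >= k_row || col >= k_col)
    return false;
  nrows = std::min(k_row - row, tile_rows);
  ncols = std::min(k_col - col, tile_cols);
  return true;
}

Every kernel changed in kernels.cpp and kernels.cu above opens with exactly this guard and clamp before touching tile elements.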
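The new `i2`-driven fill-in in `solveRank1ProblemDist` writes the `k` non-deflated eigenvector columns through the permutation and gives every deflated entry a unit-vector column. This is also why `calcEvecsFromWeightVec` now skips zero entries instead of dividing: the division would turn the untouched zeros of deflated columns into NaNs. A simplified sketch of the idea on a dense column-major matrix instead of distributed tiles (hypothetical single-rank function, not the DLA-Future implementation):

#include <cstdint>
#include <vector>

using SizeType = std::int64_t;

// n x n zero-initialized column-major matrix; k non-deflated eigenpairs.
// i2 maps the initial ordering to the deflated ordering; i2[i] >= k marks a
// deflated entry, whose eigenvector is the unit vector e_i.
void fillDeflatedColumns(SizeType n, SizeType k, const std::vector<SizeType>& i2,
                         std::vector<double>& evecs) {
  for (SizeType i = 0; i < n; ++i) {
    SizeType j = i2[i];
    if (j >= k)
      evecs[static_cast<std::size_t>(i + j * n)] = 1.0;  // column j becomes e_i
  }
}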