From d9e207254e37c3ba88d42e1425761d3ea3c0aab9 Mon Sep 17 00:00:00 2001 From: Mitsuaki Kawamura Date: Fri, 19 Apr 2024 14:04:18 +0900 Subject: [PATCH] Fix unstable behaviorin LOBPCG: The resulting vectors of subspace diagonalization should be the same across processes. This caused error in Fugaku with SVE. Also fix typo of overlap --- src/CalcByLOBPCG.c | 39 +++++++++++++++++++---------------- src/include/wrapperMPI.h | 3 +++ src/wrapperMPI.c | 44 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 68 insertions(+), 18 deletions(-) diff --git a/src/CalcByLOBPCG.c b/src/CalcByLOBPCG.c index d5084da85..ff47d74a4 100644 --- a/src/CalcByLOBPCG.c +++ b/src/CalcByLOBPCG.c @@ -48,10 +48,10 @@ void zgemm_(char *transa, char *transb, int *m, int *n, int *k, double complex * with the Lowdin's orthogonalization @return the truncated dimension, nsub2 */ -static int diag_ovrp( +static int diag_ovrlp( int nsub,//!<[in] Original dimension of subspace double complex *hsub,//!<[inout] (nsub*nsub) subspace hamiltonian -> eigenvector - double complex *ovlp,//!<[inout] (nsub*nsub) Overrap matrix -> @f${\hat O}^{1/2}@f$ + double complex *ovrlp,//!<[inout] (nsub*nsub) overlap matrix -> @f${\hat O}^{1/2}@f$ double *eig//!<[out] (nsub) Eigenvalue ) { @@ -77,9 +77,9 @@ static int diag_ovrp( work = cd_1d_allocate(lwork); #endif /**@brief - (1) Compute @f${\hat O}^{-1/2}@f$ with diagonalizing overrap matrix + (1) Compute @f${\hat O}^{-1/2}@f$ with diagonalizing overlap matrix */ - zheevd_(&jobz, &uplo, &nsub, ovlp, &nsub, eig, work, &lwork, rwork, &lrwork, iwork, &liwork, &info); + zheevd_(&jobz, &uplo, &nsub, ovrlp, &nsub, eig, work, &lwork, rwork, &lrwork, iwork, &liwork, &info); /**@brief @f[ {\hat O}^{-1/2} = \left(\frac{|O_1\rangle}{\sqrt{o_1}}, \frac{|O_2\rangle}{\sqrt{o_2}}, @@ -94,21 +94,21 @@ static int diag_ovrp( for (isub = 0; isub < nsub; isub++) { if (eig[isub] > 1.0e-10) { /*to be changed default 1.0e-14*/ for (jsub = 0; jsub < nsub; jsub++) - ovlp[jsub + nsub*nsub2] = ovlp[jsub + nsub*isub] / sqrt(eig[isub]); + ovrlp[jsub + nsub*nsub2] = ovrlp[jsub + nsub*isub] / sqrt(eig[isub]); nsub2 += 1; } } for (isub = nsub2; isub < nsub; isub++) for (jsub = 0; jsub < nsub; jsub++) - ovlp[jsub + nsub*isub] = 0.0; + ovrlp[jsub + nsub*isub] = 0.0; /** (2) Transform @f${\hat H}'\equiv {\hat O}^{-1/2 \dagger}{\hat H}{\hat O}^{-1/2}@f$. @f${\hat H}'@f$ is nsub2*nsub2 matrix. */ transa = 'N'; - zgemm_(&transa, &transb, &nsub, &nsub, &nsub, &one, hsub, &nsub, ovlp, &nsub, &zero, mat, &nsub); + zgemm_(&transa, &transb, &nsub, &nsub, &nsub, &one, hsub, &nsub, ovrlp, &nsub, &zero, mat, &nsub); transa = 'C'; - zgemm_(&transa, &transb, &nsub, &nsub, &nsub, &one, ovlp, &nsub, mat, &nsub, &zero, hsub, &nsub); + zgemm_(&transa, &transb, &nsub, &nsub, &nsub, &one, ovrlp, &nsub, mat, &nsub, &zero, hsub, &nsub); /** (3) Diagonalize @f${\hat H}'@f$. It is the standard eigenvalue problem. @f[ @@ -123,7 +123,7 @@ static int diag_ovrp( @f] */ transa = 'N'; - zgemm_(&transa, &transb, &nsub, &nsub, &nsub, &one, ovlp, &nsub, hsub, &nsub, &zero, mat, &nsub); + zgemm_(&transa, &transb, &nsub, &nsub, &nsub, &one, ovrlp, &nsub, hsub, &nsub, &zero, mat, &nsub); // printf("%d %d %15.5f %15.5f %15.5f\n", info, nsub2, eig[0], eig[1], eig[2]); for (isub = 0; isub < nsub*nsub; isub++)hsub[isub] = mat[isub]; @@ -132,8 +132,11 @@ static int diag_ovrp( free_d_1d_allocate(rwork); free_i_1d_allocate(iwork); - return(nsub2); -}/*void diag_ovrp*/ + BcastMPI_cv(0, nsub * nsub, hsub); + BcastMPI_cv(0, nsub * nsub, ovrlp); + BcastMPI_dv(0, nsub, eig); + return(BcastMPI_i(0, nsub2)); +}/*void diag_ovrlp*/ /**@brief Compute adaptively shifted preconditionar written in S. Yamada, et al., Transactions of JSCES, Paper No. 20060027 (2006). @@ -359,7 +362,7 @@ int LOBPCG_Main( int ii, jj, ie, nsub, stp, nsub_cut, nstate; double complex ***wxp/*[0] w, [1] x, [2] p of Ref.1*/, ***hwxp/*[0] h*w, [1] h*x, [2] h*p of Ref.1*/, - ****hsub, ****ovlp; /*Subspace Hamiltonian and Overlap*/ + ****hsub, ****ovrlp; /*Subspace Hamiltonian and Overlap*/ double *eig, *dnorm, eps_LOBPCG, eigabs_max, preshift, precon, dnormmax, *eigsub, eig_pos_shift; char tN = 'N', tC = 'C'; double complex one = 1.0, zero = 0.0; @@ -372,7 +375,7 @@ int LOBPCG_Main( dnorm = d_1d_allocate(X->Def.k_exct); eigsub = d_1d_allocate(nsub); hsub = cd_4d_allocate(3, X->Def.k_exct, 3, X->Def.k_exct); - ovlp = cd_4d_allocate(3, X->Def.k_exct, 3, X->Def.k_exct); + ovrlp = cd_4d_allocate(3, X->Def.k_exct, 3, X->Def.k_exct); i_max = X->Check.idim_max; i4_max = (int)i_max; @@ -500,7 +503,7 @@ private(idim,precon,ie) TimeKeeperWithStep(X, cFileNameTimeKeep, cLanczos_EigenValueStep, "a", stp); /**@brief -
  • Compute subspace Hamiltonian and overrap matrix: +
  • Compute subspace Hamiltonian and overlap matrix: @f${\hat H}_{\rm sub}=\{{\bf w},{\bf x},{\bf p}\}^\dagger \{{\bf W},{\bf X},{\bf P}\}@f$, @f${\hat O}=\{{\bf w},{\bf x},{\bf p}\}^\dagger \{{\bf w},{\bf x},{\bf p}\}@f$,
  • @@ -508,12 +511,12 @@ private(idim,precon,ie) for (ii = 0; ii < 3; ii++) { for (jj = 0; jj < 3; jj++) { zgemm_(&tN, &tC, &nstate, &nstate, &i4_max, &one, - &wxp[ii][1][0], &nstate, &wxp[jj][1][0], &nstate, &zero, &ovlp[jj][0][ii][0], &nsub); + &wxp[ii][1][0], &nstate, &wxp[jj][1][0], &nstate, &zero, &ovrlp[jj][0][ii][0], &nsub); zgemm_(&tN, &tC, &nstate, &nstate, &i4_max, &one, &wxp[ii][1][0], &nstate, &hwxp[jj][1][0], &nstate, &zero, &hsub[jj][0][ii][0], &nsub); } } - SumMPI_cv(nsub*nsub, &ovlp[0][0][0][0]); + SumMPI_cv(nsub*nsub, &ovrlp[0][0][0][0]); SumMPI_cv(nsub*nsub, &hsub[0][0][0][0]); for (ie = 0; ie < X->Def.k_exct; ie++) @@ -523,7 +526,7 @@ private(idim,precon,ie) generalized eigenvalue problem: @f${\hat H}_{\rm sub}{\bf v}={\hat O}\mu_{\rm sub}{\bf v}@f$, @f${\bf v}=(\alpha, \beta, \gamma)@f$ */ - nsub_cut = diag_ovrp(nsub, &hsub[0][0][0][0], &ovlp[0][0][0][0], eigsub); + nsub_cut = diag_ovrlp(nsub, &hsub[0][0][0][0], &ovrlp[0][0][0][0], eigsub); /**@brief
  • Update @f$\mu=(\mu+\mu_{\rm sub})/2@f$
  • */ @@ -604,7 +607,7 @@ private(idim,precon,ie) free_d_1d_allocate(dnorm); free_d_1d_allocate(eigsub); free_cd_4d_allocate(hsub); - free_cd_4d_allocate(ovlp); + free_cd_4d_allocate(ovrlp); free_cd_3d_allocate(hwxp); /**@brief
  • Output resulting vectors for restart
  • diff --git a/src/include/wrapperMPI.h b/src/include/wrapperMPI.h index 9bea50e78..51033958e 100644 --- a/src/include/wrapperMPI.h +++ b/src/include/wrapperMPI.h @@ -37,6 +37,9 @@ void SumMPI_cv(int nnorm, double complex *norm); unsigned long int SumMPI_li(unsigned long int idim); int SumMPI_i(int idim); unsigned long int BcastMPI_li(int root, unsigned long int idim); +int BcastMPI_i(int root, int nsub); +void BcastMPI_dv(int root, int nlen, double* vector); +void BcastMPI_cv(int root, int nlen, double complex* vector); double NormMPI_dc(unsigned long int idim, double complex *_v1); void NormMPI_dv(unsigned long int ndim, int nstate, double complex **_v1, double *dnorm); double complex VecProdMPI(long unsigned int ndim, double complex *v1, double complex *v2); diff --git a/src/wrapperMPI.c b/src/wrapperMPI.c index 2c54dfb3e..e5d68650f 100644 --- a/src/wrapperMPI.c +++ b/src/wrapperMPI.c @@ -313,6 +313,50 @@ unsigned long int BcastMPI_li( return(idim0); }/*unsigned long int BcastMPI_li*/ /** +@brief MPI wrapper function to broadcast integer across processes. +@return Broadcasted value across processes. +@author Mitsuaki Kawamura (The University of Tokyo) +*/ +int BcastMPI_i( + int root,//!<[in] The source process of the broadcast + int nsub//!<[in] Value to be broadcasted +) { + int nsub0; + nsub0 = nsub; +#ifdef MPI + MPI_Bcast(&nsub0, 1, MPI_INT, root, MPI_COMM_WORLD); +#endif + return(nsub0); +}/*int BcastMPI_i*/ +/** +@brief MPI wrapper function to broadcast double precision vector. + And store it in place. +@author Mitsuaki Kawamura (The University of Tokyo) +*/ +void BcastMPI_dv( + int root,//!<[in] The source process of the broadcast + int nlen,//!<[in] Length of broadcasted vector + double *vector//!<[inout] Broadcasted vector +) { +#ifdef MPI + MPI_Bcast(vector, nlen, MPI_DOUBLE_PRECISION, root, MPI_COMM_WORLD); +#endif +}/*void BcastMPI_dv*/ +/** +@brief MPI wrapper function to broadcast double complex vector. + And store it in place. +@author Mitsuaki Kawamura (The University of Tokyo) +*/ +void BcastMPI_cv( + int root,//!<[in] The source process of the broadcast + int nlen,//!<[in] Length of broadcasted vector + double complex* vector//!<[inout] Broadcasted vector +) { +#ifdef MPI + MPI_Bcast(vector, nlen, MPI_DOUBLE_COMPLEX, root, MPI_COMM_WORLD); +#endif +}/*void BcastMPI_cv*/ +/** @brief Compute norm of process-distributed vector @f$|{\bf v}_1|^2@f$ @return Norm @f$|{\bf v}_1|^2@f$