From 6ea5c6cc4494487a88f2e3a90f3a460a99cca24e Mon Sep 17 00:00:00 2001
From: puzhichen <147788878+puzhichen@users.noreply.github.com>
Date: Mon, 20 Jan 2025 23:13:36 +0800
Subject: [PATCH] Davidson iterations for tddft on GPU (#305)

* some mmodifications

* test

* finish writting, start debugging

* Finish debugging and unit tests.

* remove some comments and unused codes

* after review the codes

* change the threshold in precond

* add the import _response_functions

* change codes according to review comments
---
 gpu4pyscf/tdscf/_lr_eig.py          | 209 +++++++++++++++++-----------
 gpu4pyscf/tdscf/rhf.py              |  35 +++--
 gpu4pyscf/tdscf/rks.py              |   7 +-
 gpu4pyscf/tdscf/tests/test_tdrhf.py |   8 +-
 gpu4pyscf/tdscf/tests/test_tdrks.py |   6 +-
 gpu4pyscf/tdscf/tests/test_tduhf.py |   4 +-
 gpu4pyscf/tdscf/tests/test_tduks.py |   6 +-
 gpu4pyscf/tdscf/uhf.py              |  22 +--
 gpu4pyscf/tdscf/uks.py              |   8 +-
 9 files changed, 178 insertions(+), 127 deletions(-)

diff --git a/gpu4pyscf/tdscf/_lr_eig.py b/gpu4pyscf/tdscf/_lr_eig.py
index 880732e0..ae72899f 100644
--- a/gpu4pyscf/tdscf/_lr_eig.py
+++ b/gpu4pyscf/tdscf/_lr_eig.py
@@ -17,9 +17,11 @@
 functions are provided in future PySCF release.
 '''
 
+import cupy as cp
 import sys
 import numpy as np
 import scipy.linalg
+import cupyx.scipy.linalg
 from pyscf.lib.parameters import MAX_MEMORY
 from pyscf.lib import logger
 from pyscf.lib.linalg_helper import _sort_elast, _outprod_to_subspace
@@ -85,7 +87,8 @@ def eigh(aop, x0, precond, tol_residual=1e-5, lindep=1e-12, nroots=1,
 
     if isinstance(x0, np.ndarray) and x0.ndim == 1:
         x0 = x0[None,:]
-    x0 = np.asarray(x0)
+    
+    x0 = cp.asarray(x0)
 
     x0_size = x0.shape[1]
     if MAX_SPACE_INC is None:
@@ -101,15 +104,15 @@ def eigh(aop, x0, precond, tol_residual=1e-5, lindep=1e-12, nroots=1,
     max_space = min(max_space, x0_size)
     log.debug(f'Set max_space {max_space}, space_inc {space_inc}')
 
-    xs = np.zeros((0, x0_size))
-    ax = np.zeros((0, x0_size))
+    xs = cp.zeros((0, x0_size))
+    ax = cp.zeros((0, x0_size))
     e = w = v = None
-    conv_last = conv = np.zeros(nroots, dtype=bool)
+    conv_last = conv = cp.zeros(nroots, dtype=bool)
     xt = x0
 
     if x0sym is not None:
-        xt_ir = np.asarray(x0sym)
-        xs_ir = np.array([], dtype=xt_ir.dtype)
+        xt_ir = cp.asarray(x0sym)
+        xs_ir = cp.array([], dtype=xt_ir.dtype)
 
     for icyc in range(max_cycle):
         xt, xt_idx = _qr(xt, lindep)
@@ -119,39 +122,46 @@ def eigh(aop, x0, precond, tol_residual=1e-5, lindep=1e-12, nroots=1,
 
         row0 = len(xs)
         axt = aop(xt)
-        xs = np.vstack([xs, xt])
-        ax = np.vstack([ax, axt])
+        xs = cp.vstack([xs, xt])
+        ax = cp.vstack([ax, axt])
+        axt = None
         if x0sym is not None:
-            xs_ir = np.hstack([xs_ir, xt_ir[xt_idx]])
+            xs_ir = cp.hstack([xs_ir, xt_ir[xt_idx]])
 
         # Compute heff = xs.conj().dot(ax.T)
         if w is None:
             heff = xs.conj().dot(ax.T)
         else:
             hsub = xt.conj().dot(ax.T)
-            heff = np.block([[np.diag(w), hsub[:,:row0].conj().T],
-                             [hsub[:,:row0], hsub[:,row0:]]])
+            heff = cp.empty((len(xs), len(xs)), dtype=hsub.dtype)
+            heff[:row0,:row0] = cp.diag(w)
+            heff[:row0,row0:] = hsub[:,:row0].conj().T
+            heff[row0:,:row0] = hsub[:,:row0]
+            heff[row0:,row0:] = hsub[:,row0:]
+        hsub = None
+        xt = None
 
         if x0sym is None:
-            w, v = scipy.linalg.eigh(heff)
+            w, v = cp.linalg.eigh(heff)
         else:
             # Diagonalize within eash symmetry sectors
             row1 = len(xs)
-            w = np.empty(row1)
-            v = np.zeros((row1, row1))
+            w = cp.empty(row1)
+            v = cp.zeros((row1, row1))
             v_ir = []
             i1 = 0
             for ir in set(xs_ir):
-                idx = np.where(xs_ir == ir)[0]
+                idx = cp.where(xs_ir == ir)[0]
                 i0, i1 = i1, i1 + idx.size
-                w_sub, v_sub = scipy.linalg.eigh(heff[idx[:,None],idx])
+                w_sub, v_sub = cp.linalg.eigh(heff[idx[:,None],idx])
                 w[i0:i1] = w_sub
                 v[idx,i0:i1] = v_sub
                 v_ir.append([ir] * idx.size)
-            w_idx = np.argsort(w)
+            w_idx = cp.argsort(w)
             w = w[w_idx]
             v = v[:,w_idx]
-            xs_ir = np.hstack(v_ir)[w_idx]
+            xs_ir = cp.hstack(v_ir)[w_idx]
+        heff = None
 
         w, v, idx = pick(w, v, nroots, locals())
         if x0sym is not None:
@@ -168,8 +178,8 @@ def eigh(aop, x0, precond, tol_residual=1e-5, lindep=1e-12, nroots=1,
             de = e
         else:
             # mapping to previous eigenvectors
-            vlast = np.eye(nroots)
-            elast, conv_last = _sort_elast(elast, conv, vlast,
+            vlast = cp.eye(nroots)
+            elast, conv_last = _sort_elast_gpu(elast, conv, vlast,
                                            v[:nroots,:nroots], log)
             de = e - elast
 
@@ -189,7 +199,7 @@ def eigh(aop, x0, precond, tol_residual=1e-5, lindep=1e-12, nroots=1,
         if x0sym is not None:
             xt_ir = xs_ir[:t_size]
 
-        dx_norm = np.linalg.norm(xt, axis=1)
+        dx_norm = cp.linalg.norm(xt, axis=1)
         max_dx_norm = max(dx_norm[:nroots])
         conv = dx_norm[:nroots] < tol_residual
         for k, ek in enumerate(e[:nroots]):
@@ -199,18 +209,17 @@ def eigh(aop, x0, precond, tol_residual=1e-5, lindep=1e-12, nroots=1,
             else:
                 log.debug1('root %d  |r|= %4.3g  e= %s  max|de|= %4.3g',
                           k, dx_norm[k], ek, de[k])
-        ide = np.argmax(abs(de))
+        ide = cp.argmax(abs(de))
         if all(conv):
             log.debug('converged %d %d  |r|= %4.3g  e= %s  max|de|= %4.3g',
                       icyc, len(xs), max_dx_norm, e, de[ide])
             break
 
         # remove subspace linear dependency
-        for k, xk in enumerate(xt):
-            if dx_norm[k] > tol_residual:
-                xt[k] = precond(xk, e[0])
+        r_index = dx_norm > tol_residual
+        xt[r_index] = precond(xt[r_index], w[:t_size][r_index])
         xt -= xs.conj().dot(xt.T).T.dot(xs)
-        xt_norm = np.linalg.norm(xt, axis=1)
+        xt_norm = cp.linalg.norm(xt, axis=1)
 
         remaining = []
         for k, xk in enumerate(xt):
@@ -236,7 +245,7 @@ def eigh(aop, x0, precond, tol_residual=1e-5, lindep=1e-12, nroots=1,
     if len(x0) < min(x0_size, nroots):
         log.warn(f'Not enough eigenvectors (len(x0)={len(x0)}, nroots={nroots})')
 
-    return conv, e, x0
+    return conv, e.get(), x0.get()
 
 def eig(aop, x0, precond, tol_residual=1e-5, nroots=1, x0sym=None, pick=None,
         max_cycle=50, max_memory=MAX_MEMORY, lindep=1e-12, verbose=logger.WARN):
@@ -541,8 +550,8 @@ def real_eig(aop, x0, precond, tol_residual=1e-5, nroots=1, x0sym=None, pick=Non
         log = verbose
     else:
         log = logger.Logger(sys.stdout, verbose)
-
     assert x0.ndim == 2
+    x0 = cp.asarray(x0)
     A_size = x0.shape[1] // 2
     V = x0[:,:A_size]
     W = x0[:,A_size:]
@@ -562,23 +571,23 @@ def real_eig(aop, x0, precond, tol_residual=1e-5, nroots=1, x0sym=None, pick=Non
     log.debug(f'Set max_space {max_space}, space_inc {space_inc}')
 
     if x0sym is not None:
-        x0_ir = np.asarray(x0sym)
+        x0_ir = cp.asarray(x0sym)
 
     '''U1 = AV + BW
        U2 = AW + BV'''
-    V_holder = np.empty((A_size, max_space), order='F')
-    W_holder = np.empty_like(V_holder)
-    U1_holder = np.empty_like(V_holder)
-    U2_holder = np.empty_like(V_holder)
-
-    a = np.empty((max_space*2,max_space*2))
-    b = np.empty_like(a)
-    sigma = np.empty_like(a)
-    pi = np.empty_like(a)
+    V_holder = cp.empty((A_size, max_space), order='F')
+    W_holder = cp.empty_like(V_holder)
+    U1_holder = cp.empty_like(V_holder)
+    U2_holder = cp.empty_like(V_holder)
+
+    a = cp.empty((max_space*2,max_space*2))
+    b = cp.empty_like(a)
+    sigma = cp.empty_like(a)
+    pi = cp.empty_like(a)
     e = None
     v_sub = None
     vlast = None
-    conv_last = conv = np.zeros(nroots, dtype=bool)
+    conv_last = conv = cp.zeros(nroots, dtype=bool)
 
     fresh_start = True
     for icyc in range(max_cycle):
@@ -588,7 +597,7 @@ def real_eig(aop, x0, precond, tol_residual=1e-5, nroots=1, x0sym=None, pick=Non
             if x0sym is not None:
                 xs_ir = xt_ir = x0_ir
 
-        axt = aop(np.hstack([V, W]))
+        axt = aop(cp.hstack([V, W]))
         U1 =  axt[:,:A_size]
         U2 = -axt[:,A_size:]
         m0, m1 = m1, m1+len(U1)
@@ -633,13 +642,13 @@ def real_eig(aop, x0, precond, tol_residual=1e-5, nroots=1, x0sym=None, pick=Non
                 a[:m1,:m1], b[:m1,:m1], sigma[:m1,:m1], pi[:m1,:m1], space_inc)
         else:
             # Diagonalize within eash symmetry sectors
-            omega = np.empty(m1)
-            x = np.zeros((m1, m1))
-            y = np.zeros_like(x)
+            omega = cp.empty(m1)
+            x = cp.zeros((m1, m1))
+            y = cp.zeros_like(x)
             v_ir = []
             i1 = 0
             for ir in set(xs_ir):
-                idx = np.nonzero(xs_ir[:m1] == ir)[0]
+                idx = cp.nonzero(xs_ir[:m1] == ir)[0]
                 _w, _x, _y = TDDFT_subspace_eigen_solver(
                     a[idx[:,None],idx], b[idx[:,None],idx],
                     sigma[idx[:,None],idx], pi[idx[:,None],idx], idx.size)
@@ -648,16 +657,16 @@ def real_eig(aop, x0, precond, tol_residual=1e-5, nroots=1, x0sym=None, pick=Non
                 x[idx,i0:i1] = _x
                 y[idx,i0:i1] = _y
                 v_ir.append([ir] * _w.size)
-            idx = np.argsort(omega)
+            idx = cp.argsort(omega)
             omega = omega[idx]
-            v_ir = np.hstack(v_ir)[idx]
+            v_ir = cp.hstack(v_ir)[idx]
             x = x[:,idx]
             y = y[:,idx]
 
         w, e, elast = omega[:space_inc], omega[:nroots], e
         v_sub = x[:,:space_inc]
         if not fresh_start:
-            elast, conv_last = _sort_elast(elast, conv, vlast, v_sub[:,:nroots], log)
+            elast, conv_last = _sort_elast_gpu(elast, conv, vlast, v_sub[:,:nroots], log)
         vlast = v_sub[:,:nroots]
 
         if elast is None:
@@ -684,8 +693,8 @@ def real_eig(aop, x0, precond, tol_residual=1e-5, nroots=1, x0sym=None, pick=Non
         R_y += U1_holder[:,:m1].dot(y)
         R_y += Y_full * w
 
-        r_norms  = np.linalg.norm(R_x, axis=0) ** 2
-        r_norms += np.linalg.norm(R_y, axis=0) ** 2
+        r_norms  = cp.linalg.norm(R_x, axis=0) ** 2
+        r_norms += cp.linalg.norm(R_y, axis=0) ** 2
         r_norms = r_norms ** .5
 
         if x0sym is not None:
@@ -701,14 +710,14 @@ def real_eig(aop, x0, precond, tol_residual=1e-5, nroots=1, x0sym=None, pick=Non
             else:
                 log.debug1('root %d  |r|= %4.3g  e= %s  max|de|= %4.3g',
                           k, r_norms[k], ek, de[k])
-        ide = np.argmax(abs(de))
+        ide = cp.argmax(abs(de))
         if all(conv):
             log.debug('converged %d %d  |r|= %4.3g  e= %s  max|de|= %4.3g',
                       icyc, len(conv), max_r_norm, e, de[ide])
             break
 
         r_index = r_norms > tol_residual
-        XY_new = precond(np.vstack([R_x[:,r_index], R_y[:,r_index]]).T, w[r_index])
+        XY_new = precond(cp.vstack([R_x[:,r_index], R_y[:,r_index]]).T, w[r_index])
         X_new = XY_new[:,:A_size].T
         Y_new = XY_new[:,A_size:].T
         if x0sym is None:
@@ -720,17 +729,17 @@ def real_eig(aop, x0, precond, tol_residual=1e-5, nroots=1, x0sym=None, pick=Non
             V = []
             W = []
             for ir in set(xt_ir):
-                idx = np.nonzero(xt_ir == ir)[0]
+                idx = cp.nonzero(xt_ir == ir)[0]
                 _V, _W = VW_Gram_Schmidt_fill_holder(
                     V_holder[:,:m1], W_holder[:,:m1], X_new[:,idx], Y_new[:,idx])
                 V.append(_V)
                 W.append(_W)
                 xt_orth_ir.append([ir] * len(_V))
             if len(V) > 0:
-                V = np.vstack(V)
-                W = np.vstack(W)
-                xt_ir = np.hstack(xt_orth_ir)
-                xs_ir = np.hstack([xs_ir, xt_ir])
+                V = cp.vstack(V)
+                W = cp.vstack(W)
+                xt_ir = cp.hstack(xt_orth_ir)
+                xs_ir = cp.hstack([xs_ir, xt_ir])
 
         if len(V) == 0:
             log.debug(f'Linear dependency in trial subspace. |r| for each state {r_norms}')
@@ -740,8 +749,8 @@ def real_eig(aop, x0, precond, tol_residual=1e-5, nroots=1, x0sym=None, pick=Non
                    len(V), r_norms.size - len(V))
         X_new = Y_new = R_x = R_y = None
 
-        xy_norms  = np.linalg.norm(V, axis=0) ** 2
-        xy_norms += np.linalg.norm(W, axis=0) ** 2
+        xy_norms  = cp.linalg.norm(V, axis=0) ** 2
+        xy_norms += cp.linalg.norm(W, axis=0) ** 2
         norm_min = (xy_norms ** .5).min()
         log.debug('real_lr_eig %d %d  |r|= %4.3g  e= %s  max|de|= %4.3g  lindep= %4.3g',
                   icyc, m1, max_r_norm, e, de[ide], norm_min)
@@ -752,7 +761,7 @@ def real_eig(aop, x0, precond, tol_residual=1e-5, nroots=1, x0sym=None, pick=Non
     if len(x0[0]) < min(A_size, nroots):
         log.warn(f'Not enough eigenvectors (len(x0)={len(x0[0])}, nroots={nroots})')
 
-    return conv[:nroots], e[:nroots], np.hstack(x0)
+    return conv[:nroots], e[:nroots].get(), cp.hstack(x0).get()
 
 def _gen_x0(v, xs):
     out = _outprod_to_subspace(v[::2], xs)
@@ -788,13 +797,14 @@ def _qr(xs, lindep=1e-14):
         for j in range(nv):
             prod = xs[j].conj().dot(xi)
             xi -= xs[j] * prod
-        norm = np.linalg.norm(xi)
+        norm = cp.linalg.norm(xi)
         if norm**2 > lindep:
             xs[nv] = xi/norm
             nv += 1
             idx.append(i)
     return xs[:nv], idx
 
+
 def _symmetric_orth(xt, lindep=1e-6):
     xt = np.asarray(xt)
     if xt.dtype == np.float64:
@@ -906,31 +916,31 @@ def TDDFT_subspace_eigen_solver(a, b, sigma, pi, nroots):
     ''' [ a b ] x - [ σ   π] x  Ω = 0 '''
     ''' [ b a ] y   [-π  -σ] y    = 0 '''
 
-    d = abs(np.diag(sigma))
+    d = abs(cp.diag(sigma))
     d_mh = d**(-0.5)
 
-    s_m_p = np.einsum('i,ij,j->ij', d_mh, sigma - pi, d_mh)
+    s_m_p = cp.einsum('i,ij,j->ij', d_mh, sigma - pi, d_mh)
 
     '''LU = d^−1/2 (σ − π) d^−1/2'''
     ''' A = LU '''
-    L, U = scipy.linalg.lu(s_m_p, permute_l=True)
-    L_inv = np.linalg.inv(L)
-    U_inv = np.linalg.inv(U)
+    L, U = cupyx.scipy.linalg.lu(s_m_p, permute_l=True)
+    L_inv = cp.linalg.inv(L)
+    U_inv = cp.linalg.inv(U)
 
     '''U^-T d^−1/2 (a−b) d^-1/2 U^-1 = GG^T '''
-    d_amb_d = np.einsum('i,ij,j->ij', d_mh, a-b, d_mh)
-    GGT = np.linalg.multi_dot([U_inv.T, d_amb_d, U_inv])
+    d_amb_d = cp.einsum('i,ij,j->ij', d_mh, a-b, d_mh)
+    GGT = cp.dot(U_inv.T, cp.dot(d_amb_d, U_inv))
 
-    G = scipy.linalg.cholesky(GGT, lower=True)
-    G_inv = np.linalg.inv(G)
+    G = cp.linalg.cholesky(GGT)
+    G_inv = cp.linalg.inv(G)
 
     ''' M = G^T L^−1 d^−1/2 (a+b) d^−1/2 L^−T G '''
-    d_apb_d = np.einsum('i,ij,j->ij', d_mh, a+b, d_mh)
-    M = np.linalg.multi_dot([G.T, L_inv, d_apb_d, L_inv.T, G])
+    d_apb_d = cp.einsum('i,ij,j->ij', d_mh, a+b, d_mh)
+    M = cp.dot(G.T, cp.dot(L_inv, cp.dot(d_apb_d, cp.dot(L_inv.T, G))))
 
-    omega2, Z = np.linalg.eigh(M)
-    if np.any(omega2 <= 0):
-        idx = np.nonzero(omega2 > 0)[0]
+    omega2, Z = cp.linalg.eigh(M)
+    if cp.any(omega2 <= 0):
+        idx = cp.nonzero(omega2 > 0)[0]
         omega2 = omega2[idx[:nroots]]
         Z = Z[:,idx[:nroots]]
     else:
@@ -941,13 +951,14 @@ def TDDFT_subspace_eigen_solver(a, b, sigma, pi, nroots):
     ''' It requires Z^T Z = 1/Ω '''
     ''' x+y = d^−1/2 L^−T GZ Ω^-0.5 '''
     ''' x−y = d^−1/2 U^−1 G^−T Z Ω^0.5 '''
-    x_p_y = np.einsum('i,ik,k->ik', d_mh, L_inv.T.dot(G.dot(Z)), omega**-0.5)
-    x_m_y = np.einsum('i,ik,k->ik', d_mh, U_inv.dot(G_inv.T.dot(Z)), omega**0.5)
+    x_p_y = cp.einsum('i,ik,k->ik', d_mh, L_inv.T.dot(G.dot(Z)), omega**-0.5)
+    x_m_y = cp.einsum('i,ik,k->ik', d_mh, U_inv.dot(G_inv.T.dot(Z)), omega**0.5)
 
     x = (x_p_y + x_m_y)/2
     y = x_p_y - x
     return omega, x, y
 
+
 def VW_Gram_Schmidt_fill_holder(V_holder, W_holder, X_new, Y_new, lindep=1e-6):
     '''
     QR orthogonalization for (X_new, Y_new) basis vectors, then apply symmetric
@@ -968,37 +979,40 @@ def VW_Gram_Schmidt_fill_holder(V_holder, W_holder, X_new, Y_new, lindep=1e-6):
     # s21 is symmetric
     s21  = X_new.T.dot(Y_new)
     s21 += Y_new.T.dot(X_new)
-    e, c = np.linalg.eigh(s11)
+    e, c = cp.linalg.eigh(s11)
     mask = e > lindep**2
     e = e[mask]
     if e.size == 0:
-        return (np.zeros([0, x0_size], dtype=X_new.dtype),
-                np.zeros([0, x0_size], dtype=Y_new.dtype))
+        return (cp.zeros([0, x0_size], dtype=X_new.dtype),
+                cp.zeros([0, x0_size], dtype=Y_new.dtype))
     c = c[:,mask] * e**-.5
 
     csc = c.T.dot(s21).dot(c)
     n = csc.shape[0]
     for i in range(n):
-        w, u = np.linalg.eigh(csc[i:,i:])
+        w, u = cp.linalg.eigh(csc[i:,i:])
         mask = 1 - abs(w) > lindep
-        if np.any(mask):
+        if cp.any(mask):
             c = c[:,i:]
             break
     else:
-        return (np.zeros([0, x0_size], dtype=X_new.dtype),
-                np.zeros([0, x0_size], dtype=Y_new.dtype))
+        return (cp.zeros([0, x0_size], dtype=X_new.dtype),
+                cp.zeros([0, x0_size], dtype=Y_new.dtype))
     w = w[mask]
     u = u[:,mask]
     c_orth = c.dot(u)
 
     if e[0] < 1e-6 or 1-abs(w[0]) < 1e-3:
         # Rerun the orthogonalization to reduce numerical errors
-        e, c = np.linalg.eigh(c_orth.T.dot(s11).dot(c_orth))
+        e, c = cp.linalg.eigh(c_orth.T.dot(s11).dot(c_orth))
         c *= e**-.5
         c_orth = c_orth.dot(c)
         csc = c_orth.T.dot(s21).dot(c_orth)
-        w, u = np.linalg.eigh(csc)
+        w, u = cp.linalg.eigh(csc)
         c_orth = c_orth.dot(u)
+    mask = 1 - abs(w) > lindep
+    w = w[mask]
+    c_orth = c_orth[:,mask]
 
     # Symmetric diagonalize
     # [1 w] => c = [a b]
@@ -1016,3 +1030,28 @@ def VW_Gram_Schmidt_fill_holder(V_holder, W_holder, X_new, Y_new, lindep=1e-6):
     y_orth  = Y_new.dot(c_orth * a)
     y_orth += X_new.dot(c_orth * b)
     return x_orth.T, y_orth.T
+
+
+def _sort_elast_gpu(elast, conv_last, vlast, v, log):
+    '''
+    Eigenstates may be flipped during the Davidson iterations.  Reorder the
+    eigenvalues of last iteration to make them comparable to the eigenvalues
+    of the current iterations.
+    '''
+    head, nroots = vlast.shape
+    ovlp = abs(cp.dot(v[:head].conj().T, vlast))
+    mapping = cp.argmax(ovlp, axis=1)
+    found = cp.any(ovlp > .5, axis=1)
+
+    if log.verbose >= logger.DEBUG:
+        ordering_diff = (mapping != cp.arange(len(mapping)))
+        if any(ordering_diff & found):
+            log.debug('Old state -> New state')
+            for i in cp.where(ordering_diff)[0]:
+                log.debug('  %3d     ->   %3d ', mapping[i], i)
+
+    conv = conv_last[mapping]
+    e = elast[mapping]
+    conv[~found] = False
+    e[~found] = 0.
+    return e, conv
\ No newline at end of file
diff --git a/gpu4pyscf/tdscf/rhf.py b/gpu4pyscf/tdscf/rhf.py
index 55b7618c..a7b5c4f8 100644
--- a/gpu4pyscf/tdscf/rhf.py
+++ b/gpu4pyscf/tdscf/rhf.py
@@ -15,11 +15,9 @@
 
 import numpy as np
 import cupy as cp
-import scipy.linalg
-from pyscf import gto
 from pyscf import lib
 from pyscf.tdscf import rhf as tdhf_cpu
-from gpu4pyscf.tdscf._lr_eig import eigh as lr_eigh, eig as lr_eig, real_eig
+from gpu4pyscf.tdscf._lr_eig import eigh as lr_eigh, real_eig
 from gpu4pyscf import scf
 from gpu4pyscf.lib.cupy_helper import contract, tag_array
 from gpu4pyscf.lib import utils
@@ -53,7 +51,7 @@ def gen_tda_operation(mf, fock_ao=None, singlet=True, wfnsym=None):
     orbo2 = orbo * 2. # *2 for double occupancy
 
     e_ia = hdiag = mo_energy[viridx] - mo_energy[occidx,None]
-    hdiag = hdiag.ravel().get()
+    hdiag = hdiag.ravel()
     vresp = mf.gen_response(singlet=singlet, hermi=0)
     nocc, nvir = e_ia.shape
 
@@ -66,7 +64,7 @@ def vind(zs):
         v1mo = contract('xpq,qo->xpo', v1ao, orbo)
         v1mo = contract('xpo,pv->xov', v1mo, orbv.conj())
         v1mo += zs * e_ia
-        return v1mo.reshape(v1mo.shape[0],-1).get()
+        return v1mo.reshape(v1mo.shape[0],-1)
 
     return vind, hdiag
 
@@ -100,11 +98,15 @@ class TDBase(lib.StreamObject):
     get_ab = NotImplemented
 
     def get_precond(self, hdiag):
+        threshold_t=1.0e-4
         def precond(x, e, *args):
-            if isinstance(e, np.ndarray):
-                e = e[0]
+            n_states = x.shape[0]
+            diagd = cp.repeat(hdiag.reshape(1,-1), n_states, axis=0)
+            e = e.reshape(-1,1)
             diagd = hdiag - (e-self.level_shift)
-            diagd[abs(diagd)<1e-8] = 1e-8
+            diagd = cp.where(abs(diagd) < threshold_t, cp.sign(diagd)*threshold_t, diagd)
+            a_size = x.shape[1]//2
+            diagd[:,a_size:] = diagd[:,a_size:]*(-1)
             return x/diagd
         return precond
 
@@ -170,6 +172,17 @@ def _contract_multipole(tdobj, ints, hermi=True, xy=None):
 class TDA(TDBase):
     __doc__ = tdhf_cpu.TDA.__doc__
 
+    def get_precond(self, hdiag):
+        threshold_t=1.0e-4
+        def precond(x, e, *args):
+            n_states = x.shape[0]
+            diagd = cp.repeat(hdiag.reshape(1,-1), n_states, axis=0)
+            e = e.reshape(-1,1)
+            diagd = hdiag - (e-self.level_shift)
+            diagd = cp.where(abs(diagd) < threshold_t, cp.sign(diagd)*threshold_t, diagd)
+            return x/diagd
+        return precond
+
     def gen_vind(self, mf=None):
         '''Generate function to compute Ax'''
         if mf is None:
@@ -228,7 +241,7 @@ def kernel(self, x0=None, nstates=None):
         precond = self.get_precond(hdiag)
 
         def pickeig(w, v, nroots, envs):
-            idx = np.where(w > self.positive_eig_threshold)[0]
+            idx = cp.where(w > self.positive_eig_threshold)[0]
             return w[idx], v[:,idx], idx
 
         x0sym = None
@@ -291,10 +304,10 @@ def vind(zs):
         v1_top += xs * e_ia  # AX
         v1_bot += ys * e_ia  # (A*)Y
         return cp.hstack((v1_top.reshape(nz,nocc*nvir),
-                          -v1_bot.reshape(nz,nocc*nvir))).get()
+                          -v1_bot.reshape(nz,nocc*nvir)))
 
     hdiag = cp.hstack([hdiag.ravel(), -hdiag.ravel()])
-    return vind, hdiag.get()
+    return vind, hdiag
 
 
 class TDHF(TDBase):
diff --git a/gpu4pyscf/tdscf/rks.py b/gpu4pyscf/tdscf/rks.py
index 9961a21c..948c7d50 100644
--- a/gpu4pyscf/tdscf/rks.py
+++ b/gpu4pyscf/tdscf/rks.py
@@ -15,7 +15,7 @@
 import numpy as np
 import cupy as cp
 from pyscf import lib
-from pyscf.tdscf._lr_eig import eigh as lr_eigh
+from gpu4pyscf.tdscf._lr_eig import eigh as lr_eigh
 from gpu4pyscf.dft.rks import KohnShamDFT
 from gpu4pyscf.lib.cupy_helper import contract, tag_array, transpose_sum
 from gpu4pyscf.lib import logger
@@ -54,7 +54,6 @@ def gen_vind(self, mf=None):
         d_ia = e_ia ** .5
         ed_ia = e_ia * d_ia
         hdiag = e_ia.ravel() ** 2
-        hdiag = hdiag.get()
         vresp = mf.gen_response(singlet=singlet, hermi=1)
         nocc, nvir = e_ia.shape
 
@@ -71,7 +70,7 @@ def vind(zs):
             v1mo = contract('xpo,pv->xov', v1mo, orbv)
             v1mo += zs * ed_ia
             v1mo *= d_ia
-            return v1mo.reshape(v1mo.shape[0],-1).get()
+            return v1mo.reshape(v1mo.shape[0],-1)
 
         return vind, hdiag
 
@@ -95,7 +94,7 @@ def kernel(self, x0=None, nstates=None):
         precond = self.get_precond(hdiag)
 
         def pickeig(w, v, nroots, envs):
-            idx = np.where(w > self.positive_eig_threshold)[0]
+            idx = cp.where(w > self.positive_eig_threshold)[0]
             return w[idx], v[:,idx], idx
 
         x0sym = None
diff --git a/gpu4pyscf/tdscf/tests/test_tdrhf.py b/gpu4pyscf/tdscf/tests/test_tdrhf.py
index 4e6761ca..f1a0774a 100644
--- a/gpu4pyscf/tdscf/tests/test_tdrhf.py
+++ b/gpu4pyscf/tdscf/tests/test_tdrhf.py
@@ -125,12 +125,12 @@ def test_tda_vind(self):
         nvir = nmo - nocc
         zs = np.random.rand(3,nocc,nvir)
         ref = mf.to_cpu().TDA().set(singlet=False).gen_vind()[0](zs)
-        dat = mf.TDA().set(singlet=False).gen_vind()[0](cp.asarray(zs))
+        dat = mf.TDA().set(singlet=False).gen_vind()[0](cp.asarray(zs)).get()
         self.assertAlmostEqual(abs(ref - dat).max(), 0, 9)
 
         df_mf = self.df_mf
         ref = df_mf.to_cpu().TDA().set(singlet=True).gen_vind()[0](zs)
-        dat = df_mf.TDA().set(singlet=True).gen_vind()[0](cp.asarray(zs))
+        dat = df_mf.TDA().set(singlet=True).gen_vind()[0](cp.asarray(zs)).get()
         self.assertAlmostEqual(abs(ref - dat).max(), 0, 9)
 
     def test_tdhf_vind(self):
@@ -140,12 +140,12 @@ def test_tdhf_vind(self):
         nvir = nmo - nocc
         zs = np.random.rand(3,2,nocc,nvir)
         ref = mf.to_cpu().TDHF().set(singlet=True).gen_vind()[0](zs)
-        dat = mf.TDHF().set(singlet=True).gen_vind()[0](zs)
+        dat = mf.TDHF().set(singlet=True).gen_vind()[0](zs).get()
         self.assertAlmostEqual(abs(ref - dat).max(), 0, 9)
 
         df_mf = self.df_mf
         ref = df_mf.to_cpu().TDHF().set(singlet=False).gen_vind()[0](zs)
-        dat = df_mf.TDHF().set(singlet=False).gen_vind()[0](zs)
+        dat = df_mf.TDHF().set(singlet=False).gen_vind()[0](zs).get()
         self.assertAlmostEqual(abs(ref - dat).max(), 0, 9)
 
 if __name__ == "__main__":
diff --git a/gpu4pyscf/tdscf/tests/test_tdrks.py b/gpu4pyscf/tdscf/tests/test_tdrks.py
index cebae87c..233b6463 100644
--- a/gpu4pyscf/tdscf/tests/test_tdrks.py
+++ b/gpu4pyscf/tdscf/tests/test_tdrks.py
@@ -251,7 +251,7 @@ def test_tda_vind(self):
         nvir = nmo - nocc
         zs = np.random.rand(3,nocc,nvir)
         ref = mf.to_cpu().TDA().set(singlet=False).gen_vind()[0](zs)
-        dat = mf.TDA().set(singlet=False).gen_vind()[0](cp.asarray(zs))
+        dat = mf.TDA().set(singlet=False).gen_vind()[0](cp.asarray(zs)).get()
         self.assertAlmostEqual(abs(ref - dat).max(), 0, 9)
 
     def test_tddft_vind(self):
@@ -261,7 +261,7 @@ def test_tddft_vind(self):
         nvir = nmo - nocc
         zs = np.random.rand(3,2,nocc,nvir)
         ref = mf.to_cpu().TDDFT().set(singlet=True).gen_vind()[0](zs)
-        dat = mf.TDDFT().set(singlet=True).gen_vind()[0](zs)
+        dat = mf.TDDFT().set(singlet=True).gen_vind()[0](zs).get()
         self.assertAlmostEqual(abs(ref - dat).max(), 0, 9)
 
     def test_casida_tddft_vind(self):
@@ -271,7 +271,7 @@ def test_casida_tddft_vind(self):
         nvir = nmo - nocc
         zs = np.random.rand(3,nocc,nvir)
         ref = mf.to_cpu().CasidaTDDFT().gen_vind()[0](zs)
-        dat = mf.CasidaTDDFT().gen_vind()[0](cp.asarray(zs))
+        dat = mf.CasidaTDDFT().gen_vind()[0](cp.asarray(zs)).get()
         self.assertAlmostEqual(abs(ref - dat).max(), 0, 9)
 
 if __name__ == "__main__":
diff --git a/gpu4pyscf/tdscf/tests/test_tduhf.py b/gpu4pyscf/tdscf/tests/test_tduhf.py
index 884249e4..0acd8578 100644
--- a/gpu4pyscf/tdscf/tests/test_tduhf.py
+++ b/gpu4pyscf/tdscf/tests/test_tduhf.py
@@ -100,7 +100,7 @@ def test_tda_vind(self):
         nvirb = nmo - noccb
         zs = np.random.rand(3,nocca*nvira+noccb*nvirb)
         ref = mf.to_cpu().TDA().set().gen_vind()[0](zs)
-        dat = mf.TDA().set().gen_vind()[0](cp.asarray(zs))
+        dat = mf.TDA().set().gen_vind()[0](cp.asarray(zs)).get()
         self.assertAlmostEqual(abs(ref - dat).max(), 0, 9)
 
     def test_tdhf_vind(self):
@@ -111,7 +111,7 @@ def test_tdhf_vind(self):
         nvirb = nmo - noccb
         zs = np.random.rand(3,2,nocca*nvira+noccb*nvirb)
         ref = mf.to_cpu().TDHF().set().gen_vind()[0](zs)
-        dat = mf.TDHF().set().gen_vind()[0](zs)
+        dat = mf.TDHF().set().gen_vind()[0](zs).get()
         self.assertAlmostEqual(abs(ref - dat).max(), 0, 9)
 
 if __name__ == "__main__":
diff --git a/gpu4pyscf/tdscf/tests/test_tduks.py b/gpu4pyscf/tdscf/tests/test_tduks.py
index ad1e3f45..34c214b8 100644
--- a/gpu4pyscf/tdscf/tests/test_tduks.py
+++ b/gpu4pyscf/tdscf/tests/test_tduks.py
@@ -187,7 +187,7 @@ def test_tda_vind(self):
         nvirb = nmo - noccb
         zs = np.random.rand(3,nocca*nvira+noccb*nvirb)
         ref = mf.to_cpu().TDA().gen_vind()[0](zs)
-        dat = mf.TDA().gen_vind()[0](cp.asarray(zs))
+        dat = mf.TDA().gen_vind()[0](cp.asarray(zs)).get()
         self.assertAlmostEqual(abs(ref - dat).max(), 0, 9)
 
     def test_tddft_vind(self):
@@ -198,7 +198,7 @@ def test_tddft_vind(self):
         nvirb = nmo - noccb
         zs = np.random.rand(3,2,nocca*nvira+noccb*nvirb)
         ref = mf.to_cpu().TDDFT().gen_vind()[0](zs)
-        dat = mf.TDDFT().gen_vind()[0](cp.asarray(zs))
+        dat = mf.TDDFT().gen_vind()[0](cp.asarray(zs)).get()
         self.assertAlmostEqual(abs(ref - dat).max(), 0, 9)
 
     def test_casida_tddft_vind(self):
@@ -209,7 +209,7 @@ def test_casida_tddft_vind(self):
         nvirb = nmo - noccb
         zs = np.random.rand(3,nocca*nvira+noccb*nvirb)
         ref = mf.to_cpu().CasidaTDDFT().gen_vind()[0](zs)
-        dat = mf.CasidaTDDFT().gen_vind()[0](cp.asarray(zs))
+        dat = mf.CasidaTDDFT().gen_vind()[0](cp.asarray(zs)).get()
         self.assertAlmostEqual(abs(ref - dat).max(), 0, 9)
 
 if __name__ == "__main__":
diff --git a/gpu4pyscf/tdscf/uhf.py b/gpu4pyscf/tdscf/uhf.py
index d2b4224d..a0182390 100644
--- a/gpu4pyscf/tdscf/uhf.py
+++ b/gpu4pyscf/tdscf/uhf.py
@@ -63,7 +63,7 @@ def gen_tda_operation(mf, fock_ao=None, wfnsym=None):
     e_ia_a = mo_energy[0][viridxa] - mo_energy[0][occidxa,None]
     e_ia_b = mo_energy[1][viridxb] - mo_energy[1][occidxb,None]
     e_ia = cp.hstack((e_ia_a.reshape(-1), e_ia_b.reshape(-1)))
-    hdiag = e_ia.get()
+    hdiag = e_ia
     nocca, nvira = e_ia_a.shape
     noccb, nvirb = e_ia_b.shape
 
@@ -88,7 +88,7 @@ def vind(zs):
         v1a += za * e_ia_a
         v1b += zb * e_ia_b
         hx = cp.hstack((v1a.reshape(nz,-1), v1b.reshape(nz,-1)))
-        return hx.get()
+        return hx
 
     return vind, hdiag
 
@@ -185,7 +185,7 @@ def kernel(self, x0=None, nstates=None):
         precond = self.get_precond(hdiag)
 
         def pickeig(w, v, nroots, envs):
-            idx = np.where(w > self.positive_eig_threshold)[0]
+            idx = cp.where(w > self.positive_eig_threshold)[0]
             return w[idx], v[:,idx], idx
 
         x0sym = None
@@ -258,7 +258,7 @@ def gen_vind(self):
             orbva = mo_coeff[0][:,viridxa]
             orbov = (orbob, orbva)
             e_ia = mo_energy[0][viridxa] - mo_energy[1][occidxb,None]
-            hdiag = e_ia.ravel().get()
+            hdiag = e_ia.ravel()
 
         elif extype == 1:
             occidxa = mo_occ[0] > 0
@@ -267,7 +267,7 @@ def gen_vind(self):
             orbvb = mo_coeff[1][:,viridxb]
             orbov = (orboa, orbvb)
             e_ia = mo_energy[1][viridxb] - mo_energy[0][occidxa,None]
-            hdiag = e_ia.ravel().get()
+            hdiag = e_ia.ravel()
 
         vresp = gen_uhf_response_sf(
             mf, hermi=0, collinear=self.collinear,
@@ -283,7 +283,7 @@ def vind(zs):
             v1mo = contract('xpq,qo->xpo', v1ao, orbo)
             v1mo = contract('xpo,pv->xov', v1mo, orbv.conj())
             v1mo += zs * e_ia
-            return v1mo.reshape(len(v1mo), -1).get()
+            return v1mo.reshape(len(v1mo), -1)
 
         return vind, hdiag
 
@@ -461,10 +461,10 @@ def vind(zs):
         v1_bot[:,:nocca*nvira] += v1a_bot.reshape(nz,-1)
         v1_top[:,nocca*nvira:] += v1b_top.reshape(nz,-1)
         v1_bot[:,nocca*nvira:] += v1b_bot.reshape(nz,-1)
-        return cp.hstack([v1_top, -v1_bot]).get()
+        return cp.hstack([v1_top, -v1_bot])
 
     hdiag = cp.hstack([hdiag.ravel(), -hdiag.ravel()])
-    return vind, hdiag.get()
+    return vind, hdiag
 
 
 class TDHF(TDBase):
@@ -578,9 +578,9 @@ def gen_vind(self):
 
         extype = self.extype
         if extype == 0:
-            hdiag = cp.hstack([e_ia_b2a.ravel(), -e_ia_a2b.ravel()]).get()
+            hdiag = cp.hstack([e_ia_b2a.ravel(), -e_ia_a2b.ravel()])
         else:
-            hdiag = cp.hstack([e_ia_a2b.ravel(), -e_ia_b2a.ravel()]).get()
+            hdiag = cp.hstack([e_ia_a2b.ravel(), -e_ia_b2a.ravel()])
 
         vresp = gen_uhf_response_sf(
             mf, hermi=0, collinear=self.collinear,
@@ -681,7 +681,7 @@ def vind(zs):
                 v1_top += zs_a2b * e_ia_a2b
                 v1_bot += zs_b2a * e_ia_b2a
             hx = cp.hstack([v1_top.reshape(nz,-1), -v1_bot.reshape(nz,-1)])
-            return hx.get()
+            return hx
 
         return vind, hdiag
 
diff --git a/gpu4pyscf/tdscf/uks.py b/gpu4pyscf/tdscf/uks.py
index 44672929..c2c48d22 100644
--- a/gpu4pyscf/tdscf/uks.py
+++ b/gpu4pyscf/tdscf/uks.py
@@ -16,7 +16,7 @@
 import cupy as cp
 from pyscf import symm
 from pyscf import lib
-from pyscf.tdscf._lr_eig import eigh as lr_eigh
+from gpu4pyscf.tdscf._lr_eig import eigh as lr_eigh
 from gpu4pyscf.dft.rks import KohnShamDFT
 from gpu4pyscf.lib.cupy_helper import contract, tag_array, transpose_sum
 from gpu4pyscf.lib import logger
@@ -69,7 +69,7 @@ def gen_vind(self, mf=None):
         d_ia = e_ia**.5
         ed_ia = e_ia * d_ia
         hdiag = e_ia ** 2
-        hdiag = hdiag.get()
+        hdiag = hdiag
         vresp = mf.gen_response(mo_coeff, mo_occ, hermi=1)
         nocca, nvira = e_ia_a.shape
         noccb, nvirb = e_ia_b.shape
@@ -96,7 +96,7 @@ def vind(zs):
             hx = cp.hstack((v1a.reshape(nz,-1), v1b.reshape(nz,-1)))
             hx += ed_ia * zs
             hx *= d_ia
-            return hx.get()
+            return hx
 
         return vind, hdiag
 
@@ -120,7 +120,7 @@ def kernel(self, x0=None, nstates=None):
         precond = self.get_precond(hdiag)
 
         def pickeig(w, v, nroots, envs):
-            idx = np.where(w > self.positive_eig_threshold)[0]
+            idx = cp.where(w > self.positive_eig_threshold)[0]
             return w[idx], v[:,idx], idx
 
         x0sym = None