From 8cb5ce49d5bb183a1649b04c659af33ce08d762e Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Thu, 19 Dec 2024 21:39:56 +0000
Subject: [PATCH 01/49] refactor hessian class

---
 gpu4pyscf/df/hessian/jk.py                    | 432 ++++++++++++++++++
 gpu4pyscf/df/hessian/rhf.py                   | 111 +++--
 gpu4pyscf/df/hessian/rks.py                   |   7 +-
 .../df/hessian/tests/test_df_rhf_hessian.py   | 145 ++++++
 .../df/hessian/tests/test_df_rks_hessian.py   | 107 +++++
 .../df/hessian/tests/test_df_uhf_hessian.py   |   6 +-
 gpu4pyscf/df/hessian/uhf.py                   |  72 +--
 gpu4pyscf/df/hessian/uks.py                   |   1 +
 gpu4pyscf/df/tests/test_df_hessian.py         |   3 +-
 gpu4pyscf/hessian/jk.py                       | 296 ++++++++++++
 gpu4pyscf/hessian/rhf.py                      |  56 ++-
 gpu4pyscf/hessian/rks.py                      |  58 ++-
 gpu4pyscf/hessian/tests/test_rhf_hessian.py   |   2 +-
 gpu4pyscf/hessian/uhf.py                      |  47 +-
 gpu4pyscf/hessian/uks.py                      |  64 ++-
 gpu4pyscf/properties/ir.py                    |   3 +-
 gpu4pyscf/scf/jk.py                           |  36 +-
 gpu4pyscf/scf/tests/test_scf_jk.py            |  26 +-
 gpu4pyscf/solvent/hessian/pcm.py              |  25 +
 gpu4pyscf/solvent/hessian/smd.py              |  26 ++
 gpu4pyscf/solvent/tests/test_smd_hessian.py   |   1 +
 gpu4pyscf/tests/test_dft.py                   |   2 +-
 22 files changed, 1334 insertions(+), 192 deletions(-)
 create mode 100644 gpu4pyscf/df/hessian/jk.py
 create mode 100644 gpu4pyscf/df/hessian/tests/test_df_rhf_hessian.py
 create mode 100644 gpu4pyscf/df/hessian/tests/test_df_rks_hessian.py
 create mode 100644 gpu4pyscf/hessian/jk.py

diff --git a/gpu4pyscf/df/hessian/jk.py b/gpu4pyscf/df/hessian/jk.py
new file mode 100644
index 00000000..16010bda
--- /dev/null
+++ b/gpu4pyscf/df/hessian/jk.py
@@ -0,0 +1,432 @@
+# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import ctypes
+import itertools
+import numpy as np
+from concurrent.futures import ThreadPoolExecutor
+import cupy
+from pyscf import gto
+from gpu4pyscf.df import int3c2e
+from gpu4pyscf.scf.int4c2e import libgint
+from gpu4pyscf.hessian.jk import _ao2mo
+from gpu4pyscf.lib import logger
+from gpu4pyscf.lib.cupy_helper import contract, cart2sph, reduce_to_device
+from gpu4pyscf.__config__ import _streams, _num_devices
+
+NROOT_ON_GPU = 7
+
+def _jk_task_with_mo1(dfobj, dms, mo_coeff, mo1s, occ_coeffs,
+                      with_j=True, with_k=True, hermi=0, device_id=0):
+    ''' Calculate J and K matrices with mo response
+        For CP-HF
+    '''
+    with cupy.cuda.Device(device_id), _streams[device_id]:
+        assert isinstance(dfobj.verbose, int)
+        log = logger.new_logger(dfobj.mol, dfobj.verbose)
+        t0 = log.init_timer()
+        dms = cupy.asarray(dms)
+        n_dm = dms.shape[0]
+        mo1s = [cupy.asarray(mo1) for mo1 in mo1s]
+        occ_coeffs = [cupy.asarray(occ_coeff) for occ_coeff in occ_coeffs]
+        mo_coeff = [cupy.asarray(mo) for mo in mo_coeff]
+        nao = dms.shape[-1]
+        intopt = dfobj.intopt
+        rows = intopt.cderi_row
+        cols = intopt.cderi_col
+        dms_shape = dms.shape
+        if with_j:
+            dm_sparse = dms[:,rows,cols]
+            if hermi == 0:
+                dm_sparse += dms[:,cols,rows]
+            else:
+                dm_sparse *= 2
+            dm_sparse[:, intopt.cderi_diag] *= .5
+
+        if with_k:
+            vks = [cupy.zeros_like(mo1) for mo1 in mo1s]
+
+        if with_j:
+            vj_sparse = cupy.zeros_like(dm_sparse)
+
+        nocc = max([mo1.shape[2] for mo1 in mo1s])
+        blksize = dfobj.get_blksize(extra=2*nao*nocc)
+        for cderi, cderi_sparse in dfobj.loop(blksize=blksize, unpack=with_k):
+            if with_j:
+                rhoj = dm_sparse.dot(cderi_sparse)
+                vj_sparse += cupy.dot(rhoj, cderi_sparse.T)
+                rhoj = None
+            cderi_sparse = None
+            if with_k:
+                for occ_coeff, mo1, vk in zip(occ_coeffs, mo1s, vks):
+                    nocc = occ_coeff.shape[1]
+                    rhok = contract('Lij,jo->Loi', cderi, occ_coeff)
+                    rhok_oo = contract('Loi,ip->Lop', rhok, occ_coeff).reshape([-1,nocc])
+                    rhok = rhok.reshape([-1,nao])
+                    for i in range(mo1.shape[0]):
+                        rhok1 = contract('Lij,jo->Loi', cderi, mo1[i])
+                        rhok1 = rhok1.reshape([-1,nao])
+                        vk[i] += cupy.dot(rhok1.T, rhok_oo)
+                        
+                        rhok1 = rhok1.reshape([-1,nocc,nao])
+                        rhok1 = contract('Loi,ip->Lop', rhok1, occ_coeff)
+                        rhok1 = rhok1.reshape([-1,nocc])
+                        vk[i] += cupy.dot(rhok.T, rhok1)
+                mo1 = rhok1 = rhok = rhok_oo = None
+            cderi = None
+        mo1s = None
+        if with_j:
+            vj = cupy.zeros(dms_shape)
+            vj[:,rows,cols] = vj_sparse
+            vj[:,cols,rows] = vj_sparse
+        
+        vj_mo = vk_mo = None
+        if len(occ_coeffs) == 1:
+            # Restricted case
+            mo = mo_coeff[0]
+            if with_j:
+                vj_mo = _ao2mo(vj, occ_coeffs[0], mo).reshape(n_dm,-1)
+                vj = None
+            mo *= 2.0     # Due to double occupancy
+            if with_k:
+                vk_mo = contract('nio,ip->npo', vks[0], mo).reshape(n_dm,-1)
+        elif len(occ_coeffs) == 2:
+            # Unrestricted case
+            n_dm_2 = n_dm // 2
+            mocca, moccb = occ_coeffs
+            moa, mob = mo_coeff
+            nmoa, nmob = moa.shape[1], mob.shape[1]
+            nocca, noccb = mocca.shape[1], moccb.shape[1]
+
+            if with_j:
+                vjab = vj[:n_dm_2] + vj[n_dm_2:]
+                vj = None
+                vj_mo = cupy.empty([n_dm_2,nmoa*nocca+nmob*noccb])
+                vj_mo[:,:nmoa*nocca] = _ao2mo(vjab, mocca, moa).reshape(n_dm_2,-1)
+                vj_mo[:,nmoa*nocca:] = _ao2mo(vjab, moccb, mob).reshape(n_dm_2,-1)
+                vjab = None
+
+            if with_k:
+                vka, vkb = vks
+                vk_mo = cupy.empty([n_dm_2,nmoa*nocca+nmob*noccb])
+                vk_mo[:,:nmoa*nocca] = contract('nio,ip->npo', vka, moa).reshape(n_dm_2,-1)
+                vk_mo[:,nmoa*nocca:] = contract('nio,ip->npo', vkb, mob).reshape(n_dm_2,-1)
+
+        t0 = log.timer_debug1(f'vj and vk on Device {device_id}', *t0)
+    return vj_mo, vk_mo
+
+def get_jk(dfobj, dms_tag, mo_coeff, mocc, hermi=0, 
+           with_j=True, with_k=True, direct_scf_tol=1e-14, omega=None):
+    ''' Compute J/K in MO with density fitting
+    '''
+
+    log = logger.new_logger(dfobj.mol, dfobj.verbose)
+    if not isinstance(dms_tag, cupy.ndarray):
+        dms_tag = cupy.asarray(dms_tag)
+
+    assert(with_j or with_k)
+    if dms_tag is None: logger.error("dm is not given")
+    nao = dms_tag.shape[-1]
+    t1 = t0 = log.init_timer()
+    if dfobj._cderi is None:
+        log.debug('Build CDERI ...')
+        dfobj.build(direct_scf_tol=direct_scf_tol, omega=omega)
+        t1 = log.timer_debug1('init jk', *t0)
+
+    assert nao == dfobj.nao
+    intopt = dfobj.intopt
+
+    nao = dms_tag.shape[-1]
+    dms = dms_tag.reshape([-1,nao,nao])
+    intopt = dfobj.intopt
+    dms = intopt.sort_orbitals(dms, axis=[1,2])
+
+    cupy.cuda.get_current_stream().synchronize()
+    occ_coeffs = dms_tag.occ_coeff
+    mo1s = dms_tag.mo1
+
+    if not isinstance(occ_coeffs, (tuple, list)):
+        occ_coeffs = [occ_coeffs]
+        mo1s = [mo1s]
+        mo_coeff = [mo_coeff]
+    else:
+        assert isinstance(mo1s, (tuple, list))
+        mo_coeff = [mo_coeff[0], mo_coeff[1]]
+
+    occ_coeffs = [intopt.sort_orbitals(occ_coeff, axis=[0]) for occ_coeff in occ_coeffs]
+    mo1s = [intopt.sort_orbitals(mo1, axis=[1]) for mo1 in mo1s]
+    mo_coeff = [intopt.sort_orbitals(mo, axis=[0]) for mo in mo_coeff]
+
+    futures = []
+    with ThreadPoolExecutor(max_workers=_num_devices) as executor:
+        for device_id in range(_num_devices):
+            future = executor.submit(
+                _jk_task_with_mo1,
+                dfobj, dms, mo_coeff, mo1s, occ_coeffs,
+                hermi=hermi, device_id=device_id,
+                with_j=with_j, with_k=with_k)
+            futures.append(future)
+
+    vj = vk = None
+    if with_j:
+        vj = [future.result()[0] for future in futures]
+        vj = reduce_to_device(vj, inplace=True)
+
+    if with_k:
+        vk = [future.result()[1] for future in futures]
+        vk = reduce_to_device(vk, inplace=True)
+    t1 = log.timer_debug1('vj and vk', *t1)
+    return vj, vk
+
+
+def _get_int3c2e_ipip_slice(ip_type, intopt, cp_ij_id, aux_id, omega=None, stream=None):
+
+    if omega is None: omega = 0.0
+    if stream is None: stream = cupy.cuda.get_current_stream()
+
+    fn = getattr(libgint, 'GINTfill_int3c2e_' + ip_type)
+
+    nao = intopt._sorted_mol.nao
+    naux = intopt._sorted_auxmol.nao
+    norb = nao + naux + 1
+    comp = 9
+    order = 2
+
+    lmax = intopt._sorted_mol._bas[:gto.ANG_OF].max()
+    aux_lmax = intopt._sorted_auxmol._bas[:gto.ANG_OF].max()
+    nroots = (lmax + aux_lmax + order)//2 + 1
+    if nroots > NROOT_ON_GPU:
+        from pyscf.gto.moleintor import getints, make_cintopt
+        pmol = intopt._tot_mol
+        intor = pmol._add_suffix('int3c2e_' + ip_type)
+        opt = make_cintopt(pmol._atm, pmol._bas, pmol._env, intor)
+
+    nbins = 1
+
+    cp_kl_id = aux_id + len(intopt.log_qs)
+    lk = intopt.aux_angular[aux_id]
+    
+    cpi = intopt.cp_idx[cp_ij_id]
+    cpj = intopt.cp_jdx[cp_ij_id]
+    li = intopt.angular[cpi]
+    lj = intopt.angular[cpj]
+
+    i0, i1 = intopt.cart_ao_loc[cpi], intopt.cart_ao_loc[cpi+1]
+    j0, j1 = intopt.cart_ao_loc[cpj], intopt.cart_ao_loc[cpj+1]
+    k0, k1 = intopt.cart_aux_loc[aux_id], intopt.cart_aux_loc[aux_id+1]
+    ni = i1 - i0
+    nj = j1 - j0
+    nk = k1 - k0
+
+    log_q_ij = intopt.log_qs[cp_ij_id]
+    log_q_kl = intopt.aux_log_qs[aux_id]
+
+    bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32)
+    bins_locs_kl = np.array([0, len(log_q_kl)], dtype=np.int32)
+
+    ao_offsets = np.array([i0,j0,nao+1+k0,nao], dtype=np.int32)
+    strides = np.array([1, ni, ni*nj, ni*nj*nk], dtype=np.int32)
+
+    # Use GPU kernels for low-angular momentum
+    if (li + lj + lk + order)//2 + 1 < NROOT_ON_GPU:
+        int3c_blk = cupy.zeros([comp, nk, nj, ni], order='C', dtype=np.float64)
+        err = fn(
+            ctypes.cast(stream.ptr, ctypes.c_void_p),
+            intopt.bpcache,
+            ctypes.cast(int3c_blk.data.ptr, ctypes.c_void_p),
+            ctypes.c_int(norb),
+            strides.ctypes.data_as(ctypes.c_void_p),
+            ao_offsets.ctypes.data_as(ctypes.c_void_p),
+            bins_locs_ij.ctypes.data_as(ctypes.c_void_p),
+            bins_locs_kl.ctypes.data_as(ctypes.c_void_p),
+            ctypes.c_int(nbins),
+            ctypes.c_int(cp_ij_id),
+            ctypes.c_int(cp_kl_id),
+            ctypes.c_double(omega))
+        if err != 0:
+            raise RuntimeError(f'GINT_fill_int3c2e general failed, err={err}')
+    else:
+        # TODO: sph2cart in CPU?
+        ishl0, ishl1 = intopt.l_ctr_offsets[cpi], intopt.l_ctr_offsets[cpi+1]
+        jshl0, jshl1 = intopt.l_ctr_offsets[cpj], intopt.l_ctr_offsets[cpj+1]
+        kshl0, kshl1 = intopt.l_ctr_offsets[aux_id+1+intopt.nctr], intopt.l_ctr_offsets[aux_id+1+intopt.nctr+1]
+        shls_slice = np.array([ishl0, ishl1, jshl0, jshl1, kshl0, kshl1], dtype=np.int64)
+        int3c_cpu = getints(intor, pmol._atm, pmol._bas, pmol._env, shls_slice, cintopt=opt).transpose([0,3,2,1])
+        int3c_blk = cupy.asarray(int3c_cpu)
+
+    if not intopt.auxmol.cart:
+        int3c_blk = cart2sph(int3c_blk, axis=1, ang=lk)
+    if not intopt.mol.cart:
+        int3c_blk = cart2sph(int3c_blk, axis=2, ang=lj)
+        int3c_blk = cart2sph(int3c_blk, axis=3, ang=li)
+
+    return int3c_blk
+
+
+def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo,
+                        device_id=0, with_k=True, omega=None, auxbasis_response=1):
+    natm = intopt.mol.natm
+    nao = dm0.shape[0]
+    naux = rhok.shape[0]
+    ao_loc = intopt.ao_loc
+    aux_ao_loc = intopt.aux_ao_loc
+    with cupy.cuda.Device(device_id), _streams[device_id]:
+        log = logger.new_logger(intopt.mol, intopt.mol.verbose)
+        t0 = log.init_timer()
+        rhoj = cupy.asarray(rhoj)
+        rhok = cupy.asarray(rhok)
+        orbo = cupy.asarray(orbo)
+        dm0 = cupy.asarray(dm0)
+        nao = dm0.shape[0]
+
+        hj_ipip1 = cupy.zeros([nao,9])
+        hj_ipip2 = cupy.zeros([naux,9])
+        hj_ip1ip2 = cupy.zeros([nao,naux,9])
+        hj_ipvip1 = cupy.zeros([nao,nao,9])
+        if with_k:
+            hk_ipip1 = cupy.zeros([nao,9])
+            hk_ipip2 = cupy.zeros([naux,9])
+            hk_ip1ip2 = cupy.zeros([nao,naux,9])
+            hk_ipvip1 = cupy.zeros([nao,nao,9])
+
+        for aux_id, cp_ij_id in task_list:
+            cpi = intopt.cp_idx[cp_ij_id]
+            cpj = intopt.cp_jdx[cp_ij_id]
+            i0, i1 = ao_loc[cpi], ao_loc[cpi+1]
+            j0, j1 = ao_loc[cpj], ao_loc[cpj+1]
+            k0, k1 = aux_ao_loc[aux_id], aux_ao_loc[aux_id+1]
+            
+            if with_k:
+                rhok_tmp = contract('por,ir->pio', rhok[k0:k1], orbo[i0:i1])
+                rhok_tmp = contract('pio,jo->pij', rhok_tmp, orbo[j0:j1])
+
+            # (20|0), (0|0)(0|00)
+            int3c_blk = _get_int3c2e_ipip_slice('ipip1', intopt, cp_ij_id, aux_id, omega=omega)
+            tmp = contract('xpji,ij->xpi', int3c_blk, dm0[i0:i1,j0:j1])
+            hj_ipip1[i0:i1] += contract('xpi,p->ix', tmp, rhoj[k0:k1])
+            if with_k:
+                hk_ipip1[i0:i1] += contract('xpji,pij->ix', int3c_blk, rhok_tmp)
+
+            # (11|0), (0|0)(0|00) without response of RI basis
+            int3c_blk = _get_int3c2e_ipip_slice('ipvip1', intopt, cp_ij_id, aux_id, omega=omega)
+            tmp = contract('xpji,ij->xpij', int3c_blk, dm0[i0:i1,j0:j1])
+            hj_ipvip1[i0:i1,j0:j1] += contract('xpij,p->ijx', tmp, rhoj[k0:k1])
+            if with_k:
+                hk_ipvip1[i0:i1,j0:j1] += contract('xpji,pij->ijx', int3c_blk, rhok_tmp)
+
+            if auxbasis_response < 1:
+                continue
+            
+            # (10|1), (0|0)(0|00)
+            int3c_blk = _get_int3c2e_ipip_slice('ip1ip2', intopt, cp_ij_id, aux_id, omega=omega)
+            tmp = contract('xpji,ij->xpi', int3c_blk, dm0[i0:i1,j0:j1])
+            hj_ip1ip2[i0:i1,k0:k1] += contract('xpi,p->ipx', tmp, rhoj[k0:k1])
+            if with_k:
+                hk_ip1ip2[i0:i1,k0:k1] += contract('xpji,pij->ipx', int3c_blk, rhok_tmp)
+            
+            if auxbasis_response < 2:
+                continue
+            
+            # (00|2), (0|0)(0|00)
+            int3c_blk = _get_int3c2e_ipip_slice('ipip2', intopt, cp_ij_id, aux_id, omega=omega)
+            tmp = contract('xpji,ij->xp', int3c_blk, dm0[i0:i1,j0:j1])
+            hj_ipip2[k0:k1] += contract('xp,p->px', tmp, rhoj[k0:k1])
+            if with_k:
+                hk_ipip2[k0:k1] += contract('xpji,pij->px', int3c_blk, rhok_tmp)
+        
+        auxslices = intopt.auxmol.aoslice_by_atom()
+        aoslices = intopt.mol.aoslice_by_atom()
+        ao2atom = int3c2e.get_ao2atom(intopt, aoslices)
+        aux2atom = int3c2e.get_aux2atom(intopt, auxslices)
+
+        hj_ipvip1 = hj_ipvip1.reshape([nao,nao,3,3])
+        tmp = contract('ia,ijxy->ajxy', ao2atom, hj_ipvip1)
+        hj = 2.0 * contract('jb,ajxy->abxy', ao2atom, tmp)
+
+        hj_ipip1 = hj_ipip1.reshape([nao,3,3])
+        tmp = contract('ia,ixy->axy', ao2atom, hj_ipip1)
+        hj[range(natm), range(natm)] += 2.0 * tmp
+
+        hk = None
+        if with_k:
+            hk_ipvip1 = hk_ipvip1.reshape([nao,nao,3,3])
+            tmp = contract('ia,ijxy->ajxy', ao2atom, hk_ipvip1)
+            hk = contract('jb,ajxy->abxy', ao2atom, tmp)
+
+            hk_ipip1 = hk_ipip1.reshape([nao,3,3])
+            tmp = contract('ia,ixy->axy', ao2atom, hk_ipip1)
+            hk[range(natm), range(natm)] += tmp
+        
+        if auxbasis_response > 0:
+            hj_ip1ip2 = hj_ip1ip2.reshape([nao,naux,3,3])
+            tmp = contract('ia,ijxy->ajxy', ao2atom, hj_ip1ip2)
+            tmp = contract('jb,ajxy->abxy',aux2atom, tmp)
+            tmp = tmp + tmp.transpose([1,0,3,2])
+            hj += tmp
+            if auxbasis_response > 1:
+                hj += tmp
+            if with_k:
+                hk_ip1ip2 = hk_ip1ip2.reshape([nao,naux,3,3])
+                tmp = contract('ia,ijxy->ajxy', ao2atom, hk_ip1ip2)
+                tmp = contract('jb,ajxy->abxy', aux2atom, tmp)
+                tmp = 0.5 * (tmp + tmp.transpose([1,0,3,2]))
+                hk += tmp
+                if auxbasis_response > 1:
+                    hk += tmp
+        
+        if auxbasis_response > 1:
+            hj_ipip2 = hj_ipip2.reshape([naux,3,3])
+            tmp = contract('ia,ixy->axy', aux2atom, hj_ipip2)
+            hj[range(natm), range(natm)] += tmp
+            if with_k:
+                hk_ipip2 = hk_ipip2.reshape([naux,3,3])
+                tmp = contract('ia,ixy->axy', aux2atom, hk_ipip2)
+                hk[range(natm), range(natm)] += .5 * tmp
+        t0 = log.timer_debug1(f'int3c2e_ipip on Device {device_id}', *t0)
+    return hj, hk
+
+def get_int3c2e_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, 
+                    omega=None, auxbasis_response=1):
+    orbo = cupy.asarray(dm0_tag.occ_coeff, order='C')
+    futures = []
+    ncp_k = len(intopt.aux_log_qs)
+    ncp_ij = len(intopt.log_qs)
+    tasks = np.array(list(itertools.product(range(ncp_k), range(ncp_ij))))
+    task_list = []
+    for device_id in range(_num_devices):
+        task_list.append(tasks[device_id::_num_devices])
+    
+    cupy.cuda.get_current_stream().synchronize()
+    with ThreadPoolExecutor(max_workers=_num_devices) as executor:
+        for device_id in range(_num_devices):
+            future = executor.submit(
+                _int3c2e_ipip_tasks, intopt, task_list[device_id], 
+                rhoj, rhok, dm0_tag, orbo, with_k=with_k, 
+                device_id=device_id, omega=omega, 
+                auxbasis_response=auxbasis_response)
+            futures.append(future)
+    
+    hj_total = []
+    hk_total = []
+    for future in futures:
+        hj, hk = future.result()
+        hj_total.append(hj)
+        hk_total.append(hk)
+        
+    hj = hk = None
+    hj = reduce_to_device(hj_total, inplace=True)
+    if with_k:
+        hk = reduce_to_device(hk_total, inplace=True)
+    return hj, hk
diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py
index e1a25ec3..9471c849 100644
--- a/gpu4pyscf/df/hessian/rhf.py
+++ b/gpu4pyscf/df/hessian/rhf.py
@@ -42,7 +42,7 @@
 from gpu4pyscf.lib import logger
 from gpu4pyscf import __config__
 from gpu4pyscf.df.grad.rhf import _gen_metric_solver
-from gpu4pyscf.gto.mole import sort_atoms
+from gpu4pyscf.df.hessian import jk
 
 LINEAR_DEP_THR = df.LINEAR_DEP_THR
 BLKSIZE = 128
@@ -60,9 +60,10 @@ def _hk_ip1_ip1(rhok1_Pko, dm0, mocc_2):
     '''
     nnz = rhok1_Pko.shape[0]
     nao = dm0.shape[0]
-    mem_avail = get_avail_mem()
-    blksize = int((mem_avail*0.4/(nao*nao*3*8)/ALIGNED))*ALIGNED
     hk_ao_ao = cupy.zeros([nao,nao,3,3])
+    cupy.get_default_memory_pool().free_all_blocks()
+    mem_avail = get_avail_mem()
+    blksize = int((mem_avail*0.2/(nao*nao*3*8)/ALIGNED))*ALIGNED
     for k0, k1 in lib.prange(0,nnz,blksize):
         rhok1_Pko_kslice = cupy.asarray(rhok1_Pko[k0:k1])
 
@@ -77,7 +78,6 @@ def _hk_ip1_ip1(rhok1_Pko, dm0, mocc_2):
         rhok1_Pkl_kslice = None
     return hk_ao_ao
 
-
 def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
                       atmlst=None, max_memory=4000, verbose=None, with_k=True, omega=None):
     '''Partial derivative
@@ -216,41 +216,11 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     wk1_Pko = rhok1_Pko = None
     t1 = log.timer_debug1('intermediate variables with int3c2e_ip1', *t1)
 
-    cupy.get_default_memory_pool().free_all_blocks()
-    #  int3c_ipip1 contributions
-    hj_ao_diag, hk_ao_diag = int3c2e.get_int3c2e_hjk(intopt, 'ipip1', rhoj0_P, rhok0_P__, dm0_tag, omega=omega, with_k=with_k)
-    hj_ao_diag *= 2.0
-    t1 = log.timer_debug1('intermediate variables with int3c2e_ipip1', *t1)
-
-    #  int3c_ipvip1 contributions
-    # (11|0), (0|00) without response of RI basis
-    hj, hk = int3c2e.get_int3c2e_hjk(intopt, 'ipvip1', rhoj0_P, rhok0_P__, dm0_tag, omega=omega, with_k=with_k)
-    hj_ao_ao += 2.0*hj
-    if with_k:
-        hk_ao_ao += hk
-    hj = hk = None
-    t1 = log.timer_debug1('intermediate variables with int3c2e_ipvip1', *t1)
+    hj_ipip, hk_ipip = jk.get_int3c2e_hjk(intopt, rhoj0_P, rhok0_P__, dm0_tag,
+                                          with_k=with_k, omega=omega, 
+                                          auxbasis_response=hessobj.auxbasis_response)
+    t1 = log.timer_debug1('intermediate variables with int3c2e_ipip', *t1)
 
-    #  int3c_ip1ip2 contributions
-    # (10|1), (0|0)(0|00)
-    if hessobj.auxbasis_response:
-        hj, hk = int3c2e.get_int3c2e_hjk(intopt, 'ip1ip2', rhoj0_P, rhok0_P__, dm0_tag, omega=omega, with_k=with_k)
-        hj_ao_aux += hj
-        if with_k:
-            hk_ao_aux += hk
-        hj = hk = None
-        t1 = log.timer_debug1('intermediate variables with int3c2e_ip1ip2', *t1)
-
-    #  int3c_ipip2 contributions
-    if hessobj.auxbasis_response > 1:
-        # (00|2), (0|0)(0|00)
-        hj, hk = int3c2e.get_int3c2e_hjk(intopt, 'ipip2', rhoj0_P, rhok0_P__, dm0_tag, omega=omega, with_k=with_k)
-        hj_aux_diag = hj
-        if with_k:
-            hk_aux_diag = .5*hk
-        hj = hk = None
-        t1 = log.timer_debug1('intermediate variables with int3c2e_ipip2', *t1)
-    
     # int2c contributions
     if hessobj.auxbasis_response > 1:
         if omega and omega > 1e-10:
@@ -263,10 +233,10 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
         rhoj2c_P = contract('xpq,q->xp', int2c_ipip1, rhoj0_P)
         # (00|0)(2|0)(0|00)
         # p,xp->px
-        hj_aux_diag -= (rhoj0_P*rhoj2c_P).T.reshape(-1,3,3)
+        hj_aux_diag = -(rhoj0_P*rhoj2c_P).T.reshape(-1,3,3)
         if with_k:
             rho2c_0 = contract('pij,qji->pq', rhok0_P__, rhok0_P__)
-            hk_aux_diag -= .5 * contract('pq,xpq->px', rho2c_0, int2c_ipip1).reshape(-1,3,3)
+            hk_aux_diag = -.5 * contract('pq,xpq->px', rho2c_0, int2c_ipip1).reshape(-1,3,3)
         int2c_ipip1 = None
 
         if omega and omega > 1e-10:
@@ -334,20 +304,16 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     t1 = log.timer_debug1('contract int2c_*', *t1)
 
     dm0 = intopt.unsort_orbitals(dm0, axis=[0,1])
-    hj_ao_diag = intopt.unsort_orbitals(hj_ao_diag, axis=[0])
     hj_ao_ao = intopt.unsort_orbitals(hj_ao_ao, axis=[0,1])
     if hessobj.auxbasis_response:
         hj_ao_aux = intopt.unsort_orbitals(hj_ao_aux, axis=[0], aux_axis=[1])
     if hessobj.auxbasis_response > 1:
-        hj_aux_diag = intopt.unsort_orbitals(hj_aux_diag, aux_axis=[0])
         hj_aux_aux = intopt.unsort_orbitals(hj_aux_aux, aux_axis=[0,1])
     if with_k:
-        hk_ao_diag = intopt.unsort_orbitals(hk_ao_diag, axis=[0])
         hk_ao_ao = intopt.unsort_orbitals(hk_ao_ao, axis=[0,1])
         if hessobj.auxbasis_response:
             hk_ao_aux = intopt.unsort_orbitals(hk_ao_aux, axis=[0], aux_axis=[1])
         if hessobj.auxbasis_response > 1:
-            hk_aux_diag = intopt.unsort_orbitals(hk_aux_diag, aux_axis=[0])
             hk_aux_aux = intopt.unsort_orbitals(hk_aux_aux, aux_axis=[0,1])
     #======================================== sort AO end ===========================================
     # Energy weighted density matrix
@@ -368,14 +334,12 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     #        collecting all
     # -----------------------------------------
     e1 = cupy.zeros([len(atmlst),len(atmlst),3,3])
-    ej = cupy.zeros([len(atmlst),len(atmlst),3,3])
-    ek = cupy.zeros([len(atmlst),len(atmlst),3,3])
+    ej = hj_ipip
+    ek = hk_ipip
+
     for i0, ia in enumerate(atmlst):
         shl0, shl1, p0, p1 = aoslices[ia]
         e1[i0,i0] -= cupy.sum(h1aa[p0:p1], axis=0)
-        ej[i0,i0] += cupy.sum(hj_ao_diag[p0:p1,:,:], axis=0)
-        if with_k:
-            ek[i0,i0] += cupy.sum(hk_ao_diag[p0:p1,:,:], axis=0)
         for j0, ja in enumerate(atmlst[:i0+1]):
             q0, q1 = aoslices[ja][2:]
             ej[i0,j0] += cupy.sum(hj_ao_ao[p0:p1,q0:q1], axis=[0,1])
@@ -408,9 +372,6 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
         #
         if hessobj.auxbasis_response > 1:
             shl0, shl1, p0, p1 = auxslices[ia]
-            ej[i0,i0] += cupy.sum(hj_aux_diag[p0:p1], axis=0)
-            if with_k:
-                ek[i0,i0] += cupy.sum(hk_aux_diag[p0:p1], axis=0)
             for j0, (q0, q1) in enumerate(auxslices[:,2:]):
                 _ej = cupy.sum(hj_aux_aux[p0:p1,q0:q1], axis=[0,1])
                 ej[i0,j0] += _ej
@@ -423,8 +384,21 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
         for j0 in range(i0):
             e1[j0,i0] = e1[i0,j0].T
             ej[j0,i0] = ej[i0,j0].T
-            ek[j0,i0] = ek[i0,j0].T
+            if with_k:
+                ek[j0,i0] = ek[i0,j0].T
+        
     t1 = log.timer_debug1('hcore contribution', *t1)
+    
+    aux2atom = int3c2e.get_aux2atom(intopt, auxslices)
+    
+    natm = mol.natm
+    idx = range(natm)
+    # Diagonal contributions
+    if hessobj.auxbasis_response > 1:
+        ej[idx, idx] += contract('ia,ixy->axy', aux2atom, hj_aux_diag)
+        if with_k:
+            ek[idx, idx] += contract('ia,ixy->axy', aux2atom, hk_aux_diag)
+    
     log.timer('RHF partial hessian', *time0)
     return e1, ej, ek
 
@@ -501,9 +475,6 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
     if isinstance(wk_Pl_, cupy.ndarray):
         rhok0_Pl_ = solve_j2c(wk_Pl_)
     else:
-        #rhok0_Pl_ = np.empty_like(wk_Pl_)
-        #mem = cupy.cuda.alloc_pinned_memory(wk_Pl_.nbytes)
-        #rhok0_Pl_ = np.ndarray(wk_Pl_.shape, dtype=np.float64, order='C', buffer=mem)
         rhok0_Pl_ = wk_Pl_ # reuse the memory
         for p0, p1 in lib.prange(0,nao,64):
             wk_tmp = cupy.asarray(wk_Pl_[:,p0:p1])
@@ -624,14 +595,36 @@ def _ao2mo(mat):
             vk1 = vk1_int3c[ia] + _ao2mo(vk1_ao)
         yield ia, h1, vj1, vk1
 
+def _get_jk_mo(hessobj, mol, dms, mo_coeff, mocc, 
+           hermi=1, with_j=True, with_k=True, omega=None):
+    mf = hessobj.base
+    dfobj = mf.with_df
+    if omega is None:
+        return jk.get_jk(dfobj, dms, mo_coeff, mocc, 
+                         hermi=hermi, with_j=with_j, with_k=with_k)
+    
+    # A temporary treatment for RSH-DF integrals
+    key = '%.6f' % omega
+    if key in dfobj._rsh_df:
+        rsh_df = dfobj._rsh_df[key]
+    else:
+        rsh_df = dfobj._rsh_df[key] = dfobj.copy().reset()
+        logger.info(dfobj, 'Create RSH-DF object %s for omega=%s', rsh_df, omega)
+
+    with rsh_df.mol.with_range_coulomb(omega):
+        return jk.get_jk(rsh_df, dms, mo_coeff, mocc,
+                         hermi=hermi, with_j=with_j, with_k=with_k, omega=omega)
+
+
 class Hessian(rhf_hess.Hessian):
     '''Non-relativistic restricted Hartree-Fock hessian'''
 
     from gpu4pyscf.lib.utils import to_gpu, device
 
-    __init__ = rhf_hess.Hessian.__init__
+    #__init__ = rhf_hess.Hessian.__init__
     auxbasis_response = 1
     partial_hess_elec = partial_hess_elec
     make_h1 = make_h1
-    kernel = rhf_hess.kernel
-    hess = kernel
+    #kernel = rhf_hess.kernel
+    #hess = kernel
+    get_jk_mo = _get_jk_mo
diff --git a/gpu4pyscf/df/hessian/rks.py b/gpu4pyscf/df/hessian/rks.py
index 014142fa..ad5dc96e 100644
--- a/gpu4pyscf/df/hessian/rks.py
+++ b/gpu4pyscf/df/hessian/rks.py
@@ -115,9 +115,10 @@ class Hessian(rks_hess.Hessian):
     '''Non-relativistic RKS hessian'''
     from gpu4pyscf.lib.utils import to_gpu, device
 
-    __init__ = rks_hess.Hessian.__init__
+    #__init__ = rks_hess.Hessian.__init__
     auxbasis_response = 1
     partial_hess_elec = partial_hess_elec
     make_h1 = make_h1
-    kernel = rhf_hess.kernel
-    hess = kernel
+    #kernel = rhf_hess.kernel
+    #hess = kernel
+    get_jk_mo = df_rhf_hess._get_jk_mo
diff --git a/gpu4pyscf/df/hessian/tests/test_df_rhf_hessian.py b/gpu4pyscf/df/hessian/tests/test_df_rhf_hessian.py
new file mode 100644
index 00000000..b8560002
--- /dev/null
+++ b/gpu4pyscf/df/hessian/tests/test_df_rhf_hessian.py
@@ -0,0 +1,145 @@
+# Copyright 2024 The GPU4PySCF Authors. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import unittest
+import numpy
+import cupy
+from pyscf import gto, scf
+from pyscf.df.hessian import rhf as df_rhf_cpu
+from pyscf.hessian import rhf as rhf_cpu
+from gpu4pyscf.df.hessian import rhf as df_rhf_gpu
+from gpu4pyscf.hessian import rhf as rhf_gpu
+
+def setUpModule():
+    global mol
+    mol = gto.Mole()
+    mol.verbose = 1
+    mol.output = '/dev/null'
+    mol.atom.extend([
+        ["O" , (0. , 0.     , 0.)],
+        [1   , (0. , -0.757 , 0.587)],
+        [1   , (0. , 0.757  , 0.587)] ])
+    mol.basis = 'sto3g'
+    mol.build()
+
+def tearDownModule():
+    global mol
+    mol.stdout.close()
+    del mol
+
+class KnownValues(unittest.TestCase):
+    def test_gen_vind(self):
+        mf = scf.RHF(mol).density_fit()
+        mf.conv_tol = 1e-10
+        mf.conv_tol_cpscf = 1e-8
+        mf.kernel()
+        mo_coeff = mf.mo_coeff
+        mo_occ = mf.mo_occ
+
+        nao, nmo = mo_coeff.shape
+        mocc = mo_coeff[:,mo_occ>0]
+        nocc = mocc.shape[1]
+
+        fx_cpu = rhf_cpu.gen_vind(mf, mo_coeff, mo_occ)
+        mo1 = numpy.random.rand(100, nmo*nocc)
+        v1vo_cpu = fx_cpu(mo1).reshape(-1,nmo*nocc)
+
+        mf = mf.to_gpu()
+        hessobj = mf.Hessian()
+        fx_gpu = hessobj.gen_vind(mo_coeff, mo_occ)
+        mo1 = cupy.asarray(mo1)
+        v1vo_gpu = fx_gpu(mo1)
+        assert numpy.linalg.norm(v1vo_cpu - v1vo_gpu.get()) < 1e-8
+
+    def test_partial_hess_elec(self):
+        mf = scf.RHF(mol).density_fit()
+        mf.conv_tol = 1e-10
+        mf.conv_tol_cpscf = 1e-8
+        mf.kernel()
+        hobj = mf.Hessian()
+        hobj.auxbasis_response = 2
+        e1_cpu, ej_cpu, ek_cpu = df_rhf_cpu._partial_hess_ejk(hobj)
+
+        mf = mf.to_gpu()
+        hobj = mf.Hessian()
+        hobj.auxbasis_response = 2
+        e1_gpu, ej_gpu, ek_gpu = df_rhf_gpu._partial_hess_ejk(hobj)
+        assert numpy.linalg.norm(e1_cpu - e1_gpu.get()) < 1e-5
+        assert numpy.linalg.norm(ej_cpu - ej_gpu.get()) < 1e-5
+        assert numpy.linalg.norm(ek_cpu - ek_gpu.get()) < 1e-5
+
+    def test_make_h1(self):
+        mf = scf.RHF(mol).density_fit()
+        mf.conv_tol = 1e-10
+        mf.conv_tol_cpscf = 1e-8
+        mf.kernel()
+        mo_energy = mf.mo_energy
+        mo_coeff = mf.mo_coeff
+        mo_occ = mf.mo_occ
+        mocc = mo_coeff[:,mo_occ>0]
+        hobj = mf.Hessian()
+        hobj.auxbasis_response = 1
+        h1_cpu = df_rhf_cpu.make_h1(hobj, mo_coeff, mo_occ)
+        mo1_cpu, mo_e1_cpu = hobj.solve_mo1(mo_energy, mo_coeff, mo_occ, h1_cpu, verbose=1)
+        h1_cpu = numpy.asarray(h1_cpu)
+        h1_cpu = numpy.einsum('xypq,pi,qj->xyij', h1_cpu, mo_coeff, mocc)
+
+        mf = mf.to_gpu()
+        mf.conv_tol = 1e-10
+        mf.conv_tol_cpscf = 1e-8
+        hobj = mf.Hessian()
+        hobj.auxbasis_response = 1
+        mo_occ = cupy.asarray(mo_occ)
+        h1_gpu = df_rhf_gpu.make_h1(hobj, mo_coeff, mo_occ)
+        h1_gpu = cupy.asarray(h1_gpu)
+        mo_energy = cupy.asarray(mo_energy)
+        mo_coeff = cupy.asarray(mo_coeff)
+        fx = hobj.gen_vind(mo_coeff, mo_occ)
+        mo1_gpu, mo_e1_gpu = hobj.solve_mo1(mo_energy, mo_coeff, mo_occ, h1_gpu, fx, verbose=1)
+        assert numpy.linalg.norm(h1_cpu - h1_gpu.get()) < 1e-5
+        assert numpy.linalg.norm((mo_e1_cpu - mo_e1_gpu)) < 1e-4
+
+    def test_df_rhf_hess_elec(self):
+        mf = scf.RHF(mol).density_fit()
+        mf.conv_tol = 1e-10
+        mf.conv_tol_cpscf = 1e-8
+        mf.kernel()
+        hobj = mf.Hessian()
+        hobj.auxbasis_response = 2
+        hess_cpu = hobj.hess_elec()
+
+        mf = mf.to_gpu()
+        hobj = mf.Hessian()
+        hobj.auxbasis_response = 2
+        hess_gpu = hobj.hess_elec()
+        assert numpy.linalg.norm(hess_cpu - hess_gpu.get()) < 1e-5
+
+    def test_df_rhf_hessian(self):
+        mf = scf.RHF(mol).density_fit()
+        mf.conv_tol = 1e-10
+        mf.conv_tol_cpscf = 1e-8
+        mf.kernel()
+        hobj = mf.Hessian()
+        hobj.auxbasis_response = 2
+        hess_cpu = hobj.kernel()
+        mf = mf.to_gpu()
+        hobj = mf.Hessian()
+        hobj.auxbasis_response = 2
+        hess_gpu = hobj.kernel()
+        assert numpy.linalg.norm(hess_cpu - hess_gpu) < 1e-5
+
+if __name__ == "__main__":
+    print("Full Tests for DF RHF Hessian")
+    unittest.main()
diff --git a/gpu4pyscf/df/hessian/tests/test_df_rks_hessian.py b/gpu4pyscf/df/hessian/tests/test_df_rks_hessian.py
new file mode 100644
index 00000000..5a853a95
--- /dev/null
+++ b/gpu4pyscf/df/hessian/tests/test_df_rks_hessian.py
@@ -0,0 +1,107 @@
+# Copyright 2024 The GPU4PySCF Authors. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import unittest
+import numpy
+from pyscf import gto, dft
+
+def setUpModule():
+    global mol
+    mol = gto.Mole()
+    mol.verbose = 1
+    mol.output = '/dev/null'
+    mol.atom.extend([
+        ["O" , (0. , 0.     , 0.)],
+        [1   , (0. , -0.757 , 0.587)],
+        [1   , (0. , 0.757  , 0.587)] ])
+    mol.basis = 'sto3g'
+    mol.build()
+
+def tearDownModule():
+    global mol
+    mol.stdout.close()
+    del mol
+
+class KnownValues(unittest.TestCase):
+
+    def test_df_rks_hess_elec(self):
+        mf = dft.RKS(mol, xc='b3lyp').density_fit()
+        mf.conv_tol = 1e-10
+        mf.conv_tol_cpscf = 1e-8
+        mf.grids.level = 1
+        mf.kernel()
+        hobj = mf.Hessian()
+        hobj.auxbasis_response = 2
+        hess_cpu = hobj.partial_hess_elec()
+
+        mf = mf.to_gpu()
+        mf.grids.level = 1
+        mf.kernel()
+        hobj = mf.Hessian()
+        hobj.auxbasis_response = 2
+        hess_gpu = hobj.partial_hess_elec()
+        assert numpy.linalg.norm(hess_cpu - hess_gpu.get()) < 1e-5
+
+    def test_df_lda(self):
+        mf = dft.RKS(mol).density_fit()
+        mf.conv_tol = 1e-10
+        mf.grids.level = 1
+        mf.conv_tol_cpscf = 1e-8
+        mf.kernel()
+
+        hessobj = mf.Hessian()
+        hess_cpu = hessobj.kernel()
+
+        mf = mf.to_gpu()
+        hessobj = mf.Hessian()
+        hess_gpu = hessobj.kernel()
+        assert numpy.linalg.norm(hess_cpu - hess_gpu) < 1e-5
+
+    def test_df_gga(self):
+        mf = dft.RKS(mol, xc='b3lyp').density_fit()
+        mf.conv_tol = 1e-10
+        mf.grids.level = 1
+        mf.conv_tol_cpscf = 1e-8
+        mf.kernel()
+
+        hessobj = mf.Hessian()
+        hess_cpu = hessobj.kernel()
+
+        mf = mf.to_gpu()
+        hessobj = mf.Hessian()
+        hessobj.base.cphf_grids = hessobj.base.grids
+        hess_gpu = hessobj.kernel()
+        assert numpy.linalg.norm(hess_cpu - hess_gpu) < 1e-5
+
+    def test_df_mgga(self):
+        mf = dft.RKS(mol, xc='tpss').density_fit()
+        mf.conv_tol = 1e-10
+        mf.grids.level = 1
+        mf.conv_tol_cpscf = 1e-8
+        mf.kernel()
+
+        hessobj = mf.Hessian()
+        hess_cpu = hessobj.kernel()
+
+        mf = mf.to_gpu()
+        hessobj = mf.Hessian()
+        hessobj.base.cphf_grids = hessobj.base.grids
+        hess_gpu = hessobj.kernel()
+        assert numpy.linalg.norm(hess_cpu - hess_gpu) < 1e-5
+
+if __name__ == "__main__":
+    print("Full Tests for DF RKS Hessian")
+    unittest.main()
+    
\ No newline at end of file
diff --git a/gpu4pyscf/df/hessian/tests/test_df_uhf_hessian.py b/gpu4pyscf/df/hessian/tests/test_df_uhf_hessian.py
index 4d137cea..0443f546 100644
--- a/gpu4pyscf/df/hessian/tests/test_df_uhf_hessian.py
+++ b/gpu4pyscf/df/hessian/tests/test_df_uhf_hessian.py
@@ -62,7 +62,8 @@ def test_gen_vind(self):
         v1vo_cpu = fx_cpu(mo1)
 
         mf = mf.to_gpu()
-        fx_gpu = uhf_gpu.gen_vind(mf, mo_coeff, mo_occ)
+        hessobj = mf.Hessian()
+        fx_gpu = hessobj.gen_vind(mo_coeff, mo_occ)
         mo1 = cupy.asarray(mo1)
         v1vo_gpu = fx_gpu(mo1)
         assert numpy.linalg.norm(v1vo_cpu - v1vo_gpu.get()) < 1e-8
@@ -114,7 +115,8 @@ def test_make_h1(self):
         mo_energy = cupy.asarray(mo_energy)
         mo_coeff = cupy.asarray(mo_coeff)
         mo_occ = cupy.asarray(mo_occ)
-        mo1_gpu, mo_e1_gpu = hobj.solve_mo1(mo_energy, mo_coeff, mo_occ, (h1a_gpu, h1b_gpu), verbose=1)
+        fx = hobj.gen_vind(mo_coeff, mo_occ)
+        mo1_gpu, mo_e1_gpu = hobj.solve_mo1(mo_energy, mo_coeff, mo_occ, (h1a_gpu, h1b_gpu), fx, verbose=1)
         assert numpy.linalg.norm(h1a_cpu - h1a_gpu.get()) < 1e-5
         assert numpy.linalg.norm(h1b_cpu - h1b_gpu.get()) < 1e-5
         mo1_cpu = (numpy.asarray(mo1_cpu[0]), numpy.asarray(mo1_cpu[1]))
diff --git a/gpu4pyscf/df/hessian/uhf.py b/gpu4pyscf/df/hessian/uhf.py
index 5d93c708..035f9505 100644
--- a/gpu4pyscf/df/hessian/uhf.py
+++ b/gpu4pyscf/df/hessian/uhf.py
@@ -43,13 +43,14 @@
 from gpu4pyscf.lib.cupy_helper import (
     contract, tag_array, get_avail_mem, release_gpu_stack, pinv)
 from gpu4pyscf.df import int3c2e, df
+from gpu4pyscf.df.hessian import rhf as df_rhf_hess
 from gpu4pyscf.lib import logger
 from gpu4pyscf import __config__
 from gpu4pyscf.df.grad.rhf import _gen_metric_solver
-from gpu4pyscf.gto.mole import sort_atoms
+from gpu4pyscf.df.hessian import jk
 
 LINEAR_DEP_THR = df.LINEAR_DEP_THR
-BLKSIZE = 256
+BLKSIZE = 128
 ALIGNED = getattr(__config__, 'ao_aligned', 32)
 GB = 1024*1024*1024
 
@@ -221,49 +222,16 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     t1 = log.timer_debug1('intermediate variables with int3c2e_ip1', *t1)
 
     cupy.get_default_memory_pool().free_all_blocks()
-    #  int3c_ipip1 contributions
-    fn = int3c2e.get_int3c2e_hjk
-    hja_ao_diag, hka_ao_diag = fn(intopt, 'ipip1', rhoj0_P, rhok0a_P__, dm0a_tag, omega=omega, with_k=with_k)
-    hjb_ao_diag, hkb_ao_diag = fn(intopt, 'ipip1', rhoj0_P, rhok0b_P__, dm0b_tag, omega=omega, with_k=with_k)
-    hj_ao_diag = 2.0 * (hja_ao_diag + hjb_ao_diag)
+    hja_ipip, hka_ipip = jk.get_int3c2e_hjk(intopt, rhoj0_P, rhok0a_P__, dm0a_tag,
+                                          with_k=with_k, omega=omega, 
+                                          auxbasis_response=hessobj.auxbasis_response)
+    hjb_ipip, hkb_ipip = jk.get_int3c2e_hjk(intopt, rhoj0_P, rhok0b_P__, dm0b_tag,
+                                          with_k=with_k, omega=omega, 
+                                          auxbasis_response=hessobj.auxbasis_response)
+    hj_ipip = hja_ipip + hjb_ipip
     if with_k:
-        hk_ao_diag = 2.0 * (hka_ao_diag + hkb_ao_diag)
-    t1 = log.timer_debug1('intermediate variables with int3c2e_ipip1', *t1)
-
-    #  int3c_ipvip1 contributions
-    # (11|0), (0|00) without response of RI basis
-    fn = int3c2e.get_int3c2e_hjk
-    hja, hka = fn(intopt, 'ipvip1', rhoj0_P, rhok0a_P__, dm0a_tag, omega=omega, with_k=with_k)
-    hjb, hkb = fn(intopt, 'ipvip1', rhoj0_P, rhok0b_P__, dm0b_tag, omega=omega, with_k=with_k)
-    hj_ao_ao += 2.0*(hja + hjb)
-    if with_k:
-        hk_ao_ao += (hka + hkb)
-    hja = hjb = hka = hkb = None
-    t1 = log.timer_debug1('intermediate variables with int3c2e_ipvip1', *t1)
-
-    #  int3c_ip1ip2 contributions
-    # (10|1), (0|0)(0|00)
-    if hessobj.auxbasis_response:
-        fn = int3c2e.get_int3c2e_hjk
-        hja, hka = fn(intopt, 'ip1ip2', rhoj0_P, rhok0a_P__, dm0a_tag, omega=omega, with_k=with_k)
-        hjb, hkb = fn(intopt, 'ip1ip2', rhoj0_P, rhok0b_P__, dm0b_tag, omega=omega, with_k=with_k)
-        hj_ao_aux += hja + hjb
-        if with_k:
-            hk_ao_aux += hka + hkb
-        hja = hjb = hka = hkb = None
-        t1 = log.timer_debug1('intermediate variables with int3c2e_ip1ip2', *t1)
-
-    #  int3c_ipip2 contributions
-    if hessobj.auxbasis_response > 1:
-        # (00|2), (0|0)(0|00)
-        fn = int3c2e.get_int3c2e_hjk
-        hja, hka = fn(intopt, 'ipip2', rhoj0_P, rhok0a_P__, dm0a_tag, omega=omega, with_k=with_k)
-        hjb, hkb = fn(intopt, 'ipip2', rhoj0_P, rhok0b_P__, dm0b_tag, omega=omega, with_k=with_k)
-        hj_aux_diag = hja + hjb
-        if with_k:
-            hk_aux_diag = (hka + hkb)
-        hja = hjb = hka = hkb = None
-        t1 = log.timer_debug1('intermediate variables with int3c2e_ipip2', *t1)
+        hk_ipip = 2.0*(hka_ipip + hkb_ipip)
+    t1 = log.timer_debug1('intermediate variables with int3c2e_ipip', *t1)
 
     # int2c contributions
     if hessobj.auxbasis_response > 1:
@@ -277,11 +245,11 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
         rhoj2c_P = contract('xpq,q->xp', int2c_ipip1, rhoj0_P)
         # (00|0)(2|0)(0|00)
         # p,xp->px
-        hj_aux_diag -= (rhoj0_P*rhoj2c_P).T.reshape(-1,3,3)
+        hj_aux_diag = -(rhoj0_P*rhoj2c_P).T.reshape(-1,3,3)
         if with_k:
             rho2c_0 = contract('pij,qji->pq', rhok0a_P__, rhok0a_P__)
             rho2c_0+= contract('pij,qji->pq', rhok0b_P__, rhok0b_P__)
-            hk_aux_diag -= contract('pq,xpq->px', rho2c_0, int2c_ipip1).reshape(-1,3,3)
+            hk_aux_diag = -contract('pq,xpq->px', rho2c_0, int2c_ipip1).reshape(-1,3,3)
         int2c_ipip1 = None
 
         if omega and omega > 1e-10:
@@ -350,7 +318,6 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
             rho2c_10= int2c_ip1_inv = None
     t1 = log.timer_debug1('contract int2c_*', *t1)
 
-    hj_ao_diag = intopt.unsort_orbitals(hj_ao_diag, axis=[0])
     hj_ao_ao = intopt.unsort_orbitals(hj_ao_ao, axis=[0,1])
     if hessobj.auxbasis_response:
         hj_ao_aux = intopt.unsort_orbitals(hj_ao_aux, axis=[0], aux_axis=[1])
@@ -358,7 +325,6 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
         hj_aux_diag = intopt.unsort_orbitals(hj_aux_diag, aux_axis=[0])
         hj_aux_aux = intopt.unsort_orbitals(hj_aux_aux, aux_axis=[0,1])
     if with_k:
-        hk_ao_diag = intopt.unsort_orbitals(hk_ao_diag, axis=[0])
         hk_ao_ao = intopt.unsort_orbitals(hk_ao_ao, axis=[0,1])
         if hessobj.auxbasis_response:
             hk_ao_aux = intopt.unsort_orbitals(hk_ao_aux, axis=[0], aux_axis=[1])
@@ -389,14 +355,11 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     # -----------------------------------------
     hk_ao_ao *= 2.0
     e1 = cupy.zeros([len(atmlst),len(atmlst),3,3])
-    ej = cupy.zeros([len(atmlst),len(atmlst),3,3])
-    ek = cupy.zeros([len(atmlst),len(atmlst),3,3])
+    ej = hj_ipip
+    ek = hk_ipip
     for i0, ia in enumerate(atmlst):
         shl0, shl1, p0, p1 = aoslices[ia]
         e1[i0,i0] -= cupy.sum(h1aa[p0:p1], axis=0)
-        ej[i0,i0] += cupy.sum(hj_ao_diag[p0:p1,:,:], axis=0)
-        if with_k:
-            ek[i0,i0] += cupy.sum(hk_ao_diag[p0:p1,:,:], axis=0)
         for j0, ja in enumerate(atmlst[:i0+1]):
             q0, q1 = aoslices[ja][2:]
             ej[i0,j0] += cupy.sum(hj_ao_ao[p0:p1,q0:q1], axis=[0,1])
@@ -702,6 +665,8 @@ def _ao2mo(mat, mocc, mo):
             vk1b = vk1b_int3c[ia] + _ao2mo(vk1b_ao, moccb, mo_coeff[1])
         yield ia, (h1a, h1b), (vj1a, vj1b), (vk1a, vk1b)
 
+_get_jk_mo = df_rhf_hess._get_jk_mo
+
 class Hessian(uhf_hess.Hessian):
     '''Non-relativistic restricted Hartree-Fock hessian'''
 
@@ -713,3 +678,4 @@ class Hessian(uhf_hess.Hessian):
     make_h1 = make_h1
     kernel = rhf_hess.kernel
     hess = kernel
+    get_jk_mo = _get_jk_mo
diff --git a/gpu4pyscf/df/hessian/uks.py b/gpu4pyscf/df/hessian/uks.py
index 5133fc18..a3230490 100644
--- a/gpu4pyscf/df/hessian/uks.py
+++ b/gpu4pyscf/df/hessian/uks.py
@@ -133,3 +133,4 @@ class Hessian(uks_hess.Hessian):
     hess_elec = uhf_hess.hess_elec
     kernel = rhf_hess.kernel
     hess = kernel
+    get_jk_mo = df_uhf_hess._get_jk_mo
diff --git a/gpu4pyscf/df/tests/test_df_hessian.py b/gpu4pyscf/df/tests/test_df_hessian.py
index 8c56692b..3b932195 100644
--- a/gpu4pyscf/df/tests/test_df_hessian.py
+++ b/gpu4pyscf/df/tests/test_df_hessian.py
@@ -136,7 +136,7 @@ def test_hessian_rhf(self, disp=None):
         h = hobj.kernel()
         _check_rhf_hessian(mf, h, ix=0, iy=0)
         _check_rhf_hessian(mf, h, ix=0, iy=1)
-
+    
     def test_hessian_lda(self, disp=None):
         print('-----testing DF LDA Hessian----')
         mf = _make_rks(mol_sph, 'LDA')
@@ -240,7 +240,6 @@ def test_hessian_rks_D3(self):
         hobj = mf.Hessian()
         hobj.set(auxbasis_response=2)
         h = hobj.kernel()
-        print(np.linalg.norm(h))
         _check_dft_hessian(mf, h, ix=0,iy=0)
 
     def test_hessian_rks_D4(self):
diff --git a/gpu4pyscf/hessian/jk.py b/gpu4pyscf/hessian/jk.py
new file mode 100644
index 00000000..7f3eeb60
--- /dev/null
+++ b/gpu4pyscf/hessian/jk.py
@@ -0,0 +1,296 @@
+# Copyright 2024 The GPU4PySCF Authors. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+'''
+Compute J/K matrices for Hessian
+'''
+import ctypes
+import math
+import numpy as np
+import cupy as cp
+from collections import Counter
+from concurrent.futures import ThreadPoolExecutor
+
+from pyscf import lib
+from pyscf.scf import _vhf
+from pyscf import __config__
+
+from gpu4pyscf.scf.jk import (_make_tril_tile_mappings, quartets_scheme, QUEUE_DEPTH, 
+                              _VHFOpt, LMAX, init_constant, libvhf_rys)
+from gpu4pyscf.lib.cupy_helper import (condense, sandwich_dot, transpose_sum,
+                                       reduce_to_device, contract)
+
+from gpu4pyscf.__config__ import props as gpu_specs
+from gpu4pyscf.__config__ import _streams, _num_devices
+from gpu4pyscf.lib import logger
+
+
+def _ao2mo(v_ao, mocc, mo_coeff):
+    v_ao = contract('nij,jo->nio', v_ao, mocc)
+    return contract('nio,ip->npo', v_ao, mo_coeff)
+
+def _jk_task(mol, dms, mo_coeff, mocc, vhfopt, task_list, hermi=0,
+             device_id=0, with_j=True, with_k=True, verbose=0):
+    nao, _ = vhfopt.coeff.shape
+    uniq_l_ctr = vhfopt.uniq_l_ctr
+    uniq_l = uniq_l_ctr[:,0]
+    l_ctr_bas_loc = vhfopt.l_ctr_offsets
+    l_symb = [lib.param.ANGULAR[i] for i in uniq_l]
+    kern = libvhf_rys.RYS_build_jk
+    
+    timing_counter = Counter()
+    kern_counts = 0
+    with cp.cuda.Device(device_id), _streams[device_id]:
+        log = logger.new_logger(mol, verbose)
+        cput0 = log.init_timer()
+        dms = cp.asarray(dms)
+
+        n_dm = dms.shape[0]
+        tile_q_ptr = ctypes.cast(vhfopt.tile_q_cond.data.ptr, ctypes.c_void_p)
+        q_ptr = ctypes.cast(vhfopt.q_cond.data.ptr, ctypes.c_void_p)
+        s_ptr = lib.c_null_ptr()
+        if mol.omega < 0:
+            s_ptr = ctypes.cast(vhfopt.s_estimator.data.ptr, ctypes.c_void_p)
+        
+        vj = vk = None
+        vj_ptr = vk_ptr = lib.c_null_ptr()
+        assert with_j or with_k
+        if with_k:
+            vk = cp.zeros(dms.shape)
+            vk_ptr = ctypes.cast(vk.data.ptr, ctypes.c_void_p)
+        if with_j:
+            vj = cp.zeros(dms.shape)
+            vj_ptr = ctypes.cast(vj.data.ptr, ctypes.c_void_p)
+        
+        ao_loc = mol.ao_loc
+        dm_cond = cp.log(condense('absmax', dms, ao_loc) + 1e-300).astype(np.float32)
+        log_max_dm = dm_cond.max()
+        log_cutoff = math.log(vhfopt.direct_scf_tol)
+        tile_mappings = _make_tril_tile_mappings(l_ctr_bas_loc, vhfopt.tile_q_cond,
+                                                 log_cutoff-log_max_dm)
+        workers = gpu_specs['multiProcessorCount']
+        pool = cp.empty((workers, QUEUE_DEPTH*4), dtype=np.uint16)
+        info = cp.empty(2, dtype=np.uint32)
+        t1 = log.timer_debug1(f'q_cond and dm_cond on Device {device_id}', *cput0)
+
+        for i, j in task_list:
+            ij_shls = (l_ctr_bas_loc[i], l_ctr_bas_loc[i+1],
+                       l_ctr_bas_loc[j], l_ctr_bas_loc[j+1])
+            tile_ij_mapping = tile_mappings[i,j]
+            for k in range(i+1):
+                for l in range(k+1):
+                    llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})'
+                    kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1],
+                                l_ctr_bas_loc[l], l_ctr_bas_loc[l+1])
+                    tile_kl_mapping = tile_mappings[k,l]
+                    scheme = quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]])
+                    err = kern(
+                        vj_ptr, vk_ptr, ctypes.cast(dms.data.ptr, ctypes.c_void_p),
+                        ctypes.c_int(n_dm), ctypes.c_int(nao),
+                        vhfopt.rys_envs, (ctypes.c_int*2)(*scheme),
+                        (ctypes.c_int*8)(*ij_shls, *kl_shls),
+                        ctypes.c_int(tile_ij_mapping.size),
+                        ctypes.c_int(tile_kl_mapping.size),
+                        ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p),
+                        ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p),
+                        tile_q_ptr, q_ptr, s_ptr,
+                        ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p),
+                        ctypes.c_float(log_cutoff),
+                        ctypes.cast(pool.data.ptr, ctypes.c_void_p),
+                        ctypes.cast(info.data.ptr, ctypes.c_void_p),
+                        ctypes.c_int(workers),
+                        mol._atm.ctypes, ctypes.c_int(mol.natm),
+                        mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes)
+                    if err != 0:
+                        raise RuntimeError(f'RYS_build_jk kernel for {llll} failed')
+                    if log.verbose >= logger.DEBUG1:
+                        msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}'
+                        t1, t1p = log.timer_debug1(msg, *t1), t1
+                        timing_counter[llll] += t1[1] - t1p[1]
+                        kern_counts += 1
+        if with_j:
+            vj *= 2.0
+            vj = transpose_sum(vj)
+        if with_k:
+            vk = transpose_sum(vk)
+
+        if isinstance(mocc, tuple):
+            # Unrestricted case
+            mocca, moccb = mocc
+            moa, mob = mo_coeff
+            nmoa, nmob = moa.shape[1], mob.shape[1]
+            nocca, noccb = mocca.shape[1], moccb.shape[1]
+            n_dm_2 = n_dm//2
+            if with_j:
+                vjab = vj[:n_dm_2] + vj[n_dm_2:]
+                vj = cp.empty([n_dm_2,nmoa*nocca+nmob*noccb])
+                vj[:,:nmoa*nocca] = _ao2mo(vjab, mocca, moa).reshape(n_dm_2,-1)
+                vj[:,nmoa*nocca:] = _ao2mo(vjab, moccb, mob).reshape(n_dm_2,-1)
+            if with_k:
+                vka, vkb = vk[:n_dm_2], vk[n_dm_2:]
+                vk = cp.empty([n_dm_2,nmoa*nocca+nmob*noccb])
+                vk[:,:nmoa*nocca] = _ao2mo(vka, mocca, moa).reshape(n_dm_2,-1)
+                vk[:,nmoa*nocca:] = _ao2mo(vkb, moccb, mob).reshape(n_dm_2,-1)
+        else:
+            if with_j:
+                vj = _ao2mo(vj, mocc, mo_coeff).reshape(n_dm,-1)
+            if with_k:
+                vk = _ao2mo(vk, mocc, mo_coeff).reshape(n_dm,-1)
+        
+    return vj, vk, kern_counts, timing_counter
+
+def get_jk(mol, dm, mo_coeff, mocc, hermi=0, vhfopt=None, 
+           with_j=True, with_k=True, verbose=None):
+    '''Compute J, K matrices in MO
+    '''
+    log = logger.new_logger(mol, verbose)
+    cput0 = log.init_timer()
+
+    if vhfopt is None:
+        vhfopt = _VHFOpt(mol).build()
+
+    mol = vhfopt.mol
+    nao, nao_orig = vhfopt.coeff.shape
+
+    dm = cp.asarray(dm, order='C')
+    dms = dm.reshape(-1,nao_orig,nao_orig)
+
+    # Transform MO coeffcients and DM into sorted, cartesian AO basis
+    #:dms = cp.einsum('pi,nij,qj->npq', vhfopt.coeff, dms, vhfopt.coeff)
+    dms = sandwich_dot(dms, vhfopt.coeff.T)
+    dms = cp.asarray(dms, order='C')
+    coeff = vhfopt.coeff
+    if isinstance(mocc, tuple):
+        mocc = (coeff.dot(mocc[0]), coeff.dot(mocc[1]))
+        mo_coeff = (coeff.dot(mo_coeff[0]), coeff.dot(mo_coeff[1]))
+    else:
+        mocc = coeff.dot(mocc)
+        mo_coeff = coeff.dot(mo_coeff)
+    n_dm = dms.shape[0]
+
+    assert with_j or with_k
+
+    init_constant(mol)
+
+    uniq_l_ctr = vhfopt.uniq_l_ctr
+    uniq_l = uniq_l_ctr[:,0]
+    l_symb = [lib.param.ANGULAR[i] for i in uniq_l]
+    n_groups = np.count_nonzero(uniq_l <= LMAX)
+
+    tasks = [(i,j) for i in range(n_groups) for j in range(i+1)]
+    tasks = np.array(tasks)
+    task_list = []
+    for device_id in range(_num_devices):
+        task_list.append(tasks[device_id::_num_devices])
+
+    cp.cuda.get_current_stream().synchronize()
+    futures = []
+    with ThreadPoolExecutor(max_workers=_num_devices) as executor:
+        for device_id in range(_num_devices):
+            future = executor.submit(
+                _jk_task,
+                mol, dms, mo_coeff, mocc, vhfopt, task_list[device_id], hermi=hermi,
+                with_j=with_j, with_k=with_k, verbose=verbose, 
+                device_id=device_id)
+            futures.append(future)
+
+    kern_counts = 0
+    timing_collection = Counter()
+    vj_dist = []
+    vk_dist = []
+    for future in futures:
+        vj, vk, counts, counter = future.result()
+        kern_counts += counts
+        timing_collection += counter
+        vj_dist.append(vj)
+        vk_dist.append(vk)
+
+    if log.verbose >= logger.DEBUG1:
+        log.debug1('kernel launches %d', kern_counts)
+        for llll, t in timing_collection.items():
+            log.debug1('%s wall time %.2f', llll, t)
+    
+    for s in _streams:
+        s.synchronize()
+    cp.cuda.get_current_stream().synchronize()
+    vj = vk = None
+    if with_k:
+        vk = reduce_to_device(vk_dist, inplace=True)
+
+    if with_j:
+        vj = reduce_to_device(vj_dist, inplace=True)
+
+    h_shls = vhfopt.h_shls
+    assert len(h_shls) == 0
+    if h_shls:
+        cput1 = log.timer_debug1('get_jk pass 1 on gpu', *cput0)
+        log.debug3('Integrals for %s functions on CPU', l_symb[LMAX+1])
+        scripts = []
+        if with_j:
+            scripts.append('ji->s2kl')
+        if with_k:
+            if hermi == 1:
+                scripts.append('jk->s2il')
+            else:
+                scripts.append('jk->s1il')
+        shls_excludes = [0, h_shls[0]] * 4
+        if hermi == 1:
+            dms = dms.get()
+        else:
+            dms = dms[:n_dm//2].get()
+        vs_h = _vhf.direct_mapdm('int2e_cart', 's8', scripts,
+                                 dms, 1, mol._atm, mol._bas, mol._env,
+                                 shls_excludes=shls_excludes)
+        if with_j and with_k:
+            vj1 = vs_h[0]
+            vk1 = vs_h[1]
+        elif with_j:
+            vj1 = vs_h[0]
+        else:
+            vk1 = vs_h[0]
+        coeff = vhfopt.coeff
+        idx, idy = np.tril_indices(nao, -1)
+        if isinstance(mocc, tuple):
+            mocca, moccb = mocc
+            moa, mob = mo_coeff
+            nmoa, nmob = moa.shape[1], mob.shape[1]
+            nocca, noccb = mocca.shape[1], moccb.shape[1]
+            n_dm_2 = n_dm//2
+            if with_j:
+                vjab = vj1[:n_dm_2] + vj1[n_dm_2:]
+                vj[:,:nmoa*nocca] += _ao2mo(vjab, mocca, moa).reshape(n_dm_2,-1)
+                vj[:,nmoa*nocca:] += _ao2mo(vjab, moccb, mob).reshape(n_dm_2,-1)
+            if with_k:
+                vka, vkb = vk[:n_dm_2], vk[n_dm_2:]
+                vk[:,:nmoa*nocca] += _ao2mo(vka, mocca, moa).reshape(n_dm_2,-1)
+                vk[:,nmoa*nocca:] += _ao2mo(vkb, moccb, mob).reshape(n_dm_2,-1)
+        else:
+            if with_j:
+                vj1[:,idy,idx] = vj1[:,idx,idy]
+                vj += _ao2mo(cp.asarray(vj1), mocc, mo_coeff)
+                #for i, v in enumerate(vj1):
+                #    vj[i] += coeff.T.dot(cp.asarray(v)).dot(coeff)
+            if with_k:
+                if hermi:
+                    vk1[:,idy,idx] = vk1[:,idx,idy]
+                vk += _ao2mo(cp.asarray(vk1), mocc, mo_coeff)
+                #for i, v in enumerate(vk1):
+                #    vk[i] += coeff.T.dot(cp.asarray(v)).dot(coeff)
+
+        # TODO: convert vj and vk into MO
+        log.timer_debug1('get_jk pass 2 for h functions on cpu', *cput1)
+    
+    log.timer('vj and vk', *cput0)
+    return vj, vk
diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py
index 26929dd3..a526eb81 100644
--- a/gpu4pyscf/hessian/rhf.py
+++ b/gpu4pyscf/hessian/rhf.py
@@ -42,6 +42,7 @@
     LMAX, QUEUE_DEPTH, SHM_SIZE, THREADS, libvhf_rys, _VHFOpt, init_constant,
     _make_tril_tile_mappings, _nearest_power2)
 from gpu4pyscf.grad import rhf as rhf_grad
+from gpu4pyscf.hessian import jk
 
 libvhf_rys.RYS_per_atom_jk_ip2_type12.restype = ctypes.c_int
 libvhf_rys.RYS_per_atom_jk_ip2_type3.restype = ctypes.c_int
@@ -79,10 +80,10 @@ def hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
             h1mo = h1mo.get()
         t1 = log.timer_debug1('making H1', *t1)
     if mo1 is None or mo_e1 is None:
+        fx = hessobj.gen_vind(mo_coeff, mo_occ)
         mo1, mo_e1 = hessobj.solve_mo1(mo_energy, mo_coeff, mo_occ, h1mo,
-                                       None, atmlst, max_memory, log)
+                                       fx, atmlst, max_memory, log)
         t1 = log.timer_debug1('solving MO1', *t1)
-
     mo1 = cupy.asarray(mo1)
     # *2 for double occupancy, *2 for +c.c.
     de2 += contract('kxpi,lypi->klxy', cupy.asarray(h1mo), mo1) * 4
@@ -365,7 +366,7 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
     avail_mem = get_avail_mem()
     slice_size = int(avail_mem*0.6) // (8*3*nao*nao)
     for atoms_slice in lib.prange(0, natm, slice_size):
-        vj, vk = _get_jk(mol, dm0, atoms_slice=atoms_slice, verbose=verbose)
+        vj, vk = _get_jk_ip1(mol, dm0, atoms_slice=atoms_slice, verbose=verbose)
         #:vhf = vj - vk * .5
         vhf = vk
         vhf *= -.5
@@ -377,9 +378,9 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
         vj = vk = vhf = None
     return h1mo
 
-
 def _build_jk_ip1_task(mol, dms, vhfopt, task_list, atoms_slice,
                        device_id=0, with_j=True, with_k=True, verbose=0):
+    # TODO: compute JK in MO
     assert isinstance(verbose, int)
     nao, _ = vhfopt.coeff.shape
     natm = mol.natm
@@ -475,7 +476,7 @@ def _build_jk_ip1_task(mol, dms, vhfopt, task_list, atoms_slice,
                         kern_counts += 1
     return vj, vk, kern_counts, timing_counter
 
-def _get_jk(mol, dm, with_j=True, with_k=True, atoms_slice=None, verbose=None):
+def _get_jk_ip1(mol, dm, with_j=True, with_k=True, atoms_slice=None, verbose=None):
     r'''
     For each atom, compute
     J = ((\nabla_X i) j| kl) (D_lk + D_ji)
@@ -688,7 +689,7 @@ def fvind_vo(mo1):
         mo1[:,:,viridx] *= -e_ai
         mo1[:,:,occidx] = -s1mo_blk[:,:,occidx] * .5
         hs = s1mo_blk = h1mo_blk = None
-
+        
         tol = mf.conv_tol_cpscf * (i1 - i0)
         raw_mo1 = krylov(fvind_vo, mo1.reshape(-1,nmo*nocc),
                          tol=tol, max_cycle=max_cycle, verbose=log)
@@ -706,32 +707,23 @@ def fvind_vo(mo1):
     log.timer('CPHF solver', *t0)
     return mo1s, e1s
 
-def gen_vind(mf, mo_coeff, mo_occ):
-    # Move data to GPU
+def gen_vind(hessobj, mo_coeff, mo_occ):
+    mol = hessobj.mol
     mo_coeff = cupy.asarray(mo_coeff)
     mo_occ = cupy.asarray(mo_occ)
     nao, nmo = mo_coeff.shape
     mocc = mo_coeff[:,mo_occ>0]
     nocc = mocc.shape[1]
     mocc_2 = mocc * 2
-    grids = getattr(mf, 'cphf_grids', None)
-    if grids is not None:
-        logger.info(mf, 'Secondary grids defined for CPHF in Hessian')
-    vresp = mf.gen_response(mo_coeff, mo_occ, hermi=1, grids=grids)
 
     def fx(mo1):
         mo1 = cupy.asarray(mo1)
         mo1 = mo1.reshape(-1,nmo,nocc)
         mo1_mo = contract('npo,ip->nio', mo1, mo_coeff)
-        #dm1 = contract('nio,jo->nij', mo1_mo, mocc_2)
-        #dm1 = dm1 + dm1.transpose(0,2,1)
         dm1 = mo1_mo.dot(mocc_2.T)
-        transpose_sum(dm1)
+        dm1 = transpose_sum(dm1)
         dm1 = tag_array(dm1, mo1=mo1_mo, occ_coeff=mocc, mo_occ=mo_occ)
-        v1 = vresp(dm1)
-        tmp = contract('nij,jo->nio', v1, mocc)
-        v1vo = contract('nio,ip->npo', tmp, mo_coeff)
-        return v1vo
+        return hessobj.get_veff_resp_mo(mol, dm1, mo_coeff, mo_occ, hermi=1)
     return fx
 
 def hess_nuc_elec(mol, dm):
@@ -890,6 +882,25 @@ def get_hcore(iatm, jatm):
 def hcore_generator(hessobj, mol=None):
     raise NotImplementedError
 
+def _get_jk_mo(hessobj, mol, dms, mo_coeff, mocc, 
+            hermi=1, with_j=True, with_k=True, omega=None):
+    ''' Compute J/K matrices in MO for multiple DMs
+    '''
+    mf = hessobj.base
+    vhfopt = mf._opt_gpu.get(omega)
+    if vhfopt is None:
+        with mol.with_range_coulomb(omega):
+            vhfopt = mf._opt_gpu[omega] = _VHFOpt(mol, mf.direct_scf_tol).build()
+    with mol.with_range_coulomb(omega):
+        vj, vk = jk.get_jk(mol, dms, mo_coeff, mocc, hermi, vhfopt, with_j, with_k)
+    return vj, vk
+
+def _get_veff_resp_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1, omega=None):
+    mocc = mo_coeff[:,mo_occ>0]
+    vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, mocc, 
+                     hermi=hermi, with_j=True, with_k=True, omega=omega)
+    return vj - 0.5 * vk
+
 class HessianBase(lib.StreamObject):
     # attributes
     max_cycle   = rhf_hess_cpu.HessianBase.max_cycle
@@ -901,8 +912,10 @@ class HessianBase(lib.StreamObject):
     make_h1         = rhf_hess_cpu.HessianBase.make_h1
     hcore_generator = hcore_generator  # the functionality is different from cpu version
     hess_nuc        = rhf_hess_cpu.HessianBase.hess_nuc
+    gen_vind        = NotImplemented
+    get_jk          = NotImplemented
     kernel = hess = kernel
-
+    
     def get_hcore(self, mol=None):
         if mol is None: mol = self.mol
         return get_hcore(mol)
@@ -952,6 +965,9 @@ def __init__(self, scf_method):
     hess_elec = hess_elec
     make_h1 = make_h1
     gen_hop = NotImplemented
+    gen_vind = gen_vind
+    get_jk_mo = _get_jk_mo
+    get_veff_resp_mo = _get_veff_resp_mo
 
 # Inject to RHF class
 from gpu4pyscf import scf
diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py
index 4c9b5a0d..93c16bb8 100644
--- a/gpu4pyscf/hessian/rks.py
+++ b/gpu4pyscf/hessian/rks.py
@@ -29,9 +29,11 @@
 # import pyscf.grad.rks to activate nuc_grad_method method
 from gpu4pyscf.grad import rks as rks_grad
 from gpu4pyscf.dft import numint
-from gpu4pyscf.lib.cupy_helper import contract, add_sparse, get_avail_mem, reduce_to_device
+from gpu4pyscf.lib.cupy_helper import (contract, add_sparse, get_avail_mem, 
+                                       reduce_to_device, transpose_sum, tag_array)
 from gpu4pyscf.lib import logger
 from gpu4pyscf.__config__ import _streams, _num_devices
+from gpu4pyscf.hessian import jk
 
 def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
                       atmlst=None, max_memory=4000, verbose=None):
@@ -126,15 +128,15 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
     avail_mem -= 8 * h1mo.size
     slice_size = int(avail_mem*0.5) // (8*3*nao*nao)
     for atoms_slice in lib.prange(0, natm, slice_size):
-        vj, vk = rhf_hess._get_jk(mol, dm0, with_k=with_k,
-                                  atoms_slice=atoms_slice, verbose=verbose)
+        vj, vk = rhf_hess._get_jk_ip1(mol, dm0, with_k=with_k,
+                                      atoms_slice=atoms_slice, verbose=verbose)
         veff = vj
         if with_k:
             vk *= .5 * hyb
             veff -= vk
         if abs(omega) > 1e-10 and abs(alpha-hyb) > 1e-10:
             with mol.with_range_coulomb(omega):
-                vk_lr = rhf_hess._get_jk(mol, dm0, with_j=False, verbose=verbose)[1]
+                vk_lr = rhf_hess._get_jk_ip1(mol, dm0, with_j=False, verbose=verbose)[1]
                 vk_lr *= (alpha-hyb) * .5
                 veff -= vk_lr
         atom0, atom1 = atoms_slice
@@ -699,6 +701,51 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory):
     vmat = reduce_to_device(vmat_dist, inplace=True)
     return vmat
 
+def get_veff_resp_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1, omega=None):
+    mol = hessobj.mol
+    mf = hessobj.base
+    grids = getattr(mf, 'cphf_grids', None)
+    if grids is not None:
+        logger.info(mf, 'Secondary grids defined for CPHF in Hessian')
+    else:
+        # If cphf_grids is not defined, e.g object defined from CPU
+        grids = getattr(mf, 'grids', None)
+        logger.info(mf, 'Primary grids is used for CPHF in Hessian')
+
+    if grids and grids.coords is None:
+        grids.build(mol=mol, with_non0tab=False, sort_grids=True)
+
+    ni = mf._numint
+    omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, mol.spin)
+    hybrid = ni.libxc.is_hybrid_xc(mf.xc)
+    assert not mf.do_nlc()
+    hermi = 1
+
+    mocc = mo_coeff[:,mo_occ>0]
+    nocc = mocc.shape[1]
+    nao, nmo = mo_coeff.shape
+    # TODO: evaluate v1 in MO
+    rho0, vxc, fxc = ni.cache_xc_kernel(mol, grids, mf.xc,
+                                        mo_coeff, mo_occ, 0)
+    v1 = ni.nr_rks_fxc(mol, grids, mf.xc, None, dms, 0, hermi,
+                                    rho0, vxc, fxc, max_memory=None)
+    v1 = jk._ao2mo(v1, mocc, mo_coeff).reshape(-1,nmo*nocc)
+
+    if hybrid:
+        vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, mocc, hermi=1)
+        vk *= hyb
+        if omega > 1e-10:  # For range separated Coulomb
+            _, vk_lr = hessobj.get_jk_mo(mol, dms, mo_coeff, mocc, hermi, 
+                                        with_j=False, omega=omega) 
+            vk_lr *= (alpha-hyb)
+            vk += vk_lr
+        v1 += vj - .5 * vk
+    else:
+        v1 += hessobj.get_jk_mo(mol, dms, mo_coeff, mocc, hermi=1, 
+                                with_k=False)[0]
+    
+    return v1
+
 
 class Hessian(rhf_hess.HessianBase):
     '''Non-relativistic RKS hessian'''
@@ -715,6 +762,9 @@ def __init__(self, mf):
     partial_hess_elec = partial_hess_elec
     hess_elec = rhf_hess.hess_elec
     make_h1 = make_h1
+    gen_vind = rhf_hess.gen_vind
+    get_jk_mo = rhf_hess._get_jk_mo
+    get_veff_resp_mo = get_veff_resp_mo
 
 from gpu4pyscf import dft
 dft.rks.RKS.Hessian = lib.class_as_method(Hessian)
diff --git a/gpu4pyscf/hessian/tests/test_rhf_hessian.py b/gpu4pyscf/hessian/tests/test_rhf_hessian.py
index 82c6606c..21266bc3 100644
--- a/gpu4pyscf/hessian/tests/test_rhf_hessian.py
+++ b/gpu4pyscf/hessian/tests/test_rhf_hessian.py
@@ -104,7 +104,7 @@ def test_get_jk(self):
         mo_coeff = np.random.rand(nao, nao)
         dm = mo_coeff.dot(mo_coeff.T) * 2
 
-        vj, vk = rhf_gpu._get_jk(mol, dm)
+        vj, vk = rhf_gpu._get_jk_ip1(mol, dm)
         assert abs(lib.fp(vj.get()) -  87674.69061160382) < 1e-7
         assert abs(lib.fp(vk.get()) - -9.317650662101629) < 1e-7
 
diff --git a/gpu4pyscf/hessian/uhf.py b/gpu4pyscf/hessian/uhf.py
index 81f26c17..07686e6d 100644
--- a/gpu4pyscf/hessian/uhf.py
+++ b/gpu4pyscf/hessian/uhf.py
@@ -30,11 +30,12 @@
 from pyscf.scf import ucphf
 # import _response_functions to load gen_response methods in SCF class
 from gpu4pyscf.scf import _response_functions  # noqa
-from gpu4pyscf.gto.mole import sort_atoms
-from gpu4pyscf.lib.cupy_helper import contract, tag_array, get_avail_mem, krylov
+from gpu4pyscf.lib.cupy_helper import (contract, transpose_sum, get_avail_mem, 
+                                       krylov, tag_array)
 from gpu4pyscf.lib import logger
 from gpu4pyscf.grad import rhf as rhf_grad
 from gpu4pyscf.hessian import rhf as rhf_hess_gpu
+from gpu4pyscf.hessian import jk
 
 GB = 1024*1024*1024
 ALIGNED = 4
@@ -68,8 +69,9 @@ def hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
             h1mo = (h1mo[0].get(), h1mo[1].get())
         t1 = log.timer_debug1('making H1', *t1)
     if mo1 is None or mo_e1 is None:
+        fx = hessobj.gen_vind(mo_coeff, mo_occ)
         mo1, mo_e1 = hessobj.solve_mo1(mo_energy, mo_coeff, mo_occ, h1mo,
-                                       None, atmlst, max_memory, log)
+                                       fx, atmlst, max_memory, log)
         t1 = log.timer_debug1('solving MO1', *t1)
 
     mo1a = cupy.asarray(mo1[0])
@@ -192,8 +194,8 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
     avail_mem = get_avail_mem()
     slice_size = int(avail_mem*0.6) // (8*3*nao*nao*2)
     for atoms_slice in lib.prange(0, natm, slice_size):
-        vja, vka = rhf_hess_gpu._get_jk(mol, dm0a, atoms_slice=atoms_slice, verbose=verbose)
-        vjb, vkb = rhf_hess_gpu._get_jk(mol, dm0b, atoms_slice=atoms_slice, verbose=verbose)
+        vja, vka = rhf_hess_gpu._get_jk_ip1(mol, dm0a, atoms_slice=atoms_slice, verbose=verbose)
+        vjb, vkb = rhf_hess_gpu._get_jk_ip1(mol, dm0b, atoms_slice=atoms_slice, verbose=verbose)
         #:vhfa = vja+vjb - vka
         #:vhfb = vja+vjb - vkb
         vhfa = vka
@@ -369,8 +371,9 @@ def fvind_vo(mo1):
     log.timer('CPHF solver', *t0)
     return (mo1sa, mo1sb), (e1sa, e1sb)
 
-def gen_vind(mf, mo_coeff, mo_occ):
+def gen_vind(hessobj, mo_coeff, mo_occ):
     # Move data to GPU
+    mol = hessobj.mol
     mo_coeff = cupy.asarray(mo_coeff)
     mo_occ = cupy.asarray(mo_occ)
     nao, nmoa = mo_coeff[0].shape
@@ -379,39 +382,34 @@ def gen_vind(mf, mo_coeff, mo_occ):
     moccb = mo_coeff[1][:,mo_occ[1]>0]
     nocca = mocca.shape[1]
     noccb = moccb.shape[1]
-    grids = getattr(mf, 'cphf_grids', None)
-    if grids is not None:
-        logger.info(mf, 'Secondary grids defined for CPHF in Hessian')
-    vresp = mf.gen_response(mo_coeff, mo_occ, hermi=1, grids=grids)
 
     def fx(mo1):
         mo1 = cupy.asarray(mo1)
         mo1 = mo1.reshape(-1,nmoa*nocca+nmob*noccb)
         nset = len(mo1)
 
+        dm1 = cupy.empty([2,nset,nao,nao])
+
         x = mo1[:,:nmoa*nocca].reshape(nset,nmoa,nocca)
         mo1_moa = contract('npo,ip->nio', x, mo_coeff[0])
         dma = contract('nio,jo->nij', mo1_moa, mocca)
+        dm1[0] = transpose_sum(dma)
 
         x = mo1[:,nmoa*nocca:].reshape(nset,nmob,noccb)
         mo1_mob = contract('npo,ip->nio', x, mo_coeff[1])
         dmb = contract('nio,jo->nij', mo1_mob, moccb)
-
-        dm1 = cupy.empty([2,nset,nao,nao])
-        dm1[0] = dma + dma.transpose(0,2,1)
-        dm1[1] = dmb + dmb.transpose(0,2,1)
+        dm1[1] = transpose_sum(dmb)
 
         dm1 = tag_array(dm1, mo1=[mo1_moa,mo1_mob], occ_coeff=[mocca,moccb], mo_occ=mo_occ)
-        v1 = vresp(dm1)
-        v1vo = cupy.empty_like(mo1)
-        tmp = contract('nij,jo->nio', v1[0], mocca)
-        v1vo[:,:nmoa*nocca] = contract('nio,ip->npo', tmp, mo_coeff[0]).reshape(nset,-1)
-
-        tmp = contract('nij,jo->nio', v1[1], moccb)
-        v1vo[:,nmoa*nocca:] = contract('nio,ip->npo', tmp, mo_coeff[1]).reshape(nset,-1)
-        return v1vo
+        return hessobj.get_veff_resp_mo(mol, dm1, mo_coeff, mo_occ, hermi=1)
     return fx
 
+def _get_veff_resp_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1):
+    mocca = mo_coeff[0][:,mo_occ[0]>0]
+    moccb = mo_coeff[1][:,mo_occ[1]>0]
+    vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, (mocca, moccb), 
+                               hermi=hermi, with_j=True, with_k=True)
+    return vj - vk
 
 class Hessian(rhf_hess_gpu.HessianBase):
     '''Non-relativistic unrestricted Hartree-Fock hessian'''
@@ -422,7 +420,10 @@ class Hessian(rhf_hess_gpu.HessianBase):
     partial_hess_elec = partial_hess_elec
     hess_elec = hess_elec
     make_h1 = make_h1
-
+    gen_vind = gen_vind
+    get_jk_mo = rhf_hess_gpu._get_jk_mo
+    get_veff_resp_mo = _get_veff_resp_mo
+    
     def solve_mo1(self, mo_energy, mo_coeff, mo_occ, h1mo,
                   fx=None, atmlst=None, max_memory=4000, verbose=None):
         return solve_mo1(self.base, mo_energy, mo_coeff, mo_occ, h1mo,
diff --git a/gpu4pyscf/hessian/uks.py b/gpu4pyscf/hessian/uks.py
index a363706a..a202d92a 100644
--- a/gpu4pyscf/hessian/uks.py
+++ b/gpu4pyscf/hessian/uks.py
@@ -27,8 +27,10 @@
 # import pyscf.grad.rks to activate nuc_grad_method method
 from gpu4pyscf.grad import rks as rks_grad
 from gpu4pyscf.dft import numint
-from gpu4pyscf.lib.cupy_helper import contract, add_sparse, take_last2d, get_avail_mem
+from gpu4pyscf.lib.cupy_helper import (contract, add_sparse, get_avail_mem, 
+                                       transpose_sum, tag_array)
 from gpu4pyscf.lib import logger
+from gpu4pyscf.hessian import jk
 
 def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
                       atmlst=None, max_memory=4000, verbose=None):
@@ -133,8 +135,8 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
     avail_mem -= 8 * (h1moa.size + h1mob.size)
     slice_size = int(avail_mem*0.5) // (8*3*nao*nao)
     for atoms_slice in lib.prange(0, natm, slice_size):
-        vja, vka = rhf_hess._get_jk(mol, dm0a, with_k=with_k, atoms_slice=atoms_slice, verbose=verbose)
-        vjb, vkb = rhf_hess._get_jk(mol, dm0b, with_k=with_k, atoms_slice=atoms_slice, verbose=verbose)
+        vja, vka = rhf_hess._get_jk_ip1(mol, dm0a, with_k=with_k, atoms_slice=atoms_slice, verbose=verbose)
+        vjb, vkb = rhf_hess._get_jk_ip1(mol, dm0b, with_k=with_k, atoms_slice=atoms_slice, verbose=verbose)
         vj = vja + vjb
         if with_k:
             #:veffa = vja + vjb - hyb * vka
@@ -151,8 +153,8 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
         vj = vja = vjb = vka = vkb = None
         if abs(omega) > 1e-10 and abs(alpha-hyb) > 1e-10:
             with mol.with_range_coulomb(omega):
-                vka_lr = rhf_hess._get_jk(mol, dm0a, with_j=False, verbose=verbose)[1]
-                vkb_lr = rhf_hess._get_jk(mol, dm0b, with_j=False, verbose=verbose)[1]
+                vka_lr = rhf_hess._get_jk_ip1(mol, dm0a, with_j=False, verbose=verbose)[1]
+                vkb_lr = rhf_hess._get_jk_ip1(mol, dm0b, with_j=False, verbose=verbose)[1]
                 vka_lr *= (alpha-hyb)
                 vkb_lr *= (alpha-hyb)
                 veffa -= vka_lr
@@ -843,6 +845,55 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory):
         vmatb[ia] -= vmat_tmp
     return vmata, vmatb
 
+def get_veff_resp_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1):
+    mol = hessobj.mol
+    mf = hessobj.base
+    grids = getattr(mf, 'cphf_grids', None)
+    if grids is not None:
+        logger.info(mf, 'Secondary grids defined for CPHF in Hessian')
+    else:
+        # If cphf_grids is not defined, e.g object defined from CPU
+        grids = getattr(mf, 'grids', None)
+        logger.info(mf, 'Primary grids is used for CPHF in Hessian')
+    
+    if grids and grids.coords is None:
+        grids.build(mol=mol, with_non0tab=False, sort_grids=True)
+
+    nao, nmoa = mo_coeff[0].shape
+    nao, nmob = mo_coeff[1].shape
+    mocca = mo_coeff[0][:,mo_occ[0]>0]
+    moccb = mo_coeff[1][:,mo_occ[1]>0]
+    nocca = mocca.shape[1]
+    noccb = moccb.shape[1]
+    
+    ni = mf._numint
+    omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, mol.spin)
+    hybrid = ni.libxc.is_hybrid_xc(mf.xc)
+    assert not mf.do_nlc()
+    hermi = 1
+
+    rho0, vxc, fxc = ni.cache_xc_kernel(mol, grids, mf.xc,
+                                        mo_coeff, mo_occ, 1)
+    v1 = ni.nr_uks_fxc(mol, grids, mf.xc, None, dms, 0, hermi,
+                        rho0, vxc, fxc, max_memory=None)
+    nset = dms.shape[1]
+    v1vo = cupy.empty([nset, nmoa*nocca+nmob*noccb])
+    v1vo[:,:nmoa*nocca] = jk._ao2mo(v1[0], mocca, mo_coeff[0]).reshape(-1,nmoa*nocca)
+    v1vo[:,nmoa*nocca:] = jk._ao2mo(v1[1], moccb, mo_coeff[1]).reshape(-1,nmob*noccb)
+    if hybrid:
+        vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, (mocca, moccb), hermi=1)
+        vk *= hyb
+        if omega > 1e-10:
+            _, vk_lr = hessobj.get_jk_mo(mol, dms, mo_coeff, (mocca, moccb), 
+                                         hermi, with_j=False, omega=omega) 
+            vk_lr *= (alpha-hyb)
+            vk += vk_lr
+        v1vo += vj - vk
+    else:
+        v1vo += hessobj.get_jk_mo(mol, dms, mo_coeff, (mocca, moccb), 
+                                  hermi=1, with_k=False)[0]
+    return v1vo
+
 
 class Hessian(rhf_hess.HessianBase):
     '''Non-relativistic UKS hessian'''
@@ -857,6 +908,9 @@ def __init__(self, mf):
     solve_mo1 = uhf_hess.Hessian.solve_mo1
     partial_hess_elec = partial_hess_elec
     make_h1 = make_h1
+    gen_vind = uhf_hess.gen_vind
+    get_jk_mo = rhf_hess._get_jk_mo
+    get_veff_resp_mo = get_veff_resp_mo
 
 from gpu4pyscf import dft
 dft.uks.UKS.Hessian = lib.class_as_method(Hessian)
diff --git a/gpu4pyscf/properties/ir.py b/gpu4pyscf/properties/ir.py
index 33a8fd3a..c398b99d 100644
--- a/gpu4pyscf/properties/ir.py
+++ b/gpu4pyscf/properties/ir.py
@@ -93,8 +93,9 @@ def eval_ir_freq_intensity(mf, hessian_obj):
     h1ao = hessian_obj.make_h1(mo_coeff, mo_occ, None, atmlst)
     # TODO: compact with hessian method, which can save one time cphf solve.
     # ! Different from PySCF, mo1 is all in mo!
+    fx = hessian_obj.gen_vind(mo_coeff, mo_occ)
     mo1, mo_e1 = hessian_obj.solve_mo1(mo_energy, mo_coeff, mo_occ, h1ao,
-                                       None, atmlst, hessian_obj.max_memory, log)  
+                                       fx, atmlst, hessian_obj.max_memory, log)  
     mo1 = cupy.asarray(mo1)
     mo_e1 = cupy.asarray(mo_e1)
     
diff --git a/gpu4pyscf/scf/jk.py b/gpu4pyscf/scf/jk.py
index 8cc8d99d..aaf0b119 100644
--- a/gpu4pyscf/scf/jk.py
+++ b/gpu4pyscf/scf/jk.py
@@ -63,9 +63,8 @@
 # TODO: test different size for L2 cache efficiency
 NAO_IN_GROUP = 1500
 
-def _jk_task(mol, dms, vhfopt, task_list, 
+def _jk_task(mol, dms, vhfopt, task_list, hermi=0,
              device_id=0, with_j=True, with_k=True, verbose=0):
-    n_dm = dms.shape[0]
     nao, _ = vhfopt.coeff.shape
     uniq_l_ctr = vhfopt.uniq_l_ctr
     uniq_l = uniq_l_ctr[:,0]
@@ -80,6 +79,10 @@ def _jk_task(mol, dms, vhfopt, task_list,
         cput0 = log.init_timer()
         dms = cp.asarray(dms)
 
+        if hermi == 0:
+            # Contract the tril and triu parts separately
+            dms = cp.vstack([dms, dms.transpose(0,2,1)])
+        n_dm = dms.shape[0]
         tile_q_ptr = ctypes.cast(vhfopt.tile_q_cond.data.ptr, ctypes.c_void_p)
         q_ptr = ctypes.cast(vhfopt.q_cond.data.ptr, ctypes.c_void_p)
         s_ptr = lib.c_null_ptr()
@@ -142,6 +145,18 @@ def _jk_task(mol, dms, vhfopt, task_list,
                         t1, t1p = log.timer_debug1(msg, *t1), t1
                         timing_counter[llll] += t1[1] - t1p[1]
                         kern_counts += 1
+        if with_j:
+            if hermi == 1:
+                vj *= 2.
+            else:
+                vj, vjT = vj[:n_dm//2], vj[n_dm//2:]
+                vj += vjT.transpose(0,2,1)
+        if with_k:
+            if hermi == 1:
+                vk = transpose_sum(vk)
+            else:
+                vk, vkT = vk[:n_dm//2], vk[n_dm//2:]
+                vk += vkT.transpose(0,2,1)
     return vj, vk, kern_counts, timing_counter
 
 def get_jk(mol, dm, hermi=0, vhfopt=None, with_j=True, with_k=True, verbose=None):
@@ -161,9 +176,7 @@ def get_jk(mol, dm, hermi=0, vhfopt=None, with_j=True, with_k=True, verbose=None
     #:dms = cp.einsum('pi,nij,qj->npq', vhfopt.coeff, dms, vhfopt.coeff)
     dms = sandwich_dot(dms, vhfopt.coeff.T)
     dms = cp.asarray(dms, order='C')
-    if hermi == 0:
-        # Contract the tril and triu parts separately
-        dms = cp.vstack([dms, dms.transpose(0,2,1)])
+
     n_dm = dms.shape[0]
 
     assert with_j or with_k
@@ -187,7 +200,7 @@ def get_jk(mol, dm, hermi=0, vhfopt=None, with_j=True, with_k=True, verbose=None
         for device_id in range(_num_devices):
             future = executor.submit(
                 _jk_task,
-                mol, dms, vhfopt, task_list[device_id],
+                mol, dms, vhfopt, task_list[device_id], hermi=hermi,
                 with_j=with_j, with_k=with_k, verbose=verbose, 
                 device_id=device_id)
             futures.append(future)
@@ -214,28 +227,19 @@ def get_jk(mol, dm, hermi=0, vhfopt=None, with_j=True, with_k=True, verbose=None
     vj = vk = None
     if with_k:
         vk = reduce_to_device(vk_dist, inplace=True)
-        if hermi == 1:
-            vk = transpose_sum(vk)
-        else:
-            vk, vkT = vk[:n_dm//2], vk[n_dm//2:]
-            vk += vkT.transpose(0,2,1)
         #:vk = cp.einsum('pi,npq,qj->nij', vhfopt.coeff, vk, vhfopt.coeff)
         vk = sandwich_dot(vk, vhfopt.coeff)
         vk = vk.reshape(dm.shape)
 
     if with_j:
         vj = reduce_to_device(vj_dist, inplace=True)
-        if hermi == 1:
-            vj *= 2.
-        else:
-            vj, vjT = vj[:n_dm//2], vj[n_dm//2:]
-            vj += vjT.transpose(0,2,1)
         vj = transpose_sum(vj)
         #:vj = cp.einsum('pi,npq,qj->nij', vhfopt.coeff, vj, vhfopt.coeff)
         vj = sandwich_dot(vj, vhfopt.coeff)
         vj = vj.reshape(dm.shape)
 
     h_shls = vhfopt.h_shls
+    assert len(h_shls) == 0
     if h_shls:
         cput1 = log.timer_debug1('get_jk pass 1 on gpu', *cput0)
         log.debug3('Integrals for %s functions on CPU', l_symb[LMAX+1])
diff --git a/gpu4pyscf/scf/tests/test_scf_jk.py b/gpu4pyscf/scf/tests/test_scf_jk.py
index 4f3dfb1b..4e33ebeb 100644
--- a/gpu4pyscf/scf/tests/test_scf_jk.py
+++ b/gpu4pyscf/scf/tests/test_scf_jk.py
@@ -16,7 +16,7 @@
 import unittest
 import numpy as np
 import pyscf
-from pyscf import lib
+from pyscf import lib, gto
 from gpu4pyscf.scf import jk
 from pyscf.scf.hf import get_jk
 
@@ -126,4 +126,26 @@ def test_jk_hermi0():
 
     assert abs(vj2+vj3 - vj1).max() < 1e-9
     assert abs(vk2+vk3 - vk1).max() < 1e-9
-    
\ No newline at end of file
+    
+def test_jk_qz():
+    basis = {
+        'H': gto.basis.parse('''
+H H
+      1.0240000              1.0000000
+                             ''')
+    }
+    mol = pyscf.M(
+        atom = '''
+        H  -0.757    0.   0.0
+        H   0.757    0.   0.0
+        ''',
+        basis=basis,
+        unit='B',)
+    nao = mol.nao
+    dm = np.random.rand(nao, nao)
+    vj_gpu, vk_gpu = jk.get_jk(mol, dm)
+
+    vj, vk = get_jk(mol, dm)
+
+    assert np.linalg.norm(vj_gpu.get() - vj) < 1e-9
+    assert np.linalg.norm(vk_gpu.get() - vk) < 1e-9 
diff --git a/gpu4pyscf/solvent/hessian/pcm.py b/gpu4pyscf/solvent/hessian/pcm.py
index cd22b710..bd290b3c 100644
--- a/gpu4pyscf/solvent/hessian/pcm.py
+++ b/gpu4pyscf/solvent/hessian/pcm.py
@@ -27,6 +27,7 @@
 from gpu4pyscf.df import int3c2e
 from gpu4pyscf.lib.cupy_helper import contract
 from gpu4pyscf.lib import logger
+from gpu4pyscf.hessian.jk import _ao2mo
 
 def hess_nuc(pcmobj):
     if not pcmobj._intermediates:
@@ -291,6 +292,30 @@ def make_h1(self, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
             return h1aoa, h1aob
         else:
             raise NotImplementedError('Base object is not supported')
+        
+    def get_veff_resp_mo(self, mol, dms, mo_coeff, mo_occ, hermi=1):
+        v1vo = super().get_veff_resp_mo(mol, dms, mo_coeff, mo_occ, hermi=hermi)
+        if not self.base.with_solvent.equilibrium_solvation:
+            return v1vo
+        v_solvent = self.base.with_solvent._B_dot_x(dms)
+        if isinstance(self.base, scf.uhf.UHF):
+            n_dm = dms.shape[1]
+            mocca = mo_coeff[0][:,mo_occ[0]>0]
+            moccb = mo_coeff[1][:,mo_occ[1]>0]
+            moa, mob = mo_coeff
+            nmoa = moa.shape[1]
+            nocca = mocca.shape[1]
+            v1vo_sol = v_solvent[0] + v_solvent[1]
+            v1vo[:,:nmoa*nocca] += _ao2mo(v1vo_sol, mocca, moa).reshape(n_dm,-1)
+            v1vo[:,nmoa*nocca:] += _ao2mo(v1vo_sol, moccb, mob).reshape(n_dm,-1)
+        elif isinstance(self.base, scf.hf.RHF):
+            n_dm = dms.shape[0]
+            mocc = mo_coeff[:,mo_occ>0]
+            v1vo += _ao2mo(v_solvent, mocc, mo_coeff).reshape(n_dm,-1)
+        else:
+            raise NotImplementedError('Base object is not supported')
+        return v1vo
+    
     def _finalize(self):
         # disable _finalize. It is called in grad_method.kernel method
         # where self.de was not yet initialized.
diff --git a/gpu4pyscf/solvent/hessian/smd.py b/gpu4pyscf/solvent/hessian/smd.py
index 3e8c5238..4bfad79b 100644
--- a/gpu4pyscf/solvent/hessian/smd.py
+++ b/gpu4pyscf/solvent/hessian/smd.py
@@ -26,6 +26,7 @@
 from gpu4pyscf.solvent.grad import smd as smd_grad
 from gpu4pyscf.solvent.grad import pcm as pcm_grad
 from gpu4pyscf.solvent.hessian import pcm as pcm_hess
+from gpu4pyscf.hessian.jk import _ao2mo
 
 def get_cds(smdobj):
     mol = smdobj.mol
@@ -171,6 +172,31 @@ def make_h1(self, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
             return h1aoa, h1aob
         else:
             raise NotImplementedError('Base object is not supported')
+
+    def get_veff_resp_mo(self, mol, dms, mo_coeff, mo_occ, hermi=1):
+        v1vo = super().get_veff_resp_mo(mol, dms, mo_coeff, mo_occ, hermi=hermi)
+        if not self.base.with_solvent.equilibrium_solvation:
+            return v1vo
+        v_solvent = self.base.with_solvent._B_dot_x(dms)
+        
+        if isinstance(self.base, scf.uhf.UHF):
+            n_dm = dms.shape[1]
+            mocca = mo_coeff[0][:,mo_occ[0]>0]
+            moccb = mo_coeff[1][:,mo_occ[1]>0]
+            moa, mob = mo_coeff
+            nmoa = moa.shape[1]
+            nocca = mocca.shape[1]
+            v1vo_sol = v_solvent[0] + v_solvent[1]
+            v1vo[:,:nmoa*nocca] += _ao2mo(v1vo_sol, mocca, moa).reshape(n_dm,-1)
+            v1vo[:,nmoa*nocca:] += _ao2mo(v1vo_sol, moccb, mob).reshape(n_dm,-1)
+        elif isinstance(self.base, scf.hf.RHF):
+            n_dm = dms.shape[0]
+            mocc = mo_coeff[:,mo_occ>0]
+            v1vo += _ao2mo(v_solvent, mocc, mo_coeff).reshape(n_dm,-1)
+        else:
+            raise NotImplementedError('Base object is not supported')
+        return v1vo
+
     def _finalize(self):
         # disable _finalize. It is called in grad_method.kernel method
         # where self.de was not yet initialized.
diff --git a/gpu4pyscf/solvent/tests/test_smd_hessian.py b/gpu4pyscf/solvent/tests/test_smd_hessian.py
index 82fb8ea2..4134d47d 100644
--- a/gpu4pyscf/solvent/tests/test_smd_hessian.py
+++ b/gpu4pyscf/solvent/tests/test_smd_hessian.py
@@ -258,6 +258,7 @@ def test_to_cpu(self):
         hess_gpu = hessobj.kernel()
         hessobj = hessobj.to_cpu()
         hess_cpu = hessobj.kernel()
+        print(numpy.linalg.norm(hess_cpu - hess_gpu))
         assert numpy.linalg.norm(hess_cpu - hess_gpu) < 1e-5
 
 if __name__ == "__main__":
diff --git a/gpu4pyscf/tests/test_dft.py b/gpu4pyscf/tests/test_dft.py
index 860a435c..182313b9 100644
--- a/gpu4pyscf/tests/test_dft.py
+++ b/gpu4pyscf/tests/test_dft.py
@@ -72,7 +72,7 @@ def test_b3lyp_with_d3bj(self):
 
         h = mf.Hessian().kernel()
         assert np.abs(cupy.linalg.norm(h) - 3.7684319231335377) < 1e-4
-
+    
     @pytest.mark.smoke
     def test_b3lyp_d3bj(self):
         print('-------- DFRKS with D3(BJ) -------')

From 41c27c9cb9bdb11439ecd60e89aeb27162112b6e Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Thu, 19 Dec 2024 22:32:28 +0000
Subject: [PATCH 02/49] fixed bug in df.hessian.uhf

---
 gpu4pyscf/df/hessian/uhf.py |   7 +-
 gpu4pyscf/df/int3c2e.py     | 185 ------------------------------------
 gpu4pyscf/hessian/jk.py     |  11 +--
 gpu4pyscf/hessian/rhf.py    |  76 ++++++++-------
 4 files changed, 47 insertions(+), 232 deletions(-)

diff --git a/gpu4pyscf/df/hessian/uhf.py b/gpu4pyscf/df/hessian/uhf.py
index 0345c32c..4182c25d 100644
--- a/gpu4pyscf/df/hessian/uhf.py
+++ b/gpu4pyscf/df/hessian/uhf.py
@@ -349,7 +349,9 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     hk_ao_ao *= 2.0
     e1 = cupy.zeros([len(atmlst),len(atmlst),3,3])
     ej = hj_ipip
-    ek = hk_ipip
+    ek = None
+    if with_k:
+        ek = hk_ipip
     for i0, ia in enumerate(atmlst):
         shl0, shl1, p0, p1 = aoslices[ia]
         e1[i0,i0] -= cupy.sum(h1aa[p0:p1], axis=0)
@@ -401,7 +403,8 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
         for j0 in range(i0):
             e1[j0,i0] = e1[i0,j0].T
             ej[j0,i0] = ej[i0,j0].T
-            ek[j0,i0] = ek[i0,j0].T
+            if with_k:
+                ek[j0,i0] = ek[i0,j0].T
     t1 = log.timer_debug1('hcore contribution', *t1)
     log.timer('UHF partial hessian', *time0)
     return e1, ej, ek
diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py
index 68630520..f89fb07c 100644
--- a/gpu4pyscf/df/int3c2e.py
+++ b/gpu4pyscf/df/int3c2e.py
@@ -1071,191 +1071,6 @@ def get_int3c2e_ip2_wjk(intopt, dm0_tag, with_k=True, omega=None):
         wk = reduce_to_device(wk_total, inplace=True)
     return wj, wk
 
-def _int3c2e_ipip1_hjk(intopt, task_list, rhoj, rhok, dm0, orbo,
-                       device_id=0, with_k=True, omega=None):
-    with cupy.cuda.Device(device_id), _streams[device_id]:
-        rhoj = cupy.asarray(rhoj)
-        rhok = cupy.asarray(rhok)
-        orbo = cupy.asarray(orbo)
-        dm0 = cupy.asarray(dm0)
-        nao = dm0.shape[0]
-        hj = cupy.zeros([nao,9])
-        hk = None
-        if with_k:
-            hk = cupy.zeros([nao,9])
-        for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, task_list=task_list,
-                                                                ip_type='ipip1', omega=omega):
-            tmp = contract('xpji,ij->xpi', int3c_blk, dm0[i0:i1,j0:j1])
-            hj[i0:i1] += contract('xpi,p->ix', tmp, rhoj[k0:k1])
-            if with_k:
-                rhok_tmp = contract('por,ir->pio', rhok[k0:k1], orbo[i0:i1])
-                rhok_tmp = contract('pio,jo->pij', rhok_tmp, orbo[j0:j1])
-                hk[i0:i1] += contract('xpji,pij->ix', int3c_blk, rhok_tmp)
-        hj = hj.reshape([nao,3,3])
-        if with_k:
-            hk = hk.reshape([nao,3,3])
-    return hj, hk
-
-def _int3c2e_ipvip1_hjk(intopt, task_list, rhoj, rhok, dm0, orbo, 
-                        device_id=0, with_k=True, omega=None):
-    with cupy.cuda.Device(device_id), _streams[device_id]:
-        rhoj = cupy.asarray(rhoj)
-        rhok = cupy.asarray(rhok)
-        orbo = cupy.asarray(orbo)
-        dm0 = cupy.asarray(dm0)
-        nao = dm0.shape[0]
-        hj = cupy.zeros([nao,nao,9])
-        hk = None
-        if with_k:
-            hk = cupy.zeros([nao,nao,9])
-        for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, task_list=task_list,
-                                                                ip_type='ipvip1', omega=omega):
-            tmp = contract('xpji,ij->xpij', int3c_blk, dm0[i0:i1,j0:j1])
-            hj[i0:i1,j0:j1] += contract('xpij,p->ijx', tmp, rhoj[k0:k1])
-            if with_k:
-                rhok_tmp = contract('por,ir->pio', rhok[k0:k1], orbo[i0:i1])
-                rhok_tmp = contract('pio,jo->pji', rhok_tmp, orbo[j0:j1])
-                hk[i0:i1,j0:j1] += contract('xpji,pji->ijx', int3c_blk, rhok_tmp)
-        hj = hj.reshape([nao,nao,3,3])
-        if with_k:
-            hk = hk.reshape([nao,nao,3,3])
-    return hj, hk
-
-def _int3c2e_ip1ip2_hjk(intopt, task_list, rhoj, rhok, dm0, orbo, 
-                        device_id=0, with_k=True, omega=None):
-    with cupy.cuda.Device(device_id), _streams[device_id]:
-        naux = rhok.shape[0]
-        rhoj = cupy.asarray(rhoj)
-        rhok = cupy.asarray(rhok)
-        orbo = cupy.asarray(orbo)
-        dm0 = cupy.asarray(dm0)
-        nao = dm0.shape[0]
-        hj = cupy.zeros([nao,naux,9])
-        hk = None
-        if with_k:
-            hk = cupy.zeros([nao,naux,9])
-        for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, task_list=task_list,
-                                                                ip_type='ip1ip2', omega=omega):
-            tmp = contract('xpji,ij->xpi', int3c_blk, dm0[i0:i1,j0:j1])
-            hj[i0:i1,k0:k1] += contract('xpi,p->ipx', tmp, rhoj[k0:k1])
-            if with_k:
-                rhok_tmp = contract('por,ir->pio', rhok[k0:k1], orbo[i0:i1])
-                rhok_tmp = contract('pio,jo->pij', rhok_tmp, orbo[j0:j1])
-                hk[i0:i1,k0:k1] += contract('xpji,pij->ipx', int3c_blk, rhok_tmp)
-        hj = hj.reshape([nao,naux,3,3])
-        if with_k:
-            hk = hk.reshape([nao,naux,3,3])
-    return hj, hk
-
-def _int3c2e_ipip2_hjk(intopt, task_list, rhoj, rhok, dm0, orbo, 
-                       device_id=0, with_k=True, omega=None):
-    with cupy.cuda.Device(device_id), _streams[device_id]:
-        naux = rhok.shape[0]
-        rhoj = cupy.asarray(rhoj)
-        rhok = cupy.asarray(rhok)
-        orbo = cupy.asarray(orbo)
-        dm0 = cupy.asarray(dm0)
-        hj = cupy.zeros([naux,9])
-        hk = None
-        if with_k:
-            hk = cupy.zeros([naux,9])
-        for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, task_list=task_list, 
-                                                                ip_type='ipip2', omega=omega):
-            tmp = contract('xpji,ij->xp', int3c_blk, dm0[i0:i1,j0:j1])
-            hj[k0:k1] += contract('xp,p->px', tmp, rhoj[k0:k1])
-            if with_k:
-                rhok_tmp = contract('por,jr->pjo', rhok[k0:k1], orbo[j0:j1])
-                rhok_tmp = contract('pjo,io->pji', rhok_tmp, orbo[i0:i1])
-                hk[k0:k1] += contract('xpji,pji->px', int3c_blk, rhok_tmp)
-        hj = hj.reshape([naux,3,3])
-        if with_k:
-            hk = hk.reshape([naux,3,3])
-    return hj, hk
-
-def get_int3c2e_hjk(intopt, task_type, rhoj, rhok, dm0_tag, with_k=True, omega=None):
-    if task_type == 'ipip1':  task_fn = _int3c2e_ipip1_hjk
-    if task_type == 'ipip2':  task_fn = _int3c2e_ipip2_hjk
-    if task_type == 'ip1ip2': task_fn = _int3c2e_ip1ip2_hjk
-    if task_type == 'ipvip1': task_fn = _int3c2e_ipvip1_hjk
-
-    orbo = cupy.asarray(dm0_tag.occ_coeff, order='C')
-    futures = []
-    ncp_k = len(intopt.aux_log_qs)
-    ncp_ij = len(intopt.log_qs)
-    tasks = np.array(list(itertools.product(range(ncp_k), range(ncp_ij))))
-    task_list = []
-    for device_id in range(_num_devices):
-        task_list.append(tasks[device_id::_num_devices])
-    
-    cupy.cuda.get_current_stream().synchronize()
-    with ThreadPoolExecutor(max_workers=_num_devices) as executor:
-        for device_id in range(_num_devices):
-            future = executor.submit(
-                task_fn, intopt, task_list[device_id], 
-                rhoj, rhok, dm0_tag, orbo, with_k=with_k, device_id=device_id, omega=omega)
-            futures.append(future)
-    
-    hj_total = []
-    hk_total = []
-    for future in futures:
-        hj, hk = future.result()
-        hj_total.append(hj)
-        hk_total.append(hk)
-        
-    hj = hk = None
-    hj = reduce_to_device(hj_total, inplace=True)
-    if with_k:
-        hk = reduce_to_device(hk_total, inplace=True)
-    return hj, hk
-
-def get_hess_nuc_elec(mol, dm):
-    '''
-    calculate int1e_ipiprinv contribution
-    '''
-    coords = mol.atom_coords()
-    charges = cupy.asarray(mol.atom_charges(), dtype=np.float64)
-
-    fakemol = gto.fakemol_for_charges(coords)
-    fakemol.output = mol.output
-    fakemol.verbose = mol.verbose
-    fakemol.stdout = mol.stdout
-    intopt = VHFOpt(mol, fakemol, 'int2e')
-    intopt.build(1e-14, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE)
-    dm = intopt.sort_orbitals(cupy.asarray(dm), axis=[0,1])
-
-    natm = mol.natm
-    nao = mol.nao
-    hcore_diag = cupy.zeros([9,natm])
-    hcore_aa = cupy.zeros([9,natm,nao])
-    for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ipip1'):
-        haa = contract('xpji,ij->xpi', int3c_blk, dm[i0:i1,j0:j1])
-        hcore_aa[:,k0:k1,i0:i1] += haa
-        hcore_diag[:,k0:k1] -= contract('xpji,ij->xp', int3c_blk, dm[i0:i1,j0:j1])
-
-    hcore_ab = cupy.zeros([9,natm,nao])
-    for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ipvip1'):
-        hab = contract('xpji,ij->xpi', int3c_blk, dm[i0:i1,j0:j1])
-        hcore_ab[:,k0:k1,i0:i1] += hab
-        hcore_diag[:,k0:k1] -= contract('xpji,ij->xp', int3c_blk, dm[i0:i1,j0:j1])
-
-    hcore_diag = contract('xp,p->xp', hcore_diag, charges)
-    hcore_aa = contract('xpj,p->xpj', hcore_aa, charges)
-    hcore_ab = contract('xpj,p->xpj', hcore_ab, charges)
-
-    aoslices = mol.aoslice_by_atom()
-    ao2atom = get_ao2atom(intopt, aoslices)
-
-    hcore_aa = contract('xpj,jq->xpq', hcore_aa, ao2atom).reshape([3,3,natm,natm])
-    hcore_ab = contract('xpj,jq->xpq', hcore_ab, ao2atom).reshape([3,3,natm,natm])
-    hcore = hcore_aa + hcore_aa.transpose([1,0,3,2])
-    hcore+= hcore_ab.transpose([1,0,2,3]) + hcore_ab.transpose([0,1,3,2])
-    hcore_diag = hcore_diag.reshape([3,3,natm])
-    idx = np.arange(natm)
-    for x in range(3):
-        for y in range(3):
-            hcore[x,y,idx,idx] += hcore_diag[x,y]
-    return hcore
-
 def get_int3c2e_ip_slice(intopt, cp_aux_id, ip_type, out=None, omega=None, stream=None):
     '''
     Generate int3c2e_ip slice along k, full dimension in ij
diff --git a/gpu4pyscf/hessian/jk.py b/gpu4pyscf/hessian/jk.py
index 7f3eeb60..6dedd447 100644
--- a/gpu4pyscf/hessian/jk.py
+++ b/gpu4pyscf/hessian/jk.py
@@ -265,8 +265,8 @@ def get_jk(mol, dm, mo_coeff, mocc, hermi=0, vhfopt=None,
         if isinstance(mocc, tuple):
             mocca, moccb = mocc
             moa, mob = mo_coeff
-            nmoa, nmob = moa.shape[1], mob.shape[1]
-            nocca, noccb = mocca.shape[1], moccb.shape[1]
+            nmoa = moa.shape[1]
+            nocca = mocca.shape[1]
             n_dm_2 = n_dm//2
             if with_j:
                 vjab = vj1[:n_dm_2] + vj1[n_dm_2:]
@@ -280,17 +280,10 @@ def get_jk(mol, dm, mo_coeff, mocc, hermi=0, vhfopt=None,
             if with_j:
                 vj1[:,idy,idx] = vj1[:,idx,idy]
                 vj += _ao2mo(cp.asarray(vj1), mocc, mo_coeff)
-                #for i, v in enumerate(vj1):
-                #    vj[i] += coeff.T.dot(cp.asarray(v)).dot(coeff)
             if with_k:
                 if hermi:
                     vk1[:,idy,idx] = vk1[:,idx,idy]
                 vk += _ao2mo(cp.asarray(vk1), mocc, mo_coeff)
-                #for i, v in enumerate(vk1):
-                #    vk[i] += coeff.T.dot(cp.asarray(v)).dot(coeff)
-
-        # TODO: convert vj and vk into MO
         log.timer_debug1('get_jk pass 2 for h functions on cpu', *cput1)
-    
     log.timer('vj and vk', *cput0)
     return vj, vk
diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py
index 96a90359..bec9d0e0 100644
--- a/gpu4pyscf/hessian/rhf.py
+++ b/gpu4pyscf/hessian/rhf.py
@@ -25,7 +25,7 @@
 from collections import Counter
 from concurrent.futures import ThreadPoolExecutor
 from pyscf.hessian import rhf as rhf_hess_cpu
-from pyscf import lib
+from pyscf import lib, gto
 from pyscf.gto import ATOM_OF
 # import _response_functions to load gen_response methods in SCF class
 from gpu4pyscf.scf import _response_functions  # noqa
@@ -728,46 +728,50 @@ def hess_nuc_elec(mol, dm):
     '''
     calculate hessian contribution due to (nuc, elec) pair
     '''
+    from gpu4pyscf.df import int3c2e
+    coords = mol.atom_coords()
+    charges = cupy.asarray(mol.atom_charges(), dtype=np.float64)
+
+    fakemol = gto.fakemol_for_charges(coords)
+    fakemol.output = mol.output
+    fakemol.verbose = mol.verbose
+    fakemol.stdout = mol.stdout
+    intopt = int3c2e.VHFOpt(mol, fakemol, 'int2e')
+    intopt.build(1e-14, diag_block_with_triu=True, aosym=False, 
+                 group_size=int3c2e.BLKSIZE, group_size_aux=int3c2e.BLKSIZE)
+    dm = intopt.sort_orbitals(cupy.asarray(dm), axis=[0,1])
 
-    '''
-    nao = mol.nao
-    aoslices = mol.aoslice_by_atom()
     natm = mol.natm
-    hcore = numpy.zeros([3,3,natm,natm])
-    # CPU version
-    for ia in range(mol.natm):
-        ish0, ish1, i0, i1 = aoslices[ia]
-        zi = mol.atom_charge(ia)
-        with mol.with_rinv_at_nucleus(ia):
-            rinv2aa = mol.intor('int1e_ipiprinv', comp=9).reshape([3,3,nao,nao])
-            rinv2ab = mol.intor('int1e_iprinvip', comp=9).reshape([3,3,nao,nao])
-            rinv2aa *= zi
-            rinv2ab *= zi
-
-            hcore[:,:,ia,ia] -= numpy.einsum('xypq,pq->xy', rinv2aa+rinv2ab, dm)
-
-            haa = numpy.einsum('xypq,pq->xyp', rinv2aa, dm)
-            hab = numpy.einsum('xypq,pq->xyp', rinv2ab, dm)
-
-            haa = [haa[:,:,p0:p1].sum(axis=2) for p0,p1 in aoslices[:,2:]]
-            hab = [hab[:,:,p0:p1].sum(axis=2) for p0,p1 in aoslices[:,2:]]
-
-            haa = numpy.stack(haa, axis=2)
-            hab = numpy.stack(hab, axis=2)
-
-            hcore[:,:,ia] += haa
-            hcore[:,:,ia] += hab.transpose([1,0,2])
-
-            hcore[:,:,:,ia] += haa.transpose([1,0,2])
-            hcore[:,:,:,ia] += hab
+    nao = mol.nao
+    hcore_diag = cupy.zeros([9,natm])
+    hcore_aa = cupy.zeros([9,natm,nao])
+    for i0,i1,j0,j1,k0,k1,int3c_blk in int3c2e.loop_int3c2e_general(intopt, ip_type='ipip1'):
+        haa = contract('xpji,ij->xpi', int3c_blk, dm[i0:i1,j0:j1])
+        hcore_aa[:,k0:k1,i0:i1] += haa
+        hcore_diag[:,k0:k1] -= contract('xpji,ij->xp', int3c_blk, dm[i0:i1,j0:j1])
+
+    hcore_ab = cupy.zeros([9,natm,nao])
+    for i0,i1,j0,j1,k0,k1,int3c_blk in int3c2e.loop_int3c2e_general(intopt, ip_type='ipvip1'):
+        hab = contract('xpji,ij->xpi', int3c_blk, dm[i0:i1,j0:j1])
+        hcore_ab[:,k0:k1,i0:i1] += hab
+        hcore_diag[:,k0:k1] -= contract('xpji,ij->xp', int3c_blk, dm[i0:i1,j0:j1])
+
+    hcore_diag = contract('xp,p->xp', hcore_diag, charges)
+    hcore_aa = contract('xpj,p->xpj', hcore_aa, charges)
+    hcore_ab = contract('xpj,p->xpj', hcore_ab, charges)
 
-    hcore = cupy.asarray(hcore)
-    '''
-    from gpu4pyscf.df import int3c2e
-    hcore = int3c2e.get_hess_nuc_elec(mol, dm)
+    aoslices = mol.aoslice_by_atom()
+    ao2atom = int3c2e.get_ao2atom(intopt, aoslices)
+
+    hcore_aa = contract('xpj,jq->xpq', hcore_aa, ao2atom).reshape([3,3,natm,natm])
+    hcore_ab = contract('xpj,jq->xpq', hcore_ab, ao2atom).reshape([3,3,natm,natm])
+    hcore = hcore_aa + hcore_aa.transpose([1,0,3,2])
+    hcore+= hcore_ab.transpose([1,0,2,3]) + hcore_ab.transpose([0,1,3,2])
+    hcore_diag = hcore_diag.reshape([3,3,natm])
+    idx = np.arange(natm)
+    hcore[:,:,idx,idx] += hcore_diag
     return hcore * 2.0
 
-
 def kernel(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None):
     cput0 = (logger.process_clock(), logger.perf_counter())
     if mo_energy is None: mo_energy = hessobj.base.mo_energy

From 31418f2be53a13595bbf7bb3455c49501742820a Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Thu, 19 Dec 2024 23:22:07 +0000
Subject: [PATCH 03/49] update license

---
 gpu4pyscf/df/hessian/jk.py | 22 +++++++++++-----------
 gpu4pyscf/hessian/jk.py    | 22 +++++++++++-----------
 2 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/gpu4pyscf/df/hessian/jk.py b/gpu4pyscf/df/hessian/jk.py
index 16010bda..3fc7ac08 100644
--- a/gpu4pyscf/df/hessian/jk.py
+++ b/gpu4pyscf/df/hessian/jk.py
@@ -1,17 +1,17 @@
-# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved.
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
 #
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
 #
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 #
-# You should have received a copy of the GNU General Public License
-# along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 import ctypes
 import itertools
diff --git a/gpu4pyscf/hessian/jk.py b/gpu4pyscf/hessian/jk.py
index 6dedd447..60525fc8 100644
--- a/gpu4pyscf/hessian/jk.py
+++ b/gpu4pyscf/hessian/jk.py
@@ -1,17 +1,17 @@
-# Copyright 2024 The GPU4PySCF Authors. All Rights Reserved.
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
 #
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
 #
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 #
-# You should have received a copy of the GNU General Public License
-# along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 '''
 Compute J/K matrices for Hessian

From a7103a94d622884e3fc34264519d336ba0540e73 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Thu, 19 Dec 2024 23:42:26 +0000
Subject: [PATCH 04/49] format code

---
 gpu4pyscf/df/hessian/rhf.py                   |  3 ---
 gpu4pyscf/df/hessian/rks.py                   |  3 ---
 .../df/hessian/tests/test_df_rhf_hessian.py   | 22 +++++++++---------
 .../df/hessian/tests/test_df_rks_hessian.py   | 22 +++++++++---------
 gpu4pyscf/df/hessian/uhf.py                   |  3 ---
 gpu4pyscf/df/hessian/uks.py                   |  4 ----
 gpu4pyscf/scf/tests/test_scf_jk.py            | 23 -------------------
 gpu4pyscf/solvent/tests/test_smd_hessian.py   |  1 -
 8 files changed, 22 insertions(+), 59 deletions(-)

diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py
index bd806c50..d4fda5e3 100644
--- a/gpu4pyscf/df/hessian/rhf.py
+++ b/gpu4pyscf/df/hessian/rhf.py
@@ -614,10 +614,7 @@ class Hessian(rhf_hess.Hessian):
 
     from gpu4pyscf.lib.utils import to_gpu, device
 
-    #__init__ = rhf_hess.Hessian.__init__
     auxbasis_response = 1
     partial_hess_elec = partial_hess_elec
     make_h1 = make_h1
-    #kernel = rhf_hess.kernel
-    #hess = kernel
     get_jk_mo = _get_jk_mo
diff --git a/gpu4pyscf/df/hessian/rks.py b/gpu4pyscf/df/hessian/rks.py
index c6d65daa..2606e8e4 100644
--- a/gpu4pyscf/df/hessian/rks.py
+++ b/gpu4pyscf/df/hessian/rks.py
@@ -108,10 +108,7 @@ class Hessian(rks_hess.Hessian):
     '''Non-relativistic RKS hessian'''
     from gpu4pyscf.lib.utils import to_gpu, device
 
-    #__init__ = rks_hess.Hessian.__init__
     auxbasis_response = 1
     partial_hess_elec = partial_hess_elec
     make_h1 = make_h1
-    #kernel = rhf_hess.kernel
-    #hess = kernel
     get_jk_mo = df_rhf_hess._get_jk_mo
diff --git a/gpu4pyscf/df/hessian/tests/test_df_rhf_hessian.py b/gpu4pyscf/df/hessian/tests/test_df_rhf_hessian.py
index b8560002..a3e13260 100644
--- a/gpu4pyscf/df/hessian/tests/test_df_rhf_hessian.py
+++ b/gpu4pyscf/df/hessian/tests/test_df_rhf_hessian.py
@@ -1,17 +1,17 @@
-# Copyright 2024 The GPU4PySCF Authors. All Rights Reserved.
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
 #
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
 #
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 #
-# You should have received a copy of the GNU General Public License
-# along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 import unittest
 import numpy
diff --git a/gpu4pyscf/df/hessian/tests/test_df_rks_hessian.py b/gpu4pyscf/df/hessian/tests/test_df_rks_hessian.py
index 5a853a95..f737e92a 100644
--- a/gpu4pyscf/df/hessian/tests/test_df_rks_hessian.py
+++ b/gpu4pyscf/df/hessian/tests/test_df_rks_hessian.py
@@ -1,17 +1,17 @@
-# Copyright 2024 The GPU4PySCF Authors. All Rights Reserved.
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
 #
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
 #
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 #
-# You should have received a copy of the GNU General Public License
-# along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 import unittest
 import numpy
diff --git a/gpu4pyscf/df/hessian/uhf.py b/gpu4pyscf/df/hessian/uhf.py
index 4182c25d..e1c8250b 100644
--- a/gpu4pyscf/df/hessian/uhf.py
+++ b/gpu4pyscf/df/hessian/uhf.py
@@ -668,10 +668,7 @@ class Hessian(uhf_hess.Hessian):
 
     from gpu4pyscf.lib.utils import to_gpu, device
 
-    __init__ = uhf_hess.Hessian.__init__
     auxbasis_response = 1
     partial_hess_elec = partial_hess_elec
     make_h1 = make_h1
-    kernel = rhf_hess.kernel
-    hess = kernel
     get_jk_mo = _get_jk_mo
diff --git a/gpu4pyscf/df/hessian/uks.py b/gpu4pyscf/df/hessian/uks.py
index 4b6cdb11..6bf09803 100644
--- a/gpu4pyscf/df/hessian/uks.py
+++ b/gpu4pyscf/df/hessian/uks.py
@@ -119,11 +119,7 @@ class Hessian(uks_hess.Hessian):
     '''Non-relativistic RKS hessian'''
     from gpu4pyscf.lib.utils import to_gpu, device
 
-    __init__ = uks_hess.Hessian.__init__
     auxbasis_response = 1
     partial_hess_elec = partial_hess_elec
     make_h1 = make_h1
-    hess_elec = uhf_hess.hess_elec
-    kernel = rhf_hess.kernel
-    hess = kernel
     get_jk_mo = df_uhf_hess._get_jk_mo
diff --git a/gpu4pyscf/scf/tests/test_scf_jk.py b/gpu4pyscf/scf/tests/test_scf_jk.py
index e04b3ff1..78ae68eb 100644
--- a/gpu4pyscf/scf/tests/test_scf_jk.py
+++ b/gpu4pyscf/scf/tests/test_scf_jk.py
@@ -125,26 +125,3 @@ def test_jk_hermi0():
 
     assert abs(vj2+vj3 - vj1).max() < 1e-9
     assert abs(vk2+vk3 - vk1).max() < 1e-9
-    
-def test_jk_qz():
-    basis = {
-        'H': gto.basis.parse('''
-H H
-      1.0240000              1.0000000
-                             ''')
-    }
-    mol = pyscf.M(
-        atom = '''
-        H  -0.757    0.   0.0
-        H   0.757    0.   0.0
-        ''',
-        basis=basis,
-        unit='B',)
-    nao = mol.nao
-    dm = np.random.rand(nao, nao)
-    vj_gpu, vk_gpu = jk.get_jk(mol, dm)
-
-    vj, vk = get_jk(mol, dm)
-
-    assert np.linalg.norm(vj_gpu.get() - vj) < 1e-9
-    assert np.linalg.norm(vk_gpu.get() - vk) < 1e-9 
diff --git a/gpu4pyscf/solvent/tests/test_smd_hessian.py b/gpu4pyscf/solvent/tests/test_smd_hessian.py
index 6fbd5580..9c536f63 100644
--- a/gpu4pyscf/solvent/tests/test_smd_hessian.py
+++ b/gpu4pyscf/solvent/tests/test_smd_hessian.py
@@ -257,7 +257,6 @@ def test_to_cpu(self):
         hess_gpu = hessobj.kernel()
         hessobj = hessobj.to_cpu()
         hess_cpu = hessobj.kernel()
-        print(numpy.linalg.norm(hess_cpu - hess_gpu))
         assert numpy.linalg.norm(hess_cpu - hess_gpu) < 1e-5
 
 if __name__ == "__main__":

From 286a3b07fbffae8e3e5695f66aab4bae74b22d78 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Fri, 20 Dec 2024 05:46:51 +0000
Subject: [PATCH 05/49] support h function in hessian.jk

---
 gpu4pyscf/df/hessian/jk.py                  |  2 +-
 gpu4pyscf/hessian/jk.py                     | 26 ++++----
 gpu4pyscf/hessian/tests/test_rhf_hessian.py | 59 +++++++++++++++++-
 gpu4pyscf/hessian/tests/test_uhf_hessian.py | 67 ++++++++++++++++++++-
 gpu4pyscf/scf/jk.py                         | 11 ++--
 gpu4pyscf/scf/tests/test_rhf.py             |  4 +-
 6 files changed, 144 insertions(+), 25 deletions(-)

diff --git a/gpu4pyscf/df/hessian/jk.py b/gpu4pyscf/df/hessian/jk.py
index 3fc7ac08..5a299bcf 100644
--- a/gpu4pyscf/df/hessian/jk.py
+++ b/gpu4pyscf/df/hessian/jk.py
@@ -345,7 +345,7 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo,
             hj_ipip2[k0:k1] += contract('xp,p->px', tmp, rhoj[k0:k1])
             if with_k:
                 hk_ipip2[k0:k1] += contract('xpji,pij->px', int3c_blk, rhok_tmp)
-        
+
         auxslices = intopt.auxmol.aoslice_by_atom()
         aoslices = intopt.mol.aoslice_by_atom()
         ao2atom = int3c2e.get_ao2atom(intopt, aoslices)
diff --git a/gpu4pyscf/hessian/jk.py b/gpu4pyscf/hessian/jk.py
index 60525fc8..5a1f75ab 100644
--- a/gpu4pyscf/hessian/jk.py
+++ b/gpu4pyscf/hessian/jk.py
@@ -157,7 +157,7 @@ def get_jk(mol, dm, mo_coeff, mocc, hermi=0, vhfopt=None,
     '''
     log = logger.new_logger(mol, verbose)
     cput0 = log.init_timer()
-
+    assert hermi == 1
     if vhfopt is None:
         vhfopt = _VHFOpt(mol).build()
 
@@ -233,7 +233,6 @@ def get_jk(mol, dm, mo_coeff, mocc, hermi=0, vhfopt=None,
         vj = reduce_to_device(vj_dist, inplace=True)
 
     h_shls = vhfopt.h_shls
-    assert len(h_shls) == 0
     if h_shls:
         cput1 = log.timer_debug1('get_jk pass 1 on gpu', *cput0)
         log.debug3('Integrals for %s functions on CPU', l_symb[LMAX+1])
@@ -246,12 +245,8 @@ def get_jk(mol, dm, mo_coeff, mocc, hermi=0, vhfopt=None,
             else:
                 scripts.append('jk->s1il')
         shls_excludes = [0, h_shls[0]] * 4
-        if hermi == 1:
-            dms = dms.get()
-        else:
-            dms = dms[:n_dm//2].get()
         vs_h = _vhf.direct_mapdm('int2e_cart', 's8', scripts,
-                                 dms, 1, mol._atm, mol._bas, mol._env,
+                                 dms.get(), 1, mol._atm, mol._bas, mol._env,
                                  shls_excludes=shls_excludes)
         if with_j and with_k:
             vj1 = vs_h[0]
@@ -260,8 +255,14 @@ def get_jk(mol, dm, mo_coeff, mocc, hermi=0, vhfopt=None,
             vj1 = vs_h[0]
         else:
             vk1 = vs_h[0]
-        coeff = vhfopt.coeff
+
         idx, idy = np.tril_indices(nao, -1)
+        if hermi == 1:
+            if with_j:
+                vj1[:,idy,idx] = vj1[:,idx,idy]
+            if with_k:
+                vk1[:,idy,idx] = vk1[:,idx,idy]
+
         if isinstance(mocc, tuple):
             mocca, moccb = mocc
             moa, mob = mo_coeff
@@ -273,17 +274,14 @@ def get_jk(mol, dm, mo_coeff, mocc, hermi=0, vhfopt=None,
                 vj[:,:nmoa*nocca] += _ao2mo(vjab, mocca, moa).reshape(n_dm_2,-1)
                 vj[:,nmoa*nocca:] += _ao2mo(vjab, moccb, mob).reshape(n_dm_2,-1)
             if with_k:
-                vka, vkb = vk[:n_dm_2], vk[n_dm_2:]
+                vka, vkb = vk1[:n_dm_2], vk1[n_dm_2:]
                 vk[:,:nmoa*nocca] += _ao2mo(vka, mocca, moa).reshape(n_dm_2,-1)
                 vk[:,nmoa*nocca:] += _ao2mo(vkb, moccb, mob).reshape(n_dm_2,-1)
         else:
             if with_j:
-                vj1[:,idy,idx] = vj1[:,idx,idy]
-                vj += _ao2mo(cp.asarray(vj1), mocc, mo_coeff)
+                vj += _ao2mo(cp.asarray(vj1), mocc, mo_coeff).reshape(n_dm,-1)
             if with_k:
-                if hermi:
-                    vk1[:,idy,idx] = vk1[:,idx,idy]
-                vk += _ao2mo(cp.asarray(vk1), mocc, mo_coeff)
+                vk += _ao2mo(cp.asarray(vk1), mocc, mo_coeff).reshape(n_dm,-1)
         log.timer_debug1('get_jk pass 2 for h functions on cpu', *cput1)
     log.timer('vj and vk', *cput0)
     return vj, vk
diff --git a/gpu4pyscf/hessian/tests/test_rhf_hessian.py b/gpu4pyscf/hessian/tests/test_rhf_hessian.py
index dc27af38..e9aef60c 100644
--- a/gpu4pyscf/hessian/tests/test_rhf_hessian.py
+++ b/gpu4pyscf/hessian/tests/test_rhf_hessian.py
@@ -14,10 +14,14 @@
 
 import unittest
 import numpy as np
-from pyscf import gto, scf, lib
+import cupy
+import pyscf
+from pyscf import gto, lib
 from pyscf import grad, hessian
 from pyscf.hessian import rhf as rhf_cpu
+from gpu4pyscf import scf
 from gpu4pyscf.hessian import rhf as rhf_gpu
+from gpu4pyscf.hessian import jk
 
 def setUpModule():
     global mol
@@ -46,7 +50,7 @@ def test_hessian_rhf(self):
         assert abs(ref - e2_gpu).max() < 1e-6
 
     def test_partial_hess_elec(self):
-        mf = scf.RHF(mol)
+        mf = pyscf.scf.RHF(mol)
         mf.conv_tol = 1e-14
         mf.kernel()
         hobj = mf.Hessian()
@@ -139,6 +143,57 @@ def test_hessian_rhf_D3(self):
         e2_gpu = mf.Hessian().to_gpu().kernel()
         assert abs(ref - e2_gpu).max() < 1e-6
 
+    def test_jk_mix(self):
+        mol1 = pyscf.M(
+            atom='''
+        C  -1.20806619, -0.34108413, -0.00755148
+        C   1.28636081, -0.34128013, -0.00668648
+        H   2.53407081,  1.81906387, -0.00736748
+        H   1.28693681,  3.97963587, -0.00925948
+        ''',
+            basis='''unc
+        #BASIS SET:
+        H    S
+            1.815041   1
+            0.591063   1
+        H    P
+            2.305000   1
+        #BASIS SET:
+        C    S
+            8.383976   1
+            3.577015   1
+            1.547118   1
+        H    P
+            2.305000   1
+            1.098827   1
+            0.806750   1
+            0.282362   1
+        H    D
+            1.81900    1
+            0.72760    1
+            0.29104    1
+        H    F
+            0.970109   1
+        C    G
+            0.625000   1
+        C    H
+            0.4        1
+            ''',
+            output = '/dev/null'
+        )
+        nao = mol1.nao
+        mo_coeff = cupy.random.rand(nao, nao)
+        mocc = mo_coeff[:,:3]
+        dm = mocc.dot(mocc.T) * 2
+        vj_mo, vk_mo = jk.get_jk(mol1, dm, mo_coeff, mocc)
+        
+        mf = scf.RHF(mol1)
+        vj, vk = mf.get_jk(mol1, dm, hermi=1)
+        vj_cpu = (mo_coeff.T @ vj @ mocc).reshape(1,-1)
+        vk_cpu = (mo_coeff.T @ vk @ mocc).reshape(1,-1)
+        assert cupy.linalg.norm(vj_cpu - vj_mo) < 1e-5
+        assert cupy.linalg.norm(vk_cpu - vk_mo) < 1e-5
+
 if __name__ == "__main__":
     print("Full Tests for RHF Hessian")
     unittest.main()
diff --git a/gpu4pyscf/hessian/tests/test_uhf_hessian.py b/gpu4pyscf/hessian/tests/test_uhf_hessian.py
index c4112bec..1e10306c 100644
--- a/gpu4pyscf/hessian/tests/test_uhf_hessian.py
+++ b/gpu4pyscf/hessian/tests/test_uhf_hessian.py
@@ -14,10 +14,14 @@
 
 import unittest
 import numpy
-from pyscf import gto, scf, lib
+import cupy
+import pyscf
+from pyscf import gto, lib
 from pyscf import grad, hessian
 from pyscf.hessian import uhf as uhf_cpu
+from gpu4pyscf import scf
 from gpu4pyscf.hessian import uhf as uhf_gpu
+from gpu4pyscf.hessian import jk
 
 def setUpModule():
     global mol
@@ -48,7 +52,7 @@ def test_hessian_uhf(self):
         assert abs(ref - e2_gpu).max() < 1e-6
 
     def test_partial_hess_elec(self):
-        mf = scf.UHF(mol)
+        mf = pyscf.scf.UHF(mol)
         mf.conv_tol = 1e-14
         mf.kernel()
         hobj = mf.Hessian()
@@ -73,6 +77,65 @@ def test_hessian_uhf_D3(self):
         e2_gpu = mf.Hessian().to_gpu().kernel()
         assert abs(ref - e2_gpu).max() < 1e-6
 
+    def test_jk_mix(self):
+        mol1 = pyscf.M(
+            atom='''
+        C  -1.20806619, -0.34108413, -0.00755148
+        C   1.28636081, -0.34128013, -0.00668648
+        H   2.53407081,  1.81906387, -0.00736748
+        H   1.28693681,  3.97963587, -0.00925948
+        ''',
+            basis='''unc
+        #BASIS SET:
+        H    S
+            1.815041   1
+            0.591063   1
+        H    P
+            2.305000   1
+        #BASIS SET:
+        C    S
+            8.383976   1
+            3.577015   1
+            1.547118   1
+        H    P
+            2.305000   1
+            1.098827   1
+            0.806750   1
+            0.282362   1
+        H    D
+            1.81900    1
+            0.72760    1
+            0.29104    1
+        H    F
+            0.970109   1
+        C    G
+            0.625000   1
+        C    H
+            0.4        1
+            ''',
+            output = '/dev/null'
+        )
+        nao = mol1.nao
+        mo_coeff = cupy.random.rand(2, nao, nao)
+        mocca = mo_coeff[0,:,:3]
+        moccb = mo_coeff[1,:,:2]
+        dm = cupy.empty([2,nao,nao])
+        dm[0] = mocca.dot(mocca.T)
+        dm[1] = moccb.dot(moccb.T)
+        vj_mo, vk_mo = jk.get_jk(mol1, dm, mo_coeff, (mocca,moccb), hermi=1)
+        
+        mf = scf.UHF(mol1)
+        vj, vk = mf.get_jk(mol1, dm, hermi=1)
+        vj2 = cupy.empty([5*nao])
+        vk2 = cupy.empty([5*nao])
+        vj = vj[0] + vj[1]
+        vj2[:3*nao] = (mo_coeff[0].T @ vj @ mocca).reshape(1,-1)
+        vj2[3*nao:] = (mo_coeff[1].T @ vj @ moccb).reshape(1,-1)
+        vk2[:3*nao] = (mo_coeff[0].T @ vk[0] @ mocca).reshape(1,-1)
+        vk2[3*nao:] = (mo_coeff[1].T @ vk[1] @ moccb).reshape(1,-1)
+        assert cupy.linalg.norm(vj2 - vj_mo) < 1e-5
+        assert cupy.linalg.norm(vk2 - vk_mo) < 1e-5
+
 if __name__ == "__main__":
     print("Full Tests for UHF Hessian")
     unittest.main()
diff --git a/gpu4pyscf/scf/jk.py b/gpu4pyscf/scf/jk.py
index 6dd7b5cf..a1f970f3 100644
--- a/gpu4pyscf/scf/jk.py
+++ b/gpu4pyscf/scf/jk.py
@@ -226,17 +226,15 @@ def get_jk(mol, dm, hermi=0, vhfopt=None, with_j=True, with_k=True, verbose=None
         vk = reduce_to_device(vk_dist, inplace=True)
         #:vk = cp.einsum('pi,npq,qj->nij', vhfopt.coeff, vk, vhfopt.coeff)
         vk = sandwich_dot(vk, vhfopt.coeff)
-        vk = vk.reshape(dm.shape)
-
+        
     if with_j:
         vj = reduce_to_device(vj_dist, inplace=True)
         vj = transpose_sum(vj)
         #:vj = cp.einsum('pi,npq,qj->nij', vhfopt.coeff, vj, vhfopt.coeff)
         vj = sandwich_dot(vj, vhfopt.coeff)
-        vj = vj.reshape(dm.shape)
 
     h_shls = vhfopt.h_shls
-    assert len(h_shls) == 0
+
     if h_shls:
         cput1 = log.timer_debug1('get_jk pass 1 on gpu', *cput0)
         log.debug3('Integrals for %s functions on CPU', l_symb[LMAX+1])
@@ -276,6 +274,11 @@ def get_jk(mol, dm, hermi=0, vhfopt=None, with_j=True, with_k=True, verbose=None
                 vk[i] += coeff.T.dot(cp.asarray(v)).dot(coeff)
         log.timer_debug1('get_jk pass 2 for h functions on cpu', *cput1)
     
+    if with_j:
+        vj = vj.reshape(dm.shape)
+    if with_k:
+        vk = vk.reshape(dm.shape)
+
     log.timer('vj and vk', *cput0)
     return vj, vk
 
diff --git a/gpu4pyscf/scf/tests/test_rhf.py b/gpu4pyscf/scf/tests/test_rhf.py
index 530f6cc8..dd8f7b51 100644
--- a/gpu4pyscf/scf/tests/test_rhf.py
+++ b/gpu4pyscf/scf/tests/test_rhf.py
@@ -273,8 +273,8 @@ def test_chkfile(self):
         mf_copy = scf.RHF(mol)
         mf_copy.chkfile = ftmp.name
         dm_loaded = mf_copy.init_guess_by_chkfile()
-        assert np.allclose(dm_stored, dm_loaded, atol = 1e-14) # Since we reload the MO coefficients, the density matrix should be identical up to numerical noise.
-
+        # Since we reload the MO coefficients, the density matrix should be identical up to numerical noise.
+        assert np.allclose(dm_stored, dm_loaded, atol = 1e-14) 
     # TODO:
     #test analyze
     #test mulliken_pop

From 2e093dfb380a64ec8cb8b7faa1e84b50f9ff34f2 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Fri, 20 Dec 2024 06:18:59 +0000
Subject: [PATCH 06/49] unit test

---
 gpu4pyscf/hessian/tests/test_rhf_hessian.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gpu4pyscf/hessian/tests/test_rhf_hessian.py b/gpu4pyscf/hessian/tests/test_rhf_hessian.py
index e9aef60c..30a1c188 100644
--- a/gpu4pyscf/hessian/tests/test_rhf_hessian.py
+++ b/gpu4pyscf/hessian/tests/test_rhf_hessian.py
@@ -185,7 +185,7 @@ def test_jk_mix(self):
         mo_coeff = cupy.random.rand(nao, nao)
         mocc = mo_coeff[:,:3]
         dm = mocc.dot(mocc.T) * 2
-        vj_mo, vk_mo = jk.get_jk(mol1, dm, mo_coeff, mocc)
+        vj_mo, vk_mo = jk.get_jk(mol1, dm, mo_coeff, mocc, hermi=1)
         
         mf = scf.RHF(mol1)
         vj, vk = mf.get_jk(mol1, dm, hermi=1)

From bb400be8e08459035dfc179fe009200d26ddb13a Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Sun, 22 Dec 2024 04:25:58 +0000
Subject: [PATCH 07/49] optimize df hessian memory usage

---
 examples/dft_driver.py                      |   8 +-
 gpu4pyscf/df/hessian/jk.py                  |  89 +++++----
 gpu4pyscf/df/hessian/rhf.py                 |  37 ++--
 gpu4pyscf/df/hessian/rks.py                 |  26 ++-
 gpu4pyscf/df/hessian/uhf.py                 |  48 +++--
 gpu4pyscf/df/hessian/uks.py                 |  40 ++--
 gpu4pyscf/df/int3c2e.py                     |  16 +-
 gpu4pyscf/dft/numint.py                     | 199 +++++++++++++-------
 gpu4pyscf/hessian/jk.py                     |  53 +++---
 gpu4pyscf/hessian/rhf.py                    |  22 ++-
 gpu4pyscf/hessian/rks.py                    |   6 +-
 gpu4pyscf/hessian/tests/test_rhf_hessian.py |   5 +-
 gpu4pyscf/hessian/tests/test_uhf_hessian.py |   5 +-
 gpu4pyscf/hessian/uhf.py                    |   4 +-
 gpu4pyscf/hessian/uks.py                    |   6 +-
 15 files changed, 315 insertions(+), 249 deletions(-)

diff --git a/examples/dft_driver.py b/examples/dft_driver.py
index 13aaa0ce..8060e909 100644
--- a/examples/dft_driver.py
+++ b/examples/dft_driver.py
@@ -35,10 +35,10 @@
     basis=bas,
     max_memory=32000)
 # set verbose >= 6 for debugging timer
-mol.verbose = 6
+mol.verbose = 4
 
-mf_df = dft.RKS(mol, xc=args.xc).density_fit(auxbasis=args.auxbasis)
-mf_df.verbose = 6
+mf_df = dft.RKS(mol, xc=args.xc)#.density_fit(auxbasis=args.auxbasis)
+mf_df.verbose = 4
 
 if args.solvent:
     mf_df = mf_df.PCM()
@@ -52,7 +52,7 @@
 mf_df.direct_scf_tol = 1e-14
 mf_df.conv_tol = 1e-10
 mf_df.chkfile = None
-mf_df.conv_tol_cpscf = 1e-3
+mf_df.conv_tol_cpscf = 1e-6
 e_tot = mf_df.kernel()
 scf_time = time.time() - start_time
 print(f'compute time for energy: {scf_time:.3f} s')
diff --git a/gpu4pyscf/df/hessian/jk.py b/gpu4pyscf/df/hessian/jk.py
index 5a299bcf..fb097180 100644
--- a/gpu4pyscf/df/hessian/jk.py
+++ b/gpu4pyscf/df/hessian/jk.py
@@ -195,24 +195,13 @@ def _get_int3c2e_ipip_slice(ip_type, intopt, cp_ij_id, aux_id, omega=None, strea
 
     if omega is None: omega = 0.0
     if stream is None: stream = cupy.cuda.get_current_stream()
-
+    
     fn = getattr(libgint, 'GINTfill_int3c2e_' + ip_type)
-
     nao = intopt._sorted_mol.nao
     naux = intopt._sorted_auxmol.nao
     norb = nao + naux + 1
     comp = 9
     order = 2
-
-    lmax = intopt._sorted_mol._bas[:gto.ANG_OF].max()
-    aux_lmax = intopt._sorted_auxmol._bas[:gto.ANG_OF].max()
-    nroots = (lmax + aux_lmax + order)//2 + 1
-    if nroots > NROOT_ON_GPU:
-        from pyscf.gto.moleintor import getints, make_cintopt
-        pmol = intopt._tot_mol
-        intor = pmol._add_suffix('int3c2e_' + ip_type)
-        opt = make_cintopt(pmol._atm, pmol._bas, pmol._env, intor)
-
     nbins = 1
 
     cp_kl_id = aux_id + len(intopt.log_qs)
@@ -258,6 +247,11 @@ def _get_int3c2e_ipip_slice(ip_type, intopt, cp_ij_id, aux_id, omega=None, strea
         if err != 0:
             raise RuntimeError(f'GINT_fill_int3c2e general failed, err={err}')
     else:
+        from pyscf.gto.moleintor import getints, make_cintopt
+        pmol = intopt._tot_mol
+        intor = pmol._add_suffix('int3c2e_' + ip_type)
+        opt = make_cintopt(pmol._atm, pmol._bas, pmol._env, intor)
+    
         # TODO: sph2cart in CPU?
         ishl0, ishl1 = intopt.l_ctr_offsets[cpi], intopt.l_ctr_offsets[cpi+1]
         jshl0, jshl1 = intopt.l_ctr_offsets[cpj], intopt.l_ctr_offsets[cpj+1]
@@ -291,16 +285,17 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo,
         dm0 = cupy.asarray(dm0)
         nao = dm0.shape[0]
 
-        hj_ipip1 = cupy.zeros([nao,9])
-        hj_ipip2 = cupy.zeros([naux,9])
-        hj_ip1ip2 = cupy.zeros([nao,naux,9])
-        hj_ipvip1 = cupy.zeros([nao,nao,9])
+        hj_ipip1 = cupy.zeros([9,nao])
+        hj_ipip2 = cupy.zeros([9,naux])
+        hj_ip1ip2 = cupy.zeros([9,nao,naux])
+        hj_ipvip1 = cupy.zeros([9,nao,nao])
         if with_k:
-            hk_ipip1 = cupy.zeros([nao,9])
-            hk_ipip2 = cupy.zeros([naux,9])
-            hk_ip1ip2 = cupy.zeros([nao,naux,9])
-            hk_ipvip1 = cupy.zeros([nao,nao,9])
+            hk_ipip1 = cupy.zeros([9,nao])
+            hk_ipip2 = cupy.zeros([9,naux])
+            hk_ip1ip2 = cupy.zeros([9,nao,naux])
+            hk_ipvip1 = cupy.zeros([9,nao,nao])
 
+        cupy.get_default_memory_pool().free_all_blocks()
         for aux_id, cp_ij_id in task_list:
             cpi = intopt.cp_idx[cp_ij_id]
             cpj = intopt.cp_jdx[cp_ij_id]
@@ -309,22 +304,22 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo,
             k0, k1 = aux_ao_loc[aux_id], aux_ao_loc[aux_id+1]
             
             if with_k:
-                rhok_tmp = contract('por,ir->pio', rhok[k0:k1], orbo[i0:i1])
-                rhok_tmp = contract('pio,jo->pij', rhok_tmp, orbo[j0:j1])
+                rhok_tmp = contract('por,ir->poi', rhok[k0:k1], orbo[i0:i1])
+                rhok_tmp = contract('poi,jo->pji', rhok_tmp, orbo[j0:j1])
 
             # (20|0), (0|0)(0|00)
             int3c_blk = _get_int3c2e_ipip_slice('ipip1', intopt, cp_ij_id, aux_id, omega=omega)
             tmp = contract('xpji,ij->xpi', int3c_blk, dm0[i0:i1,j0:j1])
-            hj_ipip1[i0:i1] += contract('xpi,p->ix', tmp, rhoj[k0:k1])
+            hj_ipip1[:,i0:i1] += contract('xpi,p->xi', tmp, rhoj[k0:k1])
             if with_k:
-                hk_ipip1[i0:i1] += contract('xpji,pij->ix', int3c_blk, rhok_tmp)
+                hk_ipip1[:,i0:i1] += contract('xpji,pji->xi', int3c_blk, rhok_tmp)
 
             # (11|0), (0|0)(0|00) without response of RI basis
             int3c_blk = _get_int3c2e_ipip_slice('ipvip1', intopt, cp_ij_id, aux_id, omega=omega)
             tmp = contract('xpji,ij->xpij', int3c_blk, dm0[i0:i1,j0:j1])
-            hj_ipvip1[i0:i1,j0:j1] += contract('xpij,p->ijx', tmp, rhoj[k0:k1])
+            hj_ipvip1[:,i0:i1,j0:j1] += contract('xpij,p->xij', tmp, rhoj[k0:k1])
             if with_k:
-                hk_ipvip1[i0:i1,j0:j1] += contract('xpji,pij->ijx', int3c_blk, rhok_tmp)
+                hk_ipvip1[:,i0:i1,j0:j1] += contract('xpji,pji->xij', int3c_blk, rhok_tmp)
 
             if auxbasis_response < 1:
                 continue
@@ -332,9 +327,9 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo,
             # (10|1), (0|0)(0|00)
             int3c_blk = _get_int3c2e_ipip_slice('ip1ip2', intopt, cp_ij_id, aux_id, omega=omega)
             tmp = contract('xpji,ij->xpi', int3c_blk, dm0[i0:i1,j0:j1])
-            hj_ip1ip2[i0:i1,k0:k1] += contract('xpi,p->ipx', tmp, rhoj[k0:k1])
+            hj_ip1ip2[:,i0:i1,k0:k1] += contract('xpi,p->xip', tmp, rhoj[k0:k1])
             if with_k:
-                hk_ip1ip2[i0:i1,k0:k1] += contract('xpji,pij->ipx', int3c_blk, rhok_tmp)
+                hk_ip1ip2[:,i0:i1,k0:k1] += contract('xpji,pji->xip', int3c_blk, rhok_tmp)
             
             if auxbasis_response < 2:
                 continue
@@ -342,44 +337,44 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo,
             # (00|2), (0|0)(0|00)
             int3c_blk = _get_int3c2e_ipip_slice('ipip2', intopt, cp_ij_id, aux_id, omega=omega)
             tmp = contract('xpji,ij->xp', int3c_blk, dm0[i0:i1,j0:j1])
-            hj_ipip2[k0:k1] += contract('xp,p->px', tmp, rhoj[k0:k1])
+            hj_ipip2[:,k0:k1] += contract('xp,p->xp', tmp, rhoj[k0:k1])
             if with_k:
-                hk_ipip2[k0:k1] += contract('xpji,pij->px', int3c_blk, rhok_tmp)
-
+                hk_ipip2[:,k0:k1] += contract('xpji,pji->xp', int3c_blk, rhok_tmp)
+            
         auxslices = intopt.auxmol.aoslice_by_atom()
         aoslices = intopt.mol.aoslice_by_atom()
         ao2atom = int3c2e.get_ao2atom(intopt, aoslices)
         aux2atom = int3c2e.get_aux2atom(intopt, auxslices)
 
-        hj_ipvip1 = hj_ipvip1.reshape([nao,nao,3,3])
-        tmp = contract('ia,ijxy->ajxy', ao2atom, hj_ipvip1)
+        hj_ipvip1 = hj_ipvip1.reshape([3,3,nao,nao])
+        tmp = contract('ia,xyij->ajxy', ao2atom, hj_ipvip1)
         hj = 2.0 * contract('jb,ajxy->abxy', ao2atom, tmp)
 
-        hj_ipip1 = hj_ipip1.reshape([nao,3,3])
-        tmp = contract('ia,ixy->axy', ao2atom, hj_ipip1)
+        hj_ipip1 = hj_ipip1.reshape([3,3,nao])
+        tmp = contract('ia,xyi->axy', ao2atom, hj_ipip1)
         hj[range(natm), range(natm)] += 2.0 * tmp
 
         hk = None
         if with_k:
-            hk_ipvip1 = hk_ipvip1.reshape([nao,nao,3,3])
-            tmp = contract('ia,ijxy->ajxy', ao2atom, hk_ipvip1)
+            hk_ipvip1 = hk_ipvip1.reshape([3,3,nao,nao])
+            tmp = contract('ia,xyij->ajxy', ao2atom, hk_ipvip1)
             hk = contract('jb,ajxy->abxy', ao2atom, tmp)
 
-            hk_ipip1 = hk_ipip1.reshape([nao,3,3])
-            tmp = contract('ia,ixy->axy', ao2atom, hk_ipip1)
+            hk_ipip1 = hk_ipip1.reshape([3,3,nao])
+            tmp = contract('ia,xyi->axy', ao2atom, hk_ipip1)
             hk[range(natm), range(natm)] += tmp
         
         if auxbasis_response > 0:
-            hj_ip1ip2 = hj_ip1ip2.reshape([nao,naux,3,3])
-            tmp = contract('ia,ijxy->ajxy', ao2atom, hj_ip1ip2)
+            hj_ip1ip2 = hj_ip1ip2.reshape([3,3,nao,naux])
+            tmp = contract('ia,xyij->ajxy', ao2atom, hj_ip1ip2)
             tmp = contract('jb,ajxy->abxy',aux2atom, tmp)
             tmp = tmp + tmp.transpose([1,0,3,2])
             hj += tmp
             if auxbasis_response > 1:
                 hj += tmp
             if with_k:
-                hk_ip1ip2 = hk_ip1ip2.reshape([nao,naux,3,3])
-                tmp = contract('ia,ijxy->ajxy', ao2atom, hk_ip1ip2)
+                hk_ip1ip2 = hk_ip1ip2.reshape([3,3,nao,naux])
+                tmp = contract('ia,xyij->ajxy', ao2atom, hk_ip1ip2)
                 tmp = contract('jb,ajxy->abxy', aux2atom, tmp)
                 tmp = 0.5 * (tmp + tmp.transpose([1,0,3,2]))
                 hk += tmp
@@ -387,12 +382,12 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo,
                     hk += tmp
         
         if auxbasis_response > 1:
-            hj_ipip2 = hj_ipip2.reshape([naux,3,3])
-            tmp = contract('ia,ixy->axy', aux2atom, hj_ipip2)
+            hj_ipip2 = hj_ipip2.reshape([3,3,naux])
+            tmp = contract('ia,xyi->axy', aux2atom, hj_ipip2)
             hj[range(natm), range(natm)] += tmp
             if with_k:
-                hk_ipip2 = hk_ipip2.reshape([naux,3,3])
-                tmp = contract('ia,ixy->axy', aux2atom, hk_ipip2)
+                hk_ipip2 = hk_ipip2.reshape([3,3,naux])
+                tmp = contract('ia,xyi->axy', aux2atom, hk_ipip2)
                 hk[range(natm), range(natm)] += .5 * tmp
         t0 = log.timer_debug1(f'int3c2e_ipip on Device {device_id}', *t0)
     return hj, hk
diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py
index d4fda5e3..bd9ef958 100644
--- a/gpu4pyscf/df/hessian/rhf.py
+++ b/gpu4pyscf/df/hessian/rhf.py
@@ -56,10 +56,9 @@ def _hk_ip1_ip1(rhok1_Pko, dm0, mocc_2):
     hk_ao_ao = cupy.zeros([nao,nao,3,3])
     cupy.get_default_memory_pool().free_all_blocks()
     mem_avail = get_avail_mem()
-    blksize = int((mem_avail*0.2/(nao*nao*3*8)/ALIGNED))*ALIGNED
+    blksize = int((mem_avail*0.4/(nao*nao*3*8)/ALIGNED))*ALIGNED
     for k0, k1 in lib.prange(0,nnz,blksize):
         rhok1_Pko_kslice = cupy.asarray(rhok1_Pko[k0:k1])
-
         # (10|0)(0|10) without response of RI basis
         vk2_ip1_ip1 = contract('piox,pkoy->ikxy', rhok1_Pko_kslice, rhok1_Pko_kslice)
         hk_ao_ao += contract('ikxy,ik->ikxy', vk2_ip1_ip1, dm0)
@@ -68,7 +67,7 @@ def _hk_ip1_ip1(rhok1_Pko, dm0, mocc_2):
         # (10|0)(0|01) without response of RI basis
         rhok1_Pkl_kslice = contract('piox,ko->pikx', rhok1_Pko_kslice, mocc_2)
         hk_ao_ao += contract('pikx,pkiy->ikxy', rhok1_Pkl_kslice, rhok1_Pkl_kslice)
-        rhok1_Pkl_kslice = None
+        rhok1_Pkl_kslice = rhok1_Pko_kslice = None
     return hk_ao_ao
 
 def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
@@ -397,21 +396,18 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
 
 
 def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
-    mol = hessobj.mol
-    natm = mol.natm
-    nocc = int(cupy.count_nonzero(mo_occ > 0))
-    nmo = len(mo_occ)
-    h1ao = cupy.empty((natm, 3, nmo, nocc))
-    for ia, h1, vj1, vk1 in _gen_jk(hessobj, mo_coeff, mo_occ, chkfile,
-                                    atmlst, verbose, True):
-        h1 += vj1 - vk1 * .5
-        h1ao[ia] = h1
-    return h1ao
-
-def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
+    vj, vk = _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile, atmlst, verbose, True)
+    # h1mo = h1 + vj - 0.5 * vk
+    h1mo = vk
+    h1mo *= -.5
+    h1mo += vj
+    h1mo += rhf_grad.get_grad_hcore(hessobj.base.nuc_grad_method())
+    return h1mo
+
+def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
             verbose=None, with_k=True, omega=None):
     '''
-    A generator to produce the derivatives of Hcore, J, K matrices in MO bases
+    Derivatives of J, K matrices in MO bases
     '''
     log = logger.new_logger(hessobj, verbose)
     t0 = log.init_timer()
@@ -568,9 +564,7 @@ def _ao2mo(mat):
         vk1_int3c = vk1_int3c_ip1 + vk1_int3c_ip2
         vk1_int3c_ip1 = vk1_int3c_ip2 = None
 
-    grad_hcore = rhf_grad.get_grad_hcore(hessobj.base.nuc_grad_method())
     cupy.get_default_memory_pool().free_all_blocks()
-    vk1 = None
     for i0, ia in enumerate(atmlst):
         shl0, shl1, p0, p1 = aoslices[ia]
         vj1_ao = cupy.zeros([3,nao,nao])
@@ -582,11 +576,10 @@ def _ao2mo(mat):
             vk1_ao[:,p0:p1,:] -= vk1_buf[:,p0:p1,:]
             vk1_ao[:,:,p0:p1] -= vk1_buf[:,p0:p1,:].transpose(0,2,1)
 
-        h1 =  grad_hcore[i0]
-        vj1 = vj1_int3c[ia] + _ao2mo(vj1_ao)
+        vj1_int3c[ia] += _ao2mo(vj1_ao)
         if with_k:
-            vk1 = vk1_int3c[ia] + _ao2mo(vk1_ao)
-        yield ia, h1, vj1, vk1
+            vk1_int3c[ia] += _ao2mo(vk1_ao)
+    return vj1_int3c, vk1_int3c
 
 def _get_jk_mo(hessobj, mol, dms, mo_coeff, mocc, 
            hermi=1, with_j=True, with_k=True, omega=None):
diff --git a/gpu4pyscf/df/hessian/rks.py b/gpu4pyscf/df/hessian/rks.py
index 2606e8e4..1d16ff16 100644
--- a/gpu4pyscf/df/hessian/rks.py
+++ b/gpu4pyscf/df/hessian/rks.py
@@ -23,6 +23,7 @@
 import numpy
 import cupy
 from pyscf import lib
+from gpu4pyscf.grad import rhf as rhf_grad
 from gpu4pyscf.hessian import rhf as rhf_hess
 from gpu4pyscf.hessian import rks as rks_hess
 from gpu4pyscf.df.hessian import rhf as df_rhf_hess
@@ -90,18 +91,23 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
     omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
     mem_now = lib.current_memory()[0]
     max_memory = max(2000, mf.max_memory*.9-mem_now)
-    h1mo = rks_hess._get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory)
+    
     with_k = ni.libxc.is_hybrid_xc(mf.xc)
-
-    for ia, h1, vj1, vk1 in df_rhf_hess._gen_jk(hessobj, mo_coeff, mo_occ, chkfile,
-                                                atmlst, verbose, with_k):
-        h1mo[ia] += h1 + vj1
-        if with_k:
-            h1mo[ia] -= .5 * hyb * vk1
+    vj1, vk1 = df_rhf_hess._get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile,
+                                                atmlst, verbose, with_k)
+    h1mo = vj1
+    if with_k:
+        h1mo -= .5 * hyb * vk1
+    vj1 = vk1 = None
+    
     if abs(omega) > 1e-10 and abs(alpha-hyb) > 1e-10:
-        for ia, h1, vj1_lr, vk1_lr in df_rhf_hess._gen_jk(hessobj, mo_coeff, mo_occ, chkfile,
-                                                atmlst, verbose, True, omega=omega):
-            h1mo[ia] -= .5 * (alpha - hyb) * vk1_lr
+        _, vk1_lr = df_rhf_hess._gen_jk(hessobj, mo_coeff, mo_occ, chkfile,
+                                             atmlst, verbose, True, omega=omega)
+        h1mo -= .5 * (alpha - hyb) * vk1_lr
+        vk1_lr = None
+
+    h1mo += rhf_grad.get_grad_hcore(hessobj.base.nuc_grad_method())
+    h1mo += rks_hess._get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory)
     return h1mo
 
 class Hessian(rks_hess.Hessian):
diff --git a/gpu4pyscf/df/hessian/uhf.py b/gpu4pyscf/df/hessian/uhf.py
index e1c8250b..1dc3f3a4 100644
--- a/gpu4pyscf/df/hessian/uhf.py
+++ b/gpu4pyscf/df/hessian/uhf.py
@@ -416,21 +416,21 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
     if atmlst is None:
         atmlst = range(natm)
 
-    nocca, noccb = hessobj.base.nelec
-    nmo = len(mo_occ[0])
-    h1aoa = cupy.empty((natm, 3, nmo, nocca))
-    h1aob = cupy.empty((natm, 3, nmo, noccb))
-    for ia, h1, vj1, vk1 in _gen_jk(hessobj, mo_coeff, mo_occ, chkfile,
-                                    atmlst, verbose, True):
-        h1a, h1b = h1
-        vj1a, vj1b = vj1
-        vk1a, vk1b = vk1
-
-        h1aoa[ia] = h1a + vj1a - vk1a
-        h1aob[ia] = h1b + vj1b - vk1b
-    return (h1aoa, h1aob)
-
-def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
+    vj1, vk1 = _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile, atmlst, verbose, True)
+    vj1a, vj1b = vj1
+    vk1a, vk1b = vk1
+    h1moa = vj1a
+    h1moa-= vk1a
+    h1mob = vj1b
+    h1mob-= vk1b
+    vj1 = vk1 = vj1a = vj1b = vk1a = vk1b = None
+
+    gobj = hessobj.base.nuc_grad_method()
+    h1moa += rhf_grad.get_grad_hcore(gobj, mo_coeff[0], mo_occ[0])
+    h1mob += rhf_grad.get_grad_hcore(gobj, mo_coeff[1], mo_occ[1])
+    return (h1moa, h1mob)
+
+def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
             verbose=None, with_k=True, omega=None):
     '''
     A generator to produce the derivatives of Hcore, J, K matrices in MO bases
@@ -632,12 +632,8 @@ def _ao2mo(mat, mocc, mo):
         tmp = contract('xij,jo->xio', mat, mocc)
         return contract('xik,ip->xpk', tmp, mo)
 
-    gobj = hessobj.base.nuc_grad_method()
-    grad_hcore_a = rhf_grad.get_grad_hcore(gobj, mo_coeff[0], mo_occ[0])
-    grad_hcore_b = rhf_grad.get_grad_hcore(gobj, mo_coeff[1], mo_occ[1])
     cupy.get_default_memory_pool().free_all_blocks()
-
-    vk1a = vk1b = None
+    
     for i0, ia in enumerate(atmlst):
         shl0, shl1, p0, p1 = aoslices[ia]
         vj1_ao = cupy.zeros([3,nao,nao])
@@ -652,14 +648,12 @@ def _ao2mo(mat, mocc, mo):
             vk1b_ao[:,p0:p1,:] -= vk1b_buf[:,p0:p1,:]
             vk1b_ao[:,:,p0:p1] -= vk1b_buf[:,p0:p1,:].transpose(0,2,1)
 
-        h1a = grad_hcore_a[i0]
-        h1b = grad_hcore_b[i0]
-        vj1a = vj1a_int3c[ia] + _ao2mo(vj1_ao, mocca, mo_coeff[0])
-        vj1b = vj1b_int3c[ia] + _ao2mo(vj1_ao, moccb, mo_coeff[1])
+        vj1a_int3c[ia] += _ao2mo(vj1_ao, mocca, mo_coeff[0])
+        vj1b_int3c[ia] += _ao2mo(vj1_ao, moccb, mo_coeff[1])
         if with_k:
-            vk1a = vk1a_int3c[ia] + _ao2mo(vk1a_ao, mocca, mo_coeff[0])
-            vk1b = vk1b_int3c[ia] + _ao2mo(vk1b_ao, moccb, mo_coeff[1])
-        yield ia, (h1a, h1b), (vj1a, vj1b), (vk1a, vk1b)
+            vk1a_int3c[ia] += _ao2mo(vk1a_ao, mocca, mo_coeff[0])
+            vk1b_int3c[ia] += _ao2mo(vk1b_ao, moccb, mo_coeff[1])
+    return (vj1a_int3c, vj1b_int3c), (vk1a_int3c, vk1b_int3c)
 
 _get_jk_mo = df_rhf_hess._get_jk_mo
 
diff --git a/gpu4pyscf/df/hessian/uks.py b/gpu4pyscf/df/hessian/uks.py
index 6bf09803..5fd23a34 100644
--- a/gpu4pyscf/df/hessian/uks.py
+++ b/gpu4pyscf/df/hessian/uks.py
@@ -23,6 +23,7 @@
 import numpy
 import cupy
 from pyscf import lib
+from gpu4pyscf.grad import rhf as rhf_grad
 from gpu4pyscf.hessian import rhf as rhf_hess
 from gpu4pyscf.hessian import uhf as uhf_hess
 from gpu4pyscf.hessian import uks as uks_hess
@@ -95,24 +96,35 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
     omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
     mem_now = lib.current_memory()[0]
     max_memory = max(2000, mf.max_memory*.9-mem_now)
-    h1moa, h1mob = uks_hess._get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory)
+    
     with_k = ni.libxc.is_hybrid_xc(mf.xc)
 
-    for ia, h1, vj1, vk1 in df_uhf_hess._gen_jk(hessobj, mo_coeff, mo_occ, chkfile,
-                                                atmlst, verbose, with_k):
+    vj1, vk1 = df_uhf_hess._get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile,
+                                       atmlst, verbose, with_k)
+    vj1a, vj1b = vj1
+    h1moa = vj1a
+    h1mob = vj1b
 
-        h1moa[ia] += h1[0] + vj1[0]
-        h1mob[ia] += h1[1] + vj1[1]
-        if with_k:
-            vk1a, vk1b = vk1
-            h1moa[ia] -= hyb * vk1a
-            h1mob[ia] -= hyb * vk1b
+    if with_k:
+        vk1a, vk1b = vk1
+        h1moa -= hyb * vk1a
+        h1mob -= hyb * vk1b
+    vj1 = vk1 = vj1a = vj1b = vk1a = vk1b = None
+    
     if abs(omega) > 1e-10 and abs(alpha-hyb) > 1e-10:
-        for ia, h1, vj1_lr, vk1_lr in df_uhf_hess._gen_jk(hessobj, mo_coeff, mo_occ, chkfile,
-                                                atmlst, verbose, True, omega=omega):
-            vk1a, vk1b = vk1_lr
-            h1moa[ia] -= (alpha - hyb) * vk1a
-            h1mob[ia] -= (alpha - hyb) * vk1b
+        _, vk1_lr = df_uhf_hess._gen_jk(hessobj, mo_coeff, mo_occ, chkfile,
+                                             atmlst, verbose, True, omega=omega)
+        vk1a, vk1b = vk1_lr
+        h1moa -= (alpha - hyb) * vk1a
+        h1mob -= (alpha - hyb) * vk1b
+
+    gobj = hessobj.base.nuc_grad_method()
+    h1moa += rhf_grad.get_grad_hcore(gobj, mo_coeff[0], mo_occ[0])
+    h1mob += rhf_grad.get_grad_hcore(gobj, mo_coeff[1], mo_occ[1])
+
+    v1moa, v1mob = uks_hess._get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory)
+    h1moa += v1moa
+    h1mob += v1mob
     return h1moa, h1mob
 
 class Hessian(uks_hess.Hessian):
diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py
index f89fb07c..98350c59 100644
--- a/gpu4pyscf/df/int3c2e.py
+++ b/gpu4pyscf/df/int3c2e.py
@@ -1035,8 +1035,8 @@ def _int3c2e_ip2_wjk(intopt, task_list, dm0, orbo, with_k=True, omega=None, devi
         for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, task_list=task_list,
                                                                 ip_type='ip2', omega=omega):
             wj[k0:k1] += contract('xpji,ji->px', int3c_blk, dm0[j0:j1,i0:i1])
-            tmp = contract('xpji,jo->piox', int3c_blk, orbo[j0:j1])
             if with_k:
+                tmp = contract('xpji,jo->piox', int3c_blk, orbo[j0:j1])
                 wk[k0:k1] += contract('piox,ir->prox', tmp, orbo[i0:i1])
     return wj, wk
 
@@ -1229,15 +1229,6 @@ def get_int3c2e_general(mol, auxmol=None, ip_type='', auxbasis='weigend+etb', di
     intopt = VHFOpt(mol, auxmol, 'int2e')
     intopt.build(direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE)
 
-    lmax = mol._bas[:gto.ANG_OF].max()
-    aux_lmax = auxmol._bas[:gto.ANG_OF].max()
-    nroots = (lmax + aux_lmax + order)//2 + 1
-    if nroots > NROOT_ON_GPU:
-        from pyscf.gto.moleintor import getints, make_cintopt
-        pmol = intopt._tot_mol
-        intor = pmol._add_suffix('int3c2e_' + ip_type)
-        opt = make_cintopt(pmol._atm, pmol._bas, pmol._env, intor)
-
     nao_cart = intopt._sorted_mol.nao
     naux_cart = intopt._sorted_auxmol.nao
     norb_cart = nao_cart + naux_cart + 1
@@ -1287,6 +1278,11 @@ def get_int3c2e_general(mol, auxmol=None, ip_type='', auxbasis='weigend+etb', di
                 if err != 0:
                     raise RuntimeError("int3c2e failed\n")
             else:
+                from pyscf.gto.moleintor import getints, make_cintopt
+                pmol = intopt._tot_mol
+                intor = pmol._add_suffix('int3c2e_' + ip_type)
+                opt = make_cintopt(pmol._atm, pmol._bas, pmol._env, intor)
+
                 # TODO: sph2cart in CPU?
                 ishl0, ishl1 = intopt.l_ctr_offsets[cpi], intopt.l_ctr_offsets[cpi+1]
                 jshl0, jshl1 = intopt.l_ctr_offsets[cpj], intopt.l_ctr_offsets[cpj+1]
diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py
index 22f6ff97..aed7a4ca 100644
--- a/gpu4pyscf/dft/numint.py
+++ b/gpu4pyscf/dft/numint.py
@@ -32,11 +32,11 @@
 
 LMAX_ON_GPU = 6
 BAS_ALIGNED = 1
-GRID_BLKSIZE = 32
 MIN_BLK_SIZE = getattr(__config__, 'min_grid_blksize', 64*64)
 ALIGNED = getattr(__config__, 'grid_aligned', 16*16)
 AO_ALIGNMENT = getattr(__config__, 'ao_aligned', 16)
 AO_THRESHOLD = 1e-10
+GB = 1024*1024*1024
 
 # Should we release the cupy cache?
 FREE_CUPY_CACHE = False
@@ -273,26 +273,23 @@ def eval_rho4(mol, ao, mo0, mo1, non0tab=None, xctype='LDA', hermi=0,
     na = mo1.shape[0]
     if xctype == 'LDA' or xctype == 'HF':
         c0 = mo0.T.dot(ao)
-        t1 = log.timer_debug2('eval occ_coeff', *t0)
-        c_0 = contract('aio,ig->aog', mo1, ao)
         rho = cupy.empty([na,ngrids])
         for i in range(na):
-            rho[i] = _contract_rho(c0, c_0[i])
+            c_0 = contract('io,ig->og', mo1[i], ao)
+            rho[i] = _contract_rho(c0, c_0)
     elif xctype in ('GGA', 'NLC'):
         c0 = contract('nig,io->nog', ao, mo0)
-        t1 = log.timer_debug2('eval occ_coeff', *t0)
-        c_0 = contract('nig,aio->anog', ao, mo1)
-        t1 = log.timer_debug2('ao * cpos', *t1)
         rho = cupy.empty([na, 4, ngrids])
         for i in range(na):
-            _contract_rho_gga(c0, c_0[i], rho=rho[i])
+            c_0 = contract('nig,io->nog', ao, mo1[i])
+            _contract_rho_gga(c0, c_0, rho=rho[i])
     else: # meta-GGA
         assert not with_lapl
         rho = cupy.empty((na,5,ngrids))
         c0 = contract('nig,io->nog', ao, mo0)
-        c_0 = contract('nig,aio->anog', ao, mo1)
         for i in range(na):
-            _contract_rho_mgga(c0, c_0[i], rho=rho[i])
+            c_0 = contract('nig,io->nog', ao, mo1[i])
+            _contract_rho_mgga(c0, c_0, rho=rho[i])
     if hermi:
         # corresponding to the density of ao * mo1[i].dot(mo0.T) * ao
         rho *= 2.
@@ -1025,7 +1022,7 @@ def _nr_rks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff,
         p0 = p1 = grid_start
         t1 = t0 = log.init_timer()
         for ao, mask, weights, coords in ni.block_loop(_sorted_mol, grids, nao, ao_deriv,
-                                                       max_memory=None,
+                                                       max_memory=None, blksize=None,
                                                        grid_range=(grid_start, grid_end)):
             p0, p1 = p1, p1+len(weights)
             # precompute molecular orbitals
@@ -1133,6 +1130,102 @@ def nr_rks_fxc_st(ni, mol, grids, xc_code, dm0=None, dms_alpha=None,
     return nr_rks_fxc(ni, mol, grids, xc_code, dm0, dms_alpha, hermi=0, fxc=fxc,
                       max_memory=max_memory, verbose=verbose)
 
+def _nr_uks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff,
+                     verbose=None, hermi=1, device_id=0):
+    with cupy.cuda.Device(device_id), _streams[device_id]:
+        if dms is not None: 
+            dma, dmb = dms
+            dma = cupy.asarray(dma)
+            dmb = cupy.asarray(dmb)
+        if mo1 is not None: 
+            mo1a, mo1b = mo1
+            mo1a = cupy.asarray(mo1a)
+            mo1b = cupy.asarray(mo1b)
+        if occ_coeff is not None: 
+            occ_coeff_a, occ_coeff_b = occ_coeff
+            occ_coeff_a = cupy.asarray(occ_coeff_a)
+            occ_coeff_b = cupy.asarray(occ_coeff_b)
+
+        if fxc is not None: fxc = cupy.asarray(fxc)
+        assert isinstance(verbose, int)
+        log = logger.new_logger(mol, verbose)
+        xctype = ni._xc_type(xc_code)
+        opt = getattr(ni, 'gdftopt', None)
+
+        _sorted_mol = opt.mol
+        nao = mol.nao
+        nset = len(dma)
+        vmata = cupy.zeros((nset, nao, nao))
+        vmatb = cupy.zeros((nset, nao, nao))
+
+        if xctype == 'LDA':
+            ao_deriv = 0
+        else:
+            ao_deriv = 1
+
+        ngrids_glob = grids.coords.shape[0]
+        ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices
+        grid_start = device_id * ngrids_per_device
+        grid_end = (device_id + 1) * ngrids_per_device
+
+        p0 = p1 = grid_start
+        t1 = t0 = log.init_timer()
+        for ao, mask, weights, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, 
+                                                  max_memory=None,
+                                                  grid_range=(grid_start, grid_end)):
+            
+            t0 = log.init_timer()
+            p0, p1 = p1, p1+len(weights)
+            # precompute fxc_w
+            fxc_w = fxc[:,:,:,:,p0:p1] * weights
+
+            # precompute molecular orbitals
+            if occ_coeff is not None:
+                occ_coeff_a_mask = occ_coeff_a[mask]
+                occ_coeff_b_mask = occ_coeff_b[mask]
+                rho1a = eval_rho4(_sorted_mol, ao, occ_coeff_a_mask, mo1a[:,mask],
+                                xctype=xctype, hermi=hermi).reshape(nset,-1,p1-p0)
+                rho1b = eval_rho4(_sorted_mol, ao, occ_coeff_b_mask, mo1b[:,mask],
+                                xctype=xctype, hermi=hermi).reshape(nset,-1,p1-p0)
+            else: # slow version
+                rho1a = []
+                rho1b = []
+                for i in range(nset):
+                    rho_tmp = eval_rho(_sorted_mol, ao, dma[i,mask[:,None],mask],
+                                       xctype=xctype, hermi=hermi)
+                    rho1a.append(rho_tmp.reshape(-1,p1-p0))
+                    rho_tmp = eval_rho(_sorted_mol, ao, dmb[i,mask[:,None],mask],
+                                       xctype=xctype, hermi=hermi)
+                    rho1b.append(rho_tmp.reshape(-1,p1-p0))
+            t0 = log.timer_debug1('rho', *t0)
+
+            for i in range(nset):
+                wv_a = contract('xg,xyg->yg', rho1a[i], fxc_w[0,:,0])
+                wv_a+= contract('xg,xyg->yg', rho1b[i], fxc_w[1,:,0])
+                wv_b = contract('xg,xyg->yg', rho1a[i], fxc_w[0,:,1])
+                wv_b+= contract('xg,xyg->yg', rho1b[i], fxc_w[1,:,1])
+                if xctype == 'LDA':
+                    va = ao.dot(_scale_ao(ao, wv_a[0]).T)
+                    vb = ao.dot(_scale_ao(ao, wv_b[0]).T)
+                elif xctype == 'GGA':
+                    wv_a[0] *= .5 # for transpose_sum at the end
+                    wv_b[0] *= .5
+                    va = ao[0].dot(_scale_ao(ao, wv_a).T)
+                    vb = ao[0].dot(_scale_ao(ao, wv_b).T)
+                elif xctype == 'NLC':
+                    raise NotImplementedError('NLC')
+                else:
+                    wv_a[[0,4]] *= .5 # for transpose_sum at the end
+                    wv_b[[0,4]] *= .5
+                    va = ao[0].dot(_scale_ao(ao[:4], wv_a[:4]).T)
+                    vb = ao[0].dot(_scale_ao(ao[:4], wv_b[:4]).T)
+                    va += _tau_dot(ao, ao, wv_a[4])
+                    vb += _tau_dot(ao, ao, wv_b[4])
+                add_sparse(vmata[i], va, mask)
+                add_sparse(vmatb[i], vb, mask)
+            t1 = log.timer_debug2('integration', *t1)
+        t0 = log.timer_debug1('vxc', *t0)
+    return vmata, vmatb
 
 def nr_uks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=0,
                rho0=None, vxc=None, fxc=None, max_memory=2000, verbose=None):
@@ -1144,13 +1237,13 @@ def nr_uks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=
     if opt is None or mol not in [opt.mol, opt._sorted_mol]:
         ni.build(mol, grids.coords)
         opt = ni.gdftopt
-    mol = None
-    _sorted_mol = opt._sorted_mol
+    
     nao, nao0 = opt.coeff.shape
     dma, dmb = dms
     dm_shape = dma.shape
     # AO basis -> gdftopt AO basis
     with_mocc = hasattr(dms, 'mo1')
+    mo1 = occ_coeff = None
     if with_mocc:
         mo1a, mo1b = dms.mo1
         occ_coeffa, occ_coeffb = dms.occ_coeff
@@ -1158,70 +1251,32 @@ def nr_uks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=
         mo1b = opt.sort_orbitals(mo1b, axis=[1])
         occ_coeff_a = opt.sort_orbitals(occ_coeffa, axis=[0])
         occ_coeff_b = opt.sort_orbitals(occ_coeffb, axis=[0])
-
+        occ_coeff = (occ_coeff_a, occ_coeff_b)
+        mo1 = (mo1a, mo1b)
     dma = cupy.asarray(dma).reshape(-1,nao0,nao0)
     dmb = cupy.asarray(dmb).reshape(-1,nao0,nao0)
     dma = opt.sort_orbitals(dma, axis=[1,2])
     dmb = opt.sort_orbitals(dmb, axis=[1,2])
 
-    nset = len(dma)
-    vmata = cupy.zeros((nset, nao, nao))
-    vmatb = cupy.zeros((nset, nao, nao))
-
-    if xctype == 'LDA':
-        ao_deriv = 0
-        nvar = 1
-    elif xctype == 'GGA':
-        ao_deriv = 1
-        nvar = 4
-    else:
-        ao_deriv = 1
-        nvar = 5
-    p0 = p1 = 0
-    for ao, mask, weights, coords in ni.block_loop(
-            _sorted_mol, grids, nao, ao_deriv, max_memory=max_memory):
-        t0 = log.init_timer()
-        p0, p1 = p1, p1+len(weights)
-        # precompute fxc_w
-        fxc_w = fxc[:,:,:,:,p0:p1] * weights
-
-        # precompute molecular orbitals
-        if with_mocc:
-            occ_coeff_a_mask = occ_coeff_a[mask]
-            occ_coeff_b_mask = occ_coeff_b[mask]
-            rho1a = eval_rho4(_sorted_mol, ao, occ_coeff_a_mask, mo1a[:,mask],
-                              xctype=xctype, hermi=hermi)
-            rho1b = eval_rho4(_sorted_mol, ao, occ_coeff_b_mask, mo1b[:,mask],
-                              xctype=xctype, hermi=hermi)
-            rho1 = cupy.stack([rho1a, rho1b]).reshape(2, nset, nvar, p1-p0)
-        else: # slow version
-            rho1 = cupy.empty((2, nset, nvar, p1-p0))
-            for i in range(nset):
-                rho1[0,i] = eval_rho(_sorted_mol, ao, dma[i,mask[:,None],mask],
-                                     xctype=xctype, hermi=hermi)
-                rho1[1,i] = eval_rho(_sorted_mol, ao, dmb[i,mask[:,None],mask],
-                                     xctype=xctype, hermi=hermi)
-        t0 = log.timer_debug1('rho', *t0)
+    futures = []
+    cupy.cuda.get_current_stream().synchronize()
+    with ThreadPoolExecutor(max_workers=_num_devices) as executor:
+        for device_id in range(_num_devices):
+            future = executor.submit(
+                _nr_uks_fxc_task,
+                ni, mol, grids, xc_code, fxc, (dma, dmb), mo1, occ_coeff,
+                verbose=log.verbose, hermi=hermi, device_id=device_id)
+            futures.append(future)
+    vmata_dist = [] 
+    vmatb_dist = []
+    for future in futures:
+        vmata, vmatb = future.result()
+        vmata_dist.append(vmata)
+        vmatb_dist.append(vmatb)
+    
+    vmata = reduce_to_device(vmata_dist, inplace=True)
+    vmatb = reduce_to_device(vmatb_dist, inplace=True)
 
-        for i in range(nset):
-            wv = contract('axg,axbyg->byg', rho1[:,i], fxc_w)
-            if xctype == 'LDA':
-                va = ao.dot(_scale_ao(ao, wv[0,0]).T)
-                vb = ao.dot(_scale_ao(ao, wv[1,0]).T)
-            elif xctype == 'GGA':
-                wv[:,0] *= .5 # for transpose_sum at the end
-                va = ao[0].dot(_scale_ao(ao, wv[0]).T)
-                vb = ao[0].dot(_scale_ao(ao, wv[1]).T)
-            elif xctype == 'NLC':
-                raise NotImplementedError('NLC')
-            else:
-                wv[:,[0,4]] *= .5 # for transpose_sum at the end
-                va = ao[0].dot(_scale_ao(ao[:4], wv[0,:4]).T)
-                vb = ao[0].dot(_scale_ao(ao[:4], wv[1,:4]).T)
-                va += _tau_dot(ao, ao, wv[0,4])
-                vb += _tau_dot(ao, ao, wv[1,4])
-            add_sparse(vmata[i], va, mask)
-            add_sparse(vmatb[i], vb, mask)
     vmata = opt.unsort_orbitals(vmata, axis=[1,2])
     vmatb = opt.unsort_orbitals(vmatb, axis=[1,2])
     if xctype != 'LDA':
@@ -1578,7 +1633,7 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000,
 
     comp = (deriv+1)*(deriv+2)*(deriv+3)//6
     if blksize is None:
-        #cupy.get_default_memory_pool().free_all_blocks()
+        # By default, a memory space of [comp,nao,blksize] is reserved
         mem_avail = get_avail_mem()
         blksize = int((mem_avail*.2/8/((comp+1)*nao + extra))/ ALIGNED) * ALIGNED
         blksize = min(blksize, MIN_BLK_SIZE)
diff --git a/gpu4pyscf/hessian/jk.py b/gpu4pyscf/hessian/jk.py
index 5a1f75ab..f4f102c6 100644
--- a/gpu4pyscf/hessian/jk.py
+++ b/gpu4pyscf/hessian/jk.py
@@ -41,7 +41,7 @@ def _ao2mo(v_ao, mocc, mo_coeff):
     v_ao = contract('nij,jo->nio', v_ao, mocc)
     return contract('nio,ip->npo', v_ao, mo_coeff)
 
-def _jk_task(mol, dms, mo_coeff, mocc, vhfopt, task_list, hermi=0,
+def _jk_task(mol, dms, mo_coeff, mo_occ, vhfopt, task_list, hermi=0,
              device_id=0, with_j=True, with_k=True, verbose=0):
     nao, _ = vhfopt.coeff.shape
     uniq_l_ctr = vhfopt.uniq_l_ctr
@@ -56,6 +56,12 @@ def _jk_task(mol, dms, mo_coeff, mocc, vhfopt, task_list, hermi=0,
         log = logger.new_logger(mol, verbose)
         cput0 = log.init_timer()
         dms = cp.asarray(dms)
+        coeff = cp.asarray(vhfopt.coeff)
+
+        # Transform MO coeffcients and DM into sorted, cartesian AO basis
+        #:dms = cp.einsum('pi,nij,qj->npq', vhfopt.coeff, dms, vhfopt.coeff)
+        dms = sandwich_dot(dms, coeff.T)
+        dms = cp.asarray(dms, order='C')
 
         n_dm = dms.shape[0]
         tile_q_ptr = ctypes.cast(vhfopt.tile_q_cond.data.ptr, ctypes.c_void_p)
@@ -126,11 +132,16 @@ def _jk_task(mol, dms, mo_coeff, mocc, vhfopt, task_list, hermi=0,
         if with_k:
             vk = transpose_sum(vk)
 
-        if isinstance(mocc, tuple):
+        assert mo_coeff.ndim == 2 or mo_coeff.ndim == 3
+        if mo_coeff.ndim == 3:
             # Unrestricted case
-            mocca, moccb = mocc
-            moa, mob = mo_coeff
+            mo_coeff = cp.asarray(mo_coeff)
+            mo_occ = cp.asarray(mo_occ)
+            moa = coeff.dot(mo_coeff[0]) 
+            mob = coeff.dot(mo_coeff[1])
             nmoa, nmob = moa.shape[1], mob.shape[1]
+            mocca = moa[:,mo_occ[0] > 0.5]
+            moccb = mob[:,mo_occ[1] > 0.5]
             nocca, noccb = mocca.shape[1], moccb.shape[1]
             n_dm_2 = n_dm//2
             if with_j:
@@ -144,6 +155,10 @@ def _jk_task(mol, dms, mo_coeff, mocc, vhfopt, task_list, hermi=0,
                 vk[:,:nmoa*nocca] = _ao2mo(vka, mocca, moa).reshape(n_dm_2,-1)
                 vk[:,nmoa*nocca:] = _ao2mo(vkb, moccb, mob).reshape(n_dm_2,-1)
         else:
+            mo_coeff = cp.asarray(mo_coeff)
+            mo_occ = cp.asarray(mo_occ)
+            mo_coeff = coeff.dot(mo_coeff)
+            mocc = mo_coeff[:,mo_occ>0.5]
             if with_j:
                 vj = _ao2mo(vj, mocc, mo_coeff).reshape(n_dm,-1)
             if with_k:
@@ -151,7 +166,7 @@ def _jk_task(mol, dms, mo_coeff, mocc, vhfopt, task_list, hermi=0,
         
     return vj, vk, kern_counts, timing_counter
 
-def get_jk(mol, dm, mo_coeff, mocc, hermi=0, vhfopt=None, 
+def get_jk(mol, dm, mo_coeff, mo_occ, hermi=0, vhfopt=None, 
            with_j=True, with_k=True, verbose=None):
     '''Compute J, K matrices in MO
     '''
@@ -166,18 +181,6 @@ def get_jk(mol, dm, mo_coeff, mocc, hermi=0, vhfopt=None,
 
     dm = cp.asarray(dm, order='C')
     dms = dm.reshape(-1,nao_orig,nao_orig)
-
-    # Transform MO coeffcients and DM into sorted, cartesian AO basis
-    #:dms = cp.einsum('pi,nij,qj->npq', vhfopt.coeff, dms, vhfopt.coeff)
-    dms = sandwich_dot(dms, vhfopt.coeff.T)
-    dms = cp.asarray(dms, order='C')
-    coeff = vhfopt.coeff
-    if isinstance(mocc, tuple):
-        mocc = (coeff.dot(mocc[0]), coeff.dot(mocc[1]))
-        mo_coeff = (coeff.dot(mo_coeff[0]), coeff.dot(mo_coeff[1]))
-    else:
-        mocc = coeff.dot(mocc)
-        mo_coeff = coeff.dot(mo_coeff)
     n_dm = dms.shape[0]
 
     assert with_j or with_k
@@ -201,7 +204,7 @@ def get_jk(mol, dm, mo_coeff, mocc, hermi=0, vhfopt=None,
         for device_id in range(_num_devices):
             future = executor.submit(
                 _jk_task,
-                mol, dms, mo_coeff, mocc, vhfopt, task_list[device_id], hermi=hermi,
+                mol, dms, mo_coeff, mo_occ, vhfopt, task_list[device_id], hermi=hermi,
                 with_j=with_j, with_k=with_k, verbose=verbose, 
                 device_id=device_id)
             futures.append(future)
@@ -244,6 +247,10 @@ def get_jk(mol, dm, mo_coeff, mocc, hermi=0, vhfopt=None,
                 scripts.append('jk->s2il')
             else:
                 scripts.append('jk->s1il')
+        # Transform MO coeffcients and DM into sorted, cartesian AO basis
+        #:dms = cp.einsum('pi,nij,qj->npq', vhfopt.coeff, dms, vhfopt.coeff)
+        dms = sandwich_dot(dms, vhfopt.coeff.T)
+        dms = cp.asarray(dms, order='C')
         shls_excludes = [0, h_shls[0]] * 4
         vs_h = _vhf.direct_mapdm('int2e_cart', 's8', scripts,
                                  dms.get(), 1, mol._atm, mol._bas, mol._env,
@@ -263,9 +270,11 @@ def get_jk(mol, dm, mo_coeff, mocc, hermi=0, vhfopt=None,
             if with_k:
                 vk1[:,idy,idx] = vk1[:,idx,idy]
 
-        if isinstance(mocc, tuple):
-            mocca, moccb = mocc
-            moa, mob = mo_coeff
+        if mo_coeff.ndim == 3:
+            moa = vhfopt.coeff.dot(mo_coeff[0])
+            mob = vhfopt.coeff.dot(mo_coeff[1])
+            mocca = moa[:,mo_occ[0]>0.5]
+            moccb = mob[:,mo_occ[1]>0.5]
             nmoa = moa.shape[1]
             nocca = mocca.shape[1]
             n_dm_2 = n_dm//2
@@ -278,6 +287,8 @@ def get_jk(mol, dm, mo_coeff, mocc, hermi=0, vhfopt=None,
                 vk[:,:nmoa*nocca] += _ao2mo(vka, mocca, moa).reshape(n_dm_2,-1)
                 vk[:,nmoa*nocca:] += _ao2mo(vkb, moccb, mob).reshape(n_dm_2,-1)
         else:
+            mo_coeff = vhfopt.coeff.dot(mo_coeff)
+            mocc = mo_coeff[:,mo_occ>0.5]
             if with_j:
                 vj += _ao2mo(cp.asarray(vj1), mocc, mo_coeff).reshape(n_dm,-1)
             if with_k:
diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py
index bec9d0e0..0b2f3f99 100644
--- a/gpu4pyscf/hessian/rhf.py
+++ b/gpu4pyscf/hessian/rhf.py
@@ -180,6 +180,11 @@ def _ejk_ip2_task(mol, dms, vhfopt, task_list, j_factor=1.0, k_factor=1.0,
         log = logger.new_logger(mol, verbose)
         cput0 = log.init_timer()
         dms = cp.asarray(dms)
+        coeff = cp.asarray(vhfopt.coeff)
+        
+        #:dms = cp.einsum('pi,nij,qj->npq', vhfopt.coeff, dms, vhfopt.coeff)
+        dms = sandwich_dot(dms, coeff.T)
+        dms = cp.asarray(dms, order='C')
 
         tile_q_ptr = ctypes.cast(vhfopt.tile_q_cond.data.ptr, ctypes.c_void_p)
         q_ptr = ctypes.cast(vhfopt.q_cond.data.ptr, ctypes.c_void_p)
@@ -275,9 +280,6 @@ def _partial_ejk_ip2(mol, dm, vhfopt=None, j_factor=1., k_factor=1., verbose=Non
 
     dm = cp.asarray(dm, order='C')
     dms = dm.reshape(-1,nao_orig,nao_orig)
-    #:dms = cp.einsum('pi,nij,qj->npq', vhfopt.coeff, dms, vhfopt.coeff)
-    dms = sandwich_dot(dms, vhfopt.coeff.T)
-    dms = cp.asarray(dms, order='C')
 
     init_constant(mol)
 
@@ -656,10 +658,11 @@ def fvind_vo(mo1):
 
     avail_mem = get_avail_mem()
     # *4 for input dm, vj, vk, and vxc
-    blksize = int(min(avail_mem*.3 / (8*3*nao*nao*4),
-                      avail_mem*.6 / (8*nmo*nocc*natm*3*5)))
+    blksize = int(min(avail_mem*.3 / (8*3*nao*nocc*4), # in MO
+                      avail_mem*.6 / (8*nmo*nocc*3*5), 
+                      avail_mem*.3 / (8*nmo*nmo*3*3))) # vj, vk, dm 
     if blksize < ALIGNED**2:
-        raise RuntimeError('GPU memory insufficient')
+        raise RuntimeError('GPU memory insufficient for solving CPHF equations')
 
     blksize = (blksize // ALIGNED**2) * ALIGNED**2
     log.debug(f'GPU memory {avail_mem/GB:.1f} GB available')
@@ -884,7 +887,7 @@ def get_hcore(iatm, jatm):
 def hcore_generator(hessobj, mol=None):
     raise NotImplementedError
 
-def _get_jk_mo(hessobj, mol, dms, mo_coeff, mocc, 
+def _get_jk_mo(hessobj, mol, dms, mo_coeff, mo_occ, 
             hermi=1, with_j=True, with_k=True, omega=None):
     ''' Compute J/K matrices in MO for multiple DMs
     '''
@@ -894,12 +897,11 @@ def _get_jk_mo(hessobj, mol, dms, mo_coeff, mocc,
         with mol.with_range_coulomb(omega):
             vhfopt = mf._opt_gpu[omega] = _VHFOpt(mol, mf.direct_scf_tol).build()
     with mol.with_range_coulomb(omega):
-        vj, vk = jk.get_jk(mol, dms, mo_coeff, mocc, hermi, vhfopt, with_j, with_k)
+        vj, vk = jk.get_jk(mol, dms, mo_coeff, mo_occ, hermi, vhfopt, with_j, with_k)
     return vj, vk
 
 def _get_veff_resp_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1, omega=None):
-    mocc = mo_coeff[:,mo_occ>0]
-    vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, mocc, 
+    vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, 
                      hermi=hermi, with_j=True, with_k=True, omega=omega)
     return vj - 0.5 * vk
 
diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py
index d64b35bd..261fa631 100644
--- a/gpu4pyscf/hessian/rks.py
+++ b/gpu4pyscf/hessian/rks.py
@@ -731,16 +731,16 @@ def get_veff_resp_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1, omega=None):
     v1 = jk._ao2mo(v1, mocc, mo_coeff).reshape(-1,nmo*nocc)
 
     if hybrid:
-        vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, mocc, hermi=1)
+        vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, hermi=1)
         vk *= hyb
         if omega > 1e-10:  # For range separated Coulomb
-            _, vk_lr = hessobj.get_jk_mo(mol, dms, mo_coeff, mocc, hermi, 
+            _, vk_lr = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, hermi, 
                                         with_j=False, omega=omega) 
             vk_lr *= (alpha-hyb)
             vk += vk_lr
         v1 += vj - .5 * vk
     else:
-        v1 += hessobj.get_jk_mo(mol, dms, mo_coeff, mocc, hermi=1, 
+        v1 += hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, hermi=1, 
                                 with_k=False)[0]
     
     return v1
diff --git a/gpu4pyscf/hessian/tests/test_rhf_hessian.py b/gpu4pyscf/hessian/tests/test_rhf_hessian.py
index 30a1c188..ac657199 100644
--- a/gpu4pyscf/hessian/tests/test_rhf_hessian.py
+++ b/gpu4pyscf/hessian/tests/test_rhf_hessian.py
@@ -106,7 +106,6 @@ def test_get_jk(self):
         nao = mol.nao
         mo_coeff = np.random.rand(nao, nao)
         dm = mo_coeff.dot(mo_coeff.T) * 2
-
         vj, vk = rhf_gpu._get_jk_ip1(mol, dm)
         assert abs(lib.fp(vj.get()) -  87674.69061160382) < 1e-7
         assert abs(lib.fp(vk.get()) - -9.317650662101629) < 1e-7
@@ -183,9 +182,11 @@ def test_jk_mix(self):
         )
         nao = mol1.nao
         mo_coeff = cupy.random.rand(nao, nao)
+        mo_occ = cupy.zeros([nao])
+        mo_occ[:3] = 2
         mocc = mo_coeff[:,:3]
         dm = mocc.dot(mocc.T) * 2
-        vj_mo, vk_mo = jk.get_jk(mol1, dm, mo_coeff, mocc, hermi=1)
+        vj_mo, vk_mo = jk.get_jk(mol1, dm, mo_coeff, mo_occ, hermi=1)
         
         mf = scf.RHF(mol1)
         vj, vk = mf.get_jk(mol1, dm, hermi=1)
diff --git a/gpu4pyscf/hessian/tests/test_uhf_hessian.py b/gpu4pyscf/hessian/tests/test_uhf_hessian.py
index 1e10306c..a7d5c983 100644
--- a/gpu4pyscf/hessian/tests/test_uhf_hessian.py
+++ b/gpu4pyscf/hessian/tests/test_uhf_hessian.py
@@ -119,10 +119,13 @@ def test_jk_mix(self):
         mo_coeff = cupy.random.rand(2, nao, nao)
         mocca = mo_coeff[0,:,:3]
         moccb = mo_coeff[1,:,:2]
+        mo_occ = cupy.zeros([2,nao])
+        mo_occ[0,:3] = 1
+        mo_occ[1,:2] = 1
         dm = cupy.empty([2,nao,nao])
         dm[0] = mocca.dot(mocca.T)
         dm[1] = moccb.dot(moccb.T)
-        vj_mo, vk_mo = jk.get_jk(mol1, dm, mo_coeff, (mocca,moccb), hermi=1)
+        vj_mo, vk_mo = jk.get_jk(mol1, dm, mo_coeff, mo_occ, hermi=1)
         
         mf = scf.UHF(mol1)
         vj, vk = mf.get_jk(mol1, dm, hermi=1)
diff --git a/gpu4pyscf/hessian/uhf.py b/gpu4pyscf/hessian/uhf.py
index 73bec288..c7e836b8 100644
--- a/gpu4pyscf/hessian/uhf.py
+++ b/gpu4pyscf/hessian/uhf.py
@@ -404,9 +404,7 @@ def fx(mo1):
     return fx
 
 def _get_veff_resp_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1):
-    mocca = mo_coeff[0][:,mo_occ[0]>0]
-    moccb = mo_coeff[1][:,mo_occ[1]>0]
-    vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, (mocca, moccb), 
+    vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, 
                                hermi=hermi, with_j=True, with_k=True)
     return vj - vk
 
diff --git a/gpu4pyscf/hessian/uks.py b/gpu4pyscf/hessian/uks.py
index 5d565b81..db4bf59e 100644
--- a/gpu4pyscf/hessian/uks.py
+++ b/gpu4pyscf/hessian/uks.py
@@ -880,16 +880,16 @@ def get_veff_resp_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1):
     v1vo[:,:nmoa*nocca] = jk._ao2mo(v1[0], mocca, mo_coeff[0]).reshape(-1,nmoa*nocca)
     v1vo[:,nmoa*nocca:] = jk._ao2mo(v1[1], moccb, mo_coeff[1]).reshape(-1,nmob*noccb)
     if hybrid:
-        vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, (mocca, moccb), hermi=1)
+        vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, hermi=1)
         vk *= hyb
         if omega > 1e-10:
-            _, vk_lr = hessobj.get_jk_mo(mol, dms, mo_coeff, (mocca, moccb), 
+            _, vk_lr = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, 
                                          hermi, with_j=False, omega=omega) 
             vk_lr *= (alpha-hyb)
             vk += vk_lr
         v1vo += vj - vk
     else:
-        v1vo += hessobj.get_jk_mo(mol, dms, mo_coeff, (mocca, moccb), 
+        v1vo += hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, 
                                   hermi=1, with_k=False)[0]
     return v1vo
 

From 81d9a8655a10230f5e44d222f79b0312789620cf Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Sun, 22 Dec 2024 06:22:43 +0000
Subject: [PATCH 08/49] more accurate memory estimate for hessian

---
 gpu4pyscf/df/hessian/rhf.py | 3 +++
 gpu4pyscf/df/hessian/rks.py | 4 +++-
 gpu4pyscf/df/hessian/uhf.py | 5 ++++-
 gpu4pyscf/df/hessian/uks.py | 2 ++
 gpu4pyscf/hessian/rhf.py    | 6 ++++--
 gpu4pyscf/hessian/rks.py    | 9 ++++++---
 gpu4pyscf/hessian/uhf.py    | 6 ++++--
 gpu4pyscf/hessian/uks.py    | 6 ++++--
 8 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py
index bd9ef958..de6f1ebb 100644
--- a/gpu4pyscf/df/hessian/rhf.py
+++ b/gpu4pyscf/df/hessian/rhf.py
@@ -396,6 +396,9 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
 
 
 def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
+    mol = hessobj.mol
+    natm = mol.natm
+    assert atmlst is None or atmlst ==range(natm)
     vj, vk = _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile, atmlst, verbose, True)
     # h1mo = h1 + vj - 0.5 * vk
     h1mo = vk
diff --git a/gpu4pyscf/df/hessian/rks.py b/gpu4pyscf/df/hessian/rks.py
index 1d16ff16..31c1d506 100644
--- a/gpu4pyscf/df/hessian/rks.py
+++ b/gpu4pyscf/df/hessian/rks.py
@@ -85,6 +85,8 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
 
 def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
     mol = hessobj.mol
+    natm = mol.natm
+    assert atmlst is None or atmlst ==range(natm)
     mf = hessobj.base
     ni = mf._numint
     ni.libxc.test_deriv_order(mf.xc, 2, raise_error=True)
@@ -101,7 +103,7 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
     vj1 = vk1 = None
     
     if abs(omega) > 1e-10 and abs(alpha-hyb) > 1e-10:
-        _, vk1_lr = df_rhf_hess._gen_jk(hessobj, mo_coeff, mo_occ, chkfile,
+        _, vk1_lr = df_rhf_hess._get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile,
                                              atmlst, verbose, True, omega=omega)
         h1mo -= .5 * (alpha - hyb) * vk1_lr
         vk1_lr = None
diff --git a/gpu4pyscf/df/hessian/uhf.py b/gpu4pyscf/df/hessian/uhf.py
index 1dc3f3a4..acd67380 100644
--- a/gpu4pyscf/df/hessian/uhf.py
+++ b/gpu4pyscf/df/hessian/uhf.py
@@ -413,6 +413,9 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
 def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
     mol = hessobj.mol
     natm = mol.natm
+    mol = hessobj.mol
+    natm = mol.natm
+    assert atmlst is None or atmlst ==range(natm)
     if atmlst is None:
         atmlst = range(natm)
 
@@ -633,7 +636,7 @@ def _ao2mo(mat, mocc, mo):
         return contract('xik,ip->xpk', tmp, mo)
 
     cupy.get_default_memory_pool().free_all_blocks()
-    
+
     for i0, ia in enumerate(atmlst):
         shl0, shl1, p0, p1 = aoslices[ia]
         vj1_ao = cupy.zeros([3,nao,nao])
diff --git a/gpu4pyscf/df/hessian/uks.py b/gpu4pyscf/df/hessian/uks.py
index 5fd23a34..6f94ed06 100644
--- a/gpu4pyscf/df/hessian/uks.py
+++ b/gpu4pyscf/df/hessian/uks.py
@@ -90,6 +90,8 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
 
 def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
     mol = hessobj.mol
+    natm = mol.natm
+    assert atmlst is None or atmlst ==range(natm)
     mf = hessobj.base
     ni = mf._numint
     ni.libxc.test_deriv_order(mf.xc, 2, raise_error=True)
diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py
index 0b2f3f99..2ca835ad 100644
--- a/gpu4pyscf/hessian/rhf.py
+++ b/gpu4pyscf/hessian/rhf.py
@@ -357,14 +357,16 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
     assert atmlst is None
     mol = hessobj.mol
     natm = mol.natm
-    nao = mo_coeff.shape[0]
     mo_coeff = cp.asarray(mo_coeff)
     mocc = cp.asarray(mo_coeff[:,mo_occ>0])
     dm0 = mocc.dot(mocc.T) * 2
     h1mo = rhf_grad.get_grad_hcore(hessobj.base.Gradients())
 
+    # Estimate the size of intermediate variables
+    # dm, vj, and vk in [natm,3,nao_cart,nao_cart]
+    nao_cart = mol.nao_cart()
     avail_mem = get_avail_mem()
-    slice_size = int(avail_mem*0.6) // (8*3*nao*nao)
+    slice_size = int(avail_mem*0.5) // (8*3*nao_cart*nao_cart*3)
     for atoms_slice in lib.prange(0, natm, slice_size):
         vj, vk = _get_jk_ip1(mol, dm0, atoms_slice=atoms_slice, verbose=verbose)
         #:vhf = vj - vk * .5
diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py
index 261fa631..5d909c78 100644
--- a/gpu4pyscf/hessian/rks.py
+++ b/gpu4pyscf/hessian/rks.py
@@ -111,7 +111,6 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
     mol = hessobj.mol
     natm = mol.natm
     assert atmlst is None or atmlst == range(natm)
-    nao = mo_coeff.shape[0]
     mocc = mo_coeff[:,mo_occ>0]
     dm0 = numpy.dot(mocc, mocc.T) * 2
     avail_mem = get_avail_mem()
@@ -124,8 +123,11 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
     omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
     with_k = ni.libxc.is_hybrid_xc(mf.xc)
 
+    # Estimate the size of intermediate variables
+    # dm, vj, and vk in [natm,3,nao_cart,nao_cart]
+    nao_cart = mol.nao_cart()
     avail_mem -= 8 * h1mo.size
-    slice_size = int(avail_mem*0.5) // (8*3*nao*nao)
+    slice_size = int(avail_mem*0.5) // (8*3*nao_cart*nao_cart*3)
     for atoms_slice in lib.prange(0, natm, slice_size):
         vj, vk = rhf_hess._get_jk_ip1(mol, dm0, with_k=with_k,
                                       atoms_slice=atoms_slice, verbose=verbose)
@@ -133,6 +135,7 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
         if with_k:
             vk *= .5 * hyb
             veff -= vk
+        vj = vk = None
         if abs(omega) > 1e-10 and abs(alpha-hyb) > 1e-10:
             with mol.with_range_coulomb(omega):
                 vk_lr = rhf_hess._get_jk_ip1(mol, dm0, with_j=False, verbose=verbose)[1]
@@ -142,7 +145,7 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
         for i, ia in enumerate(range(atom0, atom1)):
             for ix in range(3):
                 h1mo[ia,ix] += mo_coeff.T.dot(veff[i,ix].dot(mocc))
-        vj = vk = vk_lr = veff = None
+        vk_lr = veff = None
     return h1mo
 
 XX, XY, XZ = 4, 5, 6
diff --git a/gpu4pyscf/hessian/uhf.py b/gpu4pyscf/hessian/uhf.py
index c7e836b8..44154532 100644
--- a/gpu4pyscf/hessian/uhf.py
+++ b/gpu4pyscf/hessian/uhf.py
@@ -183,15 +183,17 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
     mo_a, mo_b = mo_coeff
     mocca = mo_a[:,mo_occ[0]>0]
     moccb = mo_b[:,mo_occ[1]>0]
-    nao = mo_a.shape[0]
     dm0a = mocca.dot(mocca.T)
     dm0b = moccb.dot(moccb.T)
     grad_obj = hessobj.base.Gradients()
     h1moa = rhf_grad.get_grad_hcore(grad_obj, mo_a, mo_occ[0])
     h1mob = rhf_grad.get_grad_hcore(grad_obj, mo_b, mo_occ[1])
 
+    # Estimate the size of intermediate variables
+    # dm, vj, and vk in [natm,3,nao_cart,nao_cart]
+    nao_cart = mol.nao_cart()
     avail_mem = get_avail_mem()
-    slice_size = int(avail_mem*0.6) // (8*3*nao*nao*2)
+    slice_size = int(avail_mem*0.5) // (8*3*nao_cart*nao_cart*6)
     for atoms_slice in lib.prange(0, natm, slice_size):
         vja, vka = rhf_hess_gpu._get_jk_ip1(mol, dm0a, atoms_slice=atoms_slice, verbose=verbose)
         vjb, vkb = rhf_hess_gpu._get_jk_ip1(mol, dm0b, atoms_slice=atoms_slice, verbose=verbose)
diff --git a/gpu4pyscf/hessian/uks.py b/gpu4pyscf/hessian/uks.py
index db4bf59e..66571300 100644
--- a/gpu4pyscf/hessian/uks.py
+++ b/gpu4pyscf/hessian/uks.py
@@ -116,7 +116,6 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
     mo_a, mo_b = mo_coeff
     mocca = mo_a[:,mo_occ[0]>0]
     moccb = mo_b[:,mo_occ[1]>0]
-    nao = mo_a.shape[0]
     dm0a = mocca.dot(mocca.T)
     dm0b = moccb.dot(moccb.T)
     avail_mem = get_avail_mem()
@@ -131,8 +130,11 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
     omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
     with_k = ni.libxc.is_hybrid_xc(mf.xc)
 
+    # Estimate the size of intermediate variables
+    # dm, vj, and vk in [natm,3,nao_cart,nao_cart]
+    nao_cart = mol.nao_cart()
     avail_mem -= 8 * (h1moa.size + h1mob.size)
-    slice_size = int(avail_mem*0.5) // (8*3*nao*nao)
+    slice_size = int(avail_mem*0.5) // (8*3*nao_cart*nao_cart*6)
     for atoms_slice in lib.prange(0, natm, slice_size):
         vja, vka = rhf_hess._get_jk_ip1(mol, dm0a, with_k=with_k, atoms_slice=atoms_slice, verbose=verbose)
         vjb, vkb = rhf_hess._get_jk_ip1(mol, dm0b, with_k=with_k, atoms_slice=atoms_slice, verbose=verbose)

From 31cbd4836338cdf6f9a3ec17428c7d4695717a51 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Sun, 22 Dec 2024 07:10:46 +0000
Subject: [PATCH 09/49] _gen_jk -> _get_jk_ip

---
 gpu4pyscf/df/hessian/uks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gpu4pyscf/df/hessian/uks.py b/gpu4pyscf/df/hessian/uks.py
index 6f94ed06..31273a7d 100644
--- a/gpu4pyscf/df/hessian/uks.py
+++ b/gpu4pyscf/df/hessian/uks.py
@@ -114,7 +114,7 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
     vj1 = vk1 = vj1a = vj1b = vk1a = vk1b = None
     
     if abs(omega) > 1e-10 and abs(alpha-hyb) > 1e-10:
-        _, vk1_lr = df_uhf_hess._gen_jk(hessobj, mo_coeff, mo_occ, chkfile,
+        _, vk1_lr = df_uhf_hess._get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile,
                                              atmlst, verbose, True, omega=omega)
         vk1a, vk1b = vk1_lr
         h1moa -= (alpha - hyb) * vk1a

From 4e8501141c77ff58f1f6ba417873f28c4d685cca Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <wxj6000@gmail.com>
Date: Sun, 22 Dec 2024 16:38:07 -0800
Subject: [PATCH 10/49] with_j and with_k for hessian

---
 gpu4pyscf/df/hessian/jk.py  | 119 ++++++------
 gpu4pyscf/df/hessian/rhf.py | 268 +++++++++++++++------------
 gpu4pyscf/df/hessian/rks.py |  25 +--
 gpu4pyscf/df/hessian/uhf.py | 357 ++++++++++++++++++++----------------
 gpu4pyscf/df/hessian/uks.py |  25 +--
 gpu4pyscf/df/int3c2e.py     | 215 +++++++++++++---------
 gpu4pyscf/hessian/rhf.py    |  18 +-
 gpu4pyscf/hessian/rks.py    |  13 +-
 gpu4pyscf/hessian/uhf.py    |   9 +-
 gpu4pyscf/hessian/uks.py    |  14 +-
 10 files changed, 581 insertions(+), 482 deletions(-)

diff --git a/gpu4pyscf/df/hessian/jk.py b/gpu4pyscf/df/hessian/jk.py
index fb097180..f8992ca3 100644
--- a/gpu4pyscf/df/hessian/jk.py
+++ b/gpu4pyscf/df/hessian/jk.py
@@ -18,7 +18,6 @@
 import numpy as np
 from concurrent.futures import ThreadPoolExecutor
 import cupy
-from pyscf import gto
 from gpu4pyscf.df import int3c2e
 from gpu4pyscf.scf.int4c2e import libgint
 from gpu4pyscf.hessian.jk import _ao2mo
@@ -79,7 +78,7 @@ def _jk_task_with_mo1(dfobj, dms, mo_coeff, mo1s, occ_coeffs,
                         rhok1 = contract('Lij,jo->Loi', cderi, mo1[i])
                         rhok1 = rhok1.reshape([-1,nao])
                         vk[i] += cupy.dot(rhok1.T, rhok_oo)
-                        
+
                         rhok1 = rhok1.reshape([-1,nocc,nao])
                         rhok1 = contract('Loi,ip->Lop', rhok1, occ_coeff)
                         rhok1 = rhok1.reshape([-1,nocc])
@@ -91,7 +90,7 @@ def _jk_task_with_mo1(dfobj, dms, mo_coeff, mo1s, occ_coeffs,
             vj = cupy.zeros(dms_shape)
             vj[:,rows,cols] = vj_sparse
             vj[:,cols,rows] = vj_sparse
-        
+
         vj_mo = vk_mo = None
         if len(occ_coeffs) == 1:
             # Restricted case
@@ -127,7 +126,7 @@ def _jk_task_with_mo1(dfobj, dms, mo_coeff, mo1s, occ_coeffs,
         t0 = log.timer_debug1(f'vj and vk on Device {device_id}', *t0)
     return vj_mo, vk_mo
 
-def get_jk(dfobj, dms_tag, mo_coeff, mocc, hermi=0, 
+def get_jk(dfobj, dms_tag, mo_coeff, mocc, hermi=0,
            with_j=True, with_k=True, direct_scf_tol=1e-14, omega=None):
     ''' Compute J/K in MO with density fitting
     '''
@@ -195,7 +194,7 @@ def _get_int3c2e_ipip_slice(ip_type, intopt, cp_ij_id, aux_id, omega=None, strea
 
     if omega is None: omega = 0.0
     if stream is None: stream = cupy.cuda.get_current_stream()
-    
+
     fn = getattr(libgint, 'GINTfill_int3c2e_' + ip_type)
     nao = intopt._sorted_mol.nao
     naux = intopt._sorted_auxmol.nao
@@ -206,7 +205,7 @@ def _get_int3c2e_ipip_slice(ip_type, intopt, cp_ij_id, aux_id, omega=None, strea
 
     cp_kl_id = aux_id + len(intopt.log_qs)
     lk = intopt.aux_angular[aux_id]
-    
+
     cpi = intopt.cp_idx[cp_ij_id]
     cpj = intopt.cp_jdx[cp_ij_id]
     li = intopt.angular[cpi]
@@ -251,7 +250,7 @@ def _get_int3c2e_ipip_slice(ip_type, intopt, cp_ij_id, aux_id, omega=None, strea
         pmol = intopt._tot_mol
         intor = pmol._add_suffix('int3c2e_' + ip_type)
         opt = make_cintopt(pmol._atm, pmol._bas, pmol._env, intor)
-    
+
         # TODO: sph2cart in CPU?
         ishl0, ishl1 = intopt.l_ctr_offsets[cpi], intopt.l_ctr_offsets[cpi+1]
         jshl0, jshl1 = intopt.l_ctr_offsets[cpj], intopt.l_ctr_offsets[cpj+1]
@@ -270,26 +269,29 @@ def _get_int3c2e_ipip_slice(ip_type, intopt, cp_ij_id, aux_id, omega=None, strea
 
 
 def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo,
-                        device_id=0, with_k=True, omega=None, auxbasis_response=1):
+                        device_id=0, with_j=True, with_k=True, omega=None,
+                        auxbasis_response=1):
     natm = intopt.mol.natm
     nao = dm0.shape[0]
-    naux = rhok.shape[0]
+    assert with_j or with_k
     ao_loc = intopt.ao_loc
     aux_ao_loc = intopt.aux_ao_loc
     with cupy.cuda.Device(device_id), _streams[device_id]:
         log = logger.new_logger(intopt.mol, intopt.mol.verbose)
         t0 = log.init_timer()
-        rhoj = cupy.asarray(rhoj)
-        rhok = cupy.asarray(rhok)
         orbo = cupy.asarray(orbo)
         dm0 = cupy.asarray(dm0)
         nao = dm0.shape[0]
-
-        hj_ipip1 = cupy.zeros([9,nao])
-        hj_ipip2 = cupy.zeros([9,naux])
-        hj_ip1ip2 = cupy.zeros([9,nao,naux])
-        hj_ipvip1 = cupy.zeros([9,nao,nao])
+        if with_j:
+            naux = rhoj.shape[0]
+            rhoj = cupy.asarray(rhoj)
+            hj_ipip1 = cupy.zeros([9,nao])
+            hj_ipip2 = cupy.zeros([9,naux])
+            hj_ip1ip2 = cupy.zeros([9,nao,naux])
+            hj_ipvip1 = cupy.zeros([9,nao,nao])
         if with_k:
+            naux = rhok.shape[0]
+            rhok = cupy.asarray(rhok)
             hk_ipip1 = cupy.zeros([9,nao])
             hk_ipip2 = cupy.zeros([9,naux])
             hk_ip1ip2 = cupy.zeros([9,nao,naux])
@@ -302,57 +304,63 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo,
             i0, i1 = ao_loc[cpi], ao_loc[cpi+1]
             j0, j1 = ao_loc[cpj], ao_loc[cpj+1]
             k0, k1 = aux_ao_loc[aux_id], aux_ao_loc[aux_id+1]
-            
+
             if with_k:
                 rhok_tmp = contract('por,ir->poi', rhok[k0:k1], orbo[i0:i1])
                 rhok_tmp = contract('poi,jo->pji', rhok_tmp, orbo[j0:j1])
 
             # (20|0), (0|0)(0|00)
             int3c_blk = _get_int3c2e_ipip_slice('ipip1', intopt, cp_ij_id, aux_id, omega=omega)
-            tmp = contract('xpji,ij->xpi', int3c_blk, dm0[i0:i1,j0:j1])
-            hj_ipip1[:,i0:i1] += contract('xpi,p->xi', tmp, rhoj[k0:k1])
+            if with_j:
+                tmp = contract('xpji,ij->xpi', int3c_blk, dm0[i0:i1,j0:j1])
+                hj_ipip1[:,i0:i1] += contract('xpi,p->xi', tmp, rhoj[k0:k1])
             if with_k:
                 hk_ipip1[:,i0:i1] += contract('xpji,pji->xi', int3c_blk, rhok_tmp)
 
             # (11|0), (0|0)(0|00) without response of RI basis
             int3c_blk = _get_int3c2e_ipip_slice('ipvip1', intopt, cp_ij_id, aux_id, omega=omega)
-            tmp = contract('xpji,ij->xpij', int3c_blk, dm0[i0:i1,j0:j1])
-            hj_ipvip1[:,i0:i1,j0:j1] += contract('xpij,p->xij', tmp, rhoj[k0:k1])
+            if with_j:
+                tmp = contract('xpji,ij->xpij', int3c_blk, dm0[i0:i1,j0:j1])
+                hj_ipvip1[:,i0:i1,j0:j1] += contract('xpij,p->xij', tmp, rhoj[k0:k1])
             if with_k:
                 hk_ipvip1[:,i0:i1,j0:j1] += contract('xpji,pji->xij', int3c_blk, rhok_tmp)
 
             if auxbasis_response < 1:
                 continue
-            
+
             # (10|1), (0|0)(0|00)
             int3c_blk = _get_int3c2e_ipip_slice('ip1ip2', intopt, cp_ij_id, aux_id, omega=omega)
-            tmp = contract('xpji,ij->xpi', int3c_blk, dm0[i0:i1,j0:j1])
-            hj_ip1ip2[:,i0:i1,k0:k1] += contract('xpi,p->xip', tmp, rhoj[k0:k1])
+            if with_j:
+                tmp = contract('xpji,ij->xpi', int3c_blk, dm0[i0:i1,j0:j1])
+                hj_ip1ip2[:,i0:i1,k0:k1] += contract('xpi,p->xip', tmp, rhoj[k0:k1])
             if with_k:
                 hk_ip1ip2[:,i0:i1,k0:k1] += contract('xpji,pji->xip', int3c_blk, rhok_tmp)
-            
+
             if auxbasis_response < 2:
                 continue
-            
+
             # (00|2), (0|0)(0|00)
             int3c_blk = _get_int3c2e_ipip_slice('ipip2', intopt, cp_ij_id, aux_id, omega=omega)
-            tmp = contract('xpji,ij->xp', int3c_blk, dm0[i0:i1,j0:j1])
-            hj_ipip2[:,k0:k1] += contract('xp,p->xp', tmp, rhoj[k0:k1])
+            if with_j:
+                tmp = contract('xpji,ij->xp', int3c_blk, dm0[i0:i1,j0:j1])
+                hj_ipip2[:,k0:k1] += contract('xp,p->xp', tmp, rhoj[k0:k1])
             if with_k:
                 hk_ipip2[:,k0:k1] += contract('xpji,pji->xp', int3c_blk, rhok_tmp)
-            
+
         auxslices = intopt.auxmol.aoslice_by_atom()
         aoslices = intopt.mol.aoslice_by_atom()
         ao2atom = int3c2e.get_ao2atom(intopt, aoslices)
         aux2atom = int3c2e.get_aux2atom(intopt, auxslices)
 
-        hj_ipvip1 = hj_ipvip1.reshape([3,3,nao,nao])
-        tmp = contract('ia,xyij->ajxy', ao2atom, hj_ipvip1)
-        hj = 2.0 * contract('jb,ajxy->abxy', ao2atom, tmp)
+        hj = None
+        if with_j:
+            hj_ipvip1 = hj_ipvip1.reshape([3,3,nao,nao])
+            tmp = contract('ia,xyij->ajxy', ao2atom, hj_ipvip1)
+            hj = 2.0 * contract('jb,ajxy->abxy', ao2atom, tmp)
 
-        hj_ipip1 = hj_ipip1.reshape([3,3,nao])
-        tmp = contract('ia,xyi->axy', ao2atom, hj_ipip1)
-        hj[range(natm), range(natm)] += 2.0 * tmp
+            hj_ipip1 = hj_ipip1.reshape([3,3,nao])
+            tmp = contract('ia,xyi->axy', ao2atom, hj_ipip1)
+            hj[range(natm), range(natm)] += 2.0 * tmp
 
         hk = None
         if with_k:
@@ -363,15 +371,16 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo,
             hk_ipip1 = hk_ipip1.reshape([3,3,nao])
             tmp = contract('ia,xyi->axy', ao2atom, hk_ipip1)
             hk[range(natm), range(natm)] += tmp
-        
+
         if auxbasis_response > 0:
-            hj_ip1ip2 = hj_ip1ip2.reshape([3,3,nao,naux])
-            tmp = contract('ia,xyij->ajxy', ao2atom, hj_ip1ip2)
-            tmp = contract('jb,ajxy->abxy',aux2atom, tmp)
-            tmp = tmp + tmp.transpose([1,0,3,2])
-            hj += tmp
-            if auxbasis_response > 1:
+            if with_j:
+                hj_ip1ip2 = hj_ip1ip2.reshape([3,3,nao,naux])
+                tmp = contract('ia,xyij->ajxy', ao2atom, hj_ip1ip2)
+                tmp = contract('jb,ajxy->abxy',aux2atom, tmp)
+                tmp = tmp + tmp.transpose([1,0,3,2])
                 hj += tmp
+                if auxbasis_response > 1:
+                    hj += tmp
             if with_k:
                 hk_ip1ip2 = hk_ip1ip2.reshape([3,3,nao,naux])
                 tmp = contract('ia,xyij->ajxy', ao2atom, hk_ip1ip2)
@@ -380,11 +389,12 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo,
                 hk += tmp
                 if auxbasis_response > 1:
                     hk += tmp
-        
+
         if auxbasis_response > 1:
-            hj_ipip2 = hj_ipip2.reshape([3,3,naux])
-            tmp = contract('ia,xyi->axy', aux2atom, hj_ipip2)
-            hj[range(natm), range(natm)] += tmp
+            if with_j:
+                hj_ipip2 = hj_ipip2.reshape([3,3,naux])
+                tmp = contract('ia,xyi->axy', aux2atom, hj_ipip2)
+                hj[range(natm), range(natm)] += tmp
             if with_k:
                 hk_ipip2 = hk_ipip2.reshape([3,3,naux])
                 tmp = contract('ia,xyi->axy', aux2atom, hk_ipip2)
@@ -392,7 +402,7 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo,
         t0 = log.timer_debug1(f'int3c2e_ipip on Device {device_id}', *t0)
     return hj, hk
 
-def get_int3c2e_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, 
+def get_int3c2e_hjk(intopt, rhoj, rhok, dm0_tag, with_j=True, with_k=True,
                     omega=None, auxbasis_response=1):
     orbo = cupy.asarray(dm0_tag.occ_coeff, order='C')
     futures = []
@@ -402,26 +412,27 @@ def get_int3c2e_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True,
     task_list = []
     for device_id in range(_num_devices):
         task_list.append(tasks[device_id::_num_devices])
-    
+
     cupy.cuda.get_current_stream().synchronize()
     with ThreadPoolExecutor(max_workers=_num_devices) as executor:
         for device_id in range(_num_devices):
             future = executor.submit(
-                _int3c2e_ipip_tasks, intopt, task_list[device_id], 
-                rhoj, rhok, dm0_tag, orbo, with_k=with_k, 
-                device_id=device_id, omega=omega, 
+                _int3c2e_ipip_tasks, intopt, task_list[device_id],
+                rhoj, rhok, dm0_tag, orbo, with_j=with_j, with_k=with_k,
+                device_id=device_id, omega=omega,
                 auxbasis_response=auxbasis_response)
             futures.append(future)
-    
+
     hj_total = []
     hk_total = []
     for future in futures:
         hj, hk = future.result()
         hj_total.append(hj)
         hk_total.append(hk)
-        
+
     hj = hk = None
-    hj = reduce_to_device(hj_total, inplace=True)
+    if with_j:
+        hj = reduce_to_device(hj_total, inplace=True)
     if with_k:
         hk = reduce_to_device(hk_total, inplace=True)
     return hj, hk
diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py
index de6f1ebb..938b1384 100644
--- a/gpu4pyscf/df/hessian/rhf.py
+++ b/gpu4pyscf/df/hessian/rhf.py
@@ -70,8 +70,8 @@ def _hk_ip1_ip1(rhok1_Pko, dm0, mocc_2):
         rhok1_Pkl_kslice = rhok1_Pko_kslice = None
     return hk_ao_ao
 
-def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
-                      atmlst=None, max_memory=4000, verbose=None, with_k=True, omega=None):
+def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None,
+                      max_memory=None, verbose=None, with_j=True, with_k=True, omega=None):
     '''Partial derivative
     '''
     log = logger.new_logger(hessobj, verbose)
@@ -121,18 +121,24 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
 
     int2c_ip1 = cupy.asarray(int2c_ip1, order='C')
     int2c_ip1 = intopt.sort_orbitals(int2c_ip1, aux_axis=[1,2])
-
-    hj_ao_ao = cupy.zeros([nao,nao,3,3])
-    hk_ao_ao = cupy.zeros([nao,nao,3,3])
+    if with_j:
+        hj_ao_ao = cupy.zeros([nao,nao,3,3])
+    if with_k:
+        hk_ao_ao = cupy.zeros([nao,nao,3,3])
     if hessobj.auxbasis_response:
-        hj_ao_aux = cupy.zeros([nao,naux,3,3])
-        hk_ao_aux = cupy.zeros([nao,naux,3,3])
+        if with_j:
+            hj_ao_aux = cupy.zeros([nao,naux,3,3])
+        if with_k:
+            hk_ao_aux = cupy.zeros([nao,naux,3,3])
 
     #  int3c contributions
     wj, wk_P__ = int3c2e.get_int3c2e_jk(mol, auxmol, dm0_tag, omega=omega)
     t1 = log.timer_debug1('intermediate variables with int3c2e', *t1)
-    rhoj0_P = solve_j2c(wj)
-    rhok0_P__ = solve_j2c(wk_P__)
+    rhoj0_P = rhok0_P__ = None
+    if with_j:
+        rhoj0_P = solve_j2c(wj)
+    if with_k:
+        rhok0_P__ = solve_j2c(wk_P__)
     wj = wk_P__ = None
 
     # int3c_ip2 contributions
@@ -142,18 +148,19 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     #  int3c_ip1 contributions
     wj1_P, wk1_Pko = int3c2e.get_int3c2e_ip1_wjk(intopt, dm0_tag, omega=omega)
     #rhoj1_P = contract('pq,pix->qix', int2c_inv, wj1_P)
-    rhoj1_P = solve_j2c(wj1_P)
+    if with_j:
+        rhoj1_P = solve_j2c(wj1_P)
 
-    hj_ao_ao += 4.0*contract('pix,pjy->ijxy', rhoj1_P, wj1_P)   # (10|0)(0|0)(0|01)
-    wj1_P = None
-    if hessobj.auxbasis_response:
-        wj0_01 = contract('ypq,q->yp', int2c_ip1, rhoj0_P)
-        wj1_01 = contract('yqp,pix->iqxy', int2c_ip1, rhoj1_P)
-        hj_ao_aux += contract('pix,py->ipxy', rhoj1_P, wj_ip2)   # (10|0)(1|00)
-        hj_ao_aux -= contract('pix,yp->ipxy', rhoj1_P, wj0_01)   # (10|0)(1|0)(0|00)
-        hj_ao_aux -= contract('q,iqxy->iqxy', rhoj0_P, wj1_01)   # (10|0)(0|1)(0|00)
-        wj1_01 = None
-    rhoj1_P = None
+        hj_ao_ao += 4.0*contract('pix,pjy->ijxy', rhoj1_P, wj1_P)   # (10|0)(0|0)(0|01)
+        wj1_P = None
+        if hessobj.auxbasis_response:
+            wj0_01 = contract('ypq,q->yp', int2c_ip1, rhoj0_P)
+            wj1_01 = contract('yqp,pix->iqxy', int2c_ip1, rhoj1_P)
+            hj_ao_aux += contract('pix,py->ipxy', rhoj1_P, wj_ip2)   # (10|0)(1|00)
+            hj_ao_aux -= contract('pix,yp->ipxy', rhoj1_P, wj0_01)   # (10|0)(1|0)(0|00)
+            hj_ao_aux -= contract('q,iqxy->iqxy', rhoj0_P, wj1_01)   # (10|0)(0|1)(0|00)
+            wj1_01 = None
+        rhoj1_P = None
 
     if with_k:
         cupy.get_default_memory_pool().free_all_blocks()
@@ -203,13 +210,13 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
                 rhok1_Pko[:,i0:i1] = contract('qp,qiox->piox', cd_low, wk1_tmp).get()
             wk1_tmp = None
         cd_low = None
-        
+
         hk_ao_ao += _hk_ip1_ip1(rhok1_Pko, dm0, mocc_2)
     wk1_Pko = rhok1_Pko = None
     t1 = log.timer_debug1('intermediate variables with int3c2e_ip1', *t1)
 
     hj_ipip, hk_ipip = jk.get_int3c2e_hjk(intopt, rhoj0_P, rhok0_P__, dm0_tag,
-                                          with_k=with_k, omega=omega, 
+                                          with_j=with_j, with_k=with_k, omega=omega,
                                           auxbasis_response=hessobj.auxbasis_response)
     t1 = log.timer_debug1('intermediate variables with int3c2e_ipip', *t1)
 
@@ -222,10 +229,12 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
             int2c_ipip1 = auxmol.intor('int2c2e_ipip1', aosym='s1')
         int2c_ipip1 = cupy.asarray(int2c_ipip1, order='C')
         int2c_ipip1 = intopt.sort_orbitals(int2c_ipip1, aux_axis=[1,2])
-        rhoj2c_P = contract('xpq,q->xp', int2c_ipip1, rhoj0_P)
+
         # (00|0)(2|0)(0|00)
         # p,xp->px
-        hj_aux_diag = -(rhoj0_P*rhoj2c_P).T.reshape(-1,3,3)
+        if with_j:
+            rhoj2c_P = contract('xpq,q->xp', int2c_ipip1, rhoj0_P)
+            hj_aux_diag = -(rhoj0_P*rhoj2c_P).T.reshape(-1,3,3)
         if with_k:
             rho2c_0 = contract('pij,qji->pq', rhok0_P__, rhok0_P__)
             hk_aux_diag = -.5 * contract('pq,xpq->px', rho2c_0, int2c_ipip1).reshape(-1,3,3)
@@ -238,7 +247,8 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
             int2c_ip1ip2 = auxmol.intor('int2c2e_ip1ip2', aosym='s1')
         int2c_ip1ip2 = cupy.asarray(int2c_ip1ip2, order='C')
         int2c_ip1ip2 = intopt.sort_orbitals(int2c_ip1ip2, aux_axis=[1,2])
-        hj_aux_aux = -.5 * contract('p,xpq->pqx', rhoj0_P, int2c_ip1ip2*rhoj0_P).reshape(naux, naux,3,3)
+        if with_j:
+            hj_aux_aux = -.5 * contract('p,xpq->pqx', rhoj0_P, int2c_ip1ip2*rhoj0_P).reshape(naux, naux,3,3)
         if with_k:
             hk_aux_aux = -.5 * contract('xpq,pq->pqx', int2c_ip1ip2, rho2c_0).reshape(naux,naux,3,3)
         t1 = log.timer_debug1('intermediate variables with int2c_*', *t1)
@@ -249,23 +259,23 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     # aux-aux pair
     if hessobj.auxbasis_response > 1:
         int2c_inv = pinv(int2c, lindep=LINEAR_DEP_THR)
-        wj0_10 = contract('ypq,p->ypq', int2c_ip1, rhoj0_P)
         int2c_ip1_inv = contract('yqp,pr->yqr', int2c_ip1, int2c_inv)
-
-        rhoj0_10 = contract('p,xpq->xpq', rhoj0_P, int2c_ip1_inv)     # (1|0)(0|00)
-        hj_aux_aux += .5 * contract('xpr,yqr->pqxy', rhoj0_10, wj0_10)  # (00|0)(1|0), (0|1)(0|00)
-        hj_aux_aux +=      contract('xpq,yq->pqxy',  rhoj0_10, wj0_01)  # (00|0)(1|0), (1|0)(0|00)
-        rhoj0_10 = rhoj0_P = None
-
-        rhoj1 = contract('px,pq->xpq', wj_ip2, int2c_inv)             # (0|0)(1|00)
-        hj_aux_aux -=      contract('xpq,yq->pqxy',  rhoj1,    wj0_01)  # (00|1),      (1|0)(0|00)
-        hj_aux_aux += .5 * contract('xpq,qy->pqxy',  rhoj1,    wj_ip2)  # (00|1),      (1|00)
-        hj_aux_aux -=      contract('xpr,yqr->pqxy', rhoj1,    wj0_10)  # (00|1),      (0|1)(0|00)
-        wj0_10 = rhoj1 = wj_ip2 = None
-
-        rhoj0_01 = contract('xp,pq->xpq', wj0_01, int2c_inv)          # (0|1)(0|00)
-        hj_aux_aux += .5 * contract('xpq,yq->pqxy',  rhoj0_01, wj0_01)  # (00|0)(0|1), (1|0)(0|00)
-        wj0_01 = rhoj0_01 = None
+        if with_j:
+            wj0_10 = contract('ypq,p->ypq', int2c_ip1, rhoj0_P)
+            rhoj0_10 = contract('p,xpq->xpq', rhoj0_P, int2c_ip1_inv)     # (1|0)(0|00)
+            hj_aux_aux += .5 * contract('xpr,yqr->pqxy', rhoj0_10, wj0_10)  # (00|0)(1|0), (0|1)(0|00)
+            hj_aux_aux +=      contract('xpq,yq->pqxy',  rhoj0_10, wj0_01)  # (00|0)(1|0), (1|0)(0|00)
+            rhoj0_10 = rhoj0_P = None
+
+            rhoj1 = contract('px,pq->xpq', wj_ip2, int2c_inv)             # (0|0)(1|00)
+            hj_aux_aux -=      contract('xpq,yq->pqxy',  rhoj1,    wj0_01)  # (00|1),      (1|0)(0|00)
+            hj_aux_aux += .5 * contract('xpq,qy->pqxy',  rhoj1,    wj_ip2)  # (00|1),      (1|00)
+            hj_aux_aux -=      contract('xpr,yqr->pqxy', rhoj1,    wj0_10)  # (00|1),      (0|1)(0|00)
+            wj0_10 = rhoj1 = wj_ip2 = None
+
+            rhoj0_01 = contract('xp,pq->xpq', wj0_01, int2c_inv)          # (0|1)(0|00)
+            hj_aux_aux += .5 * contract('xpq,yq->pqxy',  rhoj0_01, wj0_01)  # (00|0)(0|1), (1|0)(0|00)
+            wj0_01 = rhoj0_01 = None
 
         if with_k:
             rho2c_10 = contract('rijx,qij->rqx', wk_ip2_P__, rhok0_P__)
@@ -296,11 +306,12 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     t1 = log.timer_debug1('contract int2c_*', *t1)
 
     dm0 = intopt.unsort_orbitals(dm0, axis=[0,1])
-    hj_ao_ao = intopt.unsort_orbitals(hj_ao_ao, axis=[0,1])
-    if hessobj.auxbasis_response:
-        hj_ao_aux = intopt.unsort_orbitals(hj_ao_aux, axis=[0], aux_axis=[1])
-    if hessobj.auxbasis_response > 1:
-        hj_aux_aux = intopt.unsort_orbitals(hj_aux_aux, aux_axis=[0,1])
+    if with_j:
+        hj_ao_ao = intopt.unsort_orbitals(hj_ao_ao, axis=[0,1])
+        if hessobj.auxbasis_response:
+            hj_ao_aux = intopt.unsort_orbitals(hj_ao_aux, axis=[0], aux_axis=[1])
+        if hessobj.auxbasis_response > 1:
+            hj_aux_aux = intopt.unsort_orbitals(hj_aux_aux, aux_axis=[0,1])
     if with_k:
         hk_ao_ao = intopt.unsort_orbitals(hk_ao_ao, axis=[0,1])
         if hessobj.auxbasis_response:
@@ -334,8 +345,9 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
         e1[i0,i0] -= cupy.sum(h1aa[p0:p1], axis=0)
         for j0, ja in enumerate(atmlst[:i0+1]):
             q0, q1 = aoslices[ja][2:]
-            ej[i0,j0] += cupy.sum(hj_ao_ao[p0:p1,q0:q1], axis=[0,1])
             e1[i0,j0] -= cupy.sum(h1ab[p0:p1,q0:q1], axis=[0,1])
+            if with_j:
+                ej[i0,j0] += cupy.sum(hj_ao_ao[p0:p1,q0:q1], axis=[0,1])
             if with_k:
                 ek[i0,j0] += cupy.sum(hk_ao_ao[p0:p1,q0:q1], axis=[0,1])
             e1[i0,j0] += de_hcore(ia, ja)
@@ -344,13 +356,14 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
         #
         if hessobj.auxbasis_response:
             for j0, (q0, q1) in enumerate(auxslices[:,2:]):
-                _ej = cupy.sum(hj_ao_aux[p0:p1,q0:q1], axis=[0,1])
-                if hessobj.auxbasis_response > 1:
-                    ej[i0,j0] += _ej * 2
-                    ej[j0,i0] += _ej.T * 2
-                else:
-                    ej[i0,j0] += _ej
-                    ej[j0,i0] += _ej.T
+                if with_j:
+                    _ej = cupy.sum(hj_ao_aux[p0:p1,q0:q1], axis=[0,1])
+                    if hessobj.auxbasis_response > 1:
+                        ej[i0,j0] += _ej * 2
+                        ej[j0,i0] += _ej.T * 2
+                    else:
+                        ej[i0,j0] += _ej
+                        ej[j0,i0] += _ej.T
                 if with_k:
                     _ek = cupy.sum(hk_ao_aux[p0:p1,q0:q1], axis=[0,1])
                     if hessobj.auxbasis_response > 1:
@@ -365,9 +378,10 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
         if hessobj.auxbasis_response > 1:
             shl0, shl1, p0, p1 = auxslices[ia]
             for j0, (q0, q1) in enumerate(auxslices[:,2:]):
-                _ej = cupy.sum(hj_aux_aux[p0:p1,q0:q1], axis=[0,1])
-                ej[i0,j0] += _ej
-                ej[j0,i0] += _ej.T
+                if with_j:
+                    _ej = cupy.sum(hj_aux_aux[p0:p1,q0:q1], axis=[0,1])
+                    ej[i0,j0] += _ej
+                    ej[j0,i0] += _ej.T
                 if with_k:
                     _ek = cupy.sum(hk_aux_aux[p0:p1,q0:q1], axis=[0,1])
                     ek[i0,j0] += _ek * .5
@@ -375,22 +389,24 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     for i0, ia in enumerate(atmlst):
         for j0 in range(i0):
             e1[j0,i0] = e1[i0,j0].T
-            ej[j0,i0] = ej[i0,j0].T
+            if with_j:
+                ej[j0,i0] = ej[i0,j0].T
             if with_k:
                 ek[j0,i0] = ek[i0,j0].T
-        
+
     t1 = log.timer_debug1('hcore contribution', *t1)
-    
+
     aux2atom = int3c2e.get_aux2atom(intopt, auxslices)
-    
+
     natm = mol.natm
     idx = range(natm)
     # Diagonal contributions
     if hessobj.auxbasis_response > 1:
-        ej[idx, idx] += contract('ia,ixy->axy', aux2atom, hj_aux_diag)
+        if with_j:
+            ej[idx, idx] += contract('ia,ixy->axy', aux2atom, hj_aux_diag)
         if with_k:
             ek[idx, idx] += contract('ia,ixy->axy', aux2atom, hk_aux_diag)
-    
+
     log.timer('RHF partial hessian', *time0)
     return e1, ej, ek
 
@@ -408,7 +424,7 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
     return h1mo
 
 def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
-            verbose=None, with_k=True, omega=None):
+            verbose=None, with_j=True, with_k=True, omega=None):
     '''
     Derivatives of J, K matrices in MO bases
     '''
@@ -420,8 +436,7 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
     mol = hessobj.mol
     if atmlst is None:
         atmlst = range(mol.natm)
-    # FIXME
-    with_k = True
+
     mo_coeff = cupy.asarray(mo_coeff, order='C')
     mo_occ = cupy.asarray(mo_occ, order='C')
 
@@ -457,13 +472,16 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
     mo_coeff = intopt.sort_orbitals(mo_coeff, axis=[0])
     dm0 = intopt.sort_orbitals(dm0, axis=[0,1])
     dm0_tag = tag_array(dm0, occ_coeff=mocc)
-    
+
     int2c = intopt.sort_orbitals(int2c, aux_axis=[0,1])
     solve_j2c = _gen_metric_solver(int2c)
-    wj, wk_Pl_ = int3c2e.get_int3c2e_wjk(mol, auxmol, dm0_tag, omega=omega)
-    rhoj0 = solve_j2c(wj)
+    wj, wk_Pl_ = int3c2e.get_int3c2e_wjk(mol, auxmol, dm0_tag,
+                                         with_j=with_j, with_k=True, omega=omega)
+    rhoj0 = None
+    if with_j:
+        rhoj0 = solve_j2c(wj)
+        wj = None
 
-    wj = None
     if isinstance(wk_Pl_, cupy.ndarray):
         rhok0_Pl_ = solve_j2c(wk_Pl_)
     else:
@@ -472,8 +490,10 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
             wk_tmp = cupy.asarray(wk_Pl_[:,p0:p1])
             rhok0_Pl_[:,p0:p1] = solve_j2c(wk_tmp).get()
         wk_tmp = None
-    wk_Pl_ = solve_j2c = None
+    wk_Pl_ = None
+    solve_j2c = None
     t0 = log.timer_debug1('Fock matrix due to int3c2e', *t0)
+    vj1_int3c = vk1_int3c = None
 
     # --------------------------
     #  int3c_ip2 contribution
@@ -481,8 +501,8 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
     cupy.get_default_memory_pool().free_all_blocks()
     if hessobj.auxbasis_response:
         fn = int3c2e.get_int3c2e_ip2_vjk
-        vj1_int3c_ip2, vk1_int3c_ip2 = fn(intopt, rhoj0, rhok0_Pl_, dm0_tag, auxslices, omega=omega)
-        vk1_int3c_ip2 *= 2.0
+        vj1_int3c, vk1_int3c = fn(intopt, rhoj0, rhok0_Pl_, dm0_tag, auxslices,
+                                  with_j=with_j, with_k=with_k, omega=omega)
         # Responses due to int2c2e_ip1
         if omega and omega > 1e-10:
             with auxmol.with_range_coulomb(omega):
@@ -492,18 +512,19 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
         int2c_ip1 = cupy.asarray(int2c_ip1, order='C')
         int2c_ip1 = intopt.sort_orbitals(int2c_ip1, aux_axis=[1,2])
 
-        # Generate rhok0_P__
-        if isinstance(rhok0_Pl_, cupy.ndarray):
-            rhok0_P__ = contract('pio,ir->pro', rhok0_Pl_, mocc)
-        else:
-            rhok0_P__ = cupy.empty([naux,nocc,nocc])
-            for p0, p1 in lib.prange(0,naux,64):
-                rhok0_Pl_tmp = cupy.asarray(rhok0_Pl_[p0:p1])
-                rhok0_P__[p0:p1] = contract('pio,ir->pro', rhok0_Pl_tmp, mocc)
-            rhok0_Pl_tmp = None
-
-        wj0_10 = contract('xpq,q->xp', int2c_ip1, rhoj0)
-        wk0_10_P__ = contract('xqp,pro->xqro', int2c_ip1, rhok0_P__)
+        if with_j:
+            wj0_10 = contract('xpq,q->xp', int2c_ip1, rhoj0)
+        if with_k:
+            # Generate rhok0_P__
+            if isinstance(rhok0_Pl_, cupy.ndarray):
+                rhok0_P__ = contract('pio,ir->pro', rhok0_Pl_, mocc)
+            else:
+                rhok0_P__ = cupy.empty([naux,nocc,nocc])
+                for p0, p1 in lib.prange(0,naux,64):
+                    rhok0_Pl_tmp = cupy.asarray(rhok0_Pl_[p0:p1])
+                    rhok0_P__[p0:p1] = contract('pio,ir->pro', rhok0_Pl_tmp, mocc)
+                rhok0_Pl_tmp = None
+            wk0_10_P__ = contract('xqp,pro->xqro', int2c_ip1, rhok0_P__)
 
         aux2atom = int3c2e.get_aux2atom(intopt, auxslices)
         mem_avail = get_avail_mem()
@@ -514,24 +535,21 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
 
         for p0, p1 in lib.prange(0,nao,blksize):
             rhok_tmp = cupy.asarray(rhok0_Pl_[:,p0:p1])
-            vj1_tmp = contract('pio,xp->xpio', rhok_tmp, wj0_10)
-
             wk0_10_Pl_ = contract('xqp,pio->xqio', int2c_ip1, rhok_tmp)
-            vj1_tmp += contract('xpio,p->xpio', wk0_10_Pl_, rhoj0)
-            vj1_int3c_ip2[:,:,p0:p1] += contract('xpio,pa->axio', vj1_tmp, aux2atom)
-            vj1_tmp = None
+            if with_j:
+                vj1_tmp = contract('pio,xp->xpio', rhok_tmp, wj0_10)
+                vj1_tmp += contract('xpio,p->xpio', wk0_10_Pl_, rhoj0)
+                vj1_int3c[:,:,p0:p1] += contract('xpio,pa->axio', vj1_tmp, aux2atom)
+                vj1_tmp = None
             if with_k:
                 vk1_tmp = contract('xpio,pro->xpir', wk0_10_Pl_, rhok0_P__)
                 vk1_tmp += contract('xpro,pir->xpio', wk0_10_P__, rhok_tmp)
                 # 2.0 due to spin
-                vk1_int3c_ip2[:,:,p0:p1] += 2.0*contract('xpio,pa->axio', vk1_tmp, aux2atom)
+                vk1_int3c[:,:,p0:p1] += contract('xpio,pa->axio', vk1_tmp, aux2atom)
                 vk1_tmp = None
             wk0_10_Pl_ = rhok_tmp = None
         wj0_10 = wk0_10_P__ = rhok0_P__ = int2c_ip1 = None
         aux2atom = None
-
-        vj1_int3c_ip2 = contract('nxiq,ip->nxpq', vj1_int3c_ip2, mo_coeff)
-        vk1_int3c_ip2 = contract('nxiq,ip->nxpq', vk1_int3c_ip2, mo_coeff)
         t0 = log.timer_debug1('Fock matrix due to int3c2e_ip2', *t0)
 
     # -----------------------------
@@ -539,17 +557,31 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
     # ------------------------------
     cupy.get_default_memory_pool().free_all_blocks()
     fn = int3c2e.get_int3c2e_ip1_vjk
-    vj1_buf, vk1_buf, vj1_ao, vk1_ao = fn(intopt, rhoj0, rhok0_Pl_, dm0_tag, aoslices, omega=omega)
-    rhoj0 = rhok0_Pl_ = None
-    vk1_ao *= 2.0
-    vk1_buf *= 2.0
-    
-    vj1_buf = intopt.unsort_orbitals(vj1_buf, axis=[1,2])
-    vk1_buf = intopt.unsort_orbitals(vk1_buf, axis=[1,2])
-
-    vj1_int3c_ip1 = -contract('nxiq,ip->nxpq', vj1_ao, mo_coeff)
-    vk1_int3c_ip1 = -contract('nxiq,ip->nxpq', vk1_ao, mo_coeff)
-    vj1_ao = vk1_ao = None
+    vj1_buf, vk1_buf, vj1_ao, vk1_ao = fn(intopt, rhoj0, rhok0_Pl_, dm0_tag, aoslices,
+                                          omega=omega, with_j=with_j, with_k=with_k)
+    rhoj0 = rhok0_Pl_ = dm0_tag = None
+    if with_j:
+        vj1_buf = intopt.unsort_orbitals(vj1_buf, axis=[1,2])
+        if vj1_int3c is None:
+            vj1_int3c = -vj1_ao
+        else:
+            vj1_int3c -= vj1_ao
+        vj1_ao = None
+        # NOTE: vj1_int3c and vk1_int3c are in [natm,3,nao,nocc]
+        #       axis=2 in AO, axis=3 in MO
+        #       convert axis=2 into MO now
+        vj1_int3c = contract('nxiq,ip->nxpq', vj1_int3c, mo_coeff)
+
+    if with_k:
+        vk1_buf = intopt.unsort_orbitals(vk1_buf, axis=[1,2])
+        if vk1_int3c is None:
+            vk1_int3c = -vk1_ao
+        else:
+            vk1_int3c -= vk1_ao
+        vk1_ao = None
+        # * 2.0 due to the contraction with mocc
+        vk1_buf *= 2.0
+        vk1_int3c = 2.0 * contract('nxiq,ip->nxpq', vk1_int3c, mo_coeff)
     t0 = log.timer_debug1('Fock matrix due to int3c2e_ip1', *t0)
 
     mocc = intopt.unsort_orbitals(mocc, axis=[0])
@@ -561,37 +593,29 @@ def _ao2mo(mat):
         tmp = contract('xij,jo->xio', mat, mocc)
         return contract('xik,ip->xpk', tmp, mo_coeff)
 
-    vj1_int3c = vj1_int3c_ip1 + vj1_int3c_ip2
-    vj1_int3c_ip1 = vj1_int3c_ip2 = None
-    if with_k:
-        vk1_int3c = vk1_int3c_ip1 + vk1_int3c_ip2
-        vk1_int3c_ip1 = vk1_int3c_ip2 = None
-
     cupy.get_default_memory_pool().free_all_blocks()
     for i0, ia in enumerate(atmlst):
         shl0, shl1, p0, p1 = aoslices[ia]
-        vj1_ao = cupy.zeros([3,nao,nao])
-        vk1_ao = cupy.zeros([3,nao,nao])
-
-        vj1_ao[:,p0:p1,:] -= vj1_buf[:,p0:p1,:]
-        vj1_ao[:,:,p0:p1] -= vj1_buf[:,p0:p1,:].transpose(0,2,1)
+        if with_j:
+            vj1_ao = cupy.zeros([3,nao,nao])
+            vj1_ao[:,p0:p1,:] -= vj1_buf[:,p0:p1,:]
+            vj1_ao[:,:,p0:p1] -= vj1_buf[:,p0:p1,:].transpose(0,2,1)
+            vj1_int3c[ia] += _ao2mo(vj1_ao)
         if with_k:
+            vk1_ao = cupy.zeros([3,nao,nao])
             vk1_ao[:,p0:p1,:] -= vk1_buf[:,p0:p1,:]
             vk1_ao[:,:,p0:p1] -= vk1_buf[:,p0:p1,:].transpose(0,2,1)
-
-        vj1_int3c[ia] += _ao2mo(vj1_ao)
-        if with_k:
             vk1_int3c[ia] += _ao2mo(vk1_ao)
     return vj1_int3c, vk1_int3c
 
-def _get_jk_mo(hessobj, mol, dms, mo_coeff, mocc, 
+def _get_jk_mo(hessobj, mol, dms, mo_coeff, mocc,
            hermi=1, with_j=True, with_k=True, omega=None):
     mf = hessobj.base
     dfobj = mf.with_df
     if omega is None:
-        return jk.get_jk(dfobj, dms, mo_coeff, mocc, 
+        return jk.get_jk(dfobj, dms, mo_coeff, mocc,
                          hermi=hermi, with_j=with_j, with_k=with_k)
-    
+
     # A temporary treatment for RSH-DF integrals
     key = '%.6f' % omega
     if key in dfobj._rsh_df:
diff --git a/gpu4pyscf/df/hessian/rks.py b/gpu4pyscf/df/hessian/rks.py
index 31c1d506..e0d5cd90 100644
--- a/gpu4pyscf/df/hessian/rks.py
+++ b/gpu4pyscf/df/hessian/rks.py
@@ -27,6 +27,7 @@
 from gpu4pyscf.hessian import rhf as rhf_hess
 from gpu4pyscf.hessian import rks as rks_hess
 from gpu4pyscf.df.hessian import rhf as df_rhf_hess
+from gpu4pyscf.df.hessian.rhf import _get_jk_ip, _partial_hess_ejk
 from gpu4pyscf.lib import logger
 from gpu4pyscf.lib.cupy_helper import contract
 
@@ -50,17 +51,17 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
 
     omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
     with_k = mf._numint.libxc.is_hybrid_xc(mf.xc)
-    de2, ej, ek = df_rhf_hess._partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ,
-                                                atmlst, max_memory, verbose,
-                                                with_k=with_k)
+    de2, ej, ek = _partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ,
+                                    atmlst, max_memory, verbose,
+                                    with_j=True, with_k=with_k)
     de2 += ej  # (A,B,dR_A,dR_B)
     if with_k:
         de2 -= hyb * ek
 
     if abs(omega) > 1e-10 and abs(alpha-hyb) > 1e-10:
-        ek_lr = df_rhf_hess._partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ,
-                                            atmlst, max_memory, verbose,
-                                            True, omega=omega)[2]
+        ek_lr = _partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ,
+                                  atmlst, max_memory, verbose,
+                                  with_j=False, with_k=True, omega=omega)[2]
         de2 -= (alpha - hyb) * ek_lr
 
     max_memory = None
@@ -93,18 +94,18 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
     omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
     mem_now = lib.current_memory()[0]
     max_memory = max(2000, mf.max_memory*.9-mem_now)
-    
+
     with_k = ni.libxc.is_hybrid_xc(mf.xc)
-    vj1, vk1 = df_rhf_hess._get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile,
-                                                atmlst, verbose, with_k)
+    vj1, vk1 = _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile,
+                          atmlst, verbose, with_j=True, with_k=with_k)
     h1mo = vj1
     if with_k:
         h1mo -= .5 * hyb * vk1
     vj1 = vk1 = None
-    
+
     if abs(omega) > 1e-10 and abs(alpha-hyb) > 1e-10:
-        _, vk1_lr = df_rhf_hess._get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile,
-                                             atmlst, verbose, True, omega=omega)
+        _, vk1_lr = _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile, atmlst,
+                               verbose, with_j=False, with_k=True, omega=omega)
         h1mo -= .5 * (alpha - hyb) * vk1_lr
         vk1_lr = None
 
diff --git a/gpu4pyscf/df/hessian/uhf.py b/gpu4pyscf/df/hessian/uhf.py
index acd67380..d6f26e5d 100644
--- a/gpu4pyscf/df/hessian/uhf.py
+++ b/gpu4pyscf/df/hessian/uhf.py
@@ -50,11 +50,13 @@
 def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
                       atmlst=None, max_memory=4000, verbose=None):
     e1, ej, ek = _partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ,
-                                   atmlst, max_memory, verbose, True)
+                                   atmlst, max_memory, verbose,
+                                   with_j=True, with_k=True)
     return e1 + ej - ek
 
 def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
-                      atmlst=None, max_memory=4000, verbose=None, with_k=True, omega=None):
+                      atmlst=None, max_memory=4000, verbose=None,
+                      with_j=True, with_k=True, omega=None):
     '''Partial derivative
     '''
     log = logger.new_logger(hessobj, verbose)
@@ -113,43 +115,53 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     int2c_ip1 = cupy.asarray(int2c_ip1, order='C')
     int2c_ip1 = intopt.sort_orbitals(int2c_ip1, aux_axis=[1,2])
 
-    hj_ao_ao = cupy.zeros([nao,nao,3,3])
-    hk_ao_ao = cupy.zeros([nao,nao,3,3])
+    if with_j:
+        hj_ao_ao = cupy.zeros([nao,nao,3,3])
+    if with_k:
+        hk_ao_ao = cupy.zeros([nao,nao,3,3])
     if hessobj.auxbasis_response:
-        hj_ao_aux = cupy.zeros([nao,naux,3,3])
-        hk_ao_aux = cupy.zeros([nao,naux,3,3])
+        if with_j:
+            hj_ao_aux = cupy.zeros([nao,naux,3,3])
+        if with_k:
+            hk_ao_aux = cupy.zeros([nao,naux,3,3])
 
     #  int3c contributions
     wja, wka_P__ = int3c2e.get_int3c2e_jk(mol, auxmol, dm0a_tag, omega=omega)
     wjb, wkb_P__ = int3c2e.get_int3c2e_jk(mol, auxmol, dm0b_tag, omega=omega)
-    rhoj0_P = solve_j2c(wja + wjb)
-    rhok0a_P__ = solve_j2c(wka_P__)
-    rhok0b_P__ = solve_j2c(wkb_P__)
+    rhoj0_P = rhok0a_P__ = rhok0b_P__ = None
+    if with_j:
+        rhoj0_P = solve_j2c(wja + wjb)
+    if with_k:
+        rhok0a_P__ = solve_j2c(wka_P__)
+        rhok0b_P__ = solve_j2c(wkb_P__)
     wja = wjb = wka_P__ = wkb_P__ = None
     t1 = log.timer_debug1('intermediate variables with int3c2e', *t1)
 
     # int3c_ip2 contributions
     wja_ip2, wka_ip2_P__ = int3c2e.get_int3c2e_ip2_wjk(intopt, dm0a_tag, omega=omega)
     wjb_ip2, wkb_ip2_P__ = int3c2e.get_int3c2e_ip2_wjk(intopt, dm0b_tag, omega=omega)
-    wj_ip2 = wja_ip2 + wjb_ip2
+    wj_ip2 = None
+    if with_j:
+        wj_ip2 = wja_ip2 + wjb_ip2
     t1 = log.timer_debug1('interdeidate variables with int3c2e_ip2', *t1)
 
     #  int3c_ip1 contributions
     wj1a_P, wk1a_Pko = int3c2e.get_int3c2e_ip1_wjk(intopt, dm0a_tag, omega=omega)
     wj1b_P, wk1b_Pko = int3c2e.get_int3c2e_ip1_wjk(intopt, dm0b_tag, omega=omega)
-    wj1_P = wj1a_P + wj1b_P
-    rhoj1_P = solve_j2c(wj1_P)
-
-    hj_ao_ao += 4.0*contract('pix,pjy->ijxy', rhoj1_P, wj1_P)   # (10|0)(0|0)(0|01)
     wj1_P = None
-    if hessobj.auxbasis_response:
-        wj0_01 = contract('ypq,q->yp', int2c_ip1, rhoj0_P)
-        wj1_01 = contract('yqp,pix->iqxy', int2c_ip1, rhoj1_P)
-        hj_ao_aux += contract('pix,py->ipxy', rhoj1_P, wj_ip2)   # (10|0)(1|00)
-        hj_ao_aux -= contract('pix,yp->ipxy', rhoj1_P, wj0_01)   # (10|0)(1|0)(0|00)
-        hj_ao_aux -= contract('q,iqxy->iqxy', rhoj0_P, wj1_01)   # (10|0)(0|1)(0|00)
-        wj1_01 = None
-    rhoj1_P = None
+    if with_j:
+        wj1_P = wj1a_P + wj1b_P
+        rhoj1_P = solve_j2c(wj1_P)
+        hj_ao_ao += 4.0*contract('pix,pjy->ijxy', rhoj1_P, wj1_P)   # (10|0)(0|0)(0|01)
+        wj1_P = None
+        if hessobj.auxbasis_response:
+            wj0_01 = contract('ypq,q->yp', int2c_ip1, rhoj0_P)
+            wj1_01 = contract('yqp,pix->iqxy', int2c_ip1, rhoj1_P)
+            hj_ao_aux += contract('pix,py->ipxy', rhoj1_P, wj_ip2)   # (10|0)(1|00)
+            hj_ao_aux -= contract('pix,yp->ipxy', rhoj1_P, wj0_01)   # (10|0)(1|0)(0|00)
+            hj_ao_aux -= contract('q,iqxy->iqxy', rhoj0_P, wj1_01)   # (10|0)(0|1)(0|00)
+            wj1_01 = None
+        rhoj1_P = None
 
     if with_k:
         mem_avail = get_avail_mem()
@@ -160,7 +172,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
         log.debug(f'GPU Memory {mem_avail/GB:.1f} GB available, block size {blksize}')
         if blksize < ALIGNED:
             raise RuntimeError('Not enough memory for intermediate variables')
-    
+
         for i0, i1 in lib.prange(0,nao,blksize):
             wk1a_Pko_islice = cupy.asarray(wk1a_Pko[:,i0:i1])
             wk1b_Pko_islice = cupy.asarray(wk1b_Pko[:,i0:i1])
@@ -216,12 +228,13 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
 
     cupy.get_default_memory_pool().free_all_blocks()
     hja_ipip, hka_ipip = jk.get_int3c2e_hjk(intopt, rhoj0_P, rhok0a_P__, dm0a_tag,
-                                          with_k=with_k, omega=omega, 
+                                          with_j=with_j, with_k=with_k, omega=omega,
                                           auxbasis_response=hessobj.auxbasis_response)
     hjb_ipip, hkb_ipip = jk.get_int3c2e_hjk(intopt, rhoj0_P, rhok0b_P__, dm0b_tag,
-                                          with_k=with_k, omega=omega, 
+                                          with_j=with_j, with_k=with_k, omega=omega,
                                           auxbasis_response=hessobj.auxbasis_response)
-    hj_ipip = hja_ipip + hjb_ipip
+    if with_j:
+        hj_ipip = hja_ipip + hjb_ipip
     if with_k:
         hk_ipip = 2.0*(hka_ipip + hkb_ipip)
     t1 = log.timer_debug1('intermediate variables with int3c2e_ipip', *t1)
@@ -235,10 +248,11 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
             int2c_ipip1 = auxmol.intor('int2c2e_ipip1', aosym='s1')
         int2c_ipip1 = cupy.asarray(int2c_ipip1, order='C')
         int2c_ipip1 = intopt.sort_orbitals(int2c_ipip1, aux_axis=[1,2])
-        rhoj2c_P = contract('xpq,q->xp', int2c_ipip1, rhoj0_P)
         # (00|0)(2|0)(0|00)
-        # p,xp->px
-        hj_aux_diag = -(rhoj0_P*rhoj2c_P).T.reshape(-1,3,3)
+        if with_j:
+            rhoj2c_P = contract('xpq,q->xp', int2c_ipip1, rhoj0_P)
+            # p,xp->px
+            hj_aux_diag = -(rhoj0_P*rhoj2c_P).T.reshape(-1,3,3)
         if with_k:
             rho2c_0 = contract('pij,qji->pq', rhok0a_P__, rhok0a_P__)
             rho2c_0+= contract('pij,qji->pq', rhok0b_P__, rhok0b_P__)
@@ -252,7 +266,8 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
             int2c_ip1ip2 = auxmol.intor('int2c2e_ip1ip2', aosym='s1')
         int2c_ip1ip2 = cupy.asarray(int2c_ip1ip2, order='C')
         int2c_ip1ip2 = intopt.sort_orbitals(int2c_ip1ip2, aux_axis=[1,2])
-        hj_aux_aux = -.5 * contract('p,xpq->pqx', rhoj0_P, int2c_ip1ip2*rhoj0_P).reshape(naux, naux,3,3)
+        if with_j:
+            hj_aux_aux = -.5 * contract('p,xpq->pqx', rhoj0_P, int2c_ip1ip2*rhoj0_P).reshape(naux, naux,3,3)
         if with_k:
             hk_aux_aux = -.5 * contract('xpq,pq->pqx', int2c_ip1ip2, rho2c_0).reshape(naux,naux,3,3)
         t1 = log.timer_debug1('intermediate variables with int2c_*', *t1)
@@ -262,23 +277,23 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     release_gpu_stack()
     # aux-aux pair
     if hessobj.auxbasis_response > 1:
-        wj0_10 = contract('ypq,p->ypq', int2c_ip1, rhoj0_P)
         int2c_ip1_inv = contract('yqp,pr->yqr', int2c_ip1, int2c_inv)
-
-        rhoj0_10 = contract('p,xpq->xpq', rhoj0_P, int2c_ip1_inv)     # (1|0)(0|00)
-        hj_aux_aux += .5 * contract('xpr,yqr->pqxy', rhoj0_10, wj0_10)  # (00|0)(1|0), (0|1)(0|00)
-        hj_aux_aux +=      contract('xpq,yq->pqxy',  rhoj0_10, wj0_01)  # (00|0)(1|0), (1|0)(0|00)
-        rhoj0_10 = rhoj0_P = None
-
-        rhoj1 = contract('px,pq->xpq', wj_ip2, int2c_inv)             # (0|0)(1|00)
-        hj_aux_aux -=      contract('xpq,yq->pqxy',  rhoj1,    wj0_01)  # (00|1),      (1|0)(0|00)
-        hj_aux_aux += .5 * contract('xpq,qy->pqxy',  rhoj1,    wj_ip2)  # (00|1),      (1|00)
-        hj_aux_aux -=      contract('xpr,yqr->pqxy', rhoj1,    wj0_10)  # (00|1),      (0|1)(0|00)
-        wj0_10 = rhoj1 = wj_ip2 = None
-
-        rhoj0_01 = contract('xp,pq->xpq', wj0_01, int2c_inv)          # (0|1)(0|00)
-        hj_aux_aux += .5 * contract('xpq,yq->pqxy',  rhoj0_01, wj0_01)  # (00|0)(0|1), (1|0)(0|00)
-        wj0_01 = rhoj0_01 = None
+        if with_j:
+            wj0_10 = contract('ypq,p->ypq', int2c_ip1, rhoj0_P)
+            rhoj0_10 = contract('p,xpq->xpq', rhoj0_P, int2c_ip1_inv)     # (1|0)(0|00)
+            hj_aux_aux += .5 * contract('xpr,yqr->pqxy', rhoj0_10, wj0_10)  # (00|0)(1|0), (0|1)(0|00)
+            hj_aux_aux +=      contract('xpq,yq->pqxy',  rhoj0_10, wj0_01)  # (00|0)(1|0), (1|0)(0|00)
+            rhoj0_10 = rhoj0_P = None
+
+            rhoj1 = contract('px,pq->xpq', wj_ip2, int2c_inv)             # (0|0)(1|00)
+            hj_aux_aux -=      contract('xpq,yq->pqxy',  rhoj1,    wj0_01)  # (00|1),      (1|0)(0|00)
+            hj_aux_aux += .5 * contract('xpq,qy->pqxy',  rhoj1,    wj_ip2)  # (00|1),      (1|00)
+            hj_aux_aux -=      contract('xpr,yqr->pqxy', rhoj1,    wj0_10)  # (00|1),      (0|1)(0|00)
+            wj0_10 = rhoj1 = wj_ip2 = None
+
+            rhoj0_01 = contract('xp,pq->xpq', wj0_01, int2c_inv)          # (0|1)(0|00)
+            hj_aux_aux += .5 * contract('xpq,yq->pqxy',  rhoj0_01, wj0_01)  # (00|0)(0|1), (1|0)(0|00)
+            wj0_01 = rhoj0_01 = None
 
         if with_k:
             rho2c_10 = contract('rijx,qij->rqx', wka_ip2_P__, rhok0a_P__)
@@ -310,13 +325,13 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
             hk_aux_aux -=      contract('pqx,yqp->pqxy', rho2c_10, int2c_ip1_inv)  # (00|1)(0|1)(0|00)
             rho2c_10= int2c_ip1_inv = None
     t1 = log.timer_debug1('contract int2c_*', *t1)
-
-    hj_ao_ao = intopt.unsort_orbitals(hj_ao_ao, axis=[0,1])
-    if hessobj.auxbasis_response:
-        hj_ao_aux = intopt.unsort_orbitals(hj_ao_aux, axis=[0], aux_axis=[1])
-    if hessobj.auxbasis_response > 1:
-        hj_aux_diag = intopt.unsort_orbitals(hj_aux_diag, aux_axis=[0])
-        hj_aux_aux = intopt.unsort_orbitals(hj_aux_aux, aux_axis=[0,1])
+    if with_j:
+        hj_ao_ao = intopt.unsort_orbitals(hj_ao_ao, axis=[0,1])
+        if hessobj.auxbasis_response:
+            hj_ao_aux = intopt.unsort_orbitals(hj_ao_aux, axis=[0], aux_axis=[1])
+        if hessobj.auxbasis_response > 1:
+            hj_aux_diag = intopt.unsort_orbitals(hj_aux_diag, aux_axis=[0])
+            hj_aux_aux = intopt.unsort_orbitals(hj_aux_aux, aux_axis=[0,1])
     if with_k:
         hk_ao_ao = intopt.unsort_orbitals(hk_ao_ao, axis=[0,1])
         if hessobj.auxbasis_response:
@@ -346,18 +361,20 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     # -----------------------------------------
     #        collecting all
     # -----------------------------------------
-    hk_ao_ao *= 2.0
     e1 = cupy.zeros([len(atmlst),len(atmlst),3,3])
-    ej = hj_ipip
-    ek = None
+    ej = ek = None
+    if with_j:
+        ej = hj_ipip
     if with_k:
+        hk_ao_ao *= 2.0
         ek = hk_ipip
     for i0, ia in enumerate(atmlst):
         shl0, shl1, p0, p1 = aoslices[ia]
         e1[i0,i0] -= cupy.sum(h1aa[p0:p1], axis=0)
         for j0, ja in enumerate(atmlst[:i0+1]):
             q0, q1 = aoslices[ja][2:]
-            ej[i0,j0] += cupy.sum(hj_ao_ao[p0:p1,q0:q1], axis=[0,1])
+            if with_j:
+                ej[i0,j0] += cupy.sum(hj_ao_ao[p0:p1,q0:q1], axis=[0,1])
             e1[i0,j0] -= cupy.sum(h1ab[p0:p1,q0:q1], axis=[0,1])
             if with_k:
                 ek[i0,j0] += cupy.sum(hk_ao_ao[p0:p1,q0:q1], axis=[0,1])
@@ -368,13 +385,14 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
         #
         if hessobj.auxbasis_response:
             for j0, (q0, q1) in enumerate(auxslices[:,2:]):
-                _ej = cupy.sum(hj_ao_aux[p0:p1,q0:q1], axis=[0,1])
-                if hessobj.auxbasis_response > 1:
-                    ej[i0,j0] += _ej * 2
-                    ej[j0,i0] += _ej.T * 2
-                else:
-                    ej[i0,j0] += _ej
-                    ej[j0,i0] += _ej.T
+                if with_j:
+                    _ej = cupy.sum(hj_ao_aux[p0:p1,q0:q1], axis=[0,1])
+                    if hessobj.auxbasis_response > 1:
+                        ej[i0,j0] += _ej * 2
+                        ej[j0,i0] += _ej.T * 2
+                    else:
+                        ej[i0,j0] += _ej
+                        ej[j0,i0] += _ej.T
                 if with_k:
                     _ek = cupy.sum(hk_ao_aux[p0:p1,q0:q1], axis=[0,1])
                     if hessobj.auxbasis_response > 1:
@@ -388,13 +406,15 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
         #
         if hessobj.auxbasis_response > 1:
             shl0, shl1, p0, p1 = auxslices[ia]
-            ej[i0,i0] += cupy.sum(hj_aux_diag[p0:p1], axis=0)
+            if with_j:
+                ej[i0,i0] += cupy.sum(hj_aux_diag[p0:p1], axis=0)
             if with_k:
                 ek[i0,i0] += cupy.sum(hk_aux_diag[p0:p1], axis=0)
             for j0, (q0, q1) in enumerate(auxslices[:,2:]):
-                _ej = cupy.sum(hj_aux_aux[p0:p1,q0:q1], axis=[0,1])
-                ej[i0,j0] += _ej
-                ej[j0,i0] += _ej.T
+                if with_j:
+                    _ej = cupy.sum(hj_aux_aux[p0:p1,q0:q1], axis=[0,1])
+                    ej[i0,j0] += _ej
+                    ej[j0,i0] += _ej.T
                 if with_k:
                     _ek = cupy.sum(hk_aux_aux[p0:p1,q0:q1], axis=[0,1])
                     ek[i0,j0] += _ek
@@ -402,7 +422,8 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     for i0, ia in enumerate(atmlst):
         for j0 in range(i0):
             e1[j0,i0] = e1[i0,j0].T
-            ej[j0,i0] = ej[i0,j0].T
+            if with_j:
+                ej[j0,i0] = ej[i0,j0].T
             if with_k:
                 ek[j0,i0] = ek[i0,j0].T
     t1 = log.timer_debug1('hcore contribution', *t1)
@@ -434,7 +455,7 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
     return (h1moa, h1mob)
 
 def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
-            verbose=None, with_k=True, omega=None):
+            verbose=None, with_j=True, with_k=True, omega=None):
     '''
     A generator to produce the derivatives of Hcore, J, K matrices in MO bases
     '''
@@ -443,8 +464,7 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
     mol = hessobj.mol
     if atmlst is None:
         atmlst = range(mol.natm)
-    # FIXME
-    with_k = True
+
     mo_coeff = cupy.asarray(mo_coeff, order='C')
     mo_occ = cupy.asarray(mo_occ, order='C')
 
@@ -469,12 +489,12 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
     int2c = cupy.asarray(int2c, order='C')
     # ======================= sorted AO begin ======================================
     intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e')
-    intopt.build(mf.direct_scf_tol, 
-                 diag_block_with_triu=True, 
-                 aosym=False, 
-                 group_size_aux=BLKSIZE, 
+    intopt.build(mf.direct_scf_tol,
+                 diag_block_with_triu=True,
+                 aosym=False,
+                 group_size_aux=BLKSIZE,
                  group_size=BLKSIZE)
-    
+
     mocca = intopt.sort_orbitals(mocca, axis=[0])
     moccb = intopt.sort_orbitals(moccb, axis=[0])
     mo_coeff = intopt.sort_orbitals(mo_coeff, axis=[1])
@@ -488,10 +508,12 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
 
     fn = int3c2e.get_int3c2e_wjk
     dm0_tag = tag_array(dm0, occ_coeff=mocca)
-    wj, wka_Pl_ = fn(mol, auxmol, dm0_tag, omega=omega)
+    wj, wka_Pl_ = fn(mol, auxmol, dm0_tag, with_j=with_j, with_k=with_k, omega=omega)
     dm0_tag = tag_array(dm0, occ_coeff=moccb)
-    wj, wkb_Pl_ = fn(mol, auxmol, dm0_tag, omega=omega)
-    rhoj0 = solve_j2c(wj)
+    wj, wkb_Pl_ = fn(mol, auxmol, dm0_tag, with_j=with_j, with_k=with_k, omega=omega)
+    rhoj0 = None
+    if with_j:
+        rhoj0 = solve_j2c(wj)
     wj = None
 
     if isinstance(wka_Pl_, cupy.ndarray):
@@ -512,27 +534,7 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
             rhok0b_Pl_[:,p0:p1] = solve_j2c(wk_tmp).get()
         wk_tmp = None
     wka_Pl_ = wkb_Pl_ = None
-
-    # -----------------------------
-    # int3c_ip1 contributions
-    # ------------------------------
-    cupy.get_default_memory_pool().free_all_blocks()
-    fn = int3c2e.get_int3c2e_ip1_vjk
-    dm0_tag = tag_array(dm0, occ_coeff=mocca)
-    vj1_buf, vk1a_buf, vj1a_ao, vk1a_ao = fn(intopt, rhoj0, rhok0a_Pl_, dm0_tag, aoslices, omega=omega)
-    dm0_tag = tag_array(dm0, occ_coeff=moccb)
-    vj1_buf, vk1b_buf, vj1b_ao, vk1b_ao = fn(intopt, rhoj0, rhok0b_Pl_, dm0_tag, aoslices, omega=omega)
-
-    vj1_buf = intopt.unsort_orbitals(vj1_buf, axis=[1,2])
-    vk1a_buf = intopt.unsort_orbitals(vk1a_buf, axis=[1,2])
-    vk1b_buf = intopt.unsort_orbitals(vk1b_buf, axis=[1,2])
-
-    vj1a_int3c = -contract('nxiq,ip->nxpq', vj1a_ao, mo_coeff[0])
-    vj1b_int3c = -contract('nxiq,ip->nxpq', vj1b_ao, mo_coeff[1])
-    vk1a_int3c = -contract('nxiq,ip->nxpq', vk1a_ao, mo_coeff[0])
-    vk1b_int3c = -contract('nxiq,ip->nxpq', vk1b_ao, mo_coeff[1])
-    vj1a_ao = vj1b_ao = vk1a_ao = vk1b_ao = None
-    t0 = log.timer_debug1('Fock matrix due to int3c2e_ip1', *t0)
+    vj1a_int3c = vj1b_int3c = vk1a_int3c = vk1b_int3c = None
 
     # --------------------------
     #  int3c_ip2 contribution
@@ -541,9 +543,11 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
     if hessobj.auxbasis_response:
         fn = int3c2e.get_int3c2e_ip2_vjk
         dm0_tag = tag_array(dm0, occ_coeff=mocca)
-        vj1a_int3c_ip2, vk1a_int3c_ip2 = fn(intopt, rhoj0, rhok0a_Pl_, dm0_tag, auxslices, omega=omega)
+        vj1a_int3c, vk1a_int3c = fn(intopt, rhoj0, rhok0a_Pl_, dm0_tag, auxslices,
+                                    with_j=with_j, with_k=with_k, omega=omega)
         dm0_tag = tag_array(dm0, occ_coeff=moccb)
-        vj1b_int3c_ip2, vk1b_int3c_ip2 = fn(intopt, rhoj0, rhok0b_Pl_, dm0_tag, auxslices, omega=omega)
+        vj1b_int3c, vk1b_int3c = fn(intopt, rhoj0, rhok0b_Pl_, dm0_tag, auxslices,
+                                    with_j=with_j, with_k=with_k, omega=omega)
 
         # Responses due to int2c2e_ip1
         if omega and omega > 1e-10:
@@ -553,34 +557,35 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
             int2c_ip1 = auxmol.intor('int2c2e_ip1', aosym='s1')
         int2c_ip1 = cupy.asarray(int2c_ip1, order='C')
         int2c_ip1 = intopt.sort_orbitals(int2c_ip1, aux_axis=[1,2])
-
-        # generate rhok0_P__
-        if isinstance(rhok0a_Pl_, cupy.ndarray):
-            rhok0a_P__ = contract('pio,ir->pro', rhok0a_Pl_, mocca)
-        else:
-            naux = auxmol.nao
-            nocc = mocca.shape[1]
-            rhok0a_P__ = cupy.empty([naux,nocc,nocc])
-            for p0, p1 in lib.prange(0,naux,64):
-                rhok0_Pl_tmp = cupy.asarray(rhok0a_Pl_[p0:p1])
-                rhok0a_P__[p0:p1] = contract('pio,ir->pro', rhok0_Pl_tmp, mocca)
-            rhok0_Pl_tmp = None
-
-        # generate rhok0_P__
-        if isinstance(rhok0b_Pl_, cupy.ndarray):
-            rhok0b_P__ = contract('pio,ir->pro', rhok0b_Pl_, moccb)
-        else:
-            naux = auxmol.nao
-            nocc = moccb.shape[1]
-            rhok0b_P__ = cupy.empty([naux,nocc,nocc])
-            for p0, p1 in lib.prange(0,naux,64):
-                rhok0_Pl_tmp = cupy.asarray(rhok0b_Pl_[p0:p1])
-                rhok0b_P__[p0:p1] = contract('pio,ir->pro', rhok0_Pl_tmp, moccb)
-            rhok0_Pl_tmp = None
-
-        wj0_10 = contract('xpq,q->xp', int2c_ip1, rhoj0)
-        wk0a_10_P__ = contract('xqp,pro->xqro', int2c_ip1, rhok0a_P__)
-        wk0b_10_P__ = contract('xqp,pro->xqro', int2c_ip1, rhok0b_P__)
+        if with_k:
+            # generate rhok0_P__
+            if isinstance(rhok0a_Pl_, cupy.ndarray):
+                rhok0a_P__ = contract('pio,ir->pro', rhok0a_Pl_, mocca)
+            else:
+                naux = auxmol.nao
+                nocc = mocca.shape[1]
+                rhok0a_P__ = cupy.empty([naux,nocc,nocc])
+                for p0, p1 in lib.prange(0,naux,64):
+                    rhok0_Pl_tmp = cupy.asarray(rhok0a_Pl_[p0:p1])
+                    rhok0a_P__[p0:p1] = contract('pio,ir->pro', rhok0_Pl_tmp, mocca)
+                rhok0_Pl_tmp = None
+
+            # generate rhok0_P__
+            if isinstance(rhok0b_Pl_, cupy.ndarray):
+                rhok0b_P__ = contract('pio,ir->pro', rhok0b_Pl_, moccb)
+            else:
+                naux = auxmol.nao
+                nocc = moccb.shape[1]
+                rhok0b_P__ = cupy.empty([naux,nocc,nocc])
+                for p0, p1 in lib.prange(0,naux,64):
+                    rhok0_Pl_tmp = cupy.asarray(rhok0b_Pl_[p0:p1])
+                    rhok0b_P__[p0:p1] = contract('pio,ir->pro', rhok0_Pl_tmp, moccb)
+                rhok0_Pl_tmp = None
+        if with_j:
+            wj0_10 = contract('xpq,q->xp', int2c_ip1, rhoj0)
+        if with_k:
+            wk0a_10_P__ = contract('xqp,pro->xqro', int2c_ip1, rhok0a_P__)
+            wk0b_10_P__ = contract('xqp,pro->xqro', int2c_ip1, rhok0b_P__)
 
         aux2atom = int3c2e.get_aux2atom(intopt, auxslices)
         mem_avail = get_avail_mem()
@@ -589,42 +594,74 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
         log.debug(f'GPU Memory {mem_avail/GB:.1f} GB available, block size {blksize}')
         if blksize < ALIGNED:
             raise RuntimeError('Not enough memory to compute int3c2e_ip2')
-        
-        for p0, p1 in lib.prange(0,nao,64):
+
+        for p0, p1 in lib.prange(0,nao,blksize):
             rhoka_tmp = cupy.asarray(rhok0a_Pl_[:,p0:p1])
             rhokb_tmp = cupy.asarray(rhok0b_Pl_[:,p0:p1])
-            vj1a_tmp = contract('pio,xp->xpio', rhoka_tmp, wj0_10)
-            vj1b_tmp = contract('pio,xp->xpio', rhokb_tmp, wj0_10)
-
             wk0a_10_Pl_ = contract('xqp,pio->xqio', int2c_ip1, rhoka_tmp)
             wk0b_10_Pl_ = contract('xqp,pio->xqio', int2c_ip1, rhokb_tmp)
-            vj1a_tmp += contract('xpio,p->xpio', wk0a_10_Pl_, rhoj0)
-            vj1b_tmp += contract('xpio,p->xpio', wk0b_10_Pl_, rhoj0)
-            vj1a_int3c_ip2[:,:,p0:p1] += contract('xpio,pa->axio', vj1a_tmp, aux2atom)
-            vj1b_int3c_ip2[:,:,p0:p1] += contract('xpio,pa->axio', vj1b_tmp, aux2atom)
-            vj1a_tmp = vj1b_tmp = None
+            if with_j:
+                vj1a_tmp = contract('pio,xp->xpio', rhoka_tmp, wj0_10)
+                vj1b_tmp = contract('pio,xp->xpio', rhokb_tmp, wj0_10)
+
+                vj1a_tmp += contract('xpio,p->xpio', wk0a_10_Pl_, rhoj0)
+                vj1b_tmp += contract('xpio,p->xpio', wk0b_10_Pl_, rhoj0)
+                vj1a_int3c[:,:,p0:p1] += contract('xpio,pa->axio', vj1a_tmp, aux2atom)
+                vj1b_int3c[:,:,p0:p1] += contract('xpio,pa->axio', vj1b_tmp, aux2atom)
+                vj1a_tmp = vj1b_tmp = None
             if with_k:
                 vk1a_tmp = contract('xpio,pro->xpir', wk0a_10_Pl_, rhok0a_P__)
                 vk1a_tmp += contract('xpro,pir->xpio', wk0a_10_P__, rhoka_tmp)
                 vk1b_tmp = contract('xpio,pro->xpir', wk0b_10_Pl_, rhok0b_P__)
                 vk1b_tmp += contract('xpro,pir->xpio', wk0b_10_P__, rhokb_tmp)
 
-                vk1a_int3c_ip2[:,:,p0:p1] += contract('xpio,pa->axio', vk1a_tmp, aux2atom)
-                vk1b_int3c_ip2[:,:,p0:p1] += contract('xpio,pa->axio', vk1b_tmp, aux2atom)
+                vk1a_int3c[:,:,p0:p1] += contract('xpio,pa->axio', vk1a_tmp, aux2atom)
+                vk1b_int3c[:,:,p0:p1] += contract('xpio,pa->axio', vk1b_tmp, aux2atom)
                 vk1a_tmp = vk1b_tmp = None
             wk0a_10_Pl_ = wk0b_10_Pl_ = rhoka_tmp = rhokb_tmp = None
         wj0_10 = wk0a_10_P__ = wk0b_10_P__ = rhok0a_P__ =rhok0b_P__ = int2c_ip1 = None
-        rhoj0 = rhok0a_Pl_ = rhok0b_Pl_ = None
-        aux2atom = None
 
-        vj1a_int3c += contract('nxiq,ip->nxpq', vj1a_int3c_ip2, mo_coeff[0])
-        vj1b_int3c += contract('nxiq,ip->nxpq', vj1b_int3c_ip2, mo_coeff[1])
-        if with_k:
-            vk1a_int3c += contract('nxiq,ip->nxpq', vk1a_int3c_ip2, mo_coeff[0])
-            vk1b_int3c += contract('nxiq,ip->nxpq', vk1b_int3c_ip2, mo_coeff[1])
-        vk1a_int3c_ip2 = vk1b_int3c_ip2 = None
+        aux2atom = None
         t0 = log.timer_debug1('Fock matrix due to int3c2e_ip2', *t0)
 
+    # -----------------------------
+    # int3c_ip1 contributions
+    # ------------------------------
+    cupy.get_default_memory_pool().free_all_blocks()
+    fn = int3c2e.get_int3c2e_ip1_vjk
+    dm0_tag = tag_array(dm0, occ_coeff=mocca)
+    vj1_buf, vk1a_buf, vj1a_ao, vk1a_ao = fn(intopt, rhoj0, rhok0a_Pl_, dm0_tag, aoslices,
+                                             with_j=with_j, with_k=with_k, omega=omega)
+    dm0_tag = tag_array(dm0, occ_coeff=moccb)
+    vj1_buf, vk1b_buf, vj1b_ao, vk1b_ao = fn(intopt, rhoj0, rhok0b_Pl_, dm0_tag, aoslices,
+                                             with_j=with_j, with_k=with_k, omega=omega)
+    rhoj0 = rhok0a_Pl_ = rhok0b_Pl_ = None
+
+    if with_j:
+        vj1_buf = intopt.unsort_orbitals(vj1_buf, axis=[1,2])
+        if not hessobj.auxbasis_response:
+            vj1a_int3c = -vj1a_ao
+            vj1b_int3c = -vj1b_ao
+        else:
+            vj1a_int3c -= vj1a_ao
+            vj1b_int3c -= vj1b_ao
+        vj1a_ao = vj1b_ao = None
+        vj1a_int3c = contract('nxiq,ip->nxpq', vj1a_int3c, mo_coeff[0])
+        vj1b_int3c = contract('nxiq,ip->nxpq', vj1b_int3c, mo_coeff[1])
+    if with_k:
+        vk1a_buf = intopt.unsort_orbitals(vk1a_buf, axis=[1,2])
+        vk1b_buf = intopt.unsort_orbitals(vk1b_buf, axis=[1,2])
+        if not hessobj.auxbasis_response:
+            vk1a_int3c = -vk1a_ao
+            vk1b_int3c = -vk1b_ao
+        else:
+            vk1a_int3c -= vk1a_ao
+            vk1b_int3c -= vk1b_ao
+        vk1a_ao = vk1b_ao = None
+        vk1a_int3c = contract('nxiq,ip->nxpq', vk1a_int3c, mo_coeff[0])
+        vk1b_int3c = contract('nxiq,ip->nxpq', vk1b_int3c, mo_coeff[1])
+    t0 = log.timer_debug1('Fock matrix due to int3c2e_ip1', *t0)
+
     mocca = intopt.unsort_orbitals(mocca, axis=[0])
     moccb = intopt.unsort_orbitals(moccb, axis=[0])
     mo_coeff = intopt.unsort_orbitals(mo_coeff, axis=[1])
@@ -639,21 +676,19 @@ def _ao2mo(mat, mocc, mo):
 
     for i0, ia in enumerate(atmlst):
         shl0, shl1, p0, p1 = aoslices[ia]
-        vj1_ao = cupy.zeros([3,nao,nao])
-        vk1a_ao = cupy.zeros([3,nao,nao])
-        vk1b_ao = cupy.zeros([3,nao,nao])
-
-        vj1_ao[:,p0:p1,:] -= vj1_buf[:,p0:p1,:]
-        vj1_ao[:,:,p0:p1] -= vj1_buf[:,p0:p1,:].transpose(0,2,1)
+        if with_j:
+            vj1_ao = cupy.zeros([3,nao,nao])
+            vj1_ao[:,p0:p1,:] -= vj1_buf[:,p0:p1,:]
+            vj1_ao[:,:,p0:p1] -= vj1_buf[:,p0:p1,:].transpose(0,2,1)
+            vj1a_int3c[ia] += _ao2mo(vj1_ao, mocca, mo_coeff[0])
+            vj1b_int3c[ia] += _ao2mo(vj1_ao, moccb, mo_coeff[1])
         if with_k:
+            vk1a_ao = cupy.zeros([3,nao,nao])
+            vk1b_ao = cupy.zeros([3,nao,nao])
             vk1a_ao[:,p0:p1,:] -= vk1a_buf[:,p0:p1,:]
             vk1a_ao[:,:,p0:p1] -= vk1a_buf[:,p0:p1,:].transpose(0,2,1)
             vk1b_ao[:,p0:p1,:] -= vk1b_buf[:,p0:p1,:]
             vk1b_ao[:,:,p0:p1] -= vk1b_buf[:,p0:p1,:].transpose(0,2,1)
-
-        vj1a_int3c[ia] += _ao2mo(vj1_ao, mocca, mo_coeff[0])
-        vj1b_int3c[ia] += _ao2mo(vj1_ao, moccb, mo_coeff[1])
-        if with_k:
             vk1a_int3c[ia] += _ao2mo(vk1a_ao, mocca, mo_coeff[0])
             vk1b_int3c[ia] += _ao2mo(vk1b_ao, moccb, mo_coeff[1])
     return (vj1a_int3c, vj1b_int3c), (vk1a_int3c, vk1b_int3c)
diff --git a/gpu4pyscf/df/hessian/uks.py b/gpu4pyscf/df/hessian/uks.py
index 31273a7d..059f571c 100644
--- a/gpu4pyscf/df/hessian/uks.py
+++ b/gpu4pyscf/df/hessian/uks.py
@@ -28,6 +28,7 @@
 from gpu4pyscf.hessian import uhf as uhf_hess
 from gpu4pyscf.hessian import uks as uks_hess
 from gpu4pyscf.df.hessian import uhf as df_uhf_hess
+from gpu4pyscf.df.hessian.uhf import _partial_hess_ejk, _get_jk_ip
 from gpu4pyscf.lib import logger
 from gpu4pyscf.lib.cupy_helper import contract
 
@@ -52,17 +53,17 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
 
     omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
     with_k = mf._numint.libxc.is_hybrid_xc(mf.xc)
-    de2, ej, ek = df_uhf_hess._partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ,
-                                                atmlst, max_memory, verbose,
-                                                with_k=with_k)
+    de2, ej, ek = _partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ,
+                                    atmlst, max_memory, verbose,
+                                    with_j=True, with_k=with_k)
     de2 += ej  # (A,B,dR_A,dR_B)
     if with_k:
         de2 -= hyb * ek
 
     if abs(omega) > 1e-10 and abs(alpha-hyb) > 1e-10:
-        ek_lr = df_uhf_hess._partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ,
-                                            atmlst, max_memory, verbose,
-                                            True, omega=omega)[2]
+        ek_lr = _partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ,
+                                  atmlst, max_memory, verbose,
+                                  with_j=False, with_k=True, omega=omega)[2]
         de2 -= (alpha - hyb) * ek_lr
 
     max_memory = None
@@ -98,11 +99,11 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
     omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
     mem_now = lib.current_memory()[0]
     max_memory = max(2000, mf.max_memory*.9-mem_now)
-    
+
     with_k = ni.libxc.is_hybrid_xc(mf.xc)
 
-    vj1, vk1 = df_uhf_hess._get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile,
-                                       atmlst, verbose, with_k)
+    vj1, vk1 = _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile,
+                          atmlst, verbose, with_j=True, with_k=True)
     vj1a, vj1b = vj1
     h1moa = vj1a
     h1mob = vj1b
@@ -112,10 +113,10 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
         h1moa -= hyb * vk1a
         h1mob -= hyb * vk1b
     vj1 = vk1 = vj1a = vj1b = vk1a = vk1b = None
-    
+
     if abs(omega) > 1e-10 and abs(alpha-hyb) > 1e-10:
-        _, vk1_lr = df_uhf_hess._get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile,
-                                             atmlst, verbose, True, omega=omega)
+        _, vk1_lr = _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile,
+                               atmlst, verbose, with_j=False, with_k=True, omega=omega)
         vk1a, vk1b = vk1_lr
         h1moa -= (alpha - hyb) * vk1a
         h1mob -= (alpha - hyb) * vk1b
diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py
index 98350c59..ce972503 100644
--- a/gpu4pyscf/df/int3c2e.py
+++ b/gpu4pyscf/df/int3c2e.py
@@ -20,7 +20,7 @@
 from pyscf import gto, df, lib
 from pyscf.scf import _vhf
 from gpu4pyscf.scf.int4c2e import BasisProdCache, libgvhf, libgint
-from gpu4pyscf.lib.cupy_helper import (block_c2s_diag, cart2sph, contract, get_avail_mem, 
+from gpu4pyscf.lib.cupy_helper import (block_c2s_diag, cart2sph, contract, get_avail_mem,
                                        reduce_to_device)
 from gpu4pyscf.lib import logger
 from gpu4pyscf.gto.mole import basis_seg_contraction
@@ -115,7 +115,7 @@ def build(self, cutoff=1e-14, group_size=None,
 
         mol = basis_seg_contraction(_mol,allow_replica=True)
         auxmol = basis_seg_contraction(_auxmol, allow_replica=True)
-        
+
         log = logger.new_logger(_mol, _mol.verbose)
         cput0 = log.init_timer()
         _sorted_mol, sorted_idx, uniq_l_ctr, l_ctr_counts = sort_mol(mol, log=log)
@@ -181,7 +181,7 @@ def build(self, cutoff=1e-14, group_size=None,
 
         aux_loc = _auxmol.ao_loc_nr(cart=_auxmol.cart)
         ao_idx = np.array_split(np.arange(_auxmol.nao), aux_loc[1:-1])
-        self._aux_ao_idx = np.hstack([ao_idx[i] for i in sorted_aux_idx])        
+        self._aux_ao_idx = np.hstack([ao_idx[i] for i in sorted_aux_idx])
         cput1 = log.timer_debug1('Aux AO indices', *cput1)
 
         ao_loc = _sorted_mol.ao_loc_nr(cart=_mol.cart)
@@ -260,7 +260,7 @@ def build(self, cutoff=1e-14, group_size=None,
 
         self._sorted_mol = _sorted_mol
         self._sorted_auxmol = _sorted_auxmol
-    
+
     @property
     def bpcache(self):
         device_id = cupy.cuda.Device().id
@@ -310,15 +310,15 @@ def unsort_orbitals(self, sorted_mat, axis=[], aux_axis=[]):
         mat = cupy.empty_like(sorted_mat)
         mat[tuple(fancy_index)] = sorted_mat
         return mat
-    
+
     @property
     def cart2sph(self):
         return block_c2s_diag(self.angular, self.l_ctr_counts)
-    
+
     @property
     def aux_cart2sph(self):
         return block_c2s_diag(self.aux_angular, self.aux_l_ctr_counts)
-    
+
     @property
     def coeff(self):
         nao = self.mol.nao
@@ -339,36 +339,45 @@ def aux_coeff(self):
             self._aux_coeff = self.unsort_orbitals(self.aux_cart2sph, aux_axis=[1])
         return self._aux_coeff
 
-def get_int3c2e_wjk(mol, auxmol, dm0_tag, thred=1e-12, omega=None, with_k=True):
+def get_int3c2e_wjk(mol, auxmol, dm0_tag, thred=1e-12, omega=None, with_j=True, with_k=True):
     log = logger.new_logger(mol, mol.verbose)
     intopt = VHFOpt(mol, auxmol, 'int2e')
-    intopt.build(thred, diag_block_with_triu=True, aosym=True, group_size=BLKSIZE, group_size_aux=BLKSIZE)
+    intopt.build(thred, diag_block_with_triu=True, aosym=True,
+                 group_size=BLKSIZE, group_size_aux=BLKSIZE)
     orbo = dm0_tag.occ_coeff
     nao = mol.nao
     naux = auxmol.nao
     nocc = orbo.shape[1]
-    wj = cupy.empty([naux])
-    avail_mem = get_avail_mem()
-    use_gpu_memory = True
-    if naux*nao*nocc*8 < 0.4*avail_mem:
-        try:
-            wk = cupy.empty([naux,nao,nocc])
-        except Exception:
+
+    wj = None
+    if with_j:
+        wj = cupy.empty([naux])
+
+    wk = None
+    if with_k:
+        avail_mem = get_avail_mem()
+        use_gpu_memory = True
+        if naux*nao*nocc*8 < 0.4*avail_mem:
+            try:
+                wk = cupy.empty([naux,nao,nocc])
+            except Exception:
+                use_gpu_memory = False
+        else:
             use_gpu_memory = False
-    else:
-        use_gpu_memory = False
-    
-    if not use_gpu_memory:
-        log.debug('Saving int3c2e_wjk on CPU memory')
-        mem = cupy.cuda.alloc_pinned_memory(naux*nao*nocc*8)
-        wk = np.ndarray([naux,nao,nocc], dtype=np.float64, order='C', buffer=mem)
+
+        if not use_gpu_memory:
+            log.debug('Saving int3c2e_wjk on CPU memory')
+            mem = cupy.cuda.alloc_pinned_memory(naux*nao*nocc*8)
+            wk = np.ndarray([naux,nao,nocc], dtype=np.float64, order='C', buffer=mem)
 
     # TODO: async data transfer
     for cp_kl_id, _ in enumerate(intopt.aux_log_qs):
         k0 = intopt.aux_ao_loc[cp_kl_id]
         k1 = intopt.aux_ao_loc[cp_kl_id+1]
-        rhoj_tmp = cupy.zeros([k1-k0], order='C')
-        rhok_tmp = cupy.zeros([k1-k0, nao, nocc], order='C')
+        if with_j:
+            rhoj_tmp = cupy.zeros([k1-k0], order='C')
+        if with_k:
+            rhok_tmp = cupy.zeros([k1-k0, nao, nocc], order='C')
 
         for cp_ij_id, _ in enumerate(intopt.log_qs):
             cpi = intopt.cp_idx[cp_ij_id]
@@ -381,15 +390,17 @@ def get_int3c2e_wjk(mol, auxmol, dm0_tag, thred=1e-12, omega=None, with_k=True):
                 int3c_blk = cart2sph(int3c_blk, axis=2, ang=li)
             i0, i1 = intopt.ao_loc[cpi], intopt.ao_loc[cpi+1]
             j0, j1 = intopt.ao_loc[cpj], intopt.ao_loc[cpj+1]
-
-            tmp = contract('Lji,ij->L', int3c_blk, dm0_tag[i0:i1,j0:j1])
-            rhoj_tmp += tmp
-            rhok_tmp[:,j0:j1] += contract('Lji,io->Ljo', int3c_blk, orbo[i0:i1])
-
-            if cpi != cpj and intopt.aosym:
+            if with_j:
+                tmp = contract('Lji,ij->L', int3c_blk, dm0_tag[i0:i1,j0:j1])
                 rhoj_tmp += tmp
-                rhok_tmp[:,i0:i1] += contract('Lji,jo->Lio', int3c_blk, orbo[j0:j1])
-        wj[k0:k1] = rhoj_tmp
+                if cpi != cpj:
+                    rhoj_tmp += tmp
+            if with_k:
+                rhok_tmp[:,j0:j1] += contract('Lji,io->Ljo', int3c_blk, orbo[i0:i1])
+                if cpi != cpj:
+                    rhok_tmp[:,i0:i1] += contract('Lji,jo->Lio', int3c_blk, orbo[j0:j1])
+        if with_j:
+            wj[k0:k1] = rhoj_tmp
         if with_k:
             if isinstance(wk, cupy.ndarray):
                 wk[k0:k1] = rhok_tmp
@@ -505,7 +516,7 @@ def loop_int3c2e_general(intopt, task_list=None, ip_type='', omega=None, stream=
     for aux_id, cp_ij_id in task_list:
         cp_kl_id = aux_id + len(intopt.log_qs)
         lk = intopt.aux_angular[aux_id]
-        
+
         cpi = intopt.cp_idx[cp_ij_id]
         cpj = intopt.cp_jdx[cp_ij_id]
         li = intopt.angular[cpi]
@@ -670,26 +681,26 @@ def get_j_int3c2e_pass1(intopt, dm0, sort_j=True, stream=None):
     get rhoj pass1 for int3c2e
     '''
     if stream is None: stream = cupy.cuda.get_current_stream()
-    
+
     n_dm = 1
 
     naux = intopt._sorted_auxmol.nao
-    
+
     coeff = intopt.coeff
     if dm0.ndim == 3:
         dm0 = dm0[0] + dm0[1]
     dm_cart = coeff @ dm0 @ coeff.T
-    
+
     num_cp_ij = [len(log_qs) for log_qs in intopt.log_qs]
     num_cp_kl = [len(log_qs) for log_qs in intopt.aux_log_qs]
 
     bins_locs_ij = np.append(0, np.cumsum(num_cp_ij)).astype(np.int32)
     bins_locs_kl = np.append(0, np.cumsum(num_cp_kl)).astype(np.int32)
-    
+
     ncp_ij = len(intopt.log_qs)
     ncp_kl = len(intopt.aux_log_qs)
     norb = dm_cart.shape[0]
-    
+
     rhoj = cupy.zeros([naux])
 
     err = libgvhf.GINTbuild_j_int3c2e_pass1(
@@ -706,7 +717,7 @@ def get_j_int3c2e_pass1(intopt, dm0, sort_j=True, stream=None):
         ctypes.c_int(ncp_kl))
     if err != 0:
         raise RuntimeError('CUDA error in get_j_pass1')
-    
+
     if sort_j:
         aux_coeff = intopt.aux_coeff
         rhoj = cupy.dot(rhoj, aux_coeff)
@@ -731,7 +742,7 @@ def get_j_int3c2e_pass2(intopt, rhoj, stream=None):
 
     ncp_ij = len(intopt.log_qs)
     ncp_kl = len(intopt.aux_log_qs)
-    
+
     rhoj = intopt.sort_orbitals(rhoj, aux_axis=[0])
     if not intopt.auxmol.cart:
         rhoj = intopt.aux_cart2sph @ rhoj
@@ -751,7 +762,7 @@ def get_j_int3c2e_pass2(intopt, rhoj, stream=None):
 
     if err != 0:
         raise RuntimeError('CUDA error in get_j_pass2')
-    
+
     if not intopt.mol.cart:
         cart2sph = intopt.cart2sph
         vj = cart2sph.T @ vj @ cart2sph
@@ -804,20 +815,24 @@ def get_int3c2e_jk(mol, auxmol, dm0_tag, with_k=True, omega=None):
             rhok[k0:k1] = rhok_tmp
     return rhoj, rhok
 
-def _int3c2e_ip1_vjk_task(intopt, task_list, rhoj, rhok, dm0, orbo, device_id=0, with_k=True, omega=None):
+def _int3c2e_ip1_vjk_task(intopt, task_list, rhoj, rhok, dm0, orbo, device_id=0,
+                          with_j=True, with_k=True, omega=None):
     natom = intopt.mol.natm
     nao = intopt.mol.nao
     aoslices = intopt.mol.aoslice_by_atom()
+    vj1_buf = vk1_buf = vj1 = vk1 = None
     with cupy.cuda.Device(device_id), _streams[device_id]:
         ao2atom = get_ao2atom(intopt, aoslices)
-        rhoj = cupy.asarray(rhoj)
         dm0 = cupy.asarray(dm0)
         orbo = cupy.asarray(orbo)
         nocc = orbo.shape[1]
-        vj1_buf = cupy.zeros([3,nao,nao])
-        vk1_buf = cupy.zeros([3,nao,nao])
-        vj1 = cupy.zeros([natom,3,nao,nocc])
-        vk1 = cupy.zeros([natom,3,nao,nocc])
+        if with_j:
+            rhoj = cupy.asarray(rhoj)
+            vj1_buf = cupy.zeros([3,nao,nao])
+            vj1 = cupy.zeros([natom,3,nao,nocc])
+        if with_k:
+            vk1_buf = cupy.zeros([3,nao,nao])
+            vk1 = cupy.zeros([natom,3,nao,nocc])
         aux_ao_loc = intopt.aux_ao_loc
         ncp_ij = len(intopt.log_qs)
         for cp_k in task_list:
@@ -827,15 +842,18 @@ def _int3c2e_ip1_vjk_task(intopt, task_list, rhoj, rhok, dm0, orbo, device_id=0,
             if with_k:
                 rhok0 = contract('pio,ir->pro', rhok_tmp, orbo)
                 rhok0 = contract('pro,Jo->prJ', rhok0, orbo)
-            rhoj0 = cupy.zeros([3,k1-k0,nao])
-            int3c_ip1_occ = cupy.zeros([3,k1-k0,nao,nocc])
+                int3c_ip1_occ = cupy.zeros([3,k1-k0,nao,nocc])
+            if with_j:
+                rhoj0 = cupy.zeros([3,k1-k0,nao])
+
             for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, task_list=task_list,
                                                                      ip_type='ip1', omega=omega):
-                vj1_buf[:,i0:i1,j0:j1] += contract('xpji,p->xij', int3c_blk, rhoj[k0:k1])
-                rhoj0[:,:,i0:i1] += contract('xpji,ij->xpi', int3c_blk, dm0[i0:i1,j0:j1])
-                int3c_ip1_occ[:,:,i0:i1] += contract('xpji,jo->xpio', int3c_blk, orbo[j0:j1])
-
+                if with_j:
+                    vj1_buf[:,i0:i1,j0:j1] += contract('xpji,p->xij', int3c_blk, rhoj[k0:k1])
+                    rhoj0[:,:,i0:i1] += contract('xpji,ij->xpi', int3c_blk, dm0[i0:i1,j0:j1])
                 if with_k:
+                    int3c_ip1_occ[:,:,i0:i1] += contract('xpji,jo->xpio', int3c_blk, orbo[j0:j1])
+
                     vk1_ao = contract('xpji,poi->xijo', int3c_blk, rhok0[:,:,i0:i1])
                     vk1[:,:,j0:j1] += contract('xijo,ia->axjo', vk1_ao, ao2atom[i0:i1])
 
@@ -845,14 +863,17 @@ def _int3c2e_ip1_vjk_task(intopt, task_list, rhoj, rhok, dm0, orbo, device_id=0,
                     vk1_ao = contract('xpio,pJi->xiJo', int3c_occ, rhok0_slice)
                     vk1 += contract('xiJo,ia->axJo', vk1_ao, ao2atom[i0:i1])
                     vk1_ao = int3c_occ = None
-            rhoj0_atom = contract('xpi,ia->xpa', rhoj0, 2.0*ao2atom)
-            vj1 += contract('pJo,xpa->axJo', rhok_tmp, rhoj0_atom)
-            rhoj0_atom = None
-            vk1_buf += contract('xpio,plo->xil', int3c_ip1_occ, rhok_tmp)
+            if with_j:
+                rhoj0_atom = contract('xpi,ia->xpa', rhoj0, 2.0*ao2atom)
+                vj1 += contract('pJo,xpa->axJo', rhok_tmp, rhoj0_atom)
+                rhoj0_atom = None
+            if with_k:
+                vk1_buf += contract('xpio,plo->xil', int3c_ip1_occ, rhok_tmp)
     # TODO: absorbe vj1_buf and vk1_buf into vj1 and vk1
     return vj1_buf, vk1_buf, vj1, vk1
 
-def get_int3c2e_ip1_vjk(intopt, rhoj, rhok, dm0_tag, aoslices, with_k=True, omega=None):
+def get_int3c2e_ip1_vjk(intopt, rhoj, rhok, dm0_tag, aoslices, with_j=True,
+                        with_k=True, omega=None):
     orbo = cupy.asarray(dm0_tag.occ_coeff, order='C')
     futures = []
     ncp_k = len(intopt.aux_log_qs)
@@ -860,15 +881,16 @@ def get_int3c2e_ip1_vjk(intopt, rhoj, rhok, dm0_tag, aoslices, with_k=True, omeg
     task_list = []
     for device_id in range(_num_devices):
         task_list.append(tasks[device_id::_num_devices])
-    
+
     cupy.cuda.get_current_stream().synchronize()
     with ThreadPoolExecutor(max_workers=_num_devices) as executor:
         for device_id in range(_num_devices):
             future = executor.submit(
-                _int3c2e_ip1_vjk_task, intopt, task_list[device_id], 
-                rhoj, rhok, dm0_tag, orbo, with_k=with_k, device_id=device_id, omega=omega)
+                _int3c2e_ip1_vjk_task, intopt, task_list[device_id],
+                rhoj, rhok, dm0_tag, orbo, with_j=with_j, with_k=with_k,
+                device_id=device_id, omega=omega)
             futures.append(future)
-    
+
     vj1_buf_total = []
     vk1_buf_total = []
     vj1_total = []
@@ -879,45 +901,55 @@ def get_int3c2e_ip1_vjk(intopt, rhoj, rhok, dm0_tag, aoslices, with_k=True, omeg
         vk1_buf_total.append(vk1_buf)
         vj1_total.append(vj1)
         vk1_total.append(vk1)
-        
+
     vj1 = vk1 = vj1_buf = vk1_buf = None
-    vj1 = reduce_to_device(vj1_total, inplace=True)
-    vj1_buf = reduce_to_device(vj1_buf_total, inplace=True)
+    if with_j:
+        vj1 = reduce_to_device(vj1_total, inplace=True)
+        vj1_buf = reduce_to_device(vj1_buf_total, inplace=True)
     if with_k:
         vk1 = reduce_to_device(vk1_total, inplace=True)
         vk1_buf = reduce_to_device(vk1_buf_total, inplace=True)
     return vj1_buf, vk1_buf, vj1, vk1
 
 
-def _int3c2e_ip2_vjk_task(intopt, task_list, rhoj, rhok, dm0, orbo, device_id=0, with_k=True, omega=None):
+def _int3c2e_ip2_vjk_task(intopt, task_list, rhoj, rhok, dm0, orbo,
+                          device_id=0, with_j=True, with_k=True, omega=None):
     natom = intopt.mol.natm
     nao = intopt.mol.nao
     auxslices = intopt.auxmol.aoslice_by_atom()
+    vj1 = vk1 = None
     with cupy.cuda.Device(device_id), _streams[device_id]:
         aux2atom = get_aux2atom(intopt, auxslices)
-        rhoj = cupy.asarray(rhoj)
         dm0 = cupy.asarray(dm0)
         orbo = cupy.asarray(orbo)
         nocc = orbo.shape[1]
-        vj1 = cupy.zeros([natom,3,nao,nocc])
-        vk1 = cupy.zeros([natom,3,nao,nocc])
+        if with_j:
+            rhoj = cupy.asarray(rhoj)
+            vj1 = cupy.zeros([natom,3,nao,nocc])
+        if with_k:
+            vk1 = cupy.zeros([natom,3,nao,nocc])
         aux_ao_loc = intopt.aux_ao_loc
         ncp_ij = len(intopt.log_qs)
         for cp_k in task_list:
             task_list = [(cp_k, cp_ij) for cp_ij in range(ncp_ij)]
             k0, k1 = aux_ao_loc[cp_k], aux_ao_loc[cp_k+1]
-            wj2 = cupy.zeros([3,k1-k0])
+            if with_j:
+                wj2 = cupy.zeros([3,k1-k0])
+
             wk2_P__ = cupy.zeros([3,k1-k0,nao,nocc])
             for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, task_list=task_list,
                                                                      ip_type='ip2', omega=omega):
                 # contraction
-                wj2 += contract('xpji,ji->xp', int3c_blk, dm0[j0:j1,i0:i1])
+                if with_j:
+                    wj2 += contract('xpji,ji->xp', int3c_blk, dm0[j0:j1,i0:i1])
+
                 wk2_P__[:,:,i0:i1] += contract('xpji,jo->xpio', int3c_blk, orbo[j0:j1])
             rhok_tmp = cupy.asarray(rhok[k0:k1])
-            vj1_tmp = -contract('pio,xp->xpio', rhok_tmp, wj2)
-            vj1_tmp -= contract('xpio,p->xpio', wk2_P__, rhoj[k0:k1])
+            if with_j:
+                vj1_tmp = -contract('pio,xp->xpio', rhok_tmp, wj2)
+                vj1_tmp -= contract('xpio,p->xpio', wk2_P__, rhoj[k0:k1])
 
-            vj1 += contract('xpio,pa->axio', vj1_tmp, aux2atom[k0:k1])
+                vj1 += contract('xpio,pa->axio', vj1_tmp, aux2atom[k0:k1])
             if with_k:
                 #rhok0_slice = contract('pio,jo->pij', rhok_tmp, orbo)
                 #vk1_tmp = -contract('xpjo,pij->xpio', wk2_P__, rhok0_slice)
@@ -932,7 +964,8 @@ def _int3c2e_ip2_vjk_task(intopt, task_list, rhoj, rhok, dm0, orbo, device_id=0,
             rhok_tmp = vk1_tmp = None
     return vj1, vk1
 
-def get_int3c2e_ip2_vjk(intopt, rhoj, rhok, dm0_tag, auxslices, with_k=True, omega=None):
+def get_int3c2e_ip2_vjk(intopt, rhoj, rhok, dm0_tag, auxslices,
+                        with_j=True, with_k=True, omega=None):
     '''
     vj and vk responses (due to int3c2e_ip2) to changes in atomic positions
     '''
@@ -943,24 +976,26 @@ def get_int3c2e_ip2_vjk(intopt, rhoj, rhok, dm0_tag, auxslices, with_k=True, ome
     task_list = []
     for device_id in range(_num_devices):
         task_list.append(tasks[device_id::_num_devices])
-    
+
     cupy.cuda.get_current_stream().synchronize()
     with ThreadPoolExecutor(max_workers=_num_devices) as executor:
         for device_id in range(_num_devices):
             future = executor.submit(
-                _int3c2e_ip2_vjk_task, intopt, task_list[device_id], 
-                rhoj, rhok, dm0_tag, orbo, with_k=with_k, device_id=device_id, omega=omega)
+                _int3c2e_ip2_vjk_task, intopt, task_list[device_id],
+                rhoj, rhok, dm0_tag, orbo, with_j=with_j,
+                with_k=with_k, device_id=device_id, omega=omega)
             futures.append(future)
-    
+
     vj_total = []
     vk_total = []
     for future in futures:
         vj, vk = future.result()
         vj_total.append(vj)
         vk_total.append(vk)
-        
+
     vj = vk = None
-    vj = reduce_to_device(vj_total, inplace=True)
+    if with_j:
+        vj = reduce_to_device(vj_total, inplace=True)
     if with_k:
         vk = reduce_to_device(vk_total, inplace=True)
     return vj, vk
@@ -999,7 +1034,7 @@ def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None):
     task_list = []
     for device_id in range(_num_devices):
         task_list.append(tasks[device_id::_num_devices])
-    
+
     nao = intopt.mol.nao
     naux = intopt.auxmol.nao
     nocc = orbo.shape[1]
@@ -1012,7 +1047,7 @@ def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None):
     with ThreadPoolExecutor(max_workers=_num_devices) as executor:
         for device_id in range(_num_devices):
             future = executor.submit(
-                _int3c2e_ip1_wjk_task, intopt, task_list[device_id], 
+                _int3c2e_ip1_wjk_task, intopt, task_list[device_id],
                 dm0_tag, orbo, wk, with_k=with_k, device_id=device_id, omega=omega)
             futures.append(future)
     wj_total = []
@@ -1049,7 +1084,7 @@ def get_int3c2e_ip2_wjk(intopt, dm0_tag, with_k=True, omega=None):
     task_list = []
     for device_id in range(_num_devices):
         task_list.append(tasks[device_id::_num_devices])
-    
+
     cupy.cuda.get_current_stream().synchronize()
     with ThreadPoolExecutor(max_workers=_num_devices) as executor:
         for device_id in range(_num_devices):
@@ -1057,14 +1092,14 @@ def get_int3c2e_ip2_wjk(intopt, dm0_tag, with_k=True, omega=None):
                 _int3c2e_ip2_wjk, intopt, task_list[device_id],
                 dm0_tag, orbo, with_k=with_k, device_id=device_id, omega=omega)
             futures.append(future)
-    
+
     wj_total = []
     wk_total = []
     for future in futures:
         wj, wk = future.result()
         wj_total.append(wj)
         wk_total.append(wk)
-        
+
     wj = wk = None
     wj = reduce_to_device(wj_total, inplace=True)
     if with_k:
@@ -1373,7 +1408,7 @@ def get_int3c2e_slice(intopt, cp_ij_id, cp_aux_id, cart=False, aosym=None, out=N
     nbins = 1
     bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32)
     bins_locs_kl = np.array([0, len(log_q_kl)], dtype=np.int32)
-    
+
     cart_ao_loc = intopt.cart_ao_loc
     cart_aux_loc = intopt.cart_aux_loc
     i0, i1 = cart_ao_loc[cpi], cart_ao_loc[cpi+1]
@@ -1415,11 +1450,11 @@ def get_int3c2e_slice(intopt, cp_ij_id, cp_aux_id, cart=False, aosym=None, out=N
 
     if err != 0:
         raise RuntimeError('GINT_fill_int2e failed')
-    
+
     # move this operation to j2c?
     if lk > 1 and intopt.auxmol.cart == 0:
         int3c_blk = cart2sph(int3c_blk, axis=0, ang=lk, out=out)
-    
+
     stream.synchronize()
 
     return int3c_blk
diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py
index 2ca835ad..e5b4d297 100644
--- a/gpu4pyscf/hessian/rhf.py
+++ b/gpu4pyscf/hessian/rhf.py
@@ -27,8 +27,6 @@
 from pyscf.hessian import rhf as rhf_hess_cpu
 from pyscf import lib, gto
 from pyscf.gto import ATOM_OF
-# import _response_functions to load gen_response methods in SCF class
-from gpu4pyscf.scf import _response_functions  # noqa
 from gpu4pyscf.scf import cphf
 from gpu4pyscf.lib.cupy_helper import (reduce_to_device,
     contract, tag_array, sandwich_dot, transpose_sum, get_avail_mem, condense,
@@ -181,7 +179,7 @@ def _ejk_ip2_task(mol, dms, vhfopt, task_list, j_factor=1.0, k_factor=1.0,
         cput0 = log.init_timer()
         dms = cp.asarray(dms)
         coeff = cp.asarray(vhfopt.coeff)
-        
+
         #:dms = cp.einsum('pi,nij,qj->npq', vhfopt.coeff, dms, vhfopt.coeff)
         dms = sandwich_dot(dms, coeff.T)
         dms = cp.asarray(dms, order='C')
@@ -661,8 +659,8 @@ def fvind_vo(mo1):
     avail_mem = get_avail_mem()
     # *4 for input dm, vj, vk, and vxc
     blksize = int(min(avail_mem*.3 / (8*3*nao*nocc*4), # in MO
-                      avail_mem*.6 / (8*nmo*nocc*3*5), 
-                      avail_mem*.3 / (8*nmo*nmo*3*3))) # vj, vk, dm 
+                      avail_mem*.6 / (8*nmo*nocc*3*5),
+                      avail_mem*.3 / (8*nmo*nmo*3*3))) # vj, vk, dm
     if blksize < ALIGNED**2:
         raise RuntimeError('GPU memory insufficient for solving CPHF equations')
 
@@ -692,7 +690,7 @@ def fvind_vo(mo1):
         mo1[:,:,viridx] *= -e_ai
         mo1[:,:,occidx] = -s1mo_blk[:,:,occidx] * .5
         hs = s1mo_blk = h1mo_blk = None
-        
+
         tol = mf.conv_tol_cpscf * (i1 - i0)
         raw_mo1 = krylov(fvind_vo, mo1.reshape(-1,nmo*nocc),
                          tol=tol, max_cycle=max_cycle, verbose=log)
@@ -742,7 +740,7 @@ def hess_nuc_elec(mol, dm):
     fakemol.verbose = mol.verbose
     fakemol.stdout = mol.stdout
     intopt = int3c2e.VHFOpt(mol, fakemol, 'int2e')
-    intopt.build(1e-14, diag_block_with_triu=True, aosym=False, 
+    intopt.build(1e-14, diag_block_with_triu=True, aosym=False,
                  group_size=int3c2e.BLKSIZE, group_size_aux=int3c2e.BLKSIZE)
     dm = intopt.sort_orbitals(cupy.asarray(dm), axis=[0,1])
 
@@ -889,7 +887,7 @@ def get_hcore(iatm, jatm):
 def hcore_generator(hessobj, mol=None):
     raise NotImplementedError
 
-def _get_jk_mo(hessobj, mol, dms, mo_coeff, mo_occ, 
+def _get_jk_mo(hessobj, mol, dms, mo_coeff, mo_occ,
             hermi=1, with_j=True, with_k=True, omega=None):
     ''' Compute J/K matrices in MO for multiple DMs
     '''
@@ -903,7 +901,7 @@ def _get_jk_mo(hessobj, mol, dms, mo_coeff, mo_occ,
     return vj, vk
 
 def _get_veff_resp_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1, omega=None):
-    vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, 
+    vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ,
                      hermi=hermi, with_j=True, with_k=True, omega=omega)
     return vj - 0.5 * vk
 
@@ -921,7 +919,7 @@ class HessianBase(lib.StreamObject):
     gen_vind        = NotImplemented
     get_jk          = NotImplemented
     kernel = hess = kernel
-    
+
     def get_hcore(self, mol=None):
         if mol is None: mol = self.mol
         return get_hcore(mol)
diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py
index 5d909c78..bffc221c 100644
--- a/gpu4pyscf/hessian/rks.py
+++ b/gpu4pyscf/hessian/rks.py
@@ -25,11 +25,10 @@
 from pyscf import lib
 from gpu4pyscf.hessian import rhf as rhf_hess
 from gpu4pyscf.grad import rhf as rhf_grad
-# import pyscf.grad.rks to activate nuc_grad_method method
 from gpu4pyscf.grad import rks as rks_grad
 from gpu4pyscf.dft import numint
-from gpu4pyscf.lib.cupy_helper import (contract, add_sparse, get_avail_mem, 
-                                       reduce_to_device, transpose_sum, tag_array)
+from gpu4pyscf.lib.cupy_helper import (contract, add_sparse, get_avail_mem,
+                                       reduce_to_device)
 from gpu4pyscf.lib import logger
 from gpu4pyscf.__config__ import _streams, _num_devices
 from gpu4pyscf.hessian import jk
@@ -737,15 +736,15 @@ def get_veff_resp_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1, omega=None):
         vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, hermi=1)
         vk *= hyb
         if omega > 1e-10:  # For range separated Coulomb
-            _, vk_lr = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, hermi, 
-                                        with_j=False, omega=omega) 
+            _, vk_lr = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, hermi,
+                                        with_j=False, omega=omega)
             vk_lr *= (alpha-hyb)
             vk += vk_lr
         v1 += vj - .5 * vk
     else:
-        v1 += hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, hermi=1, 
+        v1 += hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, hermi=1,
                                 with_k=False)[0]
-    
+
     return v1
 
 
diff --git a/gpu4pyscf/hessian/uhf.py b/gpu4pyscf/hessian/uhf.py
index 44154532..b3cff989 100644
--- a/gpu4pyscf/hessian/uhf.py
+++ b/gpu4pyscf/hessian/uhf.py
@@ -21,15 +21,12 @@
 Non-relativistic UHF analytical Hessian
 '''
 
-from functools import reduce
 import numpy as np
 import cupy
 import cupy as cp
 from pyscf import lib
 from pyscf.scf import ucphf
-# import _response_functions to load gen_response methods in SCF class
-from gpu4pyscf.scf import _response_functions  # noqa
-from gpu4pyscf.lib.cupy_helper import (contract, transpose_sum, get_avail_mem, 
+from gpu4pyscf.lib.cupy_helper import (contract, transpose_sum, get_avail_mem,
                                        krylov, tag_array)
 from gpu4pyscf.lib import logger
 from gpu4pyscf.grad import rhf as rhf_grad
@@ -406,7 +403,7 @@ def fx(mo1):
     return fx
 
 def _get_veff_resp_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1):
-    vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, 
+    vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ,
                                hermi=hermi, with_j=True, with_k=True)
     return vj - vk
 
@@ -422,7 +419,7 @@ class Hessian(rhf_hess_gpu.HessianBase):
     gen_vind = gen_vind
     get_jk_mo = rhf_hess_gpu._get_jk_mo
     get_veff_resp_mo = _get_veff_resp_mo
-    
+
     def solve_mo1(self, mo_energy, mo_coeff, mo_occ, h1mo,
                   fx=None, atmlst=None, max_memory=4000, verbose=None):
         return solve_mo1(self.base, mo_energy, mo_coeff, mo_occ, h1mo,
diff --git a/gpu4pyscf/hessian/uks.py b/gpu4pyscf/hessian/uks.py
index 66571300..2a048f5f 100644
--- a/gpu4pyscf/hessian/uks.py
+++ b/gpu4pyscf/hessian/uks.py
@@ -23,11 +23,9 @@
 from gpu4pyscf.hessian import rhf as rhf_hess
 from gpu4pyscf.hessian import uhf as uhf_hess
 from gpu4pyscf.grad import rhf as rhf_grad
-# import pyscf.grad.rks to activate nuc_grad_method method
 from gpu4pyscf.grad import rks as rks_grad
 from gpu4pyscf.dft import numint
-from gpu4pyscf.lib.cupy_helper import (contract, add_sparse, get_avail_mem, 
-                                       transpose_sum, tag_array)
+from gpu4pyscf.lib.cupy_helper import (contract, add_sparse, get_avail_mem)
 from gpu4pyscf.lib import logger
 from gpu4pyscf.hessian import jk
 
@@ -856,7 +854,7 @@ def get_veff_resp_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1):
         # If cphf_grids is not defined, e.g object defined from CPU
         grids = getattr(mf, 'grids', None)
         logger.info(mf, 'Primary grids is used for CPHF in Hessian')
-    
+
     if grids and grids.coords is None:
         grids.build(mol=mol, with_non0tab=False, sort_grids=True)
 
@@ -866,7 +864,7 @@ def get_veff_resp_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1):
     moccb = mo_coeff[1][:,mo_occ[1]>0]
     nocca = mocca.shape[1]
     noccb = moccb.shape[1]
-    
+
     ni = mf._numint
     omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, mol.spin)
     hybrid = ni.libxc.is_hybrid_xc(mf.xc)
@@ -885,13 +883,13 @@ def get_veff_resp_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1):
         vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, hermi=1)
         vk *= hyb
         if omega > 1e-10:
-            _, vk_lr = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, 
-                                         hermi, with_j=False, omega=omega) 
+            _, vk_lr = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ,
+                                         hermi, with_j=False, omega=omega)
             vk_lr *= (alpha-hyb)
             vk += vk_lr
         v1vo += vj - vk
     else:
-        v1vo += hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, 
+        v1vo += hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ,
                                   hermi=1, with_k=False)[0]
     return v1vo
 

From dfc336d2f70ccf01d3c8d94f6eb7d2d13d75b9c0 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <wxj6000@gmail.com>
Date: Sun, 22 Dec 2024 21:19:04 -0800
Subject: [PATCH 11/49] memory estimate

---
 gpu4pyscf/hessian/rhf.py | 3 +--
 gpu4pyscf/hessian/uhf.py | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py
index e5b4d297..b2aab6c6 100644
--- a/gpu4pyscf/hessian/rhf.py
+++ b/gpu4pyscf/hessian/rhf.py
@@ -659,8 +659,7 @@ def fvind_vo(mo1):
     avail_mem = get_avail_mem()
     # *4 for input dm, vj, vk, and vxc
     blksize = int(min(avail_mem*.3 / (8*3*nao*nocc*4), # in MO
-                      avail_mem*.6 / (8*nmo*nocc*3*5),
-                      avail_mem*.3 / (8*nmo*nmo*3*3))) # vj, vk, dm
+                      avail_mem*.3 / (8*nmo*nmo*3*3))) # vj, vk, dm in AO
     if blksize < ALIGNED**2:
         raise RuntimeError('GPU memory insufficient for solving CPHF equations')
 
diff --git a/gpu4pyscf/hessian/uhf.py b/gpu4pyscf/hessian/uhf.py
index b3cff989..0f695b2b 100644
--- a/gpu4pyscf/hessian/uhf.py
+++ b/gpu4pyscf/hessian/uhf.py
@@ -293,7 +293,7 @@ def fvind_vo(mo1):
     avail_mem = get_avail_mem()
     # *8 for spin-up/down input dm, vj, vk, and vxc
     blksize = int(min(avail_mem*.3 / (8*3*nao*nao*8),
-                      avail_mem*.6 / (8*nmo*nocc*natm*3*5)))
+                      avail_mem*.3 / (8*nmo*nmo*3*6)))  # in vj, vk, dm in AO
     if blksize < ALIGNED**2:
         raise RuntimeError('GPU memory insufficient')
 

From 78759fccab62e1074de66f60e993f2a527cf48fb Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Mon, 23 Dec 2024 17:30:08 +0000
Subject: [PATCH 12/49] tested on 095 molecule

---
 examples/dft_driver.py  | 6 +++---
 gpu4pyscf/df/int3c2e.py | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/dft_driver.py b/examples/dft_driver.py
index 8060e909..e0eccdda 100644
--- a/examples/dft_driver.py
+++ b/examples/dft_driver.py
@@ -35,10 +35,10 @@
     basis=bas,
     max_memory=32000)
 # set verbose >= 6 for debugging timer
-mol.verbose = 4
+mol.verbose = 6
 
-mf_df = dft.RKS(mol, xc=args.xc)#.density_fit(auxbasis=args.auxbasis)
-mf_df.verbose = 4
+mf_df = dft.RKS(mol, xc=args.xc).density_fit(auxbasis=args.auxbasis)
+mf_df.verbose = 6
 
 if args.solvent:
     mf_df = mf_df.PCM()
diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py
index ce972503..2606f3ef 100644
--- a/gpu4pyscf/df/int3c2e.py
+++ b/gpu4pyscf/df/int3c2e.py
@@ -815,7 +815,7 @@ def get_int3c2e_jk(mol, auxmol, dm0_tag, with_k=True, omega=None):
             rhok[k0:k1] = rhok_tmp
     return rhoj, rhok
 
-def _int3c2e_ip1_vjk_task(intopt, task_list, rhoj, rhok, dm0, orbo, device_id=0,
+def _int3c2e_ip1_vjk_task(intopt, task_k_list, rhoj, rhok, dm0, orbo, device_id=0,
                           with_j=True, with_k=True, omega=None):
     natom = intopt.mol.natm
     nao = intopt.mol.nao
@@ -835,7 +835,7 @@ def _int3c2e_ip1_vjk_task(intopt, task_list, rhoj, rhok, dm0, orbo, device_id=0,
             vk1 = cupy.zeros([natom,3,nao,nocc])
         aux_ao_loc = intopt.aux_ao_loc
         ncp_ij = len(intopt.log_qs)
-        for cp_k in task_list:
+        for cp_k in task_k_list:
             task_list = [(cp_k, cp_ij) for cp_ij in range(ncp_ij)]
             k0, k1 = aux_ao_loc[cp_k], aux_ao_loc[cp_k+1]
             rhok_tmp = cupy.asarray(rhok[k0:k1])

From 65b4bff21de316d49acbe433fde325474b092eeb Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Mon, 23 Dec 2024 18:54:34 +0000
Subject: [PATCH 13/49] improve make_h1 in df.hessian

---
 gpu4pyscf/df/int3c2e.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py
index 2606f3ef..89822c96 100644
--- a/gpu4pyscf/df/int3c2e.py
+++ b/gpu4pyscf/df/int3c2e.py
@@ -821,6 +821,7 @@ def _int3c2e_ip1_vjk_task(intopt, task_k_list, rhoj, rhok, dm0, orbo, device_id=
     nao = intopt.mol.nao
     aoslices = intopt.mol.aoslice_by_atom()
     vj1_buf = vk1_buf = vj1 = vk1 = None
+
     with cupy.cuda.Device(device_id), _streams[device_id]:
         ao2atom = get_ao2atom(intopt, aoslices)
         dm0 = cupy.asarray(dm0)
@@ -856,19 +857,20 @@ def _int3c2e_ip1_vjk_task(intopt, task_k_list, rhoj, rhok, dm0, orbo, device_id=
 
                     vk1_ao = contract('xpji,poi->xijo', int3c_blk, rhok0[:,:,i0:i1])
                     vk1[:,:,j0:j1] += contract('xijo,ia->axjo', vk1_ao, ao2atom[i0:i1])
-
-                    int3c_occ = contract('xpji,jo->xpio', int3c_blk, orbo[j0:j1])
-                    rhok0_slice = contract('pJr,ir->pJi', rhok_tmp, orbo[i0:i1])
-
-                    vk1_ao = contract('xpio,pJi->xiJo', int3c_occ, rhok0_slice)
-                    vk1 += contract('xiJo,ia->axJo', vk1_ao, ao2atom[i0:i1])
-                    vk1_ao = int3c_occ = None
             if with_j:
                 rhoj0_atom = contract('xpi,ia->xpa', rhoj0, 2.0*ao2atom)
                 vj1 += contract('pJo,xpa->axJo', rhok_tmp, rhoj0_atom)
                 rhoj0_atom = None
             if with_k:
                 vk1_buf += contract('xpio,plo->xil', int3c_ip1_occ, rhok_tmp)
+                mem_avail = get_avail_mem()
+                blksize = min(int(mem_avail * 0.2 / ((k1-k0) * nao) * 8),
+                              int(mem_avail * 0.2 / (nocc * nao * 3 * 8)))
+                for p0, p1, in lib.prange(0, nao, blksize):
+                    rhok0_slice = contract('pJr,ir->pJi', rhok_tmp[:,p0:p1], orbo)
+                    vk1_ao = contract('xpio,pJi->xiJo', int3c_ip1_occ, rhok0_slice)
+                    vk1[:,:,p0:p1] += contract('xiJo,ia->axJo', vk1_ao, ao2atom)
+
     # TODO: absorbe vj1_buf and vk1_buf into vj1 and vk1
     return vj1_buf, vk1_buf, vj1, vk1
 

From 509fc6e5272b9ffb858297effe7657cef4ff4ce1 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <wxj6000@gmail.com>
Date: Tue, 24 Dec 2024 09:54:39 -0800
Subject: [PATCH 14/49] bugfix

---
 gpu4pyscf/scf/jk.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gpu4pyscf/scf/jk.py b/gpu4pyscf/scf/jk.py
index ecf166fd..8e09a35b 100644
--- a/gpu4pyscf/scf/jk.py
+++ b/gpu4pyscf/scf/jk.py
@@ -59,7 +59,7 @@
                    int(gpu_specs['sharedMemPerBlockOptin']//9)*8)
 THREADS = 256
 
-def _jk_task(mol, dms, vhfopt, task_list,
+def _jk_task(mol, dms, vhfopt, task_list, hermi=0,
              device_id=0, with_j=True, with_k=True, verbose=None):
     n_dm = dms.shape[0]
     nao, _ = vhfopt.coeff.shape

From 0ae65fb028ec69931bd7246f7dd5c2ff8bad54ea Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <wxj6000@gmail.com>
Date: Tue, 24 Dec 2024 10:52:24 -0800
Subject: [PATCH 15/49] use sorted_mol

---
 gpu4pyscf/hessian/jk.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/gpu4pyscf/hessian/jk.py b/gpu4pyscf/hessian/jk.py
index f4f102c6..a1cd6105 100644
--- a/gpu4pyscf/hessian/jk.py
+++ b/gpu4pyscf/hessian/jk.py
@@ -27,7 +27,7 @@
 from pyscf.scf import _vhf
 from pyscf import __config__
 
-from gpu4pyscf.scf.jk import (_make_tril_tile_mappings, quartets_scheme, QUEUE_DEPTH, 
+from gpu4pyscf.scf.jk import (_make_tril_tile_mappings, quartets_scheme, QUEUE_DEPTH,
                               _VHFOpt, LMAX, init_constant, libvhf_rys)
 from gpu4pyscf.lib.cupy_helper import (condense, sandwich_dot, transpose_sum,
                                        reduce_to_device, contract)
@@ -49,7 +49,7 @@ def _jk_task(mol, dms, mo_coeff, mo_occ, vhfopt, task_list, hermi=0,
     l_ctr_bas_loc = vhfopt.l_ctr_offsets
     l_symb = [lib.param.ANGULAR[i] for i in uniq_l]
     kern = libvhf_rys.RYS_build_jk
-    
+
     timing_counter = Counter()
     kern_counts = 0
     with cp.cuda.Device(device_id), _streams[device_id]:
@@ -69,7 +69,7 @@ def _jk_task(mol, dms, mo_coeff, mo_occ, vhfopt, task_list, hermi=0,
         s_ptr = lib.c_null_ptr()
         if mol.omega < 0:
             s_ptr = ctypes.cast(vhfopt.s_estimator.data.ptr, ctypes.c_void_p)
-        
+
         vj = vk = None
         vj_ptr = vk_ptr = lib.c_null_ptr()
         assert with_j or with_k
@@ -79,7 +79,7 @@ def _jk_task(mol, dms, mo_coeff, mo_occ, vhfopt, task_list, hermi=0,
         if with_j:
             vj = cp.zeros(dms.shape)
             vj_ptr = ctypes.cast(vj.data.ptr, ctypes.c_void_p)
-        
+
         ao_loc = mol.ao_loc
         dm_cond = cp.log(condense('absmax', dms, ao_loc) + 1e-300).astype(np.float32)
         log_max_dm = dm_cond.max()
@@ -137,7 +137,7 @@ def _jk_task(mol, dms, mo_coeff, mo_occ, vhfopt, task_list, hermi=0,
             # Unrestricted case
             mo_coeff = cp.asarray(mo_coeff)
             mo_occ = cp.asarray(mo_occ)
-            moa = coeff.dot(mo_coeff[0]) 
+            moa = coeff.dot(mo_coeff[0])
             mob = coeff.dot(mo_coeff[1])
             nmoa, nmob = moa.shape[1], mob.shape[1]
             mocca = moa[:,mo_occ[0] > 0.5]
@@ -163,10 +163,10 @@ def _jk_task(mol, dms, mo_coeff, mo_occ, vhfopt, task_list, hermi=0,
                 vj = _ao2mo(vj, mocc, mo_coeff).reshape(n_dm,-1)
             if with_k:
                 vk = _ao2mo(vk, mocc, mo_coeff).reshape(n_dm,-1)
-        
+
     return vj, vk, kern_counts, timing_counter
 
-def get_jk(mol, dm, mo_coeff, mo_occ, hermi=0, vhfopt=None, 
+def get_jk(mol, dm, mo_coeff, mo_occ, hermi=0, vhfopt=None,
            with_j=True, with_k=True, verbose=None):
     '''Compute J, K matrices in MO
     '''
@@ -176,7 +176,7 @@ def get_jk(mol, dm, mo_coeff, mo_occ, hermi=0, vhfopt=None,
     if vhfopt is None:
         vhfopt = _VHFOpt(mol).build()
 
-    mol = vhfopt.mol
+    mol = vhfopt.sorted_mol
     nao, nao_orig = vhfopt.coeff.shape
 
     dm = cp.asarray(dm, order='C')
@@ -205,7 +205,7 @@ def get_jk(mol, dm, mo_coeff, mo_occ, hermi=0, vhfopt=None,
             future = executor.submit(
                 _jk_task,
                 mol, dms, mo_coeff, mo_occ, vhfopt, task_list[device_id], hermi=hermi,
-                with_j=with_j, with_k=with_k, verbose=verbose, 
+                with_j=with_j, with_k=with_k, verbose=verbose,
                 device_id=device_id)
             futures.append(future)
 
@@ -224,7 +224,7 @@ def get_jk(mol, dm, mo_coeff, mo_occ, hermi=0, vhfopt=None,
         log.debug1('kernel launches %d', kern_counts)
         for llll, t in timing_collection.items():
             log.debug1('%s wall time %.2f', llll, t)
-    
+
     for s in _streams:
         s.synchronize()
     cp.cuda.get_current_stream().synchronize()

From be6cf61c723aec6259913ad0c27e8c18f9108f38 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Thu, 26 Dec 2024 18:01:44 +0000
Subject: [PATCH 16/49] update nightly build

---
 gpu4pyscf/df/hessian/jk.py           |   5 +-
 gpu4pyscf/gto/int3c1e.py             |  68 +++++---
 gpu4pyscf/gto/int3c1e_ip.py          |  64 ++++----
 gpu4pyscf/solvent/grad/pcm.py        |  13 +-
 gpu4pyscf/solvent/grad/smd.py        |  90 -----------
 gpu4pyscf/tests/020_Vitamin_C.xyz    |  22 +++
 gpu4pyscf/tests/057_Tamoxifen.xyz    |  59 +++++++
 gpu4pyscf/tests/095_Azadirachtin.xyz |  97 +++++++++++
 gpu4pyscf/tests/test_dft.py          | 180 ---------------------
 gpu4pyscf/tests/test_rks.py          | 230 +++++++++++++++++++++++++++
 gpu4pyscf/tests/test_uks.py          |  92 +++++++++++
 11 files changed, 585 insertions(+), 335 deletions(-)
 create mode 100644 gpu4pyscf/tests/020_Vitamin_C.xyz
 create mode 100644 gpu4pyscf/tests/057_Tamoxifen.xyz
 create mode 100644 gpu4pyscf/tests/095_Azadirachtin.xyz
 delete mode 100644 gpu4pyscf/tests/test_dft.py
 create mode 100644 gpu4pyscf/tests/test_rks.py
 create mode 100644 gpu4pyscf/tests/test_uks.py

diff --git a/gpu4pyscf/df/hessian/jk.py b/gpu4pyscf/df/hessian/jk.py
index f8992ca3..e66739d2 100644
--- a/gpu4pyscf/df/hessian/jk.py
+++ b/gpu4pyscf/df/hessian/jk.py
@@ -316,6 +316,7 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo,
                 hj_ipip1[:,i0:i1] += contract('xpi,p->xi', tmp, rhoj[k0:k1])
             if with_k:
                 hk_ipip1[:,i0:i1] += contract('xpji,pji->xi', int3c_blk, rhok_tmp)
+            int3c_blk = None
 
             # (11|0), (0|0)(0|00) without response of RI basis
             int3c_blk = _get_int3c2e_ipip_slice('ipvip1', intopt, cp_ij_id, aux_id, omega=omega)
@@ -324,6 +325,7 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo,
                 hj_ipvip1[:,i0:i1,j0:j1] += contract('xpij,p->xij', tmp, rhoj[k0:k1])
             if with_k:
                 hk_ipvip1[:,i0:i1,j0:j1] += contract('xpji,pji->xij', int3c_blk, rhok_tmp)
+            int3c_blk = None
 
             if auxbasis_response < 1:
                 continue
@@ -335,6 +337,7 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo,
                 hj_ip1ip2[:,i0:i1,k0:k1] += contract('xpi,p->xip', tmp, rhoj[k0:k1])
             if with_k:
                 hk_ip1ip2[:,i0:i1,k0:k1] += contract('xpji,pji->xip', int3c_blk, rhok_tmp)
+            int3c_blk = None
 
             if auxbasis_response < 2:
                 continue
@@ -346,7 +349,7 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo,
                 hj_ipip2[:,k0:k1] += contract('xp,p->xp', tmp, rhoj[k0:k1])
             if with_k:
                 hk_ipip2[:,k0:k1] += contract('xpji,pji->xp', int3c_blk, rhok_tmp)
-
+            int3c_blk = None
         auxslices = intopt.auxmol.aoslice_by_atom()
         aoslices = intopt.mol.aoslice_by_atom()
         ao2atom = int3c2e.get_ao2atom(intopt, aoslices)
diff --git a/gpu4pyscf/gto/int3c1e.py b/gpu4pyscf/gto/int3c1e.py
index 9fa6c98d..d693b804 100644
--- a/gpu4pyscf/gto/int3c1e.py
+++ b/gpu4pyscf/gto/int3c1e.py
@@ -15,7 +15,7 @@
 import ctypes
 import cupy as cp
 import numpy as np
-
+from pyscf import lib
 from pyscf.scf import _vhf
 from pyscf.gto import ATOM_OF
 from pyscf.lib import c_null_ptr
@@ -161,7 +161,6 @@ def get_n_hermite_density_of_angular_pair(l):
 
     def sort_orbitals(self, mat, axis=[]):
         ''' Transform given axis of a matrix into sorted AO,
-        and transform given auxiliary axis of a matrix into sorted auxiliary AO
         '''
         idx = self._ao_idx
         shape_ones = (1,) * mat.ndim
@@ -176,6 +175,24 @@ def sort_orbitals(self, mat, axis=[]):
             fancy_index.append(indices.reshape(idx_shape))
         return mat[tuple(fancy_index)]
 
+    def unsort_orbitals(self, sorted_mat, axis=[]):
+        ''' Transform given axis of a matrix into sorted AO,
+        '''
+        idx = self._ao_idx
+        shape_ones = (1,) * sorted_mat.ndim
+        fancy_index = []
+        for dim, n in enumerate(sorted_mat.shape):
+            if dim in axis:
+                assert n == len(idx)
+                indices = idx
+            else:
+                indices = np.arange(n)
+            idx_shape = shape_ones[:dim] + (-1,) + shape_ones[dim+1:]
+            fancy_index.append(indices.reshape(idx_shape))
+        mat = cp.empty_like(sorted_mat)
+        mat[tuple(fancy_index)] = sorted_mat
+        return mat
+
     @property
     def bpcache(self):
         device_id = cp.cuda.Device().id
@@ -205,17 +222,17 @@ def get_int3c1e(mol, grids, charge_exponents, intopt):
                         "which requires {total_double_number * 8 / 1e9 : .1f} GB of memory")
     ngrids_per_split = (ngrids + n_grid_split - 1) // n_grid_split
 
-    int3c_pinned_memory_pool = cp.cuda.alloc_pinned_memory(ngrids * nao * nao * np.array([1.0]).nbytes)
-    int3c = np.frombuffer(int3c_pinned_memory_pool, np.float64, ngrids * nao * nao).reshape([ngrids, nao, nao], order='C')
+    buf_size = ngrids * nao * nao * 8
+    int3c_pinned_buf = cp.cuda.alloc_pinned_memory(buf_size)
+    int3c = np.frombuffer(int3c_pinned_buf, np.float64, buf_size).reshape([ngrids, nao, nao], order='C')
     # int3c = np.zeros([ngrids, nao, nao], order='C') # Using unpinned (pageable) memory, each memcpy is much slower, but there's no initialization time
 
     grids = cp.asarray(grids, order='C')
     if charge_exponents is not None:
         charge_exponents = cp.asarray(charge_exponents, order='C')
 
-    for i_grid_split in range(0, ngrids, ngrids_per_split):
-        ngrids_of_split = np.min([ngrids_per_split, ngrids - i_grid_split])
-        int3c_grid_slice = cp.zeros([ngrids_of_split, nao, nao], order='C')
+    for p0, p1 in lib.prange(0, ngrids, ngrids_per_split):
+        int3c_grid_slice = cp.zeros([p1-p0, nao, nao], order='C')
         for cp_ij_id, _ in enumerate(intopt.log_qs):
             cpi = intopt.cp_idx[cp_ij_id]
             cpj = intopt.cp_jdx[cp_ij_id]
@@ -237,18 +254,18 @@ def get_int3c1e(mol, grids, charge_exponents, intopt):
             ao_offsets = np.array([i0, j0], dtype=np.int32)
             strides = np.array([ni, ni*nj], dtype=np.int32)
 
-            int3c_angular_slice = cp.zeros([ngrids_of_split, j1-j0, i1-i0], order='C')
+            int3c_angular_slice = cp.zeros([p1-p0, j1-j0, i1-i0], order='C')
 
             charge_exponents_pointer = c_null_ptr()
             if charge_exponents is not None:
-                charge_exponents_pointer = charge_exponents[i_grid_split : i_grid_split + ngrids_of_split].data.ptr
+                charge_exponents_pointer = charge_exponents[p0:p1].data.ptr
 
             err = libgint.GINTfill_int3c1e(
                 ctypes.cast(stream.ptr, ctypes.c_void_p),
                 intopt.bpcache,
-                ctypes.cast(grids[i_grid_split : i_grid_split + ngrids_of_split, :].data.ptr, ctypes.c_void_p),
+                ctypes.cast(grids[p0:p1, :].data.ptr, ctypes.c_void_p),
                 ctypes.cast(charge_exponents_pointer, ctypes.c_void_p),
-                ctypes.c_int(ngrids_of_split),
+                ctypes.c_int(p1-p0),
                 ctypes.cast(int3c_angular_slice.data.ptr, ctypes.c_void_p),
                 strides.ctypes.data_as(ctypes.c_void_p),
                 ao_offsets.ctypes.data_as(ctypes.c_void_p),
@@ -270,11 +287,11 @@ def get_int3c1e(mol, grids, charge_exponents, intopt):
 
         row, col = np.tril_indices(nao)
         int3c_grid_slice[:, row, col] = int3c_grid_slice[:, col, row]
-        ao_idx = np.argsort(intopt._ao_idx)
-        grid_idx = np.arange(ngrids_of_split)
-        int3c_grid_slice = int3c_grid_slice[np.ix_(grid_idx, ao_idx, ao_idx)]
-
-        int3c_grid_slice.get(out = int3c[i_grid_split : i_grid_split + ngrids_of_split, :, :])
+        #ao_idx = np.argsort(intopt._ao_idx)
+        #grid_idx = np.arange(p1-p0)
+        #int3c_grid_slice = int3c_grid_slice[np.ix_(grid_idx, ao_idx, ao_idx)]
+        int3c_grid_slice = intopt.unsort_orbitals(int3c_grid_slice, axis=[1,2])
+        int3c_grid_slice.get(out = int3c[p0:p1, :, :])
 
     return int3c
 
@@ -355,9 +372,9 @@ def get_int3c1e_charge_contracted(mol, grids, charge_exponents, charges, intopt)
 
     row, col = np.tril_indices(nao)
     int1e_charge_contracted[row, col] = int1e_charge_contracted[col, row]
-    ao_idx = np.argsort(intopt._ao_idx)
-    int1e_charge_contracted = int1e_charge_contracted[np.ix_(ao_idx, ao_idx)]
-
+    #ao_idx = np.argsort(intopt._ao_idx)
+    #int1e_charge_contracted = int1e_charge_contracted[np.ix_(ao_idx, ao_idx)]
+    int1e_charge_contracted = intopt.unsort_orbitals(int1e_charge_contracted, axis=[0,1])
     return int1e_charge_contracted
 
 def get_int3c1e_density_contracted(mol, grids, charge_exponents, dm, intopt):
@@ -385,7 +402,7 @@ def get_int3c1e_density_contracted(mol, grids, charge_exponents, dm, intopt):
     bas_coords = intopt._sorted_mol.atom_coords()[intopt._sorted_mol._bas[:, ATOM_OF]].flatten()
 
     n_total_hermite_density = intopt.density_offset[-1]
-    dm_pair_ordered = np.zeros(n_total_hermite_density)
+    dm_pair_ordered = np.empty(n_total_hermite_density)
     libgint.GINTinit_J_density_rys_preprocess(dm.ctypes.data_as(ctypes.c_void_p),
                                               dm_pair_ordered.ctypes.data_as(ctypes.c_void_p),
                                               ctypes.c_int(1), ctypes.c_int(nao_cart), ctypes.c_int(len(intopt.bas_pairs_locs) - 1),
@@ -413,8 +430,7 @@ def get_int3c1e_density_contracted(mol, grids, charge_exponents, dm, intopt):
 
     int3c_density_contracted = cp.zeros(ngrids)
 
-    for i_grid_split in range(0, ngrids, ngrids_per_split):
-        ngrids_of_split = np.min([ngrids_per_split, ngrids - i_grid_split])
+    for p0, p1 in lib.prange(0, ngrids, ngrids_per_split):
         for cp_ij_id, _ in enumerate(intopt.log_qs):
             stream = cp.cuda.get_current_stream()
 
@@ -425,7 +441,7 @@ def get_int3c1e_density_contracted(mol, grids, charge_exponents, dm, intopt):
 
             charge_exponents_pointer = c_null_ptr()
             if charge_exponents is not None:
-                charge_exponents_pointer = charge_exponents[i_grid_split : i_grid_split + ngrids_of_split].data.ptr
+                charge_exponents_pointer = charge_exponents[p0:p1].data.ptr
 
             # n_pair_sum_per_thread = 1 # means every thread processes one pair and one grid
             # n_pair_sum_per_thread = nao_cart # or larger number gaurantees one thread processes one grid and all pairs of the same type
@@ -434,12 +450,12 @@ def get_int3c1e_density_contracted(mol, grids, charge_exponents, dm, intopt):
             err = libgint.GINTfill_int3c1e_density_contracted(
                 ctypes.cast(stream.ptr, ctypes.c_void_p),
                 intopt.bpcache,
-                ctypes.cast(grids[i_grid_split : i_grid_split + ngrids_of_split, :].data.ptr, ctypes.c_void_p),
+                ctypes.cast(grids[p0:p1, :].data.ptr, ctypes.c_void_p),
                 ctypes.cast(charge_exponents_pointer, ctypes.c_void_p),
-                ctypes.c_int(ngrids_of_split),
+                ctypes.c_int(p1-p0),
                 ctypes.cast(dm_pair_ordered.data.ptr, ctypes.c_void_p),
                 intopt.density_offset.ctypes.data_as(ctypes.c_void_p),
-                ctypes.cast(int3c_density_contracted[i_grid_split : i_grid_split + ngrids_of_split].data.ptr, ctypes.c_void_p),
+                ctypes.cast(int3c_density_contracted[p0:p1].data.ptr, ctypes.c_void_p),
                 bins_locs_ij.ctypes.data_as(ctypes.c_void_p),
                 ctypes.c_int(nbins),
                 ctypes.c_int(cp_ij_id),
diff --git a/gpu4pyscf/gto/int3c1e_ip.py b/gpu4pyscf/gto/int3c1e_ip.py
index cc53feab..717db68f 100644
--- a/gpu4pyscf/gto/int3c1e_ip.py
+++ b/gpu4pyscf/gto/int3c1e_ip.py
@@ -15,7 +15,7 @@
 import ctypes
 import cupy as cp
 import numpy as np
-
+from pyscf import lib
 from pyscf.gto import ATOM_OF
 from pyscf.lib import c_null_ptr
 from gpu4pyscf.lib.cupy_helper import load_library, cart2sph, get_avail_mem
@@ -40,19 +40,19 @@ def get_int3c1e_ip(mol, grids, charge_exponents, intopt):
                         "the 3 center integral first derivative, "
                         "which requires {total_double_number * 8 / 1e9 : .1f} GB of memory")
     ngrids_per_split = (ngrids + n_grid_split - 1) // n_grid_split
-
-    int3cip1_pinned_memory_pool = cp.cuda.alloc_pinned_memory(ngrids * nao * nao * 3 * np.array([1.0]).nbytes)
-    int3c_ip1 = np.frombuffer(int3cip1_pinned_memory_pool, np.float64, ngrids * nao * nao * 3).reshape([3, ngrids, nao, nao], order='C')
-    int3cip2_pinned_memory_pool = cp.cuda.alloc_pinned_memory(ngrids * nao * nao * 3 * np.array([1.0]).nbytes)
-    int3c_ip2 = np.frombuffer(int3cip2_pinned_memory_pool, np.float64, ngrids * nao * nao * 3).reshape([3, ngrids, nao, nao], order='C')
+    
+    buf_size = ngrids * nao * nao * 3
+    int3cip1_pinned_buf = cp.cuda.alloc_pinned_memory(buf_size * 8)
+    int3c_ip1 = np.frombuffer(int3cip1_pinned_buf, np.float64, buf_size).reshape([3, ngrids, nao, nao], order='C')
+    int3cip2_pinned_buf = cp.cuda.alloc_pinned_memory(buf_size * 8)
+    int3c_ip2 = np.frombuffer(int3cip2_pinned_buf, np.float64, buf_size).reshape([3, ngrids, nao, nao], order='C')
 
     grids = cp.asarray(grids, order='C')
     if charge_exponents is not None:
         charge_exponents = cp.asarray(charge_exponents, order='C')
 
-    for i_grid_split in range(0, ngrids, ngrids_per_split):
-        ngrids_of_split = np.min([ngrids_per_split, ngrids - i_grid_split])
-        int3c_grid_slice = cp.zeros([6, ngrids_of_split, nao, nao], order='C')
+    for p0, p1 in lib.prange(0, ngrids, ngrids_per_split):
+        int3c_grid_slice = cp.zeros([6, p1-p0, nao, nao], order='C')
         for cp_ij_id, _ in enumerate(intopt.log_qs):
             cpi = intopt.cp_idx[cp_ij_id]
             cpj = intopt.cp_jdx[cp_ij_id]
@@ -74,18 +74,18 @@ def get_int3c1e_ip(mol, grids, charge_exponents, intopt):
             ao_offsets = np.array([i0, j0], dtype=np.int32)
             strides = np.array([ni, ni*nj], dtype=np.int32)
 
-            int3c_angular_slice = cp.zeros([6, ngrids_of_split, j1-j0, i1-i0], order='C')
+            int3c_angular_slice = cp.zeros([6, p1-p0, j1-j0, i1-i0], order='C')
 
             charge_exponents_pointer = c_null_ptr()
             if charge_exponents is not None:
-                charge_exponents_pointer = charge_exponents[i_grid_split : i_grid_split + ngrids_of_split].data.ptr
+                charge_exponents_pointer = charge_exponents[p0:p1].data.ptr
 
             err = libgint.GINTfill_int3c1e_ip(
                 ctypes.cast(stream.ptr, ctypes.c_void_p),
                 intopt.bpcache,
-                ctypes.cast(grids[i_grid_split : i_grid_split + ngrids_of_split, :].data.ptr, ctypes.c_void_p),
+                ctypes.cast(grids[p0:p1, :].data.ptr, ctypes.c_void_p),
                 ctypes.cast(charge_exponents_pointer, ctypes.c_void_p),
-                ctypes.c_int(ngrids_of_split),
+                ctypes.c_int(p1-p0),
                 ctypes.cast(int3c_angular_slice.data.ptr, ctypes.c_void_p),
                 strides.ctypes.data_as(ctypes.c_void_p),
                 ao_offsets.ctypes.data_as(ctypes.c_void_p),
@@ -106,17 +106,17 @@ def get_int3c1e_ip(mol, grids, charge_exponents, intopt):
             int3c_grid_slice[:, :, j0:j1, i0:i1] = int3c_angular_slice
 
         ao_idx = np.argsort(intopt._ao_idx)
-        grid_idx = np.arange(ngrids_of_split)
+        grid_idx = np.arange(p1-p0)
         derivative_idx = np.arange(6)
         int3c_grid_slice = int3c_grid_slice[np.ix_(derivative_idx, grid_idx, ao_idx, ao_idx)]
 
         # Each piece of the following memory is contiguous
-        int3c_grid_slice[0, :, :, :].get(out = int3c_ip1[0, i_grid_split : i_grid_split + ngrids_of_split, :, :])
-        int3c_grid_slice[1, :, :, :].get(out = int3c_ip1[1, i_grid_split : i_grid_split + ngrids_of_split, :, :])
-        int3c_grid_slice[2, :, :, :].get(out = int3c_ip1[2, i_grid_split : i_grid_split + ngrids_of_split, :, :])
-        int3c_grid_slice[3, :, :, :].get(out = int3c_ip2[0, i_grid_split : i_grid_split + ngrids_of_split, :, :])
-        int3c_grid_slice[4, :, :, :].get(out = int3c_ip2[1, i_grid_split : i_grid_split + ngrids_of_split, :, :])
-        int3c_grid_slice[5, :, :, :].get(out = int3c_ip2[2, i_grid_split : i_grid_split + ngrids_of_split, :, :])
+        int3c_grid_slice[0, :, :, :].get(out = int3c_ip1[0, p0:p1, :, :])
+        int3c_grid_slice[1, :, :, :].get(out = int3c_ip1[1, p0:p1, :, :])
+        int3c_grid_slice[2, :, :, :].get(out = int3c_ip1[2, p0:p1, :, :])
+        int3c_grid_slice[3, :, :, :].get(out = int3c_ip2[0, p0:p1, :, :])
+        int3c_grid_slice[4, :, :, :].get(out = int3c_ip2[1, p0:p1, :, :])
+        int3c_grid_slice[5, :, :, :].get(out = int3c_ip2[2, p0:p1, :, :])
 
     return int3c_ip1, int3c_ip2
 
@@ -134,7 +134,7 @@ def get_int3c1e_ip1_charge_contracted(mol, grids, charge_exponents, charges, int
     charges = charges.reshape([-1, 1], order='C')
     grids = cp.concatenate([grids, charges], axis=1)
 
-    int1e_charge_contracted = cp.zeros([3, mol.nao, mol.nao], order='C')
+    int1e_charge_contracted = cp.empty([3, mol.nao, mol.nao], order='C')
     for cp_ij_id, _ in enumerate(intopt.log_qs):
         cpi = intopt.cp_idx[cp_ij_id]
         cpj = intopt.cp_jdx[cp_ij_id]
@@ -193,11 +193,7 @@ def get_int3c1e_ip1_charge_contracted(mol, grids, charge_exponents, charges, int
 
         int1e_charge_contracted[:, j0:j1, i0:i1] = int1e_angular_slice
 
-    ao_idx = np.argsort(intopt._ao_idx)
-    derivative_idx = np.arange(3)
-    int1e_charge_contracted = int1e_charge_contracted[np.ix_(derivative_idx, ao_idx, ao_idx)]
-
-    return int1e_charge_contracted
+    return intopt.unsort_orbitals(int1e_charge_contracted, axis=[1,2])
 
 def get_int3c1e_ip2_density_contracted(mol, grids, charge_exponents, dm, intopt):
     omega = mol.omega
@@ -228,10 +224,11 @@ def get_int3c1e_ip2_density_contracted(mol, grids, charge_exponents, dm, intopt)
     bas_coords = intopt._sorted_mol.atom_coords()[intopt._sorted_mol._bas[:, ATOM_OF]].flatten()
 
     n_total_hermite_density = intopt.density_offset[-1]
-    dm_pair_ordered = np.zeros(n_total_hermite_density)
+    dm_pair_ordered = np.empty(n_total_hermite_density)
     libgint.GINTinit_J_density_rys_preprocess(dm.ctypes.data_as(ctypes.c_void_p),
                                               dm_pair_ordered.ctypes.data_as(ctypes.c_void_p),
-                                              ctypes.c_int(1), ctypes.c_int(nao_cart), ctypes.c_int(len(intopt.bas_pairs_locs) - 1),
+                                              ctypes.c_int(1), ctypes.c_int(nao_cart), 
+                                              ctypes.c_int(len(intopt.bas_pairs_locs) - 1),
                                               intopt.bas_pair2shls.ctypes.data_as(ctypes.c_void_p),
                                               intopt.bas_pairs_locs.ctypes.data_as(ctypes.c_void_p),
                                               l_ij.ctypes.data_as(ctypes.c_void_p),
@@ -252,8 +249,7 @@ def get_int3c1e_ip2_density_contracted(mol, grids, charge_exponents, dm, intopt)
 
     int3c_density_contracted = cp.zeros([3, ngrids], order='C')
 
-    for i_grid_split in range(0, ngrids, ngrids_per_split):
-        ngrids_of_split = np.min([ngrids_per_split, ngrids - i_grid_split])
+    for p0, p1 in lib.prange(0, ngrids, ngrids_per_split):
         for cp_ij_id, _ in enumerate(intopt.log_qs):
             stream = cp.cuda.get_current_stream()
 
@@ -264,7 +260,7 @@ def get_int3c1e_ip2_density_contracted(mol, grids, charge_exponents, dm, intopt)
 
             charge_exponents_pointer = c_null_ptr()
             if charge_exponents is not None:
-                charge_exponents_pointer = charge_exponents[i_grid_split : i_grid_split + ngrids_of_split].data.ptr
+                charge_exponents_pointer = charge_exponents[p0:p1].data.ptr
 
             # n_pair_sum_per_thread = 1 # means every thread processes one pair and one grid
             # n_pair_sum_per_thread = nao_cart # or larger number gaurantees one thread processes one grid and all pairs of the same type
@@ -273,12 +269,12 @@ def get_int3c1e_ip2_density_contracted(mol, grids, charge_exponents, dm, intopt)
             err = libgint.GINTfill_int3c1e_ip2_density_contracted(
                 ctypes.cast(stream.ptr, ctypes.c_void_p),
                 intopt.bpcache,
-                ctypes.cast(grids[i_grid_split : i_grid_split + ngrids_of_split, :].data.ptr, ctypes.c_void_p),
+                ctypes.cast(grids[p0:p1, :].data.ptr, ctypes.c_void_p),
                 ctypes.cast(charge_exponents_pointer, ctypes.c_void_p),
-                ctypes.c_int(ngrids_of_split),
+                ctypes.c_int(p1-p0),
                 ctypes.cast(dm_pair_ordered.data.ptr, ctypes.c_void_p),
                 intopt.density_offset.ctypes.data_as(ctypes.c_void_p),
-                ctypes.cast(int3c_density_contracted[:, i_grid_split : i_grid_split + ngrids_of_split].data.ptr, ctypes.c_void_p),
+                ctypes.cast(int3c_density_contracted[:, p0:p1].data.ptr, ctypes.c_void_p),
                 bins_locs_ij.ctypes.data_as(ctypes.c_void_p),
                 ctypes.c_int(nbins),
                 ctypes.c_int(cp_ij_id),
diff --git a/gpu4pyscf/solvent/grad/pcm.py b/gpu4pyscf/solvent/grad/pcm.py
index 0544f751..3fe7cb6c 100644
--- a/gpu4pyscf/solvent/grad/pcm.py
+++ b/gpu4pyscf/solvent/grad/pcm.py
@@ -24,7 +24,7 @@
 from pyscf import lib
 from pyscf import gto
 from pyscf.grad import rhf as rhf_grad
-
+from gpu4pyscf.gto import int3c1e
 from gpu4pyscf.solvent.pcm import PI, switch_h, libsolvent
 from gpu4pyscf.gto.int3c1e_ip import int1e_grids_ip1, int1e_grids_ip2
 from gpu4pyscf.lib.cupy_helper import contract
@@ -239,11 +239,16 @@ def grad_qv(pcmobj, dm):
     grid_coords = pcmobj.surface['grid_coords']
     q_sym       = pcmobj._intermediates['q_sym']
 
-    dvj = int1e_grids_ip1(mol, grid_coords, dm = dm, charges = q_sym, direct_scf_tol = 1e-14, charge_exponents = charge_exp**2)
-    dq  = int1e_grids_ip2(mol, grid_coords, dm = dm, charges = q_sym, direct_scf_tol = 1e-14, charge_exponents = charge_exp**2)
+    intopt = int3c1e.VHFOpt(mol)
+    intopt.build(1e-14, aosym=False)
+    dvj = int1e_grids_ip1(mol, grid_coords, dm = dm, charges = q_sym, 
+                          direct_scf_tol = 1e-14, charge_exponents = charge_exp**2,
+                          intopt=intopt)
+    dq  = int1e_grids_ip2(mol, grid_coords, dm = dm, charges = q_sym, 
+                          direct_scf_tol = 1e-14, charge_exponents = charge_exp**2,
+                          intopt=intopt)
 
     aoslice = mol.aoslice_by_atom()
-    aoslice = cupy.array(aoslice)
     dvj = 2.0 * cupy.asarray([cupy.sum(dvj[:,p0:p1], axis=1) for p0,p1 in aoslice[:,2:]])
     dq = cupy.asarray([cupy.sum(dq[:,p0:p1], axis=1) for p0,p1 in gridslice])
     de = dq + dvj
diff --git a/gpu4pyscf/solvent/grad/smd.py b/gpu4pyscf/solvent/grad/smd.py
index a3d850db..32ebc2ee 100644
--- a/gpu4pyscf/solvent/grad/smd.py
+++ b/gpu4pyscf/solvent/grad/smd.py
@@ -25,100 +25,10 @@
 from gpu4pyscf.solvent import pcm, smd
 from gpu4pyscf.solvent.grad import pcm as pcm_grad
 from gpu4pyscf.lib import logger
-from gpu4pyscf.lib.cupy_helper import contract
 
 def get_cds(smdobj):
     return smd.get_cds_legacy(smdobj)[1]
 
-"""
-def grad_solver(smdobj, dm):
-    '''
-    dE = 0.5*v* d(K^-1 R) *v + q*dv
-    v^T* d(K^-1 R)v = v^T*K^-1(dR - dK K^-1R)v = v^T K^-1(dR - dK q)
-    '''
-    mol = smdobj.mol
-    log = logger.new_logger(mol, mol.verbose)
-    t1 = log.init_timer()
-    if not smdobj._intermediates:
-        smdobj.build()
-    dm_cache = smdobj._intermediates.get('dm', None)
-    if dm_cache is not None and cupy.linalg.norm(dm_cache - dm) < 1e-10:
-        pass
-    else:
-        smdobj._get_vind(dm)
-
-    gridslice    = smdobj.surface['gslice_by_atom']
-    v_grids      = smdobj._intermediates['v_grids']
-    A            = smdobj._intermediates['A']
-    D            = smdobj._intermediates['D']
-    S            = smdobj._intermediates['S']
-    K            = smdobj._intermediates['K']
-    q            = smdobj._intermediates['q']
-
-    vK_1 = cupy.linalg.solve(K.T, v_grids)
-
-    dF, dA = pcm_grad.get_dF_dA(smdobj.surface)
-
-    with_D = smdobj.method.upper() in ['IEF-PCM', 'IEFPCM', 'SS(V)PE', 'SMD']
-    dD, dS, dSii = pcm_grad.get_dD_dS(smdobj.surface, dF, with_D=with_D, with_S=True)
-
-    epsilon = smdobj.eps
-    de = cupy.zeros([smdobj.mol.natm,3])
-
-    def contract_bra(a, B, c):
-        ''' i,xij,j->jx '''
-        tmp = a.dot(B)
-        return (tmp * c).T
-
-    def contract_ket(a, B, c):
-        ''' i,xij,j->ix '''
-        tmp = B.dot(c)
-        return (a*tmp).T
-
-    # IEF-PCM and SS(V)PE formally are the same in gradient calculation
-    # dR = f_eps/(2*pi) * (dD*A + D*dA),
-    # dK = dS - f_eps/(2*pi) * (dD*A*S + D*dA*S + D*A*dS)
-    f_epsilon = (epsilon - 1.0)/(epsilon + 1.0)
-    fac = f_epsilon/(2.0*np.pi)
-
-    Av = A*v_grids
-    de_dR  = 0.5*fac * contract_ket(vK_1, dD, Av)
-    de_dR -= 0.5*fac * contract_bra(vK_1, dD, Av)
-    de_dR  = cupy.asarray([cupy.sum(de_dR[p0:p1], axis=0) for p0,p1 in gridslice])
-
-    vK_1_D = vK_1.dot(D)
-    vK_1_Dv = vK_1_D * v_grids
-    de_dR += 0.5*fac * contract('j,xjn->nx', vK_1_Dv, dA)
-
-    de_dS0  = 0.5*contract_ket(vK_1, dS, q)
-    de_dS0 -= 0.5*contract_bra(vK_1, dS, q)
-    de_dS0  = cupy.asarray([cupy.sum(de_dS0[p0:p1], axis=0) for p0,p1 in gridslice])
-
-    vK_1_q = vK_1 * q
-    de_dS0 += 0.5*contract('i,xin->nx', vK_1_q, dSii)
-
-    vK_1_DA = vK_1_D*A
-    de_dS1  = 0.5*contract_ket(vK_1_DA, dS, q)
-    de_dS1 -= 0.5*contract_bra(vK_1_DA, dS, q)
-    de_dS1  = cupy.asarray([cupy.sum(de_dS1[p0:p1], axis=0) for p0,p1 in gridslice])
-
-    vK_1_DAq = vK_1_DA*q
-    de_dS1 += 0.5*contract('j,xjn->nx', vK_1_DAq, dSii)
-
-    Sq = cupy.dot(S,q)
-    ASq = A*Sq
-    de_dD  = 0.5*contract_ket(vK_1, dD, ASq)
-    de_dD -= 0.5*contract_bra(vK_1, dD, ASq)
-    de_dD  = cupy.asarray([cupy.sum(de_dD[p0:p1], axis=0) for p0,p1 in gridslice])
-
-    de_dA = 0.5*contract('j,xjn->nx', vK_1_D*Sq, dA)   # 0.5*cupy.einsum('j,xjn,j->nx', vK_1_D, dA, Sq)
-
-    de_dK = de_dS0 - fac * (de_dD + de_dA + de_dS1)
-    de += de_dR - de_dK
-
-    t1 = log.timer_debug1('grad solver', *t1)
-    return de.get()
-"""
 grad_solver = pcm_grad.grad_solver
 
 def make_grad_object(grad_method):
diff --git a/gpu4pyscf/tests/020_Vitamin_C.xyz b/gpu4pyscf/tests/020_Vitamin_C.xyz
new file mode 100644
index 00000000..e119c6d3
--- /dev/null
+++ b/gpu4pyscf/tests/020_Vitamin_C.xyz
@@ -0,0 +1,22 @@
+20
+Vitamin C
+C                 -0.07551087    1.68127663   -0.10745193
+O                  1.33621755    1.87147409   -0.39326987
+C                  1.67074668    2.95729545    0.49387976
+C                  0.41740763    3.77281969    0.78495878
+C                 -0.60481480    3.07572636    0.28906224
+H                 -0.19316298    1.01922455    0.72486113
+O                  0.35092043    5.03413298    1.45545728
+H                  0.42961487    5.74279041    0.81264173
+O                 -1.95331750    3.53349874    0.15912025
+H                 -2.55333895    2.78846397    0.23972698
+O                  2.81976302    3.20110148    0.94542226
+C                 -0.81772499    1.09230218   -1.32146482
+H                 -0.70955636    1.74951833   -2.15888136
+C                 -2.31163857    0.93420736   -0.98260166
+H                 -2.72575463    1.89080093   -0.74107186
+H                 -2.41980721    0.27699120   -0.14518512
+O                 -0.26428017   -0.18613595   -1.64425697
+H                 -0.72695910   -0.55328886   -2.40104423
+O                 -3.00083741    0.38730252   -2.10989934
+H                 -3.93210821    0.28874990   -1.89865997
diff --git a/gpu4pyscf/tests/057_Tamoxifen.xyz b/gpu4pyscf/tests/057_Tamoxifen.xyz
new file mode 100644
index 00000000..b51df6f5
--- /dev/null
+++ b/gpu4pyscf/tests/057_Tamoxifen.xyz
@@ -0,0 +1,59 @@
+57
+Tamoxifen
+C                 -1.42666665    1.35988349    0.01780185
+C                 -0.75139234    2.53486079    0.01780185
+C                 -2.96666665    1.35988349    0.01780185
+C                 -3.66418809    0.15160568    0.01780185
+C                 -3.66417225    2.56778831    0.01791304
+C                 -5.05890001    0.15132789    0.01723115
+H                 -3.11399504   -0.80051230    0.01693694
+C                 -5.05931013    2.56768367    0.01833813
+H                 -3.11457497    3.52019148    0.01809296
+C                 -5.75673144    1.35973487    0.01785909
+H                 -5.60876287   -0.80100973    0.01659711
+H                 -5.60899513    3.52021733    0.01884114
+H                 -6.85641138    1.35926586    0.01746817
+C                 -1.51874951    3.87006226    0.01780185
+C                 -1.63823871    4.60590036   -1.16149287
+C                 -2.09440347    4.34371845    1.19670832
+C                 -2.33266580    5.81544273   -1.16163975
+H                 -1.18363273    4.23258432   -2.09058400
+C                 -2.78991814    5.55312706    1.19651365
+H                 -2.00047584    3.76380313    2.12622693
+C                 -2.90901419    6.28907563    0.01764434
+H                 -2.42635385    6.39580205   -2.09099551
+H                 -3.24404320    5.92613353    2.12608927
+C                  0.78860766    2.53486079    0.01780185
+C                  1.48612910    3.74313859    0.01780185
+C                  1.48611327    1.32695597    0.01791304
+C                  2.88084102    3.74341639    0.01723115
+H                  0.93593606    4.69525658    0.01693694
+C                  2.88125115    1.32706060    0.01833813
+H                  0.93651599    0.37455279    0.01809296
+C                  3.57867246    2.53500940    0.01785909
+H                  3.43070389    4.69575400    0.01659711
+H                  3.43093615    0.37452694    0.01884114
+H                  4.67835240    2.53547842    0.01746817
+C                 -0.65930948    0.02468201    0.01780185
+H                 -0.04466478   -0.03344716   -0.85611628
+H                 -0.04386363   -0.03298673    0.89118649
+C                 -1.66236338   -1.14385651    0.01856968
+H                 -2.27713573   -1.08561745    0.89239069
+H                 -2.27768159   -1.08629703   -0.85491210
+H                 -1.12919956   -2.07156136    0.01876393
+O                 -3.62101473    7.52921876    0.01715974
+C                 -2.69982994    8.60858726    0.19402752
+H                 -2.03011871    8.64615667   -0.63962434
+H                 -2.14108178    8.45680900    1.09384076
+C                 -3.47584819    9.93535894    0.28927757
+H                 -4.05456450   10.07469158   -0.59986462
+H                 -4.12694690    9.90759901    1.13792346
+C                 -1.65137806   10.90285045    1.72438609
+H                 -2.24764703   10.40869908    2.46274761
+H                 -0.79110440   10.30633800    1.50302183
+H                 -1.33836538   11.85545774    2.09783276
+C                 -3.25771829   12.42866058    0.53449492
+H                 -2.56611180   13.24181825    0.60767325
+H                 -3.86037095   12.55070987   -0.34118410
+H                 -3.88574784   12.41553739    1.40069735
+N                 -2.48185199   11.10154878    0.44281205
diff --git a/gpu4pyscf/tests/095_Azadirachtin.xyz b/gpu4pyscf/tests/095_Azadirachtin.xyz
new file mode 100644
index 00000000..8c03f7bb
--- /dev/null
+++ b/gpu4pyscf/tests/095_Azadirachtin.xyz
@@ -0,0 +1,97 @@
+95
+Azadirachtin
+C                  0.24028400   -0.96854600    0.05735800
+C                  1.49955800   -0.38999400    0.79976500
+C                  1.84405900    1.11309900    0.52612700
+C                  0.61115200    2.06994900    0.41027500
+C                 -0.38718900    1.44909800   -0.58288900
+C                 -0.81198100    0.11367700    0.01403200
+H                  1.34464500   -0.48336800    1.89667000
+H                  0.90815500    3.09474100    0.10955200
+H                  0.07146500    1.40030200   -1.59457300
+H                 -1.08538000    0.33936800    1.09841400
+O                 -0.03234300    2.14051500    1.69756400
+H                  0.43832200    2.76739400    2.27637900
+O                 -1.64345600    2.15598600   -0.77527600
+C                 -2.74935800    1.17918600   -0.75355500
+H                 -3.33770900    1.41858200    0.14457000
+H                 -3.31820200    1.39744800   -1.66649800
+C                 -2.11058900   -0.22990000   -0.71994400
+C                  2.72998200    1.32748400   -0.70483200
+H                  2.81316800    2.38444500   -0.97758400
+H                  3.74960400    0.95856700   -0.53283000
+H                  2.35200700    0.78104000   -1.58051000
+C                  2.60140000   -1.34386400    0.30659000
+C                  0.84678200   -1.40613600   -1.29617000
+H                  0.88274800   -0.59319600   -2.03951200
+H                  0.38815200   -2.30137400   -1.74034600
+O                  2.22547600   -1.78168600   -1.02946800
+C                 -0.42290800   -2.19363100    0.75277400
+H                 -0.32012900   -3.08353500    0.10236100
+C                 -1.91400700   -2.00763500    1.11237500
+H                 -2.33420900   -2.99527800    1.38379200
+H                 -1.98093100   -1.38866600    2.03106200
+C                 -2.81353800   -1.37055100    0.02719800
+H                 -3.12020000   -2.14713900   -0.69849000
+C                 -1.82661295   -0.68751599   -2.16270012
+O                 -1.03585236   -0.24261727   -2.99355789
+O                 -2.59156054   -1.74766325   -2.52650357
+C                 -2.29916153   -2.14198817   -3.86960099
+H                 -2.96290254   -2.92828960   -4.16299137
+H                 -2.42740743   -1.30452633   -4.52313804
+H                 -1.28838658   -2.48820275   -3.92764814
+O                 -4.01986539   -0.90962471    0.64138134
+C                 -4.89301012   -1.93494775    0.80793745
+O                 -4.54153100   -3.05110585    0.42818050
+C                 -6.20834727   -1.48087047    1.46771166
+H                 -6.70958996   -0.78829922    0.82428269
+H                 -6.83594045   -2.33131805    1.63434212
+H                 -5.99341406   -1.00749899    2.40292455
+O                  0.29104226   -2.52037085    1.94793763
+C                  0.31248536   -3.86361432    2.13937213
+O                 -0.25336168   -4.56806573    1.30443072
+C                  1.07328546   -4.25938123    3.41849362
+C                  1.18469713   -5.56341278    3.77014145
+H                  0.75137836   -6.32562659    3.15681858
+C                  1.70966559   -3.16955354    4.30104443
+H                  2.52793619   -2.72004059    3.77829081
+H                  0.97813456   -2.42251044    4.52839648
+H                  2.06508607   -3.60889199    5.20964665
+C                  1.93754031   -5.94957419    5.05688405
+H                  1.46239499   -5.49165555    5.89917107
+H                  1.92238977   -7.01309886    5.17344190
+H                  2.95091533   -5.61227499    4.99207421
+C                  3.99823568   -0.71610148    0.14421916
+O                  4.54063921    0.18499764    0.78248292
+O                  4.69984280   -1.27738694   -0.87269582
+O                  2.69271189   -2.53050618    1.09933364
+H                  3.60067733   -2.84219679    1.10624230
+C                  5.98847134   -0.66885730   -0.99113633
+H                  6.49970371   -0.73075570   -0.05320774
+H                  6.55618159   -1.17887968   -1.74112449
+H                  5.87374685    0.35839671   -1.26770006
+C                  2.63486992    1.58151749    1.76176538
+C                  2.13434327    2.21842175    3.11643757
+C                  3.90461234    2.45387090    1.74128354
+O                  2.44467967    0.78466796    2.96396625
+C                  3.35337126    2.98709450    3.79243900
+C                  0.74513758    2.60743687    3.44136489
+O                  5.00327683    3.19196370    1.11718214
+C                  4.47769203    2.16352749    3.16877423
+H                  3.15573566    3.35599353    1.51547111
+C                  3.84794511    4.41584726    3.25643717
+H                  3.24116904    2.99889070    4.88162906
+H                  0.00697023    1.93995296    2.97068106
+H                  0.55491721    2.57388288    4.52449549
+H                  0.54134467    3.63458255    3.09753074
+C                  4.84981258    4.42246076    1.92099071
+H                  4.49637929    1.09030804    3.43212004
+H                  5.51163803    2.50195502    3.32489990
+C                  4.76579887    5.04464694    4.26535741
+O                  2.75093022    5.20578033    2.83652107
+H                  4.60685318    5.22136931    1.20459035
+O                  6.17282363    4.70855901    2.47193815
+H                  4.42807865    5.31783232    5.24674785
+C                  6.01838353    5.12565144    3.78006571
+H                  2.50011685    5.87405238    3.50751412
+H                  6.95619123    5.44887224    4.20308201
diff --git a/gpu4pyscf/tests/test_dft.py b/gpu4pyscf/tests/test_dft.py
deleted file mode 100644
index 94e7ed1e..00000000
--- a/gpu4pyscf/tests/test_dft.py
+++ /dev/null
@@ -1,180 +0,0 @@
-# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-import numpy as np
-import pyscf
-import pytest
-import cupy
-from gpu4pyscf.dft import rks, uks
-
-def setUpModule():
-    global mol
-    atom = '''
-C                 -0.07551087    1.68127663   -0.10745193
-O                  1.33621755    1.87147409   -0.39326987
-C                  1.67074668    2.95729545    0.49387976
-C                  0.41740763    3.77281969    0.78495878
-C                 -0.60481480    3.07572636    0.28906224
-H                 -0.19316298    1.01922455    0.72486113
-O                  0.35092043    5.03413298    1.45545728
-H                  0.42961487    5.74279041    0.81264173
-O                 -1.95331750    3.53349874    0.15912025
-H                 -2.55333895    2.78846397    0.23972698
-O                  2.81976302    3.20110148    0.94542226
-C                 -0.81772499    1.09230218   -1.32146482
-H                 -0.70955636    1.74951833   -2.15888136
-C                 -2.31163857    0.93420736   -0.98260166
-H                 -2.72575463    1.89080093   -0.74107186
-H                 -2.41980721    0.27699120   -0.14518512
-O                 -0.26428017   -0.18613595   -1.64425697
-H                 -0.72695910   -0.55328886   -2.40104423
-O                 -3.00083741    0.38730252   -2.10989934
-H                 -3.93210821    0.28874990   -1.89865997
-'''
-
-    mol = pyscf.M(atom=atom, basis='def2-tzvpp', max_memory=32000, cart=0)
-    mol.output = '/dev/null'
-    mol.build()
-    mol.verbose = 1
-
-def tearDownModule():
-    global mol
-    mol.stdout.close()
-    del mol
-
-class KnownValues(unittest.TestCase):
-    @pytest.mark.smoke
-    def test_b3lyp_with_d3bj(self):
-        print('-------- DFRKS with D3(BJ) -------')
-        mf = rks.RKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit')
-        mf.grids.atom_grid = (99,590)
-        mf.conv_tol = 1e-10
-        mf.conv_tol_cpscf = 1e-8
-        mf.disp = 'd3bj'
-        e_dft = mf.kernel()
-        assert np.abs(e_dft - -685.0326965348272) < 1e-7
-
-        g = mf.nuc_grad_method().kernel()
-        assert np.abs(cupy.linalg.norm(g) - 0.17498362161082373) < 1e-5
-
-        h = mf.Hessian().kernel()
-        assert np.abs(cupy.linalg.norm(h) - 3.7684319231335377) < 1e-4
-    
-    @pytest.mark.smoke
-    def test_b3lyp_d3bj(self):
-        print('-------- DFRKS with D3(BJ) -------')
-        mf = rks.RKS(mol, xc='b3lyp-d3bj').density_fit(auxbasis='def2-tzvpp-jkfit')
-        mf.grids.atom_grid = (99,590)
-        mf.conv_tol = 1e-10
-        mf.conv_tol_cpscf = 1e-8
-        e_dft = mf.kernel()
-        assert np.abs(e_dft - -685.0326965348272) < 1e-7
-
-        g = mf.nuc_grad_method().kernel()
-        assert np.abs(cupy.linalg.norm(g) - 0.17498362161082373) < 1e-5
-
-        h = mf.Hessian().kernel()
-        assert np.abs(cupy.linalg.norm(h) - 3.7684319231335377) < 1e-4
-
-    @pytest.mark.smoke
-    def test_DFUKS(self):
-        print('------- DFUKS with D3(BJ) -------')
-        mf = uks.UKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit')
-        mf.grids.atom_grid = (99,590)
-        mf.conv_tol = 1e-10
-        mf.conv_tol_cpscf = 1e-8
-        mf.disp = 'd3bj'
-        e_dft = mf.kernel()
-        assert np.abs(e_dft - -685.0326965349493) < 1e-7
-
-        g = mf.nuc_grad_method().kernel()
-        assert np.abs(cupy.linalg.norm(g) - 0.17498264516108836) < 1e-5
-
-        h = mf.Hessian().kernel()
-        assert np.abs(cupy.linalg.norm(h) - 3.768429871470736) < 1e-4
-
-    @pytest.mark.smoke
-    def test_RKS(self):
-        print('-------- RKS with D3(BJ) -------')
-        mf = rks.RKS(mol, xc='b3lyp')
-        mf.grids.atom_grid = (99,590)
-        mf.conv_tol = 1e-12
-        mf.conv_tol_cpscf = 1e-8
-        mf.disp = 'd3bj'
-        e_dft = mf.kernel()
-        assert np.abs(e_dft - -685.0325611822375) < 1e-7
-
-        g = mf.nuc_grad_method().kernel()
-        assert np.abs(cupy.linalg.norm(g) - 0.1750368231223345) < 1e-6
-
-        h = mf.Hessian().kernel()
-        assert np.abs(cupy.linalg.norm(h) - 3.760381249106394) < 1e-4
-
-    @pytest.mark.smoke
-    def test_UKS(self):
-        print('-------- UKS with D3(BJ) -------')
-        mf = uks.UKS(mol, xc='b3lyp')
-        mf.grids.atom_grid = (99,590)
-        mf.conv_tol = 1e-12
-        mf.conv_tol_cpscf = 1e-8
-        mf.disp = 'd3bj'
-        e_dft = mf.kernel()
-        assert np.abs(e_dft - -685.0325611822375) < 1e-7
-
-        g = mf.nuc_grad_method().kernel()
-        assert np.abs(cupy.linalg.norm(g) - 0.1750368231223345) < 1e-6
-
-        h = mf.Hessian().kernel()
-        assert np.abs(cupy.linalg.norm(h) - 3.760381249106394) < 1e-4
-
-    @pytest.mark.smoke
-    def test_DFRKS_with_SMD(self):
-        print('----- DFRKS with SMD -----')
-        mf = rks.RKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit')
-        mf = mf.SMD()
-        mf.grids.atom_grid = (99,590)
-        mf.conv_tol = 1e-10
-        mf.conv_tol_cpscf = 1e-8
-        mf.disp = 'd3bj'
-        e_dft = mf.kernel()
-        assert np.abs(e_dft - -685.0578838805443) < 1e-7
-
-        g = mf.nuc_grad_method().kernel()
-        assert np.abs(cupy.linalg.norm(g) - 0.16804945458657145) < 1e-5
-
-        h = mf.Hessian().kernel()
-        assert np.abs(cupy.linalg.norm(h) - 3.741783814494321) < 1e-4
-
-    @pytest.mark.smoke
-    def test_DFUKS_with_SMD(self):
-        print('------- DFUKS with SMD ---------')
-        mf = uks.UKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit')
-        mf = mf.SMD()
-        mf.grids.atom_grid = (99,590)
-        mf.conv_tol = 1e-10
-        mf.conv_tol_cpscf = 1e-8
-        mf.disp = 'd3bj'
-        e_dft = mf.kernel()
-        assert np.abs(e_dft - -685.05788388063) < 1e-7
-
-        g = mf.nuc_grad_method().kernel()
-        assert np.abs(cupy.linalg.norm(g) - 0.1680496465773684) < 1e-5
-
-        h = mf.Hessian().kernel()
-        assert np.abs(cupy.linalg.norm(h) - 3.7417788481647563) < 1e-4
-
-if __name__ == "__main__":
-    print("Full Smoke Tests")
-    unittest.main()
diff --git a/gpu4pyscf/tests/test_rks.py b/gpu4pyscf/tests/test_rks.py
new file mode 100644
index 00000000..ebf9d8af
--- /dev/null
+++ b/gpu4pyscf/tests/test_rks.py
@@ -0,0 +1,230 @@
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unittest
+import numpy as np
+import pyscf
+import pytest
+import cupy
+from gpu4pyscf.dft import rks, uks
+
+# Any task taking more than 1000s will be marked as 'slow'
+
+# How to run
+# 1. run test only
+# pytest test_rks.py --benchmark-disable -s -v -m "not slow" --durations=20
+# 2. benchmark less expensive tasks
+# pytest test_rks.py -v -m "not slow"
+# 3. benchmark all the tests
+# pytest test_rks.py -v
+
+current_folder = os.path.dirname(os.path.abspath(__file__))
+small_mol = os.path.join(current_folder, '020_Vitamin_C.xyz')
+median_mol = os.path.join(current_folder, '057_Tamoxifen.xyz')
+large_mol = os.path.join(current_folder, '095_Azadirachtin.xyz')
+
+def run_rb3lyp(atom, basis, with_df, with_solvent, disp=None):
+    mol = pyscf.M(atom=atom, basis=basis, verbose=0)
+    mf = rks.RKS(mol, xc='b3lyp')
+    if with_df:
+        mf = mf.density_fit()
+    if with_solvent:
+        mf = mf.PCM()
+        mf.with_solvent.method = 'IEF-PCM'
+    if disp is not None:
+        mf.disp = disp
+    mf.grids.atom_grid = (99,590)
+    mf.conv_tol = 1e-10
+    return mf.kernel()
+
+def run_rb3lyp_grad(atom, basis, with_df, with_solvent, disp=None):
+    mol = pyscf.M(atom=atom, basis=basis, verbose=0)
+    mf = rks.RKS(mol, xc='b3lyp')
+    if with_df:
+        mf = mf.density_fit()
+    if with_solvent:
+        mf = mf.PCM()
+        mf.with_solvent.method = 'IEF-PCM'
+    if disp is not None:
+        mf.disp = disp
+    mf.grids.atom_grid = (99,590)
+    mf.conv_tol = 1e-10
+    mf.kernel()
+    g = mf.nuc_grad_method().kernel()
+    return g
+
+def run_rb3lyp_hessian(atom, basis, with_df, with_solvent, disp=None):
+    mol = pyscf.M(atom=atom, basis=basis, verbose=0)
+    mf = rks.RKS(mol, xc='b3lyp')
+    if with_df:
+        mf = mf.density_fit()
+    if with_solvent:
+        mf = mf.PCM()
+        mf.with_solvent.method = 'IEF-PCM'
+    if disp is not None:
+        mf.disp = disp
+    mf.grids.atom_grid = (99,590)
+    mf.conv_tol = 1e-10
+    mf.conv_tol_cpscf = 1e-6
+    mf.kernel()
+    h = mf.Hessian().kernel()
+    return h
+
+# DF
+def test_df_rb3lyp(benchmark):
+    e = benchmark(run_rb3lyp, small_mol, 'def2-tzvpp', True, False)
+    print('testing df rb3lyp')
+    assert np.isclose(np.linalg.norm(e), 684.9998712035579, atol=1e-7)
+def test_df_rb3lyp_grad(benchmark):
+    g = benchmark(run_rb3lyp_grad, small_mol, 'def2-tzvpp', True, False)
+    print('testing df rb3lyp grad')
+    assert np.isclose(np.linalg.norm(g), 0.17435941081837686, atol=1e-5)
+@pytest.mark.slow
+def test_df_rb3lyp_hessian(benchmark):
+    h = benchmark(run_rb3lyp_hessian, small_mol, 'def2-tzvpp', True, False)
+    print('testing df rb3lyp hessian')
+    assert np.isclose(np.linalg.norm(h), 3.7668761221997764, atol=1e-4)
+
+# Direct SCF
+def test_rb3lyp(benchmark):
+    e = benchmark(run_rb3lyp, small_mol, 'def2-tzvpp', False, False)
+    print('testing rb3lyp')
+    assert np.isclose(np.linalg.norm(e), 684.999735850967, atol=1e-7)
+def test_rb3lyp_grad(benchmark):
+    g = benchmark(run_rb3lyp_grad, small_mol, 'def2-tzvpp', False, False)
+    print('testing rb3lyp grad')
+    assert np.isclose(np.linalg.norm(g), 0.1744127474130983, atol=1e-5)
+def test_rb3lyp_hessian(benchmark):
+    h = benchmark(run_rb3lyp_hessian, small_mol, 'def2-tzvpp', False, False)
+    print('testing rb3lyp hessian')
+    assert np.isclose(np.linalg.norm(h), 3.7588443634477833, atol=1e-4)
+
+# median molecule
+def test_df_rb3lyp_median(benchmark):
+    e = benchmark(run_rb3lyp, median_mol, 'def2-tzvpp', True, False)
+    print('testing df rb3lyp median')
+    assert np.isclose(np.linalg.norm(e), 1138.371390377773, atol=1e-7)
+def test_df_rb3lyp_grad_median(benchmark):
+    g = benchmark(run_rb3lyp_grad, median_mol, 'def2-tzvpp', True, False)
+    print('testing df rb3lyp grad median')
+    assert np.isclose(np.linalg.norm(g), 0.26010545073602614, atol=1e-4)
+def test_df_rb3lyp_hessian_median(benchmark):
+    h = benchmark(run_rb3lyp_hessian, median_mol, 'def2-tzvpp', True, False)
+    print('testing df rb3lyp hessian median')
+    assert np.isclose(np.linalg.norm(h), 6.32514169232998, atol=1e-4)
+
+def test_rb3lyp_median(benchmark):
+    e = benchmark(run_rb3lyp, median_mol, 'def2-tzvpp', False, False)
+    print('testing rb3lyp median')
+    assert np.isclose(np.linalg.norm(e), 1138.3710752128077, atol=1e-7)
+def test_rb3lyp_grad_median(benchmark):
+    g = benchmark(run_rb3lyp_grad, median_mol, 'def2-tzvpp', False, False)
+    print('testing rb3lyp grad median')
+    assert np.isclose(np.linalg.norm(g), 0.2601443836937988, atol=1e-5)
+@pytest.mark.high_memory
+@pytest.mark.slow
+def test_rb3lyp_hessian_median(benchmark):
+    h = benchmark(run_rb3lyp_hessian, median_mol, 'def2-tzvpp', False, False)
+    print('testing rb3lyp hessian median')
+    assert np.isclose(np.linalg.norm(h))
+
+# large molecule
+def test_df_rb3lyp_large(benchmark):
+    e = benchmark(run_rb3lyp, large_mol, 'def2-tzvpp', True, False)
+    print('testing df rb3lyp large')
+    assert np.isclose(np.linalg.norm(e), 2564.198712152175, atol=1e-7)
+def test_df_rb3lyp_grad_large(benchmark):
+    g = benchmark(run_rb3lyp_grad, large_mol, 'def2-tzvpp', True, False)
+    print('testing df rb3lyp grad large')
+    assert np.isclose(np.linalg.norm(g), 0.3784358687859323, atol=1e-5)
+@pytest.mark.high_memory
+@pytest.mark.slow
+def test_df_rb3lyp_hessian_large(benchmark):
+    h = benchmark(run_rb3lyp_hessian, large_mol, 'def2-tzvpp', True, False)
+    print('testing df rb3lyp hessian large')
+    assert np.isclose(np.linalg.norm(h), 7.583208736873523, atol=1e-4)
+@pytest.mark.slow
+def test_rb3lyp_large(benchmark):
+    e = benchmark(run_rb3lyp, large_mol, 'def2-tzvpp', False, False)
+    print('testing rb3lyp large')
+    assert np.isclose(np.linalg.norm(e), 2564.198099576358, atol=1e-7)
+@pytest.mark.slow
+def test_rb3lyp_grad_large(benchmark):
+    g = benchmark(run_rb3lyp_grad, large_mol, 'def2-tzvpp', False, False)
+    print('testing rb3lyp grad large')
+    assert np.isclose(np.linalg.norm(g), 0.3784664384209763, atol=1e-5)
+@pytest.mark.slow
+def test_rb3lyp_hessian_large(benchmark):
+    h = benchmark(run_rb3lyp_hessian, large_mol, 'def2-tzvpp', False, False)
+    print('testing rb3lyp hessian large')
+    print(np.linalg.norm(h))
+
+# small basis set
+def test_df_rb3lyp_631gs(benchmark):
+    e = benchmark(run_rb3lyp, small_mol, '6-31gs', True, False)
+    print('testing df rb3lyp 631gs')
+    assert np.isclose(np.linalg.norm(e), 684.6646008642876, atol=1e-7)
+def test_df_rb3lyp_631gs_grad(benchmark):
+    g = benchmark(run_rb3lyp_grad, small_mol, '6-31gs', True, False)
+    print('testing df rb3lyp 631gs grad')
+    assert np.isclose(np.linalg.norm(g), 0.17530687343398219, atol=1e-5)
+def test_df_rb3lyp_631gs_hessian(benchmark):
+    h = benchmark(run_rb3lyp_hessian, small_mol, '6-31gs', True, False)
+    print('testing df rb3lyp 631gs hessian')
+    assert np.isclose(np.linalg.norm(h), 3.908874851569459, atol=1e-4)
+
+# small basis set for large molecule
+def test_rb3lyp_631gs_large(benchmark):
+    e = benchmark(run_rb3lyp, large_mol, '6-31gs', False, False)
+    print('testing rb3lyp 631gs large')
+    assert np.isclose(np.linalg.norm(e), 2563.1171191823423, atol=1e-7)
+def test_rb3lyp_631gs_grad_large(benchmark):
+    g = benchmark(run_rb3lyp_grad, large_mol, '6-31gs', False, False)
+    print('testing df rb3lyp 631gs grad large')
+    assert np.isclose(np.linalg.norm(g), 0.37778228700247984, atol=1e-5)
+@pytest.mark.slow
+def test_rb3lyp_631gs_hessian_large(benchmark):
+    h = benchmark(run_rb3lyp_hessian, large_mol, '6-31gs', False, False)
+    print('testing df rb3lyp 631gs hessian large')
+    assert np.isclose(np.linalg.norm(h), 7.920764634100053, atol=1e-4)
+
+#solvent model
+def test_df_rb3lyp_631gs_solvent(benchmark):
+    e = benchmark(run_rb3lyp, small_mol, '6-31gs', True, True)
+    print('testing df rb3lyp 631gs solvent')
+    assert np.isclose(np.linalg.norm(e), 684.6985561053816, atol=1e-7)
+def test_df_rb3lyp_631gs_solvent_grad(benchmark):
+    g = benchmark(run_rb3lyp_grad, small_mol, '6-31gs', True, True)
+    print('testing df rb3lyp 631gs solvent grad')
+    assert np.isclose(np.linalg.norm(g), 0.16956999476137297, atol=1e-5)
+def test_df_rb3lyp_631gs_solvent_hessian(benchmark):
+    h = benchmark(run_rb3lyp_hessian, small_mol, '6-31gs', True, True)
+    print('testing df rb3lyp 631gs solvent hessian')
+    assert np.isclose(np.linalg.norm(h), 3.9008165041707294, atol=1e-4)
+
+# b3lyp d3bj
+def test_df_rb3lyp_631gs_d3bj(benchmark):
+    e = benchmark(run_rb3lyp, small_mol, '6-31gs', True, True, 'd3bj')
+    print('testing df rb3lyp 631gs solvent')
+    assert np.isclose(np.linalg.norm(e), 684.7313814096565, atol=1e-7)
+def test_df_rb3lyp_631gs_d3bj_grad(benchmark):
+    g = benchmark(run_rb3lyp_grad, small_mol, '6-31gs', True, True, 'd3bj')
+    print('testing df rb3lyp 631gs solvent grad')
+    assert np.isclose(np.linalg.norm(g), 0.17010044498887264, atol=1e-5)
+def test_df_rb3lyp_631gs_d3bj_hessian(benchmark):
+    h = benchmark(run_rb3lyp_hessian, small_mol, '6-31gs', True, True, 'd3bj')
+    print('testing df rb3lyp 631gs solvent hessian')
+    assert np.isclose(np.linalg.norm(h), 3.902367554157861, atol=1e-4)
+
diff --git a/gpu4pyscf/tests/test_uks.py b/gpu4pyscf/tests/test_uks.py
new file mode 100644
index 00000000..0e426f17
--- /dev/null
+++ b/gpu4pyscf/tests/test_uks.py
@@ -0,0 +1,92 @@
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unittest
+import numpy as np
+import pyscf
+import pytest
+import cupy
+from gpu4pyscf.dft import rks, uks
+
+current_folder = os.path.dirname(os.path.abspath(__file__))
+small_mol = os.path.join(current_folder, '020_Vitamin_C.xyz')
+
+def run_ub3lyp(atom, basis, with_df, with_solvent):
+    mol = pyscf.M(atom=atom, basis=basis, verbose=0)
+    mf = uks.UKS(mol, xc='b3lyp')
+    if with_df:
+        mf = mf.density_fit()
+    if with_solvent:
+        mf = mf.PCM()
+        mf.with_solvent.method = 'IEF-PCM'
+    mf.grids.atom_grid = (99,590)
+    mf.conv_tol = 1e-10
+    return mf.kernel()
+
+def run_ub3lyp_grad(atom, basis, with_df, with_solvent):
+    mol = pyscf.M(atom=atom, basis=basis, verbose=0)
+    mf = uks.UKS(mol, xc='b3lyp')
+    if with_df:
+        mf = mf.density_fit()
+    if with_solvent:
+        mf = mf.PCM()
+        mf.with_solvent.method = 'IEF-PCM'
+    mf.grids.atom_grid = (99,590)
+    mf.conv_tol = 1e-10
+    mf.kernel()
+    g = mf.nuc_grad_method().kernel()
+    return g
+
+def run_ub3lyp_hessian(atom, basis, with_df, with_solvent):
+    mol = pyscf.M(atom=atom, basis=basis, verbose=0)
+    mf = uks.UKS(mol, xc='b3lyp')
+    if with_df:
+        mf = mf.density_fit()
+    if with_solvent:
+        mf = mf.PCM()
+        mf.with_solvent.method = 'IEF-PCM'
+    mf.grids.atom_grid = (99,590)
+    mf.conv_tol = 1e-10
+    mf.conv_tol_cpscf = 1e-6
+    mf.kernel()
+    h = mf.Hessian().kernel()
+    return h
+
+
+# UKS
+def test_df_ub3lyp(benchmark):
+    e = benchmark(run_ub3lyp, small_mol, 'def2-tzvpp', True, False)
+    print('testing df ub3lyp')
+    assert np.isclose(np.linalg.norm(e), 684.9998712035856, atol=1e-7)
+def test_df_ub3lyp_grad(benchmark):
+    g = benchmark(run_ub3lyp_grad, small_mol, 'def2-tzvpp', True, False)
+    print('testing df ub3lyp grad')
+    assert np.isclose(np.linalg.norm(g), 0.17435842214665462, atol=1e-5)
+def test_df_ub3lyp_hessian(benchmark):
+    h = benchmark(run_ub3lyp_hessian, small_mol, 'def2-tzvpp', True, False)
+    print('testing df ub3lyp hessian')
+    assert np.isclose(np.linalg.norm(h), 3.7669464279078064, atol=1e-4)
+def test_ub3lyp(benchmark):
+    e = benchmark(run_ub3lyp, small_mol, 'def2-tzvpp', False, False)
+    print('testing ub3lyp')
+    assert np.isclose(np.linalg.norm(e), 684.9997358509884, atol=1e-7)
+def test_ub3lyp_grad(benchmark):
+    g = benchmark(run_ub3lyp_grad, small_mol, 'def2-tzvpp', False, False)
+    print('testing ub3lyp grad')
+    assert np.isclose(np.linalg.norm(g), 0.17441176110160253, atol=1e-5)
+def test_ub3lyp_hessian(benchmark):
+    h = benchmark(run_ub3lyp_hessian, small_mol, 'def2-tzvpp', False, False)
+    print('testing ub3lyp hessian')
+    assert np.isclose(np.linalg.norm(h), 3.758916526520172, atol=1e-4)

From fa6ac932eb932fd0ffaf911b989c35fb217db729 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Thu, 26 Dec 2024 18:06:36 +0000
Subject: [PATCH 17/49] assert hermi==1

---
 gpu4pyscf/df/hessian/jk.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gpu4pyscf/df/hessian/jk.py b/gpu4pyscf/df/hessian/jk.py
index f8992ca3..db77e4db 100644
--- a/gpu4pyscf/df/hessian/jk.py
+++ b/gpu4pyscf/df/hessian/jk.py
@@ -32,6 +32,7 @@ def _jk_task_with_mo1(dfobj, dms, mo_coeff, mo1s, occ_coeffs,
     ''' Calculate J and K matrices with mo response
         For CP-HF
     '''
+    assert hermi == 1
     with cupy.cuda.Device(device_id), _streams[device_id]:
         assert isinstance(dfobj.verbose, int)
         log = logger.new_logger(dfobj.mol, dfobj.verbose)

From 9d9ff1e0920922372062db2cdd35d8d73adc73b4 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Thu, 26 Dec 2024 18:42:25 +0000
Subject: [PATCH 18/49] typo in uhf.hessian

---
 gpu4pyscf/hessian/rhf.py | 2 +-
 gpu4pyscf/hessian/uhf.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py
index 53be3349..c8d407d7 100644
--- a/gpu4pyscf/hessian/rhf.py
+++ b/gpu4pyscf/hessian/rhf.py
@@ -659,7 +659,7 @@ def fvind_vo(mo1):
     avail_mem = get_avail_mem()
     # *4 for input dm, vj, vk, and vxc
     blksize = int(min(avail_mem*.3 / (8*3*nao*nocc*4), # in MO
-                      avail_mem*.3 / (8*nmo*nmo*3*3))) # vj, vk, dm in AO
+                      avail_mem*.3 / (8*nao*nao*3*3))) # vj, vk, dm in AO
     if blksize < ALIGNED**2:
         raise RuntimeError('GPU memory insufficient for solving CPHF equations')
 
diff --git a/gpu4pyscf/hessian/uhf.py b/gpu4pyscf/hessian/uhf.py
index 0f695b2b..88a6c9fd 100644
--- a/gpu4pyscf/hessian/uhf.py
+++ b/gpu4pyscf/hessian/uhf.py
@@ -292,8 +292,8 @@ def fvind_vo(mo1):
 
     avail_mem = get_avail_mem()
     # *8 for spin-up/down input dm, vj, vk, and vxc
-    blksize = int(min(avail_mem*.3 / (8*3*nao*nao*8),
-                      avail_mem*.3 / (8*nmo*nmo*3*6)))  # in vj, vk, dm in AO
+    blksize = int(min(avail_mem*.3 / (8*3*nao*nocc*8),
+                      avail_mem*.3 / (8*nao*nao*3*6)))  # in vj, vk, dm in AO
     if blksize < ALIGNED**2:
         raise RuntimeError('GPU memory insufficient')
 

From 5c9e2e69406c10e91d7a6c15e834121ac1cedc6d Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Fri, 27 Dec 2024 00:34:19 +0000
Subject: [PATCH 19/49] inject gen_response into soscf

---
 gpu4pyscf/df/df.py                |   1 +
 gpu4pyscf/grad/rhf.py             |  73 ++++++------
 gpu4pyscf/hessian/jk.py           |  69 +++++------
 gpu4pyscf/hessian/rhf.py          | 185 +++++++++++++++---------------
 gpu4pyscf/scf/jk.py               |  69 +++++------
 gpu4pyscf/scf/soscf.py            |   2 +-
 gpu4pyscf/scf/tests/test_soscf.py |  12 +-
 7 files changed, 213 insertions(+), 198 deletions(-)

diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py
index 52b0ecf8..4991af43 100644
--- a/gpu4pyscf/df/df.py
+++ b/gpu4pyscf/df/df.py
@@ -147,6 +147,7 @@ def loop(self, blksize=None, unpack=True):
             and unpack the CDERI in (Lij) format
         '''
         device_id = cupy.cuda.Device().id
+        print(self._cderi.keys(), device_id)
         cderi_sparse = self._cderi[device_id]
         if blksize is None:
             blksize = self.get_blksize()
diff --git a/gpu4pyscf/grad/rhf.py b/gpu4pyscf/grad/rhf.py
index c3390e95..0ee8cd43 100644
--- a/gpu4pyscf/grad/rhf.py
+++ b/gpu4pyscf/grad/rhf.py
@@ -79,43 +79,41 @@ def _ejk_ip1_task(mol, dms, vhfopt, task_list, j_factor=1.0, k_factor=1.0,
         info = cp.empty(2, dtype=np.uint32)
         t1 = log.timer_debug1(f'q_cond and dm_cond on Device {device_id}', *cput0)
 
-        for i, j in task_list:
+        for i, j, k, l in task_list:
             ij_shls = (l_ctr_bas_loc[i], l_ctr_bas_loc[i+1],
                        l_ctr_bas_loc[j], l_ctr_bas_loc[j+1])
             tile_ij_mapping = tile_mappings[i,j]
-            for k in range(i+1):
-                for l in range(k+1):
-                    llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})'
-                    kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1],
-                               l_ctr_bas_loc[l], l_ctr_bas_loc[l+1])
-                    tile_kl_mapping = tile_mappings[k,l]
-                    scheme = _ejk_quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]])
-                    err = kern(
-                        ctypes.cast(ejk.data.ptr, ctypes.c_void_p),
-                        ctypes.c_double(j_factor), ctypes.c_double(k_factor),
-                        ctypes.cast(dms.data.ptr, ctypes.c_void_p),
-                        ctypes.c_int(n_dm), ctypes.c_int(nao),
-                        vhfopt.rys_envs, (ctypes.c_int*2)(*scheme),
-                        (ctypes.c_int*8)(*ij_shls, *kl_shls),
-                        ctypes.c_int(tile_ij_mapping.size),
-                        ctypes.c_int(tile_kl_mapping.size),
-                        ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p),
-                        ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p),
-                        tile_q_ptr, q_ptr, s_ptr,
-                        ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p),
-                        ctypes.c_float(log_cutoff),
-                        ctypes.cast(pool.data.ptr, ctypes.c_void_p),
-                        ctypes.cast(info.data.ptr, ctypes.c_void_p),
-                        ctypes.c_int(workers),
-                        mol._atm.ctypes, ctypes.c_int(mol.natm),
-                        mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes)
-                    if err != 0:
-                        raise RuntimeError(f'RYS_per_atom_jk_ip1 kernel for {llll} failed')
-                    if log.verbose >= logger.DEBUG1:
-                        msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}'
-                        t1, t1p = log.timer_debug1(msg, *t1), t1
-                        timing_counter[llll] += t1[1] - t1p[1]
-                        kern_counts += 1
+            llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})'
+            kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1],
+                        l_ctr_bas_loc[l], l_ctr_bas_loc[l+1])
+            tile_kl_mapping = tile_mappings[k,l]
+            scheme = _ejk_quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]])
+            err = kern(
+                ctypes.cast(ejk.data.ptr, ctypes.c_void_p),
+                ctypes.c_double(j_factor), ctypes.c_double(k_factor),
+                ctypes.cast(dms.data.ptr, ctypes.c_void_p),
+                ctypes.c_int(n_dm), ctypes.c_int(nao),
+                vhfopt.rys_envs, (ctypes.c_int*2)(*scheme),
+                (ctypes.c_int*8)(*ij_shls, *kl_shls),
+                ctypes.c_int(tile_ij_mapping.size),
+                ctypes.c_int(tile_kl_mapping.size),
+                ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p),
+                ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p),
+                tile_q_ptr, q_ptr, s_ptr,
+                ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p),
+                ctypes.c_float(log_cutoff),
+                ctypes.cast(pool.data.ptr, ctypes.c_void_p),
+                ctypes.cast(info.data.ptr, ctypes.c_void_p),
+                ctypes.c_int(workers),
+                mol._atm.ctypes, ctypes.c_int(mol.natm),
+                mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes)
+            if err != 0:
+                raise RuntimeError(f'RYS_per_atom_jk_ip1 kernel for {llll} failed')
+            if log.verbose >= logger.DEBUG1:
+                msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}'
+                t1, t1p = log.timer_debug1(msg, *t1), t1
+                timing_counter[llll] += t1[1] - t1p[1]
+                kern_counts += 1
     return ejk, kern_counts, timing_counter
 
 def _jk_energy_per_atom(mol, dm, vhfopt=None,
@@ -145,7 +143,12 @@ def _jk_energy_per_atom(mol, dm, vhfopt=None,
     assert uniq_l.max() <= LMAX
 
     n_groups = len(uniq_l_ctr)
-    tasks = [(i,j) for i in range(n_groups) for j in range(i+1)]
+    tasks = []
+    for i in range(n_groups):
+        for j in range(i+1):
+            for k in range(i+1):
+                for l in range(k+1):
+                    tasks.append((i,j,k,l))
     tasks = np.array(tasks)
     task_list = []
     for device_id in range(_num_devices):
diff --git a/gpu4pyscf/hessian/jk.py b/gpu4pyscf/hessian/jk.py
index a1cd6105..6f17488d 100644
--- a/gpu4pyscf/hessian/jk.py
+++ b/gpu4pyscf/hessian/jk.py
@@ -91,41 +91,39 @@ def _jk_task(mol, dms, mo_coeff, mo_occ, vhfopt, task_list, hermi=0,
         info = cp.empty(2, dtype=np.uint32)
         t1 = log.timer_debug1(f'q_cond and dm_cond on Device {device_id}', *cput0)
 
-        for i, j in task_list:
+        for i, j, k, l in task_list:
             ij_shls = (l_ctr_bas_loc[i], l_ctr_bas_loc[i+1],
                        l_ctr_bas_loc[j], l_ctr_bas_loc[j+1])
             tile_ij_mapping = tile_mappings[i,j]
-            for k in range(i+1):
-                for l in range(k+1):
-                    llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})'
-                    kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1],
-                                l_ctr_bas_loc[l], l_ctr_bas_loc[l+1])
-                    tile_kl_mapping = tile_mappings[k,l]
-                    scheme = quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]])
-                    err = kern(
-                        vj_ptr, vk_ptr, ctypes.cast(dms.data.ptr, ctypes.c_void_p),
-                        ctypes.c_int(n_dm), ctypes.c_int(nao),
-                        vhfopt.rys_envs, (ctypes.c_int*2)(*scheme),
-                        (ctypes.c_int*8)(*ij_shls, *kl_shls),
-                        ctypes.c_int(tile_ij_mapping.size),
-                        ctypes.c_int(tile_kl_mapping.size),
-                        ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p),
-                        ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p),
-                        tile_q_ptr, q_ptr, s_ptr,
-                        ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p),
-                        ctypes.c_float(log_cutoff),
-                        ctypes.cast(pool.data.ptr, ctypes.c_void_p),
-                        ctypes.cast(info.data.ptr, ctypes.c_void_p),
-                        ctypes.c_int(workers),
-                        mol._atm.ctypes, ctypes.c_int(mol.natm),
-                        mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes)
-                    if err != 0:
-                        raise RuntimeError(f'RYS_build_jk kernel for {llll} failed')
-                    if log.verbose >= logger.DEBUG1:
-                        msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}'
-                        t1, t1p = log.timer_debug1(msg, *t1), t1
-                        timing_counter[llll] += t1[1] - t1p[1]
-                        kern_counts += 1
+            llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})'
+            kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1],
+                        l_ctr_bas_loc[l], l_ctr_bas_loc[l+1])
+            tile_kl_mapping = tile_mappings[k,l]
+            scheme = quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]])
+            err = kern(
+                vj_ptr, vk_ptr, ctypes.cast(dms.data.ptr, ctypes.c_void_p),
+                ctypes.c_int(n_dm), ctypes.c_int(nao),
+                vhfopt.rys_envs, (ctypes.c_int*2)(*scheme),
+                (ctypes.c_int*8)(*ij_shls, *kl_shls),
+                ctypes.c_int(tile_ij_mapping.size),
+                ctypes.c_int(tile_kl_mapping.size),
+                ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p),
+                ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p),
+                tile_q_ptr, q_ptr, s_ptr,
+                ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p),
+                ctypes.c_float(log_cutoff),
+                ctypes.cast(pool.data.ptr, ctypes.c_void_p),
+                ctypes.cast(info.data.ptr, ctypes.c_void_p),
+                ctypes.c_int(workers),
+                mol._atm.ctypes, ctypes.c_int(mol.natm),
+                mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes)
+            if err != 0:
+                raise RuntimeError(f'RYS_build_jk kernel for {llll} failed')
+            if log.verbose >= logger.DEBUG1:
+                msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}'
+                t1, t1p = log.timer_debug1(msg, *t1), t1
+                timing_counter[llll] += t1[1] - t1p[1]
+                kern_counts += 1
         if with_j:
             vj *= 2.0
             vj = transpose_sum(vj)
@@ -192,7 +190,12 @@ def get_jk(mol, dm, mo_coeff, mo_occ, hermi=0, vhfopt=None,
     l_symb = [lib.param.ANGULAR[i] for i in uniq_l]
     n_groups = np.count_nonzero(uniq_l <= LMAX)
 
-    tasks = [(i,j) for i in range(n_groups) for j in range(i+1)]
+    tasks = []
+    for i in range(n_groups):
+        for j in range(i+1):
+            for k in range(i+1):
+                for l in range(k+1):
+                    tasks.append((i,j,k,l))
     tasks = np.array(tasks)
     task_list = []
     for device_id in range(_num_devices):
diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py
index c8d407d7..d7596d13 100644
--- a/gpu4pyscf/hessian/rhf.py
+++ b/gpu4pyscf/hessian/rhf.py
@@ -204,62 +204,60 @@ def _ejk_ip2_task(mol, dms, vhfopt, task_list, j_factor=1.0, k_factor=1.0,
         info = cp.empty(2, dtype=np.uint32)
         t1 = log.timer_debug1(f'q_cond and dm_cond on Device {device_id}', *cput0)
 
-        for i, j in task_list:
+        for i, j, k, l in task_list:
             ij_shls = (l_ctr_bas_loc[i], l_ctr_bas_loc[i+1],
                        l_ctr_bas_loc[j], l_ctr_bas_loc[j+1])
             tile_ij_mapping = tile_mappings[i,j]
-            for k in range(i+1):
-                for l in range(k+1):
-                    llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})'
-                    kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1],
-                               l_ctr_bas_loc[l], l_ctr_bas_loc[l+1])
-                    tile_kl_mapping = tile_mappings[k,l]
-                    scheme = _ip2_quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]])
-                    err1 = kern1(
-                        ctypes.cast(ejk.data.ptr, ctypes.c_void_p),
-                        ctypes.c_double(j_factor), ctypes.c_double(k_factor),
-                        ctypes.cast(dms.data.ptr, ctypes.c_void_p),
-                        ctypes.c_int(n_dm), ctypes.c_int(nao),
-                        vhfopt.rys_envs, (ctypes.c_int*2)(*scheme),
-                        (ctypes.c_int*8)(*ij_shls, *kl_shls),
-                        ctypes.c_int(tile_ij_mapping.size),
-                        ctypes.c_int(tile_kl_mapping.size),
-                        ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p),
-                        ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p),
-                        tile_q_ptr, q_ptr, s_ptr,
-                        ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p),
-                        ctypes.c_float(log_cutoff),
-                        ctypes.cast(pool.data.ptr, ctypes.c_void_p),
-                        ctypes.cast(info.data.ptr, ctypes.c_void_p),
-                        ctypes.c_int(workers),
-                        mol._atm.ctypes, ctypes.c_int(mol.natm),
-                        mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes)
-                    err2 = kern2(
-                        ctypes.cast(ejk.data.ptr, ctypes.c_void_p),
-                        ctypes.c_double(j_factor), ctypes.c_double(k_factor),
-                        ctypes.cast(dms.data.ptr, ctypes.c_void_p),
-                        ctypes.c_int(n_dm), ctypes.c_int(nao),
-                        vhfopt.rys_envs, (ctypes.c_int*2)(*scheme),
-                        (ctypes.c_int*8)(*ij_shls, *kl_shls),
-                        ctypes.c_int(tile_ij_mapping.size),
-                        ctypes.c_int(tile_kl_mapping.size),
-                        ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p),
-                        ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p),
-                        tile_q_ptr, q_ptr, s_ptr,
-                        ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p),
-                        ctypes.c_float(log_cutoff),
-                        ctypes.cast(pool.data.ptr, ctypes.c_void_p),
-                        ctypes.cast(info.data.ptr, ctypes.c_void_p),
-                        ctypes.c_int(workers),
-                        mol._atm.ctypes, ctypes.c_int(mol.natm),
-                        mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes)
-                    if err1 != 0 or err2 != 0:
-                        raise RuntimeError(f'RYS_per_atom_jk_ip2 kernel for {llll} failed')
-                    if log.verbose >= logger.DEBUG1:
-                        msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}'
-                        t1, t1p = log.timer_debug1(msg, *t1), t1
-                        timing_counter[llll] += t1[1] - t1p[1]
-                        kern_counts += 1
+            llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})'
+            kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1],
+                        l_ctr_bas_loc[l], l_ctr_bas_loc[l+1])
+            tile_kl_mapping = tile_mappings[k,l]
+            scheme = _ip2_quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]])
+            err1 = kern1(
+                ctypes.cast(ejk.data.ptr, ctypes.c_void_p),
+                ctypes.c_double(j_factor), ctypes.c_double(k_factor),
+                ctypes.cast(dms.data.ptr, ctypes.c_void_p),
+                ctypes.c_int(n_dm), ctypes.c_int(nao),
+                vhfopt.rys_envs, (ctypes.c_int*2)(*scheme),
+                (ctypes.c_int*8)(*ij_shls, *kl_shls),
+                ctypes.c_int(tile_ij_mapping.size),
+                ctypes.c_int(tile_kl_mapping.size),
+                ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p),
+                ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p),
+                tile_q_ptr, q_ptr, s_ptr,
+                ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p),
+                ctypes.c_float(log_cutoff),
+                ctypes.cast(pool.data.ptr, ctypes.c_void_p),
+                ctypes.cast(info.data.ptr, ctypes.c_void_p),
+                ctypes.c_int(workers),
+                mol._atm.ctypes, ctypes.c_int(mol.natm),
+                mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes)
+            err2 = kern2(
+                ctypes.cast(ejk.data.ptr, ctypes.c_void_p),
+                ctypes.c_double(j_factor), ctypes.c_double(k_factor),
+                ctypes.cast(dms.data.ptr, ctypes.c_void_p),
+                ctypes.c_int(n_dm), ctypes.c_int(nao),
+                vhfopt.rys_envs, (ctypes.c_int*2)(*scheme),
+                (ctypes.c_int*8)(*ij_shls, *kl_shls),
+                ctypes.c_int(tile_ij_mapping.size),
+                ctypes.c_int(tile_kl_mapping.size),
+                ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p),
+                ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p),
+                tile_q_ptr, q_ptr, s_ptr,
+                ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p),
+                ctypes.c_float(log_cutoff),
+                ctypes.cast(pool.data.ptr, ctypes.c_void_p),
+                ctypes.cast(info.data.ptr, ctypes.c_void_p),
+                ctypes.c_int(workers),
+                mol._atm.ctypes, ctypes.c_int(mol.natm),
+                mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes)
+            if err1 != 0 or err2 != 0:
+                raise RuntimeError(f'RYS_per_atom_jk_ip2 kernel for {llll} failed')
+            if log.verbose >= logger.DEBUG1:
+                msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}'
+                t1, t1p = log.timer_debug1(msg, *t1), t1
+                timing_counter[llll] += t1[1] - t1p[1]
+                kern_counts += 1
 
         ejk = ejk + ejk.transpose(1,0,3,2)
     return ejk, kern_counts, timing_counter
@@ -286,7 +284,12 @@ def _partial_ejk_ip2(mol, dm, vhfopt=None, j_factor=1., k_factor=1., verbose=Non
     assert uniq_l.max() <= LMAX
 
     n_groups = len(uniq_l_ctr)
-    tasks = [(i,j) for i in range(n_groups) for j in range(i+1)]
+    tasks = []
+    for i in range(n_groups):
+        for j in range(i+1):
+            for k in range(i+1):
+                for l in range(k+1):
+                    tasks.append((i,j,k,l))
     tasks = np.array(tasks)
     task_list = []
     for device_id in range(_num_devices):
@@ -394,7 +397,6 @@ def _build_jk_ip1_task(mol, dms, vhfopt, task_list, atoms_slice,
     uniq_l = uniq_l_ctr[:,0]
     l_ctr_bas_loc = vhfopt.l_ctr_offsets
     l_symb = [lib.param.ANGULAR[i] for i in uniq_l]
-    n_groups = len(uniq_l_ctr)
     kern = libvhf_rys.RYS_build_jk_ip1
 
     timing_counter = Counter()
@@ -426,7 +428,7 @@ def _build_jk_ip1_task(mol, dms, vhfopt, task_list, atoms_slice,
         info = cp.empty(2, dtype=np.uint32)
         t1 = log.timer_debug1(f'q_cond and dm_cond on Device {device_id}', *cput0)
 
-        for i, j in task_list:
+        for i, j, k, l in task_list:
             ij_shls = (l_ctr_bas_loc[i], l_ctr_bas_loc[i+1],
                        l_ctr_bas_loc[j], l_ctr_bas_loc[j+1])
             ish0, ish1 = l_ctr_bas_loc[i], l_ctr_bas_loc[i+1]
@@ -441,39 +443,37 @@ def _build_jk_ip1_task(mol, dms, vhfopt, task_list, atoms_slice,
                     cp.arange(jsh0, jsh1, dtype=np.int32))
             idx = cp.argsort(sub_tile_q[mask])[::-1]
             tile_ij_mapping = t_ij[mask][idx]
-            for k in range(n_groups):
-                for l in range(k+1):
-                    llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})'
-                    kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1],
-                               l_ctr_bas_loc[l], l_ctr_bas_loc[l+1])
-                    tile_kl_mapping = tril_tile_mappings[k,l]
-                    scheme = _ip1_quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]])
-                    err = kern(
-                        vj_ptr, vk_ptr, ctypes.cast(dms.data.ptr, ctypes.c_void_p),
-                        ctypes.c_int(n_dm), ctypes.c_int(nao), ctypes.c_int(atom0),
-                        vhfopt.rys_envs, (ctypes.c_int*2)(*scheme),
-                        (ctypes.c_int*8)(*ij_shls, *kl_shls),
-                        ctypes.c_int(tile_ij_mapping.size),
-                        ctypes.c_int(tile_kl_mapping.size),
-                        ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p),
-                        ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p),
-                        ctypes.cast(vhfopt.tile_q_cond.data.ptr, ctypes.c_void_p),
-                        ctypes.cast(vhfopt.q_cond.data.ptr, ctypes.c_void_p),
-                        lib.c_null_ptr(),
-                        ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p),
-                        ctypes.c_float(log_cutoff),
-                        ctypes.cast(pool.data.ptr, ctypes.c_void_p),
-                        ctypes.cast(info.data.ptr, ctypes.c_void_p),
-                        ctypes.c_int(workers),
-                        mol._atm.ctypes, ctypes.c_int(mol.natm),
-                        mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes)
-                    if err != 0:
-                        raise RuntimeError(f'RYS_build_jk kernel for {llll} failed')
-                    if log.verbose >= logger.DEBUG1:
-                        msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}'
-                        t1, t1p = log.timer_debug1(msg, *t1), t1
-                        timing_counter[llll] += t1[1] - t1p[1]
-                        kern_counts += 1
+            llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})'
+            kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1],
+                        l_ctr_bas_loc[l], l_ctr_bas_loc[l+1])
+            tile_kl_mapping = tril_tile_mappings[k,l]
+            scheme = _ip1_quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]])
+            err = kern(
+                vj_ptr, vk_ptr, ctypes.cast(dms.data.ptr, ctypes.c_void_p),
+                ctypes.c_int(n_dm), ctypes.c_int(nao), ctypes.c_int(atom0),
+                vhfopt.rys_envs, (ctypes.c_int*2)(*scheme),
+                (ctypes.c_int*8)(*ij_shls, *kl_shls),
+                ctypes.c_int(tile_ij_mapping.size),
+                ctypes.c_int(tile_kl_mapping.size),
+                ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p),
+                ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p),
+                ctypes.cast(vhfopt.tile_q_cond.data.ptr, ctypes.c_void_p),
+                ctypes.cast(vhfopt.q_cond.data.ptr, ctypes.c_void_p),
+                lib.c_null_ptr(),
+                ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p),
+                ctypes.c_float(log_cutoff),
+                ctypes.cast(pool.data.ptr, ctypes.c_void_p),
+                ctypes.cast(info.data.ptr, ctypes.c_void_p),
+                ctypes.c_int(workers),
+                mol._atm.ctypes, ctypes.c_int(mol.natm),
+                mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes)
+            if err != 0:
+                raise RuntimeError(f'RYS_build_jk kernel for {llll} failed')
+            if log.verbose >= logger.DEBUG1:
+                msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}'
+                t1, t1p = log.timer_debug1(msg, *t1), t1
+                timing_counter[llll] += t1[1] - t1p[1]
+                kern_counts += 1
     return vj, vk, kern_counts, timing_counter
 
 def _get_jk_ip1(mol, dm, with_j=True, with_k=True, atoms_slice=None, verbose=None):
@@ -516,7 +516,12 @@ def _get_jk_ip1(mol, dm, with_j=True, with_k=True, atoms_slice=None, verbose=Non
     assert vhfopt.tile_q_cond.shape == (nbas, nbas)
 
     n_groups = len(uniq_l_ctr)
-    tasks = [(i,j) for i in range(n_groups) for j in range(n_groups)]
+    tasks = []
+    for i in range(n_groups):
+        for j in range(n_groups):
+            for k in range(n_groups):
+                for l in range(k+1):
+                    tasks.append((i,j,k,l))
     tasks = np.array(tasks)
     task_list = []
     for device_id in range(_num_devices):
diff --git a/gpu4pyscf/scf/jk.py b/gpu4pyscf/scf/jk.py
index 8e09a35b..38d75ee3 100644
--- a/gpu4pyscf/scf/jk.py
+++ b/gpu4pyscf/scf/jk.py
@@ -107,41 +107,39 @@ def _jk_task(mol, dms, vhfopt, task_list, hermi=0,
         info = cp.empty(2, dtype=np.uint32)
         t1 = log.timer_debug1(f'q_cond and dm_cond on Device {device_id}', *cput0)
 
-        for i, j in task_list:
+        for i, j, k, l in task_list:
             ij_shls = (l_ctr_bas_loc[i], l_ctr_bas_loc[i+1],
                        l_ctr_bas_loc[j], l_ctr_bas_loc[j+1])
             tile_ij_mapping = tile_mappings[i,j]
-            for k in range(i+1):
-                for l in range(k+1):
-                    llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})'
-                    kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1],
-                                l_ctr_bas_loc[l], l_ctr_bas_loc[l+1])
-                    tile_kl_mapping = tile_mappings[k,l]
-                    scheme = quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]])
-                    err = kern(
-                        vj_ptr, vk_ptr, ctypes.cast(dms.data.ptr, ctypes.c_void_p),
-                        ctypes.c_int(n_dm), ctypes.c_int(nao),
-                        vhfopt.rys_envs, (ctypes.c_int*2)(*scheme),
-                        (ctypes.c_int*8)(*ij_shls, *kl_shls),
-                        ctypes.c_int(tile_ij_mapping.size),
-                        ctypes.c_int(tile_kl_mapping.size),
-                        ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p),
-                        ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p),
-                        tile_q_ptr, q_ptr, s_ptr,
-                        ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p),
-                        ctypes.c_float(log_cutoff),
-                        ctypes.cast(pool.data.ptr, ctypes.c_void_p),
-                        ctypes.cast(info.data.ptr, ctypes.c_void_p),
-                        ctypes.c_int(workers),
-                        mol._atm.ctypes, ctypes.c_int(mol.natm),
-                        mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes)
-                    if err != 0:
-                        raise RuntimeError(f'RYS_build_jk kernel for {llll} failed')
-                    if log.verbose >= logger.DEBUG1:
-                        msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}'
-                        t1, t1p = log.timer_debug1(msg, *t1), t1
-                        timing_counter[llll] += t1[1] - t1p[1]
-                        kern_counts += 1
+            llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})'
+            kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1],
+                        l_ctr_bas_loc[l], l_ctr_bas_loc[l+1])
+            tile_kl_mapping = tile_mappings[k,l]
+            scheme = quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]])
+            err = kern(
+                vj_ptr, vk_ptr, ctypes.cast(dms.data.ptr, ctypes.c_void_p),
+                ctypes.c_int(n_dm), ctypes.c_int(nao),
+                vhfopt.rys_envs, (ctypes.c_int*2)(*scheme),
+                (ctypes.c_int*8)(*ij_shls, *kl_shls),
+                ctypes.c_int(tile_ij_mapping.size),
+                ctypes.c_int(tile_kl_mapping.size),
+                ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p),
+                ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p),
+                tile_q_ptr, q_ptr, s_ptr,
+                ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p),
+                ctypes.c_float(log_cutoff),
+                ctypes.cast(pool.data.ptr, ctypes.c_void_p),
+                ctypes.cast(info.data.ptr, ctypes.c_void_p),
+                ctypes.c_int(workers),
+                mol._atm.ctypes, ctypes.c_int(mol.natm),
+                mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes)
+            if err != 0:
+                raise RuntimeError(f'RYS_build_jk kernel for {llll} failed')
+            if log.verbose >= logger.DEBUG1:
+                msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}'
+                t1, t1p = log.timer_debug1(msg, *t1), t1
+                timing_counter[llll] += t1[1] - t1p[1]
+                kern_counts += 1
         if with_j:
             if hermi == 1:
                 vj *= 2.
@@ -185,7 +183,12 @@ def get_jk(mol, dm, hermi=0, vhfopt=None, with_j=True, with_k=True, verbose=None
     l_symb = [lib.param.ANGULAR[i] for i in uniq_l]
     n_groups = np.count_nonzero(uniq_l <= LMAX)
 
-    tasks = [(i,j) for i in range(n_groups) for j in range(i+1)]
+    tasks = []
+    for i in range(n_groups):
+        for j in range(i+1):
+            for k in range(i+1):
+                for l in range(k+1): 
+                    tasks.append((i,j,k,l))
     tasks = np.array(tasks)
     task_list = []
     for device_id in range(_num_devices):
diff --git a/gpu4pyscf/scf/soscf.py b/gpu4pyscf/scf/soscf.py
index 81da0361..6d9bf87b 100644
--- a/gpu4pyscf/scf/soscf.py
+++ b/gpu4pyscf/scf/soscf.py
@@ -27,7 +27,7 @@
 from pyscf.soscf import ciah
 from pyscf.soscf.newton_ah import _CIAH_SOSCF as _SOSCF_cpu
 from gpu4pyscf.lib import logger
-from gpu4pyscf.scf import hf, rohf, uhf
+from gpu4pyscf.scf import hf, rohf, uhf, _response_functions
 from gpu4pyscf.lib.cupy_helper import transpose_sum, contract
 from gpu4pyscf.lib import utils
 
diff --git a/gpu4pyscf/scf/tests/test_soscf.py b/gpu4pyscf/scf/tests/test_soscf.py
index 924dfd2e..4a07bcc5 100644
--- a/gpu4pyscf/scf/tests/test_soscf.py
+++ b/gpu4pyscf/scf/tests/test_soscf.py
@@ -24,18 +24,18 @@ def setUpModule():
         verbose = 5,
         output = '/dev/null',
         atom = [
-        ["O" , (0. , 0.     , 0.)],
-        [1   , (0. , -0.757 , 0.587)],
-        [1   , (0. , 0.757  , 0.587)] ],
+            ["O" , (0. , 0.     , 0.)],
+            [1   , (0. , -0.757 , 0.587)],
+            [1   , (0. , 0.757  , 0.587)] ],
         basis = '6-31g')
 
     h2o_z1 = gto.M(
         verbose = 5,
         output = '/dev/null',
         atom = [
-        ["O" , (0. , 0.     , 0.)],
-        [1   , (0. , -0.757 , 0.587)],
-        [1   , (0. , 0.757  , 0.587)] ],
+            ["O" , (0. , 0.     , 0.)],
+            [1   , (0. , -0.757 , 0.587)],
+            [1   , (0. , 0.757  , 0.587)] ],
         basis = '6-31g',
         charge = 1,
         spin = 1,)

From 53566e9f95be2ca886a305f0d46f25f743e8f593 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Fri, 27 Dec 2024 03:52:09 +0000
Subject: [PATCH 20/49] update tests for nightly build

---
 .github/workflows/nightly_build.yml | 5 +++--
 gpu4pyscf/df/df.py                  | 1 -
 gpu4pyscf/scf/jk.py                 | 3 ++-
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/nightly_build.yml b/.github/workflows/nightly_build.yml
index 7f2b816e..f7f03ac6 100644
--- a/.github/workflows/nightly_build.yml
+++ b/.github/workflows/nightly_build.yml
@@ -14,7 +14,7 @@ permissions:
 jobs:
   build:
 
-    runs-on: self-hosted
+    runs-on: [self-hosted, Linux, X64, v100]
 
     steps:
     - uses: actions/checkout@v3
@@ -23,6 +23,7 @@ jobs:
         pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
         python3 -m pip install --upgrade pip
         pip3 install flake8 pytest coverage pytest-cov pyscf-dispersion
+        pip3 install pytest-benchmark
         pip3 install pyscf --upgrade
         pip3 install numpy --upgrade
         pip3 install scipy --upgrade
@@ -39,4 +40,4 @@ jobs:
       run: |
         echo $GITHUB_WORKSPACE
         export PYTHONPATH="${PYTHONPATH}:$(pwd)"
-        pytest --durations=0
+        pytest tests/ -v -m "not slow"
diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py
index 4991af43..52b0ecf8 100644
--- a/gpu4pyscf/df/df.py
+++ b/gpu4pyscf/df/df.py
@@ -147,7 +147,6 @@ def loop(self, blksize=None, unpack=True):
             and unpack the CDERI in (Lij) format
         '''
         device_id = cupy.cuda.Device().id
-        print(self._cderi.keys(), device_id)
         cderi_sparse = self._cderi[device_id]
         if blksize is None:
             blksize = self.get_blksize()
diff --git a/gpu4pyscf/scf/jk.py b/gpu4pyscf/scf/jk.py
index 38d75ee3..8577457d 100644
--- a/gpu4pyscf/scf/jk.py
+++ b/gpu4pyscf/scf/jk.py
@@ -58,6 +58,7 @@
 SHM_SIZE = getattr(__config__, 'GPU_SHM_SIZE',
                    int(gpu_specs['sharedMemPerBlockOptin']//9)*8)
 THREADS = 256
+GROUP_SIZE = 256
 
 def _jk_task(mol, dms, vhfopt, task_list, hermi=0,
              device_id=0, with_j=True, with_k=True, verbose=None):
@@ -461,7 +462,7 @@ def __init__(self, mol, cutoff=1e-13):
         self._tile_q_cond = {}
         self._s_estimator = {}
 
-    def build(self, group_size=None, verbose=None):
+    def build(self, group_size=GROUP_SIZE, verbose=None):
         mol = self.mol
         log = logger.new_logger(mol, verbose)
         cput0 = log.init_timer()

From 9b7a8d5cd3a94c2bf4f209ef18e2b3223a9ece69 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Fri, 27 Dec 2024 03:59:23 +0000
Subject: [PATCH 21/49] disable benchmark for ci

---
 .github/workflows/unittest.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml
index 4eb534e3..31e8473a 100644
--- a/.github/workflows/unittest.yml
+++ b/.github/workflows/unittest.yml
@@ -38,7 +38,7 @@ jobs:
       run: |
         echo $GITHUB_WORKSPACE
         export PYTHONPATH="${PYTHONPATH}:$(pwd)"
-        pytest -m "not smoke" --cov=$GITHUB_WORKSPACE
+        pytest -m "not smoke" --benchmark-disable -s --cov=$GITHUB_WORKSPACE
 
   multi-gpu:
     runs-on: [self-hosted, Linux, X64, 2T4]
@@ -65,4 +65,4 @@ jobs:
       run: |
         echo $GITHUB_WORKSPACE
         export PYTHONPATH="${PYTHONPATH}:$(pwd)"
-        pytest -m "not smoke" --cov=$GITHUB_WORKSPACE
+        pytest -m "not smoke" --benchmark-disable -s --cov=$GITHUB_WORKSPACE

From da30fcf140c61f249f0c98acc78d06e5d8b84f65 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Fri, 27 Dec 2024 04:05:30 +0000
Subject: [PATCH 22/49] install pytest-benchmark

---
 .github/workflows/unittest.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml
index 31e8473a..de3303cb 100644
--- a/.github/workflows/unittest.yml
+++ b/.github/workflows/unittest.yml
@@ -21,6 +21,7 @@ jobs:
       run: |
         pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
         python3 -m pip install --upgrade pip
+        pip3 install pytest-benchmark
         pip3 install flake8 pytest coverage pytest-cov pyscf-dispersion
         pip3 install pyscf --upgrade
         pip3 install git+https://github.com/pyscf/properties --upgrade
@@ -48,6 +49,7 @@ jobs:
       run: |
         pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
         python3 -m pip install --upgrade pip
+        pip3 install pytest-benchmark
         pip3 install flake8 pytest coverage pytest-cov pyscf-dispersion
         pip3 install pyscf --upgrade
         pip3 install git+https://github.com/pyscf/properties --upgrade

From 0c9a0c3ab3a98d65b4c93ec4fc881cb87e8de141 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Fri, 27 Dec 2024 04:10:50 +0000
Subject: [PATCH 23/49] change the file names of benchmark tests

---
 gpu4pyscf/tests/{test_rks.py => test_benchmark_rks.py} | 0
 gpu4pyscf/tests/{test_uks.py => test_benchmark_uks.py} | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename gpu4pyscf/tests/{test_rks.py => test_benchmark_rks.py} (100%)
 rename gpu4pyscf/tests/{test_uks.py => test_benchmark_uks.py} (100%)

diff --git a/gpu4pyscf/tests/test_rks.py b/gpu4pyscf/tests/test_benchmark_rks.py
similarity index 100%
rename from gpu4pyscf/tests/test_rks.py
rename to gpu4pyscf/tests/test_benchmark_rks.py
diff --git a/gpu4pyscf/tests/test_uks.py b/gpu4pyscf/tests/test_benchmark_uks.py
similarity index 100%
rename from gpu4pyscf/tests/test_uks.py
rename to gpu4pyscf/tests/test_benchmark_uks.py

From 275925bb08c15e695aefe61756deedf3b2cc32c1 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Fri, 27 Dec 2024 05:19:32 +0000
Subject: [PATCH 24/49] disable benchmark for ci

---
 .github/workflows/unittest.yml        |  4 ++--
 gpu4pyscf/tests/test_benchmark_rks.py | 31 +++++++++++++++++++++++++++
 gpu4pyscf/tests/test_benchmark_uks.py |  6 ++++++
 3 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml
index de3303cb..12464ab5 100644
--- a/.github/workflows/unittest.yml
+++ b/.github/workflows/unittest.yml
@@ -39,7 +39,7 @@ jobs:
       run: |
         echo $GITHUB_WORKSPACE
         export PYTHONPATH="${PYTHONPATH}:$(pwd)"
-        pytest -m "not smoke" --benchmark-disable -s --cov=$GITHUB_WORKSPACE
+        pytest -m "not benchmark" --cov=$GITHUB_WORKSPACE
 
   multi-gpu:
     runs-on: [self-hosted, Linux, X64, 2T4]
@@ -67,4 +67,4 @@ jobs:
       run: |
         echo $GITHUB_WORKSPACE
         export PYTHONPATH="${PYTHONPATH}:$(pwd)"
-        pytest -m "not smoke" --benchmark-disable -s --cov=$GITHUB_WORKSPACE
+        pytest -m "not benchmark" --cov=$GITHUB_WORKSPACE
diff --git a/gpu4pyscf/tests/test_benchmark_rks.py b/gpu4pyscf/tests/test_benchmark_rks.py
index ebf9d8af..cdaa2801 100644
--- a/gpu4pyscf/tests/test_benchmark_rks.py
+++ b/gpu4pyscf/tests/test_benchmark_rks.py
@@ -83,146 +83,177 @@ def run_rb3lyp_hessian(atom, basis, with_df, with_solvent, disp=None):
     return h
 
 # DF
+@pytest.mark.benchmark
 def test_df_rb3lyp(benchmark):
     e = benchmark(run_rb3lyp, small_mol, 'def2-tzvpp', True, False)
     print('testing df rb3lyp')
     assert np.isclose(np.linalg.norm(e), 684.9998712035579, atol=1e-7)
+@pytest.mark.benchmark
 def test_df_rb3lyp_grad(benchmark):
     g = benchmark(run_rb3lyp_grad, small_mol, 'def2-tzvpp', True, False)
     print('testing df rb3lyp grad')
     assert np.isclose(np.linalg.norm(g), 0.17435941081837686, atol=1e-5)
 @pytest.mark.slow
+@pytest.mark.benchmark
 def test_df_rb3lyp_hessian(benchmark):
     h = benchmark(run_rb3lyp_hessian, small_mol, 'def2-tzvpp', True, False)
     print('testing df rb3lyp hessian')
     assert np.isclose(np.linalg.norm(h), 3.7668761221997764, atol=1e-4)
 
 # Direct SCF
+@pytest.mark.benchmark
 def test_rb3lyp(benchmark):
     e = benchmark(run_rb3lyp, small_mol, 'def2-tzvpp', False, False)
     print('testing rb3lyp')
     assert np.isclose(np.linalg.norm(e), 684.999735850967, atol=1e-7)
+@pytest.mark.benchmark
 def test_rb3lyp_grad(benchmark):
     g = benchmark(run_rb3lyp_grad, small_mol, 'def2-tzvpp', False, False)
     print('testing rb3lyp grad')
     assert np.isclose(np.linalg.norm(g), 0.1744127474130983, atol=1e-5)
+@pytest.mark.benchmark
 def test_rb3lyp_hessian(benchmark):
     h = benchmark(run_rb3lyp_hessian, small_mol, 'def2-tzvpp', False, False)
     print('testing rb3lyp hessian')
     assert np.isclose(np.linalg.norm(h), 3.7588443634477833, atol=1e-4)
 
 # median molecule
+@pytest.mark.benchmark
 def test_df_rb3lyp_median(benchmark):
     e = benchmark(run_rb3lyp, median_mol, 'def2-tzvpp', True, False)
     print('testing df rb3lyp median')
     assert np.isclose(np.linalg.norm(e), 1138.371390377773, atol=1e-7)
+@pytest.mark.benchmark
 def test_df_rb3lyp_grad_median(benchmark):
     g = benchmark(run_rb3lyp_grad, median_mol, 'def2-tzvpp', True, False)
     print('testing df rb3lyp grad median')
     assert np.isclose(np.linalg.norm(g), 0.26010545073602614, atol=1e-4)
+@pytest.mark.benchmark
 def test_df_rb3lyp_hessian_median(benchmark):
     h = benchmark(run_rb3lyp_hessian, median_mol, 'def2-tzvpp', True, False)
     print('testing df rb3lyp hessian median')
     assert np.isclose(np.linalg.norm(h), 6.32514169232998, atol=1e-4)
 
+@pytest.mark.benchmark
 def test_rb3lyp_median(benchmark):
     e = benchmark(run_rb3lyp, median_mol, 'def2-tzvpp', False, False)
     print('testing rb3lyp median')
     assert np.isclose(np.linalg.norm(e), 1138.3710752128077, atol=1e-7)
+@pytest.mark.benchmark
 def test_rb3lyp_grad_median(benchmark):
     g = benchmark(run_rb3lyp_grad, median_mol, 'def2-tzvpp', False, False)
     print('testing rb3lyp grad median')
     assert np.isclose(np.linalg.norm(g), 0.2601443836937988, atol=1e-5)
 @pytest.mark.high_memory
 @pytest.mark.slow
+@pytest.mark.benchmark
 def test_rb3lyp_hessian_median(benchmark):
     h = benchmark(run_rb3lyp_hessian, median_mol, 'def2-tzvpp', False, False)
     print('testing rb3lyp hessian median')
     assert np.isclose(np.linalg.norm(h))
 
 # large molecule
+@pytest.mark.benchmark
 def test_df_rb3lyp_large(benchmark):
     e = benchmark(run_rb3lyp, large_mol, 'def2-tzvpp', True, False)
     print('testing df rb3lyp large')
     assert np.isclose(np.linalg.norm(e), 2564.198712152175, atol=1e-7)
+@pytest.mark.benchmark
 def test_df_rb3lyp_grad_large(benchmark):
     g = benchmark(run_rb3lyp_grad, large_mol, 'def2-tzvpp', True, False)
     print('testing df rb3lyp grad large')
     assert np.isclose(np.linalg.norm(g), 0.3784358687859323, atol=1e-5)
 @pytest.mark.high_memory
 @pytest.mark.slow
+@pytest.mark.benchmark
 def test_df_rb3lyp_hessian_large(benchmark):
     h = benchmark(run_rb3lyp_hessian, large_mol, 'def2-tzvpp', True, False)
     print('testing df rb3lyp hessian large')
     assert np.isclose(np.linalg.norm(h), 7.583208736873523, atol=1e-4)
 @pytest.mark.slow
+@pytest.mark.benchmark
 def test_rb3lyp_large(benchmark):
     e = benchmark(run_rb3lyp, large_mol, 'def2-tzvpp', False, False)
     print('testing rb3lyp large')
     assert np.isclose(np.linalg.norm(e), 2564.198099576358, atol=1e-7)
 @pytest.mark.slow
+@pytest.mark.benchmark
 def test_rb3lyp_grad_large(benchmark):
     g = benchmark(run_rb3lyp_grad, large_mol, 'def2-tzvpp', False, False)
     print('testing rb3lyp grad large')
     assert np.isclose(np.linalg.norm(g), 0.3784664384209763, atol=1e-5)
 @pytest.mark.slow
+@pytest.mark.benchmark
 def test_rb3lyp_hessian_large(benchmark):
     h = benchmark(run_rb3lyp_hessian, large_mol, 'def2-tzvpp', False, False)
     print('testing rb3lyp hessian large')
     print(np.linalg.norm(h))
 
 # small basis set
+@pytest.mark.benchmark
 def test_df_rb3lyp_631gs(benchmark):
     e = benchmark(run_rb3lyp, small_mol, '6-31gs', True, False)
     print('testing df rb3lyp 631gs')
     assert np.isclose(np.linalg.norm(e), 684.6646008642876, atol=1e-7)
+
+@pytest.mark.benchmark
 def test_df_rb3lyp_631gs_grad(benchmark):
     g = benchmark(run_rb3lyp_grad, small_mol, '6-31gs', True, False)
     print('testing df rb3lyp 631gs grad')
     assert np.isclose(np.linalg.norm(g), 0.17530687343398219, atol=1e-5)
+@pytest.mark.benchmark
 def test_df_rb3lyp_631gs_hessian(benchmark):
     h = benchmark(run_rb3lyp_hessian, small_mol, '6-31gs', True, False)
     print('testing df rb3lyp 631gs hessian')
     assert np.isclose(np.linalg.norm(h), 3.908874851569459, atol=1e-4)
 
 # small basis set for large molecule
+@pytest.mark.benchmark
 def test_rb3lyp_631gs_large(benchmark):
     e = benchmark(run_rb3lyp, large_mol, '6-31gs', False, False)
     print('testing rb3lyp 631gs large')
     assert np.isclose(np.linalg.norm(e), 2563.1171191823423, atol=1e-7)
+@pytest.mark.benchmark
 def test_rb3lyp_631gs_grad_large(benchmark):
     g = benchmark(run_rb3lyp_grad, large_mol, '6-31gs', False, False)
     print('testing df rb3lyp 631gs grad large')
     assert np.isclose(np.linalg.norm(g), 0.37778228700247984, atol=1e-5)
 @pytest.mark.slow
+@pytest.mark.benchmark
 def test_rb3lyp_631gs_hessian_large(benchmark):
     h = benchmark(run_rb3lyp_hessian, large_mol, '6-31gs', False, False)
     print('testing df rb3lyp 631gs hessian large')
     assert np.isclose(np.linalg.norm(h), 7.920764634100053, atol=1e-4)
 
 #solvent model
+@pytest.mark.benchmark
 def test_df_rb3lyp_631gs_solvent(benchmark):
     e = benchmark(run_rb3lyp, small_mol, '6-31gs', True, True)
     print('testing df rb3lyp 631gs solvent')
     assert np.isclose(np.linalg.norm(e), 684.6985561053816, atol=1e-7)
+@pytest.mark.benchmark
 def test_df_rb3lyp_631gs_solvent_grad(benchmark):
     g = benchmark(run_rb3lyp_grad, small_mol, '6-31gs', True, True)
     print('testing df rb3lyp 631gs solvent grad')
     assert np.isclose(np.linalg.norm(g), 0.16956999476137297, atol=1e-5)
+@pytest.mark.benchmark
 def test_df_rb3lyp_631gs_solvent_hessian(benchmark):
     h = benchmark(run_rb3lyp_hessian, small_mol, '6-31gs', True, True)
     print('testing df rb3lyp 631gs solvent hessian')
     assert np.isclose(np.linalg.norm(h), 3.9008165041707294, atol=1e-4)
 
 # b3lyp d3bj
+@pytest.mark.benchmark
 def test_df_rb3lyp_631gs_d3bj(benchmark):
     e = benchmark(run_rb3lyp, small_mol, '6-31gs', True, True, 'd3bj')
     print('testing df rb3lyp 631gs solvent')
     assert np.isclose(np.linalg.norm(e), 684.7313814096565, atol=1e-7)
+@pytest.mark.benchmark
 def test_df_rb3lyp_631gs_d3bj_grad(benchmark):
     g = benchmark(run_rb3lyp_grad, small_mol, '6-31gs', True, True, 'd3bj')
     print('testing df rb3lyp 631gs solvent grad')
     assert np.isclose(np.linalg.norm(g), 0.17010044498887264, atol=1e-5)
+@pytest.mark.benchmark
 def test_df_rb3lyp_631gs_d3bj_hessian(benchmark):
     h = benchmark(run_rb3lyp_hessian, small_mol, '6-31gs', True, True, 'd3bj')
     print('testing df rb3lyp 631gs solvent hessian')
diff --git a/gpu4pyscf/tests/test_benchmark_uks.py b/gpu4pyscf/tests/test_benchmark_uks.py
index 0e426f17..39acd9ba 100644
--- a/gpu4pyscf/tests/test_benchmark_uks.py
+++ b/gpu4pyscf/tests/test_benchmark_uks.py
@@ -66,26 +66,32 @@ def run_ub3lyp_hessian(atom, basis, with_df, with_solvent):
 
 
 # UKS
+@pytest.mark.benchmark
 def test_df_ub3lyp(benchmark):
     e = benchmark(run_ub3lyp, small_mol, 'def2-tzvpp', True, False)
     print('testing df ub3lyp')
     assert np.isclose(np.linalg.norm(e), 684.9998712035856, atol=1e-7)
+@pytest.mark.benchmark
 def test_df_ub3lyp_grad(benchmark):
     g = benchmark(run_ub3lyp_grad, small_mol, 'def2-tzvpp', True, False)
     print('testing df ub3lyp grad')
     assert np.isclose(np.linalg.norm(g), 0.17435842214665462, atol=1e-5)
+@pytest.mark.benchmark
 def test_df_ub3lyp_hessian(benchmark):
     h = benchmark(run_ub3lyp_hessian, small_mol, 'def2-tzvpp', True, False)
     print('testing df ub3lyp hessian')
     assert np.isclose(np.linalg.norm(h), 3.7669464279078064, atol=1e-4)
+@pytest.mark.benchmark
 def test_ub3lyp(benchmark):
     e = benchmark(run_ub3lyp, small_mol, 'def2-tzvpp', False, False)
     print('testing ub3lyp')
     assert np.isclose(np.linalg.norm(e), 684.9997358509884, atol=1e-7)
+@pytest.mark.benchmark
 def test_ub3lyp_grad(benchmark):
     g = benchmark(run_ub3lyp_grad, small_mol, 'def2-tzvpp', False, False)
     print('testing ub3lyp grad')
     assert np.isclose(np.linalg.norm(g), 0.17441176110160253, atol=1e-5)
+@pytest.mark.benchmark
 def test_ub3lyp_hessian(benchmark):
     h = benchmark(run_ub3lyp_hessian, small_mol, 'def2-tzvpp', False, False)
     print('testing ub3lyp hessian')

From e0b1eafeff0cfb83f6e390a4201390c9d9e32965 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Fri, 27 Dec 2024 05:41:21 +0000
Subject: [PATCH 25/49] test dir

---
 .github/workflows/nightly_build.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/nightly_build.yml b/.github/workflows/nightly_build.yml
index f7f03ac6..b802fc3e 100644
--- a/.github/workflows/nightly_build.yml
+++ b/.github/workflows/nightly_build.yml
@@ -40,4 +40,4 @@ jobs:
       run: |
         echo $GITHUB_WORKSPACE
         export PYTHONPATH="${PYTHONPATH}:$(pwd)"
-        pytest tests/ -v -m "not slow"
+        pytest gpu4pyscf/tests/ -v -m "not slow"

From 92be2aa0d82b5ee63de13329d6926512f88ebd74 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Sat, 28 Dec 2024 23:35:03 +0000
Subject: [PATCH 26/49] save changes

---
 gpu4pyscf/df/hessian/jk.py            | 3 ++-
 gpu4pyscf/tests/test_benchmark_rks.py | 8 ++++++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/gpu4pyscf/df/hessian/jk.py b/gpu4pyscf/df/hessian/jk.py
index bc6cceee..8a2b59bd 100644
--- a/gpu4pyscf/df/hessian/jk.py
+++ b/gpu4pyscf/df/hessian/jk.py
@@ -54,7 +54,8 @@ def _jk_task_with_mo1(dfobj, dms, mo_coeff, mo1s, occ_coeffs,
             else:
                 dm_sparse *= 2
             dm_sparse[:, intopt.cderi_diag] *= .5
-
+        dms = None
+        
         if with_k:
             vks = [cupy.zeros_like(mo1) for mo1 in mo1s]
 
diff --git a/gpu4pyscf/tests/test_benchmark_rks.py b/gpu4pyscf/tests/test_benchmark_rks.py
index cdaa2801..59eb2e39 100644
--- a/gpu4pyscf/tests/test_benchmark_rks.py
+++ b/gpu4pyscf/tests/test_benchmark_rks.py
@@ -144,12 +144,13 @@ def test_rb3lyp_grad_median(benchmark):
     g = benchmark(run_rb3lyp_grad, median_mol, 'def2-tzvpp', False, False)
     print('testing rb3lyp grad median')
     assert np.isclose(np.linalg.norm(g), 0.2601443836937988, atol=1e-5)
-@pytest.mark.high_memory
+
 @pytest.mark.slow
 @pytest.mark.benchmark
 def test_rb3lyp_hessian_median(benchmark):
     h = benchmark(run_rb3lyp_hessian, median_mol, 'def2-tzvpp', False, False)
     print('testing rb3lyp hessian median')
+    print(np.linalg.norm(h))
     assert np.isclose(np.linalg.norm(h))
 
 # large molecule
@@ -182,13 +183,16 @@ def test_rb3lyp_grad_large(benchmark):
     g = benchmark(run_rb3lyp_grad, large_mol, 'def2-tzvpp', False, False)
     print('testing rb3lyp grad large')
     assert np.isclose(np.linalg.norm(g), 0.3784664384209763, atol=1e-5)
+
+# Hessian for large molecule with large basis set is too slow
+'''
 @pytest.mark.slow
 @pytest.mark.benchmark
 def test_rb3lyp_hessian_large(benchmark):
     h = benchmark(run_rb3lyp_hessian, large_mol, 'def2-tzvpp', False, False)
     print('testing rb3lyp hessian large')
     print(np.linalg.norm(h))
-
+'''
 # small basis set
 @pytest.mark.benchmark
 def test_df_rb3lyp_631gs(benchmark):

From 09ab367679a172acdf1bd193e9694287e32da499 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Mon, 30 Dec 2024 18:08:39 +0000
Subject: [PATCH 27/49] add copy_array

---
 .../cupy_helper/benchmark_memory_copy.py      |  97 ++++++++++
 gpu4pyscf/df/df.py                            |   9 +-
 gpu4pyscf/df/grad/rhf.py                      |   1 +
 gpu4pyscf/df/hessian/rhf.py                   |  42 ++--
 gpu4pyscf/df/hessian/uhf.py                   |  42 ++--
 gpu4pyscf/df/int3c2e.py                       |  29 ++-
 gpu4pyscf/gto/int3c1e.py                      |   4 +-
 gpu4pyscf/lib/cupy_helper.py                  |  11 +-
 gpu4pyscf/lib/memcpy.py                       |  90 +++++++++
 gpu4pyscf/lib/tests/test_cupy_helper.py       |  38 +++-
 gpu4pyscf/tests/test_benchmark_rks.py         |   3 +-
 gpu4pyscf/tests/test_dft.py                   | 181 ------------------
 12 files changed, 319 insertions(+), 228 deletions(-)
 create mode 100644 benchmarks/cupy_helper/benchmark_memory_copy.py
 create mode 100644 gpu4pyscf/lib/memcpy.py
 delete mode 100644 gpu4pyscf/tests/test_dft.py

diff --git a/benchmarks/cupy_helper/benchmark_memory_copy.py b/benchmarks/cupy_helper/benchmark_memory_copy.py
new file mode 100644
index 00000000..c658674f
--- /dev/null
+++ b/benchmarks/cupy_helper/benchmark_memory_copy.py
@@ -0,0 +1,97 @@
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import numpy as np
+import cupy as cp
+from cupyx import profiler
+from gpu4pyscf.lib.cupy_helper import copy_array
+
+'''
+Benchmark different ways of transfering data from pinned memory to device
+'''
+
+# Host array
+host_array = cp.cuda.alloc_pinned_memory(512*512*512 * 8)
+big_host_data = np.ndarray(512**3, dtype=cp.float64, buffer=host_array)
+big_host_data = big_host_data.reshape(512,512,512)
+big_host_data += np.random.rand(512,512,512)
+
+# Device array
+big_device_data = cp.empty_like(big_host_data)
+
+# Create views on both arrays
+host_view = big_host_data[:, 128:]  # Non-contiguous view on the host
+device_view = big_device_data[:, 128:]  # Non-contiguous view on the device
+
+print("Host View Shape:", host_view.shape)
+print("Device View Shape:", device_view.shape)
+'''
+print("------ Benchmark device to host transfer ----------")
+size = host_view.nbytes
+perf_custom = profiler.benchmark(copy_array, (host_view, device_view), n_repeat=100, n_warmup=3)
+t_kernel = perf_custom.gpu_times.mean()
+bandwidth = size / t_kernel / 1e9
+print('using custom function', t_kernel)
+print(f"Effective PCIe Bandwidth: {bandwidth:.2f} GB/s")
+
+def cupy_copy(c, out):
+    out[:] = cp.asarray(c)
+    return out
+perf_cupy = profiler.benchmark(cupy_copy, (host_view, device_view), n_repeat=100, n_warmup=3)
+t_kernel = perf_cupy.gpu_times.mean()
+bandwidth = size / t_kernel / 1e9
+print('using cupy function', t_kernel)
+print(f"Effective PCIe Bandwidth: {bandwidth:.2f} GB/s")
+
+print("------- Benchmark host to device transfer ---------")
+size = host_view.nbytes
+perf_custom = profiler.benchmark(copy_array, (device_view, host_view), n_repeat=100, n_warmup=3)
+t_kernel = perf_custom.gpu_times.mean()
+bandwidth = size / t_kernel / 1e9
+print('using custom function', t_kernel)
+print(f"Effective PCIe Bandwidth: {bandwidth:.2f} GB/s")
+
+def cupy_copy(c, out):
+    out[:] = c.get()
+    return out
+perf_cupy = profiler.benchmark(cupy_copy, (device_view, host_view), n_repeat=100, n_warmup=3)
+t_kernel = perf_cupy.gpu_times.mean()
+bandwidth = size / t_kernel / 1e9
+print('using cupy function', t_kernel)
+print(f"Effective PCIe Bandwidth: {bandwidth:.2f} GB/s")
+'''
+with cp.cuda.Device(0):
+    a = cp.random.rand(512,512,512)
+    device0_view = a[:,128:, 128:]
+with cp.cuda.Device(1):
+    b = cp.random.rand(512,512,512)
+    device1_view = b[:,128:, 128:]
+perf_cupy = profiler.benchmark(copy_array, (device0_view, device1_view), n_repeat=100, n_warmup=3)
+t_kernel = perf_cupy.gpu_times.mean()
+bandwidth = device0_view.nbytes / t_kernel / 1e9
+print('using custom function', t_kernel)
+print(f"Effective PCIe Bandwidth: {bandwidth:.2f} GB/s")
+
+assert np.linalg.norm(device0_view.get() - device1_view.get()) < 1e-10
+
+def cupy_copy(c, out):
+    with cp.cuda.Device(out.device):
+        out[:] = cp.asarray(c.get())
+    return out
+perf_cupy = profiler.benchmark(cupy_copy, (device0_view, device1_view), n_repeat=100, n_warmup=3)
+t_kernel = perf_cupy.gpu_times.mean()
+bandwidth = device0_view.nbytes / t_kernel / 1e9
+print('using cupy function', t_kernel)
+print(f"Effective PCIe Bandwidth: {bandwidth:.2f} GB/s")
diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py
index 48e0e8e1..442c1bab 100644
--- a/gpu4pyscf/df/df.py
+++ b/gpu4pyscf/df/df.py
@@ -20,7 +20,8 @@
 from cupyx.scipy.linalg import solve_triangular
 from pyscf import lib
 from pyscf.df import df, addons, incore
-from gpu4pyscf.lib.cupy_helper import cholesky, tag_array, get_avail_mem, cart2sph, p2p_transfer
+from gpu4pyscf.lib.cupy_helper import (cholesky, tag_array, get_avail_mem, 
+                                       cart2sph, p2p_transfer, copy_array)
 from gpu4pyscf.df import int3c2e, df_jk
 from gpu4pyscf.lib import logger
 from gpu4pyscf import __config__
@@ -347,11 +348,13 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, de
             if isinstance(_cderi[0], np.ndarray):
                 for slice_id, (p0,p1) in enumerate(lib.prange(0, naux, blksize)):
                     for i in range(p0,p1):
-                        cderi_block[i].get(out=_cderi[slice_id][i-p0,ij0:ij1])
+                        #cderi_block[i].get(out=_cderi[slice_id][i-p0,ij0:ij1])
+                        copy_array(cderi_block[i], _cderi[slice_id][i-p0,ij0:ij1])
             else:
                 # Copy data to other Devices
                 for slice_id, (p0,p1) in enumerate(lib.prange(0, naux, blksize)):
                     #_cderi[slice_id][:,ij0:ij1] = cderi_block[p0:p1]
-                    p2p_transfer(_cderi[slice_id][:,ij0:ij1], cderi_block[p0:p1])
+                    tmp = cupy.array(cderi_block[p0:p1], order='C', copy=True)
+                    p2p_transfer(_cderi[slice_id][:,ij0:ij1], tmp)
             t1 = log.timer_debug1(f'transfer data for {cp_ij_id} / {nq} on Device {device_id}', *t1)
     return
diff --git a/gpu4pyscf/df/grad/rhf.py b/gpu4pyscf/df/grad/rhf.py
index 681e18be..7c1c901a 100644
--- a/gpu4pyscf/df/grad/rhf.py
+++ b/gpu4pyscf/df/grad/rhf.py
@@ -44,6 +44,7 @@ def j2c_solver(v):
     mask = w > lindep
     v1 = v[:,mask]
     j2c = cupy.dot(v1/w[mask], v1.conj().T)
+    w = v = v1 = mask = None
     def j2c_solver(b): # noqa: F811
         return j2c.dot(b.reshape(j2c.shape[0],-1)).reshape(b.shape)
     return j2c_solver
diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py
index 938b1384..57dfc363 100644
--- a/gpu4pyscf/df/hessian/rhf.py
+++ b/gpu4pyscf/df/hessian/rhf.py
@@ -30,7 +30,7 @@
 from gpu4pyscf.grad import rhf as rhf_grad
 from gpu4pyscf.hessian import rhf as rhf_hess
 from gpu4pyscf.lib.cupy_helper import (
-    contract, tag_array, get_avail_mem, release_gpu_stack, pinv)
+    contract, tag_array, get_avail_mem, release_gpu_stack, pinv, copy_array)
 from gpu4pyscf.df import int3c2e, df
 from gpu4pyscf.lib import logger
 from gpu4pyscf import __config__
@@ -58,7 +58,9 @@ def _hk_ip1_ip1(rhok1_Pko, dm0, mocc_2):
     mem_avail = get_avail_mem()
     blksize = int((mem_avail*0.4/(nao*nao*3*8)/ALIGNED))*ALIGNED
     for k0, k1 in lib.prange(0,nnz,blksize):
-        rhok1_Pko_kslice = cupy.asarray(rhok1_Pko[k0:k1])
+        #rhok1_Pko_kslice = cupy.asarray(rhok1_Pko[k0:k1])
+        rhok1_Pko_kslice = copy_array(rhok1_Pko[k0:k1])
+
         # (10|0)(0|10) without response of RI basis
         vk2_ip1_ip1 = contract('piox,pkoy->ikxy', rhok1_Pko_kslice, rhok1_Pko_kslice)
         hk_ao_ao += contract('ikxy,ik->ikxy', vk2_ip1_ip1, dm0)
@@ -147,6 +149,8 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls
 
     #  int3c_ip1 contributions
     wj1_P, wk1_Pko = int3c2e.get_int3c2e_ip1_wjk(intopt, dm0_tag, omega=omega)
+    t1 = log.timer_debug1('interdeidate variables with int3c2e_ip1', *t1)
+
     #rhoj1_P = contract('pq,pix->qix', int2c_inv, wj1_P)
     if with_j:
         rhoj1_P = solve_j2c(wj1_P)
@@ -173,7 +177,9 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls
             raise RuntimeError('Not enough memory for intermediate variables')
 
         for i0, i1 in lib.prange(0,nao,blksize):
-            wk1_Pko_islice = cupy.asarray(wk1_Pko[:,i0:i1])
+            #wk1_Pko_islice = cupy.asarray(wk1_Pko[:,i0:i1])
+            wk1_Pko_islice = copy_array(wk1_Pko[:,i0:i1])
+
             #rhok1_Pko = contract('pq,qiox->piox', int2c_inv, wk1_Pko_islice)
             rhok1_Pko = solve_j2c(wk1_Pko_islice)
             wk1_Pko_islice = None
@@ -194,7 +200,8 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls
                 hk_ao_aux[i0:i1] -= contract('qoi,qioxy->iqxy', rhok0_P_I, wk1_I)
                 wk1_I = rhok0_P_I = None
         rhok1_Pko = None
-
+        t1 = log.timer_debug1('contractions with int3c2e_ip1', *t1)
+        
         w, v = cupy.linalg.eigh(int2c)
         idx = w > LINEAR_DEP_THR
         cd_low = (v[:,idx] / cupy.sqrt(w[idx]))
@@ -203,17 +210,18 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls
 
         rhok1_Pko = wk1_Pko[:nnz]  # Reuse the same memory
         for i0, i1 in lib.prange(0,nao,blksize):
-            wk1_tmp = cupy.asarray(wk1_Pko[:,i0:i1])
+            #wk1_tmp = cupy.asarray(wk1_Pko[:,i0:i1])
+            wk1_tmp = copy_array(wk1_Pko[:,i0:i1])
             if isinstance(rhok1_Pko, cupy.ndarray):
                 rhok1_Pko[:,i0:i1] = contract('qp,qiox->piox', cd_low, wk1_tmp)
             else:
                 rhok1_Pko[:,i0:i1] = contract('qp,qiox->piox', cd_low, wk1_tmp).get()
             wk1_tmp = None
         cd_low = None
-
+        t1 = log.timer_debug1('data transfer', *t1)
         hk_ao_ao += _hk_ip1_ip1(rhok1_Pko, dm0, mocc_2)
     wk1_Pko = rhok1_Pko = None
-    t1 = log.timer_debug1('intermediate variables with int3c2e_ip1', *t1)
+    t1 = log.timer_debug1('contractions with int3c2e_ip1', *t1)
 
     hj_ipip, hk_ipip = jk.get_int3c2e_hjk(intopt, rhoj0_P, rhok0_P__, dm0_tag,
                                           with_j=with_j, with_k=with_k, omega=omega,
@@ -487,8 +495,11 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
     else:
         rhok0_Pl_ = wk_Pl_ # reuse the memory
         for p0, p1 in lib.prange(0,nao,64):
-            wk_tmp = cupy.asarray(wk_Pl_[:,p0:p1])
-            rhok0_Pl_[:,p0:p1] = solve_j2c(wk_tmp).get()
+            #wk_tmp = cupy.asarray(wk_Pl_[:,p0:p1])
+            #rhok0_Pl_[:,p0:p1] = solve_j2c(wk_tmp).get()
+            wk_tmp = copy_array(wk_Pl_[:,p0:p1])
+            wk_tmp = solve_j2c(wk_tmp)
+            copy_array(wk_tmp, rhok0_Pl_[:,p0:p1])
         wk_tmp = None
     wk_Pl_ = None
     solve_j2c = None
@@ -503,6 +514,8 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
         fn = int3c2e.get_int3c2e_ip2_vjk
         vj1_int3c, vk1_int3c = fn(intopt, rhoj0, rhok0_Pl_, dm0_tag, auxslices,
                                   with_j=with_j, with_k=with_k, omega=omega)
+        t0 = log.timer_debug1('Fock matrix due to int3c2e_ip2', *t0)
+
         # Responses due to int2c2e_ip1
         if omega and omega > 1e-10:
             with auxmol.with_range_coulomb(omega):
@@ -521,7 +534,8 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
             else:
                 rhok0_P__ = cupy.empty([naux,nocc,nocc])
                 for p0, p1 in lib.prange(0,naux,64):
-                    rhok0_Pl_tmp = cupy.asarray(rhok0_Pl_[p0:p1])
+                    #rhok0_Pl_tmp = cupy.asarray(rhok0_Pl_[p0:p1])
+                    rhok0_Pl_tmp = copy_array(rhok0_Pl_[p0:p1])
                     rhok0_P__[p0:p1] = contract('pio,ir->pro', rhok0_Pl_tmp, mocc)
                 rhok0_Pl_tmp = None
             wk0_10_P__ = contract('xqp,pro->xqro', int2c_ip1, rhok0_P__)
@@ -531,10 +545,11 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
         blksize = int(0.2*mem_avail/(3*naux*nocc*8)/ALIGNED) * ALIGNED
         log.debug(f'GPU Memory {mem_avail/GB:.1f} GB available, {blksize} AOs per block')
         if blksize < ALIGNED:
-            raise RuntimeError('Not enough memory to compute int3c2e_ip2')
+            raise RuntimeError('Not enough memory to compute int2c2e_ip2')
 
         for p0, p1 in lib.prange(0,nao,blksize):
-            rhok_tmp = cupy.asarray(rhok0_Pl_[:,p0:p1])
+            #rhok_tmp = cupy.asarray(rhok0_Pl_[:,p0:p1])
+            rhok_tmp = copy_array(rhok0_Pl_[:,p0:p1])
             wk0_10_Pl_ = contract('xqp,pio->xqio', int2c_ip1, rhok_tmp)
             if with_j:
                 vj1_tmp = contract('pio,xp->xpio', rhok_tmp, wj0_10)
@@ -544,13 +559,12 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
             if with_k:
                 vk1_tmp = contract('xpio,pro->xpir', wk0_10_Pl_, rhok0_P__)
                 vk1_tmp += contract('xpro,pir->xpio', wk0_10_P__, rhok_tmp)
-                # 2.0 due to spin
                 vk1_int3c[:,:,p0:p1] += contract('xpio,pa->axio', vk1_tmp, aux2atom)
                 vk1_tmp = None
             wk0_10_Pl_ = rhok_tmp = None
         wj0_10 = wk0_10_P__ = rhok0_P__ = int2c_ip1 = None
         aux2atom = None
-        t0 = log.timer_debug1('Fock matrix due to int3c2e_ip2', *t0)
+        t0 = log.timer_debug1('Fock matrix due to int2c2e_ip1', *t0)
 
     # -----------------------------
     # int3c_ip1 contributions
diff --git a/gpu4pyscf/df/hessian/uhf.py b/gpu4pyscf/df/hessian/uhf.py
index d6f26e5d..1b18fc9a 100644
--- a/gpu4pyscf/df/hessian/uhf.py
+++ b/gpu4pyscf/df/hessian/uhf.py
@@ -34,7 +34,7 @@
 from gpu4pyscf.hessian import uhf as uhf_hess
 from gpu4pyscf.hessian import rhf as rhf_hess
 from gpu4pyscf.lib.cupy_helper import (
-    contract, tag_array, get_avail_mem, release_gpu_stack, pinv)
+    contract, tag_array, get_avail_mem, release_gpu_stack, pinv, copy_array)
 from gpu4pyscf.df import int3c2e, df
 from gpu4pyscf.df.hessian import rhf as df_rhf_hess
 from gpu4pyscf.lib import logger
@@ -174,15 +174,19 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
             raise RuntimeError('Not enough memory for intermediate variables')
 
         for i0, i1 in lib.prange(0,nao,blksize):
-            wk1a_Pko_islice = cupy.asarray(wk1a_Pko[:,i0:i1])
-            wk1b_Pko_islice = cupy.asarray(wk1b_Pko[:,i0:i1])
+            #wk1a_Pko_islice = cupy.asarray(wk1a_Pko[:,i0:i1])
+            #wk1b_Pko_islice = cupy.asarray(wk1b_Pko[:,i0:i1])
+            wk1a_Pko_islice = copy_array(wk1a_Pko[:,i0:i1])
+            wk1b_Pko_islice = copy_array(wk1b_Pko[:,i0:i1])
             rhok1a_Pko = solve_j2c(wk1a_Pko_islice)
             rhok1b_Pko = solve_j2c(wk1b_Pko_islice)
             wk1a_Pko_islice = wk1b_Pko_islice = None
             for k0, k1 in lib.prange(0,nao,blksize):
-                wk1a_Pko_kslice = cupy.asarray(wk1a_Pko[:,k0:k1])
-                wk1b_Pko_kslice = cupy.asarray(wk1b_Pko[:,k0:k1])
-
+                #wk1a_Pko_kslice = cupy.asarray(wk1a_Pko[:,k0:k1])
+                #wk1b_Pko_kslice = cupy.asarray(wk1b_Pko[:,k0:k1])
+                wk1a_Pko_kslice = copy_array(wk1a_Pko[:,k0:k1])
+                wk1b_Pko_kslice = copy_array(wk1b_Pko[:,k0:k1])
+                
                 # (10|0)(0|10) without response of RI basis
                 vk2_ip1_ip1 = contract('piox,pkoy->ikxy', rhok1a_Pko, wk1a_Pko_kslice)
                 hk_ao_ao[i0:i1,k0:k1] += contract('ikxy,ik->ikxy', vk2_ip1_ip1, dm0a[i0:i1,k0:k1])
@@ -521,8 +525,11 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
     else:
         rhok0a_Pl_ = np.empty_like(wka_Pl_)
         for p0, p1 in lib.prange(0,nao,64):
-            wk_tmp = cupy.asarray(wka_Pl_[:,p0:p1])
-            rhok0a_Pl_[:,p0:p1] = solve_j2c(wk_tmp).get()
+            # wk_tmp = cupy.asarray(wka_Pl_[:,p0:p1])
+            # rhok0a_Pl_[:,p0:p1] = solve_j2c(wk_tmp).get()
+            wk_tmp = copy_array(wka_Pl_[:,p0:p1])
+            wk_tmp = solve_j2c(wk_tmp)
+            copy_array(wk_tmp, rhok0a_Pl_[:,p0:p1])
         wk_tmp = None
 
     if isinstance(wkb_Pl_, cupy.ndarray):
@@ -530,8 +537,11 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
     else:
         rhok0b_Pl_ = np.empty_like(wkb_Pl_)
         for p0, p1 in lib.prange(0,nao,64):
-            wk_tmp = cupy.asarray(wkb_Pl_[:,p0:p1])
-            rhok0b_Pl_[:,p0:p1] = solve_j2c(wk_tmp).get()
+            #wk_tmp = cupy.asarray(wkb_Pl_[:,p0:p1])
+            #rhok0b_Pl_[:,p0:p1] = solve_j2c(wk_tmp).get()
+            wk_tmp = copy_array(wkb_Pl_[:,p0:p1])
+            wk_tmp = solve_j2c(wk_tmp)
+            copy_array(wk_tmp, rhok0b_Pl_[:,p0:p1])
         wk_tmp = None
     wka_Pl_ = wkb_Pl_ = None
     vj1a_int3c = vj1b_int3c = vk1a_int3c = vk1b_int3c = None
@@ -566,7 +576,8 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
                 nocc = mocca.shape[1]
                 rhok0a_P__ = cupy.empty([naux,nocc,nocc])
                 for p0, p1 in lib.prange(0,naux,64):
-                    rhok0_Pl_tmp = cupy.asarray(rhok0a_Pl_[p0:p1])
+                    #rhok0_Pl_tmp = cupy.asarray(rhok0a_Pl_[p0:p1])
+                    rhok0_Pl_tmp = copy_array(rhok0a_Pl_[p0:p1])
                     rhok0a_P__[p0:p1] = contract('pio,ir->pro', rhok0_Pl_tmp, mocca)
                 rhok0_Pl_tmp = None
 
@@ -578,7 +589,8 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
                 nocc = moccb.shape[1]
                 rhok0b_P__ = cupy.empty([naux,nocc,nocc])
                 for p0, p1 in lib.prange(0,naux,64):
-                    rhok0_Pl_tmp = cupy.asarray(rhok0b_Pl_[p0:p1])
+                    #rhok0_Pl_tmp = cupy.asarray(rhok0b_Pl_[p0:p1])
+                    rhok0_Pl_tmp = copy_array(rhok0b_Pl_[p0:p1])
                     rhok0b_P__[p0:p1] = contract('pio,ir->pro', rhok0_Pl_tmp, moccb)
                 rhok0_Pl_tmp = None
         if with_j:
@@ -596,8 +608,10 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
             raise RuntimeError('Not enough memory to compute int3c2e_ip2')
 
         for p0, p1 in lib.prange(0,nao,blksize):
-            rhoka_tmp = cupy.asarray(rhok0a_Pl_[:,p0:p1])
-            rhokb_tmp = cupy.asarray(rhok0b_Pl_[:,p0:p1])
+            #rhoka_tmp = cupy.asarray(rhok0a_Pl_[:,p0:p1])
+            #rhokb_tmp = cupy.asarray(rhok0b_Pl_[:,p0:p1])
+            rhoka_tmp = copy_array(rhok0a_Pl_[:,p0:p1])
+            rhokb_tmp = copy_array(rhok0b_Pl_[:,p0:p1])
             wk0a_10_Pl_ = contract('xqp,pio->xqio', int2c_ip1, rhoka_tmp)
             wk0b_10_Pl_ = contract('xqp,pio->xqio', int2c_ip1, rhokb_tmp)
             if with_j:
diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py
index f01167a1..8089ef76 100644
--- a/gpu4pyscf/df/int3c2e.py
+++ b/gpu4pyscf/df/int3c2e.py
@@ -21,7 +21,7 @@
 from pyscf.scf import _vhf
 from gpu4pyscf.scf.int4c2e import BasisProdCache, libgvhf, libgint
 from gpu4pyscf.lib.cupy_helper import (block_c2s_diag, cart2sph, contract, get_avail_mem,
-                                       reduce_to_device)
+                                       reduce_to_device, copy_array)
 from gpu4pyscf.lib import logger
 from gpu4pyscf.gto.mole import basis_seg_contraction
 from gpu4pyscf.__config__ import _num_devices, _streams
@@ -839,7 +839,8 @@ def _int3c2e_ip1_vjk_task(intopt, task_k_list, rhoj, rhok, dm0, orbo, device_id=
         for cp_k in task_k_list:
             task_list = [(cp_k, cp_ij) for cp_ij in range(ncp_ij)]
             k0, k1 = aux_ao_loc[cp_k], aux_ao_loc[cp_k+1]
-            rhok_tmp = cupy.asarray(rhok[k0:k1])
+            #rhok_tmp = cupy.asarray(rhok[k0:k1])
+            rhok_tmp = copy_array(rhok[k0:k1])
             if with_k:
                 rhok0 = contract('pio,ir->pro', rhok_tmp, orbo)
                 rhok0 = contract('pro,Jo->prJ', rhok0, orbo)
@@ -857,11 +858,13 @@ def _int3c2e_ip1_vjk_task(intopt, task_k_list, rhoj, rhok, dm0, orbo, device_id=
 
                     vk1_ao = contract('xpji,poi->xijo', int3c_blk, rhok0[:,:,i0:i1])
                     vk1[:,:,j0:j1] += contract('xijo,ia->axjo', vk1_ao, ao2atom[i0:i1])
+                    vk1_ao = int3c_blk = None
             if with_j:
                 rhoj0_atom = contract('xpi,ia->xpa', rhoj0, 2.0*ao2atom)
                 vj1 += contract('pJo,xpa->axJo', rhok_tmp, rhoj0_atom)
-                rhoj0_atom = None
+                rhoj0_atom = rhoj0 = None
             if with_k:
+                rhok0 = None
                 vk1_buf += contract('xpio,plo->xil', int3c_ip1_occ, rhok_tmp)
                 mem_avail = get_avail_mem()
                 blksize = min(int(mem_avail * 0.2 / ((k1-k0) * nao) * 8),
@@ -870,6 +873,8 @@ def _int3c2e_ip1_vjk_task(intopt, task_k_list, rhoj, rhok, dm0, orbo, device_id=
                     rhok0_slice = contract('pJr,ir->pJi', rhok_tmp[:,p0:p1], orbo)
                     vk1_ao = contract('xpio,pJi->xiJo', int3c_ip1_occ, rhok0_slice)
                     vk1[:,:,p0:p1] += contract('xiJo,ia->axJo', vk1_ao, ao2atom)
+                    rhok0_slice = vk1_ao = None
+            rhok_tmp = int3c_ip1_occ = None
 
     # TODO: absorbe vj1_buf and vk1_buf into vj1 and vk1
     return vj1_buf, vk1_buf, vj1, vk1
@@ -946,15 +951,16 @@ def _int3c2e_ip2_vjk_task(intopt, task_list, rhoj, rhok, dm0, orbo,
                     wj2 += contract('xpji,ji->xp', int3c_blk, dm0[j0:j1,i0:i1])
 
                 wk2_P__[:,:,i0:i1] += contract('xpji,jo->xpio', int3c_blk, orbo[j0:j1])
-            rhok_tmp = cupy.asarray(rhok[k0:k1])
+                int3c_blk = None
+            #rhok_tmp = cupy.asarray(rhok[k0:k1])
+            rhok_tmp = copy_array(rhok[k0:k1])
             if with_j:
                 vj1_tmp = -contract('pio,xp->xpio', rhok_tmp, wj2)
                 vj1_tmp -= contract('xpio,p->xpio', wk2_P__, rhoj[k0:k1])
 
                 vj1 += contract('xpio,pa->axio', vj1_tmp, aux2atom[k0:k1])
+                vj1_tmp = wj2 = None
             if with_k:
-                #rhok0_slice = contract('pio,jo->pij', rhok_tmp, orbo)
-                #vk1_tmp = -contract('xpjo,pij->xpio', wk2_P__, rhok0_slice)
                 rhok0_slice = contract('xpjo,jr->xpro', wk2_P__, orbo)
                 vk1_tmp = -contract('xpro,pir->xpio', rhok0_slice, rhok_tmp)
 
@@ -962,8 +968,8 @@ def _int3c2e_ip2_vjk_task(intopt, task_list, rhoj, rhok, dm0, orbo,
                 vk1_tmp -= contract('xpio,pro->xpir', wk2_P__, rhok0_oo)
 
                 vk1 += contract('xpir,pa->axir', vk1_tmp, aux2atom[k0:k1])
-            wj2 = wk2_P__ = rhok0_slice = rhok0_oo = None
-            rhok_tmp = vk1_tmp = None
+                vk1_tmp = rhok0_oo = rhok0_slice = None
+            rhok_tmp = wk2_P__ = None
     return vj1, vk1
 
 def get_int3c2e_ip2_vjk(intopt, rhoj, rhok, dm0_tag, auxslices,
@@ -1022,8 +1028,11 @@ def _int3c2e_ip1_wjk_task(intopt, task_list, dm0, orbo, wk, device_id=0, with_k=
                 wj[k0:k1,i0:i1] += contract('xpji,ij->pix', int3c_blk, dm0[i0:i1,j0:j1])
                 if with_k:
                     wk_tmp[:,i0:i1] += contract('xpji,jo->piox', int3c_blk, orbo[j0:j1])
+                int3c_blk = None
             if with_k:
-                wk_tmp.get(out=wk[k0:k1])
+                #wk_tmp.get(out=wk[k0:k1])
+                copy_array(wk_tmp, wk[k0:k1])
+            wk_tmp = None
     return wj
 
 def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None):
@@ -1075,6 +1084,8 @@ def _int3c2e_ip2_wjk(intopt, task_list, dm0, orbo, with_k=True, omega=None, devi
             if with_k:
                 tmp = contract('xpji,jo->piox', int3c_blk, orbo[j0:j1])
                 wk[k0:k1] += contract('piox,ir->prox', tmp, orbo[i0:i1])
+                tmp = None
+            int3c_blk = None
     return wj, wk
 
 def get_int3c2e_ip2_wjk(intopt, dm0_tag, with_k=True, omega=None):
diff --git a/gpu4pyscf/gto/int3c1e.py b/gpu4pyscf/gto/int3c1e.py
index 1e97c39b..cab38b98 100644
--- a/gpu4pyscf/gto/int3c1e.py
+++ b/gpu4pyscf/gto/int3c1e.py
@@ -222,8 +222,8 @@ def get_int3c1e(mol, grids, charge_exponents, intopt):
                         "which requires {total_double_number * 8 / 1e9 : .1f} GB of memory")
     ngrids_per_split = (ngrids + n_grid_split - 1) // n_grid_split
 
-    buf_size = ngrids * nao * nao * 8
-    int3c_pinned_buf = cp.cuda.alloc_pinned_memory(buf_size)
+    buf_size = ngrids * nao * nao
+    int3c_pinned_buf = cp.cuda.alloc_pinned_memory(buf_size * 8)
     int3c = np.frombuffer(int3c_pinned_buf, np.float64, buf_size).reshape([ngrids, nao, nao], order='C')
     # int3c = np.zeros([ngrids, nao, nao], order='C') # Using unpinned (pageable) memory, each memcpy is much slower, but there's no initialization time
 
diff --git a/gpu4pyscf/lib/cupy_helper.py b/gpu4pyscf/lib/cupy_helper.py
index 5828bbfe..41e11307 100644
--- a/gpu4pyscf/lib/cupy_helper.py
+++ b/gpu4pyscf/lib/cupy_helper.py
@@ -23,6 +23,7 @@
 from gpu4pyscf.gto import mole
 from gpu4pyscf.lib.cutensor import contract
 from gpu4pyscf.lib.cusolver import eigh, cholesky  #NOQA
+from gpu4pyscf.lib.memcpy import copy_array  #NOQA
 from gpu4pyscf.__config__ import _streams, _num_devices, _p2p_access
 
 LMAX_ON_GPU = 7
@@ -93,10 +94,16 @@ def p2p_transfer(a, b):
         # https://github.com/cupy/cupy/blob/v13.3.0/cupy/_core/_routines_indexing.pyx#L1015
         a[:] = b
     else:
+        #copy_array(b, a)
         with cupy.cuda.Device(a.device):
             # TODO: reduce memory copy, a can be non-contiguous array
-            a[:] = cupy.asarray(b.get())
-
+            #a[:] = cupy.asarray(b.get())
+            copy_array(b, a)
+            if np.linalg.norm(a.get() - b.get()) > 1e-3:
+                print(a[:5], a.device, a.strides, a.shape)
+                print(b[:5], b.device, b.strides, b.shape)
+                print(a.shape, b.shape)
+                exit()
 def concatenate(array_list):
     ''' Concatenate axis=0 only
     '''
diff --git a/gpu4pyscf/lib/memcpy.py b/gpu4pyscf/lib/memcpy.py
new file mode 100644
index 00000000..d3695168
--- /dev/null
+++ b/gpu4pyscf/lib/memcpy.py
@@ -0,0 +1,90 @@
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import cupy
+import numpy as np
+
+def find_contiguous_chunks(shape, h_strides, d_strides):
+    """
+    Find the largest contiguous chunk size based on strides and shape.
+    """
+    chunk_shape = []
+    chunk_size = 1
+    for dim, h_stride, d_stride in zip(reversed(shape), reversed(h_strides), reversed(d_strides)):
+        if h_stride == chunk_size and d_stride == chunk_size:
+            chunk_shape.append(dim)
+            chunk_size *= dim
+        else:
+            break
+    chunk_shape = tuple(reversed(chunk_shape))
+    return chunk_shape, chunk_size
+
+def copy_array(src_view, out=None):
+    ''' Copy cupy/numpy array to cupy array if out is None
+        Copy cupy/numpy array to cupy/numpy array (out)
+    '''
+    if out is None:
+        out = cupy.empty_like(src_view)
+    else:
+        # Ensure both arrays have the same shape
+        if src_view.shape != out.shape:
+            raise ValueError("Host and device views must have the same shape.")
+    return _copy_array(src_view, out)
+
+def _copy_array(src_view, dst_view):
+    ''' Copy data from cupy/numpy array to another cupy/numpy array
+    Check memory layout, then copy memory chunks by cupy.cuda.runtime.memcpy
+    '''
+    shape = src_view.shape
+    itemsize = src_view.itemsize
+    strides_src = [stride // itemsize for stride in src_view.strides]
+    strides_dst = [stride // itemsize for stride in dst_view.strides]
+
+    # Find the largest contiguous chunk
+    chunk_shape, chunk_size = find_contiguous_chunks(shape, strides_src, strides_dst)
+
+    if isinstance(src_view, cupy.ndarray):
+        src_data_ptr = src_view.data.ptr
+    else:
+        src_data_ptr = src_view.ctypes.data
+
+    if isinstance(dst_view, cupy.ndarray):
+        dst_data_ptr = dst_view.data.ptr
+    else:
+        dst_data_ptr = dst_view.ctypes.data
+
+    if isinstance(src_view, cupy.ndarray) and isinstance(dst_view, cupy.ndarray):
+        kind = cupy.cuda.runtime.memcpyDeviceToDevice
+    elif isinstance(src_view, cupy.ndarray) and isinstance(dst_view, np.ndarray):
+        kind = cupy.cuda.runtime.memcpyDeviceToHost
+    elif isinstance(src_view, np.ndarray) and isinstance(dst_view, cupy.ndarray):
+        kind = cupy.cuda.runtime.memcpyHostToDevice
+    else:
+        raise NotImplementedError
+    
+    # Transfer data chunk-by-chunk
+    outer_dims = shape[:-len(chunk_shape)]
+    for outer_index in np.ndindex(*outer_dims):
+        # Compute offsets for the current outer slice
+        src_offset = sum(outer_index[i] * strides_src[i] for i in range(len(outer_dims)))
+        dst_offset = sum(outer_index[i] * strides_dst[i] for i in range(len(outer_dims)))
+        # Perform the memcpy for the contiguous chunk
+        cupy.cuda.runtime.memcpy(
+            dst_data_ptr + dst_offset * dst_view.itemsize,
+            src_data_ptr + src_offset * src_view.itemsize,
+            chunk_size * src_view.itemsize,
+            kind
+        )
+    return dst_view
diff --git a/gpu4pyscf/lib/tests/test_cupy_helper.py b/gpu4pyscf/lib/tests/test_cupy_helper.py
index 0f406c82..21556df2 100644
--- a/gpu4pyscf/lib/tests/test_cupy_helper.py
+++ b/gpu4pyscf/lib/tests/test_cupy_helper.py
@@ -19,7 +19,8 @@
 from gpu4pyscf.lib.cupy_helper import (
     take_last2d, transpose_sum, krylov, unpack_sparse,
     add_sparse, takebak, empty_mapped, dist_matrix,
-    grouped_dot, grouped_gemm, cond, cart2sph_cutensor, cart2sph)
+    grouped_dot, grouped_gemm, cond, cart2sph_cutensor, cart2sph,
+    copy_array)
 
 class KnownValues(unittest.TestCase):
     def test_take_last2d(self):
@@ -214,6 +215,41 @@ def test_unpack_tril(self):
         ref[:,idx,idy] = atril
         assert abs(a - ref).max() < 1e-12
 
+    def test_copy_host2dev(self):
+        host_array = cupy.cuda.alloc_pinned_memory(10*10*10 * 8)
+        host_data = numpy.ndarray(10**3, dtype=cupy.float64, buffer=host_array)
+        host_data = host_data.reshape(10,10,10)
+        host_data += numpy.random.rand(10,10,10)
+
+        device_data = cupy.empty_like(host_data)
+        host_view = host_data[:, 8:]  # Non-contiguous view on the host
+        device_view = device_data[:, 8:]  # Non-contiguous view on the device
+
+        copy_array(host_view, device_view)
+        assert numpy.linalg.norm(host_view - device_view.get()) < 1e-10
+
+        copy_array(host_view.copy(), device_view)
+        assert numpy.linalg.norm(host_view - device_view.get()) < 1e-10
+
+        device_view = copy_array(host_view)
+        assert numpy.linalg.norm(host_view - device_view.get()) < 1e-10
+
+    def test_copy_dev2host(self):
+        host_array = cupy.cuda.alloc_pinned_memory(10*10*10 * 8)
+        host_data = numpy.ndarray(10**3, dtype=cupy.float64, buffer=host_array)
+        host_data = host_data.reshape(10,10,10)
+
+        device_data = cupy.zeros_like(host_data)
+        device_data += cupy.random.rand(10,10,10)
+        host_view = host_data[:, 8:]  # Non-contiguous view on the host
+        device_view = device_data[:, 8:]  # Non-contiguous view on the device
+
+        copy_array(device_view, host_view)
+        assert numpy.linalg.norm(host_view - device_view.get()) < 1e-10
+
+        copy_array(device_view.copy(), host_view)
+        assert numpy.linalg.norm(host_view - device_view.get()) < 1e-10
+
 if __name__ == "__main__":
     print("Full tests for cupy helper module")
     unittest.main()
diff --git a/gpu4pyscf/tests/test_benchmark_rks.py b/gpu4pyscf/tests/test_benchmark_rks.py
index 59eb2e39..c1be2979 100644
--- a/gpu4pyscf/tests/test_benchmark_rks.py
+++ b/gpu4pyscf/tests/test_benchmark_rks.py
@@ -150,8 +150,7 @@ def test_rb3lyp_grad_median(benchmark):
 def test_rb3lyp_hessian_median(benchmark):
     h = benchmark(run_rb3lyp_hessian, median_mol, 'def2-tzvpp', False, False)
     print('testing rb3lyp hessian median')
-    print(np.linalg.norm(h))
-    assert np.isclose(np.linalg.norm(h))
+    assert np.isclose(np.linalg.norm(h), 6.312714778020796, atol=1e-4)
 
 # large molecule
 @pytest.mark.benchmark
diff --git a/gpu4pyscf/tests/test_dft.py b/gpu4pyscf/tests/test_dft.py
deleted file mode 100644
index 06bfbe4c..00000000
--- a/gpu4pyscf/tests/test_dft.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-import numpy as np
-import pyscf
-import pytest
-import cupy
-from gpu4pyscf.dft import rks, uks
-
-def setUpModule():
-    global mol
-    atom = '''
-C                 -0.07551087    1.68127663   -0.10745193
-O                  1.33621755    1.87147409   -0.39326987
-C                  1.67074668    2.95729545    0.49387976
-C                  0.41740763    3.77281969    0.78495878
-C                 -0.60481480    3.07572636    0.28906224
-H                 -0.19316298    1.01922455    0.72486113
-O                  0.35092043    5.03413298    1.45545728
-H                  0.42961487    5.74279041    0.81264173
-O                 -1.95331750    3.53349874    0.15912025
-H                 -2.55333895    2.78846397    0.23972698
-O                  2.81976302    3.20110148    0.94542226
-C                 -0.81772499    1.09230218   -1.32146482
-H                 -0.70955636    1.74951833   -2.15888136
-C                 -2.31163857    0.93420736   -0.98260166
-H                 -2.72575463    1.89080093   -0.74107186
-H                 -2.41980721    0.27699120   -0.14518512
-O                 -0.26428017   -0.18613595   -1.64425697
-H                 -0.72695910   -0.55328886   -2.40104423
-O                 -3.00083741    0.38730252   -2.10989934
-H                 -3.93210821    0.28874990   -1.89865997
-'''
-
-    mol = pyscf.M(atom=atom, basis='def2-tzvpp', max_memory=32000, cart=0)
-    mol.output = '/dev/null'
-    mol.build()
-    mol.verbose = 1
-
-def tearDownModule():
-    global mol
-    mol.stdout.close()
-    del mol
-
-class KnownValues(unittest.TestCase):
-    @pytest.mark.smoke
-    def test_b3lyp_with_d3bj(self):
-        print('-------- DFRKS with D3(BJ) -------')
-        mf = rks.RKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit')
-        mf.grids.atom_grid = (99,590)
-        mf.conv_tol = 1e-10
-        mf.conv_tol_cpscf = 1e-8
-        mf.disp = 'd3bj'
-        e_dft = mf.kernel()
-        assert np.abs(e_dft - -685.0326965348272) < 1e-7
-
-        g = mf.nuc_grad_method().kernel()
-        assert np.abs(cupy.linalg.norm(g) - 0.17498362161082373) < 1e-5
-
-        h = mf.Hessian().kernel()
-        assert np.abs(cupy.linalg.norm(h) - 3.7684319231335377) < 1e-4
-    
-    @pytest.mark.smoke
-    def test_b3lyp_d3bj(self):
-        print('-------- DFRKS with D3(BJ) -------')
-        mf = rks.RKS(mol, xc='b3lyp-d3bj').density_fit(auxbasis='def2-tzvpp-jkfit')
-        mf.grids.atom_grid = (99,590)
-        mf.conv_tol = 1e-10
-        mf.conv_tol_cpscf = 1e-8
-        e_dft = mf.kernel()
-        assert np.abs(e_dft - -685.0326965348272) < 1e-7
-
-        g = mf.nuc_grad_method().kernel()
-        assert np.abs(cupy.linalg.norm(g) - 0.17498362161082373) < 1e-5
-
-        h = mf.Hessian().kernel()
-        assert np.abs(cupy.linalg.norm(h) - 3.7684319231335377) < 1e-4
-
-    @pytest.mark.smoke
-    def test_DFUKS(self):
-        print('------- DFUKS with D3(BJ) -------')
-        mf = uks.UKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit')
-        mf.grids.atom_grid = (99,590)
-        mf.conv_tol = 1e-10
-        mf.conv_tol_cpscf = 1e-8
-        mf.disp = 'd3bj'
-        e_dft = mf.kernel()
-        assert np.abs(e_dft - -685.0326965349493) < 1e-7
-
-        g = mf.nuc_grad_method().kernel()
-        assert np.abs(cupy.linalg.norm(g) - 0.17498264516108836) < 1e-5
-
-        h = mf.Hessian().kernel()
-        assert np.abs(cupy.linalg.norm(h) - 3.768429871470736) < 1e-4
-
-    @pytest.mark.smoke
-    def test_RKS(self):
-        print('-------- RKS with D3(BJ) -------')
-        mf = rks.RKS(mol, xc='b3lyp')
-        mf.grids.atom_grid = (99,590)
-        mf.conv_tol = 1e-12
-        mf.conv_tol_cpscf = 1e-8
-        mf.disp = 'd3bj'
-        e_dft = mf.kernel()
-        assert np.abs(e_dft - -685.0325611822375) < 1e-7
-
-        g = mf.nuc_grad_method().kernel()
-        assert np.abs(cupy.linalg.norm(g) - 0.1750368231223345) < 1e-6
-
-        h = mf.Hessian().kernel()
-        assert np.abs(cupy.linalg.norm(h) - 3.760381249106394) < 1e-4
-
-    @pytest.mark.smoke
-    def test_UKS(self):
-        print('-------- UKS with D3(BJ) -------')
-        mf = uks.UKS(mol, xc='b3lyp')
-        mf.grids.atom_grid = (99,590)
-        mf.conv_tol = 1e-12
-        mf.conv_tol_cpscf = 1e-8
-        mf.disp = 'd3bj'
-        e_dft = mf.kernel()
-        assert np.abs(e_dft - -685.0325611822375) < 1e-7
-
-        g = mf.nuc_grad_method().kernel()
-        assert np.abs(cupy.linalg.norm(g) - 0.1750368231223345) < 1e-6
-
-        h = mf.Hessian().kernel()
-        assert np.abs(cupy.linalg.norm(h) - 3.760381249106394) < 1e-4
-
-    @pytest.mark.smoke
-    def test_DFRKS_with_SMD(self):
-        print('----- DFRKS with SMD -----')
-        mf = rks.RKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit')
-        mf = mf.SMD()
-        mf.grids.atom_grid = (99,590)
-        mf.conv_tol = 1e-10
-        mf.conv_tol_cpscf = 1e-8
-        mf.disp = 'd3bj'
-        e_dft = mf.kernel()
-        assert np.abs(e_dft - -685.0578838805443) < 1e-7
-
-        g = mf.nuc_grad_method().kernel()
-        assert np.abs(cupy.linalg.norm(g) - 0.16804945458657145) < 1e-5
-
-        h = mf.Hessian().kernel()
-        assert np.abs(cupy.linalg.norm(h) - 3.741783814494321) < 1e-4
-
-    @pytest.mark.smoke
-    def test_DFUKS_with_SMD(self):
-        print('------- DFUKS with SMD ---------')
-        mf = uks.UKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit')
-        mf = mf.SMD()
-        mf.grids.atom_grid = (99,590)
-        mf.conv_tol = 1e-10
-        mf.conv_tol_cpscf = 1e-8
-        mf.disp = 'd3bj'
-        e_dft = mf.kernel()
-        assert np.abs(e_dft - -685.05788388063) < 1e-7
-
-        g = mf.nuc_grad_method().kernel()
-        assert np.abs(cupy.linalg.norm(g) - 0.1680496465773684) < 1e-5
-
-        h = mf.Hessian().kernel()
-        assert np.abs(cupy.linalg.norm(h) - 3.7417788481647563) < 1e-4
-
-if __name__ == "__main__":
-    print("Full Smoke Tests")
-    unittest.main()
-

From c8846d2ac5a6203fad64c4cc494174d5c322ecf1 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Mon, 30 Dec 2024 22:16:53 +0000
Subject: [PATCH 28/49] assert chunk_shape

---
 gpu4pyscf/lib/memcpy.py                 | 3 +++
 gpu4pyscf/lib/tests/test_cupy_helper.py | 6 +++---
 gpu4pyscf/tests/test_benchmark_rks.py   | 4 +++-
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/gpu4pyscf/lib/memcpy.py b/gpu4pyscf/lib/memcpy.py
index d3695168..0f4faa6b 100644
--- a/gpu4pyscf/lib/memcpy.py
+++ b/gpu4pyscf/lib/memcpy.py
@@ -47,6 +47,7 @@ def _copy_array(src_view, dst_view):
     ''' Copy data from cupy/numpy array to another cupy/numpy array
     Check memory layout, then copy memory chunks by cupy.cuda.runtime.memcpy
     '''
+
     shape = src_view.shape
     itemsize = src_view.itemsize
     strides_src = [stride // itemsize for stride in src_view.strides]
@@ -74,6 +75,8 @@ def _copy_array(src_view, dst_view):
     else:
         raise NotImplementedError
     
+    assert len(chunk_shape) > 0
+
     # Transfer data chunk-by-chunk
     outer_dims = shape[:-len(chunk_shape)]
     for outer_index in np.ndindex(*outer_dims):
diff --git a/gpu4pyscf/lib/tests/test_cupy_helper.py b/gpu4pyscf/lib/tests/test_cupy_helper.py
index 21556df2..b322f8ed 100644
--- a/gpu4pyscf/lib/tests/test_cupy_helper.py
+++ b/gpu4pyscf/lib/tests/test_cupy_helper.py
@@ -236,11 +236,11 @@ def test_copy_host2dev(self):
 
     def test_copy_dev2host(self):
         host_array = cupy.cuda.alloc_pinned_memory(10*10*10 * 8)
-        host_data = numpy.ndarray(10**3, dtype=cupy.float64, buffer=host_array)
-        host_data = host_data.reshape(10,10,10)
+        host_data = numpy.ndarray(3*10**2, dtype=cupy.float64, buffer=host_array)
+        host_data = host_data.reshape(3,10,10)
 
         device_data = cupy.zeros_like(host_data)
-        device_data += cupy.random.rand(10,10,10)
+        device_data += cupy.random.rand(3,10,10)
         host_view = host_data[:, 8:]  # Non-contiguous view on the host
         device_view = device_data[:, 8:]  # Non-contiguous view on the device
 
diff --git a/gpu4pyscf/tests/test_benchmark_rks.py b/gpu4pyscf/tests/test_benchmark_rks.py
index c1be2979..321fd56f 100644
--- a/gpu4pyscf/tests/test_benchmark_rks.py
+++ b/gpu4pyscf/tests/test_benchmark_rks.py
@@ -245,6 +245,8 @@ def test_df_rb3lyp_631gs_solvent_hessian(benchmark):
     print('testing df rb3lyp 631gs solvent hessian')
     assert np.isclose(np.linalg.norm(h), 3.9008165041707294, atol=1e-4)
 
+# No need to test d3bj generally
+'''
 # b3lyp d3bj
 @pytest.mark.benchmark
 def test_df_rb3lyp_631gs_d3bj(benchmark):
@@ -261,4 +263,4 @@ def test_df_rb3lyp_631gs_d3bj_hessian(benchmark):
     h = benchmark(run_rb3lyp_hessian, small_mol, '6-31gs', True, True, 'd3bj')
     print('testing df rb3lyp 631gs solvent hessian')
     assert np.isclose(np.linalg.norm(h), 3.902367554157861, atol=1e-4)
-
+'''

From ca18282854c1fb3d5e03047f7fd3517ea61a083e Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Tue, 31 Dec 2024 22:23:16 +0000
Subject: [PATCH 29/49] improve hcore derivatives

---
 examples/dft_driver.py                |   1 -
 gpu4pyscf/df/df.py                    |   9 +-
 gpu4pyscf/df/hessian/jk.py            |  12 +-
 gpu4pyscf/df/hessian/rhf.py           |  13 ++-
 gpu4pyscf/df/int3c2e.py               | 151 +++++++++++++++++---------
 gpu4pyscf/hessian/rhf.py              |  29 +++--
 gpu4pyscf/hessian/rks.py              | 126 ++++++++++++++++++++-
 gpu4pyscf/lib/memcpy.py               |  10 +-
 gpu4pyscf/tests/test_benchmark_rks.py |  40 +++----
 9 files changed, 288 insertions(+), 103 deletions(-)

diff --git a/examples/dft_driver.py b/examples/dft_driver.py
index e0eccdda..0be7f410 100644
--- a/examples/dft_driver.py
+++ b/examples/dft_driver.py
@@ -27,7 +27,6 @@
 parser.add_argument("--solvent",      type=str,  default='')
 args = parser.parse_args()
 
-lib.num_threads(16)
 start_time = time.time()
 bas = args.basis
 mol = pyscf.M(
diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py
index 442c1bab..67b30c0a 100644
--- a/gpu4pyscf/df/df.py
+++ b/gpu4pyscf/df/df.py
@@ -347,14 +347,15 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, de
             ij1 = pairs_loc[cp_ij_id+1]
             if isinstance(_cderi[0], np.ndarray):
                 for slice_id, (p0,p1) in enumerate(lib.prange(0, naux, blksize)):
-                    for i in range(p0,p1):
-                        #cderi_block[i].get(out=_cderi[slice_id][i-p0,ij0:ij1])
-                        copy_array(cderi_block[i], _cderi[slice_id][i-p0,ij0:ij1])
+                    #for i in range(p0,p1):
+                    #    cderi_block[i].get(out=_cderi[slice_id][i-p0,ij0:ij1])
+                    tmp = cupy.array(cderi_block[p0:p1], order='C', copy=True)
+                    copy_array(tmp, _cderi[slice_id][:p1-p0,ij0:ij1])
             else:
                 # Copy data to other Devices
                 for slice_id, (p0,p1) in enumerate(lib.prange(0, naux, blksize)):
                     #_cderi[slice_id][:,ij0:ij1] = cderi_block[p0:p1]
                     tmp = cupy.array(cderi_block[p0:p1], order='C', copy=True)
                     p2p_transfer(_cderi[slice_id][:,ij0:ij1], tmp)
-            t1 = log.timer_debug1(f'transfer data for {cp_ij_id} / {nq} on Device {device_id}', *t1)
+            t1 = log.timer_debug1(f'transfer data for {cp_ij_id} / {nq} on Device {device_id}', *t1)    
     return
diff --git a/gpu4pyscf/df/hessian/jk.py b/gpu4pyscf/df/hessian/jk.py
index 8a2b59bd..6b08cee5 100644
--- a/gpu4pyscf/df/hessian/jk.py
+++ b/gpu4pyscf/df/hessian/jk.py
@@ -314,8 +314,10 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo,
             # (20|0), (0|0)(0|00)
             int3c_blk = _get_int3c2e_ipip_slice('ipip1', intopt, cp_ij_id, aux_id, omega=omega)
             if with_j:
-                tmp = contract('xpji,ij->xpi', int3c_blk, dm0[i0:i1,j0:j1])
-                hj_ipip1[:,i0:i1] += contract('xpi,p->xi', tmp, rhoj[k0:k1])
+                #tmp = contract('xpji,ij->xpi', int3c_blk, dm0[i0:i1,j0:j1])
+                #hj_ipip1[:,i0:i1] += contract('xpi,p->xi', tmp, rhoj[k0:k1])
+                tmp = contract('xpji,p->xji', int3c_blk, rhoj[k0:k1])
+                hj_ipip1[:,i0:i1] += contract('xji,ij->xi', tmp, dm0[i0:i1,j0:j1])
             if with_k:
                 hk_ipip1[:,i0:i1] += contract('xpji,pji->xi', int3c_blk, rhok_tmp)
             int3c_blk = None
@@ -323,8 +325,10 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo,
             # (11|0), (0|0)(0|00) without response of RI basis
             int3c_blk = _get_int3c2e_ipip_slice('ipvip1', intopt, cp_ij_id, aux_id, omega=omega)
             if with_j:
-                tmp = contract('xpji,ij->xpij', int3c_blk, dm0[i0:i1,j0:j1])
-                hj_ipvip1[:,i0:i1,j0:j1] += contract('xpij,p->xij', tmp, rhoj[k0:k1])
+                #tmp = contract('xpji,ij->xpij', int3c_blk, dm0[i0:i1,j0:j1])
+                #hj_ipvip1[:,i0:i1,j0:j1] += contract('xpij,p->xij', tmp, rhoj[k0:k1])
+                tmp = contract('xpji,p->xji', int3c_blk, rhoj[k0:k1])
+                hj_ipvip1[:,i0:i1,j0:j1] += contract('xji,ij->xij', tmp, dm0[i0:i1,j0:j1])
             if with_k:
                 hk_ipvip1[:,i0:i1,j0:j1] += contract('xpji,pji->xij', int3c_blk, rhok_tmp)
             int3c_blk = None
diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py
index 57dfc363..a9023dc6 100644
--- a/gpu4pyscf/df/hessian/rhf.py
+++ b/gpu4pyscf/df/hessian/rhf.py
@@ -200,7 +200,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls
                 hk_ao_aux[i0:i1] -= contract('qoi,qioxy->iqxy', rhok0_P_I, wk1_I)
                 wk1_I = rhok0_P_I = None
         rhok1_Pko = None
-        t1 = log.timer_debug1('contractions with int3c2e_ip1', *t1)
+        t1 = log.timer_debug1('contract int3c2e_ip1 with int2c_ip1', *t1)
         
         w, v = cupy.linalg.eigh(int2c)
         idx = w > LINEAR_DEP_THR
@@ -215,13 +215,14 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls
             if isinstance(rhok1_Pko, cupy.ndarray):
                 rhok1_Pko[:,i0:i1] = contract('qp,qiox->piox', cd_low, wk1_tmp)
             else:
-                rhok1_Pko[:,i0:i1] = contract('qp,qiox->piox', cd_low, wk1_tmp).get()
+                #rhok1_Pko[:,i0:i1] = contract('qp,qiox->piox', cd_low, wk1_tmp).get()
+                wk1_tmp = contract('qp,qiox->piox', cd_low, wk1_tmp)
+                copy_array(wk1_tmp, rhok1_Pko[:,i0:i1])
             wk1_tmp = None
         cd_low = None
-        t1 = log.timer_debug1('data transfer', *t1)
         hk_ao_ao += _hk_ip1_ip1(rhok1_Pko, dm0, mocc_2)
     wk1_Pko = rhok1_Pko = None
-    t1 = log.timer_debug1('contractions with int3c2e_ip1', *t1)
+    t1 = log.timer_debug1('contract int3c2e_ip1 with int3c2e_ip1', *t1)
 
     hj_ipip, hk_ipip = jk.get_int3c2e_hjk(intopt, rhoj0_P, rhok0_P__, dm0_tag,
                                           with_j=with_j, with_k=with_k, omega=omega,
@@ -344,7 +345,8 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls
     # -----------------------------------------
     #        collecting all
     # -----------------------------------------
-    e1 = cupy.zeros([len(atmlst),len(atmlst),3,3])
+    natm = len(atmlst)
+    e1 = cupy.zeros([natm,natm,3,3])
     ej = hj_ipip
     ek = hk_ipip
 
@@ -394,6 +396,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls
                     _ek = cupy.sum(hk_aux_aux[p0:p1,q0:q1], axis=[0,1])
                     ek[i0,j0] += _ek * .5
                     ek[j0,i0] += _ek.T * .5
+    
     for i0, ia in enumerate(atmlst):
         for j0 in range(i0):
             e1[j0,i0] = e1[i0,j0].T
diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py
index 8089ef76..54432c66 100644
--- a/gpu4pyscf/df/int3c2e.py
+++ b/gpu4pyscf/df/int3c2e.py
@@ -405,7 +405,8 @@ def get_int3c2e_wjk(mol, auxmol, dm0_tag, thred=1e-12, omega=None, with_j=True,
             if isinstance(wk, cupy.ndarray):
                 wk[k0:k1] = rhok_tmp
             else:
-                rhok_tmp.get(out=wk[k0:k1])
+                #rhok_tmp.get(out=wk[k0:k1])
+                copy_array(rhok_tmp, wk[k0:k1])
     return wj, wk
 
 def get_int3c2e_ip_jk(intopt, cp_aux_id, ip_type, rhoj, rhok, dm, omega=None, stream=None):
@@ -770,6 +771,48 @@ def get_j_int3c2e_pass2(intopt, rhoj, stream=None):
     vj = vj + vj.T
     return vj
 
+def _int3c2e_jk_task(intopt, task_list, dm0, mocc, device_id=0, omega=None):
+    with cupy.cuda.Device(device_id), _streams[device_id]:
+        log = logger.new_logger(intopt.mol, intopt.mol.verbose)
+        t0 = log.init_timer()
+        mocc = cupy.asarray(mocc)
+        dm0 = cupy.asarray(dm0)
+        naux = intopt.auxmol.nao
+        nocc = mocc.shape[1]
+        rhoj = cupy.zeros([naux])
+        rhok = cupy.zeros([naux,nocc,nocc])
+        for cp_kl_id in task_list:
+            k0 = intopt.aux_ao_loc[cp_kl_id]
+            k1 = intopt.aux_ao_loc[cp_kl_id+1]
+            rhoj_tmp = cupy.zeros([k1-k0], order='C')
+            rhok_tmp = cupy.zeros([k1-k0, nocc, nocc], order='C')
+            for cp_ij_id, _ in enumerate(intopt.log_qs):
+                cpi = intopt.cp_idx[cp_ij_id]
+                cpj = intopt.cp_jdx[cp_ij_id]
+                li = intopt.angular[cpi]
+                lj = intopt.angular[cpj]
+                int3c_blk = get_int3c2e_slice(intopt, cp_ij_id, cp_kl_id, omega=omega)
+                if not intopt.mol.cart:
+                    int3c_blk = cart2sph(int3c_blk, axis=1, ang=lj)
+                    int3c_blk = cart2sph(int3c_blk, axis=2, ang=li)
+                i0, i1 = intopt.ao_loc[cpi], intopt.ao_loc[cpi+1]
+                j0, j1 = intopt.ao_loc[cpj], intopt.ao_loc[cpj+1]
+                if cpi == cpj and intopt.aosym:
+                    int3c_blk *= 0.5
+
+                rhoj_tmp += contract('pji,ij->p', int3c_blk, dm0[i0:i1,j0:j1])
+                ints_o = contract('pji,jo->poi', int3c_blk, mocc[j0:j1])
+                rhok_tmp += contract('poi,ir->por', ints_o, mocc[i0:i1])
+
+            if intopt.aosym:
+                rhoj[k0:k1] = 2.0 * rhoj_tmp
+                rhok[k0:k1] = rhok_tmp + rhok_tmp.transpose([0,2,1])
+            else:
+                rhoj[k0:k1] = rhoj_tmp
+                rhok[k0:k1] = rhok_tmp
+        t0 = log.timer_debug1(f'int3c2e_vjk on Device {device_id}', *t0)
+    return rhoj, rhok
+
 def get_int3c2e_jk(mol, auxmol, dm0_tag, with_k=True, omega=None):
     '''
     get rhoj and rhok for int3c2e
@@ -777,44 +820,46 @@ def get_int3c2e_jk(mol, auxmol, dm0_tag, with_k=True, omega=None):
     intopt = VHFOpt(mol, auxmol, 'int2e')
     intopt.build(1e-14, diag_block_with_triu=True, aosym=True, group_size=BLKSIZE, group_size_aux=BLKSIZE)
 
-    if omega is None: omega = 0.0
-    naux = auxmol.nao
     orbo = cupy.asarray(dm0_tag.occ_coeff, order='C')
-    nocc = orbo.shape[1]
-    rhoj = cupy.empty([naux])
-    rhok = cupy.empty([naux,nocc,nocc])
+    futures = []
+    aux_ao_loc = np.array(intopt.aux_ao_loc)
+    loads = aux_ao_loc[1:] - aux_ao_loc[:-1]
+    task_list = _split_tasks(loads, _num_devices)
 
-    for cp_kl_id, _ in enumerate(intopt.aux_log_qs):
-        k0 = intopt.aux_ao_loc[cp_kl_id]
-        k1 = intopt.aux_ao_loc[cp_kl_id+1]
-        rhoj_tmp = cupy.zeros([k1-k0], order='C')
-        rhok_tmp = cupy.zeros([k1-k0, nocc, nocc], order='C')
-        for cp_ij_id, _ in enumerate(intopt.log_qs):
-            cpi = intopt.cp_idx[cp_ij_id]
-            cpj = intopt.cp_jdx[cp_ij_id]
-            li = intopt.angular[cpi]
-            lj = intopt.angular[cpj]
-            int3c_blk = get_int3c2e_slice(intopt, cp_ij_id, cp_kl_id, omega=omega)
-            if not intopt.mol.cart:
-                int3c_blk = cart2sph(int3c_blk, axis=1, ang=lj)
-                int3c_blk = cart2sph(int3c_blk, axis=2, ang=li)
-            i0, i1 = intopt.ao_loc[cpi], intopt.ao_loc[cpi+1]
-            j0, j1 = intopt.ao_loc[cpj], intopt.ao_loc[cpj+1]
-            if cpi == cpj and intopt.aosym:
-                int3c_blk *= 0.5
+    cupy.cuda.get_current_stream().synchronize()
+    with ThreadPoolExecutor(max_workers=_num_devices) as executor:
+        for device_id in range(_num_devices):
+            future = executor.submit(
+                _int3c2e_jk_task, intopt, task_list[device_id],
+                dm0_tag, orbo, device_id=device_id, omega=omega)
+            futures.append(future)
 
-            rhoj_tmp += contract('pji,ij->p', int3c_blk, dm0_tag[i0:i1,j0:j1])
-            ints_o = contract('pji,jo->poi', int3c_blk, orbo[j0:j1])
-            rhok_tmp += contract('poi,ir->por', ints_o, orbo[i0:i1])
+    rhoj_total = []
+    rhok_total = []
+    for future in futures:
+        rhoj, rhok = future.result()
+        rhoj_total.append(rhoj)
+        rhok_total.append(rhok)
 
-        if intopt.aosym:
-            rhoj[k0:k1] = 2.0 * rhoj_tmp
-            rhok[k0:k1] = rhok_tmp + rhok_tmp.transpose([0,2,1])
-        else:
-            rhoj[k0:k1] = rhoj_tmp
-            rhok[k0:k1] = rhok_tmp
+    rhoj = rhok = None
+    rhoj = reduce_to_device(rhoj_total, inplace=True)
+    if with_k:
+        rhok = reduce_to_device(rhok_total, inplace=True)
     return rhoj, rhok
 
+def _split_tasks(loads, ngroups):
+    ''' Split a list of numbers into sublists with sums as close as possible
+    '''
+    if ngroups == 1:
+        return [range(len(loads))]
+    groups = [[] for _ in range(ngroups)]
+    sums = [0] * 4
+    for i, load in enumerate(loads):
+        min_index = sums.index(min(sums))
+        groups[min_index].append(i)
+        sums[min_index] += load
+    return groups
+
 def _int3c2e_ip1_vjk_task(intopt, task_k_list, rhoj, rhok, dm0, orbo, device_id=0,
                           with_j=True, with_k=True, omega=None):
     natom = intopt.mol.natm
@@ -823,6 +868,8 @@ def _int3c2e_ip1_vjk_task(intopt, task_k_list, rhoj, rhok, dm0, orbo, device_id=
     vj1_buf = vk1_buf = vj1 = vk1 = None
 
     with cupy.cuda.Device(device_id), _streams[device_id]:
+        log = logger.new_logger(intopt.mol, intopt.mol.verbose)
+        t0 = log.init_timer()
         ao2atom = get_ao2atom(intopt, aoslices)
         dm0 = cupy.asarray(dm0)
         orbo = cupy.asarray(orbo)
@@ -875,7 +922,7 @@ def _int3c2e_ip1_vjk_task(intopt, task_k_list, rhoj, rhok, dm0, orbo, device_id=
                     vk1[:,:,p0:p1] += contract('xiJo,ia->axJo', vk1_ao, ao2atom)
                     rhok0_slice = vk1_ao = None
             rhok_tmp = int3c_ip1_occ = None
-
+        t0 = log.timer_debug1(f'int3c2e_ip1 on Device {device_id}', *t0)
     # TODO: absorbe vj1_buf and vk1_buf into vj1 and vk1
     return vj1_buf, vk1_buf, vj1, vk1
 
@@ -883,11 +930,10 @@ def get_int3c2e_ip1_vjk(intopt, rhoj, rhok, dm0_tag, aoslices, with_j=True,
                         with_k=True, omega=None):
     orbo = cupy.asarray(dm0_tag.occ_coeff, order='C')
     futures = []
-    ncp_k = len(intopt.aux_log_qs)
-    tasks = np.array(list(range(ncp_k)))
-    task_list = []
-    for device_id in range(_num_devices):
-        task_list.append(tasks[device_id::_num_devices])
+
+    aux_ao_loc = np.array(intopt.aux_ao_loc)
+    loads = aux_ao_loc[1:] - aux_ao_loc[:-1]
+    task_list = _split_tasks(loads, _num_devices)
 
     cupy.cuda.get_current_stream().synchronize()
     with ThreadPoolExecutor(max_workers=_num_devices) as executor:
@@ -926,6 +972,8 @@ def _int3c2e_ip2_vjk_task(intopt, task_list, rhoj, rhok, dm0, orbo,
     auxslices = intopt.auxmol.aoslice_by_atom()
     vj1 = vk1 = None
     with cupy.cuda.Device(device_id), _streams[device_id]:
+        log = logger.new_logger(intopt.mol, intopt.mol.verbose)
+        t0 = log.init_timer()
         aux2atom = get_aux2atom(intopt, auxslices)
         dm0 = cupy.asarray(dm0)
         orbo = cupy.asarray(orbo)
@@ -970,6 +1018,7 @@ def _int3c2e_ip2_vjk_task(intopt, task_list, rhoj, rhok, dm0, orbo,
                 vk1 += contract('xpir,pa->axir', vk1_tmp, aux2atom[k0:k1])
                 vk1_tmp = rhok0_oo = rhok0_slice = None
             rhok_tmp = wk2_P__ = None
+        t0 = log.timer_debug1(f'int3c2e_ip2 on Device {device_id}', *t0)
     return vj1, vk1
 
 def get_int3c2e_ip2_vjk(intopt, rhoj, rhok, dm0_tag, auxslices,
@@ -979,11 +1028,10 @@ def get_int3c2e_ip2_vjk(intopt, rhoj, rhok, dm0_tag, auxslices,
     '''
     orbo = cupy.asarray(dm0_tag.occ_coeff, order='C')
     futures = []
-    ncp_k = len(intopt.aux_log_qs)
-    tasks = np.array(list(range(ncp_k)))
-    task_list = []
-    for device_id in range(_num_devices):
-        task_list.append(tasks[device_id::_num_devices])
+
+    aux_ao_loc = np.array(intopt.aux_ao_loc)
+    loads = aux_ao_loc[1:] - aux_ao_loc[:-1]
+    task_list = _split_tasks(loads, _num_devices)
 
     cupy.cuda.get_current_stream().synchronize()
     with ThreadPoolExecutor(max_workers=_num_devices) as executor:
@@ -1013,6 +1061,8 @@ def _int3c2e_ip1_wjk_task(intopt, task_list, dm0, orbo, wk, device_id=0, with_k=
     naux = intopt.auxmol.nao
     aux_ao_loc = intopt.aux_ao_loc
     with cupy.cuda.Device(device_id), _streams[device_id]:
+        log = logger.new_logger(intopt.mol, intopt.mol.verbose)
+        t0 = log.init_timer()
         ncp_ij = len(intopt.log_qs)
         nocc = orbo.shape[1]
         wj = cupy.zeros([naux,nao,3])
@@ -1033,6 +1083,7 @@ def _int3c2e_ip1_wjk_task(intopt, task_list, dm0, orbo, wk, device_id=0, with_k=
                 #wk_tmp.get(out=wk[k0:k1])
                 copy_array(wk_tmp, wk[k0:k1])
             wk_tmp = None
+        t0 = log.timer_debug1(f'int3c2e_ip1_wjk on Device {device_id}', *t0)
     return wj
 
 def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None):
@@ -1040,11 +1091,10 @@ def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None):
     '''
     orbo = cupy.asarray(dm0_tag.occ_coeff, order='C')
     futures = []
-    ncp_k = len(intopt.aux_log_qs)
-    tasks = np.array(list(range(ncp_k)))
-    task_list = []
-    for device_id in range(_num_devices):
-        task_list.append(tasks[device_id::_num_devices])
+
+    aux_ao_loc = np.array(intopt.aux_ao_loc)
+    loads = aux_ao_loc[1:] - aux_ao_loc[:-1]
+    task_list = _split_tasks(loads, _num_devices)
 
     nao = intopt.mol.nao
     naux = intopt.auxmol.nao
@@ -1070,6 +1120,8 @@ def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None):
 
 def _int3c2e_ip2_wjk(intopt, task_list, dm0, orbo, with_k=True, omega=None, device_id=0):
     with cupy.cuda.Device(device_id), _streams[device_id]:
+        log = logger.new_logger(intopt.mol, intopt.mol.verbose)
+        t0 = log.init_timer()
         dm0 = cupy.asarray(dm0)
         orbo = cupy.asarray(orbo)
         naux = intopt.auxmol.nao
@@ -1086,6 +1138,7 @@ def _int3c2e_ip2_wjk(intopt, task_list, dm0, orbo, with_k=True, omega=None, devi
                 wk[k0:k1] += contract('piox,ir->prox', tmp, orbo[i0:i1])
                 tmp = None
             int3c_blk = None
+        t0 = log.timer_debug1(f'int3c2e_ip2_wjk on Device {device_id}', *t0)
     return wj, wk
 
 def get_int3c2e_ip2_wjk(intopt, dm0_tag, with_k=True, omega=None):
diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py
index d7596d13..52150457 100644
--- a/gpu4pyscf/hessian/rhf.py
+++ b/gpu4pyscf/hessian/rhf.py
@@ -835,14 +835,14 @@ def _e_hcore_generator(hessobj, dm):
     h1aa = cupy.asarray(h1aa)
     h1ab = cupy.asarray(h1ab)
 
-    hcore = cupy.empty((3,3,nao,nao))
     t1 = log.timer_debug1('get_hcore', *t1)
     def get_hcore(iatm, jatm):
-        nonlocal hcore
         ish0, ish1, i0, i1 = aoslices[iatm]
         jsh0, jsh1, j0, j1 = aoslices[jatm]
         rinv2aa = rinv2ab = None
         if iatm == jatm:
+            de = contract('xypq,pq->xy', h1aa[:,:,i0:i1], dm[i0:i1])
+            de+= contract('xypq,pq->xy', h1ab[:,:,i0:i1,i0:i1], dm[i0:i1,i0:i1])
             with mol.with_rinv_at_nucleus(iatm):
                 # The remaining integrals like int1e_ipiprinv are computed in
                 # hess_nuc_elec(mol, dm)
@@ -853,18 +853,16 @@ def get_hcore(iatm, jatm):
                     rinv2ab = cupy.asarray(rinv2ab)
                     rinv2aa = rinv2aa.reshape(3,3,nao,nao)
                     rinv2ab = rinv2ab.reshape(3,3,nao,nao)
-            hcore[:] = 0.
-            hcore[:,:,i0:i1] += h1aa[:,:,i0:i1]
-            hcore[:,:,i0:i1,i0:i1] += h1ab[:,:,i0:i1,i0:i1]
+            
             if rinv2aa is not None or rinv2ab is not None:
-                hcore -= rinv2aa + rinv2ab
+                hcore = -(rinv2aa + rinv2ab)
                 hcore[:,:,i0:i1] += rinv2aa[:,:,i0:i1]
                 hcore[:,:,i0:i1] += rinv2ab[:,:,i0:i1]
                 hcore[:,:,:,i0:i1] += rinv2aa[:,:,i0:i1].transpose(0,1,3,2)
                 hcore[:,:,:,i0:i1] += rinv2ab[:,:,:,i0:i1]
+                de += cupy.einsum('xypq,pq->xy', hcore, dm)
         else:
-            hcore[:] = 0.
-            hcore[:,:,i0:i1,j0:j1] += h1ab[:,:,i0:i1,j0:j1]
+            de = contract('xypq,pq->xy',h1ab[:,:,i0:i1,j0:j1],dm[i0:i1,j0:j1])
             with mol.with_rinv_at_nucleus(iatm):
                 if with_ecp and iatm in ecp_atoms:
                     shls_slice = (jsh0, jsh1, 0, nbas)
@@ -872,8 +870,9 @@ def get_hcore(iatm, jatm):
                     rinv2ab = -mol.intor('ECPscalar_iprinvip', comp=9, shls_slice=shls_slice)
                     rinv2aa = cupy.asarray(rinv2aa)
                     rinv2ab = cupy.asarray(rinv2ab)
-                    hcore[:,:,j0:j1] += rinv2aa.reshape(3,3,j1-j0,nao)
-                    hcore[:,:,j0:j1] += rinv2ab.reshape(3,3,j1-j0,nao).transpose(1,0,2,3)
+                    hcore = rinv2aa.reshape(3,3,j1-j0,nao)
+                    hcore+= rinv2ab.reshape(3,3,j1-j0,nao).transpose(1,0,2,3)
+                    de += contract('xypq,pq->xy', hcore, dm[j0:j1])
             with mol.with_rinv_at_nucleus(jatm):
                 if with_ecp and jatm in ecp_atoms:
                     shls_slice = (ish0, ish1, 0, nbas)
@@ -881,11 +880,11 @@ def get_hcore(iatm, jatm):
                     rinv2ab = -mol.intor('ECPscalar_iprinvip', comp=9, shls_slice=shls_slice)
                     rinv2aa = cupy.asarray(rinv2aa)
                     rinv2ab = cupy.asarray(rinv2ab)
-                    hcore[:,:,i0:i1] += rinv2aa.reshape(3,3,i1-i0,nao)
-                    hcore[:,:,i0:i1] += rinv2ab.reshape(3,3,i1-i0,nao)
-        de = cupy.einsum('xypq,pq->xy', hcore, dm)
-        de += cupy.einsum('xyqp,pq->xy', hcore, dm)
-        return cp.asarray(de + de_nuc_elec[:,:,iatm,jatm])
+                    hcore = rinv2aa.reshape(3,3,i1-i0,nao)
+                    hcore+= rinv2ab.reshape(3,3,i1-i0,nao)
+                    de += contract('xypq,pq->xy', hcore, dm[i0:i1])
+        # 2.0* due to the symmetry
+        return cp.asarray(2.0*de + de_nuc_elec[:,:,iatm,jatm])
     return get_hcore
 
 def hcore_generator(hessobj, mol=None):
diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py
index bffc221c..912748c7 100644
--- a/gpu4pyscf/hessian/rks.py
+++ b/gpu4pyscf/hessian/rks.py
@@ -28,7 +28,7 @@
 from gpu4pyscf.grad import rks as rks_grad
 from gpu4pyscf.dft import numint
 from gpu4pyscf.lib.cupy_helper import (contract, add_sparse, get_avail_mem,
-                                       reduce_to_device)
+                                       reduce_to_device, transpose_sum)
 from gpu4pyscf.lib import logger
 from gpu4pyscf.__config__ import _streams, _num_devices
 from gpu4pyscf.hessian import jk
@@ -702,6 +702,124 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory):
     vmat = reduce_to_device(vmat_dist, inplace=True)
     return vmat
 
+def _nr_rks_fxc_mo_task(ni, mol, grids, xc_code, fxc, mo_coeff, mo1, mocc,
+                        verbose=None, hermi=1, device_id=0):
+    with cupy.cuda.Device(device_id), _streams[device_id]:
+        if mo_coeff is not None: mo_coeff = cupy.asarray(mo_coeff)
+        if mo1 is not None: mo1 = cupy.asarray(mo1)
+        if mocc is not None: mocc = cupy.asarray(mocc)
+        if fxc is not None: fxc = cupy.asarray(fxc)
+
+        assert isinstance(verbose, int)
+        log = logger.new_logger(mol, verbose)
+        xctype = ni._xc_type(xc_code)
+        opt = getattr(ni, 'gdftopt', None)
+
+        _sorted_mol = opt.mol
+        nao = mol.nao
+        nset = mo1.shape[0]
+        vmat = cupy.zeros((nset, nao, nao))
+
+        if xctype == 'LDA':
+            ao_deriv = 0
+        else:
+            ao_deriv = 1
+
+        ngrids_glob = grids.coords.shape[0]
+        ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices
+        grid_start = device_id * ngrids_per_device
+        grid_end = (device_id + 1) * ngrids_per_device
+
+        p0 = p1 = grid_start
+        t1 = t0 = log.init_timer()
+        for ao, mask, weights, coords in ni.block_loop(_sorted_mol, grids, nao, ao_deriv,
+                                                       max_memory=None, blksize=None,
+                                                       grid_range=(grid_start, grid_end)):
+            p0, p1 = p1, p1+len(weights)
+            occ_coeff_mask = mocc[mask]
+            rho1 = numint.eval_rho4(_sorted_mol, ao, 2.0*occ_coeff_mask, mo1[:,mask],
+                                    xctype=xctype, hermi=hermi)
+            t1 = log.timer_debug2('eval rho', *t1)
+
+            # precompute fxc_w
+            if xctype == 'LDA':
+                fxc_w = fxc[0,0,p0:p1] * weights
+                wv = rho1 * fxc_w
+            else:
+                fxc_w = fxc[:,:,p0:p1] * weights
+                wv = contract('axg,xyg->ayg', rho1, fxc_w)
+
+            for i in range(nset):
+                if xctype == 'LDA':
+                    vmat_tmp = ao.dot(numint._scale_ao(ao, wv[i]).T)
+                elif xctype == 'GGA':
+                    wv[i,0] *= .5
+                    aow = numint._scale_ao(ao, wv[i])
+                    vmat_tmp = aow.dot(ao[0].T)
+                elif xctype == 'NLC':
+                    raise NotImplementedError('NLC')
+                else:
+                    wv[i,0] *= .5
+                    wv[i,4] *= .5
+                    vmat_tmp = ao[0].dot(numint._scale_ao(ao[:4], wv[i,:4]).T)
+                    vmat_tmp+= numint._tau_dot(ao, ao, wv[i,4])
+                add_sparse(vmat[i], vmat_tmp, mask)
+
+            t1 = log.timer_debug2('integration', *t1)
+            ao = rho1 = None
+        t0 = log.timer_debug1('vxc', *t0)
+        if xctype != 'LDA':
+            transpose_sum(vmat)
+        vmat = jk._ao2mo(vmat, mocc, mo_coeff)
+    return vmat
+
+def nr_rks_fxc_mo(ni, mol, grids, xc_code, dm0=None, dms=None, mo_coeff=None, relativity=0, hermi=0,
+               rho0=None, vxc=None, fxc=None, max_memory=2000, verbose=None):
+    log = logger.new_logger(mol, verbose)
+    t0 = log.init_timer()
+    if fxc is None:
+        raise RuntimeError('fxc was not initialized')
+    #xctype = ni._xc_type(xc_code)
+    opt = getattr(ni, 'gdftopt', None)
+    if opt is None or mol not in [opt.mol, opt._sorted_mol]:
+        ni.build(mol, grids.coords)
+        opt = ni.gdftopt
+
+    nao = mol.nao
+    dms = cupy.asarray(dms)
+    dm_shape = dms.shape
+    # AO basis -> gdftopt AO basis
+    with_mocc = hasattr(dms, 'mo1')
+    mo1 = mocc = None
+    if with_mocc:
+        mo1 = opt.sort_orbitals(dms.mo1, axis=[1])
+        mocc = opt.sort_orbitals(dms.occ_coeff, axis=[0])
+    mo_coeff = opt.sort_orbitals(mo_coeff, axis=[0])
+    dms = opt.sort_orbitals(dms.reshape(-1,nao,nao), axis=[1,2])
+    
+    futures = []
+    cupy.cuda.get_current_stream().synchronize()
+    with ThreadPoolExecutor(max_workers=_num_devices) as executor:
+        for device_id in range(_num_devices):
+            future = executor.submit(
+                _nr_rks_fxc_mo_task,
+                ni, mol, grids, xc_code, fxc, mo_coeff, mo1, mocc,
+                verbose=log.verbose, hermi=hermi, device_id=device_id)
+            futures.append(future)
+    dms = None
+    vmat_dist = []
+    for future in futures:
+        vmat_dist.append(future.result())
+    vmat = reduce_to_device(vmat_dist, inplace=True)
+    #vmat = opt.unsort_orbitals(vmat, axis=[1,2])
+    #if xctype != 'LDA':
+    #    transpose_sum(vmat)
+
+    if len(dm_shape) == 2:
+        vmat = vmat[0]
+    t0 = log.timer_debug1('nr_rks_fxc', *t0)
+    return cupy.asarray(vmat)
+
 def get_veff_resp_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1, omega=None):
     mol = hessobj.mol
     mf = hessobj.base
@@ -728,10 +846,10 @@ def get_veff_resp_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1, omega=None):
     # TODO: evaluate v1 in MO
     rho0, vxc, fxc = ni.cache_xc_kernel(mol, grids, mf.xc,
                                         mo_coeff, mo_occ, 0)
-    v1 = ni.nr_rks_fxc(mol, grids, mf.xc, None, dms, 0, hermi,
+    v1 = nr_rks_fxc_mo(ni, mol, grids, mf.xc, None, dms, mo_coeff, 0, hermi,
                                     rho0, vxc, fxc, max_memory=None)
-    v1 = jk._ao2mo(v1, mocc, mo_coeff).reshape(-1,nmo*nocc)
-
+    v1 = v1.reshape(-1,nmo*nocc)
+    
     if hybrid:
         vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, hermi=1)
         vk *= hyb
diff --git a/gpu4pyscf/lib/memcpy.py b/gpu4pyscf/lib/memcpy.py
index 0f4faa6b..19b19e41 100644
--- a/gpu4pyscf/lib/memcpy.py
+++ b/gpu4pyscf/lib/memcpy.py
@@ -47,7 +47,9 @@ def _copy_array(src_view, dst_view):
     ''' Copy data from cupy/numpy array to another cupy/numpy array
     Check memory layout, then copy memory chunks by cupy.cuda.runtime.memcpy
     '''
-
+    if src_view.nbytes == 0:
+        return dst_view
+    
     shape = src_view.shape
     itemsize = src_view.itemsize
     strides_src = [stride // itemsize for stride in src_view.strides]
@@ -75,6 +77,12 @@ def _copy_array(src_view, dst_view):
     else:
         raise NotImplementedError
     
+
+    if len(chunk_shape) == 0:
+        print('here')
+        print(src_view.nbytes, dst_view.nbytes)
+        print(shape, strides_src, strides_dst)
+        
     assert len(chunk_shape) > 0
 
     # Transfer data chunk-by-chunk
diff --git a/gpu4pyscf/tests/test_benchmark_rks.py b/gpu4pyscf/tests/test_benchmark_rks.py
index 321fd56f..e709eafc 100644
--- a/gpu4pyscf/tests/test_benchmark_rks.py
+++ b/gpu4pyscf/tests/test_benchmark_rks.py
@@ -32,7 +32,7 @@
 
 current_folder = os.path.dirname(os.path.abspath(__file__))
 small_mol = os.path.join(current_folder, '020_Vitamin_C.xyz')
-median_mol = os.path.join(current_folder, '057_Tamoxifen.xyz')
+medium_mol = os.path.join(current_folder, '057_Tamoxifen.xyz')
 large_mol = os.path.join(current_folder, '095_Azadirachtin.xyz')
 
 def run_rb3lyp(atom, basis, with_df, with_solvent, disp=None):
@@ -117,39 +117,39 @@ def test_rb3lyp_hessian(benchmark):
     print('testing rb3lyp hessian')
     assert np.isclose(np.linalg.norm(h), 3.7588443634477833, atol=1e-4)
 
-# median molecule
+# medium molecule
 @pytest.mark.benchmark
-def test_df_rb3lyp_median(benchmark):
-    e = benchmark(run_rb3lyp, median_mol, 'def2-tzvpp', True, False)
-    print('testing df rb3lyp median')
+def test_df_rb3lyp_medium(benchmark):
+    e = benchmark(run_rb3lyp, medium_mol, 'def2-tzvpp', True, False)
+    print('testing df rb3lyp medium')
     assert np.isclose(np.linalg.norm(e), 1138.371390377773, atol=1e-7)
 @pytest.mark.benchmark
-def test_df_rb3lyp_grad_median(benchmark):
-    g = benchmark(run_rb3lyp_grad, median_mol, 'def2-tzvpp', True, False)
-    print('testing df rb3lyp grad median')
+def test_df_rb3lyp_grad_medium(benchmark):
+    g = benchmark(run_rb3lyp_grad, medium_mol, 'def2-tzvpp', True, False)
+    print('testing df rb3lyp grad medium')
     assert np.isclose(np.linalg.norm(g), 0.26010545073602614, atol=1e-4)
 @pytest.mark.benchmark
-def test_df_rb3lyp_hessian_median(benchmark):
-    h = benchmark(run_rb3lyp_hessian, median_mol, 'def2-tzvpp', True, False)
-    print('testing df rb3lyp hessian median')
+def test_df_rb3lyp_hessian_medium(benchmark):
+    h = benchmark(run_rb3lyp_hessian, medium_mol, 'def2-tzvpp', True, False)
+    print('testing df rb3lyp hessian medium')
     assert np.isclose(np.linalg.norm(h), 6.32514169232998, atol=1e-4)
 
 @pytest.mark.benchmark
-def test_rb3lyp_median(benchmark):
-    e = benchmark(run_rb3lyp, median_mol, 'def2-tzvpp', False, False)
-    print('testing rb3lyp median')
+def test_rb3lyp_medium(benchmark):
+    e = benchmark(run_rb3lyp, medium_mol, 'def2-tzvpp', False, False)
+    print('testing rb3lyp medium')
     assert np.isclose(np.linalg.norm(e), 1138.3710752128077, atol=1e-7)
 @pytest.mark.benchmark
-def test_rb3lyp_grad_median(benchmark):
-    g = benchmark(run_rb3lyp_grad, median_mol, 'def2-tzvpp', False, False)
-    print('testing rb3lyp grad median')
+def test_rb3lyp_grad_medium(benchmark):
+    g = benchmark(run_rb3lyp_grad, medium_mol, 'def2-tzvpp', False, False)
+    print('testing rb3lyp grad medium')
     assert np.isclose(np.linalg.norm(g), 0.2601443836937988, atol=1e-5)
 
 @pytest.mark.slow
 @pytest.mark.benchmark
-def test_rb3lyp_hessian_median(benchmark):
-    h = benchmark(run_rb3lyp_hessian, median_mol, 'def2-tzvpp', False, False)
-    print('testing rb3lyp hessian median')
+def test_rb3lyp_hessian_medium(benchmark):
+    h = benchmark(run_rb3lyp_hessian, medium_mol, 'def2-tzvpp', False, False)
+    print('testing rb3lyp hessian medium')
     assert np.isclose(np.linalg.norm(h), 6.312714778020796, atol=1e-4)
 
 # large molecule

From 904763e04ca54a5cd651deec8df6a6cf6ada47ed Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Wed, 1 Jan 2025 01:48:55 +0000
Subject: [PATCH 30/49] cupy copy -> copy_array

---
 .../cupy_helper/benchmark_memory_copy.py      | 71 +++++++++++++------
 gpu4pyscf/df/hessian/rhf.py                   |  7 +-
 gpu4pyscf/df/int3c2e.py                       | 12 ++--
 gpu4pyscf/lib/cupy_helper.py                  | 21 +++---
 4 files changed, 66 insertions(+), 45 deletions(-)

diff --git a/benchmarks/cupy_helper/benchmark_memory_copy.py b/benchmarks/cupy_helper/benchmark_memory_copy.py
index c658674f..5e36ffe5 100644
--- a/benchmarks/cupy_helper/benchmark_memory_copy.py
+++ b/benchmarks/cupy_helper/benchmark_memory_copy.py
@@ -37,52 +37,54 @@
 
 print("Host View Shape:", host_view.shape)
 print("Device View Shape:", device_view.shape)
-'''
+
 print("------ Benchmark device to host transfer ----------")
 size = host_view.nbytes
-perf_custom = profiler.benchmark(copy_array, (host_view, device_view), n_repeat=100, n_warmup=3)
+perf_custom = profiler.benchmark(copy_array, (host_view, device_view), n_repeat=20, n_warmup=3)
 t_kernel = perf_custom.gpu_times.mean()
 bandwidth = size / t_kernel / 1e9
-print('using custom function', t_kernel)
-print(f"Effective PCIe Bandwidth: {bandwidth:.2f} GB/s")
+print('Using custom function', t_kernel)
+print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")
 
 def cupy_copy(c, out):
     out[:] = cp.asarray(c)
     return out
-perf_cupy = profiler.benchmark(cupy_copy, (host_view, device_view), n_repeat=100, n_warmup=3)
+perf_cupy = profiler.benchmark(cupy_copy, (host_view, device_view), n_repeat=20, n_warmup=3)
 t_kernel = perf_cupy.gpu_times.mean()
 bandwidth = size / t_kernel / 1e9
-print('using cupy function', t_kernel)
-print(f"Effective PCIe Bandwidth: {bandwidth:.2f} GB/s")
+print('Using cupy function', t_kernel)
+print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")
 
 print("------- Benchmark host to device transfer ---------")
 size = host_view.nbytes
-perf_custom = profiler.benchmark(copy_array, (device_view, host_view), n_repeat=100, n_warmup=3)
+perf_custom = profiler.benchmark(copy_array, (device_view, host_view), n_repeat=20, n_warmup=3)
 t_kernel = perf_custom.gpu_times.mean()
 bandwidth = size / t_kernel / 1e9
-print('using custom function', t_kernel)
-print(f"Effective PCIe Bandwidth: {bandwidth:.2f} GB/s")
+print('Using custom function', t_kernel)
+print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")
 
 def cupy_copy(c, out):
     out[:] = c.get()
     return out
-perf_cupy = profiler.benchmark(cupy_copy, (device_view, host_view), n_repeat=100, n_warmup=3)
+perf_cupy = profiler.benchmark(cupy_copy, (device_view, host_view), n_repeat=20, n_warmup=3)
 t_kernel = perf_cupy.gpu_times.mean()
 bandwidth = size / t_kernel / 1e9
-print('using cupy function', t_kernel)
-print(f"Effective PCIe Bandwidth: {bandwidth:.2f} GB/s")
-'''
+print('Using cupy function', t_kernel)
+print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")
+
+print("-------- Benchmark device to device transfer (non-contiguous) ---------")
+
 with cp.cuda.Device(0):
     a = cp.random.rand(512,512,512)
-    device0_view = a[:,128:, 128:]
+    device0_view = a[:,128:]
 with cp.cuda.Device(1):
     b = cp.random.rand(512,512,512)
-    device1_view = b[:,128:, 128:]
-perf_cupy = profiler.benchmark(copy_array, (device0_view, device1_view), n_repeat=100, n_warmup=3)
+    device1_view = b[:,128:]
+perf_cupy = profiler.benchmark(copy_array, (device0_view, device1_view), n_repeat=20, n_warmup=3)
 t_kernel = perf_cupy.gpu_times.mean()
 bandwidth = device0_view.nbytes / t_kernel / 1e9
-print('using custom function', t_kernel)
-print(f"Effective PCIe Bandwidth: {bandwidth:.2f} GB/s")
+print('Using custom function', t_kernel)
+print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")
 
 assert np.linalg.norm(device0_view.get() - device1_view.get()) < 1e-10
 
@@ -90,8 +92,33 @@ def cupy_copy(c, out):
     with cp.cuda.Device(out.device):
         out[:] = cp.asarray(c.get())
     return out
-perf_cupy = profiler.benchmark(cupy_copy, (device0_view, device1_view), n_repeat=100, n_warmup=3)
+perf_cupy = profiler.benchmark(cupy_copy, (device0_view, device1_view), n_repeat=20, n_warmup=3)
+t_kernel = perf_cupy.gpu_times.mean()
+bandwidth = device0_view.nbytes / t_kernel / 1e9
+print('Using cupy function', t_kernel)
+print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")
+
+print("-------- Benchmark device to device transfer (contiguous) ---------")
+perf_cupy = profiler.benchmark(copy_array, (a, b), n_repeat=20, n_warmup=3)
 t_kernel = perf_cupy.gpu_times.mean()
 bandwidth = device0_view.nbytes / t_kernel / 1e9
-print('using cupy function', t_kernel)
-print(f"Effective PCIe Bandwidth: {bandwidth:.2f} GB/s")
+print('Using custom function', t_kernel)
+print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")
+
+def cupy_copy_contiguous(a, b):
+    b[:] = a
+perf_cupy = profiler.benchmark(cupy_copy, (a, b), n_repeat=20, n_warmup=3)
+t_kernel = perf_cupy.gpu_times.mean()
+bandwidth = device0_view.nbytes / t_kernel / 1e9
+print('Cupy copy contiguous array', t_kernel)
+print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")
+
+def cupy_set_contiguous(a, b):
+    b.set(a)
+perf_cupy = profiler.benchmark(cupy_copy, (a, b), n_repeat=20, n_warmup=3)
+t_kernel = perf_cupy.gpu_times.mean()
+bandwidth = device0_view.nbytes / t_kernel / 1e9
+print('Cupy set contiguous array', t_kernel)
+print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")
+
+assert np.linalg.norm(a.get() - b.get()) < 1e-10
diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py
index a9023dc6..a29a50bd 100644
--- a/gpu4pyscf/df/hessian/rhf.py
+++ b/gpu4pyscf/df/hessian/rhf.py
@@ -145,11 +145,11 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls
 
     # int3c_ip2 contributions
     wj_ip2, wk_ip2_P__ = int3c2e.get_int3c2e_ip2_wjk(intopt, dm0_tag, omega=omega)
-    t1 = log.timer_debug1('interdeidate variables with int3c2e_ip2', *t1)
+    t1 = log.timer_debug1('interdediate variables with int3c2e_ip2', *t1)
 
     #  int3c_ip1 contributions
     wj1_P, wk1_Pko = int3c2e.get_int3c2e_ip1_wjk(intopt, dm0_tag, omega=omega)
-    t1 = log.timer_debug1('interdeidate variables with int3c2e_ip1', *t1)
+    t1 = log.timer_debug1('interdediate variables with int3c2e_ip1', *t1)
 
     #rhoj1_P = contract('pq,pix->qix', int2c_inv, wj1_P)
     if with_j:
@@ -332,6 +332,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls
     # pi,qi,i->pq
     dme0 = cupy.dot(mocc, (mocc * mo_energy[mo_occ>0] * 2).T)
     de_hcore = rhf_hess._e_hcore_generator(hessobj, dm0)
+    t1 = log.timer_debug1('hcore generate', *t1)
 
     # ------------------------------------
     #      overlap matrix contributions
@@ -396,7 +397,6 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls
                     _ek = cupy.sum(hk_aux_aux[p0:p1,q0:q1], axis=[0,1])
                     ek[i0,j0] += _ek * .5
                     ek[j0,i0] += _ek.T * .5
-    
     for i0, ia in enumerate(atmlst):
         for j0 in range(i0):
             e1[j0,i0] = e1[i0,j0].T
@@ -404,7 +404,6 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls
                 ej[j0,i0] = ej[i0,j0].T
             if with_k:
                 ek[j0,i0] = ek[i0,j0].T
-
     t1 = log.timer_debug1('hcore contribution', *t1)
 
     aux2atom = int3c2e.get_aux2atom(intopt, auxslices)
diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py
index 54432c66..f4d3bbab 100644
--- a/gpu4pyscf/df/int3c2e.py
+++ b/gpu4pyscf/df/int3c2e.py
@@ -771,7 +771,7 @@ def get_j_int3c2e_pass2(intopt, rhoj, stream=None):
     vj = vj + vj.T
     return vj
 
-def _int3c2e_jk_task(intopt, task_list, dm0, mocc, device_id=0, omega=None):
+def _int3c2e_jk_task(intopt, task_k_list, dm0, mocc, device_id=0, omega=None):
     with cupy.cuda.Device(device_id), _streams[device_id]:
         log = logger.new_logger(intopt.mol, intopt.mol.verbose)
         t0 = log.init_timer()
@@ -781,7 +781,7 @@ def _int3c2e_jk_task(intopt, task_list, dm0, mocc, device_id=0, omega=None):
         nocc = mocc.shape[1]
         rhoj = cupy.zeros([naux])
         rhok = cupy.zeros([naux,nocc,nocc])
-        for cp_kl_id in task_list:
+        for cp_kl_id in task_k_list:
             k0 = intopt.aux_ao_loc[cp_kl_id]
             k1 = intopt.aux_ao_loc[cp_kl_id+1]
             rhoj_tmp = cupy.zeros([k1-k0], order='C')
@@ -803,7 +803,7 @@ def _int3c2e_jk_task(intopt, task_list, dm0, mocc, device_id=0, omega=None):
                 rhoj_tmp += contract('pji,ij->p', int3c_blk, dm0[i0:i1,j0:j1])
                 ints_o = contract('pji,jo->poi', int3c_blk, mocc[j0:j1])
                 rhok_tmp += contract('poi,ir->por', ints_o, mocc[i0:i1])
-
+                int3c_blk = ints_o = None
             if intopt.aosym:
                 rhoj[k0:k1] = 2.0 * rhoj_tmp
                 rhok[k0:k1] = rhok_tmp + rhok_tmp.transpose([0,2,1])
@@ -853,7 +853,7 @@ def _split_tasks(loads, ngroups):
     if ngroups == 1:
         return [range(len(loads))]
     groups = [[] for _ in range(ngroups)]
-    sums = [0] * 4
+    sums = [0] * ngroups
     for i, load in enumerate(loads):
         min_index = sums.index(min(sums))
         groups[min_index].append(i)
@@ -965,7 +965,7 @@ def get_int3c2e_ip1_vjk(intopt, rhoj, rhok, dm0_tag, aoslices, with_j=True,
     return vj1_buf, vk1_buf, vj1, vk1
 
 
-def _int3c2e_ip2_vjk_task(intopt, task_list, rhoj, rhok, dm0, orbo,
+def _int3c2e_ip2_vjk_task(intopt, task_k_list, rhoj, rhok, dm0, orbo,
                           device_id=0, with_j=True, with_k=True, omega=None):
     natom = intopt.mol.natm
     nao = intopt.mol.nao
@@ -985,7 +985,7 @@ def _int3c2e_ip2_vjk_task(intopt, task_list, rhoj, rhok, dm0, orbo,
             vk1 = cupy.zeros([natom,3,nao,nocc])
         aux_ao_loc = intopt.aux_ao_loc
         ncp_ij = len(intopt.log_qs)
-        for cp_k in task_list:
+        for cp_k in task_k_list:
             task_list = [(cp_k, cp_ij) for cp_ij in range(ncp_ij)]
             k0, k1 = aux_ao_loc[cp_k], aux_ao_loc[cp_k+1]
             if with_j:
diff --git a/gpu4pyscf/lib/cupy_helper.py b/gpu4pyscf/lib/cupy_helper.py
index 41e11307..898a2846 100644
--- a/gpu4pyscf/lib/cupy_helper.py
+++ b/gpu4pyscf/lib/cupy_helper.py
@@ -88,22 +88,16 @@ def p2p_transfer(a, b):
         a[:] = b
     elif _p2p_access:
         a[:] = b
+        '''
     elif a.strides == b.strides and a.flags.c_contiguous and a.dtype == b.dtype:
         # cupy supports a direct copy from different devices without p2p. See also
         # https://github.com/cupy/cupy/blob/v13.3.0/cupy/_core/_routines_indexing.pyx#L48
         # https://github.com/cupy/cupy/blob/v13.3.0/cupy/_core/_routines_indexing.pyx#L1015
         a[:] = b
+        '''
     else:
-        #copy_array(b, a)
-        with cupy.cuda.Device(a.device):
-            # TODO: reduce memory copy, a can be non-contiguous array
-            #a[:] = cupy.asarray(b.get())
-            copy_array(b, a)
-            if np.linalg.norm(a.get() - b.get()) > 1e-3:
-                print(a[:5], a.device, a.strides, a.shape)
-                print(b[:5], b.device, b.strides, b.shape)
-                print(a.shape, b.shape)
-                exit()
+        copy_array(b, a)
+
 def concatenate(array_list):
     ''' Concatenate axis=0 only
     '''
@@ -118,7 +112,8 @@ def concatenate(array_list):
         p0 = p1 = 0
         for a in array_list_cpu:
             p1 = p0 + a.shape[0]
-            out[p0:p1].set(a)
+            #out[p0:p1].set(a)
+            copy_array(a, out[p0:p1])
             p0 = p1
         return out
 
@@ -153,8 +148,8 @@ def reduce_to_device(array_list, inplace=False):
         matrix = matrix.reshape(-1)
         blksize = 1024*1024*128 # 1GB
         for p0, p1 in lib.prange(0,len(matrix), blksize):
-            result[p0:p1] += cupy.asarray(matrix[p0:p1])
-    
+            result[p0:p1] += copy_array(matrix[p0:p1])#cupy.asarray(matrix[p0:p1]) 
+            #result[p0:p1] += cupy.asarray(matrix[p0:p1]) 
     return result.reshape(out_shape)
     
 def device2host_2d(a_cpu, a_gpu, stream=None):

From ef2553343fdcc4680c60406882ddf15bbd155d5e Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Thu, 2 Jan 2025 06:09:45 +0000
Subject: [PATCH 31/49] optimize multi-GPU

---
 .../cupy_helper/benchmark_memory_copy.py      |   9 +-
 gpu4pyscf/df/df.py                            |  39 +++---
 gpu4pyscf/df/grad/jk.py                       | 115 ++++++++++++++++-
 gpu4pyscf/df/grad/rhf.py                      |  71 ++--------
 gpu4pyscf/df/grad/uhf.py                      |  48 ++-----
 gpu4pyscf/df/hessian/rhf.py                   |  17 ++-
 gpu4pyscf/df/hessian/uhf.py                   |   5 +-
 gpu4pyscf/df/int3c2e.py                       | 121 +++++++++---------
 gpu4pyscf/lib/cupy_helper.py                  |  13 +-
 gpu4pyscf/tests/test_benchmark_rks.py         |   6 +-
 10 files changed, 248 insertions(+), 196 deletions(-)

diff --git a/benchmarks/cupy_helper/benchmark_memory_copy.py b/benchmarks/cupy_helper/benchmark_memory_copy.py
index 5e36ffe5..d10f97ac 100644
--- a/benchmarks/cupy_helper/benchmark_memory_copy.py
+++ b/benchmarks/cupy_helper/benchmark_memory_copy.py
@@ -107,15 +107,16 @@ def cupy_copy(c, out):
 
 def cupy_copy_contiguous(a, b):
     b[:] = a
-perf_cupy = profiler.benchmark(cupy_copy, (a, b), n_repeat=20, n_warmup=3)
+perf_cupy = profiler.benchmark(cupy_copy_contiguous, (a, b), n_repeat=20, n_warmup=3)
 t_kernel = perf_cupy.gpu_times.mean()
 bandwidth = device0_view.nbytes / t_kernel / 1e9
 print('Cupy copy contiguous array', t_kernel)
 print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")
 
-def cupy_set_contiguous(a, b):
-    b.set(a)
-perf_cupy = profiler.benchmark(cupy_copy, (a, b), n_repeat=20, n_warmup=3)
+def cupy_asarray_contiguous(a, b):
+    with cp.cuda.Device(b.device):
+        b = cp.asarray(a) 
+perf_cupy = profiler.benchmark(cupy_asarray_contiguous, (a, b), n_repeat=20, n_warmup=3)
 t_kernel = perf_cupy.gpu_times.mean()
 bandwidth = device0_view.nbytes / t_kernel / 1e9
 print('Cupy set contiguous array', t_kernel)
diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py
index 67b30c0a..a7f04370 100644
--- a/gpu4pyscf/df/df.py
+++ b/gpu4pyscf/df/df.py
@@ -138,13 +138,12 @@ def get_blksize(self, extra=0, nao=None):
         '''
         if nao is None: nao = self.nao
         mem_avail = get_avail_mem()
-        blksize = int(mem_avail*0.2/8/(nao*nao + extra) / ALIGNED) * ALIGNED
+        blksize = int(mem_avail*0.4/8/(nao*nao + extra) / ALIGNED) * ALIGNED
         blksize = min(blksize, MIN_BLK_SIZE)
         log = logger.new_logger(self.mol, self.mol.verbose)
         device_id = cupy.cuda.Device().id
         log.debug(f"{mem_avail/1e9:.3f} GB memory available on Device {device_id}, block size = {blksize}")
-        if blksize < ALIGNED:
-            raise RuntimeError("Not enough GPU memory")
+        assert blksize > 0
         return blksize
 
     def loop(self, blksize=None, unpack=True):
@@ -227,12 +226,16 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
         log.debug("Saving CDERI on CPU")
 
     _cderi = {}
-    blksize = (naux + _num_devices - 1) // _num_devices
-    for device_id, (p0,p1) in enumerate(lib.prange(0, naux, blksize)):
+    aux_blksize = (naux + _num_devices - 1) // _num_devices
+    aux_blksize = (aux_blksize + ALIGNED - 1) // ALIGNED * ALIGNED
+    for device_id in range(_num_devices):
+        p0 = min(aux_blksize*device_id, naux)
+        p1 = min(aux_blksize*(device_id+1), naux)
+        #for device_id, (p0,p1) in enumerate(lib.prange(0, naux, aux_blksize)):
         if use_gpu_memory:
             with cupy.cuda.Device(device_id), _streams[device_id]:
                 _cderi[device_id] = cupy.empty([p1-p0, npairs])
-            log.debug(f"CDERI size {_cderi[device_id].nbytes/GB:.3f} on Device {device_id}")
+            log.debug(f"CDERI size {_cderi[device_id].nbytes/GB:.3f} GB on Device {device_id}")
         else:
             mem = cupy.cuda.alloc_pinned_memory((p1-p0) * npairs * 8)
             cderi_blk = np.ndarray([p1-p0, npairs], dtype=np.float64, order='C', buffer=mem)
@@ -254,7 +257,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
     with ThreadPoolExecutor(max_workers=_num_devices) as executor:
         for device_id in range(_num_devices):
             task_list = task_list_per_device[device_id]
-            future = executor.submit(_cderi_task, intopt, cd_low_f, task_list, _cderi,
+            future = executor.submit(_cderi_task, intopt, cd_low_f, task_list, _cderi, aux_blksize,
                                      omega=omega, sr_only=sr_only, device_id=device_id)
             futures.append(future)
 
@@ -266,7 +269,8 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
 
     return _cderi
 
-def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, device_id=0):
+def _cderi_task(intopt, cd_low, task_list, _cderi, aux_blksize, 
+                omega=None, sr_only=False, device_id=0):
     ''' Execute CDERI tasks on one device
     '''
     nq = len(intopt.log_qs)
@@ -275,7 +279,6 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, de
     naoaux = cd_low.shape[0]
     npairs = [len(intopt.ao_pairs_row[cp_ij]) for cp_ij in range(len(intopt.log_qs))]
     pairs_loc = np.append(0, np.cumsum(npairs))
-    blksize = (naux + _num_devices - 1) // _num_devices
     with cupy.cuda.Device(device_id), _streams[device_id]:
         assert isinstance(mol.verbose, int)
         log = logger.new_logger(mol, mol.verbose)
@@ -346,16 +349,18 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, de
             ij0 = pairs_loc[cp_ij_id]
             ij1 = pairs_loc[cp_ij_id+1]
             if isinstance(_cderi[0], np.ndarray):
-                for slice_id, (p0,p1) in enumerate(lib.prange(0, naux, blksize)):
-                    #for i in range(p0,p1):
-                    #    cderi_block[i].get(out=_cderi[slice_id][i-p0,ij0:ij1])
+                for slice_id, (p0,p1) in enumerate(lib.prange(0, naux, aux_blksize)):
                     tmp = cupy.array(cderi_block[p0:p1], order='C', copy=True)
                     copy_array(tmp, _cderi[slice_id][:p1-p0,ij0:ij1])
-            else:
-                # Copy data to other Devices
-                for slice_id, (p0,p1) in enumerate(lib.prange(0, naux, blksize)):
-                    #_cderi[slice_id][:,ij0:ij1] = cderi_block[p0:p1]
+            elif _num_devices > 1:
+                # Multi-GPU case, copy data to other Devices
+                for dev_id, (p0,p1) in enumerate(lib.prange(0, naux, aux_blksize)):
+                    # Making a copy for contiguous data transfer
                     tmp = cupy.array(cderi_block[p0:p1], order='C', copy=True)
-                    p2p_transfer(_cderi[slice_id][:,ij0:ij1], tmp)
+                    with cupy.cuda.Device(dev_id):
+                        tmp = copy_array(tmp)
+                        _cderi[dev_id][:,ij0:ij1] = tmp
+            else:
+                _cderi[slice_id][:,ij0:ij1] = cderi_block[p0:p1]
             t1 = log.timer_debug1(f'transfer data for {cp_ij_id} / {nq} on Device {device_id}', *t1)    
     return
diff --git a/gpu4pyscf/df/grad/jk.py b/gpu4pyscf/df/grad/jk.py
index 4139726e..2bbf9d9e 100644
--- a/gpu4pyscf/df/grad/jk.py
+++ b/gpu4pyscf/df/grad/jk.py
@@ -13,8 +13,10 @@
 # limitations under the License.
 
 from concurrent.futures import ThreadPoolExecutor
+import numpy as np
 import cupy
-from gpu4pyscf.lib.cupy_helper import contract, concatenate
+from gpu4pyscf.df.int3c2e import get_int3c2e_ip_jk, VHFOpt, _split_tasks
+from gpu4pyscf.lib.cupy_helper import contract, concatenate, reduce_to_device
 from gpu4pyscf.lib import logger
 from gpu4pyscf.__config__ import _streams, _num_devices
 
@@ -54,7 +56,7 @@ def _jk_task(with_df, dm, orbo, with_j=True, with_k=True, device_id=0):
         t0 = log.timer_debug1(f'rhoj and rhok on Device {device_id}', *t0)
     return rhoj, rhok
 
-def get_rhoj_rhok(with_df, dm, orbo, with_j=True, with_k=True):
+def get_rhojk(with_df, dm, orbo, with_j=True, with_k=True):
     ''' Calculate rhoj and rhok on Multi-GPU system
     '''
     futures = []
@@ -80,3 +82,112 @@ def get_rhoj_rhok(with_df, dm, orbo, with_j=True, with_k=True):
         rhok = concatenate(rhok_total)
 
     return rhoj, rhok
+
+def _jk_ip_task(intopt, rhoj_cart, dm_cart, rhok_cart, orbo_cart, task_list,
+                with_j=True, with_k=True, device_id=0, omega=None):
+    mol = intopt.mol
+    with cupy.cuda.Device(device_id), _streams[device_id]:
+        log = logger.new_logger(mol, mol.verbose)
+        t0 = (logger.process_clock(), logger.perf_counter())
+
+        orbo_cart = cupy.asarray(orbo_cart)
+        cart_aux_loc = intopt.cart_aux_loc
+        nao_cart = dm_cart.shape[0]
+        naux_cart = intopt._sorted_auxmol.nao
+        vj = vk = vjaux = vkaux = None
+        if with_j:
+            rhoj_cart = cupy.asarray(rhoj_cart)
+            dm_cart = cupy.asarray(dm_cart)
+            vj = cupy.zeros((3,nao_cart), order='C')
+            vjaux = cupy.zeros((3,naux_cart))
+        if with_k:
+            rhok_cart = cupy.asarray(rhok_cart)
+            vk = cupy.zeros((3,nao_cart), order='C')
+            vkaux = cupy.zeros((3,naux_cart))
+        
+        for cp_kl_id in task_list:
+            k0, k1 = cart_aux_loc[cp_kl_id], cart_aux_loc[cp_kl_id+1]
+            rhoj_tmp = rhok_tmp = None
+            if with_j:
+                rhoj_tmp = rhoj_cart[k0:k1]
+            if with_k:
+                rhok_tmp = contract('por,ir->pio', rhok_cart[k0:k1], orbo_cart)
+                rhok_tmp = contract('pio,jo->pji', rhok_tmp, orbo_cart)
+            '''
+            if(rhoj_tmp.flags['C_CONTIGUOUS'] == False):
+                rhoj_tmp = rhoj_tmp.astype(cupy.float64, order='C')
+
+            if(rhok_tmp.flags['C_CONTIGUOUS'] == False):
+                rhok_tmp = rhok_tmp.astype(cupy.float64, order='C')
+            '''
+            '''
+            # outcore implementation
+            buf = int3c2e.get_int3c2e_ip_slice(intopt, cp_kl_id, 1)
+            size = 3*(k1-k0)*nao_cart*nao_cart
+            int3c_ip = buf[:size].reshape([3,k1-k0,nao_cart,nao_cart], order='C')
+            rhoj_tmp0 = contract('xpji,ij->xip', int3c_ip, dm_cart)
+            vj_outcore = contract('xip,p->xi', rhoj_tmp0, rhoj_cart[k0:k1])
+            vk_outcore = contract('pji,xpji->xi', rhok_tmp, int3c_ip)
+
+            buf = int3c2e.get_int3c2e_ip_slice(intopt, cp_kl_id, 2)
+            int3c_ip = buf[:size].reshape([3,k1-k0,nao_cart,nao_cart], order='C')
+            rhoj_tmp0 = contract('xpji,ji->xp', int3c_ip, dm_cart)
+            vjaux_outcore = contract('xp,p->xp', rhoj_tmp0, rhoj_cart[k0:k1])
+            vkaux_outcore = contract('xpji,pji->xp', int3c_ip, rhok_tmp)
+            '''
+            vj_tmp, vk_tmp = get_int3c2e_ip_jk(intopt, cp_kl_id, 'ip1', rhoj_tmp, rhok_tmp, dm_cart, omega=omega)
+            if with_j: vj += vj_tmp
+            if with_k: vk += vk_tmp
+            vj_tmp, vk_tmp = get_int3c2e_ip_jk(intopt, cp_kl_id, 'ip2', rhoj_tmp, rhok_tmp, dm_cart, omega=omega)
+            if with_j: vjaux[:, k0:k1] = vj_tmp
+            if with_k: vkaux[:, k0:k1] = vk_tmp
+
+            rhoj_tmp = rhok_tmp = vj_tmp = vk_tmp = None
+            t0 = log.timer_debug1(f'calculate {cp_kl_id:3d} / {len(intopt.aux_log_qs):3d}, {k1-k0:3d} slices', *t0)
+    return vj, vk, vjaux, vkaux
+
+def get_grad_vjk(with_df, mol, auxmol, rhoj_cart, dm_cart, rhok_cart, orbo_cart, 
+                 with_j=True, with_k=True, omega=None):
+    '''
+    Calculate vj    = (i'j|L)(L|kl)(ij)(kl), vk    = (i'j|L)(L|kl)(ik)(jl)
+              vjaux = (ij|L')(L|kl)(ij)(kl), vkaux = (ij|L')(L|kl)(ik)(jl)
+    '''
+    nao_cart = dm_cart.shape[0]
+    block_size = with_df.get_blksize(nao=nao_cart)
+
+    intopt = VHFOpt(mol, auxmol, 'int2e')
+    intopt.build(1e-14, diag_block_with_triu=True, aosym=False,
+                 group_size_aux=block_size, verbose=0)#, group_size=block_size)
+
+    aux_ao_loc = np.array(intopt.aux_ao_loc)
+    loads = aux_ao_loc[1:] - aux_ao_loc[:-1]
+    task_list = _split_tasks(loads, _num_devices)
+
+    futures = []
+    cupy.cuda.get_current_stream().synchronize()
+    with ThreadPoolExecutor(max_workers=_num_devices) as executor:
+        for device_id in range(_num_devices):
+            future = executor.submit(
+                _jk_ip_task, intopt, rhoj_cart, dm_cart, rhok_cart, orbo_cart, task_list[device_id],
+                with_j=with_j, with_k=with_k, device_id=device_id, omega=omega)
+            futures.append(future)
+
+    rhoj_total = []
+    rhok_total = []
+    vjaux_total = []
+    vkaux_total = []
+    for future in futures:
+        rhoj, rhok, vjaux, vkaux = future.result()
+        rhoj_total.append(rhoj)
+        rhok_total.append(rhok)
+        vjaux_total.append(vjaux)
+        vkaux_total.append(vkaux)
+
+    rhoj = rhok = vjaux = vkaux = None
+    if with_j:
+        rhoj = reduce_to_device(rhoj_total)
+        vjaux = reduce_to_device(vjaux_total)
+    if with_k:
+        rhok = reduce_to_device(rhok_total)
+        vkaux = reduce_to_device(vkaux_total)
+    return rhoj, rhok, vjaux, vkaux
diff --git a/gpu4pyscf/df/grad/rhf.py b/gpu4pyscf/df/grad/rhf.py
index 7c1c901a..ea0537ed 100644
--- a/gpu4pyscf/df/grad/rhf.py
+++ b/gpu4pyscf/df/grad/rhf.py
@@ -22,7 +22,7 @@
 from gpu4pyscf.grad import rhf as rhf_grad
 from gpu4pyscf import __config__
 from gpu4pyscf.lib import logger
-from gpu4pyscf.df.grad.jk import get_rhoj_rhok
+from gpu4pyscf.df.grad.jk import get_rhojk, get_grad_vjk
 
 LINEAR_DEP_THRESHOLD = df.LINEAR_DEP_THR
 MIN_BLK_SIZE = getattr(__config__, 'min_ao_blksize', 128)
@@ -62,7 +62,6 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
     # extended to any 1-particle density matrix
 
     if(dm0 is None): dm0 = mf_grad.base.make_rdm1()
-    mf = mf_grad.base
     if omega is None:
         with_df = mf_grad.base.with_df
     else:
@@ -92,7 +91,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
     mo_coeff = None
     orbo = intopt.sort_orbitals(orbo, axis=[0])
 
-    rhoj, rhok = get_rhoj_rhok(with_df, dm, orbo, with_j=with_j, with_k=with_k)
+    rhoj, rhok = get_rhojk(with_df, dm, orbo, with_j=with_j, with_k=with_k)
     
     # (d/dX P|Q) contributions
     if omega and omega > 1e-10:
@@ -102,6 +101,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
         int2c_e1 = auxmol.intor('int2c2e_ip1')
     int2c_e1 = cupy.asarray(int2c_e1)
 
+    rhoj_cart = rhok_cart = None
     auxslices = auxmol.aoslice_by_atom()
     aux_cart2sph = intopt.aux_cart2sph
     low = with_df.cd_low
@@ -129,6 +129,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
         elif low.tag == 'cd':
             #rhok = solve_triangular(low_t, rhok, lower=False)
             rhok = solve_triangular(low_t, rhok.reshape(naux, -1), lower=False, overwrite_b=True).reshape(naux, nocc, nocc)
+            rhok = rhok.copy(order='C')
         tmp = contract('pij,qij->pq', rhok, rhok)
         tmp = intopt.unsort_orbitals(tmp, aux_axis=[0,1])
         vkaux = -contract('xpq,pq->xp', int2c_e1, tmp)
@@ -143,12 +144,6 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
     t0 = log.timer_debug1('rhoj and rhok', *t0)
     int2c_e1 = None
 
-    nao_cart = intopt._sorted_mol.nao
-    block_size = with_df.get_blksize(nao=nao_cart)
-
-    intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e')
-    intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False,
-                 group_size_aux=block_size)#, group_size=block_size)
     dm_cart = dm
     orbo_cart = orbo
     if not mol.cart:
@@ -156,63 +151,13 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
         cart2sph = intopt.cart2sph
         orbo_cart = cart2sph @ orbo
         dm_cart = cart2sph @ dm @ cart2sph.T
-
-    dm = orbo = None
-    vj = vk = rhoj_tmp = rhok_tmp = None
-    vjaux = vkaux = None
-
-    naux_cart = intopt._sorted_auxmol.nao
-    if with_j:
-        vj = cupy.zeros((3,nao_cart), order='C')
-        vjaux = cupy.zeros((3,naux_cart))
-    if with_k:
-        vk = cupy.zeros((3,nao_cart), order='C')
-        vkaux = cupy.zeros((3,naux_cart))
-    cupy.get_default_memory_pool().free_all_blocks()
-    t1 = log.init_timer()
-    for cp_kl_id in range(len(intopt.aux_log_qs)):
-        k0, k1 = intopt.cart_aux_loc[cp_kl_id], intopt.cart_aux_loc[cp_kl_id+1]
-        assert k1-k0 <= block_size
-        if with_j:
-            rhoj_tmp = rhoj_cart[k0:k1]
-        if with_k:
-            rhok_tmp = contract('por,ir->pio', rhok_cart[k0:k1], orbo_cart)
-            rhok_tmp = contract('pio,jo->pji', rhok_tmp, orbo_cart)
-        '''
-        if(rhoj_tmp.flags['C_CONTIGUOUS'] == False):
-            rhoj_tmp = rhoj_tmp.astype(cupy.float64, order='C')
-
-        if(rhok_tmp.flags['C_CONTIGUOUS'] == False):
-            rhok_tmp = rhok_tmp.astype(cupy.float64, order='C')
-        '''
-        '''
-        # outcore implementation
-        buf = int3c2e.get_int3c2e_ip_slice(intopt, cp_kl_id, 1)
-        size = 3*(k1-k0)*nao_cart*nao_cart
-        int3c_ip = buf[:size].reshape([3,k1-k0,nao_cart,nao_cart], order='C')
-        rhoj_tmp0 = contract('xpji,ij->xip', int3c_ip, dm_cart)
-        vj_outcore = contract('xip,p->xi', rhoj_tmp0, rhoj_cart[k0:k1])
-        vk_outcore = contract('pji,xpji->xi', rhok_tmp, int3c_ip)
-
-        buf = int3c2e.get_int3c2e_ip_slice(intopt, cp_kl_id, 2)
-        int3c_ip = buf[:size].reshape([3,k1-k0,nao_cart,nao_cart], order='C')
-        rhoj_tmp0 = contract('xpji,ji->xp', int3c_ip, dm_cart)
-        vjaux_outcore = contract('xp,p->xp', rhoj_tmp0, rhoj_cart[k0:k1])
-        vkaux_outcore = contract('xpji,pji->xp', int3c_ip, rhok_tmp)
-        '''
-        vj_tmp, vk_tmp = int3c2e.get_int3c2e_ip_jk(intopt, cp_kl_id, 'ip1', rhoj_tmp, rhok_tmp, dm_cart, omega=omega)
-        if with_j: vj += vj_tmp
-        if with_k: vk += vk_tmp
-        vj_tmp, vk_tmp = int3c2e.get_int3c2e_ip_jk(intopt, cp_kl_id, 'ip2', rhoj_tmp, rhok_tmp, dm_cart, omega=omega)
-        if with_j: vjaux[:, k0:k1] = vj_tmp
-        if with_k: vkaux[:, k0:k1] = vk_tmp
-
-        rhoj_tmp = rhok_tmp = vj_tmp = vk_tmp = None
-        t1 = log.timer_debug1(f'calculate {cp_kl_id:3d} / {len(intopt.aux_log_qs):3d}, {k1-k0:3d} slices', *t1)
     
+    vj, vk, vjaux, vkaux = get_grad_vjk(with_df, mol, auxmol, rhoj_cart, dm_cart, rhok_cart, orbo_cart,
+                                        with_j=with_j, with_k=with_k, omega=omega)
     # NOTE: vj and vk are still in cartesian
     _sorted_mol = intopt._sorted_mol
     natm = _sorted_mol.natm
+    nao_cart = _sorted_mol.nao
     ao2atom = numpy.zeros([nao_cart, natm])
     ao_loc = _sorted_mol.ao_loc
     for ibas, iatm in enumerate(_sorted_mol._bas[:,gto.ATOM_OF]):
@@ -226,6 +171,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
 
     _sorted_auxmol = intopt._sorted_auxmol
     natm = _sorted_auxmol.natm
+    naux_cart = _sorted_auxmol.nao
     aux2atom = numpy.zeros([naux_cart, natm])
     ao_loc = _sorted_auxmol.ao_loc
     for ibas, iatm in enumerate(_sorted_auxmol._bas[:,gto.ATOM_OF]):
@@ -238,7 +184,6 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
     if with_k:
         vkaux_3c = aux2atom.T @ vkaux.T
         vkaux = vkaux_2c - vkaux_3c
-    
     return vj, vk, vjaux, vkaux
 
 
diff --git a/gpu4pyscf/df/grad/uhf.py b/gpu4pyscf/df/grad/uhf.py
index fc8de3be..42107967 100644
--- a/gpu4pyscf/df/grad/uhf.py
+++ b/gpu4pyscf/df/grad/uhf.py
@@ -18,11 +18,11 @@
 from cupyx.scipy.linalg import solve_triangular
 from pyscf import scf, gto
 from gpu4pyscf.df import int3c2e
-from gpu4pyscf.lib.cupy_helper import tag_array, contract, load_library
+from gpu4pyscf.lib.cupy_helper import tag_array, contract
 from gpu4pyscf.grad import uhf as uhf_grad
 from gpu4pyscf import __config__
 from gpu4pyscf.lib import logger
-from gpu4pyscf.df.grad.jk import get_rhoj_rhok
+from gpu4pyscf.df.grad.jk import get_rhojk, get_grad_vjk
 
 FREE_CUPY_CACHE = True
 BINSIZE = 128
@@ -80,39 +80,9 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True,
 
     # (L|ij) -> rhoj: (L), rhok: (L|oo)
     low = with_df.cd_low
-    rhoj, rhok = get_rhoj_rhok(with_df, dm, orbo, with_j=with_j, with_k=with_k)
+    rhoj, rhok = get_rhojk(with_df, dm, orbo, with_j=with_j, with_k=with_k)
     if dm2 is not None:
-        rhoj2, _   = get_rhoj_rhok(with_df, dm2_tmp, orbo, with_j=with_j, with_k=False)
-    '''
-    rows = with_df.intopt.cderi_row
-    cols = with_df.intopt.cderi_col
-    dm_sparse = dm[rows, cols]
-    dm_sparse[with_df.intopt.cderi_diag] *= .5
-    if dm2 is not None:
-        dm2_sparse = dm2_tmp[rows, cols]
-        dm2_sparse[with_df.intopt.cderi_diag] *= .5
-
-    blksize = with_df.get_blksize()
-    if with_j:
-        rhoj = cupy.empty([naux])
-        if dm2 is not None:
-            rhoj2 = cupy.empty([naux])
-    if with_k:
-        rhok = cupy.empty([naux, nocc, nocc], order='C')
-    p0 = p1 = 0
-
-    for cderi, cderi_sparse in with_df.loop(blksize=blksize):
-        p1 = p0 + cderi.shape[0]
-        if with_j:
-            rhoj[p0:p1] = 2.0*dm_sparse.dot(cderi_sparse)
-            if dm2 is not None:
-                rhoj2[p0:p1] = 2.0*dm2_sparse.dot(cderi_sparse)
-        if with_k:
-            tmp = contract('Lij,jk->Lki', cderi, orbo)
-            contract('Lki,il->Lkl', tmp, orbo, out=rhok[p0:p1])
-        p0 = p1
-    tmp = dm_sparse = cderi_sparse = cderi = None
-    '''
+        rhoj2, _   = get_rhojk(with_df, dm2_tmp, orbo, with_j=with_j, with_k=False)
 
     # (d/dX P|Q) contributions
     if omega and omega > 1e-10:
@@ -120,7 +90,9 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True,
             int2c_e1 = auxmol.intor('int2c2e_ip1')
     else:
         int2c_e1 = auxmol.intor('int2c2e_ip1')
+
     int2c_e1 = cupy.asarray(int2c_e1)
+    rhoj_cart = rhok_cart = None
     auxslices = auxmol.aoslice_by_atom()
     aux_cart2sph = intopt.aux_cart2sph
     low_t = low.T.copy()
@@ -154,6 +126,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True,
             rhok = contract('pq,qij->pij', low_t.T, rhok)
         elif low.tag == 'cd':
             rhok = solve_triangular(low_t, rhok.reshape(naux, -1), lower=False, overwrite_b=True).reshape(naux, nocc, nocc)
+            rhok = rhok.copy(order='C')
         tmp = contract('pij,qij->pq', rhok, rhok)
         tmp = intopt.unsort_orbitals(tmp, aux_axis=[0,1])
         vkaux = -contract('xpq,pq->xp', int2c_e1, tmp)
@@ -192,6 +165,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True,
         orbo_cart = orbo
     dm = orbo = None
 
+    """
     vj = vk = rhoj_tmp = rhok_tmp = None
     vjaux = vkaux = None
 
@@ -243,7 +217,10 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True,
 
         rhoj_tmp = rhok_tmp = vj_tmp = vk_tmp = None
         t1 = log.timer_debug1(f'calculate {cp_kl_id:3d} / {len(intopt.aux_log_qs):3d}, {k1-k0:3d} slices', *t1)
-
+    """
+    vj, vk, vjaux, vkaux = get_grad_vjk(with_df, mol, auxmol, rhoj_cart, dm_cart, rhok_cart, orbo_cart,
+                                        with_j=with_j, with_k=with_k, omega=omega)
+    
     # NOTE: vj and vk are still in cartesian
     _sorted_mol = intopt._sorted_mol
     natm = _sorted_mol.natm
@@ -260,6 +237,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True,
 
     _sorted_auxmol = intopt._sorted_auxmol
     natm = _sorted_auxmol.natm
+    naux_cart = _sorted_auxmol.nao
     aux2atom = np.zeros([naux_cart, natm])
     ao_loc = _sorted_auxmol.ao_loc
     for ibas, iatm in enumerate(_sorted_auxmol._bas[:,gto.ATOM_OF]):
diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py
index a29a50bd..47faa476 100644
--- a/gpu4pyscf/df/hessian/rhf.py
+++ b/gpu4pyscf/df/hessian/rhf.py
@@ -38,7 +38,7 @@
 from gpu4pyscf.df.hessian import jk
 
 LINEAR_DEP_THR = df.LINEAR_DEP_THR
-BLKSIZE = 128
+BLKSIZE = 256
 ALIGNED = getattr(__config__, 'ao_aligned', 32)
 GB = 1024*1024*1024
 
@@ -111,7 +111,8 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls
 
     # ================================ sorted AO begin ===============================================
     intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e')
-    intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE)
+    intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, verbose=0,
+                 group_size=BLKSIZE, group_size_aux=BLKSIZE)
     naux = auxmol.nao
     mocc_2 = intopt.sort_orbitals(mocc_2, axis=[0])
     dm0 = intopt.sort_orbitals(dm0, axis=[0,1])
@@ -135,21 +136,23 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls
 
     #  int3c contributions
     wj, wk_P__ = int3c2e.get_int3c2e_jk(mol, auxmol, dm0_tag, omega=omega)
-    t1 = log.timer_debug1('intermediate variables with int3c2e', *t1)
     rhoj0_P = rhok0_P__ = None
+    
     if with_j:
         rhoj0_P = solve_j2c(wj)
+        wj = None
     if with_k:
         rhok0_P__ = solve_j2c(wk_P__)
-    wj = wk_P__ = None
+        wk_P__ = None
+    t1 = log.timer_debug1('intermediate variables with int3c2e', *t1)
 
     # int3c_ip2 contributions
     wj_ip2, wk_ip2_P__ = int3c2e.get_int3c2e_ip2_wjk(intopt, dm0_tag, omega=omega)
-    t1 = log.timer_debug1('interdediate variables with int3c2e_ip2', *t1)
+    t1 = log.timer_debug1('intermediate variables with int3c2e_ip2', *t1)
 
     #  int3c_ip1 contributions
     wj1_P, wk1_Pko = int3c2e.get_int3c2e_ip1_wjk(intopt, dm0_tag, omega=omega)
-    t1 = log.timer_debug1('interdediate variables with int3c2e_ip1', *t1)
+    t1 = log.timer_debug1('intermediate variables with int3c2e_ip1', *t1)
 
     #rhoj1_P = contract('pq,pix->qix', int2c_inv, wj1_P)
     if with_j:
@@ -473,7 +476,7 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
 
     intopt.build(mf.direct_scf_tol,
                  diag_block_with_triu=True,
-                 aosym=False,
+                 aosym=False, verbose=0,
                  group_size_aux=BLKSIZE,
                  group_size=BLKSIZE)
     naux = auxmol.nao
diff --git a/gpu4pyscf/df/hessian/uhf.py b/gpu4pyscf/df/hessian/uhf.py
index 1b18fc9a..b77015f6 100644
--- a/gpu4pyscf/df/hessian/uhf.py
+++ b/gpu4pyscf/df/hessian/uhf.py
@@ -96,7 +96,8 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
 
     # ================================ sorted AO begin ===============================================
     intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e')
-    intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE)
+    intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, verbose=0,
+                 group_size=BLKSIZE, group_size_aux=BLKSIZE)
 
     mocca = intopt.sort_orbitals(mocca, axis=[0])
     moccb = intopt.sort_orbitals(moccb, axis=[0])
@@ -495,7 +496,7 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
     intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e')
     intopt.build(mf.direct_scf_tol,
                  diag_block_with_triu=True,
-                 aosym=False,
+                 aosym=False, verbose=0,
                  group_size_aux=BLKSIZE,
                  group_size=BLKSIZE)
 
diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py
index f4d3bbab..8bfa8a81 100644
--- a/gpu4pyscf/df/int3c2e.py
+++ b/gpu4pyscf/df/int3c2e.py
@@ -21,7 +21,7 @@
 from pyscf.scf import _vhf
 from gpu4pyscf.scf.int4c2e import BasisProdCache, libgvhf, libgint
 from gpu4pyscf.lib.cupy_helper import (block_c2s_diag, cart2sph, contract, get_avail_mem,
-                                       reduce_to_device, copy_array)
+                                       reduce_to_device, copy_array, transpose_sum)
 from gpu4pyscf.lib import logger
 from gpu4pyscf.gto.mole import basis_seg_contraction
 from gpu4pyscf.__config__ import _num_devices, _streams
@@ -29,7 +29,7 @@
 LMAX_ON_GPU = 8
 FREE_CUPY_CACHE = True
 STACK_SIZE_PER_THREAD = 8192 * 4
-BLKSIZE = 128
+BLKSIZE = 256
 NROOT_ON_GPU = 7
 
 def make_fake_mol():
@@ -103,8 +103,8 @@ def __del__(self):
         except AttributeError:
             pass
 
-    def build(self, cutoff=1e-14, group_size=None,
-              group_size_aux=None, diag_block_with_triu=False, aosym=False):
+    def build(self, cutoff=1e-14, group_size=None, group_size_aux=None, 
+              diag_block_with_triu=False, aosym=False, verbose=None):
         '''
         int3c2e is based on int2e with (ao,ao|aux,1)
         a tot_mol is created with concatenating [mol, fake_mol, aux_mol]
@@ -116,7 +116,9 @@ def build(self, cutoff=1e-14, group_size=None,
         mol = basis_seg_contraction(_mol, allow_replica=True)[0]
         auxmol = basis_seg_contraction(_auxmol, allow_replica=True)[0]
         
-        log = logger.new_logger(_mol, _mol.verbose)
+        if verbose is None:
+            verbose = _mol.verbose
+        log = logger.new_logger(_mol, verbose)
         cput0 = log.init_timer()
         _sorted_mol, sorted_idx, uniq_l_ctr, l_ctr_counts = sort_mol(mol, log=log)
 
@@ -218,28 +220,10 @@ def build(self, cutoff=1e-14, group_size=None,
         self.pair2bra = pair2bra
         self.pair2ket = pair2ket
         self.l_ctr_offsets = l_ctr_offsets
-        bas_pair2shls = np.hstack(pair2bra + pair2ket).astype(np.int32).reshape(2,-1)
-        bas_pairs_locs = np.append(0, np.cumsum([x.size for x in pair2bra])).astype(np.int32)
-        log_qs = log_qs + aux_log_qs
-        ao_loc = _tot_mol.ao_loc_nr(cart=True)
-        ncptype = len(log_qs)
 
         self._bpcache = {}
-        for n in range(_num_devices):
-            with cupy.cuda.Device(n), _streams[n]:
-                bpcache = ctypes.POINTER(BasisProdCache)()
-                scale_shellpair_diag = 1.
-                libgint.GINTinit_basis_prod(
-                    ctypes.byref(bpcache), ctypes.c_double(scale_shellpair_diag),
-                    ao_loc.ctypes.data_as(ctypes.c_void_p),
-                    bas_pair2shls.ctypes.data_as(ctypes.c_void_p),
-                    bas_pairs_locs.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(ncptype),
-                    _tot_mol._atm.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(_tot_mol.natm),
-                    _tot_mol._bas.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(_tot_mol.nbas),
-                    _tot_mol._env.ctypes.data_as(ctypes.c_void_p))
-                self._bpcache[n] = bpcache
 
-        cput1 = log.timer_debug1('Initialize GPU cache', *cput1)
+        bas_pairs_locs = np.append(0, np.cumsum([x.size for x in pair2bra])).astype(np.int32)
         self.bas_pairs_locs = bas_pairs_locs
         ncptype = len(self.log_qs)
         self.aosym = aosym
@@ -264,6 +248,27 @@ def build(self, cutoff=1e-14, group_size=None,
     @property
     def bpcache(self):
         device_id = cupy.cuda.Device().id
+        if device_id not in self._bpcache:
+            with cupy.cuda.Device(device_id), _streams[device_id]:
+                log = logger.new_logger(self.mol, self.mol.verbose)
+                cput0 = log.init_timer()
+                bpcache = ctypes.POINTER(BasisProdCache)()
+                scale_shellpair_diag = 1.
+                _tot_mol = self._tot_mol
+                log_qs = self.log_qs + self.aux_log_qs
+                ao_loc = _tot_mol.ao_loc_nr(cart=True)
+                bas_pair2shls = np.hstack(self.pair2bra + self.pair2ket).astype(np.int32).reshape(2,-1)
+                ncptype = len(log_qs)
+                libgint.GINTinit_basis_prod(
+                    ctypes.byref(bpcache), ctypes.c_double(scale_shellpair_diag),
+                    ao_loc.ctypes.data_as(ctypes.c_void_p),
+                    bas_pair2shls.ctypes.data_as(ctypes.c_void_p),
+                    self.bas_pairs_locs.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(ncptype),
+                    _tot_mol._atm.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(_tot_mol.natm),
+                    _tot_mol._bas.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(_tot_mol.nbas),
+                    _tot_mol._env.ctypes.data_as(ctypes.c_void_p))
+                self._bpcache[device_id] = bpcache
+                cput0 = log.timer_debug1(f'Initialize GPU cache on Device {device_id}', *cput0)
         bpcache = self._bpcache[device_id]
         return bpcache
 
@@ -496,16 +501,6 @@ def loop_int3c2e_general(intopt, task_list=None, ip_type='', omega=None, stream=
     ao_loc = intopt.ao_loc
     aux_ao_loc = intopt.aux_ao_loc
     comp = 3**order
-
-    lmax = intopt._sorted_mol._bas[:gto.ANG_OF].max()
-    aux_lmax = intopt._sorted_auxmol._bas[:gto.ANG_OF].max()
-    nroots = (lmax + aux_lmax + order)//2 + 1
-    if nroots > NROOT_ON_GPU:
-        from pyscf.gto.moleintor import getints, make_cintopt
-        pmol = intopt._tot_mol
-        intor = pmol._add_suffix('int3c2e_' + ip_type)
-        opt = make_cintopt(pmol._atm, pmol._bas, pmol._env, intor)
-
     nbins = 1
 
     # If task_list is not given, generate all the tasks
@@ -558,6 +553,11 @@ def loop_int3c2e_general(intopt, task_list=None, ip_type='', omega=None, stream=
             if err != 0:
                 raise RuntimeError(f'GINT_fill_int3c2e general failed, err={err}')
         else:
+            from pyscf.gto.moleintor import getints, make_cintopt
+            pmol = intopt._tot_mol
+            intor = pmol._add_suffix('int3c2e_' + ip_type)
+            opt = make_cintopt(pmol._atm, pmol._bas, pmol._env, intor)
+
             # TODO: sph2cart in CPU?
             ishl0, ishl1 = intopt.l_ctr_offsets[cpi], intopt.l_ctr_offsets[cpi+1]
             jshl0, jshl1 = intopt.l_ctr_offsets[cpj], intopt.l_ctr_offsets[cpj+1]
@@ -806,7 +806,7 @@ def _int3c2e_jk_task(intopt, task_k_list, dm0, mocc, device_id=0, omega=None):
                 int3c_blk = ints_o = None
             if intopt.aosym:
                 rhoj[k0:k1] = 2.0 * rhoj_tmp
-                rhok[k0:k1] = rhok_tmp + rhok_tmp.transpose([0,2,1])
+                rhok[k0:k1] = transpose_sum(rhok_tmp)
             else:
                 rhoj[k0:k1] = rhoj_tmp
                 rhok[k0:k1] = rhok_tmp
@@ -831,7 +831,7 @@ def get_int3c2e_jk(mol, auxmol, dm0_tag, with_k=True, omega=None):
         for device_id in range(_num_devices):
             future = executor.submit(
                 _int3c2e_jk_task, intopt, task_list[device_id],
-                dm0_tag, orbo, device_id=device_id, omega=omega)
+                dm0_tag.get(), orbo.get(), device_id=device_id, omega=omega)
             futures.append(future)
 
     rhoj_total = []
@@ -854,10 +854,12 @@ def _split_tasks(loads, ngroups):
         return [range(len(loads))]
     groups = [[] for _ in range(ngroups)]
     sums = [0] * ngroups
-    for i, load in enumerate(loads):
+
+    sorted_indices = np.argsort(loads)[::-1]
+    for idx in sorted_indices:
         min_index = sums.index(min(sums))
-        groups[min_index].append(i)
-        sums[min_index] += load
+        groups[min_index].append(idx)
+        sums[min_index] += loads[idx]
     return groups
 
 def _int3c2e_ip1_vjk_task(intopt, task_k_list, rhoj, rhok, dm0, orbo, device_id=0,
@@ -922,7 +924,7 @@ def _int3c2e_ip1_vjk_task(intopt, task_k_list, rhoj, rhok, dm0, orbo, device_id=
                     vk1[:,:,p0:p1] += contract('xiJo,ia->axJo', vk1_ao, ao2atom)
                     rhok0_slice = vk1_ao = None
             rhok_tmp = int3c_ip1_occ = None
-        t0 = log.timer_debug1(f'int3c2e_ip1 on Device {device_id}', *t0)
+        t0 = log.timer_debug1(f'int3c2e_ip1_vjk on Device {device_id}', *t0)
     # TODO: absorbe vj1_buf and vk1_buf into vj1 and vk1
     return vj1_buf, vk1_buf, vj1, vk1
 
@@ -1018,7 +1020,7 @@ def _int3c2e_ip2_vjk_task(intopt, task_k_list, rhoj, rhok, dm0, orbo,
                 vk1 += contract('xpir,pa->axir', vk1_tmp, aux2atom[k0:k1])
                 vk1_tmp = rhok0_oo = rhok0_slice = None
             rhok_tmp = wk2_P__ = None
-        t0 = log.timer_debug1(f'int3c2e_ip2 on Device {device_id}', *t0)
+        t0 = log.timer_debug1(f'int3c2e_ip2_vjk on Device {device_id}', *t0)
     return vj1, vk1
 
 def get_int3c2e_ip2_vjk(intopt, rhoj, rhok, dm0_tag, auxslices,
@@ -1056,7 +1058,7 @@ def get_int3c2e_ip2_vjk(intopt, rhoj, rhok, dm0_tag, auxslices,
         vk = reduce_to_device(vk_total, inplace=True)
     return vj, vk
 
-def _int3c2e_ip1_wjk_task(intopt, task_list, dm0, orbo, wk, device_id=0, with_k=True, omega=None):
+def _int3c2e_ip1_wjk_task(intopt, task_k_list, dm0, orbo, wk, device_id=0, with_k=True, omega=None):
     nao = intopt.mol.nao
     naux = intopt.auxmol.nao
     aux_ao_loc = intopt.aux_ao_loc
@@ -1068,7 +1070,7 @@ def _int3c2e_ip1_wjk_task(intopt, task_list, dm0, orbo, wk, device_id=0, with_k=
         wj = cupy.zeros([naux,nao,3])
         dm0 = cupy.asarray(dm0)
         orbo = cupy.asarray(orbo)
-        for cp_k in task_list:
+        for cp_k in task_k_list:
             k0, k1 = aux_ao_loc[cp_k], aux_ao_loc[cp_k+1]
             if with_k:
                 wk_tmp = cupy.zeros([k1-k0,nao,nocc,3])
@@ -1119,9 +1121,12 @@ def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None):
     return wj, wk
 
 def _int3c2e_ip2_wjk(intopt, task_list, dm0, orbo, with_k=True, omega=None, device_id=0):
+    aux_ao_loc = intopt.aux_ao_loc
     with cupy.cuda.Device(device_id), _streams[device_id]:
+        cupy.get_default_memory_pool().free_all_blocks()
         log = logger.new_logger(intopt.mol, intopt.mol.verbose)
         t0 = log.init_timer()
+        ncp_ij = len(intopt.log_qs)
         dm0 = cupy.asarray(dm0)
         orbo = cupy.asarray(orbo)
         naux = intopt.auxmol.nao
@@ -1130,26 +1135,28 @@ def _int3c2e_ip2_wjk(intopt, task_list, dm0, orbo, with_k=True, omega=None, devi
         wk = None
         if with_k:
             wk = cupy.zeros([naux,nocc,nocc,3])
-        for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, task_list=task_list,
-                                                                ip_type='ip2', omega=omega):
-            wj[k0:k1] += contract('xpji,ji->px', int3c_blk, dm0[j0:j1,i0:i1])
-            if with_k:
-                tmp = contract('xpji,jo->piox', int3c_blk, orbo[j0:j1])
-                wk[k0:k1] += contract('piox,ir->prox', tmp, orbo[i0:i1])
-                tmp = None
-            int3c_blk = None
+        for cp_k in task_list:
+            k0, k1 = aux_ao_loc[cp_k], aux_ao_loc[cp_k+1]
+            task_list = [(cp_k, cp_ij) for cp_ij in range(ncp_ij)]
+
+            for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, task_list=task_list,
+                                                                    ip_type='ip2', omega=omega):
+                wj[k0:k1] += contract('xpji,ji->px', int3c_blk, dm0[j0:j1,i0:i1])
+                if with_k:
+                    tmp = contract('xpji,jo->piox', int3c_blk, orbo[j0:j1])
+                    wk[k0:k1] += contract('piox,ir->prox', tmp, orbo[i0:i1])
+                    tmp = None
+                int3c_blk = None
         t0 = log.timer_debug1(f'int3c2e_ip2_wjk on Device {device_id}', *t0)
     return wj, wk
 
 def get_int3c2e_ip2_wjk(intopt, dm0_tag, with_k=True, omega=None):
     orbo = cupy.asarray(dm0_tag.occ_coeff, order='C')
     futures = []
-    ncp_k = len(intopt.aux_log_qs)
-    ncp_ij = len(intopt.log_qs)
-    tasks = np.array(list(itertools.product(range(ncp_k), range(ncp_ij))))
-    task_list = []
-    for device_id in range(_num_devices):
-        task_list.append(tasks[device_id::_num_devices])
+
+    aux_ao_loc = np.array(intopt.aux_ao_loc)
+    loads = aux_ao_loc[1:] - aux_ao_loc[:-1]
+    task_list = _split_tasks(loads, _num_devices)
 
     cupy.cuda.get_current_stream().synchronize()
     with ThreadPoolExecutor(max_workers=_num_devices) as executor:
diff --git a/gpu4pyscf/lib/cupy_helper.py b/gpu4pyscf/lib/cupy_helper.py
index 898a2846..95ae1f24 100644
--- a/gpu4pyscf/lib/cupy_helper.py
+++ b/gpu4pyscf/lib/cupy_helper.py
@@ -104,13 +104,13 @@ def concatenate(array_list):
     if _p2p_access:
         return cupy.concatenate(array_list)
     else:
-        array_list_cpu = [a.get() for a in array_list]
-        n = sum([a.shape[0] for a in array_list_cpu])
-        a0_shape = list(array_list_cpu[0].shape)
+        #array_list_cpu = [a.get() for a in array_list]
+        n = sum([a.shape[0] for a in array_list])
+        a0_shape = list(array_list[0].shape)
         out_shape = tuple([n] + a0_shape[1:])
         out = cupy.empty(out_shape)
         p0 = p1 = 0
-        for a in array_list_cpu:
+        for a in array_list:
             p1 = p0 + a.shape[0]
             #out[p0:p1].set(a)
             copy_array(a, out[p0:p1])
@@ -138,15 +138,16 @@ def reduce_to_device(array_list, inplace=False):
         result = array_list[0]
     else:
         result = array_list[0].copy()
+    
+    # Transfer data chunk by chunk, reduce memory footprint,
     result = result.reshape(-1)
-    # Asynchronously add each matrix from its device
     for device_id, matrix in enumerate(array_list):
         if device_id == 0:
             continue
         
         assert matrix.device.id == device_id
         matrix = matrix.reshape(-1)
-        blksize = 1024*1024*128 # 1GB
+        blksize = 1024*1024*1024 // matrix.itemsize # 1GB
         for p0, p1 in lib.prange(0,len(matrix), blksize):
             result[p0:p1] += copy_array(matrix[p0:p1])#cupy.asarray(matrix[p0:p1]) 
             #result[p0:p1] += cupy.asarray(matrix[p0:p1]) 
diff --git a/gpu4pyscf/tests/test_benchmark_rks.py b/gpu4pyscf/tests/test_benchmark_rks.py
index e709eafc..3d738890 100644
--- a/gpu4pyscf/tests/test_benchmark_rks.py
+++ b/gpu4pyscf/tests/test_benchmark_rks.py
@@ -24,11 +24,11 @@
 
 # How to run
 # 1. run test only
-# pytest test_rks.py --benchmark-disable -s -v -m "not slow" --durations=20
+# pytest test_benchmark_rks.py --benchmark-disable -s -v -m "not slow" --durations=20
 # 2. benchmark less expensive tasks
-# pytest test_rks.py -v -m "not slow"
+# pytest test_benchmark_rks.py -v -m "not slow"
 # 3. benchmark all the tests
-# pytest test_rks.py -v
+# pytest test_benchmark_rks.py -v
 
 current_folder = os.path.dirname(os.path.abspath(__file__))
 small_mol = os.path.join(current_folder, '020_Vitamin_C.xyz')

From ca79edb0c5d76c928180d5110838a368faa2f1f1 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Thu, 2 Jan 2025 06:46:51 +0000
Subject: [PATCH 32/49] bugfix for single gpu

---
 gpu4pyscf/df/df.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py
index a7f04370..ab1adeba 100644
--- a/gpu4pyscf/df/df.py
+++ b/gpu4pyscf/df/df.py
@@ -361,6 +361,6 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, aux_blksize,
                         tmp = copy_array(tmp)
                         _cderi[dev_id][:,ij0:ij1] = tmp
             else:
-                _cderi[slice_id][:,ij0:ij1] = cderi_block[p0:p1]
+                _cderi[0][:,ij0:ij1] = cderi_block
             t1 = log.timer_debug1(f'transfer data for {cp_ij_id} / {nq} on Device {device_id}', *t1)    
     return

From 8526424901176d65d3c365b077d66a14f5598ef4 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Fri, 3 Jan 2025 02:05:51 +0800
Subject: [PATCH 33/49] update benchmark script

---
 .../Linux-CPython-3.9-64bit/0001_v100.json    | 838 ++++++++++++++++++
 gpu4pyscf/tests/test_benchmark_rks.py         |  36 +-
 2 files changed, 867 insertions(+), 7 deletions(-)
 create mode 100644 gpu4pyscf/tests/.benchmarks/Linux-CPython-3.9-64bit/0001_v100.json

diff --git a/gpu4pyscf/tests/.benchmarks/Linux-CPython-3.9-64bit/0001_v100.json b/gpu4pyscf/tests/.benchmarks/Linux-CPython-3.9-64bit/0001_v100.json
new file mode 100644
index 00000000..4101f07e
--- /dev/null
+++ b/gpu4pyscf/tests/.benchmarks/Linux-CPython-3.9-64bit/0001_v100.json
@@ -0,0 +1,838 @@
+{
+    "machine_info": {
+        "node": "mlxlabzskrh20v669fd914-20240723162348-uim1c3-1eg5qx-worker",
+        "processor": "",
+        "machine": "x86_64",
+        "python_compiler": "GCC 10.2.1 20210110",
+        "python_implementation": "CPython",
+        "python_implementation_version": "3.9.2",
+        "python_version": "3.9.2",
+        "python_build": [
+            "default",
+            "Feb 28 2021 17:03:44"
+        ],
+        "release": "5.4.143.bsk.7-amd64",
+        "system": "Linux",
+        "cpu": {
+            "python_version": "3.9.2.final.0 (64 bit)",
+            "cpuinfo_version": [
+                9,
+                0,
+                0
+            ],
+            "cpuinfo_version_string": "9.0.0",
+            "arch": "X86_64",
+            "bits": 64,
+            "count": 96,
+            "arch_string_raw": "x86_64",
+            "vendor_id_raw": "GenuineIntel",
+            "brand_raw": "Intel(R) Xeon(R) Platinum 8260 CPU @ 2.40GHz",
+            "hz_advertised_friendly": "2.4000 GHz",
+            "hz_actual_friendly": "3.1000 GHz",
+            "hz_advertised": [
+                2400000000,
+                0
+            ],
+            "hz_actual": [
+                3100000000,
+                0
+            ],
+            "stepping": 7,
+            "model": 85,
+            "family": 6,
+            "flags": [
+                "3dnowprefetch",
+                "abm",
+                "acpi",
+                "adx",
+                "aes",
+                "aperfmperf",
+                "apic",
+                "arat",
+                "arch_capabilities",
+                "arch_perfmon",
+                "art",
+                "avx",
+                "avx2",
+                "avx512_vnni",
+                "avx512bw",
+                "avx512cd",
+                "avx512dq",
+                "avx512f",
+                "avx512vl",
+                "avx512vnni",
+                "bmi1",
+                "bmi2",
+                "bts",
+                "cat_l3",
+                "cdp_l3",
+                "clflush",
+                "clflushopt",
+                "clwb",
+                "cmov",
+                "constant_tsc",
+                "cpuid",
+                "cpuid_fault",
+                "cqm",
+                "cqm_llc",
+                "cqm_mbm_local",
+                "cqm_mbm_total",
+                "cqm_occup_llc",
+                "cx16",
+                "cx8",
+                "dca",
+                "de",
+                "ds_cpl",
+                "dtes64",
+                "dtherm",
+                "dts",
+                "epb",
+                "ept",
+                "ept_ad",
+                "erms",
+                "est",
+                "f16c",
+                "flexpriority",
+                "flush_l1d",
+                "fma",
+                "fpu",
+                "fsgsbase",
+                "fxsr",
+                "ht",
+                "hwp",
+                "hwp_act_window",
+                "hwp_epp",
+                "hwp_pkg_req",
+                "ibpb",
+                "ibrs",
+                "ibrs_enhanced",
+                "ida",
+                "intel_ppin",
+                "intel_pt",
+                "invpcid",
+                "invpcid_single",
+                "lahf_lm",
+                "lm",
+                "mba",
+                "mca",
+                "mce",
+                "md_clear",
+                "mmx",
+                "movbe",
+                "mpx",
+                "msr",
+                "mtrr",
+                "nonstop_tsc",
+                "nopl",
+                "nx",
+                "ospke",
+                "osxsave",
+                "pae",
+                "pat",
+                "pbe",
+                "pcid",
+                "pclmulqdq",
+                "pdcm",
+                "pdpe1gb",
+                "pebs",
+                "pge",
+                "pku",
+                "pln",
+                "pni",
+                "popcnt",
+                "pqe",
+                "pqm",
+                "pse",
+                "pse36",
+                "pts",
+                "rdrand",
+                "rdrnd",
+                "rdseed",
+                "rdt_a",
+                "rdtscp",
+                "rep_good",
+                "sdbg",
+                "sep",
+                "smap",
+                "smep",
+                "smx",
+                "ss",
+                "ssbd",
+                "sse",
+                "sse2",
+                "sse4_1",
+                "sse4_2",
+                "ssse3",
+                "stibp",
+                "syscall",
+                "tm",
+                "tm2",
+                "tpr_shadow",
+                "tsc",
+                "tsc_adjust",
+                "tsc_deadline_timer",
+                "tscdeadline",
+                "vme",
+                "vmx",
+                "vnmi",
+                "vpid",
+                "x2apic",
+                "xgetbv1",
+                "xsave",
+                "xsavec",
+                "xsaveopt",
+                "xsaves",
+                "xtopology",
+                "xtpr"
+            ],
+            "l3_cache_size": 37486592,
+            "l2_cache_size": 50331648,
+            "l1_data_cache_size": "1.5 MiB",
+            "l1_instruction_cache_size": "1.5 MiB",
+            "l2_cache_line_size": 256,
+            "l2_cache_associativity": 6
+        }
+    },
+    "commit_info": {
+        "id": "ca79edb0c5d76c928180d5110838a368faa2f1f1",
+        "time": "2025-01-02T06:46:51+00:00",
+        "author_time": "2025-01-02T06:46:51+00:00",
+        "dirty": true,
+        "project": "gpu4pyscf",
+        "branch": "benchmark_ci"
+    },
+    "benchmarks": [
+        {
+            "group": null,
+            "name": "test_df_rb3lyp",
+            "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 5,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": false
+            },
+            "stats": {
+                "min": 3.1617951644584537,
+                "max": 3.7628091080114245,
+                "mean": 3.4210989899933337,
+                "stddev": 0.22879307906931964,
+                "rounds": 5,
+                "median": 3.411895725876093,
+                "iqr": 0.3097648276016116,
+                "q1": 3.2490662271156907,
+                "q3": 3.5588310547173023,
+                "iqr_outliers": 0,
+                "stddev_outliers": 2,
+                "outliers": "2;0",
+                "ld15iqr": 3.1617951644584537,
+                "hd15iqr": 3.7628091080114245,
+                "ops": 0.2923037313228836,
+                "total": 17.10549494996667,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_df_rb3lyp_grad",
+            "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_grad",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 5,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": false
+            },
+            "stats": {
+                "min": 4.783565265126526,
+                "max": 5.297894531860948,
+                "mean": 5.111189672350884,
+                "stddev": 0.21592611330232545,
+                "rounds": 5,
+                "median": 5.1955106453970075,
+                "iqr": 0.32867400418035686,
+                "q1": 4.950462192296982,
+                "q3": 5.279136196477339,
+                "iqr_outliers": 0,
+                "stddev_outliers": 1,
+                "outliers": "1;0",
+                "ld15iqr": 4.783565265126526,
+                "hd15iqr": 5.297894531860948,
+                "ops": 0.1956491666528297,
+                "total": 25.555948361754417,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_rb3lyp",
+            "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_rb3lyp",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 5,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": false
+            },
+            "stats": {
+                "min": 40.746477760374546,
+                "max": 41.372105130925775,
+                "mean": 41.03762242179364,
+                "stddev": 0.23753417144600847,
+                "rounds": 5,
+                "median": 41.043180647306144,
+                "iqr": 0.33331693802028894,
+                "q1": 40.858045106288046,
+                "q3": 41.191362044308335,
+                "iqr_outliers": 0,
+                "stddev_outliers": 2,
+                "outliers": "2;0",
+                "ld15iqr": 40.746477760374546,
+                "hd15iqr": 41.372105130925775,
+                "ops": 0.02436788344416696,
+                "total": 205.1881121089682,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_rb3lyp_grad",
+            "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_rb3lyp_grad",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 5,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": false
+            },
+            "stats": {
+                "min": 49.95022037625313,
+                "max": 50.498252051882446,
+                "mean": 50.14596408084035,
+                "stddev": 0.20855077359110574,
+                "rounds": 5,
+                "median": 50.08650258369744,
+                "iqr": 0.19796669483184814,
+                "q1": 50.0301427282393,
+                "q3": 50.228109423071146,
+                "iqr_outliers": 0,
+                "stddev_outliers": 1,
+                "outliers": "1;0",
+                "ld15iqr": 49.95022037625313,
+                "hd15iqr": 50.498252051882446,
+                "ops": 0.01994178431564102,
+                "total": 250.72982040420175,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_rb3lyp_hessian",
+            "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_rb3lyp_hessian",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 5,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": false
+            },
+            "stats": {
+                "min": 613.87584313564,
+                "max": 620.2618521982804,
+                "mean": 616.6373227955773,
+                "stddev": 2.3903446370287127,
+                "rounds": 5,
+                "median": 616.8295351918787,
+                "iqr": 2.8947918817866594,
+                "q1": 614.9020847703796,
+                "q3": 617.7968766521662,
+                "iqr_outliers": 0,
+                "stddev_outliers": 2,
+                "outliers": "2;0",
+                "ld15iqr": 613.87584313564,
+                "hd15iqr": 620.2618521982804,
+                "ops": 0.0016216987896003692,
+                "total": 3083.1866139778867,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_df_rb3lyp_medium",
+            "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_medium",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 5,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": false
+            },
+            "stats": {
+                "min": 18.224224156700075,
+                "max": 20.157692234031856,
+                "mean": 19.024707913957535,
+                "stddev": 0.8064738008793728,
+                "rounds": 5,
+                "median": 19.21132487989962,
+                "iqr": 1.2390491359401494,
+                "q1": 18.252076843054965,
+                "q3": 19.491125978995115,
+                "iqr_outliers": 0,
+                "stddev_outliers": 1,
+                "outliers": "1;0",
+                "ld15iqr": 18.224224156700075,
+                "hd15iqr": 20.157692234031856,
+                "ops": 0.052563224861199936,
+                "total": 95.12353956978768,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_df_rb3lyp_grad_medium",
+            "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_grad_medium",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 5,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": false
+            },
+            "stats": {
+                "min": 28.5077072577551,
+                "max": 32.310115557163954,
+                "mean": 29.75330361928791,
+                "stddev": 1.5559254109717362,
+                "rounds": 5,
+                "median": 28.967016119509935,
+                "iqr": 1.9220157761592418,
+                "q1": 28.759349649539217,
+                "q3": 30.68136542569846,
+                "iqr_outliers": 0,
+                "stddev_outliers": 1,
+                "outliers": "1;0",
+                "ld15iqr": 28.5077072577551,
+                "hd15iqr": 32.310115557163954,
+                "ops": 0.03360971315305434,
+                "total": 148.76651809643954,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_df_rb3lyp_hessian_medium",
+            "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_hessian_medium",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 5,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": false
+            },
+            "stats": {
+                "min": 603.508519613184,
+                "max": 605.9056743234396,
+                "mean": 605.091381527856,
+                "stddev": 0.9420383756463966,
+                "rounds": 5,
+                "median": 605.3425533128902,
+                "iqr": 1.0546113743912429,
+                "q1": 604.6620287010446,
+                "q3": 605.7166400754359,
+                "iqr_outliers": 0,
+                "stddev_outliers": 1,
+                "outliers": "1;0",
+                "ld15iqr": 603.508519613184,
+                "hd15iqr": 605.9056743234396,
+                "ops": 0.0016526429404348803,
+                "total": 3025.45690763928,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_rb3lyp_medium",
+            "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_rb3lyp_medium",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 5,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": false
+            },
+            "stats": {
+                "min": 462.8745871502906,
+                "max": 464.5682009514421,
+                "mean": 463.4215483263135,
+                "stddev": 0.6988488955671605,
+                "rounds": 5,
+                "median": 463.2153818728402,
+                "iqr": 0.9204953673761338,
+                "q1": 462.88869020040147,
+                "q3": 463.8091855677776,
+                "iqr_outliers": 0,
+                "stddev_outliers": 1,
+                "outliers": "1;0",
+                "ld15iqr": 462.8745871502906,
+                "hd15iqr": 464.5682009514421,
+                "ops": 0.002157862541376389,
+                "total": 2317.1077416315675,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_rb3lyp_grad_medium",
+            "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_rb3lyp_grad_medium",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 5,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": false
+            },
+            "stats": {
+                "min": 552.8754901243374,
+                "max": 554.911680762656,
+                "mean": 554.3076192354783,
+                "stddev": 0.8161092352116484,
+                "rounds": 5,
+                "median": 554.5314849242568,
+                "iqr": 0.6415546571370214,
+                "q1": 554.1099091696087,
+                "q3": 554.7514638267457,
+                "iqr_outliers": 1,
+                "stddev_outliers": 1,
+                "outliers": "1;1",
+                "ld15iqr": 554.5213821846992,
+                "hd15iqr": 554.911680762656,
+                "ops": 0.0018040524165611094,
+                "total": 2771.5380961773917,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_df_rb3lyp_631gs",
+            "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_631gs",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 5,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": false
+            },
+            "stats": {
+                "min": 1.5329652335494757,
+                "max": 1.6051436373963952,
+                "mean": 1.5614585245028139,
+                "stddev": 0.027036901841682143,
+                "rounds": 5,
+                "median": 1.5542930895462632,
+                "iqr": 0.02967151813209057,
+                "q1": 1.5455118480604142,
+                "q3": 1.5751833661925048,
+                "iqr_outliers": 0,
+                "stddev_outliers": 2,
+                "outliers": "2;0",
+                "ld15iqr": 1.5329652335494757,
+                "hd15iqr": 1.6051436373963952,
+                "ops": 0.6404268728933491,
+                "total": 7.807292622514069,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_df_rb3lyp_631gs_grad",
+            "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_631gs_grad",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 5,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": false
+            },
+            "stats": {
+                "min": 2.280210480093956,
+                "max": 2.3804853092879057,
+                "mean": 2.317926473915577,
+                "stddev": 0.04595680285671692,
+                "rounds": 5,
+                "median": 2.290554977953434,
+                "iqr": 0.07659115269780159,
+                "q1": 2.283684498164803,
+                "q3": 2.3602756508626044,
+                "iqr_outliers": 0,
+                "stddev_outliers": 1,
+                "outliers": "1;0",
+                "ld15iqr": 2.280210480093956,
+                "hd15iqr": 2.3804853092879057,
+                "ops": 0.43142006929613325,
+                "total": 11.589632369577885,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_df_rb3lyp_631gs_hessian",
+            "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_631gs_hessian",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 5,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": false
+            },
+            "stats": {
+                "min": 13.266008610837162,
+                "max": 13.778700362890959,
+                "mean": 13.425486170500516,
+                "stddev": 0.20500983387954833,
+                "rounds": 5,
+                "median": 13.3493886096403,
+                "iqr": 0.20474978047423065,
+                "q1": 13.303213707404211,
+                "q3": 13.507963487878442,
+                "iqr_outliers": 0,
+                "stddev_outliers": 1,
+                "outliers": "1;0",
+                "ld15iqr": 13.266008610837162,
+                "hd15iqr": 13.778700362890959,
+                "ops": 0.07448519832356425,
+                "total": 67.12743085250258,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_rb3lyp_631gs_large",
+            "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_rb3lyp_631gs_large",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 5,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": false
+            },
+            "stats": {
+                "min": 54.99403030797839,
+                "max": 55.44687832519412,
+                "mean": 55.280799315497276,
+                "stddev": 0.17365676139080558,
+                "rounds": 5,
+                "median": 55.29263620171696,
+                "iqr": 0.18626192840747535,
+                "q1": 55.21340201841667,
+                "q3": 55.39966394682415,
+                "iqr_outliers": 0,
+                "stddev_outliers": 1,
+                "outliers": "1;0",
+                "ld15iqr": 54.99403030797839,
+                "hd15iqr": 55.44687832519412,
+                "ops": 0.018089463473435388,
+                "total": 276.4039965774864,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_rb3lyp_631gs_grad_large",
+            "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_rb3lyp_631gs_grad_large",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 5,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": false
+            },
+            "stats": {
+                "min": 68.62203936092556,
+                "max": 69.5449710758403,
+                "mean": 69.09264576677234,
+                "stddev": 0.4051404972610001,
+                "rounds": 5,
+                "median": 68.97573095746338,
+                "iqr": 0.7140946059953421,
+                "q1": 68.78401179146022,
+                "q3": 69.49810639745556,
+                "iqr_outliers": 0,
+                "stddev_outliers": 2,
+                "outliers": "2;0",
+                "ld15iqr": 68.62203936092556,
+                "hd15iqr": 69.5449710758403,
+                "ops": 0.014473320407725865,
+                "total": 345.46322883386165,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_df_rb3lyp_631gs_solvent",
+            "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_631gs_solvent",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 5,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": false
+            },
+            "stats": {
+                "min": 2.3869710564613342,
+                "max": 2.530611244030297,
+                "mean": 2.4419715087860823,
+                "stddev": 0.05630153012020871,
+                "rounds": 5,
+                "median": 2.43410103302449,
+                "iqr": 0.07564499671570957,
+                "q1": 2.398690618108958,
+                "q3": 2.4743356148246676,
+                "iqr_outliers": 0,
+                "stddev_outliers": 1,
+                "outliers": "1;0",
+                "ld15iqr": 2.3869710564613342,
+                "hd15iqr": 2.530611244030297,
+                "ops": 0.40950518726449253,
+                "total": 12.209857543930411,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_df_rb3lyp_631gs_solvent_grad",
+            "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_631gs_solvent_grad",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 5,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": false
+            },
+            "stats": {
+                "min": 2.973248357884586,
+                "max": 3.1569794192910194,
+                "mean": 3.076459738984704,
+                "stddev": 0.06885617804924284,
+                "rounds": 5,
+                "median": 3.072229014709592,
+                "iqr": 0.08629889693111181,
+                "q1": 3.040569737320766,
+                "q3": 3.1268686342518777,
+                "iqr_outliers": 0,
+                "stddev_outliers": 2,
+                "outliers": "2;0",
+                "ld15iqr": 2.973248357884586,
+                "hd15iqr": 3.1569794192910194,
+                "ops": 0.3250489474404826,
+                "total": 15.38229869492352,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_df_rb3lyp_631gs_solvent_hessian",
+            "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_631gs_solvent_hessian",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 5,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": false
+            },
+            "stats": {
+                "min": 107.97044172231108,
+                "max": 109.67866003140807,
+                "mean": 108.94072148483247,
+                "stddev": 0.7142319621034929,
+                "rounds": 5,
+                "median": 108.94882048200816,
+                "iqr": 1.1987205250188708,
+                "q1": 108.39640940236859,
+                "q3": 109.59512992738746,
+                "iqr_outliers": 0,
+                "stddev_outliers": 2,
+                "outliers": "2;0",
+                "ld15iqr": 107.97044172231108,
+                "hd15iqr": 109.67866003140807,
+                "ops": 0.009179303995514913,
+                "total": 544.7036074241623,
+                "iterations": 1
+            }
+        }
+    ],
+    "datetime": "2025-01-02T13:26:31.464476+00:00",
+    "version": "5.1.0"
+}
\ No newline at end of file
diff --git a/gpu4pyscf/tests/test_benchmark_rks.py b/gpu4pyscf/tests/test_benchmark_rks.py
index 3d738890..7847993d 100644
--- a/gpu4pyscf/tests/test_benchmark_rks.py
+++ b/gpu4pyscf/tests/test_benchmark_rks.py
@@ -25,11 +25,19 @@
 # How to run
 # 1. run test only
 # pytest test_benchmark_rks.py --benchmark-disable -s -v -m "not slow" --durations=20
+
 # 2. benchmark less expensive tasks
 # pytest test_benchmark_rks.py -v -m "not slow"
+
 # 3. benchmark all the tests
 # pytest test_benchmark_rks.py -v
 
+# 4. save benchmark results
+# pytest test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-save=v100
+
+# 5. compare benchmark results, fail if performance regresses by more than 10%
+# pytest test_benchmark_rks.py -s -v -m "not slow" --benchmark-compare=v100 --benchmark-compare-fail=10%
+
 current_folder = os.path.dirname(os.path.abspath(__file__))
 small_mol = os.path.join(current_folder, '020_Vitamin_C.xyz')
 medium_mol = os.path.join(current_folder, '057_Tamoxifen.xyz')
@@ -81,8 +89,9 @@ def run_rb3lyp_hessian(atom, basis, with_df, with_solvent, disp=None):
     mf.kernel()
     h = mf.Hessian().kernel()
     return h
-
+#######
 # DF
+#######
 @pytest.mark.benchmark
 def test_df_rb3lyp(benchmark):
     e = benchmark(run_rb3lyp, small_mol, 'def2-tzvpp', True, False)
@@ -100,7 +109,9 @@ def test_df_rb3lyp_hessian(benchmark):
     print('testing df rb3lyp hessian')
     assert np.isclose(np.linalg.norm(h), 3.7668761221997764, atol=1e-4)
 
+################
 # Direct SCF
+################
 @pytest.mark.benchmark
 def test_rb3lyp(benchmark):
     e = benchmark(run_rb3lyp, small_mol, 'def2-tzvpp', False, False)
@@ -117,7 +128,9 @@ def test_rb3lyp_hessian(benchmark):
     print('testing rb3lyp hessian')
     assert np.isclose(np.linalg.norm(h), 3.7588443634477833, atol=1e-4)
 
-# medium molecule
+####################
+# Medium molecule
+####################
 @pytest.mark.benchmark
 def test_df_rb3lyp_medium(benchmark):
     e = benchmark(run_rb3lyp, medium_mol, 'def2-tzvpp', True, False)
@@ -144,7 +157,6 @@ def test_rb3lyp_grad_medium(benchmark):
     g = benchmark(run_rb3lyp_grad, medium_mol, 'def2-tzvpp', False, False)
     print('testing rb3lyp grad medium')
     assert np.isclose(np.linalg.norm(g), 0.2601443836937988, atol=1e-5)
-
 @pytest.mark.slow
 @pytest.mark.benchmark
 def test_rb3lyp_hessian_medium(benchmark):
@@ -152,12 +164,16 @@ def test_rb3lyp_hessian_medium(benchmark):
     print('testing rb3lyp hessian medium')
     assert np.isclose(np.linalg.norm(h), 6.312714778020796, atol=1e-4)
 
+####################
 # large molecule
+####################
+@pytest.mark.high_memory
 @pytest.mark.benchmark
 def test_df_rb3lyp_large(benchmark):
     e = benchmark(run_rb3lyp, large_mol, 'def2-tzvpp', True, False)
     print('testing df rb3lyp large')
     assert np.isclose(np.linalg.norm(e), 2564.198712152175, atol=1e-7)
+@pytest.mark.high_memory
 @pytest.mark.benchmark
 def test_df_rb3lyp_grad_large(benchmark):
     g = benchmark(run_rb3lyp_grad, large_mol, 'def2-tzvpp', True, False)
@@ -192,13 +208,15 @@ def test_rb3lyp_hessian_large(benchmark):
     print('testing rb3lyp hessian large')
     print(np.linalg.norm(h))
 '''
-# small basis set
+
+#####################
+# Small basis set
+#####################
 @pytest.mark.benchmark
 def test_df_rb3lyp_631gs(benchmark):
     e = benchmark(run_rb3lyp, small_mol, '6-31gs', True, False)
     print('testing df rb3lyp 631gs')
     assert np.isclose(np.linalg.norm(e), 684.6646008642876, atol=1e-7)
-
 @pytest.mark.benchmark
 def test_df_rb3lyp_631gs_grad(benchmark):
     g = benchmark(run_rb3lyp_grad, small_mol, '6-31gs', True, False)
@@ -210,7 +228,9 @@ def test_df_rb3lyp_631gs_hessian(benchmark):
     print('testing df rb3lyp 631gs hessian')
     assert np.isclose(np.linalg.norm(h), 3.908874851569459, atol=1e-4)
 
-# small basis set for large molecule
+#########################################
+# Small basis set for large molecule
+#########################################
 @pytest.mark.benchmark
 def test_rb3lyp_631gs_large(benchmark):
     e = benchmark(run_rb3lyp, large_mol, '6-31gs', False, False)
@@ -228,7 +248,9 @@ def test_rb3lyp_631gs_hessian_large(benchmark):
     print('testing df rb3lyp 631gs hessian large')
     assert np.isclose(np.linalg.norm(h), 7.920764634100053, atol=1e-4)
 
-#solvent model
+###################
+# Solvent model
+###################
 @pytest.mark.benchmark
 def test_df_rb3lyp_631gs_solvent(benchmark):
     e = benchmark(run_rb3lyp, small_mol, '6-31gs', True, True)

From a30c19611e1fb417e53a817c612bbebd6d021406 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Fri, 3 Jan 2025 04:24:12 +0800
Subject: [PATCH 34/49] np.isclose

---
 gpu4pyscf/tests/test_benchmark_rks.py | 59 +++++++++++++++------------
 gpu4pyscf/tests/test_benchmark_uks.py | 15 +++----
 2 files changed, 40 insertions(+), 34 deletions(-)

diff --git a/gpu4pyscf/tests/test_benchmark_rks.py b/gpu4pyscf/tests/test_benchmark_rks.py
index 7847993d..b8c593b7 100644
--- a/gpu4pyscf/tests/test_benchmark_rks.py
+++ b/gpu4pyscf/tests/test_benchmark_rks.py
@@ -87,8 +87,12 @@ def run_rb3lyp_hessian(atom, basis, with_df, with_solvent, disp=None):
     mf.conv_tol = 1e-10
     mf.conv_tol_cpscf = 1e-6
     mf.kernel()
-    h = mf.Hessian().kernel()
+    h = mf.Hessian()
+    if with_df:
+        h.auxbasis_response = 2
+    h.kernel()
     return h
+
 #######
 # DF
 #######
@@ -96,18 +100,18 @@ def run_rb3lyp_hessian(atom, basis, with_df, with_solvent, disp=None):
 def test_df_rb3lyp(benchmark):
     e = benchmark(run_rb3lyp, small_mol, 'def2-tzvpp', True, False)
     print('testing df rb3lyp')
-    assert np.isclose(np.linalg.norm(e), 684.9998712035579, atol=1e-7)
+    assert np.isclose(np.linalg.norm(e), 684.9998712035579, atol=1e-7, rtol=1e-16)
 @pytest.mark.benchmark
 def test_df_rb3lyp_grad(benchmark):
     g = benchmark(run_rb3lyp_grad, small_mol, 'def2-tzvpp', True, False)
     print('testing df rb3lyp grad')
-    assert np.isclose(np.linalg.norm(g), 0.17435941081837686, atol=1e-5)
+    assert np.isclose(np.linalg.norm(g), 0.17435941081837686, atol=1e-5, rtol=1e-16)
 @pytest.mark.slow
 @pytest.mark.benchmark
 def test_df_rb3lyp_hessian(benchmark):
     h = benchmark(run_rb3lyp_hessian, small_mol, 'def2-tzvpp', True, False)
     print('testing df rb3lyp hessian')
-    assert np.isclose(np.linalg.norm(h), 3.7668761221997764, atol=1e-4)
+    assert np.isclose(np.linalg.norm(h), 3.7668761221997764, atol=1e-4, rtol=1e-16)
 
 ################
 # Direct SCF
@@ -116,17 +120,17 @@ def test_df_rb3lyp_hessian(benchmark):
 def test_rb3lyp(benchmark):
     e = benchmark(run_rb3lyp, small_mol, 'def2-tzvpp', False, False)
     print('testing rb3lyp')
-    assert np.isclose(np.linalg.norm(e), 684.999735850967, atol=1e-7)
+    assert np.isclose(np.linalg.norm(e), 684.999735850967, atol=1e-7, rtol=1e-16)
 @pytest.mark.benchmark
 def test_rb3lyp_grad(benchmark):
     g = benchmark(run_rb3lyp_grad, small_mol, 'def2-tzvpp', False, False)
     print('testing rb3lyp grad')
-    assert np.isclose(np.linalg.norm(g), 0.1744127474130983, atol=1e-5)
+    assert np.isclose(np.linalg.norm(g), 0.1744127474130983, atol=1e-5, rtol=1e-16)
 @pytest.mark.benchmark
 def test_rb3lyp_hessian(benchmark):
     h = benchmark(run_rb3lyp_hessian, small_mol, 'def2-tzvpp', False, False)
     print('testing rb3lyp hessian')
-    assert np.isclose(np.linalg.norm(h), 3.7588443634477833, atol=1e-4)
+    assert np.isclose(np.linalg.norm(h), 3.7588443634477833, atol=1e-4, rtol=1e-16)
 
 ####################
 # Medium molecule
@@ -135,34 +139,35 @@ def test_rb3lyp_hessian(benchmark):
 def test_df_rb3lyp_medium(benchmark):
     e = benchmark(run_rb3lyp, medium_mol, 'def2-tzvpp', True, False)
     print('testing df rb3lyp medium')
-    assert np.isclose(np.linalg.norm(e), 1138.371390377773, atol=1e-7)
+    assert np.isclose(np.linalg.norm(e), 1138.371390377773, atol=1e-7, rtol=1e-16)
 @pytest.mark.benchmark
 def test_df_rb3lyp_grad_medium(benchmark):
     g = benchmark(run_rb3lyp_grad, medium_mol, 'def2-tzvpp', True, False)
     print('testing df rb3lyp grad medium')
-    assert np.isclose(np.linalg.norm(g), 0.26010545073602614, atol=1e-4)
+    assert np.isclose(np.linalg.norm(g), 0.26010545073602614, atol=1e-5, rtol=1e-16)
 @pytest.mark.benchmark
 def test_df_rb3lyp_hessian_medium(benchmark):
     h = benchmark(run_rb3lyp_hessian, medium_mol, 'def2-tzvpp', True, False)
     print('testing df rb3lyp hessian medium')
-    assert np.isclose(np.linalg.norm(h), 6.32514169232998, atol=1e-4)
+    print(np.linalg.norm(h) - 6.32514169232998)
+    assert np.isclose(np.linalg.norm(h), 6.32514169232998, atol=1e-4, rtol=1e-16)
 
 @pytest.mark.benchmark
 def test_rb3lyp_medium(benchmark):
     e = benchmark(run_rb3lyp, medium_mol, 'def2-tzvpp', False, False)
     print('testing rb3lyp medium')
-    assert np.isclose(np.linalg.norm(e), 1138.3710752128077, atol=1e-7)
+    assert np.isclose(np.linalg.norm(e), 1138.3710752128077, atol=1e-7, rtol=1e-16)
 @pytest.mark.benchmark
 def test_rb3lyp_grad_medium(benchmark):
     g = benchmark(run_rb3lyp_grad, medium_mol, 'def2-tzvpp', False, False)
     print('testing rb3lyp grad medium')
-    assert np.isclose(np.linalg.norm(g), 0.2601443836937988, atol=1e-5)
+    assert np.isclose(np.linalg.norm(g), 0.2601443836937988, atol=1e-5, rtol=1e-16)
 @pytest.mark.slow
 @pytest.mark.benchmark
 def test_rb3lyp_hessian_medium(benchmark):
     h = benchmark(run_rb3lyp_hessian, medium_mol, 'def2-tzvpp', False, False)
     print('testing rb3lyp hessian medium')
-    assert np.isclose(np.linalg.norm(h), 6.312714778020796, atol=1e-4)
+    assert np.isclose(np.linalg.norm(h), 6.312714778020796, atol=1e-4, rtol=1e-16)
 
 ####################
 # large molecule
@@ -172,32 +177,32 @@ def test_rb3lyp_hessian_medium(benchmark):
 def test_df_rb3lyp_large(benchmark):
     e = benchmark(run_rb3lyp, large_mol, 'def2-tzvpp', True, False)
     print('testing df rb3lyp large')
-    assert np.isclose(np.linalg.norm(e), 2564.198712152175, atol=1e-7)
+    assert np.isclose(np.linalg.norm(e), 2564.198712152175, atol=1e-7, rtol=1e-16)
 @pytest.mark.high_memory
 @pytest.mark.benchmark
 def test_df_rb3lyp_grad_large(benchmark):
     g = benchmark(run_rb3lyp_grad, large_mol, 'def2-tzvpp', True, False)
     print('testing df rb3lyp grad large')
-    assert np.isclose(np.linalg.norm(g), 0.3784358687859323, atol=1e-5)
+    assert np.isclose(np.linalg.norm(g), 0.3784358687859323, atol=1e-5, rtol=1e-16)
 @pytest.mark.high_memory
 @pytest.mark.slow
 @pytest.mark.benchmark
 def test_df_rb3lyp_hessian_large(benchmark):
     h = benchmark(run_rb3lyp_hessian, large_mol, 'def2-tzvpp', True, False)
     print('testing df rb3lyp hessian large')
-    assert np.isclose(np.linalg.norm(h), 7.583208736873523, atol=1e-4)
+    assert np.isclose(np.linalg.norm(h), 7.583208736873523, atol=1e-4, rtol=1e-16)
 @pytest.mark.slow
 @pytest.mark.benchmark
 def test_rb3lyp_large(benchmark):
     e = benchmark(run_rb3lyp, large_mol, 'def2-tzvpp', False, False)
     print('testing rb3lyp large')
-    assert np.isclose(np.linalg.norm(e), 2564.198099576358, atol=1e-7)
+    assert np.isclose(np.linalg.norm(e), 2564.198099576358, atol=1e-7, rtol=1e-16)
 @pytest.mark.slow
 @pytest.mark.benchmark
 def test_rb3lyp_grad_large(benchmark):
     g = benchmark(run_rb3lyp_grad, large_mol, 'def2-tzvpp', False, False)
     print('testing rb3lyp grad large')
-    assert np.isclose(np.linalg.norm(g), 0.3784664384209763, atol=1e-5)
+    assert np.isclose(np.linalg.norm(g), 0.3784664384209763, atol=1e-5, rtol=1e-16)
 
 # Hessian for large molecule with large basis set is too slow
 '''
@@ -216,17 +221,17 @@ def test_rb3lyp_hessian_large(benchmark):
 def test_df_rb3lyp_631gs(benchmark):
     e = benchmark(run_rb3lyp, small_mol, '6-31gs', True, False)
     print('testing df rb3lyp 631gs')
-    assert np.isclose(np.linalg.norm(e), 684.6646008642876, atol=1e-7)
+    assert np.isclose(np.linalg.norm(e), 684.6646008642876, atol=1e-7, rtol=1e-16)
 @pytest.mark.benchmark
 def test_df_rb3lyp_631gs_grad(benchmark):
     g = benchmark(run_rb3lyp_grad, small_mol, '6-31gs', True, False)
     print('testing df rb3lyp 631gs grad')
-    assert np.isclose(np.linalg.norm(g), 0.17530687343398219, atol=1e-5)
+    assert np.isclose(np.linalg.norm(g), 0.17530687343398219, atol=1e-5, rtol=1e-16)
 @pytest.mark.benchmark
 def test_df_rb3lyp_631gs_hessian(benchmark):
     h = benchmark(run_rb3lyp_hessian, small_mol, '6-31gs', True, False)
     print('testing df rb3lyp 631gs hessian')
-    assert np.isclose(np.linalg.norm(h), 3.908874851569459, atol=1e-4)
+    assert np.isclose(np.linalg.norm(h), 3.908874851569459, atol=1e-4, rtol=1e-16)
 
 #########################################
 # Small basis set for large molecule
@@ -235,18 +240,18 @@ def test_df_rb3lyp_631gs_hessian(benchmark):
 def test_rb3lyp_631gs_large(benchmark):
     e = benchmark(run_rb3lyp, large_mol, '6-31gs', False, False)
     print('testing rb3lyp 631gs large')
-    assert np.isclose(np.linalg.norm(e), 2563.1171191823423, atol=1e-7)
+    assert np.isclose(np.linalg.norm(e), 2563.1171191823423, atol=1e-7, rtol=1e-16)
 @pytest.mark.benchmark
 def test_rb3lyp_631gs_grad_large(benchmark):
     g = benchmark(run_rb3lyp_grad, large_mol, '6-31gs', False, False)
     print('testing df rb3lyp 631gs grad large')
-    assert np.isclose(np.linalg.norm(g), 0.37778228700247984, atol=1e-5)
+    assert np.isclose(np.linalg.norm(g), 0.37778228700247984, atol=1e-5, rtol=1e-16)
 @pytest.mark.slow
 @pytest.mark.benchmark
 def test_rb3lyp_631gs_hessian_large(benchmark):
     h = benchmark(run_rb3lyp_hessian, large_mol, '6-31gs', False, False)
     print('testing df rb3lyp 631gs hessian large')
-    assert np.isclose(np.linalg.norm(h), 7.920764634100053, atol=1e-4)
+    assert np.isclose(np.linalg.norm(h), 7.920764634100053, atol=1e-4, rtol=1e-16)
 
 ###################
 # Solvent model
@@ -255,17 +260,17 @@ def test_rb3lyp_631gs_hessian_large(benchmark):
 def test_df_rb3lyp_631gs_solvent(benchmark):
     e = benchmark(run_rb3lyp, small_mol, '6-31gs', True, True)
     print('testing df rb3lyp 631gs solvent')
-    assert np.isclose(np.linalg.norm(e), 684.6985561053816, atol=1e-7)
+    assert np.isclose(np.linalg.norm(e), 684.6985561053816, atol=1e-7, rtol=1e-16)
 @pytest.mark.benchmark
 def test_df_rb3lyp_631gs_solvent_grad(benchmark):
     g = benchmark(run_rb3lyp_grad, small_mol, '6-31gs', True, True)
     print('testing df rb3lyp 631gs solvent grad')
-    assert np.isclose(np.linalg.norm(g), 0.16956999476137297, atol=1e-5)
+    assert np.isclose(np.linalg.norm(g), 0.16956999476137297, atol=1e-5, rtol=1e-16)
 @pytest.mark.benchmark
 def test_df_rb3lyp_631gs_solvent_hessian(benchmark):
     h = benchmark(run_rb3lyp_hessian, small_mol, '6-31gs', True, True)
     print('testing df rb3lyp 631gs solvent hessian')
-    assert np.isclose(np.linalg.norm(h), 3.9008165041707294, atol=1e-4)
+    assert np.isclose(np.linalg.norm(h), 3.9008165041707294, atol=1e-4, rtol=1e-16)
 
 # No need to test d3bj generally
 '''
diff --git a/gpu4pyscf/tests/test_benchmark_uks.py b/gpu4pyscf/tests/test_benchmark_uks.py
index 39acd9ba..33d86c99 100644
--- a/gpu4pyscf/tests/test_benchmark_uks.py
+++ b/gpu4pyscf/tests/test_benchmark_uks.py
@@ -64,35 +64,36 @@ def run_ub3lyp_hessian(atom, basis, with_df, with_solvent):
     h = mf.Hessian().kernel()
     return h
 
-
+##########
 # UKS
+##########
 @pytest.mark.benchmark
 def test_df_ub3lyp(benchmark):
     e = benchmark(run_ub3lyp, small_mol, 'def2-tzvpp', True, False)
     print('testing df ub3lyp')
-    assert np.isclose(np.linalg.norm(e), 684.9998712035856, atol=1e-7)
+    assert np.isclose(np.linalg.norm(e), 684.9998712035856, atol=1e-7, rtol=1e-16)
 @pytest.mark.benchmark
 def test_df_ub3lyp_grad(benchmark):
     g = benchmark(run_ub3lyp_grad, small_mol, 'def2-tzvpp', True, False)
     print('testing df ub3lyp grad')
-    assert np.isclose(np.linalg.norm(g), 0.17435842214665462, atol=1e-5)
+    assert np.isclose(np.linalg.norm(g), 0.17435842214665462, atol=1e-5, rtol=1e-16)
 @pytest.mark.benchmark
 def test_df_ub3lyp_hessian(benchmark):
     h = benchmark(run_ub3lyp_hessian, small_mol, 'def2-tzvpp', True, False)
     print('testing df ub3lyp hessian')
-    assert np.isclose(np.linalg.norm(h), 3.7669464279078064, atol=1e-4)
+    assert np.isclose(np.linalg.norm(h), 3.7669464279078064, atol=1e-4, rtol=1e-16)
 @pytest.mark.benchmark
 def test_ub3lyp(benchmark):
     e = benchmark(run_ub3lyp, small_mol, 'def2-tzvpp', False, False)
     print('testing ub3lyp')
-    assert np.isclose(np.linalg.norm(e), 684.9997358509884, atol=1e-7)
+    assert np.isclose(np.linalg.norm(e), 684.9997358509884, atol=1e-7, rtol=1e-16)
 @pytest.mark.benchmark
 def test_ub3lyp_grad(benchmark):
     g = benchmark(run_ub3lyp_grad, small_mol, 'def2-tzvpp', False, False)
     print('testing ub3lyp grad')
-    assert np.isclose(np.linalg.norm(g), 0.17441176110160253, atol=1e-5)
+    assert np.isclose(np.linalg.norm(g), 0.17441176110160253, atol=1e-5, rtol=1e-16)
 @pytest.mark.benchmark
 def test_ub3lyp_hessian(benchmark):
     h = benchmark(run_ub3lyp_hessian, small_mol, 'def2-tzvpp', False, False)
     print('testing ub3lyp hessian')
-    assert np.isclose(np.linalg.norm(h), 3.758916526520172, atol=1e-4)
+    assert np.isclose(np.linalg.norm(h), 3.758916526520172, atol=1e-4, rtol=1e-16)

From b49d34e8ce5fe6b572532bd1682bfb86e4790e11 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Fri, 3 Jan 2025 04:31:48 +0800
Subject: [PATCH 35/49] bugfix

---
 gpu4pyscf/tests/test_benchmark_rks.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/gpu4pyscf/tests/test_benchmark_rks.py b/gpu4pyscf/tests/test_benchmark_rks.py
index b8c593b7..d25f98b0 100644
--- a/gpu4pyscf/tests/test_benchmark_rks.py
+++ b/gpu4pyscf/tests/test_benchmark_rks.py
@@ -87,10 +87,10 @@ def run_rb3lyp_hessian(atom, basis, with_df, with_solvent, disp=None):
     mf.conv_tol = 1e-10
     mf.conv_tol_cpscf = 1e-6
     mf.kernel()
-    h = mf.Hessian()
+    hobj = mf.Hessian()
     if with_df:
-        h.auxbasis_response = 2
-    h.kernel()
+        hobj.auxbasis_response = 2
+    h = hobj.kernel()
     return h
 
 #######
@@ -130,6 +130,7 @@ def test_rb3lyp_grad(benchmark):
 def test_rb3lyp_hessian(benchmark):
     h = benchmark(run_rb3lyp_hessian, small_mol, 'def2-tzvpp', False, False)
     print('testing rb3lyp hessian')
+    print(np.linalg.norm(h) - 3.7588443634477833)
     assert np.isclose(np.linalg.norm(h), 3.7588443634477833, atol=1e-4, rtol=1e-16)
 
 ####################

From ba388eec82973e4722d1afa3e83e00a3101248a0 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Fri, 3 Jan 2025 06:10:51 +0800
Subject: [PATCH 36/49] auxbasis_response

---
 gpu4pyscf/tests/test_benchmark_rks.py | 13 +++++--------
 gpu4pyscf/tests/test_benchmark_uks.py |  5 ++++-
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/gpu4pyscf/tests/test_benchmark_rks.py b/gpu4pyscf/tests/test_benchmark_rks.py
index d25f98b0..ff952e99 100644
--- a/gpu4pyscf/tests/test_benchmark_rks.py
+++ b/gpu4pyscf/tests/test_benchmark_rks.py
@@ -36,7 +36,7 @@
 # pytest test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-save=v100
 
 # 5. compare benchmark results, fail if performance regresses by more than 10%
-# pytest test_benchmark_rks.py -s -v -m "not slow" --benchmark-compare=v100 --benchmark-compare-fail=10%
+# pytest test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-compare=v100 --benchmark-compare-fail=10%
 
 current_folder = os.path.dirname(os.path.abspath(__file__))
 small_mol = os.path.join(current_folder, '020_Vitamin_C.xyz')
@@ -106,12 +106,11 @@ def test_df_rb3lyp_grad(benchmark):
     g = benchmark(run_rb3lyp_grad, small_mol, 'def2-tzvpp', True, False)
     print('testing df rb3lyp grad')
     assert np.isclose(np.linalg.norm(g), 0.17435941081837686, atol=1e-5, rtol=1e-16)
-@pytest.mark.slow
 @pytest.mark.benchmark
 def test_df_rb3lyp_hessian(benchmark):
     h = benchmark(run_rb3lyp_hessian, small_mol, 'def2-tzvpp', True, False)
     print('testing df rb3lyp hessian')
-    assert np.isclose(np.linalg.norm(h), 3.7668761221997764, atol=1e-4, rtol=1e-16)
+    assert np.isclose(np.linalg.norm(h), 3.7587394873290885, atol=1e-4, rtol=1e-16)
 
 ################
 # Direct SCF
@@ -130,7 +129,6 @@ def test_rb3lyp_grad(benchmark):
 def test_rb3lyp_hessian(benchmark):
     h = benchmark(run_rb3lyp_hessian, small_mol, 'def2-tzvpp', False, False)
     print('testing rb3lyp hessian')
-    print(np.linalg.norm(h) - 3.7588443634477833)
     assert np.isclose(np.linalg.norm(h), 3.7588443634477833, atol=1e-4, rtol=1e-16)
 
 ####################
@@ -150,8 +148,7 @@ def test_df_rb3lyp_grad_medium(benchmark):
 def test_df_rb3lyp_hessian_medium(benchmark):
     h = benchmark(run_rb3lyp_hessian, medium_mol, 'def2-tzvpp', True, False)
     print('testing df rb3lyp hessian medium')
-    print(np.linalg.norm(h) - 6.32514169232998)
-    assert np.isclose(np.linalg.norm(h), 6.32514169232998, atol=1e-4, rtol=1e-16)
+    assert np.isclose(np.linalg.norm(h), 6.31265424196621, atol=1e-4, rtol=1e-16)
 
 @pytest.mark.benchmark
 def test_rb3lyp_medium(benchmark):
@@ -232,7 +229,7 @@ def test_df_rb3lyp_631gs_grad(benchmark):
 def test_df_rb3lyp_631gs_hessian(benchmark):
     h = benchmark(run_rb3lyp_hessian, small_mol, '6-31gs', True, False)
     print('testing df rb3lyp 631gs hessian')
-    assert np.isclose(np.linalg.norm(h), 3.908874851569459, atol=1e-4, rtol=1e-16)
+    assert np.isclose(np.linalg.norm(h), 3.9071846157996553, atol=1e-4, rtol=1e-16)
 
 #########################################
 # Small basis set for large molecule
@@ -271,7 +268,7 @@ def test_df_rb3lyp_631gs_solvent_grad(benchmark):
 def test_df_rb3lyp_631gs_solvent_hessian(benchmark):
     h = benchmark(run_rb3lyp_hessian, small_mol, '6-31gs', True, True)
     print('testing df rb3lyp 631gs solvent hessian')
-    assert np.isclose(np.linalg.norm(h), 3.9008165041707294, atol=1e-4, rtol=1e-16)
+    assert np.isclose(np.linalg.norm(h), 3.8991230592666737, atol=1e-4, rtol=1e-16)
 
 # No need to test d3bj generally
 '''
diff --git a/gpu4pyscf/tests/test_benchmark_uks.py b/gpu4pyscf/tests/test_benchmark_uks.py
index 33d86c99..0e5bf311 100644
--- a/gpu4pyscf/tests/test_benchmark_uks.py
+++ b/gpu4pyscf/tests/test_benchmark_uks.py
@@ -61,7 +61,10 @@ def run_ub3lyp_hessian(atom, basis, with_df, with_solvent):
     mf.conv_tol = 1e-10
     mf.conv_tol_cpscf = 1e-6
     mf.kernel()
-    h = mf.Hessian().kernel()
+    hobj = mf.Hessian()
+    if with_df:
+        hobj.auxbasis_response = 2
+    h = hobj.kernel()
     return h
 
 ##########

From 16858f9bc17ea91329bec7801bca9e5797d6a325 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Fri, 3 Jan 2025 22:38:58 +0800
Subject: [PATCH 37/49] add benchmark results

---
 .github/workflows/nightly_build.yml           |   2 +-
 .../{0001_v100.json => v1.3.0_1v100.json}     | 517 ++++++++++--------
 gpu4pyscf/tests/test_benchmark_rks.py         |   4 +-
 3 files changed, 279 insertions(+), 244 deletions(-)
 rename gpu4pyscf/tests/.benchmarks/Linux-CPython-3.9-64bit/{0001_v100.json => v1.3.0_1v100.json} (61%)

diff --git a/.github/workflows/nightly_build.yml b/.github/workflows/nightly_build.yml
index b802fc3e..7be6d721 100644
--- a/.github/workflows/nightly_build.yml
+++ b/.github/workflows/nightly_build.yml
@@ -40,4 +40,4 @@ jobs:
       run: |
         echo $GITHUB_WORKSPACE
         export PYTHONPATH="${PYTHONPATH}:$(pwd)"
-        pytest gpu4pyscf/tests/ -v -m "not slow"
+        pytest gpu4pyscf/tests/ -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=v1.3.0_1v100 --benchmark-storage=gpu4pyscf/tests/
diff --git a/gpu4pyscf/tests/.benchmarks/Linux-CPython-3.9-64bit/0001_v100.json b/gpu4pyscf/tests/.benchmarks/Linux-CPython-3.9-64bit/v1.3.0_1v100.json
similarity index 61%
rename from gpu4pyscf/tests/.benchmarks/Linux-CPython-3.9-64bit/0001_v100.json
rename to gpu4pyscf/tests/.benchmarks/Linux-CPython-3.9-64bit/v1.3.0_1v100.json
index 4101f07e..81cb8ad0 100644
--- a/gpu4pyscf/tests/.benchmarks/Linux-CPython-3.9-64bit/0001_v100.json
+++ b/gpu4pyscf/tests/.benchmarks/Linux-CPython-3.9-64bit/v1.3.0_1v100.json
@@ -1,6 +1,6 @@
 {
     "machine_info": {
-        "node": "mlxlabzskrh20v669fd914-20240723162348-uim1c3-1eg5qx-worker",
+        "node": "mlxlabzskrh20v669fd914-20240723162348-uim1c3-k3ligr-worker",
         "processor": "",
         "machine": "x86_64",
         "python_compiler": "GCC 10.2.1 20210110",
@@ -34,7 +34,7 @@
                 0
             ],
             "hz_actual": [
-                3100000000,
+                3100001000,
                 0
             ],
             "stepping": 7,
@@ -194,9 +194,9 @@
         }
     },
     "commit_info": {
-        "id": "ca79edb0c5d76c928180d5110838a368faa2f1f1",
-        "time": "2025-01-02T06:46:51+00:00",
-        "author_time": "2025-01-02T06:46:51+00:00",
+        "id": "ba388eec82973e4722d1afa3e83e00a3101248a0",
+        "time": "2025-01-03T06:10:51+08:00",
+        "author_time": "2025-01-03T06:10:51+08:00",
         "dirty": true,
         "project": "gpu4pyscf",
         "branch": "benchmark_ci"
@@ -218,22 +218,22 @@
                 "warmup": false
             },
             "stats": {
-                "min": 3.1617951644584537,
-                "max": 3.7628091080114245,
-                "mean": 3.4210989899933337,
-                "stddev": 0.22879307906931964,
+                "min": 2.912467209622264,
+                "max": 3.132086180150509,
+                "mean": 2.9854499623179436,
+                "stddev": 0.08575128159932316,
                 "rounds": 5,
-                "median": 3.411895725876093,
-                "iqr": 0.3097648276016116,
-                "q1": 3.2490662271156907,
-                "q3": 3.5588310547173023,
+                "median": 2.9598704893141985,
+                "iqr": 0.08442470477893949,
+                "q1": 2.934416546020657,
+                "q3": 3.0188412507995963,
                 "iqr_outliers": 0,
-                "stddev_outliers": 2,
-                "outliers": "2;0",
-                "ld15iqr": 3.1617951644584537,
-                "hd15iqr": 3.7628091080114245,
-                "ops": 0.2923037313228836,
-                "total": 17.10549494996667,
+                "stddev_outliers": 1,
+                "outliers": "1;0",
+                "ld15iqr": 2.912467209622264,
+                "hd15iqr": 3.132086180150509,
+                "ops": 0.33495788327451537,
+                "total": 14.927249811589718,
                 "iterations": 1
             }
         },
@@ -253,22 +253,57 @@
                 "warmup": false
             },
             "stats": {
-                "min": 4.783565265126526,
-                "max": 5.297894531860948,
-                "mean": 5.111189672350884,
-                "stddev": 0.21592611330232545,
+                "min": 4.693447925150394,
+                "max": 4.811241740360856,
+                "mean": 4.7545236147940155,
+                "stddev": 0.05376631322845494,
                 "rounds": 5,
-                "median": 5.1955106453970075,
-                "iqr": 0.32867400418035686,
-                "q1": 4.950462192296982,
-                "q3": 5.279136196477339,
+                "median": 4.767030920833349,
+                "iqr": 0.10001574829220772,
+                "q1": 4.700914891902357,
+                "q3": 4.800930640194565,
+                "iqr_outliers": 0,
+                "stddev_outliers": 2,
+                "outliers": "2;0",
+                "ld15iqr": 4.693447925150394,
+                "hd15iqr": 4.811241740360856,
+                "ops": 0.21032601392249553,
+                "total": 23.77261807397008,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_df_rb3lyp_hessian",
+            "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_hessian",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 5,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": false
+            },
+            "stats": {
+                "min": 43.1729771643877,
+                "max": 44.22008949518204,
+                "mean": 43.53323510922492,
+                "stddev": 0.4568445249288835,
+                "rounds": 5,
+                "median": 43.26318317092955,
+                "iqr": 0.6843766365200281,
+                "q1": 43.210667157545686,
+                "q3": 43.895043794065714,
                 "iqr_outliers": 0,
                 "stddev_outliers": 1,
                 "outliers": "1;0",
-                "ld15iqr": 4.783565265126526,
-                "hd15iqr": 5.297894531860948,
-                "ops": 0.1956491666528297,
-                "total": 25.555948361754417,
+                "ld15iqr": 43.1729771643877,
+                "hd15iqr": 44.22008949518204,
+                "ops": 0.02297095535149178,
+                "total": 217.66617554612458,
                 "iterations": 1
             }
         },
@@ -288,22 +323,22 @@
                 "warmup": false
             },
             "stats": {
-                "min": 40.746477760374546,
-                "max": 41.372105130925775,
-                "mean": 41.03762242179364,
-                "stddev": 0.23753417144600847,
+                "min": 40.87554130144417,
+                "max": 41.24961415119469,
+                "mean": 41.05381844490766,
+                "stddev": 0.13780925683914672,
                 "rounds": 5,
-                "median": 41.043180647306144,
-                "iqr": 0.33331693802028894,
-                "q1": 40.858045106288046,
-                "q3": 41.191362044308335,
+                "median": 41.05546211451292,
+                "iqr": 0.17331884242594242,
+                "q1": 40.96216300688684,
+                "q3": 41.13548184931278,
                 "iqr_outliers": 0,
                 "stddev_outliers": 2,
                 "outliers": "2;0",
-                "ld15iqr": 40.746477760374546,
-                "hd15iqr": 41.372105130925775,
-                "ops": 0.02436788344416696,
-                "total": 205.1881121089682,
+                "ld15iqr": 40.87554130144417,
+                "hd15iqr": 41.24961415119469,
+                "ops": 0.02435827014098467,
+                "total": 205.26909222453833,
                 "iterations": 1
             }
         },
@@ -323,22 +358,22 @@
                 "warmup": false
             },
             "stats": {
-                "min": 49.95022037625313,
-                "max": 50.498252051882446,
-                "mean": 50.14596408084035,
-                "stddev": 0.20855077359110574,
+                "min": 49.98093665204942,
+                "max": 50.76574368029833,
+                "mean": 50.31307061091066,
+                "stddev": 0.33630120438295324,
                 "rounds": 5,
-                "median": 50.08650258369744,
-                "iqr": 0.19796669483184814,
-                "q1": 50.0301427282393,
-                "q3": 50.228109423071146,
+                "median": 50.36884331330657,
+                "iqr": 0.5613440982997417,
+                "q1": 49.981349020730704,
+                "q3": 50.542693119030446,
                 "iqr_outliers": 0,
                 "stddev_outliers": 1,
                 "outliers": "1;0",
-                "ld15iqr": 49.95022037625313,
-                "hd15iqr": 50.498252051882446,
-                "ops": 0.01994178431564102,
-                "total": 250.72982040420175,
+                "ld15iqr": 49.98093665204942,
+                "hd15iqr": 50.76574368029833,
+                "ops": 0.01987555098223611,
+                "total": 251.56535305455327,
                 "iterations": 1
             }
         },
@@ -358,22 +393,22 @@
                 "warmup": false
             },
             "stats": {
-                "min": 613.87584313564,
-                "max": 620.2618521982804,
-                "mean": 616.6373227955773,
-                "stddev": 2.3903446370287127,
+                "min": 611.3098333217204,
+                "max": 620.2315559927374,
+                "mean": 614.9859318509698,
+                "stddev": 3.295612103075669,
                 "rounds": 5,
-                "median": 616.8295351918787,
-                "iqr": 2.8947918817866594,
-                "q1": 614.9020847703796,
-                "q3": 617.7968766521662,
+                "median": 614.4812579210848,
+                "iqr": 3.568844774272293,
+                "q1": 612.998380784411,
+                "q3": 616.5672255586833,
                 "iqr_outliers": 0,
                 "stddev_outliers": 2,
                 "outliers": "2;0",
-                "ld15iqr": 613.87584313564,
-                "hd15iqr": 620.2618521982804,
-                "ops": 0.0016216987896003692,
-                "total": 3083.1866139778867,
+                "ld15iqr": 611.3098333217204,
+                "hd15iqr": 620.2315559927374,
+                "ops": 0.001626053456198948,
+                "total": 3074.929659254849,
                 "iterations": 1
             }
         },
@@ -393,22 +428,22 @@
                 "warmup": false
             },
             "stats": {
-                "min": 18.224224156700075,
-                "max": 20.157692234031856,
-                "mean": 19.024707913957535,
-                "stddev": 0.8064738008793728,
+                "min": 18.450319150462747,
+                "max": 19.34435743652284,
+                "mean": 18.962213665619494,
+                "stddev": 0.34090358565345374,
                 "rounds": 5,
-                "median": 19.21132487989962,
-                "iqr": 1.2390491359401494,
-                "q1": 18.252076843054965,
-                "q3": 19.491125978995115,
+                "median": 19.017266055569053,
+                "iqr": 0.4629710176959634,
+                "q1": 18.742521196603775,
+                "q3": 19.20549221429974,
                 "iqr_outliers": 0,
-                "stddev_outliers": 1,
-                "outliers": "1;0",
-                "ld15iqr": 18.224224156700075,
-                "hd15iqr": 20.157692234031856,
-                "ops": 0.052563224861199936,
-                "total": 95.12353956978768,
+                "stddev_outliers": 2,
+                "outliers": "2;0",
+                "ld15iqr": 18.450319150462747,
+                "hd15iqr": 19.34435743652284,
+                "ops": 0.05273645881404165,
+                "total": 94.81106832809746,
                 "iterations": 1
             }
         },
@@ -428,22 +463,22 @@
                 "warmup": false
             },
             "stats": {
-                "min": 28.5077072577551,
-                "max": 32.310115557163954,
-                "mean": 29.75330361928791,
-                "stddev": 1.5559254109717362,
+                "min": 28.927994549274445,
+                "max": 29.407788010314107,
+                "mean": 29.06979787014425,
+                "stddev": 0.19948441635503308,
                 "rounds": 5,
-                "median": 28.967016119509935,
-                "iqr": 1.9220157761592418,
-                "q1": 28.759349649539217,
-                "q3": 30.68136542569846,
+                "median": 28.980533458292484,
+                "iqr": 0.2319285492412746,
+                "q1": 28.93826104514301,
+                "q3": 29.170189594384283,
                 "iqr_outliers": 0,
                 "stddev_outliers": 1,
                 "outliers": "1;0",
-                "ld15iqr": 28.5077072577551,
-                "hd15iqr": 32.310115557163954,
-                "ops": 0.03360971315305434,
-                "total": 148.76651809643954,
+                "ld15iqr": 28.927994549274445,
+                "hd15iqr": 29.407788010314107,
+                "ops": 0.034399963992423795,
+                "total": 145.34898935072124,
                 "iterations": 1
             }
         },
@@ -463,22 +498,22 @@
                 "warmup": false
             },
             "stats": {
-                "min": 603.508519613184,
-                "max": 605.9056743234396,
-                "mean": 605.091381527856,
-                "stddev": 0.9420383756463966,
+                "min": 674.9359990525991,
+                "max": 678.2040371634066,
+                "mean": 676.7355838540941,
+                "stddev": 1.3332352353981456,
                 "rounds": 5,
-                "median": 605.3425533128902,
-                "iqr": 1.0546113743912429,
-                "q1": 604.6620287010446,
-                "q3": 605.7166400754359,
+                "median": 676.6573997996747,
+                "iqr": 2.1692251418717206,
+                "q1": 675.7630731766112,
+                "q3": 677.9322983184829,
                 "iqr_outliers": 0,
-                "stddev_outliers": 1,
-                "outliers": "1;0",
-                "ld15iqr": 603.508519613184,
-                "hd15iqr": 605.9056743234396,
-                "ops": 0.0016526429404348803,
-                "total": 3025.45690763928,
+                "stddev_outliers": 2,
+                "outliers": "2;0",
+                "ld15iqr": 674.9359990525991,
+                "hd15iqr": 678.2040371634066,
+                "ops": 0.0014776820132685715,
+                "total": 3383.6779192704707,
                 "iterations": 1
             }
         },
@@ -498,22 +533,22 @@
                 "warmup": false
             },
             "stats": {
-                "min": 462.8745871502906,
-                "max": 464.5682009514421,
-                "mean": 463.4215483263135,
-                "stddev": 0.6988488955671605,
+                "min": 465.53933845460415,
+                "max": 469.9319954663515,
+                "mean": 467.35331859253347,
+                "stddev": 1.7196427730040629,
                 "rounds": 5,
-                "median": 463.2153818728402,
-                "iqr": 0.9204953673761338,
-                "q1": 462.88869020040147,
-                "q3": 463.8091855677776,
+                "median": 467.1924539171159,
+                "iqr": 2.4731017132289708,
+                "q1": 465.9859178052284,
+                "q3": 468.45901951845735,
                 "iqr_outliers": 0,
-                "stddev_outliers": 1,
-                "outliers": "1;0",
-                "ld15iqr": 462.8745871502906,
-                "hd15iqr": 464.5682009514421,
-                "ops": 0.002157862541376389,
-                "total": 2317.1077416315675,
+                "stddev_outliers": 2,
+                "outliers": "2;0",
+                "ld15iqr": 465.53933845460415,
+                "hd15iqr": 469.9319954663515,
+                "ops": 0.00213970878180895,
+                "total": 2336.7665929626673,
                 "iterations": 1
             }
         },
@@ -533,22 +568,22 @@
                 "warmup": false
             },
             "stats": {
-                "min": 552.8754901243374,
-                "max": 554.911680762656,
-                "mean": 554.3076192354783,
-                "stddev": 0.8161092352116484,
+                "min": 559.7645257860422,
+                "max": 562.2628617957234,
+                "mean": 560.7982054453344,
+                "stddev": 0.9539619574856013,
                 "rounds": 5,
-                "median": 554.5314849242568,
-                "iqr": 0.6415546571370214,
-                "q1": 554.1099091696087,
-                "q3": 554.7514638267457,
-                "iqr_outliers": 1,
-                "stddev_outliers": 1,
-                "outliers": "1;1",
-                "ld15iqr": 554.5213821846992,
-                "hd15iqr": 554.911680762656,
-                "ops": 0.0018040524165611094,
-                "total": 2771.5380961773917,
+                "median": 560.6089988369495,
+                "iqr": 1.2622483419254422,
+                "q1": 560.1302895797417,
+                "q3": 561.3925379216671,
+                "iqr_outliers": 0,
+                "stddev_outliers": 2,
+                "outliers": "2;0",
+                "ld15iqr": 559.7645257860422,
+                "hd15iqr": 562.2628617957234,
+                "ops": 0.0017831726105576463,
+                "total": 2803.9910272266716,
                 "iterations": 1
             }
         },
@@ -568,22 +603,22 @@
                 "warmup": false
             },
             "stats": {
-                "min": 1.5329652335494757,
-                "max": 1.6051436373963952,
-                "mean": 1.5614585245028139,
-                "stddev": 0.027036901841682143,
+                "min": 2.3893800172954798,
+                "max": 4.065618982538581,
+                "mean": 3.419478364661336,
+                "stddev": 0.6287030173245606,
                 "rounds": 5,
-                "median": 1.5542930895462632,
-                "iqr": 0.02967151813209057,
-                "q1": 1.5455118480604142,
-                "q3": 1.5751833661925048,
+                "median": 3.485863795503974,
+                "iqr": 0.6505572367459536,
+                "q1": 3.1652946420945227,
+                "q3": 3.8158518788404763,
                 "iqr_outliers": 0,
                 "stddev_outliers": 2,
                 "outliers": "2;0",
-                "ld15iqr": 1.5329652335494757,
-                "hd15iqr": 1.6051436373963952,
-                "ops": 0.6404268728933491,
-                "total": 7.807292622514069,
+                "ld15iqr": 2.3893800172954798,
+                "hd15iqr": 4.065618982538581,
+                "ops": 0.29244226556147246,
+                "total": 17.09739182330668,
                 "iterations": 1
             }
         },
@@ -603,22 +638,22 @@
                 "warmup": false
             },
             "stats": {
-                "min": 2.280210480093956,
-                "max": 2.3804853092879057,
-                "mean": 2.317926473915577,
-                "stddev": 0.04595680285671692,
+                "min": 3.086430249735713,
+                "max": 3.4464661311358213,
+                "mean": 3.21519818790257,
+                "stddev": 0.14631235080321897,
                 "rounds": 5,
-                "median": 2.290554977953434,
-                "iqr": 0.07659115269780159,
-                "q1": 2.283684498164803,
-                "q3": 2.3602756508626044,
+                "median": 3.1610366478562355,
+                "iqr": 0.20356103358790278,
+                "q1": 3.108103247359395,
+                "q3": 3.311664280947298,
                 "iqr_outliers": 0,
                 "stddev_outliers": 1,
                 "outliers": "1;0",
-                "ld15iqr": 2.280210480093956,
-                "hd15iqr": 2.3804853092879057,
-                "ops": 0.43142006929613325,
-                "total": 11.589632369577885,
+                "ld15iqr": 3.086430249735713,
+                "hd15iqr": 3.4464661311358213,
+                "ops": 0.31102281774186635,
+                "total": 16.07599093951285,
                 "iterations": 1
             }
         },
@@ -638,22 +673,22 @@
                 "warmup": false
             },
             "stats": {
-                "min": 13.266008610837162,
-                "max": 13.778700362890959,
-                "mean": 13.425486170500516,
-                "stddev": 0.20500983387954833,
+                "min": 15.165010405704379,
+                "max": 15.594494730234146,
+                "mean": 15.329469257220627,
+                "stddev": 0.16923297471060986,
                 "rounds": 5,
-                "median": 13.3493886096403,
-                "iqr": 0.20474978047423065,
-                "q1": 13.303213707404211,
-                "q3": 13.507963487878442,
+                "median": 15.25765424221754,
+                "iqr": 0.22211500210687518,
+                "q1": 15.220200731419027,
+                "q3": 15.442315733525902,
                 "iqr_outliers": 0,
                 "stddev_outliers": 1,
                 "outliers": "1;0",
-                "ld15iqr": 13.266008610837162,
-                "hd15iqr": 13.778700362890959,
-                "ops": 0.07448519832356425,
-                "total": 67.12743085250258,
+                "ld15iqr": 15.165010405704379,
+                "hd15iqr": 15.594494730234146,
+                "ops": 0.0652338305534597,
+                "total": 76.64734628610313,
                 "iterations": 1
             }
         },
@@ -673,22 +708,22 @@
                 "warmup": false
             },
             "stats": {
-                "min": 54.99403030797839,
-                "max": 55.44687832519412,
-                "mean": 55.280799315497276,
-                "stddev": 0.17365676139080558,
+                "min": 55.83294254913926,
+                "max": 59.17145283520222,
+                "mean": 57.62203652448952,
+                "stddev": 1.374501327462731,
                 "rounds": 5,
-                "median": 55.29263620171696,
-                "iqr": 0.18626192840747535,
-                "q1": 55.21340201841667,
-                "q3": 55.39966394682415,
+                "median": 57.98267317190766,
+                "iqr": 2.264786566141993,
+                "q1": 56.414323914796114,
+                "q3": 58.67911048093811,
                 "iqr_outliers": 0,
-                "stddev_outliers": 1,
-                "outliers": "1;0",
-                "ld15iqr": 54.99403030797839,
-                "hd15iqr": 55.44687832519412,
-                "ops": 0.018089463473435388,
-                "total": 276.4039965774864,
+                "stddev_outliers": 2,
+                "outliers": "2;0",
+                "ld15iqr": 55.83294254913926,
+                "hd15iqr": 59.17145283520222,
+                "ops": 0.017354471662503586,
+                "total": 288.1101826224476,
                 "iterations": 1
             }
         },
@@ -708,22 +743,22 @@
                 "warmup": false
             },
             "stats": {
-                "min": 68.62203936092556,
-                "max": 69.5449710758403,
-                "mean": 69.09264576677234,
-                "stddev": 0.4051404972610001,
+                "min": 72.57952445559204,
+                "max": 73.64580446854234,
+                "mean": 73.0582833636552,
+                "stddev": 0.48907908462094757,
                 "rounds": 5,
-                "median": 68.97573095746338,
-                "iqr": 0.7140946059953421,
-                "q1": 68.78401179146022,
-                "q3": 69.49810639745556,
+                "median": 72.9562251791358,
+                "iqr": 0.9134543887339532,
+                "q1": 72.61263743927702,
+                "q3": 73.52609182801098,
                 "iqr_outliers": 0,
-                "stddev_outliers": 2,
-                "outliers": "2;0",
-                "ld15iqr": 68.62203936092556,
-                "hd15iqr": 69.5449710758403,
-                "ops": 0.014473320407725865,
-                "total": 345.46322883386165,
+                "stddev_outliers": 1,
+                "outliers": "1;0",
+                "ld15iqr": 72.57952445559204,
+                "hd15iqr": 73.64580446854234,
+                "ops": 0.01368770184514733,
+                "total": 365.29141681827605,
                 "iterations": 1
             }
         },
@@ -743,22 +778,22 @@
                 "warmup": false
             },
             "stats": {
-                "min": 2.3869710564613342,
-                "max": 2.530611244030297,
-                "mean": 2.4419715087860823,
-                "stddev": 0.05630153012020871,
+                "min": 3.96594556607306,
+                "max": 4.621823711320758,
+                "mean": 4.35534807741642,
+                "stddev": 0.24887039802683908,
                 "rounds": 5,
-                "median": 2.43410103302449,
-                "iqr": 0.07564499671570957,
-                "q1": 2.398690618108958,
-                "q3": 2.4743356148246676,
+                "median": 4.413319645449519,
+                "iqr": 0.31258321227505803,
+                "q1": 4.208048852626234,
+                "q3": 4.520632064901292,
                 "iqr_outliers": 0,
-                "stddev_outliers": 1,
-                "outliers": "1;0",
-                "ld15iqr": 2.3869710564613342,
-                "hd15iqr": 2.530611244030297,
-                "ops": 0.40950518726449253,
-                "total": 12.209857543930411,
+                "stddev_outliers": 2,
+                "outliers": "2;0",
+                "ld15iqr": 3.96594556607306,
+                "hd15iqr": 4.621823711320758,
+                "ops": 0.22960277392873663,
+                "total": 21.7767403870821,
                 "iterations": 1
             }
         },
@@ -778,22 +813,22 @@
                 "warmup": false
             },
             "stats": {
-                "min": 2.973248357884586,
-                "max": 3.1569794192910194,
-                "mean": 3.076459738984704,
-                "stddev": 0.06885617804924284,
+                "min": 4.794365357607603,
+                "max": 5.184939783066511,
+                "mean": 4.969379325211048,
+                "stddev": 0.15483049255542325,
                 "rounds": 5,
-                "median": 3.072229014709592,
-                "iqr": 0.08629889693111181,
-                "q1": 3.040569737320766,
-                "q3": 3.1268686342518777,
+                "median": 4.9499497301876545,
+                "iqr": 0.24070796929299831,
+                "q1": 4.846174816135317,
+                "q3": 5.086882785428315,
                 "iqr_outliers": 0,
                 "stddev_outliers": 2,
                 "outliers": "2;0",
-                "ld15iqr": 2.973248357884586,
-                "hd15iqr": 3.1569794192910194,
-                "ops": 0.3250489474404826,
-                "total": 15.38229869492352,
+                "ld15iqr": 4.794365357607603,
+                "hd15iqr": 5.184939783066511,
+                "ops": 0.20123237421758508,
+                "total": 24.84689662605524,
                 "iterations": 1
             }
         },
@@ -813,26 +848,26 @@
                 "warmup": false
             },
             "stats": {
-                "min": 107.97044172231108,
-                "max": 109.67866003140807,
-                "mean": 108.94072148483247,
-                "stddev": 0.7142319621034929,
+                "min": 152.7211031857878,
+                "max": 161.58804737962782,
+                "mean": 158.70457714907826,
+                "stddev": 3.529131682005075,
                 "rounds": 5,
-                "median": 108.94882048200816,
-                "iqr": 1.1987205250188708,
-                "q1": 108.39640940236859,
-                "q3": 109.59512992738746,
+                "median": 159.99357100203633,
+                "iqr": 3.860361324157566,
+                "q1": 157.0660247253254,
+                "q3": 160.92638604948297,
                 "iqr_outliers": 0,
-                "stddev_outliers": 2,
-                "outliers": "2;0",
-                "ld15iqr": 107.97044172231108,
-                "hd15iqr": 109.67866003140807,
-                "ops": 0.009179303995514913,
-                "total": 544.7036074241623,
+                "stddev_outliers": 1,
+                "outliers": "1;0",
+                "ld15iqr": 152.7211031857878,
+                "hd15iqr": 161.58804737962782,
+                "ops": 0.006301015496614541,
+                "total": 793.5228857453912,
                 "iterations": 1
             }
         }
     ],
-    "datetime": "2025-01-02T13:26:31.464476+00:00",
+    "datetime": "2025-01-03T13:58:40.332127+00:00",
     "version": "5.1.0"
 }
\ No newline at end of file
diff --git a/gpu4pyscf/tests/test_benchmark_rks.py b/gpu4pyscf/tests/test_benchmark_rks.py
index ff952e99..cf812475 100644
--- a/gpu4pyscf/tests/test_benchmark_rks.py
+++ b/gpu4pyscf/tests/test_benchmark_rks.py
@@ -33,10 +33,10 @@
 # pytest test_benchmark_rks.py -v
 
 # 4. save benchmark results
-# pytest test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-save=v100
+# pytest test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-save=1v100
 
 # 5. compare benchmark results, fail if performance regresses by more than 10%
-# pytest test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-compare=v100 --benchmark-compare-fail=10%
+# pytest test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=1v100 --benchmark-storage=./benchmark_results/
 
 current_folder = os.path.dirname(os.path.abspath(__file__))
 small_mol = os.path.join(current_folder, '020_Vitamin_C.xyz')

From 7bed7c83bb4cdbe4b4768d2551b83a85427e2546 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Sat, 4 Jan 2025 07:57:13 +0800
Subject: [PATCH 38/49] split nightly benchmark

---
 .github/workflows/nightly_build.yml   |  9 +++++++--
 .gitignore                            |  1 +
 gpu4pyscf/tests/test_benchmark_rks.py |  8 +++-----
 gpu4pyscf/tests/test_benchmark_uks.py | 19 +++++++++----------
 4 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/.github/workflows/nightly_build.yml b/.github/workflows/nightly_build.yml
index 7be6d721..012ca12f 100644
--- a/.github/workflows/nightly_build.yml
+++ b/.github/workflows/nightly_build.yml
@@ -36,8 +36,13 @@ jobs:
         export PATH=${CUDA_HOME}/bin:${PATH}
         export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:$LD_LIBRARY_PATH
         sh build.sh
-    - name: Smoke Test
+    - name: Test RKS
       run: |
         echo $GITHUB_WORKSPACE
         export PYTHONPATH="${PYTHONPATH}:$(pwd)"
-        pytest gpu4pyscf/tests/ -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=v1.3.0_1v100 --benchmark-storage=gpu4pyscf/tests/
+        pytest gpu4pyscf/tests/test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=v1.3.0_1v100 --benchmark-storage=gpu4pyscf/tests/benchmark_results/
+    - name: Test UKS
+      run: |
+        echo $GITHUB_WORKSPACE
+        export PYTHONPATH="${PYTHONPATH}:$(pwd)"
+        pytest gpu4pyscf/tests/test_benchmark_uks.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=v1.3.0_1v100 --benchmark-storage=gpu4pyscf/tests/benchmark_results/
diff --git a/.gitignore b/.gitignore
index 427ffd8a..b8dd78e9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,7 @@
 **/build
 **/launch_logs
 **/deps
+**/.benchmarks
 core
 **tmp*
 *.egg-info/
diff --git a/gpu4pyscf/tests/test_benchmark_rks.py b/gpu4pyscf/tests/test_benchmark_rks.py
index cf812475..5027da8f 100644
--- a/gpu4pyscf/tests/test_benchmark_rks.py
+++ b/gpu4pyscf/tests/test_benchmark_rks.py
@@ -13,12 +13,10 @@
 # limitations under the License.
 
 import os
-import unittest
 import numpy as np
 import pyscf
 import pytest
-import cupy
-from gpu4pyscf.dft import rks, uks
+from gpu4pyscf.dft import rks
 
 # Any task taking more than 1000s will be marked as 'slow'
 
@@ -33,10 +31,10 @@
 # pytest test_benchmark_rks.py -v
 
 # 4. save benchmark results
-# pytest test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-save=1v100
+# pytest test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-save=v1.3.0_rks_1v100
 
 # 5. compare benchmark results, fail if performance regresses by more than 10%
-# pytest test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=1v100 --benchmark-storage=./benchmark_results/
+# pytest test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=1v100 --benchmark-storage=benchmark_results/
 
 current_folder = os.path.dirname(os.path.abspath(__file__))
 small_mol = os.path.join(current_folder, '020_Vitamin_C.xyz')
diff --git a/gpu4pyscf/tests/test_benchmark_uks.py b/gpu4pyscf/tests/test_benchmark_uks.py
index 0e5bf311..2c9d8ce6 100644
--- a/gpu4pyscf/tests/test_benchmark_uks.py
+++ b/gpu4pyscf/tests/test_benchmark_uks.py
@@ -13,12 +13,10 @@
 # limitations under the License.
 
 import os
-import unittest
 import numpy as np
 import pyscf
 import pytest
-import cupy
-from gpu4pyscf.dft import rks, uks
+from gpu4pyscf.dft import uks
 
 current_folder = os.path.dirname(os.path.abspath(__file__))
 small_mol = os.path.join(current_folder, '020_Vitamin_C.xyz')
@@ -84,19 +82,20 @@ def test_df_ub3lyp_grad(benchmark):
 def test_df_ub3lyp_hessian(benchmark):
     h = benchmark(run_ub3lyp_hessian, small_mol, 'def2-tzvpp', True, False)
     print('testing df ub3lyp hessian')
-    assert np.isclose(np.linalg.norm(h), 3.7669464279078064, atol=1e-4, rtol=1e-16)
+    assert np.isclose(np.linalg.norm(h), 3.758810345806532, atol=1e-4, rtol=1e-16)
 @pytest.mark.benchmark
 def test_ub3lyp(benchmark):
-    e = benchmark(run_ub3lyp, small_mol, 'def2-tzvpp', False, False)
+    e = benchmark(run_ub3lyp, small_mol, '6-31gs', False, False)
     print('testing ub3lyp')
-    assert np.isclose(np.linalg.norm(e), 684.9997358509884, atol=1e-7, rtol=1e-16)
+    assert np.isclose(np.linalg.norm(e), 684.6643858622429, atol=1e-7, rtol=1e-16)
 @pytest.mark.benchmark
 def test_ub3lyp_grad(benchmark):
-    g = benchmark(run_ub3lyp_grad, small_mol, 'def2-tzvpp', False, False)
+    g = benchmark(run_ub3lyp_grad, small_mol, '6-31gs', False, False)
     print('testing ub3lyp grad')
-    assert np.isclose(np.linalg.norm(g), 0.17441176110160253, atol=1e-5, rtol=1e-16)
+    assert np.isclose(np.linalg.norm(g), 0.17540045665419984, atol=1e-5, rtol=1e-16)
 @pytest.mark.benchmark
 def test_ub3lyp_hessian(benchmark):
-    h = benchmark(run_ub3lyp_hessian, small_mol, 'def2-tzvpp', False, False)
+    h = benchmark(run_ub3lyp_hessian, small_mol, '6-31gs', False, False)
     print('testing ub3lyp hessian')
-    assert np.isclose(np.linalg.norm(h), 3.758916526520172, atol=1e-4, rtol=1e-16)
+    print(np.linalg.norm(h), np.linalg.norm(h) - 3.758916526520172)
+    assert np.isclose(np.linalg.norm(h), 3.907289414559395, atol=1e-4, rtol=1e-16)

From 651708e26ba01ada3e94292d908ccb752aed920e Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Sat, 4 Jan 2025 08:12:11 +0000
Subject: [PATCH 39/49] optimize df.hessian memory

---
 .github/workflows/nightly_build.yml   |  9 +++-
 gpu4pyscf/df/hessian/jk.py            | 12 +++---
 gpu4pyscf/df/hessian/rhf.py           | 61 ++++++++++++---------------
 gpu4pyscf/df/hessian/uhf.py           | 41 ++++++++----------
 gpu4pyscf/df/int3c2e.py               |  2 +-
 gpu4pyscf/hessian/rks.py              |  2 +-
 gpu4pyscf/tests/test_benchmark_rks.py |  4 +-
 7 files changed, 62 insertions(+), 69 deletions(-)

diff --git a/.github/workflows/nightly_build.yml b/.github/workflows/nightly_build.yml
index 7be6d721..012ca12f 100644
--- a/.github/workflows/nightly_build.yml
+++ b/.github/workflows/nightly_build.yml
@@ -36,8 +36,13 @@ jobs:
         export PATH=${CUDA_HOME}/bin:${PATH}
         export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:$LD_LIBRARY_PATH
         sh build.sh
-    - name: Smoke Test
+    - name: Test RKS
       run: |
         echo $GITHUB_WORKSPACE
         export PYTHONPATH="${PYTHONPATH}:$(pwd)"
-        pytest gpu4pyscf/tests/ -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=v1.3.0_1v100 --benchmark-storage=gpu4pyscf/tests/
+        pytest gpu4pyscf/tests/test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=v1.3.0_1v100 --benchmark-storage=gpu4pyscf/tests/benchmark_results/
+    - name: Test UKS
+      run: |
+        echo $GITHUB_WORKSPACE
+        export PYTHONPATH="${PYTHONPATH}:$(pwd)"
+        pytest gpu4pyscf/tests/test_benchmark_uks.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=v1.3.0_1v100 --benchmark-storage=gpu4pyscf/tests/benchmark_results/
diff --git a/gpu4pyscf/df/hessian/jk.py b/gpu4pyscf/df/hessian/jk.py
index 6b08cee5..7859e97b 100644
--- a/gpu4pyscf/df/hessian/jk.py
+++ b/gpu4pyscf/df/hessian/jk.py
@@ -22,7 +22,7 @@
 from gpu4pyscf.scf.int4c2e import libgint
 from gpu4pyscf.hessian.jk import _ao2mo
 from gpu4pyscf.lib import logger
-from gpu4pyscf.lib.cupy_helper import contract, cart2sph, reduce_to_device
+from gpu4pyscf.lib.cupy_helper import contract, cart2sph, reduce_to_device, get_avail_mem, release_gpu_stack
 from gpu4pyscf.__config__ import _streams, _num_devices
 
 NROOT_ON_GPU = 7
@@ -310,7 +310,7 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo,
             if with_k:
                 rhok_tmp = contract('por,ir->poi', rhok[k0:k1], orbo[i0:i1])
                 rhok_tmp = contract('poi,jo->pji', rhok_tmp, orbo[j0:j1])
-
+            
             # (20|0), (0|0)(0|00)
             int3c_blk = _get_int3c2e_ipip_slice('ipip1', intopt, cp_ij_id, aux_id, omega=omega)
             if with_j:
@@ -320,7 +320,7 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo,
                 hj_ipip1[:,i0:i1] += contract('xji,ij->xi', tmp, dm0[i0:i1,j0:j1])
             if with_k:
                 hk_ipip1[:,i0:i1] += contract('xpji,pji->xi', int3c_blk, rhok_tmp)
-            int3c_blk = None
+            int3c_blk = tmp = None
 
             # (11|0), (0|0)(0|00) without response of RI basis
             int3c_blk = _get_int3c2e_ipip_slice('ipvip1', intopt, cp_ij_id, aux_id, omega=omega)
@@ -331,7 +331,7 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo,
                 hj_ipvip1[:,i0:i1,j0:j1] += contract('xji,ij->xij', tmp, dm0[i0:i1,j0:j1])
             if with_k:
                 hk_ipvip1[:,i0:i1,j0:j1] += contract('xpji,pji->xij', int3c_blk, rhok_tmp)
-            int3c_blk = None
+            int3c_blk = tmp = None
 
             if auxbasis_response < 1:
                 continue
@@ -343,7 +343,7 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo,
                 hj_ip1ip2[:,i0:i1,k0:k1] += contract('xpi,p->xip', tmp, rhoj[k0:k1])
             if with_k:
                 hk_ip1ip2[:,i0:i1,k0:k1] += contract('xpji,pji->xip', int3c_blk, rhok_tmp)
-            int3c_blk = None
+            int3c_blk = tmp = None
 
             if auxbasis_response < 2:
                 continue
@@ -355,7 +355,7 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo,
                 hj_ipip2[:,k0:k1] += contract('xp,p->xp', tmp, rhoj[k0:k1])
             if with_k:
                 hk_ipip2[:,k0:k1] += contract('xpji,pji->xp', int3c_blk, rhok_tmp)
-            int3c_blk = None
+            int3c_blk = tmp = None
         auxslices = intopt.auxmol.aoslice_by_atom()
         aoslices = intopt.mol.aoslice_by_atom()
         ao2atom = int3c2e.get_ao2atom(intopt, aoslices)
diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py
index 47faa476..9dfee665 100644
--- a/gpu4pyscf/df/hessian/rhf.py
+++ b/gpu4pyscf/df/hessian/rhf.py
@@ -122,22 +122,10 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls
     int2c = intopt.sort_orbitals(int2c, aux_axis=[0,1])
     solve_j2c = _gen_metric_solver(int2c)
 
-    int2c_ip1 = cupy.asarray(int2c_ip1, order='C')
-    int2c_ip1 = intopt.sort_orbitals(int2c_ip1, aux_axis=[1,2])
-    if with_j:
-        hj_ao_ao = cupy.zeros([nao,nao,3,3])
-    if with_k:
-        hk_ao_ao = cupy.zeros([nao,nao,3,3])
-    if hessobj.auxbasis_response:
-        if with_j:
-            hj_ao_aux = cupy.zeros([nao,naux,3,3])
-        if with_k:
-            hk_ao_aux = cupy.zeros([nao,naux,3,3])
-
     #  int3c contributions
     wj, wk_P__ = int3c2e.get_int3c2e_jk(mol, auxmol, dm0_tag, omega=omega)
     rhoj0_P = rhok0_P__ = None
-    
+
     if with_j:
         rhoj0_P = solve_j2c(wj)
         wj = None
@@ -146,6 +134,11 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls
         wk_P__ = None
     t1 = log.timer_debug1('intermediate variables with int3c2e', *t1)
 
+    hj_ipip, hk_ipip = jk.get_int3c2e_hjk(intopt, rhoj0_P, rhok0_P__, dm0_tag,
+                                          with_j=with_j, with_k=with_k, omega=omega,
+                                          auxbasis_response=hessobj.auxbasis_response)
+    t1 = log.timer_debug1('intermediate variables with int3c2e_ipip', *t1)
+
     # int3c_ip2 contributions
     wj_ip2, wk_ip2_P__ = int3c2e.get_int3c2e_ip2_wjk(intopt, dm0_tag, omega=omega)
     t1 = log.timer_debug1('intermediate variables with int3c2e_ip2', *t1)
@@ -153,17 +146,22 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls
     #  int3c_ip1 contributions
     wj1_P, wk1_Pko = int3c2e.get_int3c2e_ip1_wjk(intopt, dm0_tag, omega=omega)
     t1 = log.timer_debug1('intermediate variables with int3c2e_ip1', *t1)
+    
+    cupy.get_default_memory_pool().free_all_blocks()
+    release_gpu_stack()
 
     #rhoj1_P = contract('pq,pix->qix', int2c_inv, wj1_P)
+    int2c_ip1 = cupy.asarray(int2c_ip1, order='C')
+    int2c_ip1 = intopt.sort_orbitals(int2c_ip1, aux_axis=[1,2])
+
     if with_j:
         rhoj1_P = solve_j2c(wj1_P)
-
-        hj_ao_ao += 4.0*contract('pix,pjy->ijxy', rhoj1_P, wj1_P)   # (10|0)(0|0)(0|01)
+        hj_ao_ao = 4.0*contract('pix,pjy->ijxy', rhoj1_P, wj1_P)   # (10|0)(0|0)(0|01)
         wj1_P = None
         if hessobj.auxbasis_response:
             wj0_01 = contract('ypq,q->yp', int2c_ip1, rhoj0_P)
             wj1_01 = contract('yqp,pix->iqxy', int2c_ip1, rhoj1_P)
-            hj_ao_aux += contract('pix,py->ipxy', rhoj1_P, wj_ip2)   # (10|0)(1|00)
+            hj_ao_aux = contract('pix,py->ipxy', rhoj1_P, wj_ip2)   # (10|0)(1|00)
             hj_ao_aux -= contract('pix,yp->ipxy', rhoj1_P, wj0_01)   # (10|0)(1|0)(0|00)
             hj_ao_aux -= contract('q,iqxy->iqxy', rhoj0_P, wj1_01)   # (10|0)(0|1)(0|00)
             wj1_01 = None
@@ -174,11 +172,12 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls
         mem_avail = get_avail_mem()
         nocc = mocc.shape[1]
         slice_size = naux*nocc*9   # largest slice of intermediate variables
-        blksize = int(mem_avail*0.2/8/slice_size/ALIGNED) * ALIGNED
+        blksize = int(mem_avail*0.4/8/slice_size/ALIGNED) * ALIGNED
         log.debug(f'GPU Memory {mem_avail/GB:.1f} GB available, {blksize} aux AOs per block')
         if blksize < ALIGNED:
             raise RuntimeError('Not enough memory for intermediate variables')
-
+        if hessobj.auxbasis_response:
+            hk_ao_aux = cupy.zeros([nao,naux,3,3])
         for i0, i1 in lib.prange(0,nao,blksize):
             #wk1_Pko_islice = cupy.asarray(wk1_Pko[:,i0:i1])
             wk1_Pko_islice = copy_array(wk1_Pko[:,i0:i1])
@@ -187,6 +186,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls
             rhok1_Pko = solve_j2c(wk1_Pko_islice)
             wk1_Pko_islice = None
             if hessobj.auxbasis_response:
+                hk_ao_aux = cupy.zeros([nao,naux,3,3])
                 # (10|0)(1|00)
                 wk_ip2_Ipo = contract('porx,io->pirx', wk_ip2_P__, mocc_2[i0:i1])
                 hk_ao_aux[i0:i1] += contract('piox,pioy->ipxy', rhok1_Pko, wk_ip2_Ipo)
@@ -205,6 +205,11 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls
         rhok1_Pko = None
         t1 = log.timer_debug1('contract int3c2e_ip1 with int2c_ip1', *t1)
         
+        rho2c_0 = contract('pij,qji->pq', rhok0_P__, rhok0_P__)
+        rho2c_10 = contract('rijx,qij->rqx', wk_ip2_P__, rhok0_P__)
+        rho2c_11 = contract('pijx,qijy->pqxy', wk_ip2_P__, wk_ip2_P__)
+        rhok0_P__ = wk_ip2_P__ = None
+
         w, v = cupy.linalg.eigh(int2c)
         idx = w > LINEAR_DEP_THR
         cd_low = (v[:,idx] / cupy.sqrt(w[idx]))
@@ -223,17 +228,14 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls
                 copy_array(wk1_tmp, rhok1_Pko[:,i0:i1])
             wk1_tmp = None
         cd_low = None
-        hk_ao_ao += _hk_ip1_ip1(rhok1_Pko, dm0, mocc_2)
+        hk_ao_ao = _hk_ip1_ip1(rhok1_Pko, dm0, mocc_2)
     wk1_Pko = rhok1_Pko = None
+    solve_j2c = None
     t1 = log.timer_debug1('contract int3c2e_ip1 with int3c2e_ip1', *t1)
 
-    hj_ipip, hk_ipip = jk.get_int3c2e_hjk(intopt, rhoj0_P, rhok0_P__, dm0_tag,
-                                          with_j=with_j, with_k=with_k, omega=omega,
-                                          auxbasis_response=hessobj.auxbasis_response)
-    t1 = log.timer_debug1('intermediate variables with int3c2e_ipip', *t1)
-
     # int2c contributions
     if hessobj.auxbasis_response > 1:
+        cupy.get_default_memory_pool().free_all_blocks()
         if omega and omega > 1e-10:
             with auxmol.with_range_coulomb(omega):
                 int2c_ipip1 = auxmol.intor('int2c2e_ipip1', aosym='s1')
@@ -248,7 +250,6 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls
             rhoj2c_P = contract('xpq,q->xp', int2c_ipip1, rhoj0_P)
             hj_aux_diag = -(rhoj0_P*rhoj2c_P).T.reshape(-1,3,3)
         if with_k:
-            rho2c_0 = contract('pij,qji->pq', rhok0_P__, rhok0_P__)
             hk_aux_diag = -.5 * contract('pq,xpq->px', rho2c_0, int2c_ipip1).reshape(-1,3,3)
         int2c_ipip1 = None
 
@@ -266,10 +267,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls
         t1 = log.timer_debug1('intermediate variables with int2c_*', *t1)
         int2c_ip1ip2 = None
 
-    cupy.get_default_memory_pool().free_all_blocks()
-    release_gpu_stack()
-    # aux-aux pair
-    if hessobj.auxbasis_response > 1:
+        # aux-aux pair
         int2c_inv = pinv(int2c, lindep=LINEAR_DEP_THR)
         int2c_ip1_inv = contract('yqp,pr->yqr', int2c_ip1, int2c_inv)
         if with_j:
@@ -290,11 +288,6 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls
             wj0_01 = rhoj0_01 = None
 
         if with_k:
-            rho2c_10 = contract('rijx,qij->rqx', wk_ip2_P__, rhok0_P__)
-            rhok0_P__ = None
-
-            rho2c_11 = contract('pijx,qijy->pqxy', wk_ip2_P__, wk_ip2_P__)
-            wk_ip2_P__ = None
             hk_aux_aux += .5 * contract('pqxy,pq->pqxy', rho2c_11, int2c_inv)      # (00|1)(1|00)
             rho2c_11 = None
 
diff --git a/gpu4pyscf/df/hessian/uhf.py b/gpu4pyscf/df/hessian/uhf.py
index b77015f6..5e94a248 100644
--- a/gpu4pyscf/df/hessian/uhf.py
+++ b/gpu4pyscf/df/hessian/uhf.py
@@ -116,16 +116,6 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     int2c_ip1 = cupy.asarray(int2c_ip1, order='C')
     int2c_ip1 = intopt.sort_orbitals(int2c_ip1, aux_axis=[1,2])
 
-    if with_j:
-        hj_ao_ao = cupy.zeros([nao,nao,3,3])
-    if with_k:
-        hk_ao_ao = cupy.zeros([nao,nao,3,3])
-    if hessobj.auxbasis_response:
-        if with_j:
-            hj_ao_aux = cupy.zeros([nao,naux,3,3])
-        if with_k:
-            hk_ao_aux = cupy.zeros([nao,naux,3,3])
-
     #  int3c contributions
     wja, wka_P__ = int3c2e.get_int3c2e_jk(mol, auxmol, dm0a_tag, omega=omega)
     wjb, wkb_P__ = int3c2e.get_int3c2e_jk(mol, auxmol, dm0b_tag, omega=omega)
@@ -153,12 +143,12 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     if with_j:
         wj1_P = wj1a_P + wj1b_P
         rhoj1_P = solve_j2c(wj1_P)
-        hj_ao_ao += 4.0*contract('pix,pjy->ijxy', rhoj1_P, wj1_P)   # (10|0)(0|0)(0|01)
+        hj_ao_ao = 4.0*contract('pix,pjy->ijxy', rhoj1_P, wj1_P)   # (10|0)(0|0)(0|01)
         wj1_P = None
         if hessobj.auxbasis_response:
             wj0_01 = contract('ypq,q->yp', int2c_ip1, rhoj0_P)
             wj1_01 = contract('yqp,pix->iqxy', int2c_ip1, rhoj1_P)
-            hj_ao_aux += contract('pix,py->ipxy', rhoj1_P, wj_ip2)   # (10|0)(1|00)
+            hj_ao_aux = contract('pix,py->ipxy', rhoj1_P, wj_ip2)   # (10|0)(1|00)
             hj_ao_aux -= contract('pix,yp->ipxy', rhoj1_P, wj0_01)   # (10|0)(1|0)(0|00)
             hj_ao_aux -= contract('q,iqxy->iqxy', rhoj0_P, wj1_01)   # (10|0)(0|1)(0|00)
             wj1_01 = None
@@ -173,7 +163,9 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
         log.debug(f'GPU Memory {mem_avail/GB:.1f} GB available, block size {blksize}')
         if blksize < ALIGNED:
             raise RuntimeError('Not enough memory for intermediate variables')
-
+        hk_ao_ao = cupy.zeros([nao,nao,3,3])
+        if hessobj.auxbasis_response:
+            hk_ao_aux = cupy.zeros([nao,naux,3,3])
         for i0, i1 in lib.prange(0,nao,blksize):
             #wk1a_Pko_islice = cupy.asarray(wk1a_Pko[:,i0:i1])
             #wk1b_Pko_islice = cupy.asarray(wk1b_Pko[:,i0:i1])
@@ -244,6 +236,19 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
         hk_ipip = 2.0*(hka_ipip + hkb_ipip)
     t1 = log.timer_debug1('intermediate variables with int3c2e_ipip', *t1)
 
+    if hessobj.auxbasis_response > 1:
+        if with_k:
+            rho2c_0 = contract('pij,qji->pq', rhok0a_P__, rhok0a_P__)
+            rho2c_0+= contract('pij,qji->pq', rhok0b_P__, rhok0b_P__)
+        
+            rho2c_10 = contract('rijx,qij->rqx', wka_ip2_P__, rhok0a_P__)
+            rho2c_10+= contract('rijx,qij->rqx', wkb_ip2_P__, rhok0b_P__)
+            rhok0a_P__ = rhok0b_P__ = None
+
+            rho2c_11 = contract('pijx,qijy->pqxy', wka_ip2_P__, wka_ip2_P__)
+            rho2c_11+= contract('pijx,qijy->pqxy', wkb_ip2_P__, wkb_ip2_P__)
+            wka_ip2_P__ = wkb_ip2_P__ = None
+
     # int2c contributions
     if hessobj.auxbasis_response > 1:
         if omega and omega > 1e-10:
@@ -259,8 +264,6 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
             # p,xp->px
             hj_aux_diag = -(rhoj0_P*rhoj2c_P).T.reshape(-1,3,3)
         if with_k:
-            rho2c_0 = contract('pij,qji->pq', rhok0a_P__, rhok0a_P__)
-            rho2c_0+= contract('pij,qji->pq', rhok0b_P__, rhok0b_P__)
             hk_aux_diag = -contract('pq,xpq->px', rho2c_0, int2c_ipip1).reshape(-1,3,3)
         int2c_ipip1 = None
 
@@ -301,14 +304,6 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
             wj0_01 = rhoj0_01 = None
 
         if with_k:
-            rho2c_10 = contract('rijx,qij->rqx', wka_ip2_P__, rhok0a_P__)
-            rho2c_10+= contract('rijx,qij->rqx', wkb_ip2_P__, rhok0b_P__)
-            rhok0a_P__ = rhok0b_P__ = None
-
-
-            rho2c_11 = contract('pijx,qijy->pqxy', wka_ip2_P__, wka_ip2_P__)
-            rho2c_11+= contract('pijx,qijy->pqxy', wkb_ip2_P__, wkb_ip2_P__)
-            wka_ip2_P__ = wkb_ip2_P__ = None
             hk_aux_aux += .5 * contract('pqxy,pq->pqxy', rho2c_11, int2c_inv)      # (00|1)(1|00)
             rho2c_11 = None
 
diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py
index 8bfa8a81..e77e30ca 100644
--- a/gpu4pyscf/df/int3c2e.py
+++ b/gpu4pyscf/df/int3c2e.py
@@ -831,7 +831,7 @@ def get_int3c2e_jk(mol, auxmol, dm0_tag, with_k=True, omega=None):
         for device_id in range(_num_devices):
             future = executor.submit(
                 _int3c2e_jk_task, intopt, task_list[device_id],
-                dm0_tag.get(), orbo.get(), device_id=device_id, omega=omega)
+                dm0_tag, orbo, device_id=device_id, omega=omega)
             futures.append(future)
 
     rhoj_total = []
diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py
index 912748c7..a2ee9da7 100644
--- a/gpu4pyscf/hessian/rks.py
+++ b/gpu4pyscf/hessian/rks.py
@@ -767,7 +767,7 @@ def _nr_rks_fxc_mo_task(ni, mol, grids, xc_code, fxc, mo_coeff, mo1, mocc,
 
             t1 = log.timer_debug2('integration', *t1)
             ao = rho1 = None
-        t0 = log.timer_debug1('vxc', *t0)
+        t0 = log.timer_debug1(f'vxc on Device {device_id} ', *t0)
         if xctype != 'LDA':
             transpose_sum(vmat)
         vmat = jk._ao2mo(vmat, mocc, mo_coeff)
diff --git a/gpu4pyscf/tests/test_benchmark_rks.py b/gpu4pyscf/tests/test_benchmark_rks.py
index cf812475..554dbba1 100644
--- a/gpu4pyscf/tests/test_benchmark_rks.py
+++ b/gpu4pyscf/tests/test_benchmark_rks.py
@@ -33,10 +33,10 @@
 # pytest test_benchmark_rks.py -v
 
 # 4. save benchmark results
-# pytest test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-save=1v100
+# pytest test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-save=v1.3.0_rks_1v100
 
 # 5. compare benchmark results, fail if performance regresses by more than 10%
-# pytest test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=1v100 --benchmark-storage=./benchmark_results/
+# pytest test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=1v100 --benchmark-storage=benchmark_results/
 
 current_folder = os.path.dirname(os.path.abspath(__file__))
 small_mol = os.path.join(current_folder, '020_Vitamin_C.xyz')

From e85afa66692c94942a74efac9b72382abc412923 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Sun, 5 Jan 2025 07:57:00 +0000
Subject: [PATCH 40/49] small fixes

---
 .../cupy_helper/benchmark_memory_copy.py      | 16 ++++++
 examples/dft_driver.py                        |  4 +-
 gpu4pyscf/df/grad/rhf.py                      |  3 +-
 gpu4pyscf/df/grad/uhf.py                      | 54 +------------------
 gpu4pyscf/df/hessian/rhf.py                   |  7 ++-
 gpu4pyscf/dft/numint.py                       | 29 +++++++---
 6 files changed, 45 insertions(+), 68 deletions(-)

diff --git a/benchmarks/cupy_helper/benchmark_memory_copy.py b/benchmarks/cupy_helper/benchmark_memory_copy.py
index d10f97ac..8455f3f0 100644
--- a/benchmarks/cupy_helper/benchmark_memory_copy.py
+++ b/benchmarks/cupy_helper/benchmark_memory_copy.py
@@ -123,3 +123,19 @@ def cupy_asarray_contiguous(a, b):
 print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")
 
 assert np.linalg.norm(a.get() - b.get()) < 1e-10
+
+
+print('----------- Benchmark reduction across devices ------ ')
+from gpu4pyscf.lib.cupy_helper import reduce_to_device
+_num_devices = cp.cuda.runtime.getDeviceCount()
+a_dist = []
+for device_id in range(_num_devices):
+    with cp.cuda.Device(device_id):
+        a = cp.random.rand(512,512,512)
+        a_dist.append(a)
+
+perf_cupy = profiler.benchmark(reduce_to_device, (a_dist,), n_repeat=20, n_warmup=3)
+t_kernel = perf_cupy.gpu_times.mean()
+bandwidth = a_dist[0].nbytes * _num_devices / t_kernel / 1e9
+print('Cupy set contiguous array', t_kernel)
+print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")
diff --git a/examples/dft_driver.py b/examples/dft_driver.py
index 0be7f410..0c8cea48 100644
--- a/examples/dft_driver.py
+++ b/examples/dft_driver.py
@@ -34,10 +34,10 @@
     basis=bas,
     max_memory=32000)
 # set verbose >= 6 for debugging timer
-mol.verbose = 6
+mol.verbose = 4
 
 mf_df = dft.RKS(mol, xc=args.xc).density_fit(auxbasis=args.auxbasis)
-mf_df.verbose = 6
+mf_df.verbose = 4
 
 if args.solvent:
     mf_df = mf_df.PCM()
diff --git a/gpu4pyscf/df/grad/rhf.py b/gpu4pyscf/df/grad/rhf.py
index ea0537ed..17816bc8 100644
--- a/gpu4pyscf/df/grad/rhf.py
+++ b/gpu4pyscf/df/grad/rhf.py
@@ -151,7 +151,8 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
         cart2sph = intopt.cart2sph
         orbo_cart = cart2sph @ orbo
         dm_cart = cart2sph @ dm @ cart2sph.T
-    
+        
+    with_df._cderi = None # release GPU memory
     vj, vk, vjaux, vkaux = get_grad_vjk(with_df, mol, auxmol, rhoj_cart, dm_cart, rhok_cart, orbo_cart,
                                         with_j=with_j, with_k=with_k, omega=omega)
     # NOTE: vj and vk are still in cartesian
diff --git a/gpu4pyscf/df/grad/uhf.py b/gpu4pyscf/df/grad/uhf.py
index 42107967..53acd7e0 100644
--- a/gpu4pyscf/df/grad/uhf.py
+++ b/gpu4pyscf/df/grad/uhf.py
@@ -165,59 +165,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True,
         orbo_cart = orbo
     dm = orbo = None
 
-    """
-    vj = vk = rhoj_tmp = rhok_tmp = None
-    vjaux = vkaux = None
-
-    naux_cart = intopt._sorted_auxmol.nao
-    if with_j:
-        vj = cupy.zeros((3,nao_cart), order='C')
-        vjaux = cupy.zeros((3,naux_cart))
-    if with_k:
-        vk = cupy.zeros((3,nao_cart), order='C')
-        vkaux = cupy.zeros((3,naux_cart))
-    cupy.get_default_memory_pool().free_all_blocks()
-    t1 = log.init_timer()
-    for cp_kl_id in range(len(intopt.aux_log_qs)):
-        k0, k1 = intopt.cart_aux_loc[cp_kl_id], intopt.cart_aux_loc[cp_kl_id+1]
-        assert k1-k0 <= block_size
-        if with_j:
-            rhoj_tmp = rhoj_cart[k0:k1]
-        if with_k:
-            rhok_tmp = contract('por,ir->pio', rhok_cart[k0:k1], orbo_cart)
-            rhok_tmp = contract('pio,jo->pji', rhok_tmp, orbo_cart)
-        '''
-        if(rhoj_tmp.flags['C_CONTIGUOUS'] == False):
-            rhoj_tmp = rhoj_tmp.astype(cupy.float64, order='C')
-
-        if(rhok_tmp.flags['C_CONTIGUOUS'] == False):
-            rhok_tmp = rhok_tmp.astype(cupy.float64, order='C')
-        '''
-        '''
-        # outcore implementation
-        int3c2e.get_int3c2e_ip_slice(intopt, cp_kl_id, 1, out=buf)
-        size = 3*(k1-k0)*nao_cart*nao_cart
-        int3c_ip = buf[:size].reshape([3,k1-k0,nao_cart,nao_cart], order='C')
-        rhoj_tmp = contract('xpji,ij->xip', int3c_ip, dm_cart)
-        vj += contract('xip,p->xi', rhoj_tmp, rhoj_cart[k0:k1])
-        vk += contract('pji,xpji->xi', rhok_tmp, int3c_ip)
-
-        int3c2e.get_int3c2e_ip_slice(intopt, cp_kl_id, 2, out=buf)
-        rhoj_tmp = contract('xpji,ji->xp', int3c_ip, dm_cart)
-        vjaux[:, k0:k1] = contract('xp,p->xp', rhoj_tmp, rhoj_cart[k0:k1])
-        vkaux[:, k0:k1] = contract('xpji,pji->xp', int3c_ip, rhok_tmp)
-        '''
-        vj_tmp, vk_tmp = int3c2e.get_int3c2e_ip_jk(intopt, cp_kl_id, 'ip1', rhoj_tmp, rhok_tmp, dm_cart, omega=omega)
-        if with_j: vj += vj_tmp
-        if with_k: vk += vk_tmp
-
-        vj_tmp, vk_tmp = int3c2e.get_int3c2e_ip_jk(intopt, cp_kl_id, 'ip2', rhoj_tmp, rhok_tmp, dm_cart, omega=omega)
-        if with_j: vjaux[:, k0:k1] = vj_tmp
-        if with_k: vkaux[:, k0:k1] = vk_tmp
-
-        rhoj_tmp = rhok_tmp = vj_tmp = vk_tmp = None
-        t1 = log.timer_debug1(f'calculate {cp_kl_id:3d} / {len(intopt.aux_log_qs):3d}, {k1-k0:3d} slices', *t1)
-    """
+    with_df._cderi = None  # release GPU memory
     vj, vk, vjaux, vkaux = get_grad_vjk(with_df, mol, auxmol, rhoj_cart, dm_cart, rhok_cart, orbo_cart,
                                         with_j=with_j, with_k=with_k, omega=omega)
     
diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py
index 9dfee665..aa0c5047 100644
--- a/gpu4pyscf/df/hessian/rhf.py
+++ b/gpu4pyscf/df/hessian/rhf.py
@@ -121,7 +121,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls
     int2c = cupy.asarray(int2c, order='C')
     int2c = intopt.sort_orbitals(int2c, aux_axis=[0,1])
     solve_j2c = _gen_metric_solver(int2c)
-
+    
     #  int3c contributions
     wj, wk_P__ = int3c2e.get_int3c2e_jk(mol, auxmol, dm0_tag, omega=omega)
     rhoj0_P = rhok0_P__ = None
@@ -172,10 +172,9 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls
         mem_avail = get_avail_mem()
         nocc = mocc.shape[1]
         slice_size = naux*nocc*9   # largest slice of intermediate variables
-        blksize = int(mem_avail*0.4/8/slice_size/ALIGNED) * ALIGNED
+        blksize = int(mem_avail*0.2/8/slice_size)
         log.debug(f'GPU Memory {mem_avail/GB:.1f} GB available, {blksize} aux AOs per block')
-        if blksize < ALIGNED:
-            raise RuntimeError('Not enough memory for intermediate variables')
+        assert blksize > 0
         if hessobj.auxbasis_response:
             hk_ao_aux = cupy.zeros([nao,naux,3,3])
         for i0, i1 in lib.prange(0,nao,blksize):
diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py
index a40c976f..0c533bcf 100644
--- a/gpu4pyscf/dft/numint.py
+++ b/gpu4pyscf/dft/numint.py
@@ -414,9 +414,11 @@ def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
 
         ngrids_glob = grids.coords.shape[0]
         ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices
+        ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE
         grid_start = device_id * ngrids_per_device
-        grid_end = (device_id + 1) * ngrids_per_device
+        grid_end = min((device_id + 1) * ngrids_per_device, ngrids_glob)
         ngrids_local = grid_end - grid_start
+        log.debug(f"{ngrids_local} on Device {device_id}")
 
         weights = cupy.empty([ngrids_local])
         if xctype == 'LDA':
@@ -425,7 +427,7 @@ def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
             rho_tot = cupy.empty([nset,4,ngrids_local])
         else:
             rho_tot = cupy.empty([nset,5,ngrids_local])
-
+        
         p0 = p1 = 0
         for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv,
                                                      max_memory=None,
@@ -433,8 +435,10 @@ def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
             p1 = p0 + weight.size
             weights[p0:p1] = weight
             for i in range(nset):
-                if mo_coeff is None:
-                    rho_tot[i,:,p0:p1] = eval_rho(_sorted_mol, ao_mask, dms[i][idx[:,None],idx],
+                # If AO is sparse enough, use density matrix to calculate rho
+                if mo_coeff is None or len(idx) < mo_occ.sum():
+                    dms_mask = dms[i][idx[:,None],idx]
+                    rho_tot[i,:,p0:p1] = eval_rho(_sorted_mol, ao_mask, dms_mask,
                                                 xctype=xctype, hermi=hermi, with_lapl=with_lapl)
                 else:
                     assert hermi == 1
@@ -443,7 +447,7 @@ def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
                                                 None, xctype, with_lapl)
             p0 = p1
         t0 = log.timer_debug1(f'eval rho on Device {device_id}', *t0)
-
+        
         # libxc calls are still running on default stream
         nelec = cupy.zeros(nset)
         excsum = cupy.zeros(nset)
@@ -814,8 +818,11 @@ def _nr_uks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
 
         ngrids_glob = grids.coords.shape[0]
         ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices
+        ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE
         grid_start = device_id * ngrids_per_device
-        grid_end = (device_id + 1) * ngrids_per_device
+        grid_end = min((device_id + 1) * ngrids_per_device, ngrids_glob)
+        ngrids_local = grid_end - grid_start
+        log.debug(f"{ngrids_local} on Device {device_id}")
 
         for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv,
                                                      max_memory=None,
@@ -1016,8 +1023,11 @@ def _nr_rks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff,
 
         ngrids_glob = grids.coords.shape[0]
         ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices
+        ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE
         grid_start = device_id * ngrids_per_device
-        grid_end = (device_id + 1) * ngrids_per_device
+        grid_end = min((device_id + 1) * ngrids_per_device, ngrids_glob)
+        ngrids_local = grid_end - grid_start
+        log.debug(f"{ngrids_local} on Device {device_id}")
 
         p0 = p1 = grid_start
         t1 = t0 = log.init_timer()
@@ -1165,8 +1175,11 @@ def _nr_uks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff,
 
         ngrids_glob = grids.coords.shape[0]
         ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices
+        ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE
         grid_start = device_id * ngrids_per_device
-        grid_end = (device_id + 1) * ngrids_per_device
+        grid_end = min((device_id + 1) * ngrids_per_device, ngrids_glob)
+        ngrids_local = grid_end - grid_start
+        log.debug(f"{ngrids_local} on Device {device_id}")
 
         p0 = p1 = grid_start
         t1 = t0 = log.init_timer()

From 1ed8e5eccf5aaae256f4ad591db81f6689dd8a13 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Sun, 5 Jan 2025 23:21:10 +0000
Subject: [PATCH 41/49] bugfix in df.hessian

---
 gpu4pyscf/df/hessian/rhf.py           |  1 -
 gpu4pyscf/dft/numint.py               |  2 +-
 gpu4pyscf/tests/test_benchmark_rks.py | 52 +++++++++++++--------------
 gpu4pyscf/tests/test_benchmark_uks.py | 13 ++++---
 4 files changed, 33 insertions(+), 35 deletions(-)

diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py
index aa0c5047..2eab8ef5 100644
--- a/gpu4pyscf/df/hessian/rhf.py
+++ b/gpu4pyscf/df/hessian/rhf.py
@@ -185,7 +185,6 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls
             rhok1_Pko = solve_j2c(wk1_Pko_islice)
             wk1_Pko_islice = None
             if hessobj.auxbasis_response:
-                hk_ao_aux = cupy.zeros([nao,naux,3,3])
                 # (10|0)(1|00)
                 wk_ip2_Ipo = contract('porx,io->pirx', wk_ip2_P__, mocc_2[i0:i1])
                 hk_ao_aux[i0:i1] += contract('piox,pioy->ipxy', rhok1_Pko, wk_ip2_Ipo)
diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py
index 0c533bcf..872ce750 100644
--- a/gpu4pyscf/dft/numint.py
+++ b/gpu4pyscf/dft/numint.py
@@ -436,7 +436,7 @@ def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
             weights[p0:p1] = weight
             for i in range(nset):
                 # If AO is sparse enough, use density matrix to calculate rho
-                if mo_coeff is None or len(idx) < mo_occ.sum():
+                if mo_coeff is None:
                     dms_mask = dms[i][idx[:,None],idx]
                     rho_tot[i,:,p0:p1] = eval_rho(_sorted_mol, ao_mask, dms_mask,
                                                 xctype=xctype, hermi=hermi, with_lapl=with_lapl)
diff --git a/gpu4pyscf/tests/test_benchmark_rks.py b/gpu4pyscf/tests/test_benchmark_rks.py
index 5027da8f..2afa6e70 100644
--- a/gpu4pyscf/tests/test_benchmark_rks.py
+++ b/gpu4pyscf/tests/test_benchmark_rks.py
@@ -94,17 +94,17 @@ def run_rb3lyp_hessian(atom, basis, with_df, with_solvent, disp=None):
 #######
 # DF
 #######
-@pytest.mark.benchmark
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
 def test_df_rb3lyp(benchmark):
     e = benchmark(run_rb3lyp, small_mol, 'def2-tzvpp', True, False)
     print('testing df rb3lyp')
     assert np.isclose(np.linalg.norm(e), 684.9998712035579, atol=1e-7, rtol=1e-16)
-@pytest.mark.benchmark
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
 def test_df_rb3lyp_grad(benchmark):
     g = benchmark(run_rb3lyp_grad, small_mol, 'def2-tzvpp', True, False)
     print('testing df rb3lyp grad')
     assert np.isclose(np.linalg.norm(g), 0.17435941081837686, atol=1e-5, rtol=1e-16)
-@pytest.mark.benchmark
+@pytest.mark.benchmark(warmup=False, min_rounds=1)
 def test_df_rb3lyp_hessian(benchmark):
     h = benchmark(run_rb3lyp_hessian, small_mol, 'def2-tzvpp', True, False)
     print('testing df rb3lyp hessian')
@@ -113,17 +113,17 @@ def test_df_rb3lyp_hessian(benchmark):
 ################
 # Direct SCF
 ################
-@pytest.mark.benchmark
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
 def test_rb3lyp(benchmark):
     e = benchmark(run_rb3lyp, small_mol, 'def2-tzvpp', False, False)
     print('testing rb3lyp')
     assert np.isclose(np.linalg.norm(e), 684.999735850967, atol=1e-7, rtol=1e-16)
-@pytest.mark.benchmark
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
 def test_rb3lyp_grad(benchmark):
     g = benchmark(run_rb3lyp_grad, small_mol, 'def2-tzvpp', False, False)
     print('testing rb3lyp grad')
     assert np.isclose(np.linalg.norm(g), 0.1744127474130983, atol=1e-5, rtol=1e-16)
-@pytest.mark.benchmark
+@pytest.mark.benchmark(warmup=False, min_rounds=1)
 def test_rb3lyp_hessian(benchmark):
     h = benchmark(run_rb3lyp_hessian, small_mol, 'def2-tzvpp', False, False)
     print('testing rb3lyp hessian')
@@ -132,34 +132,34 @@ def test_rb3lyp_hessian(benchmark):
 ####################
 # Medium molecule
 ####################
-@pytest.mark.benchmark
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
 def test_df_rb3lyp_medium(benchmark):
     e = benchmark(run_rb3lyp, medium_mol, 'def2-tzvpp', True, False)
     print('testing df rb3lyp medium')
     assert np.isclose(np.linalg.norm(e), 1138.371390377773, atol=1e-7, rtol=1e-16)
-@pytest.mark.benchmark
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
 def test_df_rb3lyp_grad_medium(benchmark):
     g = benchmark(run_rb3lyp_grad, medium_mol, 'def2-tzvpp', True, False)
     print('testing df rb3lyp grad medium')
     assert np.isclose(np.linalg.norm(g), 0.26010545073602614, atol=1e-5, rtol=1e-16)
-@pytest.mark.benchmark
+@pytest.mark.benchmark(warmup=False, min_rounds=1)
 def test_df_rb3lyp_hessian_medium(benchmark):
     h = benchmark(run_rb3lyp_hessian, medium_mol, 'def2-tzvpp', True, False)
     print('testing df rb3lyp hessian medium')
     assert np.isclose(np.linalg.norm(h), 6.31265424196621, atol=1e-4, rtol=1e-16)
 
-@pytest.mark.benchmark
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
 def test_rb3lyp_medium(benchmark):
     e = benchmark(run_rb3lyp, medium_mol, 'def2-tzvpp', False, False)
     print('testing rb3lyp medium')
     assert np.isclose(np.linalg.norm(e), 1138.3710752128077, atol=1e-7, rtol=1e-16)
-@pytest.mark.benchmark
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
 def test_rb3lyp_grad_medium(benchmark):
     g = benchmark(run_rb3lyp_grad, medium_mol, 'def2-tzvpp', False, False)
     print('testing rb3lyp grad medium')
     assert np.isclose(np.linalg.norm(g), 0.2601443836937988, atol=1e-5, rtol=1e-16)
 @pytest.mark.slow
-@pytest.mark.benchmark
+@pytest.mark.benchmark(warmup=False, min_rounds=1)
 def test_rb3lyp_hessian_medium(benchmark):
     h = benchmark(run_rb3lyp_hessian, medium_mol, 'def2-tzvpp', False, False)
     print('testing rb3lyp hessian medium')
@@ -169,32 +169,32 @@ def test_rb3lyp_hessian_medium(benchmark):
 # large molecule
 ####################
 @pytest.mark.high_memory
-@pytest.mark.benchmark
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
 def test_df_rb3lyp_large(benchmark):
     e = benchmark(run_rb3lyp, large_mol, 'def2-tzvpp', True, False)
     print('testing df rb3lyp large')
     assert np.isclose(np.linalg.norm(e), 2564.198712152175, atol=1e-7, rtol=1e-16)
 @pytest.mark.high_memory
-@pytest.mark.benchmark
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
 def test_df_rb3lyp_grad_large(benchmark):
     g = benchmark(run_rb3lyp_grad, large_mol, 'def2-tzvpp', True, False)
     print('testing df rb3lyp grad large')
     assert np.isclose(np.linalg.norm(g), 0.3784358687859323, atol=1e-5, rtol=1e-16)
 @pytest.mark.high_memory
 @pytest.mark.slow
-@pytest.mark.benchmark
+@pytest.mark.benchmark(warmup=False, min_rounds=1)
 def test_df_rb3lyp_hessian_large(benchmark):
     h = benchmark(run_rb3lyp_hessian, large_mol, 'def2-tzvpp', True, False)
     print('testing df rb3lyp hessian large')
     assert np.isclose(np.linalg.norm(h), 7.583208736873523, atol=1e-4, rtol=1e-16)
 @pytest.mark.slow
-@pytest.mark.benchmark
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
 def test_rb3lyp_large(benchmark):
     e = benchmark(run_rb3lyp, large_mol, 'def2-tzvpp', False, False)
     print('testing rb3lyp large')
     assert np.isclose(np.linalg.norm(e), 2564.198099576358, atol=1e-7, rtol=1e-16)
 @pytest.mark.slow
-@pytest.mark.benchmark
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
 def test_rb3lyp_grad_large(benchmark):
     g = benchmark(run_rb3lyp_grad, large_mol, 'def2-tzvpp', False, False)
     print('testing rb3lyp grad large')
@@ -213,17 +213,17 @@ def test_rb3lyp_hessian_large(benchmark):
 #####################
 # Small basis set
 #####################
-@pytest.mark.benchmark
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
 def test_df_rb3lyp_631gs(benchmark):
     e = benchmark(run_rb3lyp, small_mol, '6-31gs', True, False)
     print('testing df rb3lyp 631gs')
     assert np.isclose(np.linalg.norm(e), 684.6646008642876, atol=1e-7, rtol=1e-16)
-@pytest.mark.benchmark
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
 def test_df_rb3lyp_631gs_grad(benchmark):
     g = benchmark(run_rb3lyp_grad, small_mol, '6-31gs', True, False)
     print('testing df rb3lyp 631gs grad')
     assert np.isclose(np.linalg.norm(g), 0.17530687343398219, atol=1e-5, rtol=1e-16)
-@pytest.mark.benchmark
+@pytest.mark.benchmark(warmup=False, min_rounds=1)
 def test_df_rb3lyp_631gs_hessian(benchmark):
     h = benchmark(run_rb3lyp_hessian, small_mol, '6-31gs', True, False)
     print('testing df rb3lyp 631gs hessian')
@@ -232,18 +232,18 @@ def test_df_rb3lyp_631gs_hessian(benchmark):
 #########################################
 # Small basis set for large molecule
 #########################################
-@pytest.mark.benchmark
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
 def test_rb3lyp_631gs_large(benchmark):
     e = benchmark(run_rb3lyp, large_mol, '6-31gs', False, False)
     print('testing rb3lyp 631gs large')
     assert np.isclose(np.linalg.norm(e), 2563.1171191823423, atol=1e-7, rtol=1e-16)
-@pytest.mark.benchmark
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
 def test_rb3lyp_631gs_grad_large(benchmark):
     g = benchmark(run_rb3lyp_grad, large_mol, '6-31gs', False, False)
     print('testing df rb3lyp 631gs grad large')
     assert np.isclose(np.linalg.norm(g), 0.37778228700247984, atol=1e-5, rtol=1e-16)
 @pytest.mark.slow
-@pytest.mark.benchmark
+@pytest.mark.benchmark(warmup=False, min_rounds=1)
 def test_rb3lyp_631gs_hessian_large(benchmark):
     h = benchmark(run_rb3lyp_hessian, large_mol, '6-31gs', False, False)
     print('testing df rb3lyp 631gs hessian large')
@@ -252,17 +252,17 @@ def test_rb3lyp_631gs_hessian_large(benchmark):
 ###################
 # Solvent model
 ###################
-@pytest.mark.benchmark
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
 def test_df_rb3lyp_631gs_solvent(benchmark):
     e = benchmark(run_rb3lyp, small_mol, '6-31gs', True, True)
     print('testing df rb3lyp 631gs solvent')
     assert np.isclose(np.linalg.norm(e), 684.6985561053816, atol=1e-7, rtol=1e-16)
-@pytest.mark.benchmark
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
 def test_df_rb3lyp_631gs_solvent_grad(benchmark):
     g = benchmark(run_rb3lyp_grad, small_mol, '6-31gs', True, True)
     print('testing df rb3lyp 631gs solvent grad')
     assert np.isclose(np.linalg.norm(g), 0.16956999476137297, atol=1e-5, rtol=1e-16)
-@pytest.mark.benchmark
+@pytest.mark.benchmark(warmup=False, min_rounds=1)
 def test_df_rb3lyp_631gs_solvent_hessian(benchmark):
     h = benchmark(run_rb3lyp_hessian, small_mol, '6-31gs', True, True)
     print('testing df rb3lyp 631gs solvent hessian')
diff --git a/gpu4pyscf/tests/test_benchmark_uks.py b/gpu4pyscf/tests/test_benchmark_uks.py
index 2c9d8ce6..5962c08e 100644
--- a/gpu4pyscf/tests/test_benchmark_uks.py
+++ b/gpu4pyscf/tests/test_benchmark_uks.py
@@ -68,34 +68,33 @@ def run_ub3lyp_hessian(atom, basis, with_df, with_solvent):
 ##########
 # UKS
 ##########
-@pytest.mark.benchmark
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=6)
 def test_df_ub3lyp(benchmark):
     e = benchmark(run_ub3lyp, small_mol, 'def2-tzvpp', True, False)
     print('testing df ub3lyp')
     assert np.isclose(np.linalg.norm(e), 684.9998712035856, atol=1e-7, rtol=1e-16)
-@pytest.mark.benchmark
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=6)
 def test_df_ub3lyp_grad(benchmark):
     g = benchmark(run_ub3lyp_grad, small_mol, 'def2-tzvpp', True, False)
     print('testing df ub3lyp grad')
     assert np.isclose(np.linalg.norm(g), 0.17435842214665462, atol=1e-5, rtol=1e-16)
-@pytest.mark.benchmark
+@pytest.mark.benchmark(warmup=False, min_rounds=1)
 def test_df_ub3lyp_hessian(benchmark):
     h = benchmark(run_ub3lyp_hessian, small_mol, 'def2-tzvpp', True, False)
     print('testing df ub3lyp hessian')
     assert np.isclose(np.linalg.norm(h), 3.758810345806532, atol=1e-4, rtol=1e-16)
-@pytest.mark.benchmark
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=6)
 def test_ub3lyp(benchmark):
     e = benchmark(run_ub3lyp, small_mol, '6-31gs', False, False)
     print('testing ub3lyp')
     assert np.isclose(np.linalg.norm(e), 684.6643858622429, atol=1e-7, rtol=1e-16)
-@pytest.mark.benchmark
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=6)
 def test_ub3lyp_grad(benchmark):
     g = benchmark(run_ub3lyp_grad, small_mol, '6-31gs', False, False)
     print('testing ub3lyp grad')
     assert np.isclose(np.linalg.norm(g), 0.17540045665419984, atol=1e-5, rtol=1e-16)
-@pytest.mark.benchmark
+@pytest.mark.benchmark(warmup=False, min_rounds=1)
 def test_ub3lyp_hessian(benchmark):
     h = benchmark(run_ub3lyp_hessian, small_mol, '6-31gs', False, False)
     print('testing ub3lyp hessian')
-    print(np.linalg.norm(h), np.linalg.norm(h) - 3.758916526520172)
     assert np.isclose(np.linalg.norm(h), 3.907289414559395, atol=1e-4, rtol=1e-16)

From 7a00f6ad12e770b999fae2b6cdf0bcca8f741ed0 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Mon, 6 Jan 2025 01:38:03 +0000
Subject: [PATCH 42/49] bugfix

---
 gpu4pyscf/dft/numint.py               | 8 ++++----
 gpu4pyscf/tests/test_benchmark_rks.py | 4 ++--
 gpu4pyscf/tests/test_benchmark_uks.py | 8 ++++----
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py
index 872ce750..17498c7d 100644
--- a/gpu4pyscf/dft/numint.py
+++ b/gpu4pyscf/dft/numint.py
@@ -415,7 +415,7 @@ def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
         ngrids_glob = grids.coords.shape[0]
         ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices
         ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE
-        grid_start = device_id * ngrids_per_device
+        grid_start = min(device_id * ngrids_per_device, ngrids_glob)
         grid_end = min((device_id + 1) * ngrids_per_device, ngrids_glob)
         ngrids_local = grid_end - grid_start
         log.debug(f"{ngrids_local} on Device {device_id}")
@@ -819,7 +819,7 @@ def _nr_uks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
         ngrids_glob = grids.coords.shape[0]
         ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices
         ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE
-        grid_start = device_id * ngrids_per_device
+        grid_start = min(device_id * ngrids_per_device, ngrids_glob)
         grid_end = min((device_id + 1) * ngrids_per_device, ngrids_glob)
         ngrids_local = grid_end - grid_start
         log.debug(f"{ngrids_local} on Device {device_id}")
@@ -1024,7 +1024,7 @@ def _nr_rks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff,
         ngrids_glob = grids.coords.shape[0]
         ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices
         ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE
-        grid_start = device_id * ngrids_per_device
+        grid_start = min(device_id * ngrids_per_device, ngrids_glob)
         grid_end = min((device_id + 1) * ngrids_per_device, ngrids_glob)
         ngrids_local = grid_end - grid_start
         log.debug(f"{ngrids_local} on Device {device_id}")
@@ -1176,7 +1176,7 @@ def _nr_uks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff,
         ngrids_glob = grids.coords.shape[0]
         ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices
         ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE
-        grid_start = device_id * ngrids_per_device
+        grid_start = min(device_id * ngrids_per_device, ngrids_glob)
         grid_end = min((device_id + 1) * ngrids_per_device, ngrids_glob)
         ngrids_local = grid_end - grid_start
         log.debug(f"{ngrids_local} on Device {device_id}")
diff --git a/gpu4pyscf/tests/test_benchmark_rks.py b/gpu4pyscf/tests/test_benchmark_rks.py
index 2afa6e70..ec294234 100644
--- a/gpu4pyscf/tests/test_benchmark_rks.py
+++ b/gpu4pyscf/tests/test_benchmark_rks.py
@@ -148,12 +148,12 @@ def test_df_rb3lyp_hessian_medium(benchmark):
     print('testing df rb3lyp hessian medium')
     assert np.isclose(np.linalg.norm(h), 6.31265424196621, atol=1e-4, rtol=1e-16)
 
-@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
+@pytest.mark.benchmark(warmup=False, min_rounds=1)
 def test_rb3lyp_medium(benchmark):
     e = benchmark(run_rb3lyp, medium_mol, 'def2-tzvpp', False, False)
     print('testing rb3lyp medium')
     assert np.isclose(np.linalg.norm(e), 1138.3710752128077, atol=1e-7, rtol=1e-16)
-@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
+@pytest.mark.benchmark(warmup=False, min_rounds=1)
 def test_rb3lyp_grad_medium(benchmark):
     g = benchmark(run_rb3lyp_grad, medium_mol, 'def2-tzvpp', False, False)
     print('testing rb3lyp grad medium')
diff --git a/gpu4pyscf/tests/test_benchmark_uks.py b/gpu4pyscf/tests/test_benchmark_uks.py
index 5962c08e..236a433b 100644
--- a/gpu4pyscf/tests/test_benchmark_uks.py
+++ b/gpu4pyscf/tests/test_benchmark_uks.py
@@ -68,12 +68,12 @@ def run_ub3lyp_hessian(atom, basis, with_df, with_solvent):
 ##########
 # UKS
 ##########
-@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=6)
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
 def test_df_ub3lyp(benchmark):
     e = benchmark(run_ub3lyp, small_mol, 'def2-tzvpp', True, False)
     print('testing df ub3lyp')
     assert np.isclose(np.linalg.norm(e), 684.9998712035856, atol=1e-7, rtol=1e-16)
-@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=6)
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
 def test_df_ub3lyp_grad(benchmark):
     g = benchmark(run_ub3lyp_grad, small_mol, 'def2-tzvpp', True, False)
     print('testing df ub3lyp grad')
@@ -83,12 +83,12 @@ def test_df_ub3lyp_hessian(benchmark):
     h = benchmark(run_ub3lyp_hessian, small_mol, 'def2-tzvpp', True, False)
     print('testing df ub3lyp hessian')
     assert np.isclose(np.linalg.norm(h), 3.758810345806532, atol=1e-4, rtol=1e-16)
-@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=6)
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
 def test_ub3lyp(benchmark):
     e = benchmark(run_ub3lyp, small_mol, '6-31gs', False, False)
     print('testing ub3lyp')
     assert np.isclose(np.linalg.norm(e), 684.6643858622429, atol=1e-7, rtol=1e-16)
-@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=6)
+@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3)
 def test_ub3lyp_grad(benchmark):
     g = benchmark(run_ub3lyp_grad, small_mol, '6-31gs', False, False)
     print('testing ub3lyp grad')

From 0fa182d7d6d84cc5dc9886a70a859d058e491147 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Mon, 6 Jan 2025 13:33:36 +0800
Subject: [PATCH 43/49] add benchmark data

---
 .github/workflows/nightly_build.yml           |   4 +-
 .../v1.3.0_rks_1v100.json}                    | 630 +++++++++---------
 .../benchmark_results/v1.3.0_uks_1v100.json   | 418 ++++++++++++
 3 files changed, 735 insertions(+), 317 deletions(-)
 rename gpu4pyscf/tests/{.benchmarks/Linux-CPython-3.9-64bit/v1.3.0_1v100.json => benchmark_results/v1.3.0_rks_1v100.json} (56%)
 create mode 100644 gpu4pyscf/tests/benchmark_results/v1.3.0_uks_1v100.json

diff --git a/.github/workflows/nightly_build.yml b/.github/workflows/nightly_build.yml
index 012ca12f..29ec300f 100644
--- a/.github/workflows/nightly_build.yml
+++ b/.github/workflows/nightly_build.yml
@@ -40,9 +40,9 @@ jobs:
       run: |
         echo $GITHUB_WORKSPACE
         export PYTHONPATH="${PYTHONPATH}:$(pwd)"
-        pytest gpu4pyscf/tests/test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=v1.3.0_1v100 --benchmark-storage=gpu4pyscf/tests/benchmark_results/
+        pytest gpu4pyscf/tests/test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=v1.3.0_rks_1v100 --benchmark-storage=gpu4pyscf/tests/benchmark_results/
     - name: Test UKS
       run: |
         echo $GITHUB_WORKSPACE
         export PYTHONPATH="${PYTHONPATH}:$(pwd)"
-        pytest gpu4pyscf/tests/test_benchmark_uks.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=v1.3.0_1v100 --benchmark-storage=gpu4pyscf/tests/benchmark_results/
+        pytest gpu4pyscf/tests/test_benchmark_uks.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=v1.3.0_uks_1v100 --benchmark-storage=gpu4pyscf/tests/benchmark_results/
diff --git a/gpu4pyscf/tests/.benchmarks/Linux-CPython-3.9-64bit/v1.3.0_1v100.json b/gpu4pyscf/tests/benchmark_results/v1.3.0_rks_1v100.json
similarity index 56%
rename from gpu4pyscf/tests/.benchmarks/Linux-CPython-3.9-64bit/v1.3.0_1v100.json
rename to gpu4pyscf/tests/benchmark_results/v1.3.0_rks_1v100.json
index 81cb8ad0..1c5a9fc2 100644
--- a/gpu4pyscf/tests/.benchmarks/Linux-CPython-3.9-64bit/v1.3.0_1v100.json
+++ b/gpu4pyscf/tests/benchmark_results/v1.3.0_rks_1v100.json
@@ -1,6 +1,6 @@
 {
     "machine_info": {
-        "node": "mlxlabzskrh20v669fd914-20240723162348-uim1c3-k3ligr-worker",
+        "node": "mlxlabzskrh20v669fd914-20240723162348-uim1c3-7vr5b9-worker",
         "processor": "",
         "machine": "x86_64",
         "python_compiler": "GCC 10.2.1 20210110",
@@ -34,7 +34,7 @@
                 0
             ],
             "hz_actual": [
-                3100001000,
+                3100005000,
                 0
             ],
             "stepping": 7,
@@ -194,10 +194,10 @@
         }
     },
     "commit_info": {
-        "id": "ba388eec82973e4722d1afa3e83e00a3101248a0",
-        "time": "2025-01-03T06:10:51+08:00",
-        "author_time": "2025-01-03T06:10:51+08:00",
-        "dirty": true,
+        "id": "1ed8e5eccf5aaae256f4ad591db81f6689dd8a13",
+        "time": "2025-01-05T23:21:10+00:00",
+        "author_time": "2025-01-05T23:21:10+00:00",
+        "dirty": false,
         "project": "gpu4pyscf",
         "branch": "benchmark_ci"
     },
@@ -212,28 +212,28 @@
             "options": {
                 "disable_gc": false,
                 "timer": "perf_counter",
-                "min_rounds": 5,
+                "min_rounds": 3,
                 "max_time": 1.0,
                 "min_time": 5e-06,
-                "warmup": false
+                "warmup": 2
             },
             "stats": {
-                "min": 2.912467209622264,
-                "max": 3.132086180150509,
-                "mean": 2.9854499623179436,
-                "stddev": 0.08575128159932316,
-                "rounds": 5,
-                "median": 2.9598704893141985,
-                "iqr": 0.08442470477893949,
-                "q1": 2.934416546020657,
-                "q3": 3.0188412507995963,
+                "min": 2.725358221679926,
+                "max": 2.835785958915949,
+                "mean": 2.782431565846006,
+                "stddev": 0.055307723110869685,
+                "rounds": 3,
+                "median": 2.7861505169421434,
+                "iqr": 0.08282080292701721,
+                "q1": 2.7405562954954803,
+                "q3": 2.8233770984224975,
                 "iqr_outliers": 0,
                 "stddev_outliers": 1,
                 "outliers": "1;0",
-                "ld15iqr": 2.912467209622264,
-                "hd15iqr": 3.132086180150509,
-                "ops": 0.33495788327451537,
-                "total": 14.927249811589718,
+                "ld15iqr": 2.725358221679926,
+                "hd15iqr": 2.835785958915949,
+                "ops": 0.35939787783997024,
+                "total": 8.347294697538018,
                 "iterations": 1
             }
         },
@@ -247,28 +247,28 @@
             "options": {
                 "disable_gc": false,
                 "timer": "perf_counter",
-                "min_rounds": 5,
+                "min_rounds": 3,
                 "max_time": 1.0,
                 "min_time": 5e-06,
-                "warmup": false
+                "warmup": 2
             },
             "stats": {
-                "min": 4.693447925150394,
-                "max": 4.811241740360856,
-                "mean": 4.7545236147940155,
-                "stddev": 0.05376631322845494,
-                "rounds": 5,
-                "median": 4.767030920833349,
-                "iqr": 0.10001574829220772,
-                "q1": 4.700914891902357,
-                "q3": 4.800930640194565,
+                "min": 4.394210334867239,
+                "max": 4.473813105374575,
+                "mean": 4.42994485112528,
+                "stddev": 0.04041990275091787,
+                "rounds": 3,
+                "median": 4.4218111131340265,
+                "iqr": 0.05970207788050175,
+                "q1": 4.401110529433936,
+                "q3": 4.460812607314438,
                 "iqr_outliers": 0,
-                "stddev_outliers": 2,
-                "outliers": "2;0",
-                "ld15iqr": 4.693447925150394,
-                "hd15iqr": 4.811241740360856,
-                "ops": 0.21032601392249553,
-                "total": 23.77261807397008,
+                "stddev_outliers": 1,
+                "outliers": "1;0",
+                "ld15iqr": 4.394210334867239,
+                "hd15iqr": 4.473813105374575,
+                "ops": 0.22573644449455918,
+                "total": 13.28983455337584,
                 "iterations": 1
             }
         },
@@ -282,28 +282,28 @@
             "options": {
                 "disable_gc": false,
                 "timer": "perf_counter",
-                "min_rounds": 5,
+                "min_rounds": 1,
                 "max_time": 1.0,
                 "min_time": 5e-06,
                 "warmup": false
             },
             "stats": {
-                "min": 43.1729771643877,
-                "max": 44.22008949518204,
-                "mean": 43.53323510922492,
-                "stddev": 0.4568445249288835,
-                "rounds": 5,
-                "median": 43.26318317092955,
-                "iqr": 0.6843766365200281,
-                "q1": 43.210667157545686,
-                "q3": 43.895043794065714,
+                "min": 43.774112831801176,
+                "max": 43.774112831801176,
+                "mean": 43.774112831801176,
+                "stddev": 0,
+                "rounds": 1,
+                "median": 43.774112831801176,
+                "iqr": 0.0,
+                "q1": 43.774112831801176,
+                "q3": 43.774112831801176,
                 "iqr_outliers": 0,
-                "stddev_outliers": 1,
-                "outliers": "1;0",
-                "ld15iqr": 43.1729771643877,
-                "hd15iqr": 44.22008949518204,
-                "ops": 0.02297095535149178,
-                "total": 217.66617554612458,
+                "stddev_outliers": 0,
+                "outliers": "0;0",
+                "ld15iqr": 43.774112831801176,
+                "hd15iqr": 43.774112831801176,
+                "ops": 0.022844552072189946,
+                "total": 43.774112831801176,
                 "iterations": 1
             }
         },
@@ -317,28 +317,28 @@
             "options": {
                 "disable_gc": false,
                 "timer": "perf_counter",
-                "min_rounds": 5,
+                "min_rounds": 3,
                 "max_time": 1.0,
                 "min_time": 5e-06,
-                "warmup": false
+                "warmup": 2
             },
             "stats": {
-                "min": 40.87554130144417,
-                "max": 41.24961415119469,
-                "mean": 41.05381844490766,
-                "stddev": 0.13780925683914672,
-                "rounds": 5,
-                "median": 41.05546211451292,
-                "iqr": 0.17331884242594242,
-                "q1": 40.96216300688684,
-                "q3": 41.13548184931278,
+                "min": 40.097773076966405,
+                "max": 40.15744375810027,
+                "mean": 40.11991243995726,
+                "stddev": 0.03267769513443882,
+                "rounds": 3,
+                "median": 40.10452048480511,
+                "iqr": 0.04475301085039973,
+                "q1": 40.09945992892608,
+                "q3": 40.14421293977648,
                 "iqr_outliers": 0,
-                "stddev_outliers": 2,
-                "outliers": "2;0",
-                "ld15iqr": 40.87554130144417,
-                "hd15iqr": 41.24961415119469,
-                "ops": 0.02435827014098467,
-                "total": 205.26909222453833,
+                "stddev_outliers": 1,
+                "outliers": "1;0",
+                "ld15iqr": 40.097773076966405,
+                "hd15iqr": 40.15744375810027,
+                "ops": 0.024925278725285903,
+                "total": 120.35973731987178,
                 "iterations": 1
             }
         },
@@ -352,28 +352,28 @@
             "options": {
                 "disable_gc": false,
                 "timer": "perf_counter",
-                "min_rounds": 5,
+                "min_rounds": 3,
                 "max_time": 1.0,
                 "min_time": 5e-06,
-                "warmup": false
+                "warmup": 2
             },
             "stats": {
-                "min": 49.98093665204942,
-                "max": 50.76574368029833,
-                "mean": 50.31307061091066,
-                "stddev": 0.33630120438295324,
-                "rounds": 5,
-                "median": 50.36884331330657,
-                "iqr": 0.5613440982997417,
-                "q1": 49.981349020730704,
-                "q3": 50.542693119030446,
+                "min": 48.99313645064831,
+                "max": 49.26371451281011,
+                "mean": 49.142610578487314,
+                "stddev": 0.13750190122656403,
+                "rounds": 3,
+                "median": 49.17098077200353,
+                "iqr": 0.20293354662135243,
+                "q1": 49.037597530987114,
+                "q3": 49.240531077608466,
                 "iqr_outliers": 0,
                 "stddev_outliers": 1,
                 "outliers": "1;0",
-                "ld15iqr": 49.98093665204942,
-                "hd15iqr": 50.76574368029833,
-                "ops": 0.01987555098223611,
-                "total": 251.56535305455327,
+                "ld15iqr": 48.99313645064831,
+                "hd15iqr": 49.26371451281011,
+                "ops": 0.02034893930599935,
+                "total": 147.42783173546195,
                 "iterations": 1
             }
         },
@@ -387,28 +387,28 @@
             "options": {
                 "disable_gc": false,
                 "timer": "perf_counter",
-                "min_rounds": 5,
+                "min_rounds": 1,
                 "max_time": 1.0,
                 "min_time": 5e-06,
                 "warmup": false
             },
             "stats": {
-                "min": 611.3098333217204,
-                "max": 620.2315559927374,
-                "mean": 614.9859318509698,
-                "stddev": 3.295612103075669,
-                "rounds": 5,
-                "median": 614.4812579210848,
-                "iqr": 3.568844774272293,
-                "q1": 612.998380784411,
-                "q3": 616.5672255586833,
+                "min": 615.0911720395088,
+                "max": 615.0911720395088,
+                "mean": 615.0911720395088,
+                "stddev": 0,
+                "rounds": 1,
+                "median": 615.0911720395088,
+                "iqr": 0.0,
+                "q1": 615.0911720395088,
+                "q3": 615.0911720395088,
                 "iqr_outliers": 0,
-                "stddev_outliers": 2,
-                "outliers": "2;0",
-                "ld15iqr": 611.3098333217204,
-                "hd15iqr": 620.2315559927374,
-                "ops": 0.001626053456198948,
-                "total": 3074.929659254849,
+                "stddev_outliers": 0,
+                "outliers": "0;0",
+                "ld15iqr": 615.0911720395088,
+                "hd15iqr": 615.0911720395088,
+                "ops": 0.0016257752435044988,
+                "total": 615.0911720395088,
                 "iterations": 1
             }
         },
@@ -422,28 +422,28 @@
             "options": {
                 "disable_gc": false,
                 "timer": "perf_counter",
-                "min_rounds": 5,
+                "min_rounds": 3,
                 "max_time": 1.0,
                 "min_time": 5e-06,
-                "warmup": false
+                "warmup": 2
             },
             "stats": {
-                "min": 18.450319150462747,
-                "max": 19.34435743652284,
-                "mean": 18.962213665619494,
-                "stddev": 0.34090358565345374,
-                "rounds": 5,
-                "median": 19.017266055569053,
-                "iqr": 0.4629710176959634,
-                "q1": 18.742521196603775,
-                "q3": 19.20549221429974,
+                "min": 18.244548039510846,
+                "max": 18.375720830634236,
+                "mean": 18.312131161491077,
+                "stddev": 0.06567751542153955,
+                "rounds": 3,
+                "median": 18.316124614328146,
+                "iqr": 0.09837959334254265,
+                "q1": 18.26244218321517,
+                "q3": 18.360821776557714,
                 "iqr_outliers": 0,
-                "stddev_outliers": 2,
-                "outliers": "2;0",
-                "ld15iqr": 18.450319150462747,
-                "hd15iqr": 19.34435743652284,
-                "ops": 0.05273645881404165,
-                "total": 94.81106832809746,
+                "stddev_outliers": 1,
+                "outliers": "1;0",
+                "ld15iqr": 18.244548039510846,
+                "hd15iqr": 18.375720830634236,
+                "ops": 0.05460860842362896,
+                "total": 54.93639348447323,
                 "iterations": 1
             }
         },
@@ -457,28 +457,28 @@
             "options": {
                 "disable_gc": false,
                 "timer": "perf_counter",
-                "min_rounds": 5,
+                "min_rounds": 3,
                 "max_time": 1.0,
                 "min_time": 5e-06,
-                "warmup": false
+                "warmup": 2
             },
             "stats": {
-                "min": 28.927994549274445,
-                "max": 29.407788010314107,
-                "mean": 29.06979787014425,
-                "stddev": 0.19948441635503308,
-                "rounds": 5,
-                "median": 28.980533458292484,
-                "iqr": 0.2319285492412746,
-                "q1": 28.93826104514301,
-                "q3": 29.170189594384283,
+                "min": 30.697130125015974,
+                "max": 30.711910048499703,
+                "mean": 30.70534764789045,
+                "stddev": 0.00752768934207856,
+                "rounds": 3,
+                "median": 30.70700277015567,
+                "iqr": 0.011084942612797022,
+                "q1": 30.699598286300898,
+                "q3": 30.710683228913695,
                 "iqr_outliers": 0,
                 "stddev_outliers": 1,
                 "outliers": "1;0",
-                "ld15iqr": 28.927994549274445,
-                "hd15iqr": 29.407788010314107,
-                "ops": 0.034399963992423795,
-                "total": 145.34898935072124,
+                "ld15iqr": 30.697130125015974,
+                "hd15iqr": 30.711910048499703,
+                "ops": 0.03256761693329022,
+                "total": 92.11604294367135,
                 "iterations": 1
             }
         },
@@ -492,28 +492,28 @@
             "options": {
                 "disable_gc": false,
                 "timer": "perf_counter",
-                "min_rounds": 5,
+                "min_rounds": 1,
                 "max_time": 1.0,
                 "min_time": 5e-06,
                 "warmup": false
             },
             "stats": {
-                "min": 674.9359990525991,
-                "max": 678.2040371634066,
-                "mean": 676.7355838540941,
-                "stddev": 1.3332352353981456,
-                "rounds": 5,
-                "median": 676.6573997996747,
-                "iqr": 2.1692251418717206,
-                "q1": 675.7630731766112,
-                "q3": 677.9322983184829,
+                "min": 667.9882875829935,
+                "max": 667.9882875829935,
+                "mean": 667.9882875829935,
+                "stddev": 0,
+                "rounds": 1,
+                "median": 667.9882875829935,
+                "iqr": 0.0,
+                "q1": 667.9882875829935,
+                "q3": 667.9882875829935,
                 "iqr_outliers": 0,
-                "stddev_outliers": 2,
-                "outliers": "2;0",
-                "ld15iqr": 674.9359990525991,
-                "hd15iqr": 678.2040371634066,
-                "ops": 0.0014776820132685715,
-                "total": 3383.6779192704707,
+                "stddev_outliers": 0,
+                "outliers": "0;0",
+                "ld15iqr": 667.9882875829935,
+                "hd15iqr": 667.9882875829935,
+                "ops": 0.0014970322363260838,
+                "total": 667.9882875829935,
                 "iterations": 1
             }
         },
@@ -527,28 +527,28 @@
             "options": {
                 "disable_gc": false,
                 "timer": "perf_counter",
-                "min_rounds": 5,
+                "min_rounds": 3,
                 "max_time": 1.0,
                 "min_time": 5e-06,
-                "warmup": false
+                "warmup": 2
             },
             "stats": {
-                "min": 465.53933845460415,
-                "max": 469.9319954663515,
-                "mean": 467.35331859253347,
-                "stddev": 1.7196427730040629,
-                "rounds": 5,
-                "median": 467.1924539171159,
-                "iqr": 2.4731017132289708,
-                "q1": 465.9859178052284,
-                "q3": 468.45901951845735,
+                "min": 460.72668202780187,
+                "max": 461.77398146130145,
+                "mean": 461.4145879279822,
+                "stddev": 0.5959440470695604,
+                "rounds": 3,
+                "median": 461.7431002948433,
+                "iqr": 0.785474575124681,
+                "q1": 460.98078659456223,
+                "q3": 461.7662611696869,
                 "iqr_outliers": 0,
-                "stddev_outliers": 2,
-                "outliers": "2;0",
-                "ld15iqr": 465.53933845460415,
-                "hd15iqr": 469.9319954663515,
-                "ops": 0.00213970878180895,
-                "total": 2336.7665929626673,
+                "stddev_outliers": 1,
+                "outliers": "1;0",
+                "ld15iqr": 460.72668202780187,
+                "hd15iqr": 461.77398146130145,
+                "ops": 0.0021672483405662944,
+                "total": 1384.2437637839466,
                 "iterations": 1
             }
         },
@@ -562,28 +562,28 @@
             "options": {
                 "disable_gc": false,
                 "timer": "perf_counter",
-                "min_rounds": 5,
+                "min_rounds": 3,
                 "max_time": 1.0,
                 "min_time": 5e-06,
-                "warmup": false
+                "warmup": 2
             },
             "stats": {
-                "min": 559.7645257860422,
-                "max": 562.2628617957234,
-                "mean": 560.7982054453344,
-                "stddev": 0.9539619574856013,
-                "rounds": 5,
-                "median": 560.6089988369495,
-                "iqr": 1.2622483419254422,
-                "q1": 560.1302895797417,
-                "q3": 561.3925379216671,
+                "min": 552.0836905632168,
+                "max": 553.4436832498759,
+                "mean": 552.8364644367248,
+                "stddev": 0.6915813282891417,
+                "rounds": 3,
+                "median": 552.9820194970816,
+                "iqr": 1.0199945149943233,
+                "q1": 552.308272796683,
+                "q3": 553.3282673116773,
                 "iqr_outliers": 0,
-                "stddev_outliers": 2,
-                "outliers": "2;0",
-                "ld15iqr": 559.7645257860422,
-                "hd15iqr": 562.2628617957234,
-                "ops": 0.0017831726105576463,
-                "total": 2803.9910272266716,
+                "stddev_outliers": 1,
+                "outliers": "1;0",
+                "ld15iqr": 552.0836905632168,
+                "hd15iqr": 553.4436832498759,
+                "ops": 0.0018088531859396832,
+                "total": 1658.5093933101743,
                 "iterations": 1
             }
         },
@@ -597,28 +597,28 @@
             "options": {
                 "disable_gc": false,
                 "timer": "perf_counter",
-                "min_rounds": 5,
+                "min_rounds": 3,
                 "max_time": 1.0,
                 "min_time": 5e-06,
-                "warmup": false
+                "warmup": 2
             },
             "stats": {
-                "min": 2.3893800172954798,
-                "max": 4.065618982538581,
-                "mean": 3.419478364661336,
-                "stddev": 0.6287030173245606,
-                "rounds": 5,
-                "median": 3.485863795503974,
-                "iqr": 0.6505572367459536,
-                "q1": 3.1652946420945227,
-                "q3": 3.8158518788404763,
+                "min": 1.6017291732132435,
+                "max": 1.647629827260971,
+                "mean": 1.6208390643199284,
+                "stddev": 0.02389486042236203,
+                "rounds": 3,
+                "median": 1.613158192485571,
+                "iqr": 0.03442549053579569,
+                "q1": 1.6045864280313253,
+                "q3": 1.639011918567121,
                 "iqr_outliers": 0,
-                "stddev_outliers": 2,
-                "outliers": "2;0",
-                "ld15iqr": 2.3893800172954798,
-                "hd15iqr": 4.065618982538581,
-                "ops": 0.29244226556147246,
-                "total": 17.09739182330668,
+                "stddev_outliers": 1,
+                "outliers": "1;0",
+                "ld15iqr": 1.6017291732132435,
+                "hd15iqr": 1.647629827260971,
+                "ops": 0.6169643994973554,
+                "total": 4.8625171929597855,
                 "iterations": 1
             }
         },
@@ -632,28 +632,28 @@
             "options": {
                 "disable_gc": false,
                 "timer": "perf_counter",
-                "min_rounds": 5,
+                "min_rounds": 3,
                 "max_time": 1.0,
                 "min_time": 5e-06,
-                "warmup": false
+                "warmup": 2
             },
             "stats": {
-                "min": 3.086430249735713,
-                "max": 3.4464661311358213,
-                "mean": 3.21519818790257,
-                "stddev": 0.14631235080321897,
-                "rounds": 5,
-                "median": 3.1610366478562355,
-                "iqr": 0.20356103358790278,
-                "q1": 3.108103247359395,
-                "q3": 3.311664280947298,
+                "min": 2.1184212770313025,
+                "max": 2.20925628952682,
+                "mean": 2.15202548665305,
+                "stddev": 0.04981377124137081,
+                "rounds": 3,
+                "median": 2.1283988934010267,
+                "iqr": 0.0681262593716383,
+                "q1": 2.1209156811237335,
+                "q3": 2.189041940495372,
                 "iqr_outliers": 0,
                 "stddev_outliers": 1,
                 "outliers": "1;0",
-                "ld15iqr": 3.086430249735713,
-                "hd15iqr": 3.4464661311358213,
-                "ops": 0.31102281774186635,
-                "total": 16.07599093951285,
+                "ld15iqr": 2.1184212770313025,
+                "hd15iqr": 2.20925628952682,
+                "ops": 0.46467851157063006,
+                "total": 6.456076459959149,
                 "iterations": 1
             }
         },
@@ -667,28 +667,28 @@
             "options": {
                 "disable_gc": false,
                 "timer": "perf_counter",
-                "min_rounds": 5,
+                "min_rounds": 1,
                 "max_time": 1.0,
                 "min_time": 5e-06,
                 "warmup": false
             },
             "stats": {
-                "min": 15.165010405704379,
-                "max": 15.594494730234146,
-                "mean": 15.329469257220627,
-                "stddev": 0.16923297471060986,
-                "rounds": 5,
-                "median": 15.25765424221754,
-                "iqr": 0.22211500210687518,
-                "q1": 15.220200731419027,
-                "q3": 15.442315733525902,
+                "min": 16.1142161693424,
+                "max": 16.1142161693424,
+                "mean": 16.1142161693424,
+                "stddev": 0,
+                "rounds": 1,
+                "median": 16.1142161693424,
+                "iqr": 0.0,
+                "q1": 16.1142161693424,
+                "q3": 16.1142161693424,
                 "iqr_outliers": 0,
-                "stddev_outliers": 1,
-                "outliers": "1;0",
-                "ld15iqr": 15.165010405704379,
-                "hd15iqr": 15.594494730234146,
-                "ops": 0.0652338305534597,
-                "total": 76.64734628610313,
+                "stddev_outliers": 0,
+                "outliers": "0;0",
+                "ld15iqr": 16.1142161693424,
+                "hd15iqr": 16.1142161693424,
+                "ops": 0.06205700541007504,
+                "total": 16.1142161693424,
                 "iterations": 1
             }
         },
@@ -702,28 +702,28 @@
             "options": {
                 "disable_gc": false,
                 "timer": "perf_counter",
-                "min_rounds": 5,
+                "min_rounds": 3,
                 "max_time": 1.0,
                 "min_time": 5e-06,
-                "warmup": false
+                "warmup": 2
             },
             "stats": {
-                "min": 55.83294254913926,
-                "max": 59.17145283520222,
-                "mean": 57.62203652448952,
-                "stddev": 1.374501327462731,
-                "rounds": 5,
-                "median": 57.98267317190766,
-                "iqr": 2.264786566141993,
-                "q1": 56.414323914796114,
-                "q3": 58.67911048093811,
+                "min": 55.4929311927408,
+                "max": 56.77203128859401,
+                "mean": 56.066467080265284,
+                "stddev": 0.6496905970719544,
+                "rounds": 3,
+                "median": 55.934438759461045,
+                "iqr": 0.9593250718899071,
+                "q1": 55.60330808442086,
+                "q3": 56.56263315631077,
                 "iqr_outliers": 0,
-                "stddev_outliers": 2,
-                "outliers": "2;0",
-                "ld15iqr": 55.83294254913926,
-                "hd15iqr": 59.17145283520222,
-                "ops": 0.017354471662503586,
-                "total": 288.1101826224476,
+                "stddev_outliers": 1,
+                "outliers": "1;0",
+                "ld15iqr": 55.4929311927408,
+                "hd15iqr": 56.77203128859401,
+                "ops": 0.01783597312397784,
+                "total": 168.19940124079585,
                 "iterations": 1
             }
         },
@@ -737,28 +737,28 @@
             "options": {
                 "disable_gc": false,
                 "timer": "perf_counter",
-                "min_rounds": 5,
+                "min_rounds": 3,
                 "max_time": 1.0,
                 "min_time": 5e-06,
-                "warmup": false
+                "warmup": 2
             },
             "stats": {
-                "min": 72.57952445559204,
-                "max": 73.64580446854234,
-                "mean": 73.0582833636552,
-                "stddev": 0.48907908462094757,
-                "rounds": 5,
-                "median": 72.9562251791358,
-                "iqr": 0.9134543887339532,
-                "q1": 72.61263743927702,
-                "q3": 73.52609182801098,
+                "min": 70.14288471080363,
+                "max": 70.61111964285374,
+                "mean": 70.3403081515183,
+                "stddev": 0.24259089508559126,
+                "rounds": 3,
+                "median": 70.26692010089755,
+                "iqr": 0.3511761990375817,
+                "q1": 70.17389355832711,
+                "q3": 70.52506975736469,
                 "iqr_outliers": 0,
                 "stddev_outliers": 1,
                 "outliers": "1;0",
-                "ld15iqr": 72.57952445559204,
-                "hd15iqr": 73.64580446854234,
-                "ops": 0.01368770184514733,
-                "total": 365.29141681827605,
+                "ld15iqr": 70.14288471080363,
+                "hd15iqr": 70.61111964285374,
+                "ops": 0.014216599646477592,
+                "total": 211.02092445455492,
                 "iterations": 1
             }
         },
@@ -772,28 +772,28 @@
             "options": {
                 "disable_gc": false,
                 "timer": "perf_counter",
-                "min_rounds": 5,
+                "min_rounds": 3,
                 "max_time": 1.0,
                 "min_time": 5e-06,
-                "warmup": false
+                "warmup": 2
             },
             "stats": {
-                "min": 3.96594556607306,
-                "max": 4.621823711320758,
-                "mean": 4.35534807741642,
-                "stddev": 0.24887039802683908,
-                "rounds": 5,
-                "median": 4.413319645449519,
-                "iqr": 0.31258321227505803,
-                "q1": 4.208048852626234,
-                "q3": 4.520632064901292,
+                "min": 2.51676319912076,
+                "max": 2.569052016362548,
+                "mean": 2.540054644147555,
+                "stddev": 0.02660729798277223,
+                "rounds": 3,
+                "median": 2.5343487169593573,
+                "iqr": 0.03921661293134093,
+                "q1": 2.5211595785804093,
+                "q3": 2.56037619151175,
                 "iqr_outliers": 0,
-                "stddev_outliers": 2,
-                "outliers": "2;0",
-                "ld15iqr": 3.96594556607306,
-                "hd15iqr": 4.621823711320758,
-                "ops": 0.22960277392873663,
-                "total": 21.7767403870821,
+                "stddev_outliers": 1,
+                "outliers": "1;0",
+                "ld15iqr": 2.51676319912076,
+                "hd15iqr": 2.569052016362548,
+                "ops": 0.393692317723976,
+                "total": 7.620163932442665,
                 "iterations": 1
             }
         },
@@ -807,28 +807,28 @@
             "options": {
                 "disable_gc": false,
                 "timer": "perf_counter",
-                "min_rounds": 5,
+                "min_rounds": 3,
                 "max_time": 1.0,
                 "min_time": 5e-06,
-                "warmup": false
+                "warmup": 2
             },
             "stats": {
-                "min": 4.794365357607603,
-                "max": 5.184939783066511,
-                "mean": 4.969379325211048,
-                "stddev": 0.15483049255542325,
-                "rounds": 5,
-                "median": 4.9499497301876545,
-                "iqr": 0.24070796929299831,
-                "q1": 4.846174816135317,
-                "q3": 5.086882785428315,
+                "min": 3.7774324007332325,
+                "max": 3.8614633549004793,
+                "mean": 3.8227184594919286,
+                "stddev": 0.04239564161614309,
+                "rounds": 3,
+                "median": 3.8292596228420734,
+                "iqr": 0.06302321562543511,
+                "q1": 3.7903892062604427,
+                "q3": 3.853412421885878,
                 "iqr_outliers": 0,
-                "stddev_outliers": 2,
-                "outliers": "2;0",
-                "ld15iqr": 4.794365357607603,
-                "hd15iqr": 5.184939783066511,
-                "ops": 0.20123237421758508,
-                "total": 24.84689662605524,
+                "stddev_outliers": 1,
+                "outliers": "1;0",
+                "ld15iqr": 3.7774324007332325,
+                "hd15iqr": 3.8614633549004793,
+                "ops": 0.26159394436097405,
+                "total": 11.468155378475785,
                 "iterations": 1
             }
         },
@@ -842,32 +842,32 @@
             "options": {
                 "disable_gc": false,
                 "timer": "perf_counter",
-                "min_rounds": 5,
+                "min_rounds": 1,
                 "max_time": 1.0,
                 "min_time": 5e-06,
                 "warmup": false
             },
             "stats": {
-                "min": 152.7211031857878,
-                "max": 161.58804737962782,
-                "mean": 158.70457714907826,
-                "stddev": 3.529131682005075,
-                "rounds": 5,
-                "median": 159.99357100203633,
-                "iqr": 3.860361324157566,
-                "q1": 157.0660247253254,
-                "q3": 160.92638604948297,
+                "min": 122.75680537335575,
+                "max": 122.75680537335575,
+                "mean": 122.75680537335575,
+                "stddev": 0,
+                "rounds": 1,
+                "median": 122.75680537335575,
+                "iqr": 0.0,
+                "q1": 122.75680537335575,
+                "q3": 122.75680537335575,
                 "iqr_outliers": 0,
-                "stddev_outliers": 1,
-                "outliers": "1;0",
-                "ld15iqr": 152.7211031857878,
-                "hd15iqr": 161.58804737962782,
-                "ops": 0.006301015496614541,
-                "total": 793.5228857453912,
+                "stddev_outliers": 0,
+                "outliers": "0;0",
+                "ld15iqr": 122.75680537335575,
+                "hd15iqr": 122.75680537335575,
+                "ops": 0.00814618787902287,
+                "total": 122.75680537335575,
                 "iterations": 1
             }
         }
     ],
-    "datetime": "2025-01-03T13:58:40.332127+00:00",
+    "datetime": "2025-01-06T03:31:22.391433+00:00",
     "version": "5.1.0"
 }
\ No newline at end of file
diff --git a/gpu4pyscf/tests/benchmark_results/v1.3.0_uks_1v100.json b/gpu4pyscf/tests/benchmark_results/v1.3.0_uks_1v100.json
new file mode 100644
index 00000000..7bfabd8a
--- /dev/null
+++ b/gpu4pyscf/tests/benchmark_results/v1.3.0_uks_1v100.json
@@ -0,0 +1,418 @@
+{
+    "machine_info": {
+        "node": "mlxlabzskrh20v669fd914-20240723162348-uim1c3-7vr5b9-worker",
+        "processor": "",
+        "machine": "x86_64",
+        "python_compiler": "GCC 10.2.1 20210110",
+        "python_implementation": "CPython",
+        "python_implementation_version": "3.9.2",
+        "python_version": "3.9.2",
+        "python_build": [
+            "default",
+            "Feb 28 2021 17:03:44"
+        ],
+        "release": "5.4.143.bsk.7-amd64",
+        "system": "Linux",
+        "cpu": {
+            "python_version": "3.9.2.final.0 (64 bit)",
+            "cpuinfo_version": [
+                9,
+                0,
+                0
+            ],
+            "cpuinfo_version_string": "9.0.0",
+            "arch": "X86_64",
+            "bits": 64,
+            "count": 96,
+            "arch_string_raw": "x86_64",
+            "vendor_id_raw": "GenuineIntel",
+            "brand_raw": "Intel(R) Xeon(R) Platinum 8260 CPU @ 2.40GHz",
+            "hz_advertised_friendly": "2.4000 GHz",
+            "hz_actual_friendly": "3.1000 GHz",
+            "hz_advertised": [
+                2400000000,
+                0
+            ],
+            "hz_actual": [
+                3100012000,
+                0
+            ],
+            "stepping": 7,
+            "model": 85,
+            "family": 6,
+            "flags": [
+                "3dnowprefetch",
+                "abm",
+                "acpi",
+                "adx",
+                "aes",
+                "aperfmperf",
+                "apic",
+                "arat",
+                "arch_capabilities",
+                "arch_perfmon",
+                "art",
+                "avx",
+                "avx2",
+                "avx512_vnni",
+                "avx512bw",
+                "avx512cd",
+                "avx512dq",
+                "avx512f",
+                "avx512vl",
+                "avx512vnni",
+                "bmi1",
+                "bmi2",
+                "bts",
+                "cat_l3",
+                "cdp_l3",
+                "clflush",
+                "clflushopt",
+                "clwb",
+                "cmov",
+                "constant_tsc",
+                "cpuid",
+                "cpuid_fault",
+                "cqm",
+                "cqm_llc",
+                "cqm_mbm_local",
+                "cqm_mbm_total",
+                "cqm_occup_llc",
+                "cx16",
+                "cx8",
+                "dca",
+                "de",
+                "ds_cpl",
+                "dtes64",
+                "dtherm",
+                "dts",
+                "epb",
+                "ept",
+                "ept_ad",
+                "erms",
+                "est",
+                "f16c",
+                "flexpriority",
+                "flush_l1d",
+                "fma",
+                "fpu",
+                "fsgsbase",
+                "fxsr",
+                "ht",
+                "hwp",
+                "hwp_act_window",
+                "hwp_epp",
+                "hwp_pkg_req",
+                "ibpb",
+                "ibrs",
+                "ibrs_enhanced",
+                "ida",
+                "intel_ppin",
+                "intel_pt",
+                "invpcid",
+                "invpcid_single",
+                "lahf_lm",
+                "lm",
+                "mba",
+                "mca",
+                "mce",
+                "md_clear",
+                "mmx",
+                "movbe",
+                "mpx",
+                "msr",
+                "mtrr",
+                "nonstop_tsc",
+                "nopl",
+                "nx",
+                "ospke",
+                "osxsave",
+                "pae",
+                "pat",
+                "pbe",
+                "pcid",
+                "pclmulqdq",
+                "pdcm",
+                "pdpe1gb",
+                "pebs",
+                "pge",
+                "pku",
+                "pln",
+                "pni",
+                "popcnt",
+                "pqe",
+                "pqm",
+                "pse",
+                "pse36",
+                "pts",
+                "rdrand",
+                "rdrnd",
+                "rdseed",
+                "rdt_a",
+                "rdtscp",
+                "rep_good",
+                "sdbg",
+                "sep",
+                "smap",
+                "smep",
+                "smx",
+                "ss",
+                "ssbd",
+                "sse",
+                "sse2",
+                "sse4_1",
+                "sse4_2",
+                "ssse3",
+                "stibp",
+                "syscall",
+                "tm",
+                "tm2",
+                "tpr_shadow",
+                "tsc",
+                "tsc_adjust",
+                "tsc_deadline_timer",
+                "tscdeadline",
+                "vme",
+                "vmx",
+                "vnmi",
+                "vpid",
+                "x2apic",
+                "xgetbv1",
+                "xsave",
+                "xsavec",
+                "xsaveopt",
+                "xsaves",
+                "xtopology",
+                "xtpr"
+            ],
+            "l3_cache_size": 37486592,
+            "l2_cache_size": 50331648,
+            "l1_data_cache_size": "1.5 MiB",
+            "l1_instruction_cache_size": "1.5 MiB",
+            "l2_cache_line_size": 256,
+            "l2_cache_associativity": 6
+        }
+    },
+    "commit_info": {
+        "id": "1ed8e5eccf5aaae256f4ad591db81f6689dd8a13",
+        "time": "2025-01-05T23:21:10+00:00",
+        "author_time": "2025-01-05T23:21:10+00:00",
+        "dirty": false,
+        "project": "gpu4pyscf",
+        "branch": "benchmark_ci"
+    },
+    "benchmarks": [
+        {
+            "group": null,
+            "name": "test_df_ub3lyp",
+            "fullname": "gpu4pyscf/tests/test_benchmark_uks.py::test_df_ub3lyp",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 6,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": 2
+            },
+            "stats": {
+                "min": 6.552961312234402,
+                "max": 6.817228589206934,
+                "mean": 6.699132799791793,
+                "stddev": 0.10053109169956066,
+                "rounds": 6,
+                "median": 6.730765865184367,
+                "iqr": 0.15081804990768433,
+                "q1": 6.606128558516502,
+                "q3": 6.756946608424187,
+                "iqr_outliers": 0,
+                "stddev_outliers": 2,
+                "outliers": "2;0",
+                "ld15iqr": 6.552961312234402,
+                "hd15iqr": 6.817228589206934,
+                "ops": 0.14927305218237794,
+                "total": 40.19479679875076,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_df_ub3lyp_grad",
+            "fullname": "gpu4pyscf/tests/test_benchmark_uks.py::test_df_ub3lyp_grad",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 6,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": 2
+            },
+            "stats": {
+                "min": 13.294025084003806,
+                "max": 14.571726197376847,
+                "mean": 13.735415458368758,
+                "stddev": 0.5932420341119666,
+                "rounds": 6,
+                "median": 13.415598810650408,
+                "iqr": 1.1223390139639378,
+                "q1": 13.296602416783571,
+                "q3": 14.418941430747509,
+                "iqr_outliers": 0,
+                "stddev_outliers": 2,
+                "outliers": "2;0",
+                "ld15iqr": 13.294025084003806,
+                "hd15iqr": 14.571726197376847,
+                "ops": 0.07280449601476865,
+                "total": 82.41249275021255,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_df_ub3lyp_hessian",
+            "fullname": "gpu4pyscf/tests/test_benchmark_uks.py::test_df_ub3lyp_hessian",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 1,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": false
+            },
+            "stats": {
+                "min": 93.588756557554,
+                "max": 93.588756557554,
+                "mean": 93.588756557554,
+                "stddev": 0,
+                "rounds": 1,
+                "median": 93.588756557554,
+                "iqr": 0.0,
+                "q1": 93.588756557554,
+                "q3": 93.588756557554,
+                "iqr_outliers": 0,
+                "stddev_outliers": 0,
+                "outliers": "0;0",
+                "ld15iqr": 93.588756557554,
+                "hd15iqr": 93.588756557554,
+                "ops": 0.01068504419529319,
+                "total": 93.588756557554,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_ub3lyp",
+            "fullname": "gpu4pyscf/tests/test_benchmark_uks.py::test_ub3lyp",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 6,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": 2
+            },
+            "stats": {
+                "min": 6.713842295110226,
+                "max": 7.0260709673166275,
+                "mean": 6.852823034239312,
+                "stddev": 0.11983568503202911,
+                "rounds": 6,
+                "median": 6.869919722899795,
+                "iqr": 0.19665820337831974,
+                "q1": 6.720263646915555,
+                "q3": 6.916921850293875,
+                "iqr_outliers": 0,
+                "stddev_outliers": 3,
+                "outliers": "3;0",
+                "ld15iqr": 6.713842295110226,
+                "hd15iqr": 7.0260709673166275,
+                "ops": 0.14592526247994722,
+                "total": 41.11693820543587,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_ub3lyp_grad",
+            "fullname": "gpu4pyscf/tests/test_benchmark_uks.py::test_ub3lyp_grad",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 6,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": 2
+            },
+            "stats": {
+                "min": 7.483015248551965,
+                "max": 7.855705849826336,
+                "mean": 7.595327176774542,
+                "stddev": 0.14647552264068445,
+                "rounds": 6,
+                "median": 7.529973562806845,
+                "iqr": 0.19051661528646946,
+                "q1": 7.491389110684395,
+                "q3": 7.681905725970864,
+                "iqr_outliers": 0,
+                "stddev_outliers": 1,
+                "outliers": "1;0",
+                "ld15iqr": 7.483015248551965,
+                "hd15iqr": 7.855705849826336,
+                "ops": 0.13165989781952533,
+                "total": 45.57196306064725,
+                "iterations": 1
+            }
+        },
+        {
+            "group": null,
+            "name": "test_ub3lyp_hessian",
+            "fullname": "gpu4pyscf/tests/test_benchmark_uks.py::test_ub3lyp_hessian",
+            "params": null,
+            "param": null,
+            "extra_info": {},
+            "options": {
+                "disable_gc": false,
+                "timer": "perf_counter",
+                "min_rounds": 1,
+                "max_time": 1.0,
+                "min_time": 5e-06,
+                "warmup": false
+            },
+            "stats": {
+                "min": 61.551909405738115,
+                "max": 61.551909405738115,
+                "mean": 61.551909405738115,
+                "stddev": 0,
+                "rounds": 1,
+                "median": 61.551909405738115,
+                "iqr": 0.0,
+                "q1": 61.551909405738115,
+                "q3": 61.551909405738115,
+                "iqr_outliers": 0,
+                "stddev_outliers": 0,
+                "outliers": "0;0",
+                "ld15iqr": 61.551909405738115,
+                "hd15iqr": 61.551909405738115,
+                "ops": 0.016246449698387032,
+                "total": 61.551909405738115,
+                "iterations": 1
+            }
+        }
+    ],
+    "datetime": "2025-01-06T03:46:22.404689+00:00",
+    "version": "5.1.0"
+}
\ No newline at end of file

From 7edf50d09643fc1d0e18b67a8e8e50c3cff38b6a Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Mon, 6 Jan 2025 17:24:25 +0000
Subject: [PATCH 44/49] remove comments

---
 examples/dft_driver.py       | 4 ++--
 gpu4pyscf/df/df.py           | 2 +-
 gpu4pyscf/df/hessian/jk.py   | 6 +-----
 gpu4pyscf/hessian/rks.py     | 3 ---
 gpu4pyscf/lib/cupy_helper.py | 2 +-
 5 files changed, 5 insertions(+), 12 deletions(-)

diff --git a/examples/dft_driver.py b/examples/dft_driver.py
index 0c8cea48..0be7f410 100644
--- a/examples/dft_driver.py
+++ b/examples/dft_driver.py
@@ -34,10 +34,10 @@
     basis=bas,
     max_memory=32000)
 # set verbose >= 6 for debugging timer
-mol.verbose = 4
+mol.verbose = 6
 
 mf_df = dft.RKS(mol, xc=args.xc).density_fit(auxbasis=args.auxbasis)
-mf_df.verbose = 4
+mf_df.verbose = 6
 
 if args.solvent:
     mf_df = mf_df.PCM()
diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py
index ab1adeba..da61804c 100644
--- a/gpu4pyscf/df/df.py
+++ b/gpu4pyscf/df/df.py
@@ -138,7 +138,7 @@ def get_blksize(self, extra=0, nao=None):
         '''
         if nao is None: nao = self.nao
         mem_avail = get_avail_mem()
-        blksize = int(mem_avail*0.4/8/(nao*nao + extra) / ALIGNED) * ALIGNED
+        blksize = int(mem_avail*0.2/8/(nao*nao + extra) / ALIGNED) * ALIGNED
         blksize = min(blksize, MIN_BLK_SIZE)
         log = logger.new_logger(self.mol, self.mol.verbose)
         device_id = cupy.cuda.Device().id
diff --git a/gpu4pyscf/df/hessian/jk.py b/gpu4pyscf/df/hessian/jk.py
index 7859e97b..40ab3bfd 100644
--- a/gpu4pyscf/df/hessian/jk.py
+++ b/gpu4pyscf/df/hessian/jk.py
@@ -22,7 +22,7 @@
 from gpu4pyscf.scf.int4c2e import libgint
 from gpu4pyscf.hessian.jk import _ao2mo
 from gpu4pyscf.lib import logger
-from gpu4pyscf.lib.cupy_helper import contract, cart2sph, reduce_to_device, get_avail_mem, release_gpu_stack
+from gpu4pyscf.lib.cupy_helper import contract, cart2sph, reduce_to_device
 from gpu4pyscf.__config__ import _streams, _num_devices
 
 NROOT_ON_GPU = 7
@@ -314,8 +314,6 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo,
             # (20|0), (0|0)(0|00)
             int3c_blk = _get_int3c2e_ipip_slice('ipip1', intopt, cp_ij_id, aux_id, omega=omega)
             if with_j:
-                #tmp = contract('xpji,ij->xpi', int3c_blk, dm0[i0:i1,j0:j1])
-                #hj_ipip1[:,i0:i1] += contract('xpi,p->xi', tmp, rhoj[k0:k1])
                 tmp = contract('xpji,p->xji', int3c_blk, rhoj[k0:k1])
                 hj_ipip1[:,i0:i1] += contract('xji,ij->xi', tmp, dm0[i0:i1,j0:j1])
             if with_k:
@@ -325,8 +323,6 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo,
             # (11|0), (0|0)(0|00) without response of RI basis
             int3c_blk = _get_int3c2e_ipip_slice('ipvip1', intopt, cp_ij_id, aux_id, omega=omega)
             if with_j:
-                #tmp = contract('xpji,ij->xpij', int3c_blk, dm0[i0:i1,j0:j1])
-                #hj_ipvip1[:,i0:i1,j0:j1] += contract('xpij,p->xij', tmp, rhoj[k0:k1])
                 tmp = contract('xpji,p->xji', int3c_blk, rhoj[k0:k1])
                 hj_ipvip1[:,i0:i1,j0:j1] += contract('xji,ij->xij', tmp, dm0[i0:i1,j0:j1])
             if with_k:
diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py
index a2ee9da7..d506b934 100644
--- a/gpu4pyscf/hessian/rks.py
+++ b/gpu4pyscf/hessian/rks.py
@@ -811,9 +811,6 @@ def nr_rks_fxc_mo(ni, mol, grids, xc_code, dm0=None, dms=None, mo_coeff=None, re
     for future in futures:
         vmat_dist.append(future.result())
     vmat = reduce_to_device(vmat_dist, inplace=True)
-    #vmat = opt.unsort_orbitals(vmat, axis=[1,2])
-    #if xctype != 'LDA':
-    #    transpose_sum(vmat)
 
     if len(dm_shape) == 2:
         vmat = vmat[0]
diff --git a/gpu4pyscf/lib/cupy_helper.py b/gpu4pyscf/lib/cupy_helper.py
index 95ae1f24..d68cbcff 100644
--- a/gpu4pyscf/lib/cupy_helper.py
+++ b/gpu4pyscf/lib/cupy_helper.py
@@ -149,7 +149,7 @@ def reduce_to_device(array_list, inplace=False):
         matrix = matrix.reshape(-1)
         blksize = 1024*1024*1024 // matrix.itemsize # 1GB
         for p0, p1 in lib.prange(0,len(matrix), blksize):
-            result[p0:p1] += copy_array(matrix[p0:p1])#cupy.asarray(matrix[p0:p1]) 
+            result[p0:p1] += copy_array(matrix[p0:p1])
             #result[p0:p1] += cupy.asarray(matrix[p0:p1]) 
     return result.reshape(out_shape)
     

From ee0e63643bab148574e9c74dd13e5b004ec168ac Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Tue, 7 Jan 2025 06:11:14 +0000
Subject: [PATCH 45/49] resolve comments

---
 gpu4pyscf/grad/rhf.py                 |  7 ++++++-
 gpu4pyscf/hessian/jk.py               |  8 ++++++--
 gpu4pyscf/hessian/rhf.py              | 19 ++++++++++++++++---
 gpu4pyscf/lib/memcpy.py               |  6 ------
 gpu4pyscf/scf/j_engine.py             |  6 +++++-
 gpu4pyscf/scf/jk.py                   |  2 +-
 gpu4pyscf/tests/test_benchmark_rks.py |  2 +-
 7 files changed, 35 insertions(+), 15 deletions(-)

diff --git a/gpu4pyscf/grad/rhf.py b/gpu4pyscf/grad/rhf.py
index 0ee8cd43..dd374cc3 100644
--- a/gpu4pyscf/grad/rhf.py
+++ b/gpu4pyscf/grad/rhf.py
@@ -30,6 +30,7 @@
 from gpu4pyscf.__config__ import _streams, _num_devices
 from gpu4pyscf.df import int3c2e      #TODO: move int3c2e to out of df
 from gpu4pyscf.lib import logger
+from gpu4pyscf.scf import jk
 from gpu4pyscf.scf.jk import (
     LMAX, QUEUE_DEPTH, SHM_SIZE, THREADS, libvhf_rys, _VHFOpt, init_constant,
     _make_tril_tile_mappings, _nearest_power2)
@@ -124,7 +125,11 @@ def _jk_energy_per_atom(mol, dm, vhfopt=None,
     log = logger.new_logger(mol, verbose)
     cput0 = log.init_timer()
     if vhfopt is None:
-        vhfopt = _VHFOpt(mol).build()
+        # Small group size for load balance
+        group_size = None
+        if _num_devices > 1: 
+            group_size = jk.GROUP_SIZE
+        vhfopt = _VHFOpt(mol).build(group_size=group_size)
 
     mol = vhfopt.sorted_mol
     nao, nao_orig = vhfopt.coeff.shape
diff --git a/gpu4pyscf/hessian/jk.py b/gpu4pyscf/hessian/jk.py
index 6f17488d..65edff6b 100644
--- a/gpu4pyscf/hessian/jk.py
+++ b/gpu4pyscf/hessian/jk.py
@@ -26,7 +26,7 @@
 from pyscf import lib
 from pyscf.scf import _vhf
 from pyscf import __config__
-
+from gpu4pyscf.scf import jk
 from gpu4pyscf.scf.jk import (_make_tril_tile_mappings, quartets_scheme, QUEUE_DEPTH,
                               _VHFOpt, LMAX, init_constant, libvhf_rys)
 from gpu4pyscf.lib.cupy_helper import (condense, sandwich_dot, transpose_sum,
@@ -172,7 +172,11 @@ def get_jk(mol, dm, mo_coeff, mo_occ, hermi=0, vhfopt=None,
     cput0 = log.init_timer()
     assert hermi == 1
     if vhfopt is None:
-        vhfopt = _VHFOpt(mol).build()
+        # Small group size for load balance
+        group_size = None
+        if _num_devices > 1:
+            group_size = jk.GROUP_SIZE
+        vhfopt = _VHFOpt(mol).build(group_size=group_size)
 
     mol = vhfopt.sorted_mol
     nao, nao_orig = vhfopt.coeff.shape
diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py
index 52150457..2e10d049 100644
--- a/gpu4pyscf/hessian/rhf.py
+++ b/gpu4pyscf/hessian/rhf.py
@@ -269,7 +269,11 @@ def _partial_ejk_ip2(mol, dm, vhfopt=None, j_factor=1., k_factor=1., verbose=Non
     log = logger.new_logger(mol, verbose)
     cput0 = log.init_timer()
     if vhfopt is None:
-        vhfopt = _VHFOpt(mol).build()
+        # Small group size for load balance
+        group_size = None
+        if _num_devices > 1: 
+            group_size = jk.GROUP_SIZE
+        vhfopt = _VHFOpt(mol).build(group_size=group_size)
 
     mol = vhfopt.sorted_mol
     nao, nao_orig = vhfopt.coeff.shape
@@ -488,7 +492,11 @@ def _get_jk_ip1(mol, dm, with_j=True, with_k=True, atoms_slice=None, verbose=Non
     vhfopt = _VHFOpt(mol)
     # tile must set to 1. This tile size is assumed in the GPU kernel code
     vhfopt.tile = 1
-    vhfopt.build()
+    # Small group size for load balance
+    group_size = None
+    if _num_devices > 1: 
+        group_size = jk.GROUP_SIZE
+    vhfopt.build(group_size=group_size)
 
     mol = vhfopt.sorted_mol
     nao, nao_orig = vhfopt.coeff.shape
@@ -898,7 +906,12 @@ def _get_jk_mo(hessobj, mol, dms, mo_coeff, mo_occ,
     vhfopt = mf._opt_gpu.get(omega)
     if vhfopt is None:
         with mol.with_range_coulomb(omega):
-            vhfopt = mf._opt_gpu[omega] = _VHFOpt(mol, mf.direct_scf_tol).build()
+            # Small group size for load balance
+            group_size = None
+            if _num_devices > 1: 
+                group_size = jk.GROUP_SIZE
+            vhfopt = _VHFOpt(mol, mf.direct_scf_tol).build(group_size=group_size)
+            mf._opt_gpu[omega] = vhfopt
     with mol.with_range_coulomb(omega):
         vj, vk = jk.get_jk(mol, dms, mo_coeff, mo_occ, hermi, vhfopt, with_j, with_k)
     return vj, vk
diff --git a/gpu4pyscf/lib/memcpy.py b/gpu4pyscf/lib/memcpy.py
index 19b19e41..c961a9a2 100644
--- a/gpu4pyscf/lib/memcpy.py
+++ b/gpu4pyscf/lib/memcpy.py
@@ -76,12 +76,6 @@ def _copy_array(src_view, dst_view):
         kind = cupy.cuda.runtime.memcpyHostToDevice
     else:
         raise NotImplementedError
-    
-
-    if len(chunk_shape) == 0:
-        print('here')
-        print(src_view.nbytes, dst_view.nbytes)
-        print(shape, strides_src, strides_dst)
         
     assert len(chunk_shape) > 0
 
diff --git a/gpu4pyscf/scf/j_engine.py b/gpu4pyscf/scf/j_engine.py
index 2ecb5293..3d98ae5f 100644
--- a/gpu4pyscf/scf/j_engine.py
+++ b/gpu4pyscf/scf/j_engine.py
@@ -26,6 +26,7 @@
 from pyscf import __config__
 from gpu4pyscf.lib.cupy_helper import load_library, condense, sandwich_dot, transpose_sum
 from gpu4pyscf.__config__ import props as gpu_specs
+from gpu4pyscf.__config__ import _num_devices
 from gpu4pyscf.lib import logger
 from gpu4pyscf.scf import jk
 from gpu4pyscf.scf.jk import _make_j_engine_pair_locs, RysIntEnvVars, _scale_sp_ctr_coeff
@@ -51,7 +52,10 @@ def get_j(mol, dm, hermi=1, vhfopt=None, omega=None, verbose=None):
     cput0 = log.init_timer()
     if vhfopt is None:
         with mol.with_range_coulomb(omega):
-            vhfopt = _VHFOpt(mol).build()
+            groupsize = None
+            if _num_devices > 1:                
+                groupsize = jk.GROUP_SIZE
+            vhfopt = _VHFOpt(mol).build(group_size=groupsize)
     if omega is None:
         omega = mol.omega
 
diff --git a/gpu4pyscf/scf/jk.py b/gpu4pyscf/scf/jk.py
index 8577457d..0e328204 100644
--- a/gpu4pyscf/scf/jk.py
+++ b/gpu4pyscf/scf/jk.py
@@ -462,7 +462,7 @@ def __init__(self, mol, cutoff=1e-13):
         self._tile_q_cond = {}
         self._s_estimator = {}
 
-    def build(self, group_size=GROUP_SIZE, verbose=None):
+    def build(self, group_size=None, verbose=None):
         mol = self.mol
         log = logger.new_logger(mol, verbose)
         cput0 = log.init_timer()
diff --git a/gpu4pyscf/tests/test_benchmark_rks.py b/gpu4pyscf/tests/test_benchmark_rks.py
index ec294234..c367ac90 100644
--- a/gpu4pyscf/tests/test_benchmark_rks.py
+++ b/gpu4pyscf/tests/test_benchmark_rks.py
@@ -17,7 +17,7 @@
 import pyscf
 import pytest
 from gpu4pyscf.dft import rks
-
+CUDA_VISIBLE_DEVICES=0
 # Any task taking more than 1000s will be marked as 'slow'
 
 # How to run

From 9dbc08354022eb0da72ee00f2b44d649e1035d3a Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Tue, 7 Jan 2025 07:29:02 +0000
Subject: [PATCH 46/49] group_size in hessian

---
 gpu4pyscf/hessian/rhf.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py
index 2e10d049..8463e68c 100644
--- a/gpu4pyscf/hessian/rhf.py
+++ b/gpu4pyscf/hessian/rhf.py
@@ -35,7 +35,7 @@
 from gpu4pyscf.__config__ import _streams, _num_devices
 from gpu4pyscf.lib import logger
 from gpu4pyscf.scf.jk import (
-    LMAX, QUEUE_DEPTH, SHM_SIZE, THREADS, libvhf_rys, _VHFOpt, init_constant,
+    LMAX, QUEUE_DEPTH, SHM_SIZE, THREADS, GROUP_SIZE, libvhf_rys, _VHFOpt, init_constant,
     _make_tril_tile_mappings, _nearest_power2)
 from gpu4pyscf.grad import rhf as rhf_grad
 from gpu4pyscf.hessian import jk
@@ -272,7 +272,7 @@ def _partial_ejk_ip2(mol, dm, vhfopt=None, j_factor=1., k_factor=1., verbose=Non
         # Small group size for load balance
         group_size = None
         if _num_devices > 1: 
-            group_size = jk.GROUP_SIZE
+            group_size = GROUP_SIZE
         vhfopt = _VHFOpt(mol).build(group_size=group_size)
 
     mol = vhfopt.sorted_mol

From 3abc22674763e5546eae5a194d13714a90043262 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Tue, 7 Jan 2025 07:57:55 +0000
Subject: [PATCH 47/49] resolve possible memory leak

---
 gpu4pyscf/gto/int3c1e.py    | 14 ++++++++------
 gpu4pyscf/gto/int3c1e_ip.py | 12 ++++++++----
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/gpu4pyscf/gto/int3c1e.py b/gpu4pyscf/gto/int3c1e.py
index cab38b98..8e6ce88c 100644
--- a/gpu4pyscf/gto/int3c1e.py
+++ b/gpu4pyscf/gto/int3c1e.py
@@ -258,12 +258,13 @@ def get_int3c1e(mol, grids, charge_exponents, intopt):
 
             charge_exponents_pointer = c_null_ptr()
             if charge_exponents is not None:
-                charge_exponents_pointer = charge_exponents[p0:p1].data.ptr
-
+                exponents_slice = charge_exponents[p0:p1]
+                charge_exponents_pointer = exponents_slice.data.ptr
+            grids_slice = grids[p0:p1]
             err = libgint.GINTfill_int3c1e(
                 ctypes.cast(stream.ptr, ctypes.c_void_p),
                 intopt.bpcache,
-                ctypes.cast(grids[p0:p1, :].data.ptr, ctypes.c_void_p),
+                ctypes.cast(grids_slice.data.ptr, ctypes.c_void_p),
                 ctypes.cast(charge_exponents_pointer, ctypes.c_void_p),
                 ctypes.c_int(p1-p0),
                 ctypes.cast(int3c_angular_slice.data.ptr, ctypes.c_void_p),
@@ -441,16 +442,17 @@ def get_int3c1e_density_contracted(mol, grids, charge_exponents, dm, intopt):
 
             charge_exponents_pointer = c_null_ptr()
             if charge_exponents is not None:
-                charge_exponents_pointer = charge_exponents[p0:p1].data.ptr
+                exponents_slice = charge_exponents[p0:p1]
+                charge_exponents_pointer = exponents_slice.data.ptr
 
             # n_pair_sum_per_thread = 1 # means every thread processes one pair and one grid
             # n_pair_sum_per_thread = nao_cart # or larger number gaurantees one thread processes one grid and all pairs of the same type
             n_pair_sum_per_thread = nao_cart
-
+            grids_slice = grids[p0:p1, :]
             err = libgint.GINTfill_int3c1e_density_contracted(
                 ctypes.cast(stream.ptr, ctypes.c_void_p),
                 intopt.bpcache,
-                ctypes.cast(grids[p0:p1, :].data.ptr, ctypes.c_void_p),
+                ctypes.cast(grids_slice.data.ptr, ctypes.c_void_p),
                 ctypes.cast(charge_exponents_pointer, ctypes.c_void_p),
                 ctypes.c_int(p1-p0),
                 ctypes.cast(dm_pair_ordered.data.ptr, ctypes.c_void_p),
diff --git a/gpu4pyscf/gto/int3c1e_ip.py b/gpu4pyscf/gto/int3c1e_ip.py
index 717db68f..56e51662 100644
--- a/gpu4pyscf/gto/int3c1e_ip.py
+++ b/gpu4pyscf/gto/int3c1e_ip.py
@@ -78,12 +78,14 @@ def get_int3c1e_ip(mol, grids, charge_exponents, intopt):
 
             charge_exponents_pointer = c_null_ptr()
             if charge_exponents is not None:
-                charge_exponents_pointer = charge_exponents[p0:p1].data.ptr
+                exponents_slice = charge_exponents[p0:p1]
+                charge_exponents_pointer = exponents_slice.data.ptr
 
+            grids_slice = grids[p0:p1, :]
             err = libgint.GINTfill_int3c1e_ip(
                 ctypes.cast(stream.ptr, ctypes.c_void_p),
                 intopt.bpcache,
-                ctypes.cast(grids[p0:p1, :].data.ptr, ctypes.c_void_p),
+                ctypes.cast(grids_slice.data.ptr, ctypes.c_void_p),
                 ctypes.cast(charge_exponents_pointer, ctypes.c_void_p),
                 ctypes.c_int(p1-p0),
                 ctypes.cast(int3c_angular_slice.data.ptr, ctypes.c_void_p),
@@ -260,7 +262,9 @@ def get_int3c1e_ip2_density_contracted(mol, grids, charge_exponents, dm, intopt)
 
             charge_exponents_pointer = c_null_ptr()
             if charge_exponents is not None:
-                charge_exponents_pointer = charge_exponents[p0:p1].data.ptr
+                exponents_slice = charge_exponents[p0:p1]
+                charge_exponents_pointer = exponents_slice.data.ptr
+            grids_slice = grids[p0:p1].data.ptr
 
             # n_pair_sum_per_thread = 1 # means every thread processes one pair and one grid
             # n_pair_sum_per_thread = nao_cart # or larger number gaurantees one thread processes one grid and all pairs of the same type
@@ -269,7 +273,7 @@ def get_int3c1e_ip2_density_contracted(mol, grids, charge_exponents, dm, intopt)
             err = libgint.GINTfill_int3c1e_ip2_density_contracted(
                 ctypes.cast(stream.ptr, ctypes.c_void_p),
                 intopt.bpcache,
-                ctypes.cast(grids[p0:p1, :].data.ptr, ctypes.c_void_p),
+                ctypes.cast(grids_slice.data.ptr, ctypes.c_void_p),
                 ctypes.cast(charge_exponents_pointer, ctypes.c_void_p),
                 ctypes.c_int(p1-p0),
                 ctypes.cast(dm_pair_ordered.data.ptr, ctypes.c_void_p),

From 2f5ce8b6291e0fc8251c05d87dd9547d5ae08219 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Tue, 7 Jan 2025 17:32:35 +0000
Subject: [PATCH 48/49] bugfix

---
 gpu4pyscf/gto/int3c1e_ip.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gpu4pyscf/gto/int3c1e_ip.py b/gpu4pyscf/gto/int3c1e_ip.py
index 56e51662..c2aeb1be 100644
--- a/gpu4pyscf/gto/int3c1e_ip.py
+++ b/gpu4pyscf/gto/int3c1e_ip.py
@@ -264,7 +264,7 @@ def get_int3c1e_ip2_density_contracted(mol, grids, charge_exponents, dm, intopt)
             if charge_exponents is not None:
                 exponents_slice = charge_exponents[p0:p1]
                 charge_exponents_pointer = exponents_slice.data.ptr
-            grids_slice = grids[p0:p1].data.ptr
+            grids_slice = grids[p0:p1]
 
             # n_pair_sum_per_thread = 1 # means every thread processes one pair and one grid
             # n_pair_sum_per_thread = nao_cart # or larger number gaurantees one thread processes one grid and all pairs of the same type

From e04cb3fd02a4737b812abfc11099555706deaaa6 Mon Sep 17 00:00:00 2001
From: Xiaojie Wu <xiaojie.wu@bytedance.com>
Date: Tue, 7 Jan 2025 18:27:25 +0000
Subject: [PATCH 49/49] bugfix

---
 gpu4pyscf/hessian/rhf.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py
index 8463e68c..775a6e98 100644
--- a/gpu4pyscf/hessian/rhf.py
+++ b/gpu4pyscf/hessian/rhf.py
@@ -35,8 +35,8 @@
 from gpu4pyscf.__config__ import _streams, _num_devices
 from gpu4pyscf.lib import logger
 from gpu4pyscf.scf.jk import (
-    LMAX, QUEUE_DEPTH, SHM_SIZE, THREADS, GROUP_SIZE, libvhf_rys, _VHFOpt, init_constant,
-    _make_tril_tile_mappings, _nearest_power2)
+    LMAX, QUEUE_DEPTH, SHM_SIZE, THREADS, GROUP_SIZE, libvhf_rys, _VHFOpt, 
+    init_constant, _make_tril_tile_mappings, _nearest_power2)
 from gpu4pyscf.grad import rhf as rhf_grad
 from gpu4pyscf.hessian import jk
 
@@ -495,7 +495,7 @@ def _get_jk_ip1(mol, dm, with_j=True, with_k=True, atoms_slice=None, verbose=Non
     # Small group size for load balance
     group_size = None
     if _num_devices > 1: 
-        group_size = jk.GROUP_SIZE
+        group_size = GROUP_SIZE
     vhfopt.build(group_size=group_size)
 
     mol = vhfopt.sorted_mol
@@ -909,7 +909,7 @@ def _get_jk_mo(hessobj, mol, dms, mo_coeff, mo_occ,
             # Small group size for load balance
             group_size = None
             if _num_devices > 1: 
-                group_size = jk.GROUP_SIZE
+                group_size = GROUP_SIZE
             vhfopt = _VHFOpt(mol, mf.direct_scf_tol).build(group_size=group_size)
             mf._opt_gpu[omega] = vhfopt
     with mol.with_range_coulomb(omega):