From 8cb5ce49d5bb183a1649b04c659af33ce08d762e Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Thu, 19 Dec 2024 21:39:56 +0000 Subject: [PATCH 01/49] refactor hessian class --- gpu4pyscf/df/hessian/jk.py | 432 ++++++++++++++++++ gpu4pyscf/df/hessian/rhf.py | 111 +++-- gpu4pyscf/df/hessian/rks.py | 7 +- .../df/hessian/tests/test_df_rhf_hessian.py | 145 ++++++ .../df/hessian/tests/test_df_rks_hessian.py | 107 +++++ .../df/hessian/tests/test_df_uhf_hessian.py | 6 +- gpu4pyscf/df/hessian/uhf.py | 72 +-- gpu4pyscf/df/hessian/uks.py | 1 + gpu4pyscf/df/tests/test_df_hessian.py | 3 +- gpu4pyscf/hessian/jk.py | 296 ++++++++++++ gpu4pyscf/hessian/rhf.py | 56 ++- gpu4pyscf/hessian/rks.py | 58 ++- gpu4pyscf/hessian/tests/test_rhf_hessian.py | 2 +- gpu4pyscf/hessian/uhf.py | 47 +- gpu4pyscf/hessian/uks.py | 64 ++- gpu4pyscf/properties/ir.py | 3 +- gpu4pyscf/scf/jk.py | 36 +- gpu4pyscf/scf/tests/test_scf_jk.py | 26 +- gpu4pyscf/solvent/hessian/pcm.py | 25 + gpu4pyscf/solvent/hessian/smd.py | 26 ++ gpu4pyscf/solvent/tests/test_smd_hessian.py | 1 + gpu4pyscf/tests/test_dft.py | 2 +- 22 files changed, 1334 insertions(+), 192 deletions(-) create mode 100644 gpu4pyscf/df/hessian/jk.py create mode 100644 gpu4pyscf/df/hessian/tests/test_df_rhf_hessian.py create mode 100644 gpu4pyscf/df/hessian/tests/test_df_rks_hessian.py create mode 100644 gpu4pyscf/hessian/jk.py diff --git a/gpu4pyscf/df/hessian/jk.py b/gpu4pyscf/df/hessian/jk.py new file mode 100644 index 00000000..16010bda --- /dev/null +++ b/gpu4pyscf/df/hessian/jk.py @@ -0,0 +1,432 @@ +# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import ctypes +import itertools +import numpy as np +from concurrent.futures import ThreadPoolExecutor +import cupy +from pyscf import gto +from gpu4pyscf.df import int3c2e +from gpu4pyscf.scf.int4c2e import libgint +from gpu4pyscf.hessian.jk import _ao2mo +from gpu4pyscf.lib import logger +from gpu4pyscf.lib.cupy_helper import contract, cart2sph, reduce_to_device +from gpu4pyscf.__config__ import _streams, _num_devices + +NROOT_ON_GPU = 7 + +def _jk_task_with_mo1(dfobj, dms, mo_coeff, mo1s, occ_coeffs, + with_j=True, with_k=True, hermi=0, device_id=0): + ''' Calculate J and K matrices with mo response + For CP-HF + ''' + with cupy.cuda.Device(device_id), _streams[device_id]: + assert isinstance(dfobj.verbose, int) + log = logger.new_logger(dfobj.mol, dfobj.verbose) + t0 = log.init_timer() + dms = cupy.asarray(dms) + n_dm = dms.shape[0] + mo1s = [cupy.asarray(mo1) for mo1 in mo1s] + occ_coeffs = [cupy.asarray(occ_coeff) for occ_coeff in occ_coeffs] + mo_coeff = [cupy.asarray(mo) for mo in mo_coeff] + nao = dms.shape[-1] + intopt = dfobj.intopt + rows = intopt.cderi_row + cols = intopt.cderi_col + dms_shape = dms.shape + if with_j: + dm_sparse = dms[:,rows,cols] + if hermi == 0: + dm_sparse += dms[:,cols,rows] + else: + dm_sparse *= 2 + dm_sparse[:, intopt.cderi_diag] *= .5 + + if with_k: + vks = [cupy.zeros_like(mo1) for mo1 in mo1s] + + if with_j: + vj_sparse = cupy.zeros_like(dm_sparse) + + nocc = max([mo1.shape[2] for mo1 in mo1s]) + blksize = dfobj.get_blksize(extra=2*nao*nocc) + for cderi, cderi_sparse in dfobj.loop(blksize=blksize, unpack=with_k): + if with_j: + rhoj = dm_sparse.dot(cderi_sparse) + vj_sparse += cupy.dot(rhoj, cderi_sparse.T) + rhoj = None + cderi_sparse = None + if with_k: + for occ_coeff, mo1, vk in zip(occ_coeffs, mo1s, vks): + nocc = occ_coeff.shape[1] + rhok = contract('Lij,jo->Loi', cderi, occ_coeff) + rhok_oo = contract('Loi,ip->Lop', rhok, occ_coeff).reshape([-1,nocc]) + rhok = rhok.reshape([-1,nao]) + for i in range(mo1.shape[0]): + rhok1 = contract('Lij,jo->Loi', cderi, mo1[i]) + rhok1 = rhok1.reshape([-1,nao]) + vk[i] += cupy.dot(rhok1.T, rhok_oo) + + rhok1 = rhok1.reshape([-1,nocc,nao]) + rhok1 = contract('Loi,ip->Lop', rhok1, occ_coeff) + rhok1 = rhok1.reshape([-1,nocc]) + vk[i] += cupy.dot(rhok.T, rhok1) + mo1 = rhok1 = rhok = rhok_oo = None + cderi = None + mo1s = None + if with_j: + vj = cupy.zeros(dms_shape) + vj[:,rows,cols] = vj_sparse + vj[:,cols,rows] = vj_sparse + + vj_mo = vk_mo = None + if len(occ_coeffs) == 1: + # Restricted case + mo = mo_coeff[0] + if with_j: + vj_mo = _ao2mo(vj, occ_coeffs[0], mo).reshape(n_dm,-1) + vj = None + mo *= 2.0 # Due to double occupancy + if with_k: + vk_mo = contract('nio,ip->npo', vks[0], mo).reshape(n_dm,-1) + elif len(occ_coeffs) == 2: + # Unrestricted case + n_dm_2 = n_dm // 2 + mocca, moccb = occ_coeffs + moa, mob = mo_coeff + nmoa, nmob = moa.shape[1], mob.shape[1] + nocca, noccb = mocca.shape[1], moccb.shape[1] + + if with_j: + vjab = vj[:n_dm_2] + vj[n_dm_2:] + vj = None + vj_mo = cupy.empty([n_dm_2,nmoa*nocca+nmob*noccb]) + vj_mo[:,:nmoa*nocca] = _ao2mo(vjab, mocca, moa).reshape(n_dm_2,-1) + vj_mo[:,nmoa*nocca:] = _ao2mo(vjab, moccb, mob).reshape(n_dm_2,-1) + vjab = None + + if with_k: + vka, vkb = vks + vk_mo = cupy.empty([n_dm_2,nmoa*nocca+nmob*noccb]) + vk_mo[:,:nmoa*nocca] = contract('nio,ip->npo', vka, moa).reshape(n_dm_2,-1) + vk_mo[:,nmoa*nocca:] = contract('nio,ip->npo', vkb, mob).reshape(n_dm_2,-1) + + t0 = log.timer_debug1(f'vj and vk on Device {device_id}', *t0) + return vj_mo, vk_mo + +def get_jk(dfobj, dms_tag, mo_coeff, mocc, hermi=0, + with_j=True, with_k=True, direct_scf_tol=1e-14, omega=None): + ''' Compute J/K in MO with density fitting + ''' + + log = logger.new_logger(dfobj.mol, dfobj.verbose) + if not isinstance(dms_tag, cupy.ndarray): + dms_tag = cupy.asarray(dms_tag) + + assert(with_j or with_k) + if dms_tag is None: logger.error("dm is not given") + nao = dms_tag.shape[-1] + t1 = t0 = log.init_timer() + if dfobj._cderi is None: + log.debug('Build CDERI ...') + dfobj.build(direct_scf_tol=direct_scf_tol, omega=omega) + t1 = log.timer_debug1('init jk', *t0) + + assert nao == dfobj.nao + intopt = dfobj.intopt + + nao = dms_tag.shape[-1] + dms = dms_tag.reshape([-1,nao,nao]) + intopt = dfobj.intopt + dms = intopt.sort_orbitals(dms, axis=[1,2]) + + cupy.cuda.get_current_stream().synchronize() + occ_coeffs = dms_tag.occ_coeff + mo1s = dms_tag.mo1 + + if not isinstance(occ_coeffs, (tuple, list)): + occ_coeffs = [occ_coeffs] + mo1s = [mo1s] + mo_coeff = [mo_coeff] + else: + assert isinstance(mo1s, (tuple, list)) + mo_coeff = [mo_coeff[0], mo_coeff[1]] + + occ_coeffs = [intopt.sort_orbitals(occ_coeff, axis=[0]) for occ_coeff in occ_coeffs] + mo1s = [intopt.sort_orbitals(mo1, axis=[1]) for mo1 in mo1s] + mo_coeff = [intopt.sort_orbitals(mo, axis=[0]) for mo in mo_coeff] + + futures = [] + with ThreadPoolExecutor(max_workers=_num_devices) as executor: + for device_id in range(_num_devices): + future = executor.submit( + _jk_task_with_mo1, + dfobj, dms, mo_coeff, mo1s, occ_coeffs, + hermi=hermi, device_id=device_id, + with_j=with_j, with_k=with_k) + futures.append(future) + + vj = vk = None + if with_j: + vj = [future.result()[0] for future in futures] + vj = reduce_to_device(vj, inplace=True) + + if with_k: + vk = [future.result()[1] for future in futures] + vk = reduce_to_device(vk, inplace=True) + t1 = log.timer_debug1('vj and vk', *t1) + return vj, vk + + +def _get_int3c2e_ipip_slice(ip_type, intopt, cp_ij_id, aux_id, omega=None, stream=None): + + if omega is None: omega = 0.0 + if stream is None: stream = cupy.cuda.get_current_stream() + + fn = getattr(libgint, 'GINTfill_int3c2e_' + ip_type) + + nao = intopt._sorted_mol.nao + naux = intopt._sorted_auxmol.nao + norb = nao + naux + 1 + comp = 9 + order = 2 + + lmax = intopt._sorted_mol._bas[:gto.ANG_OF].max() + aux_lmax = intopt._sorted_auxmol._bas[:gto.ANG_OF].max() + nroots = (lmax + aux_lmax + order)//2 + 1 + if nroots > NROOT_ON_GPU: + from pyscf.gto.moleintor import getints, make_cintopt + pmol = intopt._tot_mol + intor = pmol._add_suffix('int3c2e_' + ip_type) + opt = make_cintopt(pmol._atm, pmol._bas, pmol._env, intor) + + nbins = 1 + + cp_kl_id = aux_id + len(intopt.log_qs) + lk = intopt.aux_angular[aux_id] + + cpi = intopt.cp_idx[cp_ij_id] + cpj = intopt.cp_jdx[cp_ij_id] + li = intopt.angular[cpi] + lj = intopt.angular[cpj] + + i0, i1 = intopt.cart_ao_loc[cpi], intopt.cart_ao_loc[cpi+1] + j0, j1 = intopt.cart_ao_loc[cpj], intopt.cart_ao_loc[cpj+1] + k0, k1 = intopt.cart_aux_loc[aux_id], intopt.cart_aux_loc[aux_id+1] + ni = i1 - i0 + nj = j1 - j0 + nk = k1 - k0 + + log_q_ij = intopt.log_qs[cp_ij_id] + log_q_kl = intopt.aux_log_qs[aux_id] + + bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32) + bins_locs_kl = np.array([0, len(log_q_kl)], dtype=np.int32) + + ao_offsets = np.array([i0,j0,nao+1+k0,nao], dtype=np.int32) + strides = np.array([1, ni, ni*nj, ni*nj*nk], dtype=np.int32) + + # Use GPU kernels for low-angular momentum + if (li + lj + lk + order)//2 + 1 < NROOT_ON_GPU: + int3c_blk = cupy.zeros([comp, nk, nj, ni], order='C', dtype=np.float64) + err = fn( + ctypes.cast(stream.ptr, ctypes.c_void_p), + intopt.bpcache, + ctypes.cast(int3c_blk.data.ptr, ctypes.c_void_p), + ctypes.c_int(norb), + strides.ctypes.data_as(ctypes.c_void_p), + ao_offsets.ctypes.data_as(ctypes.c_void_p), + bins_locs_ij.ctypes.data_as(ctypes.c_void_p), + bins_locs_kl.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nbins), + ctypes.c_int(cp_ij_id), + ctypes.c_int(cp_kl_id), + ctypes.c_double(omega)) + if err != 0: + raise RuntimeError(f'GINT_fill_int3c2e general failed, err={err}') + else: + # TODO: sph2cart in CPU? + ishl0, ishl1 = intopt.l_ctr_offsets[cpi], intopt.l_ctr_offsets[cpi+1] + jshl0, jshl1 = intopt.l_ctr_offsets[cpj], intopt.l_ctr_offsets[cpj+1] + kshl0, kshl1 = intopt.l_ctr_offsets[aux_id+1+intopt.nctr], intopt.l_ctr_offsets[aux_id+1+intopt.nctr+1] + shls_slice = np.array([ishl0, ishl1, jshl0, jshl1, kshl0, kshl1], dtype=np.int64) + int3c_cpu = getints(intor, pmol._atm, pmol._bas, pmol._env, shls_slice, cintopt=opt).transpose([0,3,2,1]) + int3c_blk = cupy.asarray(int3c_cpu) + + if not intopt.auxmol.cart: + int3c_blk = cart2sph(int3c_blk, axis=1, ang=lk) + if not intopt.mol.cart: + int3c_blk = cart2sph(int3c_blk, axis=2, ang=lj) + int3c_blk = cart2sph(int3c_blk, axis=3, ang=li) + + return int3c_blk + + +def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo, + device_id=0, with_k=True, omega=None, auxbasis_response=1): + natm = intopt.mol.natm + nao = dm0.shape[0] + naux = rhok.shape[0] + ao_loc = intopt.ao_loc + aux_ao_loc = intopt.aux_ao_loc + with cupy.cuda.Device(device_id), _streams[device_id]: + log = logger.new_logger(intopt.mol, intopt.mol.verbose) + t0 = log.init_timer() + rhoj = cupy.asarray(rhoj) + rhok = cupy.asarray(rhok) + orbo = cupy.asarray(orbo) + dm0 = cupy.asarray(dm0) + nao = dm0.shape[0] + + hj_ipip1 = cupy.zeros([nao,9]) + hj_ipip2 = cupy.zeros([naux,9]) + hj_ip1ip2 = cupy.zeros([nao,naux,9]) + hj_ipvip1 = cupy.zeros([nao,nao,9]) + if with_k: + hk_ipip1 = cupy.zeros([nao,9]) + hk_ipip2 = cupy.zeros([naux,9]) + hk_ip1ip2 = cupy.zeros([nao,naux,9]) + hk_ipvip1 = cupy.zeros([nao,nao,9]) + + for aux_id, cp_ij_id in task_list: + cpi = intopt.cp_idx[cp_ij_id] + cpj = intopt.cp_jdx[cp_ij_id] + i0, i1 = ao_loc[cpi], ao_loc[cpi+1] + j0, j1 = ao_loc[cpj], ao_loc[cpj+1] + k0, k1 = aux_ao_loc[aux_id], aux_ao_loc[aux_id+1] + + if with_k: + rhok_tmp = contract('por,ir->pio', rhok[k0:k1], orbo[i0:i1]) + rhok_tmp = contract('pio,jo->pij', rhok_tmp, orbo[j0:j1]) + + # (20|0), (0|0)(0|00) + int3c_blk = _get_int3c2e_ipip_slice('ipip1', intopt, cp_ij_id, aux_id, omega=omega) + tmp = contract('xpji,ij->xpi', int3c_blk, dm0[i0:i1,j0:j1]) + hj_ipip1[i0:i1] += contract('xpi,p->ix', tmp, rhoj[k0:k1]) + if with_k: + hk_ipip1[i0:i1] += contract('xpji,pij->ix', int3c_blk, rhok_tmp) + + # (11|0), (0|0)(0|00) without response of RI basis + int3c_blk = _get_int3c2e_ipip_slice('ipvip1', intopt, cp_ij_id, aux_id, omega=omega) + tmp = contract('xpji,ij->xpij', int3c_blk, dm0[i0:i1,j0:j1]) + hj_ipvip1[i0:i1,j0:j1] += contract('xpij,p->ijx', tmp, rhoj[k0:k1]) + if with_k: + hk_ipvip1[i0:i1,j0:j1] += contract('xpji,pij->ijx', int3c_blk, rhok_tmp) + + if auxbasis_response < 1: + continue + + # (10|1), (0|0)(0|00) + int3c_blk = _get_int3c2e_ipip_slice('ip1ip2', intopt, cp_ij_id, aux_id, omega=omega) + tmp = contract('xpji,ij->xpi', int3c_blk, dm0[i0:i1,j0:j1]) + hj_ip1ip2[i0:i1,k0:k1] += contract('xpi,p->ipx', tmp, rhoj[k0:k1]) + if with_k: + hk_ip1ip2[i0:i1,k0:k1] += contract('xpji,pij->ipx', int3c_blk, rhok_tmp) + + if auxbasis_response < 2: + continue + + # (00|2), (0|0)(0|00) + int3c_blk = _get_int3c2e_ipip_slice('ipip2', intopt, cp_ij_id, aux_id, omega=omega) + tmp = contract('xpji,ij->xp', int3c_blk, dm0[i0:i1,j0:j1]) + hj_ipip2[k0:k1] += contract('xp,p->px', tmp, rhoj[k0:k1]) + if with_k: + hk_ipip2[k0:k1] += contract('xpji,pij->px', int3c_blk, rhok_tmp) + + auxslices = intopt.auxmol.aoslice_by_atom() + aoslices = intopt.mol.aoslice_by_atom() + ao2atom = int3c2e.get_ao2atom(intopt, aoslices) + aux2atom = int3c2e.get_aux2atom(intopt, auxslices) + + hj_ipvip1 = hj_ipvip1.reshape([nao,nao,3,3]) + tmp = contract('ia,ijxy->ajxy', ao2atom, hj_ipvip1) + hj = 2.0 * contract('jb,ajxy->abxy', ao2atom, tmp) + + hj_ipip1 = hj_ipip1.reshape([nao,3,3]) + tmp = contract('ia,ixy->axy', ao2atom, hj_ipip1) + hj[range(natm), range(natm)] += 2.0 * tmp + + hk = None + if with_k: + hk_ipvip1 = hk_ipvip1.reshape([nao,nao,3,3]) + tmp = contract('ia,ijxy->ajxy', ao2atom, hk_ipvip1) + hk = contract('jb,ajxy->abxy', ao2atom, tmp) + + hk_ipip1 = hk_ipip1.reshape([nao,3,3]) + tmp = contract('ia,ixy->axy', ao2atom, hk_ipip1) + hk[range(natm), range(natm)] += tmp + + if auxbasis_response > 0: + hj_ip1ip2 = hj_ip1ip2.reshape([nao,naux,3,3]) + tmp = contract('ia,ijxy->ajxy', ao2atom, hj_ip1ip2) + tmp = contract('jb,ajxy->abxy',aux2atom, tmp) + tmp = tmp + tmp.transpose([1,0,3,2]) + hj += tmp + if auxbasis_response > 1: + hj += tmp + if with_k: + hk_ip1ip2 = hk_ip1ip2.reshape([nao,naux,3,3]) + tmp = contract('ia,ijxy->ajxy', ao2atom, hk_ip1ip2) + tmp = contract('jb,ajxy->abxy', aux2atom, tmp) + tmp = 0.5 * (tmp + tmp.transpose([1,0,3,2])) + hk += tmp + if auxbasis_response > 1: + hk += tmp + + if auxbasis_response > 1: + hj_ipip2 = hj_ipip2.reshape([naux,3,3]) + tmp = contract('ia,ixy->axy', aux2atom, hj_ipip2) + hj[range(natm), range(natm)] += tmp + if with_k: + hk_ipip2 = hk_ipip2.reshape([naux,3,3]) + tmp = contract('ia,ixy->axy', aux2atom, hk_ipip2) + hk[range(natm), range(natm)] += .5 * tmp + t0 = log.timer_debug1(f'int3c2e_ipip on Device {device_id}', *t0) + return hj, hk + +def get_int3c2e_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, + omega=None, auxbasis_response=1): + orbo = cupy.asarray(dm0_tag.occ_coeff, order='C') + futures = [] + ncp_k = len(intopt.aux_log_qs) + ncp_ij = len(intopt.log_qs) + tasks = np.array(list(itertools.product(range(ncp_k), range(ncp_ij)))) + task_list = [] + for device_id in range(_num_devices): + task_list.append(tasks[device_id::_num_devices]) + + cupy.cuda.get_current_stream().synchronize() + with ThreadPoolExecutor(max_workers=_num_devices) as executor: + for device_id in range(_num_devices): + future = executor.submit( + _int3c2e_ipip_tasks, intopt, task_list[device_id], + rhoj, rhok, dm0_tag, orbo, with_k=with_k, + device_id=device_id, omega=omega, + auxbasis_response=auxbasis_response) + futures.append(future) + + hj_total = [] + hk_total = [] + for future in futures: + hj, hk = future.result() + hj_total.append(hj) + hk_total.append(hk) + + hj = hk = None + hj = reduce_to_device(hj_total, inplace=True) + if with_k: + hk = reduce_to_device(hk_total, inplace=True) + return hj, hk diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py index e1a25ec3..9471c849 100644 --- a/gpu4pyscf/df/hessian/rhf.py +++ b/gpu4pyscf/df/hessian/rhf.py @@ -42,7 +42,7 @@ from gpu4pyscf.lib import logger from gpu4pyscf import __config__ from gpu4pyscf.df.grad.rhf import _gen_metric_solver -from gpu4pyscf.gto.mole import sort_atoms +from gpu4pyscf.df.hessian import jk LINEAR_DEP_THR = df.LINEAR_DEP_THR BLKSIZE = 128 @@ -60,9 +60,10 @@ def _hk_ip1_ip1(rhok1_Pko, dm0, mocc_2): ''' nnz = rhok1_Pko.shape[0] nao = dm0.shape[0] - mem_avail = get_avail_mem() - blksize = int((mem_avail*0.4/(nao*nao*3*8)/ALIGNED))*ALIGNED hk_ao_ao = cupy.zeros([nao,nao,3,3]) + cupy.get_default_memory_pool().free_all_blocks() + mem_avail = get_avail_mem() + blksize = int((mem_avail*0.2/(nao*nao*3*8)/ALIGNED))*ALIGNED for k0, k1 in lib.prange(0,nnz,blksize): rhok1_Pko_kslice = cupy.asarray(rhok1_Pko[k0:k1]) @@ -77,7 +78,6 @@ def _hk_ip1_ip1(rhok1_Pko, dm0, mocc_2): rhok1_Pkl_kslice = None return hk_ao_ao - def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None, max_memory=4000, verbose=None, with_k=True, omega=None): '''Partial derivative @@ -216,41 +216,11 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, wk1_Pko = rhok1_Pko = None t1 = log.timer_debug1('intermediate variables with int3c2e_ip1', *t1) - cupy.get_default_memory_pool().free_all_blocks() - # int3c_ipip1 contributions - hj_ao_diag, hk_ao_diag = int3c2e.get_int3c2e_hjk(intopt, 'ipip1', rhoj0_P, rhok0_P__, dm0_tag, omega=omega, with_k=with_k) - hj_ao_diag *= 2.0 - t1 = log.timer_debug1('intermediate variables with int3c2e_ipip1', *t1) - - # int3c_ipvip1 contributions - # (11|0), (0|00) without response of RI basis - hj, hk = int3c2e.get_int3c2e_hjk(intopt, 'ipvip1', rhoj0_P, rhok0_P__, dm0_tag, omega=omega, with_k=with_k) - hj_ao_ao += 2.0*hj - if with_k: - hk_ao_ao += hk - hj = hk = None - t1 = log.timer_debug1('intermediate variables with int3c2e_ipvip1', *t1) + hj_ipip, hk_ipip = jk.get_int3c2e_hjk(intopt, rhoj0_P, rhok0_P__, dm0_tag, + with_k=with_k, omega=omega, + auxbasis_response=hessobj.auxbasis_response) + t1 = log.timer_debug1('intermediate variables with int3c2e_ipip', *t1) - # int3c_ip1ip2 contributions - # (10|1), (0|0)(0|00) - if hessobj.auxbasis_response: - hj, hk = int3c2e.get_int3c2e_hjk(intopt, 'ip1ip2', rhoj0_P, rhok0_P__, dm0_tag, omega=omega, with_k=with_k) - hj_ao_aux += hj - if with_k: - hk_ao_aux += hk - hj = hk = None - t1 = log.timer_debug1('intermediate variables with int3c2e_ip1ip2', *t1) - - # int3c_ipip2 contributions - if hessobj.auxbasis_response > 1: - # (00|2), (0|0)(0|00) - hj, hk = int3c2e.get_int3c2e_hjk(intopt, 'ipip2', rhoj0_P, rhok0_P__, dm0_tag, omega=omega, with_k=with_k) - hj_aux_diag = hj - if with_k: - hk_aux_diag = .5*hk - hj = hk = None - t1 = log.timer_debug1('intermediate variables with int3c2e_ipip2', *t1) - # int2c contributions if hessobj.auxbasis_response > 1: if omega and omega > 1e-10: @@ -263,10 +233,10 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, rhoj2c_P = contract('xpq,q->xp', int2c_ipip1, rhoj0_P) # (00|0)(2|0)(0|00) # p,xp->px - hj_aux_diag -= (rhoj0_P*rhoj2c_P).T.reshape(-1,3,3) + hj_aux_diag = -(rhoj0_P*rhoj2c_P).T.reshape(-1,3,3) if with_k: rho2c_0 = contract('pij,qji->pq', rhok0_P__, rhok0_P__) - hk_aux_diag -= .5 * contract('pq,xpq->px', rho2c_0, int2c_ipip1).reshape(-1,3,3) + hk_aux_diag = -.5 * contract('pq,xpq->px', rho2c_0, int2c_ipip1).reshape(-1,3,3) int2c_ipip1 = None if omega and omega > 1e-10: @@ -334,20 +304,16 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, t1 = log.timer_debug1('contract int2c_*', *t1) dm0 = intopt.unsort_orbitals(dm0, axis=[0,1]) - hj_ao_diag = intopt.unsort_orbitals(hj_ao_diag, axis=[0]) hj_ao_ao = intopt.unsort_orbitals(hj_ao_ao, axis=[0,1]) if hessobj.auxbasis_response: hj_ao_aux = intopt.unsort_orbitals(hj_ao_aux, axis=[0], aux_axis=[1]) if hessobj.auxbasis_response > 1: - hj_aux_diag = intopt.unsort_orbitals(hj_aux_diag, aux_axis=[0]) hj_aux_aux = intopt.unsort_orbitals(hj_aux_aux, aux_axis=[0,1]) if with_k: - hk_ao_diag = intopt.unsort_orbitals(hk_ao_diag, axis=[0]) hk_ao_ao = intopt.unsort_orbitals(hk_ao_ao, axis=[0,1]) if hessobj.auxbasis_response: hk_ao_aux = intopt.unsort_orbitals(hk_ao_aux, axis=[0], aux_axis=[1]) if hessobj.auxbasis_response > 1: - hk_aux_diag = intopt.unsort_orbitals(hk_aux_diag, aux_axis=[0]) hk_aux_aux = intopt.unsort_orbitals(hk_aux_aux, aux_axis=[0,1]) #======================================== sort AO end =========================================== # Energy weighted density matrix @@ -368,14 +334,12 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, # collecting all # ----------------------------------------- e1 = cupy.zeros([len(atmlst),len(atmlst),3,3]) - ej = cupy.zeros([len(atmlst),len(atmlst),3,3]) - ek = cupy.zeros([len(atmlst),len(atmlst),3,3]) + ej = hj_ipip + ek = hk_ipip + for i0, ia in enumerate(atmlst): shl0, shl1, p0, p1 = aoslices[ia] e1[i0,i0] -= cupy.sum(h1aa[p0:p1], axis=0) - ej[i0,i0] += cupy.sum(hj_ao_diag[p0:p1,:,:], axis=0) - if with_k: - ek[i0,i0] += cupy.sum(hk_ao_diag[p0:p1,:,:], axis=0) for j0, ja in enumerate(atmlst[:i0+1]): q0, q1 = aoslices[ja][2:] ej[i0,j0] += cupy.sum(hj_ao_ao[p0:p1,q0:q1], axis=[0,1]) @@ -408,9 +372,6 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, # if hessobj.auxbasis_response > 1: shl0, shl1, p0, p1 = auxslices[ia] - ej[i0,i0] += cupy.sum(hj_aux_diag[p0:p1], axis=0) - if with_k: - ek[i0,i0] += cupy.sum(hk_aux_diag[p0:p1], axis=0) for j0, (q0, q1) in enumerate(auxslices[:,2:]): _ej = cupy.sum(hj_aux_aux[p0:p1,q0:q1], axis=[0,1]) ej[i0,j0] += _ej @@ -423,8 +384,21 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, for j0 in range(i0): e1[j0,i0] = e1[i0,j0].T ej[j0,i0] = ej[i0,j0].T - ek[j0,i0] = ek[i0,j0].T + if with_k: + ek[j0,i0] = ek[i0,j0].T + t1 = log.timer_debug1('hcore contribution', *t1) + + aux2atom = int3c2e.get_aux2atom(intopt, auxslices) + + natm = mol.natm + idx = range(natm) + # Diagonal contributions + if hessobj.auxbasis_response > 1: + ej[idx, idx] += contract('ia,ixy->axy', aux2atom, hj_aux_diag) + if with_k: + ek[idx, idx] += contract('ia,ixy->axy', aux2atom, hk_aux_diag) + log.timer('RHF partial hessian', *time0) return e1, ej, ek @@ -501,9 +475,6 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, if isinstance(wk_Pl_, cupy.ndarray): rhok0_Pl_ = solve_j2c(wk_Pl_) else: - #rhok0_Pl_ = np.empty_like(wk_Pl_) - #mem = cupy.cuda.alloc_pinned_memory(wk_Pl_.nbytes) - #rhok0_Pl_ = np.ndarray(wk_Pl_.shape, dtype=np.float64, order='C', buffer=mem) rhok0_Pl_ = wk_Pl_ # reuse the memory for p0, p1 in lib.prange(0,nao,64): wk_tmp = cupy.asarray(wk_Pl_[:,p0:p1]) @@ -624,14 +595,36 @@ def _ao2mo(mat): vk1 = vk1_int3c[ia] + _ao2mo(vk1_ao) yield ia, h1, vj1, vk1 +def _get_jk_mo(hessobj, mol, dms, mo_coeff, mocc, + hermi=1, with_j=True, with_k=True, omega=None): + mf = hessobj.base + dfobj = mf.with_df + if omega is None: + return jk.get_jk(dfobj, dms, mo_coeff, mocc, + hermi=hermi, with_j=with_j, with_k=with_k) + + # A temporary treatment for RSH-DF integrals + key = '%.6f' % omega + if key in dfobj._rsh_df: + rsh_df = dfobj._rsh_df[key] + else: + rsh_df = dfobj._rsh_df[key] = dfobj.copy().reset() + logger.info(dfobj, 'Create RSH-DF object %s for omega=%s', rsh_df, omega) + + with rsh_df.mol.with_range_coulomb(omega): + return jk.get_jk(rsh_df, dms, mo_coeff, mocc, + hermi=hermi, with_j=with_j, with_k=with_k, omega=omega) + + class Hessian(rhf_hess.Hessian): '''Non-relativistic restricted Hartree-Fock hessian''' from gpu4pyscf.lib.utils import to_gpu, device - __init__ = rhf_hess.Hessian.__init__ + #__init__ = rhf_hess.Hessian.__init__ auxbasis_response = 1 partial_hess_elec = partial_hess_elec make_h1 = make_h1 - kernel = rhf_hess.kernel - hess = kernel + #kernel = rhf_hess.kernel + #hess = kernel + get_jk_mo = _get_jk_mo diff --git a/gpu4pyscf/df/hessian/rks.py b/gpu4pyscf/df/hessian/rks.py index 014142fa..ad5dc96e 100644 --- a/gpu4pyscf/df/hessian/rks.py +++ b/gpu4pyscf/df/hessian/rks.py @@ -115,9 +115,10 @@ class Hessian(rks_hess.Hessian): '''Non-relativistic RKS hessian''' from gpu4pyscf.lib.utils import to_gpu, device - __init__ = rks_hess.Hessian.__init__ + #__init__ = rks_hess.Hessian.__init__ auxbasis_response = 1 partial_hess_elec = partial_hess_elec make_h1 = make_h1 - kernel = rhf_hess.kernel - hess = kernel + #kernel = rhf_hess.kernel + #hess = kernel + get_jk_mo = df_rhf_hess._get_jk_mo diff --git a/gpu4pyscf/df/hessian/tests/test_df_rhf_hessian.py b/gpu4pyscf/df/hessian/tests/test_df_rhf_hessian.py new file mode 100644 index 00000000..b8560002 --- /dev/null +++ b/gpu4pyscf/df/hessian/tests/test_df_rhf_hessian.py @@ -0,0 +1,145 @@ +# Copyright 2024 The GPU4PySCF Authors. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import unittest +import numpy +import cupy +from pyscf import gto, scf +from pyscf.df.hessian import rhf as df_rhf_cpu +from pyscf.hessian import rhf as rhf_cpu +from gpu4pyscf.df.hessian import rhf as df_rhf_gpu +from gpu4pyscf.hessian import rhf as rhf_gpu + +def setUpModule(): + global mol + mol = gto.Mole() + mol.verbose = 1 + mol.output = '/dev/null' + mol.atom.extend([ + ["O" , (0. , 0. , 0.)], + [1 , (0. , -0.757 , 0.587)], + [1 , (0. , 0.757 , 0.587)] ]) + mol.basis = 'sto3g' + mol.build() + +def tearDownModule(): + global mol + mol.stdout.close() + del mol + +class KnownValues(unittest.TestCase): + def test_gen_vind(self): + mf = scf.RHF(mol).density_fit() + mf.conv_tol = 1e-10 + mf.conv_tol_cpscf = 1e-8 + mf.kernel() + mo_coeff = mf.mo_coeff + mo_occ = mf.mo_occ + + nao, nmo = mo_coeff.shape + mocc = mo_coeff[:,mo_occ>0] + nocc = mocc.shape[1] + + fx_cpu = rhf_cpu.gen_vind(mf, mo_coeff, mo_occ) + mo1 = numpy.random.rand(100, nmo*nocc) + v1vo_cpu = fx_cpu(mo1).reshape(-1,nmo*nocc) + + mf = mf.to_gpu() + hessobj = mf.Hessian() + fx_gpu = hessobj.gen_vind(mo_coeff, mo_occ) + mo1 = cupy.asarray(mo1) + v1vo_gpu = fx_gpu(mo1) + assert numpy.linalg.norm(v1vo_cpu - v1vo_gpu.get()) < 1e-8 + + def test_partial_hess_elec(self): + mf = scf.RHF(mol).density_fit() + mf.conv_tol = 1e-10 + mf.conv_tol_cpscf = 1e-8 + mf.kernel() + hobj = mf.Hessian() + hobj.auxbasis_response = 2 + e1_cpu, ej_cpu, ek_cpu = df_rhf_cpu._partial_hess_ejk(hobj) + + mf = mf.to_gpu() + hobj = mf.Hessian() + hobj.auxbasis_response = 2 + e1_gpu, ej_gpu, ek_gpu = df_rhf_gpu._partial_hess_ejk(hobj) + assert numpy.linalg.norm(e1_cpu - e1_gpu.get()) < 1e-5 + assert numpy.linalg.norm(ej_cpu - ej_gpu.get()) < 1e-5 + assert numpy.linalg.norm(ek_cpu - ek_gpu.get()) < 1e-5 + + def test_make_h1(self): + mf = scf.RHF(mol).density_fit() + mf.conv_tol = 1e-10 + mf.conv_tol_cpscf = 1e-8 + mf.kernel() + mo_energy = mf.mo_energy + mo_coeff = mf.mo_coeff + mo_occ = mf.mo_occ + mocc = mo_coeff[:,mo_occ>0] + hobj = mf.Hessian() + hobj.auxbasis_response = 1 + h1_cpu = df_rhf_cpu.make_h1(hobj, mo_coeff, mo_occ) + mo1_cpu, mo_e1_cpu = hobj.solve_mo1(mo_energy, mo_coeff, mo_occ, h1_cpu, verbose=1) + h1_cpu = numpy.asarray(h1_cpu) + h1_cpu = numpy.einsum('xypq,pi,qj->xyij', h1_cpu, mo_coeff, mocc) + + mf = mf.to_gpu() + mf.conv_tol = 1e-10 + mf.conv_tol_cpscf = 1e-8 + hobj = mf.Hessian() + hobj.auxbasis_response = 1 + mo_occ = cupy.asarray(mo_occ) + h1_gpu = df_rhf_gpu.make_h1(hobj, mo_coeff, mo_occ) + h1_gpu = cupy.asarray(h1_gpu) + mo_energy = cupy.asarray(mo_energy) + mo_coeff = cupy.asarray(mo_coeff) + fx = hobj.gen_vind(mo_coeff, mo_occ) + mo1_gpu, mo_e1_gpu = hobj.solve_mo1(mo_energy, mo_coeff, mo_occ, h1_gpu, fx, verbose=1) + assert numpy.linalg.norm(h1_cpu - h1_gpu.get()) < 1e-5 + assert numpy.linalg.norm((mo_e1_cpu - mo_e1_gpu)) < 1e-4 + + def test_df_rhf_hess_elec(self): + mf = scf.RHF(mol).density_fit() + mf.conv_tol = 1e-10 + mf.conv_tol_cpscf = 1e-8 + mf.kernel() + hobj = mf.Hessian() + hobj.auxbasis_response = 2 + hess_cpu = hobj.hess_elec() + + mf = mf.to_gpu() + hobj = mf.Hessian() + hobj.auxbasis_response = 2 + hess_gpu = hobj.hess_elec() + assert numpy.linalg.norm(hess_cpu - hess_gpu.get()) < 1e-5 + + def test_df_rhf_hessian(self): + mf = scf.RHF(mol).density_fit() + mf.conv_tol = 1e-10 + mf.conv_tol_cpscf = 1e-8 + mf.kernel() + hobj = mf.Hessian() + hobj.auxbasis_response = 2 + hess_cpu = hobj.kernel() + mf = mf.to_gpu() + hobj = mf.Hessian() + hobj.auxbasis_response = 2 + hess_gpu = hobj.kernel() + assert numpy.linalg.norm(hess_cpu - hess_gpu) < 1e-5 + +if __name__ == "__main__": + print("Full Tests for DF RHF Hessian") + unittest.main() diff --git a/gpu4pyscf/df/hessian/tests/test_df_rks_hessian.py b/gpu4pyscf/df/hessian/tests/test_df_rks_hessian.py new file mode 100644 index 00000000..5a853a95 --- /dev/null +++ b/gpu4pyscf/df/hessian/tests/test_df_rks_hessian.py @@ -0,0 +1,107 @@ +# Copyright 2024 The GPU4PySCF Authors. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import unittest +import numpy +from pyscf import gto, dft + +def setUpModule(): + global mol + mol = gto.Mole() + mol.verbose = 1 + mol.output = '/dev/null' + mol.atom.extend([ + ["O" , (0. , 0. , 0.)], + [1 , (0. , -0.757 , 0.587)], + [1 , (0. , 0.757 , 0.587)] ]) + mol.basis = 'sto3g' + mol.build() + +def tearDownModule(): + global mol + mol.stdout.close() + del mol + +class KnownValues(unittest.TestCase): + + def test_df_rks_hess_elec(self): + mf = dft.RKS(mol, xc='b3lyp').density_fit() + mf.conv_tol = 1e-10 + mf.conv_tol_cpscf = 1e-8 + mf.grids.level = 1 + mf.kernel() + hobj = mf.Hessian() + hobj.auxbasis_response = 2 + hess_cpu = hobj.partial_hess_elec() + + mf = mf.to_gpu() + mf.grids.level = 1 + mf.kernel() + hobj = mf.Hessian() + hobj.auxbasis_response = 2 + hess_gpu = hobj.partial_hess_elec() + assert numpy.linalg.norm(hess_cpu - hess_gpu.get()) < 1e-5 + + def test_df_lda(self): + mf = dft.RKS(mol).density_fit() + mf.conv_tol = 1e-10 + mf.grids.level = 1 + mf.conv_tol_cpscf = 1e-8 + mf.kernel() + + hessobj = mf.Hessian() + hess_cpu = hessobj.kernel() + + mf = mf.to_gpu() + hessobj = mf.Hessian() + hess_gpu = hessobj.kernel() + assert numpy.linalg.norm(hess_cpu - hess_gpu) < 1e-5 + + def test_df_gga(self): + mf = dft.RKS(mol, xc='b3lyp').density_fit() + mf.conv_tol = 1e-10 + mf.grids.level = 1 + mf.conv_tol_cpscf = 1e-8 + mf.kernel() + + hessobj = mf.Hessian() + hess_cpu = hessobj.kernel() + + mf = mf.to_gpu() + hessobj = mf.Hessian() + hessobj.base.cphf_grids = hessobj.base.grids + hess_gpu = hessobj.kernel() + assert numpy.linalg.norm(hess_cpu - hess_gpu) < 1e-5 + + def test_df_mgga(self): + mf = dft.RKS(mol, xc='tpss').density_fit() + mf.conv_tol = 1e-10 + mf.grids.level = 1 + mf.conv_tol_cpscf = 1e-8 + mf.kernel() + + hessobj = mf.Hessian() + hess_cpu = hessobj.kernel() + + mf = mf.to_gpu() + hessobj = mf.Hessian() + hessobj.base.cphf_grids = hessobj.base.grids + hess_gpu = hessobj.kernel() + assert numpy.linalg.norm(hess_cpu - hess_gpu) < 1e-5 + +if __name__ == "__main__": + print("Full Tests for DF RKS Hessian") + unittest.main() + \ No newline at end of file diff --git a/gpu4pyscf/df/hessian/tests/test_df_uhf_hessian.py b/gpu4pyscf/df/hessian/tests/test_df_uhf_hessian.py index 4d137cea..0443f546 100644 --- a/gpu4pyscf/df/hessian/tests/test_df_uhf_hessian.py +++ b/gpu4pyscf/df/hessian/tests/test_df_uhf_hessian.py @@ -62,7 +62,8 @@ def test_gen_vind(self): v1vo_cpu = fx_cpu(mo1) mf = mf.to_gpu() - fx_gpu = uhf_gpu.gen_vind(mf, mo_coeff, mo_occ) + hessobj = mf.Hessian() + fx_gpu = hessobj.gen_vind(mo_coeff, mo_occ) mo1 = cupy.asarray(mo1) v1vo_gpu = fx_gpu(mo1) assert numpy.linalg.norm(v1vo_cpu - v1vo_gpu.get()) < 1e-8 @@ -114,7 +115,8 @@ def test_make_h1(self): mo_energy = cupy.asarray(mo_energy) mo_coeff = cupy.asarray(mo_coeff) mo_occ = cupy.asarray(mo_occ) - mo1_gpu, mo_e1_gpu = hobj.solve_mo1(mo_energy, mo_coeff, mo_occ, (h1a_gpu, h1b_gpu), verbose=1) + fx = hobj.gen_vind(mo_coeff, mo_occ) + mo1_gpu, mo_e1_gpu = hobj.solve_mo1(mo_energy, mo_coeff, mo_occ, (h1a_gpu, h1b_gpu), fx, verbose=1) assert numpy.linalg.norm(h1a_cpu - h1a_gpu.get()) < 1e-5 assert numpy.linalg.norm(h1b_cpu - h1b_gpu.get()) < 1e-5 mo1_cpu = (numpy.asarray(mo1_cpu[0]), numpy.asarray(mo1_cpu[1])) diff --git a/gpu4pyscf/df/hessian/uhf.py b/gpu4pyscf/df/hessian/uhf.py index 5d93c708..035f9505 100644 --- a/gpu4pyscf/df/hessian/uhf.py +++ b/gpu4pyscf/df/hessian/uhf.py @@ -43,13 +43,14 @@ from gpu4pyscf.lib.cupy_helper import ( contract, tag_array, get_avail_mem, release_gpu_stack, pinv) from gpu4pyscf.df import int3c2e, df +from gpu4pyscf.df.hessian import rhf as df_rhf_hess from gpu4pyscf.lib import logger from gpu4pyscf import __config__ from gpu4pyscf.df.grad.rhf import _gen_metric_solver -from gpu4pyscf.gto.mole import sort_atoms +from gpu4pyscf.df.hessian import jk LINEAR_DEP_THR = df.LINEAR_DEP_THR -BLKSIZE = 256 +BLKSIZE = 128 ALIGNED = getattr(__config__, 'ao_aligned', 32) GB = 1024*1024*1024 @@ -221,49 +222,16 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, t1 = log.timer_debug1('intermediate variables with int3c2e_ip1', *t1) cupy.get_default_memory_pool().free_all_blocks() - # int3c_ipip1 contributions - fn = int3c2e.get_int3c2e_hjk - hja_ao_diag, hka_ao_diag = fn(intopt, 'ipip1', rhoj0_P, rhok0a_P__, dm0a_tag, omega=omega, with_k=with_k) - hjb_ao_diag, hkb_ao_diag = fn(intopt, 'ipip1', rhoj0_P, rhok0b_P__, dm0b_tag, omega=omega, with_k=with_k) - hj_ao_diag = 2.0 * (hja_ao_diag + hjb_ao_diag) + hja_ipip, hka_ipip = jk.get_int3c2e_hjk(intopt, rhoj0_P, rhok0a_P__, dm0a_tag, + with_k=with_k, omega=omega, + auxbasis_response=hessobj.auxbasis_response) + hjb_ipip, hkb_ipip = jk.get_int3c2e_hjk(intopt, rhoj0_P, rhok0b_P__, dm0b_tag, + with_k=with_k, omega=omega, + auxbasis_response=hessobj.auxbasis_response) + hj_ipip = hja_ipip + hjb_ipip if with_k: - hk_ao_diag = 2.0 * (hka_ao_diag + hkb_ao_diag) - t1 = log.timer_debug1('intermediate variables with int3c2e_ipip1', *t1) - - # int3c_ipvip1 contributions - # (11|0), (0|00) without response of RI basis - fn = int3c2e.get_int3c2e_hjk - hja, hka = fn(intopt, 'ipvip1', rhoj0_P, rhok0a_P__, dm0a_tag, omega=omega, with_k=with_k) - hjb, hkb = fn(intopt, 'ipvip1', rhoj0_P, rhok0b_P__, dm0b_tag, omega=omega, with_k=with_k) - hj_ao_ao += 2.0*(hja + hjb) - if with_k: - hk_ao_ao += (hka + hkb) - hja = hjb = hka = hkb = None - t1 = log.timer_debug1('intermediate variables with int3c2e_ipvip1', *t1) - - # int3c_ip1ip2 contributions - # (10|1), (0|0)(0|00) - if hessobj.auxbasis_response: - fn = int3c2e.get_int3c2e_hjk - hja, hka = fn(intopt, 'ip1ip2', rhoj0_P, rhok0a_P__, dm0a_tag, omega=omega, with_k=with_k) - hjb, hkb = fn(intopt, 'ip1ip2', rhoj0_P, rhok0b_P__, dm0b_tag, omega=omega, with_k=with_k) - hj_ao_aux += hja + hjb - if with_k: - hk_ao_aux += hka + hkb - hja = hjb = hka = hkb = None - t1 = log.timer_debug1('intermediate variables with int3c2e_ip1ip2', *t1) - - # int3c_ipip2 contributions - if hessobj.auxbasis_response > 1: - # (00|2), (0|0)(0|00) - fn = int3c2e.get_int3c2e_hjk - hja, hka = fn(intopt, 'ipip2', rhoj0_P, rhok0a_P__, dm0a_tag, omega=omega, with_k=with_k) - hjb, hkb = fn(intopt, 'ipip2', rhoj0_P, rhok0b_P__, dm0b_tag, omega=omega, with_k=with_k) - hj_aux_diag = hja + hjb - if with_k: - hk_aux_diag = (hka + hkb) - hja = hjb = hka = hkb = None - t1 = log.timer_debug1('intermediate variables with int3c2e_ipip2', *t1) + hk_ipip = 2.0*(hka_ipip + hkb_ipip) + t1 = log.timer_debug1('intermediate variables with int3c2e_ipip', *t1) # int2c contributions if hessobj.auxbasis_response > 1: @@ -277,11 +245,11 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, rhoj2c_P = contract('xpq,q->xp', int2c_ipip1, rhoj0_P) # (00|0)(2|0)(0|00) # p,xp->px - hj_aux_diag -= (rhoj0_P*rhoj2c_P).T.reshape(-1,3,3) + hj_aux_diag = -(rhoj0_P*rhoj2c_P).T.reshape(-1,3,3) if with_k: rho2c_0 = contract('pij,qji->pq', rhok0a_P__, rhok0a_P__) rho2c_0+= contract('pij,qji->pq', rhok0b_P__, rhok0b_P__) - hk_aux_diag -= contract('pq,xpq->px', rho2c_0, int2c_ipip1).reshape(-1,3,3) + hk_aux_diag = -contract('pq,xpq->px', rho2c_0, int2c_ipip1).reshape(-1,3,3) int2c_ipip1 = None if omega and omega > 1e-10: @@ -350,7 +318,6 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, rho2c_10= int2c_ip1_inv = None t1 = log.timer_debug1('contract int2c_*', *t1) - hj_ao_diag = intopt.unsort_orbitals(hj_ao_diag, axis=[0]) hj_ao_ao = intopt.unsort_orbitals(hj_ao_ao, axis=[0,1]) if hessobj.auxbasis_response: hj_ao_aux = intopt.unsort_orbitals(hj_ao_aux, axis=[0], aux_axis=[1]) @@ -358,7 +325,6 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, hj_aux_diag = intopt.unsort_orbitals(hj_aux_diag, aux_axis=[0]) hj_aux_aux = intopt.unsort_orbitals(hj_aux_aux, aux_axis=[0,1]) if with_k: - hk_ao_diag = intopt.unsort_orbitals(hk_ao_diag, axis=[0]) hk_ao_ao = intopt.unsort_orbitals(hk_ao_ao, axis=[0,1]) if hessobj.auxbasis_response: hk_ao_aux = intopt.unsort_orbitals(hk_ao_aux, axis=[0], aux_axis=[1]) @@ -389,14 +355,11 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, # ----------------------------------------- hk_ao_ao *= 2.0 e1 = cupy.zeros([len(atmlst),len(atmlst),3,3]) - ej = cupy.zeros([len(atmlst),len(atmlst),3,3]) - ek = cupy.zeros([len(atmlst),len(atmlst),3,3]) + ej = hj_ipip + ek = hk_ipip for i0, ia in enumerate(atmlst): shl0, shl1, p0, p1 = aoslices[ia] e1[i0,i0] -= cupy.sum(h1aa[p0:p1], axis=0) - ej[i0,i0] += cupy.sum(hj_ao_diag[p0:p1,:,:], axis=0) - if with_k: - ek[i0,i0] += cupy.sum(hk_ao_diag[p0:p1,:,:], axis=0) for j0, ja in enumerate(atmlst[:i0+1]): q0, q1 = aoslices[ja][2:] ej[i0,j0] += cupy.sum(hj_ao_ao[p0:p1,q0:q1], axis=[0,1]) @@ -702,6 +665,8 @@ def _ao2mo(mat, mocc, mo): vk1b = vk1b_int3c[ia] + _ao2mo(vk1b_ao, moccb, mo_coeff[1]) yield ia, (h1a, h1b), (vj1a, vj1b), (vk1a, vk1b) +_get_jk_mo = df_rhf_hess._get_jk_mo + class Hessian(uhf_hess.Hessian): '''Non-relativistic restricted Hartree-Fock hessian''' @@ -713,3 +678,4 @@ class Hessian(uhf_hess.Hessian): make_h1 = make_h1 kernel = rhf_hess.kernel hess = kernel + get_jk_mo = _get_jk_mo diff --git a/gpu4pyscf/df/hessian/uks.py b/gpu4pyscf/df/hessian/uks.py index 5133fc18..a3230490 100644 --- a/gpu4pyscf/df/hessian/uks.py +++ b/gpu4pyscf/df/hessian/uks.py @@ -133,3 +133,4 @@ class Hessian(uks_hess.Hessian): hess_elec = uhf_hess.hess_elec kernel = rhf_hess.kernel hess = kernel + get_jk_mo = df_uhf_hess._get_jk_mo diff --git a/gpu4pyscf/df/tests/test_df_hessian.py b/gpu4pyscf/df/tests/test_df_hessian.py index 8c56692b..3b932195 100644 --- a/gpu4pyscf/df/tests/test_df_hessian.py +++ b/gpu4pyscf/df/tests/test_df_hessian.py @@ -136,7 +136,7 @@ def test_hessian_rhf(self, disp=None): h = hobj.kernel() _check_rhf_hessian(mf, h, ix=0, iy=0) _check_rhf_hessian(mf, h, ix=0, iy=1) - + def test_hessian_lda(self, disp=None): print('-----testing DF LDA Hessian----') mf = _make_rks(mol_sph, 'LDA') @@ -240,7 +240,6 @@ def test_hessian_rks_D3(self): hobj = mf.Hessian() hobj.set(auxbasis_response=2) h = hobj.kernel() - print(np.linalg.norm(h)) _check_dft_hessian(mf, h, ix=0,iy=0) def test_hessian_rks_D4(self): diff --git a/gpu4pyscf/hessian/jk.py b/gpu4pyscf/hessian/jk.py new file mode 100644 index 00000000..7f3eeb60 --- /dev/null +++ b/gpu4pyscf/hessian/jk.py @@ -0,0 +1,296 @@ +# Copyright 2024 The GPU4PySCF Authors. All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +''' +Compute J/K matrices for Hessian +''' +import ctypes +import math +import numpy as np +import cupy as cp +from collections import Counter +from concurrent.futures import ThreadPoolExecutor + +from pyscf import lib +from pyscf.scf import _vhf +from pyscf import __config__ + +from gpu4pyscf.scf.jk import (_make_tril_tile_mappings, quartets_scheme, QUEUE_DEPTH, + _VHFOpt, LMAX, init_constant, libvhf_rys) +from gpu4pyscf.lib.cupy_helper import (condense, sandwich_dot, transpose_sum, + reduce_to_device, contract) + +from gpu4pyscf.__config__ import props as gpu_specs +from gpu4pyscf.__config__ import _streams, _num_devices +from gpu4pyscf.lib import logger + + +def _ao2mo(v_ao, mocc, mo_coeff): + v_ao = contract('nij,jo->nio', v_ao, mocc) + return contract('nio,ip->npo', v_ao, mo_coeff) + +def _jk_task(mol, dms, mo_coeff, mocc, vhfopt, task_list, hermi=0, + device_id=0, with_j=True, with_k=True, verbose=0): + nao, _ = vhfopt.coeff.shape + uniq_l_ctr = vhfopt.uniq_l_ctr + uniq_l = uniq_l_ctr[:,0] + l_ctr_bas_loc = vhfopt.l_ctr_offsets + l_symb = [lib.param.ANGULAR[i] for i in uniq_l] + kern = libvhf_rys.RYS_build_jk + + timing_counter = Counter() + kern_counts = 0 + with cp.cuda.Device(device_id), _streams[device_id]: + log = logger.new_logger(mol, verbose) + cput0 = log.init_timer() + dms = cp.asarray(dms) + + n_dm = dms.shape[0] + tile_q_ptr = ctypes.cast(vhfopt.tile_q_cond.data.ptr, ctypes.c_void_p) + q_ptr = ctypes.cast(vhfopt.q_cond.data.ptr, ctypes.c_void_p) + s_ptr = lib.c_null_ptr() + if mol.omega < 0: + s_ptr = ctypes.cast(vhfopt.s_estimator.data.ptr, ctypes.c_void_p) + + vj = vk = None + vj_ptr = vk_ptr = lib.c_null_ptr() + assert with_j or with_k + if with_k: + vk = cp.zeros(dms.shape) + vk_ptr = ctypes.cast(vk.data.ptr, ctypes.c_void_p) + if with_j: + vj = cp.zeros(dms.shape) + vj_ptr = ctypes.cast(vj.data.ptr, ctypes.c_void_p) + + ao_loc = mol.ao_loc + dm_cond = cp.log(condense('absmax', dms, ao_loc) + 1e-300).astype(np.float32) + log_max_dm = dm_cond.max() + log_cutoff = math.log(vhfopt.direct_scf_tol) + tile_mappings = _make_tril_tile_mappings(l_ctr_bas_loc, vhfopt.tile_q_cond, + log_cutoff-log_max_dm) + workers = gpu_specs['multiProcessorCount'] + pool = cp.empty((workers, QUEUE_DEPTH*4), dtype=np.uint16) + info = cp.empty(2, dtype=np.uint32) + t1 = log.timer_debug1(f'q_cond and dm_cond on Device {device_id}', *cput0) + + for i, j in task_list: + ij_shls = (l_ctr_bas_loc[i], l_ctr_bas_loc[i+1], + l_ctr_bas_loc[j], l_ctr_bas_loc[j+1]) + tile_ij_mapping = tile_mappings[i,j] + for k in range(i+1): + for l in range(k+1): + llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})' + kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1], + l_ctr_bas_loc[l], l_ctr_bas_loc[l+1]) + tile_kl_mapping = tile_mappings[k,l] + scheme = quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]]) + err = kern( + vj_ptr, vk_ptr, ctypes.cast(dms.data.ptr, ctypes.c_void_p), + ctypes.c_int(n_dm), ctypes.c_int(nao), + vhfopt.rys_envs, (ctypes.c_int*2)(*scheme), + (ctypes.c_int*8)(*ij_shls, *kl_shls), + ctypes.c_int(tile_ij_mapping.size), + ctypes.c_int(tile_kl_mapping.size), + ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p), + ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p), + tile_q_ptr, q_ptr, s_ptr, + ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p), + ctypes.c_float(log_cutoff), + ctypes.cast(pool.data.ptr, ctypes.c_void_p), + ctypes.cast(info.data.ptr, ctypes.c_void_p), + ctypes.c_int(workers), + mol._atm.ctypes, ctypes.c_int(mol.natm), + mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes) + if err != 0: + raise RuntimeError(f'RYS_build_jk kernel for {llll} failed') + if log.verbose >= logger.DEBUG1: + msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}' + t1, t1p = log.timer_debug1(msg, *t1), t1 + timing_counter[llll] += t1[1] - t1p[1] + kern_counts += 1 + if with_j: + vj *= 2.0 + vj = transpose_sum(vj) + if with_k: + vk = transpose_sum(vk) + + if isinstance(mocc, tuple): + # Unrestricted case + mocca, moccb = mocc + moa, mob = mo_coeff + nmoa, nmob = moa.shape[1], mob.shape[1] + nocca, noccb = mocca.shape[1], moccb.shape[1] + n_dm_2 = n_dm//2 + if with_j: + vjab = vj[:n_dm_2] + vj[n_dm_2:] + vj = cp.empty([n_dm_2,nmoa*nocca+nmob*noccb]) + vj[:,:nmoa*nocca] = _ao2mo(vjab, mocca, moa).reshape(n_dm_2,-1) + vj[:,nmoa*nocca:] = _ao2mo(vjab, moccb, mob).reshape(n_dm_2,-1) + if with_k: + vka, vkb = vk[:n_dm_2], vk[n_dm_2:] + vk = cp.empty([n_dm_2,nmoa*nocca+nmob*noccb]) + vk[:,:nmoa*nocca] = _ao2mo(vka, mocca, moa).reshape(n_dm_2,-1) + vk[:,nmoa*nocca:] = _ao2mo(vkb, moccb, mob).reshape(n_dm_2,-1) + else: + if with_j: + vj = _ao2mo(vj, mocc, mo_coeff).reshape(n_dm,-1) + if with_k: + vk = _ao2mo(vk, mocc, mo_coeff).reshape(n_dm,-1) + + return vj, vk, kern_counts, timing_counter + +def get_jk(mol, dm, mo_coeff, mocc, hermi=0, vhfopt=None, + with_j=True, with_k=True, verbose=None): + '''Compute J, K matrices in MO + ''' + log = logger.new_logger(mol, verbose) + cput0 = log.init_timer() + + if vhfopt is None: + vhfopt = _VHFOpt(mol).build() + + mol = vhfopt.mol + nao, nao_orig = vhfopt.coeff.shape + + dm = cp.asarray(dm, order='C') + dms = dm.reshape(-1,nao_orig,nao_orig) + + # Transform MO coeffcients and DM into sorted, cartesian AO basis + #:dms = cp.einsum('pi,nij,qj->npq', vhfopt.coeff, dms, vhfopt.coeff) + dms = sandwich_dot(dms, vhfopt.coeff.T) + dms = cp.asarray(dms, order='C') + coeff = vhfopt.coeff + if isinstance(mocc, tuple): + mocc = (coeff.dot(mocc[0]), coeff.dot(mocc[1])) + mo_coeff = (coeff.dot(mo_coeff[0]), coeff.dot(mo_coeff[1])) + else: + mocc = coeff.dot(mocc) + mo_coeff = coeff.dot(mo_coeff) + n_dm = dms.shape[0] + + assert with_j or with_k + + init_constant(mol) + + uniq_l_ctr = vhfopt.uniq_l_ctr + uniq_l = uniq_l_ctr[:,0] + l_symb = [lib.param.ANGULAR[i] for i in uniq_l] + n_groups = np.count_nonzero(uniq_l <= LMAX) + + tasks = [(i,j) for i in range(n_groups) for j in range(i+1)] + tasks = np.array(tasks) + task_list = [] + for device_id in range(_num_devices): + task_list.append(tasks[device_id::_num_devices]) + + cp.cuda.get_current_stream().synchronize() + futures = [] + with ThreadPoolExecutor(max_workers=_num_devices) as executor: + for device_id in range(_num_devices): + future = executor.submit( + _jk_task, + mol, dms, mo_coeff, mocc, vhfopt, task_list[device_id], hermi=hermi, + with_j=with_j, with_k=with_k, verbose=verbose, + device_id=device_id) + futures.append(future) + + kern_counts = 0 + timing_collection = Counter() + vj_dist = [] + vk_dist = [] + for future in futures: + vj, vk, counts, counter = future.result() + kern_counts += counts + timing_collection += counter + vj_dist.append(vj) + vk_dist.append(vk) + + if log.verbose >= logger.DEBUG1: + log.debug1('kernel launches %d', kern_counts) + for llll, t in timing_collection.items(): + log.debug1('%s wall time %.2f', llll, t) + + for s in _streams: + s.synchronize() + cp.cuda.get_current_stream().synchronize() + vj = vk = None + if with_k: + vk = reduce_to_device(vk_dist, inplace=True) + + if with_j: + vj = reduce_to_device(vj_dist, inplace=True) + + h_shls = vhfopt.h_shls + assert len(h_shls) == 0 + if h_shls: + cput1 = log.timer_debug1('get_jk pass 1 on gpu', *cput0) + log.debug3('Integrals for %s functions on CPU', l_symb[LMAX+1]) + scripts = [] + if with_j: + scripts.append('ji->s2kl') + if with_k: + if hermi == 1: + scripts.append('jk->s2il') + else: + scripts.append('jk->s1il') + shls_excludes = [0, h_shls[0]] * 4 + if hermi == 1: + dms = dms.get() + else: + dms = dms[:n_dm//2].get() + vs_h = _vhf.direct_mapdm('int2e_cart', 's8', scripts, + dms, 1, mol._atm, mol._bas, mol._env, + shls_excludes=shls_excludes) + if with_j and with_k: + vj1 = vs_h[0] + vk1 = vs_h[1] + elif with_j: + vj1 = vs_h[0] + else: + vk1 = vs_h[0] + coeff = vhfopt.coeff + idx, idy = np.tril_indices(nao, -1) + if isinstance(mocc, tuple): + mocca, moccb = mocc + moa, mob = mo_coeff + nmoa, nmob = moa.shape[1], mob.shape[1] + nocca, noccb = mocca.shape[1], moccb.shape[1] + n_dm_2 = n_dm//2 + if with_j: + vjab = vj1[:n_dm_2] + vj1[n_dm_2:] + vj[:,:nmoa*nocca] += _ao2mo(vjab, mocca, moa).reshape(n_dm_2,-1) + vj[:,nmoa*nocca:] += _ao2mo(vjab, moccb, mob).reshape(n_dm_2,-1) + if with_k: + vka, vkb = vk[:n_dm_2], vk[n_dm_2:] + vk[:,:nmoa*nocca] += _ao2mo(vka, mocca, moa).reshape(n_dm_2,-1) + vk[:,nmoa*nocca:] += _ao2mo(vkb, moccb, mob).reshape(n_dm_2,-1) + else: + if with_j: + vj1[:,idy,idx] = vj1[:,idx,idy] + vj += _ao2mo(cp.asarray(vj1), mocc, mo_coeff) + #for i, v in enumerate(vj1): + # vj[i] += coeff.T.dot(cp.asarray(v)).dot(coeff) + if with_k: + if hermi: + vk1[:,idy,idx] = vk1[:,idx,idy] + vk += _ao2mo(cp.asarray(vk1), mocc, mo_coeff) + #for i, v in enumerate(vk1): + # vk[i] += coeff.T.dot(cp.asarray(v)).dot(coeff) + + # TODO: convert vj and vk into MO + log.timer_debug1('get_jk pass 2 for h functions on cpu', *cput1) + + log.timer('vj and vk', *cput0) + return vj, vk diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py index 26929dd3..a526eb81 100644 --- a/gpu4pyscf/hessian/rhf.py +++ b/gpu4pyscf/hessian/rhf.py @@ -42,6 +42,7 @@ LMAX, QUEUE_DEPTH, SHM_SIZE, THREADS, libvhf_rys, _VHFOpt, init_constant, _make_tril_tile_mappings, _nearest_power2) from gpu4pyscf.grad import rhf as rhf_grad +from gpu4pyscf.hessian import jk libvhf_rys.RYS_per_atom_jk_ip2_type12.restype = ctypes.c_int libvhf_rys.RYS_per_atom_jk_ip2_type3.restype = ctypes.c_int @@ -79,10 +80,10 @@ def hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, h1mo = h1mo.get() t1 = log.timer_debug1('making H1', *t1) if mo1 is None or mo_e1 is None: + fx = hessobj.gen_vind(mo_coeff, mo_occ) mo1, mo_e1 = hessobj.solve_mo1(mo_energy, mo_coeff, mo_occ, h1mo, - None, atmlst, max_memory, log) + fx, atmlst, max_memory, log) t1 = log.timer_debug1('solving MO1', *t1) - mo1 = cupy.asarray(mo1) # *2 for double occupancy, *2 for +c.c. de2 += contract('kxpi,lypi->klxy', cupy.asarray(h1mo), mo1) * 4 @@ -365,7 +366,7 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): avail_mem = get_avail_mem() slice_size = int(avail_mem*0.6) // (8*3*nao*nao) for atoms_slice in lib.prange(0, natm, slice_size): - vj, vk = _get_jk(mol, dm0, atoms_slice=atoms_slice, verbose=verbose) + vj, vk = _get_jk_ip1(mol, dm0, atoms_slice=atoms_slice, verbose=verbose) #:vhf = vj - vk * .5 vhf = vk vhf *= -.5 @@ -377,9 +378,9 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): vj = vk = vhf = None return h1mo - def _build_jk_ip1_task(mol, dms, vhfopt, task_list, atoms_slice, device_id=0, with_j=True, with_k=True, verbose=0): + # TODO: compute JK in MO assert isinstance(verbose, int) nao, _ = vhfopt.coeff.shape natm = mol.natm @@ -475,7 +476,7 @@ def _build_jk_ip1_task(mol, dms, vhfopt, task_list, atoms_slice, kern_counts += 1 return vj, vk, kern_counts, timing_counter -def _get_jk(mol, dm, with_j=True, with_k=True, atoms_slice=None, verbose=None): +def _get_jk_ip1(mol, dm, with_j=True, with_k=True, atoms_slice=None, verbose=None): r''' For each atom, compute J = ((\nabla_X i) j| kl) (D_lk + D_ji) @@ -688,7 +689,7 @@ def fvind_vo(mo1): mo1[:,:,viridx] *= -e_ai mo1[:,:,occidx] = -s1mo_blk[:,:,occidx] * .5 hs = s1mo_blk = h1mo_blk = None - + tol = mf.conv_tol_cpscf * (i1 - i0) raw_mo1 = krylov(fvind_vo, mo1.reshape(-1,nmo*nocc), tol=tol, max_cycle=max_cycle, verbose=log) @@ -706,32 +707,23 @@ def fvind_vo(mo1): log.timer('CPHF solver', *t0) return mo1s, e1s -def gen_vind(mf, mo_coeff, mo_occ): - # Move data to GPU +def gen_vind(hessobj, mo_coeff, mo_occ): + mol = hessobj.mol mo_coeff = cupy.asarray(mo_coeff) mo_occ = cupy.asarray(mo_occ) nao, nmo = mo_coeff.shape mocc = mo_coeff[:,mo_occ>0] nocc = mocc.shape[1] mocc_2 = mocc * 2 - grids = getattr(mf, 'cphf_grids', None) - if grids is not None: - logger.info(mf, 'Secondary grids defined for CPHF in Hessian') - vresp = mf.gen_response(mo_coeff, mo_occ, hermi=1, grids=grids) def fx(mo1): mo1 = cupy.asarray(mo1) mo1 = mo1.reshape(-1,nmo,nocc) mo1_mo = contract('npo,ip->nio', mo1, mo_coeff) - #dm1 = contract('nio,jo->nij', mo1_mo, mocc_2) - #dm1 = dm1 + dm1.transpose(0,2,1) dm1 = mo1_mo.dot(mocc_2.T) - transpose_sum(dm1) + dm1 = transpose_sum(dm1) dm1 = tag_array(dm1, mo1=mo1_mo, occ_coeff=mocc, mo_occ=mo_occ) - v1 = vresp(dm1) - tmp = contract('nij,jo->nio', v1, mocc) - v1vo = contract('nio,ip->npo', tmp, mo_coeff) - return v1vo + return hessobj.get_veff_resp_mo(mol, dm1, mo_coeff, mo_occ, hermi=1) return fx def hess_nuc_elec(mol, dm): @@ -890,6 +882,25 @@ def get_hcore(iatm, jatm): def hcore_generator(hessobj, mol=None): raise NotImplementedError +def _get_jk_mo(hessobj, mol, dms, mo_coeff, mocc, + hermi=1, with_j=True, with_k=True, omega=None): + ''' Compute J/K matrices in MO for multiple DMs + ''' + mf = hessobj.base + vhfopt = mf._opt_gpu.get(omega) + if vhfopt is None: + with mol.with_range_coulomb(omega): + vhfopt = mf._opt_gpu[omega] = _VHFOpt(mol, mf.direct_scf_tol).build() + with mol.with_range_coulomb(omega): + vj, vk = jk.get_jk(mol, dms, mo_coeff, mocc, hermi, vhfopt, with_j, with_k) + return vj, vk + +def _get_veff_resp_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1, omega=None): + mocc = mo_coeff[:,mo_occ>0] + vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, mocc, + hermi=hermi, with_j=True, with_k=True, omega=omega) + return vj - 0.5 * vk + class HessianBase(lib.StreamObject): # attributes max_cycle = rhf_hess_cpu.HessianBase.max_cycle @@ -901,8 +912,10 @@ class HessianBase(lib.StreamObject): make_h1 = rhf_hess_cpu.HessianBase.make_h1 hcore_generator = hcore_generator # the functionality is different from cpu version hess_nuc = rhf_hess_cpu.HessianBase.hess_nuc + gen_vind = NotImplemented + get_jk = NotImplemented kernel = hess = kernel - + def get_hcore(self, mol=None): if mol is None: mol = self.mol return get_hcore(mol) @@ -952,6 +965,9 @@ def __init__(self, scf_method): hess_elec = hess_elec make_h1 = make_h1 gen_hop = NotImplemented + gen_vind = gen_vind + get_jk_mo = _get_jk_mo + get_veff_resp_mo = _get_veff_resp_mo # Inject to RHF class from gpu4pyscf import scf diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py index 4c9b5a0d..93c16bb8 100644 --- a/gpu4pyscf/hessian/rks.py +++ b/gpu4pyscf/hessian/rks.py @@ -29,9 +29,11 @@ # import pyscf.grad.rks to activate nuc_grad_method method from gpu4pyscf.grad import rks as rks_grad from gpu4pyscf.dft import numint -from gpu4pyscf.lib.cupy_helper import contract, add_sparse, get_avail_mem, reduce_to_device +from gpu4pyscf.lib.cupy_helper import (contract, add_sparse, get_avail_mem, + reduce_to_device, transpose_sum, tag_array) from gpu4pyscf.lib import logger from gpu4pyscf.__config__ import _streams, _num_devices +from gpu4pyscf.hessian import jk def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None, max_memory=4000, verbose=None): @@ -126,15 +128,15 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): avail_mem -= 8 * h1mo.size slice_size = int(avail_mem*0.5) // (8*3*nao*nao) for atoms_slice in lib.prange(0, natm, slice_size): - vj, vk = rhf_hess._get_jk(mol, dm0, with_k=with_k, - atoms_slice=atoms_slice, verbose=verbose) + vj, vk = rhf_hess._get_jk_ip1(mol, dm0, with_k=with_k, + atoms_slice=atoms_slice, verbose=verbose) veff = vj if with_k: vk *= .5 * hyb veff -= vk if abs(omega) > 1e-10 and abs(alpha-hyb) > 1e-10: with mol.with_range_coulomb(omega): - vk_lr = rhf_hess._get_jk(mol, dm0, with_j=False, verbose=verbose)[1] + vk_lr = rhf_hess._get_jk_ip1(mol, dm0, with_j=False, verbose=verbose)[1] vk_lr *= (alpha-hyb) * .5 veff -= vk_lr atom0, atom1 = atoms_slice @@ -699,6 +701,51 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory): vmat = reduce_to_device(vmat_dist, inplace=True) return vmat +def get_veff_resp_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1, omega=None): + mol = hessobj.mol + mf = hessobj.base + grids = getattr(mf, 'cphf_grids', None) + if grids is not None: + logger.info(mf, 'Secondary grids defined for CPHF in Hessian') + else: + # If cphf_grids is not defined, e.g object defined from CPU + grids = getattr(mf, 'grids', None) + logger.info(mf, 'Primary grids is used for CPHF in Hessian') + + if grids and grids.coords is None: + grids.build(mol=mol, with_non0tab=False, sort_grids=True) + + ni = mf._numint + omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, mol.spin) + hybrid = ni.libxc.is_hybrid_xc(mf.xc) + assert not mf.do_nlc() + hermi = 1 + + mocc = mo_coeff[:,mo_occ>0] + nocc = mocc.shape[1] + nao, nmo = mo_coeff.shape + # TODO: evaluate v1 in MO + rho0, vxc, fxc = ni.cache_xc_kernel(mol, grids, mf.xc, + mo_coeff, mo_occ, 0) + v1 = ni.nr_rks_fxc(mol, grids, mf.xc, None, dms, 0, hermi, + rho0, vxc, fxc, max_memory=None) + v1 = jk._ao2mo(v1, mocc, mo_coeff).reshape(-1,nmo*nocc) + + if hybrid: + vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, mocc, hermi=1) + vk *= hyb + if omega > 1e-10: # For range separated Coulomb + _, vk_lr = hessobj.get_jk_mo(mol, dms, mo_coeff, mocc, hermi, + with_j=False, omega=omega) + vk_lr *= (alpha-hyb) + vk += vk_lr + v1 += vj - .5 * vk + else: + v1 += hessobj.get_jk_mo(mol, dms, mo_coeff, mocc, hermi=1, + with_k=False)[0] + + return v1 + class Hessian(rhf_hess.HessianBase): '''Non-relativistic RKS hessian''' @@ -715,6 +762,9 @@ def __init__(self, mf): partial_hess_elec = partial_hess_elec hess_elec = rhf_hess.hess_elec make_h1 = make_h1 + gen_vind = rhf_hess.gen_vind + get_jk_mo = rhf_hess._get_jk_mo + get_veff_resp_mo = get_veff_resp_mo from gpu4pyscf import dft dft.rks.RKS.Hessian = lib.class_as_method(Hessian) diff --git a/gpu4pyscf/hessian/tests/test_rhf_hessian.py b/gpu4pyscf/hessian/tests/test_rhf_hessian.py index 82c6606c..21266bc3 100644 --- a/gpu4pyscf/hessian/tests/test_rhf_hessian.py +++ b/gpu4pyscf/hessian/tests/test_rhf_hessian.py @@ -104,7 +104,7 @@ def test_get_jk(self): mo_coeff = np.random.rand(nao, nao) dm = mo_coeff.dot(mo_coeff.T) * 2 - vj, vk = rhf_gpu._get_jk(mol, dm) + vj, vk = rhf_gpu._get_jk_ip1(mol, dm) assert abs(lib.fp(vj.get()) - 87674.69061160382) < 1e-7 assert abs(lib.fp(vk.get()) - -9.317650662101629) < 1e-7 diff --git a/gpu4pyscf/hessian/uhf.py b/gpu4pyscf/hessian/uhf.py index 81f26c17..07686e6d 100644 --- a/gpu4pyscf/hessian/uhf.py +++ b/gpu4pyscf/hessian/uhf.py @@ -30,11 +30,12 @@ from pyscf.scf import ucphf # import _response_functions to load gen_response methods in SCF class from gpu4pyscf.scf import _response_functions # noqa -from gpu4pyscf.gto.mole import sort_atoms -from gpu4pyscf.lib.cupy_helper import contract, tag_array, get_avail_mem, krylov +from gpu4pyscf.lib.cupy_helper import (contract, transpose_sum, get_avail_mem, + krylov, tag_array) from gpu4pyscf.lib import logger from gpu4pyscf.grad import rhf as rhf_grad from gpu4pyscf.hessian import rhf as rhf_hess_gpu +from gpu4pyscf.hessian import jk GB = 1024*1024*1024 ALIGNED = 4 @@ -68,8 +69,9 @@ def hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, h1mo = (h1mo[0].get(), h1mo[1].get()) t1 = log.timer_debug1('making H1', *t1) if mo1 is None or mo_e1 is None: + fx = hessobj.gen_vind(mo_coeff, mo_occ) mo1, mo_e1 = hessobj.solve_mo1(mo_energy, mo_coeff, mo_occ, h1mo, - None, atmlst, max_memory, log) + fx, atmlst, max_memory, log) t1 = log.timer_debug1('solving MO1', *t1) mo1a = cupy.asarray(mo1[0]) @@ -192,8 +194,8 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): avail_mem = get_avail_mem() slice_size = int(avail_mem*0.6) // (8*3*nao*nao*2) for atoms_slice in lib.prange(0, natm, slice_size): - vja, vka = rhf_hess_gpu._get_jk(mol, dm0a, atoms_slice=atoms_slice, verbose=verbose) - vjb, vkb = rhf_hess_gpu._get_jk(mol, dm0b, atoms_slice=atoms_slice, verbose=verbose) + vja, vka = rhf_hess_gpu._get_jk_ip1(mol, dm0a, atoms_slice=atoms_slice, verbose=verbose) + vjb, vkb = rhf_hess_gpu._get_jk_ip1(mol, dm0b, atoms_slice=atoms_slice, verbose=verbose) #:vhfa = vja+vjb - vka #:vhfb = vja+vjb - vkb vhfa = vka @@ -369,8 +371,9 @@ def fvind_vo(mo1): log.timer('CPHF solver', *t0) return (mo1sa, mo1sb), (e1sa, e1sb) -def gen_vind(mf, mo_coeff, mo_occ): +def gen_vind(hessobj, mo_coeff, mo_occ): # Move data to GPU + mol = hessobj.mol mo_coeff = cupy.asarray(mo_coeff) mo_occ = cupy.asarray(mo_occ) nao, nmoa = mo_coeff[0].shape @@ -379,39 +382,34 @@ def gen_vind(mf, mo_coeff, mo_occ): moccb = mo_coeff[1][:,mo_occ[1]>0] nocca = mocca.shape[1] noccb = moccb.shape[1] - grids = getattr(mf, 'cphf_grids', None) - if grids is not None: - logger.info(mf, 'Secondary grids defined for CPHF in Hessian') - vresp = mf.gen_response(mo_coeff, mo_occ, hermi=1, grids=grids) def fx(mo1): mo1 = cupy.asarray(mo1) mo1 = mo1.reshape(-1,nmoa*nocca+nmob*noccb) nset = len(mo1) + dm1 = cupy.empty([2,nset,nao,nao]) + x = mo1[:,:nmoa*nocca].reshape(nset,nmoa,nocca) mo1_moa = contract('npo,ip->nio', x, mo_coeff[0]) dma = contract('nio,jo->nij', mo1_moa, mocca) + dm1[0] = transpose_sum(dma) x = mo1[:,nmoa*nocca:].reshape(nset,nmob,noccb) mo1_mob = contract('npo,ip->nio', x, mo_coeff[1]) dmb = contract('nio,jo->nij', mo1_mob, moccb) - - dm1 = cupy.empty([2,nset,nao,nao]) - dm1[0] = dma + dma.transpose(0,2,1) - dm1[1] = dmb + dmb.transpose(0,2,1) + dm1[1] = transpose_sum(dmb) dm1 = tag_array(dm1, mo1=[mo1_moa,mo1_mob], occ_coeff=[mocca,moccb], mo_occ=mo_occ) - v1 = vresp(dm1) - v1vo = cupy.empty_like(mo1) - tmp = contract('nij,jo->nio', v1[0], mocca) - v1vo[:,:nmoa*nocca] = contract('nio,ip->npo', tmp, mo_coeff[0]).reshape(nset,-1) - - tmp = contract('nij,jo->nio', v1[1], moccb) - v1vo[:,nmoa*nocca:] = contract('nio,ip->npo', tmp, mo_coeff[1]).reshape(nset,-1) - return v1vo + return hessobj.get_veff_resp_mo(mol, dm1, mo_coeff, mo_occ, hermi=1) return fx +def _get_veff_resp_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1): + mocca = mo_coeff[0][:,mo_occ[0]>0] + moccb = mo_coeff[1][:,mo_occ[1]>0] + vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, (mocca, moccb), + hermi=hermi, with_j=True, with_k=True) + return vj - vk class Hessian(rhf_hess_gpu.HessianBase): '''Non-relativistic unrestricted Hartree-Fock hessian''' @@ -422,7 +420,10 @@ class Hessian(rhf_hess_gpu.HessianBase): partial_hess_elec = partial_hess_elec hess_elec = hess_elec make_h1 = make_h1 - + gen_vind = gen_vind + get_jk_mo = rhf_hess_gpu._get_jk_mo + get_veff_resp_mo = _get_veff_resp_mo + def solve_mo1(self, mo_energy, mo_coeff, mo_occ, h1mo, fx=None, atmlst=None, max_memory=4000, verbose=None): return solve_mo1(self.base, mo_energy, mo_coeff, mo_occ, h1mo, diff --git a/gpu4pyscf/hessian/uks.py b/gpu4pyscf/hessian/uks.py index a363706a..a202d92a 100644 --- a/gpu4pyscf/hessian/uks.py +++ b/gpu4pyscf/hessian/uks.py @@ -27,8 +27,10 @@ # import pyscf.grad.rks to activate nuc_grad_method method from gpu4pyscf.grad import rks as rks_grad from gpu4pyscf.dft import numint -from gpu4pyscf.lib.cupy_helper import contract, add_sparse, take_last2d, get_avail_mem +from gpu4pyscf.lib.cupy_helper import (contract, add_sparse, get_avail_mem, + transpose_sum, tag_array) from gpu4pyscf.lib import logger +from gpu4pyscf.hessian import jk def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None, max_memory=4000, verbose=None): @@ -133,8 +135,8 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): avail_mem -= 8 * (h1moa.size + h1mob.size) slice_size = int(avail_mem*0.5) // (8*3*nao*nao) for atoms_slice in lib.prange(0, natm, slice_size): - vja, vka = rhf_hess._get_jk(mol, dm0a, with_k=with_k, atoms_slice=atoms_slice, verbose=verbose) - vjb, vkb = rhf_hess._get_jk(mol, dm0b, with_k=with_k, atoms_slice=atoms_slice, verbose=verbose) + vja, vka = rhf_hess._get_jk_ip1(mol, dm0a, with_k=with_k, atoms_slice=atoms_slice, verbose=verbose) + vjb, vkb = rhf_hess._get_jk_ip1(mol, dm0b, with_k=with_k, atoms_slice=atoms_slice, verbose=verbose) vj = vja + vjb if with_k: #:veffa = vja + vjb - hyb * vka @@ -151,8 +153,8 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): vj = vja = vjb = vka = vkb = None if abs(omega) > 1e-10 and abs(alpha-hyb) > 1e-10: with mol.with_range_coulomb(omega): - vka_lr = rhf_hess._get_jk(mol, dm0a, with_j=False, verbose=verbose)[1] - vkb_lr = rhf_hess._get_jk(mol, dm0b, with_j=False, verbose=verbose)[1] + vka_lr = rhf_hess._get_jk_ip1(mol, dm0a, with_j=False, verbose=verbose)[1] + vkb_lr = rhf_hess._get_jk_ip1(mol, dm0b, with_j=False, verbose=verbose)[1] vka_lr *= (alpha-hyb) vkb_lr *= (alpha-hyb) veffa -= vka_lr @@ -843,6 +845,55 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory): vmatb[ia] -= vmat_tmp return vmata, vmatb +def get_veff_resp_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1): + mol = hessobj.mol + mf = hessobj.base + grids = getattr(mf, 'cphf_grids', None) + if grids is not None: + logger.info(mf, 'Secondary grids defined for CPHF in Hessian') + else: + # If cphf_grids is not defined, e.g object defined from CPU + grids = getattr(mf, 'grids', None) + logger.info(mf, 'Primary grids is used for CPHF in Hessian') + + if grids and grids.coords is None: + grids.build(mol=mol, with_non0tab=False, sort_grids=True) + + nao, nmoa = mo_coeff[0].shape + nao, nmob = mo_coeff[1].shape + mocca = mo_coeff[0][:,mo_occ[0]>0] + moccb = mo_coeff[1][:,mo_occ[1]>0] + nocca = mocca.shape[1] + noccb = moccb.shape[1] + + ni = mf._numint + omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, mol.spin) + hybrid = ni.libxc.is_hybrid_xc(mf.xc) + assert not mf.do_nlc() + hermi = 1 + + rho0, vxc, fxc = ni.cache_xc_kernel(mol, grids, mf.xc, + mo_coeff, mo_occ, 1) + v1 = ni.nr_uks_fxc(mol, grids, mf.xc, None, dms, 0, hermi, + rho0, vxc, fxc, max_memory=None) + nset = dms.shape[1] + v1vo = cupy.empty([nset, nmoa*nocca+nmob*noccb]) + v1vo[:,:nmoa*nocca] = jk._ao2mo(v1[0], mocca, mo_coeff[0]).reshape(-1,nmoa*nocca) + v1vo[:,nmoa*nocca:] = jk._ao2mo(v1[1], moccb, mo_coeff[1]).reshape(-1,nmob*noccb) + if hybrid: + vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, (mocca, moccb), hermi=1) + vk *= hyb + if omega > 1e-10: + _, vk_lr = hessobj.get_jk_mo(mol, dms, mo_coeff, (mocca, moccb), + hermi, with_j=False, omega=omega) + vk_lr *= (alpha-hyb) + vk += vk_lr + v1vo += vj - vk + else: + v1vo += hessobj.get_jk_mo(mol, dms, mo_coeff, (mocca, moccb), + hermi=1, with_k=False)[0] + return v1vo + class Hessian(rhf_hess.HessianBase): '''Non-relativistic UKS hessian''' @@ -857,6 +908,9 @@ def __init__(self, mf): solve_mo1 = uhf_hess.Hessian.solve_mo1 partial_hess_elec = partial_hess_elec make_h1 = make_h1 + gen_vind = uhf_hess.gen_vind + get_jk_mo = rhf_hess._get_jk_mo + get_veff_resp_mo = get_veff_resp_mo from gpu4pyscf import dft dft.uks.UKS.Hessian = lib.class_as_method(Hessian) diff --git a/gpu4pyscf/properties/ir.py b/gpu4pyscf/properties/ir.py index 33a8fd3a..c398b99d 100644 --- a/gpu4pyscf/properties/ir.py +++ b/gpu4pyscf/properties/ir.py @@ -93,8 +93,9 @@ def eval_ir_freq_intensity(mf, hessian_obj): h1ao = hessian_obj.make_h1(mo_coeff, mo_occ, None, atmlst) # TODO: compact with hessian method, which can save one time cphf solve. # ! Different from PySCF, mo1 is all in mo! + fx = hessian_obj.gen_vind(mo_coeff, mo_occ) mo1, mo_e1 = hessian_obj.solve_mo1(mo_energy, mo_coeff, mo_occ, h1ao, - None, atmlst, hessian_obj.max_memory, log) + fx, atmlst, hessian_obj.max_memory, log) mo1 = cupy.asarray(mo1) mo_e1 = cupy.asarray(mo_e1) diff --git a/gpu4pyscf/scf/jk.py b/gpu4pyscf/scf/jk.py index 8cc8d99d..aaf0b119 100644 --- a/gpu4pyscf/scf/jk.py +++ b/gpu4pyscf/scf/jk.py @@ -63,9 +63,8 @@ # TODO: test different size for L2 cache efficiency NAO_IN_GROUP = 1500 -def _jk_task(mol, dms, vhfopt, task_list, +def _jk_task(mol, dms, vhfopt, task_list, hermi=0, device_id=0, with_j=True, with_k=True, verbose=0): - n_dm = dms.shape[0] nao, _ = vhfopt.coeff.shape uniq_l_ctr = vhfopt.uniq_l_ctr uniq_l = uniq_l_ctr[:,0] @@ -80,6 +79,10 @@ def _jk_task(mol, dms, vhfopt, task_list, cput0 = log.init_timer() dms = cp.asarray(dms) + if hermi == 0: + # Contract the tril and triu parts separately + dms = cp.vstack([dms, dms.transpose(0,2,1)]) + n_dm = dms.shape[0] tile_q_ptr = ctypes.cast(vhfopt.tile_q_cond.data.ptr, ctypes.c_void_p) q_ptr = ctypes.cast(vhfopt.q_cond.data.ptr, ctypes.c_void_p) s_ptr = lib.c_null_ptr() @@ -142,6 +145,18 @@ def _jk_task(mol, dms, vhfopt, task_list, t1, t1p = log.timer_debug1(msg, *t1), t1 timing_counter[llll] += t1[1] - t1p[1] kern_counts += 1 + if with_j: + if hermi == 1: + vj *= 2. + else: + vj, vjT = vj[:n_dm//2], vj[n_dm//2:] + vj += vjT.transpose(0,2,1) + if with_k: + if hermi == 1: + vk = transpose_sum(vk) + else: + vk, vkT = vk[:n_dm//2], vk[n_dm//2:] + vk += vkT.transpose(0,2,1) return vj, vk, kern_counts, timing_counter def get_jk(mol, dm, hermi=0, vhfopt=None, with_j=True, with_k=True, verbose=None): @@ -161,9 +176,7 @@ def get_jk(mol, dm, hermi=0, vhfopt=None, with_j=True, with_k=True, verbose=None #:dms = cp.einsum('pi,nij,qj->npq', vhfopt.coeff, dms, vhfopt.coeff) dms = sandwich_dot(dms, vhfopt.coeff.T) dms = cp.asarray(dms, order='C') - if hermi == 0: - # Contract the tril and triu parts separately - dms = cp.vstack([dms, dms.transpose(0,2,1)]) + n_dm = dms.shape[0] assert with_j or with_k @@ -187,7 +200,7 @@ def get_jk(mol, dm, hermi=0, vhfopt=None, with_j=True, with_k=True, verbose=None for device_id in range(_num_devices): future = executor.submit( _jk_task, - mol, dms, vhfopt, task_list[device_id], + mol, dms, vhfopt, task_list[device_id], hermi=hermi, with_j=with_j, with_k=with_k, verbose=verbose, device_id=device_id) futures.append(future) @@ -214,28 +227,19 @@ def get_jk(mol, dm, hermi=0, vhfopt=None, with_j=True, with_k=True, verbose=None vj = vk = None if with_k: vk = reduce_to_device(vk_dist, inplace=True) - if hermi == 1: - vk = transpose_sum(vk) - else: - vk, vkT = vk[:n_dm//2], vk[n_dm//2:] - vk += vkT.transpose(0,2,1) #:vk = cp.einsum('pi,npq,qj->nij', vhfopt.coeff, vk, vhfopt.coeff) vk = sandwich_dot(vk, vhfopt.coeff) vk = vk.reshape(dm.shape) if with_j: vj = reduce_to_device(vj_dist, inplace=True) - if hermi == 1: - vj *= 2. - else: - vj, vjT = vj[:n_dm//2], vj[n_dm//2:] - vj += vjT.transpose(0,2,1) vj = transpose_sum(vj) #:vj = cp.einsum('pi,npq,qj->nij', vhfopt.coeff, vj, vhfopt.coeff) vj = sandwich_dot(vj, vhfopt.coeff) vj = vj.reshape(dm.shape) h_shls = vhfopt.h_shls + assert len(h_shls) == 0 if h_shls: cput1 = log.timer_debug1('get_jk pass 1 on gpu', *cput0) log.debug3('Integrals for %s functions on CPU', l_symb[LMAX+1]) diff --git a/gpu4pyscf/scf/tests/test_scf_jk.py b/gpu4pyscf/scf/tests/test_scf_jk.py index 4f3dfb1b..4e33ebeb 100644 --- a/gpu4pyscf/scf/tests/test_scf_jk.py +++ b/gpu4pyscf/scf/tests/test_scf_jk.py @@ -16,7 +16,7 @@ import unittest import numpy as np import pyscf -from pyscf import lib +from pyscf import lib, gto from gpu4pyscf.scf import jk from pyscf.scf.hf import get_jk @@ -126,4 +126,26 @@ def test_jk_hermi0(): assert abs(vj2+vj3 - vj1).max() < 1e-9 assert abs(vk2+vk3 - vk1).max() < 1e-9 - \ No newline at end of file + +def test_jk_qz(): + basis = { + 'H': gto.basis.parse(''' +H H + 1.0240000 1.0000000 + ''') + } + mol = pyscf.M( + atom = ''' + H -0.757 0. 0.0 + H 0.757 0. 0.0 + ''', + basis=basis, + unit='B',) + nao = mol.nao + dm = np.random.rand(nao, nao) + vj_gpu, vk_gpu = jk.get_jk(mol, dm) + + vj, vk = get_jk(mol, dm) + + assert np.linalg.norm(vj_gpu.get() - vj) < 1e-9 + assert np.linalg.norm(vk_gpu.get() - vk) < 1e-9 diff --git a/gpu4pyscf/solvent/hessian/pcm.py b/gpu4pyscf/solvent/hessian/pcm.py index cd22b710..bd290b3c 100644 --- a/gpu4pyscf/solvent/hessian/pcm.py +++ b/gpu4pyscf/solvent/hessian/pcm.py @@ -27,6 +27,7 @@ from gpu4pyscf.df import int3c2e from gpu4pyscf.lib.cupy_helper import contract from gpu4pyscf.lib import logger +from gpu4pyscf.hessian.jk import _ao2mo def hess_nuc(pcmobj): if not pcmobj._intermediates: @@ -291,6 +292,30 @@ def make_h1(self, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): return h1aoa, h1aob else: raise NotImplementedError('Base object is not supported') + + def get_veff_resp_mo(self, mol, dms, mo_coeff, mo_occ, hermi=1): + v1vo = super().get_veff_resp_mo(mol, dms, mo_coeff, mo_occ, hermi=hermi) + if not self.base.with_solvent.equilibrium_solvation: + return v1vo + v_solvent = self.base.with_solvent._B_dot_x(dms) + if isinstance(self.base, scf.uhf.UHF): + n_dm = dms.shape[1] + mocca = mo_coeff[0][:,mo_occ[0]>0] + moccb = mo_coeff[1][:,mo_occ[1]>0] + moa, mob = mo_coeff + nmoa = moa.shape[1] + nocca = mocca.shape[1] + v1vo_sol = v_solvent[0] + v_solvent[1] + v1vo[:,:nmoa*nocca] += _ao2mo(v1vo_sol, mocca, moa).reshape(n_dm,-1) + v1vo[:,nmoa*nocca:] += _ao2mo(v1vo_sol, moccb, mob).reshape(n_dm,-1) + elif isinstance(self.base, scf.hf.RHF): + n_dm = dms.shape[0] + mocc = mo_coeff[:,mo_occ>0] + v1vo += _ao2mo(v_solvent, mocc, mo_coeff).reshape(n_dm,-1) + else: + raise NotImplementedError('Base object is not supported') + return v1vo + def _finalize(self): # disable _finalize. It is called in grad_method.kernel method # where self.de was not yet initialized. diff --git a/gpu4pyscf/solvent/hessian/smd.py b/gpu4pyscf/solvent/hessian/smd.py index 3e8c5238..4bfad79b 100644 --- a/gpu4pyscf/solvent/hessian/smd.py +++ b/gpu4pyscf/solvent/hessian/smd.py @@ -26,6 +26,7 @@ from gpu4pyscf.solvent.grad import smd as smd_grad from gpu4pyscf.solvent.grad import pcm as pcm_grad from gpu4pyscf.solvent.hessian import pcm as pcm_hess +from gpu4pyscf.hessian.jk import _ao2mo def get_cds(smdobj): mol = smdobj.mol @@ -171,6 +172,31 @@ def make_h1(self, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): return h1aoa, h1aob else: raise NotImplementedError('Base object is not supported') + + def get_veff_resp_mo(self, mol, dms, mo_coeff, mo_occ, hermi=1): + v1vo = super().get_veff_resp_mo(mol, dms, mo_coeff, mo_occ, hermi=hermi) + if not self.base.with_solvent.equilibrium_solvation: + return v1vo + v_solvent = self.base.with_solvent._B_dot_x(dms) + + if isinstance(self.base, scf.uhf.UHF): + n_dm = dms.shape[1] + mocca = mo_coeff[0][:,mo_occ[0]>0] + moccb = mo_coeff[1][:,mo_occ[1]>0] + moa, mob = mo_coeff + nmoa = moa.shape[1] + nocca = mocca.shape[1] + v1vo_sol = v_solvent[0] + v_solvent[1] + v1vo[:,:nmoa*nocca] += _ao2mo(v1vo_sol, mocca, moa).reshape(n_dm,-1) + v1vo[:,nmoa*nocca:] += _ao2mo(v1vo_sol, moccb, mob).reshape(n_dm,-1) + elif isinstance(self.base, scf.hf.RHF): + n_dm = dms.shape[0] + mocc = mo_coeff[:,mo_occ>0] + v1vo += _ao2mo(v_solvent, mocc, mo_coeff).reshape(n_dm,-1) + else: + raise NotImplementedError('Base object is not supported') + return v1vo + def _finalize(self): # disable _finalize. It is called in grad_method.kernel method # where self.de was not yet initialized. diff --git a/gpu4pyscf/solvent/tests/test_smd_hessian.py b/gpu4pyscf/solvent/tests/test_smd_hessian.py index 82fb8ea2..4134d47d 100644 --- a/gpu4pyscf/solvent/tests/test_smd_hessian.py +++ b/gpu4pyscf/solvent/tests/test_smd_hessian.py @@ -258,6 +258,7 @@ def test_to_cpu(self): hess_gpu = hessobj.kernel() hessobj = hessobj.to_cpu() hess_cpu = hessobj.kernel() + print(numpy.linalg.norm(hess_cpu - hess_gpu)) assert numpy.linalg.norm(hess_cpu - hess_gpu) < 1e-5 if __name__ == "__main__": diff --git a/gpu4pyscf/tests/test_dft.py b/gpu4pyscf/tests/test_dft.py index 860a435c..182313b9 100644 --- a/gpu4pyscf/tests/test_dft.py +++ b/gpu4pyscf/tests/test_dft.py @@ -72,7 +72,7 @@ def test_b3lyp_with_d3bj(self): h = mf.Hessian().kernel() assert np.abs(cupy.linalg.norm(h) - 3.7684319231335377) < 1e-4 - + @pytest.mark.smoke def test_b3lyp_d3bj(self): print('-------- DFRKS with D3(BJ) -------') From 41c27c9cb9bdb11439ecd60e89aeb27162112b6e Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Thu, 19 Dec 2024 22:32:28 +0000 Subject: [PATCH 02/49] fixed bug in df.hessian.uhf --- gpu4pyscf/df/hessian/uhf.py | 7 +- gpu4pyscf/df/int3c2e.py | 185 ------------------------------------ gpu4pyscf/hessian/jk.py | 11 +-- gpu4pyscf/hessian/rhf.py | 76 ++++++++------- 4 files changed, 47 insertions(+), 232 deletions(-) diff --git a/gpu4pyscf/df/hessian/uhf.py b/gpu4pyscf/df/hessian/uhf.py index 0345c32c..4182c25d 100644 --- a/gpu4pyscf/df/hessian/uhf.py +++ b/gpu4pyscf/df/hessian/uhf.py @@ -349,7 +349,9 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, hk_ao_ao *= 2.0 e1 = cupy.zeros([len(atmlst),len(atmlst),3,3]) ej = hj_ipip - ek = hk_ipip + ek = None + if with_k: + ek = hk_ipip for i0, ia in enumerate(atmlst): shl0, shl1, p0, p1 = aoslices[ia] e1[i0,i0] -= cupy.sum(h1aa[p0:p1], axis=0) @@ -401,7 +403,8 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, for j0 in range(i0): e1[j0,i0] = e1[i0,j0].T ej[j0,i0] = ej[i0,j0].T - ek[j0,i0] = ek[i0,j0].T + if with_k: + ek[j0,i0] = ek[i0,j0].T t1 = log.timer_debug1('hcore contribution', *t1) log.timer('UHF partial hessian', *time0) return e1, ej, ek diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py index 68630520..f89fb07c 100644 --- a/gpu4pyscf/df/int3c2e.py +++ b/gpu4pyscf/df/int3c2e.py @@ -1071,191 +1071,6 @@ def get_int3c2e_ip2_wjk(intopt, dm0_tag, with_k=True, omega=None): wk = reduce_to_device(wk_total, inplace=True) return wj, wk -def _int3c2e_ipip1_hjk(intopt, task_list, rhoj, rhok, dm0, orbo, - device_id=0, with_k=True, omega=None): - with cupy.cuda.Device(device_id), _streams[device_id]: - rhoj = cupy.asarray(rhoj) - rhok = cupy.asarray(rhok) - orbo = cupy.asarray(orbo) - dm0 = cupy.asarray(dm0) - nao = dm0.shape[0] - hj = cupy.zeros([nao,9]) - hk = None - if with_k: - hk = cupy.zeros([nao,9]) - for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, task_list=task_list, - ip_type='ipip1', omega=omega): - tmp = contract('xpji,ij->xpi', int3c_blk, dm0[i0:i1,j0:j1]) - hj[i0:i1] += contract('xpi,p->ix', tmp, rhoj[k0:k1]) - if with_k: - rhok_tmp = contract('por,ir->pio', rhok[k0:k1], orbo[i0:i1]) - rhok_tmp = contract('pio,jo->pij', rhok_tmp, orbo[j0:j1]) - hk[i0:i1] += contract('xpji,pij->ix', int3c_blk, rhok_tmp) - hj = hj.reshape([nao,3,3]) - if with_k: - hk = hk.reshape([nao,3,3]) - return hj, hk - -def _int3c2e_ipvip1_hjk(intopt, task_list, rhoj, rhok, dm0, orbo, - device_id=0, with_k=True, omega=None): - with cupy.cuda.Device(device_id), _streams[device_id]: - rhoj = cupy.asarray(rhoj) - rhok = cupy.asarray(rhok) - orbo = cupy.asarray(orbo) - dm0 = cupy.asarray(dm0) - nao = dm0.shape[0] - hj = cupy.zeros([nao,nao,9]) - hk = None - if with_k: - hk = cupy.zeros([nao,nao,9]) - for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, task_list=task_list, - ip_type='ipvip1', omega=omega): - tmp = contract('xpji,ij->xpij', int3c_blk, dm0[i0:i1,j0:j1]) - hj[i0:i1,j0:j1] += contract('xpij,p->ijx', tmp, rhoj[k0:k1]) - if with_k: - rhok_tmp = contract('por,ir->pio', rhok[k0:k1], orbo[i0:i1]) - rhok_tmp = contract('pio,jo->pji', rhok_tmp, orbo[j0:j1]) - hk[i0:i1,j0:j1] += contract('xpji,pji->ijx', int3c_blk, rhok_tmp) - hj = hj.reshape([nao,nao,3,3]) - if with_k: - hk = hk.reshape([nao,nao,3,3]) - return hj, hk - -def _int3c2e_ip1ip2_hjk(intopt, task_list, rhoj, rhok, dm0, orbo, - device_id=0, with_k=True, omega=None): - with cupy.cuda.Device(device_id), _streams[device_id]: - naux = rhok.shape[0] - rhoj = cupy.asarray(rhoj) - rhok = cupy.asarray(rhok) - orbo = cupy.asarray(orbo) - dm0 = cupy.asarray(dm0) - nao = dm0.shape[0] - hj = cupy.zeros([nao,naux,9]) - hk = None - if with_k: - hk = cupy.zeros([nao,naux,9]) - for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, task_list=task_list, - ip_type='ip1ip2', omega=omega): - tmp = contract('xpji,ij->xpi', int3c_blk, dm0[i0:i1,j0:j1]) - hj[i0:i1,k0:k1] += contract('xpi,p->ipx', tmp, rhoj[k0:k1]) - if with_k: - rhok_tmp = contract('por,ir->pio', rhok[k0:k1], orbo[i0:i1]) - rhok_tmp = contract('pio,jo->pij', rhok_tmp, orbo[j0:j1]) - hk[i0:i1,k0:k1] += contract('xpji,pij->ipx', int3c_blk, rhok_tmp) - hj = hj.reshape([nao,naux,3,3]) - if with_k: - hk = hk.reshape([nao,naux,3,3]) - return hj, hk - -def _int3c2e_ipip2_hjk(intopt, task_list, rhoj, rhok, dm0, orbo, - device_id=0, with_k=True, omega=None): - with cupy.cuda.Device(device_id), _streams[device_id]: - naux = rhok.shape[0] - rhoj = cupy.asarray(rhoj) - rhok = cupy.asarray(rhok) - orbo = cupy.asarray(orbo) - dm0 = cupy.asarray(dm0) - hj = cupy.zeros([naux,9]) - hk = None - if with_k: - hk = cupy.zeros([naux,9]) - for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, task_list=task_list, - ip_type='ipip2', omega=omega): - tmp = contract('xpji,ij->xp', int3c_blk, dm0[i0:i1,j0:j1]) - hj[k0:k1] += contract('xp,p->px', tmp, rhoj[k0:k1]) - if with_k: - rhok_tmp = contract('por,jr->pjo', rhok[k0:k1], orbo[j0:j1]) - rhok_tmp = contract('pjo,io->pji', rhok_tmp, orbo[i0:i1]) - hk[k0:k1] += contract('xpji,pji->px', int3c_blk, rhok_tmp) - hj = hj.reshape([naux,3,3]) - if with_k: - hk = hk.reshape([naux,3,3]) - return hj, hk - -def get_int3c2e_hjk(intopt, task_type, rhoj, rhok, dm0_tag, with_k=True, omega=None): - if task_type == 'ipip1': task_fn = _int3c2e_ipip1_hjk - if task_type == 'ipip2': task_fn = _int3c2e_ipip2_hjk - if task_type == 'ip1ip2': task_fn = _int3c2e_ip1ip2_hjk - if task_type == 'ipvip1': task_fn = _int3c2e_ipvip1_hjk - - orbo = cupy.asarray(dm0_tag.occ_coeff, order='C') - futures = [] - ncp_k = len(intopt.aux_log_qs) - ncp_ij = len(intopt.log_qs) - tasks = np.array(list(itertools.product(range(ncp_k), range(ncp_ij)))) - task_list = [] - for device_id in range(_num_devices): - task_list.append(tasks[device_id::_num_devices]) - - cupy.cuda.get_current_stream().synchronize() - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): - future = executor.submit( - task_fn, intopt, task_list[device_id], - rhoj, rhok, dm0_tag, orbo, with_k=with_k, device_id=device_id, omega=omega) - futures.append(future) - - hj_total = [] - hk_total = [] - for future in futures: - hj, hk = future.result() - hj_total.append(hj) - hk_total.append(hk) - - hj = hk = None - hj = reduce_to_device(hj_total, inplace=True) - if with_k: - hk = reduce_to_device(hk_total, inplace=True) - return hj, hk - -def get_hess_nuc_elec(mol, dm): - ''' - calculate int1e_ipiprinv contribution - ''' - coords = mol.atom_coords() - charges = cupy.asarray(mol.atom_charges(), dtype=np.float64) - - fakemol = gto.fakemol_for_charges(coords) - fakemol.output = mol.output - fakemol.verbose = mol.verbose - fakemol.stdout = mol.stdout - intopt = VHFOpt(mol, fakemol, 'int2e') - intopt.build(1e-14, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE) - dm = intopt.sort_orbitals(cupy.asarray(dm), axis=[0,1]) - - natm = mol.natm - nao = mol.nao - hcore_diag = cupy.zeros([9,natm]) - hcore_aa = cupy.zeros([9,natm,nao]) - for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ipip1'): - haa = contract('xpji,ij->xpi', int3c_blk, dm[i0:i1,j0:j1]) - hcore_aa[:,k0:k1,i0:i1] += haa - hcore_diag[:,k0:k1] -= contract('xpji,ij->xp', int3c_blk, dm[i0:i1,j0:j1]) - - hcore_ab = cupy.zeros([9,natm,nao]) - for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ipvip1'): - hab = contract('xpji,ij->xpi', int3c_blk, dm[i0:i1,j0:j1]) - hcore_ab[:,k0:k1,i0:i1] += hab - hcore_diag[:,k0:k1] -= contract('xpji,ij->xp', int3c_blk, dm[i0:i1,j0:j1]) - - hcore_diag = contract('xp,p->xp', hcore_diag, charges) - hcore_aa = contract('xpj,p->xpj', hcore_aa, charges) - hcore_ab = contract('xpj,p->xpj', hcore_ab, charges) - - aoslices = mol.aoslice_by_atom() - ao2atom = get_ao2atom(intopt, aoslices) - - hcore_aa = contract('xpj,jq->xpq', hcore_aa, ao2atom).reshape([3,3,natm,natm]) - hcore_ab = contract('xpj,jq->xpq', hcore_ab, ao2atom).reshape([3,3,natm,natm]) - hcore = hcore_aa + hcore_aa.transpose([1,0,3,2]) - hcore+= hcore_ab.transpose([1,0,2,3]) + hcore_ab.transpose([0,1,3,2]) - hcore_diag = hcore_diag.reshape([3,3,natm]) - idx = np.arange(natm) - for x in range(3): - for y in range(3): - hcore[x,y,idx,idx] += hcore_diag[x,y] - return hcore - def get_int3c2e_ip_slice(intopt, cp_aux_id, ip_type, out=None, omega=None, stream=None): ''' Generate int3c2e_ip slice along k, full dimension in ij diff --git a/gpu4pyscf/hessian/jk.py b/gpu4pyscf/hessian/jk.py index 7f3eeb60..6dedd447 100644 --- a/gpu4pyscf/hessian/jk.py +++ b/gpu4pyscf/hessian/jk.py @@ -265,8 +265,8 @@ def get_jk(mol, dm, mo_coeff, mocc, hermi=0, vhfopt=None, if isinstance(mocc, tuple): mocca, moccb = mocc moa, mob = mo_coeff - nmoa, nmob = moa.shape[1], mob.shape[1] - nocca, noccb = mocca.shape[1], moccb.shape[1] + nmoa = moa.shape[1] + nocca = mocca.shape[1] n_dm_2 = n_dm//2 if with_j: vjab = vj1[:n_dm_2] + vj1[n_dm_2:] @@ -280,17 +280,10 @@ def get_jk(mol, dm, mo_coeff, mocc, hermi=0, vhfopt=None, if with_j: vj1[:,idy,idx] = vj1[:,idx,idy] vj += _ao2mo(cp.asarray(vj1), mocc, mo_coeff) - #for i, v in enumerate(vj1): - # vj[i] += coeff.T.dot(cp.asarray(v)).dot(coeff) if with_k: if hermi: vk1[:,idy,idx] = vk1[:,idx,idy] vk += _ao2mo(cp.asarray(vk1), mocc, mo_coeff) - #for i, v in enumerate(vk1): - # vk[i] += coeff.T.dot(cp.asarray(v)).dot(coeff) - - # TODO: convert vj and vk into MO log.timer_debug1('get_jk pass 2 for h functions on cpu', *cput1) - log.timer('vj and vk', *cput0) return vj, vk diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py index 96a90359..bec9d0e0 100644 --- a/gpu4pyscf/hessian/rhf.py +++ b/gpu4pyscf/hessian/rhf.py @@ -25,7 +25,7 @@ from collections import Counter from concurrent.futures import ThreadPoolExecutor from pyscf.hessian import rhf as rhf_hess_cpu -from pyscf import lib +from pyscf import lib, gto from pyscf.gto import ATOM_OF # import _response_functions to load gen_response methods in SCF class from gpu4pyscf.scf import _response_functions # noqa @@ -728,46 +728,50 @@ def hess_nuc_elec(mol, dm): ''' calculate hessian contribution due to (nuc, elec) pair ''' + from gpu4pyscf.df import int3c2e + coords = mol.atom_coords() + charges = cupy.asarray(mol.atom_charges(), dtype=np.float64) + + fakemol = gto.fakemol_for_charges(coords) + fakemol.output = mol.output + fakemol.verbose = mol.verbose + fakemol.stdout = mol.stdout + intopt = int3c2e.VHFOpt(mol, fakemol, 'int2e') + intopt.build(1e-14, diag_block_with_triu=True, aosym=False, + group_size=int3c2e.BLKSIZE, group_size_aux=int3c2e.BLKSIZE) + dm = intopt.sort_orbitals(cupy.asarray(dm), axis=[0,1]) - ''' - nao = mol.nao - aoslices = mol.aoslice_by_atom() natm = mol.natm - hcore = numpy.zeros([3,3,natm,natm]) - # CPU version - for ia in range(mol.natm): - ish0, ish1, i0, i1 = aoslices[ia] - zi = mol.atom_charge(ia) - with mol.with_rinv_at_nucleus(ia): - rinv2aa = mol.intor('int1e_ipiprinv', comp=9).reshape([3,3,nao,nao]) - rinv2ab = mol.intor('int1e_iprinvip', comp=9).reshape([3,3,nao,nao]) - rinv2aa *= zi - rinv2ab *= zi - - hcore[:,:,ia,ia] -= numpy.einsum('xypq,pq->xy', rinv2aa+rinv2ab, dm) - - haa = numpy.einsum('xypq,pq->xyp', rinv2aa, dm) - hab = numpy.einsum('xypq,pq->xyp', rinv2ab, dm) - - haa = [haa[:,:,p0:p1].sum(axis=2) for p0,p1 in aoslices[:,2:]] - hab = [hab[:,:,p0:p1].sum(axis=2) for p0,p1 in aoslices[:,2:]] - - haa = numpy.stack(haa, axis=2) - hab = numpy.stack(hab, axis=2) - - hcore[:,:,ia] += haa - hcore[:,:,ia] += hab.transpose([1,0,2]) - - hcore[:,:,:,ia] += haa.transpose([1,0,2]) - hcore[:,:,:,ia] += hab + nao = mol.nao + hcore_diag = cupy.zeros([9,natm]) + hcore_aa = cupy.zeros([9,natm,nao]) + for i0,i1,j0,j1,k0,k1,int3c_blk in int3c2e.loop_int3c2e_general(intopt, ip_type='ipip1'): + haa = contract('xpji,ij->xpi', int3c_blk, dm[i0:i1,j0:j1]) + hcore_aa[:,k0:k1,i0:i1] += haa + hcore_diag[:,k0:k1] -= contract('xpji,ij->xp', int3c_blk, dm[i0:i1,j0:j1]) + + hcore_ab = cupy.zeros([9,natm,nao]) + for i0,i1,j0,j1,k0,k1,int3c_blk in int3c2e.loop_int3c2e_general(intopt, ip_type='ipvip1'): + hab = contract('xpji,ij->xpi', int3c_blk, dm[i0:i1,j0:j1]) + hcore_ab[:,k0:k1,i0:i1] += hab + hcore_diag[:,k0:k1] -= contract('xpji,ij->xp', int3c_blk, dm[i0:i1,j0:j1]) + + hcore_diag = contract('xp,p->xp', hcore_diag, charges) + hcore_aa = contract('xpj,p->xpj', hcore_aa, charges) + hcore_ab = contract('xpj,p->xpj', hcore_ab, charges) - hcore = cupy.asarray(hcore) - ''' - from gpu4pyscf.df import int3c2e - hcore = int3c2e.get_hess_nuc_elec(mol, dm) + aoslices = mol.aoslice_by_atom() + ao2atom = int3c2e.get_ao2atom(intopt, aoslices) + + hcore_aa = contract('xpj,jq->xpq', hcore_aa, ao2atom).reshape([3,3,natm,natm]) + hcore_ab = contract('xpj,jq->xpq', hcore_ab, ao2atom).reshape([3,3,natm,natm]) + hcore = hcore_aa + hcore_aa.transpose([1,0,3,2]) + hcore+= hcore_ab.transpose([1,0,2,3]) + hcore_ab.transpose([0,1,3,2]) + hcore_diag = hcore_diag.reshape([3,3,natm]) + idx = np.arange(natm) + hcore[:,:,idx,idx] += hcore_diag return hcore * 2.0 - def kernel(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None): cput0 = (logger.process_clock(), logger.perf_counter()) if mo_energy is None: mo_energy = hessobj.base.mo_energy From 31418f2be53a13595bbf7bb3455c49501742820a Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Thu, 19 Dec 2024 23:22:07 +0000 Subject: [PATCH 03/49] update license --- gpu4pyscf/df/hessian/jk.py | 22 +++++++++++----------- gpu4pyscf/hessian/jk.py | 22 +++++++++++----------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/gpu4pyscf/df/hessian/jk.py b/gpu4pyscf/df/hessian/jk.py index 16010bda..3fc7ac08 100644 --- a/gpu4pyscf/df/hessian/jk.py +++ b/gpu4pyscf/df/hessian/jk.py @@ -1,17 +1,17 @@ -# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved. +# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. # -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. # -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . import ctypes import itertools diff --git a/gpu4pyscf/hessian/jk.py b/gpu4pyscf/hessian/jk.py index 6dedd447..60525fc8 100644 --- a/gpu4pyscf/hessian/jk.py +++ b/gpu4pyscf/hessian/jk.py @@ -1,17 +1,17 @@ -# Copyright 2024 The GPU4PySCF Authors. All Rights Reserved. +# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. # -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. # -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . ''' Compute J/K matrices for Hessian From a7103a94d622884e3fc34264519d336ba0540e73 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Thu, 19 Dec 2024 23:42:26 +0000 Subject: [PATCH 04/49] format code --- gpu4pyscf/df/hessian/rhf.py | 3 --- gpu4pyscf/df/hessian/rks.py | 3 --- .../df/hessian/tests/test_df_rhf_hessian.py | 22 +++++++++--------- .../df/hessian/tests/test_df_rks_hessian.py | 22 +++++++++--------- gpu4pyscf/df/hessian/uhf.py | 3 --- gpu4pyscf/df/hessian/uks.py | 4 ---- gpu4pyscf/scf/tests/test_scf_jk.py | 23 ------------------- gpu4pyscf/solvent/tests/test_smd_hessian.py | 1 - 8 files changed, 22 insertions(+), 59 deletions(-) diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py index bd806c50..d4fda5e3 100644 --- a/gpu4pyscf/df/hessian/rhf.py +++ b/gpu4pyscf/df/hessian/rhf.py @@ -614,10 +614,7 @@ class Hessian(rhf_hess.Hessian): from gpu4pyscf.lib.utils import to_gpu, device - #__init__ = rhf_hess.Hessian.__init__ auxbasis_response = 1 partial_hess_elec = partial_hess_elec make_h1 = make_h1 - #kernel = rhf_hess.kernel - #hess = kernel get_jk_mo = _get_jk_mo diff --git a/gpu4pyscf/df/hessian/rks.py b/gpu4pyscf/df/hessian/rks.py index c6d65daa..2606e8e4 100644 --- a/gpu4pyscf/df/hessian/rks.py +++ b/gpu4pyscf/df/hessian/rks.py @@ -108,10 +108,7 @@ class Hessian(rks_hess.Hessian): '''Non-relativistic RKS hessian''' from gpu4pyscf.lib.utils import to_gpu, device - #__init__ = rks_hess.Hessian.__init__ auxbasis_response = 1 partial_hess_elec = partial_hess_elec make_h1 = make_h1 - #kernel = rhf_hess.kernel - #hess = kernel get_jk_mo = df_rhf_hess._get_jk_mo diff --git a/gpu4pyscf/df/hessian/tests/test_df_rhf_hessian.py b/gpu4pyscf/df/hessian/tests/test_df_rhf_hessian.py index b8560002..a3e13260 100644 --- a/gpu4pyscf/df/hessian/tests/test_df_rhf_hessian.py +++ b/gpu4pyscf/df/hessian/tests/test_df_rhf_hessian.py @@ -1,17 +1,17 @@ -# Copyright 2024 The GPU4PySCF Authors. All Rights Reserved. +# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. # -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. # -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . import unittest import numpy diff --git a/gpu4pyscf/df/hessian/tests/test_df_rks_hessian.py b/gpu4pyscf/df/hessian/tests/test_df_rks_hessian.py index 5a853a95..f737e92a 100644 --- a/gpu4pyscf/df/hessian/tests/test_df_rks_hessian.py +++ b/gpu4pyscf/df/hessian/tests/test_df_rks_hessian.py @@ -1,17 +1,17 @@ -# Copyright 2024 The GPU4PySCF Authors. All Rights Reserved. +# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. # -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. # -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . import unittest import numpy diff --git a/gpu4pyscf/df/hessian/uhf.py b/gpu4pyscf/df/hessian/uhf.py index 4182c25d..e1c8250b 100644 --- a/gpu4pyscf/df/hessian/uhf.py +++ b/gpu4pyscf/df/hessian/uhf.py @@ -668,10 +668,7 @@ class Hessian(uhf_hess.Hessian): from gpu4pyscf.lib.utils import to_gpu, device - __init__ = uhf_hess.Hessian.__init__ auxbasis_response = 1 partial_hess_elec = partial_hess_elec make_h1 = make_h1 - kernel = rhf_hess.kernel - hess = kernel get_jk_mo = _get_jk_mo diff --git a/gpu4pyscf/df/hessian/uks.py b/gpu4pyscf/df/hessian/uks.py index 4b6cdb11..6bf09803 100644 --- a/gpu4pyscf/df/hessian/uks.py +++ b/gpu4pyscf/df/hessian/uks.py @@ -119,11 +119,7 @@ class Hessian(uks_hess.Hessian): '''Non-relativistic RKS hessian''' from gpu4pyscf.lib.utils import to_gpu, device - __init__ = uks_hess.Hessian.__init__ auxbasis_response = 1 partial_hess_elec = partial_hess_elec make_h1 = make_h1 - hess_elec = uhf_hess.hess_elec - kernel = rhf_hess.kernel - hess = kernel get_jk_mo = df_uhf_hess._get_jk_mo diff --git a/gpu4pyscf/scf/tests/test_scf_jk.py b/gpu4pyscf/scf/tests/test_scf_jk.py index e04b3ff1..78ae68eb 100644 --- a/gpu4pyscf/scf/tests/test_scf_jk.py +++ b/gpu4pyscf/scf/tests/test_scf_jk.py @@ -125,26 +125,3 @@ def test_jk_hermi0(): assert abs(vj2+vj3 - vj1).max() < 1e-9 assert abs(vk2+vk3 - vk1).max() < 1e-9 - -def test_jk_qz(): - basis = { - 'H': gto.basis.parse(''' -H H - 1.0240000 1.0000000 - ''') - } - mol = pyscf.M( - atom = ''' - H -0.757 0. 0.0 - H 0.757 0. 0.0 - ''', - basis=basis, - unit='B',) - nao = mol.nao - dm = np.random.rand(nao, nao) - vj_gpu, vk_gpu = jk.get_jk(mol, dm) - - vj, vk = get_jk(mol, dm) - - assert np.linalg.norm(vj_gpu.get() - vj) < 1e-9 - assert np.linalg.norm(vk_gpu.get() - vk) < 1e-9 diff --git a/gpu4pyscf/solvent/tests/test_smd_hessian.py b/gpu4pyscf/solvent/tests/test_smd_hessian.py index 6fbd5580..9c536f63 100644 --- a/gpu4pyscf/solvent/tests/test_smd_hessian.py +++ b/gpu4pyscf/solvent/tests/test_smd_hessian.py @@ -257,7 +257,6 @@ def test_to_cpu(self): hess_gpu = hessobj.kernel() hessobj = hessobj.to_cpu() hess_cpu = hessobj.kernel() - print(numpy.linalg.norm(hess_cpu - hess_gpu)) assert numpy.linalg.norm(hess_cpu - hess_gpu) < 1e-5 if __name__ == "__main__": From 286a3b07fbffae8e3e5695f66aab4bae74b22d78 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Fri, 20 Dec 2024 05:46:51 +0000 Subject: [PATCH 05/49] support h function in hessian.jk --- gpu4pyscf/df/hessian/jk.py | 2 +- gpu4pyscf/hessian/jk.py | 26 ++++---- gpu4pyscf/hessian/tests/test_rhf_hessian.py | 59 +++++++++++++++++- gpu4pyscf/hessian/tests/test_uhf_hessian.py | 67 ++++++++++++++++++++- gpu4pyscf/scf/jk.py | 11 ++-- gpu4pyscf/scf/tests/test_rhf.py | 4 +- 6 files changed, 144 insertions(+), 25 deletions(-) diff --git a/gpu4pyscf/df/hessian/jk.py b/gpu4pyscf/df/hessian/jk.py index 3fc7ac08..5a299bcf 100644 --- a/gpu4pyscf/df/hessian/jk.py +++ b/gpu4pyscf/df/hessian/jk.py @@ -345,7 +345,7 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo, hj_ipip2[k0:k1] += contract('xp,p->px', tmp, rhoj[k0:k1]) if with_k: hk_ipip2[k0:k1] += contract('xpji,pij->px', int3c_blk, rhok_tmp) - + auxslices = intopt.auxmol.aoslice_by_atom() aoslices = intopt.mol.aoslice_by_atom() ao2atom = int3c2e.get_ao2atom(intopt, aoslices) diff --git a/gpu4pyscf/hessian/jk.py b/gpu4pyscf/hessian/jk.py index 60525fc8..5a1f75ab 100644 --- a/gpu4pyscf/hessian/jk.py +++ b/gpu4pyscf/hessian/jk.py @@ -157,7 +157,7 @@ def get_jk(mol, dm, mo_coeff, mocc, hermi=0, vhfopt=None, ''' log = logger.new_logger(mol, verbose) cput0 = log.init_timer() - + assert hermi == 1 if vhfopt is None: vhfopt = _VHFOpt(mol).build() @@ -233,7 +233,6 @@ def get_jk(mol, dm, mo_coeff, mocc, hermi=0, vhfopt=None, vj = reduce_to_device(vj_dist, inplace=True) h_shls = vhfopt.h_shls - assert len(h_shls) == 0 if h_shls: cput1 = log.timer_debug1('get_jk pass 1 on gpu', *cput0) log.debug3('Integrals for %s functions on CPU', l_symb[LMAX+1]) @@ -246,12 +245,8 @@ def get_jk(mol, dm, mo_coeff, mocc, hermi=0, vhfopt=None, else: scripts.append('jk->s1il') shls_excludes = [0, h_shls[0]] * 4 - if hermi == 1: - dms = dms.get() - else: - dms = dms[:n_dm//2].get() vs_h = _vhf.direct_mapdm('int2e_cart', 's8', scripts, - dms, 1, mol._atm, mol._bas, mol._env, + dms.get(), 1, mol._atm, mol._bas, mol._env, shls_excludes=shls_excludes) if with_j and with_k: vj1 = vs_h[0] @@ -260,8 +255,14 @@ def get_jk(mol, dm, mo_coeff, mocc, hermi=0, vhfopt=None, vj1 = vs_h[0] else: vk1 = vs_h[0] - coeff = vhfopt.coeff + idx, idy = np.tril_indices(nao, -1) + if hermi == 1: + if with_j: + vj1[:,idy,idx] = vj1[:,idx,idy] + if with_k: + vk1[:,idy,idx] = vk1[:,idx,idy] + if isinstance(mocc, tuple): mocca, moccb = mocc moa, mob = mo_coeff @@ -273,17 +274,14 @@ def get_jk(mol, dm, mo_coeff, mocc, hermi=0, vhfopt=None, vj[:,:nmoa*nocca] += _ao2mo(vjab, mocca, moa).reshape(n_dm_2,-1) vj[:,nmoa*nocca:] += _ao2mo(vjab, moccb, mob).reshape(n_dm_2,-1) if with_k: - vka, vkb = vk[:n_dm_2], vk[n_dm_2:] + vka, vkb = vk1[:n_dm_2], vk1[n_dm_2:] vk[:,:nmoa*nocca] += _ao2mo(vka, mocca, moa).reshape(n_dm_2,-1) vk[:,nmoa*nocca:] += _ao2mo(vkb, moccb, mob).reshape(n_dm_2,-1) else: if with_j: - vj1[:,idy,idx] = vj1[:,idx,idy] - vj += _ao2mo(cp.asarray(vj1), mocc, mo_coeff) + vj += _ao2mo(cp.asarray(vj1), mocc, mo_coeff).reshape(n_dm,-1) if with_k: - if hermi: - vk1[:,idy,idx] = vk1[:,idx,idy] - vk += _ao2mo(cp.asarray(vk1), mocc, mo_coeff) + vk += _ao2mo(cp.asarray(vk1), mocc, mo_coeff).reshape(n_dm,-1) log.timer_debug1('get_jk pass 2 for h functions on cpu', *cput1) log.timer('vj and vk', *cput0) return vj, vk diff --git a/gpu4pyscf/hessian/tests/test_rhf_hessian.py b/gpu4pyscf/hessian/tests/test_rhf_hessian.py index dc27af38..e9aef60c 100644 --- a/gpu4pyscf/hessian/tests/test_rhf_hessian.py +++ b/gpu4pyscf/hessian/tests/test_rhf_hessian.py @@ -14,10 +14,14 @@ import unittest import numpy as np -from pyscf import gto, scf, lib +import cupy +import pyscf +from pyscf import gto, lib from pyscf import grad, hessian from pyscf.hessian import rhf as rhf_cpu +from gpu4pyscf import scf from gpu4pyscf.hessian import rhf as rhf_gpu +from gpu4pyscf.hessian import jk def setUpModule(): global mol @@ -46,7 +50,7 @@ def test_hessian_rhf(self): assert abs(ref - e2_gpu).max() < 1e-6 def test_partial_hess_elec(self): - mf = scf.RHF(mol) + mf = pyscf.scf.RHF(mol) mf.conv_tol = 1e-14 mf.kernel() hobj = mf.Hessian() @@ -139,6 +143,57 @@ def test_hessian_rhf_D3(self): e2_gpu = mf.Hessian().to_gpu().kernel() assert abs(ref - e2_gpu).max() < 1e-6 + def test_jk_mix(self): + mol1 = pyscf.M( + atom=''' + C -1.20806619, -0.34108413, -0.00755148 + C 1.28636081, -0.34128013, -0.00668648 + H 2.53407081, 1.81906387, -0.00736748 + H 1.28693681, 3.97963587, -0.00925948 + ''', + basis='''unc + #BASIS SET: + H S + 1.815041 1 + 0.591063 1 + H P + 2.305000 1 + #BASIS SET: + C S + 8.383976 1 + 3.577015 1 + 1.547118 1 + H P + 2.305000 1 + 1.098827 1 + 0.806750 1 + 0.282362 1 + H D + 1.81900 1 + 0.72760 1 + 0.29104 1 + H F + 0.970109 1 + C G + 0.625000 1 + C H + 0.4 1 + ''', + output = '/dev/null' + ) + nao = mol1.nao + mo_coeff = cupy.random.rand(nao, nao) + mocc = mo_coeff[:,:3] + dm = mocc.dot(mocc.T) * 2 + vj_mo, vk_mo = jk.get_jk(mol1, dm, mo_coeff, mocc) + + mf = scf.RHF(mol1) + vj, vk = mf.get_jk(mol1, dm, hermi=1) + vj_cpu = (mo_coeff.T @ vj @ mocc).reshape(1,-1) + vk_cpu = (mo_coeff.T @ vk @ mocc).reshape(1,-1) + assert cupy.linalg.norm(vj_cpu - vj_mo) < 1e-5 + assert cupy.linalg.norm(vk_cpu - vk_mo) < 1e-5 + if __name__ == "__main__": print("Full Tests for RHF Hessian") unittest.main() diff --git a/gpu4pyscf/hessian/tests/test_uhf_hessian.py b/gpu4pyscf/hessian/tests/test_uhf_hessian.py index c4112bec..1e10306c 100644 --- a/gpu4pyscf/hessian/tests/test_uhf_hessian.py +++ b/gpu4pyscf/hessian/tests/test_uhf_hessian.py @@ -14,10 +14,14 @@ import unittest import numpy -from pyscf import gto, scf, lib +import cupy +import pyscf +from pyscf import gto, lib from pyscf import grad, hessian from pyscf.hessian import uhf as uhf_cpu +from gpu4pyscf import scf from gpu4pyscf.hessian import uhf as uhf_gpu +from gpu4pyscf.hessian import jk def setUpModule(): global mol @@ -48,7 +52,7 @@ def test_hessian_uhf(self): assert abs(ref - e2_gpu).max() < 1e-6 def test_partial_hess_elec(self): - mf = scf.UHF(mol) + mf = pyscf.scf.UHF(mol) mf.conv_tol = 1e-14 mf.kernel() hobj = mf.Hessian() @@ -73,6 +77,65 @@ def test_hessian_uhf_D3(self): e2_gpu = mf.Hessian().to_gpu().kernel() assert abs(ref - e2_gpu).max() < 1e-6 + def test_jk_mix(self): + mol1 = pyscf.M( + atom=''' + C -1.20806619, -0.34108413, -0.00755148 + C 1.28636081, -0.34128013, -0.00668648 + H 2.53407081, 1.81906387, -0.00736748 + H 1.28693681, 3.97963587, -0.00925948 + ''', + basis='''unc + #BASIS SET: + H S + 1.815041 1 + 0.591063 1 + H P + 2.305000 1 + #BASIS SET: + C S + 8.383976 1 + 3.577015 1 + 1.547118 1 + H P + 2.305000 1 + 1.098827 1 + 0.806750 1 + 0.282362 1 + H D + 1.81900 1 + 0.72760 1 + 0.29104 1 + H F + 0.970109 1 + C G + 0.625000 1 + C H + 0.4 1 + ''', + output = '/dev/null' + ) + nao = mol1.nao + mo_coeff = cupy.random.rand(2, nao, nao) + mocca = mo_coeff[0,:,:3] + moccb = mo_coeff[1,:,:2] + dm = cupy.empty([2,nao,nao]) + dm[0] = mocca.dot(mocca.T) + dm[1] = moccb.dot(moccb.T) + vj_mo, vk_mo = jk.get_jk(mol1, dm, mo_coeff, (mocca,moccb), hermi=1) + + mf = scf.UHF(mol1) + vj, vk = mf.get_jk(mol1, dm, hermi=1) + vj2 = cupy.empty([5*nao]) + vk2 = cupy.empty([5*nao]) + vj = vj[0] + vj[1] + vj2[:3*nao] = (mo_coeff[0].T @ vj @ mocca).reshape(1,-1) + vj2[3*nao:] = (mo_coeff[1].T @ vj @ moccb).reshape(1,-1) + vk2[:3*nao] = (mo_coeff[0].T @ vk[0] @ mocca).reshape(1,-1) + vk2[3*nao:] = (mo_coeff[1].T @ vk[1] @ moccb).reshape(1,-1) + assert cupy.linalg.norm(vj2 - vj_mo) < 1e-5 + assert cupy.linalg.norm(vk2 - vk_mo) < 1e-5 + if __name__ == "__main__": print("Full Tests for UHF Hessian") unittest.main() diff --git a/gpu4pyscf/scf/jk.py b/gpu4pyscf/scf/jk.py index 6dd7b5cf..a1f970f3 100644 --- a/gpu4pyscf/scf/jk.py +++ b/gpu4pyscf/scf/jk.py @@ -226,17 +226,15 @@ def get_jk(mol, dm, hermi=0, vhfopt=None, with_j=True, with_k=True, verbose=None vk = reduce_to_device(vk_dist, inplace=True) #:vk = cp.einsum('pi,npq,qj->nij', vhfopt.coeff, vk, vhfopt.coeff) vk = sandwich_dot(vk, vhfopt.coeff) - vk = vk.reshape(dm.shape) - + if with_j: vj = reduce_to_device(vj_dist, inplace=True) vj = transpose_sum(vj) #:vj = cp.einsum('pi,npq,qj->nij', vhfopt.coeff, vj, vhfopt.coeff) vj = sandwich_dot(vj, vhfopt.coeff) - vj = vj.reshape(dm.shape) h_shls = vhfopt.h_shls - assert len(h_shls) == 0 + if h_shls: cput1 = log.timer_debug1('get_jk pass 1 on gpu', *cput0) log.debug3('Integrals for %s functions on CPU', l_symb[LMAX+1]) @@ -276,6 +274,11 @@ def get_jk(mol, dm, hermi=0, vhfopt=None, with_j=True, with_k=True, verbose=None vk[i] += coeff.T.dot(cp.asarray(v)).dot(coeff) log.timer_debug1('get_jk pass 2 for h functions on cpu', *cput1) + if with_j: + vj = vj.reshape(dm.shape) + if with_k: + vk = vk.reshape(dm.shape) + log.timer('vj and vk', *cput0) return vj, vk diff --git a/gpu4pyscf/scf/tests/test_rhf.py b/gpu4pyscf/scf/tests/test_rhf.py index 530f6cc8..dd8f7b51 100644 --- a/gpu4pyscf/scf/tests/test_rhf.py +++ b/gpu4pyscf/scf/tests/test_rhf.py @@ -273,8 +273,8 @@ def test_chkfile(self): mf_copy = scf.RHF(mol) mf_copy.chkfile = ftmp.name dm_loaded = mf_copy.init_guess_by_chkfile() - assert np.allclose(dm_stored, dm_loaded, atol = 1e-14) # Since we reload the MO coefficients, the density matrix should be identical up to numerical noise. - + # Since we reload the MO coefficients, the density matrix should be identical up to numerical noise. + assert np.allclose(dm_stored, dm_loaded, atol = 1e-14) # TODO: #test analyze #test mulliken_pop From 2e093dfb380a64ec8cb8b7faa1e84b50f9ff34f2 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Fri, 20 Dec 2024 06:18:59 +0000 Subject: [PATCH 06/49] unit test --- gpu4pyscf/hessian/tests/test_rhf_hessian.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu4pyscf/hessian/tests/test_rhf_hessian.py b/gpu4pyscf/hessian/tests/test_rhf_hessian.py index e9aef60c..30a1c188 100644 --- a/gpu4pyscf/hessian/tests/test_rhf_hessian.py +++ b/gpu4pyscf/hessian/tests/test_rhf_hessian.py @@ -185,7 +185,7 @@ def test_jk_mix(self): mo_coeff = cupy.random.rand(nao, nao) mocc = mo_coeff[:,:3] dm = mocc.dot(mocc.T) * 2 - vj_mo, vk_mo = jk.get_jk(mol1, dm, mo_coeff, mocc) + vj_mo, vk_mo = jk.get_jk(mol1, dm, mo_coeff, mocc, hermi=1) mf = scf.RHF(mol1) vj, vk = mf.get_jk(mol1, dm, hermi=1) From bb400be8e08459035dfc179fe009200d26ddb13a Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Sun, 22 Dec 2024 04:25:58 +0000 Subject: [PATCH 07/49] optimize df hessian memory usage --- examples/dft_driver.py | 8 +- gpu4pyscf/df/hessian/jk.py | 89 +++++---- gpu4pyscf/df/hessian/rhf.py | 37 ++-- gpu4pyscf/df/hessian/rks.py | 26 ++- gpu4pyscf/df/hessian/uhf.py | 48 +++-- gpu4pyscf/df/hessian/uks.py | 40 ++-- gpu4pyscf/df/int3c2e.py | 16 +- gpu4pyscf/dft/numint.py | 199 +++++++++++++------- gpu4pyscf/hessian/jk.py | 53 +++--- gpu4pyscf/hessian/rhf.py | 22 ++- gpu4pyscf/hessian/rks.py | 6 +- gpu4pyscf/hessian/tests/test_rhf_hessian.py | 5 +- gpu4pyscf/hessian/tests/test_uhf_hessian.py | 5 +- gpu4pyscf/hessian/uhf.py | 4 +- gpu4pyscf/hessian/uks.py | 6 +- 15 files changed, 315 insertions(+), 249 deletions(-) diff --git a/examples/dft_driver.py b/examples/dft_driver.py index 13aaa0ce..8060e909 100644 --- a/examples/dft_driver.py +++ b/examples/dft_driver.py @@ -35,10 +35,10 @@ basis=bas, max_memory=32000) # set verbose >= 6 for debugging timer -mol.verbose = 6 +mol.verbose = 4 -mf_df = dft.RKS(mol, xc=args.xc).density_fit(auxbasis=args.auxbasis) -mf_df.verbose = 6 +mf_df = dft.RKS(mol, xc=args.xc)#.density_fit(auxbasis=args.auxbasis) +mf_df.verbose = 4 if args.solvent: mf_df = mf_df.PCM() @@ -52,7 +52,7 @@ mf_df.direct_scf_tol = 1e-14 mf_df.conv_tol = 1e-10 mf_df.chkfile = None -mf_df.conv_tol_cpscf = 1e-3 +mf_df.conv_tol_cpscf = 1e-6 e_tot = mf_df.kernel() scf_time = time.time() - start_time print(f'compute time for energy: {scf_time:.3f} s') diff --git a/gpu4pyscf/df/hessian/jk.py b/gpu4pyscf/df/hessian/jk.py index 5a299bcf..fb097180 100644 --- a/gpu4pyscf/df/hessian/jk.py +++ b/gpu4pyscf/df/hessian/jk.py @@ -195,24 +195,13 @@ def _get_int3c2e_ipip_slice(ip_type, intopt, cp_ij_id, aux_id, omega=None, strea if omega is None: omega = 0.0 if stream is None: stream = cupy.cuda.get_current_stream() - + fn = getattr(libgint, 'GINTfill_int3c2e_' + ip_type) - nao = intopt._sorted_mol.nao naux = intopt._sorted_auxmol.nao norb = nao + naux + 1 comp = 9 order = 2 - - lmax = intopt._sorted_mol._bas[:gto.ANG_OF].max() - aux_lmax = intopt._sorted_auxmol._bas[:gto.ANG_OF].max() - nroots = (lmax + aux_lmax + order)//2 + 1 - if nroots > NROOT_ON_GPU: - from pyscf.gto.moleintor import getints, make_cintopt - pmol = intopt._tot_mol - intor = pmol._add_suffix('int3c2e_' + ip_type) - opt = make_cintopt(pmol._atm, pmol._bas, pmol._env, intor) - nbins = 1 cp_kl_id = aux_id + len(intopt.log_qs) @@ -258,6 +247,11 @@ def _get_int3c2e_ipip_slice(ip_type, intopt, cp_ij_id, aux_id, omega=None, strea if err != 0: raise RuntimeError(f'GINT_fill_int3c2e general failed, err={err}') else: + from pyscf.gto.moleintor import getints, make_cintopt + pmol = intopt._tot_mol + intor = pmol._add_suffix('int3c2e_' + ip_type) + opt = make_cintopt(pmol._atm, pmol._bas, pmol._env, intor) + # TODO: sph2cart in CPU? ishl0, ishl1 = intopt.l_ctr_offsets[cpi], intopt.l_ctr_offsets[cpi+1] jshl0, jshl1 = intopt.l_ctr_offsets[cpj], intopt.l_ctr_offsets[cpj+1] @@ -291,16 +285,17 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo, dm0 = cupy.asarray(dm0) nao = dm0.shape[0] - hj_ipip1 = cupy.zeros([nao,9]) - hj_ipip2 = cupy.zeros([naux,9]) - hj_ip1ip2 = cupy.zeros([nao,naux,9]) - hj_ipvip1 = cupy.zeros([nao,nao,9]) + hj_ipip1 = cupy.zeros([9,nao]) + hj_ipip2 = cupy.zeros([9,naux]) + hj_ip1ip2 = cupy.zeros([9,nao,naux]) + hj_ipvip1 = cupy.zeros([9,nao,nao]) if with_k: - hk_ipip1 = cupy.zeros([nao,9]) - hk_ipip2 = cupy.zeros([naux,9]) - hk_ip1ip2 = cupy.zeros([nao,naux,9]) - hk_ipvip1 = cupy.zeros([nao,nao,9]) + hk_ipip1 = cupy.zeros([9,nao]) + hk_ipip2 = cupy.zeros([9,naux]) + hk_ip1ip2 = cupy.zeros([9,nao,naux]) + hk_ipvip1 = cupy.zeros([9,nao,nao]) + cupy.get_default_memory_pool().free_all_blocks() for aux_id, cp_ij_id in task_list: cpi = intopt.cp_idx[cp_ij_id] cpj = intopt.cp_jdx[cp_ij_id] @@ -309,22 +304,22 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo, k0, k1 = aux_ao_loc[aux_id], aux_ao_loc[aux_id+1] if with_k: - rhok_tmp = contract('por,ir->pio', rhok[k0:k1], orbo[i0:i1]) - rhok_tmp = contract('pio,jo->pij', rhok_tmp, orbo[j0:j1]) + rhok_tmp = contract('por,ir->poi', rhok[k0:k1], orbo[i0:i1]) + rhok_tmp = contract('poi,jo->pji', rhok_tmp, orbo[j0:j1]) # (20|0), (0|0)(0|00) int3c_blk = _get_int3c2e_ipip_slice('ipip1', intopt, cp_ij_id, aux_id, omega=omega) tmp = contract('xpji,ij->xpi', int3c_blk, dm0[i0:i1,j0:j1]) - hj_ipip1[i0:i1] += contract('xpi,p->ix', tmp, rhoj[k0:k1]) + hj_ipip1[:,i0:i1] += contract('xpi,p->xi', tmp, rhoj[k0:k1]) if with_k: - hk_ipip1[i0:i1] += contract('xpji,pij->ix', int3c_blk, rhok_tmp) + hk_ipip1[:,i0:i1] += contract('xpji,pji->xi', int3c_blk, rhok_tmp) # (11|0), (0|0)(0|00) without response of RI basis int3c_blk = _get_int3c2e_ipip_slice('ipvip1', intopt, cp_ij_id, aux_id, omega=omega) tmp = contract('xpji,ij->xpij', int3c_blk, dm0[i0:i1,j0:j1]) - hj_ipvip1[i0:i1,j0:j1] += contract('xpij,p->ijx', tmp, rhoj[k0:k1]) + hj_ipvip1[:,i0:i1,j0:j1] += contract('xpij,p->xij', tmp, rhoj[k0:k1]) if with_k: - hk_ipvip1[i0:i1,j0:j1] += contract('xpji,pij->ijx', int3c_blk, rhok_tmp) + hk_ipvip1[:,i0:i1,j0:j1] += contract('xpji,pji->xij', int3c_blk, rhok_tmp) if auxbasis_response < 1: continue @@ -332,9 +327,9 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo, # (10|1), (0|0)(0|00) int3c_blk = _get_int3c2e_ipip_slice('ip1ip2', intopt, cp_ij_id, aux_id, omega=omega) tmp = contract('xpji,ij->xpi', int3c_blk, dm0[i0:i1,j0:j1]) - hj_ip1ip2[i0:i1,k0:k1] += contract('xpi,p->ipx', tmp, rhoj[k0:k1]) + hj_ip1ip2[:,i0:i1,k0:k1] += contract('xpi,p->xip', tmp, rhoj[k0:k1]) if with_k: - hk_ip1ip2[i0:i1,k0:k1] += contract('xpji,pij->ipx', int3c_blk, rhok_tmp) + hk_ip1ip2[:,i0:i1,k0:k1] += contract('xpji,pji->xip', int3c_blk, rhok_tmp) if auxbasis_response < 2: continue @@ -342,44 +337,44 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo, # (00|2), (0|0)(0|00) int3c_blk = _get_int3c2e_ipip_slice('ipip2', intopt, cp_ij_id, aux_id, omega=omega) tmp = contract('xpji,ij->xp', int3c_blk, dm0[i0:i1,j0:j1]) - hj_ipip2[k0:k1] += contract('xp,p->px', tmp, rhoj[k0:k1]) + hj_ipip2[:,k0:k1] += contract('xp,p->xp', tmp, rhoj[k0:k1]) if with_k: - hk_ipip2[k0:k1] += contract('xpji,pij->px', int3c_blk, rhok_tmp) - + hk_ipip2[:,k0:k1] += contract('xpji,pji->xp', int3c_blk, rhok_tmp) + auxslices = intopt.auxmol.aoslice_by_atom() aoslices = intopt.mol.aoslice_by_atom() ao2atom = int3c2e.get_ao2atom(intopt, aoslices) aux2atom = int3c2e.get_aux2atom(intopt, auxslices) - hj_ipvip1 = hj_ipvip1.reshape([nao,nao,3,3]) - tmp = contract('ia,ijxy->ajxy', ao2atom, hj_ipvip1) + hj_ipvip1 = hj_ipvip1.reshape([3,3,nao,nao]) + tmp = contract('ia,xyij->ajxy', ao2atom, hj_ipvip1) hj = 2.0 * contract('jb,ajxy->abxy', ao2atom, tmp) - hj_ipip1 = hj_ipip1.reshape([nao,3,3]) - tmp = contract('ia,ixy->axy', ao2atom, hj_ipip1) + hj_ipip1 = hj_ipip1.reshape([3,3,nao]) + tmp = contract('ia,xyi->axy', ao2atom, hj_ipip1) hj[range(natm), range(natm)] += 2.0 * tmp hk = None if with_k: - hk_ipvip1 = hk_ipvip1.reshape([nao,nao,3,3]) - tmp = contract('ia,ijxy->ajxy', ao2atom, hk_ipvip1) + hk_ipvip1 = hk_ipvip1.reshape([3,3,nao,nao]) + tmp = contract('ia,xyij->ajxy', ao2atom, hk_ipvip1) hk = contract('jb,ajxy->abxy', ao2atom, tmp) - hk_ipip1 = hk_ipip1.reshape([nao,3,3]) - tmp = contract('ia,ixy->axy', ao2atom, hk_ipip1) + hk_ipip1 = hk_ipip1.reshape([3,3,nao]) + tmp = contract('ia,xyi->axy', ao2atom, hk_ipip1) hk[range(natm), range(natm)] += tmp if auxbasis_response > 0: - hj_ip1ip2 = hj_ip1ip2.reshape([nao,naux,3,3]) - tmp = contract('ia,ijxy->ajxy', ao2atom, hj_ip1ip2) + hj_ip1ip2 = hj_ip1ip2.reshape([3,3,nao,naux]) + tmp = contract('ia,xyij->ajxy', ao2atom, hj_ip1ip2) tmp = contract('jb,ajxy->abxy',aux2atom, tmp) tmp = tmp + tmp.transpose([1,0,3,2]) hj += tmp if auxbasis_response > 1: hj += tmp if with_k: - hk_ip1ip2 = hk_ip1ip2.reshape([nao,naux,3,3]) - tmp = contract('ia,ijxy->ajxy', ao2atom, hk_ip1ip2) + hk_ip1ip2 = hk_ip1ip2.reshape([3,3,nao,naux]) + tmp = contract('ia,xyij->ajxy', ao2atom, hk_ip1ip2) tmp = contract('jb,ajxy->abxy', aux2atom, tmp) tmp = 0.5 * (tmp + tmp.transpose([1,0,3,2])) hk += tmp @@ -387,12 +382,12 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo, hk += tmp if auxbasis_response > 1: - hj_ipip2 = hj_ipip2.reshape([naux,3,3]) - tmp = contract('ia,ixy->axy', aux2atom, hj_ipip2) + hj_ipip2 = hj_ipip2.reshape([3,3,naux]) + tmp = contract('ia,xyi->axy', aux2atom, hj_ipip2) hj[range(natm), range(natm)] += tmp if with_k: - hk_ipip2 = hk_ipip2.reshape([naux,3,3]) - tmp = contract('ia,ixy->axy', aux2atom, hk_ipip2) + hk_ipip2 = hk_ipip2.reshape([3,3,naux]) + tmp = contract('ia,xyi->axy', aux2atom, hk_ipip2) hk[range(natm), range(natm)] += .5 * tmp t0 = log.timer_debug1(f'int3c2e_ipip on Device {device_id}', *t0) return hj, hk diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py index d4fda5e3..bd9ef958 100644 --- a/gpu4pyscf/df/hessian/rhf.py +++ b/gpu4pyscf/df/hessian/rhf.py @@ -56,10 +56,9 @@ def _hk_ip1_ip1(rhok1_Pko, dm0, mocc_2): hk_ao_ao = cupy.zeros([nao,nao,3,3]) cupy.get_default_memory_pool().free_all_blocks() mem_avail = get_avail_mem() - blksize = int((mem_avail*0.2/(nao*nao*3*8)/ALIGNED))*ALIGNED + blksize = int((mem_avail*0.4/(nao*nao*3*8)/ALIGNED))*ALIGNED for k0, k1 in lib.prange(0,nnz,blksize): rhok1_Pko_kslice = cupy.asarray(rhok1_Pko[k0:k1]) - # (10|0)(0|10) without response of RI basis vk2_ip1_ip1 = contract('piox,pkoy->ikxy', rhok1_Pko_kslice, rhok1_Pko_kslice) hk_ao_ao += contract('ikxy,ik->ikxy', vk2_ip1_ip1, dm0) @@ -68,7 +67,7 @@ def _hk_ip1_ip1(rhok1_Pko, dm0, mocc_2): # (10|0)(0|01) without response of RI basis rhok1_Pkl_kslice = contract('piox,ko->pikx', rhok1_Pko_kslice, mocc_2) hk_ao_ao += contract('pikx,pkiy->ikxy', rhok1_Pkl_kslice, rhok1_Pkl_kslice) - rhok1_Pkl_kslice = None + rhok1_Pkl_kslice = rhok1_Pko_kslice = None return hk_ao_ao def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, @@ -397,21 +396,18 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): - mol = hessobj.mol - natm = mol.natm - nocc = int(cupy.count_nonzero(mo_occ > 0)) - nmo = len(mo_occ) - h1ao = cupy.empty((natm, 3, nmo, nocc)) - for ia, h1, vj1, vk1 in _gen_jk(hessobj, mo_coeff, mo_occ, chkfile, - atmlst, verbose, True): - h1 += vj1 - vk1 * .5 - h1ao[ia] = h1 - return h1ao - -def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, + vj, vk = _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile, atmlst, verbose, True) + # h1mo = h1 + vj - 0.5 * vk + h1mo = vk + h1mo *= -.5 + h1mo += vj + h1mo += rhf_grad.get_grad_hcore(hessobj.base.nuc_grad_method()) + return h1mo + +def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None, with_k=True, omega=None): ''' - A generator to produce the derivatives of Hcore, J, K matrices in MO bases + Derivatives of J, K matrices in MO bases ''' log = logger.new_logger(hessobj, verbose) t0 = log.init_timer() @@ -568,9 +564,7 @@ def _ao2mo(mat): vk1_int3c = vk1_int3c_ip1 + vk1_int3c_ip2 vk1_int3c_ip1 = vk1_int3c_ip2 = None - grad_hcore = rhf_grad.get_grad_hcore(hessobj.base.nuc_grad_method()) cupy.get_default_memory_pool().free_all_blocks() - vk1 = None for i0, ia in enumerate(atmlst): shl0, shl1, p0, p1 = aoslices[ia] vj1_ao = cupy.zeros([3,nao,nao]) @@ -582,11 +576,10 @@ def _ao2mo(mat): vk1_ao[:,p0:p1,:] -= vk1_buf[:,p0:p1,:] vk1_ao[:,:,p0:p1] -= vk1_buf[:,p0:p1,:].transpose(0,2,1) - h1 = grad_hcore[i0] - vj1 = vj1_int3c[ia] + _ao2mo(vj1_ao) + vj1_int3c[ia] += _ao2mo(vj1_ao) if with_k: - vk1 = vk1_int3c[ia] + _ao2mo(vk1_ao) - yield ia, h1, vj1, vk1 + vk1_int3c[ia] += _ao2mo(vk1_ao) + return vj1_int3c, vk1_int3c def _get_jk_mo(hessobj, mol, dms, mo_coeff, mocc, hermi=1, with_j=True, with_k=True, omega=None): diff --git a/gpu4pyscf/df/hessian/rks.py b/gpu4pyscf/df/hessian/rks.py index 2606e8e4..1d16ff16 100644 --- a/gpu4pyscf/df/hessian/rks.py +++ b/gpu4pyscf/df/hessian/rks.py @@ -23,6 +23,7 @@ import numpy import cupy from pyscf import lib +from gpu4pyscf.grad import rhf as rhf_grad from gpu4pyscf.hessian import rhf as rhf_hess from gpu4pyscf.hessian import rks as rks_hess from gpu4pyscf.df.hessian import rhf as df_rhf_hess @@ -90,18 +91,23 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin) mem_now = lib.current_memory()[0] max_memory = max(2000, mf.max_memory*.9-mem_now) - h1mo = rks_hess._get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory) + with_k = ni.libxc.is_hybrid_xc(mf.xc) - - for ia, h1, vj1, vk1 in df_rhf_hess._gen_jk(hessobj, mo_coeff, mo_occ, chkfile, - atmlst, verbose, with_k): - h1mo[ia] += h1 + vj1 - if with_k: - h1mo[ia] -= .5 * hyb * vk1 + vj1, vk1 = df_rhf_hess._get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile, + atmlst, verbose, with_k) + h1mo = vj1 + if with_k: + h1mo -= .5 * hyb * vk1 + vj1 = vk1 = None + if abs(omega) > 1e-10 and abs(alpha-hyb) > 1e-10: - for ia, h1, vj1_lr, vk1_lr in df_rhf_hess._gen_jk(hessobj, mo_coeff, mo_occ, chkfile, - atmlst, verbose, True, omega=omega): - h1mo[ia] -= .5 * (alpha - hyb) * vk1_lr + _, vk1_lr = df_rhf_hess._gen_jk(hessobj, mo_coeff, mo_occ, chkfile, + atmlst, verbose, True, omega=omega) + h1mo -= .5 * (alpha - hyb) * vk1_lr + vk1_lr = None + + h1mo += rhf_grad.get_grad_hcore(hessobj.base.nuc_grad_method()) + h1mo += rks_hess._get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory) return h1mo class Hessian(rks_hess.Hessian): diff --git a/gpu4pyscf/df/hessian/uhf.py b/gpu4pyscf/df/hessian/uhf.py index e1c8250b..1dc3f3a4 100644 --- a/gpu4pyscf/df/hessian/uhf.py +++ b/gpu4pyscf/df/hessian/uhf.py @@ -416,21 +416,21 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): if atmlst is None: atmlst = range(natm) - nocca, noccb = hessobj.base.nelec - nmo = len(mo_occ[0]) - h1aoa = cupy.empty((natm, 3, nmo, nocca)) - h1aob = cupy.empty((natm, 3, nmo, noccb)) - for ia, h1, vj1, vk1 in _gen_jk(hessobj, mo_coeff, mo_occ, chkfile, - atmlst, verbose, True): - h1a, h1b = h1 - vj1a, vj1b = vj1 - vk1a, vk1b = vk1 - - h1aoa[ia] = h1a + vj1a - vk1a - h1aob[ia] = h1b + vj1b - vk1b - return (h1aoa, h1aob) - -def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, + vj1, vk1 = _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile, atmlst, verbose, True) + vj1a, vj1b = vj1 + vk1a, vk1b = vk1 + h1moa = vj1a + h1moa-= vk1a + h1mob = vj1b + h1mob-= vk1b + vj1 = vk1 = vj1a = vj1b = vk1a = vk1b = None + + gobj = hessobj.base.nuc_grad_method() + h1moa += rhf_grad.get_grad_hcore(gobj, mo_coeff[0], mo_occ[0]) + h1mob += rhf_grad.get_grad_hcore(gobj, mo_coeff[1], mo_occ[1]) + return (h1moa, h1mob) + +def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None, with_k=True, omega=None): ''' A generator to produce the derivatives of Hcore, J, K matrices in MO bases @@ -632,12 +632,8 @@ def _ao2mo(mat, mocc, mo): tmp = contract('xij,jo->xio', mat, mocc) return contract('xik,ip->xpk', tmp, mo) - gobj = hessobj.base.nuc_grad_method() - grad_hcore_a = rhf_grad.get_grad_hcore(gobj, mo_coeff[0], mo_occ[0]) - grad_hcore_b = rhf_grad.get_grad_hcore(gobj, mo_coeff[1], mo_occ[1]) cupy.get_default_memory_pool().free_all_blocks() - - vk1a = vk1b = None + for i0, ia in enumerate(atmlst): shl0, shl1, p0, p1 = aoslices[ia] vj1_ao = cupy.zeros([3,nao,nao]) @@ -652,14 +648,12 @@ def _ao2mo(mat, mocc, mo): vk1b_ao[:,p0:p1,:] -= vk1b_buf[:,p0:p1,:] vk1b_ao[:,:,p0:p1] -= vk1b_buf[:,p0:p1,:].transpose(0,2,1) - h1a = grad_hcore_a[i0] - h1b = grad_hcore_b[i0] - vj1a = vj1a_int3c[ia] + _ao2mo(vj1_ao, mocca, mo_coeff[0]) - vj1b = vj1b_int3c[ia] + _ao2mo(vj1_ao, moccb, mo_coeff[1]) + vj1a_int3c[ia] += _ao2mo(vj1_ao, mocca, mo_coeff[0]) + vj1b_int3c[ia] += _ao2mo(vj1_ao, moccb, mo_coeff[1]) if with_k: - vk1a = vk1a_int3c[ia] + _ao2mo(vk1a_ao, mocca, mo_coeff[0]) - vk1b = vk1b_int3c[ia] + _ao2mo(vk1b_ao, moccb, mo_coeff[1]) - yield ia, (h1a, h1b), (vj1a, vj1b), (vk1a, vk1b) + vk1a_int3c[ia] += _ao2mo(vk1a_ao, mocca, mo_coeff[0]) + vk1b_int3c[ia] += _ao2mo(vk1b_ao, moccb, mo_coeff[1]) + return (vj1a_int3c, vj1b_int3c), (vk1a_int3c, vk1b_int3c) _get_jk_mo = df_rhf_hess._get_jk_mo diff --git a/gpu4pyscf/df/hessian/uks.py b/gpu4pyscf/df/hessian/uks.py index 6bf09803..5fd23a34 100644 --- a/gpu4pyscf/df/hessian/uks.py +++ b/gpu4pyscf/df/hessian/uks.py @@ -23,6 +23,7 @@ import numpy import cupy from pyscf import lib +from gpu4pyscf.grad import rhf as rhf_grad from gpu4pyscf.hessian import rhf as rhf_hess from gpu4pyscf.hessian import uhf as uhf_hess from gpu4pyscf.hessian import uks as uks_hess @@ -95,24 +96,35 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin) mem_now = lib.current_memory()[0] max_memory = max(2000, mf.max_memory*.9-mem_now) - h1moa, h1mob = uks_hess._get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory) + with_k = ni.libxc.is_hybrid_xc(mf.xc) - for ia, h1, vj1, vk1 in df_uhf_hess._gen_jk(hessobj, mo_coeff, mo_occ, chkfile, - atmlst, verbose, with_k): + vj1, vk1 = df_uhf_hess._get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile, + atmlst, verbose, with_k) + vj1a, vj1b = vj1 + h1moa = vj1a + h1mob = vj1b - h1moa[ia] += h1[0] + vj1[0] - h1mob[ia] += h1[1] + vj1[1] - if with_k: - vk1a, vk1b = vk1 - h1moa[ia] -= hyb * vk1a - h1mob[ia] -= hyb * vk1b + if with_k: + vk1a, vk1b = vk1 + h1moa -= hyb * vk1a + h1mob -= hyb * vk1b + vj1 = vk1 = vj1a = vj1b = vk1a = vk1b = None + if abs(omega) > 1e-10 and abs(alpha-hyb) > 1e-10: - for ia, h1, vj1_lr, vk1_lr in df_uhf_hess._gen_jk(hessobj, mo_coeff, mo_occ, chkfile, - atmlst, verbose, True, omega=omega): - vk1a, vk1b = vk1_lr - h1moa[ia] -= (alpha - hyb) * vk1a - h1mob[ia] -= (alpha - hyb) * vk1b + _, vk1_lr = df_uhf_hess._gen_jk(hessobj, mo_coeff, mo_occ, chkfile, + atmlst, verbose, True, omega=omega) + vk1a, vk1b = vk1_lr + h1moa -= (alpha - hyb) * vk1a + h1mob -= (alpha - hyb) * vk1b + + gobj = hessobj.base.nuc_grad_method() + h1moa += rhf_grad.get_grad_hcore(gobj, mo_coeff[0], mo_occ[0]) + h1mob += rhf_grad.get_grad_hcore(gobj, mo_coeff[1], mo_occ[1]) + + v1moa, v1mob = uks_hess._get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory) + h1moa += v1moa + h1mob += v1mob return h1moa, h1mob class Hessian(uks_hess.Hessian): diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py index f89fb07c..98350c59 100644 --- a/gpu4pyscf/df/int3c2e.py +++ b/gpu4pyscf/df/int3c2e.py @@ -1035,8 +1035,8 @@ def _int3c2e_ip2_wjk(intopt, task_list, dm0, orbo, with_k=True, omega=None, devi for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, task_list=task_list, ip_type='ip2', omega=omega): wj[k0:k1] += contract('xpji,ji->px', int3c_blk, dm0[j0:j1,i0:i1]) - tmp = contract('xpji,jo->piox', int3c_blk, orbo[j0:j1]) if with_k: + tmp = contract('xpji,jo->piox', int3c_blk, orbo[j0:j1]) wk[k0:k1] += contract('piox,ir->prox', tmp, orbo[i0:i1]) return wj, wk @@ -1229,15 +1229,6 @@ def get_int3c2e_general(mol, auxmol=None, ip_type='', auxbasis='weigend+etb', di intopt = VHFOpt(mol, auxmol, 'int2e') intopt.build(direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE) - lmax = mol._bas[:gto.ANG_OF].max() - aux_lmax = auxmol._bas[:gto.ANG_OF].max() - nroots = (lmax + aux_lmax + order)//2 + 1 - if nroots > NROOT_ON_GPU: - from pyscf.gto.moleintor import getints, make_cintopt - pmol = intopt._tot_mol - intor = pmol._add_suffix('int3c2e_' + ip_type) - opt = make_cintopt(pmol._atm, pmol._bas, pmol._env, intor) - nao_cart = intopt._sorted_mol.nao naux_cart = intopt._sorted_auxmol.nao norb_cart = nao_cart + naux_cart + 1 @@ -1287,6 +1278,11 @@ def get_int3c2e_general(mol, auxmol=None, ip_type='', auxbasis='weigend+etb', di if err != 0: raise RuntimeError("int3c2e failed\n") else: + from pyscf.gto.moleintor import getints, make_cintopt + pmol = intopt._tot_mol + intor = pmol._add_suffix('int3c2e_' + ip_type) + opt = make_cintopt(pmol._atm, pmol._bas, pmol._env, intor) + # TODO: sph2cart in CPU? ishl0, ishl1 = intopt.l_ctr_offsets[cpi], intopt.l_ctr_offsets[cpi+1] jshl0, jshl1 = intopt.l_ctr_offsets[cpj], intopt.l_ctr_offsets[cpj+1] diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py index 22f6ff97..aed7a4ca 100644 --- a/gpu4pyscf/dft/numint.py +++ b/gpu4pyscf/dft/numint.py @@ -32,11 +32,11 @@ LMAX_ON_GPU = 6 BAS_ALIGNED = 1 -GRID_BLKSIZE = 32 MIN_BLK_SIZE = getattr(__config__, 'min_grid_blksize', 64*64) ALIGNED = getattr(__config__, 'grid_aligned', 16*16) AO_ALIGNMENT = getattr(__config__, 'ao_aligned', 16) AO_THRESHOLD = 1e-10 +GB = 1024*1024*1024 # Should we release the cupy cache? FREE_CUPY_CACHE = False @@ -273,26 +273,23 @@ def eval_rho4(mol, ao, mo0, mo1, non0tab=None, xctype='LDA', hermi=0, na = mo1.shape[0] if xctype == 'LDA' or xctype == 'HF': c0 = mo0.T.dot(ao) - t1 = log.timer_debug2('eval occ_coeff', *t0) - c_0 = contract('aio,ig->aog', mo1, ao) rho = cupy.empty([na,ngrids]) for i in range(na): - rho[i] = _contract_rho(c0, c_0[i]) + c_0 = contract('io,ig->og', mo1[i], ao) + rho[i] = _contract_rho(c0, c_0) elif xctype in ('GGA', 'NLC'): c0 = contract('nig,io->nog', ao, mo0) - t1 = log.timer_debug2('eval occ_coeff', *t0) - c_0 = contract('nig,aio->anog', ao, mo1) - t1 = log.timer_debug2('ao * cpos', *t1) rho = cupy.empty([na, 4, ngrids]) for i in range(na): - _contract_rho_gga(c0, c_0[i], rho=rho[i]) + c_0 = contract('nig,io->nog', ao, mo1[i]) + _contract_rho_gga(c0, c_0, rho=rho[i]) else: # meta-GGA assert not with_lapl rho = cupy.empty((na,5,ngrids)) c0 = contract('nig,io->nog', ao, mo0) - c_0 = contract('nig,aio->anog', ao, mo1) for i in range(na): - _contract_rho_mgga(c0, c_0[i], rho=rho[i]) + c_0 = contract('nig,io->nog', ao, mo1[i]) + _contract_rho_mgga(c0, c_0, rho=rho[i]) if hermi: # corresponding to the density of ao * mo1[i].dot(mo0.T) * ao rho *= 2. @@ -1025,7 +1022,7 @@ def _nr_rks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff, p0 = p1 = grid_start t1 = t0 = log.init_timer() for ao, mask, weights, coords in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, - max_memory=None, + max_memory=None, blksize=None, grid_range=(grid_start, grid_end)): p0, p1 = p1, p1+len(weights) # precompute molecular orbitals @@ -1133,6 +1130,102 @@ def nr_rks_fxc_st(ni, mol, grids, xc_code, dm0=None, dms_alpha=None, return nr_rks_fxc(ni, mol, grids, xc_code, dm0, dms_alpha, hermi=0, fxc=fxc, max_memory=max_memory, verbose=verbose) +def _nr_uks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff, + verbose=None, hermi=1, device_id=0): + with cupy.cuda.Device(device_id), _streams[device_id]: + if dms is not None: + dma, dmb = dms + dma = cupy.asarray(dma) + dmb = cupy.asarray(dmb) + if mo1 is not None: + mo1a, mo1b = mo1 + mo1a = cupy.asarray(mo1a) + mo1b = cupy.asarray(mo1b) + if occ_coeff is not None: + occ_coeff_a, occ_coeff_b = occ_coeff + occ_coeff_a = cupy.asarray(occ_coeff_a) + occ_coeff_b = cupy.asarray(occ_coeff_b) + + if fxc is not None: fxc = cupy.asarray(fxc) + assert isinstance(verbose, int) + log = logger.new_logger(mol, verbose) + xctype = ni._xc_type(xc_code) + opt = getattr(ni, 'gdftopt', None) + + _sorted_mol = opt.mol + nao = mol.nao + nset = len(dma) + vmata = cupy.zeros((nset, nao, nao)) + vmatb = cupy.zeros((nset, nao, nao)) + + if xctype == 'LDA': + ao_deriv = 0 + else: + ao_deriv = 1 + + ngrids_glob = grids.coords.shape[0] + ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices + grid_start = device_id * ngrids_per_device + grid_end = (device_id + 1) * ngrids_per_device + + p0 = p1 = grid_start + t1 = t0 = log.init_timer() + for ao, mask, weights, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, + max_memory=None, + grid_range=(grid_start, grid_end)): + + t0 = log.init_timer() + p0, p1 = p1, p1+len(weights) + # precompute fxc_w + fxc_w = fxc[:,:,:,:,p0:p1] * weights + + # precompute molecular orbitals + if occ_coeff is not None: + occ_coeff_a_mask = occ_coeff_a[mask] + occ_coeff_b_mask = occ_coeff_b[mask] + rho1a = eval_rho4(_sorted_mol, ao, occ_coeff_a_mask, mo1a[:,mask], + xctype=xctype, hermi=hermi).reshape(nset,-1,p1-p0) + rho1b = eval_rho4(_sorted_mol, ao, occ_coeff_b_mask, mo1b[:,mask], + xctype=xctype, hermi=hermi).reshape(nset,-1,p1-p0) + else: # slow version + rho1a = [] + rho1b = [] + for i in range(nset): + rho_tmp = eval_rho(_sorted_mol, ao, dma[i,mask[:,None],mask], + xctype=xctype, hermi=hermi) + rho1a.append(rho_tmp.reshape(-1,p1-p0)) + rho_tmp = eval_rho(_sorted_mol, ao, dmb[i,mask[:,None],mask], + xctype=xctype, hermi=hermi) + rho1b.append(rho_tmp.reshape(-1,p1-p0)) + t0 = log.timer_debug1('rho', *t0) + + for i in range(nset): + wv_a = contract('xg,xyg->yg', rho1a[i], fxc_w[0,:,0]) + wv_a+= contract('xg,xyg->yg', rho1b[i], fxc_w[1,:,0]) + wv_b = contract('xg,xyg->yg', rho1a[i], fxc_w[0,:,1]) + wv_b+= contract('xg,xyg->yg', rho1b[i], fxc_w[1,:,1]) + if xctype == 'LDA': + va = ao.dot(_scale_ao(ao, wv_a[0]).T) + vb = ao.dot(_scale_ao(ao, wv_b[0]).T) + elif xctype == 'GGA': + wv_a[0] *= .5 # for transpose_sum at the end + wv_b[0] *= .5 + va = ao[0].dot(_scale_ao(ao, wv_a).T) + vb = ao[0].dot(_scale_ao(ao, wv_b).T) + elif xctype == 'NLC': + raise NotImplementedError('NLC') + else: + wv_a[[0,4]] *= .5 # for transpose_sum at the end + wv_b[[0,4]] *= .5 + va = ao[0].dot(_scale_ao(ao[:4], wv_a[:4]).T) + vb = ao[0].dot(_scale_ao(ao[:4], wv_b[:4]).T) + va += _tau_dot(ao, ao, wv_a[4]) + vb += _tau_dot(ao, ao, wv_b[4]) + add_sparse(vmata[i], va, mask) + add_sparse(vmatb[i], vb, mask) + t1 = log.timer_debug2('integration', *t1) + t0 = log.timer_debug1('vxc', *t0) + return vmata, vmatb def nr_uks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=0, rho0=None, vxc=None, fxc=None, max_memory=2000, verbose=None): @@ -1144,13 +1237,13 @@ def nr_uks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= if opt is None or mol not in [opt.mol, opt._sorted_mol]: ni.build(mol, grids.coords) opt = ni.gdftopt - mol = None - _sorted_mol = opt._sorted_mol + nao, nao0 = opt.coeff.shape dma, dmb = dms dm_shape = dma.shape # AO basis -> gdftopt AO basis with_mocc = hasattr(dms, 'mo1') + mo1 = occ_coeff = None if with_mocc: mo1a, mo1b = dms.mo1 occ_coeffa, occ_coeffb = dms.occ_coeff @@ -1158,70 +1251,32 @@ def nr_uks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= mo1b = opt.sort_orbitals(mo1b, axis=[1]) occ_coeff_a = opt.sort_orbitals(occ_coeffa, axis=[0]) occ_coeff_b = opt.sort_orbitals(occ_coeffb, axis=[0]) - + occ_coeff = (occ_coeff_a, occ_coeff_b) + mo1 = (mo1a, mo1b) dma = cupy.asarray(dma).reshape(-1,nao0,nao0) dmb = cupy.asarray(dmb).reshape(-1,nao0,nao0) dma = opt.sort_orbitals(dma, axis=[1,2]) dmb = opt.sort_orbitals(dmb, axis=[1,2]) - nset = len(dma) - vmata = cupy.zeros((nset, nao, nao)) - vmatb = cupy.zeros((nset, nao, nao)) - - if xctype == 'LDA': - ao_deriv = 0 - nvar = 1 - elif xctype == 'GGA': - ao_deriv = 1 - nvar = 4 - else: - ao_deriv = 1 - nvar = 5 - p0 = p1 = 0 - for ao, mask, weights, coords in ni.block_loop( - _sorted_mol, grids, nao, ao_deriv, max_memory=max_memory): - t0 = log.init_timer() - p0, p1 = p1, p1+len(weights) - # precompute fxc_w - fxc_w = fxc[:,:,:,:,p0:p1] * weights - - # precompute molecular orbitals - if with_mocc: - occ_coeff_a_mask = occ_coeff_a[mask] - occ_coeff_b_mask = occ_coeff_b[mask] - rho1a = eval_rho4(_sorted_mol, ao, occ_coeff_a_mask, mo1a[:,mask], - xctype=xctype, hermi=hermi) - rho1b = eval_rho4(_sorted_mol, ao, occ_coeff_b_mask, mo1b[:,mask], - xctype=xctype, hermi=hermi) - rho1 = cupy.stack([rho1a, rho1b]).reshape(2, nset, nvar, p1-p0) - else: # slow version - rho1 = cupy.empty((2, nset, nvar, p1-p0)) - for i in range(nset): - rho1[0,i] = eval_rho(_sorted_mol, ao, dma[i,mask[:,None],mask], - xctype=xctype, hermi=hermi) - rho1[1,i] = eval_rho(_sorted_mol, ao, dmb[i,mask[:,None],mask], - xctype=xctype, hermi=hermi) - t0 = log.timer_debug1('rho', *t0) + futures = [] + cupy.cuda.get_current_stream().synchronize() + with ThreadPoolExecutor(max_workers=_num_devices) as executor: + for device_id in range(_num_devices): + future = executor.submit( + _nr_uks_fxc_task, + ni, mol, grids, xc_code, fxc, (dma, dmb), mo1, occ_coeff, + verbose=log.verbose, hermi=hermi, device_id=device_id) + futures.append(future) + vmata_dist = [] + vmatb_dist = [] + for future in futures: + vmata, vmatb = future.result() + vmata_dist.append(vmata) + vmatb_dist.append(vmatb) + + vmata = reduce_to_device(vmata_dist, inplace=True) + vmatb = reduce_to_device(vmatb_dist, inplace=True) - for i in range(nset): - wv = contract('axg,axbyg->byg', rho1[:,i], fxc_w) - if xctype == 'LDA': - va = ao.dot(_scale_ao(ao, wv[0,0]).T) - vb = ao.dot(_scale_ao(ao, wv[1,0]).T) - elif xctype == 'GGA': - wv[:,0] *= .5 # for transpose_sum at the end - va = ao[0].dot(_scale_ao(ao, wv[0]).T) - vb = ao[0].dot(_scale_ao(ao, wv[1]).T) - elif xctype == 'NLC': - raise NotImplementedError('NLC') - else: - wv[:,[0,4]] *= .5 # for transpose_sum at the end - va = ao[0].dot(_scale_ao(ao[:4], wv[0,:4]).T) - vb = ao[0].dot(_scale_ao(ao[:4], wv[1,:4]).T) - va += _tau_dot(ao, ao, wv[0,4]) - vb += _tau_dot(ao, ao, wv[1,4]) - add_sparse(vmata[i], va, mask) - add_sparse(vmatb[i], vb, mask) vmata = opt.unsort_orbitals(vmata, axis=[1,2]) vmatb = opt.unsort_orbitals(vmatb, axis=[1,2]) if xctype != 'LDA': @@ -1578,7 +1633,7 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000, comp = (deriv+1)*(deriv+2)*(deriv+3)//6 if blksize is None: - #cupy.get_default_memory_pool().free_all_blocks() + # By default, a memory space of [comp,nao,blksize] is reserved mem_avail = get_avail_mem() blksize = int((mem_avail*.2/8/((comp+1)*nao + extra))/ ALIGNED) * ALIGNED blksize = min(blksize, MIN_BLK_SIZE) diff --git a/gpu4pyscf/hessian/jk.py b/gpu4pyscf/hessian/jk.py index 5a1f75ab..f4f102c6 100644 --- a/gpu4pyscf/hessian/jk.py +++ b/gpu4pyscf/hessian/jk.py @@ -41,7 +41,7 @@ def _ao2mo(v_ao, mocc, mo_coeff): v_ao = contract('nij,jo->nio', v_ao, mocc) return contract('nio,ip->npo', v_ao, mo_coeff) -def _jk_task(mol, dms, mo_coeff, mocc, vhfopt, task_list, hermi=0, +def _jk_task(mol, dms, mo_coeff, mo_occ, vhfopt, task_list, hermi=0, device_id=0, with_j=True, with_k=True, verbose=0): nao, _ = vhfopt.coeff.shape uniq_l_ctr = vhfopt.uniq_l_ctr @@ -56,6 +56,12 @@ def _jk_task(mol, dms, mo_coeff, mocc, vhfopt, task_list, hermi=0, log = logger.new_logger(mol, verbose) cput0 = log.init_timer() dms = cp.asarray(dms) + coeff = cp.asarray(vhfopt.coeff) + + # Transform MO coeffcients and DM into sorted, cartesian AO basis + #:dms = cp.einsum('pi,nij,qj->npq', vhfopt.coeff, dms, vhfopt.coeff) + dms = sandwich_dot(dms, coeff.T) + dms = cp.asarray(dms, order='C') n_dm = dms.shape[0] tile_q_ptr = ctypes.cast(vhfopt.tile_q_cond.data.ptr, ctypes.c_void_p) @@ -126,11 +132,16 @@ def _jk_task(mol, dms, mo_coeff, mocc, vhfopt, task_list, hermi=0, if with_k: vk = transpose_sum(vk) - if isinstance(mocc, tuple): + assert mo_coeff.ndim == 2 or mo_coeff.ndim == 3 + if mo_coeff.ndim == 3: # Unrestricted case - mocca, moccb = mocc - moa, mob = mo_coeff + mo_coeff = cp.asarray(mo_coeff) + mo_occ = cp.asarray(mo_occ) + moa = coeff.dot(mo_coeff[0]) + mob = coeff.dot(mo_coeff[1]) nmoa, nmob = moa.shape[1], mob.shape[1] + mocca = moa[:,mo_occ[0] > 0.5] + moccb = mob[:,mo_occ[1] > 0.5] nocca, noccb = mocca.shape[1], moccb.shape[1] n_dm_2 = n_dm//2 if with_j: @@ -144,6 +155,10 @@ def _jk_task(mol, dms, mo_coeff, mocc, vhfopt, task_list, hermi=0, vk[:,:nmoa*nocca] = _ao2mo(vka, mocca, moa).reshape(n_dm_2,-1) vk[:,nmoa*nocca:] = _ao2mo(vkb, moccb, mob).reshape(n_dm_2,-1) else: + mo_coeff = cp.asarray(mo_coeff) + mo_occ = cp.asarray(mo_occ) + mo_coeff = coeff.dot(mo_coeff) + mocc = mo_coeff[:,mo_occ>0.5] if with_j: vj = _ao2mo(vj, mocc, mo_coeff).reshape(n_dm,-1) if with_k: @@ -151,7 +166,7 @@ def _jk_task(mol, dms, mo_coeff, mocc, vhfopt, task_list, hermi=0, return vj, vk, kern_counts, timing_counter -def get_jk(mol, dm, mo_coeff, mocc, hermi=0, vhfopt=None, +def get_jk(mol, dm, mo_coeff, mo_occ, hermi=0, vhfopt=None, with_j=True, with_k=True, verbose=None): '''Compute J, K matrices in MO ''' @@ -166,18 +181,6 @@ def get_jk(mol, dm, mo_coeff, mocc, hermi=0, vhfopt=None, dm = cp.asarray(dm, order='C') dms = dm.reshape(-1,nao_orig,nao_orig) - - # Transform MO coeffcients and DM into sorted, cartesian AO basis - #:dms = cp.einsum('pi,nij,qj->npq', vhfopt.coeff, dms, vhfopt.coeff) - dms = sandwich_dot(dms, vhfopt.coeff.T) - dms = cp.asarray(dms, order='C') - coeff = vhfopt.coeff - if isinstance(mocc, tuple): - mocc = (coeff.dot(mocc[0]), coeff.dot(mocc[1])) - mo_coeff = (coeff.dot(mo_coeff[0]), coeff.dot(mo_coeff[1])) - else: - mocc = coeff.dot(mocc) - mo_coeff = coeff.dot(mo_coeff) n_dm = dms.shape[0] assert with_j or with_k @@ -201,7 +204,7 @@ def get_jk(mol, dm, mo_coeff, mocc, hermi=0, vhfopt=None, for device_id in range(_num_devices): future = executor.submit( _jk_task, - mol, dms, mo_coeff, mocc, vhfopt, task_list[device_id], hermi=hermi, + mol, dms, mo_coeff, mo_occ, vhfopt, task_list[device_id], hermi=hermi, with_j=with_j, with_k=with_k, verbose=verbose, device_id=device_id) futures.append(future) @@ -244,6 +247,10 @@ def get_jk(mol, dm, mo_coeff, mocc, hermi=0, vhfopt=None, scripts.append('jk->s2il') else: scripts.append('jk->s1il') + # Transform MO coeffcients and DM into sorted, cartesian AO basis + #:dms = cp.einsum('pi,nij,qj->npq', vhfopt.coeff, dms, vhfopt.coeff) + dms = sandwich_dot(dms, vhfopt.coeff.T) + dms = cp.asarray(dms, order='C') shls_excludes = [0, h_shls[0]] * 4 vs_h = _vhf.direct_mapdm('int2e_cart', 's8', scripts, dms.get(), 1, mol._atm, mol._bas, mol._env, @@ -263,9 +270,11 @@ def get_jk(mol, dm, mo_coeff, mocc, hermi=0, vhfopt=None, if with_k: vk1[:,idy,idx] = vk1[:,idx,idy] - if isinstance(mocc, tuple): - mocca, moccb = mocc - moa, mob = mo_coeff + if mo_coeff.ndim == 3: + moa = vhfopt.coeff.dot(mo_coeff[0]) + mob = vhfopt.coeff.dot(mo_coeff[1]) + mocca = moa[:,mo_occ[0]>0.5] + moccb = mob[:,mo_occ[1]>0.5] nmoa = moa.shape[1] nocca = mocca.shape[1] n_dm_2 = n_dm//2 @@ -278,6 +287,8 @@ def get_jk(mol, dm, mo_coeff, mocc, hermi=0, vhfopt=None, vk[:,:nmoa*nocca] += _ao2mo(vka, mocca, moa).reshape(n_dm_2,-1) vk[:,nmoa*nocca:] += _ao2mo(vkb, moccb, mob).reshape(n_dm_2,-1) else: + mo_coeff = vhfopt.coeff.dot(mo_coeff) + mocc = mo_coeff[:,mo_occ>0.5] if with_j: vj += _ao2mo(cp.asarray(vj1), mocc, mo_coeff).reshape(n_dm,-1) if with_k: diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py index bec9d0e0..0b2f3f99 100644 --- a/gpu4pyscf/hessian/rhf.py +++ b/gpu4pyscf/hessian/rhf.py @@ -180,6 +180,11 @@ def _ejk_ip2_task(mol, dms, vhfopt, task_list, j_factor=1.0, k_factor=1.0, log = logger.new_logger(mol, verbose) cput0 = log.init_timer() dms = cp.asarray(dms) + coeff = cp.asarray(vhfopt.coeff) + + #:dms = cp.einsum('pi,nij,qj->npq', vhfopt.coeff, dms, vhfopt.coeff) + dms = sandwich_dot(dms, coeff.T) + dms = cp.asarray(dms, order='C') tile_q_ptr = ctypes.cast(vhfopt.tile_q_cond.data.ptr, ctypes.c_void_p) q_ptr = ctypes.cast(vhfopt.q_cond.data.ptr, ctypes.c_void_p) @@ -275,9 +280,6 @@ def _partial_ejk_ip2(mol, dm, vhfopt=None, j_factor=1., k_factor=1., verbose=Non dm = cp.asarray(dm, order='C') dms = dm.reshape(-1,nao_orig,nao_orig) - #:dms = cp.einsum('pi,nij,qj->npq', vhfopt.coeff, dms, vhfopt.coeff) - dms = sandwich_dot(dms, vhfopt.coeff.T) - dms = cp.asarray(dms, order='C') init_constant(mol) @@ -656,10 +658,11 @@ def fvind_vo(mo1): avail_mem = get_avail_mem() # *4 for input dm, vj, vk, and vxc - blksize = int(min(avail_mem*.3 / (8*3*nao*nao*4), - avail_mem*.6 / (8*nmo*nocc*natm*3*5))) + blksize = int(min(avail_mem*.3 / (8*3*nao*nocc*4), # in MO + avail_mem*.6 / (8*nmo*nocc*3*5), + avail_mem*.3 / (8*nmo*nmo*3*3))) # vj, vk, dm if blksize < ALIGNED**2: - raise RuntimeError('GPU memory insufficient') + raise RuntimeError('GPU memory insufficient for solving CPHF equations') blksize = (blksize // ALIGNED**2) * ALIGNED**2 log.debug(f'GPU memory {avail_mem/GB:.1f} GB available') @@ -884,7 +887,7 @@ def get_hcore(iatm, jatm): def hcore_generator(hessobj, mol=None): raise NotImplementedError -def _get_jk_mo(hessobj, mol, dms, mo_coeff, mocc, +def _get_jk_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1, with_j=True, with_k=True, omega=None): ''' Compute J/K matrices in MO for multiple DMs ''' @@ -894,12 +897,11 @@ def _get_jk_mo(hessobj, mol, dms, mo_coeff, mocc, with mol.with_range_coulomb(omega): vhfopt = mf._opt_gpu[omega] = _VHFOpt(mol, mf.direct_scf_tol).build() with mol.with_range_coulomb(omega): - vj, vk = jk.get_jk(mol, dms, mo_coeff, mocc, hermi, vhfopt, with_j, with_k) + vj, vk = jk.get_jk(mol, dms, mo_coeff, mo_occ, hermi, vhfopt, with_j, with_k) return vj, vk def _get_veff_resp_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1, omega=None): - mocc = mo_coeff[:,mo_occ>0] - vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, mocc, + vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, hermi=hermi, with_j=True, with_k=True, omega=omega) return vj - 0.5 * vk diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py index d64b35bd..261fa631 100644 --- a/gpu4pyscf/hessian/rks.py +++ b/gpu4pyscf/hessian/rks.py @@ -731,16 +731,16 @@ def get_veff_resp_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1, omega=None): v1 = jk._ao2mo(v1, mocc, mo_coeff).reshape(-1,nmo*nocc) if hybrid: - vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, mocc, hermi=1) + vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, hermi=1) vk *= hyb if omega > 1e-10: # For range separated Coulomb - _, vk_lr = hessobj.get_jk_mo(mol, dms, mo_coeff, mocc, hermi, + _, vk_lr = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, hermi, with_j=False, omega=omega) vk_lr *= (alpha-hyb) vk += vk_lr v1 += vj - .5 * vk else: - v1 += hessobj.get_jk_mo(mol, dms, mo_coeff, mocc, hermi=1, + v1 += hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, hermi=1, with_k=False)[0] return v1 diff --git a/gpu4pyscf/hessian/tests/test_rhf_hessian.py b/gpu4pyscf/hessian/tests/test_rhf_hessian.py index 30a1c188..ac657199 100644 --- a/gpu4pyscf/hessian/tests/test_rhf_hessian.py +++ b/gpu4pyscf/hessian/tests/test_rhf_hessian.py @@ -106,7 +106,6 @@ def test_get_jk(self): nao = mol.nao mo_coeff = np.random.rand(nao, nao) dm = mo_coeff.dot(mo_coeff.T) * 2 - vj, vk = rhf_gpu._get_jk_ip1(mol, dm) assert abs(lib.fp(vj.get()) - 87674.69061160382) < 1e-7 assert abs(lib.fp(vk.get()) - -9.317650662101629) < 1e-7 @@ -183,9 +182,11 @@ def test_jk_mix(self): ) nao = mol1.nao mo_coeff = cupy.random.rand(nao, nao) + mo_occ = cupy.zeros([nao]) + mo_occ[:3] = 2 mocc = mo_coeff[:,:3] dm = mocc.dot(mocc.T) * 2 - vj_mo, vk_mo = jk.get_jk(mol1, dm, mo_coeff, mocc, hermi=1) + vj_mo, vk_mo = jk.get_jk(mol1, dm, mo_coeff, mo_occ, hermi=1) mf = scf.RHF(mol1) vj, vk = mf.get_jk(mol1, dm, hermi=1) diff --git a/gpu4pyscf/hessian/tests/test_uhf_hessian.py b/gpu4pyscf/hessian/tests/test_uhf_hessian.py index 1e10306c..a7d5c983 100644 --- a/gpu4pyscf/hessian/tests/test_uhf_hessian.py +++ b/gpu4pyscf/hessian/tests/test_uhf_hessian.py @@ -119,10 +119,13 @@ def test_jk_mix(self): mo_coeff = cupy.random.rand(2, nao, nao) mocca = mo_coeff[0,:,:3] moccb = mo_coeff[1,:,:2] + mo_occ = cupy.zeros([2,nao]) + mo_occ[0,:3] = 1 + mo_occ[1,:2] = 1 dm = cupy.empty([2,nao,nao]) dm[0] = mocca.dot(mocca.T) dm[1] = moccb.dot(moccb.T) - vj_mo, vk_mo = jk.get_jk(mol1, dm, mo_coeff, (mocca,moccb), hermi=1) + vj_mo, vk_mo = jk.get_jk(mol1, dm, mo_coeff, mo_occ, hermi=1) mf = scf.UHF(mol1) vj, vk = mf.get_jk(mol1, dm, hermi=1) diff --git a/gpu4pyscf/hessian/uhf.py b/gpu4pyscf/hessian/uhf.py index 73bec288..c7e836b8 100644 --- a/gpu4pyscf/hessian/uhf.py +++ b/gpu4pyscf/hessian/uhf.py @@ -404,9 +404,7 @@ def fx(mo1): return fx def _get_veff_resp_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1): - mocca = mo_coeff[0][:,mo_occ[0]>0] - moccb = mo_coeff[1][:,mo_occ[1]>0] - vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, (mocca, moccb), + vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, hermi=hermi, with_j=True, with_k=True) return vj - vk diff --git a/gpu4pyscf/hessian/uks.py b/gpu4pyscf/hessian/uks.py index 5d565b81..db4bf59e 100644 --- a/gpu4pyscf/hessian/uks.py +++ b/gpu4pyscf/hessian/uks.py @@ -880,16 +880,16 @@ def get_veff_resp_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1): v1vo[:,:nmoa*nocca] = jk._ao2mo(v1[0], mocca, mo_coeff[0]).reshape(-1,nmoa*nocca) v1vo[:,nmoa*nocca:] = jk._ao2mo(v1[1], moccb, mo_coeff[1]).reshape(-1,nmob*noccb) if hybrid: - vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, (mocca, moccb), hermi=1) + vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, hermi=1) vk *= hyb if omega > 1e-10: - _, vk_lr = hessobj.get_jk_mo(mol, dms, mo_coeff, (mocca, moccb), + _, vk_lr = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, hermi, with_j=False, omega=omega) vk_lr *= (alpha-hyb) vk += vk_lr v1vo += vj - vk else: - v1vo += hessobj.get_jk_mo(mol, dms, mo_coeff, (mocca, moccb), + v1vo += hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, hermi=1, with_k=False)[0] return v1vo From 81d9a8655a10230f5e44d222f79b0312789620cf Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Sun, 22 Dec 2024 06:22:43 +0000 Subject: [PATCH 08/49] more accurate memory estimate for hessian --- gpu4pyscf/df/hessian/rhf.py | 3 +++ gpu4pyscf/df/hessian/rks.py | 4 +++- gpu4pyscf/df/hessian/uhf.py | 5 ++++- gpu4pyscf/df/hessian/uks.py | 2 ++ gpu4pyscf/hessian/rhf.py | 6 ++++-- gpu4pyscf/hessian/rks.py | 9 ++++++--- gpu4pyscf/hessian/uhf.py | 6 ++++-- gpu4pyscf/hessian/uks.py | 6 ++++-- 8 files changed, 30 insertions(+), 11 deletions(-) diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py index bd9ef958..de6f1ebb 100644 --- a/gpu4pyscf/df/hessian/rhf.py +++ b/gpu4pyscf/df/hessian/rhf.py @@ -396,6 +396,9 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): + mol = hessobj.mol + natm = mol.natm + assert atmlst is None or atmlst ==range(natm) vj, vk = _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile, atmlst, verbose, True) # h1mo = h1 + vj - 0.5 * vk h1mo = vk diff --git a/gpu4pyscf/df/hessian/rks.py b/gpu4pyscf/df/hessian/rks.py index 1d16ff16..31c1d506 100644 --- a/gpu4pyscf/df/hessian/rks.py +++ b/gpu4pyscf/df/hessian/rks.py @@ -85,6 +85,8 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): mol = hessobj.mol + natm = mol.natm + assert atmlst is None or atmlst ==range(natm) mf = hessobj.base ni = mf._numint ni.libxc.test_deriv_order(mf.xc, 2, raise_error=True) @@ -101,7 +103,7 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): vj1 = vk1 = None if abs(omega) > 1e-10 and abs(alpha-hyb) > 1e-10: - _, vk1_lr = df_rhf_hess._gen_jk(hessobj, mo_coeff, mo_occ, chkfile, + _, vk1_lr = df_rhf_hess._get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile, atmlst, verbose, True, omega=omega) h1mo -= .5 * (alpha - hyb) * vk1_lr vk1_lr = None diff --git a/gpu4pyscf/df/hessian/uhf.py b/gpu4pyscf/df/hessian/uhf.py index 1dc3f3a4..acd67380 100644 --- a/gpu4pyscf/df/hessian/uhf.py +++ b/gpu4pyscf/df/hessian/uhf.py @@ -413,6 +413,9 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): mol = hessobj.mol natm = mol.natm + mol = hessobj.mol + natm = mol.natm + assert atmlst is None or atmlst ==range(natm) if atmlst is None: atmlst = range(natm) @@ -633,7 +636,7 @@ def _ao2mo(mat, mocc, mo): return contract('xik,ip->xpk', tmp, mo) cupy.get_default_memory_pool().free_all_blocks() - + for i0, ia in enumerate(atmlst): shl0, shl1, p0, p1 = aoslices[ia] vj1_ao = cupy.zeros([3,nao,nao]) diff --git a/gpu4pyscf/df/hessian/uks.py b/gpu4pyscf/df/hessian/uks.py index 5fd23a34..6f94ed06 100644 --- a/gpu4pyscf/df/hessian/uks.py +++ b/gpu4pyscf/df/hessian/uks.py @@ -90,6 +90,8 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): mol = hessobj.mol + natm = mol.natm + assert atmlst is None or atmlst ==range(natm) mf = hessobj.base ni = mf._numint ni.libxc.test_deriv_order(mf.xc, 2, raise_error=True) diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py index 0b2f3f99..2ca835ad 100644 --- a/gpu4pyscf/hessian/rhf.py +++ b/gpu4pyscf/hessian/rhf.py @@ -357,14 +357,16 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): assert atmlst is None mol = hessobj.mol natm = mol.natm - nao = mo_coeff.shape[0] mo_coeff = cp.asarray(mo_coeff) mocc = cp.asarray(mo_coeff[:,mo_occ>0]) dm0 = mocc.dot(mocc.T) * 2 h1mo = rhf_grad.get_grad_hcore(hessobj.base.Gradients()) + # Estimate the size of intermediate variables + # dm, vj, and vk in [natm,3,nao_cart,nao_cart] + nao_cart = mol.nao_cart() avail_mem = get_avail_mem() - slice_size = int(avail_mem*0.6) // (8*3*nao*nao) + slice_size = int(avail_mem*0.5) // (8*3*nao_cart*nao_cart*3) for atoms_slice in lib.prange(0, natm, slice_size): vj, vk = _get_jk_ip1(mol, dm0, atoms_slice=atoms_slice, verbose=verbose) #:vhf = vj - vk * .5 diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py index 261fa631..5d909c78 100644 --- a/gpu4pyscf/hessian/rks.py +++ b/gpu4pyscf/hessian/rks.py @@ -111,7 +111,6 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): mol = hessobj.mol natm = mol.natm assert atmlst is None or atmlst == range(natm) - nao = mo_coeff.shape[0] mocc = mo_coeff[:,mo_occ>0] dm0 = numpy.dot(mocc, mocc.T) * 2 avail_mem = get_avail_mem() @@ -124,8 +123,11 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin) with_k = ni.libxc.is_hybrid_xc(mf.xc) + # Estimate the size of intermediate variables + # dm, vj, and vk in [natm,3,nao_cart,nao_cart] + nao_cart = mol.nao_cart() avail_mem -= 8 * h1mo.size - slice_size = int(avail_mem*0.5) // (8*3*nao*nao) + slice_size = int(avail_mem*0.5) // (8*3*nao_cart*nao_cart*3) for atoms_slice in lib.prange(0, natm, slice_size): vj, vk = rhf_hess._get_jk_ip1(mol, dm0, with_k=with_k, atoms_slice=atoms_slice, verbose=verbose) @@ -133,6 +135,7 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): if with_k: vk *= .5 * hyb veff -= vk + vj = vk = None if abs(omega) > 1e-10 and abs(alpha-hyb) > 1e-10: with mol.with_range_coulomb(omega): vk_lr = rhf_hess._get_jk_ip1(mol, dm0, with_j=False, verbose=verbose)[1] @@ -142,7 +145,7 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): for i, ia in enumerate(range(atom0, atom1)): for ix in range(3): h1mo[ia,ix] += mo_coeff.T.dot(veff[i,ix].dot(mocc)) - vj = vk = vk_lr = veff = None + vk_lr = veff = None return h1mo XX, XY, XZ = 4, 5, 6 diff --git a/gpu4pyscf/hessian/uhf.py b/gpu4pyscf/hessian/uhf.py index c7e836b8..44154532 100644 --- a/gpu4pyscf/hessian/uhf.py +++ b/gpu4pyscf/hessian/uhf.py @@ -183,15 +183,17 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): mo_a, mo_b = mo_coeff mocca = mo_a[:,mo_occ[0]>0] moccb = mo_b[:,mo_occ[1]>0] - nao = mo_a.shape[0] dm0a = mocca.dot(mocca.T) dm0b = moccb.dot(moccb.T) grad_obj = hessobj.base.Gradients() h1moa = rhf_grad.get_grad_hcore(grad_obj, mo_a, mo_occ[0]) h1mob = rhf_grad.get_grad_hcore(grad_obj, mo_b, mo_occ[1]) + # Estimate the size of intermediate variables + # dm, vj, and vk in [natm,3,nao_cart,nao_cart] + nao_cart = mol.nao_cart() avail_mem = get_avail_mem() - slice_size = int(avail_mem*0.6) // (8*3*nao*nao*2) + slice_size = int(avail_mem*0.5) // (8*3*nao_cart*nao_cart*6) for atoms_slice in lib.prange(0, natm, slice_size): vja, vka = rhf_hess_gpu._get_jk_ip1(mol, dm0a, atoms_slice=atoms_slice, verbose=verbose) vjb, vkb = rhf_hess_gpu._get_jk_ip1(mol, dm0b, atoms_slice=atoms_slice, verbose=verbose) diff --git a/gpu4pyscf/hessian/uks.py b/gpu4pyscf/hessian/uks.py index db4bf59e..66571300 100644 --- a/gpu4pyscf/hessian/uks.py +++ b/gpu4pyscf/hessian/uks.py @@ -116,7 +116,6 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): mo_a, mo_b = mo_coeff mocca = mo_a[:,mo_occ[0]>0] moccb = mo_b[:,mo_occ[1]>0] - nao = mo_a.shape[0] dm0a = mocca.dot(mocca.T) dm0b = moccb.dot(moccb.T) avail_mem = get_avail_mem() @@ -131,8 +130,11 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin) with_k = ni.libxc.is_hybrid_xc(mf.xc) + # Estimate the size of intermediate variables + # dm, vj, and vk in [natm,3,nao_cart,nao_cart] + nao_cart = mol.nao_cart() avail_mem -= 8 * (h1moa.size + h1mob.size) - slice_size = int(avail_mem*0.5) // (8*3*nao*nao) + slice_size = int(avail_mem*0.5) // (8*3*nao_cart*nao_cart*6) for atoms_slice in lib.prange(0, natm, slice_size): vja, vka = rhf_hess._get_jk_ip1(mol, dm0a, with_k=with_k, atoms_slice=atoms_slice, verbose=verbose) vjb, vkb = rhf_hess._get_jk_ip1(mol, dm0b, with_k=with_k, atoms_slice=atoms_slice, verbose=verbose) From 31cbd4836338cdf6f9a3ec17428c7d4695717a51 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Sun, 22 Dec 2024 07:10:46 +0000 Subject: [PATCH 09/49] _gen_jk -> _get_jk_ip --- gpu4pyscf/df/hessian/uks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu4pyscf/df/hessian/uks.py b/gpu4pyscf/df/hessian/uks.py index 6f94ed06..31273a7d 100644 --- a/gpu4pyscf/df/hessian/uks.py +++ b/gpu4pyscf/df/hessian/uks.py @@ -114,7 +114,7 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): vj1 = vk1 = vj1a = vj1b = vk1a = vk1b = None if abs(omega) > 1e-10 and abs(alpha-hyb) > 1e-10: - _, vk1_lr = df_uhf_hess._gen_jk(hessobj, mo_coeff, mo_occ, chkfile, + _, vk1_lr = df_uhf_hess._get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile, atmlst, verbose, True, omega=omega) vk1a, vk1b = vk1_lr h1moa -= (alpha - hyb) * vk1a From 4e8501141c77ff58f1f6ba417873f28c4d685cca Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Sun, 22 Dec 2024 16:38:07 -0800 Subject: [PATCH 10/49] with_j and with_k for hessian --- gpu4pyscf/df/hessian/jk.py | 119 ++++++------ gpu4pyscf/df/hessian/rhf.py | 268 +++++++++++++++------------ gpu4pyscf/df/hessian/rks.py | 25 +-- gpu4pyscf/df/hessian/uhf.py | 357 ++++++++++++++++++++---------------- gpu4pyscf/df/hessian/uks.py | 25 +-- gpu4pyscf/df/int3c2e.py | 215 +++++++++++++--------- gpu4pyscf/hessian/rhf.py | 18 +- gpu4pyscf/hessian/rks.py | 13 +- gpu4pyscf/hessian/uhf.py | 9 +- gpu4pyscf/hessian/uks.py | 14 +- 10 files changed, 581 insertions(+), 482 deletions(-) diff --git a/gpu4pyscf/df/hessian/jk.py b/gpu4pyscf/df/hessian/jk.py index fb097180..f8992ca3 100644 --- a/gpu4pyscf/df/hessian/jk.py +++ b/gpu4pyscf/df/hessian/jk.py @@ -18,7 +18,6 @@ import numpy as np from concurrent.futures import ThreadPoolExecutor import cupy -from pyscf import gto from gpu4pyscf.df import int3c2e from gpu4pyscf.scf.int4c2e import libgint from gpu4pyscf.hessian.jk import _ao2mo @@ -79,7 +78,7 @@ def _jk_task_with_mo1(dfobj, dms, mo_coeff, mo1s, occ_coeffs, rhok1 = contract('Lij,jo->Loi', cderi, mo1[i]) rhok1 = rhok1.reshape([-1,nao]) vk[i] += cupy.dot(rhok1.T, rhok_oo) - + rhok1 = rhok1.reshape([-1,nocc,nao]) rhok1 = contract('Loi,ip->Lop', rhok1, occ_coeff) rhok1 = rhok1.reshape([-1,nocc]) @@ -91,7 +90,7 @@ def _jk_task_with_mo1(dfobj, dms, mo_coeff, mo1s, occ_coeffs, vj = cupy.zeros(dms_shape) vj[:,rows,cols] = vj_sparse vj[:,cols,rows] = vj_sparse - + vj_mo = vk_mo = None if len(occ_coeffs) == 1: # Restricted case @@ -127,7 +126,7 @@ def _jk_task_with_mo1(dfobj, dms, mo_coeff, mo1s, occ_coeffs, t0 = log.timer_debug1(f'vj and vk on Device {device_id}', *t0) return vj_mo, vk_mo -def get_jk(dfobj, dms_tag, mo_coeff, mocc, hermi=0, +def get_jk(dfobj, dms_tag, mo_coeff, mocc, hermi=0, with_j=True, with_k=True, direct_scf_tol=1e-14, omega=None): ''' Compute J/K in MO with density fitting ''' @@ -195,7 +194,7 @@ def _get_int3c2e_ipip_slice(ip_type, intopt, cp_ij_id, aux_id, omega=None, strea if omega is None: omega = 0.0 if stream is None: stream = cupy.cuda.get_current_stream() - + fn = getattr(libgint, 'GINTfill_int3c2e_' + ip_type) nao = intopt._sorted_mol.nao naux = intopt._sorted_auxmol.nao @@ -206,7 +205,7 @@ def _get_int3c2e_ipip_slice(ip_type, intopt, cp_ij_id, aux_id, omega=None, strea cp_kl_id = aux_id + len(intopt.log_qs) lk = intopt.aux_angular[aux_id] - + cpi = intopt.cp_idx[cp_ij_id] cpj = intopt.cp_jdx[cp_ij_id] li = intopt.angular[cpi] @@ -251,7 +250,7 @@ def _get_int3c2e_ipip_slice(ip_type, intopt, cp_ij_id, aux_id, omega=None, strea pmol = intopt._tot_mol intor = pmol._add_suffix('int3c2e_' + ip_type) opt = make_cintopt(pmol._atm, pmol._bas, pmol._env, intor) - + # TODO: sph2cart in CPU? ishl0, ishl1 = intopt.l_ctr_offsets[cpi], intopt.l_ctr_offsets[cpi+1] jshl0, jshl1 = intopt.l_ctr_offsets[cpj], intopt.l_ctr_offsets[cpj+1] @@ -270,26 +269,29 @@ def _get_int3c2e_ipip_slice(ip_type, intopt, cp_ij_id, aux_id, omega=None, strea def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo, - device_id=0, with_k=True, omega=None, auxbasis_response=1): + device_id=0, with_j=True, with_k=True, omega=None, + auxbasis_response=1): natm = intopt.mol.natm nao = dm0.shape[0] - naux = rhok.shape[0] + assert with_j or with_k ao_loc = intopt.ao_loc aux_ao_loc = intopt.aux_ao_loc with cupy.cuda.Device(device_id), _streams[device_id]: log = logger.new_logger(intopt.mol, intopt.mol.verbose) t0 = log.init_timer() - rhoj = cupy.asarray(rhoj) - rhok = cupy.asarray(rhok) orbo = cupy.asarray(orbo) dm0 = cupy.asarray(dm0) nao = dm0.shape[0] - - hj_ipip1 = cupy.zeros([9,nao]) - hj_ipip2 = cupy.zeros([9,naux]) - hj_ip1ip2 = cupy.zeros([9,nao,naux]) - hj_ipvip1 = cupy.zeros([9,nao,nao]) + if with_j: + naux = rhoj.shape[0] + rhoj = cupy.asarray(rhoj) + hj_ipip1 = cupy.zeros([9,nao]) + hj_ipip2 = cupy.zeros([9,naux]) + hj_ip1ip2 = cupy.zeros([9,nao,naux]) + hj_ipvip1 = cupy.zeros([9,nao,nao]) if with_k: + naux = rhok.shape[0] + rhok = cupy.asarray(rhok) hk_ipip1 = cupy.zeros([9,nao]) hk_ipip2 = cupy.zeros([9,naux]) hk_ip1ip2 = cupy.zeros([9,nao,naux]) @@ -302,57 +304,63 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo, i0, i1 = ao_loc[cpi], ao_loc[cpi+1] j0, j1 = ao_loc[cpj], ao_loc[cpj+1] k0, k1 = aux_ao_loc[aux_id], aux_ao_loc[aux_id+1] - + if with_k: rhok_tmp = contract('por,ir->poi', rhok[k0:k1], orbo[i0:i1]) rhok_tmp = contract('poi,jo->pji', rhok_tmp, orbo[j0:j1]) # (20|0), (0|0)(0|00) int3c_blk = _get_int3c2e_ipip_slice('ipip1', intopt, cp_ij_id, aux_id, omega=omega) - tmp = contract('xpji,ij->xpi', int3c_blk, dm0[i0:i1,j0:j1]) - hj_ipip1[:,i0:i1] += contract('xpi,p->xi', tmp, rhoj[k0:k1]) + if with_j: + tmp = contract('xpji,ij->xpi', int3c_blk, dm0[i0:i1,j0:j1]) + hj_ipip1[:,i0:i1] += contract('xpi,p->xi', tmp, rhoj[k0:k1]) if with_k: hk_ipip1[:,i0:i1] += contract('xpji,pji->xi', int3c_blk, rhok_tmp) # (11|0), (0|0)(0|00) without response of RI basis int3c_blk = _get_int3c2e_ipip_slice('ipvip1', intopt, cp_ij_id, aux_id, omega=omega) - tmp = contract('xpji,ij->xpij', int3c_blk, dm0[i0:i1,j0:j1]) - hj_ipvip1[:,i0:i1,j0:j1] += contract('xpij,p->xij', tmp, rhoj[k0:k1]) + if with_j: + tmp = contract('xpji,ij->xpij', int3c_blk, dm0[i0:i1,j0:j1]) + hj_ipvip1[:,i0:i1,j0:j1] += contract('xpij,p->xij', tmp, rhoj[k0:k1]) if with_k: hk_ipvip1[:,i0:i1,j0:j1] += contract('xpji,pji->xij', int3c_blk, rhok_tmp) if auxbasis_response < 1: continue - + # (10|1), (0|0)(0|00) int3c_blk = _get_int3c2e_ipip_slice('ip1ip2', intopt, cp_ij_id, aux_id, omega=omega) - tmp = contract('xpji,ij->xpi', int3c_blk, dm0[i0:i1,j0:j1]) - hj_ip1ip2[:,i0:i1,k0:k1] += contract('xpi,p->xip', tmp, rhoj[k0:k1]) + if with_j: + tmp = contract('xpji,ij->xpi', int3c_blk, dm0[i0:i1,j0:j1]) + hj_ip1ip2[:,i0:i1,k0:k1] += contract('xpi,p->xip', tmp, rhoj[k0:k1]) if with_k: hk_ip1ip2[:,i0:i1,k0:k1] += contract('xpji,pji->xip', int3c_blk, rhok_tmp) - + if auxbasis_response < 2: continue - + # (00|2), (0|0)(0|00) int3c_blk = _get_int3c2e_ipip_slice('ipip2', intopt, cp_ij_id, aux_id, omega=omega) - tmp = contract('xpji,ij->xp', int3c_blk, dm0[i0:i1,j0:j1]) - hj_ipip2[:,k0:k1] += contract('xp,p->xp', tmp, rhoj[k0:k1]) + if with_j: + tmp = contract('xpji,ij->xp', int3c_blk, dm0[i0:i1,j0:j1]) + hj_ipip2[:,k0:k1] += contract('xp,p->xp', tmp, rhoj[k0:k1]) if with_k: hk_ipip2[:,k0:k1] += contract('xpji,pji->xp', int3c_blk, rhok_tmp) - + auxslices = intopt.auxmol.aoslice_by_atom() aoslices = intopt.mol.aoslice_by_atom() ao2atom = int3c2e.get_ao2atom(intopt, aoslices) aux2atom = int3c2e.get_aux2atom(intopt, auxslices) - hj_ipvip1 = hj_ipvip1.reshape([3,3,nao,nao]) - tmp = contract('ia,xyij->ajxy', ao2atom, hj_ipvip1) - hj = 2.0 * contract('jb,ajxy->abxy', ao2atom, tmp) + hj = None + if with_j: + hj_ipvip1 = hj_ipvip1.reshape([3,3,nao,nao]) + tmp = contract('ia,xyij->ajxy', ao2atom, hj_ipvip1) + hj = 2.0 * contract('jb,ajxy->abxy', ao2atom, tmp) - hj_ipip1 = hj_ipip1.reshape([3,3,nao]) - tmp = contract('ia,xyi->axy', ao2atom, hj_ipip1) - hj[range(natm), range(natm)] += 2.0 * tmp + hj_ipip1 = hj_ipip1.reshape([3,3,nao]) + tmp = contract('ia,xyi->axy', ao2atom, hj_ipip1) + hj[range(natm), range(natm)] += 2.0 * tmp hk = None if with_k: @@ -363,15 +371,16 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo, hk_ipip1 = hk_ipip1.reshape([3,3,nao]) tmp = contract('ia,xyi->axy', ao2atom, hk_ipip1) hk[range(natm), range(natm)] += tmp - + if auxbasis_response > 0: - hj_ip1ip2 = hj_ip1ip2.reshape([3,3,nao,naux]) - tmp = contract('ia,xyij->ajxy', ao2atom, hj_ip1ip2) - tmp = contract('jb,ajxy->abxy',aux2atom, tmp) - tmp = tmp + tmp.transpose([1,0,3,2]) - hj += tmp - if auxbasis_response > 1: + if with_j: + hj_ip1ip2 = hj_ip1ip2.reshape([3,3,nao,naux]) + tmp = contract('ia,xyij->ajxy', ao2atom, hj_ip1ip2) + tmp = contract('jb,ajxy->abxy',aux2atom, tmp) + tmp = tmp + tmp.transpose([1,0,3,2]) hj += tmp + if auxbasis_response > 1: + hj += tmp if with_k: hk_ip1ip2 = hk_ip1ip2.reshape([3,3,nao,naux]) tmp = contract('ia,xyij->ajxy', ao2atom, hk_ip1ip2) @@ -380,11 +389,12 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo, hk += tmp if auxbasis_response > 1: hk += tmp - + if auxbasis_response > 1: - hj_ipip2 = hj_ipip2.reshape([3,3,naux]) - tmp = contract('ia,xyi->axy', aux2atom, hj_ipip2) - hj[range(natm), range(natm)] += tmp + if with_j: + hj_ipip2 = hj_ipip2.reshape([3,3,naux]) + tmp = contract('ia,xyi->axy', aux2atom, hj_ipip2) + hj[range(natm), range(natm)] += tmp if with_k: hk_ipip2 = hk_ipip2.reshape([3,3,naux]) tmp = contract('ia,xyi->axy', aux2atom, hk_ipip2) @@ -392,7 +402,7 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo, t0 = log.timer_debug1(f'int3c2e_ipip on Device {device_id}', *t0) return hj, hk -def get_int3c2e_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, +def get_int3c2e_hjk(intopt, rhoj, rhok, dm0_tag, with_j=True, with_k=True, omega=None, auxbasis_response=1): orbo = cupy.asarray(dm0_tag.occ_coeff, order='C') futures = [] @@ -402,26 +412,27 @@ def get_int3c2e_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, task_list = [] for device_id in range(_num_devices): task_list.append(tasks[device_id::_num_devices]) - + cupy.cuda.get_current_stream().synchronize() with ThreadPoolExecutor(max_workers=_num_devices) as executor: for device_id in range(_num_devices): future = executor.submit( - _int3c2e_ipip_tasks, intopt, task_list[device_id], - rhoj, rhok, dm0_tag, orbo, with_k=with_k, - device_id=device_id, omega=omega, + _int3c2e_ipip_tasks, intopt, task_list[device_id], + rhoj, rhok, dm0_tag, orbo, with_j=with_j, with_k=with_k, + device_id=device_id, omega=omega, auxbasis_response=auxbasis_response) futures.append(future) - + hj_total = [] hk_total = [] for future in futures: hj, hk = future.result() hj_total.append(hj) hk_total.append(hk) - + hj = hk = None - hj = reduce_to_device(hj_total, inplace=True) + if with_j: + hj = reduce_to_device(hj_total, inplace=True) if with_k: hk = reduce_to_device(hk_total, inplace=True) return hj, hk diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py index de6f1ebb..938b1384 100644 --- a/gpu4pyscf/df/hessian/rhf.py +++ b/gpu4pyscf/df/hessian/rhf.py @@ -70,8 +70,8 @@ def _hk_ip1_ip1(rhok1_Pko, dm0, mocc_2): rhok1_Pkl_kslice = rhok1_Pko_kslice = None return hk_ao_ao -def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, - atmlst=None, max_memory=4000, verbose=None, with_k=True, omega=None): +def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None, + max_memory=None, verbose=None, with_j=True, with_k=True, omega=None): '''Partial derivative ''' log = logger.new_logger(hessobj, verbose) @@ -121,18 +121,24 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, int2c_ip1 = cupy.asarray(int2c_ip1, order='C') int2c_ip1 = intopt.sort_orbitals(int2c_ip1, aux_axis=[1,2]) - - hj_ao_ao = cupy.zeros([nao,nao,3,3]) - hk_ao_ao = cupy.zeros([nao,nao,3,3]) + if with_j: + hj_ao_ao = cupy.zeros([nao,nao,3,3]) + if with_k: + hk_ao_ao = cupy.zeros([nao,nao,3,3]) if hessobj.auxbasis_response: - hj_ao_aux = cupy.zeros([nao,naux,3,3]) - hk_ao_aux = cupy.zeros([nao,naux,3,3]) + if with_j: + hj_ao_aux = cupy.zeros([nao,naux,3,3]) + if with_k: + hk_ao_aux = cupy.zeros([nao,naux,3,3]) # int3c contributions wj, wk_P__ = int3c2e.get_int3c2e_jk(mol, auxmol, dm0_tag, omega=omega) t1 = log.timer_debug1('intermediate variables with int3c2e', *t1) - rhoj0_P = solve_j2c(wj) - rhok0_P__ = solve_j2c(wk_P__) + rhoj0_P = rhok0_P__ = None + if with_j: + rhoj0_P = solve_j2c(wj) + if with_k: + rhok0_P__ = solve_j2c(wk_P__) wj = wk_P__ = None # int3c_ip2 contributions @@ -142,18 +148,19 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, # int3c_ip1 contributions wj1_P, wk1_Pko = int3c2e.get_int3c2e_ip1_wjk(intopt, dm0_tag, omega=omega) #rhoj1_P = contract('pq,pix->qix', int2c_inv, wj1_P) - rhoj1_P = solve_j2c(wj1_P) + if with_j: + rhoj1_P = solve_j2c(wj1_P) - hj_ao_ao += 4.0*contract('pix,pjy->ijxy', rhoj1_P, wj1_P) # (10|0)(0|0)(0|01) - wj1_P = None - if hessobj.auxbasis_response: - wj0_01 = contract('ypq,q->yp', int2c_ip1, rhoj0_P) - wj1_01 = contract('yqp,pix->iqxy', int2c_ip1, rhoj1_P) - hj_ao_aux += contract('pix,py->ipxy', rhoj1_P, wj_ip2) # (10|0)(1|00) - hj_ao_aux -= contract('pix,yp->ipxy', rhoj1_P, wj0_01) # (10|0)(1|0)(0|00) - hj_ao_aux -= contract('q,iqxy->iqxy', rhoj0_P, wj1_01) # (10|0)(0|1)(0|00) - wj1_01 = None - rhoj1_P = None + hj_ao_ao += 4.0*contract('pix,pjy->ijxy', rhoj1_P, wj1_P) # (10|0)(0|0)(0|01) + wj1_P = None + if hessobj.auxbasis_response: + wj0_01 = contract('ypq,q->yp', int2c_ip1, rhoj0_P) + wj1_01 = contract('yqp,pix->iqxy', int2c_ip1, rhoj1_P) + hj_ao_aux += contract('pix,py->ipxy', rhoj1_P, wj_ip2) # (10|0)(1|00) + hj_ao_aux -= contract('pix,yp->ipxy', rhoj1_P, wj0_01) # (10|0)(1|0)(0|00) + hj_ao_aux -= contract('q,iqxy->iqxy', rhoj0_P, wj1_01) # (10|0)(0|1)(0|00) + wj1_01 = None + rhoj1_P = None if with_k: cupy.get_default_memory_pool().free_all_blocks() @@ -203,13 +210,13 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, rhok1_Pko[:,i0:i1] = contract('qp,qiox->piox', cd_low, wk1_tmp).get() wk1_tmp = None cd_low = None - + hk_ao_ao += _hk_ip1_ip1(rhok1_Pko, dm0, mocc_2) wk1_Pko = rhok1_Pko = None t1 = log.timer_debug1('intermediate variables with int3c2e_ip1', *t1) hj_ipip, hk_ipip = jk.get_int3c2e_hjk(intopt, rhoj0_P, rhok0_P__, dm0_tag, - with_k=with_k, omega=omega, + with_j=with_j, with_k=with_k, omega=omega, auxbasis_response=hessobj.auxbasis_response) t1 = log.timer_debug1('intermediate variables with int3c2e_ipip', *t1) @@ -222,10 +229,12 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, int2c_ipip1 = auxmol.intor('int2c2e_ipip1', aosym='s1') int2c_ipip1 = cupy.asarray(int2c_ipip1, order='C') int2c_ipip1 = intopt.sort_orbitals(int2c_ipip1, aux_axis=[1,2]) - rhoj2c_P = contract('xpq,q->xp', int2c_ipip1, rhoj0_P) + # (00|0)(2|0)(0|00) # p,xp->px - hj_aux_diag = -(rhoj0_P*rhoj2c_P).T.reshape(-1,3,3) + if with_j: + rhoj2c_P = contract('xpq,q->xp', int2c_ipip1, rhoj0_P) + hj_aux_diag = -(rhoj0_P*rhoj2c_P).T.reshape(-1,3,3) if with_k: rho2c_0 = contract('pij,qji->pq', rhok0_P__, rhok0_P__) hk_aux_diag = -.5 * contract('pq,xpq->px', rho2c_0, int2c_ipip1).reshape(-1,3,3) @@ -238,7 +247,8 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, int2c_ip1ip2 = auxmol.intor('int2c2e_ip1ip2', aosym='s1') int2c_ip1ip2 = cupy.asarray(int2c_ip1ip2, order='C') int2c_ip1ip2 = intopt.sort_orbitals(int2c_ip1ip2, aux_axis=[1,2]) - hj_aux_aux = -.5 * contract('p,xpq->pqx', rhoj0_P, int2c_ip1ip2*rhoj0_P).reshape(naux, naux,3,3) + if with_j: + hj_aux_aux = -.5 * contract('p,xpq->pqx', rhoj0_P, int2c_ip1ip2*rhoj0_P).reshape(naux, naux,3,3) if with_k: hk_aux_aux = -.5 * contract('xpq,pq->pqx', int2c_ip1ip2, rho2c_0).reshape(naux,naux,3,3) t1 = log.timer_debug1('intermediate variables with int2c_*', *t1) @@ -249,23 +259,23 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, # aux-aux pair if hessobj.auxbasis_response > 1: int2c_inv = pinv(int2c, lindep=LINEAR_DEP_THR) - wj0_10 = contract('ypq,p->ypq', int2c_ip1, rhoj0_P) int2c_ip1_inv = contract('yqp,pr->yqr', int2c_ip1, int2c_inv) - - rhoj0_10 = contract('p,xpq->xpq', rhoj0_P, int2c_ip1_inv) # (1|0)(0|00) - hj_aux_aux += .5 * contract('xpr,yqr->pqxy', rhoj0_10, wj0_10) # (00|0)(1|0), (0|1)(0|00) - hj_aux_aux += contract('xpq,yq->pqxy', rhoj0_10, wj0_01) # (00|0)(1|0), (1|0)(0|00) - rhoj0_10 = rhoj0_P = None - - rhoj1 = contract('px,pq->xpq', wj_ip2, int2c_inv) # (0|0)(1|00) - hj_aux_aux -= contract('xpq,yq->pqxy', rhoj1, wj0_01) # (00|1), (1|0)(0|00) - hj_aux_aux += .5 * contract('xpq,qy->pqxy', rhoj1, wj_ip2) # (00|1), (1|00) - hj_aux_aux -= contract('xpr,yqr->pqxy', rhoj1, wj0_10) # (00|1), (0|1)(0|00) - wj0_10 = rhoj1 = wj_ip2 = None - - rhoj0_01 = contract('xp,pq->xpq', wj0_01, int2c_inv) # (0|1)(0|00) - hj_aux_aux += .5 * contract('xpq,yq->pqxy', rhoj0_01, wj0_01) # (00|0)(0|1), (1|0)(0|00) - wj0_01 = rhoj0_01 = None + if with_j: + wj0_10 = contract('ypq,p->ypq', int2c_ip1, rhoj0_P) + rhoj0_10 = contract('p,xpq->xpq', rhoj0_P, int2c_ip1_inv) # (1|0)(0|00) + hj_aux_aux += .5 * contract('xpr,yqr->pqxy', rhoj0_10, wj0_10) # (00|0)(1|0), (0|1)(0|00) + hj_aux_aux += contract('xpq,yq->pqxy', rhoj0_10, wj0_01) # (00|0)(1|0), (1|0)(0|00) + rhoj0_10 = rhoj0_P = None + + rhoj1 = contract('px,pq->xpq', wj_ip2, int2c_inv) # (0|0)(1|00) + hj_aux_aux -= contract('xpq,yq->pqxy', rhoj1, wj0_01) # (00|1), (1|0)(0|00) + hj_aux_aux += .5 * contract('xpq,qy->pqxy', rhoj1, wj_ip2) # (00|1), (1|00) + hj_aux_aux -= contract('xpr,yqr->pqxy', rhoj1, wj0_10) # (00|1), (0|1)(0|00) + wj0_10 = rhoj1 = wj_ip2 = None + + rhoj0_01 = contract('xp,pq->xpq', wj0_01, int2c_inv) # (0|1)(0|00) + hj_aux_aux += .5 * contract('xpq,yq->pqxy', rhoj0_01, wj0_01) # (00|0)(0|1), (1|0)(0|00) + wj0_01 = rhoj0_01 = None if with_k: rho2c_10 = contract('rijx,qij->rqx', wk_ip2_P__, rhok0_P__) @@ -296,11 +306,12 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, t1 = log.timer_debug1('contract int2c_*', *t1) dm0 = intopt.unsort_orbitals(dm0, axis=[0,1]) - hj_ao_ao = intopt.unsort_orbitals(hj_ao_ao, axis=[0,1]) - if hessobj.auxbasis_response: - hj_ao_aux = intopt.unsort_orbitals(hj_ao_aux, axis=[0], aux_axis=[1]) - if hessobj.auxbasis_response > 1: - hj_aux_aux = intopt.unsort_orbitals(hj_aux_aux, aux_axis=[0,1]) + if with_j: + hj_ao_ao = intopt.unsort_orbitals(hj_ao_ao, axis=[0,1]) + if hessobj.auxbasis_response: + hj_ao_aux = intopt.unsort_orbitals(hj_ao_aux, axis=[0], aux_axis=[1]) + if hessobj.auxbasis_response > 1: + hj_aux_aux = intopt.unsort_orbitals(hj_aux_aux, aux_axis=[0,1]) if with_k: hk_ao_ao = intopt.unsort_orbitals(hk_ao_ao, axis=[0,1]) if hessobj.auxbasis_response: @@ -334,8 +345,9 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, e1[i0,i0] -= cupy.sum(h1aa[p0:p1], axis=0) for j0, ja in enumerate(atmlst[:i0+1]): q0, q1 = aoslices[ja][2:] - ej[i0,j0] += cupy.sum(hj_ao_ao[p0:p1,q0:q1], axis=[0,1]) e1[i0,j0] -= cupy.sum(h1ab[p0:p1,q0:q1], axis=[0,1]) + if with_j: + ej[i0,j0] += cupy.sum(hj_ao_ao[p0:p1,q0:q1], axis=[0,1]) if with_k: ek[i0,j0] += cupy.sum(hk_ao_ao[p0:p1,q0:q1], axis=[0,1]) e1[i0,j0] += de_hcore(ia, ja) @@ -344,13 +356,14 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, # if hessobj.auxbasis_response: for j0, (q0, q1) in enumerate(auxslices[:,2:]): - _ej = cupy.sum(hj_ao_aux[p0:p1,q0:q1], axis=[0,1]) - if hessobj.auxbasis_response > 1: - ej[i0,j0] += _ej * 2 - ej[j0,i0] += _ej.T * 2 - else: - ej[i0,j0] += _ej - ej[j0,i0] += _ej.T + if with_j: + _ej = cupy.sum(hj_ao_aux[p0:p1,q0:q1], axis=[0,1]) + if hessobj.auxbasis_response > 1: + ej[i0,j0] += _ej * 2 + ej[j0,i0] += _ej.T * 2 + else: + ej[i0,j0] += _ej + ej[j0,i0] += _ej.T if with_k: _ek = cupy.sum(hk_ao_aux[p0:p1,q0:q1], axis=[0,1]) if hessobj.auxbasis_response > 1: @@ -365,9 +378,10 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, if hessobj.auxbasis_response > 1: shl0, shl1, p0, p1 = auxslices[ia] for j0, (q0, q1) in enumerate(auxslices[:,2:]): - _ej = cupy.sum(hj_aux_aux[p0:p1,q0:q1], axis=[0,1]) - ej[i0,j0] += _ej - ej[j0,i0] += _ej.T + if with_j: + _ej = cupy.sum(hj_aux_aux[p0:p1,q0:q1], axis=[0,1]) + ej[i0,j0] += _ej + ej[j0,i0] += _ej.T if with_k: _ek = cupy.sum(hk_aux_aux[p0:p1,q0:q1], axis=[0,1]) ek[i0,j0] += _ek * .5 @@ -375,22 +389,24 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, for i0, ia in enumerate(atmlst): for j0 in range(i0): e1[j0,i0] = e1[i0,j0].T - ej[j0,i0] = ej[i0,j0].T + if with_j: + ej[j0,i0] = ej[i0,j0].T if with_k: ek[j0,i0] = ek[i0,j0].T - + t1 = log.timer_debug1('hcore contribution', *t1) - + aux2atom = int3c2e.get_aux2atom(intopt, auxslices) - + natm = mol.natm idx = range(natm) # Diagonal contributions if hessobj.auxbasis_response > 1: - ej[idx, idx] += contract('ia,ixy->axy', aux2atom, hj_aux_diag) + if with_j: + ej[idx, idx] += contract('ia,ixy->axy', aux2atom, hj_aux_diag) if with_k: ek[idx, idx] += contract('ia,ixy->axy', aux2atom, hk_aux_diag) - + log.timer('RHF partial hessian', *time0) return e1, ej, ek @@ -408,7 +424,7 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): return h1mo def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, - verbose=None, with_k=True, omega=None): + verbose=None, with_j=True, with_k=True, omega=None): ''' Derivatives of J, K matrices in MO bases ''' @@ -420,8 +436,7 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, mol = hessobj.mol if atmlst is None: atmlst = range(mol.natm) - # FIXME - with_k = True + mo_coeff = cupy.asarray(mo_coeff, order='C') mo_occ = cupy.asarray(mo_occ, order='C') @@ -457,13 +472,16 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, mo_coeff = intopt.sort_orbitals(mo_coeff, axis=[0]) dm0 = intopt.sort_orbitals(dm0, axis=[0,1]) dm0_tag = tag_array(dm0, occ_coeff=mocc) - + int2c = intopt.sort_orbitals(int2c, aux_axis=[0,1]) solve_j2c = _gen_metric_solver(int2c) - wj, wk_Pl_ = int3c2e.get_int3c2e_wjk(mol, auxmol, dm0_tag, omega=omega) - rhoj0 = solve_j2c(wj) + wj, wk_Pl_ = int3c2e.get_int3c2e_wjk(mol, auxmol, dm0_tag, + with_j=with_j, with_k=True, omega=omega) + rhoj0 = None + if with_j: + rhoj0 = solve_j2c(wj) + wj = None - wj = None if isinstance(wk_Pl_, cupy.ndarray): rhok0_Pl_ = solve_j2c(wk_Pl_) else: @@ -472,8 +490,10 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, wk_tmp = cupy.asarray(wk_Pl_[:,p0:p1]) rhok0_Pl_[:,p0:p1] = solve_j2c(wk_tmp).get() wk_tmp = None - wk_Pl_ = solve_j2c = None + wk_Pl_ = None + solve_j2c = None t0 = log.timer_debug1('Fock matrix due to int3c2e', *t0) + vj1_int3c = vk1_int3c = None # -------------------------- # int3c_ip2 contribution @@ -481,8 +501,8 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, cupy.get_default_memory_pool().free_all_blocks() if hessobj.auxbasis_response: fn = int3c2e.get_int3c2e_ip2_vjk - vj1_int3c_ip2, vk1_int3c_ip2 = fn(intopt, rhoj0, rhok0_Pl_, dm0_tag, auxslices, omega=omega) - vk1_int3c_ip2 *= 2.0 + vj1_int3c, vk1_int3c = fn(intopt, rhoj0, rhok0_Pl_, dm0_tag, auxslices, + with_j=with_j, with_k=with_k, omega=omega) # Responses due to int2c2e_ip1 if omega and omega > 1e-10: with auxmol.with_range_coulomb(omega): @@ -492,18 +512,19 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, int2c_ip1 = cupy.asarray(int2c_ip1, order='C') int2c_ip1 = intopt.sort_orbitals(int2c_ip1, aux_axis=[1,2]) - # Generate rhok0_P__ - if isinstance(rhok0_Pl_, cupy.ndarray): - rhok0_P__ = contract('pio,ir->pro', rhok0_Pl_, mocc) - else: - rhok0_P__ = cupy.empty([naux,nocc,nocc]) - for p0, p1 in lib.prange(0,naux,64): - rhok0_Pl_tmp = cupy.asarray(rhok0_Pl_[p0:p1]) - rhok0_P__[p0:p1] = contract('pio,ir->pro', rhok0_Pl_tmp, mocc) - rhok0_Pl_tmp = None - - wj0_10 = contract('xpq,q->xp', int2c_ip1, rhoj0) - wk0_10_P__ = contract('xqp,pro->xqro', int2c_ip1, rhok0_P__) + if with_j: + wj0_10 = contract('xpq,q->xp', int2c_ip1, rhoj0) + if with_k: + # Generate rhok0_P__ + if isinstance(rhok0_Pl_, cupy.ndarray): + rhok0_P__ = contract('pio,ir->pro', rhok0_Pl_, mocc) + else: + rhok0_P__ = cupy.empty([naux,nocc,nocc]) + for p0, p1 in lib.prange(0,naux,64): + rhok0_Pl_tmp = cupy.asarray(rhok0_Pl_[p0:p1]) + rhok0_P__[p0:p1] = contract('pio,ir->pro', rhok0_Pl_tmp, mocc) + rhok0_Pl_tmp = None + wk0_10_P__ = contract('xqp,pro->xqro', int2c_ip1, rhok0_P__) aux2atom = int3c2e.get_aux2atom(intopt, auxslices) mem_avail = get_avail_mem() @@ -514,24 +535,21 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, for p0, p1 in lib.prange(0,nao,blksize): rhok_tmp = cupy.asarray(rhok0_Pl_[:,p0:p1]) - vj1_tmp = contract('pio,xp->xpio', rhok_tmp, wj0_10) - wk0_10_Pl_ = contract('xqp,pio->xqio', int2c_ip1, rhok_tmp) - vj1_tmp += contract('xpio,p->xpio', wk0_10_Pl_, rhoj0) - vj1_int3c_ip2[:,:,p0:p1] += contract('xpio,pa->axio', vj1_tmp, aux2atom) - vj1_tmp = None + if with_j: + vj1_tmp = contract('pio,xp->xpio', rhok_tmp, wj0_10) + vj1_tmp += contract('xpio,p->xpio', wk0_10_Pl_, rhoj0) + vj1_int3c[:,:,p0:p1] += contract('xpio,pa->axio', vj1_tmp, aux2atom) + vj1_tmp = None if with_k: vk1_tmp = contract('xpio,pro->xpir', wk0_10_Pl_, rhok0_P__) vk1_tmp += contract('xpro,pir->xpio', wk0_10_P__, rhok_tmp) # 2.0 due to spin - vk1_int3c_ip2[:,:,p0:p1] += 2.0*contract('xpio,pa->axio', vk1_tmp, aux2atom) + vk1_int3c[:,:,p0:p1] += contract('xpio,pa->axio', vk1_tmp, aux2atom) vk1_tmp = None wk0_10_Pl_ = rhok_tmp = None wj0_10 = wk0_10_P__ = rhok0_P__ = int2c_ip1 = None aux2atom = None - - vj1_int3c_ip2 = contract('nxiq,ip->nxpq', vj1_int3c_ip2, mo_coeff) - vk1_int3c_ip2 = contract('nxiq,ip->nxpq', vk1_int3c_ip2, mo_coeff) t0 = log.timer_debug1('Fock matrix due to int3c2e_ip2', *t0) # ----------------------------- @@ -539,17 +557,31 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, # ------------------------------ cupy.get_default_memory_pool().free_all_blocks() fn = int3c2e.get_int3c2e_ip1_vjk - vj1_buf, vk1_buf, vj1_ao, vk1_ao = fn(intopt, rhoj0, rhok0_Pl_, dm0_tag, aoslices, omega=omega) - rhoj0 = rhok0_Pl_ = None - vk1_ao *= 2.0 - vk1_buf *= 2.0 - - vj1_buf = intopt.unsort_orbitals(vj1_buf, axis=[1,2]) - vk1_buf = intopt.unsort_orbitals(vk1_buf, axis=[1,2]) - - vj1_int3c_ip1 = -contract('nxiq,ip->nxpq', vj1_ao, mo_coeff) - vk1_int3c_ip1 = -contract('nxiq,ip->nxpq', vk1_ao, mo_coeff) - vj1_ao = vk1_ao = None + vj1_buf, vk1_buf, vj1_ao, vk1_ao = fn(intopt, rhoj0, rhok0_Pl_, dm0_tag, aoslices, + omega=omega, with_j=with_j, with_k=with_k) + rhoj0 = rhok0_Pl_ = dm0_tag = None + if with_j: + vj1_buf = intopt.unsort_orbitals(vj1_buf, axis=[1,2]) + if vj1_int3c is None: + vj1_int3c = -vj1_ao + else: + vj1_int3c -= vj1_ao + vj1_ao = None + # NOTE: vj1_int3c and vk1_int3c are in [natm,3,nao,nocc] + # axis=2 in AO, axis=3 in MO + # convert axis=2 into MO now + vj1_int3c = contract('nxiq,ip->nxpq', vj1_int3c, mo_coeff) + + if with_k: + vk1_buf = intopt.unsort_orbitals(vk1_buf, axis=[1,2]) + if vk1_int3c is None: + vk1_int3c = -vk1_ao + else: + vk1_int3c -= vk1_ao + vk1_ao = None + # * 2.0 due to the contraction with mocc + vk1_buf *= 2.0 + vk1_int3c = 2.0 * contract('nxiq,ip->nxpq', vk1_int3c, mo_coeff) t0 = log.timer_debug1('Fock matrix due to int3c2e_ip1', *t0) mocc = intopt.unsort_orbitals(mocc, axis=[0]) @@ -561,37 +593,29 @@ def _ao2mo(mat): tmp = contract('xij,jo->xio', mat, mocc) return contract('xik,ip->xpk', tmp, mo_coeff) - vj1_int3c = vj1_int3c_ip1 + vj1_int3c_ip2 - vj1_int3c_ip1 = vj1_int3c_ip2 = None - if with_k: - vk1_int3c = vk1_int3c_ip1 + vk1_int3c_ip2 - vk1_int3c_ip1 = vk1_int3c_ip2 = None - cupy.get_default_memory_pool().free_all_blocks() for i0, ia in enumerate(atmlst): shl0, shl1, p0, p1 = aoslices[ia] - vj1_ao = cupy.zeros([3,nao,nao]) - vk1_ao = cupy.zeros([3,nao,nao]) - - vj1_ao[:,p0:p1,:] -= vj1_buf[:,p0:p1,:] - vj1_ao[:,:,p0:p1] -= vj1_buf[:,p0:p1,:].transpose(0,2,1) + if with_j: + vj1_ao = cupy.zeros([3,nao,nao]) + vj1_ao[:,p0:p1,:] -= vj1_buf[:,p0:p1,:] + vj1_ao[:,:,p0:p1] -= vj1_buf[:,p0:p1,:].transpose(0,2,1) + vj1_int3c[ia] += _ao2mo(vj1_ao) if with_k: + vk1_ao = cupy.zeros([3,nao,nao]) vk1_ao[:,p0:p1,:] -= vk1_buf[:,p0:p1,:] vk1_ao[:,:,p0:p1] -= vk1_buf[:,p0:p1,:].transpose(0,2,1) - - vj1_int3c[ia] += _ao2mo(vj1_ao) - if with_k: vk1_int3c[ia] += _ao2mo(vk1_ao) return vj1_int3c, vk1_int3c -def _get_jk_mo(hessobj, mol, dms, mo_coeff, mocc, +def _get_jk_mo(hessobj, mol, dms, mo_coeff, mocc, hermi=1, with_j=True, with_k=True, omega=None): mf = hessobj.base dfobj = mf.with_df if omega is None: - return jk.get_jk(dfobj, dms, mo_coeff, mocc, + return jk.get_jk(dfobj, dms, mo_coeff, mocc, hermi=hermi, with_j=with_j, with_k=with_k) - + # A temporary treatment for RSH-DF integrals key = '%.6f' % omega if key in dfobj._rsh_df: diff --git a/gpu4pyscf/df/hessian/rks.py b/gpu4pyscf/df/hessian/rks.py index 31c1d506..e0d5cd90 100644 --- a/gpu4pyscf/df/hessian/rks.py +++ b/gpu4pyscf/df/hessian/rks.py @@ -27,6 +27,7 @@ from gpu4pyscf.hessian import rhf as rhf_hess from gpu4pyscf.hessian import rks as rks_hess from gpu4pyscf.df.hessian import rhf as df_rhf_hess +from gpu4pyscf.df.hessian.rhf import _get_jk_ip, _partial_hess_ejk from gpu4pyscf.lib import logger from gpu4pyscf.lib.cupy_helper import contract @@ -50,17 +51,17 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin) with_k = mf._numint.libxc.is_hybrid_xc(mf.xc) - de2, ej, ek = df_rhf_hess._partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ, - atmlst, max_memory, verbose, - with_k=with_k) + de2, ej, ek = _partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ, + atmlst, max_memory, verbose, + with_j=True, with_k=with_k) de2 += ej # (A,B,dR_A,dR_B) if with_k: de2 -= hyb * ek if abs(omega) > 1e-10 and abs(alpha-hyb) > 1e-10: - ek_lr = df_rhf_hess._partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ, - atmlst, max_memory, verbose, - True, omega=omega)[2] + ek_lr = _partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ, + atmlst, max_memory, verbose, + with_j=False, with_k=True, omega=omega)[2] de2 -= (alpha - hyb) * ek_lr max_memory = None @@ -93,18 +94,18 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin) mem_now = lib.current_memory()[0] max_memory = max(2000, mf.max_memory*.9-mem_now) - + with_k = ni.libxc.is_hybrid_xc(mf.xc) - vj1, vk1 = df_rhf_hess._get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile, - atmlst, verbose, with_k) + vj1, vk1 = _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile, + atmlst, verbose, with_j=True, with_k=with_k) h1mo = vj1 if with_k: h1mo -= .5 * hyb * vk1 vj1 = vk1 = None - + if abs(omega) > 1e-10 and abs(alpha-hyb) > 1e-10: - _, vk1_lr = df_rhf_hess._get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile, - atmlst, verbose, True, omega=omega) + _, vk1_lr = _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile, atmlst, + verbose, with_j=False, with_k=True, omega=omega) h1mo -= .5 * (alpha - hyb) * vk1_lr vk1_lr = None diff --git a/gpu4pyscf/df/hessian/uhf.py b/gpu4pyscf/df/hessian/uhf.py index acd67380..d6f26e5d 100644 --- a/gpu4pyscf/df/hessian/uhf.py +++ b/gpu4pyscf/df/hessian/uhf.py @@ -50,11 +50,13 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None, max_memory=4000, verbose=None): e1, ej, ek = _partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ, - atmlst, max_memory, verbose, True) + atmlst, max_memory, verbose, + with_j=True, with_k=True) return e1 + ej - ek def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, - atmlst=None, max_memory=4000, verbose=None, with_k=True, omega=None): + atmlst=None, max_memory=4000, verbose=None, + with_j=True, with_k=True, omega=None): '''Partial derivative ''' log = logger.new_logger(hessobj, verbose) @@ -113,43 +115,53 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, int2c_ip1 = cupy.asarray(int2c_ip1, order='C') int2c_ip1 = intopt.sort_orbitals(int2c_ip1, aux_axis=[1,2]) - hj_ao_ao = cupy.zeros([nao,nao,3,3]) - hk_ao_ao = cupy.zeros([nao,nao,3,3]) + if with_j: + hj_ao_ao = cupy.zeros([nao,nao,3,3]) + if with_k: + hk_ao_ao = cupy.zeros([nao,nao,3,3]) if hessobj.auxbasis_response: - hj_ao_aux = cupy.zeros([nao,naux,3,3]) - hk_ao_aux = cupy.zeros([nao,naux,3,3]) + if with_j: + hj_ao_aux = cupy.zeros([nao,naux,3,3]) + if with_k: + hk_ao_aux = cupy.zeros([nao,naux,3,3]) # int3c contributions wja, wka_P__ = int3c2e.get_int3c2e_jk(mol, auxmol, dm0a_tag, omega=omega) wjb, wkb_P__ = int3c2e.get_int3c2e_jk(mol, auxmol, dm0b_tag, omega=omega) - rhoj0_P = solve_j2c(wja + wjb) - rhok0a_P__ = solve_j2c(wka_P__) - rhok0b_P__ = solve_j2c(wkb_P__) + rhoj0_P = rhok0a_P__ = rhok0b_P__ = None + if with_j: + rhoj0_P = solve_j2c(wja + wjb) + if with_k: + rhok0a_P__ = solve_j2c(wka_P__) + rhok0b_P__ = solve_j2c(wkb_P__) wja = wjb = wka_P__ = wkb_P__ = None t1 = log.timer_debug1('intermediate variables with int3c2e', *t1) # int3c_ip2 contributions wja_ip2, wka_ip2_P__ = int3c2e.get_int3c2e_ip2_wjk(intopt, dm0a_tag, omega=omega) wjb_ip2, wkb_ip2_P__ = int3c2e.get_int3c2e_ip2_wjk(intopt, dm0b_tag, omega=omega) - wj_ip2 = wja_ip2 + wjb_ip2 + wj_ip2 = None + if with_j: + wj_ip2 = wja_ip2 + wjb_ip2 t1 = log.timer_debug1('interdeidate variables with int3c2e_ip2', *t1) # int3c_ip1 contributions wj1a_P, wk1a_Pko = int3c2e.get_int3c2e_ip1_wjk(intopt, dm0a_tag, omega=omega) wj1b_P, wk1b_Pko = int3c2e.get_int3c2e_ip1_wjk(intopt, dm0b_tag, omega=omega) - wj1_P = wj1a_P + wj1b_P - rhoj1_P = solve_j2c(wj1_P) - - hj_ao_ao += 4.0*contract('pix,pjy->ijxy', rhoj1_P, wj1_P) # (10|0)(0|0)(0|01) wj1_P = None - if hessobj.auxbasis_response: - wj0_01 = contract('ypq,q->yp', int2c_ip1, rhoj0_P) - wj1_01 = contract('yqp,pix->iqxy', int2c_ip1, rhoj1_P) - hj_ao_aux += contract('pix,py->ipxy', rhoj1_P, wj_ip2) # (10|0)(1|00) - hj_ao_aux -= contract('pix,yp->ipxy', rhoj1_P, wj0_01) # (10|0)(1|0)(0|00) - hj_ao_aux -= contract('q,iqxy->iqxy', rhoj0_P, wj1_01) # (10|0)(0|1)(0|00) - wj1_01 = None - rhoj1_P = None + if with_j: + wj1_P = wj1a_P + wj1b_P + rhoj1_P = solve_j2c(wj1_P) + hj_ao_ao += 4.0*contract('pix,pjy->ijxy', rhoj1_P, wj1_P) # (10|0)(0|0)(0|01) + wj1_P = None + if hessobj.auxbasis_response: + wj0_01 = contract('ypq,q->yp', int2c_ip1, rhoj0_P) + wj1_01 = contract('yqp,pix->iqxy', int2c_ip1, rhoj1_P) + hj_ao_aux += contract('pix,py->ipxy', rhoj1_P, wj_ip2) # (10|0)(1|00) + hj_ao_aux -= contract('pix,yp->ipxy', rhoj1_P, wj0_01) # (10|0)(1|0)(0|00) + hj_ao_aux -= contract('q,iqxy->iqxy', rhoj0_P, wj1_01) # (10|0)(0|1)(0|00) + wj1_01 = None + rhoj1_P = None if with_k: mem_avail = get_avail_mem() @@ -160,7 +172,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, log.debug(f'GPU Memory {mem_avail/GB:.1f} GB available, block size {blksize}') if blksize < ALIGNED: raise RuntimeError('Not enough memory for intermediate variables') - + for i0, i1 in lib.prange(0,nao,blksize): wk1a_Pko_islice = cupy.asarray(wk1a_Pko[:,i0:i1]) wk1b_Pko_islice = cupy.asarray(wk1b_Pko[:,i0:i1]) @@ -216,12 +228,13 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, cupy.get_default_memory_pool().free_all_blocks() hja_ipip, hka_ipip = jk.get_int3c2e_hjk(intopt, rhoj0_P, rhok0a_P__, dm0a_tag, - with_k=with_k, omega=omega, + with_j=with_j, with_k=with_k, omega=omega, auxbasis_response=hessobj.auxbasis_response) hjb_ipip, hkb_ipip = jk.get_int3c2e_hjk(intopt, rhoj0_P, rhok0b_P__, dm0b_tag, - with_k=with_k, omega=omega, + with_j=with_j, with_k=with_k, omega=omega, auxbasis_response=hessobj.auxbasis_response) - hj_ipip = hja_ipip + hjb_ipip + if with_j: + hj_ipip = hja_ipip + hjb_ipip if with_k: hk_ipip = 2.0*(hka_ipip + hkb_ipip) t1 = log.timer_debug1('intermediate variables with int3c2e_ipip', *t1) @@ -235,10 +248,11 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, int2c_ipip1 = auxmol.intor('int2c2e_ipip1', aosym='s1') int2c_ipip1 = cupy.asarray(int2c_ipip1, order='C') int2c_ipip1 = intopt.sort_orbitals(int2c_ipip1, aux_axis=[1,2]) - rhoj2c_P = contract('xpq,q->xp', int2c_ipip1, rhoj0_P) # (00|0)(2|0)(0|00) - # p,xp->px - hj_aux_diag = -(rhoj0_P*rhoj2c_P).T.reshape(-1,3,3) + if with_j: + rhoj2c_P = contract('xpq,q->xp', int2c_ipip1, rhoj0_P) + # p,xp->px + hj_aux_diag = -(rhoj0_P*rhoj2c_P).T.reshape(-1,3,3) if with_k: rho2c_0 = contract('pij,qji->pq', rhok0a_P__, rhok0a_P__) rho2c_0+= contract('pij,qji->pq', rhok0b_P__, rhok0b_P__) @@ -252,7 +266,8 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, int2c_ip1ip2 = auxmol.intor('int2c2e_ip1ip2', aosym='s1') int2c_ip1ip2 = cupy.asarray(int2c_ip1ip2, order='C') int2c_ip1ip2 = intopt.sort_orbitals(int2c_ip1ip2, aux_axis=[1,2]) - hj_aux_aux = -.5 * contract('p,xpq->pqx', rhoj0_P, int2c_ip1ip2*rhoj0_P).reshape(naux, naux,3,3) + if with_j: + hj_aux_aux = -.5 * contract('p,xpq->pqx', rhoj0_P, int2c_ip1ip2*rhoj0_P).reshape(naux, naux,3,3) if with_k: hk_aux_aux = -.5 * contract('xpq,pq->pqx', int2c_ip1ip2, rho2c_0).reshape(naux,naux,3,3) t1 = log.timer_debug1('intermediate variables with int2c_*', *t1) @@ -262,23 +277,23 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, release_gpu_stack() # aux-aux pair if hessobj.auxbasis_response > 1: - wj0_10 = contract('ypq,p->ypq', int2c_ip1, rhoj0_P) int2c_ip1_inv = contract('yqp,pr->yqr', int2c_ip1, int2c_inv) - - rhoj0_10 = contract('p,xpq->xpq', rhoj0_P, int2c_ip1_inv) # (1|0)(0|00) - hj_aux_aux += .5 * contract('xpr,yqr->pqxy', rhoj0_10, wj0_10) # (00|0)(1|0), (0|1)(0|00) - hj_aux_aux += contract('xpq,yq->pqxy', rhoj0_10, wj0_01) # (00|0)(1|0), (1|0)(0|00) - rhoj0_10 = rhoj0_P = None - - rhoj1 = contract('px,pq->xpq', wj_ip2, int2c_inv) # (0|0)(1|00) - hj_aux_aux -= contract('xpq,yq->pqxy', rhoj1, wj0_01) # (00|1), (1|0)(0|00) - hj_aux_aux += .5 * contract('xpq,qy->pqxy', rhoj1, wj_ip2) # (00|1), (1|00) - hj_aux_aux -= contract('xpr,yqr->pqxy', rhoj1, wj0_10) # (00|1), (0|1)(0|00) - wj0_10 = rhoj1 = wj_ip2 = None - - rhoj0_01 = contract('xp,pq->xpq', wj0_01, int2c_inv) # (0|1)(0|00) - hj_aux_aux += .5 * contract('xpq,yq->pqxy', rhoj0_01, wj0_01) # (00|0)(0|1), (1|0)(0|00) - wj0_01 = rhoj0_01 = None + if with_j: + wj0_10 = contract('ypq,p->ypq', int2c_ip1, rhoj0_P) + rhoj0_10 = contract('p,xpq->xpq', rhoj0_P, int2c_ip1_inv) # (1|0)(0|00) + hj_aux_aux += .5 * contract('xpr,yqr->pqxy', rhoj0_10, wj0_10) # (00|0)(1|0), (0|1)(0|00) + hj_aux_aux += contract('xpq,yq->pqxy', rhoj0_10, wj0_01) # (00|0)(1|0), (1|0)(0|00) + rhoj0_10 = rhoj0_P = None + + rhoj1 = contract('px,pq->xpq', wj_ip2, int2c_inv) # (0|0)(1|00) + hj_aux_aux -= contract('xpq,yq->pqxy', rhoj1, wj0_01) # (00|1), (1|0)(0|00) + hj_aux_aux += .5 * contract('xpq,qy->pqxy', rhoj1, wj_ip2) # (00|1), (1|00) + hj_aux_aux -= contract('xpr,yqr->pqxy', rhoj1, wj0_10) # (00|1), (0|1)(0|00) + wj0_10 = rhoj1 = wj_ip2 = None + + rhoj0_01 = contract('xp,pq->xpq', wj0_01, int2c_inv) # (0|1)(0|00) + hj_aux_aux += .5 * contract('xpq,yq->pqxy', rhoj0_01, wj0_01) # (00|0)(0|1), (1|0)(0|00) + wj0_01 = rhoj0_01 = None if with_k: rho2c_10 = contract('rijx,qij->rqx', wka_ip2_P__, rhok0a_P__) @@ -310,13 +325,13 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, hk_aux_aux -= contract('pqx,yqp->pqxy', rho2c_10, int2c_ip1_inv) # (00|1)(0|1)(0|00) rho2c_10= int2c_ip1_inv = None t1 = log.timer_debug1('contract int2c_*', *t1) - - hj_ao_ao = intopt.unsort_orbitals(hj_ao_ao, axis=[0,1]) - if hessobj.auxbasis_response: - hj_ao_aux = intopt.unsort_orbitals(hj_ao_aux, axis=[0], aux_axis=[1]) - if hessobj.auxbasis_response > 1: - hj_aux_diag = intopt.unsort_orbitals(hj_aux_diag, aux_axis=[0]) - hj_aux_aux = intopt.unsort_orbitals(hj_aux_aux, aux_axis=[0,1]) + if with_j: + hj_ao_ao = intopt.unsort_orbitals(hj_ao_ao, axis=[0,1]) + if hessobj.auxbasis_response: + hj_ao_aux = intopt.unsort_orbitals(hj_ao_aux, axis=[0], aux_axis=[1]) + if hessobj.auxbasis_response > 1: + hj_aux_diag = intopt.unsort_orbitals(hj_aux_diag, aux_axis=[0]) + hj_aux_aux = intopt.unsort_orbitals(hj_aux_aux, aux_axis=[0,1]) if with_k: hk_ao_ao = intopt.unsort_orbitals(hk_ao_ao, axis=[0,1]) if hessobj.auxbasis_response: @@ -346,18 +361,20 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, # ----------------------------------------- # collecting all # ----------------------------------------- - hk_ao_ao *= 2.0 e1 = cupy.zeros([len(atmlst),len(atmlst),3,3]) - ej = hj_ipip - ek = None + ej = ek = None + if with_j: + ej = hj_ipip if with_k: + hk_ao_ao *= 2.0 ek = hk_ipip for i0, ia in enumerate(atmlst): shl0, shl1, p0, p1 = aoslices[ia] e1[i0,i0] -= cupy.sum(h1aa[p0:p1], axis=0) for j0, ja in enumerate(atmlst[:i0+1]): q0, q1 = aoslices[ja][2:] - ej[i0,j0] += cupy.sum(hj_ao_ao[p0:p1,q0:q1], axis=[0,1]) + if with_j: + ej[i0,j0] += cupy.sum(hj_ao_ao[p0:p1,q0:q1], axis=[0,1]) e1[i0,j0] -= cupy.sum(h1ab[p0:p1,q0:q1], axis=[0,1]) if with_k: ek[i0,j0] += cupy.sum(hk_ao_ao[p0:p1,q0:q1], axis=[0,1]) @@ -368,13 +385,14 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, # if hessobj.auxbasis_response: for j0, (q0, q1) in enumerate(auxslices[:,2:]): - _ej = cupy.sum(hj_ao_aux[p0:p1,q0:q1], axis=[0,1]) - if hessobj.auxbasis_response > 1: - ej[i0,j0] += _ej * 2 - ej[j0,i0] += _ej.T * 2 - else: - ej[i0,j0] += _ej - ej[j0,i0] += _ej.T + if with_j: + _ej = cupy.sum(hj_ao_aux[p0:p1,q0:q1], axis=[0,1]) + if hessobj.auxbasis_response > 1: + ej[i0,j0] += _ej * 2 + ej[j0,i0] += _ej.T * 2 + else: + ej[i0,j0] += _ej + ej[j0,i0] += _ej.T if with_k: _ek = cupy.sum(hk_ao_aux[p0:p1,q0:q1], axis=[0,1]) if hessobj.auxbasis_response > 1: @@ -388,13 +406,15 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, # if hessobj.auxbasis_response > 1: shl0, shl1, p0, p1 = auxslices[ia] - ej[i0,i0] += cupy.sum(hj_aux_diag[p0:p1], axis=0) + if with_j: + ej[i0,i0] += cupy.sum(hj_aux_diag[p0:p1], axis=0) if with_k: ek[i0,i0] += cupy.sum(hk_aux_diag[p0:p1], axis=0) for j0, (q0, q1) in enumerate(auxslices[:,2:]): - _ej = cupy.sum(hj_aux_aux[p0:p1,q0:q1], axis=[0,1]) - ej[i0,j0] += _ej - ej[j0,i0] += _ej.T + if with_j: + _ej = cupy.sum(hj_aux_aux[p0:p1,q0:q1], axis=[0,1]) + ej[i0,j0] += _ej + ej[j0,i0] += _ej.T if with_k: _ek = cupy.sum(hk_aux_aux[p0:p1,q0:q1], axis=[0,1]) ek[i0,j0] += _ek @@ -402,7 +422,8 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, for i0, ia in enumerate(atmlst): for j0 in range(i0): e1[j0,i0] = e1[i0,j0].T - ej[j0,i0] = ej[i0,j0].T + if with_j: + ej[j0,i0] = ej[i0,j0].T if with_k: ek[j0,i0] = ek[i0,j0].T t1 = log.timer_debug1('hcore contribution', *t1) @@ -434,7 +455,7 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): return (h1moa, h1mob) def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, - verbose=None, with_k=True, omega=None): + verbose=None, with_j=True, with_k=True, omega=None): ''' A generator to produce the derivatives of Hcore, J, K matrices in MO bases ''' @@ -443,8 +464,7 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, mol = hessobj.mol if atmlst is None: atmlst = range(mol.natm) - # FIXME - with_k = True + mo_coeff = cupy.asarray(mo_coeff, order='C') mo_occ = cupy.asarray(mo_occ, order='C') @@ -469,12 +489,12 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, int2c = cupy.asarray(int2c, order='C') # ======================= sorted AO begin ====================================== intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e') - intopt.build(mf.direct_scf_tol, - diag_block_with_triu=True, - aosym=False, - group_size_aux=BLKSIZE, + intopt.build(mf.direct_scf_tol, + diag_block_with_triu=True, + aosym=False, + group_size_aux=BLKSIZE, group_size=BLKSIZE) - + mocca = intopt.sort_orbitals(mocca, axis=[0]) moccb = intopt.sort_orbitals(moccb, axis=[0]) mo_coeff = intopt.sort_orbitals(mo_coeff, axis=[1]) @@ -488,10 +508,12 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, fn = int3c2e.get_int3c2e_wjk dm0_tag = tag_array(dm0, occ_coeff=mocca) - wj, wka_Pl_ = fn(mol, auxmol, dm0_tag, omega=omega) + wj, wka_Pl_ = fn(mol, auxmol, dm0_tag, with_j=with_j, with_k=with_k, omega=omega) dm0_tag = tag_array(dm0, occ_coeff=moccb) - wj, wkb_Pl_ = fn(mol, auxmol, dm0_tag, omega=omega) - rhoj0 = solve_j2c(wj) + wj, wkb_Pl_ = fn(mol, auxmol, dm0_tag, with_j=with_j, with_k=with_k, omega=omega) + rhoj0 = None + if with_j: + rhoj0 = solve_j2c(wj) wj = None if isinstance(wka_Pl_, cupy.ndarray): @@ -512,27 +534,7 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, rhok0b_Pl_[:,p0:p1] = solve_j2c(wk_tmp).get() wk_tmp = None wka_Pl_ = wkb_Pl_ = None - - # ----------------------------- - # int3c_ip1 contributions - # ------------------------------ - cupy.get_default_memory_pool().free_all_blocks() - fn = int3c2e.get_int3c2e_ip1_vjk - dm0_tag = tag_array(dm0, occ_coeff=mocca) - vj1_buf, vk1a_buf, vj1a_ao, vk1a_ao = fn(intopt, rhoj0, rhok0a_Pl_, dm0_tag, aoslices, omega=omega) - dm0_tag = tag_array(dm0, occ_coeff=moccb) - vj1_buf, vk1b_buf, vj1b_ao, vk1b_ao = fn(intopt, rhoj0, rhok0b_Pl_, dm0_tag, aoslices, omega=omega) - - vj1_buf = intopt.unsort_orbitals(vj1_buf, axis=[1,2]) - vk1a_buf = intopt.unsort_orbitals(vk1a_buf, axis=[1,2]) - vk1b_buf = intopt.unsort_orbitals(vk1b_buf, axis=[1,2]) - - vj1a_int3c = -contract('nxiq,ip->nxpq', vj1a_ao, mo_coeff[0]) - vj1b_int3c = -contract('nxiq,ip->nxpq', vj1b_ao, mo_coeff[1]) - vk1a_int3c = -contract('nxiq,ip->nxpq', vk1a_ao, mo_coeff[0]) - vk1b_int3c = -contract('nxiq,ip->nxpq', vk1b_ao, mo_coeff[1]) - vj1a_ao = vj1b_ao = vk1a_ao = vk1b_ao = None - t0 = log.timer_debug1('Fock matrix due to int3c2e_ip1', *t0) + vj1a_int3c = vj1b_int3c = vk1a_int3c = vk1b_int3c = None # -------------------------- # int3c_ip2 contribution @@ -541,9 +543,11 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, if hessobj.auxbasis_response: fn = int3c2e.get_int3c2e_ip2_vjk dm0_tag = tag_array(dm0, occ_coeff=mocca) - vj1a_int3c_ip2, vk1a_int3c_ip2 = fn(intopt, rhoj0, rhok0a_Pl_, dm0_tag, auxslices, omega=omega) + vj1a_int3c, vk1a_int3c = fn(intopt, rhoj0, rhok0a_Pl_, dm0_tag, auxslices, + with_j=with_j, with_k=with_k, omega=omega) dm0_tag = tag_array(dm0, occ_coeff=moccb) - vj1b_int3c_ip2, vk1b_int3c_ip2 = fn(intopt, rhoj0, rhok0b_Pl_, dm0_tag, auxslices, omega=omega) + vj1b_int3c, vk1b_int3c = fn(intopt, rhoj0, rhok0b_Pl_, dm0_tag, auxslices, + with_j=with_j, with_k=with_k, omega=omega) # Responses due to int2c2e_ip1 if omega and omega > 1e-10: @@ -553,34 +557,35 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, int2c_ip1 = auxmol.intor('int2c2e_ip1', aosym='s1') int2c_ip1 = cupy.asarray(int2c_ip1, order='C') int2c_ip1 = intopt.sort_orbitals(int2c_ip1, aux_axis=[1,2]) - - # generate rhok0_P__ - if isinstance(rhok0a_Pl_, cupy.ndarray): - rhok0a_P__ = contract('pio,ir->pro', rhok0a_Pl_, mocca) - else: - naux = auxmol.nao - nocc = mocca.shape[1] - rhok0a_P__ = cupy.empty([naux,nocc,nocc]) - for p0, p1 in lib.prange(0,naux,64): - rhok0_Pl_tmp = cupy.asarray(rhok0a_Pl_[p0:p1]) - rhok0a_P__[p0:p1] = contract('pio,ir->pro', rhok0_Pl_tmp, mocca) - rhok0_Pl_tmp = None - - # generate rhok0_P__ - if isinstance(rhok0b_Pl_, cupy.ndarray): - rhok0b_P__ = contract('pio,ir->pro', rhok0b_Pl_, moccb) - else: - naux = auxmol.nao - nocc = moccb.shape[1] - rhok0b_P__ = cupy.empty([naux,nocc,nocc]) - for p0, p1 in lib.prange(0,naux,64): - rhok0_Pl_tmp = cupy.asarray(rhok0b_Pl_[p0:p1]) - rhok0b_P__[p0:p1] = contract('pio,ir->pro', rhok0_Pl_tmp, moccb) - rhok0_Pl_tmp = None - - wj0_10 = contract('xpq,q->xp', int2c_ip1, rhoj0) - wk0a_10_P__ = contract('xqp,pro->xqro', int2c_ip1, rhok0a_P__) - wk0b_10_P__ = contract('xqp,pro->xqro', int2c_ip1, rhok0b_P__) + if with_k: + # generate rhok0_P__ + if isinstance(rhok0a_Pl_, cupy.ndarray): + rhok0a_P__ = contract('pio,ir->pro', rhok0a_Pl_, mocca) + else: + naux = auxmol.nao + nocc = mocca.shape[1] + rhok0a_P__ = cupy.empty([naux,nocc,nocc]) + for p0, p1 in lib.prange(0,naux,64): + rhok0_Pl_tmp = cupy.asarray(rhok0a_Pl_[p0:p1]) + rhok0a_P__[p0:p1] = contract('pio,ir->pro', rhok0_Pl_tmp, mocca) + rhok0_Pl_tmp = None + + # generate rhok0_P__ + if isinstance(rhok0b_Pl_, cupy.ndarray): + rhok0b_P__ = contract('pio,ir->pro', rhok0b_Pl_, moccb) + else: + naux = auxmol.nao + nocc = moccb.shape[1] + rhok0b_P__ = cupy.empty([naux,nocc,nocc]) + for p0, p1 in lib.prange(0,naux,64): + rhok0_Pl_tmp = cupy.asarray(rhok0b_Pl_[p0:p1]) + rhok0b_P__[p0:p1] = contract('pio,ir->pro', rhok0_Pl_tmp, moccb) + rhok0_Pl_tmp = None + if with_j: + wj0_10 = contract('xpq,q->xp', int2c_ip1, rhoj0) + if with_k: + wk0a_10_P__ = contract('xqp,pro->xqro', int2c_ip1, rhok0a_P__) + wk0b_10_P__ = contract('xqp,pro->xqro', int2c_ip1, rhok0b_P__) aux2atom = int3c2e.get_aux2atom(intopt, auxslices) mem_avail = get_avail_mem() @@ -589,42 +594,74 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, log.debug(f'GPU Memory {mem_avail/GB:.1f} GB available, block size {blksize}') if blksize < ALIGNED: raise RuntimeError('Not enough memory to compute int3c2e_ip2') - - for p0, p1 in lib.prange(0,nao,64): + + for p0, p1 in lib.prange(0,nao,blksize): rhoka_tmp = cupy.asarray(rhok0a_Pl_[:,p0:p1]) rhokb_tmp = cupy.asarray(rhok0b_Pl_[:,p0:p1]) - vj1a_tmp = contract('pio,xp->xpio', rhoka_tmp, wj0_10) - vj1b_tmp = contract('pio,xp->xpio', rhokb_tmp, wj0_10) - wk0a_10_Pl_ = contract('xqp,pio->xqio', int2c_ip1, rhoka_tmp) wk0b_10_Pl_ = contract('xqp,pio->xqio', int2c_ip1, rhokb_tmp) - vj1a_tmp += contract('xpio,p->xpio', wk0a_10_Pl_, rhoj0) - vj1b_tmp += contract('xpio,p->xpio', wk0b_10_Pl_, rhoj0) - vj1a_int3c_ip2[:,:,p0:p1] += contract('xpio,pa->axio', vj1a_tmp, aux2atom) - vj1b_int3c_ip2[:,:,p0:p1] += contract('xpio,pa->axio', vj1b_tmp, aux2atom) - vj1a_tmp = vj1b_tmp = None + if with_j: + vj1a_tmp = contract('pio,xp->xpio', rhoka_tmp, wj0_10) + vj1b_tmp = contract('pio,xp->xpio', rhokb_tmp, wj0_10) + + vj1a_tmp += contract('xpio,p->xpio', wk0a_10_Pl_, rhoj0) + vj1b_tmp += contract('xpio,p->xpio', wk0b_10_Pl_, rhoj0) + vj1a_int3c[:,:,p0:p1] += contract('xpio,pa->axio', vj1a_tmp, aux2atom) + vj1b_int3c[:,:,p0:p1] += contract('xpio,pa->axio', vj1b_tmp, aux2atom) + vj1a_tmp = vj1b_tmp = None if with_k: vk1a_tmp = contract('xpio,pro->xpir', wk0a_10_Pl_, rhok0a_P__) vk1a_tmp += contract('xpro,pir->xpio', wk0a_10_P__, rhoka_tmp) vk1b_tmp = contract('xpio,pro->xpir', wk0b_10_Pl_, rhok0b_P__) vk1b_tmp += contract('xpro,pir->xpio', wk0b_10_P__, rhokb_tmp) - vk1a_int3c_ip2[:,:,p0:p1] += contract('xpio,pa->axio', vk1a_tmp, aux2atom) - vk1b_int3c_ip2[:,:,p0:p1] += contract('xpio,pa->axio', vk1b_tmp, aux2atom) + vk1a_int3c[:,:,p0:p1] += contract('xpio,pa->axio', vk1a_tmp, aux2atom) + vk1b_int3c[:,:,p0:p1] += contract('xpio,pa->axio', vk1b_tmp, aux2atom) vk1a_tmp = vk1b_tmp = None wk0a_10_Pl_ = wk0b_10_Pl_ = rhoka_tmp = rhokb_tmp = None wj0_10 = wk0a_10_P__ = wk0b_10_P__ = rhok0a_P__ =rhok0b_P__ = int2c_ip1 = None - rhoj0 = rhok0a_Pl_ = rhok0b_Pl_ = None - aux2atom = None - vj1a_int3c += contract('nxiq,ip->nxpq', vj1a_int3c_ip2, mo_coeff[0]) - vj1b_int3c += contract('nxiq,ip->nxpq', vj1b_int3c_ip2, mo_coeff[1]) - if with_k: - vk1a_int3c += contract('nxiq,ip->nxpq', vk1a_int3c_ip2, mo_coeff[0]) - vk1b_int3c += contract('nxiq,ip->nxpq', vk1b_int3c_ip2, mo_coeff[1]) - vk1a_int3c_ip2 = vk1b_int3c_ip2 = None + aux2atom = None t0 = log.timer_debug1('Fock matrix due to int3c2e_ip2', *t0) + # ----------------------------- + # int3c_ip1 contributions + # ------------------------------ + cupy.get_default_memory_pool().free_all_blocks() + fn = int3c2e.get_int3c2e_ip1_vjk + dm0_tag = tag_array(dm0, occ_coeff=mocca) + vj1_buf, vk1a_buf, vj1a_ao, vk1a_ao = fn(intopt, rhoj0, rhok0a_Pl_, dm0_tag, aoslices, + with_j=with_j, with_k=with_k, omega=omega) + dm0_tag = tag_array(dm0, occ_coeff=moccb) + vj1_buf, vk1b_buf, vj1b_ao, vk1b_ao = fn(intopt, rhoj0, rhok0b_Pl_, dm0_tag, aoslices, + with_j=with_j, with_k=with_k, omega=omega) + rhoj0 = rhok0a_Pl_ = rhok0b_Pl_ = None + + if with_j: + vj1_buf = intopt.unsort_orbitals(vj1_buf, axis=[1,2]) + if not hessobj.auxbasis_response: + vj1a_int3c = -vj1a_ao + vj1b_int3c = -vj1b_ao + else: + vj1a_int3c -= vj1a_ao + vj1b_int3c -= vj1b_ao + vj1a_ao = vj1b_ao = None + vj1a_int3c = contract('nxiq,ip->nxpq', vj1a_int3c, mo_coeff[0]) + vj1b_int3c = contract('nxiq,ip->nxpq', vj1b_int3c, mo_coeff[1]) + if with_k: + vk1a_buf = intopt.unsort_orbitals(vk1a_buf, axis=[1,2]) + vk1b_buf = intopt.unsort_orbitals(vk1b_buf, axis=[1,2]) + if not hessobj.auxbasis_response: + vk1a_int3c = -vk1a_ao + vk1b_int3c = -vk1b_ao + else: + vk1a_int3c -= vk1a_ao + vk1b_int3c -= vk1b_ao + vk1a_ao = vk1b_ao = None + vk1a_int3c = contract('nxiq,ip->nxpq', vk1a_int3c, mo_coeff[0]) + vk1b_int3c = contract('nxiq,ip->nxpq', vk1b_int3c, mo_coeff[1]) + t0 = log.timer_debug1('Fock matrix due to int3c2e_ip1', *t0) + mocca = intopt.unsort_orbitals(mocca, axis=[0]) moccb = intopt.unsort_orbitals(moccb, axis=[0]) mo_coeff = intopt.unsort_orbitals(mo_coeff, axis=[1]) @@ -639,21 +676,19 @@ def _ao2mo(mat, mocc, mo): for i0, ia in enumerate(atmlst): shl0, shl1, p0, p1 = aoslices[ia] - vj1_ao = cupy.zeros([3,nao,nao]) - vk1a_ao = cupy.zeros([3,nao,nao]) - vk1b_ao = cupy.zeros([3,nao,nao]) - - vj1_ao[:,p0:p1,:] -= vj1_buf[:,p0:p1,:] - vj1_ao[:,:,p0:p1] -= vj1_buf[:,p0:p1,:].transpose(0,2,1) + if with_j: + vj1_ao = cupy.zeros([3,nao,nao]) + vj1_ao[:,p0:p1,:] -= vj1_buf[:,p0:p1,:] + vj1_ao[:,:,p0:p1] -= vj1_buf[:,p0:p1,:].transpose(0,2,1) + vj1a_int3c[ia] += _ao2mo(vj1_ao, mocca, mo_coeff[0]) + vj1b_int3c[ia] += _ao2mo(vj1_ao, moccb, mo_coeff[1]) if with_k: + vk1a_ao = cupy.zeros([3,nao,nao]) + vk1b_ao = cupy.zeros([3,nao,nao]) vk1a_ao[:,p0:p1,:] -= vk1a_buf[:,p0:p1,:] vk1a_ao[:,:,p0:p1] -= vk1a_buf[:,p0:p1,:].transpose(0,2,1) vk1b_ao[:,p0:p1,:] -= vk1b_buf[:,p0:p1,:] vk1b_ao[:,:,p0:p1] -= vk1b_buf[:,p0:p1,:].transpose(0,2,1) - - vj1a_int3c[ia] += _ao2mo(vj1_ao, mocca, mo_coeff[0]) - vj1b_int3c[ia] += _ao2mo(vj1_ao, moccb, mo_coeff[1]) - if with_k: vk1a_int3c[ia] += _ao2mo(vk1a_ao, mocca, mo_coeff[0]) vk1b_int3c[ia] += _ao2mo(vk1b_ao, moccb, mo_coeff[1]) return (vj1a_int3c, vj1b_int3c), (vk1a_int3c, vk1b_int3c) diff --git a/gpu4pyscf/df/hessian/uks.py b/gpu4pyscf/df/hessian/uks.py index 31273a7d..059f571c 100644 --- a/gpu4pyscf/df/hessian/uks.py +++ b/gpu4pyscf/df/hessian/uks.py @@ -28,6 +28,7 @@ from gpu4pyscf.hessian import uhf as uhf_hess from gpu4pyscf.hessian import uks as uks_hess from gpu4pyscf.df.hessian import uhf as df_uhf_hess +from gpu4pyscf.df.hessian.uhf import _partial_hess_ejk, _get_jk_ip from gpu4pyscf.lib import logger from gpu4pyscf.lib.cupy_helper import contract @@ -52,17 +53,17 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin) with_k = mf._numint.libxc.is_hybrid_xc(mf.xc) - de2, ej, ek = df_uhf_hess._partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ, - atmlst, max_memory, verbose, - with_k=with_k) + de2, ej, ek = _partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ, + atmlst, max_memory, verbose, + with_j=True, with_k=with_k) de2 += ej # (A,B,dR_A,dR_B) if with_k: de2 -= hyb * ek if abs(omega) > 1e-10 and abs(alpha-hyb) > 1e-10: - ek_lr = df_uhf_hess._partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ, - atmlst, max_memory, verbose, - True, omega=omega)[2] + ek_lr = _partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ, + atmlst, max_memory, verbose, + with_j=False, with_k=True, omega=omega)[2] de2 -= (alpha - hyb) * ek_lr max_memory = None @@ -98,11 +99,11 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin) mem_now = lib.current_memory()[0] max_memory = max(2000, mf.max_memory*.9-mem_now) - + with_k = ni.libxc.is_hybrid_xc(mf.xc) - vj1, vk1 = df_uhf_hess._get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile, - atmlst, verbose, with_k) + vj1, vk1 = _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile, + atmlst, verbose, with_j=True, with_k=True) vj1a, vj1b = vj1 h1moa = vj1a h1mob = vj1b @@ -112,10 +113,10 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): h1moa -= hyb * vk1a h1mob -= hyb * vk1b vj1 = vk1 = vj1a = vj1b = vk1a = vk1b = None - + if abs(omega) > 1e-10 and abs(alpha-hyb) > 1e-10: - _, vk1_lr = df_uhf_hess._get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile, - atmlst, verbose, True, omega=omega) + _, vk1_lr = _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile, + atmlst, verbose, with_j=False, with_k=True, omega=omega) vk1a, vk1b = vk1_lr h1moa -= (alpha - hyb) * vk1a h1mob -= (alpha - hyb) * vk1b diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py index 98350c59..ce972503 100644 --- a/gpu4pyscf/df/int3c2e.py +++ b/gpu4pyscf/df/int3c2e.py @@ -20,7 +20,7 @@ from pyscf import gto, df, lib from pyscf.scf import _vhf from gpu4pyscf.scf.int4c2e import BasisProdCache, libgvhf, libgint -from gpu4pyscf.lib.cupy_helper import (block_c2s_diag, cart2sph, contract, get_avail_mem, +from gpu4pyscf.lib.cupy_helper import (block_c2s_diag, cart2sph, contract, get_avail_mem, reduce_to_device) from gpu4pyscf.lib import logger from gpu4pyscf.gto.mole import basis_seg_contraction @@ -115,7 +115,7 @@ def build(self, cutoff=1e-14, group_size=None, mol = basis_seg_contraction(_mol,allow_replica=True) auxmol = basis_seg_contraction(_auxmol, allow_replica=True) - + log = logger.new_logger(_mol, _mol.verbose) cput0 = log.init_timer() _sorted_mol, sorted_idx, uniq_l_ctr, l_ctr_counts = sort_mol(mol, log=log) @@ -181,7 +181,7 @@ def build(self, cutoff=1e-14, group_size=None, aux_loc = _auxmol.ao_loc_nr(cart=_auxmol.cart) ao_idx = np.array_split(np.arange(_auxmol.nao), aux_loc[1:-1]) - self._aux_ao_idx = np.hstack([ao_idx[i] for i in sorted_aux_idx]) + self._aux_ao_idx = np.hstack([ao_idx[i] for i in sorted_aux_idx]) cput1 = log.timer_debug1('Aux AO indices', *cput1) ao_loc = _sorted_mol.ao_loc_nr(cart=_mol.cart) @@ -260,7 +260,7 @@ def build(self, cutoff=1e-14, group_size=None, self._sorted_mol = _sorted_mol self._sorted_auxmol = _sorted_auxmol - + @property def bpcache(self): device_id = cupy.cuda.Device().id @@ -310,15 +310,15 @@ def unsort_orbitals(self, sorted_mat, axis=[], aux_axis=[]): mat = cupy.empty_like(sorted_mat) mat[tuple(fancy_index)] = sorted_mat return mat - + @property def cart2sph(self): return block_c2s_diag(self.angular, self.l_ctr_counts) - + @property def aux_cart2sph(self): return block_c2s_diag(self.aux_angular, self.aux_l_ctr_counts) - + @property def coeff(self): nao = self.mol.nao @@ -339,36 +339,45 @@ def aux_coeff(self): self._aux_coeff = self.unsort_orbitals(self.aux_cart2sph, aux_axis=[1]) return self._aux_coeff -def get_int3c2e_wjk(mol, auxmol, dm0_tag, thred=1e-12, omega=None, with_k=True): +def get_int3c2e_wjk(mol, auxmol, dm0_tag, thred=1e-12, omega=None, with_j=True, with_k=True): log = logger.new_logger(mol, mol.verbose) intopt = VHFOpt(mol, auxmol, 'int2e') - intopt.build(thred, diag_block_with_triu=True, aosym=True, group_size=BLKSIZE, group_size_aux=BLKSIZE) + intopt.build(thred, diag_block_with_triu=True, aosym=True, + group_size=BLKSIZE, group_size_aux=BLKSIZE) orbo = dm0_tag.occ_coeff nao = mol.nao naux = auxmol.nao nocc = orbo.shape[1] - wj = cupy.empty([naux]) - avail_mem = get_avail_mem() - use_gpu_memory = True - if naux*nao*nocc*8 < 0.4*avail_mem: - try: - wk = cupy.empty([naux,nao,nocc]) - except Exception: + + wj = None + if with_j: + wj = cupy.empty([naux]) + + wk = None + if with_k: + avail_mem = get_avail_mem() + use_gpu_memory = True + if naux*nao*nocc*8 < 0.4*avail_mem: + try: + wk = cupy.empty([naux,nao,nocc]) + except Exception: + use_gpu_memory = False + else: use_gpu_memory = False - else: - use_gpu_memory = False - - if not use_gpu_memory: - log.debug('Saving int3c2e_wjk on CPU memory') - mem = cupy.cuda.alloc_pinned_memory(naux*nao*nocc*8) - wk = np.ndarray([naux,nao,nocc], dtype=np.float64, order='C', buffer=mem) + + if not use_gpu_memory: + log.debug('Saving int3c2e_wjk on CPU memory') + mem = cupy.cuda.alloc_pinned_memory(naux*nao*nocc*8) + wk = np.ndarray([naux,nao,nocc], dtype=np.float64, order='C', buffer=mem) # TODO: async data transfer for cp_kl_id, _ in enumerate(intopt.aux_log_qs): k0 = intopt.aux_ao_loc[cp_kl_id] k1 = intopt.aux_ao_loc[cp_kl_id+1] - rhoj_tmp = cupy.zeros([k1-k0], order='C') - rhok_tmp = cupy.zeros([k1-k0, nao, nocc], order='C') + if with_j: + rhoj_tmp = cupy.zeros([k1-k0], order='C') + if with_k: + rhok_tmp = cupy.zeros([k1-k0, nao, nocc], order='C') for cp_ij_id, _ in enumerate(intopt.log_qs): cpi = intopt.cp_idx[cp_ij_id] @@ -381,15 +390,17 @@ def get_int3c2e_wjk(mol, auxmol, dm0_tag, thred=1e-12, omega=None, with_k=True): int3c_blk = cart2sph(int3c_blk, axis=2, ang=li) i0, i1 = intopt.ao_loc[cpi], intopt.ao_loc[cpi+1] j0, j1 = intopt.ao_loc[cpj], intopt.ao_loc[cpj+1] - - tmp = contract('Lji,ij->L', int3c_blk, dm0_tag[i0:i1,j0:j1]) - rhoj_tmp += tmp - rhok_tmp[:,j0:j1] += contract('Lji,io->Ljo', int3c_blk, orbo[i0:i1]) - - if cpi != cpj and intopt.aosym: + if with_j: + tmp = contract('Lji,ij->L', int3c_blk, dm0_tag[i0:i1,j0:j1]) rhoj_tmp += tmp - rhok_tmp[:,i0:i1] += contract('Lji,jo->Lio', int3c_blk, orbo[j0:j1]) - wj[k0:k1] = rhoj_tmp + if cpi != cpj: + rhoj_tmp += tmp + if with_k: + rhok_tmp[:,j0:j1] += contract('Lji,io->Ljo', int3c_blk, orbo[i0:i1]) + if cpi != cpj: + rhok_tmp[:,i0:i1] += contract('Lji,jo->Lio', int3c_blk, orbo[j0:j1]) + if with_j: + wj[k0:k1] = rhoj_tmp if with_k: if isinstance(wk, cupy.ndarray): wk[k0:k1] = rhok_tmp @@ -505,7 +516,7 @@ def loop_int3c2e_general(intopt, task_list=None, ip_type='', omega=None, stream= for aux_id, cp_ij_id in task_list: cp_kl_id = aux_id + len(intopt.log_qs) lk = intopt.aux_angular[aux_id] - + cpi = intopt.cp_idx[cp_ij_id] cpj = intopt.cp_jdx[cp_ij_id] li = intopt.angular[cpi] @@ -670,26 +681,26 @@ def get_j_int3c2e_pass1(intopt, dm0, sort_j=True, stream=None): get rhoj pass1 for int3c2e ''' if stream is None: stream = cupy.cuda.get_current_stream() - + n_dm = 1 naux = intopt._sorted_auxmol.nao - + coeff = intopt.coeff if dm0.ndim == 3: dm0 = dm0[0] + dm0[1] dm_cart = coeff @ dm0 @ coeff.T - + num_cp_ij = [len(log_qs) for log_qs in intopt.log_qs] num_cp_kl = [len(log_qs) for log_qs in intopt.aux_log_qs] bins_locs_ij = np.append(0, np.cumsum(num_cp_ij)).astype(np.int32) bins_locs_kl = np.append(0, np.cumsum(num_cp_kl)).astype(np.int32) - + ncp_ij = len(intopt.log_qs) ncp_kl = len(intopt.aux_log_qs) norb = dm_cart.shape[0] - + rhoj = cupy.zeros([naux]) err = libgvhf.GINTbuild_j_int3c2e_pass1( @@ -706,7 +717,7 @@ def get_j_int3c2e_pass1(intopt, dm0, sort_j=True, stream=None): ctypes.c_int(ncp_kl)) if err != 0: raise RuntimeError('CUDA error in get_j_pass1') - + if sort_j: aux_coeff = intopt.aux_coeff rhoj = cupy.dot(rhoj, aux_coeff) @@ -731,7 +742,7 @@ def get_j_int3c2e_pass2(intopt, rhoj, stream=None): ncp_ij = len(intopt.log_qs) ncp_kl = len(intopt.aux_log_qs) - + rhoj = intopt.sort_orbitals(rhoj, aux_axis=[0]) if not intopt.auxmol.cart: rhoj = intopt.aux_cart2sph @ rhoj @@ -751,7 +762,7 @@ def get_j_int3c2e_pass2(intopt, rhoj, stream=None): if err != 0: raise RuntimeError('CUDA error in get_j_pass2') - + if not intopt.mol.cart: cart2sph = intopt.cart2sph vj = cart2sph.T @ vj @ cart2sph @@ -804,20 +815,24 @@ def get_int3c2e_jk(mol, auxmol, dm0_tag, with_k=True, omega=None): rhok[k0:k1] = rhok_tmp return rhoj, rhok -def _int3c2e_ip1_vjk_task(intopt, task_list, rhoj, rhok, dm0, orbo, device_id=0, with_k=True, omega=None): +def _int3c2e_ip1_vjk_task(intopt, task_list, rhoj, rhok, dm0, orbo, device_id=0, + with_j=True, with_k=True, omega=None): natom = intopt.mol.natm nao = intopt.mol.nao aoslices = intopt.mol.aoslice_by_atom() + vj1_buf = vk1_buf = vj1 = vk1 = None with cupy.cuda.Device(device_id), _streams[device_id]: ao2atom = get_ao2atom(intopt, aoslices) - rhoj = cupy.asarray(rhoj) dm0 = cupy.asarray(dm0) orbo = cupy.asarray(orbo) nocc = orbo.shape[1] - vj1_buf = cupy.zeros([3,nao,nao]) - vk1_buf = cupy.zeros([3,nao,nao]) - vj1 = cupy.zeros([natom,3,nao,nocc]) - vk1 = cupy.zeros([natom,3,nao,nocc]) + if with_j: + rhoj = cupy.asarray(rhoj) + vj1_buf = cupy.zeros([3,nao,nao]) + vj1 = cupy.zeros([natom,3,nao,nocc]) + if with_k: + vk1_buf = cupy.zeros([3,nao,nao]) + vk1 = cupy.zeros([natom,3,nao,nocc]) aux_ao_loc = intopt.aux_ao_loc ncp_ij = len(intopt.log_qs) for cp_k in task_list: @@ -827,15 +842,18 @@ def _int3c2e_ip1_vjk_task(intopt, task_list, rhoj, rhok, dm0, orbo, device_id=0, if with_k: rhok0 = contract('pio,ir->pro', rhok_tmp, orbo) rhok0 = contract('pro,Jo->prJ', rhok0, orbo) - rhoj0 = cupy.zeros([3,k1-k0,nao]) - int3c_ip1_occ = cupy.zeros([3,k1-k0,nao,nocc]) + int3c_ip1_occ = cupy.zeros([3,k1-k0,nao,nocc]) + if with_j: + rhoj0 = cupy.zeros([3,k1-k0,nao]) + for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, task_list=task_list, ip_type='ip1', omega=omega): - vj1_buf[:,i0:i1,j0:j1] += contract('xpji,p->xij', int3c_blk, rhoj[k0:k1]) - rhoj0[:,:,i0:i1] += contract('xpji,ij->xpi', int3c_blk, dm0[i0:i1,j0:j1]) - int3c_ip1_occ[:,:,i0:i1] += contract('xpji,jo->xpio', int3c_blk, orbo[j0:j1]) - + if with_j: + vj1_buf[:,i0:i1,j0:j1] += contract('xpji,p->xij', int3c_blk, rhoj[k0:k1]) + rhoj0[:,:,i0:i1] += contract('xpji,ij->xpi', int3c_blk, dm0[i0:i1,j0:j1]) if with_k: + int3c_ip1_occ[:,:,i0:i1] += contract('xpji,jo->xpio', int3c_blk, orbo[j0:j1]) + vk1_ao = contract('xpji,poi->xijo', int3c_blk, rhok0[:,:,i0:i1]) vk1[:,:,j0:j1] += contract('xijo,ia->axjo', vk1_ao, ao2atom[i0:i1]) @@ -845,14 +863,17 @@ def _int3c2e_ip1_vjk_task(intopt, task_list, rhoj, rhok, dm0, orbo, device_id=0, vk1_ao = contract('xpio,pJi->xiJo', int3c_occ, rhok0_slice) vk1 += contract('xiJo,ia->axJo', vk1_ao, ao2atom[i0:i1]) vk1_ao = int3c_occ = None - rhoj0_atom = contract('xpi,ia->xpa', rhoj0, 2.0*ao2atom) - vj1 += contract('pJo,xpa->axJo', rhok_tmp, rhoj0_atom) - rhoj0_atom = None - vk1_buf += contract('xpio,plo->xil', int3c_ip1_occ, rhok_tmp) + if with_j: + rhoj0_atom = contract('xpi,ia->xpa', rhoj0, 2.0*ao2atom) + vj1 += contract('pJo,xpa->axJo', rhok_tmp, rhoj0_atom) + rhoj0_atom = None + if with_k: + vk1_buf += contract('xpio,plo->xil', int3c_ip1_occ, rhok_tmp) # TODO: absorbe vj1_buf and vk1_buf into vj1 and vk1 return vj1_buf, vk1_buf, vj1, vk1 -def get_int3c2e_ip1_vjk(intopt, rhoj, rhok, dm0_tag, aoslices, with_k=True, omega=None): +def get_int3c2e_ip1_vjk(intopt, rhoj, rhok, dm0_tag, aoslices, with_j=True, + with_k=True, omega=None): orbo = cupy.asarray(dm0_tag.occ_coeff, order='C') futures = [] ncp_k = len(intopt.aux_log_qs) @@ -860,15 +881,16 @@ def get_int3c2e_ip1_vjk(intopt, rhoj, rhok, dm0_tag, aoslices, with_k=True, omeg task_list = [] for device_id in range(_num_devices): task_list.append(tasks[device_id::_num_devices]) - + cupy.cuda.get_current_stream().synchronize() with ThreadPoolExecutor(max_workers=_num_devices) as executor: for device_id in range(_num_devices): future = executor.submit( - _int3c2e_ip1_vjk_task, intopt, task_list[device_id], - rhoj, rhok, dm0_tag, orbo, with_k=with_k, device_id=device_id, omega=omega) + _int3c2e_ip1_vjk_task, intopt, task_list[device_id], + rhoj, rhok, dm0_tag, orbo, with_j=with_j, with_k=with_k, + device_id=device_id, omega=omega) futures.append(future) - + vj1_buf_total = [] vk1_buf_total = [] vj1_total = [] @@ -879,45 +901,55 @@ def get_int3c2e_ip1_vjk(intopt, rhoj, rhok, dm0_tag, aoslices, with_k=True, omeg vk1_buf_total.append(vk1_buf) vj1_total.append(vj1) vk1_total.append(vk1) - + vj1 = vk1 = vj1_buf = vk1_buf = None - vj1 = reduce_to_device(vj1_total, inplace=True) - vj1_buf = reduce_to_device(vj1_buf_total, inplace=True) + if with_j: + vj1 = reduce_to_device(vj1_total, inplace=True) + vj1_buf = reduce_to_device(vj1_buf_total, inplace=True) if with_k: vk1 = reduce_to_device(vk1_total, inplace=True) vk1_buf = reduce_to_device(vk1_buf_total, inplace=True) return vj1_buf, vk1_buf, vj1, vk1 -def _int3c2e_ip2_vjk_task(intopt, task_list, rhoj, rhok, dm0, orbo, device_id=0, with_k=True, omega=None): +def _int3c2e_ip2_vjk_task(intopt, task_list, rhoj, rhok, dm0, orbo, + device_id=0, with_j=True, with_k=True, omega=None): natom = intopt.mol.natm nao = intopt.mol.nao auxslices = intopt.auxmol.aoslice_by_atom() + vj1 = vk1 = None with cupy.cuda.Device(device_id), _streams[device_id]: aux2atom = get_aux2atom(intopt, auxslices) - rhoj = cupy.asarray(rhoj) dm0 = cupy.asarray(dm0) orbo = cupy.asarray(orbo) nocc = orbo.shape[1] - vj1 = cupy.zeros([natom,3,nao,nocc]) - vk1 = cupy.zeros([natom,3,nao,nocc]) + if with_j: + rhoj = cupy.asarray(rhoj) + vj1 = cupy.zeros([natom,3,nao,nocc]) + if with_k: + vk1 = cupy.zeros([natom,3,nao,nocc]) aux_ao_loc = intopt.aux_ao_loc ncp_ij = len(intopt.log_qs) for cp_k in task_list: task_list = [(cp_k, cp_ij) for cp_ij in range(ncp_ij)] k0, k1 = aux_ao_loc[cp_k], aux_ao_loc[cp_k+1] - wj2 = cupy.zeros([3,k1-k0]) + if with_j: + wj2 = cupy.zeros([3,k1-k0]) + wk2_P__ = cupy.zeros([3,k1-k0,nao,nocc]) for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, task_list=task_list, ip_type='ip2', omega=omega): # contraction - wj2 += contract('xpji,ji->xp', int3c_blk, dm0[j0:j1,i0:i1]) + if with_j: + wj2 += contract('xpji,ji->xp', int3c_blk, dm0[j0:j1,i0:i1]) + wk2_P__[:,:,i0:i1] += contract('xpji,jo->xpio', int3c_blk, orbo[j0:j1]) rhok_tmp = cupy.asarray(rhok[k0:k1]) - vj1_tmp = -contract('pio,xp->xpio', rhok_tmp, wj2) - vj1_tmp -= contract('xpio,p->xpio', wk2_P__, rhoj[k0:k1]) + if with_j: + vj1_tmp = -contract('pio,xp->xpio', rhok_tmp, wj2) + vj1_tmp -= contract('xpio,p->xpio', wk2_P__, rhoj[k0:k1]) - vj1 += contract('xpio,pa->axio', vj1_tmp, aux2atom[k0:k1]) + vj1 += contract('xpio,pa->axio', vj1_tmp, aux2atom[k0:k1]) if with_k: #rhok0_slice = contract('pio,jo->pij', rhok_tmp, orbo) #vk1_tmp = -contract('xpjo,pij->xpio', wk2_P__, rhok0_slice) @@ -932,7 +964,8 @@ def _int3c2e_ip2_vjk_task(intopt, task_list, rhoj, rhok, dm0, orbo, device_id=0, rhok_tmp = vk1_tmp = None return vj1, vk1 -def get_int3c2e_ip2_vjk(intopt, rhoj, rhok, dm0_tag, auxslices, with_k=True, omega=None): +def get_int3c2e_ip2_vjk(intopt, rhoj, rhok, dm0_tag, auxslices, + with_j=True, with_k=True, omega=None): ''' vj and vk responses (due to int3c2e_ip2) to changes in atomic positions ''' @@ -943,24 +976,26 @@ def get_int3c2e_ip2_vjk(intopt, rhoj, rhok, dm0_tag, auxslices, with_k=True, ome task_list = [] for device_id in range(_num_devices): task_list.append(tasks[device_id::_num_devices]) - + cupy.cuda.get_current_stream().synchronize() with ThreadPoolExecutor(max_workers=_num_devices) as executor: for device_id in range(_num_devices): future = executor.submit( - _int3c2e_ip2_vjk_task, intopt, task_list[device_id], - rhoj, rhok, dm0_tag, orbo, with_k=with_k, device_id=device_id, omega=omega) + _int3c2e_ip2_vjk_task, intopt, task_list[device_id], + rhoj, rhok, dm0_tag, orbo, with_j=with_j, + with_k=with_k, device_id=device_id, omega=omega) futures.append(future) - + vj_total = [] vk_total = [] for future in futures: vj, vk = future.result() vj_total.append(vj) vk_total.append(vk) - + vj = vk = None - vj = reduce_to_device(vj_total, inplace=True) + if with_j: + vj = reduce_to_device(vj_total, inplace=True) if with_k: vk = reduce_to_device(vk_total, inplace=True) return vj, vk @@ -999,7 +1034,7 @@ def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None): task_list = [] for device_id in range(_num_devices): task_list.append(tasks[device_id::_num_devices]) - + nao = intopt.mol.nao naux = intopt.auxmol.nao nocc = orbo.shape[1] @@ -1012,7 +1047,7 @@ def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None): with ThreadPoolExecutor(max_workers=_num_devices) as executor: for device_id in range(_num_devices): future = executor.submit( - _int3c2e_ip1_wjk_task, intopt, task_list[device_id], + _int3c2e_ip1_wjk_task, intopt, task_list[device_id], dm0_tag, orbo, wk, with_k=with_k, device_id=device_id, omega=omega) futures.append(future) wj_total = [] @@ -1049,7 +1084,7 @@ def get_int3c2e_ip2_wjk(intopt, dm0_tag, with_k=True, omega=None): task_list = [] for device_id in range(_num_devices): task_list.append(tasks[device_id::_num_devices]) - + cupy.cuda.get_current_stream().synchronize() with ThreadPoolExecutor(max_workers=_num_devices) as executor: for device_id in range(_num_devices): @@ -1057,14 +1092,14 @@ def get_int3c2e_ip2_wjk(intopt, dm0_tag, with_k=True, omega=None): _int3c2e_ip2_wjk, intopt, task_list[device_id], dm0_tag, orbo, with_k=with_k, device_id=device_id, omega=omega) futures.append(future) - + wj_total = [] wk_total = [] for future in futures: wj, wk = future.result() wj_total.append(wj) wk_total.append(wk) - + wj = wk = None wj = reduce_to_device(wj_total, inplace=True) if with_k: @@ -1373,7 +1408,7 @@ def get_int3c2e_slice(intopt, cp_ij_id, cp_aux_id, cart=False, aosym=None, out=N nbins = 1 bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32) bins_locs_kl = np.array([0, len(log_q_kl)], dtype=np.int32) - + cart_ao_loc = intopt.cart_ao_loc cart_aux_loc = intopt.cart_aux_loc i0, i1 = cart_ao_loc[cpi], cart_ao_loc[cpi+1] @@ -1415,11 +1450,11 @@ def get_int3c2e_slice(intopt, cp_ij_id, cp_aux_id, cart=False, aosym=None, out=N if err != 0: raise RuntimeError('GINT_fill_int2e failed') - + # move this operation to j2c? if lk > 1 and intopt.auxmol.cart == 0: int3c_blk = cart2sph(int3c_blk, axis=0, ang=lk, out=out) - + stream.synchronize() return int3c_blk diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py index 2ca835ad..e5b4d297 100644 --- a/gpu4pyscf/hessian/rhf.py +++ b/gpu4pyscf/hessian/rhf.py @@ -27,8 +27,6 @@ from pyscf.hessian import rhf as rhf_hess_cpu from pyscf import lib, gto from pyscf.gto import ATOM_OF -# import _response_functions to load gen_response methods in SCF class -from gpu4pyscf.scf import _response_functions # noqa from gpu4pyscf.scf import cphf from gpu4pyscf.lib.cupy_helper import (reduce_to_device, contract, tag_array, sandwich_dot, transpose_sum, get_avail_mem, condense, @@ -181,7 +179,7 @@ def _ejk_ip2_task(mol, dms, vhfopt, task_list, j_factor=1.0, k_factor=1.0, cput0 = log.init_timer() dms = cp.asarray(dms) coeff = cp.asarray(vhfopt.coeff) - + #:dms = cp.einsum('pi,nij,qj->npq', vhfopt.coeff, dms, vhfopt.coeff) dms = sandwich_dot(dms, coeff.T) dms = cp.asarray(dms, order='C') @@ -661,8 +659,8 @@ def fvind_vo(mo1): avail_mem = get_avail_mem() # *4 for input dm, vj, vk, and vxc blksize = int(min(avail_mem*.3 / (8*3*nao*nocc*4), # in MO - avail_mem*.6 / (8*nmo*nocc*3*5), - avail_mem*.3 / (8*nmo*nmo*3*3))) # vj, vk, dm + avail_mem*.6 / (8*nmo*nocc*3*5), + avail_mem*.3 / (8*nmo*nmo*3*3))) # vj, vk, dm if blksize < ALIGNED**2: raise RuntimeError('GPU memory insufficient for solving CPHF equations') @@ -692,7 +690,7 @@ def fvind_vo(mo1): mo1[:,:,viridx] *= -e_ai mo1[:,:,occidx] = -s1mo_blk[:,:,occidx] * .5 hs = s1mo_blk = h1mo_blk = None - + tol = mf.conv_tol_cpscf * (i1 - i0) raw_mo1 = krylov(fvind_vo, mo1.reshape(-1,nmo*nocc), tol=tol, max_cycle=max_cycle, verbose=log) @@ -742,7 +740,7 @@ def hess_nuc_elec(mol, dm): fakemol.verbose = mol.verbose fakemol.stdout = mol.stdout intopt = int3c2e.VHFOpt(mol, fakemol, 'int2e') - intopt.build(1e-14, diag_block_with_triu=True, aosym=False, + intopt.build(1e-14, diag_block_with_triu=True, aosym=False, group_size=int3c2e.BLKSIZE, group_size_aux=int3c2e.BLKSIZE) dm = intopt.sort_orbitals(cupy.asarray(dm), axis=[0,1]) @@ -889,7 +887,7 @@ def get_hcore(iatm, jatm): def hcore_generator(hessobj, mol=None): raise NotImplementedError -def _get_jk_mo(hessobj, mol, dms, mo_coeff, mo_occ, +def _get_jk_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1, with_j=True, with_k=True, omega=None): ''' Compute J/K matrices in MO for multiple DMs ''' @@ -903,7 +901,7 @@ def _get_jk_mo(hessobj, mol, dms, mo_coeff, mo_occ, return vj, vk def _get_veff_resp_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1, omega=None): - vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, + vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, hermi=hermi, with_j=True, with_k=True, omega=omega) return vj - 0.5 * vk @@ -921,7 +919,7 @@ class HessianBase(lib.StreamObject): gen_vind = NotImplemented get_jk = NotImplemented kernel = hess = kernel - + def get_hcore(self, mol=None): if mol is None: mol = self.mol return get_hcore(mol) diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py index 5d909c78..bffc221c 100644 --- a/gpu4pyscf/hessian/rks.py +++ b/gpu4pyscf/hessian/rks.py @@ -25,11 +25,10 @@ from pyscf import lib from gpu4pyscf.hessian import rhf as rhf_hess from gpu4pyscf.grad import rhf as rhf_grad -# import pyscf.grad.rks to activate nuc_grad_method method from gpu4pyscf.grad import rks as rks_grad from gpu4pyscf.dft import numint -from gpu4pyscf.lib.cupy_helper import (contract, add_sparse, get_avail_mem, - reduce_to_device, transpose_sum, tag_array) +from gpu4pyscf.lib.cupy_helper import (contract, add_sparse, get_avail_mem, + reduce_to_device) from gpu4pyscf.lib import logger from gpu4pyscf.__config__ import _streams, _num_devices from gpu4pyscf.hessian import jk @@ -737,15 +736,15 @@ def get_veff_resp_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1, omega=None): vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, hermi=1) vk *= hyb if omega > 1e-10: # For range separated Coulomb - _, vk_lr = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, hermi, - with_j=False, omega=omega) + _, vk_lr = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, hermi, + with_j=False, omega=omega) vk_lr *= (alpha-hyb) vk += vk_lr v1 += vj - .5 * vk else: - v1 += hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, hermi=1, + v1 += hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, hermi=1, with_k=False)[0] - + return v1 diff --git a/gpu4pyscf/hessian/uhf.py b/gpu4pyscf/hessian/uhf.py index 44154532..b3cff989 100644 --- a/gpu4pyscf/hessian/uhf.py +++ b/gpu4pyscf/hessian/uhf.py @@ -21,15 +21,12 @@ Non-relativistic UHF analytical Hessian ''' -from functools import reduce import numpy as np import cupy import cupy as cp from pyscf import lib from pyscf.scf import ucphf -# import _response_functions to load gen_response methods in SCF class -from gpu4pyscf.scf import _response_functions # noqa -from gpu4pyscf.lib.cupy_helper import (contract, transpose_sum, get_avail_mem, +from gpu4pyscf.lib.cupy_helper import (contract, transpose_sum, get_avail_mem, krylov, tag_array) from gpu4pyscf.lib import logger from gpu4pyscf.grad import rhf as rhf_grad @@ -406,7 +403,7 @@ def fx(mo1): return fx def _get_veff_resp_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1): - vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, + vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, hermi=hermi, with_j=True, with_k=True) return vj - vk @@ -422,7 +419,7 @@ class Hessian(rhf_hess_gpu.HessianBase): gen_vind = gen_vind get_jk_mo = rhf_hess_gpu._get_jk_mo get_veff_resp_mo = _get_veff_resp_mo - + def solve_mo1(self, mo_energy, mo_coeff, mo_occ, h1mo, fx=None, atmlst=None, max_memory=4000, verbose=None): return solve_mo1(self.base, mo_energy, mo_coeff, mo_occ, h1mo, diff --git a/gpu4pyscf/hessian/uks.py b/gpu4pyscf/hessian/uks.py index 66571300..2a048f5f 100644 --- a/gpu4pyscf/hessian/uks.py +++ b/gpu4pyscf/hessian/uks.py @@ -23,11 +23,9 @@ from gpu4pyscf.hessian import rhf as rhf_hess from gpu4pyscf.hessian import uhf as uhf_hess from gpu4pyscf.grad import rhf as rhf_grad -# import pyscf.grad.rks to activate nuc_grad_method method from gpu4pyscf.grad import rks as rks_grad from gpu4pyscf.dft import numint -from gpu4pyscf.lib.cupy_helper import (contract, add_sparse, get_avail_mem, - transpose_sum, tag_array) +from gpu4pyscf.lib.cupy_helper import (contract, add_sparse, get_avail_mem) from gpu4pyscf.lib import logger from gpu4pyscf.hessian import jk @@ -856,7 +854,7 @@ def get_veff_resp_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1): # If cphf_grids is not defined, e.g object defined from CPU grids = getattr(mf, 'grids', None) logger.info(mf, 'Primary grids is used for CPHF in Hessian') - + if grids and grids.coords is None: grids.build(mol=mol, with_non0tab=False, sort_grids=True) @@ -866,7 +864,7 @@ def get_veff_resp_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1): moccb = mo_coeff[1][:,mo_occ[1]>0] nocca = mocca.shape[1] noccb = moccb.shape[1] - + ni = mf._numint omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, mol.spin) hybrid = ni.libxc.is_hybrid_xc(mf.xc) @@ -885,13 +883,13 @@ def get_veff_resp_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1): vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, hermi=1) vk *= hyb if omega > 1e-10: - _, vk_lr = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, - hermi, with_j=False, omega=omega) + _, vk_lr = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, + hermi, with_j=False, omega=omega) vk_lr *= (alpha-hyb) vk += vk_lr v1vo += vj - vk else: - v1vo += hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, + v1vo += hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, hermi=1, with_k=False)[0] return v1vo From dfc336d2f70ccf01d3c8d94f6eb7d2d13d75b9c0 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Sun, 22 Dec 2024 21:19:04 -0800 Subject: [PATCH 11/49] memory estimate --- gpu4pyscf/hessian/rhf.py | 3 +-- gpu4pyscf/hessian/uhf.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py index e5b4d297..b2aab6c6 100644 --- a/gpu4pyscf/hessian/rhf.py +++ b/gpu4pyscf/hessian/rhf.py @@ -659,8 +659,7 @@ def fvind_vo(mo1): avail_mem = get_avail_mem() # *4 for input dm, vj, vk, and vxc blksize = int(min(avail_mem*.3 / (8*3*nao*nocc*4), # in MO - avail_mem*.6 / (8*nmo*nocc*3*5), - avail_mem*.3 / (8*nmo*nmo*3*3))) # vj, vk, dm + avail_mem*.3 / (8*nmo*nmo*3*3))) # vj, vk, dm in AO if blksize < ALIGNED**2: raise RuntimeError('GPU memory insufficient for solving CPHF equations') diff --git a/gpu4pyscf/hessian/uhf.py b/gpu4pyscf/hessian/uhf.py index b3cff989..0f695b2b 100644 --- a/gpu4pyscf/hessian/uhf.py +++ b/gpu4pyscf/hessian/uhf.py @@ -293,7 +293,7 @@ def fvind_vo(mo1): avail_mem = get_avail_mem() # *8 for spin-up/down input dm, vj, vk, and vxc blksize = int(min(avail_mem*.3 / (8*3*nao*nao*8), - avail_mem*.6 / (8*nmo*nocc*natm*3*5))) + avail_mem*.3 / (8*nmo*nmo*3*6))) # in vj, vk, dm in AO if blksize < ALIGNED**2: raise RuntimeError('GPU memory insufficient') From 78759fccab62e1074de66f60e993f2a527cf48fb Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Mon, 23 Dec 2024 17:30:08 +0000 Subject: [PATCH 12/49] tested on 095 molecule --- examples/dft_driver.py | 6 +++--- gpu4pyscf/df/int3c2e.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/dft_driver.py b/examples/dft_driver.py index 8060e909..e0eccdda 100644 --- a/examples/dft_driver.py +++ b/examples/dft_driver.py @@ -35,10 +35,10 @@ basis=bas, max_memory=32000) # set verbose >= 6 for debugging timer -mol.verbose = 4 +mol.verbose = 6 -mf_df = dft.RKS(mol, xc=args.xc)#.density_fit(auxbasis=args.auxbasis) -mf_df.verbose = 4 +mf_df = dft.RKS(mol, xc=args.xc).density_fit(auxbasis=args.auxbasis) +mf_df.verbose = 6 if args.solvent: mf_df = mf_df.PCM() diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py index ce972503..2606f3ef 100644 --- a/gpu4pyscf/df/int3c2e.py +++ b/gpu4pyscf/df/int3c2e.py @@ -815,7 +815,7 @@ def get_int3c2e_jk(mol, auxmol, dm0_tag, with_k=True, omega=None): rhok[k0:k1] = rhok_tmp return rhoj, rhok -def _int3c2e_ip1_vjk_task(intopt, task_list, rhoj, rhok, dm0, orbo, device_id=0, +def _int3c2e_ip1_vjk_task(intopt, task_k_list, rhoj, rhok, dm0, orbo, device_id=0, with_j=True, with_k=True, omega=None): natom = intopt.mol.natm nao = intopt.mol.nao @@ -835,7 +835,7 @@ def _int3c2e_ip1_vjk_task(intopt, task_list, rhoj, rhok, dm0, orbo, device_id=0, vk1 = cupy.zeros([natom,3,nao,nocc]) aux_ao_loc = intopt.aux_ao_loc ncp_ij = len(intopt.log_qs) - for cp_k in task_list: + for cp_k in task_k_list: task_list = [(cp_k, cp_ij) for cp_ij in range(ncp_ij)] k0, k1 = aux_ao_loc[cp_k], aux_ao_loc[cp_k+1] rhok_tmp = cupy.asarray(rhok[k0:k1]) From 65b4bff21de316d49acbe433fde325474b092eeb Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Mon, 23 Dec 2024 18:54:34 +0000 Subject: [PATCH 13/49] improve make_h1 in df.hessian --- gpu4pyscf/df/int3c2e.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py index 2606f3ef..89822c96 100644 --- a/gpu4pyscf/df/int3c2e.py +++ b/gpu4pyscf/df/int3c2e.py @@ -821,6 +821,7 @@ def _int3c2e_ip1_vjk_task(intopt, task_k_list, rhoj, rhok, dm0, orbo, device_id= nao = intopt.mol.nao aoslices = intopt.mol.aoslice_by_atom() vj1_buf = vk1_buf = vj1 = vk1 = None + with cupy.cuda.Device(device_id), _streams[device_id]: ao2atom = get_ao2atom(intopt, aoslices) dm0 = cupy.asarray(dm0) @@ -856,19 +857,20 @@ def _int3c2e_ip1_vjk_task(intopt, task_k_list, rhoj, rhok, dm0, orbo, device_id= vk1_ao = contract('xpji,poi->xijo', int3c_blk, rhok0[:,:,i0:i1]) vk1[:,:,j0:j1] += contract('xijo,ia->axjo', vk1_ao, ao2atom[i0:i1]) - - int3c_occ = contract('xpji,jo->xpio', int3c_blk, orbo[j0:j1]) - rhok0_slice = contract('pJr,ir->pJi', rhok_tmp, orbo[i0:i1]) - - vk1_ao = contract('xpio,pJi->xiJo', int3c_occ, rhok0_slice) - vk1 += contract('xiJo,ia->axJo', vk1_ao, ao2atom[i0:i1]) - vk1_ao = int3c_occ = None if with_j: rhoj0_atom = contract('xpi,ia->xpa', rhoj0, 2.0*ao2atom) vj1 += contract('pJo,xpa->axJo', rhok_tmp, rhoj0_atom) rhoj0_atom = None if with_k: vk1_buf += contract('xpio,plo->xil', int3c_ip1_occ, rhok_tmp) + mem_avail = get_avail_mem() + blksize = min(int(mem_avail * 0.2 / ((k1-k0) * nao) * 8), + int(mem_avail * 0.2 / (nocc * nao * 3 * 8))) + for p0, p1, in lib.prange(0, nao, blksize): + rhok0_slice = contract('pJr,ir->pJi', rhok_tmp[:,p0:p1], orbo) + vk1_ao = contract('xpio,pJi->xiJo', int3c_ip1_occ, rhok0_slice) + vk1[:,:,p0:p1] += contract('xiJo,ia->axJo', vk1_ao, ao2atom) + # TODO: absorbe vj1_buf and vk1_buf into vj1 and vk1 return vj1_buf, vk1_buf, vj1, vk1 From 509fc6e5272b9ffb858297effe7657cef4ff4ce1 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Tue, 24 Dec 2024 09:54:39 -0800 Subject: [PATCH 14/49] bugfix --- gpu4pyscf/scf/jk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu4pyscf/scf/jk.py b/gpu4pyscf/scf/jk.py index ecf166fd..8e09a35b 100644 --- a/gpu4pyscf/scf/jk.py +++ b/gpu4pyscf/scf/jk.py @@ -59,7 +59,7 @@ int(gpu_specs['sharedMemPerBlockOptin']//9)*8) THREADS = 256 -def _jk_task(mol, dms, vhfopt, task_list, +def _jk_task(mol, dms, vhfopt, task_list, hermi=0, device_id=0, with_j=True, with_k=True, verbose=None): n_dm = dms.shape[0] nao, _ = vhfopt.coeff.shape From 0ae65fb028ec69931bd7246f7dd5c2ff8bad54ea Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Tue, 24 Dec 2024 10:52:24 -0800 Subject: [PATCH 15/49] use sorted_mol --- gpu4pyscf/hessian/jk.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/gpu4pyscf/hessian/jk.py b/gpu4pyscf/hessian/jk.py index f4f102c6..a1cd6105 100644 --- a/gpu4pyscf/hessian/jk.py +++ b/gpu4pyscf/hessian/jk.py @@ -27,7 +27,7 @@ from pyscf.scf import _vhf from pyscf import __config__ -from gpu4pyscf.scf.jk import (_make_tril_tile_mappings, quartets_scheme, QUEUE_DEPTH, +from gpu4pyscf.scf.jk import (_make_tril_tile_mappings, quartets_scheme, QUEUE_DEPTH, _VHFOpt, LMAX, init_constant, libvhf_rys) from gpu4pyscf.lib.cupy_helper import (condense, sandwich_dot, transpose_sum, reduce_to_device, contract) @@ -49,7 +49,7 @@ def _jk_task(mol, dms, mo_coeff, mo_occ, vhfopt, task_list, hermi=0, l_ctr_bas_loc = vhfopt.l_ctr_offsets l_symb = [lib.param.ANGULAR[i] for i in uniq_l] kern = libvhf_rys.RYS_build_jk - + timing_counter = Counter() kern_counts = 0 with cp.cuda.Device(device_id), _streams[device_id]: @@ -69,7 +69,7 @@ def _jk_task(mol, dms, mo_coeff, mo_occ, vhfopt, task_list, hermi=0, s_ptr = lib.c_null_ptr() if mol.omega < 0: s_ptr = ctypes.cast(vhfopt.s_estimator.data.ptr, ctypes.c_void_p) - + vj = vk = None vj_ptr = vk_ptr = lib.c_null_ptr() assert with_j or with_k @@ -79,7 +79,7 @@ def _jk_task(mol, dms, mo_coeff, mo_occ, vhfopt, task_list, hermi=0, if with_j: vj = cp.zeros(dms.shape) vj_ptr = ctypes.cast(vj.data.ptr, ctypes.c_void_p) - + ao_loc = mol.ao_loc dm_cond = cp.log(condense('absmax', dms, ao_loc) + 1e-300).astype(np.float32) log_max_dm = dm_cond.max() @@ -137,7 +137,7 @@ def _jk_task(mol, dms, mo_coeff, mo_occ, vhfopt, task_list, hermi=0, # Unrestricted case mo_coeff = cp.asarray(mo_coeff) mo_occ = cp.asarray(mo_occ) - moa = coeff.dot(mo_coeff[0]) + moa = coeff.dot(mo_coeff[0]) mob = coeff.dot(mo_coeff[1]) nmoa, nmob = moa.shape[1], mob.shape[1] mocca = moa[:,mo_occ[0] > 0.5] @@ -163,10 +163,10 @@ def _jk_task(mol, dms, mo_coeff, mo_occ, vhfopt, task_list, hermi=0, vj = _ao2mo(vj, mocc, mo_coeff).reshape(n_dm,-1) if with_k: vk = _ao2mo(vk, mocc, mo_coeff).reshape(n_dm,-1) - + return vj, vk, kern_counts, timing_counter -def get_jk(mol, dm, mo_coeff, mo_occ, hermi=0, vhfopt=None, +def get_jk(mol, dm, mo_coeff, mo_occ, hermi=0, vhfopt=None, with_j=True, with_k=True, verbose=None): '''Compute J, K matrices in MO ''' @@ -176,7 +176,7 @@ def get_jk(mol, dm, mo_coeff, mo_occ, hermi=0, vhfopt=None, if vhfopt is None: vhfopt = _VHFOpt(mol).build() - mol = vhfopt.mol + mol = vhfopt.sorted_mol nao, nao_orig = vhfopt.coeff.shape dm = cp.asarray(dm, order='C') @@ -205,7 +205,7 @@ def get_jk(mol, dm, mo_coeff, mo_occ, hermi=0, vhfopt=None, future = executor.submit( _jk_task, mol, dms, mo_coeff, mo_occ, vhfopt, task_list[device_id], hermi=hermi, - with_j=with_j, with_k=with_k, verbose=verbose, + with_j=with_j, with_k=with_k, verbose=verbose, device_id=device_id) futures.append(future) @@ -224,7 +224,7 @@ def get_jk(mol, dm, mo_coeff, mo_occ, hermi=0, vhfopt=None, log.debug1('kernel launches %d', kern_counts) for llll, t in timing_collection.items(): log.debug1('%s wall time %.2f', llll, t) - + for s in _streams: s.synchronize() cp.cuda.get_current_stream().synchronize() From be6cf61c723aec6259913ad0c27e8c18f9108f38 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Thu, 26 Dec 2024 18:01:44 +0000 Subject: [PATCH 16/49] update nightly build --- gpu4pyscf/df/hessian/jk.py | 5 +- gpu4pyscf/gto/int3c1e.py | 68 +++++--- gpu4pyscf/gto/int3c1e_ip.py | 64 ++++---- gpu4pyscf/solvent/grad/pcm.py | 13 +- gpu4pyscf/solvent/grad/smd.py | 90 ----------- gpu4pyscf/tests/020_Vitamin_C.xyz | 22 +++ gpu4pyscf/tests/057_Tamoxifen.xyz | 59 +++++++ gpu4pyscf/tests/095_Azadirachtin.xyz | 97 +++++++++++ gpu4pyscf/tests/test_dft.py | 180 --------------------- gpu4pyscf/tests/test_rks.py | 230 +++++++++++++++++++++++++++ gpu4pyscf/tests/test_uks.py | 92 +++++++++++ 11 files changed, 585 insertions(+), 335 deletions(-) create mode 100644 gpu4pyscf/tests/020_Vitamin_C.xyz create mode 100644 gpu4pyscf/tests/057_Tamoxifen.xyz create mode 100644 gpu4pyscf/tests/095_Azadirachtin.xyz delete mode 100644 gpu4pyscf/tests/test_dft.py create mode 100644 gpu4pyscf/tests/test_rks.py create mode 100644 gpu4pyscf/tests/test_uks.py diff --git a/gpu4pyscf/df/hessian/jk.py b/gpu4pyscf/df/hessian/jk.py index f8992ca3..e66739d2 100644 --- a/gpu4pyscf/df/hessian/jk.py +++ b/gpu4pyscf/df/hessian/jk.py @@ -316,6 +316,7 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo, hj_ipip1[:,i0:i1] += contract('xpi,p->xi', tmp, rhoj[k0:k1]) if with_k: hk_ipip1[:,i0:i1] += contract('xpji,pji->xi', int3c_blk, rhok_tmp) + int3c_blk = None # (11|0), (0|0)(0|00) without response of RI basis int3c_blk = _get_int3c2e_ipip_slice('ipvip1', intopt, cp_ij_id, aux_id, omega=omega) @@ -324,6 +325,7 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo, hj_ipvip1[:,i0:i1,j0:j1] += contract('xpij,p->xij', tmp, rhoj[k0:k1]) if with_k: hk_ipvip1[:,i0:i1,j0:j1] += contract('xpji,pji->xij', int3c_blk, rhok_tmp) + int3c_blk = None if auxbasis_response < 1: continue @@ -335,6 +337,7 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo, hj_ip1ip2[:,i0:i1,k0:k1] += contract('xpi,p->xip', tmp, rhoj[k0:k1]) if with_k: hk_ip1ip2[:,i0:i1,k0:k1] += contract('xpji,pji->xip', int3c_blk, rhok_tmp) + int3c_blk = None if auxbasis_response < 2: continue @@ -346,7 +349,7 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo, hj_ipip2[:,k0:k1] += contract('xp,p->xp', tmp, rhoj[k0:k1]) if with_k: hk_ipip2[:,k0:k1] += contract('xpji,pji->xp', int3c_blk, rhok_tmp) - + int3c_blk = None auxslices = intopt.auxmol.aoslice_by_atom() aoslices = intopt.mol.aoslice_by_atom() ao2atom = int3c2e.get_ao2atom(intopt, aoslices) diff --git a/gpu4pyscf/gto/int3c1e.py b/gpu4pyscf/gto/int3c1e.py index 9fa6c98d..d693b804 100644 --- a/gpu4pyscf/gto/int3c1e.py +++ b/gpu4pyscf/gto/int3c1e.py @@ -15,7 +15,7 @@ import ctypes import cupy as cp import numpy as np - +from pyscf import lib from pyscf.scf import _vhf from pyscf.gto import ATOM_OF from pyscf.lib import c_null_ptr @@ -161,7 +161,6 @@ def get_n_hermite_density_of_angular_pair(l): def sort_orbitals(self, mat, axis=[]): ''' Transform given axis of a matrix into sorted AO, - and transform given auxiliary axis of a matrix into sorted auxiliary AO ''' idx = self._ao_idx shape_ones = (1,) * mat.ndim @@ -176,6 +175,24 @@ def sort_orbitals(self, mat, axis=[]): fancy_index.append(indices.reshape(idx_shape)) return mat[tuple(fancy_index)] + def unsort_orbitals(self, sorted_mat, axis=[]): + ''' Transform given axis of a matrix into sorted AO, + ''' + idx = self._ao_idx + shape_ones = (1,) * sorted_mat.ndim + fancy_index = [] + for dim, n in enumerate(sorted_mat.shape): + if dim in axis: + assert n == len(idx) + indices = idx + else: + indices = np.arange(n) + idx_shape = shape_ones[:dim] + (-1,) + shape_ones[dim+1:] + fancy_index.append(indices.reshape(idx_shape)) + mat = cp.empty_like(sorted_mat) + mat[tuple(fancy_index)] = sorted_mat + return mat + @property def bpcache(self): device_id = cp.cuda.Device().id @@ -205,17 +222,17 @@ def get_int3c1e(mol, grids, charge_exponents, intopt): "which requires {total_double_number * 8 / 1e9 : .1f} GB of memory") ngrids_per_split = (ngrids + n_grid_split - 1) // n_grid_split - int3c_pinned_memory_pool = cp.cuda.alloc_pinned_memory(ngrids * nao * nao * np.array([1.0]).nbytes) - int3c = np.frombuffer(int3c_pinned_memory_pool, np.float64, ngrids * nao * nao).reshape([ngrids, nao, nao], order='C') + buf_size = ngrids * nao * nao * 8 + int3c_pinned_buf = cp.cuda.alloc_pinned_memory(buf_size) + int3c = np.frombuffer(int3c_pinned_buf, np.float64, buf_size).reshape([ngrids, nao, nao], order='C') # int3c = np.zeros([ngrids, nao, nao], order='C') # Using unpinned (pageable) memory, each memcpy is much slower, but there's no initialization time grids = cp.asarray(grids, order='C') if charge_exponents is not None: charge_exponents = cp.asarray(charge_exponents, order='C') - for i_grid_split in range(0, ngrids, ngrids_per_split): - ngrids_of_split = np.min([ngrids_per_split, ngrids - i_grid_split]) - int3c_grid_slice = cp.zeros([ngrids_of_split, nao, nao], order='C') + for p0, p1 in lib.prange(0, ngrids, ngrids_per_split): + int3c_grid_slice = cp.zeros([p1-p0, nao, nao], order='C') for cp_ij_id, _ in enumerate(intopt.log_qs): cpi = intopt.cp_idx[cp_ij_id] cpj = intopt.cp_jdx[cp_ij_id] @@ -237,18 +254,18 @@ def get_int3c1e(mol, grids, charge_exponents, intopt): ao_offsets = np.array([i0, j0], dtype=np.int32) strides = np.array([ni, ni*nj], dtype=np.int32) - int3c_angular_slice = cp.zeros([ngrids_of_split, j1-j0, i1-i0], order='C') + int3c_angular_slice = cp.zeros([p1-p0, j1-j0, i1-i0], order='C') charge_exponents_pointer = c_null_ptr() if charge_exponents is not None: - charge_exponents_pointer = charge_exponents[i_grid_split : i_grid_split + ngrids_of_split].data.ptr + charge_exponents_pointer = charge_exponents[p0:p1].data.ptr err = libgint.GINTfill_int3c1e( ctypes.cast(stream.ptr, ctypes.c_void_p), intopt.bpcache, - ctypes.cast(grids[i_grid_split : i_grid_split + ngrids_of_split, :].data.ptr, ctypes.c_void_p), + ctypes.cast(grids[p0:p1, :].data.ptr, ctypes.c_void_p), ctypes.cast(charge_exponents_pointer, ctypes.c_void_p), - ctypes.c_int(ngrids_of_split), + ctypes.c_int(p1-p0), ctypes.cast(int3c_angular_slice.data.ptr, ctypes.c_void_p), strides.ctypes.data_as(ctypes.c_void_p), ao_offsets.ctypes.data_as(ctypes.c_void_p), @@ -270,11 +287,11 @@ def get_int3c1e(mol, grids, charge_exponents, intopt): row, col = np.tril_indices(nao) int3c_grid_slice[:, row, col] = int3c_grid_slice[:, col, row] - ao_idx = np.argsort(intopt._ao_idx) - grid_idx = np.arange(ngrids_of_split) - int3c_grid_slice = int3c_grid_slice[np.ix_(grid_idx, ao_idx, ao_idx)] - - int3c_grid_slice.get(out = int3c[i_grid_split : i_grid_split + ngrids_of_split, :, :]) + #ao_idx = np.argsort(intopt._ao_idx) + #grid_idx = np.arange(p1-p0) + #int3c_grid_slice = int3c_grid_slice[np.ix_(grid_idx, ao_idx, ao_idx)] + int3c_grid_slice = intopt.unsort_orbitals(int3c_grid_slice, axis=[1,2]) + int3c_grid_slice.get(out = int3c[p0:p1, :, :]) return int3c @@ -355,9 +372,9 @@ def get_int3c1e_charge_contracted(mol, grids, charge_exponents, charges, intopt) row, col = np.tril_indices(nao) int1e_charge_contracted[row, col] = int1e_charge_contracted[col, row] - ao_idx = np.argsort(intopt._ao_idx) - int1e_charge_contracted = int1e_charge_contracted[np.ix_(ao_idx, ao_idx)] - + #ao_idx = np.argsort(intopt._ao_idx) + #int1e_charge_contracted = int1e_charge_contracted[np.ix_(ao_idx, ao_idx)] + int1e_charge_contracted = intopt.unsort_orbitals(int1e_charge_contracted, axis=[0,1]) return int1e_charge_contracted def get_int3c1e_density_contracted(mol, grids, charge_exponents, dm, intopt): @@ -385,7 +402,7 @@ def get_int3c1e_density_contracted(mol, grids, charge_exponents, dm, intopt): bas_coords = intopt._sorted_mol.atom_coords()[intopt._sorted_mol._bas[:, ATOM_OF]].flatten() n_total_hermite_density = intopt.density_offset[-1] - dm_pair_ordered = np.zeros(n_total_hermite_density) + dm_pair_ordered = np.empty(n_total_hermite_density) libgint.GINTinit_J_density_rys_preprocess(dm.ctypes.data_as(ctypes.c_void_p), dm_pair_ordered.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(1), ctypes.c_int(nao_cart), ctypes.c_int(len(intopt.bas_pairs_locs) - 1), @@ -413,8 +430,7 @@ def get_int3c1e_density_contracted(mol, grids, charge_exponents, dm, intopt): int3c_density_contracted = cp.zeros(ngrids) - for i_grid_split in range(0, ngrids, ngrids_per_split): - ngrids_of_split = np.min([ngrids_per_split, ngrids - i_grid_split]) + for p0, p1 in lib.prange(0, ngrids, ngrids_per_split): for cp_ij_id, _ in enumerate(intopt.log_qs): stream = cp.cuda.get_current_stream() @@ -425,7 +441,7 @@ def get_int3c1e_density_contracted(mol, grids, charge_exponents, dm, intopt): charge_exponents_pointer = c_null_ptr() if charge_exponents is not None: - charge_exponents_pointer = charge_exponents[i_grid_split : i_grid_split + ngrids_of_split].data.ptr + charge_exponents_pointer = charge_exponents[p0:p1].data.ptr # n_pair_sum_per_thread = 1 # means every thread processes one pair and one grid # n_pair_sum_per_thread = nao_cart # or larger number gaurantees one thread processes one grid and all pairs of the same type @@ -434,12 +450,12 @@ def get_int3c1e_density_contracted(mol, grids, charge_exponents, dm, intopt): err = libgint.GINTfill_int3c1e_density_contracted( ctypes.cast(stream.ptr, ctypes.c_void_p), intopt.bpcache, - ctypes.cast(grids[i_grid_split : i_grid_split + ngrids_of_split, :].data.ptr, ctypes.c_void_p), + ctypes.cast(grids[p0:p1, :].data.ptr, ctypes.c_void_p), ctypes.cast(charge_exponents_pointer, ctypes.c_void_p), - ctypes.c_int(ngrids_of_split), + ctypes.c_int(p1-p0), ctypes.cast(dm_pair_ordered.data.ptr, ctypes.c_void_p), intopt.density_offset.ctypes.data_as(ctypes.c_void_p), - ctypes.cast(int3c_density_contracted[i_grid_split : i_grid_split + ngrids_of_split].data.ptr, ctypes.c_void_p), + ctypes.cast(int3c_density_contracted[p0:p1].data.ptr, ctypes.c_void_p), bins_locs_ij.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(nbins), ctypes.c_int(cp_ij_id), diff --git a/gpu4pyscf/gto/int3c1e_ip.py b/gpu4pyscf/gto/int3c1e_ip.py index cc53feab..717db68f 100644 --- a/gpu4pyscf/gto/int3c1e_ip.py +++ b/gpu4pyscf/gto/int3c1e_ip.py @@ -15,7 +15,7 @@ import ctypes import cupy as cp import numpy as np - +from pyscf import lib from pyscf.gto import ATOM_OF from pyscf.lib import c_null_ptr from gpu4pyscf.lib.cupy_helper import load_library, cart2sph, get_avail_mem @@ -40,19 +40,19 @@ def get_int3c1e_ip(mol, grids, charge_exponents, intopt): "the 3 center integral first derivative, " "which requires {total_double_number * 8 / 1e9 : .1f} GB of memory") ngrids_per_split = (ngrids + n_grid_split - 1) // n_grid_split - - int3cip1_pinned_memory_pool = cp.cuda.alloc_pinned_memory(ngrids * nao * nao * 3 * np.array([1.0]).nbytes) - int3c_ip1 = np.frombuffer(int3cip1_pinned_memory_pool, np.float64, ngrids * nao * nao * 3).reshape([3, ngrids, nao, nao], order='C') - int3cip2_pinned_memory_pool = cp.cuda.alloc_pinned_memory(ngrids * nao * nao * 3 * np.array([1.0]).nbytes) - int3c_ip2 = np.frombuffer(int3cip2_pinned_memory_pool, np.float64, ngrids * nao * nao * 3).reshape([3, ngrids, nao, nao], order='C') + + buf_size = ngrids * nao * nao * 3 + int3cip1_pinned_buf = cp.cuda.alloc_pinned_memory(buf_size * 8) + int3c_ip1 = np.frombuffer(int3cip1_pinned_buf, np.float64, buf_size).reshape([3, ngrids, nao, nao], order='C') + int3cip2_pinned_buf = cp.cuda.alloc_pinned_memory(buf_size * 8) + int3c_ip2 = np.frombuffer(int3cip2_pinned_buf, np.float64, buf_size).reshape([3, ngrids, nao, nao], order='C') grids = cp.asarray(grids, order='C') if charge_exponents is not None: charge_exponents = cp.asarray(charge_exponents, order='C') - for i_grid_split in range(0, ngrids, ngrids_per_split): - ngrids_of_split = np.min([ngrids_per_split, ngrids - i_grid_split]) - int3c_grid_slice = cp.zeros([6, ngrids_of_split, nao, nao], order='C') + for p0, p1 in lib.prange(0, ngrids, ngrids_per_split): + int3c_grid_slice = cp.zeros([6, p1-p0, nao, nao], order='C') for cp_ij_id, _ in enumerate(intopt.log_qs): cpi = intopt.cp_idx[cp_ij_id] cpj = intopt.cp_jdx[cp_ij_id] @@ -74,18 +74,18 @@ def get_int3c1e_ip(mol, grids, charge_exponents, intopt): ao_offsets = np.array([i0, j0], dtype=np.int32) strides = np.array([ni, ni*nj], dtype=np.int32) - int3c_angular_slice = cp.zeros([6, ngrids_of_split, j1-j0, i1-i0], order='C') + int3c_angular_slice = cp.zeros([6, p1-p0, j1-j0, i1-i0], order='C') charge_exponents_pointer = c_null_ptr() if charge_exponents is not None: - charge_exponents_pointer = charge_exponents[i_grid_split : i_grid_split + ngrids_of_split].data.ptr + charge_exponents_pointer = charge_exponents[p0:p1].data.ptr err = libgint.GINTfill_int3c1e_ip( ctypes.cast(stream.ptr, ctypes.c_void_p), intopt.bpcache, - ctypes.cast(grids[i_grid_split : i_grid_split + ngrids_of_split, :].data.ptr, ctypes.c_void_p), + ctypes.cast(grids[p0:p1, :].data.ptr, ctypes.c_void_p), ctypes.cast(charge_exponents_pointer, ctypes.c_void_p), - ctypes.c_int(ngrids_of_split), + ctypes.c_int(p1-p0), ctypes.cast(int3c_angular_slice.data.ptr, ctypes.c_void_p), strides.ctypes.data_as(ctypes.c_void_p), ao_offsets.ctypes.data_as(ctypes.c_void_p), @@ -106,17 +106,17 @@ def get_int3c1e_ip(mol, grids, charge_exponents, intopt): int3c_grid_slice[:, :, j0:j1, i0:i1] = int3c_angular_slice ao_idx = np.argsort(intopt._ao_idx) - grid_idx = np.arange(ngrids_of_split) + grid_idx = np.arange(p1-p0) derivative_idx = np.arange(6) int3c_grid_slice = int3c_grid_slice[np.ix_(derivative_idx, grid_idx, ao_idx, ao_idx)] # Each piece of the following memory is contiguous - int3c_grid_slice[0, :, :, :].get(out = int3c_ip1[0, i_grid_split : i_grid_split + ngrids_of_split, :, :]) - int3c_grid_slice[1, :, :, :].get(out = int3c_ip1[1, i_grid_split : i_grid_split + ngrids_of_split, :, :]) - int3c_grid_slice[2, :, :, :].get(out = int3c_ip1[2, i_grid_split : i_grid_split + ngrids_of_split, :, :]) - int3c_grid_slice[3, :, :, :].get(out = int3c_ip2[0, i_grid_split : i_grid_split + ngrids_of_split, :, :]) - int3c_grid_slice[4, :, :, :].get(out = int3c_ip2[1, i_grid_split : i_grid_split + ngrids_of_split, :, :]) - int3c_grid_slice[5, :, :, :].get(out = int3c_ip2[2, i_grid_split : i_grid_split + ngrids_of_split, :, :]) + int3c_grid_slice[0, :, :, :].get(out = int3c_ip1[0, p0:p1, :, :]) + int3c_grid_slice[1, :, :, :].get(out = int3c_ip1[1, p0:p1, :, :]) + int3c_grid_slice[2, :, :, :].get(out = int3c_ip1[2, p0:p1, :, :]) + int3c_grid_slice[3, :, :, :].get(out = int3c_ip2[0, p0:p1, :, :]) + int3c_grid_slice[4, :, :, :].get(out = int3c_ip2[1, p0:p1, :, :]) + int3c_grid_slice[5, :, :, :].get(out = int3c_ip2[2, p0:p1, :, :]) return int3c_ip1, int3c_ip2 @@ -134,7 +134,7 @@ def get_int3c1e_ip1_charge_contracted(mol, grids, charge_exponents, charges, int charges = charges.reshape([-1, 1], order='C') grids = cp.concatenate([grids, charges], axis=1) - int1e_charge_contracted = cp.zeros([3, mol.nao, mol.nao], order='C') + int1e_charge_contracted = cp.empty([3, mol.nao, mol.nao], order='C') for cp_ij_id, _ in enumerate(intopt.log_qs): cpi = intopt.cp_idx[cp_ij_id] cpj = intopt.cp_jdx[cp_ij_id] @@ -193,11 +193,7 @@ def get_int3c1e_ip1_charge_contracted(mol, grids, charge_exponents, charges, int int1e_charge_contracted[:, j0:j1, i0:i1] = int1e_angular_slice - ao_idx = np.argsort(intopt._ao_idx) - derivative_idx = np.arange(3) - int1e_charge_contracted = int1e_charge_contracted[np.ix_(derivative_idx, ao_idx, ao_idx)] - - return int1e_charge_contracted + return intopt.unsort_orbitals(int1e_charge_contracted, axis=[1,2]) def get_int3c1e_ip2_density_contracted(mol, grids, charge_exponents, dm, intopt): omega = mol.omega @@ -228,10 +224,11 @@ def get_int3c1e_ip2_density_contracted(mol, grids, charge_exponents, dm, intopt) bas_coords = intopt._sorted_mol.atom_coords()[intopt._sorted_mol._bas[:, ATOM_OF]].flatten() n_total_hermite_density = intopt.density_offset[-1] - dm_pair_ordered = np.zeros(n_total_hermite_density) + dm_pair_ordered = np.empty(n_total_hermite_density) libgint.GINTinit_J_density_rys_preprocess(dm.ctypes.data_as(ctypes.c_void_p), dm_pair_ordered.ctypes.data_as(ctypes.c_void_p), - ctypes.c_int(1), ctypes.c_int(nao_cart), ctypes.c_int(len(intopt.bas_pairs_locs) - 1), + ctypes.c_int(1), ctypes.c_int(nao_cart), + ctypes.c_int(len(intopt.bas_pairs_locs) - 1), intopt.bas_pair2shls.ctypes.data_as(ctypes.c_void_p), intopt.bas_pairs_locs.ctypes.data_as(ctypes.c_void_p), l_ij.ctypes.data_as(ctypes.c_void_p), @@ -252,8 +249,7 @@ def get_int3c1e_ip2_density_contracted(mol, grids, charge_exponents, dm, intopt) int3c_density_contracted = cp.zeros([3, ngrids], order='C') - for i_grid_split in range(0, ngrids, ngrids_per_split): - ngrids_of_split = np.min([ngrids_per_split, ngrids - i_grid_split]) + for p0, p1 in lib.prange(0, ngrids, ngrids_per_split): for cp_ij_id, _ in enumerate(intopt.log_qs): stream = cp.cuda.get_current_stream() @@ -264,7 +260,7 @@ def get_int3c1e_ip2_density_contracted(mol, grids, charge_exponents, dm, intopt) charge_exponents_pointer = c_null_ptr() if charge_exponents is not None: - charge_exponents_pointer = charge_exponents[i_grid_split : i_grid_split + ngrids_of_split].data.ptr + charge_exponents_pointer = charge_exponents[p0:p1].data.ptr # n_pair_sum_per_thread = 1 # means every thread processes one pair and one grid # n_pair_sum_per_thread = nao_cart # or larger number gaurantees one thread processes one grid and all pairs of the same type @@ -273,12 +269,12 @@ def get_int3c1e_ip2_density_contracted(mol, grids, charge_exponents, dm, intopt) err = libgint.GINTfill_int3c1e_ip2_density_contracted( ctypes.cast(stream.ptr, ctypes.c_void_p), intopt.bpcache, - ctypes.cast(grids[i_grid_split : i_grid_split + ngrids_of_split, :].data.ptr, ctypes.c_void_p), + ctypes.cast(grids[p0:p1, :].data.ptr, ctypes.c_void_p), ctypes.cast(charge_exponents_pointer, ctypes.c_void_p), - ctypes.c_int(ngrids_of_split), + ctypes.c_int(p1-p0), ctypes.cast(dm_pair_ordered.data.ptr, ctypes.c_void_p), intopt.density_offset.ctypes.data_as(ctypes.c_void_p), - ctypes.cast(int3c_density_contracted[:, i_grid_split : i_grid_split + ngrids_of_split].data.ptr, ctypes.c_void_p), + ctypes.cast(int3c_density_contracted[:, p0:p1].data.ptr, ctypes.c_void_p), bins_locs_ij.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(nbins), ctypes.c_int(cp_ij_id), diff --git a/gpu4pyscf/solvent/grad/pcm.py b/gpu4pyscf/solvent/grad/pcm.py index 0544f751..3fe7cb6c 100644 --- a/gpu4pyscf/solvent/grad/pcm.py +++ b/gpu4pyscf/solvent/grad/pcm.py @@ -24,7 +24,7 @@ from pyscf import lib from pyscf import gto from pyscf.grad import rhf as rhf_grad - +from gpu4pyscf.gto import int3c1e from gpu4pyscf.solvent.pcm import PI, switch_h, libsolvent from gpu4pyscf.gto.int3c1e_ip import int1e_grids_ip1, int1e_grids_ip2 from gpu4pyscf.lib.cupy_helper import contract @@ -239,11 +239,16 @@ def grad_qv(pcmobj, dm): grid_coords = pcmobj.surface['grid_coords'] q_sym = pcmobj._intermediates['q_sym'] - dvj = int1e_grids_ip1(mol, grid_coords, dm = dm, charges = q_sym, direct_scf_tol = 1e-14, charge_exponents = charge_exp**2) - dq = int1e_grids_ip2(mol, grid_coords, dm = dm, charges = q_sym, direct_scf_tol = 1e-14, charge_exponents = charge_exp**2) + intopt = int3c1e.VHFOpt(mol) + intopt.build(1e-14, aosym=False) + dvj = int1e_grids_ip1(mol, grid_coords, dm = dm, charges = q_sym, + direct_scf_tol = 1e-14, charge_exponents = charge_exp**2, + intopt=intopt) + dq = int1e_grids_ip2(mol, grid_coords, dm = dm, charges = q_sym, + direct_scf_tol = 1e-14, charge_exponents = charge_exp**2, + intopt=intopt) aoslice = mol.aoslice_by_atom() - aoslice = cupy.array(aoslice) dvj = 2.0 * cupy.asarray([cupy.sum(dvj[:,p0:p1], axis=1) for p0,p1 in aoslice[:,2:]]) dq = cupy.asarray([cupy.sum(dq[:,p0:p1], axis=1) for p0,p1 in gridslice]) de = dq + dvj diff --git a/gpu4pyscf/solvent/grad/smd.py b/gpu4pyscf/solvent/grad/smd.py index a3d850db..32ebc2ee 100644 --- a/gpu4pyscf/solvent/grad/smd.py +++ b/gpu4pyscf/solvent/grad/smd.py @@ -25,100 +25,10 @@ from gpu4pyscf.solvent import pcm, smd from gpu4pyscf.solvent.grad import pcm as pcm_grad from gpu4pyscf.lib import logger -from gpu4pyscf.lib.cupy_helper import contract def get_cds(smdobj): return smd.get_cds_legacy(smdobj)[1] -""" -def grad_solver(smdobj, dm): - ''' - dE = 0.5*v* d(K^-1 R) *v + q*dv - v^T* d(K^-1 R)v = v^T*K^-1(dR - dK K^-1R)v = v^T K^-1(dR - dK q) - ''' - mol = smdobj.mol - log = logger.new_logger(mol, mol.verbose) - t1 = log.init_timer() - if not smdobj._intermediates: - smdobj.build() - dm_cache = smdobj._intermediates.get('dm', None) - if dm_cache is not None and cupy.linalg.norm(dm_cache - dm) < 1e-10: - pass - else: - smdobj._get_vind(dm) - - gridslice = smdobj.surface['gslice_by_atom'] - v_grids = smdobj._intermediates['v_grids'] - A = smdobj._intermediates['A'] - D = smdobj._intermediates['D'] - S = smdobj._intermediates['S'] - K = smdobj._intermediates['K'] - q = smdobj._intermediates['q'] - - vK_1 = cupy.linalg.solve(K.T, v_grids) - - dF, dA = pcm_grad.get_dF_dA(smdobj.surface) - - with_D = smdobj.method.upper() in ['IEF-PCM', 'IEFPCM', 'SS(V)PE', 'SMD'] - dD, dS, dSii = pcm_grad.get_dD_dS(smdobj.surface, dF, with_D=with_D, with_S=True) - - epsilon = smdobj.eps - de = cupy.zeros([smdobj.mol.natm,3]) - - def contract_bra(a, B, c): - ''' i,xij,j->jx ''' - tmp = a.dot(B) - return (tmp * c).T - - def contract_ket(a, B, c): - ''' i,xij,j->ix ''' - tmp = B.dot(c) - return (a*tmp).T - - # IEF-PCM and SS(V)PE formally are the same in gradient calculation - # dR = f_eps/(2*pi) * (dD*A + D*dA), - # dK = dS - f_eps/(2*pi) * (dD*A*S + D*dA*S + D*A*dS) - f_epsilon = (epsilon - 1.0)/(epsilon + 1.0) - fac = f_epsilon/(2.0*np.pi) - - Av = A*v_grids - de_dR = 0.5*fac * contract_ket(vK_1, dD, Av) - de_dR -= 0.5*fac * contract_bra(vK_1, dD, Av) - de_dR = cupy.asarray([cupy.sum(de_dR[p0:p1], axis=0) for p0,p1 in gridslice]) - - vK_1_D = vK_1.dot(D) - vK_1_Dv = vK_1_D * v_grids - de_dR += 0.5*fac * contract('j,xjn->nx', vK_1_Dv, dA) - - de_dS0 = 0.5*contract_ket(vK_1, dS, q) - de_dS0 -= 0.5*contract_bra(vK_1, dS, q) - de_dS0 = cupy.asarray([cupy.sum(de_dS0[p0:p1], axis=0) for p0,p1 in gridslice]) - - vK_1_q = vK_1 * q - de_dS0 += 0.5*contract('i,xin->nx', vK_1_q, dSii) - - vK_1_DA = vK_1_D*A - de_dS1 = 0.5*contract_ket(vK_1_DA, dS, q) - de_dS1 -= 0.5*contract_bra(vK_1_DA, dS, q) - de_dS1 = cupy.asarray([cupy.sum(de_dS1[p0:p1], axis=0) for p0,p1 in gridslice]) - - vK_1_DAq = vK_1_DA*q - de_dS1 += 0.5*contract('j,xjn->nx', vK_1_DAq, dSii) - - Sq = cupy.dot(S,q) - ASq = A*Sq - de_dD = 0.5*contract_ket(vK_1, dD, ASq) - de_dD -= 0.5*contract_bra(vK_1, dD, ASq) - de_dD = cupy.asarray([cupy.sum(de_dD[p0:p1], axis=0) for p0,p1 in gridslice]) - - de_dA = 0.5*contract('j,xjn->nx', vK_1_D*Sq, dA) # 0.5*cupy.einsum('j,xjn,j->nx', vK_1_D, dA, Sq) - - de_dK = de_dS0 - fac * (de_dD + de_dA + de_dS1) - de += de_dR - de_dK - - t1 = log.timer_debug1('grad solver', *t1) - return de.get() -""" grad_solver = pcm_grad.grad_solver def make_grad_object(grad_method): diff --git a/gpu4pyscf/tests/020_Vitamin_C.xyz b/gpu4pyscf/tests/020_Vitamin_C.xyz new file mode 100644 index 00000000..e119c6d3 --- /dev/null +++ b/gpu4pyscf/tests/020_Vitamin_C.xyz @@ -0,0 +1,22 @@ +20 +Vitamin C +C -0.07551087 1.68127663 -0.10745193 +O 1.33621755 1.87147409 -0.39326987 +C 1.67074668 2.95729545 0.49387976 +C 0.41740763 3.77281969 0.78495878 +C -0.60481480 3.07572636 0.28906224 +H -0.19316298 1.01922455 0.72486113 +O 0.35092043 5.03413298 1.45545728 +H 0.42961487 5.74279041 0.81264173 +O -1.95331750 3.53349874 0.15912025 +H -2.55333895 2.78846397 0.23972698 +O 2.81976302 3.20110148 0.94542226 +C -0.81772499 1.09230218 -1.32146482 +H -0.70955636 1.74951833 -2.15888136 +C -2.31163857 0.93420736 -0.98260166 +H -2.72575463 1.89080093 -0.74107186 +H -2.41980721 0.27699120 -0.14518512 +O -0.26428017 -0.18613595 -1.64425697 +H -0.72695910 -0.55328886 -2.40104423 +O -3.00083741 0.38730252 -2.10989934 +H -3.93210821 0.28874990 -1.89865997 diff --git a/gpu4pyscf/tests/057_Tamoxifen.xyz b/gpu4pyscf/tests/057_Tamoxifen.xyz new file mode 100644 index 00000000..b51df6f5 --- /dev/null +++ b/gpu4pyscf/tests/057_Tamoxifen.xyz @@ -0,0 +1,59 @@ +57 +Tamoxifen +C -1.42666665 1.35988349 0.01780185 +C -0.75139234 2.53486079 0.01780185 +C -2.96666665 1.35988349 0.01780185 +C -3.66418809 0.15160568 0.01780185 +C -3.66417225 2.56778831 0.01791304 +C -5.05890001 0.15132789 0.01723115 +H -3.11399504 -0.80051230 0.01693694 +C -5.05931013 2.56768367 0.01833813 +H -3.11457497 3.52019148 0.01809296 +C -5.75673144 1.35973487 0.01785909 +H -5.60876287 -0.80100973 0.01659711 +H -5.60899513 3.52021733 0.01884114 +H -6.85641138 1.35926586 0.01746817 +C -1.51874951 3.87006226 0.01780185 +C -1.63823871 4.60590036 -1.16149287 +C -2.09440347 4.34371845 1.19670832 +C -2.33266580 5.81544273 -1.16163975 +H -1.18363273 4.23258432 -2.09058400 +C -2.78991814 5.55312706 1.19651365 +H -2.00047584 3.76380313 2.12622693 +C -2.90901419 6.28907563 0.01764434 +H -2.42635385 6.39580205 -2.09099551 +H -3.24404320 5.92613353 2.12608927 +C 0.78860766 2.53486079 0.01780185 +C 1.48612910 3.74313859 0.01780185 +C 1.48611327 1.32695597 0.01791304 +C 2.88084102 3.74341639 0.01723115 +H 0.93593606 4.69525658 0.01693694 +C 2.88125115 1.32706060 0.01833813 +H 0.93651599 0.37455279 0.01809296 +C 3.57867246 2.53500940 0.01785909 +H 3.43070389 4.69575400 0.01659711 +H 3.43093615 0.37452694 0.01884114 +H 4.67835240 2.53547842 0.01746817 +C -0.65930948 0.02468201 0.01780185 +H -0.04466478 -0.03344716 -0.85611628 +H -0.04386363 -0.03298673 0.89118649 +C -1.66236338 -1.14385651 0.01856968 +H -2.27713573 -1.08561745 0.89239069 +H -2.27768159 -1.08629703 -0.85491210 +H -1.12919956 -2.07156136 0.01876393 +O -3.62101473 7.52921876 0.01715974 +C -2.69982994 8.60858726 0.19402752 +H -2.03011871 8.64615667 -0.63962434 +H -2.14108178 8.45680900 1.09384076 +C -3.47584819 9.93535894 0.28927757 +H -4.05456450 10.07469158 -0.59986462 +H -4.12694690 9.90759901 1.13792346 +C -1.65137806 10.90285045 1.72438609 +H -2.24764703 10.40869908 2.46274761 +H -0.79110440 10.30633800 1.50302183 +H -1.33836538 11.85545774 2.09783276 +C -3.25771829 12.42866058 0.53449492 +H -2.56611180 13.24181825 0.60767325 +H -3.86037095 12.55070987 -0.34118410 +H -3.88574784 12.41553739 1.40069735 +N -2.48185199 11.10154878 0.44281205 diff --git a/gpu4pyscf/tests/095_Azadirachtin.xyz b/gpu4pyscf/tests/095_Azadirachtin.xyz new file mode 100644 index 00000000..8c03f7bb --- /dev/null +++ b/gpu4pyscf/tests/095_Azadirachtin.xyz @@ -0,0 +1,97 @@ +95 +Azadirachtin +C 0.24028400 -0.96854600 0.05735800 +C 1.49955800 -0.38999400 0.79976500 +C 1.84405900 1.11309900 0.52612700 +C 0.61115200 2.06994900 0.41027500 +C -0.38718900 1.44909800 -0.58288900 +C -0.81198100 0.11367700 0.01403200 +H 1.34464500 -0.48336800 1.89667000 +H 0.90815500 3.09474100 0.10955200 +H 0.07146500 1.40030200 -1.59457300 +H -1.08538000 0.33936800 1.09841400 +O -0.03234300 2.14051500 1.69756400 +H 0.43832200 2.76739400 2.27637900 +O -1.64345600 2.15598600 -0.77527600 +C -2.74935800 1.17918600 -0.75355500 +H -3.33770900 1.41858200 0.14457000 +H -3.31820200 1.39744800 -1.66649800 +C -2.11058900 -0.22990000 -0.71994400 +C 2.72998200 1.32748400 -0.70483200 +H 2.81316800 2.38444500 -0.97758400 +H 3.74960400 0.95856700 -0.53283000 +H 2.35200700 0.78104000 -1.58051000 +C 2.60140000 -1.34386400 0.30659000 +C 0.84678200 -1.40613600 -1.29617000 +H 0.88274800 -0.59319600 -2.03951200 +H 0.38815200 -2.30137400 -1.74034600 +O 2.22547600 -1.78168600 -1.02946800 +C -0.42290800 -2.19363100 0.75277400 +H -0.32012900 -3.08353500 0.10236100 +C -1.91400700 -2.00763500 1.11237500 +H -2.33420900 -2.99527800 1.38379200 +H -1.98093100 -1.38866600 2.03106200 +C -2.81353800 -1.37055100 0.02719800 +H -3.12020000 -2.14713900 -0.69849000 +C -1.82661295 -0.68751599 -2.16270012 +O -1.03585236 -0.24261727 -2.99355789 +O -2.59156054 -1.74766325 -2.52650357 +C -2.29916153 -2.14198817 -3.86960099 +H -2.96290254 -2.92828960 -4.16299137 +H -2.42740743 -1.30452633 -4.52313804 +H -1.28838658 -2.48820275 -3.92764814 +O -4.01986539 -0.90962471 0.64138134 +C -4.89301012 -1.93494775 0.80793745 +O -4.54153100 -3.05110585 0.42818050 +C -6.20834727 -1.48087047 1.46771166 +H -6.70958996 -0.78829922 0.82428269 +H -6.83594045 -2.33131805 1.63434212 +H -5.99341406 -1.00749899 2.40292455 +O 0.29104226 -2.52037085 1.94793763 +C 0.31248536 -3.86361432 2.13937213 +O -0.25336168 -4.56806573 1.30443072 +C 1.07328546 -4.25938123 3.41849362 +C 1.18469713 -5.56341278 3.77014145 +H 0.75137836 -6.32562659 3.15681858 +C 1.70966559 -3.16955354 4.30104443 +H 2.52793619 -2.72004059 3.77829081 +H 0.97813456 -2.42251044 4.52839648 +H 2.06508607 -3.60889199 5.20964665 +C 1.93754031 -5.94957419 5.05688405 +H 1.46239499 -5.49165555 5.89917107 +H 1.92238977 -7.01309886 5.17344190 +H 2.95091533 -5.61227499 4.99207421 +C 3.99823568 -0.71610148 0.14421916 +O 4.54063921 0.18499764 0.78248292 +O 4.69984280 -1.27738694 -0.87269582 +O 2.69271189 -2.53050618 1.09933364 +H 3.60067733 -2.84219679 1.10624230 +C 5.98847134 -0.66885730 -0.99113633 +H 6.49970371 -0.73075570 -0.05320774 +H 6.55618159 -1.17887968 -1.74112449 +H 5.87374685 0.35839671 -1.26770006 +C 2.63486992 1.58151749 1.76176538 +C 2.13434327 2.21842175 3.11643757 +C 3.90461234 2.45387090 1.74128354 +O 2.44467967 0.78466796 2.96396625 +C 3.35337126 2.98709450 3.79243900 +C 0.74513758 2.60743687 3.44136489 +O 5.00327683 3.19196370 1.11718214 +C 4.47769203 2.16352749 3.16877423 +H 3.15573566 3.35599353 1.51547111 +C 3.84794511 4.41584726 3.25643717 +H 3.24116904 2.99889070 4.88162906 +H 0.00697023 1.93995296 2.97068106 +H 0.55491721 2.57388288 4.52449549 +H 0.54134467 3.63458255 3.09753074 +C 4.84981258 4.42246076 1.92099071 +H 4.49637929 1.09030804 3.43212004 +H 5.51163803 2.50195502 3.32489990 +C 4.76579887 5.04464694 4.26535741 +O 2.75093022 5.20578033 2.83652107 +H 4.60685318 5.22136931 1.20459035 +O 6.17282363 4.70855901 2.47193815 +H 4.42807865 5.31783232 5.24674785 +C 6.01838353 5.12565144 3.78006571 +H 2.50011685 5.87405238 3.50751412 +H 6.95619123 5.44887224 4.20308201 diff --git a/gpu4pyscf/tests/test_dft.py b/gpu4pyscf/tests/test_dft.py deleted file mode 100644 index 94e7ed1e..00000000 --- a/gpu4pyscf/tests/test_dft.py +++ /dev/null @@ -1,180 +0,0 @@ -# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest -import numpy as np -import pyscf -import pytest -import cupy -from gpu4pyscf.dft import rks, uks - -def setUpModule(): - global mol - atom = ''' -C -0.07551087 1.68127663 -0.10745193 -O 1.33621755 1.87147409 -0.39326987 -C 1.67074668 2.95729545 0.49387976 -C 0.41740763 3.77281969 0.78495878 -C -0.60481480 3.07572636 0.28906224 -H -0.19316298 1.01922455 0.72486113 -O 0.35092043 5.03413298 1.45545728 -H 0.42961487 5.74279041 0.81264173 -O -1.95331750 3.53349874 0.15912025 -H -2.55333895 2.78846397 0.23972698 -O 2.81976302 3.20110148 0.94542226 -C -0.81772499 1.09230218 -1.32146482 -H -0.70955636 1.74951833 -2.15888136 -C -2.31163857 0.93420736 -0.98260166 -H -2.72575463 1.89080093 -0.74107186 -H -2.41980721 0.27699120 -0.14518512 -O -0.26428017 -0.18613595 -1.64425697 -H -0.72695910 -0.55328886 -2.40104423 -O -3.00083741 0.38730252 -2.10989934 -H -3.93210821 0.28874990 -1.89865997 -''' - - mol = pyscf.M(atom=atom, basis='def2-tzvpp', max_memory=32000, cart=0) - mol.output = '/dev/null' - mol.build() - mol.verbose = 1 - -def tearDownModule(): - global mol - mol.stdout.close() - del mol - -class KnownValues(unittest.TestCase): - @pytest.mark.smoke - def test_b3lyp_with_d3bj(self): - print('-------- DFRKS with D3(BJ) -------') - mf = rks.RKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit') - mf.grids.atom_grid = (99,590) - mf.conv_tol = 1e-10 - mf.conv_tol_cpscf = 1e-8 - mf.disp = 'd3bj' - e_dft = mf.kernel() - assert np.abs(e_dft - -685.0326965348272) < 1e-7 - - g = mf.nuc_grad_method().kernel() - assert np.abs(cupy.linalg.norm(g) - 0.17498362161082373) < 1e-5 - - h = mf.Hessian().kernel() - assert np.abs(cupy.linalg.norm(h) - 3.7684319231335377) < 1e-4 - - @pytest.mark.smoke - def test_b3lyp_d3bj(self): - print('-------- DFRKS with D3(BJ) -------') - mf = rks.RKS(mol, xc='b3lyp-d3bj').density_fit(auxbasis='def2-tzvpp-jkfit') - mf.grids.atom_grid = (99,590) - mf.conv_tol = 1e-10 - mf.conv_tol_cpscf = 1e-8 - e_dft = mf.kernel() - assert np.abs(e_dft - -685.0326965348272) < 1e-7 - - g = mf.nuc_grad_method().kernel() - assert np.abs(cupy.linalg.norm(g) - 0.17498362161082373) < 1e-5 - - h = mf.Hessian().kernel() - assert np.abs(cupy.linalg.norm(h) - 3.7684319231335377) < 1e-4 - - @pytest.mark.smoke - def test_DFUKS(self): - print('------- DFUKS with D3(BJ) -------') - mf = uks.UKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit') - mf.grids.atom_grid = (99,590) - mf.conv_tol = 1e-10 - mf.conv_tol_cpscf = 1e-8 - mf.disp = 'd3bj' - e_dft = mf.kernel() - assert np.abs(e_dft - -685.0326965349493) < 1e-7 - - g = mf.nuc_grad_method().kernel() - assert np.abs(cupy.linalg.norm(g) - 0.17498264516108836) < 1e-5 - - h = mf.Hessian().kernel() - assert np.abs(cupy.linalg.norm(h) - 3.768429871470736) < 1e-4 - - @pytest.mark.smoke - def test_RKS(self): - print('-------- RKS with D3(BJ) -------') - mf = rks.RKS(mol, xc='b3lyp') - mf.grids.atom_grid = (99,590) - mf.conv_tol = 1e-12 - mf.conv_tol_cpscf = 1e-8 - mf.disp = 'd3bj' - e_dft = mf.kernel() - assert np.abs(e_dft - -685.0325611822375) < 1e-7 - - g = mf.nuc_grad_method().kernel() - assert np.abs(cupy.linalg.norm(g) - 0.1750368231223345) < 1e-6 - - h = mf.Hessian().kernel() - assert np.abs(cupy.linalg.norm(h) - 3.760381249106394) < 1e-4 - - @pytest.mark.smoke - def test_UKS(self): - print('-------- UKS with D3(BJ) -------') - mf = uks.UKS(mol, xc='b3lyp') - mf.grids.atom_grid = (99,590) - mf.conv_tol = 1e-12 - mf.conv_tol_cpscf = 1e-8 - mf.disp = 'd3bj' - e_dft = mf.kernel() - assert np.abs(e_dft - -685.0325611822375) < 1e-7 - - g = mf.nuc_grad_method().kernel() - assert np.abs(cupy.linalg.norm(g) - 0.1750368231223345) < 1e-6 - - h = mf.Hessian().kernel() - assert np.abs(cupy.linalg.norm(h) - 3.760381249106394) < 1e-4 - - @pytest.mark.smoke - def test_DFRKS_with_SMD(self): - print('----- DFRKS with SMD -----') - mf = rks.RKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit') - mf = mf.SMD() - mf.grids.atom_grid = (99,590) - mf.conv_tol = 1e-10 - mf.conv_tol_cpscf = 1e-8 - mf.disp = 'd3bj' - e_dft = mf.kernel() - assert np.abs(e_dft - -685.0578838805443) < 1e-7 - - g = mf.nuc_grad_method().kernel() - assert np.abs(cupy.linalg.norm(g) - 0.16804945458657145) < 1e-5 - - h = mf.Hessian().kernel() - assert np.abs(cupy.linalg.norm(h) - 3.741783814494321) < 1e-4 - - @pytest.mark.smoke - def test_DFUKS_with_SMD(self): - print('------- DFUKS with SMD ---------') - mf = uks.UKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit') - mf = mf.SMD() - mf.grids.atom_grid = (99,590) - mf.conv_tol = 1e-10 - mf.conv_tol_cpscf = 1e-8 - mf.disp = 'd3bj' - e_dft = mf.kernel() - assert np.abs(e_dft - -685.05788388063) < 1e-7 - - g = mf.nuc_grad_method().kernel() - assert np.abs(cupy.linalg.norm(g) - 0.1680496465773684) < 1e-5 - - h = mf.Hessian().kernel() - assert np.abs(cupy.linalg.norm(h) - 3.7417788481647563) < 1e-4 - -if __name__ == "__main__": - print("Full Smoke Tests") - unittest.main() diff --git a/gpu4pyscf/tests/test_rks.py b/gpu4pyscf/tests/test_rks.py new file mode 100644 index 00000000..ebf9d8af --- /dev/null +++ b/gpu4pyscf/tests/test_rks.py @@ -0,0 +1,230 @@ +# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +import numpy as np +import pyscf +import pytest +import cupy +from gpu4pyscf.dft import rks, uks + +# Any task taking more than 1000s will be marked as 'slow' + +# How to run +# 1. run test only +# pytest test_rks.py --benchmark-disable -s -v -m "not slow" --durations=20 +# 2. benchmark less expensive tasks +# pytest test_rks.py -v -m "not slow" +# 3. benchmark all the tests +# pytest test_rks.py -v + +current_folder = os.path.dirname(os.path.abspath(__file__)) +small_mol = os.path.join(current_folder, '020_Vitamin_C.xyz') +median_mol = os.path.join(current_folder, '057_Tamoxifen.xyz') +large_mol = os.path.join(current_folder, '095_Azadirachtin.xyz') + +def run_rb3lyp(atom, basis, with_df, with_solvent, disp=None): + mol = pyscf.M(atom=atom, basis=basis, verbose=0) + mf = rks.RKS(mol, xc='b3lyp') + if with_df: + mf = mf.density_fit() + if with_solvent: + mf = mf.PCM() + mf.with_solvent.method = 'IEF-PCM' + if disp is not None: + mf.disp = disp + mf.grids.atom_grid = (99,590) + mf.conv_tol = 1e-10 + return mf.kernel() + +def run_rb3lyp_grad(atom, basis, with_df, with_solvent, disp=None): + mol = pyscf.M(atom=atom, basis=basis, verbose=0) + mf = rks.RKS(mol, xc='b3lyp') + if with_df: + mf = mf.density_fit() + if with_solvent: + mf = mf.PCM() + mf.with_solvent.method = 'IEF-PCM' + if disp is not None: + mf.disp = disp + mf.grids.atom_grid = (99,590) + mf.conv_tol = 1e-10 + mf.kernel() + g = mf.nuc_grad_method().kernel() + return g + +def run_rb3lyp_hessian(atom, basis, with_df, with_solvent, disp=None): + mol = pyscf.M(atom=atom, basis=basis, verbose=0) + mf = rks.RKS(mol, xc='b3lyp') + if with_df: + mf = mf.density_fit() + if with_solvent: + mf = mf.PCM() + mf.with_solvent.method = 'IEF-PCM' + if disp is not None: + mf.disp = disp + mf.grids.atom_grid = (99,590) + mf.conv_tol = 1e-10 + mf.conv_tol_cpscf = 1e-6 + mf.kernel() + h = mf.Hessian().kernel() + return h + +# DF +def test_df_rb3lyp(benchmark): + e = benchmark(run_rb3lyp, small_mol, 'def2-tzvpp', True, False) + print('testing df rb3lyp') + assert np.isclose(np.linalg.norm(e), 684.9998712035579, atol=1e-7) +def test_df_rb3lyp_grad(benchmark): + g = benchmark(run_rb3lyp_grad, small_mol, 'def2-tzvpp', True, False) + print('testing df rb3lyp grad') + assert np.isclose(np.linalg.norm(g), 0.17435941081837686, atol=1e-5) +@pytest.mark.slow +def test_df_rb3lyp_hessian(benchmark): + h = benchmark(run_rb3lyp_hessian, small_mol, 'def2-tzvpp', True, False) + print('testing df rb3lyp hessian') + assert np.isclose(np.linalg.norm(h), 3.7668761221997764, atol=1e-4) + +# Direct SCF +def test_rb3lyp(benchmark): + e = benchmark(run_rb3lyp, small_mol, 'def2-tzvpp', False, False) + print('testing rb3lyp') + assert np.isclose(np.linalg.norm(e), 684.999735850967, atol=1e-7) +def test_rb3lyp_grad(benchmark): + g = benchmark(run_rb3lyp_grad, small_mol, 'def2-tzvpp', False, False) + print('testing rb3lyp grad') + assert np.isclose(np.linalg.norm(g), 0.1744127474130983, atol=1e-5) +def test_rb3lyp_hessian(benchmark): + h = benchmark(run_rb3lyp_hessian, small_mol, 'def2-tzvpp', False, False) + print('testing rb3lyp hessian') + assert np.isclose(np.linalg.norm(h), 3.7588443634477833, atol=1e-4) + +# median molecule +def test_df_rb3lyp_median(benchmark): + e = benchmark(run_rb3lyp, median_mol, 'def2-tzvpp', True, False) + print('testing df rb3lyp median') + assert np.isclose(np.linalg.norm(e), 1138.371390377773, atol=1e-7) +def test_df_rb3lyp_grad_median(benchmark): + g = benchmark(run_rb3lyp_grad, median_mol, 'def2-tzvpp', True, False) + print('testing df rb3lyp grad median') + assert np.isclose(np.linalg.norm(g), 0.26010545073602614, atol=1e-4) +def test_df_rb3lyp_hessian_median(benchmark): + h = benchmark(run_rb3lyp_hessian, median_mol, 'def2-tzvpp', True, False) + print('testing df rb3lyp hessian median') + assert np.isclose(np.linalg.norm(h), 6.32514169232998, atol=1e-4) + +def test_rb3lyp_median(benchmark): + e = benchmark(run_rb3lyp, median_mol, 'def2-tzvpp', False, False) + print('testing rb3lyp median') + assert np.isclose(np.linalg.norm(e), 1138.3710752128077, atol=1e-7) +def test_rb3lyp_grad_median(benchmark): + g = benchmark(run_rb3lyp_grad, median_mol, 'def2-tzvpp', False, False) + print('testing rb3lyp grad median') + assert np.isclose(np.linalg.norm(g), 0.2601443836937988, atol=1e-5) +@pytest.mark.high_memory +@pytest.mark.slow +def test_rb3lyp_hessian_median(benchmark): + h = benchmark(run_rb3lyp_hessian, median_mol, 'def2-tzvpp', False, False) + print('testing rb3lyp hessian median') + assert np.isclose(np.linalg.norm(h)) + +# large molecule +def test_df_rb3lyp_large(benchmark): + e = benchmark(run_rb3lyp, large_mol, 'def2-tzvpp', True, False) + print('testing df rb3lyp large') + assert np.isclose(np.linalg.norm(e), 2564.198712152175, atol=1e-7) +def test_df_rb3lyp_grad_large(benchmark): + g = benchmark(run_rb3lyp_grad, large_mol, 'def2-tzvpp', True, False) + print('testing df rb3lyp grad large') + assert np.isclose(np.linalg.norm(g), 0.3784358687859323, atol=1e-5) +@pytest.mark.high_memory +@pytest.mark.slow +def test_df_rb3lyp_hessian_large(benchmark): + h = benchmark(run_rb3lyp_hessian, large_mol, 'def2-tzvpp', True, False) + print('testing df rb3lyp hessian large') + assert np.isclose(np.linalg.norm(h), 7.583208736873523, atol=1e-4) +@pytest.mark.slow +def test_rb3lyp_large(benchmark): + e = benchmark(run_rb3lyp, large_mol, 'def2-tzvpp', False, False) + print('testing rb3lyp large') + assert np.isclose(np.linalg.norm(e), 2564.198099576358, atol=1e-7) +@pytest.mark.slow +def test_rb3lyp_grad_large(benchmark): + g = benchmark(run_rb3lyp_grad, large_mol, 'def2-tzvpp', False, False) + print('testing rb3lyp grad large') + assert np.isclose(np.linalg.norm(g), 0.3784664384209763, atol=1e-5) +@pytest.mark.slow +def test_rb3lyp_hessian_large(benchmark): + h = benchmark(run_rb3lyp_hessian, large_mol, 'def2-tzvpp', False, False) + print('testing rb3lyp hessian large') + print(np.linalg.norm(h)) + +# small basis set +def test_df_rb3lyp_631gs(benchmark): + e = benchmark(run_rb3lyp, small_mol, '6-31gs', True, False) + print('testing df rb3lyp 631gs') + assert np.isclose(np.linalg.norm(e), 684.6646008642876, atol=1e-7) +def test_df_rb3lyp_631gs_grad(benchmark): + g = benchmark(run_rb3lyp_grad, small_mol, '6-31gs', True, False) + print('testing df rb3lyp 631gs grad') + assert np.isclose(np.linalg.norm(g), 0.17530687343398219, atol=1e-5) +def test_df_rb3lyp_631gs_hessian(benchmark): + h = benchmark(run_rb3lyp_hessian, small_mol, '6-31gs', True, False) + print('testing df rb3lyp 631gs hessian') + assert np.isclose(np.linalg.norm(h), 3.908874851569459, atol=1e-4) + +# small basis set for large molecule +def test_rb3lyp_631gs_large(benchmark): + e = benchmark(run_rb3lyp, large_mol, '6-31gs', False, False) + print('testing rb3lyp 631gs large') + assert np.isclose(np.linalg.norm(e), 2563.1171191823423, atol=1e-7) +def test_rb3lyp_631gs_grad_large(benchmark): + g = benchmark(run_rb3lyp_grad, large_mol, '6-31gs', False, False) + print('testing df rb3lyp 631gs grad large') + assert np.isclose(np.linalg.norm(g), 0.37778228700247984, atol=1e-5) +@pytest.mark.slow +def test_rb3lyp_631gs_hessian_large(benchmark): + h = benchmark(run_rb3lyp_hessian, large_mol, '6-31gs', False, False) + print('testing df rb3lyp 631gs hessian large') + assert np.isclose(np.linalg.norm(h), 7.920764634100053, atol=1e-4) + +#solvent model +def test_df_rb3lyp_631gs_solvent(benchmark): + e = benchmark(run_rb3lyp, small_mol, '6-31gs', True, True) + print('testing df rb3lyp 631gs solvent') + assert np.isclose(np.linalg.norm(e), 684.6985561053816, atol=1e-7) +def test_df_rb3lyp_631gs_solvent_grad(benchmark): + g = benchmark(run_rb3lyp_grad, small_mol, '6-31gs', True, True) + print('testing df rb3lyp 631gs solvent grad') + assert np.isclose(np.linalg.norm(g), 0.16956999476137297, atol=1e-5) +def test_df_rb3lyp_631gs_solvent_hessian(benchmark): + h = benchmark(run_rb3lyp_hessian, small_mol, '6-31gs', True, True) + print('testing df rb3lyp 631gs solvent hessian') + assert np.isclose(np.linalg.norm(h), 3.9008165041707294, atol=1e-4) + +# b3lyp d3bj +def test_df_rb3lyp_631gs_d3bj(benchmark): + e = benchmark(run_rb3lyp, small_mol, '6-31gs', True, True, 'd3bj') + print('testing df rb3lyp 631gs solvent') + assert np.isclose(np.linalg.norm(e), 684.7313814096565, atol=1e-7) +def test_df_rb3lyp_631gs_d3bj_grad(benchmark): + g = benchmark(run_rb3lyp_grad, small_mol, '6-31gs', True, True, 'd3bj') + print('testing df rb3lyp 631gs solvent grad') + assert np.isclose(np.linalg.norm(g), 0.17010044498887264, atol=1e-5) +def test_df_rb3lyp_631gs_d3bj_hessian(benchmark): + h = benchmark(run_rb3lyp_hessian, small_mol, '6-31gs', True, True, 'd3bj') + print('testing df rb3lyp 631gs solvent hessian') + assert np.isclose(np.linalg.norm(h), 3.902367554157861, atol=1e-4) + diff --git a/gpu4pyscf/tests/test_uks.py b/gpu4pyscf/tests/test_uks.py new file mode 100644 index 00000000..0e426f17 --- /dev/null +++ b/gpu4pyscf/tests/test_uks.py @@ -0,0 +1,92 @@ +# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +import numpy as np +import pyscf +import pytest +import cupy +from gpu4pyscf.dft import rks, uks + +current_folder = os.path.dirname(os.path.abspath(__file__)) +small_mol = os.path.join(current_folder, '020_Vitamin_C.xyz') + +def run_ub3lyp(atom, basis, with_df, with_solvent): + mol = pyscf.M(atom=atom, basis=basis, verbose=0) + mf = uks.UKS(mol, xc='b3lyp') + if with_df: + mf = mf.density_fit() + if with_solvent: + mf = mf.PCM() + mf.with_solvent.method = 'IEF-PCM' + mf.grids.atom_grid = (99,590) + mf.conv_tol = 1e-10 + return mf.kernel() + +def run_ub3lyp_grad(atom, basis, with_df, with_solvent): + mol = pyscf.M(atom=atom, basis=basis, verbose=0) + mf = uks.UKS(mol, xc='b3lyp') + if with_df: + mf = mf.density_fit() + if with_solvent: + mf = mf.PCM() + mf.with_solvent.method = 'IEF-PCM' + mf.grids.atom_grid = (99,590) + mf.conv_tol = 1e-10 + mf.kernel() + g = mf.nuc_grad_method().kernel() + return g + +def run_ub3lyp_hessian(atom, basis, with_df, with_solvent): + mol = pyscf.M(atom=atom, basis=basis, verbose=0) + mf = uks.UKS(mol, xc='b3lyp') + if with_df: + mf = mf.density_fit() + if with_solvent: + mf = mf.PCM() + mf.with_solvent.method = 'IEF-PCM' + mf.grids.atom_grid = (99,590) + mf.conv_tol = 1e-10 + mf.conv_tol_cpscf = 1e-6 + mf.kernel() + h = mf.Hessian().kernel() + return h + + +# UKS +def test_df_ub3lyp(benchmark): + e = benchmark(run_ub3lyp, small_mol, 'def2-tzvpp', True, False) + print('testing df ub3lyp') + assert np.isclose(np.linalg.norm(e), 684.9998712035856, atol=1e-7) +def test_df_ub3lyp_grad(benchmark): + g = benchmark(run_ub3lyp_grad, small_mol, 'def2-tzvpp', True, False) + print('testing df ub3lyp grad') + assert np.isclose(np.linalg.norm(g), 0.17435842214665462, atol=1e-5) +def test_df_ub3lyp_hessian(benchmark): + h = benchmark(run_ub3lyp_hessian, small_mol, 'def2-tzvpp', True, False) + print('testing df ub3lyp hessian') + assert np.isclose(np.linalg.norm(h), 3.7669464279078064, atol=1e-4) +def test_ub3lyp(benchmark): + e = benchmark(run_ub3lyp, small_mol, 'def2-tzvpp', False, False) + print('testing ub3lyp') + assert np.isclose(np.linalg.norm(e), 684.9997358509884, atol=1e-7) +def test_ub3lyp_grad(benchmark): + g = benchmark(run_ub3lyp_grad, small_mol, 'def2-tzvpp', False, False) + print('testing ub3lyp grad') + assert np.isclose(np.linalg.norm(g), 0.17441176110160253, atol=1e-5) +def test_ub3lyp_hessian(benchmark): + h = benchmark(run_ub3lyp_hessian, small_mol, 'def2-tzvpp', False, False) + print('testing ub3lyp hessian') + assert np.isclose(np.linalg.norm(h), 3.758916526520172, atol=1e-4) From fa6ac932eb932fd0ffaf911b989c35fb217db729 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Thu, 26 Dec 2024 18:06:36 +0000 Subject: [PATCH 17/49] assert hermi==1 --- gpu4pyscf/df/hessian/jk.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gpu4pyscf/df/hessian/jk.py b/gpu4pyscf/df/hessian/jk.py index f8992ca3..db77e4db 100644 --- a/gpu4pyscf/df/hessian/jk.py +++ b/gpu4pyscf/df/hessian/jk.py @@ -32,6 +32,7 @@ def _jk_task_with_mo1(dfobj, dms, mo_coeff, mo1s, occ_coeffs, ''' Calculate J and K matrices with mo response For CP-HF ''' + assert hermi == 1 with cupy.cuda.Device(device_id), _streams[device_id]: assert isinstance(dfobj.verbose, int) log = logger.new_logger(dfobj.mol, dfobj.verbose) From 9d9ff1e0920922372062db2cdd35d8d73adc73b4 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Thu, 26 Dec 2024 18:42:25 +0000 Subject: [PATCH 18/49] typo in uhf.hessian --- gpu4pyscf/hessian/rhf.py | 2 +- gpu4pyscf/hessian/uhf.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py index 53be3349..c8d407d7 100644 --- a/gpu4pyscf/hessian/rhf.py +++ b/gpu4pyscf/hessian/rhf.py @@ -659,7 +659,7 @@ def fvind_vo(mo1): avail_mem = get_avail_mem() # *4 for input dm, vj, vk, and vxc blksize = int(min(avail_mem*.3 / (8*3*nao*nocc*4), # in MO - avail_mem*.3 / (8*nmo*nmo*3*3))) # vj, vk, dm in AO + avail_mem*.3 / (8*nao*nao*3*3))) # vj, vk, dm in AO if blksize < ALIGNED**2: raise RuntimeError('GPU memory insufficient for solving CPHF equations') diff --git a/gpu4pyscf/hessian/uhf.py b/gpu4pyscf/hessian/uhf.py index 0f695b2b..88a6c9fd 100644 --- a/gpu4pyscf/hessian/uhf.py +++ b/gpu4pyscf/hessian/uhf.py @@ -292,8 +292,8 @@ def fvind_vo(mo1): avail_mem = get_avail_mem() # *8 for spin-up/down input dm, vj, vk, and vxc - blksize = int(min(avail_mem*.3 / (8*3*nao*nao*8), - avail_mem*.3 / (8*nmo*nmo*3*6))) # in vj, vk, dm in AO + blksize = int(min(avail_mem*.3 / (8*3*nao*nocc*8), + avail_mem*.3 / (8*nao*nao*3*6))) # in vj, vk, dm in AO if blksize < ALIGNED**2: raise RuntimeError('GPU memory insufficient') From 5c9e2e69406c10e91d7a6c15e834121ac1cedc6d Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Fri, 27 Dec 2024 00:34:19 +0000 Subject: [PATCH 19/49] inject gen_response into soscf --- gpu4pyscf/df/df.py | 1 + gpu4pyscf/grad/rhf.py | 73 ++++++------ gpu4pyscf/hessian/jk.py | 69 +++++------ gpu4pyscf/hessian/rhf.py | 185 +++++++++++++++--------------- gpu4pyscf/scf/jk.py | 69 +++++------ gpu4pyscf/scf/soscf.py | 2 +- gpu4pyscf/scf/tests/test_soscf.py | 12 +- 7 files changed, 213 insertions(+), 198 deletions(-) diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py index 52b0ecf8..4991af43 100644 --- a/gpu4pyscf/df/df.py +++ b/gpu4pyscf/df/df.py @@ -147,6 +147,7 @@ def loop(self, blksize=None, unpack=True): and unpack the CDERI in (Lij) format ''' device_id = cupy.cuda.Device().id + print(self._cderi.keys(), device_id) cderi_sparse = self._cderi[device_id] if blksize is None: blksize = self.get_blksize() diff --git a/gpu4pyscf/grad/rhf.py b/gpu4pyscf/grad/rhf.py index c3390e95..0ee8cd43 100644 --- a/gpu4pyscf/grad/rhf.py +++ b/gpu4pyscf/grad/rhf.py @@ -79,43 +79,41 @@ def _ejk_ip1_task(mol, dms, vhfopt, task_list, j_factor=1.0, k_factor=1.0, info = cp.empty(2, dtype=np.uint32) t1 = log.timer_debug1(f'q_cond and dm_cond on Device {device_id}', *cput0) - for i, j in task_list: + for i, j, k, l in task_list: ij_shls = (l_ctr_bas_loc[i], l_ctr_bas_loc[i+1], l_ctr_bas_loc[j], l_ctr_bas_loc[j+1]) tile_ij_mapping = tile_mappings[i,j] - for k in range(i+1): - for l in range(k+1): - llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})' - kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1], - l_ctr_bas_loc[l], l_ctr_bas_loc[l+1]) - tile_kl_mapping = tile_mappings[k,l] - scheme = _ejk_quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]]) - err = kern( - ctypes.cast(ejk.data.ptr, ctypes.c_void_p), - ctypes.c_double(j_factor), ctypes.c_double(k_factor), - ctypes.cast(dms.data.ptr, ctypes.c_void_p), - ctypes.c_int(n_dm), ctypes.c_int(nao), - vhfopt.rys_envs, (ctypes.c_int*2)(*scheme), - (ctypes.c_int*8)(*ij_shls, *kl_shls), - ctypes.c_int(tile_ij_mapping.size), - ctypes.c_int(tile_kl_mapping.size), - ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p), - ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p), - tile_q_ptr, q_ptr, s_ptr, - ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p), - ctypes.c_float(log_cutoff), - ctypes.cast(pool.data.ptr, ctypes.c_void_p), - ctypes.cast(info.data.ptr, ctypes.c_void_p), - ctypes.c_int(workers), - mol._atm.ctypes, ctypes.c_int(mol.natm), - mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes) - if err != 0: - raise RuntimeError(f'RYS_per_atom_jk_ip1 kernel for {llll} failed') - if log.verbose >= logger.DEBUG1: - msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}' - t1, t1p = log.timer_debug1(msg, *t1), t1 - timing_counter[llll] += t1[1] - t1p[1] - kern_counts += 1 + llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})' + kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1], + l_ctr_bas_loc[l], l_ctr_bas_loc[l+1]) + tile_kl_mapping = tile_mappings[k,l] + scheme = _ejk_quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]]) + err = kern( + ctypes.cast(ejk.data.ptr, ctypes.c_void_p), + ctypes.c_double(j_factor), ctypes.c_double(k_factor), + ctypes.cast(dms.data.ptr, ctypes.c_void_p), + ctypes.c_int(n_dm), ctypes.c_int(nao), + vhfopt.rys_envs, (ctypes.c_int*2)(*scheme), + (ctypes.c_int*8)(*ij_shls, *kl_shls), + ctypes.c_int(tile_ij_mapping.size), + ctypes.c_int(tile_kl_mapping.size), + ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p), + ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p), + tile_q_ptr, q_ptr, s_ptr, + ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p), + ctypes.c_float(log_cutoff), + ctypes.cast(pool.data.ptr, ctypes.c_void_p), + ctypes.cast(info.data.ptr, ctypes.c_void_p), + ctypes.c_int(workers), + mol._atm.ctypes, ctypes.c_int(mol.natm), + mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes) + if err != 0: + raise RuntimeError(f'RYS_per_atom_jk_ip1 kernel for {llll} failed') + if log.verbose >= logger.DEBUG1: + msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}' + t1, t1p = log.timer_debug1(msg, *t1), t1 + timing_counter[llll] += t1[1] - t1p[1] + kern_counts += 1 return ejk, kern_counts, timing_counter def _jk_energy_per_atom(mol, dm, vhfopt=None, @@ -145,7 +143,12 @@ def _jk_energy_per_atom(mol, dm, vhfopt=None, assert uniq_l.max() <= LMAX n_groups = len(uniq_l_ctr) - tasks = [(i,j) for i in range(n_groups) for j in range(i+1)] + tasks = [] + for i in range(n_groups): + for j in range(i+1): + for k in range(i+1): + for l in range(k+1): + tasks.append((i,j,k,l)) tasks = np.array(tasks) task_list = [] for device_id in range(_num_devices): diff --git a/gpu4pyscf/hessian/jk.py b/gpu4pyscf/hessian/jk.py index a1cd6105..6f17488d 100644 --- a/gpu4pyscf/hessian/jk.py +++ b/gpu4pyscf/hessian/jk.py @@ -91,41 +91,39 @@ def _jk_task(mol, dms, mo_coeff, mo_occ, vhfopt, task_list, hermi=0, info = cp.empty(2, dtype=np.uint32) t1 = log.timer_debug1(f'q_cond and dm_cond on Device {device_id}', *cput0) - for i, j in task_list: + for i, j, k, l in task_list: ij_shls = (l_ctr_bas_loc[i], l_ctr_bas_loc[i+1], l_ctr_bas_loc[j], l_ctr_bas_loc[j+1]) tile_ij_mapping = tile_mappings[i,j] - for k in range(i+1): - for l in range(k+1): - llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})' - kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1], - l_ctr_bas_loc[l], l_ctr_bas_loc[l+1]) - tile_kl_mapping = tile_mappings[k,l] - scheme = quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]]) - err = kern( - vj_ptr, vk_ptr, ctypes.cast(dms.data.ptr, ctypes.c_void_p), - ctypes.c_int(n_dm), ctypes.c_int(nao), - vhfopt.rys_envs, (ctypes.c_int*2)(*scheme), - (ctypes.c_int*8)(*ij_shls, *kl_shls), - ctypes.c_int(tile_ij_mapping.size), - ctypes.c_int(tile_kl_mapping.size), - ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p), - ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p), - tile_q_ptr, q_ptr, s_ptr, - ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p), - ctypes.c_float(log_cutoff), - ctypes.cast(pool.data.ptr, ctypes.c_void_p), - ctypes.cast(info.data.ptr, ctypes.c_void_p), - ctypes.c_int(workers), - mol._atm.ctypes, ctypes.c_int(mol.natm), - mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes) - if err != 0: - raise RuntimeError(f'RYS_build_jk kernel for {llll} failed') - if log.verbose >= logger.DEBUG1: - msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}' - t1, t1p = log.timer_debug1(msg, *t1), t1 - timing_counter[llll] += t1[1] - t1p[1] - kern_counts += 1 + llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})' + kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1], + l_ctr_bas_loc[l], l_ctr_bas_loc[l+1]) + tile_kl_mapping = tile_mappings[k,l] + scheme = quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]]) + err = kern( + vj_ptr, vk_ptr, ctypes.cast(dms.data.ptr, ctypes.c_void_p), + ctypes.c_int(n_dm), ctypes.c_int(nao), + vhfopt.rys_envs, (ctypes.c_int*2)(*scheme), + (ctypes.c_int*8)(*ij_shls, *kl_shls), + ctypes.c_int(tile_ij_mapping.size), + ctypes.c_int(tile_kl_mapping.size), + ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p), + ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p), + tile_q_ptr, q_ptr, s_ptr, + ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p), + ctypes.c_float(log_cutoff), + ctypes.cast(pool.data.ptr, ctypes.c_void_p), + ctypes.cast(info.data.ptr, ctypes.c_void_p), + ctypes.c_int(workers), + mol._atm.ctypes, ctypes.c_int(mol.natm), + mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes) + if err != 0: + raise RuntimeError(f'RYS_build_jk kernel for {llll} failed') + if log.verbose >= logger.DEBUG1: + msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}' + t1, t1p = log.timer_debug1(msg, *t1), t1 + timing_counter[llll] += t1[1] - t1p[1] + kern_counts += 1 if with_j: vj *= 2.0 vj = transpose_sum(vj) @@ -192,7 +190,12 @@ def get_jk(mol, dm, mo_coeff, mo_occ, hermi=0, vhfopt=None, l_symb = [lib.param.ANGULAR[i] for i in uniq_l] n_groups = np.count_nonzero(uniq_l <= LMAX) - tasks = [(i,j) for i in range(n_groups) for j in range(i+1)] + tasks = [] + for i in range(n_groups): + for j in range(i+1): + for k in range(i+1): + for l in range(k+1): + tasks.append((i,j,k,l)) tasks = np.array(tasks) task_list = [] for device_id in range(_num_devices): diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py index c8d407d7..d7596d13 100644 --- a/gpu4pyscf/hessian/rhf.py +++ b/gpu4pyscf/hessian/rhf.py @@ -204,62 +204,60 @@ def _ejk_ip2_task(mol, dms, vhfopt, task_list, j_factor=1.0, k_factor=1.0, info = cp.empty(2, dtype=np.uint32) t1 = log.timer_debug1(f'q_cond and dm_cond on Device {device_id}', *cput0) - for i, j in task_list: + for i, j, k, l in task_list: ij_shls = (l_ctr_bas_loc[i], l_ctr_bas_loc[i+1], l_ctr_bas_loc[j], l_ctr_bas_loc[j+1]) tile_ij_mapping = tile_mappings[i,j] - for k in range(i+1): - for l in range(k+1): - llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})' - kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1], - l_ctr_bas_loc[l], l_ctr_bas_loc[l+1]) - tile_kl_mapping = tile_mappings[k,l] - scheme = _ip2_quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]]) - err1 = kern1( - ctypes.cast(ejk.data.ptr, ctypes.c_void_p), - ctypes.c_double(j_factor), ctypes.c_double(k_factor), - ctypes.cast(dms.data.ptr, ctypes.c_void_p), - ctypes.c_int(n_dm), ctypes.c_int(nao), - vhfopt.rys_envs, (ctypes.c_int*2)(*scheme), - (ctypes.c_int*8)(*ij_shls, *kl_shls), - ctypes.c_int(tile_ij_mapping.size), - ctypes.c_int(tile_kl_mapping.size), - ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p), - ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p), - tile_q_ptr, q_ptr, s_ptr, - ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p), - ctypes.c_float(log_cutoff), - ctypes.cast(pool.data.ptr, ctypes.c_void_p), - ctypes.cast(info.data.ptr, ctypes.c_void_p), - ctypes.c_int(workers), - mol._atm.ctypes, ctypes.c_int(mol.natm), - mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes) - err2 = kern2( - ctypes.cast(ejk.data.ptr, ctypes.c_void_p), - ctypes.c_double(j_factor), ctypes.c_double(k_factor), - ctypes.cast(dms.data.ptr, ctypes.c_void_p), - ctypes.c_int(n_dm), ctypes.c_int(nao), - vhfopt.rys_envs, (ctypes.c_int*2)(*scheme), - (ctypes.c_int*8)(*ij_shls, *kl_shls), - ctypes.c_int(tile_ij_mapping.size), - ctypes.c_int(tile_kl_mapping.size), - ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p), - ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p), - tile_q_ptr, q_ptr, s_ptr, - ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p), - ctypes.c_float(log_cutoff), - ctypes.cast(pool.data.ptr, ctypes.c_void_p), - ctypes.cast(info.data.ptr, ctypes.c_void_p), - ctypes.c_int(workers), - mol._atm.ctypes, ctypes.c_int(mol.natm), - mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes) - if err1 != 0 or err2 != 0: - raise RuntimeError(f'RYS_per_atom_jk_ip2 kernel for {llll} failed') - if log.verbose >= logger.DEBUG1: - msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}' - t1, t1p = log.timer_debug1(msg, *t1), t1 - timing_counter[llll] += t1[1] - t1p[1] - kern_counts += 1 + llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})' + kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1], + l_ctr_bas_loc[l], l_ctr_bas_loc[l+1]) + tile_kl_mapping = tile_mappings[k,l] + scheme = _ip2_quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]]) + err1 = kern1( + ctypes.cast(ejk.data.ptr, ctypes.c_void_p), + ctypes.c_double(j_factor), ctypes.c_double(k_factor), + ctypes.cast(dms.data.ptr, ctypes.c_void_p), + ctypes.c_int(n_dm), ctypes.c_int(nao), + vhfopt.rys_envs, (ctypes.c_int*2)(*scheme), + (ctypes.c_int*8)(*ij_shls, *kl_shls), + ctypes.c_int(tile_ij_mapping.size), + ctypes.c_int(tile_kl_mapping.size), + ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p), + ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p), + tile_q_ptr, q_ptr, s_ptr, + ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p), + ctypes.c_float(log_cutoff), + ctypes.cast(pool.data.ptr, ctypes.c_void_p), + ctypes.cast(info.data.ptr, ctypes.c_void_p), + ctypes.c_int(workers), + mol._atm.ctypes, ctypes.c_int(mol.natm), + mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes) + err2 = kern2( + ctypes.cast(ejk.data.ptr, ctypes.c_void_p), + ctypes.c_double(j_factor), ctypes.c_double(k_factor), + ctypes.cast(dms.data.ptr, ctypes.c_void_p), + ctypes.c_int(n_dm), ctypes.c_int(nao), + vhfopt.rys_envs, (ctypes.c_int*2)(*scheme), + (ctypes.c_int*8)(*ij_shls, *kl_shls), + ctypes.c_int(tile_ij_mapping.size), + ctypes.c_int(tile_kl_mapping.size), + ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p), + ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p), + tile_q_ptr, q_ptr, s_ptr, + ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p), + ctypes.c_float(log_cutoff), + ctypes.cast(pool.data.ptr, ctypes.c_void_p), + ctypes.cast(info.data.ptr, ctypes.c_void_p), + ctypes.c_int(workers), + mol._atm.ctypes, ctypes.c_int(mol.natm), + mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes) + if err1 != 0 or err2 != 0: + raise RuntimeError(f'RYS_per_atom_jk_ip2 kernel for {llll} failed') + if log.verbose >= logger.DEBUG1: + msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}' + t1, t1p = log.timer_debug1(msg, *t1), t1 + timing_counter[llll] += t1[1] - t1p[1] + kern_counts += 1 ejk = ejk + ejk.transpose(1,0,3,2) return ejk, kern_counts, timing_counter @@ -286,7 +284,12 @@ def _partial_ejk_ip2(mol, dm, vhfopt=None, j_factor=1., k_factor=1., verbose=Non assert uniq_l.max() <= LMAX n_groups = len(uniq_l_ctr) - tasks = [(i,j) for i in range(n_groups) for j in range(i+1)] + tasks = [] + for i in range(n_groups): + for j in range(i+1): + for k in range(i+1): + for l in range(k+1): + tasks.append((i,j,k,l)) tasks = np.array(tasks) task_list = [] for device_id in range(_num_devices): @@ -394,7 +397,6 @@ def _build_jk_ip1_task(mol, dms, vhfopt, task_list, atoms_slice, uniq_l = uniq_l_ctr[:,0] l_ctr_bas_loc = vhfopt.l_ctr_offsets l_symb = [lib.param.ANGULAR[i] for i in uniq_l] - n_groups = len(uniq_l_ctr) kern = libvhf_rys.RYS_build_jk_ip1 timing_counter = Counter() @@ -426,7 +428,7 @@ def _build_jk_ip1_task(mol, dms, vhfopt, task_list, atoms_slice, info = cp.empty(2, dtype=np.uint32) t1 = log.timer_debug1(f'q_cond and dm_cond on Device {device_id}', *cput0) - for i, j in task_list: + for i, j, k, l in task_list: ij_shls = (l_ctr_bas_loc[i], l_ctr_bas_loc[i+1], l_ctr_bas_loc[j], l_ctr_bas_loc[j+1]) ish0, ish1 = l_ctr_bas_loc[i], l_ctr_bas_loc[i+1] @@ -441,39 +443,37 @@ def _build_jk_ip1_task(mol, dms, vhfopt, task_list, atoms_slice, cp.arange(jsh0, jsh1, dtype=np.int32)) idx = cp.argsort(sub_tile_q[mask])[::-1] tile_ij_mapping = t_ij[mask][idx] - for k in range(n_groups): - for l in range(k+1): - llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})' - kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1], - l_ctr_bas_loc[l], l_ctr_bas_loc[l+1]) - tile_kl_mapping = tril_tile_mappings[k,l] - scheme = _ip1_quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]]) - err = kern( - vj_ptr, vk_ptr, ctypes.cast(dms.data.ptr, ctypes.c_void_p), - ctypes.c_int(n_dm), ctypes.c_int(nao), ctypes.c_int(atom0), - vhfopt.rys_envs, (ctypes.c_int*2)(*scheme), - (ctypes.c_int*8)(*ij_shls, *kl_shls), - ctypes.c_int(tile_ij_mapping.size), - ctypes.c_int(tile_kl_mapping.size), - ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p), - ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p), - ctypes.cast(vhfopt.tile_q_cond.data.ptr, ctypes.c_void_p), - ctypes.cast(vhfopt.q_cond.data.ptr, ctypes.c_void_p), - lib.c_null_ptr(), - ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p), - ctypes.c_float(log_cutoff), - ctypes.cast(pool.data.ptr, ctypes.c_void_p), - ctypes.cast(info.data.ptr, ctypes.c_void_p), - ctypes.c_int(workers), - mol._atm.ctypes, ctypes.c_int(mol.natm), - mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes) - if err != 0: - raise RuntimeError(f'RYS_build_jk kernel for {llll} failed') - if log.verbose >= logger.DEBUG1: - msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}' - t1, t1p = log.timer_debug1(msg, *t1), t1 - timing_counter[llll] += t1[1] - t1p[1] - kern_counts += 1 + llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})' + kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1], + l_ctr_bas_loc[l], l_ctr_bas_loc[l+1]) + tile_kl_mapping = tril_tile_mappings[k,l] + scheme = _ip1_quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]]) + err = kern( + vj_ptr, vk_ptr, ctypes.cast(dms.data.ptr, ctypes.c_void_p), + ctypes.c_int(n_dm), ctypes.c_int(nao), ctypes.c_int(atom0), + vhfopt.rys_envs, (ctypes.c_int*2)(*scheme), + (ctypes.c_int*8)(*ij_shls, *kl_shls), + ctypes.c_int(tile_ij_mapping.size), + ctypes.c_int(tile_kl_mapping.size), + ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p), + ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p), + ctypes.cast(vhfopt.tile_q_cond.data.ptr, ctypes.c_void_p), + ctypes.cast(vhfopt.q_cond.data.ptr, ctypes.c_void_p), + lib.c_null_ptr(), + ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p), + ctypes.c_float(log_cutoff), + ctypes.cast(pool.data.ptr, ctypes.c_void_p), + ctypes.cast(info.data.ptr, ctypes.c_void_p), + ctypes.c_int(workers), + mol._atm.ctypes, ctypes.c_int(mol.natm), + mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes) + if err != 0: + raise RuntimeError(f'RYS_build_jk kernel for {llll} failed') + if log.verbose >= logger.DEBUG1: + msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}' + t1, t1p = log.timer_debug1(msg, *t1), t1 + timing_counter[llll] += t1[1] - t1p[1] + kern_counts += 1 return vj, vk, kern_counts, timing_counter def _get_jk_ip1(mol, dm, with_j=True, with_k=True, atoms_slice=None, verbose=None): @@ -516,7 +516,12 @@ def _get_jk_ip1(mol, dm, with_j=True, with_k=True, atoms_slice=None, verbose=Non assert vhfopt.tile_q_cond.shape == (nbas, nbas) n_groups = len(uniq_l_ctr) - tasks = [(i,j) for i in range(n_groups) for j in range(n_groups)] + tasks = [] + for i in range(n_groups): + for j in range(n_groups): + for k in range(n_groups): + for l in range(k+1): + tasks.append((i,j,k,l)) tasks = np.array(tasks) task_list = [] for device_id in range(_num_devices): diff --git a/gpu4pyscf/scf/jk.py b/gpu4pyscf/scf/jk.py index 8e09a35b..38d75ee3 100644 --- a/gpu4pyscf/scf/jk.py +++ b/gpu4pyscf/scf/jk.py @@ -107,41 +107,39 @@ def _jk_task(mol, dms, vhfopt, task_list, hermi=0, info = cp.empty(2, dtype=np.uint32) t1 = log.timer_debug1(f'q_cond and dm_cond on Device {device_id}', *cput0) - for i, j in task_list: + for i, j, k, l in task_list: ij_shls = (l_ctr_bas_loc[i], l_ctr_bas_loc[i+1], l_ctr_bas_loc[j], l_ctr_bas_loc[j+1]) tile_ij_mapping = tile_mappings[i,j] - for k in range(i+1): - for l in range(k+1): - llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})' - kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1], - l_ctr_bas_loc[l], l_ctr_bas_loc[l+1]) - tile_kl_mapping = tile_mappings[k,l] - scheme = quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]]) - err = kern( - vj_ptr, vk_ptr, ctypes.cast(dms.data.ptr, ctypes.c_void_p), - ctypes.c_int(n_dm), ctypes.c_int(nao), - vhfopt.rys_envs, (ctypes.c_int*2)(*scheme), - (ctypes.c_int*8)(*ij_shls, *kl_shls), - ctypes.c_int(tile_ij_mapping.size), - ctypes.c_int(tile_kl_mapping.size), - ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p), - ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p), - tile_q_ptr, q_ptr, s_ptr, - ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p), - ctypes.c_float(log_cutoff), - ctypes.cast(pool.data.ptr, ctypes.c_void_p), - ctypes.cast(info.data.ptr, ctypes.c_void_p), - ctypes.c_int(workers), - mol._atm.ctypes, ctypes.c_int(mol.natm), - mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes) - if err != 0: - raise RuntimeError(f'RYS_build_jk kernel for {llll} failed') - if log.verbose >= logger.DEBUG1: - msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}' - t1, t1p = log.timer_debug1(msg, *t1), t1 - timing_counter[llll] += t1[1] - t1p[1] - kern_counts += 1 + llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})' + kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1], + l_ctr_bas_loc[l], l_ctr_bas_loc[l+1]) + tile_kl_mapping = tile_mappings[k,l] + scheme = quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]]) + err = kern( + vj_ptr, vk_ptr, ctypes.cast(dms.data.ptr, ctypes.c_void_p), + ctypes.c_int(n_dm), ctypes.c_int(nao), + vhfopt.rys_envs, (ctypes.c_int*2)(*scheme), + (ctypes.c_int*8)(*ij_shls, *kl_shls), + ctypes.c_int(tile_ij_mapping.size), + ctypes.c_int(tile_kl_mapping.size), + ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p), + ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p), + tile_q_ptr, q_ptr, s_ptr, + ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p), + ctypes.c_float(log_cutoff), + ctypes.cast(pool.data.ptr, ctypes.c_void_p), + ctypes.cast(info.data.ptr, ctypes.c_void_p), + ctypes.c_int(workers), + mol._atm.ctypes, ctypes.c_int(mol.natm), + mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes) + if err != 0: + raise RuntimeError(f'RYS_build_jk kernel for {llll} failed') + if log.verbose >= logger.DEBUG1: + msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}' + t1, t1p = log.timer_debug1(msg, *t1), t1 + timing_counter[llll] += t1[1] - t1p[1] + kern_counts += 1 if with_j: if hermi == 1: vj *= 2. @@ -185,7 +183,12 @@ def get_jk(mol, dm, hermi=0, vhfopt=None, with_j=True, with_k=True, verbose=None l_symb = [lib.param.ANGULAR[i] for i in uniq_l] n_groups = np.count_nonzero(uniq_l <= LMAX) - tasks = [(i,j) for i in range(n_groups) for j in range(i+1)] + tasks = [] + for i in range(n_groups): + for j in range(i+1): + for k in range(i+1): + for l in range(k+1): + tasks.append((i,j,k,l)) tasks = np.array(tasks) task_list = [] for device_id in range(_num_devices): diff --git a/gpu4pyscf/scf/soscf.py b/gpu4pyscf/scf/soscf.py index 81da0361..6d9bf87b 100644 --- a/gpu4pyscf/scf/soscf.py +++ b/gpu4pyscf/scf/soscf.py @@ -27,7 +27,7 @@ from pyscf.soscf import ciah from pyscf.soscf.newton_ah import _CIAH_SOSCF as _SOSCF_cpu from gpu4pyscf.lib import logger -from gpu4pyscf.scf import hf, rohf, uhf +from gpu4pyscf.scf import hf, rohf, uhf, _response_functions from gpu4pyscf.lib.cupy_helper import transpose_sum, contract from gpu4pyscf.lib import utils diff --git a/gpu4pyscf/scf/tests/test_soscf.py b/gpu4pyscf/scf/tests/test_soscf.py index 924dfd2e..4a07bcc5 100644 --- a/gpu4pyscf/scf/tests/test_soscf.py +++ b/gpu4pyscf/scf/tests/test_soscf.py @@ -24,18 +24,18 @@ def setUpModule(): verbose = 5, output = '/dev/null', atom = [ - ["O" , (0. , 0. , 0.)], - [1 , (0. , -0.757 , 0.587)], - [1 , (0. , 0.757 , 0.587)] ], + ["O" , (0. , 0. , 0.)], + [1 , (0. , -0.757 , 0.587)], + [1 , (0. , 0.757 , 0.587)] ], basis = '6-31g') h2o_z1 = gto.M( verbose = 5, output = '/dev/null', atom = [ - ["O" , (0. , 0. , 0.)], - [1 , (0. , -0.757 , 0.587)], - [1 , (0. , 0.757 , 0.587)] ], + ["O" , (0. , 0. , 0.)], + [1 , (0. , -0.757 , 0.587)], + [1 , (0. , 0.757 , 0.587)] ], basis = '6-31g', charge = 1, spin = 1,) From 53566e9f95be2ca886a305f0d46f25f743e8f593 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Fri, 27 Dec 2024 03:52:09 +0000 Subject: [PATCH 20/49] update tests for nightly build --- .github/workflows/nightly_build.yml | 5 +++-- gpu4pyscf/df/df.py | 1 - gpu4pyscf/scf/jk.py | 3 ++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/nightly_build.yml b/.github/workflows/nightly_build.yml index 7f2b816e..f7f03ac6 100644 --- a/.github/workflows/nightly_build.yml +++ b/.github/workflows/nightly_build.yml @@ -14,7 +14,7 @@ permissions: jobs: build: - runs-on: self-hosted + runs-on: [self-hosted, Linux, X64, v100] steps: - uses: actions/checkout@v3 @@ -23,6 +23,7 @@ jobs: pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple python3 -m pip install --upgrade pip pip3 install flake8 pytest coverage pytest-cov pyscf-dispersion + pip3 install pytest-benchmark pip3 install pyscf --upgrade pip3 install numpy --upgrade pip3 install scipy --upgrade @@ -39,4 +40,4 @@ jobs: run: | echo $GITHUB_WORKSPACE export PYTHONPATH="${PYTHONPATH}:$(pwd)" - pytest --durations=0 + pytest tests/ -v -m "not slow" diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py index 4991af43..52b0ecf8 100644 --- a/gpu4pyscf/df/df.py +++ b/gpu4pyscf/df/df.py @@ -147,7 +147,6 @@ def loop(self, blksize=None, unpack=True): and unpack the CDERI in (Lij) format ''' device_id = cupy.cuda.Device().id - print(self._cderi.keys(), device_id) cderi_sparse = self._cderi[device_id] if blksize is None: blksize = self.get_blksize() diff --git a/gpu4pyscf/scf/jk.py b/gpu4pyscf/scf/jk.py index 38d75ee3..8577457d 100644 --- a/gpu4pyscf/scf/jk.py +++ b/gpu4pyscf/scf/jk.py @@ -58,6 +58,7 @@ SHM_SIZE = getattr(__config__, 'GPU_SHM_SIZE', int(gpu_specs['sharedMemPerBlockOptin']//9)*8) THREADS = 256 +GROUP_SIZE = 256 def _jk_task(mol, dms, vhfopt, task_list, hermi=0, device_id=0, with_j=True, with_k=True, verbose=None): @@ -461,7 +462,7 @@ def __init__(self, mol, cutoff=1e-13): self._tile_q_cond = {} self._s_estimator = {} - def build(self, group_size=None, verbose=None): + def build(self, group_size=GROUP_SIZE, verbose=None): mol = self.mol log = logger.new_logger(mol, verbose) cput0 = log.init_timer() From 9b7a8d5cd3a94c2bf4f209ef18e2b3223a9ece69 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Fri, 27 Dec 2024 03:59:23 +0000 Subject: [PATCH 21/49] disable benchmark for ci --- .github/workflows/unittest.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml index 4eb534e3..31e8473a 100644 --- a/.github/workflows/unittest.yml +++ b/.github/workflows/unittest.yml @@ -38,7 +38,7 @@ jobs: run: | echo $GITHUB_WORKSPACE export PYTHONPATH="${PYTHONPATH}:$(pwd)" - pytest -m "not smoke" --cov=$GITHUB_WORKSPACE + pytest -m "not smoke" --benchmark-disable -s --cov=$GITHUB_WORKSPACE multi-gpu: runs-on: [self-hosted, Linux, X64, 2T4] @@ -65,4 +65,4 @@ jobs: run: | echo $GITHUB_WORKSPACE export PYTHONPATH="${PYTHONPATH}:$(pwd)" - pytest -m "not smoke" --cov=$GITHUB_WORKSPACE + pytest -m "not smoke" --benchmark-disable -s --cov=$GITHUB_WORKSPACE From da30fcf140c61f249f0c98acc78d06e5d8b84f65 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Fri, 27 Dec 2024 04:05:30 +0000 Subject: [PATCH 22/49] install pytest-benchmark --- .github/workflows/unittest.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml index 31e8473a..de3303cb 100644 --- a/.github/workflows/unittest.yml +++ b/.github/workflows/unittest.yml @@ -21,6 +21,7 @@ jobs: run: | pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple python3 -m pip install --upgrade pip + pip3 install pytest-benchmark pip3 install flake8 pytest coverage pytest-cov pyscf-dispersion pip3 install pyscf --upgrade pip3 install git+https://github.com/pyscf/properties --upgrade @@ -48,6 +49,7 @@ jobs: run: | pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple python3 -m pip install --upgrade pip + pip3 install pytest-benchmark pip3 install flake8 pytest coverage pytest-cov pyscf-dispersion pip3 install pyscf --upgrade pip3 install git+https://github.com/pyscf/properties --upgrade From 0c9a0c3ab3a98d65b4c93ec4fc881cb87e8de141 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Fri, 27 Dec 2024 04:10:50 +0000 Subject: [PATCH 23/49] change the file names of benchmark tests --- gpu4pyscf/tests/{test_rks.py => test_benchmark_rks.py} | 0 gpu4pyscf/tests/{test_uks.py => test_benchmark_uks.py} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename gpu4pyscf/tests/{test_rks.py => test_benchmark_rks.py} (100%) rename gpu4pyscf/tests/{test_uks.py => test_benchmark_uks.py} (100%) diff --git a/gpu4pyscf/tests/test_rks.py b/gpu4pyscf/tests/test_benchmark_rks.py similarity index 100% rename from gpu4pyscf/tests/test_rks.py rename to gpu4pyscf/tests/test_benchmark_rks.py diff --git a/gpu4pyscf/tests/test_uks.py b/gpu4pyscf/tests/test_benchmark_uks.py similarity index 100% rename from gpu4pyscf/tests/test_uks.py rename to gpu4pyscf/tests/test_benchmark_uks.py From 275925bb08c15e695aefe61756deedf3b2cc32c1 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Fri, 27 Dec 2024 05:19:32 +0000 Subject: [PATCH 24/49] disable benchmark for ci --- .github/workflows/unittest.yml | 4 ++-- gpu4pyscf/tests/test_benchmark_rks.py | 31 +++++++++++++++++++++++++++ gpu4pyscf/tests/test_benchmark_uks.py | 6 ++++++ 3 files changed, 39 insertions(+), 2 deletions(-) diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml index de3303cb..12464ab5 100644 --- a/.github/workflows/unittest.yml +++ b/.github/workflows/unittest.yml @@ -39,7 +39,7 @@ jobs: run: | echo $GITHUB_WORKSPACE export PYTHONPATH="${PYTHONPATH}:$(pwd)" - pytest -m "not smoke" --benchmark-disable -s --cov=$GITHUB_WORKSPACE + pytest -m "not benchmark" --cov=$GITHUB_WORKSPACE multi-gpu: runs-on: [self-hosted, Linux, X64, 2T4] @@ -67,4 +67,4 @@ jobs: run: | echo $GITHUB_WORKSPACE export PYTHONPATH="${PYTHONPATH}:$(pwd)" - pytest -m "not smoke" --benchmark-disable -s --cov=$GITHUB_WORKSPACE + pytest -m "not benchmark" --cov=$GITHUB_WORKSPACE diff --git a/gpu4pyscf/tests/test_benchmark_rks.py b/gpu4pyscf/tests/test_benchmark_rks.py index ebf9d8af..cdaa2801 100644 --- a/gpu4pyscf/tests/test_benchmark_rks.py +++ b/gpu4pyscf/tests/test_benchmark_rks.py @@ -83,146 +83,177 @@ def run_rb3lyp_hessian(atom, basis, with_df, with_solvent, disp=None): return h # DF +@pytest.mark.benchmark def test_df_rb3lyp(benchmark): e = benchmark(run_rb3lyp, small_mol, 'def2-tzvpp', True, False) print('testing df rb3lyp') assert np.isclose(np.linalg.norm(e), 684.9998712035579, atol=1e-7) +@pytest.mark.benchmark def test_df_rb3lyp_grad(benchmark): g = benchmark(run_rb3lyp_grad, small_mol, 'def2-tzvpp', True, False) print('testing df rb3lyp grad') assert np.isclose(np.linalg.norm(g), 0.17435941081837686, atol=1e-5) @pytest.mark.slow +@pytest.mark.benchmark def test_df_rb3lyp_hessian(benchmark): h = benchmark(run_rb3lyp_hessian, small_mol, 'def2-tzvpp', True, False) print('testing df rb3lyp hessian') assert np.isclose(np.linalg.norm(h), 3.7668761221997764, atol=1e-4) # Direct SCF +@pytest.mark.benchmark def test_rb3lyp(benchmark): e = benchmark(run_rb3lyp, small_mol, 'def2-tzvpp', False, False) print('testing rb3lyp') assert np.isclose(np.linalg.norm(e), 684.999735850967, atol=1e-7) +@pytest.mark.benchmark def test_rb3lyp_grad(benchmark): g = benchmark(run_rb3lyp_grad, small_mol, 'def2-tzvpp', False, False) print('testing rb3lyp grad') assert np.isclose(np.linalg.norm(g), 0.1744127474130983, atol=1e-5) +@pytest.mark.benchmark def test_rb3lyp_hessian(benchmark): h = benchmark(run_rb3lyp_hessian, small_mol, 'def2-tzvpp', False, False) print('testing rb3lyp hessian') assert np.isclose(np.linalg.norm(h), 3.7588443634477833, atol=1e-4) # median molecule +@pytest.mark.benchmark def test_df_rb3lyp_median(benchmark): e = benchmark(run_rb3lyp, median_mol, 'def2-tzvpp', True, False) print('testing df rb3lyp median') assert np.isclose(np.linalg.norm(e), 1138.371390377773, atol=1e-7) +@pytest.mark.benchmark def test_df_rb3lyp_grad_median(benchmark): g = benchmark(run_rb3lyp_grad, median_mol, 'def2-tzvpp', True, False) print('testing df rb3lyp grad median') assert np.isclose(np.linalg.norm(g), 0.26010545073602614, atol=1e-4) +@pytest.mark.benchmark def test_df_rb3lyp_hessian_median(benchmark): h = benchmark(run_rb3lyp_hessian, median_mol, 'def2-tzvpp', True, False) print('testing df rb3lyp hessian median') assert np.isclose(np.linalg.norm(h), 6.32514169232998, atol=1e-4) +@pytest.mark.benchmark def test_rb3lyp_median(benchmark): e = benchmark(run_rb3lyp, median_mol, 'def2-tzvpp', False, False) print('testing rb3lyp median') assert np.isclose(np.linalg.norm(e), 1138.3710752128077, atol=1e-7) +@pytest.mark.benchmark def test_rb3lyp_grad_median(benchmark): g = benchmark(run_rb3lyp_grad, median_mol, 'def2-tzvpp', False, False) print('testing rb3lyp grad median') assert np.isclose(np.linalg.norm(g), 0.2601443836937988, atol=1e-5) @pytest.mark.high_memory @pytest.mark.slow +@pytest.mark.benchmark def test_rb3lyp_hessian_median(benchmark): h = benchmark(run_rb3lyp_hessian, median_mol, 'def2-tzvpp', False, False) print('testing rb3lyp hessian median') assert np.isclose(np.linalg.norm(h)) # large molecule +@pytest.mark.benchmark def test_df_rb3lyp_large(benchmark): e = benchmark(run_rb3lyp, large_mol, 'def2-tzvpp', True, False) print('testing df rb3lyp large') assert np.isclose(np.linalg.norm(e), 2564.198712152175, atol=1e-7) +@pytest.mark.benchmark def test_df_rb3lyp_grad_large(benchmark): g = benchmark(run_rb3lyp_grad, large_mol, 'def2-tzvpp', True, False) print('testing df rb3lyp grad large') assert np.isclose(np.linalg.norm(g), 0.3784358687859323, atol=1e-5) @pytest.mark.high_memory @pytest.mark.slow +@pytest.mark.benchmark def test_df_rb3lyp_hessian_large(benchmark): h = benchmark(run_rb3lyp_hessian, large_mol, 'def2-tzvpp', True, False) print('testing df rb3lyp hessian large') assert np.isclose(np.linalg.norm(h), 7.583208736873523, atol=1e-4) @pytest.mark.slow +@pytest.mark.benchmark def test_rb3lyp_large(benchmark): e = benchmark(run_rb3lyp, large_mol, 'def2-tzvpp', False, False) print('testing rb3lyp large') assert np.isclose(np.linalg.norm(e), 2564.198099576358, atol=1e-7) @pytest.mark.slow +@pytest.mark.benchmark def test_rb3lyp_grad_large(benchmark): g = benchmark(run_rb3lyp_grad, large_mol, 'def2-tzvpp', False, False) print('testing rb3lyp grad large') assert np.isclose(np.linalg.norm(g), 0.3784664384209763, atol=1e-5) @pytest.mark.slow +@pytest.mark.benchmark def test_rb3lyp_hessian_large(benchmark): h = benchmark(run_rb3lyp_hessian, large_mol, 'def2-tzvpp', False, False) print('testing rb3lyp hessian large') print(np.linalg.norm(h)) # small basis set +@pytest.mark.benchmark def test_df_rb3lyp_631gs(benchmark): e = benchmark(run_rb3lyp, small_mol, '6-31gs', True, False) print('testing df rb3lyp 631gs') assert np.isclose(np.linalg.norm(e), 684.6646008642876, atol=1e-7) + +@pytest.mark.benchmark def test_df_rb3lyp_631gs_grad(benchmark): g = benchmark(run_rb3lyp_grad, small_mol, '6-31gs', True, False) print('testing df rb3lyp 631gs grad') assert np.isclose(np.linalg.norm(g), 0.17530687343398219, atol=1e-5) +@pytest.mark.benchmark def test_df_rb3lyp_631gs_hessian(benchmark): h = benchmark(run_rb3lyp_hessian, small_mol, '6-31gs', True, False) print('testing df rb3lyp 631gs hessian') assert np.isclose(np.linalg.norm(h), 3.908874851569459, atol=1e-4) # small basis set for large molecule +@pytest.mark.benchmark def test_rb3lyp_631gs_large(benchmark): e = benchmark(run_rb3lyp, large_mol, '6-31gs', False, False) print('testing rb3lyp 631gs large') assert np.isclose(np.linalg.norm(e), 2563.1171191823423, atol=1e-7) +@pytest.mark.benchmark def test_rb3lyp_631gs_grad_large(benchmark): g = benchmark(run_rb3lyp_grad, large_mol, '6-31gs', False, False) print('testing df rb3lyp 631gs grad large') assert np.isclose(np.linalg.norm(g), 0.37778228700247984, atol=1e-5) @pytest.mark.slow +@pytest.mark.benchmark def test_rb3lyp_631gs_hessian_large(benchmark): h = benchmark(run_rb3lyp_hessian, large_mol, '6-31gs', False, False) print('testing df rb3lyp 631gs hessian large') assert np.isclose(np.linalg.norm(h), 7.920764634100053, atol=1e-4) #solvent model +@pytest.mark.benchmark def test_df_rb3lyp_631gs_solvent(benchmark): e = benchmark(run_rb3lyp, small_mol, '6-31gs', True, True) print('testing df rb3lyp 631gs solvent') assert np.isclose(np.linalg.norm(e), 684.6985561053816, atol=1e-7) +@pytest.mark.benchmark def test_df_rb3lyp_631gs_solvent_grad(benchmark): g = benchmark(run_rb3lyp_grad, small_mol, '6-31gs', True, True) print('testing df rb3lyp 631gs solvent grad') assert np.isclose(np.linalg.norm(g), 0.16956999476137297, atol=1e-5) +@pytest.mark.benchmark def test_df_rb3lyp_631gs_solvent_hessian(benchmark): h = benchmark(run_rb3lyp_hessian, small_mol, '6-31gs', True, True) print('testing df rb3lyp 631gs solvent hessian') assert np.isclose(np.linalg.norm(h), 3.9008165041707294, atol=1e-4) # b3lyp d3bj +@pytest.mark.benchmark def test_df_rb3lyp_631gs_d3bj(benchmark): e = benchmark(run_rb3lyp, small_mol, '6-31gs', True, True, 'd3bj') print('testing df rb3lyp 631gs solvent') assert np.isclose(np.linalg.norm(e), 684.7313814096565, atol=1e-7) +@pytest.mark.benchmark def test_df_rb3lyp_631gs_d3bj_grad(benchmark): g = benchmark(run_rb3lyp_grad, small_mol, '6-31gs', True, True, 'd3bj') print('testing df rb3lyp 631gs solvent grad') assert np.isclose(np.linalg.norm(g), 0.17010044498887264, atol=1e-5) +@pytest.mark.benchmark def test_df_rb3lyp_631gs_d3bj_hessian(benchmark): h = benchmark(run_rb3lyp_hessian, small_mol, '6-31gs', True, True, 'd3bj') print('testing df rb3lyp 631gs solvent hessian') diff --git a/gpu4pyscf/tests/test_benchmark_uks.py b/gpu4pyscf/tests/test_benchmark_uks.py index 0e426f17..39acd9ba 100644 --- a/gpu4pyscf/tests/test_benchmark_uks.py +++ b/gpu4pyscf/tests/test_benchmark_uks.py @@ -66,26 +66,32 @@ def run_ub3lyp_hessian(atom, basis, with_df, with_solvent): # UKS +@pytest.mark.benchmark def test_df_ub3lyp(benchmark): e = benchmark(run_ub3lyp, small_mol, 'def2-tzvpp', True, False) print('testing df ub3lyp') assert np.isclose(np.linalg.norm(e), 684.9998712035856, atol=1e-7) +@pytest.mark.benchmark def test_df_ub3lyp_grad(benchmark): g = benchmark(run_ub3lyp_grad, small_mol, 'def2-tzvpp', True, False) print('testing df ub3lyp grad') assert np.isclose(np.linalg.norm(g), 0.17435842214665462, atol=1e-5) +@pytest.mark.benchmark def test_df_ub3lyp_hessian(benchmark): h = benchmark(run_ub3lyp_hessian, small_mol, 'def2-tzvpp', True, False) print('testing df ub3lyp hessian') assert np.isclose(np.linalg.norm(h), 3.7669464279078064, atol=1e-4) +@pytest.mark.benchmark def test_ub3lyp(benchmark): e = benchmark(run_ub3lyp, small_mol, 'def2-tzvpp', False, False) print('testing ub3lyp') assert np.isclose(np.linalg.norm(e), 684.9997358509884, atol=1e-7) +@pytest.mark.benchmark def test_ub3lyp_grad(benchmark): g = benchmark(run_ub3lyp_grad, small_mol, 'def2-tzvpp', False, False) print('testing ub3lyp grad') assert np.isclose(np.linalg.norm(g), 0.17441176110160253, atol=1e-5) +@pytest.mark.benchmark def test_ub3lyp_hessian(benchmark): h = benchmark(run_ub3lyp_hessian, small_mol, 'def2-tzvpp', False, False) print('testing ub3lyp hessian') From e0b1eafeff0cfb83f6e390a4201390c9d9e32965 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Fri, 27 Dec 2024 05:41:21 +0000 Subject: [PATCH 25/49] test dir --- .github/workflows/nightly_build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nightly_build.yml b/.github/workflows/nightly_build.yml index f7f03ac6..b802fc3e 100644 --- a/.github/workflows/nightly_build.yml +++ b/.github/workflows/nightly_build.yml @@ -40,4 +40,4 @@ jobs: run: | echo $GITHUB_WORKSPACE export PYTHONPATH="${PYTHONPATH}:$(pwd)" - pytest tests/ -v -m "not slow" + pytest gpu4pyscf/tests/ -v -m "not slow" From 92be2aa0d82b5ee63de13329d6926512f88ebd74 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Sat, 28 Dec 2024 23:35:03 +0000 Subject: [PATCH 26/49] save changes --- gpu4pyscf/df/hessian/jk.py | 3 ++- gpu4pyscf/tests/test_benchmark_rks.py | 8 ++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/gpu4pyscf/df/hessian/jk.py b/gpu4pyscf/df/hessian/jk.py index bc6cceee..8a2b59bd 100644 --- a/gpu4pyscf/df/hessian/jk.py +++ b/gpu4pyscf/df/hessian/jk.py @@ -54,7 +54,8 @@ def _jk_task_with_mo1(dfobj, dms, mo_coeff, mo1s, occ_coeffs, else: dm_sparse *= 2 dm_sparse[:, intopt.cderi_diag] *= .5 - + dms = None + if with_k: vks = [cupy.zeros_like(mo1) for mo1 in mo1s] diff --git a/gpu4pyscf/tests/test_benchmark_rks.py b/gpu4pyscf/tests/test_benchmark_rks.py index cdaa2801..59eb2e39 100644 --- a/gpu4pyscf/tests/test_benchmark_rks.py +++ b/gpu4pyscf/tests/test_benchmark_rks.py @@ -144,12 +144,13 @@ def test_rb3lyp_grad_median(benchmark): g = benchmark(run_rb3lyp_grad, median_mol, 'def2-tzvpp', False, False) print('testing rb3lyp grad median') assert np.isclose(np.linalg.norm(g), 0.2601443836937988, atol=1e-5) -@pytest.mark.high_memory + @pytest.mark.slow @pytest.mark.benchmark def test_rb3lyp_hessian_median(benchmark): h = benchmark(run_rb3lyp_hessian, median_mol, 'def2-tzvpp', False, False) print('testing rb3lyp hessian median') + print(np.linalg.norm(h)) assert np.isclose(np.linalg.norm(h)) # large molecule @@ -182,13 +183,16 @@ def test_rb3lyp_grad_large(benchmark): g = benchmark(run_rb3lyp_grad, large_mol, 'def2-tzvpp', False, False) print('testing rb3lyp grad large') assert np.isclose(np.linalg.norm(g), 0.3784664384209763, atol=1e-5) + +# Hessian for large molecule with large basis set is too slow +''' @pytest.mark.slow @pytest.mark.benchmark def test_rb3lyp_hessian_large(benchmark): h = benchmark(run_rb3lyp_hessian, large_mol, 'def2-tzvpp', False, False) print('testing rb3lyp hessian large') print(np.linalg.norm(h)) - +''' # small basis set @pytest.mark.benchmark def test_df_rb3lyp_631gs(benchmark): From 09ab367679a172acdf1bd193e9694287e32da499 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Mon, 30 Dec 2024 18:08:39 +0000 Subject: [PATCH 27/49] add copy_array --- .../cupy_helper/benchmark_memory_copy.py | 97 ++++++++++ gpu4pyscf/df/df.py | 9 +- gpu4pyscf/df/grad/rhf.py | 1 + gpu4pyscf/df/hessian/rhf.py | 42 ++-- gpu4pyscf/df/hessian/uhf.py | 42 ++-- gpu4pyscf/df/int3c2e.py | 29 ++- gpu4pyscf/gto/int3c1e.py | 4 +- gpu4pyscf/lib/cupy_helper.py | 11 +- gpu4pyscf/lib/memcpy.py | 90 +++++++++ gpu4pyscf/lib/tests/test_cupy_helper.py | 38 +++- gpu4pyscf/tests/test_benchmark_rks.py | 3 +- gpu4pyscf/tests/test_dft.py | 181 ------------------ 12 files changed, 319 insertions(+), 228 deletions(-) create mode 100644 benchmarks/cupy_helper/benchmark_memory_copy.py create mode 100644 gpu4pyscf/lib/memcpy.py delete mode 100644 gpu4pyscf/tests/test_dft.py diff --git a/benchmarks/cupy_helper/benchmark_memory_copy.py b/benchmarks/cupy_helper/benchmark_memory_copy.py new file mode 100644 index 00000000..c658674f --- /dev/null +++ b/benchmarks/cupy_helper/benchmark_memory_copy.py @@ -0,0 +1,97 @@ +# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import numpy as np +import cupy as cp +from cupyx import profiler +from gpu4pyscf.lib.cupy_helper import copy_array + +''' +Benchmark different ways of transfering data from pinned memory to device +''' + +# Host array +host_array = cp.cuda.alloc_pinned_memory(512*512*512 * 8) +big_host_data = np.ndarray(512**3, dtype=cp.float64, buffer=host_array) +big_host_data = big_host_data.reshape(512,512,512) +big_host_data += np.random.rand(512,512,512) + +# Device array +big_device_data = cp.empty_like(big_host_data) + +# Create views on both arrays +host_view = big_host_data[:, 128:] # Non-contiguous view on the host +device_view = big_device_data[:, 128:] # Non-contiguous view on the device + +print("Host View Shape:", host_view.shape) +print("Device View Shape:", device_view.shape) +''' +print("------ Benchmark device to host transfer ----------") +size = host_view.nbytes +perf_custom = profiler.benchmark(copy_array, (host_view, device_view), n_repeat=100, n_warmup=3) +t_kernel = perf_custom.gpu_times.mean() +bandwidth = size / t_kernel / 1e9 +print('using custom function', t_kernel) +print(f"Effective PCIe Bandwidth: {bandwidth:.2f} GB/s") + +def cupy_copy(c, out): + out[:] = cp.asarray(c) + return out +perf_cupy = profiler.benchmark(cupy_copy, (host_view, device_view), n_repeat=100, n_warmup=3) +t_kernel = perf_cupy.gpu_times.mean() +bandwidth = size / t_kernel / 1e9 +print('using cupy function', t_kernel) +print(f"Effective PCIe Bandwidth: {bandwidth:.2f} GB/s") + +print("------- Benchmark host to device transfer ---------") +size = host_view.nbytes +perf_custom = profiler.benchmark(copy_array, (device_view, host_view), n_repeat=100, n_warmup=3) +t_kernel = perf_custom.gpu_times.mean() +bandwidth = size / t_kernel / 1e9 +print('using custom function', t_kernel) +print(f"Effective PCIe Bandwidth: {bandwidth:.2f} GB/s") + +def cupy_copy(c, out): + out[:] = c.get() + return out +perf_cupy = profiler.benchmark(cupy_copy, (device_view, host_view), n_repeat=100, n_warmup=3) +t_kernel = perf_cupy.gpu_times.mean() +bandwidth = size / t_kernel / 1e9 +print('using cupy function', t_kernel) +print(f"Effective PCIe Bandwidth: {bandwidth:.2f} GB/s") +''' +with cp.cuda.Device(0): + a = cp.random.rand(512,512,512) + device0_view = a[:,128:, 128:] +with cp.cuda.Device(1): + b = cp.random.rand(512,512,512) + device1_view = b[:,128:, 128:] +perf_cupy = profiler.benchmark(copy_array, (device0_view, device1_view), n_repeat=100, n_warmup=3) +t_kernel = perf_cupy.gpu_times.mean() +bandwidth = device0_view.nbytes / t_kernel / 1e9 +print('using custom function', t_kernel) +print(f"Effective PCIe Bandwidth: {bandwidth:.2f} GB/s") + +assert np.linalg.norm(device0_view.get() - device1_view.get()) < 1e-10 + +def cupy_copy(c, out): + with cp.cuda.Device(out.device): + out[:] = cp.asarray(c.get()) + return out +perf_cupy = profiler.benchmark(cupy_copy, (device0_view, device1_view), n_repeat=100, n_warmup=3) +t_kernel = perf_cupy.gpu_times.mean() +bandwidth = device0_view.nbytes / t_kernel / 1e9 +print('using cupy function', t_kernel) +print(f"Effective PCIe Bandwidth: {bandwidth:.2f} GB/s") diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py index 48e0e8e1..442c1bab 100644 --- a/gpu4pyscf/df/df.py +++ b/gpu4pyscf/df/df.py @@ -20,7 +20,8 @@ from cupyx.scipy.linalg import solve_triangular from pyscf import lib from pyscf.df import df, addons, incore -from gpu4pyscf.lib.cupy_helper import cholesky, tag_array, get_avail_mem, cart2sph, p2p_transfer +from gpu4pyscf.lib.cupy_helper import (cholesky, tag_array, get_avail_mem, + cart2sph, p2p_transfer, copy_array) from gpu4pyscf.df import int3c2e, df_jk from gpu4pyscf.lib import logger from gpu4pyscf import __config__ @@ -347,11 +348,13 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, de if isinstance(_cderi[0], np.ndarray): for slice_id, (p0,p1) in enumerate(lib.prange(0, naux, blksize)): for i in range(p0,p1): - cderi_block[i].get(out=_cderi[slice_id][i-p0,ij0:ij1]) + #cderi_block[i].get(out=_cderi[slice_id][i-p0,ij0:ij1]) + copy_array(cderi_block[i], _cderi[slice_id][i-p0,ij0:ij1]) else: # Copy data to other Devices for slice_id, (p0,p1) in enumerate(lib.prange(0, naux, blksize)): #_cderi[slice_id][:,ij0:ij1] = cderi_block[p0:p1] - p2p_transfer(_cderi[slice_id][:,ij0:ij1], cderi_block[p0:p1]) + tmp = cupy.array(cderi_block[p0:p1], order='C', copy=True) + p2p_transfer(_cderi[slice_id][:,ij0:ij1], tmp) t1 = log.timer_debug1(f'transfer data for {cp_ij_id} / {nq} on Device {device_id}', *t1) return diff --git a/gpu4pyscf/df/grad/rhf.py b/gpu4pyscf/df/grad/rhf.py index 681e18be..7c1c901a 100644 --- a/gpu4pyscf/df/grad/rhf.py +++ b/gpu4pyscf/df/grad/rhf.py @@ -44,6 +44,7 @@ def j2c_solver(v): mask = w > lindep v1 = v[:,mask] j2c = cupy.dot(v1/w[mask], v1.conj().T) + w = v = v1 = mask = None def j2c_solver(b): # noqa: F811 return j2c.dot(b.reshape(j2c.shape[0],-1)).reshape(b.shape) return j2c_solver diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py index 938b1384..57dfc363 100644 --- a/gpu4pyscf/df/hessian/rhf.py +++ b/gpu4pyscf/df/hessian/rhf.py @@ -30,7 +30,7 @@ from gpu4pyscf.grad import rhf as rhf_grad from gpu4pyscf.hessian import rhf as rhf_hess from gpu4pyscf.lib.cupy_helper import ( - contract, tag_array, get_avail_mem, release_gpu_stack, pinv) + contract, tag_array, get_avail_mem, release_gpu_stack, pinv, copy_array) from gpu4pyscf.df import int3c2e, df from gpu4pyscf.lib import logger from gpu4pyscf import __config__ @@ -58,7 +58,9 @@ def _hk_ip1_ip1(rhok1_Pko, dm0, mocc_2): mem_avail = get_avail_mem() blksize = int((mem_avail*0.4/(nao*nao*3*8)/ALIGNED))*ALIGNED for k0, k1 in lib.prange(0,nnz,blksize): - rhok1_Pko_kslice = cupy.asarray(rhok1_Pko[k0:k1]) + #rhok1_Pko_kslice = cupy.asarray(rhok1_Pko[k0:k1]) + rhok1_Pko_kslice = copy_array(rhok1_Pko[k0:k1]) + # (10|0)(0|10) without response of RI basis vk2_ip1_ip1 = contract('piox,pkoy->ikxy', rhok1_Pko_kslice, rhok1_Pko_kslice) hk_ao_ao += contract('ikxy,ik->ikxy', vk2_ip1_ip1, dm0) @@ -147,6 +149,8 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls # int3c_ip1 contributions wj1_P, wk1_Pko = int3c2e.get_int3c2e_ip1_wjk(intopt, dm0_tag, omega=omega) + t1 = log.timer_debug1('interdeidate variables with int3c2e_ip1', *t1) + #rhoj1_P = contract('pq,pix->qix', int2c_inv, wj1_P) if with_j: rhoj1_P = solve_j2c(wj1_P) @@ -173,7 +177,9 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls raise RuntimeError('Not enough memory for intermediate variables') for i0, i1 in lib.prange(0,nao,blksize): - wk1_Pko_islice = cupy.asarray(wk1_Pko[:,i0:i1]) + #wk1_Pko_islice = cupy.asarray(wk1_Pko[:,i0:i1]) + wk1_Pko_islice = copy_array(wk1_Pko[:,i0:i1]) + #rhok1_Pko = contract('pq,qiox->piox', int2c_inv, wk1_Pko_islice) rhok1_Pko = solve_j2c(wk1_Pko_islice) wk1_Pko_islice = None @@ -194,7 +200,8 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls hk_ao_aux[i0:i1] -= contract('qoi,qioxy->iqxy', rhok0_P_I, wk1_I) wk1_I = rhok0_P_I = None rhok1_Pko = None - + t1 = log.timer_debug1('contractions with int3c2e_ip1', *t1) + w, v = cupy.linalg.eigh(int2c) idx = w > LINEAR_DEP_THR cd_low = (v[:,idx] / cupy.sqrt(w[idx])) @@ -203,17 +210,18 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls rhok1_Pko = wk1_Pko[:nnz] # Reuse the same memory for i0, i1 in lib.prange(0,nao,blksize): - wk1_tmp = cupy.asarray(wk1_Pko[:,i0:i1]) + #wk1_tmp = cupy.asarray(wk1_Pko[:,i0:i1]) + wk1_tmp = copy_array(wk1_Pko[:,i0:i1]) if isinstance(rhok1_Pko, cupy.ndarray): rhok1_Pko[:,i0:i1] = contract('qp,qiox->piox', cd_low, wk1_tmp) else: rhok1_Pko[:,i0:i1] = contract('qp,qiox->piox', cd_low, wk1_tmp).get() wk1_tmp = None cd_low = None - + t1 = log.timer_debug1('data transfer', *t1) hk_ao_ao += _hk_ip1_ip1(rhok1_Pko, dm0, mocc_2) wk1_Pko = rhok1_Pko = None - t1 = log.timer_debug1('intermediate variables with int3c2e_ip1', *t1) + t1 = log.timer_debug1('contractions with int3c2e_ip1', *t1) hj_ipip, hk_ipip = jk.get_int3c2e_hjk(intopt, rhoj0_P, rhok0_P__, dm0_tag, with_j=with_j, with_k=with_k, omega=omega, @@ -487,8 +495,11 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, else: rhok0_Pl_ = wk_Pl_ # reuse the memory for p0, p1 in lib.prange(0,nao,64): - wk_tmp = cupy.asarray(wk_Pl_[:,p0:p1]) - rhok0_Pl_[:,p0:p1] = solve_j2c(wk_tmp).get() + #wk_tmp = cupy.asarray(wk_Pl_[:,p0:p1]) + #rhok0_Pl_[:,p0:p1] = solve_j2c(wk_tmp).get() + wk_tmp = copy_array(wk_Pl_[:,p0:p1]) + wk_tmp = solve_j2c(wk_tmp) + copy_array(wk_tmp, rhok0_Pl_[:,p0:p1]) wk_tmp = None wk_Pl_ = None solve_j2c = None @@ -503,6 +514,8 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, fn = int3c2e.get_int3c2e_ip2_vjk vj1_int3c, vk1_int3c = fn(intopt, rhoj0, rhok0_Pl_, dm0_tag, auxslices, with_j=with_j, with_k=with_k, omega=omega) + t0 = log.timer_debug1('Fock matrix due to int3c2e_ip2', *t0) + # Responses due to int2c2e_ip1 if omega and omega > 1e-10: with auxmol.with_range_coulomb(omega): @@ -521,7 +534,8 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, else: rhok0_P__ = cupy.empty([naux,nocc,nocc]) for p0, p1 in lib.prange(0,naux,64): - rhok0_Pl_tmp = cupy.asarray(rhok0_Pl_[p0:p1]) + #rhok0_Pl_tmp = cupy.asarray(rhok0_Pl_[p0:p1]) + rhok0_Pl_tmp = copy_array(rhok0_Pl_[p0:p1]) rhok0_P__[p0:p1] = contract('pio,ir->pro', rhok0_Pl_tmp, mocc) rhok0_Pl_tmp = None wk0_10_P__ = contract('xqp,pro->xqro', int2c_ip1, rhok0_P__) @@ -531,10 +545,11 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, blksize = int(0.2*mem_avail/(3*naux*nocc*8)/ALIGNED) * ALIGNED log.debug(f'GPU Memory {mem_avail/GB:.1f} GB available, {blksize} AOs per block') if blksize < ALIGNED: - raise RuntimeError('Not enough memory to compute int3c2e_ip2') + raise RuntimeError('Not enough memory to compute int2c2e_ip2') for p0, p1 in lib.prange(0,nao,blksize): - rhok_tmp = cupy.asarray(rhok0_Pl_[:,p0:p1]) + #rhok_tmp = cupy.asarray(rhok0_Pl_[:,p0:p1]) + rhok_tmp = copy_array(rhok0_Pl_[:,p0:p1]) wk0_10_Pl_ = contract('xqp,pio->xqio', int2c_ip1, rhok_tmp) if with_j: vj1_tmp = contract('pio,xp->xpio', rhok_tmp, wj0_10) @@ -544,13 +559,12 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, if with_k: vk1_tmp = contract('xpio,pro->xpir', wk0_10_Pl_, rhok0_P__) vk1_tmp += contract('xpro,pir->xpio', wk0_10_P__, rhok_tmp) - # 2.0 due to spin vk1_int3c[:,:,p0:p1] += contract('xpio,pa->axio', vk1_tmp, aux2atom) vk1_tmp = None wk0_10_Pl_ = rhok_tmp = None wj0_10 = wk0_10_P__ = rhok0_P__ = int2c_ip1 = None aux2atom = None - t0 = log.timer_debug1('Fock matrix due to int3c2e_ip2', *t0) + t0 = log.timer_debug1('Fock matrix due to int2c2e_ip1', *t0) # ----------------------------- # int3c_ip1 contributions diff --git a/gpu4pyscf/df/hessian/uhf.py b/gpu4pyscf/df/hessian/uhf.py index d6f26e5d..1b18fc9a 100644 --- a/gpu4pyscf/df/hessian/uhf.py +++ b/gpu4pyscf/df/hessian/uhf.py @@ -34,7 +34,7 @@ from gpu4pyscf.hessian import uhf as uhf_hess from gpu4pyscf.hessian import rhf as rhf_hess from gpu4pyscf.lib.cupy_helper import ( - contract, tag_array, get_avail_mem, release_gpu_stack, pinv) + contract, tag_array, get_avail_mem, release_gpu_stack, pinv, copy_array) from gpu4pyscf.df import int3c2e, df from gpu4pyscf.df.hessian import rhf as df_rhf_hess from gpu4pyscf.lib import logger @@ -174,15 +174,19 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, raise RuntimeError('Not enough memory for intermediate variables') for i0, i1 in lib.prange(0,nao,blksize): - wk1a_Pko_islice = cupy.asarray(wk1a_Pko[:,i0:i1]) - wk1b_Pko_islice = cupy.asarray(wk1b_Pko[:,i0:i1]) + #wk1a_Pko_islice = cupy.asarray(wk1a_Pko[:,i0:i1]) + #wk1b_Pko_islice = cupy.asarray(wk1b_Pko[:,i0:i1]) + wk1a_Pko_islice = copy_array(wk1a_Pko[:,i0:i1]) + wk1b_Pko_islice = copy_array(wk1b_Pko[:,i0:i1]) rhok1a_Pko = solve_j2c(wk1a_Pko_islice) rhok1b_Pko = solve_j2c(wk1b_Pko_islice) wk1a_Pko_islice = wk1b_Pko_islice = None for k0, k1 in lib.prange(0,nao,blksize): - wk1a_Pko_kslice = cupy.asarray(wk1a_Pko[:,k0:k1]) - wk1b_Pko_kslice = cupy.asarray(wk1b_Pko[:,k0:k1]) - + #wk1a_Pko_kslice = cupy.asarray(wk1a_Pko[:,k0:k1]) + #wk1b_Pko_kslice = cupy.asarray(wk1b_Pko[:,k0:k1]) + wk1a_Pko_kslice = copy_array(wk1a_Pko[:,k0:k1]) + wk1b_Pko_kslice = copy_array(wk1b_Pko[:,k0:k1]) + # (10|0)(0|10) without response of RI basis vk2_ip1_ip1 = contract('piox,pkoy->ikxy', rhok1a_Pko, wk1a_Pko_kslice) hk_ao_ao[i0:i1,k0:k1] += contract('ikxy,ik->ikxy', vk2_ip1_ip1, dm0a[i0:i1,k0:k1]) @@ -521,8 +525,11 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, else: rhok0a_Pl_ = np.empty_like(wka_Pl_) for p0, p1 in lib.prange(0,nao,64): - wk_tmp = cupy.asarray(wka_Pl_[:,p0:p1]) - rhok0a_Pl_[:,p0:p1] = solve_j2c(wk_tmp).get() + # wk_tmp = cupy.asarray(wka_Pl_[:,p0:p1]) + # rhok0a_Pl_[:,p0:p1] = solve_j2c(wk_tmp).get() + wk_tmp = copy_array(wka_Pl_[:,p0:p1]) + wk_tmp = solve_j2c(wk_tmp) + copy_array(wk_tmp, rhok0a_Pl_[:,p0:p1]) wk_tmp = None if isinstance(wkb_Pl_, cupy.ndarray): @@ -530,8 +537,11 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, else: rhok0b_Pl_ = np.empty_like(wkb_Pl_) for p0, p1 in lib.prange(0,nao,64): - wk_tmp = cupy.asarray(wkb_Pl_[:,p0:p1]) - rhok0b_Pl_[:,p0:p1] = solve_j2c(wk_tmp).get() + #wk_tmp = cupy.asarray(wkb_Pl_[:,p0:p1]) + #rhok0b_Pl_[:,p0:p1] = solve_j2c(wk_tmp).get() + wk_tmp = copy_array(wkb_Pl_[:,p0:p1]) + wk_tmp = solve_j2c(wk_tmp) + copy_array(wk_tmp, rhok0b_Pl_[:,p0:p1]) wk_tmp = None wka_Pl_ = wkb_Pl_ = None vj1a_int3c = vj1b_int3c = vk1a_int3c = vk1b_int3c = None @@ -566,7 +576,8 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, nocc = mocca.shape[1] rhok0a_P__ = cupy.empty([naux,nocc,nocc]) for p0, p1 in lib.prange(0,naux,64): - rhok0_Pl_tmp = cupy.asarray(rhok0a_Pl_[p0:p1]) + #rhok0_Pl_tmp = cupy.asarray(rhok0a_Pl_[p0:p1]) + rhok0_Pl_tmp = copy_array(rhok0a_Pl_[p0:p1]) rhok0a_P__[p0:p1] = contract('pio,ir->pro', rhok0_Pl_tmp, mocca) rhok0_Pl_tmp = None @@ -578,7 +589,8 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, nocc = moccb.shape[1] rhok0b_P__ = cupy.empty([naux,nocc,nocc]) for p0, p1 in lib.prange(0,naux,64): - rhok0_Pl_tmp = cupy.asarray(rhok0b_Pl_[p0:p1]) + #rhok0_Pl_tmp = cupy.asarray(rhok0b_Pl_[p0:p1]) + rhok0_Pl_tmp = copy_array(rhok0b_Pl_[p0:p1]) rhok0b_P__[p0:p1] = contract('pio,ir->pro', rhok0_Pl_tmp, moccb) rhok0_Pl_tmp = None if with_j: @@ -596,8 +608,10 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, raise RuntimeError('Not enough memory to compute int3c2e_ip2') for p0, p1 in lib.prange(0,nao,blksize): - rhoka_tmp = cupy.asarray(rhok0a_Pl_[:,p0:p1]) - rhokb_tmp = cupy.asarray(rhok0b_Pl_[:,p0:p1]) + #rhoka_tmp = cupy.asarray(rhok0a_Pl_[:,p0:p1]) + #rhokb_tmp = cupy.asarray(rhok0b_Pl_[:,p0:p1]) + rhoka_tmp = copy_array(rhok0a_Pl_[:,p0:p1]) + rhokb_tmp = copy_array(rhok0b_Pl_[:,p0:p1]) wk0a_10_Pl_ = contract('xqp,pio->xqio', int2c_ip1, rhoka_tmp) wk0b_10_Pl_ = contract('xqp,pio->xqio', int2c_ip1, rhokb_tmp) if with_j: diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py index f01167a1..8089ef76 100644 --- a/gpu4pyscf/df/int3c2e.py +++ b/gpu4pyscf/df/int3c2e.py @@ -21,7 +21,7 @@ from pyscf.scf import _vhf from gpu4pyscf.scf.int4c2e import BasisProdCache, libgvhf, libgint from gpu4pyscf.lib.cupy_helper import (block_c2s_diag, cart2sph, contract, get_avail_mem, - reduce_to_device) + reduce_to_device, copy_array) from gpu4pyscf.lib import logger from gpu4pyscf.gto.mole import basis_seg_contraction from gpu4pyscf.__config__ import _num_devices, _streams @@ -839,7 +839,8 @@ def _int3c2e_ip1_vjk_task(intopt, task_k_list, rhoj, rhok, dm0, orbo, device_id= for cp_k in task_k_list: task_list = [(cp_k, cp_ij) for cp_ij in range(ncp_ij)] k0, k1 = aux_ao_loc[cp_k], aux_ao_loc[cp_k+1] - rhok_tmp = cupy.asarray(rhok[k0:k1]) + #rhok_tmp = cupy.asarray(rhok[k0:k1]) + rhok_tmp = copy_array(rhok[k0:k1]) if with_k: rhok0 = contract('pio,ir->pro', rhok_tmp, orbo) rhok0 = contract('pro,Jo->prJ', rhok0, orbo) @@ -857,11 +858,13 @@ def _int3c2e_ip1_vjk_task(intopt, task_k_list, rhoj, rhok, dm0, orbo, device_id= vk1_ao = contract('xpji,poi->xijo', int3c_blk, rhok0[:,:,i0:i1]) vk1[:,:,j0:j1] += contract('xijo,ia->axjo', vk1_ao, ao2atom[i0:i1]) + vk1_ao = int3c_blk = None if with_j: rhoj0_atom = contract('xpi,ia->xpa', rhoj0, 2.0*ao2atom) vj1 += contract('pJo,xpa->axJo', rhok_tmp, rhoj0_atom) - rhoj0_atom = None + rhoj0_atom = rhoj0 = None if with_k: + rhok0 = None vk1_buf += contract('xpio,plo->xil', int3c_ip1_occ, rhok_tmp) mem_avail = get_avail_mem() blksize = min(int(mem_avail * 0.2 / ((k1-k0) * nao) * 8), @@ -870,6 +873,8 @@ def _int3c2e_ip1_vjk_task(intopt, task_k_list, rhoj, rhok, dm0, orbo, device_id= rhok0_slice = contract('pJr,ir->pJi', rhok_tmp[:,p0:p1], orbo) vk1_ao = contract('xpio,pJi->xiJo', int3c_ip1_occ, rhok0_slice) vk1[:,:,p0:p1] += contract('xiJo,ia->axJo', vk1_ao, ao2atom) + rhok0_slice = vk1_ao = None + rhok_tmp = int3c_ip1_occ = None # TODO: absorbe vj1_buf and vk1_buf into vj1 and vk1 return vj1_buf, vk1_buf, vj1, vk1 @@ -946,15 +951,16 @@ def _int3c2e_ip2_vjk_task(intopt, task_list, rhoj, rhok, dm0, orbo, wj2 += contract('xpji,ji->xp', int3c_blk, dm0[j0:j1,i0:i1]) wk2_P__[:,:,i0:i1] += contract('xpji,jo->xpio', int3c_blk, orbo[j0:j1]) - rhok_tmp = cupy.asarray(rhok[k0:k1]) + int3c_blk = None + #rhok_tmp = cupy.asarray(rhok[k0:k1]) + rhok_tmp = copy_array(rhok[k0:k1]) if with_j: vj1_tmp = -contract('pio,xp->xpio', rhok_tmp, wj2) vj1_tmp -= contract('xpio,p->xpio', wk2_P__, rhoj[k0:k1]) vj1 += contract('xpio,pa->axio', vj1_tmp, aux2atom[k0:k1]) + vj1_tmp = wj2 = None if with_k: - #rhok0_slice = contract('pio,jo->pij', rhok_tmp, orbo) - #vk1_tmp = -contract('xpjo,pij->xpio', wk2_P__, rhok0_slice) rhok0_slice = contract('xpjo,jr->xpro', wk2_P__, orbo) vk1_tmp = -contract('xpro,pir->xpio', rhok0_slice, rhok_tmp) @@ -962,8 +968,8 @@ def _int3c2e_ip2_vjk_task(intopt, task_list, rhoj, rhok, dm0, orbo, vk1_tmp -= contract('xpio,pro->xpir', wk2_P__, rhok0_oo) vk1 += contract('xpir,pa->axir', vk1_tmp, aux2atom[k0:k1]) - wj2 = wk2_P__ = rhok0_slice = rhok0_oo = None - rhok_tmp = vk1_tmp = None + vk1_tmp = rhok0_oo = rhok0_slice = None + rhok_tmp = wk2_P__ = None return vj1, vk1 def get_int3c2e_ip2_vjk(intopt, rhoj, rhok, dm0_tag, auxslices, @@ -1022,8 +1028,11 @@ def _int3c2e_ip1_wjk_task(intopt, task_list, dm0, orbo, wk, device_id=0, with_k= wj[k0:k1,i0:i1] += contract('xpji,ij->pix', int3c_blk, dm0[i0:i1,j0:j1]) if with_k: wk_tmp[:,i0:i1] += contract('xpji,jo->piox', int3c_blk, orbo[j0:j1]) + int3c_blk = None if with_k: - wk_tmp.get(out=wk[k0:k1]) + #wk_tmp.get(out=wk[k0:k1]) + copy_array(wk_tmp, wk[k0:k1]) + wk_tmp = None return wj def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None): @@ -1075,6 +1084,8 @@ def _int3c2e_ip2_wjk(intopt, task_list, dm0, orbo, with_k=True, omega=None, devi if with_k: tmp = contract('xpji,jo->piox', int3c_blk, orbo[j0:j1]) wk[k0:k1] += contract('piox,ir->prox', tmp, orbo[i0:i1]) + tmp = None + int3c_blk = None return wj, wk def get_int3c2e_ip2_wjk(intopt, dm0_tag, with_k=True, omega=None): diff --git a/gpu4pyscf/gto/int3c1e.py b/gpu4pyscf/gto/int3c1e.py index 1e97c39b..cab38b98 100644 --- a/gpu4pyscf/gto/int3c1e.py +++ b/gpu4pyscf/gto/int3c1e.py @@ -222,8 +222,8 @@ def get_int3c1e(mol, grids, charge_exponents, intopt): "which requires {total_double_number * 8 / 1e9 : .1f} GB of memory") ngrids_per_split = (ngrids + n_grid_split - 1) // n_grid_split - buf_size = ngrids * nao * nao * 8 - int3c_pinned_buf = cp.cuda.alloc_pinned_memory(buf_size) + buf_size = ngrids * nao * nao + int3c_pinned_buf = cp.cuda.alloc_pinned_memory(buf_size * 8) int3c = np.frombuffer(int3c_pinned_buf, np.float64, buf_size).reshape([ngrids, nao, nao], order='C') # int3c = np.zeros([ngrids, nao, nao], order='C') # Using unpinned (pageable) memory, each memcpy is much slower, but there's no initialization time diff --git a/gpu4pyscf/lib/cupy_helper.py b/gpu4pyscf/lib/cupy_helper.py index 5828bbfe..41e11307 100644 --- a/gpu4pyscf/lib/cupy_helper.py +++ b/gpu4pyscf/lib/cupy_helper.py @@ -23,6 +23,7 @@ from gpu4pyscf.gto import mole from gpu4pyscf.lib.cutensor import contract from gpu4pyscf.lib.cusolver import eigh, cholesky #NOQA +from gpu4pyscf.lib.memcpy import copy_array #NOQA from gpu4pyscf.__config__ import _streams, _num_devices, _p2p_access LMAX_ON_GPU = 7 @@ -93,10 +94,16 @@ def p2p_transfer(a, b): # https://github.com/cupy/cupy/blob/v13.3.0/cupy/_core/_routines_indexing.pyx#L1015 a[:] = b else: + #copy_array(b, a) with cupy.cuda.Device(a.device): # TODO: reduce memory copy, a can be non-contiguous array - a[:] = cupy.asarray(b.get()) - + #a[:] = cupy.asarray(b.get()) + copy_array(b, a) + if np.linalg.norm(a.get() - b.get()) > 1e-3: + print(a[:5], a.device, a.strides, a.shape) + print(b[:5], b.device, b.strides, b.shape) + print(a.shape, b.shape) + exit() def concatenate(array_list): ''' Concatenate axis=0 only ''' diff --git a/gpu4pyscf/lib/memcpy.py b/gpu4pyscf/lib/memcpy.py new file mode 100644 index 00000000..d3695168 --- /dev/null +++ b/gpu4pyscf/lib/memcpy.py @@ -0,0 +1,90 @@ +# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import cupy +import numpy as np + +def find_contiguous_chunks(shape, h_strides, d_strides): + """ + Find the largest contiguous chunk size based on strides and shape. + """ + chunk_shape = [] + chunk_size = 1 + for dim, h_stride, d_stride in zip(reversed(shape), reversed(h_strides), reversed(d_strides)): + if h_stride == chunk_size and d_stride == chunk_size: + chunk_shape.append(dim) + chunk_size *= dim + else: + break + chunk_shape = tuple(reversed(chunk_shape)) + return chunk_shape, chunk_size + +def copy_array(src_view, out=None): + ''' Copy cupy/numpy array to cupy array if out is None + Copy cupy/numpy array to cupy/numpy array (out) + ''' + if out is None: + out = cupy.empty_like(src_view) + else: + # Ensure both arrays have the same shape + if src_view.shape != out.shape: + raise ValueError("Host and device views must have the same shape.") + return _copy_array(src_view, out) + +def _copy_array(src_view, dst_view): + ''' Copy data from cupy/numpy array to another cupy/numpy array + Check memory layout, then copy memory chunks by cupy.cuda.runtime.memcpy + ''' + shape = src_view.shape + itemsize = src_view.itemsize + strides_src = [stride // itemsize for stride in src_view.strides] + strides_dst = [stride // itemsize for stride in dst_view.strides] + + # Find the largest contiguous chunk + chunk_shape, chunk_size = find_contiguous_chunks(shape, strides_src, strides_dst) + + if isinstance(src_view, cupy.ndarray): + src_data_ptr = src_view.data.ptr + else: + src_data_ptr = src_view.ctypes.data + + if isinstance(dst_view, cupy.ndarray): + dst_data_ptr = dst_view.data.ptr + else: + dst_data_ptr = dst_view.ctypes.data + + if isinstance(src_view, cupy.ndarray) and isinstance(dst_view, cupy.ndarray): + kind = cupy.cuda.runtime.memcpyDeviceToDevice + elif isinstance(src_view, cupy.ndarray) and isinstance(dst_view, np.ndarray): + kind = cupy.cuda.runtime.memcpyDeviceToHost + elif isinstance(src_view, np.ndarray) and isinstance(dst_view, cupy.ndarray): + kind = cupy.cuda.runtime.memcpyHostToDevice + else: + raise NotImplementedError + + # Transfer data chunk-by-chunk + outer_dims = shape[:-len(chunk_shape)] + for outer_index in np.ndindex(*outer_dims): + # Compute offsets for the current outer slice + src_offset = sum(outer_index[i] * strides_src[i] for i in range(len(outer_dims))) + dst_offset = sum(outer_index[i] * strides_dst[i] for i in range(len(outer_dims))) + # Perform the memcpy for the contiguous chunk + cupy.cuda.runtime.memcpy( + dst_data_ptr + dst_offset * dst_view.itemsize, + src_data_ptr + src_offset * src_view.itemsize, + chunk_size * src_view.itemsize, + kind + ) + return dst_view diff --git a/gpu4pyscf/lib/tests/test_cupy_helper.py b/gpu4pyscf/lib/tests/test_cupy_helper.py index 0f406c82..21556df2 100644 --- a/gpu4pyscf/lib/tests/test_cupy_helper.py +++ b/gpu4pyscf/lib/tests/test_cupy_helper.py @@ -19,7 +19,8 @@ from gpu4pyscf.lib.cupy_helper import ( take_last2d, transpose_sum, krylov, unpack_sparse, add_sparse, takebak, empty_mapped, dist_matrix, - grouped_dot, grouped_gemm, cond, cart2sph_cutensor, cart2sph) + grouped_dot, grouped_gemm, cond, cart2sph_cutensor, cart2sph, + copy_array) class KnownValues(unittest.TestCase): def test_take_last2d(self): @@ -214,6 +215,41 @@ def test_unpack_tril(self): ref[:,idx,idy] = atril assert abs(a - ref).max() < 1e-12 + def test_copy_host2dev(self): + host_array = cupy.cuda.alloc_pinned_memory(10*10*10 * 8) + host_data = numpy.ndarray(10**3, dtype=cupy.float64, buffer=host_array) + host_data = host_data.reshape(10,10,10) + host_data += numpy.random.rand(10,10,10) + + device_data = cupy.empty_like(host_data) + host_view = host_data[:, 8:] # Non-contiguous view on the host + device_view = device_data[:, 8:] # Non-contiguous view on the device + + copy_array(host_view, device_view) + assert numpy.linalg.norm(host_view - device_view.get()) < 1e-10 + + copy_array(host_view.copy(), device_view) + assert numpy.linalg.norm(host_view - device_view.get()) < 1e-10 + + device_view = copy_array(host_view) + assert numpy.linalg.norm(host_view - device_view.get()) < 1e-10 + + def test_copy_dev2host(self): + host_array = cupy.cuda.alloc_pinned_memory(10*10*10 * 8) + host_data = numpy.ndarray(10**3, dtype=cupy.float64, buffer=host_array) + host_data = host_data.reshape(10,10,10) + + device_data = cupy.zeros_like(host_data) + device_data += cupy.random.rand(10,10,10) + host_view = host_data[:, 8:] # Non-contiguous view on the host + device_view = device_data[:, 8:] # Non-contiguous view on the device + + copy_array(device_view, host_view) + assert numpy.linalg.norm(host_view - device_view.get()) < 1e-10 + + copy_array(device_view.copy(), host_view) + assert numpy.linalg.norm(host_view - device_view.get()) < 1e-10 + if __name__ == "__main__": print("Full tests for cupy helper module") unittest.main() diff --git a/gpu4pyscf/tests/test_benchmark_rks.py b/gpu4pyscf/tests/test_benchmark_rks.py index 59eb2e39..c1be2979 100644 --- a/gpu4pyscf/tests/test_benchmark_rks.py +++ b/gpu4pyscf/tests/test_benchmark_rks.py @@ -150,8 +150,7 @@ def test_rb3lyp_grad_median(benchmark): def test_rb3lyp_hessian_median(benchmark): h = benchmark(run_rb3lyp_hessian, median_mol, 'def2-tzvpp', False, False) print('testing rb3lyp hessian median') - print(np.linalg.norm(h)) - assert np.isclose(np.linalg.norm(h)) + assert np.isclose(np.linalg.norm(h), 6.312714778020796, atol=1e-4) # large molecule @pytest.mark.benchmark diff --git a/gpu4pyscf/tests/test_dft.py b/gpu4pyscf/tests/test_dft.py deleted file mode 100644 index 06bfbe4c..00000000 --- a/gpu4pyscf/tests/test_dft.py +++ /dev/null @@ -1,181 +0,0 @@ -# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest -import numpy as np -import pyscf -import pytest -import cupy -from gpu4pyscf.dft import rks, uks - -def setUpModule(): - global mol - atom = ''' -C -0.07551087 1.68127663 -0.10745193 -O 1.33621755 1.87147409 -0.39326987 -C 1.67074668 2.95729545 0.49387976 -C 0.41740763 3.77281969 0.78495878 -C -0.60481480 3.07572636 0.28906224 -H -0.19316298 1.01922455 0.72486113 -O 0.35092043 5.03413298 1.45545728 -H 0.42961487 5.74279041 0.81264173 -O -1.95331750 3.53349874 0.15912025 -H -2.55333895 2.78846397 0.23972698 -O 2.81976302 3.20110148 0.94542226 -C -0.81772499 1.09230218 -1.32146482 -H -0.70955636 1.74951833 -2.15888136 -C -2.31163857 0.93420736 -0.98260166 -H -2.72575463 1.89080093 -0.74107186 -H -2.41980721 0.27699120 -0.14518512 -O -0.26428017 -0.18613595 -1.64425697 -H -0.72695910 -0.55328886 -2.40104423 -O -3.00083741 0.38730252 -2.10989934 -H -3.93210821 0.28874990 -1.89865997 -''' - - mol = pyscf.M(atom=atom, basis='def2-tzvpp', max_memory=32000, cart=0) - mol.output = '/dev/null' - mol.build() - mol.verbose = 1 - -def tearDownModule(): - global mol - mol.stdout.close() - del mol - -class KnownValues(unittest.TestCase): - @pytest.mark.smoke - def test_b3lyp_with_d3bj(self): - print('-------- DFRKS with D3(BJ) -------') - mf = rks.RKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit') - mf.grids.atom_grid = (99,590) - mf.conv_tol = 1e-10 - mf.conv_tol_cpscf = 1e-8 - mf.disp = 'd3bj' - e_dft = mf.kernel() - assert np.abs(e_dft - -685.0326965348272) < 1e-7 - - g = mf.nuc_grad_method().kernel() - assert np.abs(cupy.linalg.norm(g) - 0.17498362161082373) < 1e-5 - - h = mf.Hessian().kernel() - assert np.abs(cupy.linalg.norm(h) - 3.7684319231335377) < 1e-4 - - @pytest.mark.smoke - def test_b3lyp_d3bj(self): - print('-------- DFRKS with D3(BJ) -------') - mf = rks.RKS(mol, xc='b3lyp-d3bj').density_fit(auxbasis='def2-tzvpp-jkfit') - mf.grids.atom_grid = (99,590) - mf.conv_tol = 1e-10 - mf.conv_tol_cpscf = 1e-8 - e_dft = mf.kernel() - assert np.abs(e_dft - -685.0326965348272) < 1e-7 - - g = mf.nuc_grad_method().kernel() - assert np.abs(cupy.linalg.norm(g) - 0.17498362161082373) < 1e-5 - - h = mf.Hessian().kernel() - assert np.abs(cupy.linalg.norm(h) - 3.7684319231335377) < 1e-4 - - @pytest.mark.smoke - def test_DFUKS(self): - print('------- DFUKS with D3(BJ) -------') - mf = uks.UKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit') - mf.grids.atom_grid = (99,590) - mf.conv_tol = 1e-10 - mf.conv_tol_cpscf = 1e-8 - mf.disp = 'd3bj' - e_dft = mf.kernel() - assert np.abs(e_dft - -685.0326965349493) < 1e-7 - - g = mf.nuc_grad_method().kernel() - assert np.abs(cupy.linalg.norm(g) - 0.17498264516108836) < 1e-5 - - h = mf.Hessian().kernel() - assert np.abs(cupy.linalg.norm(h) - 3.768429871470736) < 1e-4 - - @pytest.mark.smoke - def test_RKS(self): - print('-------- RKS with D3(BJ) -------') - mf = rks.RKS(mol, xc='b3lyp') - mf.grids.atom_grid = (99,590) - mf.conv_tol = 1e-12 - mf.conv_tol_cpscf = 1e-8 - mf.disp = 'd3bj' - e_dft = mf.kernel() - assert np.abs(e_dft - -685.0325611822375) < 1e-7 - - g = mf.nuc_grad_method().kernel() - assert np.abs(cupy.linalg.norm(g) - 0.1750368231223345) < 1e-6 - - h = mf.Hessian().kernel() - assert np.abs(cupy.linalg.norm(h) - 3.760381249106394) < 1e-4 - - @pytest.mark.smoke - def test_UKS(self): - print('-------- UKS with D3(BJ) -------') - mf = uks.UKS(mol, xc='b3lyp') - mf.grids.atom_grid = (99,590) - mf.conv_tol = 1e-12 - mf.conv_tol_cpscf = 1e-8 - mf.disp = 'd3bj' - e_dft = mf.kernel() - assert np.abs(e_dft - -685.0325611822375) < 1e-7 - - g = mf.nuc_grad_method().kernel() - assert np.abs(cupy.linalg.norm(g) - 0.1750368231223345) < 1e-6 - - h = mf.Hessian().kernel() - assert np.abs(cupy.linalg.norm(h) - 3.760381249106394) < 1e-4 - - @pytest.mark.smoke - def test_DFRKS_with_SMD(self): - print('----- DFRKS with SMD -----') - mf = rks.RKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit') - mf = mf.SMD() - mf.grids.atom_grid = (99,590) - mf.conv_tol = 1e-10 - mf.conv_tol_cpscf = 1e-8 - mf.disp = 'd3bj' - e_dft = mf.kernel() - assert np.abs(e_dft - -685.0578838805443) < 1e-7 - - g = mf.nuc_grad_method().kernel() - assert np.abs(cupy.linalg.norm(g) - 0.16804945458657145) < 1e-5 - - h = mf.Hessian().kernel() - assert np.abs(cupy.linalg.norm(h) - 3.741783814494321) < 1e-4 - - @pytest.mark.smoke - def test_DFUKS_with_SMD(self): - print('------- DFUKS with SMD ---------') - mf = uks.UKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit') - mf = mf.SMD() - mf.grids.atom_grid = (99,590) - mf.conv_tol = 1e-10 - mf.conv_tol_cpscf = 1e-8 - mf.disp = 'd3bj' - e_dft = mf.kernel() - assert np.abs(e_dft - -685.05788388063) < 1e-7 - - g = mf.nuc_grad_method().kernel() - assert np.abs(cupy.linalg.norm(g) - 0.1680496465773684) < 1e-5 - - h = mf.Hessian().kernel() - assert np.abs(cupy.linalg.norm(h) - 3.7417788481647563) < 1e-4 - -if __name__ == "__main__": - print("Full Smoke Tests") - unittest.main() - From c8846d2ac5a6203fad64c4cc494174d5c322ecf1 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Mon, 30 Dec 2024 22:16:53 +0000 Subject: [PATCH 28/49] assert chunk_shape --- gpu4pyscf/lib/memcpy.py | 3 +++ gpu4pyscf/lib/tests/test_cupy_helper.py | 6 +++--- gpu4pyscf/tests/test_benchmark_rks.py | 4 +++- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/gpu4pyscf/lib/memcpy.py b/gpu4pyscf/lib/memcpy.py index d3695168..0f4faa6b 100644 --- a/gpu4pyscf/lib/memcpy.py +++ b/gpu4pyscf/lib/memcpy.py @@ -47,6 +47,7 @@ def _copy_array(src_view, dst_view): ''' Copy data from cupy/numpy array to another cupy/numpy array Check memory layout, then copy memory chunks by cupy.cuda.runtime.memcpy ''' + shape = src_view.shape itemsize = src_view.itemsize strides_src = [stride // itemsize for stride in src_view.strides] @@ -74,6 +75,8 @@ def _copy_array(src_view, dst_view): else: raise NotImplementedError + assert len(chunk_shape) > 0 + # Transfer data chunk-by-chunk outer_dims = shape[:-len(chunk_shape)] for outer_index in np.ndindex(*outer_dims): diff --git a/gpu4pyscf/lib/tests/test_cupy_helper.py b/gpu4pyscf/lib/tests/test_cupy_helper.py index 21556df2..b322f8ed 100644 --- a/gpu4pyscf/lib/tests/test_cupy_helper.py +++ b/gpu4pyscf/lib/tests/test_cupy_helper.py @@ -236,11 +236,11 @@ def test_copy_host2dev(self): def test_copy_dev2host(self): host_array = cupy.cuda.alloc_pinned_memory(10*10*10 * 8) - host_data = numpy.ndarray(10**3, dtype=cupy.float64, buffer=host_array) - host_data = host_data.reshape(10,10,10) + host_data = numpy.ndarray(3*10**2, dtype=cupy.float64, buffer=host_array) + host_data = host_data.reshape(3,10,10) device_data = cupy.zeros_like(host_data) - device_data += cupy.random.rand(10,10,10) + device_data += cupy.random.rand(3,10,10) host_view = host_data[:, 8:] # Non-contiguous view on the host device_view = device_data[:, 8:] # Non-contiguous view on the device diff --git a/gpu4pyscf/tests/test_benchmark_rks.py b/gpu4pyscf/tests/test_benchmark_rks.py index c1be2979..321fd56f 100644 --- a/gpu4pyscf/tests/test_benchmark_rks.py +++ b/gpu4pyscf/tests/test_benchmark_rks.py @@ -245,6 +245,8 @@ def test_df_rb3lyp_631gs_solvent_hessian(benchmark): print('testing df rb3lyp 631gs solvent hessian') assert np.isclose(np.linalg.norm(h), 3.9008165041707294, atol=1e-4) +# No need to test d3bj generally +''' # b3lyp d3bj @pytest.mark.benchmark def test_df_rb3lyp_631gs_d3bj(benchmark): @@ -261,4 +263,4 @@ def test_df_rb3lyp_631gs_d3bj_hessian(benchmark): h = benchmark(run_rb3lyp_hessian, small_mol, '6-31gs', True, True, 'd3bj') print('testing df rb3lyp 631gs solvent hessian') assert np.isclose(np.linalg.norm(h), 3.902367554157861, atol=1e-4) - +''' From ca18282854c1fb3d5e03047f7fd3517ea61a083e Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Tue, 31 Dec 2024 22:23:16 +0000 Subject: [PATCH 29/49] improve hcore derivatives --- examples/dft_driver.py | 1 - gpu4pyscf/df/df.py | 9 +- gpu4pyscf/df/hessian/jk.py | 12 +- gpu4pyscf/df/hessian/rhf.py | 13 ++- gpu4pyscf/df/int3c2e.py | 151 +++++++++++++++++--------- gpu4pyscf/hessian/rhf.py | 29 +++-- gpu4pyscf/hessian/rks.py | 126 ++++++++++++++++++++- gpu4pyscf/lib/memcpy.py | 10 +- gpu4pyscf/tests/test_benchmark_rks.py | 40 +++---- 9 files changed, 288 insertions(+), 103 deletions(-) diff --git a/examples/dft_driver.py b/examples/dft_driver.py index e0eccdda..0be7f410 100644 --- a/examples/dft_driver.py +++ b/examples/dft_driver.py @@ -27,7 +27,6 @@ parser.add_argument("--solvent", type=str, default='') args = parser.parse_args() -lib.num_threads(16) start_time = time.time() bas = args.basis mol = pyscf.M( diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py index 442c1bab..67b30c0a 100644 --- a/gpu4pyscf/df/df.py +++ b/gpu4pyscf/df/df.py @@ -347,14 +347,15 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, de ij1 = pairs_loc[cp_ij_id+1] if isinstance(_cderi[0], np.ndarray): for slice_id, (p0,p1) in enumerate(lib.prange(0, naux, blksize)): - for i in range(p0,p1): - #cderi_block[i].get(out=_cderi[slice_id][i-p0,ij0:ij1]) - copy_array(cderi_block[i], _cderi[slice_id][i-p0,ij0:ij1]) + #for i in range(p0,p1): + # cderi_block[i].get(out=_cderi[slice_id][i-p0,ij0:ij1]) + tmp = cupy.array(cderi_block[p0:p1], order='C', copy=True) + copy_array(tmp, _cderi[slice_id][:p1-p0,ij0:ij1]) else: # Copy data to other Devices for slice_id, (p0,p1) in enumerate(lib.prange(0, naux, blksize)): #_cderi[slice_id][:,ij0:ij1] = cderi_block[p0:p1] tmp = cupy.array(cderi_block[p0:p1], order='C', copy=True) p2p_transfer(_cderi[slice_id][:,ij0:ij1], tmp) - t1 = log.timer_debug1(f'transfer data for {cp_ij_id} / {nq} on Device {device_id}', *t1) + t1 = log.timer_debug1(f'transfer data for {cp_ij_id} / {nq} on Device {device_id}', *t1) return diff --git a/gpu4pyscf/df/hessian/jk.py b/gpu4pyscf/df/hessian/jk.py index 8a2b59bd..6b08cee5 100644 --- a/gpu4pyscf/df/hessian/jk.py +++ b/gpu4pyscf/df/hessian/jk.py @@ -314,8 +314,10 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo, # (20|0), (0|0)(0|00) int3c_blk = _get_int3c2e_ipip_slice('ipip1', intopt, cp_ij_id, aux_id, omega=omega) if with_j: - tmp = contract('xpji,ij->xpi', int3c_blk, dm0[i0:i1,j0:j1]) - hj_ipip1[:,i0:i1] += contract('xpi,p->xi', tmp, rhoj[k0:k1]) + #tmp = contract('xpji,ij->xpi', int3c_blk, dm0[i0:i1,j0:j1]) + #hj_ipip1[:,i0:i1] += contract('xpi,p->xi', tmp, rhoj[k0:k1]) + tmp = contract('xpji,p->xji', int3c_blk, rhoj[k0:k1]) + hj_ipip1[:,i0:i1] += contract('xji,ij->xi', tmp, dm0[i0:i1,j0:j1]) if with_k: hk_ipip1[:,i0:i1] += contract('xpji,pji->xi', int3c_blk, rhok_tmp) int3c_blk = None @@ -323,8 +325,10 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo, # (11|0), (0|0)(0|00) without response of RI basis int3c_blk = _get_int3c2e_ipip_slice('ipvip1', intopt, cp_ij_id, aux_id, omega=omega) if with_j: - tmp = contract('xpji,ij->xpij', int3c_blk, dm0[i0:i1,j0:j1]) - hj_ipvip1[:,i0:i1,j0:j1] += contract('xpij,p->xij', tmp, rhoj[k0:k1]) + #tmp = contract('xpji,ij->xpij', int3c_blk, dm0[i0:i1,j0:j1]) + #hj_ipvip1[:,i0:i1,j0:j1] += contract('xpij,p->xij', tmp, rhoj[k0:k1]) + tmp = contract('xpji,p->xji', int3c_blk, rhoj[k0:k1]) + hj_ipvip1[:,i0:i1,j0:j1] += contract('xji,ij->xij', tmp, dm0[i0:i1,j0:j1]) if with_k: hk_ipvip1[:,i0:i1,j0:j1] += contract('xpji,pji->xij', int3c_blk, rhok_tmp) int3c_blk = None diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py index 57dfc363..a9023dc6 100644 --- a/gpu4pyscf/df/hessian/rhf.py +++ b/gpu4pyscf/df/hessian/rhf.py @@ -200,7 +200,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls hk_ao_aux[i0:i1] -= contract('qoi,qioxy->iqxy', rhok0_P_I, wk1_I) wk1_I = rhok0_P_I = None rhok1_Pko = None - t1 = log.timer_debug1('contractions with int3c2e_ip1', *t1) + t1 = log.timer_debug1('contract int3c2e_ip1 with int2c_ip1', *t1) w, v = cupy.linalg.eigh(int2c) idx = w > LINEAR_DEP_THR @@ -215,13 +215,14 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls if isinstance(rhok1_Pko, cupy.ndarray): rhok1_Pko[:,i0:i1] = contract('qp,qiox->piox', cd_low, wk1_tmp) else: - rhok1_Pko[:,i0:i1] = contract('qp,qiox->piox', cd_low, wk1_tmp).get() + #rhok1_Pko[:,i0:i1] = contract('qp,qiox->piox', cd_low, wk1_tmp).get() + wk1_tmp = contract('qp,qiox->piox', cd_low, wk1_tmp) + copy_array(wk1_tmp, rhok1_Pko[:,i0:i1]) wk1_tmp = None cd_low = None - t1 = log.timer_debug1('data transfer', *t1) hk_ao_ao += _hk_ip1_ip1(rhok1_Pko, dm0, mocc_2) wk1_Pko = rhok1_Pko = None - t1 = log.timer_debug1('contractions with int3c2e_ip1', *t1) + t1 = log.timer_debug1('contract int3c2e_ip1 with int3c2e_ip1', *t1) hj_ipip, hk_ipip = jk.get_int3c2e_hjk(intopt, rhoj0_P, rhok0_P__, dm0_tag, with_j=with_j, with_k=with_k, omega=omega, @@ -344,7 +345,8 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls # ----------------------------------------- # collecting all # ----------------------------------------- - e1 = cupy.zeros([len(atmlst),len(atmlst),3,3]) + natm = len(atmlst) + e1 = cupy.zeros([natm,natm,3,3]) ej = hj_ipip ek = hk_ipip @@ -394,6 +396,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls _ek = cupy.sum(hk_aux_aux[p0:p1,q0:q1], axis=[0,1]) ek[i0,j0] += _ek * .5 ek[j0,i0] += _ek.T * .5 + for i0, ia in enumerate(atmlst): for j0 in range(i0): e1[j0,i0] = e1[i0,j0].T diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py index 8089ef76..54432c66 100644 --- a/gpu4pyscf/df/int3c2e.py +++ b/gpu4pyscf/df/int3c2e.py @@ -405,7 +405,8 @@ def get_int3c2e_wjk(mol, auxmol, dm0_tag, thred=1e-12, omega=None, with_j=True, if isinstance(wk, cupy.ndarray): wk[k0:k1] = rhok_tmp else: - rhok_tmp.get(out=wk[k0:k1]) + #rhok_tmp.get(out=wk[k0:k1]) + copy_array(rhok_tmp, wk[k0:k1]) return wj, wk def get_int3c2e_ip_jk(intopt, cp_aux_id, ip_type, rhoj, rhok, dm, omega=None, stream=None): @@ -770,6 +771,48 @@ def get_j_int3c2e_pass2(intopt, rhoj, stream=None): vj = vj + vj.T return vj +def _int3c2e_jk_task(intopt, task_list, dm0, mocc, device_id=0, omega=None): + with cupy.cuda.Device(device_id), _streams[device_id]: + log = logger.new_logger(intopt.mol, intopt.mol.verbose) + t0 = log.init_timer() + mocc = cupy.asarray(mocc) + dm0 = cupy.asarray(dm0) + naux = intopt.auxmol.nao + nocc = mocc.shape[1] + rhoj = cupy.zeros([naux]) + rhok = cupy.zeros([naux,nocc,nocc]) + for cp_kl_id in task_list: + k0 = intopt.aux_ao_loc[cp_kl_id] + k1 = intopt.aux_ao_loc[cp_kl_id+1] + rhoj_tmp = cupy.zeros([k1-k0], order='C') + rhok_tmp = cupy.zeros([k1-k0, nocc, nocc], order='C') + for cp_ij_id, _ in enumerate(intopt.log_qs): + cpi = intopt.cp_idx[cp_ij_id] + cpj = intopt.cp_jdx[cp_ij_id] + li = intopt.angular[cpi] + lj = intopt.angular[cpj] + int3c_blk = get_int3c2e_slice(intopt, cp_ij_id, cp_kl_id, omega=omega) + if not intopt.mol.cart: + int3c_blk = cart2sph(int3c_blk, axis=1, ang=lj) + int3c_blk = cart2sph(int3c_blk, axis=2, ang=li) + i0, i1 = intopt.ao_loc[cpi], intopt.ao_loc[cpi+1] + j0, j1 = intopt.ao_loc[cpj], intopt.ao_loc[cpj+1] + if cpi == cpj and intopt.aosym: + int3c_blk *= 0.5 + + rhoj_tmp += contract('pji,ij->p', int3c_blk, dm0[i0:i1,j0:j1]) + ints_o = contract('pji,jo->poi', int3c_blk, mocc[j0:j1]) + rhok_tmp += contract('poi,ir->por', ints_o, mocc[i0:i1]) + + if intopt.aosym: + rhoj[k0:k1] = 2.0 * rhoj_tmp + rhok[k0:k1] = rhok_tmp + rhok_tmp.transpose([0,2,1]) + else: + rhoj[k0:k1] = rhoj_tmp + rhok[k0:k1] = rhok_tmp + t0 = log.timer_debug1(f'int3c2e_vjk on Device {device_id}', *t0) + return rhoj, rhok + def get_int3c2e_jk(mol, auxmol, dm0_tag, with_k=True, omega=None): ''' get rhoj and rhok for int3c2e @@ -777,44 +820,46 @@ def get_int3c2e_jk(mol, auxmol, dm0_tag, with_k=True, omega=None): intopt = VHFOpt(mol, auxmol, 'int2e') intopt.build(1e-14, diag_block_with_triu=True, aosym=True, group_size=BLKSIZE, group_size_aux=BLKSIZE) - if omega is None: omega = 0.0 - naux = auxmol.nao orbo = cupy.asarray(dm0_tag.occ_coeff, order='C') - nocc = orbo.shape[1] - rhoj = cupy.empty([naux]) - rhok = cupy.empty([naux,nocc,nocc]) + futures = [] + aux_ao_loc = np.array(intopt.aux_ao_loc) + loads = aux_ao_loc[1:] - aux_ao_loc[:-1] + task_list = _split_tasks(loads, _num_devices) - for cp_kl_id, _ in enumerate(intopt.aux_log_qs): - k0 = intopt.aux_ao_loc[cp_kl_id] - k1 = intopt.aux_ao_loc[cp_kl_id+1] - rhoj_tmp = cupy.zeros([k1-k0], order='C') - rhok_tmp = cupy.zeros([k1-k0, nocc, nocc], order='C') - for cp_ij_id, _ in enumerate(intopt.log_qs): - cpi = intopt.cp_idx[cp_ij_id] - cpj = intopt.cp_jdx[cp_ij_id] - li = intopt.angular[cpi] - lj = intopt.angular[cpj] - int3c_blk = get_int3c2e_slice(intopt, cp_ij_id, cp_kl_id, omega=omega) - if not intopt.mol.cart: - int3c_blk = cart2sph(int3c_blk, axis=1, ang=lj) - int3c_blk = cart2sph(int3c_blk, axis=2, ang=li) - i0, i1 = intopt.ao_loc[cpi], intopt.ao_loc[cpi+1] - j0, j1 = intopt.ao_loc[cpj], intopt.ao_loc[cpj+1] - if cpi == cpj and intopt.aosym: - int3c_blk *= 0.5 + cupy.cuda.get_current_stream().synchronize() + with ThreadPoolExecutor(max_workers=_num_devices) as executor: + for device_id in range(_num_devices): + future = executor.submit( + _int3c2e_jk_task, intopt, task_list[device_id], + dm0_tag, orbo, device_id=device_id, omega=omega) + futures.append(future) - rhoj_tmp += contract('pji,ij->p', int3c_blk, dm0_tag[i0:i1,j0:j1]) - ints_o = contract('pji,jo->poi', int3c_blk, orbo[j0:j1]) - rhok_tmp += contract('poi,ir->por', ints_o, orbo[i0:i1]) + rhoj_total = [] + rhok_total = [] + for future in futures: + rhoj, rhok = future.result() + rhoj_total.append(rhoj) + rhok_total.append(rhok) - if intopt.aosym: - rhoj[k0:k1] = 2.0 * rhoj_tmp - rhok[k0:k1] = rhok_tmp + rhok_tmp.transpose([0,2,1]) - else: - rhoj[k0:k1] = rhoj_tmp - rhok[k0:k1] = rhok_tmp + rhoj = rhok = None + rhoj = reduce_to_device(rhoj_total, inplace=True) + if with_k: + rhok = reduce_to_device(rhok_total, inplace=True) return rhoj, rhok +def _split_tasks(loads, ngroups): + ''' Split a list of numbers into sublists with sums as close as possible + ''' + if ngroups == 1: + return [range(len(loads))] + groups = [[] for _ in range(ngroups)] + sums = [0] * 4 + for i, load in enumerate(loads): + min_index = sums.index(min(sums)) + groups[min_index].append(i) + sums[min_index] += load + return groups + def _int3c2e_ip1_vjk_task(intopt, task_k_list, rhoj, rhok, dm0, orbo, device_id=0, with_j=True, with_k=True, omega=None): natom = intopt.mol.natm @@ -823,6 +868,8 @@ def _int3c2e_ip1_vjk_task(intopt, task_k_list, rhoj, rhok, dm0, orbo, device_id= vj1_buf = vk1_buf = vj1 = vk1 = None with cupy.cuda.Device(device_id), _streams[device_id]: + log = logger.new_logger(intopt.mol, intopt.mol.verbose) + t0 = log.init_timer() ao2atom = get_ao2atom(intopt, aoslices) dm0 = cupy.asarray(dm0) orbo = cupy.asarray(orbo) @@ -875,7 +922,7 @@ def _int3c2e_ip1_vjk_task(intopt, task_k_list, rhoj, rhok, dm0, orbo, device_id= vk1[:,:,p0:p1] += contract('xiJo,ia->axJo', vk1_ao, ao2atom) rhok0_slice = vk1_ao = None rhok_tmp = int3c_ip1_occ = None - + t0 = log.timer_debug1(f'int3c2e_ip1 on Device {device_id}', *t0) # TODO: absorbe vj1_buf and vk1_buf into vj1 and vk1 return vj1_buf, vk1_buf, vj1, vk1 @@ -883,11 +930,10 @@ def get_int3c2e_ip1_vjk(intopt, rhoj, rhok, dm0_tag, aoslices, with_j=True, with_k=True, omega=None): orbo = cupy.asarray(dm0_tag.occ_coeff, order='C') futures = [] - ncp_k = len(intopt.aux_log_qs) - tasks = np.array(list(range(ncp_k))) - task_list = [] - for device_id in range(_num_devices): - task_list.append(tasks[device_id::_num_devices]) + + aux_ao_loc = np.array(intopt.aux_ao_loc) + loads = aux_ao_loc[1:] - aux_ao_loc[:-1] + task_list = _split_tasks(loads, _num_devices) cupy.cuda.get_current_stream().synchronize() with ThreadPoolExecutor(max_workers=_num_devices) as executor: @@ -926,6 +972,8 @@ def _int3c2e_ip2_vjk_task(intopt, task_list, rhoj, rhok, dm0, orbo, auxslices = intopt.auxmol.aoslice_by_atom() vj1 = vk1 = None with cupy.cuda.Device(device_id), _streams[device_id]: + log = logger.new_logger(intopt.mol, intopt.mol.verbose) + t0 = log.init_timer() aux2atom = get_aux2atom(intopt, auxslices) dm0 = cupy.asarray(dm0) orbo = cupy.asarray(orbo) @@ -970,6 +1018,7 @@ def _int3c2e_ip2_vjk_task(intopt, task_list, rhoj, rhok, dm0, orbo, vk1 += contract('xpir,pa->axir', vk1_tmp, aux2atom[k0:k1]) vk1_tmp = rhok0_oo = rhok0_slice = None rhok_tmp = wk2_P__ = None + t0 = log.timer_debug1(f'int3c2e_ip2 on Device {device_id}', *t0) return vj1, vk1 def get_int3c2e_ip2_vjk(intopt, rhoj, rhok, dm0_tag, auxslices, @@ -979,11 +1028,10 @@ def get_int3c2e_ip2_vjk(intopt, rhoj, rhok, dm0_tag, auxslices, ''' orbo = cupy.asarray(dm0_tag.occ_coeff, order='C') futures = [] - ncp_k = len(intopt.aux_log_qs) - tasks = np.array(list(range(ncp_k))) - task_list = [] - for device_id in range(_num_devices): - task_list.append(tasks[device_id::_num_devices]) + + aux_ao_loc = np.array(intopt.aux_ao_loc) + loads = aux_ao_loc[1:] - aux_ao_loc[:-1] + task_list = _split_tasks(loads, _num_devices) cupy.cuda.get_current_stream().synchronize() with ThreadPoolExecutor(max_workers=_num_devices) as executor: @@ -1013,6 +1061,8 @@ def _int3c2e_ip1_wjk_task(intopt, task_list, dm0, orbo, wk, device_id=0, with_k= naux = intopt.auxmol.nao aux_ao_loc = intopt.aux_ao_loc with cupy.cuda.Device(device_id), _streams[device_id]: + log = logger.new_logger(intopt.mol, intopt.mol.verbose) + t0 = log.init_timer() ncp_ij = len(intopt.log_qs) nocc = orbo.shape[1] wj = cupy.zeros([naux,nao,3]) @@ -1033,6 +1083,7 @@ def _int3c2e_ip1_wjk_task(intopt, task_list, dm0, orbo, wk, device_id=0, with_k= #wk_tmp.get(out=wk[k0:k1]) copy_array(wk_tmp, wk[k0:k1]) wk_tmp = None + t0 = log.timer_debug1(f'int3c2e_ip1_wjk on Device {device_id}', *t0) return wj def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None): @@ -1040,11 +1091,10 @@ def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None): ''' orbo = cupy.asarray(dm0_tag.occ_coeff, order='C') futures = [] - ncp_k = len(intopt.aux_log_qs) - tasks = np.array(list(range(ncp_k))) - task_list = [] - for device_id in range(_num_devices): - task_list.append(tasks[device_id::_num_devices]) + + aux_ao_loc = np.array(intopt.aux_ao_loc) + loads = aux_ao_loc[1:] - aux_ao_loc[:-1] + task_list = _split_tasks(loads, _num_devices) nao = intopt.mol.nao naux = intopt.auxmol.nao @@ -1070,6 +1120,8 @@ def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None): def _int3c2e_ip2_wjk(intopt, task_list, dm0, orbo, with_k=True, omega=None, device_id=0): with cupy.cuda.Device(device_id), _streams[device_id]: + log = logger.new_logger(intopt.mol, intopt.mol.verbose) + t0 = log.init_timer() dm0 = cupy.asarray(dm0) orbo = cupy.asarray(orbo) naux = intopt.auxmol.nao @@ -1086,6 +1138,7 @@ def _int3c2e_ip2_wjk(intopt, task_list, dm0, orbo, with_k=True, omega=None, devi wk[k0:k1] += contract('piox,ir->prox', tmp, orbo[i0:i1]) tmp = None int3c_blk = None + t0 = log.timer_debug1(f'int3c2e_ip2_wjk on Device {device_id}', *t0) return wj, wk def get_int3c2e_ip2_wjk(intopt, dm0_tag, with_k=True, omega=None): diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py index d7596d13..52150457 100644 --- a/gpu4pyscf/hessian/rhf.py +++ b/gpu4pyscf/hessian/rhf.py @@ -835,14 +835,14 @@ def _e_hcore_generator(hessobj, dm): h1aa = cupy.asarray(h1aa) h1ab = cupy.asarray(h1ab) - hcore = cupy.empty((3,3,nao,nao)) t1 = log.timer_debug1('get_hcore', *t1) def get_hcore(iatm, jatm): - nonlocal hcore ish0, ish1, i0, i1 = aoslices[iatm] jsh0, jsh1, j0, j1 = aoslices[jatm] rinv2aa = rinv2ab = None if iatm == jatm: + de = contract('xypq,pq->xy', h1aa[:,:,i0:i1], dm[i0:i1]) + de+= contract('xypq,pq->xy', h1ab[:,:,i0:i1,i0:i1], dm[i0:i1,i0:i1]) with mol.with_rinv_at_nucleus(iatm): # The remaining integrals like int1e_ipiprinv are computed in # hess_nuc_elec(mol, dm) @@ -853,18 +853,16 @@ def get_hcore(iatm, jatm): rinv2ab = cupy.asarray(rinv2ab) rinv2aa = rinv2aa.reshape(3,3,nao,nao) rinv2ab = rinv2ab.reshape(3,3,nao,nao) - hcore[:] = 0. - hcore[:,:,i0:i1] += h1aa[:,:,i0:i1] - hcore[:,:,i0:i1,i0:i1] += h1ab[:,:,i0:i1,i0:i1] + if rinv2aa is not None or rinv2ab is not None: - hcore -= rinv2aa + rinv2ab + hcore = -(rinv2aa + rinv2ab) hcore[:,:,i0:i1] += rinv2aa[:,:,i0:i1] hcore[:,:,i0:i1] += rinv2ab[:,:,i0:i1] hcore[:,:,:,i0:i1] += rinv2aa[:,:,i0:i1].transpose(0,1,3,2) hcore[:,:,:,i0:i1] += rinv2ab[:,:,:,i0:i1] + de += cupy.einsum('xypq,pq->xy', hcore, dm) else: - hcore[:] = 0. - hcore[:,:,i0:i1,j0:j1] += h1ab[:,:,i0:i1,j0:j1] + de = contract('xypq,pq->xy',h1ab[:,:,i0:i1,j0:j1],dm[i0:i1,j0:j1]) with mol.with_rinv_at_nucleus(iatm): if with_ecp and iatm in ecp_atoms: shls_slice = (jsh0, jsh1, 0, nbas) @@ -872,8 +870,9 @@ def get_hcore(iatm, jatm): rinv2ab = -mol.intor('ECPscalar_iprinvip', comp=9, shls_slice=shls_slice) rinv2aa = cupy.asarray(rinv2aa) rinv2ab = cupy.asarray(rinv2ab) - hcore[:,:,j0:j1] += rinv2aa.reshape(3,3,j1-j0,nao) - hcore[:,:,j0:j1] += rinv2ab.reshape(3,3,j1-j0,nao).transpose(1,0,2,3) + hcore = rinv2aa.reshape(3,3,j1-j0,nao) + hcore+= rinv2ab.reshape(3,3,j1-j0,nao).transpose(1,0,2,3) + de += contract('xypq,pq->xy', hcore, dm[j0:j1]) with mol.with_rinv_at_nucleus(jatm): if with_ecp and jatm in ecp_atoms: shls_slice = (ish0, ish1, 0, nbas) @@ -881,11 +880,11 @@ def get_hcore(iatm, jatm): rinv2ab = -mol.intor('ECPscalar_iprinvip', comp=9, shls_slice=shls_slice) rinv2aa = cupy.asarray(rinv2aa) rinv2ab = cupy.asarray(rinv2ab) - hcore[:,:,i0:i1] += rinv2aa.reshape(3,3,i1-i0,nao) - hcore[:,:,i0:i1] += rinv2ab.reshape(3,3,i1-i0,nao) - de = cupy.einsum('xypq,pq->xy', hcore, dm) - de += cupy.einsum('xyqp,pq->xy', hcore, dm) - return cp.asarray(de + de_nuc_elec[:,:,iatm,jatm]) + hcore = rinv2aa.reshape(3,3,i1-i0,nao) + hcore+= rinv2ab.reshape(3,3,i1-i0,nao) + de += contract('xypq,pq->xy', hcore, dm[i0:i1]) + # 2.0* due to the symmetry + return cp.asarray(2.0*de + de_nuc_elec[:,:,iatm,jatm]) return get_hcore def hcore_generator(hessobj, mol=None): diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py index bffc221c..912748c7 100644 --- a/gpu4pyscf/hessian/rks.py +++ b/gpu4pyscf/hessian/rks.py @@ -28,7 +28,7 @@ from gpu4pyscf.grad import rks as rks_grad from gpu4pyscf.dft import numint from gpu4pyscf.lib.cupy_helper import (contract, add_sparse, get_avail_mem, - reduce_to_device) + reduce_to_device, transpose_sum) from gpu4pyscf.lib import logger from gpu4pyscf.__config__ import _streams, _num_devices from gpu4pyscf.hessian import jk @@ -702,6 +702,124 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory): vmat = reduce_to_device(vmat_dist, inplace=True) return vmat +def _nr_rks_fxc_mo_task(ni, mol, grids, xc_code, fxc, mo_coeff, mo1, mocc, + verbose=None, hermi=1, device_id=0): + with cupy.cuda.Device(device_id), _streams[device_id]: + if mo_coeff is not None: mo_coeff = cupy.asarray(mo_coeff) + if mo1 is not None: mo1 = cupy.asarray(mo1) + if mocc is not None: mocc = cupy.asarray(mocc) + if fxc is not None: fxc = cupy.asarray(fxc) + + assert isinstance(verbose, int) + log = logger.new_logger(mol, verbose) + xctype = ni._xc_type(xc_code) + opt = getattr(ni, 'gdftopt', None) + + _sorted_mol = opt.mol + nao = mol.nao + nset = mo1.shape[0] + vmat = cupy.zeros((nset, nao, nao)) + + if xctype == 'LDA': + ao_deriv = 0 + else: + ao_deriv = 1 + + ngrids_glob = grids.coords.shape[0] + ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices + grid_start = device_id * ngrids_per_device + grid_end = (device_id + 1) * ngrids_per_device + + p0 = p1 = grid_start + t1 = t0 = log.init_timer() + for ao, mask, weights, coords in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, + max_memory=None, blksize=None, + grid_range=(grid_start, grid_end)): + p0, p1 = p1, p1+len(weights) + occ_coeff_mask = mocc[mask] + rho1 = numint.eval_rho4(_sorted_mol, ao, 2.0*occ_coeff_mask, mo1[:,mask], + xctype=xctype, hermi=hermi) + t1 = log.timer_debug2('eval rho', *t1) + + # precompute fxc_w + if xctype == 'LDA': + fxc_w = fxc[0,0,p0:p1] * weights + wv = rho1 * fxc_w + else: + fxc_w = fxc[:,:,p0:p1] * weights + wv = contract('axg,xyg->ayg', rho1, fxc_w) + + for i in range(nset): + if xctype == 'LDA': + vmat_tmp = ao.dot(numint._scale_ao(ao, wv[i]).T) + elif xctype == 'GGA': + wv[i,0] *= .5 + aow = numint._scale_ao(ao, wv[i]) + vmat_tmp = aow.dot(ao[0].T) + elif xctype == 'NLC': + raise NotImplementedError('NLC') + else: + wv[i,0] *= .5 + wv[i,4] *= .5 + vmat_tmp = ao[0].dot(numint._scale_ao(ao[:4], wv[i,:4]).T) + vmat_tmp+= numint._tau_dot(ao, ao, wv[i,4]) + add_sparse(vmat[i], vmat_tmp, mask) + + t1 = log.timer_debug2('integration', *t1) + ao = rho1 = None + t0 = log.timer_debug1('vxc', *t0) + if xctype != 'LDA': + transpose_sum(vmat) + vmat = jk._ao2mo(vmat, mocc, mo_coeff) + return vmat + +def nr_rks_fxc_mo(ni, mol, grids, xc_code, dm0=None, dms=None, mo_coeff=None, relativity=0, hermi=0, + rho0=None, vxc=None, fxc=None, max_memory=2000, verbose=None): + log = logger.new_logger(mol, verbose) + t0 = log.init_timer() + if fxc is None: + raise RuntimeError('fxc was not initialized') + #xctype = ni._xc_type(xc_code) + opt = getattr(ni, 'gdftopt', None) + if opt is None or mol not in [opt.mol, opt._sorted_mol]: + ni.build(mol, grids.coords) + opt = ni.gdftopt + + nao = mol.nao + dms = cupy.asarray(dms) + dm_shape = dms.shape + # AO basis -> gdftopt AO basis + with_mocc = hasattr(dms, 'mo1') + mo1 = mocc = None + if with_mocc: + mo1 = opt.sort_orbitals(dms.mo1, axis=[1]) + mocc = opt.sort_orbitals(dms.occ_coeff, axis=[0]) + mo_coeff = opt.sort_orbitals(mo_coeff, axis=[0]) + dms = opt.sort_orbitals(dms.reshape(-1,nao,nao), axis=[1,2]) + + futures = [] + cupy.cuda.get_current_stream().synchronize() + with ThreadPoolExecutor(max_workers=_num_devices) as executor: + for device_id in range(_num_devices): + future = executor.submit( + _nr_rks_fxc_mo_task, + ni, mol, grids, xc_code, fxc, mo_coeff, mo1, mocc, + verbose=log.verbose, hermi=hermi, device_id=device_id) + futures.append(future) + dms = None + vmat_dist = [] + for future in futures: + vmat_dist.append(future.result()) + vmat = reduce_to_device(vmat_dist, inplace=True) + #vmat = opt.unsort_orbitals(vmat, axis=[1,2]) + #if xctype != 'LDA': + # transpose_sum(vmat) + + if len(dm_shape) == 2: + vmat = vmat[0] + t0 = log.timer_debug1('nr_rks_fxc', *t0) + return cupy.asarray(vmat) + def get_veff_resp_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1, omega=None): mol = hessobj.mol mf = hessobj.base @@ -728,10 +846,10 @@ def get_veff_resp_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1, omega=None): # TODO: evaluate v1 in MO rho0, vxc, fxc = ni.cache_xc_kernel(mol, grids, mf.xc, mo_coeff, mo_occ, 0) - v1 = ni.nr_rks_fxc(mol, grids, mf.xc, None, dms, 0, hermi, + v1 = nr_rks_fxc_mo(ni, mol, grids, mf.xc, None, dms, mo_coeff, 0, hermi, rho0, vxc, fxc, max_memory=None) - v1 = jk._ao2mo(v1, mocc, mo_coeff).reshape(-1,nmo*nocc) - + v1 = v1.reshape(-1,nmo*nocc) + if hybrid: vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, hermi=1) vk *= hyb diff --git a/gpu4pyscf/lib/memcpy.py b/gpu4pyscf/lib/memcpy.py index 0f4faa6b..19b19e41 100644 --- a/gpu4pyscf/lib/memcpy.py +++ b/gpu4pyscf/lib/memcpy.py @@ -47,7 +47,9 @@ def _copy_array(src_view, dst_view): ''' Copy data from cupy/numpy array to another cupy/numpy array Check memory layout, then copy memory chunks by cupy.cuda.runtime.memcpy ''' - + if src_view.nbytes == 0: + return dst_view + shape = src_view.shape itemsize = src_view.itemsize strides_src = [stride // itemsize for stride in src_view.strides] @@ -75,6 +77,12 @@ def _copy_array(src_view, dst_view): else: raise NotImplementedError + + if len(chunk_shape) == 0: + print('here') + print(src_view.nbytes, dst_view.nbytes) + print(shape, strides_src, strides_dst) + assert len(chunk_shape) > 0 # Transfer data chunk-by-chunk diff --git a/gpu4pyscf/tests/test_benchmark_rks.py b/gpu4pyscf/tests/test_benchmark_rks.py index 321fd56f..e709eafc 100644 --- a/gpu4pyscf/tests/test_benchmark_rks.py +++ b/gpu4pyscf/tests/test_benchmark_rks.py @@ -32,7 +32,7 @@ current_folder = os.path.dirname(os.path.abspath(__file__)) small_mol = os.path.join(current_folder, '020_Vitamin_C.xyz') -median_mol = os.path.join(current_folder, '057_Tamoxifen.xyz') +medium_mol = os.path.join(current_folder, '057_Tamoxifen.xyz') large_mol = os.path.join(current_folder, '095_Azadirachtin.xyz') def run_rb3lyp(atom, basis, with_df, with_solvent, disp=None): @@ -117,39 +117,39 @@ def test_rb3lyp_hessian(benchmark): print('testing rb3lyp hessian') assert np.isclose(np.linalg.norm(h), 3.7588443634477833, atol=1e-4) -# median molecule +# medium molecule @pytest.mark.benchmark -def test_df_rb3lyp_median(benchmark): - e = benchmark(run_rb3lyp, median_mol, 'def2-tzvpp', True, False) - print('testing df rb3lyp median') +def test_df_rb3lyp_medium(benchmark): + e = benchmark(run_rb3lyp, medium_mol, 'def2-tzvpp', True, False) + print('testing df rb3lyp medium') assert np.isclose(np.linalg.norm(e), 1138.371390377773, atol=1e-7) @pytest.mark.benchmark -def test_df_rb3lyp_grad_median(benchmark): - g = benchmark(run_rb3lyp_grad, median_mol, 'def2-tzvpp', True, False) - print('testing df rb3lyp grad median') +def test_df_rb3lyp_grad_medium(benchmark): + g = benchmark(run_rb3lyp_grad, medium_mol, 'def2-tzvpp', True, False) + print('testing df rb3lyp grad medium') assert np.isclose(np.linalg.norm(g), 0.26010545073602614, atol=1e-4) @pytest.mark.benchmark -def test_df_rb3lyp_hessian_median(benchmark): - h = benchmark(run_rb3lyp_hessian, median_mol, 'def2-tzvpp', True, False) - print('testing df rb3lyp hessian median') +def test_df_rb3lyp_hessian_medium(benchmark): + h = benchmark(run_rb3lyp_hessian, medium_mol, 'def2-tzvpp', True, False) + print('testing df rb3lyp hessian medium') assert np.isclose(np.linalg.norm(h), 6.32514169232998, atol=1e-4) @pytest.mark.benchmark -def test_rb3lyp_median(benchmark): - e = benchmark(run_rb3lyp, median_mol, 'def2-tzvpp', False, False) - print('testing rb3lyp median') +def test_rb3lyp_medium(benchmark): + e = benchmark(run_rb3lyp, medium_mol, 'def2-tzvpp', False, False) + print('testing rb3lyp medium') assert np.isclose(np.linalg.norm(e), 1138.3710752128077, atol=1e-7) @pytest.mark.benchmark -def test_rb3lyp_grad_median(benchmark): - g = benchmark(run_rb3lyp_grad, median_mol, 'def2-tzvpp', False, False) - print('testing rb3lyp grad median') +def test_rb3lyp_grad_medium(benchmark): + g = benchmark(run_rb3lyp_grad, medium_mol, 'def2-tzvpp', False, False) + print('testing rb3lyp grad medium') assert np.isclose(np.linalg.norm(g), 0.2601443836937988, atol=1e-5) @pytest.mark.slow @pytest.mark.benchmark -def test_rb3lyp_hessian_median(benchmark): - h = benchmark(run_rb3lyp_hessian, median_mol, 'def2-tzvpp', False, False) - print('testing rb3lyp hessian median') +def test_rb3lyp_hessian_medium(benchmark): + h = benchmark(run_rb3lyp_hessian, medium_mol, 'def2-tzvpp', False, False) + print('testing rb3lyp hessian medium') assert np.isclose(np.linalg.norm(h), 6.312714778020796, atol=1e-4) # large molecule From 904763e04ca54a5cd651deec8df6a6cf6ada47ed Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Wed, 1 Jan 2025 01:48:55 +0000 Subject: [PATCH 30/49] cupy copy -> copy_array --- .../cupy_helper/benchmark_memory_copy.py | 71 +++++++++++++------ gpu4pyscf/df/hessian/rhf.py | 7 +- gpu4pyscf/df/int3c2e.py | 12 ++-- gpu4pyscf/lib/cupy_helper.py | 21 +++--- 4 files changed, 66 insertions(+), 45 deletions(-) diff --git a/benchmarks/cupy_helper/benchmark_memory_copy.py b/benchmarks/cupy_helper/benchmark_memory_copy.py index c658674f..5e36ffe5 100644 --- a/benchmarks/cupy_helper/benchmark_memory_copy.py +++ b/benchmarks/cupy_helper/benchmark_memory_copy.py @@ -37,52 +37,54 @@ print("Host View Shape:", host_view.shape) print("Device View Shape:", device_view.shape) -''' + print("------ Benchmark device to host transfer ----------") size = host_view.nbytes -perf_custom = profiler.benchmark(copy_array, (host_view, device_view), n_repeat=100, n_warmup=3) +perf_custom = profiler.benchmark(copy_array, (host_view, device_view), n_repeat=20, n_warmup=3) t_kernel = perf_custom.gpu_times.mean() bandwidth = size / t_kernel / 1e9 -print('using custom function', t_kernel) -print(f"Effective PCIe Bandwidth: {bandwidth:.2f} GB/s") +print('Using custom function', t_kernel) +print(f"Effective Bandwidth: {bandwidth:.2f} GB/s") def cupy_copy(c, out): out[:] = cp.asarray(c) return out -perf_cupy = profiler.benchmark(cupy_copy, (host_view, device_view), n_repeat=100, n_warmup=3) +perf_cupy = profiler.benchmark(cupy_copy, (host_view, device_view), n_repeat=20, n_warmup=3) t_kernel = perf_cupy.gpu_times.mean() bandwidth = size / t_kernel / 1e9 -print('using cupy function', t_kernel) -print(f"Effective PCIe Bandwidth: {bandwidth:.2f} GB/s") +print('Using cupy function', t_kernel) +print(f"Effective Bandwidth: {bandwidth:.2f} GB/s") print("------- Benchmark host to device transfer ---------") size = host_view.nbytes -perf_custom = profiler.benchmark(copy_array, (device_view, host_view), n_repeat=100, n_warmup=3) +perf_custom = profiler.benchmark(copy_array, (device_view, host_view), n_repeat=20, n_warmup=3) t_kernel = perf_custom.gpu_times.mean() bandwidth = size / t_kernel / 1e9 -print('using custom function', t_kernel) -print(f"Effective PCIe Bandwidth: {bandwidth:.2f} GB/s") +print('Using custom function', t_kernel) +print(f"Effective Bandwidth: {bandwidth:.2f} GB/s") def cupy_copy(c, out): out[:] = c.get() return out -perf_cupy = profiler.benchmark(cupy_copy, (device_view, host_view), n_repeat=100, n_warmup=3) +perf_cupy = profiler.benchmark(cupy_copy, (device_view, host_view), n_repeat=20, n_warmup=3) t_kernel = perf_cupy.gpu_times.mean() bandwidth = size / t_kernel / 1e9 -print('using cupy function', t_kernel) -print(f"Effective PCIe Bandwidth: {bandwidth:.2f} GB/s") -''' +print('Using cupy function', t_kernel) +print(f"Effective Bandwidth: {bandwidth:.2f} GB/s") + +print("-------- Benchmark device to device transfer (non-contiguous) ---------") + with cp.cuda.Device(0): a = cp.random.rand(512,512,512) - device0_view = a[:,128:, 128:] + device0_view = a[:,128:] with cp.cuda.Device(1): b = cp.random.rand(512,512,512) - device1_view = b[:,128:, 128:] -perf_cupy = profiler.benchmark(copy_array, (device0_view, device1_view), n_repeat=100, n_warmup=3) + device1_view = b[:,128:] +perf_cupy = profiler.benchmark(copy_array, (device0_view, device1_view), n_repeat=20, n_warmup=3) t_kernel = perf_cupy.gpu_times.mean() bandwidth = device0_view.nbytes / t_kernel / 1e9 -print('using custom function', t_kernel) -print(f"Effective PCIe Bandwidth: {bandwidth:.2f} GB/s") +print('Using custom function', t_kernel) +print(f"Effective Bandwidth: {bandwidth:.2f} GB/s") assert np.linalg.norm(device0_view.get() - device1_view.get()) < 1e-10 @@ -90,8 +92,33 @@ def cupy_copy(c, out): with cp.cuda.Device(out.device): out[:] = cp.asarray(c.get()) return out -perf_cupy = profiler.benchmark(cupy_copy, (device0_view, device1_view), n_repeat=100, n_warmup=3) +perf_cupy = profiler.benchmark(cupy_copy, (device0_view, device1_view), n_repeat=20, n_warmup=3) +t_kernel = perf_cupy.gpu_times.mean() +bandwidth = device0_view.nbytes / t_kernel / 1e9 +print('Using cupy function', t_kernel) +print(f"Effective Bandwidth: {bandwidth:.2f} GB/s") + +print("-------- Benchmark device to device transfer (contiguous) ---------") +perf_cupy = profiler.benchmark(copy_array, (a, b), n_repeat=20, n_warmup=3) t_kernel = perf_cupy.gpu_times.mean() bandwidth = device0_view.nbytes / t_kernel / 1e9 -print('using cupy function', t_kernel) -print(f"Effective PCIe Bandwidth: {bandwidth:.2f} GB/s") +print('Using custom function', t_kernel) +print(f"Effective Bandwidth: {bandwidth:.2f} GB/s") + +def cupy_copy_contiguous(a, b): + b[:] = a +perf_cupy = profiler.benchmark(cupy_copy, (a, b), n_repeat=20, n_warmup=3) +t_kernel = perf_cupy.gpu_times.mean() +bandwidth = device0_view.nbytes / t_kernel / 1e9 +print('Cupy copy contiguous array', t_kernel) +print(f"Effective Bandwidth: {bandwidth:.2f} GB/s") + +def cupy_set_contiguous(a, b): + b.set(a) +perf_cupy = profiler.benchmark(cupy_copy, (a, b), n_repeat=20, n_warmup=3) +t_kernel = perf_cupy.gpu_times.mean() +bandwidth = device0_view.nbytes / t_kernel / 1e9 +print('Cupy set contiguous array', t_kernel) +print(f"Effective Bandwidth: {bandwidth:.2f} GB/s") + +assert np.linalg.norm(a.get() - b.get()) < 1e-10 diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py index a9023dc6..a29a50bd 100644 --- a/gpu4pyscf/df/hessian/rhf.py +++ b/gpu4pyscf/df/hessian/rhf.py @@ -145,11 +145,11 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls # int3c_ip2 contributions wj_ip2, wk_ip2_P__ = int3c2e.get_int3c2e_ip2_wjk(intopt, dm0_tag, omega=omega) - t1 = log.timer_debug1('interdeidate variables with int3c2e_ip2', *t1) + t1 = log.timer_debug1('interdediate variables with int3c2e_ip2', *t1) # int3c_ip1 contributions wj1_P, wk1_Pko = int3c2e.get_int3c2e_ip1_wjk(intopt, dm0_tag, omega=omega) - t1 = log.timer_debug1('interdeidate variables with int3c2e_ip1', *t1) + t1 = log.timer_debug1('interdediate variables with int3c2e_ip1', *t1) #rhoj1_P = contract('pq,pix->qix', int2c_inv, wj1_P) if with_j: @@ -332,6 +332,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls # pi,qi,i->pq dme0 = cupy.dot(mocc, (mocc * mo_energy[mo_occ>0] * 2).T) de_hcore = rhf_hess._e_hcore_generator(hessobj, dm0) + t1 = log.timer_debug1('hcore generate', *t1) # ------------------------------------ # overlap matrix contributions @@ -396,7 +397,6 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls _ek = cupy.sum(hk_aux_aux[p0:p1,q0:q1], axis=[0,1]) ek[i0,j0] += _ek * .5 ek[j0,i0] += _ek.T * .5 - for i0, ia in enumerate(atmlst): for j0 in range(i0): e1[j0,i0] = e1[i0,j0].T @@ -404,7 +404,6 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls ej[j0,i0] = ej[i0,j0].T if with_k: ek[j0,i0] = ek[i0,j0].T - t1 = log.timer_debug1('hcore contribution', *t1) aux2atom = int3c2e.get_aux2atom(intopt, auxslices) diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py index 54432c66..f4d3bbab 100644 --- a/gpu4pyscf/df/int3c2e.py +++ b/gpu4pyscf/df/int3c2e.py @@ -771,7 +771,7 @@ def get_j_int3c2e_pass2(intopt, rhoj, stream=None): vj = vj + vj.T return vj -def _int3c2e_jk_task(intopt, task_list, dm0, mocc, device_id=0, omega=None): +def _int3c2e_jk_task(intopt, task_k_list, dm0, mocc, device_id=0, omega=None): with cupy.cuda.Device(device_id), _streams[device_id]: log = logger.new_logger(intopt.mol, intopt.mol.verbose) t0 = log.init_timer() @@ -781,7 +781,7 @@ def _int3c2e_jk_task(intopt, task_list, dm0, mocc, device_id=0, omega=None): nocc = mocc.shape[1] rhoj = cupy.zeros([naux]) rhok = cupy.zeros([naux,nocc,nocc]) - for cp_kl_id in task_list: + for cp_kl_id in task_k_list: k0 = intopt.aux_ao_loc[cp_kl_id] k1 = intopt.aux_ao_loc[cp_kl_id+1] rhoj_tmp = cupy.zeros([k1-k0], order='C') @@ -803,7 +803,7 @@ def _int3c2e_jk_task(intopt, task_list, dm0, mocc, device_id=0, omega=None): rhoj_tmp += contract('pji,ij->p', int3c_blk, dm0[i0:i1,j0:j1]) ints_o = contract('pji,jo->poi', int3c_blk, mocc[j0:j1]) rhok_tmp += contract('poi,ir->por', ints_o, mocc[i0:i1]) - + int3c_blk = ints_o = None if intopt.aosym: rhoj[k0:k1] = 2.0 * rhoj_tmp rhok[k0:k1] = rhok_tmp + rhok_tmp.transpose([0,2,1]) @@ -853,7 +853,7 @@ def _split_tasks(loads, ngroups): if ngroups == 1: return [range(len(loads))] groups = [[] for _ in range(ngroups)] - sums = [0] * 4 + sums = [0] * ngroups for i, load in enumerate(loads): min_index = sums.index(min(sums)) groups[min_index].append(i) @@ -965,7 +965,7 @@ def get_int3c2e_ip1_vjk(intopt, rhoj, rhok, dm0_tag, aoslices, with_j=True, return vj1_buf, vk1_buf, vj1, vk1 -def _int3c2e_ip2_vjk_task(intopt, task_list, rhoj, rhok, dm0, orbo, +def _int3c2e_ip2_vjk_task(intopt, task_k_list, rhoj, rhok, dm0, orbo, device_id=0, with_j=True, with_k=True, omega=None): natom = intopt.mol.natm nao = intopt.mol.nao @@ -985,7 +985,7 @@ def _int3c2e_ip2_vjk_task(intopt, task_list, rhoj, rhok, dm0, orbo, vk1 = cupy.zeros([natom,3,nao,nocc]) aux_ao_loc = intopt.aux_ao_loc ncp_ij = len(intopt.log_qs) - for cp_k in task_list: + for cp_k in task_k_list: task_list = [(cp_k, cp_ij) for cp_ij in range(ncp_ij)] k0, k1 = aux_ao_loc[cp_k], aux_ao_loc[cp_k+1] if with_j: diff --git a/gpu4pyscf/lib/cupy_helper.py b/gpu4pyscf/lib/cupy_helper.py index 41e11307..898a2846 100644 --- a/gpu4pyscf/lib/cupy_helper.py +++ b/gpu4pyscf/lib/cupy_helper.py @@ -88,22 +88,16 @@ def p2p_transfer(a, b): a[:] = b elif _p2p_access: a[:] = b + ''' elif a.strides == b.strides and a.flags.c_contiguous and a.dtype == b.dtype: # cupy supports a direct copy from different devices without p2p. See also # https://github.com/cupy/cupy/blob/v13.3.0/cupy/_core/_routines_indexing.pyx#L48 # https://github.com/cupy/cupy/blob/v13.3.0/cupy/_core/_routines_indexing.pyx#L1015 a[:] = b + ''' else: - #copy_array(b, a) - with cupy.cuda.Device(a.device): - # TODO: reduce memory copy, a can be non-contiguous array - #a[:] = cupy.asarray(b.get()) - copy_array(b, a) - if np.linalg.norm(a.get() - b.get()) > 1e-3: - print(a[:5], a.device, a.strides, a.shape) - print(b[:5], b.device, b.strides, b.shape) - print(a.shape, b.shape) - exit() + copy_array(b, a) + def concatenate(array_list): ''' Concatenate axis=0 only ''' @@ -118,7 +112,8 @@ def concatenate(array_list): p0 = p1 = 0 for a in array_list_cpu: p1 = p0 + a.shape[0] - out[p0:p1].set(a) + #out[p0:p1].set(a) + copy_array(a, out[p0:p1]) p0 = p1 return out @@ -153,8 +148,8 @@ def reduce_to_device(array_list, inplace=False): matrix = matrix.reshape(-1) blksize = 1024*1024*128 # 1GB for p0, p1 in lib.prange(0,len(matrix), blksize): - result[p0:p1] += cupy.asarray(matrix[p0:p1]) - + result[p0:p1] += copy_array(matrix[p0:p1])#cupy.asarray(matrix[p0:p1]) + #result[p0:p1] += cupy.asarray(matrix[p0:p1]) return result.reshape(out_shape) def device2host_2d(a_cpu, a_gpu, stream=None): From ef2553343fdcc4680c60406882ddf15bbd155d5e Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Thu, 2 Jan 2025 06:09:45 +0000 Subject: [PATCH 31/49] optimize multi-GPU --- .../cupy_helper/benchmark_memory_copy.py | 9 +- gpu4pyscf/df/df.py | 39 +++--- gpu4pyscf/df/grad/jk.py | 115 ++++++++++++++++- gpu4pyscf/df/grad/rhf.py | 71 ++-------- gpu4pyscf/df/grad/uhf.py | 48 ++----- gpu4pyscf/df/hessian/rhf.py | 17 ++- gpu4pyscf/df/hessian/uhf.py | 5 +- gpu4pyscf/df/int3c2e.py | 121 +++++++++--------- gpu4pyscf/lib/cupy_helper.py | 13 +- gpu4pyscf/tests/test_benchmark_rks.py | 6 +- 10 files changed, 248 insertions(+), 196 deletions(-) diff --git a/benchmarks/cupy_helper/benchmark_memory_copy.py b/benchmarks/cupy_helper/benchmark_memory_copy.py index 5e36ffe5..d10f97ac 100644 --- a/benchmarks/cupy_helper/benchmark_memory_copy.py +++ b/benchmarks/cupy_helper/benchmark_memory_copy.py @@ -107,15 +107,16 @@ def cupy_copy(c, out): def cupy_copy_contiguous(a, b): b[:] = a -perf_cupy = profiler.benchmark(cupy_copy, (a, b), n_repeat=20, n_warmup=3) +perf_cupy = profiler.benchmark(cupy_copy_contiguous, (a, b), n_repeat=20, n_warmup=3) t_kernel = perf_cupy.gpu_times.mean() bandwidth = device0_view.nbytes / t_kernel / 1e9 print('Cupy copy contiguous array', t_kernel) print(f"Effective Bandwidth: {bandwidth:.2f} GB/s") -def cupy_set_contiguous(a, b): - b.set(a) -perf_cupy = profiler.benchmark(cupy_copy, (a, b), n_repeat=20, n_warmup=3) +def cupy_asarray_contiguous(a, b): + with cp.cuda.Device(b.device): + b = cp.asarray(a) +perf_cupy = profiler.benchmark(cupy_asarray_contiguous, (a, b), n_repeat=20, n_warmup=3) t_kernel = perf_cupy.gpu_times.mean() bandwidth = device0_view.nbytes / t_kernel / 1e9 print('Cupy set contiguous array', t_kernel) diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py index 67b30c0a..a7f04370 100644 --- a/gpu4pyscf/df/df.py +++ b/gpu4pyscf/df/df.py @@ -138,13 +138,12 @@ def get_blksize(self, extra=0, nao=None): ''' if nao is None: nao = self.nao mem_avail = get_avail_mem() - blksize = int(mem_avail*0.2/8/(nao*nao + extra) / ALIGNED) * ALIGNED + blksize = int(mem_avail*0.4/8/(nao*nao + extra) / ALIGNED) * ALIGNED blksize = min(blksize, MIN_BLK_SIZE) log = logger.new_logger(self.mol, self.mol.verbose) device_id = cupy.cuda.Device().id log.debug(f"{mem_avail/1e9:.3f} GB memory available on Device {device_id}, block size = {blksize}") - if blksize < ALIGNED: - raise RuntimeError("Not enough GPU memory") + assert blksize > 0 return blksize def loop(self, blksize=None, unpack=True): @@ -227,12 +226,16 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, log.debug("Saving CDERI on CPU") _cderi = {} - blksize = (naux + _num_devices - 1) // _num_devices - for device_id, (p0,p1) in enumerate(lib.prange(0, naux, blksize)): + aux_blksize = (naux + _num_devices - 1) // _num_devices + aux_blksize = (aux_blksize + ALIGNED - 1) // ALIGNED * ALIGNED + for device_id in range(_num_devices): + p0 = min(aux_blksize*device_id, naux) + p1 = min(aux_blksize*(device_id+1), naux) + #for device_id, (p0,p1) in enumerate(lib.prange(0, naux, aux_blksize)): if use_gpu_memory: with cupy.cuda.Device(device_id), _streams[device_id]: _cderi[device_id] = cupy.empty([p1-p0, npairs]) - log.debug(f"CDERI size {_cderi[device_id].nbytes/GB:.3f} on Device {device_id}") + log.debug(f"CDERI size {_cderi[device_id].nbytes/GB:.3f} GB on Device {device_id}") else: mem = cupy.cuda.alloc_pinned_memory((p1-p0) * npairs * 8) cderi_blk = np.ndarray([p1-p0, npairs], dtype=np.float64, order='C', buffer=mem) @@ -254,7 +257,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, with ThreadPoolExecutor(max_workers=_num_devices) as executor: for device_id in range(_num_devices): task_list = task_list_per_device[device_id] - future = executor.submit(_cderi_task, intopt, cd_low_f, task_list, _cderi, + future = executor.submit(_cderi_task, intopt, cd_low_f, task_list, _cderi, aux_blksize, omega=omega, sr_only=sr_only, device_id=device_id) futures.append(future) @@ -266,7 +269,8 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, return _cderi -def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, device_id=0): +def _cderi_task(intopt, cd_low, task_list, _cderi, aux_blksize, + omega=None, sr_only=False, device_id=0): ''' Execute CDERI tasks on one device ''' nq = len(intopt.log_qs) @@ -275,7 +279,6 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, de naoaux = cd_low.shape[0] npairs = [len(intopt.ao_pairs_row[cp_ij]) for cp_ij in range(len(intopt.log_qs))] pairs_loc = np.append(0, np.cumsum(npairs)) - blksize = (naux + _num_devices - 1) // _num_devices with cupy.cuda.Device(device_id), _streams[device_id]: assert isinstance(mol.verbose, int) log = logger.new_logger(mol, mol.verbose) @@ -346,16 +349,18 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, de ij0 = pairs_loc[cp_ij_id] ij1 = pairs_loc[cp_ij_id+1] if isinstance(_cderi[0], np.ndarray): - for slice_id, (p0,p1) in enumerate(lib.prange(0, naux, blksize)): - #for i in range(p0,p1): - # cderi_block[i].get(out=_cderi[slice_id][i-p0,ij0:ij1]) + for slice_id, (p0,p1) in enumerate(lib.prange(0, naux, aux_blksize)): tmp = cupy.array(cderi_block[p0:p1], order='C', copy=True) copy_array(tmp, _cderi[slice_id][:p1-p0,ij0:ij1]) - else: - # Copy data to other Devices - for slice_id, (p0,p1) in enumerate(lib.prange(0, naux, blksize)): - #_cderi[slice_id][:,ij0:ij1] = cderi_block[p0:p1] + elif _num_devices > 1: + # Multi-GPU case, copy data to other Devices + for dev_id, (p0,p1) in enumerate(lib.prange(0, naux, aux_blksize)): + # Making a copy for contiguous data transfer tmp = cupy.array(cderi_block[p0:p1], order='C', copy=True) - p2p_transfer(_cderi[slice_id][:,ij0:ij1], tmp) + with cupy.cuda.Device(dev_id): + tmp = copy_array(tmp) + _cderi[dev_id][:,ij0:ij1] = tmp + else: + _cderi[slice_id][:,ij0:ij1] = cderi_block[p0:p1] t1 = log.timer_debug1(f'transfer data for {cp_ij_id} / {nq} on Device {device_id}', *t1) return diff --git a/gpu4pyscf/df/grad/jk.py b/gpu4pyscf/df/grad/jk.py index 4139726e..2bbf9d9e 100644 --- a/gpu4pyscf/df/grad/jk.py +++ b/gpu4pyscf/df/grad/jk.py @@ -13,8 +13,10 @@ # limitations under the License. from concurrent.futures import ThreadPoolExecutor +import numpy as np import cupy -from gpu4pyscf.lib.cupy_helper import contract, concatenate +from gpu4pyscf.df.int3c2e import get_int3c2e_ip_jk, VHFOpt, _split_tasks +from gpu4pyscf.lib.cupy_helper import contract, concatenate, reduce_to_device from gpu4pyscf.lib import logger from gpu4pyscf.__config__ import _streams, _num_devices @@ -54,7 +56,7 @@ def _jk_task(with_df, dm, orbo, with_j=True, with_k=True, device_id=0): t0 = log.timer_debug1(f'rhoj and rhok on Device {device_id}', *t0) return rhoj, rhok -def get_rhoj_rhok(with_df, dm, orbo, with_j=True, with_k=True): +def get_rhojk(with_df, dm, orbo, with_j=True, with_k=True): ''' Calculate rhoj and rhok on Multi-GPU system ''' futures = [] @@ -80,3 +82,112 @@ def get_rhoj_rhok(with_df, dm, orbo, with_j=True, with_k=True): rhok = concatenate(rhok_total) return rhoj, rhok + +def _jk_ip_task(intopt, rhoj_cart, dm_cart, rhok_cart, orbo_cart, task_list, + with_j=True, with_k=True, device_id=0, omega=None): + mol = intopt.mol + with cupy.cuda.Device(device_id), _streams[device_id]: + log = logger.new_logger(mol, mol.verbose) + t0 = (logger.process_clock(), logger.perf_counter()) + + orbo_cart = cupy.asarray(orbo_cart) + cart_aux_loc = intopt.cart_aux_loc + nao_cart = dm_cart.shape[0] + naux_cart = intopt._sorted_auxmol.nao + vj = vk = vjaux = vkaux = None + if with_j: + rhoj_cart = cupy.asarray(rhoj_cart) + dm_cart = cupy.asarray(dm_cart) + vj = cupy.zeros((3,nao_cart), order='C') + vjaux = cupy.zeros((3,naux_cart)) + if with_k: + rhok_cart = cupy.asarray(rhok_cart) + vk = cupy.zeros((3,nao_cart), order='C') + vkaux = cupy.zeros((3,naux_cart)) + + for cp_kl_id in task_list: + k0, k1 = cart_aux_loc[cp_kl_id], cart_aux_loc[cp_kl_id+1] + rhoj_tmp = rhok_tmp = None + if with_j: + rhoj_tmp = rhoj_cart[k0:k1] + if with_k: + rhok_tmp = contract('por,ir->pio', rhok_cart[k0:k1], orbo_cart) + rhok_tmp = contract('pio,jo->pji', rhok_tmp, orbo_cart) + ''' + if(rhoj_tmp.flags['C_CONTIGUOUS'] == False): + rhoj_tmp = rhoj_tmp.astype(cupy.float64, order='C') + + if(rhok_tmp.flags['C_CONTIGUOUS'] == False): + rhok_tmp = rhok_tmp.astype(cupy.float64, order='C') + ''' + ''' + # outcore implementation + buf = int3c2e.get_int3c2e_ip_slice(intopt, cp_kl_id, 1) + size = 3*(k1-k0)*nao_cart*nao_cart + int3c_ip = buf[:size].reshape([3,k1-k0,nao_cart,nao_cart], order='C') + rhoj_tmp0 = contract('xpji,ij->xip', int3c_ip, dm_cart) + vj_outcore = contract('xip,p->xi', rhoj_tmp0, rhoj_cart[k0:k1]) + vk_outcore = contract('pji,xpji->xi', rhok_tmp, int3c_ip) + + buf = int3c2e.get_int3c2e_ip_slice(intopt, cp_kl_id, 2) + int3c_ip = buf[:size].reshape([3,k1-k0,nao_cart,nao_cart], order='C') + rhoj_tmp0 = contract('xpji,ji->xp', int3c_ip, dm_cart) + vjaux_outcore = contract('xp,p->xp', rhoj_tmp0, rhoj_cart[k0:k1]) + vkaux_outcore = contract('xpji,pji->xp', int3c_ip, rhok_tmp) + ''' + vj_tmp, vk_tmp = get_int3c2e_ip_jk(intopt, cp_kl_id, 'ip1', rhoj_tmp, rhok_tmp, dm_cart, omega=omega) + if with_j: vj += vj_tmp + if with_k: vk += vk_tmp + vj_tmp, vk_tmp = get_int3c2e_ip_jk(intopt, cp_kl_id, 'ip2', rhoj_tmp, rhok_tmp, dm_cart, omega=omega) + if with_j: vjaux[:, k0:k1] = vj_tmp + if with_k: vkaux[:, k0:k1] = vk_tmp + + rhoj_tmp = rhok_tmp = vj_tmp = vk_tmp = None + t0 = log.timer_debug1(f'calculate {cp_kl_id:3d} / {len(intopt.aux_log_qs):3d}, {k1-k0:3d} slices', *t0) + return vj, vk, vjaux, vkaux + +def get_grad_vjk(with_df, mol, auxmol, rhoj_cart, dm_cart, rhok_cart, orbo_cart, + with_j=True, with_k=True, omega=None): + ''' + Calculate vj = (i'j|L)(L|kl)(ij)(kl), vk = (i'j|L)(L|kl)(ik)(jl) + vjaux = (ij|L')(L|kl)(ij)(kl), vkaux = (ij|L')(L|kl)(ik)(jl) + ''' + nao_cart = dm_cart.shape[0] + block_size = with_df.get_blksize(nao=nao_cart) + + intopt = VHFOpt(mol, auxmol, 'int2e') + intopt.build(1e-14, diag_block_with_triu=True, aosym=False, + group_size_aux=block_size, verbose=0)#, group_size=block_size) + + aux_ao_loc = np.array(intopt.aux_ao_loc) + loads = aux_ao_loc[1:] - aux_ao_loc[:-1] + task_list = _split_tasks(loads, _num_devices) + + futures = [] + cupy.cuda.get_current_stream().synchronize() + with ThreadPoolExecutor(max_workers=_num_devices) as executor: + for device_id in range(_num_devices): + future = executor.submit( + _jk_ip_task, intopt, rhoj_cart, dm_cart, rhok_cart, orbo_cart, task_list[device_id], + with_j=with_j, with_k=with_k, device_id=device_id, omega=omega) + futures.append(future) + + rhoj_total = [] + rhok_total = [] + vjaux_total = [] + vkaux_total = [] + for future in futures: + rhoj, rhok, vjaux, vkaux = future.result() + rhoj_total.append(rhoj) + rhok_total.append(rhok) + vjaux_total.append(vjaux) + vkaux_total.append(vkaux) + + rhoj = rhok = vjaux = vkaux = None + if with_j: + rhoj = reduce_to_device(rhoj_total) + vjaux = reduce_to_device(vjaux_total) + if with_k: + rhok = reduce_to_device(rhok_total) + vkaux = reduce_to_device(vkaux_total) + return rhoj, rhok, vjaux, vkaux diff --git a/gpu4pyscf/df/grad/rhf.py b/gpu4pyscf/df/grad/rhf.py index 7c1c901a..ea0537ed 100644 --- a/gpu4pyscf/df/grad/rhf.py +++ b/gpu4pyscf/df/grad/rhf.py @@ -22,7 +22,7 @@ from gpu4pyscf.grad import rhf as rhf_grad from gpu4pyscf import __config__ from gpu4pyscf.lib import logger -from gpu4pyscf.df.grad.jk import get_rhoj_rhok +from gpu4pyscf.df.grad.jk import get_rhojk, get_grad_vjk LINEAR_DEP_THRESHOLD = df.LINEAR_DEP_THR MIN_BLK_SIZE = getattr(__config__, 'min_ao_blksize', 128) @@ -62,7 +62,6 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega # extended to any 1-particle density matrix if(dm0 is None): dm0 = mf_grad.base.make_rdm1() - mf = mf_grad.base if omega is None: with_df = mf_grad.base.with_df else: @@ -92,7 +91,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega mo_coeff = None orbo = intopt.sort_orbitals(orbo, axis=[0]) - rhoj, rhok = get_rhoj_rhok(with_df, dm, orbo, with_j=with_j, with_k=with_k) + rhoj, rhok = get_rhojk(with_df, dm, orbo, with_j=with_j, with_k=with_k) # (d/dX P|Q) contributions if omega and omega > 1e-10: @@ -102,6 +101,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega int2c_e1 = auxmol.intor('int2c2e_ip1') int2c_e1 = cupy.asarray(int2c_e1) + rhoj_cart = rhok_cart = None auxslices = auxmol.aoslice_by_atom() aux_cart2sph = intopt.aux_cart2sph low = with_df.cd_low @@ -129,6 +129,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega elif low.tag == 'cd': #rhok = solve_triangular(low_t, rhok, lower=False) rhok = solve_triangular(low_t, rhok.reshape(naux, -1), lower=False, overwrite_b=True).reshape(naux, nocc, nocc) + rhok = rhok.copy(order='C') tmp = contract('pij,qij->pq', rhok, rhok) tmp = intopt.unsort_orbitals(tmp, aux_axis=[0,1]) vkaux = -contract('xpq,pq->xp', int2c_e1, tmp) @@ -143,12 +144,6 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega t0 = log.timer_debug1('rhoj and rhok', *t0) int2c_e1 = None - nao_cart = intopt._sorted_mol.nao - block_size = with_df.get_blksize(nao=nao_cart) - - intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e') - intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, - group_size_aux=block_size)#, group_size=block_size) dm_cart = dm orbo_cart = orbo if not mol.cart: @@ -156,63 +151,13 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega cart2sph = intopt.cart2sph orbo_cart = cart2sph @ orbo dm_cart = cart2sph @ dm @ cart2sph.T - - dm = orbo = None - vj = vk = rhoj_tmp = rhok_tmp = None - vjaux = vkaux = None - - naux_cart = intopt._sorted_auxmol.nao - if with_j: - vj = cupy.zeros((3,nao_cart), order='C') - vjaux = cupy.zeros((3,naux_cart)) - if with_k: - vk = cupy.zeros((3,nao_cart), order='C') - vkaux = cupy.zeros((3,naux_cart)) - cupy.get_default_memory_pool().free_all_blocks() - t1 = log.init_timer() - for cp_kl_id in range(len(intopt.aux_log_qs)): - k0, k1 = intopt.cart_aux_loc[cp_kl_id], intopt.cart_aux_loc[cp_kl_id+1] - assert k1-k0 <= block_size - if with_j: - rhoj_tmp = rhoj_cart[k0:k1] - if with_k: - rhok_tmp = contract('por,ir->pio', rhok_cart[k0:k1], orbo_cart) - rhok_tmp = contract('pio,jo->pji', rhok_tmp, orbo_cart) - ''' - if(rhoj_tmp.flags['C_CONTIGUOUS'] == False): - rhoj_tmp = rhoj_tmp.astype(cupy.float64, order='C') - - if(rhok_tmp.flags['C_CONTIGUOUS'] == False): - rhok_tmp = rhok_tmp.astype(cupy.float64, order='C') - ''' - ''' - # outcore implementation - buf = int3c2e.get_int3c2e_ip_slice(intopt, cp_kl_id, 1) - size = 3*(k1-k0)*nao_cart*nao_cart - int3c_ip = buf[:size].reshape([3,k1-k0,nao_cart,nao_cart], order='C') - rhoj_tmp0 = contract('xpji,ij->xip', int3c_ip, dm_cart) - vj_outcore = contract('xip,p->xi', rhoj_tmp0, rhoj_cart[k0:k1]) - vk_outcore = contract('pji,xpji->xi', rhok_tmp, int3c_ip) - - buf = int3c2e.get_int3c2e_ip_slice(intopt, cp_kl_id, 2) - int3c_ip = buf[:size].reshape([3,k1-k0,nao_cart,nao_cart], order='C') - rhoj_tmp0 = contract('xpji,ji->xp', int3c_ip, dm_cart) - vjaux_outcore = contract('xp,p->xp', rhoj_tmp0, rhoj_cart[k0:k1]) - vkaux_outcore = contract('xpji,pji->xp', int3c_ip, rhok_tmp) - ''' - vj_tmp, vk_tmp = int3c2e.get_int3c2e_ip_jk(intopt, cp_kl_id, 'ip1', rhoj_tmp, rhok_tmp, dm_cart, omega=omega) - if with_j: vj += vj_tmp - if with_k: vk += vk_tmp - vj_tmp, vk_tmp = int3c2e.get_int3c2e_ip_jk(intopt, cp_kl_id, 'ip2', rhoj_tmp, rhok_tmp, dm_cart, omega=omega) - if with_j: vjaux[:, k0:k1] = vj_tmp - if with_k: vkaux[:, k0:k1] = vk_tmp - - rhoj_tmp = rhok_tmp = vj_tmp = vk_tmp = None - t1 = log.timer_debug1(f'calculate {cp_kl_id:3d} / {len(intopt.aux_log_qs):3d}, {k1-k0:3d} slices', *t1) + vj, vk, vjaux, vkaux = get_grad_vjk(with_df, mol, auxmol, rhoj_cart, dm_cart, rhok_cart, orbo_cart, + with_j=with_j, with_k=with_k, omega=omega) # NOTE: vj and vk are still in cartesian _sorted_mol = intopt._sorted_mol natm = _sorted_mol.natm + nao_cart = _sorted_mol.nao ao2atom = numpy.zeros([nao_cart, natm]) ao_loc = _sorted_mol.ao_loc for ibas, iatm in enumerate(_sorted_mol._bas[:,gto.ATOM_OF]): @@ -226,6 +171,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega _sorted_auxmol = intopt._sorted_auxmol natm = _sorted_auxmol.natm + naux_cart = _sorted_auxmol.nao aux2atom = numpy.zeros([naux_cart, natm]) ao_loc = _sorted_auxmol.ao_loc for ibas, iatm in enumerate(_sorted_auxmol._bas[:,gto.ATOM_OF]): @@ -238,7 +184,6 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega if with_k: vkaux_3c = aux2atom.T @ vkaux.T vkaux = vkaux_2c - vkaux_3c - return vj, vk, vjaux, vkaux diff --git a/gpu4pyscf/df/grad/uhf.py b/gpu4pyscf/df/grad/uhf.py index fc8de3be..42107967 100644 --- a/gpu4pyscf/df/grad/uhf.py +++ b/gpu4pyscf/df/grad/uhf.py @@ -18,11 +18,11 @@ from cupyx.scipy.linalg import solve_triangular from pyscf import scf, gto from gpu4pyscf.df import int3c2e -from gpu4pyscf.lib.cupy_helper import tag_array, contract, load_library +from gpu4pyscf.lib.cupy_helper import tag_array, contract from gpu4pyscf.grad import uhf as uhf_grad from gpu4pyscf import __config__ from gpu4pyscf.lib import logger -from gpu4pyscf.df.grad.jk import get_rhoj_rhok +from gpu4pyscf.df.grad.jk import get_rhojk, get_grad_vjk FREE_CUPY_CACHE = True BINSIZE = 128 @@ -80,39 +80,9 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, # (L|ij) -> rhoj: (L), rhok: (L|oo) low = with_df.cd_low - rhoj, rhok = get_rhoj_rhok(with_df, dm, orbo, with_j=with_j, with_k=with_k) + rhoj, rhok = get_rhojk(with_df, dm, orbo, with_j=with_j, with_k=with_k) if dm2 is not None: - rhoj2, _ = get_rhoj_rhok(with_df, dm2_tmp, orbo, with_j=with_j, with_k=False) - ''' - rows = with_df.intopt.cderi_row - cols = with_df.intopt.cderi_col - dm_sparse = dm[rows, cols] - dm_sparse[with_df.intopt.cderi_diag] *= .5 - if dm2 is not None: - dm2_sparse = dm2_tmp[rows, cols] - dm2_sparse[with_df.intopt.cderi_diag] *= .5 - - blksize = with_df.get_blksize() - if with_j: - rhoj = cupy.empty([naux]) - if dm2 is not None: - rhoj2 = cupy.empty([naux]) - if with_k: - rhok = cupy.empty([naux, nocc, nocc], order='C') - p0 = p1 = 0 - - for cderi, cderi_sparse in with_df.loop(blksize=blksize): - p1 = p0 + cderi.shape[0] - if with_j: - rhoj[p0:p1] = 2.0*dm_sparse.dot(cderi_sparse) - if dm2 is not None: - rhoj2[p0:p1] = 2.0*dm2_sparse.dot(cderi_sparse) - if with_k: - tmp = contract('Lij,jk->Lki', cderi, orbo) - contract('Lki,il->Lkl', tmp, orbo, out=rhok[p0:p1]) - p0 = p1 - tmp = dm_sparse = cderi_sparse = cderi = None - ''' + rhoj2, _ = get_rhojk(with_df, dm2_tmp, orbo, with_j=with_j, with_k=False) # (d/dX P|Q) contributions if omega and omega > 1e-10: @@ -120,7 +90,9 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, int2c_e1 = auxmol.intor('int2c2e_ip1') else: int2c_e1 = auxmol.intor('int2c2e_ip1') + int2c_e1 = cupy.asarray(int2c_e1) + rhoj_cart = rhok_cart = None auxslices = auxmol.aoslice_by_atom() aux_cart2sph = intopt.aux_cart2sph low_t = low.T.copy() @@ -154,6 +126,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, rhok = contract('pq,qij->pij', low_t.T, rhok) elif low.tag == 'cd': rhok = solve_triangular(low_t, rhok.reshape(naux, -1), lower=False, overwrite_b=True).reshape(naux, nocc, nocc) + rhok = rhok.copy(order='C') tmp = contract('pij,qij->pq', rhok, rhok) tmp = intopt.unsort_orbitals(tmp, aux_axis=[0,1]) vkaux = -contract('xpq,pq->xp', int2c_e1, tmp) @@ -192,6 +165,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, orbo_cart = orbo dm = orbo = None + """ vj = vk = rhoj_tmp = rhok_tmp = None vjaux = vkaux = None @@ -243,7 +217,10 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, rhoj_tmp = rhok_tmp = vj_tmp = vk_tmp = None t1 = log.timer_debug1(f'calculate {cp_kl_id:3d} / {len(intopt.aux_log_qs):3d}, {k1-k0:3d} slices', *t1) - + """ + vj, vk, vjaux, vkaux = get_grad_vjk(with_df, mol, auxmol, rhoj_cart, dm_cart, rhok_cart, orbo_cart, + with_j=with_j, with_k=with_k, omega=omega) + # NOTE: vj and vk are still in cartesian _sorted_mol = intopt._sorted_mol natm = _sorted_mol.natm @@ -260,6 +237,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, _sorted_auxmol = intopt._sorted_auxmol natm = _sorted_auxmol.natm + naux_cart = _sorted_auxmol.nao aux2atom = np.zeros([naux_cart, natm]) ao_loc = _sorted_auxmol.ao_loc for ibas, iatm in enumerate(_sorted_auxmol._bas[:,gto.ATOM_OF]): diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py index a29a50bd..47faa476 100644 --- a/gpu4pyscf/df/hessian/rhf.py +++ b/gpu4pyscf/df/hessian/rhf.py @@ -38,7 +38,7 @@ from gpu4pyscf.df.hessian import jk LINEAR_DEP_THR = df.LINEAR_DEP_THR -BLKSIZE = 128 +BLKSIZE = 256 ALIGNED = getattr(__config__, 'ao_aligned', 32) GB = 1024*1024*1024 @@ -111,7 +111,8 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls # ================================ sorted AO begin =============================================== intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e') - intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE) + intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, verbose=0, + group_size=BLKSIZE, group_size_aux=BLKSIZE) naux = auxmol.nao mocc_2 = intopt.sort_orbitals(mocc_2, axis=[0]) dm0 = intopt.sort_orbitals(dm0, axis=[0,1]) @@ -135,21 +136,23 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls # int3c contributions wj, wk_P__ = int3c2e.get_int3c2e_jk(mol, auxmol, dm0_tag, omega=omega) - t1 = log.timer_debug1('intermediate variables with int3c2e', *t1) rhoj0_P = rhok0_P__ = None + if with_j: rhoj0_P = solve_j2c(wj) + wj = None if with_k: rhok0_P__ = solve_j2c(wk_P__) - wj = wk_P__ = None + wk_P__ = None + t1 = log.timer_debug1('intermediate variables with int3c2e', *t1) # int3c_ip2 contributions wj_ip2, wk_ip2_P__ = int3c2e.get_int3c2e_ip2_wjk(intopt, dm0_tag, omega=omega) - t1 = log.timer_debug1('interdediate variables with int3c2e_ip2', *t1) + t1 = log.timer_debug1('intermediate variables with int3c2e_ip2', *t1) # int3c_ip1 contributions wj1_P, wk1_Pko = int3c2e.get_int3c2e_ip1_wjk(intopt, dm0_tag, omega=omega) - t1 = log.timer_debug1('interdediate variables with int3c2e_ip1', *t1) + t1 = log.timer_debug1('intermediate variables with int3c2e_ip1', *t1) #rhoj1_P = contract('pq,pix->qix', int2c_inv, wj1_P) if with_j: @@ -473,7 +476,7 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, - aosym=False, + aosym=False, verbose=0, group_size_aux=BLKSIZE, group_size=BLKSIZE) naux = auxmol.nao diff --git a/gpu4pyscf/df/hessian/uhf.py b/gpu4pyscf/df/hessian/uhf.py index 1b18fc9a..b77015f6 100644 --- a/gpu4pyscf/df/hessian/uhf.py +++ b/gpu4pyscf/df/hessian/uhf.py @@ -96,7 +96,8 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, # ================================ sorted AO begin =============================================== intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e') - intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE) + intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, verbose=0, + group_size=BLKSIZE, group_size_aux=BLKSIZE) mocca = intopt.sort_orbitals(mocca, axis=[0]) moccb = intopt.sort_orbitals(moccb, axis=[0]) @@ -495,7 +496,7 @@ def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e') intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, - aosym=False, + aosym=False, verbose=0, group_size_aux=BLKSIZE, group_size=BLKSIZE) diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py index f4d3bbab..8bfa8a81 100644 --- a/gpu4pyscf/df/int3c2e.py +++ b/gpu4pyscf/df/int3c2e.py @@ -21,7 +21,7 @@ from pyscf.scf import _vhf from gpu4pyscf.scf.int4c2e import BasisProdCache, libgvhf, libgint from gpu4pyscf.lib.cupy_helper import (block_c2s_diag, cart2sph, contract, get_avail_mem, - reduce_to_device, copy_array) + reduce_to_device, copy_array, transpose_sum) from gpu4pyscf.lib import logger from gpu4pyscf.gto.mole import basis_seg_contraction from gpu4pyscf.__config__ import _num_devices, _streams @@ -29,7 +29,7 @@ LMAX_ON_GPU = 8 FREE_CUPY_CACHE = True STACK_SIZE_PER_THREAD = 8192 * 4 -BLKSIZE = 128 +BLKSIZE = 256 NROOT_ON_GPU = 7 def make_fake_mol(): @@ -103,8 +103,8 @@ def __del__(self): except AttributeError: pass - def build(self, cutoff=1e-14, group_size=None, - group_size_aux=None, diag_block_with_triu=False, aosym=False): + def build(self, cutoff=1e-14, group_size=None, group_size_aux=None, + diag_block_with_triu=False, aosym=False, verbose=None): ''' int3c2e is based on int2e with (ao,ao|aux,1) a tot_mol is created with concatenating [mol, fake_mol, aux_mol] @@ -116,7 +116,9 @@ def build(self, cutoff=1e-14, group_size=None, mol = basis_seg_contraction(_mol, allow_replica=True)[0] auxmol = basis_seg_contraction(_auxmol, allow_replica=True)[0] - log = logger.new_logger(_mol, _mol.verbose) + if verbose is None: + verbose = _mol.verbose + log = logger.new_logger(_mol, verbose) cput0 = log.init_timer() _sorted_mol, sorted_idx, uniq_l_ctr, l_ctr_counts = sort_mol(mol, log=log) @@ -218,28 +220,10 @@ def build(self, cutoff=1e-14, group_size=None, self.pair2bra = pair2bra self.pair2ket = pair2ket self.l_ctr_offsets = l_ctr_offsets - bas_pair2shls = np.hstack(pair2bra + pair2ket).astype(np.int32).reshape(2,-1) - bas_pairs_locs = np.append(0, np.cumsum([x.size for x in pair2bra])).astype(np.int32) - log_qs = log_qs + aux_log_qs - ao_loc = _tot_mol.ao_loc_nr(cart=True) - ncptype = len(log_qs) self._bpcache = {} - for n in range(_num_devices): - with cupy.cuda.Device(n), _streams[n]: - bpcache = ctypes.POINTER(BasisProdCache)() - scale_shellpair_diag = 1. - libgint.GINTinit_basis_prod( - ctypes.byref(bpcache), ctypes.c_double(scale_shellpair_diag), - ao_loc.ctypes.data_as(ctypes.c_void_p), - bas_pair2shls.ctypes.data_as(ctypes.c_void_p), - bas_pairs_locs.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(ncptype), - _tot_mol._atm.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(_tot_mol.natm), - _tot_mol._bas.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(_tot_mol.nbas), - _tot_mol._env.ctypes.data_as(ctypes.c_void_p)) - self._bpcache[n] = bpcache - cput1 = log.timer_debug1('Initialize GPU cache', *cput1) + bas_pairs_locs = np.append(0, np.cumsum([x.size for x in pair2bra])).astype(np.int32) self.bas_pairs_locs = bas_pairs_locs ncptype = len(self.log_qs) self.aosym = aosym @@ -264,6 +248,27 @@ def build(self, cutoff=1e-14, group_size=None, @property def bpcache(self): device_id = cupy.cuda.Device().id + if device_id not in self._bpcache: + with cupy.cuda.Device(device_id), _streams[device_id]: + log = logger.new_logger(self.mol, self.mol.verbose) + cput0 = log.init_timer() + bpcache = ctypes.POINTER(BasisProdCache)() + scale_shellpair_diag = 1. + _tot_mol = self._tot_mol + log_qs = self.log_qs + self.aux_log_qs + ao_loc = _tot_mol.ao_loc_nr(cart=True) + bas_pair2shls = np.hstack(self.pair2bra + self.pair2ket).astype(np.int32).reshape(2,-1) + ncptype = len(log_qs) + libgint.GINTinit_basis_prod( + ctypes.byref(bpcache), ctypes.c_double(scale_shellpair_diag), + ao_loc.ctypes.data_as(ctypes.c_void_p), + bas_pair2shls.ctypes.data_as(ctypes.c_void_p), + self.bas_pairs_locs.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(ncptype), + _tot_mol._atm.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(_tot_mol.natm), + _tot_mol._bas.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(_tot_mol.nbas), + _tot_mol._env.ctypes.data_as(ctypes.c_void_p)) + self._bpcache[device_id] = bpcache + cput0 = log.timer_debug1(f'Initialize GPU cache on Device {device_id}', *cput0) bpcache = self._bpcache[device_id] return bpcache @@ -496,16 +501,6 @@ def loop_int3c2e_general(intopt, task_list=None, ip_type='', omega=None, stream= ao_loc = intopt.ao_loc aux_ao_loc = intopt.aux_ao_loc comp = 3**order - - lmax = intopt._sorted_mol._bas[:gto.ANG_OF].max() - aux_lmax = intopt._sorted_auxmol._bas[:gto.ANG_OF].max() - nroots = (lmax + aux_lmax + order)//2 + 1 - if nroots > NROOT_ON_GPU: - from pyscf.gto.moleintor import getints, make_cintopt - pmol = intopt._tot_mol - intor = pmol._add_suffix('int3c2e_' + ip_type) - opt = make_cintopt(pmol._atm, pmol._bas, pmol._env, intor) - nbins = 1 # If task_list is not given, generate all the tasks @@ -558,6 +553,11 @@ def loop_int3c2e_general(intopt, task_list=None, ip_type='', omega=None, stream= if err != 0: raise RuntimeError(f'GINT_fill_int3c2e general failed, err={err}') else: + from pyscf.gto.moleintor import getints, make_cintopt + pmol = intopt._tot_mol + intor = pmol._add_suffix('int3c2e_' + ip_type) + opt = make_cintopt(pmol._atm, pmol._bas, pmol._env, intor) + # TODO: sph2cart in CPU? ishl0, ishl1 = intopt.l_ctr_offsets[cpi], intopt.l_ctr_offsets[cpi+1] jshl0, jshl1 = intopt.l_ctr_offsets[cpj], intopt.l_ctr_offsets[cpj+1] @@ -806,7 +806,7 @@ def _int3c2e_jk_task(intopt, task_k_list, dm0, mocc, device_id=0, omega=None): int3c_blk = ints_o = None if intopt.aosym: rhoj[k0:k1] = 2.0 * rhoj_tmp - rhok[k0:k1] = rhok_tmp + rhok_tmp.transpose([0,2,1]) + rhok[k0:k1] = transpose_sum(rhok_tmp) else: rhoj[k0:k1] = rhoj_tmp rhok[k0:k1] = rhok_tmp @@ -831,7 +831,7 @@ def get_int3c2e_jk(mol, auxmol, dm0_tag, with_k=True, omega=None): for device_id in range(_num_devices): future = executor.submit( _int3c2e_jk_task, intopt, task_list[device_id], - dm0_tag, orbo, device_id=device_id, omega=omega) + dm0_tag.get(), orbo.get(), device_id=device_id, omega=omega) futures.append(future) rhoj_total = [] @@ -854,10 +854,12 @@ def _split_tasks(loads, ngroups): return [range(len(loads))] groups = [[] for _ in range(ngroups)] sums = [0] * ngroups - for i, load in enumerate(loads): + + sorted_indices = np.argsort(loads)[::-1] + for idx in sorted_indices: min_index = sums.index(min(sums)) - groups[min_index].append(i) - sums[min_index] += load + groups[min_index].append(idx) + sums[min_index] += loads[idx] return groups def _int3c2e_ip1_vjk_task(intopt, task_k_list, rhoj, rhok, dm0, orbo, device_id=0, @@ -922,7 +924,7 @@ def _int3c2e_ip1_vjk_task(intopt, task_k_list, rhoj, rhok, dm0, orbo, device_id= vk1[:,:,p0:p1] += contract('xiJo,ia->axJo', vk1_ao, ao2atom) rhok0_slice = vk1_ao = None rhok_tmp = int3c_ip1_occ = None - t0 = log.timer_debug1(f'int3c2e_ip1 on Device {device_id}', *t0) + t0 = log.timer_debug1(f'int3c2e_ip1_vjk on Device {device_id}', *t0) # TODO: absorbe vj1_buf and vk1_buf into vj1 and vk1 return vj1_buf, vk1_buf, vj1, vk1 @@ -1018,7 +1020,7 @@ def _int3c2e_ip2_vjk_task(intopt, task_k_list, rhoj, rhok, dm0, orbo, vk1 += contract('xpir,pa->axir', vk1_tmp, aux2atom[k0:k1]) vk1_tmp = rhok0_oo = rhok0_slice = None rhok_tmp = wk2_P__ = None - t0 = log.timer_debug1(f'int3c2e_ip2 on Device {device_id}', *t0) + t0 = log.timer_debug1(f'int3c2e_ip2_vjk on Device {device_id}', *t0) return vj1, vk1 def get_int3c2e_ip2_vjk(intopt, rhoj, rhok, dm0_tag, auxslices, @@ -1056,7 +1058,7 @@ def get_int3c2e_ip2_vjk(intopt, rhoj, rhok, dm0_tag, auxslices, vk = reduce_to_device(vk_total, inplace=True) return vj, vk -def _int3c2e_ip1_wjk_task(intopt, task_list, dm0, orbo, wk, device_id=0, with_k=True, omega=None): +def _int3c2e_ip1_wjk_task(intopt, task_k_list, dm0, orbo, wk, device_id=0, with_k=True, omega=None): nao = intopt.mol.nao naux = intopt.auxmol.nao aux_ao_loc = intopt.aux_ao_loc @@ -1068,7 +1070,7 @@ def _int3c2e_ip1_wjk_task(intopt, task_list, dm0, orbo, wk, device_id=0, with_k= wj = cupy.zeros([naux,nao,3]) dm0 = cupy.asarray(dm0) orbo = cupy.asarray(orbo) - for cp_k in task_list: + for cp_k in task_k_list: k0, k1 = aux_ao_loc[cp_k], aux_ao_loc[cp_k+1] if with_k: wk_tmp = cupy.zeros([k1-k0,nao,nocc,3]) @@ -1119,9 +1121,12 @@ def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None): return wj, wk def _int3c2e_ip2_wjk(intopt, task_list, dm0, orbo, with_k=True, omega=None, device_id=0): + aux_ao_loc = intopt.aux_ao_loc with cupy.cuda.Device(device_id), _streams[device_id]: + cupy.get_default_memory_pool().free_all_blocks() log = logger.new_logger(intopt.mol, intopt.mol.verbose) t0 = log.init_timer() + ncp_ij = len(intopt.log_qs) dm0 = cupy.asarray(dm0) orbo = cupy.asarray(orbo) naux = intopt.auxmol.nao @@ -1130,26 +1135,28 @@ def _int3c2e_ip2_wjk(intopt, task_list, dm0, orbo, with_k=True, omega=None, devi wk = None if with_k: wk = cupy.zeros([naux,nocc,nocc,3]) - for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, task_list=task_list, - ip_type='ip2', omega=omega): - wj[k0:k1] += contract('xpji,ji->px', int3c_blk, dm0[j0:j1,i0:i1]) - if with_k: - tmp = contract('xpji,jo->piox', int3c_blk, orbo[j0:j1]) - wk[k0:k1] += contract('piox,ir->prox', tmp, orbo[i0:i1]) - tmp = None - int3c_blk = None + for cp_k in task_list: + k0, k1 = aux_ao_loc[cp_k], aux_ao_loc[cp_k+1] + task_list = [(cp_k, cp_ij) for cp_ij in range(ncp_ij)] + + for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, task_list=task_list, + ip_type='ip2', omega=omega): + wj[k0:k1] += contract('xpji,ji->px', int3c_blk, dm0[j0:j1,i0:i1]) + if with_k: + tmp = contract('xpji,jo->piox', int3c_blk, orbo[j0:j1]) + wk[k0:k1] += contract('piox,ir->prox', tmp, orbo[i0:i1]) + tmp = None + int3c_blk = None t0 = log.timer_debug1(f'int3c2e_ip2_wjk on Device {device_id}', *t0) return wj, wk def get_int3c2e_ip2_wjk(intopt, dm0_tag, with_k=True, omega=None): orbo = cupy.asarray(dm0_tag.occ_coeff, order='C') futures = [] - ncp_k = len(intopt.aux_log_qs) - ncp_ij = len(intopt.log_qs) - tasks = np.array(list(itertools.product(range(ncp_k), range(ncp_ij)))) - task_list = [] - for device_id in range(_num_devices): - task_list.append(tasks[device_id::_num_devices]) + + aux_ao_loc = np.array(intopt.aux_ao_loc) + loads = aux_ao_loc[1:] - aux_ao_loc[:-1] + task_list = _split_tasks(loads, _num_devices) cupy.cuda.get_current_stream().synchronize() with ThreadPoolExecutor(max_workers=_num_devices) as executor: diff --git a/gpu4pyscf/lib/cupy_helper.py b/gpu4pyscf/lib/cupy_helper.py index 898a2846..95ae1f24 100644 --- a/gpu4pyscf/lib/cupy_helper.py +++ b/gpu4pyscf/lib/cupy_helper.py @@ -104,13 +104,13 @@ def concatenate(array_list): if _p2p_access: return cupy.concatenate(array_list) else: - array_list_cpu = [a.get() for a in array_list] - n = sum([a.shape[0] for a in array_list_cpu]) - a0_shape = list(array_list_cpu[0].shape) + #array_list_cpu = [a.get() for a in array_list] + n = sum([a.shape[0] for a in array_list]) + a0_shape = list(array_list[0].shape) out_shape = tuple([n] + a0_shape[1:]) out = cupy.empty(out_shape) p0 = p1 = 0 - for a in array_list_cpu: + for a in array_list: p1 = p0 + a.shape[0] #out[p0:p1].set(a) copy_array(a, out[p0:p1]) @@ -138,15 +138,16 @@ def reduce_to_device(array_list, inplace=False): result = array_list[0] else: result = array_list[0].copy() + + # Transfer data chunk by chunk, reduce memory footprint, result = result.reshape(-1) - # Asynchronously add each matrix from its device for device_id, matrix in enumerate(array_list): if device_id == 0: continue assert matrix.device.id == device_id matrix = matrix.reshape(-1) - blksize = 1024*1024*128 # 1GB + blksize = 1024*1024*1024 // matrix.itemsize # 1GB for p0, p1 in lib.prange(0,len(matrix), blksize): result[p0:p1] += copy_array(matrix[p0:p1])#cupy.asarray(matrix[p0:p1]) #result[p0:p1] += cupy.asarray(matrix[p0:p1]) diff --git a/gpu4pyscf/tests/test_benchmark_rks.py b/gpu4pyscf/tests/test_benchmark_rks.py index e709eafc..3d738890 100644 --- a/gpu4pyscf/tests/test_benchmark_rks.py +++ b/gpu4pyscf/tests/test_benchmark_rks.py @@ -24,11 +24,11 @@ # How to run # 1. run test only -# pytest test_rks.py --benchmark-disable -s -v -m "not slow" --durations=20 +# pytest test_benchmark_rks.py --benchmark-disable -s -v -m "not slow" --durations=20 # 2. benchmark less expensive tasks -# pytest test_rks.py -v -m "not slow" +# pytest test_benchmark_rks.py -v -m "not slow" # 3. benchmark all the tests -# pytest test_rks.py -v +# pytest test_benchmark_rks.py -v current_folder = os.path.dirname(os.path.abspath(__file__)) small_mol = os.path.join(current_folder, '020_Vitamin_C.xyz') From ca79edb0c5d76c928180d5110838a368faa2f1f1 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Thu, 2 Jan 2025 06:46:51 +0000 Subject: [PATCH 32/49] bugfix for single gpu --- gpu4pyscf/df/df.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py index a7f04370..ab1adeba 100644 --- a/gpu4pyscf/df/df.py +++ b/gpu4pyscf/df/df.py @@ -361,6 +361,6 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, aux_blksize, tmp = copy_array(tmp) _cderi[dev_id][:,ij0:ij1] = tmp else: - _cderi[slice_id][:,ij0:ij1] = cderi_block[p0:p1] + _cderi[0][:,ij0:ij1] = cderi_block t1 = log.timer_debug1(f'transfer data for {cp_ij_id} / {nq} on Device {device_id}', *t1) return From 8526424901176d65d3c365b077d66a14f5598ef4 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Fri, 3 Jan 2025 02:05:51 +0800 Subject: [PATCH 33/49] update benchmark script --- .../Linux-CPython-3.9-64bit/0001_v100.json | 838 ++++++++++++++++++ gpu4pyscf/tests/test_benchmark_rks.py | 36 +- 2 files changed, 867 insertions(+), 7 deletions(-) create mode 100644 gpu4pyscf/tests/.benchmarks/Linux-CPython-3.9-64bit/0001_v100.json diff --git a/gpu4pyscf/tests/.benchmarks/Linux-CPython-3.9-64bit/0001_v100.json b/gpu4pyscf/tests/.benchmarks/Linux-CPython-3.9-64bit/0001_v100.json new file mode 100644 index 00000000..4101f07e --- /dev/null +++ b/gpu4pyscf/tests/.benchmarks/Linux-CPython-3.9-64bit/0001_v100.json @@ -0,0 +1,838 @@ +{ + "machine_info": { + "node": "mlxlabzskrh20v669fd914-20240723162348-uim1c3-1eg5qx-worker", + "processor": "", + "machine": "x86_64", + "python_compiler": "GCC 10.2.1 20210110", + "python_implementation": "CPython", + "python_implementation_version": "3.9.2", + "python_version": "3.9.2", + "python_build": [ + "default", + "Feb 28 2021 17:03:44" + ], + "release": "5.4.143.bsk.7-amd64", + "system": "Linux", + "cpu": { + "python_version": "3.9.2.final.0 (64 bit)", + "cpuinfo_version": [ + 9, + 0, + 0 + ], + "cpuinfo_version_string": "9.0.0", + "arch": "X86_64", + "bits": 64, + "count": 96, + "arch_string_raw": "x86_64", + "vendor_id_raw": "GenuineIntel", + "brand_raw": "Intel(R) Xeon(R) Platinum 8260 CPU @ 2.40GHz", + "hz_advertised_friendly": "2.4000 GHz", + "hz_actual_friendly": "3.1000 GHz", + "hz_advertised": [ + 2400000000, + 0 + ], + "hz_actual": [ + 3100000000, + 0 + ], + "stepping": 7, + "model": 85, + "family": 6, + "flags": [ + "3dnowprefetch", + "abm", + "acpi", + "adx", + "aes", + "aperfmperf", + "apic", + "arat", + "arch_capabilities", + "arch_perfmon", + "art", + "avx", + "avx2", + "avx512_vnni", + "avx512bw", + "avx512cd", + "avx512dq", + "avx512f", + "avx512vl", + "avx512vnni", + "bmi1", + "bmi2", + "bts", + "cat_l3", + "cdp_l3", + "clflush", + "clflushopt", + "clwb", + "cmov", + "constant_tsc", + "cpuid", + "cpuid_fault", + "cqm", + "cqm_llc", + "cqm_mbm_local", + "cqm_mbm_total", + "cqm_occup_llc", + "cx16", + "cx8", + "dca", + "de", + "ds_cpl", + "dtes64", + "dtherm", + "dts", + "epb", + "ept", + "ept_ad", + "erms", + "est", + "f16c", + "flexpriority", + "flush_l1d", + "fma", + "fpu", + "fsgsbase", + "fxsr", + "ht", + "hwp", + "hwp_act_window", + "hwp_epp", + "hwp_pkg_req", + "ibpb", + "ibrs", + "ibrs_enhanced", + "ida", + "intel_ppin", + "intel_pt", + "invpcid", + "invpcid_single", + "lahf_lm", + "lm", + "mba", + "mca", + "mce", + "md_clear", + "mmx", + "movbe", + "mpx", + "msr", + "mtrr", + "nonstop_tsc", + "nopl", + "nx", + "ospke", + "osxsave", + "pae", + "pat", + "pbe", + "pcid", + "pclmulqdq", + "pdcm", + "pdpe1gb", + "pebs", + "pge", + "pku", + "pln", + "pni", + "popcnt", + "pqe", + "pqm", + "pse", + "pse36", + "pts", + "rdrand", + "rdrnd", + "rdseed", + "rdt_a", + "rdtscp", + "rep_good", + "sdbg", + "sep", + "smap", + "smep", + "smx", + "ss", + "ssbd", + "sse", + "sse2", + "sse4_1", + "sse4_2", + "ssse3", + "stibp", + "syscall", + "tm", + "tm2", + "tpr_shadow", + "tsc", + "tsc_adjust", + "tsc_deadline_timer", + "tscdeadline", + "vme", + "vmx", + "vnmi", + "vpid", + "x2apic", + "xgetbv1", + "xsave", + "xsavec", + "xsaveopt", + "xsaves", + "xtopology", + "xtpr" + ], + "l3_cache_size": 37486592, + "l2_cache_size": 50331648, + "l1_data_cache_size": "1.5 MiB", + "l1_instruction_cache_size": "1.5 MiB", + "l2_cache_line_size": 256, + "l2_cache_associativity": 6 + } + }, + "commit_info": { + "id": "ca79edb0c5d76c928180d5110838a368faa2f1f1", + "time": "2025-01-02T06:46:51+00:00", + "author_time": "2025-01-02T06:46:51+00:00", + "dirty": true, + "project": "gpu4pyscf", + "branch": "benchmark_ci" + }, + "benchmarks": [ + { + "group": null, + "name": "test_df_rb3lyp", + "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 5, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": false + }, + "stats": { + "min": 3.1617951644584537, + "max": 3.7628091080114245, + "mean": 3.4210989899933337, + "stddev": 0.22879307906931964, + "rounds": 5, + "median": 3.411895725876093, + "iqr": 0.3097648276016116, + "q1": 3.2490662271156907, + "q3": 3.5588310547173023, + "iqr_outliers": 0, + "stddev_outliers": 2, + "outliers": "2;0", + "ld15iqr": 3.1617951644584537, + "hd15iqr": 3.7628091080114245, + "ops": 0.2923037313228836, + "total": 17.10549494996667, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_df_rb3lyp_grad", + "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_grad", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 5, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": false + }, + "stats": { + "min": 4.783565265126526, + "max": 5.297894531860948, + "mean": 5.111189672350884, + "stddev": 0.21592611330232545, + "rounds": 5, + "median": 5.1955106453970075, + "iqr": 0.32867400418035686, + "q1": 4.950462192296982, + "q3": 5.279136196477339, + "iqr_outliers": 0, + "stddev_outliers": 1, + "outliers": "1;0", + "ld15iqr": 4.783565265126526, + "hd15iqr": 5.297894531860948, + "ops": 0.1956491666528297, + "total": 25.555948361754417, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_rb3lyp", + "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_rb3lyp", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 5, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": false + }, + "stats": { + "min": 40.746477760374546, + "max": 41.372105130925775, + "mean": 41.03762242179364, + "stddev": 0.23753417144600847, + "rounds": 5, + "median": 41.043180647306144, + "iqr": 0.33331693802028894, + "q1": 40.858045106288046, + "q3": 41.191362044308335, + "iqr_outliers": 0, + "stddev_outliers": 2, + "outliers": "2;0", + "ld15iqr": 40.746477760374546, + "hd15iqr": 41.372105130925775, + "ops": 0.02436788344416696, + "total": 205.1881121089682, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_rb3lyp_grad", + "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_rb3lyp_grad", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 5, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": false + }, + "stats": { + "min": 49.95022037625313, + "max": 50.498252051882446, + "mean": 50.14596408084035, + "stddev": 0.20855077359110574, + "rounds": 5, + "median": 50.08650258369744, + "iqr": 0.19796669483184814, + "q1": 50.0301427282393, + "q3": 50.228109423071146, + "iqr_outliers": 0, + "stddev_outliers": 1, + "outliers": "1;0", + "ld15iqr": 49.95022037625313, + "hd15iqr": 50.498252051882446, + "ops": 0.01994178431564102, + "total": 250.72982040420175, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_rb3lyp_hessian", + "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_rb3lyp_hessian", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 5, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": false + }, + "stats": { + "min": 613.87584313564, + "max": 620.2618521982804, + "mean": 616.6373227955773, + "stddev": 2.3903446370287127, + "rounds": 5, + "median": 616.8295351918787, + "iqr": 2.8947918817866594, + "q1": 614.9020847703796, + "q3": 617.7968766521662, + "iqr_outliers": 0, + "stddev_outliers": 2, + "outliers": "2;0", + "ld15iqr": 613.87584313564, + "hd15iqr": 620.2618521982804, + "ops": 0.0016216987896003692, + "total": 3083.1866139778867, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_df_rb3lyp_medium", + "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_medium", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 5, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": false + }, + "stats": { + "min": 18.224224156700075, + "max": 20.157692234031856, + "mean": 19.024707913957535, + "stddev": 0.8064738008793728, + "rounds": 5, + "median": 19.21132487989962, + "iqr": 1.2390491359401494, + "q1": 18.252076843054965, + "q3": 19.491125978995115, + "iqr_outliers": 0, + "stddev_outliers": 1, + "outliers": "1;0", + "ld15iqr": 18.224224156700075, + "hd15iqr": 20.157692234031856, + "ops": 0.052563224861199936, + "total": 95.12353956978768, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_df_rb3lyp_grad_medium", + "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_grad_medium", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 5, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": false + }, + "stats": { + "min": 28.5077072577551, + "max": 32.310115557163954, + "mean": 29.75330361928791, + "stddev": 1.5559254109717362, + "rounds": 5, + "median": 28.967016119509935, + "iqr": 1.9220157761592418, + "q1": 28.759349649539217, + "q3": 30.68136542569846, + "iqr_outliers": 0, + "stddev_outliers": 1, + "outliers": "1;0", + "ld15iqr": 28.5077072577551, + "hd15iqr": 32.310115557163954, + "ops": 0.03360971315305434, + "total": 148.76651809643954, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_df_rb3lyp_hessian_medium", + "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_hessian_medium", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 5, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": false + }, + "stats": { + "min": 603.508519613184, + "max": 605.9056743234396, + "mean": 605.091381527856, + "stddev": 0.9420383756463966, + "rounds": 5, + "median": 605.3425533128902, + "iqr": 1.0546113743912429, + "q1": 604.6620287010446, + "q3": 605.7166400754359, + "iqr_outliers": 0, + "stddev_outliers": 1, + "outliers": "1;0", + "ld15iqr": 603.508519613184, + "hd15iqr": 605.9056743234396, + "ops": 0.0016526429404348803, + "total": 3025.45690763928, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_rb3lyp_medium", + "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_rb3lyp_medium", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 5, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": false + }, + "stats": { + "min": 462.8745871502906, + "max": 464.5682009514421, + "mean": 463.4215483263135, + "stddev": 0.6988488955671605, + "rounds": 5, + "median": 463.2153818728402, + "iqr": 0.9204953673761338, + "q1": 462.88869020040147, + "q3": 463.8091855677776, + "iqr_outliers": 0, + "stddev_outliers": 1, + "outliers": "1;0", + "ld15iqr": 462.8745871502906, + "hd15iqr": 464.5682009514421, + "ops": 0.002157862541376389, + "total": 2317.1077416315675, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_rb3lyp_grad_medium", + "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_rb3lyp_grad_medium", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 5, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": false + }, + "stats": { + "min": 552.8754901243374, + "max": 554.911680762656, + "mean": 554.3076192354783, + "stddev": 0.8161092352116484, + "rounds": 5, + "median": 554.5314849242568, + "iqr": 0.6415546571370214, + "q1": 554.1099091696087, + "q3": 554.7514638267457, + "iqr_outliers": 1, + "stddev_outliers": 1, + "outliers": "1;1", + "ld15iqr": 554.5213821846992, + "hd15iqr": 554.911680762656, + "ops": 0.0018040524165611094, + "total": 2771.5380961773917, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_df_rb3lyp_631gs", + "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_631gs", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 5, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": false + }, + "stats": { + "min": 1.5329652335494757, + "max": 1.6051436373963952, + "mean": 1.5614585245028139, + "stddev": 0.027036901841682143, + "rounds": 5, + "median": 1.5542930895462632, + "iqr": 0.02967151813209057, + "q1": 1.5455118480604142, + "q3": 1.5751833661925048, + "iqr_outliers": 0, + "stddev_outliers": 2, + "outliers": "2;0", + "ld15iqr": 1.5329652335494757, + "hd15iqr": 1.6051436373963952, + "ops": 0.6404268728933491, + "total": 7.807292622514069, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_df_rb3lyp_631gs_grad", + "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_631gs_grad", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 5, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": false + }, + "stats": { + "min": 2.280210480093956, + "max": 2.3804853092879057, + "mean": 2.317926473915577, + "stddev": 0.04595680285671692, + "rounds": 5, + "median": 2.290554977953434, + "iqr": 0.07659115269780159, + "q1": 2.283684498164803, + "q3": 2.3602756508626044, + "iqr_outliers": 0, + "stddev_outliers": 1, + "outliers": "1;0", + "ld15iqr": 2.280210480093956, + "hd15iqr": 2.3804853092879057, + "ops": 0.43142006929613325, + "total": 11.589632369577885, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_df_rb3lyp_631gs_hessian", + "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_631gs_hessian", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 5, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": false + }, + "stats": { + "min": 13.266008610837162, + "max": 13.778700362890959, + "mean": 13.425486170500516, + "stddev": 0.20500983387954833, + "rounds": 5, + "median": 13.3493886096403, + "iqr": 0.20474978047423065, + "q1": 13.303213707404211, + "q3": 13.507963487878442, + "iqr_outliers": 0, + "stddev_outliers": 1, + "outliers": "1;0", + "ld15iqr": 13.266008610837162, + "hd15iqr": 13.778700362890959, + "ops": 0.07448519832356425, + "total": 67.12743085250258, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_rb3lyp_631gs_large", + "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_rb3lyp_631gs_large", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 5, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": false + }, + "stats": { + "min": 54.99403030797839, + "max": 55.44687832519412, + "mean": 55.280799315497276, + "stddev": 0.17365676139080558, + "rounds": 5, + "median": 55.29263620171696, + "iqr": 0.18626192840747535, + "q1": 55.21340201841667, + "q3": 55.39966394682415, + "iqr_outliers": 0, + "stddev_outliers": 1, + "outliers": "1;0", + "ld15iqr": 54.99403030797839, + "hd15iqr": 55.44687832519412, + "ops": 0.018089463473435388, + "total": 276.4039965774864, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_rb3lyp_631gs_grad_large", + "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_rb3lyp_631gs_grad_large", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 5, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": false + }, + "stats": { + "min": 68.62203936092556, + "max": 69.5449710758403, + "mean": 69.09264576677234, + "stddev": 0.4051404972610001, + "rounds": 5, + "median": 68.97573095746338, + "iqr": 0.7140946059953421, + "q1": 68.78401179146022, + "q3": 69.49810639745556, + "iqr_outliers": 0, + "stddev_outliers": 2, + "outliers": "2;0", + "ld15iqr": 68.62203936092556, + "hd15iqr": 69.5449710758403, + "ops": 0.014473320407725865, + "total": 345.46322883386165, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_df_rb3lyp_631gs_solvent", + "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_631gs_solvent", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 5, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": false + }, + "stats": { + "min": 2.3869710564613342, + "max": 2.530611244030297, + "mean": 2.4419715087860823, + "stddev": 0.05630153012020871, + "rounds": 5, + "median": 2.43410103302449, + "iqr": 0.07564499671570957, + "q1": 2.398690618108958, + "q3": 2.4743356148246676, + "iqr_outliers": 0, + "stddev_outliers": 1, + "outliers": "1;0", + "ld15iqr": 2.3869710564613342, + "hd15iqr": 2.530611244030297, + "ops": 0.40950518726449253, + "total": 12.209857543930411, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_df_rb3lyp_631gs_solvent_grad", + "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_631gs_solvent_grad", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 5, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": false + }, + "stats": { + "min": 2.973248357884586, + "max": 3.1569794192910194, + "mean": 3.076459738984704, + "stddev": 0.06885617804924284, + "rounds": 5, + "median": 3.072229014709592, + "iqr": 0.08629889693111181, + "q1": 3.040569737320766, + "q3": 3.1268686342518777, + "iqr_outliers": 0, + "stddev_outliers": 2, + "outliers": "2;0", + "ld15iqr": 2.973248357884586, + "hd15iqr": 3.1569794192910194, + "ops": 0.3250489474404826, + "total": 15.38229869492352, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_df_rb3lyp_631gs_solvent_hessian", + "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_631gs_solvent_hessian", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 5, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": false + }, + "stats": { + "min": 107.97044172231108, + "max": 109.67866003140807, + "mean": 108.94072148483247, + "stddev": 0.7142319621034929, + "rounds": 5, + "median": 108.94882048200816, + "iqr": 1.1987205250188708, + "q1": 108.39640940236859, + "q3": 109.59512992738746, + "iqr_outliers": 0, + "stddev_outliers": 2, + "outliers": "2;0", + "ld15iqr": 107.97044172231108, + "hd15iqr": 109.67866003140807, + "ops": 0.009179303995514913, + "total": 544.7036074241623, + "iterations": 1 + } + } + ], + "datetime": "2025-01-02T13:26:31.464476+00:00", + "version": "5.1.0" +} \ No newline at end of file diff --git a/gpu4pyscf/tests/test_benchmark_rks.py b/gpu4pyscf/tests/test_benchmark_rks.py index 3d738890..7847993d 100644 --- a/gpu4pyscf/tests/test_benchmark_rks.py +++ b/gpu4pyscf/tests/test_benchmark_rks.py @@ -25,11 +25,19 @@ # How to run # 1. run test only # pytest test_benchmark_rks.py --benchmark-disable -s -v -m "not slow" --durations=20 + # 2. benchmark less expensive tasks # pytest test_benchmark_rks.py -v -m "not slow" + # 3. benchmark all the tests # pytest test_benchmark_rks.py -v +# 4. save benchmark results +# pytest test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-save=v100 + +# 5. compare benchmark results, fail if performance regresses by more than 10% +# pytest test_benchmark_rks.py -s -v -m "not slow" --benchmark-compare=v100 --benchmark-compare-fail=10% + current_folder = os.path.dirname(os.path.abspath(__file__)) small_mol = os.path.join(current_folder, '020_Vitamin_C.xyz') medium_mol = os.path.join(current_folder, '057_Tamoxifen.xyz') @@ -81,8 +89,9 @@ def run_rb3lyp_hessian(atom, basis, with_df, with_solvent, disp=None): mf.kernel() h = mf.Hessian().kernel() return h - +####### # DF +####### @pytest.mark.benchmark def test_df_rb3lyp(benchmark): e = benchmark(run_rb3lyp, small_mol, 'def2-tzvpp', True, False) @@ -100,7 +109,9 @@ def test_df_rb3lyp_hessian(benchmark): print('testing df rb3lyp hessian') assert np.isclose(np.linalg.norm(h), 3.7668761221997764, atol=1e-4) +################ # Direct SCF +################ @pytest.mark.benchmark def test_rb3lyp(benchmark): e = benchmark(run_rb3lyp, small_mol, 'def2-tzvpp', False, False) @@ -117,7 +128,9 @@ def test_rb3lyp_hessian(benchmark): print('testing rb3lyp hessian') assert np.isclose(np.linalg.norm(h), 3.7588443634477833, atol=1e-4) -# medium molecule +#################### +# Medium molecule +#################### @pytest.mark.benchmark def test_df_rb3lyp_medium(benchmark): e = benchmark(run_rb3lyp, medium_mol, 'def2-tzvpp', True, False) @@ -144,7 +157,6 @@ def test_rb3lyp_grad_medium(benchmark): g = benchmark(run_rb3lyp_grad, medium_mol, 'def2-tzvpp', False, False) print('testing rb3lyp grad medium') assert np.isclose(np.linalg.norm(g), 0.2601443836937988, atol=1e-5) - @pytest.mark.slow @pytest.mark.benchmark def test_rb3lyp_hessian_medium(benchmark): @@ -152,12 +164,16 @@ def test_rb3lyp_hessian_medium(benchmark): print('testing rb3lyp hessian medium') assert np.isclose(np.linalg.norm(h), 6.312714778020796, atol=1e-4) +#################### # large molecule +#################### +@pytest.mark.high_memory @pytest.mark.benchmark def test_df_rb3lyp_large(benchmark): e = benchmark(run_rb3lyp, large_mol, 'def2-tzvpp', True, False) print('testing df rb3lyp large') assert np.isclose(np.linalg.norm(e), 2564.198712152175, atol=1e-7) +@pytest.mark.high_memory @pytest.mark.benchmark def test_df_rb3lyp_grad_large(benchmark): g = benchmark(run_rb3lyp_grad, large_mol, 'def2-tzvpp', True, False) @@ -192,13 +208,15 @@ def test_rb3lyp_hessian_large(benchmark): print('testing rb3lyp hessian large') print(np.linalg.norm(h)) ''' -# small basis set + +##################### +# Small basis set +##################### @pytest.mark.benchmark def test_df_rb3lyp_631gs(benchmark): e = benchmark(run_rb3lyp, small_mol, '6-31gs', True, False) print('testing df rb3lyp 631gs') assert np.isclose(np.linalg.norm(e), 684.6646008642876, atol=1e-7) - @pytest.mark.benchmark def test_df_rb3lyp_631gs_grad(benchmark): g = benchmark(run_rb3lyp_grad, small_mol, '6-31gs', True, False) @@ -210,7 +228,9 @@ def test_df_rb3lyp_631gs_hessian(benchmark): print('testing df rb3lyp 631gs hessian') assert np.isclose(np.linalg.norm(h), 3.908874851569459, atol=1e-4) -# small basis set for large molecule +######################################### +# Small basis set for large molecule +######################################### @pytest.mark.benchmark def test_rb3lyp_631gs_large(benchmark): e = benchmark(run_rb3lyp, large_mol, '6-31gs', False, False) @@ -228,7 +248,9 @@ def test_rb3lyp_631gs_hessian_large(benchmark): print('testing df rb3lyp 631gs hessian large') assert np.isclose(np.linalg.norm(h), 7.920764634100053, atol=1e-4) -#solvent model +################### +# Solvent model +################### @pytest.mark.benchmark def test_df_rb3lyp_631gs_solvent(benchmark): e = benchmark(run_rb3lyp, small_mol, '6-31gs', True, True) From a30c19611e1fb417e53a817c612bbebd6d021406 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Fri, 3 Jan 2025 04:24:12 +0800 Subject: [PATCH 34/49] np.isclose --- gpu4pyscf/tests/test_benchmark_rks.py | 59 +++++++++++++++------------ gpu4pyscf/tests/test_benchmark_uks.py | 15 +++---- 2 files changed, 40 insertions(+), 34 deletions(-) diff --git a/gpu4pyscf/tests/test_benchmark_rks.py b/gpu4pyscf/tests/test_benchmark_rks.py index 7847993d..b8c593b7 100644 --- a/gpu4pyscf/tests/test_benchmark_rks.py +++ b/gpu4pyscf/tests/test_benchmark_rks.py @@ -87,8 +87,12 @@ def run_rb3lyp_hessian(atom, basis, with_df, with_solvent, disp=None): mf.conv_tol = 1e-10 mf.conv_tol_cpscf = 1e-6 mf.kernel() - h = mf.Hessian().kernel() + h = mf.Hessian() + if with_df: + h.auxbasis_response = 2 + h.kernel() return h + ####### # DF ####### @@ -96,18 +100,18 @@ def run_rb3lyp_hessian(atom, basis, with_df, with_solvent, disp=None): def test_df_rb3lyp(benchmark): e = benchmark(run_rb3lyp, small_mol, 'def2-tzvpp', True, False) print('testing df rb3lyp') - assert np.isclose(np.linalg.norm(e), 684.9998712035579, atol=1e-7) + assert np.isclose(np.linalg.norm(e), 684.9998712035579, atol=1e-7, rtol=1e-16) @pytest.mark.benchmark def test_df_rb3lyp_grad(benchmark): g = benchmark(run_rb3lyp_grad, small_mol, 'def2-tzvpp', True, False) print('testing df rb3lyp grad') - assert np.isclose(np.linalg.norm(g), 0.17435941081837686, atol=1e-5) + assert np.isclose(np.linalg.norm(g), 0.17435941081837686, atol=1e-5, rtol=1e-16) @pytest.mark.slow @pytest.mark.benchmark def test_df_rb3lyp_hessian(benchmark): h = benchmark(run_rb3lyp_hessian, small_mol, 'def2-tzvpp', True, False) print('testing df rb3lyp hessian') - assert np.isclose(np.linalg.norm(h), 3.7668761221997764, atol=1e-4) + assert np.isclose(np.linalg.norm(h), 3.7668761221997764, atol=1e-4, rtol=1e-16) ################ # Direct SCF @@ -116,17 +120,17 @@ def test_df_rb3lyp_hessian(benchmark): def test_rb3lyp(benchmark): e = benchmark(run_rb3lyp, small_mol, 'def2-tzvpp', False, False) print('testing rb3lyp') - assert np.isclose(np.linalg.norm(e), 684.999735850967, atol=1e-7) + assert np.isclose(np.linalg.norm(e), 684.999735850967, atol=1e-7, rtol=1e-16) @pytest.mark.benchmark def test_rb3lyp_grad(benchmark): g = benchmark(run_rb3lyp_grad, small_mol, 'def2-tzvpp', False, False) print('testing rb3lyp grad') - assert np.isclose(np.linalg.norm(g), 0.1744127474130983, atol=1e-5) + assert np.isclose(np.linalg.norm(g), 0.1744127474130983, atol=1e-5, rtol=1e-16) @pytest.mark.benchmark def test_rb3lyp_hessian(benchmark): h = benchmark(run_rb3lyp_hessian, small_mol, 'def2-tzvpp', False, False) print('testing rb3lyp hessian') - assert np.isclose(np.linalg.norm(h), 3.7588443634477833, atol=1e-4) + assert np.isclose(np.linalg.norm(h), 3.7588443634477833, atol=1e-4, rtol=1e-16) #################### # Medium molecule @@ -135,34 +139,35 @@ def test_rb3lyp_hessian(benchmark): def test_df_rb3lyp_medium(benchmark): e = benchmark(run_rb3lyp, medium_mol, 'def2-tzvpp', True, False) print('testing df rb3lyp medium') - assert np.isclose(np.linalg.norm(e), 1138.371390377773, atol=1e-7) + assert np.isclose(np.linalg.norm(e), 1138.371390377773, atol=1e-7, rtol=1e-16) @pytest.mark.benchmark def test_df_rb3lyp_grad_medium(benchmark): g = benchmark(run_rb3lyp_grad, medium_mol, 'def2-tzvpp', True, False) print('testing df rb3lyp grad medium') - assert np.isclose(np.linalg.norm(g), 0.26010545073602614, atol=1e-4) + assert np.isclose(np.linalg.norm(g), 0.26010545073602614, atol=1e-5, rtol=1e-16) @pytest.mark.benchmark def test_df_rb3lyp_hessian_medium(benchmark): h = benchmark(run_rb3lyp_hessian, medium_mol, 'def2-tzvpp', True, False) print('testing df rb3lyp hessian medium') - assert np.isclose(np.linalg.norm(h), 6.32514169232998, atol=1e-4) + print(np.linalg.norm(h) - 6.32514169232998) + assert np.isclose(np.linalg.norm(h), 6.32514169232998, atol=1e-4, rtol=1e-16) @pytest.mark.benchmark def test_rb3lyp_medium(benchmark): e = benchmark(run_rb3lyp, medium_mol, 'def2-tzvpp', False, False) print('testing rb3lyp medium') - assert np.isclose(np.linalg.norm(e), 1138.3710752128077, atol=1e-7) + assert np.isclose(np.linalg.norm(e), 1138.3710752128077, atol=1e-7, rtol=1e-16) @pytest.mark.benchmark def test_rb3lyp_grad_medium(benchmark): g = benchmark(run_rb3lyp_grad, medium_mol, 'def2-tzvpp', False, False) print('testing rb3lyp grad medium') - assert np.isclose(np.linalg.norm(g), 0.2601443836937988, atol=1e-5) + assert np.isclose(np.linalg.norm(g), 0.2601443836937988, atol=1e-5, rtol=1e-16) @pytest.mark.slow @pytest.mark.benchmark def test_rb3lyp_hessian_medium(benchmark): h = benchmark(run_rb3lyp_hessian, medium_mol, 'def2-tzvpp', False, False) print('testing rb3lyp hessian medium') - assert np.isclose(np.linalg.norm(h), 6.312714778020796, atol=1e-4) + assert np.isclose(np.linalg.norm(h), 6.312714778020796, atol=1e-4, rtol=1e-16) #################### # large molecule @@ -172,32 +177,32 @@ def test_rb3lyp_hessian_medium(benchmark): def test_df_rb3lyp_large(benchmark): e = benchmark(run_rb3lyp, large_mol, 'def2-tzvpp', True, False) print('testing df rb3lyp large') - assert np.isclose(np.linalg.norm(e), 2564.198712152175, atol=1e-7) + assert np.isclose(np.linalg.norm(e), 2564.198712152175, atol=1e-7, rtol=1e-16) @pytest.mark.high_memory @pytest.mark.benchmark def test_df_rb3lyp_grad_large(benchmark): g = benchmark(run_rb3lyp_grad, large_mol, 'def2-tzvpp', True, False) print('testing df rb3lyp grad large') - assert np.isclose(np.linalg.norm(g), 0.3784358687859323, atol=1e-5) + assert np.isclose(np.linalg.norm(g), 0.3784358687859323, atol=1e-5, rtol=1e-16) @pytest.mark.high_memory @pytest.mark.slow @pytest.mark.benchmark def test_df_rb3lyp_hessian_large(benchmark): h = benchmark(run_rb3lyp_hessian, large_mol, 'def2-tzvpp', True, False) print('testing df rb3lyp hessian large') - assert np.isclose(np.linalg.norm(h), 7.583208736873523, atol=1e-4) + assert np.isclose(np.linalg.norm(h), 7.583208736873523, atol=1e-4, rtol=1e-16) @pytest.mark.slow @pytest.mark.benchmark def test_rb3lyp_large(benchmark): e = benchmark(run_rb3lyp, large_mol, 'def2-tzvpp', False, False) print('testing rb3lyp large') - assert np.isclose(np.linalg.norm(e), 2564.198099576358, atol=1e-7) + assert np.isclose(np.linalg.norm(e), 2564.198099576358, atol=1e-7, rtol=1e-16) @pytest.mark.slow @pytest.mark.benchmark def test_rb3lyp_grad_large(benchmark): g = benchmark(run_rb3lyp_grad, large_mol, 'def2-tzvpp', False, False) print('testing rb3lyp grad large') - assert np.isclose(np.linalg.norm(g), 0.3784664384209763, atol=1e-5) + assert np.isclose(np.linalg.norm(g), 0.3784664384209763, atol=1e-5, rtol=1e-16) # Hessian for large molecule with large basis set is too slow ''' @@ -216,17 +221,17 @@ def test_rb3lyp_hessian_large(benchmark): def test_df_rb3lyp_631gs(benchmark): e = benchmark(run_rb3lyp, small_mol, '6-31gs', True, False) print('testing df rb3lyp 631gs') - assert np.isclose(np.linalg.norm(e), 684.6646008642876, atol=1e-7) + assert np.isclose(np.linalg.norm(e), 684.6646008642876, atol=1e-7, rtol=1e-16) @pytest.mark.benchmark def test_df_rb3lyp_631gs_grad(benchmark): g = benchmark(run_rb3lyp_grad, small_mol, '6-31gs', True, False) print('testing df rb3lyp 631gs grad') - assert np.isclose(np.linalg.norm(g), 0.17530687343398219, atol=1e-5) + assert np.isclose(np.linalg.norm(g), 0.17530687343398219, atol=1e-5, rtol=1e-16) @pytest.mark.benchmark def test_df_rb3lyp_631gs_hessian(benchmark): h = benchmark(run_rb3lyp_hessian, small_mol, '6-31gs', True, False) print('testing df rb3lyp 631gs hessian') - assert np.isclose(np.linalg.norm(h), 3.908874851569459, atol=1e-4) + assert np.isclose(np.linalg.norm(h), 3.908874851569459, atol=1e-4, rtol=1e-16) ######################################### # Small basis set for large molecule @@ -235,18 +240,18 @@ def test_df_rb3lyp_631gs_hessian(benchmark): def test_rb3lyp_631gs_large(benchmark): e = benchmark(run_rb3lyp, large_mol, '6-31gs', False, False) print('testing rb3lyp 631gs large') - assert np.isclose(np.linalg.norm(e), 2563.1171191823423, atol=1e-7) + assert np.isclose(np.linalg.norm(e), 2563.1171191823423, atol=1e-7, rtol=1e-16) @pytest.mark.benchmark def test_rb3lyp_631gs_grad_large(benchmark): g = benchmark(run_rb3lyp_grad, large_mol, '6-31gs', False, False) print('testing df rb3lyp 631gs grad large') - assert np.isclose(np.linalg.norm(g), 0.37778228700247984, atol=1e-5) + assert np.isclose(np.linalg.norm(g), 0.37778228700247984, atol=1e-5, rtol=1e-16) @pytest.mark.slow @pytest.mark.benchmark def test_rb3lyp_631gs_hessian_large(benchmark): h = benchmark(run_rb3lyp_hessian, large_mol, '6-31gs', False, False) print('testing df rb3lyp 631gs hessian large') - assert np.isclose(np.linalg.norm(h), 7.920764634100053, atol=1e-4) + assert np.isclose(np.linalg.norm(h), 7.920764634100053, atol=1e-4, rtol=1e-16) ################### # Solvent model @@ -255,17 +260,17 @@ def test_rb3lyp_631gs_hessian_large(benchmark): def test_df_rb3lyp_631gs_solvent(benchmark): e = benchmark(run_rb3lyp, small_mol, '6-31gs', True, True) print('testing df rb3lyp 631gs solvent') - assert np.isclose(np.linalg.norm(e), 684.6985561053816, atol=1e-7) + assert np.isclose(np.linalg.norm(e), 684.6985561053816, atol=1e-7, rtol=1e-16) @pytest.mark.benchmark def test_df_rb3lyp_631gs_solvent_grad(benchmark): g = benchmark(run_rb3lyp_grad, small_mol, '6-31gs', True, True) print('testing df rb3lyp 631gs solvent grad') - assert np.isclose(np.linalg.norm(g), 0.16956999476137297, atol=1e-5) + assert np.isclose(np.linalg.norm(g), 0.16956999476137297, atol=1e-5, rtol=1e-16) @pytest.mark.benchmark def test_df_rb3lyp_631gs_solvent_hessian(benchmark): h = benchmark(run_rb3lyp_hessian, small_mol, '6-31gs', True, True) print('testing df rb3lyp 631gs solvent hessian') - assert np.isclose(np.linalg.norm(h), 3.9008165041707294, atol=1e-4) + assert np.isclose(np.linalg.norm(h), 3.9008165041707294, atol=1e-4, rtol=1e-16) # No need to test d3bj generally ''' diff --git a/gpu4pyscf/tests/test_benchmark_uks.py b/gpu4pyscf/tests/test_benchmark_uks.py index 39acd9ba..33d86c99 100644 --- a/gpu4pyscf/tests/test_benchmark_uks.py +++ b/gpu4pyscf/tests/test_benchmark_uks.py @@ -64,35 +64,36 @@ def run_ub3lyp_hessian(atom, basis, with_df, with_solvent): h = mf.Hessian().kernel() return h - +########## # UKS +########## @pytest.mark.benchmark def test_df_ub3lyp(benchmark): e = benchmark(run_ub3lyp, small_mol, 'def2-tzvpp', True, False) print('testing df ub3lyp') - assert np.isclose(np.linalg.norm(e), 684.9998712035856, atol=1e-7) + assert np.isclose(np.linalg.norm(e), 684.9998712035856, atol=1e-7, rtol=1e-16) @pytest.mark.benchmark def test_df_ub3lyp_grad(benchmark): g = benchmark(run_ub3lyp_grad, small_mol, 'def2-tzvpp', True, False) print('testing df ub3lyp grad') - assert np.isclose(np.linalg.norm(g), 0.17435842214665462, atol=1e-5) + assert np.isclose(np.linalg.norm(g), 0.17435842214665462, atol=1e-5, rtol=1e-16) @pytest.mark.benchmark def test_df_ub3lyp_hessian(benchmark): h = benchmark(run_ub3lyp_hessian, small_mol, 'def2-tzvpp', True, False) print('testing df ub3lyp hessian') - assert np.isclose(np.linalg.norm(h), 3.7669464279078064, atol=1e-4) + assert np.isclose(np.linalg.norm(h), 3.7669464279078064, atol=1e-4, rtol=1e-16) @pytest.mark.benchmark def test_ub3lyp(benchmark): e = benchmark(run_ub3lyp, small_mol, 'def2-tzvpp', False, False) print('testing ub3lyp') - assert np.isclose(np.linalg.norm(e), 684.9997358509884, atol=1e-7) + assert np.isclose(np.linalg.norm(e), 684.9997358509884, atol=1e-7, rtol=1e-16) @pytest.mark.benchmark def test_ub3lyp_grad(benchmark): g = benchmark(run_ub3lyp_grad, small_mol, 'def2-tzvpp', False, False) print('testing ub3lyp grad') - assert np.isclose(np.linalg.norm(g), 0.17441176110160253, atol=1e-5) + assert np.isclose(np.linalg.norm(g), 0.17441176110160253, atol=1e-5, rtol=1e-16) @pytest.mark.benchmark def test_ub3lyp_hessian(benchmark): h = benchmark(run_ub3lyp_hessian, small_mol, 'def2-tzvpp', False, False) print('testing ub3lyp hessian') - assert np.isclose(np.linalg.norm(h), 3.758916526520172, atol=1e-4) + assert np.isclose(np.linalg.norm(h), 3.758916526520172, atol=1e-4, rtol=1e-16) From b49d34e8ce5fe6b572532bd1682bfb86e4790e11 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Fri, 3 Jan 2025 04:31:48 +0800 Subject: [PATCH 35/49] bugfix --- gpu4pyscf/tests/test_benchmark_rks.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/gpu4pyscf/tests/test_benchmark_rks.py b/gpu4pyscf/tests/test_benchmark_rks.py index b8c593b7..d25f98b0 100644 --- a/gpu4pyscf/tests/test_benchmark_rks.py +++ b/gpu4pyscf/tests/test_benchmark_rks.py @@ -87,10 +87,10 @@ def run_rb3lyp_hessian(atom, basis, with_df, with_solvent, disp=None): mf.conv_tol = 1e-10 mf.conv_tol_cpscf = 1e-6 mf.kernel() - h = mf.Hessian() + hobj = mf.Hessian() if with_df: - h.auxbasis_response = 2 - h.kernel() + hobj.auxbasis_response = 2 + h = hobj.kernel() return h ####### @@ -130,6 +130,7 @@ def test_rb3lyp_grad(benchmark): def test_rb3lyp_hessian(benchmark): h = benchmark(run_rb3lyp_hessian, small_mol, 'def2-tzvpp', False, False) print('testing rb3lyp hessian') + print(np.linalg.norm(h) - 3.7588443634477833) assert np.isclose(np.linalg.norm(h), 3.7588443634477833, atol=1e-4, rtol=1e-16) #################### From ba388eec82973e4722d1afa3e83e00a3101248a0 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Fri, 3 Jan 2025 06:10:51 +0800 Subject: [PATCH 36/49] auxbasis_response --- gpu4pyscf/tests/test_benchmark_rks.py | 13 +++++-------- gpu4pyscf/tests/test_benchmark_uks.py | 5 ++++- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/gpu4pyscf/tests/test_benchmark_rks.py b/gpu4pyscf/tests/test_benchmark_rks.py index d25f98b0..ff952e99 100644 --- a/gpu4pyscf/tests/test_benchmark_rks.py +++ b/gpu4pyscf/tests/test_benchmark_rks.py @@ -36,7 +36,7 @@ # pytest test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-save=v100 # 5. compare benchmark results, fail if performance regresses by more than 10% -# pytest test_benchmark_rks.py -s -v -m "not slow" --benchmark-compare=v100 --benchmark-compare-fail=10% +# pytest test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-compare=v100 --benchmark-compare-fail=10% current_folder = os.path.dirname(os.path.abspath(__file__)) small_mol = os.path.join(current_folder, '020_Vitamin_C.xyz') @@ -106,12 +106,11 @@ def test_df_rb3lyp_grad(benchmark): g = benchmark(run_rb3lyp_grad, small_mol, 'def2-tzvpp', True, False) print('testing df rb3lyp grad') assert np.isclose(np.linalg.norm(g), 0.17435941081837686, atol=1e-5, rtol=1e-16) -@pytest.mark.slow @pytest.mark.benchmark def test_df_rb3lyp_hessian(benchmark): h = benchmark(run_rb3lyp_hessian, small_mol, 'def2-tzvpp', True, False) print('testing df rb3lyp hessian') - assert np.isclose(np.linalg.norm(h), 3.7668761221997764, atol=1e-4, rtol=1e-16) + assert np.isclose(np.linalg.norm(h), 3.7587394873290885, atol=1e-4, rtol=1e-16) ################ # Direct SCF @@ -130,7 +129,6 @@ def test_rb3lyp_grad(benchmark): def test_rb3lyp_hessian(benchmark): h = benchmark(run_rb3lyp_hessian, small_mol, 'def2-tzvpp', False, False) print('testing rb3lyp hessian') - print(np.linalg.norm(h) - 3.7588443634477833) assert np.isclose(np.linalg.norm(h), 3.7588443634477833, atol=1e-4, rtol=1e-16) #################### @@ -150,8 +148,7 @@ def test_df_rb3lyp_grad_medium(benchmark): def test_df_rb3lyp_hessian_medium(benchmark): h = benchmark(run_rb3lyp_hessian, medium_mol, 'def2-tzvpp', True, False) print('testing df rb3lyp hessian medium') - print(np.linalg.norm(h) - 6.32514169232998) - assert np.isclose(np.linalg.norm(h), 6.32514169232998, atol=1e-4, rtol=1e-16) + assert np.isclose(np.linalg.norm(h), 6.31265424196621, atol=1e-4, rtol=1e-16) @pytest.mark.benchmark def test_rb3lyp_medium(benchmark): @@ -232,7 +229,7 @@ def test_df_rb3lyp_631gs_grad(benchmark): def test_df_rb3lyp_631gs_hessian(benchmark): h = benchmark(run_rb3lyp_hessian, small_mol, '6-31gs', True, False) print('testing df rb3lyp 631gs hessian') - assert np.isclose(np.linalg.norm(h), 3.908874851569459, atol=1e-4, rtol=1e-16) + assert np.isclose(np.linalg.norm(h), 3.9071846157996553, atol=1e-4, rtol=1e-16) ######################################### # Small basis set for large molecule @@ -271,7 +268,7 @@ def test_df_rb3lyp_631gs_solvent_grad(benchmark): def test_df_rb3lyp_631gs_solvent_hessian(benchmark): h = benchmark(run_rb3lyp_hessian, small_mol, '6-31gs', True, True) print('testing df rb3lyp 631gs solvent hessian') - assert np.isclose(np.linalg.norm(h), 3.9008165041707294, atol=1e-4, rtol=1e-16) + assert np.isclose(np.linalg.norm(h), 3.8991230592666737, atol=1e-4, rtol=1e-16) # No need to test d3bj generally ''' diff --git a/gpu4pyscf/tests/test_benchmark_uks.py b/gpu4pyscf/tests/test_benchmark_uks.py index 33d86c99..0e5bf311 100644 --- a/gpu4pyscf/tests/test_benchmark_uks.py +++ b/gpu4pyscf/tests/test_benchmark_uks.py @@ -61,7 +61,10 @@ def run_ub3lyp_hessian(atom, basis, with_df, with_solvent): mf.conv_tol = 1e-10 mf.conv_tol_cpscf = 1e-6 mf.kernel() - h = mf.Hessian().kernel() + hobj = mf.Hessian() + if with_df: + hobj.auxbasis_response = 2 + h = hobj.kernel() return h ########## From 16858f9bc17ea91329bec7801bca9e5797d6a325 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Fri, 3 Jan 2025 22:38:58 +0800 Subject: [PATCH 37/49] add benchmark results --- .github/workflows/nightly_build.yml | 2 +- .../{0001_v100.json => v1.3.0_1v100.json} | 517 ++++++++++-------- gpu4pyscf/tests/test_benchmark_rks.py | 4 +- 3 files changed, 279 insertions(+), 244 deletions(-) rename gpu4pyscf/tests/.benchmarks/Linux-CPython-3.9-64bit/{0001_v100.json => v1.3.0_1v100.json} (61%) diff --git a/.github/workflows/nightly_build.yml b/.github/workflows/nightly_build.yml index b802fc3e..7be6d721 100644 --- a/.github/workflows/nightly_build.yml +++ b/.github/workflows/nightly_build.yml @@ -40,4 +40,4 @@ jobs: run: | echo $GITHUB_WORKSPACE export PYTHONPATH="${PYTHONPATH}:$(pwd)" - pytest gpu4pyscf/tests/ -v -m "not slow" + pytest gpu4pyscf/tests/ -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=v1.3.0_1v100 --benchmark-storage=gpu4pyscf/tests/ diff --git a/gpu4pyscf/tests/.benchmarks/Linux-CPython-3.9-64bit/0001_v100.json b/gpu4pyscf/tests/.benchmarks/Linux-CPython-3.9-64bit/v1.3.0_1v100.json similarity index 61% rename from gpu4pyscf/tests/.benchmarks/Linux-CPython-3.9-64bit/0001_v100.json rename to gpu4pyscf/tests/.benchmarks/Linux-CPython-3.9-64bit/v1.3.0_1v100.json index 4101f07e..81cb8ad0 100644 --- a/gpu4pyscf/tests/.benchmarks/Linux-CPython-3.9-64bit/0001_v100.json +++ b/gpu4pyscf/tests/.benchmarks/Linux-CPython-3.9-64bit/v1.3.0_1v100.json @@ -1,6 +1,6 @@ { "machine_info": { - "node": "mlxlabzskrh20v669fd914-20240723162348-uim1c3-1eg5qx-worker", + "node": "mlxlabzskrh20v669fd914-20240723162348-uim1c3-k3ligr-worker", "processor": "", "machine": "x86_64", "python_compiler": "GCC 10.2.1 20210110", @@ -34,7 +34,7 @@ 0 ], "hz_actual": [ - 3100000000, + 3100001000, 0 ], "stepping": 7, @@ -194,9 +194,9 @@ } }, "commit_info": { - "id": "ca79edb0c5d76c928180d5110838a368faa2f1f1", - "time": "2025-01-02T06:46:51+00:00", - "author_time": "2025-01-02T06:46:51+00:00", + "id": "ba388eec82973e4722d1afa3e83e00a3101248a0", + "time": "2025-01-03T06:10:51+08:00", + "author_time": "2025-01-03T06:10:51+08:00", "dirty": true, "project": "gpu4pyscf", "branch": "benchmark_ci" @@ -218,22 +218,22 @@ "warmup": false }, "stats": { - "min": 3.1617951644584537, - "max": 3.7628091080114245, - "mean": 3.4210989899933337, - "stddev": 0.22879307906931964, + "min": 2.912467209622264, + "max": 3.132086180150509, + "mean": 2.9854499623179436, + "stddev": 0.08575128159932316, "rounds": 5, - "median": 3.411895725876093, - "iqr": 0.3097648276016116, - "q1": 3.2490662271156907, - "q3": 3.5588310547173023, + "median": 2.9598704893141985, + "iqr": 0.08442470477893949, + "q1": 2.934416546020657, + "q3": 3.0188412507995963, "iqr_outliers": 0, - "stddev_outliers": 2, - "outliers": "2;0", - "ld15iqr": 3.1617951644584537, - "hd15iqr": 3.7628091080114245, - "ops": 0.2923037313228836, - "total": 17.10549494996667, + "stddev_outliers": 1, + "outliers": "1;0", + "ld15iqr": 2.912467209622264, + "hd15iqr": 3.132086180150509, + "ops": 0.33495788327451537, + "total": 14.927249811589718, "iterations": 1 } }, @@ -253,22 +253,57 @@ "warmup": false }, "stats": { - "min": 4.783565265126526, - "max": 5.297894531860948, - "mean": 5.111189672350884, - "stddev": 0.21592611330232545, + "min": 4.693447925150394, + "max": 4.811241740360856, + "mean": 4.7545236147940155, + "stddev": 0.05376631322845494, "rounds": 5, - "median": 5.1955106453970075, - "iqr": 0.32867400418035686, - "q1": 4.950462192296982, - "q3": 5.279136196477339, + "median": 4.767030920833349, + "iqr": 0.10001574829220772, + "q1": 4.700914891902357, + "q3": 4.800930640194565, + "iqr_outliers": 0, + "stddev_outliers": 2, + "outliers": "2;0", + "ld15iqr": 4.693447925150394, + "hd15iqr": 4.811241740360856, + "ops": 0.21032601392249553, + "total": 23.77261807397008, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_df_rb3lyp_hessian", + "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_hessian", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 5, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": false + }, + "stats": { + "min": 43.1729771643877, + "max": 44.22008949518204, + "mean": 43.53323510922492, + "stddev": 0.4568445249288835, + "rounds": 5, + "median": 43.26318317092955, + "iqr": 0.6843766365200281, + "q1": 43.210667157545686, + "q3": 43.895043794065714, "iqr_outliers": 0, "stddev_outliers": 1, "outliers": "1;0", - "ld15iqr": 4.783565265126526, - "hd15iqr": 5.297894531860948, - "ops": 0.1956491666528297, - "total": 25.555948361754417, + "ld15iqr": 43.1729771643877, + "hd15iqr": 44.22008949518204, + "ops": 0.02297095535149178, + "total": 217.66617554612458, "iterations": 1 } }, @@ -288,22 +323,22 @@ "warmup": false }, "stats": { - "min": 40.746477760374546, - "max": 41.372105130925775, - "mean": 41.03762242179364, - "stddev": 0.23753417144600847, + "min": 40.87554130144417, + "max": 41.24961415119469, + "mean": 41.05381844490766, + "stddev": 0.13780925683914672, "rounds": 5, - "median": 41.043180647306144, - "iqr": 0.33331693802028894, - "q1": 40.858045106288046, - "q3": 41.191362044308335, + "median": 41.05546211451292, + "iqr": 0.17331884242594242, + "q1": 40.96216300688684, + "q3": 41.13548184931278, "iqr_outliers": 0, "stddev_outliers": 2, "outliers": "2;0", - "ld15iqr": 40.746477760374546, - "hd15iqr": 41.372105130925775, - "ops": 0.02436788344416696, - "total": 205.1881121089682, + "ld15iqr": 40.87554130144417, + "hd15iqr": 41.24961415119469, + "ops": 0.02435827014098467, + "total": 205.26909222453833, "iterations": 1 } }, @@ -323,22 +358,22 @@ "warmup": false }, "stats": { - "min": 49.95022037625313, - "max": 50.498252051882446, - "mean": 50.14596408084035, - "stddev": 0.20855077359110574, + "min": 49.98093665204942, + "max": 50.76574368029833, + "mean": 50.31307061091066, + "stddev": 0.33630120438295324, "rounds": 5, - "median": 50.08650258369744, - "iqr": 0.19796669483184814, - "q1": 50.0301427282393, - "q3": 50.228109423071146, + "median": 50.36884331330657, + "iqr": 0.5613440982997417, + "q1": 49.981349020730704, + "q3": 50.542693119030446, "iqr_outliers": 0, "stddev_outliers": 1, "outliers": "1;0", - "ld15iqr": 49.95022037625313, - "hd15iqr": 50.498252051882446, - "ops": 0.01994178431564102, - "total": 250.72982040420175, + "ld15iqr": 49.98093665204942, + "hd15iqr": 50.76574368029833, + "ops": 0.01987555098223611, + "total": 251.56535305455327, "iterations": 1 } }, @@ -358,22 +393,22 @@ "warmup": false }, "stats": { - "min": 613.87584313564, - "max": 620.2618521982804, - "mean": 616.6373227955773, - "stddev": 2.3903446370287127, + "min": 611.3098333217204, + "max": 620.2315559927374, + "mean": 614.9859318509698, + "stddev": 3.295612103075669, "rounds": 5, - "median": 616.8295351918787, - "iqr": 2.8947918817866594, - "q1": 614.9020847703796, - "q3": 617.7968766521662, + "median": 614.4812579210848, + "iqr": 3.568844774272293, + "q1": 612.998380784411, + "q3": 616.5672255586833, "iqr_outliers": 0, "stddev_outliers": 2, "outliers": "2;0", - "ld15iqr": 613.87584313564, - "hd15iqr": 620.2618521982804, - "ops": 0.0016216987896003692, - "total": 3083.1866139778867, + "ld15iqr": 611.3098333217204, + "hd15iqr": 620.2315559927374, + "ops": 0.001626053456198948, + "total": 3074.929659254849, "iterations": 1 } }, @@ -393,22 +428,22 @@ "warmup": false }, "stats": { - "min": 18.224224156700075, - "max": 20.157692234031856, - "mean": 19.024707913957535, - "stddev": 0.8064738008793728, + "min": 18.450319150462747, + "max": 19.34435743652284, + "mean": 18.962213665619494, + "stddev": 0.34090358565345374, "rounds": 5, - "median": 19.21132487989962, - "iqr": 1.2390491359401494, - "q1": 18.252076843054965, - "q3": 19.491125978995115, + "median": 19.017266055569053, + "iqr": 0.4629710176959634, + "q1": 18.742521196603775, + "q3": 19.20549221429974, "iqr_outliers": 0, - "stddev_outliers": 1, - "outliers": "1;0", - "ld15iqr": 18.224224156700075, - "hd15iqr": 20.157692234031856, - "ops": 0.052563224861199936, - "total": 95.12353956978768, + "stddev_outliers": 2, + "outliers": "2;0", + "ld15iqr": 18.450319150462747, + "hd15iqr": 19.34435743652284, + "ops": 0.05273645881404165, + "total": 94.81106832809746, "iterations": 1 } }, @@ -428,22 +463,22 @@ "warmup": false }, "stats": { - "min": 28.5077072577551, - "max": 32.310115557163954, - "mean": 29.75330361928791, - "stddev": 1.5559254109717362, + "min": 28.927994549274445, + "max": 29.407788010314107, + "mean": 29.06979787014425, + "stddev": 0.19948441635503308, "rounds": 5, - "median": 28.967016119509935, - "iqr": 1.9220157761592418, - "q1": 28.759349649539217, - "q3": 30.68136542569846, + "median": 28.980533458292484, + "iqr": 0.2319285492412746, + "q1": 28.93826104514301, + "q3": 29.170189594384283, "iqr_outliers": 0, "stddev_outliers": 1, "outliers": "1;0", - "ld15iqr": 28.5077072577551, - "hd15iqr": 32.310115557163954, - "ops": 0.03360971315305434, - "total": 148.76651809643954, + "ld15iqr": 28.927994549274445, + "hd15iqr": 29.407788010314107, + "ops": 0.034399963992423795, + "total": 145.34898935072124, "iterations": 1 } }, @@ -463,22 +498,22 @@ "warmup": false }, "stats": { - "min": 603.508519613184, - "max": 605.9056743234396, - "mean": 605.091381527856, - "stddev": 0.9420383756463966, + "min": 674.9359990525991, + "max": 678.2040371634066, + "mean": 676.7355838540941, + "stddev": 1.3332352353981456, "rounds": 5, - "median": 605.3425533128902, - "iqr": 1.0546113743912429, - "q1": 604.6620287010446, - "q3": 605.7166400754359, + "median": 676.6573997996747, + "iqr": 2.1692251418717206, + "q1": 675.7630731766112, + "q3": 677.9322983184829, "iqr_outliers": 0, - "stddev_outliers": 1, - "outliers": "1;0", - "ld15iqr": 603.508519613184, - "hd15iqr": 605.9056743234396, - "ops": 0.0016526429404348803, - "total": 3025.45690763928, + "stddev_outliers": 2, + "outliers": "2;0", + "ld15iqr": 674.9359990525991, + "hd15iqr": 678.2040371634066, + "ops": 0.0014776820132685715, + "total": 3383.6779192704707, "iterations": 1 } }, @@ -498,22 +533,22 @@ "warmup": false }, "stats": { - "min": 462.8745871502906, - "max": 464.5682009514421, - "mean": 463.4215483263135, - "stddev": 0.6988488955671605, + "min": 465.53933845460415, + "max": 469.9319954663515, + "mean": 467.35331859253347, + "stddev": 1.7196427730040629, "rounds": 5, - "median": 463.2153818728402, - "iqr": 0.9204953673761338, - "q1": 462.88869020040147, - "q3": 463.8091855677776, + "median": 467.1924539171159, + "iqr": 2.4731017132289708, + "q1": 465.9859178052284, + "q3": 468.45901951845735, "iqr_outliers": 0, - "stddev_outliers": 1, - "outliers": "1;0", - "ld15iqr": 462.8745871502906, - "hd15iqr": 464.5682009514421, - "ops": 0.002157862541376389, - "total": 2317.1077416315675, + "stddev_outliers": 2, + "outliers": "2;0", + "ld15iqr": 465.53933845460415, + "hd15iqr": 469.9319954663515, + "ops": 0.00213970878180895, + "total": 2336.7665929626673, "iterations": 1 } }, @@ -533,22 +568,22 @@ "warmup": false }, "stats": { - "min": 552.8754901243374, - "max": 554.911680762656, - "mean": 554.3076192354783, - "stddev": 0.8161092352116484, + "min": 559.7645257860422, + "max": 562.2628617957234, + "mean": 560.7982054453344, + "stddev": 0.9539619574856013, "rounds": 5, - "median": 554.5314849242568, - "iqr": 0.6415546571370214, - "q1": 554.1099091696087, - "q3": 554.7514638267457, - "iqr_outliers": 1, - "stddev_outliers": 1, - "outliers": "1;1", - "ld15iqr": 554.5213821846992, - "hd15iqr": 554.911680762656, - "ops": 0.0018040524165611094, - "total": 2771.5380961773917, + "median": 560.6089988369495, + "iqr": 1.2622483419254422, + "q1": 560.1302895797417, + "q3": 561.3925379216671, + "iqr_outliers": 0, + "stddev_outliers": 2, + "outliers": "2;0", + "ld15iqr": 559.7645257860422, + "hd15iqr": 562.2628617957234, + "ops": 0.0017831726105576463, + "total": 2803.9910272266716, "iterations": 1 } }, @@ -568,22 +603,22 @@ "warmup": false }, "stats": { - "min": 1.5329652335494757, - "max": 1.6051436373963952, - "mean": 1.5614585245028139, - "stddev": 0.027036901841682143, + "min": 2.3893800172954798, + "max": 4.065618982538581, + "mean": 3.419478364661336, + "stddev": 0.6287030173245606, "rounds": 5, - "median": 1.5542930895462632, - "iqr": 0.02967151813209057, - "q1": 1.5455118480604142, - "q3": 1.5751833661925048, + "median": 3.485863795503974, + "iqr": 0.6505572367459536, + "q1": 3.1652946420945227, + "q3": 3.8158518788404763, "iqr_outliers": 0, "stddev_outliers": 2, "outliers": "2;0", - "ld15iqr": 1.5329652335494757, - "hd15iqr": 1.6051436373963952, - "ops": 0.6404268728933491, - "total": 7.807292622514069, + "ld15iqr": 2.3893800172954798, + "hd15iqr": 4.065618982538581, + "ops": 0.29244226556147246, + "total": 17.09739182330668, "iterations": 1 } }, @@ -603,22 +638,22 @@ "warmup": false }, "stats": { - "min": 2.280210480093956, - "max": 2.3804853092879057, - "mean": 2.317926473915577, - "stddev": 0.04595680285671692, + "min": 3.086430249735713, + "max": 3.4464661311358213, + "mean": 3.21519818790257, + "stddev": 0.14631235080321897, "rounds": 5, - "median": 2.290554977953434, - "iqr": 0.07659115269780159, - "q1": 2.283684498164803, - "q3": 2.3602756508626044, + "median": 3.1610366478562355, + "iqr": 0.20356103358790278, + "q1": 3.108103247359395, + "q3": 3.311664280947298, "iqr_outliers": 0, "stddev_outliers": 1, "outliers": "1;0", - "ld15iqr": 2.280210480093956, - "hd15iqr": 2.3804853092879057, - "ops": 0.43142006929613325, - "total": 11.589632369577885, + "ld15iqr": 3.086430249735713, + "hd15iqr": 3.4464661311358213, + "ops": 0.31102281774186635, + "total": 16.07599093951285, "iterations": 1 } }, @@ -638,22 +673,22 @@ "warmup": false }, "stats": { - "min": 13.266008610837162, - "max": 13.778700362890959, - "mean": 13.425486170500516, - "stddev": 0.20500983387954833, + "min": 15.165010405704379, + "max": 15.594494730234146, + "mean": 15.329469257220627, + "stddev": 0.16923297471060986, "rounds": 5, - "median": 13.3493886096403, - "iqr": 0.20474978047423065, - "q1": 13.303213707404211, - "q3": 13.507963487878442, + "median": 15.25765424221754, + "iqr": 0.22211500210687518, + "q1": 15.220200731419027, + "q3": 15.442315733525902, "iqr_outliers": 0, "stddev_outliers": 1, "outliers": "1;0", - "ld15iqr": 13.266008610837162, - "hd15iqr": 13.778700362890959, - "ops": 0.07448519832356425, - "total": 67.12743085250258, + "ld15iqr": 15.165010405704379, + "hd15iqr": 15.594494730234146, + "ops": 0.0652338305534597, + "total": 76.64734628610313, "iterations": 1 } }, @@ -673,22 +708,22 @@ "warmup": false }, "stats": { - "min": 54.99403030797839, - "max": 55.44687832519412, - "mean": 55.280799315497276, - "stddev": 0.17365676139080558, + "min": 55.83294254913926, + "max": 59.17145283520222, + "mean": 57.62203652448952, + "stddev": 1.374501327462731, "rounds": 5, - "median": 55.29263620171696, - "iqr": 0.18626192840747535, - "q1": 55.21340201841667, - "q3": 55.39966394682415, + "median": 57.98267317190766, + "iqr": 2.264786566141993, + "q1": 56.414323914796114, + "q3": 58.67911048093811, "iqr_outliers": 0, - "stddev_outliers": 1, - "outliers": "1;0", - "ld15iqr": 54.99403030797839, - "hd15iqr": 55.44687832519412, - "ops": 0.018089463473435388, - "total": 276.4039965774864, + "stddev_outliers": 2, + "outliers": "2;0", + "ld15iqr": 55.83294254913926, + "hd15iqr": 59.17145283520222, + "ops": 0.017354471662503586, + "total": 288.1101826224476, "iterations": 1 } }, @@ -708,22 +743,22 @@ "warmup": false }, "stats": { - "min": 68.62203936092556, - "max": 69.5449710758403, - "mean": 69.09264576677234, - "stddev": 0.4051404972610001, + "min": 72.57952445559204, + "max": 73.64580446854234, + "mean": 73.0582833636552, + "stddev": 0.48907908462094757, "rounds": 5, - "median": 68.97573095746338, - "iqr": 0.7140946059953421, - "q1": 68.78401179146022, - "q3": 69.49810639745556, + "median": 72.9562251791358, + "iqr": 0.9134543887339532, + "q1": 72.61263743927702, + "q3": 73.52609182801098, "iqr_outliers": 0, - "stddev_outliers": 2, - "outliers": "2;0", - "ld15iqr": 68.62203936092556, - "hd15iqr": 69.5449710758403, - "ops": 0.014473320407725865, - "total": 345.46322883386165, + "stddev_outliers": 1, + "outliers": "1;0", + "ld15iqr": 72.57952445559204, + "hd15iqr": 73.64580446854234, + "ops": 0.01368770184514733, + "total": 365.29141681827605, "iterations": 1 } }, @@ -743,22 +778,22 @@ "warmup": false }, "stats": { - "min": 2.3869710564613342, - "max": 2.530611244030297, - "mean": 2.4419715087860823, - "stddev": 0.05630153012020871, + "min": 3.96594556607306, + "max": 4.621823711320758, + "mean": 4.35534807741642, + "stddev": 0.24887039802683908, "rounds": 5, - "median": 2.43410103302449, - "iqr": 0.07564499671570957, - "q1": 2.398690618108958, - "q3": 2.4743356148246676, + "median": 4.413319645449519, + "iqr": 0.31258321227505803, + "q1": 4.208048852626234, + "q3": 4.520632064901292, "iqr_outliers": 0, - "stddev_outliers": 1, - "outliers": "1;0", - "ld15iqr": 2.3869710564613342, - "hd15iqr": 2.530611244030297, - "ops": 0.40950518726449253, - "total": 12.209857543930411, + "stddev_outliers": 2, + "outliers": "2;0", + "ld15iqr": 3.96594556607306, + "hd15iqr": 4.621823711320758, + "ops": 0.22960277392873663, + "total": 21.7767403870821, "iterations": 1 } }, @@ -778,22 +813,22 @@ "warmup": false }, "stats": { - "min": 2.973248357884586, - "max": 3.1569794192910194, - "mean": 3.076459738984704, - "stddev": 0.06885617804924284, + "min": 4.794365357607603, + "max": 5.184939783066511, + "mean": 4.969379325211048, + "stddev": 0.15483049255542325, "rounds": 5, - "median": 3.072229014709592, - "iqr": 0.08629889693111181, - "q1": 3.040569737320766, - "q3": 3.1268686342518777, + "median": 4.9499497301876545, + "iqr": 0.24070796929299831, + "q1": 4.846174816135317, + "q3": 5.086882785428315, "iqr_outliers": 0, "stddev_outliers": 2, "outliers": "2;0", - "ld15iqr": 2.973248357884586, - "hd15iqr": 3.1569794192910194, - "ops": 0.3250489474404826, - "total": 15.38229869492352, + "ld15iqr": 4.794365357607603, + "hd15iqr": 5.184939783066511, + "ops": 0.20123237421758508, + "total": 24.84689662605524, "iterations": 1 } }, @@ -813,26 +848,26 @@ "warmup": false }, "stats": { - "min": 107.97044172231108, - "max": 109.67866003140807, - "mean": 108.94072148483247, - "stddev": 0.7142319621034929, + "min": 152.7211031857878, + "max": 161.58804737962782, + "mean": 158.70457714907826, + "stddev": 3.529131682005075, "rounds": 5, - "median": 108.94882048200816, - "iqr": 1.1987205250188708, - "q1": 108.39640940236859, - "q3": 109.59512992738746, + "median": 159.99357100203633, + "iqr": 3.860361324157566, + "q1": 157.0660247253254, + "q3": 160.92638604948297, "iqr_outliers": 0, - "stddev_outliers": 2, - "outliers": "2;0", - "ld15iqr": 107.97044172231108, - "hd15iqr": 109.67866003140807, - "ops": 0.009179303995514913, - "total": 544.7036074241623, + "stddev_outliers": 1, + "outliers": "1;0", + "ld15iqr": 152.7211031857878, + "hd15iqr": 161.58804737962782, + "ops": 0.006301015496614541, + "total": 793.5228857453912, "iterations": 1 } } ], - "datetime": "2025-01-02T13:26:31.464476+00:00", + "datetime": "2025-01-03T13:58:40.332127+00:00", "version": "5.1.0" } \ No newline at end of file diff --git a/gpu4pyscf/tests/test_benchmark_rks.py b/gpu4pyscf/tests/test_benchmark_rks.py index ff952e99..cf812475 100644 --- a/gpu4pyscf/tests/test_benchmark_rks.py +++ b/gpu4pyscf/tests/test_benchmark_rks.py @@ -33,10 +33,10 @@ # pytest test_benchmark_rks.py -v # 4. save benchmark results -# pytest test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-save=v100 +# pytest test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-save=1v100 # 5. compare benchmark results, fail if performance regresses by more than 10% -# pytest test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-compare=v100 --benchmark-compare-fail=10% +# pytest test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=1v100 --benchmark-storage=./benchmark_results/ current_folder = os.path.dirname(os.path.abspath(__file__)) small_mol = os.path.join(current_folder, '020_Vitamin_C.xyz') From 7bed7c83bb4cdbe4b4768d2551b83a85427e2546 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Sat, 4 Jan 2025 07:57:13 +0800 Subject: [PATCH 38/49] split nightly benchmark --- .github/workflows/nightly_build.yml | 9 +++++++-- .gitignore | 1 + gpu4pyscf/tests/test_benchmark_rks.py | 8 +++----- gpu4pyscf/tests/test_benchmark_uks.py | 19 +++++++++---------- 4 files changed, 20 insertions(+), 17 deletions(-) diff --git a/.github/workflows/nightly_build.yml b/.github/workflows/nightly_build.yml index 7be6d721..012ca12f 100644 --- a/.github/workflows/nightly_build.yml +++ b/.github/workflows/nightly_build.yml @@ -36,8 +36,13 @@ jobs: export PATH=${CUDA_HOME}/bin:${PATH} export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:$LD_LIBRARY_PATH sh build.sh - - name: Smoke Test + - name: Test RKS run: | echo $GITHUB_WORKSPACE export PYTHONPATH="${PYTHONPATH}:$(pwd)" - pytest gpu4pyscf/tests/ -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=v1.3.0_1v100 --benchmark-storage=gpu4pyscf/tests/ + pytest gpu4pyscf/tests/test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=v1.3.0_1v100 --benchmark-storage=gpu4pyscf/tests/benchmark_results/ + - name: Test UKS + run: | + echo $GITHUB_WORKSPACE + export PYTHONPATH="${PYTHONPATH}:$(pwd)" + pytest gpu4pyscf/tests/test_benchmark_uks.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=v1.3.0_1v100 --benchmark-storage=gpu4pyscf/tests/benchmark_results/ diff --git a/.gitignore b/.gitignore index 427ffd8a..b8dd78e9 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ **/build **/launch_logs **/deps +**/.benchmarks core **tmp* *.egg-info/ diff --git a/gpu4pyscf/tests/test_benchmark_rks.py b/gpu4pyscf/tests/test_benchmark_rks.py index cf812475..5027da8f 100644 --- a/gpu4pyscf/tests/test_benchmark_rks.py +++ b/gpu4pyscf/tests/test_benchmark_rks.py @@ -13,12 +13,10 @@ # limitations under the License. import os -import unittest import numpy as np import pyscf import pytest -import cupy -from gpu4pyscf.dft import rks, uks +from gpu4pyscf.dft import rks # Any task taking more than 1000s will be marked as 'slow' @@ -33,10 +31,10 @@ # pytest test_benchmark_rks.py -v # 4. save benchmark results -# pytest test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-save=1v100 +# pytest test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-save=v1.3.0_rks_1v100 # 5. compare benchmark results, fail if performance regresses by more than 10% -# pytest test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=1v100 --benchmark-storage=./benchmark_results/ +# pytest test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=1v100 --benchmark-storage=benchmark_results/ current_folder = os.path.dirname(os.path.abspath(__file__)) small_mol = os.path.join(current_folder, '020_Vitamin_C.xyz') diff --git a/gpu4pyscf/tests/test_benchmark_uks.py b/gpu4pyscf/tests/test_benchmark_uks.py index 0e5bf311..2c9d8ce6 100644 --- a/gpu4pyscf/tests/test_benchmark_uks.py +++ b/gpu4pyscf/tests/test_benchmark_uks.py @@ -13,12 +13,10 @@ # limitations under the License. import os -import unittest import numpy as np import pyscf import pytest -import cupy -from gpu4pyscf.dft import rks, uks +from gpu4pyscf.dft import uks current_folder = os.path.dirname(os.path.abspath(__file__)) small_mol = os.path.join(current_folder, '020_Vitamin_C.xyz') @@ -84,19 +82,20 @@ def test_df_ub3lyp_grad(benchmark): def test_df_ub3lyp_hessian(benchmark): h = benchmark(run_ub3lyp_hessian, small_mol, 'def2-tzvpp', True, False) print('testing df ub3lyp hessian') - assert np.isclose(np.linalg.norm(h), 3.7669464279078064, atol=1e-4, rtol=1e-16) + assert np.isclose(np.linalg.norm(h), 3.758810345806532, atol=1e-4, rtol=1e-16) @pytest.mark.benchmark def test_ub3lyp(benchmark): - e = benchmark(run_ub3lyp, small_mol, 'def2-tzvpp', False, False) + e = benchmark(run_ub3lyp, small_mol, '6-31gs', False, False) print('testing ub3lyp') - assert np.isclose(np.linalg.norm(e), 684.9997358509884, atol=1e-7, rtol=1e-16) + assert np.isclose(np.linalg.norm(e), 684.6643858622429, atol=1e-7, rtol=1e-16) @pytest.mark.benchmark def test_ub3lyp_grad(benchmark): - g = benchmark(run_ub3lyp_grad, small_mol, 'def2-tzvpp', False, False) + g = benchmark(run_ub3lyp_grad, small_mol, '6-31gs', False, False) print('testing ub3lyp grad') - assert np.isclose(np.linalg.norm(g), 0.17441176110160253, atol=1e-5, rtol=1e-16) + assert np.isclose(np.linalg.norm(g), 0.17540045665419984, atol=1e-5, rtol=1e-16) @pytest.mark.benchmark def test_ub3lyp_hessian(benchmark): - h = benchmark(run_ub3lyp_hessian, small_mol, 'def2-tzvpp', False, False) + h = benchmark(run_ub3lyp_hessian, small_mol, '6-31gs', False, False) print('testing ub3lyp hessian') - assert np.isclose(np.linalg.norm(h), 3.758916526520172, atol=1e-4, rtol=1e-16) + print(np.linalg.norm(h), np.linalg.norm(h) - 3.758916526520172) + assert np.isclose(np.linalg.norm(h), 3.907289414559395, atol=1e-4, rtol=1e-16) From 651708e26ba01ada3e94292d908ccb752aed920e Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Sat, 4 Jan 2025 08:12:11 +0000 Subject: [PATCH 39/49] optimize df.hessian memory --- .github/workflows/nightly_build.yml | 9 +++- gpu4pyscf/df/hessian/jk.py | 12 +++--- gpu4pyscf/df/hessian/rhf.py | 61 ++++++++++++--------------- gpu4pyscf/df/hessian/uhf.py | 41 ++++++++---------- gpu4pyscf/df/int3c2e.py | 2 +- gpu4pyscf/hessian/rks.py | 2 +- gpu4pyscf/tests/test_benchmark_rks.py | 4 +- 7 files changed, 62 insertions(+), 69 deletions(-) diff --git a/.github/workflows/nightly_build.yml b/.github/workflows/nightly_build.yml index 7be6d721..012ca12f 100644 --- a/.github/workflows/nightly_build.yml +++ b/.github/workflows/nightly_build.yml @@ -36,8 +36,13 @@ jobs: export PATH=${CUDA_HOME}/bin:${PATH} export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:$LD_LIBRARY_PATH sh build.sh - - name: Smoke Test + - name: Test RKS run: | echo $GITHUB_WORKSPACE export PYTHONPATH="${PYTHONPATH}:$(pwd)" - pytest gpu4pyscf/tests/ -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=v1.3.0_1v100 --benchmark-storage=gpu4pyscf/tests/ + pytest gpu4pyscf/tests/test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=v1.3.0_1v100 --benchmark-storage=gpu4pyscf/tests/benchmark_results/ + - name: Test UKS + run: | + echo $GITHUB_WORKSPACE + export PYTHONPATH="${PYTHONPATH}:$(pwd)" + pytest gpu4pyscf/tests/test_benchmark_uks.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=v1.3.0_1v100 --benchmark-storage=gpu4pyscf/tests/benchmark_results/ diff --git a/gpu4pyscf/df/hessian/jk.py b/gpu4pyscf/df/hessian/jk.py index 6b08cee5..7859e97b 100644 --- a/gpu4pyscf/df/hessian/jk.py +++ b/gpu4pyscf/df/hessian/jk.py @@ -22,7 +22,7 @@ from gpu4pyscf.scf.int4c2e import libgint from gpu4pyscf.hessian.jk import _ao2mo from gpu4pyscf.lib import logger -from gpu4pyscf.lib.cupy_helper import contract, cart2sph, reduce_to_device +from gpu4pyscf.lib.cupy_helper import contract, cart2sph, reduce_to_device, get_avail_mem, release_gpu_stack from gpu4pyscf.__config__ import _streams, _num_devices NROOT_ON_GPU = 7 @@ -310,7 +310,7 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo, if with_k: rhok_tmp = contract('por,ir->poi', rhok[k0:k1], orbo[i0:i1]) rhok_tmp = contract('poi,jo->pji', rhok_tmp, orbo[j0:j1]) - + # (20|0), (0|0)(0|00) int3c_blk = _get_int3c2e_ipip_slice('ipip1', intopt, cp_ij_id, aux_id, omega=omega) if with_j: @@ -320,7 +320,7 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo, hj_ipip1[:,i0:i1] += contract('xji,ij->xi', tmp, dm0[i0:i1,j0:j1]) if with_k: hk_ipip1[:,i0:i1] += contract('xpji,pji->xi', int3c_blk, rhok_tmp) - int3c_blk = None + int3c_blk = tmp = None # (11|0), (0|0)(0|00) without response of RI basis int3c_blk = _get_int3c2e_ipip_slice('ipvip1', intopt, cp_ij_id, aux_id, omega=omega) @@ -331,7 +331,7 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo, hj_ipvip1[:,i0:i1,j0:j1] += contract('xji,ij->xij', tmp, dm0[i0:i1,j0:j1]) if with_k: hk_ipvip1[:,i0:i1,j0:j1] += contract('xpji,pji->xij', int3c_blk, rhok_tmp) - int3c_blk = None + int3c_blk = tmp = None if auxbasis_response < 1: continue @@ -343,7 +343,7 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo, hj_ip1ip2[:,i0:i1,k0:k1] += contract('xpi,p->xip', tmp, rhoj[k0:k1]) if with_k: hk_ip1ip2[:,i0:i1,k0:k1] += contract('xpji,pji->xip', int3c_blk, rhok_tmp) - int3c_blk = None + int3c_blk = tmp = None if auxbasis_response < 2: continue @@ -355,7 +355,7 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo, hj_ipip2[:,k0:k1] += contract('xp,p->xp', tmp, rhoj[k0:k1]) if with_k: hk_ipip2[:,k0:k1] += contract('xpji,pji->xp', int3c_blk, rhok_tmp) - int3c_blk = None + int3c_blk = tmp = None auxslices = intopt.auxmol.aoslice_by_atom() aoslices = intopt.mol.aoslice_by_atom() ao2atom = int3c2e.get_ao2atom(intopt, aoslices) diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py index 47faa476..9dfee665 100644 --- a/gpu4pyscf/df/hessian/rhf.py +++ b/gpu4pyscf/df/hessian/rhf.py @@ -122,22 +122,10 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls int2c = intopt.sort_orbitals(int2c, aux_axis=[0,1]) solve_j2c = _gen_metric_solver(int2c) - int2c_ip1 = cupy.asarray(int2c_ip1, order='C') - int2c_ip1 = intopt.sort_orbitals(int2c_ip1, aux_axis=[1,2]) - if with_j: - hj_ao_ao = cupy.zeros([nao,nao,3,3]) - if with_k: - hk_ao_ao = cupy.zeros([nao,nao,3,3]) - if hessobj.auxbasis_response: - if with_j: - hj_ao_aux = cupy.zeros([nao,naux,3,3]) - if with_k: - hk_ao_aux = cupy.zeros([nao,naux,3,3]) - # int3c contributions wj, wk_P__ = int3c2e.get_int3c2e_jk(mol, auxmol, dm0_tag, omega=omega) rhoj0_P = rhok0_P__ = None - + if with_j: rhoj0_P = solve_j2c(wj) wj = None @@ -146,6 +134,11 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls wk_P__ = None t1 = log.timer_debug1('intermediate variables with int3c2e', *t1) + hj_ipip, hk_ipip = jk.get_int3c2e_hjk(intopt, rhoj0_P, rhok0_P__, dm0_tag, + with_j=with_j, with_k=with_k, omega=omega, + auxbasis_response=hessobj.auxbasis_response) + t1 = log.timer_debug1('intermediate variables with int3c2e_ipip', *t1) + # int3c_ip2 contributions wj_ip2, wk_ip2_P__ = int3c2e.get_int3c2e_ip2_wjk(intopt, dm0_tag, omega=omega) t1 = log.timer_debug1('intermediate variables with int3c2e_ip2', *t1) @@ -153,17 +146,22 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls # int3c_ip1 contributions wj1_P, wk1_Pko = int3c2e.get_int3c2e_ip1_wjk(intopt, dm0_tag, omega=omega) t1 = log.timer_debug1('intermediate variables with int3c2e_ip1', *t1) + + cupy.get_default_memory_pool().free_all_blocks() + release_gpu_stack() #rhoj1_P = contract('pq,pix->qix', int2c_inv, wj1_P) + int2c_ip1 = cupy.asarray(int2c_ip1, order='C') + int2c_ip1 = intopt.sort_orbitals(int2c_ip1, aux_axis=[1,2]) + if with_j: rhoj1_P = solve_j2c(wj1_P) - - hj_ao_ao += 4.0*contract('pix,pjy->ijxy', rhoj1_P, wj1_P) # (10|0)(0|0)(0|01) + hj_ao_ao = 4.0*contract('pix,pjy->ijxy', rhoj1_P, wj1_P) # (10|0)(0|0)(0|01) wj1_P = None if hessobj.auxbasis_response: wj0_01 = contract('ypq,q->yp', int2c_ip1, rhoj0_P) wj1_01 = contract('yqp,pix->iqxy', int2c_ip1, rhoj1_P) - hj_ao_aux += contract('pix,py->ipxy', rhoj1_P, wj_ip2) # (10|0)(1|00) + hj_ao_aux = contract('pix,py->ipxy', rhoj1_P, wj_ip2) # (10|0)(1|00) hj_ao_aux -= contract('pix,yp->ipxy', rhoj1_P, wj0_01) # (10|0)(1|0)(0|00) hj_ao_aux -= contract('q,iqxy->iqxy', rhoj0_P, wj1_01) # (10|0)(0|1)(0|00) wj1_01 = None @@ -174,11 +172,12 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls mem_avail = get_avail_mem() nocc = mocc.shape[1] slice_size = naux*nocc*9 # largest slice of intermediate variables - blksize = int(mem_avail*0.2/8/slice_size/ALIGNED) * ALIGNED + blksize = int(mem_avail*0.4/8/slice_size/ALIGNED) * ALIGNED log.debug(f'GPU Memory {mem_avail/GB:.1f} GB available, {blksize} aux AOs per block') if blksize < ALIGNED: raise RuntimeError('Not enough memory for intermediate variables') - + if hessobj.auxbasis_response: + hk_ao_aux = cupy.zeros([nao,naux,3,3]) for i0, i1 in lib.prange(0,nao,blksize): #wk1_Pko_islice = cupy.asarray(wk1_Pko[:,i0:i1]) wk1_Pko_islice = copy_array(wk1_Pko[:,i0:i1]) @@ -187,6 +186,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls rhok1_Pko = solve_j2c(wk1_Pko_islice) wk1_Pko_islice = None if hessobj.auxbasis_response: + hk_ao_aux = cupy.zeros([nao,naux,3,3]) # (10|0)(1|00) wk_ip2_Ipo = contract('porx,io->pirx', wk_ip2_P__, mocc_2[i0:i1]) hk_ao_aux[i0:i1] += contract('piox,pioy->ipxy', rhok1_Pko, wk_ip2_Ipo) @@ -205,6 +205,11 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls rhok1_Pko = None t1 = log.timer_debug1('contract int3c2e_ip1 with int2c_ip1', *t1) + rho2c_0 = contract('pij,qji->pq', rhok0_P__, rhok0_P__) + rho2c_10 = contract('rijx,qij->rqx', wk_ip2_P__, rhok0_P__) + rho2c_11 = contract('pijx,qijy->pqxy', wk_ip2_P__, wk_ip2_P__) + rhok0_P__ = wk_ip2_P__ = None + w, v = cupy.linalg.eigh(int2c) idx = w > LINEAR_DEP_THR cd_low = (v[:,idx] / cupy.sqrt(w[idx])) @@ -223,17 +228,14 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls copy_array(wk1_tmp, rhok1_Pko[:,i0:i1]) wk1_tmp = None cd_low = None - hk_ao_ao += _hk_ip1_ip1(rhok1_Pko, dm0, mocc_2) + hk_ao_ao = _hk_ip1_ip1(rhok1_Pko, dm0, mocc_2) wk1_Pko = rhok1_Pko = None + solve_j2c = None t1 = log.timer_debug1('contract int3c2e_ip1 with int3c2e_ip1', *t1) - hj_ipip, hk_ipip = jk.get_int3c2e_hjk(intopt, rhoj0_P, rhok0_P__, dm0_tag, - with_j=with_j, with_k=with_k, omega=omega, - auxbasis_response=hessobj.auxbasis_response) - t1 = log.timer_debug1('intermediate variables with int3c2e_ipip', *t1) - # int2c contributions if hessobj.auxbasis_response > 1: + cupy.get_default_memory_pool().free_all_blocks() if omega and omega > 1e-10: with auxmol.with_range_coulomb(omega): int2c_ipip1 = auxmol.intor('int2c2e_ipip1', aosym='s1') @@ -248,7 +250,6 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls rhoj2c_P = contract('xpq,q->xp', int2c_ipip1, rhoj0_P) hj_aux_diag = -(rhoj0_P*rhoj2c_P).T.reshape(-1,3,3) if with_k: - rho2c_0 = contract('pij,qji->pq', rhok0_P__, rhok0_P__) hk_aux_diag = -.5 * contract('pq,xpq->px', rho2c_0, int2c_ipip1).reshape(-1,3,3) int2c_ipip1 = None @@ -266,10 +267,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls t1 = log.timer_debug1('intermediate variables with int2c_*', *t1) int2c_ip1ip2 = None - cupy.get_default_memory_pool().free_all_blocks() - release_gpu_stack() - # aux-aux pair - if hessobj.auxbasis_response > 1: + # aux-aux pair int2c_inv = pinv(int2c, lindep=LINEAR_DEP_THR) int2c_ip1_inv = contract('yqp,pr->yqr', int2c_ip1, int2c_inv) if with_j: @@ -290,11 +288,6 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls wj0_01 = rhoj0_01 = None if with_k: - rho2c_10 = contract('rijx,qij->rqx', wk_ip2_P__, rhok0_P__) - rhok0_P__ = None - - rho2c_11 = contract('pijx,qijy->pqxy', wk_ip2_P__, wk_ip2_P__) - wk_ip2_P__ = None hk_aux_aux += .5 * contract('pqxy,pq->pqxy', rho2c_11, int2c_inv) # (00|1)(1|00) rho2c_11 = None diff --git a/gpu4pyscf/df/hessian/uhf.py b/gpu4pyscf/df/hessian/uhf.py index b77015f6..5e94a248 100644 --- a/gpu4pyscf/df/hessian/uhf.py +++ b/gpu4pyscf/df/hessian/uhf.py @@ -116,16 +116,6 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, int2c_ip1 = cupy.asarray(int2c_ip1, order='C') int2c_ip1 = intopt.sort_orbitals(int2c_ip1, aux_axis=[1,2]) - if with_j: - hj_ao_ao = cupy.zeros([nao,nao,3,3]) - if with_k: - hk_ao_ao = cupy.zeros([nao,nao,3,3]) - if hessobj.auxbasis_response: - if with_j: - hj_ao_aux = cupy.zeros([nao,naux,3,3]) - if with_k: - hk_ao_aux = cupy.zeros([nao,naux,3,3]) - # int3c contributions wja, wka_P__ = int3c2e.get_int3c2e_jk(mol, auxmol, dm0a_tag, omega=omega) wjb, wkb_P__ = int3c2e.get_int3c2e_jk(mol, auxmol, dm0b_tag, omega=omega) @@ -153,12 +143,12 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, if with_j: wj1_P = wj1a_P + wj1b_P rhoj1_P = solve_j2c(wj1_P) - hj_ao_ao += 4.0*contract('pix,pjy->ijxy', rhoj1_P, wj1_P) # (10|0)(0|0)(0|01) + hj_ao_ao = 4.0*contract('pix,pjy->ijxy', rhoj1_P, wj1_P) # (10|0)(0|0)(0|01) wj1_P = None if hessobj.auxbasis_response: wj0_01 = contract('ypq,q->yp', int2c_ip1, rhoj0_P) wj1_01 = contract('yqp,pix->iqxy', int2c_ip1, rhoj1_P) - hj_ao_aux += contract('pix,py->ipxy', rhoj1_P, wj_ip2) # (10|0)(1|00) + hj_ao_aux = contract('pix,py->ipxy', rhoj1_P, wj_ip2) # (10|0)(1|00) hj_ao_aux -= contract('pix,yp->ipxy', rhoj1_P, wj0_01) # (10|0)(1|0)(0|00) hj_ao_aux -= contract('q,iqxy->iqxy', rhoj0_P, wj1_01) # (10|0)(0|1)(0|00) wj1_01 = None @@ -173,7 +163,9 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, log.debug(f'GPU Memory {mem_avail/GB:.1f} GB available, block size {blksize}') if blksize < ALIGNED: raise RuntimeError('Not enough memory for intermediate variables') - + hk_ao_ao = cupy.zeros([nao,nao,3,3]) + if hessobj.auxbasis_response: + hk_ao_aux = cupy.zeros([nao,naux,3,3]) for i0, i1 in lib.prange(0,nao,blksize): #wk1a_Pko_islice = cupy.asarray(wk1a_Pko[:,i0:i1]) #wk1b_Pko_islice = cupy.asarray(wk1b_Pko[:,i0:i1]) @@ -244,6 +236,19 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, hk_ipip = 2.0*(hka_ipip + hkb_ipip) t1 = log.timer_debug1('intermediate variables with int3c2e_ipip', *t1) + if hessobj.auxbasis_response > 1: + if with_k: + rho2c_0 = contract('pij,qji->pq', rhok0a_P__, rhok0a_P__) + rho2c_0+= contract('pij,qji->pq', rhok0b_P__, rhok0b_P__) + + rho2c_10 = contract('rijx,qij->rqx', wka_ip2_P__, rhok0a_P__) + rho2c_10+= contract('rijx,qij->rqx', wkb_ip2_P__, rhok0b_P__) + rhok0a_P__ = rhok0b_P__ = None + + rho2c_11 = contract('pijx,qijy->pqxy', wka_ip2_P__, wka_ip2_P__) + rho2c_11+= contract('pijx,qijy->pqxy', wkb_ip2_P__, wkb_ip2_P__) + wka_ip2_P__ = wkb_ip2_P__ = None + # int2c contributions if hessobj.auxbasis_response > 1: if omega and omega > 1e-10: @@ -259,8 +264,6 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, # p,xp->px hj_aux_diag = -(rhoj0_P*rhoj2c_P).T.reshape(-1,3,3) if with_k: - rho2c_0 = contract('pij,qji->pq', rhok0a_P__, rhok0a_P__) - rho2c_0+= contract('pij,qji->pq', rhok0b_P__, rhok0b_P__) hk_aux_diag = -contract('pq,xpq->px', rho2c_0, int2c_ipip1).reshape(-1,3,3) int2c_ipip1 = None @@ -301,14 +304,6 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, wj0_01 = rhoj0_01 = None if with_k: - rho2c_10 = contract('rijx,qij->rqx', wka_ip2_P__, rhok0a_P__) - rho2c_10+= contract('rijx,qij->rqx', wkb_ip2_P__, rhok0b_P__) - rhok0a_P__ = rhok0b_P__ = None - - - rho2c_11 = contract('pijx,qijy->pqxy', wka_ip2_P__, wka_ip2_P__) - rho2c_11+= contract('pijx,qijy->pqxy', wkb_ip2_P__, wkb_ip2_P__) - wka_ip2_P__ = wkb_ip2_P__ = None hk_aux_aux += .5 * contract('pqxy,pq->pqxy', rho2c_11, int2c_inv) # (00|1)(1|00) rho2c_11 = None diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py index 8bfa8a81..e77e30ca 100644 --- a/gpu4pyscf/df/int3c2e.py +++ b/gpu4pyscf/df/int3c2e.py @@ -831,7 +831,7 @@ def get_int3c2e_jk(mol, auxmol, dm0_tag, with_k=True, omega=None): for device_id in range(_num_devices): future = executor.submit( _int3c2e_jk_task, intopt, task_list[device_id], - dm0_tag.get(), orbo.get(), device_id=device_id, omega=omega) + dm0_tag, orbo, device_id=device_id, omega=omega) futures.append(future) rhoj_total = [] diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py index 912748c7..a2ee9da7 100644 --- a/gpu4pyscf/hessian/rks.py +++ b/gpu4pyscf/hessian/rks.py @@ -767,7 +767,7 @@ def _nr_rks_fxc_mo_task(ni, mol, grids, xc_code, fxc, mo_coeff, mo1, mocc, t1 = log.timer_debug2('integration', *t1) ao = rho1 = None - t0 = log.timer_debug1('vxc', *t0) + t0 = log.timer_debug1(f'vxc on Device {device_id} ', *t0) if xctype != 'LDA': transpose_sum(vmat) vmat = jk._ao2mo(vmat, mocc, mo_coeff) diff --git a/gpu4pyscf/tests/test_benchmark_rks.py b/gpu4pyscf/tests/test_benchmark_rks.py index cf812475..554dbba1 100644 --- a/gpu4pyscf/tests/test_benchmark_rks.py +++ b/gpu4pyscf/tests/test_benchmark_rks.py @@ -33,10 +33,10 @@ # pytest test_benchmark_rks.py -v # 4. save benchmark results -# pytest test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-save=1v100 +# pytest test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-save=v1.3.0_rks_1v100 # 5. compare benchmark results, fail if performance regresses by more than 10% -# pytest test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=1v100 --benchmark-storage=./benchmark_results/ +# pytest test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=1v100 --benchmark-storage=benchmark_results/ current_folder = os.path.dirname(os.path.abspath(__file__)) small_mol = os.path.join(current_folder, '020_Vitamin_C.xyz') From e85afa66692c94942a74efac9b72382abc412923 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Sun, 5 Jan 2025 07:57:00 +0000 Subject: [PATCH 40/49] small fixes --- .../cupy_helper/benchmark_memory_copy.py | 16 ++++++ examples/dft_driver.py | 4 +- gpu4pyscf/df/grad/rhf.py | 3 +- gpu4pyscf/df/grad/uhf.py | 54 +------------------ gpu4pyscf/df/hessian/rhf.py | 7 ++- gpu4pyscf/dft/numint.py | 29 +++++++--- 6 files changed, 45 insertions(+), 68 deletions(-) diff --git a/benchmarks/cupy_helper/benchmark_memory_copy.py b/benchmarks/cupy_helper/benchmark_memory_copy.py index d10f97ac..8455f3f0 100644 --- a/benchmarks/cupy_helper/benchmark_memory_copy.py +++ b/benchmarks/cupy_helper/benchmark_memory_copy.py @@ -123,3 +123,19 @@ def cupy_asarray_contiguous(a, b): print(f"Effective Bandwidth: {bandwidth:.2f} GB/s") assert np.linalg.norm(a.get() - b.get()) < 1e-10 + + +print('----------- Benchmark reduction across devices ------ ') +from gpu4pyscf.lib.cupy_helper import reduce_to_device +_num_devices = cp.cuda.runtime.getDeviceCount() +a_dist = [] +for device_id in range(_num_devices): + with cp.cuda.Device(device_id): + a = cp.random.rand(512,512,512) + a_dist.append(a) + +perf_cupy = profiler.benchmark(reduce_to_device, (a_dist,), n_repeat=20, n_warmup=3) +t_kernel = perf_cupy.gpu_times.mean() +bandwidth = a_dist[0].nbytes * _num_devices / t_kernel / 1e9 +print('Cupy set contiguous array', t_kernel) +print(f"Effective Bandwidth: {bandwidth:.2f} GB/s") diff --git a/examples/dft_driver.py b/examples/dft_driver.py index 0be7f410..0c8cea48 100644 --- a/examples/dft_driver.py +++ b/examples/dft_driver.py @@ -34,10 +34,10 @@ basis=bas, max_memory=32000) # set verbose >= 6 for debugging timer -mol.verbose = 6 +mol.verbose = 4 mf_df = dft.RKS(mol, xc=args.xc).density_fit(auxbasis=args.auxbasis) -mf_df.verbose = 6 +mf_df.verbose = 4 if args.solvent: mf_df = mf_df.PCM() diff --git a/gpu4pyscf/df/grad/rhf.py b/gpu4pyscf/df/grad/rhf.py index ea0537ed..17816bc8 100644 --- a/gpu4pyscf/df/grad/rhf.py +++ b/gpu4pyscf/df/grad/rhf.py @@ -151,7 +151,8 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega cart2sph = intopt.cart2sph orbo_cart = cart2sph @ orbo dm_cart = cart2sph @ dm @ cart2sph.T - + + with_df._cderi = None # release GPU memory vj, vk, vjaux, vkaux = get_grad_vjk(with_df, mol, auxmol, rhoj_cart, dm_cart, rhok_cart, orbo_cart, with_j=with_j, with_k=with_k, omega=omega) # NOTE: vj and vk are still in cartesian diff --git a/gpu4pyscf/df/grad/uhf.py b/gpu4pyscf/df/grad/uhf.py index 42107967..53acd7e0 100644 --- a/gpu4pyscf/df/grad/uhf.py +++ b/gpu4pyscf/df/grad/uhf.py @@ -165,59 +165,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, orbo_cart = orbo dm = orbo = None - """ - vj = vk = rhoj_tmp = rhok_tmp = None - vjaux = vkaux = None - - naux_cart = intopt._sorted_auxmol.nao - if with_j: - vj = cupy.zeros((3,nao_cart), order='C') - vjaux = cupy.zeros((3,naux_cart)) - if with_k: - vk = cupy.zeros((3,nao_cart), order='C') - vkaux = cupy.zeros((3,naux_cart)) - cupy.get_default_memory_pool().free_all_blocks() - t1 = log.init_timer() - for cp_kl_id in range(len(intopt.aux_log_qs)): - k0, k1 = intopt.cart_aux_loc[cp_kl_id], intopt.cart_aux_loc[cp_kl_id+1] - assert k1-k0 <= block_size - if with_j: - rhoj_tmp = rhoj_cart[k0:k1] - if with_k: - rhok_tmp = contract('por,ir->pio', rhok_cart[k0:k1], orbo_cart) - rhok_tmp = contract('pio,jo->pji', rhok_tmp, orbo_cart) - ''' - if(rhoj_tmp.flags['C_CONTIGUOUS'] == False): - rhoj_tmp = rhoj_tmp.astype(cupy.float64, order='C') - - if(rhok_tmp.flags['C_CONTIGUOUS'] == False): - rhok_tmp = rhok_tmp.astype(cupy.float64, order='C') - ''' - ''' - # outcore implementation - int3c2e.get_int3c2e_ip_slice(intopt, cp_kl_id, 1, out=buf) - size = 3*(k1-k0)*nao_cart*nao_cart - int3c_ip = buf[:size].reshape([3,k1-k0,nao_cart,nao_cart], order='C') - rhoj_tmp = contract('xpji,ij->xip', int3c_ip, dm_cart) - vj += contract('xip,p->xi', rhoj_tmp, rhoj_cart[k0:k1]) - vk += contract('pji,xpji->xi', rhok_tmp, int3c_ip) - - int3c2e.get_int3c2e_ip_slice(intopt, cp_kl_id, 2, out=buf) - rhoj_tmp = contract('xpji,ji->xp', int3c_ip, dm_cart) - vjaux[:, k0:k1] = contract('xp,p->xp', rhoj_tmp, rhoj_cart[k0:k1]) - vkaux[:, k0:k1] = contract('xpji,pji->xp', int3c_ip, rhok_tmp) - ''' - vj_tmp, vk_tmp = int3c2e.get_int3c2e_ip_jk(intopt, cp_kl_id, 'ip1', rhoj_tmp, rhok_tmp, dm_cart, omega=omega) - if with_j: vj += vj_tmp - if with_k: vk += vk_tmp - - vj_tmp, vk_tmp = int3c2e.get_int3c2e_ip_jk(intopt, cp_kl_id, 'ip2', rhoj_tmp, rhok_tmp, dm_cart, omega=omega) - if with_j: vjaux[:, k0:k1] = vj_tmp - if with_k: vkaux[:, k0:k1] = vk_tmp - - rhoj_tmp = rhok_tmp = vj_tmp = vk_tmp = None - t1 = log.timer_debug1(f'calculate {cp_kl_id:3d} / {len(intopt.aux_log_qs):3d}, {k1-k0:3d} slices', *t1) - """ + with_df._cderi = None # release GPU memory vj, vk, vjaux, vkaux = get_grad_vjk(with_df, mol, auxmol, rhoj_cart, dm_cart, rhok_cart, orbo_cart, with_j=with_j, with_k=with_k, omega=omega) diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py index 9dfee665..aa0c5047 100644 --- a/gpu4pyscf/df/hessian/rhf.py +++ b/gpu4pyscf/df/hessian/rhf.py @@ -121,7 +121,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls int2c = cupy.asarray(int2c, order='C') int2c = intopt.sort_orbitals(int2c, aux_axis=[0,1]) solve_j2c = _gen_metric_solver(int2c) - + # int3c contributions wj, wk_P__ = int3c2e.get_int3c2e_jk(mol, auxmol, dm0_tag, omega=omega) rhoj0_P = rhok0_P__ = None @@ -172,10 +172,9 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls mem_avail = get_avail_mem() nocc = mocc.shape[1] slice_size = naux*nocc*9 # largest slice of intermediate variables - blksize = int(mem_avail*0.4/8/slice_size/ALIGNED) * ALIGNED + blksize = int(mem_avail*0.2/8/slice_size) log.debug(f'GPU Memory {mem_avail/GB:.1f} GB available, {blksize} aux AOs per block') - if blksize < ALIGNED: - raise RuntimeError('Not enough memory for intermediate variables') + assert blksize > 0 if hessobj.auxbasis_response: hk_ao_aux = cupy.zeros([nao,naux,3,3]) for i0, i1 in lib.prange(0,nao,blksize): diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py index a40c976f..0c533bcf 100644 --- a/gpu4pyscf/dft/numint.py +++ b/gpu4pyscf/dft/numint.py @@ -414,9 +414,11 @@ def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, ngrids_glob = grids.coords.shape[0] ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices + ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE grid_start = device_id * ngrids_per_device - grid_end = (device_id + 1) * ngrids_per_device + grid_end = min((device_id + 1) * ngrids_per_device, ngrids_glob) ngrids_local = grid_end - grid_start + log.debug(f"{ngrids_local} on Device {device_id}") weights = cupy.empty([ngrids_local]) if xctype == 'LDA': @@ -425,7 +427,7 @@ def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, rho_tot = cupy.empty([nset,4,ngrids_local]) else: rho_tot = cupy.empty([nset,5,ngrids_local]) - + p0 = p1 = 0 for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, max_memory=None, @@ -433,8 +435,10 @@ def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, p1 = p0 + weight.size weights[p0:p1] = weight for i in range(nset): - if mo_coeff is None: - rho_tot[i,:,p0:p1] = eval_rho(_sorted_mol, ao_mask, dms[i][idx[:,None],idx], + # If AO is sparse enough, use density matrix to calculate rho + if mo_coeff is None or len(idx) < mo_occ.sum(): + dms_mask = dms[i][idx[:,None],idx] + rho_tot[i,:,p0:p1] = eval_rho(_sorted_mol, ao_mask, dms_mask, xctype=xctype, hermi=hermi, with_lapl=with_lapl) else: assert hermi == 1 @@ -443,7 +447,7 @@ def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, None, xctype, with_lapl) p0 = p1 t0 = log.timer_debug1(f'eval rho on Device {device_id}', *t0) - + # libxc calls are still running on default stream nelec = cupy.zeros(nset) excsum = cupy.zeros(nset) @@ -814,8 +818,11 @@ def _nr_uks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, ngrids_glob = grids.coords.shape[0] ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices + ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE grid_start = device_id * ngrids_per_device - grid_end = (device_id + 1) * ngrids_per_device + grid_end = min((device_id + 1) * ngrids_per_device, ngrids_glob) + ngrids_local = grid_end - grid_start + log.debug(f"{ngrids_local} on Device {device_id}") for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, max_memory=None, @@ -1016,8 +1023,11 @@ def _nr_rks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff, ngrids_glob = grids.coords.shape[0] ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices + ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE grid_start = device_id * ngrids_per_device - grid_end = (device_id + 1) * ngrids_per_device + grid_end = min((device_id + 1) * ngrids_per_device, ngrids_glob) + ngrids_local = grid_end - grid_start + log.debug(f"{ngrids_local} on Device {device_id}") p0 = p1 = grid_start t1 = t0 = log.init_timer() @@ -1165,8 +1175,11 @@ def _nr_uks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff, ngrids_glob = grids.coords.shape[0] ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices + ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE grid_start = device_id * ngrids_per_device - grid_end = (device_id + 1) * ngrids_per_device + grid_end = min((device_id + 1) * ngrids_per_device, ngrids_glob) + ngrids_local = grid_end - grid_start + log.debug(f"{ngrids_local} on Device {device_id}") p0 = p1 = grid_start t1 = t0 = log.init_timer() From 1ed8e5eccf5aaae256f4ad591db81f6689dd8a13 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Sun, 5 Jan 2025 23:21:10 +0000 Subject: [PATCH 41/49] bugfix in df.hessian --- gpu4pyscf/df/hessian/rhf.py | 1 - gpu4pyscf/dft/numint.py | 2 +- gpu4pyscf/tests/test_benchmark_rks.py | 52 +++++++++++++-------------- gpu4pyscf/tests/test_benchmark_uks.py | 13 ++++--- 4 files changed, 33 insertions(+), 35 deletions(-) diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py index aa0c5047..2eab8ef5 100644 --- a/gpu4pyscf/df/hessian/rhf.py +++ b/gpu4pyscf/df/hessian/rhf.py @@ -185,7 +185,6 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls rhok1_Pko = solve_j2c(wk1_Pko_islice) wk1_Pko_islice = None if hessobj.auxbasis_response: - hk_ao_aux = cupy.zeros([nao,naux,3,3]) # (10|0)(1|00) wk_ip2_Ipo = contract('porx,io->pirx', wk_ip2_P__, mocc_2[i0:i1]) hk_ao_aux[i0:i1] += contract('piox,pioy->ipxy', rhok1_Pko, wk_ip2_Ipo) diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py index 0c533bcf..872ce750 100644 --- a/gpu4pyscf/dft/numint.py +++ b/gpu4pyscf/dft/numint.py @@ -436,7 +436,7 @@ def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, weights[p0:p1] = weight for i in range(nset): # If AO is sparse enough, use density matrix to calculate rho - if mo_coeff is None or len(idx) < mo_occ.sum(): + if mo_coeff is None: dms_mask = dms[i][idx[:,None],idx] rho_tot[i,:,p0:p1] = eval_rho(_sorted_mol, ao_mask, dms_mask, xctype=xctype, hermi=hermi, with_lapl=with_lapl) diff --git a/gpu4pyscf/tests/test_benchmark_rks.py b/gpu4pyscf/tests/test_benchmark_rks.py index 5027da8f..2afa6e70 100644 --- a/gpu4pyscf/tests/test_benchmark_rks.py +++ b/gpu4pyscf/tests/test_benchmark_rks.py @@ -94,17 +94,17 @@ def run_rb3lyp_hessian(atom, basis, with_df, with_solvent, disp=None): ####### # DF ####### -@pytest.mark.benchmark +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) def test_df_rb3lyp(benchmark): e = benchmark(run_rb3lyp, small_mol, 'def2-tzvpp', True, False) print('testing df rb3lyp') assert np.isclose(np.linalg.norm(e), 684.9998712035579, atol=1e-7, rtol=1e-16) -@pytest.mark.benchmark +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) def test_df_rb3lyp_grad(benchmark): g = benchmark(run_rb3lyp_grad, small_mol, 'def2-tzvpp', True, False) print('testing df rb3lyp grad') assert np.isclose(np.linalg.norm(g), 0.17435941081837686, atol=1e-5, rtol=1e-16) -@pytest.mark.benchmark +@pytest.mark.benchmark(warmup=False, min_rounds=1) def test_df_rb3lyp_hessian(benchmark): h = benchmark(run_rb3lyp_hessian, small_mol, 'def2-tzvpp', True, False) print('testing df rb3lyp hessian') @@ -113,17 +113,17 @@ def test_df_rb3lyp_hessian(benchmark): ################ # Direct SCF ################ -@pytest.mark.benchmark +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) def test_rb3lyp(benchmark): e = benchmark(run_rb3lyp, small_mol, 'def2-tzvpp', False, False) print('testing rb3lyp') assert np.isclose(np.linalg.norm(e), 684.999735850967, atol=1e-7, rtol=1e-16) -@pytest.mark.benchmark +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) def test_rb3lyp_grad(benchmark): g = benchmark(run_rb3lyp_grad, small_mol, 'def2-tzvpp', False, False) print('testing rb3lyp grad') assert np.isclose(np.linalg.norm(g), 0.1744127474130983, atol=1e-5, rtol=1e-16) -@pytest.mark.benchmark +@pytest.mark.benchmark(warmup=False, min_rounds=1) def test_rb3lyp_hessian(benchmark): h = benchmark(run_rb3lyp_hessian, small_mol, 'def2-tzvpp', False, False) print('testing rb3lyp hessian') @@ -132,34 +132,34 @@ def test_rb3lyp_hessian(benchmark): #################### # Medium molecule #################### -@pytest.mark.benchmark +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) def test_df_rb3lyp_medium(benchmark): e = benchmark(run_rb3lyp, medium_mol, 'def2-tzvpp', True, False) print('testing df rb3lyp medium') assert np.isclose(np.linalg.norm(e), 1138.371390377773, atol=1e-7, rtol=1e-16) -@pytest.mark.benchmark +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) def test_df_rb3lyp_grad_medium(benchmark): g = benchmark(run_rb3lyp_grad, medium_mol, 'def2-tzvpp', True, False) print('testing df rb3lyp grad medium') assert np.isclose(np.linalg.norm(g), 0.26010545073602614, atol=1e-5, rtol=1e-16) -@pytest.mark.benchmark +@pytest.mark.benchmark(warmup=False, min_rounds=1) def test_df_rb3lyp_hessian_medium(benchmark): h = benchmark(run_rb3lyp_hessian, medium_mol, 'def2-tzvpp', True, False) print('testing df rb3lyp hessian medium') assert np.isclose(np.linalg.norm(h), 6.31265424196621, atol=1e-4, rtol=1e-16) -@pytest.mark.benchmark +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) def test_rb3lyp_medium(benchmark): e = benchmark(run_rb3lyp, medium_mol, 'def2-tzvpp', False, False) print('testing rb3lyp medium') assert np.isclose(np.linalg.norm(e), 1138.3710752128077, atol=1e-7, rtol=1e-16) -@pytest.mark.benchmark +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) def test_rb3lyp_grad_medium(benchmark): g = benchmark(run_rb3lyp_grad, medium_mol, 'def2-tzvpp', False, False) print('testing rb3lyp grad medium') assert np.isclose(np.linalg.norm(g), 0.2601443836937988, atol=1e-5, rtol=1e-16) @pytest.mark.slow -@pytest.mark.benchmark +@pytest.mark.benchmark(warmup=False, min_rounds=1) def test_rb3lyp_hessian_medium(benchmark): h = benchmark(run_rb3lyp_hessian, medium_mol, 'def2-tzvpp', False, False) print('testing rb3lyp hessian medium') @@ -169,32 +169,32 @@ def test_rb3lyp_hessian_medium(benchmark): # large molecule #################### @pytest.mark.high_memory -@pytest.mark.benchmark +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) def test_df_rb3lyp_large(benchmark): e = benchmark(run_rb3lyp, large_mol, 'def2-tzvpp', True, False) print('testing df rb3lyp large') assert np.isclose(np.linalg.norm(e), 2564.198712152175, atol=1e-7, rtol=1e-16) @pytest.mark.high_memory -@pytest.mark.benchmark +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) def test_df_rb3lyp_grad_large(benchmark): g = benchmark(run_rb3lyp_grad, large_mol, 'def2-tzvpp', True, False) print('testing df rb3lyp grad large') assert np.isclose(np.linalg.norm(g), 0.3784358687859323, atol=1e-5, rtol=1e-16) @pytest.mark.high_memory @pytest.mark.slow -@pytest.mark.benchmark +@pytest.mark.benchmark(warmup=False, min_rounds=1) def test_df_rb3lyp_hessian_large(benchmark): h = benchmark(run_rb3lyp_hessian, large_mol, 'def2-tzvpp', True, False) print('testing df rb3lyp hessian large') assert np.isclose(np.linalg.norm(h), 7.583208736873523, atol=1e-4, rtol=1e-16) @pytest.mark.slow -@pytest.mark.benchmark +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) def test_rb3lyp_large(benchmark): e = benchmark(run_rb3lyp, large_mol, 'def2-tzvpp', False, False) print('testing rb3lyp large') assert np.isclose(np.linalg.norm(e), 2564.198099576358, atol=1e-7, rtol=1e-16) @pytest.mark.slow -@pytest.mark.benchmark +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) def test_rb3lyp_grad_large(benchmark): g = benchmark(run_rb3lyp_grad, large_mol, 'def2-tzvpp', False, False) print('testing rb3lyp grad large') @@ -213,17 +213,17 @@ def test_rb3lyp_hessian_large(benchmark): ##################### # Small basis set ##################### -@pytest.mark.benchmark +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) def test_df_rb3lyp_631gs(benchmark): e = benchmark(run_rb3lyp, small_mol, '6-31gs', True, False) print('testing df rb3lyp 631gs') assert np.isclose(np.linalg.norm(e), 684.6646008642876, atol=1e-7, rtol=1e-16) -@pytest.mark.benchmark +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) def test_df_rb3lyp_631gs_grad(benchmark): g = benchmark(run_rb3lyp_grad, small_mol, '6-31gs', True, False) print('testing df rb3lyp 631gs grad') assert np.isclose(np.linalg.norm(g), 0.17530687343398219, atol=1e-5, rtol=1e-16) -@pytest.mark.benchmark +@pytest.mark.benchmark(warmup=False, min_rounds=1) def test_df_rb3lyp_631gs_hessian(benchmark): h = benchmark(run_rb3lyp_hessian, small_mol, '6-31gs', True, False) print('testing df rb3lyp 631gs hessian') @@ -232,18 +232,18 @@ def test_df_rb3lyp_631gs_hessian(benchmark): ######################################### # Small basis set for large molecule ######################################### -@pytest.mark.benchmark +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) def test_rb3lyp_631gs_large(benchmark): e = benchmark(run_rb3lyp, large_mol, '6-31gs', False, False) print('testing rb3lyp 631gs large') assert np.isclose(np.linalg.norm(e), 2563.1171191823423, atol=1e-7, rtol=1e-16) -@pytest.mark.benchmark +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) def test_rb3lyp_631gs_grad_large(benchmark): g = benchmark(run_rb3lyp_grad, large_mol, '6-31gs', False, False) print('testing df rb3lyp 631gs grad large') assert np.isclose(np.linalg.norm(g), 0.37778228700247984, atol=1e-5, rtol=1e-16) @pytest.mark.slow -@pytest.mark.benchmark +@pytest.mark.benchmark(warmup=False, min_rounds=1) def test_rb3lyp_631gs_hessian_large(benchmark): h = benchmark(run_rb3lyp_hessian, large_mol, '6-31gs', False, False) print('testing df rb3lyp 631gs hessian large') @@ -252,17 +252,17 @@ def test_rb3lyp_631gs_hessian_large(benchmark): ################### # Solvent model ################### -@pytest.mark.benchmark +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) def test_df_rb3lyp_631gs_solvent(benchmark): e = benchmark(run_rb3lyp, small_mol, '6-31gs', True, True) print('testing df rb3lyp 631gs solvent') assert np.isclose(np.linalg.norm(e), 684.6985561053816, atol=1e-7, rtol=1e-16) -@pytest.mark.benchmark +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) def test_df_rb3lyp_631gs_solvent_grad(benchmark): g = benchmark(run_rb3lyp_grad, small_mol, '6-31gs', True, True) print('testing df rb3lyp 631gs solvent grad') assert np.isclose(np.linalg.norm(g), 0.16956999476137297, atol=1e-5, rtol=1e-16) -@pytest.mark.benchmark +@pytest.mark.benchmark(warmup=False, min_rounds=1) def test_df_rb3lyp_631gs_solvent_hessian(benchmark): h = benchmark(run_rb3lyp_hessian, small_mol, '6-31gs', True, True) print('testing df rb3lyp 631gs solvent hessian') diff --git a/gpu4pyscf/tests/test_benchmark_uks.py b/gpu4pyscf/tests/test_benchmark_uks.py index 2c9d8ce6..5962c08e 100644 --- a/gpu4pyscf/tests/test_benchmark_uks.py +++ b/gpu4pyscf/tests/test_benchmark_uks.py @@ -68,34 +68,33 @@ def run_ub3lyp_hessian(atom, basis, with_df, with_solvent): ########## # UKS ########## -@pytest.mark.benchmark +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=6) def test_df_ub3lyp(benchmark): e = benchmark(run_ub3lyp, small_mol, 'def2-tzvpp', True, False) print('testing df ub3lyp') assert np.isclose(np.linalg.norm(e), 684.9998712035856, atol=1e-7, rtol=1e-16) -@pytest.mark.benchmark +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=6) def test_df_ub3lyp_grad(benchmark): g = benchmark(run_ub3lyp_grad, small_mol, 'def2-tzvpp', True, False) print('testing df ub3lyp grad') assert np.isclose(np.linalg.norm(g), 0.17435842214665462, atol=1e-5, rtol=1e-16) -@pytest.mark.benchmark +@pytest.mark.benchmark(warmup=False, min_rounds=1) def test_df_ub3lyp_hessian(benchmark): h = benchmark(run_ub3lyp_hessian, small_mol, 'def2-tzvpp', True, False) print('testing df ub3lyp hessian') assert np.isclose(np.linalg.norm(h), 3.758810345806532, atol=1e-4, rtol=1e-16) -@pytest.mark.benchmark +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=6) def test_ub3lyp(benchmark): e = benchmark(run_ub3lyp, small_mol, '6-31gs', False, False) print('testing ub3lyp') assert np.isclose(np.linalg.norm(e), 684.6643858622429, atol=1e-7, rtol=1e-16) -@pytest.mark.benchmark +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=6) def test_ub3lyp_grad(benchmark): g = benchmark(run_ub3lyp_grad, small_mol, '6-31gs', False, False) print('testing ub3lyp grad') assert np.isclose(np.linalg.norm(g), 0.17540045665419984, atol=1e-5, rtol=1e-16) -@pytest.mark.benchmark +@pytest.mark.benchmark(warmup=False, min_rounds=1) def test_ub3lyp_hessian(benchmark): h = benchmark(run_ub3lyp_hessian, small_mol, '6-31gs', False, False) print('testing ub3lyp hessian') - print(np.linalg.norm(h), np.linalg.norm(h) - 3.758916526520172) assert np.isclose(np.linalg.norm(h), 3.907289414559395, atol=1e-4, rtol=1e-16) From 7a00f6ad12e770b999fae2b6cdf0bcca8f741ed0 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Mon, 6 Jan 2025 01:38:03 +0000 Subject: [PATCH 42/49] bugfix --- gpu4pyscf/dft/numint.py | 8 ++++---- gpu4pyscf/tests/test_benchmark_rks.py | 4 ++-- gpu4pyscf/tests/test_benchmark_uks.py | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py index 872ce750..17498c7d 100644 --- a/gpu4pyscf/dft/numint.py +++ b/gpu4pyscf/dft/numint.py @@ -415,7 +415,7 @@ def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, ngrids_glob = grids.coords.shape[0] ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE - grid_start = device_id * ngrids_per_device + grid_start = min(device_id * ngrids_per_device, ngrids_glob) grid_end = min((device_id + 1) * ngrids_per_device, ngrids_glob) ngrids_local = grid_end - grid_start log.debug(f"{ngrids_local} on Device {device_id}") @@ -819,7 +819,7 @@ def _nr_uks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, ngrids_glob = grids.coords.shape[0] ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE - grid_start = device_id * ngrids_per_device + grid_start = min(device_id * ngrids_per_device, ngrids_glob) grid_end = min((device_id + 1) * ngrids_per_device, ngrids_glob) ngrids_local = grid_end - grid_start log.debug(f"{ngrids_local} on Device {device_id}") @@ -1024,7 +1024,7 @@ def _nr_rks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff, ngrids_glob = grids.coords.shape[0] ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE - grid_start = device_id * ngrids_per_device + grid_start = min(device_id * ngrids_per_device, ngrids_glob) grid_end = min((device_id + 1) * ngrids_per_device, ngrids_glob) ngrids_local = grid_end - grid_start log.debug(f"{ngrids_local} on Device {device_id}") @@ -1176,7 +1176,7 @@ def _nr_uks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff, ngrids_glob = grids.coords.shape[0] ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE - grid_start = device_id * ngrids_per_device + grid_start = min(device_id * ngrids_per_device, ngrids_glob) grid_end = min((device_id + 1) * ngrids_per_device, ngrids_glob) ngrids_local = grid_end - grid_start log.debug(f"{ngrids_local} on Device {device_id}") diff --git a/gpu4pyscf/tests/test_benchmark_rks.py b/gpu4pyscf/tests/test_benchmark_rks.py index 2afa6e70..ec294234 100644 --- a/gpu4pyscf/tests/test_benchmark_rks.py +++ b/gpu4pyscf/tests/test_benchmark_rks.py @@ -148,12 +148,12 @@ def test_df_rb3lyp_hessian_medium(benchmark): print('testing df rb3lyp hessian medium') assert np.isclose(np.linalg.norm(h), 6.31265424196621, atol=1e-4, rtol=1e-16) -@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) +@pytest.mark.benchmark(warmup=False, min_rounds=1) def test_rb3lyp_medium(benchmark): e = benchmark(run_rb3lyp, medium_mol, 'def2-tzvpp', False, False) print('testing rb3lyp medium') assert np.isclose(np.linalg.norm(e), 1138.3710752128077, atol=1e-7, rtol=1e-16) -@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) +@pytest.mark.benchmark(warmup=False, min_rounds=1) def test_rb3lyp_grad_medium(benchmark): g = benchmark(run_rb3lyp_grad, medium_mol, 'def2-tzvpp', False, False) print('testing rb3lyp grad medium') diff --git a/gpu4pyscf/tests/test_benchmark_uks.py b/gpu4pyscf/tests/test_benchmark_uks.py index 5962c08e..236a433b 100644 --- a/gpu4pyscf/tests/test_benchmark_uks.py +++ b/gpu4pyscf/tests/test_benchmark_uks.py @@ -68,12 +68,12 @@ def run_ub3lyp_hessian(atom, basis, with_df, with_solvent): ########## # UKS ########## -@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=6) +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) def test_df_ub3lyp(benchmark): e = benchmark(run_ub3lyp, small_mol, 'def2-tzvpp', True, False) print('testing df ub3lyp') assert np.isclose(np.linalg.norm(e), 684.9998712035856, atol=1e-7, rtol=1e-16) -@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=6) +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) def test_df_ub3lyp_grad(benchmark): g = benchmark(run_ub3lyp_grad, small_mol, 'def2-tzvpp', True, False) print('testing df ub3lyp grad') @@ -83,12 +83,12 @@ def test_df_ub3lyp_hessian(benchmark): h = benchmark(run_ub3lyp_hessian, small_mol, 'def2-tzvpp', True, False) print('testing df ub3lyp hessian') assert np.isclose(np.linalg.norm(h), 3.758810345806532, atol=1e-4, rtol=1e-16) -@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=6) +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) def test_ub3lyp(benchmark): e = benchmark(run_ub3lyp, small_mol, '6-31gs', False, False) print('testing ub3lyp') assert np.isclose(np.linalg.norm(e), 684.6643858622429, atol=1e-7, rtol=1e-16) -@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=6) +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) def test_ub3lyp_grad(benchmark): g = benchmark(run_ub3lyp_grad, small_mol, '6-31gs', False, False) print('testing ub3lyp grad') From 0fa182d7d6d84cc5dc9886a70a859d058e491147 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Mon, 6 Jan 2025 13:33:36 +0800 Subject: [PATCH 43/49] add benchmark data --- .github/workflows/nightly_build.yml | 4 +- .../v1.3.0_rks_1v100.json} | 630 +++++++++--------- .../benchmark_results/v1.3.0_uks_1v100.json | 418 ++++++++++++ 3 files changed, 735 insertions(+), 317 deletions(-) rename gpu4pyscf/tests/{.benchmarks/Linux-CPython-3.9-64bit/v1.3.0_1v100.json => benchmark_results/v1.3.0_rks_1v100.json} (56%) create mode 100644 gpu4pyscf/tests/benchmark_results/v1.3.0_uks_1v100.json diff --git a/.github/workflows/nightly_build.yml b/.github/workflows/nightly_build.yml index 012ca12f..29ec300f 100644 --- a/.github/workflows/nightly_build.yml +++ b/.github/workflows/nightly_build.yml @@ -40,9 +40,9 @@ jobs: run: | echo $GITHUB_WORKSPACE export PYTHONPATH="${PYTHONPATH}:$(pwd)" - pytest gpu4pyscf/tests/test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=v1.3.0_1v100 --benchmark-storage=gpu4pyscf/tests/benchmark_results/ + pytest gpu4pyscf/tests/test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=v1.3.0_rks_1v100 --benchmark-storage=gpu4pyscf/tests/benchmark_results/ - name: Test UKS run: | echo $GITHUB_WORKSPACE export PYTHONPATH="${PYTHONPATH}:$(pwd)" - pytest gpu4pyscf/tests/test_benchmark_uks.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=v1.3.0_1v100 --benchmark-storage=gpu4pyscf/tests/benchmark_results/ + pytest gpu4pyscf/tests/test_benchmark_uks.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=v1.3.0_uks_1v100 --benchmark-storage=gpu4pyscf/tests/benchmark_results/ diff --git a/gpu4pyscf/tests/.benchmarks/Linux-CPython-3.9-64bit/v1.3.0_1v100.json b/gpu4pyscf/tests/benchmark_results/v1.3.0_rks_1v100.json similarity index 56% rename from gpu4pyscf/tests/.benchmarks/Linux-CPython-3.9-64bit/v1.3.0_1v100.json rename to gpu4pyscf/tests/benchmark_results/v1.3.0_rks_1v100.json index 81cb8ad0..1c5a9fc2 100644 --- a/gpu4pyscf/tests/.benchmarks/Linux-CPython-3.9-64bit/v1.3.0_1v100.json +++ b/gpu4pyscf/tests/benchmark_results/v1.3.0_rks_1v100.json @@ -1,6 +1,6 @@ { "machine_info": { - "node": "mlxlabzskrh20v669fd914-20240723162348-uim1c3-k3ligr-worker", + "node": "mlxlabzskrh20v669fd914-20240723162348-uim1c3-7vr5b9-worker", "processor": "", "machine": "x86_64", "python_compiler": "GCC 10.2.1 20210110", @@ -34,7 +34,7 @@ 0 ], "hz_actual": [ - 3100001000, + 3100005000, 0 ], "stepping": 7, @@ -194,10 +194,10 @@ } }, "commit_info": { - "id": "ba388eec82973e4722d1afa3e83e00a3101248a0", - "time": "2025-01-03T06:10:51+08:00", - "author_time": "2025-01-03T06:10:51+08:00", - "dirty": true, + "id": "1ed8e5eccf5aaae256f4ad591db81f6689dd8a13", + "time": "2025-01-05T23:21:10+00:00", + "author_time": "2025-01-05T23:21:10+00:00", + "dirty": false, "project": "gpu4pyscf", "branch": "benchmark_ci" }, @@ -212,28 +212,28 @@ "options": { "disable_gc": false, "timer": "perf_counter", - "min_rounds": 5, + "min_rounds": 3, "max_time": 1.0, "min_time": 5e-06, - "warmup": false + "warmup": 2 }, "stats": { - "min": 2.912467209622264, - "max": 3.132086180150509, - "mean": 2.9854499623179436, - "stddev": 0.08575128159932316, - "rounds": 5, - "median": 2.9598704893141985, - "iqr": 0.08442470477893949, - "q1": 2.934416546020657, - "q3": 3.0188412507995963, + "min": 2.725358221679926, + "max": 2.835785958915949, + "mean": 2.782431565846006, + "stddev": 0.055307723110869685, + "rounds": 3, + "median": 2.7861505169421434, + "iqr": 0.08282080292701721, + "q1": 2.7405562954954803, + "q3": 2.8233770984224975, "iqr_outliers": 0, "stddev_outliers": 1, "outliers": "1;0", - "ld15iqr": 2.912467209622264, - "hd15iqr": 3.132086180150509, - "ops": 0.33495788327451537, - "total": 14.927249811589718, + "ld15iqr": 2.725358221679926, + "hd15iqr": 2.835785958915949, + "ops": 0.35939787783997024, + "total": 8.347294697538018, "iterations": 1 } }, @@ -247,28 +247,28 @@ "options": { "disable_gc": false, "timer": "perf_counter", - "min_rounds": 5, + "min_rounds": 3, "max_time": 1.0, "min_time": 5e-06, - "warmup": false + "warmup": 2 }, "stats": { - "min": 4.693447925150394, - "max": 4.811241740360856, - "mean": 4.7545236147940155, - "stddev": 0.05376631322845494, - "rounds": 5, - "median": 4.767030920833349, - "iqr": 0.10001574829220772, - "q1": 4.700914891902357, - "q3": 4.800930640194565, + "min": 4.394210334867239, + "max": 4.473813105374575, + "mean": 4.42994485112528, + "stddev": 0.04041990275091787, + "rounds": 3, + "median": 4.4218111131340265, + "iqr": 0.05970207788050175, + "q1": 4.401110529433936, + "q3": 4.460812607314438, "iqr_outliers": 0, - "stddev_outliers": 2, - "outliers": "2;0", - "ld15iqr": 4.693447925150394, - "hd15iqr": 4.811241740360856, - "ops": 0.21032601392249553, - "total": 23.77261807397008, + "stddev_outliers": 1, + "outliers": "1;0", + "ld15iqr": 4.394210334867239, + "hd15iqr": 4.473813105374575, + "ops": 0.22573644449455918, + "total": 13.28983455337584, "iterations": 1 } }, @@ -282,28 +282,28 @@ "options": { "disable_gc": false, "timer": "perf_counter", - "min_rounds": 5, + "min_rounds": 1, "max_time": 1.0, "min_time": 5e-06, "warmup": false }, "stats": { - "min": 43.1729771643877, - "max": 44.22008949518204, - "mean": 43.53323510922492, - "stddev": 0.4568445249288835, - "rounds": 5, - "median": 43.26318317092955, - "iqr": 0.6843766365200281, - "q1": 43.210667157545686, - "q3": 43.895043794065714, + "min": 43.774112831801176, + "max": 43.774112831801176, + "mean": 43.774112831801176, + "stddev": 0, + "rounds": 1, + "median": 43.774112831801176, + "iqr": 0.0, + "q1": 43.774112831801176, + "q3": 43.774112831801176, "iqr_outliers": 0, - "stddev_outliers": 1, - "outliers": "1;0", - "ld15iqr": 43.1729771643877, - "hd15iqr": 44.22008949518204, - "ops": 0.02297095535149178, - "total": 217.66617554612458, + "stddev_outliers": 0, + "outliers": "0;0", + "ld15iqr": 43.774112831801176, + "hd15iqr": 43.774112831801176, + "ops": 0.022844552072189946, + "total": 43.774112831801176, "iterations": 1 } }, @@ -317,28 +317,28 @@ "options": { "disable_gc": false, "timer": "perf_counter", - "min_rounds": 5, + "min_rounds": 3, "max_time": 1.0, "min_time": 5e-06, - "warmup": false + "warmup": 2 }, "stats": { - "min": 40.87554130144417, - "max": 41.24961415119469, - "mean": 41.05381844490766, - "stddev": 0.13780925683914672, - "rounds": 5, - "median": 41.05546211451292, - "iqr": 0.17331884242594242, - "q1": 40.96216300688684, - "q3": 41.13548184931278, + "min": 40.097773076966405, + "max": 40.15744375810027, + "mean": 40.11991243995726, + "stddev": 0.03267769513443882, + "rounds": 3, + "median": 40.10452048480511, + "iqr": 0.04475301085039973, + "q1": 40.09945992892608, + "q3": 40.14421293977648, "iqr_outliers": 0, - "stddev_outliers": 2, - "outliers": "2;0", - "ld15iqr": 40.87554130144417, - "hd15iqr": 41.24961415119469, - "ops": 0.02435827014098467, - "total": 205.26909222453833, + "stddev_outliers": 1, + "outliers": "1;0", + "ld15iqr": 40.097773076966405, + "hd15iqr": 40.15744375810027, + "ops": 0.024925278725285903, + "total": 120.35973731987178, "iterations": 1 } }, @@ -352,28 +352,28 @@ "options": { "disable_gc": false, "timer": "perf_counter", - "min_rounds": 5, + "min_rounds": 3, "max_time": 1.0, "min_time": 5e-06, - "warmup": false + "warmup": 2 }, "stats": { - "min": 49.98093665204942, - "max": 50.76574368029833, - "mean": 50.31307061091066, - "stddev": 0.33630120438295324, - "rounds": 5, - "median": 50.36884331330657, - "iqr": 0.5613440982997417, - "q1": 49.981349020730704, - "q3": 50.542693119030446, + "min": 48.99313645064831, + "max": 49.26371451281011, + "mean": 49.142610578487314, + "stddev": 0.13750190122656403, + "rounds": 3, + "median": 49.17098077200353, + "iqr": 0.20293354662135243, + "q1": 49.037597530987114, + "q3": 49.240531077608466, "iqr_outliers": 0, "stddev_outliers": 1, "outliers": "1;0", - "ld15iqr": 49.98093665204942, - "hd15iqr": 50.76574368029833, - "ops": 0.01987555098223611, - "total": 251.56535305455327, + "ld15iqr": 48.99313645064831, + "hd15iqr": 49.26371451281011, + "ops": 0.02034893930599935, + "total": 147.42783173546195, "iterations": 1 } }, @@ -387,28 +387,28 @@ "options": { "disable_gc": false, "timer": "perf_counter", - "min_rounds": 5, + "min_rounds": 1, "max_time": 1.0, "min_time": 5e-06, "warmup": false }, "stats": { - "min": 611.3098333217204, - "max": 620.2315559927374, - "mean": 614.9859318509698, - "stddev": 3.295612103075669, - "rounds": 5, - "median": 614.4812579210848, - "iqr": 3.568844774272293, - "q1": 612.998380784411, - "q3": 616.5672255586833, + "min": 615.0911720395088, + "max": 615.0911720395088, + "mean": 615.0911720395088, + "stddev": 0, + "rounds": 1, + "median": 615.0911720395088, + "iqr": 0.0, + "q1": 615.0911720395088, + "q3": 615.0911720395088, "iqr_outliers": 0, - "stddev_outliers": 2, - "outliers": "2;0", - "ld15iqr": 611.3098333217204, - "hd15iqr": 620.2315559927374, - "ops": 0.001626053456198948, - "total": 3074.929659254849, + "stddev_outliers": 0, + "outliers": "0;0", + "ld15iqr": 615.0911720395088, + "hd15iqr": 615.0911720395088, + "ops": 0.0016257752435044988, + "total": 615.0911720395088, "iterations": 1 } }, @@ -422,28 +422,28 @@ "options": { "disable_gc": false, "timer": "perf_counter", - "min_rounds": 5, + "min_rounds": 3, "max_time": 1.0, "min_time": 5e-06, - "warmup": false + "warmup": 2 }, "stats": { - "min": 18.450319150462747, - "max": 19.34435743652284, - "mean": 18.962213665619494, - "stddev": 0.34090358565345374, - "rounds": 5, - "median": 19.017266055569053, - "iqr": 0.4629710176959634, - "q1": 18.742521196603775, - "q3": 19.20549221429974, + "min": 18.244548039510846, + "max": 18.375720830634236, + "mean": 18.312131161491077, + "stddev": 0.06567751542153955, + "rounds": 3, + "median": 18.316124614328146, + "iqr": 0.09837959334254265, + "q1": 18.26244218321517, + "q3": 18.360821776557714, "iqr_outliers": 0, - "stddev_outliers": 2, - "outliers": "2;0", - "ld15iqr": 18.450319150462747, - "hd15iqr": 19.34435743652284, - "ops": 0.05273645881404165, - "total": 94.81106832809746, + "stddev_outliers": 1, + "outliers": "1;0", + "ld15iqr": 18.244548039510846, + "hd15iqr": 18.375720830634236, + "ops": 0.05460860842362896, + "total": 54.93639348447323, "iterations": 1 } }, @@ -457,28 +457,28 @@ "options": { "disable_gc": false, "timer": "perf_counter", - "min_rounds": 5, + "min_rounds": 3, "max_time": 1.0, "min_time": 5e-06, - "warmup": false + "warmup": 2 }, "stats": { - "min": 28.927994549274445, - "max": 29.407788010314107, - "mean": 29.06979787014425, - "stddev": 0.19948441635503308, - "rounds": 5, - "median": 28.980533458292484, - "iqr": 0.2319285492412746, - "q1": 28.93826104514301, - "q3": 29.170189594384283, + "min": 30.697130125015974, + "max": 30.711910048499703, + "mean": 30.70534764789045, + "stddev": 0.00752768934207856, + "rounds": 3, + "median": 30.70700277015567, + "iqr": 0.011084942612797022, + "q1": 30.699598286300898, + "q3": 30.710683228913695, "iqr_outliers": 0, "stddev_outliers": 1, "outliers": "1;0", - "ld15iqr": 28.927994549274445, - "hd15iqr": 29.407788010314107, - "ops": 0.034399963992423795, - "total": 145.34898935072124, + "ld15iqr": 30.697130125015974, + "hd15iqr": 30.711910048499703, + "ops": 0.03256761693329022, + "total": 92.11604294367135, "iterations": 1 } }, @@ -492,28 +492,28 @@ "options": { "disable_gc": false, "timer": "perf_counter", - "min_rounds": 5, + "min_rounds": 1, "max_time": 1.0, "min_time": 5e-06, "warmup": false }, "stats": { - "min": 674.9359990525991, - "max": 678.2040371634066, - "mean": 676.7355838540941, - "stddev": 1.3332352353981456, - "rounds": 5, - "median": 676.6573997996747, - "iqr": 2.1692251418717206, - "q1": 675.7630731766112, - "q3": 677.9322983184829, + "min": 667.9882875829935, + "max": 667.9882875829935, + "mean": 667.9882875829935, + "stddev": 0, + "rounds": 1, + "median": 667.9882875829935, + "iqr": 0.0, + "q1": 667.9882875829935, + "q3": 667.9882875829935, "iqr_outliers": 0, - "stddev_outliers": 2, - "outliers": "2;0", - "ld15iqr": 674.9359990525991, - "hd15iqr": 678.2040371634066, - "ops": 0.0014776820132685715, - "total": 3383.6779192704707, + "stddev_outliers": 0, + "outliers": "0;0", + "ld15iqr": 667.9882875829935, + "hd15iqr": 667.9882875829935, + "ops": 0.0014970322363260838, + "total": 667.9882875829935, "iterations": 1 } }, @@ -527,28 +527,28 @@ "options": { "disable_gc": false, "timer": "perf_counter", - "min_rounds": 5, + "min_rounds": 3, "max_time": 1.0, "min_time": 5e-06, - "warmup": false + "warmup": 2 }, "stats": { - "min": 465.53933845460415, - "max": 469.9319954663515, - "mean": 467.35331859253347, - "stddev": 1.7196427730040629, - "rounds": 5, - "median": 467.1924539171159, - "iqr": 2.4731017132289708, - "q1": 465.9859178052284, - "q3": 468.45901951845735, + "min": 460.72668202780187, + "max": 461.77398146130145, + "mean": 461.4145879279822, + "stddev": 0.5959440470695604, + "rounds": 3, + "median": 461.7431002948433, + "iqr": 0.785474575124681, + "q1": 460.98078659456223, + "q3": 461.7662611696869, "iqr_outliers": 0, - "stddev_outliers": 2, - "outliers": "2;0", - "ld15iqr": 465.53933845460415, - "hd15iqr": 469.9319954663515, - "ops": 0.00213970878180895, - "total": 2336.7665929626673, + "stddev_outliers": 1, + "outliers": "1;0", + "ld15iqr": 460.72668202780187, + "hd15iqr": 461.77398146130145, + "ops": 0.0021672483405662944, + "total": 1384.2437637839466, "iterations": 1 } }, @@ -562,28 +562,28 @@ "options": { "disable_gc": false, "timer": "perf_counter", - "min_rounds": 5, + "min_rounds": 3, "max_time": 1.0, "min_time": 5e-06, - "warmup": false + "warmup": 2 }, "stats": { - "min": 559.7645257860422, - "max": 562.2628617957234, - "mean": 560.7982054453344, - "stddev": 0.9539619574856013, - "rounds": 5, - "median": 560.6089988369495, - "iqr": 1.2622483419254422, - "q1": 560.1302895797417, - "q3": 561.3925379216671, + "min": 552.0836905632168, + "max": 553.4436832498759, + "mean": 552.8364644367248, + "stddev": 0.6915813282891417, + "rounds": 3, + "median": 552.9820194970816, + "iqr": 1.0199945149943233, + "q1": 552.308272796683, + "q3": 553.3282673116773, "iqr_outliers": 0, - "stddev_outliers": 2, - "outliers": "2;0", - "ld15iqr": 559.7645257860422, - "hd15iqr": 562.2628617957234, - "ops": 0.0017831726105576463, - "total": 2803.9910272266716, + "stddev_outliers": 1, + "outliers": "1;0", + "ld15iqr": 552.0836905632168, + "hd15iqr": 553.4436832498759, + "ops": 0.0018088531859396832, + "total": 1658.5093933101743, "iterations": 1 } }, @@ -597,28 +597,28 @@ "options": { "disable_gc": false, "timer": "perf_counter", - "min_rounds": 5, + "min_rounds": 3, "max_time": 1.0, "min_time": 5e-06, - "warmup": false + "warmup": 2 }, "stats": { - "min": 2.3893800172954798, - "max": 4.065618982538581, - "mean": 3.419478364661336, - "stddev": 0.6287030173245606, - "rounds": 5, - "median": 3.485863795503974, - "iqr": 0.6505572367459536, - "q1": 3.1652946420945227, - "q3": 3.8158518788404763, + "min": 1.6017291732132435, + "max": 1.647629827260971, + "mean": 1.6208390643199284, + "stddev": 0.02389486042236203, + "rounds": 3, + "median": 1.613158192485571, + "iqr": 0.03442549053579569, + "q1": 1.6045864280313253, + "q3": 1.639011918567121, "iqr_outliers": 0, - "stddev_outliers": 2, - "outliers": "2;0", - "ld15iqr": 2.3893800172954798, - "hd15iqr": 4.065618982538581, - "ops": 0.29244226556147246, - "total": 17.09739182330668, + "stddev_outliers": 1, + "outliers": "1;0", + "ld15iqr": 1.6017291732132435, + "hd15iqr": 1.647629827260971, + "ops": 0.6169643994973554, + "total": 4.8625171929597855, "iterations": 1 } }, @@ -632,28 +632,28 @@ "options": { "disable_gc": false, "timer": "perf_counter", - "min_rounds": 5, + "min_rounds": 3, "max_time": 1.0, "min_time": 5e-06, - "warmup": false + "warmup": 2 }, "stats": { - "min": 3.086430249735713, - "max": 3.4464661311358213, - "mean": 3.21519818790257, - "stddev": 0.14631235080321897, - "rounds": 5, - "median": 3.1610366478562355, - "iqr": 0.20356103358790278, - "q1": 3.108103247359395, - "q3": 3.311664280947298, + "min": 2.1184212770313025, + "max": 2.20925628952682, + "mean": 2.15202548665305, + "stddev": 0.04981377124137081, + "rounds": 3, + "median": 2.1283988934010267, + "iqr": 0.0681262593716383, + "q1": 2.1209156811237335, + "q3": 2.189041940495372, "iqr_outliers": 0, "stddev_outliers": 1, "outliers": "1;0", - "ld15iqr": 3.086430249735713, - "hd15iqr": 3.4464661311358213, - "ops": 0.31102281774186635, - "total": 16.07599093951285, + "ld15iqr": 2.1184212770313025, + "hd15iqr": 2.20925628952682, + "ops": 0.46467851157063006, + "total": 6.456076459959149, "iterations": 1 } }, @@ -667,28 +667,28 @@ "options": { "disable_gc": false, "timer": "perf_counter", - "min_rounds": 5, + "min_rounds": 1, "max_time": 1.0, "min_time": 5e-06, "warmup": false }, "stats": { - "min": 15.165010405704379, - "max": 15.594494730234146, - "mean": 15.329469257220627, - "stddev": 0.16923297471060986, - "rounds": 5, - "median": 15.25765424221754, - "iqr": 0.22211500210687518, - "q1": 15.220200731419027, - "q3": 15.442315733525902, + "min": 16.1142161693424, + "max": 16.1142161693424, + "mean": 16.1142161693424, + "stddev": 0, + "rounds": 1, + "median": 16.1142161693424, + "iqr": 0.0, + "q1": 16.1142161693424, + "q3": 16.1142161693424, "iqr_outliers": 0, - "stddev_outliers": 1, - "outliers": "1;0", - "ld15iqr": 15.165010405704379, - "hd15iqr": 15.594494730234146, - "ops": 0.0652338305534597, - "total": 76.64734628610313, + "stddev_outliers": 0, + "outliers": "0;0", + "ld15iqr": 16.1142161693424, + "hd15iqr": 16.1142161693424, + "ops": 0.06205700541007504, + "total": 16.1142161693424, "iterations": 1 } }, @@ -702,28 +702,28 @@ "options": { "disable_gc": false, "timer": "perf_counter", - "min_rounds": 5, + "min_rounds": 3, "max_time": 1.0, "min_time": 5e-06, - "warmup": false + "warmup": 2 }, "stats": { - "min": 55.83294254913926, - "max": 59.17145283520222, - "mean": 57.62203652448952, - "stddev": 1.374501327462731, - "rounds": 5, - "median": 57.98267317190766, - "iqr": 2.264786566141993, - "q1": 56.414323914796114, - "q3": 58.67911048093811, + "min": 55.4929311927408, + "max": 56.77203128859401, + "mean": 56.066467080265284, + "stddev": 0.6496905970719544, + "rounds": 3, + "median": 55.934438759461045, + "iqr": 0.9593250718899071, + "q1": 55.60330808442086, + "q3": 56.56263315631077, "iqr_outliers": 0, - "stddev_outliers": 2, - "outliers": "2;0", - "ld15iqr": 55.83294254913926, - "hd15iqr": 59.17145283520222, - "ops": 0.017354471662503586, - "total": 288.1101826224476, + "stddev_outliers": 1, + "outliers": "1;0", + "ld15iqr": 55.4929311927408, + "hd15iqr": 56.77203128859401, + "ops": 0.01783597312397784, + "total": 168.19940124079585, "iterations": 1 } }, @@ -737,28 +737,28 @@ "options": { "disable_gc": false, "timer": "perf_counter", - "min_rounds": 5, + "min_rounds": 3, "max_time": 1.0, "min_time": 5e-06, - "warmup": false + "warmup": 2 }, "stats": { - "min": 72.57952445559204, - "max": 73.64580446854234, - "mean": 73.0582833636552, - "stddev": 0.48907908462094757, - "rounds": 5, - "median": 72.9562251791358, - "iqr": 0.9134543887339532, - "q1": 72.61263743927702, - "q3": 73.52609182801098, + "min": 70.14288471080363, + "max": 70.61111964285374, + "mean": 70.3403081515183, + "stddev": 0.24259089508559126, + "rounds": 3, + "median": 70.26692010089755, + "iqr": 0.3511761990375817, + "q1": 70.17389355832711, + "q3": 70.52506975736469, "iqr_outliers": 0, "stddev_outliers": 1, "outliers": "1;0", - "ld15iqr": 72.57952445559204, - "hd15iqr": 73.64580446854234, - "ops": 0.01368770184514733, - "total": 365.29141681827605, + "ld15iqr": 70.14288471080363, + "hd15iqr": 70.61111964285374, + "ops": 0.014216599646477592, + "total": 211.02092445455492, "iterations": 1 } }, @@ -772,28 +772,28 @@ "options": { "disable_gc": false, "timer": "perf_counter", - "min_rounds": 5, + "min_rounds": 3, "max_time": 1.0, "min_time": 5e-06, - "warmup": false + "warmup": 2 }, "stats": { - "min": 3.96594556607306, - "max": 4.621823711320758, - "mean": 4.35534807741642, - "stddev": 0.24887039802683908, - "rounds": 5, - "median": 4.413319645449519, - "iqr": 0.31258321227505803, - "q1": 4.208048852626234, - "q3": 4.520632064901292, + "min": 2.51676319912076, + "max": 2.569052016362548, + "mean": 2.540054644147555, + "stddev": 0.02660729798277223, + "rounds": 3, + "median": 2.5343487169593573, + "iqr": 0.03921661293134093, + "q1": 2.5211595785804093, + "q3": 2.56037619151175, "iqr_outliers": 0, - "stddev_outliers": 2, - "outliers": "2;0", - "ld15iqr": 3.96594556607306, - "hd15iqr": 4.621823711320758, - "ops": 0.22960277392873663, - "total": 21.7767403870821, + "stddev_outliers": 1, + "outliers": "1;0", + "ld15iqr": 2.51676319912076, + "hd15iqr": 2.569052016362548, + "ops": 0.393692317723976, + "total": 7.620163932442665, "iterations": 1 } }, @@ -807,28 +807,28 @@ "options": { "disable_gc": false, "timer": "perf_counter", - "min_rounds": 5, + "min_rounds": 3, "max_time": 1.0, "min_time": 5e-06, - "warmup": false + "warmup": 2 }, "stats": { - "min": 4.794365357607603, - "max": 5.184939783066511, - "mean": 4.969379325211048, - "stddev": 0.15483049255542325, - "rounds": 5, - "median": 4.9499497301876545, - "iqr": 0.24070796929299831, - "q1": 4.846174816135317, - "q3": 5.086882785428315, + "min": 3.7774324007332325, + "max": 3.8614633549004793, + "mean": 3.8227184594919286, + "stddev": 0.04239564161614309, + "rounds": 3, + "median": 3.8292596228420734, + "iqr": 0.06302321562543511, + "q1": 3.7903892062604427, + "q3": 3.853412421885878, "iqr_outliers": 0, - "stddev_outliers": 2, - "outliers": "2;0", - "ld15iqr": 4.794365357607603, - "hd15iqr": 5.184939783066511, - "ops": 0.20123237421758508, - "total": 24.84689662605524, + "stddev_outliers": 1, + "outliers": "1;0", + "ld15iqr": 3.7774324007332325, + "hd15iqr": 3.8614633549004793, + "ops": 0.26159394436097405, + "total": 11.468155378475785, "iterations": 1 } }, @@ -842,32 +842,32 @@ "options": { "disable_gc": false, "timer": "perf_counter", - "min_rounds": 5, + "min_rounds": 1, "max_time": 1.0, "min_time": 5e-06, "warmup": false }, "stats": { - "min": 152.7211031857878, - "max": 161.58804737962782, - "mean": 158.70457714907826, - "stddev": 3.529131682005075, - "rounds": 5, - "median": 159.99357100203633, - "iqr": 3.860361324157566, - "q1": 157.0660247253254, - "q3": 160.92638604948297, + "min": 122.75680537335575, + "max": 122.75680537335575, + "mean": 122.75680537335575, + "stddev": 0, + "rounds": 1, + "median": 122.75680537335575, + "iqr": 0.0, + "q1": 122.75680537335575, + "q3": 122.75680537335575, "iqr_outliers": 0, - "stddev_outliers": 1, - "outliers": "1;0", - "ld15iqr": 152.7211031857878, - "hd15iqr": 161.58804737962782, - "ops": 0.006301015496614541, - "total": 793.5228857453912, + "stddev_outliers": 0, + "outliers": "0;0", + "ld15iqr": 122.75680537335575, + "hd15iqr": 122.75680537335575, + "ops": 0.00814618787902287, + "total": 122.75680537335575, "iterations": 1 } } ], - "datetime": "2025-01-03T13:58:40.332127+00:00", + "datetime": "2025-01-06T03:31:22.391433+00:00", "version": "5.1.0" } \ No newline at end of file diff --git a/gpu4pyscf/tests/benchmark_results/v1.3.0_uks_1v100.json b/gpu4pyscf/tests/benchmark_results/v1.3.0_uks_1v100.json new file mode 100644 index 00000000..7bfabd8a --- /dev/null +++ b/gpu4pyscf/tests/benchmark_results/v1.3.0_uks_1v100.json @@ -0,0 +1,418 @@ +{ + "machine_info": { + "node": "mlxlabzskrh20v669fd914-20240723162348-uim1c3-7vr5b9-worker", + "processor": "", + "machine": "x86_64", + "python_compiler": "GCC 10.2.1 20210110", + "python_implementation": "CPython", + "python_implementation_version": "3.9.2", + "python_version": "3.9.2", + "python_build": [ + "default", + "Feb 28 2021 17:03:44" + ], + "release": "5.4.143.bsk.7-amd64", + "system": "Linux", + "cpu": { + "python_version": "3.9.2.final.0 (64 bit)", + "cpuinfo_version": [ + 9, + 0, + 0 + ], + "cpuinfo_version_string": "9.0.0", + "arch": "X86_64", + "bits": 64, + "count": 96, + "arch_string_raw": "x86_64", + "vendor_id_raw": "GenuineIntel", + "brand_raw": "Intel(R) Xeon(R) Platinum 8260 CPU @ 2.40GHz", + "hz_advertised_friendly": "2.4000 GHz", + "hz_actual_friendly": "3.1000 GHz", + "hz_advertised": [ + 2400000000, + 0 + ], + "hz_actual": [ + 3100012000, + 0 + ], + "stepping": 7, + "model": 85, + "family": 6, + "flags": [ + "3dnowprefetch", + "abm", + "acpi", + "adx", + "aes", + "aperfmperf", + "apic", + "arat", + "arch_capabilities", + "arch_perfmon", + "art", + "avx", + "avx2", + "avx512_vnni", + "avx512bw", + "avx512cd", + "avx512dq", + "avx512f", + "avx512vl", + "avx512vnni", + "bmi1", + "bmi2", + "bts", + "cat_l3", + "cdp_l3", + "clflush", + "clflushopt", + "clwb", + "cmov", + "constant_tsc", + "cpuid", + "cpuid_fault", + "cqm", + "cqm_llc", + "cqm_mbm_local", + "cqm_mbm_total", + "cqm_occup_llc", + "cx16", + "cx8", + "dca", + "de", + "ds_cpl", + "dtes64", + "dtherm", + "dts", + "epb", + "ept", + "ept_ad", + "erms", + "est", + "f16c", + "flexpriority", + "flush_l1d", + "fma", + "fpu", + "fsgsbase", + "fxsr", + "ht", + "hwp", + "hwp_act_window", + "hwp_epp", + "hwp_pkg_req", + "ibpb", + "ibrs", + "ibrs_enhanced", + "ida", + "intel_ppin", + "intel_pt", + "invpcid", + "invpcid_single", + "lahf_lm", + "lm", + "mba", + "mca", + "mce", + "md_clear", + "mmx", + "movbe", + "mpx", + "msr", + "mtrr", + "nonstop_tsc", + "nopl", + "nx", + "ospke", + "osxsave", + "pae", + "pat", + "pbe", + "pcid", + "pclmulqdq", + "pdcm", + "pdpe1gb", + "pebs", + "pge", + "pku", + "pln", + "pni", + "popcnt", + "pqe", + "pqm", + "pse", + "pse36", + "pts", + "rdrand", + "rdrnd", + "rdseed", + "rdt_a", + "rdtscp", + "rep_good", + "sdbg", + "sep", + "smap", + "smep", + "smx", + "ss", + "ssbd", + "sse", + "sse2", + "sse4_1", + "sse4_2", + "ssse3", + "stibp", + "syscall", + "tm", + "tm2", + "tpr_shadow", + "tsc", + "tsc_adjust", + "tsc_deadline_timer", + "tscdeadline", + "vme", + "vmx", + "vnmi", + "vpid", + "x2apic", + "xgetbv1", + "xsave", + "xsavec", + "xsaveopt", + "xsaves", + "xtopology", + "xtpr" + ], + "l3_cache_size": 37486592, + "l2_cache_size": 50331648, + "l1_data_cache_size": "1.5 MiB", + "l1_instruction_cache_size": "1.5 MiB", + "l2_cache_line_size": 256, + "l2_cache_associativity": 6 + } + }, + "commit_info": { + "id": "1ed8e5eccf5aaae256f4ad591db81f6689dd8a13", + "time": "2025-01-05T23:21:10+00:00", + "author_time": "2025-01-05T23:21:10+00:00", + "dirty": false, + "project": "gpu4pyscf", + "branch": "benchmark_ci" + }, + "benchmarks": [ + { + "group": null, + "name": "test_df_ub3lyp", + "fullname": "gpu4pyscf/tests/test_benchmark_uks.py::test_df_ub3lyp", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 6, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": 2 + }, + "stats": { + "min": 6.552961312234402, + "max": 6.817228589206934, + "mean": 6.699132799791793, + "stddev": 0.10053109169956066, + "rounds": 6, + "median": 6.730765865184367, + "iqr": 0.15081804990768433, + "q1": 6.606128558516502, + "q3": 6.756946608424187, + "iqr_outliers": 0, + "stddev_outliers": 2, + "outliers": "2;0", + "ld15iqr": 6.552961312234402, + "hd15iqr": 6.817228589206934, + "ops": 0.14927305218237794, + "total": 40.19479679875076, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_df_ub3lyp_grad", + "fullname": "gpu4pyscf/tests/test_benchmark_uks.py::test_df_ub3lyp_grad", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 6, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": 2 + }, + "stats": { + "min": 13.294025084003806, + "max": 14.571726197376847, + "mean": 13.735415458368758, + "stddev": 0.5932420341119666, + "rounds": 6, + "median": 13.415598810650408, + "iqr": 1.1223390139639378, + "q1": 13.296602416783571, + "q3": 14.418941430747509, + "iqr_outliers": 0, + "stddev_outliers": 2, + "outliers": "2;0", + "ld15iqr": 13.294025084003806, + "hd15iqr": 14.571726197376847, + "ops": 0.07280449601476865, + "total": 82.41249275021255, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_df_ub3lyp_hessian", + "fullname": "gpu4pyscf/tests/test_benchmark_uks.py::test_df_ub3lyp_hessian", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 1, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": false + }, + "stats": { + "min": 93.588756557554, + "max": 93.588756557554, + "mean": 93.588756557554, + "stddev": 0, + "rounds": 1, + "median": 93.588756557554, + "iqr": 0.0, + "q1": 93.588756557554, + "q3": 93.588756557554, + "iqr_outliers": 0, + "stddev_outliers": 0, + "outliers": "0;0", + "ld15iqr": 93.588756557554, + "hd15iqr": 93.588756557554, + "ops": 0.01068504419529319, + "total": 93.588756557554, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_ub3lyp", + "fullname": "gpu4pyscf/tests/test_benchmark_uks.py::test_ub3lyp", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 6, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": 2 + }, + "stats": { + "min": 6.713842295110226, + "max": 7.0260709673166275, + "mean": 6.852823034239312, + "stddev": 0.11983568503202911, + "rounds": 6, + "median": 6.869919722899795, + "iqr": 0.19665820337831974, + "q1": 6.720263646915555, + "q3": 6.916921850293875, + "iqr_outliers": 0, + "stddev_outliers": 3, + "outliers": "3;0", + "ld15iqr": 6.713842295110226, + "hd15iqr": 7.0260709673166275, + "ops": 0.14592526247994722, + "total": 41.11693820543587, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_ub3lyp_grad", + "fullname": "gpu4pyscf/tests/test_benchmark_uks.py::test_ub3lyp_grad", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 6, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": 2 + }, + "stats": { + "min": 7.483015248551965, + "max": 7.855705849826336, + "mean": 7.595327176774542, + "stddev": 0.14647552264068445, + "rounds": 6, + "median": 7.529973562806845, + "iqr": 0.19051661528646946, + "q1": 7.491389110684395, + "q3": 7.681905725970864, + "iqr_outliers": 0, + "stddev_outliers": 1, + "outliers": "1;0", + "ld15iqr": 7.483015248551965, + "hd15iqr": 7.855705849826336, + "ops": 0.13165989781952533, + "total": 45.57196306064725, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_ub3lyp_hessian", + "fullname": "gpu4pyscf/tests/test_benchmark_uks.py::test_ub3lyp_hessian", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 1, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": false + }, + "stats": { + "min": 61.551909405738115, + "max": 61.551909405738115, + "mean": 61.551909405738115, + "stddev": 0, + "rounds": 1, + "median": 61.551909405738115, + "iqr": 0.0, + "q1": 61.551909405738115, + "q3": 61.551909405738115, + "iqr_outliers": 0, + "stddev_outliers": 0, + "outliers": "0;0", + "ld15iqr": 61.551909405738115, + "hd15iqr": 61.551909405738115, + "ops": 0.016246449698387032, + "total": 61.551909405738115, + "iterations": 1 + } + } + ], + "datetime": "2025-01-06T03:46:22.404689+00:00", + "version": "5.1.0" +} \ No newline at end of file From 7edf50d09643fc1d0e18b67a8e8e50c3cff38b6a Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Mon, 6 Jan 2025 17:24:25 +0000 Subject: [PATCH 44/49] remove comments --- examples/dft_driver.py | 4 ++-- gpu4pyscf/df/df.py | 2 +- gpu4pyscf/df/hessian/jk.py | 6 +----- gpu4pyscf/hessian/rks.py | 3 --- gpu4pyscf/lib/cupy_helper.py | 2 +- 5 files changed, 5 insertions(+), 12 deletions(-) diff --git a/examples/dft_driver.py b/examples/dft_driver.py index 0c8cea48..0be7f410 100644 --- a/examples/dft_driver.py +++ b/examples/dft_driver.py @@ -34,10 +34,10 @@ basis=bas, max_memory=32000) # set verbose >= 6 for debugging timer -mol.verbose = 4 +mol.verbose = 6 mf_df = dft.RKS(mol, xc=args.xc).density_fit(auxbasis=args.auxbasis) -mf_df.verbose = 4 +mf_df.verbose = 6 if args.solvent: mf_df = mf_df.PCM() diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py index ab1adeba..da61804c 100644 --- a/gpu4pyscf/df/df.py +++ b/gpu4pyscf/df/df.py @@ -138,7 +138,7 @@ def get_blksize(self, extra=0, nao=None): ''' if nao is None: nao = self.nao mem_avail = get_avail_mem() - blksize = int(mem_avail*0.4/8/(nao*nao + extra) / ALIGNED) * ALIGNED + blksize = int(mem_avail*0.2/8/(nao*nao + extra) / ALIGNED) * ALIGNED blksize = min(blksize, MIN_BLK_SIZE) log = logger.new_logger(self.mol, self.mol.verbose) device_id = cupy.cuda.Device().id diff --git a/gpu4pyscf/df/hessian/jk.py b/gpu4pyscf/df/hessian/jk.py index 7859e97b..40ab3bfd 100644 --- a/gpu4pyscf/df/hessian/jk.py +++ b/gpu4pyscf/df/hessian/jk.py @@ -22,7 +22,7 @@ from gpu4pyscf.scf.int4c2e import libgint from gpu4pyscf.hessian.jk import _ao2mo from gpu4pyscf.lib import logger -from gpu4pyscf.lib.cupy_helper import contract, cart2sph, reduce_to_device, get_avail_mem, release_gpu_stack +from gpu4pyscf.lib.cupy_helper import contract, cart2sph, reduce_to_device from gpu4pyscf.__config__ import _streams, _num_devices NROOT_ON_GPU = 7 @@ -314,8 +314,6 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo, # (20|0), (0|0)(0|00) int3c_blk = _get_int3c2e_ipip_slice('ipip1', intopt, cp_ij_id, aux_id, omega=omega) if with_j: - #tmp = contract('xpji,ij->xpi', int3c_blk, dm0[i0:i1,j0:j1]) - #hj_ipip1[:,i0:i1] += contract('xpi,p->xi', tmp, rhoj[k0:k1]) tmp = contract('xpji,p->xji', int3c_blk, rhoj[k0:k1]) hj_ipip1[:,i0:i1] += contract('xji,ij->xi', tmp, dm0[i0:i1,j0:j1]) if with_k: @@ -325,8 +323,6 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo, # (11|0), (0|0)(0|00) without response of RI basis int3c_blk = _get_int3c2e_ipip_slice('ipvip1', intopt, cp_ij_id, aux_id, omega=omega) if with_j: - #tmp = contract('xpji,ij->xpij', int3c_blk, dm0[i0:i1,j0:j1]) - #hj_ipvip1[:,i0:i1,j0:j1] += contract('xpij,p->xij', tmp, rhoj[k0:k1]) tmp = contract('xpji,p->xji', int3c_blk, rhoj[k0:k1]) hj_ipvip1[:,i0:i1,j0:j1] += contract('xji,ij->xij', tmp, dm0[i0:i1,j0:j1]) if with_k: diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py index a2ee9da7..d506b934 100644 --- a/gpu4pyscf/hessian/rks.py +++ b/gpu4pyscf/hessian/rks.py @@ -811,9 +811,6 @@ def nr_rks_fxc_mo(ni, mol, grids, xc_code, dm0=None, dms=None, mo_coeff=None, re for future in futures: vmat_dist.append(future.result()) vmat = reduce_to_device(vmat_dist, inplace=True) - #vmat = opt.unsort_orbitals(vmat, axis=[1,2]) - #if xctype != 'LDA': - # transpose_sum(vmat) if len(dm_shape) == 2: vmat = vmat[0] diff --git a/gpu4pyscf/lib/cupy_helper.py b/gpu4pyscf/lib/cupy_helper.py index 95ae1f24..d68cbcff 100644 --- a/gpu4pyscf/lib/cupy_helper.py +++ b/gpu4pyscf/lib/cupy_helper.py @@ -149,7 +149,7 @@ def reduce_to_device(array_list, inplace=False): matrix = matrix.reshape(-1) blksize = 1024*1024*1024 // matrix.itemsize # 1GB for p0, p1 in lib.prange(0,len(matrix), blksize): - result[p0:p1] += copy_array(matrix[p0:p1])#cupy.asarray(matrix[p0:p1]) + result[p0:p1] += copy_array(matrix[p0:p1]) #result[p0:p1] += cupy.asarray(matrix[p0:p1]) return result.reshape(out_shape) From ee0e63643bab148574e9c74dd13e5b004ec168ac Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Tue, 7 Jan 2025 06:11:14 +0000 Subject: [PATCH 45/49] resolve comments --- gpu4pyscf/grad/rhf.py | 7 ++++++- gpu4pyscf/hessian/jk.py | 8 ++++++-- gpu4pyscf/hessian/rhf.py | 19 ++++++++++++++++--- gpu4pyscf/lib/memcpy.py | 6 ------ gpu4pyscf/scf/j_engine.py | 6 +++++- gpu4pyscf/scf/jk.py | 2 +- gpu4pyscf/tests/test_benchmark_rks.py | 2 +- 7 files changed, 35 insertions(+), 15 deletions(-) diff --git a/gpu4pyscf/grad/rhf.py b/gpu4pyscf/grad/rhf.py index 0ee8cd43..dd374cc3 100644 --- a/gpu4pyscf/grad/rhf.py +++ b/gpu4pyscf/grad/rhf.py @@ -30,6 +30,7 @@ from gpu4pyscf.__config__ import _streams, _num_devices from gpu4pyscf.df import int3c2e #TODO: move int3c2e to out of df from gpu4pyscf.lib import logger +from gpu4pyscf.scf import jk from gpu4pyscf.scf.jk import ( LMAX, QUEUE_DEPTH, SHM_SIZE, THREADS, libvhf_rys, _VHFOpt, init_constant, _make_tril_tile_mappings, _nearest_power2) @@ -124,7 +125,11 @@ def _jk_energy_per_atom(mol, dm, vhfopt=None, log = logger.new_logger(mol, verbose) cput0 = log.init_timer() if vhfopt is None: - vhfopt = _VHFOpt(mol).build() + # Small group size for load balance + group_size = None + if _num_devices > 1: + group_size = jk.GROUP_SIZE + vhfopt = _VHFOpt(mol).build(group_size=group_size) mol = vhfopt.sorted_mol nao, nao_orig = vhfopt.coeff.shape diff --git a/gpu4pyscf/hessian/jk.py b/gpu4pyscf/hessian/jk.py index 6f17488d..65edff6b 100644 --- a/gpu4pyscf/hessian/jk.py +++ b/gpu4pyscf/hessian/jk.py @@ -26,7 +26,7 @@ from pyscf import lib from pyscf.scf import _vhf from pyscf import __config__ - +from gpu4pyscf.scf import jk from gpu4pyscf.scf.jk import (_make_tril_tile_mappings, quartets_scheme, QUEUE_DEPTH, _VHFOpt, LMAX, init_constant, libvhf_rys) from gpu4pyscf.lib.cupy_helper import (condense, sandwich_dot, transpose_sum, @@ -172,7 +172,11 @@ def get_jk(mol, dm, mo_coeff, mo_occ, hermi=0, vhfopt=None, cput0 = log.init_timer() assert hermi == 1 if vhfopt is None: - vhfopt = _VHFOpt(mol).build() + # Small group size for load balance + group_size = None + if _num_devices > 1: + group_size = jk.GROUP_SIZE + vhfopt = _VHFOpt(mol).build(group_size=group_size) mol = vhfopt.sorted_mol nao, nao_orig = vhfopt.coeff.shape diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py index 52150457..2e10d049 100644 --- a/gpu4pyscf/hessian/rhf.py +++ b/gpu4pyscf/hessian/rhf.py @@ -269,7 +269,11 @@ def _partial_ejk_ip2(mol, dm, vhfopt=None, j_factor=1., k_factor=1., verbose=Non log = logger.new_logger(mol, verbose) cput0 = log.init_timer() if vhfopt is None: - vhfopt = _VHFOpt(mol).build() + # Small group size for load balance + group_size = None + if _num_devices > 1: + group_size = jk.GROUP_SIZE + vhfopt = _VHFOpt(mol).build(group_size=group_size) mol = vhfopt.sorted_mol nao, nao_orig = vhfopt.coeff.shape @@ -488,7 +492,11 @@ def _get_jk_ip1(mol, dm, with_j=True, with_k=True, atoms_slice=None, verbose=Non vhfopt = _VHFOpt(mol) # tile must set to 1. This tile size is assumed in the GPU kernel code vhfopt.tile = 1 - vhfopt.build() + # Small group size for load balance + group_size = None + if _num_devices > 1: + group_size = jk.GROUP_SIZE + vhfopt.build(group_size=group_size) mol = vhfopt.sorted_mol nao, nao_orig = vhfopt.coeff.shape @@ -898,7 +906,12 @@ def _get_jk_mo(hessobj, mol, dms, mo_coeff, mo_occ, vhfopt = mf._opt_gpu.get(omega) if vhfopt is None: with mol.with_range_coulomb(omega): - vhfopt = mf._opt_gpu[omega] = _VHFOpt(mol, mf.direct_scf_tol).build() + # Small group size for load balance + group_size = None + if _num_devices > 1: + group_size = jk.GROUP_SIZE + vhfopt = _VHFOpt(mol, mf.direct_scf_tol).build(group_size=group_size) + mf._opt_gpu[omega] = vhfopt with mol.with_range_coulomb(omega): vj, vk = jk.get_jk(mol, dms, mo_coeff, mo_occ, hermi, vhfopt, with_j, with_k) return vj, vk diff --git a/gpu4pyscf/lib/memcpy.py b/gpu4pyscf/lib/memcpy.py index 19b19e41..c961a9a2 100644 --- a/gpu4pyscf/lib/memcpy.py +++ b/gpu4pyscf/lib/memcpy.py @@ -76,12 +76,6 @@ def _copy_array(src_view, dst_view): kind = cupy.cuda.runtime.memcpyHostToDevice else: raise NotImplementedError - - - if len(chunk_shape) == 0: - print('here') - print(src_view.nbytes, dst_view.nbytes) - print(shape, strides_src, strides_dst) assert len(chunk_shape) > 0 diff --git a/gpu4pyscf/scf/j_engine.py b/gpu4pyscf/scf/j_engine.py index 2ecb5293..3d98ae5f 100644 --- a/gpu4pyscf/scf/j_engine.py +++ b/gpu4pyscf/scf/j_engine.py @@ -26,6 +26,7 @@ from pyscf import __config__ from gpu4pyscf.lib.cupy_helper import load_library, condense, sandwich_dot, transpose_sum from gpu4pyscf.__config__ import props as gpu_specs +from gpu4pyscf.__config__ import _num_devices from gpu4pyscf.lib import logger from gpu4pyscf.scf import jk from gpu4pyscf.scf.jk import _make_j_engine_pair_locs, RysIntEnvVars, _scale_sp_ctr_coeff @@ -51,7 +52,10 @@ def get_j(mol, dm, hermi=1, vhfopt=None, omega=None, verbose=None): cput0 = log.init_timer() if vhfopt is None: with mol.with_range_coulomb(omega): - vhfopt = _VHFOpt(mol).build() + groupsize = None + if _num_devices > 1: + groupsize = jk.GROUP_SIZE + vhfopt = _VHFOpt(mol).build(group_size=groupsize) if omega is None: omega = mol.omega diff --git a/gpu4pyscf/scf/jk.py b/gpu4pyscf/scf/jk.py index 8577457d..0e328204 100644 --- a/gpu4pyscf/scf/jk.py +++ b/gpu4pyscf/scf/jk.py @@ -462,7 +462,7 @@ def __init__(self, mol, cutoff=1e-13): self._tile_q_cond = {} self._s_estimator = {} - def build(self, group_size=GROUP_SIZE, verbose=None): + def build(self, group_size=None, verbose=None): mol = self.mol log = logger.new_logger(mol, verbose) cput0 = log.init_timer() diff --git a/gpu4pyscf/tests/test_benchmark_rks.py b/gpu4pyscf/tests/test_benchmark_rks.py index ec294234..c367ac90 100644 --- a/gpu4pyscf/tests/test_benchmark_rks.py +++ b/gpu4pyscf/tests/test_benchmark_rks.py @@ -17,7 +17,7 @@ import pyscf import pytest from gpu4pyscf.dft import rks - +CUDA_VISIBLE_DEVICES=0 # Any task taking more than 1000s will be marked as 'slow' # How to run From 9dbc08354022eb0da72ee00f2b44d649e1035d3a Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Tue, 7 Jan 2025 07:29:02 +0000 Subject: [PATCH 46/49] group_size in hessian --- gpu4pyscf/hessian/rhf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py index 2e10d049..8463e68c 100644 --- a/gpu4pyscf/hessian/rhf.py +++ b/gpu4pyscf/hessian/rhf.py @@ -35,7 +35,7 @@ from gpu4pyscf.__config__ import _streams, _num_devices from gpu4pyscf.lib import logger from gpu4pyscf.scf.jk import ( - LMAX, QUEUE_DEPTH, SHM_SIZE, THREADS, libvhf_rys, _VHFOpt, init_constant, + LMAX, QUEUE_DEPTH, SHM_SIZE, THREADS, GROUP_SIZE, libvhf_rys, _VHFOpt, init_constant, _make_tril_tile_mappings, _nearest_power2) from gpu4pyscf.grad import rhf as rhf_grad from gpu4pyscf.hessian import jk @@ -272,7 +272,7 @@ def _partial_ejk_ip2(mol, dm, vhfopt=None, j_factor=1., k_factor=1., verbose=Non # Small group size for load balance group_size = None if _num_devices > 1: - group_size = jk.GROUP_SIZE + group_size = GROUP_SIZE vhfopt = _VHFOpt(mol).build(group_size=group_size) mol = vhfopt.sorted_mol From 3abc22674763e5546eae5a194d13714a90043262 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Tue, 7 Jan 2025 07:57:55 +0000 Subject: [PATCH 47/49] resolve possible memory leak --- gpu4pyscf/gto/int3c1e.py | 14 ++++++++------ gpu4pyscf/gto/int3c1e_ip.py | 12 ++++++++---- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/gpu4pyscf/gto/int3c1e.py b/gpu4pyscf/gto/int3c1e.py index cab38b98..8e6ce88c 100644 --- a/gpu4pyscf/gto/int3c1e.py +++ b/gpu4pyscf/gto/int3c1e.py @@ -258,12 +258,13 @@ def get_int3c1e(mol, grids, charge_exponents, intopt): charge_exponents_pointer = c_null_ptr() if charge_exponents is not None: - charge_exponents_pointer = charge_exponents[p0:p1].data.ptr - + exponents_slice = charge_exponents[p0:p1] + charge_exponents_pointer = exponents_slice.data.ptr + grids_slice = grids[p0:p1] err = libgint.GINTfill_int3c1e( ctypes.cast(stream.ptr, ctypes.c_void_p), intopt.bpcache, - ctypes.cast(grids[p0:p1, :].data.ptr, ctypes.c_void_p), + ctypes.cast(grids_slice.data.ptr, ctypes.c_void_p), ctypes.cast(charge_exponents_pointer, ctypes.c_void_p), ctypes.c_int(p1-p0), ctypes.cast(int3c_angular_slice.data.ptr, ctypes.c_void_p), @@ -441,16 +442,17 @@ def get_int3c1e_density_contracted(mol, grids, charge_exponents, dm, intopt): charge_exponents_pointer = c_null_ptr() if charge_exponents is not None: - charge_exponents_pointer = charge_exponents[p0:p1].data.ptr + exponents_slice = charge_exponents[p0:p1] + charge_exponents_pointer = exponents_slice.data.ptr # n_pair_sum_per_thread = 1 # means every thread processes one pair and one grid # n_pair_sum_per_thread = nao_cart # or larger number gaurantees one thread processes one grid and all pairs of the same type n_pair_sum_per_thread = nao_cart - + grids_slice = grids[p0:p1, :] err = libgint.GINTfill_int3c1e_density_contracted( ctypes.cast(stream.ptr, ctypes.c_void_p), intopt.bpcache, - ctypes.cast(grids[p0:p1, :].data.ptr, ctypes.c_void_p), + ctypes.cast(grids_slice.data.ptr, ctypes.c_void_p), ctypes.cast(charge_exponents_pointer, ctypes.c_void_p), ctypes.c_int(p1-p0), ctypes.cast(dm_pair_ordered.data.ptr, ctypes.c_void_p), diff --git a/gpu4pyscf/gto/int3c1e_ip.py b/gpu4pyscf/gto/int3c1e_ip.py index 717db68f..56e51662 100644 --- a/gpu4pyscf/gto/int3c1e_ip.py +++ b/gpu4pyscf/gto/int3c1e_ip.py @@ -78,12 +78,14 @@ def get_int3c1e_ip(mol, grids, charge_exponents, intopt): charge_exponents_pointer = c_null_ptr() if charge_exponents is not None: - charge_exponents_pointer = charge_exponents[p0:p1].data.ptr + exponents_slice = charge_exponents[p0:p1] + charge_exponents_pointer = exponents_slice.data.ptr + grids_slice = grids[p0:p1, :] err = libgint.GINTfill_int3c1e_ip( ctypes.cast(stream.ptr, ctypes.c_void_p), intopt.bpcache, - ctypes.cast(grids[p0:p1, :].data.ptr, ctypes.c_void_p), + ctypes.cast(grids_slice.data.ptr, ctypes.c_void_p), ctypes.cast(charge_exponents_pointer, ctypes.c_void_p), ctypes.c_int(p1-p0), ctypes.cast(int3c_angular_slice.data.ptr, ctypes.c_void_p), @@ -260,7 +262,9 @@ def get_int3c1e_ip2_density_contracted(mol, grids, charge_exponents, dm, intopt) charge_exponents_pointer = c_null_ptr() if charge_exponents is not None: - charge_exponents_pointer = charge_exponents[p0:p1].data.ptr + exponents_slice = charge_exponents[p0:p1] + charge_exponents_pointer = exponents_slice.data.ptr + grids_slice = grids[p0:p1].data.ptr # n_pair_sum_per_thread = 1 # means every thread processes one pair and one grid # n_pair_sum_per_thread = nao_cart # or larger number gaurantees one thread processes one grid and all pairs of the same type @@ -269,7 +273,7 @@ def get_int3c1e_ip2_density_contracted(mol, grids, charge_exponents, dm, intopt) err = libgint.GINTfill_int3c1e_ip2_density_contracted( ctypes.cast(stream.ptr, ctypes.c_void_p), intopt.bpcache, - ctypes.cast(grids[p0:p1, :].data.ptr, ctypes.c_void_p), + ctypes.cast(grids_slice.data.ptr, ctypes.c_void_p), ctypes.cast(charge_exponents_pointer, ctypes.c_void_p), ctypes.c_int(p1-p0), ctypes.cast(dm_pair_ordered.data.ptr, ctypes.c_void_p), From 2f5ce8b6291e0fc8251c05d87dd9547d5ae08219 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Tue, 7 Jan 2025 17:32:35 +0000 Subject: [PATCH 48/49] bugfix --- gpu4pyscf/gto/int3c1e_ip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu4pyscf/gto/int3c1e_ip.py b/gpu4pyscf/gto/int3c1e_ip.py index 56e51662..c2aeb1be 100644 --- a/gpu4pyscf/gto/int3c1e_ip.py +++ b/gpu4pyscf/gto/int3c1e_ip.py @@ -264,7 +264,7 @@ def get_int3c1e_ip2_density_contracted(mol, grids, charge_exponents, dm, intopt) if charge_exponents is not None: exponents_slice = charge_exponents[p0:p1] charge_exponents_pointer = exponents_slice.data.ptr - grids_slice = grids[p0:p1].data.ptr + grids_slice = grids[p0:p1] # n_pair_sum_per_thread = 1 # means every thread processes one pair and one grid # n_pair_sum_per_thread = nao_cart # or larger number gaurantees one thread processes one grid and all pairs of the same type From e04cb3fd02a4737b812abfc11099555706deaaa6 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Tue, 7 Jan 2025 18:27:25 +0000 Subject: [PATCH 49/49] bugfix --- gpu4pyscf/hessian/rhf.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py index 8463e68c..775a6e98 100644 --- a/gpu4pyscf/hessian/rhf.py +++ b/gpu4pyscf/hessian/rhf.py @@ -35,8 +35,8 @@ from gpu4pyscf.__config__ import _streams, _num_devices from gpu4pyscf.lib import logger from gpu4pyscf.scf.jk import ( - LMAX, QUEUE_DEPTH, SHM_SIZE, THREADS, GROUP_SIZE, libvhf_rys, _VHFOpt, init_constant, - _make_tril_tile_mappings, _nearest_power2) + LMAX, QUEUE_DEPTH, SHM_SIZE, THREADS, GROUP_SIZE, libvhf_rys, _VHFOpt, + init_constant, _make_tril_tile_mappings, _nearest_power2) from gpu4pyscf.grad import rhf as rhf_grad from gpu4pyscf.hessian import jk @@ -495,7 +495,7 @@ def _get_jk_ip1(mol, dm, with_j=True, with_k=True, atoms_slice=None, verbose=Non # Small group size for load balance group_size = None if _num_devices > 1: - group_size = jk.GROUP_SIZE + group_size = GROUP_SIZE vhfopt.build(group_size=group_size) mol = vhfopt.sorted_mol @@ -909,7 +909,7 @@ def _get_jk_mo(hessobj, mol, dms, mo_coeff, mo_occ, # Small group size for load balance group_size = None if _num_devices > 1: - group_size = jk.GROUP_SIZE + group_size = GROUP_SIZE vhfopt = _VHFOpt(mol, mf.direct_scf_tol).build(group_size=group_size) mf._opt_gpu[omega] = vhfopt with mol.with_range_coulomb(omega):