DFT with small batch grids #193

Draft: wants to merge 24 commits into master
5 changes: 3 additions & 2 deletions examples/00-h2o.py
@@ -36,15 +36,16 @@
    atom=atom,                        # water molecule
    basis='def2-tzvpp',               # basis set
    output='./pyscf.log',             # save log file
    verbose=6                         # control the level of print info
    verbose=6,                        # control the level of print info
    cart=True
    )

mf_GPU = rks.RKS(                     # restricted Kohn-Sham DFT
    mol,                              # pyscf.gto.object
    xc='b3lyp'                        # xc functionals, such as pbe0, wb97m-v, tpss,
    ).density_fit()                   # density fitting

mf_GPU.grids.atom_grid = (99,590)     # (99,590) lebedev grids, (75,302) is often enough
mf_GPU.grids.atom_grid = (75,302)     # (99,590) lebedev grids, (75,302) is often enough
mf_GPU.conv_tol = 1e-10               # controls SCF convergence tolerance
mf_GPU.max_cycle = 50                 # controls max iterations of SCF
mf_GPU.conv_tol_cpscf = 1e-3          # controls convergence tolerance of CPSCF (for hessian)
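For orientation, the updated example runs end to end roughly as follows. This is a minimal sketch assembled from the file above; the water geometry string and the rks import path are assumed rather than shown in the diff.

import pyscf
from gpu4pyscf.dft import rks

atom = '''
O  0.0000  0.0000  0.1173
H  0.0000  0.7572 -0.4692
H  0.0000 -0.7572 -0.4692
'''

mol = pyscf.M(atom=atom, basis='def2-tzvpp', verbose=6, cart=True)

mf_GPU = rks.RKS(mol, xc='b3lyp').density_fit()
mf_GPU.grids.atom_grid = (75, 302)   # the smaller grid the example now defaults to
mf_GPU.conv_tol = 1e-10
mf_GPU.max_cycle = 50
e_tot = mf_GPU.kernel()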
39 changes: 8 additions & 31 deletions gpu4pyscf/__config__.py
@@ -2,37 +2,14 @@

props = cupy.cuda.runtime.getDeviceProperties(0)
GB = 1024*1024*1024
# such as A100-80G
if props['totalGlobalMem'] >= 64 * GB:
    min_ao_blksize = 128
    min_grid_blksize = 128*128
    ao_aligned = 32
    grid_aligned = 256
    mem_fraction = 0.9
    number_of_threads = 2048 * 108
# such as V100-32G
elif props['totalGlobalMem'] >= 32 * GB:
    min_ao_blksize = 128
    min_grid_blksize = 128*128
    ao_aligned = 32
    grid_aligned = 256
    mem_fraction = 0.9
    number_of_threads = 1024 * 80
# such as A30-24GB
elif props['totalGlobalMem'] >= 16 * GB:
    min_ao_blksize = 128
    min_grid_blksize = 128*128
    ao_aligned = 32
    grid_aligned = 256
    mem_fraction = 0.9
    number_of_threads = 1024 * 80
# other gaming cards
else:

min_ao_blksize = 128
min_grid_blksize = 16384
ao_aligned = 32
grid_aligned = 256
mem_fraction = 0.9

if props['totalGlobalMem'] < 16 * GB:
    min_ao_blksize = 64
    min_grid_blksize = 64*64
    ao_aligned = 32
    grid_aligned = 128
    mem_fraction = 0.9
    number_of_threads = 1024 * 80

cupy.get_default_memory_pool().set_limit(fraction=mem_fraction)
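The simplified branch above derives block sizes from the reported device memory instead of enumerating card models. A small sketch of checking which branch a given card falls into; the variable names mirror the config and the values are the ones shown above.

import cupy

GB = 1024**3
props = cupy.cuda.runtime.getDeviceProperties(0)
total_mem = props['totalGlobalMem']

# Defaults from the updated __config__.py; cards below 16 GB fall back
# to smaller AO/grid blocks.
min_ao_blksize, min_grid_blksize = 128, 16384
if total_mem < 16 * GB:
    min_ao_blksize, min_grid_blksize = 64, 64*64

print(f'{total_mem/GB:.0f} GB device -> min_ao_blksize={min_ao_blksize}, '
      f'min_grid_blksize={min_grid_blksize}')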
5 changes: 2 additions & 3 deletions gpu4pyscf/df/df.py
@@ -103,7 +103,7 @@ def build(self, direct_scf_tol=1e-14, omega=None):

        v = w = None
        naux = self.naux = self.cd_low.shape[1]
        log.debug('size of aux basis %d', naux)
        log.debug('Size of aux basis %d', naux)

        self._cderi = cholesky_eri_gpu(intopt, mol, auxmol, self.cd_low, omega=omega)
        log.timer_debug1('cholesky_eri', *t0)
@@ -213,7 +213,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):
    if naux * npair * 8 < 0.4 * avail_mem:
        try:
            cderi = cupy.empty([naux, npair], order='C')
            log.debug(f"Saving CDERI on GPU. CDERI size {cderi.nbytes/GB}")
            log.debug(f"Saving CDERI on GPU. CDERI size {cderi.nbytes/GB:.3f} GB")
        except Exception:
            use_gpu_memory = False
    else:
@@ -276,7 +276,6 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):
        ints_slices_f = cupy.empty([naoaux, len(row)], order='F')
        ints_slices_f[:] = ints_slices[:,col,row]
        ints_slices = None

        if cd_low.tag == 'eig':
            cderi_block = cupy.dot(cd_low.T, ints_slices_f)
            ints_slices = None
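The GPU-vs-host decision in cholesky_eri_gpu keys off a simple byte count: naux * npair * 8 must stay under 40% of the available memory. A rough back-of-the-envelope check along the same lines; nao and naux are assumed example sizes, and free memory is taken from memGetInfo here purely for illustration, while the code derives avail_mem its own way.

import cupy

GB = 1024**3
nao, naux = 600, 2400                 # assumed orbital / auxiliary basis sizes
npair = nao * (nao + 1) // 2          # compressed AO pair count
cderi_bytes = naux * npair * 8        # float64 CDERI tensor

free_mem, _total = cupy.cuda.runtime.memGetInfo()
keep_on_gpu = cderi_bytes < 0.4 * free_mem
print(f'CDERI ~ {cderi_bytes/GB:.2f} GB, keep on GPU: {keep_on_gpu}')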
138 changes: 121 additions & 17 deletions gpu4pyscf/dft/gen_grid.py
@@ -31,10 +31,11 @@
import cupy
from pyscf import lib
from pyscf import gto
from pyscf.gto.eval_gto import BLKSIZE, NBINS, CUTOFF, make_screen_index
from pyscf.gto.eval_gto import BLKSIZE, NBINS, CUTOFF
from pyscf import __config__
from cupyx.scipy.spatial.distance import cdist
from gpu4pyscf.lib import logger
from gpu4pyscf.gto.eval_gto import make_screen_index
from gpu4pyscf.dft import radi
from gpu4pyscf.lib.cupy_helper import load_library
from gpu4pyscf import __config__ as __gpu4pyscf_config__
@@ -44,8 +45,12 @@

from pyscf.dft.gen_grid import GROUP_BOUNDARY_PENALTY, NELEC_ERROR_TOL, LEBEDEV_ORDER, LEBEDEV_NGRID

GROUP_BOX_SIZE = 3.0
AO_ALIGNMENT = getattr(__config__, 'ao_aligned', 16)
GROUP_BOX_SIZE = 1.2
ALIGNMENT_UNIT = getattr(__gpu4pyscf_config__, 'grid_aligned', 128)
MIN_BLK_SIZE = getattr(__gpu4pyscf_config__, 'min_grid_blksize', 64*64)
GRID_BLKSIZE = MIN_BLK_SIZE

# SG0
# S. Chien and P. Gill, J. Comput. Chem. 27 (2006) 730-739.

@@ -350,7 +355,7 @@ def gen_grid_partition(coords):
gen_partition = get_partition

def make_mask(mol, coords, relativity=0, shls_slice=None, cutoff=CUTOFF,
              verbose=None):
              verbose=None, blksize=ALIGNMENT_UNIT):
    '''Mask to indicate whether a shell is ignorable on grids. See also the
    function gto.eval_gto.make_screen_index

@@ -374,7 +379,39 @@ def make_mask(mol, coords, relativity=0, shls_slice=None, cutoff=CUTOFF,
        2D mask array of shape (N,nbas), where N is the number of grids, nbas
        is the number of shells.
    '''
    return make_screen_index(mol, coords, shls_slice, cutoff)

    log = logger.new_logger(mol)
    t0 = log.init_timer()
    s_index = make_screen_index(mol, coords, blksize=blksize, cutoff=1e-10)
    t0 = log.timer_debug1('s_index', *t0)
    return s_index

def gen_grid_sparsity(mol, s_index):
    ''' Generate nonzero AO indices, and corresponding ao_loc based on s_index
    '''
    if isinstance(s_index, cupy.ndarray):
        s_index = s_index.get()

    log = logger.new_logger(mol, mol.verbose)
    t0 = log.init_timer()
    ao_loc = mol.ao_loc_nr()
    nblocks = s_index.shape[0]
    nbas = mol.nbas
    nao = mol.nao
    ao_indices = -numpy.ones([nblocks, nao], dtype=numpy.int32)
    ao_loc_non0 = numpy.zeros([nblocks, nbas+1], dtype=numpy.int32)

    # Running on CPU
    libgdft.GDFTmake_sparse_cache(
        s_index.ctypes.data_as(ctypes.c_void_p),
        ao_loc.ctypes.data_as(ctypes.c_void_p),
        ctypes.c_int(nblocks),
        ao_loc_non0.ctypes.data_as(ctypes.c_void_p),
        ctypes.c_int(nbas),
        ao_indices.ctypes.data_as(ctypes.c_void_p),
        ctypes.c_int(nao))
    t0 = log.timer_debug1('sparse ao kernel', *t0)
    return ao_indices, ao_loc_non0
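To make the new data structures concrete: s_index marks, per grid block, which shells survive screening, and GDFTmake_sparse_cache packs that into per-block AO index lists. The pure-numpy sketch below mimics that packing for a toy case; the exact layout (packing with -1 padding in ao_indices, cumulative shell counts in ao_loc_non0) is an assumption inferred from the array shapes and dtypes above, not taken from the C kernel.

import numpy

# Toy shell structure: 3 shells carrying 1, 3 and 5 AOs (like mol.ao_loc_nr()).
ao_loc = numpy.array([0, 1, 4, 9], dtype=numpy.int32)
nbas, nao = len(ao_loc) - 1, int(ao_loc[-1])

# Screen index for 2 grid blocks: a nonzero entry means the shell contributes.
s_index = numpy.array([[1, 0, 1],
                       [0, 1, 0]], dtype=numpy.uint8)
nblocks = s_index.shape[0]

ao_indices = -numpy.ones((nblocks, nao), dtype=numpy.int32)
ao_loc_non0 = numpy.zeros((nblocks, nbas + 1), dtype=numpy.int32)
for b in range(nblocks):
    packed = []
    for sh in range(nbas):
        if s_index[b, sh]:
            packed.extend(range(ao_loc[sh], ao_loc[sh + 1]))
        ao_loc_non0[b, sh + 1] = len(packed)
    ao_indices[b, :len(packed)] = packed

print(ao_indices)          # block 0 keeps the AOs of shells 0 and 2, padded with -1
print(ao_loc_non0[:, -1])  # number of surviving AOs per block: [6 3]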

def argsort_group(group_ids, ngroup):
    '''Sort the grids based on the group_ids.
@@ -419,7 +456,8 @@ def atomic_group_grids(mol, coords):
        ctypes.c_int(ngrids)
    )
    if err != 0:
        raise RuntimeError('CUDA Error')
        raise RuntimeError('CUDA Error in GDFTgroup_grids kernel')

    idx = group_ids.argsort()
    return idx
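The renamed error message belongs to the kernel that assigns each grid point to a group before sorting. A numpy illustration of the reordering that group_ids.argsort() produces, assuming nearest-atom grouping, which is what the GDFTgroup_grids kernel appears to compute:

import numpy

numpy.random.seed(0)
atom_coords = numpy.random.rand(3, 3)        # 3 atoms, assumed positions
grid_coords = numpy.random.rand(1000, 3)     # 1000 grid points

# Distance of every grid point to every atom, then the nearest-atom label.
dist = numpy.linalg.norm(grid_coords[:, None, :] - atom_coords[None, :, :], axis=2)
group_ids = dist.argmin(axis=1)

idx = group_ids.argsort()                    # grids of the same atom become contiguous
grid_coords = grid_coords[idx]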

@@ -464,6 +502,35 @@ def _load_conf(mod, name, default):
    else:
        return var

def gen_sparse_cache(mol, coords, blksize):
    '''
    determine sparse AO indices
    '''
    log = logger.new_logger(mol, 6)
    ao_loc = mol.ao_loc_nr()
    ngrids = coords.shape[0]
    t0 = log.init_timer()
    s_index = make_screen_index(mol, coords, blksize=blksize)
    t0 = log.timer_debug1('s_index', *t0)
    s_index_cpu = s_index.get()
    nblocks = (ngrids + blksize - 1)//blksize
    nbas = mol.nbas
    nao = mol.nao
    ao_indices = numpy.zeros([nblocks, nao], dtype=numpy.int32)
    ao_loc_non0 = numpy.zeros([nblocks, nbas+1], dtype=numpy.int32)

    # Running on CPU
    libgdft.GDFTmake_sparse_cache(
        s_index_cpu.ctypes.data_as(ctypes.c_void_p),
        ao_loc.ctypes.data_as(ctypes.c_void_p),
        ctypes.c_int(nblocks),
        ao_loc_non0.ctypes.data_as(ctypes.c_void_p),
        ctypes.c_int(nbas),
        ao_indices.ctypes.data_as(ctypes.c_void_p),
        ctypes.c_int(nao))
    t0 = log.timer_debug1('sparse ao kernel', *t0)
    return ao_indices, s_index, ao_loc_non0

from pyscf.dft import gen_grid
from gpu4pyscf.lib import utils
class Grids(lib.StreamObject):
@@ -483,8 +550,9 @@ class Grids(lib.StreamObject):
    alignment = ALIGNMENT_UNIT
    cutoff = CUTOFF
    _keys = gen_grid.Grids._keys

    __init__ = gen_grid.Grids.__init__
    __init__ = gen_grid.Grids.__init__

    _keys.update({'sparse_cache'})

    def __setattr__(self, key, val):
        if key in ('atom_grid', 'atomic_radii', 'radii_adjust', 'radi_method',
@@ -502,27 +570,30 @@ def build(self, mol=None, with_non0tab=False, sort_grids=True, **kwargs):
        self.check_sanity()
        atom_grids_tab = self.gen_atomic_grids(
            mol, self.atom_grid, self.radi_method, self.level, self.prune, **kwargs)
        self.coords, self.weights = self.get_partition(
            mol, atom_grids_tab, self.radii_adjust, self.atomic_radii, self.becke_scheme)
        coords, weights = self.get_partition(
            mol, atom_grids_tab, self.radii_adjust, self.atomic_radii, self.becke_scheme,
            concat=False)
        if self.alignment > 1:
            padding = _padding_size(self.size, self.alignment)
            grid_size = sum([w.size for w in weights])
            padding = _padding_size(grid_size, self.alignment)
            logger.debug(self, 'Padding %d grids', padding)
            if padding > 0:
                # cupy.vstack and cupy.hstack convert numpy array into cupy array first
                self.coords = cupy.vstack(
                    [self.coords, numpy.repeat([[1e4]*3], padding, axis=0)])
                self.weights = cupy.hstack([self.weights, numpy.zeros(padding)])
                coords.append(cupy.asarray(numpy.repeat([[1e4]*3], padding, axis=0)))
                weights.append(cupy.zeros(padding))
        self.coords = cupy.vstack(coords)
        self.weights = cupy.hstack(weights)

        if sort_grids:
            #idx = arg_group_grids(mol, self.coords)
            idx = atomic_group_grids(mol, self.coords)
            idx = arg_group_grids(mol, self.coords)
            #idx = atomic_group_grids(mol, self.coords)
            self.coords = self.coords[idx]
            self.weights = self.weights[idx]

        if with_non0tab:
            self.non0tab = self.make_mask(mol, self.coords)
            self.screen_index = self.non0tab
        else:
            self.screen_index = self.non0tab = None

        logger.info(self, 'tot grids = %d', len(self.weights))
        return self
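The rebuilt build() pads the collected grids up to a multiple of self.alignment before concatenating, appending placeholder points at (1e4, 1e4, 1e4) with zero weight so downstream kernels see an aligned grid count. A sketch of the arithmetic; the real _padding_size helper is not shown in this diff, so the one-liner below is an assumed stand-in.

import numpy

def _padding_size(ngrids, alignment):
    # assumed behaviour: pad up to the next multiple of `alignment`
    return (-ngrids) % alignment

weights = [numpy.ones(1203), numpy.ones(997)]     # per-atom weights, assumed sizes
grid_size = sum(w.size for w in weights)          # 2200
padding = _padding_size(grid_size, 128)           # 104
print(grid_size, padding, grid_size + padding)    # 2200 104 2304 (= 18 * 128)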

@@ -538,6 +609,7 @@ def reset(self, mol=None):
        self.weights = None
        self.non0tab = None
        self.screen_index = None
        self.sparse_cache = {}
        return self

    gen_atomic_grids = lib.module_method(
@@ -556,6 +628,38 @@ def get_partition(self, mol, atom_grids_tab=None,

    make_mask = lib.module_method(make_mask, absences=['cutoff'])

    def cache_sparsity(self, sorted_mol, blksize=ALIGNMENT_UNIT, sort_grids=False):
        ''' Build sparsity data for sparse AO evaluation
        Sort grids based on the number of nonzero AOs
        '''
        log = logger.new_logger(sorted_mol, sorted_mol.verbose)
        t1 = log.init_timer()
        ngrids = self.coords.shape[0]
        s_index = make_mask(sorted_mol, self.coords, blksize=blksize)
        ao_indices, ao_loc_non0 = gen_grid_sparsity(sorted_mol, s_index)
        nao_non0 = ao_loc_non0[:,-1]
        if sort_grids:
            # Sort grids based on the number of nonzero AOs
            idx = numpy.argsort(nao_non0)
            nao_non0 = nao_non0[idx]
            ao_indices = ao_indices[idx]
            s_index = s_index[idx]
            ao_loc_non0 = cupy.asarray(ao_loc_non0[idx])

            ao_indices = cupy.asarray(ao_indices)
            idx = cupy.asarray(idx)
            idx_grids = cupy.tile(idx*blksize, (blksize,1)).T
            idx_grids += cupy.arange(blksize)
            idx_grids = idx_grids.ravel(order='C')[:ngrids]

            self.coords = self.coords[idx_grids]
            self.weights = self.weights[idx_grids]

        sparse_data = (ao_indices, s_index, ao_loc_non0, nao_non0)
        self.sparse_cache[blksize, ngrids] = sparse_data
        t1 = log.timer_debug1('generate sparsity', *t1)
        return
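Taken together, the intended call pattern looks like build-then-cache. A minimal usage sketch; passing the plain mol as sorted_mol and pre-creating sparse_cache (which reset() initialises to an empty dict) are assumptions about how the draft is meant to be wired up.

import pyscf
from gpu4pyscf.dft import rks

mol = pyscf.M(atom='O 0 0 0; H 0 0.76 -0.47; H 0 -0.76 -0.47', basis='def2-svp')
mf = rks.RKS(mol, xc='b3lyp').density_fit()

grids = mf.grids
grids.atom_grid = (75, 302)
grids.build(sort_grids=True)

if not hasattr(grids, 'sparse_cache'):     # reset() sets this dict in the draft
    grids.sparse_cache = {}
grids.cache_sparsity(mol, blksize=128, sort_grids=True)

ngrids = grids.weights.size
ao_indices, s_index, ao_loc_non0, nao_non0 = grids.sparse_cache[128, ngrids]
print(nao_non0[:10])                       # nonzero AOs per 128-point grid block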

    def prune_by_density_(self, rho, threshold=0):
        '''Prune grids if the electron density on the grid is small'''
        if threshold == 0: