DFT with small batch grids #193

Draft: wants to merge 24 commits into master
5 changes: 3 additions & 2 deletions examples/00-h2o.py
@@ -36,15 +36,16 @@
    atom=atom,                        # water molecule
    basis='def2-tzvpp',               # basis set
    output='./pyscf.log',             # save log file
    verbose=6                         # control the level of print info
    verbose=6,                        # control the level of print info
    cart=True
    )

mf_GPU = rks.RKS(                     # restricted Kohn-Sham DFT
    mol,                              # pyscf.gto.object
    xc='b3lyp'                        # xc functionals, such as pbe0, wb97m-v, tpss,
    ).density_fit()                   # density fitting

mf_GPU.grids.atom_grid = (99,590)     # (99,590) lebedev grids, (75,302) is often enough
mf_GPU.grids.atom_grid = (75,302)     # (99,590) lebedev grids, (75,302) is often enough
mf_GPU.conv_tol = 1e-10               # controls SCF convergence tolerance
mf_GPU.max_cycle = 50                 # controls max iterations of SCF
mf_GPU.conv_tol_cpscf = 1e-3          # controls convergence tolerance of CPSCF (for hessian)
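For orientation, the updated example runs end to end roughly as follows. This is a minimal sketch assembled from the file above; the water geometry string and the rks import path are assumed rather than shown in the diff.

import pyscf
from gpu4pyscf.dft import rks

atom = '''
O  0.0000  0.0000  0.1173
H  0.0000  0.7572 -0.4692
H  0.0000 -0.7572 -0.4692
'''

mol = pyscf.M(atom=atom, basis='def2-tzvpp', verbose=6, cart=True)

mf_GPU = rks.RKS(mol, xc='b3lyp').density_fit()
mf_GPU.grids.atom_grid = (75, 302)   # the smaller grid the example now defaults to
mf_GPU.conv_tol = 1e-10
mf_GPU.max_cycle = 50
e_tot = mf_GPU.kernel()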
39 changes: 8 additions & 31 deletions gpu4pyscf/__config__.py
@@ -2,37 +2,14 @@

props = cupy.cuda.runtime.getDeviceProperties(0)
GB = 1024*1024*1024
# such as A100-80G
if props['totalGlobalMem'] >= 64 * GB:
    min_ao_blksize = 128
    min_grid_blksize = 128*128
    ao_aligned = 32
    grid_aligned = 256
    mem_fraction = 0.9
    number_of_threads = 2048 * 108
# such as V100-32G
elif props['totalGlobalMem'] >= 32 * GB:
    min_ao_blksize = 128
    min_grid_blksize = 128*128
    ao_aligned = 32
    grid_aligned = 256
    mem_fraction = 0.9
    number_of_threads = 1024 * 80
# such as A30-24GB
elif props['totalGlobalMem'] >= 16 * GB:
    min_ao_blksize = 128
    min_grid_blksize = 128*128
    ao_aligned = 32
    grid_aligned = 256
    mem_fraction = 0.9
    number_of_threads = 1024 * 80
# other gaming cards
else:

min_ao_blksize = 128
min_grid_blksize = 16384
ao_aligned = 32
grid_aligned = 256
mem_fraction = 0.9

if props['totalGlobalMem'] < 16 * GB:
    min_ao_blksize = 64
    min_grid_blksize = 64*64
    ao_aligned = 32
    grid_aligned = 128
    mem_fraction = 0.9
    number_of_threads = 1024 * 80

cupy.get_default_memory_pool().set_limit(fraction=mem_fraction)
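The simplified branch above derives block sizes from the reported device memory instead of enumerating card models. A small sketch of checking which branch a given card falls into; the variable names mirror the config and the values are the ones shown above.

import cupy

GB = 1024**3
props = cupy.cuda.runtime.getDeviceProperties(0)
total_mem = props['totalGlobalMem']

# Defaults from the updated __config__.py; cards below 16 GB fall back
# to smaller AO/grid blocks.
min_ao_blksize, min_grid_blksize = 128, 16384
if total_mem < 16 * GB:
    min_ao_blksize, min_grid_blksize = 64, 64*64

print(f'{total_mem/GB:.0f} GB device -> min_ao_blksize={min_ao_blksize}, '
      f'min_grid_blksize={min_grid_blksize}')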
5 changes: 2 additions & 3 deletions gpu4pyscf/df/df.py
@@ -103,7 +103,7 @@ def build(self, direct_scf_tol=1e-14, omega=None):

        v = w = None
        naux = self.naux = self.cd_low.shape[1]
        log.debug('size of aux basis %d', naux)
        log.debug('Size of aux basis %d', naux)

        self._cderi = cholesky_eri_gpu(intopt, mol, auxmol, self.cd_low, omega=omega)
        log.timer_debug1('cholesky_eri', *t0)
@@ -213,7 +213,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):
    if naux * npair * 8 < 0.4 * avail_mem:
        try:
            cderi = cupy.empty([naux, npair], order='C')
            log.debug(f"Saving CDERI on GPU. CDERI size {cderi.nbytes/GB}")
            log.debug(f"Saving CDERI on GPU. CDERI size {cderi.nbytes/GB:.3f} GB")
        except Exception:
            use_gpu_memory = False
    else:
@@ -276,7 +276,6 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):
        ints_slices_f = cupy.empty([naoaux, len(row)], order='F')
        ints_slices_f[:] = ints_slices[:,col,row]
        ints_slices = None

        if cd_low.tag == 'eig':
            cderi_block = cupy.dot(cd_low.T, ints_slices_f)
            ints_slices = None
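The GPU-vs-host decision in cholesky_eri_gpu keys off a simple byte count: naux * npair * 8 must stay under 40% of the available memory. A rough back-of-the-envelope check along the same lines; nao and naux are assumed example sizes, and free memory is taken from memGetInfo here purely for illustration, while the code derives avail_mem its own way.

import cupy

GB = 1024**3
nao, naux = 600, 2400                 # assumed orbital / auxiliary basis sizes
npair = nao * (nao + 1) // 2          # compressed AO pair count
cderi_bytes = naux * npair * 8        # float64 CDERI tensor

free_mem, _total = cupy.cuda.runtime.memGetInfo()
keep_on_gpu = cderi_bytes < 0.4 * free_mem
print(f'CDERI ~ {cderi_bytes/GB:.2f} GB, keep on GPU: {keep_on_gpu}')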
138 changes: 121 additions & 17 deletions gpu4pyscf/dft/gen_grid.py
@@ -31,10 +31,11 @@
import cupy
from pyscf import lib
from pyscf import gto
from pyscf.gto.eval_gto import BLKSIZE, NBINS, CUTOFF, make_screen_index
from pyscf.gto.eval_gto import BLKSIZE, NBINS, CUTOFF
from pyscf import __config__
from cupyx.scipy.spatial.distance import cdist
from gpu4pyscf.lib import logger
from gpu4pyscf.gto.eval_gto import make_screen_index
from gpu4pyscf.dft import radi
from gpu4pyscf.lib.cupy_helper import load_library
from gpu4pyscf import __config__ as __gpu4pyscf_config__
@@ -44,8 +45,12 @@

from pyscf.dft.gen_grid import GROUP_BOUNDARY_PENALTY, NELEC_ERROR_TOL, LEBEDEV_ORDER, LEBEDEV_NGRID

GROUP_BOX_SIZE = 3.0
AO_ALIGNMENT = getattr(__config__, 'ao_aligned', 16)
GROUP_BOX_SIZE = 1.2
ALIGNMENT_UNIT = getattr(__gpu4pyscf_config__, 'grid_aligned', 128)
MIN_BLK_SIZE = getattr(__gpu4pyscf_config__, 'min_grid_blksize', 64*64)
GRID_BLKSIZE = MIN_BLK_SIZE

# SG0
# S. Chien and P. Gill, J. Comput. Chem. 27 (2006) 730-739.

@@ -350,7 +355,7 @@ def gen_grid_partition(coords):
gen_partition = get_partition

def make_mask(mol, coords, relativity=0, shls_slice=None, cutoff=CUTOFF,
              verbose=None):
              verbose=None, blksize=ALIGNMENT_UNIT):
    '''Mask to indicate whether a shell is ignorable on grids. See also the
    function gto.eval_gto.make_screen_index

@@ -374,7 +379,39 @@ def make_mask(mol, coords, relativity=0, shls_slice=None, cutoff=CUTOFF,
        2D mask array of shape (N,nbas), where N is the number of grids, nbas
        is the number of shells.
    '''
    return make_screen_index(mol, coords, shls_slice, cutoff)

    log = logger.new_logger(mol)
    t0 = log.init_timer()
    s_index = make_screen_index(mol, coords, blksize=blksize, cutoff=1e-10)
    t0 = log.timer_debug1('s_index', *t0)
    return s_index

def gen_grid_sparsity(mol, s_index):
    ''' Generate nonzero AO indices, and corresponding ao_loc based on s_index
    '''
    if isinstance(s_index, cupy.ndarray):
        s_index = s_index.get()

    log = logger.new_logger(mol, mol.verbose)
    t0 = log.init_timer()
    ao_loc = mol.ao_loc_nr()
    nblocks = s_index.shape[0]
    nbas = mol.nbas
    nao = mol.nao
    ao_indices = -numpy.ones([nblocks, nao], dtype=numpy.int32)
    ao_loc_non0 = numpy.zeros([nblocks, nbas+1], dtype=numpy.int32)

    # Running on CPU
    libgdft.GDFTmake_sparse_cache(
        s_index.ctypes.data_as(ctypes.c_void_p),
        ao_loc.ctypes.data_as(ctypes.c_void_p),
        ctypes.c_int(nblocks),
        ao_loc_non0.ctypes.data_as(ctypes.c_void_p),
        ctypes.c_int(nbas),
        ao_indices.ctypes.data_as(ctypes.c_void_p),
        ctypes.c_int(nao))
    t0 = log.timer_debug1('sparse ao kernel', *t0)
    return ao_indices, ao_loc_non0
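To make the new data structures concrete: s_index marks, per grid block, which shells survive screening, and GDFTmake_sparse_cache packs that into per-block AO index lists. The pure-numpy sketch below mimics that packing for a toy case; the exact layout (packing with -1 padding in ao_indices, cumulative shell counts in ao_loc_non0) is an assumption inferred from the array shapes and dtypes above, not taken from the C kernel.

import numpy

# Toy shell structure: 3 shells carrying 1, 3 and 5 AOs (like mol.ao_loc_nr()).
ao_loc = numpy.array([0, 1, 4, 9], dtype=numpy.int32)
nbas, nao = len(ao_loc) - 1, int(ao_loc[-1])

# Screen index for 2 grid blocks: a nonzero entry means the shell contributes.
s_index = numpy.array([[1, 0, 1],
                       [0, 1, 0]], dtype=numpy.uint8)
nblocks = s_index.shape[0]

ao_indices = -numpy.ones((nblocks, nao), dtype=numpy.int32)
ao_loc_non0 = numpy.zeros((nblocks, nbas + 1), dtype=numpy.int32)
for b in range(nblocks):
    packed = []
    for sh in range(nbas):
        if s_index[b, sh]:
            packed.extend(range(ao_loc[sh], ao_loc[sh + 1]))
        ao_loc_non0[b, sh + 1] = len(packed)
    ao_indices[b, :len(packed)] = packed

print(ao_indices)          # block 0 keeps the AOs of shells 0 and 2, padded with -1
print(ao_loc_non0[:, -1])  # number of surviving AOs per block: [6 3]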

def argsort_group(group_ids, ngroup):
    '''Sort the grids based on the group_ids.
@@ -419,7 +456,8 @@ def atomic_group_grids(mol, coords):
        ctypes.c_int(ngrids)
    )
    if err != 0:
        raise RuntimeError('CUDA Error')
        raise RuntimeError('CUDA Error in GDFTgroup_grids kernel')

    idx = group_ids.argsort()
    return idx
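The renamed error message belongs to the kernel that assigns each grid point to a group before sorting. A numpy illustration of the reordering that group_ids.argsort() produces, assuming nearest-atom grouping, which is what the GDFTgroup_grids kernel appears to compute:

import numpy

numpy.random.seed(0)
atom_coords = numpy.random.rand(3, 3)        # 3 atoms, assumed positions
grid_coords = numpy.random.rand(1000, 3)     # 1000 grid points

# Distance of every grid point to every atom, then the nearest-atom label.
dist = numpy.linalg.norm(grid_coords[:, None, :] - atom_coords[None, :, :], axis=2)
group_ids = dist.argmin(axis=1)

idx = group_ids.argsort()                    # grids of the same atom become contiguous
grid_coords = grid_coords[idx]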

@@ -464,6 +502,35 @@ def _load_conf(mod, name, default):
    else:
        return var

def gen_sparse_cache(mol, coords, blksize):
    '''
    determine sparse AO indices
    '''
    log = logger.new_logger(mol, 6)
    ao_loc = mol.ao_loc_nr()
    ngrids = coords.shape[0]
    t0 = log.init_timer()
    s_index = make_screen_index(mol, coords, blksize=blksize)
    t0 = log.timer_debug1('s_index', *t0)
    s_index_cpu = s_index.get()
    nblocks = (ngrids + blksize - 1)//blksize
    nbas = mol.nbas
    nao = mol.nao
    ao_indices = numpy.zeros([nblocks, nao], dtype=numpy.int32)
    ao_loc_non0 = numpy.zeros([nblocks, nbas+1], dtype=numpy.int32)

    # Running on CPU
    libgdft.GDFTmake_sparse_cache(
        s_index_cpu.ctypes.data_as(ctypes.c_void_p),
        ao_loc.ctypes.data_as(ctypes.c_void_p),
        ctypes.c_int(nblocks),
        ao_loc_non0.ctypes.data_as(ctypes.c_void_p),
        ctypes.c_int(nbas),
        ao_indices.ctypes.data_as(ctypes.c_void_p),
        ctypes.c_int(nao))
    t0 = log.timer_debug1('sparse ao kernel', *t0)
    return ao_indices, s_index, ao_loc_non0

from pyscf.dft import gen_grid
from gpu4pyscf.lib import utils
class Grids(lib.StreamObject):
@@ -483,8 +550,9 @@ class Grids(lib.StreamObject):
    alignment = ALIGNMENT_UNIT
    cutoff = CUTOFF
    _keys = gen_grid.Grids._keys

    __init__ = gen_grid.Grids.__init__
    __init__ = gen_grid.Grids.__init__

    _keys.update({'sparse_cache'})

    def __setattr__(self, key, val):
        if key in ('atom_grid', 'atomic_radii', 'radii_adjust', 'radi_method',
@@ -502,27 +570,30 @@ def build(self, mol=None, with_non0tab=False, sort_grids=True, **kwargs):
        self.check_sanity()
        atom_grids_tab = self.gen_atomic_grids(
            mol, self.atom_grid, self.radi_method, self.level, self.prune, **kwargs)
        self.coords, self.weights = self.get_partition(
            mol, atom_grids_tab, self.radii_adjust, self.atomic_radii, self.becke_scheme)
        coords, weights = self.get_partition(
            mol, atom_grids_tab, self.radii_adjust, self.atomic_radii, self.becke_scheme,
            concat=False)
        if self.alignment > 1:
            padding = _padding_size(self.size, self.alignment)
            grid_size = sum([w.size for w in weights])
            padding = _padding_size(grid_size, self.alignment)
            logger.debug(self, 'Padding %d grids', padding)
            if padding > 0:
                # cupy.vstack and cupy.hstack convert numpy array into cupy array first
                self.coords = cupy.vstack(
                    [self.coords, numpy.repeat([[1e4]*3], padding, axis=0)])
                self.weights = cupy.hstack([self.weights, numpy.zeros(padding)])
                coords.append(cupy.asarray(numpy.repeat([[1e4]*3], padding, axis=0)))
                weights.append(cupy.zeros(padding))
        self.coords = cupy.vstack(coords)
        self.weights = cupy.hstack(weights)

        if sort_grids:
            #idx = arg_group_grids(mol, self.coords)
            idx = atomic_group_grids(mol, self.coords)
            idx = arg_group_grids(mol, self.coords)
            #idx = atomic_group_grids(mol, self.coords)
            self.coords = self.coords[idx]
            self.weights = self.weights[idx]

        if with_non0tab:
            self.non0tab = self.make_mask(mol, self.coords)
            self.screen_index = self.non0tab
        else:
            self.screen_index = self.non0tab = None

        logger.info(self, 'tot grids = %d', len(self.weights))
        return self
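The rebuilt build() pads the collected grids up to a multiple of self.alignment before concatenating, appending placeholder points at (1e4, 1e4, 1e4) with zero weight so downstream kernels see an aligned grid count. A sketch of the arithmetic; the real _padding_size helper is not shown in this diff, so the one-liner below is an assumed stand-in.

import numpy

def _padding_size(ngrids, alignment):
    # assumed behaviour: pad up to the next multiple of `alignment`
    return (-ngrids) % alignment

weights = [numpy.ones(1203), numpy.ones(997)]     # per-atom weights, assumed sizes
grid_size = sum(w.size for w in weights)          # 2200
padding = _padding_size(grid_size, 128)           # 104
print(grid_size, padding, grid_size + padding)    # 2200 104 2304 (= 18 * 128)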

@@ -538,6 +609,7 @@ def reset(self, mol=None):
        self.weights = None
        self.non0tab = None
        self.screen_index = None
        self.sparse_cache = {}
        return self

    gen_atomic_grids = lib.module_method(
@@ -556,6 +628,38 @@ def get_partition(self, mol, atom_grids_tab=None,

    make_mask = lib.module_method(make_mask, absences=['cutoff'])

    def cache_sparsity(self, sorted_mol, blksize=ALIGNMENT_UNIT, sort_grids=False):
        ''' Build sparsity data for sparse AO evaluation
        Sort grids based on the number of nonzero AOs
        '''
        log = logger.new_logger(sorted_mol, sorted_mol.verbose)
        t1 = log.init_timer()
        ngrids = self.coords.shape[0]
        s_index = make_mask(sorted_mol, self.coords, blksize=blksize)
        ao_indices, ao_loc_non0 = gen_grid_sparsity(sorted_mol, s_index)
        nao_non0 = ao_loc_non0[:,-1]
        if sort_grids:
            # Sort grids based on the number of nonzero AOs
            idx = numpy.argsort(nao_non0)
            nao_non0 = nao_non0[idx]
            ao_indices = ao_indices[idx]
            s_index = s_index[idx]
            ao_loc_non0 = cupy.asarray(ao_loc_non0[idx])

            ao_indices = cupy.asarray(ao_indices)
            idx = cupy.asarray(idx)
            idx_grids = cupy.tile(idx*blksize, (blksize,1)).T
            idx_grids += cupy.arange(blksize)
            idx_grids = idx_grids.ravel(order='C')[:ngrids]

            self.coords = self.coords[idx_grids]
            self.weights = self.weights[idx_grids]

        sparse_data = (ao_indices, s_index, ao_loc_non0, nao_non0)
        self.sparse_cache[blksize, ngrids] = sparse_data
        t1 = log.timer_debug1('generate sparsity', *t1)
        return
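Taken together, the intended call pattern looks like build-then-cache. A minimal usage sketch; passing the plain mol as sorted_mol and pre-creating sparse_cache (which reset() initialises to an empty dict) are assumptions about how the draft is meant to be wired up.

import pyscf
from gpu4pyscf.dft import rks

mol = pyscf.M(atom='O 0 0 0; H 0 0.76 -0.47; H 0 -0.76 -0.47', basis='def2-svp')
mf = rks.RKS(mol, xc='b3lyp').density_fit()

grids = mf.grids
grids.atom_grid = (75, 302)
grids.build(sort_grids=True)

if not hasattr(grids, 'sparse_cache'):     # reset() sets this dict in the draft
    grids.sparse_cache = {}
grids.cache_sparsity(mol, blksize=128, sort_grids=True)

ngrids = grids.weights.size
ao_indices, s_index, ao_loc_non0, nao_non0 = grids.sparse_cache[128, ngrids]
print(nao_non0[:10])                       # nonzero AOs per 128-point grid block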

    def prune_by_density_(self, rho, threshold=0):
        '''Prune grids if the electron density on the grid is small'''
        if threshold == 0: