Improving the RDKitConverter caching system (#2942)

Fixes #2958 ## Work done in this PR * use lru_cache for the RDKit converter * add force parameter to allow for the conversion of systems with no hydrogens
MDAnalysis · May 11, 2021 · 6d5ef34 · 6d5ef34
1 parent d0fc581
commit 6d5ef34
Showing 3 changed files with 219 additions and 158 deletions.
diff --git a/package/CHANGELOG b/package/CHANGELOG
@@ -23,6 +23,7 @@ The rules for this file:
   * 2.0.0
 
 Fixes
+  * New RDKitConverter cache system to fix issue #2958 (PR #2942)
   * Fixed OpenMM converter documentation not showing up in the Converters
     section (Issue #3262, PR #2882)
   * WaterBridgeAnalysis double counts the water (Issue #3119, PR #3120)
@@ -109,6 +110,8 @@ Fixes
   * Fix syntax warning over comparison of literals using is (Issue #3066)
 
 Enhancements
+  * Added guessers for aromaticity and Gasteiger partial charges (Issue #2468,
+    PR #2926)
   * Added `Results` class for storing analysis results (#3115, PR #3233)
   * Added intra_bonds, intra_angles, intra_dihedrals etc. to return only
     the connections involving atoms within the AtomGroup, instead of
@@ -177,6 +180,9 @@ Enhancements
     checking if it can be used in parallel analysis. (Issue #2996, PR #2950)
 
 Changes
+  * Adds a `force` parameter to the RDKitConverter to allow conversion of
+    molecules with no hydrogen atom, and a `set_converter_cache_size` function
+    to change how many items are retained in the cache (PR #2942)
   * `hydrogenbonds.WaterBridgeAnalysis.generate_table()` now returns the table; as previously, 
     it also generates the `table` attribute but the attribute will be removed in 3.0.0.
   * `hydrogenbonds.WaterBridgeAnalysis` now stores data using the

diff --git a/package/MDAnalysis/converters/RDKit.py b/package/MDAnalysis/converters/RDKit.py
@@ -67,6 +67,7 @@
 import warnings
 import re
 import copy
+from functools import lru_cache
 
 import numpy as np
 
@@ -244,19 +245,13 @@ class RDKitConverter(base.ConverterBase):
     Since one of the main use case of the converter is converting trajectories
     and not just a topology, creating a new molecule from scratch for every
     frame would be too slow so the converter uses a caching system. The cache
-    only remembers the id of the last AtomGroup that was converted, as well
-    as the arguments that were passed to the converter. This means that using
-    ``u.select_atoms("protein").convert_to("RDKIT")`` will not benefit from the
-    cache since the selection is deleted from memory as soon as the conversion
-    is finished. Instead, users should do this in two steps by first saving the
-    selection in a variable and then converting the saved AtomGroup. It also
-    means that ``ag.convert_to("RDKIT")`` followed by
-    ``ag.convert_to("RDKIT", NoImplicit=False)`` will not use the cache.
-    Finally if you're modifying the AtomGroup in place between two conversions,
-    the id of the AtomGroup won't change and thus the converter will use the
-    cached molecule. For this reason, you can pass a ``cache=False`` argument
-    to the converter to bypass the caching system.
-    Note that the cached molecule doesn't contain the coordinates of the atoms.
+    only stores the 2 most recent AtomGroups that were converted, and is
+    sensitive to the arguments that were passed to the converter. The number of
+    objects cached can be changed with the function
+    :func:`set_converter_cache_size`. However, ``ag.convert_to("RDKIT")``
+    followed by ``ag.convert_to("RDKIT", NoImplicit=False)`` will not use the
+    cache since the arguments given are different. You can pass a
+    ``cache=False`` argument to the converter to bypass the caching system.
 
 
     .. versionadded:: 2.0.0
@@ -265,9 +260,9 @@ class RDKitConverter(base.ConverterBase):
 
     lib = 'RDKIT'
     units = {'time': None, 'length': 'Angstrom'}
-    _cache = dict()
 
-    def convert(self, obj, cache=True, NoImplicit=True, max_iter=200):
+    def convert(self, obj, cache=True, NoImplicit=True, max_iter=200,
+                force=False):
         """Write selection at current trajectory frame to
         :class:`~rdkit.Chem.rdchem.Mol`.
 
@@ -278,16 +273,17 @@ def convert(self, obj, cache=True, NoImplicit=True, max_iter=200):
         cache : bool
             Use a cached copy of the molecule's topology when available. To be
             used, the cached molecule and the new one have to be made from the
-            same AtomGroup object (same id) and with the same arguments passed
-            to the converter (with the exception of this `cache` argument)
+            same AtomGroup selection and with the same arguments passed
+            to the converter
         NoImplicit : bool
             Prevent adding hydrogens to the molecule
         max_iter : int
             Maximum number of iterations to standardize conjugated systems.
             See :func:`_rebuild_conjugated_bonds`
+        force : bool
+            Force the conversion when no hydrogens were detected but
+            ``NoImplicit=True``. Useful for inorganic molecules mostly.
         """
-        # parameters passed to atomgroup_to_mol and used by the cache
-        kwargs = dict(NoImplicit=NoImplicit, max_iter=max_iter)
 
         try:
             from rdkit import Chem
@@ -303,22 +299,13 @@ def convert(self, obj, cache=True, NoImplicit=True, max_iter=200):
                             "please use a valid AtomGroup or Universe".format(
                                 type(obj))) from None
 
+        # parameters passed to atomgroup_to_mol
+        kwargs = dict(NoImplicit=NoImplicit, max_iter=max_iter, force=force)
         if cache:
-            # key used to search the cache
-            key = f"<{id(ag):#x}>" + ",".join(f"{key}={value}"
-                                            for key, value in kwargs.items())
-            try:
-                mol = self._cache[key]
-            except KeyError:
-                # only keep the current molecule in cache
-                self._cache.clear()
-                # create the topology
-                self._cache[key] = mol = self.atomgroup_to_mol(ag, **kwargs)
-            # continue on copy of the cached molecule
+            mol = atomgroup_to_mol(ag, **kwargs)
             mol = copy.deepcopy(mol)
         else:
-            self._cache.clear()
-            mol = self.atomgroup_to_mol(ag, **kwargs)
+            mol = atomgroup_to_mol.__wrapped__(ag, **kwargs)
 
         # add a conformer for the current Timestep
         if hasattr(ag, "positions"):
@@ -339,108 +326,131 @@ def convert(self, obj, cache=True, NoImplicit=True, max_iter=200):
 
         return mol
 
-    def atomgroup_to_mol(self, ag, NoImplicit=True, max_iter=200):
-        """Converts an AtomGroup to an RDKit molecule.
 
-        Parameters
-        -----------
-        ag : MDAnalysis.core.groups.AtomGroup
-            The AtomGroup to convert
-        NoImplicit : bool
-            Prevent adding hydrogens to the molecule
-        max_iter : int
-            Maximum number of iterations to standardize conjugated systems.
-            See :func:`_rebuild_conjugated_bonds`
-        """
-        try:
-            elements = ag.elements
-        except NoDataError:
-            raise AttributeError(
-                "The `elements` attribute is required for the RDKitConverter "
-                "but is not present in this AtomGroup. Please refer to the "
-                "documentation to guess elements from other attributes or "
-                "type `help(mda.topology.guessers)`") from None
+@lru_cache(maxsize=2)
+def atomgroup_to_mol(ag, NoImplicit=True, max_iter=200, force=False):
+    """Converts an AtomGroup to an RDKit molecule without coordinates.
 
-        if "H" not in ag.elements:
+    Parameters
+    -----------
+    ag : MDAnalysis.core.groups.AtomGroup
+        The AtomGroup to convert
+    NoImplicit : bool
+        Prevent adding hydrogens to the molecule and allow bond orders and
+        formal charges to be guessed from the valence of each atom.
+    max_iter : int
+        Maximum number of iterations to standardize conjugated systems.
+        See :func:`_rebuild_conjugated_bonds`
+    force : bool
+        Force the conversion when no hydrogens were detected but
+        ``NoImplicit=True``. Mostly useful for inorganic molecules.
+    """
+    try:
+        elements = ag.elements
+    except NoDataError:
+        raise AttributeError(
+            "The `elements` attribute is required for the RDKitConverter "
+            "but is not present in this AtomGroup. Please refer to the "
+            "documentation to guess elements from other attributes or "
+            "type `help(MDAnalysis.topology.guessers)`") from None
+
+    if "H" not in ag.elements:
+        if force:
             warnings.warn(
-                "No hydrogen atom could be found in the topology, but the "
-                "converter requires all hydrogens to be explicit. Please "
-                "check carefully the output molecule as the converter is "
-                "likely to add negative charges and assign incorrect bond "
-                "orders to structures with implicit hydrogens. Alternatively, "
-                "you can use the parameter `NoImplicit=False` when using the "
-                "converter to allow implicit hydrogens and disable inferring "
-                "bond orders and charges."
+                "No hydrogen atom found in the topology. "
+                "Forcing to continue the conversion."
             )
-
-        # attributes accepted in PDBResidueInfo object
-        pdb_attrs = {}
-        if hasattr(ag, "bfactors") and hasattr(ag, "tempfactors"):
+        elif NoImplicit:
             raise AttributeError(
-                "Both `tempfactors` and `bfactors` attributes are present but "
-                "only one can be assigned to the RDKit molecule. Please "
-                "delete the unnecessary one and retry."
-            )
-        for attr in RDATTRIBUTES.keys():
-            if hasattr(ag, attr):
-                pdb_attrs[attr] = getattr(ag, attr)
-
-        other_attrs = {}
-        for attr in ["charges", "segids", "types"]:
-            if hasattr(ag, attr):
-                other_attrs[attr] = getattr(ag, attr)
-
-        mol = Chem.RWMol()
-        # map index in universe to index in mol
-        atom_mapper = {}
-
-        for i, (atom, element) in enumerate(zip(ag, elements)):
-            # create atom
-            rdatom = Chem.Atom(element.capitalize())
-            # enable/disable adding implicit H to the molecule
-            rdatom.SetNoImplicit(NoImplicit)
-            # add PDB-like properties
-            mi = Chem.AtomPDBResidueInfo()
-            for attr, values in pdb_attrs.items():
-                _add_mda_attr_to_rdkit(attr, values[i], mi)
-            rdatom.SetMonomerInfo(mi)
-            # other properties
-            for attr in other_attrs.keys():
-                value = other_attrs[attr][i]
-                attr = "_MDAnalysis_%s" % _TOPOLOGY_ATTRS[attr].singular
-                _set_atom_property(rdatom, attr, value)
-            _set_atom_property(rdatom, "_MDAnalysis_index", i)
-            # add atom
-            index = mol.AddAtom(rdatom)
-            atom_mapper[atom.ix] = index
-
+                "No hydrogen atom could be found in the topology, but the "
+                "converter requires all hydrogens to be explicit. You can use "
+                "the parameter ``NoImplicit=False`` when using the converter "
+                "to allow implicit hydrogens and disable inferring bond "
+                "orders and charges. You can also use ``force=True`` to "
+                "ignore this error.")
+
+    # attributes accepted in PDBResidueInfo object
+    pdb_attrs = {}
+    if hasattr(ag, "bfactors") and hasattr(ag, "tempfactors"):
+        raise AttributeError(
+            "Both `tempfactors` and `bfactors` attributes are present but "
+            "only one can be assigned to the RDKit molecule. Please "
+            "delete the unnecessary one and retry."
+        )
+    for attr in RDATTRIBUTES.keys():
+        if hasattr(ag, attr):
+            pdb_attrs[attr] = getattr(ag, attr)
+
+    other_attrs = {}
+    for attr in ["charges", "segids", "types"]:
+        if hasattr(ag, attr):
+            other_attrs[attr] = getattr(ag, attr)
+
+    mol = Chem.RWMol()
+    # map index in universe to index in mol
+    atom_mapper = {}
+
+    for i, (atom, element) in enumerate(zip(ag, elements)):
+        # create atom
+        rdatom = Chem.Atom(element.capitalize())
+        # enable/disable adding implicit H to the molecule
+        rdatom.SetNoImplicit(NoImplicit)
+        # add PDB-like properties
+        mi = Chem.AtomPDBResidueInfo()
+        for attr, values in pdb_attrs.items():
+            _add_mda_attr_to_rdkit(attr, values[i], mi)
+        rdatom.SetMonomerInfo(mi)
+        # other properties
+        for attr in other_attrs.keys():
+            value = other_attrs[attr][i]
+            attr = "_MDAnalysis_%s" % _TOPOLOGY_ATTRS[attr].singular
+            _set_atom_property(rdatom, attr, value)
+        _set_atom_property(rdatom, "_MDAnalysis_index", i)
+        # add atom
+        index = mol.AddAtom(rdatom)
+        atom_mapper[atom.ix] = index
+
+    try:
+        ag.bonds
+    except NoDataError:
+        warnings.warn(
+            "No `bonds` attribute in this AtomGroup. Guessing bonds based "
+            "on atoms coordinates")
+        ag.guess_bonds()
+
+    for bond in ag.bonds:
         try:
-            ag.bonds
-        except NoDataError:
-            warnings.warn(
-                "No `bonds` attribute in this AtomGroup. Guessing bonds based "
-                "on atoms coordinates")
-            ag.guess_bonds()
+            bond_indices = [atom_mapper[i] for i in bond.indices]
+        except KeyError:
+            continue
+        bond_type = RDBONDORDER.get(bond.order, Chem.BondType.SINGLE)
+        mol.AddBond(*bond_indices, bond_type)
+
+    mol.UpdatePropertyCache(strict=False)
+
+    if NoImplicit:
+        # infer bond orders and formal charges from the connectivity
+        _infer_bo_and_charges(mol)
+        mol = _standardize_patterns(mol, max_iter)
 
-        for bond in ag.bonds:
-            try:
-                bond_indices = [atom_mapper[i] for i in bond.indices]
-            except KeyError:
-                continue
-            bond_type = RDBONDORDER.get(bond.order, Chem.BondType.SINGLE)
-            mol.AddBond(*bond_indices, bond_type)
+    # sanitize
+    Chem.SanitizeMol(mol)
 
-        mol.UpdatePropertyCache(strict=False)
+    return mol
 
-        if NoImplicit:
-            # infer bond orders and formal charges from the connectivity
-            _infer_bo_and_charges(mol)
-            mol = _standardize_patterns(mol, max_iter)
 
-        # sanitize
-        Chem.SanitizeMol(mol)
+def set_converter_cache_size(maxsize):
+    """Set the maximum cache size of the RDKit converter
 
-        return mol
+    Parameters
+    ----------
+    maxsize : int or None
+        If int, the cache will only keep the ``maxsize`` most recent
+        conversions in memory. Using ``maxsize=None`` will remove all limits
+        to the cache size, i.e. everything is cached.
+    """
+    global atomgroup_to_mol
+    atomgroup_to_mol = lru_cache(maxsize=maxsize)(atomgroup_to_mol.__wrapped__)
 
 
 def _add_mda_attr_to_rdkit(attr, value, mi):