Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improving the RDKitConverter caching system #2942

Merged
merged 12 commits into from
May 11, 2021
256 changes: 133 additions & 123 deletions package/MDAnalysis/coordinates/RDKit.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@
import warnings
import re
import copy
from functools import lru_cache

import numpy as np

Expand Down Expand Up @@ -244,19 +245,13 @@ class RDKitConverter(base.ConverterBase):
Since one of the main use case of the converter is converting trajectories
and not just a topology, creating a new molecule from scratch for every
frame would be too slow so the converter uses a caching system. The cache
only remembers the id of the last AtomGroup that was converted, as well
as the arguments that were passed to the converter. This means that using
``u.select_atoms("protein").convert_to("RDKIT")`` will not benefit from the
cache since the selection is deleted from memory as soon as the conversion
is finished. Instead, users should do this in two steps by first saving the
selection in a variable and then converting the saved AtomGroup. It also
means that ``ag.convert_to("RDKIT")`` followed by
``ag.convert_to("RDKIT", NoImplicit=False)`` will not use the cache.
Finally if you're modifying the AtomGroup in place between two conversions,
the id of the AtomGroup won't change and thus the converter will use the
cached molecule. For this reason, you can pass a ``cache=False`` argument
to the converter to bypass the caching system.
Note that the cached molecule doesn't contain the coordinates of the atoms.
only stores the 2 most recent AtomGroups that were converted, and is
sensitive to the arguments that were passed to the converter. The number of
objects cached can be changed with the function
:func:`set_converter_cache_size`. However, ``ag.convert_to("RDKIT")``
followed by ``ag.convert_to("RDKIT", NoImplicit=False)`` will not use the
cache since the arguments given are different. You can pass a
``cache=False`` argument to the converter to bypass the caching system.


.. versionadded:: 2.0.0
Expand All @@ -265,9 +260,9 @@ class RDKitConverter(base.ConverterBase):

lib = 'RDKIT'
units = {'time': None, 'length': 'Angstrom'}
_cache = dict()

def convert(self, obj, cache=True, NoImplicit=True, max_iter=200):
def convert(self, obj, cache=True, NoImplicit=True, max_iter=200,
force=False):
"""Write selection at current trajectory frame to
:class:`~rdkit.Chem.rdchem.Mol`.

Expand All @@ -278,16 +273,17 @@ def convert(self, obj, cache=True, NoImplicit=True, max_iter=200):
cache : bool
Use a cached copy of the molecule's topology when available. To be
used, the cached molecule and the new one have to be made from the
same AtomGroup object (same id) and with the same arguments passed
to the converter (with the exception of this `cache` argument)
same AtomGroup selection and with the same arguments passed
to the converter
NoImplicit : bool
Prevent adding hydrogens to the molecule
max_iter : int
Maximum number of iterations to standardize conjugated systems.
See :func:`_rebuild_conjugated_bonds`
force : bool
Force the conversion when no hydrogens were detected but
``NoImplicit=True``. Useful for inorganic molecules mostly.
"""
# parameters passed to atomgroup_to_mol and used by the cache
kwargs = dict(NoImplicit=NoImplicit, max_iter=max_iter)

try:
from rdkit import Chem
Expand All @@ -303,22 +299,13 @@ def convert(self, obj, cache=True, NoImplicit=True, max_iter=200):
"please use a valid AtomGroup or Universe".format(
type(obj))) from None

# parameters passed to atomgroup_to_mol
kwargs = dict(NoImplicit=NoImplicit, max_iter=max_iter, force=force)
if cache:
# key used to search the cache
key = f"<{id(ag):#x}>" + ",".join(f"{key}={value}"
for key, value in kwargs.items())
try:
mol = self._cache[key]
except KeyError:
# only keep the current molecule in cache
self._cache.clear()
# create the topology
self._cache[key] = mol = self.atomgroup_to_mol(ag, **kwargs)
# continue on copy of the cached molecule
mol = atomgroup_to_mol(ag, **kwargs)
mol = copy.deepcopy(mol)
else:
self._cache.clear()
mol = self.atomgroup_to_mol(ag, **kwargs)
mol = atomgroup_to_mol.__wrapped__(ag, **kwargs)
IAlibay marked this conversation as resolved.
Show resolved Hide resolved

# add a conformer for the current Timestep
if hasattr(ag, "positions"):
Expand All @@ -339,108 +326,131 @@ def convert(self, obj, cache=True, NoImplicit=True, max_iter=200):

return mol

def atomgroup_to_mol(self, ag, NoImplicit=True, max_iter=200):
"""Converts an AtomGroup to an RDKit molecule.

Parameters
-----------
ag : MDAnalysis.core.groups.AtomGroup
The AtomGroup to convert
NoImplicit : bool
Prevent adding hydrogens to the molecule
max_iter : int
Maximum number of iterations to standardize conjugated systems.
See :func:`_rebuild_conjugated_bonds`
"""
try:
elements = ag.elements
except NoDataError:
raise AttributeError(
"The `elements` attribute is required for the RDKitConverter "
"but is not present in this AtomGroup. Please refer to the "
"documentation to guess elements from other attributes or "
"type `help(mda.topology.guessers)`") from None
@lru_cache(maxsize=2)
def atomgroup_to_mol(ag, NoImplicit=True, max_iter=200, force=False):
"""Converts an AtomGroup to an RDKit molecule without coordinates.

if "H" not in ag.elements:
Parameters
-----------
ag : MDAnalysis.core.groups.AtomGroup
The AtomGroup to convert
NoImplicit : bool
Prevent adding hydrogens to the molecule and allow bond orders and
formal charges to be guessed from the valence of each atom.
max_iter : int
Maximum number of iterations to standardize conjugated systems.
See :func:`_rebuild_conjugated_bonds`
force : bool
Force the conversion when no hydrogens were detected but
``NoImplicit=True``. Mostly useful for inorganic molecules.
"""
try:
elements = ag.elements
except NoDataError:
raise AttributeError(
"The `elements` attribute is required for the RDKitConverter "
"but is not present in this AtomGroup. Please refer to the "
"documentation to guess elements from other attributes or "
"type `help(MDAnalysis.topology.guessers)`") from None

if "H" not in ag.elements:
cbouy marked this conversation as resolved.
Show resolved Hide resolved
if force:
warnings.warn(
"No hydrogen atom could be found in the topology, but the "
"converter requires all hydrogens to be explicit. Please "
"check carefully the output molecule as the converter is "
"likely to add negative charges and assign incorrect bond "
"orders to structures with implicit hydrogens. Alternatively, "
"you can use the parameter `NoImplicit=False` when using the "
"converter to allow implicit hydrogens and disable inferring "
"bond orders and charges."
"No hydrogen atom found in the topology. "
"Forcing to continue the conversion."
)

# attributes accepted in PDBResidueInfo object
pdb_attrs = {}
if hasattr(ag, "bfactors") and hasattr(ag, "tempfactors"):
elif NoImplicit:
raise AttributeError(
"Both `tempfactors` and `bfactors` attributes are present but "
"only one can be assigned to the RDKit molecule. Please "
"delete the unnecessary one and retry."
)
for attr in RDATTRIBUTES.keys():
if hasattr(ag, attr):
pdb_attrs[attr] = getattr(ag, attr)

other_attrs = {}
for attr in ["charges", "segids", "types"]:
if hasattr(ag, attr):
other_attrs[attr] = getattr(ag, attr)

mol = Chem.RWMol()
# map index in universe to index in mol
atom_mapper = {}

for i, (atom, element) in enumerate(zip(ag, elements)):
# create atom
rdatom = Chem.Atom(element.capitalize())
# enable/disable adding implicit H to the molecule
rdatom.SetNoImplicit(NoImplicit)
# add PDB-like properties
mi = Chem.AtomPDBResidueInfo()
for attr, values in pdb_attrs.items():
_add_mda_attr_to_rdkit(attr, values[i], mi)
rdatom.SetMonomerInfo(mi)
# other properties
for attr in other_attrs.keys():
value = other_attrs[attr][i]
attr = "_MDAnalysis_%s" % _TOPOLOGY_ATTRS[attr].singular
_set_atom_property(rdatom, attr, value)
_set_atom_property(rdatom, "_MDAnalysis_index", i)
# add atom
index = mol.AddAtom(rdatom)
atom_mapper[atom.ix] = index

"No hydrogen atom could be found in the topology, but the "
"converter requires all hydrogens to be explicit. You can use "
"the parameter ``NoImplicit=False`` when using the converter "
"to allow implicit hydrogens and disable inferring bond "
"orders and charges. You can also use ``force=True`` to "
"ignore this error.")

# attributes accepted in PDBResidueInfo object
pdb_attrs = {}
if hasattr(ag, "bfactors") and hasattr(ag, "tempfactors"):
raise AttributeError(
"Both `tempfactors` and `bfactors` attributes are present but "
"only one can be assigned to the RDKit molecule. Please "
"delete the unnecessary one and retry."
)
for attr in RDATTRIBUTES.keys():
if hasattr(ag, attr):
pdb_attrs[attr] = getattr(ag, attr)

other_attrs = {}
for attr in ["charges", "segids", "types"]:
if hasattr(ag, attr):
other_attrs[attr] = getattr(ag, attr)

mol = Chem.RWMol()
# map index in universe to index in mol
atom_mapper = {}

for i, (atom, element) in enumerate(zip(ag, elements)):
# create atom
rdatom = Chem.Atom(element.capitalize())
# enable/disable adding implicit H to the molecule
rdatom.SetNoImplicit(NoImplicit)
# add PDB-like properties
mi = Chem.AtomPDBResidueInfo()
for attr, values in pdb_attrs.items():
_add_mda_attr_to_rdkit(attr, values[i], mi)
rdatom.SetMonomerInfo(mi)
# other properties
for attr in other_attrs.keys():
value = other_attrs[attr][i]
attr = "_MDAnalysis_%s" % _TOPOLOGY_ATTRS[attr].singular
_set_atom_property(rdatom, attr, value)
_set_atom_property(rdatom, "_MDAnalysis_index", i)
# add atom
index = mol.AddAtom(rdatom)
atom_mapper[atom.ix] = index

try:
ag.bonds
except NoDataError:
warnings.warn(
"No `bonds` attribute in this AtomGroup. Guessing bonds based "
"on atoms coordinates")
ag.guess_bonds()

for bond in ag.bonds:
try:
ag.bonds
except NoDataError:
warnings.warn(
"No `bonds` attribute in this AtomGroup. Guessing bonds based "
"on atoms coordinates")
ag.guess_bonds()
bond_indices = [atom_mapper[i] for i in bond.indices]
except KeyError:
cbouy marked this conversation as resolved.
Show resolved Hide resolved
continue
bond_type = RDBONDORDER.get(bond.order, Chem.BondType.SINGLE)
mol.AddBond(*bond_indices, bond_type)

mol.UpdatePropertyCache(strict=False)

if NoImplicit:
# infer bond orders and formal charges from the connectivity
_infer_bo_and_charges(mol)
mol = _standardize_patterns(mol, max_iter)

for bond in ag.bonds:
try:
bond_indices = [atom_mapper[i] for i in bond.indices]
except KeyError:
continue
bond_type = RDBONDORDER.get(bond.order, Chem.BondType.SINGLE)
mol.AddBond(*bond_indices, bond_type)
# sanitize
Chem.SanitizeMol(mol)

mol.UpdatePropertyCache(strict=False)
return mol

if NoImplicit:
# infer bond orders and formal charges from the connectivity
_infer_bo_and_charges(mol)
mol = _standardize_patterns(mol, max_iter)

# sanitize
Chem.SanitizeMol(mol)
def set_converter_cache_size(maxsize):
"""Set the maximum cache size of the RDKit converter

return mol
Parameters
----------
maxsize : int or None
If int, the cache will only keep the ``maxsize`` most recent
conversions in memory. Using ``maxsize=None`` will remove all limits
to the cache size, i.e. everything is cached.
"""
global atomgroup_to_mol
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

justified use of global ;-)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is probably not thread-safe – not a big deal, though, and I don't have a better idea.

(Although, we don't really encourage use of threads for parallelization; multiprocessing should do just fine.)

atomgroup_to_mol = lru_cache(maxsize=maxsize)(atomgroup_to_mol.__wrapped__)


def _add_mda_attr_to_rdkit(attr, value, mi):
Expand Down
Loading