Skip to content

Commit

Permalink
Faster name selections (#2755)
Browse files Browse the repository at this point in the history
* modified AtomNames topologyattr to include lookup table index

* cheeky little optimisation

* rework atom name selection to use lookup tables

* Update topologyattrs.py

* fixed test supplying integer as atom name

really topologyattrs need to be statically typed and protective about this

* Update test_topologyattrs.py

* use dict-lookup string attrs EVERYWHERE

* removed some code duplication

made protein selection faster, 48ms -> 0.5ms on GRO testfile

* improved nucleic/backbone selections

* Added explicit tests for Resnames topologyattr

tests now provide str types for resnames/icodes

* use fnmatchcase to be case sensitive

* Update package/MDAnalysis/core/selection.py

@jbarnoud's fix

* apply suggestions from code review

Co-authored-by: Irfan Alibay <[email protected]>

* added test for setting multiple segids at once

Co-authored-by: Oliver Beckstein <[email protected]>
Co-authored-by: Irfan Alibay <[email protected]>
  • Loading branch information
3 people authored Aug 25, 2020
1 parent ca2cbe4 commit 45e56e8
Show file tree
Hide file tree
Showing 6 changed files with 369 additions and 73 deletions.
3 changes: 3 additions & 0 deletions package/CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ Fixes
* In hydrogenbonds.hbond_analysis.HydrogenbondAnalysis an AttributeError
was thrown when finding D-H pairs via the topology if `hydrogens` was an
empty AtomGroup (Issue #2848)
* Fixed performance regression on select_atoms for string selections (Issue #2751)
* Fixed the DMSParser, allowing the creation of multiple segids sharing
residues with identical resids (Issue #1387, PR #2872)
* H5MD files are now picklable with H5PYPicklable (Issue #2890, PR #2894)
Expand Down Expand Up @@ -79,6 +80,8 @@ Enhancements
* Added new kwargs `select_remove` and `select_protein` to
analysis.dihedrals.Janin analysis to give user more fine grained control
over selections (PR #2899)
* Improved performance of select_atoms on strings (e.g. name, type, resname) and
'protein' selection (Issue #2751, PR #2755)
* Added an RDKit converter that works for any input with all hydrogens
explicit in the topology (Issue #2468, PR #2775)

Expand Down
175 changes: 142 additions & 33 deletions package/MDAnalysis/core/selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -515,7 +515,7 @@ def apply(self, group):
return group[mask]


class StringSelection(Selection):
class _ProtoStringSelection(Selection):
"""Selections based on text attributes
.. versionchanged:: 1.0.0
Expand All @@ -530,11 +530,23 @@ def __init__(self, parser, tokens):

@return_empty_on_apply
def apply(self, group):
mask = np.zeros(len(group), dtype=bool)
for val in self.values:
values = getattr(group, self.field)
mask |= [fnmatch.fnmatch(x, val) for x in values]
return group[mask].unique
# rather than work on group.names, cheat and look at the lookup table
nmattr = getattr(group.universe._topology, self.field)

matches = [] # list of passing indices
# iterate through set of known atom names, check which pass
for nm, ix in nmattr.namedict.items():
if any(fnmatch.fnmatchcase(nm, val) for val in self.values):
matches.append(ix)

# atomname indices for members of this group
nmidx = nmattr.nmidx[getattr(group, self.level)]

return group[np.in1d(nmidx, matches)].unique


class StringSelection(_ProtoStringSelection):
level = 'ix' # operates on atom level attribute, i.e. '.ix'


class AtomNameSelection(StringSelection):
Expand All @@ -561,22 +573,27 @@ class AtomICodeSelection(StringSelection):
field = 'icodes'


class ResidueNameSelection(StringSelection):
class _ResidueStringSelection(_ProtoStringSelection):
level= 'resindices'


class ResidueNameSelection(_ResidueStringSelection):
"""Select atoms based on 'resnames' attribute"""
token = 'resname'
field = 'resnames'


class MoleculeTypeSelection(StringSelection):
class MoleculeTypeSelection(_ResidueStringSelection):
"""Select atoms based on 'moltypes' attribute"""
token = 'moltype'
field = 'moltypes'


class SegmentNameSelection(StringSelection):
class SegmentNameSelection(_ProtoStringSelection):
"""Select atoms based on 'segids' attribute"""
token = 'segid'
field = 'segids'
level = 'segindices'


class AltlocSelection(StringSelection):
Expand Down Expand Up @@ -802,10 +819,15 @@ class ProteinSelection(Selection):
See Also
--------
:func:`MDAnalysis.lib.util.convert_aa_code`
.. versionchanged:: 2.0.0
prot_res changed to set (from numpy array)
performance improved by ~100x on larger systems
"""
token = 'protein'

prot_res = np.array([
prot_res = {
# CHARMM top_all27_prot_lipid.rtf
'ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HSD',
'HSE', 'HSP', 'ILE', 'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR',
Expand All @@ -828,14 +850,20 @@ class ProteinSelection(Selection):
'CLEU', 'CILE', 'CVAL', 'CASF', 'CASN', 'CGLN', 'CARG', 'CHID', 'CHIE',
'CHIP', 'CTRP', 'CPHE', 'CTYR', 'CGLU', 'CASP', 'CLYS', 'CPRO', 'CCYS',
'CCYX', 'CMET', 'CME', 'ASF',
])
}

def __init__(self, parser, tokens):
pass

def apply(self, group):
mask = np.in1d(group.resnames, self.prot_res)
return group[mask].unique
resname_attr = group.universe._topology.resnames
# which values in resname attr are in prot_res?
matches = [ix for (nm, ix) in resname_attr.namedict.items()
if nm in self.prot_res]
# index of each atom's resname
nmidx = resname_attr.nmidx[group.resindices]
# intersect atom's resname index and matches to prot_res
return group[np.in1d(nmidx, matches)].unique


class NucleicSelection(Selection):
Expand All @@ -850,23 +878,32 @@ class NucleicSelection(Selection):
.. versionchanged:: 0.8
additional Gromacs selections
.. versionchanged:: 2.0.0
nucl_res changed to set (from numpy array)
performance improved by ~100x on larger systems
"""
token = 'nucleic'

nucl_res = np.array([
nucl_res = {
'ADE', 'URA', 'CYT', 'GUA', 'THY', 'DA', 'DC', 'DG', 'DT', 'RA',
'RU', 'RG', 'RC', 'A', 'T', 'U', 'C', 'G',
'DA5', 'DC5', 'DG5', 'DT5',
'DA3', 'DC3', 'DG3', 'DT3',
'RA5', 'RU5', 'RG5', 'RC5',
'RA3', 'RU3', 'RG3', 'RC3'
])
}

def __init__(self, parser, tokens):
pass

def apply(self, group):
mask = np.in1d(group.resnames, self.nucl_res)
resnames = group.universe._topology.resnames
nmidx = resnames.nmidx[group.resindices]

matches = [ix for (nm, ix) in resnames.namedict.items()
if nm in self.nucl_res]
mask = np.in1d(nmidx, matches)

return group[mask].unique


Expand All @@ -875,29 +912,65 @@ class BackboneSelection(ProteinSelection):
This excludes OT* on C-termini
(which are included by, e.g., VMD's backbone selection).
.. versionchanged:: 2.0.0
bb_atoms changed to set (from numpy array)
performance improved by ~100x on larger systems
"""
token = 'backbone'
bb_atoms = np.array(['N', 'CA', 'C', 'O'])
bb_atoms = {'N', 'CA', 'C', 'O'}

def apply(self, group):
mask = np.in1d(group.names, self.bb_atoms)
mask &= np.in1d(group.resnames, self.prot_res)
return group[mask].unique
atomnames = group.universe._topology.names
resnames = group.universe._topology.resnames

# filter by atom names
name_matches = [ix for (nm, ix) in atomnames.namedict.items()
if nm in self.bb_atoms]
nmidx = atomnames.nmidx[group.ix]
group = group[np.in1d(nmidx, name_matches)]

# filter by resnames
resname_matches = [ix for (nm, ix) in resnames.namedict.items()
if nm in self.prot_res]
nmidx = resnames.nmidx[group.resindices]
group = group[np.in1d(nmidx, resname_matches)]

return group.unique


class NucleicBackboneSelection(NucleicSelection):
"""Contains all atoms with name "P", "C5'", "C3'", "O3'", "O5'".
These atoms are only recognized if they are in a residue matched
by the :class:`NucleicSelection`.
.. versionchanged:: 2.0.0
bb_atoms changed to set (from numpy array)
performance improved by ~100x on larger systems
"""
token = 'nucleicbackbone'
bb_atoms = np.array(["P", "C5'", "C3'", "O3'", "O5'"])
bb_atoms = {"P", "C5'", "C3'", "O3'", "O5'"}

def apply(self, group):
mask = np.in1d(group.names, self.bb_atoms)
mask &= np.in1d(group.resnames, self.nucl_res)
return group[mask].unique
atomnames = group.universe._topology.names
resnames = group.universe._topology.resnames

# filter by atom names
name_matches = [ix for (nm, ix) in atomnames.namedict.items()
if nm in self.bb_atoms]
nmidx = atomnames.nmidx[group.ix]
group = group[np.in1d(nmidx, name_matches)]

# filter by resnames
resname_matches = [ix for (nm, ix) in resnames.namedict.items()
if nm in self.nucl_res]
nmidx = resnames.nmidx[group.resindices]
group = group[np.in1d(nmidx, resname_matches)]

return group.unique


class BaseSelection(NucleicSelection):
Expand All @@ -907,29 +980,65 @@ class BaseSelection(NucleicSelection):
'N9', 'N7', 'C8', 'C5', 'C4', 'N3', 'C2', 'N1', 'C6',
'O6','N2','N6', 'O2','N4','O4','C5M'
.. versionchanged:: 2.0.0
base_atoms changed to set (from numpy array)
performance improved by ~100x on larger systems
"""
token = 'nucleicbase'
base_atoms = np.array([
base_atoms = {
'N9', 'N7', 'C8', 'C5', 'C4', 'N3', 'C2', 'N1', 'C6',
'O6', 'N2', 'N6',
'O2', 'N4', 'O4', 'C5M'])
'O2', 'N4', 'O4', 'C5M'}

def apply(self, group):
mask = np.in1d(group.names, self.base_atoms)
mask &= np.in1d(group.resnames, self.nucl_res)
return group[mask].unique
atomnames = group.universe._topology.names
resnames = group.universe._topology.resnames

# filter by atom names
name_matches = [ix for (nm, ix) in atomnames.namedict.items()
if nm in self.base_atoms]
nmidx = atomnames.nmidx[group.ix]
group = group[np.in1d(nmidx, name_matches)]

# filter by resnames
resname_matches = [ix for (nm, ix) in resnames.namedict.items()
if nm in self.nucl_res]
nmidx = resnames.nmidx[group.resindices]
group = group[np.in1d(nmidx, resname_matches)]

return group.unique


class NucleicSugarSelection(NucleicSelection):
"""Contains all atoms with name C1', C2', C3', C4', O4'.
.. versionchanged:: 2.0.0
sug_atoms changed to set (from numpy array)
performance improved by ~100x on larger systems
"""
token = 'nucleicsugar'
sug_atoms = np.array(["C1'", "C2'", "C3'", "C4'", "O4'"])
sug_atoms = {"C1'", "C2'", "C3'", "C4'", "O4'"}

def apply(self, group):
mask = np.in1d(group.names, self.sug_atoms)
mask &= np.in1d(group.resnames, self.nucl_res)
return group[mask].unique
atomnames = group.universe._topology.names
resnames = group.universe._topology.resnames

# filter by atom names
name_matches = [ix for (nm, ix) in atomnames.namedict.items()
if nm in self.sug_atoms]
nmidx = atomnames.nmidx[group.ix]
group = group[np.in1d(nmidx, name_matches)]

# filter by resnames
resname_matches = [ix for (nm, ix) in resnames.namedict.items()
if nm in self.nucl_res]
nmidx = resnames.nmidx[group.resindices]
group = group[np.in1d(nmidx, resname_matches)]

return group.unique


class PropertySelection(Selection):
Expand Down
Loading

0 comments on commit 45e56e8

Please sign in to comment.