Merge pull request MDAnalysis#1066 from MDAnalysis/issue-839-icodesel…

…ection Issue MDAnalysis#839 - Resid selection with icodes
abiedermann · Nov 11, 2016 · 9d5d7bf · 9d5d7bf
2 parents 2dfa20a + 941b5a9
commit 9d5d7bf
Show file tree

Hide file tree

Showing 6 changed files with 264 additions and 18 deletions.
diff --git a/package/CHANGELOG b/package/CHANGELOG
@@ -43,6 +43,8 @@ Enhancements
     analysis code (Issue #743)
   * Added ProgressMeter to 'transfer_to_memory' function, to show progress
     of loading trajectory to memory (Issue #1028 and PR #1055).
+  * Selecting atoms by resid now respects icodes if they were present. Ie
+    select_atoms('resid 163A') works! (Issue #839 PR #1066)
 
 Fixes
   * Removed MDAnalysis.analysis.PDBToBinaryTraj (Issue #1035)

diff --git a/package/MDAnalysis/core/groups.py b/package/MDAnalysis/core/groups.py
@@ -995,6 +995,11 @@ def select_atoms(self, sel, *othersel, **selgroups):
                 range consists of two numbers separated by a colon (inclusive)
                 such as ``resid 1:5``. A residue number ("resid") is taken
                 directly from the topology.
+                If icodes are present in the topology, then these will be
+                taken into account.  Ie 'resid 163B' will only select resid
+                163 with icode B while 'resid 163' will select only residue 163.
+                Range selections will also respect icodes, so 'resid 162-163B'
+                will select all residues in 162 and those in 163 up to icode B.
             resnum *resnum-number-range*
                 resnum is the canonical residue number; typically it is set to
                 the residue id in the original PDB structure.
@@ -1098,7 +1103,8 @@ def select_atoms(self, sel, *othersel, **selgroups):
            Added *group* and *fullgroup* selections.
         .. versionchanged:: 0.13.0
            Added *bonded* selection
-
+        .. versionchanged:: 0.16.0
+           Resid selection now takes icodes into account where present.
         """
         atomgrp = selection.Parser.parse(sel, selgroups).apply(self)
         if othersel:

diff --git a/package/MDAnalysis/core/selection.py b/package/MDAnalysis/core/selection.py
@@ -616,22 +616,137 @@ class AltlocSelection(StringSelection):
     field = 'altLocs'
 
 
-class RangeSelection(Selection):
+class ResidSelection(Selection):
     """Select atoms based on numerical fields
 
     Allows the use of ':' and '-' to specify a range of values
     For example
 
       resid 1:10
     """
+    token = 'resid'
+
     def __init__(self, parser, tokens):
         values = grab_not_keywords(tokens)
         if not values:
             raise ValueError("Unexpected token: '{0}'".format(tokens[0]))
 
+        # each value in uppers and lowers is a tuple of (resid, icode)
         uppers = []
         lowers = []
 
+        for val in values:
+            m1 = re.match("(\d+)(\w?)$", val)
+            if not m1 is None:
+                res = m1.groups()
+                lower = int(res[0]), res[1]
+                upper = None, None
+            else:
+                # check if in appropriate format 'lower:upper' or 'lower-upper'
+                # each val is one or more digits, maybe a letter
+                selrange = re.match("(\d+)(\w?)[:-](\d+)(\w?)", val)
+                if selrange is None:  # re.match returns None on failure
+                    raise ValueError("Failed to parse value: {0}".format(val))
+                res = selrange.groups()
+                # resid and icode
+                lower = int(res[0]), res[1]
+                upper = int(res[2]), res[3]
+
+            lowers.append(lower)
+            uppers.append(upper)
+
+        self.lowers = lowers
+        self.uppers = uppers
+
+    def apply(self, group):
+        # Grab arrays here to reduce number of calls to main topology
+        vals = group.resids
+        try:  # optional attribute
+            icodes = group.icodes
+        except (AttributeError, NoDataError):
+            icodes = None
+            # if no icodes and icodes are part of selection, cause a fuss
+            if (any(v[1] for v in self.uppers) or
+                any(v[1] for v in self.lowers)):
+                raise ValueError("Selection specified icodes, while the "
+                                 "topology doesn't have any.")
+
+        if not icodes is None:
+            mask = self._sel_with_icodes(vals, icodes)
+        else:
+            mask = self._sel_without_icodes(vals)
+
+        return unique(group[mask])
+
+    def _sel_without_icodes(self, vals):
+        # Final mask that gets applied to group
+        mask = np.zeros(len(vals), dtype=np.bool)
+
+        for (u_resid, _), (l_resid, _) in zip(self.uppers, self.lowers):
+            if u_resid is not None:  # range selection
+                thismask = vals >= l_resid
+                thismask &= vals <= u_resid
+            else:  # single residue selection
+                thismask = vals == l_resid
+
+            mask |= thismask
+
+        return mask
+
+    def _sel_with_icodes(self, vals, icodes):
+        # Final mask that gets applied to group
+        mask = np.zeros(len(vals), dtype=np.bool)
+
+        for (u_resid, u_icode), (l_resid, l_icode) in zip(self.uppers, self.lowers):
+            if u_resid is not None:  # Selecting a range
+                # Special case, if l_resid == u_resid, ie 163A-163C, this simplifies to:
+                # all 163, and A <= icode <= C
+                if l_resid == u_resid:
+                    thismask = vals == l_resid
+                    thismask &= icodes >= l_icode
+                    thismask &= icodes <= u_icode
+                # For 163A to 166B we want:
+                # [START]  all 163 and icode >= 'A'
+                # [MIDDLE] all of 164 and 165, any icode
+                # [END]    166 and icode <= 'B'
+                else:
+                    # start of range
+                    startmask = vals == l_resid
+                    startmask &= icodes >= l_icode
+                    thismask = startmask
+
+                    # middle of range
+                    mid = np.arange(l_resid + 1, u_resid)
+                    if len(mid):  # if there are any resids in the middle
+                        mid_beg, mid_end = mid[0], mid[-1]
+                        midmask = vals >= mid_beg
+                        midmask &= vals <= mid_end
+
+                        thismask |= midmask
+
+                    # end of range
+                    endmask = vals == u_resid
+                    endmask &= icodes <= u_icode
+
+                    thismask |= endmask
+            else:  # Selecting a single residue
+                thismask = vals == l_resid
+                thismask &= icodes == l_icode
+
+            mask |= thismask
+
+        return mask
+
+
+class RangeSelection(Selection):
+    def __init__(self, parser, tokens):
+        values = grab_not_keywords(tokens)
+        if not values:
+            raise ValueError("Unexpected token: '{0}'".format(tokens[0]))
+
+        uppers = []  # upper limit on any range
+        lowers = []  # lower limit on any range
+
         for val in values:
             try:
                 lower = int(val)
@@ -650,12 +765,13 @@ def __init__(self, parser, tokens):
         self.lowers = lowers
         self.uppers = uppers
 
-    def _get_vals(self, group):
-        return getattr(group, self.field)
+
+class ResnumSelection(RangeSelection):
+    token = 'resnum'
 
     def apply(self, group):
         mask = np.zeros(len(group), dtype=np.bool)
-        vals = self._get_vals(group)
+        vals = group.resnums
 
         for upper, lower in zip(self.uppers, self.lowers):
             if upper is not None:
@@ -668,23 +784,23 @@ def apply(self, group):
         return unique(group[mask])
 
 
-class ResidueIDSelection(RangeSelection):
-    token = 'resid'
-    field = 'resids'
-
+class ByNumSelection(RangeSelection):
+    token = 'bynum'
 
-class ResnumSelection(RangeSelection):
-    token = 'resnum'
-    field = 'resnums'
+    def apply(self, group):
+        mask = np.zeros(len(group), dtype=np.bool)
+        vals = group.indices + 1  # queries are in 1 based indices
 
+        for upper, lower in zip(self.uppers, self.lowers):
+            if upper is not None:
+                thismask = vals >= lower
+                thismask &= vals <= upper
+            else:
+                thismask = vals == lower
 
-class ByNumSelection(RangeSelection):
-    token = 'bynum'
+            mask |= thismask
+        return unique(group[mask])
 
-    def _get_vals(self, group):
-        # In this case we'll use 1 indexing since that's what the
-        # user will be familiar with
-        return group.indices + 1
 
 
 class ProteinSelection(Selection):

diff --git a/testsuite/MDAnalysisTests/core/test_atomselections.py b/testsuite/MDAnalysisTests/core/test_atomselections.py
@@ -45,6 +45,7 @@
     GRO, NUCL, NUCLsel, TPR, XTC,
     TRZ_psf, TRZ,
     PDB_full,
+    PDB_icodes,
 )
 from MDAnalysisTests.plugins.knownfailure import knownfailure
 from MDAnalysisTests import parser_not_found
@@ -846,3 +847,122 @@ def test_range_selections(self):
                 yield (self._check_sels,
                        ref.format(typ=seltype),
                        sel.format(typ=seltype))
+
+class TestICodeSelection(object):
+    def setUp(self):
+        self.u = mda.Universe(PDB_icodes)
+
+    def tearDown(self):
+        del self.u
+
+    def test_select_icode(self):
+        ag = self.u.select_atoms('resid 163A')
+
+        assert_(len(ag) == 7)
+        assert_array_equal(ag.ids, np.arange(7) + 1230)
+
+    def test_select_resid_implicit_icode(self):
+        ag = self.u.select_atoms('resid 163')
+
+        assert_(len(ag) == 6)
+        assert_array_equal(ag.ids, np.arange(6) + 1224)
+
+    def test_select_icode_range_1(self):
+        # testing range within a single resid integer value
+        u = self.u
+        ag = u.select_atoms('resid 163B-163D')
+
+        # do it manually without selection language...
+        ref = u.residues[u.residues.resids == 163]
+        ref = ref[(ref.icodes >= 'B') & (ref.icodes <= 'D')]
+        ref = ref.atoms
+
+        assert_array_equal(ag.ids, ref.ids)
+
+        assert_(len(ag) == 19)
+        assert_array_equal(ag.ids, np.arange(19) + 1237)
+
+    def test_select_icode_range_2(self):
+        u = self.u
+
+        ag = u.select_atoms('resid 163B-165')
+
+        resids = u.residues.resids
+
+        start = u.residues[resids == 163]
+        start = start[start.icodes >= 'B']
+
+        mid = u.residues[resids == 164]
+
+        end = u.residues[resids == 165]
+        end = end[end.icodes == '']
+
+        ref = start.atoms + mid.atoms + end.atoms
+
+        assert_array_equal(ag.ids, ref.ids)
+
+    def test_select_icode_range_3(self):
+        # same as #2 but with no "middle" icodes
+        u = self.u
+
+        ag = u.select_atoms('resid 163B-164')
+
+        resids = u.residues.resids
+
+        start = u.residues[resids == 163]
+        start = start[start.icodes >= 'B']
+
+        end = u.residues[resids == 164]
+        end = end[end.icodes == '']
+
+        ref = start.atoms + end.atoms
+
+        assert_array_equal(ag.ids, ref.ids)
+
+    def test_select_icode_range_4(self):
+        u = self.u
+
+        ag = u.select_atoms('resid 160-163G')
+
+        resids = u.residues.resids
+
+        start = u.residues[resids == 160]
+        start = start[start.icodes >= '']
+
+        mid = u.residues[(resids == 161) | (resids == 162)]
+
+        end = u.residues[resids == 163]
+        end = end[end.icodes <= 'G']
+
+        ref = start.atoms + mid.atoms + end.atoms
+
+        assert_array_equal(ag.ids, ref.ids)
+
+    def test_select_icode_range_5(self):
+        # same as #4 but with no "middle" icodes in range
+        u = self.u
+
+        ag = u.select_atoms('resid 162-163G')
+
+        resids = u.residues.resids
+
+        start = u.residues[resids == 162]
+        start = start[start.icodes >= '']
+
+        end = u.residues[resids == 163]
+        end = end[end.icodes <= 'G']
+
+        ref = start.atoms + end.atoms
+
+        assert_array_equal(ag.ids, ref.ids)
+
+    def test_missing_icodes_VE(self):
+        # trying a selection with icodes in a Universe without raises VA
+        u = make_Universe(('resids',))
+
+        assert_raises(ValueError, u.select_atoms, 'resid 10A')
+
+    def test_missing_icodes_range_VE(self):
+        u = make_Universe(('resids',))
+
+        assert_raises(ValueError, u.select_atoms, 'resid 10A-12')
diff --git a/testsuite/MDAnalysisTests/data/1osm.pdb.gz b/testsuite/MDAnalysisTests/data/1osm.pdb.gz
diff --git a/testsuite/MDAnalysisTests/datafiles.py b/testsuite/MDAnalysisTests/datafiles.py
@@ -45,6 +45,7 @@
     "PDB_conect",
     "PDB_conect2TER",  # Conect record to a TER entry (Issue 936)
     "PDB_singleconect",  # Conect record with one entry (Issue 937)
+    "PDB_icodes",  # stripped down version of 1osm, has icodes!
     "XPDB_small",
     "PDB_full",   # PDB 4E43 (full HEADER, TITLE, COMPND, REMARK, altloc)
     "ALIGN",  # Various way to align atom names in PDB files
@@ -176,6 +177,7 @@
 PDB_conect = resource_filename(__name__, 'data/conect_parsing.pdb')
 PDB_conect2TER = resource_filename(__name__, 'data/CONECT2TER.pdb')
 PDB_singleconect = resource_filename(__name__, 'data/SINGLECONECT.pdb')
+PDB_icodes = resource_filename(__name__, 'data/1osm.pdb.gz')
 
 GRO = resource_filename(__name__, 'data/adk_oplsaa.gro')
 GRO_velocity = resource_filename(__name__, 'data/sample_velocity_file.gro')