From 45e56e8314c278e3eb98ed7a6029b74e7435e8be Mon Sep 17 00:00:00 2001
From: Richard Gowers <richardjgowers@gmail.com>
Date: Tue, 25 Aug 2020 14:35:33 +0100
Subject: [PATCH] Faster name selections (#2755)

* modified AtomNames topologyattr to include lookup table index

* cheeky little optimisation

* rework atom name selection to use lookup tables

* Update topologyattrs.py

* fixed test supplying integer as atom name

really topologyattrs need to be statically typed and protective about this

* Update test_topologyattrs.py

* use dict-lookup string attrs EVERYWHERE

* removed some code duplication

made protein selection faster, 48ms -> 0.5ms on GRO testfile

* improved nucleic/backbone selections

* Added explicit tests for Resnames topologyattr

tests now provide str types for resnames/icodes

* use fnmatchcase to be case sensitive

* Update package/MDAnalysis/core/selection.py

@jbarnoud's fix

* apply suggestions from code review

Co-authored-by: Irfan Alibay <IAlibay@users.noreply.github.com>

* added test for setting multiple segids at once

Co-authored-by: Oliver Beckstein <orbeckst@gmail.com>
Co-authored-by: Irfan Alibay <IAlibay@users.noreply.github.com>
---
 package/CHANGELOG                             |   3 +
 package/MDAnalysis/core/selection.py          | 175 ++++++++++++---
 package/MDAnalysis/core/topologyattrs.py      | 207 +++++++++++++++---
 .../core/test_atomselections.py               |   2 +-
 .../MDAnalysisTests/core/test_segmentgroup.py |  18 ++
 .../core/test_topologyattrs.py                |  37 ++--
 6 files changed, 369 insertions(+), 73 deletions(-)

diff --git a/package/CHANGELOG b/package/CHANGELOG
index 9df19de4a28..5410e0f6153 100644
--- a/package/CHANGELOG
+++ b/package/CHANGELOG
@@ -45,6 +45,7 @@ Fixes
   * In hydrogenbonds.hbond_analysis.HydrogenbondAnalysis an AttributeError
     was thrown when finding D-H pairs via the topology if `hydrogens` was an
     empty AtomGroup (Issue #2848)
+  * Fixed performance regression on select_atoms for string selections (#2751)
   * Fixed the DMSParser, allowing the creation of multiple segids sharing
     residues with identical resids (Issue #1387, PR #2872)
   * H5MD files are now picklable with H5PYPicklable (Issue #2890, PR #2894)
@@ -79,6 +80,8 @@ Enhancements
   * Added new kwargs `select_remove` and `select_protein` to
     analysis.dihedrals.Janin analysis to give user more fine grained control
     over selections (PR #2899)
+  * Improved performance of select_atoms on strings (e.g. name, type, resname)
+    and 'protein' selection (Issue #2751, PR #2755)
   * Added an RDKit converter that works for any input with all hydrogens
     explicit in the topology (Issue #2468, PR #2775)
 
diff --git a/package/MDAnalysis/core/selection.py b/package/MDAnalysis/core/selection.py
index bdb156ff249..dc8747832ea 100644
--- a/package/MDAnalysis/core/selection.py
+++ b/package/MDAnalysis/core/selection.py
@@ -515,7 +515,7 @@ def apply(self, group):
         return group[mask]
 
 
-class StringSelection(Selection):
+class _ProtoStringSelection(Selection):
     """Selections based on text attributes
 
     .. versionchanged:: 1.0.0
@@ -530,11 +530,23 @@ def __init__(self, parser, tokens):
 
     @return_empty_on_apply
     def apply(self, group):
-        mask = np.zeros(len(group), dtype=bool)
-        for val in self.values:
-            values = getattr(group, self.field)
-            mask |= [fnmatch.fnmatch(x, val) for x in values]
-        return group[mask].unique
+        # rather than test each member's string, cheat and use the lookup table
+        nmattr = getattr(group.universe._topology, self.field)
+
+        matches = []  # list of passing lookup-table indices
+        # iterate through the attribute's known strings, check which pass
+        for nm, ix in nmattr.namedict.items():
+            if any(fnmatch.fnmatchcase(nm, val) for val in self.values):
+                matches.append(ix)
+
+        # lookup-table index of each group member, taken at self.level
+        nmidx = nmattr.nmidx[getattr(group, self.level)]
+
+        return group[np.in1d(nmidx, matches)].unique
+
+
+class StringSelection(_ProtoStringSelection):
+    level = 'ix'  # operates on atom level attribute, i.e. '.ix'
 
 
 class AtomNameSelection(StringSelection):
@@ -561,22 +573,27 @@ class AtomICodeSelection(StringSelection):
     field = 'icodes'
 
 
-class ResidueNameSelection(StringSelection):
+class _ResidueStringSelection(_ProtoStringSelection):
+    level = 'resindices'  # operates on residue level attribute
+
+
+class ResidueNameSelection(_ResidueStringSelection):
     """Select atoms based on 'resnames' attribute"""
     token = 'resname'
     field = 'resnames'
 
 
-class MoleculeTypeSelection(StringSelection):
+class MoleculeTypeSelection(_ResidueStringSelection):
     """Select atoms based on 'moltypes' attribute"""
     token = 'moltype'
     field = 'moltypes'
 
 
-class SegmentNameSelection(StringSelection):
+class SegmentNameSelection(_ProtoStringSelection):
     """Select atoms based on 'segids' attribute"""
     token = 'segid'
     field = 'segids'
+    level = 'segindices'
 
 
 class AltlocSelection(StringSelection):
@@ -802,10 +819,15 @@ class ProteinSelection(Selection):
     See Also
     --------
     :func:`MDAnalysis.lib.util.convert_aa_code`
+
+
+    .. versionchanged:: 2.0.0
+       prot_res changed to set (from numpy array)
+       performance improved by ~100x on larger systems
     """
     token = 'protein'
 
-    prot_res = np.array([
+    prot_res = {
         # CHARMM top_all27_prot_lipid.rtf
         'ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HSD',
         'HSE', 'HSP', 'ILE', 'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR',
@@ -828,14 +850,20 @@ class ProteinSelection(Selection):
         'CLEU', 'CILE', 'CVAL', 'CASF', 'CASN', 'CGLN', 'CARG', 'CHID', 'CHIE',
         'CHIP', 'CTRP', 'CPHE', 'CTYR', 'CGLU', 'CASP', 'CLYS', 'CPRO', 'CCYS',
         'CCYX', 'CMET', 'CME', 'ASF',
-    ])
+    }
 
     def __init__(self, parser, tokens):
         pass
 
     def apply(self, group):
-        mask = np.in1d(group.resnames, self.prot_res)
-        return group[mask].unique
+        resname_attr = group.universe._topology.resnames
+        # which values in resname attr are in prot_res?
+        matches = [ix for (nm, ix) in resname_attr.namedict.items()
+                   if nm in self.prot_res]
+        # index of each atom's resname
+        nmidx = resname_attr.nmidx[group.resindices]
+        # intersect atom's resname index and matches to prot_res
+        return group[np.in1d(nmidx, matches)].unique
 
 
 class NucleicSelection(Selection):
@@ -850,23 +878,32 @@ class NucleicSelection(Selection):
 
     .. versionchanged:: 0.8
        additional Gromacs selections
+    .. versionchanged:: 2.0.0
+       nucl_res changed to set (from numpy array)
+       performance improved by ~100x on larger systems
     """
     token = 'nucleic'
 
-    nucl_res = np.array([
+    nucl_res = {
         'ADE', 'URA', 'CYT', 'GUA', 'THY', 'DA', 'DC', 'DG', 'DT', 'RA',
         'RU', 'RG', 'RC', 'A', 'T', 'U', 'C', 'G',
         'DA5', 'DC5', 'DG5', 'DT5',
         'DA3', 'DC3', 'DG3', 'DT3',
         'RA5', 'RU5', 'RG5', 'RC5',
         'RA3', 'RU3', 'RG3', 'RC3'
-    ])
+    }
 
     def __init__(self, parser, tokens):
         pass
 
     def apply(self, group):
-        mask = np.in1d(group.resnames, self.nucl_res)
+        resnames = group.universe._topology.resnames
+        nmidx = resnames.nmidx[group.resindices]
+
+        matches = [ix for (nm, ix) in resnames.namedict.items()
+                   if nm in self.nucl_res]
+        mask = np.in1d(nmidx, matches)
+
         return group[mask].unique
 
 
@@ -875,14 +912,32 @@ class BackboneSelection(ProteinSelection):
 
     This excludes OT* on C-termini
     (which are included by, eg VMD's backbone selection).
+
+
+    .. versionchanged:: 2.0.0
+       bb_atoms changed to set (from numpy array)
+       performance improved by ~100x on larger systems
     """
     token = 'backbone'
-    bb_atoms = np.array(['N', 'CA', 'C', 'O'])
+    bb_atoms = {'N', 'CA', 'C', 'O'}
 
     def apply(self, group):
-        mask = np.in1d(group.names, self.bb_atoms)
-        mask &= np.in1d(group.resnames, self.prot_res)
-        return group[mask].unique
+        atomnames = group.universe._topology.names
+        resnames = group.universe._topology.resnames
+
+        # filter by atom names
+        name_matches = [ix for (nm, ix) in atomnames.namedict.items()
+                        if nm in self.bb_atoms]
+        nmidx = atomnames.nmidx[group.ix]
+        group = group[np.in1d(nmidx, name_matches)]
+
+        # filter by resnames
+        resname_matches = [ix for (nm, ix) in resnames.namedict.items()
+                           if nm in self.prot_res]
+        nmidx = resnames.nmidx[group.resindices]
+        group = group[np.in1d(nmidx, resname_matches)]
+
+        return group.unique
 
 
 class NucleicBackboneSelection(NucleicSelection):
@@ -890,14 +945,32 @@ class NucleicBackboneSelection(NucleicSelection):
 
     These atoms are only recognized if they are in a residue matched
     by the :class:`NucleicSelection`.
+
+
+    .. versionchanged:: 2.0.0
+       bb_atoms changed to set (from numpy array)
+       performance improved by ~100x on larger systems
     """
     token = 'nucleicbackbone'
-    bb_atoms = np.array(["P", "C5'", "C3'", "O3'", "O5'"])
+    bb_atoms = {"P", "C5'", "C3'", "O3'", "O5'"}
 
     def apply(self, group):
-        mask = np.in1d(group.names, self.bb_atoms)
-        mask &= np.in1d(group.resnames, self.nucl_res)
-        return group[mask].unique
+        atomnames = group.universe._topology.names
+        resnames = group.universe._topology.resnames
+
+        # filter by atom names
+        name_matches = [ix for (nm, ix) in atomnames.namedict.items()
+                        if nm in self.bb_atoms]
+        nmidx = atomnames.nmidx[group.ix]
+        group = group[np.in1d(nmidx, name_matches)]
+
+        # filter by resnames
+        resname_matches = [ix for (nm, ix) in resnames.namedict.items()
+                           if nm in self.nucl_res]
+        nmidx = resnames.nmidx[group.resindices]
+        group = group[np.in1d(nmidx, resname_matches)]
+
+        return group.unique
 
 
 class BaseSelection(NucleicSelection):
@@ -907,29 +980,65 @@ class BaseSelection(NucleicSelection):
 
      'N9', 'N7', 'C8', 'C5', 'C4', 'N3', 'C2', 'N1', 'C6',
      'O6','N2','N6', 'O2','N4','O4','C5M'
+
+
+    .. versionchanged:: 2.0.0
+       base_atoms changed to set (from numpy array)
+       performance improved by ~100x on larger systems
     """
     token = 'nucleicbase'
-    base_atoms = np.array([
+    base_atoms = {
         'N9', 'N7', 'C8', 'C5', 'C4', 'N3', 'C2', 'N1', 'C6',
         'O6', 'N2', 'N6',
-        'O2', 'N4', 'O4', 'C5M'])
+        'O2', 'N4', 'O4', 'C5M'}
 
     def apply(self, group):
-        mask = np.in1d(group.names, self.base_atoms)
-        mask &= np.in1d(group.resnames, self.nucl_res)
-        return group[mask].unique
+        atomnames = group.universe._topology.names
+        resnames = group.universe._topology.resnames
+
+        # filter by atom names
+        name_matches = [ix for (nm, ix) in atomnames.namedict.items()
+                        if nm in self.base_atoms]
+        nmidx = atomnames.nmidx[group.ix]
+        group = group[np.in1d(nmidx, name_matches)]
+
+        # filter by resnames
+        resname_matches = [ix for (nm, ix) in resnames.namedict.items()
+                           if nm in self.nucl_res]
+        nmidx = resnames.nmidx[group.resindices]
+        group = group[np.in1d(nmidx, resname_matches)]
+
+        return group.unique
 
 
 class NucleicSugarSelection(NucleicSelection):
     """Contains all atoms with name C1', C2', C3', C4', O2', O4', O3'.
+
+
+    .. versionchanged:: 2.0.0
+       sug_atoms changed to set (from numpy array)
+       performance improved by ~100x on larger systems
     """
     token = 'nucleicsugar'
-    sug_atoms = np.array(["C1'", "C2'", "C3'", "C4'", "O4'"])
+    sug_atoms = {"C1'", "C2'", "C3'", "C4'", "O4'"}
 
     def apply(self, group):
-        mask = np.in1d(group.names, self.sug_atoms)
-        mask &= np.in1d(group.resnames, self.nucl_res)
-        return group[mask].unique
+        atomnames = group.universe._topology.names
+        resnames = group.universe._topology.resnames
+
+        # filter by atom names
+        name_matches = [ix for (nm, ix) in atomnames.namedict.items()
+                        if nm in self.sug_atoms]
+        nmidx = atomnames.nmidx[group.ix]
+        group = group[np.in1d(nmidx, name_matches)]
+
+        # filter by resnames
+        resname_matches = [ix for (nm, ix) in resnames.namedict.items()
+                           if nm in self.nucl_res]
+        nmidx = resnames.nmidx[group.resindices]
+        group = group[np.in1d(nmidx, resname_matches)]
+
+        return group.unique
 
 
 class PropertySelection(Selection):
diff --git a/package/MDAnalysis/core/topologyattrs.py b/package/MDAnalysis/core/topologyattrs.py
index c600ada3eb1..a591f09cb3a 100644
--- a/package/MDAnalysis/core/topologyattrs.py
+++ b/package/MDAnalysis/core/topologyattrs.py
@@ -473,8 +473,65 @@ def _gen_initial_values(na, nr, ns):
         return np.arange(1, na + 1)
 
 
+class _AtomStringAttr(AtomAttr):
+    """Atom-level string attribute backed by a str <-> index lookup table."""
+    def __init__(self, vals, guessed=False):
+        self._guessed = guessed
+        self.namedict = dict()  # maps str to nmidx
+        name_lookup = []  # maps idx to str
+        # eg namedict['O'] = 5 & name_lookup[5] = 'O'
+
+        self.nmidx = np.zeros_like(vals, dtype=int)  # the lookup for each atom
+        # eg Atom 5 is 'C', so nmidx[5] = 7, where name_lookup[7] = 'C'
+
+        for i, val in enumerate(vals):
+            try:
+                self.nmidx[i] = self.namedict[val]
+            except KeyError:  # first time this string is seen
+                nextidx = len(self.namedict)
+                self.namedict[val] = nextidx
+                name_lookup.append(val)
+
+                self.nmidx[i] = nextidx
+
+        self.name_lookup = np.array(name_lookup, dtype=object)
+        self.values = self.name_lookup[self.nmidx]  # per-atom strings
+
+    @staticmethod
+    def _gen_initial_values(na, nr, ns):  # default: '' for every atom
+        return np.array(['' for _ in range(na)], dtype=object)
+
+    @_check_length
+    def set_atoms(self, ag, values):
+        newnames = []  # strings not yet present in the lookup table
+
+        # two possibilities, either single value given, or one per Atom
+        if isinstance(values, str):
+            try:
+                newidx = self.namedict[values]
+            except KeyError:  # unseen string, give it the next free index
+                newidx = len(self.namedict)
+                self.namedict[values] = newidx
+                newnames.append(values)
+        else:
+            newidx = np.zeros_like(values, dtype=int)
+            for i, val in enumerate(values):
+                try:
+                    newidx[i] = self.namedict[val]
+                except KeyError:  # unseen string, give it the next free index
+                    nextidx = len(self.namedict)
+                    self.namedict[val] = nextidx
+                    newnames.append(val)
+                    newidx[i] = nextidx
+
+        self.nmidx[ag.ix] = newidx  # newidx either single value or same size array
+        if newnames:
+            self.name_lookup = np.concatenate([self.name_lookup, newnames])
+        self.values = self.name_lookup[self.nmidx]  # rebuild per-atom strings
+
+
 # TODO: update docs to property doc
-class Atomnames(AtomAttr):
+class Atomnames(_AtomStringAttr):
     """Name for each atom.
     """
     attrname = 'names'
@@ -483,10 +540,6 @@ class Atomnames(AtomAttr):
     dtype = object
     transplants = defaultdict(list)
 
-    @staticmethod
-    def _gen_initial_values(na, nr, ns):
-        return np.array(['' for _ in range(na)], dtype=object)
-
     def phi_selection(residue, c_name='C', n_name='N', ca_name='CA'):
         """Select AtomGroup corresponding to the phi protein backbone dihedral
         C'-N-CA-C.
@@ -958,20 +1011,16 @@ def chi1_selections(residues, n_name='N', ca_name='CA', cb_name='CB',
 
 
 # TODO: update docs to property doc
-class Atomtypes(AtomAttr):
+class Atomtypes(_AtomStringAttr):
     """Type for each atom"""
     attrname = 'types'
     singular = 'type'
     per_object = 'atom'
     dtype = object
 
-    @staticmethod
-    def _gen_initial_values(na, nr, ns):
-        return np.array(['' for _ in range(na)], dtype=object)
-
 
 # TODO: update docs to property doc
-class Elements(AtomAttr):
+class Elements(_AtomStringAttr):
     """Element for each atom"""
     attrname = 'elements'
     singular = 'element'
@@ -995,7 +1044,7 @@ def _gen_initial_values(na, nr, ns):
         return np.zeros(na)
 
 
-class RecordTypes(AtomAttr):
+class RecordTypes(_AtomStringAttr):
     """For PDB-like formats, indicates if ATOM or HETATM
 
     Defaults to 'ATOM'
@@ -1013,7 +1062,7 @@ def _gen_initial_values(na, nr, ns):
         return np.array(['ATOM'] * na, dtype=object)
 
 
-class ChainIDs(AtomAttr):
+class ChainIDs(_AtomStringAttr):
     """ChainID per atom
 
     Note
@@ -1025,10 +1074,6 @@ class ChainIDs(AtomAttr):
     per_object = 'atom'
     dtype = object
 
-    @staticmethod
-    def _gen_initial_values(na, nr, ns):
-        return np.array(['' for _ in range(na)], dtype=object)
-
 
 class Tempfactors(AtomAttr):
     """Tempfactor for atoms"""
@@ -1574,7 +1619,7 @@ def _gen_initial_values(na, nr, ns):
 
 
 # TODO: update docs to property doc
-class AltLocs(AtomAttr):
+class AltLocs(_AtomStringAttr):
     """AltLocs for each atom"""
     attrname = 'altLocs'
     singular = 'altLoc'
@@ -1727,8 +1772,65 @@ def _gen_initial_values(na, nr, ns):
         return np.arange(1, nr + 1)
 
 
+class _ResidueStringAttr(ResidueAttr):
+    """Residue-level string attribute backed by a str <-> index lookup table."""
+    def __init__(self, vals, guessed=False):
+        self._guessed = guessed
+        self.namedict = dict()  # maps str to nmidx
+        name_lookup = []  # maps idx to str
+        # eg namedict['ALA'] = 5 & name_lookup[5] = 'ALA'
+
+        self.nmidx = np.zeros_like(vals, dtype=int)  # the lookup for each residue
+        # eg Residue 5 is 'GLY', so nmidx[5] = 7, where name_lookup[7] = 'GLY'
+
+        for i, val in enumerate(vals):
+            try:
+                self.nmidx[i] = self.namedict[val]
+            except KeyError:  # first time this string is seen
+                nextidx = len(self.namedict)
+                self.namedict[val] = nextidx
+                name_lookup.append(val)
+
+                self.nmidx[i] = nextidx
+
+        self.name_lookup = np.array(name_lookup, dtype=object)
+        self.values = self.name_lookup[self.nmidx]  # per-residue strings
+
+    @staticmethod
+    def _gen_initial_values(na, nr, ns):  # default: '' for every residue
+        return np.array(['' for _ in range(nr)], dtype=object)
+
+    @_check_length
+    def set_residues(self, rg, values):
+        newnames = []  # strings not yet present in the lookup table
+
+        # two possibilities, either single value given, or one per Residue
+        if isinstance(values, str):
+            try:
+                newidx = self.namedict[values]
+            except KeyError:  # unseen string, give it the next free index
+                newidx = len(self.namedict)
+                self.namedict[values] = newidx
+                newnames.append(values)
+        else:
+            newidx = np.zeros_like(values, dtype=int)
+            for i, val in enumerate(values):
+                try:
+                    newidx[i] = self.namedict[val]
+                except KeyError:  # unseen string, give it the next free index
+                    nextidx = len(self.namedict)
+                    self.namedict[val] = nextidx
+                    newnames.append(val)
+                    newidx[i] = nextidx
+
+        self.nmidx[rg.ix] = newidx  # newidx either single value or same size array
+        if newnames:
+            self.name_lookup = np.concatenate([self.name_lookup, newnames])
+        self.values = self.name_lookup[self.nmidx]  # rebuild per-residue strings
+
+
 # TODO: update docs to property doc
-class Resnames(ResidueAttr):
+class Resnames(_ResidueStringAttr):
     attrname = 'resnames'
     singular = 'resname'
     transplants = defaultdict(list)
@@ -1847,18 +1949,14 @@ def _gen_initial_values(na, nr, ns):
         return np.arange(1, nr + 1)
 
 
-class ICodes(ResidueAttr):
+class ICodes(_ResidueStringAttr):
     """Insertion code for Atoms"""
     attrname = 'icodes'
     singular = 'icode'
     dtype = object
 
-    @staticmethod
-    def _gen_initial_values(na, nr, ns):
-        return np.array(['' for _ in range(nr)], dtype=object)
-
 
-class Moltypes(ResidueAttr):
+class Moltypes(_ResidueStringAttr):
     """Name of the molecule type
 
     Two molecules that share a molecule type share a common template topology.
@@ -1910,8 +2008,65 @@ def set_segments(self, sg, values):
         self.values[sg.ix] = values
 
 
+class _SegmentStringAttr(SegmentAttr):
+    """Segment-level string attribute backed by a str <-> index lookup table."""
+    def __init__(self, vals, guessed=False):
+        self._guessed = guessed
+        self.namedict = dict()  # maps str to nmidx
+        name_lookup = []  # maps idx to str
+        # eg namedict['A'] = 5 & name_lookup[5] = 'A'
+
+        self.nmidx = np.zeros_like(vals, dtype=int)  # the lookup for each segment
+        # eg Segment 5 is 'B', so nmidx[5] = 7, where name_lookup[7] = 'B'
+
+        for i, val in enumerate(vals):
+            try:
+                self.nmidx[i] = self.namedict[val]
+            except KeyError:  # first time this string is seen
+                nextidx = len(self.namedict)
+                self.namedict[val] = nextidx
+                name_lookup.append(val)
+
+                self.nmidx[i] = nextidx
+
+        self.name_lookup = np.array(name_lookup, dtype=object)
+        self.values = self.name_lookup[self.nmidx]  # per-segment strings
+
+    @staticmethod
+    def _gen_initial_values(na, nr, ns):  # default: '' for every segment
+        return np.array(['' for _ in range(ns)], dtype=object)  # ns, not nr
+
+    @_check_length
+    def set_segments(self, sg, values):
+        newnames = []  # strings not yet present in the lookup table
+
+        # two possibilities, either single value given, or one per Segment
+        if isinstance(values, str):
+            try:
+                newidx = self.namedict[values]
+            except KeyError:  # unseen string, give it the next free index
+                newidx = len(self.namedict)
+                self.namedict[values] = newidx
+                newnames.append(values)
+        else:
+            newidx = np.zeros_like(values, dtype=int)
+            for i, val in enumerate(values):
+                try:
+                    newidx[i] = self.namedict[val]
+                except KeyError:  # unseen string, give it the next free index
+                    nextidx = len(self.namedict)
+                    self.namedict[val] = nextidx
+                    newnames.append(val)
+                    newidx[i] = nextidx
+
+        self.nmidx[sg.ix] = newidx  # newidx either single value or same size array
+        if newnames:
+            self.name_lookup = np.concatenate([self.name_lookup, newnames])
+        self.values = self.name_lookup[self.nmidx]  # rebuild per-segment strings
+
+        
 # TODO: update docs to property doc
-class Segids(SegmentAttr):
+class Segids(_SegmentStringAttr):
     attrname = 'segids'
     singular = 'segid'
     transplants = defaultdict(list)
diff --git a/testsuite/MDAnalysisTests/core/test_atomselections.py b/testsuite/MDAnalysisTests/core/test_atomselections.py
index b7d0f515b7f..4042bb84820 100644
--- a/testsuite/MDAnalysisTests/core/test_atomselections.py
+++ b/testsuite/MDAnalysisTests/core/test_atomselections.py
@@ -74,7 +74,7 @@ def test_protein(self, universe):
                      sorted(universe.select_atoms('segid 4AKE').indices),
                      "selected protein is not the same as auto-generated protein segment s4AKE")
 
-    @pytest.mark.parametrize('resname', MDAnalysis.core.selection.ProteinSelection.prot_res)
+    @pytest.mark.parametrize('resname', sorted(MDAnalysis.core.selection.ProteinSelection.prot_res))
     def test_protein_resnames(self, resname):
         u = make_Universe(('resnames',))
         # set half the residues' names to the resname we're testing
diff --git a/testsuite/MDAnalysisTests/core/test_segmentgroup.py b/testsuite/MDAnalysisTests/core/test_segmentgroup.py
index 3f0c251e543..546c5ad44cf 100644
--- a/testsuite/MDAnalysisTests/core/test_segmentgroup.py
+++ b/testsuite/MDAnalysisTests/core/test_segmentgroup.py
@@ -88,6 +88,24 @@ def test_set_segid_updates_(universe):
                  err_msg="old selection was not changed in place after set_segid")
 
 
+def test_set_segids_many():
+    """Setting one segid per segment updates values and selections."""
+    u = mda.Universe.empty(n_atoms=6, n_residues=2, n_segments=2,
+                           atom_resindex=[0, 0, 0, 1, 1, 1], residue_segindex=[0,1])
+    u.add_TopologyAttr('segids', ['A', 'B'])
+    # universe with 2 segments, A and B, of 3 atoms each
+
+    u.segments.segids = ['X', 'Y']
+
+    assert u.segments[0].segid == 'X'
+    assert u.segments[1].segid == 'Y'
+
+    assert len(u.select_atoms('segid A')) == 0
+    assert len(u.select_atoms('segid B')) == 0
+    assert len(u.select_atoms('segid X')) == 3
+    assert len(u.select_atoms('segid Y')) == 3
+
+
 def test_atom_order(universe):
     assert_equal(universe.segments.atoms.indices,
                  sorted(universe.segments.atoms.indices))
diff --git a/testsuite/MDAnalysisTests/core/test_topologyattrs.py b/testsuite/MDAnalysisTests/core/test_topologyattrs.py
index 6fa082b3b3f..270491514af 100644
--- a/testsuite/MDAnalysisTests/core/test_topologyattrs.py
+++ b/testsuite/MDAnalysisTests/core/test_topologyattrs.py
@@ -93,6 +93,7 @@ class TestAtomAttr(TopologyAttrMixin):
 
     """
     values = np.array([7, 3, 69, 9993, 84, 194, 263, 501, 109, 5873])
+    single_value = 567
     attrclass = tpattrs.AtomAttr
 
     def test_set_atom_VE(self):
@@ -112,8 +113,9 @@ def test_get_atoms(self, attr):
     def test_set_atoms_singular(self, attr):
         # set len 2 Group to len 1 value
         dg = DummyGroup([3, 7])
-        attr.set_atoms(dg, 567)
-        assert_equal(attr.get_atoms(dg), np.array([567, 567]))
+        attr.set_atoms(dg, self.single_value)
+        assert_equal(attr.get_atoms(dg),
+                     np.array([self.single_value, self.single_value]))
 
     def test_set_atoms_plural(self, attr):
         # set len 2 Group to len 2 values
@@ -175,6 +177,7 @@ def test_cant_set_segment_indices(self, u):
 class TestAtomnames(TestAtomAttr):
     values = np.array(['O', 'C', 'CA', 'N', 'CB', 'CG', 'CD', 'NA', 'CL', 'OW'],
                       dtype=np.object)
+    single_value = 'Ca2'
     attrclass = tpattrs.Atomnames
 
 
@@ -206,18 +209,19 @@ class TestResidueAttr(TopologyAttrMixin):
     """Test residue-level TopologyAttrs.
 
     """
+    single_value = 2
     values = np.array([15.2, 395.6, 0.1, 9.8])
     attrclass = tpattrs.ResidueAttr
 
-    def test_set_residue_VE(self):
-        u = make_Universe(('resnames',))
-        res = u.residues[0]
+    def test_set_residue_VE(self, universe):
+        # setting e.g. resname to 2 values should fail with VE
+        res = universe.residues[0]
         with pytest.raises(ValueError):
-            setattr(res, 'resname', ['wrong', 'length'])
+            setattr(res, self.attrclass.singular, self.values[:2])
 
     def test_get_atoms(self, attr):
         assert_equal(attr.get_atoms(DummyGroup([7, 3, 9])),
-                           self.values[[3, 2, 2]])
+                     self.values[[3, 2, 2]])
 
     def test_get_atom(self, universe):
         attr = getattr(universe.atoms[0], self.attrclass.singular)
@@ -225,14 +229,14 @@ def test_get_atom(self, universe):
 
     def test_get_residues(self, attr):
         assert_equal(attr.get_residues(DummyGroup([1, 2, 1, 3])),
-                           self.values[[1, 2, 1, 3]])
+                     self.values[[1, 2, 1, 3]])
 
     def test_set_residues_singular(self, attr):
         dg = DummyGroup([3, 0, 1])
-        attr.set_residues(dg, 2)
+        attr.set_residues(dg, self.single_value)
 
-        assert_almost_equal(attr.get_residues(dg),
-                                  np.array([2, 2, 2]))
+        assert_equal(attr.get_residues(dg),
+                     np.array([self.single_value]*3, dtype=self.values.dtype))
 
     def test_set_residues_plural(self, attr):
         attr.set_residues(DummyGroup([3, 0, 1]),
@@ -254,10 +258,17 @@ def test_get_segments(self, attr):
         assert_equal(attr.get_segments(DummyGroup([0, 1, 1])),
                            [self.values[[0, 3]], self.values[[1, 2]], self.values[[1, 2]]])
 
-class TestICodes(TestResidueAttr):
-    values = np.array(['a', 'b', '', 'd'])
+
+class TestResnames(TestResidueAttr):
+    attrclass = tpattrs.Resnames
+    single_value = 'xyz'
+    values = np.array(['a', 'b', '', 'd'], dtype=object)
+
+
+class TestICodes(TestResnames):
     attrclass = tpattrs.ICodes
 
+
 class TestResids(TestResidueAttr):
     values = np.array([10, 11, 18, 20])
     attrclass = tpattrs.Resids