MDAnalysis · richardjgowers · Jul 25, 2022 · Jun 19, 2022 · Jun 19, 2022 · Jun 19, 2022
diff --git a/package/CHANGELOG b/package/CHANGELOG
@@ -34,6 +34,8 @@ Enhancements
   * Additional logger.info output when per-frame analysis starts (PR #3710)
   * Timestep has been converted to a Cython Extension type
     (CZI Performance track, PR #3683)
+  * Refactor make_downshift_arrays with vector operation in numpy(PR #3724)
+  * Optimize topology serialization/construction by lazy-building _RA and _SR.
 
 Changes
   * To add additional optional packages for different file formats when

diff --git a/package/MDAnalysis/core/topology.py b/package/MDAnalysis/core/topology.py
@@ -64,16 +64,6 @@
 from ..exceptions import NoDataError
 
 
-# TODO Notes:
-#   Could make downshift tables lazily built! This would
-#     a) Make these not get built when not used
-#     b) Optimise moving multiple atoms between residues as only built once
-#     afterwards
-
-#   Could optimise moves by only updating the two parent tables rather than
-#   rebuilding everything!
-
-
 def make_downshift_arrays(upshift, nparents):
     """From an upwards translation table, create the opposite direction
 
@@ -99,7 +89,7 @@ def make_downshift_arrays(upshift, nparents):
     To find the residue to atom mappings for a given atom to residue mapping:
 
     >>> atom2res = np.array([0, 1, 0, 2, 2, 0, 2])
-    >>> make_downshift_arrays(atom2res)
+    >>> make_downshift_arrays(atom2res, 3)
     array([array([0, 2, 5]), array([1]), array([3, 4, 6]), None], dtype=object)
 
     Entry 0 corresponds to residue 0 and says that this contains atoms 0, 2 & 5
@@ -115,26 +105,28 @@ def make_downshift_arrays(upshift, nparents):
     if not len(upshift):
         return np.array([], dtype=object)
 
-    order = np.argsort(upshift)
+    # mergesort for a stable ordered array for the same value.
+    order = np.argsort(upshift, kind="mergesort")
 
     upshift_sorted = upshift[order]
-    borders = [None] + list(np.nonzero(np.diff(upshift_sorted))[0] + 1) + [None]
-
-    # returns an array of arrays
-    downshift = []
-    counter = -1
-    # don't use enumerate, we modify counter in place
-    for x, y in zip(borders[:-1], borders[1:]):
-        counter += 1
-        # If parent is skipped, eg (0, 0, 2, 2, etc)
-        while counter != upshift[order[x:y][0]]:
-            downshift.append(np.array([], dtype=np.intp))
-            counter += 1
-        downshift.append(np.sort(np.array(order[x:y], copy=True, dtype=np.intp)))
-    # Add entries for childless parents at end of range
-    while counter < (nparents - 1):
-        downshift.append(np.array([], dtype=np.intp))
-        counter += 1
+    u_values, indices = np.unique(upshift_sorted, return_index=True)
+
+    # reset nparents to the larger one between input and heuristic from data
+    # This is useful for creating empty Universe where default value is 1.
+    nparents = np.max([nparents, u_values.max()+1])
+    residue_indices = np.zeros(nparents, dtype=int)
+    missing_resids = np.sort(np.setdiff1d(np.arange(nparents), u_values))
+    indices = np.append(indices, upshift_sorted.shape[0])
+
+    residue_indices[u_values] = indices[1:]
+
+    for missing_resid in missing_resids:
+        if missing_resid == 0:
+            residue_indices[missing_resid] = 0
+        else:
+            residue_indices[missing_resid] = residue_indices[missing_resid-1]
+
+    downshift = np.split(order, residue_indices[:-1])
     # Add None to end of array to force it to be of type Object
     # Without this, a rectangular array gets squashed into a single array
     downshift.append(None)
@@ -181,6 +173,9 @@ class TransTable(object):
     size : tuple
         tuple ``(n_atoms, n_residues, n_segments)`` describing the shape of
         the TransTable
+
+    .. versionchanged:: 2.3.0
+        Lazy building RA and SR.
     """
     def __init__(self,
                  n_atoms, n_residues, n_segments,  # Size of tables
@@ -197,7 +192,7 @@ def __init__(self,
             self._AR = np.asarray(atom_resindex, dtype=np.intp).copy()
             if not len(self._AR) == n_atoms:
                 raise ValueError("atom_resindex must be len n_atoms")
-        self._RA = make_downshift_arrays(self._AR, n_residues)
+        self._RA = None
 
         # built residue-to-segment mapping, and vice-versa
         if residue_segindex is None:
@@ -206,13 +201,27 @@ def __init__(self,
             self._RS = np.asarray(residue_segindex, dtype=np.intp).copy()
             if not len(self._RS) == n_residues:
                 raise ValueError("residue_segindex must be len n_residues")
-        self._SR = make_downshift_arrays(self._RS, n_segments)
+        self._SR = None
 
     def copy(self):
         """Return a deepcopy of this Transtable"""
         return self.__class__(self.n_atoms, self.n_residues, self.n_segments,
                               atom_resindex=self._AR, residue_segindex=self._RS)
 
+    @property
+    def RA(self):
+        if self._RA is None:
+            self._RA = make_downshift_arrays(self._AR,
+                                             self.n_residues)
+        return self._RA
+
+    @property
+    def SR(self):
+        if self._SR is None:
+            self._SR = make_downshift_arrays(self._RS,
+                                             self.n_segments)
+        return self._SR
+
     @property
     def size(self):
         """The shape of the table, ``(n_atoms, n_residues, n_segments)``.
@@ -251,11 +260,12 @@ def residues2atoms_1d(self, rix):
             indices of atoms present in residues, collectively
 
         """
+        RA = self.RA
         try:
-            return np.concatenate(self._RA[rix])
+            return np.concatenate(RA[rix])
         except ValueError:  # rix is not iterable or empty
             # don't accidentally return a view!
-            return self._RA[rix].astype(np.intp, copy=True)
+            return RA[rix].astype(np.intp, copy=True)
 
     def residues2atoms_2d(self, rix):
         """Get atom indices represented by each residue index.
@@ -273,10 +283,11 @@ def residues2atoms_2d(self, rix):
             in that residue
 
         """
+        RA = self.RA
         try:
-            return [self._RA[r].copy() for r in rix]
+            return [RA[r].copy() for r in rix]
         except TypeError:
-            return [self._RA[rix].copy()]  # why would this be singular for 2d?
+            return [RA[rix].copy()]  # why would this be singular for 2d?
 
     def residues2segments(self, rix):
         """Get segment indices for each residue.
@@ -308,11 +319,12 @@ def segments2residues_1d(self, six):
             sorted indices of residues present in segments, collectively
 
         """
+        SR = self.SR
         try:
-            return np.concatenate(self._SR[six])
+            return np.concatenate(SR[six])
         except ValueError:  # six is not iterable or empty
             # don't accidentally return a view!
-            return self._SR[six].astype(np.intp, copy=True)
+            return SR[six].astype(np.intp, copy=True)
 
     def segments2residues_2d(self, six):
         """Get residue indices represented by each segment index.
@@ -330,10 +342,11 @@ def segments2residues_2d(self, six):
             present in that segment
 
         """
+        SR = self.SR
         try:
-            return [self._SR[s].copy() for s in six]
+            return [SR[s].copy() for s in six]
         except TypeError:
-            return [self._SR[six].copy()]
+            return [SR[six].copy()]
 
     # Compound moves, does 2 translations
     def atoms2segments(self, aix):
@@ -394,29 +407,35 @@ def segments2atoms_2d(self, six):
     def move_atom(self, aix, rix):
         """Move aix to be in rix"""
         self._AR[aix] = rix
-        self._RA = make_downshift_arrays(self._AR, self.n_residues)
+        self._RA = None
 
     def move_residue(self, rix, six):
         """Move rix to be in six"""
         self._RS[rix] = six
-        self._SR = make_downshift_arrays(self._RS, self.n_segments)
+        self._SR = None
 
     def add_Residue(self, segidx):
         # segidx - index of parent
         self.n_residues += 1
-        self._RA = make_downshift_arrays(self._AR, self.n_residues)
+        self._RA = None
         self._RS = np.concatenate([self._RS, np.array([segidx])])
-        self._SR = make_downshift_arrays(self._RS, self.n_segments)
+        self._SR = None
+
 
         return self.n_residues - 1
 
     def add_Segment(self):
         self.n_segments += 1
-        # self._RS remains the same, no residues point to the new segment yet
-        self._SR = make_downshift_arrays(self._RS, self.n_segments)
-
+        self._SR = None
         return self.n_segments - 1
 
+    def __getstate__(self):
+        # don't serialize _RA and _SR for performance.
+        attrs = self.__dict__
+        attrs['_RA'] = None
+        attrs['_SR'] = None
+        return attrs
+
 
 class Topology(object):
     """In-memory, array-based topology database.

diff --git a/testsuite/MDAnalysisTests/core/test_copying.py b/testsuite/MDAnalysisTests/core/test_copying.py
@@ -57,7 +57,7 @@ def test_size_independent(self, refTT):
         refTT.n_atoms = -10
         assert new.n_atoms == old
 
-    @pytest.mark.parametrize('attr', ['_AR', '_RA', '_RS', '_SR'])
+    @pytest.mark.parametrize('attr', ['_AR', 'RA', '_RS', 'SR'])
     def test_AR(self, refTT, attr):
         new = refTT.copy()
         ref = getattr(refTT, attr)
@@ -66,7 +66,7 @@ def test_AR(self, refTT, attr):
         for a, b in zip(ref, other):
             assert_equal(a, b)
 
-    @pytest.mark.parametrize('attr', ['_AR', '_RA', '_RS', '_SR'])
+    @pytest.mark.parametrize('attr', ['_AR', 'RA', '_RS', 'SR'])
     def test_AR_independent(self, refTT, attr):
         new = refTT.copy()
         ref = getattr(refTT, attr)

diff --git a/testsuite/MDAnalysisTests/core/test_topology.py b/testsuite/MDAnalysisTests/core/test_topology.py
@@ -9,6 +9,7 @@
 )
 import pytest
 import numpy as np
+import pickle
 
 from MDAnalysisTests import make_Universe
 
@@ -23,6 +24,11 @@
 import MDAnalysis
 
 
+def assert_rows_match(a, b):
+    for row_a, row_b in zip(a, b):
+        assert_equal(row_a, row_b)
+
+
 class TestTransTable(object):
     @pytest.fixture()
     def tt(self):
@@ -144,6 +150,39 @@ def test_move_residue_simple(self, tt):
         assert_equal(len(tt.segments2residues_1d(0)), 3)
         assert_equal(len(tt.segments2residues_1d(1)), 1)
 
+    def test_lazy_building_RA(self, tt):
+        assert_equal(tt._RA, None)
+        RA = tt.RA
+        assert_rows_match(tt.RA,
+                          np.array([np.array([0, 1]),
+                                    np.array([4, 5, 8]),
+                                    np.array([2, 3, 9]),
+                                    np.array([6, 7]),
+                                    None], dtype=object))
+
+        tt.move_atom(1, 3)
+        assert_equal(tt._RA, None)
+
+    def test_lazy_building_SR(self, tt):
+        assert_equal(tt._SR, None)
+        SR = tt.SR
+        assert_rows_match(tt.SR,
+                          np.array([np.array([0, 3]),
+                                    np.array([1, 2]),
+                                    None], dtype=object))
+
+        tt.move_residue(1, 0)
+        assert_equal(tt._SR, None)
+
+    def test_serialization(self, tt):
+        _ = tt.RA
+        _ = tt.SR
+        tt_loaded = pickle.loads(pickle.dumps(tt))
+        assert_equal(tt_loaded._RA, None)
+        assert_equal(tt_loaded._SR, None)
+        assert_rows_match(tt_loaded.RA, tt.RA)
+        assert_rows_match(tt_loaded.SR, tt.SR)
+
 
 class TestLevelMoves(object):
     """Tests for moving atoms/residues between residues/segments
@@ -467,11 +506,6 @@ def ragged_result(self):
         return np.array([[0, 4, 7], [1, 5, 8], [2, 3, 6, 9]],
                         dtype=object)
 
-    @staticmethod
-    def assert_rows_match(a, b):
-        for row_a, row_b in zip(a, b):
-            assert_equal(row_a, row_b)
-
     # The array as a whole must be dtype object
     # While the subarrays must be integers
     def test_downshift_dtype_square(self, square, square_size):
@@ -498,16 +532,16 @@ def test_shape_ragged(self, ragged, ragged_size):
 
     def test_contents_square(self, square, square_size, square_result):
         out = make_downshift_arrays(square, square_size)
-        self.assert_rows_match(out, square_result)
+        assert_rows_match(out, square_result)
 
     def test_contents_ragged(self, ragged, ragged_size, ragged_result):
         out = make_downshift_arrays(ragged, ragged_size)
-        self.assert_rows_match(out, ragged_result)
+        assert_rows_match(out, ragged_result)
 
     def test_missing_intra_values(self):
         out = make_downshift_arrays(
             np.array([0, 0, 2, 2, 3, 3]), 4)
-        self.assert_rows_match(out,
+        assert_rows_match(out,
                                np.array([np.array([0, 1]),
                                          np.array([], dtype=int),
                                          np.array([2, 3]),
@@ -517,7 +551,7 @@ def test_missing_intra_values(self):
     def test_missing_intra_values_2(self):
         out = make_downshift_arrays(
             np.array([0, 0, 3, 3, 4, 4]), 5)
-        self.assert_rows_match(out,
+        assert_rows_match(out,
                                np.array([np.array([0, 1]),
                                          np.array([], dtype=int),
                                          np.array([], dtype=int),
@@ -527,7 +561,7 @@ def test_missing_intra_values_2(self):
 
     def test_missing_end_values(self):
         out = make_downshift_arrays(np.array([0, 0, 1, 1, 2, 2]), 4)
-        self.assert_rows_match(out,
+        assert_rows_match(out,
                                np.array([np.array([0, 1]),
                                          np.array([2, 3]),
                                          np.array([4, 5]),
@@ -536,14 +570,22 @@ def test_missing_end_values(self):
 
     def test_missing_end_values_2(self):
         out = make_downshift_arrays(np.array([0, 0, 1, 1, 2, 2]), 6)
-        self.assert_rows_match(out,
+        assert_rows_match(out,
                                np.array([np.array([0, 1]),
                                          np.array([2, 3]),
                                          np.array([4, 5]),
                                          np.array([], dtype=int),
                                          np.array([], dtype=int),
                                          None], dtype=object))
 
+    def test_missing_start_values_2(self):
+        out = make_downshift_arrays(np.array([1, 1, 2, 2, 3, 3]), 4)
+        assert_rows_match(out,
+                          np.array([np.array([], dtype=int),
+                                    np.array([0, 1]),
+                                    np.array([2, 3]),
+                                    np.array([4, 5]),
+                                    None], dtype=object))
 
 class TestAddingResidues(object):
     """Tests for adding residues and segments to a Universe