From 71920b1cd9a60171bb2790fcc1ec5ed9abff17d3 Mon Sep 17 00:00:00 2001 From: Yuxuan Zhuang Date: Sun, 19 Jun 2022 23:51:22 +0200 Subject: [PATCH 1/7] refactor topology serialization --- package/MDAnalysis/core/topology.py | 46 ++++++++++++++++++----------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/package/MDAnalysis/core/topology.py b/package/MDAnalysis/core/topology.py index 2a719e443e8..898e41ad538 100644 --- a/package/MDAnalysis/core/topology.py +++ b/package/MDAnalysis/core/topology.py @@ -115,26 +115,21 @@ def make_downshift_arrays(upshift, nparents): if not len(upshift): return np.array([], dtype=object) - order = np.argsort(upshift) + upshift = np.array(upshift) + order = np.argsort(upshift, kind="mergesort") upshift_sorted = upshift[order] - borders = [None] + list(np.nonzero(np.diff(upshift_sorted))[0] + 1) + [None] - - # returns an array of arrays - downshift = [] - counter = -1 - # don't use enumerate, we modify counter in place - for x, y in zip(borders[:-1], borders[1:]): - counter += 1 - # If parent is skipped, eg (0, 0, 2, 2, etc) - while counter != upshift[order[x:y][0]]: - downshift.append(np.array([], dtype=np.intp)) - counter += 1 - downshift.append(np.sort(np.array(order[x:y], copy=True, dtype=np.intp))) - # Add entries for childless parents at end of range - while counter < (nparents - 1): - downshift.append(np.array([], dtype=np.intp)) - counter += 1 + u_values, indices = np.unique(upshift_sorted, return_index=True) + residue_indices = np.zeros(nparents, dtype=int) + missing_resids = np.sort(np.setdiff1d(np.arange(nparents), u_values)) + indices = np.append(indices, upshift_sorted.shape[0]) + + residue_indices[u_values] = indices[1:] + + for missing_resid in missing_resids: + residue_indices[missing_resid] = residue_indices[missing_resid-1] + + downshift = np.split(order, residue_indices[:-1]) # Add None to end of array to force it to be of type Object # Without this, a rectangular array gets squashed into a single array downshift.append(None) @@ -417,6 +412,21 @@ def add_Segment(self): return self.n_segments - 1 + def __getstate__(self): + return (self.n_atoms, self.n_residues, self.n_segments, + self._AR, self._RS) + + def __setstate__(self, args): + # rebuild _RA and _SR instead of serializing them. + n_atoms = args[0] + n_residues = args[1] + n_segments = args[2] + _AR = args[3] + _RS = args[4] + return self.__init__(n_atoms, n_residues, n_segments, + atom_resindex=_AR, residue_segindex=_RS) + + class Topology(object): """In-memory, array-based topology database. From 1eb4794e09718784dd5f509759a4dbe27be1a3e9 Mon Sep 17 00:00:00 2001 From: Yuxuan Zhuang Date: Mon, 20 Jun 2022 00:24:20 +0200 Subject: [PATCH 2/7] fix missing resid is 0 --- package/MDAnalysis/core/topology.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/package/MDAnalysis/core/topology.py b/package/MDAnalysis/core/topology.py index 898e41ad538..84ac070f121 100644 --- a/package/MDAnalysis/core/topology.py +++ b/package/MDAnalysis/core/topology.py @@ -127,7 +127,10 @@ def make_downshift_arrays(upshift, nparents): residue_indices[u_values] = indices[1:] for missing_resid in missing_resids: - residue_indices[missing_resid] = residue_indices[missing_resid-1] + if missing_resid == 0: + residue_indices[missing_resid] = 0 + else: + residue_indices[missing_resid] = residue_indices[missing_resid-1] downshift = np.split(order, residue_indices[:-1]) # Add None to end of array to force it to be of type Object From d82a356e17fdeec56f6526a2d4623dacc6a62b4e Mon Sep 17 00:00:00 2001 From: Yuxuan Zhuang Date: Mon, 20 Jun 2022 00:30:27 +0200 Subject: [PATCH 3/7] add missing start value test --- package/MDAnalysis/core/topology.py | 6 +++++- testsuite/MDAnalysisTests/core/test_topology.py | 8 ++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/package/MDAnalysis/core/topology.py b/package/MDAnalysis/core/topology.py index 84ac070f121..090b807ac51 100644 --- a/package/MDAnalysis/core/topology.py +++ b/package/MDAnalysis/core/topology.py @@ -120,6 +120,10 @@ def make_downshift_arrays(upshift, nparents): upshift_sorted = upshift[order] u_values, indices = np.unique(upshift_sorted, return_index=True) + + # reset nparents to the larger one between input and heuristic from data + # This is useful for creating empty Universe where default value is 1. + nparents = np.max([nparents, u_values.max()+1]) residue_indices = np.zeros(nparents, dtype=int) missing_resids = np.sort(np.setdiff1d(np.arange(nparents), u_values)) indices = np.append(indices, upshift_sorted.shape[0]) @@ -427,7 +431,7 @@ def __setstate__(self, args): _AR = args[3] _RS = args[4] return self.__init__(n_atoms, n_residues, n_segments, - atom_resindex=_AR, residue_segindex=_RS) + atom_resindex=_AR, residue_segindex=_RS) diff --git a/testsuite/MDAnalysisTests/core/test_topology.py b/testsuite/MDAnalysisTests/core/test_topology.py index 2e777835310..1940a598f9d 100644 --- a/testsuite/MDAnalysisTests/core/test_topology.py +++ b/testsuite/MDAnalysisTests/core/test_topology.py @@ -544,6 +544,14 @@ def test_missing_end_values_2(self): np.array([], dtype=int), None], dtype=object)) + def test_missing_start_values_2(self): + out = make_downshift_arrays(np.array([1, 1, 2, 2, 3, 3]), 4) + self.assert_rows_match(out, + np.array([np.array([], dtype=int), + np.array([0, 1]), + np.array([2, 3]), + np.array([4, 5]), + None], dtype=object)) class TestAddingResidues(object): """Tests for adding residues and segments to a Universe From c983609f1aa7fedc5417c61c306efba85f180740 Mon Sep 17 00:00:00 2001 From: Yuxuan Zhuang Date: Thu, 23 Jun 2022 20:00:31 +0200 Subject: [PATCH 4/7] changelog --- package/CHANGELOG | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/package/CHANGELOG b/package/CHANGELOG index 5d6c7106bfe..f08b6d986b6 100644 --- a/package/CHANGELOG +++ b/package/CHANGELOG @@ -13,7 +13,7 @@ The rules for this file: * release numbers follow "Semantic Versioning" http://semver.org ------------------------------------------------------------------------------ -??/??/?? IAlibay, PicoCentauri, orbeckst, hmacdope +??/??/?? IAlibay, PicoCentauri, orbeckst, hmacdope, yuxuanzhuang * 2.3.0 @@ -26,6 +26,8 @@ Enhancements * Additional logger.info output when per-frame analysis starts (PR #3710) * Timestep has been converted to a Cython Extension type (CZI Performance track, PR #3683) + * Refactor make_downshift_arrays with vector operation in numpy(PR #3724) + * Optimize topology serialization by reconstructing _RA and _SR. Changes From cbb8448d9e37a473f1cf5413f61b268f3399d188 Mon Sep 17 00:00:00 2001 From: Yuxuan Zhuang Date: Thu, 23 Jun 2022 21:09:43 +0200 Subject: [PATCH 5/7] lazy build RA, SR --- package/MDAnalysis/core/topology.py | 83 ++++++++++++++--------------- 1 file changed, 41 insertions(+), 42 deletions(-) diff --git a/package/MDAnalysis/core/topology.py b/package/MDAnalysis/core/topology.py index 090b807ac51..fada65831fd 100644 --- a/package/MDAnalysis/core/topology.py +++ b/package/MDAnalysis/core/topology.py @@ -64,16 +64,6 @@ from ..exceptions import NoDataError -# TODO Notes: -# Could make downshift tables lazily built! This would -# a) Make these not get built when not used -# b) Optimise moving multiple atoms between residues as only built once -# afterwards - -# Could optimise moves by only updating the two parent tables rather than -# rebuilding everything! - - def make_downshift_arrays(upshift, nparents): """From an upwards translation table, create the opposite direction @@ -99,7 +89,7 @@ def make_downshift_arrays(upshift, nparents): To find the residue to atom mappings for a given atom to residue mapping: >>> atom2res = np.array([0, 1, 0, 2, 2, 0, 2]) - >>> make_downshift_arrays(atom2res) + >>> make_downshift_arrays(atom2res, 3) array([array([0, 2, 5]), array([1]), array([3, 4, 6]), None], dtype=object) Entry 0 corresponds to residue 0 and says that this contains atoms 0, 2 & 5 @@ -115,7 +105,7 @@ def make_downshift_arrays(upshift, nparents): if not len(upshift): return np.array([], dtype=object) - upshift = np.array(upshift) + # mergesort for a stable ordered array for the same value. order = np.argsort(upshift, kind="mergesort") upshift_sorted = upshift[order] @@ -199,7 +189,7 @@ def __init__(self, self._AR = np.asarray(atom_resindex, dtype=np.intp).copy() if not len(self._AR) == n_atoms: raise ValueError("atom_resindex must be len n_atoms") - self._RA = make_downshift_arrays(self._AR, n_residues) + self._RA = None # built residue-to-segment mapping, and vice-versa if residue_segindex is None: @@ -208,13 +198,27 @@ def __init__(self, self._RS = np.asarray(residue_segindex, dtype=np.intp).copy() if not len(self._RS) == n_residues: raise ValueError("residue_segindex must be len n_residues") - self._SR = make_downshift_arrays(self._RS, n_segments) + self._SR = None def copy(self): """Return a deepcopy of this Transtable""" return self.__class__(self.n_atoms, self.n_residues, self.n_segments, atom_resindex=self._AR, residue_segindex=self._RS) + @property + def RA(self): + if self._RA is None: + self._RA = make_downshift_arrays(self._AR, + self.n_residues) + return self._RA + + @property + def SR(self): + if self._SR is None: + self._SR = make_downshift_arrays(self._RS, + self.n_segments) + return self._SR + @property def size(self): """The shape of the table, ``(n_atoms, n_residues, n_segments)``. @@ -253,11 +257,12 @@ def residues2atoms_1d(self, rix): indices of atoms present in residues, collectively """ + RA = self.RA try: - return np.concatenate(self._RA[rix]) + return np.concatenate(RA[rix]) except ValueError: # rix is not iterable or empty # don't accidentally return a view! - return self._RA[rix].astype(np.intp, copy=True) + return RA[rix].astype(np.intp, copy=True) def residues2atoms_2d(self, rix): """Get atom indices represented by each residue index. @@ -275,10 +280,11 @@ def residues2atoms_2d(self, rix): in that residue """ + RA = self.RA try: - return [self._RA[r].copy() for r in rix] + return [RA[r].copy() for r in rix] except TypeError: - return [self._RA[rix].copy()] # why would this be singular for 2d? + return [RA[rix].copy()] # why would this be singular for 2d? def residues2segments(self, rix): """Get segment indices for each residue. @@ -310,11 +316,12 @@ def segments2residues_1d(self, six): sorted indices of residues present in segments, collectively """ + SR = self.SR try: - return np.concatenate(self._SR[six]) + return np.concatenate(SR[six]) except ValueError: # six is not iterable or empty # don't accidentally return a view! - return self._SR[six].astype(np.intp, copy=True) + return SR[six].astype(np.intp, copy=True) def segments2residues_2d(self, six): """Get residue indices represented by each segment index. @@ -332,10 +339,11 @@ def segments2residues_2d(self, six): present in that segment """ + SR = self.SR try: - return [self._SR[s].copy() for s in six] + return [SR[s].copy() for s in six] except TypeError: - return [self._SR[six].copy()] + return [SR[six].copy()] # Compound moves, does 2 translations def atoms2segments(self, aix): @@ -396,43 +404,34 @@ def segments2atoms_2d(self, six): def move_atom(self, aix, rix): """Move aix to be in rix""" self._AR[aix] = rix - self._RA = make_downshift_arrays(self._AR, self.n_residues) + self._RA = None def move_residue(self, rix, six): """Move rix to be in six""" self._RS[rix] = six - self._SR = make_downshift_arrays(self._RS, self.n_segments) + self._SR = None def add_Residue(self, segidx): # segidx - index of parent self.n_residues += 1 - self._RA = make_downshift_arrays(self._AR, self.n_residues) + self._RA = None self._RS = np.concatenate([self._RS, np.array([segidx])]) - self._SR = make_downshift_arrays(self._RS, self.n_segments) + self._SR = None + return self.n_residues - 1 def add_Segment(self): self.n_segments += 1 - # self._RS remains the same, no residues point to the new segment yet - self._SR = make_downshift_arrays(self._RS, self.n_segments) - + self._SR = None return self.n_segments - 1 def __getstate__(self): - return (self.n_atoms, self.n_residues, self.n_segments, - self._AR, self._RS) - - def __setstate__(self, args): - # rebuild _RA and _SR instead of serializing them. - n_atoms = args[0] - n_residues = args[1] - n_segments = args[2] - _AR = args[3] - _RS = args[4] - return self.__init__(n_atoms, n_residues, n_segments, - atom_resindex=_AR, residue_segindex=_RS) - + # don't serialize _RA and _SR for performance. + attrs = self.__dict__ + attrs['_RA'] = None + attrs['_SR'] = None + return attrs class Topology(object): From fe595b2e4f1c38d3b6925e8c867938875f88695f Mon Sep 17 00:00:00 2001 From: Yuxuan Zhuang Date: Thu, 23 Jun 2022 21:12:52 +0200 Subject: [PATCH 6/7] test --- package/CHANGELOG | 2 +- package/MDAnalysis/core/topology.py | 3 + .../MDAnalysisTests/core/test_copying.py | 4 +- .../MDAnalysisTests/core/test_topology.py | 58 +++++++++++++++---- 4 files changed, 52 insertions(+), 15 deletions(-) diff --git a/package/CHANGELOG b/package/CHANGELOG index f08b6d986b6..62987731339 100644 --- a/package/CHANGELOG +++ b/package/CHANGELOG @@ -27,7 +27,7 @@ Enhancements * Timestep has been converted to a Cython Extension type (CZI Performance track, PR #3683) * Refactor make_downshift_arrays with vector operation in numpy(PR #3724) - * Optimize topology serialization by reconstructing _RA and _SR. + * Optimize topology serialization/construction by lazy-building _RA and _SR. Changes diff --git a/package/MDAnalysis/core/topology.py b/package/MDAnalysis/core/topology.py index fada65831fd..4fc320ac69e 100644 --- a/package/MDAnalysis/core/topology.py +++ b/package/MDAnalysis/core/topology.py @@ -173,6 +173,9 @@ class TransTable(object): size : tuple tuple ``(n_atoms, n_residues, n_segments)`` describing the shape of the TransTable + + .. versionchanged:: 2.3.0 + Lazy building RA and SR. """ def __init__(self, n_atoms, n_residues, n_segments, # Size of tables diff --git a/testsuite/MDAnalysisTests/core/test_copying.py b/testsuite/MDAnalysisTests/core/test_copying.py index eb14bb3a13d..f5279b39f8a 100644 --- a/testsuite/MDAnalysisTests/core/test_copying.py +++ b/testsuite/MDAnalysisTests/core/test_copying.py @@ -57,7 +57,7 @@ def test_size_independent(self, refTT): refTT.n_atoms = -10 assert new.n_atoms == old - @pytest.mark.parametrize('attr', ['_AR', '_RA', '_RS', '_SR']) + @pytest.mark.parametrize('attr', ['_AR', 'RA', '_RS', 'SR']) def test_AR(self, refTT, attr): new = refTT.copy() ref = getattr(refTT, attr) @@ -66,7 +66,7 @@ def test_AR(self, refTT, attr): for a, b in zip(ref, other): assert_equal(a, b) - @pytest.mark.parametrize('attr', ['_AR', '_RA', '_RS', '_SR']) + @pytest.mark.parametrize('attr', ['_AR', 'RA', '_RS', 'SR']) def test_AR_independent(self, refTT, attr): new = refTT.copy() ref = getattr(refTT, attr) diff --git a/testsuite/MDAnalysisTests/core/test_topology.py b/testsuite/MDAnalysisTests/core/test_topology.py index 1940a598f9d..9f50b3404b3 100644 --- a/testsuite/MDAnalysisTests/core/test_topology.py +++ b/testsuite/MDAnalysisTests/core/test_topology.py @@ -9,6 +9,7 @@ ) import pytest import numpy as np +import pickle from MDAnalysisTests import make_Universe @@ -23,6 +24,11 @@ import MDAnalysis +def assert_rows_match(a, b): + for row_a, row_b in zip(a, b): + assert_equal(row_a, row_b) + + class TestTransTable(object): @pytest.fixture() def tt(self): @@ -144,6 +150,39 @@ def test_move_residue_simple(self, tt): assert_equal(len(tt.segments2residues_1d(0)), 3) assert_equal(len(tt.segments2residues_1d(1)), 1) + def test_lazy_building_RA(self, tt): + assert_equal(tt._RA, None) + RA = tt.RA + assert_rows_match(tt.RA, + np.array([np.array([0, 1]), + np.array([4, 5, 8]), + np.array([2, 3, 9]), + np.array([6, 7]), + None], dtype=object)) + + tt.move_atom(1, 3) + assert_equal(tt._RA, None) + + def test_lazy_building_SR(self, tt): + assert_equal(tt._SR, None) + SR = tt.SR + assert_rows_match(tt.SR, + np.array([np.array([0, 3]), + np.array([1, 2]), + None], dtype=object)) + + tt.move_residue(1, 0) + assert_equal(tt._SR, None) + + def test_serialization(self, tt): + _ = tt.RA + _ = tt.SR + tt_loaded = pickle.loads(pickle.dumps(tt)) + assert_equal(tt_loaded._RA, None) + assert_equal(tt_loaded._SR, None) + assert_rows_match(tt_loaded.RA, tt.RA) + assert_rows_match(tt_loaded.SR, tt.SR) + class TestLevelMoves(object): """Tests for moving atoms/residues between residues/segments @@ -467,11 +506,6 @@ def ragged_result(self): return np.array([[0, 4, 7], [1, 5, 8], [2, 3, 6, 9]], dtype=object) - @staticmethod - def assert_rows_match(a, b): - for row_a, row_b in zip(a, b): - assert_equal(row_a, row_b) - # The array as a whole must be dtype object # While the subarrays must be integers def test_downshift_dtype_square(self, square, square_size): @@ -498,16 +532,16 @@ def test_shape_ragged(self, ragged, ragged_size): def test_contents_square(self, square, square_size, square_result): out = make_downshift_arrays(square, square_size) - self.assert_rows_match(out, square_result) + assert_rows_match(out, square_result) def test_contents_ragged(self, ragged, ragged_size, ragged_result): out = make_downshift_arrays(ragged, ragged_size) - self.assert_rows_match(out, ragged_result) + assert_rows_match(out, ragged_result) def test_missing_intra_values(self): out = make_downshift_arrays( np.array([0, 0, 2, 2, 3, 3]), 4) - self.assert_rows_match(out, + assert_rows_match(out, np.array([np.array([0, 1]), np.array([], dtype=int), np.array([2, 3]), @@ -517,7 +551,7 @@ def test_missing_intra_values(self): def test_missing_intra_values_2(self): out = make_downshift_arrays( np.array([0, 0, 3, 3, 4, 4]), 5) - self.assert_rows_match(out, + assert_rows_match(out, np.array([np.array([0, 1]), np.array([], dtype=int), np.array([], dtype=int), @@ -527,7 +561,7 @@ def test_missing_intra_values_2(self): def test_missing_end_values(self): out = make_downshift_arrays(np.array([0, 0, 1, 1, 2, 2]), 4) - self.assert_rows_match(out, + assert_rows_match(out, np.array([np.array([0, 1]), np.array([2, 3]), np.array([4, 5]), @@ -536,7 +570,7 @@ def test_missing_end_values(self): def test_missing_end_values_2(self): out = make_downshift_arrays(np.array([0, 0, 1, 1, 2, 2]), 6) - self.assert_rows_match(out, + assert_rows_match(out, np.array([np.array([0, 1]), np.array([2, 3]), np.array([4, 5]), @@ -546,7 +580,7 @@ def test_missing_end_values_2(self): def test_missing_start_values_2(self): out = make_downshift_arrays(np.array([1, 1, 2, 2, 3, 3]), 4) - self.assert_rows_match(out, + assert_rows_match(out, np.array([np.array([], dtype=int), np.array([0, 1]), np.array([2, 3]), From cd998c695631fa25e1c9775841b2390904b28c11 Mon Sep 17 00:00:00 2001 From: Yuxuan Zhuang Date: Thu, 23 Jun 2022 21:19:05 +0200 Subject: [PATCH 7/7] pep8 --- .../MDAnalysisTests/core/test_topology.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/testsuite/MDAnalysisTests/core/test_topology.py b/testsuite/MDAnalysisTests/core/test_topology.py index 9f50b3404b3..170f29f3f80 100644 --- a/testsuite/MDAnalysisTests/core/test_topology.py +++ b/testsuite/MDAnalysisTests/core/test_topology.py @@ -154,11 +154,11 @@ def test_lazy_building_RA(self, tt): assert_equal(tt._RA, None) RA = tt.RA assert_rows_match(tt.RA, - np.array([np.array([0, 1]), - np.array([4, 5, 8]), - np.array([2, 3, 9]), - np.array([6, 7]), - None], dtype=object)) + np.array([np.array([0, 1]), + np.array([4, 5, 8]), + np.array([2, 3, 9]), + np.array([6, 7]), + None], dtype=object)) tt.move_atom(1, 3) assert_equal(tt._RA, None) @@ -167,9 +167,9 @@ def test_lazy_building_SR(self, tt): assert_equal(tt._SR, None) SR = tt.SR assert_rows_match(tt.SR, - np.array([np.array([0, 3]), - np.array([1, 2]), - None], dtype=object)) + np.array([np.array([0, 3]), + np.array([1, 2]), + None], dtype=object)) tt.move_residue(1, 0) assert_equal(tt._SR, None) @@ -581,11 +581,11 @@ def test_missing_end_values_2(self): def test_missing_start_values_2(self): out = make_downshift_arrays(np.array([1, 1, 2, 2, 3, 3]), 4) assert_rows_match(out, - np.array([np.array([], dtype=int), - np.array([0, 1]), - np.array([2, 3]), - np.array([4, 5]), - None], dtype=object)) + np.array([np.array([], dtype=int), + np.array([0, 1]), + np.array([2, 3]), + np.array([4, 5]), + None], dtype=object)) class TestAddingResidues(object): """Tests for adding residues and segments to a Universe