Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor make_downshift_arrays and enhance the performance of topology construction #3724

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions package/CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ Enhancements
* Additional logger.info output when per-frame analysis starts (PR #3710)
* Timestep has been converted to a Cython Extension type
(CZI Performance track, PR #3683)
* Refactor make_downshift_arrays with vector operation in numpy(PR #3724)
* Optimize topology serialization/construction by lazy-building _RA and _SR.

Changes
* To add additional optional packages for different file formats when
Expand Down
111 changes: 65 additions & 46 deletions package/MDAnalysis/core/topology.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,16 +64,6 @@
from ..exceptions import NoDataError


# TODO Notes:
# Could make downshift tables lazily built! This would
# a) Make these not get built when not used
# b) Optimise moving multiple atoms between residues as only built once
# afterwards

# Could optimise moves by only updating the two parent tables rather than
# rebuilding everything!


def make_downshift_arrays(upshift, nparents):
"""From an upwards translation table, create the opposite direction

Expand All @@ -99,7 +89,7 @@ def make_downshift_arrays(upshift, nparents):
To find the residue to atom mappings for a given atom to residue mapping:

>>> atom2res = np.array([0, 1, 0, 2, 2, 0, 2])
>>> make_downshift_arrays(atom2res)
>>> make_downshift_arrays(atom2res, 3)
array([array([0, 2, 5]), array([1]), array([3, 4, 6]), None], dtype=object)

Entry 0 corresponds to residue 0 and says that this contains atoms 0, 2 & 5
Expand All @@ -115,26 +105,28 @@ def make_downshift_arrays(upshift, nparents):
if not len(upshift):
return np.array([], dtype=object)

order = np.argsort(upshift)
# mergesort for a stable ordered array for the same value.
order = np.argsort(upshift, kind="mergesort")

upshift_sorted = upshift[order]
borders = [None] + list(np.nonzero(np.diff(upshift_sorted))[0] + 1) + [None]

# returns an array of arrays
downshift = []
counter = -1
# don't use enumerate, we modify counter in place
for x, y in zip(borders[:-1], borders[1:]):
counter += 1
# If parent is skipped, eg (0, 0, 2, 2, etc)
while counter != upshift[order[x:y][0]]:
downshift.append(np.array([], dtype=np.intp))
counter += 1
downshift.append(np.sort(np.array(order[x:y], copy=True, dtype=np.intp)))
# Add entries for childless parents at end of range
while counter < (nparents - 1):
downshift.append(np.array([], dtype=np.intp))
counter += 1
u_values, indices = np.unique(upshift_sorted, return_index=True)

# reset nparents to the larger one between input and heuristic from data
# This is useful for creating empty Universe where default value is 1.
nparents = np.max([nparents, u_values.max()+1])
residue_indices = np.zeros(nparents, dtype=int)
missing_resids = np.sort(np.setdiff1d(np.arange(nparents), u_values))
indices = np.append(indices, upshift_sorted.shape[0])

residue_indices[u_values] = indices[1:]

for missing_resid in missing_resids:
if missing_resid == 0:
residue_indices[missing_resid] = 0
else:
residue_indices[missing_resid] = residue_indices[missing_resid-1]

downshift = np.split(order, residue_indices[:-1])
# Add None to end of array to force it to be of type Object
# Without this, a rectangular array gets squashed into a single array
downshift.append(None)
Expand Down Expand Up @@ -181,6 +173,9 @@ class TransTable(object):
size : tuple
tuple ``(n_atoms, n_residues, n_segments)`` describing the shape of
the TransTable

.. versionchanged:: 2.3.0
Lazy building RA and SR.
"""
def __init__(self,
n_atoms, n_residues, n_segments, # Size of tables
Expand All @@ -197,7 +192,7 @@ def __init__(self,
self._AR = np.asarray(atom_resindex, dtype=np.intp).copy()
if not len(self._AR) == n_atoms:
raise ValueError("atom_resindex must be len n_atoms")
self._RA = make_downshift_arrays(self._AR, n_residues)
self._RA = None

# built residue-to-segment mapping, and vice-versa
if residue_segindex is None:
Expand All @@ -206,13 +201,27 @@ def __init__(self,
self._RS = np.asarray(residue_segindex, dtype=np.intp).copy()
if not len(self._RS) == n_residues:
raise ValueError("residue_segindex must be len n_residues")
self._SR = make_downshift_arrays(self._RS, n_segments)
self._SR = None

def copy(self):
"""Return a deepcopy of this Transtable"""
return self.__class__(self.n_atoms, self.n_residues, self.n_segments,
atom_resindex=self._AR, residue_segindex=self._RS)

@property
def RA(self):
if self._RA is None:
self._RA = make_downshift_arrays(self._AR,
self.n_residues)
return self._RA

@property
def SR(self):
if self._SR is None:
self._SR = make_downshift_arrays(self._RS,
self.n_segments)
return self._SR

@property
def size(self):
"""The shape of the table, ``(n_atoms, n_residues, n_segments)``.
Expand Down Expand Up @@ -251,11 +260,12 @@ def residues2atoms_1d(self, rix):
indices of atoms present in residues, collectively

"""
RA = self.RA
try:
return np.concatenate(self._RA[rix])
return np.concatenate(RA[rix])
except ValueError: # rix is not iterable or empty
# don't accidentally return a view!
return self._RA[rix].astype(np.intp, copy=True)
return RA[rix].astype(np.intp, copy=True)

def residues2atoms_2d(self, rix):
"""Get atom indices represented by each residue index.
Expand All @@ -273,10 +283,11 @@ def residues2atoms_2d(self, rix):
in that residue

"""
RA = self.RA
try:
return [self._RA[r].copy() for r in rix]
return [RA[r].copy() for r in rix]
except TypeError:
return [self._RA[rix].copy()] # why would this be singular for 2d?
return [RA[rix].copy()] # why would this be singular for 2d?

def residues2segments(self, rix):
"""Get segment indices for each residue.
Expand Down Expand Up @@ -308,11 +319,12 @@ def segments2residues_1d(self, six):
sorted indices of residues present in segments, collectively

"""
SR = self.SR
try:
return np.concatenate(self._SR[six])
return np.concatenate(SR[six])
except ValueError: # six is not iterable or empty
# don't accidentally return a view!
return self._SR[six].astype(np.intp, copy=True)
return SR[six].astype(np.intp, copy=True)

def segments2residues_2d(self, six):
"""Get residue indices represented by each segment index.
Expand All @@ -330,10 +342,11 @@ def segments2residues_2d(self, six):
present in that segment

"""
SR = self.SR
try:
return [self._SR[s].copy() for s in six]
return [SR[s].copy() for s in six]
except TypeError:
return [self._SR[six].copy()]
return [SR[six].copy()]

# Compound moves, does 2 translations
def atoms2segments(self, aix):
Expand Down Expand Up @@ -394,29 +407,35 @@ def segments2atoms_2d(self, six):
def move_atom(self, aix, rix):
"""Move aix to be in rix"""
self._AR[aix] = rix
self._RA = make_downshift_arrays(self._AR, self.n_residues)
self._RA = None

def move_residue(self, rix, six):
"""Move rix to be in six"""
self._RS[rix] = six
self._SR = make_downshift_arrays(self._RS, self.n_segments)
self._SR = None

def add_Residue(self, segidx):
# segidx - index of parent
self.n_residues += 1
self._RA = make_downshift_arrays(self._AR, self.n_residues)
self._RA = None
self._RS = np.concatenate([self._RS, np.array([segidx])])
self._SR = make_downshift_arrays(self._RS, self.n_segments)
self._SR = None


return self.n_residues - 1

def add_Segment(self):
self.n_segments += 1
# self._RS remains the same, no residues point to the new segment yet
self._SR = make_downshift_arrays(self._RS, self.n_segments)

self._SR = None
return self.n_segments - 1

def __getstate__(self):
# don't serialize _RA and _SR for performance.
attrs = self.__dict__
attrs['_RA'] = None
attrs['_SR'] = None
return attrs


class Topology(object):
"""In-memory, array-based topology database.
Expand Down
4 changes: 2 additions & 2 deletions testsuite/MDAnalysisTests/core/test_copying.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def test_size_independent(self, refTT):
refTT.n_atoms = -10
assert new.n_atoms == old

@pytest.mark.parametrize('attr', ['_AR', '_RA', '_RS', '_SR'])
@pytest.mark.parametrize('attr', ['_AR', 'RA', '_RS', 'SR'])
def test_AR(self, refTT, attr):
new = refTT.copy()
ref = getattr(refTT, attr)
Expand All @@ -66,7 +66,7 @@ def test_AR(self, refTT, attr):
for a, b in zip(ref, other):
assert_equal(a, b)

@pytest.mark.parametrize('attr', ['_AR', '_RA', '_RS', '_SR'])
@pytest.mark.parametrize('attr', ['_AR', 'RA', '_RS', 'SR'])
def test_AR_independent(self, refTT, attr):
new = refTT.copy()
ref = getattr(refTT, attr)
Expand Down
64 changes: 53 additions & 11 deletions testsuite/MDAnalysisTests/core/test_topology.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
)
import pytest
import numpy as np
import pickle

from MDAnalysisTests import make_Universe

Expand All @@ -23,6 +24,11 @@
import MDAnalysis


def assert_rows_match(a, b):
for row_a, row_b in zip(a, b):
assert_equal(row_a, row_b)


class TestTransTable(object):
@pytest.fixture()
def tt(self):
Expand Down Expand Up @@ -144,6 +150,39 @@ def test_move_residue_simple(self, tt):
assert_equal(len(tt.segments2residues_1d(0)), 3)
assert_equal(len(tt.segments2residues_1d(1)), 1)

def test_lazy_building_RA(self, tt):
assert_equal(tt._RA, None)
RA = tt.RA
assert_rows_match(tt.RA,
np.array([np.array([0, 1]),
np.array([4, 5, 8]),
np.array([2, 3, 9]),
np.array([6, 7]),
None], dtype=object))

tt.move_atom(1, 3)
assert_equal(tt._RA, None)

def test_lazy_building_SR(self, tt):
assert_equal(tt._SR, None)
SR = tt.SR
assert_rows_match(tt.SR,
np.array([np.array([0, 3]),
np.array([1, 2]),
None], dtype=object))

tt.move_residue(1, 0)
assert_equal(tt._SR, None)

def test_serialization(self, tt):
_ = tt.RA
_ = tt.SR
tt_loaded = pickle.loads(pickle.dumps(tt))
assert_equal(tt_loaded._RA, None)
assert_equal(tt_loaded._SR, None)
assert_rows_match(tt_loaded.RA, tt.RA)
assert_rows_match(tt_loaded.SR, tt.SR)


class TestLevelMoves(object):
"""Tests for moving atoms/residues between residues/segments
Expand Down Expand Up @@ -467,11 +506,6 @@ def ragged_result(self):
return np.array([[0, 4, 7], [1, 5, 8], [2, 3, 6, 9]],
dtype=object)

@staticmethod
def assert_rows_match(a, b):
for row_a, row_b in zip(a, b):
assert_equal(row_a, row_b)

# The array as a whole must be dtype object
# While the subarrays must be integers
def test_downshift_dtype_square(self, square, square_size):
Expand All @@ -498,16 +532,16 @@ def test_shape_ragged(self, ragged, ragged_size):

def test_contents_square(self, square, square_size, square_result):
out = make_downshift_arrays(square, square_size)
self.assert_rows_match(out, square_result)
assert_rows_match(out, square_result)

def test_contents_ragged(self, ragged, ragged_size, ragged_result):
out = make_downshift_arrays(ragged, ragged_size)
self.assert_rows_match(out, ragged_result)
assert_rows_match(out, ragged_result)

def test_missing_intra_values(self):
out = make_downshift_arrays(
np.array([0, 0, 2, 2, 3, 3]), 4)
self.assert_rows_match(out,
assert_rows_match(out,
np.array([np.array([0, 1]),
np.array([], dtype=int),
np.array([2, 3]),
Expand All @@ -517,7 +551,7 @@ def test_missing_intra_values(self):
def test_missing_intra_values_2(self):
out = make_downshift_arrays(
np.array([0, 0, 3, 3, 4, 4]), 5)
self.assert_rows_match(out,
assert_rows_match(out,
np.array([np.array([0, 1]),
np.array([], dtype=int),
np.array([], dtype=int),
Expand All @@ -527,7 +561,7 @@ def test_missing_intra_values_2(self):

def test_missing_end_values(self):
out = make_downshift_arrays(np.array([0, 0, 1, 1, 2, 2]), 4)
self.assert_rows_match(out,
assert_rows_match(out,
np.array([np.array([0, 1]),
np.array([2, 3]),
np.array([4, 5]),
Expand All @@ -536,14 +570,22 @@ def test_missing_end_values(self):

def test_missing_end_values_2(self):
out = make_downshift_arrays(np.array([0, 0, 1, 1, 2, 2]), 6)
self.assert_rows_match(out,
assert_rows_match(out,
np.array([np.array([0, 1]),
np.array([2, 3]),
np.array([4, 5]),
np.array([], dtype=int),
np.array([], dtype=int),
None], dtype=object))

def test_missing_start_values_2(self):
out = make_downshift_arrays(np.array([1, 1, 2, 2, 3, 3]), 4)
assert_rows_match(out,
np.array([np.array([], dtype=int),
np.array([0, 1]),
np.array([2, 3]),
np.array([4, 5]),
None], dtype=object))

class TestAddingResidues(object):
"""Tests for adding residues and segments to a Universe
Expand Down