MDAnalysis · orbeckst · May 12, 2020 · Apr 10, 2019 · Apr 10, 2019 · Apr 14, 2019
diff --git a/package/CHANGELOG b/package/CHANGELOG
@@ -116,6 +116,9 @@ Enhancements
   * Added weights_groupselections option in RMSD for mapping custom weights
     to groupselections (PR #2610, Issue #2429)
   * PersistenceLength.plot() now create new axes if current axes not provided (Issue #2590)
+  * Added a correlations module to provide functionality for analysis modules to
+    calculate the discrete autocorrelation function in a standardised way. Added
+    the capability to allow intermittent behaviour (PR #2256)
 
 Changes
   * Deprecated :class:`ProgressMeter` and replaced it with :class:`ProgressBar` using

diff --git a/package/MDAnalysis/analysis/hbonds/hbond_analysis.py b/package/MDAnalysis/analysis/hbonds/hbond_analysis.py
@@ -334,6 +334,7 @@ class HydrogenBondAnalysis_OtherFF(HydrogenBondAnalysis):
 from MDAnalysis.lib.log import ProgressBar
 from MDAnalysis.lib.NeighborSearch import AtomNeighborSearch
 from MDAnalysis.lib import distances
+from MDAnalysis.lib.correlations import autocorrelation, correct_intermittency
 
 
 logger = logging.getLogger('MDAnalysis.analysis.hbonds')
@@ -382,6 +383,9 @@ class HydrogenBondAnalysis(base.AnalysisBase):
        DEFAULT_DONORS/ACCEPTORS is now embedded in a dict to switch between
        default values for different force fields.
 
+    .. versionchanged:: 0.21.0
+        Added autocorrelation (MDAnalysis.lib.correlations.py) for calculating hydrogen bond lifetime
+
     .. versionchanged:: 1.0.0
        ``save_table()`` method has been removed. You can use ``np.save()`` or
        ``cPickle.dump()`` on :attr:`HydrogenBondAnalysis.table` instead.
@@ -1409,3 +1413,56 @@ def _make_dict(donors, hydrogens):
         h2donor.update(_make_dict(s1d, s1h))
 
         return h2donor
+
+
+    def autocorrelation(self, tau_max=20, window_step=1, intermittency=0):
+        """
+        Computes and returns the autocorrelation (HydrogenBondLifetimes ) on the computed hydrogen bonds.
+
+        Parameters
+        ----------
+        window_step : int, optional
+            Jump every `step`-th frame. This is compatible but independant of the taus used.
+            Note that `step` and `tau_max` work consistently with intermittency.
+        tau_max : int, optional
+            Survival probability is calculated for the range 1 <= `tau` <= `tau_max`
+        intermittency : int, optional
+            The maximum number of consecutive frames for which a bond can disappear but be counted as present if it
+            returns at the next frame. An intermittency of `0` is equivalent to a continuous autocorrelation, which does
+            not allow for the hydrogen bond disappearance. For example, for `intermittency=2`, any given hbond may
+            disappear for up to two consecutive frames yet be treated as being present at all frames.
+            The default is continuous (0).
+
+        Returns
+        -------
+        tau_timeseries : list
+            tau from 1 to `tau_max`. Saved in the field tau_timeseries.
+        timeseries : list
+            autcorrelation value for each value of `tau`. Saved in the field timeseries.
+        timeseries_data: list
+            raw datapoints from which the average is taken (timeseries).
+            Time dependancy and distribution can be extracted.
+        """
+
+        if self._timeseries is None:
+            logging.error("Autocorrelation analysis of hydrogen bonds cannot be done before the hydrogen bonds are found")
+            logging.error("Autocorrelation: Please use the .run() before calling this function")
+            return
+
+        if self.step != 1:
+            logging.warning("Autocorrelation function should be carried out on consecutive frames. ")
+            logging.warning("Autocorrelation: if you would like to allow bonds to break and reform, please use 'intermittency'")
+
+        # Extract the hydrogen bonds IDs only in the format [set(superset(x1,x2), superset(x3,x4)), ..]
+        hydrogen_bonds = self.timeseries
+        found_hydrogen_bonds = [{frozenset(bond[0:2]) for bond in frame} for frame in hydrogen_bonds]
+
+        intermittent_hbonds = correct_intermittency(found_hydrogen_bonds, intermittency=intermittency)
+        tau_timeseries, timeseries, timeseries_data = autocorrelation(intermittent_hbonds, tau_max,
+                                                                      window_step=window_step)
+        self.acf_tau_timeseries = tau_timeseries
+        self.acf_timeseries = timeseries
+        # user can investigate the distribution and sample size
+        self.acf_timeseries_data = timeseries_data
+
+        return self
diff --git a/package/MDAnalysis/analysis/hbonds/hbond_autocorrel.py b/package/MDAnalysis/analysis/hbonds/hbond_autocorrel.py
@@ -301,6 +301,11 @@ def __init__(self, universe,
                  nruns=1,  # number of times to iterate through the trajectory
                  nsamples=50,  # number of different points to sample in a run
                  pbc=True):
+
+        #warnings.warn("This class is deprecated, use analysis.hbonds.HydrogenBondAnalysis "
+        #              "which has .autocorrelation function",
+        #              category=DeprecationWarning)
+
         self.u = universe
         # check that slicing is possible
         try:

diff --git a/package/MDAnalysis/analysis/utils/__init__.py b/package/MDAnalysis/analysis/utils/__init__.py
diff --git a/package/MDAnalysis/analysis/waterdynamics.py b/package/MDAnalysis/analysis/waterdynamics.py
@@ -297,10 +297,6 @@
   for tau, sp in zip(tau_timeseries, sp_timeseries):
         print("{time} {sp}".format(time=tau, sp=sp))
 
-  # SP is not calculated at tau=0, but if you would like to plot SP=1 at tau=0:
-  tau_timeseries.insert(0, 0)
-  sp_timeseries.insert(1, 0)
-
   # plot
   plt.xlabel('Time')
   plt.ylabel('SP')
@@ -449,12 +445,15 @@
 """
 from __future__ import print_function, division, absolute_import
 
+from MDAnalysis.lib.correlations import autocorrelation, correct_intermittency
+import MDAnalysis.analysis.hbonds
+from six.moves import range, zip_longest
+import logging
 import warnings
+import numpy as np
 
-from six.moves import range, zip_longest
 
-import numpy as np
-import MDAnalysis.analysis.hbonds
+logger = logging.getLogger('MDAnalysis.analysis.waterdynamics')
 from MDAnalysis.lib.log import ProgressBar
 
 
@@ -507,7 +506,9 @@ class HydrogenBondLifetimes(object):
        may have failed in some cases.
     """
 
-    def __init__(self, universe, selection1, selection2, t0, tf, dtmax):
+
+    def __init__(self, universe, selection1, selection2, t0, tf, dtmax,
+                 nproc=1):
         self.universe = universe
         self.selection1 = selection1
         self.selection2 = selection2
@@ -1171,7 +1172,8 @@ class SurvivalProbability(object):
 
 
     .. versionadded:: 0.11.0
-
+    .. versionchanged:: 0.21.0
+        Using the MDAnalysis.lib.correlations.py to carry out the intermittency and autocorrelation calculations
     .. versionchanged:: 1.0.0
        Changed `selection` keyword to `select`
     """
@@ -1195,62 +1197,6 @@ def __init__(self, universe, select, t0=None, tf=None, dtmax=None, verbose=False
             self.tau_max = dtmax
             warnings.warn("dtmax is deprecated, use run(tau_max=dtmax) instead", category=DeprecationWarning)
 
-    def print(self, verbose, *args):
-        if self.verbose:
-            print(args)
-        elif verbose:
-            print(args)
-
-    def _correct_intermittency(self, intermittency, selected_ids):
-        """
-        Pre-process Consecutive Intermittency with a single pass over the data.
-        If an atom is absent for a number of frames equal or smaller
-        than the `intermittency`, then correct the data and remove the absence.
-        ie 7,A,A,7 with intermittency=2 will be replaced by 7,7,7,7, where A=absence
-
-        Parameters
-        ----------
-        intermittency : int
-            the max gap allowed and to be corrected
-        selected_ids: list of ids
-            modifies the selecteded IDs in place by adding atoms which left for <= `intermittency`
-        """
-        self.print('Correcting the selected IDs for intermittancy (gaps). ')
-        if intermittency == 0:
-            return
-
-        for i, ids in enumerate(selected_ids):
-            # initially update each frame as seen 0 ago (now)
-            seen_frames_ago = {i: 0 for i in ids}
-            for j in range(1, intermittency + 2):
-                for atomid in seen_frames_ago.keys():
-                    # no more frames
-                    if i + j >= len(selected_ids):
-                        continue
-
-                    # if the atom is absent now
-                    if not atomid in selected_ids[i + j]:
-                        # increase its absence counter
-                        seen_frames_ago[atomid] += 1
-                        continue
-
-                    # the atom is found
-                    if seen_frames_ago[atomid] == 0:
-                        # the atom was present in the last frame
-                        continue
-
-                    # it was absent more times than allowed
-                    if seen_frames_ago[atomid] > intermittency:
-                        continue
-
-                    # the atom was absent but returned (within <= intermittency)
-                    # add it to the frames where it was absent.
-                    # Introduce the corrections.
-                    for k in range(seen_frames_ago[atomid], 0, -1):
-                        selected_ids[i + j - k].add(atomid)
-
-                    seen_frames_ago[atomid] = 0
-
 
     def run(self, tau_max=20, start=0, stop=None, step=1, residues=False, intermittency=0, verbose=False):
         """
@@ -1286,6 +1232,9 @@ def run(self, tau_max=20, start=0, stop=None, step=1, residues=False, intermitte
             tau from 1 to `tau_max`. Saved in the field tau_timeseries.
         sp_timeseries : list
             survival probability for each value of `tau`. Saved in the field sp_timeseries.
+        sp_timeseries_data: list
+            raw datapoints from which the average is taken (sp_timeseries).
+            Time dependancy and distribution can be extracted.
         """
 
         # backward compatibility (and priority)
@@ -1306,9 +1255,13 @@ def run(self, tau_max=20, start=0, stop=None, step=1, residues=False, intermitte
             raise ValueError("Too few frames selected for given tau_max.")
 
         # preload the frames (atom IDs) to a list of sets
-        self.selected_ids = []
+        self._selected_ids = []
+
+        # fixme - to parallise: the section should be rewritten so that this loop only creates a list of indices,
+        # on which the parallel _single_frame can be applied.
 
-        # skip frames that will not be used
+        # skip frames that will not be used in order to improve performance
+        # because AtomGroup.select_atoms is the most expensive part of this calculation
         # Example: step 5 and tau 2: LLLSS LLLSS, ... where L = Load, and S = Skip
         # Intermittency means that we have to load the extra frames to know if the atom is actually missing.
         # Say step=5 and tau=1, intermittency=0: LLSSS LLSSS
@@ -1323,7 +1276,7 @@ def run(self, tau_max=20, start=0, stop=None, step=1, residues=False, intermitte
         frame_no = start
         while frame_no < stop:      # we have already added 1 to stop, therefore <
             if num_frames_to_skip != 0 and frame_loaded_counter == frames_per_window:
-                self.print(verbose, "Skipping the next %d frames:" % num_frames_to_skip)
+                logger.info("Skipping the next %d frames:", num_frames_to_skip)
                 frame_no += num_frames_to_skip
                 frame_loaded_counter = 0
                 # Correct the number of frames to be loaded after the first window (which starts at t=0, and
@@ -1334,46 +1287,31 @@ def run(self, tau_max=20, start=0, stop=None, step=1, residues=False, intermitte
             # update the frame number
             self.universe.trajectory[frame_no]
 
-            self.print(verbose, "Loading frame:", self.universe.trajectory.ts)
+            logger.info("Loading frame: %d", self.universe.trajectory.frame)
             atoms = self.universe.select_atoms(self.selection)
 
             # SP of residues or of atoms
             ids = atoms.residues.resids if residues else atoms.ids
-            self.selected_ids.append(set(ids))
+            self._selected_ids.append(set(ids))
 
             frame_no += 1
             frame_loaded_counter += 1
 
-        # correct the dataset for gaps (intermittency)
-        self._correct_intermittency(intermittency, self.selected_ids)
-
-        # calculate Survival Probability
-        tau_timeseries = np.arange(1, tau_max + 1)
-        sp_timeseries_data = [[] for _ in range(tau_max)]
-
         # adjust for the frames that were not loaded (step>tau_max + 1),
         # and for extra frames that were loaded (intermittency)
         window_jump = step - num_frames_to_skip
 
-        for t in range(0, len(self.selected_ids), window_jump):
-            Nt = len(self.selected_ids[t])
-
-            if Nt == 0:
-                self.print(verbose,
-                           "At frame {} the selection did not find any molecule. Moving on to the next frame".format(t))
-                continue
-
-            for tau in tau_timeseries:
-                if t + tau >= len(self.selected_ids):
-                    break
+        self._intermittent_selected_ids = correct_intermittency(self._selected_ids, intermittency=intermittency)
+        tau_timeseries, sp_timeseries, sp_timeseries_data = autocorrelation(self._intermittent_selected_ids,
+                                                                            tau_max, window_jump)
 
-                # continuous: IDs that survive from t to t + tau and at every frame in between
-                Ntau = len(set.intersection(*self.selected_ids[t:t + tau + 1]))
-                sp_timeseries_data[tau - 1].append(Ntau / float(Nt))
+        # warn the user if the NaN are found
+        if all(np.isnan(sp_timeseries[1:])):
+            logger.warning('NaN Error: Most likely data was not found. Check your atom selections. ')
 
         # user can investigate the distribution and sample size
         self.sp_timeseries_data = sp_timeseries_data
 
         self.tau_timeseries = tau_timeseries
-        self.sp_timeseries = [np.mean(sp) for sp in sp_timeseries_data]
+        self.sp_timeseries = sp_timeseries
         return self