add R_c (convergence fraction) convergence analysis (#239)

* Fix #104 * add @VOD555 's fractional equilibration time analysis for convergence (Fan et al 2021) * reimplemented the R_c (based on code provided by @VOD555) and the A_c based on my understanding of the paper * improved performance of original implementation by doing initial block average for "precision"-size blocks * add docs with example graph * add tests * update CHANGES Co-authored-by: Zhiyi Wu <[email protected]> Co-authored-by: Shujie Fan <[email protected]> Co-authored-by: Oliver Beckstein <[email protected]>
alchemistry · Oct 31, 2022 · cbdbfcd · cbdbfcd
1 parent 3809fdf
commit cbdbfcd
Show file tree

Hide file tree

Showing 7 changed files with 284 additions and 7 deletions.
diff --git a/CHANGES b/CHANGES
@@ -41,6 +41,8 @@ Enhancements
   - Add remove_burnin keyword to decorrelate_u_nk and decorrelate_dhdl (PR #218).
   - Add a base class for workflows (PR #188).
   - Add the ABFE workflow (PR #114).
+  - Add R_c and A_c for "fractional equilibration time" convergence analysis 
+    (issue #104, PR #239)
   - Add the keyword arg final_error to plot_convergence (#249)
 
 

diff --git a/docs/convergence.rst b/docs/convergence.rst
@@ -33,6 +33,52 @@ Will give a plot looks like this
    A convergence plot of showing that the forward and backward has converged
    fully.
 
+Fractional equilibration time
+-----------------------------
+
+Another way of assessing whether the simulation has converged is to check the
+energy files. In [Fan2021]_, :math:`R_c` and
+:math:`A_c` are two criteria of checking the
+convergence. :func:`~alchemlyb.convergence.fwdrev_cumavg_Rc` takes a decorrelated
+:class:`pandas.Series` as input and gives the metric
+:math:`R_c`, which is 0 for fully-equilibrated
+simulation and 1 for fully-unequilibrated simulation. ::
+
+    >>> from alchemtest.gmx import load_ABFE
+    >>> from alchemlyb.parsing.gmx import extract_dHdl
+    >>> from alchemlyb.preprocessing import decorrelate_dhdl, dhdl2series
+    >>> from alchemlyb.convergence import fwdrev_cumavg_Rc
+    >>> from alchemlyb.visualisation import plot_convergence
+
+    >>> file = load_ABFE().data['ligand'][0]
+    >>> dhdl = extract_dHdl(file, T=300)
+    >>> decorrelated = decorrelate_dhdl(dhdl, remove_burnin=True)
+    >>> R_c, running_average = fwdrev_cumavg_Rc(dhdl2series(decorrelated), tol=2)
+    >>> print(R_c)
+    0.04
+    >>> ax = plot_convergence(running_average, final_error=2)
+    >>> ax.set_ylabel("$\partial H/\partial\lambda$ (in kT)")
+
+
+Will give a plot like this.
+
+.. figure:: images/R_c.png
+
+
+The :func:`~alchemlyb.convergence.A_c` on the other hand, takes in a list of
+decorrelated :class:`pandas.Series` and gives a metric of how converged the set
+is, where 0 fully-unequilibrated and 1.0 is fully-equilibrated. ::
+
+    >>> from alchemlyb.convergence import A_c
+    >>> dhdl_list = []
+    >>> for file in load_ABFE().data['ligand']:
+    >>>     dhdl = extract_dHdl(file, T=300)
+    >>>     decorrelated = decorrelate_dhdl(dhdl, remove_burnin=True)
+    >>>     decorrelated = dhdl2series(decorrelated)
+    >>>     dhdl_list.append(decorrelated)
+    >>> value = A_c(dhdl_list)
+    0.7085
+
 
 Convergence functions
 ---------------------
@@ -44,5 +90,14 @@ The currently available connvergence functions:
 .. autosummary::
     :toctree: convergence
 
-    convergence
+    forward_backward_convergence
+    fwdrev_cumavg_Rc
+    A_c
+
+References
+----------
 
+.. [Fan2021] Fan, S., Nedev, H., Vijayan, R., Iorga, B.I., and Beckstein, O.
+    (2021). Precise force-field-based calculations of octanol-water partition
+    coefficients for the SAMPL7 molecules. Journal of Computer-Aided Molecular
+    Design 35, 853–87
diff --git a/docs/images/R_c.png b/docs/images/R_c.png
diff --git a/src/alchemlyb/convergence/__init__.py b/src/alchemlyb/convergence/__init__.py
@@ -1 +1 @@
-from .convergence import forward_backward_convergence
+from .convergence import forward_backward_convergence, fwdrev_cumavg_Rc, A_c
diff --git a/src/alchemlyb/convergence/convergence.py b/src/alchemlyb/convergence/convergence.py
@@ -7,6 +7,7 @@
 from ..estimators import BAR, TI, FEP_ESTIMATORS, TI_ESTIMATORS
 from ..estimators import AutoMBAR as MBAR
 from .. import concat
+from ..postprocessors.units import to_kT
 
 
 def forward_backward_convergence(df_list, estimator='MBAR', num=10, **kwargs):
@@ -63,7 +64,7 @@ def forward_backward_convergence(df_list, estimator='MBAR', num=10, **kwargs):
     .. versionadded:: 0.6.0
     .. versionchanged:: 1.0.0
        The ``estimator`` accepts uppercase input.
-       The default for using ``estimator='MBAR'`` was changed from 
+       The default for using ``estimator='MBAR'`` was changed from
        :class:`~alchemlyb.estimators.MBAR` to :class:`~alchemlyb.estimators.AutoMBAR`.
 
     '''
@@ -137,3 +138,175 @@ def forward_backward_convergence(df_list, estimator='MBAR', num=10, **kwargs):
          'data_fraction': [i / num for i in range(1, num + 1)]})
     convergence.attrs = df_list[0].attrs
     return convergence
+
+def _cummean(vals, out_length):
+    '''The cumulative mean of an array.
+
+    This function computes the cumulative mean and shapes the result to the
+    desired length.
+
+    Parameters
+    ----------
+    vals : numpy.array
+        The one-dimensional input array.
+    out_length : int
+        The length of the output array.
+
+    Returns
+    -------
+    numpy.array
+        The one-dimensional input array with length of total.
+
+    Note
+    ----
+    If the length of the input series is smaller than the ``out_length``, the
+    length of the output array is the same as the input series.
+
+
+    .. versionadded:: 1.0.0
+
+    '''
+    in_length = len(vals)
+    if in_length < out_length:
+        out_length = in_length
+    block = in_length // out_length
+    reshape = vals[: block*out_length].reshape(block, out_length)
+    mean = np.mean(reshape, axis=0)
+    result = np.cumsum(mean) / np.arange(1, out_length+1)
+    return result
+
+def fwdrev_cumavg_Rc(series, precision=0.01, tol=2):
+    '''Generate the convergence criteria :math:`R_c` for a single simulation.
+
+    The input will be pandas.Series generated by
+    :func:`~alchemlyb.preprocessing.subsampling.decorrelate_u_nk` or
+    :func:`~alchemlyb.preprocessing.subsampling.decorrelate_dhdl`.
+
+    The output will the float :math:`R_c`. :math:`R_c = 0` indicates that the system is well
+    equilibrated right from the beginning while :math:`R_c = 1` signifies that the
+    whole trajectory is not equilibrated.
+
+    Parameters
+    ----------
+    series : pandas.Series
+        The input energy array.
+    precision : float
+        The precision of the output :math:`R_c`. To speed the calculation up, the data
+        has been block-averaged before doing the calculation, the size of the
+        block is controlled by the desired precision.
+    tol : float
+        Tolerance of the convergence check in :math:`kT`.
+
+    Returns
+    -------
+    float
+        Convergence time fraction.
+    DataFrame
+        The DataFrame with moving average. ::
+
+                Forward  Backward  data_fraction
+            0  3.016442  3.065176            0.1
+            1  3.078106  3.078567            0.2
+            2  3.072561  3.047357            0.3
+            3  3.048325  3.057527            0.4
+            4  3.049769  3.037454            0.5
+            5  3.034078  3.040484            0.6
+            6  3.043274  3.032495            0.7
+            7  3.035460  3.036670            0.8
+            8  3.042032  3.046597            0.9
+            9  3.044149  3.044385            1.0
+
+    Note
+    ----
+    This function computes :math:`R_c` from equation 16 from
+    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8397498/#FD16. The code is
+    modified based on Shujie Fan's (@VOD555) work.
+   
+   Please cite [Fan2021]_ when using this function.
+   
+
+    .. versionadded:: 1.0.0
+    '''
+    series = to_kT(series)
+    array = series.to_numpy()
+    out_length = int(1 / precision)
+    g_forward = _cummean(array, out_length)
+    g_backward = _cummean(array[::-1], out_length)
+    length = len(g_forward)
+
+    convergence = pd.DataFrame(
+        {'Forward': g_forward,
+         'Backward': g_backward,
+         'data_fraction': [i / length for i in range(1, length + 1)]})
+    convergence.attrs = series.attrs
+
+    # Final value
+    g = g_forward[-1]
+    conv = np.logical_and(np.abs(g_forward - g) < tol, np.abs(g_backward - g) < tol)
+    for i in range(out_length):
+        if all(conv[i:]):
+            return i / length, convergence
+    else:
+        # This branch exists as we are truncating the dataset to speed the
+        # calculation up. For example if the dataset has 21 points and the
+        # precision is 0.05. The last point of g_forward is computed using
+        # data[0:20] while the last point of g_backward is computed using
+        # data[1:21]. Thus, the last point of g_forward and g_backward are not
+        # the same as this branch will be triggered.
+        return 1.0, convergence
+
+def A_c(series_list, precision=0.01, diff=2):
+    '''Generate the ensemble convergence criteria :math:`A_c` for a set of simulations.
+
+    The input is a list of pandas.Series generated by
+    :func:`~alchemlyb.preprocessing.subsampling.decorrelate_u_nk` or
+    :func:`~alchemlyb.preprocessing.subsampling.decorrelate_dhdl`.
+
+    The output will the float :math:`A_c`. :math:`A_c` is a number between 0 and 1 that can be
+    interpreted as the ratio of the total equilibrated simulation time to the
+    whole simulation time for a full set of simulations. :math:`A_c = 1` means that all
+    simulation time frames in all windows can be considered equilibrated, while
+    :math:`A_c = 0` indicates that nothing is equilibrated.
+
+    Parameters
+    ----------
+    series_list : list
+        A list of pandas.Series energy array.
+    precision : float
+        The precision of the output :math:`A_c`. To speed the calculation up, the data
+        has been block-averaged before doing the calculation, the size of the
+        block is controlled by the desired precision.
+    diff : float
+        Tolerance of the convergence check in :math:`kT`.
+
+    Returns
+    -------
+    float
+        The area under curve for convergence time fraction.
+
+    Note
+    ----
+    This function computes :math:`A_c` from equation 18 from
+    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8397498/#FD18. See also
+    :func:`~alchemlyb.convergence.fwdrev_cumavg_Rc`.
+
+    Please cite [Fan2021]_ when using this function.
+
+
+    .. versionadded:: 1.0.0
+    '''
+    logger = logging.getLogger('alchemlyb.convergence.A_c')
+    n_R_c = len(series_list)
+    R_c_list = [fwdrev_cumavg_Rc(series, precision, diff)[0] for series in series_list]
+    logger.info(f'R_c list: {R_c_list}')
+    # Integrate the R_c_list <= R_c over the range of 0 to 1
+    array_01 = np.hstack((R_c_list, [0, 1]))
+    sorted_array = np.sort(np.unique(array_01))
+    result = 0
+    for i, element in enumerate(sorted_array[::-1]):
+        if i == 0:
+            continue
+        else:
+            d_R_c = sorted_array[-i] - sorted_array[-i-1]
+            result += d_R_c * sum(R_c_list <= element) / n_R_c
+    return result
diff --git a/src/alchemlyb/preprocessing/subsampling.py b/src/alchemlyb/preprocessing/subsampling.py
@@ -494,9 +494,9 @@ def equilibrium_detection(df, series=None, lower=None, upper=None, step=None,
 
 
     .. versionchanged:: 1.0.0
-       Add the drop_duplicates and sort keyword to unify the behaviour between
-    :func:`~alchemlyb.preprocessing.subsampling.statistical_inefficiency` or
-    :func:`~alchemlyb.preprocessing.subsampling.equilibrium_detection`.
+        Add the drop_duplicates and sort keyword to unify the behaviour between
+        :func:`~alchemlyb.preprocessing.subsampling.statistical_inefficiency` or
+        :func:`~alchemlyb.preprocessing.subsampling.equilibrium_detection`.
 
     """
     df, series = _prepare_input(df, series, drop_duplicates, sort)

diff --git a/src/alchemlyb/tests/test_convergence.py b/src/alchemlyb/tests/test_convergence.py
@@ -1,8 +1,12 @@
+import numpy as np
+import pandas as pd
 import pytest
 
 from alchemtest.gmx import load_benzene
 from alchemlyb.parsing import gmx
-from alchemlyb.convergence import forward_backward_convergence
+from alchemlyb.convergence import forward_backward_convergence, fwdrev_cumavg_Rc, A_c
+from alchemlyb.convergence.convergence import _cummean
+
 
 @pytest.fixture()
 def gmx_benzene():
@@ -60,3 +64,46 @@ def test_convergence_method(gmx_benzene):
     dHdl, u_nk = gmx_benzene
     convergence = forward_backward_convergence(u_nk, 'MBAR', num=2, method='adaptive')
     assert len(convergence) == 2
+
+def test_cummean_short():
+    '''Test the case where the input is shorter than the expected output'''
+    value = _cummean(np.empty(10), 100)
+    assert len(value) == 10
+
+def test_cummean_long():
+    '''Test the case where the input is longer than the expected output'''
+    value = _cummean(np.empty(20), 10)
+    assert len(value) == 10
+
+def test_cummean_long_none_integter():
+    '''Test the case where the input is not a integer multiple of the expected output'''
+    value = _cummean(np.empty(25), 10)
+    assert len(value) == 10
+
+def test_R_c_converged():
+    data = pd.Series(data=[0,]*100)
+    data.attrs['temperature'] = 310
+    data.attrs['energy_unit'] = 'kcal/mol'
+    value, running_average = fwdrev_cumavg_Rc(data)
+    np.testing.assert_allclose(value, 0.0)
+
+def test_R_c_notconverged():
+    data = pd.Series(data=range(21))
+    data.attrs['temperature'] = 310
+    data.attrs['energy_unit'] = 'kcal/mol'
+    value, running_average = fwdrev_cumavg_Rc(data, tol=0.1, precision=0.05)
+    np.testing.assert_allclose(value, 1.0)
+
+def test_R_c_real():
+    data = pd.Series(data=np.hstack((range(10), [4.5,]*10)))
+    data.attrs['temperature'] = 310
+    data.attrs['energy_unit'] = 'kcal/mol'
+    value, running_average = fwdrev_cumavg_Rc(data)
+    np.testing.assert_allclose(value, 0.35)
+
+def test_A_c_real():
+    data = pd.Series(data=np.hstack((range(10), [4.5,]*10)))
+    data.attrs['temperature'] = 310
+    data.attrs['energy_unit'] = 'kcal/mol'
+    value = A_c([data, ] * 2)
+    np.testing.assert_allclose(value, 0.65)
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		from .convergence import forward_backward_convergence
		from .convergence import forward_backward_convergence, fwdrev_cumavg_Rc, A_c