Skip to content

Commit

Permalink
add R_c (convergence fraction) convergence analysis (#239)
Browse files Browse the repository at this point in the history
* Fix #104
* add @VOD555 's fractional equilibration time analysis for convergence (Fan et al 2021)
* reimplemented the R_c (based on code provided by @VOD555) and the A_c based on my understanding of the paper
* improved performance of original implementation by doing initial block average for "precision"-size blocks
* add docs with example graph
* add tests
* update CHANGES

Co-authored-by: Zhiyi Wu <[email protected]>
Co-authored-by: Shujie Fan <[email protected]>
Co-authored-by: Oliver Beckstein <[email protected]>
  • Loading branch information
4 people authored Oct 31, 2022
1 parent 3809fdf commit cbdbfcd
Show file tree
Hide file tree
Showing 7 changed files with 284 additions and 7 deletions.
2 changes: 2 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ Enhancements
- Add remove_burnin keyword to decorrelate_u_nk and decorrelate_dhdl (PR #218).
- Add a base class for workflows (PR #188).
- Add the ABFE workflow (PR #114).
- Add R_c and A_c for "fractional equilibration time" convergence analysis
(issue #104, PR #239)
- Add the keyword arg final_error to plot_convergence (#249)


Expand Down
57 changes: 56 additions & 1 deletion docs/convergence.rst
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,52 @@ Will give a plot looks like this
A convergence plot of showing that the forward and backward has converged
fully.

Fractional equilibration time
-----------------------------

Another way of assessing whether the simulation has converged is to check the
energy files. In [Fan2021]_, :math:`R_c` and
:math:`A_c` are two criteria of checking the
convergence. :func:`~alchemlyb.convergence.fwdrev_cumavg_Rc` takes a decorrelated
:class:`pandas.Series` as input and gives the metric
:math:`R_c`, which is 0 for fully-equilibrated
simulation and 1 for fully-unequilibrated simulation. ::

>>> from alchemtest.gmx import load_ABFE
>>> from alchemlyb.parsing.gmx import extract_dHdl
>>> from alchemlyb.preprocessing import decorrelate_dhdl, dhdl2series
>>> from alchemlyb.convergence import fwdrev_cumavg_Rc
>>> from alchemlyb.visualisation import plot_convergence

>>> file = load_ABFE().data['ligand'][0]
>>> dhdl = extract_dHdl(file, T=300)
>>> decorrelated = decorrelate_dhdl(dhdl, remove_burnin=True)
>>> R_c, running_average = fwdrev_cumavg_Rc(dhdl2series(decorrelated), tol=2)
>>> print(R_c)
0.04
>>> ax = plot_convergence(running_average, final_error=2)
>>> ax.set_ylabel("$\partial H/\partial\lambda$ (in kT)")


Will give a plot like this.

.. figure:: images/R_c.png


The :func:`~alchemlyb.convergence.A_c` on the other hand, takes in a list of
decorrelated :class:`pandas.Series` and gives a metric of how converged the set
is, where 0 fully-unequilibrated and 1.0 is fully-equilibrated. ::

>>> from alchemlyb.convergence import A_c
>>> dhdl_list = []
>>> for file in load_ABFE().data['ligand']:
>>> dhdl = extract_dHdl(file, T=300)
>>> decorrelated = decorrelate_dhdl(dhdl, remove_burnin=True)
>>> decorrelated = dhdl2series(decorrelated)
>>> dhdl_list.append(decorrelated)
>>> value = A_c(dhdl_list)
0.7085


Convergence functions
---------------------
Expand All @@ -44,5 +90,14 @@ The currently available connvergence functions:
.. autosummary::
:toctree: convergence

convergence
forward_backward_convergence
fwdrev_cumavg_Rc
A_c

References
----------

.. [Fan2021] Fan, S., Nedev, H., Vijayan, R., Iorga, B.I., and Beckstein, O.
(2021). Precise force-field-based calculations of octanol-water partition
coefficients for the SAMPL7 molecules. Journal of Computer-Aided Molecular
Design 35, 853–87
Binary file added docs/images/R_c.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion src/alchemlyb/convergence/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from .convergence import forward_backward_convergence
from .convergence import forward_backward_convergence, fwdrev_cumavg_Rc, A_c
175 changes: 174 additions & 1 deletion src/alchemlyb/convergence/convergence.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from ..estimators import BAR, TI, FEP_ESTIMATORS, TI_ESTIMATORS
from ..estimators import AutoMBAR as MBAR
from .. import concat
from ..postprocessors.units import to_kT


def forward_backward_convergence(df_list, estimator='MBAR', num=10, **kwargs):
Expand Down Expand Up @@ -63,7 +64,7 @@ def forward_backward_convergence(df_list, estimator='MBAR', num=10, **kwargs):
.. versionadded:: 0.6.0
.. versionchanged:: 1.0.0
The ``estimator`` accepts uppercase input.
The default for using ``estimator='MBAR'`` was changed from
The default for using ``estimator='MBAR'`` was changed from
:class:`~alchemlyb.estimators.MBAR` to :class:`~alchemlyb.estimators.AutoMBAR`.
'''
Expand Down Expand Up @@ -137,3 +138,175 @@ def forward_backward_convergence(df_list, estimator='MBAR', num=10, **kwargs):
'data_fraction': [i / num for i in range(1, num + 1)]})
convergence.attrs = df_list[0].attrs
return convergence

def _cummean(vals, out_length):
'''The cumulative mean of an array.
This function computes the cumulative mean and shapes the result to the
desired length.
Parameters
----------
vals : numpy.array
The one-dimensional input array.
out_length : int
The length of the output array.
Returns
-------
numpy.array
The one-dimensional input array with length of total.
Note
----
If the length of the input series is smaller than the ``out_length``, the
length of the output array is the same as the input series.
.. versionadded:: 1.0.0
'''
in_length = len(vals)
if in_length < out_length:
out_length = in_length
block = in_length // out_length
reshape = vals[: block*out_length].reshape(block, out_length)
mean = np.mean(reshape, axis=0)
result = np.cumsum(mean) / np.arange(1, out_length+1)
return result

def fwdrev_cumavg_Rc(series, precision=0.01, tol=2):
'''Generate the convergence criteria :math:`R_c` for a single simulation.
The input will be pandas.Series generated by
:func:`~alchemlyb.preprocessing.subsampling.decorrelate_u_nk` or
:func:`~alchemlyb.preprocessing.subsampling.decorrelate_dhdl`.
The output will the float :math:`R_c`. :math:`R_c = 0` indicates that the system is well
equilibrated right from the beginning while :math:`R_c = 1` signifies that the
whole trajectory is not equilibrated.
Parameters
----------
series : pandas.Series
The input energy array.
precision : float
The precision of the output :math:`R_c`. To speed the calculation up, the data
has been block-averaged before doing the calculation, the size of the
block is controlled by the desired precision.
tol : float
Tolerance of the convergence check in :math:`kT`.
Returns
-------
float
Convergence time fraction.
DataFrame
The DataFrame with moving average. ::
Forward Backward data_fraction
0 3.016442 3.065176 0.1
1 3.078106 3.078567 0.2
2 3.072561 3.047357 0.3
3 3.048325 3.057527 0.4
4 3.049769 3.037454 0.5
5 3.034078 3.040484 0.6
6 3.043274 3.032495 0.7
7 3.035460 3.036670 0.8
8 3.042032 3.046597 0.9
9 3.044149 3.044385 1.0
Note
----
This function computes :math:`R_c` from equation 16 from
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8397498/#FD16. The code is
modified based on Shujie Fan's (@VOD555) work.
Please cite [Fan2021]_ when using this function.
.. versionadded:: 1.0.0
'''
series = to_kT(series)
array = series.to_numpy()
out_length = int(1 / precision)
g_forward = _cummean(array, out_length)
g_backward = _cummean(array[::-1], out_length)
length = len(g_forward)

convergence = pd.DataFrame(
{'Forward': g_forward,
'Backward': g_backward,
'data_fraction': [i / length for i in range(1, length + 1)]})
convergence.attrs = series.attrs

# Final value
g = g_forward[-1]
conv = np.logical_and(np.abs(g_forward - g) < tol, np.abs(g_backward - g) < tol)
for i in range(out_length):
if all(conv[i:]):
return i / length, convergence
else:
# This branch exists as we are truncating the dataset to speed the
# calculation up. For example if the dataset has 21 points and the
# precision is 0.05. The last point of g_forward is computed using
# data[0:20] while the last point of g_backward is computed using
# data[1:21]. Thus, the last point of g_forward and g_backward are not
# the same as this branch will be triggered.
return 1.0, convergence

def A_c(series_list, precision=0.01, diff=2):
'''Generate the ensemble convergence criteria :math:`A_c` for a set of simulations.
The input is a list of pandas.Series generated by
:func:`~alchemlyb.preprocessing.subsampling.decorrelate_u_nk` or
:func:`~alchemlyb.preprocessing.subsampling.decorrelate_dhdl`.
The output will the float :math:`A_c`. :math:`A_c` is a number between 0 and 1 that can be
interpreted as the ratio of the total equilibrated simulation time to the
whole simulation time for a full set of simulations. :math:`A_c = 1` means that all
simulation time frames in all windows can be considered equilibrated, while
:math:`A_c = 0` indicates that nothing is equilibrated.
Parameters
----------
series_list : list
A list of pandas.Series energy array.
precision : float
The precision of the output :math:`A_c`. To speed the calculation up, the data
has been block-averaged before doing the calculation, the size of the
block is controlled by the desired precision.
diff : float
Tolerance of the convergence check in :math:`kT`.
Returns
-------
float
The area under curve for convergence time fraction.
Note
----
This function computes :math:`A_c` from equation 18 from
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8397498/#FD18. See also
:func:`~alchemlyb.convergence.fwdrev_cumavg_Rc`.
Please cite [Fan2021]_ when using this function.
.. versionadded:: 1.0.0
'''
logger = logging.getLogger('alchemlyb.convergence.A_c')
n_R_c = len(series_list)
R_c_list = [fwdrev_cumavg_Rc(series, precision, diff)[0] for series in series_list]
logger.info(f'R_c list: {R_c_list}')
# Integrate the R_c_list <= R_c over the range of 0 to 1
array_01 = np.hstack((R_c_list, [0, 1]))
sorted_array = np.sort(np.unique(array_01))
result = 0
for i, element in enumerate(sorted_array[::-1]):
if i == 0:
continue
else:
d_R_c = sorted_array[-i] - sorted_array[-i-1]
result += d_R_c * sum(R_c_list <= element) / n_R_c
return result
6 changes: 3 additions & 3 deletions src/alchemlyb/preprocessing/subsampling.py
Original file line number Diff line number Diff line change
Expand Up @@ -494,9 +494,9 @@ def equilibrium_detection(df, series=None, lower=None, upper=None, step=None,
.. versionchanged:: 1.0.0
Add the drop_duplicates and sort keyword to unify the behaviour between
:func:`~alchemlyb.preprocessing.subsampling.statistical_inefficiency` or
:func:`~alchemlyb.preprocessing.subsampling.equilibrium_detection`.
Add the drop_duplicates and sort keyword to unify the behaviour between
:func:`~alchemlyb.preprocessing.subsampling.statistical_inefficiency` or
:func:`~alchemlyb.preprocessing.subsampling.equilibrium_detection`.
"""
df, series = _prepare_input(df, series, drop_duplicates, sort)
Expand Down
49 changes: 48 additions & 1 deletion src/alchemlyb/tests/test_convergence.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
import numpy as np
import pandas as pd
import pytest

from alchemtest.gmx import load_benzene
from alchemlyb.parsing import gmx
from alchemlyb.convergence import forward_backward_convergence
from alchemlyb.convergence import forward_backward_convergence, fwdrev_cumavg_Rc, A_c
from alchemlyb.convergence.convergence import _cummean


@pytest.fixture()
def gmx_benzene():
Expand Down Expand Up @@ -60,3 +64,46 @@ def test_convergence_method(gmx_benzene):
dHdl, u_nk = gmx_benzene
convergence = forward_backward_convergence(u_nk, 'MBAR', num=2, method='adaptive')
assert len(convergence) == 2

def test_cummean_short():
'''Test the case where the input is shorter than the expected output'''
value = _cummean(np.empty(10), 100)
assert len(value) == 10

def test_cummean_long():
'''Test the case where the input is longer than the expected output'''
value = _cummean(np.empty(20), 10)
assert len(value) == 10

def test_cummean_long_none_integter():
'''Test the case where the input is not a integer multiple of the expected output'''
value = _cummean(np.empty(25), 10)
assert len(value) == 10

def test_R_c_converged():
data = pd.Series(data=[0,]*100)
data.attrs['temperature'] = 310
data.attrs['energy_unit'] = 'kcal/mol'
value, running_average = fwdrev_cumavg_Rc(data)
np.testing.assert_allclose(value, 0.0)

def test_R_c_notconverged():
data = pd.Series(data=range(21))
data.attrs['temperature'] = 310
data.attrs['energy_unit'] = 'kcal/mol'
value, running_average = fwdrev_cumavg_Rc(data, tol=0.1, precision=0.05)
np.testing.assert_allclose(value, 1.0)

def test_R_c_real():
data = pd.Series(data=np.hstack((range(10), [4.5,]*10)))
data.attrs['temperature'] = 310
data.attrs['energy_unit'] = 'kcal/mol'
value, running_average = fwdrev_cumavg_Rc(data)
np.testing.assert_allclose(value, 0.35)

def test_A_c_real():
data = pd.Series(data=np.hstack((range(10), [4.5,]*10)))
data.attrs['temperature'] = 310
data.attrs['energy_unit'] = 'kcal/mol'
value = A_c([data, ] * 2)
np.testing.assert_allclose(value, 0.65)

0 comments on commit cbdbfcd

Please sign in to comment.