alchemistry · xiki-tempula · Sep 14, 2024 · Jul 24, 2024 · Jul 24, 2024 · Jul 24, 2024
diff --git a/CHANGES b/CHANGES
@@ -13,6 +13,14 @@ The rules for this file:
   * release numbers follow "Semantic Versioning" https://semver.org
 
 ------------------------------------------------------------------------------
+??/??/2024 jaclark5
+
+  * 2.4.0
+
+Enhancements
+  - Addition of `block_average` function in `convergence` and
+    `plot_block_average` function in `visualisation` (Issue #380, PR #381)
+
 
 08/24/2024 xiki-tempula
 

diff --git a/docs/convergence.rst b/docs/convergence.rst
@@ -75,6 +75,31 @@ is, where 0 fully-unequilibrated and 1.0 is fully-equilibrated. ::
     >>> value = A_c(dhdl_list, tol=2)
     0.7085
 
+Block Average
+-------------
+If one obtains suspicious results from the forward / backward convergence plot,
+it may be useful to view the block averages of the change in free energy using
+:func:`~alchemlyb.convergence.block_average` and
+:func:`~alchemlyb.visualisation.plot_block_average` over the course of each
+step in lambda individually. The following example is for :math:`\lambda` = 0:
+
+    >>> from alchemtest.gmx import load_benzene
+    >>> from alchemlyb.parsing.gmx import extract_u_nk
+    >>> from alchemlyb.visualisation import plot_block_average
+    >>> from alchemlyb.convergence import block_average
+
+    >>> bz = load_benzene().data
+    >>> data_list = [extract_u_nk(xvg, T=300) for xvg in bz['Coulomb']]
+    >>> df = block_average(data_list, 'MBAR')
+    >>> ax = plot_block_average(df)
+    >>> ax.figure.savefig('dF_t_block_average.png')
+
+This will give a plot that looks like this:
+
+.. figure:: images/dF_t_block_average.png
+
+   A block average plot showing the free energy estimate obtained from each
+   block of the trajectory.
 
 Convergence functions
 ---------------------

diff --git a/docs/convergence/alchemlyb.convergence.convergence.rst b/docs/convergence/alchemlyb.convergence.convergence.rst
@@ -15,3 +15,5 @@ All convergence functions are located in this submodule but for convenience they
 .. autofunction:: alchemlyb.convergence.fwdrev_cumavg_Rc
 
 .. autofunction:: alchemlyb.convergence.A_c		  
+
+.. autofunction:: alchemlyb.convergence.block_average
diff --git a/docs/images/dF_t_block_average.png b/docs/images/dF_t_block_average.png
diff --git a/docs/visualisation.rst b/docs/visualisation.rst
@@ -19,6 +19,7 @@ Plotting Functions
     plot_ti_dhdl
     plot_dF_state
     plot_convergence
+    plot_block_average
 
 .. _plot_overlap_matrix:
 

diff --git a/docs/visualisation/alchemlyb.visualisation.plot_block_average.rst b/docs/visualisation/alchemlyb.visualisation.plot_block_average.rst
@@ -0,0 +1,6 @@
+alchemlyb.visualisation.plot\_block\_average
+=============================================
+
+.. currentmodule:: alchemlyb.visualisation
+
+.. autofunction:: plot_block_average
diff --git a/src/alchemlyb/convergence/__init__.py b/src/alchemlyb/convergence/__init__.py
@@ -1 +1 @@
-from .convergence import forward_backward_convergence, fwdrev_cumavg_Rc, A_c
+from .convergence import forward_backward_convergence, fwdrev_cumavg_Rc, A_c, block_average
diff --git a/src/alchemlyb/convergence/convergence.py b/src/alchemlyb/convergence/convergence.py
@@ -30,7 +30,8 @@ def forward_backward_convergence(
     Parameters
     ----------
     df_list : list
-        List of DataFrame of either dHdl or u_nk.
+        List of DataFrame of either dHdl or u_nk, where each represents a
+        different value of lambda.
     estimator : {'MBAR', 'BAR', 'TI'}
         Name of the estimators.
         See the important note below on the use of "MBAR".
@@ -94,7 +95,16 @@ def forward_backward_convergence(
         # select estimator class by name
         my_estimator = estimators_dispatch[estimator](**kwargs)
         logger.info(f"Use {estimator} estimator for convergence analysis.")
-
+
+    # Check that each df in the list has only one value of lambda
+    for i, df in enumerate(df_list):
+        lambda_values = list(set([x[1:] for x in df.index.to_numpy()]))
+        if len(lambda_values) > 1:
+            ind  = [j for j in range(len(lambda_values[0])) if len(list(set([x[j] for x in lambda_values]))) > 1][0]
+            raise ValueError(
+                "Provided DataFrame, df_list[{}] has more than one lambda value in df.index[{}]".format(i, ind)
+            )
+
     logger.info("Begin forward analysis")
     forward_list = []
     forward_error_list = []
@@ -262,7 +272,7 @@ def fwdrev_cumavg_Rc(series, precision=0.01, tol=2):
     float
         Convergence time fraction :math:`R_c` [Fan2021]_
     :class:`pandas.DataFrame`
-        The DataFrame with moving average. ::
+        The DataFrame with block average. ::
 
                 Forward  Backward  data_fraction
             0  3.016442  3.065176            0.1
@@ -389,3 +399,122 @@ def A_c(series_list, precision=0.01, tol=2):
             d_R_c = sorted_array[-i] - sorted_array[-i - 1]
             result += d_R_c * sum(R_c_list <= element) / n_R_c
     return result
+
+
+def block_average(df_list, estimator="MBAR", num=10, **kwargs):
+    """Free energy estimate for portions of the trajectory.
+
+    Split each trajectory into `num` consecutive, equally sized blocks in
+    time and estimate the free energy from every block on its own.
+    For example, setting `num` to 10 gives the free energy estimate from
+    the first 10% of the data alone, then from the next 10%, and so on.
+    The shortest DataFrame sets the block size; leftover rows are ignored.
+
+    Parameters
+    ----------
+    df_list : list
+        List of DataFrame of either dHdl or u_nk, where each represents a
+        different value of lambda.
+    estimator : {'MBAR', 'BAR', 'TI'}
+        Name of the estimator. With "BAR", at most two DataFrames
+        (a lambda state and its forward adjacent state) may be supplied.
+    num : int
+        The number of blocks; must be a positive integer.
+    kwargs : dict
+        Keyword arguments to be passed to the estimator.
+
+    Returns
+    -------
+    :class:`pandas.DataFrame`
+        The DataFrame with one row of estimate data per block. ::
+
+               FE             FE_Error
+            0  3.016442       0.052748
+            1  3.078106       0.037170
+            2  3.072561       0.030186
+            3  3.048325       0.026070
+            4  3.049769       0.023359
+            5  3.034078       0.021260
+            6  3.043274       0.019642
+            7  3.035460       0.018340
+            8  3.042032       0.017319
+            9  3.044149       0.016405
+
+    Raises
+    ------
+    ValueError
+        If `estimator` is unknown, `num` < 1, a DataFrame holds more than
+        one lambda value, or "BAR" is given more than two DataFrames.
+
+    .. versionadded:: 2.4.0
+
+    """
+    logger.info("Start block averaging analysis.")
+    logger.info("Check data availability.")
+    if estimator not in (FEP_ESTIMATORS + TI_ESTIMATORS):
+        msg = f"Estimator {estimator} is not available in {FEP_ESTIMATORS + TI_ESTIMATORS}."
+        logger.error(msg)
+        raise ValueError(msg)
+    if num < 1:
+        raise ValueError(f"num must be a positive integer, got {num}.")
+    # select estimator class by name
+    estimator_fit = estimators_dispatch[estimator](**kwargs).fit
+    logger.info(f"Use {estimator} estimator for convergence analysis.")
+
+    # Check that each df in the list has only one value of lambda
+    for i, df in enumerate(df_list):
+        lambda_values = list({x[1:] for x in df.index.to_numpy()})
+        if len(lambda_values) > 1:
+            ind = [j for j in range(len(lambda_values[0])) if len({x[j] for x in lambda_values}) > 1][0]
+            raise ValueError(
+                "Provided DataFrame, df_list[{}] has more than one lambda value in df.index[{}]".format(i, ind)
+            )
+
+    if estimator in ["BAR"] and len(df_list) > 2:
+        raise ValueError(
+            "Restrict to two DataFrames, one with a fep-lambda value and one its forward adjacent state for a "
+            "meaningful result."
+        )
+
+    # Choose length of comparison trajectory
+    lx_lambdas = [len(x) for x in df_list]
+    lx = min(lx_lambdas)
+    if len(set(lx_lambdas)) > 1:
+        warn(
+            "Not all trajectories for each lambda value are the same length, using minimum length for analysis: {}".format(
+                " ".join(f"len(df[{i}])={n}" for i, n in enumerate(lx_lambdas))
+            ))
+    block = lx // num  # number of rows taken from each DataFrame per block
+
+    logger.info("Begin Moving Average Analysis")
+    average_list = []
+    average_error_list = []
+    # range(1, num + 1) so that all `num` blocks, including the last, are used
+    for i in range(1, num + 1):
+        logger.info("Moving Average Analysis: {:.2f}%".format(100 * i / num))
+        ind1, ind2 = block * (i - 1), block * i
+        sample = concat([data.iloc[ind1:ind2] for data in df_list])
+        result = estimator_fit(sample)
+
+        average_list.append(result.delta_f_.iloc[0, -1])
+        if estimator.lower() == "bar":
+            # combine the adjacent-state errors (first off-diagonal) in quadrature
+            d_delta_f = result.d_delta_f_
+            error = np.sqrt(
+                sum(d_delta_f.iloc[k, k + 1] ** 2 for k in range(len(d_delta_f) - 1))
+            )
+            average_error_list.append(error)
+        else:
+            average_error_list.append(result.d_delta_f_.iloc[0, -1])
+        logger.info(
+            "{:.2f} +/- {:.2f} kT".format(average_list[-1], average_error_list[-1])
+        )
+
+    convergence = pd.DataFrame(
+        {
+            "FE": average_list,
+            "FE_Error": average_error_list,
+        }
+    )
+    convergence.attrs = df_list[0].attrs
+    return convergence
diff --git a/src/alchemlyb/estimators/bar_.py b/src/alchemlyb/estimators/bar_.py
@@ -88,7 +88,7 @@ def fit(self, u_nk):
         # sort by state so that rows from same state are in contiguous blocks
         u_nk = u_nk.sort_index(level=u_nk.index.names[1:])
 
-        # get a list of the lambda states
+        # get a list of the lambda states that are sampled
         self._states_ = u_nk.columns.values.tolist()
 
         # group u_nk by lambda states
@@ -97,18 +97,21 @@ def fit(self, u_nk):
             (len(groups.get_group(i)) if i in groups.groups else 0)
             for i in u_nk.columns
         ]
-
+        states = [x for i, x in enumerate(self._states_) if N_k[i] > 0]
         # Now get free energy differences and their uncertainties for each step
         deltas = np.array([])
         d_deltas = np.array([])
         for k in range(len(N_k) - 1):
+            if N_k[k] == 0 or N_k[k + 1] == 0:
+                continue
             # get us from lambda step k
             uk = groups.get_group(self._states_[k])
             # get w_F
             w_f = uk.iloc[:, k + 1] - uk.iloc[:, k]
 
             # get us from lambda step k+1
             uk1 = groups.get_group(self._states_[k + 1])
+
             # get w_R
             w_r = uk1.iloc[:, k] - uk1.iloc[:, k + 1]
 
@@ -150,13 +153,11 @@ def fit(self, u_nk):
             ad_delta += np.diagflat(np.array(dout), k=j + 1)
 
         # yield standard delta_f_ free energies between each state
-        self._delta_f_ = pd.DataFrame(
-            adelta - adelta.T, columns=self._states_, index=self._states_
-        )
+        self._delta_f_ = pd.DataFrame(adelta - adelta.T, columns=states, index=states)
 
         # yield standard deviation d_delta_f_ between each state
         self._d_delta_f_ = pd.DataFrame(
-            np.sqrt(ad_delta + ad_delta.T), columns=self._states_, index=self._states_
+            np.sqrt(ad_delta + ad_delta.T), columns=states, index=states
         )
         self._delta_f_.attrs = u_nk.attrs
         self._d_delta_f_.attrs = u_nk.attrs

diff --git a/src/alchemlyb/estimators/mbar_.py b/src/alchemlyb/estimators/mbar_.py
@@ -33,7 +33,7 @@ class MBAR(BaseEstimator, _EstimatorMixOut):
         .. versionchanged:: 2.3.0
            The new default is now "BAR" as it provides a substantial speedup
            over the previous default `None`.
-           
+
 
     method : str, optional, default="robust"
         The optimization routine to use.  This can be any of the methods
@@ -135,6 +135,25 @@ def fit(self, u_nk):
             )
             bar.fit(u_nk)
             initial_f_k = bar.delta_f_.iloc[0, :]
+            states = [
+                x
+                for i, x in enumerate(self._states_[:-1])
+                if N_k[i] > 0 and N_k[i + 1] > 0
+            ]
+            if len(bar.delta_f_.iloc[0, :]) != len(self._states_):
+                states = [
+                    x
+                    for i, x in enumerate(self._states_[:-1])
+                    if N_k[i] > 0 and N_k[i + 1] > 0
+                ]
+                initial_f_k = pd.Series(
+                    [
+                        initial_f_k.loc(x) if x in states else np.nan
+                        for x in self._states_
+                    ],
+                    index=self._states_,
+                    dtype=float,
+                )
         else:
             initial_f_k = self.initial_f_k
Original file line number	Diff line number	Diff line change
Expand Up		@@ -15,3 +15,5 @@ All convergence functions are located in this submodule but for convenience they
		.. autofunction:: alchemlyb.convergence.fwdrev_cumavg_Rc

		.. autofunction:: alchemlyb.convergence.A_c

		.. autofunction:: alchemlyb.convergence.block_average
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		from .convergence import forward_backward_convergence, fwdrev_cumavg_Rc, A_c
		from .convergence import forward_backward_convergence, fwdrev_cumavg_Rc, A_c, block_average