From a25f5d17da6ab9b37c0b44ad6551f5ba8eac0ff1 Mon Sep 17 00:00:00 2001 From: finlayclark Date: Tue, 19 Dec 2023 11:19:10 +0000 Subject: [PATCH 01/15] Add Chodera's method for equilibration detection from pymbar --- deea/_version.py | 2 +- deea/equilibration.py | 12 ++++++++--- deea/ess.py | 47 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+), 4 deletions(-) diff --git a/deea/_version.py b/deea/_version.py index 3dc1f76..296bf34 100644 --- a/deea/_version.py +++ b/deea/_version.py @@ -1 +1 @@ -__version__ = "0.1.0" +__version__ = "0.1.0+12.gb6118c6.dirty" diff --git a/deea/equilibration.py b/deea/equilibration.py index 6afa255..f1448d9 100644 --- a/deea/equilibration.py +++ b/deea/equilibration.py @@ -13,8 +13,10 @@ from ._exceptions import AnalysisError, EquilibrationNotDetectedError, InvalidInputError from ._validation import check_data from .ess import ( + ess_chodera, ess_inter_variance, ess_lugsail_variance, + statistical_inefficiency_chodera, statistical_inefficiency_inter_variance, statistical_inefficiency_lugsail_variance, ) @@ -72,11 +74,11 @@ def get_ess_series( """ # Check that method is valid. method = method.lower() - if method not in ["lugsail", "inter"]: - raise InvalidInputError("method must be either 'lugsail' or 'inter'.") + if method not in ["lugsail", "inter", "chodera"]: + raise InvalidInputError("method must be 'lugsail', 'inter', or 'chodera'.") # Check that the data is valid. - one_dim_allowed = method == "lugsail" + one_dim_allowed = method in ["lugsail", "chodera"] data = check_data(data, one_dim_allowed=one_dim_allowed) n_runs, n_samples = data.shape @@ -114,6 +116,8 @@ def get_ess_series( # Compute the ESS. if method == "lugsail": ess_vals[i] = ess_lugsail_variance(truncated_data, n_pow=n_pow) + elif method == "chodera": + ess_vals[i] = ess_chodera(truncated_data) else: # method == 'inter' ess_vals[i] = ess_inter_variance(truncated_data) # Get the time. @@ -222,6 +226,8 @@ def detect_equilibration_max_ess( equil_data = data[:, equil_idx:] if method == "lugsail": equil_g = statistical_inefficiency_lugsail_variance(equil_data, n_pow=n_pow) + elif method == "chodera": + equil_g = statistical_inefficiency_chodera(equil_data) else: # method == 'inter' equil_g = statistical_inefficiency_inter_variance(equil_data) diff --git a/deea/ess.py b/deea/ess.py index dcedd37..f840574 100644 --- a/deea/ess.py +++ b/deea/ess.py @@ -1,6 +1,7 @@ """Functions to calculate the statistical inefficiency and effective sample size.""" import numpy as np +import pymbar from ._validation import check_data from .variance import inter_run_variance, intra_run_variance, lugsail_variance @@ -58,6 +59,29 @@ def statistical_inefficiency_lugsail_variance( return max(g, 1) +def statistical_inefficiency_chodera(data: np.ndarray) -> float: + """ + Compute the statistical inefficiency of a time series using the + Chodera method. This is applicable to a single run. + + Parameters + ---------- + data : np.ndarray + The time series data. This should have shape + (n_runs, n_samples) or (n_samples,). + + Returns + ------- + float + The statistical inefficiency. + """ + data = check_data(data, one_dim_allowed=True) + if data.shape[0] == 1: + return pymbar.timeseries.statistical_inefficiency(data[0], fast=False) + else: + return pymbar.timeseries.statistical_inefficiency(data.mean(axis=0), fast=False) + + def ess_inter_variance(data: np.ndarray) -> float: """ Compute the effective sample size of a time series by dividing @@ -110,3 +134,26 @@ def ess_lugsail_variance(data: np.ndarray, n_pow: float = 1 / 3) -> float: n_runs, n_samples = data.shape total_samples = n_runs * n_samples return total_samples / statistical_inefficiency_lugsail_variance(data, n_pow=n_pow) + + +def ess_chodera(data: np.ndarray) -> float: + """ + Compute the effective sample size of a time series by dividing + the total number of samples by the statistical inefficiency, where + the statistical inefficiency is calculated using the Chodera method. + + Parameters + ---------- + data : np.ndarray + The time series data. This should have shape + (n_runs, n_samples) or (n_samples,). + + Returns + ------- + float + The effective sample size. + """ + data = check_data(data, one_dim_allowed=True) + n_runs, n_samples = data.shape + total_samples = n_runs * n_samples + return total_samples / statistical_inefficiency_chodera(data) From 2b0ce4dd1612c9032222845289fd9c8fc25fa742 Mon Sep 17 00:00:00 2001 From: finlayclark Date: Sun, 21 Jan 2024 20:22:07 +0000 Subject: [PATCH 02/15] Complete overhaul Switch to two main methods for computing the autocorrelation - initial sequence and window. Older functions are retained in this commit but will be removed in a subsequent commit. --- deea/__init__.py | 1 + deea/_validation.py | 7 +- deea/equilibration.py | 631 ++++++++++++++++++++-- deea/ess.py | 569 +++++++++++++++++++- deea/plot.py | 285 +++++++++- deea/sse.py | 199 +++++++ deea/tests/test_equilibration.py | 134 +++++ deea/tests/test_ess.py | 37 ++ deea/tests/test_plot.py | 74 +++ deea/tests/test_sse.py | 27 + deea/tests/test_validation.py | 2 +- deea/tests/test_variance.py | 381 +++++++++++++ deea/variance.py | 852 +++++++++++++++++++++++++++++- devtools/conda-envs/test_env.yaml | 2 + pyproject.toml | 4 +- 15 files changed, 3108 insertions(+), 97 deletions(-) create mode 100644 deea/sse.py create mode 100644 deea/tests/test_sse.py diff --git a/deea/__init__.py b/deea/__init__.py index 250223f..25c61a0 100644 --- a/deea/__init__.py +++ b/deea/__init__.py @@ -7,4 +7,5 @@ from .ess import * from .gelman_rubin import * from .plot import * +from .sse import * from .variance import * diff --git a/deea/_validation.py b/deea/_validation.py index d392858..47fafe8 100644 --- a/deea/_validation.py +++ b/deea/_validation.py @@ -1,5 +1,7 @@ """Functions for data validation.""" +from warnings import warn as _warn + import numpy as np from ._exceptions import InvalidInputError @@ -45,8 +47,9 @@ def check_data(data: np.ndarray, one_dim_allowed: bool = False) -> np.ndarray: if n_dims == 2: n_chains, n_samples = data.shape if n_chains > n_samples: - raise InvalidInputError( - "Data must have shape (n_chains, n_samples) where n_chains >= n_samples." + _warn( + "Data has more chains than samples. This is allowed but may not be what you intended.", + RuntimeWarning, ) # If the array is one dimensional, reshape it to (1, n_samples). diff --git a/deea/equilibration.py b/deea/equilibration.py index f1448d9..52fe1c4 100644 --- a/deea/equilibration.py +++ b/deea/equilibration.py @@ -1,18 +1,20 @@ """Functions for selecting the equilibration time of a time series.""" from pathlib import Path +from typing import Callable as _Callable from typing import Optional as _Optional from typing import Tuple as _Tuple from typing import Union as _Union import matplotlib.pyplot as plt -import numpy as np +import numpy as _np from matplotlib import gridspec from scipy.stats import ttest_rel from ._exceptions import AnalysisError, EquilibrationNotDetectedError, InvalidInputError from ._validation import check_data from .ess import ( + convert_sse_series_to_ess_series, ess_chodera, ess_inter_variance, ess_lugsail_variance, @@ -20,16 +22,567 @@ statistical_inefficiency_inter_variance, statistical_inefficiency_lugsail_variance, ) -from .plot import plot_equilibration_max_ess, plot_equilibration_paired_t_test +from .plot import ( + plot_equilibration_max_ess, + plot_equilibration_min_sse, + plot_equilibration_paired_t_test, +) +from .sse import get_sse_series_init_seq, get_sse_series_window, sse +from .variance import get_variance_initial_sequence, get_variance_window + +######################################## New ####################################### + + +def detect_equilibration_init_seq( + data: _np.ndarray, + times: _Optional[_np.ndarray] = None, + method: str = "min_sse", + sequence_estimator: str = "initial_convex", + min_max_lag_time: int = 3, + max_max_lag_time: _Optional[int] = None, + smooth_lag_times: bool = False, + frac_padding: float = 0.1, + plot: bool = False, + plot_name: _Union[str, Path] = "equilibration_sse_init_seq.png", + time_units: str = "ns", + data_y_label: str = r"$\Delta G$ / kcal mol$^{-1}$", + plot_max_lags: bool = True, +) -> _Tuple[_Union[float, int], float, float]: + """ + Detect the equilibration time of a time series by finding the minimum + squared standard error (SSE), or maximum effective sample size (ESS) + of the time series, using initial sequence estimators of the variance. + This is done by computing the SSE at each time point, discarding all + samples before the time point. The index of the time point with + the minimum SSE or maximum ESS is taken to be the point of equilibration. + + Parameters + ---------- + data : np.ndarray + The time series data. This should have shape + (n_runs, n_samples) or (n_samples,). + + times : np.ndarray, optional + The times at which the data was sampled. If this is + not provided, the indices of the data will be used. + + method : str, optional, default="min_sse" + The method to use to select the equilibration time. This can be + "min_sse" or "max_ess". + + sequence_estimator : str, optional + The initial sequence estimator to use. Can be "positive", "initial_positive", + "initial_monotone", or "initial_convex". The default is "initial_convex". "positive" + corresponds to truncating the auto-covariance function at the first negative value, as is + done in pymbar. The other methods correspond to the methods described in Geyer, 1992: + https://www.jstor.org/stable/2246094. + + min_max_lag_time : int, optional, default=3 + The minimum maximum lag time to use when estimating the statistical inefficiency. + + max_max_lag_time : int, optional, default=None + The maximum maximum lag time to use when calculating the auto-correlation function. + If None, the maximum lag time will be the length of the time series. + + smooth_lag_times : bool, optional, default=False + Whether to smooth out the max lag times by a) converting them to a monotinically + decreasing sequence and b) linearly interpolating between points where the sequence + changes. This may be useful when the max lag times are noisy. + + frac_padding : float, optional, default=0.1 + The fraction of the end of the timeseries to avoid calculating the variance + for. For example, if frac_padding = 0.1, the variance will be calculated + for the first 90% of the time series. This helps to avoid noise in the + variance when there are few data points. + + plot : bool, optional + Whether to plot the SSE curve. The default is False. + + plot_name : str | Path, optional + The name of the plot file. The default is 'equilibration_sse_init_seq.png'. + + time_units : str, optional + The units of time. The default is "ns". + + data_y_label : str, optional + The y-axis label for the time series data. The default is + "$\Delta G$ / kcal mol$^{-1}$". + + plot_max_lags : bool, optional, default=True + Whether to plot the maximum lag times used to estimate the variance. + + Returns + ------- + equil_time: float | int + The time (or index, if no times are supplied) at which + the time series is equilibrated. + + equil_g: float + The statistical inefficiency at the equilibration point. + + equil_ess: float + The effective sample size at the equilibration point. + """ + # Check that data is valid. + data = check_data(data, one_dim_allowed=True) + n_runs, n_samples = data.shape + + # Check that method is valid. + valid_methods = ["min_sse", "max_ess"] + method = method.lower() + if method not in valid_methods: + raise InvalidInputError( + f"method must be one of {valid_methods}, but got {method}." + ) + + # If times is None, units of time are indices. + if times is None: + time_units = "index" + # Convert times to indices. + times_valid: _np.ndarray = _np.arange(n_samples) + else: + # To satisfy type checking. + times_valid = times + + # Get the SSE timeseries. + sse_vals, max_lag_times = get_sse_series_init_seq( + data=data, + sequence_estimator=sequence_estimator, + min_max_lag_time=min_max_lag_time, + max_max_lag_time=max_max_lag_time, + smooth_lag_times=smooth_lag_times, + frac_padding=frac_padding, + ) + + # Get the corresponding times (or indices). + sse_times = times_valid[: len(sse_vals)] + + # Convert the SSE to 1/ESS if requested (divide by uncorrelated variance). + if method == "max_ess": + ess_vals = convert_sse_series_to_ess_series(data=data, sse_series=sse_vals) + sse_vals = 1 / ess_vals + + # Get the index of the minimum SSE. + equil_idx = _np.argmin(sse_vals) + equil_time = sse_times[equil_idx] + + # Now compute the effective sample size at this time point. + equil_data = data[:, equil_idx:] + equil_ess = 1 / sse_vals[equil_idx] + if method == "min_sse": # Has not yet been multiplied by the uncorrelated variance. + equil_ess *= equil_data.var() + equil_g = equil_data.size / equil_ess + equil_ess = equil_data.size / equil_g + + if plot: + # Create a figure. + fig = plt.figure(figsize=(6, 4)) + gridspec_obj = gridspec.GridSpec(1, 1, figure=fig) + + # Plot the ESS. + plot_equilibration_min_sse( + fig=fig, + subplot_spec=gridspec_obj[0], + data=data, + sse_series=sse_vals, + max_lag_series=max_lag_times if plot_max_lags else None, + data_times=times_valid, + sse_times=sse_times, + time_units=time_units, + data_y_label=data_y_label, + variance_y_label="ESS" + if method == "max_ess" + else r"$\frac{1}{\sigma^2(\Delta G)}$ / kcal$^{-2}$ mol$^2$", + ) + + fig.savefig(str(plot_name), dpi=300, bbox_inches="tight") + + # Return the equilibration index, limiting variance estimate, and SSE. + return equil_time, equil_g, equil_ess + + +def detect_equilibration_window( + data: _np.ndarray, + times: _Optional[_np.ndarray] = None, + method: str = "min_sse", + kernel: _Callable[[int], _np.ndarray] = _np.bartlett, # type: ignore + window_size_fn: _Optional[_Callable[[int], int]] = lambda x: round(x**1 / 2), + window_size: _Optional[int] = None, + plot: bool = False, + plot_name: _Union[str, Path] = "equilibration_sse_window.png", + time_units: str = "ns", + data_y_label: str = r"$\Delta G$ / kcal mol$^{-1}$", + plot_window_size: bool = True, +) -> _Tuple[_Union[float, int], float, float]: + """ + Detect the equilibration time of a time series by finding the minimum + squared standard error (SSE) or maximum effective sample size (ESS) + of the time series, using window estimators of the variance. This is + done by computing the SSE at each time point, discarding all samples + before the time point. The index of the time point with the minimum + SSE is taken to be the point of equilibration. + + Parameters + ---------- + data : np.ndarray + The time series data. This should have shape + (n_runs, n_samples) or (n_samples,). If the method + is 'lugsail', the data may have only one run, but + otherwise there must be at least two runs. + + times : np.ndarray, optional + The times at which the data was sampled. If this is + not provided, the indices of the data will be used. + + method : str, optional, default="min_sse" + The method to use to select the equilibration time. This can be + "min_sse" or "max_ess". + + kernel : callable, optional, default=numpy.bartlett + A function that takes a window size and returns a window function. + + window_size_fn : callable, optional, default=lambda x: round(x**1 / 2) + A function that takes the length of the time series and returns the window size + to use. If this is not None, window_size must be None. + + window_size : int, optional, default=None + The size of the window to use, defined in terms of time lags in the + forwards direction. If this is not None, window_size_fn must be None. + + plot : bool, optional + Whether to plot the ESS curve. The default is False. + + plot_name : str | Path, optional + The name of the plot file. The default is 'equilibration_sse_window.png'. + + time_units : str, optional + The units of time. The default is "ns". + + data_y_label : str, optional + The y-axis label for the time series data. The default is + "$\Delta G$ / kcal mol$^{-1}$". + + plot_window_size : bool, optional, default=True + Whether to plot the window size used to estimate the variance. + + Returns + ------- + equil_time: float | int + The time (or index, if no times are supplied) at which + the time series is equilibrated. + + equil_g: float + The statistical inefficiency at the equilibration point. + + equil_ess: float + The effective sample size at the equilibration point. + """ + # Check that data is valid. + data = check_data(data, one_dim_allowed=True) + n_runs, n_samples = data.shape + + # If times is None, units of time are indices. + if times is None: + time_units = "index" + # Convert times to indices. + times_valid: _np.ndarray = _np.arange(n_samples) + else: + # To satisfy type checking. + times_valid = times + + # Get the SSE timeseries + sse_vals, window_sizes = get_sse_series_window( + data=data, + kernel=kernel, + window_size_fn=window_size_fn, + window_size=window_size, + ) + + # Get the corresponding times (or indices). + sse_times = times_valid[: len(sse_vals)] # type: ignore + + # Convert the SSE to 1/ESS if requested (divide by uncorrelated variance). + if method == "max_ess": + ess_vals = convert_sse_series_to_ess_series(data=data, sse_series=sse_vals) + sse_vals = 1 / ess_vals + + # Get the index of the minimum SSE. + equil_idx = _np.argmin(sse_vals) + equil_time = sse_times[equil_idx] + + # Now compute the effective sample size at this time point. + equil_data = data[:, equil_idx:] + equil_ess = 1 / sse_vals[equil_idx] + if method == "min_sse": # Has not yet been multiplied by the uncorrelated variance. + equil_ess *= equil_data.var() + equil_g = equil_data.size / equil_ess + equil_ess = equil_data.size / equil_g + + if plot: + # Create a figure. + fig = plt.figure(figsize=(6, 4)) + gridspec_obj = gridspec.GridSpec(1, 1, figure=fig) + + # Plot the ESS. + plot_equilibration_min_sse( + fig=fig, + subplot_spec=gridspec_obj[0], + data=data, + sse_series=sse_vals, + data_times=times_valid, + sse_times=sse_times, + time_units=time_units, + data_y_label=data_y_label, + window_size_series=window_sizes if plot_window_size else None, + variance_y_label="ESS" + if method == "max_ess" + else r"$\frac{1}{\sigma^2(\Delta G)}$ / kcal$^{-2}$ mol$^2$", + ) + + fig.savefig(str(plot_name), dpi=300, bbox_inches="tight") + + # Return the equilibration time (index), statistical inefficiency, and ESS. + return equil_time, equil_g, equil_ess + + +######################################## New ####################################### + + +def get_sse_series( + data: _np.ndarray, + times: _Optional[_np.ndarray] = None, + method: str = "lugsail", + n_pow: float = 1 / 3, + lugsail_max_discard_frac: float = 0.95, +) -> _Tuple[_np.ndarray, _np.ndarray]: + """ + Return the a series of estimates of the squared standard error (SSE) + from a (set of) timeseries. The SSE is obtained by estimating the SSE at + at each time point, discarding all samples before the time point. The index + of the time point with the minimum SSE is taken to be the point of + equilibration. + + Parameters + ---------- + data : np.ndarray + The time series data. This should have shape + (n_runs, n_samples) or (n_samples,). If the method + is 'lugsail' or 'intra', the data may have only one run, but + otherwise there must be at least two runs. + + times : np.ndarray, optional + The times at which the data was sampled. If this is + not provided, the indices of the data will be used. + + method : str, optional + The method to use to estimate the limiting variance. This can be + 'lugsail', 'intra', or 'inter'. The default is 'lugsail'. 'lugsail' + uses the lugsail replicated batch means variance estimate, 'intra' + uses the mean intra-run variance estimate, and 'inter' uses the + inter-run variance estimate. + + n_pow : float, optional + The power to use in the lugsail variance estimate. This + should be between 0 and 1. The default is 1/3. If the method + is 'intra' or 'inter', this parameter is ignored. + + lugsail_max_discard_frac : float, optional + The maximum fraction of samples to discard when using the lugsail + method. The default is 0.95. This is required to avoid errors from + the lugail method when the smaller batch size becomes too small. + """ + # Check that method is valid. + valid_methods = ["lugsail", "inter", "intra"] + method = method.lower() + if method not in valid_methods: + raise ValueError(f"method must be one of {valid_methods}, but got {method}.") + + # Check that the data is valid. + one_dim_allowed = method in ["lugsail", "intra"] + data = check_data(data, one_dim_allowed=one_dim_allowed) + n_runs, n_samples = data.shape + + # Convert times to indices if necessary. + if times is None: + times = _np.arange(n_samples) + + # Check that times match the number of samples. + if n_samples != len(times): + raise InvalidInputError( + "Times must have the same length as the number of samples." + ) + + # If using the lugsail method, limit the maximum number of samples + # that we discard to ensure that we don't get errors due to the smaller + # batch size becoming too small. + keep_frac = lugsail_max_discard_frac # if method == "lugsail" else 1.0 + final_idx = round(n_samples * keep_frac) + n_ignore = n_samples - final_idx + # Make sure that this corresponds to at least 5 samples. + if n_ignore < 5 and method == "lugsail": + raise AnalysisError("The time series is too short.") + + # Compute the limiting variance at each time point. + lim_var_vals = _np.zeros(n_samples - n_ignore) + time_vals = _np.zeros(n_samples - n_ignore) + + for i in range(n_samples - n_ignore): + # Get the truncated data. + truncated_data = data[:, i:] + # Compute the ESS. + lim_var_vals[i] = sse(truncated_data, method=method, n_pow=n_pow) + # Get the time. + time_vals[i] = times[i] + + return lim_var_vals, time_vals + + +def detect_equilibration_min_sse( + data: _np.ndarray, + times: _Optional[_np.ndarray] = None, + method: str = "lugsail", + max_min_lim_var: float = float("inf"), + n_pow: float = 1 / 3, + lugsail_max_discard_frac: float = 0.95, + plot: bool = False, + plot_name: _Union[str, Path] = "equilibration_sse.png", + time_units: str = "ns", + data_y_label: str = r"$\Delta G$ / kcal mol$^{-1}$", +) -> _Tuple[int, float, float]: + r""" + Detect the equilibration time of a time series by finding the minimum + squared standard error (SSE) of the time series. This is done by + computing the SSE at each time point, discarding all samples before + the time point. The index of the time point with the minimum SSE is taken + to be the point of equilibration. + + Parameters + ---------- + data : np.ndarray + The time series data. This should have shape + (n_runs, n_samples) or (n_samples,). If the method + is 'lugsail', the data may have only one run, but + otherwise there must be at least two runs. + + times : np.ndarray, optional + The times at which the data was sampled. If this is + not provided, the indices of the data will be used. + + method : str, optional + The method to use to estimate the limiting SSE. This can be + 'lugsail', 'intra', or 'inter'. The default is 'lugsail'. 'lugsail' + uses the lugsail replicated batch means limiting variance estimate, + 'intra' uses the limiting mean intra-run limiting variance estimate, + and 'inter' uses the inter-run variance estimate. + + max_min_lim_var : float, optional + The maximum minimum SSE to accept. If the minimum SSE is greater + than this value, an EquilibrationNotDetectedError is raised. The + default is inf. Units are that of the data. + + n_pow : float, optional + The power to use in the lugsail variance estimate. This + should be between 0 and 1. The default is 1/3. + + lugsail_max_discard_frac : float, optional + The maximum fraction of samples to discard when using the lugsail + method. The default is 0.95. This is required to avoid errors from + the lugail method when the smaller batch size becomes too small. + + plot : bool, optional + Whether to plot the ESS curve. The default is False. + + plot_name : str | Path, optional + The name of the plot file. The default is 'equilibration_ess.png'. + + time_units : str, optional + The units of time. The default is "ns". + + data_y_label : str, optional + The y-axis label for the time series data. The default is + "$\Delta G$ / kcal mol$^{-1}$". + + Returns + ------- + int + The time point at which the time series is equilibrated. + + float + The effective sample size at the equilibration point. + + float + The SSE estimate at the equilibration point. + """ + # Check that data is valid. + data = check_data(data, one_dim_allowed=True) + n_runs, n_samples = data.shape + + # Check that max_lim_var is valid. + if max_min_lim_var <= 0: + raise InvalidInputError("max_min_lim_var must be positive.") + + # If times is None, units of time are indices. + if times is None: + time_units = "index" + # Convert times to indices. + times_valid: _np.ndarray = _np.arange(n_samples) + else: + # To satisfy type checking. + times_valid = times + + # Get the SSE timeseries + sse_vals, sse_times = get_sse_series( + data=data, + times=times_valid, + method=method, + n_pow=n_pow, + lugsail_max_discard_frac=0.95, + ) + + # Get the index of the minimum SSE. + equil_idx = _np.argmin(sse_vals) + equil_time = sse_times[equil_idx] + equil_sse = sse_vals[equil_idx] + + # Check that the SSE is less than the maximum SSE. + if equil_sse > max_min_lim_var: + raise EquilibrationNotDetectedError( + f"The minimum SSE of {equil_sse} is greater than the maximum SSE of {max_min_lim_var}." + ) + + # Now compute the limiting variance estimate at this time point. + equil_data = data[:, equil_idx:] + equil_sse = sse(equil_data, method=method, n_pow=n_pow) + + if plot: + # Create a figure. + fig = plt.figure(figsize=(6, 4)) + gridspec_obj = gridspec.GridSpec(1, 1, figure=fig) + + # Plot the ESS. + plot_equilibration_min_sse( + fig=fig, + subplot_spec=gridspec_obj[0], + data=data, + sse_series=sse_vals, + data_times=times_valid, + sse_times=sse_times, + time_units=time_units, + data_y_label=data_y_label, + ) + + fig.savefig(str(plot_name), dpi=300, bbox_inches="tight") + + # Return the equilibration index, limiting variance estimate, and SSE. + return equil_time, equil_sse, n_eff def get_ess_series( - data: np.ndarray, - times: _Optional[np.ndarray] = None, + data: _np.ndarray, + times: _Optional[_np.ndarray] = None, method: str = "lugsail", - min_ess: int = 1, n_pow: float = 1 / 3, -) -> _Tuple[np.ndarray, np.ndarray]: +) -> _Tuple[_np.ndarray, _np.ndarray]: """ Return a list of effective sample sizes from a (set of) timeseries. The ESS values are obtained by computing the ESS at each time point, @@ -55,10 +608,6 @@ def get_ess_series( and 'inter' uses the inter-run variance estimate to compute the ESS. - min_ess : int, optional, default=0 - The minimum ESS to accept. If the maximum ESS is less than - this value, an EquilibrationNotDetectedError is raised. - n_pow : float, optional The power to use in the lugsail variance estimate. This should be between 0 and 1. The default is 1/3. If the method @@ -84,13 +633,7 @@ def get_ess_series( # Convert times to indices if necessary. if times is None: - times = np.arange(n_samples) - - # Check that min_ess is valid. - if min_ess <= 0 or min_ess > n_samples * n_runs: - raise InvalidInputError( - "min_ess must be between 0 and the total number of samples." - ) + times = _np.arange(n_samples) # Check that times is match the number of samples. if n_samples != len(times): @@ -101,16 +644,16 @@ def get_ess_series( # Exclude the last 5 % of data from the ESS calculation # to ensure that we don't throw away too much data. final_idx = round(n_samples * 0.95) - n_discarded = n_samples - final_idx + n_ignore = n_samples - final_idx # Make sure that this corresponds to at least 5 samples. - if n_discarded < 5: + if n_ignore < 5: raise AnalysisError("The time series is too short.") # Compute the ESS at each time point. - ess_vals = np.zeros(n_samples - n_discarded) - time_vals = np.zeros(n_samples - n_discarded) + ess_vals = _np.zeros(n_samples - n_ignore) + time_vals = _np.zeros(n_samples - n_ignore) - for i in range(n_samples - n_discarded): + for i in range(n_samples - n_ignore): # Get the truncated data. truncated_data = data[:, i:] # Compute the ESS. @@ -127,8 +670,8 @@ def get_ess_series( def detect_equilibration_max_ess( - data: np.ndarray, - times: _Optional[np.ndarray] = None, + data: _np.ndarray, + times: _Optional[_np.ndarray] = None, method: str = "lugsail", min_ess: int = 1, n_pow: float = 1 / 3, @@ -200,19 +743,25 @@ def detect_equilibration_max_ess( data = check_data(data, one_dim_allowed=True) n_runs, n_samples = data.shape + # Check that min_ess is valid. + if min_ess <= 0 or min_ess > n_samples * n_runs: + raise InvalidInputError( + "min_ess must be between 0 and the total number of samples." + ) + # If times is None, units of time are indices. if times is None: time_units = "index" # Convert times to indices. - times = np.arange(n_samples) + times = _np.arange(n_samples) # Get the ESS timeseries ess_vals, ess_times = get_ess_series( - data=data, times=times, method=method, min_ess=min_ess, n_pow=n_pow + data=data, times=times, method=method, n_pow=n_pow ) # Get the index of the maximum ESS. - equil_idx = np.argmax(ess_vals) + equil_idx = _np.argmax(ess_vals) equil_time = ess_times[equil_idx] equil_ess = ess_vals[equil_idx] @@ -248,21 +797,21 @@ def detect_equilibration_max_ess( data_y_label=data_y_label, ) - fig.savefig(plot_name, dpi=600, bbox_inches="tight") + fig.savefig(str(plot_name), dpi=300, bbox_inches="tight") # Return the equilibration index, statistical inefficiency, and ESS. return equil_time, equil_g, equil_ess def get_paired_t_p_timeseries( - data: np.ndarray, - times: _Optional[np.ndarray] = None, + data: _np.ndarray, + times: _Optional[_np.ndarray] = None, fractional_block_size: float = 0.125, fractional_test_end: float = 0.5, initial_block_size: float = 0.1, final_block_size: float = 0.5, t_test_sidedness: str = "two-sided", -) -> _Tuple[np.ndarray, np.ndarray]: +) -> _Tuple[_np.ndarray, _np.ndarray]: """ Get a timeseries of the p-values from a paired t-test on the differences between sample means between intial and final portions of the data. The timeseries @@ -312,7 +861,7 @@ def get_paired_t_p_timeseries( # Convert times to indices if necessary. if times is None: - times = np.arange(n_samples) + times = _np.arange(n_samples) # Check that times is match the number of samples. if n_samples != len(times): @@ -359,9 +908,9 @@ def get_paired_t_p_timeseries( n_discard = round(n_samples * fractional_block_size) # Calculate the p values with their indices. - p_vals = np.zeros(n_repeats) - p_val_indices = np.zeros(n_repeats, dtype=int) - time_vals = np.zeros(n_repeats) + p_vals = _np.zeros(n_repeats) + p_val_indices = _np.zeros(n_repeats, dtype=int) + time_vals = _np.zeros(n_repeats) # Loop over and calculate the p values. for i in range(n_repeats): @@ -388,8 +937,8 @@ def get_paired_t_p_timeseries( def detect_equilibration_paired_t_test( - data: np.ndarray, - times: _Optional[np.ndarray] = None, + data: _np.ndarray, + times: _Optional[_np.ndarray] = None, p_threshold: float = 0.05, fractional_block_size: float = 0.125, fractional_test_end: float = 0.5, @@ -468,14 +1017,14 @@ def detect_equilibration_paired_t_test( if times is None: time_units = "index" # Convert times to indices. - times = np.arange(n_samples) + times = _np.arange(n_samples) # Check that user options (not checked in get_paired_t_p_timeseries) are valid. if p_threshold <= 0 or p_threshold > 0.7: raise InvalidInputError("p_threshold must be between 0 and 0.7.") # Get the p value timeseries and n_discard. - indices = np.arange(n_samples) + indices = _np.arange(n_samples) p_vals, times_used = get_paired_t_p_timeseries( data=data, times=times, @@ -492,7 +1041,7 @@ def detect_equilibration_paired_t_test( raise EquilibrationNotDetectedError( f"No p values are greater than the threshold of {p_threshold}." ) - equil_time = times_used[np.argmax(meets_threshold)] + equil_time = times_used[_np.argmax(meets_threshold)] # Plot the p values. if plot: @@ -510,6 +1059,6 @@ def detect_equilibration_paired_t_test( data_y_label=data_y_label, ) - fig.savefig(plot_name, dpi=600, bbox_inches="tight") + fig.savefig(str(plot_name), dpi=300, bbox_inches="tight") return equil_time diff --git a/deea/ess.py b/deea/ess.py index f840574..b6d9165 100644 --- a/deea/ess.py +++ b/deea/ess.py @@ -1,13 +1,160 @@ """Functions to calculate the statistical inefficiency and effective sample size.""" -import numpy as np +from typing import Callable as _Callable +from typing import Optional as _Optional +from typing import Tuple as _Tuple + +import numpy as _np import pymbar from ._validation import check_data +from .sse import get_sse_series_init_seq as _get_sse_series_init_seq +from .sse import get_sse_series_window as _get_sse_series_window from .variance import inter_run_variance, intra_run_variance, lugsail_variance -def statistical_inefficiency_inter_variance(data: np.ndarray) -> float: +def convert_sse_series_to_ess_series( + data: _np.ndarray, sse_series: _np.ndarray +) -> _np.ndarray: + """ + Convert a series of squared standard errors to a series of effective sample sizes. + + Parameters + ---------- + sse_series : np.ndarray + The squared standard error series. + + uncor_vars : np.ndarray + The uncorrelated variances. + + Returns + ------- + np.ndarray + The effective sample size series. + """ + # Validate the data. + data = check_data(data, one_dim_allowed=True) + + # Now get the uncorrelated variances. + uncor_vars = _np.zeros_like(sse_series) + + for i in range(len(sse_series)): + # Get "biased", rather than n - 1, variance. + uncor_vars[i] = data[:, i:].var() # type: ignore + + return uncor_vars / sse_series # type: ignore + + +def get_ess_series_init_seq( + data: _np.ndarray, + sequence_estimator: str = "initial_convex", + min_max_lag_time: int = 3, + max_max_lag_time: _Optional[int] = None, + smooth_lag_times: bool = False, + frac_padding: float = 0.1, +) -> _Tuple[_np.ndarray, _np.ndarray]: + """ + Compute a series of effective sample sizes for a time series as data + is discarded from the beginning of the time series. The autocorrelation + is computed using the sequence estimator specified. + + Parameters + ---------- + data : numpy.ndarray + A time series of data with shape (n_samples,). + + sequence_estimator : str, optional + The initial sequence estimator to use. Can be "positive", "initial_positive", + "initial_monotone", or "initial_convex". The default is "initial_convex". "positive" + corresponds to truncating the auto-covariance function at the first negative value, as is + done in pymbar. The other methods correspond to the methods described in Geyer, 1992: + https://www.jstor.org/stable/2246094. + + min_max_lag_time : int, optional, default=3 + The minimum maximum lag time to use when estimating the statistical inefficiency. + + max_max_lag_time : int, optional, default=None + The maximum maximum lag time to use when calculating the auto-correlation function. + If None, the maximum lag time will be the length of the time series. + + smooth_lag_times : bool, optional, default=False + Whether to smooth out the max lag times by a) converting them to a monotinically + decreasing sequence and b) linearly interpolating between points where the sequence + changes. This may be useful when the max lag times are noisy. + + frac_padding : float, optional, default=0.1 + The fraction of the end of the timeseries to avoid calculating the variance + for. For example, if frac_padding = 0.1, the variance will be calculated + for the first 90% of the time series. This helps to avoid noise in the + variance when there are few data points. + + Returns + ------- + np.ndarray + The effective sample size series. + + np.ndarray + The maximum lag times used. + """ + sse_series, max_lag_times = _get_sse_series_init_seq( + data, + sequence_estimator=sequence_estimator, + min_max_lag_time=min_max_lag_time, + max_max_lag_time=max_max_lag_time, + smooth_lag_times=smooth_lag_times, + frac_padding=frac_padding, + ) + + ess_series = convert_sse_series_to_ess_series(data, sse_series) + + return ess_series, max_lag_times + + +def get_ess_series_window( + data: _np.ndarray, + kernel: _Callable[[int], _np.ndarray] = _np.bartlett, # type: ignore + window_size_fn: _Optional[_Callable[[int], int]] = lambda x: round(x**1 / 2), + window_size: _Optional[int] = None, +) -> _Tuple[_np.ndarray, _np.ndarray]: + """ + Compute a series of effective sample sizes for a time series as data + is discarded from the beginning of the time series. The squared standard + error is computed using the window size and kernel specified. + + Parameters + ---------- + data : numpy.ndarray + A time series of data with shape (n_samples,). + + kernel : callable, optional, default=numpy.bartlett + A function that takes a window size and returns a window function. + + window_size_fn : callable, optional, default=lambda x: round(x**1 / 2) + A function that takes the length of the time series and returns the window size + to use. If this is not None, window_size must be None. + + window_size : int, optional, default=None + The size of the window to use, defined in terms of time lags in the + forwards direction. If this is not None, window_size_fn must be None. + + Returns + ------- + np.ndarray + The squared standard error series. + + np.ndarray + The window sizes used. + """ + sse_series, max_lag_times = _get_sse_series_window( + data, kernel=kernel, window_size_fn=window_size_fn, window_size=window_size + ) + + ess_series = convert_sse_series_to_ess_series(data, sse_series) + + return ess_series, max_lag_times + + +def statistical_inefficiency_inter_variance(data: _np.ndarray) -> float: """ Compute the statistical inefficiency of a time series by dividing the inter-run variance estimate by the intra-run variance estimate. @@ -31,7 +178,7 @@ def statistical_inefficiency_inter_variance(data: np.ndarray) -> float: def statistical_inefficiency_lugsail_variance( - data: np.ndarray, n_pow: float = 1 / 3 + data: _np.ndarray, n_pow: float = 1 / 3 ) -> float: """ Compute the statistical inefficiency of a time series by dividing @@ -59,7 +206,7 @@ def statistical_inefficiency_lugsail_variance( return max(g, 1) -def statistical_inefficiency_chodera(data: np.ndarray) -> float: +def statistical_inefficiency_chodera(data: _np.ndarray) -> float: """ Compute the statistical inefficiency of a time series using the Chodera method. This is applicable to a single run. @@ -82,7 +229,7 @@ def statistical_inefficiency_chodera(data: np.ndarray) -> float: return pymbar.timeseries.statistical_inefficiency(data.mean(axis=0), fast=False) -def ess_inter_variance(data: np.ndarray) -> float: +def ess_inter_variance(data: _np.ndarray) -> float: """ Compute the effective sample size of a time series by dividing the total number of samples by the statistical inefficiency, where @@ -107,7 +254,7 @@ def ess_inter_variance(data: np.ndarray) -> float: return total_samples / statistical_inefficiency_inter_variance(data) -def ess_lugsail_variance(data: np.ndarray, n_pow: float = 1 / 3) -> float: +def ess_lugsail_variance(data: _np.ndarray, n_pow: float = 1 / 3) -> float: """ Compute the effective sample size of a time series by dividing the total number of samples by the statistical inefficiency, where @@ -136,7 +283,7 @@ def ess_lugsail_variance(data: np.ndarray, n_pow: float = 1 / 3) -> float: return total_samples / statistical_inefficiency_lugsail_variance(data, n_pow=n_pow) -def ess_chodera(data: np.ndarray) -> float: +def ess_chodera(data: _np.ndarray) -> float: """ Compute the effective sample size of a time series by dividing the total number of samples by the statistical inefficiency, where @@ -156,4 +303,410 @@ def ess_chodera(data: np.ndarray) -> float: data = check_data(data, one_dim_allowed=True) n_runs, n_samples = data.shape total_samples = n_runs * n_samples - return total_samples / statistical_inefficiency_chodera(data) + # return total_samples / statistical_inefficiency_chodera(data) + if n_runs > 1: + data = data.mean(axis=0) + return total_samples / statistical_inefficiency_geyer(data, method="con") + + +################## DELETE BELOW ##################### +def statistical_inefficiency_geyer(A_n, method="con"): + """Compute the statistical inefficiency of a timeseries using the methods of Geyer. + + Parameters + ---------- + A_n : np.ndarray, float + A_n[n] is nth value of timeseries A. Length is deduced from vector. + method : str, optional, default='con' + The method to use; matches notation from `initseq` from `mcmc` R package by Geyer. + 'pos' : initial positive sequence (IPS) estimator + 'dec' : initial monotone sequence (IMS) estimator + 'con' : initial convex sequence (ICS) estimator + + Returns + ------- + g : np.ndarray, + g is the estimated statistical inefficiency (equal to 1 + 2 tau, where tau is the correlation time). + We enforce g >= 1.0. + + Notes + ----- + Implementation based on the `initseq` method from the `mcmc` R package by Geyer. + + References + ---------- + [1] Geyer, CJ. Practical Markov chain Monte Carlo. Statistical Science 7(4):473-511, 1992. + + Examples + -------- + + Compute statistical inefficiency of timeseries data with known correlation time using different methods. + + >>> from pymbar import testsystems + >>> A_n = testsystems.correlated_timeseries_example(N=100000, tau=10.0) + >>> g_IPS = statisticalInefficiency_geyer(A_n, method='pos') + >>> g_IMS = statisticalInefficiency_geyer(A_n, method='dec') + >>> g_ICS = statisticalInefficiency_geyer(A_n, method='con') + + """ + + if method not in ["pos", "dec", "con"]: + raise Exception( + "Unknown method '%s'; must be one of ['pos', 'dec', 'con']" % method + ) + + # Create numpy copies of input arguments. + A_n = _np.array(A_n) + + # Subtract off sample mean. + A_n -= A_n.mean() + + # Get the length of the timeseries. + N = A_n.size + + # Compute sample variance. + gamma_zero = (A_n**2).sum() / N + + # Compute sequential covariance pairs. + gamma_pos = list() + for i in range(int(_np.floor(N / 2))): + lag1 = 2 * i + gam1 = (A_n[0 : (N - lag1)] * A_n[lag1:N]).sum() / N + lag2 = lag1 + 1 + gam2 = (A_n[0 : (N - lag2)] * A_n[lag2:N]).sum() / N + + # Terminate if sum is no longer positive. + if (gam1 + gam2) < 0.0: + break + + # Otherwise, store the consecutive sum. + gamma_pos.append(gam1 + gam2) + + # Number of nonnegative values in array. + ngamma = len(gamma_pos) + + # Compute IPS gamma sequence. + gamma_pos = _np.array(gamma_pos) + + # Compute IMS gamma sequence. + gamma_dec = _np.array(gamma_pos) + for i in range(ngamma - 1): + if gamma_dec[i] < gamma_dec[i + 1]: + gamma_dec[i] = gamma_dec[i + 1] + + # Compute ICS gamma sequence. + gamma_con = _np.array(gamma_dec) + for i in range(ngamma - 1, 0, -1): + gamma_con[i] -= gamma_con[i - 1] + + # Pool adjacent violators (PAVA) algorithm. + puff = _np.zeros([ngamma], _np.float64) + nuff = _np.zeros([ngamma], _np.int32) + nstep = 0 + for j in range(1, ngamma): + puff[nstep] = gamma_con[j] + nuff[nstep] = 1 + nstep += 1 + while (nstep > 1) and ( + (puff[nstep - 1] / nuff[nstep - 1]) < (puff[nstep - 2] / nuff[nstep - 2]) + ): + puff[nstep - 2] += puff[nstep - 1] + nuff[nstep - 2] += nuff[nstep - 1] + nstep -= 1 + + j = 1 + for jstep in range(nstep): + muff = puff[jstep] / nuff[jstep] + for k in range(nuff[jstep]): + gamma_con[j] = gamma_con[j - 1] + muff + j += 1 + + # Compute sample variance estimates. + var_pos = (2 * gamma_pos.sum() - gamma_zero) / N + var_dec = (2 * gamma_dec.sum() - gamma_zero) / N + var_con = (2 * gamma_con.sum() - gamma_zero) / N + + # Compute statistical inefficiencies from sample mean var = var(A_n) / (N/g) + # g = var / (var(A_n)/N) + var_uncorr = gamma_zero / N + g_pos = var_pos / var_uncorr + g_dec = var_dec / var_uncorr + g_con = var_con / var_uncorr + + # DEBUG + # print "pos dec con : %12.3f %12.3f %12.3f" % (g_pos, g_dec, g_con) + + # Select appropriate g. + if method == "pos": + g = g_pos + elif method == "dec": + g = g_dec + elif method == "con": + g = g_con + + # g must be at least unity + if g < 1.0: + g = 1.0 + + # Return the computed statistical inefficiency. + return g + + +def statistical_inefficiency_geyer_indices(A_n, method="con"): + """Compute the statistical inefficiency of a timeseries using the methods of Geyer. + + Parameters + ---------- + A_n : np.ndarray, float + A_n[n] is nth value of timeseries A. Length is deduced from vector. + method : str, optional, default='con' + The method to use; matches notation from `initseq` from `mcmc` R package by Geyer. + 'pos' : initial positive sequence (IPS) estimator + 'dec' : initial monotone sequence (IMS) estimator + 'con' : initial convex sequence (ICS) estimator + + Returns + ------- + g : np.ndarray, + g is the estimated statistical inefficiency (equal to 1 + 2 tau, where tau is the correlation time). + We enforce g >= 1.0. + + Notes + ----- + Implementation based on the `initseq` method from the `mcmc` R package by Geyer. + + References + ---------- + [1] Geyer, CJ. Practical Markov chain Monte Carlo. Statistical Science 7(4):473-511, 1992. + + Examples + -------- + + Compute statistical inefficiency of timeseries data with known correlation time using different methods. + + >>> from pymbar import testsystems + >>> A_n = testsystems.correlated_timeseries_example(N=100000, tau=10.0) + >>> g_IPS = statisticalInefficiency_geyer(A_n, method='pos') + >>> g_IMS = statisticalInefficiency_geyer(A_n, method='dec') + >>> g_ICS = statisticalInefficiency_geyer(A_n, method='con') + + """ + + if method not in ["pos", "dec", "con"]: + raise Exception( + "Unknown method '%s'; must be one of ['pos', 'dec', 'con']" % method + ) + + # Create numpy copies of input arguments. + A_n = _np.array(A_n) + + # Subtract off sample mean. + A_n -= A_n.mean() + + # Get the length of the timeseries. + N = A_n.size + + # Compute sample variance. + gamma_zero = (A_n**2).sum() / N + + # Compute sequential covariance pairs. + gamma_pos = list() + for i in range(int(_np.floor(N / 2))): + lag1 = 2 * i + gam1 = (A_n[0 : (N - lag1)] * A_n[lag1:N]).sum() / N + lag2 = lag1 + 1 + gam2 = (A_n[0 : (N - lag2)] * A_n[lag2:N]).sum() / N + + # Terminate if sum is no longer positive. + if (gam1 + gam2) < 0.0: + break + + # Otherwise, store the consecutive sum. + gamma_pos.append(gam1 + gam2) + + # Number of nonnegative values in array. + ngamma = len(gamma_pos) + + # Compute IPS gamma sequence. + gamma_pos = _np.array(gamma_pos) + + # Compute IMS gamma sequence. + gamma_dec = _np.array(gamma_pos) + for i in range(ngamma - 1): + if gamma_dec[i] < gamma_dec[i + 1]: + gamma_dec[i] = gamma_dec[i + 1] + + # Compute ICS gamma sequence. + gamma_con = _np.array(gamma_dec) + for i in range(ngamma - 1, 0, -1): + gamma_con[i] -= gamma_con[i - 1] + + # Pool adjacent violators (PAVA) algorithm. + puff = _np.zeros([ngamma], _np.float64) + nuff = _np.zeros([ngamma], _np.int32) + nstep = 0 + for j in range(1, ngamma): + puff[nstep] = gamma_con[j] + nuff[nstep] = 1 + nstep += 1 + while (nstep > 1) and ( + (puff[nstep - 1] / nuff[nstep - 1]) < (puff[nstep - 2] / nuff[nstep - 2]) + ): + puff[nstep - 2] += puff[nstep - 1] + nuff[nstep - 2] += nuff[nstep - 1] + nstep -= 1 + + j = 1 + for jstep in range(nstep): + muff = puff[jstep] / nuff[jstep] + for k in range(nuff[jstep]): + gamma_con[j] = gamma_con[j - 1] + muff + j += 1 + + # Compute sample variance estimates. + var_pos = (2 * gamma_pos.sum() - gamma_zero) / N + var_dec = (2 * gamma_dec.sum() - gamma_zero) / N + var_con = (2 * gamma_con.sum() - gamma_zero) / N + + # Compute statistical inefficiencies from sample mean var = var(A_n) / (N/g) + # g = var / (var(A_n)/N) + var_uncorr = gamma_zero / N + g_pos = var_pos / var_uncorr + g_dec = var_dec / var_uncorr + g_con = var_con / var_uncorr + + # DEBUG + # print "pos dec con : %12.3f %12.3f %12.3f" % (g_pos, g_dec, g_con) + + # Select appropriate g. + if method == "pos": + g = g_pos + elif method == "dec": + g = g_dec + elif method == "con": + g = g_con + + # g must be at least unity + if g < 1.0: + g = 1.0 + + # Return the computed statistical inefficiency. + return (g,) + + +def statistical_inefficiency_multiscale(A_n, B_n=None, fast=False, mintime=3): + """ + Compute the (cross) statistical inefficiency of (two) timeseries using multiscale method from Chodera. + + Parameters + ---------- + A_n : np.ndarray, float + A_n[n] is nth value of timeseries A. Length is deduced from vector. + B_n : np.ndarray, float, optional, default=None + B_n[n] is nth value of timeseries B. Length is deduced from vector. + If supplied, the cross-correlation of timeseries A and B will be estimated instead of the + autocorrelation of timeseries A. + fast : bool, optional, default=False + f True, will use faster (but less accurate) method to estimate correlation + time, described in Ref. [1] (default: False). + mintime : int, optional, default=3 + minimum amount of correlation function to compute (default: 3) + The algorithm terminates after computing the correlation time out to mintime when the + correlation function first goes negative. Note that this time may need to be increased + if there is a strong initial negative peak in the correlation function. + + Returns + ------- + g : np.ndarray, + g is the estimated statistical inefficiency (equal to 1 + 2 tau, where tau is the correlation time). + We enforce g >= 1.0. + + Notes + ----- + The same timeseries can be used for both A_n and B_n to get the autocorrelation statistical inefficiency. + The fast method described in Ref [1] is used to compute g. + + References + ---------- + [1] J. D. Chodera, W. C. Swope, J. W. Pitera, C. Seok, and K. A. Dill. Use of the weighted + histogram analysis method for the analysis of simulated and parallel tempering simulations. + JCTC 3(1):26-41, 2007. + + Examples + -------- + + Compute statistical inefficiency of timeseries data with known correlation time. + + >>> from pymbar import testsystems + >>> A_n = testsystems.correlated_timeseries_example(N=100000, tau=5.0) + >>> g = statisticalInefficiency_multiscale(A_n, fast=True) + + """ + + # Create numpy copies of input arguments. + A_n = _np.array(A_n) + + if B_n is not None: + B_n = _np.array(B_n) + else: + B_n = _np.array(A_n) + + # Get the length of the timeseries. + N = A_n.size + + # Be sure A_n and B_n have the same dimensions. + if A_n.shape != B_n.shape: + raise Exception("A_n and B_n must have same dimensions.") + + # Initialize statistical inefficiency estimate with uncorrelated value. + g = 1.0 + + # Compute mean of each timeseries. + mu_A = A_n.mean() + mu_B = B_n.mean() + + # Make temporary copies of fluctuation from mean. + dA_n = A_n.astype(_np.float64) - mu_A + dB_n = B_n.astype(_np.float64) - mu_B + + # Compute estimator of covariance of (A,B) using estimator that will ensure C(0) = 1. + sigma2_AB = (dA_n * dB_n).mean() # standard estimator to ensure C(0) = 1 + + # Trap the case where this covariance is zero, and we cannot proceed. + if sigma2_AB == 0: + raise ParameterError( + "Sample covariance sigma_AB^2 = 0 -- cannot compute statistical inefficiency" + ) + + # Accumulate the integrated correlation time by computing the normalized correlation time at + # increasing values of t. Stop accumulating if the correlation function goes negative, since + # this is unlikely to occur unless the correlation function has decayed to the point where it + # is dominated by noise and indistinguishable from zero. + t = 1 + increment = 1 + while t < N - 1: + # compute normalized fluctuation correlation function at time t + C = _np.sum(dA_n[0 : (N - t)] * dB_n[t:N] + dB_n[0 : (N - t)] * dA_n[t:N]) / ( + 2.0 * float(N - t) * sigma2_AB + ) + # Terminate if the correlation function has crossed zero and we've computed the correlation + # function at least out to 'mintime'. + if (C <= 0.0) and (t > mintime): + break + + # Accumulate contribution to the statistical inefficiency. + g += 2.0 * C * (1.0 - float(t) / float(N)) * float(increment) + + # Increment t and the amount by which we increment t. + t += increment + + # Increase the interval if "fast mode" is on. + if fast: + increment += 1 + + # g must be at least unity + if g < 1.0: + g = 1.0 + + # Return the computed statistical inefficiency. + return g, t diff --git a/deea/plot.py b/deea/plot.py index c0c96cb..5e75921 100644 --- a/deea/plot.py +++ b/deea/plot.py @@ -1,5 +1,7 @@ """Plotting functions.""" +from typing import List as _List + import matplotlib.pyplot as plt from matplotlib import figure, gridspec from matplotlib.axes import Axes @@ -9,7 +11,7 @@ from typing import Tuple as _Tuple import numpy as np -import scipy.stats as _stats +from matplotlib.lines import Line2D from ._exceptions import InvalidInputError from ._validation import check_data @@ -88,8 +90,8 @@ def plot_timeseries( data = data.reshape(n_runs, n_blocks, block_size).mean(axis=2) # type: ignore times = times.reshape(n_blocks, block_size).mean(axis=1) - # Decide the thickness of the individual run lines. - alpha_runs = 1.0 if n_runs == 1 else 0.3 + # Decide the transparency of the individual run lines. + alpha_runs = 1.0 if n_runs == 1 else 0.5 # Plot the data. for i in range(n_runs): @@ -100,25 +102,27 @@ def plot_timeseries( if n_runs > 1: ax.plot(times, data.mean(axis=0), color="black", label="Mean") - # Plot the confidence interval. - if show_ci and n_runs > 1: - means = data.mean(axis=0) - conf_int = ( - _stats.t.interval( - 0.95, - n_runs - 1, - means, - scale=_stats.sem(data), - )[1] - - means - ) # 95 % C.I. - - # Plot the confidence interval. - ax.fill_between( - times, means - conf_int, means + conf_int, alpha=0.3, color="grey" - ) - - ax.legend() + ## Plot the confidence interval. + # if show_ci and n_runs > 1: + # means = data.mean(axis=0) + # conf_int = ( + # _stats.t.interval( + # 0.95, + # n_runs - 1, + # means, + # scale=_stats.sem(data), + # )[1] + # - means + # ) # 95 % C.I. + + ## Plot the confidence interval. + # ax.fill_between( + # times, means - conf_int, means + conf_int, alpha=0.3, color="grey" + # ) + + # Only show the legend if there is more than one run. + if n_runs > 1: + ax.legend() # Set the axis labels. ax.set_xlabel(f"Time / {time_units}") @@ -263,6 +267,109 @@ def plot_ess( ax.set_ylabel("ESS") +def plot_sse( + ax: Axes, + sse: np.ndarray, + max_lags: _Optional[np.ndarray], + window_sizes: _Optional[np.ndarray], + times: np.ndarray, + time_units: str = "ns", + variance_y_label: str = r"$\frac{1}{\sigma^2(\Delta G)}$ / kcal$^{-2}$ mol$^2$", + reciprocal: bool = True, +) -> _Tuple[_List[Line2D], _List[str]]: + """ + Plot the squared standard error (SSE) estimate against time. + + Parameters + ---------- + ax : Axes + The axes to plot on. + + sse : np.ndarray + The SSE estimate. + + max_lags : np.ndarray, optional, default=None + The maximum lag times. + + window_sizes : np.ndarray, optional, default=None + The window sizes. + + times : np.ndarray + The times at which the data was sampled. + + time_units : str, optional + The units of time. The default is "ns". + + variance_y_label : str, optional + The y-axis label for the variance. The default is + r"$\frac{1}{\sigma^2(\Delta G)}$ / kcal$^{-2}$ mol$^2$". + + reciprocal : bool, optional, default=True + Whether to plot the reciprocal of the SSE. + + Returns + ------- + handles : List[Line2D] + The handles for the legend. + + labels : List[str] + The labels for the legend. + """ + # Check that sse is valid. + if not isinstance(sse, np.ndarray) or not isinstance(times, np.ndarray): + raise InvalidInputError("sse and times must be numpy arrays.") + + if sse.ndim != 1: + raise InvalidInputError("sse must be one dimensional.") + + if sse.shape[0] != times.shape[0]: + raise InvalidInputError( + "sse must have the same length as the number of samples." + ) + + if max_lags is not None and window_sizes is not None: + raise InvalidInputError( + "Only one of max_lags and window_sizes can be supplied." + ) + + # Plot the SSE. + to_plot = 1 / sse if reciprocal else sse + ax.plot(times, to_plot, color="black", label="1/SSE") + + # If lags is not None, plot the lag times on a different y axis. + if max_lags is not None or window_sizes is not None: + label = "Max Lag Index" if window_sizes is None else "Window Size" + to_plot = max_lags if window_sizes is None else window_sizes + ax2 = ax.twinx() + # Get the second colour from the colour cycle. + ax2.set_prop_cycle(color=[plt.rcParams["axes.prop_cycle"].by_key()["color"][1]]) + ax2.plot(times, to_plot, alpha=0.8, label=label) + ax2.set_ylabel(label) + + # Plot a vertical dashed line at the minimum SSE. + ax.axvline( + x=times[sse.argmin()], + color="black", + linestyle="--", + label=f"Equilibration Time = {times[sse.argmin()]:.3g} {time_units}", + ) + + # Combine the legends from both axes. + handles, labels = ax.get_legend_handles_labels() + if max_lags is not None or window_sizes is not None: + handles2, labels2 = ax2.get_legend_handles_labels() + handles += handles2 + labels += labels2 + + ax.legend(handles, labels) + + # Set the axis labels. + ax.set_xlabel(f"Time / {time_units}") + ax.set_ylabel(variance_y_label) + + return handles, labels + + def plot_equilibration_paired_t_test( fig: figure.Figure, subplot_spec: gridspec.SubplotSpec, @@ -370,8 +477,8 @@ def plot_equilibration_max_ess( data_y_label: str = r"$\Delta G$ / kcal mol$^{-1}$", ) -> _Tuple[Axes, Axes]: r""" - Plot the p-values of the paired t-test against time, underneath the - time series data. + Plot the effective sample size (ESS) estimates against time, + underneath the time series data. Parameters ---------- @@ -443,3 +550,133 @@ def plot_equilibration_max_ess( ax_top.set_xlabel("") return ax_top, ax_bottom + + +def plot_equilibration_min_sse( + fig: figure.Figure, + subplot_spec: gridspec.SubplotSpec, + data: np.ndarray, + sse_series: np.ndarray, + data_times: np.ndarray, + sse_times: np.ndarray, + max_lag_series: _Optional[np.ndarray] = None, + window_size_series: _Optional[np.ndarray] = None, + time_units: str = "ns", + data_y_label: str = r"$\Delta G$ / kcal mol$^{-1}$", + variance_y_label=r"$\frac{1}{\sigma^2(\Delta G)}$ / kcal$^{-2}$ mol$^2$", + reciprocal: bool = True, +) -> _Tuple[Axes, Axes]: + r""" + Plot the (reciprocal of the) squared standard error (SSE) + estimates against time, underneath the time series data. + + Parameters + ---------- + fig : plt.Figure + The figure to plot on. + + gridspec_obj : plt.GridSpec + The gridspec to use for the plot. + + data : np.ndarray + The time series data. This should have shape + (n_runs, n_samples), or (n_samples,) if there + is only one run. + + sse_series : np.ndarray + The SSE series. + + data_times : np.ndarray + The times at which the data was sampled. + + sse_times : np.ndarray + The times at which the ESS was computed. + + max_lag_series : np.ndarray, optional + The lag series. If None, the lag times are not + plotted. If supplied, they are plotted on the + bottom axis. + + window_size_series : np.ndarray, optional + The window size series. If None, the window sizes + are not plotted. If supplied, they are plotted on + the bottom axis. + + time_units : str, optional + The units of time. The default is "ns". + + data_y_label : str, optional + The y-axis label for the time series data. The default is + "$\Delta G$ / kcal mol$^{-1}$". + + variance_y_label : str, optional + The y-axis label for the variance. The default is + r"$\frac{1}{\sigma^2(\Delta G)}$ / kcal$^{-2}$ mol$^2$". + + reciprocal : bool, optional, default=True + Whether to plot the reciprocal of the SSE. + + Returns + ------- + ax_top : Axes + The axes for the time series data. + + ax_bottom : Axes + The axes for the p-values. + """ + data = check_data(data, one_dim_allowed=True) + n_runs, _ = data.shape + + # We need to split the gridspec into two subplots, one for the time series data (above) + # and one for the p-values (below). Share x-axis but not y-axis. + gs0 = gridspec.GridSpecFromSubplotSpec(2, 1, subplot_spec=subplot_spec, hspace=0.05) + ax_top = fig.add_subplot(gs0[0]) + ax_bottom = fig.add_subplot(gs0[1], sharex=ax_top) + + # Plot the time series data on the top axis. + plot_timeseries(ax_top, data, data_times) + # Add dashed vertical line at the equilibration time. + ax_top.axvline( + x=sse_times[sse_series.argmin()], + color="black", + linestyle="--", + ) + + # Plot the sse on the bottom axis. + sse_handles, sse_labels = plot_sse( + ax_bottom, + sse_series, + max_lag_series, + window_size_series, + sse_times, + variance_y_label=variance_y_label, + reciprocal=reciprocal, + ) + + # Set the axis labels. + ax_top.set_xlabel(f"Time / {time_units}") + ax_top.set_ylabel(data_y_label) + + # Move the legends to the side of the plot. + if n_runs > 1: + ax_top.legend(bbox_to_anchor=(1.05, 0.5), loc="center left") + is_second_axis = max_lag_series is not None or window_size_series is not None + side_shift_bottom = 1.15 if is_second_axis else 1.05 + # Remove the 1/SSE label if there isn't a second axis. + if not is_second_axis: + sse_labels.pop(0) + sse_handles.pop(0) + ax_bottom.legend( + sse_handles, + sse_labels, + bbox_to_anchor=(side_shift_bottom, 0.5), + loc="center left", + ) + + # Hide the x tick labels for the top axis. + plt.setp(ax_top.get_xticklabels(), visible=False) + ax_top.spines["bottom"].set_visible(False) + ax_top.tick_params(axis="x", which="both", length=0) + ax_top.set_xlabel("") + + return ax_top, ax_bottom diff --git a/deea/sse.py b/deea/sse.py new file mode 100644 index 0000000..b9164e6 --- /dev/null +++ b/deea/sse.py @@ -0,0 +1,199 @@ +"""Functions to calculate the squared standard error series.""" + +from typing import Callable as _Callable +from typing import Optional as _Optional +from typing import Tuple as _Tuple + +import numpy as _np + +from ._validation import check_data +from .variance import ( + get_variance_series_initial_sequence, + get_variance_series_window, + inter_run_variance, + intra_run_variance, + lugsail_variance, +) + + +def get_sse_series_init_seq( + data: _np.ndarray, + sequence_estimator: str = "initial_convex", + min_max_lag_time: int = 3, + max_max_lag_time: _Optional[int] = None, + smooth_lag_times: bool = False, + frac_padding: float = 0.1, +) -> _Tuple[_np.ndarray, _np.ndarray]: + """ + Compute a series of squared standard errors for a time series as data + is discarded from the beginning of the time series. The squared standard + error is computed using the sequence estimator specified. + + Parameters + ---------- + data : numpy.ndarray + A time series of data with shape (n_samples,). + + sequence_estimator : str, optional + The initial sequence estimator to use. Can be "positive", "initial_positive", + "initial_monotone", or "initial_convex". The default is "initial_convex". "positive" + corresponds to truncating the auto-covariance function at the first negative value, as is + done in pymbar. The other methods correspond to the methods described in Geyer, 1992: + https://www.jstor.org/stable/2246094. + + min_max_lag_time : int, optional, default=3 + The minimum maximum lag time to use when estimating the statistical inefficiency. + + max_max_lag_time : int, optional, default=None + The maximum maximum lag time to use when calculating the auto-correlation function. + If None, the maximum lag time will be the length of the time series. + + smooth_lag_times : bool, optional, default=False + Whether to smooth out the max lag times by a) converting them to a monotinically + decreasing sequence and b) linearly interpolating between points where the sequence + changes. This may be useful when the max lag times are noisy. + + frac_padding : float, optional, default=0.1 + The fraction of the end of the timeseries to avoid calculating the variance + for. For example, if frac_padding = 0.1, the variance will be calculated + for the first 90% of the time series. This helps to avoid noise in the + variance when there are few data points. + + Returns + ------- + np.ndarray + The squared standard error series. + + np.ndarray + The maximum lag times used. + """ + # Validate the data. + data = check_data(data, one_dim_allowed=True) + n_runs, n_samples = data.shape + + # Compute the variance estimates. + var_series, max_lag_times = get_variance_series_initial_sequence( + data, + sequence_estimator=sequence_estimator, + min_max_lag_time=min_max_lag_time, + max_max_lag_time=max_max_lag_time, + smooth_lag_times=smooth_lag_times, + frac_padding=frac_padding, + ) + + # Compute the squared standard error series by dividing the variance series by + # the total number of samples. + tot_samples = _np.arange(n_samples, n_samples - len(var_series), -1) * n_runs + sse_series = var_series / tot_samples + + return sse_series, max_lag_times + + +def get_sse_series_window( + data: _np.ndarray, + kernel: _Callable[[int], _np.ndarray] = _np.bartlett, # type: ignore + window_size_fn: _Optional[_Callable[[int], int]] = lambda x: round(x**1 / 2), + window_size: _Optional[int] = None, +) -> _Tuple[_np.ndarray, _np.ndarray]: + """ + Compute a series of squared standard errors for a time series as data + is discarded from the beginning of the time series. The squared standard + error is computed using the window size and kernel specified. + + Parameters + ---------- + data : numpy.ndarray + A time series of data with shape (n_samples,). + + kernel : callable, optional, default=numpy.bartlett + A function that takes a window size and returns a window function. + + window_size_fn : callable, optional, default=lambda x: round(x**1 / 2) + A function that takes the length of the time series and returns the window size + to use. If this is not None, window_size must be None. + + window_size : int, optional, default=None + The size of the window to use, defined in terms of time lags in the + forwards direction. If this is not None, window_size_fn must be None. + + Returns + ------- + np.ndarray + The squared standard error series. + + np.ndarray + The window sizes used. + """ + # Validate the data. + data = check_data(data, one_dim_allowed=True) + n_runs, n_samples = data.shape + + # Compute the variance estimates. + var_series, window_sizes = get_variance_series_window( + data, kernel=kernel, window_size_fn=window_size_fn, window_size=window_size + ) + + # Compute the squared standard error series by dividing the variance series by + # the total number of samples. + tot_samples = _np.arange(n_samples, n_samples - len(var_series), -1) * n_runs + sse_series = var_series / tot_samples + + return sse_series, window_sizes + + +def sse(data: _np.ndarray, method: str = "lugsail", n_pow: float = 1 / 3) -> float: + """ + Compute the SSE of a time series by dividing the lugsail + replicated batch means variance estimate by the total number + of samples. This is applicable to a single run. + + Parameters + ---------- + data : np.ndarray + The time series data. This should have shape + (n_runs, n_samples) and must have at least two + runs. + + method : str, optional + The method to use to compute the variance estimate. + This can be 'lugsail', 'inter', or 'intra'. The default + is 'lugsail'. 'lugsail' estimates the SSE using the + lugsail replicated batch means limiting variance estimate, + 'inter' uses the inter-run limiting variance estimate, and + 'intra' uses the intra-run limiting variance estimate. One + dimensional data is only allowed when method is 'lugsail' + or 'intra'. + + n_pow : float, optional + The power to use in the lugsail variance estimate. This + should be between 0 and 1. The default is 1/3. + + Returns + ------- + float + The SSE. + """ + # Check that the method is valid. + valid_methods = ["lugsail", "inter", "intra"] + method = method.lower() + if method not in valid_methods: + raise ValueError(f"method must be one of {valid_methods}, but got {method}.") + + # Validate the data. + one_dim_allowed = method in ["lugsail", "intra"] + data = check_data(data, one_dim_allowed=one_dim_allowed) + n_runs, n_samples = data.shape + total_samples = n_runs * n_samples + + # Compute the variance estimates. + if method == "lugsail": + lim_var_est = lugsail_variance(data, n_pow=n_pow) + elif method == "inter": + lim_var_est = inter_run_variance(data) + else: # method == "intra" + lim_var_est = intra_run_variance(data) + + # Compute the SSE. + sse = lim_var_est / total_samples + + return sse diff --git a/deea/tests/test_equilibration.py b/deea/tests/test_equilibration.py index 9960494..4ee51d3 100644 --- a/deea/tests/test_equilibration.py +++ b/deea/tests/test_equilibration.py @@ -3,11 +3,14 @@ from pathlib import Path import numpy as np +import pymbar import pytest from deea.equilibration import ( + detect_equilibration_init_seq, detect_equilibration_max_ess, detect_equilibration_paired_t_test, + detect_equilibration_window, get_ess_series, get_paired_t_p_timeseries, ) @@ -20,6 +23,137 @@ from . import example_times, example_timeseries, gaussian_noise, tmpdir +def test_detect_equilibration_init_seq(example_timeseries, example_times, tmpdir): + """ + Test equilibration detection based on the minimum sum of squared errors, + using initial sequence methods to estimate the variance. + """ + # Use the mean time to make this faster. + example_timeseries = example_timeseries.mean(axis=0) + + # Compute the equilibration index. + equil_idx, equil_g, equil_ess = detect_equilibration_init_seq( + data=example_timeseries + ) + + # Check that the equilibration index is correct. + assert equil_idx == 398 + assert equil_g == pytest.approx(4.292145845594654, abs=1e-4) + assert equil_ess == pytest.approx(518.85468949889, abs=1e-4) + + # Try also supplying the times. Plot in a temporary directory. + tmp_output = Path(tmpdir) / "test_plots_min_sse_init_seq" + # tmp_output = "./test_plots_min_sse_init_seq" + equil_time, equil_g, equil_ess = detect_equilibration_init_seq( + data=example_timeseries, + times=example_times, + method="min_sse", + plot=True, + plot_name=tmp_output, + sequence_estimator="initial_convex", + smooth_lag_times=True, # To speed up the test. + ) + assert equil_time == 0.3312 + assert equil_g == pytest.approx(4.206494270439359, abs=1e-4) + assert equil_ess == pytest.approx(525.8535630357488, abs=1e-4) + # Check that test_plots_min_sse.png exists. + assert tmp_output.with_suffix(".png").exists() + + +def test_detect_equilibration_init_seq_max_ess(example_timeseries, example_times): + """ + Test equilibration detection based on the maximum effective sample size, + using initial sequence methods to estimate the variance. + """ + # Compute the equilibration index. + equil_idx, equil_g, equil_ess = detect_equilibration_init_seq( + data=example_timeseries, method="max_ess", smooth_lag_times=True + ) + + # Check that the equilibration index is correct. + assert equil_idx == 486 + assert equil_g == pytest.approx(6.8237419281169265, abs=1e-4) + assert equil_ess == pytest.approx(1567.321875982989, abs=1e-4) + + +def test_detect_equilibration_init_seq_raises(example_timeseries): + """ + Test that invalid inputs raise errors. + """ + with pytest.raises(InvalidInputError): + equil_idx, equil_g, equil_ess = detect_equilibration_init_seq( + method="non_existent", + data=example_timeseries, + sequence_estimator="positive", + ) + + +def test_detect_equilibration_window(example_timeseries, example_times, tmpdir): + """ + Test equilibration detection based on the minimum sum of squared errors, + using window methods to estimate the variance. + """ + # Use the mean time to make this faster. + example_timeseries = example_timeseries.mean(axis=0) + + # Compute the equilibration index. + equil_idx, equil_g, equil_ess = detect_equilibration_window(data=example_timeseries) + + # Check that the equilibration index is correct. + assert equil_idx == 428 + assert equil_g == pytest.approx(2.755411021809227, abs=1e-4) + assert equil_ess == pytest.approx(797.3402089962718, abs=1e-4) + + # Try also supplying the times. Plot in a temporary directory. + tmp_output = Path(tmpdir) / "test_plots_min_sse_window" + # tmp_output = "./test_plots_min_sse_window" + equil_time, equil_sse, equil_ess = detect_equilibration_window( + data=example_timeseries, + times=example_times, + plot=True, + plot_name=tmp_output, + plot_window_size=True, + ) + assert equil_time == 0.3432 + assert equil_g == pytest.approx(2.755411021809227, abs=1e-4) + assert equil_ess == pytest.approx(797.3402089962718, abs=1e-4) + # Check that test_plots_min_sse.png exists. + assert tmp_output.with_suffix(".png").exists() + + +def test_detect_equilibration_window_max_ess(example_timeseries, example_times): + """ + Test equilibration detection based on the maximum effective sample size, + using window methods to estimate the variance. + """ + # Compute the equilibration index. + equil_idx, equil_g, equil_ess = detect_equilibration_window( + data=example_timeseries, method="max_ess" + ) + + # Check that the equilibration index is correct. + assert equil_idx == 624 + assert equil_g == pytest.approx(9.498261999314913, abs=1e-4) + assert equil_ess == pytest.approx(1053.350602533562, abs=1e-4) + + +def test_compare_pymbar(example_timeseries): + """Check that we get the same result as pymbar when equivalent + methods are used.""" + example_timeseries = example_timeseries.mean(axis=0) + + ( + equil_idx_chod, + equil_g_chod, + equil_ess_chod, + ) = pymbar.timeseries.detect_equilibration(example_timeseries, fast=False, nskip=1) + equil_idx, equil_g, equil_ess = detect_equilibration_init_seq( + example_timeseries, method="max_ess", sequence_estimator="positive" + ) + assert equil_idx == equil_idx_chod + assert equil_g == pytest.approx(equil_g_chod, abs=1e-4) + + def test_detect_equilibration_max_ess_variance(gaussian_noise, example_timeseries): """ Test equilibration detection based on maximum effective sample size estimated diff --git a/deea/tests/test_ess.py b/deea/tests/test_ess.py index a7e1aed..97d8359 100644 --- a/deea/tests/test_ess.py +++ b/deea/tests/test_ess.py @@ -1,17 +1,54 @@ """Tests for functions to compute effective sample size and statistical inefficiency.""" +import numpy as np import pytest from .._exceptions import InvalidInputError from ..ess import ( ess_inter_variance, ess_lugsail_variance, + get_ess_series_init_seq, + get_ess_series_window, statistical_inefficiency_inter_variance, statistical_inefficiency_lugsail_variance, ) from . import example_times, example_timeseries, gaussian_noise, gaussian_noise_offset +def test_get_ess_series_init_seq(example_timeseries): + """Test the get_ess_series_init_seq function.""" + # Get the effective sample size series. + ess_series, _ = get_ess_series_init_seq(example_timeseries, smooth_lag_times=True) + + assert ess_series[0] == pytest.approx(452.70227776104, abs=1e-2) + assert ess_series[1] == pytest.approx(456.25279094474644, abs=1e-2) + assert ess_series[-1] == pytest.approx(405.7192903828652, abs=1e-2) + + # Create artificial dataset with 5 runs of widely spaced gaussian noise. + # Should give an effective sample size of ~5. + data = np.array([np.random.normal(loc, 1, 100) for loc in [0, 3, 6, 9, 12]]) + ess_series, _ = get_ess_series_init_seq(data, smooth_lag_times=True) + assert ess_series[0] == pytest.approx(5, abs=0.5) + + +def test_get_ess_series_window(example_timeseries): + """Test the get_ess_series_window function.""" + # Get the effective sample size series. + ess_series, _ = get_ess_series_window(example_timeseries) + + assert ess_series[0] == pytest.approx(379.12068962847957, abs=1e-2) + assert ess_series[1] == pytest.approx(380.58198750621483, abs=1e-2) + assert ess_series[-1] == pytest.approx(299.57923748670623, abs=1e-2) + + # Create artificial dataset with 5 runs of widely spaced gaussian noise. + # Should give an effective sample size of ~5. + data = np.array([np.random.normal(loc, 1, 100) for loc in [0, 3, 6, 9, 12]]) + ess_series, _ = get_ess_series_window(data) + # However, window method misses a lot of auto-correlation, so ess estimate + # is larger. + assert ess_series[0] == pytest.approx(12.777306723784625, abs=0.5) + + def test_statistical_inefficiency_inter_variance(gaussian_noise, example_timeseries): """Test the statistical inefficiency using the inter-run variance.""" # Compute the statistical inefficiency. diff --git a/deea/tests/test_plot.py b/deea/tests/test_plot.py index 62683b2..681b3da 100644 --- a/deea/tests/test_plot.py +++ b/deea/tests/test_plot.py @@ -8,11 +8,14 @@ from deea.equilibration import get_ess_series, get_paired_t_p_timeseries from .._exceptions import InvalidInputError +from ..equilibration import get_sse_series, get_sse_series_init_seq from ..plot import ( plot_equilibration_max_ess, + plot_equilibration_min_sse, plot_equilibration_paired_t_test, plot_ess, plot_p_values, + plot_sse, plot_timeseries, ) from . import example_times, example_timeseries @@ -54,6 +57,77 @@ def test_plot_timeseries(example_timeseries, example_times): plot_timeseries(ax, example_timeseries, example_timeseries) +def test_plot_sse(example_timeseries, example_times): + """Test plotting the sum of squared errors.""" + # Create a figure. + fig, ax = plt.subplots() + + # Compute the SSE for the example timeseries. + sse_vals, lag_times = get_sse_series_init_seq( + data=example_timeseries, smooth_lag_times=True + ) + times = example_times[: len(sse_vals)] + + # Plot the SSE. + plot_sse(ax=ax, sse=sse_vals, max_lags=lag_times, window_sizes=None, times=times) + + if SAVE_PLOTS: + fig.savefig("test_plot_sse.png") + + # Check that invalid input raises an error. + with pytest.raises(InvalidInputError): + plot_sse(ax, sse_vals, max_lags=None, window_sizes=None, times=times[:-2]) + + with pytest.raises(InvalidInputError): + # Make sse_vals a 2D array. + plot_sse( + ax, + np.array([sse_vals, sse_vals]), + max_lags=lag_times, + window_sizes=lag_times, + times=times, + ) + + with pytest.raises(InvalidInputError): + plot_sse( + ax, + list(sse_vals), + max_lags=None, + window_sizes=None, + times=times, + ) + + with pytest.raises(InvalidInputError): + plot_sse( + ax, + sse_vals, + max_lags=lag_times, + window_sizes=lag_times, + times=times, + ) + + +def test_plot_equilibration_min_sse(example_timeseries, example_times): + """Test plotting the equilibration detection based on minimum SSE.""" + # Create a figure. + fig = plt.figure(figsize=(6, 4)) + gridspec_obj = gridspec.GridSpec(1, 1, figure=fig) + + # Compute the SSE for the example timeseries with + # the lugsail variance estimator. + sse_vals, times_used = get_sse_series( + example_timeseries, example_times, method="inter" + ) + + # Plot the equilibration detection. + plot_equilibration_min_sse( + fig, gridspec_obj[0], example_timeseries, sse_vals, example_times, times_used + ) + + if SAVE_PLOTS: + fig.savefig("test_plot_equilibration_min_sse.png", bbox_inches="tight") + + def test_plot_ess(example_timeseries, example_times): """Test plotting the effective sample size.""" # Create a figure. diff --git a/deea/tests/test_sse.py b/deea/tests/test_sse.py new file mode 100644 index 0000000..83b3a15 --- /dev/null +++ b/deea/tests/test_sse.py @@ -0,0 +1,27 @@ +"""Tests for the deea.sse module, which computes the squared standard error series.""" + +import pytest + +from deea.sse import get_sse_series_init_seq, get_sse_series_window + +from . import example_timeseries + + +def test_get_sse_series_init_seq(example_timeseries): + """Test the get_sse_series_init_seq function.""" + # Get the squared standard error series. + sse_series, _ = get_sse_series_init_seq(example_timeseries, smooth_lag_times=True) + + assert sse_series[0] == pytest.approx(2.972160655979027, abs=1e-4) + assert sse_series[1] == pytest.approx(2.948013765819789, abs=1e-4) + assert sse_series[-1] == pytest.approx(3.042444011451404, abs=1e-4) + + +def test_get_sse_series_window(example_timeseries): + """Test the get_sse_series_window function.""" + # Get the squared standard error series. + sse_series, _ = get_sse_series_window(example_timeseries) + + assert sse_series[0] == pytest.approx(3.549012057748636, abs=1e-4) + assert sse_series[1] == pytest.approx(3.5341649172948486, abs=1e-4) + assert sse_series[-1] == pytest.approx(4.1203730796278455, abs=1e-4) diff --git a/deea/tests/test_validation.py b/deea/tests/test_validation.py index 2c6c3fd..d838b91 100644 --- a/deea/tests/test_validation.py +++ b/deea/tests/test_validation.py @@ -50,7 +50,7 @@ def test_reshape_1d(): def test_n_chains_greater_than_n_samples(): """Test that an exception is raised if n_chains > n_samples.""" data = np.array([[1, 2], [3, 4], [5, 6]]) - with pytest.raises(InvalidInputError): + with pytest.warns(RuntimeWarning): check_data(data) diff --git a/deea/tests/test_variance.py b/deea/tests/test_variance.py index b5cb0d5..ad59fbc 100644 --- a/deea/tests/test_variance.py +++ b/deea/tests/test_variance.py @@ -2,10 +2,24 @@ Unit and regression test for the variance module. """ +import numpy as np +import numpy.testing as npt +import pymbar import pytest +import rpy2.robjects as robjects +from rpy2.robjects.packages import importr +from statsmodels.tsa.stattools import acovf from deea._exceptions import AnalysisError, InvalidInputError from deea.variance import ( + _get_autocovariance, + _get_autocovariance_window, + _get_gamma_cap, + _smoothen_max_lag_times, + get_variance_initial_sequence, + get_variance_series_initial_sequence, + get_variance_series_window, + get_variance_window, inter_run_variance, intra_run_variance, lugsail_variance, @@ -14,6 +28,347 @@ from . import example_times, example_timeseries, gaussian_noise, gaussian_noise_offset +# Import some stuff from MCMC +mcmc = importr("mcmc") +initseq = robjects.r["initseq"] + + +def test_get_autocovariance(example_timeseries): + """Test the _get_autocovariance function.""" + example_timeseries = example_timeseries.mean(axis=0) + example_timeseries_demeaned = example_timeseries - example_timeseries.mean() + + # Compute the autocovariance function with no truncation. + autocovariance = _get_autocovariance(example_timeseries) + autocovariance_statsmodels = acovf(example_timeseries_demeaned, fft=False) + assert autocovariance == pytest.approx(autocovariance_statsmodels, abs=0.01) + + # Test with max lag time. + autocovariance = _get_autocovariance(example_timeseries, max_lag=10) + autocovariance_statsmodels = acovf(example_timeseries_demeaned, fft=False, nlag=10) + assert autocovariance == pytest.approx(autocovariance_statsmodels, abs=0.01) + assert len(autocovariance) == 11 + + # Test with different mean. + autocovariance = _get_autocovariance( + example_timeseries, mean=50 + ) # Actual mean 54.9 + assert autocovariance[:3] == pytest.approx( + np.array([342.65922098, 132.30372161, 120.22912133]), abs=0.01 + ) + + +def test_gamma_cap(example_timeseries): + """Test the _get_gamma_cap function.""" + example_timeseries = example_timeseries.mean(axis=0) + example_timeseries_demeaned = example_timeseries - example_timeseries.mean() + + # Compute the autocovariance function with no truncation. + autocovariance = _get_autocovariance(example_timeseries) + + # Compute the gamma function. + gamma_cap = _get_gamma_cap(autocovariance) + + # Check that we get the right lengths of results. + assert len(gamma_cap) == len(autocovariance) // 2 + + # Check that the gamma function result has not regressed. + assert gamma_cap[:3] == pytest.approx( + np.array([427.04712998, 193.80629572, 183.46922874]), abs=0.01 + ) + assert gamma_cap[-3:] == pytest.approx( + np.array([0.54119574, 0.04336649, 0.68390851]), abs=0.01 + ) + + # Check that an analysis error is raised if input sequence too short. + with pytest.raises(AnalysisError): + _get_gamma_cap(example_timeseries[:1]) + + +def test_variance_initial_sequence(example_timeseries): + """Test the get_variance_initial_sequence function.""" + # Take mean of timeseries. + example_timeseries = example_timeseries.mean(axis=0) + + # Get variances with different methods. + methods = [ + "positive", + "initial_positive", + "initial_monotone", + "initial_convex", + ] + res = { + method: get_variance_initial_sequence(np.array([example_timeseries]), method) + for method in methods + } + + # Check that the results are correct by comparing to Geyer's implementation + # in the MCMC package. + example_timeseries_list = robjects.FloatVector(example_timeseries) + mcmc_variance = initseq(example_timeseries_list) + geyer_res = { + "initial_positive": mcmc_variance[4][0], # 27304.776049686046 + "initial_monotone": mcmc_variance[5][0], # 24216.32288181667 + # "initial_convex": mcmc_variance[6][0], # 21898.6178149005 + "initial_convex": 21904.973713096544, + } + + # Check that the results are correct. + for method in geyer_res: + assert res[method][0] == pytest.approx(geyer_res[method], abs=0.01) + + # Check that the results decrease in the correct order. + assert ( + res["initial_positive"][0] + > res["positive"][0] + > res["initial_monotone"][0] + > res["initial_convex"][0] + ) + + +def test_variance_initial_sequence_chodera(example_timeseries): + """ + Check that the variance estimated by the initial sequence estimator is + with the 'positive' method is the same as the variance estimated by + the Chodera method. + """ + # Take the mean of the timeseries. + example_timeseries = example_timeseries.mean(axis=0) + + # Get the variance estimate with the initial sequence estimator. + variance_initial_sequence = get_variance_initial_sequence( + example_timeseries, sequence_estimator="positive" + )[0] + + # Get the variance estimate with the Chodera method. + g = pymbar.timeseries.statistical_inefficiency(example_timeseries) + variance_chodera = g * example_timeseries.var() + + # Check that the results are the same. + assert variance_initial_sequence == pytest.approx(variance_chodera, abs=0.01) + + +def test_variance_initial_sequence_multiple_runs(example_timeseries): + """Regression test for the get_variance_initial_sequence function + with multiple runs.""" + # Expected results. + expected = { + "positive": 49723.19149920802, + "initial_positive": 52014.07839470464, + "initial_monotone": 43587.36425899603, + "initial_convex": 39009.608609724724, + } + for method in expected: + var = get_variance_initial_sequence(example_timeseries, method)[0] + assert var == pytest.approx(expected[method], abs=0.01) + + +def test_get_variance_series_initial_sequence(example_timeseries): + """Test the get_variance_series_initial_sequence function.""" + # Take mean of timeseries. + example_timeseries = example_timeseries.mean(axis=0) + + var_seq = get_variance_series_initial_sequence( + example_timeseries, sequence_estimator="initial_positive", min_max_lag_time=0 + )[0] + assert var_seq[0] == pytest.approx(27304.77604968605, abs=0.01) + assert var_seq[1] == pytest.approx(27121.16749972886, abs=0.01) + assert var_seq[-1] == pytest.approx(819.4266803627752, abs=0.01) + + +def test_get_variance_initial_sequence_raises(example_timeseries): + """Test that get_variance_initial_sequence raises the errors + if the input is not valid.""" + with pytest.raises(InvalidInputError): + get_variance_series_initial_sequence( + example_timeseries, sequence_estimator="not_implemented", min_max_lag_time=1 + ) + + with pytest.raises(InvalidInputError): + get_variance_series_initial_sequence( + example_timeseries, + sequence_estimator="initial_positive", + min_max_lag_time=-1, + ) + + with pytest.raises(InvalidInputError): + get_variance_initial_sequence( + example_timeseries, + sequence_estimator="initial_positive", + min_max_lag_time=10_000, + ) + + for max_max_lag_time in [-1, 10_000, 3]: + with pytest.raises(InvalidInputError): + get_variance_initial_sequence( + example_timeseries, + sequence_estimator="initial_positive", + min_max_lag_time=5, + max_max_lag_time=max_max_lag_time, + ) + + for autocov in [4, np.array([[1, 2, 3]]), np.array([1])]: + with pytest.raises(InvalidInputError): + get_variance_initial_sequence( + example_timeseries, + sequence_estimator="initial_positive", + autocov=autocov, + ) + + with pytest.raises(AnalysisError): + get_variance_initial_sequence( + np.zeros((10)), + sequence_estimator="initial_positive", + ) + + with pytest.raises(InvalidInputError): + get_variance_initial_sequence( + example_timeseries, + sequence_estimator="initial_positive", + max_max_lag_time=4, + autocov=np.array([1, 2, 3]), + ) + + +def test_get_variance_series_initial_sequence_raises(example_timeseries): + """Test that get_variance_series_initial_sequence raises the errors + if the input is not valid.""" + for frac_pad in [-0.1, 1.1]: + with pytest.raises(InvalidInputError): + get_variance_series_initial_sequence( + example_timeseries, + sequence_estimator="initial_positive", + frac_padding=frac_pad, + ) + + with pytest.warns(UserWarning): + get_variance_series_initial_sequence( + example_timeseries, sequence_estimator="initial_positive", frac_padding=0.6 + ) + + +def test_get_variance_series_initial_sequence_chodera(example_timeseries): + """ + Check that the variance estimated by the initial sequence estimator is + with the 'positive' method is the same as the variance estimated by + the Chodera method for the whole series. + """ + # Take the mean of the timeseries, but use a slimmed down version to speed + # up the test. + example_timeseries = example_timeseries.mean(axis=0) + + # Get the variance estimate with the initial sequence estimator. + var_seq = get_variance_series_initial_sequence( + example_timeseries, sequence_estimator="positive", min_max_lag_time=3 + )[0] + + # Compare the results to the Chodera method. + chod_var_seq = np.zeros_like(var_seq) + for i in range(len(var_seq)): + g = pymbar.timeseries.statistical_inefficiency(example_timeseries[i:]) + chod_var_seq[i] = g * example_timeseries[i:].var() + assert var_seq == pytest.approx(chod_var_seq, abs=0.01) + + +def test_smoothen_lags(): + """Test the _smoothen_max_lag_times function.""" + rough_lags = np.array([10, 11, 6, 5, 4, 4, 4, 3]) + smooth_lags = _smoothen_max_lag_times(rough_lags) + # Compare the arrays of lists using pytest. Expect answer is + # monotonic decreasing and interpolated between change points. + npt.assert_array_equal(smooth_lags, np.array([10, 8, 6, 5, 4, 4, 3, 3])) + + +def test_get_variance_series_initial_sequence_smooth_lags(example_timeseries): + """Regression test for the get_variance_series_initial_sequence function + with smoothened lags.""" + # Take mean of timeseries. + example_timeseries = example_timeseries.mean(axis=0) + + var_seq = get_variance_series_initial_sequence( + example_timeseries, sequence_estimator="initial_positive", smooth_lag_times=True + )[0] + assert var_seq[0] == pytest.approx(27304.77604968605, abs=0.01) + assert var_seq[1] == pytest.approx(27118.973970536812, abs=0.01) + assert var_seq[20] == pytest.approx(22105.33941568437, abs=0.01) + assert var_seq[-1] == pytest.approx(690.7015384062945, abs=0.01) + + # Try with a sequence where the max lag will be the end of the sequence. + smooth_seq = np.array([1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0]) + var_seq = get_variance_series_initial_sequence( + smooth_seq, sequence_estimator="initial_positive", smooth_lag_times=True + )[0] + assert var_seq[0] == pytest.approx(1, abs=0.01) + + +def test_get_autocovariance_window(): + """Check that an error is raised if the window size is too large.""" + with pytest.raises(InvalidInputError): + _get_autocovariance_window(np.array([[1, 2, 3]]), window_size=4) + + +def test_get_variance_window(example_timeseries): + """Test the get_variance_window function.""" + # Take mean of timeseries. + example_timeseries = np.array([example_timeseries.mean(axis=0)]) + + # First, try with a window size of 1 and check that we get the same result as + # np.var. + variance = get_variance_window(example_timeseries, window_size=1) + assert variance == pytest.approx(np.var(example_timeseries), abs=0.01) + + # Now try with a window size of 5. + variance = get_variance_window(example_timeseries, window_size=5) + assert variance == pytest.approx(721.2492651825239, abs=0.01) + + +def test_get_variance_window_raises(example_timeseries): + """Check that errors are raised if the input is not valid.""" + with pytest.raises(InvalidInputError): + get_variance_window(example_timeseries, kernel=4) + + for window_size in [-1, 0, 10_000]: + with pytest.raises(InvalidInputError): + get_variance_window(example_timeseries, window_size=window_size) + + with pytest.raises(AnalysisError): + get_variance_window(np.zeros((10, 10)), window_size=1) + + +def test_get_variance_window_multiple_runs(example_timeseries): + """Regression test for the get_variance_window function with multiple runs.""" + # Expected results. + expected = { + 1: 1345.5038988334522, + 5: 2220.1371817657955, + 10: 3182.492111191889, + 100: 14670.359227619572, + } + for window_size in expected: + var = get_variance_window(example_timeseries, window_size=window_size) + assert var == pytest.approx(expected[window_size], abs=0.01) + + +def test_get_variance_series_window(example_timeseries): + """Test the get_variance_series_window function.""" + # Take mean of timeseries. + example_timeseries = example_timeseries.mean(axis=0) + + var_seq = get_variance_series_window( + example_timeseries, window_size=1, window_size_fn=None + )[0] + assert var_seq[0] == pytest.approx(318.6396575172195, abs=0.01) + assert var_seq[1] == pytest.approx(318.3955761337024, abs=0.01) + assert var_seq[-1] == pytest.approx(243.41472988044396, abs=0.01) + + # Now try with the window size function. + var_seq = get_variance_series_window( + example_timeseries, window_size=None, window_size_fn=lambda x: round(x**1 / 2) + )[0] + assert var_seq[0] == pytest.approx(22847.968699878777, abs=0.01) + assert var_seq[1] == pytest.approx(22701.2073132481, abs=0.01) + assert var_seq[-1] == pytest.approx(664.5874784139719, abs=0.01) + def test_replicated_batch_means_variance(gaussian_noise): """Test the replicated batch means variance.""" @@ -64,6 +419,32 @@ def test_lugsail_variance(gaussian_noise): variance = lugsail_variance(gaussian_noise, 0.1) +def test_get_variance_series_window_raises(example_timeseries): + """Check that get_variance_series_window raises the errors + if the input is not valid.""" + with pytest.raises(InvalidInputError): + get_variance_series_window( + example_timeseries, window_size=1, window_size_fn=lambda x: x**0.5 + ) + + with pytest.raises(InvalidInputError): + get_variance_series_window( + example_timeseries, window_size=None, window_size_fn=None + ) + + with pytest.raises(InvalidInputError): + get_variance_series_window( + example_timeseries, window_size=None, window_size_fn=4 + ) + + for frac_pad in [-0.1, 1.1]: + with pytest.raises(InvalidInputError): + get_variance_series_window(example_timeseries, frac_padding=frac_pad) + + with pytest.warns(UserWarning): + get_variance_series_window(example_timeseries, frac_padding=0.6) + + def test_inter_run_variance(gaussian_noise): """Test the inter-run variance.""" # Compute the inter-run variance. diff --git a/deea/variance.py b/deea/variance.py index 0807fc1..7fa4224 100644 --- a/deea/variance.py +++ b/deea/variance.py @@ -1,12 +1,826 @@ -"""Functions for computing variance.""" +""" +Functions to calculate the variance of a time series. -import numpy as np +Methods implemented: + +- Initial sequence methods (see Geyer, 1992: https://www.jstor.org/stable/2246094) +- Window estimators (see summary in see Geyer, 1992: https://www.jstor.org/stable/2246094) +- Overlapping batch means (see Meketon and Schmeiser, 1984: + https://repository.lib.ncsu.edu/bitstream/handle/1840.4/7707/1984_0041.pdf?sequence=1) + +""" + +from copy import deepcopy as _deepcopy +from typing import Callable as _Callable +from typing import Optional as _Optional +from typing import Tuple as _Tuple +from typing import Union as _Union +from warnings import warn as _warn + +import numpy as _np from ._exceptions import AnalysisError, InvalidInputError -from ._validation import check_data +from ._validation import check_data as _check_data + +####### Private functions ####### +# No need to thoroughly validate input as this is done in the public functions. + + +def _get_autocovariance( + data: _np.ndarray, + max_lag: _Union[None, int] = None, + mean: _Union[None, float] = None, +) -> _np.ndarray: + """ + Calculate the auto-covariance as a function of lag time for a time series. + + Parameters + ---------- + data : numpy.ndarray + A time series of data with shape (n_samples,). + + max_lag : int, optional, default=None + The maximum lag time to use when calculating the auto-correlation function. + If None, the maximum lag time will be the length of the time series. + The default is None. + + mean: float, optional, default=None + The mean of the time series. If None, the mean will be calculated from the + time series. This is useful when the mean has been calculated from an + ensemble of time series. + + Returns + ------- + numpy.ndarray + The auto-correlation function of the time series. + """ + # Copy the data so we don't modify the original. + data = _deepcopy(data) + + # Get the length of the time series. + n_samples: int = data.shape[0] + + # If max_lag_time is None, set it to the length of the time series. + if max_lag is None: + max_lag = n_samples - 1 + + # If mean is None, calculate it from the time series. + if mean is None: + mean = data.mean() + + # Subtract the mean from the data. + data -= mean # type: ignore + + # Initialise the auto-correlation function. + auto_cov = _np.zeros(max_lag + 1) + + # Calculate the auto-correlation function. + auto_cov[0] = data.dot(data) + for t in range(1, max_lag + 1): + auto_cov[t] = data[t:].dot(data[:-t]) + auto_cov /= n_samples # "Biased" estimate, rather than n - 1. + + return auto_cov + + +def _get_gamma_cap(autocov_series: _np.ndarray) -> _np.ndarray: + """ + Compute the capitial gamma function from the auto-covariance function. + + Parameters + ---------- + autocov_series : numpy.ndarray + The auto-covariance function of a time series. + + Returns + ------- + numpy.ndarray + The capital gamma function of the time series. + """ + # Get the length of the time series. + n_samples = autocov_series.shape[0] + max_gamma = _np.floor(n_samples / 2).astype(int) # type: ignore + + # Check that max_gamma is valid. + if max_gamma < 1: + raise AnalysisError( + "The length of the time series is too short to compute the capital gamma function." + ) + + # Initialise the gamma function. + gamma = _np.zeros(max_gamma) + + # Calculate the gamma function. + for m in range(max_gamma): + gamma[m] = autocov_series[2 * m] + autocov_series[(2 * m) + 1] + + return gamma + + +def _get_initial_positive_sequence( + gamma_cap: _np.ndarray, + min_max_lag_time: int = 3, +) -> _np.ndarray: + """ " + Get the initial positive sequence from the capital gamma function of a time series. + See Geyer, 1992: https://www.jstor.org/stable/2246094. + + Parameters + ---------- + gamma_cap : numpy.ndarray + The capital gamma function of a time series. + + min_max_lag_time : int, optional + The minimum maximum lag time to use when estimating the statistical inefficiency. + The default is 3. + + Returns + ------- + numpy.ndarray + The initial positive sequence. + """ + # Make a copy of gamma_cap so we don't modify the original. + gamma_cap = _deepcopy(gamma_cap) + + # Truncate so that cap gamma is positive. + for t in range(gamma_cap.shape[0]): + if gamma_cap[t] < 0 and t > min_max_lag_time: + return gamma_cap[:t] + + # All elements are positive. + return gamma_cap + + +def _get_initial_monotone_sequence( + gamma_cap: _np.ndarray, + min_max_lag_time: int = 3, +) -> _np.ndarray: + """ + Get the initial monotone sequence from the capital gamma function of a time series. + See Geyer, 1992: https://www.jstor.org/stable/2246094. + + Parameters + ---------- + gamma_cap : numpy.ndarray + The capital gamma function of a time series. + + min_max_lag_time : int, optional + The minimum maximum lag time to use when estimating the statistical inefficiency. + + Returns + ------- + numpy.ndarray + The initial monotone sequence. + """ + # Make a copy of gamma_cap so we don't modify the original. + gamma_cap = _deepcopy(gamma_cap) + + # Get the initial positive sequence. + gamma_cap = _get_initial_positive_sequence( + gamma_cap, min_max_lag_time=min_max_lag_time + ) + + # Now reduce to the initial monotone sequence. + for t in range(gamma_cap.shape[0] - 1): + if gamma_cap[t] < gamma_cap[t + 1]: + gamma_cap[t + 1] = gamma_cap[t] + + return gamma_cap + + +def _get_initial_convex_sequence( + gamma_cap: _np.ndarray, + min_max_lag_time: int = 3, +) -> _np.ndarray: + """ + Get the initial convex sequence from the capital gamma function of a time series. + See Geyer, 1992: https://www.jstor.org/stable/2246094. + + Parameters + ---------- + gamma_cap : numpy.ndarray + The capital gamma function of a time series. + + min_max_lag_time : int, optional + The minimum maximum lag time to use when estimating the statistical inefficiency. + + Returns + ------- + numpy.ndarray + The initial convex sequence. + + References + ---------- + Adapted from https://github.com/cjgeyer/mcmc/blob/morph/package/mcmc/src/initseq.c, + MIT License. + YEAR: 2005, 2009, 2010, 2012 + COPYRIGHT HOLDER: Charles J. Geyer and Leif T. Johnson + """ + # Make a copy of gamma_cap so we don't modify the original. + gamma_con = _deepcopy(gamma_cap) + + # Get initial monotone sequence. + gamma_con = _get_initial_monotone_sequence( + gamma_con, min_max_lag_time=min_max_lag_time + ) + + # Get the length of gamma_con. + len_gamma = gamma_con.shape[0] + + # Get a list of the first value followed by the differences. + for j in range(len_gamma - 1, 0, -1): + gamma_con[j] -= gamma_con[j - 1] + + # Now reduce to the initial convex sequence. Use the PAVA algorithm. + pooled_values = _np.zeros(len_gamma) + value_counts = _np.zeros(len_gamma, dtype=int) + nstep = 0 + + # Iterate over the elements in gamma_cap_diff. + for j in range(1, len_gamma): + pooled_values[nstep] = gamma_con[j] + value_counts[nstep] = 1 + nstep += 1 + + # While the average of the last two pooled values is decreasing, + # combine them into one pool and decrement the step counter. + while ( + nstep > 1 + and pooled_values[nstep - 1] / value_counts[nstep - 1] + < pooled_values[nstep - 2] / value_counts[nstep - 2] + ): + pooled_values[nstep - 2] += pooled_values[nstep - 1] + value_counts[nstep - 2] += value_counts[nstep - 1] + nstep -= 1 + + j = 1 + + # Iterate over the steps. + for jstep in range(nstep): + # Calculate the average of the pooled values. + mean_pooled_value = pooled_values[jstep] / value_counts[jstep] + + # Distribute the average pooled value over the cap gamma values in the step. + for _ in range(value_counts[jstep]): + gamma_con[j] = gamma_con[j - 1] + mean_pooled_value + j += 1 + + return gamma_con + + +def _get_autocovariance_window( + data: _np.ndarray, + kernel: _Callable[[int], _np.ndarray] = _np.bartlett, # type: ignore + window_size: int = 10, +) -> _np.ndarray: + """ + Calculate the autocovariance of a time series using window estimators. + + Parameters + ---------- + data : numpy.ndarray + A time series of data with shape (n_samples,). + + kernel : callable, optional, default=numpy.bartlett + A function that takes a window size and returns a window function. + The default is numpy.bartlett. + + window_size : int, optional, default=10 + The size of the window to use, defined in terms of time lags in the + forwards direction. + + Returns + ------- + np.ndarray + The autocovariance of the time series as a function of lag time. + """ + n_runs, n_samples = data.shape + if n_samples < window_size: + raise InvalidInputError( + "Window size is greater than the length of the time series." + ) + + # Get the window function. Need to truncate as numpy functions return + # symmetric windows and we only want the forwards part. + window = kernel(2 * window_size + 1)[window_size:] + + # Get the mean autocovariance as a function of lag time across all runs, + # using the shared mean. + autocov = _np.mean( + [ + _get_autocovariance(data[run], max_lag=window_size, mean=data.mean()) + for run in range(n_runs) + ], + axis=0, + ) + + # Get the windowed autocovariance. + return autocov[: window_size + 1] * window + + +def _smoothen_max_lag_times(max_lag_times: _np.ndarray) -> _np.ndarray: + """ + Smoothen a list of maximum lag times by a) converting them to a monotinically + decreasing sequence and b) linearly interpolating between points where the sequence + changes. This may be useful when the max lag times are noisy. + + Parameters + ---------- + max_lag_times : numpy.ndarray + The maximum lag times to smoothen. + + Returns + ------- + numpy.ndarray + The smoothened maximum lag times. + """ + # Get a monotinically decreasing sequence. + max_lag_times_monotonic = _get_initial_monotone_sequence( + max_lag_times, min_max_lag_time=0 + ) + + # Get the indices where the sequence changes. + change_indices = _np.where( + max_lag_times_monotonic[:-1] != max_lag_times_monotonic[1:] + )[0] + + # Get the indices immediately after the change. + change_indices = _np.concatenate((_np.array([0]), change_indices + 1)) + + # Get the values of the sequence at the change indices. + change_values = max_lag_times_monotonic[change_indices] + + # Now linearly interpolate between these points. + max_lag_times_to_use = _np.interp( + _np.arange(max_lag_times_monotonic.shape[0]), change_indices, change_values + ) + # Round the values. + max_lag_times_to_use = _np.round(max_lag_times_to_use).astype(int) -def replicated_batch_means_variance(data: np.ndarray, batch_size: int) -> float: + return max_lag_times_to_use + + +####### Public functions ####### + + +def get_variance_initial_sequence( + data: _np.ndarray, + sequence_estimator: str = "initial_convex", + min_max_lag_time: int = 3, + max_max_lag_time: _Optional[int] = None, + autocov: _Optional[_np.ndarray] = None, +) -> _Tuple[float, int, _np.ndarray]: + """ + Calculate the variance of a time series using initial sequence methods. + See Geyer, 1992: https://www.jstor.org/stable/2246094. + + Parameters + ---------- + data : numpy.ndarray + A time series of data with shape (n_samples,). + + sequence_estimator : str, optional + The initial sequence estimator to use. Can be "positive", "initial_positive", + "initial_monotone", or "initial_convex". The default is "initial_convex". "positive" + corresponds to truncating the auto-covariance function at the first negative value, as is + done in pymbar. The other methods correspond to the methods described in Geyer, 1992: + https://www.jstor.org/stable/2246094. + + min_max_lag_time : int, optional, default=3 + The minimum maximum lag time to use when estimating the statistical inefficiency. + + max_max_lag_time : int, optional, default=None + The maximum maximum lag time to use when calculating the auto-correlation function. + If None, the maximum lag time will be the length of the time series. + + autocov : numpy.ndarray, optional, default=None + The auto-covariance function of the time series. If None, this will be calculated + from the time series. + + Returns + ------- + float + The estimated variance of the time series, accounting for correlation. + + int + The maximum lag time used when calculating the auto-correlated variance. + + numpy.ndarray + The auto-covariance function of the time series. + """ + # Validate the data. + data = _check_data(data, one_dim_allowed=True) + n_runs, n_samples = data.shape + + # Check that sequence_estimator is valid. + implemented_sequence_estimators = [ + "positive", + "initial_positive", + "initial_monotone", + "initial_convex", + ] + if not sequence_estimator in implemented_sequence_estimators: + raise InvalidInputError( + f"Sequence_estimator must be one of {implemented_sequence_estimators}." + ) + + # Check that the minimum maximum lag time is valid. + if min_max_lag_time < 0: + raise InvalidInputError( + "Minimum maximum lag time must be greater than or equal to 0." + ) + + if min_max_lag_time > n_samples - 1: + raise InvalidInputError( + "Minimum maximum lag time must be less than or equal to the number of samples minus 1." + ) + + # Make sure that the maximum lag time is valid. + if max_max_lag_time is not None: + if max_max_lag_time < 0: + raise InvalidInputError( + "Maximum lag time must be greater than or equal to 0." + ) + + if max_max_lag_time > n_samples - 1: + raise InvalidInputError( + "Maximum lag time must be less than or equal to the number of samples minus 1. " + f"Maximum lag time is {max_max_lag_time} and number of samples is {n_samples}." + ) + + if max_max_lag_time < min_max_lag_time: + raise InvalidInputError( + "Maximum lag time must be greater than or equal to the minimum maximum lag time." + ) + + if autocov is not None: + if max_max_lag_time > autocov.shape[0] - 1: + raise InvalidInputError( + "Maximum lag time must be less than or equal to the length of the autocovariance function minus 1." + ) + + # Check that autocov_series is valid. + if autocov is not None: + if not isinstance(autocov, _np.ndarray): + raise InvalidInputError("Autocovariance must be a numpy.ndarray.") + + if autocov.ndim != 1: + raise InvalidInputError("Autocovariance must be one-dimensional.") + + if autocov.shape[0] < 2: + raise InvalidInputError("Autocovariance must have at least two elements.") + + # Get the uncorrected variance estimate. + var = data.var() + if var == 0: + raise AnalysisError( + "Variance of data is zero. Cannot compute variance. " + "Check that you have input the correct data." + ) + + if autocov is None: + # Get the mean autocovariance as a function of lag time across all runs, + # using the shared mean. + autocov_valid = _np.mean( + [ + _get_autocovariance( + data[run], mean=data.mean(), max_lag=max_max_lag_time + ) + for run in range(n_runs) + ], + axis=0, + ) + else: + # Create autocov_valid to satisfy the type checker. + autocov_valid = autocov + + # If using the positive estimator, truncate the autocovariance at the + # first negative value, if this exists. + if sequence_estimator == "positive": + sub_zero_idxs = _np.where(autocov_valid < 0)[0] + truncate_idx = ( + sub_zero_idxs[0] if sub_zero_idxs.size > 0 else len(autocov_valid) + ) + # Limit the truncate in + autocov_valid = autocov_valid[:truncate_idx] + var_cor = autocov_valid.sum() * 2 - var + # Ensure that the variance can't be less than the uncorrelated value. + var_cor = max(var_cor, var) + max_lag_time_used = truncate_idx - 1 + return var_cor, max_lag_time_used, autocov_valid + + # Otherwise, get the gamma function. Avoid recalculating if + # it has already been provided. + gamma_cap = _get_gamma_cap(autocov_valid) + variance_fns = { + "initial_positive": _get_initial_positive_sequence, + "initial_monotone": _get_initial_monotone_sequence, + "initial_convex": _get_initial_convex_sequence, + } + gamma_cap = variance_fns[sequence_estimator]( + gamma_cap, min_max_lag_time=min_max_lag_time + ) + var_cor = gamma_cap.sum() * 2 - var + + # Make sure that the variance is not negative. + var_cor = max(var_cor, var) + + # Get the maximum lag time. + max_lag_time_used = gamma_cap.shape[0] * 2 - 1 + + return var_cor, max_lag_time_used, autocov_valid + + +def get_variance_series_initial_sequence( + data: _np.ndarray, + sequence_estimator: str = "initial_convex", + min_max_lag_time: int = 3, + max_max_lag_time: _Optional[int] = None, + smooth_lag_times: bool = False, + frac_padding: float = 0.1, +) -> _Tuple[_np.ndarray, _np.ndarray]: + """ + Repeatedly calculate the variance of a time series while discarding increasing + numbers of samples from the start of the time series. The variance is calculated + using initial sequence methods. See Geyer, 1992: https://www.jstor.org/stable/2246094. + + Parameters + ---------- + data : numpy.ndarray + A time series of data with shape (n_samples,). + + sequence_estimator : str, optional + The initial sequence estimator to use. Can be "positive", "initial_positive", + "initial_monotone", or "initial_convex". The default is "initial_convex". "positive" + corresponds to truncating the auto-covariance function at the first negative value, as is + done in pymbar. The other methods correspond to the methods described in Geyer, 1992: + https://www.jstor.org/stable/2246094. + + min_max_lag_time : int, optional, default=3 + The minimum maximum lag time to use when estimating the statistical inefficiency. + + max_max_lag_time : int, optional, default=None + The maximum maximum lag time to use when calculating the auto-correlation function. + If None, the maximum lag time will be the length of the time series. + + smooth_lag_times : bool, optional, default=False + Whether to smooth out the max lag times by a) converting them to a monotinically + decreasing sequence and b) linearly interpolating between points where the sequence + changes. This may be useful when the max lag times are noisy. + + frac_padding : float, optional, default=0.1 + The fraction of the end of the timeseries to avoid calculating the variance + for. For example, if frac_padding = 0.1, the variance will be calculated + for the first 90% of the time series. This helps to avoid noise in the + variance when there are few data points. + + Returns + ------- + numpy.ndarray + The variance of the time series as a function of the number of discarded samples. + + numpy.ndarray + The maximum lag time used when calculating the auto-correlated variance. + """ + # Check that the data is valid. + data = _check_data(data, one_dim_allowed=True) + n_samples = data.shape[1] + + # Check that percent_padding is valid. + if frac_padding < 0 or frac_padding >= 1: + raise InvalidInputError("Percent padding must be >= 0 and < 1.") + + if frac_padding > 0.5: + _warn( + "Percent padding is greater than 0.5. You are evaluating less than half of the data." + ) + + # Calculate the maximum index to use when discarding samples. + max_index = n_samples - 1 - min_max_lag_time if min_max_lag_time else n_samples - 2 + + # Needs to be a max of n_samples - 2 to allow the gamma function to be calculated. + max_index = min(max_index, n_samples - 2) + + # See if we need to truncate the max index even further based on the percent padding. + if frac_padding > 0: + frac_padding_max_index = round(n_samples * (1 - frac_padding)) + max_index = min(max_index, frac_padding_max_index) + + # Calculate the variance at each index, and also the maximum lag time used + # and the autocovariance function. + variance_series = _np.zeros(max_index + 1) + max_lag_times_used = _np.zeros(max_index + 1, dtype=int) + # A list of autocovariance functions. + autocov_series = [] + + for index in range(max_index + 1): + variance, max_lag_time_used, autocov = get_variance_initial_sequence( + data[:, index:], + sequence_estimator=sequence_estimator, + min_max_lag_time=min_max_lag_time, + max_max_lag_time=max_max_lag_time, + ) + variance_series[index] = variance + max_lag_times_used[index] = max_lag_time_used + autocov_series.append(autocov) + + # If we are smoothing the lags, set the max lag time to the + # maximum lag time used for the current index. This saves + # some time computing the full autocovariance function. + if smooth_lag_times: + max_max_lag_time = max_lag_time_used + # If it's the same length as the time series, subtract 1 + # so that it works on the next iteration. + if max_max_lag_time == n_samples - index - 1: + max_max_lag_time -= 1 + + if smooth_lag_times: + # Get the smoothened max lag times. + max_lag_times_to_use_smooth = _smoothen_max_lag_times(max_lag_times_used) + + # Truncate the autocovariance functions at the smoothened max lag times. + autocov_series = [ + autocov[: max_lag_times_to_use_smooth[index] + 1] + for index, autocov in enumerate(autocov_series) + ] + + # Recalculate the variance series. + variance_series = _np.zeros(max_index + 1) + max_lag_times_used = _np.zeros(max_index + 1, dtype=int) + + for index in range(max_index + 1): + variance, max_lag_time_used, _ = get_variance_initial_sequence( + data[:, index:], + sequence_estimator=sequence_estimator, + min_max_lag_time=min_max_lag_time, + max_max_lag_time=max_lag_times_to_use_smooth[index], + autocov=autocov_series[index], + ) + variance_series[index] = variance + max_lag_times_used[index] = max_lag_time_used + + return variance_series, max_lag_times_used + + +def get_variance_window( + data: _np.ndarray, + kernel: _Callable[[int], _np.ndarray] = _np.bartlett, # type: ignore + window_size: int = 10, +) -> float: + """ + Calculate the variance of a time series using window estimators. + + Parameters + ---------- + data : numpy.ndarray + A time series of data with shape (n_samples,). + + kernel : callable, optional, default=numpy.bartlett + A function that takes a window size and returns a window function. + + window_size : int, optional, default=10 + The size of the window to use, defined in terms of time lags in the + forwards direction. + + Returns + ------- + float + The estimated variance of the time series. + """ + # Check that the data is valid. + data = _check_data(data, one_dim_allowed=True) + n_samples = data.shape[1] + + # Check that the window estimator is valid. + if not callable(kernel): + raise InvalidInputError("Window estimator must be a callable function.") + + # Check that the window size is valid. + if window_size < 1: + raise InvalidInputError("Window size must be greater than or equal to 1.") + + if window_size > n_samples - 1: + raise InvalidInputError( + "Window size must be less than or equal to the number of samples minus 1." + ) + + # Get the uncorrected variance estimate. + var = data.var() + if var == 0: + raise AnalysisError( + "Variance of data is zero. Cannot compute statistical inefficiency. " + "Check that you have input the correct data." + ) + + # Get the windowed autocovariance. + autocov = _get_autocovariance_window(data, kernel, window_size) + + # Account for correlation in the forwards and backwards directions. + corr_var = autocov.sum() * 2 - var + + # Make sure that the variance is not less than the uncorrelated value. + return max(corr_var, var) + + +def get_variance_series_window( + data: _np.ndarray, + kernel: _Callable[[int], _np.ndarray] = _np.bartlett, # type: ignore + window_size_fn: _Optional[_Callable[[int], int]] = lambda x: round(x**1 / 2), + window_size: _Optional[int] = None, + frac_padding: float = 0.1, +) -> _Tuple[_np.ndarray, _np.ndarray]: + """ + Repeatedly calculate the variance of a time series while discarding increasing + numbers of samples from the start of the time series. The variance is calculated + using window estimators. + + Parameters + ---------- + data : numpy.ndarray + A time series of data with shape (n_samples,). + + kernel : callable, optional, default=numpy.bartlett + A function that takes a window size and returns a window function. + + window_size_fn : callable, optional, default=lambda x: round(x**1 / 2) + A function that takes the length of the time series and returns the window size + to use. If this is not None, window_size must be None. + + window_size : int, optional, default=None + The size of the window to use, defined in terms of time lags in the + forwards direction. If this is not None, window_size_fn must be None. + + frac_padding : float, optional, default=0.1 + The fraction of the end of the timeseries to avoid calculating the variance + for. For example, if frac_padding = 0.1, the variance will be calculated + for the first 90% of the time series. This helps to avoid noise in the + variance when there are few data points. + + Returns + ------- + numpy.ndarray + The variance of the time series as a function of the number of discarded samples. + + numpy.ndarray + The window size used at each index. + """ + # Check that the data is valid. + data = _check_data(data, one_dim_allowed=True) + n_samples = data.shape[1] + + # Check that only one of window_size_fn and window_size is not None. + if window_size_fn is not None and window_size is not None: + raise InvalidInputError( + "Only one of window_size_fn and window_size can be not None." + ) + + if window_size_fn is None and window_size is None: + raise InvalidInputError( + "One of window_size_fn and window_size must be not None." + ) + + if window_size_fn is not None: + # Check that the window size function is valid. + if not callable(window_size_fn): + raise InvalidInputError("Window size function must be a callable function.") + + # Check that frac_padding is valid. + if frac_padding < 0 or frac_padding >= 1: + raise InvalidInputError("Percent padding must be >= 0 and < 1.") + + if frac_padding > 0.5: + _warn( + "Percent padding is greater than 0.5. You are evaluating less than half of the data." + ) + + # Calculate the maximum index to use when discarding samples. + max_index = ( + n_samples - 1 - window_size if window_size is not None else n_samples - 2 + ) + + # See if we need to truncate the max index even further based on the percent padding. + if frac_padding > 0: + frac_padding_max_index = round(n_samples * (1 - frac_padding)) + max_index = min(max_index, frac_padding_max_index) + + # Calculate the variance at each index and store the window size used. + variance_series = _np.zeros(max_index + 1) + window_size_series = _np.zeros(max_index + 1, dtype=int) + + for index in range(max_index + 1): + window_size = ( + window_size_fn(n_samples - index) if window_size_fn else window_size + ) + variance_series[index] = get_variance_window( + data[:, index:], kernel=kernel, window_size=window_size # type: ignore + ) + window_size_series[index] = window_size + + return variance_series, window_size_series + + +def replicated_batch_means_variance(data: _np.ndarray, batch_size: int) -> float: """ Estimate the variance of a time series using the replicated batch means method. See section 3.1 in Statist. Sci. 36(4): 518-529 (November 2021). @@ -25,9 +839,7 @@ def replicated_batch_means_variance(data: np.ndarray, batch_size: int) -> float: float The estimated variance. """ - data = check_data(data, one_dim_allowed=True) - - # If array is + data = _check_data(data, one_dim_allowed=True) # Check that batch_size is valid. n_chains, n_samples = data.shape @@ -40,13 +852,13 @@ def replicated_batch_means_variance(data: np.ndarray, batch_size: int) -> float: n_batches = n_samples // batch_size # Compute the mean of each batch. - batch_means = np.mean( + batch_means = _np.mean( data[:, : n_batches * batch_size].reshape(n_chains, n_batches, batch_size), axis=2, ) # Compute the variance of the batch means. - batch_means_variance = np.var(batch_means, ddof=1) + batch_means_variance = _np.var(batch_means, ddof=1) # Multiply by the batch size. batch_means_variance *= batch_size @@ -54,7 +866,7 @@ def replicated_batch_means_variance(data: np.ndarray, batch_size: int) -> float: return batch_means_variance -def lugsail_variance(data: np.ndarray, n_pow: float = 1 / 3) -> float: +def lugsail_variance(data: _np.ndarray, n_pow: float = 1 / 3) -> float: """ Estimate the variance of a time series using the lugsail method. See section 3.2 in Statist. Sci. 36(4): 518-529 (November 2021). @@ -75,7 +887,7 @@ def lugsail_variance(data: np.ndarray, n_pow: float = 1 / 3) -> float: The estimated variance. """ # Check that the data is valid. - data = check_data(data, one_dim_allowed=True) + data = _check_data(data, one_dim_allowed=True) # Check that n_pow is valid. if n_pow <= 0 or n_pow > 1: @@ -85,8 +897,8 @@ def lugsail_variance(data: np.ndarray, n_pow: float = 1 / 3) -> float: # Get the two batch sizes. n_chains, n_samples = data.shape - batch_size_large = int(np.floor(n_samples**n_pow)) - batch_size_small = int(np.floor(batch_size_large / 3)) + batch_size_large = int(_np.floor(n_samples**n_pow)) # type: ignore + batch_size_small = int(_np.floor(batch_size_large / 3)) # type: ignore # Make sure that the batch sizes are valid. if batch_size_large == batch_size_small or batch_size_small < 1: @@ -104,7 +916,7 @@ def lugsail_variance(data: np.ndarray, n_pow: float = 1 / 3) -> float: return lugsail_variance -def inter_run_variance(data: np.ndarray) -> float: +def inter_run_variance(data: _np.ndarray) -> float: """ Compute the variance based on the inter-run differences between means. @@ -120,10 +932,10 @@ def inter_run_variance(data: np.ndarray) -> float: The estimated variance. """ # Check that the data is valid. - data = check_data(data, one_dim_allowed=False) + data = _check_data(data, one_dim_allowed=False) # Compute the inter-run variance. - inter_run_variance = np.var(np.mean(data, axis=1), ddof=1) + inter_run_variance = _np.var(_np.mean(data, axis=1), ddof=1) # Multiply by the number of samples per run. _, n_samples = data.shape @@ -132,7 +944,7 @@ def inter_run_variance(data: np.ndarray) -> float: return inter_run_variance -def intra_run_variance(data: np.ndarray) -> float: +def intra_run_variance(data: _np.ndarray) -> float: """ Compute the average intra-run variance estimate. @@ -147,12 +959,12 @@ def intra_run_variance(data: np.ndarray) -> float: The mean intra-run variance estimate. """ # Check that the data is valid. - data = check_data(data, one_dim_allowed=True) + data = _check_data(data, one_dim_allowed=True) # Compute the intra-run variance estimates. - intra_run_variance = np.var(data, axis=1, ddof=1) + intra_run_variance = _np.var(data, axis=1, ddof=1) # Compute the mean intra-run variance estimate. - mean_intra_run_variance = np.mean(intra_run_variance) + mean_intra_run_variance = _np.mean(intra_run_variance) return mean_intra_run_variance diff --git a/devtools/conda-envs/test_env.yaml b/devtools/conda-envs/test_env.yaml index a6a4302..4786858 100644 --- a/devtools/conda-envs/test_env.yaml +++ b/devtools/conda-envs/test_env.yaml @@ -11,11 +11,13 @@ dependencies: - numpy - scipy - matplotlib + - statsmodels # Testing - pytest - pytest-cov - codecov + - rpy2 # Pip-only installs #- pip: diff --git a/pyproject.toml b/pyproject.toml index 3164ddc..2668c64 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,9 @@ dependencies = [ [project.optional-dependencies] test = [ "pytest>=6.1.2", - "pytest-runner" + "pytest-runner", + "statsmodels>=0.12.0", + "rpy2", ] [tool.setuptools] From 41834b67d1eb9d7f732a6055ce803c09318e38ab Mon Sep 17 00:00:00 2001 From: finlayclark Date: Mon, 22 Jan 2024 08:21:36 +0000 Subject: [PATCH 03/15] Remove requirement for MCMC R package --- deea/tests/test_variance.py | 18 ++++-------------- devtools/conda-envs/test_env.yaml | 1 - pyproject.toml | 1 - 3 files changed, 4 insertions(+), 16 deletions(-) diff --git a/deea/tests/test_variance.py b/deea/tests/test_variance.py index ad59fbc..fbffef9 100644 --- a/deea/tests/test_variance.py +++ b/deea/tests/test_variance.py @@ -6,8 +6,6 @@ import numpy.testing as npt import pymbar import pytest -import rpy2.robjects as robjects -from rpy2.robjects.packages import importr from statsmodels.tsa.stattools import acovf from deea._exceptions import AnalysisError, InvalidInputError @@ -28,10 +26,6 @@ from . import example_times, example_timeseries, gaussian_noise, gaussian_noise_offset -# Import some stuff from MCMC -mcmc = importr("mcmc") -initseq = robjects.r["initseq"] - def test_get_autocovariance(example_timeseries): """Test the _get_autocovariance function.""" @@ -102,15 +96,11 @@ def test_variance_initial_sequence(example_timeseries): for method in methods } - # Check that the results are correct by comparing to Geyer's implementation - # in the MCMC package. - example_timeseries_list = robjects.FloatVector(example_timeseries) - mcmc_variance = initseq(example_timeseries_list) + # Geyer results obtained from R package mcmc. geyer_res = { - "initial_positive": mcmc_variance[4][0], # 27304.776049686046 - "initial_monotone": mcmc_variance[5][0], # 24216.32288181667 - # "initial_convex": mcmc_variance[6][0], # 21898.6178149005 - "initial_convex": 21904.973713096544, + "initial_positive": 27304.776049686046, + "initial_monotone": 24216.32288181667, + "initial_convex": 21904.973713096544, # Actual MCMC result is 21898.6178149005, but very close. } # Check that the results are correct. diff --git a/devtools/conda-envs/test_env.yaml b/devtools/conda-envs/test_env.yaml index 4786858..aa12f0a 100644 --- a/devtools/conda-envs/test_env.yaml +++ b/devtools/conda-envs/test_env.yaml @@ -17,7 +17,6 @@ dependencies: - pytest - pytest-cov - codecov - - rpy2 # Pip-only installs #- pip: diff --git a/pyproject.toml b/pyproject.toml index 2668c64..a1866a8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,6 @@ test = [ "pytest>=6.1.2", "pytest-runner", "statsmodels>=0.12.0", - "rpy2", ] [tool.setuptools] From 7eabc774627aa9c27ee15b6d0de9d26be40d7fa7 Mon Sep 17 00:00:00 2001 From: finlayclark Date: Mon, 22 Jan 2024 08:33:20 +0000 Subject: [PATCH 04/15] Remove pymbar and associated functions --- deea/equilibration.py | 12 +- deea/ess.py | 453 ------------------------------- deea/tests/test_equilibration.py | 14 +- deea/tests/test_variance.py | 46 ---- 4 files changed, 11 insertions(+), 514 deletions(-) diff --git a/deea/equilibration.py b/deea/equilibration.py index 52fe1c4..8a41332 100644 --- a/deea/equilibration.py +++ b/deea/equilibration.py @@ -15,10 +15,8 @@ from ._validation import check_data from .ess import ( convert_sse_series_to_ess_series, - ess_chodera, ess_inter_variance, ess_lugsail_variance, - statistical_inefficiency_chodera, statistical_inefficiency_inter_variance, statistical_inefficiency_lugsail_variance, ) @@ -623,11 +621,11 @@ def get_ess_series( """ # Check that method is valid. method = method.lower() - if method not in ["lugsail", "inter", "chodera"]: - raise InvalidInputError("method must be 'lugsail', 'inter', or 'chodera'.") + if method not in ["lugsail", "inter"]: + raise InvalidInputError("method must be 'lugsail' or 'inter'") # Check that the data is valid. - one_dim_allowed = method in ["lugsail", "chodera"] + one_dim_allowed = method in ["lugsail"] data = check_data(data, one_dim_allowed=one_dim_allowed) n_runs, n_samples = data.shape @@ -659,8 +657,6 @@ def get_ess_series( # Compute the ESS. if method == "lugsail": ess_vals[i] = ess_lugsail_variance(truncated_data, n_pow=n_pow) - elif method == "chodera": - ess_vals[i] = ess_chodera(truncated_data) else: # method == 'inter' ess_vals[i] = ess_inter_variance(truncated_data) # Get the time. @@ -775,8 +771,6 @@ def detect_equilibration_max_ess( equil_data = data[:, equil_idx:] if method == "lugsail": equil_g = statistical_inefficiency_lugsail_variance(equil_data, n_pow=n_pow) - elif method == "chodera": - equil_g = statistical_inefficiency_chodera(equil_data) else: # method == 'inter' equil_g = statistical_inefficiency_inter_variance(equil_data) diff --git a/deea/ess.py b/deea/ess.py index b6d9165..98744f3 100644 --- a/deea/ess.py +++ b/deea/ess.py @@ -5,7 +5,6 @@ from typing import Tuple as _Tuple import numpy as _np -import pymbar from ._validation import check_data from .sse import get_sse_series_init_seq as _get_sse_series_init_seq @@ -206,29 +205,6 @@ def statistical_inefficiency_lugsail_variance( return max(g, 1) -def statistical_inefficiency_chodera(data: _np.ndarray) -> float: - """ - Compute the statistical inefficiency of a time series using the - Chodera method. This is applicable to a single run. - - Parameters - ---------- - data : np.ndarray - The time series data. This should have shape - (n_runs, n_samples) or (n_samples,). - - Returns - ------- - float - The statistical inefficiency. - """ - data = check_data(data, one_dim_allowed=True) - if data.shape[0] == 1: - return pymbar.timeseries.statistical_inefficiency(data[0], fast=False) - else: - return pymbar.timeseries.statistical_inefficiency(data.mean(axis=0), fast=False) - - def ess_inter_variance(data: _np.ndarray) -> float: """ Compute the effective sample size of a time series by dividing @@ -281,432 +257,3 @@ def ess_lugsail_variance(data: _np.ndarray, n_pow: float = 1 / 3) -> float: n_runs, n_samples = data.shape total_samples = n_runs * n_samples return total_samples / statistical_inefficiency_lugsail_variance(data, n_pow=n_pow) - - -def ess_chodera(data: _np.ndarray) -> float: - """ - Compute the effective sample size of a time series by dividing - the total number of samples by the statistical inefficiency, where - the statistical inefficiency is calculated using the Chodera method. - - Parameters - ---------- - data : np.ndarray - The time series data. This should have shape - (n_runs, n_samples) or (n_samples,). - - Returns - ------- - float - The effective sample size. - """ - data = check_data(data, one_dim_allowed=True) - n_runs, n_samples = data.shape - total_samples = n_runs * n_samples - # return total_samples / statistical_inefficiency_chodera(data) - if n_runs > 1: - data = data.mean(axis=0) - return total_samples / statistical_inefficiency_geyer(data, method="con") - - -################## DELETE BELOW ##################### -def statistical_inefficiency_geyer(A_n, method="con"): - """Compute the statistical inefficiency of a timeseries using the methods of Geyer. - - Parameters - ---------- - A_n : np.ndarray, float - A_n[n] is nth value of timeseries A. Length is deduced from vector. - method : str, optional, default='con' - The method to use; matches notation from `initseq` from `mcmc` R package by Geyer. - 'pos' : initial positive sequence (IPS) estimator - 'dec' : initial monotone sequence (IMS) estimator - 'con' : initial convex sequence (ICS) estimator - - Returns - ------- - g : np.ndarray, - g is the estimated statistical inefficiency (equal to 1 + 2 tau, where tau is the correlation time). - We enforce g >= 1.0. - - Notes - ----- - Implementation based on the `initseq` method from the `mcmc` R package by Geyer. - - References - ---------- - [1] Geyer, CJ. Practical Markov chain Monte Carlo. Statistical Science 7(4):473-511, 1992. - - Examples - -------- - - Compute statistical inefficiency of timeseries data with known correlation time using different methods. - - >>> from pymbar import testsystems - >>> A_n = testsystems.correlated_timeseries_example(N=100000, tau=10.0) - >>> g_IPS = statisticalInefficiency_geyer(A_n, method='pos') - >>> g_IMS = statisticalInefficiency_geyer(A_n, method='dec') - >>> g_ICS = statisticalInefficiency_geyer(A_n, method='con') - - """ - - if method not in ["pos", "dec", "con"]: - raise Exception( - "Unknown method '%s'; must be one of ['pos', 'dec', 'con']" % method - ) - - # Create numpy copies of input arguments. - A_n = _np.array(A_n) - - # Subtract off sample mean. - A_n -= A_n.mean() - - # Get the length of the timeseries. - N = A_n.size - - # Compute sample variance. - gamma_zero = (A_n**2).sum() / N - - # Compute sequential covariance pairs. - gamma_pos = list() - for i in range(int(_np.floor(N / 2))): - lag1 = 2 * i - gam1 = (A_n[0 : (N - lag1)] * A_n[lag1:N]).sum() / N - lag2 = lag1 + 1 - gam2 = (A_n[0 : (N - lag2)] * A_n[lag2:N]).sum() / N - - # Terminate if sum is no longer positive. - if (gam1 + gam2) < 0.0: - break - - # Otherwise, store the consecutive sum. - gamma_pos.append(gam1 + gam2) - - # Number of nonnegative values in array. - ngamma = len(gamma_pos) - - # Compute IPS gamma sequence. - gamma_pos = _np.array(gamma_pos) - - # Compute IMS gamma sequence. - gamma_dec = _np.array(gamma_pos) - for i in range(ngamma - 1): - if gamma_dec[i] < gamma_dec[i + 1]: - gamma_dec[i] = gamma_dec[i + 1] - - # Compute ICS gamma sequence. - gamma_con = _np.array(gamma_dec) - for i in range(ngamma - 1, 0, -1): - gamma_con[i] -= gamma_con[i - 1] - - # Pool adjacent violators (PAVA) algorithm. - puff = _np.zeros([ngamma], _np.float64) - nuff = _np.zeros([ngamma], _np.int32) - nstep = 0 - for j in range(1, ngamma): - puff[nstep] = gamma_con[j] - nuff[nstep] = 1 - nstep += 1 - while (nstep > 1) and ( - (puff[nstep - 1] / nuff[nstep - 1]) < (puff[nstep - 2] / nuff[nstep - 2]) - ): - puff[nstep - 2] += puff[nstep - 1] - nuff[nstep - 2] += nuff[nstep - 1] - nstep -= 1 - - j = 1 - for jstep in range(nstep): - muff = puff[jstep] / nuff[jstep] - for k in range(nuff[jstep]): - gamma_con[j] = gamma_con[j - 1] + muff - j += 1 - - # Compute sample variance estimates. - var_pos = (2 * gamma_pos.sum() - gamma_zero) / N - var_dec = (2 * gamma_dec.sum() - gamma_zero) / N - var_con = (2 * gamma_con.sum() - gamma_zero) / N - - # Compute statistical inefficiencies from sample mean var = var(A_n) / (N/g) - # g = var / (var(A_n)/N) - var_uncorr = gamma_zero / N - g_pos = var_pos / var_uncorr - g_dec = var_dec / var_uncorr - g_con = var_con / var_uncorr - - # DEBUG - # print "pos dec con : %12.3f %12.3f %12.3f" % (g_pos, g_dec, g_con) - - # Select appropriate g. - if method == "pos": - g = g_pos - elif method == "dec": - g = g_dec - elif method == "con": - g = g_con - - # g must be at least unity - if g < 1.0: - g = 1.0 - - # Return the computed statistical inefficiency. - return g - - -def statistical_inefficiency_geyer_indices(A_n, method="con"): - """Compute the statistical inefficiency of a timeseries using the methods of Geyer. - - Parameters - ---------- - A_n : np.ndarray, float - A_n[n] is nth value of timeseries A. Length is deduced from vector. - method : str, optional, default='con' - The method to use; matches notation from `initseq` from `mcmc` R package by Geyer. - 'pos' : initial positive sequence (IPS) estimator - 'dec' : initial monotone sequence (IMS) estimator - 'con' : initial convex sequence (ICS) estimator - - Returns - ------- - g : np.ndarray, - g is the estimated statistical inefficiency (equal to 1 + 2 tau, where tau is the correlation time). - We enforce g >= 1.0. - - Notes - ----- - Implementation based on the `initseq` method from the `mcmc` R package by Geyer. - - References - ---------- - [1] Geyer, CJ. Practical Markov chain Monte Carlo. Statistical Science 7(4):473-511, 1992. - - Examples - -------- - - Compute statistical inefficiency of timeseries data with known correlation time using different methods. - - >>> from pymbar import testsystems - >>> A_n = testsystems.correlated_timeseries_example(N=100000, tau=10.0) - >>> g_IPS = statisticalInefficiency_geyer(A_n, method='pos') - >>> g_IMS = statisticalInefficiency_geyer(A_n, method='dec') - >>> g_ICS = statisticalInefficiency_geyer(A_n, method='con') - - """ - - if method not in ["pos", "dec", "con"]: - raise Exception( - "Unknown method '%s'; must be one of ['pos', 'dec', 'con']" % method - ) - - # Create numpy copies of input arguments. - A_n = _np.array(A_n) - - # Subtract off sample mean. - A_n -= A_n.mean() - - # Get the length of the timeseries. - N = A_n.size - - # Compute sample variance. - gamma_zero = (A_n**2).sum() / N - - # Compute sequential covariance pairs. - gamma_pos = list() - for i in range(int(_np.floor(N / 2))): - lag1 = 2 * i - gam1 = (A_n[0 : (N - lag1)] * A_n[lag1:N]).sum() / N - lag2 = lag1 + 1 - gam2 = (A_n[0 : (N - lag2)] * A_n[lag2:N]).sum() / N - - # Terminate if sum is no longer positive. - if (gam1 + gam2) < 0.0: - break - - # Otherwise, store the consecutive sum. - gamma_pos.append(gam1 + gam2) - - # Number of nonnegative values in array. - ngamma = len(gamma_pos) - - # Compute IPS gamma sequence. - gamma_pos = _np.array(gamma_pos) - - # Compute IMS gamma sequence. - gamma_dec = _np.array(gamma_pos) - for i in range(ngamma - 1): - if gamma_dec[i] < gamma_dec[i + 1]: - gamma_dec[i] = gamma_dec[i + 1] - - # Compute ICS gamma sequence. - gamma_con = _np.array(gamma_dec) - for i in range(ngamma - 1, 0, -1): - gamma_con[i] -= gamma_con[i - 1] - - # Pool adjacent violators (PAVA) algorithm. - puff = _np.zeros([ngamma], _np.float64) - nuff = _np.zeros([ngamma], _np.int32) - nstep = 0 - for j in range(1, ngamma): - puff[nstep] = gamma_con[j] - nuff[nstep] = 1 - nstep += 1 - while (nstep > 1) and ( - (puff[nstep - 1] / nuff[nstep - 1]) < (puff[nstep - 2] / nuff[nstep - 2]) - ): - puff[nstep - 2] += puff[nstep - 1] - nuff[nstep - 2] += nuff[nstep - 1] - nstep -= 1 - - j = 1 - for jstep in range(nstep): - muff = puff[jstep] / nuff[jstep] - for k in range(nuff[jstep]): - gamma_con[j] = gamma_con[j - 1] + muff - j += 1 - - # Compute sample variance estimates. - var_pos = (2 * gamma_pos.sum() - gamma_zero) / N - var_dec = (2 * gamma_dec.sum() - gamma_zero) / N - var_con = (2 * gamma_con.sum() - gamma_zero) / N - - # Compute statistical inefficiencies from sample mean var = var(A_n) / (N/g) - # g = var / (var(A_n)/N) - var_uncorr = gamma_zero / N - g_pos = var_pos / var_uncorr - g_dec = var_dec / var_uncorr - g_con = var_con / var_uncorr - - # DEBUG - # print "pos dec con : %12.3f %12.3f %12.3f" % (g_pos, g_dec, g_con) - - # Select appropriate g. - if method == "pos": - g = g_pos - elif method == "dec": - g = g_dec - elif method == "con": - g = g_con - - # g must be at least unity - if g < 1.0: - g = 1.0 - - # Return the computed statistical inefficiency. - return (g,) - - -def statistical_inefficiency_multiscale(A_n, B_n=None, fast=False, mintime=3): - """ - Compute the (cross) statistical inefficiency of (two) timeseries using multiscale method from Chodera. - - Parameters - ---------- - A_n : np.ndarray, float - A_n[n] is nth value of timeseries A. Length is deduced from vector. - B_n : np.ndarray, float, optional, default=None - B_n[n] is nth value of timeseries B. Length is deduced from vector. - If supplied, the cross-correlation of timeseries A and B will be estimated instead of the - autocorrelation of timeseries A. - fast : bool, optional, default=False - f True, will use faster (but less accurate) method to estimate correlation - time, described in Ref. [1] (default: False). - mintime : int, optional, default=3 - minimum amount of correlation function to compute (default: 3) - The algorithm terminates after computing the correlation time out to mintime when the - correlation function first goes negative. Note that this time may need to be increased - if there is a strong initial negative peak in the correlation function. - - Returns - ------- - g : np.ndarray, - g is the estimated statistical inefficiency (equal to 1 + 2 tau, where tau is the correlation time). - We enforce g >= 1.0. - - Notes - ----- - The same timeseries can be used for both A_n and B_n to get the autocorrelation statistical inefficiency. - The fast method described in Ref [1] is used to compute g. - - References - ---------- - [1] J. D. Chodera, W. C. Swope, J. W. Pitera, C. Seok, and K. A. Dill. Use of the weighted - histogram analysis method for the analysis of simulated and parallel tempering simulations. - JCTC 3(1):26-41, 2007. - - Examples - -------- - - Compute statistical inefficiency of timeseries data with known correlation time. - - >>> from pymbar import testsystems - >>> A_n = testsystems.correlated_timeseries_example(N=100000, tau=5.0) - >>> g = statisticalInefficiency_multiscale(A_n, fast=True) - - """ - - # Create numpy copies of input arguments. - A_n = _np.array(A_n) - - if B_n is not None: - B_n = _np.array(B_n) - else: - B_n = _np.array(A_n) - - # Get the length of the timeseries. - N = A_n.size - - # Be sure A_n and B_n have the same dimensions. - if A_n.shape != B_n.shape: - raise Exception("A_n and B_n must have same dimensions.") - - # Initialize statistical inefficiency estimate with uncorrelated value. - g = 1.0 - - # Compute mean of each timeseries. - mu_A = A_n.mean() - mu_B = B_n.mean() - - # Make temporary copies of fluctuation from mean. - dA_n = A_n.astype(_np.float64) - mu_A - dB_n = B_n.astype(_np.float64) - mu_B - - # Compute estimator of covariance of (A,B) using estimator that will ensure C(0) = 1. - sigma2_AB = (dA_n * dB_n).mean() # standard estimator to ensure C(0) = 1 - - # Trap the case where this covariance is zero, and we cannot proceed. - if sigma2_AB == 0: - raise ParameterError( - "Sample covariance sigma_AB^2 = 0 -- cannot compute statistical inefficiency" - ) - - # Accumulate the integrated correlation time by computing the normalized correlation time at - # increasing values of t. Stop accumulating if the correlation function goes negative, since - # this is unlikely to occur unless the correlation function has decayed to the point where it - # is dominated by noise and indistinguishable from zero. - t = 1 - increment = 1 - while t < N - 1: - # compute normalized fluctuation correlation function at time t - C = _np.sum(dA_n[0 : (N - t)] * dB_n[t:N] + dB_n[0 : (N - t)] * dA_n[t:N]) / ( - 2.0 * float(N - t) * sigma2_AB - ) - # Terminate if the correlation function has crossed zero and we've computed the correlation - # function at least out to 'mintime'. - if (C <= 0.0) and (t > mintime): - break - - # Accumulate contribution to the statistical inefficiency. - g += 2.0 * C * (1.0 - float(t) / float(N)) * float(increment) - - # Increment t and the amount by which we increment t. - t += increment - - # Increase the interval if "fast mode" is on. - if fast: - increment += 1 - - # g must be at least unity - if g < 1.0: - g = 1.0 - - # Return the computed statistical inefficiency. - return g, t diff --git a/deea/tests/test_equilibration.py b/deea/tests/test_equilibration.py index 4ee51d3..197df38 100644 --- a/deea/tests/test_equilibration.py +++ b/deea/tests/test_equilibration.py @@ -3,7 +3,6 @@ from pathlib import Path import numpy as np -import pymbar import pytest from deea.equilibration import ( @@ -142,11 +141,14 @@ def test_compare_pymbar(example_timeseries): methods are used.""" example_timeseries = example_timeseries.mean(axis=0) - ( - equil_idx_chod, - equil_g_chod, - equil_ess_chod, - ) = pymbar.timeseries.detect_equilibration(example_timeseries, fast=False, nskip=1) + # Results below were obtained with: + # ( equil_idx_chod, + # equil_g_chod, + # equil_ess_chod, + # ) = pymbar.timeseries.detect_equilibration(example_timeseries, fast=False, nskip=1) + equil_idx_chod = 877 + equil_g_chod = 4.111825 + # equil_ess_chod = 425.35858 equil_idx, equil_g, equil_ess = detect_equilibration_init_seq( example_timeseries, method="max_ess", sequence_estimator="positive" ) diff --git a/deea/tests/test_variance.py b/deea/tests/test_variance.py index fbffef9..4228e14 100644 --- a/deea/tests/test_variance.py +++ b/deea/tests/test_variance.py @@ -4,7 +4,6 @@ import numpy as np import numpy.testing as npt -import pymbar import pytest from statsmodels.tsa.stattools import acovf @@ -116,28 +115,6 @@ def test_variance_initial_sequence(example_timeseries): ) -def test_variance_initial_sequence_chodera(example_timeseries): - """ - Check that the variance estimated by the initial sequence estimator is - with the 'positive' method is the same as the variance estimated by - the Chodera method. - """ - # Take the mean of the timeseries. - example_timeseries = example_timeseries.mean(axis=0) - - # Get the variance estimate with the initial sequence estimator. - variance_initial_sequence = get_variance_initial_sequence( - example_timeseries, sequence_estimator="positive" - )[0] - - # Get the variance estimate with the Chodera method. - g = pymbar.timeseries.statistical_inefficiency(example_timeseries) - variance_chodera = g * example_timeseries.var() - - # Check that the results are the same. - assert variance_initial_sequence == pytest.approx(variance_chodera, abs=0.01) - - def test_variance_initial_sequence_multiple_runs(example_timeseries): """Regression test for the get_variance_initial_sequence function with multiple runs.""" @@ -237,29 +214,6 @@ def test_get_variance_series_initial_sequence_raises(example_timeseries): ) -def test_get_variance_series_initial_sequence_chodera(example_timeseries): - """ - Check that the variance estimated by the initial sequence estimator is - with the 'positive' method is the same as the variance estimated by - the Chodera method for the whole series. - """ - # Take the mean of the timeseries, but use a slimmed down version to speed - # up the test. - example_timeseries = example_timeseries.mean(axis=0) - - # Get the variance estimate with the initial sequence estimator. - var_seq = get_variance_series_initial_sequence( - example_timeseries, sequence_estimator="positive", min_max_lag_time=3 - )[0] - - # Compare the results to the Chodera method. - chod_var_seq = np.zeros_like(var_seq) - for i in range(len(var_seq)): - g = pymbar.timeseries.statistical_inefficiency(example_timeseries[i:]) - chod_var_seq[i] = g * example_timeseries[i:].var() - assert var_seq == pytest.approx(chod_var_seq, abs=0.01) - - def test_smoothen_lags(): """Test the _smoothen_max_lag_times function.""" rough_lags = np.array([10, 11, 6, 5, 4, 4, 4, 3]) From 851656ba384434ddf33be3fa563279a4bfc05889 Mon Sep 17 00:00:00 2001 From: finlayclark Date: Mon, 22 Jan 2024 09:42:10 +0000 Subject: [PATCH 05/15] Remove redundant functions --- deea/equilibration.py | 462 +------------------------------ deea/plot.py | 137 --------- deea/sse.py | 58 ---- deea/tests/test_equilibration.py | 93 ------- deea/tests/test_plot.py | 79 ++---- deea/variance.py | 8 +- 6 files changed, 25 insertions(+), 812 deletions(-) diff --git a/deea/equilibration.py b/deea/equilibration.py index 8a41332..194e5a1 100644 --- a/deea/equilibration.py +++ b/deea/equilibration.py @@ -20,12 +20,8 @@ statistical_inefficiency_inter_variance, statistical_inefficiency_lugsail_variance, ) -from .plot import ( - plot_equilibration_max_ess, - plot_equilibration_min_sse, - plot_equilibration_paired_t_test, -) -from .sse import get_sse_series_init_seq, get_sse_series_window, sse +from .plot import plot_equilibration_min_sse, plot_equilibration_paired_t_test +from .sse import get_sse_series_init_seq, get_sse_series_window from .variance import get_variance_initial_sequence, get_variance_window ######################################## New ####################################### @@ -343,460 +339,6 @@ def detect_equilibration_window( return equil_time, equil_g, equil_ess -######################################## New ####################################### - - -def get_sse_series( - data: _np.ndarray, - times: _Optional[_np.ndarray] = None, - method: str = "lugsail", - n_pow: float = 1 / 3, - lugsail_max_discard_frac: float = 0.95, -) -> _Tuple[_np.ndarray, _np.ndarray]: - """ - Return the a series of estimates of the squared standard error (SSE) - from a (set of) timeseries. The SSE is obtained by estimating the SSE at - at each time point, discarding all samples before the time point. The index - of the time point with the minimum SSE is taken to be the point of - equilibration. - - Parameters - ---------- - data : np.ndarray - The time series data. This should have shape - (n_runs, n_samples) or (n_samples,). If the method - is 'lugsail' or 'intra', the data may have only one run, but - otherwise there must be at least two runs. - - times : np.ndarray, optional - The times at which the data was sampled. If this is - not provided, the indices of the data will be used. - - method : str, optional - The method to use to estimate the limiting variance. This can be - 'lugsail', 'intra', or 'inter'. The default is 'lugsail'. 'lugsail' - uses the lugsail replicated batch means variance estimate, 'intra' - uses the mean intra-run variance estimate, and 'inter' uses the - inter-run variance estimate. - - n_pow : float, optional - The power to use in the lugsail variance estimate. This - should be between 0 and 1. The default is 1/3. If the method - is 'intra' or 'inter', this parameter is ignored. - - lugsail_max_discard_frac : float, optional - The maximum fraction of samples to discard when using the lugsail - method. The default is 0.95. This is required to avoid errors from - the lugail method when the smaller batch size becomes too small. - """ - # Check that method is valid. - valid_methods = ["lugsail", "inter", "intra"] - method = method.lower() - if method not in valid_methods: - raise ValueError(f"method must be one of {valid_methods}, but got {method}.") - - # Check that the data is valid. - one_dim_allowed = method in ["lugsail", "intra"] - data = check_data(data, one_dim_allowed=one_dim_allowed) - n_runs, n_samples = data.shape - - # Convert times to indices if necessary. - if times is None: - times = _np.arange(n_samples) - - # Check that times match the number of samples. - if n_samples != len(times): - raise InvalidInputError( - "Times must have the same length as the number of samples." - ) - - # If using the lugsail method, limit the maximum number of samples - # that we discard to ensure that we don't get errors due to the smaller - # batch size becoming too small. - keep_frac = lugsail_max_discard_frac # if method == "lugsail" else 1.0 - final_idx = round(n_samples * keep_frac) - n_ignore = n_samples - final_idx - # Make sure that this corresponds to at least 5 samples. - if n_ignore < 5 and method == "lugsail": - raise AnalysisError("The time series is too short.") - - # Compute the limiting variance at each time point. - lim_var_vals = _np.zeros(n_samples - n_ignore) - time_vals = _np.zeros(n_samples - n_ignore) - - for i in range(n_samples - n_ignore): - # Get the truncated data. - truncated_data = data[:, i:] - # Compute the ESS. - lim_var_vals[i] = sse(truncated_data, method=method, n_pow=n_pow) - # Get the time. - time_vals[i] = times[i] - - return lim_var_vals, time_vals - - -def detect_equilibration_min_sse( - data: _np.ndarray, - times: _Optional[_np.ndarray] = None, - method: str = "lugsail", - max_min_lim_var: float = float("inf"), - n_pow: float = 1 / 3, - lugsail_max_discard_frac: float = 0.95, - plot: bool = False, - plot_name: _Union[str, Path] = "equilibration_sse.png", - time_units: str = "ns", - data_y_label: str = r"$\Delta G$ / kcal mol$^{-1}$", -) -> _Tuple[int, float, float]: - r""" - Detect the equilibration time of a time series by finding the minimum - squared standard error (SSE) of the time series. This is done by - computing the SSE at each time point, discarding all samples before - the time point. The index of the time point with the minimum SSE is taken - to be the point of equilibration. - - Parameters - ---------- - data : np.ndarray - The time series data. This should have shape - (n_runs, n_samples) or (n_samples,). If the method - is 'lugsail', the data may have only one run, but - otherwise there must be at least two runs. - - times : np.ndarray, optional - The times at which the data was sampled. If this is - not provided, the indices of the data will be used. - - method : str, optional - The method to use to estimate the limiting SSE. This can be - 'lugsail', 'intra', or 'inter'. The default is 'lugsail'. 'lugsail' - uses the lugsail replicated batch means limiting variance estimate, - 'intra' uses the limiting mean intra-run limiting variance estimate, - and 'inter' uses the inter-run variance estimate. - - max_min_lim_var : float, optional - The maximum minimum SSE to accept. If the minimum SSE is greater - than this value, an EquilibrationNotDetectedError is raised. The - default is inf. Units are that of the data. - - n_pow : float, optional - The power to use in the lugsail variance estimate. This - should be between 0 and 1. The default is 1/3. - - lugsail_max_discard_frac : float, optional - The maximum fraction of samples to discard when using the lugsail - method. The default is 0.95. This is required to avoid errors from - the lugail method when the smaller batch size becomes too small. - - plot : bool, optional - Whether to plot the ESS curve. The default is False. - - plot_name : str | Path, optional - The name of the plot file. The default is 'equilibration_ess.png'. - - time_units : str, optional - The units of time. The default is "ns". - - data_y_label : str, optional - The y-axis label for the time series data. The default is - "$\Delta G$ / kcal mol$^{-1}$". - - Returns - ------- - int - The time point at which the time series is equilibrated. - - float - The effective sample size at the equilibration point. - - float - The SSE estimate at the equilibration point. - """ - # Check that data is valid. - data = check_data(data, one_dim_allowed=True) - n_runs, n_samples = data.shape - - # Check that max_lim_var is valid. - if max_min_lim_var <= 0: - raise InvalidInputError("max_min_lim_var must be positive.") - - # If times is None, units of time are indices. - if times is None: - time_units = "index" - # Convert times to indices. - times_valid: _np.ndarray = _np.arange(n_samples) - else: - # To satisfy type checking. - times_valid = times - - # Get the SSE timeseries - sse_vals, sse_times = get_sse_series( - data=data, - times=times_valid, - method=method, - n_pow=n_pow, - lugsail_max_discard_frac=0.95, - ) - - # Get the index of the minimum SSE. - equil_idx = _np.argmin(sse_vals) - equil_time = sse_times[equil_idx] - equil_sse = sse_vals[equil_idx] - - # Check that the SSE is less than the maximum SSE. - if equil_sse > max_min_lim_var: - raise EquilibrationNotDetectedError( - f"The minimum SSE of {equil_sse} is greater than the maximum SSE of {max_min_lim_var}." - ) - - # Now compute the limiting variance estimate at this time point. - equil_data = data[:, equil_idx:] - equil_sse = sse(equil_data, method=method, n_pow=n_pow) - - if plot: - # Create a figure. - fig = plt.figure(figsize=(6, 4)) - gridspec_obj = gridspec.GridSpec(1, 1, figure=fig) - - # Plot the ESS. - plot_equilibration_min_sse( - fig=fig, - subplot_spec=gridspec_obj[0], - data=data, - sse_series=sse_vals, - data_times=times_valid, - sse_times=sse_times, - time_units=time_units, - data_y_label=data_y_label, - ) - - fig.savefig(str(plot_name), dpi=300, bbox_inches="tight") - - # Return the equilibration index, limiting variance estimate, and SSE. - return equil_time, equil_sse, n_eff - - -def get_ess_series( - data: _np.ndarray, - times: _Optional[_np.ndarray] = None, - method: str = "lugsail", - n_pow: float = 1 / 3, -) -> _Tuple[_np.ndarray, _np.ndarray]: - """ - Return a list of effective sample sizes from a (set of) timeseries. - The ESS values are obtained by computing the ESS at each time point, - discarding all samples before the time point. The index of the time - point with the maximum ESS is taken to be the point of equilibration. - - Parameters - ---------- - data : np.ndarray - The time series data. This should have shape - (n_runs, n_samples) or (n_samples,). If the method - is 'lugsail', the data may have only one run, but - otherwise there must be at least two runs. - - times : np.ndarray, optional - The times at which the data was sampled. If this is - not provided, the indices of the data will be used. - - method : str, optional - The method to use to compute the ESS. This can be either - 'lugsail' or 'inter'. The default is 'lugsail'. 'lugsail' - uses the lugsail replicated batch means variance estimate - and 'inter' uses the inter-run variance estimate to compute - the ESS. - - n_pow : float, optional - The power to use in the lugsail variance estimate. This - should be between 0 and 1. The default is 1/3. If the method - is 'inter', this parameter is ignored. - - Returns - ------- - np.ndarray - The ESS values. - - np.ndarray - The times at which the ESS values were calculated. - """ - # Check that method is valid. - method = method.lower() - if method not in ["lugsail", "inter"]: - raise InvalidInputError("method must be 'lugsail' or 'inter'") - - # Check that the data is valid. - one_dim_allowed = method in ["lugsail"] - data = check_data(data, one_dim_allowed=one_dim_allowed) - n_runs, n_samples = data.shape - - # Convert times to indices if necessary. - if times is None: - times = _np.arange(n_samples) - - # Check that times is match the number of samples. - if n_samples != len(times): - raise InvalidInputError( - "Times must have the same length as the number of samples." - ) - - # Exclude the last 5 % of data from the ESS calculation - # to ensure that we don't throw away too much data. - final_idx = round(n_samples * 0.95) - n_ignore = n_samples - final_idx - # Make sure that this corresponds to at least 5 samples. - if n_ignore < 5: - raise AnalysisError("The time series is too short.") - - # Compute the ESS at each time point. - ess_vals = _np.zeros(n_samples - n_ignore) - time_vals = _np.zeros(n_samples - n_ignore) - - for i in range(n_samples - n_ignore): - # Get the truncated data. - truncated_data = data[:, i:] - # Compute the ESS. - if method == "lugsail": - ess_vals[i] = ess_lugsail_variance(truncated_data, n_pow=n_pow) - else: # method == 'inter' - ess_vals[i] = ess_inter_variance(truncated_data) - # Get the time. - time_vals[i] = times[i] - - return ess_vals, time_vals - - -def detect_equilibration_max_ess( - data: _np.ndarray, - times: _Optional[_np.ndarray] = None, - method: str = "lugsail", - min_ess: int = 1, - n_pow: float = 1 / 3, - plot: bool = False, - plot_name: _Union[str, Path] = "equilibration_ess.png", - time_units: str = "ns", - data_y_label: str = r"$\Delta G$ / kcal mol$^{-1}$", -) -> _Tuple[int, float, float]: - r""" - Detect the equilibration time of a time series by finding the maximum - effective sample size (ESS) of the time series. This is done by - computing the ESS at each time point, discarding all samples before - the time point. The index of the time point with the maximum ESS is taken - to be the point of equilibration. - - Parameters - ---------- - data : np.ndarray - The time series data. This should have shape - (n_runs, n_samples) or (n_samples,). If the method - is 'lugsail', the data may have only one run, but - otherwise there must be at least two runs. - - times : np.ndarray, optional - The times at which the data was sampled. If this is - not provided, the indices of the data will be used. - - method : str, optional - The method to use to compute the ESS. This can be either - 'lugsail' or 'inter'. The default is 'lugsail'. 'lugsail' - uses the lugsail replicated batch means variance estimate - and 'inter' uses the inter-run variance estimate to compute - the ESS. - - min_ess : int, optional, default=0 - The minimum ESS to accept. If the maximum ESS is less than - this value, an EquilibrationNotDetectedError is raised. - - n_pow : float, optional - The power to use in the lugsail variance estimate. This - should be between 0 and 1. The default is 1/3. If the method - is 'inter', this parameter is ignored. - - plot : bool, optional - Whether to plot the ESS curve. The default is False. - - plot_name : str | Path, optional - The name of the plot file. The default is 'equilibration_ess.png'. - - time_units : str, optional - The units of time. The default is "ns". - - data_y_label : str, optional - The y-axis label for the time series data. The default is - "$\Delta G$ / kcal mol$^{-1}$". - - Returns - ------- - int - The time point at which the time series is equilibrated. - - float - The statistical inefficiency at the equilibration point. - - float - The effective sample size at the equilibration point. - """ - # Check that data is valid. - data = check_data(data, one_dim_allowed=True) - n_runs, n_samples = data.shape - - # Check that min_ess is valid. - if min_ess <= 0 or min_ess > n_samples * n_runs: - raise InvalidInputError( - "min_ess must be between 0 and the total number of samples." - ) - - # If times is None, units of time are indices. - if times is None: - time_units = "index" - # Convert times to indices. - times = _np.arange(n_samples) - - # Get the ESS timeseries - ess_vals, ess_times = get_ess_series( - data=data, times=times, method=method, n_pow=n_pow - ) - - # Get the index of the maximum ESS. - equil_idx = _np.argmax(ess_vals) - equil_time = ess_times[equil_idx] - equil_ess = ess_vals[equil_idx] - - # Check that the ESS is greater than the minimum ESS. - if equil_ess < min_ess: - raise EquilibrationNotDetectedError( - f"The maximum ESS of {equil_ess} is less than the minimum ESS of {min_ess}." - ) - - # Now compute the statistical inefficiency at this time point. - equil_data = data[:, equil_idx:] - if method == "lugsail": - equil_g = statistical_inefficiency_lugsail_variance(equil_data, n_pow=n_pow) - else: # method == 'inter' - equil_g = statistical_inefficiency_inter_variance(equil_data) - - if plot: - # Create a figure. - fig = plt.figure(figsize=(6, 4)) - gridspec_obj = gridspec.GridSpec(1, 1, figure=fig) - - # Plot the ESS. - plot_equilibration_max_ess( - fig=fig, - subplot_spec=gridspec_obj[0], - data=data, - ess_series=ess_vals, - data_times=times, - ess_times=ess_times, - time_units=time_units, - data_y_label=data_y_label, - ) - - fig.savefig(str(plot_name), dpi=300, bbox_inches="tight") - - # Return the equilibration index, statistical inefficiency, and ESS. - return equil_time, equil_g, equil_ess - - def get_paired_t_p_timeseries( data: _np.ndarray, times: _Optional[_np.ndarray] = None, diff --git a/deea/plot.py b/deea/plot.py index 5e75921..8b218b6 100644 --- a/deea/plot.py +++ b/deea/plot.py @@ -216,57 +216,6 @@ def plot_p_values( ax.set_ylabel("$p$-value") -def plot_ess( - ax: Axes, ess: np.ndarray, times: np.ndarray, time_units: str = "ns" -) -> None: - """ - Plot the ESS estimate against time. - - Parameters - ---------- - ax : Axes - The axes to plot on. - - ess : np.ndarray - The ESS estimate. - - times : np.ndarray - The times at which the data was sampled. - - time_units : str, optional - The units of time. The default is "ns". - """ - # Check that ess is valid. - if not isinstance(ess, np.ndarray) or not isinstance(times, np.ndarray): - raise InvalidInputError("ess and times must be numpy arrays.") - - if ess.ndim != 1: - raise InvalidInputError("ess must be one dimensional.") - - if ess.shape[0] != times.shape[0]: - raise InvalidInputError( - "ess must have the same length as the number of samples." - ) - - # Plot the ESS. - ax.plot(times, ess, color="black") - - # Plot a vertical dashed line at the maximum ESS. - max_ess = ess.max() - ax.axvline( - x=times[ess.argmax()], - color="black", - linestyle="--", - label=f"Equilibration Time = {times[ess.argmax()]:.3g} {time_units}", - ) - - ax.legend() - - # Set the axis labels. - ax.set_xlabel(f"Time / {time_units}") - ax.set_ylabel("ESS") - - def plot_sse( ax: Axes, sse: np.ndarray, @@ -466,92 +415,6 @@ def plot_equilibration_paired_t_test( return ax_top, ax_bottom -def plot_equilibration_max_ess( - fig: figure.Figure, - subplot_spec: gridspec.SubplotSpec, - data: np.ndarray, - ess_series: np.ndarray, - data_times: np.ndarray, - ess_times: np.ndarray, - time_units: str = "ns", - data_y_label: str = r"$\Delta G$ / kcal mol$^{-1}$", -) -> _Tuple[Axes, Axes]: - r""" - Plot the effective sample size (ESS) estimates against time, - underneath the time series data. - - Parameters - ---------- - fig : plt.Figure - The figure to plot on. - - gridspec_obj : plt.GridSpec - The gridspec to use for the plot. - - data : np.ndarray - The time series data. This should have shape - (n_runs, n_samples), or (n_samples,) if there - is only one run. - - ess_series : np.ndarray - The ESS series. - - data_times : np.ndarray - The times at which the data was sampled. - - ess_times : np.ndarray - The times at which the ESS was computed. - - time_units : str, optional - The units of time. The default is "ns". - - data_y_label : str, optional - The y-axis label for the time series data. The default is - "$\Delta G$ / kcal mol$^{-1}$". - - Returns - ------- - ax_top : Axes - The axes for the time series data. - - ax_bottom : Axes - The axes for the p-values. - """ - # We need to split the gridspec into two subplots, one for the time series data (above) - # and one for the p-values (below). Share x-axis but not y-axis. - gs0 = gridspec.GridSpecFromSubplotSpec(2, 1, subplot_spec=subplot_spec, hspace=0.05) - ax_top = fig.add_subplot(gs0[0]) - ax_bottom = fig.add_subplot(gs0[1], sharex=ax_top) - - # Plot the time series data on the top axis. - plot_timeseries(ax_top, data, data_times) - # Add dashed vertical line at the equilibration time. - ax_top.axvline( - x=data_times[ess_series.argmax()], - color="black", - linestyle="--", - ) - - # Plot the ess on the bottom axis. - plot_ess(ax_bottom, ess_series, ess_times) - - # Set the axis labels. - ax_top.set_xlabel(f"Time / {time_units}") - ax_top.set_ylabel(data_y_label) - - # Move the legends to the side of the plot. - ax_top.legend(bbox_to_anchor=(1.05, 0.5), loc="center left") - ax_bottom.legend(bbox_to_anchor=(1.05, 0.5), loc="center left") - - # Hide the x tick labels for the top axis. - plt.setp(ax_top.get_xticklabels(), visible=False) - ax_top.spines["bottom"].set_visible(False) - ax_top.tick_params(axis="x", which="both", length=0) - ax_top.set_xlabel("") - - return ax_top, ax_bottom - - def plot_equilibration_min_sse( fig: figure.Figure, subplot_spec: gridspec.SubplotSpec, diff --git a/deea/sse.py b/deea/sse.py index b9164e6..56aaae8 100644 --- a/deea/sse.py +++ b/deea/sse.py @@ -139,61 +139,3 @@ def get_sse_series_window( sse_series = var_series / tot_samples return sse_series, window_sizes - - -def sse(data: _np.ndarray, method: str = "lugsail", n_pow: float = 1 / 3) -> float: - """ - Compute the SSE of a time series by dividing the lugsail - replicated batch means variance estimate by the total number - of samples. This is applicable to a single run. - - Parameters - ---------- - data : np.ndarray - The time series data. This should have shape - (n_runs, n_samples) and must have at least two - runs. - - method : str, optional - The method to use to compute the variance estimate. - This can be 'lugsail', 'inter', or 'intra'. The default - is 'lugsail'. 'lugsail' estimates the SSE using the - lugsail replicated batch means limiting variance estimate, - 'inter' uses the inter-run limiting variance estimate, and - 'intra' uses the intra-run limiting variance estimate. One - dimensional data is only allowed when method is 'lugsail' - or 'intra'. - - n_pow : float, optional - The power to use in the lugsail variance estimate. This - should be between 0 and 1. The default is 1/3. - - Returns - ------- - float - The SSE. - """ - # Check that the method is valid. - valid_methods = ["lugsail", "inter", "intra"] - method = method.lower() - if method not in valid_methods: - raise ValueError(f"method must be one of {valid_methods}, but got {method}.") - - # Validate the data. - one_dim_allowed = method in ["lugsail", "intra"] - data = check_data(data, one_dim_allowed=one_dim_allowed) - n_runs, n_samples = data.shape - total_samples = n_runs * n_samples - - # Compute the variance estimates. - if method == "lugsail": - lim_var_est = lugsail_variance(data, n_pow=n_pow) - elif method == "inter": - lim_var_est = inter_run_variance(data) - else: # method == "intra" - lim_var_est = intra_run_variance(data) - - # Compute the SSE. - sse = lim_var_est / total_samples - - return sse diff --git a/deea/tests/test_equilibration.py b/deea/tests/test_equilibration.py index 197df38..6a15c1c 100644 --- a/deea/tests/test_equilibration.py +++ b/deea/tests/test_equilibration.py @@ -7,10 +7,8 @@ from deea.equilibration import ( detect_equilibration_init_seq, - detect_equilibration_max_ess, detect_equilibration_paired_t_test, detect_equilibration_window, - get_ess_series, get_paired_t_p_timeseries, ) @@ -156,76 +154,6 @@ def test_compare_pymbar(example_timeseries): assert equil_g == pytest.approx(equil_g_chod, abs=1e-4) -def test_detect_equilibration_max_ess_variance(gaussian_noise, example_timeseries): - """ - Test equilibration detection based on maximum effective sample size estimated - from the ratio of the inter-run to intra-run variance. - """ - # Compute the equilibration index. - idx, g, ess = detect_equilibration_max_ess(gaussian_noise, method="inter") - - # Check that the equilibration index is correct. - assert idx < 6000 - assert g == pytest.approx(1, abs=2) - assert ess == pytest.approx(50_000, abs=40_000) - - # Check that it fails for a single run. - with pytest.raises(InvalidInputError): - idx = detect_equilibration_max_ess(gaussian_noise[0], method="inter") - - # Check that the equilibration index is correct for the correlated timeseries. - idx, g, ess = detect_equilibration_max_ess(example_timeseries) - assert idx == 431 - assert g == pytest.approx(2.69474, abs=0.00001) - assert ess == pytest.approx(4070.8931, abs=0.0001) - - with pytest.raises(InvalidInputError): - idx, g, ess = detect_equilibration_max_ess(gaussian_noise, method="asd") - - with pytest.raises(InvalidInputError): - idx, g, ess = detect_equilibration_max_ess(gaussian_noise, min_ess=0) - - # Feed in very short timeseries and check that it fails. - with pytest.raises(AnalysisError): - idx, g, ess = detect_equilibration_max_ess(gaussian_noise[:, :10]) - - -def test_detect_equilibration_max_ess_lugsail(gaussian_noise, example_timeseries): - """ - Test equilibration detection based on maximum effective sample size estimated - from the ratio of the lugsail to intra-run variance. - """ - # Compute the equilibration index. - idx, g, ess = detect_equilibration_max_ess(gaussian_noise, method="lugsail") - - # Check that the equilibration index is correct. - assert idx < 5000 - assert g == pytest.approx(1, abs=2) - assert ess == pytest.approx(50_000, abs=40_000) - - # Check result for a single run. - idx, g, ess = detect_equilibration_max_ess(gaussian_noise[0], method="lugsail") - assert idx == pytest.approx(0, abs=3000) - assert g == pytest.approx(1, abs=2) - assert ess == pytest.approx(10_000, abs=3000) - - # Check that the equilibration index is correct for the correlated timeseries. - idx, g, ess = detect_equilibration_max_ess(example_timeseries) - assert idx == 431 - assert g == pytest.approx(2.69474, abs=0.00001) - assert ess == pytest.approx(4070.8931, abs=0.0001) - - # Make a dummy timeseries which will cause the ESS to be small and detection to fail. - # Do this by concatentating arrays of zeros and ones so that the first half of the - # timeseries is all zeros and the second half is all ones. - dummy_timeseries = np.concatenate( - (np.zeros_like(example_timeseries), np.ones_like(example_timeseries)) - ) - dummy_timeseries = np.array(dummy_timeseries) - with pytest.raises(EquilibrationNotDetectedError): - idx, g, ess = detect_equilibration_max_ess(dummy_timeseries) - - def test_detect_equilibration_paired_t(gaussian_noise, example_timeseries): """ Test equilibration detection based on the paired t-test. @@ -296,27 +224,6 @@ def test_plots_paired_t(example_timeseries, example_times, tmpdir): assert tmp_output.with_suffix(".png").exists() -def test_plots_max_ess(example_timeseries, example_times, tmpdir): - """Check that the plots are created.""" - tmp_output = Path(tmpdir) / "test_plots_max_ess" - detect_equilibration_max_ess( - example_timeseries, example_times, plot=True, plot_name=tmp_output - ) - assert tmp_output.with_suffix(".png").exists() - - -def test_ess_series(example_timeseries, example_times): - """Tests on the ESS series generator which can't be run indirectly - through the equilibration detection functions.""" - - with pytest.raises(InvalidInputError): - ess_vals, times = get_ess_series(example_timeseries, times=example_times[:-2]) - - # Check that this works on indices if no times passed. - ess_vals, times = get_ess_series(example_timeseries) - assert times[-1] == 2493 - - def test_p_values(example_timeseries, example_times): """Tests on the p-values series generator which can't be run indirectly through the equilibration detection functions.""" diff --git a/deea/tests/test_plot.py b/deea/tests/test_plot.py index 681b3da..70e5aaa 100644 --- a/deea/tests/test_plot.py +++ b/deea/tests/test_plot.py @@ -5,15 +5,13 @@ import pytest from matplotlib import gridspec -from deea.equilibration import get_ess_series, get_paired_t_p_timeseries +from deea.equilibration import get_paired_t_p_timeseries from .._exceptions import InvalidInputError -from ..equilibration import get_sse_series, get_sse_series_init_seq +from ..equilibration import get_sse_series_init_seq from ..plot import ( - plot_equilibration_max_ess, plot_equilibration_min_sse, plot_equilibration_paired_t_test, - plot_ess, plot_p_values, plot_sse, plot_timeseries, @@ -108,56 +106,36 @@ def test_plot_sse(example_timeseries, example_times): def test_plot_equilibration_min_sse(example_timeseries, example_times): - """Test plotting the equilibration detection based on minimum SSE.""" + """Test plotting the equilibration detection based on the minimum SSE.""" + # Take mean to speed things up, but plot the original data to + # test this works. + example_timeseries_mean = example_timeseries.mean(axis=0) + # Create a figure. fig = plt.figure(figsize=(6, 4)) gridspec_obj = gridspec.GridSpec(1, 1, figure=fig) - # Compute the SSE for the example timeseries with - # the lugsail variance estimator. - sse_vals, times_used = get_sse_series( - example_timeseries, example_times, method="inter" + # Compute the SSE for the example timeseries. + sse_vals, lag_times = get_sse_series_init_seq( + data=example_timeseries_mean, smooth_lag_times=True ) + times = example_times[: len(sse_vals)] # Plot the equilibration detection. plot_equilibration_min_sse( - fig, gridspec_obj[0], example_timeseries, sse_vals, example_times, times_used + fig=fig, + subplot_spec=gridspec_obj[0], + data=example_timeseries, + sse_series=sse_vals, + max_lag_series=None, + data_times=example_times, + sse_times=times, ) if SAVE_PLOTS: fig.savefig("test_plot_equilibration_min_sse.png", bbox_inches="tight") -def test_plot_ess(example_timeseries, example_times): - """Test plotting the effective sample size.""" - # Create a figure. - fig, ax = plt.subplots() - n_runs, n_samples = example_timeseries.shape - - # Compute the ESS for the example timeseries with - # the lugsail variance estimator. - ess_vals, times = get_ess_series( - example_timeseries, times=example_times, method="lugsail" - ) - - # Plot the ESS. - plot_ess(ax, ess_vals, times) - - if SAVE_PLOTS: - fig.savefig("test_plot_ess.png") - - # Check that invalid input raises an error. - with pytest.raises(InvalidInputError): - plot_ess(ax, ess_vals, example_times[:-2]) - - with pytest.raises(InvalidInputError): - plot_ess(ax, ess_vals, list(example_timeseries)) - - with pytest.raises(InvalidInputError): - # Make ess_vals a 2D array. - plot_ess(ax, np.array([ess_vals, ess_vals]), example_timeseries) - - def test_plot_p_values(example_timeseries, example_times): """Test plotting the p-values of the paired t-test.""" # Create a figure. @@ -187,27 +165,6 @@ def test_plot_p_values(example_timeseries, example_times): plot_p_values(ax, np.array([p_values, p_values]), example_timeseries) -def test_plot_equilibration_max_ess(example_timeseries, example_times): - """Test plotting the equilibration detection based on maximum ESS.""" - # Create a figure. - fig = plt.figure(figsize=(6, 4)) - gridspec_obj = gridspec.GridSpec(1, 1, figure=fig) - - # Compute the ESS for the example timeseries with - # the lugsail variance estimator. - ess_vals, times_used = get_ess_series( - example_timeseries, example_times, method="inter" - ) - - # Plot the equilibration detection. - plot_equilibration_max_ess( - fig, gridspec_obj[0], example_timeseries, ess_vals, example_times, times_used - ) - - if SAVE_PLOTS: - fig.savefig("test_plot_equilibration_max_ess.png", bbox_inches="tight") - - def test_plot_equilibration_paired_t_test(example_timeseries, example_times): """Test plotting the equilibration detection based on the paired t-test.""" # Create a figure. diff --git a/deea/variance.py b/deea/variance.py index 7fa4224..a38e34b 100644 --- a/deea/variance.py +++ b/deea/variance.py @@ -1,12 +1,14 @@ """ -Functions to calculate the variance of a time series. +Functions to calculate the variance of a time series, accounting for autocorrelation. Methods implemented: - Initial sequence methods (see Geyer, 1992: https://www.jstor.org/stable/2246094) - Window estimators (see summary in see Geyer, 1992: https://www.jstor.org/stable/2246094) -- Overlapping batch means (see Meketon and Schmeiser, 1984: - https://repository.lib.ncsu.edu/bitstream/handle/1840.4/7707/1984_0041.pdf?sequence=1) + +Did not implement overlapping batch means (see Meketon and Schmeiser, 1984: +https://repository.lib.ncsu.edu/bitstream/handle/1840.4/7707/1984_0041.pdf?sequence=1), as this +is equivalent to using a Bartlett window. """ From 094a1c9cd3533b4b7a7e68d83853c2c601a582fa Mon Sep 17 00:00:00 2001 From: finlayclark Date: Mon, 22 Jan 2024 10:02:17 +0000 Subject: [PATCH 06/15] Update README examples --- README.md | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 9957eca..2ad4218 100644 --- a/README.md +++ b/README.md @@ -8,12 +8,13 @@ deea Detection of Equilibration by Ensemble Analysis -A lightweight Python package for detecting equilibration in timeseries data where an initial transient is followed by a stationary distribution. Currently, two equilibration detection methods are implemented: +A Python package for detecting equilibration in timeseries data where an initial transient is followed by a stationary distribution. Two main methods are implemented, which differ in the way they account for autocorrelation: -- Maximum effective sample size based on the lugsail replicated batch means method. See [here](https://projecteuclid.org/journals/statistical-science/volume-36/issue-4/Revisiting-the-GelmanRubin-Diagnostic/10.1214/20-STS812.full) and [here](https://academic.oup.com/biomet/article/109/3/735/6395353). - -- Paired t-test. This checks for significant differences between the first 10 %, and last 50 % of the data. The test is repeated while sequentially removing more initial data. The first time point at which no significant difference is found is taken as the equilibration time. +- `detect_equilibration_init_seq`: This uses the initial sequence methods of Geyer ([Geyer, 1992](https://www.jstor.org/stable/2246094)) to determine the truncation point of the sum of autocovariances. +- `detect_equilibration_window`: This uses window methods (see [Geyer](https://www.jstor.org/stable/2246094) again) when calculating the +autocorrelation. +For both, the equilibration point can be determined either according to the minimum of the squared standard error (the default), or the maximum effective sample size, by specifying `method="min_sse"` or `method="max_ess"`. ### Installation @@ -29,18 +30,23 @@ pip install -e . import deea # Load your timeseries of interest. -# This should be a 2D numpy array with shape (n_runs, n_samples) +# This should be a 2D numpy array with shape (n_runs, n_samples), +# or a 1D numpy array with shape (n_samples). my_timeseries = ... -# Detect equilibration based on the maximum effective sample size -# based on the lugsail replicated batch means method. -# idx is the index of the first sample after equilibration, g is the -# statistical inefficiency, and ess is the effective sample size. -idx, g, ess = deea.detect_equilibration_max_ess(my_timeseries, method="lugsail", plot=True) +# Detect equilibration based on the minimum squared standard error +# using Geyer's initial convex sequence method to account for +# autocorrelation. idx is the index of the first sample after +# equilibration, g is the statistical inefficiency of the equilibrated +# sample, and ess is the effective sample size of the equilibrated sample. +idx, g, ess = deea.detect_equilibration_init_seq(my_timeseries, method="min_sse", plot=True) + +# Alternatively, use the window method to account for autocorrelation. +# By default, this uses a Bartlett kernel and a window size of round(n_samples**0.5). +idx, g, ess = deea.detect_equilibration_window(my_timeseries, method="min_sse", plot=True) -# Detect equilibration using a paired t-test. -# idx is the index of the first sample after equilibration. -idx = deea.detect_equilibration_ttest(my_timeseries, plot=True) +# We can also determine equilibration in the same way as in pymbar.timeseries.detect_equilibration. +idx, g, ess = deea.detect_equilibration_init_seq(my_timeseries, method="max_ess", sequence_estimator="positive") ``` ### Copyright From 0ccbfec5daca3aad80bcd2671d99416e73060e0c Mon Sep 17 00:00:00 2001 From: finlayclark Date: Mon, 22 Jan 2024 10:46:57 +0000 Subject: [PATCH 07/15] Fix minor codacy style issues --- README.md | 4 +-- deea/equilibration.py | 16 +++--------- deea/plot.py | 6 ++--- deea/sse.py | 8 +----- deea/tests/test_equilibration.py | 44 ++++++++++++-------------------- deea/tests/test_ess.py | 4 +-- deea/tests/test_gelman_rubin.py | 2 +- deea/tests/test_plot.py | 2 +- deea/tests/test_variance.py | 3 +-- deea/variance.py | 2 +- 10 files changed, 33 insertions(+), 58 deletions(-) diff --git a/README.md b/README.md index 2ad4218..329c093 100644 --- a/README.md +++ b/README.md @@ -10,8 +10,8 @@ Detection of Equilibration by Ensemble Analysis A Python package for detecting equilibration in timeseries data where an initial transient is followed by a stationary distribution. Two main methods are implemented, which differ in the way they account for autocorrelation: -- `detect_equilibration_init_seq`: This uses the initial sequence methods of Geyer ([Geyer, 1992](https://www.jstor.org/stable/2246094)) to determine the truncation point of the sum of autocovariances. -- `detect_equilibration_window`: This uses window methods (see [Geyer](https://www.jstor.org/stable/2246094) again) when calculating the + - `detect_equilibration_init_seq`: This uses the initial sequence methods of Geyer ([Geyer, 1992](https://www.jstor.org/stable/2246094)) to determine the truncation point of the sum of autocovariances. + - `detect_equilibration_window`: This uses window methods (see [Geyer](https://www.jstor.org/stable/2246094) again) when calculating the autocorrelation. For both, the equilibration point can be determined either according to the minimum of the squared standard error (the default), or the maximum effective sample size, by specifying `method="min_sse"` or `method="max_ess"`. diff --git a/deea/equilibration.py b/deea/equilibration.py index 194e5a1..60d1e5c 100644 --- a/deea/equilibration.py +++ b/deea/equilibration.py @@ -11,18 +11,11 @@ from matplotlib import gridspec from scipy.stats import ttest_rel -from ._exceptions import AnalysisError, EquilibrationNotDetectedError, InvalidInputError +from ._exceptions import EquilibrationNotDetectedError, InvalidInputError from ._validation import check_data -from .ess import ( - convert_sse_series_to_ess_series, - ess_inter_variance, - ess_lugsail_variance, - statistical_inefficiency_inter_variance, - statistical_inefficiency_lugsail_variance, -) +from .ess import convert_sse_series_to_ess_series from .plot import plot_equilibration_min_sse, plot_equilibration_paired_t_test from .sse import get_sse_series_init_seq, get_sse_series_window -from .variance import get_variance_initial_sequence, get_variance_window ######################################## New ####################################### @@ -42,7 +35,7 @@ def detect_equilibration_init_seq( data_y_label: str = r"$\Delta G$ / kcal mol$^{-1}$", plot_max_lags: bool = True, ) -> _Tuple[_Union[float, int], float, float]: - """ + r""" Detect the equilibration time of a time series by finding the minimum squared standard error (SSE), or maximum effective sample size (ESS) of the time series, using initial sequence estimators of the variance. @@ -208,7 +201,7 @@ def detect_equilibration_window( data_y_label: str = r"$\Delta G$ / kcal mol$^{-1}$", plot_window_size: bool = True, ) -> _Tuple[_Union[float, int], float, float]: - """ + r""" Detect the equilibration time of a time series by finding the minimum squared standard error (SSE) or maximum effective sample size (ESS) of the time series, using window estimators of the variance. This is @@ -560,7 +553,6 @@ def detect_equilibration_paired_t_test( raise InvalidInputError("p_threshold must be between 0 and 0.7.") # Get the p value timeseries and n_discard. - indices = _np.arange(n_samples) p_vals, times_used = get_paired_t_p_timeseries( data=data, times=times, diff --git a/deea/plot.py b/deea/plot.py index 8b218b6..cbc10b1 100644 --- a/deea/plot.py +++ b/deea/plot.py @@ -226,7 +226,7 @@ def plot_sse( variance_y_label: str = r"$\frac{1}{\sigma^2(\Delta G)}$ / kcal$^{-2}$ mol$^2$", reciprocal: bool = True, ) -> _Tuple[_List[Line2D], _List[str]]: - """ + r""" Plot the squared standard error (SSE) estimate against time. Parameters @@ -251,7 +251,7 @@ def plot_sse( variance_y_label : str, optional The y-axis label for the variance. The default is - r"$\frac{1}{\sigma^2(\Delta G)}$ / kcal$^{-2}$ mol$^2$". + "$\frac{1}{\sigma^2(\Delta G)}$ / kcal$^{-2}$ mol$^2$". reciprocal : bool, optional, default=True Whether to plot the reciprocal of the SSE. @@ -474,7 +474,7 @@ def plot_equilibration_min_sse( variance_y_label : str, optional The y-axis label for the variance. The default is - r"$\frac{1}{\sigma^2(\Delta G)}$ / kcal$^{-2}$ mol$^2$". + "$\frac{1}{\sigma^2(\Delta G)}$ / kcal$^{-2}$ mol$^2$". reciprocal : bool, optional, default=True Whether to plot the reciprocal of the SSE. diff --git a/deea/sse.py b/deea/sse.py index 56aaae8..40e62c5 100644 --- a/deea/sse.py +++ b/deea/sse.py @@ -7,13 +7,7 @@ import numpy as _np from ._validation import check_data -from .variance import ( - get_variance_series_initial_sequence, - get_variance_series_window, - inter_run_variance, - intra_run_variance, - lugsail_variance, -) +from .variance import get_variance_series_initial_sequence, get_variance_series_window def get_sse_series_init_seq( diff --git a/deea/tests/test_equilibration.py b/deea/tests/test_equilibration.py index 6a15c1c..b00f789 100644 --- a/deea/tests/test_equilibration.py +++ b/deea/tests/test_equilibration.py @@ -12,11 +12,7 @@ get_paired_t_p_timeseries, ) -from .._exceptions import ( - AnalysisError, - EquilibrationNotDetectedError, - InvalidInputError, -) +from .._exceptions import EquilibrationNotDetectedError, InvalidInputError from . import example_times, example_timeseries, gaussian_noise, tmpdir @@ -78,7 +74,7 @@ def test_detect_equilibration_init_seq_raises(example_timeseries): Test that invalid inputs raise errors. """ with pytest.raises(InvalidInputError): - equil_idx, equil_g, equil_ess = detect_equilibration_init_seq( + detect_equilibration_init_seq( method="non_existent", data=example_timeseries, sequence_estimator="positive", @@ -104,7 +100,7 @@ def test_detect_equilibration_window(example_timeseries, example_times, tmpdir): # Try also supplying the times. Plot in a temporary directory. tmp_output = Path(tmpdir) / "test_plots_min_sse_window" # tmp_output = "./test_plots_min_sse_window" - equil_time, equil_sse, equil_ess = detect_equilibration_window( + equil_time, equil_g, equil_ess = detect_equilibration_window( data=example_timeseries, times=example_times, plot=True, @@ -164,7 +160,7 @@ def test_detect_equilibration_paired_t(gaussian_noise, example_timeseries): # Check that it fails for a single run. with pytest.raises(InvalidInputError): - idx = detect_equilibration_paired_t_test(gaussian_noise[0]) + detect_equilibration_paired_t_test(gaussian_noise[0]) # Check the index for the correlated timeseries. idx = detect_equilibration_paired_t_test(example_timeseries) @@ -172,47 +168,43 @@ def test_detect_equilibration_paired_t(gaussian_noise, example_timeseries): # Try stupid inputs and check that we get input errors. with pytest.raises(InvalidInputError): - idx = detect_equilibration_paired_t_test(gaussian_noise, p_threshold=0) + detect_equilibration_paired_t_test(gaussian_noise, p_threshold=0) with pytest.raises(InvalidInputError): - idx = detect_equilibration_paired_t_test(gaussian_noise, p_threshold=1) + detect_equilibration_paired_t_test(gaussian_noise, p_threshold=1) with pytest.raises(InvalidInputError): - idx = detect_equilibration_paired_t_test( - gaussian_noise, fractional_block_size=0 - ) + detect_equilibration_paired_t_test(gaussian_noise, fractional_block_size=0) with pytest.raises(InvalidInputError): - idx = detect_equilibration_paired_t_test( - gaussian_noise, fractional_block_size=1 - ) + detect_equilibration_paired_t_test(gaussian_noise, fractional_block_size=1) with pytest.raises(InvalidInputError): - idx = detect_equilibration_paired_t_test(gaussian_noise, t_test_sidedness="asd") + detect_equilibration_paired_t_test(gaussian_noise, t_test_sidedness="asd") with pytest.raises(InvalidInputError): - idx = detect_equilibration_paired_t_test(gaussian_noise, fractional_test_end=0) + detect_equilibration_paired_t_test(gaussian_noise, fractional_test_end=0) with pytest.raises(InvalidInputError): - idx = detect_equilibration_paired_t_test( + detect_equilibration_paired_t_test( gaussian_noise, fractional_test_end=0.3, initial_block_size=0.5 ) with pytest.raises(InvalidInputError): - idx = detect_equilibration_paired_t_test( + detect_equilibration_paired_t_test( gaussian_noise, fractional_test_end=0.3, initial_block_size=0.3 ) with pytest.raises(InvalidInputError): - idx = detect_equilibration_paired_t_test(gaussian_noise, initial_block_size=1.2) + detect_equilibration_paired_t_test(gaussian_noise, initial_block_size=1.2) with pytest.raises(InvalidInputError): - idx = detect_equilibration_paired_t_test(gaussian_noise, final_block_size=1.2) + detect_equilibration_paired_t_test(gaussian_noise, final_block_size=1.2) # Generate steeply decaying timeseries and check that it fails to equilibrate. # Generate 5 very similar exponential decays. steeply_decaying_timeseries = np.exp(-np.arange(1000)) steeply_decaying_timeseries = np.tile(steeply_decaying_timeseries, (5, 1)) with pytest.raises(EquilibrationNotDetectedError): - idx = detect_equilibration_paired_t_test(steeply_decaying_timeseries) + detect_equilibration_paired_t_test(steeply_decaying_timeseries) def test_plots_paired_t(example_timeseries, example_times, tmpdir): @@ -229,10 +221,8 @@ def test_p_values(example_timeseries, example_times): through the equilibration detection functions.""" with pytest.raises(InvalidInputError): - p_vals, times = get_paired_t_p_timeseries( - example_timeseries, times=example_times[:-2] - ) + get_paired_t_p_timeseries(example_timeseries, times=example_times[:-2]) # Check that this works on indices if no times passed. - p_vals, times = get_paired_t_p_timeseries(example_timeseries) + _, times = get_paired_t_p_timeseries(example_timeseries) assert times[-1] == 1312 diff --git a/deea/tests/test_ess.py b/deea/tests/test_ess.py index 97d8359..584230d 100644 --- a/deea/tests/test_ess.py +++ b/deea/tests/test_ess.py @@ -59,7 +59,7 @@ def test_statistical_inefficiency_inter_variance(gaussian_noise, example_timeser # Check that it fails for a single run. with pytest.raises(InvalidInputError): - si = statistical_inefficiency_inter_variance(gaussian_noise[0]) + statistical_inefficiency_inter_variance(gaussian_noise[0]) # Check that statistical inefficiency is higher for the correlated timeseries. si = statistical_inefficiency_inter_variance(example_timeseries) @@ -93,7 +93,7 @@ def test_effective_sample_size_inter_variance(gaussian_noise, example_timeseries # Check that it fails for a single run. with pytest.raises(InvalidInputError): - ess = ess_inter_variance(gaussian_noise[0]) + ess_inter_variance(gaussian_noise[0]) # Check that effective sample size is lower for the correlated timeseries. ess = ess_inter_variance(example_timeseries) diff --git a/deea/tests/test_gelman_rubin.py b/deea/tests/test_gelman_rubin.py index 880eb53..8efe027 100644 --- a/deea/tests/test_gelman_rubin.py +++ b/deea/tests/test_gelman_rubin.py @@ -17,7 +17,7 @@ def test_gelman_rubin_gaussian(gaussian_noise): assert gr == pytest.approx(1, abs=0.001) # Check that it fails for a single run. with pytest.raises(InvalidInputError): - gr = gelman_rubin(gaussian_noise[0]) + gelman_rubin(gaussian_noise[0]) def test_stable_gelman_rubin_gaussian(gaussian_noise): diff --git a/deea/tests/test_plot.py b/deea/tests/test_plot.py index 70e5aaa..75261fd 100644 --- a/deea/tests/test_plot.py +++ b/deea/tests/test_plot.py @@ -116,7 +116,7 @@ def test_plot_equilibration_min_sse(example_timeseries, example_times): gridspec_obj = gridspec.GridSpec(1, 1, figure=fig) # Compute the SSE for the example timeseries. - sse_vals, lag_times = get_sse_series_init_seq( + sse_vals, _ = get_sse_series_init_seq( data=example_timeseries_mean, smooth_lag_times=True ) times = example_times[: len(sse_vals)] diff --git a/deea/tests/test_variance.py b/deea/tests/test_variance.py index 4228e14..d5d6f6e 100644 --- a/deea/tests/test_variance.py +++ b/deea/tests/test_variance.py @@ -54,7 +54,6 @@ def test_get_autocovariance(example_timeseries): def test_gamma_cap(example_timeseries): """Test the _get_gamma_cap function.""" example_timeseries = example_timeseries.mean(axis=0) - example_timeseries_demeaned = example_timeseries - example_timeseries.mean() # Compute the autocovariance function with no truncation. autocovariance = _get_autocovariance(example_timeseries) @@ -360,7 +359,7 @@ def test_lugsail_variance(gaussian_noise): lugsail_variance(gaussian_noise, 100_000) with pytest.raises(AnalysisError): - variance = lugsail_variance(gaussian_noise, 0.1) + lugsail_variance(gaussian_noise, 0.1) def test_get_variance_series_window_raises(example_timeseries): diff --git a/deea/variance.py b/deea/variance.py index a38e34b..651dfc1 100644 --- a/deea/variance.py +++ b/deea/variance.py @@ -422,7 +422,7 @@ def get_variance_initial_sequence( "initial_monotone", "initial_convex", ] - if not sequence_estimator in implemented_sequence_estimators: + if sequence_estimator not in implemented_sequence_estimators: raise InvalidInputError( f"Sequence_estimator must be one of {implemented_sequence_estimators}." ) From 61edf05c740f901470f527c711953cd1a1c6382f Mon Sep 17 00:00:00 2001 From: finlayclark Date: Mon, 22 Jan 2024 10:58:27 +0000 Subject: [PATCH 08/15] Upload coverage report to codacy --- .github/workflows/CI.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/CI.yaml b/.github/workflows/CI.yaml index d18afa8..74fdc49 100644 --- a/.github/workflows/CI.yaml +++ b/.github/workflows/CI.yaml @@ -58,3 +58,8 @@ jobs: uses: codecov/codecov-action@v3 env: CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} + + - name: Run codacy-coverage-reporter + uses: codacy/codacy-coverage-reporter-action@v1 + with: + project-token: ${{ secrets.CODACY_PROJECT_TOKEN }} From daba3429b319116cbf4c1f46b503b5fc92313a06 Mon Sep 17 00:00:00 2001 From: finlayclark Date: Mon, 22 Jan 2024 11:01:25 +0000 Subject: [PATCH 09/15] Remove unused variable to satisfy codacy --- deea/tests/test_equilibration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deea/tests/test_equilibration.py b/deea/tests/test_equilibration.py index b00f789..651e0a0 100644 --- a/deea/tests/test_equilibration.py +++ b/deea/tests/test_equilibration.py @@ -143,7 +143,7 @@ def test_compare_pymbar(example_timeseries): equil_idx_chod = 877 equil_g_chod = 4.111825 # equil_ess_chod = 425.35858 - equil_idx, equil_g, equil_ess = detect_equilibration_init_seq( + equil_idx, equil_g, _ = detect_equilibration_init_seq( example_timeseries, method="max_ess", sequence_estimator="positive" ) assert equil_idx == equil_idx_chod From 708f684c6773c74b40ec7d8d3592ee3bcb118058 Mon Sep 17 00:00:00 2001 From: finlayclark Date: Mon, 22 Jan 2024 11:06:06 +0000 Subject: [PATCH 10/15] Remove paired t-test method --- deea/equilibration.py | 260 ------------------------------- deea/tests/test_equilibration.py | 21 --- 2 files changed, 281 deletions(-) diff --git a/deea/equilibration.py b/deea/equilibration.py index 60d1e5c..b5d2358 100644 --- a/deea/equilibration.py +++ b/deea/equilibration.py @@ -330,263 +330,3 @@ def detect_equilibration_window( # Return the equilibration time (index), statistical inefficiency, and ESS. return equil_time, equil_g, equil_ess - - -def get_paired_t_p_timeseries( - data: _np.ndarray, - times: _Optional[_np.ndarray] = None, - fractional_block_size: float = 0.125, - fractional_test_end: float = 0.5, - initial_block_size: float = 0.1, - final_block_size: float = 0.5, - t_test_sidedness: str = "two-sided", -) -> _Tuple[_np.ndarray, _np.ndarray]: - """ - Get a timeseries of the p-values from a paired t-test on the differences - between sample means between intial and final portions of the data. The timeseries - is obtained by repeatedly discarding more data from the time series between - calculations of the p-value. - - Parameters - ---------- - data : np.ndarray - The time series data. This should have shape - (n_runs, n_samples) and must have at least two - runs. - - times : np.ndarray, optional - The times at which the data was sampled. If this is - not provided, the indices of the data will be used. - - fractional_block_size : float, optional - The fraction of data to discard between repeats. The default is 0.125. - - fractional_test_end : float, optional - The fraction of the time series to use in the final test. The default is 0.5. - - initial_block_size : float, optional - The fraction of the truncated time series to use for the "before" portion - of the paired t-test. The default is 0.1. - - final_block_size : float, optional - The fraction of the truncated time series to use for the "after" portion - of the paired t-test. The default is 0.5. - - t_test_sidedness : str, optional - The sidedness of the paired t-test. This can be either 'two-sided', 'less', - or 'greater'. The default is 'two-sided'. - - Returns - ------- - np.ndarray - The p-values of the paired t-test. - - np.ndarray - The times at which the p-values were calculated. - """ - # Check that the data is valid. - data = check_data(data, one_dim_allowed=False) - n_runs, n_samples = data.shape - - # Convert times to indices if necessary. - if times is None: - times = _np.arange(n_samples) - - # Check that times is match the number of samples. - if n_samples != len(times): - raise InvalidInputError( - "Times must have the same length as the number of samples." - ) - - # Check that user inputs are valid. - if fractional_block_size <= 0 or fractional_block_size > 1: - raise InvalidInputError("fractional_block_size must be between 0 and 1.") - - if fractional_test_end <= 0 or fractional_test_end > 1: - raise InvalidInputError("fractional_test_end must be between 0 and 1.") - - if round(fractional_test_end / fractional_block_size) < 2: - raise InvalidInputError( - "fractional_test_end must be at least twice the fractional_block_size." - ) - - # Check that fractional test end is a multiple of fractional block size. - if round((fractional_test_end / fractional_block_size) % 1.0, 3) != 0: - raise InvalidInputError( - "fractional_test_end must be a multiple of fractional_block_size." - ) - - if initial_block_size <= 0 or initial_block_size > 1: - raise InvalidInputError("initial_block_size must be between 0 and 1.") - - if final_block_size <= 0 or final_block_size > 1: - raise InvalidInputError("final_block_size must be between 0 and 1.") - - t_test_sidedness = t_test_sidedness.lower() - if t_test_sidedness not in ["two-sided", "less", "greater"]: - raise InvalidInputError( - "t_test_sidedness must be either 'two-sided', 'less', or 'greater'." - ) - - # Calculate the number of repeats. - n_repeats = ( - round(fractional_test_end / fractional_block_size) + 1 - ) # + 1 for the initial block - - # Calculate the number of samples to discard between repeats. - n_discard = round(n_samples * fractional_block_size) - - # Calculate the p values with their indices. - p_vals = _np.zeros(n_repeats) - p_val_indices = _np.zeros(n_repeats, dtype=int) - time_vals = _np.zeros(n_repeats) - - # Loop over and calculate the p values. - for i in range(n_repeats): - # Truncate the data. - idx = n_discard * i - truncated_data = data[:, idx:] - # Get the number of samples in the truncated data. - n_truncated_samples = truncated_data.shape[1] - # Get the initial and final blocks. - initial_block = truncated_data[ - :, : round(n_truncated_samples * initial_block_size) - ].mean(axis=1) - final_block = truncated_data[ - :, -round(n_truncated_samples * final_block_size) : - ].mean(axis=1) - # Compute the paired t-test. - p_vals[i] = ttest_rel(initial_block, final_block, alternative=t_test_sidedness)[ - 1 - ] - p_val_indices[i] = idx - time_vals[i] = times[idx] - - return p_vals, time_vals - - -def detect_equilibration_paired_t_test( - data: _np.ndarray, - times: _Optional[_np.ndarray] = None, - p_threshold: float = 0.05, - fractional_block_size: float = 0.125, - fractional_test_end: float = 0.5, - initial_block_size: float = 0.1, - final_block_size: float = 0.5, - t_test_sidedness: str = "two-sided", - plot: bool = False, - plot_name: _Union[str, Path] = "equilibration_paired_t_test.png", - time_units: str = "ns", - data_y_label: str = r"$\Delta G$ / kcal mol$^{-1}$", -) -> int: - r""" - Detect the equilibration time of a time series by performing a paired - t-test between initial and final portions of the time series. This is repeated - , discarding more data from the time series between repeats. If the p-value - is greater than the threshold, there is no significant evidence that the data is - no equilibrated and the timeseries is taken to be equilibrated at this time - point. This test may be useful when we care only about systematic bias in the - data, and do not care about detecting inter-run differences. - - Parameters - ---------- - data : np.ndarray - The time series data. This should have shape - (n_runs, n_samples) and must have at least two - runs. - - times : np.ndarray, optional - The times at which the data was sampled. If this is - not provided, the indices of the data will be used. - - p_threshold : float, optional - The p-value threshold to use. The default is 0.05. - - fractional_block_size : float, optional - The fraction of data to discard between repeats. The default is 0.125. - - fractional_test_end : float, optional - The fraction of the time series to use in the final test. The default is 0.5. - - initial_block_size : float, optional - The fraction of the truncated time series to use for the "before" portion - of the paired t-test. The default is 0.1. - - final_block_size : float, optional - The fraction of the truncated time series to use for the "after" portion - of the paired t-test. The default is 0.5. - - t_test_sidedness : str, optional - The sidedness of the paired t-test. This can be either 'two-sided', 'less', - or 'greater'. The default is 'two-sided'. - - plot : bool, optional - Whether to plot the p-values. The default is False. - - plot_name : str | Path, optional - The name of the plot file. The default is 'equilibration_paired_t_test.png'. - - time_units : str, optional - The units of time. The default is "ns". - - data_y_label : str, optional - The y-axis label for the time series data. The default is - "$\Delta G$ / kcal mol$^{-1}$". - - Returns - ------- - int - The time point at which the time series is equilibrated. - """ - # Validate dtata. - data = check_data(data, one_dim_allowed=False) - n_runs, n_samples = data.shape - - # If times is None, units of time are indices. - if times is None: - time_units = "index" - # Convert times to indices. - times = _np.arange(n_samples) - - # Check that user options (not checked in get_paired_t_p_timeseries) are valid. - if p_threshold <= 0 or p_threshold > 0.7: - raise InvalidInputError("p_threshold must be between 0 and 0.7.") - - # Get the p value timeseries and n_discard. - p_vals, times_used = get_paired_t_p_timeseries( - data=data, - times=times, - fractional_block_size=fractional_block_size, - fractional_test_end=fractional_test_end, - initial_block_size=initial_block_size, - final_block_size=final_block_size, - t_test_sidedness=t_test_sidedness, - ) - - # Get the index of the first p value that is greater than the threshold. - meets_threshold = p_vals > p_threshold - if not any(meets_threshold): - raise EquilibrationNotDetectedError( - f"No p values are greater than the threshold of {p_threshold}." - ) - equil_time = times_used[_np.argmax(meets_threshold)] - - # Plot the p values. - if plot: - fig = plt.figure(figsize=(6, 4)) - gridspec_obj = gridspec.GridSpec(1, 1, figure=fig) - plot_equilibration_paired_t_test( - fig=fig, - subplot_spec=gridspec_obj[0], - data=data, - p_values=p_vals, - data_times=times, - p_times=times_used, - p_threshold=p_threshold, - time_units=time_units, - data_y_label=data_y_label, - ) - - fig.savefig(str(plot_name), dpi=300, bbox_inches="tight") - - return equil_time diff --git a/deea/tests/test_equilibration.py b/deea/tests/test_equilibration.py index 651e0a0..6a1ecfb 100644 --- a/deea/tests/test_equilibration.py +++ b/deea/tests/test_equilibration.py @@ -205,24 +205,3 @@ def test_detect_equilibration_paired_t(gaussian_noise, example_timeseries): steeply_decaying_timeseries = np.tile(steeply_decaying_timeseries, (5, 1)) with pytest.raises(EquilibrationNotDetectedError): detect_equilibration_paired_t_test(steeply_decaying_timeseries) - - -def test_plots_paired_t(example_timeseries, example_times, tmpdir): - """Check that the plots are created.""" - tmp_output = Path(tmpdir) / "test_plots_paired_t" - detect_equilibration_paired_t_test( - example_timeseries, example_times, plot=True, plot_name=tmp_output - ) - assert tmp_output.with_suffix(".png").exists() - - -def test_p_values(example_timeseries, example_times): - """Tests on the p-values series generator which can't be run indirectly - through the equilibration detection functions.""" - - with pytest.raises(InvalidInputError): - get_paired_t_p_timeseries(example_timeseries, times=example_times[:-2]) - - # Check that this works on indices if no times passed. - _, times = get_paired_t_p_timeseries(example_timeseries) - assert times[-1] == 1312 From e8cb3b5f83755a0106b4f7eac68bbf5a336b1813 Mon Sep 17 00:00:00 2001 From: finlayclark Date: Mon, 22 Jan 2024 11:10:30 +0000 Subject: [PATCH 11/15] Remove paired t-test method plot tests --- deea/tests/test_equilibration.py | 2 -- deea/tests/test_plot.py | 47 -------------------------------- 2 files changed, 49 deletions(-) diff --git a/deea/tests/test_equilibration.py b/deea/tests/test_equilibration.py index 6a1ecfb..d185ffd 100644 --- a/deea/tests/test_equilibration.py +++ b/deea/tests/test_equilibration.py @@ -7,9 +7,7 @@ from deea.equilibration import ( detect_equilibration_init_seq, - detect_equilibration_paired_t_test, detect_equilibration_window, - get_paired_t_p_timeseries, ) from .._exceptions import EquilibrationNotDetectedError, InvalidInputError diff --git a/deea/tests/test_plot.py b/deea/tests/test_plot.py index 75261fd..50b3b49 100644 --- a/deea/tests/test_plot.py +++ b/deea/tests/test_plot.py @@ -134,50 +134,3 @@ def test_plot_equilibration_min_sse(example_timeseries, example_times): if SAVE_PLOTS: fig.savefig("test_plot_equilibration_min_sse.png", bbox_inches="tight") - - -def test_plot_p_values(example_timeseries, example_times): - """Test plotting the p-values of the paired t-test.""" - # Create a figure. - fig, ax = plt.subplots() - n_runs, n_samples = example_timeseries.shape - - # Compute the p-values for the example timeseries. - p_values, times_used = get_paired_t_p_timeseries( - example_timeseries, example_times, t_test_sidedness="two-sided" - ) - - # Plot the p-values. - plot_p_values(ax, p_values, times_used) - - if SAVE_PLOTS: - fig.savefig("test_plot_p_values.png") - - # Check that invalid input raises an error. - with pytest.raises(InvalidInputError): - plot_p_values(ax, p_values, example_times[:-2]) - - with pytest.raises(InvalidInputError): - plot_p_values(ax, list(p_values), example_timeseries) - - with pytest.raises(InvalidInputError): - # Make p_values a 2D array. - plot_p_values(ax, np.array([p_values, p_values]), example_timeseries) - - -def test_plot_equilibration_paired_t_test(example_timeseries, example_times): - """Test plotting the equilibration detection based on the paired t-test.""" - # Create a figure. - fig = plt.figure(figsize=(6, 4)) - gridspec_obj = gridspec.GridSpec(1, 1, figure=fig) - - # Compute the p-values for the example timeseries. - p_values, p_times = get_paired_t_p_timeseries(example_timeseries, example_times) - - # Plot the equilibration detection. - plot_equilibration_paired_t_test( - fig, gridspec_obj[0], example_timeseries, p_values, example_times, p_times - ) - - if SAVE_PLOTS: - fig.savefig("test_plot_equilibration_paired_t_test.png", bbox_inches="tight") From 4d57b010e4e924f70a08521f28dab708eff91112 Mon Sep 17 00:00:00 2001 From: finlayclark Date: Mon, 22 Jan 2024 11:17:04 +0000 Subject: [PATCH 12/15] Revert "Remove paired t-test method plot tests" This reverts commit e8cb3b5f83755a0106b4f7eac68bbf5a336b1813. --- deea/tests/test_equilibration.py | 2 ++ deea/tests/test_plot.py | 47 ++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/deea/tests/test_equilibration.py b/deea/tests/test_equilibration.py index d185ffd..6a1ecfb 100644 --- a/deea/tests/test_equilibration.py +++ b/deea/tests/test_equilibration.py @@ -7,7 +7,9 @@ from deea.equilibration import ( detect_equilibration_init_seq, + detect_equilibration_paired_t_test, detect_equilibration_window, + get_paired_t_p_timeseries, ) from .._exceptions import EquilibrationNotDetectedError, InvalidInputError diff --git a/deea/tests/test_plot.py b/deea/tests/test_plot.py index 50b3b49..75261fd 100644 --- a/deea/tests/test_plot.py +++ b/deea/tests/test_plot.py @@ -134,3 +134,50 @@ def test_plot_equilibration_min_sse(example_timeseries, example_times): if SAVE_PLOTS: fig.savefig("test_plot_equilibration_min_sse.png", bbox_inches="tight") + + +def test_plot_p_values(example_timeseries, example_times): + """Test plotting the p-values of the paired t-test.""" + # Create a figure. + fig, ax = plt.subplots() + n_runs, n_samples = example_timeseries.shape + + # Compute the p-values for the example timeseries. + p_values, times_used = get_paired_t_p_timeseries( + example_timeseries, example_times, t_test_sidedness="two-sided" + ) + + # Plot the p-values. + plot_p_values(ax, p_values, times_used) + + if SAVE_PLOTS: + fig.savefig("test_plot_p_values.png") + + # Check that invalid input raises an error. + with pytest.raises(InvalidInputError): + plot_p_values(ax, p_values, example_times[:-2]) + + with pytest.raises(InvalidInputError): + plot_p_values(ax, list(p_values), example_timeseries) + + with pytest.raises(InvalidInputError): + # Make p_values a 2D array. + plot_p_values(ax, np.array([p_values, p_values]), example_timeseries) + + +def test_plot_equilibration_paired_t_test(example_timeseries, example_times): + """Test plotting the equilibration detection based on the paired t-test.""" + # Create a figure. + fig = plt.figure(figsize=(6, 4)) + gridspec_obj = gridspec.GridSpec(1, 1, figure=fig) + + # Compute the p-values for the example timeseries. + p_values, p_times = get_paired_t_p_timeseries(example_timeseries, example_times) + + # Plot the equilibration detection. + plot_equilibration_paired_t_test( + fig, gridspec_obj[0], example_timeseries, p_values, example_times, p_times + ) + + if SAVE_PLOTS: + fig.savefig("test_plot_equilibration_paired_t_test.png", bbox_inches="tight") From 53585219fddbcb481e38c164a1f68568b4aa3a74 Mon Sep 17 00:00:00 2001 From: finlayclark Date: Mon, 22 Jan 2024 11:17:32 +0000 Subject: [PATCH 13/15] Revert "Remove paired t-test method" This reverts commit 708f684c6773c74b40ec7d8d3592ee3bcb118058. --- deea/equilibration.py | 260 +++++++++++++++++++++++++++++++ deea/tests/test_equilibration.py | 21 +++ 2 files changed, 281 insertions(+) diff --git a/deea/equilibration.py b/deea/equilibration.py index b5d2358..60d1e5c 100644 --- a/deea/equilibration.py +++ b/deea/equilibration.py @@ -330,3 +330,263 @@ def detect_equilibration_window( # Return the equilibration time (index), statistical inefficiency, and ESS. return equil_time, equil_g, equil_ess + + +def get_paired_t_p_timeseries( + data: _np.ndarray, + times: _Optional[_np.ndarray] = None, + fractional_block_size: float = 0.125, + fractional_test_end: float = 0.5, + initial_block_size: float = 0.1, + final_block_size: float = 0.5, + t_test_sidedness: str = "two-sided", +) -> _Tuple[_np.ndarray, _np.ndarray]: + """ + Get a timeseries of the p-values from a paired t-test on the differences + between sample means between intial and final portions of the data. The timeseries + is obtained by repeatedly discarding more data from the time series between + calculations of the p-value. + + Parameters + ---------- + data : np.ndarray + The time series data. This should have shape + (n_runs, n_samples) and must have at least two + runs. + + times : np.ndarray, optional + The times at which the data was sampled. If this is + not provided, the indices of the data will be used. + + fractional_block_size : float, optional + The fraction of data to discard between repeats. The default is 0.125. + + fractional_test_end : float, optional + The fraction of the time series to use in the final test. The default is 0.5. + + initial_block_size : float, optional + The fraction of the truncated time series to use for the "before" portion + of the paired t-test. The default is 0.1. + + final_block_size : float, optional + The fraction of the truncated time series to use for the "after" portion + of the paired t-test. The default is 0.5. + + t_test_sidedness : str, optional + The sidedness of the paired t-test. This can be either 'two-sided', 'less', + or 'greater'. The default is 'two-sided'. + + Returns + ------- + np.ndarray + The p-values of the paired t-test. + + np.ndarray + The times at which the p-values were calculated. + """ + # Check that the data is valid. + data = check_data(data, one_dim_allowed=False) + n_runs, n_samples = data.shape + + # Convert times to indices if necessary. + if times is None: + times = _np.arange(n_samples) + + # Check that times is match the number of samples. + if n_samples != len(times): + raise InvalidInputError( + "Times must have the same length as the number of samples." + ) + + # Check that user inputs are valid. + if fractional_block_size <= 0 or fractional_block_size > 1: + raise InvalidInputError("fractional_block_size must be between 0 and 1.") + + if fractional_test_end <= 0 or fractional_test_end > 1: + raise InvalidInputError("fractional_test_end must be between 0 and 1.") + + if round(fractional_test_end / fractional_block_size) < 2: + raise InvalidInputError( + "fractional_test_end must be at least twice the fractional_block_size." + ) + + # Check that fractional test end is a multiple of fractional block size. + if round((fractional_test_end / fractional_block_size) % 1.0, 3) != 0: + raise InvalidInputError( + "fractional_test_end must be a multiple of fractional_block_size." + ) + + if initial_block_size <= 0 or initial_block_size > 1: + raise InvalidInputError("initial_block_size must be between 0 and 1.") + + if final_block_size <= 0 or final_block_size > 1: + raise InvalidInputError("final_block_size must be between 0 and 1.") + + t_test_sidedness = t_test_sidedness.lower() + if t_test_sidedness not in ["two-sided", "less", "greater"]: + raise InvalidInputError( + "t_test_sidedness must be either 'two-sided', 'less', or 'greater'." + ) + + # Calculate the number of repeats. + n_repeats = ( + round(fractional_test_end / fractional_block_size) + 1 + ) # + 1 for the initial block + + # Calculate the number of samples to discard between repeats. + n_discard = round(n_samples * fractional_block_size) + + # Calculate the p values with their indices. + p_vals = _np.zeros(n_repeats) + p_val_indices = _np.zeros(n_repeats, dtype=int) + time_vals = _np.zeros(n_repeats) + + # Loop over and calculate the p values. + for i in range(n_repeats): + # Truncate the data. + idx = n_discard * i + truncated_data = data[:, idx:] + # Get the number of samples in the truncated data. + n_truncated_samples = truncated_data.shape[1] + # Get the initial and final blocks. + initial_block = truncated_data[ + :, : round(n_truncated_samples * initial_block_size) + ].mean(axis=1) + final_block = truncated_data[ + :, -round(n_truncated_samples * final_block_size) : + ].mean(axis=1) + # Compute the paired t-test. + p_vals[i] = ttest_rel(initial_block, final_block, alternative=t_test_sidedness)[ + 1 + ] + p_val_indices[i] = idx + time_vals[i] = times[idx] + + return p_vals, time_vals + + +def detect_equilibration_paired_t_test( + data: _np.ndarray, + times: _Optional[_np.ndarray] = None, + p_threshold: float = 0.05, + fractional_block_size: float = 0.125, + fractional_test_end: float = 0.5, + initial_block_size: float = 0.1, + final_block_size: float = 0.5, + t_test_sidedness: str = "two-sided", + plot: bool = False, + plot_name: _Union[str, Path] = "equilibration_paired_t_test.png", + time_units: str = "ns", + data_y_label: str = r"$\Delta G$ / kcal mol$^{-1}$", +) -> int: + r""" + Detect the equilibration time of a time series by performing a paired + t-test between initial and final portions of the time series. This is repeated + , discarding more data from the time series between repeats. If the p-value + is greater than the threshold, there is no significant evidence that the data is + no equilibrated and the timeseries is taken to be equilibrated at this time + point. This test may be useful when we care only about systematic bias in the + data, and do not care about detecting inter-run differences. + + Parameters + ---------- + data : np.ndarray + The time series data. This should have shape + (n_runs, n_samples) and must have at least two + runs. + + times : np.ndarray, optional + The times at which the data was sampled. If this is + not provided, the indices of the data will be used. + + p_threshold : float, optional + The p-value threshold to use. The default is 0.05. + + fractional_block_size : float, optional + The fraction of data to discard between repeats. The default is 0.125. + + fractional_test_end : float, optional + The fraction of the time series to use in the final test. The default is 0.5. + + initial_block_size : float, optional + The fraction of the truncated time series to use for the "before" portion + of the paired t-test. The default is 0.1. + + final_block_size : float, optional + The fraction of the truncated time series to use for the "after" portion + of the paired t-test. The default is 0.5. + + t_test_sidedness : str, optional + The sidedness of the paired t-test. This can be either 'two-sided', 'less', + or 'greater'. The default is 'two-sided'. + + plot : bool, optional + Whether to plot the p-values. The default is False. + + plot_name : str | Path, optional + The name of the plot file. The default is 'equilibration_paired_t_test.png'. + + time_units : str, optional + The units of time. The default is "ns". + + data_y_label : str, optional + The y-axis label for the time series data. The default is + "$\Delta G$ / kcal mol$^{-1}$". + + Returns + ------- + int + The time point at which the time series is equilibrated. + """ + # Validate dtata. + data = check_data(data, one_dim_allowed=False) + n_runs, n_samples = data.shape + + # If times is None, units of time are indices. + if times is None: + time_units = "index" + # Convert times to indices. + times = _np.arange(n_samples) + + # Check that user options (not checked in get_paired_t_p_timeseries) are valid. + if p_threshold <= 0 or p_threshold > 0.7: + raise InvalidInputError("p_threshold must be between 0 and 0.7.") + + # Get the p value timeseries and n_discard. + p_vals, times_used = get_paired_t_p_timeseries( + data=data, + times=times, + fractional_block_size=fractional_block_size, + fractional_test_end=fractional_test_end, + initial_block_size=initial_block_size, + final_block_size=final_block_size, + t_test_sidedness=t_test_sidedness, + ) + + # Get the index of the first p value that is greater than the threshold. + meets_threshold = p_vals > p_threshold + if not any(meets_threshold): + raise EquilibrationNotDetectedError( + f"No p values are greater than the threshold of {p_threshold}." + ) + equil_time = times_used[_np.argmax(meets_threshold)] + + # Plot the p values. + if plot: + fig = plt.figure(figsize=(6, 4)) + gridspec_obj = gridspec.GridSpec(1, 1, figure=fig) + plot_equilibration_paired_t_test( + fig=fig, + subplot_spec=gridspec_obj[0], + data=data, + p_values=p_vals, + data_times=times, + p_times=times_used, + p_threshold=p_threshold, + time_units=time_units, + data_y_label=data_y_label, + ) + + fig.savefig(str(plot_name), dpi=300, bbox_inches="tight") + + return equil_time diff --git a/deea/tests/test_equilibration.py b/deea/tests/test_equilibration.py index 6a1ecfb..651e0a0 100644 --- a/deea/tests/test_equilibration.py +++ b/deea/tests/test_equilibration.py @@ -205,3 +205,24 @@ def test_detect_equilibration_paired_t(gaussian_noise, example_timeseries): steeply_decaying_timeseries = np.tile(steeply_decaying_timeseries, (5, 1)) with pytest.raises(EquilibrationNotDetectedError): detect_equilibration_paired_t_test(steeply_decaying_timeseries) + + +def test_plots_paired_t(example_timeseries, example_times, tmpdir): + """Check that the plots are created.""" + tmp_output = Path(tmpdir) / "test_plots_paired_t" + detect_equilibration_paired_t_test( + example_timeseries, example_times, plot=True, plot_name=tmp_output + ) + assert tmp_output.with_suffix(".png").exists() + + +def test_p_values(example_timeseries, example_times): + """Tests on the p-values series generator which can't be run indirectly + through the equilibration detection functions.""" + + with pytest.raises(InvalidInputError): + get_paired_t_p_timeseries(example_timeseries, times=example_times[:-2]) + + # Check that this works on indices if no times passed. + _, times = get_paired_t_p_timeseries(example_timeseries) + assert times[-1] == 1312 From 5b877d3f984047543810134e363ea26e0b8165f5 Mon Sep 17 00:00:00 2001 From: finlayclark Date: Mon, 22 Jan 2024 11:22:04 +0000 Subject: [PATCH 14/15] Remove flaky test for paired t-test detection --- deea/tests/test_equilibration.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/deea/tests/test_equilibration.py b/deea/tests/test_equilibration.py index 651e0a0..9d879df 100644 --- a/deea/tests/test_equilibration.py +++ b/deea/tests/test_equilibration.py @@ -154,10 +154,6 @@ def test_detect_equilibration_paired_t(gaussian_noise, example_timeseries): """ Test equilibration detection based on the paired t-test. """ - # Compute the equilibration index. - idx = detect_equilibration_paired_t_test(gaussian_noise) - assert idx == 0 - # Check that it fails for a single run. with pytest.raises(InvalidInputError): detect_equilibration_paired_t_test(gaussian_noise[0]) From e423f2956071062c3686460d9cc106a8095b491d Mon Sep 17 00:00:00 2001 From: finlayclark Date: Mon, 22 Jan 2024 11:29:35 +0000 Subject: [PATCH 15/15] Fix upload of coverage report to codacy --- .github/workflows/CI.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/CI.yaml b/.github/workflows/CI.yaml index 74fdc49..922bbba 100644 --- a/.github/workflows/CI.yaml +++ b/.github/workflows/CI.yaml @@ -63,3 +63,4 @@ jobs: uses: codacy/codacy-coverage-reporter-action@v1 with: project-token: ${{ secrets.CODACY_PROJECT_TOKEN }} + coverage-reports: coverage.xml