Merge pull request #3 from fjclark/min_mcse

Min SSE
fjclark · Jan 22, 2024 · 0e501e1 · 0e501e1
2 parents b6118c6 + e423f29
commit 0e501e1
Show file tree

Hide file tree

Showing 19 changed files with 2,143 additions and 398 deletions.
diff --git a/.github/workflows/CI.yaml b/.github/workflows/CI.yaml
@@ -58,3 +58,9 @@ jobs:
         uses: codecov/codecov-action@v3
         env:
           CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
+
+      - name: Run codacy-coverage-reporter
+        uses: codacy/codacy-coverage-reporter-action@v1
+        with:
+          project-token: ${{ secrets.CODACY_PROJECT_TOKEN }}
+          coverage-reports: coverage.xml
diff --git a/README.md b/README.md
@@ -8,12 +8,13 @@ deea
 
 Detection of Equilibration by Ensemble Analysis
 
-A lightweight Python package for detecting equilibration in timeseries data where an initial transient is followed by a stationary distribution. Currently, two equilibration detection methods are implemented:
+A Python package for detecting equilibration in timeseries data where an initial transient is followed by a stationary distribution. Two main methods are implemented, which differ in the way they account for autocorrelation: 
 
-- Maximum effective sample size based on the lugsail replicated batch means method. See [here](https://projecteuclid.org/journals/statistical-science/volume-36/issue-4/Revisiting-the-GelmanRubin-Diagnostic/10.1214/20-STS812.full) and [here](https://academic.oup.com/biomet/article/109/3/735/6395353).
-
-- Paired t-test. This checks for significant differences between the first 10 %, and last 50 % of the data. The test is repeated while sequentially removing more initial data. The first time point at which no significant difference is found is taken as the equilibration time.
+  - `detect_equilibration_init_seq`: This uses the initial sequence methods of Geyer ([Geyer, 1992](https://www.jstor.org/stable/2246094)) to determine the truncation point of the sum of autocovariances.
+  - `detect_equilibration_window`: This uses window methods (see [Geyer](https://www.jstor.org/stable/2246094) again) when calculating the 
+autocorrelation. 
 
+For both, the equilibration point can be determined either according to the minimum of the squared standard error (the default), or the maximum effective sample size, by specifying `method="min_sse"` or `method="max_ess"`.
 
 ### Installation
 
@@ -29,18 +30,23 @@ pip install -e .
 import deea
 
 # Load your timeseries of interest.
-# This should be a 2D numpy array with shape (n_runs, n_samples)
+# This should be a 2D numpy array with shape (n_runs, n_samples),
+# or a 1D numpy array with shape (n_samples).
 my_timeseries = ...
 
-# Detect equilibration based on the maximum effective sample size 
-# based on the lugsail replicated batch means method.
-# idx is the index of the first sample after equilibration, g is the
-# statistical inefficiency, and ess is the effective sample size.
-idx, g, ess = deea.detect_equilibration_max_ess(my_timeseries, method="lugsail", plot=True)
+# Detect equilibration based on the minimum squared standard error
+# using Geyer's initial convex sequence method to account for
+# autocorrelation. idx is the index of the first sample after
+# equilibration, g is the statistical inefficiency of the equilibrated 
+# sample, and ess is the effective sample size of the equilibrated sample.
+idx, g, ess = deea.detect_equilibration_init_seq(my_timeseries, method="min_sse", plot=True)
+
+# Alternatively, use the window method to account for autocorrelation.
+# By default, this uses a Bartlett kernel and a window size of round(n_samples**0.5).
+idx, g, ess = deea.detect_equilibration_window(my_timeseries, method="min_sse", plot=True)
 
-# Detect equilibration using a paired t-test.
-# idx is the index of the first sample after equilibration.
-idx = deea.detect_equilibration_ttest(my_timeseries, plot=True)
+# We can also determine equilibration in the same way as in pymbar.timeseries.detect_equilibration.
+idx, g, ess = deea.detect_equilibration_init_seq(my_timeseries, method="max_ess", sequence_estimator="positive")
 ```
 
 ### Copyright

diff --git a/deea/__init__.py b/deea/__init__.py
@@ -7,4 +7,5 @@
 from .ess import *
 from .gelman_rubin import *
 from .plot import *
+from .sse import *
 from .variance import *
diff --git a/deea/_validation.py b/deea/_validation.py
@@ -1,5 +1,7 @@
 """Functions for data validation."""
 
+from warnings import warn as _warn
+
 import numpy as np
 
 from ._exceptions import InvalidInputError
@@ -45,8 +47,9 @@ def check_data(data: np.ndarray, one_dim_allowed: bool = False) -> np.ndarray:
     if n_dims == 2:
         n_chains, n_samples = data.shape
         if n_chains > n_samples:
-            raise InvalidInputError(
-                "Data must have shape (n_chains, n_samples) where n_chains >= n_samples."
+            _warn(
+                "Data has more chains than samples. This is allowed but may not be what you intended.",
+                RuntimeWarning,
             )
 
     # If the array is one dimensional, reshape it to (1, n_samples).

diff --git a/deea/_version.py b/deea/_version.py
@@ -1 +1 @@
-__version__ = "0.1.0"
+__version__ = "0.1.0+12.gb6118c6.dirty"
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		__version__ = "0.1.0"
		__version__ = "0.1.0+12.gb6118c6.dirty"