From 5824cb5aea7dad0af9ac73cb413f5097806ac6a2 Mon Sep 17 00:00:00 2001
From: finlayclark
Date: Tue, 17 Sep 2024 14:31:46 +0100
Subject: [PATCH] Add makefile to simplify common commands

Such as linting, testing, type-checking etc. Also reformat the code to
pass the linting and type-checking requirements.
---
 .github/CONTRIBUTING.md                     |   3 +-
 .github/PULL_REQUEST_TEMPLATE.md            |   2 +-
 .github/workflows/CI.yaml                   |   4 +-
 .pre-commit-config.yaml                     |  30 +++
 MANIFEST.in                                 |   2 +-
 Makefile                                    |  36 ++++
 README.md                                   |  12 +-
 devtools/README.md                          |  16 +-
 devtools/conda-envs/base.yaml               |  15 ++
 .../conda-envs/{test_env.yaml => test.yaml} |  12 +-
 devtools/scripts/create_conda_env.py        |  76 +++++---
 docs/Makefile                               |   2 +-
 docs/README.md                              |   5 +-
 docs/_static/README.md                      |   4 +-
 docs/_templates/README.md                   |   4 +-
 docs/conf.py                                |   4 +-
 docs/getting_started.rst                    |   2 +-
 docs/requirements.yaml                      |   1 -
 pyproject.toml                              |  23 +++
 readthedocs.yml                             |   2 +-
 red/__init__.py                             |  18 +-
 red/_validation.py                          |   9 +-
 red/_version.py                             |   2 +-
 red/equilibration.py                        |  63 +++----
 red/ess.py                                  |  23 +--
 red/gelman_rubin.py                         |  33 ++--
 red/plot.py                                 |  90 ++++-----
 red/sse.py                                  |  11 +-
 red/tests/test_equilibration.py             |   4 +-
 red/tests/test_plot.py                      |   8 +-
 red/tests/test_variance.py                  |  27 +--
 red/variance.py                             | 177 +++++++++---------
 32 files changed, 411 insertions(+), 309 deletions(-)
 create mode 100644 .pre-commit-config.yaml
 create mode 100644 Makefile
 create mode 100644 devtools/conda-envs/base.yaml
 rename devtools/conda-envs/{test_env.yaml => test.yaml} (70%)

diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
index 03292da..3eebeff 100644
--- a/.github/CONTRIBUTING.md
+++ b/.github/CONTRIBUTING.md
@@ -1,7 +1,6 @@
 # How to contribute
 
-We welcome contributions from external contributors, and this document
-describes how to merge code changes into this EnsEquil.
+We welcome contributions from external contributors, and this document describes how to merge code changes into this repository.
 
 ## Getting Started
 
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index c772b96..26da5a2 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -9,4 +9,4 @@ Notable points that this PR has either accomplished or will accomplish.
- [ ] Question1 ## Status -- [ ] Ready to go \ No newline at end of file +- [ ] Ready to go diff --git a/.github/workflows/CI.yaml b/.github/workflows/CI.yaml index 4f8fa9a..bef84e1 100644 --- a/.github/workflows/CI.yaml +++ b/.github/workflows/CI.yaml @@ -36,8 +36,8 @@ jobs: - uses: mamba-org/setup-micromamba@v1 with: - environment-file: devtools/conda-envs/test_env.yaml - environment-name: test + environment-file: devtools/conda-envs/test.yaml + environment-name: red create-args: >- # beware the >- instead of |, we don't split on newlines but on spaces python=${{ matrix.python-version }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..73afab1 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,30 @@ +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v3.2.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + +- repo: local + hooks: + - id: ruff + name: Autoformat python code + language: system + entry: bash + args: [-c, "make format"] + +- repo: local + hooks: + - id: ruff + name: Lint python code + language: system + entry: bash + args: [-c, "make lint"] + +- repo: local + hooks: + - id: mypy + name: Type check python code + language: system + entry: bash + args: [-c, "make type-check"] diff --git a/MANIFEST.in b/MANIFEST.in index 7b62c26..e0267af 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,3 @@ include CODE_OF_CONDUCT.md -global-exclude *.py[cod] __pycache__ *.so \ No newline at end of file +global-exclude *.py[cod] __pycache__ *.so diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..e2b8931 --- /dev/null +++ b/Makefile @@ -0,0 +1,36 @@ +PACKAGE_NAME := red +PACKAGE_DIR := red + +CONDA_ENV_RUN = conda run --no-capture-output --name $(PACKAGE_NAME) + +TEST_ARGS := -v --cov=$(PACKAGE_NAME) --cov-report=term --cov-report=xml --junitxml=unit.xml --color=yes + +.PHONY: env lint format test type-check docs-build docs-deploy + +env: + mamba create --name $(PACKAGE_NAME) + mamba env update --name $(PACKAGE_NAME) --file devtools/conda-envs/test.yaml + $(CONDA_ENV_RUN) pip install --no-deps -e . + $(CONDA_ENV_RUN) pre-commit install || true + +lint: + $(CONDA_ENV_RUN) ruff check $(PACKAGE_DIR) + +format: + $(CONDA_ENV_RUN) ruff format $(PACKAGE_DIR) + $(CONDA_ENV_RUN) ruff check --fix --select I $(PACKAGE_DIR) + +test: + $(CONDA_ENV_RUN) pytest -v $(TEST_ARGS) $(PACKAGE_DIR)/tests/ + +type-check: + $(CONDA_ENV_RUN) mypy --follow-imports=silent --ignore-missing-imports --strict $(PACKAGE_DIR) + +docs-build: + $(CONDA_ENV_RUN) mkdocs build + +docs-deploy: +ifndef VERSION + $(error VERSION is not set) +endif + $(CONDA_ENV_RUN) mike deploy --push --update-aliases $(VERSION) diff --git a/README.md b/README.md index 936c5d0..da7fa88 100644 --- a/README.md +++ b/README.md @@ -11,11 +11,11 @@ Robust Equilibration Detection [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) -A Python package for detecting equilibration in timeseries data where an initial transient is followed by a stationary distribution. Two main methods are implemented, which differ in the way they account for autocorrelation: +A Python package for detecting equilibration in timeseries data where an initial transient is followed by a stationary distribution. 
Two main methods are implemented, which differ in the way they account for autocorrelation: - `detect_equilibration_init_seq`: This uses the initial sequence methods of Geyer ([Geyer, 1992](https://www.jstor.org/stable/2246094)) to determine the truncation point of the sum of autocovariances. - - `detect_equilibration_window`: This uses window methods (see [Geyer](https://www.jstor.org/stable/2246094) again) when calculating the -autocorrelation. + - `detect_equilibration_window`: This uses window methods (see [Geyer](https://www.jstor.org/stable/2246094) again) when calculating the +autocorrelation. For both, the equilibration point can be determined either according to the minimum of the squared standard error (the default), or the maximum effective sample size, by specifying `method="min_sse"` or `method="max_ess"`. @@ -40,7 +40,7 @@ my_timeseries = ... # Detect equilibration based on the minimum squared standard error # using Geyer's initial convex sequence method to account for # autocorrelation. idx is the index of the first sample after -# equilibration, g is the statistical inefficiency of the equilibrated +# equilibration, g is the statistical inefficiency of the equilibrated # sample, and ess is the effective sample size of the equilibrated sample. idx, g, ess = red.detect_equilibration_init_seq(my_timeseries, method="min_sse", plot=True) @@ -58,6 +58,6 @@ Copyright (c) 2023, Finlay Clark #### Acknowledgements - -Project based on the + +Project based on the [Computational Molecular Science Python Cookiecutter](https://github.com/molssi/cookiecutter-cms) version 1.1. diff --git a/devtools/README.md b/devtools/README.md index e0b2ae4..df69d5c 100644 --- a/devtools/README.md +++ b/devtools/README.md @@ -1,6 +1,6 @@ # Development, testing, and deployment tools -This directory contains a collection of tools for running Continuous Integration (CI) tests, +This directory contains a collection of tools for running Continuous Integration (CI) tests, conda installation, and other development tools not directly related to the coding process. @@ -8,7 +8,7 @@ conda installation, and other development tools not directly related to the codi ### Continuous Integration -You should test your code, but do not feel compelled to use these specific programs. You also may not need Unix and +You should test your code, but do not feel compelled to use these specific programs. You also may not need Unix and Windows testing if you only plan to deploy on specific platforms. These are just to help you get started. ### Conda Environment: @@ -17,7 +17,7 @@ This directory contains the files to setup the Conda environment for testing pur * `conda-envs`: directory containing the YAML file(s) which fully describe Conda Environments, their dependencies, and those dependency provenance's * `test_env.yaml`: Simple test environment file with base dependencies. 
Channels are not specified here and therefore respect global Conda configuration - + ### Additional Scripts: This directory contains OS agnostic helper scripts which don't fall in any of the previous categories @@ -40,17 +40,17 @@ This directory contains OS agnostic helper scripts which don't fall in any of th - [ ] Make sure there is an/are issue(s) opened for your specific update - [ ] Create the PR, referencing the issue - [ ] Debug the PR as needed until tests pass -- [ ] Tag the final, debugged version +- [ ] Tag the final, debugged version * `git tag -a X.Y.Z [latest pushed commit] && git push --follow-tags` - [ ] Get the PR merged in ## Versioneer Auto-version -[Versioneer](https://github.com/warner/python-versioneer) will automatically infer what version -is installed by looking at the `git` tags and how many commits ahead this version is. The format follows +[Versioneer](https://github.com/warner/python-versioneer) will automatically infer what version +is installed by looking at the `git` tags and how many commits ahead this version is. The format follows [PEP 440](https://www.python.org/dev/peps/pep-0440/) and has the regular expression of: ```regexp \d+.\d+.\d+(?\+\d+-[a-z0-9]+) ``` -If the version of this commit is the same as a `git` tag, the installed version is the same as the tag, -e.g. `red-0.1.2`, otherwise it will be appended with `+X` where `X` is the number of commits +If the version of this commit is the same as a `git` tag, the installed version is the same as the tag, +e.g. `red-0.1.2`, otherwise it will be appended with `+X` where `X` is the number of commits ahead from the last tag, and then `-YYYYYY` where the `Y`'s are replaced with the `git` commit hash. diff --git a/devtools/conda-envs/base.yaml b/devtools/conda-envs/base.yaml new file mode 100644 index 0000000..0ab4a6e --- /dev/null +++ b/devtools/conda-envs/base.yaml @@ -0,0 +1,15 @@ +name: red + +channels: + - conda-forge + - defaults + +dependencies: + # Base depends + - python + - pip + - numpy + - numba + - scipy + - matplotlib + - statsmodels diff --git a/devtools/conda-envs/test_env.yaml b/devtools/conda-envs/test.yaml similarity index 70% rename from devtools/conda-envs/test_env.yaml rename to devtools/conda-envs/test.yaml index 7c7ad9d..2a732cc 100644 --- a/devtools/conda-envs/test_env.yaml +++ b/devtools/conda-envs/test.yaml @@ -1,9 +1,9 @@ -name: test -channels: +name: red +channels: - conda-forge - - defaults + dependencies: # Base depends - python @@ -14,8 +14,12 @@ dependencies: - matplotlib - statsmodels - # Testing + # Testing / Development - pytest - pytest-cov - pytest-env - codecov + - ruff + - pre-commit + - mypy + - types-PyYAML diff --git a/devtools/scripts/create_conda_env.py b/devtools/scripts/create_conda_env.py index 9ece84a..e3087e5 100644 --- a/devtools/scripts/create_conda_env.py +++ b/devtools/scripts/create_conda_env.py @@ -6,9 +6,11 @@ import subprocess as sp from tempfile import TemporaryDirectory from contextlib import contextmanager + # YAML imports try: import yaml # PyYAML + loader = yaml.safe_load except ImportError: try: @@ -17,19 +19,31 @@ try: # Load Ruamel YAML from the base conda environment from importlib import util as import_util - CONDA_BIN = os.path.dirname(os.environ['CONDA_EXE']) - ruamel_yaml_path = glob.glob(os.path.join(CONDA_BIN, '..', - 'lib', 'python*.*', 'site-packages', - 'ruamel_yaml', '__init__.py'))[0] - # Based on importlib example, but only needs to load_module since its the whole package, not just - # a module - spec = 
import_util.spec_from_file_location('ruamel_yaml', ruamel_yaml_path) + + CONDA_BIN = os.path.dirname(os.environ["CONDA_EXE"]) + ruamel_yaml_path = glob.glob( + os.path.join( + CONDA_BIN, + "..", + "lib", + "python*.*", + "site-packages", + "ruamel_yaml", + "__init__.py", + ) + )[0] + # Based on importlib example, but only needs to load_module since its the whole + # package, not just a module + spec = import_util.spec_from_file_location("ruamel_yaml", ruamel_yaml_path) yaml = spec.loader.load_module() except (KeyError, ImportError, IndexError): - raise ImportError("No YAML parser could be found in this or the conda environment. " - "Could not find PyYAML or Ruamel YAML in the current environment, " - "AND could not find Ruamel YAML in the base conda environment through CONDA_EXE path. " - "Environment not created!") + raise ImportError( # noqa: B904 + "No YAML parser could be found in this or the conda environment. " + "Could not find PyYAML or Ruamel YAML in the current environment, " + "AND could not find Ruamel YAML in the base conda environment through " + "CONDA_EXE path. " + "Environment not created!" + ) loader = yaml.YAML(typ="safe").load # typ="safe" avoids odd typing on output @@ -46,13 +60,14 @@ def temp_cd(): # Args -parser = argparse.ArgumentParser(description='Creates a conda environment from file for a given Python version.') -parser.add_argument('-n', '--name', type=str, - help='The name of the created Python environment') -parser.add_argument('-p', '--python', type=str, - help='The version of the created Python environment') -parser.add_argument('conda_file', - help='The file for the created Python environment') +parser = argparse.ArgumentParser( + description="Creates a conda environment from file for a given Python version." +) +parser.add_argument("-n", "--name", type=str, help="The name of the created Python environment") +parser.add_argument( + "-p", "--python", type=str, help="The version of the created Python environment" +) +parser.add_argument("conda_file", help="The file for the created Python environment") args = parser.parse_args() @@ -63,16 +78,21 @@ def temp_cd(): python_replacement_string = "python {}*".format(args.python) try: - for dep_index, dep_value in enumerate(yaml_script['dependencies']): - if re.match('python([ ><=*]+[0-9.*]*)?$', dep_value): # Match explicitly 'python' and its formats - yaml_script['dependencies'].pop(dep_index) - break # Making the assumption there is only one Python entry, also avoids need to enumerate in reverse + for dep_index, dep_value in enumerate(yaml_script["dependencies"]): + if re.match( + "python([ ><=*]+[0-9.*]*)?$", dep_value + ): # Match explicitly 'python' and its formats + yaml_script["dependencies"].pop(dep_index) + # Making the assumption there is only one Python entry, also avoids + # need to enumerate in reverse + break except (KeyError, TypeError): # Case of no dependencies key, or dependencies: None - yaml_script['dependencies'] = [] + yaml_script["dependencies"] = [] finally: - # Ensure the python version is added in. Even if the code does not need it, we assume the env does - yaml_script['dependencies'].insert(0, python_replacement_string) + # Ensure the python version is added in. 
Even if the code does not + # need it, we assume the env does + yaml_script["dependencies"].insert(0, python_replacement_string) # Figure out conda path if "CONDA_EXE" in os.environ: @@ -80,7 +100,9 @@ def temp_cd(): else: conda_path = shutil.which("conda") if conda_path is None: - raise RuntimeError("Could not find a conda binary in CONDA_EXE variable or in executable search path") + raise RuntimeError( + "Could not find a conda binary in CONDA_EXE variable or in executable search path" + ) print("CONDA ENV NAME {}".format(args.name)) print("PYTHON VERSION {}".format(args.python)) @@ -90,6 +112,6 @@ def temp_cd(): # Write to a temp directory which will always be cleaned up with temp_cd(): temp_file_name = "temp_script.yaml" - with open(temp_file_name, 'w') as f: + with open(temp_file_name, "w") as f: f.write(yaml.dump(yaml_script)) sp.call("{} env create -n {} -f {}".format(conda_path, args.name, temp_file_name), shell=True) diff --git a/docs/Makefile b/docs/Makefile index a7f455d..4252d7d 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -17,4 +17,4 @@ help: # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/README.md b/docs/README.md index b04c909..f8d8ce9 100644 --- a/docs/README.md +++ b/docs/README.md @@ -5,7 +5,7 @@ To compile the docs, first ensure that Sphinx and the ReadTheDocs theme are inst ```bash -conda install sphinx sphinx_rtd_theme +conda install sphinx sphinx_rtd_theme ``` @@ -14,11 +14,10 @@ Once installed, you can use the `Makefile` in this directory to compile static H make html ``` -The compiled docs will be in the `_build` directory and can be viewed by opening `index.html` (which may itself +The compiled docs will be in the `_build` directory and can be viewed by opening `index.html` (which may itself be inside a directory called `html/` depending on what version of Sphinx is installed). A configuration file for [Read The Docs](https://readthedocs.org/) (readthedocs.yaml) is included in the top level of the repository. To use Read the Docs to host your documentation, go to https://readthedocs.org/ and connect this repository. You may need to change your default branch to `main` under Advanced Settings for the project. If you would like to use Read The Docs with `autodoc` (included automatically) and your package has dependencies, you will need to include those dependencies in your documentation yaml file (`docs/requirements.yaml`). - diff --git a/docs/_static/README.md b/docs/_static/README.md index 2f0cf84..122b610 100644 --- a/docs/_static/README.md +++ b/docs/_static/README.md @@ -1,11 +1,11 @@ # Static Doc Directory Add any paths that contain custom static files (such as style sheets) here, -relative to the `conf.py` file's directory. +relative to the `conf.py` file's directory. They are copied after the builtin static files, so a file named "default.css" will overwrite the builtin "default.css". 
-The path to this folder is set in the Sphinx `conf.py` file in the line: +The path to this folder is set in the Sphinx `conf.py` file in the line: ```python templates_path = ['_static'] ``` diff --git a/docs/_templates/README.md b/docs/_templates/README.md index 3f4f804..485f82a 100644 --- a/docs/_templates/README.md +++ b/docs/_templates/README.md @@ -1,11 +1,11 @@ # Templates Doc Directory -Add any paths that contain templates here, relative to +Add any paths that contain templates here, relative to the `conf.py` file's directory. They are copied after the builtin template files, so a file named "page.html" will overwrite the builtin "page.html". -The path to this folder is set in the Sphinx `conf.py` file in the line: +The path to this folder is set in the Sphinx `conf.py` file in the line: ```python html_static_path = ['_templates'] ``` diff --git a/docs/conf.py b/docs/conf.py index 9d4eaf2..ee22523 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -18,7 +18,7 @@ sys.path.insert(0, os.path.abspath("..")) -import red +import red # noqa: F401 # -- Project information ----------------------------------------------------- @@ -124,7 +124,7 @@ # -- Options for LaTeX output ------------------------------------------------ -latex_elements = { +latex_elements = { # type: ignore[var-annotated] # The paper size ('letterpaper' or 'a4paper'). # # 'papersize': 'letterpaper', diff --git a/docs/getting_started.rst b/docs/getting_started.rst index c7b5bdc..de97f9e 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -1,4 +1,4 @@ Getting Started =============== -This page details how to get started with red. +This page details how to get started with red. diff --git a/docs/requirements.yaml b/docs/requirements.yaml index 939a290..946c491 100644 --- a/docs/requirements.yaml +++ b/docs/requirements.yaml @@ -9,4 +9,3 @@ dependencies: # Pip-only installs #- pip: - diff --git a/pyproject.toml b/pyproject.toml index 5ec12d1..e80a7f4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,11 +36,34 @@ dependencies = [ [project.optional-dependencies] test = [ + "mypy", "pytest>=6.1.2", "pytest-runner", "pytest-env", + "ruff", + "pre-commit", + "types-PyYAML", ] + +[tool.mypy] +exclude = 'tests/' +follow_imports = "silent" +ignore_missing_imports = true +strict = true +disable_error_code = "unused-ignore" + +[tool.ruff] +line-length = 100 + +[tool.ruff.lint] +ignore = ["PLR", "PLW", "C901"] +select = ["B","C","E","F","W","B9"] + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["F401"] +"red/tests/*.py" = ["F401", "F811"] + [tool.setuptools] # This subkey is a beta stage development and keys may change in the future, see https://setuptools.pypa.io/en/latest/userguide/pyproject_config.html for more details # diff --git a/readthedocs.yml b/readthedocs.yml index 69d6db5..95b50ae 100644 --- a/readthedocs.yml +++ b/readthedocs.yml @@ -12,4 +12,4 @@ python: path: . 
conda: - environment: docs/requirements.yaml \ No newline at end of file + environment: docs/requirements.yaml diff --git a/red/__init__.py b/red/__init__.py index 25c61a0..8bd34e8 100644 --- a/red/__init__.py +++ b/red/__init__.py @@ -2,10 +2,14 @@ # Add imports here from ._version import __version__ -from .data import * -from .equilibration import * -from .ess import * -from .gelman_rubin import * -from .plot import * -from .sse import * -from .variance import * +from .equilibration import ( + detect_equilibration_init_seq, + detect_equilibration_paired_t_test, + detect_equilibration_window, +) +from .variance import ( + get_variance_initial_sequence, + get_variance_series_initial_sequence, + get_variance_series_window, + get_variance_window, +) diff --git a/red/_validation.py b/red/_validation.py index 91b9fcb..9381ffd 100644 --- a/red/_validation.py +++ b/red/_validation.py @@ -3,11 +3,14 @@ from warnings import warn as _warn import numpy as _np +import numpy.typing as _npt from ._exceptions import InvalidInputError -def check_data(data: _np.ndarray, one_dim_allowed: bool = False) -> _np.ndarray: +def check_data( + data: _npt.NDArray[_np.float64], one_dim_allowed: bool = False +) -> _npt.NDArray[_np.float64]: """ Assert that data passed is a numpy array where the first dimension is the number of chains and @@ -48,8 +51,10 @@ def check_data(data: _np.ndarray, one_dim_allowed: bool = False) -> _np.ndarray: n_chains, n_samples = data.shape if n_chains > n_samples: _warn( - "Data has more chains than samples. This is allowed but may not be what you intended.", + "Data has more chains than samples. This is allowed but may not" + " be what you intended.", RuntimeWarning, + stacklevel=2, ) # If the array is one dimensional, reshape it to (1, n_samples). diff --git a/red/_version.py b/red/_version.py index 81d8fc8..3dc1f76 100644 --- a/red/_version.py +++ b/red/_version.py @@ -1 +1 @@ -__version__ = "0.1.0+34.g69584b1.dirty" +__version__ = "0.1.0" diff --git a/red/equilibration.py b/red/equilibration.py index 38f0c6c..4e3c097 100644 --- a/red/equilibration.py +++ b/red/equilibration.py @@ -8,6 +8,7 @@ import matplotlib.pyplot as _plt import numpy as _np +import numpy.typing as _npt from matplotlib import gridspec as _gridspec from scipy.stats import ttest_rel as _ttest_rel @@ -19,8 +20,8 @@ def detect_equilibration_init_seq( - data: _np.ndarray, - times: _Optional[_np.ndarray] = None, + data: _npt.NDArray[_np.float64], + times: _Optional[_npt.NDArray[_np.float64]] = None, method: str = "min_sse", sequence_estimator: str = "initial_convex", min_max_lag_time: int = 3, @@ -116,15 +117,13 @@ def detect_equilibration_init_seq( valid_methods = ["min_sse", "max_ess"] method = method.lower() if method not in valid_methods: - raise InvalidInputError( - f"method must be one of {valid_methods}, but got {method}." - ) + raise InvalidInputError(f"method must be one of {valid_methods}, but got {method}.") # If times is None, units of time are indices. if times is None: time_units = "index" # Convert times to indices. - times_valid: _np.ndarray = _np.arange(n_samples) + times_valid: _npt.NDArray[_np.float64] = _np.arange(n_samples, dtype=_np.float64) else: # To satisfy type checking. 
times_valid = times @@ -187,10 +186,10 @@ def detect_equilibration_init_seq( def detect_equilibration_window( - data: _np.ndarray, - times: _Optional[_np.ndarray] = None, + data: _npt.NDArray[_np.float64], + times: _Optional[_npt.NDArray[_np.float64]] = None, method: str = "min_sse", - kernel: _Callable[[int], _np.ndarray] = _np.bartlett, # type: ignore + kernel: _Callable[[int], _npt.NDArray[_np.float64]] = _np.bartlett, # type: ignore window_size_fn: _Optional[_Callable[[int], int]] = lambda x: round(x**0.5), window_size: _Optional[int] = None, frac_padding: float = 0.1, @@ -277,7 +276,7 @@ def detect_equilibration_window( if times is None: time_units = "index" # Convert times to indices. - times_valid: _np.ndarray = _np.arange(n_samples) + times_valid: _npt.NDArray[_np.float64] = _np.arange(n_samples, dtype=_np.float64) else: # To satisfy type checking. times_valid = times @@ -339,14 +338,14 @@ def detect_equilibration_window( def get_paired_t_p_timeseries( - data: _np.ndarray, - times: _Optional[_np.ndarray] = None, + data: _npt.NDArray[_np.float64], + times: _Optional[_npt.NDArray[_np.float64]] = None, fractional_block_size: float = 0.125, fractional_test_end: float = 0.5, initial_block_size: float = 0.1, final_block_size: float = 0.5, t_test_sidedness: str = "two-sided", -) -> _Tuple[_np.ndarray, _np.ndarray]: +) -> _Tuple[_npt.NDArray[_np.float64], _npt.NDArray[_np.float64]]: """ Get a timeseries of the p-values from a paired t-test on the differences between sample means between intial and final portions of the data. The timeseries @@ -396,13 +395,11 @@ def get_paired_t_p_timeseries( # Convert times to indices if necessary. if times is None: - times = _np.arange(n_samples) + times = _np.arange(n_samples, dtype=_np.float64) # Check that times is match the number of samples. if n_samples != len(times): - raise InvalidInputError( - "Times must have the same length as the number of samples." - ) + raise InvalidInputError("Times must have the same length as the number of samples.") # Check that user inputs are valid. if fractional_block_size <= 0 or fractional_block_size > 1: @@ -418,9 +415,7 @@ def get_paired_t_p_timeseries( # Check that fractional test end is a multiple of fractional block size. if round((fractional_test_end / fractional_block_size) % 1.0, 3) != 0: - raise InvalidInputError( - "fractional_test_end must be a multiple of fractional_block_size." - ) + raise InvalidInputError("fractional_test_end must be a multiple of fractional_block_size.") if initial_block_size <= 0 or initial_block_size > 1: raise InvalidInputError("initial_block_size must be between 0 and 1.") @@ -435,9 +430,7 @@ def get_paired_t_p_timeseries( ) # Calculate the number of repeats. - n_repeats = ( - round(fractional_test_end / fractional_block_size) + 1 - ) # + 1 for the initial block + n_repeats = round(fractional_test_end / fractional_block_size) + 1 # + 1 for the initial block # Calculate the number of samples to discard between repeats. n_discard = round(n_samples * fractional_block_size) @@ -455,16 +448,14 @@ def get_paired_t_p_timeseries( # Get the number of samples in the truncated data. n_truncated_samples = truncated_data.shape[1] # Get the initial and final blocks. 
- initial_block = truncated_data[ - :, : round(n_truncated_samples * initial_block_size) - ].mean(axis=1) - final_block = truncated_data[ - :, -round(n_truncated_samples * final_block_size) : - ].mean(axis=1) + initial_block = truncated_data[:, : round(n_truncated_samples * initial_block_size)].mean( + axis=1 + ) + final_block = truncated_data[:, -round(n_truncated_samples * final_block_size) :].mean( + axis=1 + ) # Compute the paired t-test. - p_vals[i] = _ttest_rel( - initial_block, final_block, alternative=t_test_sidedness - )[1] + p_vals[i] = _ttest_rel(initial_block, final_block, alternative=t_test_sidedness)[1] p_val_indices[i] = idx time_vals[i] = times[idx] @@ -472,8 +463,8 @@ def get_paired_t_p_timeseries( def detect_equilibration_paired_t_test( - data: _np.ndarray, - times: _Optional[_np.ndarray] = None, + data: _npt.NDArray[_np.float64], + times: _Optional[_npt.NDArray[_np.float64]] = None, p_threshold: float = 0.05, fractional_block_size: float = 0.125, fractional_test_end: float = 0.5, @@ -552,7 +543,7 @@ def detect_equilibration_paired_t_test( if times is None: time_units = "index" # Convert times to indices. - times = _np.arange(n_samples) + times = _np.arange(n_samples, dtype=_np.float64) # Check that user options (not checked in get_paired_t_p_timeseries) are valid. if p_threshold <= 0 or p_threshold > 0.7: @@ -595,4 +586,4 @@ def detect_equilibration_paired_t_test( fig.savefig(str(plot_name), dpi=300, bbox_inches="tight") - return equil_time + return equil_time # type: ignore[no-any-return] diff --git a/red/ess.py b/red/ess.py index 4824c3d..8393cc9 100644 --- a/red/ess.py +++ b/red/ess.py @@ -5,6 +5,7 @@ from typing import Tuple as _Tuple import numpy as _np +import numpy.typing as _npt from ._validation import check_data from .sse import get_sse_series_init_seq as _get_sse_series_init_seq @@ -13,8 +14,8 @@ def convert_sse_series_to_ess_series( - data: _np.ndarray, sse_series: _np.ndarray -) -> _np.ndarray: + data: _npt.NDArray[_np.float64], sse_series: _npt.NDArray[_np.float64] +) -> _npt.NDArray[_np.float64]: """ Convert a series of squared standard errors to a series of effective sample sizes. @@ -45,13 +46,13 @@ def convert_sse_series_to_ess_series( def get_ess_series_init_seq( - data: _np.ndarray, + data: _npt.NDArray[_np.float64], sequence_estimator: str = "initial_convex", min_max_lag_time: int = 3, max_max_lag_time: _Optional[int] = None, smooth_lag_times: bool = False, frac_padding: float = 0.1, -) -> _Tuple[_np.ndarray, _np.ndarray]: +) -> _Tuple[_npt.NDArray[_np.float64], _npt.NDArray[_np.float64]]: """ Compute a series of effective sample sizes for a time series as data is discarded from the beginning of the time series. The autocorrelation @@ -110,11 +111,11 @@ def get_ess_series_init_seq( def get_ess_series_window( - data: _np.ndarray, - kernel: _Callable[[int], _np.ndarray] = _np.bartlett, # type: ignore + data: _npt.NDArray[_np.float64], + kernel: _Callable[[int], _npt.NDArray[_np.float64]] = _np.bartlett, # type: ignore window_size_fn: _Optional[_Callable[[int], int]] = lambda x: round(x**0.5), window_size: _Optional[int] = None, -) -> _Tuple[_np.ndarray, _np.ndarray]: +) -> _Tuple[_npt.NDArray[_np.float64], _npt.NDArray[_np.float64]]: """ Compute a series of effective sample sizes for a time series as data is discarded from the beginning of the time series. 
The squared standard @@ -153,7 +154,7 @@ def get_ess_series_window( return ess_series, max_lag_times -def statistical_inefficiency_inter_variance(data: _np.ndarray) -> float: +def statistical_inefficiency_inter_variance(data: _npt.NDArray[_np.float64]) -> float: """ Compute the statistical inefficiency of a time series by dividing the inter-run variance estimate by the intra-run variance estimate. @@ -177,7 +178,7 @@ def statistical_inefficiency_inter_variance(data: _np.ndarray) -> float: def statistical_inefficiency_lugsail_variance( - data: _np.ndarray, n_pow: float = 1 / 3 + data: _npt.NDArray[_np.float64], n_pow: float = 1 / 3 ) -> float: """ Compute the statistical inefficiency of a time series by dividing @@ -205,7 +206,7 @@ def statistical_inefficiency_lugsail_variance( return max(g, 1) -def ess_inter_variance(data: _np.ndarray) -> float: +def ess_inter_variance(data: _npt.NDArray[_np.float64]) -> float: """ Compute the effective sample size of a time series by dividing the total number of samples by the statistical inefficiency, where @@ -230,7 +231,7 @@ def ess_inter_variance(data: _np.ndarray) -> float: return total_samples / statistical_inefficiency_inter_variance(data) -def ess_lugsail_variance(data: _np.ndarray, n_pow: float = 1 / 3) -> float: +def ess_lugsail_variance(data: _npt.NDArray[_np.float64], n_pow: float = 1 / 3) -> float: """ Compute the effective sample size of a time series by dividing the total number of samples by the statistical inefficiency, where diff --git a/red/gelman_rubin.py b/red/gelman_rubin.py index 3702d6c..e66c202 100644 --- a/red/gelman_rubin.py +++ b/red/gelman_rubin.py @@ -1,12 +1,21 @@ """Compute the Gelman-Rubin diagnostic.""" import numpy as _np +import numpy.typing as _npt -from ._validation import check_data -from .variance import inter_run_variance, intra_run_variance, lugsail_variance +from ._validation import check_data as _check_data +from .variance import ( + inter_run_variance as _inter_run_variance, +) +from .variance import ( + intra_run_variance as _intra_run_variance, +) +from .variance import ( + lugsail_variance as _lugsail_variance, +) -def gelman_rubin(data: _np.ndarray) -> float: +def gelman_rubin(data: _npt.NDArray[_np.float64]) -> float: """ Compute the Gelman-Rubin diagnostic according to equation 4 in Statist. Sci. 36(4): 518-529 @@ -25,12 +34,12 @@ def gelman_rubin(data: _np.ndarray) -> float: The Gelman-Rubin diagnostic. """ # Check that the data is valid. - data = check_data(data, one_dim_allowed=False) + data = _check_data(data, one_dim_allowed=False) _, n_samples = data.shape # Compute the variance estimates. - intra_run_variance_est = intra_run_variance(data) - inter_run_variance_est = inter_run_variance(data) + intra_run_variance_est = _intra_run_variance(data) + inter_run_variance_est = _inter_run_variance(data) # Get combined variance estimate. combined_variance_est = ( @@ -40,10 +49,10 @@ def gelman_rubin(data: _np.ndarray) -> float: # Compute GR diagnostic. gelman_rubin_diagnostic = _np.sqrt(combined_variance_est / intra_run_variance_est) - return gelman_rubin_diagnostic + return gelman_rubin_diagnostic # type: ignore[no-any-return] -def stable_gelman_rubin(data: _np.ndarray, n_pow: float = 1 / 3) -> float: +def stable_gelman_rubin(data: _npt.NDArray[_np.float64], n_pow: float = 1 / 3) -> float: """ Compute the stable Gelman-Rubin diagnostic according to equation 7 in Statist. Sci. 36(4): 518-529 @@ -51,12 +60,12 @@ def stable_gelman_rubin(data: _np.ndarray, n_pow: float = 1 / 3) -> float: a single run. 
""" # Validate the data. - data = check_data(data, one_dim_allowed=True) + data = _check_data(data, one_dim_allowed=True) _, n_samples = data.shape # Compute the variance estimates. - intra_run_variance_est = intra_run_variance(data) - lugsail_variance_est = lugsail_variance(data, n_pow=n_pow) + intra_run_variance_est = _intra_run_variance(data) + lugsail_variance_est = _lugsail_variance(data, n_pow=n_pow) # Get combined variance estimate. combined_variance_est = ( @@ -66,4 +75,4 @@ def stable_gelman_rubin(data: _np.ndarray, n_pow: float = 1 / 3) -> float: # Compute GR diagnostic. gelman_rubin_diagnostic = _np.sqrt(combined_variance_est / intra_run_variance_est) - return gelman_rubin_diagnostic + return gelman_rubin_diagnostic # type: ignore[no-any-return] diff --git a/red/plot.py b/red/plot.py index c86c02e..a029f06 100644 --- a/red/plot.py +++ b/red/plot.py @@ -1,27 +1,28 @@ """Plotting functions.""" +from typing import Any as _Any from typing import List as _List +from typing import Optional as _Optional +from typing import Tuple as _Tuple import matplotlib.pyplot as _plt +import numpy as _np +import numpy.typing as _npt from matplotlib import figure as _figure from matplotlib import gridspec as _gridspec +from matplotlib.artist import Artist as _Artist from matplotlib.axes import Axes as _Axes -_plt.style.use("ggplot") -from typing import Optional as _Optional -from typing import Tuple as _Tuple - -import numpy as _np -from matplotlib.lines import Line2D as _Line2D - from ._exceptions import InvalidInputError from ._validation import check_data +_plt.style.use("ggplot") + def plot_timeseries( ax: _Axes, - data: _np.ndarray, - times: _np.ndarray, + data: _npt.NDArray[_np.float64], + times: _npt.NDArray[_np.float64], n_blocks: int = 100, time_units: str = "ns", y_label: str = r"$\Delta G$ / kcal mol$^{-1}$", @@ -66,13 +67,12 @@ def plot_timeseries( raise InvalidInputError("Times must be one dimensional.") if times.shape[0] != n_samples: - raise InvalidInputError( - "Times must have the same length as the number of samples." - ) + raise InvalidInputError("Times must have the same length as the number of samples.") if n_blocks < 0 or n_blocks > n_samples: raise InvalidInputError( - "n_blocks must be greater than or equal to 0 and less than or equal to the number of samples." + "n_blocks must be greater than or equal to 0 and less than or equal to" + " the number of samples." ) if n_blocks == 0: @@ -115,11 +115,11 @@ def plot_timeseries( def plot_p_values( ax: _Axes, - p_values: _np.ndarray, - times: _np.ndarray, + p_values: _npt.NDArray[_np.float64], + times: _npt.NDArray[_np.float64], p_threshold: float = 0.05, time_units: str = "ns", - threshold_times: _Optional[_np.ndarray] = None, + threshold_times: _Optional[_npt.NDArray[_np.float64]] = None, ) -> None: """ Plot the p-values of the paired t-test. @@ -154,9 +154,7 @@ def plot_p_values( raise InvalidInputError("p_values must be one dimensional.") if p_values.shape[0] != times.shape[0]: - raise InvalidInputError( - "p_values must have the same length as the number of samples." 
- ) + raise InvalidInputError("p_values must have the same length as the number of samples.") if threshold_times is None: threshold_times = times @@ -202,14 +200,14 @@ def plot_p_values( def plot_sse( ax: _Axes, - sse: _np.ndarray, - max_lags: _Optional[_np.ndarray], - window_sizes: _Optional[_np.ndarray], - times: _np.ndarray, + sse: _npt.NDArray[_np.float64], + max_lags: _Optional[_npt.NDArray[_np.float64]], + window_sizes: _Optional[_npt.NDArray[_np.float64]], + times: _npt.NDArray[_np.float64], time_units: str = "ns", variance_y_label: str = r"$\frac{1}{\sigma^2(\Delta G)}$ / kcal$^{-2}$ mol$^2$", reciprocal: bool = True, -) -> _Tuple[_List[_Line2D], _List[str]]: +) -> _Tuple[_List[_Artist], _List[_Any]]: r""" Plot the squared standard error (SSE) estimate against time. @@ -256,14 +254,10 @@ def plot_sse( raise InvalidInputError("sse must be one dimensional.") if sse.shape[0] != times.shape[0]: - raise InvalidInputError( - "sse must have the same length as the number of samples." - ) + raise InvalidInputError("sse must have the same length as the number of samples.") if max_lags is not None and window_sizes is not None: - raise InvalidInputError( - "Only one of max_lags and window_sizes can be supplied." - ) + raise InvalidInputError("Only one of max_lags and window_sizes can be supplied.") # Plot the SSE. to_plot = 1 / sse if reciprocal else sse @@ -272,12 +266,10 @@ def plot_sse( # If lags is not None, plot the lag times on a different y axis. if max_lags is not None or window_sizes is not None: label = "Max Lag Index" if window_sizes is None else "Window Size" - to_plot = max_lags if window_sizes is None else window_sizes + to_plot = max_lags if window_sizes is None else window_sizes # type: ignore ax2 = ax.twinx() # Get the second colour from the colour cycle. - ax2.set_prop_cycle( - color=[_plt.rcParams["axes.prop_cycle"].by_key()["color"][1]] - ) + ax2.set_prop_cycle(color=[_plt.rcParams["axes.prop_cycle"].by_key()["color"][1]]) ax2.plot(times, to_plot, alpha=0.8, label=label) ax2.set_ylabel(label) @@ -308,10 +300,10 @@ def plot_sse( def plot_equilibration_paired_t_test( fig: _figure.Figure, subplot_spec: _gridspec.SubplotSpec, - data: _np.ndarray, - p_values: _np.ndarray, - data_times: _np.ndarray, - p_times: _np.ndarray, + data: _npt.NDArray[_np.float64], + p_values: _npt.NDArray[_np.float64], + data_times: _npt.NDArray[_np.float64], + p_times: _npt.NDArray[_np.float64], p_threshold: float = 0.05, time_units: str = "ns", data_y_label: str = r"$\Delta G$ / kcal mol$^{-1}$", @@ -362,9 +354,7 @@ def plot_equilibration_paired_t_test( """ # We need to split the gridspec into two subplots, one for the time series data (above) # and one for the p-values (below). Share x-axis but not y-axis. 
- gs0 = _gridspec.GridSpecFromSubplotSpec( - 2, 1, subplot_spec=subplot_spec, hspace=0.05 - ) + gs0 = _gridspec.GridSpecFromSubplotSpec(2, 1, subplot_spec=subplot_spec, hspace=0.05) ax_top = fig.add_subplot(gs0[0]) ax_bottom = fig.add_subplot(gs0[1], sharex=ax_top) @@ -406,15 +396,15 @@ def plot_equilibration_paired_t_test( def plot_equilibration_min_sse( fig: _figure.Figure, subplot_spec: _gridspec.SubplotSpec, - data: _np.ndarray, - sse_series: _np.ndarray, - data_times: _np.ndarray, - sse_times: _np.ndarray, - max_lag_series: _Optional[_np.ndarray] = None, - window_size_series: _Optional[_np.ndarray] = None, + data: _npt.NDArray[_np.float64], + sse_series: _npt.NDArray[_np.float64], + data_times: _npt.NDArray[_np.float64], + sse_times: _npt.NDArray[_np.float64], + max_lag_series: _Optional[_npt.NDArray[_np.float64]] = None, + window_size_series: _Optional[_npt.NDArray[_np.float64]] = None, time_units: str = "ns", data_y_label: str = r"$\Delta G$ / kcal mol$^{-1}$", - variance_y_label=r"$\frac{1}{\sigma^2(\Delta G)}$ / kcal$^{-2}$ mol$^2$", + variance_y_label: str = r"$\frac{1}{\sigma^2(\Delta G)}$ / kcal$^{-2}$ mol$^2$", reciprocal: bool = True, ) -> _Tuple[_Axes, _Axes]: r""" @@ -480,9 +470,7 @@ def plot_equilibration_min_sse( # We need to split the gridspec into two subplots, one for the time series data (above) # and one for the p-values (below). Share x-axis but not y-axis. - gs0 = _gridspec.GridSpecFromSubplotSpec( - 2, 1, subplot_spec=subplot_spec, hspace=0.05 - ) + gs0 = _gridspec.GridSpecFromSubplotSpec(2, 1, subplot_spec=subplot_spec, hspace=0.05) ax_top = fig.add_subplot(gs0[0]) ax_bottom = fig.add_subplot(gs0[1], sharex=ax_top) diff --git a/red/sse.py b/red/sse.py index 60ee854..19a1eda 100644 --- a/red/sse.py +++ b/red/sse.py @@ -5,19 +5,20 @@ from typing import Tuple as _Tuple import numpy as _np +import numpy.typing as _npt from ._validation import check_data from .variance import get_variance_series_initial_sequence, get_variance_series_window def get_sse_series_init_seq( - data: _np.ndarray, + data: _npt.NDArray[_np.float64], sequence_estimator: str = "initial_convex", min_max_lag_time: int = 3, max_max_lag_time: _Optional[int] = None, smooth_lag_times: bool = False, frac_padding: float = 0.1, -) -> _Tuple[_np.ndarray, _np.ndarray]: +) -> _Tuple[_npt.NDArray[_np.float64], _npt.NDArray[_np.float64]]: """ Compute a series of squared standard errors for a time series as data is discarded from the beginning of the time series. The squared standard @@ -84,12 +85,12 @@ def get_sse_series_init_seq( def get_sse_series_window( - data: _np.ndarray, - kernel: _Callable[[int], _np.ndarray] = _np.bartlett, # type: ignore + data: _npt.NDArray[_np.float64], + kernel: _Callable[[int], _npt.NDArray[_np.float64]] = _np.bartlett, # type: ignore window_size_fn: _Optional[_Callable[[int], int]] = lambda x: round(x**0.5), window_size: _Optional[int] = None, frac_padding: float = 0.1, -) -> _Tuple[_np.ndarray, _np.ndarray]: +) -> _Tuple[_npt.NDArray[_np.float64], _npt.NDArray[_np.float64]]: """ Compute a series of squared standard errors for a time series as data is discarded from the beginning of the time series. 
The squared standard diff --git a/red/tests/test_equilibration.py b/red/tests/test_equilibration.py index ccac6d0..06a89d1 100644 --- a/red/tests/test_equilibration.py +++ b/red/tests/test_equilibration.py @@ -25,9 +25,7 @@ def test_detect_equilibration_init_seq(example_timeseries, example_times, tmpdir example_timeseries = example_timeseries.mean(axis=0) # Compute the equilibration index. - equil_idx, equil_g, equil_ess = detect_equilibration_init_seq( - data=example_timeseries - ) + equil_idx, equil_g, equil_ess = detect_equilibration_init_seq(data=example_timeseries) # Check that the equilibration index is correct. assert equil_idx == 398 diff --git a/red/tests/test_plot.py b/red/tests/test_plot.py index af9f999..6403ffd 100644 --- a/red/tests/test_plot.py +++ b/red/tests/test_plot.py @@ -61,9 +61,7 @@ def test_plot_sse(example_timeseries, example_times): fig, ax = plt.subplots() # Compute the SSE for the example timeseries. - sse_vals, lag_times = get_sse_series_init_seq( - data=example_timeseries, smooth_lag_times=True - ) + sse_vals, lag_times = get_sse_series_init_seq(data=example_timeseries, smooth_lag_times=True) times = example_times[: len(sse_vals)] # Plot the SSE. @@ -116,9 +114,7 @@ def test_plot_equilibration_min_sse(example_timeseries, example_times): gridspec_obj = gridspec.GridSpec(1, 1, figure=fig) # Compute the SSE for the example timeseries. - sse_vals, _ = get_sse_series_init_seq( - data=example_timeseries_mean, smooth_lag_times=True - ) + sse_vals, _ = get_sse_series_init_seq(data=example_timeseries_mean, smooth_lag_times=True) times = example_times[: len(sse_vals)] # Plot the equilibration detection. diff --git a/red/tests/test_variance.py b/red/tests/test_variance.py index 2c9d7d1..e38a7c1 100644 --- a/red/tests/test_variance.py +++ b/red/tests/test_variance.py @@ -43,9 +43,7 @@ def test_get_autocovariance(example_timeseries): assert len(autocovariance) == 11 # Test with different mean. - autocovariance = _get_autocovariance( - example_timeseries, mean=50 - ) # Actual mean 54.9 + autocovariance = _get_autocovariance(example_timeseries, mean=50) # Actual mean 54.9 assert autocovariance[:3] == pytest.approx( np.array([342.65922098, 132.30372161, 120.22912133]), abs=0.01 ) @@ -68,9 +66,7 @@ def test_gamma_cap(example_timeseries): assert gamma_cap[:3] == pytest.approx( np.array([427.04712998, 193.80629572, 183.46922874]), abs=0.01 ) - assert gamma_cap[-3:] == pytest.approx( - np.array([0.54119574, 0.04336649, 0.68390851]), abs=0.01 - ) + assert gamma_cap[-3:] == pytest.approx(np.array([0.54119574, 0.04336649, 0.68390851]), abs=0.01) # Check that an analysis error is raised if input sequence too short. with pytest.raises(AnalysisError): @@ -98,7 +94,8 @@ def test_variance_initial_sequence(example_timeseries): geyer_res = { "initial_positive": 27304.776049686046, "initial_monotone": 24216.32288181667, - "initial_convex": 21904.973713096544, # Actual MCMC result is 21898.6178149005, but very close. + # Actual MCMC result for initial convex is 21898.6178149005, but very close. + "initial_convex": 21904.973713096544, } # Check that the results are correct. @@ -297,9 +294,7 @@ def test_get_variance_series_window(example_timeseries): # Take mean of timeseries. 
example_timeseries = example_timeseries.mean(axis=0) - var_seq = get_variance_series_window( - example_timeseries, window_size=1, window_size_fn=None - )[0] + var_seq = get_variance_series_window(example_timeseries, window_size=1, window_size_fn=None)[0] assert var_seq[0] == pytest.approx(318.6396575172195, abs=0.01) assert var_seq[1] == pytest.approx(318.3955761337024, abs=0.01) assert var_seq[-1] == pytest.approx(243.41472988044396, abs=0.01) @@ -371,14 +366,10 @@ def test_get_variance_series_window_raises(example_timeseries): ) with pytest.raises(InvalidInputError): - get_variance_series_window( - example_timeseries, window_size=None, window_size_fn=None - ) + get_variance_series_window(example_timeseries, window_size=None, window_size_fn=None) with pytest.raises(InvalidInputError): - get_variance_series_window( - example_timeseries, window_size=None, window_size_fn=4 - ) + get_variance_series_window(example_timeseries, window_size=None, window_size_fn=4) for frac_pad in [-0.1, 1.1]: with pytest.raises(InvalidInputError): @@ -394,9 +385,7 @@ def test_inter_run_variance(gaussian_noise): variance = inter_run_variance(gaussian_noise) # Check that the variance is correct. - assert variance == pytest.approx( - 0.1, abs=0.5 - ) # Wide tolerance due to high variance. + assert variance == pytest.approx(0.1, abs=0.5) # Wide tolerance due to high variance. # Check that it raises an invalid input error if the data is one dimensional. with pytest.raises(InvalidInputError): diff --git a/red/variance.py b/red/variance.py index 5694fc6..ae0ec5b 100644 --- a/red/variance.py +++ b/red/variance.py @@ -6,7 +6,7 @@ - Initial sequence methods (see Geyer, 1992: https://www.jstor.org/stable/2246094) - Window estimators (see summary in see Geyer, 1992: https://www.jstor.org/stable/2246094) -Did not implement overlapping batch means (see Meketon and Schmeiser, 1984: +Did not implement overlapping batch means (see Meketon and Schmeiser, 1984: https://repository.lib.ncsu.edu/bitstream/handle/1840.4/7707/1984_0041.pdf?sequence=1), as this is equivalent to using a Bartlett window. @@ -18,8 +18,9 @@ from typing import Union as _Union from warnings import warn as _warn -import numpy as _np import numba as _numba +import numpy as _np +import numpy.typing as _npt from statsmodels.tsa.stattools import acovf as _acovf from ._exceptions import AnalysisError, InvalidInputError @@ -28,8 +29,11 @@ ####### Private functions ####### # No need to thoroughly validate input as this is done in the public functions. -@_numba.njit -def _compute_autocovariance_no_fft(data: _np.ndarray, max_lag: int) -> _np.ndarray: + +@_numba.njit(cache=True) # type: ignore +def _compute_autocovariance_no_fft( + data: _npt.NDArray[_np.float64], max_lag: int +) -> _npt.NDArray[_np.float64]: """ Calculate the auto-covariance as a function of lag time for a time series. Avoids using statsmodel's acovf function as using numpy's dot function and jit @@ -65,7 +69,10 @@ def _compute_autocovariance_no_fft(data: _np.ndarray, max_lag: int) -> _np.ndarr return auto_cov -def _compute_autocovariance_fft(data: _np.ndarray, max_lag: int) -> _np.ndarray: + +def _compute_autocovariance_fft( + data: _npt.NDArray[_np.float64], max_lag: int +) -> _npt.NDArray[_np.float64]: """ Calculate the autocovariance using the FFT method, as implemented in statsmodels. 
Note that we can speed this up for large arrays by rewriting to directly use numpy's fft @@ -85,14 +92,15 @@ def _compute_autocovariance_fft(data: _np.ndarray, max_lag: int) -> _np.ndarray: numpy.ndarray The auto-correlation function of the time series. """ - return _acovf(data, adjusted=False, nlag=max_lag, fft=True, demean=False) + return _acovf(data, adjusted=False, nlag=max_lag, fft=True, demean=False) # type: ignore[no-any-return] + def _get_autocovariance( - data: _np.ndarray, + data: _npt.NDArray[_np.float64], max_lag: _Union[None, int] = None, mean: _Union[None, float] = None, - fft: bool = False -) -> _np.ndarray: + fft: bool = False, +) -> _npt.NDArray[_np.float64]: """ Calculate the auto-covariance as a function of lag time for a time series. @@ -139,11 +147,13 @@ def _get_autocovariance( # FFT is faster for large arrays and slower for shorter arrays. compute_autocov_fn = _compute_autocovariance_fft if fft else _compute_autocovariance_no_fft - return compute_autocov_fn(data, max_lag) + return compute_autocov_fn(data, max_lag) # type: ignore[no-any-return] -@_numba.njit -def _get_gamma_cap(autocov_series: _np.ndarray) -> _np.ndarray: +@_numba.njit(cache=True) # type: ignore +def _get_gamma_cap( + autocov_series: _npt.NDArray[_np.float64], +) -> _npt.NDArray[_np.float64]: """ Compute the capitial gamma function from the auto-covariance function. @@ -177,11 +187,11 @@ def _get_gamma_cap(autocov_series: _np.ndarray) -> _np.ndarray: return gamma -@_numba.njit +@_numba.njit(cache=True) # type: ignore def _get_initial_positive_sequence( - gamma_cap: _np.ndarray, + gamma_cap: _npt.NDArray[_np.float64], min_max_lag_time: int = 3, -) -> _np.ndarray: +) -> _npt.NDArray[_np.float64]: """ " Get the initial positive sequence from the capital gamma function of a time series. See Geyer, 1992: https://www.jstor.org/stable/2246094. @@ -213,11 +223,11 @@ def _get_initial_positive_sequence( return gamma_cap -@_numba.njit +@_numba.njit(cache=True) # type: ignore def _get_initial_monotone_sequence( - gamma_cap: _np.ndarray, + gamma_cap: _npt.NDArray[_np.float64], min_max_lag_time: int = 3, -) -> _np.ndarray: +) -> _npt.NDArray[_np.float64]: """ Get the initial monotone sequence from the capital gamma function of a time series. See Geyer, 1992: https://www.jstor.org/stable/2246094. @@ -239,9 +249,7 @@ def _get_initial_monotone_sequence( gamma_cap = gamma_cap.copy() # Get the initial positive sequence. - gamma_cap = _get_initial_positive_sequence( - gamma_cap, min_max_lag_time=min_max_lag_time - ) + gamma_cap = _get_initial_positive_sequence(gamma_cap, min_max_lag_time=min_max_lag_time) # Now reduce to the initial monotone sequence. for t in range(gamma_cap.shape[0] - 1): @@ -251,11 +259,11 @@ def _get_initial_monotone_sequence( return gamma_cap -@_numba.njit +@_numba.njit(cache=True) # type: ignore def _get_initial_convex_sequence( - gamma_cap: _np.ndarray, + gamma_cap: _npt.NDArray[_np.float64], min_max_lag_time: int = 3, -) -> _np.ndarray: +) -> _npt.NDArray[_np.float64]: """ Get the initial convex sequence from the capital gamma function of a time series. See Geyer, 1992: https://www.jstor.org/stable/2246094. @@ -284,9 +292,7 @@ def _get_initial_convex_sequence( gamma_con = gamma_cap.copy() # Get initial monotone sequence. - gamma_con = _get_initial_monotone_sequence( - gamma_con, min_max_lag_time=min_max_lag_time - ) + gamma_con = _get_initial_monotone_sequence(gamma_con, min_max_lag_time=min_max_lag_time) # Get the length of gamma_con. 
len_gamma = gamma_con.shape[0] @@ -333,10 +339,10 @@ def _get_initial_convex_sequence( def _get_autocovariance_window( - data: _np.ndarray, - kernel: _Callable[[int], _np.ndarray] = _np.bartlett, # type: ignore + data: _npt.NDArray[_np.float64], + kernel: _Callable[[int], _npt.NDArray[_np.float64]] = _np.bartlett, # type: ignore window_size: int = 10, -) -> _np.ndarray: +) -> _npt.NDArray[_np.float64]: """ Calculate the autocovariance of a time series using window estimators. @@ -360,9 +366,7 @@ def _get_autocovariance_window( """ n_runs, n_samples = data.shape if n_samples < window_size: - raise InvalidInputError( - "Window size is greater than the length of the time series." - ) + raise InvalidInputError("Window size is greater than the length of the time series.") # Get the window function. Need to truncate as numpy functions return # symmetric windows and we only want the forwards part. @@ -380,10 +384,12 @@ def _get_autocovariance_window( ) # Get the windowed autocovariance. - return autocov[: window_size + 1] * window + return autocov[: window_size + 1] * window # type: ignore[no-any-return] -def _smoothen_max_lag_times(max_lag_times: _np.ndarray) -> _np.ndarray: +def _smoothen_max_lag_times( + max_lag_times: _npt.NDArray[_np.float64], +) -> _npt.NDArray[_np.float64]: """ Smoothen a list of maximum lag times by a) converting them to a monotinically decreasing sequence and b) linearly interpolating between points where the sequence @@ -400,14 +406,10 @@ def _smoothen_max_lag_times(max_lag_times: _np.ndarray) -> _np.ndarray: The smoothened maximum lag times. """ # Get a monotinically decreasing sequence. - max_lag_times_monotonic = _get_initial_monotone_sequence( - max_lag_times, min_max_lag_time=0 - ) + max_lag_times_monotonic = _get_initial_monotone_sequence(max_lag_times, min_max_lag_time=0) # Get the indices where the sequence changes. - change_indices = _np.where( - max_lag_times_monotonic[:-1] != max_lag_times_monotonic[1:] - )[0] + change_indices = _np.where(max_lag_times_monotonic[:-1] != max_lag_times_monotonic[1:])[0] # Get the indices immediately after the change. change_indices = _np.concatenate((_np.array([0]), change_indices + 1)) @@ -430,12 +432,12 @@ def _smoothen_max_lag_times(max_lag_times: _np.ndarray) -> _np.ndarray: def get_variance_initial_sequence( - data: _np.ndarray, + data: _npt.NDArray[_np.float64], sequence_estimator: str = "initial_convex", min_max_lag_time: int = 3, max_max_lag_time: _Optional[int] = None, - autocov: _Optional[_np.ndarray] = None, -) -> _Tuple[float, int, _np.ndarray]: + autocov: _Optional[_npt.NDArray[_np.float64]] = None, +) -> _Tuple[float, int, _npt.NDArray[_np.float64]]: """ Calculate the variance of a time series using initial sequence methods. See Geyer, 1992: https://www.jstor.org/stable/2246094. @@ -492,9 +494,7 @@ def get_variance_initial_sequence( # Check that the minimum maximum lag time is valid. if min_max_lag_time < 0: - raise InvalidInputError( - "Minimum maximum lag time must be greater than or equal to 0." - ) + raise InvalidInputError("Minimum maximum lag time must be greater than or equal to 0.") if min_max_lag_time > n_samples - 1: raise InvalidInputError( @@ -504,9 +504,7 @@ def get_variance_initial_sequence( # Make sure that the maximum lag time is valid. if max_max_lag_time is not None: if max_max_lag_time < 0: - raise InvalidInputError( - "Maximum lag time must be greater than or equal to 0." 
- ) + raise InvalidInputError("Maximum lag time must be greater than or equal to 0.") if max_max_lag_time > n_samples - 1: raise InvalidInputError( @@ -522,7 +520,8 @@ def get_variance_initial_sequence( if autocov is not None: if max_max_lag_time > autocov.shape[0] - 1: raise InvalidInputError( - "Maximum lag time must be less than or equal to the length of the autocovariance function minus 1." + "Maximum lag time must be less than or equal to the length of the" + "autocovariance function minus 1." ) # Check that autocov_series is valid. @@ -551,7 +550,10 @@ def get_variance_initial_sequence( autocov_valid = _np.mean( [ _get_autocovariance( - data[run], mean=data.mean(), max_lag=max_max_lag_time, fft=True, + data[run], + mean=data.mean(), + max_lag=max_max_lag_time, + fft=True, ) for run in range(n_runs) ], @@ -565,9 +567,7 @@ def get_variance_initial_sequence( # first negative value, if this exists. if sequence_estimator == "positive": sub_zero_idxs = _np.where(autocov_valid < 0)[0] - truncate_idx = ( - sub_zero_idxs[0] if sub_zero_idxs.size > 0 else len(autocov_valid) - ) + truncate_idx = sub_zero_idxs[0] if sub_zero_idxs.size > 0 else len(autocov_valid) # Limit the truncate in autocov_valid = autocov_valid[:truncate_idx] var_cor = autocov_valid.sum() * 2 - var @@ -584,9 +584,7 @@ def get_variance_initial_sequence( "initial_monotone": _get_initial_monotone_sequence, "initial_convex": _get_initial_convex_sequence, } - gamma_cap = variance_fns[sequence_estimator]( - gamma_cap, min_max_lag_time=min_max_lag_time - ) + gamma_cap = variance_fns[sequence_estimator](gamma_cap, min_max_lag_time=min_max_lag_time) var_cor = gamma_cap.sum() * 2 - var # Make sure that the variance is not negative. @@ -599,13 +597,13 @@ def get_variance_initial_sequence( def get_variance_series_initial_sequence( - data: _np.ndarray, + data: _npt.NDArray[_np.float64], sequence_estimator: str = "initial_convex", min_max_lag_time: int = 3, max_max_lag_time: _Optional[int] = None, smooth_lag_times: bool = False, frac_padding: float = 0.1, -) -> _Tuple[_np.ndarray, _np.ndarray]: +) -> _Tuple[_npt.NDArray[_np.float64], _npt.NDArray[_np.float64]]: """ Repeatedly calculate the variance of a time series while discarding increasing numbers of samples from the start of the time series. The variance is calculated @@ -659,7 +657,8 @@ def get_variance_series_initial_sequence( if frac_padding > 0.5: _warn( - "Percent padding is greater than 0.5. You are evaluating less than half of the data." + "Percent padding is greater than 0.5. You are evaluating less than half of the data.", + stacklevel=2, ) # Calculate the maximum index to use when discarding samples. @@ -730,8 +729,8 @@ def get_variance_series_initial_sequence( def get_variance_window( - data: _np.ndarray, - kernel: _Callable[[int], _np.ndarray] = _np.bartlett, # type: ignore + data: _npt.NDArray[_np.float64], + kernel: _Callable[[int], _npt.NDArray[_np.float64]] = _np.bartlett, # type: ignore window_size: int = 10, ) -> float: """ @@ -786,16 +785,16 @@ def get_variance_window( corr_var = autocov.sum() * 2 - var # Make sure that the variance is not less than the uncorrelated value. 
- return max(corr_var, var) + return max(corr_var, var) # type: ignore[no-any-return] def get_variance_series_window( - data: _np.ndarray, - kernel: _Callable[[int], _np.ndarray] = _np.bartlett, # type: ignore + data: _npt.NDArray[_np.float64], + kernel: _Callable[[int], _npt.NDArray[_np.float64]] = _np.bartlett, # type: ignore window_size_fn: _Optional[_Callable[[int], int]] = lambda x: round(x**0.5), window_size: _Optional[int] = None, frac_padding: float = 0.1, -) -> _Tuple[_np.ndarray, _np.ndarray]: +) -> _Tuple[_npt.NDArray[_np.float64], _npt.NDArray[_np.float64]]: """ Repeatedly calculate the variance of a time series while discarding increasing numbers of samples from the start of the time series. The variance is calculated @@ -837,14 +836,10 @@ def get_variance_series_window( # Check that only one of window_size_fn and window_size is not None. if window_size_fn is not None and window_size is not None: - raise InvalidInputError( - "Only one of window_size_fn and window_size can be not None." - ) + raise InvalidInputError("Only one of window_size_fn and window_size can be not None.") if window_size_fn is None and window_size is None: - raise InvalidInputError( - "One of window_size_fn and window_size must be not None." - ) + raise InvalidInputError("One of window_size_fn and window_size must be not None.") if window_size_fn is not None: # Check that the window size function is valid. @@ -857,13 +852,12 @@ def get_variance_series_window( if frac_padding > 0.5: _warn( - "Percent padding is greater than 0.5. You are evaluating less than half of the data." + "Percent padding is greater than 0.5. You are evaluating less than half of the data.", + stacklevel=2, ) # Calculate the maximum index to use when discarding samples. - max_index = ( - n_samples - 1 - window_size if window_size is not None else n_samples - 2 - ) + max_index = n_samples - 1 - window_size if window_size is not None else n_samples - 2 # See if we need to truncate the max index even further based on the percent padding. if frac_padding > 0: @@ -875,18 +869,18 @@ def get_variance_series_window( window_size_series = _np.zeros(max_index + 1, dtype=int) for index in range(max_index + 1): - window_size = ( - window_size_fn(n_samples - index) if window_size_fn else window_size - ) + window_size = window_size_fn(n_samples - index) if window_size_fn else window_size variance_series[index] = get_variance_window( - data[:, index:], kernel=kernel, window_size=window_size # type: ignore + data[:, index:], + kernel=kernel, + window_size=window_size, # type: ignore ) window_size_series[index] = window_size return variance_series, window_size_series -def replicated_batch_means_variance(data: _np.ndarray, batch_size: int) -> float: +def replicated_batch_means_variance(data: _npt.NDArray[_np.float64], batch_size: int) -> float: """ Estimate the variance of a time series using the replicated batch means method. See section 3.1 in Statist. Sci. 36(4): 518-529 (November 2021). @@ -911,7 +905,8 @@ def replicated_batch_means_variance(data: _np.ndarray, batch_size: int) -> float n_chains, n_samples = data.shape if batch_size < 1 or batch_size > n_samples: raise InvalidInputError( - f"batch_size must be between 1 and n_samples = {n_samples} (inclusive), but got {batch_size}." + f"batch_size must be between 1 and n_samples = {n_samples} (inclusive)," + f" but got {batch_size}." ) # Compute the number of batches. @@ -929,10 +924,10 @@ def replicated_batch_means_variance(data: _np.ndarray, batch_size: int) -> float # Multiply by the batch size. 
batch_means_variance *= batch_size - return batch_means_variance + return batch_means_variance # type: ignore[no-any-return] -def lugsail_variance(data: _np.ndarray, n_pow: float = 1 / 3) -> float: +def lugsail_variance(data: _npt.NDArray[_np.float64], n_pow: float = 1 / 3) -> float: """ Estimate the variance of a time series using the lugsail method. See section 3.2 in Statist. Sci. 36(4): 518-529 (November 2021). @@ -957,9 +952,7 @@ def lugsail_variance(data: _np.ndarray, n_pow: float = 1 / 3) -> float: # Check that n_pow is valid. if n_pow <= 0 or n_pow > 1: - raise InvalidInputError( - f"n_pow must be between 0 and 1 (inclusive), but got {n_pow}." - ) + raise InvalidInputError(f"n_pow must be between 0 and 1 (inclusive), but got {n_pow}.") # Get the two batch sizes. _, n_samples = data.shape @@ -982,7 +975,7 @@ def lugsail_variance(data: _np.ndarray, n_pow: float = 1 / 3) -> float: return lugsail_variance -def inter_run_variance(data: _np.ndarray) -> float: +def inter_run_variance(data: _npt.NDArray[_np.float64]) -> float: """ Compute the variance based on the inter-run differences between means. @@ -1007,10 +1000,10 @@ def inter_run_variance(data: _np.ndarray) -> float: _, n_samples = data.shape inter_run_variance *= n_samples - return inter_run_variance + return inter_run_variance # type: ignore[no-any-return] -def intra_run_variance(data: _np.ndarray) -> float: +def intra_run_variance(data: _npt.NDArray[_np.float64]) -> float: """ Compute the average intra-run variance estimate. @@ -1033,4 +1026,4 @@ def intra_run_variance(data: _np.ndarray) -> float: # Compute the mean intra-run variance estimate. mean_intra_run_variance = _np.mean(intra_run_variance) - return mean_intra_run_variance + return mean_intra_run_variance # type: ignore[no-any-return]
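
Note for reviewers: the retyped estimators above keep their existing signatures, so the following minimal sketch (not part of the patch) shows one way to exercise them. It assumes the functions remain importable as red.variance.<name> (the updated red/__init__.py is not shown here), and that data is shaped (n_runs, n_samples) as the docstrings state; the synthetic data, batch size, and window size below are arbitrary choices for illustration only.

import numpy as np

from red.variance import (
    get_variance_initial_sequence,
    get_variance_window,
    lugsail_variance,
    replicated_batch_means_variance,
)

# Synthetic, uncorrelated data: 5 replicate runs of 1,000 samples each.
rng = np.random.default_rng(0)
data = rng.normal(size=(5, 1000))

# Geyer initial-sequence estimate: returns the variance estimate, the
# maximum lag time used, and the truncated autocovariance function.
var_seq, max_lag, autocov = get_variance_initial_sequence(
    data, sequence_estimator="initial_convex"
)

# Window estimate with the default Bartlett kernel and a ~sqrt(n) window.
var_win = get_variance_window(data, window_size=round(1000**0.5))

# Replicated batch means and lugsail estimates.
var_rbm = replicated_batch_means_variance(data, batch_size=50)
var_lug = lugsail_variance(data, n_pow=1 / 3)

print(var_seq, var_win, var_rbm, var_lug)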