From 9b578321e7244cdeb86091f87da9c5e331a6735d Mon Sep 17 00:00:00 2001
From: Till Hoffmann
Date: Thu, 14 Dec 2023 10:30:03 +0100
Subject: [PATCH 1/9] Replace non-standard `\parenth` command in markdown.

---
 trees/trees.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/trees/trees.md b/trees/trees.md
index 0257d4e..b69b7c6 100644
--- a/trees/trees.md
+++ b/trees/trees.md
@@ -100,11 +100,11 @@ fig.tight_layout()
 We compare the GP-based inference with a simpler approach that employs a Gaussian filter to smooth the data. The estimate is
 $$
-\hat{\vec{y}}_\lambda = \frac{\vec{g}_\lambda\ast\parenth{\vec{b}\circ\vec{y}}}{\vec{g}_\lambda\ast\vec{b}},
+\hat{\vec{y}}_\lambda = \frac{\vec{g}_\lambda\ast\left(\vec{b}\circ\vec{y}\right)}{\vec{g}_\lambda\ast\vec{b}},
 $$
 where $\ast$ denotes convolution, $\circ$ denotes the elementwise product, $\vec{g}_\lambda$ is a Gaussian filter with smoothing scale $\lambda$, and $\vec{b}$ is the binary mask indicating which data are available for training. Because the Gaussian filter only provides a point estimate, we cannot use the posterior predictive distribution to compare the two approaches. We instead use a scaled mean squared error
 $$
-S\parenth{\vec{y},\hat{\vec{y}}=\exp\hat{\vec{f}}} = \frac{1}{m}\sum_{j=1}^m \frac{\parenth{y_i-\exp \hat f_i}^2}{\max\parenth{y_i,1}},
+S\left(\vec{y},\hat{\vec{y}}=\exp\hat{\vec{f}}\right) = \frac{1}{m}\sum_{i=1}^m \frac{\left(y_i-\exp \hat f_i\right)^2}{\max\left(y_i,1\right)},
 $$
 to compare the held-out data $\vec{y}$ with the prediction $\hat{\vec{y}}$. The scaling ensures large frequencies do not dominate the error measure because they [naturally have a larger variance](https://en.wikipedia.org/wiki/Poisson_distribution#Descriptive_statistics).

From 219722e9715dc20a7eb3e7e6fc84148b58ba6d8d Mon Sep 17 00:00:00 2001
From: Till Hoffmann
Date: Thu, 14 Dec 2023 10:30:19 +0100
Subject: [PATCH 2/9] Install pytest.

---
 requirements.in  | 1 +
 requirements.txt | 9 +++++++++
 2 files changed, 10 insertions(+)

diff --git a/requirements.in b/requirements.in
index 2708623..c337c40 100644
--- a/requirements.in
+++ b/requirements.in
@@ -5,5 +5,6 @@ matplotlib
 jupyter
 jupytext
 pip-tools
+pytest
 scipy
 tabulate

diff --git a/requirements.txt b/requirements.txt
index b31f42f..c399cd6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -66,6 +66,7 @@ exceptiongroup==1.1.3
     # via
     #   anyio
     #   ipython
+    #   pytest
 executing==1.2.0
     # via stack-data
 fastjsonschema==2.18.0
@@ -83,6 +84,8 @@ idna==3.4
     #   anyio
     #   jsonschema
     #   requests
+iniconfig==2.0.0
+    # via pytest
 ipykernel==6.25.2
     # via
     #   jupyter
@@ -230,6 +233,7 @@ packaging==23.1
     # via
     #   jupyterlab-server
     #   matplotlib
     #   nbconvert
+    #   pytest
     #   qtconsole
     #   qtpy
 pandas==2.1.0
@@ -248,6 +252,8 @@ pip-tools==7.3.0
     # via -r requirements.in
 platformdirs==3.10.0
     # via jupyter-core
+pluggy==1.3.0
+    # via pytest
 prometheus-client==0.17.1
     # via jupyter-server
 prompt-toolkit==3.0.39
@@ -274,6 +280,8 @@ pyparsing==3.1.1
     # via matplotlib
 pyproject-hooks==1.0.0
     # via build
+pytest==7.4.3
+    # via -r requirements.in
 python-dateutil==2.8.2
     # via
     #   arrow
@@ -350,6 +358,7 @@ tomli==2.0.1
     #   jupyterlab
     #   pip-tools
     #   pyproject-hooks
+    #   pytest
 tornado==6.3.3
     # via
     #   ipykernel

From 24a03c1164a299661e8b95a86a2ddec4f70bd5c9 Mon Sep 17 00:00:00 2001
From: Till Hoffmann
Date: Thu, 14 Dec 2023 10:59:20 +0100
Subject: [PATCH 3/9] Add `*.ipynb` for each experiment.
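
This patch pairs each markdown notebook with a committed `*.ipynb` twin and
adds a test that keeps the two in sync. As a minimal sketch of the round trip
the test enforces (an editorial illustration using jupytext's Python API on
one of the repository's notebooks; the build recipe itself shells out to the
`jupytext` command line):

    import jupytext

    # Regenerate the .ipynb twin from its markdown source. The new test does
    # the same in a temporary directory and compares the result against the
    # committed notebook after stripping randomly generated cell ids.
    notebook = jupytext.read("kernels/kernels.md")
    jupytext.write(notebook, "kernels/kernels.ipynb")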
--- .github/workflows/main.yaml | 2 + .gitignore | 1 - README.md | 2 +- kernels/kernels.ipynb | 100 ++++++++++ padding/padding.ipynb | 310 +++++++++++++++++++++++++++++ profile/profile.ipynb | 382 ++++++++++++++++++++++++++++++++++++ tests/test_notebooks.py | 26 +++ trees/trees.ipynb | 296 ++++++++++++++++++++++++++++ tube/tube.ipynb | 213 ++++++++++++++++++++ 9 files changed, 1330 insertions(+), 2 deletions(-) create mode 100644 kernels/kernels.ipynb create mode 100644 padding/padding.ipynb create mode 100644 profile/profile.ipynb create mode 100644 tests/test_notebooks.py create mode 100644 trees/trees.ipynb create mode 100644 tube/tube.ipynb diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 9db032c..0e4a102 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -15,6 +15,8 @@ jobs: - uses: "actions/checkout@v3" - name: Build Docker image. run: docker build -t gptools . + - name: Verify notebooks are up to date. + run: docker run --rm -v `pwd`:/workdir gptools pytest -v - name: Generate figures (using ./in-docker.sh but we can't use `-it` in the Action). run: docker run --rm -e FAST=true -v `pwd`:/workdir gptools cook exec figures - name: Upload figures and reports as artifacts. diff --git a/.gitignore b/.gitignore index 07d3457..8646ada 100644 --- a/.gitignore +++ b/.gitignore @@ -3,7 +3,6 @@ .ipynb_checkpoints *.hpp *.html -*.ipynb *.o *.pdf *.pkl diff --git a/README.md b/README.md index f770e2c..e895b7d 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ To ensure the reproducibility of these materials, the results are also computed ## Running the experiments -Figures in the manuscript were generated using the containerized runtime, and all runtime estimates below are based on a 2020 Macbook Pro with M1 chip and 16 GB of memory running macOS 13.4 (22F66). Figures are generated by executing Jupyter notebooks stored in markdown format. All figures can be reproduced by running `[./in-docker.sh] cook exec figures` (see below for details). +Figures in the manuscript were generated using the containerized runtime, and all runtime estimates below are based on a 2020 Macbook Pro with M1 chip and 16 GB of memory running macOS 13.4 (22F66). Figures are generated by executing Jupyter notebooks stored in markdown format; the notebooks can be opened directly in a standard Jupyter environment using the [`jupytext`](https://jupytext.readthedocs.io/en/latest/) extension. If you prefer, the folder for each experiment also contains a corresponding `*.ipynb` file. All figures can be reproduced by running `[./in-docker.sh] cook exec figures` (see below for details). ### Applications diff --git a/kernels/kernels.ipynb b/kernels/kernels.ipynb new file mode 100644 index 0000000..f8a02b3 --- /dev/null +++ b/kernels/kernels.ipynb @@ -0,0 +1,100 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "031e9913", + "metadata": {}, + "outputs": [], + "source": [ + "from gptools.util.kernels import ExpQuadKernel, MaternKernel\n", + "from gptools.util.fft.fft1 import transform_irfft, evaluate_rfft_scale\n", + "import matplotlib as mpl\n", + "from matplotlib import pyplot as plt\n", + "import numpy as np\n", + "from pathlib import Path\n", + "\n", + "mpl.style.use(\"../jss.mplstyle\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "250db79b", + "metadata": {}, + "outputs": [], + "source": [ + "np.random.seed(9) # Seed picked for good legend positioning. 
Works for any though.\n", + "fig, axes = plt.subplots(2, 2)\n", + "length_scale = 0.2\n", + "kernels = {\n", + " \"squared exp.\": lambda period: ExpQuadKernel(1, length_scale, period),\n", + " \"Matern ³⁄₂\": lambda period: MaternKernel(1.5, 1, length_scale, period),\n", + "}\n", + "\n", + "x = np.linspace(0, 1, 101, endpoint=False)\n", + "z = np.random.normal(0, 1, x.size)\n", + "\n", + "for ax, (key, kernel) in zip(axes[1], kernels.items()):\n", + " value = kernel(None).evaluate(0, x[:, None])\n", + " line, = axes[0, 0].plot(x, value, ls=\"--\")\n", + " rfft = kernel(1).evaluate_rfft([x.size])\n", + " value = np.fft.irfft(rfft, x.size)\n", + " axes[0, 1].plot(rfft, label=key)\n", + " axes[0, 0].plot(x, value, color=line.get_color())\n", + "\n", + " for maxf, ls in [(x.size // 2 + 1, \"-\"), (5, \"--\"), (3, \":\")]:\n", + " rfft_scale = evaluate_rfft_scale(cov_rfft=rfft, size=x.size)\n", + " rfft_scale[maxf:] = 0\n", + " f = transform_irfft(z, np.zeros_like(z), rfft_scale=rfft_scale)\n", + " ax.plot(x, f, ls=ls, color=line.get_color(), label=fr\"$\\xi_\\max={maxf}$\")\n", + "\n", + " ax.set_xlabel(\"position $x$\")\n", + " ax.set_ylabel(f\"{key} GP $f$\")\n", + "\n", + "ax = axes[0, 0]\n", + "ax.set_ylabel(\"kernel $k(0,x)$\")\n", + "ax.set_xlabel(\"position $x$\")\n", + "ax.legend([\n", + " mpl.lines.Line2D([], [], ls=\"--\", color=\"gray\"),\n", + " mpl.lines.Line2D([], [], ls=\"-\", color=\"gray\"),\n", + "], [\"standard\", \"periodic\"], fontsize=\"small\")\n", + "ax.text(0.05, 0.05, \"(a)\", transform=ax.transAxes)\n", + "ax.yaxis.set_ticks([0, 0.5, 1])\n", + "\n", + "ax = axes[0, 1]\n", + "ax.set_yscale(\"log\")\n", + "ax.set_ylim(1e-5, x.size)\n", + "ax.set_xlabel(r\"frequency $\\xi$\")\n", + "ax.set_ylabel(r\"Fourier kernel $\\tilde k=\\phi\\left(k\\right)$\")\n", + "ax.legend(fontsize=\"small\", loc=\"center right\")\n", + "ax.text(0.95, 0.95, \"(b)\", transform=ax.transAxes, ha=\"right\", va=\"top\")\n", + "\n", + "ax = axes[1, 0]\n", + "ax.legend(fontsize=\"small\", loc=\"lower center\")\n", + "ax.text(0.95, 0.95, \"(c)\", transform=ax.transAxes, ha=\"right\", va=\"top\")\n", + "\n", + "ax = axes[1, 1]\n", + "ax.legend(fontsize=\"small\", loc=\"lower center\")\n", + "ax.sharey(axes[1, 0])\n", + "ax.text(0.95, 0.95, \"(d)\", transform=ax.transAxes, ha=\"right\", va=\"top\")\n", + "\n", + "for ax in [axes[0, 0], *axes[1]]:\n", + " ax.xaxis.set_ticks([0, 0.5, 1])\n", + "\n", + "fig.tight_layout()\n", + "fig.savefig(\"kernels.pdf\", bbox_inches=\"tight\")\n", + "fig.savefig(\"kernels.png\", bbox_inches=\"tight\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/padding/padding.ipynb b/padding/padding.ipynb new file mode 100644 index 0000000..fe2d9b4 --- /dev/null +++ b/padding/padding.ipynb @@ -0,0 +1,310 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1c010e06", + "metadata": {}, + "source": [ + "# Simulation study to study the effect of padding for Fourier-based Gaussian processes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "64e84665", + "metadata": {}, + "outputs": [], + "source": [ + "import cmdstanpy\n", + "from gptools.util.kernels import ExpQuadKernel, MaternKernel\n", + "from gptools.stan import compile_model\n", + "import logging\n", + "import matplotlib as mpl\n", + "from matplotlib import pyplot as plt\n", + "import numpy as np\n", + "import os\n", + "import pandas as pd\n", + "from 
pathlib import Path\n", + "import pickle\n", + "from scipy.stats import gaussian_kde, kstest\n", + "from tqdm.notebook import tqdm\n", + "\n", + "\n", + "mpl.style.use(\"../jss.mplstyle\")\n", + "FAST = \"FAST\" in os.environ\n", + "\n", + "# Disable cmdstan logging because we have a lot of fits.\n", + "cmdstanpy_logger = cmdstanpy.utils.get_logger()\n", + "for handler in cmdstanpy_logger.handlers:\n", + " handler.setLevel(logging.WARNING)" + ] + }, + { + "cell_type": "raw", + "id": "64d617f5", + "metadata": {}, + "source": [ + "# Convert this cell to a code cell to remove cached pickle files.\n", + "!rm -f *.pkl" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "962824ee", + "metadata": {}, + "outputs": [], + "source": [ + "# Define hyperparameters and generate synthetic datasets.\n", + "np.random.seed(0)\n", + "m = 10 if FAST else 100\n", + "iter_sampling = 10 if FAST else 100\n", + "n = 128\n", + "sigma = 1\n", + "length_scale = 16\n", + "kappa = 1\n", + "epsilon = 1e-5\n", + "padding_factors = [0, 0.125, 0.25, 0.375, 0.5, 0.75, 1.0, 1.25, 1.5]\n", + "\n", + "\n", + "# Declare the kernels.\n", + "x = np.arange(n)\n", + "kernels = {\n", + " \"ExpQuadKernel\": ExpQuadKernel(sigma, length_scale),\n", + " \"Matern32Kernel\": MaternKernel(1.5, sigma, length_scale),\n", + "}\n", + "\n", + "# Generate synthetic datasets.\n", + "def sample(n, cov, kappa):\n", + " \"\"\"\n", + " Draw `n` samples from the generative model with covariance `cov` and observation noise `kappa`.\n", + " \"\"\"\n", + " f = np.random.multivariate_normal(np.zeros(n), cov)\n", + " y = np.random.normal(f, kappa)\n", + " return f, y\n", + "\n", + "fs = {}\n", + "ys = {}\n", + "for key, kernel in kernels.items():\n", + " cov = kernel.evaluate(x[:, None]) + epsilon * np.eye(n)\n", + " for _ in range(m):\n", + " f, y = sample(n, cov, kappa)\n", + " fs.setdefault(key, []).append(f)\n", + " ys.setdefault(key, []).append(y)\n", + "\n", + "fs = {key: np.asarray(value) for key, value in fs.items()}\n", + "ys = {key: np.asarray(value) for key, value in ys.items()}\n", + "\n", + "# Visualize one of the samples for each kernel.\n", + "fig, axes = plt.subplots(1, 2, sharex=True, sharey=True)\n", + "i = 0\n", + "for ax, key in zip(axes, fs):\n", + " ax.plot(x, fs[key][i], label=\"Gaussian process $f$\")\n", + " ax.scatter(x, ys[key][i], label=\"target $y$\", marker=\".\")\n", + " ax.set_xlabel(\"covariate $x$\")\n", + "ax.legend()\n", + "fig.tight_layout()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6dcff8e", + "metadata": {}, + "outputs": [], + "source": [ + "# Fit the standard non-centered model to each dataset independently.\n", + "standard_model = compile_model(stan_file=\"exact.stan\")\n", + "fourier_model = compile_model(stan_file=\"padded.stan\")\n", + "\n", + "\n", + "def get_fit(model, y, kernel, padding=0):\n", + " if isinstance(kernel, ExpQuadKernel):\n", + " kernel = 0\n", + " elif isinstance(kernel, MaternKernel):\n", + " kernel = 1\n", + " else:\n", + " raise ValueError(kernel)\n", + "\n", + " if padding != int(padding):\n", + " raise ValueError(\"non-integer padding\")\n", + "\n", + " data = {\n", + " \"x\": x,\n", + " \"num_observations\": n,\n", + " \"observe_first\": n - 1,\n", + " \"y\": y,\n", + " \"length_scale\": length_scale,\n", + " \"sigma\": sigma,\n", + " \"kappa\": kappa,\n", + " \"epsilon\": epsilon,\n", + " \"padding\": int(padding),\n", + " \"kernel\": kernel,\n", + " }\n", + " return model.sample(data, iter_warmup=5 * iter_sampling, 
iter_sampling=iter_sampling, chains=1,\n", + " show_progress=False, seed=0)\n", + "\n", + "\n", + "os.makedirs(\"results\", exist_ok=True)\n", + "statistics_by_kernel = {}\n", + "for key, kernel in kernels.items():\n", + " # Naive caching for log pdfs and ranks.\n", + " filename = f\"results/padding-cache-{key}-{m}.pkl\"\n", + " try:\n", + " with open(filename, \"rb\") as fp:\n", + " statistics_by_kernel[key] = pickle.load(fp)\n", + " print(f\"loaded results from {filename}\")\n", + " except FileNotFoundError:\n", + " for y, f in tqdm(zip(ys[key], fs[key]), desc=key, total=m):\n", + " # Get the fits for different models.\n", + " fits = {\"exact\": get_fit(standard_model, y, kernel)}\n", + " for factor in padding_factors:\n", + " fits[factor] = get_fit(fourier_model, y, kernel, length_scale * factor)\n", + "\n", + " # Compute the posterior density at the held out data point and its rank for each fit.\n", + " statistics_by_kernel.setdefault(key, []).append({\n", + " \"lpds\": {\n", + " key: gaussian_kde(fit.f[:, n - 1]).logpdf(f[n - 1]).squeeze()\n", + " for key, fit in fits.items()\n", + " },\n", + " \"ranks\": {\n", + " key: np.sum(fit.f[:, n - 1] < f[n - 1])\n", + " for key, fit in fits.items()\n", + " },\n", + " })\n", + " # Save the results.\n", + " with open(filename, \"wb\") as fp:\n", + " pickle.dump(statistics_by_kernel[key], fp)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f311314e", + "metadata": {}, + "outputs": [], + "source": [ + "# Transpose the data for plotting. This isn't pretty but gets the job done.\n", + "transposed = {}\n", + "\n", + "for key, statistics in statistics_by_kernel.items():\n", + " result = {}\n", + " for record in statistics:\n", + " for stat, values in record.items():\n", + " result.setdefault(stat, []).append(list(values.values()))\n", + " transposed[key] = {key: np.asarray(value) for key, value in result.items()}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a75f3ced", + "metadata": {}, + "outputs": [], + "source": [ + "# Show an example. 
We sample with a seed to align legends around the plot and pick an example\n", + "# that excacerbates the effect of periodic boundary conditions.\n", + "np.random.seed(1)\n", + "kernel = kernels[\"ExpQuadKernel\"]\n", + "f, y = sample(n, kernel.evaluate(x[:, None]), kappa)\n", + "fits = {\"exact\": get_fit(standard_model, y, kernel)}\n", + "for factor in padding_factors:\n", + " fits[factor] = get_fit(fourier_model, y, kernel, length_scale * factor)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1fff100", + "metadata": {}, + "outputs": [], + "source": [ + "fig, axes = plt.subplots(2, 2, sharex=\"col\")\n", + "\n", + "# Show an example.\n", + "ax = axes[0, 0]\n", + "ax.plot(x, f, color=\"k\", label=\"latent GP $f$\")\n", + "pts = ax.scatter(x, y, marker=\".\", color=\"k\", label=\"data $y$\")\n", + "plt.setp(ax.xaxis.get_ticklabels(), visible=False)\n", + "ax.legend(loc=\"upper right\", fontsize=\"small\")\n", + "\n", + "ax = axes[1, 0]\n", + "ax.plot(x, f, color=\"k\")\n", + "keys = [\"exact\", 0, 0.5, 1.0]\n", + "for i, key in enumerate(keys):\n", + " color = f\"C{i}\"\n", + " l = fits[key].f.mean(axis=0)\n", + " label = key if key == \"exact\" else fr\"$w/\\ell={key:.2f}$\"\n", + " ax.plot(np.arange(l.size), l, color=color, label=label, alpha=0.7)\n", + "ax.set_xlabel(\"covariate $x$\")\n", + "ax.legend(fontsize=\"small\", ncol=2, loc=\"upper center\")\n", + "\n", + "for ax in axes[:, 0]:\n", + " poly = ax.axvspan(n, 2 * n, facecolor=\"silver\", alpha=0.2)\n", + " poly.remove()\n", + " ax.relim()\n", + " ax.add_artist(poly)\n", + " ax.text((n + ax.get_xlim()[1]) / 2, -1, \"padding\", ha=\"center\", va=\"center\",\n", + " rotation=90, color=\"gray\")\n", + " ax.set_ylim(-2.5, 2.0)\n", + "\n", + "# Show the evaluation for the two types of kernels.\n", + "for ax, (key, statistics) in zip(axes[:, 1], transposed.items()):\n", + " lpds = statistics[\"lpds\"]\n", + " l = lpds.mean(axis=0)\n", + " s = lpds.std(axis=0) / np.sqrt(m - 1)\n", + " line = ax.axhline(l[0], label=\"exact\")\n", + " ax.axhspan(l[0] - s[0], l[0] + s[0], alpha=0.2, color=line.get_color())\n", + " ax.errorbar(padding_factors, l[1:], s[1:], marker=\"o\", markeredgecolor=\"w\",\n", + " color=\"gray\", ls=\":\", label=\"padded Fourier\")\n", + "\n", + " # Add the markers visualized in the example.\n", + " if key == \"ExpQuadKernel\":\n", + " for i, key in enumerate(keys[1:], 1):\n", + " ax.scatter(key, l[padding_factors.index(key) + 1], zorder=9,\n", + " color=f\"C{i}\", edgecolor=\"w\")\n", + "\n", + "axes[1, 1].set_xlabel(r\"padding factor $w/\\ell$\")\n", + "axes[1, 1].set_ylabel(r\"log posterior density $\\log p\\left(f_n\\mid y_{ np.ndarray:\n", + " \"\"\"\n", + " Load a configuration and compute the log probability for held-out data.\n", + " \"\"\"\n", + " with open(filename, \"rb\") as fp:\n", + " result = pickle.load(fp)\n", + " errors = []\n", + " for fit, eta, data in zip(result[\"fits\"], result[\"etas\"], result[\"data\"]):\n", + " if fit is None:\n", + " errors.append(np.nan)\n", + " continue\n", + " test_idx = np.setdiff1d(1 + np.arange(size), data[\"observed_idx\"]) - 1\n", + " if not test_idx.size:\n", + " raise ValueError(\"there are no test values\")\n", + " test_eta = eta[test_idx]\n", + "\n", + " if isinstance(fit, cmdstanpy.CmdStanVB):\n", + " df = pd.DataFrame(fit.variational_sample, columns=fit.column_names)\n", + " df = df[[column for column in df if re.fullmatch(r\"eta\\[\\d+\\]\", column)]]\n", + " eta_samples = df.values\n", + " elif isinstance(fit, 
cmdstanpy.CmdStanMCMC):\n", + " eta_samples = fit.stan_variable(\"eta\")\n", + " if fit.method_variables()[\"divergent__\"].sum():\n", + " print(\"divergent transitions\")\n", + " else:\n", + " raise TypeError(fit)\n", + "\n", + "\n", + " error = 0\n", + " for i in test_idx:\n", + " # assert False\n", + " kde = stats.gaussian_kde(eta_samples[:, i])\n", + " error += kde.logpdf(eta[i]).squeeze()\n", + " errors.append(error)\n", + " return np.asarray(errors)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8004410", + "metadata": {}, + "outputs": [], + "source": [ + "size = 1024\n", + "methods = [\"sample\", \"variational\"]\n", + "parameterizations = [\"fourier_non_centered\", \"fourier_centered\"]\n", + "product = it.product(methods, parameterizations, LOG10_NOISE_SCALES)\n", + "lps = {}\n", + "for method, parameterization, log_noise_scale in product:\n", + " suffix = \"-train-test\" if method == \"sample\" else \"\"\n", + " filename = f\"results/{method}/{parameterization}/\" \\\n", + " f\"log10_noise_scale-{log_noise_scale:.3f}_size-{size}{suffix}.pkl\"\n", + " lps.setdefault((method, parameterization), []).append(load_lps(filename))\n", + "\n", + "num_bootstrap = 1000\n", + "mean_lps = {\n", + " key: np.asarray([\n", + " np.random.dirichlet(np.ones_like(value), num_bootstrap) @ value\n", + " for value in values\n", + " ]).T for key, values in lps.items()\n", + "}\n", + "\n", + "deltas = {\n", + " method: mean_lps[(method, \"fourier_non_centered\")] - mean_lps[(method, \"fourier_centered\")]\n", + " for method in methods\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "512a04e9", + "metadata": {}, + "outputs": [], + "source": [ + "color_by_method = {\n", + " \"standard\": \"C0\",\n", + " \"graph\": \"C1\",\n", + " \"fourier\": \"C2\",\n", + "}\n", + "marker_by_parameterization = {\n", + " \"centered\": \"o\",\n", + " \"non_centered\": \"s\",\n", + "}\n", + "ls_by_parameterization = {\n", + " \"centered\": \"-\",\n", + " \"non_centered\": \"--\",\n", + "}\n", + "\n", + "fig = plt.figure()\n", + "# First ratio is manually fiddled to match the heights of ax1 and ax3\n", + "gs = fig.add_gridspec(2, 2)\n", + "ax1 = fig.add_subplot(gs[0, 0])\n", + "ax2 = fig.add_subplot(gs[0, 1], sharex=ax1, sharey=ax1)\n", + "ax3 = fig.add_subplot(gs[1, 0], sharey=ax1)\n", + "\n", + "gs4 = mpl.gridspec.GridSpecFromSubplotSpec(2, 1, gs[1, 1], height_ratios=[2, 1],\n", + " hspace=0.1)\n", + "ax4t = fig.add_subplot(gs4[0])\n", + "ax4b = fig.add_subplot(gs4[1])\n", + "\n", + "ax = ax1\n", + "ax.set_xlabel(r\"size $n$\")\n", + "ax.set_ylabel(r\"runtime (seconds)\")\n", + "ax = ax2\n", + "ax.set_xlabel(r\"size $n$\")\n", + "ax.set_ylabel(r\"runtime (seconds)\")\n", + "\n", + "for i, ax in [(0, ax1), (-1, ax2)]:\n", + " print(f\"kappa = 10 ** {LOG10_NOISE_SCALES[i]}\")\n", + " for key, value in durations.items():\n", + " method, parameterization = key.split(\"_\", 1)\n", + " line, = ax.plot(\n", + " SIZES, value[i], color=color_by_method[method],\n", + " marker=marker_by_parameterization[parameterization],\n", + " ls=ls_by_parameterization[parameterization],\n", + " )\n", + " line.set_markeredgecolor(\"w\")\n", + "\n", + " f = np.isfinite(value[i])\n", + " fit = np.polynomial.Polynomial.fit(np.log(SIZES[f])[-3:], np.log(value[i][f])[-3:], 1)\n", + " fit = fit.convert()\n", + " print(f\"{key}: n ** {fit.coef[1]:.3f}\")\n", + "\n", + " ax.set_xscale(\"log\")\n", + " ax.set_yscale(\"log\")\n", + " bbox = {\"facecolor\": \"w\", \"edgecolor\": \"none\", \"alpha\": 
0.9}\n", + " ax.text(0.05, 0.95, fr\"({'ab'[i]}) $\\kappa=10^{{{LOG10_NOISE_SCALES[i]:.0f}}}$\",\n", + " transform=ax.transAxes, va=\"top\", bbox=bbox)\n", + " ax.set_xlabel(\"size $n$\")\n", + " ax.set_ylabel(\"duration (seconds)\")\n", + " print()\n", + "\n", + "ax = ax3\n", + "ax.set_xlabel(r\"noise scale $\\kappa$\")\n", + "ax.set_ylabel(r\"runtime (seconds)\")\n", + "\n", + "ax.set_xlabel(r\"noise scale $\\kappa$\")\n", + "ax.set_ylabel(\"duration (seconds)\")\n", + "ax.set_xscale(\"log\")\n", + "max_size = 4096\n", + "mappable = mpl.cm.ScalarMappable(norm=mpl.colors.LogNorm(SIZES.min(), max_size),\n", + " cmap=\"viridis\")\n", + "method = \"fourier\"\n", + "for parameterization in [\"non_centered\", \"centered\"]:\n", + " for size, y in zip(SIZES, durations[f\"{method}_{parameterization}\"].T):\n", + " if size > max_size:\n", + " continue\n", + " ax.plot(NOISE_SCALES, y, color=mappable.to_rgba(size),\n", + " ls=ls_by_parameterization[parameterization])\n", + "ax.text(0.05, 0.95, \"(c)\", transform=ax.transAxes, va=\"top\")\n", + "ax.set_ylim(top=100)\n", + "# We'll add the colorbar later below after laying out the figure.\n", + "\n", + "ax = ax4b\n", + "ax.set_xlabel(r\"noise scale $\\kappa$\")\n", + "ax.spines[\"top\"].set_visible(False)\n", + "\n", + "# Monkeypatch for fixed order of magnitude.\n", + "def _set_order_of_magnitude(self):\n", + " self.orderOfMagnitude = 3\n", + "ax4b.yaxis.major.formatter._set_order_of_magnitude = types.MethodType(\n", + " _set_order_of_magnitude, ax4b.yaxis.major.formatter,\n", + ")\n", + "ax4t.yaxis.major.formatter._set_order_of_magnitude = types.MethodType(\n", + " _set_order_of_magnitude, ax4t.yaxis.major.formatter,\n", + ")\n", + "\n", + "for ax in [ax4b, ax4t]:\n", + " for key, value in deltas.items():\n", + " marker = \"o\" if key == \"sample\" else \"s\"\n", + " line, *_ = ax.errorbar(NOISE_SCALES, value.mean(axis=0), value.std(axis=0),\n", + " label=key, marker=marker, markeredgecolor=\"w\", ls=\"none\", zorder=9)\n", + " ax.plot(NOISE_SCALES, value.mean(axis=0), color=\"silver\", zorder=0)\n", + " ax.axhline(0, color=\"k\", ls=\":\", zorder=1)\n", + "\n", + " ax.set_xscale(\"log\")\n", + " ax.ticklabel_format(axis=\"y\", scilimits=(0, 0), useMathText=True)\n", + "ax4b.legend(loc=\"lower right\")\n", + "ax4b.set_ylim(-10e3, -6.5e3)\n", + "ax4t.set_ylim(-1300, 1100)\n", + "ax4b.yaxis.get_offset_text().set_visible(False)\n", + "ax4b.set_yticks([-9e3, -7e3])\n", + "\n", + "# Visibility adjustment must happen after plotting.\n", + "ax4t.set_ylabel(r\"log p.d. 
difference $\\Delta$\", y=0.2)\n", + "ax4t.set_xticklabels([])\n", + "ax4t.spines[\"bottom\"].set_visible(False)\n", + "plt.setp(ax.xaxis.get_majorticklines(), visible=False)\n", + "plt.setp(ax.xaxis.get_minorticklines(), visible=False)\n", + "ax4t.text(0.05, 0.95, \"(d)\", transform=ax.transAxes, va=\"top\")\n", + "\n", + "# Lay out the figure before adding extra elements.\n", + "gs.tight_layout(fig, rect=[0, 0, 1, 0.93], h_pad=0)\n", + "\n", + "# Add the broken axis markers.\n", + "angle = np.deg2rad(30)\n", + "scale = 0.01\n", + "ax = ax4t\n", + "pm = np.asarray([-1, 1])\n", + "for x in [0, 1]:\n", + " for ax, y in [(ax4t, 0), (ax4b, 1)]:\n", + " pos = ax.get_position()\n", + " line = mpl.lines.Line2D(\n", + " x + scale * np.cos(angle) * pm / pos.width,\n", + " y + scale * np.sin(angle) * pm / pos.height,\n", + " transform=ax.transAxes, clip_on=False, color=\"k\",\n", + " lw=mpl.rcParams[\"axes.linewidth\"], in_layout=False,\n", + " )\n", + " ax.add_line(line)\n", + "\n", + "# Add the figure-level legend.\n", + "handles_labels = [\n", + " (\n", + " mpl.lines.Line2D([], [], linestyle=ls_by_parameterization[\"centered\"],\n", + " marker=marker_by_parameterization[\"centered\"], color=\"gray\",\n", + " markeredgecolor=\"w\"),\n", + " \"centered\",\n", + " ),\n", + " (\n", + " mpl.lines.Line2D([], [], linestyle=\":\",\n", + " marker=marker_by_parameterization[\"non_centered\"], color=\"gray\",\n", + " markeredgecolor=\"w\"),\n", + " \"non-centered\",\n", + " ),\n", + "]\n", + "for method, color in color_by_method.items():\n", + " handles_labels.append((\n", + " mpl.lines.Line2D([], [], color=color),\n", + " \"Fourier\" if method == \"fourier\" else method,\n", + " ))\n", + "bbox1 = ax1.get_position()\n", + "bbox2 = ax2.get_position()\n", + "bbox_to_anchor = [bbox1.xmin, 0, bbox2.xmax - bbox1.xmin, 1]\n", + "legend = fig.legend(*zip(*handles_labels), fontsize=\"small\", loc=\"upper center\",\n", + " ncol=5, bbox_to_anchor=bbox_to_anchor)\n", + "\n", + "# Finally add the colorbar.\n", + "rect = ax3.get_position()\n", + "rect = (\n", + " rect.xmin + 0.525 * rect.width,\n", + " rect.ymin + rect.height * 0.89,\n", + " rect.width * 0.45,\n", + " rect.height * 0.06,\n", + ")\n", + "cax = fig.add_axes(rect) # rect=(left, bottom, width, height)\n", + "cb = fig.colorbar(mappable, cax=cax, orientation=\"horizontal\")\n", + "cax.tick_params(labelsize=\"small\")\n", + "cax.set_ylabel(\"size $n$\", fontsize=\"small\", rotation=0, ha=\"right\", va=\"center\")\n", + "\n", + "for ax in [ax1, ax2]:\n", + " ax.axhline(timeout, ls=\":\", color=\"k\", zorder=0)\n", + "\n", + "fig.savefig(\"profile.pdf\", bbox_inches=\"tight\")\n", + "fig.savefig(\"profile.png\", bbox_inches=\"tight\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b16e8b8", + "metadata": {}, + "outputs": [], + "source": [ + "# Fit a linear model on the log-log scale. 
We use the last three data points to get a rough idea of\n", + "# \"asymptotic\" scaling.\n", + "idx = LOG10_NOISE_SCALES.size // 2\n", + "print(f\"log noise-scale: {LOG10_NOISE_SCALES[idx]}\")\n", + "y = durations[\"standard_centered\"][idx]\n", + "f = np.isfinite(y)\n", + "np.polynomial.Polynomial.fit(np.log(SIZES[f][-3:]), np.log(y[f][-3:]), 1).convert()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2e9eea9", + "metadata": {}, + "outputs": [], + "source": [ + "# Report the runtimes for 10k observations.\n", + "for parameterization in [\"fourier_centered\", \"fourier_non_centered\"]:\n", + " filename = f\"results/sample/{parameterization}/log10_noise_scale-0.000_size-10000.pkl\"\n", + " with open(filename, \"rb\") as fp:\n", + " result = pickle.load(fp)\n", + " print(parameterization, result[\"durations\"].mean())" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tests/test_notebooks.py b/tests/test_notebooks.py new file mode 100644 index 0000000..9801729 --- /dev/null +++ b/tests/test_notebooks.py @@ -0,0 +1,26 @@ +from pathlib import Path +import pytest +import re +import subprocess + + +markdown_paths = [path for path in Path(".").glob("*/*.md") if not str(path).startswith(".")] + + +@pytest.mark.parametrize("markdown_path", markdown_paths, ids=map(str, markdown_paths)) +def test_notebooks(markdown_path: Path, tmp_path: Path) -> None: + # Check the notebook exists. + notebook_path = markdown_path.with_suffix(".ipynb") + if not notebook_path.is_file(): + raise FileNotFoundError(notebook_path) + + # Generate the notebook in a temp directory. + tmp_notebook_path = tmp_path / notebook_path.name + subprocess.check_call(["jupytext", str(markdown_path), "--output", str(tmp_notebook_path)]) + + # Check that they're up to date after stripping random ids. + pattern = re.compile(r'"id": "\w+"') + actual_text = pattern.sub("", tmp_notebook_path.read_text()) + expected_text = pattern.sub("", notebook_path.read_text()) + assert actual_text == expected_text, f"{markdown_path} and {notebook_path} do not contain " \ + "equivalent content. Do you need to update the notebook?" diff --git a/trees/trees.ipynb b/trees/trees.ipynb new file mode 100644 index 0000000..1a27775 --- /dev/null +++ b/trees/trees.ipynb @@ -0,0 +1,296 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "de330688", + "metadata": {}, + "source": [ + "# Density of *T. panamensis* on a 50 ha plot in Panama\n", + "\n", + "Trees on 50 ha plot on [Barro Colorado Island](https://en.wikipedia.org/wiki/Barro_Colorado_Island) have been [censused regularly since 1982](https://datadryad.org/stash/dataset/doi:10.15146/5xcp-0d46). The data are publicly available, and we use them here for a demonstration of a Gaussian process using the two-dimensional fast Fourier transform. For a given species, the data comprise the frequency $y$ of trees in each of 20 by 20 meter quadrants. For this example, we pick *T. panamensis* because its distribution is relatively heterogeneous over the plot. We load and visualize the data below." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9ddd1409",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import matplotlib as mpl\n",
+    "from matplotlib import pyplot as plt\n",
+    "import numpy as np\n",
+    "from pathlib import Path\n",
+    "\n",
+    "mpl.style.use(\"../jss.mplstyle\")\n",
+    "\n",
+    "# Load the matrix of tree frequencies.\n",
+    "frequency = np.loadtxt(\"tachve.csv\", delimiter=\",\")\n",
+    "nrows, ncols = frequency.shape\n",
+    "delta = 20 / 1000  # Separation between adjacent plots in km.\n",
+    "\n",
+    "# Show the tree frequency in the plot.\n",
+    "fig, ax = plt.subplots()\n",
+    "extent = delta * (np.asarray([0, ncols, 0, nrows]) - 0.5)\n",
+    "im = ax.imshow(frequency, extent=extent, origin=\"lower\")\n",
+    "ax.set_xlabel(\"easting (km)\")\n",
+    "ax.set_ylabel(\"northing (km)\")\n",
+    "fig.colorbar(im, ax=ax, location=\"top\", label=\"tree frequency\", fraction=0.05)\n",
+    "fig.tight_layout()\n",
+    "frequency.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "37fd835b",
+   "metadata": {},
+   "source": [
+    "The FFT assumes periodic boundary conditions, but, of course, these do not apply to trees. We thus pad the domain to attenuate correlation between quadrants at opposite sides of the plot. A padding of 10 quadrants corresponds to 200 meters. The model is shown below.\n",
+    "\n",
+    "```{literalinclude} trees.stan\n",
+    "    :language: stan\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "77b1cb07",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from gptools.stan import compile_model\n",
+    "import os\n",
+    "\n",
+    "\n",
+    "# Set up the padded shapes.\n",
+    "padding = 10\n",
+    "padded_rows = nrows + padding\n",
+    "padded_cols = ncols + padding\n",
+    "\n",
+    "# Sample a random training mask for later evaluation.\n",
+    "seed = 0\n",
+    "np.random.seed(seed)\n",
+    "train_fraction = 0.8\n",
+    "train_mask = np.random.uniform(size=frequency.shape) < train_fraction\n",
+    "\n",
+    "# Prepare the data for Stan.\n",
+    "data = {\n",
+    "    \"num_rows\": nrows,\n",
+    "    \"num_rows_padded\": padded_rows,\n",
+    "    \"num_cols\": ncols,\n",
+    "    \"num_cols_padded\": padded_cols,\n",
+    "    # Use -1 for held-out data.\n",
+    "    \"frequency\": np.where(train_mask, frequency, -1).astype(int),\n",
+    "}\n",
+    "\n",
+    "# Compile and fit the model.\n",
+    "model = compile_model(stan_file=\"trees.stan\")\n",
+    "fit = model.sample(data, seed=seed, show_progress=False)\n",
+    "diagnose = fit.diagnose()\n",
+    "assert \"no problems detected\" in diagnose, diagnose\n",
+    "print(diagnose)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "78ad76b9",
+   "metadata": {},
+   "source": [
+    "The model is able to infer the underlying density of trees. As shown in the left panel below, the density follows the data within the original domain but is smoother. Outside the domain, in the padded region delineated by dashed lines, the posterior mean of the Gaussian process is very smooth because there are no data. As shown in the right panel, the posterior standard deviation is small where there are data and large in the padded area without data.\n",
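+    "\n",
+    "As an editorial aside (not part of the original notebook): because the first `nrows` rows and `ncols` columns of the padded domain correspond to the observed plot, the padding can simply be cropped when working with the estimate, as the evaluation further below does.\n",
+    "\n",
+    "```python\n",
+    "# Posterior mean restricted to the observed plot, dropping the padding.\n",
+    "f_mean_unpadded = fit.f.mean(axis=0)[:nrows, :ncols]\n",
+    "```"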
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9a962f1e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig, (ax1, ax2) = plt.subplots(1, 2, sharex=True, sharey=True)\n",
+    "\n",
+    "padded_extent = delta * (np.asarray([0, padded_cols, 0, padded_rows]) - 0.5)\n",
+    "im1 = ax1.imshow(fit.f.mean(axis=0), extent=padded_extent, origin=\"lower\")\n",
+    "im2 = ax2.imshow(fit.f.var(axis=0), extent=padded_extent, origin=\"lower\")\n",
+    "fig.colorbar(im1, ax=ax1, location=\"top\", label=\"posterior mean $f$\")\n",
+    "fig.colorbar(im2, ax=ax2, location=\"top\", label=\"posterior var $f$\")\n",
+    "ax1.set_ylabel(\"padded northing (km)\")\n",
+    "\n",
+    "for ax in [ax1, ax2]:\n",
+    "    ax.set_xlabel(\"padded easting (km)\")\n",
+    "    ax.axhline(nrows * delta, color=\"w\", ls=\"--\")\n",
+    "    ax.axvline(ncols * delta, color=\"w\", ls=\"--\")\n",
+    "fig.tight_layout()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9af33f6f",
+   "metadata": {},
+   "source": [
+    "We compare the GP-based inference with a simpler approach that employs a Gaussian filter to smooth the data. The estimate is\n",
+    "$$\n",
+    "\\hat{\\vec{y}}_\\lambda = \\frac{\\vec{g}_\\lambda\\ast\\left(\\vec{b}\\circ\\vec{y}\\right)}{\\vec{g}_\\lambda\\ast\\vec{b}},\n",
+    "$$\n",
+    "where $\\ast$ denotes convolution, $\\circ$ denotes the elementwise product, $\\vec{g}_\\lambda$ is a Gaussian filter with smoothing scale $\\lambda$, and $\\vec{b}$ is the binary mask indicating which data are available for training. Because the Gaussian filter only provides a point estimate, we cannot use the posterior predictive distribution to compare the two approaches. We instead use a scaled mean squared error\n",
+    "$$\n",
+    "S\\left(\\vec{y},\\hat{\\vec{y}}=\\exp\\hat{\\vec{f}}\\right) = \\frac{1}{m}\\sum_{i=1}^m \\frac{\\left(y_i-\\exp \\hat f_i\\right)^2}{\\max\\left(y_i,1\\right)},\n",
+    "$$\n",
+    "to compare the held-out data $\\vec{y}$ with the prediction $\\hat{\\vec{y}}$. The scaling ensures large frequencies do not dominate the error measure because they [naturally have a larger variance](https://en.wikipedia.org/wiki/Poisson_distribution#Descriptive_statistics).\n",
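+    "\n",
+    "As a rough, editorial sanity check (not part of the original analysis): for a Poisson-like count $y$ with mean $\\mu$, the squared residual has expectation $\\mu$, so dividing by $\\max\\left(y,1\\right)$ puts quadrants with small and large counts on a comparable footing, and a well-calibrated prediction yields $S$ of order one. A minimal simulation with illustrative names `mu`, `y_sim`, and `S_sim`:\n",
+    "\n",
+    "```python\n",
+    "# Scaled error when the true rates are known should be roughly one.\n",
+    "mu = np.random.gamma(2, 10, size=10_000)\n",
+    "y_sim = np.random.poisson(mu)\n",
+    "S_sim = np.mean((y_sim - mu) ** 2 / np.maximum(y_sim, 1))\n",
+    "```"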
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "89b6f024", + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.ndimage import gaussian_filter\n", + "\n", + "\n", + "def evaluate_scaled_error(actual, prediction, num_bs=1000):\n", + " \"\"\"\n", + " Evaluate scaled bootstrapped errors between held-out data and model predictions.\n", + " \"\"\"\n", + " idx = np.random.choice(actual.size, (num_bs, actual.size))\n", + " bs_actual = actual[idx]\n", + " bs_prediction = prediction[idx]\n", + " return (np.square(bs_actual - bs_prediction) / np.maximum(bs_actual, 1)).mean(axis=-1)\n", + "\n", + "\n", + "def filter_estimate(frequency, train_mask, scale):\n", + " \"\"\"\n", + " Estimate held-out data using a Gaussian filter.\n", + " \"\"\"\n", + " smoothed_mask = gaussian_filter(train_mask.astype(float), scale)\n", + " smoothed_masked_frequency = gaussian_filter(np.where(train_mask, frequency, 0), scale)\n", + " return smoothed_masked_frequency / smoothed_mask\n", + "\n", + "\n", + "# Evaluate predictions and errors using Gaussian filters at different scales.\n", + "smoothed_errors = []\n", + "sigmas = np.logspace(-0.8, 1)\n", + "for sigma in sigmas:\n", + " smoothed_prediction = filter_estimate(frequency, train_mask, sigma)\n", + " smoothed_errors.append(evaluate_scaled_error(frequency[~train_mask],\n", + " smoothed_prediction[~train_mask]))\n", + "smoothed_errors = np.asarray(smoothed_errors)\n", + "\n", + "\n", + "# Also evaluate the errors for the posterior median Gaussian process rates.\n", + "rate = np.median(np.exp(fit.stan_variable(\"f\")[:, :nrows, :ncols]), axis=0)\n", + "gp_errors = evaluate_scaled_error(frequency[~train_mask], rate[~train_mask])\n", + "\n", + "def plot_errors(smoothed_errors, gp_errors, ax=None):\n", + " \"\"\"\n", + " Plot bootstrapped errors for Gaussian filter and Gaussian process estimates.\n", + " \"\"\"\n", + " scale = delta * 1e3 # Show smoothing scale in meters.\n", + " ax = ax or plt.gca()\n", + "\n", + " # Gaussian filter errors.\n", + " smoothed_loc = smoothed_errors.mean(axis=-1)\n", + " smoothed_scale = smoothed_errors.std(axis=-1)\n", + " line, = ax.plot(scale * sigmas, smoothed_loc, label=\"Gaussian\\nfilter\", color=\"C1\")\n", + " ax.fill_between(scale * sigmas, smoothed_loc - smoothed_scale, smoothed_loc + smoothed_scale,\n", + " alpha=0.5, color=line.get_color())\n", + "\n", + " # Gaussian process errors.\n", + " gp_loc = gp_errors.mean()\n", + " gp_scale = gp_errors.std()\n", + " line = ax.axhline(gp_loc, label=\"Gaussian\\nprocess\")\n", + " ax.axhspan(gp_loc - gp_scale, gp_loc + gp_scale, alpha=0.5, color=line.get_color())\n", + "\n", + " ax.set_xscale(\"log\")\n", + " ax.set_xlabel(r\"smoothing scale $\\lambda$\")\n", + " ax.set_ylabel(\"scaled mean-\\nsquared error $S$\")\n", + "\n", + "plot_errors(smoothed_errors, gp_errors)" + ] + }, + { + "cell_type": "markdown", + "id": "0fdf533c", + "metadata": {}, + "source": [ + "The bootstrapped scaled error for the Gaussian process is lower than the best scaled error for the Gaussian filter---even though the scale of the Gaussian filter was implicitly optimized on the held-out data.\n", + "\n", + "Let's assemble the parts to produce the figure in the accompanying publication." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e8948fc", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib as mpl\n", + "\n", + "cmap = mpl.cm.viridis.copy()\n", + "cmap.set_under(\"silver\")\n", + "kwargs = {\n", + " \"origin\": \"lower\",\n", + " \"extent\": extent,\n", + " \"cmap\": cmap,\n", + " \"norm\": mpl.colors.Normalize(0, rate.max()),\n", + "}\n", + "\n", + "fig = plt.figure()\n", + "fig.set_layout_engine(\"constrained\", w_pad=0.1)\n", + "gs = fig.add_gridspec(1, 2, width_ratios=[4, 3])\n", + "gs1 = mpl.gridspec.GridSpecFromSubplotSpec(3, 1, gs[0], height_ratios=[0.075, 1, 1])\n", + "gs2 = mpl.gridspec.GridSpecFromSubplotSpec(2, 1, gs[1])\n", + "\n", + "cax = fig.add_subplot(gs1[0])\n", + "ax1 = fig.add_subplot(gs1[1])\n", + "ax1.set_ylabel(\"northing (km)\")\n", + "ax1.set_xlabel(\"easting (km)\")\n", + "im = ax1.imshow(data[\"frequency\"], **kwargs)\n", + "\n", + "ax2 = fig.add_subplot(gs1[2], sharex=ax1, sharey=ax1)\n", + "ax2.set_xlabel(\"easting (km)\")\n", + "ax2.set_ylabel(\"northing (km)\")\n", + "im = ax2.imshow(rate, **kwargs)\n", + "\n", + "cb = fig.colorbar(im, cax=cax, extend=\"max\", orientation=\"horizontal\")\n", + "cb.set_label(\"tree density\")\n", + "cax.xaxis.set_ticks_position(\"top\")\n", + "cax.xaxis.set_label_position(\"top\")\n", + "\n", + "ax3 = fig.add_subplot(gs2[0])\n", + "ax3.scatter(fit.length_scale * delta * 1e3, fit.sigma, marker=\".\",\n", + " alpha=0.25)\n", + "ax3.set_xlabel(r\"correlation length $\\ell$ (m)\")\n", + "ax3.set_ylabel(r\"marginal scale $\\sigma$\")\n", + "\n", + "ax4 = fig.add_subplot(gs2[1])\n", + "plot_errors(smoothed_errors, gp_errors, ax4)\n", + "\n", + "ax4.legend(fontsize=\"small\", loc=(0.05, 0.425))\n", + "\n", + "fig.draw_without_rendering()\n", + "\n", + "text = ax1.get_yticklabels()[0]\n", + "ax1.text(0, 0.5, \"(a)\", transform=text.get_transform(), ha=\"right\", va=\"center\")\n", + "text = ax2.get_yticklabels()[0]\n", + "ax2.text(0, 0.5, \"(c)\", transform=text.get_transform(), ha=\"right\", va=\"center\")\n", + "ax3.text(0.05, 0.95, \"(b)\", va=\"top\", transform=ax3.transAxes)\n", + "ax4.text(0.05, 0.95, \"(d)\", va=\"top\", transform=ax4.transAxes)\n", + "\n", + "fig.savefig(\"trees.pdf\", bbox_inches=\"tight\")\n", + "fig.savefig(\"trees.png\", bbox_inches=\"tight\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tube/tube.ipynb b/tube/tube.ipynb new file mode 100644 index 0000000..3836ff8 --- /dev/null +++ b/tube/tube.ipynb @@ -0,0 +1,213 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "15d0d4aa", + "metadata": {}, + "source": [ + "# Passengers on the London Underground network\n", + "\n", + "The [London Underground](https://en.wikipedia.org/wiki/London_Underground), nicknamed \"The Tube\", transports millions of people each day. Which factors might affect passenger numbers at different stations? Here, we use the transport network to build a sparse Gaussian process model and fit it to daily passenger numbers. We first load the prepared data, including features such as the [transport zone](https://en.wikipedia.org/wiki/London_fare_zones), number of interchanges, and location, for each station. Detailes on the data preparation can be found [here](https://github.com/onnela-lab/gptools/blob/main/data/prepare_tube_data.py).\n", + "\n", + "We next compile the model and draw posterior samples. 
The model is shown below.\n", + "\n", + "```{literalinclude} tube.stan\n", + " :language: stan\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f6e0771b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from gptools.stan import compile_model\n", + "import json\n", + "import numpy as np\n", + "\n", + "\n", + "with open(\"tube-stan.json\") as fp:\n", + " data = json.load(fp)\n", + "\n", + "# Sample a training mask and update the data for Stan.\n", + "seed = 0\n", + "train_frac = 0.8\n", + "np.random.seed(seed)\n", + "train_mask = np.random.uniform(0, 1, data[\"num_stations\"]) < train_frac\n", + "\n", + "# Apply the training mask and include degree and zone effects.\n", + "y = np.asarray(data[\"passengers\"])\n", + "data.update({\n", + " \"include_zone_effect\": 1,\n", + " \"include_degree_effect\": 1,\n", + " # We use -1 for held-out data.\n", + " \"passengers\": np.where(train_mask, y, -1),\n", + "})\n", + "\n", + "\n", + "model_with_gp = compile_model(stan_file=\"tube.stan\")\n", + "fit_with_gp = model_with_gp.sample(data, seed=seed, adapt_delta=0.95, show_progress=False)\n", + "diagnose = fit_with_gp.diagnose()\n", + "assert \"no problems detected\" in diagnose, diagnose\n", + "print(diagnose)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2810281a", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "import matplotlib as mpl\n", + "from matplotlib import pyplot as plt\n", + "\n", + "mpl.style.use(\"../jss.mplstyle\")\n", + "\n", + "# Show some basic diagnostics plots.\n", + "fig, (ax1, ax2) = plt.subplots(1, 2)\n", + "rate = np.exp(fit_with_gp.stan_variable(\"log_mean\").mean(axis=0))\n", + "ax1.scatter(y[train_mask], rate[train_mask], marker=\".\", label=\"train\")\n", + "ax1.scatter(y[~train_mask], rate[~train_mask], marker=\".\", label=\"test\")\n", + "mm = y.min(), y.max()\n", + "ax1.plot(mm, mm, color=\"k\", ls=\":\")\n", + "ax1.set_xlabel(\"actual\")\n", + "ax1.set_ylabel(\"predicted\")\n", + "ax1.set_xscale(\"log\")\n", + "ax1.set_yscale(\"log\")\n", + "ax1.set_aspect(\"equal\")\n", + "ax1.legend()\n", + "\n", + "a = fit_with_gp.stan_variable(\"length_scale\")\n", + "b = fit_with_gp.stan_variable(\"sigma\")\n", + "pts = ax2.scatter(a, b, marker=\".\", c=fit_with_gp.method_variables()[\"lp__\"])\n", + "d = fit_with_gp.method_variables()[\"divergent__\"].astype(bool).ravel()\n", + "ax2.scatter(a[d], b[d], color=\"C1\", marker=\"x\")\n", + "fig.colorbar(pts, ax=ax2, label=\"log probability\", location=\"top\")\n", + "ax2.set_xscale(\"log\")\n", + "ax2.set_xlabel(r\"length scale $\\ell$\")\n", + "ax2.set_ylabel(r\"marginal scale $\\sigma$\")\n", + "fig.tight_layout()" + ] + }, + { + "cell_type": "markdown", + "id": "6dcdb5cd", + "metadata": {}, + "source": [ + "We construct a figure that shows the data, effects of zone and degree, and the residual effects captured by the Gaussian process." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "afdd109f", + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "\n", + "fig, axes = plt.subplots(2, 2, gridspec_kw={\"width_ratios\": [2, 1]}, figsize=(6, 6))\n", + "ax1, ax2 = axes[:, 0]\n", + "kwargs = {\"marker\": \"o\", \"s\": 10}\n", + "\n", + "\n", + "X = np.asarray(data[\"station_locations\"])\n", + "ax1.scatter(*X[~train_mask].T, facecolor=\"w\", edgecolor=\"gray\", **kwargs)\n", + "pts1 = ax1.scatter(*X[train_mask].T, c=y[train_mask], norm=mpl.colors.LogNorm(vmin=np.min(y)),\n", + " **kwargs)\n", + "\n", + "c = fit_with_gp.stan_variable(\"f\").mean(axis=0)\n", + "vmax = np.abs(c).max()\n", + "pts2 = ax2.scatter(*X.T, c=c, vmin=-vmax, vmax=vmax,\n", + " **kwargs, cmap=\"coolwarm\")\n", + "ax1.set_aspect(\"equal\")\n", + "ax2.set_aspect(\"equal\")\n", + "\n", + "ax2.annotate(\"Canary Wharf\", X[np.argmax(c)], (20, -12), ha=\"center\",\n", + " va=\"center\", fontsize=\"small\",\n", + " arrowprops={\"arrowstyle\": \"->\", \"connectionstyle\": \"arc3,rad=-0.5\"})\n", + "ax2.annotate(\"Hainault\\nLoop\", X[np.argmin(c)], (31, 13), ha=\"right\",\n", + " va=\"center\", fontsize=\"small\",\n", + " arrowprops={\"arrowstyle\": \"->\", \"connectionstyle\": \"arc3,rad=0.2\",\n", + " \"patchA\": None, \"shrinkA\": 10})\n", + "\n", + "ax1.set_axis_off()\n", + "ax2.set_axis_off()\n", + "location = \"top\"\n", + "fraction = 0.05\n", + "cb1 = fig.colorbar(pts1, ax=ax1, label=\"entries & exits\", location=location, fraction=fraction)\n", + "cb2 = fig.colorbar(pts2, ax=ax2, label=\"Gaussian process effect\", location=location, fraction=fraction)\n", + "\n", + "for ax in [ax1, ax2]:\n", + " segments = []\n", + " for u, v in np.transpose(data[\"edge_index\"]):\n", + " segments.append([X[u - 1], X[v - 1]])\n", + " collection = mpl.collections.LineCollection(segments, zorder=0, color=\"silver\")\n", + " ax.add_collection(collection)\n", + "\n", + "ax1.set_ylabel(\"northing (km)\")\n", + "ax1.set_xlabel(\"easting (km)\")\n", + "ax2.set_ylabel(\"northing (km)\")\n", + "ax2.set_xlabel(\"easting (km)\")\n", + "ax3, ax4 = axes[:, 1]\n", + "\n", + "effect = fit_with_gp.stan_variable(\"degree_effect\")\n", + "line, *_ = ax3.errorbar(np.arange(effect.shape[1]) + 1, effect.mean(axis=0), effect.std(axis=0),\n", + " marker=\"o\")\n", + "line.set_markeredgecolor(\"w\")\n", + "ax3.set_ylabel(\"degree effect\")\n", + "ax3.set_xlabel(\"degree\")\n", + "ax3.set_xticks([1, 3, data[\"num_degrees\"]])\n", + "ax3.set_xticklabels([\"1\", \"3\", f\"{data['num_degrees']}+\"])\n", + "ax3.axhline(0, color=\"k\", ls=\":\")\n", + "\n", + "\n", + "effect = fit_with_gp.stan_variable(\"zone_effect\")\n", + "line, *_ = ax4.errorbar(np.arange(effect.shape[1]) + 1, effect.mean(axis=0), effect.std(axis=0),\n", + " marker=\"o\")\n", + "line.set_markeredgecolor(\"w\")\n", + "ax4.set_ylabel(\"zone effect\")\n", + "ax4.set_xlabel(\"zone\")\n", + "ax4.set_xticks([2, 4, data[\"num_zones\"]])\n", + "ax4.set_xticklabels([\"2\", \"4\", f\"{data['num_zones']}+\"])\n", + "ax4.axhline(0, color=\"k\", ls=\":\")\n", + "\n", + "ax1.text(0.025, 0.05, \"(a)\", transform=ax1.transAxes)\n", + "ax2.text(0.025, 0.05, \"(c)\", transform=ax2.transAxes)\n", + "ax3.text(0.05, 0.95, \"(b)\", transform=ax3.transAxes, va=\"top\")\n", + "ax4.text(0.95, 0.95, \"(d)\", transform=ax4.transAxes, va=\"top\", ha=\"right\")\n", + "\n", + "\n", + "fig.tight_layout()\n", + "fig.savefig(\"tube.pdf\", bbox_inches=\"tight\")\n", + "fig.savefig(\"tube.png\", 
bbox_inches=\"tight\")" + ] + }, + { + "cell_type": "markdown", + "id": "dbef7c7e", + "metadata": {}, + "source": [ + "On the one hand, the three northern stations of the [Hainault Loop](https://en.wikipedia.org/wiki/Hainault_Loop) ([Roding Valley](https://en.wikipedia.org/wiki/Roding_Valley_tube_station), [Chigwell](https://en.wikipedia.org/wiki/Chigwell_tube_station), and [Grange Hill](https://en.wikipedia.org/wiki/Grange_Hill_tube_station)) are underused because they are serviced by only three trains an hour whereas nearby stations (such as [Hainault](https://en.wikipedia.org/wiki/Hainault_tube_station), [Woodford](https://en.wikipedia.org/wiki/Woodford_tube_station), and [Buckhurst Hill](https://en.wikipedia.org/wiki/Buckhurst_Hill_tube_station)) are serviced by twelve trains an hour. On the other hand, [Canary Wharf](https://en.wikipedia.org/wiki/Canary_Wharf_tube_station) at the heart of the financial district has much higher use than would be expected for a station that only serves a single line in zone 2." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 23e8be6cd26e96d235da4bc7888af9f831f3ed72 Mon Sep 17 00:00:00 2001 From: Till Hoffmann Date: Thu, 14 Dec 2023 12:50:21 +0100 Subject: [PATCH 4/9] Add linear regression example. --- .gitignore | 1 + linear/linear.ipynb | 57 +++++++++++++++++++++++++++++++++++++++++++++ linear/linear.md | 38 ++++++++++++++++++++++++++++++ linear/linear.stan | 18 ++++++++++++++ recipe.py | 6 +++-- 5 files changed, 118 insertions(+), 2 deletions(-) create mode 100644 linear/linear.ipynb create mode 100644 linear/linear.md create mode 100644 linear/linear.stan diff --git a/.gitignore b/.gitignore index 8646ada..38cf578 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ *.pkl *.tmp __pycache__ +linear/linear padding/exact padding/padded profile/fourier_centered diff --git a/linear/linear.ipynb b/linear/linear.ipynb new file mode 100644 index 0000000..7939310 --- /dev/null +++ b/linear/linear.ipynb @@ -0,0 +1,57 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "eb2c936b", + "metadata": {}, + "source": [ + "# Linear regression example from Section 2 of the manuscript" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "42ecaa12", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "np.random.seed(0)\n", + "n = 100\n", + "p = 3\n", + "X = np.random.normal(0, 1, (n, p))\n", + "theta = np.random.normal(0, 1, p)\n", + "sigma = np.random.gamma(2, 2)\n", + "y = np.random.normal(X @ theta, sigma)\n", + "\n", + "print(f\"coefficients: {theta}\")\n", + "print(f\"observation noise scale: {sigma}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "07069f61", + "metadata": {}, + "outputs": [], + "source": [ + "import cmdstanpy\n", + "\n", + "model = cmdstanpy.CmdStanModel(stan_file=\"linear.stan\")\n", + "fit = model.sample(data={\"n\": n, \"p\": p, \"X\": X, \"y\": y}, seed=0)\n", + "\n", + "print(fit.summary())" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/linear/linear.md b/linear/linear.md new file mode 100644 index 0000000..cd13679 --- /dev/null +++ b/linear/linear.md @@ -0,0 +1,38 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + 
jupytext_version: 1.15.1
+kernelspec:
+  display_name: Python 3 (ipykernel)
+  language: python
+  name: python3
+---
+
+# Linear regression example from Section 2 of the manuscript
+
+```{code-cell} ipython3
+import numpy as np
+
+np.random.seed(0)
+n = 100
+p = 3
+X = np.random.normal(0, 1, (n, p))
+theta = np.random.normal(0, 1, p)
+sigma = np.random.gamma(2, 2)
+y = np.random.normal(X @ theta, sigma)
+
+print(f"coefficients: {theta}")
+print(f"observation noise scale: {sigma}")
+```
+
+```{code-cell} ipython3
+import cmdstanpy
+
+model = cmdstanpy.CmdStanModel(stan_file="linear.stan")
+fit = model.sample(data={"n": n, "p": p, "X": X, "y": y}, seed=0)
+
+print(fit.summary())
+```

diff --git a/linear/linear.stan b/linear/linear.stan
new file mode 100644
index 0000000..a691346
--- /dev/null
+++ b/linear/linear.stan
@@ -0,0 +1,18 @@
+// Text file "linear.stan" containing the model definition.
+
+data {
+    int n, p;
+    matrix [n, p] X;
+    vector[n] y;
+}
+
+parameters {
+    vector[p] theta;
+    real<lower=0> sigma;
+}
+
+model {
+    theta ~ normal(0, 1);
+    sigma ~ gamma(2, 2);
+    y ~ normal(X * theta, sigma);
+}

diff --git a/recipe.py b/recipe.py
index b3a24f5..2755a4f 100644
--- a/recipe.py
+++ b/recipe.py
@@ -93,12 +93,14 @@ def create_profile_task(
 
 # Run the notebooks to generate figures.
 figures = []
-for example in ["kernels", "padding", "profile", "trees", "tube"]:
+for example in ["kernels", "linear", "padding", "profile", "trees", "tube"]:
     ipynb = Path(example, f"{example}.ipynb")
     md = ipynb.with_suffix(".md")
     create_task(f"{example}:nb", dependencies=[md], targets=[ipynb],
                 action=f"jupytext --to notebook {md}")
-    targets = [ipynb.with_suffix(".png"), ipynb.with_suffix(".html")]
+    targets = [ipynb.with_suffix(".html")]
+    if example != "linear":
+        targets.append(ipynb.with_suffix(".png"))
     task = create_task(
         f"{example}:fig", dependencies=[ipynb], targets=targets,
         action=f"jupyter nbconvert --to=html --execute --ExecutePreprocessor.timeout=-1 {ipynb}"

From 81d1f6a8d8caccc2a8a96da433c391c06ec32dca Mon Sep 17 00:00:00 2001
From: Till Hoffmann
Date: Thu, 14 Dec 2023 15:29:42 +0100
Subject: [PATCH 5/9] Update notes regarding `*.ipynb` notebooks.

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index e895b7d..b762b4f 100644
--- a/README.md
+++ b/README.md
@@ -33,7 +33,7 @@ To ensure the reproducibility of these materials, the results are also computed
 
 ## Running the experiments
 
-Figures in the manuscript were generated using the containerized runtime, and all runtime estimates below are based on a 2020 Macbook Pro with M1 chip and 16 GB of memory running macOS 13.4 (22F66). Figures are generated by executing Jupyter notebooks stored in markdown format; the notebooks can be opened directly in a standard Jupyter environment using the [`jupytext`](https://jupytext.readthedocs.io/en/latest/) extension. If you prefer, the folder for each experiment also contains a corresponding `*.ipynb` file. All figures can be reproduced by running `[./in-docker.sh] cook exec figures` (see below for details).
+Figures in the manuscript were generated using the containerized runtime, and all runtime estimates below are based on a 2020 Macbook Pro with M1 chip and 16 GB of memory running macOS 13.4 (22F66). All figures can be reproduced by running `[./in-docker.sh] cook exec figures` (see below for details). 
Figures are generated by executing Jupyter notebooks stored in markdown format; the notebooks can be opened directly in a standard Jupyter environment using the [`jupytext`](https://jupytext.readthedocs.io/en/latest/) extension. If you prefer, the folder for each experiment also contains a corresponding `*.ipynb` file. To open and use the notebooks, please set up a local computing environment as described above. ### Applications From 9eaf53ebbceef7eb49df07381de2931ca3873e8b Mon Sep 17 00:00:00 2001 From: Till Hoffmann Date: Thu, 14 Dec 2023 15:30:08 +0100 Subject: [PATCH 6/9] Ignore `playground` folder. --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 38cf578..91cb84e 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,7 @@ __pycache__ linear/linear padding/exact padding/padded +playground profile/fourier_centered profile/fourier_non_centered profile/graph_centered From e6bcdb5f5871ff323b7da7baf6d0559b93efe190 Mon Sep 17 00:00:00 2001 From: Till Hoffmann Date: Sat, 16 Dec 2023 20:54:36 +0100 Subject: [PATCH 7/9] Add getting started for Python and R. --- .github/workflows/main.yaml | 4 +++ Dockerfile | 10 +++++- README.md | 25 ++++++++++----- getting_started/.gitignore | 1 + getting_started/getting_started.Rmd | 32 +++++++++++++++++++ getting_started/getting_started.ipynb | 44 +++++++++++++++++++++++++++ getting_started/getting_started.md | 31 +++++++++++++++++++ getting_started/getting_started.stan | 22 ++++++++++++++ recipe.py | 33 ++++++++++++++++---- setup.R | 3 ++ 10 files changed, 190 insertions(+), 15 deletions(-) create mode 100644 getting_started/.gitignore create mode 100644 getting_started/getting_started.Rmd create mode 100644 getting_started/getting_started.ipynb create mode 100644 getting_started/getting_started.md create mode 100644 getting_started/getting_started.stan create mode 100644 setup.R diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 0e4a102..76e2cce 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -17,6 +17,10 @@ jobs: run: docker build -t gptools . - name: Verify notebooks are up to date. run: docker run --rm -v `pwd`:/workdir gptools pytest -v + - name: Render the getting started notebook for Python. + run: docker run --rm -v `pwd`:/workdir gptools cook exec getting_started:run + - name: Render the getting started Rmarkdown for R. + run: docker run --rm -v `pwd`:/workdir gptools cook exec getting_started_R:run - name: Generate figures (using ./in-docker.sh but we can't use `-it` in the Action). run: docker run --rm -e FAST=true -v `pwd`:/workdir gptools cook exec figures - name: Upload figures and reports as artifacts. diff --git a/Dockerfile b/Dockerfile index c7532f8..f233f0f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,14 @@ FROM python:3.10 +# Install R and dependencies. +RUN apt-get update && apt-get install -y \ + r-base \ + r-cran-devtools \ + && rm -rf /var/lib/apt/lists/* WORKDIR /workdir +COPY setup.R . +RUN Rscript setup.R + +# Install Python dependencies and compile cmdstan. COPY requirements.txt . 
RUN pip install --no-cache-dir -r requirements.txt
 RUN python -m cmdstanpy.install_cmdstan --verbose --version 2.33.0
-
diff --git a/README.md b/README.md
index b762b4f..435e83b 100644
--- a/README.md
+++ b/README.md
@@ -31,28 +31,37 @@ If you switch between containerized and local runtime, you may need to remove co
 
 To ensure the reproducibility of these materials, the results are also computed as the output of a GitHub Action workflow [![Reproduction Materials](https://github.com/onnela-lab/gptools-reproduction-material/actions/workflows/main.yaml/badge.svg)](https://github.com/onnela-lab/gptools-reproduction-material/actions/workflows/main.yaml) with the `FAST=true` flag. Figures can be obtained by selecting a workflow run and downloading the `figures-reports` artifact.
 
+## Getting started
+
+The `getting_started` folder contains a Python notebook and Rmarkdown file to reproduce the results of the "Getting started" sections in the accompanying manuscript. HTML reports can be generated in the `getting_started` folder by running
+
+- `[./in-docker.sh] cook exec getting_started:run` for Python
+- `[./in-docker.sh] cook exec getting_started_R:run` for R
+
+However, the folder is best suited to interactive exploration and experimentation to get familiar with the package.
+
 ## Running the experiments
 
 Figures in the manuscript were generated using the containerized runtime, and all runtime estimates below are based on a 2020 Macbook Pro with M1 chip and 16 GB of memory running macOS 13.4 (22F66). All figures can be reproduced by running `[./in-docker.sh] cook exec figures` (see below for details). Figures are generated by executing Jupyter notebooks stored in markdown format; the notebooks can be opened directly in a standard Jupyter environment using the [`jupytext`](https://jupytext.readthedocs.io/en/latest/) extension. If you prefer, the folder for each experiment also contains a corresponding `*.ipynb` file. To open and use the notebooks, please set up a local computing environment as described above.
 
 ### Applications
 
-The folders `trees` and `tube` contain code and data to reproduce the two applied examples in the manuscript. Each example takes about five minutes to run. The figures can be reproduced by running `[./in-docker.sh] cook exec tube:fig trees:fig` and will be saved in the corresponding folder as png and pdf files.
+The folders `trees` and `tube` contain code and data to reproduce the two applied examples in the manuscript. Each example takes about five minutes to run. The figures can be reproduced by running `[./in-docker.sh] cook exec tube:run trees:run` and will be saved in the corresponding folder as png and pdf files.
 
 ### Profiling experiments
 
-The folder `profile` contains code to reproduce profiling experiments, and running all experiments can take up to ten hours. The profiling figure can be reproduced by running `[./in-docker.sh] cook exec profile:fig`. If a reduced runtime (but more noisy results) are desired, run `FAST=true cook exec profile:fig` which takes about 90 minutes.
+The folder `profile` contains code to reproduce profiling experiments, and running all experiments can take up to ten hours. The profiling figure can be reproduced by running `[./in-docker.sh] cook exec profile:run`. If a reduced runtime (but noisier results) is desired, run `FAST=true cook exec profile:run`, which takes about 90 minutes. 
All experiments are seeded for reproducibility, but profiling experiments are subject to variability due to different hardware and competing processes running on the same machine. Despite seeding, results may also vary depending on the operating system and stdlib implementation.
 
 ### Kernel properties and effect of padding
 
-The folders `kernels` and `padding` contain code to reproduce figures on the properties of different kernels and their spectral properties as well as the effect of padding on Fourier methods, respectively. The figures can be reproduced by running `[./in-docker.sh] cook exec kernels:fig padding:fig`; the latter takes about ten minutes to generate.
+The folders `kernels` and `padding` contain code to reproduce figures on the properties and spectra of different kernels, as well as the effect of padding on Fourier methods, respectively. The figures can be reproduced by running `[./in-docker.sh] cook exec kernels:run padding:run`; the latter takes about ten minutes to generate.
 
 ## Expected results
 
-- `[./in-docker.sh] cook exec kernels:fig` ![](kernels/kernels.png)
-- `[./in-docker.sh] cook exec padding:fig` ![](padding/padding.png)
-- `[./in-docker.sh] cook exec profile:fig` ![](profile/profile.png)
-- `[./in-docker.sh] cook exec trees:fig` ![](trees/trees.png)
-- `[./in-docker.sh] cook exec tube:fig` ![](tube/tube.png)
+- `[./in-docker.sh] cook exec kernels:run` ![](kernels/kernels.png)
+- `[./in-docker.sh] cook exec padding:run` ![](padding/padding.png)
+- `[./in-docker.sh] cook exec profile:run` ![](profile/profile.png)
+- `[./in-docker.sh] cook exec trees:run` ![](trees/trees.png)
+- `[./in-docker.sh] cook exec tube:run` ![](tube/tube.png)
diff --git a/getting_started/.gitignore b/getting_started/.gitignore
new file mode 100644
index 0000000..f3b4e27
--- /dev/null
+++ b/getting_started/.gitignore
@@ -0,0 +1 @@
+getting_started
diff --git a/getting_started/getting_started.Rmd b/getting_started/getting_started.Rmd
new file mode 100644
index 0000000..67718ae
--- /dev/null
+++ b/getting_started/getting_started.Rmd
@@ -0,0 +1,32 @@
+Install the packages. We explicitly specify the repos here so we don't get asked for the mirror when
+rendering the Rmarkdown.
+
+```{r}
+install.packages(
+    "cmdstanr",
+    repos = c("https://mc-stan.org/r-packages/", "http://cran.us.r-project.org")
+)
+install.packages("gptoolsStan", repos=c("http://cran.us.r-project.org"))
+```
+
+Compile and run the model.
+
+```{r}
+library(cmdstanr)
+library(gptoolsStan)
+
+model <- cmdstan_model(
+    stan_file="getting_started.stan",
+    include_paths=gptools_include_path()
+)
+fit <- model$sample(
+    data=list(n=100, sigma=1, length_scale=0.1, period=1),
+    chains=1,
+    iter_warmup=500,
+    iter_sampling=50
+)
+f <- fit$draws("f")
+dim(f)
+```
+
+Expected output: `[1] 50 1 100`
diff --git a/getting_started/getting_started.ipynb b/getting_started/getting_started.ipynb
new file mode 100644
index 0000000..8e7f66a
--- /dev/null
+++ b/getting_started/getting_started.ipynb
@@ -0,0 +1,44 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d2b9dac2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    ">>> import cmdstanpy\n",
+    ">>> from gptools.stan import get_include\n",
+    ">>>\n",
+    ">>> model = cmdstanpy.CmdStanModel(\n",
+    "...     stan_file=\"getting_started.stan\",\n",
+    "...     stanc_options={\"include-paths\": get_include()},\n",
+    "... )\n",
+    ">>> fit = model.sample(\n",
+    "...     data = {\"n\": 100, \"sigma\": 1, \"length_scale\": 0.1, \"period\": 1},\n",
+    "...     chains=1,\n",
+    "...     iter_warmup=500,\n",
+    "...     iter_sampling=50,\n",
+    "... )\n",
+    ">>> fit.f.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "34ffcd17",
+   "metadata": {},
+   "source": [
+    "Expected output: `(50, 100)`"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/getting_started/getting_started.md b/getting_started/getting_started.md
new file mode 100644
index 0000000..293064e
--- /dev/null
+++ b/getting_started/getting_started.md
@@ -0,0 +1,31 @@
+---
+jupytext:
+  text_representation:
+    extension: .md
+    format_name: myst
+    format_version: 0.13
+    jupytext_version: 1.15.1
+kernelspec:
+  display_name: Python 3 (ipykernel)
+  language: python
+  name: python3
+---
+
+```{code-cell} ipython3
+>>> import cmdstanpy
+>>> from gptools.stan import get_include
+>>>
+>>> model = cmdstanpy.CmdStanModel(
+...     stan_file="getting_started.stan",
+...     stanc_options={"include-paths": get_include()},
+... )
+>>> fit = model.sample(
+...     data = {"n": 100, "sigma": 1, "length_scale": 0.1, "period": 1},
+...     chains=1,
+...     iter_warmup=500,
+...     iter_sampling=50,
+... )
+>>> fit.f.shape
+```
+
+Expected output: `(50, 100)`
diff --git a/getting_started/getting_started.stan b/getting_started/getting_started.stan
new file mode 100644
index 0000000..892d600
--- /dev/null
+++ b/getting_started/getting_started.stan
@@ -0,0 +1,22 @@
+functions {
+    #include gptools/util.stan
+    #include gptools/fft.stan
+}
+
+data {
+    int n;
+    real sigma, length_scale, period;
+}
+
+transformed data {
+    vector [n %/% 2 + 1] cov_rfft =
+        gp_periodic_exp_quad_cov_rfft(n, sigma, length_scale, period) + 1e-9;
+}
+
+parameters {
+    vector [n] f;
+}
+
+model {
+    f ~ gp_rfft(zeros_vector(n), cov_rfft);
+}
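As an aside on the model above: the `cov_rfft` vector is the real fast Fourier transform of the covariance kernel evaluated on the regular grid, i.e., the eigenvalues of the circulant covariance matrix, and the `1e-9` jitter guards against numerically negative eigenvalues. A minimal R sketch of the idea, using a wrapped-distance squared-exponential kernel as a rough stand-in for the exact `gp_periodic_exp_quad_cov_rfft` implementation:

```r
# Sketch only: illustrates what a `cov_rfft` vector represents; the wrapped
# minimum-distance kernel below is an approximation, not the gptools kernel.
n <- 100; sigma <- 1; length_scale <- 0.1; period <- 1
x <- (0:(n - 1)) * period / n
d <- pmin(x, period - x)  # distance on the circle
row <- sigma^2 * exp(-d^2 / (2 * length_scale^2))  # first row of the covariance matrix
cov_rfft <- Re(fft(row))[1:(n %/% 2 + 1)]  # FFT of a circulant row gives its eigenvalues
length(cov_rfft)  # 51, matching `vector [n %/% 2 + 1]` in the Stan file
```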
diff --git a/recipe.py b/recipe.py
index 2755a4f..f4593aa 100644
--- a/recipe.py
+++ b/recipe.py
@@ -91,28 +91,49 @@ def create_profile_task(
 
 create_profile_task("sample", "fourier_non_centered", 0, 10_000, timeout=300)
 
-# Run the notebooks to generate figures.
+# Run the notebooks to generate figures (the booleans indicate whether a figure should be generated).
 figures = []
-for example in ["kernels", "linear", "padding", "profile", "trees", "tube"]:
+examples = {
+    "getting_started": False,
+    "kernels": True,
+    "linear": False,
+    "padding": True,
+    "profile": True,
+    "trees": True,
+    "tube": True,
+}
+for example, has_figure in examples.items():
     ipynb = Path(example, f"{example}.ipynb")
     md = ipynb.with_suffix(".md")
     create_task(f"{example}:nb", dependencies=[md], targets=[ipynb],
                 action=f"jupytext --to notebook {md}")
     targets = [ipynb.with_suffix(".html")]
-    if example != "linear":
-        targets.append(ipynb.with_suffix(".png"))
+    if has_figure:
+        figure = ipynb.with_suffix(".png")
+        targets.append(figure)
+        figures.append(figure)
     task = create_task(
-        f"{example}:fig", dependencies=[ipynb], targets=targets,
+        f"{example}:run", dependencies=[ipynb], targets=targets,
         action=f"jupyter nbconvert --to=html --execute --ExecutePreprocessor.timeout=-1 {ipynb}"
     )
     if example == "profile":
         task.task_dependencies.append(profile_group.task)
-    figures.append(targets[0])
 
 # Task that reproduces all outputs.
 create_task("figures", dependencies=figures)
 
+# Add the R example for getting started.
+rmd = "getting_started/getting_started.Rmd"
+html = Path("getting_started/getting_started_R.html")
+action = [
+    "Rscript",
+    "-e",
+    f"rmarkdown::render('{rmd}', output_file = '{html.name}', output_dir='getting_started')"
+]
+create_task(name="getting_started_R:run", dependencies=[rmd], targets=[html],
+            action=action)
+
 
 def delete_compiled_stan_files(_: Task) -> None:
     # Find all Stan files and remove compiled versions if they exist.
diff --git a/setup.R b/setup.R
new file mode 100644
index 0000000..c388588
--- /dev/null
+++ b/setup.R
@@ -0,0 +1,3 @@
+repos <- c("https://mc-stan.org/r-packages/", "http://cran.us.r-project.org")
+install.packages("cmdstanr", repos=repos)
+devtools::install_github("onnela-lab/gptoolsStan")

From 7e8accb109dea71d56b1f1b8ae09675762746a00 Mon Sep 17 00:00:00 2001
From: Till Hoffmann
Date: Wed, 20 Dec 2023 16:21:55 +0100
Subject: [PATCH 8/9] Ignore R history.

---
 .gitignore | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 91cb84e..6fbf04f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,13 +1,14 @@
+__pycache__
 .cook
 .DS_Store
 .ipynb_checkpoints
+.Rhistory
 *.hpp
 *.html
 *.o
 *.pdf
 *.pkl
 *.tmp
-__pycache__
 linear/linear
 padding/exact
 padding/padded

From cdd4d44c38c14b96b4e467c82562f22f82ccb9e5 Mon Sep 17 00:00:00 2001
From: Till Hoffmann
Date: Wed, 20 Dec 2023 16:29:05 +0100
Subject: [PATCH 9/9] Install gptoolsStan from CRAN.

---
 setup.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.R b/setup.R
index c388588..939bf87 100644
--- a/setup.R
+++ b/setup.R
@@ -1,3 +1,3 @@
 repos <- c("https://mc-stan.org/r-packages/", "http://cran.us.r-project.org")
 install.packages("cmdstanr", repos=repos)
-devtools::install_github("onnela-lab/gptoolsStan")
+install.packages("gptoolsStan", repos=repos)
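After this final patch, the R setup can be smoke-tested end to end. A minimal sketch (it assumes the CRAN release of gptoolsStan exposes `gptools_include_path()`, as used in `getting_started.Rmd` above, and that cmdstan is already installed for cmdstanr):

```r
# Smoke test for setup.R (sketch; run from the repository root).
library(cmdstanr)
library(gptoolsStan)

# The include path must exist for the `#include gptools/...` statements to resolve.
stopifnot(dir.exists(gptools_include_path()))

# Compiling the getting-started model exercises both packages end to end.
model <- cmdstan_model(
    stan_file = "getting_started/getting_started.stan",
    include_paths = gptools_include_path()
)
```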