From de0ddb60bf95af994d156ad62c2c77f03ea74b14 Mon Sep 17 00:00:00 2001 From: Sam Date: Wed, 4 Sep 2024 17:11:04 +0100 Subject: [PATCH 01/11] start by improving the stan vignette --- vignettes/using-stan-tools.Rmd | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/vignettes/using-stan-tools.Rmd b/vignettes/using-stan-tools.Rmd index a3bc78b..0c1b81c 100644 --- a/vignettes/using-stan-tools.Rmd +++ b/vignettes/using-stan-tools.Rmd @@ -34,6 +34,7 @@ In this vignette, we'll explore how to use the `primarycensoreddist` package in Stan is a probabilistic programming language for statistical inference. It provides a flexible and efficient platform for Bayesian modeling and is widely used in various fields of data science and statistics. In this vignette, we'll use Stan in conjunction with `primarycensoreddist` to perform Bayesian inference on censored data. For more information on Stan: + - [Stan's official website](https://mc-stan.org/) - [Stan documentation](https://mc-stan.org/users/documentation/) - [Stan forums](https://discourse.mc-stan.org/) for community support and discussions @@ -108,13 +109,15 @@ writeLines( text = c( "functions {", "#include expgrowth.stan", + "}", + "generated quantities {", + " real y = expgrowth_rng(0, 1, 0.4);", "}" ), con = expgrowth_stan_file ) ``` -} -``` + We can now use this file to compile a model. **Note** that we need to include the path to the `primarycensoreddist` Stan functions using the `include_paths` argument to `cmdstan_model()`. ```{r} @@ -122,11 +125,18 @@ model <- cmdstan_model(expgrowth_stan_file, include_paths = pcd_stan_path()) model ``` +We can then sample from the model (we set `fixed_param = TRUE` here as our toy example doesn't require MCMC sampling). + +```{r} +samples <- model$sample(chains = 1, fixed_param = TRUE) +samples +``` + ## Using Stan functions directly in R Whilst it is possible to use Stan functions directly in R it is not recommended for most use cases (use the R functions in `primarycensoreddist` instead). However, it can be useful to understand what is going on under the hood or for exploration (indeed we use this internally in `primarycensoreddist` to check our functions against the R implementations). To do this we use the `expose_functions()` method on our already compiled model. -```{r} +```{r, message = FALSE} model$expose_functions(global = TRUE) ``` From 47479762340929f343797eafe2d93844b91da7c5 Mon Sep 17 00:00:00 2001 From: Sam Date: Wed, 4 Sep 2024 18:04:23 +0100 Subject: [PATCH 02/11] add data simulatio --- DESCRIPTION | 2 + vignettes/fitting-dists-with-stan.Rmd | 119 ++++++++++++++++++++++++++ 2 files changed, 121 insertions(+) diff --git a/DESCRIPTION b/DESCRIPTION index 2296376..7c45a78 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -20,11 +20,13 @@ Depends: Suggests: bookdown, cmdstanr, + dplyr, knitr, ggplot2, rmarkdown, spelling, testthat (>= 3.1.9), + tidybayes, usethis Additional_repositories: https://stan-dev.r-universe.dev diff --git a/vignettes/fitting-dists-with-stan.Rmd b/vignettes/fitting-dists-with-stan.Rmd index 6b4a1b0..b78e781 100644 --- a/vignettes/fitting-dists-with-stan.Rmd +++ b/vignettes/fitting-dists-with-stan.Rmd @@ -16,3 +16,122 @@ vignette: > %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- + +# Introduction + +## What are we going to do in this Vignette + +In this vignette, we'll demonstrate how to use `primarycensoreddist` in conjunction with Stan for Bayesian inference of epidemiological delay distributions. We'll cover the following key points: + +1. Simulating censored delay distribution data +2. Fitting a naive model using cmdstan +3. Evaluating the naive model's performance +4. Fitting an improved model using `primarycensoreddist` functionality +5. Comparing the `primarycensoreddist` model's performance to the naive model + +## What might I need to know before starting + +This vignette builds on the concepts introduced in the [Getting Started with primarycensoreddist](primarycensoreddist.html) vignette and assumes familiarity with using Stan tools as covered in the [How to use primarycensoreddist with Stan](using-stan-tools.html) vignette. + +## Packages used in this vignette + +Alongside the `primarycensoreddist` package we will use the `cmdstanr` package for interfacing with cmdstan and the `tidybayes` package for posterior analysis. We will also use the `ggplot2` package for plotting. + +```{r setup, message = FALSE} +library(primarycensoreddist) +library(cmdstanr) +library(tidybayes) +library(ggplot2) +``` + +# Simulating censored and truncated delay distribution data + +We'll start by simulating some censored and truncated delay distribution data. We'll use the `rprimarycensoreddist` function (actually we will use the `rpcens ` alias for brevity). + +```{r sample-lognormal} +set.seed(123) # For reproducibility + +# Define the true distribution parameters +n <- 1000 +meanlog <- 1.5 +sdlog <- 0.75 + +# Generate varying pwindow, swindow, and obs_time lengths +pwindows <- sample(1:5, n, replace = TRUE) +swindows <- sample(1:5, n, replace = TRUE) +obs_times <- sample(8:12, n, replace = TRUE) + +# Function to generate a single sample +generate_sample <- function(pwindow, swindow, obs_time) { + rpcens( + 1, rlnorm, + meanlog = meanlog, sdlog = sdlog, + pwindow = pwindow, swindow = swindow, D = obs_time + ) +} + +# Generate samples +samples <- mapply(generate_sample, pwindows, swindows, obs_times) + +# Create initial data frame +delay_data <- data.frame( + pwindow = pwindows, + swindow = swindows, + obs_time = obs_times, + observed_delay = samples +) + +head(delay_data) + +# Aggregate to unique combinations and count occurrences +# Aggregate to unique combinations and count occurrences +delay_counts <- delay_data |> + summarise(n = n(), .by = c(pwindow, swindow, obs_time, observed_delay)) + +head(delay_counts) + +# Compare the samples with and without secondary censoring to the true +# distribution +# Calculate empirical CDF +empirical_cdf <- ecdf(samples) + +# Create a sequence of x values for the theoretical CDF +x_seq <- seq(min(samples), max(samples), length.out = 100) + +# Calculate theoretical CDF +theoretical_cdf <- plnorm(x_seq, meanlog = meanlog, sdlog = sdlog) + +# Create a long format data frame for plotting +cdf_data <- data.frame( + x = rep(x_seq, 2), + probability = c(empirical_cdf(x_seq), theoretical_cdf), + type = rep(c("Observed", "Theoretical"), each = length(x_seq)) +) + +# Plot +ggplot(cdf_data, aes(x = x, y = probability, color = type)) + + geom_step(linewidth = 1) + + scale_color_manual( + values = c(Observed = "#4292C6", Theoretical = "#252525") + ) + + geom_vline( + aes(xintercept = mean(samples), color = "Observed"), + linetype = "dashed", linewidth = 1 + ) + + geom_vline( + aes(xintercept = exp(meanlog + sdlog^2 / 2), color = "Theoretical"), + linetype = "dashed", linewidth = 1 + ) + + labs( + title = "Comparison of Observed vs Theoretical CDF", + x = "Delay", + y = "Cumulative Probability", + color = "CDF Type" + ) + + theme_bw() + + theme( + panel.grid.minor = element_blank(), + plot.title = element_text(hjust = 0.5), + legend.position = "bottom" + ) +``` From 260be4682ce311fc8f11489f7f6484b6d7c614e2 Mon Sep 17 00:00:00 2001 From: Sam Date: Wed, 4 Sep 2024 19:39:55 +0100 Subject: [PATCH 03/11] first working version --- NEWS.md | 1 + vignettes/fitting-dists-with-stan.Rmd | 141 +++++++++++++++++++++++++- 2 files changed, 139 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index 21eb1c5..5e3b21d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -10,6 +10,7 @@ * Added a getting started vignette. * Added a vignette showcasing how to use the package Stan code with `cmdstanr`. +* Added a vignette showcasing how to fit distributions using the `cmdstanr` package. # primarycensoreddist 0.1.0 diff --git a/vignettes/fitting-dists-with-stan.Rmd b/vignettes/fitting-dists-with-stan.Rmd index b78e781..be7045a 100644 --- a/vignettes/fitting-dists-with-stan.Rmd +++ b/vignettes/fitting-dists-with-stan.Rmd @@ -57,9 +57,9 @@ meanlog <- 1.5 sdlog <- 0.75 # Generate varying pwindow, swindow, and obs_time lengths -pwindows <- sample(1:5, n, replace = TRUE) -swindows <- sample(1:5, n, replace = TRUE) -obs_times <- sample(8:12, n, replace = TRUE) +pwindows <- sample(1, n, replace = TRUE) +swindows <- sample(1, n, replace = TRUE) +obs_times <- 10 # Function to generate a single sample generate_sample <- function(pwindow, swindow, obs_time) { @@ -135,3 +135,138 @@ ggplot(cdf_data, aes(x = x, y = probability, color = type)) + legend.position = "bottom" ) ``` + +We've aggregated the data to unique combinations of `pwindow`, `swindow`, and `obs_time` and counted the number of occurrences of each `observed_delay` for each combination. This is the data we will use to fit our model. + +# Fitting a naive model using cmdstan + +We'll start by fitting a naive model using cmdstan. We'll use the `cmdstanr` package to interface with cmdstan. We define the model in a string and then write it to a file as in the [How to use primarycensoreddist with Stan](using-stan-tools.html) vignette. + +```{r naive-model} +writeLines( + "data { + int N; // number of observations + vector[N] y; // observed delays + vector[N] n; // number of occurrences for each delay + } + parameters { + real mu; + real sigma; + } + model { + // Priors + mu ~ normal(1, 2); + sigma ~ normal(0.5, 1); + + // Likelihood + target += n .* lognormal_lpdf(y | mu, sigma); + }", + con = file.path(tempdir(), "naive_lognormal.stan") +) +``` + +Now let's compile the model + +```{r fit-naive-model} +naive_model <- cmdstan_model(file.path(tempdir(), "naive_lognormal.stan")) +naive_model +``` + +aand now let's fit the compiledmodel. + +```{r fit-naive-model} +naive_fit <- naive_model$sample( + data = list( + # Add a small constant to avoid log(0) + y = delay_counts$observed_delay + 1e-6, + n = delay_counts$n, + N = nrow(delay_counts) + ), + chains = 4, + parallel_chains = 4, + show_messages = FALSE +) +naive_fit +``` + +We see that the model has converged and the diagnostics look good. However, just from the model posterior summary we see that we might not be very happy with the fit. `mu` is smaller than the target `r meanlog` and `sigma` is larger than the target `r sdlog`. + +# Fitting an improved model using primarycensoreddist + +We'll now fit an improved model using the `primarycensoreddist` package. The main improvement is that we will use the `primary_censored_dist_lpdf` function to fit the model. This is the Stan version of the `pcens()` function and accounts for the primary and secondary censoring windows as well as the truncation. We encode that our primary distribution is a lognormal distribution by passing 1 as the `dist_id` parameter and that our primary event distribution is uniform by passing 1 as the `primary_dist_id` parameter. See the [Stan documentation](https://primarycensoreddist.epinowcast.org/stan/primary__censored__dist_8stan.html#acc97240dee1bc19e4f02013118f3857d) for more details on the `primary_censored_dist_lpdf` function. + +```{r pimarycensoreddist-model} +writeLines( + " + functions { + #include primary_censored_dist.stan + #include expgrowth.stan + } + data { + int N; // number of observations + array[N] int y; // observed delays + array[N] int n; // number of occurrences for each delay + array[N] int pwindow; // primary censoring window + array[N] int swindow; // secondary censoring window + array[N] int D; // maximum delay + } + transformed data { + array[0] real primary_params; + } + parameters { + real mu; + real sigma; + } + model { + // Priors + mu ~ normal(1, 2); + sigma ~ normal(0.5, 1); + + // Likelihood + for (i in 1:N) { + target += n[i] * primary_censored_dist_lpmf( + y[i] | 1, {mu, sigma}, + pwindow[i], swindow[i], D[i], + 1, primary_params + ); + } + }", + con = file.path(tempdir(), "primarycensoreddist_lognormal.stan") +) +``` + +Now let's compile the model + +```{r compile-primarycensoreddist-model, message = FALSE} +primarycensoreddist_model <- cmdstan_model( + file.path(tempdir(), "primarycensoreddist_lognormal.stan"), + include_paths = pcd_stan_path() +) +primarycensoreddist_model +``` + +Now let's fit the compiled model. + +```{r fit-primarycensoreddist-model} +primarycensoreddist_fit <- primarycensoreddist_model$sample( + data = list( + y = delay_counts$observed_delay, + n = delay_counts$n, + pwindow = delay_counts$pwindow, + swindow = delay_counts$swindow, + D = delay_counts$obs_time, + N = nrow(delay_counts) + ), + chains = 4, + parallel_chains = 4, + init = list( + list(mu = 1, sigma = 0.5), + list(mu = 1, sigma = 0.5), + list(mu = 1, sigma = 0.5), + list(mu = 1, sigma = 0.5) + ) +) +primarycensoreddist_fit +``` + +We see that the model has converged and the diagnostics look good. We also see that the posterior means are much closer to the true parameters. From 4976bd3e294dd5c13846f51b3b9ae99288f6aadc Mon Sep 17 00:00:00 2001 From: Sam Date: Wed, 4 Sep 2024 21:22:54 +0100 Subject: [PATCH 04/11] complete first draft --- DESCRIPTION | 1 - inst/stan/primary_censored_dist.stan | 3 +++ vignettes/fitting-dists-with-stan.Rmd | 23 ++++++++++++----------- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 7c45a78..dbd8c79 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -26,7 +26,6 @@ Suggests: rmarkdown, spelling, testthat (>= 3.1.9), - tidybayes, usethis Additional_repositories: https://stan-dev.r-universe.dev diff --git a/inst/stan/primary_censored_dist.stan b/inst/stan/primary_censored_dist.stan index 391e7ef..6cf5db1 100644 --- a/inst/stan/primary_censored_dist.stan +++ b/inst/stan/primary_censored_dist.stan @@ -126,6 +126,9 @@ real primary_censored_integrand(real x, real xc, array[] real theta, // Compute adjusted delay real d_adj = d - x; + if (d_adj <= 0) { + return 0; + } // Compute log probabilities real log_cdf = dist_lcdf(d_adj | params, dist_id); diff --git a/vignettes/fitting-dists-with-stan.Rmd b/vignettes/fitting-dists-with-stan.Rmd index be7045a..0166115 100644 --- a/vignettes/fitting-dists-with-stan.Rmd +++ b/vignettes/fitting-dists-with-stan.Rmd @@ -40,7 +40,6 @@ Alongside the `primarycensoreddist` package we will use the `cmdstanr` package f ```{r setup, message = FALSE} library(primarycensoreddist) library(cmdstanr) -library(tidybayes) library(ggplot2) ``` @@ -155,7 +154,7 @@ writeLines( } model { // Priors - mu ~ normal(1, 2); + mu ~ normal(1, 1); sigma ~ normal(0.5, 1); // Likelihood @@ -219,8 +218,8 @@ writeLines( } model { // Priors - mu ~ normal(1, 2); - sigma ~ normal(0.5, 1); + mu ~ normal(1, 1); + sigma ~ normal(0.5, 0.5); // Likelihood for (i in 1:N) { @@ -259,14 +258,16 @@ primarycensoreddist_fit <- primarycensoreddist_model$sample( ), chains = 4, parallel_chains = 4, - init = list( - list(mu = 1, sigma = 0.5), - list(mu = 1, sigma = 0.5), - list(mu = 1, sigma = 0.5), - list(mu = 1, sigma = 0.5) - ) + init = list( # we use this to resolve initialisation issues + list(mu = 1.5, sigma = 0.6), + list(mu = 1.5, sigma = 0.4), + list(mu = 1.5, sigma = 0.3), + list(mu = 1.5, sigma = 0.55) + ), + refresh = 0, + show_messages = FALSE ) primarycensoreddist_fit ``` -We see that the model has converged and the diagnostics look good. We also see that the posterior means are much closer to the true parameters. +We see that the model has converged and the diagnostics look good. We also see that the posterior means are very near the true parameters and the 90% credible intervals include the true parameters. From ae15bb5a60be38fad06857ff240f3b984136b43a Mon Sep 17 00:00:00 2001 From: Sam Date: Wed, 4 Sep 2024 21:34:24 +0100 Subject: [PATCH 05/11] update plot theme --- vignettes/fitting-dists-with-stan.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/fitting-dists-with-stan.Rmd b/vignettes/fitting-dists-with-stan.Rmd index 0166115..b4d856e 100644 --- a/vignettes/fitting-dists-with-stan.Rmd +++ b/vignettes/fitting-dists-with-stan.Rmd @@ -127,7 +127,7 @@ ggplot(cdf_data, aes(x = x, y = probability, color = type)) + y = "Cumulative Probability", color = "CDF Type" ) + - theme_bw() + + theme_minimal() + theme( panel.grid.minor = element_blank(), plot.title = element_text(hjust = 0.5), From b31e7d568e82d2e04e42b47e99085c91df35d309 Mon Sep 17 00:00:00 2001 From: Sam Date: Wed, 4 Sep 2024 21:36:49 +0100 Subject: [PATCH 06/11] solve lint issue --- vignettes/fitting-dists-with-stan.Rmd | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vignettes/fitting-dists-with-stan.Rmd b/vignettes/fitting-dists-with-stan.Rmd index b4d856e..f9cbe77 100644 --- a/vignettes/fitting-dists-with-stan.Rmd +++ b/vignettes/fitting-dists-with-stan.Rmd @@ -104,7 +104,8 @@ theoretical_cdf <- plnorm(x_seq, meanlog = meanlog, sdlog = sdlog) cdf_data <- data.frame( x = rep(x_seq, 2), probability = c(empirical_cdf(x_seq), theoretical_cdf), - type = rep(c("Observed", "Theoretical"), each = length(x_seq)) + type = rep(c("Observed", "Theoretical"), each = length(x_seq)), + stringsAsFactors = FALSE ) # Plot From 6dd22be5e1ac490c8296924aa99fb17c9514f0f9 Mon Sep 17 00:00:00 2001 From: Sam Date: Wed, 4 Sep 2024 21:52:37 +0100 Subject: [PATCH 07/11] resolve duplicate chunk label --- vignettes/fitting-dists-with-stan.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/fitting-dists-with-stan.Rmd b/vignettes/fitting-dists-with-stan.Rmd index f9cbe77..124b686 100644 --- a/vignettes/fitting-dists-with-stan.Rmd +++ b/vignettes/fitting-dists-with-stan.Rmd @@ -167,7 +167,7 @@ writeLines( Now let's compile the model -```{r fit-naive-model} +```{r compile-naive-model} naive_model <- cmdstan_model(file.path(tempdir(), "naive_lognormal.stan")) naive_model ``` From 061a9cd9f27ea344b0f0d9f52ac9b4d09ea4134d Mon Sep 17 00:00:00 2001 From: Sam Date: Wed, 4 Sep 2024 21:54:39 +0100 Subject: [PATCH 08/11] resolve duplicate chunk label --- vignettes/fitting-dists-with-fitdistr.Rmd | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vignettes/fitting-dists-with-fitdistr.Rmd b/vignettes/fitting-dists-with-fitdistr.Rmd index d779de9..d2591aa 100644 --- a/vignettes/fitting-dists-with-fitdistr.Rmd +++ b/vignettes/fitting-dists-with-fitdistr.Rmd @@ -16,3 +16,5 @@ vignette: > %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- + +*Under construction* \ No newline at end of file From 0f2f4c45dfe7040f5b7dedb74e938329da7932b2 Mon Sep 17 00:00:00 2001 From: Sam Date: Wed, 4 Sep 2024 21:56:46 +0100 Subject: [PATCH 09/11] resolve duplicate chunk label --- vignettes/fitting-dists-with-fitdistr.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/fitting-dists-with-fitdistr.Rmd b/vignettes/fitting-dists-with-fitdistr.Rmd index d2591aa..c181865 100644 --- a/vignettes/fitting-dists-with-fitdistr.Rmd +++ b/vignettes/fitting-dists-with-fitdistr.Rmd @@ -17,4 +17,4 @@ vignette: > %\VignetteEncoding{UTF-8} --- -*Under construction* \ No newline at end of file +*Under construction* From 6f4afa54106fdb4795ae4ed3468ce672a4879097 Mon Sep 17 00:00:00 2001 From: Sam Date: Wed, 4 Sep 2024 22:02:07 +0100 Subject: [PATCH 10/11] add in missing dplyr --- vignettes/fitting-dists-with-stan.Rmd | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vignettes/fitting-dists-with-stan.Rmd b/vignettes/fitting-dists-with-stan.Rmd index 124b686..0f39a7b 100644 --- a/vignettes/fitting-dists-with-stan.Rmd +++ b/vignettes/fitting-dists-with-stan.Rmd @@ -35,12 +35,13 @@ This vignette builds on the concepts introduced in the [Getting Started with pri ## Packages used in this vignette -Alongside the `primarycensoreddist` package we will use the `cmdstanr` package for interfacing with cmdstan and the `tidybayes` package for posterior analysis. We will also use the `ggplot2` package for plotting. +Alongside the `primarycensoreddist` package we will use the `cmdstanr` package for interfacing with cmdstananalysis. We will also use the `ggplot2` package for plotting and `dplyr` for data manipulation. ```{r setup, message = FALSE} library(primarycensoreddist) library(cmdstanr) library(ggplot2) +library(dplyr) ``` # Simulating censored and truncated delay distribution data From 325916d0eefb4a01af9558178f1d04f027bc61bf Mon Sep 17 00:00:00 2001 From: Sam Date: Wed, 4 Sep 2024 22:18:21 +0100 Subject: [PATCH 11/11] tidy up new vignette --- vignettes/fitting-dists-with-stan.Rmd | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/vignettes/fitting-dists-with-stan.Rmd b/vignettes/fitting-dists-with-stan.Rmd index 0f39a7b..3e3a01e 100644 --- a/vignettes/fitting-dists-with-stan.Rmd +++ b/vignettes/fitting-dists-with-stan.Rmd @@ -170,12 +170,11 @@ Now let's compile the model ```{r compile-naive-model} naive_model <- cmdstan_model(file.path(tempdir(), "naive_lognormal.stan")) -naive_model ``` -aand now let's fit the compiledmodel. +aand now let's fit the compiled model. -```{r fit-naive-model} +```{r fit-naive-model, message = FALSE} naive_fit <- naive_model$sample( data = list( # Add a small constant to avoid log(0) @@ -185,7 +184,8 @@ naive_fit <- naive_model$sample( ), chains = 4, parallel_chains = 4, - show_messages = FALSE + show_messages = FALSE, + refresh = 0 ) naive_fit ``` @@ -243,12 +243,11 @@ primarycensoreddist_model <- cmdstan_model( file.path(tempdir(), "primarycensoreddist_lognormal.stan"), include_paths = pcd_stan_path() ) -primarycensoreddist_model ``` Now let's fit the compiled model. -```{r fit-primarycensoreddist-model} +```{r fit-primarycensoreddist-model, message = FALSE} primarycensoreddist_fit <- primarycensoreddist_model$sample( data = list( y = delay_counts$observed_delay,