Issue 35 and 36: Package CmdStan model #67

Merged: 13 commits, Sep 16, 2024
1 change: 1 addition & 0 deletions .Rbuildignore
@@ -24,3 +24,4 @@
^CITATION\.cff$
^vignettes/using-stan-tools\.Rmd$
^vignettes/fitting-dists-with-stan\.Rmd$
^inst/pcens_model$
1 change: 1 addition & 0 deletions .gitignore
@@ -21,3 +21,4 @@ cache
.here
*.pdf
*.html
inst/pcens_model
2 changes: 2 additions & 0 deletions NAMESPACE
@@ -6,6 +6,8 @@ export(dexpgrowth)
export(dpcens)
export(dprimarycensoreddist)
export(fitdistdoublecens)
export(pcd_as_cmdstan_data)
export(pcd_cmdstan_model)
export(pcd_load_stan_functions)
export(pcd_stan_files)
export(pcd_stan_functions)
3 changes: 3 additions & 0 deletions NEWS.md
@@ -5,6 +5,9 @@
* Added a new function `fitdistdoublecens()` to allow fitting of double-censored and truncated data using the `fitdistrplus` package.
* Added low level tests for the Stan `primary_censored_ode` function.
* Rephrased the Stan code to use an ODE solver rather than a numerical integration method, which allows much faster and more stable computation of the likelihood.
* Added a `CmdStan` model for fitting distributions using the `cmdstanr` package.
* Added helper functions for working with the new `CmdStan` model and added an example to the vignette.
* Added parameter recovery tests for the new `CmdStan` model, which test the `primary_censored_dist_lpmf` function when used with NUTS-based fitting.

# primarycensoreddist 0.3.0

169 changes: 169 additions & 0 deletions R/pcd_cmdstan_model.R
@@ -0,0 +1,169 @@
#' Create a CmdStanModel with primarycensoreddist Stan functions
#'
#' This function creates a CmdStanModel object using the Stan model and
#' functions from primarycensoreddist and optionally includes additional
#' user-specified Stan files.
#'
#' @param include_paths Character vector of paths to include for Stan
#' compilation. Defaults to the result of `pcd_stan_path()`.
#'
#' @param ... Additional arguments passed to cmdstanr::cmdstan_model().
#'
#' @return A CmdStanModel object.
#'
#' @details
#' The underlying Stan model (`pcens_model.stan`) supports various features:
#' - Multiple probability distributions for modeling delays
#' - Primary and secondary censoring
#' - Truncation
#' - Optional use of reduce_sum for improved performance (via
#'   within-chain parallelism)
#' - Flexible prior specifications
#' - Optional computation of log-likelihood for model comparison
#'
#' @export
#' @family modelhelpers
#'
#' @examples
#' \dontrun{
#' model <- pcd_cmdstan_model()
#' fit <- model$sample(data = stan_data)
#' }
pcd_cmdstan_model <- function(
include_paths = primarycensoreddist::pcd_stan_path(), ...) {
if (!requireNamespace("cmdstanr", quietly = TRUE)) {
stop("Package 'cmdstanr' is required for this function but is not installed.")
}

pcd_stan_model <- system.file(
"pcens_model.stan",
package = "primarycensoreddist"
)

cmdstanr::cmdstan_model(
pcd_stan_model,
include_paths = include_paths,
...
)
}

#' Prepare data for primarycensoreddist Stan model
#'
#' This function takes in delay data and prepares it for use with the
#' primarycensoreddist Stan model.
#'
#' @param data A data frame containing the delay data.
#'
#' @param delay Column name for observed delays (default: "delay")
#'
#' @param delay_upper Column name for upper bound of delays
#' (default: "delay_upper")
#'
#' @param n Column name for count of observations (default: "n")
#'
#' @param pwindow Column name for primary window (default: "pwindow")
#'
#' @param relative_obs_time Column name for relative observation time
#' (default: "relative_obs_time")
#'
#' @param dist_id Integer identifying the delay distribution:
#' 1 = Lognormal, 2 = Gamma, 3 = Weibull, 4 = Exponential,
#' 5 = Generalized Gamma, 6 = Negative Binomial, 7 = Poisson,
#' 8 = Bernoulli, 9 = Beta, 10 = Binomial, 11 = Categorical, 12 = Cauchy,
#' 13 = Chi-square, 14 = Dirichlet, 15 = Gumbel, 16 = Inverse Gamma,
#' 17 = Logistic
#'
#' @param primary_dist_id Integer identifying the primary distribution:
#' 1 = Uniform, 2 = Exponential growth
#'
#' @param param_bounds A list with elements `lower` and `upper`, each a numeric
#' vector specifying bounds for the delay distribution parameters.
#'
#' @param primary_param_bounds A list with elements `lower` and `upper`, each a
#' numeric vector specifying bounds for the primary distribution parameters.
#'
#' @param priors A list with elements `location` and `scale`, each a numeric
#' vector specifying priors for the delay distribution parameters.
#'
#' @param primary_priors A list with elements `location` and `scale`, each a
#' numeric vector specifying priors for the primary distribution parameters.
#'
#' @param compute_log_lik Logical; compute log likelihood? (default: FALSE)
#'
#' @param use_reduce_sum Logical; use reduce_sum for performance?
#' (default: FALSE)
#'
#' @return A list containing the data formatted for use with
#' [pcd_cmdstan_model()]
#'
#' @export
#' @family modelhelpers
#'
#' @examples
#' \dontrun{
#' data <- data.frame(
#' delay = c(1, 2, 3),
#' delay_upper = c(2, 3, 4),
#' n = c(10, 20, 15),
#' pwindow = c(1, 1, 2),
#' relative_obs_time = c(10, 10, 10)
#' )
#' stan_data <- pcd_as_cmdstan_data(
#' data,
#' dist_id = 1,
#' primary_dist_id = 1,
#' param_bounds = list(lower = c(0, 0), upper = c(10, 10)),
#' primary_param_bounds = list(lower = numeric(0), upper = numeric(0)),
#' priors = list(location = c(1, 1), scale = c(1, 1)),
#' primary_priors = list(location = numeric(0), scale = numeric(0))
#' )
#' }
pcd_as_cmdstan_data <- function(
data, delay = "delay", delay_upper = "delay_upper",
n = "n", pwindow = "pwindow",
relative_obs_time = "relative_obs_time",
dist_id, primary_dist_id,
param_bounds, primary_param_bounds,
priors, primary_priors,
compute_log_lik = FALSE,
use_reduce_sum = FALSE) {
required_cols <- c(delay, delay_upper, n, pwindow, relative_obs_time)
missing_cols <- setdiff(required_cols, names(data))
if (length(missing_cols) > 0) {
stop(
"Missing required columns: ", toString(missing_cols), "\n",
"Please ensure your data frame contains these columns or set the",
" corresponding arguments:\n",
"delay = '", delay, "'\n",
"delay_upper = '", delay_upper, "'\n",
"n = '", n, "'\n",
"pwindow = '", pwindow, "'\n",
"relative_obs_time = '", relative_obs_time, "'"
)
}

stan_data <- list(
N = nrow(data),
d = data[[delay]],
d_upper = data[[delay_upper]],
n = data[[n]],
pwindow = data[[pwindow]],
D = data[[relative_obs_time]],
dist_id = dist_id,
primary_dist_id = primary_dist_id,
n_params = length(param_bounds$lower),
n_primary_params = length(primary_param_bounds$lower),
compute_log_lik = as.integer(compute_log_lik),
use_reduce_sum = as.integer(use_reduce_sum),
param_lower_bounds = param_bounds$lower,
param_upper_bounds = param_bounds$upper,
primary_param_lower_bounds = primary_param_bounds$lower,
primary_param_upper_bounds = primary_param_bounds$upper,
prior_location = priors$location,
prior_scale = priors$scale,
primary_prior_location = primary_priors$location,
primary_prior_scale = primary_priors$scale
)

return(stan_data)
}
6 changes: 6 additions & 0 deletions inst/WORDLIST
@@ -1,17 +1,22 @@
CMD
Charniga
CmdStan
CmdStanModel
Codecov
Dirichlet
Github
Gumbel
Inf
Lifecycle
PMF
PMFs
Poisson
SamuelBrand
athowes
cdot
cens
cmdstan
cmdstanr
dp
dprimary
dprimarycensoreddist
@@ -25,6 +30,7 @@ leq
lfloor
modelhelpers
modeling
obs
pcd
pdist
pprimarycensoreddist
96 changes: 96 additions & 0 deletions inst/pcens_model.stan
@@ -0,0 +1,96 @@
functions {
#include primary_censored_dist.stan
#include expgrowth.stan

real partial_sum(array[] int dummy, int start, int end,
array[] int d, array[] int d_upper, array[] int n,
array[] int pwindow, array[] int D,
int dist_id, array[] real params,
int primary_dist_id, array[] real primary_params) {
real partial_target = 0;
for (i in start:end) {
partial_target += n[i] * primary_censored_dist_lpmf(
d[i] | dist_id, params,
pwindow[i], d_upper[i], D[i],
primary_dist_id, primary_params
);
}
return partial_target;
}
}

data {
int<lower=0> N; // number of observations
array[N] int<lower=0> d; // observed delays
array[N] int<lower=0> d_upper; // observed delays upper bound
array[N] int<lower=0> n; // number of occurrences for each delay
array[N] int<lower=0> pwindow; // primary censoring window
array[N] int<lower=0> D; // maximum delay
int<lower=1, upper=17> dist_id; // distribution identifier
int<lower=1, upper=2> primary_dist_id; // primary distribution identifier
int<lower=0> n_params; // number of distribution parameters
int<lower=0> n_primary_params; // number of primary distribution parameters
int<lower=0, upper=1> compute_log_lik; // whether to compute log likelihood
int<lower=0, upper=1> use_reduce_sum; // whether to use reduce_sum
vector[n_params] param_lower_bounds; // lower bounds for parameters
vector[n_params] param_upper_bounds; // upper bounds for parameters
vector[n_primary_params] primary_param_lower_bounds; // lower bounds for primary parameters
vector[n_primary_params] primary_param_upper_bounds; // upper bounds for primary parameters
array[n_params] real prior_location; // location parameters for priors
array[n_params] real prior_scale; // scale parameters for priors
array[n_primary_params] real primary_prior_location; // location parameters for primary priors
array[n_primary_params] real primary_prior_scale; // scale parameters for primary priors
}

transformed data {
array[N] int indexes = linspaced_int_array(N, 1, N);
}

parameters {
vector<lower=param_lower_bounds, upper=param_upper_bounds>[n_params] params; // distribution parameters
vector<lower=primary_param_lower_bounds, upper=primary_param_upper_bounds>[n_primary_params] primary_params; // primary distribution parameters
}


model {
// Priors
for (i in 1:n_params) {
params[i] ~ normal(prior_location[i], prior_scale[i]);
}
if (n_primary_params) {
for (i in 1:n_primary_params) {
primary_params[i] ~ normal(
primary_prior_location[i],
primary_prior_scale[i]
);
}
}

// Likelihood
if (use_reduce_sum) {
target += reduce_sum(partial_sum, indexes, 1, d,
d_upper, n, pwindow, D, dist_id, to_array_1d(params),
primary_dist_id, to_array_1d(primary_params));
} else {
for (i in 1:N) {
target += n[i] * primary_censored_dist_lpmf(
d[i] | dist_id, to_array_1d(params),
pwindow[i], d_upper[i], D[i],
primary_dist_id, to_array_1d(primary_params)
);
}
}
}

generated quantities {
vector[compute_log_lik ? N : 0] log_lik;
if (compute_log_lik) {
for (i in 1:N) {
log_lik[i] = primary_censored_dist_lpmf(
d[i] | dist_id, to_array_1d(params),
pwindow[i], d_upper[i], D[i],
primary_dist_id, to_array_1d(primary_params)
);
}
}
}
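When `use_reduce_sum` is set, the likelihood is evaluated through the `partial_sum` helper above, so chains can use within-chain threading. A sketch of enabling this from R using standard `cmdstanr` options (`cpp_options`, `threads_per_chain`), assuming `stan_data` was prepared with `pcd_as_cmdstan_data()`; the thread count is illustrative:

```r
# Compile the packaged model with threading support
model <- pcd_cmdstan_model(cpp_options = list(stan_threads = TRUE))

# Switch the model's likelihood to the reduce_sum branch
stan_data$use_reduce_sum <- 1L

# Give each chain multiple threads for within-chain parallelism
fit <- model$sample(
  data = stan_data,
  threads_per_chain = 2
)
```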
5 changes: 5 additions & 0 deletions man/fitdistdoublecens.Rd
