From 5b4e6cc72af335bbc7d7beb09c92136420d707cd Mon Sep 17 00:00:00 2001 From: Sam Date: Fri, 4 Oct 2024 17:23:21 +0100 Subject: [PATCH 1/5] add check_truncation --- NAMESPACE | 1 + R/check.R | 61 ++++++++++++++++++++++++++ man/check_dprimary.Rd | 3 +- man/check_pdist.Rd | 3 +- man/check_truncation.Rd | 34 ++++++++++++++ tests/testthat/test-check_truncation.R | 57 ++++++++++++++++++++++++ 6 files changed, 157 insertions(+), 2 deletions(-) create mode 100644 man/check_truncation.Rd create mode 100644 tests/testthat/test-check_truncation.R diff --git a/NAMESPACE b/NAMESPACE index 93b5927..efaeba6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,6 +5,7 @@ S3method(primary_censored_cdf,pcens_pgamma_dunif) S3method(primary_censored_cdf,pcens_plnorm_dunif) export(check_dprimary) export(check_pdist) +export(check_truncation) export(dexpgrowth) export(dpcens) export(dprimarycensoreddist) diff --git a/R/check.R b/R/check.R index 116e3ad..2089477 100644 --- a/R/check.R +++ b/R/check.R @@ -74,3 +74,64 @@ check_dprimary <- function(dprimary, pwindow, dprimary_args = list(), } return(invisible(NULL)) } + +#' Check if truncation time is appropriate relative to the maximum delay +#' +#' This function checks if the truncation time D is appropriate relative to the +#' maximum delay. If D is much larger than necessary, it suggests +#' considering setting it to `Inf` for better efficiency with minimal accuracy +#' cost. +#' +#' @param delays A numeric vector of delay times +#' +#' @param D The truncation time +#' +#' @param multiplier The multiplier for the maximum delay to compare with D. +#' Default is 2. +#' +#' @return Invisible NULL. Prints a message if the condition is met. +#' +#' @export +#' @family check +#' +#' @examples +#' check_truncation(delays = c(1, 2, 3, 4), D = 10, multiplier = 2) +check_truncation <- function(delays, D, multiplier = 2) { + if (!is.numeric(delays) || !is.numeric(D) || !is.numeric(multiplier)) { + stop("All arguments must be numeric.") + } + + if (D <= 0 || multiplier <= 1) { + stop( + "Invalid argument values. D must be positive and multiplier must be ", + "greater than 1." + ) + } + + # Remove NA and infinite values + delays <- delays[is.finite(delays)] + + if (length(delays) == 0) { + warning("No finite observed delays to check.") + return(invisible(NULL)) + } + + max_delay <- max(delays) + expected_D <- max_delay * multiplier + + # Check if D is much larger than expected + if (D > expected_D) { + message( + sprintf( + paste0( + "The truncation time D (%g) is larger than %g times the maximum ", + "observed delay (%g). Consider setting D to Inf for better ", + "efficiency with minimal accuracy cost for this case." + ), + D, multiplier, max_delay + ) + ) + } + + invisible(NULL) +} diff --git a/man/check_dprimary.Rd b/man/check_dprimary.Rd index 935b9eb..f950510 100644 --- a/man/check_dprimary.Rd +++ b/man/check_dprimary.Rd @@ -38,6 +38,7 @@ check_dprimary(dunif, pwindow = 1) } \seealso{ Distribution checking functions -\code{\link{check_pdist}()} +\code{\link{check_pdist}()}, +\code{\link{check_truncation}()} } \concept{check} diff --git a/man/check_pdist.Rd b/man/check_pdist.Rd index 903a086..d1fed7b 100644 --- a/man/check_pdist.Rd +++ b/man/check_pdist.Rd @@ -27,6 +27,7 @@ check_pdist(pnorm, D = 10) } \seealso{ Distribution checking functions -\code{\link{check_dprimary}()} +\code{\link{check_dprimary}()}, +\code{\link{check_truncation}()} } \concept{check} diff --git a/man/check_truncation.Rd b/man/check_truncation.Rd new file mode 100644 index 0000000..86eef23 --- /dev/null +++ b/man/check_truncation.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check.R +\name{check_truncation} +\alias{check_truncation} +\title{Check if truncation time is appropriate relative to the maximum delay} +\usage{ +check_truncation(delays, D, multiplier = 2) +} +\arguments{ +\item{delays}{A numeric vector of delay times} + +\item{D}{The truncation time} + +\item{multiplier}{The multiplier for the maximum delay to compare with D. +Default is 2.} +} +\value{ +Invisible NULL. Prints a message if the condition is met. +} +\description{ +This function checks if the truncation time D is appropriate relative to the +maximum delay. If D is much larger than necessary, it suggests +considering setting it to \code{Inf} for better efficiency with minimal accuracy +cost. +} +\examples{ +check_truncation(delays = c(1, 2, 3, 4), D = 10, multiplier = 2) +} +\seealso{ +Distribution checking functions +\code{\link{check_dprimary}()}, +\code{\link{check_pdist}()} +} +\concept{check} diff --git a/tests/testthat/test-check_truncation.R b/tests/testthat/test-check_truncation.R new file mode 100644 index 0000000..f1914ff --- /dev/null +++ b/tests/testthat/test-check_truncation.R @@ -0,0 +1,57 @@ +test_that("check_truncation is silent when D is appropriate", { + expect_silent( + check_truncation(delays = c(1, 2, 3, 4), D = 10, multiplier = 2) + ) +}) + +test_that("check_truncation gives a message when D is too large", { + expect_message( + check_truncation(delays = c(1, 2, 3, 4), D = 20, multiplier = 2), + paste0( + "The truncation time D (20) is larger than 2 times ", + "the maximum observed delay (4)." + ) + ) +}) + +test_that("check_truncation warns for empty delays vector", { + expect_warning( + check_truncation(delays = numeric(0), D = 10, multiplier = 2), + "No finite observed delays to check." + ) +}) + +test_that("check_truncation handles delays with NA and Inf values", { + expect_silent( + check_truncation( + delays = c(1, 2, NA, 4, Inf), D = 10, multiplier = 2 + ) + ) +}) + +test_that("check_truncation errors on non-numeric delays", { + expect_error( + check_truncation(delays = "not numeric", D = 10, multiplier = 2), + "All arguments must be numeric." + ) +}) + +test_that("check_truncation errors on negative D", { + expect_error( + check_truncation(delays = c(1, 2, 3, 4), D = -1, multiplier = 2), + paste0( + "Invalid argument values. D must be positive and multiplier ", + "must be greater than 1." + ) + ) +}) + +test_that("check_truncation errors on multiplier <= 1", { + expect_error( + check_truncation(delays = c(1, 2, 3, 4), D = 10, multiplier = 0.5), + paste0( + "Invalid argument values. D must be positive and multiplier ", + "must be greater than 1." + ) + ) +}) From c4f8cf741a47f660c393b4c620101fddd5996bff Mon Sep 17 00:00:00 2001 From: Sam Date: Fri, 4 Oct 2024 17:36:11 +0100 Subject: [PATCH 2/5] add unit tests --- R/check.R | 8 ++++++-- .../{test-check.R => test-check_dprimary.R} | 17 ----------------- tests/testthat/test-check_pdist.R | 16 ++++++++++++++++ tests/testthat/test-check_truncation.R | 13 ++++++++++--- 4 files changed, 32 insertions(+), 22 deletions(-) rename tests/testthat/{test-check.R => test-check_dprimary.R} (64%) create mode 100644 tests/testthat/test-check_pdist.R diff --git a/R/check.R b/R/check.R index 2089477..7ee5b69 100644 --- a/R/check.R +++ b/R/check.R @@ -108,8 +108,12 @@ check_truncation <- function(delays, D, multiplier = 2) { ) } - # Remove NA and infinite values - delays <- delays[is.finite(delays)] + if (is.infinite(D)) { + return(invisible(NULL)) + } + + # Remove NA + delays <- delays[!is.na(delays)] if (length(delays) == 0) { warning("No finite observed delays to check.") diff --git a/tests/testthat/test-check.R b/tests/testthat/test-check_dprimary.R similarity index 64% rename from tests/testthat/test-check.R rename to tests/testthat/test-check_dprimary.R index 98f1704..c97da8d 100644 --- a/tests/testthat/test-check.R +++ b/tests/testthat/test-check_dprimary.R @@ -1,20 +1,3 @@ -test_that("check_pdist works correctly for valid CDF", { - expect_silent(check_pdist(pnorm, D = 10)) - expect_silent(check_pdist(punif, D = 1)) -}) - -test_that("check_pdist throws error for invalid CDF", { - invalid_cdf <- function(x, ...) x^2 - 1 # Not bounded between 0 and 1 - expect_error( - check_pdist(invalid_cdf, D = 10), - "pdist is not a valid cumulative distribution function" - ) -}) - -test_that("check_pdist handles infinite D correctly", { - expect_silent(check_pdist(pnorm, D = Inf)) -}) - test_that("check_dprimary works correctly for valid PDF", { expect_silent(check_dprimary(dunif, pwindow = 1)) }) diff --git a/tests/testthat/test-check_pdist.R b/tests/testthat/test-check_pdist.R new file mode 100644 index 0000000..ad53dad --- /dev/null +++ b/tests/testthat/test-check_pdist.R @@ -0,0 +1,16 @@ +test_that("check_pdist works correctly for valid CDF", { + expect_silent(check_pdist(pnorm, D = 10)) + expect_silent(check_pdist(punif, D = 1)) +}) + +test_that("check_pdist throws error for invalid CDF", { + invalid_cdf <- function(x, ...) x^2 - 1 # Not bounded between 0 and 1 + expect_error( + check_pdist(invalid_cdf, D = 10), + "pdist is not a valid cumulative distribution function" + ) +}) + +test_that("check_pdist handles infinite D correctly", { + expect_silent(check_pdist(pnorm, D = Inf)) +}) diff --git a/tests/testthat/test-check_truncation.R b/tests/testthat/test-check_truncation.R index f1914ff..c332558 100644 --- a/tests/testthat/test-check_truncation.R +++ b/tests/testthat/test-check_truncation.R @@ -1,6 +1,6 @@ test_that("check_truncation is silent when D is appropriate", { expect_silent( - check_truncation(delays = c(1, 2, 3, 4), D = 10, multiplier = 2) + check_truncation(delays = c(1, 2, 3, 4), D = 5, multiplier = 2) ) }) @@ -8,8 +8,9 @@ test_that("check_truncation gives a message when D is too large", { expect_message( check_truncation(delays = c(1, 2, 3, 4), D = 20, multiplier = 2), paste0( - "The truncation time D (20) is larger than 2 times ", - "the maximum observed delay (4)." + "The truncation time D \\(20\\) is larger than 2 times the maximum ", # nolint + "observed delay \\(4\\). Consider setting D to Inf for better ", # nolint + "efficiency with minimal accuracy cost for this case." ) ) }) @@ -55,3 +56,9 @@ test_that("check_truncation errors on multiplier <= 1", { ) ) }) + +test_that("check_truncation is silent when D is infinite", { + expect_silent( + check_truncation(delays = c(1, 2, 3, 4), D = Inf, multiplier = 2) + ) +}) From 50a23c0575a8908a82aef97cc21b53e8c3b6a6bd Mon Sep 17 00:00:00 2001 From: Sam Date: Fri, 4 Oct 2024 18:06:34 +0100 Subject: [PATCH 3/5] add truncation checking to model functions --- R/fitdistdoublecens.R | 16 +++++++++- R/pcd_cmdstan_model.R | 64 +++++++++++++++++++++++--------------- man/fitdistdoublecens.Rd | 5 +++ man/pcd_as_cmdstan_data.Rd | 19 +++++------ man/pcd_cmdstan_model.Rd | 14 ++++++++- 5 files changed, 82 insertions(+), 36 deletions(-) diff --git a/R/fitdistdoublecens.R b/R/fitdistdoublecens.R index 77e276c..b1fa01e 100644 --- a/R/fitdistdoublecens.R +++ b/R/fitdistdoublecens.R @@ -20,6 +20,10 @@ #' #' @param ... Additional arguments to be passed to [fitdistrplus::fitdist()]. #' +#' @param truncation_check_multiplier Numeric multiplier to use for checking +#' if the truncation time D is appropriate relative to the maximum delay. +#' Set to NULL to skip the check. Default is 2. +#' #' @return An object of class "fitdist" as returned by fitdistrplus::fitdist. #' #' @export @@ -56,7 +60,9 @@ fitdistdoublecens <- function(censdata, distr, pwindow = 1, D = Inf, dprimary = stats::dunif, dprimary_name = NULL, - dprimary_args = list(), ...) { + dprimary_args = list(), + truncation_check_multiplier = 2, + ...) { # Check if fitdistrplus is available if (!requireNamespace("fitdistrplus", quietly = TRUE)) { stop( @@ -78,6 +84,14 @@ fitdistdoublecens <- function(censdata, distr, dprimary_name <- .extract_function_name(dprimary) } + if (!is.null(truncation_check_multiplier)) { + check_truncation( + delays = censdata$left, + D = D, + multiplier = truncation_check_multiplier + ) + } + # Get the distribution functions pdist_name <- paste0("p", distr) pdist <- get(pdist_name) diff --git a/R/pcd_cmdstan_model.R b/R/pcd_cmdstan_model.R index 7245f33..6cae4f3 100644 --- a/R/pcd_cmdstan_model.R +++ b/R/pcd_cmdstan_model.R @@ -6,7 +6,11 @@ #' #' @param include_paths Character vector of paths to include for Stan #' compilation. Defaults to the result of `pcd_stan_path()`. -#' +#' @param check_truncation_multiplier Numeric multiplier to use for checking +#' if the truncation time D is appropriate relative to the maximum delay. +#' Set to NULL to skip the check. Default is 2. +#' @param delays Optional numeric vector of observed delays to use for checking +#' the truncation time. Only used if `check_truncation_multiplier` is not NULL. #' @param ... Additional arguments passed to cmdstanr::cmdstan_model(). #' #' @return A CmdStanModel object. @@ -30,7 +34,10 @@ #' fit <- model$sample(data = stan_data) #' } pcd_cmdstan_model <- function( - include_paths = primarycensoreddist::pcd_stan_path(), ...) { + include_paths = primarycensoreddist::pcd_stan_path(), + check_truncation_multiplier = 2, + delays = NULL, + ...) { if (!requireNamespace("cmdstanr", quietly = TRUE)) { stop("Package 'cmdstanr' is required but not installed for this function.") } @@ -40,6 +47,14 @@ pcd_cmdstan_model <- function( package = "primarycensoreddist" ) + if (!is.null(check_truncation_multiplier) && !is.null(delays)) { + check_truncation( + delays = delays, + D = stan_data$D, + multiplier = check_truncation_multiplier + ) + } + cmdstanr::cmdstan_model( pcd_stan_model, include_paths = include_paths, @@ -53,50 +68,36 @@ pcd_cmdstan_model <- function( #' primarycensoreddist Stan model. #' #' @param data A data frame containing the delay data. -#' #' @param delay Column name for observed delays (default: "delay") -#' -#' @param delay_upper Column name for upper bound of delays -#' (default: "delay_upper") -#' +#' @param delay_upper Column name for upper bound of delays (default: "delay_upper") #' @param n Column name for count of observations (default: "n") -#' #' @param pwindow Column name for primary window (default: "pwindow") -#' -#' @param relative_obs_time Column name for relative observation time -#' (default: "relative_obs_time") -#' +#' @param relative_obs_time Column name for relative observation time (default: "relative_obs_time") #' @param dist_id Integer identifying the delay distribution: #' 1 = Lognormal, 2 = Gamma, 3 = Weibull, 4 = Exponential, #' 5 = Generalized Gamma, 6 = Negative Binomial, 7 = Poisson, #' 8 = Bernoulli, 9 = Beta, 10 = Binomial, 11 = Categorical, 12 = Cauchy, #' 13 = Chi-square, 14 = Dirichlet, 15 = Gumbel, 16 = Inverse Gamma, #' 17 = Logistic -#' #' @param primary_dist_id Integer identifying the primary distribution: #' 1 = Uniform, 2 = Exponential growth -#' #' @param param_bounds A list with elements `lower` and `upper`, each a numeric #' vector specifying bounds for the delay distribution parameters. -#' #' @param primary_param_bounds A list with elements `lower` and `upper`, each a #' numeric vector specifying bounds for the primary distribution parameters. -#' #' @param priors A list with elements `location` and `scale`, each a numeric #' vector specifying priors for the delay distribution parameters. -#' #' @param primary_priors A list with elements `location` and `scale`, each a #' numeric vector specifying priors for the primary distribution parameters. -#' #' @param compute_log_lik Logical; compute log likelihood? (default: FALSE) +#' @param use_reduce_sum Logical; use reduce_sum for performance? (default: FALSE) +#' @param truncation_check_multiplier Numeric multiplier to use for checking +#' if the truncation time D is appropriate relative to the maximum delay +#' for each unique D value. Set to NULL to skip the check. Default is 2. #' -#' @param use_reduce_sum Logical; use reduce_sum for performance? -#' (default: FALSE) -#' -#' @return A list containing the data formatted for use with -#' [pcd_cmdstan_model()] -#' +#' @return A list containing the data formatted for use with [pcd_cmdstan_model()] #' @export +#' #' @family modelhelpers #' #' @examples @@ -126,7 +127,8 @@ pcd_as_cmdstan_data <- function( param_bounds, primary_param_bounds, priors, primary_priors, compute_log_lik = FALSE, - use_reduce_sum = FALSE) { + use_reduce_sum = FALSE, + truncation_check_multiplier = 2) { required_cols <- c(delay, delay_upper, n, pwindow, relative_obs_time) missing_cols <- setdiff(required_cols, names(data)) if (length(missing_cols) > 0) { @@ -142,6 +144,18 @@ pcd_as_cmdstan_data <- function( ) } + if (!is.null(truncation_check_multiplier)) { + unique_D <- unique(data[[relative_obs_time]]) + for (D in unique_D) { + delays_subset <- data[[delay]][data[[relative_obs_time]] == D] + check_truncation( + delays = delays_subset, + D = D, + multiplier = truncation_check_multiplier + ) + } + } + stan_data <- list( N = nrow(data), d = data[[delay]], diff --git a/man/fitdistdoublecens.Rd b/man/fitdistdoublecens.Rd index 48dad8a..690e446 100644 --- a/man/fitdistdoublecens.Rd +++ b/man/fitdistdoublecens.Rd @@ -12,6 +12,7 @@ fitdistdoublecens( dprimary = stats::dunif, dprimary_name = NULL, dprimary_args = list(), + truncation_check_multiplier = 2, ... ) } @@ -47,6 +48,10 @@ dprimary. For example, when using \code{dexpgrowth}, you would pass \code{list(min = 0, max = pwindow, r = 0.2)} to set the minimum, maximum, and rate parameters} +\item{truncation_check_multiplier}{Numeric multiplier to use for checking +if the truncation time D is appropriate relative to the maximum delay. +Set to NULL to skip the check. Default is 2.} + \item{...}{Additional arguments to be passed to \code{\link[fitdistrplus:fitdist]{fitdistrplus::fitdist()}}.} } \value{ diff --git a/man/pcd_as_cmdstan_data.Rd b/man/pcd_as_cmdstan_data.Rd index 4d70e71..6900981 100644 --- a/man/pcd_as_cmdstan_data.Rd +++ b/man/pcd_as_cmdstan_data.Rd @@ -18,7 +18,8 @@ pcd_as_cmdstan_data( priors, primary_priors, compute_log_lik = FALSE, - use_reduce_sum = FALSE + use_reduce_sum = FALSE, + truncation_check_multiplier = 2 ) } \arguments{ @@ -26,15 +27,13 @@ pcd_as_cmdstan_data( \item{delay}{Column name for observed delays (default: "delay")} -\item{delay_upper}{Column name for upper bound of delays -(default: "delay_upper")} +\item{delay_upper}{Column name for upper bound of delays (default: "delay_upper")} \item{n}{Column name for count of observations (default: "n")} \item{pwindow}{Column name for primary window (default: "pwindow")} -\item{relative_obs_time}{Column name for relative observation time -(default: "relative_obs_time")} +\item{relative_obs_time}{Column name for relative observation time (default: "relative_obs_time")} \item{dist_id}{Integer identifying the delay distribution: 1 = Lognormal, 2 = Gamma, 3 = Weibull, 4 = Exponential, @@ -60,12 +59,14 @@ numeric vector specifying priors for the primary distribution parameters.} \item{compute_log_lik}{Logical; compute log likelihood? (default: FALSE)} -\item{use_reduce_sum}{Logical; use reduce_sum for performance? -(default: FALSE)} +\item{use_reduce_sum}{Logical; use reduce_sum for performance? (default: FALSE)} + +\item{truncation_check_multiplier}{Numeric multiplier to use for checking +if the truncation time D is appropriate relative to the maximum delay +for each unique D value. Set to NULL to skip the check. Default is 2.} } \value{ -A list containing the data formatted for use with -\code{\link[=pcd_cmdstan_model]{pcd_cmdstan_model()}} +A list containing the data formatted for use with \code{\link[=pcd_cmdstan_model]{pcd_cmdstan_model()}} } \description{ This function takes in delay data and prepares it for use with the diff --git a/man/pcd_cmdstan_model.Rd b/man/pcd_cmdstan_model.Rd index c77aa57..c1e86bc 100644 --- a/man/pcd_cmdstan_model.Rd +++ b/man/pcd_cmdstan_model.Rd @@ -4,12 +4,24 @@ \alias{pcd_cmdstan_model} \title{Create a CmdStanModel with primarycensoreddist Stan functions} \usage{ -pcd_cmdstan_model(include_paths = primarycensoreddist::pcd_stan_path(), ...) +pcd_cmdstan_model( + include_paths = primarycensoreddist::pcd_stan_path(), + check_truncation_multiplier = 2, + delays = NULL, + ... +) } \arguments{ \item{include_paths}{Character vector of paths to include for Stan compilation. Defaults to the result of \code{pcd_stan_path()}.} +\item{check_truncation_multiplier}{Numeric multiplier to use for checking +if the truncation time D is appropriate relative to the maximum delay. +Set to NULL to skip the check. Default is 2.} + +\item{delays}{Optional numeric vector of observed delays to use for checking +the truncation time. Only used if \code{check_truncation_multiplier} is not NULL.} + \item{...}{Additional arguments passed to cmdstanr::cmdstan_model().} } \value{ From 1106729b914e7ffa8a591b4542f0b9a647592019 Mon Sep 17 00:00:00 2001 From: Sam Date: Fri, 4 Oct 2024 18:13:26 +0100 Subject: [PATCH 4/5] add news and rename --- NAMESPACE | 2 +- NEWS.md | 3 +- R/pcd_cmdstan_model.R | 48 ++++++++++--------- man/fitdistdoublecens.Rd | 2 +- ...as_cmdstan_data.Rd => pcd_as_stan_data.Rd} | 20 ++++---- man/pcd_cmdstan_model.Rd | 16 +------ tests/testthat/test-pcd_as_cmdstan_data.R | 16 +++---- tests/testthat/test-pcd_cmdstan_model.R | 8 ++-- touchstone/script.R | 4 +- vignettes/fitting-dists-with-stan.Rmd | 4 +- 10 files changed, 60 insertions(+), 63 deletions(-) rename man/{pcd_as_cmdstan_data.Rd => pcd_as_stan_data.Rd} (87%) diff --git a/NAMESPACE b/NAMESPACE index efaeba6..c19ecc3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -11,7 +11,7 @@ export(dpcens) export(dprimarycensoreddist) export(fitdistdoublecens) export(new_primary_censored_dist) -export(pcd_as_cmdstan_data) +export(pcd_as_stan_data) export(pcd_cmdstan_model) export(pcd_load_stan_functions) export(pcd_stan_files) diff --git a/NEWS.md b/NEWS.md index 0cfc9e2..7bea493 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,7 +6,8 @@ This is the development version of `primarycensoreddist` and is not yet ready fo * Split "Why it works" vignette into two separate vignettes, "Why it works" and "Analytic solutions for censored delay distributions". * Removed the need to assign functions to the global environment for `fitdistdoublecens()` by using `withr`. - +* Added a `check_truncation()` function to check if the truncation time is larger than the maximum observed delay. This is used in `fitdistdoublecens()` and `pcd_as_stan_data()` to ensure that the truncation time is appropriate to maximise computational efficiency. +* ## Documentation * Simplified the "Analytic solutions" vignette by removing verbose derivation details. diff --git a/R/pcd_cmdstan_model.R b/R/pcd_cmdstan_model.R index 6cae4f3..05906b1 100644 --- a/R/pcd_cmdstan_model.R +++ b/R/pcd_cmdstan_model.R @@ -6,11 +6,7 @@ #' #' @param include_paths Character vector of paths to include for Stan #' compilation. Defaults to the result of `pcd_stan_path()`. -#' @param check_truncation_multiplier Numeric multiplier to use for checking -#' if the truncation time D is appropriate relative to the maximum delay. -#' Set to NULL to skip the check. Default is 2. -#' @param delays Optional numeric vector of observed delays to use for checking -#' the truncation time. Only used if `check_truncation_multiplier` is not NULL. +#' #' @param ... Additional arguments passed to cmdstanr::cmdstan_model(). #' #' @return A CmdStanModel object. @@ -35,8 +31,6 @@ #' } pcd_cmdstan_model <- function( include_paths = primarycensoreddist::pcd_stan_path(), - check_truncation_multiplier = 2, - delays = NULL, ...) { if (!requireNamespace("cmdstanr", quietly = TRUE)) { stop("Package 'cmdstanr' is required but not installed for this function.") @@ -47,14 +41,6 @@ pcd_cmdstan_model <- function( package = "primarycensoreddist" ) - if (!is.null(check_truncation_multiplier) && !is.null(delays)) { - check_truncation( - delays = delays, - D = stan_data$D, - multiplier = check_truncation_multiplier - ) - } - cmdstanr::cmdstan_model( pcd_stan_model, include_paths = include_paths, @@ -68,36 +54,54 @@ pcd_cmdstan_model <- function( #' primarycensoreddist Stan model. #' #' @param data A data frame containing the delay data. +#' #' @param delay Column name for observed delays (default: "delay") -#' @param delay_upper Column name for upper bound of delays (default: "delay_upper") +#' +#' @param delay_upper Column name for upper bound of delays +#' (default: "delay_upper") +#' #' @param n Column name for count of observations (default: "n") +#' #' @param pwindow Column name for primary window (default: "pwindow") -#' @param relative_obs_time Column name for relative observation time (default: "relative_obs_time") +#' +#' @param relative_obs_time Column name for relative observation time +#' (default: "relative_obs_time") +#' #' @param dist_id Integer identifying the delay distribution: #' 1 = Lognormal, 2 = Gamma, 3 = Weibull, 4 = Exponential, #' 5 = Generalized Gamma, 6 = Negative Binomial, 7 = Poisson, #' 8 = Bernoulli, 9 = Beta, 10 = Binomial, 11 = Categorical, 12 = Cauchy, #' 13 = Chi-square, 14 = Dirichlet, 15 = Gumbel, 16 = Inverse Gamma, #' 17 = Logistic +#' #' @param primary_dist_id Integer identifying the primary distribution: #' 1 = Uniform, 2 = Exponential growth +#' #' @param param_bounds A list with elements `lower` and `upper`, each a numeric #' vector specifying bounds for the delay distribution parameters. +#' #' @param primary_param_bounds A list with elements `lower` and `upper`, each a #' numeric vector specifying bounds for the primary distribution parameters. +#' #' @param priors A list with elements `location` and `scale`, each a numeric #' vector specifying priors for the delay distribution parameters. +#' #' @param primary_priors A list with elements `location` and `scale`, each a #' numeric vector specifying priors for the primary distribution parameters. +#' #' @param compute_log_lik Logical; compute log likelihood? (default: FALSE) -#' @param use_reduce_sum Logical; use reduce_sum for performance? (default: FALSE) +#' +#' @param use_reduce_sum Logical; use reduce_sum for performance? +#' (default: FALSE) +#' #' @param truncation_check_multiplier Numeric multiplier to use for checking #' if the truncation time D is appropriate relative to the maximum delay #' for each unique D value. Set to NULL to skip the check. Default is 2. #' -#' @return A list containing the data formatted for use with [pcd_cmdstan_model()] -#' @export +#' @return A list containing the data formatted for use with +#' [pcd_cmdstan_model()] #' +#' @export #' @family modelhelpers #' #' @examples @@ -109,7 +113,7 @@ pcd_cmdstan_model <- function( #' pwindow = c(1, 1, 2), #' relative_obs_time = c(10, 10, 10) #' ) -#' stan_data <- pcd_as_cmdstan_data( +#' stan_data <- pcd_as_stan_data( #' data, #' dist_id = 1, #' primary_dist_id = 1, @@ -119,7 +123,7 @@ pcd_cmdstan_model <- function( #' primary_priors = list(location = numeric(0), scale = numeric(0)) #' ) #' } -pcd_as_cmdstan_data <- function( +pcd_as_stan_data <- function( data, delay = "delay", delay_upper = "delay_upper", n = "n", pwindow = "pwindow", relative_obs_time = "relative_obs_time", diff --git a/man/fitdistdoublecens.Rd b/man/fitdistdoublecens.Rd index 690e446..c09700d 100644 --- a/man/fitdistdoublecens.Rd +++ b/man/fitdistdoublecens.Rd @@ -98,7 +98,7 @@ summary(fit_norm) } \seealso{ Modelling wrappers for external fitting packages -\code{\link{pcd_as_cmdstan_data}()}, +\code{\link{pcd_as_stan_data}()}, \code{\link{pcd_cmdstan_model}()} } \concept{modelhelpers} diff --git a/man/pcd_as_cmdstan_data.Rd b/man/pcd_as_stan_data.Rd similarity index 87% rename from man/pcd_as_cmdstan_data.Rd rename to man/pcd_as_stan_data.Rd index 6900981..1180c29 100644 --- a/man/pcd_as_cmdstan_data.Rd +++ b/man/pcd_as_stan_data.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/pcd_cmdstan_model.R -\name{pcd_as_cmdstan_data} -\alias{pcd_as_cmdstan_data} +\name{pcd_as_stan_data} +\alias{pcd_as_stan_data} \title{Prepare data for primarycensoreddist Stan model} \usage{ -pcd_as_cmdstan_data( +pcd_as_stan_data( data, delay = "delay", delay_upper = "delay_upper", @@ -27,13 +27,15 @@ pcd_as_cmdstan_data( \item{delay}{Column name for observed delays (default: "delay")} -\item{delay_upper}{Column name for upper bound of delays (default: "delay_upper")} +\item{delay_upper}{Column name for upper bound of delays +(default: "delay_upper")} \item{n}{Column name for count of observations (default: "n")} \item{pwindow}{Column name for primary window (default: "pwindow")} -\item{relative_obs_time}{Column name for relative observation time (default: "relative_obs_time")} +\item{relative_obs_time}{Column name for relative observation time +(default: "relative_obs_time")} \item{dist_id}{Integer identifying the delay distribution: 1 = Lognormal, 2 = Gamma, 3 = Weibull, 4 = Exponential, @@ -59,14 +61,16 @@ numeric vector specifying priors for the primary distribution parameters.} \item{compute_log_lik}{Logical; compute log likelihood? (default: FALSE)} -\item{use_reduce_sum}{Logical; use reduce_sum for performance? (default: FALSE)} +\item{use_reduce_sum}{Logical; use reduce_sum for performance? +(default: FALSE)} \item{truncation_check_multiplier}{Numeric multiplier to use for checking if the truncation time D is appropriate relative to the maximum delay for each unique D value. Set to NULL to skip the check. Default is 2.} } \value{ -A list containing the data formatted for use with \code{\link[=pcd_cmdstan_model]{pcd_cmdstan_model()}} +A list containing the data formatted for use with +\code{\link[=pcd_cmdstan_model]{pcd_cmdstan_model()}} } \description{ This function takes in delay data and prepares it for use with the @@ -81,7 +85,7 @@ data <- data.frame( pwindow = c(1, 1, 2), relative_obs_time = c(10, 10, 10) ) -stan_data <- pcd_as_cmdstan_data( +stan_data <- pcd_as_stan_data( data, dist_id = 1, primary_dist_id = 1, diff --git a/man/pcd_cmdstan_model.Rd b/man/pcd_cmdstan_model.Rd index c1e86bc..152889c 100644 --- a/man/pcd_cmdstan_model.Rd +++ b/man/pcd_cmdstan_model.Rd @@ -4,24 +4,12 @@ \alias{pcd_cmdstan_model} \title{Create a CmdStanModel with primarycensoreddist Stan functions} \usage{ -pcd_cmdstan_model( - include_paths = primarycensoreddist::pcd_stan_path(), - check_truncation_multiplier = 2, - delays = NULL, - ... -) +pcd_cmdstan_model(include_paths = primarycensoreddist::pcd_stan_path(), ...) } \arguments{ \item{include_paths}{Character vector of paths to include for Stan compilation. Defaults to the result of \code{pcd_stan_path()}.} -\item{check_truncation_multiplier}{Numeric multiplier to use for checking -if the truncation time D is appropriate relative to the maximum delay. -Set to NULL to skip the check. Default is 2.} - -\item{delays}{Optional numeric vector of observed delays to use for checking -the truncation time. Only used if \code{check_truncation_multiplier} is not NULL.} - \item{...}{Additional arguments passed to cmdstanr::cmdstan_model().} } \value{ @@ -53,6 +41,6 @@ fit <- model$sample(data = stan_data) \seealso{ Modelling wrappers for external fitting packages \code{\link{fitdistdoublecens}()}, -\code{\link{pcd_as_cmdstan_data}()} +\code{\link{pcd_as_stan_data}()} } \concept{modelhelpers} diff --git a/tests/testthat/test-pcd_as_cmdstan_data.R b/tests/testthat/test-pcd_as_cmdstan_data.R index fe84f66..a086177 100644 --- a/tests/testthat/test-pcd_as_cmdstan_data.R +++ b/tests/testthat/test-pcd_as_cmdstan_data.R @@ -1,4 +1,4 @@ -test_that("pcd_as_cmdstan_data correctly formats data", { +test_that("pcd_as_stan_data correctly formats data", { data <- data.frame( delay = c(1, 2, 3), delay_upper = c(2, 3, 4), @@ -14,7 +14,7 @@ test_that("pcd_as_cmdstan_data correctly formats data", { priors <- list(location = c(1, 1), scale = c(1, 1)) primary_priors <- list(location = numeric(0), scale = numeric(0)) - result <- pcd_as_cmdstan_data( + result <- pcd_as_stan_data( data, dist_id = dist_id, primary_dist_id = primary_dist_id, @@ -51,7 +51,7 @@ test_that("pcd_as_cmdstan_data correctly formats data", { expect_identical(result$primary_prior_scale, primary_priors$scale) }) -test_that("pcd_as_cmdstan_data handles missing columns correctly", { +test_that("pcd_as_stan_data handles missing columns correctly", { data <- data.frame( delay = c(1, 2, 3), n = c(10, 20, 15), @@ -59,7 +59,7 @@ test_that("pcd_as_cmdstan_data handles missing columns correctly", { ) expect_error( - pcd_as_cmdstan_data( + pcd_as_stan_data( data, dist_id = 1, primary_dist_id = 1, @@ -72,7 +72,7 @@ test_that("pcd_as_cmdstan_data handles missing columns correctly", { ) }) -test_that("pcd_as_cmdstan_data handles optional parameters correctly", { +test_that("pcd_as_stan_data handles optional parameters correctly", { data <- data.frame( delay = c(1, 2, 3), delay_upper = c(2, 3, 4), @@ -81,7 +81,7 @@ test_that("pcd_as_cmdstan_data handles optional parameters correctly", { relative_obs_time = c(10, 10, 10) ) - result <- pcd_as_cmdstan_data( + result <- pcd_as_stan_data( data, dist_id = 1, primary_dist_id = 1, @@ -97,7 +97,7 @@ test_that("pcd_as_cmdstan_data handles optional parameters correctly", { expect_identical(result$use_reduce_sum, 1L) }) -test_that("pcd_as_cmdstan_data handles custom column names correctly", { +test_that("pcd_as_stan_data handles custom column names correctly", { data <- data.frame( obs_delay = c(1, 2, 3), obs_delay_upper = c(2, 3, 4), @@ -106,7 +106,7 @@ test_that("pcd_as_cmdstan_data handles custom column names correctly", { obs_time = c(10, 10, 10) ) - result <- pcd_as_cmdstan_data( + result <- pcd_as_stan_data( data, delay = "obs_delay", delay_upper = "obs_delay_upper", diff --git a/tests/testthat/test-pcd_cmdstan_model.R b/tests/testthat/test-pcd_cmdstan_model.R index e68c665..3f2b492 100644 --- a/tests/testthat/test-pcd_cmdstan_model.R +++ b/tests/testthat/test-pcd_cmdstan_model.R @@ -64,7 +64,7 @@ test_that("pcd_cmdstan_model recovers true values for simple lognormal data", { ) # Prepare data for Stan - stan_data <- pcd_as_cmdstan_data( + stan_data <- pcd_as_stan_data( delay_counts, dist_id = 1, # Lognormal primary_dist_id = 1, # Uniform @@ -139,7 +139,7 @@ test_that( ) # Prepare data for Stan - stan_data <- pcd_as_cmdstan_data( + stan_data <- pcd_as_stan_data( delay_counts, dist_id = 2, # Gamma primary_dist_id = 2, # Exponential growth @@ -218,7 +218,7 @@ test_that( ) # Prepare data for Stan - stan_data <- pcd_as_cmdstan_data( + stan_data <- pcd_as_stan_data( delay_counts, dist_id = 1, # Lognormal primary_dist_id = 1, # Uniform @@ -299,7 +299,7 @@ test_that( ) # Prepare data for Stan - stan_data <- pcd_as_cmdstan_data( + stan_data <- pcd_as_stan_data( delay_counts, dist_id = 1, # Lognormal primary_dist_id = 1, # Uniform diff --git a/touchstone/script.R b/touchstone/script.R index f1fea3b..61d3d0d 100644 --- a/touchstone/script.R +++ b/touchstone/script.R @@ -167,7 +167,7 @@ touchstone::benchmark_run( .by = c(pwindow, relative_obs_time, delay, delay_upper) ) - stan_data1 <- pcd_as_cmdstan_data( + stan_data1 <- pcd_as_stan_data( delay_counts1, dist_id = 1, primary_dist_id = 1, @@ -234,7 +234,7 @@ touchstone::benchmark_run( .by = c(pwindow, relative_obs_time, delay, delay_upper) ) - stan_data2 <- pcd_as_cmdstan_data( + stan_data2 <- pcd_as_stan_data( delay_counts2, dist_id = 2, primary_dist_id = 2, diff --git a/vignettes/fitting-dists-with-stan.Rmd b/vignettes/fitting-dists-with-stan.Rmd index f53875b..eaee40d 100644 --- a/vignettes/fitting-dists-with-stan.Rmd +++ b/vignettes/fitting-dists-with-stan.Rmd @@ -277,7 +277,7 @@ We see that the model has converged and the diagnostics look good. We also see t # Using `pcd_cmdstan_model()` for a more efficient approach -While the previous approach works well, `primarycensoreddist` provides a more efficient and convenient model which we can compile using `pcd_cmdstan_model()`. This approach not only saves time in model specification but also leverages within chain parallelisation to make best use of your machine's resources. Alongside this we also supply a convenience function `pcd_as_cmdstan_data()` to convert our data into a format that can be used to fit the model and supply priors, bounds, and other settings. +While the previous approach works well, `primarycensoreddist` provides a more efficient and convenient model which we can compile using `pcd_cmdstan_model()`. This approach not only saves time in model specification but also leverages within chain parallelisation to make best use of your machine's resources. Alongside this we also supply a convenience function `pcd_as_stan_data()` to convert our data into a format that can be used to fit the model and supply priors, bounds, and other settings. Let's use this function to fit our data: @@ -285,7 +285,7 @@ Let's use this function to fit our data: # Compile the model with multithreading support pcd_model <- pcd_cmdstan_model(cpp_options = list(stan_threads = TRUE)) -pcd_data <- pcd_as_cmdstan_data( +pcd_data <- pcd_as_stan_data( delay_counts, delay = "observed_delay", delay_upper = "observed_delay_upper", From 62762c036181b9b8763af65578d7df812370320d Mon Sep 17 00:00:00 2001 From: Sam Date: Fri, 4 Oct 2024 18:18:06 +0100 Subject: [PATCH 5/5] include new news item --- NEWS.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 7bea493..d1bbfbb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -7,7 +7,8 @@ This is the development version of `primarycensoreddist` and is not yet ready fo * Split "Why it works" vignette into two separate vignettes, "Why it works" and "Analytic solutions for censored delay distributions". * Removed the need to assign functions to the global environment for `fitdistdoublecens()` by using `withr`. * Added a `check_truncation()` function to check if the truncation time is larger than the maximum observed delay. This is used in `fitdistdoublecens()` and `pcd_as_stan_data()` to ensure that the truncation time is appropriate to maximise computational efficiency. -* +* `pcd_as_cmdstan_data()` has been renamed to `pcd_as_stan_data()` to better reflect that it is used for `Stan` models in general rather than just the `CmdStan` models. + ## Documentation * Simplified the "Analytic solutions" vignette by removing verbose derivation details.