Skip to content

Commit

Permalink
Handle messaging around model treatment of missingness (#774)
Browse files Browse the repository at this point in the history
* Add message about NA treatment for option "missing"

* Add missing tests for obs_opts

* Add NEWS item

* Improve messages

Co-authored-by: Sebastian Funk <[email protected]>

* Check if na is explicitly specified by user

Co-authored-by: Sebastian Funk <[email protected]>

* Change less regular to non-daily

* Create function to test for complete data

* Add tests for test_data_complete

* Document test_data_complete

* Add function to crosscheck na as missing setting with data

* Move message elsewhere and only check if na is specified for later use

* Add new global

* Apply na cross checking function in main functions

* Rename dirty data

* Add test for na cross checking function

* Add more tests to obs_opts

* Remove global variable

* Use alternative syntax to avoid linting issues

* copy before modifying data.table

* Fix test

* Update NEWS.md

* Make message/warning verbose and test for message/warning

* Simplify comment

* Use vapply for type safety

* Add lifecycle badge

* Add lifecycle badge

* Improve docs

* Return unmodified obs_opts

* Move removal of element of obs_opts() into checker function

* Use setequal instead of checking lengths

* Remove new unnecessary new lines

* Remove trailing white space

* Add test to check for expected element introduced

* Add new argument to pass column names

* Add error for column mismatch

* Refactor for readability

* Perform missingness check on the raw data

* Remove test that no longer applies

* Add reviewer and PR number

Co-authored-by: Sebastian Funk <[email protected]>

---------

Co-authored-by: Sebastian Funk <[email protected]>
  • Loading branch information
jamesmbaazam and sbfnk authored Oct 16, 2024
1 parent 259b150 commit f5927c4
Show file tree
Hide file tree
Showing 9 changed files with 290 additions and 7 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ A release that introduces model improvements to the Gaussian Process models, alo
- Switch to broadcasting the day of the week effect. By @seabbs in #746 and reviewed by @jamesmbaazam.
- A warning is now thrown if nonparametric PMFs passed to delay options have consecutive tail values that are below a certain low threshold as these lead to loss in speed with little gain in accuracy. By @jamesmbaazam in #752 and reviewed by @seabbs, with a subsequent bug fix in #802.
- `dist_fit()` can now accept any number of `samples` without throwing a warning when `samples` < 1000 in #751 by @jamesmbaazam and reviewed by @seabbs and @sbfnk.
- `obs_opts()` now informs users about how NA observations are treated to help them decide on existing alternatives. By @jamesmbaazam in #774 and reviewed by @sbfnk.
- Users are now informed that `NA` observations will be treated as missing instead of zero when using the default `obs_opts()`. Options to treat `NA` as zeros or accumulate them are also provided. By @jamesmbaazam in #774 and reviewed by @sbfnk.

## Package changes

Expand Down
90 changes: 90 additions & 0 deletions R/checks.R
Original file line number Diff line number Diff line change
Expand Up @@ -179,3 +179,93 @@ check_sparse_pmf_tail <- function(pmf, span = 5, tol = 1e-6) {
)
}
}

#' Check if data has either explicit NA values or implicit missing dates.
#'
#' @param data The data to be checked
#' @param cols_to_check A character vector of the columns to check
#' @return `TRUE` if data is complete, else if data has implicit or explicit
#' missingness, `FALSE`.
#' @importFrom cli cli_abort col_blue
#' @keywords internal
test_data_complete <- function(data, cols_to_check) {
data <- setDT(data) # Convert data to data.table

if (!all(cols_to_check %in% names(data))) {
cli_abort(
c(
"x" = "{.var cols_to_check} must be present in the data.",
"i" = "{.var data} has columns: {col_blue(names(data))} but you
specified {.var cols_to_expect}: {col_blue(cols_to_check)}."
)
)
}
# Check for explicit missingness in the specified columns
data_has_explicit_na <- any(
vapply(data[, cols_to_check, with = FALSE], anyNA, logical(1))
)
if (data_has_explicit_na) {
return(FALSE)
}

# Check for implicit missingness by comparing the expected full date sequence
complete_dates <- seq(
min(data$date, na.rm = TRUE),
max(data$date, na.rm = TRUE),
by = "1 day"
)
data_has_implicit_na <- !all(complete_dates %in% data$date)
if (data_has_implicit_na) {
return(FALSE)
}

return(TRUE) # Return TRUE if no missing values or gaps in date sequence
}

#' Cross-check treatment of `NA` in obs_opts() against input data
#'
#' @description `r lifecycle::badge("experimental")`
#'
#' This function checks the input data for implicit and/or explicit missingness
#' and checks if the user specified `na = "missing"` in [obs_opts()].
#' If the two are TRUE, it throws a message about how the model treats
#' missingness and provides alternatives. It returns an unmodified [obs_opts()].
#'
#' This function is necessary because the data and observation model
#' do not currently interact internally. It will be deprecated in future
#' versions when the data specification interface is enhanced.
#'
#' @param obs A call to [obs_opts()]
#' @param data The raw data
#' @inheritParams test_data_complete
#' @importFrom cli cli_inform col_red
#'
#' @return [obs_opts()]
#' @keywords internal
check_na_setting_against_data <- function(data, cols_to_check, obs) {
# If users are using the default treatment of NA's and their data has
# implicit or explicit NA's, inform them of what's happening and alternatives
data_is_complete <- test_data_complete(data, cols_to_check)
if (!obs$accumulate &&
obs$na_as_missing_default_used &&
!data_is_complete) {
#nolint start: duplicate_argument_linter
cli_inform(
c(
"i" = "{col_red(\"As of version 1.5.0 missing dates or dates with `NA`
cases are treated as missing. This is in contrast to previous versions
where these were interpreted as dates with zero cases. \")}",
"i" = "In order to treat missing or `NA` cases as zeroes, see
solutions in {.url https://github.com/epiforecasts/EpiNow2/issues/767#issuecomment-2348805272}", #nolint
"i" = "If the data is reported at non-daily intervals (for example
weekly), consider using `obs_opts(na=\"accumulate\")`.",
"i" = "For more information on these options, see `?obs_opts`."
),
.frequency = "regularly",
.frequency_id = "check_na_setting_against_data"
)
#nolint end
}
obs$na_as_missing_default_used <- NULL
return(obs)
}
10 changes: 10 additions & 0 deletions R/estimate_infections.R
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,16 @@ estimate_infections <- function(data,
name = "EpiNow2.epinow.estimate_infections"
)
}

# If the user is using the default treatment of NA's as missing and
# their data has implicit or explicit NA's, inform them of what's
# happening and provide alternatives.
obs <- check_na_setting_against_data(
obs = obs,
data = dirty_reported_cases,
cols_to_check = c("date", "confirm")
)
# Create clean and complete cases
# Order cases
reported_cases <- create_clean_reported_cases(
data, horizon,
Expand Down
13 changes: 10 additions & 3 deletions R/estimate_secondary.R
Original file line number Diff line number Diff line change
Expand Up @@ -182,15 +182,22 @@ estimate_secondary <- function(data,
assert_logical(verbose)

reports <- data.table::as.data.table(data)
secondary_reports <- reports[, list(date, confirm = secondary)]
# If the user is using the default treatment of NA's as missing and
# their data has implicit or explicit NA's, inform them of what's
# happening and provide alternatives.
obs <- check_na_setting_against_data(
obs = obs,
data = reports,
cols_to_check = c("date", "primary", "secondary")
)
secondary_reports_dirty <- reports[, list(date, confirm = secondary)]
secondary_reports <- create_clean_reported_cases(
secondary_reports,
secondary_reports_dirty,
filter_leading_zeros = filter_leading_zeros,
zero_threshold = zero_threshold
)
## fill in missing data (required if fitting to prevalence)
complete_secondary <- create_complete_cases(secondary_reports)

## fill down
secondary_reports[, confirm := nafill(confirm, type = "locf")]
## fill any early data up
Expand Down
8 changes: 5 additions & 3 deletions R/opts.R
Original file line number Diff line number Diff line change
Expand Up @@ -628,6 +628,8 @@ obs_opts <- function(family = c("negbin", "poisson"),
na = c("missing", "accumulate"),
likelihood = TRUE,
return_likelihood = FALSE) {
# NB: This has to be checked first before the na argument is touched anywhere.
na_default_used <- missing(na)
na <- arg_match(na)
if (na == "accumulate") {
#nolint start: duplicate_argument_linter
Expand All @@ -644,9 +646,8 @@ obs_opts <- function(family = c("negbin", "poisson"),
.frequency = "regularly",
.frequency_id = "obs_opts"
)
#nolint end
}

#nolint end
if (length(phi) == 2 && is.numeric(phi)) {
cli_abort(
c(
Expand All @@ -664,7 +665,8 @@ obs_opts <- function(family = c("negbin", "poisson"),
scale = scale,
accumulate = as.integer(na == "accumulate"),
likelihood = likelihood,
return_likelihood = return_likelihood
return_likelihood = return_likelihood,
na_as_missing_default_used = na_default_used
)

for (param in c("phi", "scale")) {
Expand Down
31 changes: 31 additions & 0 deletions man/check_na_setting_against_data.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

21 changes: 21 additions & 0 deletions man/test_data_complete.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

91 changes: 90 additions & 1 deletion tests/testthat/test-checks.R
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,95 @@ test_that("check_reports_valid errors for bad 'secondary' specifications", {
})

test_that("check_sparse_pmf_tail throws a warning as expected", {
# NB: The warning is set to be thrown once every 8 hours, so hard to test
# regularly. The fix is to change the local setting here to throw the
# warning on demand for the sake of multiple runs of the test within
# 8 hours. That's what the rlang call below does
rlang::local_options(rlib_warning_verbosity = "verbose")
pmf <- c(0.4, 0.30, 0.20, 0.05, 0.049995, 4.5e-06, rep(1e-7, 5))
expect_warning(check_sparse_pmf_tail(pmf), "PMF tail has")
expect_warning(
check_sparse_pmf_tail(pmf),
"PMF tail has"
)
})

test_that("test_data_complete detects complete and incomplete data", {
# example_confirmed with explicit missing dates
ec_missing_date <- copy(example_confirmed)[c(1, 3), date := NA]
# example_confirmed with explicit missing confirm
ec_missing_confirm <- copy(example_confirmed)[c(1, 3), confirm := NA]
# example_confirmed with implicit missing (missing entries)
ec_implicit_missing <- copy(example_confirmed)[-c(1,3,5), ]
# Create a hypothetical complete example_secondary
es <- copy(example_confirmed)[
, primary := confirm
][
, secondary := primary * 0.4
]
# example_secondary with explicit missing primary
es_missing_primary <- copy(es)[c(1, 3), primary := NA]
# example_secondary with explicit missing secondary
es_missing_secondary <- copy(es)[c(1, 3), secondary := NA]

# cols to check
ep_cols <- c("date", "confirm")
es_cols <- c("date", "primary", "secondary")
# Expectations
expect_true(test_data_complete(example_confirmed, ep_cols))
expect_true(test_data_complete(es, es_cols))
expect_false(test_data_complete(ec_missing_date, ep_cols))
expect_false(test_data_complete(ec_missing_confirm, ep_cols))
expect_false(test_data_complete(es_missing_primary, es_cols))
expect_false(test_data_complete(es_missing_secondary, es_cols))
expect_false(test_data_complete(ec_implicit_missing, ep_cols))
})

test_that("check_na_setting_against_data works as expected", {
# If data is incomplete and the default na = "missing" is being used,
# expect a message thrown once every 8 hours.
# NB: We change the local setting here to throw the message on demand, rather
# than every 8 hours, for the sake of multiple runs of the test within
# 8 hours.
rlang::local_options(rlib_message_verbosity = "verbose")
expect_message(
check_na_setting_against_data(
obs = obs_opts(),
data = copy(example_confirmed)[c(1, 3), confirm := NA],
cols_to_check = c("date", "confirm")
),
"version 1.5.0 missing dates or dates"
)
# If data is incomplete but the user explicitly set na = "missing", then
# expect no message
expect_no_message(
check_na_setting_against_data(
obs = obs_opts(na = "missing"),
data = copy(example_confirmed)[c(1, 3), confirm := NA],
cols_to_check = c("date", "confirm")
)
)
# If data is complete, expect no message even when using default na as
# missing setting
expect_no_message(
check_na_setting_against_data(
obs = obs_opts(),
data = example_confirmed,
cols_to_check = c("date", "confirm")
)
)
expect_identical(
setdiff(
names(
obs_opts()
),
names(
check_na_setting_against_data(
obs = obs_opts(),
data = example_confirmed,
cols_to_check = c("date", "confirm")
)
)
),
"na_as_missing_default_used"
)
})
31 changes: 31 additions & 0 deletions tests/testthat/test-obs_opts.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
test_that("obs_opts returns expected default values", {
result <- suppressWarnings(obs_opts())

expect_s3_class(result, "obs_opts")
expect_equal(result$family, "negbin")
expect_equal(result$weight, 1)
expect_true(result$week_effect)
expect_equal(result$week_length, 7L)
expect_equal(result$scale, list(mean = 1, sd = 0))
expect_equal(result$accumulate, 0)
expect_true(result$likelihood)
expect_false(result$return_likelihood)
})

test_that("obs_opts returns expected messages", {
# The option na = "accumulate" informs the user of what is
# going to be done once every 8 hours, so hard to test regularly.
# NB: We change the local setting here to throw the message on demand, rather
# than every 8 hours, for the sake of multiple runs of the test within
# 8 hours.
rlang::local_options(rlib_message_verbosity = "verbose")
expect_message(
obs_opts(na = "accumulate"),
"modelled values that correspond to NA values"
)
})

test_that("obs_opts behaves as expected for user specified na treatment", {
# If user explicitly specifies NA as missing, then don't throw message
expect_false(obs_opts(na = "missing")$na_as_missing_default_used)
})

0 comments on commit f5927c4

Please sign in to comment.