-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathcombine_estimates.R
157 lines (135 loc) · 5.32 KB
/
combine_estimates.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# ============================================
# Authors: MG
# Maintainers: MG
# Copyright: 2023, HRDAG, GPL v2 or later
# ============================================
#' @title combine_estimates
#'
#' @description Combine MSE estimations results for a given stratum calculated
#' using multiple replicate files created using multiple imputation. Combination
#' is done using the standard approach that makes use of the laws of total
#' expectation and total variance.
#'
#' @param stratum_estimates A data frame of estimates for a stratum of interest
#' calculated using `mse` for all replicates being used for the analysis. The
#' data frame should have columns `N` and `n_obs` from the `mse` function and an
#' additional column `replicate` indicating which replicate the estimates were
#' calculated on.
#'
#' @return A data frame row with the point estimate (`N_mean`) and the
#' associated 95% uncertainty interval (lower bound is `N_025`, upper bound is
#' `N_975`).
#' @export
#' @importFrom dplyr "%>%"
#' @importFrom Rdpack reprompt
#'
#' @references
#' \insertRef{BDA3}{verdata}
#'
#' @examples
#' \dontrun{
#' set.seed(19481210)
#'
#' library(dplyr)
#' library(purrr)
#' library(glue)
#'
#'
#' simulate_estimates <- function(stratum_data, replicate_num) {
#'
#' # simulate an imputed stratification variable to determine whether a record
#' # should be considered part of the stratum for estimation
#' stratification_var <- sample(c(0, 1), size = 100,
#' replace = TRUE, prob = c(0.1, 0.9))
#'
#' my_stratum <- bind_cols(my_stratum, tibble::tibble(stratification_var)) %>%
#' filter(stratification_var == 1)
#'
#' results <- mse(my_stratum, "my_stratum", K = 4) %>%
#' mutate(replicate = replicate_num)
#'
#' return(results)
#'
#' }
#'
#'
#' in_A <- sample(c(0, 1), size = 100, replace = TRUE, prob = c(0.45, 0.65))
#' in_B <- sample(c(0, 1), size = 100, replace = TRUE, prob = c(0.5, 0.5))
#' in_C <- sample(c(0, 1), size = 100, replace = TRUE, prob = c(0.75, 0.25))
#'
#' my_stratum <- tibble::tibble(in_A, in_B, in_C)
#'
#' replicate_nums <- glue("R{1:20}")
#'
#' estimates <- map_dfr(.x = replicate_nums,
#' .f = ~simulate_estimates(stratum_data = my_stratum, replicate_num = .x))
#'
#' combine_estimates(estimates)
#'
#' }
combine_estimates <- function(stratum_estimates) {
# prep replicate data
estimates_df <- stratum_estimates %>%
assertr::verify(assertr::has_all_names("N", "replicate", "n_obs")) %>%
dplyr::mutate(n_0 = N - n_obs) %>%
assertr::verify(all(n_0 >= 0))
# calculate number of replicates
num_replicates <- dplyr::n_distinct(estimates_df$replicate)
if (num_replicates == 1) {
warning("encountered stratum estimated using only one replicate; setting results to NA")
return(tibble::tibble(N_025 = NA_integer_,
N_mean = NA_integer_,
N_975 = NA_integer_))
}
if (num_replicates < 10) {
warning("stratum estimated with fewer than 10 replicates, results may be unreliable")
}
# calculate mean n_obs to be used for CI construction
n_obs <- estimates_df %>%
dplyr::select(n_obs, replicate) %>%
dplyr::distinct() %>%
dplyr::pull(n_obs) %>%
mean() %>%
round(0)
# mean - law of total expectation
total_mean <- estimates_df %>%
dplyr::group_by(replicate) %>%
dplyr::summarize(within_mean = mean(n_0)) %>%
dplyr::summarize(total_mean = round(mean(within_mean), 0)) %>%
dplyr::pull(total_mean)
# log transform data for variance calculation
log_df <- estimates_df %>%
dplyr::mutate(n_0 = if_else(n_0 == 0, 0.1, n_0)) %>%
dplyr::mutate(log_n_0 = log(n_0)) %>%
dplyr::select(log_n_0, replicate)
# total mean of log
total_mean_log <- log_df %>%
dplyr::group_by(replicate) %>%
dplyr::summarize(within_mean = mean(log_n_0)) %>%
dplyr::summarize(total_mean = mean(within_mean)) %>%
dplyr::pull(total_mean)
# calculate within replicate variance
within_variance_log <- log_df %>%
dplyr::group_by(replicate) %>%
dplyr::summarize(within_variance = stats::var(log_n_0)) %>%
dplyr::summarize(total_within_variance = mean(within_variance)) %>%
dplyr::pull(total_within_variance)
# calculate between replicate variances
between_variance_log <- log_df %>%
dplyr::group_by(replicate) %>%
dplyr::summarize(within_mean = mean(log_n_0)) %>%
dplyr::summarize(total_between_variance = stats::var(within_mean)) %>%
dplyr::pull(total_between_variance)
# calculate total variance
total_variance_log <- (within_variance_log + ((num_replicates + 1) / num_replicates) * between_variance_log)
sd_log <- sqrt(total_variance_log)
# construct approximate 95% CI
degrees_freedom <- (num_replicates - 1) * (1 + 1 / (num_replicates + 1) * within_variance_log / between_variance_log)^2
t_statistic <- abs(stats::qt(0.025, degrees_freedom))
n_0_975 <- round(exp(total_mean_log + t_statistic * sd_log), 0)
n_0_025 <- round(exp(total_mean_log - t_statistic * sd_log), 0)
return(tibble::tibble(N_025 = n_obs + n_0_025,
N_mean = n_obs + total_mean,
N_975 = n_obs + n_0_975))
}
# done.