-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathBayesian_IPW_Demo.R
414 lines (358 loc) · 12.3 KB
/
Bayesian_IPW_Demo.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
#------Bayesian IPW Modeling for Causal Inference in a Bayesian Framework-------
#-Author: A. Jordan Nafa----------------------------Created: November 18, 2021-#
#-R Version: 4.1.0----------------------------------Revised: December 18, 2021-#
# Note: All analysis for the models were performed on a Windows 10 desktop
# computer with a Ryzen 9 5900X CPU, 128GB of DDR4 Memory, and an Nvidia RTX
# 3080TI GPU using cmdstan 2.8.2 as a backend via cmdstanr and brms.
# Set Project Options----
options(
digits = 4, # Significant figures output
scipen = 999, # Disable scientific notation
repos = getOption("repos")["CRAN"],
brms.backend = "cmdstanr"
)
# Load the Required libraries----
pacman::p_load(
"tidyverse",
"data.table",
"dtplyr",
"datawizard",
"brms",
"cmdstanr",
"tidybayes",
"ggdag"
)
# Load the helper functions----
source("scripts/00_Helper_Functions.R")
# Base Directory for the model objects
models_dir <- "analyses/models/"
# Base Directory for the stan files
stan_dir <- "analyses/stan/"
#------------------------------------------------------------------------------#
#-----------------------------Data Preparation----------------------------------
#------------------------------------------------------------------------------#
# Load the Mosquito Nets Data
nets <- read_rds("data/mosquito_nets.rds")
# Print the data
glimpse(nets)
#------------------------------------------------------------------------------#
#-----------------Causal Graphs for the Treatment Assignment--------------------
#------------------------------------------------------------------------------#
# Construct the DAG to Identify Confounders to Adjust For
mosquito_dag <- dagify(
malaria_risk ~ net + income + health + temperature,
net ~ income + health + temperature,
health ~ income,
exposure = "net",
outcome = "malaria_risk",
coords = list(
x = c(
malaria_risk = 7,
net = 3,
income = 4,
health = 5,
temperature = 6
),
y = c(
malaria_risk = 2,
net = 2,
income = 3,
health = 1,
temperature = 3
)
),
labels = c(
malaria_risk = "Risk of Malaria",
net = "Mosquito Net",
income = "Income",
health = "Health",
temperature = "Nighttime Temperatures",
resistance = "Insecticide Resistance"
)
)
# Turn DAG into a tidy data frame for plotting
mosquito_dag_tidy <- mosquito_dag %>%
tidy_dagitty() %>%
node_status()
# Make things pretty because we aren't savages
mosquito_dag_plot <- ggplot(
mosquito_dag_tidy,
aes(x = x, y = y, xend = xend, yend = yend)
) +
# Add a geom for the graph edges
geom_dag_edges() +
# Add a geom for the graph points
geom_dag_point(aes(color = status)) +
# Add labels for the graph nodes
geom_label(
aes(label = label, fill = status),
color = "white",
fontface = "bold",
family = "serif",
nudge_y = -0.3
) +
# Set the color scheme of the graph nodes
scale_color_manual(
values = c(
exposure = "#5870D0",
outcome = "#A00000",
latent = "#8830E0"
),
na.value = "#A868F8"
) +
# Set the fill scheme of the graph nodes
scale_fill_manual(
values = c(
exposure = "#5870D0",
outcome = "#A00000",
latent = "#8830E0"
),
na.value = "#A868F8"
) +
# Disable legends
guides(color = "none", fill = "none") +
# Apply custom sylistic settings
map_theme()
# Render the DAG Plot
print(mosquito_dag_plot)
# Retreive the adjustment set
dagitty::adjustmentSets(mosquito_dag)
#------------------------------------------------------------------------------#
#------------------------Fitting the Design Stage Model-------------------------
#------------------------------------------------------------------------------#
# Note: Since the goal is to predict the assignment of the treatment, in an
# actual application where there are a wide number of potentially important
# variables it may make more sense to perform projection predictive variable
# selection via the projpred package or specify several different treatment
# models and employ a pseudo-bayesian model averaging approach via loo.
# Define a formula for predicting net usage (the treatment)----
treatment_bf <- bf(
net ~ income + temperature + health,
family = brmsfamily("bernoulli", "logit"), # Logistic Regression Model
decomp = "QR" # Applies a QR Decompoisition to the Predictor Matrix
)
# Specify some weakly informative priors for the model parameters----
treatment_priors <-
prior("student_t(3, 0, 2.5)", class = "Intercept") +
prior("normal(0, 2)", class = "b")
# Fit the model using brms----
treatment_model <- brm(
formula = treatment_bf,
prior = treatment_priors,
data = nets,
chains = 8,
cores = 8L,
iter = 2000, # 1000 warmup/1000 sampling per chain
opencl = opencl(ids = c(0, 0)), # GPU Accelerated Computation
seed = 1234,
backend = "cmdstanr",
save_pars = save_pars(all = TRUE),
save_model = str_c(stan_dir, "Treatment_Model.stan"),
file = str_c(models_dir, "treatment_model")
)
# Add LOO and Bayes R2 to the Model for the Full Data----
treatment_model <- add_criterion(
treatment_model,
criterion = c("loo", "bayes_R2", "loo_R2"),
cores = 8, # Adjust this based on your system specifications
file = str_c(models_dir, "treatment_model"),
seed = 1234
)
# Posterior predictive check for the model
ppc_treatment <- pp_check(treatment_model, ndraws = 1000)
#------------------------------------------------------------------------------#
#------------------------Generating the IPW Weights-----------------------------
#------------------------------------------------------------------------------#
# Note: The `ndraws` argument needs to be equal to at least the total number of
# iterations (warmup + sampling) that the second stage outcome model will be
# run for.
# First we need to generate a matrix of expectations from the posterior
pred_probs_chains <- posterior_epred(
treatment_model,
cores = 8L,
seed = 1234
)
# Then we need to transpose the matrix so that rows correspond to observations
# and each column is a vector of probabilities of length N
ipw_matrix <- t(pred_probs_chains) %>%
# Coerce the matrix to a tibble for further pre-processing
as_tibble() %>%
# Add the response vector as the first column
mutate(net_num = nets$net_num, .before = 1) %>%
# Generate the inverse probability weights
transmute(across(
starts_with("V"),
~ (net_num / .x) + ((1 - net_num) / (1 - .x))
))
# Rows are observations and columns are posterior samples
head(ipw_matrix, c(10, 11))
# Write the IPW Matrix to a file
write_rds(ipw_matrix, "data/ipw_matrix.rds")
#------------------------------------------------------------------------------#
#-----------------------Specifying the Outcome Model----------------------------
#------------------------------------------------------------------------------#
# Note: I'm not sure if its possible to specify the outcome model via the
# method used here in brms. It may be possible to do by specifying a custom
# family but I'm honestly not sure so for the time being I'm just going to code
# things up directly in Stan. That said, we can use brms to generate the
# intitial stan code
# Specify the formula for the outcome model----
# outcome_bf <- bf(
# malaria_risk | weights() ~ net_num,
# family = brmsfamily("gaussian", "identity") # Normal Regression Model
# )
# Specify some weakly informative priors for the model parameters----
# outcome_priors <-
# prior("student_t(3, 0, 2.5)", class = "Intercept") +
# prior("normal(0, 2.5)", class = "b")
# Generate Stan Code for the Model
#make_stancode(
# formula = outcome_bf,
# data = nets,
# prior = outcome_priors,
# sample_prior = "no",
# save_model = str_c(stan_dir, "Outcome_Model.stan"),
#)
# Our goal here is to generate samples based on a different vector
# of the IPW matrix for each iteration. To do this, we need to
# define some C++ function to use in the model. We add the following
# in the functions block of our stan program
"functions {
// Sets the starting iteration count to 1
static int itct = 1;
// Adds 1 to the count each iteration
inline void add_iter(std::ostream* pstream__){
itct += 1;
}
// Returns the current iteration
inline int get_iter(std::ostream* pstream__){
return itct;
}
}"
# Then we need to declare the dimensions of the ipw_matrix in the
# data block. We'll remove the existing weights declaration because
# we're going to define the weights as a matrix here
"data {
int<lower=1> N; // Total Number of Observations
vector[N] Y; // Response Vector
int<lower=1> K; // Number of Population-Level Coefficients
matrix[N, K] X; // Design Matrix for the Fixed Effects
int L; // The number of columns in the ipw matrix
matrix[N, L] IPW; // Declaring the weights matrix
}"
# Then we can add the declaration of the weights matrix in the
# tranformed data block.
"transformed data {
int Kc = K - 1;
matrix[N, Kc] Xc; // Centered version of X without an intercept
vector[Kc] means_X; // Column means of X before centering
for (i in 2:K) {
means_X[i - 1] = mean(X[, i]);
Xc[, i - 1] = X[, i] - means_X[i - 1];
}
}"
# We don't need to make any changes to the parameters block
"parameters {
vector[Kc] b; // Population-Level Effects
real Intercept; // Temporary Intercept for Centered Predictors
real<lower=0> sigma; // Dispersion Parameter
}"
# In the model block, we'll need to make some changes to the likelihood
# and make the weights a matrix instead of a vector
"model {
// Likelihood (It might be possible to vectorize this)
vector[N] mu = Intercept + Xc * b;
int M = get_iter(); // Get the current iteration
for (n in 1:N) {
target += IPW[n, M] * (normal_lpdf(Y[n] | mu[n], sigma));
}
// Priors for the Model Parameters
target += normal_lpdf(b | 0, 2.5);
target += student_t_lpdf(Intercept | 3, 0, 2.5);
target += student_t_lpdf(sigma | 3, 0, 14.8) - 1 * student_t_lccdf(0 | 3, 0, 14.8);
}"
# Finally, in generated quantities we need to call the function to update
# the number of iteration counter at each pass.
"generated quantities {
// Actual Population-Level Intercept
real b_Intercept = Intercept - dot_product(means_X, b);
// Update the counter each iteration
add_iter();
// You could also calculate the ATE here or whatever
}"
# Then we put it all together in a stan file (not shown) and read back in the
# modified stan code
# Print the model
str_split(
read_lines(str_c(stan_dir, "IPW_Outcome_Model.stan")),
pattern = ";",
simplify = T
)
# I'm using rstan to fit the outcome model because cmdstanr doesn't
# like my C++ code for some reason
library(rstan)
rstan_options(auto_write = TRUE)
# Compile the stan model
outcome_model <- stan_model(
model_name = "Outcome IPW Model",
file = "analyses/stan/IPW_Outcome_Model.stan",
allow_undefined = TRUE,
includes = paste0('\n#include "', file.path(getwd(), 'analyses/stan/iterfuns.hpp'), '"\n'),
save_dso = TRUE
)
# Define the data for the stan model
stan_data_ls <- list(
N = nrow(nets),
Y = nets$malaria_risk,
K = 2,
X = model.matrix(~ net_num, data = nets),
L = 8000,
IPW = as.matrix(ipw_matrix)
)
samp_outcome_model <- sampling(
outcome_model,
data = stan_data_ls,
chains = 8,
iter = 8000,
seed = 1234,
cores = 12,
sample_file = str_c(models_dir, "outcome-model/IPW_Outcome_Model_Samples"),
diagnostic_file = str_c(models_dir, "outcome-model/IPW_Outcome_Model_Diag")
)
# Write the stan samples to an object
write_rds(samp_outcome_model, str_c(models_dir, "IPW_Outcome_Model.rds"))
# Calculate Treated and Untreated Units
treats <- samp_outcome_model %>%
# Tidy data frame of draws
tidy_draws() %>%
# Transmute the draws
transmute(
treated = b_Intercept + `b[1]`,
untreated = b_Intercept,
ATE = treated - untreated
)
ate_plot <- ggplot(treats, aes(x = ATE)) +
# Add the gradient slab
stat_halfeye(
aes(slab_alpha = stat(pdf)),
fill = "purple",
fill_type = "gradient",
show.legend = FALSE,
point_interval = mean_qi
) +
# Apply custom theme settings
plot_theme(plot.margin = margin(5,5,5,5, "mm")) +
# Add labels
labs(
y = "Density",
x = expression(paste("ATE", Delta))
)
# Render the gradient plot to a file
ggsave(
filename = "ATE_Graph.jpeg",
dpi = "retina",
device = "jpeg",
height = 8,
width = 12,
type = "cairo"
)