-
Notifications
You must be signed in to change notification settings - Fork 99
/
pre_process_did.R
372 lines (310 loc) · 13.6 KB
/
pre_process_did.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
#' @title Process `did` Function Arguments
#'
#' @description Function to process arguments passed to the main methods in the
#' `did` package as well as conducting some tests to make sure
#' data is in proper format / try to throw helpful error messages.
#'
#' @inheritParams att_gt
#' @param call Function call to att_gt
#'
#' @return a [`DIDparams`] object
#'
#' @export
pre_process_did <- function(yname,
tname,
idname,
gname,
xformla = NULL,
data,
panel = TRUE,
allow_unbalanced_panel,
control_group = c("nevertreated","notyettreated"),
anticipation = 0,
weightsname = NULL,
alp = 0.05,
bstrap = FALSE,
cband = FALSE,
biters = 1000,
clustervars = NULL,
est_method = "dr",
base_period = "varying",
print_details = TRUE,
faster_mode = FALSE,
pl = FALSE,
cores = 1,
call = NULL) {
#-----------------------------------------------------------------------------
# Data pre-processing and error checking
#-----------------------------------------------------------------------------
# set control group
control_group <- control_group[1]
if(!(control_group %in% c("nevertreated","notyettreated"))){
stop("control_group must be either 'nevertreated' or 'notyettreated'")
}
# make sure dataset is a data.frame
# this gets around RStudio's default of reading data as tibble
if (!all( class(data) == "data.frame")) {
data <- as.data.frame(data)
}
# make sure time periods are numeric
if (! (is.numeric(data[, tname])) ) stop("data[, tname] must be numeric")
# make sure gname is numeric
if (! (is.numeric(data[, gname])) ) stop("data[, gname] must be numeric")
# put in blank xformla if no covariates or check whether all variables are in data
if (is.null(xformla)) {
xformla <- ~1
} else {
# extract variable names from the formula
formula_vars <- all.vars(xformla)
# identify variables in xformla not in data
missing_vars <- setdiff(formula_vars, names(data))
# error checking for missing variables in data
if (length(missing_vars) > 0) {
stop(paste("The following variables are not in data:", paste(missing_vars, collapse = ", ")), call. = FALSE)
}
}
# drop irrelevant columns from data
data <- cbind.data.frame(data[,c(idname, tname, yname, gname, weightsname, clustervars)], model.frame(xformla, data=data, na.action=na.pass))
# check if any covariates were missing
n_orig <- nrow(data)
data <- data[complete.cases(data),]
n_diff <- n_orig - nrow(data)
if (n_diff != 0) {
warning(paste0("dropped ", n_diff, " rows from original data due to missing data"))
}
# weights if null
ifelse(is.null(weightsname), w <- rep(1, nrow(data)), w <- data[,weightsname])
if (".w" %in% colnames(data)) stop("`did` tried to use column named \".w\" internally, but there was already a column with this name")
data$.w <- w
# Outcome variable will be denoted by y
# data$.y <- data[, yname]
# figure out the dates
# list of dates from smallest to largest
tlist <- unique(data[,tname])[order(unique(data[,tname]))]
# Groups with treatment time bigger than max time period are considered to be never treated
asif_never_treated <- (data[,gname] > max(tlist, na.rm = TRUE))
asif_never_treated[is.na(asif_never_treated)] <- FALSE
data[asif_never_treated, gname] <- 0
# list of treated groups (by time) from smallest to largest
glist <- unique(data[,gname], )[order(unique(data[,gname]))]
# Check if there is a never treated group
if ( length(glist[glist==0]) == 0) {
if(control_group=="nevertreated"){
stop("There is no available never-treated group")
} else {
# Drop all time periods with time periods >= latest treated
data <- subset(data,(data[,tname] < (max(glist)-anticipation)))
# Replace last treated time with zero
# lines.gmax <- data[,gname]==max(glist, na.rm = TRUE)
# data[lines.gmax,gname] <- 0
tlist <- sort(unique(data[,tname]))
glist <- sort(unique(data[,gname]))
# don't comput ATT(g,t) for groups that are only treated at end
# and only play a role as a comparison group
glist <- glist[ glist < max(glist)]
}
}
# Only the treated groups
glist <- glist[glist>0]
# drop groups treated in the first period or before
first.period <- tlist[1]
glist <- glist[glist > first.period + anticipation]
# check for groups treated in the first period and drop these
# nfirstperiod <- length(unique(data[ !((data[,gname] > first.period) | (data[,gname]==0)), ] )[,idname])
treated_first_period <- ( data[,gname] <= first.period ) & ( !(data[,gname]==0) )
treated_first_period[is.na(treated_first_period)] <- FALSE
nfirstperiod <- ifelse(panel, length(unique(data[treated_first_period,][,idname])), nrow(data[treated_first_period,]))
if ( nfirstperiod > 0 ) {
warning(paste0("Dropped ", nfirstperiod, " units that were already treated in the first period."))
data <- data[ data[,gname] %in% c(0,glist), ]
# update tlist and glist
tlist <- unique(data[,tname])[order(unique(data[,tname]))]
glist <- unique(data[,gname], )[order(unique(data[,gname]))]
glist <- glist[glist>0]
# drop groups treated in the first period or before
first.period <- tlist[1]
glist <- glist[glist > first.period + anticipation]
}
# make sure id is numeric
if (! is.null(idname)){
# make sure id is numeric
if (! (is.numeric(data[, idname])) ) stop("data[, idname] must be numeric")
## # checks below are useful, but removing due to being slow
## # might ought to figure out a way to do these faster later
## # these checks are also closely related to making sure
## # that we have a well-balanced panel, so it might make
## # sense to move them over to the BMisc package
## # Check if idname is unique by tname
## n_id_year = all( table(data[, idname], data[, tname]) <= 1)
## if (! n_id_year) stop("The value of idname must be the unique (by tname)")
## # make sure gname doesn't change across periods for particular individuals
## if (!all(sapply( split(data, data[,idname]), function(df) {
## length(unique(df[,gname]))==1
## }))) {
## stop("The value of gname must be the same across all periods for each particular individual.")
## }
}
# if user specifies repeated cross sections,
# set that it really is repeated cross sections
true_repeated_cross_sections <- FALSE
if (!panel) {
true_repeated_cross_sections <- TRUE
}
#-----------------------------------------------------------------------------
# setup data in panel case
#-----------------------------------------------------------------------------
# Check if data is a balanced panel if panel = TRUE and allow_unbalanced_panel = TRUE
bal_panel_test <- panel*allow_unbalanced_panel
if (bal_panel_test) {
# First, focus on complete cases
keepers <- complete.cases(data)
data_comp <- data[keepers,]
# make it a balanced data set
n_all <- length(unique(data_comp[,idname]))
data_bal <- BMisc::makeBalancedPanel(data_comp, idname, tname)
n_bal <- length(unique(data_bal[,idname]))
if (n_bal < n_all) {
# message(paste0("You have an unbalanced panel. Proceeding as such."))
allow_unbalanced_panel <- TRUE
} else {
# message(paste0("You have a balanced panel. Setting the allow_unbalanced_panel = FALSE."))
allow_unbalanced_panel <- FALSE
}
}
if (panel) {
# check for unbalanced panel
if (allow_unbalanced_panel) {
# Flag for true repeated cross sections
panel <- FALSE
true_repeated_cross_sections <- FALSE
} else {
# this is the case where we coerce balanced panel
# check for complete cases
keepers <- complete.cases(data)
n <- length(unique(data[,idname]))
n.keep <- length(unique(data[keepers,idname]))
if (nrow(data[keepers,]) < nrow(data)) {
warning(paste0("Dropped ", (n-n.keep), " observations that had missing data."))
data <- data[keepers,]
}
# make it a balanced data set
n.old <- length(unique(data[,idname]))
data <- BMisc::makeBalancedPanel(data, idname, tname)
n <- length(unique(data[,idname]))
if (n < n.old) {
warning(paste0("Dropped ", n.old-n, " observations while converting to balanced panel."))
}
# If drop all data, you do not have a panel.
if (nrow(data)==0) {
stop("All observations dropped to converted data to balanced panel. Consider setting `panel = FALSE' and/or revisit 'idname'.")
}
n <- nrow(data[ data[,tname]==tlist[1], ])
# slow, repeated check here...
## # check that first.treat doesn't change across periods for particular individuals
## if (!all(sapply( split(data, data[,idname]), function(df) {
## length(unique(df[,gname]))==1
## }))) {
## stop("The value of gname must be the same across all periods for each particular individual.")
## }
}
}
#-----------------------------------------------------------------------------
# code for setting up repeated cross sections (and unbalanced panel)
#-----------------------------------------------------------------------------
if (!panel) {
# check for complete cases
keepers <- complete.cases(data)
if (nrow(data[keepers,]) < nrow(data)) {
warning(paste0("Dropped ", nrow(data) - nrow(data[keepers,]), " observations that had missing data."))
data <- data[keepers,]
}
# If drop all data, you do not have a panel.
if (nrow(data)==0) {
stop("All observations dropped due to missing data problems.")
}
# n-row data.frame to hold the influence function
if (true_repeated_cross_sections) {
data$.rowid <- seq(1:nrow(data))
idname <- ".rowid"
} else {
# set rowid to idname for repeated cross section/unbalanced
data$.rowid <- data[, idname]
}
# n is unique number of cross section observations
# this is different for repeated cross sections and unbalanced panel
n <- length(unique(data[,idname]))
}
## # Update tlist and glist because of data handling
## # figure out the dates
## # list of dates from smallest to largest
## tlist <- unique(data[,tname])[order(unique(data[,tname]))]
## # list of treated groups (by time) from smallest to largest
## glist <- unique(data[,gname])[order(unique(data[,gname]))]
## # Only the treated groups
## glist <- glist[glist>0]
## # drop groups treated in the first period or before
## first.period <- tlist[1]
## glist <- glist[glist > first.period + anticipation]
# Check if groups is empty (usually a problem with the way people defined groups)
if(length(glist)==0){
stop("No valid groups. The variable in 'gname' should be expressed as the time a unit is first treated (0 if never-treated).")
}
# if there are only two time periods, then uniform confidence
# bands are the same as pointwise confidence intervals
if (length(tlist)==2) {
cband <- FALSE
}
#-----------------------------------------------------------------------------
# more error handling after we have balanced the panel
# check against very small groups
gsize <- aggregate(data[,gname], by=list(data[,gname]), function(x) length(x)/length(tlist))
# how many in each group before give warning
# 5 is just a buffer, could pick something else, but seems to work fine
reqsize <- length(BMisc::rhs.vars(xformla)) + 5
# which groups to warn about
gsize <- subset(gsize, x < reqsize) # x is name of column from aggregate
# warn if some groups are small
if (nrow(gsize) > 0) {
gpaste <- paste(gsize[,1], collapse=",")
warning(paste0("Be aware that there are some small groups in your dataset.\n Check groups: ", gpaste, "."))
if ( (0 %in% gsize[,1]) & (control_group == "nevertreated") ) {
stop("never treated group is too small, try setting control_group=\"notyettreated\"")
}
}
#----------------------------------------------------------------------------
# How many time periods
nT <- length(tlist)
# How many treated groups
nG <- length(glist)
# order dataset wrt idname and tname
data <- data[order(data[,idname], data[,tname]),]
# store parameters for passing around later
dp <- DIDparams(yname=yname,
tname=tname,
idname=idname,
gname=gname,
xformla=xformla,
data=as.data.frame(data),
control_group=control_group,
anticipation=anticipation,
weightsname=weightsname,
alp=alp,
bstrap=bstrap,
biters=biters,
clustervars=clustervars,
cband=cband,
print_details=print_details,
faster_mode=faster_mode,
pl=pl,
cores=cores,
est_method=est_method,
base_period=base_period,
panel=panel,
true_repeated_cross_sections=true_repeated_cross_sections,
n=n,
nG=nG,
nT=nT,
tlist=tlist,
glist=glist,
call=call)
}