diff --git a/DESCRIPTION b/DESCRIPTION index b8a5dfa..29995a8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: did Title: Treatment Effects with Multiple Periods and Groups -Version: 2.2.0.907 +Version: 2.2.1.909 Authors@R: c(person("Brantly", "Callaway", email = "brantly.callaway@uga.edu", role = c("aut", "cre")), person("Pedro H. C.", "Sant'Anna", email="pedro.h.santanna@vanderbilt.edu", role = c("aut"))) URL: https://bcallaway11.github.io/did/, https://github.com/bcallaway11/did/ Description: The standard Difference-in-Differences (DID) setup involves two periods and two groups -- a treated group and untreated group. Many applications of DID methods involve more than two periods and have individuals that are treated at different points in time. This package contains tools for computing average treatment effect parameters in Difference in Differences setups with more than two periods and with variation in treatment timing using the methods developed in Callaway and Sant'Anna (2021) . The main parameters are group-time average treatment effects which are the average treatment effect for a particular group at a a particular time. These can be aggregated into a fewer number of treatment effect parameters, and the package deals with the cases where there is selective treatment timing, dynamic treatment effects, calendar time effects, or combinations of these. There are also functions for testing the Difference in Differences assumption, and plotting group-time average treatment effects. @@ -18,7 +18,9 @@ Imports: generics, methods, tidyr, - data.table (>= 1.15.4) + parglm (>= 0.1.7), + data.table (>= 1.15.4), + dreamerr (>= 1.4.0) Suggests: backports Roxygen: list(markdown = TRUE) diff --git a/NAMESPACE b/NAMESPACE index eb7393f..a766c41 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -20,14 +20,15 @@ export(att_gt) export(build_sim_dataset) export(compute.aggte) export(compute.att_gt) +export(compute.att_gt2) export(conditional_did_pretest) -export(get_wide_data) export(ggdid) export(glance) export(gplot) export(indicator) export(mboot) export(pre_process_did) +export(pre_process_did2) export(process_attgt) export(reset.sim) export(sim) @@ -40,7 +41,15 @@ import(data.table) import(ggplot2) import(stats) import(utils) +importFrom(DRDID,drdid_panel) +importFrom(DRDID,drdid_rc) +importFrom(DRDID,reg_did_panel) +importFrom(DRDID,reg_did_rc) +importFrom(DRDID,std_ipw_did_panel) +importFrom(DRDID,std_ipw_did_rc) +importFrom(dreamerr,check_set_arg) importFrom(generics,glance) importFrom(generics,tidy) +importFrom(methods,as) importFrom(methods,is) importFrom(tidyr,gather) diff --git a/NEWS.md b/NEWS.md index 33d215c..5cec8ee 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,6 @@ # did 2.2.0 (development version) - * Code improvements that made the package much faster and memory efficient + * Code improvements that made the package faster and more memory efficient * Improved automated testing and regression testing diff --git a/R/DIDparams.R b/R/DIDparams.R index b163d11..7aafd3a 100644 --- a/R/DIDparams.R +++ b/R/DIDparams.R @@ -31,6 +31,7 @@ DIDparams <- function(yname, clustervars=NULL, cband=TRUE, print_details=TRUE, + faster_mode=FALSE, pl=FALSE, cores=1, est_method="dr", @@ -59,6 +60,7 @@ DIDparams <- function(yname, clustervars=clustervars, cband=cband, print_details=print_details, + faster_mode=faster_mode, pl=pl, cores=cores, est_method=est_method, diff --git a/R/DIDparams2.R b/R/DIDparams2.R new file mode 100644 index 0000000..bde12f0 --- /dev/null +++ b/R/DIDparams2.R @@ -0,0 +1,91 @@ +#' @title DIDparams +#' +#' @description Object to hold DiD parameters that are passed across functions +#' +#' @inheritParams att_gt2 +#' @inheritParams pre_process_did2 +#' @param did_tensor list of outcome tensors that are used in the estimation +#' @param args list of arguments that are used in the estimation +#' @noRd +DIDparams2 <- function(did_tensors, args, call=NULL) { + # get the arguments from args + yname <- args$yname + tname <- args$tname + idname <- args$idname + gname <- args$gname + xformla <- args$xformla # formula of covariates + panel <- args$panel + est_method <- args$est_method + bstrap <- args$bstrap + biters <- args$biters + cband <- args$cband + anticipation <- args$anticipation + control_group <- args$control_group + allow_unbalanced_panel <- args$allow_unbalanced_panel + weightsname <- args$weightsname + base_period <- args$base_period + clustervars <- args$clustervars + cores <- args$cores + pl <- args$pl + print_details <- args$print_details + faster_mode <- args$faster_mode + alp <- args$alp + true_repeated_cross_sections <- args$true_repeated_cross_sections + time_periods_count <- args$time_periods_count + time_periods <- args$time_periods + treated_groups_count <- args$treated_groups_count + treated_groups <- args$treated_groups + id_count <- args$id_count + + # get the arguments from did_tensors + outcomes_tensor <- did_tensors$outcomes_tensor + data <- did_tensors$data + time_invariant_data <- did_tensors$time_invariant_data + cohort_counts <- did_tensors$cohort_counts + period_counts <- did_tensors$period_counts + crosstable_counts <- did_tensors$crosstable_counts + covariates <- did_tensors$covariates # matrix of covariates + cluster_vector <- did_tensors$cluster + weights_vector <- did_tensors$weights + + + out <- list(yname=yname, + tname=tname, + idname=idname, + gname=gname, + xformla=xformla, + panel=panel, + est_method=est_method, + bstrap=bstrap, + biters=biters, + cband=cband, + anticipation=anticipation, + control_group=control_group, + allow_unbalanced_panel=allow_unbalanced_panel, + weightsname=weightsname, + base_period=base_period, + clustervars=clustervars, + cores=cores, + pl = pl, + print_details=print_details, + faster_mode=faster_mode, + alp=alp, + true_repeated_cross_sections=true_repeated_cross_sections, + time_periods_count=time_periods_count, + time_periods=time_periods, + treated_groups_count=treated_groups_count, + treated_groups=treated_groups, + id_count=id_count, + outcomes_tensor=outcomes_tensor, + data=data, + time_invariant_data=time_invariant_data, + cohort_counts=cohort_counts, + period_counts=period_counts, + crosstable_counts=crosstable_counts, + covariates=covariates, + cluster_vector=cluster_vector, + weights_vector=weights_vector, + call=call) + class(out) <- "DIDparams" + return(out) +} diff --git a/R/att_gt.R b/R/att_gt.R index fd56bf6..f908a57 100644 --- a/R/att_gt.R +++ b/R/att_gt.R @@ -89,6 +89,10 @@ #' @param anticipation The number of time periods before participating #' in the treatment where units can anticipate participating in the #' treatment and therefore it can affect their untreated potential outcomes +#' @param faster_mode This option enables a faster version of `did`, optimizing +#' computation time for large datasets by improving data management within the package. +#' The default is set to `FALSE`. While the difference is minimal for small datasets, +#' it is recommended for use with large datasets. #' @param base_period Whether to use a "varying" base period or a #' "universal" base period. Either choice results in the same #' post-treatment estimates of ATT(g,t)'s. In pre-treatment @@ -181,47 +185,98 @@ att_gt <- function(yname, clustervars=NULL, est_method="dr", base_period="varying", + faster_mode=FALSE, print_details=FALSE, pl=FALSE, cores=1) { - # this is a DIDparams object - dp <- pre_process_did(yname=yname, - tname=tname, - idname=idname, - gname=gname, - xformla=xformla, - data=data, - panel=panel, - allow_unbalanced_panel=allow_unbalanced_panel, - control_group=control_group, - anticipation=anticipation, - weightsname=weightsname, - alp=alp, - bstrap=bstrap, - cband=cband, - biters=biters, - clustervars=clustervars, - est_method=est_method, - base_period=base_period, - print_details=print_details, - pl=pl, - cores=cores, - call=match.call() - ) - - #----------------------------------------------------------------------------- - # Compute all ATT(g,t) - #----------------------------------------------------------------------------- - results <- compute.att_gt(dp) + # Check if user wants to run faster mode: + if (faster_mode) { + # this is a DIDparams2 object + dp <- pre_process_did2(yname=yname, + tname=tname, + idname=idname, + gname=gname, + xformla=xformla, + data=data, + panel=panel, + allow_unbalanced_panel=allow_unbalanced_panel, + control_group=control_group, + anticipation=anticipation, + weightsname=weightsname, + alp=alp, + bstrap=bstrap, + cband=cband, + biters=biters, + clustervars=clustervars, + est_method=est_method, + base_period=base_period, + print_details=print_details, + faster_mode=faster_mode, + pl=pl, + cores=cores, + call=match.call() + ) + + #----------------------------------------------------------------------------- + # Compute all ATT(g,t) + #----------------------------------------------------------------------------- + results <- compute.att_gt2(dp) + } else { + # this is a DIDparams object + dp <- pre_process_did(yname=yname, + tname=tname, + idname=idname, + gname=gname, + xformla=xformla, + data=data, + panel=panel, + allow_unbalanced_panel=allow_unbalanced_panel, + control_group=control_group, + anticipation=anticipation, + weightsname=weightsname, + alp=alp, + bstrap=bstrap, + cband=cband, + biters=biters, + clustervars=clustervars, + est_method=est_method, + base_period=base_period, + print_details=print_details, + pl=pl, + cores=cores, + call=match.call() + ) + + #----------------------------------------------------------------------------- + # Compute all ATT(g,t) + #----------------------------------------------------------------------------- + results <- compute.att_gt(dp) + } # extract ATT(g,t) and influence functions attgt.list <- results$attgt.list inffunc <- results$inffunc # process results - attgt.results <- process_attgt(attgt.list) + # attgt.results <- process_attgt(attgt.list) + tryCatch( + { + # Attempt to run this line for process results + attgt.results <- process_attgt(attgt.list) + }, + error = function(e) { + # Handle the error + if (faster_mode) { + # If faster_mode is TRUE, send this stop message + stop("An unexpected error occurred, normally associated with a singular matrix due to not enough control units. Try changing faster_mode=FALSE.") + } else { + # If faster_mode is FALSE, send this stop message + stop("An unexpected error occurred, normally associated with a singular matrix due to not enough control units.") + } + } + ) group <- attgt.results$group att <- attgt.results$att tt <- attgt.results$tt @@ -236,7 +291,7 @@ att_gt <- function(yname, # note to self: this def. won't work with unbalanced panel, # same with clustered standard errors # but it is always ignored b/c bstrap has to be true in that case - n <- dp$n + n <- ifelse(faster_mode, dp$id_count, dp$n) V <- Matrix::t(inffunc)%*%inffunc/n se <- sqrt(Matrix::diag(V)/n) diff --git a/R/compute.aggte.R b/R/compute.aggte.R index 017d968..785392c 100644 --- a/R/compute.aggte.R +++ b/R/compute.aggte.R @@ -1,645 +1,654 @@ -#' @title Compute Aggregated Treatment Effect Parameters -#' -#' @description Does the heavy lifting on computing aggregated group-time -#' average treatment effects -#' -#' @inheritParams att_gt -#' @inheritParams aggte -#' @param call The function call to aggte -#' -#' @return [`AGGTEobj`] object -#' -#' @keywords internal -#' -#' @export -compute.aggte <- function(MP, - type = "group", - balance_e = NULL, - min_e = -Inf, - max_e = Inf, - na.rm = FALSE, - bstrap = NULL, - biters = NULL, - cband = NULL, - alp = NULL, - clustervars = NULL, - call = NULL) { - - #----------------------------------------------------------------------------- - # unpack MP object - #----------------------------------------------------------------------------- - # load parameters - group <- MP$group - t <- MP$t - att <- MP$att - dp <- MP$DIDparams - inffunc1 <- MP$inffunc - n <- MP$n - - - gname <- dp$gname - data <- as.data.frame(dp$data) - tname <- dp$tname - idname <- dp$idname - if(is.null(clustervars)){ - clustervars <- dp$clustervars - } - if(is.null(bstrap)){ - bstrap <- dp$bstrap - } - if(is.null(biters)){ - biters <- dp$biters - } - if(is.null(alp)){ - alp <- dp$alp - } - if(is.null(cband)){ - cband <- dp$cband - } - - tlist <- dp$tlist - glist <- dp$glist - panel <- dp$panel - - # overwrite MP objects (so we can actually compute bootstrap) - MP$DIDparams$clustervars <- clustervars - MP$DIDparams$bstrap <- bstrap - MP$DIDparams$biters <- biters - MP$DIDparams$alp <- alp - MP$DIDparams$cband <- cband - dp <- MP$DIDparams - - if(!(type %in% c("simple", "dynamic", "group", "calendar"))) { - stop('`type` must be one of c("simple", "dynamic", "group", "calendar")') - } - - if(na.rm){ - notna <- !is.na(att) - group <- group[notna] - t <- t[notna] - att <- att[notna] - inffunc1 <- inffunc1[, notna] - #tlist <- sort(unique(t)) - glist <- sort(unique(group)) - - # If aggte is of the group type, ensure we have non-missing post-treatment ATTs for each group - if(type == "group"){ - # Get the groups that have some non-missing ATT(g,t) in post-treatmemt periods - gnotna <- sapply(glist, function(g) { - # look at post-treatment periods for group g - whichg <- which( (group == g) & (g <= t)) - attg <- att[whichg] - group_select <- !is.na(mean(attg)) - return(group_select) - }) - gnotna <- glist[gnotna] - # indicator for not all post-treatment ATT(g,t) missing - not_all_na <- group %in% gnotna - # Re-do the na.rm thing to update the groups - group <- group[not_all_na] - t <- t[not_all_na] - att <- att[not_all_na] - inffunc1 <- inffunc1[, not_all_na] - #tlist <- sort(unique(t)) - glist <- sort(unique(group)) - } - } - - if((na.rm == FALSE) && base::anyNA(att)) stop("Missing values at att_gt found. If you want to remove these, set `na.rm = TRUE'.") - - # data from first period - #ifelse(panel, - # dta <- data[ data[,tname]==tlist[1], ], - # dta <- data - # ) - if(panel){ - # data from first period - dta <- data[ data[,tname]==tlist[1], ] - }else { - #aggregate data - dta <- base::suppressWarnings(stats::aggregate(data, list((data[,idname])), mean)[,-1]) - } - - #----------------------------------------------------------------------------- - # data organization and recoding - #----------------------------------------------------------------------------- - - # do some recoding to make sure time periods are 1 unit apart - # and then put these back together at the end - originalt <- t - originalgroup <- group - originalglist <- glist - originaltlist <- tlist - # In case g's are not part of tlist - originalgtlist <- sort(unique(c(originaltlist,originalglist))) - uniquet <- seq(1,length(unique(originalgtlist))) - # function to switch from "new" t values to original t values - t2orig <- function(t) { - unique(c(originalgtlist,0))[which(c(uniquet,0)==t)] - } - # function to switch between "original" - # t values and new t values - orig2t <- function(orig) { - new_t <- c(uniquet,0)[which(unique(c(originalgtlist,0))==orig)] - out <- ifelse(length(new_t) == 0, NA, new_t) - out - } - t <- sapply(originalt, orig2t) - group <- sapply(originalgroup, orig2t) - glist <- sapply(originalglist, orig2t) - tlist <- unique(t) - maxT <- max(t) - - # Set the weights - weights.ind <- dta$.w - - # we can work in overall probabilities because conditioning will cancel out - # cause it shows up in numerator and denominator - pg <- sapply(originalglist, function(g) mean(weights.ind*(dta[,gname]==g))) - - # length of this is equal to number of groups - pgg <- pg - - # same but length is equal to the number of ATT(g,t) - pg <- pg[match(group, glist)] - - # which group time average treatment effects are post-treatment - keepers <- which(group <= t & t<= (group + max_e)) ### added second condition to allow for limit on longest period included in att - - # n x 1 vector of group variable - G <- unlist(lapply(dta[,gname], orig2t)) - - #----------------------------------------------------------------------------- - # Compute the simple ATT summary - #----------------------------------------------------------------------------- - - if (type == "simple") { - - # simple att - # averages all post-treatment ATT(g,t) with weights - # given by group size - simple.att <- sum(att[keepers]*pg[keepers])/(sum(pg[keepers])) - if(is.nan(simple.att)) simple.att <- NA - - # get the part of the influence function coming from estimated weights - simple.wif <- wif(keepers, pg, weights.ind, G, group) - - # get the overall influence function - simple.if <- get_agg_inf_func(att=att, - inffunc1=inffunc1, - whichones=keepers, - weights.agg=pg[keepers]/sum(pg[keepers]), - wif=simple.wif) - # Make it as vector - simple.if <- as.numeric(simple.if) - - # get standard errors from overall influence function - simple.se <- getSE(simple.if, dp) - if(!is.na(simple.se)){ - if(simple.se <= sqrt(.Machine$double.eps)*10) simple.se <- NA - } - - - return(AGGTEobj(overall.att = simple.att, - overall.se = simple.se, - type = type, - inf.function = list(simple.att = simple.if), - call=call, - DIDparams=dp)) - } - - #----------------------------------------------------------------------------- - # Compute the group (i.e., selective) treatment timing estimators - #----------------------------------------------------------------------------- - - if (type == "group") { - - # get group specific ATTs - # note: there are no estimated weights here - selective.att.g <- sapply(glist, function(g) { - # look at post-treatment periods for group g - whichg <- which( (group == g) & (g <= t) & (t<= (group + max_e))) ### added last condition to allow for limit on longest period included in att - attg <- att[whichg] - mean(attg) - }) - selective.att.g[is.nan(selective.att.g)] <- NA - - - # get standard errors for each group specific ATT - selective.se.inner <- lapply(glist, function(g) { - whichg <- which( (group == g) & (g <= t) & (t<= (group + max_e))) ### added last condition to allow for limit on longest period included in att - inf.func.g <- as.numeric(get_agg_inf_func(att=att, - inffunc1=inffunc1, - whichones=whichg, - weights.agg=pg[whichg]/sum(pg[whichg]), - wif=NULL)) - se.g <- getSE(inf.func.g, dp) - list(inf.func=inf.func.g, se=se.g) - }) - - # recover standard errors separately by group - selective.se.g <- unlist(BMisc::getListElement(selective.se.inner, "se")) - selective.se.g[selective.se.g <= sqrt(.Machine$double.eps)*10] <- NA - - # recover influence function separately by group - selective.inf.func.g <- simplify2array(BMisc::getListElement(selective.se.inner, "inf.func")) - - # use multiplier bootstrap (across groups) to get critical value - # for constructing uniform confidence bands - selective.crit.val <- stats::qnorm(1 - alp/2) - if(dp$cband==TRUE){ - if(dp$bstrap == FALSE){ - warning('Used bootstrap procedure to compute simultaneous confidence band') - } - selective.crit.val <- mboot(selective.inf.func.g, dp)$crit.val - - if(is.na(selective.crit.val) | is.infinite(selective.crit.val)){ - warning('Simultaneous critival value is NA. This probably happened because we cannot compute t-statistic (std errors are NA). We then report pointwise conf. intervals.') - selective.crit.val <- stats::qnorm(1 - alp/2) - dp$cband <- FALSE - } - - if(selective.crit.val < stats::qnorm(1 - alp/2)){ - warning('Simultaneous conf. band is somehow smaller than pointwise one using normal approximation. Since this is unusual, we are reporting pointwise confidence intervals') - selective.crit.val <- stats::qnorm(1 - alp/2) - dp$cband <- FALSE - } - - if(selective.crit.val >= 7){ - warning("Simultaneous critical value is arguably `too large' to be realible. This usually happens when number of observations per group is small and/or there is no much variation in outcomes.") - } - - } - - # get overall att under selective treatment timing - # (here use pgg instead of pg because we can just look at each group) - selective.att <- sum(selective.att.g * pgg)/sum(pgg) - - # account for having to estimate pgg in the influence function - selective.wif <- wif(keepers=1:length(glist), - pg=pgg, - weights.ind=weights.ind, - G=G, - group=group) - - # get overall influence function - selective.inf.func <- get_agg_inf_func(att=selective.att.g, - inffunc1=selective.inf.func.g, - whichones=(1:length(glist)), - weights.agg=pgg/sum(pgg), - wif=selective.wif) - - - selective.inf.func <- as.numeric(selective.inf.func) - # get overall standard error - selective.se <- getSE(selective.inf.func, dp) - if(!is.na(selective.se)){ - if((selective.se <= sqrt(.Machine$double.eps)*10)) selective.se <- NA - } - - return(AGGTEobj(overall.att=selective.att, - overall.se=selective.se, - type=type, - egt=originalglist, - att.egt=selective.att.g, - se.egt=selective.se.g, - crit.val.egt=selective.crit.val, - inf.function = list(selective.inf.func.g = selective.inf.func.g, - selective.inf.func = selective.inf.func), - call=call, - DIDparams=dp)) - - } - - - - #----------------------------------------------------------------------------- - # Compute the event-study estimators - #----------------------------------------------------------------------------- - - if (type == "dynamic") { - - # event times - # this looks at all available event times - # note: event times can be negative here. - # note: event time = 0 corresponds to "on impact" - #eseq <- unique(t-group) - eseq <- unique(originalt - originalgroup) - eseq <- eseq[order(eseq)] - - # if the user specifies balance_e, then we are going to - # drop some event times and some groups; if not, we just - # keep everything (that is what this variable is for) - include.balanced.gt <- rep(TRUE, length(originalgroup)) - - # if we balance the sample with resepect to event time - if (!is.null(balance_e)) { - include.balanced.gt <- (t2orig(maxT) - originalgroup >= balance_e) - - eseq <- unique(originalt[include.balanced.gt] - originalgroup[include.balanced.gt]) - eseq <- eseq[order(eseq)] - - eseq <- eseq[ (eseq <= balance_e) & (eseq >= balance_e - t2orig(maxT) + t2orig(1))] - - } - - # only looks at some event times - eseq <- eseq[ (eseq >= min_e) & (eseq <= max_e) ] - - # compute atts that are specific to each event time - dynamic.att.e <- sapply(eseq, function(e) { - # keep att(g,t) for the right g&t as well as ones that - # are not trimmed out from balancing the sample - whiche <- which( (originalt - originalgroup == e) & (include.balanced.gt) ) - atte <- att[whiche] - pge <- pg[whiche]/(sum(pg[whiche])) - sum(atte*pge) - }) - - # compute standard errors for dynamic effects - dynamic.se.inner <- lapply(eseq, function(e) { - whiche <- which( (originalt - originalgroup == e) & (include.balanced.gt) ) - pge <- pg[whiche]/(sum(pg[whiche])) - wif.e <- wif(whiche, pg, weights.ind, G, group) - inf.func.e <- as.numeric(get_agg_inf_func(att=att, - inffunc1=inffunc1, - whichones=whiche, - weights.agg=pge, - wif=wif.e)) - se.e <- getSE(inf.func.e, dp) - list(inf.func=inf.func.e, se=se.e) - }) - - dynamic.se.e <- unlist(BMisc::getListElement(dynamic.se.inner, "se")) - dynamic.se.e[dynamic.se.e <= sqrt(.Machine$double.eps)*10] <- NA - - dynamic.inf.func.e <- simplify2array(BMisc::getListElement(dynamic.se.inner, "inf.func")) - - dynamic.crit.val <- stats::qnorm(1 - alp/2) - if(dp$cband==TRUE){ - if(dp$bstrap == FALSE){ - warning('Used bootstrap procedure to compute simultaneous confidence band') - } - dynamic.crit.val <- mboot(dynamic.inf.func.e, dp)$crit.val - - if(is.na(dynamic.crit.val) | is.infinite(dynamic.crit.val)){ - warning('Simultaneous critival value is NA. This probably happened because we cannot compute t-statistic (std errors are NA). We then report pointwise conf. intervals.') - dynamic.crit.val <- stats::qnorm(1 - alp/2) - dp$cband <- FALSE - } - - if(dynamic.crit.val < stats::qnorm(1 - alp/2)){ - warning('Simultaneous conf. band is somehow smaller than pointwise one using normal approximation. Since this is unusual, we are reporting pointwise confidence intervals') - dynamic.crit.val <- stats::qnorm(1 - alp/2) - dp$cband <- FALSE - } - - if(dynamic.crit.val >= 7){ - warning("Simultaneous critical value is arguably `too large' to be realible. This usually happens when number of observations per group is small and/or there is no much variation in outcomes.") - } - } - - # get overall average treatment effect - # by averaging over positive dynamics - epos <- eseq >= 0 - dynamic.att <- mean(dynamic.att.e[epos]) - dynamic.inf.func <- get_agg_inf_func(att=dynamic.att.e[epos], - inffunc1=as.matrix(dynamic.inf.func.e[,epos]), - whichones=(1:sum(epos)), - weights.agg=(rep(1/sum(epos), sum(epos))), - wif=NULL) - - dynamic.inf.func <- as.numeric(dynamic.inf.func) - dynamic.se <- getSE(dynamic.inf.func, dp) - if(!is.na(dynamic.se)){ - if (dynamic.se <= sqrt(.Machine$double.eps)*10) dynamic.se <- NA - } - - return(AGGTEobj(overall.att=dynamic.att, - overall.se=dynamic.se, - type=type, - egt=eseq, - att.egt=dynamic.att.e, - se.egt=dynamic.se.e, - crit.val.egt=dynamic.crit.val, - inf.function = list(dynamic.inf.func.e = dynamic.inf.func.e, - dynamic.inf.func = dynamic.inf.func), - call=call, - min_e=min_e, - max_e=max_e, - balance_e=balance_e, - DIDparams=dp - )) - } - - #----------------------------------------------------------------------------- - # calendar time effects - #----------------------------------------------------------------------------- - - if (type == "calendar") { - - # drop time periods where no one is treated yet - # (can't get treatment effects in those periods) - minG <- min(group) - calendar.tlist <- tlist[tlist>=minG] - - # calendar time specific atts - calendar.att.t <- sapply(calendar.tlist, function(t1) { - # look at post-treatment periods for group g - whicht <- which( (t == t1) & (group <= t)) - attt <- att[whicht] - pgt <- pg[whicht]/(sum(pg[whicht])) - sum(pgt * attt) - }) - - # get standard errors and influence functions - # for each time specific att - calendar.se.inner <- lapply(calendar.tlist, function(t1) { - whicht <- which( (t == t1) & (group <= t)) - pgt <- pg[whicht]/(sum(pg[whicht])) - wif.t <- wif(keepers=whicht, - pg=pg, - weights.ind=weights.ind, - G=G, - group=group) - inf.func.t <- as.numeric(get_agg_inf_func(att=att, - inffunc1=inffunc1, - whichones=whicht, - weights.agg=pgt, - wif=wif.t)) - se.t <- getSE(inf.func.t, dp) - list(inf.func=inf.func.t, se=se.t) - }) - - # recover standard errors separately by time - calendar.se.t <- unlist(BMisc::getListElement(calendar.se.inner, "se")) - calendar.se.t[calendar.se.t <= sqrt(.Machine$double.eps)*10] <- NA - # recover influence function separately by time - calendar.inf.func.t <- simplify2array(BMisc::getListElement(calendar.se.inner, "inf.func")) - - # use multiplier boostrap (across groups) to get critical value - # for constructing uniform confidence bands - calendar.crit.val <- stats::qnorm(1-alp/2) - if(dp$cband==TRUE){ - if(dp$bstrap == FALSE){ - warning('Used bootstrap procedure to compute simultaneous confidence band') - } - calendar.crit.val <- mboot(calendar.inf.func.t, dp)$crit.val - - if(is.na(calendar.crit.val) | is.infinite(calendar.crit.val)){ - warning('Simultaneous critival value is NA. This probably happened because we cannot compute t-statistic (std errors are NA). We then report pointwise conf. intervals.') - calendar.crit.val <- stats::qnorm(1 - alp/2) - dp$cband <- FALSE - } - - if(calendar.crit.val < stats::qnorm(1 - alp/2)){ - warning('Simultaneous conf. band is somehow smaller than pointwise one using normal approximation. Since this is unusual, we are reporting pointwise confidence intervals') - calendar.crit.val <- stats::qnorm(1 - alp/2) - dp$cband <- FALSE - } - - if(calendar.crit.val >= 7){ - warning("Simultaneous critical value is arguably `too large' to be realible. This usually happens when number of observations per group is small and/or there is no much variation in outcomes.") - } - } - - # get overall att under calendar time effects - # this is just average over all time periods - calendar.att <- mean(calendar.att.t) - - # get overall influence function - calendar.inf.func <- get_agg_inf_func(att=calendar.att.t, - inffunc1=calendar.inf.func.t, - whichones=(1:length(calendar.tlist)), - weights.agg=rep(1/length(calendar.tlist), length(calendar.tlist)), - wif=NULL) - calendar.inf.func <- as.numeric(calendar.inf.func) - # get overall standard error - calendar.se <- getSE(calendar.inf.func, dp) - if(!is.na(calendar.se)){ - if (calendar.se <= sqrt(.Machine$double.eps)*10) calendar.se <- NA - } - return(AGGTEobj(overall.att=calendar.att, - overall.se=calendar.se, - type=type, - egt=sapply(calendar.tlist,t2orig), - att.egt=calendar.att.t, - se.egt=calendar.se.t, - crit.val.egt=calendar.crit.val, - inf.function = list(calendar.inf.func.t = calendar.inf.func.t, - calendar.inf.func = calendar.inf.func), - call=call, - DIDparams=dp - )) - - } - - -} - -#----------------------------------------------------------------------------- -# Internal functions for getting standard errors -#----------------------------------------------------------------------------- - -#' @title Compute extra term in influence function due to estimating weights -#' -#' @description A function to compute the extra term that shows up in the -#' influence function for aggregated treatment effect parameters -#' due to estimating the weights -#' -#' @param keepers a vector of indices for which group-time average -#' treatment effects are used to compute a particular aggregated parameter -#' @param pg a vector with same length as total number of group-time average -#' treatment effects that contains the probability of being in particular group -#' @param weights.ind additional sampling weights (nx1) -#' @param G vector containing which group a unit belongs to (nx1) -#' @param group vector of groups -#' -#' @return nxk influence function matrix -#' -#' @keywords internal -wif <- function(keepers, pg, weights.ind, G, group) { - # note: weights are all of the form P(G=g|cond)/sum_cond(P(G=g|cond)) - # this is equal to P(G=g)/sum_cond(P(G=g)) which simplifies things here - - # effect of estimating weights in the numerator - if1 <- sapply(keepers, function(k) { - (weights.ind * 1*BMisc::TorF(G==group[k]) - pg[k]) / - sum(pg[keepers]) - }) - # effect of estimating weights in the denominator - if2 <- base::rowSums( sapply( keepers, function(k) { - weights.ind*1*BMisc::TorF(G==group[k]) - pg[k] - })) %*% - t(pg[keepers]/(sum(pg[keepers])^2)) - - # return the influence function for the weights - if1 - if2 -} - - -#' @title Get an influence function for particular aggregate parameters -#' -#' @title This is a generic internal function for combining influence -#' functions across ATT(g,t)'s to return an influence function for -#' various aggregated treatment effect parameters. -#' -#' @param att vector of group-time average treatment effects -#' @param inffunc1 influence function for all group-time average treatment effects -#' (matrix) -#' @param whichones which elements of att will be used to compute the aggregated -#' treatment effect parameter -#' @param weights.agg the weights to apply to each element of att(whichones); -#' should have the same dimension as att(whichones) -#' @param wif extra influence function term coming from estimating the weights; -#' should be n x k matrix where k is dimension of whichones -#' -#' @return nx1 influence function -#' -#' @keywords internal -get_agg_inf_func <- function(att, inffunc1, whichones, weights.agg, wif=NULL) { - # enforce weights are in matrix form - weights.agg <- as.matrix(weights.agg) - - # multiplies influence function times weights and sums to get vector of weighted IF (of length n) - thisinffunc <- inffunc1[,whichones]%*%weights.agg - - # Incorporate influence function of the weights - if (!is.null(wif)) { - thisinffunc <- thisinffunc + wif%*%as.matrix(att[whichones]) - } - - # return influence function - return(thisinffunc) -} - - -#' @title Take influence function and return standard errors -#' -#' @description Function to take an nx1 influence function and return -#' a standard error -#' -#' @param thisinffunc An influence function -#' @inheritParams compute.aggte -#' -#' @return scalar standard error -#' -#' @keywords internal -getSE <- function(thisinffunc, DIDparams=NULL) { - alp <- .05 - bstrap <- FALSE - if (!is.null(DIDparams)) { - bstrap <- DIDparams$bstrap - alp <- DIDparams$alp - cband <- DIDparams$cband - n <- length(thisinffunc) - } - - if (bstrap) { - bout <- mboot(thisinffunc, DIDparams) - return(bout$se) - } else { - return(sqrt( mean((thisinffunc)^2)/n )) - } -} - +#' @title Compute Aggregated Treatment Effect Parameters +#' +#' @description Does the heavy lifting on computing aggregated group-time +#' average treatment effects +#' +#' @inheritParams att_gt +#' @inheritParams aggte +#' @param call The function call to aggte +#' +#' @return [`AGGTEobj`] object +#' +#' @keywords internal +#' +#' @export +compute.aggte <- function(MP, + type = "group", + balance_e = NULL, + min_e = -Inf, + max_e = Inf, + na.rm = FALSE, + bstrap = NULL, + biters = NULL, + cband = NULL, + alp = NULL, + clustervars = NULL, + call = NULL) { + + #----------------------------------------------------------------------------- + # unpack MP object + #----------------------------------------------------------------------------- + # load parameters + group <- MP$group + t <- MP$t + att <- MP$att + dp <- MP$DIDparams + inffunc1 <- MP$inffunc + n <- MP$n + + + gname <- dp$gname + tname <- dp$tname + idname <- dp$idname + panel <- dp$panel + if(is.null(clustervars)){ + clustervars <- dp$clustervars + } + if(is.null(bstrap)){ + bstrap <- dp$bstrap + } + if(is.null(biters)){ + biters <- dp$biters + } + if(is.null(alp)){ + alp <- dp$alp + } + if(is.null(cband)){ + cband <- dp$cband + } + if(dp$faster_mode){ + dt <- dp$data + dt[get(gname) == Inf, (gname) := 0] # going back to the old way + data <- as.data.frame(dt) + rm(dt) + tlist <- dp$time_periods + glist <- dp$treated_groups + } else { + data <- as.data.frame(dp$data) + tlist <- dp$tlist + glist <- dp$glist + } + + # overwrite MP objects (so we can actually compute bootstrap) + MP$DIDparams$clustervars <- clustervars + MP$DIDparams$bstrap <- bstrap + MP$DIDparams$biters <- biters + MP$DIDparams$alp <- alp + MP$DIDparams$cband <- cband + dp <- MP$DIDparams + + if(!(type %in% c("simple", "dynamic", "group", "calendar"))) { + stop('`type` must be one of c("simple", "dynamic", "group", "calendar")') + } + + if(na.rm){ + notna <- !is.na(att) + group <- group[notna] + t <- t[notna] + att <- att[notna] + inffunc1 <- inffunc1[, notna] + #tlist <- sort(unique(t)) + glist <- sort(unique(group)) + + # If aggte is of the group type, ensure we have non-missing post-treatment ATTs for each group + if(type == "group"){ + # Get the groups that have some non-missing ATT(g,t) in post-treatmemt periods + gnotna <- sapply(glist, function(g) { + # look at post-treatment periods for group g + whichg <- which( (group == g) & (g <= t)) + attg <- att[whichg] + group_select <- !is.na(mean(attg)) + return(group_select) + }) + gnotna <- glist[gnotna] + # indicator for not all post-treatment ATT(g,t) missing + not_all_na <- group %in% gnotna + # Re-do the na.rm thing to update the groups + group <- group[not_all_na] + t <- t[not_all_na] + att <- att[not_all_na] + inffunc1 <- inffunc1[, not_all_na] + #tlist <- sort(unique(t)) + glist <- sort(unique(group)) + } + } + + if((na.rm == FALSE) && base::anyNA(att)) stop("Missing values at att_gt found. If you want to remove these, set `na.rm = TRUE'.") + + # data from first period + #ifelse(panel, + # dta <- data[ data[,tname]==tlist[1], ], + # dta <- data + # ) + if(panel){ + # TODO; THIS HAS TO BE OPTIMIZED WITH faster_mode enabled. + # data from first period + dta <- data[data[,tname]==tlist[1], ] + }else { + #aggregate data + dta <- base::suppressWarnings(stats::aggregate(data, list((data[,idname])), mean)[,-1]) + } + + #----------------------------------------------------------------------------- + # data organization and recoding + #----------------------------------------------------------------------------- + + # do some recoding to make sure time periods are 1 unit apart + # and then put these back together at the end + originalt <- t + originalgroup <- group + originalglist <- glist + originaltlist <- tlist + # In case g's are not part of tlist + originalgtlist <- sort(unique(c(originaltlist,originalglist))) + uniquet <- seq(1,length(unique(originalgtlist))) + # function to switch from "new" t values to original t values + t2orig <- function(t) { + unique(c(originalgtlist,0))[which(c(uniquet,0)==t)] + } + # function to switch between "original" + # t values and new t values + orig2t <- function(orig) { + new_t <- c(uniquet,0)[which(unique(c(originalgtlist,0))==orig)] + out <- ifelse(length(new_t) == 0, NA, new_t) + out + } + t <- sapply(originalt, orig2t) + group <- sapply(originalgroup, orig2t) + glist <- sapply(originalglist, orig2t) + tlist <- unique(t) + maxT <- max(t) + + # Set the weights + ifelse(dp$faster_mode, weights.ind <- as.numeric(dp$weights_vector), weights.ind <- dta$.w) + + # we can work in overall probabilities because conditioning will cancel out + # cause it shows up in numerator and denominator + pg <- sapply(originalglist, function(g) mean(weights.ind*(dta[,gname]==g))) + + # length of this is equal to number of groups + pgg <- pg + + # same but length is equal to the number of ATT(g,t) + pg <- pg[match(group, glist)] + + # which group time average treatment effects are post-treatment + keepers <- which(group <= t & t<= (group + max_e)) ### added second condition to allow for limit on longest period included in att + + # n x 1 vector of group variable + G <- unlist(lapply(dta[,gname], orig2t)) + + #----------------------------------------------------------------------------- + # Compute the simple ATT summary + #----------------------------------------------------------------------------- + + if (type == "simple") { + + # simple att + # averages all post-treatment ATT(g,t) with weights + # given by group size + simple.att <- sum(att[keepers]*pg[keepers])/(sum(pg[keepers])) + if(is.nan(simple.att)) simple.att <- NA + + # get the part of the influence function coming from estimated weights + simple.wif <- wif(keepers, pg, weights.ind, G, group) + + # get the overall influence function + simple.if <- get_agg_inf_func(att=att, + inffunc1=inffunc1, + whichones=keepers, + weights.agg=pg[keepers]/sum(pg[keepers]), + wif=simple.wif) + # Make it as vector + simple.if <- as.numeric(simple.if) + + # get standard errors from overall influence function + simple.se <- getSE(simple.if, dp) + if(!is.na(simple.se)){ + if(simple.se <= sqrt(.Machine$double.eps)*10) simple.se <- NA + } + + + return(AGGTEobj(overall.att = simple.att, + overall.se = simple.se, + type = type, + inf.function = list(simple.att = simple.if), + call=call, + DIDparams=dp)) + } + + #----------------------------------------------------------------------------- + # Compute the group (i.e., selective) treatment timing estimators + #----------------------------------------------------------------------------- + + if (type == "group") { + + # get group specific ATTs + # note: there are no estimated weights here + selective.att.g <- sapply(glist, function(g) { + # look at post-treatment periods for group g + whichg <- which( (group == g) & (g <= t) & (t<= (group + max_e))) ### added last condition to allow for limit on longest period included in att + attg <- att[whichg] + mean(attg) + }) + selective.att.g[is.nan(selective.att.g)] <- NA + + + # get standard errors for each group specific ATT + selective.se.inner <- lapply(glist, function(g) { + whichg <- which( (group == g) & (g <= t) & (t<= (group + max_e))) ### added last condition to allow for limit on longest period included in att + inf.func.g <- as.numeric(get_agg_inf_func(att=att, + inffunc1=inffunc1, + whichones=whichg, + weights.agg=pg[whichg]/sum(pg[whichg]), + wif=NULL)) + se.g <- getSE(inf.func.g, dp) + list(inf.func=inf.func.g, se=se.g) + }) + + # recover standard errors separately by group + selective.se.g <- unlist(BMisc::getListElement(selective.se.inner, "se")) + selective.se.g[selective.se.g <= sqrt(.Machine$double.eps)*10] <- NA + + # recover influence function separately by group + selective.inf.func.g <- simplify2array(BMisc::getListElement(selective.se.inner, "inf.func")) + + # use multiplier bootstrap (across groups) to get critical value + # for constructing uniform confidence bands + selective.crit.val <- stats::qnorm(1 - alp/2) + if(dp$cband==TRUE){ + if(dp$bstrap == FALSE){ + warning('Used bootstrap procedure to compute simultaneous confidence band') + } + selective.crit.val <- mboot(selective.inf.func.g, dp)$crit.val + + if(is.na(selective.crit.val) | is.infinite(selective.crit.val)){ + warning('Simultaneous critival value is NA. This probably happened because we cannot compute t-statistic (std errors are NA). We then report pointwise conf. intervals.') + selective.crit.val <- stats::qnorm(1 - alp/2) + dp$cband <- FALSE + } + + if(selective.crit.val < stats::qnorm(1 - alp/2)){ + warning('Simultaneous conf. band is somehow smaller than pointwise one using normal approximation. Since this is unusual, we are reporting pointwise confidence intervals') + selective.crit.val <- stats::qnorm(1 - alp/2) + dp$cband <- FALSE + } + + if(selective.crit.val >= 7){ + warning("Simultaneous critical value is arguably `too large' to be realible. This usually happens when number of observations per group is small and/or there is no much variation in outcomes.") + } + + } + + # get overall att under selective treatment timing + # (here use pgg instead of pg because we can just look at each group) + selective.att <- sum(selective.att.g * pgg)/sum(pgg) + + # account for having to estimate pgg in the influence function + selective.wif <- wif(keepers=1:length(glist), + pg=pgg, + weights.ind=weights.ind, + G=G, + group=group) + + # get overall influence function + selective.inf.func <- get_agg_inf_func(att=selective.att.g, + inffunc1=selective.inf.func.g, + whichones=(1:length(glist)), + weights.agg=pgg/sum(pgg), + wif=selective.wif) + + + selective.inf.func <- as.numeric(selective.inf.func) + # get overall standard error + selective.se <- getSE(selective.inf.func, dp) + if(!is.na(selective.se)){ + if((selective.se <= sqrt(.Machine$double.eps)*10)) selective.se <- NA + } + + return(AGGTEobj(overall.att=selective.att, + overall.se=selective.se, + type=type, + egt=originalglist, + att.egt=selective.att.g, + se.egt=selective.se.g, + crit.val.egt=selective.crit.val, + inf.function = list(selective.inf.func.g = selective.inf.func.g, + selective.inf.func = selective.inf.func), + call=call, + DIDparams=dp)) + + } + + + + #----------------------------------------------------------------------------- + # Compute the event-study estimators + #----------------------------------------------------------------------------- + + if (type == "dynamic") { + + # event times + # this looks at all available event times + # note: event times can be negative here. + # note: event time = 0 corresponds to "on impact" + #eseq <- unique(t-group) + eseq <- unique(originalt - originalgroup) + eseq <- eseq[order(eseq)] + + # if the user specifies balance_e, then we are going to + # drop some event times and some groups; if not, we just + # keep everything (that is what this variable is for) + include.balanced.gt <- rep(TRUE, length(originalgroup)) + + # if we balance the sample with resepect to event time + if (!is.null(balance_e)) { + include.balanced.gt <- (t2orig(maxT) - originalgroup >= balance_e) + + eseq <- unique(originalt[include.balanced.gt] - originalgroup[include.balanced.gt]) + eseq <- eseq[order(eseq)] + + eseq <- eseq[ (eseq <= balance_e) & (eseq >= balance_e - t2orig(maxT) + t2orig(1))] + + } + + # only looks at some event times + eseq <- eseq[ (eseq >= min_e) & (eseq <= max_e) ] + + # compute atts that are specific to each event time + dynamic.att.e <- sapply(eseq, function(e) { + # keep att(g,t) for the right g&t as well as ones that + # are not trimmed out from balancing the sample + whiche <- which( (originalt - originalgroup == e) & (include.balanced.gt) ) + atte <- att[whiche] + pge <- pg[whiche]/(sum(pg[whiche])) + sum(atte*pge) + }) + + # compute standard errors for dynamic effects + dynamic.se.inner <- lapply(eseq, function(e) { + whiche <- which( (originalt - originalgroup == e) & (include.balanced.gt) ) + pge <- pg[whiche]/(sum(pg[whiche])) + wif.e <- wif(whiche, pg, weights.ind, G, group) + inf.func.e <- as.numeric(get_agg_inf_func(att=att, + inffunc1=inffunc1, + whichones=whiche, + weights.agg=pge, + wif=wif.e)) + se.e <- getSE(inf.func.e, dp) + list(inf.func=inf.func.e, se=se.e) + }) + + dynamic.se.e <- unlist(BMisc::getListElement(dynamic.se.inner, "se")) + dynamic.se.e[dynamic.se.e <= sqrt(.Machine$double.eps)*10] <- NA + + dynamic.inf.func.e <- simplify2array(BMisc::getListElement(dynamic.se.inner, "inf.func")) + + dynamic.crit.val <- stats::qnorm(1 - alp/2) + if(dp$cband==TRUE){ + if(dp$bstrap == FALSE){ + warning('Used bootstrap procedure to compute simultaneous confidence band') + } + dynamic.crit.val <- mboot(dynamic.inf.func.e, dp)$crit.val + + if(is.na(dynamic.crit.val) | is.infinite(dynamic.crit.val)){ + warning('Simultaneous critival value is NA. This probably happened because we cannot compute t-statistic (std errors are NA). We then report pointwise conf. intervals.') + dynamic.crit.val <- stats::qnorm(1 - alp/2) + dp$cband <- FALSE + } + + if(dynamic.crit.val < stats::qnorm(1 - alp/2)){ + warning('Simultaneous conf. band is somehow smaller than pointwise one using normal approximation. Since this is unusual, we are reporting pointwise confidence intervals') + dynamic.crit.val <- stats::qnorm(1 - alp/2) + dp$cband <- FALSE + } + + if(dynamic.crit.val >= 7){ + warning("Simultaneous critical value is arguably `too large' to be realible. This usually happens when number of observations per group is small and/or there is no much variation in outcomes.") + } + } + + # get overall average treatment effect + # by averaging over positive dynamics + epos <- eseq >= 0 + dynamic.att <- mean(dynamic.att.e[epos]) + dynamic.inf.func <- get_agg_inf_func(att=dynamic.att.e[epos], + inffunc1=as.matrix(dynamic.inf.func.e[,epos]), + whichones=(1:sum(epos)), + weights.agg=(rep(1/sum(epos), sum(epos))), + wif=NULL) + + dynamic.inf.func <- as.numeric(dynamic.inf.func) + dynamic.se <- getSE(dynamic.inf.func, dp) + if(!is.na(dynamic.se)){ + if (dynamic.se <= sqrt(.Machine$double.eps)*10) dynamic.se <- NA + } + + return(AGGTEobj(overall.att=dynamic.att, + overall.se=dynamic.se, + type=type, + egt=eseq, + att.egt=dynamic.att.e, + se.egt=dynamic.se.e, + crit.val.egt=dynamic.crit.val, + inf.function = list(dynamic.inf.func.e = dynamic.inf.func.e, + dynamic.inf.func = dynamic.inf.func), + call=call, + min_e=min_e, + max_e=max_e, + balance_e=balance_e, + DIDparams=dp + )) + } + + #----------------------------------------------------------------------------- + # calendar time effects + #----------------------------------------------------------------------------- + + if (type == "calendar") { + + # drop time periods where no one is treated yet + # (can't get treatment effects in those periods) + minG <- min(group) + calendar.tlist <- tlist[tlist>=minG] + + # calendar time specific atts + calendar.att.t <- sapply(calendar.tlist, function(t1) { + # look at post-treatment periods for group g + whicht <- which( (t == t1) & (group <= t)) + attt <- att[whicht] + pgt <- pg[whicht]/(sum(pg[whicht])) + sum(pgt * attt) + }) + + # get standard errors and influence functions + # for each time specific att + calendar.se.inner <- lapply(calendar.tlist, function(t1) { + whicht <- which( (t == t1) & (group <= t)) + pgt <- pg[whicht]/(sum(pg[whicht])) + wif.t <- wif(keepers=whicht, + pg=pg, + weights.ind=weights.ind, + G=G, + group=group) + inf.func.t <- as.numeric(get_agg_inf_func(att=att, + inffunc1=inffunc1, + whichones=whicht, + weights.agg=pgt, + wif=wif.t)) + se.t <- getSE(inf.func.t, dp) + list(inf.func=inf.func.t, se=se.t) + }) + + # recover standard errors separately by time + calendar.se.t <- unlist(BMisc::getListElement(calendar.se.inner, "se")) + calendar.se.t[calendar.se.t <= sqrt(.Machine$double.eps)*10] <- NA + # recover influence function separately by time + calendar.inf.func.t <- simplify2array(BMisc::getListElement(calendar.se.inner, "inf.func")) + + # use multiplier boostrap (across groups) to get critical value + # for constructing uniform confidence bands + calendar.crit.val <- stats::qnorm(1-alp/2) + if(dp$cband==TRUE){ + if(dp$bstrap == FALSE){ + warning('Used bootstrap procedure to compute simultaneous confidence band') + } + calendar.crit.val <- mboot(calendar.inf.func.t, dp)$crit.val + + if(is.na(calendar.crit.val) | is.infinite(calendar.crit.val)){ + warning('Simultaneous critival value is NA. This probably happened because we cannot compute t-statistic (std errors are NA). We then report pointwise conf. intervals.') + calendar.crit.val <- stats::qnorm(1 - alp/2) + dp$cband <- FALSE + } + + if(calendar.crit.val < stats::qnorm(1 - alp/2)){ + warning('Simultaneous conf. band is somehow smaller than pointwise one using normal approximation. Since this is unusual, we are reporting pointwise confidence intervals') + calendar.crit.val <- stats::qnorm(1 - alp/2) + dp$cband <- FALSE + } + + if(calendar.crit.val >= 7){ + warning("Simultaneous critical value is arguably `too large' to be realible. This usually happens when number of observations per group is small and/or there is no much variation in outcomes.") + } + } + + # get overall att under calendar time effects + # this is just average over all time periods + calendar.att <- mean(calendar.att.t) + + # get overall influence function + calendar.inf.func <- get_agg_inf_func(att=calendar.att.t, + inffunc1=calendar.inf.func.t, + whichones=(1:length(calendar.tlist)), + weights.agg=rep(1/length(calendar.tlist), length(calendar.tlist)), + wif=NULL) + calendar.inf.func <- as.numeric(calendar.inf.func) + # get overall standard error + calendar.se <- getSE(calendar.inf.func, dp) + if(!is.na(calendar.se)){ + if (calendar.se <= sqrt(.Machine$double.eps)*10) calendar.se <- NA + } + return(AGGTEobj(overall.att=calendar.att, + overall.se=calendar.se, + type=type, + egt=sapply(calendar.tlist,t2orig), + att.egt=calendar.att.t, + se.egt=calendar.se.t, + crit.val.egt=calendar.crit.val, + inf.function = list(calendar.inf.func.t = calendar.inf.func.t, + calendar.inf.func = calendar.inf.func), + call=call, + DIDparams=dp + )) + + } + + +} + +#----------------------------------------------------------------------------- +# Internal functions for getting standard errors +#----------------------------------------------------------------------------- + +#' @title Compute extra term in influence function due to estimating weights +#' +#' @description A function to compute the extra term that shows up in the +#' influence function for aggregated treatment effect parameters +#' due to estimating the weights +#' +#' @param keepers a vector of indices for which group-time average +#' treatment effects are used to compute a particular aggregated parameter +#' @param pg a vector with same length as total number of group-time average +#' treatment effects that contains the probability of being in particular group +#' @param weights.ind additional sampling weights (nx1) +#' @param G vector containing which group a unit belongs to (nx1) +#' @param group vector of groups +#' +#' @return nxk influence function matrix +#' +#' @keywords internal +wif <- function(keepers, pg, weights.ind, G, group) { + # note: weights are all of the form P(G=g|cond)/sum_cond(P(G=g|cond)) + # this is equal to P(G=g)/sum_cond(P(G=g)) which simplifies things here + + # effect of estimating weights in the numerator + if1 <- sapply(keepers, function(k) { + (weights.ind * 1*BMisc::TorF(G==group[k]) - pg[k]) / + sum(pg[keepers]) + }) + # effect of estimating weights in the denominator + if2 <- base::rowSums( sapply( keepers, function(k) { + weights.ind*1*BMisc::TorF(G==group[k]) - pg[k] + })) %*% + t(pg[keepers]/(sum(pg[keepers])^2)) + + # return the influence function for the weights + if1 - if2 +} + + +#' @title Get an influence function for particular aggregate parameters +#' +#' @title This is a generic internal function for combining influence +#' functions across ATT(g,t)'s to return an influence function for +#' various aggregated treatment effect parameters. +#' +#' @param att vector of group-time average treatment effects +#' @param inffunc1 influence function for all group-time average treatment effects +#' (matrix) +#' @param whichones which elements of att will be used to compute the aggregated +#' treatment effect parameter +#' @param weights.agg the weights to apply to each element of att(whichones); +#' should have the same dimension as att(whichones) +#' @param wif extra influence function term coming from estimating the weights; +#' should be n x k matrix where k is dimension of whichones +#' +#' @return nx1 influence function +#' +#' @keywords internal +get_agg_inf_func <- function(att, inffunc1, whichones, weights.agg, wif=NULL) { + # enforce weights are in matrix form + weights.agg <- as.matrix(weights.agg) + + # multiplies influence function times weights and sums to get vector of weighted IF (of length n) + thisinffunc <- inffunc1[,whichones]%*%weights.agg + + # Incorporate influence function of the weights + if (!is.null(wif)) { + thisinffunc <- thisinffunc + wif%*%as.matrix(att[whichones]) + } + + # return influence function + return(thisinffunc) +} + + +#' @title Take influence function and return standard errors +#' +#' @description Function to take an nx1 influence function and return +#' a standard error +#' +#' @param thisinffunc An influence function +#' @inheritParams compute.aggte +#' +#' @return scalar standard error +#' +#' @keywords internal +getSE <- function(thisinffunc, DIDparams=NULL) { + alp <- .05 + bstrap <- FALSE + if (!is.null(DIDparams)) { + bstrap <- DIDparams$bstrap + alp <- DIDparams$alp + cband <- DIDparams$cband + n <- length(thisinffunc) + } + + if (bstrap) { + bout <- mboot(thisinffunc, DIDparams) + return(bout$se) + } else { + return(sqrt( mean((thisinffunc)^2)/n )) + } +} + diff --git a/R/compute.att_gt.R b/R/compute.att_gt.R index deee358..ca4986e 100644 --- a/R/compute.att_gt.R +++ b/R/compute.att_gt.R @@ -20,7 +20,7 @@ compute.att_gt <- function(dp) { #----------------------------------------------------------------------------- # unpack DIDparams #----------------------------------------------------------------------------- - data <- as.data.frame(dp$data) + data <- data.table::as.data.table(dp$data) yname <- dp$yname tname <- dp$tname idname <- dp$idname @@ -51,32 +51,30 @@ compute.att_gt <- function(dp) { counter <- 1 # number of time periods - tlist.length <- length(tlist) - tfac <- 0 - - if (base_period != "universal") { - tlist.length <- tlist.length - 1 - tfac <- 1 - } + tlist.length <- ifelse(base_period != "universal", length(tlist) - 1, length(tlist)) + tfac <- ifelse(base_period != "universal", 1, 0) # influence function - inffunc <- Matrix::Matrix(data=0,nrow=n, ncol=nG*(nT-tfac), sparse=TRUE) + inffunc <- Matrix::Matrix(data = 0, nrow = n, ncol = nG*(nT-tfac), sparse = TRUE) + + # list of collect sparse matrix updates + inffunc_updates <- list() + # counter for keeping track of updates + update_counter <- 1 # never treated option nevertreated <- (control_group[1] == "nevertreated") if(nevertreated) { - data$.C <- 1*(data[,gname] == 0) + data[, .C := as.integer(get(gname) == 0)] } - - # rename yname to .y - data$.y <- data[,yname] + data[, .y := get(yname), .SDcols = yname] # loop over groups for (g in 1:nG) { # Set up .G once - data$.G <- 1*(data[,gname] == glist[g]) + data[, .G := as.numeric(get(gname) == glist[[..g]]), .SDcols = gname] # loop over time periods for (t in 1:tlist.length) { @@ -97,9 +95,9 @@ compute.att_gt <- function(dp) { # that is, never treated + units that are eventually treated, # but not treated by the current period (+ anticipation) if(!nevertreated) { - data$.C <- 1 * ((data[,gname] == 0) | - ((data[,gname] > (tlist[max(t,pret)+tfac]+anticipation)) & - (data[,gname] != glist[g]))) + data[, .C := as.integer((get(gname) == 0) | + ((get(gname) > (tlist[max(t, pret) + tfac] + anticipation)) & + (get(gname) != glist[[..g]])))] } @@ -127,8 +125,16 @@ compute.att_gt <- function(dp) { if (base_period == "universal") { if (tlist[pret] == tlist[(t+tfac)]) { attgt.list[[counter]] <- list(att=0, group=glist[g], year=tlist[(t+tfac)], post=0) - inffunc[,counter] <- rep(0,n) - counter <- counter+1 + # inffunc[,counter] <- rep(0,n) + # counter <- counter+1 + inffunc_updates[[update_counter]] <- list( + indices = rep(TRUE, n), # Apply to all units + values = as.matrix(rep(0, n)) # Zero influence function + ) + + # Update the counters + update_counter <- update_counter + 1 + counter <- counter + 1 next } } @@ -148,13 +154,15 @@ compute.att_gt <- function(dp) { post.treat <- 1*(glist[g] <= tlist[t+tfac]) # total number of units (not just included in G or C) - disdat <- data[data[,tname] == tlist[t+tfac] | data[,tname] == tlist[pret],] + # disdat <- data[data[,tname] == tlist[t+tfac] | data[,tname] == tlist[pret],] + target_times <- c(tlist[t+tfac], tlist[pret]) + disdat <- data[get(tname) %in% target_times] if (panel) { # transform disdat it into "cross-sectional" data where one of the columns # contains the change in the outcome over time. - disdat <- panel2cs2(disdat, yname, idname, tname, balance_panel=FALSE) + disdat <- get_wide_data(disdat, yname, idname, tname) # still total number of units (not just included in G or C) n <- nrow(disdat) @@ -163,7 +171,7 @@ compute.att_gt <- function(dp) { disidx <- disdat$.G==1 | disdat$.C==1 # pick up the data that will be used to compute ATT(g,t) - disdat <- disdat[disidx,] + disdat <- disdat[disidx] n1 <- nrow(disdat) # num obs. for computing ATT(g,t) @@ -197,7 +205,8 @@ compute.att_gt <- function(dp) { # checks for pscore based methods if (est_method %in% c("dr", "ipw")) { - preliminary_logit <- glm(G ~ -1 + covariates, family=binomial(link=logit)) + #preliminary_logit <- glm(G ~ -1 + covariates, family=binomial(link=logit)) + preliminary_logit <- parglm::parglm(G ~ -1 + covariates, family = "binomial") preliminary_pscores <- predict(preliminary_logit, type="response") if (max(preliminary_pscores) >= 0.999) { pscore_problems_likely <- TRUE @@ -217,8 +226,16 @@ compute.att_gt <- function(dp) { if (reg_problems_likely | pscore_problems_likely) { attgt.list[[counter]] <- list(att=NA, group=glist[g], year=tlist[(t+tfac)], post=post.treat) - inffunc[,counter] <- NA - counter <- counter+1 + # inffunc[,counter] <- NA + # counter <- counter+1 + inffunc_updates[[update_counter]] <- list( + indices = rep(TRUE, n), # Apply to all units + values = as.matrix(rep(NA, n)) # NA influence function + ) + + # Update the counters + update_counter <- update_counter + 1 + counter <- counter + 1 next } } @@ -270,7 +287,7 @@ compute.att_gt <- function(dp) { # this is the fix for unbalanced panels; 2nd criteria shouldn't do anything # with true repeated cross sections, but should pick up the right time periods # only with unbalanced panel - disidx <- (data$.rowid %in% rightids) & ( (data[,tname] == tlist[t+tfac]) | (data[,tname]==tlist[pret])) + disidx <- (data$.rowid %in% rightids) & ( (data[[tname]] == tlist[t+tfac]) | (data[[tname]]==tlist[pret])) # pick up the data that will be used to compute ATT(g,t) disdat <- data[disidx,] @@ -281,8 +298,8 @@ compute.att_gt <- function(dp) { # give short names for data in this iteration G <- disdat$.G C <- disdat$.C - Y <- disdat[,yname] - post <- 1*(disdat[,tname] == tlist[t+tfac]) + Y <- disdat[[yname]] + post <- 1*(disdat[[tname]] == tlist[t+tfac]) # num obs. for computing ATT(g,t), have to be careful here n1 <- sum(G+C) w <- disdat$.w @@ -309,8 +326,16 @@ compute.att_gt <- function(dp) { if (skip_this_att_gt) { attgt.list[[counter]] <- list(att=NA, group=glist[g], year=tlist[(t+tfac)], post=post.treat) - inffunc[,counter] <- NA - counter <- counter+1 + # inffunc[,counter] <- NA + # counter <- counter+1 + inffunc_updates[[update_counter]] <- list( + indices = rep(TRUE, n), # Apply to all units + values = as.matrix(rep(NA, n)) # NA influence function + ) + + # Update the counters + update_counter <- update_counter + 1 + counter <- counter + 1 next } @@ -374,30 +399,45 @@ compute.att_gt <- function(dp) { att = attgt$ATT, group = glist[g], year = tlist[(t+tfac)], post = post.treat ) - # recover the influence function - # start with vector of 0s because influence function - # for units that are not in G or C will be equal to 0 - inf.func <- rep(0, n) # populate the influence function in the right places if(panel) { - inf.func[disidx] <- attgt$att.inf.func + # inf.func[disidx] <- attgt$att.inf.func + # Collect the indices and corresponding values for the update + inffunc_updates[[update_counter]] <- list( + indices = disidx, + values = attgt$att.inf.func + ) } else { # aggregate inf functions by id (order by id) aggte_inffunc = suppressWarnings(stats::aggregate(attgt$att.inf.func, list(rightids), sum)) disidx <- (unique(data$.rowid) %in% aggte_inffunc[,1]) - inf.func[disidx] <- aggte_inffunc[,2] + #inf.func[disidx] <- aggte_inffunc[,2] + inffunc_updates[[update_counter]] <- list( + indices = disidx, + values = aggte_inffunc[,2] + ) } # save it in influence function matrix # inffunc[g,t,] <- inf.func - inffunc[,counter] <- inf.func + #inffunc[,counter] <- inf.func + update_counter <- update_counter + 1 # update counter counter <- counter+1 } # end looping over t } # end looping over g + # Apply the updates to the influence function matrix + update_inffunc <- rbindlist(lapply(seq_along(inffunc_updates), function(i) { + update <- inffunc_updates[[i]] + data.table(row = which(update$indices), col = i, value = update$values) + })) + # Apply updates to the sparse matrix + inffunc[cbind(update_inffunc$row, update_inffunc$col)] <- update_inffunc$value + + return(list(attgt.list=attgt.list, inffunc=inffunc)) } diff --git a/R/compute.att_gt2.R b/R/compute.att_gt2.R new file mode 100644 index 0000000..d1cc7d9 --- /dev/null +++ b/R/compute.att_gt2.R @@ -0,0 +1,444 @@ +#' @title Get pre treatment period +#' @description A utility function to get the pre treatment period for a given g,t pair. +#' +#' @param group Group. +#' @param time Time period. +#' @param base_period Whether to use a "varying" base period or a +#' "universal" base period. +#' @param anticipation The number of time periods before participating +#' in the treatment where units can anticipate participating in the +#' treatment and therefore it can affect their untreated potential outcomes +#' +#' @return Time period indicating the pre treatment period. +#' @noRd +get_pret <- function(group, time, base_period, anticipation){ + if(base_period == "universal"){ + pret <- group - 1 - anticipation + } else { + pret <- ifelse(time >= group, group - 1 - anticipation, time - 1) + } + + return(pret) +} + +#' @title Get the cohort for the current g,t values +#' @description A utility function to get vector with index of units that will be included in the DiD estimation for the current g,t pair. +#' +#' @param group Group. +#' @param time Time period. +#' @param pret Pre treatment period. +#' @param dp2 DiD parameters v2.0. +#' +#' @return Time period indicating the pre treatment period. +#' @noRd +get_did_cohort_index <- function(group, time, pret, dp2){ + # return a vector of dimension id_size with 1, 0 or NA values + treated_groups <- dp2$treated_groups + time_periods <- dp2$time_periods + # based on control_group option + + min_control_group <- ifelse((dp2$control_group == "notyettreated"), + dp2$cohort_counts$cohort[which(dp2$cohort_counts$cohort > dp2$reverse_mapping[max(time, pret)] + dp2$anticipation)][1], + Inf) + max_control_group <- Inf # always include the never treated units as the maximum. We add a correction in case is needed afterwards. + + + # select the DiD cohort + ifelse(dp2$allow_unbalanced_panel, did_cohort_index <- rep(NA, dp2$time_invariant_data[, .N]), did_cohort_index <- rep(NA, dp2$id_count)) + + if(dp2$panel){ + + # adding some correction in control group to avoid weird behavior when the control group is not yet treated + if(!max_control_group %in% dp2$cohort_counts$cohort){ + max_control_group <- tail(dp2$cohort_count$cohort,1) + } + + # getting the index to get units who will participate in the estimation for the (g,t) cell. + start_control <- dp2$cohort_counts[cohort < min_control_group, sum(cohort_size)]+1 + end_control <- dp2$cohort_counts[cohort <= max_control_group, sum(cohort_size)] + index <- which(dp2$cohort_counts[, cohort] == dp2$reverse_mapping[group]) + start_treat <- ifelse(index == 1, 1, dp2$cohort_counts[1:(index-1), sum(cohort_size)]+1) + end_treat <- dp2$cohort_counts[1:index, sum(cohort_size)] + # set the cohort index; .C = 0 and .G = 1 + did_cohort_index[start_control:end_control] <- 0 + did_cohort_index[start_treat:end_treat] <- 1 + } else { + # getting the index to get units who will participate in the estimation for the (g,t) cell. + # Note: This works because the data is already ordered in a specific way. Changing that order will break this. + + # Pre-extract column names for use within the loop + col_names <- names(dp2$crosstable_counts)[-1] # Exclude 'T' column + min_idx <- match(as.character(min_control_group), col_names) + max_idx <- match(as.character(max_control_group), col_names) + # correction in case there is not never-treated units! Pick the very last time period... + ifelse(is.na(min_idx), min_idx <- match(tail(col_names,1), col_names), min_idx <- min_idx) + ifelse(is.na(max_idx), max_idx <- match(tail(col_names,1), col_names), max_idx <- max_idx) + + # Start the loop + for (i in c(pret, time)) { + + # ------------------------------------------------ + # SET UP: Identify index_time, start_time, and precompute relevant counts + # ------------------------------------------------ + + # Identify the index_time for 'i' based on period_counts and compute the starting point + index_time <- which(dp2$period_counts[, period] == dp2$reverse_mapping[i]) + start_time <- ifelse(index_time == 1, 1, dp2$period_counts[1:(index_time - 1), sum(period_size)] + 1) + + # Extract the relevant row for current time period 'i' only once + relevant_g_row <- dp2$crosstable_counts[i, ] + + # ------------------------------------------------ + # FOR CONTROL UNITS FIRST + # ------------------------------------------------ + + # Calculate the starting and ending index for control units (min_control_group to max_control_group) + start_control <- start_time + sum(relevant_g_row[, col_names[1:min_idx - 1], with = FALSE], na.rm = TRUE) + csize <- sum(relevant_g_row[, col_names[min_idx:max_idx], with = FALSE], na.rm = TRUE) + end_control <- start_control + csize - 1 + + # Impute control units with 0 + did_cohort_index[start_control:end_control] <- 0 + + # ------------------------------------------------ + # FOR TREATED UNITS + # ------------------------------------------------ + + # Calculate correction_index based on the current g and the relevant row for the current time period + index_cohort <- which(dp2$cohort_counts[, cohort] == dp2$reverse_mapping[group]) + correction_index <- ifelse(index_cohort == 1, 0, sum(relevant_g_row[, col_names[1:(index_cohort - 1)], with = FALSE], na.rm = TRUE)) + + # Compute start and end points for treated units + start_treat <- start_time + correction_index + g_size <- relevant_g_row[, get(as.character(dp2$reverse_mapping[group]))] + end_treat <- start_treat + g_size - 1 + + # Impute treated units with 1 + did_cohort_index[start_treat:end_treat] <- 1 + } + + } + + return(did_cohort_index) +} + +#' @title Wrapper to run DRDID package +#' @description A utility function to run the DRDID package for the current g,t pair. +#' +#' @param cohort_data Data table with the cohort data for the current g,t pair +#' @param covariates Matrix of covariates to be used in the estimation +#' @param dp2 DiD parameters v2.0. +#' +#' @return Time period indicating the pre treatment period. +#' @noRd +run_DRDID <- function(cohort_data, covariates, dp2){ + + if(dp2$panel){ + # -------------------------------------- + # Panel Data + # -------------------------------------- + + # still total number of units (not just included in G or C) + n <- cohort_data[, .N] + + # pick up the indices for units that will be used to compute ATT(g,t) + valid_obs <- which(cohort_data[, !is.na(D)]) + cohort_data <- cohort_data[valid_obs] + # TODO; THIS HAS TO BE BETTER WRITTEN + if(dp2$xformla != ~1){ + covariates <- covariates[valid_obs,] + } else { + covariates <- rep(1, length(valid_obs)) + } + covariates <- as.matrix(covariates) + + # num obs. for computing ATT(g,t) + n1 <- cohort_data[, .N] + + #----------------------------------------------------------------------------- + # code for actually computing ATT(g,t) + #----------------------------------------------------------------------------- + + + if (inherits(dp2$est_method, "function")) { + # user-specified function + attgt <- dp2$est_method(y1=cohort_data[, y1], + y0=cohort_data[, y0], + D=cohort_data[, D], + covariates=covariates, + i.weights=cohort_data[, i.weights], + inffunc=TRUE) + } else if (dp2$est_method == "ipw") { + # inverse-probability weights + attgt <- std_ipw_did_panel(y1=cohort_data[, y1], + y0=cohort_data[, y0], + D=cohort_data[, D], + covariates=covariates, + i.weights=cohort_data[, i.weights], + boot=FALSE, inffunc=TRUE) + } else if (dp2$est_method == "reg") { + # regression + attgt <- reg_did_panel(y1=cohort_data[, y1], + y0=cohort_data[, y0], + D=cohort_data[, D], + covariates=covariates, + i.weights=cohort_data[, i.weights], + boot=FALSE, inffunc=TRUE) + } else { + # doubly robust, this is default + attgt <- drdid_panel(y1=cohort_data[, y1], + y0=cohort_data[, y0], + D=cohort_data[, D], + covariates=covariates, + i.weights=cohort_data[, i.weights], + boot=FALSE, inffunc=TRUE) + } + + # adjust influence function to account for only using + # subgroup to estimate att(g,t) + inf_func_vector <- rep(0, n) + inf_func_not_na <- (n/n1)*attgt$att.inf.func + inf_func_vector[valid_obs] <- inf_func_not_na + + } else { + # -------------------------------------- + # Repeated Cross-Section + # -------------------------------------- + # if we are running unbalanced panel data, we get a temporary copy of cohort data to compute influence function + if(dp2$allow_unbalanced_panel){ + cohort_data_init <- copy(cohort_data[, .(D, .rowid)]) + } + # still total number of units (not just included in G or C) + n <- cohort_data[, .N] + + # pick up the indices for units that will be used to compute ATT(g,t) + valid_obs <- which(cohort_data[, !is.na(D)]) + cohort_data <- cohort_data[valid_obs] + + # num obs. for computing ATT(g,t) + n1 <- cohort_data[, .N] + + if(dp2$xformla != ~1){ + covariates <- covariates[valid_obs,] + } else { + covariates <- covariates[valid_obs] + } + + covariates <- as.matrix(covariates) + + #----------------------------------------------------------------------------- + # code for actually computing ATT(g,t) + #----------------------------------------------------------------------------- + + if (inherits(dp2$est_method, "function")) { + # user-specified function + attgt <- dp2$est_method(y=cohort_data[, y], + post=cohort_data[, post], + D=cohort_data[, D], + covariates=covariates, + i.weights=cohort_data[, i.weights], + inffunc=TRUE) + } else if (dp2$est_method == "ipw") { + # inverse-probability weights + attgt <- std_ipw_did_rc(y=cohort_data[, y], + post=cohort_data[, post], + D=cohort_data[, D], + covariates=covariates, + i.weights=cohort_data[, i.weights], + boot=FALSE, inffunc=TRUE) + } else if (dp2$est_method == "reg") { + # regression + attgt <- reg_did_rc(y=cohort_data[, y], + post=cohort_data[, post], + D=cohort_data[, D], + covariates=covariates, + i.weights=cohort_data[, i.weights], + boot=FALSE, inffunc=TRUE) + } else { + # doubly robust, this is default + attgt <- drdid_rc(y=cohort_data[, y], + post=cohort_data[, post], + D=cohort_data[, D], + covariates=covariates, + i.weights=cohort_data[, i.weights], + boot=FALSE, inffunc=TRUE) + } + + # n/n1 adjusts for estimating the + # att_gt only using observations from groups + # G and C + # adjust influence function to account for only using + # subgroup to estimate att(g,t) + if(dp2$allow_unbalanced_panel){ + # since this is technically a panel data but ran as RCS, we need to adjust the influence function + # by aggregating influence value by .rowid (since several obs of one unit could be used to estimate ATT in each 2x2) + cohort_data_init[, inf_func_long := 0] + # Assign values from vec to the valid rows identified by valid_obs + cohort_data_init[valid_obs, inf_func_long := (dp2$id_count/n1)*attgt$att.inf.func] + inf_func_vector <- cohort_data_init[, .(inf_func_agg = sum(inf_func_long, na.rm = TRUE)), by = .rowid][ , inf_func_agg] + + } else { + inf_func_vector <- rep(0, n) + inf_func_not_na <- (n/n1)*attgt$att.inf.func + inf_func_vector[valid_obs] <- inf_func_not_na + } + + } + + return(list(att = attgt$ATT, inf_func = inf_func_vector)) + +} + + +#' @title Run ATT estimation for a given group-time pair +#' +#' @description `run_att_gt_estimation` does the main work for computing +#' multiperiod group-time average treatment effects +#' @param g group of interest (treated group at time t) +#' @param t time period +#' @param dp2 A DIDparams object v2.0 +#' +#' @return a list with the gt cell and the results after performing estimation +#' +#' @keywords internal +run_att_gt_estimation <- function(g, t, dp2){ + + if(dp2$print_details){cat("\n", paste0("Evaluating (g,t) = (",g,",",t,")"))} + pret <- get_pret(g, t, dp2$base_period, dp2$anticipation) + + # if we are in period (g-1) or base period out of bounds, normalize results to be equal to NULL + # and break without computing anything + if(t == pret | !pret %in% seq_along(dp2$time_periods)){ + # if(dp2$print_details){cat("\n Skipping (g,t) as base period is out of bounds or for varying base period.")} + return(NULL) + } + + # get units in treatment and control group + did_cohort_index <- get_did_cohort_index(g, t, pret, dp2) + # In case of not treatment and control group in the cohort, return NULL + valid_did_cohort <- any(did_cohort_index == 1) & any(did_cohort_index == 0) + if(!isTRUE(valid_did_cohort)){ + if(dp2$print_details){cat("\n Skipping (g,t) as no treatment group or control group found")} + return(NULL) + } + + # Get the matrix of covariates + covariates <- dp2$covariates + if(dp2$panel){ + cohort_data <- data.table(did_cohort_index, dp2$outcomes_tensor[[t]], dp2$outcomes_tensor[[pret]], dp2$weights_vector) + names(cohort_data) <- c("D", "y1", "y0", "i.weights") + } else { + dp2$time_invariant_data[, post := fifelse(get(dp2$tname) == dp2$reverse_mapping[t], 1, 0)] + cohort_data <- data.table(did_cohort_index, dp2$time_invariant_data[[dp2$yname]], dp2$time_invariant_data$post, dp2$time_invariant_data$weights, dp2$time_invariant_data$.rowid) + names(cohort_data) <- c("D", "y", "post", "i.weights", ".rowid") + } + + + # run estimation + did_result <- tryCatch(run_DRDID(cohort_data, covariates, dp2), + error = function(e) { + warning("\n Error in computing internal 2x2 DiD for (g,t) = (",g,",",t,"): ", e$message) + return(NULL) + }) + return(did_result) + +} + +#' @title Compute Group-Time Average Treatment Effects +#' +#' @description `compute.att_gt` does the (g,t) cell and send it to estimation, +#' then do all the post process after estimation +#' +#' @param dp2 A DIDparams object v2.0 +#' +#' @return a list with length equal to the number of groups times the +#' number of time periods; each element of the list contains an +#' object that contains group-time average treatment effect as well +#' as which group it is for and which time period it is for. It also exports +#' the influence function which is used externally to compute +#' standard errors. +#' +#' @keywords internal +#' @export +compute.att_gt2 <- function(dp2) { + + n <- dp2$id_count # Total number of units + treated_groups <- dp2$treated_groups + time_periods <- dp2$time_periods + + gt_index <- sort(union(treated_groups, time_periods)) + # standardize the times to indexes + # Create a mapping for time_periods to standardized form + #time_mapping <- setNames(seq_along(time_periods), time_periods) + time_mapping <- setNames(seq_along(gt_index), gt_index) + # Create a reverse mapping from standardized values back to the original + reverse_mapping <- setNames(as.numeric(names(time_mapping)), time_mapping) + dp2$reverse_mapping <- reverse_mapping + # Convert both time_periods and treated_group using the mapping + time_periods <- time_mapping[as.character(time_periods)] + treated_groups <- time_mapping[as.character(treated_groups)] + # Get (g,t) cells to perform computations. This replace the nested for-loop. + gt_cells <- expand.grid(g = treated_groups, t = time_periods, stringsAsFactors = FALSE) + gt_cells <- gt_cells[order(gt_cells$g, gt_cells$t), ] + total_gt_iterations <- nrow(gt_cells) + + + # Running estimation using run_att_gt_estimation() function + # Helper function for processing each (g,t) pair + process_gt <- function(gt_cell, idx) { + g <- gt_cell$g + t <- gt_cell$t + + # Run estimation + gt_result <- run_att_gt_estimation(g, t, dp2) + + # Compute post-treatment indicator + post.treat <- as.integer(g <= t) + + if (is.null(gt_result) || is.null(gt_result$att)) { + # Estimation failed or was skipped + if(dp2$base_period == "universal"){ + inffunc_updates <- rep(0, n) + gt_result <- list(att = 0, group = reverse_mapping[g], year = reverse_mapping[t], post = post.treat, inffunc_updates = inffunc_updates) + return(gt_result) + } else { + return(NULL) + } + + } else { + att <- gt_result$att + inf_func <- gt_result$inf_func + + # Handle NaN ATT + if (is.nan(att)) { + att <- 0 + inf_func <- rep(0, n) + } + + # Save ATT and influence function + inffunc_updates <- inf_func + gt_result <- list(att = att, group = reverse_mapping[g], year = reverse_mapping[t], post = post.treat, inffunc_updates = inffunc_updates) + return(gt_result) + } + } + + # run the estimation for each (g,t) pair with process_gt + gt_results <- lapply(seq_len(total_gt_iterations), function(idx) { + process_gt(gt_cells[idx, ], idx) + }) + + # Filter out NULL results + gt_results <- Filter(Negate(is.null), gt_results) + + # Post processing: Apply the updates to the sparse matrix in one shot + n_rows <- length(gt_results[[1]]$inffunc_updates) + update_inffunc <- data.table(matrix(NA_real_, nrow = n_rows, ncol = length(gt_results))) + for (i in seq_along(gt_results)) { + update_inffunc[[i]] <- gt_results[[i]]$inffunc_updates + } + + # Update the sparse matrix with the values collected + inffunc <- as(Matrix::Matrix(as.matrix(update_inffunc), sparse = TRUE), "CsparseMatrix") + + return(list(attgt.list=gt_results, inffunc=inffunc)) +} diff --git a/R/imports.R b/R/imports.R index bd872c4..108e2f4 100644 --- a/R/imports.R +++ b/R/imports.R @@ -12,8 +12,10 @@ #' @import BMisc #' @import data.table #' @importFrom tidyr gather -#' @importFrom methods is +#' @importFrom methods is as +#' @importFrom dreamerr check_set_arg +#' @importFrom DRDID drdid_panel reg_did_panel std_ipw_did_panel std_ipw_did_rc reg_did_rc drdid_rc NULL utils::globalVariables(c('.','.G','.y', 'asif_never_treated', 'treated_first_period', 'count', 'constant', '.rowid', 'V1', 'control_group', 'cohort', 'cohort_size', 'period', 'period_size', 'y1', 'y0', - 'i.weights', 'y', 'cluster', 'id', '..cols_to_keep', '..g')) + 'i.weights', 'y', 'cluster', 'id', '..cols_to_keep', '..g', 'inf_func_long', 'inf_func_agg')) diff --git a/R/mboot.R b/R/mboot.R index d110e6f..6c433ca 100644 --- a/R/mboot.R +++ b/R/mboot.R @@ -20,21 +20,23 @@ #' @export mboot <- function(inf.func, DIDparams, pl = FALSE, cores = 1) { - # setup needed variables - data <- as.data.frame(DIDparams$data) + # setup needed variables according to faster_mode; This returns different type of objects + # depending on whether we are in faster_mode or not that has to be handled in the code below idname <- DIDparams$idname clustervars <- DIDparams$clustervars biters <- DIDparams$biters tname <- DIDparams$tname - tlist <- unique(data[,tname])[order(unique(data[,tname]))] alp <- DIDparams$alp panel <- DIDparams$panel true_repeated_cross_sections <- DIDparams$true_repeated_cross_sections - - # just get n obsevations (for clustering below...) + unbalanced_panel <- DIDparams$allow_unbalanced_panel + data <- as.data.frame(DIDparams$data) + tlist <- ifelse(DIDparams$faster_mode, DIDparams$time_periods, unique(data[,tname])[order(unique(data[,tname]))]) + # just get n observations (for clustering below...) ifelse(panel, dta <- data[ data[,tname]==tlist[1], ], dta <- data) + # Make sure inf.func is matrix because we need this for computing n below inf.func <- as.matrix(inf.func) @@ -59,22 +61,23 @@ mboot <- function(inf.func, DIDparams, pl = FALSE, cores = 1) { } if (length(clustervars) > 0) { - # check that cluster variable does not vary over time within unit - clust_tv <- aggregate(data[,clustervars], by=list(data[,idname]), function(rr) length(unique(rr))==1) - if (!all(clust_tv[,2])) { - stop("can't handle time-varying cluster variables") + if(!DIDparams$faster_mode){ + # check that cluster variable does not vary over time within unit + clust_tv <- aggregate(data[,clustervars], by=list(data[,idname]), function(rr) length(unique(rr))==1) + if (!all(clust_tv[,2])) { + stop("can't handle time-varying cluster variables") + } + ## # CHECK iF CLUSTERVAR is TIME-VARYING + ## clust_tv = base::suppressWarnings(stats::aggregate(data[,clustervars], list((data[,idname])), sd)) + ## clust_tv$x[is.na(clust_tv$x)] <- 0 + ## if(any(clust_tv[,2]>.Machine$double.eps)){ + ## stop("can't handle time-varying cluster variables") + ## } else if (!panel){ + ## # IF NOT, SUBSET DTA TO ONE VALUE PER ID + ## # Here we do not care about tname and yname as we do not use these + ## dta <- base::suppressWarnings(stats::aggregate(dta, list((data[,idname])), mean)[,-1]) + ## } } - ## # CHECK iF CLUSTERVAR is TIME-VARYING - ## clust_tv = base::suppressWarnings(stats::aggregate(data[,clustervars], list((data[,idname])), sd)) - ## clust_tv$x[is.na(clust_tv$x)] <- 0 - ## if(any(clust_tv[,2]>.Machine$double.eps)){ - ## stop("can't handle time-varying cluster variables") - ## } else if (!panel){ - ## # IF NOT, SUBSET DTA TO ONE VALUE PER ID - ## # Here we do not care about tname and yname as we do not use these - ## dta <- base::suppressWarnings(stats::aggregate(dta, list((data[,idname])), mean)[,-1]) - ## } - } # multiplier bootstrap @@ -82,7 +85,7 @@ mboot <- function(inf.func, DIDparams, pl = FALSE, cores = 1) { if (length(clustervars)==0) { bres <- sqrt(n) * run_multiplier_bootstrap(inf.func, biters, pl, cores) } else { - n_clusters <- length(unique(data[,clustervars])) + n_clusters <- length(unique(dta[,clustervars])) cluster <- unique(dta[,c(idname,clustervars)])[,2] cluster_n <- aggregate(cluster, by=list(cluster), length)[,2] cluster_mean_if <- rowsum(inf.func, cluster,reorder=TRUE) / cluster_n diff --git a/R/pre_process_did.R b/R/pre_process_did.R index 0946e29..6d47aa9 100644 --- a/R/pre_process_did.R +++ b/R/pre_process_did.R @@ -29,6 +29,7 @@ pre_process_did <- function(yname, est_method = "dr", base_period = "varying", print_details = TRUE, + faster_mode = FALSE, pl = FALSE, cores = 1, call = NULL) { @@ -52,9 +53,20 @@ pre_process_did <- function(yname, # make sure gname is numeric if (! (is.numeric(data[, gname])) ) stop("data[, gname] must be numeric") - # put in blank xformla if no covariates + # put in blank xformla if no covariates or check whether all variables are in data if (is.null(xformla)) { xformla <- ~1 + } else { + # extract variable names from the formula + formula_vars <- all.vars(xformla) + + # identify variables in xformla not in data + missing_vars <- setdiff(formula_vars, names(data)) + + # error checking for missing variables in data + if (length(missing_vars) > 0) { + stop(paste("The following variables are not in data:", paste(missing_vars, collapse = ", ")), call. = FALSE) + } } # drop irrelevant columns from data @@ -344,6 +356,7 @@ pre_process_did <- function(yname, clustervars=clustervars, cband=cband, print_details=print_details, + faster_mode=faster_mode, pl=pl, cores=cores, est_method=est_method, diff --git a/R/pre_process_did2.R b/R/pre_process_did2.R new file mode 100644 index 0000000..ec1b9de --- /dev/null +++ b/R/pre_process_did2.R @@ -0,0 +1,533 @@ +#' @title validate_args +#' @description A utility function to validate arguments passed to a att_gt() +#' @param data data.table used in function +#' @param args list of arguments to validate +#' +#' @return nothing, but throws an error if any of the arguments are not valid +#' @noRd +validate_args <- function(args, data){ + + data_names <- names(data) + + # ---------------------- Error Checking ---------------------- + args$control_group <- args$control_group[1] + # Flag for control group types + control_group_message <- "control_group must be either 'nevertreated' or 'notyettreated'" + dreamerr::check_set_arg(args$control_group, "match", .choices = c("nevertreated", "notyettreated"), .message = control_group_message, .up = 1) + + # Flag for tname, gname, yname + name_message <- "__ARG__ must be a character scalar and a name of a column from the dataset." + dreamerr::check_set_arg(args$tname, args$gname, args$yname, "match", .choices = data_names, .message = name_message, .up = 1) + + # Flag for clustervars and weightsname + checkvar_message <- "__ARG__ must be NULL or a character scalar if a name of columns from the dataset." + dreamerr::check_set_arg(args$weightsname, args$clustervars, "NULL | match", .choices = data_names, .message = checkvar_message, .up = 1) + + # check if times periods are numeric + if(!data[, is.numeric(get(args$tname))]){stop("tname = ",args$tname, " is not numeric. Please convert it")} + + # Check if gname is numeric + if(!data[, is.numeric(get(args$gname))]){stop("gname = ",args$gname, " is not numeric. Please convert it")} + + # Flag for idname + if(!is.null(args$idname)){ + # Check if idname is in the data + name_message <- "__ARG__ must be a character scalar and a name of a column from the dataset." + dreamerr::check_set_arg(args$idname, "match", .choices = data_names, .message = name_message, .up = 1) + + # check if idname is numeric + if(!data[, is.numeric(get(args$idname))]){stop("idname = ", args$idname, " is not numeric. Please convert it")} + + # Check if gname is unique by idname: irreversibility of the treatment + check_treatment_uniqueness <- data[, .(constant = all(get(args$gname)[1] == get(args$gname))), by = get(args$idname)][, all(constant, na.rm = TRUE)] + if (!check_treatment_uniqueness) { + stop("The value of gname (treatment variable) must be the same across all periods for each particular unit. The treatment must be irreversible.") + } + + # Check if any combination of idname and tname is duplicated + n_id_year = anyDuplicated(data[, .(args$idname, args$tname)]) + # If any combination is duplicated, stop execution and throw an error + if (n_id_year > 0) { + stop("The value of idname must be unique (by tname). Some units are observed more than once in a period.") + } + } + + # Flag for based period: not in c("universal", "varying"), stop + args$base_period <- args$base_period[1] + base_period_message <- "base_period must be either 'universal' or 'varying'." + dreamerr::check_set_arg(args$base_period, "match", .choices = c("universal", "varying"), .message = base_period_message, .up = 1) + + # Flags for cluster variable + if (!is.null(args$clustervars)) { + # dropping idname from cluster + if ((!is.null(args$idname)) && (args$idname %in% args$clustervars)){ + args$clustervars <- setdiff(args$clustervars, args$idname) + } + + # check if user is providing more than 2 cluster variables (different than idname) + if (length(args$clustervars) > 1) { + stop("You can only provide 1 cluster variable additionally to the one provided in idname. Please check your arguments.") + } + + # Check that cluster variables do not vary over time within each unit + if (length(args$clustervars) > 0) { + # Efficiently check for time-varying cluster variables + clust_tv <- data[, lapply(.SD, function(col) length(unique(col)) == 1), by = get(args$idname), .SDcols = args$clustervars] + # If any cluster variable varies over time within any unit, stop execution + if (!all(unlist(clust_tv[, -1, with = FALSE]))) { + stop("did cannot handle time-varying cluster variables at the moment. Please check your cluster variable.") + } + } + } + + # Check if anticipation is numeric using + if (!is.numeric(args$anticipation)) { + stop("anticipation must be numeric. Please convert it.") + } + + # Check if anticipation is positive + if (args$anticipation < 0) { + stop("anticipation must be positive. Please check your arguments.") + } + +} + +#' @title DiD standarization +#' @description A utility function to coerce the data in standard format +#' @param data data.table used in function +#' @param args list of arguments to validate +#' +#' @return A List, containing order data and arguments +#' @noRd +did_standarization <- function(data, args){ + # keep relevant columns in data + cols_to_keep <- c(args$idname, args$tname, args$gname, args$yname, args$weightsname, args$clustervars) + + model_frame <- model.frame(args$xformla, data = data, na.action = na.pass) + # Subset the dataset to keep only the relevant columns + data <- data[, ..cols_to_keep] + + # Column bind the model frame to the data + data <- cbind(data, as.data.table(model_frame)) + + # Check if any covariates were missing + n_orig <- data[, .N] + data <- data[complete.cases(data)] + n_new <- data[, .N] + n_diff <- n_orig - n_new + if (n_diff != 0) { + warning(paste0("dropped ", n_diff, " rows from original data due to missing data")) + } + + # Set weights + base::ifelse(is.null(args$weightsname), weights <- rep(1, n_new), weights <- data[[args$weightsname]]) + # enforcing normalization of weights. At this point we already drop any missing values in weights. + weights <- weights/mean(weights) + data$weights <- weights + + # get a list of dates from min to max + tlist <- data[, sort(unique(get(args$tname)))] + + # Coerce control group identified with zero as Inf + data[get(args$gname) == 0, (args$gname) := Inf] + + # Working out the dates + # Identify groups with treatment time bigger than the maximum treatment time + # calculate the maximum treatment time + max_treatment_time <- max(tlist, na.rm = TRUE) + data[, asif_never_treated := (get(args$gname) > max_treatment_time)] + # replace NA values with FALSE in the logical vector + data[is.na(asif_never_treated), asif_never_treated := FALSE] + # set gname to 0 for those groups considered as never treated + data[asif_never_treated == TRUE, (args$gname) := Inf] + # remove the temporary column + data[, asif_never_treated := NULL] + + # get list of treated groups (by time) from min to max + glist <- sort(unique(data[[args$gname]])) + + # Check if there is a never treated group in the data + if (!(Inf %in% glist)) { + if (args$control_group == "nevertreated") { + stop("There is no available never-treated group") + } else { + # Drop all time periods with time periods >= latest treated + max_treated_time <- max(glist[is.finite(glist)], na.rm = TRUE) + latest_treated_time <- max_treated_time - args$anticipation + data <- data[get(args$tname) < latest_treated_time] + + tlist <- sort(unique(data[[args$tname]])) + glist <- sort(unique(data[[args$gname]])) + + # don't compute ATT(g,t) for groups that are only treated at end + # and only play a role as a comparison group + glist <- glist[glist < max_treated_time] + } + } + + # get only the first period + first_period <- tlist[1] + + # drop groups treated in the first period or before; keep only treated groups + glist <- glist[glist != Inf & glist > first_period + args$anticipation] + + # Check for groups treated in the first period and drop them + # identify groups treated in the first period + data[, treated_first_period := (get(args$gname) <= first_period)] + data[is.na(treated_first_period), treated_first_period := FALSE] + + # count the number of units treated in the first period + nfirstperiod <- ifelse(args$panel, uniqueN(data[treated_first_period == TRUE, get(args$idname)]), nrow(data[treated_first_period == TRUE])) + + # handle units treated in the first period + if (nfirstperiod > 0) { + warning(paste0("Dropped ", nfirstperiod, " units that were already treated in the first period.")) + data <- data[get(args$gname) %in% c(glist, Inf)] + + # update tlist and glist + tlist <- data[, sort(unique(get(args$tname)))] + glist <- data[, sort(unique(get(args$gname)))] + + # Drop groups treated in the first period or before + first_period <- tlist[1] + glist <- glist[glist != Inf & glist > first_period + args$anticipation] + } + # remove the temporary column + data[, treated_first_period := NULL] + + # If user specifies repeated cross sections, + # set that it really is repeated cross sections + args$true_repeated_cross_sections <- FALSE + if (!args$panel) { + args$true_repeated_cross_sections <- TRUE + } + + #----------------------------------------------------------------------------- + # setup data in panel case + #----------------------------------------------------------------------------- + # Check if data is a balanced panel if panel = TRUE and allow_unbalanced_panel = TRUE + bal_panel_test <- (args$panel)*(args$allow_unbalanced_panel) + + if (bal_panel_test) { + # First, focus on complete cases and make a balanced dataset + data_comp <- data[complete.cases(data)] + + # Check if the panel is balanced + is_balanced <- check_balance(data_comp, id_col = args$idname, time_col = args$tname) + + args$allow_unbalanced_panel <- !is_balanced + message(if (!is_balanced) + "You have an unbalanced panel. Proceeding as such." + else + "You have a balanced panel. Setting allow_unbalanced_panel = FALSE.") + } + + if (args$panel) { + # Check for unbalanced panel + if (args$allow_unbalanced_panel) { + # Flag for true repeated cross sections + args$panel <- FALSE + args$true_repeated_cross_sections <- FALSE + } else { + # Coerce balanced panel + + # Focus on complete cases + keepers <- complete.cases(data) + n <- uniqueN(data[[args$idname]]) + n_keep <- uniqueN(data[keepers, ][[args$idname]]) + if (n_keep < n) { + warning(paste0("Dropped ", (n - n_keep), " observations that had missing data.")) + data <- data[keepers, ] + } + + + # Make balanced panel + # n_old <- uniqueN(data[[args$idname]]) + # data <- BMisc::makeBalancedPanel(data, args$idname, args$tname, return_data.table = TRUE) # coerce to data.table again. This is ugly, find a better way to do it. + # n <- uniqueN(data[[args$idname]]) + # + # if (n < n_old) { + # warning(paste0("Dropped ", n_old - n, " observations while converting to balanced panel.")) + # } + raw_time_size <- uniqueN(data[[args$tname]]) + unit_count <- data[, .(count = .N), by = get(args$idname)] + if(any(unit_count[, count < raw_time_size])){ + mis_unit <- unit_count[count < raw_time_size] + warning(nrow(mis_unit), " units are missing in some periods. Converting to balanced panel by dropping them.") + data <- data[!get(args$idname) %in% mis_unit$get] + } + + # If all data is dropped, stop execution + if (nrow(data) == 0) { + stop("All observations dropped while converting data to balanced panel. Consider setting `panel = FALSE` and/or revisiting 'idname'.") + } + + n <- data[get(args$tname) == tlist[1], .N] + + # Check if the treatment is irreversible + # Ensure The value of gname must be the same across all periods for each particular individual. + check_treatment_uniqueness <- data[, .(constant = uniqueN(get(args$gname)) == 1), by = get(args$idname)][, all(constant)] + if (!check_treatment_uniqueness) { + stop("The value of gname (treatment variable) must be the same across all periods for each particular unit. The treatment must be irreversible.") + } + } + } + + #----------------------------------------------------------------------------- + # setup data in repeated cross section + #----------------------------------------------------------------------------- + + if (!args$panel) { + # Focus on complete cases + data <- data[complete.cases(data)] + + if (nrow(data) == 0) { + stop("All observations dropped due to missing data problems.") + } + + # n-row data.frame to hold the influence function + if (args$true_repeated_cross_sections) { + data[, .rowid := .I] # Create row index + args$idname <- ".rowid" + } else { + # Set rowid to idname for repeated cross section/unbalanced + data[, .rowid := get(args$idname)] + } + + # Count unique number of cross section observations + n <- uniqueN(data[[args$idname]]) + } + + # Check if groups is empty (usually a problem with the way people defined groups) + if(length(glist)==0){ + stop("No valid groups. The variable in 'gname' should be expressed as the time a unit is first treated (0 if never-treated).") + } + + # if there are only two time periods, then uniform confidence + # bands are the same as pointwise confidence intervals + if (length(tlist)==2) { + args$cband <- FALSE + } + + #----------------------------------------------------------------------------- + # more error handling after we have balanced the panel + + # Check against very small groups + # Calculate group sizes, dividing the count of each group by the length of tlist + gsize <- data[, .N / length(tlist), by = get(args$gname)] + + # How many in each group before giving a warning + reqsize <- length(BMisc::rhs.vars(args$xformla)) + 5 + + # Filter groups smaller than reqsize + gsize <- gsize[V1 < reqsize] + + # Warn if some groups are small + if (nrow(gsize) > 0) { + gpaste <- paste(gsize[,1], collapse = ",") + warning(paste0("Be aware that there are some small groups in your dataset.\n Check groups: ", gpaste, ".")) + + # Check if the never treated group is too small + if ((Inf %in% gsize[,1]) & (args$control_group == "nevertreated")) { + stop("never treated group is too small, try setting control_group=\"notyettreated\"") + } + } + + #----------------------------------------------------------------------------- + # Sort the data for easy access later on + if (args$panel){ + setorderv(data, c(args$tname, args$gname, args$idname), c(1,1,1)) + } else { + setorderv(data, c(args$tname, args$gname), c(1,1)) + } + + # Assign new args regarding number of time periods and groups + # How many time periods + args$time_periods_count <- length(tlist) + # time periods + args$time_periods <- tlist + # How many treated groups + args$treated_groups_count <- length(glist) + # treated groups + args$treated_groups <- glist + # id size + args$id_count <- n + + return(list(data = data, args = args)) +} + +#' @title DiD tensors +#' @description A utility function that split the data in a list of outcomes tensors and a list of arguments. +#' Tensor are objects of dimension id_count x 1 x time_periods_count and are used for faster filtering in the computation of the DiD estimator. +#' @param data data.table used in function +#' @param args list of arguments to validate +#' +#' @return A List, containing outcomes tensor, time invariant data, cohort counts, covariates matrix, cluster vector and weights vector. +#' @noRd +get_did_tensors <- function(data, args){ + + # Getting the outcomes tensor: a vector a outcome variables per time period of dimension id_count x 1 x time_periods_count + outcomes_tensor <- list() + #if(!args$allow_unbalanced_panel){ + if(args$panel){ + for(time in seq_along(args$time_periods)){ + # iterating over index + # creating buckets of outcomes of size from 1 to id_size, id_size+1 to 2*id_size, etc. + start <- (time - 1) * args$id_count + 1 + end <- time * args$id_count + outcomes_tensor[[time]] <- data[seq(start,end), get(args$yname)] + } + } else { + # for(time in args$time_periods){ + # outcome_vector_time <- rep(NA, args$id_count) # Initialize vector with NAs + # # Create a new column with Y values if tname == current_time, otherwise NA + # data[, outcome_vector_time := fifelse(get(args$tname) == time, get(args$yname), NA_real_)] + # # Store the vector in the outcomes_tensor + # outcomes_tensor[[time]] <- data$outcome_vector_time + # # drop the column created + # data[, outcome_vector_time := NULL] + # } + outcomes_tensor <- NULL + } + + # Getting the time invariant data + time_invariant_cols <- c(args$idname, args$gname, "weights", args$clustervars) + #if(!args$allow_unbalanced_panel){ + if(args$panel){ + # We can do this filtering because the data is already sorted appropriately + invariant_data <- data[1:args$id_count] + } else { + if (!args$allow_unbalanced_panel) { + # get the first observation for each unit + invariant_data <- data[data[, .I[1], by = get(args$idname)]$V1] + # # order by idname + # setorderv(invariant_data, c(args$idname), c(1)) + } else { + invariant_data <- copy(data) + } + } + + # Get cohort counts + cohorts <- c(args$treated_groups, Inf) + cohort_counts <- invariant_data[, .(cohort_size = .N) , by = get(args$gname)] + names(cohort_counts)[1] <- "cohort" # changing the name + + # Get period counts + period_counts <- invariant_data[, .(period_size = .N), by = get(args$tname)] + names(period_counts)[1] <- "period" # changing the name + + # Get crosstable counts + crosstable_counts <- invariant_data[, .N, by = .(get(args$tname), get(args$gname))] # Directly count occurrences by tname and gname + names(crosstable_counts)[1] <- "period" # changing the name + names(crosstable_counts)[2] <- "cohort" # changing the name + crosstable_counts <- dcast(crosstable_counts, period ~ cohort, value.var = "N", fill = 0) # Reshape + + + # Get covariates if any + if(args$xformla == ~1){ + covariates <- rep(1, args$id_count) + } else { + covariates <- as.data.table(model.matrix(args$xformla, data = invariant_data, na.action = na.pass)) + } + + # Get the cluster variable only + if(!is.null(args$clustervars)){ + cluster <- invariant_data[, .SD, .SDcols = args$clustervars] |> unlist() + } else { + cluster <- NA + } + + # Get the weights only + weights <- invariant_data[, .SD, .SDcols = "weights"] |> unlist() + + # Gather all the arguments to return + return(list(outcomes_tensor = outcomes_tensor, + data = data, + time_invariant_data = invariant_data, + cohort_counts = cohort_counts, + period_counts = period_counts, + crosstable_counts = crosstable_counts, + covariates = covariates, + cluster = cluster, + weights = weights)) +} + +#' @title Process `did` Function Arguments +#' +#' @description Function to process arguments passed to the main methods in the +#' `did` package as well as conducting some tests to make sure +#' data is in proper format / try to throw helpful error messages. +#' +#' @inheritParams att_gt +#' @param call Function call to att_gt +#' +#' @return a [`DIDparams`] object +#' +#' @export +pre_process_did2 <- function(yname, + tname, + idname, + gname, + xformla = NULL, + data, + panel = TRUE, + allow_unbalanced_panel, + control_group = c("nevertreated","notyettreated"), + anticipation = 0, + weightsname = NULL, + alp = 0.05, + bstrap = FALSE, + cband = FALSE, + biters = 1000, + clustervars = NULL, + est_method = "dr", + base_period = "varying", + print_details = TRUE, + faster_mode=FALSE, + pl = FALSE, + cores = 1, + call = NULL) { + + + # coerce data to data.table first + if (!is.data.table(data)) { + data <- as.data.table(data) + } + + # gathering all the arguments except data + args_names <- setdiff(names(formals()), "data") + args <- mget(args_names, sys.frame(sys.nframe())) + + # pick a control_group by default + args$control_group <- control_group[1] + + # run error checking on arguments + validate_args(args, data) + + # put in blank xformla if no covariates + if (is.null(args$xformla)) { + args$xformla <- ~1 + } else { + # extract variable names from the formula + formula_vars <- all.vars(args$xformla) + + # identify variables in xformla not in data + missing_vars <- setdiff(formula_vars, names(data)) + + # error checking for missing variables in data + if (length(missing_vars) > 0) { + stop(paste("The following variables are not in data:", paste(missing_vars, collapse = ", ")), call. = FALSE) + } + } + + # Put the data in a standard format after some validation + cleaned_did <- did_standarization(data, args) + + # Partition staggered DiD into a 2x2 DiD for faster implementation + did_tensors <- get_did_tensors(cleaned_did$data, cleaned_did$args) + + # store parameters for passing around later + dp <- DIDparams2(did_tensors, cleaned_did$args, call=call) + + return(dp) +} diff --git a/R/utility_functions.R b/R/utility_functions.R index 2b17235..39af866 100644 --- a/R/utility_functions.R +++ b/R/utility_functions.R @@ -49,8 +49,7 @@ trimmer <- function(g, tname, idname, gname, xformla, data, control_group="notye #' #' @return data from first period with .y0 (outcome in first period), .y1 (outcome in second period), #' and .dy (change in outcomes over time) appended to it -#' -#' @export +#' @noRd get_wide_data <- function(data, yname, idname, tname) { # check if data is data.table if (!is.data.table(data)) { @@ -74,3 +73,25 @@ get_wide_data <- function(data, yname, idname, tname) { return(data) } +#' @title Check balanced panel data +#' @description A utility function to check if your dataset is a balanced panel dataset. +#' +#' @param data data.table used in function +#' @param id_col name of id column in the dataset +#' @param time_col name of time column in the dataset +#' +#' @return Boolean indicating if the dataset is balanced or not. +#' @noRd +check_balance <- function(data, id_col, time_col) { + + # Count the number of observations per unit (idname) + panel_counts <- data[, .N, by = get(id_col)] + + # Determine the maximum number of time periods for any unit + max_time_periods <- data[, uniqueN(get(time_col))] + + # Check if every unit has the same number of time periods as max_time_periods + is_balanced <- all(panel_counts$N == max_time_periods) + + return(is_balanced) +} diff --git a/man/DIDparams.Rd b/man/DIDparams.Rd index 9303d07..083a4f2 100644 --- a/man/DIDparams.Rd +++ b/man/DIDparams.Rd @@ -20,6 +20,7 @@ DIDparams( clustervars = NULL, cband = TRUE, print_details = TRUE, + faster_mode = FALSE, pl = FALSE, cores = 1, est_method = "dr", @@ -100,6 +101,11 @@ bands, \code{bstrap} must also be set to \code{TRUE}. The default is \item{print_details}{Whether or not to show details/progress of computations. Default is \code{FALSE}.} +\item{faster_mode}{This option enables a faster version of \code{did}, optimizing +computation time for large datasets by improving data management within the package. +The default is set to \code{FALSE}. While the difference is minimal for small datasets, +it is recommended for use with large datasets.} + \item{pl}{Whether or not to use parallel processing} \item{cores}{The number of cores to use for parallel processing} diff --git a/man/aggte.Rd b/man/aggte.Rd index e845efc..ed9930b 100644 --- a/man/aggte.Rd +++ b/man/aggte.Rd @@ -1,210 +1,210 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/aggte.R -\name{aggte} -\alias{aggte} -\title{Aggregate Group-Time Average Treatment Effects} -\usage{ -aggte( - MP, - type = "group", - balance_e = NULL, - min_e = -Inf, - max_e = Inf, - na.rm = FALSE, - bstrap = NULL, - biters = NULL, - cband = NULL, - alp = NULL, - clustervars = NULL -) -} -\arguments{ -\item{MP}{an MP object (i.e., the results of the \code{\link[=att_gt]{att_gt()}} method)} - -\item{type}{Which type of aggregated treatment effect parameter to compute. -One option is "simple" (this just computes a weighted average of all -group-time average treatment effects with weights proportional to group -size). Other options are "dynamic" (this computes average effects across -different lengths of exposure to the treatment and is similar to an -"event study"; here the overall effect averages the effect of the -treatment across all positive lengths of exposure); "group" (this -is the default option and -computes average treatment effects across different groups; here -the overall effect averages the effect across different groups); and -"calendar" (this computes average treatment effects across different -time periods; here the overall effect averages the effect across each -time period).} - -\item{balance_e}{If set (and if one computes dynamic effects), it balances -the sample with respect to event time. For example, if \code{balance.e=2}, -\code{aggte} will drop groups that are not exposed to treatment for -at least three periods. (the initial period when \code{e=0} as well as the -next two periods when \code{e=1} and the \code{e=2}). This ensures that -the composition of groups does not change when event time changes.} - -\item{min_e}{For event studies, this is the smallest event time to compute -dynamic effects for. By default, \code{min_e = -Inf} so that effects at -all lengths of exposure are computed.} - -\item{max_e}{For event studies, this is the largest event time to compute -dynamic effects for. By default, \code{max_e = Inf} so that effects at -all lengths of exposure are computed.} - -\item{na.rm}{Logical value if we are to remove missing Values from analyses. Defaults is FALSE.} - -\item{bstrap}{Boolean for whether or not to compute standard errors using -the multiplier bootstrap. If standard errors are clustered, then one -must set \code{bstrap=TRUE}. Default is value set in the MP object. If bstrap is \code{FALSE}, then analytical -standard errors are reported.} - -\item{biters}{The number of bootstrap iterations to use. The default is the value set in the MP object, -and this is only applicable if \code{bstrap=TRUE}.} - -\item{cband}{Boolean for whether or not to compute a uniform confidence -band that covers all of the group-time average treatment effects -with fixed probability \code{1-alp}. In order to compute uniform confidence -bands, \code{bstrap} must also be set to \code{TRUE}. The default is -the value set in the MP object} - -\item{alp}{the significance level, default is value set in the MP object.} - -\item{clustervars}{A vector of variables to cluster on. At most, there -can be two variables (otherwise will throw an error) and one of these -must be the same as idname which allows for clustering at the individual -level. Default is the variables set in the MP object} -} -\value{ -An \code{\link{AGGTEobj}} object that holds the results from the -aggregation -} -\description{ -A function to take group-time average treatment effects -and aggregate them into a smaller number of parameters. There are -several possible aggregations including "simple", "dynamic", "group", -and "calendar." -} -\section{Examples}{ - - -Initial ATT(g,t) estimates from \code{\link[=att_gt]{att_gt()}} - -\if{html}{\out{
}}\preformatted{data(mpdta) -set.seed(09152024) -out <- att_gt(yname="lemp", - tname="year", - idname="countyreal", - gname="first.treat", - xformla=NULL, - data=mpdta) -}\if{html}{\out{
}} - -You can aggregate the ATT(g,t) in many ways. - -\strong{Overall ATT:} - -\if{html}{\out{
}}\preformatted{aggte(out, type = "simple") -#> -#> Call: -#> aggte(MP = out, type = "simple") -#> -#> Reference: Callaway, Brantly and Pedro H.C. Sant'Anna. "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. , -#> -#> -#> ATT Std. Error [ 95\% Conf. Int.] -#> -0.04 0.0123 -0.064 -0.0159 * -#> -#> -#> --- -#> Signif. codes: `*' confidence band does not cover 0 -#> -#> Control Group: Never Treated, Anticipation Periods: 0 -#> Estimation Method: Doubly Robust -}\if{html}{\out{
}} - -\strong{Dynamic ATT (Event-Study):} - -\if{html}{\out{
}}\preformatted{aggte(out, type = "dynamic") -#> -#> Call: -#> aggte(MP = out, type = "dynamic") -#> -#> Reference: Callaway, Brantly and Pedro H.C. Sant'Anna. "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. , -#> -#> -#> Overall summary of ATT's based on event-study/dynamic aggregation: -#> ATT Std. Error [ 95\% Conf. Int.] -#> -0.0772 0.0209 -0.1181 -0.0363 * -#> -#> -#> Dynamic Effects: -#> Event time Estimate Std. Error [95\% Simult. Conf. Band] -#> -3 0.0305 0.0158 -0.0103 0.0713 -#> -2 -0.0006 0.0134 -0.0351 0.0340 -#> -1 -0.0245 0.0145 -0.0617 0.0128 -#> 0 -0.0199 0.0125 -0.0521 0.0122 -#> 1 -0.0510 0.0161 -0.0926 -0.0093 * -#> 2 -0.1373 0.0386 -0.2368 -0.0377 * -#> 3 -0.1008 0.0345 -0.1899 -0.0117 * -#> --- -#> Signif. codes: `*' confidence band does not cover 0 -#> -#> Control Group: Never Treated, Anticipation Periods: 0 -#> Estimation Method: Doubly Robust -}\if{html}{\out{
}} - -\strong{ATT for each group:} - -\if{html}{\out{
}}\preformatted{aggte(out, type = "group") -#> -#> Call: -#> aggte(MP = out, type = "group") -#> -#> Reference: Callaway, Brantly and Pedro H.C. Sant'Anna. "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. , -#> -#> -#> Overall summary of ATT's based on group/cohort aggregation: -#> ATT Std. Error [ 95\% Conf. Int.] -#> -0.031 0.0126 -0.0558 -0.0062 * -#> -#> -#> Group Effects: -#> Group Estimate Std. Error [95\% Simult. Conf. Band] -#> 2004 -0.0797 0.0281 -0.1407 -0.0188 * -#> 2006 -0.0229 0.0156 -0.0568 0.0110 -#> 2007 -0.0261 0.0172 -0.0634 0.0113 -#> --- -#> Signif. codes: `*' confidence band does not cover 0 -#> -#> Control Group: Never Treated, Anticipation Periods: 0 -#> Estimation Method: Doubly Robust -}\if{html}{\out{
}} - -\strong{ATT for each calendar year:} - -\if{html}{\out{
}}\preformatted{aggte(out, type = "calendar") -#> -#> Call: -#> aggte(MP = out, type = "calendar") -#> -#> Reference: Callaway, Brantly and Pedro H.C. Sant'Anna. "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. , -#> -#> -#> Overall summary of ATT's based on calendar time aggregation: -#> ATT Std. Error [ 95\% Conf. Int.] -#> -0.0417 0.0172 -0.0755 -0.0079 * -#> -#> -#> Time Effects: -#> Time Estimate Std. Error [95\% Simult. Conf. Band] -#> 2004 -0.0105 0.0251 -0.0701 0.0490 -#> 2005 -0.0704 0.0320 -0.1464 0.0056 -#> 2006 -0.0488 0.0213 -0.0994 0.0018 -#> 2007 -0.0371 0.0139 -0.0700 -0.0041 * -#> --- -#> Signif. codes: `*' confidence band does not cover 0 -#> -#> Control Group: Never Treated, Anticipation Periods: 0 -#> Estimation Method: Doubly Robust -}\if{html}{\out{
}} -} - +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/aggte.R +\name{aggte} +\alias{aggte} +\title{Aggregate Group-Time Average Treatment Effects} +\usage{ +aggte( + MP, + type = "group", + balance_e = NULL, + min_e = -Inf, + max_e = Inf, + na.rm = FALSE, + bstrap = NULL, + biters = NULL, + cband = NULL, + alp = NULL, + clustervars = NULL +) +} +\arguments{ +\item{MP}{an MP object (i.e., the results of the \code{\link[=att_gt]{att_gt()}} method)} + +\item{type}{Which type of aggregated treatment effect parameter to compute. +One option is "simple" (this just computes a weighted average of all +group-time average treatment effects with weights proportional to group +size). Other options are "dynamic" (this computes average effects across +different lengths of exposure to the treatment and is similar to an +"event study"; here the overall effect averages the effect of the +treatment across all positive lengths of exposure); "group" (this +is the default option and +computes average treatment effects across different groups; here +the overall effect averages the effect across different groups); and +"calendar" (this computes average treatment effects across different +time periods; here the overall effect averages the effect across each +time period).} + +\item{balance_e}{If set (and if one computes dynamic effects), it balances +the sample with respect to event time. For example, if \code{balance.e=2}, +\code{aggte} will drop groups that are not exposed to treatment for +at least three periods. (the initial period when \code{e=0} as well as the +next two periods when \code{e=1} and the \code{e=2}). This ensures that +the composition of groups does not change when event time changes.} + +\item{min_e}{For event studies, this is the smallest event time to compute +dynamic effects for. By default, \code{min_e = -Inf} so that effects at +all lengths of exposure are computed.} + +\item{max_e}{For event studies, this is the largest event time to compute +dynamic effects for. By default, \code{max_e = Inf} so that effects at +all lengths of exposure are computed.} + +\item{na.rm}{Logical value if we are to remove missing Values from analyses. Defaults is FALSE.} + +\item{bstrap}{Boolean for whether or not to compute standard errors using +the multiplier bootstrap. If standard errors are clustered, then one +must set \code{bstrap=TRUE}. Default is value set in the MP object. If bstrap is \code{FALSE}, then analytical +standard errors are reported.} + +\item{biters}{The number of bootstrap iterations to use. The default is the value set in the MP object, +and this is only applicable if \code{bstrap=TRUE}.} + +\item{cband}{Boolean for whether or not to compute a uniform confidence +band that covers all of the group-time average treatment effects +with fixed probability \code{1-alp}. In order to compute uniform confidence +bands, \code{bstrap} must also be set to \code{TRUE}. The default is +the value set in the MP object} + +\item{alp}{the significance level, default is value set in the MP object.} + +\item{clustervars}{A vector of variables to cluster on. At most, there +can be two variables (otherwise will throw an error) and one of these +must be the same as idname which allows for clustering at the individual +level. Default is the variables set in the MP object} +} +\value{ +An \code{\link{AGGTEobj}} object that holds the results from the +aggregation +} +\description{ +A function to take group-time average treatment effects +and aggregate them into a smaller number of parameters. There are +several possible aggregations including "simple", "dynamic", "group", +and "calendar." +} +\section{Examples}{ + + +Initial ATT(g,t) estimates from \code{\link[=att_gt]{att_gt()}} + +\if{html}{\out{
}}\preformatted{data(mpdta) +set.seed(09152024) +out <- att_gt(yname="lemp", + tname="year", + idname="countyreal", + gname="first.treat", + xformla=NULL, + data=mpdta) +}\if{html}{\out{
}} + +You can aggregate the ATT(g,t) in many ways. + +\strong{Overall ATT:} + +\if{html}{\out{
}}\preformatted{aggte(out, type = "simple") +#> +#> Call: +#> aggte(MP = out, type = "simple") +#> +#> Reference: Callaway, Brantly and Pedro H.C. Sant'Anna. "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. , +#> +#> +#> ATT Std. Error [ 95\% Conf. Int.] +#> -0.04 0.0123 -0.064 -0.0159 * +#> +#> +#> --- +#> Signif. codes: `*' confidence band does not cover 0 +#> +#> Control Group: Never Treated, Anticipation Periods: 0 +#> Estimation Method: Doubly Robust +}\if{html}{\out{
}} + +\strong{Dynamic ATT (Event-Study):} + +\if{html}{\out{
}}\preformatted{aggte(out, type = "dynamic") +#> +#> Call: +#> aggte(MP = out, type = "dynamic") +#> +#> Reference: Callaway, Brantly and Pedro H.C. Sant'Anna. "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. , +#> +#> +#> Overall summary of ATT's based on event-study/dynamic aggregation: +#> ATT Std. Error [ 95\% Conf. Int.] +#> -0.0772 0.0209 -0.1181 -0.0363 * +#> +#> +#> Dynamic Effects: +#> Event time Estimate Std. Error [95\% Simult. Conf. Band] +#> -3 0.0305 0.0158 -0.0103 0.0713 +#> -2 -0.0006 0.0134 -0.0351 0.0340 +#> -1 -0.0245 0.0145 -0.0617 0.0128 +#> 0 -0.0199 0.0125 -0.0521 0.0122 +#> 1 -0.0510 0.0161 -0.0926 -0.0093 * +#> 2 -0.1373 0.0386 -0.2368 -0.0377 * +#> 3 -0.1008 0.0345 -0.1899 -0.0117 * +#> --- +#> Signif. codes: `*' confidence band does not cover 0 +#> +#> Control Group: Never Treated, Anticipation Periods: 0 +#> Estimation Method: Doubly Robust +}\if{html}{\out{
}} + +\strong{ATT for each group:} + +\if{html}{\out{
}}\preformatted{aggte(out, type = "group") +#> +#> Call: +#> aggte(MP = out, type = "group") +#> +#> Reference: Callaway, Brantly and Pedro H.C. Sant'Anna. "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. , +#> +#> +#> Overall summary of ATT's based on group/cohort aggregation: +#> ATT Std. Error [ 95\% Conf. Int.] +#> -0.031 0.0126 -0.0558 -0.0062 * +#> +#> +#> Group Effects: +#> Group Estimate Std. Error [95\% Simult. Conf. Band] +#> 2004 -0.0797 0.0281 -0.1407 -0.0188 * +#> 2006 -0.0229 0.0156 -0.0568 0.0110 +#> 2007 -0.0261 0.0172 -0.0634 0.0113 +#> --- +#> Signif. codes: `*' confidence band does not cover 0 +#> +#> Control Group: Never Treated, Anticipation Periods: 0 +#> Estimation Method: Doubly Robust +}\if{html}{\out{
}} + +\strong{ATT for each calendar year:} + +\if{html}{\out{
}}\preformatted{aggte(out, type = "calendar") +#> +#> Call: +#> aggte(MP = out, type = "calendar") +#> +#> Reference: Callaway, Brantly and Pedro H.C. Sant'Anna. "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. , +#> +#> +#> Overall summary of ATT's based on calendar time aggregation: +#> ATT Std. Error [ 95\% Conf. Int.] +#> -0.0417 0.0172 -0.0755 -0.0079 * +#> +#> +#> Time Effects: +#> Time Estimate Std. Error [95\% Simult. Conf. Band] +#> 2004 -0.0105 0.0251 -0.0701 0.0490 +#> 2005 -0.0704 0.0320 -0.1464 0.0056 +#> 2006 -0.0488 0.0213 -0.0994 0.0018 +#> 2007 -0.0371 0.0139 -0.0700 -0.0041 * +#> --- +#> Signif. codes: `*' confidence band does not cover 0 +#> +#> Control Group: Never Treated, Anticipation Periods: 0 +#> Estimation Method: Doubly Robust +}\if{html}{\out{
}} +} + diff --git a/man/att_gt.Rd b/man/att_gt.Rd index 9fb4590..9ecd2c1 100644 --- a/man/att_gt.Rd +++ b/man/att_gt.Rd @@ -1,293 +1,299 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/att_gt.R -\name{att_gt} -\alias{att_gt} -\title{Group-Time Average Treatment Effects} -\usage{ -att_gt( - yname, - tname, - idname = NULL, - gname, - xformla = NULL, - data, - panel = TRUE, - allow_unbalanced_panel = FALSE, - control_group = c("nevertreated", "notyettreated"), - anticipation = 0, - weightsname = NULL, - alp = 0.05, - bstrap = TRUE, - cband = TRUE, - biters = 1000, - clustervars = NULL, - est_method = "dr", - base_period = "varying", - print_details = FALSE, - pl = FALSE, - cores = 1 -) -} -\arguments{ -\item{yname}{The name of the outcome variable} - -\item{tname}{The name of the column containing the time periods} - -\item{idname}{The individual (cross-sectional unit) id name} - -\item{gname}{The name of the variable in \code{data} that -contains the first period when a particular observation is treated. -This should be a positive number for all observations in treated groups. -It defines which "group" a unit belongs to. It should be 0 for units -in the untreated group.} - -\item{xformla}{A formula for the covariates to include in the -model. It should be of the form \code{~ X1 + X2}. Default -is NULL which is equivalent to \code{xformla=~1}. This is -used to create a matrix of covariates which is then passed -to the 2x2 DID estimator chosen in \code{est_method}.} - -\item{data}{The name of the data.frame that contains the data} - -\item{panel}{Whether or not the data is a panel dataset. -The panel dataset should be provided in long format -- that -is, where each row corresponds to a unit observed at a -particular point in time. The default is TRUE. When -is using a panel dataset, the variable \code{idname} must -be set. When \code{panel=FALSE}, the data is treated -as repeated cross sections.} - -\item{allow_unbalanced_panel}{Whether or not function should -"balance" the panel with respect to time and id. The default -values if \code{FALSE} which means that \code{\link[=att_gt]{att_gt()}} will drop -all units where data is not observed in all periods. -The advantage of this is that the computations are faster -(sometimes substantially).} - -\item{control_group}{Which units to use the control group. -The default is "nevertreated" which sets the control group -to be the group of units that never participate in the -treatment. This group does not change across groups or -time periods. The other option is to set -\code{group="notyettreated"}. In this case, the control group -is set to the group of units that have not yet participated -in the treatment in that time period. This includes all -never treated units, but it includes additional units that -eventually participate in the treatment, but have not -participated yet.} - -\item{anticipation}{The number of time periods before participating -in the treatment where units can anticipate participating in the -treatment and therefore it can affect their untreated potential outcomes} - -\item{weightsname}{The name of the column containing the sampling weights. -If not set, all observations have same weight.} - -\item{alp}{the significance level, default is 0.05} - -\item{bstrap}{Boolean for whether or not to compute standard errors using -the multiplier bootstrap. If standard errors are clustered, then one -must set \code{bstrap=TRUE}. Default is \code{TRUE} (in addition, cband -is also by default \code{TRUE} indicating that uniform confidence bands -will be returned. If bstrap is \code{FALSE}, then analytical -standard errors are reported.} - -\item{cband}{Boolean for whether or not to compute a uniform confidence -band that covers all of the group-time average treatment effects -with fixed probability \code{1-alp}. In order to compute uniform confidence -bands, \code{bstrap} must also be set to \code{TRUE}. The default is -\code{TRUE}.} - -\item{biters}{The number of bootstrap iterations to use. The default is 1000, -and this is only applicable if \code{bstrap=TRUE}.} - -\item{clustervars}{A vector of variables names to cluster on. At most, there -can be two variables (otherwise will throw an error) and one of these -must be the same as idname which allows for clustering at the individual -level. By default, we cluster at individual level (when \code{bstrap=TRUE}).} - -\item{est_method}{the method to compute group-time average treatment effects. The default is "dr" which uses the doubly robust -approach in the \code{DRDID} package. Other built-in methods -include "ipw" for inverse probability weighting and "reg" for -first step regression estimators. The user can also pass their -own function for estimating group time average treatment -effects. This should be a function -\code{f(Y1,Y0,treat,covariates)} where \code{Y1} is an -\code{n} x \code{1} vector of outcomes in the post-treatment -outcomes, \code{Y0} is an \code{n} x \code{1} vector of -pre-treatment outcomes, \code{treat} is a vector indicating -whether or not an individual participates in the treatment, -and \code{covariates} is an \code{n} x \code{k} matrix of -covariates. The function should return a list that includes -\code{ATT} (an estimated average treatment effect), and -\code{inf.func} (an \code{n} x \code{1} influence function). -The function can return other things as well, but these are -the only two that are required. \code{est_method} is only used -if covariates are included.} - -\item{base_period}{Whether to use a "varying" base period or a -"universal" base period. Either choice results in the same -post-treatment estimates of ATT(g,t)'s. In pre-treatment -periods, using a varying base period amounts to computing a -pseudo-ATT in each treatment period by comparing the change -in outcomes for a particular group relative to its comparison -group in the pre-treatment periods (i.e., in pre-treatment -periods this setting computes changes from period t-1 to period -t, but repeatedly changes the value of t) - -A universal base period fixes the base period to always be -(g-anticipation-1). This does not compute -pseudo-ATT(g,t)'s in pre-treatment periods, but rather -reports average changes in outcomes from period t to -(g-anticipation-1) for a particular group relative to its comparison -group. This is analogous to what is often reported in event -study regressions. - -Using a varying base period results in an estimate of -ATT(g,t) being reported in the period immediately before -treatment. Using a universal base period normalizes the -estimate in the period right before treatment (or earlier when -the user allows for anticipation) to be equal to 0, but one -extra estimate in an earlier period.} - -\item{print_details}{Whether or not to show details/progress of computations. -Default is \code{FALSE}.} - -\item{pl}{Whether or not to use parallel processing} - -\item{cores}{The number of cores to use for parallel processing} -} -\value{ -an \code{\link{MP}} object containing all the results for group-time average -treatment effects -} -\description{ -\code{att_gt} computes average treatment effects in DID -setups where there are more than two periods of data and allowing for -treatment to occur at different points in time and allowing for -treatment effect heterogeneity and dynamics. -See Callaway and Sant'Anna (2021) for a detailed description. -} -\section{Examples:}{ -\strong{Basic \code{\link[=att_gt]{att_gt()}} call:} - -\if{html}{\out{
}}\preformatted{# Example data -data(mpdta) -set.seed(09152024) -out1 <- att_gt(yname="lemp", - tname="year", - idname="countyreal", - gname="first.treat", - xformla=NULL, - data=mpdta) -summary(out1) -#> -#> Call: -#> att_gt(yname = "lemp", tname = "year", idname = "countyreal", -#> gname = "first.treat", xformla = NULL, data = mpdta) -#> -#> Reference: Callaway, Brantly and Pedro H.C. Sant'Anna. "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. , -#> -#> Group-Time Average Treatment Effects: -#> Group Time ATT(g,t) Std. Error [95\% Simult. Conf. Band] -#> 2004 2004 -0.0105 0.0246 -0.0755 0.0545 -#> 2004 2005 -0.0704 0.0346 -0.1621 0.0212 -#> 2004 2006 -0.1373 0.0397 -0.2422 -0.0323 * -#> 2004 2007 -0.1008 0.0366 -0.1976 -0.0040 * -#> 2006 2004 0.0065 0.0226 -0.0532 0.0663 -#> 2006 2005 -0.0028 0.0193 -0.0538 0.0483 -#> 2006 2006 -0.0046 0.0185 -0.0536 0.0444 -#> 2006 2007 -0.0412 0.0208 -0.0962 0.0137 -#> 2007 2004 0.0305 0.0146 -0.0081 0.0692 -#> 2007 2005 -0.0027 0.0162 -0.0457 0.0402 -#> 2007 2006 -0.0311 0.0182 -0.0793 0.0172 -#> 2007 2007 -0.0261 0.0174 -0.0722 0.0201 -#> --- -#> Signif. codes: `*' confidence band does not cover 0 -#> -#> P-value for pre-test of parallel trends assumption: 0.16812 -#> Control Group: Never Treated, Anticipation Periods: 0 -#> Estimation Method: Doubly Robust -}\if{html}{\out{
}} - -\strong{Using covariates:} - -\if{html}{\out{
}}\preformatted{out2 <- att_gt(yname="lemp", - tname="year", - idname="countyreal", - gname="first.treat", - xformla=~lpop, - data=mpdta) -summary(out2) -#> -#> Call: -#> att_gt(yname = "lemp", tname = "year", idname = "countyreal", -#> gname = "first.treat", xformla = ~lpop, data = mpdta) -#> -#> Reference: Callaway, Brantly and Pedro H.C. Sant'Anna. "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. , -#> -#> Group-Time Average Treatment Effects: -#> Group Time ATT(g,t) Std. Error [95\% Simult. Conf. Band] -#> 2004 2004 -0.0145 0.0249 -0.0817 0.0527 -#> 2004 2005 -0.0764 0.0307 -0.1592 0.0064 -#> 2004 2006 -0.1404 0.0370 -0.2403 -0.0406 * -#> 2004 2007 -0.1069 0.0331 -0.1962 -0.0176 * -#> 2006 2004 -0.0005 0.0215 -0.0583 0.0574 -#> 2006 2005 -0.0062 0.0181 -0.0549 0.0425 -#> 2006 2006 0.0010 0.0190 -0.0502 0.0521 -#> 2006 2007 -0.0413 0.0207 -0.0971 0.0145 -#> 2007 2004 0.0267 0.0143 -0.0117 0.0652 -#> 2007 2005 -0.0046 0.0153 -0.0459 0.0368 -#> 2007 2006 -0.0284 0.0197 -0.0816 0.0247 -#> 2007 2007 -0.0288 0.0157 -0.0712 0.0136 -#> --- -#> Signif. codes: `*' confidence band does not cover 0 -#> -#> P-value for pre-test of parallel trends assumption: 0.23267 -#> Control Group: Never Treated, Anticipation Periods: 0 -#> Estimation Method: Doubly Robust -}\if{html}{\out{
}} - -\strong{Specify comparison units:} - -\if{html}{\out{
}}\preformatted{out3 <- att_gt(yname="lemp", - tname="year", - idname="countyreal", - gname="first.treat", - xformla=~lpop, - control_group = "notyettreated", - data=mpdta) -summary(out3) -#> -#> Call: -#> att_gt(yname = "lemp", tname = "year", idname = "countyreal", -#> gname = "first.treat", xformla = ~lpop, data = mpdta, control_group = "notyettreated") -#> -#> Reference: Callaway, Brantly and Pedro H.C. Sant'Anna. "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. , -#> -#> Group-Time Average Treatment Effects: -#> Group Time ATT(g,t) Std. Error [95\% Simult. Conf. Band] -#> 2004 2004 -0.0212 0.0219 -0.0797 0.0374 -#> 2004 2005 -0.0816 0.0299 -0.1617 -0.0015 * -#> 2004 2006 -0.1382 0.0375 -0.2387 -0.0376 * -#> 2004 2007 -0.1069 0.0354 -0.2016 -0.0122 * -#> 2006 2004 -0.0075 0.0216 -0.0653 0.0504 -#> 2006 2005 -0.0046 0.0184 -0.0539 0.0448 -#> 2006 2006 0.0087 0.0167 -0.0362 0.0535 -#> 2006 2007 -0.0413 0.0192 -0.0927 0.0101 -#> 2007 2004 0.0269 0.0146 -0.0122 0.0661 -#> 2007 2005 -0.0042 0.0160 -0.0470 0.0386 -#> 2007 2006 -0.0284 0.0182 -0.0773 0.0204 -#> 2007 2007 -0.0288 0.0176 -0.0759 0.0184 -#> --- -#> Signif. codes: `*' confidence band does not cover 0 -#> -#> P-value for pre-test of parallel trends assumption: 0.23326 -#> Control Group: Not Yet Treated, Anticipation Periods: 0 -#> Estimation Method: Doubly Robust -}\if{html}{\out{
}} -} - -\references{ -Callaway, Brantly and Pedro H.C. Sant'Anna. \"Difference-in-Differences with Multiple Time Periods.\" Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. \doi{10.1016/j.jeconom.2020.12.001}, \url{https://arxiv.org/abs/1803.09015} -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/att_gt.R +\name{att_gt} +\alias{att_gt} +\title{Group-Time Average Treatment Effects} +\usage{ +att_gt( + yname, + tname, + idname = NULL, + gname, + xformla = NULL, + data, + panel = TRUE, + allow_unbalanced_panel = FALSE, + control_group = c("nevertreated", "notyettreated"), + anticipation = 0, + weightsname = NULL, + alp = 0.05, + bstrap = TRUE, + cband = TRUE, + biters = 1000, + clustervars = NULL, + est_method = "dr", + base_period = "varying", + faster_mode = FALSE, + print_details = FALSE, + pl = FALSE, + cores = 1 +) +} +\arguments{ +\item{yname}{The name of the outcome variable} + +\item{tname}{The name of the column containing the time periods} + +\item{idname}{The individual (cross-sectional unit) id name} + +\item{gname}{The name of the variable in \code{data} that +contains the first period when a particular observation is treated. +This should be a positive number for all observations in treated groups. +It defines which "group" a unit belongs to. It should be 0 for units +in the untreated group.} + +\item{xformla}{A formula for the covariates to include in the +model. It should be of the form \code{~ X1 + X2}. Default +is NULL which is equivalent to \code{xformla=~1}. This is +used to create a matrix of covariates which is then passed +to the 2x2 DID estimator chosen in \code{est_method}.} + +\item{data}{The name of the data.frame that contains the data} + +\item{panel}{Whether or not the data is a panel dataset. +The panel dataset should be provided in long format -- that +is, where each row corresponds to a unit observed at a +particular point in time. The default is TRUE. When +is using a panel dataset, the variable \code{idname} must +be set. When \code{panel=FALSE}, the data is treated +as repeated cross sections.} + +\item{allow_unbalanced_panel}{Whether or not function should +"balance" the panel with respect to time and id. The default +values if \code{FALSE} which means that \code{\link[=att_gt]{att_gt()}} will drop +all units where data is not observed in all periods. +The advantage of this is that the computations are faster +(sometimes substantially).} + +\item{control_group}{Which units to use the control group. +The default is "nevertreated" which sets the control group +to be the group of units that never participate in the +treatment. This group does not change across groups or +time periods. The other option is to set +\code{group="notyettreated"}. In this case, the control group +is set to the group of units that have not yet participated +in the treatment in that time period. This includes all +never treated units, but it includes additional units that +eventually participate in the treatment, but have not +participated yet.} + +\item{anticipation}{The number of time periods before participating +in the treatment where units can anticipate participating in the +treatment and therefore it can affect their untreated potential outcomes} + +\item{weightsname}{The name of the column containing the sampling weights. +If not set, all observations have same weight.} + +\item{alp}{the significance level, default is 0.05} + +\item{bstrap}{Boolean for whether or not to compute standard errors using +the multiplier bootstrap. If standard errors are clustered, then one +must set \code{bstrap=TRUE}. Default is \code{TRUE} (in addition, cband +is also by default \code{TRUE} indicating that uniform confidence bands +will be returned. If bstrap is \code{FALSE}, then analytical +standard errors are reported.} + +\item{cband}{Boolean for whether or not to compute a uniform confidence +band that covers all of the group-time average treatment effects +with fixed probability \code{1-alp}. In order to compute uniform confidence +bands, \code{bstrap} must also be set to \code{TRUE}. The default is +\code{TRUE}.} + +\item{biters}{The number of bootstrap iterations to use. The default is 1000, +and this is only applicable if \code{bstrap=TRUE}.} + +\item{clustervars}{A vector of variables names to cluster on. At most, there +can be two variables (otherwise will throw an error) and one of these +must be the same as idname which allows for clustering at the individual +level. By default, we cluster at individual level (when \code{bstrap=TRUE}).} + +\item{est_method}{the method to compute group-time average treatment effects. The default is "dr" which uses the doubly robust +approach in the \code{DRDID} package. Other built-in methods +include "ipw" for inverse probability weighting and "reg" for +first step regression estimators. The user can also pass their +own function for estimating group time average treatment +effects. This should be a function +\code{f(Y1,Y0,treat,covariates)} where \code{Y1} is an +\code{n} x \code{1} vector of outcomes in the post-treatment +outcomes, \code{Y0} is an \code{n} x \code{1} vector of +pre-treatment outcomes, \code{treat} is a vector indicating +whether or not an individual participates in the treatment, +and \code{covariates} is an \code{n} x \code{k} matrix of +covariates. The function should return a list that includes +\code{ATT} (an estimated average treatment effect), and +\code{inf.func} (an \code{n} x \code{1} influence function). +The function can return other things as well, but these are +the only two that are required. \code{est_method} is only used +if covariates are included.} + +\item{base_period}{Whether to use a "varying" base period or a +"universal" base period. Either choice results in the same +post-treatment estimates of ATT(g,t)'s. In pre-treatment +periods, using a varying base period amounts to computing a +pseudo-ATT in each treatment period by comparing the change +in outcomes for a particular group relative to its comparison +group in the pre-treatment periods (i.e., in pre-treatment +periods this setting computes changes from period t-1 to period +t, but repeatedly changes the value of t) + +A universal base period fixes the base period to always be +(g-anticipation-1). This does not compute +pseudo-ATT(g,t)'s in pre-treatment periods, but rather +reports average changes in outcomes from period t to +(g-anticipation-1) for a particular group relative to its comparison +group. This is analogous to what is often reported in event +study regressions. + +Using a varying base period results in an estimate of +ATT(g,t) being reported in the period immediately before +treatment. Using a universal base period normalizes the +estimate in the period right before treatment (or earlier when +the user allows for anticipation) to be equal to 0, but one +extra estimate in an earlier period.} + +\item{faster_mode}{This option enables a faster version of \code{did}, optimizing +computation time for large datasets by improving data management within the package. +The default is set to \code{FALSE}. While the difference is minimal for small datasets, +it is recommended for use with large datasets.} + +\item{print_details}{Whether or not to show details/progress of computations. +Default is \code{FALSE}.} + +\item{pl}{Whether or not to use parallel processing} + +\item{cores}{The number of cores to use for parallel processing} +} +\value{ +an \code{\link{MP}} object containing all the results for group-time average +treatment effects +} +\description{ +\code{att_gt} computes average treatment effects in DID +setups where there are more than two periods of data and allowing for +treatment to occur at different points in time and allowing for +treatment effect heterogeneity and dynamics. +See Callaway and Sant'Anna (2021) for a detailed description. +} +\section{Examples:}{ +\strong{Basic \code{\link[=att_gt]{att_gt()}} call:} + +\if{html}{\out{
}}\preformatted{# Example data +data(mpdta) +set.seed(09152024) +out1 <- att_gt(yname="lemp", + tname="year", + idname="countyreal", + gname="first.treat", + xformla=NULL, + data=mpdta) +summary(out1) +#> +#> Call: +#> att_gt(yname = "lemp", tname = "year", idname = "countyreal", +#> gname = "first.treat", xformla = NULL, data = mpdta) +#> +#> Reference: Callaway, Brantly and Pedro H.C. Sant'Anna. "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. , +#> +#> Group-Time Average Treatment Effects: +#> Group Time ATT(g,t) Std. Error [95\% Simult. Conf. Band] +#> 2004 2004 -0.0105 0.0246 -0.0755 0.0545 +#> 2004 2005 -0.0704 0.0346 -0.1621 0.0212 +#> 2004 2006 -0.1373 0.0397 -0.2422 -0.0323 * +#> 2004 2007 -0.1008 0.0366 -0.1976 -0.0040 * +#> 2006 2004 0.0065 0.0226 -0.0532 0.0663 +#> 2006 2005 -0.0028 0.0193 -0.0538 0.0483 +#> 2006 2006 -0.0046 0.0185 -0.0536 0.0444 +#> 2006 2007 -0.0412 0.0208 -0.0962 0.0137 +#> 2007 2004 0.0305 0.0146 -0.0081 0.0692 +#> 2007 2005 -0.0027 0.0162 -0.0457 0.0402 +#> 2007 2006 -0.0311 0.0182 -0.0793 0.0172 +#> 2007 2007 -0.0261 0.0174 -0.0722 0.0201 +#> --- +#> Signif. codes: `*' confidence band does not cover 0 +#> +#> P-value for pre-test of parallel trends assumption: 0.16812 +#> Control Group: Never Treated, Anticipation Periods: 0 +#> Estimation Method: Doubly Robust +}\if{html}{\out{
}} + +\strong{Using covariates:} + +\if{html}{\out{
}}\preformatted{out2 <- att_gt(yname="lemp", + tname="year", + idname="countyreal", + gname="first.treat", + xformla=~lpop, + data=mpdta) +summary(out2) +#> +#> Call: +#> att_gt(yname = "lemp", tname = "year", idname = "countyreal", +#> gname = "first.treat", xformla = ~lpop, data = mpdta) +#> +#> Reference: Callaway, Brantly and Pedro H.C. Sant'Anna. "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. , +#> +#> Group-Time Average Treatment Effects: +#> Group Time ATT(g,t) Std. Error [95\% Simult. Conf. Band] +#> 2004 2004 -0.0145 0.0249 -0.0817 0.0527 +#> 2004 2005 -0.0764 0.0307 -0.1592 0.0064 +#> 2004 2006 -0.1404 0.0370 -0.2403 -0.0406 * +#> 2004 2007 -0.1069 0.0331 -0.1962 -0.0176 * +#> 2006 2004 -0.0005 0.0215 -0.0583 0.0574 +#> 2006 2005 -0.0062 0.0181 -0.0549 0.0425 +#> 2006 2006 0.0010 0.0190 -0.0502 0.0521 +#> 2006 2007 -0.0413 0.0207 -0.0971 0.0145 +#> 2007 2004 0.0267 0.0143 -0.0117 0.0652 +#> 2007 2005 -0.0046 0.0153 -0.0459 0.0368 +#> 2007 2006 -0.0284 0.0197 -0.0816 0.0247 +#> 2007 2007 -0.0288 0.0157 -0.0712 0.0136 +#> --- +#> Signif. codes: `*' confidence band does not cover 0 +#> +#> P-value for pre-test of parallel trends assumption: 0.23267 +#> Control Group: Never Treated, Anticipation Periods: 0 +#> Estimation Method: Doubly Robust +}\if{html}{\out{
}} + +\strong{Specify comparison units:} + +\if{html}{\out{
}}\preformatted{out3 <- att_gt(yname="lemp", + tname="year", + idname="countyreal", + gname="first.treat", + xformla=~lpop, + control_group = "notyettreated", + data=mpdta) +summary(out3) +#> +#> Call: +#> att_gt(yname = "lemp", tname = "year", idname = "countyreal", +#> gname = "first.treat", xformla = ~lpop, data = mpdta, control_group = "notyettreated") +#> +#> Reference: Callaway, Brantly and Pedro H.C. Sant'Anna. "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. , +#> +#> Group-Time Average Treatment Effects: +#> Group Time ATT(g,t) Std. Error [95\% Simult. Conf. Band] +#> 2004 2004 -0.0212 0.0219 -0.0797 0.0374 +#> 2004 2005 -0.0816 0.0299 -0.1617 -0.0015 * +#> 2004 2006 -0.1382 0.0375 -0.2387 -0.0376 * +#> 2004 2007 -0.1069 0.0354 -0.2016 -0.0122 * +#> 2006 2004 -0.0075 0.0216 -0.0653 0.0504 +#> 2006 2005 -0.0046 0.0184 -0.0539 0.0448 +#> 2006 2006 0.0087 0.0167 -0.0362 0.0535 +#> 2006 2007 -0.0413 0.0192 -0.0927 0.0101 +#> 2007 2004 0.0269 0.0146 -0.0122 0.0661 +#> 2007 2005 -0.0042 0.0160 -0.0470 0.0386 +#> 2007 2006 -0.0284 0.0182 -0.0773 0.0204 +#> 2007 2007 -0.0288 0.0176 -0.0759 0.0184 +#> --- +#> Signif. codes: `*' confidence band does not cover 0 +#> +#> P-value for pre-test of parallel trends assumption: 0.23326 +#> Control Group: Not Yet Treated, Anticipation Periods: 0 +#> Estimation Method: Doubly Robust +}\if{html}{\out{
}} +} + +\references{ +Callaway, Brantly and Pedro H.C. Sant'Anna. \"Difference-in-Differences with Multiple Time Periods.\" Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. \doi{10.1016/j.jeconom.2020.12.001}, \url{https://arxiv.org/abs/1803.09015} +} diff --git a/man/compute.att_gt2.Rd b/man/compute.att_gt2.Rd new file mode 100644 index 0000000..b75e178 --- /dev/null +++ b/man/compute.att_gt2.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/compute.att_gt2.R +\name{compute.att_gt2} +\alias{compute.att_gt2} +\title{Compute Group-Time Average Treatment Effects} +\usage{ +compute.att_gt2(dp2) +} +\arguments{ +\item{dp2}{A DIDparams object v2.0} +} +\value{ +a list with length equal to the number of groups times the +number of time periods; each element of the list contains an +object that contains group-time average treatment effect as well +as which group it is for and which time period it is for. It also exports +the influence function which is used externally to compute +standard errors. +} +\description{ +\code{compute.att_gt} does the (g,t) cell and send it to estimation, +then do all the post process after estimation +} +\keyword{internal} diff --git a/man/get_wide_data.Rd b/man/get_wide_data.Rd deleted file mode 100644 index 56d95d3..0000000 --- a/man/get_wide_data.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utility_functions.R -\name{get_wide_data} -\alias{get_wide_data} -\title{get_wide_data} -\usage{ -get_wide_data(data, yname, idname, tname) -} -\arguments{ -\item{data}{data.table used in function} - -\item{yname}{name of outcome variable that can change over time} - -\item{idname}{unique id} - -\item{tname}{time period name} -} -\value{ -data from first period with .y0 (outcome in first period), .y1 (outcome in second period), -and .dy (change in outcomes over time) appended to it -} -\description{ -A utility function to convert long data to wide data, i.e., takes a 2 period dataset and turns it into a cross sectional dataset. -} diff --git a/man/pre_process_did.Rd b/man/pre_process_did.Rd index 08b9ddf..2d8a2c7 100644 --- a/man/pre_process_did.Rd +++ b/man/pre_process_did.Rd @@ -24,6 +24,7 @@ pre_process_did( est_method = "dr", base_period = "varying", print_details = TRUE, + faster_mode = FALSE, pl = FALSE, cores = 1, call = NULL @@ -154,6 +155,11 @@ extra estimate in an earlier period.} \item{print_details}{Whether or not to show details/progress of computations. Default is \code{FALSE}.} +\item{faster_mode}{This option enables a faster version of \code{did}, optimizing +computation time for large datasets by improving data management within the package. +The default is set to \code{FALSE}. While the difference is minimal for small datasets, +it is recommended for use with large datasets.} + \item{pl}{Whether or not to use parallel processing} \item{cores}{The number of cores to use for parallel processing} diff --git a/man/pre_process_did2.Rd b/man/pre_process_did2.Rd new file mode 100644 index 0000000..0d1d9cb --- /dev/null +++ b/man/pre_process_did2.Rd @@ -0,0 +1,176 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pre_process_did2.R +\name{pre_process_did2} +\alias{pre_process_did2} +\title{Process \code{did} Function Arguments} +\usage{ +pre_process_did2( + yname, + tname, + idname, + gname, + xformla = NULL, + data, + panel = TRUE, + allow_unbalanced_panel, + control_group = c("nevertreated", "notyettreated"), + anticipation = 0, + weightsname = NULL, + alp = 0.05, + bstrap = FALSE, + cband = FALSE, + biters = 1000, + clustervars = NULL, + est_method = "dr", + base_period = "varying", + print_details = TRUE, + faster_mode = FALSE, + pl = FALSE, + cores = 1, + call = NULL +) +} +\arguments{ +\item{yname}{The name of the outcome variable} + +\item{tname}{The name of the column containing the time periods} + +\item{idname}{The individual (cross-sectional unit) id name} + +\item{gname}{The name of the variable in \code{data} that +contains the first period when a particular observation is treated. +This should be a positive number for all observations in treated groups. +It defines which "group" a unit belongs to. It should be 0 for units +in the untreated group.} + +\item{xformla}{A formula for the covariates to include in the +model. It should be of the form \code{~ X1 + X2}. Default +is NULL which is equivalent to \code{xformla=~1}. This is +used to create a matrix of covariates which is then passed +to the 2x2 DID estimator chosen in \code{est_method}.} + +\item{data}{The name of the data.frame that contains the data} + +\item{panel}{Whether or not the data is a panel dataset. +The panel dataset should be provided in long format -- that +is, where each row corresponds to a unit observed at a +particular point in time. The default is TRUE. When +is using a panel dataset, the variable \code{idname} must +be set. When \code{panel=FALSE}, the data is treated +as repeated cross sections.} + +\item{allow_unbalanced_panel}{Whether or not function should +"balance" the panel with respect to time and id. The default +values if \code{FALSE} which means that \code{\link[=att_gt]{att_gt()}} will drop +all units where data is not observed in all periods. +The advantage of this is that the computations are faster +(sometimes substantially).} + +\item{control_group}{Which units to use the control group. +The default is "nevertreated" which sets the control group +to be the group of units that never participate in the +treatment. This group does not change across groups or +time periods. The other option is to set +\code{group="notyettreated"}. In this case, the control group +is set to the group of units that have not yet participated +in the treatment in that time period. This includes all +never treated units, but it includes additional units that +eventually participate in the treatment, but have not +participated yet.} + +\item{anticipation}{The number of time periods before participating +in the treatment where units can anticipate participating in the +treatment and therefore it can affect their untreated potential outcomes} + +\item{weightsname}{The name of the column containing the sampling weights. +If not set, all observations have same weight.} + +\item{alp}{the significance level, default is 0.05} + +\item{bstrap}{Boolean for whether or not to compute standard errors using +the multiplier bootstrap. If standard errors are clustered, then one +must set \code{bstrap=TRUE}. Default is \code{TRUE} (in addition, cband +is also by default \code{TRUE} indicating that uniform confidence bands +will be returned. If bstrap is \code{FALSE}, then analytical +standard errors are reported.} + +\item{cband}{Boolean for whether or not to compute a uniform confidence +band that covers all of the group-time average treatment effects +with fixed probability \code{1-alp}. In order to compute uniform confidence +bands, \code{bstrap} must also be set to \code{TRUE}. The default is +\code{TRUE}.} + +\item{biters}{The number of bootstrap iterations to use. The default is 1000, +and this is only applicable if \code{bstrap=TRUE}.} + +\item{clustervars}{A vector of variables names to cluster on. At most, there +can be two variables (otherwise will throw an error) and one of these +must be the same as idname which allows for clustering at the individual +level. By default, we cluster at individual level (when \code{bstrap=TRUE}).} + +\item{est_method}{the method to compute group-time average treatment effects. The default is "dr" which uses the doubly robust +approach in the \code{DRDID} package. Other built-in methods +include "ipw" for inverse probability weighting and "reg" for +first step regression estimators. The user can also pass their +own function for estimating group time average treatment +effects. This should be a function +\code{f(Y1,Y0,treat,covariates)} where \code{Y1} is an +\code{n} x \code{1} vector of outcomes in the post-treatment +outcomes, \code{Y0} is an \code{n} x \code{1} vector of +pre-treatment outcomes, \code{treat} is a vector indicating +whether or not an individual participates in the treatment, +and \code{covariates} is an \code{n} x \code{k} matrix of +covariates. The function should return a list that includes +\code{ATT} (an estimated average treatment effect), and +\code{inf.func} (an \code{n} x \code{1} influence function). +The function can return other things as well, but these are +the only two that are required. \code{est_method} is only used +if covariates are included.} + +\item{base_period}{Whether to use a "varying" base period or a +"universal" base period. Either choice results in the same +post-treatment estimates of ATT(g,t)'s. In pre-treatment +periods, using a varying base period amounts to computing a +pseudo-ATT in each treatment period by comparing the change +in outcomes for a particular group relative to its comparison +group in the pre-treatment periods (i.e., in pre-treatment +periods this setting computes changes from period t-1 to period +t, but repeatedly changes the value of t) + +A universal base period fixes the base period to always be +(g-anticipation-1). This does not compute +pseudo-ATT(g,t)'s in pre-treatment periods, but rather +reports average changes in outcomes from period t to +(g-anticipation-1) for a particular group relative to its comparison +group. This is analogous to what is often reported in event +study regressions. + +Using a varying base period results in an estimate of +ATT(g,t) being reported in the period immediately before +treatment. Using a universal base period normalizes the +estimate in the period right before treatment (or earlier when +the user allows for anticipation) to be equal to 0, but one +extra estimate in an earlier period.} + +\item{print_details}{Whether or not to show details/progress of computations. +Default is \code{FALSE}.} + +\item{faster_mode}{This option enables a faster version of \code{did}, optimizing +computation time for large datasets by improving data management within the package. +The default is set to \code{FALSE}. While the difference is minimal for small datasets, +it is recommended for use with large datasets.} + +\item{pl}{Whether or not to use parallel processing} + +\item{cores}{The number of cores to use for parallel processing} + +\item{call}{Function call to att_gt} +} +\value{ +a \code{\link{DIDparams}} object +} +\description{ +Function to process arguments passed to the main methods in the +\code{did} package as well as conducting some tests to make sure +data is in proper format / try to throw helpful error messages. +} diff --git a/man/run_att_gt_estimation.Rd b/man/run_att_gt_estimation.Rd new file mode 100644 index 0000000..a2e1e50 --- /dev/null +++ b/man/run_att_gt_estimation.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/compute.att_gt2.R +\name{run_att_gt_estimation} +\alias{run_att_gt_estimation} +\title{Run ATT estimation for a given group-time pair} +\usage{ +run_att_gt_estimation(g, t, dp2) +} +\arguments{ +\item{g}{group of interest (treated group at time t)} + +\item{t}{time period} + +\item{dp2}{A DIDparams object v2.0} +} +\value{ +a list with the gt cell and the results after performing estimation +} +\description{ +\code{run_att_gt_estimation} does the main work for computing +multiperiod group-time average treatment effects +} +\keyword{internal} diff --git a/tests/testthat/test-att_gt.R b/tests/testthat/test-att_gt.R index fe495d2..c31fd09 100644 --- a/tests/testthat/test-att_gt.R +++ b/tests/testthat/test-att_gt.R @@ -8,7 +8,7 @@ library(BMisc) #----------------------------------------------------------------------------- # test each estimation method with panel data # Expected results: treatment effects = 1, p-value for pre-test -# uniformly distributed, ipw model is incorectly specified here +# uniformly distributed, ipw model is incorrectly specified here #----------------------------------------------------------------------------- test_that("att_gt works w/o dynamics, time effects, or group effects", { set.seed(09142024) @@ -533,18 +533,32 @@ test_that("small comparison group", { expect_error(expect_warning(res_ipw <- att_gt(yname="Y", xformla=~X, data=data, tname="period", idname="id", gname="G", est_method="ipw"), "small groups"), "never treated group is too small") + #----------------------------------------------------------------------------- + # not-yet-treated comparison group with faster_mode = TRUE + #----------------------------------------------------------------------------- + + # dr + expect_error(expect_warning(res_dr <- att_gt(yname="Y", xformla=~X, data=data, tname="period", idname="id", control_group="notyettreated", + gname="G", est_method="dr", faster_mode = TRUE), "matrix is singular"), "faster_mode=FALSE") + # reg + expect_error(expect_warning(res_reg <- att_gt(yname="Y", xformla=~X, data=data, tname="period", idname="id", control_group="notyettreated", + gname="G", est_method="reg", faster_mode = TRUE), "matrix is singular"), "faster_mode=FALSE") + # ipw + expect_warning(res_ipw <- att_gt(yname="Y", xformla=~X, data=data, tname="period", idname="id", control_group="notyettreated", + gname="G", est_method="ipw", faster_mode = TRUE), "small groups") + #----------------------------------------------------------------------------- # not-yet-treated comparison group #----------------------------------------------------------------------------- # dr expect_warning(res_dr <- att_gt(yname="Y", xformla=~X, data=data, tname="period", idname="id", control_group="notyettreated", - gname="G", est_method="dr"), "small group") + gname="G", est_method="dr", faster_mode = FALSE), "small group") # reg expect_warning(res_reg <- att_gt(yname="Y", xformla=~X, data=data, tname="period", idname="id", control_group="notyettreated", - gname="G", est_method="reg"), "Not enough control units") + gname="G", est_method="reg", faster_mode = FALSE), "Not enough control units") # ipw expect_warning(res_ipw <- att_gt(yname="Y", xformla=~X, data=data, tname="period", idname="id", control_group="notyettreated", - gname="G", est_method="ipw"), "overlap condition violated") + gname="G", est_method="ipw", faster_mode = FALSE), "overlap condition violated") # code should still work for some (g,t)'s expect_equal(res_dr$att[1], 1, tol=.5) @@ -637,9 +651,16 @@ test_that("clustered standard errors", { data=data, panel=TRUE, allow_unbalanced_panel=TRUE, + faster_mode = FALSE, clustervars="cluster") expect_equal(res_ub$att[1], 1, tol=.5) + # clustered standard errors with unbalanced panel with faster mode + res_ub_faster <- att_gt(yname="Y",tname="period",idname="id",gname="G", + xformla=~X,data=data,panel=TRUE, allow_unbalanced_panel=TRUE, + faster_mode = TRUE, clustervars="cluster") + expect_equal(res_ub_faster$att[1], 1, tol=.5) + #----------------------------------------------------------------------------- # also, check that we error when clustering variable varies within unit # over time @@ -659,3 +680,113 @@ test_that("clustered standard errors", { gname="G", est_method="dr", clustervars="cluster", panel=FALSE) expect_equal(res_rc$att[1], 1, tol=.5) }) + +test_that("faster mode enabled for panel data", { + data <- did::mpdta + out <- att_gt(yname = "lemp", gname = "first.treat", idname = "countyreal", tname = "year", + xformla = ~1, data = data, bstrap = FALSE, cband = FALSE, base_period = "universal", + control_group = "nevertreated", est_method = "dr", faster_mode = FALSE) + out2 <- att_gt(yname = "lemp", gname = "first.treat", idname = "countyreal", tname = "year", + xformla = ~1, data = data, bstrap = FALSE, cband = FALSE, base_period = "universal", + control_group = "nevertreated", est_method = "dr", faster_mode = TRUE) + + # check if results are equal. + expect_equal(out$att, out2$att) + expect_equal(out$se, as.numeric(out2$se)) + # -------------------------------------------------------------------------------------------------------- + out <- att_gt(yname = "lemp", gname = "first.treat", idname = "countyreal", tname = "year", + xformla = ~1, data = data, bstrap = FALSE, cband = FALSE, base_period = "varying", + control_group = "nevertreated", est_method = "dr", faster_mode = FALSE) + out2 <- att_gt(yname = "lemp", gname = "first.treat", idname = "countyreal", tname = "year", + xformla = ~1, data = data, bstrap = FALSE, cband = FALSE, base_period = "varying", + control_group = "nevertreated", est_method = "dr", faster_mode = TRUE) + + # check if results are equal. + expect_equal(out$att, out2$att) + expect_equal(out$se, as.numeric(out2$se)) + + # -------------------------------------------------------------------------------------------------------- + out <- att_gt(yname = "lemp", gname = "first.treat", idname = "countyreal", tname = "year", + xformla = ~1, data = data, bstrap = FALSE, cband = FALSE, base_period = "varying", + control_group = "notyettreated", est_method = "dr", faster_mode = FALSE) + out2 <- att_gt(yname = "lemp", gname = "first.treat", idname = "countyreal", tname = "year", + xformla = ~1, data = data, bstrap = FALSE, cband = FALSE, base_period = "varying", + control_group = "notyettreated", est_method = "dr", faster_mode = TRUE) + + # check if results are equal. + expect_equal(out$att, out2$att) + expect_equal(out$se, as.numeric(out2$se)) + + # -------------------------------------------------------------------------------------------------------- + out <- att_gt(yname = "lemp", gname = "first.treat", idname = "countyreal", tname = "year", + xformla = ~1, data = data, bstrap = FALSE, cband = FALSE, base_period = "universal", + control_group = "notyettreated", est_method = "dr", faster_mode = FALSE) + out2 <- att_gt(yname = "lemp", gname = "first.treat", idname = "countyreal", tname = "year", + xformla = ~1, data = data, bstrap = FALSE, cband = FALSE, base_period = "universal", + control_group = "notyettreated", est_method = "dr", faster_mode = TRUE) + + # check if results are equal. + expect_equal(out$att, out2$att) + expect_equal(out$se, as.numeric(out2$se)) + +}) + +test_that("faster model enabled for repeated cross sectional data", { + + data_rcs <- as.data.table(did::build_sim_dataset(reset.sim(time.periods=4, n=1000), panel=FALSE)) + data_rcs$period <- as.integer(data_rcs$period) + data_rcs[G == 0, G := Inf] + + # ---------------------------------------------------------------------------------------------------------------------------------- + out_rcs <- att_gt(yname = "Y", gname = "G", tname = "period", xformla = ~1, data = data_rcs, panel = FALSE, + bstrap = FALSE, cband = FALSE, base_period = "universal", control_group = "nevertreated", + est_method = "dr", faster_mode = FALSE) + + out_rcs2 <- att_gt(yname = "Y", gname = "G", tname = "period", xformla = ~1, data = data_rcs, panel = FALSE, + bstrap = FALSE, cband = FALSE, base_period = "universal", control_group = "nevertreated", + est_method = "dr", faster_mode = TRUE ) + + # check if results are equal. + expect_equal(out_rcs$att, out_rcs2$att) + expect_equal(out_rcs$se, as.numeric(out_rcs2$se)) + + # ---------------------------------------------------------------------------------------------------------------------------------- + out_rcs <- att_gt(yname = "Y", gname = "G", tname = "period", xformla = ~1, data = data_rcs, panel = FALSE, + bstrap = FALSE, cband = FALSE, base_period = "varying", control_group = "nevertreated", + est_method = "dr", faster_mode = FALSE) + + out_rcs2 <- att_gt(yname = "Y", gname = "G", tname = "period", xformla = ~1, data = data_rcs, panel = FALSE, + bstrap = FALSE, cband = FALSE, base_period = "varying", control_group = "nevertreated", + est_method = "dr", faster_mode = TRUE ) + + # check if results are equal. + expect_equal(out_rcs$att, out_rcs2$att) + expect_equal(out_rcs$se, as.numeric(out_rcs2$se)) + + # ---------------------------------------------------------------------------------------------------------------------------------- + out_rcs <- att_gt(yname = "Y", gname = "G", tname = "period", xformla = ~1, data = data_rcs, panel = FALSE, + bstrap = FALSE, cband = FALSE, base_period = "varying", control_group = "notyettreated", + est_method = "dr", faster_mode = FALSE) + + out_rcs2 <- att_gt(yname = "Y", gname = "G", tname = "period", xformla = ~1, data = data_rcs, panel = FALSE, + bstrap = FALSE, cband = FALSE, base_period = "varying", control_group = "notyettreated", + est_method = "dr", faster_mode = TRUE ) + + # check if results are equal. + expect_equal(out_rcs$att, out_rcs2$att) + expect_equal(out_rcs$se, as.numeric(out_rcs2$se)) + + # ---------------------------------------------------------------------------------------------------------------------------------- + out_rcs <- att_gt(yname = "Y", gname = "G", tname = "period", xformla = ~1, data = data_rcs, panel = FALSE, + bstrap = FALSE, cband = FALSE, base_period = "universal", control_group = "notyettreated", + est_method = "dr", faster_mode = FALSE) + + out_rcs2 <- att_gt(yname = "Y", gname = "G", tname = "period", xformla = ~1, data = data_rcs, panel = FALSE, + bstrap = FALSE, cband = FALSE, base_period = "universal", control_group = "notyettreated", + est_method = "dr", faster_mode = TRUE ) + + # check if results are equal. + expect_equal(out_rcs$att, out_rcs2$att) + expect_equal(out_rcs$se, as.numeric(out_rcs2$se)) + +}) diff --git a/tests/testthat/test-user_bug_fixes.R b/tests/testthat/test-user_bug_fixes.R index 4954b20..c7ff635 100644 --- a/tests/testthat/test-user_bug_fixes.R +++ b/tests/testthat/test-user_bug_fixes.R @@ -146,3 +146,15 @@ test_that("0 pre-treatment estimates when outcomes are 0", { res_idx <- which(res$group==9 & res$t==7) expect_equal(res$att[res_idx],0) }) + +test_that("variables not in live in dataset", { + sp <- did::reset.sim(time.periods=3) + data <- build_sim_dataset(sp) + + X2 <- factor(data$cluster) + + expect_error(att_gt(yname="Y", xformla=~X2, data=data, tname="period", idname="id", control_group="notyettreated", + gname="G", est_method="dr", clustervars="cluster"), " variables are not in data") + +}) + diff --git a/tests/testthat/test_sim_data_2_groups.R b/tests/testthat/test_sim_data_2_groups.R index 20c1ea2..2024ed3 100644 --- a/tests/testthat/test_sim_data_2_groups.R +++ b/tests/testthat/test_sim_data_2_groups.R @@ -1,6 +1,6 @@ library(DRDID) library(BMisc) -#library(tidyr) +library(tidyr) ## ----------------------------------------------------------------------------- #----------------------------------------------------------------------------- # test it works without covariates and only two gropups diff --git a/vignettes/did-basics.Rmd b/vignettes/did-basics.Rmd index f4edf7f..9061e52 100644 --- a/vignettes/did-basics.Rmd +++ b/vignettes/did-basics.Rmd @@ -76,17 +76,13 @@ library(did) #source(here::here("vignettes/setup_sims.R")) ``` -```{r, echo=FALSE} -time.periods <- 4 -sp <- reset.sim() -sp$te <- 0 -``` - ```{r, message = FALSE, warning = FALSE} # set seed so everything is reproducible set.seed(1814) # generate dataset with 4 time periods +sp <- reset.sim() +sp$te <- 0 time.periods <- 4 # add dynamic effects