R/pre_process_did.R

#' @title Process `did` Function Arguments
#'
#' @description Function to process arguments passed to the main methods in the
#'  `did` package as well as conducting some tests to make sure
#'  data is in proper format / try to throw helpful error messages.
#'
#' @inheritParams att_gt
#' @param call Function call to att_gt
#'
#' @return a [`DIDparams`] object
#'
#' @export
pre_process_did <- function(yname,
                            tname,
                            idname,
                            gname,
                            xformla = NULL,
                            data,
                            panel = TRUE,
                            allow_unbalanced_panel,
                            control_group = c("nevertreated","notyettreated"),
                            anticipation = 0,
                            weightsname = NULL,
                            alp = 0.05,
                            bstrap = FALSE,
                            cband = FALSE,
                            biters = 1000,
                            clustervars = NULL,
                            est_method = "dr",
                            base_period = "varying",
                            print_details = TRUE,
                            faster_mode = FALSE,
                            pl = FALSE,
                            cores = 1,
                            call = NULL) {
  #-----------------------------------------------------------------------------
  # Data pre-processing and error checking
  #-----------------------------------------------------------------------------
  # set control group
  control_group <- control_group[1]
  if(!(control_group %in% c("nevertreated","notyettreated"))){
    stop("control_group must be either 'nevertreated' or 'notyettreated'")
  }
  # make sure dataset is a data.frame
  # this gets around RStudio's default of reading data as tibble
  if (!all( class(data) == "data.frame")) {
    data <- as.data.frame(data)
  }

  # make sure time periods are numeric
  if (! (is.numeric(data[, tname])) ) stop("data[, tname] must be numeric")

  #  make sure gname is numeric
  if (! (is.numeric(data[, gname])) ) stop("data[, gname] must be numeric")

  # put in blank xformla if no covariates or check whether all variables are in data
  if (is.null(xformla)) {
    xformla <- ~1
  } else {
    # extract variable names from the formula
    formula_vars <- all.vars(xformla)

    # identify variables in xformla not in data
    missing_vars <- setdiff(formula_vars, names(data))

    # error checking for missing variables in data
    if (length(missing_vars) > 0) {
      stop(paste("The following variables are not in data:", paste(missing_vars, collapse = ", ")), call. = FALSE)
    }
  }

  # drop irrelevant columns from data
  data <- cbind.data.frame(data[,c(idname, tname, yname, gname, weightsname, clustervars)], model.frame(xformla, data=data, na.action=na.pass))

  # check if any covariates were missing
  n_orig <- nrow(data)
  data <- data[complete.cases(data),]
  n_diff <- n_orig - nrow(data)
  if (n_diff != 0) {
    warning(paste0("dropped ", n_diff, " rows from original data due to missing data"))
  }

  # weights if null
  ifelse(is.null(weightsname), w <- rep(1, nrow(data)), w <- data[,weightsname])

  if (".w" %in% colnames(data)) stop("`did` tried to use column named \".w\" internally, but there was already a column with this name")
  data$.w <- w

  # Outcome variable will be denoted by y
  # data$.y <- data[, yname]

  # figure out the dates
  # list of dates from smallest to largest
  tlist <- unique(data[,tname])[order(unique(data[,tname]))]

  # Groups with treatment time bigger than max time period are considered to be never treated
  asif_never_treated <- (data[,gname] > max(tlist, na.rm = TRUE))
  asif_never_treated[is.na(asif_never_treated)] <- FALSE
  data[asif_never_treated, gname] <- 0

  # list of treated groups (by time) from smallest to largest
  glist <- unique(data[,gname], )[order(unique(data[,gname]))]


  # Check if there is a never treated group
  if ( length(glist[glist==0]) == 0) {
    if(control_group=="nevertreated"){
      stop("There is no available never-treated group")
    } else {
      # Drop all time periods with time periods >= latest treated
      data <- subset(data,(data[,tname] < (max(glist)-anticipation)))
      # Replace last treated time with zero
      # lines.gmax <- data[,gname]==max(glist, na.rm = TRUE)
      # data[lines.gmax,gname] <- 0

      tlist <- sort(unique(data[,tname]))
      glist <- sort(unique(data[,gname]))

      # don't comput ATT(g,t) for groups that are only treated at end
      # and only play a role as a comparison group
      glist <- glist[ glist < max(glist)]
    }
  }

  # Only the treated groups
  glist <- glist[glist>0]

  # drop groups treated in the first period or before
  first.period <- tlist[1]
  glist <- glist[glist > first.period + anticipation]

  # check for groups treated in the first period and drop these
  # nfirstperiod <- length(unique(data[ !((data[,gname] > first.period) | (data[,gname]==0)), ] )[,idname])
  treated_first_period <- ( data[,gname] <= first.period ) & ( !(data[,gname]==0) )
  treated_first_period[is.na(treated_first_period)] <- FALSE
  nfirstperiod <- ifelse(panel, length(unique(data[treated_first_period,][,idname])), nrow(data[treated_first_period,]))
  if ( nfirstperiod > 0 ) {
    warning(paste0("Dropped ", nfirstperiod, " units that were already treated in the first period."))
    data <- data[ data[,gname] %in% c(0,glist), ]
    # update tlist and glist
    tlist <- unique(data[,tname])[order(unique(data[,tname]))]
    glist <- unique(data[,gname], )[order(unique(data[,gname]))]
    glist <- glist[glist>0]

    # drop groups treated in the first period or before
    first.period <- tlist[1]
    glist <- glist[glist > first.period + anticipation]

  }

  #  make sure id is numeric
  if (! is.null(idname)){
    #  make sure id is numeric
    if (! (is.numeric(data[, idname])) ) stop("data[, idname] must be numeric")

    ## # checks below are useful, but removing due to being slow
    ## # might ought to figure out a way to do these faster later
    ## # these checks are also closely related to making sure
    ## # that we have a well-balanced panel, so it might make
    ## # sense to move them over to the BMisc package

    ## # Check if idname is unique by tname
    ## n_id_year = all( table(data[, idname], data[, tname]) <= 1)
    ## if (! n_id_year) stop("The value of idname must be the unique (by tname)")

    ## # make sure gname doesn't change across periods for particular individuals
    ## if (!all(sapply( split(data, data[,idname]), function(df) {
    ##   length(unique(df[,gname]))==1
    ## }))) {
    ##   stop("The value of gname must be the same across all periods for each particular individual.")
    ## }
  }


  # if user specifies repeated cross sections,
  # set that it really is repeated cross sections
  true_repeated_cross_sections <- FALSE
  if (!panel) {
    true_repeated_cross_sections <- TRUE
  }

  #-----------------------------------------------------------------------------
  # setup data in panel case
  #-----------------------------------------------------------------------------
  # Check if data is a balanced panel if panel = TRUE and allow_unbalanced_panel = TRUE
  bal_panel_test <- panel*allow_unbalanced_panel
  if (bal_panel_test) {
    # First, focus on complete cases
    keepers <- complete.cases(data)
    data_comp <- data[keepers,]
    # make it a balanced data set
    n_all <- length(unique(data_comp[,idname]))
    data_bal <- BMisc::makeBalancedPanel(data_comp, idname, tname)
    n_bal <- length(unique(data_bal[,idname]))
    if (n_bal < n_all) {
      # message(paste0("You have an unbalanced panel. Proceeding as such."))
      allow_unbalanced_panel <- TRUE
    } else {
      # message(paste0("You have a balanced panel. Setting the allow_unbalanced_panel = FALSE."))
      allow_unbalanced_panel <- FALSE
    }
  }


  if (panel) {

    # check for unbalanced panel
    if (allow_unbalanced_panel) {

      # Flag for true repeated cross sections
      panel <- FALSE
      true_repeated_cross_sections <- FALSE

    } else {

      # this is the case where we coerce balanced panel

      # check for complete cases
      keepers <- complete.cases(data)
      n <- length(unique(data[,idname]))
      n.keep <- length(unique(data[keepers,idname]))
      if (nrow(data[keepers,]) < nrow(data)) {
        warning(paste0("Dropped ", (n-n.keep), " observations that had missing data."))
        data <- data[keepers,]
      }

      # make it a balanced data set
      n.old <- length(unique(data[,idname]))
      data <- BMisc::makeBalancedPanel(data, idname, tname)
      n <- length(unique(data[,idname]))
      if (n < n.old) {
        warning(paste0("Dropped ", n.old-n, " observations while converting to balanced panel."))
      }

      # If drop all data, you do not have a panel.
      if (nrow(data)==0) {
        stop("All observations dropped to converted data to balanced panel. Consider setting `panel = FALSE' and/or revisit 'idname'.")
      }

      n <- nrow(data[ data[,tname]==tlist[1], ])

      # slow, repeated check here...
      ## # check that first.treat doesn't change across periods for particular individuals
      ## if (!all(sapply( split(data, data[,idname]), function(df) {
      ##   length(unique(df[,gname]))==1
      ## }))) {
      ##   stop("The value of gname must be the same across all periods for each particular individual.")
      ## }

    }
  }

  #-----------------------------------------------------------------------------
  # code for setting up repeated cross sections (and unbalanced panel)
  #-----------------------------------------------------------------------------
  if (!panel) {

    # check for complete cases
    keepers <- complete.cases(data)
    if (nrow(data[keepers,]) < nrow(data)) {
      warning(paste0("Dropped ", nrow(data) - nrow(data[keepers,]), " observations that had missing data."))
      data <- data[keepers,]
    }

    # If drop all data, you do not have a panel.
    if (nrow(data)==0) {
      stop("All observations dropped due to missing data problems.")
    }

    # n-row data.frame to hold the influence function
    if (true_repeated_cross_sections) {
      data$.rowid <- seq(1:nrow(data))
      idname <- ".rowid"
    } else {
      # set rowid to idname for repeated cross section/unbalanced
      data$.rowid <- data[, idname]
    }

    # n is unique number of cross section observations
    # this is different for repeated cross sections and unbalanced panel
    n <- length(unique(data[,idname]))
  }

  ## # Update tlist and glist because of data handling
  ## # figure out the dates
  ## # list of dates from smallest to largest
  ## tlist <- unique(data[,tname])[order(unique(data[,tname]))]
  ## # list of treated groups (by time) from smallest to largest
  ## glist <- unique(data[,gname])[order(unique(data[,gname]))]

  ## # Only the treated groups
  ## glist <- glist[glist>0]

  ## # drop groups treated in the first period or before
  ## first.period <- tlist[1]
  ## glist <- glist[glist > first.period + anticipation]

  # Check if groups is empty (usually a problem with the way people defined groups)
  if(length(glist)==0){
    stop("No valid groups. The variable in 'gname' should be expressed as the time a unit is first treated (0 if never-treated).")
  }

  # if there are only two time periods, then uniform confidence
  # bands are the same as pointwise confidence intervals
  if (length(tlist)==2) {
    cband <- FALSE
  }

  #-----------------------------------------------------------------------------
  # more error handling after we have balanced the panel

  # check against very small groups
  gsize <- aggregate(data[,gname], by=list(data[,gname]), function(x) length(x)/length(tlist))

  # how many in each group before give warning
  # 5 is just a buffer, could pick something else, but seems to work fine
  reqsize <- length(BMisc::rhs.vars(xformla)) + 5

  # which groups to warn about
  gsize <- subset(gsize, x < reqsize) # x is name of column from aggregate

  # warn if some groups are small
  if (nrow(gsize) > 0) {
    gpaste <-  paste(gsize[,1], collapse=",")
    warning(paste0("Be aware that there are some small groups in your dataset.\n  Check groups: ", gpaste, "."))

    if ( (0 %in% gsize[,1]) & (control_group == "nevertreated") ) {
      stop("never treated group is too small, try setting control_group=\"notyettreated\"")
    }
  }
  #----------------------------------------------------------------------------

  # How many time periods
  nT <- length(tlist)
  # How many treated groups
  nG <- length(glist)

  # order dataset wrt idname and tname
  data <- data[order(data[,idname], data[,tname]),]

  # store parameters for passing around later
  dp <- DIDparams(yname=yname,
                  tname=tname,
                  idname=idname,
                  gname=gname,
                  xformla=xformla,
                  data=as.data.frame(data),
                  control_group=control_group,
                  anticipation=anticipation,
                  weightsname=weightsname,
                  alp=alp,
                  bstrap=bstrap,
                  biters=biters,
                  clustervars=clustervars,
                  cband=cband,
                  print_details=print_details,
                  faster_mode=faster_mode,
                  pl=pl,
                  cores=cores,
                  est_method=est_method,
                  base_period=base_period,
                  panel=panel,
                  true_repeated_cross_sections=true_repeated_cross_sections,
                  n=n,
                  nG=nG,
                  nT=nT,
                  tlist=tlist,
                  glist=glist,
                  call=call)
}