From eeece4c3c217455b10443ac0e7270b743100244b Mon Sep 17 00:00:00 2001 From: Kyle Butts Date: Wed, 17 May 2023 17:12:04 -0600 Subject: [PATCH 01/18] Initial `sparse_model_matrix` implementation --- R/MiscFuns.R | 7 +- R/sparse_model_matrix.R | 674 ++++++++++++++++++++++++++++++++++++++++ tests/fixest_tests.R | 124 ++++++++ 3 files changed, 803 insertions(+), 2 deletions(-) create mode 100644 R/sparse_model_matrix.R diff --git a/R/MiscFuns.R b/R/MiscFuns.R index 664fca0e..e3f0a0a6 100644 --- a/R/MiscFuns.R +++ b/R/MiscFuns.R @@ -1384,7 +1384,7 @@ i = function(factor_var, var, ref, keep, bin, ref2, keep2, bin2, ...){ items_name = items } - if(FROM_FIXEST){ + if(FROM_FIXEST || is_sparse){ # Pour avoir des jolis noms c'est un vrai gloubiboulga, # mais j'ai pas trouve plus simple... if(IS_INTER_FACTOR){ @@ -1421,6 +1421,10 @@ i = function(factor_var, var, ref, keep, bin, ref2, keep2, bin2, ...){ fe_colid = to_integer(fe_num[valid_row], sorted = TRUE) values = if(length(var) == 1) rep(1, length(valid_row)) else var + + # Clean names + col_names = sub("^.*__CLEAN__", "", col_names) + res = list(rowid = which(valid_row), values = values, colid = fe_colid, col_names = col_names) @@ -3694,7 +3698,6 @@ fixest_model_matrix = function(fml, data, fake_intercept = FALSE, i_noref = FALS } else { linear.mat = prepare_matrix(fml, data, fake_intercept) } - if(any(grepl("__CLEAN__", colnames(linear.mat), fixed = TRUE))){ new_names = clean_interact_names(colnames(linear.mat)) diff --git a/R/sparse_model_matrix.R b/R/sparse_model_matrix.R new file mode 100644 index 00000000..9f63906a --- /dev/null +++ b/R/sparse_model_matrix.R @@ -0,0 +1,674 @@ +#' Design matrix of a `fixest` object returned in sparse format +#' +#' This function creates the left-hand-side or the right-hand-side(s) of a [`femlm`], [`feols`] or [`feglm`] estimation. Note that this function currently does not accept a formula +#' +#' @method sparse_model_matrix fixest +#' +#' @inheritParams nobs.fixest +#' +#' @param data If missing (default) then the original data is obtained by evaluating the `call`. Otherwise, it should be a `data.frame`. +#' @param type Character vector or one sided formula, default is "rhs". Contains the type of matrix/data.frame to be returned. Possible values are: "lhs", "rhs", "fixef", "iv.rhs1" (1st stage RHS), "iv.rhs2" (2nd stage RHS), "iv.endo" (endogenous vars.), "iv.exo" (exogenous vars), "iv.inst" (instruments). +#' @param na.rm Default is `TRUE`. Should observations with NAs be removed from the matrix? +#' @param collin.rm Logical scalar, default is `TRUE`. Whether to remove variables that were found to be collinear during the estimation. Beware: it does not perform a collinearity check. +#' @param combine Logical scalar, default is `TRUE`. Whether to combine each resulting sparse matrix +#' @param ... Not currently used. +#' +#' @return +#' It returns either a single sparse matrix a list of matrices, depending whether `combine` is `TRUE` or `FALSE`. The sparse matrix is of class `dgCMatrix` from the `Matrix` package. +#' +#' @seealso +#' See also the main estimation functions [`femlm`], [`feols`] or [`feglm`]. [`formula.fixest`], [`update.fixest`], [`summary.fixest`], [`vcov.fixest`]. +#' +#' +#' @author +#' Laurent Berge, Kyle Butts +#' +#' @examples +#' +#' est = feols(wt ~ i(vs) + poly(hp, 2) | cyl, mtcars) +#' +#' +#' +sparse_model_matrix = function(object, data, type = "rhs", na.rm = TRUE, collin.rm = TRUE, combine = TRUE, ...) 
{ + # We evaluate the formula with the past call + # type: lhs, rhs, fixef, iv.endo, iv.inst, iv.rhs1, iv.rhs2 + # if fixef => return a DF + + # Checking the arguments + if (is_user_level_call()) { + validate_dots(suggest_args = c("data", "type")) + } + + # We allow type to be used in the location of data if data is missing + if (!missing(data) && missing(type)) { + sc = sys.call() + if (!"data" %in% names(sc)) { + if (!is.null(data) && (is.character(data) || "formula" %in% class(data))) { + # data is in fact the type + type = data + data = NULL + } + } + } + + type = check_set_types(type, c("lhs", "rhs", "fixef", "iv.endo", "iv.inst", "iv.exo", "iv.rhs1", "iv.rhs2")) + + if (isTRUE(object$is_fit)) { + stop("model.matrix method not available for fixest estimations obtained from fit methods.") + } + + if (any(grepl("^iv", type)) && !isTRUE(object$iv)) { + stop("The type", enumerate_items(grep("^iv", type, value = TRUE), "s.is"), " only valid for IV estimations.") + } + + check_arg(subset, "logical scalar | character vector no na") + + check_arg_plus(collin.rm, "logical scalar") + + # Evaluation with the data + original_data = FALSE + if (missnull(data)) { + original_data = TRUE + + data = fetch_data(object, "To apply 'sparse_model_matrix.fixest', ") + } + + # control of the data + if (is.matrix(data)) { + if (is.null(colnames(data))) { + stop("If argument 'data' is to be a matrix, its columns must be named.") + } + data = as.data.frame(data) + } + + if (!"data.frame" %in% class(data)) { + stop("The argument 'data' must be a data.frame or a matrix.") + } + + data = as.data.frame(data) + + # Allows passage of formula to sparse_model_matrix. A bit inefficient, but it works. + isFormula = FALSE + split_fml = NULL + if ("formula" %in% class(object)) { + split_fml = fml_split_internal(object) + + if (collin.rm == TRUE | length(split_fml) == 3) { + message("Formula passed to sparse_model_matrix with collin.rm == TRUE or iv. Estimating feols with formula.") + object = feols(object, data = data) + } else { + isFormula = TRUE + } + } + + # na.rm = FALSE and collin.rm = FALSE doesn't work with type = "fixef" + if (("fixef" %in% type & !na.rm)) { + # na.rm = TRUE + message("na.rm = FALSE doesn't work with type = 'fixef'. 
It has been set to TRUE.") + } + + + # Panel setup + if (!isFormula) { + panel__meta__info = set_panel_meta_info(object, data) + } + + # The formulas + if (isFormula) { + fml_linear = split_fml[[1]] + + fml_0 = attr(stats::terms(fml_linear), "intercept") == 0 + # TODO: Check if they are only heterogeneous slopes + fake_intercept = !is.null(split_fml[[2]]) | fml_0 + + + } else { + fml_linear = formula(object, type = "linear") + + # we kick out the intercept if there is presence of fixed-effects + fml_0 = attr(stats::terms(fml_linear), "intercept") == 0 + fake_intercept = !is.null(object$fixef_vars) && !(!is.null(object$slope_flag) && all(object$slope_flag < 0)) + fake_intercept = fake_intercept | fml_0 + } + + res = list() + + # TODO: multiple outcome variables doesn't work + if ("lhs" %in% type) { + lhs_text = deparse_long(fml_linear[[2]]) + lhs = eval(fml_linear[[2]], data) + lhs = Matrix::Matrix(lhs, sparse = TRUE, ncol = 1) + + colnames(lhs) = lhs_text + + res[["lhs"]] = lhs + } + + if ("rhs" %in% type && !isTRUE(object$onlyFixef)) { + + fml = fml_linear + + if (isFormula && (length(split_fml) == 3)) { + fml_iv = split_fml[[3]] + fml = .xpd(..lhs ~ ..endo + ..rhs, ..lhs = fml[[2]], ..endo = fml_iv[[2]], ..rhs = fml[[3]]) + } else if (isTRUE(object$iv)) { + fml_iv = object$fml_all$iv + fml = .xpd(..lhs ~ ..endo + ..rhs, ..lhs = fml[[2]], ..endo = fml_iv[[2]], ..rhs = fml[[3]]) + } + + vars = attr(stats::terms(fml), "term.labels") + + linear.mat = vars_to_sparse_mat(vars = vars, data = data, object = object, collin.rm = collin.rm, type = "rhs", add_intercept = !fake_intercept) + + res[["rhs"]] = linear.mat + } + + # + # Fixef + # + if ("fixef" %in% type) { + + if (isFormula && (length(split_fml) < 2)) { + mat_FE = NULL + } else if (!isFormula & length(object$fixef_id) == 0) { + mat_FE = NULL + } else { + + if(isFormula) { + fixef_fml = .xpd(~ ..fe, ..fe = split_fml[[2]]) + } else { + fixef_fml = object$fml_all$fixef + } + + fixef_terms_full = fixef_terms(fixef_fml) + fe_vars = fixef_terms_full$fe_vars + slope_var_list = fixef_terms_full$slope_vars_list + + fixef_df = prepare_df(fe_vars, data, fastCombine = FALSE) + + # Check for slope vars + if (any(fixef_terms_full$slope_flag > 0)) { + slope_df = prepare_df(unlist(slope_var_list), data) + } + + cols_lengths = c(0) + total_cols = 0 + n_FE = 0 + nrows = nrow(data) + for (i in seq_along(fe_vars)) { + + fe_var = fe_vars[i] + slope_vars = slope_var_list[[i]] + n_slope_vars = if (is.null(slope_vars)) 0 else length(slope_vars) + + fe = fixef_df[[fe_var]] + unique_fe = unique(fe) + n_cols = length(unique_fe[!is.na(unique_fe)]) + + total_cols = total_cols + n_cols * (1 + n_slope_vars) + cols_lengths = c(cols_lengths, rep(n_cols, 1 + n_slope_vars)) + n_FE = n_FE + 1 + n_slope_vars + } + running_cols = cumsum(cols_lengths) + + id_all = names_all = val_all = vector("list", n_FE) + rowid = 1:nrows + j = 1 + for (i in seq_along(fe_vars)) { + + fe_var = fe_vars[i] + xi = fixef_df[[fe_var]] + + col_id = unclass(droplevels(as.factor(xi))) + col_levels = attr(col_id, "levels") + + slope_vars = slope_var_list[[i]] + n_slope_vars = if (is.null(slope_vars)) 0 else length(slope_vars) + + # First fixed-effect by itself + val_all[[j]] = as.numeric(col_id > 0) + col_id[is.na(col_id)] = 1 + id_all[[j]] = cbind(rowid, running_cols[j] + col_id) + names_all[[j]] = paste0(fe_var, "::", col_levels) + j = j + 1 + + for (k in seq_along(slope_vars)) { + slope_var = slope_vars[k] + slope = slope_df[[slope_var]] + + id_all[[j]] = cbind(rowid, running_cols[j] + col_id) + 
val_all[[j]] = as.numeric(slope) + names_all[[j]] = paste0(fe_var, "[[", slope_var, "]]", "::", attr(col_id, "levels")) + j = j + 1 + } + } + + id_mat = do.call(rbind, id_all) + val_vec = unlist(val_all) + names_vec = unlist(names_all) + + mat_FE = Matrix::sparseMatrix( + i = id_mat[, 1], + j = id_mat[, 2], + x = val_vec, + dimnames = list(NULL, names_vec) + ) + + + # Keep non-zero FEs + if (collin.rm == TRUE) { + fixefs = fixef(object) + + select = lapply( + names(fixefs), + function(var) { + names = names(fixefs[[var]]) + names = names[fixefs[[var]] != 0] + + paste0(var, "::", names) + } + ) + select = unlist(select) + + # When original_data isn't used, some FEs may not be in the new dataset, add them in + missing_cols = setdiff(select, colnames(mat_FE)) + mat_FE = cbind( + mat_FE, + Matrix::Matrix(0, ncol = length(missing_cols), nrow = nrow(mat_FE), sparse = TRUE, dimnames = list(NULL, missing_cols)) + ) + + # Subset fixef and order columns to match fixef(object) + idx = which(colnames(mat_FE) %in% select) + mat_FE = mat_FE[, idx, drop = FALSE] + + # Reorder columns to match fixef(object) + reorder = match(unlist(select), colnames(mat_FE)) + mat_FE = mat_FE[, reorder[!is.na(reorder)], drop = FALSE] + } + + } + + res[["fixef"]] = mat_FE + } + + # + # IV + # + if ("iv.endo" %in% type) { + fml = object$iv_endo_fml + vars = attr(stats::terms(fml), "term.labels") + endo.mat = vars_to_sparse_mat(vars = vars, data = data, object = object, collin.rm = collin.rm) + + res[["iv.endo"]] = endo.mat + } + + if ("iv.inst" %in% type) { + fml = object$fml_all$iv + vars = attr(stats::terms(fml), "term.labels") + inst.mat = vars_to_sparse_mat(vars = vars, data = data, object = object, collin.rm = collin.rm) + + res[["iv.inst"]] = inst.mat + } + + if ("iv.exo" %in% type) { + + fml = object$fml_all$linear + vars = attr(stats::terms(fml), "term.labels") + exo.mat = vars_to_sparse_mat(vars = vars, data = data, object = object, collin.rm = collin.rm, type = "iv.exo", add_intercept = !fake_intercept) + + res[["iv.exo"]] = exo.mat + } + + if ("iv.rhs1" %in% type) { + fml = object$fml + if (object$iv_stage == 2) { + fml_iv = object$fml_all$iv + fml = .xpd(..lhs ~ ..inst + ..rhs, ..lhs = fml[[2]], ..inst = fml_iv[[3]], ..rhs = fml[[3]]) + } + vars = attr(stats::terms(fml), "term.labels") + iv_rhs1 = vars_to_sparse_mat(vars = vars, data = data, object = object, collin.rm = collin.rm, type = "iv.rhs1", add_intercept = !fake_intercept) + + res[["iv.rhs1"]] = iv_rhs1 + } + + if ("iv.rhs2" %in% type) { + # Second stage + if (!object$iv_stage == 2) { + stop("In model.matrix, the type 'iv.rhs2' is only valid for second stage IV models. 
This estimation is the first stage.") + } + + # I) we get the fitted values + stage_1 = object$iv_first_stage + + fit_vars = c() + for (i in seq_along(stage_1)) { + fit_vars[i] = v = paste0("fit_", names(stage_1)[i]) + data[[v]] = predict(stage_1[[i]], newdata = data, sample = "original") + } + + # II) we create the variables + fml = object$fml + fml = .xpd(..lhs ~ ..fit + ..rhs, ..lhs = fml[[2]], ..fit = fit_vars, ..rhs = fml[[3]]) + vars = attr(stats::terms(fml), "term.labels") + iv_rhs2 = vars_to_sparse_mat(vars = vars, data = data, object = object, collin.rm = collin.rm, type = "iv.rhs2", add_intercept = !fake_intercept) + + res[["iv.rhs2"]] = iv_rhs2 + } + + if (na.rm) { + na_rows = lapply(res, function(mat) { + # Get rows with NA + 1 + mat@i[is.na(mat@x)] + }) + + na_rows = unlist(na_rows, use.names = FALSE) + + if (original_data) { + na_rows = c(na_rows, -1 * object$obs_selection$obsRemoved) + } + + na_rows = unique(na_rows) + + if (length(na_rows) > 0) { + res = lapply(res, function(mat) { + mat[-na_rows, , drop = FALSE] + }) + } + } + + # Formatting res + if (length(res) == 0) { + return(NULL) + } else if (length(type) > 1) { + if (combine) { + res = res[type] + res = do.call(cbind, unname(res)) + } + } else { + res = res[[1]] + } + + res +} + + + +# Internal: modifies the calls so that each variable/interaction is evaluated with mult_sparse +mult_wrap = function(x) { + # x: character string of a variable to be evaluated + # ex: "x1" => mult_sparse(x1) + # "x1:factor(x2):x3" => mult_sparse(x3, factor(x2), x1) + # + # We also add the argument sparse to i() + # "x1:i(species, TRUE)" => mult_sparse(x1, i(species, TRUE, sparse = TRUE)) + + x_call = str2lang(x) + + res = (~ mult_sparse())[[2]] + + if (length(x_call) == 1 || x_call[[1]] != ":") { + res[[2]] = x_call + + } else { + res[[2]] = x_call[[3]] + tmp = x_call[[2]] + + while (length(tmp) == 3 && tmp[[1]] == ":") { + res[[length(res) + 1]] = tmp[[3]] + tmp = tmp[[2]] + } + + res[[length(res) + 1]] = tmp + } + + # We also add sparse to i() if found + for (i in 2:length(res)) { + ri = res[[i]] + if (length(ri) > 1 && ri[[1]] == "i") { + ri[["sparse"]] = TRUE + res[[i]] = ri + } + } + + if (length(res) > 2) { + # we restore the original order + res[-1] = rev(res[-1]) + } + + return(res) +} + +# Internal function to evaluate the variables (and interactions) in a sparse way +mult_sparse = function(...) { + # Only sparsifies factor variables + # Takes care of interactions + + dots = list(...) 
+ n = length(dots) + + num_var = NULL + factor_list = list() + info_i = NULL + is_i = is_factor = FALSE + # You can't have interactions between i and factors, it's either + + for (i in 1:n) { + xi = dots[[i]] + if (is.numeric(xi)) { + # We stack the product + num_var = if (is.null(num_var)) xi else xi * num_var + } else if (inherits(xi, "i_sparse")) { + is_i = TRUE + info_i = xi + } else { + is_factor = TRUE + factor_list[[length(factor_list) + 1]] = xi + } + } + + # numeric + if (!is_i && !is_factor) { + return(num_var) + } + # factor() + if (is_factor) { + factor_list$add_items = TRUE + factor_list$items.list = TRUE + + fact_as_int = do.call(to_integer, factor_list) + + values = if (is.null(num_var)) rep(1, length(fact_as_int$x)) else num_var + + rowid = seq_along(values) + res = list(rowid = rowid, colid = fact_as_int$x, values = values, + col_names = fact_as_int$items, n_cols = length(fact_as_int$items)) + # i() + } else { + + values = info_i$values + if (!is.null(num_var)) { + num_var = num_var[info_i$rowid] + values = values * num_var + } + + res = list(rowid = info_i$rowid, colid = info_i$colid, + values = values[info_i$rowid], + col_names = info_i$col_names, + n_cols = length(info_i$col_names)) + } + + class(res) = "sparse_var" + + res +} + +# Takes a vector of strings denoting the variables (including terms like `poly()`, `i()`, `I()`, `lag()`, etc.) and returns a sparse matrix of the variables extracted from `data`. +vars_to_sparse_mat = function(vars, data, collin.rm = FALSE, object = NULL, type = NULL, add_intercept = FALSE) { + + if (length(vars) == 0) { + # Case only FEs + mat = NULL + } else { + + # Since we don't want to evaluate the factors, + # the code is a bit intricate because we have to catch them before + # any interaction takes place + # + # that's why I wrap interactions in a function (mult_sparse()) + # + + # Below, we evaluate all the variables in a "sparse" way + + vars_calls = lapply(vars, mult_wrap) + + n = length(vars) + variables_list = vector("list", n) + for (i in 1:n) { + variables_list[[i]] = eval(vars_calls[[i]], data) + } + + # To create the sparse matrix, we need the column indexes + total_cols = 0 + running_cols = c(0) + for (i in 1:n) { + xi = variables_list[[i]] + if (inherits(xi, "sparse_var")) { + total_cols = total_cols + xi$n_cols + } else { + total_cols = total_cols + NCOL(xi) + } + running_cols[i + 1] = total_cols + } + + # We just create a sparse matrix and fill it + + # 1) creating the indexes + names + + # NOTA: I use lists to avoid creating copies + rowid = 1:nrow(data) + id_all = values_all = names_all = vector("list", n) + for (i in 1:n) { + xi = variables_list[[i]] + + if (inherits(xi, "sparse_var")) { + + id_all[[i]] = cbind(xi$rowid, running_cols[i] + xi$colid) + values_all[[i]] = xi$values + names_all[[i]] = xi$col_names + + } else if (NCOL(xi) == 1) { + + id_all[[i]] = cbind(rowid, running_cols[i] + 1) + values_all[[i]] = xi + names_all[[i]] = vars[[i]] + + } else { + + colid = rep(1:NCOL(xi), each = nrow(data)) + id_all[[i]] = cbind(rep(rowid, NCOL(xi)), running_cols[i] + colid) + values_all[[i]] = as.vector(xi) + if (!is.null(colnames(xi))) { + names_all[[i]] = paste0(vars[[i]], colnames(xi)) + } else { + names_all[[i]] = paste0(vars[[i]], 1:NCOL(xi)) + } + + } + } + + id_mat = do.call(rbind, id_all) + values_vec = unlist(values_all) + names_vec = unlist(names_all) + + # 2) filling the matrix: one shot, no copies + + mat = Matrix::Matrix(0, nrow(data), total_cols, dimnames = list(NULL, names_vec)) + mat[id_mat] = values_vec + + } 
+ + # Check + # if (isTRUE(object$iv)) { + # fml_iv = object$fml_all$iv + # fml = .xpd(..lhs ~ ..endo + ..rhs, ..lhs = fml[[2]], ..endo = fml_iv[[2]], ..rhs = fml[[3]]) + # } + + # TODO: Check in on this + # mat = error_sender(fixest_model_matrix_extra( + # object = object, newdata = data, original_data = original_data, + # fml = fml, fake_intercept = fake_intercept, + # subset = subset), + # "In 'model.matrix', the RHS could not be evaluated: ") + + if (collin.rm & is.null(object)) { + stop("You need to provide the 'object' argument to use 'collin.rm = TRUE'.") + } + + if (collin.rm) { + qui = which(colnames(mat) %in% object$collin.var) + if (length(qui) == ncol(mat)) { + mat = NULL + } else if (length(qui) > 0) { + mat = mat[, -qui, drop = FALSE] + } + + coefs = names(object$coefficients) + if (isTRUE(object$iv)) { + fml_iv = object$fml_all$iv + endo = fml_iv[[2]] + # TODO: See if a similar trick helps with multiple outcome variables + # Trick to get the rhs variables as a character vector + endo = .xpd(~ ..endo, ..endo = endo) + endo = attr(stats::terms(endo), "term.labels") + + exo = lapply(object$iv_first_stage, function(x) names(stats::coef(x))) + exo = unique(unlist(exo, use.names = FALSE)) + } + + # Custom subsetting for na.rm depending on `type` + if (!is.null(type)) { + if (type == "rhs") { + if (isTRUE(object$iv)) { + keep = c(endo, coefs) + } else { + keep = coefs + } + } else if(type == "iv.exo") { + keep = coefs + } else if (type == "iv.exo") { + keep = c(endo, coefs) + } else if (type == "iv.rhs1") { + keep = c(exo, coefs) + } else if (type == "iv.rhs2") { + keep = coefs + } + + keep = keep[!keep %in% object$collin.var] + if (length(keep) == 0) { + mat = NULL + } else { + idx = which(colnames(mat) %in% keep) + mat = mat[, idx, drop = FALSE] + } + } + + if (length(coefs) == ncol(mat) && any(colnames(mat) != names(coefs))) { + # we reorder the matrix + # This can happen in multiple estimations, where we respect the + # order of the user + + if (all(names(coefs) %in% colnames(mat))) { + mat = mat[, names(coefs), drop = FALSE] + } + } + } + + if (add_intercept) { + mat = cbind(1, mat) + colnames(mat)[1] = "(Intercept)" + } + + return(mat) +} + + \ No newline at end of file diff --git a/tests/fixest_tests.R b/tests/fixest_tests.R index 20cef9ba..d4dbcfcf 100644 --- a/tests/fixest_tests.R +++ b/tests/fixest_tests.R @@ -2135,6 +2135,130 @@ m_lhs_rhs_fixef = model.matrix(res_mult, type = c("lhs", "iv.rhs2", "fixef"), na test(names(m_lhs_rhs_fixef), c("y1", "fit_x2", "x1", "species")) +#### +#### sparse_model_matrix #### +#### + + +base = iris +names(base) = c("y1", "x1", "x2", "x3", "species") +base$y2 = 10 + rnorm(150) + 0.5 * base$x1 +base$x4 = rnorm(150) + 0.5 * base$y1 +base$fe2 = rep(letters[1:15], 10) +base$fe2[50:51] = NA +base$y2[base$fe2 == "a" & !is.na(base$fe2)] = 0 +base$x2[1:5] = NA +base$x3[6] = NA +base$fe3 = rep(letters[1:10], 15) +base$id = rep(1:15, each = 10) +base$time = rep(1:10, 15) + +base_bis = base[1:50, ] +base_bis$id = rep(1:5, each = 10) +base_bis$time = rep(1:10, 5) + + +res = feols(y1 ~ x1 + x2 + x3, base) +sm1 = sparse_model_matrix( + res, type = "lhs", + na.rm = TRUE +) +test(length(sm1), res$nobs) + +sm1_na = sparse_model_matrix( + res, data = base, + type = "lhs", + na.rm = FALSE +) +test(length(sm1_na), res$nobs_origin) +test(max(abs(sm1_na - base$y1), na.rm = TRUE), 0) + + +sm2 = sparse_model_matrix( + res, type = c("lhs", "rhs"), data = base, + combine = FALSE, na.rm = FALSE +) +y = sm2[["lhs"]] +X = sm2[["rhs"]] +obs_rm = 
res$obs_selection$obsRemoved +res_bis = Matrix::solve( + MatrixExtra::crossprod(X[obs_rm, ]), + MatrixExtra::crossprod(X[obs_rm, ], y[obs_rm]) +) +test(as.numeric(res_bis), res$coefficients) + +# No constant +res_nocons = feols(mpg ~ 0 + i(cyl), mtcars) +sm_nocons = sparse_model_matrix(res_nocons, type = "rhs") +test("(Intercept)" %in% colnames(sm_nocons), FALSE) + +# Lag +data(base_did) +pdat = panel(base_did, ~ id + period) +est1 = feols(y ~ l(x1, 0:1), pdat) +res_lag = feols(y1 ~ l(x1, 1:2) + x2 + x3, base, panel = ~id + time) +test(nrow(sm_lag), nobs(res_lag)) + + +# TODO: Fix poly and newdata +# With poly +# res_poly = feols(y1 ~ poly(x1, 2), base) +# works +# res_poly = feols(y1 ~ x1, base) +# sm_poly_old = sparse_model_matrix(res_poly) +# sm_poly_new = sparse_model_matrix(res_poly, data = base_bis) +# test(sm_poly_old[1:50, 3], sm_poly_new[, 3]) + + +# Interacted fixef +res = feols(y1 ~ x1 + x2 + x3 | species^fe2, base) +sm_ife = sparse_model_matrix(res, data = base, type = "fixef", collin.rm = FALSE) + +# fixef +res = feols(y1 ~ x1 + x2 + x3 | species + fe2, base) +sm_fe = sparse_model_matrix(res, data = base, type = "fixef") +test(ncol(sm_fe), 17) + +sm_fe_no_collin_rm = sparse_model_matrix(res, data = base, type = "fixef", collin.rm = FALSE) +test(ncol(sm_fe_no_collin_rm), 18) + +sm_fe_base_bis = sparse_model_matrix(res, data = base_bis, type = "fixef") + + +# Time-varying slopes +res_slopes = feols(y1 ~ x1 + x2 + x3 | fe2[I(x2+1)], data = base[7:48, ]) +sm_slopes = sparse_model_matrix(res_slopes, type = "fixef") + +# IV +res_iv = feols(y1 ~ x1 | x2 ~ x3, base) + +sm_rhs1 = sparse_model_matrix(res_iv, type = "iv.rhs1") +test(colnames(sm_rhs1)[-1], c("x3", "x1")) + +sm_rhs2 = sparse_model_matrix(res_iv, type = "iv.rhs2") +test(colnames(sm_rhs2)[-1], c("fit_x2", "x1")) + +sm_endo = sparse_model_matrix(res_iv, type = "iv.endo") +test(colnames(sm_endo), "x2") + +sm_exo = sparse_model_matrix(res_iv, type = "iv.exo") +test(colnames(sm_exo)[-1], "x1") + +sm_inst = sparse_model_matrix(res_iv, type = "iv.inst") +test(colnames(sm_inst), "x3") + +# several +res_mult = feols(y1 ~ x1 | species | x2 ~ x3, base) + +sm_lhs_rhs_fixef = sparse_model_matrix(res_mult, type = c("lhs", "iv.rhs2", "fixef")) +test(colnames(sm_lhs_rhs_fixef), c("y1", "fit_x2", "x1", "species::setosa", "species::versicolor", "species::virginica")) + + +# non-linear model +res_pois = fepois(Sepal.Length ~ Sepal.Width + Petal.Length | Species, iris) +sp_pois = sparse_model_matrix(res_pois, data = iris) + + #### #### fitstat #### #### From 239d313f09bffd007c25d4feb120a512694f93d8 Mon Sep 17 00:00:00 2001 From: Kyle Butts Date: Wed, 17 May 2023 17:16:52 -0600 Subject: [PATCH 02/18] Export `sparse_model_matrix` --- R/sparse_model_matrix.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/sparse_model_matrix.R b/R/sparse_model_matrix.R index 9f63906a..9ddfb5bd 100644 --- a/R/sparse_model_matrix.R +++ b/R/sparse_model_matrix.R @@ -28,7 +28,7 @@ #' est = feols(wt ~ i(vs) + poly(hp, 2) | cyl, mtcars) #' #' -#' +#' @export sparse_model_matrix = function(object, data, type = "rhs", na.rm = TRUE, collin.rm = TRUE, combine = TRUE, ...) 
{ # We evaluate the formula with the past call # type: lhs, rhs, fixef, iv.endo, iv.inst, iv.rhs1, iv.rhs2 From b992cd7b09907ee09cac007e3ee5c372956e4396 Mon Sep 17 00:00:00 2001 From: Kyle Butts Date: Wed, 17 May 2023 18:05:42 -0600 Subject: [PATCH 03/18] Allow named list for `.vcov` in summary to change name from "Custom" --- R/Methods.R | 2 +- R/VCOV.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/Methods.R b/R/Methods.R index c7128568..98780230 100644 --- a/R/Methods.R +++ b/R/Methods.R @@ -338,7 +338,7 @@ print.fixest = function(x, n, type = "table", fitstat = NULL, ...){ #' @param forceCovariance (Advanced users.) Logical, default is `FALSE`. In the peculiar case where the obtained Hessian is not invertible (usually because of collinearity of some variables), use this option to force the covariance matrix, by using a generalized inverse of the Hessian. This can be useful to spot where possible problems come from. #' @param keepBounded (Advanced users -- `feNmlm` with non-linear part and bounded coefficients only.) Logical, default is `FALSE`. If `TRUE`, then the bounded coefficients (if any) are treated as unrestricted coefficients and their S.E. is computed (otherwise it is not). #' @param n Integer, default is 1000. Number of coefficients to display when the print method is used. -#' @param ... Only used if the argument `.vocv` is provided and is a function: extra arguments to be passed to that function. +#' @param ... Only used if the argument `.vcov` is provided and is a function: extra arguments to be passed to that function. #' #' @section Compatibility with \pkg{sandwich} package: #' The VCOVs from `sandwich` can be used with `feols`, `feglm` and `fepois` estimations. If you want to have a `sandwich` VCOV when using `summary.fixest`, you can use the argument `vcov` to specify the VCOV function to use (see examples). 
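As context for the next hunk, which relaxes the check on `.vcov` so it also accepts a length-1 named list: a minimal sketch of the intended use described in the commit message — giving the custom VCOV a label other than "Custom" — could look like the snippet below. The unwrapping of the list happens outside this diff, so the exact forms accepted inside the list (matrix vs. function) and the resulting label are assumptions.

```R
# Sketch only: `.vcov` previously accepted a matrix or a function; the hunk
# below additionally allows a length-1 named list, whose name is presumably
# used as the VCOV label instead of "Custom".
library(fixest)
est = feols(mpg ~ wt + hp, mtcars)

# previously accepted forms
summary(est, .vcov = sandwich::vcovHC)        # a function
summary(est, .vcov = sandwich::vcovHC(est))   # a matrix

# new form (assumed): a named list of length 1, the name giving the label
summary(est, .vcov = list("HC3 (sandwich)" = sandwich::vcovHC(est)))
```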
diff --git a/R/VCOV.R b/R/VCOV.R index 8ef5df42..27b264bf 100644 --- a/R/VCOV.R +++ b/R/VCOV.R @@ -2129,7 +2129,7 @@ oldargs_to_vcov = function(se, cluster, vcov, .vcov = NULL){ } } - check_arg(.vcov, "matrix | function") + check_arg(.vcov, "matrix | function | named list L1") vcov = .vcov attr(vcov, "deparsed_arg") = fetch_arg_deparse(".vcov") From df86391efd65ed2d7dcee60d4e25841bb916ba8e Mon Sep 17 00:00:00 2001 From: Kyle Butts Date: Wed, 17 May 2023 19:05:55 -0600 Subject: [PATCH 04/18] Update documentation --- DESCRIPTION | 4 +-- NAMESPACE | 2 ++ R/sparse_model_matrix.R | 7 +++--- _pkgdown.yml | 1 + man/coefplot.Rd | 4 +-- man/degrees_freedom.Rd | 4 +-- man/etable.Rd | 6 ++--- man/l.Rd | 6 ++--- man/lag.formula.Rd | 4 +-- man/sparse_model_matrix.Rd | 50 ++++++++++++++++++++++++++++++++++++++ man/ssc.Rd | 4 +-- man/summary.fixest.Rd | 2 +- tests/fixest_tests.R | 11 +++------ 13 files changed, 77 insertions(+), 28 deletions(-) create mode 100644 man/sparse_model_matrix.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 97ffa093..41499dd9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -15,7 +15,7 @@ Authors@R: role = "ctb", email = "grantmcd@uoregon.edu", comment = c(ORCID = "0000-0001-7883-8573"))) -Imports: stats, graphics, grDevices, tools, utils, methods, numDeriv, nlme, sandwich, Rcpp(>= 1.0.5), dreamerr(>= 1.2.3) +Imports: stats, graphics, grDevices, tools, utils, methods, numDeriv, nlme, sandwich, Rcpp(>= 1.0.5), dreamerr(>= 1.2.3), Matrix Suggests: knitr, rmarkdown, data.table, plm, MASS, pander, ggplot2, lfe, tinytex, pdftools LinkingTo: Rcpp Depends: R(>= 3.5.0) @@ -28,6 +28,6 @@ URL: https://lrberge.github.io/fixest/, https://github.com/lrberge/fixest SystemRequirements: C++11 VignetteBuilder: knitr LazyData: true -RoxygenNote: 7.2.0 +RoxygenNote: 7.2.3 Roxygen: list(markdown = TRUE) Encoding: UTF-8 diff --git a/NAMESPACE b/NAMESPACE index f008a373..7fdd011a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -36,6 +36,8 @@ export(to_integer, demean, obs) export(as.dict) # setters & getters exportPattern("^(s|g)etFixest") +# sparse_model_matrix +export(sparse_model_matrix) # coeftable and co export(coeftable, se, pvalue, tstat) diff --git a/R/sparse_model_matrix.R b/R/sparse_model_matrix.R index 9ddfb5bd..66f32a2c 100644 --- a/R/sparse_model_matrix.R +++ b/R/sparse_model_matrix.R @@ -2,8 +2,6 @@ #' #' This function creates the left-hand-side or the right-hand-side(s) of a [`femlm`], [`feols`] or [`feglm`] estimation. Note that this function currently does not accept a formula #' -#' @method sparse_model_matrix fixest -#' #' @inheritParams nobs.fixest #' #' @param data If missing (default) then the original data is obtained by evaluating the `call`. Otherwise, it should be a `data.frame`. 
@@ -25,7 +23,8 @@ #' #' @examples #' -#' est = feols(wt ~ i(vs) + poly(hp, 2) | cyl, mtcars) +#' est = feols(wt ~ i(vs) + hp | cyl, mtcars) +#' sparse_model_matrix(est) #' #' #' @export @@ -70,7 +69,7 @@ sparse_model_matrix = function(object, data, type = "rhs", na.rm = TRUE, collin if (missnull(data)) { original_data = TRUE - data = fetch_data(object, "To apply 'sparse_model_matrix.fixest', ") + data = fetch_data(object, "To apply 'sparse_model_matrix', ") } # control of the data diff --git a/_pkgdown.yml b/_pkgdown.yml index 12f9fb81..6e7a6cdb 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -84,6 +84,7 @@ reference: - weights.fixest - formula.fixest - terms.fixest + - sparse_model_matrix - title: Formula tools desc: Tools to manipulate formulas contents: diff --git a/man/coefplot.Rd b/man/coefplot.Rd index 0270a586..e037fe7e 100644 --- a/man/coefplot.Rd +++ b/man/coefplot.Rd @@ -248,9 +248,9 @@ This function plots the results of estimations (coefficients and confidence inte } \section{Functions}{ \itemize{ -\item \code{iplot}: Plots the coefficients generated with i() -}} +\item \code{iplot()}: Plots the coefficients generated with i() +}} \section{Setting custom default values}{ The function \code{coefplot} dispose of many arguments to parametrize the plots. Most of these arguments can be set once an for all using the function \code{\link{setFixest_coefplot}}. See Example 3 below for a demonstration. diff --git a/man/degrees_freedom.Rd b/man/degrees_freedom.Rd index 96c89192..993b5900 100644 --- a/man/degrees_freedom.Rd +++ b/man/degrees_freedom.Rd @@ -42,9 +42,9 @@ Simple utility to extract the degrees of freedom from a \code{fixest} estimation } \section{Functions}{ \itemize{ -\item \code{degrees_freedom_iid}: Gets the degrees of freedom of a \code{fixest} estimation -}} +\item \code{degrees_freedom_iid()}: Gets the degrees of freedom of a \code{fixest} estimation +}} \examples{ # First: an estimation diff --git a/man/etable.Rd b/man/etable.Rd index 0240eaa9..f388000f 100644 --- a/man/etable.Rd +++ b/man/etable.Rd @@ -424,11 +424,11 @@ When the argument \code{postprocessing.tex} is not missing, two additional tags } \section{Functions}{ \itemize{ -\item \code{esttable}: Exports the results of multiple \code{fixest} estimations in a Latex table. +\item \code{esttable()}: Exports the results of multiple \code{fixest} estimations in a Latex table. -\item \code{esttex}: Exports the results of multiple \code{fixest} estimations in a Latex table. -}} +\item \code{esttex()}: Exports the results of multiple \code{fixest} estimations in a Latex table. +}} \section{How does \code{digits} handle the number of decimals displayed?}{ diff --git a/man/l.Rd b/man/l.Rd index 6022f618..5e81f36f 100644 --- a/man/l.Rd +++ b/man/l.Rd @@ -29,11 +29,11 @@ Produce lags or leads in the formulas of \code{fixest} estimations or when creat } \section{Functions}{ \itemize{ -\item \code{f}: Forwards a variable (inverse of lagging) in a \code{fixest} estimation +\item \code{f()}: Forwards a variable (inverse of lagging) in a \code{fixest} estimation -\item \code{d}: Creates differences (i.e. x - lag(x)) in a \code{fixest} estimation -}} +\item \code{d()}: Creates differences (i.e. x - lag(x)) in a \code{fixest} estimation +}} \examples{ data(base_did) diff --git a/man/lag.formula.Rd b/man/lag.formula.Rd index 373adb09..19137949 100644 --- a/man/lag.formula.Rd +++ b/man/lag.formula.Rd @@ -48,9 +48,9 @@ Lags a variable using panel id + time identifiers in a formula. 
} \section{Functions}{ \itemize{ -\item \code{lag_fml}: Lags a variable using a formula syntax -}} +\item \code{lag_fml()}: Lags a variable using a formula syntax +}} \examples{ # simple example with an unbalanced panel base = data.frame(id = rep(1:2, each = 4), diff --git a/man/sparse_model_matrix.Rd b/man/sparse_model_matrix.Rd new file mode 100644 index 00000000..d553c74e --- /dev/null +++ b/man/sparse_model_matrix.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sparse_model_matrix.R +\name{sparse_model_matrix} +\alias{sparse_model_matrix} +\title{Design matrix of a \code{fixest} object returned in sparse format} +\usage{ +sparse_model_matrix( + object, + data, + type = "rhs", + na.rm = TRUE, + collin.rm = TRUE, + combine = TRUE, + ... +) +} +\arguments{ +\item{object}{A \code{fixest} object. Obtained using the functions \code{\link{femlm}}, \code{\link{feols}} or \code{\link{feglm}}.} + +\item{data}{If missing (default) then the original data is obtained by evaluating the \code{call}. Otherwise, it should be a \code{data.frame}.} + +\item{type}{Character vector or one sided formula, default is "rhs". Contains the type of matrix/data.frame to be returned. Possible values are: "lhs", "rhs", "fixef", "iv.rhs1" (1st stage RHS), "iv.rhs2" (2nd stage RHS), "iv.endo" (endogenous vars.), "iv.exo" (exogenous vars), "iv.inst" (instruments).} + +\item{na.rm}{Default is \code{TRUE}. Should observations with NAs be removed from the matrix?} + +\item{collin.rm}{Logical scalar, default is \code{TRUE}. Whether to remove variables that were found to be collinear during the estimation. Beware: it does not perform a collinearity check.} + +\item{combine}{Logical scalar, default is \code{TRUE}. Whether to combine each resulting sparse matrix} + +\item{...}{Not currently used.} +} +\value{ +It returns either a single sparse matrix a list of matrices, depending whether \code{combine} is \code{TRUE} or \code{FALSE}. The sparse matrix is of class \code{dgCMatrix} from the \code{Matrix} package. +} +\description{ +This function creates the left-hand-side or the right-hand-side(s) of a \code{\link{femlm}}, \code{\link{feols}} or \code{\link{feglm}} estimation. Note that this function currently does not accept a formula +} +\examples{ + +est = feols(wt ~ i(vs) + hp | cyl, mtcars) +sparse_model_matrix(est) + + +} +\seealso{ +See also the main estimation functions \code{\link{femlm}}, \code{\link{feols}} or \code{\link{feglm}}. \code{\link{formula.fixest}}, \code{\link{update.fixest}}, \code{\link{summary.fixest}}, \code{\link{vcov.fixest}}. +} +\author{ +Laurent Berge, Kyle Butts +} diff --git a/man/ssc.Rd b/man/ssc.Rd index 18952c44..9e338046 100644 --- a/man/ssc.Rd +++ b/man/ssc.Rd @@ -55,9 +55,9 @@ The following vignette: \href{https://lrberge.github.io/fixest/articles/standard } \section{Functions}{ \itemize{ -\item \code{dof}: This function is deprecated and will be removed at some point (in 6 months from August 2021). Exactly the same as \code{ssc}. -}} +\item \code{dof()}: This function is deprecated and will be removed at some point (in 6 months from August 2021). Exactly the same as \code{ssc}. +}} \examples{ # diff --git a/man/summary.fixest.Rd b/man/summary.fixest.Rd index aca3a7b6..a202eaa3 100644 --- a/man/summary.fixest.Rd +++ b/man/summary.fixest.Rd @@ -61,7 +61,7 @@ \item{nthreads}{The number of threads. 
Can be: a) an integer lower than, or equal to, the maximum number of threads; b) 0: meaning all available threads will be used; c) a number strictly between 0 and 1 which represents the fraction of all threads to use. The default is to use 50\% of all threads. You can set permanently the number of threads used within this package using the function \code{\link{setFixest_nthreads}}.} -\item{...}{Only used if the argument \code{.vocv} is provided and is a function: extra arguments to be passed to that function.} +\item{...}{Only used if the argument \code{.vcov} is provided and is a function: extra arguments to be passed to that function.} } \value{ It returns a \code{fixest} object with: diff --git a/tests/fixest_tests.R b/tests/fixest_tests.R index d4dbcfcf..02f76c67 100644 --- a/tests/fixest_tests.R +++ b/tests/fixest_tests.R @@ -2139,7 +2139,6 @@ test(names(m_lhs_rhs_fixef), c("y1", "fit_x2", "x1", "species")) #### sparse_model_matrix #### #### - base = iris names(base) = c("y1", "x1", "x2", "x3", "species") base$y2 = 10 + rnorm(150) + 0.5 * base$x1 @@ -2181,9 +2180,9 @@ sm2 = sparse_model_matrix( y = sm2[["lhs"]] X = sm2[["rhs"]] obs_rm = res$obs_selection$obsRemoved -res_bis = Matrix::solve( - MatrixExtra::crossprod(X[obs_rm, ]), - MatrixExtra::crossprod(X[obs_rm, ], y[obs_rm]) +res_bis = solve( + crossprod(as.matrix(X)[obs_rm, ]), + crossprod(as.matrix(X)[obs_rm, ], as.matrix(y)[obs_rm, ]) ) test(as.numeric(res_bis), res$coefficients) @@ -2193,10 +2192,8 @@ sm_nocons = sparse_model_matrix(res_nocons, type = "rhs") test("(Intercept)" %in% colnames(sm_nocons), FALSE) # Lag -data(base_did) -pdat = panel(base_did, ~ id + period) -est1 = feols(y ~ l(x1, 0:1), pdat) res_lag = feols(y1 ~ l(x1, 1:2) + x2 + x3, base, panel = ~id + time) +sm_lag = sparse_model_matrix(res_lag, type = "rhs") test(nrow(sm_lag), nobs(res_lag)) From c4ffb1a1267029a2ebc744dc65ac994d68618fd6 Mon Sep 17 00:00:00 2001 From: Kyle Butts Date: Wed, 17 May 2023 19:11:38 -0600 Subject: [PATCH 05/18] Add to news --- NEWS.md | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/NEWS.md b/NEWS.md index 5ff6c1f4..582d2b61 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,36 @@ # fixest 0.11.1 +## Sparse Model Matrix + +- Add the ability to create sparse model matrices from regression objects and formula in a memory-efficient way. + +```R +est = feols(mpg ~ drat | cyl, mtcars) + +sparse_model_matrix(est, type = "lhs") +#> 32 x 1 sparse Matrix of class "dgCMatrix" +#> mpg +#> [1,] 21.0 +#> [2,] 21.0 +#> [3,] 22.8 + +sparse_model_matrix(est, type = c("rhs", "fixef")) +#> 32 x 4 sparse Matrix of class "dgCMatrix" +#> drat cyl::4 cyl::6 cyl::8 +#> [1,] 3.90 . 1 . +#> [2,] 3.90 . 1 . + +# Or you can pass a (linear) formula +sparse_model_matrix(mpg ~ i(vs) | gear^cyl, data = mtcars, type = c("rhs", "fixef")) +#> 32 x 9 sparse Matrix of class "dgCMatrix" +#> vs::1 gear^cyl::3_4 gear^cyl::3_6 gear^cyl::3_8 gear^cyl::4_4 gear^cyl::4_6 gear^cyl::5_4 gear^cyl::5_6 gear^cyl::5_8 +#> [1,] . . . . . 1 . . . +#> [2,] . . . . . 1 . . . +#> [3,] 1 . . . 1 . . . . 
+``` + + ## Documentation bug - fix bug in the help of `bin` From a2b2ef008cef91811a0af33237ac6dc72ae91f2e Mon Sep 17 00:00:00 2001 From: Kyle Butts Date: Thu, 18 May 2023 23:36:16 -0600 Subject: [PATCH 06/18] Speedup sparse_model_matrix with `type = "fixef"` --- R/sparse_model_matrix.R | 61 +++++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 26 deletions(-) diff --git a/R/sparse_model_matrix.R b/R/sparse_model_matrix.R index 66f32a2c..b4b12690 100644 --- a/R/sparse_model_matrix.R +++ b/R/sparse_model_matrix.R @@ -100,7 +100,7 @@ sparse_model_matrix = function(object, data, type = "rhs", na.rm = TRUE, collin } } - # na.rm = FALSE and collin.rm = FALSE doesn't work with type = "fixef" + # na.rm = FALSE doesn't work with type = "fixef" (which FE col gets NA?) if (("fixef" %in% type & !na.rm)) { # na.rm = TRUE message("na.rm = FALSE doesn't work with type = 'fixef'. It has been set to TRUE.") @@ -117,10 +117,8 @@ sparse_model_matrix = function(object, data, type = "rhs", na.rm = TRUE, collin fml_linear = split_fml[[1]] fml_0 = attr(stats::terms(fml_linear), "intercept") == 0 - # TODO: Check if they are only heterogeneous slopes fake_intercept = !is.null(split_fml[[2]]) | fml_0 - - + } else { fml_linear = formula(object, type = "linear") @@ -132,7 +130,6 @@ sparse_model_matrix = function(object, data, type = "rhs", na.rm = TRUE, collin res = list() - # TODO: multiple outcome variables doesn't work if ("lhs" %in% type) { lhs_text = deparse_long(fml_linear[[2]]) lhs = eval(fml_linear[[2]], data) @@ -162,9 +159,6 @@ sparse_model_matrix = function(object, data, type = "rhs", na.rm = TRUE, collin res[["rhs"]] = linear.mat } - # - # Fixef - # if ("fixef" %in% type) { if (isFormula && (length(split_fml) < 2)) { @@ -218,16 +212,28 @@ sparse_model_matrix = function(object, data, type = "rhs", na.rm = TRUE, collin fe_var = fe_vars[i] xi = fixef_df[[fe_var]] - col_id = unclass(droplevels(as.factor(xi))) - col_levels = attr(col_id, "levels") + where_na = which(is.na(xi)) + xi_quf = quickUnclassFactor(xi[-where_na], addItem = TRUE) + + col_id = xi_quf$x + col_levels = as.character(xi_quf$items) slope_vars = slope_var_list[[i]] n_slope_vars = if (is.null(slope_vars)) 0 else length(slope_vars) + # Be careful with NAs # First fixed-effect by itself - val_all[[j]] = as.numeric(col_id > 0) - col_id[is.na(col_id)] = 1 - id_all[[j]] = cbind(rowid, running_cols[j] + col_id) + val_all[[j]] = c(rep(1, length(col_id)), rep(NA, length(where_na))) + id_all[[j]] = cbind( + c( + rowid[-where_na], + rowid[where_na] + ), + c( + running_cols[j] + col_id, + rep(running_cols[j] + 1, length(where_na)) + ) + ) names_all[[j]] = paste0(fe_var, "::", col_levels) j = j + 1 @@ -235,9 +241,18 @@ sparse_model_matrix = function(object, data, type = "rhs", na.rm = TRUE, collin slope_var = slope_vars[k] slope = slope_df[[slope_var]] - id_all[[j]] = cbind(rowid, running_cols[j] + col_id) - val_all[[j]] = as.numeric(slope) - names_all[[j]] = paste0(fe_var, "[[", slope_var, "]]", "::", attr(col_id, "levels")) + val_all[[j]] = c(as.numeric(slope[-where_na]), rep(NA, length(where_na))) + id_all[[j]] = cbind( + c( + rowid[-where_na], + rowid[where_na] + ), + c( + running_cols[j] + col_id, + rep(running_cols[j] + 1, length(where_na)) + ) + ) + names_all[[j]] = paste0(fe_var, "[[", slope_var, "]]", "::", col_levels) j = j + 1 } } @@ -256,7 +271,7 @@ sparse_model_matrix = function(object, data, type = "rhs", na.rm = TRUE, collin # Keep non-zero FEs if (collin.rm == TRUE) { - fixefs = fixef(object) + fixefs = fixef(object, 
sorted = TRUE) select = lapply( names(fixefs), @@ -586,13 +601,7 @@ vars_to_sparse_mat = function(vars, data, collin.rm = FALSE, object = NULL, type } - # Check - # if (isTRUE(object$iv)) { - # fml_iv = object$fml_all$iv - # fml = .xpd(..lhs ~ ..endo + ..rhs, ..lhs = fml[[2]], ..endo = fml_iv[[2]], ..rhs = fml[[3]]) - # } - - # TODO: Check in on this + # TODO: Should I use error_sender? # mat = error_sender(fixest_model_matrix_extra( # object = object, newdata = data, original_data = original_data, # fml = fml, fake_intercept = fake_intercept, @@ -615,7 +624,7 @@ vars_to_sparse_mat = function(vars, data, collin.rm = FALSE, object = NULL, type if (isTRUE(object$iv)) { fml_iv = object$fml_all$iv endo = fml_iv[[2]] - # TODO: See if a similar trick helps with multiple outcome variables + # Trick to get the rhs variables as a character vector endo = .xpd(~ ..endo, ..endo = endo) endo = attr(stats::terms(endo), "term.labels") @@ -632,7 +641,7 @@ vars_to_sparse_mat = function(vars, data, collin.rm = FALSE, object = NULL, type } else { keep = coefs } - } else if(type == "iv.exo") { + } else if (type == "iv.exo") { keep = coefs } else if (type == "iv.exo") { keep = c(endo, coefs) From edf58c3d5b3e8ac7692996cbbbfdda44a1371f21 Mon Sep 17 00:00:00 2001 From: Kyle Butts Date: Sat, 20 May 2023 09:25:12 -0600 Subject: [PATCH 07/18] fix bug when no NA in fixef --- R/sparse_model_matrix.R | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/R/sparse_model_matrix.R b/R/sparse_model_matrix.R index b4b12690..85291d4d 100644 --- a/R/sparse_model_matrix.R +++ b/R/sparse_model_matrix.R @@ -212,8 +212,11 @@ sparse_model_matrix = function(object, data, type = "rhs", na.rm = TRUE, collin fe_var = fe_vars[i] xi = fixef_df[[fe_var]] - where_na = which(is.na(xi)) - xi_quf = quickUnclassFactor(xi[-where_na], addItem = TRUE) + keep = which(!is.na(xi)) + if (length(keep) == 0) stop("All values of the fixed-effect variable '", fe_var, "' are NA.") + + xi = xi[keep] + xi_quf = quickUnclassFactor(xi, addItem = TRUE) col_id = xi_quf$x col_levels = as.character(xi_quf$items) @@ -223,15 +226,15 @@ sparse_model_matrix = function(object, data, type = "rhs", na.rm = TRUE, collin # Be careful with NAs # First fixed-effect by itself - val_all[[j]] = c(rep(1, length(col_id)), rep(NA, length(where_na))) + val_all[[j]] = c(rep(1, length(col_id)), rep(NA, nrows - length(keep))) id_all[[j]] = cbind( c( - rowid[-where_na], - rowid[where_na] + rowid[keep], + rowid[-keep] ), c( running_cols[j] + col_id, - rep(running_cols[j] + 1, length(where_na)) + rep(running_cols[j] + 1, nrows - length(keep)) ) ) names_all[[j]] = paste0(fe_var, "::", col_levels) @@ -241,15 +244,15 @@ sparse_model_matrix = function(object, data, type = "rhs", na.rm = TRUE, collin slope_var = slope_vars[k] slope = slope_df[[slope_var]] - val_all[[j]] = c(as.numeric(slope[-where_na]), rep(NA, length(where_na))) + val_all[[j]] = c(as.numeric(slope[keep]), rep(NA, nrows - length(keep))) id_all[[j]] = cbind( c( - rowid[-where_na], - rowid[where_na] + rowid[keep], + rowid[-keep] ), c( running_cols[j] + col_id, - rep(running_cols[j] + 1, length(where_na)) + rep(running_cols[j] + 1, nrows - length(keep)) ) ) names_all[[j]] = paste0(fe_var, "[[", slope_var, "]]", "::", col_levels) From 0f114a8362ff55e007fd88dd4f315a45e0b95666 Mon Sep 17 00:00:00 2001 From: Kyle Butts Date: Sat, 20 May 2023 09:26:36 -0600 Subject: [PATCH 08/18] Update hatvalues.fixest: - Add fixed effect support - Add exact = FALSE option to approximate using JLA algorithm 
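A minimal sketch of the interface this patch adds — the signature `hatvalues(model, exact = TRUE, p = 1000)` is taken from the diff below; the dataset and the value of `p` are illustrative only.

```R
# Hat values now work with fixed effects; exact = FALSE switches to the
# Johnson-Lindenstrauss approximation, whose precision grows with p.
library(fixest)
est = feols(mpg ~ wt + hp | cyl, mtcars)

h_exact  = hatvalues(est)                            # exact diagonal of the projection matrix
h_approx = hatvalues(est, exact = FALSE, p = 2000)   # JLA approximation

max(abs(h_exact - h_approx))   # should shrink as p grows
```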
--- NEWS.md | 1 + R/Methods.R | 99 +++++++++++++++++++++++++++++++---------- man/hatvalues.fixest.Rd | 10 +++-- tests/fixest_tests.R | 5 +++ 4 files changed, 88 insertions(+), 27 deletions(-) diff --git a/NEWS.md b/NEWS.md index 582d2b61..5aec393c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -32,6 +32,7 @@ sparse_model_matrix(mpg ~ i(vs) | gear^cyl, data = mtcars, type = c("rhs", "fixe ``` +- `hatvalues` now works with fixed effects. It also allows for approximate computation (via `exact` and `p` arguments). For details on the approximation, see (Kline, Saggio, and Sølvsten 2020). ## Documentation bug - fix bug in the help of `bin` diff --git a/R/Methods.R b/R/Methods.R index 98780230..13eb313e 100644 --- a/R/Methods.R +++ b/R/Methods.R @@ -3728,15 +3728,17 @@ deviance.fixest = function(object, ...){ #' Hat values for `fixest` objects #' -#' Computes the hat values for [`feols`] or [`feglm`] estimations. Only works when there are no fixed-effects. +#' Computes the hat values for [`feols`] or [`feglm`] estimations. #' #' @param model A fixest object. For instance from feols or feglm. +#' @param exact Logical scalar, default is `TRUE`. Whether the diagonals of the projection matrix should be calculated exactly. If `FALSE`, then it will be +#' @param p Numeric scala, default is 1000. This is only used when `exact == FALSE`. This determined the number of bootstrap samples used to estimate the projection matrix. #' @param ... Not currently used. #' #' @details #' Hat values are not available for [`fenegbin`][fixest::femlm], [`femlm`] and [`feNmlm`] estimations. -#' -#' When there are fixed-effects, the hat values of the reduced form are different from the hat values of the full model. And we cannot get costlessly the hat values of the full model from the reduced form. It would require to reestimate the model with the fixed-effects as regular variables. +#' +#' When `exact == FALSE`, the Johnson-Lindenstrauss approximation (LA) algorithm is used which approximates the diagonals of the projection matrix. For more precision (but longer time), increase the value of `p`. #' #' @return #' Returns a vector of the same length as the number of observations used in the estimation. @@ -3747,12 +3749,8 @@ deviance.fixest = function(object, ...){ #' head(hatvalues(est)) #' #' -hatvalues.fixest = function(model, ...){ - # Only works for feglm/feols objects + no fixed-effects - # When there are fixed-effects the hatvalues of the reduced form is different from - # the hatvalues of the full model. And we cannot get costlessly the hatvalues of the full - # model from the reduced form. => we need to reestimate the model with the FEs as - # regular variables. +hatvalues.fixest = function(model, exact = TRUE, p = 1000, ...){ + # Only works for feglm/feols objects if(isTRUE(model$lean)){ # LATER: recompute it @@ -3764,29 +3762,82 @@ hatvalues.fixest = function(model, ...){ } method = model$method_type - family = model$family - - msg = "hatvalues.fixest: 'hatvalues' is not implemented for estimations with fixed-effects." 
- # An error is in fact nicer than a message + NA return due to the interplay with sandwich - if(!is.null(model$fixef_id)){ - stop(msg) + if (!(method %in% c("feols", "feglm"))) { + stop("'hatvalues' is currently not implemented for function ", method, ".") } - if(method == "feols"){ - X = model.matrix(model) + X = sparse_model_matrix(model, type = c("rhs", "fixef"), collin.rm = TRUE, na.rm = TRUE, combine = TRUE) + + if (method == "feglm") X = X * sqrt(model$irls_weights) + + fe_only = isTRUE(model$onlyFixef) + + Xt <- Matrix::t(X) + S_xx = Matrix::crossprod(X) + + # If there are only fixed effects, then can do this *super fast* + if (fe_only == TRUE) { + # https://stackoverflow.com/questions/39533785/how-to-compute-diagx-solvea-tx-efficiently-without-taking-matrix-i + U = Matrix::chol(S_xx) + Z = Matrix::solve(Matrix::t(U), Xt) + P_ii = Matrix::colSums(Z ^ 2) + + return(P_ii) + } + + # Note: This might fail if looking at too large of a matrix + if (exact == TRUE) { + # Z = (X'X)^{-1} X' + Z = Matrix::solve(S_xx, Xt) + + # X %*% Z = X (X'X)^{-1} X' + # P_ii = diag(X %*% Z) = colSums(X * Z) + P_ii = Matrix::rowSums(X * Matrix::t(Z)) + + return(P_ii) + } + + if (exact == FALSE) { + + n <- nrow(X) + + # Using speed heuristics based on n*p and trying + # Solve for Z = (X'X)^{-1} (X' R_p) + if (n * p <= 1000000) { + + # Fastest for small n*p (b/c of my for loop) + Rp <- matrix(rbinom(n * p, 1, 0.5) * 2 - 1, nrow = p, ncol = n) + X_Rp <- Matrix::tcrossprod(Xt, Rp) + Z = Matrix::solve(S_xx, X_Rp) + + # P_ii = 1/p || X_i' Z ||^2 + P_ii = Matrix::colSums(Matrix::crossprod(Z, Xt)^2) / p - res = cpp_diag_XUtX(X, model$cov.iid / model$sigma2) + } else { + # Slightly slower, but very memory efficient + # Theory: https://cs-people.bu.edu/evimaria/cs565/achlioptas.pdf + P_ii = rep(0, n) - } else if(method == "feglm"){ - XW = model.matrix(model) * sqrt(model$irls_weights) - res = cpp_diag_XUtX(XW, model$cov.iid) + for (j in 1:p) { + # Rademacher Weights (-1, 1) + Rp_j <- matrix( + (runif(n) > 0.5) * 2 - 1, + # Uncomment for speed-up + # rademacher::sample_rademacher(n), + nrow = 1, ncol = n + ) + X_Rp_j <- Matrix::tcrossprod(Xt, Rp_j) + + Zj <- Matrix::solve(S_xx, X_Rp_j) + P_ii = P_ii + as.numeric(X %*% Zj)^2 / p + } + } + + return(P_ii) - } else { - stop("'hatvalues' is currently not implemented for function ", method, ".") } - res } #### diff --git a/man/hatvalues.fixest.Rd b/man/hatvalues.fixest.Rd index bda72b07..2daad3a9 100644 --- a/man/hatvalues.fixest.Rd +++ b/man/hatvalues.fixest.Rd @@ -4,23 +4,27 @@ \alias{hatvalues.fixest} \title{Hat values for \code{fixest} objects} \usage{ -\method{hatvalues}{fixest}(model, ...) +\method{hatvalues}{fixest}(model, exact = TRUE, p = 1000, ...) } \arguments{ \item{model}{A fixest object. For instance from feols or feglm.} +\item{exact}{Logical scalar, default is \code{TRUE}. Whether the diagonals of the projection matrix should be calculated exactly. If \code{FALSE}, then it will be} + +\item{p}{Numeric scala, default is 1000. This is only used when \code{exact == FALSE}. This determined the number of bootstrap samples used to estimate the projection matrix.} + \item{...}{Not currently used.} } \value{ Returns a vector of the same length as the number of observations used in the estimation. } \description{ -Computes the hat values for \code{\link{feols}} or \code{\link{feglm}} estimations. Only works when there are no fixed-effects. +Computes the hat values for \code{\link{feols}} or \code{\link{feglm}} estimations. 
} \details{ Hat values are not available for \code{\link[=femlm]{fenegbin}}, \code{\link{femlm}} and \code{\link{feNmlm}} estimations. -When there are fixed-effects, the hat values of the reduced form are different from the hat values of the full model. And we cannot get costlessly the hat values of the full model from the reduced form. It would require to reestimate the model with the fixed-effects as regular variables. +When \code{exact == FALSE}, the Johnson-Lindenstrauss approximation (LA) algorithm is used which approximates the diagonals of the projection matrix. For more precision (but longer time), increase the value of \code{p}. } \examples{ diff --git a/tests/fixest_tests.R b/tests/fixest_tests.R index 02f76c67..2da7d9b2 100644 --- a/tests/fixest_tests.R +++ b/tests/fixest_tests.R @@ -1756,6 +1756,11 @@ glm_probit = glm(y_bin ~ x1 + x2, family = binomial("probit"), base) feglm_probit = feglm(y_bin ~ x1 + x2, base, binomial("probit")) test(hatvalues(feglm_probit), hatvalues(glm_probit)) +# Fixed effects +fm_fe = lm(y ~ x1 + x2 + factor(species), base) +ffm_fe = feols(y ~ x1 + x2 | species, base) +test(hatvalues(ffm_fe), hatvalues(fm_fe)) + #### #### sandwich #### From 6d95aa8f92216132b83a2b7f839f8559c3c98e04 Mon Sep 17 00:00:00 2001 From: Kyle Butts Date: Sat, 20 May 2023 09:47:12 -0600 Subject: [PATCH 09/18] Pass Check now --- R/Methods.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/Methods.R b/R/Methods.R index 13eb313e..37e61e25 100644 --- a/R/Methods.R +++ b/R/Methods.R @@ -3807,7 +3807,7 @@ hatvalues.fixest = function(model, exact = TRUE, p = 1000, ...){ if (n * p <= 1000000) { # Fastest for small n*p (b/c of my for loop) - Rp <- matrix(rbinom(n * p, 1, 0.5) * 2 - 1, nrow = p, ncol = n) + Rp <- matrix((stats::runif(n*p) > 0.5) * 2 - 1, nrow = p, ncol = n) X_Rp <- Matrix::tcrossprod(Xt, Rp) Z = Matrix::solve(S_xx, X_Rp) @@ -3822,7 +3822,7 @@ hatvalues.fixest = function(model, exact = TRUE, p = 1000, ...){ for (j in 1:p) { # Rademacher Weights (-1, 1) Rp_j <- matrix( - (runif(n) > 0.5) * 2 - 1, + (stats::runif(n) > 0.5) * 2 - 1, # Uncomment for speed-up # rademacher::sample_rademacher(n), nrow = 1, ncol = n From 729764e15ae789927dd2c3c488ea5abe5e173c41 Mon Sep 17 00:00:00 2001 From: Kyle Butts Date: Sat, 20 May 2023 09:57:24 -0600 Subject: [PATCH 10/18] Attempt at updating github action --- .github/workflows/check-standard.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/check-standard.yml b/.github/workflows/check-standard.yml index 1692193e..2a94e1f1 100644 --- a/.github/workflows/check-standard.yml +++ b/.github/workflows/check-standard.yml @@ -34,13 +34,13 @@ jobs: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - - uses: r-lib/actions/setup-r@v1 + - uses: r-lib/actions/setup-r@v2 with: r-version: ${{ matrix.config.r }} - - uses: r-lib/actions/setup-pandoc@v1 + - uses: r-lib/actions/setup-pandoc@v2 - name: Query dependencies run: | From 7f9c05203030b4ed930117215fb0c6b2a0e022a2 Mon Sep 17 00:00:00 2001 From: Kyle Butts Date: Wed, 7 Jun 2023 11:35:51 -0600 Subject: [PATCH 11/18] `hatvalues` now works for generalized linear models - Matches hatvalues.glm --- R/Methods.R | 27 +++++++++++++++++++++++---- R/sparse_model_matrix.R | 2 +- man/hatvalues.fixest.Rd | 9 ++++++++- 3 files changed, 32 insertions(+), 6 deletions(-) diff --git a/R/Methods.R b/R/Methods.R index 37e61e25..11ab16c3 100644 --- a/R/Methods.R +++ b/R/Methods.R @@ -3738,7 +3738,16 @@ 
deviance.fixest = function(object, ...){ #' @details #' Hat values are not available for [`fenegbin`][fixest::femlm], [`femlm`] and [`feNmlm`] estimations. #' -#' When `exact == FALSE`, the Johnson-Lindenstrauss approximation (LA) algorithm is used which approximates the diagonals of the projection matrix. For more precision (but longer time), increase the value of `p`. +#' Hat values for generalized linear model are disussed in Belsley, Kuh and Welsch (1980), Cook and Weisberg (1982), etc. +#' +#' When `exact == FALSE`, the Johnson-Lindenstrauss approximation (JLA) algorithm is used which approximates the diagonals of the projection matrix. For more precision (but longer time), increase the value of `p`. See Kline, Saggio, and Sølvsten (2020) for details. +#' +#' +#' @references +#' Belsley, D. A., Kuh, E. and Welsch, R. E. (1980). *Regression Diagnostics*. New York: Wiley. +#' Cook, R. D. and Weisberg, S. (1982). *Residuals and Influence in Regression*. London: Chapman and Hall. +#' Kline, P., Saggio R., and Sølvsten, M. (2020). *Leave‐Out Estimation of Variance Components*. Econometrica. +#' #' #' @return #' Returns a vector of the same length as the number of observations used in the estimation. @@ -3757,6 +3766,10 @@ hatvalues.fixest = function(model, exact = TRUE, p = 1000, ...){ stop("The method 'hatvalues.fixest' cannot be applied to 'lean' fixest objects. Please re-estimate with 'lean = FALSE'.") } + if (!is.null(res$iv)) { + stop("The method 'hatvalues.fixest' cannot be applied to ") + } + if(is_user_level_call()){ validate_dots() } @@ -3769,8 +3782,14 @@ hatvalues.fixest = function(model, exact = TRUE, p = 1000, ...){ X = sparse_model_matrix(model, type = c("rhs", "fixef"), collin.rm = TRUE, na.rm = TRUE, combine = TRUE) - if (method == "feglm") X = X * sqrt(model$irls_weights) - + if (method == "feols") { + weights = model$weights + if (is.null(weights)) weights = 1 + } else { + weights = model$irls_weights + } + X = X * sqrt(weights) + fe_only = isTRUE(model$onlyFixef) Xt <- Matrix::t(X) @@ -3807,7 +3826,7 @@ hatvalues.fixest = function(model, exact = TRUE, p = 1000, ...){ if (n * p <= 1000000) { # Fastest for small n*p (b/c of my for loop) - Rp <- matrix((stats::runif(n*p) > 0.5) * 2 - 1, nrow = p, ncol = n) + Rp <- matrix((stats::runif(n * p) > 0.5) * 2 - 1, nrow = p, ncol = n) X_Rp <- Matrix::tcrossprod(Xt, Rp) Z = Matrix::solve(S_xx, X_Rp) diff --git a/R/sparse_model_matrix.R b/R/sparse_model_matrix.R index 85291d4d..322c1462 100644 --- a/R/sparse_model_matrix.R +++ b/R/sparse_model_matrix.R @@ -167,7 +167,7 @@ sparse_model_matrix = function(object, data, type = "rhs", na.rm = TRUE, collin mat_FE = NULL } else { - if(isFormula) { + if (isFormula) { fixef_fml = .xpd(~ ..fe, ..fe = split_fml[[2]]) } else { fixef_fml = object$fml_all$fixef diff --git a/man/hatvalues.fixest.Rd b/man/hatvalues.fixest.Rd index 2daad3a9..cbd33da5 100644 --- a/man/hatvalues.fixest.Rd +++ b/man/hatvalues.fixest.Rd @@ -24,7 +24,9 @@ Computes the hat values for \code{\link{feols}} or \code{\link{feglm}} estimatio \details{ Hat values are not available for \code{\link[=femlm]{fenegbin}}, \code{\link{femlm}} and \code{\link{feNmlm}} estimations. -When \code{exact == FALSE}, the Johnson-Lindenstrauss approximation (LA) algorithm is used which approximates the diagonals of the projection matrix. For more precision (but longer time), increase the value of \code{p}. +Hat values for generalized linear model are disussed in Belsley, Kuh and Welsch (1980), Cook and Weisberg (1982), etc. 
+ +When \code{exact == FALSE}, the Johnson-Lindenstrauss approximation (JLA) algorithm is used which approximates the diagonals of the projection matrix. For more precision (but longer time), increase the value of \code{p}. See Kline, Saggio, and Sølvsten (2020) for details. } \examples{ @@ -33,3 +35,8 @@ head(hatvalues(est)) } +\references{ +Belsley, D. A., Kuh, E. and Welsch, R. E. (1980). \emph{Regression Diagnostics}. New York: Wiley. +Cook, R. D. and Weisberg, S. (1982). \emph{Residuals and Influence in Regression}. London: Chapman and Hall. +Kline, P., Saggio, R., and Sølvsten, M. (2020). \emph{Leave‐Out Estimation of Variance Components}. Econometrica. +} From 12c71eea1601977998ca3321d2a70bb63eaac303 Mon Sep 17 00:00:00 2001 From: Kyle Butts Date: Wed, 7 Jun 2023 14:59:54 -0600 Subject: [PATCH 12/18] HC2/HC3 Implementation --- NAMESPACE | 2 +- NEWS.md | 7 ++- R/Methods.R | 2 +- R/VCOV.R | 132 ++++++++++++++++++++++++++++++++++++++----- man/vcov_hetero.Rd | 50 ++++++++++++++++ tests/fixest_tests.R | 91 +++++++++++++++++++++++++++++---- 6 files changed, 256 insertions(+), 28 deletions(-) create mode 100644 man/vcov_hetero.Rd diff --git a/NAMESPACE b/NAMESPACE index 7fdd011a..f7a9612f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -56,7 +56,7 @@ S3method(tstat, fixest) S3method(tstat, fixest_multi) # VCOV funs -export(vcov_cluster, vcov_DK, vcov_NW, vcov_conley) +export(vcov_cluster, vcov_DK, vcov_NW, vcov_conley, vcov_hetero) # Data manipulation export(n_unik, osize, sample_df, fdim) diff --git a/NEWS.md b/NEWS.md index 5aec393c..6bf329ea 100644 --- a/NEWS.md +++ b/NEWS.md @@ -32,7 +32,10 @@ sparse_model_matrix(mpg ~ i(vs) | gear^cyl, data = mtcars, type = c("rhs", "fixe ``` -- `hatvalues` now works with fixed effects. It also allows for approximate computation (via `exact` and `p` arguments). For details on the approximation, see (Kline, Saggio, and Sølvsten 2020). +- `hatvalues` now works with fixed effects. It also allows for approximate computation (via `exact` and `p` arguments). For details on the approximation, see [Kline, Saggio, and Sølvsten 2020](https://eml.berkeley.edu/~pkline/papers/KSS2020.pdf). + +- Add `HC2` and `HC3` options to `vcov` for `feols` and `feglm` models. This also adds the `vcov_hetero` function. + ## Documentation bug - fix bug in the help of `bin` @@ -1052,7 +1055,7 @@ Common methods have been extended to `fixest_multi` objects. - Improve error messages. - - `hatvalues.fixest`: now returns an error instead of a message when fixed-effects are present (it makes the interplay with `sadnwich` 'nicer'). + - `hatvalues.fixest`: now returns an error instead of a message when fixed-effects are present (it makes the interplay with `sandwich` 'nicer'). # fixest 0.8.4 diff --git a/R/Methods.R b/R/Methods.R index 11ab16c3..1ee41525 100644 --- a/R/Methods.R +++ b/R/Methods.R @@ -3766,7 +3766,7 @@ hatvalues.fixest = function(model, exact = TRUE, p = 1000, ...){ stop("The method 'hatvalues.fixest' cannot be applied to 'lean' fixest objects.
Please re-estimate with 'lean = FALSE'.") } - if (!is.null(res$iv)) { + if (!is.null(model$iv)) { stop("The method 'hatvalues.fixest' cannot be applied to ") } diff --git a/R/VCOV.R b/R/VCOV.R index 27b264bf..fdb2a3c3 100644 --- a/R/VCOV.R +++ b/R/VCOV.R @@ -726,6 +726,12 @@ vcov.fixest = function(object, vcov = NULL, se = NULL, cluster, ssc = NULL, attr if(!missing(nthreads)) nthreads = check_set_nthreads(nthreads) + # Get default ssc if none is provided + # ssc related => we accept NULL + # we check ssc since it can be used by the funs + if(missnull(ssc)) ssc = getFixest_ssc() + check_arg(ssc, "class(ssc.type)", .message = "The argument 'ssc' must be an object created by the function ssc().") + #### #### ... scores #### #### @@ -748,6 +754,44 @@ vcov.fixest = function(object, vcov = NULL, se = NULL, cluster, ssc = NULL, attr scores = object$scores } + # For HC1, apply cluster.adj (N / (N - 1)) if ssc$cluster.adj = TRUE to match vcovHC(type = "HC1") + if (vcov %in% c("hetero", "white", "hc1") & ssc$cluster.adj) { + n = nrow(scores) + scores = scores * sqrt(n / (n - 1)) + } + + # For HC2/HC3, need to divide scores by hatvalues + if (vcov %in% c("hc2", "hc3")) { + + if (isTRUE(object$iv)) { + stop("hc2/hc3 vcov are not defined for IV estimates") + } + + exact = ifelse(is.null(extra_args$exact), TRUE, extra_args$exact) + p = ifelse(is.null(extra_args$p), 500, extra_args$p) + P_ii = hatvalues(object, exact = exact, p = p) + + problem_idx = which(P_ii > (1 - sqrt(.Machine$double.eps))) + if (length(problem_idx) > 0L) { + + if (length(problem_idx) > 10L) { + problem_idx = c(problem_idx[1:10], "...") + } + + stop( + sprintf( + "When calculating the diagonals of the projection matrix, one of the values was found to equal 1 (or be within machine epsilon of 1). This is most likely due to a fixed effect with only 1 observation, so removing that observation would make that coefficient non-estimable. The problematic rows of the data are: %s", + paste(problem_idx, collapse = ", ") + ) + ) + } + + if (vcov == "hc2") { + scores = scores / sqrt(1 - P_ii) + } else if (vcov == "hc3") { + scores = scores / (1 - P_ii) + } + } #### #### ... bread #### @@ -812,11 +856,6 @@ vcov.fixest = function(object, vcov = NULL, se = NULL, cluster, ssc = NULL, attr # we compute the vcov. The adjustment (which is a pain in the neck) will come after that # Here vcov is ALWAYS a character scalar - # ssc related => we accept NULL - # we check ssc since it can be used by the funs - if(missnull(ssc)) ssc = getFixest_ssc() - check_arg(ssc, "class(ssc.type)", .message = "The argument 'ssc' must be an object created by the function ssc().") - fun_name = vcov_select$fun_name args = list(bread = bread, scores = scores, vars = vcov_vars, ssc = ssc, sandwich = sandwich, nthreads = nthreads, @@ -840,7 +879,6 @@ vcov.fixest = function(object, vcov = NULL, se = NULL, cluster, ssc = NULL, attr #### # ssc is a ssc object in here - n_fe = n_fe_ok = length(object$fixef_id) # we adjust the fixef sizes to account for slopes @@ -954,6 +992,7 @@ vcov.fixest = function(object, vcov = NULL, se = NULL, cluster, ssc = NULL, attr K = max(K, length(object$coefficients) + 1) } + # TODO: There never is an attr "ss_adj". I think this can be safely removed? # Small sample adjustment ss_adj = attr(vcov_noAdj, "ss_adj") if(!is.null(ss_adj)){ @@ -963,6 +1002,12 @@ vcov.fixest = function(object, vcov = NULL, se = NULL, cluster, ssc = NULL, attr attr(vcov_noAdj, "ss_adj") = NULL } else { ss_adj = ifelse(ssc$adj, (n - 1) / (n - K), 1) + + # TODO: think about this?
It's not standard to apply ss_adj, but I don't like removing the option from the user. + # Don't apply ss_adj to hc2/hc3 since they are small-sample adjusted already! + # if (vcov %in% c("hc2", "hc3")) { + # ss_adj = 1 + # } } vcov_mat = vcov_noAdj * ss_adj @@ -1573,6 +1618,72 @@ vcov_conley = function(x, lat = NULL, lon = NULL, cutoff = NULL, pixel = 0, res } +#' Heteroskedasticity-Robust VCOV +#' +#' Computes the heteroskedasticity-robust VCOV of `fixest` objects. +#' +#' @param x A `fixest` object. +#' @param type A string scalar. Either "HC1"/"HC2"/"HC3". +#' @param exact A logical scalar. Should the hat matrix be calculated exactly or approximated using the JLA algorithm? Default = TRUE. See [`hatvalues.fixest`] for details. +#' @param p A numeric scalar. The number of draws to use when approximating the hat matrix. Note that larger `p` is more accurate, but slower. This is only used when `exact = FALSE`. Default is 500. +#' @param ssc An object returned by the function [`ssc`]. It specifies how to perform the small sample correction. +#' +#' @return +#' If the first argument is a `fixest` object, then a VCOV is returned (i.e. a symmetric matrix). +#' +#' If the first argument is not a `fixest` object, then a) implicitly the arguments are shifted to the left (i.e. `vcov_hetero("HC3")` is equivalent to `vcov_hetero(type = "HC3")` and b) a VCOV-*request* is returned and NOT a VCOV. That VCOV-request can then be used in the argument `vcov` of various `fixest` functions (e.g. [`vcov.fixest`] or even in the estimation calls). +#' +#' @author +#' Laurent Berge and Kyle Butts +#' +#' @references +#' Cameron AC, Gelbach JB, Miller DL (2011). "Robust Inference with Multiway Clustering." *Journal of Business & Economic Statistics*, 29(2), 238-249. doi:10.1198/jbes.2010.07136.
+#' +#' @examples +#' +#' base = iris +#' names(base) = c("y", "x1", "x2", "x3", "species") +#' base$clu = rep(1:5, 30) +#' +#' est = feols(y ~ x1 | species, base) +#' +#' vcov_hetero(est, "hc1") +#' vcov_hetero(est, "hc2", ssc = ssc(adj = FALSE)) +#' vcov_hetero(est, "hc3", ssc = ssc(adj = FALSE)) +#' +#' # Using approximate hatvalues +#' vcov_hetero(est, "hc3", exact = FALSE, p = 500) +#' +#' +vcov_hetero = function(x, type = "hc1", exact = TRUE, p = 500, ssc = NULL){ + # User-level function to compute heteroskedasticity-robust SEs + # typically we only do checking and reshaping here + + # slide_args allows the implicit allocation of arguments + # it makes semi-global changes => the values of the args here are modified + slide_args(x, type = type, exact = exact, p = p, ssc = ssc) + IS_REQUEST = is.null(x) + + check_value(ssc, "NULL class(ssc.type)", .message = "The argument 'ssc' must be an object created by the function ssc().") + + + # We create the request + use_request = IS_REQUEST + + extra_args = list(exact = exact, p = p) + vcov_request = list(vcov = type, ssc = ssc, extra_args = extra_args) + class(vcov_request) = "fixest_vcov_request" + + + if(IS_REQUEST){ + res = vcov_request + } else { + res = vcov(x, vcov = vcov_request) + } + + res +} + #### #### SETUP #### @@ -1592,7 +1703,7 @@ vcov_setup = function(){ # Heteroskedasticity robust # - vcov_hetero_setup = list(name = c("hetero", "white", "hc1"), fun_name = "vcov_hetero_internal", vcov_label = "Heteroskedasticity-robust") + vcov_hetero_setup = list(name = c("hetero", "white", "hc1", "hc2", "hc3"), fun_name = "vcov_hetero_internal", vcov_label = "Heteroskedasticity-robust") # # cluster(s) @@ -1766,12 +1877,7 @@ vcov_hetero_internal = function(bread, scores, sandwich, ssc, nthreads, ...){ if(!sandwich){ vcov_noAdj = cpppar_crossprod(scores, 1, nthreads) } else { - # we make a n/(n-1) adjustment to match vcovHC(type = "HC1") - - n = nrow(scores) - adj = ifelse(ssc$cluster.adj, n / (n-1), 1) - - vcov_noAdj = cpppar_crossprod(cpppar_matprod(scores, bread, nthreads), 1, nthreads) * adj + vcov_noAdj = cpppar_crossprod(cpppar_matprod(scores, bread, nthreads), 1, nthreads) } vcov_noAdj diff --git a/man/vcov_hetero.Rd b/man/vcov_hetero.Rd new file mode 100644 index 00000000..46cb8dfc --- /dev/null +++ b/man/vcov_hetero.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/VCOV.R +\name{vcov_hetero} +\alias{vcov_hetero} +\title{Heteroskedasticity-Robust VCOV} +\usage{ +vcov_hetero(x, type = "hc1", exact = TRUE, p = 500, ssc = NULL) +} +\arguments{ +\item{x}{A \code{fixest} object.} + +\item{type}{A string scalar. Either "HC1"/"HC2"/"HC3".} + +\item{exact}{A logical scalar. Should the hat matrix be calculated exactly or approximated using the JLA algorithm? Default = TRUE. See \code{\link{hatvalues.fixest}} for details.} + +\item{p}{A numeric scalar. The number of draws to use when approximating the hat matrix. Note that larger \code{p} is more accurate, but slower. This is only used when \code{exact = FALSE}. Default is 500.} + +\item{ssc}{An object returned by the function \code{\link{ssc}}. It specifies how to perform the small sample correction.} +} +\value{ +If the first argument is a \code{fixest} object, then a VCOV is returned (i.e. a symmetric matrix). + +If the first argument is not a \code{fixest} object, then a) implicitly the arguments are shifted to the left (i.e.
\code{vcov_hetero("HC3")} is equivalent to \code{vcov_hetero(type = "HC3")} and b) a VCOV-\emph{request} is returned and NOT a VCOV. That VCOV-request can then be used in the argument \code{vcov} of various \code{fixest} functions (e.g. \code{\link{vcov.fixest}} or even in the estimation calls). +} +\description{ +Computes the heteroskedasticity-robust VCOV of \code{fixest} objects. +} +\examples{ + +base = iris +names(base) = c("y", "x1", "x2", "x3", "species") +base$clu = rep(1:5, 30) + +est = feols(y ~ x1 | species, base) + +vcov_hetero(est, "hc1") +vcov_hetero(est, "hc2", ssc = ssc(adj = FALSE)) +vcov_hetero(est, "hc3", ssc = ssc(adj = FALSE)) + +# Using approximate hatvalues +vcov_hetero(est, "hc3", exact = FALSE, p = 500) + + +} +\references{ +Cameron AC, Gelbach JB, Miller DL (2011). "Robust Inference with Multiway Clustering." \emph{Journal of Business & Economic Statistics}, 29(2), 238-249. doi:10.1198/jbes.2010.07136. +} +\author{ +Laurent Berge and Kyle Butts +} diff --git a/tests/fixest_tests.R b/tests/fixest_tests.R index 2da7d9b2..2e29ee74 100644 --- a/tests/fixest_tests.R +++ b/tests/fixest_tests.R @@ -1140,6 +1140,16 @@ est_feols = feols(y ~ x | grp + tm, data=d) test(se(est_feols, se = "st")["x"], se(est_lm)["x"]) +# +# Heteroskedasticity-robust +# + +se_white_lm_HC1 = sqrt(vcovHC(est_lm, type = "HC1")["x", "x"]) +se_white_lm_HC0 = sqrt(vcovHC(est_lm, type = "HC0")["x", "x"]) + +test(se(est_feols, se = "hetero"), se_white_lm_HC1) +test(se(est_feols, se = "hetero", ssc = ssc(adj = FALSE, cluster.adj = FALSE)), se_white_lm_HC0) + # # Clustered # @@ -1152,16 +1162,6 @@ se_CL_grp_lm_HC0 = sqrt(vcovCL(est_lm, cluster = d$grp, type = "HC0")["x", "x"]) test(se(est_feols, ssc = ssc(fixef.K = "full")), se_CL_grp_lm_HC1) test(se(est_feols, ssc = ssc(adj = FALSE, fixef.K = "full")), se_CL_grp_lm_HC0) -# -# Heteroskedasticity-robust -# - -se_white_lm_HC1 = sqrt(vcovHC(est_lm, type = "HC1")["x", "x"]) -se_white_lm_HC0 = sqrt(vcovHC(est_lm, type = "HC0")["x", "x"]) - -test(se(est_feols, se = "hetero"), se_white_lm_HC1) -test(se(est_feols, se = "hetero", ssc = ssc(adj = FALSE, cluster.adj = FALSE)), se_white_lm_HC0) - # # Two way # @@ -1172,13 +1172,82 @@ se_CL_2w_feols = se(est_feols, se = "twoway") test(se(est_feols, se = "twoway", ssc = ssc(fixef.K = "full", cluster.df = "conv")), se_CL_2w_lm) +# +# HC2/HC3 +# + +# HC2/HC3 +base = iris +base$w = runif(nrow(base)) + +est_feols = feols(Sepal.Length ~ Sepal.Width | Species, base) +est_lm = lm(Sepal.Length ~ Sepal.Width + factor(Species), base) + +test( + vcov(est_feols, "hc2", ssc = ssc(adj = FALSE, cluster.adj = FALSE)), + sandwich::vcovHC(est_lm, "HC2")["Sepal.Width", "Sepal.Width"] +) + +test( + vcov(est_feols, "hc3", ssc = ssc(adj = FALSE, cluster.adj = FALSE)), + sandwich::vcovHC(est_lm, "HC3")["Sepal.Width", "Sepal.Width"] +) + +# HC2/HC3 with weights +base = iris +base$w = runif(nrow(base)) + +est_feols = feols(Sepal.Length ~ Sepal.Width | Species, base, weights = ~ w) +est_lm = lm(Sepal.Length ~ Sepal.Width + factor(Species), base, weights = base$w) + +test( + vcov(est_feols, "hc2", ssc = ssc(adj = FALSE, cluster.adj = FALSE)), + sandwich::vcovHC(est_lm, "HC2")["Sepal.Width", "Sepal.Width"] +) + +test( + vcov(est_feols, "hc3", ssc = ssc(adj = FALSE, cluster.adj = FALSE)), + sandwich::vcovHC(est_lm, "HC3")["Sepal.Width", "Sepal.Width"] +) + +# HC2/HC3 with GLM +base$Sepal.Length = floor(base$Sepal.Length) +est_feglm = feglm( + Sepal.Length ~ Sepal.Width | Species, base, + "poisson", weights = ~ w +) +est_glm = glm( + Sepal.Length 
~ Sepal.Width + factor(Species), base, + family = poisson(), weights = base$w +) + +test( + vcov(est_feglm, "hc2", ssc = ssc(adj = FALSE, cluster.adj = FALSE)), + sandwich::vcovHC(est_glm, "HC2")["Sepal.Width", "Sepal.Width"] +) +test( + vcov(est_feglm, "hc3", ssc = ssc(adj = FALSE, cluster.adj = FALSE)), + sandwich::vcovHC(est_glm, "HC3")["Sepal.Width", "Sepal.Width"] +) + +# Fail when P_ii = 1 +base$Species = as.character(base$Species) +base[1, "Species"] = "foo" + +est_pii_singular = feols(Sepal.Length ~ Sepal.Width | Species, base) + +test( + vcov(est_pii_singular, "hc2"), "err" +) + + # # Checking the calls work properly # data(trade) -est_pois = femlm(Euros ~ log(dist_km)|Origin+Destination, trade) +est_pois = femlm(Euros ~ log(dist_km) | Origin + Destination, trade) se_clust = se(est_pois, se = "cluster", cluster = "Product") test(se(est_pois, cluster = trade$Product), se_clust) From 661a3d86629cb05409b816b4f1abf1d61deda8cb Mon Sep 17 00:00:00 2001 From: Kyle Butts Date: Fri, 16 Jun 2023 18:30:24 -0600 Subject: [PATCH 13/18] `hatvalues.fixest` speedup --- R/Methods.R | 90 +++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 64 insertions(+), 26 deletions(-) diff --git a/R/Methods.R b/R/Methods.R index 1ee41525..a29c331e 100644 --- a/R/Methods.R +++ b/R/Methods.R @@ -3780,55 +3780,93 @@ hatvalues.fixest = function(model, exact = TRUE, p = 1000, ...){ stop("'hatvalues' is currently not implemented for function ", method, ".") } - X = sparse_model_matrix(model, type = c("rhs", "fixef"), collin.rm = TRUE, na.rm = TRUE, combine = TRUE) - + mats = sparse_model_matrix( + model, type = c("rhs", "fixef"), + collin.rm = TRUE, na.rm = TRUE, + combine = FALSE + ) + + # Check for slope.vars and move to rhs + if (!is.null(model$slope_flag)) { + slope_var_cols = which( + grepl("\\[\\[", colnames(mats$fixef)) + ) + mats$rhs = cbind( + mats$rhs, mats$fixef[, slope_var_cols] + ) + mats$fixef = mats$fixef[, -slope_var_cols] + } + if (method == "feols") { weights = model$weights - if (is.null(weights)) weights = 1 + if (is.null(weights)) weights = rep(1, model$nobs) } else { weights = model$irls_weights } - X = X * sqrt(weights) - fe_only = isTRUE(model$onlyFixef) + # Deal with fixed effects + if (!is.null(model$fixef_vars)) { + f = model.matrix(model, type = "fixef") + f = f[, model$fixef_vars] - Xt <- Matrix::t(X) - S_xx = Matrix::crossprod(X) + if (!is.null(mats$rhs)) { + X = demean( + as.matrix(mats$rhs), f, + weights = weights + ) + } + + FEs = mats$fixef * sqrt(weights) + + X = X * sqrt(weights) + + } else if (!is.null(mats$rhs)) { + X = mats$rhs + X = X * sqrt(weights) + + FEs = NULL + } + + P_ii_X = P_ii_FE = 0 + + if (!is.null(model$fixef_vars)) { + # https://stackoverflow.com/questions/39533785/how-to-compute-diagx-solvea-tx-efficiently-without-taking-matrix-i + U = Matrix::chol(Matrix::crossprod(FEs)) + Z = Matrix::solve(Matrix::t(U), Matrix::t(FEs)) + P_ii_FE = Matrix::colSums(Z ^ 2) + } + + if (is.null(mats$rhs)) { + return(P_ii_FE) + } - # If there are only fixed effects, then can do this *super fast* - if (fe_only == TRUE) { - # https://stackoverflow.com/questions/39533785/how-to-compute-diagx-solvea-tx-efficiently-without-taking-matrix-i - U = Matrix::chol(S_xx) - Z = Matrix::solve(Matrix::t(U), Xt) - P_ii = Matrix::colSums(Z ^ 2) - - return(P_ii) - } - # Note: This might fail if looking at too large of a matrix if (exact == TRUE) { + tXX = Matrix::crossprod(X) + # Z = (X'X)^{-1} X' - Z = Matrix::solve(S_xx, Xt) + Z = Matrix::solve(tXX, Matrix::t(X)) # X %*% Z = X 
(X'X)^{-1} X' - # P_ii = diag(X %*% Z) = colSums(X * Z) - P_ii = Matrix::rowSums(X * Matrix::t(Z)) + # P_ii = diag(X %*% Z) = rowSums(X * Z) + P_ii_X = Matrix::rowSums(X * Matrix::t(Z)) - return(P_ii) } if (exact == FALSE) { + tXX = Matrix::crossprod(X) + Xt = Matrix::t(X) n <- nrow(X) # Using speed heuristics based on n*p and trying # Solve for Z = (X'X)^{-1} (X' R_p) if (n * p <= 1000000) { - + # Fastest for small n*p (b/c of my for loop) Rp <- matrix((stats::runif(n * p) > 0.5) * 2 - 1, nrow = p, ncol = n) X_Rp <- Matrix::tcrossprod(Xt, Rp) - Z = Matrix::solve(S_xx, X_Rp) + Z = Matrix::solve(tXX, X_Rp) # P_ii = 1/p || X_i' Z ||^2 P_ii = Matrix::colSums(Matrix::crossprod(Z, Xt)^2) / p @@ -3848,15 +3886,15 @@ hatvalues.fixest = function(model, exact = TRUE, p = 1000, ...){ ) X_Rp_j <- Matrix::tcrossprod(Xt, Rp_j) - Zj <- Matrix::solve(S_xx, X_Rp_j) + Zj <- Matrix::solve(tXX, X_Rp_j) P_ii = P_ii + as.numeric(X %*% Zj)^2 / p } } - return(P_ii) - } + return(P_ii_FE + P_ii_X) + } #### From 869aeee1ac030fe787a217fbf9f358b7f0eef8f4 Mon Sep 17 00:00:00 2001 From: Kyle Butts Date: Sat, 17 Jun 2023 12:14:46 -0600 Subject: [PATCH 14/18] Speedup `hatvalues.fixest` by using block projection --- R/Methods.R | 188 ++++++++++++++++++++++++---------------- man/hatvalues.fixest.Rd | 2 +- 2 files changed, 112 insertions(+), 78 deletions(-) diff --git a/R/Methods.R b/R/Methods.R index a29c331e..911009b6 100644 --- a/R/Methods.R +++ b/R/Methods.R @@ -3731,7 +3731,7 @@ deviance.fixest = function(object, ...){ #' Computes the hat values for [`feols`] or [`feglm`] estimations. #' #' @param model A fixest object. For instance from feols or feglm. -#' @param exact Logical scalar, default is `TRUE`. Whether the diagonals of the projection matrix should be calculated exactly. If `FALSE`, then it will be +#' @param exact Logical scalar, default is `TRUE`. Whether the diagonals of the projection matrix should be calculated exactly. If `FALSE`, then it will be approximated using a JLA algorithm. See details. Unless you have a very large number of observations, it is recommended to keep the default value. #' @param p Numeric scala, default is 1000. This is only used when `exact == FALSE`. This determined the number of bootstrap samples used to estimate the projection matrix. #' @param ... Not currently used. #' @@ -3762,12 +3762,11 @@ hatvalues.fixest = function(model, exact = TRUE, p = 1000, ...){ # Only works for feglm/feols objects if(isTRUE(model$lean)){ - # LATER: recompute it stop("The method 'hatvalues.fixest' cannot be applied to 'lean' fixest objects. 
Please re-estimate with 'lean = FALSE'.") } if (!is.null(model$iv)) { - stop("The method 'hatvalues.fixest' cannot be applied to ") + stop("The method 'hatvalues.fixest' cannot be applied to IV models.") } if(is_user_level_call()){ @@ -3780,121 +3779,156 @@ hatvalues.fixest = function(model, exact = TRUE, p = 1000, ...){ stop("'hatvalues' is currently not implemented for function ", method, ".") } - mats = sparse_model_matrix( - model, type = c("rhs", "fixef"), + # Outline: Blockwise Formula for Projection Matrix + # https://en.wikipedia.org/wiki/Projection_matrix#Blockwise_formula + # Part 1: fixed effects + # Part 2: slope vars (demeaned by fixed effects) + # Part 3: rhs vars (demeaned by fixed effects and slope vars) + # Sum the three parts to get the projection matrix + P_ii_list = list() + mats = list() + mats$fixef = sparse_model_matrix( + model, type = c("fixef"), collin.rm = TRUE, na.rm = TRUE, - combine = FALSE + sortCols = FALSE ) - # Check for slope.vars and move to rhs + # Check for slope.vars and move to seperate matrix if (!is.null(model$slope_flag)) { slope_var_cols = which( grepl("\\[\\[", colnames(mats$fixef)) ) - mats$rhs = cbind( - mats$rhs, mats$fixef[, slope_var_cols] - ) + mats$slope_vars = mats$fixef[, slope_var_cols] mats$fixef = mats$fixef[, -slope_var_cols] } + # Get weights if (method == "feols") { weights = model$weights - if (is.null(weights)) weights = rep(1, model$nobs) } else { weights = model$irls_weights } - - # Deal with fixed effects + if (!is.null(model$fixef_vars)) { f = model.matrix(model, type = "fixef") f = f[, model$fixef_vars] - - if (!is.null(mats$rhs)) { - X = demean( - as.matrix(mats$rhs), f, + + # Matrix of fixed effects times weights + if (!is.null(weights)) mats$fixef = mats$fixef * sqrt(weights) + + # Demean X by FEs and slope_vars + if (is.null(model$onlyFixef) & method == "feols") { + mats$rhs = demean(model)[, -1, drop = FALSE] + if (!is.null(weights)) mats$rhs = mats$rhs * sqrt(weights) + } + if (is.null(model$onlyFixef) & method == "feglm") { + mats$rhs = demean( + model.matrix(model, "rhs"), + f, weights = weights ) + if (!is.null(weights)) mats$rhs = mats$rhs * sqrt(weights) } - - FEs = mats$fixef * sqrt(weights) - X = X * sqrt(weights) - - } else if (!is.null(mats$rhs)) { - X = mats$rhs - X = X * sqrt(weights) + # Demean slope_vars by FEs + if (!is.null(mats$slope_vars)) { + mats$slope_vars = demean( + as.matrix(mats$slope_vars), + f, + weights = weights + ) + if (!is.null(weights)) mats$slope_vars = mats$slope_vars * sqrt(weights) + } - FEs = NULL + } else if (is.null(model$onlyFixef)) { + mats$rhs = sparse_model_matrix( + model, type = c("rhs"), + collin.rm = TRUE, na.rm = TRUE + ) + + if (!is.null(weights)) mats$rhs = mats$rhs * sqrt(weights) } - - P_ii_X = P_ii_FE = 0 - if (!is.null(model$fixef_vars)) { - # https://stackoverflow.com/questions/39533785/how-to-compute-diagx-solvea-tx-efficiently-without-taking-matrix-i - U = Matrix::chol(Matrix::crossprod(FEs)) - Z = Matrix::solve(Matrix::t(U), Matrix::t(FEs)) - P_ii_FE = Matrix::colSums(Z ^ 2) - } - if (is.null(mats$rhs)) { - return(P_ii_FE) + # Caclulate P_ii's + if (!is.null(mats$fixef)) { + # https://stackoverflow.com/questions/39533785/how-to-compute-diagx-solvea-tx-efficiently-without-taking-matrix-i + U = Matrix::chol(Matrix::crossprod(mats$fixef)) + Z = Matrix::solve(Matrix::t(U), Matrix::t(mats$fixef)) + P_ii_list$fixef = Matrix::colSums(Z ^ 2) } # Note: This might fail if looking at too large of a matrix if (exact == TRUE) { - tXX = Matrix::crossprod(X) - - # Z = 
(X'X)^{-1} X' - Z = Matrix::solve(tXX, Matrix::t(X)) - - # X %*% Z = X (X'X)^{-1} X' - # P_ii = diag(X %*% Z) = rowSums(X * Z) - P_ii_X = Matrix::rowSums(X * Matrix::t(Z)) + if (!is.null(mats$slope_vars)) { + Z = Matrix::solve( + Matrix::crossprod(mats$slope_vars), + Matrix::t(mats$slope_vars) + ) + # P_ii = diag(X %*% Z) = rowSums(X * Z) + P_ii_list$slope_vars = Matrix::rowSums( + mats$slope_vars * Matrix::t(Z) + ) + } + if (!is.null(mats$rhs)) { + tXXinv = model$cov.iid + if (method == "feols") tXXinv = tXXinv / model$sigma2 + + P_ii_list$rhs = Matrix::rowSums( + mats$rhs * Matrix::t(tXXinv %*% Matrix::t(mats$rhs)) + ) + } } + # Theory: https://cs-people.bu.edu/evimaria/cs565/achlioptas.pdf if (exact == FALSE) { + n = model$nobs - tXX = Matrix::crossprod(X) - Xt = Matrix::t(X) - n <- nrow(X) - - # Using speed heuristics based on n*p and trying - # Solve for Z = (X'X)^{-1} (X' R_p) - if (n * p <= 1000000) { - - # Fastest for small n*p (b/c of my for loop) - Rp <- matrix((stats::runif(n * p) > 0.5) * 2 - 1, nrow = p, ncol = n) - X_Rp <- Matrix::tcrossprod(Xt, Rp) - Z = Matrix::solve(tXX, X_Rp) - - # P_ii = 1/p || X_i' Z ||^2 - P_ii = Matrix::colSums(Matrix::crossprod(Z, Xt)^2) / p + if (!is.null(mats$slope_vars)) { + P_ii_list$slope_vars = rep(0, n) + tSS = Matrix::crossprod(mats$slope_vars) + tS = Matrix::t(mats$slope_vars) + } + if (!is.null(mats$rhs)) { + P_ii_list$rhs = rep(0, n) + tXX = Matrix::crossprod(mats$rhs) + tX = Matrix::t(mats$rhs) + } - } else { - # Slightly slower, but very memory efficient - # Theory: https://cs-people.bu.edu/evimaria/cs565/achlioptas.pdf - P_ii = rep(0, n) - - for (j in 1:p) { - # Rademacher Weights (-1, 1) - Rp_j <- matrix( - (stats::runif(n) > 0.5) * 2 - 1, - # Uncomment for speed-up - # rademacher::sample_rademacher(n), - nrow = 1, ncol = n - ) - X_Rp_j <- Matrix::tcrossprod(Xt, Rp_j) + for (j in 1:p) { + # Rademacher Weights (-1, 1) + Rp_j <- matrix( + (stats::runif(n) > 0.5) * 2 - 1, + # Uncomment for speed-up + # rademacher::sample_rademacher(n), + nrow = 1, ncol = n + ) + + + if (!is.null(mats$slope_vars)) { + tS_Rp_j <- Matrix::tcrossprod(tS, Rp_j) - Zj <- Matrix::solve(tXX, X_Rp_j) - P_ii = P_ii + as.numeric(X %*% Zj)^2 / p - } + Zj <- Matrix::solve(tSS, tS_Rp_j) + P_ii_list$slope_vars = + P_ii_list$slope_vars + + as.numeric(mats$slope_vars %*% Zj)^2 / p } - } + if (!is.null(mats$rhs)) { + tX_Rp_j <- Matrix::tcrossprod(tX, Rp_j) - return(P_ii_FE + P_ii_X) + Zj <- Matrix::solve(tXX, tX_Rp_j) + P_ii_list$rhs = + P_ii_list$rhs + + as.numeric(mats$rhs %*% Zj)^2 / p + } + } + } + P_ii = Reduce("+", P_ii_list) + + return(P_ii) } #### diff --git a/man/hatvalues.fixest.Rd b/man/hatvalues.fixest.Rd index cbd33da5..e9919fc2 100644 --- a/man/hatvalues.fixest.Rd +++ b/man/hatvalues.fixest.Rd @@ -9,7 +9,7 @@ \arguments{ \item{model}{A fixest object. For instance from feols or feglm.} -\item{exact}{Logical scalar, default is \code{TRUE}. Whether the diagonals of the projection matrix should be calculated exactly. If \code{FALSE}, then it will be} +\item{exact}{Logical scalar, default is \code{TRUE}. Whether the diagonals of the projection matrix should be calculated exactly. If \code{FALSE}, then it will be approximated using a JLA algorithm. See details. Unless you have a very large number of observations, it is recommended to keep the default value.} \item{p}{Numeric scala, default is 1000. This is only used when \code{exact == FALSE}. 
This determined the number of bootstrap samples used to estimate the projection matrix.} From d915ef75e3272ac2900b7ec1940239d4807fdc50 Mon Sep 17 00:00:00 2001 From: Kyle Butts Date: Wed, 9 Aug 2023 08:53:14 -0400 Subject: [PATCH 15/18] Work with data.table --- R/sparse_model_matrix.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/sparse_model_matrix.R b/R/sparse_model_matrix.R index 322c1462..6ac8a98c 100644 --- a/R/sparse_model_matrix.R +++ b/R/sparse_model_matrix.R @@ -84,7 +84,7 @@ sparse_model_matrix = function(object, data, type = "rhs", na.rm = TRUE, collin stop("The argument 'data' must be a data.frame or a matrix.") } - data = as.data.frame(data) + # data = as.data.frame(data) # Allows passage of formula to sparse_model_matrix. A bit inefficient, but it works. isFormula = FALSE From 2b2323f3a49a0c3745ac4b8457fd178c6d87d6a9 Mon Sep 17 00:00:00 2001 From: Kyle Butts Date: Wed, 9 Aug 2023 08:55:13 -0400 Subject: [PATCH 16/18] Work with data.table (pt. 2) --- R/Methods.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/Methods.R b/R/Methods.R index 911009b6..81f1b0b5 100644 --- a/R/Methods.R +++ b/R/Methods.R @@ -3811,7 +3811,7 @@ hatvalues.fixest = function(model, exact = TRUE, p = 1000, ...){ if (!is.null(model$fixef_vars)) { f = model.matrix(model, type = "fixef") - f = f[, model$fixef_vars] + f = subset(f, select = model$fixef_vars) # Matrix of fixed effects times weights if (!is.null(weights)) mats$fixef = mats$fixef * sqrt(weights) From dabb4484fa1c5df19f22fcef15b0ed949286d6c3 Mon Sep 17 00:00:00 2001 From: Kyle Butts Date: Thu, 9 May 2024 12:59:21 -0400 Subject: [PATCH 17/18] Fix documentation because formula is valid --- R/sparse_model_matrix.R | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/R/sparse_model_matrix.R b/R/sparse_model_matrix.R index 6ac8a98c..73b8242c 100644 --- a/R/sparse_model_matrix.R +++ b/R/sparse_model_matrix.R @@ -1,13 +1,13 @@ #' Design matrix of a `fixest` object returned in sparse format #' -#' This function creates the left-hand-side or the right-hand-side(s) of a [`femlm`], [`feols`] or [`feglm`] estimation. Note that this function currently does not accept a formula +#' This function creates the left-hand-side or the right-hand-side(s) of a [`femlm`], [`feols`] or [`feglm`] estimation. #' #' @inheritParams nobs.fixest #' #' @param data If missing (default) then the original data is obtained by evaluating the `call`. Otherwise, it should be a `data.frame`. #' @param type Character vector or one sided formula, default is "rhs". Contains the type of matrix/data.frame to be returned. Possible values are: "lhs", "rhs", "fixef", "iv.rhs1" (1st stage RHS), "iv.rhs2" (2nd stage RHS), "iv.endo" (endogenous vars.), "iv.exo" (exogenous vars), "iv.inst" (instruments). #' @param na.rm Default is `TRUE`. Should observations with NAs be removed from the matrix? -#' @param collin.rm Logical scalar, default is `TRUE`. Whether to remove variables that were found to be collinear during the estimation. Beware: it does not perform a collinearity check. +#' @param collin.rm Logical scalar. Whether to remove variables that were found to be collinear during the estimation. Beware: it does not perform a collinearity check and bases on the `coef(object)`. Default is TRUE if object is a `fixest` object, or FALSE if object is a formula. #' @param combine Logical scalar, default is `TRUE`. Whether to combine each resulting sparse matrix #' @param ... Not currently used. 
#' @@ -25,10 +25,10 @@ #' #' est = feols(wt ~ i(vs) + hp | cyl, mtcars) #' sparse_model_matrix(est) -#' +#' sparse_model_matrix(wt ~ i(vs) + hp | cyl, mtcars) #' #' @export -sparse_model_matrix = function(object, data, type = "rhs", na.rm = TRUE, collin.rm = TRUE, combine = TRUE, ...) { +sparse_model_matrix = function(object, data, type = "rhs", na.rm = TRUE, collin.rm = NULL, combine = TRUE, ...) { # We evaluate the formula with the past call # type: lhs, rhs, fixef, iv.endo, iv.inst, iv.rhs1, iv.rhs2 # if fixef => return a DF @@ -53,7 +53,7 @@ sparse_model_matrix = function(object, data, type = "rhs", na.rm = TRUE, collin type = check_set_types(type, c("lhs", "rhs", "fixef", "iv.endo", "iv.inst", "iv.exo", "iv.rhs1", "iv.rhs2")) if (isTRUE(object$is_fit)) { - stop("model.matrix method not available for fixest estimations obtained from fit methods.") + stop("sparse_model_matrix method not available for fixest estimations obtained from fit methods.") } if (any(grepl("^iv", type)) && !isTRUE(object$iv)) { @@ -62,13 +62,16 @@ sparse_model_matrix = function(object, data, type = "rhs", na.rm = TRUE, collin check_arg(subset, "logical scalar | character vector no na") - check_arg_plus(collin.rm, "logical scalar") + if (missing(collin.rm)) { + collin.rm = if (inherits(object, "formula")) FALSE else TRUE + } else { + check_arg_plus(collin.rm, "logical scalar") + } # Evaluation with the data original_data = FALSE if (missnull(data)) { original_data = TRUE - data = fetch_data(object, "To apply 'sparse_model_matrix', ") } @@ -84,16 +87,17 @@ sparse_model_matrix = function(object, data, type = "rhs", na.rm = TRUE, collin stop("The argument 'data' must be a data.frame or a matrix.") } - # data = as.data.frame(data) # Allows passage of formula to sparse_model_matrix. A bit inefficient, but it works. isFormula = FALSE split_fml = NULL - if ("formula" %in% class(object)) { + if (inherits(object, "formula")) { split_fml = fml_split_internal(object) - - if (collin.rm == TRUE | length(split_fml) == 3) { - message("Formula passed to sparse_model_matrix with collin.rm == TRUE or iv. Estimating feols with formula.") + if (isTRUE(collin.rm)) { + message("Formula passed to sparse_model_matrix with collin.rm == TRUE. Estimating feols with formula.") + object = feols(object, data = data) + } else if (length(split_fml) == 3) { + message("Formula passed to sparse_model_matrix with an iv formula. Estimating feols with formula.") object = feols(object, data = data) } else { isFormula = TRUE @@ -481,7 +485,7 @@ mult_sparse = function(...) { is_factor = TRUE factor_list[[length(factor_list) + 1]] = xi } - } + } # numeric if (!is_i && !is_factor) { @@ -506,7 +510,7 @@ mult_sparse = function(...) 
{ if (!is.null(num_var)) { num_var = num_var[info_i$rowid] values = values * num_var - } + } res = list(rowid = info_i$rowid, colid = info_i$colid, values = values[info_i$rowid], @@ -675,8 +679,8 @@ vars_to_sparse_mat = function(vars, data, collin.rm = FALSE, object = NULL, type } if (add_intercept) { - mat = cbind(1, mat) - colnames(mat)[1] = "(Intercept)" + mat = cbind(1, mat) + colnames(mat)[1] = "(Intercept)" } return(mat) From e673e2d25ee8a85ed98bf9a0bcd5406ac3d4fc9d Mon Sep 17 00:00:00 2001 From: Kyle Butts Date: Thu, 9 May 2024 12:59:48 -0400 Subject: [PATCH 18/18] Fix names of `sparse_model_matrix` with test coverage --- R/sparse_model_matrix.R | 43 ++++++++++++++++++++++++++++++----------- tests/fixest_tests.R | 18 +++++++++++++++++ 2 files changed, 50 insertions(+), 11 deletions(-) diff --git a/R/sparse_model_matrix.R b/R/sparse_model_matrix.R index 73b8242c..57bd3df0 100644 --- a/R/sparse_model_matrix.R +++ b/R/sparse_model_matrix.R @@ -466,25 +466,36 @@ mult_sparse = function(...) { dots = list(...) n = length(dots) + mc = match.call() + var_call = sapply(seq_len(n), function(i) deparse_long(mc[[i+1]])) num_var = NULL factor_list = list() + factor_idx = c() + i_idx = c() info_i = NULL is_i = is_factor = FALSE # You can't have interactions between i and factors, it's either for (i in 1:n) { xi = dots[[i]] + vari = mc[[i+1]] if (is.numeric(xi)) { # We stack the product num_var = if (is.null(num_var)) xi else xi * num_var } else if (inherits(xi, "i_sparse")) { is_i = TRUE info_i = xi + i_idx = c(i_idx, i) } else { is_factor = TRUE factor_list[[length(factor_list) + 1]] = xi + factor_idx = c(factor_idx, i) } + } + + if (is_factor && is_i) { + stop("Unfortunately, can not interact factor with `i()` in sparse_model_matrix") } # numeric @@ -495,14 +506,24 @@ mult_sparse = function(...) { if (is_factor) { factor_list$add_items = TRUE factor_list$items.list = TRUE - + factor_list$multi.join = " ; " fact_as_int = do.call(to_integer, factor_list) values = if (is.null(num_var)) rep(1, length(fact_as_int$x)) else num_var - rowid = seq_along(values) + + # messy, but need this to get things like `factor(am)1:hp:factor(cyl)6` + items_mat = do.call(rbind, strsplit(fact_as_int$items, " ; ")) + + # intersplice var_call with items + col_names = sapply(seq_len(nrow(items_mat)), function(i) { + pasteable = var_call + pasteable[factor_idx] = paste0(var_call[factor_idx], items_mat[i, ]) + paste(pasteable, collapse = ":") + }, USE.NAMES = FALSE) + res = list(rowid = rowid, colid = fact_as_int$x, values = values, - col_names = fact_as_int$items, n_cols = length(fact_as_int$items)) + col_names = col_names, n_cols = length(fact_as_int$items)) # i() } else { @@ -510,11 +531,18 @@ mult_sparse = function(...) { if (!is.null(num_var)) { num_var = num_var[info_i$rowid] values = values * num_var + col_names = sapply(info_i$col_names, function(item) { + pasteable = var_call + pasteable[i_idx] = item + paste(pasteable, collapse = ":") + }, USE.NAMES = FALSE) + } else { + col_names = info_i$col_names } res = list(rowid = info_i$rowid, colid = info_i$colid, values = values[info_i$rowid], - col_names = info_i$col_names, + col_names = col_names, n_cols = length(info_i$col_names)) } @@ -608,13 +636,6 @@ vars_to_sparse_mat = function(vars, data, collin.rm = FALSE, object = NULL, type } - # TODO: Should I use error_sender? 
- # mat = error_sender(fixest_model_matrix_extra( - # object = object, newdata = data, original_data = original_data, - # fml = fml, fake_intercept = fake_intercept, - # subset = subset), - # "In 'model.matrix', the RHS could not be evaluated: ") - if (collin.rm & is.null(object)) { stop("You need to provide the 'object' argument to use 'collin.rm = TRUE'.") } diff --git a/tests/fixest_tests.R b/tests/fixest_tests.R index 2e29ee74..06694aff 100644 --- a/tests/fixest_tests.R +++ b/tests/fixest_tests.R @@ -2330,6 +2330,24 @@ res_pois = fepois(Sepal.Length ~ Sepal.Width + Petal.Length | Species, iris) sp_pois = sparse_model_matrix(res_pois, data = iris) +# make sure names are correct +test_col_names <- function(est) { + m <- fixest::sparse_model_matrix(est, collin.rm = FALSE) + q <- test(all(names(coef(est)) %in% colnames(m)), TRUE) + return(invisible(NULL)) +} +test_col_names(feols(mpg ~ i(am) + disp | vs, mtcars)) +test_col_names(feols(mpg ~ i(am, i.cyl) + disp | vs, mtcars)) +test_col_names(feols(mpg ~ i(am, hp) + disp | vs, mtcars)) +test_col_names(feols(mpg ~ i(am):hp + disp | vs, mtcars)) +test_col_names(feols(mpg ~ factor(am) + disp | vs, mtcars)) +test_col_names(feols(mpg ~ factor(am):factor(cyl) + disp | vs, mtcars, notes = FALSE)) +test_col_names(feols(mpg ~ factor(am):hp + disp | vs, mtcars)) +test_col_names(feols(mpg ~ poly(hp, degree = 2) + disp | vs, mtcars)) + + + + #### #### fitstat #### ####
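A minimal standalone R sketch (not part of the patches above, toy data and object names only) of the Rademacher-probe idea behind the `exact = FALSE` branch of `hatvalues.fixest`: for a projection matrix P = X (X'X)^{-1} X', E[(P r)_i^2] = P_ii when r has iid +/- 1 entries, so averaging over p draws approximates the leverages.

# Sketch only, under the assumptions stated in the preceding paragraph.
library(Matrix)

set.seed(1)
n = 500
X = Matrix(cbind(1, matrix(rnorm(n * 4), n, 4)), sparse = TRUE)
tXX = crossprod(X)

p = 200
P_ii_approx = numeric(n)
for (j in seq_len(p)) {
    r = (runif(n) > 0.5) * 2 - 1                 # Rademacher weights
    z = solve(tXX, crossprod(X, r))              # (X'X)^{-1} X' r
    P_ii_approx = P_ii_approx + as.numeric(X %*% z)^2 / p
}

# Exact leverages for comparison
P_ii_exact = stats::hat(as.matrix(X), intercept = FALSE)
summary(P_ii_approx - P_ii_exact)                # small, mean-zero approximation error

Larger p shrinks the approximation error at a linear cost in time, which is the trade-off the `p` argument exposes.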
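The HC2/HC3 additions to `vcov.fixest` amount to rescaling the scores by `sqrt(1 - P_ii)` or `(1 - P_ii)`. A small sketch of that equivalence on a plain `lm()` fit, cross-checked against `sandwich::vcovHC()`; the helper `meat()` is illustrative and not a fixest internal.

# Sketch only: HC2/HC3 as leverage-rescaled sandwich estimators.
library(sandwich)

fit = lm(mpg ~ wt + hp, mtcars)
X = model.matrix(fit)
u = residuals(fit)
h = hatvalues(fit)
bread = solve(crossprod(X))

meat = function(w) crossprod(X * (u * w))    # sum_i (w_i u_i)^2 x_i x_i'
vcov_hc2 = bread %*% meat(1 / sqrt(1 - h)) %*% bread
vcov_hc3 = bread %*% meat(1 / (1 - h)) %*% bread

all.equal(vcov_hc2, vcovHC(fit, type = "HC2"), check.attributes = FALSE)
all.equal(vcov_hc3, vcovHC(fit, type = "HC3"), check.attributes = FALSE)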