diff --git a/.github/workflows/check-standard.yml b/.github/workflows/check-standard.yml index 1692193e..2a94e1f1 100644 --- a/.github/workflows/check-standard.yml +++ b/.github/workflows/check-standard.yml @@ -34,13 +34,13 @@ jobs: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - - uses: r-lib/actions/setup-r@v1 + - uses: r-lib/actions/setup-r@v2 with: r-version: ${{ matrix.config.r }} - - uses: r-lib/actions/setup-pandoc@v1 + - uses: r-lib/actions/setup-pandoc@v2 - name: Query dependencies run: | diff --git a/DESCRIPTION b/DESCRIPTION index 97ffa093..41499dd9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -15,7 +15,7 @@ Authors@R: role = "ctb", email = "grantmcd@uoregon.edu", comment = c(ORCID = "0000-0001-7883-8573"))) -Imports: stats, graphics, grDevices, tools, utils, methods, numDeriv, nlme, sandwich, Rcpp(>= 1.0.5), dreamerr(>= 1.2.3) +Imports: stats, graphics, grDevices, tools, utils, methods, numDeriv, nlme, sandwich, Rcpp(>= 1.0.5), dreamerr(>= 1.2.3), Matrix Suggests: knitr, rmarkdown, data.table, plm, MASS, pander, ggplot2, lfe, tinytex, pdftools LinkingTo: Rcpp Depends: R(>= 3.5.0) @@ -28,6 +28,6 @@ URL: https://lrberge.github.io/fixest/, https://github.com/lrberge/fixest SystemRequirements: C++11 VignetteBuilder: knitr LazyData: true -RoxygenNote: 7.2.0 +RoxygenNote: 7.2.3 Roxygen: list(markdown = TRUE) Encoding: UTF-8 diff --git a/NAMESPACE b/NAMESPACE index f008a373..f7a9612f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -36,6 +36,8 @@ export(to_integer, demean, obs) export(as.dict) # setters & getters exportPattern("^(s|g)etFixest") +# sparse_model_matrix +export(sparse_model_matrix) # coeftable and co export(coeftable, se, pvalue, tstat) @@ -54,7 +56,7 @@ S3method(tstat, fixest) S3method(tstat, fixest_multi) # VCOV funs -export(vcov_cluster, vcov_DK, vcov_NW, vcov_conley) +export(vcov_cluster, vcov_DK, vcov_NW, vcov_conley, vcov_hetero) # Data manipulation export(n_unik, osize, sample_df, fdim) diff --git a/NEWS.md b/NEWS.md index 5ff6c1f4..6bf329ea 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,40 @@ # fixest 0.11.1 +## Sparse Model Matrix + +- Add the ability to create sparse model matrices from regression objects and formula in a memory-efficient way. + +```R +est = feols(mpg ~ drat | cyl, mtcars) + +sparse_model_matrix(est, type = "lhs") +#> 32 x 1 sparse Matrix of class "dgCMatrix" +#> mpg +#> [1,] 21.0 +#> [2,] 21.0 +#> [3,] 22.8 + +sparse_model_matrix(est, type = c("rhs", "fixef")) +#> 32 x 4 sparse Matrix of class "dgCMatrix" +#> drat cyl::4 cyl::6 cyl::8 +#> [1,] 3.90 . 1 . +#> [2,] 3.90 . 1 . + +# Or you can pass a (linear) formula +sparse_model_matrix(mpg ~ i(vs) | gear^cyl, data = mtcars, type = c("rhs", "fixef")) +#> 32 x 9 sparse Matrix of class "dgCMatrix" +#> vs::1 gear^cyl::3_4 gear^cyl::3_6 gear^cyl::3_8 gear^cyl::4_4 gear^cyl::4_6 gear^cyl::5_4 gear^cyl::5_6 gear^cyl::5_8 +#> [1,] . . . . . 1 . . . +#> [2,] . . . . . 1 . . . +#> [3,] 1 . . . 1 . . . . +``` + + +- `hatvalues` now works with fixed effects. It also allows for approximate computation (via `exact` and `p` arguments). For details on the approximation, see [Kline, Saggio, and Sølvsten 2020](https://eml.berkeley.edu/~pkline/papers/KSS2020.pdf). + +- Add `HC2` and `HC3` options to `vcov` for `feols` and `feglm` models. With this gets added the `vcov_hetero` function. + ## Documentation bug - fix bug in the help of `bin` @@ -1021,7 +1055,7 @@ Common methods have been extended to `fixest_multi` objects. - Improve error messages. - - `hatvalues.fixest`: now returns an error instead of a message when fixed-effects are present (it makes the interplay with `sadnwich` 'nicer'). + - `hatvalues.fixest`: now returns an error instead of a message when fixed-effects are present (it makes the interplay with `sandwich` 'nicer'). # fixest 0.8.4 diff --git a/R/Methods.R b/R/Methods.R index c7128568..81f1b0b5 100644 --- a/R/Methods.R +++ b/R/Methods.R @@ -338,7 +338,7 @@ print.fixest = function(x, n, type = "table", fitstat = NULL, ...){ #' @param forceCovariance (Advanced users.) Logical, default is `FALSE`. In the peculiar case where the obtained Hessian is not invertible (usually because of collinearity of some variables), use this option to force the covariance matrix, by using a generalized inverse of the Hessian. This can be useful to spot where possible problems come from. #' @param keepBounded (Advanced users -- `feNmlm` with non-linear part and bounded coefficients only.) Logical, default is `FALSE`. If `TRUE`, then the bounded coefficients (if any) are treated as unrestricted coefficients and their S.E. is computed (otherwise it is not). #' @param n Integer, default is 1000. Number of coefficients to display when the print method is used. -#' @param ... Only used if the argument `.vocv` is provided and is a function: extra arguments to be passed to that function. +#' @param ... Only used if the argument `.vcov` is provided and is a function: extra arguments to be passed to that function. #' #' @section Compatibility with \pkg{sandwich} package: #' The VCOVs from `sandwich` can be used with `feols`, `feglm` and `fepois` estimations. If you want to have a `sandwich` VCOV when using `summary.fixest`, you can use the argument `vcov` to specify the VCOV function to use (see examples). @@ -3728,15 +3728,26 @@ deviance.fixest = function(object, ...){ #' Hat values for `fixest` objects #' -#' Computes the hat values for [`feols`] or [`feglm`] estimations. Only works when there are no fixed-effects. +#' Computes the hat values for [`feols`] or [`feglm`] estimations. #' #' @param model A fixest object. For instance from feols or feglm. +#' @param exact Logical scalar, default is `TRUE`. Whether the diagonals of the projection matrix should be calculated exactly. If `FALSE`, then it will be approximated using a JLA algorithm. See details. Unless you have a very large number of observations, it is recommended to keep the default value. +#' @param p Numeric scala, default is 1000. This is only used when `exact == FALSE`. This determined the number of bootstrap samples used to estimate the projection matrix. #' @param ... Not currently used. #' #' @details #' Hat values are not available for [`fenegbin`][fixest::femlm], [`femlm`] and [`feNmlm`] estimations. -#' -#' When there are fixed-effects, the hat values of the reduced form are different from the hat values of the full model. And we cannot get costlessly the hat values of the full model from the reduced form. It would require to reestimate the model with the fixed-effects as regular variables. +#' +#' Hat values for generalized linear model are disussed in Belsley, Kuh and Welsch (1980), Cook and Weisberg (1982), etc. +#' +#' When `exact == FALSE`, the Johnson-Lindenstrauss approximation (JLA) algorithm is used which approximates the diagonals of the projection matrix. For more precision (but longer time), increase the value of `p`. See Kline, Saggio, and Sølvsten (2020) for details. +#' +#' +#' @references +#' Belsley, D. A., Kuh, E. and Welsch, R. E. (1980). *Regression Diagnostics*. New York: Wiley. +#' Cook, R. D. and Weisberg, S. (1982). *Residuals and Influence in Regression*. London: Chapman and Hall. +#' Kline, P., Saggio R., and Sølvsten, M. (2020). *Leave‐Out Estimation of Variance Components*. Econometrica. +#' #' #' @return #' Returns a vector of the same length as the number of observations used in the estimation. @@ -3747,46 +3758,177 @@ deviance.fixest = function(object, ...){ #' head(hatvalues(est)) #' #' -hatvalues.fixest = function(model, ...){ - # Only works for feglm/feols objects + no fixed-effects - # When there are fixed-effects the hatvalues of the reduced form is different from - # the hatvalues of the full model. And we cannot get costlessly the hatvalues of the full - # model from the reduced form. => we need to reestimate the model with the FEs as - # regular variables. +hatvalues.fixest = function(model, exact = TRUE, p = 1000, ...){ + # Only works for feglm/feols objects if(isTRUE(model$lean)){ - # LATER: recompute it stop("The method 'hatvalues.fixest' cannot be applied to 'lean' fixest objects. Please re-estimate with 'lean = FALSE'.") } + if (!is.null(model$iv)) { + stop("The method 'hatvalues.fixest' cannot be applied to IV models.") + } + if(is_user_level_call()){ validate_dots() } method = model$method_type - family = model$family - msg = "hatvalues.fixest: 'hatvalues' is not implemented for estimations with fixed-effects." - - # An error is in fact nicer than a message + NA return due to the interplay with sandwich - if(!is.null(model$fixef_id)){ - stop(msg) - } - - if(method == "feols"){ - X = model.matrix(model) - - res = cpp_diag_XUtX(X, model$cov.iid / model$sigma2) - - } else if(method == "feglm"){ - XW = model.matrix(model) * sqrt(model$irls_weights) - res = cpp_diag_XUtX(XW, model$cov.iid) - - } else { + if (!(method %in% c("feols", "feglm"))) { stop("'hatvalues' is currently not implemented for function ", method, ".") } - res + # Outline: Blockwise Formula for Projection Matrix + # https://en.wikipedia.org/wiki/Projection_matrix#Blockwise_formula + # Part 1: fixed effects + # Part 2: slope vars (demeaned by fixed effects) + # Part 3: rhs vars (demeaned by fixed effects and slope vars) + # Sum the three parts to get the projection matrix + P_ii_list = list() + mats = list() + mats$fixef = sparse_model_matrix( + model, type = c("fixef"), + collin.rm = TRUE, na.rm = TRUE, + sortCols = FALSE + ) + + # Check for slope.vars and move to seperate matrix + if (!is.null(model$slope_flag)) { + slope_var_cols = which( + grepl("\\[\\[", colnames(mats$fixef)) + ) + mats$slope_vars = mats$fixef[, slope_var_cols] + mats$fixef = mats$fixef[, -slope_var_cols] + } + + # Get weights + if (method == "feols") { + weights = model$weights + } else { + weights = model$irls_weights + } + + if (!is.null(model$fixef_vars)) { + f = model.matrix(model, type = "fixef") + f = subset(f, select = model$fixef_vars) + + # Matrix of fixed effects times weights + if (!is.null(weights)) mats$fixef = mats$fixef * sqrt(weights) + + # Demean X by FEs and slope_vars + if (is.null(model$onlyFixef) & method == "feols") { + mats$rhs = demean(model)[, -1, drop = FALSE] + if (!is.null(weights)) mats$rhs = mats$rhs * sqrt(weights) + } + if (is.null(model$onlyFixef) & method == "feglm") { + mats$rhs = demean( + model.matrix(model, "rhs"), + f, + weights = weights + ) + if (!is.null(weights)) mats$rhs = mats$rhs * sqrt(weights) + } + + # Demean slope_vars by FEs + if (!is.null(mats$slope_vars)) { + mats$slope_vars = demean( + as.matrix(mats$slope_vars), + f, + weights = weights + ) + if (!is.null(weights)) mats$slope_vars = mats$slope_vars * sqrt(weights) + } + + } else if (is.null(model$onlyFixef)) { + mats$rhs = sparse_model_matrix( + model, type = c("rhs"), + collin.rm = TRUE, na.rm = TRUE + ) + + if (!is.null(weights)) mats$rhs = mats$rhs * sqrt(weights) + } + + + # Caclulate P_ii's + if (!is.null(mats$fixef)) { + # https://stackoverflow.com/questions/39533785/how-to-compute-diagx-solvea-tx-efficiently-without-taking-matrix-i + U = Matrix::chol(Matrix::crossprod(mats$fixef)) + Z = Matrix::solve(Matrix::t(U), Matrix::t(mats$fixef)) + P_ii_list$fixef = Matrix::colSums(Z ^ 2) + } + + # Note: This might fail if looking at too large of a matrix + if (exact == TRUE) { + if (!is.null(mats$slope_vars)) { + Z = Matrix::solve( + Matrix::crossprod(mats$slope_vars), + Matrix::t(mats$slope_vars) + ) + # P_ii = diag(X %*% Z) = rowSums(X * Z) + P_ii_list$slope_vars = Matrix::rowSums( + mats$slope_vars * Matrix::t(Z) + ) + } + + if (!is.null(mats$rhs)) { + tXXinv = model$cov.iid + if (method == "feols") tXXinv = tXXinv / model$sigma2 + + P_ii_list$rhs = Matrix::rowSums( + mats$rhs * Matrix::t(tXXinv %*% Matrix::t(mats$rhs)) + ) + } + } + + # Theory: https://cs-people.bu.edu/evimaria/cs565/achlioptas.pdf + if (exact == FALSE) { + n = model$nobs + + if (!is.null(mats$slope_vars)) { + P_ii_list$slope_vars = rep(0, n) + tSS = Matrix::crossprod(mats$slope_vars) + tS = Matrix::t(mats$slope_vars) + } + if (!is.null(mats$rhs)) { + P_ii_list$rhs = rep(0, n) + tXX = Matrix::crossprod(mats$rhs) + tX = Matrix::t(mats$rhs) + } + + for (j in 1:p) { + # Rademacher Weights (-1, 1) + Rp_j <- matrix( + (stats::runif(n) > 0.5) * 2 - 1, + # Uncomment for speed-up + # rademacher::sample_rademacher(n), + nrow = 1, ncol = n + ) + + + if (!is.null(mats$slope_vars)) { + tS_Rp_j <- Matrix::tcrossprod(tS, Rp_j) + + Zj <- Matrix::solve(tSS, tS_Rp_j) + P_ii_list$slope_vars = + P_ii_list$slope_vars + + as.numeric(mats$slope_vars %*% Zj)^2 / p + } + + if (!is.null(mats$rhs)) { + tX_Rp_j <- Matrix::tcrossprod(tX, Rp_j) + + Zj <- Matrix::solve(tXX, tX_Rp_j) + P_ii_list$rhs = + P_ii_list$rhs + + as.numeric(mats$rhs %*% Zj)^2 / p + } + } + } + + P_ii = Reduce("+", P_ii_list) + + return(P_ii) } #### diff --git a/R/MiscFuns.R b/R/MiscFuns.R index 664fca0e..e3f0a0a6 100644 --- a/R/MiscFuns.R +++ b/R/MiscFuns.R @@ -1384,7 +1384,7 @@ i = function(factor_var, var, ref, keep, bin, ref2, keep2, bin2, ...){ items_name = items } - if(FROM_FIXEST){ + if(FROM_FIXEST || is_sparse){ # Pour avoir des jolis noms c'est un vrai gloubiboulga, # mais j'ai pas trouve plus simple... if(IS_INTER_FACTOR){ @@ -1421,6 +1421,10 @@ i = function(factor_var, var, ref, keep, bin, ref2, keep2, bin2, ...){ fe_colid = to_integer(fe_num[valid_row], sorted = TRUE) values = if(length(var) == 1) rep(1, length(valid_row)) else var + + # Clean names + col_names = sub("^.*__CLEAN__", "", col_names) + res = list(rowid = which(valid_row), values = values, colid = fe_colid, col_names = col_names) @@ -3694,7 +3698,6 @@ fixest_model_matrix = function(fml, data, fake_intercept = FALSE, i_noref = FALS } else { linear.mat = prepare_matrix(fml, data, fake_intercept) } - if(any(grepl("__CLEAN__", colnames(linear.mat), fixed = TRUE))){ new_names = clean_interact_names(colnames(linear.mat)) diff --git a/R/VCOV.R b/R/VCOV.R index 8ef5df42..fdb2a3c3 100644 --- a/R/VCOV.R +++ b/R/VCOV.R @@ -726,6 +726,12 @@ vcov.fixest = function(object, vcov = NULL, se = NULL, cluster, ssc = NULL, attr if(!missing(nthreads)) nthreads = check_set_nthreads(nthreads) + # Get default ssc if none is provied + # ssc related => we accept NULL + # we check ssc since it can be used by the funs + if(missnull(ssc)) ssc = getFixest_ssc() + check_arg(ssc, "class(ssc.type)", .message = "The argument 'ssc' must be an object created by the function ssc().") + #### #### ... scores #### #### @@ -748,6 +754,44 @@ vcov.fixest = function(object, vcov = NULL, se = NULL, cluster, ssc = NULL, attr scores = object$scores } + # For HC1, apply cluster.adj (N / N - 1) if ssc$ssc.adj = TRUE to match vcovHC(type = "HC1") + if (vcov %in% c("hetero", "white", "hc1") & ssc$cluster.adj) { + n = nrow(scores) + scores = scores * sqrt(n / (n - 1)) + } + + # For HC2/ HC3, need to divide scores by hatvalues + if (vcov %in% c("hc2", "hc3")) { + + if (isTRUE(object$iv)) { + stop("hc2/hc3 vcov are not defined for IV estimates") + } + + exact = ifelse(is.null(extra_args$exact), TRUE, extra_args$exact) + p = ifelse(is.null(extra_args$p), 500, extra_args$p) + P_ii = hatvalues(object, exact = exact, p = p) + + problem_idx = which(P_ii > (1 - sqrt(.Machine$double.eps))) + if (length(problem_idx) > 0L) { + + if (length(problem_idx) > 10L) { + problem_idx = c(problem_idx[1:10], "...") + } + + stop( + sprintf( + "When calculating the diagonals of the projection matrix, one of the columns was found to equal 1 (or within Machine epsilon). This is most likely due to a fixed effect with only 1 observation, so removing that observation would make that coefficient non-estimable. The problem rows of the data are rows: %s", + paste(problem_idx, collapse = ", ") + ) + ) + } + + if (vcov == "hc2") { + scores = scores / sqrt(1 - P_ii) + } else if (vcov == "hc3") { + scores = scores / (1 - P_ii) + } + } #### #### ... bread #### @@ -812,11 +856,6 @@ vcov.fixest = function(object, vcov = NULL, se = NULL, cluster, ssc = NULL, attr # we compute the vcov. The adjustment (which is a pain in the neck) will come after that # Here vcov is ALWAYS a character scalar - # ssc related => we accept NULL - # we check ssc since it can be used by the funs - if(missnull(ssc)) ssc = getFixest_ssc() - check_arg(ssc, "class(ssc.type)", .message = "The argument 'ssc' must be an object created by the function ssc().") - fun_name = vcov_select$fun_name args = list(bread = bread, scores = scores, vars = vcov_vars, ssc = ssc, sandwich = sandwich, nthreads = nthreads, @@ -840,7 +879,6 @@ vcov.fixest = function(object, vcov = NULL, se = NULL, cluster, ssc = NULL, attr #### # ssc is a ssc object in here - n_fe = n_fe_ok = length(object$fixef_id) # we adjust the fixef sizes to account for slopes @@ -954,6 +992,7 @@ vcov.fixest = function(object, vcov = NULL, se = NULL, cluster, ssc = NULL, attr K = max(K, length(object$coefficients) + 1) } + # TODO: There never is an attr "ss_adj". I think this can be safely removed? # Small sample adjustment ss_adj = attr(vcov_noAdj, "ss_adj") if(!is.null(ss_adj)){ @@ -963,6 +1002,12 @@ vcov.fixest = function(object, vcov = NULL, se = NULL, cluster, ssc = NULL, attr attr(vcov_noAdj, "ss_adj") = NULL } else { ss_adj = ifelse(ssc$adj, (n - 1) / (n - K), 1) + + # TODO: think about this? It's not standard to apply ss_adj, but don't like removing the option from the user? + # Don't apply ss_adj to hc2/hc3 since they are small-sample adjusted already! + # if (vcov %in% c("hc2", "hc3")) { + # ss_adj = 1 + # } } vcov_mat = vcov_noAdj * ss_adj @@ -1573,6 +1618,72 @@ vcov_conley = function(x, lat = NULL, lon = NULL, cutoff = NULL, pixel = 0, res } +#' Heteroskedasticity-Robust VCOV +#' +#' Computes the heteroskedasticity-robust VCOV of `fixest` objects. +#' +#' @param x A `fixest` object. +#' @param type A string scalar. Either "HC1"/"HC2"/"HC3" +#' @param exact A logical scalar. Should the hat matrix be calculated exactly or approximated using the JLA algorithm? Default = TRUE. See [`hatvalues.fixest`] for details. +#' @param p A numeric scalar. The number of draws should be used when approximating the hat matrix? Note that larger `p` are more accurate, but slower. This is only used when `exact = TRUE`. Default is 500. +#' @param ssc An object returned by the function [`ssc`]. It specifies how to perform the small sample correction. +#' +#' @return +#' If the first argument is a `fixest` object, then a VCOV is returned (i.e. a symmetric matrix). +#' +#' If the first argument is not a `fixest` object, then a) implicitly the arguments are shifted to the left (i.e. `vcov_hetero("HC3")` is equivalent to `vcov_hetero(type = "HC3")` and b) a VCOV-*request* is returned and NOT a VCOV. That VCOV-request can then be used in the argument `vcov` of various `fixest` functions (e.g. [`vcov.fixest`] or even in the estimation calls). +#' +#' @author +#' Laurent Berge and Kyle Butts +#' +#' @references +#' Cameron AC, Gelbach JB, Miller DL (2011). "Robust Inference with Multiway Clustering." *Journal of Business & Economic Statistics*, 29(2), 238-249. doi:10.1198/jbes.2010.07136. +#' +#' @examples +#' +#' base = iris +#' names(base) = c("y", "x1", "x2", "x3", "species") +#' base$clu = rep(1:5, 30) +#' +#' est = feols(y ~ x1 | species, base) +#' +#' vcov_hetero(est, "hc1") +#' vcov_hetero(est, "hc2", ssc = ssc(adj = FALSE)) +#' vcov_hetero(est, "hc3", ssc = ssc(adj = FALSE)) +#' +#' # Using approximate hatvalues +#' vcov_hetero(est, "hc3", exact = FALSE, p = 500) +#' +#' +vcov_hetero = function(x, type = "hc1", exact = TRUE, p = 500, ssc = NULL){ + # User-level function to compute clustered SEs + # typically we only do checking and reshaping here + + # slide_args allows the implicit allocation of arguments + # it makes semi-global changes => the values of the args here are modified + slide_args(x, type = type, exact = exact, p = p, ssc = ssc) + IS_REQUEST = is.null(x) + + check_value(ssc, "NULL class(ssc.type)", .message = "The argument 'ssc' must be an object created by the function ssc().") + + + # We create the request + use_request = IS_REQUEST + + extra_args = list(exact = exact, p = p) + vcov_request = list(vcov = type, ssc = ssc, extra_args = extra_args) + class(vcov_request) = "fixest_vcov_request" + + + if(IS_REQUEST){ + res = vcov_request + } else { + res = vcov(x, vcov = vcov_request) + } + + res +} + #### #### SETUP #### @@ -1592,7 +1703,7 @@ vcov_setup = function(){ # Heteroskedasticity robust # - vcov_hetero_setup = list(name = c("hetero", "white", "hc1"), fun_name = "vcov_hetero_internal", vcov_label = "Heteroskedasticity-robust") + vcov_hetero_setup = list(name = c("hetero", "white", "hc1", "hc2", "hc3"), fun_name = "vcov_hetero_internal", vcov_label = "Heteroskedasticity-robust") # # cluster(s) @@ -1766,12 +1877,7 @@ vcov_hetero_internal = function(bread, scores, sandwich, ssc, nthreads, ...){ if(!sandwich){ vcov_noAdj = cpppar_crossprod(scores, 1, nthreads) } else { - # we make a n/(n-1) adjustment to match vcovHC(type = "HC1") - - n = nrow(scores) - adj = ifelse(ssc$cluster.adj, n / (n-1), 1) - - vcov_noAdj = cpppar_crossprod(cpppar_matprod(scores, bread, nthreads), 1, nthreads) * adj + vcov_noAdj = cpppar_crossprod(cpppar_matprod(scores, bread, nthreads), 1, nthreads) } vcov_noAdj @@ -2129,7 +2235,7 @@ oldargs_to_vcov = function(se, cluster, vcov, .vcov = NULL){ } } - check_arg(.vcov, "matrix | function") + check_arg(.vcov, "matrix | function | named list L1") vcov = .vcov attr(vcov, "deparsed_arg") = fetch_arg_deparse(".vcov") diff --git a/R/sparse_model_matrix.R b/R/sparse_model_matrix.R new file mode 100644 index 00000000..57bd3df0 --- /dev/null +++ b/R/sparse_model_matrix.R @@ -0,0 +1,710 @@ +#' Design matrix of a `fixest` object returned in sparse format +#' +#' This function creates the left-hand-side or the right-hand-side(s) of a [`femlm`], [`feols`] or [`feglm`] estimation. +#' +#' @inheritParams nobs.fixest +#' +#' @param data If missing (default) then the original data is obtained by evaluating the `call`. Otherwise, it should be a `data.frame`. +#' @param type Character vector or one sided formula, default is "rhs". Contains the type of matrix/data.frame to be returned. Possible values are: "lhs", "rhs", "fixef", "iv.rhs1" (1st stage RHS), "iv.rhs2" (2nd stage RHS), "iv.endo" (endogenous vars.), "iv.exo" (exogenous vars), "iv.inst" (instruments). +#' @param na.rm Default is `TRUE`. Should observations with NAs be removed from the matrix? +#' @param collin.rm Logical scalar. Whether to remove variables that were found to be collinear during the estimation. Beware: it does not perform a collinearity check and bases on the `coef(object)`. Default is TRUE if object is a `fixest` object, or FALSE if object is a formula. +#' @param combine Logical scalar, default is `TRUE`. Whether to combine each resulting sparse matrix +#' @param ... Not currently used. +#' +#' @return +#' It returns either a single sparse matrix a list of matrices, depending whether `combine` is `TRUE` or `FALSE`. The sparse matrix is of class `dgCMatrix` from the `Matrix` package. +#' +#' @seealso +#' See also the main estimation functions [`femlm`], [`feols`] or [`feglm`]. [`formula.fixest`], [`update.fixest`], [`summary.fixest`], [`vcov.fixest`]. +#' +#' +#' @author +#' Laurent Berge, Kyle Butts +#' +#' @examples +#' +#' est = feols(wt ~ i(vs) + hp | cyl, mtcars) +#' sparse_model_matrix(est) +#' sparse_model_matrix(wt ~ i(vs) + hp | cyl, mtcars) +#' +#' @export +sparse_model_matrix = function(object, data, type = "rhs", na.rm = TRUE, collin.rm = NULL, combine = TRUE, ...) { + # We evaluate the formula with the past call + # type: lhs, rhs, fixef, iv.endo, iv.inst, iv.rhs1, iv.rhs2 + # if fixef => return a DF + + # Checking the arguments + if (is_user_level_call()) { + validate_dots(suggest_args = c("data", "type")) + } + + # We allow type to be used in the location of data if data is missing + if (!missing(data) && missing(type)) { + sc = sys.call() + if (!"data" %in% names(sc)) { + if (!is.null(data) && (is.character(data) || "formula" %in% class(data))) { + # data is in fact the type + type = data + data = NULL + } + } + } + + type = check_set_types(type, c("lhs", "rhs", "fixef", "iv.endo", "iv.inst", "iv.exo", "iv.rhs1", "iv.rhs2")) + + if (isTRUE(object$is_fit)) { + stop("sparse_model_matrix method not available for fixest estimations obtained from fit methods.") + } + + if (any(grepl("^iv", type)) && !isTRUE(object$iv)) { + stop("The type", enumerate_items(grep("^iv", type, value = TRUE), "s.is"), " only valid for IV estimations.") + } + + check_arg(subset, "logical scalar | character vector no na") + + if (missing(collin.rm)) { + collin.rm = if (inherits(object, "formula")) FALSE else TRUE + } else { + check_arg_plus(collin.rm, "logical scalar") + } + + # Evaluation with the data + original_data = FALSE + if (missnull(data)) { + original_data = TRUE + data = fetch_data(object, "To apply 'sparse_model_matrix', ") + } + + # control of the data + if (is.matrix(data)) { + if (is.null(colnames(data))) { + stop("If argument 'data' is to be a matrix, its columns must be named.") + } + data = as.data.frame(data) + } + + if (!"data.frame" %in% class(data)) { + stop("The argument 'data' must be a data.frame or a matrix.") + } + + + # Allows passage of formula to sparse_model_matrix. A bit inefficient, but it works. + isFormula = FALSE + split_fml = NULL + if (inherits(object, "formula")) { + split_fml = fml_split_internal(object) + if (isTRUE(collin.rm)) { + message("Formula passed to sparse_model_matrix with collin.rm == TRUE. Estimating feols with formula.") + object = feols(object, data = data) + } else if (length(split_fml) == 3) { + message("Formula passed to sparse_model_matrix with an iv formula. Estimating feols with formula.") + object = feols(object, data = data) + } else { + isFormula = TRUE + } + } + + # na.rm = FALSE doesn't work with type = "fixef" (which FE col gets NA?) + if (("fixef" %in% type & !na.rm)) { + # na.rm = TRUE + message("na.rm = FALSE doesn't work with type = 'fixef'. It has been set to TRUE.") + } + + + # Panel setup + if (!isFormula) { + panel__meta__info = set_panel_meta_info(object, data) + } + + # The formulas + if (isFormula) { + fml_linear = split_fml[[1]] + + fml_0 = attr(stats::terms(fml_linear), "intercept") == 0 + fake_intercept = !is.null(split_fml[[2]]) | fml_0 + + } else { + fml_linear = formula(object, type = "linear") + + # we kick out the intercept if there is presence of fixed-effects + fml_0 = attr(stats::terms(fml_linear), "intercept") == 0 + fake_intercept = !is.null(object$fixef_vars) && !(!is.null(object$slope_flag) && all(object$slope_flag < 0)) + fake_intercept = fake_intercept | fml_0 + } + + res = list() + + if ("lhs" %in% type) { + lhs_text = deparse_long(fml_linear[[2]]) + lhs = eval(fml_linear[[2]], data) + lhs = Matrix::Matrix(lhs, sparse = TRUE, ncol = 1) + + colnames(lhs) = lhs_text + + res[["lhs"]] = lhs + } + + if ("rhs" %in% type && !isTRUE(object$onlyFixef)) { + + fml = fml_linear + + if (isFormula && (length(split_fml) == 3)) { + fml_iv = split_fml[[3]] + fml = .xpd(..lhs ~ ..endo + ..rhs, ..lhs = fml[[2]], ..endo = fml_iv[[2]], ..rhs = fml[[3]]) + } else if (isTRUE(object$iv)) { + fml_iv = object$fml_all$iv + fml = .xpd(..lhs ~ ..endo + ..rhs, ..lhs = fml[[2]], ..endo = fml_iv[[2]], ..rhs = fml[[3]]) + } + + vars = attr(stats::terms(fml), "term.labels") + + linear.mat = vars_to_sparse_mat(vars = vars, data = data, object = object, collin.rm = collin.rm, type = "rhs", add_intercept = !fake_intercept) + + res[["rhs"]] = linear.mat + } + + if ("fixef" %in% type) { + + if (isFormula && (length(split_fml) < 2)) { + mat_FE = NULL + } else if (!isFormula & length(object$fixef_id) == 0) { + mat_FE = NULL + } else { + + if (isFormula) { + fixef_fml = .xpd(~ ..fe, ..fe = split_fml[[2]]) + } else { + fixef_fml = object$fml_all$fixef + } + + fixef_terms_full = fixef_terms(fixef_fml) + fe_vars = fixef_terms_full$fe_vars + slope_var_list = fixef_terms_full$slope_vars_list + + fixef_df = prepare_df(fe_vars, data, fastCombine = FALSE) + + # Check for slope vars + if (any(fixef_terms_full$slope_flag > 0)) { + slope_df = prepare_df(unlist(slope_var_list), data) + } + + cols_lengths = c(0) + total_cols = 0 + n_FE = 0 + nrows = nrow(data) + for (i in seq_along(fe_vars)) { + + fe_var = fe_vars[i] + slope_vars = slope_var_list[[i]] + n_slope_vars = if (is.null(slope_vars)) 0 else length(slope_vars) + + fe = fixef_df[[fe_var]] + unique_fe = unique(fe) + n_cols = length(unique_fe[!is.na(unique_fe)]) + + total_cols = total_cols + n_cols * (1 + n_slope_vars) + cols_lengths = c(cols_lengths, rep(n_cols, 1 + n_slope_vars)) + n_FE = n_FE + 1 + n_slope_vars + } + running_cols = cumsum(cols_lengths) + + id_all = names_all = val_all = vector("list", n_FE) + rowid = 1:nrows + j = 1 + for (i in seq_along(fe_vars)) { + + fe_var = fe_vars[i] + xi = fixef_df[[fe_var]] + + keep = which(!is.na(xi)) + if (length(keep) == 0) stop("All values of the fixed-effect variable '", fe_var, "' are NA.") + + xi = xi[keep] + xi_quf = quickUnclassFactor(xi, addItem = TRUE) + + col_id = xi_quf$x + col_levels = as.character(xi_quf$items) + + slope_vars = slope_var_list[[i]] + n_slope_vars = if (is.null(slope_vars)) 0 else length(slope_vars) + + # Be careful with NAs + # First fixed-effect by itself + val_all[[j]] = c(rep(1, length(col_id)), rep(NA, nrows - length(keep))) + id_all[[j]] = cbind( + c( + rowid[keep], + rowid[-keep] + ), + c( + running_cols[j] + col_id, + rep(running_cols[j] + 1, nrows - length(keep)) + ) + ) + names_all[[j]] = paste0(fe_var, "::", col_levels) + j = j + 1 + + for (k in seq_along(slope_vars)) { + slope_var = slope_vars[k] + slope = slope_df[[slope_var]] + + val_all[[j]] = c(as.numeric(slope[keep]), rep(NA, nrows - length(keep))) + id_all[[j]] = cbind( + c( + rowid[keep], + rowid[-keep] + ), + c( + running_cols[j] + col_id, + rep(running_cols[j] + 1, nrows - length(keep)) + ) + ) + names_all[[j]] = paste0(fe_var, "[[", slope_var, "]]", "::", col_levels) + j = j + 1 + } + } + + id_mat = do.call(rbind, id_all) + val_vec = unlist(val_all) + names_vec = unlist(names_all) + + mat_FE = Matrix::sparseMatrix( + i = id_mat[, 1], + j = id_mat[, 2], + x = val_vec, + dimnames = list(NULL, names_vec) + ) + + + # Keep non-zero FEs + if (collin.rm == TRUE) { + fixefs = fixef(object, sorted = TRUE) + + select = lapply( + names(fixefs), + function(var) { + names = names(fixefs[[var]]) + names = names[fixefs[[var]] != 0] + + paste0(var, "::", names) + } + ) + select = unlist(select) + + # When original_data isn't used, some FEs may not be in the new dataset, add them in + missing_cols = setdiff(select, colnames(mat_FE)) + mat_FE = cbind( + mat_FE, + Matrix::Matrix(0, ncol = length(missing_cols), nrow = nrow(mat_FE), sparse = TRUE, dimnames = list(NULL, missing_cols)) + ) + + # Subset fixef and order columns to match fixef(object) + idx = which(colnames(mat_FE) %in% select) + mat_FE = mat_FE[, idx, drop = FALSE] + + # Reorder columns to match fixef(object) + reorder = match(unlist(select), colnames(mat_FE)) + mat_FE = mat_FE[, reorder[!is.na(reorder)], drop = FALSE] + } + + } + + res[["fixef"]] = mat_FE + } + + # + # IV + # + if ("iv.endo" %in% type) { + fml = object$iv_endo_fml + vars = attr(stats::terms(fml), "term.labels") + endo.mat = vars_to_sparse_mat(vars = vars, data = data, object = object, collin.rm = collin.rm) + + res[["iv.endo"]] = endo.mat + } + + if ("iv.inst" %in% type) { + fml = object$fml_all$iv + vars = attr(stats::terms(fml), "term.labels") + inst.mat = vars_to_sparse_mat(vars = vars, data = data, object = object, collin.rm = collin.rm) + + res[["iv.inst"]] = inst.mat + } + + if ("iv.exo" %in% type) { + + fml = object$fml_all$linear + vars = attr(stats::terms(fml), "term.labels") + exo.mat = vars_to_sparse_mat(vars = vars, data = data, object = object, collin.rm = collin.rm, type = "iv.exo", add_intercept = !fake_intercept) + + res[["iv.exo"]] = exo.mat + } + + if ("iv.rhs1" %in% type) { + fml = object$fml + if (object$iv_stage == 2) { + fml_iv = object$fml_all$iv + fml = .xpd(..lhs ~ ..inst + ..rhs, ..lhs = fml[[2]], ..inst = fml_iv[[3]], ..rhs = fml[[3]]) + } + vars = attr(stats::terms(fml), "term.labels") + iv_rhs1 = vars_to_sparse_mat(vars = vars, data = data, object = object, collin.rm = collin.rm, type = "iv.rhs1", add_intercept = !fake_intercept) + + res[["iv.rhs1"]] = iv_rhs1 + } + + if ("iv.rhs2" %in% type) { + # Second stage + if (!object$iv_stage == 2) { + stop("In model.matrix, the type 'iv.rhs2' is only valid for second stage IV models. This estimation is the first stage.") + } + + # I) we get the fitted values + stage_1 = object$iv_first_stage + + fit_vars = c() + for (i in seq_along(stage_1)) { + fit_vars[i] = v = paste0("fit_", names(stage_1)[i]) + data[[v]] = predict(stage_1[[i]], newdata = data, sample = "original") + } + + # II) we create the variables + fml = object$fml + fml = .xpd(..lhs ~ ..fit + ..rhs, ..lhs = fml[[2]], ..fit = fit_vars, ..rhs = fml[[3]]) + vars = attr(stats::terms(fml), "term.labels") + iv_rhs2 = vars_to_sparse_mat(vars = vars, data = data, object = object, collin.rm = collin.rm, type = "iv.rhs2", add_intercept = !fake_intercept) + + res[["iv.rhs2"]] = iv_rhs2 + } + + if (na.rm) { + na_rows = lapply(res, function(mat) { + # Get rows with NA + 1 + mat@i[is.na(mat@x)] + }) + + na_rows = unlist(na_rows, use.names = FALSE) + + if (original_data) { + na_rows = c(na_rows, -1 * object$obs_selection$obsRemoved) + } + + na_rows = unique(na_rows) + + if (length(na_rows) > 0) { + res = lapply(res, function(mat) { + mat[-na_rows, , drop = FALSE] + }) + } + } + + # Formatting res + if (length(res) == 0) { + return(NULL) + } else if (length(type) > 1) { + if (combine) { + res = res[type] + res = do.call(cbind, unname(res)) + } + } else { + res = res[[1]] + } + + res +} + + + +# Internal: modifies the calls so that each variable/interaction is evaluated with mult_sparse +mult_wrap = function(x) { + # x: character string of a variable to be evaluated + # ex: "x1" => mult_sparse(x1) + # "x1:factor(x2):x3" => mult_sparse(x3, factor(x2), x1) + # + # We also add the argument sparse to i() + # "x1:i(species, TRUE)" => mult_sparse(x1, i(species, TRUE, sparse = TRUE)) + + x_call = str2lang(x) + + res = (~ mult_sparse())[[2]] + + if (length(x_call) == 1 || x_call[[1]] != ":") { + res[[2]] = x_call + + } else { + res[[2]] = x_call[[3]] + tmp = x_call[[2]] + + while (length(tmp) == 3 && tmp[[1]] == ":") { + res[[length(res) + 1]] = tmp[[3]] + tmp = tmp[[2]] + } + + res[[length(res) + 1]] = tmp + } + + # We also add sparse to i() if found + for (i in 2:length(res)) { + ri = res[[i]] + if (length(ri) > 1 && ri[[1]] == "i") { + ri[["sparse"]] = TRUE + res[[i]] = ri + } + } + + if (length(res) > 2) { + # we restore the original order + res[-1] = rev(res[-1]) + } + + return(res) +} + +# Internal function to evaluate the variables (and interactions) in a sparse way +mult_sparse = function(...) { + # Only sparsifies factor variables + # Takes care of interactions + + dots = list(...) + n = length(dots) + mc = match.call() + var_call = sapply(seq_len(n), function(i) deparse_long(mc[[i+1]])) + + num_var = NULL + factor_list = list() + factor_idx = c() + i_idx = c() + info_i = NULL + is_i = is_factor = FALSE + # You can't have interactions between i and factors, it's either + + for (i in 1:n) { + xi = dots[[i]] + vari = mc[[i+1]] + if (is.numeric(xi)) { + # We stack the product + num_var = if (is.null(num_var)) xi else xi * num_var + } else if (inherits(xi, "i_sparse")) { + is_i = TRUE + info_i = xi + i_idx = c(i_idx, i) + } else { + is_factor = TRUE + factor_list[[length(factor_list) + 1]] = xi + factor_idx = c(factor_idx, i) + } + } + + if (is_factor && is_i) { + stop("Unfortunately, can not interact factor with `i()` in sparse_model_matrix") + } + + # numeric + if (!is_i && !is_factor) { + return(num_var) + } + # factor() + if (is_factor) { + factor_list$add_items = TRUE + factor_list$items.list = TRUE + factor_list$multi.join = " ; " + fact_as_int = do.call(to_integer, factor_list) + + values = if (is.null(num_var)) rep(1, length(fact_as_int$x)) else num_var + rowid = seq_along(values) + + # messy, but need this to get things like `factor(am)1:hp:factor(cyl)6` + items_mat = do.call(rbind, strsplit(fact_as_int$items, " ; ")) + + # intersplice var_call with items + col_names = sapply(seq_len(nrow(items_mat)), function(i) { + pasteable = var_call + pasteable[factor_idx] = paste0(var_call[factor_idx], items_mat[i, ]) + paste(pasteable, collapse = ":") + }, USE.NAMES = FALSE) + + res = list(rowid = rowid, colid = fact_as_int$x, values = values, + col_names = col_names, n_cols = length(fact_as_int$items)) + # i() + } else { + + values = info_i$values + if (!is.null(num_var)) { + num_var = num_var[info_i$rowid] + values = values * num_var + col_names = sapply(info_i$col_names, function(item) { + pasteable = var_call + pasteable[i_idx] = item + paste(pasteable, collapse = ":") + }, USE.NAMES = FALSE) + } else { + col_names = info_i$col_names + } + + res = list(rowid = info_i$rowid, colid = info_i$colid, + values = values[info_i$rowid], + col_names = col_names, + n_cols = length(info_i$col_names)) + } + + class(res) = "sparse_var" + + res +} + +# Takes a vector of strings denoting the variables (including terms like `poly()`, `i()`, `I()`, `lag()`, etc.) and returns a sparse matrix of the variables extracted from `data`. +vars_to_sparse_mat = function(vars, data, collin.rm = FALSE, object = NULL, type = NULL, add_intercept = FALSE) { + + if (length(vars) == 0) { + # Case only FEs + mat = NULL + } else { + + # Since we don't want to evaluate the factors, + # the code is a bit intricate because we have to catch them before + # any interaction takes place + # + # that's why I wrap interactions in a function (mult_sparse()) + # + + # Below, we evaluate all the variables in a "sparse" way + + vars_calls = lapply(vars, mult_wrap) + + n = length(vars) + variables_list = vector("list", n) + for (i in 1:n) { + variables_list[[i]] = eval(vars_calls[[i]], data) + } + + # To create the sparse matrix, we need the column indexes + total_cols = 0 + running_cols = c(0) + for (i in 1:n) { + xi = variables_list[[i]] + if (inherits(xi, "sparse_var")) { + total_cols = total_cols + xi$n_cols + } else { + total_cols = total_cols + NCOL(xi) + } + running_cols[i + 1] = total_cols + } + + # We just create a sparse matrix and fill it + + # 1) creating the indexes + names + + # NOTA: I use lists to avoid creating copies + rowid = 1:nrow(data) + id_all = values_all = names_all = vector("list", n) + for (i in 1:n) { + xi = variables_list[[i]] + + if (inherits(xi, "sparse_var")) { + + id_all[[i]] = cbind(xi$rowid, running_cols[i] + xi$colid) + values_all[[i]] = xi$values + names_all[[i]] = xi$col_names + + } else if (NCOL(xi) == 1) { + + id_all[[i]] = cbind(rowid, running_cols[i] + 1) + values_all[[i]] = xi + names_all[[i]] = vars[[i]] + + } else { + + colid = rep(1:NCOL(xi), each = nrow(data)) + id_all[[i]] = cbind(rep(rowid, NCOL(xi)), running_cols[i] + colid) + values_all[[i]] = as.vector(xi) + if (!is.null(colnames(xi))) { + names_all[[i]] = paste0(vars[[i]], colnames(xi)) + } else { + names_all[[i]] = paste0(vars[[i]], 1:NCOL(xi)) + } + + } + } + + id_mat = do.call(rbind, id_all) + values_vec = unlist(values_all) + names_vec = unlist(names_all) + + # 2) filling the matrix: one shot, no copies + + mat = Matrix::Matrix(0, nrow(data), total_cols, dimnames = list(NULL, names_vec)) + mat[id_mat] = values_vec + + } + + if (collin.rm & is.null(object)) { + stop("You need to provide the 'object' argument to use 'collin.rm = TRUE'.") + } + + if (collin.rm) { + qui = which(colnames(mat) %in% object$collin.var) + if (length(qui) == ncol(mat)) { + mat = NULL + } else if (length(qui) > 0) { + mat = mat[, -qui, drop = FALSE] + } + + coefs = names(object$coefficients) + if (isTRUE(object$iv)) { + fml_iv = object$fml_all$iv + endo = fml_iv[[2]] + + # Trick to get the rhs variables as a character vector + endo = .xpd(~ ..endo, ..endo = endo) + endo = attr(stats::terms(endo), "term.labels") + + exo = lapply(object$iv_first_stage, function(x) names(stats::coef(x))) + exo = unique(unlist(exo, use.names = FALSE)) + } + + # Custom subsetting for na.rm depending on `type` + if (!is.null(type)) { + if (type == "rhs") { + if (isTRUE(object$iv)) { + keep = c(endo, coefs) + } else { + keep = coefs + } + } else if (type == "iv.exo") { + keep = coefs + } else if (type == "iv.exo") { + keep = c(endo, coefs) + } else if (type == "iv.rhs1") { + keep = c(exo, coefs) + } else if (type == "iv.rhs2") { + keep = coefs + } + + keep = keep[!keep %in% object$collin.var] + if (length(keep) == 0) { + mat = NULL + } else { + idx = which(colnames(mat) %in% keep) + mat = mat[, idx, drop = FALSE] + } + } + + if (length(coefs) == ncol(mat) && any(colnames(mat) != names(coefs))) { + # we reorder the matrix + # This can happen in multiple estimations, where we respect the + # order of the user + + if (all(names(coefs) %in% colnames(mat))) { + mat = mat[, names(coefs), drop = FALSE] + } + } + } + + if (add_intercept) { + mat = cbind(1, mat) + colnames(mat)[1] = "(Intercept)" + } + + return(mat) +} + + \ No newline at end of file diff --git a/_pkgdown.yml b/_pkgdown.yml index 12f9fb81..6e7a6cdb 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -84,6 +84,7 @@ reference: - weights.fixest - formula.fixest - terms.fixest + - sparse_model_matrix - title: Formula tools desc: Tools to manipulate formulas contents: diff --git a/man/coefplot.Rd b/man/coefplot.Rd index 0270a586..e037fe7e 100644 --- a/man/coefplot.Rd +++ b/man/coefplot.Rd @@ -248,9 +248,9 @@ This function plots the results of estimations (coefficients and confidence inte } \section{Functions}{ \itemize{ -\item \code{iplot}: Plots the coefficients generated with i() -}} +\item \code{iplot()}: Plots the coefficients generated with i() +}} \section{Setting custom default values}{ The function \code{coefplot} dispose of many arguments to parametrize the plots. Most of these arguments can be set once an for all using the function \code{\link{setFixest_coefplot}}. See Example 3 below for a demonstration. diff --git a/man/degrees_freedom.Rd b/man/degrees_freedom.Rd index 96c89192..993b5900 100644 --- a/man/degrees_freedom.Rd +++ b/man/degrees_freedom.Rd @@ -42,9 +42,9 @@ Simple utility to extract the degrees of freedom from a \code{fixest} estimation } \section{Functions}{ \itemize{ -\item \code{degrees_freedom_iid}: Gets the degrees of freedom of a \code{fixest} estimation -}} +\item \code{degrees_freedom_iid()}: Gets the degrees of freedom of a \code{fixest} estimation +}} \examples{ # First: an estimation diff --git a/man/etable.Rd b/man/etable.Rd index 0240eaa9..f388000f 100644 --- a/man/etable.Rd +++ b/man/etable.Rd @@ -424,11 +424,11 @@ When the argument \code{postprocessing.tex} is not missing, two additional tags } \section{Functions}{ \itemize{ -\item \code{esttable}: Exports the results of multiple \code{fixest} estimations in a Latex table. +\item \code{esttable()}: Exports the results of multiple \code{fixest} estimations in a Latex table. -\item \code{esttex}: Exports the results of multiple \code{fixest} estimations in a Latex table. -}} +\item \code{esttex()}: Exports the results of multiple \code{fixest} estimations in a Latex table. +}} \section{How does \code{digits} handle the number of decimals displayed?}{ diff --git a/man/hatvalues.fixest.Rd b/man/hatvalues.fixest.Rd index bda72b07..e9919fc2 100644 --- a/man/hatvalues.fixest.Rd +++ b/man/hatvalues.fixest.Rd @@ -4,23 +4,29 @@ \alias{hatvalues.fixest} \title{Hat values for \code{fixest} objects} \usage{ -\method{hatvalues}{fixest}(model, ...) +\method{hatvalues}{fixest}(model, exact = TRUE, p = 1000, ...) } \arguments{ \item{model}{A fixest object. For instance from feols or feglm.} +\item{exact}{Logical scalar, default is \code{TRUE}. Whether the diagonals of the projection matrix should be calculated exactly. If \code{FALSE}, then it will be approximated using a JLA algorithm. See details. Unless you have a very large number of observations, it is recommended to keep the default value.} + +\item{p}{Numeric scala, default is 1000. This is only used when \code{exact == FALSE}. This determined the number of bootstrap samples used to estimate the projection matrix.} + \item{...}{Not currently used.} } \value{ Returns a vector of the same length as the number of observations used in the estimation. } \description{ -Computes the hat values for \code{\link{feols}} or \code{\link{feglm}} estimations. Only works when there are no fixed-effects. +Computes the hat values for \code{\link{feols}} or \code{\link{feglm}} estimations. } \details{ Hat values are not available for \code{\link[=femlm]{fenegbin}}, \code{\link{femlm}} and \code{\link{feNmlm}} estimations. -When there are fixed-effects, the hat values of the reduced form are different from the hat values of the full model. And we cannot get costlessly the hat values of the full model from the reduced form. It would require to reestimate the model with the fixed-effects as regular variables. +Hat values for generalized linear model are disussed in Belsley, Kuh and Welsch (1980), Cook and Weisberg (1982), etc. + +When \code{exact == FALSE}, the Johnson-Lindenstrauss approximation (JLA) algorithm is used which approximates the diagonals of the projection matrix. For more precision (but longer time), increase the value of \code{p}. See Kline, Saggio, and Sølvsten (2020) for details. } \examples{ @@ -29,3 +35,8 @@ head(hatvalues(est)) } +\references{ +Belsley, D. A., Kuh, E. and Welsch, R. E. (1980). \emph{Regression Diagnostics}. New York: Wiley. +Cook, R. D. and Weisberg, S. (1982). \emph{Residuals and Influence in Regression}. London: Chapman and Hall. +Kline, P., Saggio R., and Sølvsten, M. (2020). \emph{Leave‐Out Estimation of Variance Components}. Econometrica. +} diff --git a/man/l.Rd b/man/l.Rd index 6022f618..5e81f36f 100644 --- a/man/l.Rd +++ b/man/l.Rd @@ -29,11 +29,11 @@ Produce lags or leads in the formulas of \code{fixest} estimations or when creat } \section{Functions}{ \itemize{ -\item \code{f}: Forwards a variable (inverse of lagging) in a \code{fixest} estimation +\item \code{f()}: Forwards a variable (inverse of lagging) in a \code{fixest} estimation -\item \code{d}: Creates differences (i.e. x - lag(x)) in a \code{fixest} estimation -}} +\item \code{d()}: Creates differences (i.e. x - lag(x)) in a \code{fixest} estimation +}} \examples{ data(base_did) diff --git a/man/lag.formula.Rd b/man/lag.formula.Rd index 373adb09..19137949 100644 --- a/man/lag.formula.Rd +++ b/man/lag.formula.Rd @@ -48,9 +48,9 @@ Lags a variable using panel id + time identifiers in a formula. } \section{Functions}{ \itemize{ -\item \code{lag_fml}: Lags a variable using a formula syntax -}} +\item \code{lag_fml()}: Lags a variable using a formula syntax +}} \examples{ # simple example with an unbalanced panel base = data.frame(id = rep(1:2, each = 4), diff --git a/man/sparse_model_matrix.Rd b/man/sparse_model_matrix.Rd new file mode 100644 index 00000000..d553c74e --- /dev/null +++ b/man/sparse_model_matrix.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sparse_model_matrix.R +\name{sparse_model_matrix} +\alias{sparse_model_matrix} +\title{Design matrix of a \code{fixest} object returned in sparse format} +\usage{ +sparse_model_matrix( + object, + data, + type = "rhs", + na.rm = TRUE, + collin.rm = TRUE, + combine = TRUE, + ... +) +} +\arguments{ +\item{object}{A \code{fixest} object. Obtained using the functions \code{\link{femlm}}, \code{\link{feols}} or \code{\link{feglm}}.} + +\item{data}{If missing (default) then the original data is obtained by evaluating the \code{call}. Otherwise, it should be a \code{data.frame}.} + +\item{type}{Character vector or one sided formula, default is "rhs". Contains the type of matrix/data.frame to be returned. Possible values are: "lhs", "rhs", "fixef", "iv.rhs1" (1st stage RHS), "iv.rhs2" (2nd stage RHS), "iv.endo" (endogenous vars.), "iv.exo" (exogenous vars), "iv.inst" (instruments).} + +\item{na.rm}{Default is \code{TRUE}. Should observations with NAs be removed from the matrix?} + +\item{collin.rm}{Logical scalar, default is \code{TRUE}. Whether to remove variables that were found to be collinear during the estimation. Beware: it does not perform a collinearity check.} + +\item{combine}{Logical scalar, default is \code{TRUE}. Whether to combine each resulting sparse matrix} + +\item{...}{Not currently used.} +} +\value{ +It returns either a single sparse matrix a list of matrices, depending whether \code{combine} is \code{TRUE} or \code{FALSE}. The sparse matrix is of class \code{dgCMatrix} from the \code{Matrix} package. +} +\description{ +This function creates the left-hand-side or the right-hand-side(s) of a \code{\link{femlm}}, \code{\link{feols}} or \code{\link{feglm}} estimation. Note that this function currently does not accept a formula +} +\examples{ + +est = feols(wt ~ i(vs) + hp | cyl, mtcars) +sparse_model_matrix(est) + + +} +\seealso{ +See also the main estimation functions \code{\link{femlm}}, \code{\link{feols}} or \code{\link{feglm}}. \code{\link{formula.fixest}}, \code{\link{update.fixest}}, \code{\link{summary.fixest}}, \code{\link{vcov.fixest}}. +} +\author{ +Laurent Berge, Kyle Butts +} diff --git a/man/ssc.Rd b/man/ssc.Rd index 18952c44..9e338046 100644 --- a/man/ssc.Rd +++ b/man/ssc.Rd @@ -55,9 +55,9 @@ The following vignette: \href{https://lrberge.github.io/fixest/articles/standard } \section{Functions}{ \itemize{ -\item \code{dof}: This function is deprecated and will be removed at some point (in 6 months from August 2021). Exactly the same as \code{ssc}. -}} +\item \code{dof()}: This function is deprecated and will be removed at some point (in 6 months from August 2021). Exactly the same as \code{ssc}. +}} \examples{ # diff --git a/man/summary.fixest.Rd b/man/summary.fixest.Rd index aca3a7b6..a202eaa3 100644 --- a/man/summary.fixest.Rd +++ b/man/summary.fixest.Rd @@ -61,7 +61,7 @@ \item{nthreads}{The number of threads. Can be: a) an integer lower than, or equal to, the maximum number of threads; b) 0: meaning all available threads will be used; c) a number strictly between 0 and 1 which represents the fraction of all threads to use. The default is to use 50\% of all threads. You can set permanently the number of threads used within this package using the function \code{\link{setFixest_nthreads}}.} -\item{...}{Only used if the argument \code{.vocv} is provided and is a function: extra arguments to be passed to that function.} +\item{...}{Only used if the argument \code{.vcov} is provided and is a function: extra arguments to be passed to that function.} } \value{ It returns a \code{fixest} object with: diff --git a/man/vcov_hetero.Rd b/man/vcov_hetero.Rd new file mode 100644 index 00000000..46cb8dfc --- /dev/null +++ b/man/vcov_hetero.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/VCOV.R +\name{vcov_hetero} +\alias{vcov_hetero} +\title{Heteroskedasticity-Robust VCOV} +\usage{ +vcov_hetero(x, type = "hc1", exact = TRUE, p = 500, ssc = NULL) +} +\arguments{ +\item{x}{A \code{fixest} object.} + +\item{type}{A string scalar. Either "HC1"/"HC2"/"HC3"} + +\item{exact}{A logical scalar. Should the hat matrix be calculated exactly or approximated using the JLA algorithm? Default = TRUE. See \code{\link{hatvalues.fixest}} for details.} + +\item{p}{A numeric scalar. The number of draws should be used when approximating the hat matrix? Note that larger \code{p} are more accurate, but slower. This is only used when \code{exact = TRUE}. Default is 500.} + +\item{ssc}{An object returned by the function \code{\link{ssc}}. It specifies how to perform the small sample correction.} +} +\value{ +If the first argument is a \code{fixest} object, then a VCOV is returned (i.e. a symmetric matrix). + +If the first argument is not a \code{fixest} object, then a) implicitly the arguments are shifted to the left (i.e. \code{vcov_hetero("HC3")} is equivalent to \code{vcov_hetero(type = "HC3")} and b) a VCOV-\emph{request} is returned and NOT a VCOV. That VCOV-request can then be used in the argument \code{vcov} of various \code{fixest} functions (e.g. \code{\link{vcov.fixest}} or even in the estimation calls). +} +\description{ +Computes the heteroskedasticity-robust VCOV of \code{fixest} objects. +} +\examples{ + +base = iris +names(base) = c("y", "x1", "x2", "x3", "species") +base$clu = rep(1:5, 30) + +est = feols(y ~ x1 | species, base) + +vcov_hetero(est, "hc1") +vcov_hetero(est, "hc2", ssc = ssc(adj = FALSE)) +vcov_hetero(est, "hc3", ssc = ssc(adj = FALSE)) + +# Using approximate hatvalues +vcov_hetero(est, "hc3", exact = FALSE, p = 500) + + +} +\references{ +Cameron AC, Gelbach JB, Miller DL (2011). "Robust Inference with Multiway Clustering." \emph{Journal of Business & Economic Statistics}, 29(2), 238-249. doi:10.1198/jbes.2010.07136. +} +\author{ +Laurent Berge and Kyle Butts +} diff --git a/tests/fixest_tests.R b/tests/fixest_tests.R index 20cef9ba..06694aff 100644 --- a/tests/fixest_tests.R +++ b/tests/fixest_tests.R @@ -1140,6 +1140,16 @@ est_feols = feols(y ~ x | grp + tm, data=d) test(se(est_feols, se = "st")["x"], se(est_lm)["x"]) +# +# Heteroskedasticity-robust +# + +se_white_lm_HC1 = sqrt(vcovHC(est_lm, type = "HC1")["x", "x"]) +se_white_lm_HC0 = sqrt(vcovHC(est_lm, type = "HC0")["x", "x"]) + +test(se(est_feols, se = "hetero"), se_white_lm_HC1) +test(se(est_feols, se = "hetero", ssc = ssc(adj = FALSE, cluster.adj = FALSE)), se_white_lm_HC0) + # # Clustered # @@ -1152,16 +1162,6 @@ se_CL_grp_lm_HC0 = sqrt(vcovCL(est_lm, cluster = d$grp, type = "HC0")["x", "x"]) test(se(est_feols, ssc = ssc(fixef.K = "full")), se_CL_grp_lm_HC1) test(se(est_feols, ssc = ssc(adj = FALSE, fixef.K = "full")), se_CL_grp_lm_HC0) -# -# Heteroskedasticity-robust -# - -se_white_lm_HC1 = sqrt(vcovHC(est_lm, type = "HC1")["x", "x"]) -se_white_lm_HC0 = sqrt(vcovHC(est_lm, type = "HC0")["x", "x"]) - -test(se(est_feols, se = "hetero"), se_white_lm_HC1) -test(se(est_feols, se = "hetero", ssc = ssc(adj = FALSE, cluster.adj = FALSE)), se_white_lm_HC0) - # # Two way # @@ -1172,13 +1172,82 @@ se_CL_2w_feols = se(est_feols, se = "twoway") test(se(est_feols, se = "twoway", ssc = ssc(fixef.K = "full", cluster.df = "conv")), se_CL_2w_lm) +# +# HC2/HC3 +# + +# HC2/HC3 +base = iris +base$w = runif(nrow(base)) + +est_feols = feols(Sepal.Length ~ Sepal.Width | Species, base) +est_lm = lm(Sepal.Length ~ Sepal.Width + factor(Species), base) + +test( + vcov(est_feols, "hc2", ssc = ssc(adj = FALSE, cluster.adj = FALSE)), + sandwich::vcovHC(est_lm, "HC2")["Sepal.Width", "Sepal.Width"] +) + +test( + vcov(est_feols, "hc3", ssc = ssc(adj = FALSE, cluster.adj = FALSE)), + sandwich::vcovHC(est_lm, "HC3")["Sepal.Width", "Sepal.Width"] +) + +# HC2/HC3 with weights +base = iris +base$w = runif(nrow(base)) + +est_feols = feols(Sepal.Length ~ Sepal.Width | Species, base, weights = ~ w) +est_lm = lm(Sepal.Length ~ Sepal.Width + factor(Species), base, weights = base$w) + +test( + vcov(est_feols, "hc2", ssc = ssc(adj = FALSE, cluster.adj = FALSE)), + sandwich::vcovHC(est_lm, "HC2")["Sepal.Width", "Sepal.Width"] +) + +test( + vcov(est_feols, "hc3", ssc = ssc(adj = FALSE, cluster.adj = FALSE)), + sandwich::vcovHC(est_lm, "HC3")["Sepal.Width", "Sepal.Width"] +) + +# HC2/HC3 with GLM +base$Sepal.Length = floor(base$Sepal.Length) +est_feglm = feglm( + Sepal.Length ~ Sepal.Width | Species, base, + "poisson", weights = ~ w +) +est_glm = glm( + Sepal.Length ~ Sepal.Width + factor(Species), base, + family = poisson(), weights = base$w +) + +test( + vcov(est_feglm, "hc2", ssc = ssc(adj = FALSE, cluster.adj = FALSE)), + sandwich::vcovHC(est_glm, "HC2")["Sepal.Width", "Sepal.Width"] +) +test( + vcov(est_feglm, "hc3", ssc = ssc(adj = FALSE, cluster.adj = FALSE)), + sandwich::vcovHC(est_glm, "HC3")["Sepal.Width", "Sepal.Width"] +) + +# Fail when P_ii = 1 +base$Species = as.character(base$Species) +base[1, "Species"] = "foo" + +est_pii_singular = feols(Sepal.Length ~ Sepal.Width | Species, base) + +test( + vcov(est_pii_singular, "hc2"), "err" +) + + # # Checking the calls work properly # data(trade) -est_pois = femlm(Euros ~ log(dist_km)|Origin+Destination, trade) +est_pois = femlm(Euros ~ log(dist_km) | Origin + Destination, trade) se_clust = se(est_pois, se = "cluster", cluster = "Product") test(se(est_pois, cluster = trade$Product), se_clust) @@ -1756,6 +1825,11 @@ glm_probit = glm(y_bin ~ x1 + x2, family = binomial("probit"), base) feglm_probit = feglm(y_bin ~ x1 + x2, base, binomial("probit")) test(hatvalues(feglm_probit), hatvalues(glm_probit)) +# Fixed effects +fm_fe = lm(y ~ x1 + x2 + factor(species), base) +ffm_fe = feols(y ~ x1 + x2 | species, base) +test(hatvalues(ffm_fe), hatvalues(fm_fe)) + #### #### sandwich #### @@ -2135,6 +2209,145 @@ m_lhs_rhs_fixef = model.matrix(res_mult, type = c("lhs", "iv.rhs2", "fixef"), na test(names(m_lhs_rhs_fixef), c("y1", "fit_x2", "x1", "species")) +#### +#### sparse_model_matrix #### +#### + +base = iris +names(base) = c("y1", "x1", "x2", "x3", "species") +base$y2 = 10 + rnorm(150) + 0.5 * base$x1 +base$x4 = rnorm(150) + 0.5 * base$y1 +base$fe2 = rep(letters[1:15], 10) +base$fe2[50:51] = NA +base$y2[base$fe2 == "a" & !is.na(base$fe2)] = 0 +base$x2[1:5] = NA +base$x3[6] = NA +base$fe3 = rep(letters[1:10], 15) +base$id = rep(1:15, each = 10) +base$time = rep(1:10, 15) + +base_bis = base[1:50, ] +base_bis$id = rep(1:5, each = 10) +base_bis$time = rep(1:10, 5) + + +res = feols(y1 ~ x1 + x2 + x3, base) +sm1 = sparse_model_matrix( + res, type = "lhs", + na.rm = TRUE +) +test(length(sm1), res$nobs) + +sm1_na = sparse_model_matrix( + res, data = base, + type = "lhs", + na.rm = FALSE +) +test(length(sm1_na), res$nobs_origin) +test(max(abs(sm1_na - base$y1), na.rm = TRUE), 0) + + +sm2 = sparse_model_matrix( + res, type = c("lhs", "rhs"), data = base, + combine = FALSE, na.rm = FALSE +) +y = sm2[["lhs"]] +X = sm2[["rhs"]] +obs_rm = res$obs_selection$obsRemoved +res_bis = solve( + crossprod(as.matrix(X)[obs_rm, ]), + crossprod(as.matrix(X)[obs_rm, ], as.matrix(y)[obs_rm, ]) +) +test(as.numeric(res_bis), res$coefficients) + +# No constant +res_nocons = feols(mpg ~ 0 + i(cyl), mtcars) +sm_nocons = sparse_model_matrix(res_nocons, type = "rhs") +test("(Intercept)" %in% colnames(sm_nocons), FALSE) + +# Lag +res_lag = feols(y1 ~ l(x1, 1:2) + x2 + x3, base, panel = ~id + time) +sm_lag = sparse_model_matrix(res_lag, type = "rhs") +test(nrow(sm_lag), nobs(res_lag)) + + +# TODO: Fix poly and newdata +# With poly +# res_poly = feols(y1 ~ poly(x1, 2), base) +# works +# res_poly = feols(y1 ~ x1, base) +# sm_poly_old = sparse_model_matrix(res_poly) +# sm_poly_new = sparse_model_matrix(res_poly, data = base_bis) +# test(sm_poly_old[1:50, 3], sm_poly_new[, 3]) + + +# Interacted fixef +res = feols(y1 ~ x1 + x2 + x3 | species^fe2, base) +sm_ife = sparse_model_matrix(res, data = base, type = "fixef", collin.rm = FALSE) + +# fixef +res = feols(y1 ~ x1 + x2 + x3 | species + fe2, base) +sm_fe = sparse_model_matrix(res, data = base, type = "fixef") +test(ncol(sm_fe), 17) + +sm_fe_no_collin_rm = sparse_model_matrix(res, data = base, type = "fixef", collin.rm = FALSE) +test(ncol(sm_fe_no_collin_rm), 18) + +sm_fe_base_bis = sparse_model_matrix(res, data = base_bis, type = "fixef") + + +# Time-varying slopes +res_slopes = feols(y1 ~ x1 + x2 + x3 | fe2[I(x2+1)], data = base[7:48, ]) +sm_slopes = sparse_model_matrix(res_slopes, type = "fixef") + +# IV +res_iv = feols(y1 ~ x1 | x2 ~ x3, base) + +sm_rhs1 = sparse_model_matrix(res_iv, type = "iv.rhs1") +test(colnames(sm_rhs1)[-1], c("x3", "x1")) + +sm_rhs2 = sparse_model_matrix(res_iv, type = "iv.rhs2") +test(colnames(sm_rhs2)[-1], c("fit_x2", "x1")) + +sm_endo = sparse_model_matrix(res_iv, type = "iv.endo") +test(colnames(sm_endo), "x2") + +sm_exo = sparse_model_matrix(res_iv, type = "iv.exo") +test(colnames(sm_exo)[-1], "x1") + +sm_inst = sparse_model_matrix(res_iv, type = "iv.inst") +test(colnames(sm_inst), "x3") + +# several +res_mult = feols(y1 ~ x1 | species | x2 ~ x3, base) + +sm_lhs_rhs_fixef = sparse_model_matrix(res_mult, type = c("lhs", "iv.rhs2", "fixef")) +test(colnames(sm_lhs_rhs_fixef), c("y1", "fit_x2", "x1", "species::setosa", "species::versicolor", "species::virginica")) + + +# non-linear model +res_pois = fepois(Sepal.Length ~ Sepal.Width + Petal.Length | Species, iris) +sp_pois = sparse_model_matrix(res_pois, data = iris) + + +# make sure names are correct +test_col_names <- function(est) { + m <- fixest::sparse_model_matrix(est, collin.rm = FALSE) + q <- test(all(names(coef(est)) %in% colnames(m)), TRUE) + return(invisible(NULL)) +} +test_col_names(feols(mpg ~ i(am) + disp | vs, mtcars)) +test_col_names(feols(mpg ~ i(am, i.cyl) + disp | vs, mtcars)) +test_col_names(feols(mpg ~ i(am, hp) + disp | vs, mtcars)) +test_col_names(feols(mpg ~ i(am):hp + disp | vs, mtcars)) +test_col_names(feols(mpg ~ factor(am) + disp | vs, mtcars)) +test_col_names(feols(mpg ~ factor(am):factor(cyl) + disp | vs, mtcars, notes = FALSE)) +test_col_names(feols(mpg ~ factor(am):hp + disp | vs, mtcars)) +test_col_names(feols(mpg ~ poly(hp, degree = 2) + disp | vs, mtcars)) + + + + #### #### fitstat #### ####