lrberge · kylebutts · May 17, 2023 · May 17, 2023 · May 18, 2023 · May 18, 2023
diff --git a/.github/workflows/check-standard.yml b/.github/workflows/check-standard.yml
@@ -34,13 +34,13 @@ jobs:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
 
-      - uses: r-lib/actions/setup-r@v1
+      - uses: r-lib/actions/setup-r@v2
         with:
           r-version: ${{ matrix.config.r }}
 
-      - uses: r-lib/actions/setup-pandoc@v1
+      - uses: r-lib/actions/setup-pandoc@v2
 
       - name: Query dependencies
         run: |

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -15,7 +15,7 @@ Authors@R:
              role = "ctb",
              email = "[email protected]",
              comment = c(ORCID = "0000-0001-7883-8573")))
-Imports: stats, graphics, grDevices, tools, utils, methods, numDeriv, nlme, sandwich, Rcpp(>= 1.0.5), dreamerr(>= 1.2.3)
+Imports: stats, graphics, grDevices, tools, utils, methods, numDeriv, nlme, sandwich, Rcpp(>= 1.0.5), dreamerr(>= 1.2.3), Matrix
 Suggests: knitr, rmarkdown, data.table, plm, MASS, pander, ggplot2, lfe, tinytex, pdftools
 LinkingTo: Rcpp
 Depends: R(>= 3.5.0)
@@ -28,6 +28,6 @@ URL: https://lrberge.github.io/fixest/, https://github.com/lrberge/fixest
 SystemRequirements: C++11
 VignetteBuilder: knitr
 LazyData: true
-RoxygenNote: 7.2.0
+RoxygenNote: 7.2.3
 Roxygen: list(markdown = TRUE)
 Encoding: UTF-8
diff --git a/NAMESPACE b/NAMESPACE
@@ -36,6 +36,8 @@ export(to_integer, demean, obs)
 export(as.dict)
 # setters & getters
 exportPattern("^(s|g)etFixest")
+# sparse_model_matrix
+export(sparse_model_matrix)
 
 # coeftable and co
 export(coeftable, se, pvalue, tstat)
@@ -54,7 +56,7 @@ S3method(tstat, fixest)
 S3method(tstat, fixest_multi)
 
 # VCOV funs
-export(vcov_cluster, vcov_DK, vcov_NW, vcov_conley)
+export(vcov_cluster, vcov_DK, vcov_NW, vcov_conley, vcov_hetero)
 
 # Data manipulation
 export(n_unik, osize, sample_df, fdim)

diff --git a/NEWS.md b/NEWS.md
@@ -2,6 +2,40 @@
 
 # fixest 0.11.1
 
+## Sparse Model Matrix
+
+- Add the ability to create sparse model matrices from regression objects and formula in a memory-efficient way.
+
+```R
+est = feols(mpg ~ drat | cyl, mtcars)
+
+sparse_model_matrix(est, type = "lhs")
+#> 32 x 1 sparse Matrix of class "dgCMatrix"
+#>        mpg
+#>  [1,] 21.0
+#>  [2,] 21.0
+#>  [3,] 22.8
+
+sparse_model_matrix(est, type = c("rhs", "fixef"))
+#> 32 x 4 sparse Matrix of class "dgCMatrix"
+#>       drat cyl::4 cyl::6 cyl::8
+#>  [1,] 3.90      .      1      .
+#>  [2,] 3.90      .      1      .
+
+# Or you can pass a (linear) formula
+sparse_model_matrix(mpg ~ i(vs) | gear^cyl, data = mtcars, type = c("rhs", "fixef"))
+#> 32 x 9 sparse Matrix of class "dgCMatrix"
+#>       vs::1 gear^cyl::3_4 gear^cyl::3_6 gear^cyl::3_8 gear^cyl::4_4 gear^cyl::4_6 gear^cyl::5_4 gear^cyl::5_6 gear^cyl::5_8
+#>  [1,]     .             .             .             .             .             1             .             .             .
+#>  [2,]     .             .             .             .             .             1             .             .             .
+#>  [3,]     1             .             .             .             1             .             .             .             .
+```
+
+
+- `hatvalues` now works with fixed effects. It also allows for approximate computation (via `exact` and `p` arguments). For details on the approximation, see [Kline, Saggio, and Sølvsten 2020](https://eml.berkeley.edu/~pkline/papers/KSS2020.pdf).
+
+- Add `HC2` and `HC3` options to `vcov` for `feols` and `feglm` models. With this gets added the `vcov_hetero` function.
+
 ## Documentation bug
 
 - fix bug in the help of `bin`
@@ -1021,7 +1055,7 @@ Common methods have been extended to `fixest_multi` objects.
 
   - Improve error messages. 
 
-  - `hatvalues.fixest`: now returns an error instead of a message when fixed-effects are present (it makes the interplay with `sadnwich` 'nicer').
+  - `hatvalues.fixest`: now returns an error instead of a message when fixed-effects are present (it makes the interplay with `sandwich` 'nicer').
 
 # fixest 0.8.4
 

diff --git a/R/Methods.R b/R/Methods.R
@@ -338,7 +338,7 @@ print.fixest = function(x, n, type = "table", fitstat = NULL, ...){
 #' @param forceCovariance (Advanced users.) Logical, default is `FALSE`. In the peculiar case where the obtained Hessian is not invertible (usually because of collinearity of some variables), use this option to force the covariance matrix, by using a generalized inverse of the Hessian. This can be useful to spot where possible problems come from.
 #' @param keepBounded (Advanced users -- `feNmlm` with non-linear part and bounded coefficients only.) Logical, default is `FALSE`. If `TRUE`, then the bounded coefficients (if any) are treated as unrestricted coefficients and their S.E. is computed (otherwise it is not).
 #' @param n Integer, default is 1000. Number of coefficients to display when the print method is used.
-#' @param ... Only used if the argument `.vocv` is provided and is a function: extra arguments to be passed to that function.
+#' @param ... Only used if the argument `.vcov` is provided and is a function: extra arguments to be passed to that function.
 #'
 #' @section Compatibility with \pkg{sandwich} package:
 #' The VCOVs from `sandwich` can be used with `feols`, `feglm` and `fepois` estimations. If you want to have a `sandwich` VCOV when using `summary.fixest`, you can use the argument `vcov` to specify the VCOV function to use (see examples).
@@ -3728,15 +3728,26 @@ deviance.fixest = function(object, ...){
 
 #' Hat values for `fixest` objects
 #'
-#' Computes the hat values for [`feols`] or [`feglm`] estimations. Only works when there are no fixed-effects.
+#' Computes the hat values for [`feols`] or [`feglm`] estimations. 
 #'
 #' @param model A fixest object. For instance from feols or feglm.
+#' @param exact Logical scalar, default is `TRUE`. Whether the diagonals of the projection matrix should be calculated exactly. If `FALSE`, then it will be approximated using a JLA algorithm. See details. Unless you have a very large number of observations, it is recommended to keep the default value.
+#' @param p Numeric scala, default is 1000. This is only used when `exact == FALSE`. This determined the number of bootstrap samples used to estimate the projection matrix.
 #' @param ... Not currently used.
 #'
 #' @details
 #' Hat values are not available for [`fenegbin`][fixest::femlm], [`femlm`] and [`feNmlm`] estimations.
-#'
-#' When there are fixed-effects, the hat values of the reduced form are different from the hat values of the full model. And we cannot get costlessly the hat values of the full model from the reduced form. It would require to reestimate the model with the fixed-effects as regular variables.
+#' 
+#' Hat values for generalized linear model are disussed in Belsley, Kuh and Welsch (1980), Cook and Weisberg (1982), etc.
+#' 
+#' When `exact == FALSE`, the Johnson-Lindenstrauss approximation (JLA) algorithm is used which approximates the diagonals of the projection matrix. For more precision (but longer time), increase the value of `p`. See Kline, Saggio, and Sølvsten (2020) for details.
+#' 
+#' 
+#' @references
+#' Belsley, D. A., Kuh, E. and Welsch, R. E. (1980). *Regression Diagnostics*. New York: Wiley.
+#' Cook, R. D. and Weisberg, S. (1982). *Residuals and Influence in Regression*. London: Chapman and Hall.
+#' Kline, P., Saggio R., and Sølvsten, M. (2020). *Leave‐Out Estimation of Variance Components*. Econometrica.
+#' 
 #'
 #' @return
 #' Returns a vector of the same length as the number of observations used in the estimation.
@@ -3747,46 +3758,177 @@ deviance.fixest = function(object, ...){
 #' head(hatvalues(est))
 #'
 #'
-hatvalues.fixest = function(model, ...){
-    # Only works for feglm/feols objects + no fixed-effects
-    # When there are fixed-effects the hatvalues of the reduced form is different from
-    #  the hatvalues of the full model. And we cannot get costlessly the hatvalues of the full
-    #  model from the reduced form. => we need to reestimate the model with the FEs as
-    #  regular variables.
+hatvalues.fixest = function(model, exact = TRUE, p = 1000, ...){
+    # Only works for feglm/feols objects
 
     if(isTRUE(model$lean)){
-        # LATER: recompute it
         stop("The method 'hatvalues.fixest' cannot be applied to 'lean' fixest objects. Please re-estimate with 'lean = FALSE'.")
     }
 
+    if (!is.null(model$iv)) {
+      stop("The method 'hatvalues.fixest' cannot be applied to IV models.")
+    }
+
     if(is_user_level_call()){
         validate_dots()
     }
 
     method = model$method_type
-    family = model$family
 
-    msg = "hatvalues.fixest: 'hatvalues' is not implemented for estimations with fixed-effects."
-
-    # An error is in fact nicer than a message + NA return due to the interplay with sandwich
-    if(!is.null(model$fixef_id)){
-        stop(msg)
-    }
-
-    if(method == "feols"){
-        X = model.matrix(model)
-
-        res = cpp_diag_XUtX(X, model$cov.iid / model$sigma2)
-
-    } else if(method == "feglm"){
-        XW = model.matrix(model) * sqrt(model$irls_weights)
-        res = cpp_diag_XUtX(XW, model$cov.iid)
-
-    } else {
+    if (!(method %in% c("feols", "feglm"))) {
         stop("'hatvalues' is currently not implemented for function ", method, ".")
     }
 
-    res
+    # Outline: Blockwise Formula for Projection Matrix
+    # https://en.wikipedia.org/wiki/Projection_matrix#Blockwise_formula
+    # Part 1: fixed effects
+    # Part 2: slope vars (demeaned by fixed effects)
+    # Part 3: rhs vars (demeaned by fixed effects and slope vars)
+    # Sum the three parts to get the projection matrix
+    P_ii_list = list()
+    mats = list()
+    mats$fixef = sparse_model_matrix(
+      model, type = c("fixef"), 
+      collin.rm = TRUE, na.rm = TRUE, 
+      sortCols = FALSE
+    )
+
+    # Check for slope.vars and move to seperate matrix
+    if (!is.null(model$slope_flag)) {
+      slope_var_cols = which(
+        grepl("\\[\\[", colnames(mats$fixef))
+      )
+      mats$slope_vars = mats$fixef[, slope_var_cols]
+      mats$fixef = mats$fixef[, -slope_var_cols]
+    }
+
+    # Get weights
+    if (method == "feols") {
+      weights = model$weights
+    } else {
+      weights = model$irls_weights
+    }
+
+    if (!is.null(model$fixef_vars)) {
+      f = model.matrix(model, type = "fixef")
+      f = subset(f, select = model$fixef_vars)
+
+      # Matrix of fixed effects times weights
+      if (!is.null(weights)) mats$fixef = mats$fixef * sqrt(weights)
+
+      # Demean X by FEs and slope_vars
+      if (is.null(model$onlyFixef) & method == "feols") {
+        mats$rhs = demean(model)[, -1, drop = FALSE] 
+        if (!is.null(weights)) mats$rhs = mats$rhs * sqrt(weights)
+      }
+      if (is.null(model$onlyFixef) & method == "feglm") {
+        mats$rhs = demean(
+          model.matrix(model, "rhs"),
+          f,
+          weights = weights
+        )
+        if (!is.null(weights)) mats$rhs = mats$rhs * sqrt(weights)
+      }
+
+      # Demean slope_vars by FEs
+      if (!is.null(mats$slope_vars)) {
+        mats$slope_vars = demean(
+          as.matrix(mats$slope_vars), 
+          f, 
+          weights = weights
+        ) 
+        if (!is.null(weights)) mats$slope_vars = mats$slope_vars * sqrt(weights)
+      }
+
+    } else if (is.null(model$onlyFixef)) {
+      mats$rhs = sparse_model_matrix(
+        model, type = c("rhs"), 
+        collin.rm = TRUE, na.rm = TRUE
+      )
+
+      if (!is.null(weights)) mats$rhs = mats$rhs * sqrt(weights)
+    }
+
+
+    # Caclulate P_ii's
+    if (!is.null(mats$fixef)) {
+      # https://stackoverflow.com/questions/39533785/how-to-compute-diagx-solvea-tx-efficiently-without-taking-matrix-i
+      U = Matrix::chol(Matrix::crossprod(mats$fixef))
+      Z = Matrix::solve(Matrix::t(U), Matrix::t(mats$fixef))
+      P_ii_list$fixef = Matrix::colSums(Z ^ 2)
+    }
+
+    # Note: This might fail if looking at too large of a matrix
+    if (exact == TRUE) {
+        if (!is.null(mats$slope_vars)) { 
+            Z = Matrix::solve(
+              Matrix::crossprod(mats$slope_vars), 
+              Matrix::t(mats$slope_vars)
+            )
+            # P_ii = diag(X %*% Z) = rowSums(X * Z)
+            P_ii_list$slope_vars = Matrix::rowSums(
+              mats$slope_vars * Matrix::t(Z)
+            )
+        }
+
+        if (!is.null(mats$rhs)) {
+            tXXinv = model$cov.iid 
+            if (method == "feols") tXXinv = tXXinv / model$sigma2
+
+            P_ii_list$rhs = Matrix::rowSums(
+              mats$rhs * Matrix::t(tXXinv %*% Matrix::t(mats$rhs))
+            )
+        }
+    }
+
+    # Theory: https://cs-people.bu.edu/evimaria/cs565/achlioptas.pdf
+    if (exact == FALSE) {
+      n = model$nobs
+
+      if (!is.null(mats$slope_vars)) { 
+        P_ii_list$slope_vars = rep(0, n)
+        tSS = Matrix::crossprod(mats$slope_vars)
+        tS = Matrix::t(mats$slope_vars)
+      }
+      if (!is.null(mats$rhs)) { 
+        P_ii_list$rhs = rep(0, n)
+        tXX = Matrix::crossprod(mats$rhs)
+        tX = Matrix::t(mats$rhs)
+      }
+
+      for (j in 1:p) {
+        # Rademacher Weights (-1, 1)
+        Rp_j <- matrix(
+          (stats::runif(n) > 0.5) * 2 - 1,
+          # Uncomment for speed-up
+          # rademacher::sample_rademacher(n), 
+          nrow = 1, ncol = n
+        )
+
+
+        if (!is.null(mats$slope_vars)) {
+          tS_Rp_j <- Matrix::tcrossprod(tS, Rp_j)
+
+          Zj <- Matrix::solve(tSS, tS_Rp_j)
+          P_ii_list$slope_vars = 
+            P_ii_list$slope_vars + 
+            as.numeric(mats$slope_vars %*% Zj)^2 / p
+        }
+
+        if (!is.null(mats$rhs)) { 
+          tX_Rp_j <- Matrix::tcrossprod(tX, Rp_j)
+
+          Zj <- Matrix::solve(tXX, tX_Rp_j)
+          P_ii_list$rhs = 
+            P_ii_list$rhs + 
+            as.numeric(mats$rhs %*% Zj)^2 / p
+        }
+      }
+    }
+
+    P_ii = Reduce("+", P_ii_list)
+
+    return(P_ii)
 }
 
 ####

diff --git a/R/MiscFuns.R b/R/MiscFuns.R
@@ -1384,7 +1384,7 @@ i = function(factor_var, var, ref, keep, bin, ref2, keep2, bin2, ...){
         items_name = items
     }
 
-    if(FROM_FIXEST){
+    if(FROM_FIXEST || is_sparse){
         # Pour avoir des jolis noms c'est un vrai gloubiboulga,
         # mais j'ai pas trouve plus simple...
         if(IS_INTER_FACTOR){
@@ -1421,6 +1421,10 @@ i = function(factor_var, var, ref, keep, bin, ref2, keep2, bin2, ...){
         fe_colid = to_integer(fe_num[valid_row], sorted = TRUE)
 
         values = if(length(var) == 1) rep(1, length(valid_row)) else var
+
+        # Clean names
+        col_names = sub("^.*__CLEAN__", "", col_names)
+
         res = list(rowid = which(valid_row), values = values,
                    colid = fe_colid, col_names = col_names)
 
@@ -3694,7 +3698,6 @@ fixest_model_matrix = function(fml, data, fake_intercept = FALSE, i_noref = FALS
     } else {
         linear.mat = prepare_matrix(fml, data, fake_intercept)
     }
-
     if(any(grepl("__CLEAN__", colnames(linear.mat), fixed = TRUE))){
         new_names = clean_interact_names(colnames(linear.mat))