statdivlab · svteichman · Jun 5, 2024 · Jun 4, 2024
diff --git a/NAMESPACE b/NAMESPACE
@@ -4,6 +4,7 @@ export("%>%")
 export(gee_test)
 export(get_test_statistic)
 export(glm_test)
+export(jackknife_se)
 export(robust_score_test)
 import(geeasy)
 import(rlang)

diff --git a/R/gee_test.R b/R/gee_test.R
@@ -1,6 +1,7 @@
 #' Generalized Estimating Equations under technical replication with robust and non-robust Wald and Rao (score) tests
 #'
 #' @param use_geeasy When TRUE, uses `geeasy` for gee estimation, when FALSE uses `geepack`
+#' @param use_jack_se When TRUE uses jackknife standard errors (which take longer), when FALSE uses sandwich standard errors
 #' @param ... Arguments that you would pass to a regular `geepack::geeglm` call. Note that for now we only have functionality for Poisson tests with log link
 #'
 #' @importFrom sandwich sandwich
@@ -14,10 +15,10 @@
 #'
 #'
 #' @export
-gee_test <- function(use_geeasy = TRUE, ...) {
-
+gee_test <- function(use_geeasy = TRUE, use_jack_se = FALSE, ...) {
+  
   cl_orig <- match.call()
-  cl_orig <- call_modify(cl_orig, use_geeasy = zap())
+  cl_orig <- call_modify(cl_orig, use_geeasy = zap(), use_jack_se = zap())
 
   cl <- cl_orig
   if (use_geeasy) {
@@ -78,7 +79,13 @@ gee_test <- function(use_geeasy = TRUE, ...) {
   colnames(output)[3] <- "Robust Wald p"                ## "Pr(>|W|)"
   colnames(output)[2] <- "Robust Std Error"             ## "Std. Error"
 
-
+  # get jackknife standard errors if needed 
+  if (use_jack_se) {
+    output[, 2] <- jackknife_se(object = gee_result, 
+                                dat = eval(cl$data, envir = rlang::caller_env())[the_reorder, ],
+                                id = eval(cl$id, envir = rlang::caller_env())[the_reorder])
+  }
+
   #### run robust score tests
   pp <- nrow(output)
   robust_score_p <- rep(NA, length = pp)

diff --git a/R/jackknife_se.R b/R/jackknife_se.R
@@ -0,0 +1,37 @@
+#' Jackknife standard errors 
+#'
+#' @param object The fitted object under the alternative.
+#' @param dat The data used to fit the model
+#' @param id Observations with the same id are in the same cluster. If not included, independence between 
+#' observations is assumed.
+#'
+#' @export
+jackknife_se <- function(object, dat, id = NULL){
+  if (is.null(id)) {
+    id <- 1:nrow(dat)
+  }
+  if (!is.numeric(id)) {
+    id <- as.factor(id)
+  }
+  unique_id <- unique(id)
+  parm <- sapply(unique_id, update_model, object, dat, id)
+  parm <- t(parm)
+  parm.mean <- apply(parm, 2, mean)
+
+  parm.cent <- sapply(1:nrow(parm),
+                      function(i) {
+                        parm[i, ] - parm.mean
+                      })
+  parm.cent <- t(parm.cent) 
+
+  jack.var <- ((length(unique_id) - 1) / length(unique_id)) * t(parm.cent) %*% parm.cent
+  jack.se <- sqrt(diag(jack.var))
+  jack.se
+}
+
+update_model <- function(id_i, object, dat, id) {
+  remove_ind <- which(id == id_i)
+  dat.i <- dat[-remove_ind, ]
+  object$call <- call_modify(object$call, id = id[-remove_ind])
+  coef(stats::update(object, data = dat.i))
+}
diff --git a/man/gee_test.Rd b/man/gee_test.Rd
diff --git a/man/jackknife_se.Rd b/man/jackknife_se.Rd
diff --git a/tests/testthat/test-replicates.R b/tests/testthat/test-replicates.R
@@ -1,12 +1,12 @@
-test_that("replicates work", {
+set.seed(1)
+data(cars)
+cars2 <- cars
+cars2$wts <- log(rnorm(nrow(cars), 20))
+cars2$batch1 <- rep(LETTERS[1:5], each = 10)
+cars2$batch2 <- rep(LETTERS[6:10], 10)
+cars2$batch3 <- rep(1:5, each = 10)
 
-  set.seed(1)
-  data(cars)
-  cars2 <- cars
-  cars2$wts <- log(rnorm(nrow(cars), 20))
-  cars2$batch1 <- rep(LETTERS[1:5], each = 10)
-  cars2$batch2 <- rep(LETTERS[6:10], 10)
-  cars2$batch3 <- rep(1:5, each = 10)
+test_that("replicates work", {
 
   ### fails in test unless `cars` is loaded?
   gee_test(formula = dist ~ speed,
@@ -86,13 +86,6 @@ test_that("replicates work", {
 })
 
 test_that("geeasy and geepack give similar results", {
-  set.seed(1)
-  data(cars)
-  cars2 <- cars
-  cars2$wts <- log(rnorm(nrow(cars), 20))
-  cars2$batch1 <- rep(LETTERS[1:5], each = 10)
-  cars2$batch2 <- rep(LETTERS[6:10], 10)
-  cars2$batch3 <- rep(1:5, each = 10)
 
   geeasy_test <- gee_test(formula = dist ~ speed,
                           data = cars2,
@@ -109,3 +102,23 @@ test_that("geeasy and geepack give similar results", {
   expect_true((geeasy_test$Estimate[1] - geepack_test$Estimate[1])/geeasy_test$Estimate[1] < 0.01)
 })
 
+test_that("jackknife standard errors work", {
+  cars2_obj <- geelm(formula = dist ~ speed, data = cars2, id = batch3, 
+                      family = poisson(link = "log"), offset = wts, corstr = "exchangeable")
+
+  jack_se <- jackknife_se(cars2_obj, cars2)
+  jack_se_cluster <- jackknife_se(cars2_obj, cars2, id = cars2$batch3)
+  sand_se <- summary(cars2_obj)$coef$Std.err
+  # expect that non-clustered se's are closer to robust se's because this data has no inherent clustering
+  expect_true(sum((jack_se - sand_se)^2) < sum((jack_se_cluster - sand_se)^2))
+
+  gee_res <- gee_test(formula = dist ~ speed,
+                      data = cars2,
+                      id = batch3,
+                      family=poisson(link="log"),
+                      offset = wts,
+                      use_jack_se = TRUE)
+  # expect that gee test robust standard errors are equal to those directly from jackknife function
+  expect_true(all.equal(gee_res$`Robust Std Error`[1:2], as.vector(jack_se_cluster)))
+})
+