From c47f2d8a1be44814e2d74d8c7173f28c0aa445f0 Mon Sep 17 00:00:00 2001
From: merliseclyde <clyde@stat.duke.edu>
Date: Mon, 9 Nov 2020 23:26:10 -0500
Subject: [PATCH] * missing `mean.x` in GLMs caused  `predict` to error when
 `se.fit = TRUE` [Issue #50](https://github.com/merliseclyde/BAS/issues/50)

* Prediction under the MPM failed with GLMs [Issue #51](https://github.com/merliseclyde/BAS/issues/51)

close #50
close #51

issue #52 regarding SE still open!
---
 R/bas_glm.R                    | 22 ++++++----
 R/bas_lm.R                     |  2 +
 R/predict.R                    | 75 ++++++++++++++++++++++++++++------
 man/bas.glm.Rd                 | 22 ++++++----
 man/bas.lm.Rd                  |  2 +
 man/fitted.Rd                  |  1 +
 man/variable.names.pred.bas.Rd |  1 +
 tests/testthat/test-predict.R  | 68 ++++++++++++++++++++++++------
 8 files changed, 154 insertions(+), 39 deletions(-)

diff --git a/R/bas_glm.R b/R/bas_glm.R
index 3458f53f..0361edd3 100644
--- a/R/bas_glm.R
+++ b/R/bas_glm.R
@@ -159,18 +159,26 @@ normalize.initprobs.glm <- function(initprobs, glm.obj) {
 #' \item{priorprobs}{the prior probabilities of the models selected}
 #' \item{logmarg}{values of the log of the marginal likelihood for the models}
 #' \item{n.vars}{total number of independent variables in the full model,
-#' including the intercept} \item{size}{the number of independent variables in
-#' each of the models, includes the intercept} \item{which}{a list of lists
+#' including the intercept}
+#' \item{size}{the number of independent variables in
+#' each of the models, includes the intercept}
+#' \item{which}{a list of lists
 #' with one list per model with variables that are included in the model}
 #' \item{probne0}{the posterior probability that each variable is non-zero}
 #' \item{coefficients}{list of lists with one list per model giving the GLM
-#' estimate of each (nonzero) coefficient for each model.} \item{se}{list of
+#' estimate of each (nonzero) coefficient for each model.}
+#' \item{se}{list of
 #' lists with one list per model giving the GLM standard error of each
-#' coefficient for each model} \item{deviance}{the GLM deviance for each model}
+#' coefficient for each model}
+#' \item{deviance}{the GLM deviance for each model}
 #' \item{modelprior}{the prior distribution on models that created the BMA
-#' object} \item{Q}{the Q statistic for each model used in the marginal
-#' likelihood approximation} \item{Y}{response} \item{X}{matrix of predictors}
-#' \item{family}{family object from the original call} \item{betaprior}{family
+#' object}
+#' \item{Q}{the Q statistic for each model used in the marginal
+#' likelihood approximation}
+#' \item{Y}{response}
+#' \item{X}{matrix of predictors}
+#' \item{family}{family object from the original call}
+#' \item{betaprior}{family
 #' object for prior on coefficients, including hyperparameters}
 #' \item{modelprior}{family object for prior on the models}
 #' \item{include.always}{indices of variables that are forced into the model}
diff --git a/R/bas_lm.R b/R/bas_lm.R
index 294c856f..e2af5e4c 100644
--- a/R/bas_lm.R
+++ b/R/bas_lm.R
@@ -302,6 +302,7 @@ is.solaris<-function() {
 #' \item{X}{matrix of predictors}
 #' \item{mean.x}{vector of means for each column of X (used in
 #' \code{\link{predict.bas}})}
+#' \link{weights} used in model fitting
 #' \item{include.always}{indices of variables that are forced into the model}
 #'
 #' The function \code{\link{summary.bas}}, is used to print a summary of the
@@ -844,6 +845,7 @@ bas.lm <- function(formula,
   result$xlevels <- .getXlevels(mt, mf)
   result$terms <- mt
   result$model <- mf
+  result$weights <- weights
 
   class(result) <- c("bas")
   if (prior == "EB-global") {
diff --git a/R/predict.R b/R/predict.R
index 7d8098c4..9ce4e679 100644
--- a/R/predict.R
+++ b/R/predict.R
@@ -230,10 +230,13 @@ predict.bas <- function(object,
                         estimator = "BMA",
                         na.action = na.pass,
                         ...) {
-  if (!(estimator %in% c("BMA", "HPM", "MPM", "BPM"))) {
+  if (!(estimator %in% c("BMA", "HPM", "MPM", "BPM", "MPMold"))) {
     stop("Estimator must be one of 'BMA', 'BPM', 'HPM', or 'MPM'.")
   }
 
+  if (estimator == "MPM") {
+    object = extract_MPM(object)
+  }
   tt <- terms(object)
 
   if (missing(newdata) || is.null(newdata)) {
@@ -277,7 +280,7 @@ predict.bas <- function(object,
   df <- object$df
 
 
-  if (estimator == "MPM") {
+  if (estimator == "MPMold") {
     nvar <- object$n.vars - 1
     bestmodel <- (0:nvar)[object$probne0 > .5]
     newX <- cbind(1, newdata)
@@ -415,12 +418,11 @@ predict.bas <- function(object,
 
   if (se.fit) {
     if (estimator != "BMA") {
-      se <- .se.fit(fit, newdata, object, insample)
+      se <- .se.fit(fit, newdata, object, insample, type)
     }
     else {
       se <- .se.bma(
-        Ybma, newdata, Ypred, best, object,
-        insample
+        Ybma, newdata, Ypred, best, object, insample, type
       )
     }
   }
@@ -521,8 +523,12 @@ fitted.bas <- function(object,
   if (is.null(top)) {
     top <- nmodels
   }
+
+  if (estimator == "MPM") { top = 1 }
+
+
   if (estimator == "HPM") {
-    yhat <- predict(
+     yhat <- predict(
       object,
       newdata = NULL,
       top = 1,
@@ -530,6 +536,7 @@ fitted.bas <- function(object,
       na.action = na.action
     )$fit
   }
+
   if (estimator == "BMA") {
     yhat <- predict(
       object,
@@ -548,6 +555,15 @@ fitted.bas <- function(object,
       na.action = na.action
     )$fit
   }
+  if (estimator == "MPMold") {
+    yhat <- predict(
+      object,
+      newdata = NULL,
+      top = top,
+      estimator = "MPMold", type = type,
+      na.action = na.action
+    )$fit
+  }
   if (estimator == "BPM") {
     yhat <- predict(
       object,
@@ -558,10 +574,19 @@ fitted.bas <- function(object,
     )$fit
   }
 
+ yhat <- predict(
+    object,
+    newdata = NULL,
+    top = top,
+    estimator = estimator,
+    type = type,
+    na.action = na.action
+  )$fit
+
   return(as.vector(yhat))
 }
 
-.se.fit <- function(yhat, X, object, insample) {
+.se.fit <- function(yhat, X, object, insample, type) {
   n <- object$n
   model <- attr(yhat, "model")
   best <- attr(yhat, "best")
@@ -569,7 +594,7 @@ fitted.bas <- function(object,
   df <- object$df[best]
 
   mean.x = object$mean.x  # glms don't have centered X for intercept so need t
-  # to center X and newX to get the right hat values with orthogonal X
+  # to center X and newX to get the right hat values with weights
   if (is.null(mean.x)) {
     mean.x =colMeans(object$X[,-1])
     X = sweep(X, 2, mean.x)
@@ -578,9 +603,33 @@ fitted.bas <- function(object,
 
 
   shrinkage <- object$shrinkage[best]
+
   if (insample) {
-    xiXTXxiT <- hat(object$X[, model + 1]) - 1 / n
-  } else {
+
+    if (!is.null(object$family$family)) {
+        if (type == 'link') {
+          mu.eta <- object$family$mu.eta(as.vector(yhat))
+          weights <- mu.eta^2/object$family$variance(object$family$linkinv(yhat))
+        }
+        else {
+          mu.eta <-  object$family$mu.eta(object$family$link(as.vector(yhat)))
+          weights <-  mu.eta^2/object$family$variance(as.vector(yhat))
+        }
+      }
+    else {
+        if (!is.null(object$weights)) {
+            weights <- object$weights
+        }
+        else {
+              weights = rep(1, object$n)
+        }
+    }
+    # browser()  FIX issue #52
+    xiXTXxiT <- hat(diag(sqrt(weights)) %*% object$X[, model + 1])/weights - 1 / sum(weights)
+  }
+  else {
+
+    #Fix below! FIX issue #52
     X <- cbind(1, X[, model[-1], drop = FALSE])
     oldX <- (sweep(object$X[, -1], 2, mean.x))[, model[-1]]  #center
     #    browser()
@@ -589,9 +638,9 @@ fitted.bas <- function(object,
   }
   scale_fit <- 1 / n + object$shrinkage[best] * xiXTXxiT
   if (is.null(object$family)) {
-    family <- gaussian()
+    object$family <- gaussian()
   }
-  if (eval(family)$family == "gaussian") {
+  if (object$family$family == "gaussian") {
     ssy <- var(object$Y) * (n - 1)
     bayes_mse <- ssy * (1 - shrinkage * object$R2[best]) / df
   }
@@ -607,7 +656,7 @@ fitted.bas <- function(object,
   ))
 }
 
-.se.bma <- function(fit, Xnew, Ypred, best, object, insample) {
+.se.bma <- function(fit, Xnew, Ypred, best, object, insample, type) {
   n <- object$n
 
   df <- object$df[best]
diff --git a/man/bas.glm.Rd b/man/bas.glm.Rd
index a35e08da..298411f0 100644
--- a/man/bas.glm.Rd
+++ b/man/bas.glm.Rd
@@ -164,18 +164,26 @@ components:
 \item{priorprobs}{the prior probabilities of the models selected}
 \item{logmarg}{values of the log of the marginal likelihood for the models}
 \item{n.vars}{total number of independent variables in the full model,
-including the intercept} \item{size}{the number of independent variables in
-each of the models, includes the intercept} \item{which}{a list of lists
+including the intercept}
+\item{size}{the number of independent variables in
+each of the models, includes the intercept}
+\item{which}{a list of lists
 with one list per model with variables that are included in the model}
 \item{probne0}{the posterior probability that each variable is non-zero}
 \item{coefficients}{list of lists with one list per model giving the GLM
-estimate of each (nonzero) coefficient for each model.} \item{se}{list of
+estimate of each (nonzero) coefficient for each model.}
+\item{se}{list of
 lists with one list per model giving the GLM standard error of each
-coefficient for each model} \item{deviance}{the GLM deviance for each model}
+coefficient for each model}
+\item{deviance}{the GLM deviance for each model}
 \item{modelprior}{the prior distribution on models that created the BMA
-object} \item{Q}{the Q statistic for each model used in the marginal
-likelihood approximation} \item{Y}{response} \item{X}{matrix of predictors}
-\item{family}{family object from the original call} \item{betaprior}{family
+object}
+\item{Q}{the Q statistic for each model used in the marginal
+likelihood approximation}
+\item{Y}{response}
+\item{X}{matrix of predictors}
+\item{family}{family object from the original call}
+\item{betaprior}{family
 object for prior on coefficients, including hyperparameters}
 \item{modelprior}{family object for prior on the models}
 \item{include.always}{indices of variables that are forced into the model}
diff --git a/man/bas.lm.Rd b/man/bas.lm.Rd
index ceecf8ac..08a98685 100644
--- a/man/bas.lm.Rd
+++ b/man/bas.lm.Rd
@@ -259,6 +259,7 @@ BMA object}
 \item{X}{matrix of predictors}
 \item{mean.x}{vector of means for each column of X (used in
 \code{\link{predict.bas}})}
+\link{weights} used in model fitting
 \item{include.always}{indices of variables that are forced into the model}
 
 The function \code{\link{summary.bas}}, is used to print a summary of the
@@ -442,6 +443,7 @@ Other bas methods:
 \code{\link{confint.coef.bas}()},
 \code{\link{confint.pred.bas}()},
 \code{\link{diagnostics}()},
+\code{\link{extract_MPM}()},
 \code{\link{fitted.bas}()},
 \code{\link{force.heredity.bas}()},
 \code{\link{image.bas}()},
diff --git a/man/fitted.Rd b/man/fitted.Rd
index 38d8990d..fde3ced8 100644
--- a/man/fitted.Rd
+++ b/man/fitted.Rd
@@ -84,6 +84,7 @@ Other bas methods:
 \code{\link{confint.coef.bas}()},
 \code{\link{confint.pred.bas}()},
 \code{\link{diagnostics}()},
+\code{\link{extract_MPM}()},
 \code{\link{force.heredity.bas}()},
 \code{\link{image.bas}()},
 \code{\link{plot.confint.bas}()},
diff --git a/man/variable.names.pred.bas.Rd b/man/variable.names.pred.bas.Rd
index ca0de94f..6fd743a1 100644
--- a/man/variable.names.pred.bas.Rd
+++ b/man/variable.names.pred.bas.Rd
@@ -48,6 +48,7 @@ Other bas methods:
 \code{\link{confint.coef.bas}()},
 \code{\link{confint.pred.bas}()},
 \code{\link{diagnostics}()},
+\code{\link{extract_MPM}()},
 \code{\link{fitted.bas}()},
 \code{\link{force.heredity.bas}()},
 \code{\link{image.bas}()},
diff --git a/tests/testthat/test-predict.R b/tests/testthat/test-predict.R
index b037fad7..f5d62174 100644
--- a/tests/testthat/test-predict.R
+++ b/tests/testthat/test-predict.R
@@ -36,18 +36,42 @@ test_that("predict.bas.glm", {
   pima_pred <- predict(pima_gprior,
                        estimator = "HPM",
                        se.fit = FALSE)
+  pima_top <-  predict(pima_gprior,
+                       estimator = "BMA", top=1,
+                       se.fit = TRUE)
+
+ expect_equal(pima_pred$fit, pima_top$fit, check.attributes = FALSE)
+
+})
+
+
+
+#Fixed Issue #51
+test_that("MPM and predict glm", {
+  data("Pima.tr", package="MASS")
+  data("Pima.te", package="MASS")
+  pima_gprior <- bas.glm(type ~ ., data = Pima.tr,
+                         betaprior = g.prior(g=as.numeric(nrow(Pima.tr))),
+                         family=binomial())
+  pima_MPM = extract_MPM(pima_gprior)
+
+  expect_equal(predict(pima_gprior, estimator = "MPM", se.fit = FALSE)$fit,
+               predict(pima_MPM, se.fit = FALSE)$fit,
+               check.attributes = FALSE)
+
+
+  pima_pred <- predict(pima_gprior,
+                       estimator = "MPM", type = "link",
+                       se.fit = FALSE)
+  pima_fit <-  fitted(pima_gprior,
+                       estimator = "MPM")
+
+  expect_equal(pima_pred$fit, pima_fit, check.attributes = FALSE)
 
-  # should not error
-  expect_error(predict(pima_gprior,
-                       estimator = "HPM",
-                       se.fit = TRUE))
-#  expect_null(plot(confint(pima_pred, parm = "mean")))
-#  should not error
-  expect_error( predict(hald_gprior, newdata=Pima.te, estimator = "HPM",
-                       se.fit = TRUE))
-  #expect_null(plot(confint(pima_pred)))
 })
 
+
+# Issue #52 SE's are incorrect for glms and weighted regression
 test_that("se.fit.glm", {
   data("Pima.tr", package="MASS")
   data("Pima.te", package="MASS")
@@ -57,6 +81,7 @@ pima.bic = bas.glm(type ~ ., data=Pima.tr, n.models= 2^7,
                              betaprior=bic.prior(n=200), family=binomial(),
                              modelprior=beta.binomial(1,1))
 
+fit.bic = predict(pima.bic,  se.fit = TRUE, top=1, type="link", estimator="HPM")
 pred.bic = predict(pima.bic, newdata=Pima.te, se.fit = TRUE, top=1, type="link")
 
 form = paste("type ~ ",
@@ -64,13 +89,32 @@ form = paste("type ~ ",
                      collapse = "+"))
 
 pima.glm = glm(form, data=Pima.tr, family=binomial())
+fit.glm = predict(pima.glm,  se.fit=TRUE, type='link')
 pred.glm = predict(pima.glm, newdata=Pima.te, se.fit=TRUE, type='link')
 
-expect_true(all.equal(pred.glm$fit, pred.bic$fit, check.attributes = FALSE))
+expect_true(all.equal(fit.glm$fit, fit.bic$fit, check.attributes = FALSE))
+
 
 # issue #50 in github regarding se.fit failing; debugging indicates se.fit is
 # incorrect
-# Should be expect_true
-expect_false(all.equal(pred.glm$se.fit, pred.bic$se.fit, check.attributes = FALSE))
+# Should be expect_equal
+
+expect_equal(fit.glm$se.fit,  fit.bic$se.fit, check.attributes = FALSE)
+expect_equal(pred.glm$se.fit, pred.bic$se.fit, check.attributes = FALSE)
+
+
+})
+
+# Added feature issue #53
+test_that("MPM and predict in lm", {
+  data(Hald, package="BAS")
+  hald_bic =  bas.lm(Y ~ ., data=Hald, alpha=13, prior="BIC",
+                     modelprior = uniform())
 
+  hald_MPM = extract_MPM(hald_bic)
+  expect_equal(predict(hald_bic, estimator = "MPM")$fit,
+               predict(hald_MPM)$fit, check.attributes = FALSE)
+  expect_equal(predict(hald_bic, estimator = "MPM")$fit,
+               predict(hald_bic, estimator = "MPMold")$fit,
+               check.attributes = FALSE)
 })