From c47f2d8a1be44814e2d74d8c7173f28c0aa445f0 Mon Sep 17 00:00:00 2001 From: merliseclyde Date: Mon, 9 Nov 2020 23:26:10 -0500 Subject: [PATCH] * missing `mean.x` in GLMs caused `predict` to error when `se.fit = TRUE` [Issue #50](https://github.com/merliseclyde/BAS/issues/50) * Prediction under the MPM failed with GLMs [Issue #51](https://github.com/merliseclyde/BAS/issues/51) close #50 close #51 issue #52 regarding SE still open! --- R/bas_glm.R | 22 ++++++---- R/bas_lm.R | 2 + R/predict.R | 75 ++++++++++++++++++++++++++++------ man/bas.glm.Rd | 22 ++++++---- man/bas.lm.Rd | 2 + man/fitted.Rd | 1 + man/variable.names.pred.bas.Rd | 1 + tests/testthat/test-predict.R | 68 ++++++++++++++++++++++++------ 8 files changed, 154 insertions(+), 39 deletions(-) diff --git a/R/bas_glm.R b/R/bas_glm.R index 3458f53f..0361edd3 100644 --- a/R/bas_glm.R +++ b/R/bas_glm.R @@ -159,18 +159,26 @@ normalize.initprobs.glm <- function(initprobs, glm.obj) { #' \item{priorprobs}{the prior probabilities of the models selected} #' \item{logmarg}{values of the log of the marginal likelihood for the models} #' \item{n.vars}{total number of independent variables in the full model, -#' including the intercept} \item{size}{the number of independent variables in -#' each of the models, includes the intercept} \item{which}{a list of lists +#' including the intercept} +#' \item{size}{the number of independent variables in +#' each of the models, includes the intercept} +#' \item{which}{a list of lists #' with one list per model with variables that are included in the model} #' \item{probne0}{the posterior probability that each variable is non-zero} #' \item{coefficients}{list of lists with one list per model giving the GLM -#' estimate of each (nonzero) coefficient for each model.} \item{se}{list of +#' estimate of each (nonzero) coefficient for each model.} +#' \item{se}{list of #' lists with one list per model giving the GLM standard error of each -#' coefficient for each model} \item{deviance}{the GLM deviance for each model} +#' coefficient for each model} +#' \item{deviance}{the GLM deviance for each model} #' \item{modelprior}{the prior distribution on models that created the BMA -#' object} \item{Q}{the Q statistic for each model used in the marginal -#' likelihood approximation} \item{Y}{response} \item{X}{matrix of predictors} -#' \item{family}{family object from the original call} \item{betaprior}{family +#' object} +#' \item{Q}{the Q statistic for each model used in the marginal +#' likelihood approximation} +#' \item{Y}{response} +#' \item{X}{matrix of predictors} +#' \item{family}{family object from the original call} +#' \item{betaprior}{family #' object for prior on coefficients, including hyperparameters} #' \item{modelprior}{family object for prior on the models} #' \item{include.always}{indices of variables that are forced into the model} diff --git a/R/bas_lm.R b/R/bas_lm.R index 294c856f..e2af5e4c 100644 --- a/R/bas_lm.R +++ b/R/bas_lm.R @@ -302,6 +302,7 @@ is.solaris<-function() { #' \item{X}{matrix of predictors} #' \item{mean.x}{vector of means for each column of X (used in #' \code{\link{predict.bas}})} +#' \link{weights} used in model fitting #' \item{include.always}{indices of variables that are forced into the model} #' #' The function \code{\link{summary.bas}}, is used to print a summary of the @@ -844,6 +845,7 @@ bas.lm <- function(formula, result$xlevels <- .getXlevels(mt, mf) result$terms <- mt result$model <- mf + result$weights <- weights class(result) <- c("bas") if (prior == "EB-global") { diff --git a/R/predict.R b/R/predict.R index 7d8098c4..9ce4e679 100644 --- a/R/predict.R +++ b/R/predict.R @@ -230,10 +230,13 @@ predict.bas <- function(object, estimator = "BMA", na.action = na.pass, ...) { - if (!(estimator %in% c("BMA", "HPM", "MPM", "BPM"))) { + if (!(estimator %in% c("BMA", "HPM", "MPM", "BPM", "MPMold"))) { stop("Estimator must be one of 'BMA', 'BPM', 'HPM', or 'MPM'.") } + if (estimator == "MPM") { + object = extract_MPM(object) + } tt <- terms(object) if (missing(newdata) || is.null(newdata)) { @@ -277,7 +280,7 @@ predict.bas <- function(object, df <- object$df - if (estimator == "MPM") { + if (estimator == "MPMold") { nvar <- object$n.vars - 1 bestmodel <- (0:nvar)[object$probne0 > .5] newX <- cbind(1, newdata) @@ -415,12 +418,11 @@ predict.bas <- function(object, if (se.fit) { if (estimator != "BMA") { - se <- .se.fit(fit, newdata, object, insample) + se <- .se.fit(fit, newdata, object, insample, type) } else { se <- .se.bma( - Ybma, newdata, Ypred, best, object, - insample + Ybma, newdata, Ypred, best, object, insample, type ) } } @@ -521,8 +523,12 @@ fitted.bas <- function(object, if (is.null(top)) { top <- nmodels } + + if (estimator == "MPM") { top = 1 } + + if (estimator == "HPM") { - yhat <- predict( + yhat <- predict( object, newdata = NULL, top = 1, @@ -530,6 +536,7 @@ fitted.bas <- function(object, na.action = na.action )$fit } + if (estimator == "BMA") { yhat <- predict( object, @@ -548,6 +555,15 @@ fitted.bas <- function(object, na.action = na.action )$fit } + if (estimator == "MPMold") { + yhat <- predict( + object, + newdata = NULL, + top = top, + estimator = "MPMold", type = type, + na.action = na.action + )$fit + } if (estimator == "BPM") { yhat <- predict( object, @@ -558,10 +574,19 @@ fitted.bas <- function(object, )$fit } + yhat <- predict( + object, + newdata = NULL, + top = top, + estimator = estimator, + type = type, + na.action = na.action + )$fit + return(as.vector(yhat)) } -.se.fit <- function(yhat, X, object, insample) { +.se.fit <- function(yhat, X, object, insample, type) { n <- object$n model <- attr(yhat, "model") best <- attr(yhat, "best") @@ -569,7 +594,7 @@ fitted.bas <- function(object, df <- object$df[best] mean.x = object$mean.x # glms don't have centered X for intercept so need t - # to center X and newX to get the right hat values with orthogonal X + # to center X and newX to get the right hat values with weights if (is.null(mean.x)) { mean.x =colMeans(object$X[,-1]) X = sweep(X, 2, mean.x) @@ -578,9 +603,33 @@ fitted.bas <- function(object, shrinkage <- object$shrinkage[best] + if (insample) { - xiXTXxiT <- hat(object$X[, model + 1]) - 1 / n - } else { + + if (!is.null(object$family$family)) { + if (type == 'link') { + mu.eta <- object$family$mu.eta(as.vector(yhat)) + weights <- mu.eta^2/object$family$variance(object$family$linkinv(yhat)) + } + else { + mu.eta <- object$family$mu.eta(object$family$link(as.vector(yhat))) + weights <- mu.eta^2/object$family$variance(as.vector(yhat)) + } + } + else { + if (!is.null(object$weights)) { + weights <- object$weights + } + else { + weights = rep(1, object$n) + } + } + # browser() FIX issue #52 + xiXTXxiT <- hat(diag(sqrt(weights)) %*% object$X[, model + 1])/weights - 1 / sum(weights) + } + else { + + #Fix below! FIX issue #52 X <- cbind(1, X[, model[-1], drop = FALSE]) oldX <- (sweep(object$X[, -1], 2, mean.x))[, model[-1]] #center # browser() @@ -589,9 +638,9 @@ fitted.bas <- function(object, } scale_fit <- 1 / n + object$shrinkage[best] * xiXTXxiT if (is.null(object$family)) { - family <- gaussian() + object$family <- gaussian() } - if (eval(family)$family == "gaussian") { + if (object$family$family == "gaussian") { ssy <- var(object$Y) * (n - 1) bayes_mse <- ssy * (1 - shrinkage * object$R2[best]) / df } @@ -607,7 +656,7 @@ fitted.bas <- function(object, )) } -.se.bma <- function(fit, Xnew, Ypred, best, object, insample) { +.se.bma <- function(fit, Xnew, Ypred, best, object, insample, type) { n <- object$n df <- object$df[best] diff --git a/man/bas.glm.Rd b/man/bas.glm.Rd index a35e08da..298411f0 100644 --- a/man/bas.glm.Rd +++ b/man/bas.glm.Rd @@ -164,18 +164,26 @@ components: \item{priorprobs}{the prior probabilities of the models selected} \item{logmarg}{values of the log of the marginal likelihood for the models} \item{n.vars}{total number of independent variables in the full model, -including the intercept} \item{size}{the number of independent variables in -each of the models, includes the intercept} \item{which}{a list of lists +including the intercept} +\item{size}{the number of independent variables in +each of the models, includes the intercept} +\item{which}{a list of lists with one list per model with variables that are included in the model} \item{probne0}{the posterior probability that each variable is non-zero} \item{coefficients}{list of lists with one list per model giving the GLM -estimate of each (nonzero) coefficient for each model.} \item{se}{list of +estimate of each (nonzero) coefficient for each model.} +\item{se}{list of lists with one list per model giving the GLM standard error of each -coefficient for each model} \item{deviance}{the GLM deviance for each model} +coefficient for each model} +\item{deviance}{the GLM deviance for each model} \item{modelprior}{the prior distribution on models that created the BMA -object} \item{Q}{the Q statistic for each model used in the marginal -likelihood approximation} \item{Y}{response} \item{X}{matrix of predictors} -\item{family}{family object from the original call} \item{betaprior}{family +object} +\item{Q}{the Q statistic for each model used in the marginal +likelihood approximation} +\item{Y}{response} +\item{X}{matrix of predictors} +\item{family}{family object from the original call} +\item{betaprior}{family object for prior on coefficients, including hyperparameters} \item{modelprior}{family object for prior on the models} \item{include.always}{indices of variables that are forced into the model} diff --git a/man/bas.lm.Rd b/man/bas.lm.Rd index ceecf8ac..08a98685 100644 --- a/man/bas.lm.Rd +++ b/man/bas.lm.Rd @@ -259,6 +259,7 @@ BMA object} \item{X}{matrix of predictors} \item{mean.x}{vector of means for each column of X (used in \code{\link{predict.bas}})} +\link{weights} used in model fitting \item{include.always}{indices of variables that are forced into the model} The function \code{\link{summary.bas}}, is used to print a summary of the @@ -442,6 +443,7 @@ Other bas methods: \code{\link{confint.coef.bas}()}, \code{\link{confint.pred.bas}()}, \code{\link{diagnostics}()}, +\code{\link{extract_MPM}()}, \code{\link{fitted.bas}()}, \code{\link{force.heredity.bas}()}, \code{\link{image.bas}()}, diff --git a/man/fitted.Rd b/man/fitted.Rd index 38d8990d..fde3ced8 100644 --- a/man/fitted.Rd +++ b/man/fitted.Rd @@ -84,6 +84,7 @@ Other bas methods: \code{\link{confint.coef.bas}()}, \code{\link{confint.pred.bas}()}, \code{\link{diagnostics}()}, +\code{\link{extract_MPM}()}, \code{\link{force.heredity.bas}()}, \code{\link{image.bas}()}, \code{\link{plot.confint.bas}()}, diff --git a/man/variable.names.pred.bas.Rd b/man/variable.names.pred.bas.Rd index ca0de94f..6fd743a1 100644 --- a/man/variable.names.pred.bas.Rd +++ b/man/variable.names.pred.bas.Rd @@ -48,6 +48,7 @@ Other bas methods: \code{\link{confint.coef.bas}()}, \code{\link{confint.pred.bas}()}, \code{\link{diagnostics}()}, +\code{\link{extract_MPM}()}, \code{\link{fitted.bas}()}, \code{\link{force.heredity.bas}()}, \code{\link{image.bas}()}, diff --git a/tests/testthat/test-predict.R b/tests/testthat/test-predict.R index b037fad7..f5d62174 100644 --- a/tests/testthat/test-predict.R +++ b/tests/testthat/test-predict.R @@ -36,18 +36,42 @@ test_that("predict.bas.glm", { pima_pred <- predict(pima_gprior, estimator = "HPM", se.fit = FALSE) + pima_top <- predict(pima_gprior, + estimator = "BMA", top=1, + se.fit = TRUE) + + expect_equal(pima_pred$fit, pima_top$fit, check.attributes = FALSE) + +}) + + + +#Fixed Issue #51 +test_that("MPM and predict glm", { + data("Pima.tr", package="MASS") + data("Pima.te", package="MASS") + pima_gprior <- bas.glm(type ~ ., data = Pima.tr, + betaprior = g.prior(g=as.numeric(nrow(Pima.tr))), + family=binomial()) + pima_MPM = extract_MPM(pima_gprior) + + expect_equal(predict(pima_gprior, estimator = "MPM", se.fit = FALSE)$fit, + predict(pima_MPM, se.fit = FALSE)$fit, + check.attributes = FALSE) + + + pima_pred <- predict(pima_gprior, + estimator = "MPM", type = "link", + se.fit = FALSE) + pima_fit <- fitted(pima_gprior, + estimator = "MPM") + + expect_equal(pima_pred$fit, pima_fit, check.attributes = FALSE) - # should not error - expect_error(predict(pima_gprior, - estimator = "HPM", - se.fit = TRUE)) -# expect_null(plot(confint(pima_pred, parm = "mean"))) -# should not error - expect_error( predict(hald_gprior, newdata=Pima.te, estimator = "HPM", - se.fit = TRUE)) - #expect_null(plot(confint(pima_pred))) }) + +# Issue #52 SE's are incorrect for glms and weighted regression test_that("se.fit.glm", { data("Pima.tr", package="MASS") data("Pima.te", package="MASS") @@ -57,6 +81,7 @@ pima.bic = bas.glm(type ~ ., data=Pima.tr, n.models= 2^7, betaprior=bic.prior(n=200), family=binomial(), modelprior=beta.binomial(1,1)) +fit.bic = predict(pima.bic, se.fit = TRUE, top=1, type="link", estimator="HPM") pred.bic = predict(pima.bic, newdata=Pima.te, se.fit = TRUE, top=1, type="link") form = paste("type ~ ", @@ -64,13 +89,32 @@ form = paste("type ~ ", collapse = "+")) pima.glm = glm(form, data=Pima.tr, family=binomial()) +fit.glm = predict(pima.glm, se.fit=TRUE, type='link') pred.glm = predict(pima.glm, newdata=Pima.te, se.fit=TRUE, type='link') -expect_true(all.equal(pred.glm$fit, pred.bic$fit, check.attributes = FALSE)) +expect_true(all.equal(fit.glm$fit, fit.bic$fit, check.attributes = FALSE)) + # issue #50 in github regarding se.fit failing; debugging indicates se.fit is # incorrect -# Should be expect_true -expect_false(all.equal(pred.glm$se.fit, pred.bic$se.fit, check.attributes = FALSE)) +# Should be expect_equal + +expect_equal(fit.glm$se.fit, fit.bic$se.fit, check.attributes = FALSE) +expect_equal(pred.glm$se.fit, pred.bic$se.fit, check.attributes = FALSE) + + +}) + +# Added feature issue #53 +test_that("MPM and predict in lm", { + data(Hald, package="BAS") + hald_bic = bas.lm(Y ~ ., data=Hald, alpha=13, prior="BIC", + modelprior = uniform()) + hald_MPM = extract_MPM(hald_bic) + expect_equal(predict(hald_bic, estimator = "MPM")$fit, + predict(hald_MPM)$fit, check.attributes = FALSE) + expect_equal(predict(hald_bic, estimator = "MPM")$fit, + predict(hald_bic, estimator = "MPMold")$fit, + check.attributes = FALSE) })