Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

VIF and GVIF #300

Merged
merged 20 commits into from
Sep 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,13 @@ StatsFuns = "4c63d2b9-4356-54db-8cca-17b64c39e42c"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"

[compat]
Aqua = "0.6"
Aqua = "0.7"
CategoricalArrays = "0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.10"
DataAPI = "1.1"
DataFrames = "1"
DataStructures = "0.17, 0.18"
ShiftedArrays = "1, 2"
StatsAPI = "1"
StatsAPI = "1.7"
StatsBase = "0.33.5, 0.34"
StatsFuns = "0.9, 1.0"
Tables = "0.2, 1"
Expand Down
4 changes: 2 additions & 2 deletions docs/make.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
using Documenter, StatsModels
using Documenter, StatsModels, StatsAPI

DocMeta.setdocmeta!(StatsModels, :DocTestSetup, :(using StatsModels, StatsBase); recursive=true)

Expand All @@ -15,7 +15,7 @@ makedocs(
"Temporal variables and Time Series Terms" => "temporal_terms.md",
"API documentation" => "api.md"
],
modules = [StatsModels],
modules = [StatsModels, StatsAPI],
doctestfilters = [r"([a-z]*) => \1", r"getfield\(.*##[0-9]+#[0-9]+"],
palday marked this conversation as resolved.
Show resolved Hide resolved
strict=Documenter.except(:missing_docs)
)
Expand Down
6 changes: 4 additions & 2 deletions docs/src/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,10 +74,12 @@ apply_schema

```@docs
fit
response
modelmatrix
gvif
lrtest
formula
modelmatrix
response
vif
```

### Traits
Expand Down
8 changes: 6 additions & 2 deletions src/StatsModels.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ using DataStructures
using DataAPI
using DataAPI: levels
using Printf: @sprintf
using StatsAPI: coefnames, fit, predict, dof
using StatsAPI: coefnames, dof, fit, gvif, predict, vif
using StatsFuns: chisqccdf

using SparseArrays
Expand Down Expand Up @@ -70,7 +70,10 @@ export
omitsintercept,
hasresponse,

lrtest
lrtest,

vif,
gvif

const SPECIALS = (:+, :&, :*, :~)

Expand All @@ -84,6 +87,7 @@ include("formula.jl")
include("modelframe.jl")
include("statsmodel.jl")
include("lrtest.jl")
include("vif.jl")
include("deprecated.jl")

end # module StatsModels
121 changes: 121 additions & 0 deletions src/vif.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
# putting this behind a function barrier means that the model matrix
# can potentially be freed immediately if it's large and constructed on the fly
function _find_intercept(model::RegressionModel)
modelmat = modelmatrix(model)
cols = eachcol(modelmat)
# collect is necessary for Julia 1.6
# but it's :just: an array of references to views, so shouldn't be too
# expensive
@static if VERSION < v"1.7"
cols = collect(cols)
end
return findfirst(Base.Fix1(all, isone), cols)
end

_find_intercept(form::FormulaTerm) = _find_intercept(form.rhs)
# we need these in case the RHS is a single term
_find_intercept(::AbstractTerm) = nothing
_find_intercept(::InterceptTerm{true}) = 1
_find_intercept(m::MatrixTerm) = _find_intercept(m.terms)
function _find_intercept(t::TupleTerm)
return findfirst(!isnothing ∘ _find_intercept, t)
end

# borrowed from Effects.jl
function get_matrix_term(x)
x = collect_matrix_terms(x)
x = x isa MatrixTerm ? x : first(x)
x isa MatrixTerm || throw(ArgumentError("couldn't extract matrix term from $x"))
# this is a work around for some weirdness that happens in MixedModels.jl
nalimilan marked this conversation as resolved.
Show resolved Hide resolved
x = first(x.terms) isa MatrixTerm ? only(x.terms) : x
return x
end

function StatsAPI.vif(model::RegressionModel)
vc = vcov(model)
Base.require_one_based_indexing(vc)

intercept = _find_intercept(model)
isnothing(intercept) &&
throw(ArgumentError("VIF is only defined for models with an intercept term"))

# copy just in case vc was a reference to an internal structure
vc = StatsBase.cov2cor!(copy(vc), stderror(model))
m = view(vc, axes(vc, 1) .!= intercept, axes(vc, 2) .!= intercept)
size(m, 2) > 1 ||
throw(ArgumentError("VIF not meaningful for models with only one non-intercept term"))
# NB: The correlation matrix is positive definite and hence invertible
# unless there is perfect rank deficiency, hence the warning in the docstring.
return diag(inv(Symmetric(m)))
end

"""
gvif(model::RegressionModel; scale=false)

Compute the generalized variance inflation factor (GVIF).

If `scale=true`, then each GVIF is scaled by the degrees of freedom
for (number of coefficients associated with) the predictor: ``GVIF^(1 / (2*df))``.

Returns a vector of inflation factors computed for each term except
for the intercept.
In other words, the corresponding coefficients are `termnames(m)[2:end]`.

The [GVIF](https://doi.org/10.2307/2290467)
measures the increase in the variance of a (group of) parameter's estimate in a model
with multiple parameters relative to the variance of a parameter's estimate in a
model containing only that parameter. For continuous, numerical predictors, the GVIF
is the same as the VIF, but for categorical predictors, the GVIF provides a single
number for the entire group of contrast-coded coefficients associated with a categorical
predictor.

See also [`termnames`](@ref), [`vif`](@ref).

!!! warning
This method will fail if there is (numerically) perfect multicollinearity,
i.e. rank deficiency (in the fixed effects). In that case though, the VIF
isn't particularly informative anyway.

# References

Fox, J., & Monette, G. (1992). Generalized Collinearity Diagnostics.
Journal of the American Statistical Association, 87(417), 178. doi:10.2307/2290467
"""
function StatsAPI.gvif(model::RegressionModel; scale=false)
form = formula(model)
intercept = _find_intercept(form)
isnothing(intercept) &&
throw(ArgumentError("GVIF only defined for models with an intercept term"))
vc = vcov(model)
Base.require_one_based_indexing(vc)

vc = StatsBase.cov2cor!(copy(vc), stderror(model))
m = view(vc, axes(vc, 1) .!= intercept, axes(vc, 2) .!= intercept)
size(m, 2) > 1 ||
throw(ArgumentError("GVIF not meaningful for models with only one non-intercept term"))

tn = last(termnames(model))
tn = view(tn, axes(tn, 1) .!= intercept)
trms = get_matrix_term(form.rhs).terms
# MatrixTerms.terms is a tuple or vector so always 1-based indexing
trms = trms[1:length(trms) .!= intercept]

df = width.(trms)
vals = zeros(eltype(m), length(tn))
logdetm = logdet(m)
acc = 0
for idx in axes(vals, 1)
wt = df[idx]
trm_only = [acc < i <= (acc + wt) for i in axes(m, 2)]
trm_excl = .!trm_only
vals[idx] = exp(logdet(view(m, trm_only, trm_only)) +
logdet(view(m, trm_excl, trm_excl)) -
logdetm)
acc += wt
end

if scale
vals .= vals .^ (1 ./ (2 .* df))
end
return vals
end
8 changes: 6 additions & 2 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,15 @@ my_tests = ["ambiguity.jl",
"contrasts.jl",
"extension.jl",
"traits.jl",
"protect.jl"]
"protect.jl",
"vif.jl"]

@testset "StatsModels" begin
@testset "aqua" begin
Aqua.test_all(StatsModels; ambiguities=false)
# because VIF and GVIF are defined in StatsAPI for RegressionModel,
# which is also defined there, it's flagged as piracy. But
# we're the offical implementers so it's privateering.
Aqua.test_all(StatsModels; ambiguities=false, piracy=(treat_as_own=[vif, gvif],))
end

for tf in my_tests
Expand Down
106 changes: 106 additions & 0 deletions test/vif.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
using StatsAPI: RegressionModel

struct MyRegressionModel <: RegressionModel
end

StatsAPI.modelmatrix(::MyRegressionModel) = [1 2; 3 4]
StatsAPI.vcov(::MyRegressionModel) = [1 0; 0 1]

struct MyRegressionModel2 <: RegressionModel
end

StatsAPI.modelmatrix(::MyRegressionModel2) = [1 2; 1 2]
StatsAPI.vcov(::MyRegressionModel2) = [1 0; 0 1]

struct MyRegressionModel3 <: RegressionModel
end

StatsAPI.modelmatrix(::MyRegressionModel3) = [1 2 3; 1 2 3]
StatsAPI.vcov(::MyRegressionModel3) = [1 0 0; 0 1 0; 0 0 1]

Base.@kwdef struct Duncan <: RegressionModel
coefnames::Vector{String}
coef::Vector{Float64}
stderror::Vector{Float64}
modelmatrix::Matrix{Float64}
vcov::Matrix{Float64}
formula::FormulaTerm
end

for f in [:coefnames, :coef, :stderror, :modelmatrix, :vcov]
@eval StatsAPI.$f(model::Duncan) = model.$f
end

StatsModels.formula(model::Duncan) = model.formula

@testset "VIF input checks" begin
# no intercept term
@test_throws ArgumentError vif(MyRegressionModel())

# only one non intercept term
@test_throws ArgumentError vif(MyRegressionModel2())

# vcov is identity, so the VIF is just 1
@test vif(MyRegressionModel3()) ≈ [1, 1]
end

palday marked this conversation as resolved.
Show resolved Hide resolved
# Reference values from car::vif in R:
# > library(car)
# > data(Duncan)
# > lm1 = lm(prestige ~ 1 + income + education, Duncan)
# > vif(lm1)
# income education
# 2.1049 2.1049
# > lm2 = lm(prestige ~ 1 + income + education + type, Duncan)
# > vif(lm2)
# GVIF Df GVIF^(1/(2*Df))
# income 2.209178 1 1.486330
# education 5.297584 1 2.301648
# type 5.098592 2 1.502666


@testset "GVIF and VIF are the same for continuous predictors" begin
# these are copied from a GLM fit to the car::duncan data
duncan2 = Duncan(; coefnames=["(Intercept)", "Income", "Education"],
formula=term(:Prestige) ~ InterceptTerm{true}() + ContinuousTerm(:Income, 1,1,1,1) +
ContinuousTerm(:Education, 1,1,1,1),
coef=[-6.064662922103356, 0.5987328215294941, 0.5458339094008812],
stderror=[4.271941174529124, 0.11966734981235407, 0.0982526413303999],
# we actually don't need the whole matrix -- just enough to find an intercept
modelmatrix=[1.0 62.0 86.0],
vcov=[18.2495 -0.151845 -0.150706
-0.151845 0.0143203 -0.00851855
-0.150706 -0.00851855 0.00965358])
@test vif(duncan2) ≈ [2.1049, 2.1049] atol=1e-5
# two different ways of calculating the same quantity
@test vif(duncan2) ≈ gvif(duncan2)
end

@testset "GVIF" begin
cm = StatsModels.ContrastsMatrix(DummyCoding("bc", ["bc", "prof", "wc"]), ["bc", "prof", "wc"])
duncan3 = Duncan(; coefnames=["(Intercept)", "Income", "Education", "Type: prof", "Type: wc"],
formula=term(:Prestige) ~ InterceptTerm{true}() + ContinuousTerm(:Income, 1,1,1,1) +
ContinuousTerm(:Education, 1,1,1,1) + CategoricalTerm(:Type, cm),
coef=[0.185028, 0.597546, 0.345319, 16.6575, -14.6611],
stderror=[3.71377, 0.0893553, 0.113609, 6.99301, 6.10877],
# we actually don't need the whole matrix -- just enough to find an intercept
modelmatrix=[1.0 62.0 86.0 1.0 0.0],
vcov=[13.7921 -0.115637 -0.257486 14.0947 7.9022
-0.115637 0.00798437 -0.00292449 -0.126011 -0.109049
-0.257486 -0.00292449 0.012907 -0.616651 -0.38812
14.0947 -0.126011 -0.616651 48.9021 30.2139
7.9022 -0.109049 -0.38812 30.2139 37.3171])
@test gvif(duncan3) ≈ [2.209178, 5.297584, 5.098592] atol=1e-4
@test gvif(duncan3; scale=true) ≈ [1.486330, 2.301648, 1.502666] atol=1e-5
palday marked this conversation as resolved.
Show resolved Hide resolved
end

@testset "utils" begin
int = InterceptTerm{true}()
noint = InterceptTerm{false}()
xterm = term(:x)
@test StatsModels._find_intercept(xterm) === nothing
@test StatsModels._find_intercept(int) == 1
@test StatsModels._find_intercept(noint) === nothing
@test StatsModels._find_intercept(MatrixTerm((xterm, int))) == 2
@test StatsModels.get_matrix_term(MatrixTerm(MatrixTerm((xterm, int)))) == MatrixTerm((xterm, int))
end
Loading