Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Directly store replicate weights #168

Merged
merged 11 commits into from
Jan 10, 2023
1 change: 1 addition & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
Expand Down
1 change: 1 addition & 0 deletions src/Survey.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ using CairoMakie
using AlgebraOfGraphics
using CategoricalArrays
using Random
using Missings

include("SurveyDesign.jl")
include("bootstrap.jl")
Expand Down
41 changes: 31 additions & 10 deletions src/bootstrap.jl
Original file line number Diff line number Diff line change
Expand Up @@ -26,27 +26,48 @@ function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwis
H = length(unique(design.data[!, design.strata]))
stratified = groupby(design.data, design.strata)
function replicate(stratified, H)
for j in 1:H
substrata = DataFrame(stratified[j])
for h in 1:H
substrata = DataFrame(stratified[h])
psus = unique(substrata[!, design.cluster])
if length(psus) == 1
return DataFrame(statistic = X, SE = 0)
# @show psus
if length(psus) <= 1
return DataFrame(statistic = X, SE = 0) # bug!
end
nh = length(psus)
randinds = rand(rng, 1:(nh), (nh-1)) # Main bootstrap algo. Draw nh-1 out of nh, with replacement.
rh = [(count(==(i), randinds)) for i in 1:nh] # main bootstrap algo.
gdf = groupby(substrata, design.cluster)
# @show keys(gdf)
for i in 1:nh
gdf[i].rh = repeat([rh[i]], nrow(gdf[i]))
end
stratified[j].rh = DataFrame(gdf).rh
gdf[i].whij = repeat([rh[i]], nrow(gdf[i])) .* gdf[i].weights .* (nh / (nh - 1))
end
stratified[h].whij = transform(gdf).whij

end
return DataFrame(stratified)
return transform(stratified, :whij)
end
df = replicate(stratified, H)
rename!(df,:rh => :replicate_1)
rename!(df,:whij => :replicate_1)
df.replicate_1 = disallowmissing(df.replicate_1)
for i in 2:(replicates)
df[!, "replicate_"*string(i)] = Float64.(replicate(stratified, H).rh)
df[!, "replicate_"*string(i)] = disallowmissing(replicate(stratified, H).whij)
end
return ReplicateDesign(df, design.cluster, design.popsize, design.sampsize, design.strata, design.pps, replicates)
end

function bootstrap(x::Symbol, design::SurveyDesign, func = wsum; replicates = 100, rng = MersenneTwister(1234))
gdf = groupby(design.data, design.cluster)
psus = unique(design.data[!, design.cluster])
nh = length(psus)
X = func(design.data[:, x], design.data.weights)
Xt = Array{Float64, 1}(undef, replicates)
for i in 1:replicates
selected_psus = psus[rand(rng, 1:nh, (nh-1))] # simple random sample of PSUs, with replacement. Select (nh-1) out of nh
xhij = (reduce(vcat, [gdf[(i,)][!, x] for i in selected_psus]))
whij = (reduce(vcat, [gdf[(i,)].weights * (nh / (nh - 1)) for i in selected_psus]))
Xt[i] = func(xhij, whij)
end
@show Xt
variance = sum((Xt .- X).^2) / replicates
return DataFrame(statistic = X, SE = sqrt(variance))
end
2 changes: 1 addition & 1 deletion src/by.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ function bydomain(x::Symbol, domain::Symbol, design::ReplicateDesign, func::Func
X = combine(gdf, [x, :weights] => ((a, b) -> func(a, weights(b))) => :statistic)
Xt_mat = Array{Float64, 2}(undef, (nd, design.replicates))
for i in 1:design.replicates
Xt_mat[:, i] = combine(gdf, [x, :weights, Symbol("replicate_"*string(i))] => ((a, b, c) -> func(a, weights(b .* c))) => :statistic).statistic
Xt_mat[:, i] = combine(gdf, [x, Symbol("replicate_"*string(i))] => ((a, c) -> func(a, weights(c))) => :statistic).statistic
end
ses = []
for i in 1:nd
Expand Down
12 changes: 6 additions & 6 deletions src/mean.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,21 @@ julia> using Survey, Random, StatsBase;

julia> apiclus1 = load_data("apiclus1");

julia> dclus1 = SurveyDesign(apiclus1, :dnum, :fpc);
julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw);

julia> bclus1 = bootweights(apiclus1; replicates = 1000)
julia> bclus1 = bootweights(dclus1; replicates = 1000)

julia> mean(:api00, bclus1)
1×2 DataFrame
Row │ mean SE
│ Float64 Float64
─────┼──────────────────
1 │ 644.169 23.0897
1 │ 644.169 23.7208
```
"""
function mean(x::Symbol, design::ReplicateDesign)
X = mean(design.data[!, x], weights(design.data.weights))
Xt = [mean(design.data[!, x], weights(design.data.weights .* design.data[! , "replicate_"*string(i)])) for i in 1:design.replicates]
Xt = [mean(design.data[!, x], weights(design.data[! , "replicate_"*string(i)])) for i in 1:design.replicates]
variance = sum((Xt .- X).^2) / design.replicates
DataFrame(mean = X, SE = sqrt(variance))
end
Expand All @@ -28,9 +28,9 @@ julia> using Survey, Random, StatsBase;

julia> apiclus1 = load_data("apiclus1");

julia> dclus1 = SurveyDesign(apiclus1, :dnum, :fpc);
julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw);

julia> bclus1 = bootweights(apiclus1; replicates = 1000)
julia> bclus1 = bootweights(dclus1; replicates = 1000)

julia> mean(:api00, :cname, bclus1) |> print
38×3 DataFrame
Expand Down
2 changes: 1 addition & 1 deletion src/total.jl
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ julia> total(:api00, bclus1)
"""
function total(x::Symbol, design::ReplicateDesign)
X = wsum(design.data[!, x], weights(design.data.weights))
Xt = [wsum(design.data[!, x], weights(design.data.weights .* design.data[! , "replicate_"*string(i)])) for i in 1:design.replicates]
Xt = [wsum(design.data[!, x], weights(design.data[! , "replicate_"*string(i)])) for i in 1:design.replicates]
variance = sum((Xt .- X).^2) / design.replicates
DataFrame(total = X, SE = sqrt(variance))
end
Expand Down