From 545bc497561363e754b05973c57e8d1c0f79b023 Mon Sep 17 00:00:00 2001 From: ayushpatnaikgit Date: Fri, 16 Dec 2022 22:59:15 +0530 Subject: [PATCH] Add bootstrap estimation for cluster sampling. --- Project.toml | 1 + docs/Project.toml | 2 ++ docs/make.jl | 1 + src/Survey.jl | 3 +++ src/bootstrap.jl | 36 ++++++++++++++++++++++++++++++++++++ test/Project.toml | 2 ++ test/bootstrap.jl | 10 ++++++++++ test/runtests.jl | 3 ++- 8 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 src/bootstrap.jl create mode 100644 test/bootstrap.jl diff --git a/Project.toml b/Project.toml index 592127e6..2659ebc2 100644 --- a/Project.toml +++ b/Project.toml @@ -10,6 +10,7 @@ CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" diff --git a/docs/Project.toml b/docs/Project.toml index 1d2a187a..76d580a7 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,3 +1,5 @@ [deps] Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" Survey = "c1a98b4d-6cd2-47ec-b9e9-69b59c35373c" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" diff --git a/docs/make.jl b/docs/make.jl index df10b794..c79915e1 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,5 +1,6 @@ using Survey using Documenter +using Random DocMeta.setdocmeta!(Survey, :DocTestSetup, :(using Survey); recursive=true) diff --git a/src/Survey.jl b/src/Survey.jl index 095ae659..adebc18a 100644 --- a/src/Survey.jl +++ b/src/Survey.jl @@ -10,6 +10,7 @@ using LinearAlgebra using CairoMakie using AlgebraOfGraphics using CategoricalArrays +using Random include("SurveyDesign.jl") include("mean.jl") @@ -21,6 +22,7 @@ include("plot.jl") include("dimnames.jl") include("boxplot.jl") include("show.jl") +include("bootstrap.jl") export load_data export AbstractSurveyDesign, SimpleRandomSample, StratifiedSample @@ -30,5 +32,6 @@ export mean, total, quantile export plot export hist, sturges, freedman_diaconis export boxplot +export bootstrap end \ No newline at end of file diff --git a/src/bootstrap.jl b/src/bootstrap.jl new file mode 100644 index 00000000..188a46c0 --- /dev/null +++ b/src/bootstrap.jl @@ -0,0 +1,36 @@ +""" +```jldoctest +julia> using Survey, Random, StatsBase; + +julia> apiclus1 = load_data("apiclus1"); + +julia> dclus1 = OneStageClusterSample(apiclus1, :dnum, :fpc); + +julia> rng = MersenneTwister(111); + +julia> func = wsum; + +julia> bootstrap(:api00, dclus1, func; replicates=1000, rng) +1×2 DataFrame + Row │ statistic SE + │ Float64 Float64 +─────┼────────────────────── + 1 │ 5.94916e6 1.36593e6 + +``` +""" +function bootstrap(x::Symbol, design::OneStageClusterSample, func = wsum; replicates = 100, rng = MersenneTwister(1234)) + gdf = groupby(design.data, design.cluster) + psus = unique(design.data[!, design.cluster]) + nh = length(psus) + X = func(design.data[:, x], design.data.weights) + Xt = Array{Float64, 1}(undef, replicates) + for i in 1:replicates + selected_psus = psus[rand(rng, 1:nh, (nh-1))] # simple random sample of PSUs, with replacement. Select (nh-1) out of nh + xhij = (reduce(vcat, [gdf[(i,)][!, x] for i in selected_psus])) + whij = (reduce(vcat, [gdf[(i,)].weights * (nh / (nh - 1)) for i in selected_psus])) + Xt[i] = func(xhij, whij) + end + variance = sum((Xt .- X).^2) / replicates + return DataFrame(statistic = X, SE = sqrt(variance)) +end diff --git a/test/Project.toml b/test/Project.toml index 62dd5dde..0c99cc27 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,3 +1,5 @@ [deps] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" diff --git a/test/bootstrap.jl b/test/bootstrap.jl new file mode 100644 index 00000000..263a5dd7 --- /dev/null +++ b/test/bootstrap.jl @@ -0,0 +1,10 @@ +using Random, StatsBase +apiclus1 = load_data("apiclus1") +dclus1 = OneStageClusterSample(apiclus1, :dnum, :fpc); +rng = MersenneTwister(111); +func = wsum; +est = bootstrap(:api00, dclus1, func; replicates=1000, rng) +@testset "bootstrap.jl" begin + @test est.SE[1] ≈ 1.365925776009e6 + @test est.statistic[1] ≈ 5.9491620666e6 +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 00e850b5..c0a7bd7d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -16,4 +16,5 @@ include("mean.jl") include("dimnames.jl") include("plot.jl") include("hist.jl") -include("boxplot.jl") \ No newline at end of file +include("boxplot.jl") +include("bootstrap.jl") \ No newline at end of file