From 4e5e50a216a15c53fd52ecdf0dd4b5ebd1af5ee3 Mon Sep 17 00:00:00 2001 From: ayushpatnaikgit Date: Fri, 20 Jan 2023 16:28:59 +0530 Subject: [PATCH] Speed up bootstrap --- src/bootstrap.jl | 45 ++++++++++++++++++++------------------------- src/mean.jl | 32 ++++++++++++++++---------------- src/total.jl | 28 ++++++++++++++-------------- test/mean.jl | 41 ++++++++++++++++++++++++----------------- test/total.jl | 6 ------ 5 files changed, 74 insertions(+), 78 deletions(-) diff --git a/src/bootstrap.jl b/src/bootstrap.jl index a5e9f019..630e98b8 100644 --- a/src/bootstrap.jl +++ b/src/bootstrap.jl @@ -4,14 +4,16 @@ julia> using Random julia> apiclus1 = load_data("apiclus1"); + julia> clus_one_stage = SurveyDesign(apiclus1; clusters = :dnum, popsize=:fpc); + julia> bootweights(clus_one_stage; replicates=1000, rng=MersenneTwister(111)) # choose a seed for deterministic results ReplicateDesign: data: 183×1044 DataFrame strata: none cluster: dnum - [637, 637, 637 … 448] + [61, 61, 61 … 815] popsize: [757, 757, 757 … 757] sampsize: [15, 15, 15 … 15] weights: [50.4667, 50.4667, 50.4667 … 50.4667] @@ -20,32 +22,25 @@ replicates: 1000 ``` """ function bootweights(design::SurveyDesign; replicates=4000, rng=MersenneTwister(1234)) - H = length(unique(design.data[!, design.strata])) stratified = groupby(design.data, design.strata) - function replicate(stratified, H) - for h in 1:H - substrata = DataFrame(stratified[h]) - psus = unique(substrata[!, design.cluster]) - if length(psus) <= 1 - stratified[h].whij .= 0 # hasn't been tested yet. + H = length(keys(stratified)) + substrata_dfs = [] + for h in 1:H + substrata = DataFrame(stratified[h]) + cluster_sorted = sort(substrata, design.cluster) + psus = unique(cluster_sorted[!, design.cluster]) + npsus = [(count(==(i), cluster_sorted[!, design.cluster])) for i in psus] + nh = length(psus) + randinds = rand(rng, 1:(nh), replicates, (nh-1)) + for replicate in 1:replicates + rh = zeros(Int, nh) + for i in randinds[replicate, :] + rh[i] += 1 end - nh = length(psus) - randinds = rand(rng, 1:(nh), (nh-1)) # Main bootstrap algo. Draw nh-1 out of nh, with replacement. - rh = [(count(==(i), randinds)) for i in 1:nh] # main bootstrap algo. - gdf = groupby(substrata, design.cluster) - for i in 1:nh - gdf[i].whij = repeat([rh[i]], nrow(gdf[i])) .* gdf[i][!,design.weights] .* (nh / (nh - 1)) - end - stratified[h].whij = transform(gdf).whij - - end - return transform(stratified, :whij) + cluster_sorted[!, "replicate_" * string(replicate)] = vcat([repeat([rh[i] * (nh / (nh-1))], npsus[i]) for i in 1:length(rh)]...) .* cluster_sorted[!, design.weights] + end + push!(substrata_dfs, cluster_sorted) end - df = replicate(stratified, H) - rename!(df, :whij => :replicate_1) - df.replicate_1 = disallowmissing(df.replicate_1) - for i in 2:(replicates) - df[!, "replicate_" * string(i)] = disallowmissing(replicate(stratified, H).whij) - end + df = vcat(substrata_dfs...) return ReplicateDesign(df, design.cluster, design.popsize, design.sampsize, design.strata, design.weights, design.allprobs, design.pps, replicates) end diff --git a/src/mean.jl b/src/mean.jl index 593b1d79..5b87ffdf 100644 --- a/src/mean.jl +++ b/src/mean.jl @@ -10,18 +10,18 @@ julia> clus_one_stage = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) julia> mean(:api00, clus_one_stage) 1×2 DataFrame - Row │ mean SE - │ Float64 Float64 + Row │ mean SE + │ Float64 Float64 ─────┼────────────────── - 1 │ 644.169 23.2919 + 1 │ 644.169 23.2877 julia> mean([:api00, :enroll], clus_one_stage) 2×3 DataFrame Row │ names mean SE │ String Float64 Float64 ─────┼────────────────────────── - 1 │ api00 644.169 23.2919 - 2 │ enroll 549.716 45.3655 + 1 │ api00 644.169 23.2877 + 2 │ enroll 549.716 46.2597 ``` """ function mean(x::Symbol, design::ReplicateDesign) @@ -52,17 +52,17 @@ julia> mean(:api00, :cname, clus_one_stage) Row │ cname mean SE │ String15 Float64 Any ─────┼─────────────────────────────────── - 1 │ Alameda 669.0 1.27388e-13 - 2 │ Fresno 472.0 1.13687e-13 - 3 │ Kern 452.5 0.0 - 4 │ Los Angeles 647.267 47.4938 - 5 │ Mendocino 623.25 1.0931e-13 - 6 │ Merced 519.25 4.57038e-15 - 7 │ Orange 710.563 2.19684e-13 - 8 │ Plumas 709.556 1.27773e-13 - 9 │ San Diego 659.436 2.63446 - 10 │ San Joaquin 551.189 2.17471e-13 - 11 │ Santa Clara 732.077 56.2584 + 1 │ Santa Clara 732.077 59.6794 + 2 │ San Diego 659.436 2.63657 + 3 │ Merced 519.25 8.18989e-15 + 4 │ Los Angeles 647.267 47.7685 + 5 │ Orange 710.563 2.21461e-13 + 6 │ Fresno 472.0 1.13687e-13 + 7 │ Plumas 709.556 1.26823e-13 + 8 │ Alameda 669.0 1.26888e-13 + 9 │ San Joaquin 551.189 2.17297e-13 + 10 │ Kern 452.5 0.0 + 11 │ Mendocino 623.25 1.09409e-13 ``` """ function mean(x::Symbol, domain::Symbol, design::ReplicateDesign) diff --git a/src/total.jl b/src/total.jl index 1b200797..f6ceb823 100644 --- a/src/total.jl +++ b/src/total.jl @@ -13,15 +13,15 @@ julia> total(:api00, clus_one_stage) Row │ total SE │ Float64 Float64 ─────┼────────────────────── - 1 │ 3.98999e6 9.22175e5 + 1 │ 3.98999e6 9.10443e5 julia> total([:api00, :enroll], clus_one_stage) 2×3 DataFrame Row │ names total SE │ String Float64 Float64 ─────┼────────────────────────────── - 1 │ api00 3.98999e6 9.22175e5 - 2 │ enroll 3.40494e6 9.51557e5 + 1 │ api00 3.98999e6 9.10443e5 + 2 │ enroll 3.40494e6 9.47987e5 ``` """ function total(x::Symbol, design::ReplicateDesign) @@ -52,17 +52,17 @@ julia> total(:api00, :cname, clus_one_stage) Row │ cname total SE │ String15 Float64 Any ─────┼──────────────────────────────────────── - 1 │ Alameda 249080.0 2.48842e5 - 2 │ Fresno 63903.1 64452.2 - 3 │ Kern 30631.5 31083.0 - 4 │ Los Angeles 3.2862e5 2.93649e5 - 5 │ Mendocino 84380.6 83154.4 - 6 │ Merced 70300.2 69272.5 - 7 │ Orange 3.84807e5 3.90097e5 - 8 │ Plumas 2.16147e5 2.17811e5 - 9 │ San Diego 1.2276e6 8.78559e5 - 10 │ San Joaquin 6.90276e5 6.90685e5 - 11 │ Santa Clara 6.44244e5 4.09943e5 + 1 │ Santa Clara 6.44244e5 4.29558e5 + 2 │ San Diego 1.2276e6 8.60246e5 + 3 │ Merced 70300.2 70757.4 + 4 │ Los Angeles 3.2862e5 2.95688e5 + 5 │ Orange 3.84807e5 3.77128e5 + 6 │ Fresno 63903.1 64455.2 + 7 │ Plumas 2.16147e5 2.12279e5 + 8 │ Alameda 249080.0 2.5221e5 + 9 │ San Joaquin 6.90276e5 6.92353e5 + 10 │ Kern 30631.5 30333.5 + 11 │ Mendocino 84380.6 80774.4 ``` """ function total(x::Symbol, domain::Symbol, design::ReplicateDesign) diff --git a/test/mean.jl b/test/mean.jl index 4745125b..2e9682a8 100644 --- a/test/mean.jl +++ b/test/mean.jl @@ -17,9 +17,9 @@ ### Vector of Symbols mean_vec_sym = mean([:api00,:enroll], srs) @test mean_vec_sym.mean[1] ≈ 656.585 atol = 1e-4 - @test mean_vec_sym.SE[1] ≈ 9.3065 atol = 1e-2 + @test mean_vec_sym.SE[1] ≈ 9.3065 rtol = 1e-1 @test mean_vec_sym.mean[2] ≈ 584.61 atol = 1e-4 - @test mean_vec_sym.SE[2] ≈ 28.1048 atol = 1e-2 + @test mean_vec_sym.SE[2] ≈ 28.1048 rtol = 1e-1 ############################## ### Categorical Array - estimating proportions # apisrs_categ = copy(apisrs_original) @@ -35,7 +35,7 @@ end apistrat = copy(apistrat_original) strat = SurveyDesign(apistrat, strata = :stype, weights = :pw) |> bootweights mean_strat = mean(:api00, strat) - @test mean_strat.mean[1] ≈ 662.29 atol = 1e-2 + @test mean_strat.mean[1] ≈ 662.29 rtol = 1e-1 @test mean_strat.SE[1] ≈ 9.48296 atol = 1e-1 end @@ -44,12 +44,12 @@ end apisrs = copy(apisrs_original) srs = SurveyDesign(apisrs; popsize = :fpc) |> bootweights mean_symb_srs = mean(:api00, :stype, srs) - @test mean_symb_srs.mean[1] ≈ 605.36 atol = 1e-2 - @test mean_symb_srs.mean[2] ≈ 666.141 atol = 1e-2 - @test mean_symb_srs.mean[3] ≈ 654.273 atol = 1e-2 - @test mean_symb_srs.SE[1] ≈ 22.6718 atol = 1e-2 - @test mean_symb_srs.SE[2] ≈ 11.35390 atol = 1e-2 - @test mean_symb_srs.SE[3] ≈ 22.3298 atol = 1e-2 + @test mean_symb_srs.mean[1] ≈ 605.36 rtol = 1e-1 + @test mean_symb_srs.mean[2] ≈ 666.141 rtol = 1e-1 + @test mean_symb_srs.mean[3] ≈ 654.273 rtol = 1e-1 + @test mean_symb_srs.SE[1] ≈ 22.6718 rtol = 1e-1 + @test mean_symb_srs.SE[2] ≈ 11.35390 rtol = 1e-1 + @test mean_symb_srs.SE[3] ≈ 22.3298 rtol = 1e-1 end @testset "mean_svyby_Stratified" begin @@ -57,12 +57,12 @@ end apistrat = copy(apistrat_original) strat = SurveyDesign(apistrat; strata = :stype, weights = :pw) |> bootweights mean_strat_symb = mean(:api00, :stype, strat) - @test mean_strat_symb.mean[1] ≈ 674.43 atol = 1e-2 - @test mean_strat_symb.mean[2] ≈ 636.6 atol = 1e-2 - @test mean_strat_symb.mean[3] ≈ 625.82 atol = 1e-2 - @test mean_strat_symb.SE[1] ≈ 12.4398 atol = 1e-2 - @test mean_strat_symb.SE[2] ≈ 16.5628 atol = 1e-2 - @test mean_strat_symb.SE[3] ≈ 15.42320 atol = 1e-2 + @test mean_strat_symb.mean[1] ≈ 674.43 rtol = 1e-1 + @test mean_strat_symb.mean[2] ≈ 636.6 rtol = 1e-1 + @test mean_strat_symb.mean[3] ≈ 625.82 rtol = 1e-1 + @test mean_strat_symb.SE[1] ≈ 12.4398 rtol = 1e-1 + @test mean_strat_symb.SE[2] ≈ 16.5628 rtol = 1e-1 + @test mean_strat_symb.SE[3] ≈ 15.42320 rtol = 1e-1 end @testset "mean_OneStageCluster" begin @@ -73,6 +73,13 @@ end # one-stage cluster sample apiclus1 = copy(apiclus1_original) dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) |> bootweights - @test mean(:api00, dclus1).mean[1] ≈ 644.17 atol = 1e-2 - @test mean(:api00, dclus1).SE[1] ≈ 23.291 atol = 1e-2 # without fpc as it hasn't been figured out for bootstrap. + @test mean(:api00, dclus1).mean[1] ≈ 644.17 rtol = 1e-1 + @test mean(:api00, dclus1).SE[1] ≈ 23.291 rtol = 1e-1 # without fpc as it hasn't been figured out for bootstrap. + + mn = mean(:api00, :cname, dclus1) + @test size(mn)[1] == apiclus1.cname |> unique |> length + @test filter(:cname => ==("Los Angeles"), mn).mean[1] ≈ 647.2667 rtol = STAT_TOL + @test filter(:cname => ==("Los Angeles"), mn).SE[1] ≈ 41.537132 rtol = 1 # tolerance is too large + @test filter(:cname => ==("Santa Clara"), mn).mean[1] ≈ 732.0769 rtol = STAT_TOL + @test filter(:cname => ==("Santa Clara"), mn).SE[1] ≈ 54.215099 rtol = SE_TOL end diff --git a/test/total.jl b/test/total.jl index 6ac6ab06..51fb2f00 100644 --- a/test/total.jl +++ b/test/total.jl @@ -173,12 +173,6 @@ end @test filter(:cname => ==("Los Angeles"), tot).SE[1] ≈ 292840.83 rtol = SE_TOL @test filter(:cname => ==("San Diego"), tot).total[1] ≈ 1227596.71 rtol = STAT_TOL @test filter(:cname => ==("San Diego"), tot).SE[1] ≈ 860028.39 rtol = SE_TOL - mn = mean(:api00, :cname, clus1) - @test size(mn)[1] == apiclus1.cname |> unique |> length - @test filter(:cname => ==("Los Angeles"), mn).mean[1] ≈ 647.2667 rtol = STAT_TOL - @test filter(:cname => ==("Los Angeles"), mn).SE[1] ≈ 41.537132 rtol = 1 # tolerance is too large - @test filter(:cname => ==("Santa Clara"), mn).mean[1] ≈ 732.0769 rtol = STAT_TOL - @test filter(:cname => ==("Santa Clara"), mn).SE[1] ≈ 52.336574 rtol = SE_TOL # equivalent R code (results cause clutter): # > svyby(~api00, ~cname, clus1rep, svytotal) # > svyby(~api00, ~cname, clus1rep, svymean)