From f9efaea9f7314ae4ed61ff0aec0b9a0c40c11c3a Mon Sep 17 00:00:00 2001 From: ayushpatnaikgit Date: Wed, 4 Jan 2023 21:19:10 +0530 Subject: [PATCH 01/11] Fix bug in total. --- Project.toml | 1 + src/Survey.jl | 1 + src/bootstrap.jl | 41 +++++++++++++++++++++++++++++++---------- src/by.jl | 2 +- src/mean.jl | 12 ++++++------ src/total.jl | 2 +- 6 files changed, 41 insertions(+), 18 deletions(-) diff --git a/Project.toml b/Project.toml index 2659ebc2..f288d231 100644 --- a/Project.toml +++ b/Project.toml @@ -10,6 +10,7 @@ CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" diff --git a/src/Survey.jl b/src/Survey.jl index f6d3d030..fee13d6d 100644 --- a/src/Survey.jl +++ b/src/Survey.jl @@ -11,6 +11,7 @@ using CairoMakie using AlgebraOfGraphics using CategoricalArrays using Random +using Missings include("SurveyDesign.jl") include("bootstrap.jl") diff --git a/src/bootstrap.jl b/src/bootstrap.jl index 70df8a81..a28fabe3 100644 --- a/src/bootstrap.jl +++ b/src/bootstrap.jl @@ -26,27 +26,48 @@ function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwis H = length(unique(design.data[!, design.strata])) stratified = groupby(design.data, design.strata) function replicate(stratified, H) - for j in 1:H - substrata = DataFrame(stratified[j]) + for h in 1:H + substrata = DataFrame(stratified[h]) psus = unique(substrata[!, design.cluster]) - if length(psus) == 1 - return DataFrame(statistic = X, SE = 0) + # @show psus + if length(psus) <= 1 + return DataFrame(statistic = X, SE = 0) # bug! end nh = length(psus) randinds = rand(rng, 1:(nh), (nh-1)) # Main bootstrap algo. Draw nh-1 out of nh, with replacement. rh = [(count(==(i), randinds)) for i in 1:nh] # main bootstrap algo. gdf = groupby(substrata, design.cluster) + # @show keys(gdf) for i in 1:nh - gdf[i].rh = repeat([rh[i]], nrow(gdf[i])) - end - stratified[j].rh = DataFrame(gdf).rh + gdf[i].whij = repeat([rh[i]], nrow(gdf[i])) .* gdf[i].weights .* (nh / (nh - 1)) + end + stratified[h].whij = transform(gdf).whij + end - return DataFrame(stratified) + return transform(stratified, :whij) end df = replicate(stratified, H) - rename!(df,:rh => :replicate_1) + rename!(df,:whij => :replicate_1) + df.replicate_1 = disallowmissing(df.replicate_1) for i in 2:(replicates) - df[!, "replicate_"*string(i)] = Float64.(replicate(stratified, H).rh) + df[!, "replicate_"*string(i)] = disallowmissing(replicate(stratified, H).whij) end return ReplicateDesign(df, design.cluster, design.popsize, design.sampsize, design.strata, design.pps, replicates) +end + +function bootstrap(x::Symbol, design::SurveyDesign, func = wsum; replicates = 100, rng = MersenneTwister(1234)) + gdf = groupby(design.data, design.cluster) + psus = unique(design.data[!, design.cluster]) + nh = length(psus) + X = func(design.data[:, x], design.data.weights) + Xt = Array{Float64, 1}(undef, replicates) + for i in 1:replicates + selected_psus = psus[rand(rng, 1:nh, (nh-1))] # simple random sample of PSUs, with replacement. Select (nh-1) out of nh + xhij = (reduce(vcat, [gdf[(i,)][!, x] for i in selected_psus])) + whij = (reduce(vcat, [gdf[(i,)].weights * (nh / (nh - 1)) for i in selected_psus])) + Xt[i] = func(xhij, whij) + end + @show Xt + variance = sum((Xt .- X).^2) / replicates + return DataFrame(statistic = X, SE = sqrt(variance)) end \ No newline at end of file diff --git a/src/by.jl b/src/by.jl index 30cb2dd2..be26d5a3 100644 --- a/src/by.jl +++ b/src/by.jl @@ -4,7 +4,7 @@ function bydomain(x::Symbol, domain::Symbol, design::ReplicateDesign, func::Func X = combine(gdf, [x, :weights] => ((a, b) -> func(a, weights(b))) => :statistic) Xt_mat = Array{Float64, 2}(undef, (nd, design.replicates)) for i in 1:design.replicates - Xt_mat[:, i] = combine(gdf, [x, :weights, Symbol("replicate_"*string(i))] => ((a, b, c) -> func(a, weights(b .* c))) => :statistic).statistic + Xt_mat[:, i] = combine(gdf, [x, Symbol("replicate_"*string(i))] => ((a, c) -> func(a, weights(c))) => :statistic).statistic end ses = [] for i in 1:nd diff --git a/src/mean.jl b/src/mean.jl index 501230d7..2bf8b925 100644 --- a/src/mean.jl +++ b/src/mean.jl @@ -4,21 +4,21 @@ julia> using Survey, Random, StatsBase; julia> apiclus1 = load_data("apiclus1"); -julia> dclus1 = SurveyDesign(apiclus1, :dnum, :fpc); +julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw); -julia> bclus1 = bootweights(apiclus1; replicates = 1000) +julia> bclus1 = bootweights(dclus1; replicates = 1000) julia> mean(:api00, bclus1) 1×2 DataFrame Row │ mean SE │ Float64 Float64 ─────┼────────────────── - 1 │ 644.169 23.0897 + 1 │ 644.169 23.7208 ``` """ function mean(x::Symbol, design::ReplicateDesign) X = mean(design.data[!, x], weights(design.data.weights)) - Xt = [mean(design.data[!, x], weights(design.data.weights .* design.data[! , "replicate_"*string(i)])) for i in 1:design.replicates] + Xt = [mean(design.data[!, x], weights(design.data[! , "replicate_"*string(i)])) for i in 1:design.replicates] variance = sum((Xt .- X).^2) / design.replicates DataFrame(mean = X, SE = sqrt(variance)) end @@ -28,9 +28,9 @@ julia> using Survey, Random, StatsBase; julia> apiclus1 = load_data("apiclus1"); -julia> dclus1 = SurveyDesign(apiclus1, :dnum, :fpc); +julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw); -julia> bclus1 = bootweights(apiclus1; replicates = 1000) +julia> bclus1 = bootweights(dclus1; replicates = 1000) julia> mean(:api00, :cname, bclus1) |> print 38×3 DataFrame diff --git a/src/total.jl b/src/total.jl index 3a3185c0..fdf83216 100644 --- a/src/total.jl +++ b/src/total.jl @@ -20,7 +20,7 @@ julia> total(:api00, bclus1) """ function total(x::Symbol, design::ReplicateDesign) X = wsum(design.data[!, x], weights(design.data.weights)) - Xt = [wsum(design.data[!, x], weights(design.data.weights .* design.data[! , "replicate_"*string(i)])) for i in 1:design.replicates] + Xt = [wsum(design.data[!, x], weights(design.data[! , "replicate_"*string(i)])) for i in 1:design.replicates] variance = sum((Xt .- X).^2) / design.replicates DataFrame(total = X, SE = sqrt(variance)) end From 7a65293550935ef7355a516e3a8deec3985cef7f Mon Sep 17 00:00:00 2001 From: ayushpatnaikgit Date: Thu, 5 Jan 2023 11:18:08 +0530 Subject: [PATCH 02/11] Remove comments and unused function. --- src/bootstrap.jl | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/src/bootstrap.jl b/src/bootstrap.jl index a28fabe3..9dd605c1 100644 --- a/src/bootstrap.jl +++ b/src/bootstrap.jl @@ -29,7 +29,6 @@ function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwis for h in 1:H substrata = DataFrame(stratified[h]) psus = unique(substrata[!, design.cluster]) - # @show psus if length(psus) <= 1 return DataFrame(statistic = X, SE = 0) # bug! end @@ -37,7 +36,6 @@ function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwis randinds = rand(rng, 1:(nh), (nh-1)) # Main bootstrap algo. Draw nh-1 out of nh, with replacement. rh = [(count(==(i), randinds)) for i in 1:nh] # main bootstrap algo. gdf = groupby(substrata, design.cluster) - # @show keys(gdf) for i in 1:nh gdf[i].whij = repeat([rh[i]], nrow(gdf[i])) .* gdf[i].weights .* (nh / (nh - 1)) end @@ -53,21 +51,4 @@ function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwis df[!, "replicate_"*string(i)] = disallowmissing(replicate(stratified, H).whij) end return ReplicateDesign(df, design.cluster, design.popsize, design.sampsize, design.strata, design.pps, replicates) -end - -function bootstrap(x::Symbol, design::SurveyDesign, func = wsum; replicates = 100, rng = MersenneTwister(1234)) - gdf = groupby(design.data, design.cluster) - psus = unique(design.data[!, design.cluster]) - nh = length(psus) - X = func(design.data[:, x], design.data.weights) - Xt = Array{Float64, 1}(undef, replicates) - for i in 1:replicates - selected_psus = psus[rand(rng, 1:nh, (nh-1))] # simple random sample of PSUs, with replacement. Select (nh-1) out of nh - xhij = (reduce(vcat, [gdf[(i,)][!, x] for i in selected_psus])) - whij = (reduce(vcat, [gdf[(i,)].weights * (nh / (nh - 1)) for i in selected_psus])) - Xt[i] = func(xhij, whij) - end - @show Xt - variance = sum((Xt .- X).^2) / replicates - return DataFrame(statistic = X, SE = sqrt(variance)) end \ No newline at end of file From af7ee993962a6c710b42c23c2fb757b09a30d25b Mon Sep 17 00:00:00 2001 From: ayushpatnaikgit Date: Thu, 5 Jan 2023 11:27:03 +0530 Subject: [PATCH 03/11] Attemp bug fix. --- src/bootstrap.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bootstrap.jl b/src/bootstrap.jl index 9dd605c1..2055b6bf 100644 --- a/src/bootstrap.jl +++ b/src/bootstrap.jl @@ -30,7 +30,7 @@ function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwis substrata = DataFrame(stratified[h]) psus = unique(substrata[!, design.cluster]) if length(psus) <= 1 - return DataFrame(statistic = X, SE = 0) # bug! + stratified[h].whij .= 0 # hasn't been tested yet. end nh = length(psus) randinds = rand(rng, 1:(nh), (nh-1)) # Main bootstrap algo. Draw nh-1 out of nh, with replacement. From 9c0b68547be12744fa4390e5be8554615f89c4d7 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Mon, 9 Jan 2023 18:11:31 +0530 Subject: [PATCH 04/11] Fix and add tests for SRS for `total` --- test/total.jl | 144 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 129 insertions(+), 15 deletions(-) diff --git a/test/total.jl b/test/total.jl index 5825a319..4c3788a8 100644 --- a/test/total.jl +++ b/test/total.jl @@ -5,11 +5,20 @@ apisrs = copy(apisrs_original) srs = SurveyDesign(apisrs; weights = :pw) |> bootweights tot = total(:api00, srs) - @test tot.total[1] ≈ 4.06688749e6 atol = 1e-4 - @test tot.SE[1] ≈ 60518.199 atol = 1e-1 - # TODO: uncomment after correcting `total` function - # @test tot.total[1] ≈ 131317 atol = 1 - # @test tot.SE[1] ≈ 1880.6 atol = 1e-1 + @test tot.total[1] ≈ 4066888 rtol = 1e-5 + @test tot.SE[1] ≈ 58526 rtol = 1e-1 + mn = mean(:api00, srs) + @test mn.mean[1] ≈ 656.58 rtol = 1e-5 + @test mn.SE[1] ≈ 9.4488 rtol = 1e-1 + # equivalent R code and results: + # > srs <- svydesign(data=apisrs, id=~1, weights=~pw) + # > srsrep <- as.svrepdesign(srs, type="bootstrap", replicates=4000) + # > svytotal(~api00, srsrep) + # total SE + # api00 4066888 58526 + # > svymean(~api00, srsrep) + # mean SE + # api00 656.58 9.4488 # CategoricalArray # apisrs = copy(apisrs_original) @@ -24,24 +33,129 @@ # Vector{Symbol} apisrs = copy(apisrs_original) - srs = SurveyDesign(apisrs; popsize = :fpc) |> bootweights + srs = SurveyDesign(apisrs; weights = :pw) |> bootweights tot = total([:api00, :enroll], srs) + mn = mean([:api00, :enroll], srs) ## :api00 - @test tot.total[1] ≈ 4066888 atol = 1 - @test tot.SE[1] ≈ 60518.199 atol = 1 + @test tot.total[1] ≈ 4066888 rtol = 1e-5 + @test tot.SE[1] ≈ 57502 rtol = 1e-1 + @test mn.mean[1] ≈ 656.58 rtol = 1e-5 + @test mn.SE[1] ≈ 9.2835 rtol = 1e-1 ## :enroll - @test tot.total[2] ≈ 3621074 atol = 1 - @test tot.SE[2] ≈ 173784.343 atol = 1 + @test tot.total[2] ≈ 3621074 rtol = 1e-5 + @test tot.SE[2] ≈ 176793 rtol = 1e-1 + @test mn.mean[2] ≈ 584.61 rtol = 1e-5 + @test mn.SE[2] ≈ 28.5427 rtol = 1e-1 + # equivalent R code and results: + # > srs <- svydesign(data=apisrs, id=~1, weights=~pw) + # > srsrep <- as.svrepdesign(srs, type="bootstrap", replicates=4000) + # > svytotal(~api00+~enroll, srsrep) + # total SE + # api00 4066888 57502 + # enroll 3621074 176793 + # > svymean(~api00+~enroll, srsrep) + # mean SE + # api00 656.58 9.2835 + # enroll 584.61 28.5427 # subpopulation apisrs = copy(apisrs_original) - srs = SurveyDesign(apisrs; popsize = :fpc) |> bootweights + srs = SurveyDesign(apisrs; weights = :pw) |> bootweights tot = total(:api00, :cname, srs) @test size(tot)[1] == apisrs.cname |> unique |> length - @test filter(:cname => ==("Los Angeles"), tot).total[1] ≈ 917238.49 atol = 1e-2 - @test filter(:cname => ==("Los Angeles"), tot).SE[1] ≈ 122366.33 atol = 1e-2 - @test filter(:cname => ==("Monterey"), tot).total[1] ≈ 74947.40 atol = 1e-2 - @test filter(:cname => ==("Monterey"), tot).SE[1] ≈ 38178.35 atol = 1e-2 + @test filter(:cname => ==("Los Angeles"), tot).total[1] ≈ 917238.49 rtol = 1e-5 + @test filter(:cname => ==("Los Angeles"), tot).SE[1] ≈ 122193.02 rtol = 1e-1 + @test filter(:cname => ==("Monterey"), tot).total[1] ≈ 74947.40 rtol = 1e-5 + @test filter(:cname => ==("Monterey"), tot).SE[1] ≈ 38862.71 rtol = 1e-1 + mn = mean(:api00, :cname, srs) + @test size(mn)[1] == apisrs.cname |> unique |> length + @test filter(:cname => ==("Los Angeles"), mn).mean[1] ≈ 658.1556 rtol = 1e-5 + @test filter(:cname => ==("Los Angeles"), mn).SE[1] ≈ 2.126852e+01 rtol = 1e-1 + @test filter(:cname => ==("Santa Clara"), mn).mean[1] ≈ 718.2857 rtol = 1e-5 + @test filter(:cname => ==("Santa Clara"), mn).SE[1] ≈ 5.835346e+01 rtol = 1e-1 + # equivalent R code and results: + # > srs <- svydesign(data=apisrs, id=~1, weights=~pw) + # > srsrep <- as.svrepdesign(srs, type="bootstrap", replicates=4000) + # > svyby(~api00, ~cname, srsrep, svytotal) + # cname api00 se + # Alameda Alameda 230323.89 67808.91 + # Calaveras Calaveras 24466.30 24199.26 + # Contra Costa Contra Costa 213538.15 68780.65 + # Fresno Fresno 148717.94 54174.78 + # Imperial Imperial 19263.34 19292.34 + # Kern Kern 177643.92 56429.75 + # Kings Kings 29080.83 20659.88 + # Lake Lake 24899.88 24796.24 + # Lassen Lassen 23289.44 23150.91 + # Los Angeles Los Angeles 917238.49 122193.02 + # Madera Madera 44596.80 25684.62 + # Marin Marin 74297.03 43018.64 + # Merced Merced 18427.15 18057.21 + # Modoc Modoc 20780.87 20977.35 + # Monterey Monterey 74947.40 38862.71 + # Napa Napa 45030.38 31747.05 + # Orange Orange 208861.68 66824.94 + # Placer Placer 23506.23 23426.32 + # Riverside Riverside 177860.71 55697.57 + # Sacramento Sacramento 152620.16 53266.09 + # San Bernardino San Bernardino 247388.36 66806.58 + # San Diego San Diego 254387.58 71730.93 + # San Francisco San Francisco 51874.75 29597.88 + # San Joaquin San Joaquin 113102.44 46195.42 + # San Luis Obispo San Luis Obispo 22886.83 22984.23 + # San Mateo San Mateo 38216.98 27075.67 + # Santa Barbara Santa Barbara 67700.42 38550.72 + # Santa Clara Santa Clara 155717.16 58101.15 + # Santa Cruz Santa Cruz 58006.81 34633.27 + # Shasta Shasta 46702.76 32882.09 + # Siskiyou Siskiyou 21648.03 21667.03 + # Solano Solano 57882.93 33095.96 + # Sonoma Sonoma 19511.10 19782.71 + # Stanislaus Stanislaus 68412.73 39997.43 + # Sutter Sutter 23041.68 22738.16 + # Tulare Tulare 41128.16 28933.90 + # Ventura Ventura 115177.43 51200.56 + # Yolo Yolo 14710.75 14676.49 + # > svyby(~api00, ~cname, srsrep, svymean) + # cname api00 se + # Alameda Alameda 676.0909 3.522082e+01 + # Calaveras Calaveras 790.0000 0.000000e+00 + # Contra Costa Contra Costa 766.1111 5.435054e+01 + # Fresno Fresno 600.2500 5.811781e+01 + # Imperial Imperial 622.0000 0.000000e+00 + # Kern Kern 573.6000 4.634744e+01 + # Kings Kings 469.5000 4.264356e+01 + # Lake Lake 804.0000 0.000000e+00 + # Lassen Lassen 752.0000 0.000000e+00 + # Los Angeles Los Angeles 658.1556 2.126852e+01 + # Madera Madera 480.0000 3.461786e+00 + # Marin Marin 799.6667 3.509912e+01 + # Merced Merced 595.0000 0.000000e+00 + # Modoc Modoc 671.0000 0.000000e+00 + # Monterey Monterey 605.0000 8.356655e+01 + # Napa Napa 727.0000 4.770914e+01 + # Orange Orange 749.3333 2.876956e+01 + # Placer Placer 759.0000 0.000000e+00 + # Riverside Riverside 574.3000 2.789294e+01 + # Sacramento Sacramento 616.0000 3.785063e+01 + # San Bernardino San Bernardino 614.4615 2.985197e+01 + # San Diego San Diego 684.5000 3.254291e+01 + # San Francisco San Francisco 558.3333 4.404227e+01 + # San Joaquin San Joaquin 608.6667 4.153241e+01 + # San Luis Obispo San Luis Obispo 739.0000 2.691382e-14 + # San Mateo San Mateo 617.0000 7.352923e+01 + # Santa Barbara Santa Barbara 728.6667 2.551393e+01 + # Santa Clara Santa Clara 718.2857 5.835346e+01 + # Santa Cruz Santa Cruz 624.3333 1.131098e+02 + # Shasta Shasta 754.0000 5.731963e+01 + # Siskiyou Siskiyou 699.0000 0.000000e+00 + # Solano Solano 623.0000 4.541173e+01 + # Sonoma Sonoma 630.0000 0.000000e+00 + # Stanislaus Stanislaus 736.3333 5.176843e+00 + # Sutter Sutter 744.0000 0.000000e+00 + # Tulare Tulare 664.0000 2.061011e+01 + # Ventura Ventura 743.8000 3.153839e+01 + # Yolo Yolo 475.0000 0.000000e+00 end @testset "total_Stratified" begin From a5f3a4e13e858041b8ea4c2dd3460fcc014eae43 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Mon, 9 Jan 2023 18:44:17 +0530 Subject: [PATCH 05/11] Fix and add tests for Stratified, add constants for tolerances --- test/total.jl | 213 ++++++++++++++++---------------------------------- 1 file changed, 69 insertions(+), 144 deletions(-) diff --git a/test/total.jl b/test/total.jl index 4c3788a8..9b7c8e56 100644 --- a/test/total.jl +++ b/test/total.jl @@ -1,3 +1,6 @@ +const STAT_TOL = 1e-5 +const SE_TOL = 1e-1 + @testset "Simple random sample" begin apisrs_original = load_data("apisrs") @@ -5,11 +8,11 @@ apisrs = copy(apisrs_original) srs = SurveyDesign(apisrs; weights = :pw) |> bootweights tot = total(:api00, srs) - @test tot.total[1] ≈ 4066888 rtol = 1e-5 - @test tot.SE[1] ≈ 58526 rtol = 1e-1 + @test tot.total[1] ≈ 4066888 rtol = STAT_TOL + @test tot.SE[1] ≈ 58526 rtol = SE_TOL mn = mean(:api00, srs) - @test mn.mean[1] ≈ 656.58 rtol = 1e-5 - @test mn.SE[1] ≈ 9.4488 rtol = 1e-1 + @test mn.mean[1] ≈ 656.58 rtol = STAT_TOL + @test mn.SE[1] ≈ 9.4488 rtol = SE_TOL # equivalent R code and results: # > srs <- svydesign(data=apisrs, id=~1, weights=~pw) # > srsrep <- as.svrepdesign(srs, type="bootstrap", replicates=4000) @@ -20,35 +23,20 @@ # mean SE # api00 656.58 9.4488 - # CategoricalArray - # apisrs = copy(apisrs_original) - # apisrs[!, :cname] = CategoricalArrays.categorical(apisrs.cname) - # srs = SurveyDesign(apisrs; popsize = :fpc) - # tot = total(:cname, srs) - # @test size(tot)[1] == apisrs.cname |> unique |> length - # @test filter(:cname => ==("Alameda"), tot).total[1] ≈ 340.67 atol = 1e-2 - # @test filter(:cname => ==("Alameda"), tot).SE[1] ≈ 98.472 atol = 1e-3 - # @test filter(:cname => ==("Los Angeles"), tot).total[1] ≈ 1393.65 atol = 1e-2 - # @test filter(:cname => ==("Los Angeles"), tot).SE[1] ≈ 180.368 atol = 1e-3 - # Vector{Symbol} - apisrs = copy(apisrs_original) - srs = SurveyDesign(apisrs; weights = :pw) |> bootweights tot = total([:api00, :enroll], srs) mn = mean([:api00, :enroll], srs) ## :api00 - @test tot.total[1] ≈ 4066888 rtol = 1e-5 - @test tot.SE[1] ≈ 57502 rtol = 1e-1 - @test mn.mean[1] ≈ 656.58 rtol = 1e-5 - @test mn.SE[1] ≈ 9.2835 rtol = 1e-1 + @test tot.total[1] ≈ 4066888 rtol = STAT_TOL + @test tot.SE[1] ≈ 57502 rtol = SE_TOL + @test mn.mean[1] ≈ 656.58 rtol = STAT_TOL + @test mn.SE[1] ≈ 9.2835 rtol = SE_TOL ## :enroll - @test tot.total[2] ≈ 3621074 rtol = 1e-5 - @test tot.SE[2] ≈ 176793 rtol = 1e-1 - @test mn.mean[2] ≈ 584.61 rtol = 1e-5 - @test mn.SE[2] ≈ 28.5427 rtol = 1e-1 + @test tot.total[2] ≈ 3621074 rtol = STAT_TOL + @test tot.SE[2] ≈ 176793 rtol = SE_TOL + @test mn.mean[2] ≈ 584.61 rtol = STAT_TOL + @test mn.SE[2] ≈ 28.5427 rtol = SE_TOL # equivalent R code and results: - # > srs <- svydesign(data=apisrs, id=~1, weights=~pw) - # > srsrep <- as.svrepdesign(srs, type="bootstrap", replicates=4000) # > svytotal(~api00+~enroll, srsrep) # total SE # api00 4066888 57502 @@ -59,146 +47,83 @@ # enroll 584.61 28.5427 # subpopulation - apisrs = copy(apisrs_original) - srs = SurveyDesign(apisrs; weights = :pw) |> bootweights tot = total(:api00, :cname, srs) @test size(tot)[1] == apisrs.cname |> unique |> length - @test filter(:cname => ==("Los Angeles"), tot).total[1] ≈ 917238.49 rtol = 1e-5 - @test filter(:cname => ==("Los Angeles"), tot).SE[1] ≈ 122193.02 rtol = 1e-1 - @test filter(:cname => ==("Monterey"), tot).total[1] ≈ 74947.40 rtol = 1e-5 - @test filter(:cname => ==("Monterey"), tot).SE[1] ≈ 38862.71 rtol = 1e-1 + @test filter(:cname => ==("Los Angeles"), tot).total[1] ≈ 917238.49 rtol = STAT_TOL + @test filter(:cname => ==("Los Angeles"), tot).SE[1] ≈ 122193.02 rtol = SE_TOL + @test filter(:cname => ==("Monterey"), tot).total[1] ≈ 74947.40 rtol = STAT_TOL + @test filter(:cname => ==("Monterey"), tot).SE[1] ≈ 38862.71 rtol = SE_TOL mn = mean(:api00, :cname, srs) @test size(mn)[1] == apisrs.cname |> unique |> length - @test filter(:cname => ==("Los Angeles"), mn).mean[1] ≈ 658.1556 rtol = 1e-5 - @test filter(:cname => ==("Los Angeles"), mn).SE[1] ≈ 2.126852e+01 rtol = 1e-1 - @test filter(:cname => ==("Santa Clara"), mn).mean[1] ≈ 718.2857 rtol = 1e-5 - @test filter(:cname => ==("Santa Clara"), mn).SE[1] ≈ 5.835346e+01 rtol = 1e-1 - # equivalent R code and results: - # > srs <- svydesign(data=apisrs, id=~1, weights=~pw) - # > srsrep <- as.svrepdesign(srs, type="bootstrap", replicates=4000) + @test filter(:cname => ==("Los Angeles"), mn).mean[1] ≈ 658.1556 rtol = STAT_TOL + @test filter(:cname => ==("Los Angeles"), mn).SE[1] ≈ 2.126852e+01 rtol = SE_TOL + @test filter(:cname => ==("Santa Clara"), mn).mean[1] ≈ 718.2857 rtol = STAT_TOL + @test filter(:cname => ==("Santa Clara"), mn).SE[1] ≈ 5.835346e+01 rtol = SE_TOL + # equivalent R code (results cause clutter): # > svyby(~api00, ~cname, srsrep, svytotal) - # cname api00 se - # Alameda Alameda 230323.89 67808.91 - # Calaveras Calaveras 24466.30 24199.26 - # Contra Costa Contra Costa 213538.15 68780.65 - # Fresno Fresno 148717.94 54174.78 - # Imperial Imperial 19263.34 19292.34 - # Kern Kern 177643.92 56429.75 - # Kings Kings 29080.83 20659.88 - # Lake Lake 24899.88 24796.24 - # Lassen Lassen 23289.44 23150.91 - # Los Angeles Los Angeles 917238.49 122193.02 - # Madera Madera 44596.80 25684.62 - # Marin Marin 74297.03 43018.64 - # Merced Merced 18427.15 18057.21 - # Modoc Modoc 20780.87 20977.35 - # Monterey Monterey 74947.40 38862.71 - # Napa Napa 45030.38 31747.05 - # Orange Orange 208861.68 66824.94 - # Placer Placer 23506.23 23426.32 - # Riverside Riverside 177860.71 55697.57 - # Sacramento Sacramento 152620.16 53266.09 - # San Bernardino San Bernardino 247388.36 66806.58 - # San Diego San Diego 254387.58 71730.93 - # San Francisco San Francisco 51874.75 29597.88 - # San Joaquin San Joaquin 113102.44 46195.42 - # San Luis Obispo San Luis Obispo 22886.83 22984.23 - # San Mateo San Mateo 38216.98 27075.67 - # Santa Barbara Santa Barbara 67700.42 38550.72 - # Santa Clara Santa Clara 155717.16 58101.15 - # Santa Cruz Santa Cruz 58006.81 34633.27 - # Shasta Shasta 46702.76 32882.09 - # Siskiyou Siskiyou 21648.03 21667.03 - # Solano Solano 57882.93 33095.96 - # Sonoma Sonoma 19511.10 19782.71 - # Stanislaus Stanislaus 68412.73 39997.43 - # Sutter Sutter 23041.68 22738.16 - # Tulare Tulare 41128.16 28933.90 - # Ventura Ventura 115177.43 51200.56 - # Yolo Yolo 14710.75 14676.49 # > svyby(~api00, ~cname, srsrep, svymean) - # cname api00 se - # Alameda Alameda 676.0909 3.522082e+01 - # Calaveras Calaveras 790.0000 0.000000e+00 - # Contra Costa Contra Costa 766.1111 5.435054e+01 - # Fresno Fresno 600.2500 5.811781e+01 - # Imperial Imperial 622.0000 0.000000e+00 - # Kern Kern 573.6000 4.634744e+01 - # Kings Kings 469.5000 4.264356e+01 - # Lake Lake 804.0000 0.000000e+00 - # Lassen Lassen 752.0000 0.000000e+00 - # Los Angeles Los Angeles 658.1556 2.126852e+01 - # Madera Madera 480.0000 3.461786e+00 - # Marin Marin 799.6667 3.509912e+01 - # Merced Merced 595.0000 0.000000e+00 - # Modoc Modoc 671.0000 0.000000e+00 - # Monterey Monterey 605.0000 8.356655e+01 - # Napa Napa 727.0000 4.770914e+01 - # Orange Orange 749.3333 2.876956e+01 - # Placer Placer 759.0000 0.000000e+00 - # Riverside Riverside 574.3000 2.789294e+01 - # Sacramento Sacramento 616.0000 3.785063e+01 - # San Bernardino San Bernardino 614.4615 2.985197e+01 - # San Diego San Diego 684.5000 3.254291e+01 - # San Francisco San Francisco 558.3333 4.404227e+01 - # San Joaquin San Joaquin 608.6667 4.153241e+01 - # San Luis Obispo San Luis Obispo 739.0000 2.691382e-14 - # San Mateo San Mateo 617.0000 7.352923e+01 - # Santa Barbara Santa Barbara 728.6667 2.551393e+01 - # Santa Clara Santa Clara 718.2857 5.835346e+01 - # Santa Cruz Santa Cruz 624.3333 1.131098e+02 - # Shasta Shasta 754.0000 5.731963e+01 - # Siskiyou Siskiyou 699.0000 0.000000e+00 - # Solano Solano 623.0000 4.541173e+01 - # Sonoma Sonoma 630.0000 0.000000e+00 - # Stanislaus Stanislaus 736.3333 5.176843e+00 - # Sutter Sutter 744.0000 0.000000e+00 - # Tulare Tulare 664.0000 2.061011e+01 - # Ventura Ventura 743.8000 3.153839e+01 - # Yolo Yolo 475.0000 0.000000e+00 end -@testset "total_Stratified" begin +@testset "Stratified sample" begin apistrat_original = load_data("apistrat") # base functionality apistrat = copy(apistrat_original) strat = SurveyDesign(apistrat; strata = :stype, weights = :pw) |> bootweights tot = total(:api00, strat) - @test tot.total[1] ≈ 4102208 atol = 10 - @test tot.SE[1] ≈ 77211.61 atol = 1e-1 - # without fpc - # TODO: uncomment after correcting `total` function - # @test tot.SE[1] ≈ 1690.4 atol = 1e-1 - - # CategoricalArray - # apistrat = copy(apistrat_original) - # apistrat[!, :cname] = CategoricalArrays.categorical(apistrat.cname) - # strat = StratifiedSample(apistrat, :stype; popsize = :fpc) - # TODO: uncomment after adding `CategoricalArray` support - # @test tot.SE[1] ≈ 1690.4 atol = 1e-1 - # tot = total(:cname, strat) - # @test size(tot)[1] == apistrat.cname |> unique |> length - # @test filter(:cname => ==("Kern"), tot).total[1] ≈ 291.97 atol = 1e-2 - # @test filter(:cname => ==("Kern"), tot).SE[1] ≈ 101.760 atol = 1e-3 - # @test filter(:cname => ==("Los Angeles"), tot).total[1] ≈ 1373.15 atol = 1e-2 - # @test filter(:cname => ==("Los Angeles"), tot).SE[1] ≈ 199.635 atol = 1e-3 + @test tot.total[1] ≈ 4102208 rtol = STAT_TOL + @test tot.SE[1] ≈ 60746 rtol = SE_TOL + @test mn.mean[1] ≈ 662.29 rtol = STAT_TOL + @test mn.SE[1] ≈ 9.8072 rtol = SE_TOL + # equivalent R code and results: + # > strat <- svydesign(data=apistrat, id=~1, weights=~pw, strata=~stype) + # > stratrep <- as.svrepdesign(strat, type="bootstrap", replicates=4000) + # > svytotal(~api00, stratrep) + # total SE + # api00 4102208 60746 + # > svymean(~api00, stratrep) + # mean SE + # api00 662.29 9.8072 # Vector{Symbol} tot = total([:api00, :enroll], strat) + mn = mean([:api00, :enroll], strat) ## :api00 - @test tot.total[1] ≈ 4102208 atol = 1 - @test tot.SE[1] ≈ 77211.61 atol = 1 + @test tot.total[1] ≈ 4102208 rtol = STAT_TOL + @test tot.SE[1] ≈ 60746 rtol = SE_TOL + @test mn.mean[1] ≈ 662.29 rtol = STAT_TOL + @test mn.SE[1] ≈ 9.8072 rtol = SE_TOL ## :enroll - @test tot.total[2] ≈ 3687178 atol = 1 - @test tot.SE[2] ≈ 127021.5540 atol = 1 + @test tot.total[2] ≈ 3687178 rtol = STAT_TOL + @test tot.SE[2] ≈ 117322 rtol = SE_TOL + @test mn.mean[2] ≈ 595.28 rtol = STAT_TOL + @test mn.SE[2] ≈ 18.9412 rtol = SE_TOL + # equivalent R code and results: + # > svytotal(~api00+~enroll, stratrep) + # > svymean(~api00+~enroll, stratrep) + # mean SE + # api00 662.29 9.8072 + # enroll 595.28 18.9412 # subpopulation - # TODO: add functionality in `src/total.jl` - # TODO: add tests + tot = total(:api00, :cname, strat) + @test size(tot)[1] == apistrat.cname |> unique |> length + @test filter(:cname => ==("Los Angeles"), tot).total[1] ≈ 869905.98 rtol = STAT_TOL + @test filter(:cname => ==("Los Angeles"), tot).SE[1] ≈ 134195.81 rtol = SE_TOL + @test filter(:cname => ==("Monterey"), tot).total[1] ≈ 72103.09 rtol = STAT_TOL + @test filter(:cname => ==("Monterey"), tot).SE[1] ≈ 45532.88 rtol = SE_TOL + mn = mean(:api00, :cname, strat) + @test size(mn)[1] == apistrat.cname |> unique |> length + @test filter(:cname => ==("Los Angeles"), mn).mean[1] ≈ 633.5113 rtol = STAT_TOL + @test filter(:cname => ==("Los Angeles"), mn).SE[1] ≈ 21.681068 rtol = SE_TOL + @test filter(:cname => ==("Santa Clara"), mn).mean[1] ≈ 664.1212 rtol = STAT_TOL + @test filter(:cname => ==("Santa Clara"), mn).SE[1] ≈ 48.817277 rtol = SE_TOL + # equivalent R code (results cause clutter): + # > svyby(~api00, ~cname, stratrep, svytotal) + # > svyby(~api00, ~cname, stratrep, svymean) end -@testset "total_OneStageClusterSample" begin +@testset "One stage cluster sample" begin # Load API datasets apiclus1_original = load_data("apiclus1") apiclus1_original[!, :pw] = fill(757/15,(size(apiclus1_original,1),)) # Correct api mistake for pw column From 87c11dc01c23e55da98dcfd2cfe6dc58b573d72f Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Mon, 9 Jan 2023 19:55:15 +0530 Subject: [PATCH 06/11] Fix and add tests for Cluster and minor reordering --- test/total.jl | 87 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 66 insertions(+), 21 deletions(-) diff --git a/test/total.jl b/test/total.jl index 9b7c8e56..6ac6ab06 100644 --- a/test/total.jl +++ b/test/total.jl @@ -1,12 +1,11 @@ const STAT_TOL = 1e-5 const SE_TOL = 1e-1 -@testset "Simple random sample" begin - apisrs_original = load_data("apisrs") +@testset "total SRS" begin + apisrs = load_data("apisrs") + srs = SurveyDesign(apisrs; weights = :pw) |> bootweights # base functionality - apisrs = copy(apisrs_original) - srs = SurveyDesign(apisrs; weights = :pw) |> bootweights tot = total(:api00, srs) @test tot.total[1] ≈ 4066888 rtol = STAT_TOL @test tot.SE[1] ≈ 58526 rtol = SE_TOL @@ -64,15 +63,15 @@ const SE_TOL = 1e-1 # > svyby(~api00, ~cname, srsrep, svymean) end -@testset "Stratified sample" begin - apistrat_original = load_data("apistrat") +@testset "total Stratified" begin + apistrat = load_data("apistrat") + strat = SurveyDesign(apistrat; strata = :stype, weights = :pw) |> bootweights # base functionality - apistrat = copy(apistrat_original) - strat = SurveyDesign(apistrat; strata = :stype, weights = :pw) |> bootweights tot = total(:api00, strat) @test tot.total[1] ≈ 4102208 rtol = STAT_TOL @test tot.SE[1] ≈ 60746 rtol = SE_TOL + mn = mean(:api00, strat) @test mn.mean[1] ≈ 662.29 rtol = STAT_TOL @test mn.SE[1] ≈ 9.8072 rtol = SE_TOL # equivalent R code and results: @@ -123,18 +122,64 @@ end # > svyby(~api00, ~cname, stratrep, svymean) end -@testset "One stage cluster sample" begin - # Load API datasets - apiclus1_original = load_data("apiclus1") - apiclus1_original[!, :pw] = fill(757/15,(size(apiclus1_original,1),)) # Correct api mistake for pw column - ############################## - # one-stage cluster sample - apiclus1 = copy(apiclus1_original) - dclus1 = SurveyDesign(apiclus1, clusters = :dnum, weights = :pw) |> bootweights - @test total(:api00,dclus1).total[1] ≈ 5949162 atol = 1 - @test total(:api00,dclus1).SE[1] ≈ 1.3338978891316957e6 atol = 1 +@testset "total Cluster" begin + apiclus1 = load_data("apiclus1") + clus1 = SurveyDesign(apiclus1, clusters = :dnum, weights = :pw) |> bootweights + + # base functionality + tot = total(:api00, clus1) + @test tot.total[1] ≈ 3989986 rtol = STAT_TOL + @test tot.SE[1] ≈ 900323 rtol = SE_TOL + mn = mean(:api00, clus1) + @test mn.mean[1] ≈ 644.17 rtol = STAT_TOL + @test mn.SE[1] ≈ 23.534 rtol = SE_TOL + # equivalent R code and results: + # > clus1 <- svydesign(data=apiclus1, id=~dnum, weights=~pw) + # > clus1rep <- as.svrepdesign(clus1, type="bootstrap", replicates=4000) + # > svytotal(~api00, clus1rep) + # total SE + # api00 3989986 900323 + # > svymean(~api00, clus1rep) + # mean SE + # api00 644.17 23.534 - @test total(:api00, dclus1).total[1] ≈ 5949162 atol = 1 - @test total(:api00, dclus1).SE[1] ≈ 1352953 atol = 50000 # without fpc as it hasn't been figured out for bootstrap. - + # Vector{Symbol} + tot = total([:api00, :enroll], clus1) + mn = mean([:api00, :enroll], clus1) + ## :api00 + @test tot.total[1] ≈ 3989986 rtol = STAT_TOL + @test tot.SE[1] ≈ 900323 rtol = SE_TOL + @test mn.mean[1] ≈ 644.17 rtol = STAT_TOL + @test mn.SE[1] ≈ 23.534 rtol = SE_TOL + ## :enroll + @test tot.total[2] ≈ 3404940 rtol = STAT_TOL + @test tot.SE[2] ≈ 941501 rtol = SE_TOL + @test mn.mean[2] ≈ 549.72 rtol = STAT_TOL + @test mn.SE[2] ≈ 46.070 rtol = SE_TOL + # equivalent R code and results: + # > svytotal(~api00+~enroll, clus1rep) + # total SE + # api00 3989986 900323 + # enroll 3404940 941501 + # > svymean(~api00+~enroll, clus1rep) + # mean SE + # api00 644.17 23.534 + # enroll 549.72 46.070 + + # subpopulation + tot = total(:api00, :cname, clus1) + @test size(tot)[1] == apiclus1.cname |> unique |> length + @test filter(:cname => ==("Los Angeles"), tot).total[1] ≈ 328620.49 rtol = STAT_TOL + @test filter(:cname => ==("Los Angeles"), tot).SE[1] ≈ 292840.83 rtol = SE_TOL + @test filter(:cname => ==("San Diego"), tot).total[1] ≈ 1227596.71 rtol = STAT_TOL + @test filter(:cname => ==("San Diego"), tot).SE[1] ≈ 860028.39 rtol = SE_TOL + mn = mean(:api00, :cname, clus1) + @test size(mn)[1] == apiclus1.cname |> unique |> length + @test filter(:cname => ==("Los Angeles"), mn).mean[1] ≈ 647.2667 rtol = STAT_TOL + @test filter(:cname => ==("Los Angeles"), mn).SE[1] ≈ 41.537132 rtol = 1 # tolerance is too large + @test filter(:cname => ==("Santa Clara"), mn).mean[1] ≈ 732.0769 rtol = STAT_TOL + @test filter(:cname => ==("Santa Clara"), mn).SE[1] ≈ 52.336574 rtol = SE_TOL + # equivalent R code (results cause clutter): + # > svyby(~api00, ~cname, clus1rep, svytotal) + # > svyby(~api00, ~cname, clus1rep, svymean) end \ No newline at end of file From 81b77e8c55fd3020585354615fa1358bb24476aa Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Tue, 10 Jan 2023 16:52:42 +0530 Subject: [PATCH 07/11] Remove references to old designs, add references to new functions --- docs/src/api.md | 29 +++++++++-------------------- docs/src/index.md | 41 +++++------------------------------------ src/Survey.jl | 5 ++--- 3 files changed, 16 insertions(+), 59 deletions(-) diff --git a/docs/src/api.md b/docs/src/api.md index 062d379d..5431b9ae 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -7,27 +7,19 @@ Module = [Survey] Order = [:type, :function] Private = false ``` -Survey data can be loaded from a `DataFrame` into a survey design. The package currently supports simple random sample and stratified sample designs. -```@docs -AbstractSurveyDesign -SimpleRandomSample -StratifiedSample -``` ```@docs +AbstractSurveyDesign +SurveyDesign +ReplicateDesign load_data -Survey.mean(x::Symbol, design::SimpleRandomSample) -total(x::Symbol, design::SimpleRandomSample) +bootweights +mean(x::Symbol, design::ReplicateDesign) +mean(x::Symbol, domain::Symbol, design::ReplicateDesign) +total(x::Symbol, design::ReplicateDesign) +total(x::Symbol, domain::Symbol, design::ReplicateDesign) quantile -``` - -It is often required to estimate population parameters for sub-populations of interest. For example, you may have a sample of heights, but you want the average heights of males and females separately. -```@docs -mean(x::Symbol, by::Symbol, design::SimpleRandomSample) -total(x::Symbol, by::Symbol, design::SimpleRandomSample) -``` -```@docs -ratio(variable_num:: Symbol, variable_den:: Symbol, design::SurveyDesign) +ratio(variable_num::Symbol, variable_den::Symbol, design::SurveyDesign) plot(design::AbstractSurveyDesign, x::Symbol, y::Symbol; kwargs...) boxplot(design::AbstractSurveyDesign, x::Symbol, y::Symbol; kwargs...) hist(design::AbstractSurveyDesign, var::Symbol, @@ -35,7 +27,4 @@ hist(design::AbstractSurveyDesign, var::Symbol, normalization = :density, kwargs... ) -dim(design::AbstractSurveyDesign) -dimnames(design::AbstractSurveyDesign) -colnames(design::AbstractSurveyDesign) ``` diff --git a/docs/src/index.md b/docs/src/index.md index a099c73c..eddbcf0f 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -18,45 +18,14 @@ with at least 100 students and for various probability samples of the data. The API program has been discontinued at the end of 2018. Information is archived at [https://www.cde.ca.gov/re/pr/api.asp](https://www.cde.ca.gov/re/pr/api.asp) -Firstly, a survey design needs a dataset from which to gather information. - - -The sample datasets provided with the package can be loaded as `DataFrames` using the `load_data` function: +Firstly, a survey design needs a dataset from which to gather information. The sample +datasets provided with the package can be loaded as `DataFrame`s using [`load_data`](@ref): ```julia julia> apisrs = load_data("apisrs"); ``` -`apisrs` is a simple random sample of the Academic Performance Index of Californian schools. - -Next, we can build a design. The design corresponding to a simple random sample is [`SimpleRandomSample`](@ref), which can be instantiated by calling the constructor: - -```julia -julia> srs = SimpleRandomSample(apisrs; weights = :pw) -SimpleRandomSample: -data: 200x42 DataFrame -weights: 31.0, 31.0, 31.0, ..., 31.0 -probs: 0.0323, 0.0323, 0.0323, ..., 0.0323 -fpc: 6194, 6194, 6194, ..., 6194 -popsize: 6194 -sampsize: 200 -sampfraction: 0.0323 -ignorefpc: false -``` -With a `SimpleRandomSample` (as well as with any subtype of [`AbstractSurveyDesign`](@ref)) it is possible to calculate estimates of the mean, population total, etc., for a given variable, along with the corresponding standard errors. +`apisrs` is a simple random sample of the Academic Performance Index of Californian schools. -```julia -julia> mean(:api00, srs) -1×2 DataFrame - Row │ mean sem - │ Float64 Float64 -─────┼────────────────── - 1 │ 656.585 9.24972 - -julia> total(:api00, srs) -1×2 DataFrame - Row │ total se_total - │ Float64 Float64 -─────┼───────────────────── - 1 │ 4.06689e6 57292.8 -``` +Next, we can build a design. +#TODO: continue tutorial diff --git a/src/Survey.jl b/src/Survey.jl index fee13d6d..dd71a092 100644 --- a/src/Survey.jl +++ b/src/Survey.jl @@ -28,12 +28,11 @@ include("ratio.jl") include("by.jl") export load_data -export AbstractSurveyDesign, SimpleRandomSample, StratifiedSample -export SurveyDesign +export AbstractSurveyDesign, SurveyDesign, ReplicateDesign export dim, colnames, dimnames export mean, total, quantile export plot -export hist +export hist, sturges, freedman_diaconis export boxplot export bootweights export jkknife From 751813822326ea0e8001b0f42314dfc5e16795d7 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Tue, 10 Jan 2023 16:57:01 +0530 Subject: [PATCH 08/11] Fix docstring and minor style modifications --- src/bootstrap.jl | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/bootstrap.jl b/src/bootstrap.jl index 2055b6bf..b4e226a8 100644 --- a/src/bootstrap.jl +++ b/src/bootstrap.jl @@ -1,15 +1,15 @@ """ ```jldoctest -julia> using Survey, Random; +julia> using Random -julia> apiclus1 = load_data("apiclus1"); +julia> apiclus1 = load_data("apiclus1"); -julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum); +julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum); -julia> rng = MersenneTwister(111); +julia> rng = MersenneTwister(111); -julia> Survey.bootweights(dclus1; replicates=1000, rng) -Survey.ReplicateDesign: +julia> bootweights(dclus1; replicates=1000, rng) +ReplicateDesign: data: 183x1046 DataFrame cluster: dnum design.data[!,design.cluster]: 637, 637, 637, ..., 448 @@ -22,7 +22,7 @@ design.data[!,:allprobs]: 1.0, 1.0, 1.0, ..., 1.0 replicates: 1000 ``` """ -function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwister(1234)) +function bootweights(design::SurveyDesign; replicates=4000, rng=MersenneTwister(1234)) H = length(unique(design.data[!, design.strata])) stratified = groupby(design.data, design.strata) function replicate(stratified, H) @@ -45,10 +45,10 @@ function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwis return transform(stratified, :whij) end df = replicate(stratified, H) - rename!(df,:whij => :replicate_1) + rename!(df, :whij => :replicate_1) df.replicate_1 = disallowmissing(df.replicate_1) for i in 2:(replicates) - df[!, "replicate_"*string(i)] = disallowmissing(replicate(stratified, H).whij) + df[!, "replicate_" * string(i)] = disallowmissing(replicate(stratified, H).whij) end return ReplicateDesign(df, design.cluster, design.popsize, design.sampsize, design.strata, design.pps, replicates) end \ No newline at end of file From efe2f6a32db2c9fddfc61135ec7baf54aa379547 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Tue, 10 Jan 2023 17:02:57 +0530 Subject: [PATCH 09/11] Convert indentation to spaces --- docs/src/api.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/src/api.md b/docs/src/api.md index 5431b9ae..5b538a55 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -23,8 +23,8 @@ ratio(variable_num::Symbol, variable_den::Symbol, design::SurveyDesign) plot(design::AbstractSurveyDesign, x::Symbol, y::Symbol; kwargs...) boxplot(design::AbstractSurveyDesign, x::Symbol, y::Symbol; kwargs...) hist(design::AbstractSurveyDesign, var::Symbol, - bins::Union{Integer, AbstractVector} = freedman_diaconis(design, var); - normalization = :density, - kwargs... - ) + bins::Union{Integer, AbstractVector} = freedman_diaconis(design, var); + normalization = :density, + kwargs... + ) ``` From 2bac93875eaa85a5a16fb06cbb2fa513d16684a0 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Tue, 10 Jan 2023 17:16:22 +0530 Subject: [PATCH 10/11] Fix docstrings, minor rearrangements and style checks --- src/SurveyDesign.jl | 65 ++++++++++++++------------ src/mean.jl | 108 ++++++++++++++++++-------------------------- src/total.jl | 81 ++++++++++++++++++--------------- 3 files changed, 124 insertions(+), 130 deletions(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index 1541a3b9..07b9e1de 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -13,35 +13,32 @@ abstract type AbstractSurveyDesign end """ SurveyDesign <: AbstractSurveyDesign -Survey design sampled by one stage clusters sampling. -Clusters chosen by SRS followed by complete sampling of selected clusters. -Assumes each individual in one and only one clusters; disjoint and nested clusters. +General survey design encompassing a simple random, stratified, cluster or multi-stage design. -`clusters` must be specified as a Symbol name of a column in `data`. +In the case of cluster sample, the clusters are chosen by simple random sampling. All +individuals in one cluster are sampled. The clusters are considered disjoint and nested. # Arguments: `data::AbstractDataFrame`: the survey dataset (!this gets modified by the constructor). -`clusters::Symbol`: the stratification variable - must be given as a column in `data`. -`popsize::Union{Nothing,Symbol,<:Unsigned,Vector{<:Real}}=nothing`: the (expected) survey population size. For - -`weights::Union{Nothing,Symbol,Vector{<:Real}}=nothing`: the sampling weights. +`strata::Union{Nothing, Symbol}=nothing`: the stratification variable - must be given as a column in `data`. +`clusters::Union{Nothing, Symbol, Vector{Symbol}}=nothing`: the clustering variable - must be given as column(s) in `data`. +`weights::Union{Nothing, Symbol}=nothing`: the sampling weights. +`popsize::Union{Nothing, Int, Symbol}=nothing`: the (expected) survey population size. ```jldoctest julia> apiclus1 = load_data("apiclus1"); -julia> apiclus1[!, :pw] = fill(757/15,(size(apiclus1,1),)); # Correct api mistake for pw column - -julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) +julia> dclus1 = SurveyDesign(apiclus1; clusters=:dnum, weights=:pw) SurveyDesign: data: 183x46 DataFrame cluster: dnum design.data[!,design.cluster]: 637, 637, 637, ..., 448 popsize: popsize -design.data[!,design.popsize]: 9240.0, 9240.0, 9240.0, ..., 9240.0 +design.data[!,design.popsize]: 6190.0, 6190.0, 6190.0, ..., 6190.0 sampsize: sampsize design.data[!,design.sampsize]: 15, 15, 15, ..., 15 -design.data[!,:probs]: 0.0198, 0.0198, 0.0198, ..., 0.0198 -design.data[!,:allprobs]: 0.0198, 0.0198, 0.0198, ..., 0.0198 +design.data[!,:probs]: 0.0295, 0.0295, 0.0295, ..., 0.0295 +design.data[!,:allprobs]: 0.0295, 0.0295, 0.0295, ..., 0.0295 ``` """ struct SurveyDesign <: AbstractSurveyDesign @@ -52,9 +49,15 @@ struct SurveyDesign <: AbstractSurveyDesign strata::Symbol pps::Bool # Single stage clusters sample, like apiclus1 - function SurveyDesign(data::AbstractDataFrame; strata::Union{Nothing,Symbol} = nothing, weights::Union{Nothing,Symbol}= nothing, clusters::Union{Nothing, Symbol, Vector{Symbol}} = nothing, popsize::Union{Nothing, Int,Symbol}=nothing) + function SurveyDesign( + data::AbstractDataFrame; + strata::Union{Nothing, Symbol}=nothing, + clusters::Union{Nothing, Symbol, Vector{Symbol}}=nothing, + weights::Union{Nothing, Symbol}=nothing, + popsize::Union{Nothing, Int, Symbol}=nothing + ) # sampsize here is number of clusters completely sampled, popsize is total clusters in population - if typeof(strata) <:Nothing + if typeof(strata) <: Nothing data.false_strata = repeat(["FALSE_STRATA"], nrow(data)) strata = :false_strata end @@ -71,7 +74,7 @@ struct SurveyDesign <: AbstractSurveyDesign end # For one-stage sample only one sampsize vector sampsize_labels = :sampsize - data[!, sampsize_labels] = fill(length(unique(data[!, cluster])),(nrow(data),)) + data[!, sampsize_labels] = fill(length(unique(data[!, cluster])), (nrow(data),)) if !(typeof(popsize) <: Nothing) data[!, :weights] = data[!, popsize] ./ data[!, sampsize_labels] elseif !(typeof(weights) <: Nothing) @@ -91,24 +94,26 @@ struct SurveyDesign <: AbstractSurveyDesign end """ -```jldoctest -julia> apiclus1 = load_data("apiclus1"); + ReplicateDesign <: AbstractSurveyDesign -julia> apiclus1[!, :pw] = fill(757/15,(size(apiclus1,1),)); # Correct api mistake for pw column +Survey design obtained by replicating an original design using [`bootweights`](@ref). -julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw); +```jldoctest +julia> apistrat = load_data("apistrat"); -julia> bclus1 = Survey.bootweights(dclus1; replicates = 1000) -Survey.ReplicateDesign: -data: 183x1046 DataFrame -cluster: dnum -design.data[!,design.cluster]: 637, 637, 637, ..., 448 +julia> strat = SurveyDesign(apistrat; strata=:stype, weights=:pw); + +julia> bootstrat = bootweights(strat; replicates=1000) +ReplicateDesign: +data: 200x1046 DataFrame +cluster: false_cluster +design.data[!,design.cluster]: 1, 2, 3, ..., 200 popsize: popsize -design.data[!,design.popsize]: 9240.0, 9240.0, 9240.0, ..., 9240.0 +design.data[!,design.popsize]: 6190.0, 6190.0, 6190.0, ..., 6190.0 sampsize: sampsize -design.data[!,design.sampsize]: 15, 15, 15, ..., 15 -design.data[!,:probs]: 0.0198, 0.0198, 0.0198, ..., 0.0198 -design.data[!,:allprobs]: 0.0198, 0.0198, 0.0198, ..., 0.0198 +design.data[!,design.sampsize]: 200, 200, 200, ..., 200 +design.data[!,:probs]: 0.0226, 0.0226, 0.0226, ..., 0.0662 +design.data[!,:allprobs]: 0.0226, 0.0226, 0.0226, ..., 0.0662 replicates: 1000 ``` """ diff --git a/src/mean.jl b/src/mean.jl index 2bf8b925..0ef5bb37 100644 --- a/src/mean.jl +++ b/src/mean.jl @@ -1,19 +1,27 @@ """ -```jldoctest -julia> using Survey, Random, StatsBase; + mean(var, design) -julia> apiclus1 = load_data("apiclus1"); +Compute the estimated mean of one or more variables within a survey design. -julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw); +```jldoctest +julia> apiclus1 = load_data("apiclus1"); -julia> bclus1 = bootweights(dclus1; replicates = 1000) +julia> clus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) |> bootweights; -julia> mean(:api00, bclus1) +julia> mean(:api00, clus1) 1×2 DataFrame - Row │ mean SE - │ Float64 Float64 + Row │ mean SE + │ Float64 Float64 ─────┼────────────────── - 1 │ 644.169 23.7208 + 1 │ 644.169 23.2919 + +julia> mean([:api00, :enroll], clus1) +2×3 DataFrame + Row │ names mean SE + │ String Float64 Float64 +─────┼────────────────────────── + 1 │ api00 644.169 23.2919 + 2 │ enroll 549.716 45.3655 ``` """ function mean(x::Symbol, design::ReplicateDesign) @@ -22,59 +30,39 @@ function mean(x::Symbol, design::ReplicateDesign) variance = sum((Xt .- X).^2) / design.replicates DataFrame(mean = X, SE = sqrt(variance)) end + +function mean(x::Vector{Symbol}, design::ReplicateDesign) + df = reduce(vcat, [mean(i, design) for i in x]) + insertcols!(df, 1, :names => String.(x)) + return df +end + """ -```jldoctest -julia> using Survey, Random, StatsBase; + mean(var, domain, design) -julia> apiclus1 = load_data("apiclus1"); +Compute the estimated mean within a domain. -julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw); +```jldoctest +julia> apiclus1 = load_data("apiclus1"); -julia> bclus1 = bootweights(dclus1; replicates = 1000) +julia> clus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) |> bootweights; -julia> mean(:api00, :cname, bclus1) |> print -38×3 DataFrame - Row │ cname statistic SE - │ String15 Float64 Any -─────┼───────────────────────────────────────── - 1 │ Kern 573.6 44.5578 - 2 │ Los Angeles 658.156 22.2058 - 3 │ Orange 749.333 29.5701 - 4 │ San Luis Obispo 739.0 3.37273e-14 - 5 │ San Francisco 558.333 45.6266 - 6 │ Modoc 671.0 0.0 - 7 │ Alameda 676.091 37.3104 - 8 │ Solano 623.0 45.1222 - 9 │ Santa Cruz 624.333 113.43 - 10 │ Monterey 605.0 85.4116 - 11 │ San Bernardino 614.462 30.0066 - 12 │ Riverside 574.3 27.2025 - 13 │ Tulare 664.0 22.0097 - 14 │ San Diego 684.5 32.2241 - 15 │ Sacramento 616.0 39.7877 - 16 │ Marin 799.667 35.2397 - 17 │ Imperial 622.0 0.0 - 18 │ Ventura 743.8 31.7425 - 19 │ San Joaquin 608.667 40.8592 - 20 │ Sonoma 630.0 0.0 - 21 │ Fresno 600.25 56.9173 - 22 │ Santa Clara 718.286 58.562 - 23 │ Sutter 744.0 0.0 - 24 │ Contra Costa 766.111 53.598 - 25 │ Stanislaus 736.333 5.26576 - 26 │ Madera 480.0 3.5861 - 27 │ Placer 759.0 0.0 - 28 │ Lassen 752.0 0.0 - 29 │ Santa Barbara 728.667 25.8749 - 30 │ San Mateo 617.0 78.1173 - 31 │ Siskiyou 699.0 0.0 - 32 │ Kings 469.5 44.6284 - 33 │ Shasta 754.0 60.5829 - 34 │ Yolo 475.0 0.0 - 35 │ Calaveras 790.0 0.0 - 36 │ Napa 727.0 50.5542 - 37 │ Lake 804.0 0.0 - 38 │ Merced 595.0 0 +julia> mean(:api00, :cname, clus1) +11×3 DataFrame + Row │ cname mean SE + │ String15 Float64 Any +─────┼─────────────────────────────────── + 1 │ Alameda 669.0 1.27388e-13 + 2 │ Fresno 472.0 1.13687e-13 + 3 │ Kern 452.5 0.0 + 4 │ Los Angeles 647.267 47.4938 + 5 │ Mendocino 623.25 1.0931e-13 + 6 │ Merced 519.25 4.57038e-15 + 7 │ Orange 710.563 2.19684e-13 + 8 │ Plumas 709.556 1.27773e-13 + 9 │ San Diego 659.436 2.63446 + 10 │ San Joaquin 551.189 2.17471e-13 + 11 │ Santa Clara 732.077 56.2584 ``` """ function mean(x::Symbol, domain::Symbol, design::ReplicateDesign) @@ -82,10 +70,4 @@ function mean(x::Symbol, domain::Symbol, design::ReplicateDesign) df = bydomain(x, domain, design, weighted_mean) rename!(df, :statistic => :mean) return df -end - -function mean(x::Vector{Symbol}, design::ReplicateDesign) - df = reduce(vcat, [mean(i, design) for i in x]) - insertcols!(df, 1, :names => String.(x)) - return df end \ No newline at end of file diff --git a/src/total.jl b/src/total.jl index fdf83216..e5fbbdcb 100644 --- a/src/total.jl +++ b/src/total.jl @@ -1,21 +1,27 @@ """ -```jldoctest -julia> using Survey; - -julia> apiclus1 = load_data("apiclus1"); + total(var, design) -julia> apiclus1[!, :pw] = fill(757/15,(size(apiclus1_original,1),)) # Correct api mistake for pw column +Compute the estimated population total for one or more variables within a survey design. -julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw); +```jldoctest +julia> apiclus1 = load_data("apiclus1"); -julia> bclus1 = bootweights(dclus1; replicates = 1000); +julia> clus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) |> bootweights; -julia> total(:api00, bclus1) +julia> total(:api00, clus1) 1×2 DataFrame - Row │ total SE - │ Float64 Float64 + Row │ total SE + │ Float64 Float64 ─────┼────────────────────── - 1 │ 5.94916e6 1.31977e6 + 1 │ 3.98999e6 9.22175e5 + +julia> total([:api00, :enroll], clus1) +2×3 DataFrame + Row │ names total SE + │ String Float64 Float64 +─────┼────────────────────────────── + 1 │ api00 3.98999e6 9.22175e5 + 2 │ enroll 3.40494e6 9.51557e5 ``` """ function total(x::Symbol, design::ReplicateDesign) @@ -24,41 +30,42 @@ function total(x::Symbol, design::ReplicateDesign) variance = sum((Xt .- X).^2) / design.replicates DataFrame(total = X, SE = sqrt(variance)) end + +function total(x::Vector{Symbol}, design::ReplicateDesign) + df = reduce(vcat, [total(i, design) for i in x]) + insertcols!(df, 1, :names => String.(x)) + return df +end + """ -```jldoctest -julia> using Survey; + total(var, domain, design) -julia> apiclus1 = load_data("apiclus1"); +Compute the estimated population total within a domain. -julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw); +```jldoctest +julia> apiclus1 = load_data("apiclus1"); -julia> bclus1 = bootweights(dclus1; replicates = 1000); +julia> clus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) |> bootweights; -julia> total(:api00, :cname, bclus1) |> print +julia> total(:api00, :cname, clus1) 11×3 DataFrame - Row │ cname statistic SE - │ String15 Float64 Any -─────┼─────────────────────────────────────── - 1 │ Alameda 3.71384e5 3.78375e5 - 2 │ Fresno 95281.1 96134.8 - 3 │ Kern 45672.3 43544.7 - 4 │ Los Angeles 4.89981e5 4.42865e5 - 5 │ Mendocino 1.25813e5 1.22757e5 - 6 │ Merced 1.04819e5 1.09032e5 - 7 │ Orange 5.73756e5 6.01213e5 - 8 │ Plumas 3.2228e5 3.26443e5 - 9 │ San Diego 1.83038e6 1.34155e6 - 10 │ San Joaquin 1.02922e6 1.04048e6 - 11 │ Santa Clara 9.60583e5 643492.0 + Row │ cname total SE + │ String15 Float64 Any +─────┼──────────────────────────────────────── + 1 │ Alameda 249080.0 2.48842e5 + 2 │ Fresno 63903.1 64452.2 + 3 │ Kern 30631.5 31083.0 + 4 │ Los Angeles 3.2862e5 2.93649e5 + 5 │ Mendocino 84380.6 83154.4 + 6 │ Merced 70300.2 69272.5 + 7 │ Orange 3.84807e5 3.90097e5 + 8 │ Plumas 2.16147e5 2.17811e5 + 9 │ San Diego 1.2276e6 8.78559e5 + 10 │ San Joaquin 6.90276e5 6.90685e5 + 11 │ Santa Clara 6.44244e5 4.09943e5 ``` """ function total(x::Symbol, domain::Symbol, design::ReplicateDesign) df = bydomain(x, domain, design, wsum) rename!(df, :statistic => :total) -end - -function total(x::Vector{Symbol}, design::ReplicateDesign) - df = reduce(vcat, [total(i, design) for i in x]) - insertcols!(df, 1, :names => String.(x)) - return df end \ No newline at end of file From 76275cd66b93f0b5794d8a8638023c2e2857da01 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Tue, 10 Jan 2023 17:26:11 +0530 Subject: [PATCH 11/11] Fix argument rendering in docstring --- src/SurveyDesign.jl | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index 07b9e1de..69657cd0 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -18,12 +18,14 @@ General survey design encompassing a simple random, stratified, cluster or multi In the case of cluster sample, the clusters are chosen by simple random sampling. All individuals in one cluster are sampled. The clusters are considered disjoint and nested. +`strata` and `clusters` must be given as columns in `data`. + # Arguments: -`data::AbstractDataFrame`: the survey dataset (!this gets modified by the constructor). -`strata::Union{Nothing, Symbol}=nothing`: the stratification variable - must be given as a column in `data`. -`clusters::Union{Nothing, Symbol, Vector{Symbol}}=nothing`: the clustering variable - must be given as column(s) in `data`. -`weights::Union{Nothing, Symbol}=nothing`: the sampling weights. -`popsize::Union{Nothing, Int, Symbol}=nothing`: the (expected) survey population size. +- `data::AbstractDataFrame`: the survey dataset (!this gets modified by the constructor). +- `strata::Union{Nothing, Symbol}=nothing`: the stratification variable. +- `clusters::Union{Nothing, Symbol, Vector{Symbol}}=nothing`: the clustering variable. +- `weights::Union{Nothing, Symbol}=nothing`: the sampling weights. +- `popsize::Union{Nothing, Int, Symbol}=nothing`: the (expected) survey population size. ```jldoctest julia> apiclus1 = load_data("apiclus1");