From 140eb889ea3023bdb8f78d2eadc1b97c0516ab51 Mon Sep 17 00:00:00 2001 From: ayushpatnaikgit Date: Fri, 30 Dec 2022 22:34:46 +0530 Subject: [PATCH 01/80] Add general survey design --- docs/src/api.md | 2 +- src/Survey.jl | 4 +- src/SurveyDesign.jl | 91 +++++++++++++++++++++----------------------- src/bootstrap.jl | 40 ++++++++++++------- src/jackknife.jl | 2 +- src/mean.jl | 16 ++++++-- src/ratio.jl | 4 +- src/show.jl | 8 ++-- src/total.jl | 10 ++--- test/SurveyDesign.jl | 6 +-- test/bootstrap.jl | 2 +- test/jackknife.jl | 2 +- test/mean.jl | 2 +- test/ratio.jl | 2 +- test/total.jl | 2 +- 15 files changed, 105 insertions(+), 88 deletions(-) diff --git a/docs/src/api.md b/docs/src/api.md index 5554b427..062d379d 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -27,7 +27,7 @@ mean(x::Symbol, by::Symbol, design::SimpleRandomSample) total(x::Symbol, by::Symbol, design::SimpleRandomSample) ``` ```@docs -ratio(variable_num:: Symbol, variable_den:: Symbol, design::OneStageClusterSample) +ratio(variable_num:: Symbol, variable_den:: Symbol, design::SurveyDesign) plot(design::AbstractSurveyDesign, x::Symbol, y::Symbol; kwargs...) boxplot(design::AbstractSurveyDesign, x::Symbol, y::Symbol; kwargs...) 
hist(design::AbstractSurveyDesign, var::Symbol, diff --git a/src/Survey.jl b/src/Survey.jl index 086e11ef..8854e1d3 100644 --- a/src/Survey.jl +++ b/src/Survey.jl @@ -28,11 +28,11 @@ include("ratio.jl") export load_data export AbstractSurveyDesign, SimpleRandomSample, StratifiedSample -export OneStageClusterSample +export SurveyDesign export dim, colnames, dimnames export mean, total, quantile export plot -export hist, sturges, freedman_diaconis +export hist export boxplot export Bootstrap export jkknife diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index 30770b77..a17655c0 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -323,17 +323,17 @@ struct StratifiedSample <: AbstractSurveyDesign end """ - OneStageClusterSample <: AbstractSurveyDesign + SurveyDesign <: AbstractSurveyDesign -Survey design sampled by one stage cluster sampling. +Survey design sampled by one stage clusters sampling. Clusters chosen by SRS followed by complete sampling of selected clusters. -Assumes each individual in one and only one cluster; disjoint and nested clusters. +Assumes each individual in one and only one clusters; disjoint and nested clusters. -`cluster` must be specified as a Symbol name of a column in `data`. +`clusters` must be specified as a Symbol name of a column in `data`. # Arguments: `data::AbstractDataFrame`: the survey dataset (!this gets modified by the constructor). -`cluster::Symbol`: the stratification variable - must be given as a column in `data`. +`clusters::Symbol`: the stratification variable - must be given as a column in `data`. `popsize::Union{Nothing,Symbol,<:Unsigned,Vector{<:Real}}=nothing`: the (expected) survey population size. For `weights::Union{Nothing,Symbol,Vector{<:Real}}=nothing`: the sampling weights. 
@@ -343,11 +343,11 @@ julia> apiclus1 = load_data("apiclus1"); julia> apiclus1[!, :pw] = fill(757/15,(size(apiclus1,1),)); # Correct api mistake for pw column -julia> dclus1 = OneStageClusterSample(apiclus1, :dnum, :fpc) -OneStageClusterSample: +julia> dclus1 = SurveyDesign(apiclus1, :dnum, :fpc) +SurveyDesign: data: 183x45 DataFrame -cluster: dnum -design.data[!,design.cluster]: 637, 637, 637, ..., 448 +clusters: dnum +design.data[!,design.clusters]: 637, 637, 637, ..., 448 popsize: fpc design.data[!,design.popsize]: 757, 757, 757, ..., 757 sampsize: sampsize @@ -362,11 +362,11 @@ julia> apiclus1 = load_data("apiclus1"); julia> apiclus1[!, :pw] = fill(757/15,(size(apiclus1,1),)); # Correct api mistake for pw column -julia> dclus1 = OneStageClusterSample(apiclus1, :dnum; weights=:pw) -OneStageClusterSample: +julia> dclus1 = SurveyDesign(apiclus1, :dnum; weights=:pw) +SurveyDesign: data: 183x46 DataFrame -cluster: dnum -design.data[!,design.cluster]: 637, 637, 637, ..., 448 +clusters: dnum +design.data[!,design.clusters]: 637, 637, 637, ..., 448 popsize: popsize design.data[!,design.popsize]: 757.0, 757.0, 757.0, ..., 757.0 sampsize: sampsize @@ -378,53 +378,48 @@ design.data[!,:probs]: 0.0198, 0.0198, 0.0198, ..., 0.0198 design.data[!,:allprobs]: 0.0198, 0.0198, 0.0198, ..., 0.0198 ``` """ -struct OneStageClusterSample <: AbstractSurveyDesign +struct SurveyDesign <: AbstractSurveyDesign data::AbstractDataFrame cluster::Symbol popsize::Symbol sampsize::Symbol - weights::Symbol + strata::Symbol pps::Bool - has_strata::Bool - # Single stage cluster sample, like apiclus1 - function OneStageClusterSample(data::AbstractDataFrame, cluster::Symbol, popsize::Symbol; kwargs...) 
# Right now kwargs does nothing, for expansion + # Single stage clusters sample, like apiclus1 + function SurveyDesign(data::AbstractDataFrame; strata::Union{Nothing,Symbol} = nothing, weights::Union{Nothing,Symbol}= nothing, clusters::Union{Nothing, Symbol, Vector{Symbol}} = nothing, popsize::Union{Nothing, Int,Symbol}=nothing) # sampsize here is number of clusters completely sampled, popsize is total clusters in population - if !(typeof(data[!, popsize]) <: Vector{<:Real}) - error(string("given popsize column ", popsize , " is not of numeric type")) + if typeof(strata) <:Nothing + data.false_strata = repeat(["FALSE_STRATA"], nrow(data)) + strata = :false_strata + end + if typeof(clusters) <: Nothing + data.false_cluster = 1:nrow(data) + cluster = :false_cluster + end + ## Single stage approximation + if typeof(clusters) <: Vector{Symbol} + cluster = first(clusters) end - if !all(w -> w == first(data[!, popsize]), data[!, popsize]) - error("popsize must be same for all observations within the cluster in ClusterSample") + if typeof(clusters) <: Symbol + cluster = clusters end # For one-stage sample only one sampsize vector sampsize_labels = :sampsize - data_groupedby_cluster = groupby(data, cluster) - data[!, sampsize_labels] = fill(size(data_groupedby_cluster, 1),(nrow(data),)) - weights = :weights - data[!, :weights] = data[!, popsize] ./ data[!, sampsize_labels] - data[!, :probs] = 1 ./ data[!, weights] # Many formulae are easily defined in terms of sampling probabilties - data[!, :allprobs] = data[!, :probs] # In one-stage cluster sample, allprobs is just probs, no multiplication needed - data[!, :strata] = ones(nrow(data)) - pps = false - has_strata = false - new(data, cluster, popsize, sampsize_labels, weights ,pps, has_strata) - end - # Single stage cluster sample, like apiclus1 - function OneStageClusterSample(data::AbstractDataFrame, cluster::Symbol; weights::Symbol=nothing, kwargs...) 
# Right now kwargs does nothing, for expansion - # sampsize here is number of clusters completely sampled, popsize is total clusters in population - if !(typeof(data[!, weights]) <: Vector{<:Real}) - error(string("given weights column ", weights , " is not of numeric type")) + data[!, sampsize_labels] = fill(length(unique(data[!, cluster])),(nrow(data),)) + if !(typeof(popsize) <: Nothing) + data[!, :weights] = data[!, popsize] ./ data[!, sampsize_labels] + elseif !(typeof(weights) <: Nothing) + data.weights = data[!, weights] + else + data.weights = repeat([1], nrow(data)) end - sampsize_labels = :sampsize - data_groupedby_cluster = groupby(data, cluster) - data[!, sampsize_labels] = fill(size(data_groupedby_cluster, 1),(nrow(data),)) - popsize = :popsize - data[!, popsize] = data[!, weights] .* data[!, sampsize_labels] - data[!, :probs] = 1 ./ data[!, weights] # Many formulae are easily defined in terms of sampling probabilties - data[!, :weights] = data[!, weights] + data[!, :probs] = 1 ./ data[!, :weights] # Many formulae are easily defined in terms of sampling probabilties data[!, :allprobs] = data[!, :probs] # In one-stage cluster sample, allprobs is just probs, no multiplication needed - data[!, :strata] = ones(nrow(data)) pps = false - has_strata = false - new(data, cluster, popsize, sampsize_labels, weights, pps, has_strata) + if !(typeof(popsize) <: Symbol) + data.popsize = repeat([sum(data.weights)], nrow(data)) + popsize = :popsize + end + new(data, cluster, popsize, sampsize_labels, strata, pps) end end \ No newline at end of file diff --git a/src/bootstrap.jl b/src/bootstrap.jl index 27c84a73..1320cd51 100644 --- a/src/bootstrap.jl +++ b/src/bootstrap.jl @@ -1,7 +1,7 @@ struct Bootstrap replicates rng - function Bootstrap(; replicates = 100, rng = MersenneTwister(111)) + function Bootstrap(; replicates = 1000, rng = MersenneTwister(111)) new(replicates, rng) end end @@ -12,7 +12,7 @@ julia> using Survey, Random, StatsBase; julia> apiclus1 = 
load_data("apiclus1"); -julia> dclus1 = OneStageClusterSample(apiclus1, :dnum, :fpc); +julia> dclus1 = SurveyDesign(apiclus1, :dnum, :fpc); julia> rng = MersenneTwister(111); @@ -27,18 +27,32 @@ julia> Survey.bootstrap(:api00, dclus1, func; replicates=1000, rng) ``` """ -function bootstrap(x::Symbol, design::OneStageClusterSample, func = wsum; replicates = 100, rng = MersenneTwister(1234)) - gdf = groupby(design.data, design.cluster) - psus = unique(design.data[!, design.cluster]) - nh = length(psus) +function bootstrap(x::Symbol, design::SurveyDesign, func = wsum; replicates = 100, rng = MersenneTwister(1234)) X = func(design.data[:, x], design.data.weights) + H = length(unique(design.data[!, design.strata])) + stratified = groupby(design.data, design.strata) Xt = Array{Float64, 1}(undef, replicates) for i in 1:replicates - selected_psus = psus[rand(rng, 1:nh, (nh-1))] # simple random sample of PSUs, with replacement. Select (nh-1) out of nh - xhij = (reduce(vcat, [gdf[(i,)][!, x] for i in selected_psus])) - whij = (reduce(vcat, [gdf[(i,)].weights * (nh / (nh - 1)) for i in selected_psus])) - Xt[i] = func(xhij, whij) - end - variance = sum((Xt .- X).^2) / replicates + Xh = [] + Wh = [] + for j in 1:H + substrata = DataFrame(stratified[j]) + psus = unique(substrata[!, design.cluster]) + if length(psus) == 1 + return DataFrame(statistic = X, SE = 0) + end + nh = length(psus) + gdf = groupby(substrata, design.cluster) + selected_psus = psus[rand(rng, 1:nh, (nh-1))] # simple random sample of PSUs, with replacement. 
Select (nh-1) out of nh + xhij = (reduce(vcat, [gdf[(i,)][!, x] for i in selected_psus])) + whij = (reduce(vcat, [gdf[(i,)].weights * (nh / (nh - 1)) for i in selected_psus])) + append!(Xh, xhij) + append!(Wh, whij) + end + Xh = Float64.(Xh) + Wh = Float64.(Wh) + Xt[i] = func(Xh, Wh) + end + variance = sum((Xt .- X).^2) / replicates return DataFrame(statistic = X, SE = sqrt(variance)) -end +end \ No newline at end of file diff --git a/src/jackknife.jl b/src/jackknife.jl index 2d0a656d..794ef10b 100644 --- a/src/jackknife.jl +++ b/src/jackknife.jl @@ -1,4 +1,4 @@ -function jkknife(variable:: Symbol, design::OneStageClusterSample ,func:: Function; params =[]) +function jkknife(variable:: Symbol, design::SurveyDesign ,func:: Function; params =[]) statistic = func(design.data[!,variable],params...) nh = length(unique(design.data[!,design.cluster])) newv = [] diff --git a/src/mean.jl b/src/mean.jl index c54cc337..d56c815d 100644 --- a/src/mean.jl +++ b/src/mean.jl @@ -6,7 +6,7 @@ Estimate the population mean of a variable of a simple random sample, and the co The calculations were done according to the book [Sampling Techniques](https://www.academia.edu/29684662/Cochran_1977_Sampling_Techniques_Third_Edition) by William Cochran. 
-For OneStageClusterSample, formula adapted from Sarndal pg129, section 4.2.2 Simple Random Cluster Sampling +For SurveyDesign, formula adapted from Sarndal pg129, section 4.2.2 Simple Random Cluster Sampling ```jldoctest julia> apisrs = load_data("apisrs"); @@ -92,7 +92,7 @@ function mean(x::Symbol, design::StratifiedSample) return DataFrame(mean=Ȳ̂, SE=SE) end -function mean(x::Symbol, design::OneStageClusterSample) +function mean(x::Symbol, design::SurveyDesign) ## Based on logical translation of corresponding in total.jl ## Not quite same from R as it rounds of `total`, so division results in difference # > svymean(~api00,dclus1) @@ -190,7 +190,7 @@ julia> using Survey, Random, StatsBase; julia> apiclus1 = load_data("apiclus1"); -julia> dclus1 = OneStageClusterSample(apiclus1, :dnum, :fpc); +julia> dclus1 = SurveyDesign(apiclus1, :dnum, :fpc); julia> mean(:api00, dclus1, Bootstrap(replicates = 1000, rng = MersenneTwister(111))) 1×2 DataFrame @@ -200,8 +200,16 @@ julia> mean(:api00, dclus1, Bootstrap(replicates = 1000, rng = MersenneTwister(1 1 │ 644.169 23.0897 ``` """ -function mean(x::Symbol, design::OneStageClusterSample, method::Bootstrap) +function mean(x::Symbol, design::SurveyDesign, method::Bootstrap) weighted_mean(x, w) = mean(x, weights(w)) df = bootstrap(x, design, weighted_mean; method.replicates, method.rng) df = rename(df, :statistic => :mean) +end + +function mean(x::Symbol, by::Symbol, design::SurveyDesign, method::Bootstrap) + gdf = groupby(design.data, by) + subdesigns = [SurveyDesign(gdf[i]; strata = design.strata, weights = :weights, clusters = design.cluster) for i in 1:length(gdf)] + df = vcat([mean(x, subdesign, method) for subdesign in subdesigns]...) 
+ df[!, by] = [first(gdf[i][!, by]) for i in 1:length(gdf)] + return df end \ No newline at end of file diff --git a/src/ratio.jl b/src/ratio.jl index c6050635..4a75e9aa 100644 --- a/src/ratio.jl +++ b/src/ratio.jl @@ -9,7 +9,7 @@ julia> apiclus1 = load_data("apiclus1"); julia> apiclus1[!, :pw] = fill(757/15,(size(apiclus1,1),)); # Correct api mistake for pw column -julia> dclus1 = OneStageClusterSample(apiclus1, :dnum, :fpc); +julia> dclus1 = SurveyDesign(apiclus1, :dnum, :fpc); julia> ratio(:api00, :enroll, dclus1) 1×2 DataFrame @@ -19,7 +19,7 @@ julia> ratio(:api00, :enroll, dclus1) 1 │ 1.17182 0.151242 ``` """ -function ratio(variable_num:: Symbol, variable_den:: Symbol, design::OneStageClusterSample) +function ratio(variable_num:: Symbol, variable_den:: Symbol, design::SurveyDesign) statistic = wsum(design.data[!,variable_num],design.data.weights)/wsum(design.data[!,variable_den],design.data.weights) nh = length(unique(design.data[!,design.cluster])) newv = [] diff --git a/src/show.jl b/src/show.jl index 1bc9989a..a912e723 100644 --- a/src/show.jl +++ b/src/show.jl @@ -49,7 +49,7 @@ function Base.show(io::IO, ::MIME"text/plain", design::StratifiedSample) end "Print information about a survey design." 
-function Base.show(io::IO, ::MIME"text/plain", design::OneStageClusterSample) +function Base.show(io::IO, ::MIME"text/plain", design::SurveyDesign) type = typeof(design) printstyled(io, "$type:\n"; bold=true) printstyled(io, "data: "; bold=true) @@ -60,9 +60,9 @@ function Base.show(io::IO, ::MIME"text/plain", design::OneStageClusterSample) printinfo(io, "design.data[!,design.popsize]", makeshort(design.data[!,design.popsize])) printinfo(io, "sampsize", string(design.sampsize); newline=true) printinfo(io, "design.data[!,design.sampsize]", makeshort(design.data[!,design.sampsize])) - printinfo(io, "weights", string(design.weights); newline=true) - printinfo(io, "design.data[!,design.weights]", makeshort(design.data[!,design.weights])) - printinfo(io, "design.data[!,:strata]", makeshort(design.data[!,:strata])) + # printinfo(io, "weights", string(design.weights); newline=true) + # printinfo(io, "design.data[!,design.weights]", makeshort(design.data[!,design.weights])) + # printinfo(io, "design.data[!,:strata]", makeshort(design.data[!,:strata])) printinfo(io, "design.data[!,:probs]", makeshort(design.data.probs)) printinfo(io, "design.data[!,:allprobs]", makeshort(design.data.allprobs)) end \ No newline at end of file diff --git a/src/total.jl b/src/total.jl index 087c6123..f9824255 100644 --- a/src/total.jl +++ b/src/total.jl @@ -3,7 +3,7 @@ Estimate the population total for the variable specified by `x`. 
-For OneStageClusterSample, formula adapted from Sarndal pg129, section 4.2.2 Simple Random Cluster Sampling +For SurveyDesign, formula adapted from Sarndal pg129, section 4.2.2 Simple Random Cluster Sampling ```jldoctest julia> using Survey; @@ -94,7 +94,7 @@ julia> using Survey julia> apiclus1 = load_data("apiclus1"); -julia> dclus1 = OneStageClusterSample(apiclus1, :dnum, :fpc); +julia> dclus1 = SurveyDesign(apiclus1, :dnum, :fpc); julia> total(:api00, dclus1) 1×2 DataFrame @@ -104,7 +104,7 @@ julia> total(:api00, dclus1) 1 │ 5.94916e6 1.33948e6 ``` """ -function total(x::Symbol, design::OneStageClusterSample) +function total(x::Symbol, design::SurveyDesign) gdf = groupby(design.data, design.cluster) ŷₜ = combine(gdf, x => sum => :sum).sum Nₜ = first(design.data[!,design.popsize]) @@ -157,7 +157,7 @@ julia> using Survey, Random, StatsBase; julia> apiclus1 = load_data("apiclus1"); -julia> dclus1 = OneStageClusterSample(apiclus1, :dnum, :fpc); +julia> dclus1 = SurveyDesign(apiclus1, :dnum, :fpc); julia> total(:api00, dclus1, Bootstrap(replicates = 1000, rng = MersenneTwister(111))) 1×2 DataFrame @@ -167,7 +167,7 @@ julia> total(:api00, dclus1, Bootstrap(replicates = 1000, rng = MersenneTwister( 1 │ 5.94916e6 1.36593e6 ``` """ -function total(x::Symbol, design::OneStageClusterSample, method::Bootstrap) +function total(x::Symbol, design::SurveyDesign, method::Bootstrap) df = bootstrap(x, design, wsum; method.replicates, method.rng) df = rename(df, :statistic => :total) end \ No newline at end of file diff --git a/test/SurveyDesign.jl b/test/SurveyDesign.jl index 0ec15b5d..0f3bc796 100644 --- a/test/SurveyDesign.jl +++ b/test/SurveyDesign.jl @@ -165,14 +165,14 @@ end @test_throws ErrorException StratifiedSample(apistrat, :stype; popsize= :pw, sampsize=:fpc) end -@testset "OneStageClusterSample" begin +@testset "SurveyDesign" begin # Load API datasets apiclus1_original = load_data("apiclus1") apiclus1_original[!, :pw] = fill(757/15,(size(apiclus1_original,1),)) # 
Correct api mistake for pw column ############################## # one-stage cluster sample with popsize apiclus1 = copy(apiclus1_original) - dclus1 = OneStageClusterSample(apiclus1, :dnum, :fpc) + dclus1 = SurveyDesign(apiclus1, :dnum, :fpc) @test dclus1.data[!,dclus1.weights] ≈ fill(50.4667,size(apiclus1,1)) atol = 1e-3 @test dclus1.data[!,dclus1.sampsize] ≈ fill(15,size(apiclus1,1)) @test dclus1.data[!,:allprobs] ≈ dclus1.data[!,:probs] atol = 1e-4 @@ -180,7 +180,7 @@ end ############################## # one-stage cluster sample with weights apiclus1 = copy(apiclus1_original) - dclus1 = OneStageClusterSample(apiclus1, :dnum; weights=:pw) + dclus1 = SurveyDesign(apiclus1, :dnum; weights=:pw) @test dclus1.data[!,dclus1.weights] ≈ fill(50.4667,size(apiclus1,1)) atol = 1e-3 @test dclus1.data[!,dclus1.sampsize] ≈ fill(15,size(apiclus1,1)) @test dclus1.data[!,:allprobs] ≈ dclus1.data[!,:probs] atol = 1e-4 diff --git a/test/bootstrap.jl b/test/bootstrap.jl index 7c9778eb..e49c1b3f 100644 --- a/test/bootstrap.jl +++ b/test/bootstrap.jl @@ -1,6 +1,6 @@ using Random, StatsBase apiclus1 = load_data("apiclus1") -dclus1 = OneStageClusterSample(apiclus1, :dnum, :fpc); +dclus1 = SurveyDesign(apiclus1, :dnum, :fpc); rng = MersenneTwister(111); func = wsum; est = Survey.bootstrap(:api00, dclus1, func; replicates=1000, rng) diff --git a/test/jackknife.jl b/test/jackknife.jl index 85ca3261..73e90f78 100644 --- a/test/jackknife.jl +++ b/test/jackknife.jl @@ -4,7 +4,7 @@ ############################## # one-stage cluster sample apiclus1 = copy(apiclus1_original) - dclus1 = OneStageClusterSample(apiclus1, :dnum, :fpc) + dclus1 = SurveyDesign(apiclus1, :dnum, :fpc) @test jkknife(:api00,dclus1, mean).SE[1] ≈ 26.5997 atol = 1e-4 @test jkknife(:api00, dclus1, mean).Statistic[1] ≈ 644.1693 atol = 1e-4 end diff --git a/test/mean.jl b/test/mean.jl index bbc1d952..d2fbd8a2 100644 --- a/test/mean.jl +++ b/test/mean.jl @@ -80,7 +80,7 @@ end ############################## # one-stage cluster 
sample apiclus1 = copy(apiclus1_original) - dclus1 = OneStageClusterSample(apiclus1, :dnum, :fpc) + dclus1 = SurveyDesign(apiclus1, :dnum, :fpc) @test mean(:api00,dclus1, Bootstrap()).mean[1] ≈ 644.17 atol = 1 @test mean(:api00,dclus1, Bootstrap(replicates = 10000)).SE[1] ≈ 23.779 atol = 0.5 # without fpc as it hasn't been figured out for bootstrap. diff --git a/test/ratio.jl b/test/ratio.jl index 9ac25a1a..d198ce1b 100644 --- a/test/ratio.jl +++ b/test/ratio.jl @@ -4,7 +4,7 @@ ############################## # one-stage cluster sample apiclus1 = copy(apiclus1_original) - dclus1 = OneStageClusterSample(apiclus1, :dnum, :fpc) + dclus1 = SurveyDesign(apiclus1, :dnum, :fpc) @test ratio(:api00, :enroll, dclus1).SE[1] ≈ 0.151242 atol = 1e-4 @test ratio(:api00, :enroll, dclus1).Statistic[1] ≈ 1.17182 atol = 1e-4 end \ No newline at end of file diff --git a/test/total.jl b/test/total.jl index c57114d1..de19c41c 100644 --- a/test/total.jl +++ b/test/total.jl @@ -100,7 +100,7 @@ end ############################## # one-stage cluster sample apiclus1 = copy(apiclus1_original) - dclus1 = OneStageClusterSample(apiclus1, :dnum, :fpc) + dclus1 = SurveyDesign(apiclus1, :dnum, :fpc) @test total(:api00,dclus1).total[1] ≈ 5949162 atol = 1 @test total(:api00,dclus1).SE[1] ≈ 1339481 atol = 1 From 5c7fe92f35ace80361c91db130a8141e5ce7c42c Mon Sep 17 00:00:00 2001 From: ayushpatnaikgit Date: Sat, 31 Dec 2022 19:18:49 +0530 Subject: [PATCH 02/80] Add bootweights and domain estimation --- src/SurveyDesign.jl | 12 +++++++++++- src/bootstrap.jl | 28 ++++++++++++++++++++++++++++ src/mean.jl | 37 +++++++++++++++++++++++++++++++------ src/ratio.jl | 4 ++++ src/show.jl | 21 +++++++++++++++++++++ 5 files changed, 95 insertions(+), 7 deletions(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index a17655c0..acfafa50 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -422,4 +422,14 @@ struct SurveyDesign <: AbstractSurveyDesign end new(data, cluster, popsize, sampsize_labels, 
strata, pps) end -end \ No newline at end of file +end + +struct ReplicateDesign <: AbstractSurveyDesign + data::AbstractDataFrame + cluster::Symbol + popsize::Symbol + sampsize::Symbol + strata::Symbol + pps::Bool + replicates::UInt +end diff --git a/src/bootstrap.jl b/src/bootstrap.jl index 1320cd51..e454a82c 100644 --- a/src/bootstrap.jl +++ b/src/bootstrap.jl @@ -55,4 +55,32 @@ function bootstrap(x::Symbol, design::SurveyDesign, func = wsum; replicates = 10 end variance = sum((Xt .- X).^2) / replicates return DataFrame(statistic = X, SE = sqrt(variance)) +end + +function bootweights(design::SurveyDesign; replicates = 100, rng = MersenneTwister(1234)) + H = length(unique(design.data[!, design.strata])) + stratified = groupby(design.data, design.strata) + function replicate(stratified, H) + for j in 1:H + substrata = DataFrame(stratified[j]) + psus = unique(substrata[!, design.cluster]) + if length(psus) == 1 + return DataFrame(statistic = X, SE = 0) + end + nh = length(psus) + rh = [(count(==(i), rand(1:(nh-1), nh))) for i in 1:nh] # main bootstrap algo. 
+ gdf = groupby(substrata, design.cluster) + for i in 1:nh + gdf[i].rh = repeat([rh[i]], nrow(gdf[i])) + end + stratified[j].rh = DataFrame(gdf).rh + end + return DataFrame(stratified) + end + df = replicate(stratified, H) + rename!(df,:rh => :replicate_1) + for i in 2:(replicates) + df[!, "replicate_"*string(i)] = Float64.(replicate(stratified, H).rh) + end + return ReplicateDesign(df, design.cluster, design.popsize, design.sampsize, design.strata, design.pps, replicates) end \ No newline at end of file diff --git a/src/mean.jl b/src/mean.jl index d56c815d..ff6e78dd 100644 --- a/src/mean.jl +++ b/src/mean.jl @@ -206,10 +206,35 @@ function mean(x::Symbol, design::SurveyDesign, method::Bootstrap) df = rename(df, :statistic => :mean) end -function mean(x::Symbol, by::Symbol, design::SurveyDesign, method::Bootstrap) - gdf = groupby(design.data, by) - subdesigns = [SurveyDesign(gdf[i]; strata = design.strata, weights = :weights, clusters = design.cluster) for i in 1:length(gdf)] - df = vcat([mean(x, subdesign, method) for subdesign in subdesigns]...) - df[!, by] = [first(gdf[i][!, by]) for i in 1:length(gdf)] - return df +# function mean(x::Symbol, by::Symbol, design::SurveyDesign, method::Bootstrap) +# gdf = groupby(design.data, by) +# subdesigns = [SurveyDesign(gdf[i]; strata = design.strata, weights = :weights, clusters = design.cluster) for i in 1:length(gdf)] +# df = vcat([mean(x, subdesign, method) for subdesign in subdesigns]...) +# df[!, by] = [first(gdf[i][!, by]) for i in 1:length(gdf)] +# return df +# end + +function mean(x::Symbol, design::ReplicateDesign) + X = mean(design.data[!, x], weights(design.data.weights)) + Xt = [mean(design.data[!, x], weights(design.data.weights .* design.data[! 
, "replicate_"*string(i)])) for i in 1:design.replicates] + variance = sum((Xt .- X).^2) / design.replicates + DataFrame(mean = X, SE = sqrt(variance)) +end + + +function mean(x::Symbol, domain::Symbol, design::ReplicateDesign) + gdf = groupby(design.data, domain) + X = combine(gdf, [x, :weights] => ((a, b) -> mean(a, weights(b))) => :mean) + Xt_mat = Array{Float64, 2}(undef, (length(unique(design.data[!, domain])), design.replicates)) + for i in 1:design.replicates + Xt = combine(gdf, [x, :weights, Symbol("replicate_"*string(i))] => ((a, b, c) -> mean(a, weights(b .* c))) => :mean).mean + for i in 1:length(Xt) + if isnan(Xt[i]) + Xt[i] = X.mean[i] # replace lonely psu with point estimate. This needs to be corrected. + end + end + Xt_mat[:, i] = Xt + end + X.SE = sqrt.(sum((Xt_mat .- X.mean).^2 / design.replicates, dims = 2))[:,1] + return X end \ No newline at end of file diff --git a/src/ratio.jl b/src/ratio.jl index 4a75e9aa..8e923d42 100644 --- a/src/ratio.jl +++ b/src/ratio.jl @@ -36,3 +36,7 @@ function ratio(variable_num:: Symbol, variable_den:: Symbol, design::SurveyDesig var = c*(nh-1)/nh return DataFrame(Statistic = statistic, SE = sqrt(var)) end + +# function ratio(x::Symbol, design::ReplicateDesign) +# design.data[!, "ones"] = ones(nrow(design.data)) +# end \ No newline at end of file diff --git a/src/show.jl b/src/show.jl index a912e723..4adb61d8 100644 --- a/src/show.jl +++ b/src/show.jl @@ -65,4 +65,25 @@ function Base.show(io::IO, ::MIME"text/plain", design::SurveyDesign) # printinfo(io, "design.data[!,:strata]", makeshort(design.data[!,:strata])) printinfo(io, "design.data[!,:probs]", makeshort(design.data.probs)) printinfo(io, "design.data[!,:allprobs]", makeshort(design.data.allprobs)) +end + +"Print information about a repliocate design." 
+function Base.show(io::IO, ::MIME"text/plain", design::ReplicateDesign) + type = typeof(design) + printstyled(io, "$type:\n"; bold=true) + printstyled(io, "data: "; bold=true) + println(io, size(design.data, 1), "x", size(design.data, 2), " DataFrame") + printinfo(io, "cluster", string(design.cluster); newline=true) + printinfo(io, "design.data[!,design.cluster]", makeshort(design.data[!,design.cluster])) + printinfo(io, "popsize", string(design.popsize); newline=true) + printinfo(io, "design.data[!,design.popsize]", makeshort(design.data[!,design.popsize])) + printinfo(io, "sampsize", string(design.sampsize); newline=true) + printinfo(io, "design.data[!,design.sampsize]", makeshort(design.data[!,design.sampsize])) + # printinfo(io, "weights", string(design.weights); newline=true) + # printinfo(io, "design.data[!,design.weights]", makeshort(design.data[!,design.weights])) + # printinfo(io, "design.data[!,:strata]", makeshort(design.data[!,:strata])) + printinfo(io, "design.data[!,:probs]", makeshort(design.data.probs)) + printinfo(io, "design.data[!,:allprobs]", makeshort(design.data.allprobs)) + printstyled(io, "replicates: "; bold=true) + println(io, design.replicates) end \ No newline at end of file From 5c9ec3106743f13e28c654d30d2ae5b84d569f07 Mon Sep 17 00:00:00 2001 From: ayushpatnaikgit Date: Sun, 1 Jan 2023 11:22:30 +0530 Subject: [PATCH 03/80] Remove other mean functions. --- src/bootstrap.jl | 2 +- src/mean.jl | 223 +++-------------------------------------------- 2 files changed, 14 insertions(+), 211 deletions(-) diff --git a/src/bootstrap.jl b/src/bootstrap.jl index e454a82c..2662a7a6 100644 --- a/src/bootstrap.jl +++ b/src/bootstrap.jl @@ -68,7 +68,7 @@ function bootweights(design::SurveyDesign; replicates = 100, rng = MersenneTwist return DataFrame(statistic = X, SE = 0) end nh = length(psus) - rh = [(count(==(i), rand(1:(nh-1), nh))) for i in 1:nh] # main bootstrap algo. 
+ rh = [(count(==(i), rand(rng, 1:(nh-1), nh))) for i in 1:nh] # main bootstrap algo. gdf = groupby(substrata, design.cluster) for i in 1:nh gdf[i].rh = repeat([rh[i]], nrow(gdf[i])) diff --git a/src/mean.jl b/src/mean.jl index ff6e78dd..f19ee1f5 100644 --- a/src/mean.jl +++ b/src/mean.jl @@ -1,189 +1,3 @@ -""" - mean(x, design) - -Estimate the population mean of a variable of a simple random sample, and the corresponding standard error. - -The calculations were done according to the book [Sampling Techniques](https://www.academia.edu/29684662/Cochran_1977_Sampling_Techniques_Third_Edition) -by William Cochran. - -For SurveyDesign, formula adapted from Sarndal pg129, section 4.2.2 Simple Random Cluster Sampling - -```jldoctest -julia> apisrs = load_data("apisrs"); - -julia> srs = SimpleRandomSample(apisrs;popsize=:fpc); - -julia> mean(:enroll, srs) -1×2 DataFrame - Row │ mean SE - │ Float64 Float64 -─────┼────────────────── - 1 │ 584.61 27.3684 - -julia> mean([:api00, :api99], srs) -2×3 DataFrame - Row │ names mean SE - │ String Float64 Float64 -─────┼────────────────────────── - 1 │ api00 656.585 9.24972 - 2 │ api99 624.685 9.5003 - -julia> strat = load_data("apistrat"); - -julia> dstrat = StratifiedSample(strat, :stype; popsize = :fpc); - -julia> mean(:api00, dstrat) -1×2 DataFrame - Row │ mean SE - │ Float64 Float64 -─────┼────────────────── - 1 │ 662.287 9.40894 -``` -""" -function mean(x::Symbol, design::SimpleRandomSample) - function se(x::Symbol, design::SimpleRandomSample) - variance = design.fpc * Statistics.var(design.data[!, x]) / design.sampsize - return sqrt(variance) - end - if isa(design.data[!, x], CategoricalArray) - gdf = groupby(design.data, x) - p = combine(gdf, nrow => :counts) - p.mean = p.counts ./ sum(p.counts) - # variance of proportion - p.var = design.fpc .* p.mean .* (1 .- p.mean) ./ (design.sampsize - 1) - p.se = sqrt.(p.var) - return select(p, Not([:counts, :var])) - end - return DataFrame(mean=mean(design.data[!, x]), SE=se(x, 
design)) -end - -function mean(x::Vector{Symbol}, design::SimpleRandomSample) - df = reduce(vcat, [mean(i, design) for i in x]) - insertcols!(df, 1, :names => String.(x)) - return df -end - -function mean(x::Symbol, design::StratifiedSample) - if x == design.strata - gdf = groupby(design.data, x) - p = combine(gdf, :weights => sum => :Nₕ) - p.Wₕ = p.Nₕ ./ sum(p.Nₕ) - p = select!(p, Not(:Nₕ)) - return p - elseif isa(design.data[!, x], CategoricalArray) - gdf = groupby(design.data, x) - p = combine(gdf, nrow => :counts) - p.proportion = p.counts ./ sum(p.counts) - # variance of proportion - p.var = design.fpc .* p.proportion .* (1 .- p.proportion) ./ (design.sampsize - 1) - p.SE = sqrt.(p.var) - return p - end - gdf = groupby(design.data, design.strata) - ȳₕ = combine(gdf, x => mean => :mean).mean - Nₕ = combine(gdf, :weights => sum => :Nₕ).Nₕ - nₕ = combine(gdf, nrow => :nₕ).nₕ - fₕ = nₕ ./ Nₕ - Wₕ = Nₕ ./ sum(Nₕ) - Ȳ̂ = sum(Wₕ .* ȳₕ) - s²ₕ = combine(gdf, x => var => :s²h).s²h - V̂Ȳ̂ = sum((Wₕ .^ 2) .* (1 .- fₕ) .* s²ₕ ./ nₕ) - SE = sqrt(V̂Ȳ̂) - return DataFrame(mean=Ȳ̂, SE=SE) -end - -function mean(x::Symbol, design::SurveyDesign) - ## Based on logical translation of corresponding in total.jl - ## Not quite same from R as it rounds of `total`, so division results in difference - # > svymean(~api00,dclus1) - # mean SE - # api00 644.17 23.542 - gdf = groupby(design.data, design.cluster) - ȳₜ = combine(gdf, x => mean => :mean).mean - Nₜ = first(design.data[!,design.popsize]) - nₜ = first(design.data[!,design.sampsize]) - Ȳ = mean(ȳₜ) - s²ₜ = var(ȳₜ) - VȲ = (1 - nₜ/Nₜ) * s²ₜ / nₜ - return DataFrame(mean = Ȳ, SE = sqrt(VȲ)) -end - -""" - mean(x, by, design) - -Estimate the subpopulation mean of a variable `x`. - -The calculations were done according to the book [Model-Assisted Survey Sampling](https://link.springer.com/book/9780387406206) -by Carl-Erik Sarndal, Bengt Swensson, Jan Wretman, section 3.3 and Chap 10. 
Assumes popsize is known and subpopulation size is unknown. - -```jldoctest -julia> using Survey; - -julia> srs = load_data("apisrs"); - -julia> srs = SimpleRandomSample(srs; popsize = :fpc); - -julia> mean(:api00, :cname, srs) |> first -DataFrameRow - Row │ cname mean SE - │ String15 Float64 Float64 -─────┼──────────────────────────── - 1 │ Kern 573.6 42.8026 - -julia> strat = load_data("apistrat"); - -julia> dstrat = StratifiedSample(strat, :stype; popsize = :fpc); - -julia> mean(:api00, :cname, dstrat) |> first -DataFrameRow - Row │ cname mean SE - │ String15 Float64 Float64 -─────┼─────────────────────────────── - 1 │ Los Angeles 633.511 21.3912 -``` -""" -function mean(x::Symbol, by::Symbol, design::SimpleRandomSample) - function domain_mean(x::AbstractVector, design::SimpleRandomSample, weights) - function se(x::AbstractVector, design::SimpleRandomSample) - nd = length(x) # domain size - n = design.sampsize - fpc = design.fpc - variance = (nd / n)^(-2) / n * fpc * ((nd - 1) / (n - 1)) * var(x) - return sqrt(variance) - end - return DataFrame(mean=Statistics.mean(x), SE=se(x, design)) - end - gdf = groupby(design.data, by) - combine(gdf, [x, :weights] => ((a, b) -> domain_mean(a, design, b)) => AsTable) -end - -function mean(x::Symbol, by::Symbol, design::StratifiedSample) - function domain_mean(x::AbstractVector, popsize::AbstractVector, sampsize::AbstractVector, sampfraction::AbstractVector, strata::AbstractVector) - df = DataFrame(x=x, popsize=popsize, sampsize=sampsize, sampfraction=sampfraction, strata=strata) - function calculate_components(x, popsize, sampsize, sampfraction) - return DataFrame( - nsdh = length(x), - nsh = length(x), - substrata_domain_totals = sum(x), - ȳsdh = mean(x), - Nh = first(popsize), - nh = first(sampsize), - fh = first(sampfraction), - sigma_ȳsh_squares = sum((x .- mean(x)).^2) - ) - end - components = combine(groupby(df, :strata), [:x, :popsize, :sampsize, :sampfraction] => calculate_components => AsTable) - domain_mean = 
sum(components.Nh .* components.substrata_domain_totals ./ components.nh) / sum(components.Nh .* components.nsdh ./ components.nh) - pdh = components.nsdh ./ components.nh - N̂d = sum(components.Nh .* pdh) - domain_var = sum(components.Nh .^ 2 .* (1 .- components.fh) .* (components.sigma_ȳsh_squares .+ (components.nsdh .* (1 .- pdh) .* (components.ȳsdh .- domain_mean) .^ 2)) ./ (components.nh .* (components.nh .- 1))) ./ N̂d .^ 2 - domain_mean_se = sqrt(domain_var) - return DataFrame(mean=domain_mean, SE=domain_mean_se) - end - gdf_domain = groupby(design.data, by) - combine(gdf_domain, [x, :popsize,:sampsize,:sampfraction, design.strata] => domain_mean => AsTable) -end - """ ```jldoctest julia> using Survey, Random, StatsBase; @@ -192,7 +6,9 @@ julia> apiclus1 = load_data("apiclus1"); julia> dclus1 = SurveyDesign(apiclus1, :dnum, :fpc); -julia> mean(:api00, dclus1, Bootstrap(replicates = 1000, rng = MersenneTwister(111))) +julia> bclus1 = bootweights(apiclus1; replicates = 1000) + +julia> mean(:api00, bclus1) 1×2 DataFrame Row │ mean SE │ Float64 Float64 @@ -200,20 +16,6 @@ julia> mean(:api00, dclus1, Bootstrap(replicates = 1000, rng = MersenneTwister(1 1 │ 644.169 23.0897 ``` """ -function mean(x::Symbol, design::SurveyDesign, method::Bootstrap) - weighted_mean(x, w) = mean(x, weights(w)) - df = bootstrap(x, design, weighted_mean; method.replicates, method.rng) - df = rename(df, :statistic => :mean) -end - -# function mean(x::Symbol, by::Symbol, design::SurveyDesign, method::Bootstrap) -# gdf = groupby(design.data, by) -# subdesigns = [SurveyDesign(gdf[i]; strata = design.strata, weights = :weights, clusters = design.cluster) for i in 1:length(gdf)] -# df = vcat([mean(x, subdesign, method) for subdesign in subdesigns]...) 
-# df[!, by] = [first(gdf[i][!, by]) for i in 1:length(gdf)] -# return df -# end - function mean(x::Symbol, design::ReplicateDesign) X = mean(design.data[!, x], weights(design.data.weights)) Xt = [mean(design.data[!, x], weights(design.data.weights .* design.data[! , "replicate_"*string(i)])) for i in 1:design.replicates] @@ -224,17 +26,18 @@ end function mean(x::Symbol, domain::Symbol, design::ReplicateDesign) gdf = groupby(design.data, domain) + nd = length(unique(design.data[!, domain])) X = combine(gdf, [x, :weights] => ((a, b) -> mean(a, weights(b))) => :mean) - Xt_mat = Array{Float64, 2}(undef, (length(unique(design.data[!, domain])), design.replicates)) + Xt_mat = Array{Float64, 2}(undef, (nd, design.replicates)) for i in 1:design.replicates - Xt = combine(gdf, [x, :weights, Symbol("replicate_"*string(i))] => ((a, b, c) -> mean(a, weights(b .* c))) => :mean).mean - for i in 1:length(Xt) - if isnan(Xt[i]) - Xt[i] = X.mean[i] # replace lonely psu with point estimate. This needs to be corrected. 
- end - end - Xt_mat[:, i] = Xt + Xt_mat[:, i] = combine(gdf, [x, :weights, Symbol("replicate_"*string(i))] => ((a, b, c) -> mean(a, weights(b .* c))) => :mean).mean + end + ses = [] + for i in 1:nd + filtered_dx = filter(!isnan, Xt_mat[i, :] .- X.mean[i]) + push!(ses, sqrt(sum(filtered_dx.^2) / length(filtered_dx))) end - X.SE = sqrt.(sum((Xt_mat .- X.mean).^2 / design.replicates, dims = 2))[:,1] + X.SE = ses + # X.SE = sqrt.(sum((Xt_mat .- X.mean).^2 / design.replicates, dims = 2))[:,1] return X end \ No newline at end of file From 62dc3fd8023f5eb1e5805ca38814e83fe127d171 Mon Sep 17 00:00:00 2001 From: ayushpatnaikgit Date: Sun, 1 Jan 2023 17:36:37 +0530 Subject: [PATCH 04/80] Fix bug in bootstrap --- src/bootstrap.jl | 68 ++++++++++++------------------------------------ 1 file changed, 17 insertions(+), 51 deletions(-) diff --git a/src/bootstrap.jl b/src/bootstrap.jl index 2662a7a6..70df8a81 100644 --- a/src/bootstrap.jl +++ b/src/bootstrap.jl @@ -1,63 +1,28 @@ -struct Bootstrap - replicates - rng - function Bootstrap(; replicates = 1000, rng = MersenneTwister(111)) - new(replicates, rng) - end -end - """ ```jldoctest -julia> using Survey, Random, StatsBase; +julia> using Survey, Random; julia> apiclus1 = load_data("apiclus1"); -julia> dclus1 = SurveyDesign(apiclus1, :dnum, :fpc); +julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum); julia> rng = MersenneTwister(111); -julia> func = wsum; - -julia> Survey.bootstrap(:api00, dclus1, func; replicates=1000, rng) -1×2 DataFrame - Row │ statistic SE - │ Float64 Float64 -─────┼────────────────────── - 1 │ 5.94916e6 1.36593e6 - +julia> Survey.bootweights(dclus1; replicates=1000, rng) +Survey.ReplicateDesign: +data: 183x1046 DataFrame +cluster: dnum +design.data[!,design.cluster]: 637, 637, 637, ..., 448 +popsize: popsize +design.data[!,design.popsize]: 183, 183, 183, ..., 183 +sampsize: sampsize +design.data[!,design.sampsize]: 15, 15, 15, ..., 15 +design.data[!,:probs]: 1.0, 1.0, 1.0, ..., 1.0 
+design.data[!,:allprobs]: 1.0, 1.0, 1.0, ..., 1.0 +replicates: 1000 ``` """ -function bootstrap(x::Symbol, design::SurveyDesign, func = wsum; replicates = 100, rng = MersenneTwister(1234)) - X = func(design.data[:, x], design.data.weights) - H = length(unique(design.data[!, design.strata])) - stratified = groupby(design.data, design.strata) - Xt = Array{Float64, 1}(undef, replicates) - for i in 1:replicates - Xh = [] - Wh = [] - for j in 1:H - substrata = DataFrame(stratified[j]) - psus = unique(substrata[!, design.cluster]) - if length(psus) == 1 - return DataFrame(statistic = X, SE = 0) - end - nh = length(psus) - gdf = groupby(substrata, design.cluster) - selected_psus = psus[rand(rng, 1:nh, (nh-1))] # simple random sample of PSUs, with replacement. Select (nh-1) out of nh - xhij = (reduce(vcat, [gdf[(i,)][!, x] for i in selected_psus])) - whij = (reduce(vcat, [gdf[(i,)].weights * (nh / (nh - 1)) for i in selected_psus])) - append!(Xh, xhij) - append!(Wh, whij) - end - Xh = Float64.(Xh) - Wh = Float64.(Wh) - Xt[i] = func(Xh, Wh) - end - variance = sum((Xt .- X).^2) / replicates - return DataFrame(statistic = X, SE = sqrt(variance)) -end - -function bootweights(design::SurveyDesign; replicates = 100, rng = MersenneTwister(1234)) +function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwister(1234)) H = length(unique(design.data[!, design.strata])) stratified = groupby(design.data, design.strata) function replicate(stratified, H) @@ -68,7 +33,8 @@ function bootweights(design::SurveyDesign; replicates = 100, rng = MersenneTwist return DataFrame(statistic = X, SE = 0) end nh = length(psus) - rh = [(count(==(i), rand(rng, 1:(nh-1), nh))) for i in 1:nh] # main bootstrap algo. + randinds = rand(rng, 1:(nh), (nh-1)) # Main bootstrap algo. Draw nh-1 out of nh, with replacement. + rh = [(count(==(i), randinds)) for i in 1:nh] # main bootstrap algo. 
gdf = groupby(substrata, design.cluster) for i in 1:nh gdf[i].rh = repeat([rh[i]], nrow(gdf[i])) From 667193132e0026e563e474218b117c91d6c2afca Mon Sep 17 00:00:00 2001 From: ayushpatnaikgit Date: Sun, 1 Jan 2023 17:36:56 +0530 Subject: [PATCH 05/80] Replace other design with the new general design. --- src/Survey.jl | 4 +- src/SurveyDesign.jl | 364 ++++--------------------------------------- src/boxplot.jl | 2 +- src/by.jl | 17 ++ src/dimnames.jl | 69 -------- src/hist.jl | 4 +- src/mean.jl | 76 +++++++-- src/plot.jl | 2 +- src/quantile.jl | 6 +- src/ratio.jl | 8 +- src/show.jl | 20 --- src/total.jl | 188 +++++----------------- test/SurveyDesign.jl | 197 +---------------------- test/bootstrap.jl | 10 -- test/boxplot.jl | 2 +- test/dimnames.jl | 16 -- test/hist.jl | 2 +- test/jackknife.jl | 6 +- test/mean.jl | 45 +++--- test/plot.jl | 2 +- test/quantile.jl | 20 +-- test/ratio.jl | 2 +- test/runtests.jl | 2 - test/sampling.jl | 3 - test/total.jl | 14 +- 25 files changed, 190 insertions(+), 891 deletions(-) create mode 100644 src/by.jl delete mode 100644 src/dimnames.jl delete mode 100644 test/bootstrap.jl delete mode 100644 test/dimnames.jl delete mode 100644 test/sampling.jl diff --git a/src/Survey.jl b/src/Survey.jl index 8854e1d3..f6d3d030 100644 --- a/src/Survey.jl +++ b/src/Survey.jl @@ -21,10 +21,10 @@ include("total.jl") include("load_data.jl") include("hist.jl") include("plot.jl") -include("dimnames.jl") include("boxplot.jl") include("show.jl") include("ratio.jl") +include("by.jl") export load_data export AbstractSurveyDesign, SimpleRandomSample, StratifiedSample @@ -34,7 +34,7 @@ export mean, total, quantile export plot export hist export boxplot -export Bootstrap +export bootweights export jkknife export ratio diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index acfafa50..1541a3b9 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -10,318 +10,6 @@ Supertype for every survey design type. 
""" abstract type AbstractSurveyDesign end -""" - SimpleRandomSample <: AbstractSurveyDesign - - -Survey design sampled by simple random sampling. - -# Arguments: -`data::AbstractDataFrame`: the survey dataset (!this gets modified by the constructor). -`sampsize::Union{Nothing,Symbol,<:Unsigned,Vector{<:Real}}=UInt(nrow(data))`: the survey sample size. -`popsize::Union{Nothing,Symbol,<:Unsigned,Vector{<:Real}}=nothing`: the (expected) survey population size. -`weights::Union{Nothing,Symbol,Vector{<:Real}}=nothing`: the sampling weights. -`probs::Union{Nothing,Symbol,Vector{<:Real}}=nothing: the sampling probabilities. -`ignorefpc::Bool=false`: choose to ignore finite population correction and assume all weights equal to 1.0 - -The precedence order of using `popsize`, `weights` and `probs` is `popsize` > `weights` > `probs`. -E.g. If `popsize` is given then it is assumed to be the ground truth over `weights` or `probs`. - -If `popsize` is not given `weights` or `probs` must be given. `popsize` is then calculated -using the weights and the sample size. 
- -```jldoctest -julia> apisrs = load_data("apisrs"); - -julia> srs = SimpleRandomSample(apisrs; popsize=:fpc) -SimpleRandomSample: -data: 200x42 DataFrame -weights: 31.0, 31.0, 31.0, ..., 31.0 -probs: 0.0323, 0.0323, 0.0323, ..., 0.0323 -fpc: 6194, 6194, 6194, ..., 6194 -popsize: 6194 -sampsize: 200 -sampfraction: 0.0323 -ignorefpc: false -``` -""" -struct SimpleRandomSample <: AbstractSurveyDesign - data::AbstractDataFrame - sampsize::Union{Unsigned,Nothing} - popsize::Union{Unsigned,Nothing} - sampfraction::Float64 - fpc::Float64 - ignorefpc::Bool - function SimpleRandomSample(data::AbstractDataFrame; - popsize::Union{Nothing,Symbol,Unsigned,Vector{<:Real}}=nothing, - sampsize::Union{Nothing,Symbol,Unsigned,Vector{<:Real}}=nrow(data) |> UInt, - weights::Union{Nothing,Symbol,Vector{<:Real}}=nothing, - probs::Union{Nothing,Symbol,Vector{<:Real}}=nothing, - ignorefpc::Bool=false - ) - # If any of weights or probs given as Symbol, - # find the corresponding column in `data` - if isa(weights, Symbol) - weights = data[!, weights] - end - if isa(probs, Symbol) - probs = data[!, probs] - end - # If weights/probs vector not numeric/real, ie. string column passed for weights, then raise error - if !isa(weights, Union{Nothing,Vector{<:Real}}) - error("weights should be Vector{<:Real}. You passed $(typeof(weights))") - elseif !isa(probs, Union{Nothing,Vector{<:Real}}) - error("sampling probabilities should be Vector{<:Real}. 
You passed $(typeof(probs))") - end - # If popsize given as Symbol or Vector, check all records equal - if isa(popsize, Symbol) - if !all(w -> w == first(data[!, popsize]), data[!, popsize]) - error("popsize must be same for all observations in Simple Random Sample") - end - popsize = first(data[!, popsize]) |> UInt - elseif isa(popsize, Vector{<:Real}) - if !all(w -> w == first(popsize), popsize) - error("popsize must be same for all observations in Simple Random Sample") - end - popsize = first(popsize) |> UInt - end - # If sampsize given as Symbol or Vector, check all records equal - if isa(sampsize, Symbol) - if !all(w -> w == first(data[!, sampsize]), data[!, sampsize]) - error("sampsize must be same for all observations in Simple Random Sample") - end - sampsize = first(data[!, sampsize]) |> UInt - elseif isa(sampsize, Vector{<:Real}) - if !all(w -> w == first(sampsize), sampsize) - error("sampsize must be same for all observations in Simple Random Sample") - end - sampsize = first(sampsize) |> UInt - end - # If both `weights` and `probs` given, then `weights` is assumed to be ground truth for probs. - if !isnothing(weights) && !isnothing(probs) - probs = 1 ./ weights - data[!, :probs] = probs - end - # popsize must be nothing or <:Unsigned by now - if isnothing(popsize) - # If popsize not given, fallback to weights, probs and sampsize to estimate `popsize` - @warn "popsize not given. 
using weights/probs and sampsize to estimate `popsize`" - # Check that all weights (or probs if weights not given) are equal, as SRS is by definition equi-weighted - if typeof(weights) <: Vector{<:Real} - if !all(w -> w == first(weights), weights) - error("all frequency weights must be equal for Simple Random Sample") - end - elseif typeof(probs) <: Vector{<:Real} - if !all(p -> p == first(probs), probs) - error("all probability weights must be equal for Simple Random Sample") - end - weights = 1 ./ probs - else - error("either weights or probs must be given if `popsize` not given") - end - # Estimate population size - popsize = round(sampsize * first(weights)) |> UInt - elseif typeof(popsize) <: Unsigned - weights = fill(popsize / sampsize, nrow(data)) # If popsize is given, weights vector is made concordant with popsize and sampsize, regardless of given weights argument - probs = 1 ./ weights - else - error("something went wrong, please check validity of inputs.") - end - # If sampsize greater than popsize than illogical arguments specified. - if sampsize > popsize - error("population size was estimated to be less than given sampsize. Please check input arguments.") - end - # If ignorefpc then set weights to 1 ?? 
- # TODO: This works under some cases, but should find better way to process ignoring fpc - if ignorefpc - @warn "assuming all weights are equal to 1.0" - weights = ones(nrow(data)) - probs = 1 ./ weights - end - # sum of weights must equal to `popsize` for SRS - if !isnothing(weights) && !(isapprox(sum(weights), popsize; atol=1e-4)) - if ignorefpc && !(isapprox(sum(weights), sampsize; atol=1e-4)) # Change if ignorefpc functionality changes - error("sum of sampling weights should be equal to `sampsize` for `SimpleRandomSample` with `ignorefpc`") - elseif !ignorefpc - error("sum of sampling weights must be equal to `popsize` for `SimpleRandomSample`") - end - end - # sum of probs must equal popsize for SRS - if !isnothing(probs) && !(isapprox(sum(1 ./ probs), popsize; atol=1e-4)) - if ignorefpc && !(isapprox(sum(1 ./ probs), sampsize; atol=1e-4)) # Change if ignorefpc functionality changes - error("sum of inverse sampling probabilities should be equal to `sampsize` for `SimpleRandomSample` with `ignorefpc`") - elseif !ignorefpc - error("sum of inverse of sampling probabilities must be equal to `popsize` for Simple Random Sample") - end - end - ## Set remaining parts of data structure - # set sampling fraction - sampfraction = sampsize / popsize - # set fpc - fpc = ignorefpc ? 1 : 1 - (sampsize / popsize) - # add columns for frequency and probability weights to `data` - data[!, :weights] = weights - if isnothing(probs) - probs = 1 ./ data[!, :weights] - end - data[!, :probs] = probs - # Initialise the structure - new(data, sampsize, popsize, sampfraction, fpc, ignorefpc) - end -end - -""" - StratifiedSample <: AbstractSurveyDesign - -Survey design sampled by stratification. - -`strata` must be specified as a Symbol name of a column in `data`. - -# Arguments: -`data::AbstractDataFrame`: the survey dataset (!this gets modified by the constructor). -`strata::Symbol`: the stratification variable - must be given as a column in `data`. 
-`sampsize::Union{Nothing,Symbol,<:Unsigned,Vector{<:Real}}=UInt(nrow(data))`: the survey sample size. -`popsize::Union{Nothing,Symbol,<:Unsigned,Vector{<:Real}}=nothing`: the (expected) survey population size. -`weights::Union{Nothing,Symbol,Vector{<:Real}}=nothing`: the sampling weights. -`probs::Union{Nothing,Symbol,Vector{<:Real}}=nothing: the sampling probabilities. -`ignorefpc::Bool=false`: choose to ignore finite population correction and assume all weights equal to 1.0 - -The `popsize`, `weights` and `probs` parameters follow the same rules as for [`SimpleRandomSample`](@ref). - -```jldoctest -julia> apistrat = load_data("apistrat"); - -julia> dstrat = StratifiedSample(apistrat, :stype; popsize=:fpc) -StratifiedSample: -data: 200x45 DataFrame -strata: stype -weights: 44.2, 44.2, 44.2, ..., 15.1 -probs: 0.0226, 0.0226, 0.0226, ..., 0.0662 -fpc: 0.977, 0.977, 0.977, ..., 0.934 -popsize: 4421, 4421, 4421, ..., 755 -sampsize: 100, 100, 100, ..., 50 -sampfraction: 0.0226, 0.0226, 0.0226, ..., 0.0662 -ignorefpc: false -``` -""" -struct StratifiedSample <: AbstractSurveyDesign - data::AbstractDataFrame - strata::Symbol - ignorefpc::Bool - function StratifiedSample(data::AbstractDataFrame, strata::Symbol; - popsize::Union{Nothing,Symbol}=nothing, - sampsize::Union{Nothing,Symbol}=nothing, - weights::Union{Nothing,Symbol,Vector{<:Real}}=nothing, - probs::Union{Nothing,Symbol,Vector{<:Real}}=nothing, - ignorefpc::Bool=false - ) - # Store the iterator over each strata, as used multiple times - data_groupedby_strata = groupby(data, strata) - # If any of weights or probs given as Symbol, find the corresponding column in `data` - if isa(weights, Symbol) - for each_strata in keys(data_groupedby_strata) - if !all(w -> w == first(data_groupedby_strata[each_strata][!, weights]), data_groupedby_strata[each_strata][!, weights]) - error("sampling weights within each strata must be equal in StratifiedSample") - end - end - # original_weights_colname = copy(weights) - weights = 
data[!, weights] # If all good with weights column, then store it as Vector - end - if isa(probs, Symbol) - for each_strata in keys(data_groupedby_strata) - if !all(p -> p == first(data_groupedby_strata[each_strata][!, probs]), data_groupedby_strata[each_strata][!, probs]) - error("sampling probabilities within each strata must be equal in StratifiedSample") - end - end - # original_probs_colname = copy(probs) - probs = data[!, probs] # If all good with probs column, then store it as Vector - end - # If weights/probs vector not numeric/real, ie. string column passed for weights, then raise error - if !isa(weights, Union{Nothing,Vector{<:Real}}) - error("weights should be Vector{<:Real}. You passed $(typeof(weights))") - elseif !isa(probs, Union{Nothing,Vector{<:Real}}) - error("sampling probabilities should be Vector{<:Real}. You passed $(typeof(probs))") - end - # If popsize given as Symbol or Vector, check all records equal in each strata - if isa(popsize, Symbol) - for each_strata in keys(data_groupedby_strata) - if !all(w -> w == first(data_groupedby_strata[each_strata][!, popsize]), data_groupedby_strata[each_strata][!, popsize]) - error("popsize must be same for all observations within each strata in StratifiedSample") - end - end - # original_popsize_colname = copy(popsize) - popsize = data[!, popsize] - end - # If sampsize given as Symbol or Vector, check all records equal - if isa(sampsize, Symbol) - if isnothing(popsize) && isnothing(weights) && isnothing(probs) - error("if sampsize given, and popsize not given, then weights or probs must given to calculate popsize") - end - for each_strata in keys(data_groupedby_strata) - if !all(w -> w == first(data_groupedby_strata[each_strata][!, sampsize]), data_groupedby_strata[each_strata][!, sampsize]) - error("sampsize must be same for all observations within each strata in StratifiedSample") - end - end - # original_sampsize_colname = copy(sampsize) - sampsize = data[!, sampsize] - # If sampsize column not 
provided in constructor call, set it as nrow of strata - elseif isnothing(sampsize) - sampsize = transform(groupby(data, strata), nrow => :counts).counts - end - # If both `weights` and `probs` given, then `weights` is assumed to be ground truth for probs. - if !isnothing(weights) && !isnothing(probs) - probs = 1 ./ weights - data[!, :probs] = probs - end - # `popsize` is either nothing or a Vector{<:Real} by now - if isnothing(popsize) - # If popsize not given, fallback to weights, probs and sampsize to estimate `popsize` - @warn "popsize not given. using weights/probs and sampsize to estimate `popsize` for StratifiedSample" - # Check that all weights (or probs if weights not given) are equal, as SRS is by definition equi-weighted - if typeof(probs) <: Vector{<:Real} - weights = 1 ./ probs - elseif !(typeof(weights) <: Vector{<:Real}) - error("either weights or probs must be given if `popsize` not given") - end - # Estimate population size - popsize = sampsize .* weights - elseif typeof(popsize) <: Vector{<:Real} # Still need to check if the provided Column is of <:Real - # If popsize is given, weights and probs made concordant with popsize and sampsize, regardless of supplied arguments - weights = popsize ./ sampsize - probs = 1 ./ weights - else - error("something went wrong. Please check validity of inputs.") - end - # If sampsize greater than popsize than illogical arguments specified. - if any(sampsize .> popsize) - error("population sizes were estimated to be less than sampsize. please check input arguments.") - end - # If ignorefpc then set weights to 1 ?? - # TODO: This works under some cases, but should find better way to process ignoring fpc - if ignorefpc - @warn "assuming all weights are equal to 1.0" - weights = ones(nrow(data)) - probs = 1 ./ weights - end - ## Set remaining parts of data structure - # set sampling fraction - sampfraction = sampsize ./ popsize - # set fpc - fpc = ignorefpc ? 
fill(1, size(data, 1)) : 1 .- (sampsize ./ popsize) - # add columns for frequency and probability weights to `data` - data[!, :weights] = weights - if isnothing(probs) - probs = 1 ./ data[!, :weights] - end - data[!, :probs] = probs - data[!, :sampsize] = sampsize - data[!, :popsize] = popsize - data[!, :fpc] = fpc - data[!, :sampfraction] = sampfraction - new(data, strata, ignorefpc) - end -end - """ SurveyDesign <: AbstractSurveyDesign @@ -343,37 +31,15 @@ julia> apiclus1 = load_data("apiclus1"); julia> apiclus1[!, :pw] = fill(757/15,(size(apiclus1,1),)); # Correct api mistake for pw column -julia> dclus1 = SurveyDesign(apiclus1, :dnum, :fpc) -SurveyDesign: -data: 183x45 DataFrame -clusters: dnum -design.data[!,design.clusters]: 637, 637, 637, ..., 448 -popsize: fpc -design.data[!,design.popsize]: 757, 757, 757, ..., 757 -sampsize: sampsize -design.data[!,design.sampsize]: 15, 15, 15, ..., 15 -weights: weights -design.data[!,design.weights]: 50.5, 50.5, 50.5, ..., 50.5 -design.data[!,:strata]: 1.0, 1.0, 1.0, ..., 1.0 -design.data[!,:probs]: 0.0198, 0.0198, 0.0198, ..., 0.0198 -design.data[!,:allprobs]: 0.0198, 0.0198, 0.0198, ..., 0.0198 - -julia> apiclus1 = load_data("apiclus1"); - -julia> apiclus1[!, :pw] = fill(757/15,(size(apiclus1,1),)); # Correct api mistake for pw column - -julia> dclus1 = SurveyDesign(apiclus1, :dnum; weights=:pw) +julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) SurveyDesign: data: 183x46 DataFrame -clusters: dnum -design.data[!,design.clusters]: 637, 637, 637, ..., 448 +cluster: dnum +design.data[!,design.cluster]: 637, 637, 637, ..., 448 popsize: popsize -design.data[!,design.popsize]: 757.0, 757.0, 757.0, ..., 757.0 +design.data[!,design.popsize]: 9240.0, 9240.0, 9240.0, ..., 9240.0 sampsize: sampsize design.data[!,design.sampsize]: 15, 15, 15, ..., 15 -weights: pw -design.data[!,design.weights]: 50.5, 50.5, 50.5, ..., 50.5 -design.data[!,:strata]: 1.0, 1.0, 1.0, ..., 1.0 design.data[!,:probs]: 0.0198, 0.0198, 
0.0198, ..., 0.0198 design.data[!,:allprobs]: 0.0198, 0.0198, 0.0198, ..., 0.0198 ``` @@ -424,6 +90,28 @@ struct SurveyDesign <: AbstractSurveyDesign end end +""" +```jldoctest +julia> apiclus1 = load_data("apiclus1"); + +julia> apiclus1[!, :pw] = fill(757/15,(size(apiclus1,1),)); # Correct api mistake for pw column + +julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw); + +julia> bclus1 = Survey.bootweights(dclus1; replicates = 1000) +Survey.ReplicateDesign: +data: 183x1046 DataFrame +cluster: dnum +design.data[!,design.cluster]: 637, 637, 637, ..., 448 +popsize: popsize +design.data[!,design.popsize]: 9240.0, 9240.0, 9240.0, ..., 9240.0 +sampsize: sampsize +design.data[!,design.sampsize]: 15, 15, 15, ..., 15 +design.data[!,:probs]: 0.0198, 0.0198, 0.0198, ..., 0.0198 +design.data[!,:allprobs]: 0.0198, 0.0198, 0.0198, ..., 0.0198 +replicates: 1000 +``` +""" struct ReplicateDesign <: AbstractSurveyDesign data::AbstractDataFrame cluster::Symbol diff --git a/src/boxplot.jl b/src/boxplot.jl index 46f6958a..8790f116 100644 --- a/src/boxplot.jl +++ b/src/boxplot.jl @@ -10,7 +10,7 @@ The keyword arguments are all the arguments that can be passed to `mapping` in ```@example boxplot apisrs = load_data("apisrs"); -srs = SimpleRandomSample(apisrs; weights = :pw); +srs = srs = SurveyDesign(apisrs; weights=:pw); bp = boxplot(srs, :stype, :enroll; weights = :pw) save("boxplot.png", bp); nothing # hide ``` diff --git a/src/by.jl b/src/by.jl new file mode 100644 index 00000000..30cb2dd2 --- /dev/null +++ b/src/by.jl @@ -0,0 +1,17 @@ +function bydomain(x::Symbol, domain::Symbol, design::ReplicateDesign, func::Function) + gdf = groupby(design.data, domain) + nd = length(unique(design.data[!, domain])) + X = combine(gdf, [x, :weights] => ((a, b) -> func(a, weights(b))) => :statistic) + Xt_mat = Array{Float64, 2}(undef, (nd, design.replicates)) + for i in 1:design.replicates + Xt_mat[:, i] = combine(gdf, [x, :weights, Symbol("replicate_"*string(i))] => ((a, b, c) 
-> func(a, weights(b .* c))) => :statistic).statistic + end + ses = [] + for i in 1:nd + filtered_dx = filter(!isnan, Xt_mat[i, :] .- X.statistic[i]) + push!(ses, sqrt(sum(filtered_dx.^2) / length(filtered_dx))) + end + replace!(ses, NaN => 0) + X.SE = ses + return X +end \ No newline at end of file diff --git a/src/dimnames.jl b/src/dimnames.jl deleted file mode 100644 index 91bd473e..00000000 --- a/src/dimnames.jl +++ /dev/null @@ -1,69 +0,0 @@ -""" - dim(design) - -Get the dimensions of a `SurveyDesign`. - -```jldoctest -julia> apisrs = load_data("apisrs"); - -julia> srs = SimpleRandomSample(apisrs; popsize =:fpc); - -julia> dim(srs) -(200, 42) -``` -""" -dim(design::AbstractSurveyDesign) = size(design.data) - -""" - colnames(design) - -Get the column names of a `SurveyDesign`. - -```jldoctest -julia> apisrs = load_data("apisrs"); - -julia> srs = SimpleRandomSample(apisrs; popsize=:fpc); - -julia> colnames(srs) -42-element Vector{String}: - "Column1" - "cds" - "stype" - "name" - "sname" - "snum" - "dname" - "dnum" - "cname" - "cnum" - ⋮ - "avg.ed" - "full" - "emer" - "enroll" - "api.stu" - "pw" - "fpc" - "weights" - "probs" -``` -""" -colnames(design::AbstractSurveyDesign) = names(design.data) - -""" - dimnames(design) - -Get the names of the rows and columns of a `SurveyDesign`. 
- -```jldoctest -julia> apisrs = load_data("apisrs"); - -julia> srs = SimpleRandomSample(apisrs;popsize=:fpc); - -julia> dimnames(srs) -2-element Vector{Vector{String}}: - ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10" … "191", "192", "193", "194", "195", "196", "197", "198", "199", "200"] - ["Column1", "cds", "stype", "name", "sname", "snum", "dname", "dnum", "cname", "cnum" … "grad.sch", "avg.ed", "full", "emer", "enroll", "api.stu", "pw", "fpc", "weights", "probs"] -``` -""" -dimnames(design::AbstractSurveyDesign) = [string.(1:size(design.data, 1)), names(design.data)] diff --git a/src/hist.jl b/src/hist.jl index c140e59e..90d42d1b 100644 --- a/src/hist.jl +++ b/src/hist.jl @@ -11,7 +11,7 @@ Calculate the number of bins to use in a histogram using the Sturges rule. ```jldoctest julia> apisrs = load_data("apisrs"); -julia> srs = SimpleRandomSample(apisrs;popsize=:fpc); +julia> srs = SurveyDesign(apisrs; weights=:pw); julia> sturges(srs, :enroll) 9 @@ -31,7 +31,7 @@ Calculate the number of bins to use in a histogram using the Freedman-Diaconis r ```jldoctest julia> apisrs = load_data("apisrs"); -julia> srs = SimpleRandomSample(apisrs;popsize=:fpc); +julia> srs = SurveyDesign(apisrs; weights=:pw); julia> freedman_diaconis(srs, :enroll) 18 diff --git a/src/mean.jl b/src/mean.jl index f19ee1f5..5dc679f3 100644 --- a/src/mean.jl +++ b/src/mean.jl @@ -22,22 +22,68 @@ function mean(x::Symbol, design::ReplicateDesign) variance = sum((Xt .- X).^2) / design.replicates DataFrame(mean = X, SE = sqrt(variance)) end +""" +```jldoctest +julia> using Survey, Random, StatsBase; + +julia> apiclus1 = load_data("apiclus1"); + +julia> dclus1 = SurveyDesign(apiclus1, :dnum, :fpc); +julia> bclus1 = bootweights(apiclus1; replicates = 1000) +julia> mean(:api00, :cname, bclus1) |> print +38×3 DataFrame + Row │ cname statistic SE + │ String15 Float64 Any +─────┼───────────────────────────────────────── + 1 │ Kern 573.6 44.5578 + 2 │ Los Angeles 658.156 22.2058 + 3 │ Orange 749.333 
29.5701 + 4 │ San Luis Obispo 739.0 3.37273e-14 + 5 │ San Francisco 558.333 45.6266 + 6 │ Modoc 671.0 0.0 + 7 │ Alameda 676.091 37.3104 + 8 │ Solano 623.0 45.1222 + 9 │ Santa Cruz 624.333 113.43 + 10 │ Monterey 605.0 85.4116 + 11 │ San Bernardino 614.462 30.0066 + 12 │ Riverside 574.3 27.2025 + 13 │ Tulare 664.0 22.0097 + 14 │ San Diego 684.5 32.2241 + 15 │ Sacramento 616.0 39.7877 + 16 │ Marin 799.667 35.2397 + 17 │ Imperial 622.0 0.0 + 18 │ Ventura 743.8 31.7425 + 19 │ San Joaquin 608.667 40.8592 + 20 │ Sonoma 630.0 0.0 + 21 │ Fresno 600.25 56.9173 + 22 │ Santa Clara 718.286 58.562 + 23 │ Sutter 744.0 0.0 + 24 │ Contra Costa 766.111 53.598 + 25 │ Stanislaus 736.333 5.26576 + 26 │ Madera 480.0 3.5861 + 27 │ Placer 759.0 0.0 + 28 │ Lassen 752.0 0.0 + 29 │ Santa Barbara 728.667 25.8749 + 30 │ San Mateo 617.0 78.1173 + 31 │ Siskiyou 699.0 0.0 + 32 │ Kings 469.5 44.6284 + 33 │ Shasta 754.0 60.5829 + 34 │ Yolo 475.0 0.0 + 35 │ Calaveras 790.0 0.0 + 36 │ Napa 727.0 50.5542 + 37 │ Lake 804.0 0.0 + 38 │ Merced 595.0 0 +``` +""" function mean(x::Symbol, domain::Symbol, design::ReplicateDesign) - gdf = groupby(design.data, domain) - nd = length(unique(design.data[!, domain])) - X = combine(gdf, [x, :weights] => ((a, b) -> mean(a, weights(b))) => :mean) - Xt_mat = Array{Float64, 2}(undef, (nd, design.replicates)) - for i in 1:design.replicates - Xt_mat[:, i] = combine(gdf, [x, :weights, Symbol("replicate_"*string(i))] => ((a, b, c) -> mean(a, weights(b .* c))) => :mean).mean - end - ses = [] - for i in 1:nd - filtered_dx = filter(!isnan, Xt_mat[i, :] .- X.mean[i]) - push!(ses, sqrt(sum(filtered_dx.^2) / length(filtered_dx))) - end - X.SE = ses - # X.SE = sqrt.(sum((Xt_mat .- X.mean).^2 / design.replicates, dims = 2))[:,1] - return X + weighted_mean(x, w) = mean(x, StatsBase.weights(w)) + bydomain(x, domain, design, weighted_mean) +end + +function mean(x::Vector{Symbol}, design::ReplicateDesign) + df = reduce(vcat, [mean(i, design) for i in x]) + insertcols!(df, 1, :names => 
String.(x)) + return df end \ No newline at end of file diff --git a/src/plot.jl b/src/plot.jl index cb9792d1..7dd4f555 100644 --- a/src/plot.jl +++ b/src/plot.jl @@ -8,7 +8,7 @@ in the design. ```@example plot apisrs = load_data("apisrs"); -srs = SimpleRandomSample(apisrs; weights = :pw); +srs = SurveyDesign(apisrs; weights=:pw); s = plot(srs, :api99, :api00) save("scatter.png", s); nothing # hide ``` diff --git a/src/quantile.jl b/src/quantile.jl index 8ee6000c..1cc9646c 100644 --- a/src/quantile.jl +++ b/src/quantile.jl @@ -13,7 +13,7 @@ The Julia, R and Python-numpy use the same defaults ```jldoctest julia> apisrs = load_data("apisrs"); -julia> srs = SimpleRandomSample(apisrs;popsize=:fpc); +julia> srs = SurveyDesign(apisrs; weights=:pw); julia> quantile(:api00,srs,0.5) 1×2 DataFrame @@ -34,7 +34,7 @@ julia> quantile(:enroll,srs,[0.1,0.2,0.5,0.75,0.95]) 5 │ 0.95 1473.1 ``` """ -function quantile(var::Symbol, design::SimpleRandomSample, p::Union{<:Real,Vector{<:Real}}; +function quantile(var::Symbol, design::SurveyDesign, p::Union{<:Real,Vector{<:Real}}; alpha::Float64=0.05, ci::Bool=false, se::Bool=false, qrule="hf7",kwargs...) v = design.data[!, var] probs = design.data[!, :probs] @@ -43,7 +43,7 @@ function quantile(var::Symbol, design::SimpleRandomSample, p::Union{<:Real,Vecto return df end -function quantile(var::Symbol, design::StratifiedSample, p::Union{<:Real,Vector{<:Real}}; +function quantile(var::Symbol, design::SurveyDesign, p::Union{<:Real,Vector{<:Real}}; alpha::Float64=0.05, ci::Bool=false, se::Bool=false, qrule="hf7",kwargs...) 
v = design.data[!, var] probs = design.data[!, :probs] diff --git a/src/ratio.jl b/src/ratio.jl index 8e923d42..67e51668 100644 --- a/src/ratio.jl +++ b/src/ratio.jl @@ -9,7 +9,7 @@ julia> apiclus1 = load_data("apiclus1"); julia> apiclus1[!, :pw] = fill(757/15,(size(apiclus1,1),)); # Correct api mistake for pw column -julia> dclus1 = SurveyDesign(apiclus1, :dnum, :fpc); +julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw); julia> ratio(:api00, :enroll, dclus1) 1×2 DataFrame @@ -35,8 +35,4 @@ function ratio(variable_num:: Symbol, variable_den:: Symbol, design::SurveyDesig end var = c*(nh-1)/nh return DataFrame(Statistic = statistic, SE = sqrt(var)) -end - -# function ratio(x::Symbol, design::ReplicateDesign) -# design.data[!, "ones"] = ones(nrow(design.data)) -# end \ No newline at end of file +end \ No newline at end of file diff --git a/src/show.jl b/src/show.jl index 4adb61d8..3319e653 100644 --- a/src/show.jl +++ b/src/show.jl @@ -33,20 +33,6 @@ function Base.show(io::IO, ::MIME"text/plain", design::AbstractSurveyDesign) printinfo(io, "ignorefpc", string(design.ignorefpc); newline=false) end -function Base.show(io::IO, ::MIME"text/plain", design::StratifiedSample) - type = typeof(design) - printstyled(io, "$type:\n"; bold=true) - printstyled(io, "data: "; bold=true) - println(io, size(design.data, 1), "x", size(design.data, 2), " DataFrame") - printinfo(io, "strata", string(design.strata); newline=true) - printinfo(io, "weights", makeshort(design.data.weights)) - printinfo(io, "probs", makeshort(design.data.probs)) - printinfo(io, "fpc", makeshort(design.data.fpc)) - printinfo(io, "popsize", makeshort(design.data.popsize)) - printinfo(io, "sampsize", makeshort(design.data.sampsize)) - printinfo(io, "sampfraction", makeshort(design.data.sampfraction)) - printinfo(io, "ignorefpc", string(design.ignorefpc); newline=false) -end "Print information about a survey design." 
function Base.show(io::IO, ::MIME"text/plain", design::SurveyDesign) @@ -60,9 +46,6 @@ function Base.show(io::IO, ::MIME"text/plain", design::SurveyDesign) printinfo(io, "design.data[!,design.popsize]", makeshort(design.data[!,design.popsize])) printinfo(io, "sampsize", string(design.sampsize); newline=true) printinfo(io, "design.data[!,design.sampsize]", makeshort(design.data[!,design.sampsize])) - # printinfo(io, "weights", string(design.weights); newline=true) - # printinfo(io, "design.data[!,design.weights]", makeshort(design.data[!,design.weights])) - # printinfo(io, "design.data[!,:strata]", makeshort(design.data[!,:strata])) printinfo(io, "design.data[!,:probs]", makeshort(design.data.probs)) printinfo(io, "design.data[!,:allprobs]", makeshort(design.data.allprobs)) end @@ -79,9 +62,6 @@ function Base.show(io::IO, ::MIME"text/plain", design::ReplicateDesign) printinfo(io, "design.data[!,design.popsize]", makeshort(design.data[!,design.popsize])) printinfo(io, "sampsize", string(design.sampsize); newline=true) printinfo(io, "design.data[!,design.sampsize]", makeshort(design.data[!,design.sampsize])) - # printinfo(io, "weights", string(design.weights); newline=true) - # printinfo(io, "design.data[!,design.weights]", makeshort(design.data[!,design.weights])) - # printinfo(io, "design.data[!,:strata]", makeshort(design.data[!,:strata])) printinfo(io, "design.data[!,:probs]", makeshort(design.data.probs)) printinfo(io, "design.data[!,:allprobs]", makeshort(design.data.allprobs)) printstyled(io, "replicates: "; bold=true) diff --git a/src/total.jl b/src/total.jl index f9824255..d298d62d 100644 --- a/src/total.jl +++ b/src/total.jl @@ -1,173 +1,55 @@ """ - total(x, design) - -Estimate the population total for the variable specified by `x`. 
- -For SurveyDesign, formula adapted from Sarndal pg129, section 4.2.2 Simple Random Cluster Sampling - ```jldoctest -julia> using Survey; - -julia> apisrs = load_data("apisrs"); - -julia> srs = SimpleRandomSample(apisrs; popsize=:fpc); - -julia> total(:enroll, srs) -1×2 DataFrame - Row │ total SE - │ Float64 Float64 -─────┼───────────────────── - 1 │ 3.62107e6 1.6952e5 - -julia> strat = load_data("apistrat"); - -julia> dstrat = StratifiedSample(strat, :stype; popsize=:fpc); - -julia> total(:api00, dstrat) -1×2 DataFrame - Row │ total SE - │ Float64 Float64 -─────┼──────────────────── - 1 │ 4.10221e6 58279.0 - -julia> total([:api00, :enroll], dstrat) -2×3 DataFrame - Row │ names total SE - │ String Float64 Float64 -─────┼────────────────────────────────── - 1 │ api00 4.10221e6 58279.0 - 2 │ enroll 3.68718e6 1.14642e5 -``` -""" -function total(x::Symbol, design::SimpleRandomSample) - function se(x::Symbol, design::SimpleRandomSample) - function variance(x::Symbol, design::SimpleRandomSample) - return design.popsize^2 * design.fpc * var(design.data[!, x]) / design.sampsize - end - return sqrt(variance(x, design)) - end - if isa(design.data[!, x], CategoricalArray) - gdf = groupby(design.data, x) - p = combine(gdf, nrow => :count) - p.total = design.popsize .* p.count ./ sum(p.count) - p.proportion = p.total ./ design.popsize - p = select!(p, Not(:count)) # count column is not necessary for `total` - p.var = design.popsize^2 .* design.fpc .* p.proportion .* - (1 .- p.proportion) ./ (design.sampsize - 1) # N^2 .* variance of proportion - p.SE = sqrt.(p.var) - return select(p, Not([:proportion, :var])) - end - m = mean(x,design) - total = design.popsize * m.mean[1] - return DataFrame(total=total, SE=se(x, design)) -end - -function total(x::Symbol, design::StratifiedSample) - # TODO: check if statement - if x == design.strata - gdf = groupby(design.data, x) - return combine(gdf, :weights => sum => :Nₕ) - end - gdf = groupby(design.data, design.strata) - grand_total = 
sum(combine(gdf, [x, :weights] => ((a, b) -> wsum(a, b)) => :total).total) - # variance estimation using closed-form formula - Nₕ = combine(gdf, :weights => sum => :Nₕ).Nₕ - nₕ = combine(gdf, nrow => :nₕ).nₕ - fₕ = nₕ ./ Nₕ - - s²ₕ = combine(gdf, x => var => :s²h).s²h - # the only difference between total and mean variance is the Nₕ instead of Wₕ - V̂Ȳ̂ = sum((Nₕ .^ 2) .* (1 .- fₕ) .* s²ₕ ./ nₕ) - SE = sqrt(V̂Ȳ̂) - return DataFrame(total=grand_total, SE=SE) -end - -function total(x::Vector{Symbol}, design::AbstractSurveyDesign) - df = reduce(vcat, [total(i, design) for i in x]) - insertcols!(df, 1, :names => String.(x)) - return df -end - -""" -```jldoctest -julia> using Survey +julia> using Survey; julia> apiclus1 = load_data("apiclus1"); -julia> dclus1 = SurveyDesign(apiclus1, :dnum, :fpc); +julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw); + +julia> bclus1 = bootweights(dclus1; replicates = 1000); -julia> total(:api00, dclus1) +julia> total(:api00, bclus1) 1×2 DataFrame - Row │ total SE + Row │ mean SE │ Float64 Float64 ─────┼────────────────────── - 1 │ 5.94916e6 1.33948e6 -``` -""" -function total(x::Symbol, design::SurveyDesign) - gdf = groupby(design.data, design.cluster) - ŷₜ = combine(gdf, x => sum => :sum).sum - Nₜ = first(design.data[!,design.popsize]) - Ȳ = Nₜ * mean(ŷₜ) - nₜ = first(design.data[!,design.sampsize]) - s²ₜ = var(ŷₜ) - VȲ = Nₜ^2 * (1 - nₜ/Nₜ) * s²ₜ / nₜ - return DataFrame(total = Ȳ, SE = sqrt(VȲ)) -end - -""" - total(x, by, design) - -Estimate the subpopulation total of a variable `x`. 
- -```jldoctest -julia> using Survey; - -julia> apisrs = load_data("apisrs"); - -julia> srs = SimpleRandomSample(apisrs;popsize=:fpc); - -julia> total(:api00, :cname, srs) |> first -DataFrameRow - Row │ cname total SE - │ String15 Float64 Float64 -─────┼────────────────────────────── - 1 │ Kern 1.77644e5 55600.8 - + 1 │ 5.94916e6 2.01705e6 ``` """ -function total(x::Symbol, by::Symbol, design::SimpleRandomSample) - function domain_total(x::AbstractVector, design::SimpleRandomSample, weights) - function se(x::AbstractVector, design::SimpleRandomSample) - # vector of length equal to `sampsize` containing `x` and zeros - z = cat(zeros(design.sampsize - length(x)), x; dims=1) - variance = design.popsize^2 / design.sampsize * design.fpc * var(z) - return sqrt(variance) - end - total = wsum(x, weights) - return DataFrame(total=total, SE=se(x, design::SimpleRandomSample)) - end - gdf = groupby(design.data, by) - combine(gdf, [x, :weights] => ((a, b) -> domain_total(a, design, b)) => AsTable) +function total(x::Symbol, design::ReplicateDesign) + X = wsum(design.data[!, x], weights(design.data.weights)) + Xt = [wsum(design.data[!, x], weights(design.data.weights .* design.data[! 
, "replicate_"*string(i)])) for i in 1:design.replicates] + variance = sum((Xt .- X).^2) / design.replicates + DataFrame(total = X, SE = sqrt(variance)) end - """ ```jldoctest -julia> using Survey, Random, StatsBase; +julia> using Survey; julia> apiclus1 = load_data("apiclus1"); -julia> dclus1 = SurveyDesign(apiclus1, :dnum, :fpc); - -julia> total(:api00, dclus1, Bootstrap(replicates = 1000, rng = MersenneTwister(111))) -1×2 DataFrame - Row │ total SE - │ Float64 Float64 -─────┼────────────────────── - 1 │ 5.94916e6 1.36593e6 +julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw); + +julia> bclus1 = bootweights(dclus1; replicates = 1000); + +julia> total(:api00, :cname, bclus1) |> print +11×3 DataFrame + Row │ cname statistic SE + │ String15 Float64 Any +─────┼─────────────────────────────────────── + 1 │ Alameda 3.71384e5 3.78375e5 + 2 │ Fresno 95281.1 96134.8 + 3 │ Kern 45672.3 43544.7 + 4 │ Los Angeles 4.89981e5 4.42865e5 + 5 │ Mendocino 1.25813e5 1.22757e5 + 6 │ Merced 1.04819e5 1.09032e5 + 7 │ Orange 5.73756e5 6.01213e5 + 8 │ Plumas 3.2228e5 3.26443e5 + 9 │ San Diego 1.83038e6 1.34155e6 + 10 │ San Joaquin 1.02922e6 1.04048e6 + 11 │ Santa Clara 9.60583e5 643492.0 ``` """ -function total(x::Symbol, design::SurveyDesign, method::Bootstrap) - df = bootstrap(x, design, wsum; method.replicates, method.rng) - df = rename(df, :statistic => :total) +function total(x::Symbol, domain::Symbol, design::ReplicateDesign) + bydomain(x, domain, design, wsum) end \ No newline at end of file diff --git a/test/SurveyDesign.jl b/test/SurveyDesign.jl index 0f3bc796..6851f4e8 100644 --- a/test/SurveyDesign.jl +++ b/test/SurveyDesign.jl @@ -1,170 +1,3 @@ -# Work on copies, keep original -@testset "SimpleRandomSample" begin - ##### SimpleRandomSample tests - # Load API datasets - apisrs_original = load_data("apisrs") - apisrs_original[!, :derived_probs] = 1 ./ apisrs_original.pw - apisrs_original[!, :derived_sampsize] = fill(200.0, size(apisrs_original, 1)) - 
############################## - ### Valid type checking tests - apisrs = copy(apisrs_original) - @test_throws TypeError SimpleRandomSample(apisrs, popsize=-2.83, ignorefpc=true) - @test_throws TypeError SimpleRandomSample(apisrs, sampsize=-300) - @test_throws TypeError SimpleRandomSample(apisrs, sampsize=-2.8, ignorefpc=true) - @test_throws TypeError SimpleRandomSample(apisrs, weights=50) - @test_throws TypeError SimpleRandomSample(apisrs, probs=1) - ############################## - ### weights or probs as Symbol - apisrs = copy(apisrs_original) - srs_weights = SimpleRandomSample(apisrs; weights=:pw) - @test srs_weights.data.weights[1] ≈ 30.97 atol = 1e-4 - @test srs_weights.data.weights == 1 ./ srs_weights.data.probs - ### probs as Symbol - apisrs = copy(apisrs_original) - srs_probs_sym = SimpleRandomSample(apisrs; probs=:derived_probs) - @test srs_probs_sym.data.probs[1] ≈ 0.032289 atol = 1e-4 - @test srs_probs_sym.data.probs == 1 ./ srs_probs_sym.data.weights - ############################## - ### Weights or probs as non-numeric error - apisrs = copy(apisrs_original) - @test_throws ErrorException SimpleRandomSample(apisrs, weights=:stype) - @test_throws ErrorException SimpleRandomSample(apisrs, probs=:cname) - ############################## - ### popsize given as Symbol - apisrs = copy(apisrs_original) - srs_popsize_sym = SimpleRandomSample(apisrs; popsize=:fpc) - @test srs_popsize_sym.data.weights == 1 ./ srs_popsize_sym.data.probs # weights should be inverse of probs - @test srs_popsize_sym.sampsize > 0 - ### popsize given as Vector - apisrs = copy(apisrs_original) - srs_popsize_vec = SimpleRandomSample(apisrs; popsize=apisrs.fpc) - @test srs_popsize_vec.data.weights == 1 ./ srs_popsize_vec.data.probs # weights should be inverse of probs - @test srs_popsize_vec.sampsize > 0 - ############################## - ### sampsize given as Symbol - apisrs = copy(apisrs_original) - srs_sampsize_sym = SimpleRandomSample(apisrs; sampsize=:derived_sampsize, weights=:pw) - 
@test srs_sampsize_sym.data.weights == 1 ./ srs_sampsize_sym.data.probs # weights should be inverse of probs - @test srs_sampsize_sym.sampsize > 0 - ### sampsize given as Vector - apisrs = copy(apisrs_original) - srs_sampsize_vec = SimpleRandomSample(apisrs; sampsize=apisrs.derived_sampsize, probs=:derived_probs) - @test srs_sampsize_vec.data.weights == 1 ./ srs_sampsize_vec.data.probs # weights should be inverse of probs - @test srs_sampsize_vec.sampsize > 0 - ############################## - ### both weights and probs given - # If weights given, probs is superfluous - apisrs = copy(apisrs_original) - srs_weights_probs = SimpleRandomSample(apisrs; weights=:pw, probs=:derived_probs) - srs_weights_probs = SimpleRandomSample(apisrs; weights=:pw, probs=:pw) - ############################## - ### sum of weights and probs condition check - apisrs = copy(apisrs_original) - @test_throws ErrorException SimpleRandomSample(apisrs, weights=fill(0.3, size(apisrs_original, 1))) - apisrs = copy(apisrs_original) - @test_throws ErrorException SimpleRandomSample(apisrs, probs=fill(0.3, size(apisrs_original, 1))) - ############################## - ### weights only as Vector - apisrs = copy(apisrs_original) - srs_weights = SimpleRandomSample(apisrs; weights=apisrs.pw) - @test srs_weights.data.weights[1] == 30.97 - @test srs_weights.data.weights == 1 ./ srs_weights.data.probs - ### probs only as Vector - apisrs = copy(apisrs_original) - srs_freq = SimpleRandomSample(apisrs; probs=apisrs.derived_probs) - @test srs_freq.data.weights[1] == 30.97 - @test srs_freq.data.weights == 1 ./ srs_freq.data.probs - ############################## - ### ignorefpc tests. 
TODO: change if ignorefpc functionality changed - apisrs = copy(apisrs_original) - srs_ignorefpc = SimpleRandomSample(apisrs; popsize=:fpc, ignorefpc=true) - @test srs_ignorefpc.data.weights == 1 ./ srs_ignorefpc.data.probs # weights should be inverse of probs - @test srs_ignorefpc.sampsize > 0 - ### incorrect probs with correct popsize, ignorefpc = true - apisrs = copy(apisrs_original) - srs_w_p = SimpleRandomSample(apisrs, popsize=:fpc, probs=fill(0.3, size(apisrs_original, 1)), ignorefpc=true) - @test srs_w_p.data.probs == 1 ./ srs_w_p.data.weights - ### ingorefpc = true with probs given - apisrs = copy(apisrs_original) - srs = SimpleRandomSample(apisrs, ignorefpc=true, probs=:derived_probs) - @test srs.data.probs == 1 ./ srs.data.weights - ############################## - ### probs as vector declared on-the-fly - apisrs = copy(apisrs_original) - srs_prob = SimpleRandomSample(apisrs; probs=1 ./ apisrs.pw) - @test srs_prob.data.weights[1] == 30.97 - @test srs_prob.data.weights == 1 ./ srs_prob.data.probs -end - -@testset "StratifiedSample" begin - ### StratifiedSample tests - # Load API datasets - apistrat_original = load_data("apistrat") - apistrat_original[!, :derived_probs] = 1 ./ apistrat_original.pw - apistrat_original[!, :derived_sampsize] = apistrat_original.fpc ./ apistrat_original.pw - ############################## - ### Valid type checking tests - apistrat = copy(apistrat_original) - @test_throws TypeError StratifiedSample(apistrat,:stype; popsize=-2.83, ignorefpc=true) - @test_throws TypeError StratifiedSample(apistrat,:stype; sampsize=-300) - @test_throws TypeError StratifiedSample(apistrat,:stype; sampsize=-2.8, ignorefpc=true) - @test_throws TypeError StratifiedSample(apistrat,:stype; weights=50) - @test_throws TypeError StratifiedSample(apistrat,:stype; probs=1) - ############################## - ### weights as Symbol - apistrat = copy(apistrat_original) - strat_wt = StratifiedSample(apistrat, :stype; weights=:pw) - @test strat_wt.data.probs == 1 
./ strat_wt.data.weights - ### probs as Symbol - apistrat = copy(apistrat_original) - strat_probs = StratifiedSample(apistrat, :stype; probs=:derived_probs) - @test strat_probs.data.probs == 1 ./ strat_probs.data.weights - ### weights as Vector{<:Real} - apistrat = copy(apistrat_original) - strat_wt = StratifiedSample(apistrat, :stype; weights=apistrat.pw) - @test strat_wt.data.probs == 1 ./ strat_wt.data.weights - ### probs as Vector{<:Real} - apistrat = copy(apistrat_original) - strat_probs = StratifiedSample(apistrat, :stype; probs=apistrat.derived_probs) - @test strat_probs.data.probs == 1 ./ strat_probs.data.weights - ############################## - ### popsize as Symbol - apistrat = copy(apistrat_original) - strat_pop = StratifiedSample(apistrat, :stype; popsize=:fpc) - @test strat_pop.data.probs == 1 ./ strat_pop.data.weights - ### popsize given as Vector (should give error for now, not implemented Vector input directly for popsize) - apistrat = copy(apistrat_original) - @test_throws TypeError StratifiedSample(apistrat,:stype; popsize=apistrat.fpc) - ############################## - ### sampsize given as Symbol - apistrat = copy(apistrat_original) - strat_sampsize_sym = StratifiedSample(apistrat,:stype; sampsize=:derived_sampsize, weights=:pw) - @test strat_sampsize_sym.data.weights == 1 ./ strat_sampsize_sym.data.probs # weights should be inverse of probs - ### sampsize given as symbol without weights or probs, and popsize not given - raise error - apistrat = copy(apistrat_original) - @test_throws ErrorException StratifiedSample(apistrat,:stype; sampsize=:derived_sampsize) - ############################## - ### both weights and probs given - # If weights given, probs is superfluous - apistrat = copy(apistrat_original) - strat_weights_probs = StratifiedSample(apistrat,:stype; weights=:pw, probs=:derived_probs) - strat_weights_probs = StratifiedSample(apistrat,:stype; weights=:pw, probs=:pw) - ############################## - ### ignorefpc test (Modify if 
ignorefpc changed) - apistrat = copy(apistrat_original) - strat_ignorefpc=StratifiedSample(apistrat,:stype; popsize=:fpc, ignorefpc=true) - @test strat_ignorefpc.data.probs == 1 ./ strat_ignorefpc.data.weights - ############################## - # For now, no sum checks on probs and weights for StratifiedSample (unlike SRS) - apistrat = copy(apistrat_original) - strat_probs1 = StratifiedSample(apistrat, :stype; probs=fill(0.3, size(apistrat, 1))) - @test strat_probs1.data.probs == 1 ./ strat_probs1.data.weights - ############################## - #should throw error because sampsize > popsize - apistrat = copy(apistrat_original) - @test_throws ErrorException StratifiedSample(apistrat, :stype; popsize= :pw, sampsize=:fpc) -end - @testset "SurveyDesign" begin # Load API datasets apiclus1_original = load_data("apiclus1") @@ -172,32 +5,8 @@ end ############################## # one-stage cluster sample with popsize apiclus1 = copy(apiclus1_original) - dclus1 = SurveyDesign(apiclus1, :dnum, :fpc) - @test dclus1.data[!,dclus1.weights] ≈ fill(50.4667,size(apiclus1,1)) atol = 1e-3 - @test dclus1.data[!,dclus1.sampsize] ≈ fill(15,size(apiclus1,1)) - @test dclus1.data[!,:allprobs] ≈ dclus1.data[!,:probs] atol = 1e-4 - - ############################## - # one-stage cluster sample with weights - apiclus1 = copy(apiclus1_original) - dclus1 = SurveyDesign(apiclus1, :dnum; weights=:pw) - @test dclus1.data[!,dclus1.weights] ≈ fill(50.4667,size(apiclus1,1)) atol = 1e-3 + dclus1 = SurveyDesign(apiclus1; clusters = :dnum, popsize =:fpc) + @test dclus1.data[!, :weights] ≈ fill(50.4667,size(apiclus1,1)) atol = 1e-3 @test dclus1.data[!,dclus1.sampsize] ≈ fill(15,size(apiclus1,1)) @test dclus1.data[!,:allprobs] ≈ dclus1.data[!,:probs] atol = 1e-4 -end - -# @testset "ClusterSample" begin -# # # Load API datasets -# # apiclus1_original = load_data("apiclus1") -# # apiclus1_original[!, :pw] = fill(757/15,(size(apiclus1_original,1),)) # Correct api mistake for pw column -# # apiclus2_original = 
load_data("apiclus2") -# ############################## -# ### TODO when they are implemented -# # one-stage cluster sample -# # apiclus1 = copy(apiclus1_original) -# # dclus2 = ClusterSample(apiclus1, :dnum, :fpc) -# # # two-stage cluster sample -# # dclus2 = ClusterSample(apiclus2, [:dnum,:snum], [:fpc1,:fpc2]) -# # # two-stage `with replacement' -# # dclus2wr = ClusterSample(apiclus2, [:dnum,:snum]; weights=:pw) -# end \ No newline at end of file +end \ No newline at end of file diff --git a/test/bootstrap.jl b/test/bootstrap.jl deleted file mode 100644 index e49c1b3f..00000000 --- a/test/bootstrap.jl +++ /dev/null @@ -1,10 +0,0 @@ -using Random, StatsBase -apiclus1 = load_data("apiclus1") -dclus1 = SurveyDesign(apiclus1, :dnum, :fpc); -rng = MersenneTwister(111); -func = wsum; -est = Survey.bootstrap(:api00, dclus1, func; replicates=1000, rng) -@testset "bootstrap.jl" begin - @test est.SE[1] ≈ 1.365925776009e6 - @test est.statistic[1] ≈ 5.9491620666e6 -end \ No newline at end of file diff --git a/test/boxplot.jl b/test/boxplot.jl index f40dd87f..563fec48 100644 --- a/test/boxplot.jl +++ b/test/boxplot.jl @@ -1,7 +1,7 @@ @testset "boxplot.jl" begin # SimpleRandomSample apisrs = load_data("apisrs") - srs = SimpleRandomSample(apisrs,popsize = apisrs.fpc) + srs = srs = SurveyDesign(apisrs; weights=:pw); bp = boxplot(srs, :stype, :enroll; weights = :pw) @test bp.grid[1].entries[1].positional[2] == srs.data[!, :enroll] diff --git a/test/dimnames.jl b/test/dimnames.jl deleted file mode 100644 index 6241bd64..00000000 --- a/test/dimnames.jl +++ /dev/null @@ -1,16 +0,0 @@ -@testset "dimnames.jl" begin - # Simple random sampling tests - apisrs = load_data("apisrs") - # make a copy to not modify the original dataset - apisrs_copy = copy(apisrs) - srs = SimpleRandomSample(apisrs_copy,popsize=:fpc,ignorefpc = true) - # `dim` - @test dim(srs)[2] == 42 - # `colnames` - @test length(colnames(srs)) == dim(srs)[2] - # `dimnames` - @test length(dimnames(srs)[1]) == parse(Int, 
last(dimnames(srs)[1])) - @test dimnames(srs)[2] == colnames(srs) - - # Stratified sampling tests -end diff --git a/test/hist.jl b/test/hist.jl index 64e260b9..f48b6d70 100644 --- a/test/hist.jl +++ b/test/hist.jl @@ -4,7 +4,7 @@ # SimpleRandomSample apisrs = load_data("apisrs") - srs = SimpleRandomSample(apisrs,popsize=:fpc) + srs = srs = SurveyDesign(apisrs; weights=:pw); h = hist(srs, :enroll) @test h.grid[1].entries[1].positional[2] |> length == 21 diff --git a/test/jackknife.jl b/test/jackknife.jl index 73e90f78..25e35e91 100644 --- a/test/jackknife.jl +++ b/test/jackknife.jl @@ -1,10 +1,8 @@ @testset "jackknife.jl" begin apiclus1_original = load_data("apiclus1") apiclus1_original[!, :pw] = fill(757/15,(size(apiclus1_original,1),)) # Correct api mistake for pw column - ############################## - # one-stage cluster sample apiclus1 = copy(apiclus1_original) - dclus1 = SurveyDesign(apiclus1, :dnum, :fpc) - @test jkknife(:api00,dclus1, mean).SE[1] ≈ 26.5997 atol = 1e-4 + dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights=:pw); + @test jkknife(:api00, dclus1, mean).SE[1] ≈ 26.5997 atol = 1e-4 @test jkknife(:api00, dclus1, mean).Statistic[1] ≈ 644.1693 atol = 1e-4 end diff --git a/test/mean.jl b/test/mean.jl index d2fbd8a2..6e8bea18 100644 --- a/test/mean.jl +++ b/test/mean.jl @@ -7,34 +7,26 @@ ############################## ### Basic functionality apisrs = copy(apisrs_original) - srs = SimpleRandomSample(apisrs, popsize = :fpc) - @test mean(:api00, srs).mean[1] ≈ 656.585 atol = 1e-4 - @test mean(:api00, srs).SE[1] ≈ 9.249722039282807 atol = 1e-4 - @test mean(:enroll, srs).mean[1] ≈ 584.61 atol = 1e-4 - @test mean(:enroll, srs).SE[1] ≈ 27.36836524766856 atol = 1e-4 - # ignorefpc = true - apisrs = copy(apisrs_original) - srs = SimpleRandomSample(apisrs, popsize=:fpc,ignorefpc = true) + srs = SurveyDesign(apisrs, weights = :pw) |> bootweights + @test mean(:api00, srs).mean[1] ≈ 656.585 atol = 1e-4 - @test mean(:api00, srs).SE[1] ≈ 9.402772170880636 atol 
= 1e-4 + @test mean(:api00, srs).SE[1] ≈ 9.402772170880636 atol = 1e-1 @test mean(:enroll, srs).mean[1] ≈ 584.61 atol = 1e-4 - @test mean(:enroll, srs).SE[1] ≈ 27.821214737089324 atol = 1e-4 + @test mean(:enroll, srs).SE[1] ≈ 27.821214737089324 atol = 1 ############################## ### Vector of Symbols - apisrs = copy(apisrs_original) - srs = SimpleRandomSample(apisrs, popsize = :fpc) mean_vec_sym = mean([:api00,:enroll], srs) @test mean_vec_sym.mean[1] ≈ 656.585 atol = 1e-4 - @test mean_vec_sym.SE[1] ≈ 9.249722039282807 atol = 1e-4 + @test mean_vec_sym.SE[1] ≈ 9.49199 atol = 1e-2 @test mean_vec_sym.mean[2] ≈ 584.61 atol = 1e-4 - @test mean_vec_sym.SE[2] ≈ 27.36836524766856 atol = 1e-4 + @test mean_vec_sym.SE[2] ≈ 27.9994 atol = 1e-2 ############################## ### Categorical Array - estimating proportions - apisrs_categ = copy(apisrs_original) - apisrs_categ.stype = CategoricalArray(apisrs_categ.stype) # Convert a column to CategoricalArray - srs_design_categ = SimpleRandomSample(apisrs_categ, popsize = :fpc) + # apisrs_categ = copy(apisrs_original) + # apisrs_categ.stype = CategoricalArray(apisrs_categ.stype) # Convert a column to CategoricalArray + # srs_design_categ = SurveyDesign(apisrs_categ, weights = :pw) #>>>>>>>>> complete this suite - mean_categ = mean(:stype,srs_design_categ) + # mean_categ = mean(:stype,srs_design_categ) # complete this end @@ -63,14 +55,14 @@ end @testset "mean_svyby_Stratified" begin apistrat_original = load_data("apistrat") apistrat = copy(apistrat_original) - strat = StratifiedSample(apistrat, :stype; popsize = :fpc) - mean_strat_symb = mean(:api00,:stype, strat) + strat = SurveyDesign(apistrat; strata = :stype, weights = :pw) |> bootweights + mean_strat_symb = mean(:api00, :stype, strat) @test mean_strat_symb.mean[1] ≈ 674.43 atol = 1e-2 @test mean_strat_symb.mean[2] ≈ 636.6 atol = 1e-2 @test mean_strat_symb.mean[3] ≈ 625.82 atol = 1e-2 - @test mean_strat_symb.SE[1] ≈ 12.3825 atol = 1e-2 - @test mean_strat_symb.SE[2] ≈ 
16.2147 atol = 1e-2 - @test mean_strat_symb.SE[3] ≈ 14.9371 atol = 1e-2 + @test mean_strat_symb.SE[1] ≈ 12.6528 atol = 1e-2 + @test mean_strat_symb.SE[2] ≈ 16.3125 atol = 1e-2 + @test mean_strat_symb.SE[3] ≈ 15.3952 atol = 1e-2 end @testset "mean_OneStageCluster" begin @@ -80,8 +72,7 @@ end ############################## # one-stage cluster sample apiclus1 = copy(apiclus1_original) - dclus1 = SurveyDesign(apiclus1, :dnum, :fpc) - - @test mean(:api00,dclus1, Bootstrap()).mean[1] ≈ 644.17 atol = 1 - @test mean(:api00,dclus1, Bootstrap(replicates = 10000)).SE[1] ≈ 23.779 atol = 0.5 # without fpc as it hasn't been figured out for bootstrap. + dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) |> bootweights + @test mean(:api00, dclus1).mean[1] ≈ 644.17 atol = 1e-2 + @test mean(:api00, dclus1).SE[1] ≈ 22.9042 atol = 1e-2 # without fpc as it hasn't been figured out for bootstrap. end diff --git a/test/plot.jl b/test/plot.jl index e9e59c36..c2476f65 100644 --- a/test/plot.jl +++ b/test/plot.jl @@ -1,7 +1,7 @@ @testset "plot.jl" begin # SimpleRandomSample apisrs = load_data("apisrs") - srs = SimpleRandomSample(apisrs,popsize=:fpc) + srs = SurveyDesign(apisrs, weights=:pw) s = plot(srs, :api99, :api00) @test s.grid[1].entries[1].named[:markersize] == srs.data.weights @test s.grid[1].entries[1].positional[1] == srs.data.api99 diff --git a/test/quantile.jl b/test/quantile.jl index cab18fdb..59bb0a69 100644 --- a/test/quantile.jl +++ b/test/quantile.jl @@ -6,10 +6,10 @@ apisrs_original[!, :derived_sampsize] = fill(200.0, size(apisrs_original, 1)) ############################## apisrs = copy(apisrs_original) - srs_design = SimpleRandomSample(apisrs,popsize=:fpc) - @test quantile(:api00,srs_design,0.5)[!,2][1] ≈ 659.0 atol=1e-4 - @test quantile(:api00,srs_design,[0.1753,0.25,0.5,0.75,0.975])[!,2] ≈ [512.8847,544,659,752.5,905] atol = 1e-4 - @test quantile(:enroll,srs_design,[0.1,0.2,0.5,0.75,0.95])[!,2] ≈ [245.5,317.6,453.0,668.5,1473.1] atol = 1e-4 + srs_design = 
SurveyDesign(apisrs; weights=:pw) + @test quantile(:api00, srs_design, 0.5)[!,2][1] ≈ 659.0 atol=1e-4 + @test quantile(:api00, srs_design, [0.1753,0.25,0.5,0.75,0.975])[!,2] ≈ [512.8847,544,659,752.5,905] atol = 1e-4 + @test quantile(:enroll,srs_design, [0.1,0.2,0.5,0.75,0.95])[!,2] ≈ [245.5,317.6,453.0,668.5,1473.1] atol = 1e-4 end @testset "quantile_Stratified" begin @@ -20,15 +20,7 @@ end apistrat_original[!, :derived_sampsize] = apistrat_original.fpc ./ apistrat_original.pw # base functionality apistrat = copy(apistrat_original) - dstrat = StratifiedSample(apistrat, :stype; popsize = :fpc) + dstrat = SurveyDesign(apistrat; strata = :stype, popsize = :fpc) # Check which definition of quantile for StratifiedSample - # @test quantile(:enroll,dstrat,[0.1,0.2,0.5,0.75,0.95])[!,2] ≈ [262,309.3366,446.4103,658.8764,1589.7881] atol = 1e-4 -end - -@testset "quantile_by_SimpleRandomSample" begin - ## Add tests -end - -@testset "quantile_by_Stratified" begin - ## Add tests + # @test quantile(:enroll, dstrat, [0.1,0.2,0.5,0.75,0.95])[!,2] ≈ [262,309.3366,446.4103,658.8764,1589.7881] atol = 1e-4 end \ No newline at end of file diff --git a/test/ratio.jl b/test/ratio.jl index d198ce1b..b8652ef1 100644 --- a/test/ratio.jl +++ b/test/ratio.jl @@ -4,7 +4,7 @@ ############################## # one-stage cluster sample apiclus1 = copy(apiclus1_original) - dclus1 = SurveyDesign(apiclus1, :dnum, :fpc) + dclus1 = SurveyDesign(apiclus1; clusters = :dnum, popsize = :fpc) @test ratio(:api00, :enroll, dclus1).SE[1] ≈ 0.151242 atol = 1e-4 @test ratio(:api00, :enroll, dclus1).Statistic[1] ≈ 1.17182 atol = 1e-4 end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 607add34..e8f18a3a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -13,9 +13,7 @@ include("SurveyDesign.jl") include("total.jl") include("quantile.jl") include("mean.jl") -include("dimnames.jl") include("plot.jl") include("hist.jl") include("boxplot.jl") -include("bootstrap.jl") 
include("ratio.jl") \ No newline at end of file diff --git a/test/sampling.jl b/test/sampling.jl deleted file mode 100644 index 276ed8a1..00000000 --- a/test/sampling.jl +++ /dev/null @@ -1,3 +0,0 @@ -""" - Testing suite for sampling functions -""" \ No newline at end of file diff --git a/test/total.jl b/test/total.jl index de19c41c..0e329e76 100644 --- a/test/total.jl +++ b/test/total.jl @@ -1,14 +1,14 @@ -@testset "total_SimpleRandomSample" begin +@testset "Simple random sample" begin apisrs_original = load_data("apisrs") # base functionality apisrs = copy(apisrs_original) - srs = SimpleRandomSample(apisrs; popsize = :fpc) + srs = SurveyDesign(apisrs; weights = :pw) |> bootweights tot = total(:api00, srs) @test tot.total[1] ≈ 4.06688749e6 atol = 1e-4 - @test tot.SE[1] ≈ 57292.7783113177 atol = 1e-4 + @test tot.SE[1] ≈ 292392.42247601174 atol = 1e-1 # without fpc - srs_ignorefpc = SimpleRandomSample(apisrs; popsize = :fpc, ignorefpc = true) + srs_ignorefpc = SurveyDesign(apisrs; popsize = :fpc, ignorefpc = true) tot = total(:api00, srs_ignorefpc) # TODO: uncomment after correcting `total` function # @test tot.total[1] ≈ 131317 atol = 1 @@ -17,7 +17,7 @@ # CategoricalArray apisrs = copy(apisrs_original) apisrs[!, :cname] = CategoricalArrays.categorical(apisrs.cname) - srs = SimpleRandomSample(apisrs; popsize = :fpc) + srs = SurveyDesign(apisrs; popsize = :fpc) tot = total(:cname, srs) @test size(tot)[1] == apisrs.cname |> unique |> length @test filter(:cname => ==("Alameda"), tot).total[1] ≈ 340.67 atol = 1e-2 @@ -27,7 +27,7 @@ # Vector{Symbol} apisrs = copy(apisrs_original) - srs = SimpleRandomSample(apisrs; popsize = :fpc) + srs = SurveyDesign(apisrs; popsize = :fpc) tot = total([:api00, :enroll], srs) ## :api00 @test tot.total[1] ≈ 4066888 atol = 1 @@ -38,7 +38,7 @@ # subpopulation apisrs = copy(apisrs_original) - srs = SimpleRandomSample(apisrs; popsize = :fpc) + srs = SurveyDesign(apisrs; popsize = :fpc) tot = total(:api00, :cname, srs) @test size(tot)[1] == 
apisrs.cname |> unique |> length @test filter(:cname => ==("Los Angeles"), tot).total[1] ≈ 917238.49 atol = 1e-2 From eb290ebc82a4a8d6713010395a8e7dacb00bce8e Mon Sep 17 00:00:00 2001 From: ayushpatnaikgit Date: Mon, 2 Jan 2023 22:59:02 +0530 Subject: [PATCH 06/80] All tests pass. --- src/mean.jl | 4 +++- src/total.jl | 15 +++++++++--- test/mean.jl | 26 ++++++++++----------- test/total.jl | 65 ++++++++++++++++++++++----------------------------- 4 files changed, 56 insertions(+), 54 deletions(-) diff --git a/src/mean.jl b/src/mean.jl index 5dc679f3..501230d7 100644 --- a/src/mean.jl +++ b/src/mean.jl @@ -79,7 +79,9 @@ julia> mean(:api00, :cname, bclus1) |> print """ function mean(x::Symbol, domain::Symbol, design::ReplicateDesign) weighted_mean(x, w) = mean(x, StatsBase.weights(w)) - bydomain(x, domain, design, weighted_mean) + df = bydomain(x, domain, design, weighted_mean) + rename!(df, :statistic => :mean) + return df end function mean(x::Vector{Symbol}, design::ReplicateDesign) diff --git a/src/total.jl b/src/total.jl index d298d62d..3a3185c0 100644 --- a/src/total.jl +++ b/src/total.jl @@ -4,16 +4,18 @@ julia> using Survey; julia> apiclus1 = load_data("apiclus1"); +julia> apiclus1[!, :pw] = fill(757/15,(size(apiclus1_original,1),)) # Correct api mistake for pw column + julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw); julia> bclus1 = bootweights(dclus1; replicates = 1000); julia> total(:api00, bclus1) 1×2 DataFrame - Row │ mean SE + Row │ total SE │ Float64 Float64 ─────┼────────────────────── - 1 │ 5.94916e6 2.01705e6 + 1 │ 5.94916e6 1.31977e6 ``` """ function total(x::Symbol, design::ReplicateDesign) @@ -51,5 +53,12 @@ julia> total(:api00, :cname, bclus1) |> print ``` """ function total(x::Symbol, domain::Symbol, design::ReplicateDesign) - bydomain(x, domain, design, wsum) + df = bydomain(x, domain, design, wsum) + rename!(df, :statistic => :total) +end + +function total(x::Vector{Symbol}, design::ReplicateDesign) + df = reduce(vcat, 
[total(i, design) for i in x]) + insertcols!(df, 1, :names => String.(x)) + return df end \ No newline at end of file diff --git a/test/mean.jl b/test/mean.jl index 6e8bea18..4745125b 100644 --- a/test/mean.jl +++ b/test/mean.jl @@ -17,9 +17,9 @@ ### Vector of Symbols mean_vec_sym = mean([:api00,:enroll], srs) @test mean_vec_sym.mean[1] ≈ 656.585 atol = 1e-4 - @test mean_vec_sym.SE[1] ≈ 9.49199 atol = 1e-2 + @test mean_vec_sym.SE[1] ≈ 9.3065 atol = 1e-2 @test mean_vec_sym.mean[2] ≈ 584.61 atol = 1e-4 - @test mean_vec_sym.SE[2] ≈ 27.9994 atol = 1e-2 + @test mean_vec_sym.SE[2] ≈ 28.1048 atol = 1e-2 ############################## ### Categorical Array - estimating proportions # apisrs_categ = copy(apisrs_original) @@ -33,23 +33,23 @@ end @testset "mean_Stratified" begin apistrat_original = load_data("apistrat") apistrat = copy(apistrat_original) - strat = StratifiedSample(apistrat, :stype; popsize = :fpc) + strat = SurveyDesign(apistrat, strata = :stype, weights = :pw) |> bootweights mean_strat = mean(:api00, strat) - @test mean_strat.mean[1] ≈ 662.287 atol = 1e-2 - @test mean_strat.SE[1] ≈ 9.40894 atol = 1e-2 + @test mean_strat.mean[1] ≈ 662.29 atol = 1e-2 + @test mean_strat.SE[1] ≈ 9.48296 atol = 1e-1 end @testset "mean_svyby_SimpleRandomSample" begin apisrs_original = load_data("apisrs") apisrs = copy(apisrs_original) - srs = SimpleRandomSample(apisrs, popsize = :fpc) + srs = SurveyDesign(apisrs; popsize = :fpc) |> bootweights mean_symb_srs = mean(:api00, :stype, srs) @test mean_symb_srs.mean[1] ≈ 605.36 atol = 1e-2 @test mean_symb_srs.mean[2] ≈ 666.141 atol = 1e-2 @test mean_symb_srs.mean[3] ≈ 654.273 atol = 1e-2 - @test mean_symb_srs.SE[1] ≈ 21.9266 atol = 1e-2 - @test mean_symb_srs.SE[2] ≈ 11.1935 atol = 1e-2 - @test mean_symb_srs.SE[3] ≈ 21.8261 atol = 1e-2 + @test mean_symb_srs.SE[1] ≈ 22.6718 atol = 1e-2 + @test mean_symb_srs.SE[2] ≈ 11.35390 atol = 1e-2 + @test mean_symb_srs.SE[3] ≈ 22.3298 atol = 1e-2 end @testset "mean_svyby_Stratified" begin @@ -60,9 
+60,9 @@ end @test mean_strat_symb.mean[1] ≈ 674.43 atol = 1e-2 @test mean_strat_symb.mean[2] ≈ 636.6 atol = 1e-2 @test mean_strat_symb.mean[3] ≈ 625.82 atol = 1e-2 - @test mean_strat_symb.SE[1] ≈ 12.6528 atol = 1e-2 - @test mean_strat_symb.SE[2] ≈ 16.3125 atol = 1e-2 - @test mean_strat_symb.SE[3] ≈ 15.3952 atol = 1e-2 + @test mean_strat_symb.SE[1] ≈ 12.4398 atol = 1e-2 + @test mean_strat_symb.SE[2] ≈ 16.5628 atol = 1e-2 + @test mean_strat_symb.SE[3] ≈ 15.42320 atol = 1e-2 end @testset "mean_OneStageCluster" begin @@ -74,5 +74,5 @@ end apiclus1 = copy(apiclus1_original) dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) |> bootweights @test mean(:api00, dclus1).mean[1] ≈ 644.17 atol = 1e-2 - @test mean(:api00, dclus1).SE[1] ≈ 22.9042 atol = 1e-2 # without fpc as it hasn't been figured out for bootstrap. + @test mean(:api00, dclus1).SE[1] ≈ 23.291 atol = 1e-2 # without fpc as it hasn't been figured out for bootstrap. end diff --git a/test/total.jl b/test/total.jl index 0e329e76..5825a319 100644 --- a/test/total.jl +++ b/test/total.jl @@ -6,45 +6,42 @@ srs = SurveyDesign(apisrs; weights = :pw) |> bootweights tot = total(:api00, srs) @test tot.total[1] ≈ 4.06688749e6 atol = 1e-4 - @test tot.SE[1] ≈ 292392.42247601174 atol = 1e-1 - # without fpc - srs_ignorefpc = SurveyDesign(apisrs; popsize = :fpc, ignorefpc = true) - tot = total(:api00, srs_ignorefpc) + @test tot.SE[1] ≈ 60518.199 atol = 1e-1 # TODO: uncomment after correcting `total` function # @test tot.total[1] ≈ 131317 atol = 1 # @test tot.SE[1] ≈ 1880.6 atol = 1e-1 # CategoricalArray - apisrs = copy(apisrs_original) - apisrs[!, :cname] = CategoricalArrays.categorical(apisrs.cname) - srs = SurveyDesign(apisrs; popsize = :fpc) - tot = total(:cname, srs) - @test size(tot)[1] == apisrs.cname |> unique |> length - @test filter(:cname => ==("Alameda"), tot).total[1] ≈ 340.67 atol = 1e-2 - @test filter(:cname => ==("Alameda"), tot).SE[1] ≈ 98.472 atol = 1e-3 - @test filter(:cname => ==("Los Angeles"), 
tot).total[1] ≈ 1393.65 atol = 1e-2 - @test filter(:cname => ==("Los Angeles"), tot).SE[1] ≈ 180.368 atol = 1e-3 + # apisrs = copy(apisrs_original) + # apisrs[!, :cname] = CategoricalArrays.categorical(apisrs.cname) + # srs = SurveyDesign(apisrs; popsize = :fpc) + # tot = total(:cname, srs) + # @test size(tot)[1] == apisrs.cname |> unique |> length + # @test filter(:cname => ==("Alameda"), tot).total[1] ≈ 340.67 atol = 1e-2 + # @test filter(:cname => ==("Alameda"), tot).SE[1] ≈ 98.472 atol = 1e-3 + # @test filter(:cname => ==("Los Angeles"), tot).total[1] ≈ 1393.65 atol = 1e-2 + # @test filter(:cname => ==("Los Angeles"), tot).SE[1] ≈ 180.368 atol = 1e-3 # Vector{Symbol} apisrs = copy(apisrs_original) - srs = SurveyDesign(apisrs; popsize = :fpc) + srs = SurveyDesign(apisrs; popsize = :fpc) |> bootweights tot = total([:api00, :enroll], srs) ## :api00 @test tot.total[1] ≈ 4066888 atol = 1 - @test tot.SE[1] ≈ 57293 atol = 1 + @test tot.SE[1] ≈ 60518.199 atol = 1 ## :enroll @test tot.total[2] ≈ 3621074 atol = 1 - @test tot.SE[2] ≈ 169520 atol = 1 + @test tot.SE[2] ≈ 173784.343 atol = 1 # subpopulation apisrs = copy(apisrs_original) - srs = SurveyDesign(apisrs; popsize = :fpc) + srs = SurveyDesign(apisrs; popsize = :fpc) |> bootweights tot = total(:api00, :cname, srs) @test size(tot)[1] == apisrs.cname |> unique |> length @test filter(:cname => ==("Los Angeles"), tot).total[1] ≈ 917238.49 atol = 1e-2 - @test filter(:cname => ==("Los Angeles"), tot).SE[1] ≈ 122289.00 atol = 1e-2 + @test filter(:cname => ==("Los Angeles"), tot).SE[1] ≈ 122366.33 atol = 1e-2 @test filter(:cname => ==("Monterey"), tot).total[1] ≈ 74947.40 atol = 1e-2 - @test filter(:cname => ==("Monterey"), tot).SE[1] ≈ 37616.17 atol = 1e-2 + @test filter(:cname => ==("Monterey"), tot).SE[1] ≈ 38178.35 atol = 1e-2 end @testset "total_Stratified" begin @@ -52,22 +49,18 @@ end # base functionality apistrat = copy(apistrat_original) - strat = StratifiedSample(apistrat, :stype; popsize = :fpc) + strat = 
SurveyDesign(apistrat; strata = :stype, weights = :pw) |> bootweights tot = total(:api00, strat) - @test tot.total[1] ≈ 4102208 atol = 1e-1 - @test tot.SE[1] ≈ 58279 atol = 1e-1 + @test tot.total[1] ≈ 4102208 atol = 10 + @test tot.SE[1] ≈ 77211.61 atol = 1e-1 # without fpc - apistrat = copy(apistrat_original) - strat_ignorefpc = StratifiedSample(apistrat, :stype; popsize = :fpc, ignorefpc = true) - tot = total(:api00, strat_ignorefpc) - @test tot.total[1] ≈ 130564 atol = 1e-4 # TODO: uncomment after correcting `total` function # @test tot.SE[1] ≈ 1690.4 atol = 1e-1 # CategoricalArray - apistrat = copy(apistrat_original) - apistrat[!, :cname] = CategoricalArrays.categorical(apistrat.cname) - strat = StratifiedSample(apistrat, :stype; popsize = :fpc) + # apistrat = copy(apistrat_original) + # apistrat[!, :cname] = CategoricalArrays.categorical(apistrat.cname) + # strat = StratifiedSample(apistrat, :stype; popsize = :fpc) # TODO: uncomment after adding `CategoricalArray` support # @test tot.SE[1] ≈ 1690.4 atol = 1e-1 # tot = total(:cname, strat) @@ -78,15 +71,13 @@ end # @test filter(:cname => ==("Los Angeles"), tot).SE[1] ≈ 199.635 atol = 1e-3 # Vector{Symbol} - apistrat = copy(apistrat_original) - strat = StratifiedSample(apistrat, :stype; popsize = :fpc) tot = total([:api00, :enroll], strat) ## :api00 @test tot.total[1] ≈ 4102208 atol = 1 - @test tot.SE[1] ≈ 58279 atol = 1 + @test tot.SE[1] ≈ 77211.61 atol = 1 ## :enroll @test tot.total[2] ≈ 3687178 atol = 1 - @test tot.SE[2] ≈ 114642 atol = 1 + @test tot.SE[2] ≈ 127021.5540 atol = 1 # subpopulation # TODO: add functionality in `src/total.jl` @@ -100,11 +91,11 @@ end ############################## # one-stage cluster sample apiclus1 = copy(apiclus1_original) - dclus1 = SurveyDesign(apiclus1, :dnum, :fpc) + dclus1 = SurveyDesign(apiclus1, clusters = :dnum, weights = :pw) |> bootweights @test total(:api00,dclus1).total[1] ≈ 5949162 atol = 1 - @test total(:api00,dclus1).SE[1] ≈ 1339481 atol = 1 + @test 
total(:api00,dclus1).SE[1] ≈ 1.3338978891316957e6 atol = 1 - @test total(:api00,dclus1, Bootstrap()).total[1] ≈ 5949162 atol = 1 - @test total(:api00,dclus1, Bootstrap(replicates = 10000)).SE[1] ≈ 1352953 atol = 50000 # without fpc as it hasn't been figured out for bootstrap. + @test total(:api00, dclus1).total[1] ≈ 5949162 atol = 1 + @test total(:api00, dclus1).SE[1] ≈ 1352953 atol = 50000 # without fpc as it hasn't been figured out for bootstrap. end \ No newline at end of file From f9efaea9f7314ae4ed61ff0aec0b9a0c40c11c3a Mon Sep 17 00:00:00 2001 From: ayushpatnaikgit Date: Wed, 4 Jan 2023 21:19:10 +0530 Subject: [PATCH 07/80] Fix bug in total. --- Project.toml | 1 + src/Survey.jl | 1 + src/bootstrap.jl | 41 +++++++++++++++++++++++++++++++---------- src/by.jl | 2 +- src/mean.jl | 12 ++++++------ src/total.jl | 2 +- 6 files changed, 41 insertions(+), 18 deletions(-) diff --git a/Project.toml b/Project.toml index 2659ebc2..f288d231 100644 --- a/Project.toml +++ b/Project.toml @@ -10,6 +10,7 @@ CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" diff --git a/src/Survey.jl b/src/Survey.jl index f6d3d030..fee13d6d 100644 --- a/src/Survey.jl +++ b/src/Survey.jl @@ -11,6 +11,7 @@ using CairoMakie using AlgebraOfGraphics using CategoricalArrays using Random +using Missings include("SurveyDesign.jl") include("bootstrap.jl") diff --git a/src/bootstrap.jl b/src/bootstrap.jl index 70df8a81..a28fabe3 100644 --- a/src/bootstrap.jl +++ b/src/bootstrap.jl @@ -26,27 +26,48 @@ function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwis H = length(unique(design.data[!, design.strata])) stratified = 
groupby(design.data, design.strata) function replicate(stratified, H) - for j in 1:H - substrata = DataFrame(stratified[j]) + for h in 1:H + substrata = DataFrame(stratified[h]) psus = unique(substrata[!, design.cluster]) - if length(psus) == 1 - return DataFrame(statistic = X, SE = 0) + # @show psus + if length(psus) <= 1 + return DataFrame(statistic = X, SE = 0) # bug! end nh = length(psus) randinds = rand(rng, 1:(nh), (nh-1)) # Main bootstrap algo. Draw nh-1 out of nh, with replacement. rh = [(count(==(i), randinds)) for i in 1:nh] # main bootstrap algo. gdf = groupby(substrata, design.cluster) + # @show keys(gdf) for i in 1:nh - gdf[i].rh = repeat([rh[i]], nrow(gdf[i])) - end - stratified[j].rh = DataFrame(gdf).rh + gdf[i].whij = repeat([rh[i]], nrow(gdf[i])) .* gdf[i].weights .* (nh / (nh - 1)) + end + stratified[h].whij = transform(gdf).whij + end - return DataFrame(stratified) + return transform(stratified, :whij) end df = replicate(stratified, H) - rename!(df,:rh => :replicate_1) + rename!(df,:whij => :replicate_1) + df.replicate_1 = disallowmissing(df.replicate_1) for i in 2:(replicates) - df[!, "replicate_"*string(i)] = Float64.(replicate(stratified, H).rh) + df[!, "replicate_"*string(i)] = disallowmissing(replicate(stratified, H).whij) end return ReplicateDesign(df, design.cluster, design.popsize, design.sampsize, design.strata, design.pps, replicates) +end + +function bootstrap(x::Symbol, design::SurveyDesign, func = wsum; replicates = 100, rng = MersenneTwister(1234)) + gdf = groupby(design.data, design.cluster) + psus = unique(design.data[!, design.cluster]) + nh = length(psus) + X = func(design.data[:, x], design.data.weights) + Xt = Array{Float64, 1}(undef, replicates) + for i in 1:replicates + selected_psus = psus[rand(rng, 1:nh, (nh-1))] # simple random sample of PSUs, with replacement. 
Select (nh-1) out of nh + xhij = (reduce(vcat, [gdf[(i,)][!, x] for i in selected_psus])) + whij = (reduce(vcat, [gdf[(i,)].weights * (nh / (nh - 1)) for i in selected_psus])) + Xt[i] = func(xhij, whij) + end + @show Xt + variance = sum((Xt .- X).^2) / replicates + return DataFrame(statistic = X, SE = sqrt(variance)) end \ No newline at end of file diff --git a/src/by.jl b/src/by.jl index 30cb2dd2..be26d5a3 100644 --- a/src/by.jl +++ b/src/by.jl @@ -4,7 +4,7 @@ function bydomain(x::Symbol, domain::Symbol, design::ReplicateDesign, func::Func X = combine(gdf, [x, :weights] => ((a, b) -> func(a, weights(b))) => :statistic) Xt_mat = Array{Float64, 2}(undef, (nd, design.replicates)) for i in 1:design.replicates - Xt_mat[:, i] = combine(gdf, [x, :weights, Symbol("replicate_"*string(i))] => ((a, b, c) -> func(a, weights(b .* c))) => :statistic).statistic + Xt_mat[:, i] = combine(gdf, [x, Symbol("replicate_"*string(i))] => ((a, c) -> func(a, weights(c))) => :statistic).statistic end ses = [] for i in 1:nd diff --git a/src/mean.jl b/src/mean.jl index 501230d7..2bf8b925 100644 --- a/src/mean.jl +++ b/src/mean.jl @@ -4,21 +4,21 @@ julia> using Survey, Random, StatsBase; julia> apiclus1 = load_data("apiclus1"); -julia> dclus1 = SurveyDesign(apiclus1, :dnum, :fpc); +julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw); -julia> bclus1 = bootweights(apiclus1; replicates = 1000) +julia> bclus1 = bootweights(dclus1; replicates = 1000) julia> mean(:api00, bclus1) 1×2 DataFrame Row │ mean SE │ Float64 Float64 ─────┼────────────────── - 1 │ 644.169 23.0897 + 1 │ 644.169 23.7208 ``` """ function mean(x::Symbol, design::ReplicateDesign) X = mean(design.data[!, x], weights(design.data.weights)) - Xt = [mean(design.data[!, x], weights(design.data.weights .* design.data[! , "replicate_"*string(i)])) for i in 1:design.replicates] + Xt = [mean(design.data[!, x], weights(design.data[! 
, "replicate_"*string(i)])) for i in 1:design.replicates] variance = sum((Xt .- X).^2) / design.replicates DataFrame(mean = X, SE = sqrt(variance)) end @@ -28,9 +28,9 @@ julia> using Survey, Random, StatsBase; julia> apiclus1 = load_data("apiclus1"); -julia> dclus1 = SurveyDesign(apiclus1, :dnum, :fpc); +julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw); -julia> bclus1 = bootweights(apiclus1; replicates = 1000) +julia> bclus1 = bootweights(dclus1; replicates = 1000) julia> mean(:api00, :cname, bclus1) |> print 38×3 DataFrame diff --git a/src/total.jl b/src/total.jl index 3a3185c0..fdf83216 100644 --- a/src/total.jl +++ b/src/total.jl @@ -20,7 +20,7 @@ julia> total(:api00, bclus1) """ function total(x::Symbol, design::ReplicateDesign) X = wsum(design.data[!, x], weights(design.data.weights)) - Xt = [wsum(design.data[!, x], weights(design.data.weights .* design.data[! , "replicate_"*string(i)])) for i in 1:design.replicates] + Xt = [wsum(design.data[!, x], weights(design.data[! , "replicate_"*string(i)])) for i in 1:design.replicates] variance = sum((Xt .- X).^2) / design.replicates DataFrame(total = X, SE = sqrt(variance)) end From 7a65293550935ef7355a516e3a8deec3985cef7f Mon Sep 17 00:00:00 2001 From: ayushpatnaikgit Date: Thu, 5 Jan 2023 11:18:08 +0530 Subject: [PATCH 08/80] Remove comments and unused function. --- src/bootstrap.jl | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/src/bootstrap.jl b/src/bootstrap.jl index a28fabe3..9dd605c1 100644 --- a/src/bootstrap.jl +++ b/src/bootstrap.jl @@ -29,7 +29,6 @@ function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwis for h in 1:H substrata = DataFrame(stratified[h]) psus = unique(substrata[!, design.cluster]) - # @show psus if length(psus) <= 1 return DataFrame(statistic = X, SE = 0) # bug! end @@ -37,7 +36,6 @@ function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwis randinds = rand(rng, 1:(nh), (nh-1)) # Main bootstrap algo. 
Draw nh-1 out of nh, with replacement. rh = [(count(==(i), randinds)) for i in 1:nh] # main bootstrap algo. gdf = groupby(substrata, design.cluster) - # @show keys(gdf) for i in 1:nh gdf[i].whij = repeat([rh[i]], nrow(gdf[i])) .* gdf[i].weights .* (nh / (nh - 1)) end @@ -53,21 +51,4 @@ function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwis df[!, "replicate_"*string(i)] = disallowmissing(replicate(stratified, H).whij) end return ReplicateDesign(df, design.cluster, design.popsize, design.sampsize, design.strata, design.pps, replicates) -end - -function bootstrap(x::Symbol, design::SurveyDesign, func = wsum; replicates = 100, rng = MersenneTwister(1234)) - gdf = groupby(design.data, design.cluster) - psus = unique(design.data[!, design.cluster]) - nh = length(psus) - X = func(design.data[:, x], design.data.weights) - Xt = Array{Float64, 1}(undef, replicates) - for i in 1:replicates - selected_psus = psus[rand(rng, 1:nh, (nh-1))] # simple random sample of PSUs, with replacement. Select (nh-1) out of nh - xhij = (reduce(vcat, [gdf[(i,)][!, x] for i in selected_psus])) - whij = (reduce(vcat, [gdf[(i,)].weights * (nh / (nh - 1)) for i in selected_psus])) - Xt[i] = func(xhij, whij) - end - @show Xt - variance = sum((Xt .- X).^2) / replicates - return DataFrame(statistic = X, SE = sqrt(variance)) end \ No newline at end of file From af7ee993962a6c710b42c23c2fb757b09a30d25b Mon Sep 17 00:00:00 2001 From: ayushpatnaikgit Date: Thu, 5 Jan 2023 11:27:03 +0530 Subject: [PATCH 09/80] Attempt bug fix. --- src/bootstrap.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bootstrap.jl b/src/bootstrap.jl index 9dd605c1..2055b6bf 100644 --- a/src/bootstrap.jl +++ b/src/bootstrap.jl @@ -30,7 +30,7 @@ function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwis substrata = DataFrame(stratified[h]) psus = unique(substrata[!, design.cluster]) if length(psus) <= 1 - return DataFrame(statistic = X, SE = 0) # bug! 
+ stratified[h].whij .= 0 # hasn't been tested yet. end nh = length(psus) randinds = rand(rng, 1:(nh), (nh-1)) # Main bootstrap algo. Draw nh-1 out of nh, with replacement. From 9c0b68547be12744fa4390e5be8554615f89c4d7 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Mon, 9 Jan 2023 18:11:31 +0530 Subject: [PATCH 10/80] Fix and add tests for SRS for `total` --- test/total.jl | 144 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 129 insertions(+), 15 deletions(-) diff --git a/test/total.jl b/test/total.jl index 5825a319..4c3788a8 100644 --- a/test/total.jl +++ b/test/total.jl @@ -5,11 +5,20 @@ apisrs = copy(apisrs_original) srs = SurveyDesign(apisrs; weights = :pw) |> bootweights tot = total(:api00, srs) - @test tot.total[1] ≈ 4.06688749e6 atol = 1e-4 - @test tot.SE[1] ≈ 60518.199 atol = 1e-1 - # TODO: uncomment after correcting `total` function - # @test tot.total[1] ≈ 131317 atol = 1 - # @test tot.SE[1] ≈ 1880.6 atol = 1e-1 + @test tot.total[1] ≈ 4066888 rtol = 1e-5 + @test tot.SE[1] ≈ 58526 rtol = 1e-1 + mn = mean(:api00, srs) + @test mn.mean[1] ≈ 656.58 rtol = 1e-5 + @test mn.SE[1] ≈ 9.4488 rtol = 1e-1 + # equivalent R code and results: + # > srs <- svydesign(data=apisrs, id=~1, weights=~pw) + # > srsrep <- as.svrepdesign(srs, type="bootstrap", replicates=4000) + # > svytotal(~api00, srsrep) + # total SE + # api00 4066888 58526 + # > svymean(~api00, srsrep) + # mean SE + # api00 656.58 9.4488 # CategoricalArray # apisrs = copy(apisrs_original) @@ -24,24 +33,129 @@ # Vector{Symbol} apisrs = copy(apisrs_original) - srs = SurveyDesign(apisrs; popsize = :fpc) |> bootweights + srs = SurveyDesign(apisrs; weights = :pw) |> bootweights tot = total([:api00, :enroll], srs) + mn = mean([:api00, :enroll], srs) ## :api00 - @test tot.total[1] ≈ 4066888 atol = 1 - @test tot.SE[1] ≈ 60518.199 atol = 1 + @test tot.total[1] ≈ 4066888 rtol = 1e-5 + @test tot.SE[1] ≈ 57502 rtol = 1e-1 + @test mn.mean[1] ≈ 656.58 rtol = 1e-5 + @test mn.SE[1] ≈ 9.2835 rtol = 1e-1 
## :enroll - @test tot.total[2] ≈ 3621074 atol = 1 - @test tot.SE[2] ≈ 173784.343 atol = 1 + @test tot.total[2] ≈ 3621074 rtol = 1e-5 + @test tot.SE[2] ≈ 176793 rtol = 1e-1 + @test mn.mean[2] ≈ 584.61 rtol = 1e-5 + @test mn.SE[2] ≈ 28.5427 rtol = 1e-1 + # equivalent R code and results: + # > srs <- svydesign(data=apisrs, id=~1, weights=~pw) + # > srsrep <- as.svrepdesign(srs, type="bootstrap", replicates=4000) + # > svytotal(~api00+~enroll, srsrep) + # total SE + # api00 4066888 57502 + # enroll 3621074 176793 + # > svymean(~api00+~enroll, srsrep) + # mean SE + # api00 656.58 9.2835 + # enroll 584.61 28.5427 # subpopulation apisrs = copy(apisrs_original) - srs = SurveyDesign(apisrs; popsize = :fpc) |> bootweights + srs = SurveyDesign(apisrs; weights = :pw) |> bootweights tot = total(:api00, :cname, srs) @test size(tot)[1] == apisrs.cname |> unique |> length - @test filter(:cname => ==("Los Angeles"), tot).total[1] ≈ 917238.49 atol = 1e-2 - @test filter(:cname => ==("Los Angeles"), tot).SE[1] ≈ 122366.33 atol = 1e-2 - @test filter(:cname => ==("Monterey"), tot).total[1] ≈ 74947.40 atol = 1e-2 - @test filter(:cname => ==("Monterey"), tot).SE[1] ≈ 38178.35 atol = 1e-2 + @test filter(:cname => ==("Los Angeles"), tot).total[1] ≈ 917238.49 rtol = 1e-5 + @test filter(:cname => ==("Los Angeles"), tot).SE[1] ≈ 122193.02 rtol = 1e-1 + @test filter(:cname => ==("Monterey"), tot).total[1] ≈ 74947.40 rtol = 1e-5 + @test filter(:cname => ==("Monterey"), tot).SE[1] ≈ 38862.71 rtol = 1e-1 + mn = mean(:api00, :cname, srs) + @test size(mn)[1] == apisrs.cname |> unique |> length + @test filter(:cname => ==("Los Angeles"), mn).mean[1] ≈ 658.1556 rtol = 1e-5 + @test filter(:cname => ==("Los Angeles"), mn).SE[1] ≈ 2.126852e+01 rtol = 1e-1 + @test filter(:cname => ==("Santa Clara"), mn).mean[1] ≈ 718.2857 rtol = 1e-5 + @test filter(:cname => ==("Santa Clara"), mn).SE[1] ≈ 5.835346e+01 rtol = 1e-1 + # equivalent R code and results: + # > srs <- svydesign(data=apisrs, id=~1, weights=~pw) + 
# > srsrep <- as.svrepdesign(srs, type="bootstrap", replicates=4000) + # > svyby(~api00, ~cname, srsrep, svytotal) + # cname api00 se + # Alameda Alameda 230323.89 67808.91 + # Calaveras Calaveras 24466.30 24199.26 + # Contra Costa Contra Costa 213538.15 68780.65 + # Fresno Fresno 148717.94 54174.78 + # Imperial Imperial 19263.34 19292.34 + # Kern Kern 177643.92 56429.75 + # Kings Kings 29080.83 20659.88 + # Lake Lake 24899.88 24796.24 + # Lassen Lassen 23289.44 23150.91 + # Los Angeles Los Angeles 917238.49 122193.02 + # Madera Madera 44596.80 25684.62 + # Marin Marin 74297.03 43018.64 + # Merced Merced 18427.15 18057.21 + # Modoc Modoc 20780.87 20977.35 + # Monterey Monterey 74947.40 38862.71 + # Napa Napa 45030.38 31747.05 + # Orange Orange 208861.68 66824.94 + # Placer Placer 23506.23 23426.32 + # Riverside Riverside 177860.71 55697.57 + # Sacramento Sacramento 152620.16 53266.09 + # San Bernardino San Bernardino 247388.36 66806.58 + # San Diego San Diego 254387.58 71730.93 + # San Francisco San Francisco 51874.75 29597.88 + # San Joaquin San Joaquin 113102.44 46195.42 + # San Luis Obispo San Luis Obispo 22886.83 22984.23 + # San Mateo San Mateo 38216.98 27075.67 + # Santa Barbara Santa Barbara 67700.42 38550.72 + # Santa Clara Santa Clara 155717.16 58101.15 + # Santa Cruz Santa Cruz 58006.81 34633.27 + # Shasta Shasta 46702.76 32882.09 + # Siskiyou Siskiyou 21648.03 21667.03 + # Solano Solano 57882.93 33095.96 + # Sonoma Sonoma 19511.10 19782.71 + # Stanislaus Stanislaus 68412.73 39997.43 + # Sutter Sutter 23041.68 22738.16 + # Tulare Tulare 41128.16 28933.90 + # Ventura Ventura 115177.43 51200.56 + # Yolo Yolo 14710.75 14676.49 + # > svyby(~api00, ~cname, srsrep, svymean) + # cname api00 se + # Alameda Alameda 676.0909 3.522082e+01 + # Calaveras Calaveras 790.0000 0.000000e+00 + # Contra Costa Contra Costa 766.1111 5.435054e+01 + # Fresno Fresno 600.2500 5.811781e+01 + # Imperial Imperial 622.0000 0.000000e+00 + # Kern Kern 573.6000 4.634744e+01 + # Kings 
Kings 469.5000 4.264356e+01 + # Lake Lake 804.0000 0.000000e+00 + # Lassen Lassen 752.0000 0.000000e+00 + # Los Angeles Los Angeles 658.1556 2.126852e+01 + # Madera Madera 480.0000 3.461786e+00 + # Marin Marin 799.6667 3.509912e+01 + # Merced Merced 595.0000 0.000000e+00 + # Modoc Modoc 671.0000 0.000000e+00 + # Monterey Monterey 605.0000 8.356655e+01 + # Napa Napa 727.0000 4.770914e+01 + # Orange Orange 749.3333 2.876956e+01 + # Placer Placer 759.0000 0.000000e+00 + # Riverside Riverside 574.3000 2.789294e+01 + # Sacramento Sacramento 616.0000 3.785063e+01 + # San Bernardino San Bernardino 614.4615 2.985197e+01 + # San Diego San Diego 684.5000 3.254291e+01 + # San Francisco San Francisco 558.3333 4.404227e+01 + # San Joaquin San Joaquin 608.6667 4.153241e+01 + # San Luis Obispo San Luis Obispo 739.0000 2.691382e-14 + # San Mateo San Mateo 617.0000 7.352923e+01 + # Santa Barbara Santa Barbara 728.6667 2.551393e+01 + # Santa Clara Santa Clara 718.2857 5.835346e+01 + # Santa Cruz Santa Cruz 624.3333 1.131098e+02 + # Shasta Shasta 754.0000 5.731963e+01 + # Siskiyou Siskiyou 699.0000 0.000000e+00 + # Solano Solano 623.0000 4.541173e+01 + # Sonoma Sonoma 630.0000 0.000000e+00 + # Stanislaus Stanislaus 736.3333 5.176843e+00 + # Sutter Sutter 744.0000 0.000000e+00 + # Tulare Tulare 664.0000 2.061011e+01 + # Ventura Ventura 743.8000 3.153839e+01 + # Yolo Yolo 475.0000 0.000000e+00 end @testset "total_Stratified" begin From a5f3a4e13e858041b8ea4c2dd3460fcc014eae43 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Mon, 9 Jan 2023 18:44:17 +0530 Subject: [PATCH 11/80] Fix and add tests for Stratified, add constants for tolerances --- test/total.jl | 213 ++++++++++++++++---------------------------------- 1 file changed, 69 insertions(+), 144 deletions(-) diff --git a/test/total.jl b/test/total.jl index 4c3788a8..9b7c8e56 100644 --- a/test/total.jl +++ b/test/total.jl @@ -1,3 +1,6 @@ +const STAT_TOL = 1e-5 +const SE_TOL = 1e-1 + @testset "Simple random sample" begin 
apisrs_original = load_data("apisrs") @@ -5,11 +8,11 @@ apisrs = copy(apisrs_original) srs = SurveyDesign(apisrs; weights = :pw) |> bootweights tot = total(:api00, srs) - @test tot.total[1] ≈ 4066888 rtol = 1e-5 - @test tot.SE[1] ≈ 58526 rtol = 1e-1 + @test tot.total[1] ≈ 4066888 rtol = STAT_TOL + @test tot.SE[1] ≈ 58526 rtol = SE_TOL mn = mean(:api00, srs) - @test mn.mean[1] ≈ 656.58 rtol = 1e-5 - @test mn.SE[1] ≈ 9.4488 rtol = 1e-1 + @test mn.mean[1] ≈ 656.58 rtol = STAT_TOL + @test mn.SE[1] ≈ 9.4488 rtol = SE_TOL # equivalent R code and results: # > srs <- svydesign(data=apisrs, id=~1, weights=~pw) # > srsrep <- as.svrepdesign(srs, type="bootstrap", replicates=4000) @@ -20,35 +23,20 @@ # mean SE # api00 656.58 9.4488 - # CategoricalArray - # apisrs = copy(apisrs_original) - # apisrs[!, :cname] = CategoricalArrays.categorical(apisrs.cname) - # srs = SurveyDesign(apisrs; popsize = :fpc) - # tot = total(:cname, srs) - # @test size(tot)[1] == apisrs.cname |> unique |> length - # @test filter(:cname => ==("Alameda"), tot).total[1] ≈ 340.67 atol = 1e-2 - # @test filter(:cname => ==("Alameda"), tot).SE[1] ≈ 98.472 atol = 1e-3 - # @test filter(:cname => ==("Los Angeles"), tot).total[1] ≈ 1393.65 atol = 1e-2 - # @test filter(:cname => ==("Los Angeles"), tot).SE[1] ≈ 180.368 atol = 1e-3 - # Vector{Symbol} - apisrs = copy(apisrs_original) - srs = SurveyDesign(apisrs; weights = :pw) |> bootweights tot = total([:api00, :enroll], srs) mn = mean([:api00, :enroll], srs) ## :api00 - @test tot.total[1] ≈ 4066888 rtol = 1e-5 - @test tot.SE[1] ≈ 57502 rtol = 1e-1 - @test mn.mean[1] ≈ 656.58 rtol = 1e-5 - @test mn.SE[1] ≈ 9.2835 rtol = 1e-1 + @test tot.total[1] ≈ 4066888 rtol = STAT_TOL + @test tot.SE[1] ≈ 57502 rtol = SE_TOL + @test mn.mean[1] ≈ 656.58 rtol = STAT_TOL + @test mn.SE[1] ≈ 9.2835 rtol = SE_TOL ## :enroll - @test tot.total[2] ≈ 3621074 rtol = 1e-5 - @test tot.SE[2] ≈ 176793 rtol = 1e-1 - @test mn.mean[2] ≈ 584.61 rtol = 1e-5 - @test mn.SE[2] ≈ 28.5427 rtol = 1e-1 + 
@test tot.total[2] ≈ 3621074 rtol = STAT_TOL + @test tot.SE[2] ≈ 176793 rtol = SE_TOL + @test mn.mean[2] ≈ 584.61 rtol = STAT_TOL + @test mn.SE[2] ≈ 28.5427 rtol = SE_TOL # equivalent R code and results: - # > srs <- svydesign(data=apisrs, id=~1, weights=~pw) - # > srsrep <- as.svrepdesign(srs, type="bootstrap", replicates=4000) # > svytotal(~api00+~enroll, srsrep) # total SE # api00 4066888 57502 @@ -59,146 +47,83 @@ # enroll 584.61 28.5427 # subpopulation - apisrs = copy(apisrs_original) - srs = SurveyDesign(apisrs; weights = :pw) |> bootweights tot = total(:api00, :cname, srs) @test size(tot)[1] == apisrs.cname |> unique |> length - @test filter(:cname => ==("Los Angeles"), tot).total[1] ≈ 917238.49 rtol = 1e-5 - @test filter(:cname => ==("Los Angeles"), tot).SE[1] ≈ 122193.02 rtol = 1e-1 - @test filter(:cname => ==("Monterey"), tot).total[1] ≈ 74947.40 rtol = 1e-5 - @test filter(:cname => ==("Monterey"), tot).SE[1] ≈ 38862.71 rtol = 1e-1 + @test filter(:cname => ==("Los Angeles"), tot).total[1] ≈ 917238.49 rtol = STAT_TOL + @test filter(:cname => ==("Los Angeles"), tot).SE[1] ≈ 122193.02 rtol = SE_TOL + @test filter(:cname => ==("Monterey"), tot).total[1] ≈ 74947.40 rtol = STAT_TOL + @test filter(:cname => ==("Monterey"), tot).SE[1] ≈ 38862.71 rtol = SE_TOL mn = mean(:api00, :cname, srs) @test size(mn)[1] == apisrs.cname |> unique |> length - @test filter(:cname => ==("Los Angeles"), mn).mean[1] ≈ 658.1556 rtol = 1e-5 - @test filter(:cname => ==("Los Angeles"), mn).SE[1] ≈ 2.126852e+01 rtol = 1e-1 - @test filter(:cname => ==("Santa Clara"), mn).mean[1] ≈ 718.2857 rtol = 1e-5 - @test filter(:cname => ==("Santa Clara"), mn).SE[1] ≈ 5.835346e+01 rtol = 1e-1 - # equivalent R code and results: - # > srs <- svydesign(data=apisrs, id=~1, weights=~pw) - # > srsrep <- as.svrepdesign(srs, type="bootstrap", replicates=4000) + @test filter(:cname => ==("Los Angeles"), mn).mean[1] ≈ 658.1556 rtol = STAT_TOL + @test filter(:cname => ==("Los Angeles"), mn).SE[1] ≈ 
2.126852e+01 rtol = SE_TOL + @test filter(:cname => ==("Santa Clara"), mn).mean[1] ≈ 718.2857 rtol = STAT_TOL + @test filter(:cname => ==("Santa Clara"), mn).SE[1] ≈ 5.835346e+01 rtol = SE_TOL + # equivalent R code (results cause clutter): # > svyby(~api00, ~cname, srsrep, svytotal) - # cname api00 se - # Alameda Alameda 230323.89 67808.91 - # Calaveras Calaveras 24466.30 24199.26 - # Contra Costa Contra Costa 213538.15 68780.65 - # Fresno Fresno 148717.94 54174.78 - # Imperial Imperial 19263.34 19292.34 - # Kern Kern 177643.92 56429.75 - # Kings Kings 29080.83 20659.88 - # Lake Lake 24899.88 24796.24 - # Lassen Lassen 23289.44 23150.91 - # Los Angeles Los Angeles 917238.49 122193.02 - # Madera Madera 44596.80 25684.62 - # Marin Marin 74297.03 43018.64 - # Merced Merced 18427.15 18057.21 - # Modoc Modoc 20780.87 20977.35 - # Monterey Monterey 74947.40 38862.71 - # Napa Napa 45030.38 31747.05 - # Orange Orange 208861.68 66824.94 - # Placer Placer 23506.23 23426.32 - # Riverside Riverside 177860.71 55697.57 - # Sacramento Sacramento 152620.16 53266.09 - # San Bernardino San Bernardino 247388.36 66806.58 - # San Diego San Diego 254387.58 71730.93 - # San Francisco San Francisco 51874.75 29597.88 - # San Joaquin San Joaquin 113102.44 46195.42 - # San Luis Obispo San Luis Obispo 22886.83 22984.23 - # San Mateo San Mateo 38216.98 27075.67 - # Santa Barbara Santa Barbara 67700.42 38550.72 - # Santa Clara Santa Clara 155717.16 58101.15 - # Santa Cruz Santa Cruz 58006.81 34633.27 - # Shasta Shasta 46702.76 32882.09 - # Siskiyou Siskiyou 21648.03 21667.03 - # Solano Solano 57882.93 33095.96 - # Sonoma Sonoma 19511.10 19782.71 - # Stanislaus Stanislaus 68412.73 39997.43 - # Sutter Sutter 23041.68 22738.16 - # Tulare Tulare 41128.16 28933.90 - # Ventura Ventura 115177.43 51200.56 - # Yolo Yolo 14710.75 14676.49 # > svyby(~api00, ~cname, srsrep, svymean) - # cname api00 se - # Alameda Alameda 676.0909 3.522082e+01 - # Calaveras Calaveras 790.0000 0.000000e+00 - # Contra Costa 
Contra Costa 766.1111 5.435054e+01 - # Fresno Fresno 600.2500 5.811781e+01 - # Imperial Imperial 622.0000 0.000000e+00 - # Kern Kern 573.6000 4.634744e+01 - # Kings Kings 469.5000 4.264356e+01 - # Lake Lake 804.0000 0.000000e+00 - # Lassen Lassen 752.0000 0.000000e+00 - # Los Angeles Los Angeles 658.1556 2.126852e+01 - # Madera Madera 480.0000 3.461786e+00 - # Marin Marin 799.6667 3.509912e+01 - # Merced Merced 595.0000 0.000000e+00 - # Modoc Modoc 671.0000 0.000000e+00 - # Monterey Monterey 605.0000 8.356655e+01 - # Napa Napa 727.0000 4.770914e+01 - # Orange Orange 749.3333 2.876956e+01 - # Placer Placer 759.0000 0.000000e+00 - # Riverside Riverside 574.3000 2.789294e+01 - # Sacramento Sacramento 616.0000 3.785063e+01 - # San Bernardino San Bernardino 614.4615 2.985197e+01 - # San Diego San Diego 684.5000 3.254291e+01 - # San Francisco San Francisco 558.3333 4.404227e+01 - # San Joaquin San Joaquin 608.6667 4.153241e+01 - # San Luis Obispo San Luis Obispo 739.0000 2.691382e-14 - # San Mateo San Mateo 617.0000 7.352923e+01 - # Santa Barbara Santa Barbara 728.6667 2.551393e+01 - # Santa Clara Santa Clara 718.2857 5.835346e+01 - # Santa Cruz Santa Cruz 624.3333 1.131098e+02 - # Shasta Shasta 754.0000 5.731963e+01 - # Siskiyou Siskiyou 699.0000 0.000000e+00 - # Solano Solano 623.0000 4.541173e+01 - # Sonoma Sonoma 630.0000 0.000000e+00 - # Stanislaus Stanislaus 736.3333 5.176843e+00 - # Sutter Sutter 744.0000 0.000000e+00 - # Tulare Tulare 664.0000 2.061011e+01 - # Ventura Ventura 743.8000 3.153839e+01 - # Yolo Yolo 475.0000 0.000000e+00 end -@testset "total_Stratified" begin +@testset "Stratified sample" begin apistrat_original = load_data("apistrat") # base functionality apistrat = copy(apistrat_original) strat = SurveyDesign(apistrat; strata = :stype, weights = :pw) |> bootweights tot = total(:api00, strat) - @test tot.total[1] ≈ 4102208 atol = 10 - @test tot.SE[1] ≈ 77211.61 atol = 1e-1 - # without fpc - # TODO: uncomment after correcting `total` function - # 
@test tot.SE[1] ≈ 1690.4 atol = 1e-1 - - # CategoricalArray - # apistrat = copy(apistrat_original) - # apistrat[!, :cname] = CategoricalArrays.categorical(apistrat.cname) - # strat = StratifiedSample(apistrat, :stype; popsize = :fpc) - # TODO: uncomment after adding `CategoricalArray` support - # @test tot.SE[1] ≈ 1690.4 atol = 1e-1 - # tot = total(:cname, strat) - # @test size(tot)[1] == apistrat.cname |> unique |> length - # @test filter(:cname => ==("Kern"), tot).total[1] ≈ 291.97 atol = 1e-2 - # @test filter(:cname => ==("Kern"), tot).SE[1] ≈ 101.760 atol = 1e-3 - # @test filter(:cname => ==("Los Angeles"), tot).total[1] ≈ 1373.15 atol = 1e-2 - # @test filter(:cname => ==("Los Angeles"), tot).SE[1] ≈ 199.635 atol = 1e-3 + @test tot.total[1] ≈ 4102208 rtol = STAT_TOL + @test tot.SE[1] ≈ 60746 rtol = SE_TOL + @test mn.mean[1] ≈ 662.29 rtol = STAT_TOL + @test mn.SE[1] ≈ 9.8072 rtol = SE_TOL + # equivalent R code and results: + # > strat <- svydesign(data=apistrat, id=~1, weights=~pw, strata=~stype) + # > stratrep <- as.svrepdesign(strat, type="bootstrap", replicates=4000) + # > svytotal(~api00, stratrep) + # total SE + # api00 4102208 60746 + # > svymean(~api00, stratrep) + # mean SE + # api00 662.29 9.8072 # Vector{Symbol} tot = total([:api00, :enroll], strat) + mn = mean([:api00, :enroll], strat) ## :api00 - @test tot.total[1] ≈ 4102208 atol = 1 - @test tot.SE[1] ≈ 77211.61 atol = 1 + @test tot.total[1] ≈ 4102208 rtol = STAT_TOL + @test tot.SE[1] ≈ 60746 rtol = SE_TOL + @test mn.mean[1] ≈ 662.29 rtol = STAT_TOL + @test mn.SE[1] ≈ 9.8072 rtol = SE_TOL ## :enroll - @test tot.total[2] ≈ 3687178 atol = 1 - @test tot.SE[2] ≈ 127021.5540 atol = 1 + @test tot.total[2] ≈ 3687178 rtol = STAT_TOL + @test tot.SE[2] ≈ 117322 rtol = SE_TOL + @test mn.mean[2] ≈ 595.28 rtol = STAT_TOL + @test mn.SE[2] ≈ 18.9412 rtol = SE_TOL + # equivalent R code and results: + # > svytotal(~api00+~enroll, stratrep) + # > svymean(~api00+~enroll, stratrep) + # mean SE + # api00 662.29 9.8072 + 
# enroll 595.28 18.9412 # subpopulation - # TODO: add functionality in `src/total.jl` - # TODO: add tests + tot = total(:api00, :cname, strat) + @test size(tot)[1] == apistrat.cname |> unique |> length + @test filter(:cname => ==("Los Angeles"), tot).total[1] ≈ 869905.98 rtol = STAT_TOL + @test filter(:cname => ==("Los Angeles"), tot).SE[1] ≈ 134195.81 rtol = SE_TOL + @test filter(:cname => ==("Monterey"), tot).total[1] ≈ 72103.09 rtol = STAT_TOL + @test filter(:cname => ==("Monterey"), tot).SE[1] ≈ 45532.88 rtol = SE_TOL + mn = mean(:api00, :cname, strat) + @test size(mn)[1] == apistrat.cname |> unique |> length + @test filter(:cname => ==("Los Angeles"), mn).mean[1] ≈ 633.5113 rtol = STAT_TOL + @test filter(:cname => ==("Los Angeles"), mn).SE[1] ≈ 21.681068 rtol = SE_TOL + @test filter(:cname => ==("Santa Clara"), mn).mean[1] ≈ 664.1212 rtol = STAT_TOL + @test filter(:cname => ==("Santa Clara"), mn).SE[1] ≈ 48.817277 rtol = SE_TOL + # equivalent R code (results cause clutter): + # > svyby(~api00, ~cname, stratrep, svytotal) + # > svyby(~api00, ~cname, stratrep, svymean) end -@testset "total_OneStageClusterSample" begin +@testset "One stage cluster sample" begin # Load API datasets apiclus1_original = load_data("apiclus1") apiclus1_original[!, :pw] = fill(757/15,(size(apiclus1_original,1),)) # Correct api mistake for pw column From 87c11dc01c23e55da98dcfd2cfe6dc58b573d72f Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Mon, 9 Jan 2023 19:55:15 +0530 Subject: [PATCH 12/80] Fix and add tests for Cluster and minor reordering --- test/total.jl | 87 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 66 insertions(+), 21 deletions(-) diff --git a/test/total.jl b/test/total.jl index 9b7c8e56..6ac6ab06 100644 --- a/test/total.jl +++ b/test/total.jl @@ -1,12 +1,11 @@ const STAT_TOL = 1e-5 const SE_TOL = 1e-1 -@testset "Simple random sample" begin - apisrs_original = load_data("apisrs") +@testset "total SRS" begin + apisrs = load_data("apisrs") + srs = 
SurveyDesign(apisrs; weights = :pw) |> bootweights # base functionality - apisrs = copy(apisrs_original) - srs = SurveyDesign(apisrs; weights = :pw) |> bootweights tot = total(:api00, srs) @test tot.total[1] ≈ 4066888 rtol = STAT_TOL @test tot.SE[1] ≈ 58526 rtol = SE_TOL @@ -64,15 +63,15 @@ const SE_TOL = 1e-1 # > svyby(~api00, ~cname, srsrep, svymean) end -@testset "Stratified sample" begin - apistrat_original = load_data("apistrat") +@testset "total Stratified" begin + apistrat = load_data("apistrat") + strat = SurveyDesign(apistrat; strata = :stype, weights = :pw) |> bootweights # base functionality - apistrat = copy(apistrat_original) - strat = SurveyDesign(apistrat; strata = :stype, weights = :pw) |> bootweights tot = total(:api00, strat) @test tot.total[1] ≈ 4102208 rtol = STAT_TOL @test tot.SE[1] ≈ 60746 rtol = SE_TOL + mn = mean(:api00, strat) @test mn.mean[1] ≈ 662.29 rtol = STAT_TOL @test mn.SE[1] ≈ 9.8072 rtol = SE_TOL # equivalent R code and results: @@ -123,18 +122,64 @@ end # > svyby(~api00, ~cname, stratrep, svymean) end -@testset "One stage cluster sample" begin - # Load API datasets - apiclus1_original = load_data("apiclus1") - apiclus1_original[!, :pw] = fill(757/15,(size(apiclus1_original,1),)) # Correct api mistake for pw column - ############################## - # one-stage cluster sample - apiclus1 = copy(apiclus1_original) - dclus1 = SurveyDesign(apiclus1, clusters = :dnum, weights = :pw) |> bootweights - @test total(:api00,dclus1).total[1] ≈ 5949162 atol = 1 - @test total(:api00,dclus1).SE[1] ≈ 1.3338978891316957e6 atol = 1 +@testset "total Cluster" begin + apiclus1 = load_data("apiclus1") + clus1 = SurveyDesign(apiclus1, clusters = :dnum, weights = :pw) |> bootweights + + # base functionality + tot = total(:api00, clus1) + @test tot.total[1] ≈ 3989986 rtol = STAT_TOL + @test tot.SE[1] ≈ 900323 rtol = SE_TOL + mn = mean(:api00, clus1) + @test mn.mean[1] ≈ 644.17 rtol = STAT_TOL + @test mn.SE[1] ≈ 23.534 rtol = SE_TOL + # equivalent R code 
and results: + # > clus1 <- svydesign(data=apiclus1, id=~dnum, weights=~pw) + # > clus1rep <- as.svrepdesign(clus1, type="bootstrap", replicates=4000) + # > svytotal(~api00, clus1rep) + # total SE + # api00 3989986 900323 + # > svymean(~api00, clus1rep) + # mean SE + # api00 644.17 23.534 - @test total(:api00, dclus1).total[1] ≈ 5949162 atol = 1 - @test total(:api00, dclus1).SE[1] ≈ 1352953 atol = 50000 # without fpc as it hasn't been figured out for bootstrap. - + # Vector{Symbol} + tot = total([:api00, :enroll], clus1) + mn = mean([:api00, :enroll], clus1) + ## :api00 + @test tot.total[1] ≈ 3989986 rtol = STAT_TOL + @test tot.SE[1] ≈ 900323 rtol = SE_TOL + @test mn.mean[1] ≈ 644.17 rtol = STAT_TOL + @test mn.SE[1] ≈ 23.534 rtol = SE_TOL + ## :enroll + @test tot.total[2] ≈ 3404940 rtol = STAT_TOL + @test tot.SE[2] ≈ 941501 rtol = SE_TOL + @test mn.mean[2] ≈ 549.72 rtol = STAT_TOL + @test mn.SE[2] ≈ 46.070 rtol = SE_TOL + # equivalent R code and results: + # > svytotal(~api00+~enroll, clus1rep) + # total SE + # api00 3989986 900323 + # enroll 3404940 941501 + # > svymean(~api00+~enroll, clus1rep) + # mean SE + # api00 644.17 23.534 + # enroll 549.72 46.070 + + # subpopulation + tot = total(:api00, :cname, clus1) + @test size(tot)[1] == apiclus1.cname |> unique |> length + @test filter(:cname => ==("Los Angeles"), tot).total[1] ≈ 328620.49 rtol = STAT_TOL + @test filter(:cname => ==("Los Angeles"), tot).SE[1] ≈ 292840.83 rtol = SE_TOL + @test filter(:cname => ==("San Diego"), tot).total[1] ≈ 1227596.71 rtol = STAT_TOL + @test filter(:cname => ==("San Diego"), tot).SE[1] ≈ 860028.39 rtol = SE_TOL + mn = mean(:api00, :cname, clus1) + @test size(mn)[1] == apiclus1.cname |> unique |> length + @test filter(:cname => ==("Los Angeles"), mn).mean[1] ≈ 647.2667 rtol = STAT_TOL + @test filter(:cname => ==("Los Angeles"), mn).SE[1] ≈ 41.537132 rtol = 1 # tolerance is too large + @test filter(:cname => ==("Santa Clara"), mn).mean[1] ≈ 732.0769 rtol = STAT_TOL + @test 
filter(:cname => ==("Santa Clara"), mn).SE[1] ≈ 52.336574 rtol = SE_TOL + # equivalent R code (results cause clutter): + # > svyby(~api00, ~cname, clus1rep, svytotal) + # > svyby(~api00, ~cname, clus1rep, svymean) end \ No newline at end of file From 734f1bbb78862d0f04c9a4c09cb60cce52c648c2 Mon Sep 17 00:00:00 2001 From: smishr <43640926+smishr@users.noreply.github.com> Date: Tue, 10 Jan 2023 15:52:53 +0530 Subject: [PATCH 13/80] Update survey design, add tests, remove extra quantile --- src/SurveyDesign.jl | 39 +++++++++++++++++++++++++-------------- src/quantile.jl | 9 --------- src/show.jl | 8 ++++---- test/SurveyDesign.jl | 41 +++++++++++++++++++++++++++++++++++++++-- 4 files changed, 68 insertions(+), 29 deletions(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index 1541a3b9..7b68ea86 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -33,15 +33,14 @@ julia> apiclus1[!, :pw] = fill(757/15,(size(apiclus1,1),)); # Correct api mistak julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) SurveyDesign: -data: 183x46 DataFrame +data: 183x44 DataFrame cluster: dnum design.data[!,design.cluster]: 637, 637, 637, ..., 448 popsize: popsize design.data[!,design.popsize]: 9240.0, 9240.0, 9240.0, ..., 9240.0 sampsize: sampsize design.data[!,design.sampsize]: 15, 15, 15, ..., 15 -design.data[!,:probs]: 0.0198, 0.0198, 0.0198, ..., 0.0198 -design.data[!,:allprobs]: 0.0198, 0.0198, 0.0198, ..., 0.0198 +design.data[!,design.allprobs]: 0.0198, 0.0198, 0.0198, ..., 0.0198 ``` """ struct SurveyDesign <: AbstractSurveyDesign @@ -50,9 +49,15 @@ struct SurveyDesign <: AbstractSurveyDesign popsize::Symbol sampsize::Symbol strata::Symbol - pps::Bool + weights::Symbol # Effective weights in case of singlestage approx supported + allprobs::Symbol # Right now only singlestage approx supported + pps::Bool # TODO functionality # Single stage clusters sample, like apiclus1 - function SurveyDesign(data::AbstractDataFrame; strata::Union{Nothing,Symbol} = 
nothing, weights::Union{Nothing,Symbol}= nothing, clusters::Union{Nothing, Symbol, Vector{Symbol}} = nothing, popsize::Union{Nothing, Int,Symbol}=nothing) + function SurveyDesign(data::AbstractDataFrame; + strata::Union{Nothing,Symbol} = nothing, + weights::Union{Nothing,Symbol}= nothing, + clusters::Union{Nothing, Symbol, Vector{Symbol}} = nothing, + popsize::Union{Nothing, Int,Symbol}=nothing) # sampsize here is number of clusters completely sampled, popsize is total clusters in population if typeof(strata) <:Nothing data.false_strata = repeat(["FALSE_STRATA"], nrow(data)) @@ -73,20 +78,26 @@ struct SurveyDesign <: AbstractSurveyDesign sampsize_labels = :sampsize data[!, sampsize_labels] = fill(length(unique(data[!, cluster])),(nrow(data),)) if !(typeof(popsize) <: Nothing) - data[!, :weights] = data[!, popsize] ./ data[!, sampsize_labels] - elseif !(typeof(weights) <: Nothing) - data.weights = data[!, weights] + weights_labels = :weights + data[!, weights_labels] = data[!, popsize] ./ data[!, sampsize_labels] + elseif typeof(weights) <: Symbol + if !(typeof(data[!,weights]) <:Vector{<:Real}) + @show typeof(data[!,weights]) + error("weights column has to be numeric") + end + weights_labels = weights else - data.weights = repeat([1], nrow(data)) + weights_labels = :weights + data[!,weights_labels] = repeat([1], nrow(data)) end - data[!, :probs] = 1 ./ data[!, :weights] # Many formulae are easily defined in terms of sampling probabilties - data[!, :allprobs] = data[!, :probs] # In one-stage cluster sample, allprobs is just probs, no multiplication needed - pps = false + allprobs_labels = :allprobs + data[!, allprobs_labels] = 1 ./ data[!, weights_labels] # In one-stage cluster sample, allprobs is just probs, no multiplication needed + pps = false # for now no explicit pps support if !(typeof(popsize) <: Symbol) - data.popsize = repeat([sum(data.weights)], nrow(data)) + data.popsize = repeat([sum(data[!,weights_labels])], nrow(data)) popsize = :popsize end - 
new(data, cluster, popsize, sampsize_labels, strata, pps) + new(data, cluster, popsize, sampsize_labels, strata, weights_labels, allprobs_labels, pps) end end diff --git a/src/quantile.jl b/src/quantile.jl index 1cc9646c..d4e399a5 100644 --- a/src/quantile.jl +++ b/src/quantile.jl @@ -41,13 +41,4 @@ function quantile(var::Symbol, design::SurveyDesign, p::Union{<:Real,Vector{<:Re df = DataFrame(probability=p, quantile=Statistics.quantile(v, ProbabilityWeights(probs), p)) # TODO: Add CI and SE of the quantile return df -end - -function quantile(var::Symbol, design::SurveyDesign, p::Union{<:Real,Vector{<:Real}}; - alpha::Float64=0.05, ci::Bool=false, se::Bool=false, qrule="hf7",kwargs...) - v = design.data[!, var] - probs = design.data[!, :probs] - df = DataFrame(probability=p, quantile=Statistics.quantile(v, ProbabilityWeights(probs), p)) # Not sure which quantile defintion this returns - # TODO: Add CI and SE of the quantile - return df end \ No newline at end of file diff --git a/src/show.jl b/src/show.jl index 3319e653..bb37059c 100644 --- a/src/show.jl +++ b/src/show.jl @@ -46,8 +46,8 @@ function Base.show(io::IO, ::MIME"text/plain", design::SurveyDesign) printinfo(io, "design.data[!,design.popsize]", makeshort(design.data[!,design.popsize])) printinfo(io, "sampsize", string(design.sampsize); newline=true) printinfo(io, "design.data[!,design.sampsize]", makeshort(design.data[!,design.sampsize])) - printinfo(io, "design.data[!,:probs]", makeshort(design.data.probs)) - printinfo(io, "design.data[!,:allprobs]", makeshort(design.data.allprobs)) + # printinfo(io, "design.data[!,:probs]", makeshort(design.data.probs)) + printinfo(io, "design.data[!,design.allprobs]", makeshort(design.data.allprobs)) end "Print information about a repliocate design." 
@@ -62,8 +62,8 @@ function Base.show(io::IO, ::MIME"text/plain", design::ReplicateDesign) printinfo(io, "design.data[!,design.popsize]", makeshort(design.data[!,design.popsize])) printinfo(io, "sampsize", string(design.sampsize); newline=true) printinfo(io, "design.data[!,design.sampsize]", makeshort(design.data[!,design.sampsize])) - printinfo(io, "design.data[!,:probs]", makeshort(design.data.probs)) - printinfo(io, "design.data[!,:allprobs]", makeshort(design.data.allprobs)) + # printinfo(io, "design.data[!,:probs]", makeshort(design.data.probs)) + printinfo(io, "design.data[!,design.allprobs]", makeshort(design.data.allprobs)) printstyled(io, "replicates: "; bold=true) println(io, design.replicates) end \ No newline at end of file diff --git a/test/SurveyDesign.jl b/test/SurveyDesign.jl index 6851f4e8..adc927e3 100644 --- a/test/SurveyDesign.jl +++ b/test/SurveyDesign.jl @@ -1,4 +1,35 @@ -@testset "SurveyDesign" begin +@testset "SurveyDesign_srs" begin + ##### Simple Random Sample tests + # Load API datasets + apisrs_original = load_data("apisrs") + apisrs_original[!, :derived_probs] = 1 ./ apisrs_original.pw + apisrs_original[!, :derived_sampsize] = fill(200.0, size(apisrs_original, 1)) + ############################## + ### Basic functionality + ### weights as Symbol + apisrs = copy(apisrs_original) + srs_weights = SurveyDesign(apisrs, weights=:pw) + @test srs_weights.data[!,srs_weights.weights][1] ≈ 30.97 atol = 1e-4 + @test srs_weights.data[!,srs_weights.weights] == 1 ./ srs_weights.data[!,srs_weights.allprobs] + ############################## + ### Weights as non-numeric error + apisrs = copy(apisrs_original) + @test_throws ErrorException SurveyDesign(apisrs, weights=:stype) +end + +@testset "SurveyDesign_strat" begin + ### StratifiedSample tests + # Load API datasets + apistrat_original = load_data("apistrat") + apistrat_original[!, :derived_probs] = 1 ./ apistrat_original.pw + apistrat_original[!, :derived_sampsize] = apistrat_original.fpc ./ 
apistrat_original.pw + + apistrat = copy(apistrat_original) + strat = SurveyDesign(apistrat, strata = :stype, weights = :pw) |> bootweights + +end + +@testset "SurveyDesign_multistage" begin # Load API datasets apiclus1_original = load_data("apiclus1") apiclus1_original[!, :pw] = fill(757/15,(size(apiclus1_original,1),)) # Correct api mistake for pw column @@ -9,4 +40,10 @@ @test dclus1.data[!, :weights] ≈ fill(50.4667,size(apiclus1,1)) atol = 1e-3 @test dclus1.data[!,dclus1.sampsize] ≈ fill(15,size(apiclus1,1)) @test dclus1.data[!,:allprobs] ≈ dclus1.data[!,:probs] atol = 1e-4 -end \ No newline at end of file + + ############################## + # Load API datasets + nhanes = load_data("nhanes") + nhanes_design = SurveyDesign(nhanes; clusters = :SDMVPSU, strata = :SDMVSTRA, weights = :WTMEC2YR) +end + From 547725ffb6d111ff0e9ce2de2ddc88bbe2f7b98b Mon Sep 17 00:00:00 2001 From: smishr <43640926+smishr@users.noreply.github.com> Date: Tue, 10 Jan 2023 16:07:28 +0530 Subject: [PATCH 14/80] julia formatter --- src/SurveyDesign.jl | 21 +++++++++++---------- test/SurveyDesign.jl | 6 ++++++ 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index 7b68ea86..69fe3b51 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -54,12 +54,13 @@ struct SurveyDesign <: AbstractSurveyDesign pps::Bool # TODO functionality # Single stage clusters sample, like apiclus1 function SurveyDesign(data::AbstractDataFrame; - strata::Union{Nothing,Symbol} = nothing, - weights::Union{Nothing,Symbol}= nothing, - clusters::Union{Nothing, Symbol, Vector{Symbol}} = nothing, - popsize::Union{Nothing, Int,Symbol}=nothing) + clusters::Union{Nothing,Symbol,Vector{Symbol}}=nothing, + strata::Union{Nothing,Symbol}=nothing, + popsize::Union{Nothing,Int,Symbol}=nothing, + weights::Union{Nothing,Symbol}=nothing + ) # sampsize here is number of clusters completely sampled, popsize is total clusters in population - if typeof(strata) <:Nothing + if 
typeof(strata) <: Nothing data.false_strata = repeat(["FALSE_STRATA"], nrow(data)) strata = :false_strata end @@ -76,25 +77,25 @@ struct SurveyDesign <: AbstractSurveyDesign end # For one-stage sample only one sampsize vector sampsize_labels = :sampsize - data[!, sampsize_labels] = fill(length(unique(data[!, cluster])),(nrow(data),)) + data[!, sampsize_labels] = fill(length(unique(data[!, cluster])), (nrow(data),)) if !(typeof(popsize) <: Nothing) weights_labels = :weights data[!, weights_labels] = data[!, popsize] ./ data[!, sampsize_labels] elseif typeof(weights) <: Symbol - if !(typeof(data[!,weights]) <:Vector{<:Real}) - @show typeof(data[!,weights]) + if !(typeof(data[!, weights]) <: Vector{<:Real}) + @show typeof(data[!, weights]) error("weights column has to be numeric") end weights_labels = weights else weights_labels = :weights - data[!,weights_labels] = repeat([1], nrow(data)) + data[!, weights_labels] = repeat([1], nrow(data)) end allprobs_labels = :allprobs data[!, allprobs_labels] = 1 ./ data[!, weights_labels] # In one-stage cluster sample, allprobs is just probs, no multiplication needed pps = false # for now no explicit pps support if !(typeof(popsize) <: Symbol) - data.popsize = repeat([sum(data[!,weights_labels])], nrow(data)) + data.popsize = repeat([sum(data[!, weights_labels])], nrow(data)) popsize = :popsize end new(data, cluster, popsize, sampsize_labels, strata, weights_labels, allprobs_labels, pps) diff --git a/test/SurveyDesign.jl b/test/SurveyDesign.jl index adc927e3..90989918 100644 --- a/test/SurveyDesign.jl +++ b/test/SurveyDesign.jl @@ -11,10 +11,16 @@ srs_weights = SurveyDesign(apisrs, weights=:pw) @test srs_weights.data[!,srs_weights.weights][1] ≈ 30.97 atol = 1e-4 @test srs_weights.data[!,srs_weights.weights] == 1 ./ srs_weights.data[!,srs_weights.allprobs] + ### popsize as Symbol + apisrs = copy(apisrs_original) + srs_pop = SurveyDesign(apisrs, popsize=:fpc) + @test srs_pop.data[!,srs_pop.weights][1] ≈ 30.97 atol = 1e-4 + @test 
srs_pop.data[!,srs_pop.weights] == 1 ./ srs_pop.data[!,srs_pop.allprobs] ############################## ### Weights as non-numeric error apisrs = copy(apisrs_original) @test_throws ErrorException SurveyDesign(apisrs, weights=:stype) + end @testset "SurveyDesign_strat" begin From 81b77e8c55fd3020585354615fa1358bb24476aa Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Tue, 10 Jan 2023 16:52:42 +0530 Subject: [PATCH 15/80] Remove references to old designs, add references to new functions --- docs/src/api.md | 29 +++++++++-------------------- docs/src/index.md | 41 +++++------------------------------------ src/Survey.jl | 5 ++--- 3 files changed, 16 insertions(+), 59 deletions(-) diff --git a/docs/src/api.md b/docs/src/api.md index 062d379d..5431b9ae 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -7,27 +7,19 @@ Module = [Survey] Order = [:type, :function] Private = false ``` -Survey data can be loaded from a `DataFrame` into a survey design. The package currently supports simple random sample and stratified sample designs. -```@docs -AbstractSurveyDesign -SimpleRandomSample -StratifiedSample -``` ```@docs +AbstractSurveyDesign +SurveyDesign +ReplicateDesign load_data -Survey.mean(x::Symbol, design::SimpleRandomSample) -total(x::Symbol, design::SimpleRandomSample) +bootweights +mean(x::Symbol, design::ReplicateDesign) +mean(x::Symbol, domain::Symbol, design::ReplicateDesign) +total(x::Symbol, design::ReplicateDesign) +total(x::Symbol, domain::Symbol, design::ReplicateDesign) quantile -``` - -It is often required to estimate population parameters for sub-populations of interest. For example, you may have a sample of heights, but you want the average heights of males and females separately. 
-```@docs -mean(x::Symbol, by::Symbol, design::SimpleRandomSample) -total(x::Symbol, by::Symbol, design::SimpleRandomSample) -``` -```@docs -ratio(variable_num:: Symbol, variable_den:: Symbol, design::SurveyDesign) +ratio(variable_num::Symbol, variable_den::Symbol, design::SurveyDesign) plot(design::AbstractSurveyDesign, x::Symbol, y::Symbol; kwargs...) boxplot(design::AbstractSurveyDesign, x::Symbol, y::Symbol; kwargs...) hist(design::AbstractSurveyDesign, var::Symbol, @@ -35,7 +27,4 @@ hist(design::AbstractSurveyDesign, var::Symbol, normalization = :density, kwargs... ) -dim(design::AbstractSurveyDesign) -dimnames(design::AbstractSurveyDesign) -colnames(design::AbstractSurveyDesign) ``` diff --git a/docs/src/index.md b/docs/src/index.md index a099c73c..eddbcf0f 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -18,45 +18,14 @@ with at least 100 students and for various probability samples of the data. The API program has been discontinued at the end of 2018. Information is archived at [https://www.cde.ca.gov/re/pr/api.asp](https://www.cde.ca.gov/re/pr/api.asp) -Firstly, a survey design needs a dataset from which to gather information. - - -The sample datasets provided with the package can be loaded as `DataFrames` using the `load_data` function: +Firstly, a survey design needs a dataset from which to gather information. The sample +datasets provided with the package can be loaded as `DataFrame`s using [`load_data`](@ref): ```julia julia> apisrs = load_data("apisrs"); ``` -`apisrs` is a simple random sample of the Academic Performance Index of Californian schools. - -Next, we can build a design. 
The design corresponding to a simple random sample is [`SimpleRandomSample`](@ref), which can be instantiated by calling the constructor: - -```julia -julia> srs = SimpleRandomSample(apisrs; weights = :pw) -SimpleRandomSample: -data: 200x42 DataFrame -weights: 31.0, 31.0, 31.0, ..., 31.0 -probs: 0.0323, 0.0323, 0.0323, ..., 0.0323 -fpc: 6194, 6194, 6194, ..., 6194 -popsize: 6194 -sampsize: 200 -sampfraction: 0.0323 -ignorefpc: false -``` -With a `SimpleRandomSample` (as well as with any subtype of [`AbstractSurveyDesign`](@ref)) it is possible to calculate estimates of the mean, population total, etc., for a given variable, along with the corresponding standard errors. +`apisrs` is a simple random sample of the Academic Performance Index of Californian schools. -```julia -julia> mean(:api00, srs) -1×2 DataFrame - Row │ mean sem - │ Float64 Float64 -─────┼────────────────── - 1 │ 656.585 9.24972 - -julia> total(:api00, srs) -1×2 DataFrame - Row │ total se_total - │ Float64 Float64 -─────┼───────────────────── - 1 │ 4.06689e6 57292.8 -``` +Next, we can build a design. 
+#TODO: continue tutorial diff --git a/src/Survey.jl b/src/Survey.jl index fee13d6d..dd71a092 100644 --- a/src/Survey.jl +++ b/src/Survey.jl @@ -28,12 +28,11 @@ include("ratio.jl") include("by.jl") export load_data -export AbstractSurveyDesign, SimpleRandomSample, StratifiedSample -export SurveyDesign +export AbstractSurveyDesign, SurveyDesign, ReplicateDesign export dim, colnames, dimnames export mean, total, quantile export plot -export hist +export hist, sturges, freedman_diaconis export boxplot export bootweights export jkknife From 751813822326ea0e8001b0f42314dfc5e16795d7 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Tue, 10 Jan 2023 16:57:01 +0530 Subject: [PATCH 16/80] Fix docstring and minor style modifications --- src/bootstrap.jl | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/bootstrap.jl b/src/bootstrap.jl index 2055b6bf..b4e226a8 100644 --- a/src/bootstrap.jl +++ b/src/bootstrap.jl @@ -1,15 +1,15 @@ """ ```jldoctest -julia> using Survey, Random; +julia> using Random -julia> apiclus1 = load_data("apiclus1"); +julia> apiclus1 = load_data("apiclus1"); -julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum); +julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum); -julia> rng = MersenneTwister(111); +julia> rng = MersenneTwister(111); -julia> Survey.bootweights(dclus1; replicates=1000, rng) -Survey.ReplicateDesign: +julia> bootweights(dclus1; replicates=1000, rng) +ReplicateDesign: data: 183x1046 DataFrame cluster: dnum design.data[!,design.cluster]: 637, 637, 637, ..., 448 @@ -22,7 +22,7 @@ design.data[!,:allprobs]: 1.0, 1.0, 1.0, ..., 1.0 replicates: 1000 ``` """ -function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwister(1234)) +function bootweights(design::SurveyDesign; replicates=4000, rng=MersenneTwister(1234)) H = length(unique(design.data[!, design.strata])) stratified = groupby(design.data, design.strata) function replicate(stratified, H) @@ -45,10 +45,10 @@ function 
bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwis return transform(stratified, :whij) end df = replicate(stratified, H) - rename!(df,:whij => :replicate_1) + rename!(df, :whij => :replicate_1) df.replicate_1 = disallowmissing(df.replicate_1) for i in 2:(replicates) - df[!, "replicate_"*string(i)] = disallowmissing(replicate(stratified, H).whij) + df[!, "replicate_" * string(i)] = disallowmissing(replicate(stratified, H).whij) end return ReplicateDesign(df, design.cluster, design.popsize, design.sampsize, design.strata, design.pps, replicates) end \ No newline at end of file From efe2f6a32db2c9fddfc61135ec7baf54aa379547 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Tue, 10 Jan 2023 17:02:57 +0530 Subject: [PATCH 17/80] Convert indentation to spaces --- docs/src/api.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/src/api.md b/docs/src/api.md index 5431b9ae..5b538a55 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -23,8 +23,8 @@ ratio(variable_num::Symbol, variable_den::Symbol, design::SurveyDesign) plot(design::AbstractSurveyDesign, x::Symbol, y::Symbol; kwargs...) boxplot(design::AbstractSurveyDesign, x::Symbol, y::Symbol; kwargs...) hist(design::AbstractSurveyDesign, var::Symbol, - bins::Union{Integer, AbstractVector} = freedman_diaconis(design, var); - normalization = :density, - kwargs... - ) + bins::Union{Integer, AbstractVector} = freedman_diaconis(design, var); + normalization = :density, + kwargs... 
+ ) ``` From 2bac93875eaa85a5a16fb06cbb2fa513d16684a0 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Tue, 10 Jan 2023 17:16:22 +0530 Subject: [PATCH 18/80] Fix docstrings, minor rearrangements and style checks --- src/SurveyDesign.jl | 65 ++++++++++++++------------ src/mean.jl | 108 ++++++++++++++++++-------------------------- src/total.jl | 81 ++++++++++++++++++--------------- 3 files changed, 124 insertions(+), 130 deletions(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index 1541a3b9..07b9e1de 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -13,35 +13,32 @@ abstract type AbstractSurveyDesign end """ SurveyDesign <: AbstractSurveyDesign -Survey design sampled by one stage clusters sampling. -Clusters chosen by SRS followed by complete sampling of selected clusters. -Assumes each individual in one and only one clusters; disjoint and nested clusters. +General survey design encompassing a simple random, stratified, cluster or multi-stage design. -`clusters` must be specified as a Symbol name of a column in `data`. +In the case of cluster sample, the clusters are chosen by simple random sampling. All +individuals in one cluster are sampled. The clusters are considered disjoint and nested. # Arguments: `data::AbstractDataFrame`: the survey dataset (!this gets modified by the constructor). -`clusters::Symbol`: the stratification variable - must be given as a column in `data`. -`popsize::Union{Nothing,Symbol,<:Unsigned,Vector{<:Real}}=nothing`: the (expected) survey population size. For - -`weights::Union{Nothing,Symbol,Vector{<:Real}}=nothing`: the sampling weights. +`strata::Union{Nothing, Symbol}=nothing`: the stratification variable - must be given as a column in `data`. +`clusters::Union{Nothing, Symbol, Vector{Symbol}}=nothing`: the clustering variable - must be given as column(s) in `data`. +`weights::Union{Nothing, Symbol}=nothing`: the sampling weights. 
+`popsize::Union{Nothing, Int, Symbol}=nothing`: the (expected) survey population size. ```jldoctest julia> apiclus1 = load_data("apiclus1"); -julia> apiclus1[!, :pw] = fill(757/15,(size(apiclus1,1),)); # Correct api mistake for pw column - -julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) +julia> dclus1 = SurveyDesign(apiclus1; clusters=:dnum, weights=:pw) SurveyDesign: data: 183x46 DataFrame cluster: dnum design.data[!,design.cluster]: 637, 637, 637, ..., 448 popsize: popsize -design.data[!,design.popsize]: 9240.0, 9240.0, 9240.0, ..., 9240.0 +design.data[!,design.popsize]: 6190.0, 6190.0, 6190.0, ..., 6190.0 sampsize: sampsize design.data[!,design.sampsize]: 15, 15, 15, ..., 15 -design.data[!,:probs]: 0.0198, 0.0198, 0.0198, ..., 0.0198 -design.data[!,:allprobs]: 0.0198, 0.0198, 0.0198, ..., 0.0198 +design.data[!,:probs]: 0.0295, 0.0295, 0.0295, ..., 0.0295 +design.data[!,:allprobs]: 0.0295, 0.0295, 0.0295, ..., 0.0295 ``` """ struct SurveyDesign <: AbstractSurveyDesign @@ -52,9 +49,15 @@ struct SurveyDesign <: AbstractSurveyDesign strata::Symbol pps::Bool # Single stage clusters sample, like apiclus1 - function SurveyDesign(data::AbstractDataFrame; strata::Union{Nothing,Symbol} = nothing, weights::Union{Nothing,Symbol}= nothing, clusters::Union{Nothing, Symbol, Vector{Symbol}} = nothing, popsize::Union{Nothing, Int,Symbol}=nothing) + function SurveyDesign( + data::AbstractDataFrame; + strata::Union{Nothing, Symbol}=nothing, + clusters::Union{Nothing, Symbol, Vector{Symbol}}=nothing, + weights::Union{Nothing, Symbol}=nothing, + popsize::Union{Nothing, Int, Symbol}=nothing + ) # sampsize here is number of clusters completely sampled, popsize is total clusters in population - if typeof(strata) <:Nothing + if typeof(strata) <: Nothing data.false_strata = repeat(["FALSE_STRATA"], nrow(data)) strata = :false_strata end @@ -71,7 +74,7 @@ struct SurveyDesign <: AbstractSurveyDesign end # For one-stage sample only one sampsize vector 
sampsize_labels = :sampsize - data[!, sampsize_labels] = fill(length(unique(data[!, cluster])),(nrow(data),)) + data[!, sampsize_labels] = fill(length(unique(data[!, cluster])), (nrow(data),)) if !(typeof(popsize) <: Nothing) data[!, :weights] = data[!, popsize] ./ data[!, sampsize_labels] elseif !(typeof(weights) <: Nothing) @@ -91,24 +94,26 @@ struct SurveyDesign <: AbstractSurveyDesign end """ -```jldoctest -julia> apiclus1 = load_data("apiclus1"); + ReplicateDesign <: AbstractSurveyDesign -julia> apiclus1[!, :pw] = fill(757/15,(size(apiclus1,1),)); # Correct api mistake for pw column +Survey design obtained by replicating an original design using [`bootweights`](@ref). -julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw); +```jldoctest +julia> apistrat = load_data("apistrat"); -julia> bclus1 = Survey.bootweights(dclus1; replicates = 1000) -Survey.ReplicateDesign: -data: 183x1046 DataFrame -cluster: dnum -design.data[!,design.cluster]: 637, 637, 637, ..., 448 +julia> strat = SurveyDesign(apistrat; strata=:stype, weights=:pw); + +julia> bootstrat = bootweights(strat; replicates=1000) +ReplicateDesign: +data: 200x1046 DataFrame +cluster: false_cluster +design.data[!,design.cluster]: 1, 2, 3, ..., 200 popsize: popsize -design.data[!,design.popsize]: 9240.0, 9240.0, 9240.0, ..., 9240.0 +design.data[!,design.popsize]: 6190.0, 6190.0, 6190.0, ..., 6190.0 sampsize: sampsize -design.data[!,design.sampsize]: 15, 15, 15, ..., 15 -design.data[!,:probs]: 0.0198, 0.0198, 0.0198, ..., 0.0198 -design.data[!,:allprobs]: 0.0198, 0.0198, 0.0198, ..., 0.0198 +design.data[!,design.sampsize]: 200, 200, 200, ..., 200 +design.data[!,:probs]: 0.0226, 0.0226, 0.0226, ..., 0.0662 +design.data[!,:allprobs]: 0.0226, 0.0226, 0.0226, ..., 0.0662 replicates: 1000 ``` """ diff --git a/src/mean.jl b/src/mean.jl index 2bf8b925..0ef5bb37 100644 --- a/src/mean.jl +++ b/src/mean.jl @@ -1,19 +1,27 @@ """ -```jldoctest -julia> using Survey, Random, StatsBase; + mean(var, design) 
-julia> apiclus1 = load_data("apiclus1"); +Compute the estimated mean of one or more variables within a survey design. -julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw); +```jldoctest +julia> apiclus1 = load_data("apiclus1"); -julia> bclus1 = bootweights(dclus1; replicates = 1000) +julia> clus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) |> bootweights; -julia> mean(:api00, bclus1) +julia> mean(:api00, clus1) 1×2 DataFrame - Row │ mean SE - │ Float64 Float64 + Row │ mean SE + │ Float64 Float64 ─────┼────────────────── - 1 │ 644.169 23.7208 + 1 │ 644.169 23.2919 + +julia> mean([:api00, :enroll], clus1) +2×3 DataFrame + Row │ names mean SE + │ String Float64 Float64 +─────┼────────────────────────── + 1 │ api00 644.169 23.2919 + 2 │ enroll 549.716 45.3655 ``` """ function mean(x::Symbol, design::ReplicateDesign) @@ -22,59 +30,39 @@ function mean(x::Symbol, design::ReplicateDesign) variance = sum((Xt .- X).^2) / design.replicates DataFrame(mean = X, SE = sqrt(variance)) end + +function mean(x::Vector{Symbol}, design::ReplicateDesign) + df = reduce(vcat, [mean(i, design) for i in x]) + insertcols!(df, 1, :names => String.(x)) + return df +end + """ -```jldoctest -julia> using Survey, Random, StatsBase; + mean(var, domain, design) -julia> apiclus1 = load_data("apiclus1"); +Compute the estimated mean within a domain. 
-julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw); +```jldoctest +julia> apiclus1 = load_data("apiclus1"); -julia> bclus1 = bootweights(dclus1; replicates = 1000) +julia> clus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) |> bootweights; -julia> mean(:api00, :cname, bclus1) |> print -38×3 DataFrame - Row │ cname statistic SE - │ String15 Float64 Any -─────┼───────────────────────────────────────── - 1 │ Kern 573.6 44.5578 - 2 │ Los Angeles 658.156 22.2058 - 3 │ Orange 749.333 29.5701 - 4 │ San Luis Obispo 739.0 3.37273e-14 - 5 │ San Francisco 558.333 45.6266 - 6 │ Modoc 671.0 0.0 - 7 │ Alameda 676.091 37.3104 - 8 │ Solano 623.0 45.1222 - 9 │ Santa Cruz 624.333 113.43 - 10 │ Monterey 605.0 85.4116 - 11 │ San Bernardino 614.462 30.0066 - 12 │ Riverside 574.3 27.2025 - 13 │ Tulare 664.0 22.0097 - 14 │ San Diego 684.5 32.2241 - 15 │ Sacramento 616.0 39.7877 - 16 │ Marin 799.667 35.2397 - 17 │ Imperial 622.0 0.0 - 18 │ Ventura 743.8 31.7425 - 19 │ San Joaquin 608.667 40.8592 - 20 │ Sonoma 630.0 0.0 - 21 │ Fresno 600.25 56.9173 - 22 │ Santa Clara 718.286 58.562 - 23 │ Sutter 744.0 0.0 - 24 │ Contra Costa 766.111 53.598 - 25 │ Stanislaus 736.333 5.26576 - 26 │ Madera 480.0 3.5861 - 27 │ Placer 759.0 0.0 - 28 │ Lassen 752.0 0.0 - 29 │ Santa Barbara 728.667 25.8749 - 30 │ San Mateo 617.0 78.1173 - 31 │ Siskiyou 699.0 0.0 - 32 │ Kings 469.5 44.6284 - 33 │ Shasta 754.0 60.5829 - 34 │ Yolo 475.0 0.0 - 35 │ Calaveras 790.0 0.0 - 36 │ Napa 727.0 50.5542 - 37 │ Lake 804.0 0.0 - 38 │ Merced 595.0 0 +julia> mean(:api00, :cname, clus1) +11×3 DataFrame + Row │ cname mean SE + │ String15 Float64 Any +─────┼─────────────────────────────────── + 1 │ Alameda 669.0 1.27388e-13 + 2 │ Fresno 472.0 1.13687e-13 + 3 │ Kern 452.5 0.0 + 4 │ Los Angeles 647.267 47.4938 + 5 │ Mendocino 623.25 1.0931e-13 + 6 │ Merced 519.25 4.57038e-15 + 7 │ Orange 710.563 2.19684e-13 + 8 │ Plumas 709.556 1.27773e-13 + 9 │ San Diego 659.436 2.63446 + 10 │ San Joaquin 551.189 
2.17471e-13 + 11 │ Santa Clara 732.077 56.2584 ``` """ function mean(x::Symbol, domain::Symbol, design::ReplicateDesign) @@ -82,10 +70,4 @@ function mean(x::Symbol, domain::Symbol, design::ReplicateDesign) df = bydomain(x, domain, design, weighted_mean) rename!(df, :statistic => :mean) return df -end - -function mean(x::Vector{Symbol}, design::ReplicateDesign) - df = reduce(vcat, [mean(i, design) for i in x]) - insertcols!(df, 1, :names => String.(x)) - return df end \ No newline at end of file diff --git a/src/total.jl b/src/total.jl index fdf83216..e5fbbdcb 100644 --- a/src/total.jl +++ b/src/total.jl @@ -1,21 +1,27 @@ """ -```jldoctest -julia> using Survey; - -julia> apiclus1 = load_data("apiclus1"); + total(var, design) -julia> apiclus1[!, :pw] = fill(757/15,(size(apiclus1_original,1),)) # Correct api mistake for pw column +Compute the estimated population total for one or more variables within a survey design. -julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw); +```jldoctest +julia> apiclus1 = load_data("apiclus1"); -julia> bclus1 = bootweights(dclus1; replicates = 1000); +julia> clus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) |> bootweights; -julia> total(:api00, bclus1) +julia> total(:api00, clus1) 1×2 DataFrame - Row │ total SE - │ Float64 Float64 + Row │ total SE + │ Float64 Float64 ─────┼────────────────────── - 1 │ 5.94916e6 1.31977e6 + 1 │ 3.98999e6 9.22175e5 + +julia> total([:api00, :enroll], clus1) +2×3 DataFrame + Row │ names total SE + │ String Float64 Float64 +─────┼────────────────────────────── + 1 │ api00 3.98999e6 9.22175e5 + 2 │ enroll 3.40494e6 9.51557e5 ``` """ function total(x::Symbol, design::ReplicateDesign) @@ -24,41 +30,42 @@ function total(x::Symbol, design::ReplicateDesign) variance = sum((Xt .- X).^2) / design.replicates DataFrame(total = X, SE = sqrt(variance)) end + +function total(x::Vector{Symbol}, design::ReplicateDesign) + df = reduce(vcat, [total(i, design) for i in x]) + insertcols!(df, 
1, :names => String.(x)) + return df +end + """ -```jldoctest -julia> using Survey; + total(var, domain, design) -julia> apiclus1 = load_data("apiclus1"); +Compute the estimated population total within a domain. -julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw); +```jldoctest +julia> apiclus1 = load_data("apiclus1"); -julia> bclus1 = bootweights(dclus1; replicates = 1000); +julia> clus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) |> bootweights; -julia> total(:api00, :cname, bclus1) |> print +julia> total(:api00, :cname, clus1) 11×3 DataFrame - Row │ cname statistic SE - │ String15 Float64 Any -─────┼─────────────────────────────────────── - 1 │ Alameda 3.71384e5 3.78375e5 - 2 │ Fresno 95281.1 96134.8 - 3 │ Kern 45672.3 43544.7 - 4 │ Los Angeles 4.89981e5 4.42865e5 - 5 │ Mendocino 1.25813e5 1.22757e5 - 6 │ Merced 1.04819e5 1.09032e5 - 7 │ Orange 5.73756e5 6.01213e5 - 8 │ Plumas 3.2228e5 3.26443e5 - 9 │ San Diego 1.83038e6 1.34155e6 - 10 │ San Joaquin 1.02922e6 1.04048e6 - 11 │ Santa Clara 9.60583e5 643492.0 + Row │ cname total SE + │ String15 Float64 Any +─────┼──────────────────────────────────────── + 1 │ Alameda 249080.0 2.48842e5 + 2 │ Fresno 63903.1 64452.2 + 3 │ Kern 30631.5 31083.0 + 4 │ Los Angeles 3.2862e5 2.93649e5 + 5 │ Mendocino 84380.6 83154.4 + 6 │ Merced 70300.2 69272.5 + 7 │ Orange 3.84807e5 3.90097e5 + 8 │ Plumas 2.16147e5 2.17811e5 + 9 │ San Diego 1.2276e6 8.78559e5 + 10 │ San Joaquin 6.90276e5 6.90685e5 + 11 │ Santa Clara 6.44244e5 4.09943e5 ``` """ function total(x::Symbol, domain::Symbol, design::ReplicateDesign) df = bydomain(x, domain, design, wsum) rename!(df, :statistic => :total) -end - -function total(x::Vector{Symbol}, design::ReplicateDesign) - df = reduce(vcat, [total(i, design) for i in x]) - insertcols!(df, 1, :names => String.(x)) - return df end \ No newline at end of file From 76275cd66b93f0b5794d8a8638023c2e2857da01 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Tue, 10 Jan 2023 17:26:11 
+0530 Subject: [PATCH 19/80] Fix argument rendering in docstring --- src/SurveyDesign.jl | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index 07b9e1de..69657cd0 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -18,12 +18,14 @@ General survey design encompassing a simple random, stratified, cluster or multi In the case of cluster sample, the clusters are chosen by simple random sampling. All individuals in one cluster are sampled. The clusters are considered disjoint and nested. +`strata` and `clusters` must be given as columns in `data`. + # Arguments: -`data::AbstractDataFrame`: the survey dataset (!this gets modified by the constructor). -`strata::Union{Nothing, Symbol}=nothing`: the stratification variable - must be given as a column in `data`. -`clusters::Union{Nothing, Symbol, Vector{Symbol}}=nothing`: the clustering variable - must be given as column(s) in `data`. -`weights::Union{Nothing, Symbol}=nothing`: the sampling weights. -`popsize::Union{Nothing, Int, Symbol}=nothing`: the (expected) survey population size. +- `data::AbstractDataFrame`: the survey dataset (!this gets modified by the constructor). +- `strata::Union{Nothing, Symbol}=nothing`: the stratification variable. +- `clusters::Union{Nothing, Symbol, Vector{Symbol}}=nothing`: the clustering variable. +- `weights::Union{Nothing, Symbol}=nothing`: the sampling weights. +- `popsize::Union{Nothing, Int, Symbol}=nothing`: the (expected) survey population size. 
```jldoctest julia> apiclus1 = load_data("apiclus1"); From b63acd1498ef386194c928a0225f72660bdad54a Mon Sep 17 00:00:00 2001 From: smishr <43640926+smishr@users.noreply.github.com> Date: Tue, 10 Jan 2023 18:05:03 +0530 Subject: [PATCH 20/80] Add tests for stratified sampling SurveyDesign --- src/SurveyDesign.jl | 3 +-- test/SurveyDesign.jl | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index 69fe3b51..0b53fb57 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -83,7 +83,6 @@ struct SurveyDesign <: AbstractSurveyDesign data[!, weights_labels] = data[!, popsize] ./ data[!, sampsize_labels] elseif typeof(weights) <: Symbol if !(typeof(data[!, weights]) <: Vector{<:Real}) - @show typeof(data[!, weights]) error("weights column has to be numeric") end weights_labels = weights @@ -95,8 +94,8 @@ struct SurveyDesign <: AbstractSurveyDesign data[!, allprobs_labels] = 1 ./ data[!, weights_labels] # In one-stage cluster sample, allprobs is just probs, no multiplication needed pps = false # for now no explicit pps support if !(typeof(popsize) <: Symbol) - data.popsize = repeat([sum(data[!, weights_labels])], nrow(data)) popsize = :popsize + data[!,popsize] = repeat([sum(data[!, weights_labels])], nrow(data)) end new(data, cluster, popsize, sampsize_labels, strata, weights_labels, allprobs_labels, pps) end diff --git a/test/SurveyDesign.jl b/test/SurveyDesign.jl index 90989918..01d5baf5 100644 --- a/test/SurveyDesign.jl +++ b/test/SurveyDesign.jl @@ -16,11 +16,12 @@ srs_pop = SurveyDesign(apisrs, popsize=:fpc) @test srs_pop.data[!,srs_pop.weights][1] ≈ 30.97 atol = 1e-4 @test srs_pop.data[!,srs_pop.weights] == 1 ./ srs_pop.data[!,srs_pop.allprobs] + ### Both ways should achieve same weights and allprobs! 
+ @test srs_pop.data[!,srs_pop.weights] == srs_weights.data[!,srs_weights.weights] ############################## ### Weights as non-numeric error apisrs = copy(apisrs_original) @test_throws ErrorException SurveyDesign(apisrs, weights=:stype) - end @testset "SurveyDesign_strat" begin @@ -29,10 +30,17 @@ end apistrat_original = load_data("apistrat") apistrat_original[!, :derived_probs] = 1 ./ apistrat_original.pw apistrat_original[!, :derived_sampsize] = apistrat_original.fpc ./ apistrat_original.pw - + ############################## + ### weights as Symbol apistrat = copy(apistrat_original) - strat = SurveyDesign(apistrat, strata = :stype, weights = :pw) |> bootweights - + strat_wt = SurveyDesign(apistrat, strata=:stype, weights=:pw) + @test strat_wt.data[!,strat_wt.weights] == 1 ./ strat_wt.data[!,strat_wt.allprobs] + ### popsize as Symbol + apistrat = copy(apistrat_original) + strat_pop = SurveyDesign(apistrat, strata=:stype, popsize=:fpc) + @test strat_pop.data[!,strat_pop.weights] == 1 ./ strat_pop.data[!,strat_pop.allprobs] + ############################## + # @test strat_pop.data[!,strat_pop.weights] == strat_wt.data[!,strat_wt.weights] end @testset "SurveyDesign_multistage" begin From e371402ff4b3c95f275e511f29c596095d56db0c Mon Sep 17 00:00:00 2001 From: smishr <43640926+smishr@users.noreply.github.com> Date: Wed, 11 Jan 2023 13:23:37 +0530 Subject: [PATCH 21/80] nhanes and yrbs testing --- src/SurveyDesign.jl | 4 ++-- test/SurveyDesign.jl | 13 +++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index b1d26204..81e99767 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -55,7 +55,7 @@ struct SurveyDesign <: AbstractSurveyDesign function SurveyDesign(data::AbstractDataFrame; clusters::Union{Nothing,Symbol,Vector{Symbol}}=nothing, strata::Union{Nothing,Symbol}=nothing, - popsize::Union{Nothing,Int,Symbol}=nothing, + popsize::Union{Nothing,Symbol}=nothing, 
weights::Union{Nothing,Symbol}=nothing ) # sampsize here is number of clusters completely sampled, popsize is total clusters in population @@ -82,7 +82,7 @@ struct SurveyDesign <: AbstractSurveyDesign data[!, weights_labels] = data[!, popsize] ./ data[!, sampsize_labels] elseif typeof(weights) <: Symbol if !(typeof(data[!, weights]) <: Vector{<:Real}) - error("weights column has to be numeric") + error(string("given weights column ", weights , " is not of numeric type")) end weights_labels = weights else diff --git a/test/SurveyDesign.jl b/test/SurveyDesign.jl index 01d5baf5..be515926 100644 --- a/test/SurveyDesign.jl +++ b/test/SurveyDesign.jl @@ -61,3 +61,16 @@ end nhanes_design = SurveyDesign(nhanes; clusters = :SDMVPSU, strata = :SDMVSTRA, weights = :WTMEC2YR) end +@testset "SurveyDesign_realSurveys" begin + # Load API datasets + yrbs_original = load_data("yrbs") + nhanes_original = load_data("nhanes") + ############################## + # NHANES + nhanes = copy(nhanes_original) + dnhanes = SingleStageSurveyDesign(nhanes; cluster = :SDMVPSU, strata=:SDMVSTRA, weights=:WTMEC2YR) + ############################## + # YRBS + yrbs = copy(yrbs_original) + dyrbs = SingleStageSurveyDesign(yrbs; cluster = :psu, strata=:stratum, weights=:weight) +end From ae0cb2aa8bc87b7d318d97046f5f8de8d58a1be9 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Wed, 11 Jan 2023 19:02:01 +0530 Subject: [PATCH 22/80] Change show for `SurveyDesign` --- src/show.jl | 49 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 16 deletions(-) diff --git a/src/show.jl b/src/show.jl index 3319e653..5dd5b21a 100644 --- a/src/show.jl +++ b/src/show.jl @@ -6,16 +6,20 @@ function makeshort(x) x = round.(x, sigdigits=3) end # print short vectors or single values as they are, compress otherwise - x = length(x) < 3 ? join(x, ", ") : join(x[1:3], ", ") * ", ..., " * string(last(x)) + if length(x) > 1 + return "[" * (length(x) < 3 ? 
join(x, ", ") : join(x[1:3], ", ") * " … " * string(last(x))) * "]" + end + + return x end """ Print information in the form: **name:** content[\n] """ -function printinfo(io::IO, name::String, content::String; newline::Bool=true) +function printinfo(io::IO, name::String, content, args...; newline::Bool=true) printstyled(io, name, ": "; bold=true) - newline ? println(io, content) : print(io, content) + newline ? println(io, content, args...) : print(io, content, args...) end "Print information about a survey design." @@ -33,24 +37,37 @@ function Base.show(io::IO, ::MIME"text/plain", design::AbstractSurveyDesign) printinfo(io, "ignorefpc", string(design.ignorefpc); newline=false) end - "Print information about a survey design." -function Base.show(io::IO, ::MIME"text/plain", design::SurveyDesign) +Base.show(io::IO, ::MIME"text/plain", design::SurveyDesign) = + surveyshow(IOContext(io, :compact=>true, :limit=>true, :displaysize=>(50, 50)), design) + +function surveyshow(io::IO, design::SurveyDesign) + # structure name type = typeof(design) printstyled(io, "$type:\n"; bold=true) - printstyled(io, "data: "; bold=true) - println(io, size(design.data, 1), "x", size(design.data, 2), " DataFrame") - printinfo(io, "cluster", string(design.cluster); newline=true) - printinfo(io, "design.data[!,design.cluster]", makeshort(design.data[!,design.cluster])) - printinfo(io, "popsize", string(design.popsize); newline=true) - printinfo(io, "design.data[!,design.popsize]", makeshort(design.data[!,design.popsize])) - printinfo(io, "sampsize", string(design.sampsize); newline=true) - printinfo(io, "design.data[!,design.sampsize]", makeshort(design.data[!,design.sampsize])) - printinfo(io, "design.data[!,:probs]", makeshort(design.data.probs)) - printinfo(io, "design.data[!,:allprobs]", makeshort(design.data.allprobs)) + # data info + printinfo(io, "data", summary(design.data)) + # strata info + strata_content = + design.strata == :false_strata ? 
+ "none" : + (string(design.strata), "\n ", makeshort(design.data[!, design.strata])) + printinfo(io, "strata", strata_content...) + # cluster(s) info + cluster_content = + design.cluster == :false_cluster ? + "none" : + (string(design.cluster), "\n ", makeshort(design.data[!, design.cluster])) + printinfo(io, "cluster", cluster_content...) + # popsize and sampsize info + printinfo(io, "popsize", "\n ", makeshort(design.data[!, design.popsize])) + printinfo(io, "sampsize", "\n ", makeshort(design.data[!, design.sampsize])) + # weights and probs info + printinfo(io, "weights", "\n ", makeshort(design.data[!, :weights])) + printinfo(io, "probs", "\n ", makeshort(design.data[!, :probs]); newline=false) end -"Print information about a repliocate design." +"Print information about a replicate design." function Base.show(io::IO, ::MIME"text/plain", design::ReplicateDesign) type = typeof(design) printstyled(io, "$type:\n"; bold=true) From b92a8d8ed026131248e586595e06341d50af7ff7 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Wed, 11 Jan 2023 19:30:34 +0530 Subject: [PATCH 23/80] Change show for `AbstractSurveyDesign` and `ReplicateDesign`, restructure code --- src/show.jl | 60 ++++++++++++++++------------------------------------- 1 file changed, 18 insertions(+), 42 deletions(-) diff --git a/src/show.jl b/src/show.jl index 5dd5b21a..2af3d6d1 100644 --- a/src/show.jl +++ b/src/show.jl @@ -1,3 +1,5 @@ +surveyio(io) = IOContext(io, :compact=>true, :limit=>true, :displaysize=>(50, 50)) + """ Helper function that transforms a given `Number` or `Vector` into a short-form string. """ @@ -23,41 +25,33 @@ function printinfo(io::IO, name::String, content, args...; newline::Bool=true) end "Print information about a survey design." 
-function Base.show(io::IO, ::MIME"text/plain", design::AbstractSurveyDesign) - type = typeof(design) - printstyled(io, "$type:\n"; bold=true) - printstyled(io, "data: "; bold=true) - println(io, size(design.data, 1), "x", size(design.data, 2), " DataFrame") - printinfo(io, "weights", makeshort(design.data.weights)) - printinfo(io, "probs", makeshort(design.data.probs)) - printinfo(io, "fpc", makeshort(design.data.fpc)) - printinfo(io, "popsize", makeshort(design.popsize)) - printinfo(io, "sampsize", makeshort(design.sampsize)) - printinfo(io, "sampfraction", makeshort(design.sampfraction)) - printinfo(io, "ignorefpc", string(design.ignorefpc); newline=false) -end +Base.show(io::IO, ::MIME"text/plain", design::AbstractSurveyDesign) = + surveyshow(surveyio(io), design) -"Print information about a survey design." Base.show(io::IO, ::MIME"text/plain", design::SurveyDesign) = - surveyshow(IOContext(io, :compact=>true, :limit=>true, :displaysize=>(50, 50)), design) + surveyshow(surveyio(io), design) + +function Base.show(io::IO, ::MIME"text/plain", design::ReplicateDesign) + # new_io = IOContext(io, :compact=>true, :limit=>true, :displaysize=>(50, 50)) + surveyshow(surveyio(io), design) + printinfo(surveyio(io), "\nreplicates", design.replicates; newline=false) +end -function surveyshow(io::IO, design::SurveyDesign) +function surveyshow(io::IO, design::AbstractSurveyDesign) # structure name type = typeof(design) printstyled(io, "$type:\n"; bold=true) # data info printinfo(io, "data", summary(design.data)) # strata info - strata_content = - design.strata == :false_strata ? - "none" : - (string(design.strata), "\n ", makeshort(design.data[!, design.strata])) + strata_content = design.strata == :false_strata ? + "none" : + (string(design.strata), "\n ", makeshort(design.data[!, design.strata])) printinfo(io, "strata", strata_content...) # cluster(s) info - cluster_content = - design.cluster == :false_cluster ? 
- "none" : - (string(design.cluster), "\n ", makeshort(design.data[!, design.cluster])) + cluster_content = design.cluster == :false_cluster ? + "none" : + (string(design.cluster), "\n ", makeshort(design.data[!, design.cluster])) printinfo(io, "cluster", cluster_content...) # popsize and sampsize info printinfo(io, "popsize", "\n ", makeshort(design.data[!, design.popsize])) @@ -66,21 +60,3 @@ function surveyshow(io::IO, design::SurveyDesign) printinfo(io, "weights", "\n ", makeshort(design.data[!, :weights])) printinfo(io, "probs", "\n ", makeshort(design.data[!, :probs]); newline=false) end - -"Print information about a replicate design." -function Base.show(io::IO, ::MIME"text/plain", design::ReplicateDesign) - type = typeof(design) - printstyled(io, "$type:\n"; bold=true) - printstyled(io, "data: "; bold=true) - println(io, size(design.data, 1), "x", size(design.data, 2), " DataFrame") - printinfo(io, "cluster", string(design.cluster); newline=true) - printinfo(io, "design.data[!,design.cluster]", makeshort(design.data[!,design.cluster])) - printinfo(io, "popsize", string(design.popsize); newline=true) - printinfo(io, "design.data[!,design.popsize]", makeshort(design.data[!,design.popsize])) - printinfo(io, "sampsize", string(design.sampsize); newline=true) - printinfo(io, "design.data[!,design.sampsize]", makeshort(design.data[!,design.sampsize])) - printinfo(io, "design.data[!,:probs]", makeshort(design.data.probs)) - printinfo(io, "design.data[!,:allprobs]", makeshort(design.data.allprobs)) - printstyled(io, "replicates: "; bold=true) - println(io, design.replicates) -end \ No newline at end of file From aaeebb1c35685f64548caaae9349bb1f0b10299e Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Wed, 11 Jan 2023 19:37:14 +0530 Subject: [PATCH 24/80] Revert "Change show for `AbstractSurveyDesign` and `ReplicateDesign`, restructure code" This reverts commit b92a8d8ed026131248e586595e06341d50af7ff7. 
--- src/show.jl | 60 +++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 18 deletions(-) diff --git a/src/show.jl b/src/show.jl index 2af3d6d1..5dd5b21a 100644 --- a/src/show.jl +++ b/src/show.jl @@ -1,5 +1,3 @@ -surveyio(io) = IOContext(io, :compact=>true, :limit=>true, :displaysize=>(50, 50)) - """ Helper function that transforms a given `Number` or `Vector` into a short-form string. """ @@ -25,33 +23,41 @@ function printinfo(io::IO, name::String, content, args...; newline::Bool=true) end "Print information about a survey design." -Base.show(io::IO, ::MIME"text/plain", design::AbstractSurveyDesign) = - surveyshow(surveyio(io), design) +function Base.show(io::IO, ::MIME"text/plain", design::AbstractSurveyDesign) + type = typeof(design) + printstyled(io, "$type:\n"; bold=true) + printstyled(io, "data: "; bold=true) + println(io, size(design.data, 1), "x", size(design.data, 2), " DataFrame") + printinfo(io, "weights", makeshort(design.data.weights)) + printinfo(io, "probs", makeshort(design.data.probs)) + printinfo(io, "fpc", makeshort(design.data.fpc)) + printinfo(io, "popsize", makeshort(design.popsize)) + printinfo(io, "sampsize", makeshort(design.sampsize)) + printinfo(io, "sampfraction", makeshort(design.sampfraction)) + printinfo(io, "ignorefpc", string(design.ignorefpc); newline=false) +end +"Print information about a survey design." 
Base.show(io::IO, ::MIME"text/plain", design::SurveyDesign) = - surveyshow(surveyio(io), design) - -function Base.show(io::IO, ::MIME"text/plain", design::ReplicateDesign) - # new_io = IOContext(io, :compact=>true, :limit=>true, :displaysize=>(50, 50)) - surveyshow(surveyio(io), design) - printinfo(surveyio(io), "\nreplicates", design.replicates; newline=false) -end + surveyshow(IOContext(io, :compact=>true, :limit=>true, :displaysize=>(50, 50)), design) -function surveyshow(io::IO, design::AbstractSurveyDesign) +function surveyshow(io::IO, design::SurveyDesign) # structure name type = typeof(design) printstyled(io, "$type:\n"; bold=true) # data info printinfo(io, "data", summary(design.data)) # strata info - strata_content = design.strata == :false_strata ? - "none" : - (string(design.strata), "\n ", makeshort(design.data[!, design.strata])) + strata_content = + design.strata == :false_strata ? + "none" : + (string(design.strata), "\n ", makeshort(design.data[!, design.strata])) printinfo(io, "strata", strata_content...) # cluster(s) info - cluster_content = design.cluster == :false_cluster ? - "none" : - (string(design.cluster), "\n ", makeshort(design.data[!, design.cluster])) + cluster_content = + design.cluster == :false_cluster ? + "none" : + (string(design.cluster), "\n ", makeshort(design.data[!, design.cluster])) printinfo(io, "cluster", cluster_content...) # popsize and sampsize info printinfo(io, "popsize", "\n ", makeshort(design.data[!, design.popsize])) @@ -60,3 +66,21 @@ function surveyshow(io::IO, design::AbstractSurveyDesign) printinfo(io, "weights", "\n ", makeshort(design.data[!, :weights])) printinfo(io, "probs", "\n ", makeshort(design.data[!, :probs]); newline=false) end + +"Print information about a replicate design." 
+function Base.show(io::IO, ::MIME"text/plain", design::ReplicateDesign) + type = typeof(design) + printstyled(io, "$type:\n"; bold=true) + printstyled(io, "data: "; bold=true) + println(io, size(design.data, 1), "x", size(design.data, 2), " DataFrame") + printinfo(io, "cluster", string(design.cluster); newline=true) + printinfo(io, "design.data[!,design.cluster]", makeshort(design.data[!,design.cluster])) + printinfo(io, "popsize", string(design.popsize); newline=true) + printinfo(io, "design.data[!,design.popsize]", makeshort(design.data[!,design.popsize])) + printinfo(io, "sampsize", string(design.sampsize); newline=true) + printinfo(io, "design.data[!,design.sampsize]", makeshort(design.data[!,design.sampsize])) + printinfo(io, "design.data[!,:probs]", makeshort(design.data.probs)) + printinfo(io, "design.data[!,:allprobs]", makeshort(design.data.allprobs)) + printstyled(io, "replicates: "; bold=true) + println(io, design.replicates) +end \ No newline at end of file From 15e34ece548cc990bfeca4683150ee2350032705 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Wed, 11 Jan 2023 19:39:24 +0530 Subject: [PATCH 25/80] Revert "Change show for `SurveyDesign`" This reverts commit ae0cb2aa8bc87b7d318d97046f5f8de8d58a1be9. --- src/show.jl | 49 ++++++++++++++++--------------------------------- 1 file changed, 16 insertions(+), 33 deletions(-) diff --git a/src/show.jl b/src/show.jl index 5dd5b21a..3319e653 100644 --- a/src/show.jl +++ b/src/show.jl @@ -6,20 +6,16 @@ function makeshort(x) x = round.(x, sigdigits=3) end # print short vectors or single values as they are, compress otherwise - if length(x) > 1 - return "[" * (length(x) < 3 ? join(x, ", ") : join(x[1:3], ", ") * " … " * string(last(x))) * "]" - end - - return x + x = length(x) < 3 ? 
join(x, ", ") : join(x[1:3], ", ") * ", ..., " * string(last(x)) end """ Print information in the form: **name:** content[\n] """ -function printinfo(io::IO, name::String, content, args...; newline::Bool=true) +function printinfo(io::IO, name::String, content::String; newline::Bool=true) printstyled(io, name, ": "; bold=true) - newline ? println(io, content, args...) : print(io, content, args...) + newline ? println(io, content) : print(io, content) end "Print information about a survey design." @@ -37,37 +33,24 @@ function Base.show(io::IO, ::MIME"text/plain", design::AbstractSurveyDesign) printinfo(io, "ignorefpc", string(design.ignorefpc); newline=false) end -"Print information about a survey design." -Base.show(io::IO, ::MIME"text/plain", design::SurveyDesign) = - surveyshow(IOContext(io, :compact=>true, :limit=>true, :displaysize=>(50, 50)), design) -function surveyshow(io::IO, design::SurveyDesign) - # structure name +"Print information about a survey design." +function Base.show(io::IO, ::MIME"text/plain", design::SurveyDesign) type = typeof(design) printstyled(io, "$type:\n"; bold=true) - # data info - printinfo(io, "data", summary(design.data)) - # strata info - strata_content = - design.strata == :false_strata ? - "none" : - (string(design.strata), "\n ", makeshort(design.data[!, design.strata])) - printinfo(io, "strata", strata_content...) - # cluster(s) info - cluster_content = - design.cluster == :false_cluster ? - "none" : - (string(design.cluster), "\n ", makeshort(design.data[!, design.cluster])) - printinfo(io, "cluster", cluster_content...) 
- # popsize and sampsize info - printinfo(io, "popsize", "\n ", makeshort(design.data[!, design.popsize])) - printinfo(io, "sampsize", "\n ", makeshort(design.data[!, design.sampsize])) - # weights and probs info - printinfo(io, "weights", "\n ", makeshort(design.data[!, :weights])) - printinfo(io, "probs", "\n ", makeshort(design.data[!, :probs]); newline=false) + printstyled(io, "data: "; bold=true) + println(io, size(design.data, 1), "x", size(design.data, 2), " DataFrame") + printinfo(io, "cluster", string(design.cluster); newline=true) + printinfo(io, "design.data[!,design.cluster]", makeshort(design.data[!,design.cluster])) + printinfo(io, "popsize", string(design.popsize); newline=true) + printinfo(io, "design.data[!,design.popsize]", makeshort(design.data[!,design.popsize])) + printinfo(io, "sampsize", string(design.sampsize); newline=true) + printinfo(io, "design.data[!,design.sampsize]", makeshort(design.data[!,design.sampsize])) + printinfo(io, "design.data[!,:probs]", makeshort(design.data.probs)) + printinfo(io, "design.data[!,:allprobs]", makeshort(design.data.allprobs)) end -"Print information about a replicate design." +"Print information about a repliocate design." function Base.show(io::IO, ::MIME"text/plain", design::ReplicateDesign) type = typeof(design) printstyled(io, "$type:\n"; bold=true) From f38db7da9625ca621fbb1aeea78d1ae93439902a Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Wed, 11 Jan 2023 19:42:41 +0530 Subject: [PATCH 26/80] Revert "Revert "Change show for `SurveyDesign`"" This reverts commit 15e34ece548cc990bfeca4683150ee2350032705. --- src/show.jl | 49 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 16 deletions(-) diff --git a/src/show.jl b/src/show.jl index 3319e653..5dd5b21a 100644 --- a/src/show.jl +++ b/src/show.jl @@ -6,16 +6,20 @@ function makeshort(x) x = round.(x, sigdigits=3) end # print short vectors or single values as they are, compress otherwise - x = length(x) < 3 ? 
join(x, ", ") : join(x[1:3], ", ") * ", ..., " * string(last(x)) + if length(x) > 1 + return "[" * (length(x) < 3 ? join(x, ", ") : join(x[1:3], ", ") * " … " * string(last(x))) * "]" + end + + return x end """ Print information in the form: **name:** content[\n] """ -function printinfo(io::IO, name::String, content::String; newline::Bool=true) +function printinfo(io::IO, name::String, content, args...; newline::Bool=true) printstyled(io, name, ": "; bold=true) - newline ? println(io, content) : print(io, content) + newline ? println(io, content, args...) : print(io, content, args...) end "Print information about a survey design." @@ -33,24 +37,37 @@ function Base.show(io::IO, ::MIME"text/plain", design::AbstractSurveyDesign) printinfo(io, "ignorefpc", string(design.ignorefpc); newline=false) end - "Print information about a survey design." -function Base.show(io::IO, ::MIME"text/plain", design::SurveyDesign) +Base.show(io::IO, ::MIME"text/plain", design::SurveyDesign) = + surveyshow(IOContext(io, :compact=>true, :limit=>true, :displaysize=>(50, 50)), design) + +function surveyshow(io::IO, design::SurveyDesign) + # structure name type = typeof(design) printstyled(io, "$type:\n"; bold=true) - printstyled(io, "data: "; bold=true) - println(io, size(design.data, 1), "x", size(design.data, 2), " DataFrame") - printinfo(io, "cluster", string(design.cluster); newline=true) - printinfo(io, "design.data[!,design.cluster]", makeshort(design.data[!,design.cluster])) - printinfo(io, "popsize", string(design.popsize); newline=true) - printinfo(io, "design.data[!,design.popsize]", makeshort(design.data[!,design.popsize])) - printinfo(io, "sampsize", string(design.sampsize); newline=true) - printinfo(io, "design.data[!,design.sampsize]", makeshort(design.data[!,design.sampsize])) - printinfo(io, "design.data[!,:probs]", makeshort(design.data.probs)) - printinfo(io, "design.data[!,:allprobs]", makeshort(design.data.allprobs)) + # data info + printinfo(io, "data", 
summary(design.data)) + # strata info + strata_content = + design.strata == :false_strata ? + "none" : + (string(design.strata), "\n ", makeshort(design.data[!, design.strata])) + printinfo(io, "strata", strata_content...) + # cluster(s) info + cluster_content = + design.cluster == :false_cluster ? + "none" : + (string(design.cluster), "\n ", makeshort(design.data[!, design.cluster])) + printinfo(io, "cluster", cluster_content...) + # popsize and sampsize info + printinfo(io, "popsize", "\n ", makeshort(design.data[!, design.popsize])) + printinfo(io, "sampsize", "\n ", makeshort(design.data[!, design.sampsize])) + # weights and probs info + printinfo(io, "weights", "\n ", makeshort(design.data[!, :weights])) + printinfo(io, "probs", "\n ", makeshort(design.data[!, :probs]); newline=false) end -"Print information about a repliocate design." +"Print information about a replicate design." function Base.show(io::IO, ::MIME"text/plain", design::ReplicateDesign) type = typeof(design) printstyled(io, "$type:\n"; bold=true) From 50d0e5afce695f3bcb9c92050015965e294dbde7 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Wed, 11 Jan 2023 19:42:54 +0530 Subject: [PATCH 27/80] Revert "Revert "Change show for `AbstractSurveyDesign` and `ReplicateDesign`, restructure code"" This reverts commit aaeebb1c35685f64548caaae9349bb1f0b10299e. --- src/show.jl | 60 ++++++++++++++++------------------------------------- 1 file changed, 18 insertions(+), 42 deletions(-) diff --git a/src/show.jl b/src/show.jl index 5dd5b21a..2af3d6d1 100644 --- a/src/show.jl +++ b/src/show.jl @@ -1,3 +1,5 @@ +surveyio(io) = IOContext(io, :compact=>true, :limit=>true, :displaysize=>(50, 50)) + """ Helper function that transforms a given `Number` or `Vector` into a short-form string. """ @@ -23,41 +25,33 @@ function printinfo(io::IO, name::String, content, args...; newline::Bool=true) end "Print information about a survey design." 
-function Base.show(io::IO, ::MIME"text/plain", design::AbstractSurveyDesign) - type = typeof(design) - printstyled(io, "$type:\n"; bold=true) - printstyled(io, "data: "; bold=true) - println(io, size(design.data, 1), "x", size(design.data, 2), " DataFrame") - printinfo(io, "weights", makeshort(design.data.weights)) - printinfo(io, "probs", makeshort(design.data.probs)) - printinfo(io, "fpc", makeshort(design.data.fpc)) - printinfo(io, "popsize", makeshort(design.popsize)) - printinfo(io, "sampsize", makeshort(design.sampsize)) - printinfo(io, "sampfraction", makeshort(design.sampfraction)) - printinfo(io, "ignorefpc", string(design.ignorefpc); newline=false) -end +Base.show(io::IO, ::MIME"text/plain", design::AbstractSurveyDesign) = + surveyshow(surveyio(io), design) -"Print information about a survey design." Base.show(io::IO, ::MIME"text/plain", design::SurveyDesign) = - surveyshow(IOContext(io, :compact=>true, :limit=>true, :displaysize=>(50, 50)), design) + surveyshow(surveyio(io), design) + +function Base.show(io::IO, ::MIME"text/plain", design::ReplicateDesign) + # new_io = IOContext(io, :compact=>true, :limit=>true, :displaysize=>(50, 50)) + surveyshow(surveyio(io), design) + printinfo(surveyio(io), "\nreplicates", design.replicates; newline=false) +end -function surveyshow(io::IO, design::SurveyDesign) +function surveyshow(io::IO, design::AbstractSurveyDesign) # structure name type = typeof(design) printstyled(io, "$type:\n"; bold=true) # data info printinfo(io, "data", summary(design.data)) # strata info - strata_content = - design.strata == :false_strata ? - "none" : - (string(design.strata), "\n ", makeshort(design.data[!, design.strata])) + strata_content = design.strata == :false_strata ? + "none" : + (string(design.strata), "\n ", makeshort(design.data[!, design.strata])) printinfo(io, "strata", strata_content...) # cluster(s) info - cluster_content = - design.cluster == :false_cluster ? 
- "none" : - (string(design.cluster), "\n ", makeshort(design.data[!, design.cluster])) + cluster_content = design.cluster == :false_cluster ? + "none" : + (string(design.cluster), "\n ", makeshort(design.data[!, design.cluster])) printinfo(io, "cluster", cluster_content...) # popsize and sampsize info printinfo(io, "popsize", "\n ", makeshort(design.data[!, design.popsize])) @@ -66,21 +60,3 @@ function surveyshow(io::IO, design::SurveyDesign) printinfo(io, "weights", "\n ", makeshort(design.data[!, :weights])) printinfo(io, "probs", "\n ", makeshort(design.data[!, :probs]); newline=false) end - -"Print information about a replicate design." -function Base.show(io::IO, ::MIME"text/plain", design::ReplicateDesign) - type = typeof(design) - printstyled(io, "$type:\n"; bold=true) - printstyled(io, "data: "; bold=true) - println(io, size(design.data, 1), "x", size(design.data, 2), " DataFrame") - printinfo(io, "cluster", string(design.cluster); newline=true) - printinfo(io, "design.data[!,design.cluster]", makeshort(design.data[!,design.cluster])) - printinfo(io, "popsize", string(design.popsize); newline=true) - printinfo(io, "design.data[!,design.popsize]", makeshort(design.data[!,design.popsize])) - printinfo(io, "sampsize", string(design.sampsize); newline=true) - printinfo(io, "design.data[!,design.sampsize]", makeshort(design.data[!,design.sampsize])) - printinfo(io, "design.data[!,:probs]", makeshort(design.data.probs)) - printinfo(io, "design.data[!,:allprobs]", makeshort(design.data.allprobs)) - printstyled(io, "replicates: "; bold=true) - println(io, design.replicates) -end \ No newline at end of file From 16b04448b7f673a3d7ce528911edd696e10b13bf Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Wed, 11 Jan 2023 21:09:15 +0530 Subject: [PATCH 28/80] Change `surveyio(io)` to `io` --- src/show.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/show.jl b/src/show.jl index 2af3d6d1..90782c5b 100644 --- a/src/show.jl +++ 
b/src/show.jl @@ -26,15 +26,15 @@ end "Print information about a survey design." Base.show(io::IO, ::MIME"text/plain", design::AbstractSurveyDesign) = - surveyshow(surveyio(io), design) + surveyshow(io, design) Base.show(io::IO, ::MIME"text/plain", design::SurveyDesign) = - surveyshow(surveyio(io), design) + surveyshow(io, design) function Base.show(io::IO, ::MIME"text/plain", design::ReplicateDesign) # new_io = IOContext(io, :compact=>true, :limit=>true, :displaysize=>(50, 50)) - surveyshow(surveyio(io), design) - printinfo(surveyio(io), "\nreplicates", design.replicates; newline=false) + surveyshow(io, design) + printinfo(io, "\nreplicates", design.replicates; newline=false) end function surveyshow(io::IO, design::AbstractSurveyDesign) From 5e102c7a0e2abc7e9ffe9c9cb6e6a16000a681b8 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Thu, 12 Jan 2023 10:44:52 +0530 Subject: [PATCH 29/80] Remove new line after `popsize`, `sampsize` and weights --- src/show.jl | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/show.jl b/src/show.jl index 90782c5b..a72c3250 100644 --- a/src/show.jl +++ b/src/show.jl @@ -11,7 +11,6 @@ function makeshort(x) if length(x) > 1 return "[" * (length(x) < 3 ? join(x, ", ") : join(x[1:3], ", ") * " … " * string(last(x))) * "]" end - return x end @@ -54,9 +53,9 @@ function surveyshow(io::IO, design::AbstractSurveyDesign) (string(design.cluster), "\n ", makeshort(design.data[!, design.cluster])) printinfo(io, "cluster", cluster_content...) 
# popsize and sampsize info - printinfo(io, "popsize", "\n ", makeshort(design.data[!, design.popsize])) - printinfo(io, "sampsize", "\n ", makeshort(design.data[!, design.sampsize])) + printinfo(io, "popsize", makeshort(design.data[!, design.popsize])) + printinfo(io, "sampsize", makeshort(design.data[!, design.sampsize])) # weights and probs info - printinfo(io, "weights", "\n ", makeshort(design.data[!, :weights])) - printinfo(io, "probs", "\n ", makeshort(design.data[!, :probs]); newline=false) + printinfo(io, "weights", makeshort(design.data[!, :weights])) + printinfo(io, "probs", makeshort(design.data[!, :probs]); newline=false) end From e95e5de5dd0dcb74cff8343098b3763f66144e45 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Thu, 12 Jan 2023 13:56:54 +0530 Subject: [PATCH 30/80] Change docstrings to account for new `show` --- src/Survey.jl | 2 +- src/SurveyDesign.jl | 38 ++++++++++++++++++-------------------- src/bootstrap.jl | 23 ++++++++++------------- src/boxplot.jl | 2 +- src/by.jl | 2 +- src/hist.jl | 2 +- src/jackknife.jl | 2 +- src/mean.jl | 12 ++++++------ src/ratio.jl | 11 ++++------- src/total.jl | 12 ++++++------ 10 files changed, 49 insertions(+), 57 deletions(-) diff --git a/src/Survey.jl b/src/Survey.jl index dd71a092..f25e33a7 100644 --- a/src/Survey.jl +++ b/src/Survey.jl @@ -38,4 +38,4 @@ export bootweights export jkknife export ratio -end \ No newline at end of file +end diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index 69657cd0..0cc1e6b0 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -28,19 +28,18 @@ individuals in one cluster are sampled. The clusters are considered disjoint and - `popsize::Union{Nothing, Int, Symbol}=nothing`: the (expected) survey population size. 
```jldoctest -julia> apiclus1 = load_data("apiclus1"); +julia> apistrat = load_data("apistrat"); -julia> dclus1 = SurveyDesign(apiclus1; clusters=:dnum, weights=:pw) +julia> strat = SurveyDesign(apistrat; strata=:stype, weights=:pw) SurveyDesign: -data: 183x46 DataFrame -cluster: dnum -design.data[!,design.cluster]: 637, 637, 637, ..., 448 -popsize: popsize -design.data[!,design.popsize]: 6190.0, 6190.0, 6190.0, ..., 6190.0 -sampsize: sampsize -design.data[!,design.sampsize]: 15, 15, 15, ..., 15 -design.data[!,:probs]: 0.0295, 0.0295, 0.0295, ..., 0.0295 -design.data[!,:allprobs]: 0.0295, 0.0295, 0.0295, ..., 0.0295 +data: 200×46 DataFrame +strata: stype + [E, E, E … H] +cluster: none +popsize: [6190.0, 6190.0, 6190.0 … 6190.0] +sampsize: [200, 200, 200 … 200] +weights: [44.2, 44.2, 44.2 … 15.1] +probs: [0.0226, 0.0226, 0.0226 … 0.0662] ``` """ struct SurveyDesign <: AbstractSurveyDesign @@ -107,15 +106,14 @@ julia> strat = SurveyDesign(apistrat; strata=:stype, weights=:pw); julia> bootstrat = bootweights(strat; replicates=1000) ReplicateDesign: -data: 200x1046 DataFrame -cluster: false_cluster -design.data[!,design.cluster]: 1, 2, 3, ..., 200 -popsize: popsize -design.data[!,design.popsize]: 6190.0, 6190.0, 6190.0, ..., 6190.0 -sampsize: sampsize -design.data[!,design.sampsize]: 200, 200, 200, ..., 200 -design.data[!,:probs]: 0.0226, 0.0226, 0.0226, ..., 0.0662 -design.data[!,:allprobs]: 0.0226, 0.0226, 0.0226, ..., 0.0662 +data: 200×1046 DataFrame +strata: stype + [E, E, E … H] +cluster: none +popsize: [6190.0, 6190.0, 6190.0 … 6190.0] +sampsize: [200, 200, 200 … 200] +weights: [44.2, 44.2, 44.2 … 15.1] +probs: [0.0226, 0.0226, 0.0226 … 0.0662] replicates: 1000 ``` """ diff --git a/src/bootstrap.jl b/src/bootstrap.jl index b4e226a8..83defc97 100644 --- a/src/bootstrap.jl +++ b/src/bootstrap.jl @@ -4,21 +4,18 @@ julia> using Random julia> apiclus1 = load_data("apiclus1"); -julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum); +julia> clus_one_stage = 
SurveyDesign(apiclus1; clusters = :dnum); -julia> rng = MersenneTwister(111); - -julia> bootweights(dclus1; replicates=1000, rng) +julia> bootweights(clus_one_stage; replicates=1000, rng=MersenneTwister(111)) # choose a seed for deterministic results ReplicateDesign: -data: 183x1046 DataFrame +data: 183×1046 DataFrame +strata: none cluster: dnum -design.data[!,design.cluster]: 637, 637, 637, ..., 448 -popsize: popsize -design.data[!,design.popsize]: 183, 183, 183, ..., 183 -sampsize: sampsize -design.data[!,design.sampsize]: 15, 15, 15, ..., 15 -design.data[!,:probs]: 1.0, 1.0, 1.0, ..., 1.0 -design.data[!,:allprobs]: 1.0, 1.0, 1.0, ..., 1.0 + [637, 637, 637 … 448] +popsize: [183, 183, 183 … 183] +sampsize: [15, 15, 15 … 15] +weights: [1, 1, 1 … 1] +probs: [1.0, 1.0, 1.0 … 1.0] replicates: 1000 ``` """ @@ -51,4 +48,4 @@ function bootweights(design::SurveyDesign; replicates=4000, rng=MersenneTwister( df[!, "replicate_" * string(i)] = disallowmissing(replicate(stratified, H).whij) end return ReplicateDesign(df, design.cluster, design.popsize, design.sampsize, design.strata, design.pps, replicates) -end \ No newline at end of file +end diff --git a/src/boxplot.jl b/src/boxplot.jl index 8790f116..8ee3dcc4 100644 --- a/src/boxplot.jl +++ b/src/boxplot.jl @@ -10,7 +10,7 @@ The keyword arguments are all the arguments that can be passed to `mapping` in ```@example boxplot apisrs = load_data("apisrs"); -srs = srs = SurveyDesign(apisrs; weights=:pw); +srs = SurveyDesign(apisrs; weights=:pw); bp = boxplot(srs, :stype, :enroll; weights = :pw) save("boxplot.png", bp); nothing # hide ``` diff --git a/src/by.jl b/src/by.jl index be26d5a3..a4de2f55 100644 --- a/src/by.jl +++ b/src/by.jl @@ -14,4 +14,4 @@ function bydomain(x::Symbol, domain::Symbol, design::ReplicateDesign, func::Func replace!(ses, NaN => 0) X.SE = ses return X -end \ No newline at end of file +end diff --git a/src/hist.jl b/src/hist.jl index 90d42d1b..17b54098 100644 --- a/src/hist.jl +++ b/src/hist.jl @@ -61,7 
+61,7 @@ For the complete argument list see [Makie.hist](https://makie.juliaplots.org/sta ```@example histogram apisrs = load_data("apisrs"); -srs = SimpleRandomSample(apisrs;popsize=:fpc); +srs = SurveyDesign(apisrs; weights=:pw); h = hist(srs, :enroll) save("hist.png", h); nothing # hide ``` diff --git a/src/jackknife.jl b/src/jackknife.jl index 794ef10b..55880df9 100644 --- a/src/jackknife.jl +++ b/src/jackknife.jl @@ -13,4 +13,4 @@ function jkknife(variable:: Symbol, design::SurveyDesign ,func:: Function; para end var = c*(nh-1)/nh return DataFrame(Statistic = statistic, SE = sqrt(var)) -end \ No newline at end of file +end diff --git a/src/mean.jl b/src/mean.jl index 0ef5bb37..c1d80259 100644 --- a/src/mean.jl +++ b/src/mean.jl @@ -6,16 +6,16 @@ Compute the estimated mean of one or more variables within a survey design. ```jldoctest julia> apiclus1 = load_data("apiclus1"); -julia> clus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) |> bootweights; +julia> clus_one_stage = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) |> bootweights; -julia> mean(:api00, clus1) +julia> mean(:api00, clus_one_stage) 1×2 DataFrame Row │ mean SE │ Float64 Float64 ─────┼────────────────── 1 │ 644.169 23.2919 -julia> mean([:api00, :enroll], clus1) +julia> mean([:api00, :enroll], clus_one_stage) 2×3 DataFrame Row │ names mean SE │ String Float64 Float64 @@ -45,9 +45,9 @@ Compute the estimated mean within a domain. 
```jldoctest julia> apiclus1 = load_data("apiclus1"); -julia> clus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) |> bootweights; +julia> clus_one_stage = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) |> bootweights; -julia> mean(:api00, :cname, clus1) +julia> mean(:api00, :cname, clus_one_stage) 11×3 DataFrame Row │ cname mean SE │ String15 Float64 Any @@ -70,4 +70,4 @@ function mean(x::Symbol, domain::Symbol, design::ReplicateDesign) df = bydomain(x, domain, design, weighted_mean) rename!(df, :statistic => :mean) return df -end \ No newline at end of file +end diff --git a/src/ratio.jl b/src/ratio.jl index 67e51668..1623eb3a 100644 --- a/src/ratio.jl +++ b/src/ratio.jl @@ -1,17 +1,14 @@ """ ratio(numerator, denominator, design) + Estimate the ratio of the columns specified in numerator and denominator ```jldoctest -julia> using Survey; - julia> apiclus1 = load_data("apiclus1"); -julia> apiclus1[!, :pw] = fill(757/15,(size(apiclus1,1),)); # Correct api mistake for pw column - -julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw); +julia> clus_one_stage = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw); -julia> ratio(:api00, :enroll, dclus1) +julia> ratio(:api00, :enroll, clus_one_stage) 1×2 DataFrame Row │ Statistic SE │ Float64 Float64 @@ -35,4 +32,4 @@ function ratio(variable_num:: Symbol, variable_den:: Symbol, design::SurveyDesig end var = c*(nh-1)/nh return DataFrame(Statistic = statistic, SE = sqrt(var)) -end \ No newline at end of file +end diff --git a/src/total.jl b/src/total.jl index e5fbbdcb..0c5001e5 100644 --- a/src/total.jl +++ b/src/total.jl @@ -6,16 +6,16 @@ Compute the estimated population total for one or more variables within a survey ```jldoctest julia> apiclus1 = load_data("apiclus1"); -julia> clus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) |> bootweights; +julia> clus_one_stage = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) |> bootweights; -julia> total(:api00, clus1) 
+julia> total(:api00, clus_one_stage) 1×2 DataFrame Row │ total SE │ Float64 Float64 ─────┼────────────────────── 1 │ 3.98999e6 9.22175e5 -julia> total([:api00, :enroll], clus1) +julia> total([:api00, :enroll], clus_one_stage) 2×3 DataFrame Row │ names total SE │ String Float64 Float64 @@ -45,9 +45,9 @@ Compute the estimated population total within a domain. ```jldoctest julia> apiclus1 = load_data("apiclus1"); -julia> clus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) |> bootweights; +julia> clus_one_stage = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) |> bootweights; -julia> total(:api00, :cname, clus1) +julia> total(:api00, :cname, clus_one_stage) 11×3 DataFrame Row │ cname total SE │ String15 Float64 Any @@ -68,4 +68,4 @@ julia> total(:api00, :cname, clus1) function total(x::Symbol, domain::Symbol, design::ReplicateDesign) df = bydomain(x, domain, design, wsum) rename!(df, :statistic => :total) -end \ No newline at end of file +end From 21c6bbd2c4af720eec7ded52e21d30c9972370c5 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Thu, 12 Jan 2023 14:50:42 +0530 Subject: [PATCH 31/80] Add tests for `show` --- test/runtests.jl | 3 +- test/show.jl | 111 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 113 insertions(+), 1 deletion(-) create mode 100644 test/show.jl diff --git a/test/runtests.jl b/test/runtests.jl index e8f18a3a..6bb9c738 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -16,4 +16,5 @@ include("mean.jl") include("plot.jl") include("hist.jl") include("boxplot.jl") -include("ratio.jl") \ No newline at end of file +include("ratio.jl") +include("show.jl") diff --git a/test/show.jl b/test/show.jl new file mode 100644 index 00000000..f81ed2a3 --- /dev/null +++ b/test/show.jl @@ -0,0 +1,111 @@ +@testset "No strata, no clusters" begin + io = IOBuffer() + + apisrs = load_data("apisrs") + srs = SurveyDesign(apisrs; weights=:pw) + refstr = """ + SurveyDesign: + data: 200×47 DataFrame + strata: none + cluster: none + 
popsize: [6190.0, 6190.0, 6190.0 … 6190.0] + sampsize: [200, 200, 200 … 200] + weights: [31.0, 31.0, 31.0 … 31.0] + probs: [0.0323, 0.0323, 0.0323 … 0.0323]""" + + show(io, MIME("text/plain"), srs) + str = String(take!(io)) + @test str == refstr + + bsrs = srs |> bootweights + refstrb = """ + ReplicateDesign: + data: 200×4047 DataFrame + strata: none + cluster: none + popsize: [6190.0, 6190.0, 6190.0 … 6190.0] + sampsize: [200, 200, 200 … 200] + weights: [31.0, 31.0, 31.0 … 31.0] + probs: [0.0323, 0.0323, 0.0323 … 0.0323] + replicates: 4000""" + + show(io, MIME("text/plain"), bsrs) + strb = String(take!(io)) + @test strb == refstrb +end + +@testset "With strata, no clusters" begin + io = IOBuffer() + + apistrat = load_data("apistrat") + strat = SurveyDesign(apistrat; strata=:stype, weights=:pw) + refstr = """ + SurveyDesign: + data: 200×46 DataFrame + strata: stype + [E, E, E … H] + cluster: none + popsize: [6190.0, 6190.0, 6190.0 … 6190.0] + sampsize: [200, 200, 200 … 200] + weights: [44.2, 44.2, 44.2 … 15.1] + probs: [0.0226, 0.0226, 0.0226 … 0.0662]""" + + show(io, MIME("text/plain"), strat) + str = String(take!(io)) + @test str == refstr + + stratb = strat |> bootweights + refstrb = """ + ReplicateDesign: + data: 200×4046 DataFrame + strata: stype + [E, E, E … H] + cluster: none + popsize: [6190.0, 6190.0, 6190.0 … 6190.0] + sampsize: [200, 200, 200 … 200] + weights: [44.2, 44.2, 44.2 … 15.1] + probs: [0.0226, 0.0226, 0.0226 … 0.0662] + replicates: 4000""" + + show(io, MIME("text/plain"), stratb) + strb = String(take!(io)) + @test strb == refstrb +end + +@testset "No strata, with clusters" begin + io = IOBuffer() + + apiclus1 = load_data("apiclus1") + clus_one_stage = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) + refstr = """ + SurveyDesign: + data: 183×46 DataFrame + strata: none + cluster: dnum + [637, 637, 637 … 448] + popsize: [6190.0, 6190.0, 6190.0 … 6190.0] + sampsize: [15, 15, 15 … 15] + weights: [33.8, 33.8, 33.8 … 33.8] + probs: [0.0295, 
0.0295, 0.0295 … 0.0295]""" + + show(io, MIME("text/plain"), clus_one_stage) + str = String(take!(io)) + @test str == refstr + + clus_one_stageb = clus_one_stage |> bootweights + refstrb = """ + ReplicateDesign: + data: 183×4046 DataFrame + strata: none + cluster: dnum + [637, 637, 637 … 448] + popsize: [6190.0, 6190.0, 6190.0 … 6190.0] + sampsize: [15, 15, 15 … 15] + weights: [33.8, 33.8, 33.8 … 33.8] + probs: [0.0295, 0.0295, 0.0295 … 0.0295] + replicates: 4000""" + + show(io, MIME("text/plain"), clus_one_stageb) + strb = String(take!(io)) + @test strb == refstrb +end From d3d0a445a4e39415b2bcdfd78ca6e1e1f0d960b2 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Fri, 13 Jan 2023 14:08:41 +0530 Subject: [PATCH 32/80] Update README according to new `show` --- README.md | 218 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 139 insertions(+), 79 deletions(-) diff --git a/README.md b/README.md index 65f70cd2..3f1542a3 100644 --- a/README.md +++ b/README.md @@ -6,111 +6,171 @@ [![codecov](https://codecov.io/gh/xKDR/Survey.jl/branch/main/graph/badge.svg?token=4PFSF47BT2)](https://codecov.io/gh/xKDR/Survey.jl) [![Milestones](https://img.shields.io/badge/-milestones-brightgreen)](https://github.com/xKDR/Survey.jl/milestones) +This package is used to study complex survey data. It aims to be a fast alternative +to the [Survey package in R](https://cran.r-project.org/web/packages/survey/index.html) +developed by [Professor Thomas Lumley](https://www.stat.auckland.ac.nz/people/tlum005). -This package is used to study complex survey data. It aims to be a fast alternative to the [Survey package in R](https://cran.r-project.org/web/packages/survey/index.html) developed by [Professor Thomas Lumley](https://www.stat.auckland.ac.nz/people/tlum005). +All types of survey design are supported by this package. -This package currently supports simple random sample and stratified sample. In future releases, it will support multistage sampling as well. 
+> **_NOTE:_** For multistage sampling a single stage approximation is used. For +more information see the [TODO](https://xkdr.github.io/Survey.jl/dev/) section of +the documentation. -## Documentation -See [Documentation](https://xkdr.github.io/Survey.jl/dev/) to learn how to use the package - -## How to install +## Installation ```julia ] add "https://github.com/xKDR/Survey.jl.git" ``` + ## Basic usage -### Simple Random Sample +The `SurveyDesign` constructor can take data corresponding to any type of design. +Depending on the keyword arguments passed, the data is processed in order to obtain +correct results for the given design. -In the following example, we will load a simple random sample of the Academic Performance Index dataset for Californian schools and do basic analysis. -```julia -using Survey +The following examples show how to create and manipulate different survey designs +using the [Academic Performance Index dataset for Californian schools](https://r-survey.r-forge.r-project.org/survey/html/api.html). + +### Constructing a survey design + +A survey design can be created by calling the constructor with some keywords, +depending on the survey type. Let's create a simple random sample, a stratified +sample, a single-stage and a two-stage cluster sample. 
-srs = load_data("apisrs") +```julia +julia> apisrs = load_data("apisrs"); + +julia> srs = SurveyDesign(apisrs; weights=:pw) +SurveyDesign: +data: 200×47 DataFrame +strata: none +cluster: none +popsize: [6190.0, 6190.0, 6190.0 … 6190.0] +sampsize: [200, 200, 200 … 200] +weights: [31.0, 31.0, 31.0 … 31.0] +probs: [0.0323, 0.0323, 0.0323 … 0.0323] + +julia> apistrat = load_data("apistrat"); + +julia> strat = SurveyDesign(apistrat; strata=:stype, weights=:pw) +SurveyDesign: +data: 200×46 DataFrame +strata: stype + [E, E, E … H] +cluster: none +popsize: [6190.0, 6190.0, 6190.0 … 6190.0] +sampsize: [200, 200, 200 … 200] +weights: [44.2, 44.2, 44.2 … 15.1] +probs: [0.0226, 0.0226, 0.0226 … 0.0662] + +julia> apiclus1 = load_data("apiclus1"); + +julia> clus_one_stage = SurveyDesign(apiclus1; clusters=:dnum, weights=:pw) +SurveyDesign: +data: 183×46 DataFrame +strata: none +cluster: dnum + [637, 637, 637 … 448] +popsize: [6190.0, 6190.0, 6190.0 … 6190.0] +sampsize: [15, 15, 15 … 15] +weights: [33.8, 33.8, 33.8 … 33.8] +probs: [0.0295, 0.0295, 0.0295 … 0.0295] + +julia> apiclus2 = load_data("apiclus2"); + +julia> clus_two_stage = SurveyDesign(apiclus2; clusters=[:dnum, :snum], weights=:pw) +SurveyDesign: +data: 126×47 DataFrame +strata: none +cluster: dnum + [15, 63, 83 … 795] +popsize: [5130.0, 5130.0, 5130.0 … 5130.0] +sampsize: [40, 40, 40 … 40] +weights: [18.9, 18.9, 18.9 … 18.9] +probs: [0.0528, 0.0528, 0.0528 … 0.0528] +``` -dsrs = SimpleRandomSample(srs; weights = :pw) +Using these designs we can compute estimates of statistics such as mean and +population total. The designs must first be resampled using +[bootstrapping](https://en.wikipedia.org/wiki/Bootstrapping_(statistics)) in order +to compute the standard errors. 
-mean(:api00, dsrs) +```julia +julia> bootsrs = bootweights(srs; replicates=1000) +ReplicateDesign: +data: 200×1047 DataFrame +strata: none +cluster: none +popsize: [6190.0, 6190.0, 6190.0 … 6190.0] +sampsize: [200, 200, 200 … 200] +weights: [31.0, 31.0, 31.0 … 31.0] +probs: [0.0323, 0.0323, 0.0323 … 0.0323] +replicates: 1000 + +julia> mean(:api00, bootsrs) 1×2 DataFrame - Row │ mean SE - │ Float64 Float64 + Row │ mean SE + │ Float64 Float64 ─────┼────────────────── - 1 │ 656.585 9.24972 + 1 │ 656.585 9.5409 -total(:enroll, dsrs) +julia> total(:enroll, bootsrs) 1×2 DataFrame - Row │ total SE - │ Float64 Float64 -─────┼───────────────────── - 1 │ 3.62107e6 1.6952e5 - -mean(:api00, :cname, dsrs) -38×3 DataFrame - Row │ cname mean SE - │ String15 Float64 Float64 -─────┼──────────────────────────────────── - 1 │ Kern 573.6 42.8026 - 2 │ Los Angeles 658.156 21.0728 - 3 │ Orange 749.333 27.0613 - ⋮ │ ⋮ ⋮ ⋮ - 36 │ Napa 727.0 46.722 - 37 │ Lake 804.0 NaN - 38 │ Merced 595.0 NaN - -quantile(:enroll,dsrs,[0.1,0.2,0.5,0.75,0.95]) -5×2 DataFrame - Row │ probability quantile - │ Float64 Float64 -─────┼─────────────────────── - 1 │ 0.1 245.5 - 2 │ 0.2 317.6 - 3 │ 0.5 453.0 - 4 │ 0.75 668.5 - 5 │ 0.95 1473.1 + Row │ total SE + │ Float64 Float64 +─────┼────────────────────── + 1 │ 3.62107e6 1.72846e5 ``` -### Stratified Sample - -In the following example, we will load a stratified sample of the Academic Performance Index dataset for Californian schools and do basic analysis. +Now we know the mean academic performance index from the year 2000 and the total +number of students enrolled in the sampled Californian schools. We can also +calculate the statistic of multiple variables in one go... ```julia -using Survey +julia> mean([:api99, :api00], bootsrs) +2×3 DataFrame + Row │ names mean SE + │ String Float64 Float64 +─────┼────────────────────────── + 1 │ api99 624.685 9.84669 + 2 │ api00 656.585 9.5409 +``` + +... 
or we can calculate domain estimates: -strat = load_data("apistrat") +```julia +julia> total(:enroll, :cname, bootsrs) +38×3 DataFrame + Row │ cname total SE + │ String15 Float64 Any +─────┼──────────────────────────────────────────── + 1 │ Kern 1.95823e5 74731.2 + 2 │ Los Angeles 867129.0 1.36622e5 + 3 │ Orange 1.68786e5 63858.0 + 4 │ San Luis Obispo 6720.49 6790.49 + ⋮ │ ⋮ ⋮ ⋮ + 35 │ Calaveras 12976.4 13241.6 + 36 │ Napa 39239.0 30181.9 + 37 │ Lake 6410.79 6986.29 + 38 │ Merced 15392.1 15202.2 + 30 rows omitted +``` -dstrat = StratifiedSample(strat, :stype; weights = :pw, popsize = :fpc) +This gives us the total number of enrolled students in each county. -mean(:api00, dstrat) -1×2 DataFrame - Row │ mean SE - │ Float64 Float64 -─────┼────────────────── - 1 │ 662.287 9.40894 +All functionalities are supported by each design type. For a more complete guide, +see the [Tutorial](https://xkdr.github.io/Survey.jl/dev/#Basic-demo) section in +the documentation. -total(:api00, dstrat) -1×2 DataFrame - Row │ total SE - │ Float64 Float64 -─────┼──────────────────── - 1 │ 4.10221e6 58279.0 - -mean(:api00, :cname, dstrat) -40×3 DataFrame - Row │ cname mean SE - │ String15 Float64 Float64 -─────┼─────────────────────────────────────── - 1 │ Los Angeles 633.511 21.3912 - 2 │ Ventura 707.172 31.6856 - 3 │ Kern 678.235 53.1337 - ⋮ │ ⋮ ⋮ ⋮ - 39 │ Mendocino 632.018 1.04942 - 40 │ Butte 627.0 0.0 -``` +## Future goals -## Strategic goals -We want to implement all the features provided by the [Survey package in R](https://cran.r-project.org/web/packages/survey/index.html) +We want to implement all the features provided by the +[Survey package in R](https://cran.r-project.org/web/packages/survey/index.html) +in a Julia-native way. The main goal is to have a complete package that provides +a large range of functionality and takes efficiency into consideration, such that +large surveys can be analysed fast. 
-The [milestones](https://github.com/xKDR/Survey.jl/milestones) sections of the repository contains a list of features that contributors can implement in the short-term. +The [milestones](https://github.com/xKDR/Survey.jl/milestones) section of the repository +contains a list of features that contributors can implement in the short-term. ## Support From dbc6ba771cbbffb038291b560a04adeb9e6e87de Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Fri, 13 Jan 2023 14:12:16 +0530 Subject: [PATCH 33/80] Change Future goals to Goals --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3f1542a3..df4f5696 100644 --- a/README.md +++ b/README.md @@ -161,7 +161,7 @@ All functionalities are supported by each design type. For a more complete guide see the [Tutorial](https://xkdr.github.io/Survey.jl/dev/#Basic-demo) section in the documentation. -## Future goals +## Goals We want to implement all the features provided by the [Survey package in R](https://cran.r-project.org/web/packages/survey/index.html) From bd59a671dcf2154e7378cf0bf5bc37db33dd8ef0 Mon Sep 17 00:00:00 2001 From: smishr Date: Fri, 13 Jan 2023 20:28:09 +0530 Subject: [PATCH 34/80] add ht.jl to gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index e4441b23..494288f6 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ .gitignore .DS_Store *.json +src/ht.jl \ No newline at end of file From 3f4c07ff30d21f9dd204c803ac820c88484e5540 Mon Sep 17 00:00:00 2001 From: smishr Date: Sat, 14 Jan 2023 01:42:05 +0530 Subject: [PATCH 35/80] digits=4 in show, combine not popsize in weight, strat tests --- src/SurveyDesign.jl | 48 +++++++++++++++++++++++++-------------------- src/show.jl | 6 +++--- 2 files changed, 30 insertions(+), 24 deletions(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index 596d925a..c2dde058 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -25,21 +25,22 @@ individuals in one 
cluster are sampled. The clusters are considered disjoint and - `strata::Union{Nothing, Symbol}=nothing`: the stratification variable. - `clusters::Union{Nothing, Symbol, Vector{Symbol}}=nothing`: the clustering variable. - `weights::Union{Nothing, Symbol}=nothing`: the sampling weights. -- `popsize::Union{Nothing, Int, Symbol}=nothing`: the (expected) survey population size. +- `popsize::Union{Nothing, Symbol}=nothing`: the (expected) survey population size. ```jldoctest -julia> apistrat = load_data("apistrat"); +julia> apiclus1 = load_data("apiclus1"); -julia> strat = SurveyDesign(apistrat; strata=:stype, weights=:pw) +julia> dclus1 = SurveyDesign(apiclus1; clusters=:dnum, strata=:stype, weights=:pw) SurveyDesign: -data: 200×46 DataFrame +data: 183×43 DataFrame strata: stype - [E, E, E … H] -cluster: none -popsize: [6190.0, 6190.0, 6190.0 … 6190.0] -sampsize: [200, 200, 200 … 200] -weights: [44.2, 44.2, 44.2 … 15.1] -probs: [0.0226, 0.0226, 0.0226 … 0.0662] + [H, E, E … E] +cluster: dnum + [637, 637, 637 … 448] +popsize: [507.7049, 507.7049, 507.7049 … 507.7049] +sampsize: [15, 15, 15 … 15] +weights: [33.847, 33.847, 33.847 … 33.847] +allprobs: [0.0295, 0.0295, 0.0295 … 0.0295] ``` """ struct SurveyDesign <: AbstractSurveyDesign @@ -74,28 +75,33 @@ struct SurveyDesign <: AbstractSurveyDesign if typeof(clusters) <: Symbol cluster = clusters end - # For one-stage sample only one sampsize vector + # For single-stage approximation only one "effective" sampsize vector sampsize_labels = :sampsize - data[!, sampsize_labels] = fill(length(unique(data[!, cluster])), (nrow(data),)) - if !(typeof(popsize) <: Nothing) + if isa(strata,Symbol) && isnothing(clusters) # If stratified sample then sampsize is inside strata + data[!, sampsize_labels] = transform(groupby(data, strata), nrow => :counts).counts + else + data[!, sampsize_labels] = fill(length(unique(data[!, cluster])), (nrow(data),)) + end + if isa(popsize, Symbol) weights_labels = :weights data[!, weights_labels] = 
data[!, popsize] ./ data[!, sampsize_labels] - elseif typeof(weights) <: Symbol + elseif isa(weights, Symbol) if !(typeof(data[!, weights]) <: Vector{<:Real}) - error(string("given weights column ", weights , " is not of numeric type")) + throw(ArgumentError(string("given weights column ", weights , " is not of numeric type"))) + else + weights_labels = weights + # derive popsize from given `weights` + popsize = :popsize + data[!, popsize] = data[!, sampsize_labels] .* data[!, weights_labels] end - weights_labels = weights else + # neither popsize nor weights given weights_labels = :weights data[!, weights_labels] = repeat([1], nrow(data)) end allprobs_labels = :allprobs data[!, allprobs_labels] = 1 ./ data[!, weights_labels] # In one-stage cluster sample, allprobs is just probs, no multiplication needed - pps = false # for now no explicit pps support - if !(typeof(popsize) <: Symbol) - popsize = :popsize - data[!,popsize] = repeat([sum(data[!, weights_labels])], nrow(data)) - end + pps = false # for now no explicit pps supported faster functions, but they can be added new(data, cluster, popsize, sampsize_labels, strata, weights_labels, allprobs_labels, pps) end end diff --git a/src/show.jl b/src/show.jl index a72c3250..730d506b 100644 --- a/src/show.jl +++ b/src/show.jl @@ -5,7 +5,7 @@ Helper function that transforms a given `Number` or `Vector` into a short-form s """ function makeshort(x) if isa(x[1], Float64) - x = round.(x, sigdigits=3) + x = round.(x, digits=4) # Rounded to 4 digits after the decimal place end # print short vectors or single values as they are, compress otherwise if length(x) > 1 @@ -56,6 +56,6 @@ function surveyshow(io::IO, design::AbstractSurveyDesign) printinfo(io, "popsize", makeshort(design.data[!, design.popsize])) printinfo(io, "sampsize", makeshort(design.data[!, design.sampsize])) # weights and probs info - printinfo(io, "weights", makeshort(design.data[!, :weights])) - printinfo(io, "probs", makeshort(design.data[!, :probs]); 
newline=false) + printinfo(io, "weights", makeshort(design.data[!, design.weights])) + printinfo(io, "allprobs", makeshort(design.data[!, design.allprobs]); newline=false) end From c00d5937d298a90319de903e4eb8c8c9985435bd Mon Sep 17 00:00:00 2001 From: smishr Date: Sat, 14 Jan 2023 01:42:22 +0530 Subject: [PATCH 36/80] stratified tests --- test/SurveyDesign.jl | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/test/SurveyDesign.jl b/test/SurveyDesign.jl index be515926..baa07b45 100644 --- a/test/SurveyDesign.jl +++ b/test/SurveyDesign.jl @@ -21,7 +21,7 @@ ############################## ### Weights as non-numeric error apisrs = copy(apisrs_original) - @test_throws ErrorException SurveyDesign(apisrs, weights=:stype) + @test_throws ArgumentError SurveyDesign(apisrs, weights=:stype) end @testset "SurveyDesign_strat" begin @@ -34,13 +34,25 @@ end ### weights as Symbol apistrat = copy(apistrat_original) strat_wt = SurveyDesign(apistrat, strata=:stype, weights=:pw) + @test strat_wt.data[!,strat_wt.weights][1] ≈ 44.2100 atol = 1e-4 + @test strat_wt.data[!,strat_wt.weights][200] ≈ 15.1000 atol = 1e-4 @test strat_wt.data[!,strat_wt.weights] == 1 ./ strat_wt.data[!,strat_wt.allprobs] - ### popsize as Symbol + ### popsize as Symbol (should be same as above) apistrat = copy(apistrat_original) strat_pop = SurveyDesign(apistrat, strata=:stype, popsize=:fpc) + @test strat_pop.data[!,strat_pop.weights][1] ≈ 44.2100 atol = 1e-4 + @test strat_pop.data[!,strat_pop.weights][200] ≈ 15.1000 atol = 1e-4 @test strat_pop.data[!,strat_pop.weights] == 1 ./ strat_pop.data[!,strat_pop.allprobs] + ### popsize and weights as Symbol (should be same as above two) + apistrat = copy(apistrat_original) + dstrat = SurveyDesign(apistrat, strata=:stype, weights=:pw, popsize=:fpc) + @test dstrat.data[!,dstrat.weights][1] ≈ 44.2100 atol = 1e-4 + @test dstrat.data[!,dstrat.weights][200] ≈ 15.1000 atol = 1e-4 + @test dstrat.data[!,dstrat.weights] == 1 ./ 
dstrat.data[!,dstrat.allprobs] ############################## - # @test strat_pop.data[!,strat_pop.weights] == strat_wt.data[!,strat_wt.weights] + # Check all three ways get equivalent weights + @test strat_pop.data[!,strat_pop.weights] ≈ strat_wt.data[!,strat_wt.weights] rtol = 1e-4 + @test strat_wt.data[!,strat_wt.weights] ≈ strat_wt.data[!,strat_wt.weights] rtol = 1e-4 end @testset "SurveyDesign_multistage" begin @@ -51,10 +63,8 @@ end # one-stage cluster sample with popsize apiclus1 = copy(apiclus1_original) dclus1 = SurveyDesign(apiclus1; clusters = :dnum, popsize =:fpc) - @test dclus1.data[!, :weights] ≈ fill(50.4667,size(apiclus1,1)) atol = 1e-3 - @test dclus1.data[!,dclus1.sampsize] ≈ fill(15,size(apiclus1,1)) - @test dclus1.data[!,:allprobs] ≈ dclus1.data[!,:probs] atol = 1e-4 - + @test dclus1.data[!, dclus1.weights] ≈ fill(50.4667,size(apiclus1,1)) atol = 1e-3 + @test dclus1.data[!, dclus1.sampsize] ≈ fill(15,size(apiclus1,1)) ############################## # Load API datasets nhanes = load_data("nhanes") @@ -68,9 +78,9 @@ end ############################## # NHANES nhanes = copy(nhanes_original) - dnhanes = SingleStageSurveyDesign(nhanes; cluster = :SDMVPSU, strata=:SDMVSTRA, weights=:WTMEC2YR) + dnhanes = SurveyDesign(nhanes; clusters = :SDMVPSU, strata=:SDMVSTRA, weights=:WTMEC2YR) ############################## # YRBS yrbs = copy(yrbs_original) - dyrbs = SingleStageSurveyDesign(yrbs; cluster = :psu, strata=:stratum, weights=:weight) + dyrbs = SurveyDesign(yrbs; clusters = :psu, strata=:stratum, weights=:weight) end From ece7181e9417c595c06639f585a0df685282b550 Mon Sep 17 00:00:00 2001 From: smishr Date: Sun, 15 Jan 2023 11:44:04 +0530 Subject: [PATCH 37/80] Add apiclus2 and sampsize testing --- src/SurveyDesign.jl | 2 +- test/SurveyDesign.jl | 67 ++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 63 insertions(+), 6 deletions(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index c2dde058..ff67f682 100644 --- a/src/SurveyDesign.jl 
+++ b/src/SurveyDesign.jl @@ -89,8 +89,8 @@ struct SurveyDesign <: AbstractSurveyDesign if !(typeof(data[!, weights]) <: Vector{<:Real}) throw(ArgumentError(string("given weights column ", weights , " is not of numeric type"))) else - weights_labels = weights # derive popsize from given `weights` + weights_labels = weights popsize = :popsize data[!, popsize] = data[!, sampsize_labels] .* data[!, weights_labels] end diff --git a/test/SurveyDesign.jl b/test/SurveyDesign.jl index baa07b45..de74f2f4 100644 --- a/test/SurveyDesign.jl +++ b/test/SurveyDesign.jl @@ -11,11 +11,15 @@ srs_weights = SurveyDesign(apisrs, weights=:pw) @test srs_weights.data[!,srs_weights.weights][1] ≈ 30.97 atol = 1e-4 @test srs_weights.data[!,srs_weights.weights] == 1 ./ srs_weights.data[!,srs_weights.allprobs] + @test srs_weights.data[!,srs_weights.allprobs] ≈ srs_weights.data[!, :derived_probs] atol = 1e-4 + @test srs_weights.data[!,srs_weights.sampsize] ≈ srs_weights.data[!, :derived_sampsize] atol = 1e-4 ### popsize as Symbol apisrs = copy(apisrs_original) srs_pop = SurveyDesign(apisrs, popsize=:fpc) @test srs_pop.data[!,srs_pop.weights][1] ≈ 30.97 atol = 1e-4 @test srs_pop.data[!,srs_pop.weights] == 1 ./ srs_pop.data[!,srs_pop.allprobs] + @test srs_pop.data[!,srs_pop.allprobs] ≈ srs_pop.data[!, :derived_probs] atol = 1e-4 + @test srs_pop.data[!,srs_pop.sampsize] ≈ srs_pop.data[!, :derived_sampsize] atol = 1e-4 ### Both ways should achieve same weights and allprobs! 
@test srs_pop.data[!,srs_pop.weights] == srs_weights.data[!,srs_weights.weights] ############################## @@ -37,38 +41,91 @@ end @test strat_wt.data[!,strat_wt.weights][1] ≈ 44.2100 atol = 1e-4 @test strat_wt.data[!,strat_wt.weights][200] ≈ 15.1000 atol = 1e-4 @test strat_wt.data[!,strat_wt.weights] == 1 ./ strat_wt.data[!,strat_wt.allprobs] - ### popsize as Symbol (should be same as above) + @test strat_wt.data[!,strat_wt.allprobs] ≈ strat_wt.data[!, :derived_probs] atol = 1e-4 + @test strat_wt.data[!,strat_wt.sampsize] ≈ strat_wt.data[!, :derived_sampsize] atol = 1e-4 + ### popsize as Symbol (should be same as above (for now)) apistrat = copy(apistrat_original) strat_pop = SurveyDesign(apistrat, strata=:stype, popsize=:fpc) @test strat_pop.data[!,strat_pop.weights][1] ≈ 44.2100 atol = 1e-4 @test strat_pop.data[!,strat_pop.weights][200] ≈ 15.1000 atol = 1e-4 @test strat_pop.data[!,strat_pop.weights] == 1 ./ strat_pop.data[!,strat_pop.allprobs] + @test strat_pop.data[!,strat_pop.allprobs] ≈ strat_pop.data[!, :derived_probs] atol = 1e-4 + @test strat_pop.data[!,strat_pop.sampsize] ≈ strat_pop.data[!, :derived_sampsize] atol = 1e-4 ### popsize and weights as Symbol (should be same as above two) apistrat = copy(apistrat_original) dstrat = SurveyDesign(apistrat, strata=:stype, weights=:pw, popsize=:fpc) @test dstrat.data[!,dstrat.weights][1] ≈ 44.2100 atol = 1e-4 @test dstrat.data[!,dstrat.weights][200] ≈ 15.1000 atol = 1e-4 @test dstrat.data[!,dstrat.weights] == 1 ./ dstrat.data[!,dstrat.allprobs] + @test dstrat.data[!,dstrat.allprobs] ≈ dstrat.data[!, :derived_probs] atol = 1e-4 + @test dstrat.data[!,dstrat.sampsize] ≈ dstrat.data[!, :derived_sampsize] atol = 1e-4 ############################## # Check all three ways get equivalent weights @test strat_pop.data[!,strat_pop.weights] ≈ strat_wt.data[!,strat_wt.weights] rtol = 1e-4 @test strat_wt.data[!,strat_wt.weights] ≈ strat_wt.data[!,strat_wt.weights] rtol = 1e-4 end -@testset "SurveyDesign_multistage" begin 
+@testset "SurveyDesign_apiclus1" begin # Load API datasets apiclus1_original = load_data("apiclus1") apiclus1_original[!, :pw] = fill(757/15,(size(apiclus1_original,1),)) # Correct api mistake for pw column + apiclus1_original[!, :derived_probs] = 1 ./ apiclus1_original.pw ############################## # one-stage cluster sample with popsize apiclus1 = copy(apiclus1_original) dclus1 = SurveyDesign(apiclus1; clusters = :dnum, popsize =:fpc) @test dclus1.data[!, dclus1.weights] ≈ fill(50.4667,size(apiclus1,1)) atol = 1e-3 @test dclus1.data[!, dclus1.sampsize] ≈ fill(15,size(apiclus1,1)) - ############################## + @test dclus1.data[!, dclus1.allprobs] ≈ dclus1.data[!, :derived_probs] atol = 1e-4 +end + +@testset "SurveyDesign_apiclus2" begin # Load API datasets - nhanes = load_data("nhanes") - nhanes_design = SurveyDesign(nhanes; clusters = :SDMVPSU, strata = :SDMVSTRA, weights = :WTMEC2YR) + apiclus2_original = load_data("apiclus2") + apiclus2_original[!, :derived_probs] = 1 ./ apiclus2_original.pw + ############################## + calculated_probs_R = [0.052840159, 0.052840159, 0.052840159, 0.052840159, 0.052840159, + 0.052840159, 0.052840159, 0.052840159, 0.052840159, 0.052840159, + 0.052840159, 0.052840159, 0.052840159, 0.052840159, 0.052840159, + 0.052840159, 0.052840159, 0.052840159, 0.052840159, 0.052840159, + 0.052840159, 0.024018254, 0.024018254, 0.024018254, 0.024018254, + 0.024018254, 0.052840159, 0.052840159, 0.052840159, 0.052840159, + 0.052840159, 0.052840159, 0.052840159, 0.052840159, 0.052840159, + 0.052840159, 0.052840159, 0.052840159, 0.052840159, 0.052840159, + 0.052840159, 0.052840159, 0.052840159, 0.052840159, 0.052840159, + 0.052840159, 0.052840159, 0.052840159, 0.052840159, 0.052840159, + 0.052840159, 0.052840159, 0.052840159, 0.052840159, 0.052840159, + 0.052840159, 0.052840159, 0.052840159, 0.052840159, 0.052840159, + 0.052840159, 0.052840159, 0.052840159, 0.052840159, 0.052840159, + 0.052840159, 0.052840159, 0.007338911, 
0.007338911, 0.007338911, + 0.007338911, 0.007338911, 0.052840159, 0.009435743, 0.009435743, + 0.009435743, 0.009435743, 0.009435743, 0.037742970, 0.037742970, + 0.037742970, 0.037742970, 0.037742970, 0.003669455, 0.003669455, + 0.003669455, 0.003669455, 0.003669455, 0.018871485, 0.018871485, + 0.018871485, 0.018871485, 0.018871485, 0.037742970, 0.037742970, + 0.037742970, 0.037742970, 0.037742970, 0.052840159, 0.052840159, + 0.052840159, 0.052840159, 0.052840159, 0.052840159, 0.052840159, + 0.052840159, 0.052840159, 0.052840159, 0.052840159, 0.052840159, + 0.052840159, 0.052840159, 0.029355644, 0.029355644, 0.029355644, + 0.029355644, 0.029355644, 0.052840159, 0.052840159, 0.052840159, + 0.044033465, 0.044033465, 0.044033465, 0.044033465, 0.044033465, + 0.052840159] + + # two stage cluster sampling `with replacement' + apiclus2 = copy(apiclus2_original) + dclus2 = SurveyDesign(apiclus2; clusters = [:dnum,:snum], weights=:pw) # cant pass popsize as Vector + @test dclus2.data[!,dclus2.weights][1] ≈ 1 / calculated_probs_R[1] atol = 1e-4 + @test dclus2.data[!,dclus2.weights][25] ≈ 1 / calculated_probs_R[25] atol = 1e-4 + @test dclus2.data[!,dclus2.weights][121] ≈ 1 / calculated_probs_R[121] atol = 1e-4 + @test dclus2.data[!,dclus2.weights][125] ≈ 1 / calculated_probs_R[125] atol = 1e-4 + + # TODO: sampsize and popsize testing + ## NOT THE SAME AS R object right now + + ######################### + ## Complete multistage sampling (when implemented) should look like + ## weights should theoretically be optional if both clusters and popsize given + # dclus2_complete = SurveyDesign(apiclus2; clusters = [:dnum,:snum], popsize=[:fpc1,:fpc2], {weights=:pw}) end @testset "SurveyDesign_realSurveys" begin From 266fad0c885514ebc1d10d080665daba08cada86 Mon Sep 17 00:00:00 2001 From: smishr Date: Sun, 15 Jan 2023 12:30:33 +0530 Subject: [PATCH 38/80] bootstrap change :weights to design.weights --- src/bootstrap.jl | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) 
diff --git a/src/bootstrap.jl b/src/bootstrap.jl index 83defc97..a5e9f019 100644 --- a/src/bootstrap.jl +++ b/src/bootstrap.jl @@ -4,18 +4,18 @@ julia> using Random julia> apiclus1 = load_data("apiclus1"); -julia> clus_one_stage = SurveyDesign(apiclus1; clusters = :dnum); +julia> clus_one_stage = SurveyDesign(apiclus1; clusters = :dnum, popsize=:fpc); julia> bootweights(clus_one_stage; replicates=1000, rng=MersenneTwister(111)) # choose a seed for deterministic results ReplicateDesign: -data: 183×1046 DataFrame +data: 183×1044 DataFrame strata: none cluster: dnum [637, 637, 637 … 448] -popsize: [183, 183, 183 … 183] +popsize: [757, 757, 757 … 757] sampsize: [15, 15, 15 … 15] -weights: [1, 1, 1 … 1] -probs: [1.0, 1.0, 1.0 … 1.0] +weights: [50.4667, 50.4667, 50.4667 … 50.4667] +allprobs: [0.0198, 0.0198, 0.0198 … 0.0198] replicates: 1000 ``` """ @@ -34,7 +34,7 @@ function bootweights(design::SurveyDesign; replicates=4000, rng=MersenneTwister( rh = [(count(==(i), randinds)) for i in 1:nh] # main bootstrap algo. 
gdf = groupby(substrata, design.cluster) for i in 1:nh - gdf[i].whij = repeat([rh[i]], nrow(gdf[i])) .* gdf[i].weights .* (nh / (nh - 1)) + gdf[i].whij = repeat([rh[i]], nrow(gdf[i])) .* gdf[i][!,design.weights] .* (nh / (nh - 1)) end stratified[h].whij = transform(gdf).whij @@ -47,5 +47,5 @@ function bootweights(design::SurveyDesign; replicates=4000, rng=MersenneTwister( for i in 2:(replicates) df[!, "replicate_" * string(i)] = disallowmissing(replicate(stratified, H).whij) end - return ReplicateDesign(df, design.cluster, design.popsize, design.sampsize, design.strata, design.pps, replicates) + return ReplicateDesign(df, design.cluster, design.popsize, design.sampsize, design.strata, design.weights, design.allprobs, design.pps, replicates) end From d59b9a48131b86a66ee7c0c79d28a8a79e6512de Mon Sep 17 00:00:00 2001 From: smishr Date: Sun, 15 Jan 2023 12:31:33 +0530 Subject: [PATCH 39/80] Change :weights to design.weights --- src/by.jl | 2 +- src/hist.jl | 2 +- src/mean.jl | 6 +++--- src/plot.jl | 2 +- src/quantile.jl | 2 +- src/total.jl | 2 +- test/plot.jl | 2 +- 7 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/by.jl b/src/by.jl index a4de2f55..cea2187d 100644 --- a/src/by.jl +++ b/src/by.jl @@ -1,7 +1,7 @@ function bydomain(x::Symbol, domain::Symbol, design::ReplicateDesign, func::Function) gdf = groupby(design.data, domain) nd = length(unique(design.data[!, domain])) - X = combine(gdf, [x, :weights] => ((a, b) -> func(a, weights(b))) => :statistic) + X = combine(gdf, [x, design.weights] => ((a, b) -> func(a, weights(b))) => :statistic) Xt_mat = Array{Float64, 2}(undef, (nd, design.replicates)) for i in 1:design.replicates Xt_mat[:, i] = combine(gdf, [x, Symbol("replicate_"*string(i))] => ((a, c) -> func(a, weights(c))) => :statistic).statistic diff --git a/src/hist.jl b/src/hist.jl index 17b54098..40935a1e 100644 --- a/src/hist.jl +++ b/src/hist.jl @@ -74,7 +74,7 @@ function hist(design::AbstractSurveyDesign, var::Symbol, kwargs... 
) hist = histogram(bins = bins, normalization = normalization, kwargs...) - data(design.data) * mapping(var, weights = :weights) * hist |> draw + data(design.data) * mapping(var, weights = design.weights) * hist |> draw end function hist(design::AbstractSurveyDesign, var::Symbol, diff --git a/src/mean.jl b/src/mean.jl index c1d80259..593b1d79 100644 --- a/src/mean.jl +++ b/src/mean.jl @@ -10,8 +10,8 @@ julia> clus_one_stage = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) julia> mean(:api00, clus_one_stage) 1×2 DataFrame - Row │ mean SE - │ Float64 Float64 + Row │ mean SE + │ Float64 Float64 ─────┼────────────────── 1 │ 644.169 23.2919 @@ -25,7 +25,7 @@ julia> mean([:api00, :enroll], clus_one_stage) ``` """ function mean(x::Symbol, design::ReplicateDesign) - X = mean(design.data[!, x], weights(design.data.weights)) + X = mean(design.data[!, x], weights(design.data[!,design.weights])) Xt = [mean(design.data[!, x], weights(design.data[! , "replicate_"*string(i)])) for i in 1:design.replicates] variance = sum((Xt .- X).^2) / design.replicates DataFrame(mean = X, SE = sqrt(variance)) diff --git a/src/plot.jl b/src/plot.jl index 7dd4f555..79f1b97d 100644 --- a/src/plot.jl +++ b/src/plot.jl @@ -16,5 +16,5 @@ save("scatter.png", s); nothing # hide ![](assets/scatter.png) """ function plot(design::AbstractSurveyDesign, x::Symbol, y::Symbol; kwargs...) - data(design.data) * mapping(x, y, markersize = :weights) * visual(Scatter, marker = '○') |> draw + data(design.data) * mapping(x, y, markersize = design.weights) * visual(Scatter, marker = '○') |> draw end diff --git a/src/quantile.jl b/src/quantile.jl index d4e399a5..09ba9326 100644 --- a/src/quantile.jl +++ b/src/quantile.jl @@ -37,7 +37,7 @@ julia> quantile(:enroll,srs,[0.1,0.2,0.5,0.75,0.95]) function quantile(var::Symbol, design::SurveyDesign, p::Union{<:Real,Vector{<:Real}}; alpha::Float64=0.05, ci::Bool=false, se::Bool=false, qrule="hf7",kwargs...) 
v = design.data[!, var] - probs = design.data[!, :probs] + probs = design.data[!, design.allprobs] df = DataFrame(probability=p, quantile=Statistics.quantile(v, ProbabilityWeights(probs), p)) # TODO: Add CI and SE of the quantile return df diff --git a/src/total.jl b/src/total.jl index 0c5001e5..1b200797 100644 --- a/src/total.jl +++ b/src/total.jl @@ -25,7 +25,7 @@ julia> total([:api00, :enroll], clus_one_stage) ``` """ function total(x::Symbol, design::ReplicateDesign) - X = wsum(design.data[!, x], weights(design.data.weights)) + X = wsum(design.data[!, x], weights(design.data[!,design.weights])) Xt = [wsum(design.data[!, x], weights(design.data[! , "replicate_"*string(i)])) for i in 1:design.replicates] variance = sum((Xt .- X).^2) / design.replicates DataFrame(total = X, SE = sqrt(variance)) diff --git a/test/plot.jl b/test/plot.jl index c2476f65..7e31fc74 100644 --- a/test/plot.jl +++ b/test/plot.jl @@ -3,7 +3,7 @@ apisrs = load_data("apisrs") srs = SurveyDesign(apisrs, weights=:pw) s = plot(srs, :api99, :api00) - @test s.grid[1].entries[1].named[:markersize] == srs.data.weights + @test s.grid[1].entries[1].named[:markersize] == srs.data[!,srs.weights] @test s.grid[1].entries[1].positional[1] == srs.data.api99 @test s.grid[1].entries[1].positional[2] == srs.data.api00 # StratifiedSample From 7c4706cce57214552d1796296d258bd104a04546 Mon Sep 17 00:00:00 2001 From: smishr Date: Sun, 15 Jan 2023 12:32:15 +0530 Subject: [PATCH 40/80] Update ReplicateDesign struct and doctest --- src/SurveyDesign.jl | 12 +++++++----- test/SurveyDesign.jl | 4 ++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index ff67f682..a08cd1d9 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -118,14 +118,14 @@ julia> strat = SurveyDesign(apistrat; strata=:stype, weights=:pw); julia> bootstrat = bootweights(strat; replicates=1000) ReplicateDesign: -data: 200×1046 DataFrame +data: 200×1044 DataFrame strata: stype [E, E, E … 
H] cluster: none -popsize: [6190.0, 6190.0, 6190.0 … 6190.0] -sampsize: [200, 200, 200 … 200] -weights: [44.2, 44.2, 44.2 … 15.1] -probs: [0.0226, 0.0226, 0.0226 … 0.0662] +popsize: [4420.9999, 4420.9999, 4420.9999 … 755.0] +sampsize: [100, 100, 100 … 50] +weights: [44.21, 44.21, 44.21 … 15.1] +allprobs: [0.0226, 0.0226, 0.0226 … 0.0662] replicates: 1000 ``` """ @@ -135,6 +135,8 @@ struct ReplicateDesign <: AbstractSurveyDesign popsize::Symbol sampsize::Symbol strata::Symbol + weights::Symbol # Effective weights in case of singlestage approx supported + allprobs::Symbol # Right now only singlestage approx supported pps::Bool replicates::UInt end diff --git a/test/SurveyDesign.jl b/test/SurveyDesign.jl index de74f2f4..89dff3bf 100644 --- a/test/SurveyDesign.jl +++ b/test/SurveyDesign.jl @@ -119,9 +119,9 @@ end @test dclus2.data[!,dclus2.weights][121] ≈ 1 / calculated_probs_R[121] atol = 1e-4 @test dclus2.data[!,dclus2.weights][125] ≈ 1 / calculated_probs_R[125] atol = 1e-4 - # TODO: sampsize and popsize testing + # TODO: sampsize and popsize testing once #178 resolved ## NOT THE SAME AS R object right now - + ######################### ## Complete multistage sampling (when implemented) should look like ## weights should theoretically be optional if both clusters and popsize given From 7eeb1faa779a97e9f908da8fa80e187082b9077a Mon Sep 17 00:00:00 2001 From: smishr Date: Sun, 15 Jan 2023 12:38:16 +0530 Subject: [PATCH 41/80] Update show testing suite --- test/show.jl | 52 ++++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/test/show.jl b/test/show.jl index f81ed2a3..1231a910 100644 --- a/test/show.jl +++ b/test/show.jl @@ -5,13 +5,13 @@ srs = SurveyDesign(apisrs; weights=:pw) refstr = """ SurveyDesign: - data: 200×47 DataFrame + data: 200×45 DataFrame strata: none cluster: none - popsize: [6190.0, 6190.0, 6190.0 … 6190.0] + popsize: [6194.0, 6194.0, 6194.0 … 6194.0] sampsize: [200, 200, 200 … 200] - 
weights: [31.0, 31.0, 31.0 … 31.0] - probs: [0.0323, 0.0323, 0.0323 … 0.0323]""" + weights: [30.97, 30.97, 30.97 … 30.97] + allprobs: [0.0323, 0.0323, 0.0323 … 0.0323]""" show(io, MIME("text/plain"), srs) str = String(take!(io)) @@ -20,13 +20,13 @@ bsrs = srs |> bootweights refstrb = """ ReplicateDesign: - data: 200×4047 DataFrame + data: 200×4045 DataFrame strata: none cluster: none - popsize: [6190.0, 6190.0, 6190.0 … 6190.0] + popsize: [6194.0, 6194.0, 6194.0 … 6194.0] sampsize: [200, 200, 200 … 200] - weights: [31.0, 31.0, 31.0 … 31.0] - probs: [0.0323, 0.0323, 0.0323 … 0.0323] + weights: [30.97, 30.97, 30.97 … 30.97] + allprobs: [0.0323, 0.0323, 0.0323 … 0.0323] replicates: 4000""" show(io, MIME("text/plain"), bsrs) @@ -41,14 +41,14 @@ end strat = SurveyDesign(apistrat; strata=:stype, weights=:pw) refstr = """ SurveyDesign: - data: 200×46 DataFrame + data: 200×44 DataFrame strata: stype [E, E, E … H] cluster: none - popsize: [6190.0, 6190.0, 6190.0 … 6190.0] - sampsize: [200, 200, 200 … 200] - weights: [44.2, 44.2, 44.2 … 15.1] - probs: [0.0226, 0.0226, 0.0226 … 0.0662]""" + popsize: [4420.9999, 4420.9999, 4420.9999 … 755.0] + sampsize: [100, 100, 100 … 50] + weights: [44.21, 44.21, 44.21 … 15.1] + allprobs: [0.0226, 0.0226, 0.0226 … 0.0662]""" show(io, MIME("text/plain"), strat) str = String(take!(io)) @@ -57,14 +57,14 @@ end stratb = strat |> bootweights refstrb = """ ReplicateDesign: - data: 200×4046 DataFrame + data: 200×4044 DataFrame strata: stype [E, E, E … H] cluster: none - popsize: [6190.0, 6190.0, 6190.0 … 6190.0] - sampsize: [200, 200, 200 … 200] - weights: [44.2, 44.2, 44.2 … 15.1] - probs: [0.0226, 0.0226, 0.0226 … 0.0662] + popsize: [4420.9999, 4420.9999, 4420.9999 … 755.0] + sampsize: [100, 100, 100 … 50] + weights: [44.21, 44.21, 44.21 … 15.1] + allprobs: [0.0226, 0.0226, 0.0226 … 0.0662] replicates: 4000""" show(io, MIME("text/plain"), stratb) @@ -79,14 +79,14 @@ end clus_one_stage = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) 
refstr = """ SurveyDesign: - data: 183×46 DataFrame + data: 183×44 DataFrame strata: none cluster: dnum [637, 637, 637 … 448] - popsize: [6190.0, 6190.0, 6190.0 … 6190.0] + popsize: [507.7049, 507.7049, 507.7049 … 507.7049] sampsize: [15, 15, 15 … 15] - weights: [33.8, 33.8, 33.8 … 33.8] - probs: [0.0295, 0.0295, 0.0295 … 0.0295]""" + weights: [33.847, 33.847, 33.847 … 33.847] + allprobs: [0.0295, 0.0295, 0.0295 … 0.0295]""" show(io, MIME("text/plain"), clus_one_stage) str = String(take!(io)) @@ -95,14 +95,14 @@ end clus_one_stageb = clus_one_stage |> bootweights refstrb = """ ReplicateDesign: - data: 183×4046 DataFrame + data: 183×4044 DataFrame strata: none cluster: dnum [637, 637, 637 … 448] - popsize: [6190.0, 6190.0, 6190.0 … 6190.0] + popsize: [507.7049, 507.7049, 507.7049 … 507.7049] sampsize: [15, 15, 15 … 15] - weights: [33.8, 33.8, 33.8 … 33.8] - probs: [0.0295, 0.0295, 0.0295 … 0.0295] + weights: [33.847, 33.847, 33.847 … 33.847] + allprobs: [0.0295, 0.0295, 0.0295 … 0.0295] replicates: 4000""" show(io, MIME("text/plain"), clus_one_stageb) From b79fce0419e476e96c56174cae3622fd1367221c Mon Sep 17 00:00:00 2001 From: smishr <43640926+smishr@users.noreply.github.com> Date: Mon, 16 Jan 2023 13:23:37 +0530 Subject: [PATCH 42/80] Update .gitignore acidentally pushed local gitignore Co-authored-by: Ayush Patnaik --- .gitignore | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 494288f6..1d4d0304 100644 --- a/.gitignore +++ b/.gitignore @@ -8,5 +8,4 @@ /dev/* .gitignore .DS_Store -*.json -src/ht.jl \ No newline at end of file +*.json \ No newline at end of file From 5275be9a14ba6499fb30acd966fffabbeee2dffe Mon Sep 17 00:00:00 2001 From: smishr Date: Mon, 16 Jan 2023 13:52:41 +0530 Subject: [PATCH 43/80] Append _ to :weights :popsize :sampsize --- src/SurveyDesign.jl | 10 +++++----- src/ratio.jl | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index 
a08cd1d9..2ee5a9c3 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -76,14 +76,14 @@ struct SurveyDesign <: AbstractSurveyDesign cluster = clusters end # For single-stage approximation only one "effective" sampsize vector - sampsize_labels = :sampsize - if isa(strata,Symbol) && isnothing(clusters) # If stratified sample then sampsize is inside strata + sampsize_labels = :_sampsize + if isa(strata,Symbol) && isnothing(clusters) # If stratified only then sampsize is inside strata data[!, sampsize_labels] = transform(groupby(data, strata), nrow => :counts).counts else data[!, sampsize_labels] = fill(length(unique(data[!, cluster])), (nrow(data),)) end if isa(popsize, Symbol) - weights_labels = :weights + weights_labels = :_weights data[!, weights_labels] = data[!, popsize] ./ data[!, sampsize_labels] elseif isa(weights, Symbol) if !(typeof(data[!, weights]) <: Vector{<:Real}) @@ -91,12 +91,12 @@ struct SurveyDesign <: AbstractSurveyDesign else # derive popsize from given `weights` weights_labels = weights - popsize = :popsize + popsize = :_popsize data[!, popsize] = data[!, sampsize_labels] .* data[!, weights_labels] end else # neither popsize nor weights given - weights_labels = :weights + weights_labels = :_weights data[!, weights_labels] = repeat([1], nrow(data)) end allprobs_labels = :allprobs diff --git a/src/ratio.jl b/src/ratio.jl index 1623eb3a..ebfef889 100644 --- a/src/ratio.jl +++ b/src/ratio.jl @@ -17,14 +17,14 @@ julia> ratio(:api00, :enroll, clus_one_stage) ``` """ function ratio(variable_num:: Symbol, variable_den:: Symbol, design::SurveyDesign) - statistic = wsum(design.data[!,variable_num],design.data.weights)/wsum(design.data[!,variable_den],design.data.weights) + statistic = wsum(design.data[!,variable_num],design.data[!,design.weights])/wsum(design.data[!,variable_den],design.data[!,design.weights]) nh = length(unique(design.data[!,design.cluster])) newv = [] gdf = groupby(design.data, design.cluster) replicates = [filter(n -> n != i, 
1:nh) for i in 1:nh] for i in replicates df = DataFrame(gdf[i]) - push!(newv, wsum(df[!,variable_num],df[!,:weights])/wsum(df[!,variable_den],df[!,:weights])) + push!(newv, wsum(df[!,variable_num],df[!,design.weights])/wsum(df[!,variable_den],df[!,design.weights])) end c = 0 for i in 1:nh From b675dd9003aba7125b6186a2df4b0f5941a6786f Mon Sep 17 00:00:00 2001 From: ayushpatnaikgit Date: Wed, 18 Jan 2023 22:44:13 +0530 Subject: [PATCH 44/80] Add SE for quantile --- src/quantile.jl | 55 ++++++++++++++++++++++++++++++------------------ test/quantile.jl | 17 ++------------- 2 files changed, 37 insertions(+), 35 deletions(-) diff --git a/src/quantile.jl b/src/quantile.jl index 09ba9326..81003e43 100644 --- a/src/quantile.jl +++ b/src/quantile.jl @@ -13,32 +13,47 @@ The Julia, R and Python-numpy use the same defaults ```jldoctest julia> apisrs = load_data("apisrs"); -julia> srs = SurveyDesign(apisrs; weights=:pw); +julia> srs = SurveyDesign(apisrs; weights=:pw) |> bootweights; julia> quantile(:api00,srs,0.5) 1×2 DataFrame - Row │ probability quantile - │ Float64 Float64 -─────┼─────────────────────── - 1 │ 0.5 659.0 - -julia> quantile(:enroll,srs,[0.1,0.2,0.5,0.75,0.95]) -5×2 DataFrame - Row │ probability quantile - │ Float64 Float64 -─────┼─────────────────────── - 1 │ 0.1 245.5 - 2 │ 0.2 317.6 - 3 │ 0.5 453.0 - 4 │ 0.75 668.5 - 5 │ 0.95 1473.1 + Row │ 0.5th percentile SE + │ Float64 Float64 +─────┼─────────────────────────── + 1 │ 659.0 14.9764 ``` """ -function quantile(var::Symbol, design::SurveyDesign, p::Union{<:Real,Vector{<:Real}}; - alpha::Float64=0.05, ci::Bool=false, se::Bool=false, qrule="hf7",kwargs...) +function quantile(var::Symbol, design::ReplicateDesign, p::Real;kwargs...) 
v = design.data[!, var] probs = design.data[!, design.allprobs] - df = DataFrame(probability=p, quantile=Statistics.quantile(v, ProbabilityWeights(probs), p)) - # TODO: Add CI and SE of the quantile + X = Statistics.quantile(v, ProbabilityWeights(probs), p) + Xt = [Statistics.quantile(v, ProbabilityWeights(design.data[! , "replicate_"*string(i)]), p) for i in 1:design.replicates] + variance = sum((Xt .- X).^2) / design.replicates + df = DataFrame(percentile = X, SE = sqrt(variance)) + rename!(df, :percentile => string(p) * "th percentile") return df +end + +""" +```jldoctest +julia> apisrs = load_data("apisrs"); + +julia> srs = SurveyDesign(apisrs; weights=:pw) |> bootweights; + +julia> quantile(:enroll,srs,[0.1,0.2,0.5,0.75,0.95]) +5×3 DataFrame + Row │ percentile statistic SE + │ String Float64 Float64 +─────┼───────────────────────────────── + 1 │ 0.1 245.5 20.2964 + 2 │ 0.2 317.6 13.5435 + 3 │ 0.5 453.0 24.9719 + 4 │ 0.75 668.5 34.2487 + 5 │ 0.95 1473.1 142.568 +``` +""" +function quantile(var::Symbol, design::ReplicateDesign, probs::Vector{<:Real}; kwargs...) + df = vcat([rename!(quantile(var, design, prob; kwargs...),[:statistic, :SE]) for prob in probs]...) 
+ df.percentile = string.(probs) + return df[!, [:percentile, :statistic, :SE]] end \ No newline at end of file diff --git a/test/quantile.jl b/test/quantile.jl index 59bb0a69..cca58da0 100644 --- a/test/quantile.jl +++ b/test/quantile.jl @@ -6,21 +6,8 @@ apisrs_original[!, :derived_sampsize] = fill(200.0, size(apisrs_original, 1)) ############################## apisrs = copy(apisrs_original) - srs_design = SurveyDesign(apisrs; weights=:pw) - @test quantile(:api00, srs_design, 0.5)[!,2][1] ≈ 659.0 atol=1e-4 + srs_design = SurveyDesign(apisrs; weights=:pw) |> bootweights + @test quantile(:api00, srs_design, 0.5)[!,1][1] ≈ 659.0 atol=1e-4 @test quantile(:api00, srs_design, [0.1753,0.25,0.5,0.75,0.975])[!,2] ≈ [512.8847,544,659,752.5,905] atol = 1e-4 @test quantile(:enroll,srs_design, [0.1,0.2,0.5,0.75,0.95])[!,2] ≈ [245.5,317.6,453.0,668.5,1473.1] atol = 1e-4 -end - -@testset "quantile_Stratified" begin - ##### StratifiedSample tests - # Load API datasets - apistrat_original = load_data("apistrat") - apistrat_original[!, :derived_probs] = 1 ./ apistrat_original.pw - apistrat_original[!, :derived_sampsize] = apistrat_original.fpc ./ apistrat_original.pw - # base functionality - apistrat = copy(apistrat_original) - dstrat = SurveyDesign(apistrat; strata = :stype, popsize = :fpc) - # Check which definition of quantile for StratifiedSample - # @test quantile(:enroll, dstrat, [0.1,0.2,0.5,0.75,0.95])[!,2] ≈ [262,309.3366,446.4103,658.8764,1589.7881] atol = 1e-4 end \ No newline at end of file From 8bb8bfd9db924394471570b99d331b6e77a291a5 Mon Sep 17 00:00:00 2001 From: ayushpatnaikgit Date: Thu, 19 Jan 2023 14:55:59 +0530 Subject: [PATCH 45/80] Remove jackknife and using bootstrap in ratio estimation. 
--- src/Survey.jl | 1 - src/jackknife.jl | 16 ---------------- src/ratio.jl | 33 +++++++++++---------------------- test/jackknife.jl | 8 -------- test/ratio.jl | 6 +++--- 5 files changed, 14 insertions(+), 50 deletions(-) delete mode 100644 src/jackknife.jl delete mode 100644 test/jackknife.jl diff --git a/src/Survey.jl b/src/Survey.jl index f25e33a7..66de8042 100644 --- a/src/Survey.jl +++ b/src/Survey.jl @@ -17,7 +17,6 @@ include("SurveyDesign.jl") include("bootstrap.jl") include("mean.jl") include("quantile.jl") -include("jackknife.jl") include("total.jl") include("load_data.jl") include("hist.jl") diff --git a/src/jackknife.jl b/src/jackknife.jl deleted file mode 100644 index 55880df9..00000000 --- a/src/jackknife.jl +++ /dev/null @@ -1,16 +0,0 @@ -function jkknife(variable:: Symbol, design::SurveyDesign ,func:: Function; params =[]) - statistic = func(design.data[!,variable],params...) - nh = length(unique(design.data[!,design.cluster])) - newv = [] - gdf = groupby(design.data, design.cluster) - replicates = [filter(n -> n != i, 1:nh) for i in 1:nh] - for i in replicates - push!(newv,func(DataFrame(gdf[i])[!,variable])) - end - c = 0 - for i in 1:nh - c = c+(newv[i]-statistic)^2 - end - var = c*(nh-1)/nh - return DataFrame(Statistic = statistic, SE = sqrt(var)) -end diff --git a/src/ratio.jl b/src/ratio.jl index ebfef889..8a0226f3 100644 --- a/src/ratio.jl +++ b/src/ratio.jl @@ -6,30 +6,19 @@ Estimate the ratio of the columns specified in numerator and denominator ```jldoctest julia> apiclus1 = load_data("apiclus1"); -julia> clus_one_stage = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw); +julia> clus_one_stage = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) |> bootweights; -julia> ratio(:api00, :enroll, clus_one_stage) +ratio(:api00, :enroll, clus_one_stage) 1×2 DataFrame - Row │ Statistic SE - │ Float64 Float64 -─────┼───────────────────── - 1 │ 1.17182 0.151242 + Row │ ratio SE + │ Float64 Float64 +─────┼─────────────────── + 1 │ 1.17182 
0.130834 ``` """ -function ratio(variable_num:: Symbol, variable_den:: Symbol, design::SurveyDesign) - statistic = wsum(design.data[!,variable_num],design.data[!,design.weights])/wsum(design.data[!,variable_den],design.data[!,design.weights]) - nh = length(unique(design.data[!,design.cluster])) - newv = [] - gdf = groupby(design.data, design.cluster) - replicates = [filter(n -> n != i, 1:nh) for i in 1:nh] - for i in replicates - df = DataFrame(gdf[i]) - push!(newv, wsum(df[!,variable_num],df[!,design.weights])/wsum(df[!,variable_den],df[!,design.weights])) - end - c = 0 - for i in 1:nh - c = c+(newv[i]-statistic)^2 - end - var = c*(nh-1)/nh - return DataFrame(Statistic = statistic, SE = sqrt(var)) +function ratio(variable_num::Symbol, variable_den::Symbol, design::ReplicateDesign) + X = wsum(design.data[!, variable_num], design.data[!, design.weights]) / wsum(design.data[!, variable_den], design.data[!, design.weights]) + Xt = [(wsum(design.data[!, variable_num], weights(design.data[! , "replicate_"*string(i)]))) / (wsum(design.data[!, variable_den], weights(design.data[! 
, "replicate_"*string(i)]))) for i in 1:design.replicates] + variance = sum((Xt .- X).^2) / design.replicates + DataFrame(ratio = X, SE = sqrt(variance)) end diff --git a/test/jackknife.jl b/test/jackknife.jl deleted file mode 100644 index 25e35e91..00000000 --- a/test/jackknife.jl +++ /dev/null @@ -1,8 +0,0 @@ -@testset "jackknife.jl" begin - apiclus1_original = load_data("apiclus1") - apiclus1_original[!, :pw] = fill(757/15,(size(apiclus1_original,1),)) # Correct api mistake for pw column - apiclus1 = copy(apiclus1_original) - dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights=:pw); - @test jkknife(:api00, dclus1, mean).SE[1] ≈ 26.5997 atol = 1e-4 - @test jkknife(:api00, dclus1, mean).Statistic[1] ≈ 644.1693 atol = 1e-4 -end diff --git a/test/ratio.jl b/test/ratio.jl index b8652ef1..9185952c 100644 --- a/test/ratio.jl +++ b/test/ratio.jl @@ -4,7 +4,7 @@ ############################## # one-stage cluster sample apiclus1 = copy(apiclus1_original) - dclus1 = SurveyDesign(apiclus1; clusters = :dnum, popsize = :fpc) - @test ratio(:api00, :enroll, dclus1).SE[1] ≈ 0.151242 atol = 1e-4 - @test ratio(:api00, :enroll, dclus1).Statistic[1] ≈ 1.17182 atol = 1e-4 + dclus1 = SurveyDesign(apiclus1; clusters = :dnum, popsize = :fpc) |> bootweights + @test ratio(:api00, :enroll, dclus1).SE[1] ≈ 0.1275446 atol = 1e-1 + @test ratio(:api00, :enroll, dclus1).ratio[1] ≈ 1.17182 atol = 1e-4 end \ No newline at end of file From b95d30af5d5ff7b25d0b87b3dc7e44d21d53c6e0 Mon Sep 17 00:00:00 2001 From: ayushpatnaikgit Date: Thu, 19 Jan 2023 15:12:32 +0530 Subject: [PATCH 46/80] Add warning to tell users that single stage approximation is used. 
--- src/SurveyDesign.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index 2ee5a9c3..71ee1356 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -70,6 +70,7 @@ struct SurveyDesign <: AbstractSurveyDesign end ## Single stage approximation if typeof(clusters) <: Vector{Symbol} + @warn As part of single-stage approximation, only the first stage cluster ID is retained. cluster = first(clusters) end if typeof(clusters) <: Symbol From e9530e195d87131a32f37d314c8f406902308d54 Mon Sep 17 00:00:00 2001 From: ayushpatnaikgit Date: Thu, 19 Jan 2023 15:42:17 +0530 Subject: [PATCH 47/80] Fix syntax. --- src/SurveyDesign.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index 71ee1356..fc8030f1 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -70,7 +70,7 @@ struct SurveyDesign <: AbstractSurveyDesign end ## Single stage approximation if typeof(clusters) <: Vector{Symbol} - @warn As part of single-stage approximation, only the first stage cluster ID is retained. + @warn "As part of single-stage approximation, only the first stage cluster ID is retained." 
cluster = first(clusters) end if typeof(clusters) <: Symbol From 4e5e50a216a15c53fd52ecdf0dd4b5ebd1af5ee3 Mon Sep 17 00:00:00 2001 From: ayushpatnaikgit Date: Fri, 20 Jan 2023 16:28:59 +0530 Subject: [PATCH 48/80] Speed up bootstrap --- src/bootstrap.jl | 45 ++++++++++++++++++++------------------------- src/mean.jl | 32 ++++++++++++++++---------------- src/total.jl | 28 ++++++++++++++-------------- test/mean.jl | 41 ++++++++++++++++++++++++----------------- test/total.jl | 6 ------ 5 files changed, 74 insertions(+), 78 deletions(-) diff --git a/src/bootstrap.jl b/src/bootstrap.jl index a5e9f019..630e98b8 100644 --- a/src/bootstrap.jl +++ b/src/bootstrap.jl @@ -4,14 +4,16 @@ julia> using Random julia> apiclus1 = load_data("apiclus1"); + julia> clus_one_stage = SurveyDesign(apiclus1; clusters = :dnum, popsize=:fpc); + julia> bootweights(clus_one_stage; replicates=1000, rng=MersenneTwister(111)) # choose a seed for deterministic results ReplicateDesign: data: 183×1044 DataFrame strata: none cluster: dnum - [637, 637, 637 … 448] + [61, 61, 61 … 815] popsize: [757, 757, 757 … 757] sampsize: [15, 15, 15 … 15] weights: [50.4667, 50.4667, 50.4667 … 50.4667] @@ -20,32 +22,25 @@ replicates: 1000 ``` """ function bootweights(design::SurveyDesign; replicates=4000, rng=MersenneTwister(1234)) - H = length(unique(design.data[!, design.strata])) stratified = groupby(design.data, design.strata) - function replicate(stratified, H) - for h in 1:H - substrata = DataFrame(stratified[h]) - psus = unique(substrata[!, design.cluster]) - if length(psus) <= 1 - stratified[h].whij .= 0 # hasn't been tested yet. 
+ H = length(keys(stratified)) + substrata_dfs = [] + for h in 1:H + substrata = DataFrame(stratified[h]) + cluster_sorted = sort(substrata, design.cluster) + psus = unique(cluster_sorted[!, design.cluster]) + npsus = [(count(==(i), cluster_sorted[!, design.cluster])) for i in psus] + nh = length(psus) + randinds = rand(rng, 1:(nh), replicates, (nh-1)) + for replicate in 1:replicates + rh = zeros(Int, nh) + for i in randinds[replicate, :] + rh[i] += 1 end - nh = length(psus) - randinds = rand(rng, 1:(nh), (nh-1)) # Main bootstrap algo. Draw nh-1 out of nh, with replacement. - rh = [(count(==(i), randinds)) for i in 1:nh] # main bootstrap algo. - gdf = groupby(substrata, design.cluster) - for i in 1:nh - gdf[i].whij = repeat([rh[i]], nrow(gdf[i])) .* gdf[i][!,design.weights] .* (nh / (nh - 1)) - end - stratified[h].whij = transform(gdf).whij - - end - return transform(stratified, :whij) + cluster_sorted[!, "replicate_" * string(replicate)] = vcat([repeat([rh[i] * (nh / (nh-1))], npsus[i]) for i in 1:length(rh)]...) .* cluster_sorted[!, design.weights] + end + push!(substrata_dfs, cluster_sorted) end - df = replicate(stratified, H) - rename!(df, :whij => :replicate_1) - df.replicate_1 = disallowmissing(df.replicate_1) - for i in 2:(replicates) - df[!, "replicate_" * string(i)] = disallowmissing(replicate(stratified, H).whij) - end + df = vcat(substrata_dfs...) 
return ReplicateDesign(df, design.cluster, design.popsize, design.sampsize, design.strata, design.weights, design.allprobs, design.pps, replicates) end diff --git a/src/mean.jl b/src/mean.jl index 593b1d79..5b87ffdf 100644 --- a/src/mean.jl +++ b/src/mean.jl @@ -10,18 +10,18 @@ julia> clus_one_stage = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) julia> mean(:api00, clus_one_stage) 1×2 DataFrame - Row │ mean SE - │ Float64 Float64 + Row │ mean SE + │ Float64 Float64 ─────┼────────────────── - 1 │ 644.169 23.2919 + 1 │ 644.169 23.2877 julia> mean([:api00, :enroll], clus_one_stage) 2×3 DataFrame Row │ names mean SE │ String Float64 Float64 ─────┼────────────────────────── - 1 │ api00 644.169 23.2919 - 2 │ enroll 549.716 45.3655 + 1 │ api00 644.169 23.2877 + 2 │ enroll 549.716 46.2597 ``` """ function mean(x::Symbol, design::ReplicateDesign) @@ -52,17 +52,17 @@ julia> mean(:api00, :cname, clus_one_stage) Row │ cname mean SE │ String15 Float64 Any ─────┼─────────────────────────────────── - 1 │ Alameda 669.0 1.27388e-13 - 2 │ Fresno 472.0 1.13687e-13 - 3 │ Kern 452.5 0.0 - 4 │ Los Angeles 647.267 47.4938 - 5 │ Mendocino 623.25 1.0931e-13 - 6 │ Merced 519.25 4.57038e-15 - 7 │ Orange 710.563 2.19684e-13 - 8 │ Plumas 709.556 1.27773e-13 - 9 │ San Diego 659.436 2.63446 - 10 │ San Joaquin 551.189 2.17471e-13 - 11 │ Santa Clara 732.077 56.2584 + 1 │ Santa Clara 732.077 59.6794 + 2 │ San Diego 659.436 2.63657 + 3 │ Merced 519.25 8.18989e-15 + 4 │ Los Angeles 647.267 47.7685 + 5 │ Orange 710.563 2.21461e-13 + 6 │ Fresno 472.0 1.13687e-13 + 7 │ Plumas 709.556 1.26823e-13 + 8 │ Alameda 669.0 1.26888e-13 + 9 │ San Joaquin 551.189 2.17297e-13 + 10 │ Kern 452.5 0.0 + 11 │ Mendocino 623.25 1.09409e-13 ``` """ function mean(x::Symbol, domain::Symbol, design::ReplicateDesign) diff --git a/src/total.jl b/src/total.jl index 1b200797..f6ceb823 100644 --- a/src/total.jl +++ b/src/total.jl @@ -13,15 +13,15 @@ julia> total(:api00, clus_one_stage) Row │ total SE │ Float64 Float64 
─────┼────────────────────── - 1 │ 3.98999e6 9.22175e5 + 1 │ 3.98999e6 9.10443e5 julia> total([:api00, :enroll], clus_one_stage) 2×3 DataFrame Row │ names total SE │ String Float64 Float64 ─────┼────────────────────────────── - 1 │ api00 3.98999e6 9.22175e5 - 2 │ enroll 3.40494e6 9.51557e5 + 1 │ api00 3.98999e6 9.10443e5 + 2 │ enroll 3.40494e6 9.47987e5 ``` """ function total(x::Symbol, design::ReplicateDesign) @@ -52,17 +52,17 @@ julia> total(:api00, :cname, clus_one_stage) Row │ cname total SE │ String15 Float64 Any ─────┼──────────────────────────────────────── - 1 │ Alameda 249080.0 2.48842e5 - 2 │ Fresno 63903.1 64452.2 - 3 │ Kern 30631.5 31083.0 - 4 │ Los Angeles 3.2862e5 2.93649e5 - 5 │ Mendocino 84380.6 83154.4 - 6 │ Merced 70300.2 69272.5 - 7 │ Orange 3.84807e5 3.90097e5 - 8 │ Plumas 2.16147e5 2.17811e5 - 9 │ San Diego 1.2276e6 8.78559e5 - 10 │ San Joaquin 6.90276e5 6.90685e5 - 11 │ Santa Clara 6.44244e5 4.09943e5 + 1 │ Santa Clara 6.44244e5 4.29558e5 + 2 │ San Diego 1.2276e6 8.60246e5 + 3 │ Merced 70300.2 70757.4 + 4 │ Los Angeles 3.2862e5 2.95688e5 + 5 │ Orange 3.84807e5 3.77128e5 + 6 │ Fresno 63903.1 64455.2 + 7 │ Plumas 2.16147e5 2.12279e5 + 8 │ Alameda 249080.0 2.5221e5 + 9 │ San Joaquin 6.90276e5 6.92353e5 + 10 │ Kern 30631.5 30333.5 + 11 │ Mendocino 84380.6 80774.4 ``` """ function total(x::Symbol, domain::Symbol, design::ReplicateDesign) diff --git a/test/mean.jl b/test/mean.jl index 4745125b..2e9682a8 100644 --- a/test/mean.jl +++ b/test/mean.jl @@ -17,9 +17,9 @@ ### Vector of Symbols mean_vec_sym = mean([:api00,:enroll], srs) @test mean_vec_sym.mean[1] ≈ 656.585 atol = 1e-4 - @test mean_vec_sym.SE[1] ≈ 9.3065 atol = 1e-2 + @test mean_vec_sym.SE[1] ≈ 9.3065 rtol = 1e-1 @test mean_vec_sym.mean[2] ≈ 584.61 atol = 1e-4 - @test mean_vec_sym.SE[2] ≈ 28.1048 atol = 1e-2 + @test mean_vec_sym.SE[2] ≈ 28.1048 rtol = 1e-1 ############################## ### Categorical Array - estimating proportions # apisrs_categ = copy(apisrs_original) @@ -35,7 +35,7 @@ 
end apistrat = copy(apistrat_original) strat = SurveyDesign(apistrat, strata = :stype, weights = :pw) |> bootweights mean_strat = mean(:api00, strat) - @test mean_strat.mean[1] ≈ 662.29 atol = 1e-2 + @test mean_strat.mean[1] ≈ 662.29 rtol = 1e-1 @test mean_strat.SE[1] ≈ 9.48296 atol = 1e-1 end @@ -44,12 +44,12 @@ end apisrs = copy(apisrs_original) srs = SurveyDesign(apisrs; popsize = :fpc) |> bootweights mean_symb_srs = mean(:api00, :stype, srs) - @test mean_symb_srs.mean[1] ≈ 605.36 atol = 1e-2 - @test mean_symb_srs.mean[2] ≈ 666.141 atol = 1e-2 - @test mean_symb_srs.mean[3] ≈ 654.273 atol = 1e-2 - @test mean_symb_srs.SE[1] ≈ 22.6718 atol = 1e-2 - @test mean_symb_srs.SE[2] ≈ 11.35390 atol = 1e-2 - @test mean_symb_srs.SE[3] ≈ 22.3298 atol = 1e-2 + @test mean_symb_srs.mean[1] ≈ 605.36 rtol = 1e-1 + @test mean_symb_srs.mean[2] ≈ 666.141 rtol = 1e-1 + @test mean_symb_srs.mean[3] ≈ 654.273 rtol = 1e-1 + @test mean_symb_srs.SE[1] ≈ 22.6718 rtol = 1e-1 + @test mean_symb_srs.SE[2] ≈ 11.35390 rtol = 1e-1 + @test mean_symb_srs.SE[3] ≈ 22.3298 rtol = 1e-1 end @testset "mean_svyby_Stratified" begin @@ -57,12 +57,12 @@ end apistrat = copy(apistrat_original) strat = SurveyDesign(apistrat; strata = :stype, weights = :pw) |> bootweights mean_strat_symb = mean(:api00, :stype, strat) - @test mean_strat_symb.mean[1] ≈ 674.43 atol = 1e-2 - @test mean_strat_symb.mean[2] ≈ 636.6 atol = 1e-2 - @test mean_strat_symb.mean[3] ≈ 625.82 atol = 1e-2 - @test mean_strat_symb.SE[1] ≈ 12.4398 atol = 1e-2 - @test mean_strat_symb.SE[2] ≈ 16.5628 atol = 1e-2 - @test mean_strat_symb.SE[3] ≈ 15.42320 atol = 1e-2 + @test mean_strat_symb.mean[1] ≈ 674.43 rtol = 1e-1 + @test mean_strat_symb.mean[2] ≈ 636.6 rtol = 1e-1 + @test mean_strat_symb.mean[3] ≈ 625.82 rtol = 1e-1 + @test mean_strat_symb.SE[1] ≈ 12.4398 rtol = 1e-1 + @test mean_strat_symb.SE[2] ≈ 16.5628 rtol = 1e-1 + @test mean_strat_symb.SE[3] ≈ 15.42320 rtol = 1e-1 end @testset "mean_OneStageCluster" begin @@ -73,6 +73,13 @@ end # one-stage 
cluster sample apiclus1 = copy(apiclus1_original) dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) |> bootweights - @test mean(:api00, dclus1).mean[1] ≈ 644.17 atol = 1e-2 - @test mean(:api00, dclus1).SE[1] ≈ 23.291 atol = 1e-2 # without fpc as it hasn't been figured out for bootstrap. + @test mean(:api00, dclus1).mean[1] ≈ 644.17 rtol = 1e-1 + @test mean(:api00, dclus1).SE[1] ≈ 23.291 rtol = 1e-1 # without fpc as it hasn't been figured out for bootstrap. + + mn = mean(:api00, :cname, dclus1) + @test size(mn)[1] == apiclus1.cname |> unique |> length + @test filter(:cname => ==("Los Angeles"), mn).mean[1] ≈ 647.2667 rtol = STAT_TOL + @test filter(:cname => ==("Los Angeles"), mn).SE[1] ≈ 41.537132 rtol = 1 # tolerance is too large + @test filter(:cname => ==("Santa Clara"), mn).mean[1] ≈ 732.0769 rtol = STAT_TOL + @test filter(:cname => ==("Santa Clara"), mn).SE[1] ≈ 54.215099 rtol = SE_TOL end diff --git a/test/total.jl b/test/total.jl index 6ac6ab06..51fb2f00 100644 --- a/test/total.jl +++ b/test/total.jl @@ -173,12 +173,6 @@ end @test filter(:cname => ==("Los Angeles"), tot).SE[1] ≈ 292840.83 rtol = SE_TOL @test filter(:cname => ==("San Diego"), tot).total[1] ≈ 1227596.71 rtol = STAT_TOL @test filter(:cname => ==("San Diego"), tot).SE[1] ≈ 860028.39 rtol = SE_TOL - mn = mean(:api00, :cname, clus1) - @test size(mn)[1] == apiclus1.cname |> unique |> length - @test filter(:cname => ==("Los Angeles"), mn).mean[1] ≈ 647.2667 rtol = STAT_TOL - @test filter(:cname => ==("Los Angeles"), mn).SE[1] ≈ 41.537132 rtol = 1 # tolerance is too large - @test filter(:cname => ==("Santa Clara"), mn).mean[1] ≈ 732.0769 rtol = STAT_TOL - @test filter(:cname => ==("Santa Clara"), mn).SE[1] ≈ 52.336574 rtol = SE_TOL # equivalent R code (results cause clutter): # > svyby(~api00, ~cname, clus1rep, svytotal) # > svyby(~api00, ~cname, clus1rep, svymean) From f084eb2d89eb5bc26c014c7fa5351985788ec138 Mon Sep 17 00:00:00 2001 From: ayushpatnaikgit Date: Fri, 20 Jan 2023 
17:39:23 +0530 Subject: [PATCH 49/80] Add ratio doctest changes. --- src/ratio.jl | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/ratio.jl b/src/ratio.jl index 8a0226f3..c5c897b0 100644 --- a/src/ratio.jl +++ b/src/ratio.jl @@ -8,12 +8,13 @@ julia> apiclus1 = load_data("apiclus1"); julia> clus_one_stage = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) |> bootweights; -ratio(:api00, :enroll, clus_one_stage) +julia> ratio(:api00, :enroll, clus_one_stage) 1×2 DataFrame - Row │ ratio SE - │ Float64 Float64 + Row │ ratio SE + │ Float64 Float64 ─────┼─────────────────── - 1 │ 1.17182 0.130834 + 1 │ 1.17182 0.133361 + ``` """ function ratio(variable_num::Symbol, variable_den::Symbol, design::ReplicateDesign) From 4d25a36efb8138e022d2b80696b3e2b75856756b Mon Sep 17 00:00:00 2001 From: ayushpatnaikgit Date: Fri, 20 Jan 2023 17:39:40 +0530 Subject: [PATCH 50/80] Add commented code for fixing doctests. --- docs/make.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/make.jl b/docs/make.jl index c79915e1..ba49de19 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -7,6 +7,7 @@ DocMeta.setdocmeta!(Survey, :DocTestSetup, :(using Survey); recursive=true) makedocs(; modules=[Survey], authors="xKDR Forum", + # doctest = :fix, repo="https://github.com/xKDR/Survey.jl/blob/{commit}{path}#{line}", sitename="$Survey.jl", format=Documenter.HTML(; From b1718c016c46e2eac132e2de699782cef29980bd Mon Sep 17 00:00:00 2001 From: ayushpatnaikgit Date: Fri, 20 Jan 2023 18:06:27 +0530 Subject: [PATCH 51/80] Use fill instead of repeat --- src/bootstrap.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/bootstrap.jl b/src/bootstrap.jl index 630e98b8..db3bb96f 100644 --- a/src/bootstrap.jl +++ b/src/bootstrap.jl @@ -36,8 +36,8 @@ function bootweights(design::SurveyDesign; replicates=4000, rng=MersenneTwister( rh = zeros(Int, nh) for i in randinds[replicate, :] rh[i] += 1 - end - cluster_sorted[!, "replicate_" * string(replicate)] = 
vcat([repeat([rh[i] * (nh / (nh-1))], npsus[i]) for i in 1:length(rh)]...) .* cluster_sorted[!, design.weights] + end + cluster_sorted[!, "replicate_" * string(replicate)] = vcat([fill(rh[i] * (nh / (nh-1)), npsus[i]) for i in 1:length(rh)]...) .* cluster_sorted[!, design.weights] end push!(substrata_dfs, cluster_sorted) end From 7fa8f5418e2a57f7f20d132ea04555939c2ad060 Mon Sep 17 00:00:00 2001 From: ayushpatnaikgit Date: Fri, 20 Jan 2023 18:06:31 +0530 Subject: [PATCH 52/80] Fix test. --- test/show.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/show.jl b/test/show.jl index 1231a910..05ae7a1f 100644 --- a/test/show.jl +++ b/test/show.jl @@ -98,7 +98,7 @@ end data: 183×4044 DataFrame strata: none cluster: dnum - [637, 637, 637 … 448] + [61, 61, 61 … 815] popsize: [507.7049, 507.7049, 507.7049 … 507.7049] sampsize: [15, 15, 15 … 15] weights: [33.847, 33.847, 33.847 … 33.847] From c60a95adf3bdd82d2c8903a2aa17334acef779a8 Mon Sep 17 00:00:00 2001 From: ayushpatnaikgit Date: Sun, 22 Jan 2023 13:16:23 +0530 Subject: [PATCH 53/80] Further speedup bootstrap algo by 2x. 
--- src/bootstrap.jl | 9 +++------ src/mean.jl | 24 ++++++++++++------------ src/ratio.jl | 5 +++-- src/total.jl | 28 ++++++++++++++-------------- 4 files changed, 32 insertions(+), 34 deletions(-) diff --git a/src/bootstrap.jl b/src/bootstrap.jl index db3bb96f..c57649a8 100644 --- a/src/bootstrap.jl +++ b/src/bootstrap.jl @@ -31,13 +31,10 @@ function bootweights(design::SurveyDesign; replicates=4000, rng=MersenneTwister( psus = unique(cluster_sorted[!, design.cluster]) npsus = [(count(==(i), cluster_sorted[!, design.cluster])) for i in psus] nh = length(psus) - randinds = rand(rng, 1:(nh), replicates, (nh-1)) + cluster_weights = cluster_sorted[!, design.weights] for replicate in 1:replicates - rh = zeros(Int, nh) - for i in randinds[replicate, :] - rh[i] += 1 - end - cluster_sorted[!, "replicate_" * string(replicate)] = vcat([fill(rh[i] * (nh / (nh-1)), npsus[i]) for i in 1:length(rh)]...) .* cluster_sorted[!, design.weights] + randinds = rand(rng, 1:(nh), (nh-1)) + cluster_sorted[!, "replicate_" * string(replicate)] = vcat([fill((count(==(i), randinds)) * (nh / (nh-1)), npsus[i]) for i in 1:nh]...) 
.* cluster_weights end push!(substrata_dfs, cluster_sorted) end diff --git a/src/mean.jl b/src/mean.jl index 5b87ffdf..51de1b74 100644 --- a/src/mean.jl +++ b/src/mean.jl @@ -13,15 +13,15 @@ julia> mean(:api00, clus_one_stage) Row │ mean SE │ Float64 Float64 ─────┼────────────────── - 1 │ 644.169 23.2877 + 1 │ 644.169 23.4107 julia> mean([:api00, :enroll], clus_one_stage) 2×3 DataFrame Row │ names mean SE │ String Float64 Float64 ─────┼────────────────────────── - 1 │ api00 644.169 23.2877 - 2 │ enroll 549.716 46.2597 + 1 │ api00 644.169 23.4107 + 2 │ enroll 549.716 45.7835 ``` """ function mean(x::Symbol, design::ReplicateDesign) @@ -52,17 +52,17 @@ julia> mean(:api00, :cname, clus_one_stage) Row │ cname mean SE │ String15 Float64 Any ─────┼─────────────────────────────────── - 1 │ Santa Clara 732.077 59.6794 - 2 │ San Diego 659.436 2.63657 - 3 │ Merced 519.25 8.18989e-15 - 4 │ Los Angeles 647.267 47.7685 - 5 │ Orange 710.563 2.21461e-13 + 1 │ Santa Clara 732.077 58.2169 + 2 │ San Diego 659.436 2.66703 + 3 │ Merced 519.25 2.28936e-15 + 4 │ Los Angeles 647.267 47.6233 + 5 │ Orange 710.563 2.19826e-13 6 │ Fresno 472.0 1.13687e-13 - 7 │ Plumas 709.556 1.26823e-13 - 8 │ Alameda 669.0 1.26888e-13 - 9 │ San Joaquin 551.189 2.17297e-13 + 7 │ Plumas 709.556 1.26058e-13 + 8 │ Alameda 669.0 1.27527e-13 + 9 │ San Joaquin 551.189 2.1791e-13 10 │ Kern 452.5 0.0 - 11 │ Mendocino 623.25 1.09409e-13 + 11 │ Mendocino 623.25 1.09545e-13 ``` """ function mean(x::Symbol, domain::Symbol, design::ReplicateDesign) diff --git a/src/ratio.jl b/src/ratio.jl index bc72d2f7..4a7385ac 100644 --- a/src/ratio.jl +++ b/src/ratio.jl @@ -8,12 +8,13 @@ julia> apiclus1 = load_data("apiclus1"); julia> clus_one_stage = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) |> bootweights; -ratio(:api00, :enroll, clus_one_stage) +julia> ratio(:api00, :enroll, clus_one_stage) 1×2 DataFrame Row │ ratio SE │ Float64 Float64 ─────┼─────────────────── - 1 │ 1.17182 0.133361 + 1 │ 1.17182 0.131518 + ``` """ 
function ratio(variable_num::Symbol, variable_den::Symbol, design::ReplicateDesign) diff --git a/src/total.jl b/src/total.jl index f6ceb823..878d2635 100644 --- a/src/total.jl +++ b/src/total.jl @@ -13,15 +13,15 @@ julia> total(:api00, clus_one_stage) Row │ total SE │ Float64 Float64 ─────┼────────────────────── - 1 │ 3.98999e6 9.10443e5 + 1 │ 3.98999e6 9.01611e5 julia> total([:api00, :enroll], clus_one_stage) 2×3 DataFrame Row │ names total SE │ String Float64 Float64 ─────┼────────────────────────────── - 1 │ api00 3.98999e6 9.10443e5 - 2 │ enroll 3.40494e6 9.47987e5 + 1 │ api00 3.98999e6 9.01611e5 + 2 │ enroll 3.40494e6 9.33396e5 ``` """ function total(x::Symbol, design::ReplicateDesign) @@ -52,17 +52,17 @@ julia> total(:api00, :cname, clus_one_stage) Row │ cname total SE │ String15 Float64 Any ─────┼──────────────────────────────────────── - 1 │ Santa Clara 6.44244e5 4.29558e5 - 2 │ San Diego 1.2276e6 8.60246e5 - 3 │ Merced 70300.2 70757.4 - 4 │ Los Angeles 3.2862e5 2.95688e5 - 5 │ Orange 3.84807e5 3.77128e5 - 6 │ Fresno 63903.1 64455.2 - 7 │ Plumas 2.16147e5 2.12279e5 - 8 │ Alameda 249080.0 2.5221e5 - 9 │ San Joaquin 6.90276e5 6.92353e5 - 10 │ Kern 30631.5 30333.5 - 11 │ Mendocino 84380.6 80774.4 + 1 │ Santa Clara 6.44244e5 4.2273e5 + 2 │ San Diego 1.2276e6 8.62727e5 + 3 │ Merced 70300.2 71336.3 + 4 │ Los Angeles 3.2862e5 2.93936e5 + 5 │ Orange 3.84807e5 3.88014e5 + 6 │ Fresno 63903.1 64781.7 + 7 │ Plumas 2.16147e5 2.12089e5 + 8 │ Alameda 249080.0 2.49228e5 + 9 │ San Joaquin 6.90276e5 6.81604e5 + 10 │ Kern 30631.5 30870.3 + 11 │ Mendocino 84380.6 80215.9 ``` """ function total(x::Symbol, domain::Symbol, design::ReplicateDesign) From 7290feee8b62a2303ad2cadf74ee4eba6d8a91d1 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Thu, 26 Jan 2023 19:48:15 +0200 Subject: [PATCH 54/80] Restructure documentation, finish tutorial --- docs/make.jl | 6 +- docs/src/api.md | 2 +- docs/src/getting_started.md | 203 ++++++++++++++++++++++++++++++++++++ docs/src/index.md | 28 
+---- docs/src/manual.md | 1 + 5 files changed, 212 insertions(+), 28 deletions(-) create mode 100644 docs/src/getting_started.md create mode 100644 docs/src/manual.md diff --git a/docs/make.jl b/docs/make.jl index ba49de19..2f6535ca 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -17,6 +17,8 @@ makedocs(; ), pages=[ "Home" => "index.md", + "Getting Started" => "getting_started.md", + "Manual" => "manual.md", "Moving from R" => "R_comparison.md", "API reference" => "api.md" ], @@ -25,6 +27,6 @@ makedocs(; deploydocs(; repo="github.com/xKDR/Survey.jl", - target = "build", - devbranch="main" + target="build", + devbranch="main", ) diff --git a/docs/src/api.md b/docs/src/api.md index 5b538a55..0890ab65 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -19,7 +19,7 @@ mean(x::Symbol, domain::Symbol, design::ReplicateDesign) total(x::Symbol, design::ReplicateDesign) total(x::Symbol, domain::Symbol, design::ReplicateDesign) quantile -ratio(variable_num::Symbol, variable_den::Symbol, design::SurveyDesign) +ratio plot(design::AbstractSurveyDesign, x::Symbol, y::Symbol; kwargs...) boxplot(design::AbstractSurveyDesign, x::Symbol, y::Symbol; kwargs...) hist(design::AbstractSurveyDesign, var::Symbol, diff --git a/docs/src/getting_started.md b/docs/src/getting_started.md new file mode 100644 index 00000000..54b97243 --- /dev/null +++ b/docs/src/getting_started.md @@ -0,0 +1,203 @@ +## Instalation + +The `Survey.jl` package is not yet registered. For now, installation of the package +is done using the following command: + +```julia +] add "https://github.com/xKDR/Survey.jl.git" +``` + +After registration, the regular `Pkg` commands can be used for installing the package: + +```julia +julia> using Pkg + +julia> Pkg.add("Survey") +``` + +```julia +julia> ] add Survey +``` + +## Tutorial + +This tutorial assumes basic knowledge of statistics and survey analysis. 
+ +To begin this tutorial, load the package in your workspace: + +```julia +julia> using Survey +``` + +Now load a survey dataset that you want to study. In this tutorial we will be using +the [Academic Performance Index](https://r-survey.r-forge.r-project.org/survey/html/api.html) +(API) datasets for Californian schools. The datasets contain information for all +schools with at least 100 students and for various probability samples of the +data. + +!!! note + + The API program has been discontinued at the end of 2018. Information is archived + at [https://www.cde.ca.gov/re/pr/api.asp](https://www.cde.ca.gov/re/pr/api.asp) + +```julia +julia> apisrs = load_data("apisrs") +200×40 DataFrame + Row │ Column1 cds stype name sname snum dn ⋯ + │ Int64 Int64 String1 String15 String Int64 St ⋯ +─────┼────────────────────────────────────────────────────────────────────────────────────────────── + 1 │ 1039 15739081534155 H McFarland High McFarland High 1039 Mc ⋯ + 2 │ 1124 19642126066716 E Stowers (Cecil Stowers (Cecil B.) Elementary 1124 AB + 3 │ 2868 30664493030640 H Brea-Olinda Hig Brea-Olinda High 2868 Br + 4 │ 1273 19644516012744 E Alameda Element Alameda Elementary 1273 Do + 5 │ 4926 40688096043293 E Sunnyside Eleme Sunnyside Elementary 4926 Sa ⋯ + 6 │ 2463 19734456014278 E Los Molinos Ele Los Molinos Elementary 2463 Ha + ⋮ │ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋱ + 196 │ 969 15635291534775 H North High North High 969 Ke + 197 │ 1752 19647336017446 E Hammel Street E Hammel Street Elementary 1752 Lo + 198 │ 4480 37683386039143 E Audubon Element Audubon Elementary 4480 Sa ⋯ + 199 │ 4062 36678196036222 E Edison Elementa Edison Elementary 4062 On + 200 │ 2683 24657716025621 E Franklin Elemen Franklin Elementary 2683 Me + 34 columns and 189 rows omitted +``` + +`apisrs` is a simple random sample of the Academic Performance Index of Californian +schools. The [`load_data`](@ref) function loads it as a +[`DataFrame`](https://dataframes.juliadata.org/stable/lib/types/#DataFrames.DataFrame). 
+You can look at the column names of `apisrs` to get an idea of what the dataset +contains. + +```julia +julia> names(apisrs) +40-element Vector{String}: + "Column1" + "cds" + "stype" + "name" + "sname" + "snum" + "dname" + "dnum" + ⋮ + "avg.ed" + "full" + "emer" + "enroll" + "api.stu" + "pw" + "fpc" +``` + +Next, build a survey design from your `DataFrame`: + +```julia +julia> srs = SurveyDesign(apisrs; weights=:pw) +SurveyDesign: +data: 200×45 DataFrame +strata: none +cluster: none +popsize: [6194.0, 6194.0, 6194.0 … 6194.0] +sampsize: [200, 200, 200 … 200] +weights: [30.97, 30.97, 30.97 … 30.97] +allprobs: [0.0323, 0.0323, 0.0323 … 0.0323] +``` + +This is a simple random sample design with weights given by the column `:pw` of +`apisrs`. You can also create more complex designs such as stratified or cluster +sample designs. You can find more information on the complete capabilities of +the package in the [Manual](@ref). The purpose of this tutorial is to show the +basic usage of the package. For that, we will stick with a simple random sample. + +Now you can analyse your design according to your needs using the +[functionality](@ref Index) provided by the package. For example, you can compute +the estimated mean or population total for a given variable. Let's say we're +interested in the mean Academic Performance Index from the year 1999. First we +need to convert the [`SurveyDesign`](@ref) to a [`ReplicateDesign`](@ref) using +bootstrapping: + +```julia +julia> bsrs = bootweights(srs) +ReplicateDesign: +data: 200×4045 DataFrame +strata: none +cluster: none +popsize: [6194.0, 6194.0, 6194.0 … 6194.0] +sampsize: [200, 200, 200 … 200] +weights: [30.97, 30.97, 30.97 … 30.97] +allprobs: [0.0323, 0.0323, 0.0323 … 0.0323] +replicates: 4000 +``` + +We do this because [TODO: explain why]. 
Now we can compute the estimated mean: + +```julia +julia> mean(:api99, bsrs) +1×2 DataFrame + Row │ mean SE + │ Float64 Float64 +─────┼────────────────── + 1 │ 624.685 9.5747 +``` + +We can also find the mean of both the 1999 API and 2000 API for a clear +comparison between students' performance from one year to another: + +```julia +2×3 DataFrame + Row │ names mean SE + │ String Float64 Float64 +─────┼────────────────────────── + 1 │ api99 624.685 9.5747 + 2 │ api00 656.585 9.30656 +``` + +The [`ratio`](@ref) is also appropriate for studying the relationship between +the two APIs: + +```julia +julia> ratio(:api00, :api99, bsrs) +1×2 DataFrame + Row │ ratio SE + │ Float64 Float64 +─────┼───────────────────── + 1 │ 1.05107 0.00364165 +``` + +If we're interested in a certain statistic estimated by a specific domain, we +can add the domain as the second parameter to our function. Let's say we want +to find the estimated total number of students enrolled in schools from each +county: + +```julia +julia> total(:enroll, :cname, bsrs) +38×3 DataFrame + Row │ cname total SE + │ String15 Float64 Any +─────┼──────────────────────────────────────────── + 1 │ Kern 1.95823e5 74984.5 + 2 │ Los Angeles 867129.0 1.34517e5 + 3 │ Orange 1.68786e5 63990.2 + 4 │ San Luis Obispo 6720.49 6731.29 + 5 │ San Francisco 30319.6 18024.1 + 6 │ Modoc 6503.7 6500.84 + ⋮ │ ⋮ ⋮ ⋮ + 34 │ Yolo 12171.2 12131.8 + 35 │ Calaveras 12976.4 13095.7 + 36 │ Napa 39239.0 29841.1 + 37 │ Lake 6410.79 6562.72 + 38 │ Merced 15392.1 14921.9 + 27 rows omitted +``` + +Another way to visualize data is through graphs. 
We can make a histogram to +better see the distribution of enrolled students: + +```@setup warning +# !!!THIS NEEDS TO MATCH THE EXAMPLE IN THE DOCSTRING OF `hist` +``` + +```julia +julia> hist(srs, :enroll) +``` + +![](assets/hist.png) diff --git a/docs/src/index.md b/docs/src/index.md index eddbcf0f..dd95d64d 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -4,28 +4,6 @@ CurrentModule = Survey # Survey -This package is used to study complex survey data. It aims to be a fast alternative to the [Survey package in R](https://cran.r-project.org/web/packages/survey/index.html) developed by [Professor Thomas Lumley](https://www.stat.auckland.ac.nz/people/tlum005). - -This package currently supports simple random sample and stratified sample. In future releases, it will support multistage sampling as well. - -## Basic demo - -The following demo uses the -[Academic Performance Index](https://r-survey.r-forge.r-project.org/survey/html/api.html) -(API) dataset for Californian schools. The data sets contain information for all schools -with at least 100 students and for various probability samples of the data. - -The API program has been discontinued at the end of 2018. Information is archived at -[https://www.cde.ca.gov/re/pr/api.asp](https://www.cde.ca.gov/re/pr/api.asp) - -Firstly, a survey design needs a dataset from which to gather information. The sample -datasets provided with the package can be loaded as `DataFrame`s using [`load_data`](@ref): - -```julia -julia> apisrs = load_data("apisrs"); -``` - -`apisrs` is a simple random sample of the Academic Performance Index of Californian schools. - -Next, we can build a design. -#TODO: continue tutorial +This package is used to study complex survey data. It aims to be a fast alternative +to the [Survey package in R](https://cran.r-project.org/web/packages/survey/index.html) +developed by [Professor Thomas Lumley](https://www.stat.auckland.ac.nz/people/tlum005). 
diff --git a/docs/src/manual.md b/docs/src/manual.md new file mode 100644 index 00000000..39021136 --- /dev/null +++ b/docs/src/manual.md @@ -0,0 +1 @@ +# Manual From cbb1a015e11f233b88aa7ac747f192606463c5bd Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Thu, 26 Jan 2023 20:16:38 +0200 Subject: [PATCH 55/80] Add manual structure --- docs/src/manual.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/src/manual.md b/docs/src/manual.md index 39021136..57063217 100644 --- a/docs/src/manual.md +++ b/docs/src/manual.md @@ -1 +1,9 @@ # Manual + +## `DataFrames` dependence + +## Bootstrapping + +## Plotting + +## Performance From ca186e498e536e148fc734a286b2e45c497b2895 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Fri, 27 Jan 2023 11:45:29 +0200 Subject: [PATCH 56/80] Add underscore to `allprobs` column label for consistency --- src/SurveyDesign.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index fc8030f1..bce497e3 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -100,7 +100,7 @@ struct SurveyDesign <: AbstractSurveyDesign weights_labels = :_weights data[!, weights_labels] = repeat([1], nrow(data)) end - allprobs_labels = :allprobs + allprobs_labels = :_allprobs data[!, allprobs_labels] = 1 ./ data[!, weights_labels] # In one-stage cluster sample, allprobs is just probs, no multiplication needed pps = false # for now no explicit pps supported faster functions, but they can be added new(data, cluster, popsize, sampsize_labels, strata, weights_labels, allprobs_labels, pps) From 40a7fbfca3c79e62bf65e3009510ff486d0297ac Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Fri, 27 Jan 2023 11:45:56 +0200 Subject: [PATCH 57/80] Add subsection on `DataFrames` --- docs/src/manual.md | 162 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 161 insertions(+), 1 deletion(-) diff --git a/docs/src/manual.md b/docs/src/manual.md index 57063217..a83e3fa3 100644 --- 
a/docs/src/manual.md +++ b/docs/src/manual.md @@ -1,6 +1,166 @@ # Manual -## `DataFrames` dependence +## `DataFrames` in `Survey` + +The internal structure of a survey design is built upon +[`DataFrames`](https://dataframes.juliadata.org/stable/). In fact, the `data` +argument is the only required argument for the constructor and it must be an +[`AbstractDataFrame`](https://dataframes.juliadata.org/stable/lib/types/#DataFrames.AbstractDataFrame). + +### Data manipulation + +The provided `DataFrame` is altered by the [`SurveyDesign`](@ref) constructor +in order to add columns for frequency and probability weights, sample and +population sizes and, if necessary, strata and cluster information. + +Notice the change in `apisrs`: + +```julia +julia> apisrs = load_data("apisrs") +200×40 DataFrame + Row │ Column1 cds stype name sname ⋯ + │ Int64 Int64 String1 String15 String ⋯ +─────┼────────────────────────────────────────────────────────────────────────── + 1 │ 1039 15739081534155 H McFarland High McFarland High ⋯ + 2 │ 1124 19642126066716 E Stowers (Cecil Stowers (Cecil B.) 
E + 3 │ 2868 30664493030640 H Brea-Olinda Hig Brea-Olinda High + 4 │ 1273 19644516012744 E Alameda Element Alameda Elementary + 5 │ 4926 40688096043293 E Sunnyside Eleme Sunnyside Elementary ⋯ + 6 │ 2463 19734456014278 E Los Molinos Ele Los Molinos Elementa + ⋮ │ ⋮ ⋮ ⋮ ⋮ ⋮ ⋱ + 196 │ 969 15635291534775 H North High North High + 197 │ 1752 19647336017446 E Hammel Street E Hammel Street Elemen + 198 │ 4480 37683386039143 E Audubon Element Audubon Elementary ⋯ + 199 │ 4062 36678196036222 E Edison Elementa Edison Elementary + 200 │ 2683 24657716025621 E Franklin Elemen Franklin Elementary + 36 columns and 189 rows omitted + +julia> names(apisrs) +40-element Vector{String}: + "Column1" + "cds" + "stype" + "name" + "sname" + "snum" + "dname" + "dnum" + ⋮ + "avg.ed" + "full" + "emer" + "enroll" + "api.stu" + "pw" + "fpc" + +julia> srs = SurveyDesign(apisrs; weights=:pw); + +julia> apisrs +200×45 DataFrame + Row │ Column1 cds stype name sname ⋯ + │ Int64 Int64 String1 String15 String ⋯ +─────┼────────────────────────────────────────────────────────────────────────── + 1 │ 1039 15739081534155 H McFarland High McFarland High ⋯ + 2 │ 1124 19642126066716 E Stowers (Cecil Stowers (Cecil B.) 
E + 3 │ 2868 30664493030640 H Brea-Olinda Hig Brea-Olinda High + 4 │ 1273 19644516012744 E Alameda Element Alameda Elementary + 5 │ 4926 40688096043293 E Sunnyside Eleme Sunnyside Elementary ⋯ + 6 │ 2463 19734456014278 E Los Molinos Ele Los Molinos Elementa + ⋮ │ ⋮ ⋮ ⋮ ⋮ ⋮ ⋱ + 196 │ 969 15635291534775 H North High North High + 197 │ 1752 19647336017446 E Hammel Street E Hammel Street Elemen + 198 │ 4480 37683386039143 E Audubon Element Audubon Elementary ⋯ + 199 │ 4062 36678196036222 E Edison Elementa Edison Elementary + 200 │ 2683 24657716025621 E Franklin Elemen Franklin Elementary + 41 columns and 189 rows omitted + +julia> names(apisrs) +45-element Vector{String}: + "Column1" + "cds" + "stype" + "name" + "sname" + "snum" + "dname" + "dnum" + ⋮ + "pw" + "fpc" + "false_strata" + "false_cluster" + "_sampsize" + "_popsize" + "_allprobs" +``` + +Five columns were added: + +- `false_strata` - only in the case of no stratification + + This column is necessary because when making a [`ReplicateDesign`](@ref), the + [`bootweights`](@ref) function uses [`groupby`](https://dataframes.juliadata.org/stable/lib/functions/#DataFrames.groupby) + with a column representing the stratification variable. If there are no strata, + there is no such column so it should be added in order to keep `bootweights` + general. + +- `false_cluster` - only in the case of no clustering + + The reasoning is the same as in the case of no stratification. 
+ +- `_sampsize` - sample sizes + +- `_popsize` - population sizes + + These match the stratification variable: + + ```julia + julia> apistrat = load_data("apistrat"); + + julia> strat = SurveyDesign(apistrat; strata=:stype, weights=:pw); + + julia> apistrat[:, [:stype, :_sampsize, :_popsize]] + 200×3 DataFrame + Row │ stype _sampsize _popsize + │ String1 Int64 Float64 + ─────┼────────────────────────────── + 1 │ E 100 4421.0 + 2 │ E 100 4421.0 + 3 │ E 100 4421.0 + 4 │ E 100 4421.0 + 5 │ E 100 4421.0 + 6 │ E 100 4421.0 + ⋮ │ ⋮ ⋮ ⋮ + 196 │ E 100 4421.0 + 197 │ H 50 755.0 + 198 │ M 50 1018.0 + 199 │ E 100 4421.0 + 200 │ H 50 755.0 + 189 rows omitted + ``` + +- `_allprobs` - probability weights + +No column was added for frequency weights because the column passed through the +`weights` argument is used by other functions, hence there is no need to add a +new column. If `weights` is not specified, then a column called `_weights` is +added. + +### Why `DataFrames` + +Survey data most of the time, if not always, is structured in a way that is very +well suited for data frames. The [`DataFrames.jl`](https://dataframes.juliadata.org/stable/) +package is mature and well maintained and provides a lot of functionality that +proves useful for using inside functions such as [`bootweights`](@ref) or +[`mean`](@ref). Mainly, the functions used are +[`groupby`](https://dataframes.juliadata.org/stable/lib/functions/#DataFrames.groupby) +and [`combine`](https://dataframes.juliadata.org/stable/lib/functions/#DataFrames.combine). + +Now that support for [metadata](https://dataframes.juliadata.org/stable/lib/metadata/) +was introduced in `DataFrames.jl`, it becomes possible to use metadata in +`Survey.jl` to reduce space complexity. For example, stratification and clustering +information could be stored as metadata of the `DataFrame` passed through `data`. 
## Bootstrapping From 7ac37e14123ea990da570e7ba744ac36ac25e239 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Fri, 27 Jan 2023 13:13:36 +0200 Subject: [PATCH 58/80] Change `julia` blocks to `@repl` blocks --- docs/src/getting_started.md | 135 ++++++++---------------------------- docs/src/manual.md | 117 ++++--------------------------- 2 files changed, 45 insertions(+), 207 deletions(-) diff --git a/docs/src/getting_started.md b/docs/src/getting_started.md index 54b97243..7fdd8ba6 100644 --- a/docs/src/getting_started.md +++ b/docs/src/getting_started.md @@ -9,14 +9,14 @@ is done using the following command: After registration, the regular `Pkg` commands can be used for installing the package: -```julia -julia> using Pkg +```@repl +using Pkg -julia> Pkg.add("Survey") +Pkg.add("Survey") ``` ```julia -julia> ] add Survey +] add Survey ``` ## Tutorial @@ -25,8 +25,8 @@ This tutorial assumes basic knowledge of statistics and survey analysis. To begin this tutorial, load the package in your workspace: -```julia -julia> using Survey +```@repl tutorial +using Survey ``` Now load a survey dataset that you want to study. In this tutorial we will be using @@ -40,25 +40,8 @@ data. The API program has been discontinued at the end of 2018. Information is archived at [https://www.cde.ca.gov/re/pr/api.asp](https://www.cde.ca.gov/re/pr/api.asp) -```julia -julia> apisrs = load_data("apisrs") -200×40 DataFrame - Row │ Column1 cds stype name sname snum dn ⋯ - │ Int64 Int64 String1 String15 String Int64 St ⋯ -─────┼────────────────────────────────────────────────────────────────────────────────────────────── - 1 │ 1039 15739081534155 H McFarland High McFarland High 1039 Mc ⋯ - 2 │ 1124 19642126066716 E Stowers (Cecil Stowers (Cecil B.) 
Elementary 1124 AB - 3 │ 2868 30664493030640 H Brea-Olinda Hig Brea-Olinda High 2868 Br - 4 │ 1273 19644516012744 E Alameda Element Alameda Elementary 1273 Do - 5 │ 4926 40688096043293 E Sunnyside Eleme Sunnyside Elementary 4926 Sa ⋯ - 6 │ 2463 19734456014278 E Los Molinos Ele Los Molinos Elementary 2463 Ha - ⋮ │ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋱ - 196 │ 969 15635291534775 H North High North High 969 Ke - 197 │ 1752 19647336017446 E Hammel Street E Hammel Street Elementary 1752 Lo - 198 │ 4480 37683386039143 E Audubon Element Audubon Elementary 4480 Sa ⋯ - 199 │ 4062 36678196036222 E Edison Elementa Edison Elementary 4062 On - 200 │ 2683 24657716025621 E Franklin Elemen Franklin Elementary 2683 Me - 34 columns and 189 rows omitted +```@repl tutorial +apisrs = load_data("apisrs") ``` `apisrs` is a simple random sample of the Academic Performance Index of Californian @@ -67,39 +50,14 @@ schools. The [`load_data`](@ref) function loads it as a You can look at the column names of `apisrs` to get an idea of what the dataset contains. -```julia -julia> names(apisrs) -40-element Vector{String}: - "Column1" - "cds" - "stype" - "name" - "sname" - "snum" - "dname" - "dnum" - ⋮ - "avg.ed" - "full" - "emer" - "enroll" - "api.stu" - "pw" - "fpc" +```@repl tutorial +names(apisrs) ``` Next, build a survey design from your `DataFrame`: -```julia -julia> srs = SurveyDesign(apisrs; weights=:pw) -SurveyDesign: -data: 200×45 DataFrame -strata: none -cluster: none -popsize: [6194.0, 6194.0, 6194.0 … 6194.0] -sampsize: [200, 200, 200 … 200] -weights: [30.97, 30.97, 30.97 … 30.97] -allprobs: [0.0323, 0.0323, 0.0323 … 0.0323] +```@repl tutorial +srs = SurveyDesign(apisrs; weights=:pw) ``` This is a simple random sample design with weights given by the column `:pw` of @@ -115,52 +73,28 @@ interested in the mean Academic Performance Index from the year 1999. 
First we need to convert the [`SurveyDesign`](@ref) to a [`ReplicateDesign`](@ref) using bootstrapping: -```julia -julia> bsrs = bootweights(srs) -ReplicateDesign: -data: 200×4045 DataFrame -strata: none -cluster: none -popsize: [6194.0, 6194.0, 6194.0 … 6194.0] -sampsize: [200, 200, 200 … 200] -weights: [30.97, 30.97, 30.97 … 30.97] -allprobs: [0.0323, 0.0323, 0.0323 … 0.0323] -replicates: 4000 +```@repl tutorial +bsrs = bootweights(srs) ``` We do this because [TODO: explain why]. Now we can compute the estimated mean: -```julia -julia> mean(:api99, bsrs) -1×2 DataFrame - Row │ mean SE - │ Float64 Float64 -─────┼────────────────── - 1 │ 624.685 9.5747 +```@repl tutorial +mean(:api99, bsrs) ``` We can also find the mean of both the 1999 API and 2000 API for a clear comparison between students' performance from one year to another: -```julia -2×3 DataFrame - Row │ names mean SE - │ String Float64 Float64 -─────┼────────────────────────── - 1 │ api99 624.685 9.5747 - 2 │ api00 656.585 9.30656 +```@repl tutorial +mean([:api99, :api00], bsrs) ``` The [`ratio`](@ref) is also appropriate for studying the relationship between the two APIs: -```julia -julia> ratio(:api00, :api99, bsrs) -1×2 DataFrame - Row │ ratio SE - │ Float64 Float64 -─────┼───────────────────── - 1 │ 1.05107 0.00364165 +```@repl tutorial +ratio(:api00, :api99, bsrs) ``` If we're interested in a certain statistic estimated by a specific domain, we @@ -168,25 +102,8 @@ can add the domain as the second parameter to our function. 
Let's say we want to find the estimated total number of students enrolled in schools from each county: -```julia -julia> total(:enroll, :cname, bsrs) -38×3 DataFrame - Row │ cname total SE - │ String15 Float64 Any -─────┼──────────────────────────────────────────── - 1 │ Kern 1.95823e5 74984.5 - 2 │ Los Angeles 867129.0 1.34517e5 - 3 │ Orange 1.68786e5 63990.2 - 4 │ San Luis Obispo 6720.49 6731.29 - 5 │ San Francisco 30319.6 18024.1 - 6 │ Modoc 6503.7 6500.84 - ⋮ │ ⋮ ⋮ ⋮ - 34 │ Yolo 12171.2 12131.8 - 35 │ Calaveras 12976.4 13095.7 - 36 │ Napa 39239.0 29841.1 - 37 │ Lake 6410.79 6562.72 - 38 │ Merced 15392.1 14921.9 - 27 rows omitted +```@repl tutorial +total(:enroll, :cname, bsrs) ``` Another way to visualize data is through graphs. We can make a histogram to @@ -200,4 +117,12 @@ better see the distribution of enrolled students: julia> hist(srs, :enroll) ``` +The REPL doesn't show the plot. To see it, you need to save it locally. + +```julia +julia> import AlgebraOfGraphics.save + +julia> save("hist.png", h) +``` + ![](assets/hist.png) diff --git a/docs/src/manual.md b/docs/src/manual.md index a83e3fa3..9b1006a1 100644 --- a/docs/src/manual.md +++ b/docs/src/manual.md @@ -15,84 +15,16 @@ population sizes and, if necessary, strata and cluster information. Notice the change in `apisrs`: -```julia -julia> apisrs = load_data("apisrs") -200×40 DataFrame - Row │ Column1 cds stype name sname ⋯ - │ Int64 Int64 String1 String15 String ⋯ -─────┼────────────────────────────────────────────────────────────────────────── - 1 │ 1039 15739081534155 H McFarland High McFarland High ⋯ - 2 │ 1124 19642126066716 E Stowers (Cecil Stowers (Cecil B.) 
E - 3 │ 2868 30664493030640 H Brea-Olinda Hig Brea-Olinda High - 4 │ 1273 19644516012744 E Alameda Element Alameda Elementary - 5 │ 4926 40688096043293 E Sunnyside Eleme Sunnyside Elementary ⋯ - 6 │ 2463 19734456014278 E Los Molinos Ele Los Molinos Elementa - ⋮ │ ⋮ ⋮ ⋮ ⋮ ⋮ ⋱ - 196 │ 969 15635291534775 H North High North High - 197 │ 1752 19647336017446 E Hammel Street E Hammel Street Elemen - 198 │ 4480 37683386039143 E Audubon Element Audubon Elementary ⋯ - 199 │ 4062 36678196036222 E Edison Elementa Edison Elementary - 200 │ 2683 24657716025621 E Franklin Elemen Franklin Elementary - 36 columns and 189 rows omitted - -julia> names(apisrs) -40-element Vector{String}: - "Column1" - "cds" - "stype" - "name" - "sname" - "snum" - "dname" - "dnum" - ⋮ - "avg.ed" - "full" - "emer" - "enroll" - "api.stu" - "pw" - "fpc" - -julia> srs = SurveyDesign(apisrs; weights=:pw); - -julia> apisrs -200×45 DataFrame - Row │ Column1 cds stype name sname ⋯ - │ Int64 Int64 String1 String15 String ⋯ -─────┼────────────────────────────────────────────────────────────────────────── - 1 │ 1039 15739081534155 H McFarland High McFarland High ⋯ - 2 │ 1124 19642126066716 E Stowers (Cecil Stowers (Cecil B.) 
E - 3 │ 2868 30664493030640 H Brea-Olinda Hig Brea-Olinda High - 4 │ 1273 19644516012744 E Alameda Element Alameda Elementary - 5 │ 4926 40688096043293 E Sunnyside Eleme Sunnyside Elementary ⋯ - 6 │ 2463 19734456014278 E Los Molinos Ele Los Molinos Elementa - ⋮ │ ⋮ ⋮ ⋮ ⋮ ⋮ ⋱ - 196 │ 969 15635291534775 H North High North High - 197 │ 1752 19647336017446 E Hammel Street E Hammel Street Elemen - 198 │ 4480 37683386039143 E Audubon Element Audubon Elementary ⋯ - 199 │ 4062 36678196036222 E Edison Elementa Edison Elementary - 200 │ 2683 24657716025621 E Franklin Elemen Franklin Elementary - 41 columns and 189 rows omitted - -julia> names(apisrs) -45-element Vector{String}: - "Column1" - "cds" - "stype" - "name" - "sname" - "snum" - "dname" - "dnum" - ⋮ - "pw" - "fpc" - "false_strata" - "false_cluster" - "_sampsize" - "_popsize" - "_allprobs" +```@setup manual_DataFrames +using Survey +``` + +```@repl manual_DataFrames +apisrs = load_data("apisrs") +names(apisrs) +srs = SurveyDesign(apisrs; weights=:pw); +apisrs +names(apisrs) ``` Five columns were added: @@ -115,30 +47,11 @@ Five columns were added: These match the stratification variable: - ```julia - julia> apistrat = load_data("apistrat"); - - julia> strat = SurveyDesign(apistrat; strata=:stype, weights=:pw); - - julia> apistrat[:, [:stype, :_sampsize, :_popsize]] - 200×3 DataFrame - Row │ stype _sampsize _popsize - │ String1 Int64 Float64 - ─────┼────────────────────────────── - 1 │ E 100 4421.0 - 2 │ E 100 4421.0 - 3 │ E 100 4421.0 - 4 │ E 100 4421.0 - 5 │ E 100 4421.0 - 6 │ E 100 4421.0 - ⋮ │ ⋮ ⋮ ⋮ - 196 │ E 100 4421.0 - 197 │ H 50 755.0 - 198 │ M 50 1018.0 - 199 │ E 100 4421.0 - 200 │ H 50 755.0 - 189 rows omitted - ``` +```@repl manual_DataFrames +apistrat = load_data("apistrat"); +strat = SurveyDesign(apistrat; strata=:stype, weights=:pw); +apistrat[:, [:stype, :_sampsize, :_popsize]] +``` - `_allprobs` - probability weights From 1cb9f7883a48695243e2746c2379973772265bf3 Mon Sep 17 00:00:00 2001 From: Iulia 
Dumitru Date: Fri, 27 Jan 2023 13:24:12 +0200 Subject: [PATCH 59/80] Remove extra blank lines --- src/bootstrap.jl | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/bootstrap.jl b/src/bootstrap.jl index c57649a8..1b1972eb 100644 --- a/src/bootstrap.jl +++ b/src/bootstrap.jl @@ -4,10 +4,8 @@ julia> using Random julia> apiclus1 = load_data("apiclus1"); - julia> clus_one_stage = SurveyDesign(apiclus1; clusters = :dnum, popsize=:fpc); - julia> bootweights(clus_one_stage; replicates=1000, rng=MersenneTwister(111)) # choose a seed for deterministic results ReplicateDesign: data: 183×1044 DataFrame From d9f72f6ea32497d119aaf5600cee8b7b6f712b66 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Fri, 27 Jan 2023 13:35:29 +0200 Subject: [PATCH 60/80] Combine docstrings into one --- src/quantile.jl | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/src/quantile.jl b/src/quantile.jl index 81003e43..187e5e19 100644 --- a/src/quantile.jl +++ b/src/quantile.jl @@ -21,24 +21,6 @@ julia> quantile(:api00,srs,0.5) │ Float64 Float64 ─────┼─────────────────────────── 1 │ 659.0 14.9764 -``` -""" -function quantile(var::Symbol, design::ReplicateDesign, p::Real;kwargs...) - v = design.data[!, var] - probs = design.data[!, design.allprobs] - X = Statistics.quantile(v, ProbabilityWeights(probs), p) - Xt = [Statistics.quantile(v, ProbabilityWeights(design.data[! 
, "replicate_"*string(i)]), p) for i in 1:design.replicates] - variance = sum((Xt .- X).^2) / design.replicates - df = DataFrame(percentile = X, SE = sqrt(variance)) - rename!(df, :percentile => string(p) * "th percentile") - return df -end - -""" -```jldoctest -julia> apisrs = load_data("apisrs"); - -julia> srs = SurveyDesign(apisrs; weights=:pw) |> bootweights; julia> quantile(:enroll,srs,[0.1,0.2,0.5,0.75,0.95]) 5×3 DataFrame @@ -52,6 +34,17 @@ julia> quantile(:enroll,srs,[0.1,0.2,0.5,0.75,0.95]) 5 │ 0.95 1473.1 142.568 ``` """ +function quantile(var::Symbol, design::ReplicateDesign, p::Real; kwargs...) + v = design.data[!, var] + probs = design.data[!, design.allprobs] + X = Statistics.quantile(v, ProbabilityWeights(probs), p) + Xt = [Statistics.quantile(v, ProbabilityWeights(design.data[! , "replicate_"*string(i)]), p) for i in 1:design.replicates] + variance = sum((Xt .- X).^2) / design.replicates + df = DataFrame(percentile = X, SE = sqrt(variance)) + rename!(df, :percentile => string(p) * "th percentile") + return df +end + function quantile(var::Symbol, design::ReplicateDesign, probs::Vector{<:Real}; kwargs...) df = vcat([rename!(quantile(var, design, prob; kwargs...),[:statistic, :SE]) for prob in probs]...) df.percentile = string.(probs) From 9d0c7ca61367fb9e5ae74913bce5e6d107e8c053 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Fri, 27 Jan 2023 13:35:48 +0200 Subject: [PATCH 61/80] Add dot at the end of sentence --- src/ratio.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ratio.jl b/src/ratio.jl index 4a7385ac..3b20b806 100644 --- a/src/ratio.jl +++ b/src/ratio.jl @@ -1,7 +1,7 @@ """ ratio(numerator, denominator, design) -Estimate the ratio of the columns specified in numerator and denominator +Estimate the ratio of the columns specified in numerator and denominator. 
```jldoctest julia> apiclus1 = load_data("apiclus1"); From 74d06e5b02796121003631b72727b2403ee24850 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Fri, 27 Jan 2023 13:36:32 +0200 Subject: [PATCH 62/80] Change docstring example to REPL-style --- src/boxplot.jl | 15 ++++++++++----- src/hist.jl | 15 ++++++++++----- src/plot.jl | 15 ++++++++++----- 3 files changed, 30 insertions(+), 15 deletions(-) diff --git a/src/boxplot.jl b/src/boxplot.jl index 8ee3dcc4..b28d84b4 100644 --- a/src/boxplot.jl +++ b/src/boxplot.jl @@ -8,11 +8,16 @@ Weights can be specified by a `Symbol` using the keyword argument `weights`. The keyword arguments are all the arguments that can be passed to `mapping` in [AlgebraOfGraphics](https://docs.juliahub.com/AlgebraOfGraphics/CHIaw/0.4.7/). -```@example boxplot -apisrs = load_data("apisrs"); -srs = SurveyDesign(apisrs; weights=:pw); -bp = boxplot(srs, :stype, :enroll; weights = :pw) -save("boxplot.png", bp); nothing # hide +```julia +julia> using AlgebraOfGraphics + +julia> apisrs = load_data("apisrs"); + +julia> srs = SurveyDesign(apisrs; weights=:pw); + +julia> bp = boxplot(srs, :stype, :enroll; weights = :pw); + +julia> save("boxplot.png", bp) ``` ![](assets/boxplot.png) diff --git a/src/hist.jl b/src/hist.jl index 40935a1e..369620c4 100644 --- a/src/hist.jl +++ b/src/hist.jl @@ -59,11 +59,16 @@ For the complete argument list see [Makie.hist](https://makie.juliaplots.org/sta The `weights` argument should be a `Symbol` specifying a design variable. 
-```@example histogram -apisrs = load_data("apisrs"); -srs = SurveyDesign(apisrs; weights=:pw); -h = hist(srs, :enroll) -save("hist.png", h); nothing # hide +```julia +julia> using AlgebraOfGraphics + +julia> apisrs = load_data("apisrs"); + +julia> srs = SurveyDesign(apisrs; weights=:pw); + +julia> h = hist(srs, :enroll); + +julia> save("hist.png", h) ``` ![](assets/hist.png) diff --git a/src/plot.jl b/src/plot.jl index 79f1b97d..2e1e2338 100644 --- a/src/plot.jl +++ b/src/plot.jl @@ -6,11 +6,16 @@ Scatter plot of survey design variables `x` and `y`. The plot takes into account the frequency weights specified by the user in the design. -```@example plot -apisrs = load_data("apisrs"); -srs = SurveyDesign(apisrs; weights=:pw); -s = plot(srs, :api99, :api00) -save("scatter.png", s); nothing # hide +```julia +julia> using AlgebraOfGraphics + +julia> apisrs = load_data("apisrs"); + +julia> srs = SurveyDesign(apisrs; weights=:pw); + +julia> s = plot(srs, :api99, :api00); + +julia> save("scatter.png", s) ``` ![](assets/scatter.png) From 1582090e797ee39948fefdfa4ab3efbef89fc9af Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Fri, 27 Jan 2023 13:48:09 +0200 Subject: [PATCH 63/80] Restructure API to include multiple methods nicer --- docs/src/api.md | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/docs/src/api.md b/docs/src/api.md index 0890ab65..341dcbba 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -14,17 +14,11 @@ SurveyDesign ReplicateDesign load_data bootweights -mean(x::Symbol, design::ReplicateDesign) -mean(x::Symbol, domain::Symbol, design::ReplicateDesign) -total(x::Symbol, design::ReplicateDesign) -total(x::Symbol, domain::Symbol, design::ReplicateDesign) +mean +total quantile ratio -plot(design::AbstractSurveyDesign, x::Symbol, y::Symbol; kwargs...) -boxplot(design::AbstractSurveyDesign, x::Symbol, y::Symbol; kwargs...) 
-hist(design::AbstractSurveyDesign, var::Symbol, - bins::Union{Integer, AbstractVector} = freedman_diaconis(design, var); - normalization = :density, - kwargs... - ) +plot +boxplot +hist ``` From 0252a4bc3690d5bb5a33bdeadfb42579d33bd9c6 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Fri, 27 Jan 2023 16:52:39 +0200 Subject: [PATCH 64/80] Add Plotting section --- docs/src/manual.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/docs/src/manual.md b/docs/src/manual.md index 9b1006a1..e87ef2c6 100644 --- a/docs/src/manual.md +++ b/docs/src/manual.md @@ -79,4 +79,23 @@ information could be stored as metadata of the `DataFrame` passed through `data` ## Plotting +`Survey` uses [`AlgebraOfGraphics`](https://aog.makie.org/stable/) for plotting. +All plotting functions support a variable number of keyword arguments (through +`kwargs...`) that are passed internally to corresponding `AlgebraOfGraphics` +functions. See the source code for details: +[`plot`](https://github.com/xKDR/Survey.jl/blob/main/src/plot.jl), +[`hist`](https://github.com/xKDR/Survey.jl/blob/main/src/hist.jl), +[`boxplot`](https://github.com/xKDR/Survey.jl/blob/main/src/boxplot.jl). +This means that all functionality provided by `AlgebraOfGraphics` is supported +in `Survey`. + +Specific functionality might need to be imported from `AlgebraOfGraphics`. 
+Moreover, in order to choose the preferred +[`Makie backend`](https://docs.makie.org/stable/#makie_ecosystem) you must +explicitly use it: + +```@repl +using AlgebraOfGraphics, CairoMakie +``` + ## Performance From c2346c84715ee985971b3ba71b52b04a5be24c11 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Fri, 27 Jan 2023 17:40:34 +0200 Subject: [PATCH 65/80] Style check --- src/quantile.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/quantile.jl b/src/quantile.jl index 187e5e19..2fd8b797 100644 --- a/src/quantile.jl +++ b/src/quantile.jl @@ -15,14 +15,14 @@ julia> apisrs = load_data("apisrs"); julia> srs = SurveyDesign(apisrs; weights=:pw) |> bootweights; -julia> quantile(:api00,srs,0.5) +julia> quantile(:api00, srs, 0.5) 1×2 DataFrame Row │ 0.5th percentile SE │ Float64 Float64 ─────┼─────────────────────────── 1 │ 659.0 14.9764 -julia> quantile(:enroll,srs,[0.1,0.2,0.5,0.75,0.95]) +julia> quantile(:enroll, srs, [0.1,0.2,0.5,0.75,0.95]) 5×3 DataFrame Row │ percentile statistic SE │ String Float64 Float64 From 97389a2f78f2904269451e063668fd472449dc36 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Fri, 27 Jan 2023 18:53:13 +0200 Subject: [PATCH 66/80] Add comparisons --- docs/make.jl | 1 - docs/src/manual.md | 98 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 97 insertions(+), 2 deletions(-) diff --git a/docs/make.jl b/docs/make.jl index 2f6535ca..aeaab208 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -19,7 +19,6 @@ makedocs(; "Home" => "index.md", "Getting Started" => "getting_started.md", "Manual" => "manual.md", - "Moving from R" => "R_comparison.md", "API reference" => "api.md" ], checkdocs=:exports, diff --git a/docs/src/manual.md b/docs/src/manual.md index e87ef2c6..6d726064 100644 --- a/docs/src/manual.md +++ b/docs/src/manual.md @@ -98,4 +98,100 @@ explicitly use it: using AlgebraOfGraphics, CairoMakie ``` -## Performance +## Comparison with other languages + +There are multiple languages that offer 
survey analysis tools, most notably +[SAS/STAT](https://support.sas.com/rnd/app/stat/procedures/SurveyAnalysis.html) +and [R](https://CRAN.R-project.org/package=survey). + +### R comparison + +The inspiration for `Survey.jl` comes from R. Hence the syntax is in most cases +very similar to the syntax in the [`survey` package](https://cran.r-project.org/web/packages/survey/survey.pdf) +from R. To showcase this we will use the `apisrs` dataset found in both R's +`survey` and `Survey.jl`. See the [Tutorial](@ref) section for more details about +the `api` datasets. + +All examples show the R code first, followed by the Julia code. + +#### Loading data + +```R +data(api) +# all `api` datasets are loaded globally +``` + +```julia +apisrs = load_data("apisrs") +# only one dataset is loaded and stored in a variable +``` + +#### Creating a design + +```R +srs = svydesign(id=~1, data=apisrs, weights=~pw) # simple random sample +strat = svydesign(id=~1, data=apistrat, strata=~stype, weights=~pw) # stratified +clus1 = svydesign(id=~dnum, data=apiclus1, weights=~pw) # clustered (one stage) +``` + +```julia +srs = SurveyDesign(apisrs; weights=:pw) # simple random sample +strat = SurveyDesign(apistrat; strata=:stype, weights=:pw) # stratified +clus1 = SurveyDesign(apiclus1; clusters=:dnum, weights=:pw) # clustered (one stage) +``` + +#### Creating a replicate design + +```R +bsrs = as.svrepdesign(srs, type="bootstrap") +``` + +```julia +bsrs = bootweights(srs) +``` + +#### Computing the estimated mean + +```R +svymean(~api00, bsrs) +svymean(~api99+~api00, bsrs) +``` + +```julia +mean(:api00, bsrs) +mean([:api99, :api00], bsrs) +``` + +#### Computing the estimated total + +```R +svytotal(~api00, bsrs) +svytotal(~api99+~api00, bsrs) +``` + +```julia +total(:api00, bsrs) +total([:api99, :api00], bsrs) +``` + +#### Computing quantiles + +```R +svyquantile(~api00, bsrs, 0.5) +svyquantile(~api00, bsrs, c(0.25, 0.5, 0.75)) +``` + +```julia +quantile(:api00, bsrs, 0.5) +quantile(:api00, 
bsrs, [0.25, 0.5, 0.75]) +``` + +#### Domain estimation + +```R +svyby(~api00, ~cname, bsrs, svymean) +``` + +```julia +mean(:api00, :cname, bsrs) +``` From 185198d5bd2777c0bbc1326e5c6631990825b238 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Fri, 27 Jan 2023 19:00:24 +0200 Subject: [PATCH 67/80] Remove R_comparison.md file --- docs/src/R_comparison.md | 131 --------------------------------------- 1 file changed, 131 deletions(-) delete mode 100644 docs/src/R_comparison.md diff --git a/docs/src/R_comparison.md b/docs/src/R_comparison.md deleted file mode 100644 index 1d1a4b2d..00000000 --- a/docs/src/R_comparison.md +++ /dev/null @@ -1,131 +0,0 @@ -# Moving from R to Julia -This section presents examples to help move from R to Julia. Examples show R and Julia code for common operations in survey analysis.
-For the same operation, first the R and then the Julia code is presented. - -## Simple random sample - -The `apisrs` data, which is provided in both `survey` and `Survey.jl`, is used as an example. It's a simple random sample of the Academic Performance Index of Californian schools. - -### 1. Creating a survey design -Instantiating a simple random sample survey design. - -```R -library(survey) -data(api) -dsrs = svydesign(id = ~1, data = apisrs, weights = ~pw, fpc = ~fpc) -``` - -```julia -using Survey -srs = load_data("apisrs") -dsrs = SimpleRandomSample(srs; popsize = :fpc) -``` - -### 2. Mean -In the following example the mean of the variable `api00` is calculated. - -```R -svymean(~api00, dsrs) -``` -```julia -mean(:api00, dsrs) -``` - -### 3. Total -In the following example the sum of the variable `api00` is calculated. - -```R -svytotal(~api00, dsrs) -``` -```julia -total(:api00, dsrs) -``` - -### 4. Quantile -In the following example the median of the variable `api00` is calculated. -```R -svyquantile(~api00, dsrs, 0.5) -``` -```julia -quantile(:api00, dsrs, 0.5) -``` - -### 5. Domain estimation -In the following example the mean of the variable `api00` is calculated grouped by the variable `cname`. - -```R -svyby(~api00, ~cname, dsrs, svymean) -``` - -```julia -mean(:api00, :cname, dsrs) -``` - -In the following example the total of the variable `api00` is calculated grouped by the variable `cname`. - -```R -svyby(~api00, ~cname, dsrs, svytotal) -``` - -```julia -total(:api00, :cname, dsrs) -``` - -## Stratified sample - -The `apistrat` data, which is provided in both `survey` and `Survey`, is used as an example. It's a stratified sample of the Academic Performance Index of Californian schools. - -### 1. Creating a design object -The following example shows how to construct a design object for a stratified sample. 
- -```R -library(survey) -data(api) -dstrat = svydesign(id = ~1, data = apistrat, strata = ~stype, weights = ~pw, fpc = ~fpc) -``` - -```julia -using Survey -strat = load_data("apistrat") -dstrat = StratifiedSample(strat, :stype; popsize = :fpc) -``` - -### 2. Mean -In the following example the mean of the variable `api00` is calculated. - -```R -svymean(~api00, dstrat) -``` -```julia -mean(:api00, dstrat) -``` - -### 3. Total -In the following example the sum of the variable `api00` is calculated. - -```R -svytotal(~api00, dstrat) -``` -```julia -total(:api00, dstrat) -``` - -### 4. Quantile -In the following example the median of the variable `api00` is calculated. -```R -svyquantile(~api00, dstrat, 0.5) -``` -```julia -quantile(:api00, dstrat, 0.5) -``` - -### 5. Domain estimation -In the following example the mean of the variable `api00` is calculated grouped by the variable `cname`. - -```R -svyby(~api00, ~cname, dstrat, svymean) -``` - -```julia -mean(:api00, :cname, dstrat) -``` \ No newline at end of file From 36d308b8a08c5c09de856f7333923089ae453621 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Fri, 27 Jan 2023 19:01:46 +0200 Subject: [PATCH 68/80] Add Future plans section --- docs/src/manual.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/src/manual.md b/docs/src/manual.md index 6d726064..7310e072 100644 --- a/docs/src/manual.md +++ b/docs/src/manual.md @@ -195,3 +195,5 @@ svyby(~api00, ~cname, bsrs, svymean) ```julia mean(:api00, :cname, bsrs) ``` + +## Future plans From fb32c8f0f7598d739fe4b6fb151180561eb6d8cc Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Fri, 27 Jan 2023 19:03:29 +0200 Subject: [PATCH 69/80] Change section name to ReplicateDesign --- docs/src/manual.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/manual.md b/docs/src/manual.md index 7310e072..3197bb6a 100644 --- a/docs/src/manual.md +++ b/docs/src/manual.md @@ -75,7 +75,7 @@ was introduced in `DataFrames.jl`, it becomes possible to 
use metadata in `Survey.jl` to reduce space complexity. For example, stratification and clustering information could be stored as metadata of the `DataFrame` passed through `data`. -## Bootstrapping +## ReplicateDesign ## Plotting From 1ed029701625208614ee7228f5431207ff0423a1 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Fri, 27 Jan 2023 19:03:54 +0200 Subject: [PATCH 70/80] Remove backticks --- docs/src/manual.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/src/manual.md b/docs/src/manual.md index 3197bb6a..1adcc8c2 100644 --- a/docs/src/manual.md +++ b/docs/src/manual.md @@ -1,6 +1,6 @@ # Manual -## `DataFrames` in `Survey` +## DataFrames in Survey The internal structure of a survey design is build upon [`DataFrames`](https://dataframes.juliadata.org/stable/). In fact, the `data` @@ -60,7 +60,7 @@ No column was added for frequency weights because the column passed through the new column. If `weights` is not specified, then a column called `_weights` is added. -### Why `DataFrames` +### Why DataFrames Survey data most of the time, if not always, is structured in a way that is very well suited for data frames. 
The [`DataFrames.jl`](https://dataframes.juliadata.org/stable/) From 907625f4531d7ee0085e5c2d487616f95bb2e28e Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Fri, 27 Jan 2023 19:24:09 +0200 Subject: [PATCH 71/80] Restructure files --- docs/make.jl | 10 +- docs/src/getting_started.md | 2 +- docs/src/man/comparisons.md | 97 ++++++++++++++++++ docs/src/man/dataframes.md | 74 ++++++++++++++ docs/src/man/future.md | 1 + docs/src/man/plotting.md | 20 ++++ docs/src/man/replicate.md | 1 + docs/src/manual.md | 199 ------------------------------------ 8 files changed, 202 insertions(+), 202 deletions(-) create mode 100644 docs/src/man/comparisons.md create mode 100644 docs/src/man/dataframes.md create mode 100644 docs/src/man/future.md create mode 100644 docs/src/man/plotting.md create mode 100644 docs/src/man/replicate.md delete mode 100644 docs/src/manual.md diff --git a/docs/make.jl b/docs/make.jl index aeaab208..8ef8a61b 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -18,8 +18,14 @@ makedocs(; pages=[ "Home" => "index.md", "Getting Started" => "getting_started.md", - "Manual" => "manual.md", - "API reference" => "api.md" + "Manual" => [ + "DataFrames in Survey" => "man/dataframes.md", + "ReplicateDesign" => "man/replicate.md", + "Plotting" => "man/plotting.md", + "Comparison with other languages" => "man/comparisons.md", + "Future plans" => "man/future.md", + ], + "API reference" => "api.md", ], checkdocs=:exports, ) diff --git a/docs/src/getting_started.md b/docs/src/getting_started.md index 7fdd8ba6..de7a09bc 100644 --- a/docs/src/getting_started.md +++ b/docs/src/getting_started.md @@ -63,7 +63,7 @@ srs = SurveyDesign(apisrs; weights=:pw) This is a simple random sample design with weights given by the column `:pw` of `apisrs`. You can also create more complex designs such as stratified or cluster sample designs. You can find more information on the complete capabilities of -the package in the [Manual](@ref). 
The purpose of this tutorial is to show the +the package in the [Manual](@ref manual). The purpose of this tutorial is to show the basic usage of the package. For that, we will stick with a simple random sample. Now you can analyse your design according to your needs using the diff --git a/docs/src/man/comparisons.md b/docs/src/man/comparisons.md new file mode 100644 index 00000000..232dd905 --- /dev/null +++ b/docs/src/man/comparisons.md @@ -0,0 +1,97 @@ +# Comparison with other languages + +There are multiple languages that offer survey analysis tools, most notably +[SAS/STAT](https://support.sas.com/rnd/app/stat/procedures/SurveyAnalysis.html) +and [R](https://CRAN.R-project.org/package=survey). + +## R comparison + +The inspiration for `Survey.jl` comes from R. Hence the syntax is in most cases +very similar to the syntax in the [`survey` package](https://cran.r-project.org/web/packages/survey/survey.pdf) +from R. To showcase this we will use the `apisrs` dataset found in both R's +`survey` and `Survey.jl`. See the [Tutorial](@ref) section for more details about +the `api` datesets. + +All examples show the R code first, followed by the Julia code. 
+#### Loading data + +```R +data(api) +# all `api` datasets are loaded globally +``` + +```julia +srs = load_data("apisrs") +# only one dataset is loaded and stored in a variable +``` + +#### Creating a design + +```R +srs = svydesign(id=~1, data=apisrs, weights=~pw) # simple random sample +strat = svydesign(id=~1, data=apistrat, strata=~stype, weights=~pw) # stratified +clus1 = svydesign(id=~dnum, data=apiclus1, weights=~pw) # clustered (one stage) +``` + +```julia +srs = SurveyDesign(apisrs; weights=:pw) # simple random sample +strat = SurveyDesign(apistrat; strata=:stype, weights=:pw) # stratified +clus1 = SurveyDesign(apiclus1; clusters=:dnum, weights=:pw) # clustered (one stage) +``` + +#### Creating a replicate design + +```R +bsrs = as.svrepdesign(srs, type="bootstrap") +``` + +```julia +bsrs = bootweights(srs) +``` + +#### Computing the estimated mean + +```R +svymean(~api00, bsrs) +svymean(~api99+~api00, bsrs) +``` + +```julia +mean(:api00, bsrs) +mean([:api99, :api00], bsrs) +``` + +#### Computing the estimated total + +```R +svytotal(~api00, bsrs) +svytotal(~api99+~api00, bsrs) +``` + +```julia +total(:api00, bsrs) +total([:api99, :api00], bsrs) +``` + +#### Computing quantiles + +```R +svyquantile(~api00, bsrs, 0.5) +svyquantile(~api00, bsrs, c(0.25, 0.5, 0.75)) +``` + +```julia +quantile(:api00, bsrs, 0.5) +quantile(:api00, bsrs, [0.25, 0.5, 0.75]) +``` + +#### Domain estimation + +```R +svyby(~api00, ~cname, bsrs, svymean) +``` + +```julia +mean(:api00, :cname, bsrs) +``` diff --git a/docs/src/man/dataframes.md b/docs/src/man/dataframes.md new file mode 100644 index 00000000..df310019 --- /dev/null +++ b/docs/src/man/dataframes.md @@ -0,0 +1,74 @@ +# [DataFrames in Survey](@id manual) + +The internal structure of a survey design is built upon +[`DataFrames`](https://dataframes.juliadata.org/stable/).
In fact, the `data` +argument is the only required argument for the constructor and it must be an +[`AbstractDataFrame`](https://dataframes.juliadata.org/stable/lib/types/#DataFrames.AbstractDataFrame). + +## Data manipulation + +The provided `DataFrame` is altered by the [`SurveyDesign`](@ref) constructor +in order to add columns for frequency and probability weights, sample and +population sizes and, if necessary, strata and cluster information. + +Notice the change in `apisrs`: + +```@setup manual_DataFrames +using Survey +``` + +```@repl manual_DataFrames +apisrs = load_data("apisrs") +names(apisrs) +srs = SurveyDesign(apisrs; weights=:pw); +apisrs +names(apisrs) +``` + +Five columns were added: + +- `false_strata` - only in the case of no stratification + + This column is necessary because when making a [`ReplicateDesign`](@ref), the + [`bootweights`](@ref) function uses [`groupby`](https://dataframes.juliadata.org/stable/lib/functions/#DataFrames.groupby) + with a column representing the stratification variable. If there are no strata, + there is no such column so it should be added in order to keep `bootweights` + general. + +- `false_cluster` - only in the case of no clustering + + The reasoning is the same as in the case of no stratification. + +- `_sampsize` - sample sizes + +- `_popsize` - population sizes + + These match the stratification variable: + +```@repl manual_DataFrames +apistrat = load_data("apistrat"); +strat = SurveyDesign(apistrat; strata=:stype, weights=:pw); +apistrat[:, [:stype, :_sampsize, :_popsize]] +``` + +- `_allprobs` - probability weights + +No column was added for frequency weights because the column passed through the +`weights` argument is used by other functions, hence there is no need to add a +new column. If `weights` is not specified, then a column called `_weights` is +added. + +## Why DataFrames + +Survey data most of the time, if not always, is structured in a way that is very +well suited for data frames. 
The [`DataFrames.jl`](https://dataframes.juliadata.org/stable/) +package is mature and well maintained and provides a lot of functionality that +proves useful for using inside functions such as [`bootweights`](@ref) or +[`mean`](@ref). Mainly, the functions used are +[`groupby`](https://dataframes.juliadata.org/stable/lib/functions/#DataFrames.groupby) +and [`combine`](https://dataframes.juliadata.org/stable/lib/functions/#DataFrames.combine). + +Now that support for [metadata](https://dataframes.juliadata.org/stable/lib/metadata/) +was introduced in `DataFrames.jl`, it becomes possible to use metadata in +`Survey.jl` to reduce space complexity. For example, stratification and clustering +information could be stored as metadata of the `DataFrame` passed through `data`. diff --git a/docs/src/man/future.md b/docs/src/man/future.md new file mode 100644 index 00000000..99e3ce5a --- /dev/null +++ b/docs/src/man/future.md @@ -0,0 +1 @@ +# Future plans diff --git a/docs/src/man/plotting.md b/docs/src/man/plotting.md new file mode 100644 index 00000000..2fecc07f --- /dev/null +++ b/docs/src/man/plotting.md @@ -0,0 +1,20 @@ +# Plotting + +`Survey` uses [`AlgebraOfGraphics`](https://aog.makie.org/stable/) for plotting. +All plotting functions support a variable number of keyword arguments (through +`kwargs...`) that are passed internally to corresponding `AlgebraOfGraphics` +functions. See the source code for details: +[`plot`](https://github.com/xKDR/Survey.jl/blob/main/src/plot.jl), +[`hist`](https://github.com/xKDR/Survey.jl/blob/main/src/hist.jl), +[`boxplot`](https://github.com/xKDR/Survey.jl/blob/main/src/boxplot.jl). +This means that all functionality provided by `AlgebraOfGraphics` is supported +in `Survey`. + +Specific functionality might need to be imported from `AlgebraOfGraphics`. 
+Moreover, in order to choose the preferred +[`Makie backend`](https://docs.makie.org/stable/#makie_ecosystem) you must +explicitly use it: + +```@repl +using AlgebraOfGraphics, CairoMakie +``` diff --git a/docs/src/man/replicate.md b/docs/src/man/replicate.md new file mode 100644 index 00000000..dbfd5089 --- /dev/null +++ b/docs/src/man/replicate.md @@ -0,0 +1 @@ +# ReplicateDesign diff --git a/docs/src/manual.md b/docs/src/manual.md deleted file mode 100644 index 1adcc8c2..00000000 --- a/docs/src/manual.md +++ /dev/null @@ -1,199 +0,0 @@ -# Manual - -## DataFrames in Survey - -The internal structure of a survey design is build upon -[`DataFrames`](https://dataframes.juliadata.org/stable/). In fact, the `data` -argument is the only required argument for the constructor and it must be an -[`AbstractDataFrame`](https://dataframes.juliadata.org/stable/lib/types/#DataFrames.AbstractDataFrame). - -### Data manipulation - -The provided `DataFrame` is altered by the [`SurveyDesign`](@ref) constructor -in order to add columns for frequency and probability weights, sample and -population sizes and, if necessary, strata and cluster information. - -Notice the change in `apisrs`: - -```@setup manual_DataFrames -using Survey -``` - -```@repl manual_DataFrames -apisrs = load_data("apisrs") -names(apisrs) -srs = SurveyDesign(apisrs; weights=:pw); -apisrs -names(apisrs) -``` - -Five columns were added: - -- `false_strata` - only in the case of no stratification - - This column is necessary because when making a [`ReplicateDesign`](@ref), the - [`bootweights`](@ref) function uses [`groupby`](https://dataframes.juliadata.org/stable/lib/functions/#DataFrames.groupby) - with a column representing the stratification variable. If there are no strata, - there is no such column so it should be added in order to keep `bootweights` - general. - -- `false_cluster` - only in the case of no clustering - - The reasoning is the same as in the case of no stratification. 
- -- `_sampsize` - sample sizes - -- `_popsize` - population sizes - - These match the stratification variable: - -```@repl manual_DataFrames -apistrat = load_data("apistrat"); -strat = SurveyDesign(apistrat; strata=:stype, weights=:pw); -apistrat[:, [:stype, :_sampsize, :_popsize]] -``` - -- `_allprobs` - probability weights - -No column was added for frequency weights because the column passed through the -`weights` argument is used by other functions, hence there is no need to add a -new column. If `weights` is not specified, then a column called `_weights` is -added. - -### Why DataFrames - -Survey data most of the time, if not always, is structured in a way that is very -well suited for data frames. The [`DataFrames.jl`](https://dataframes.juliadata.org/stable/) -package is mature and well maintained and provides a lot of functionality that -proves useful for using inside functions such as [`bootweights`](@ref) or -[`mean`](@ref). Mainly, the functions used are -[`groupby`](https://dataframes.juliadata.org/stable/lib/functions/#DataFrames.groupby) -and [`combine`](https://dataframes.juliadata.org/stable/lib/functions/#DataFrames.combine). - -Now that support for [metadata](https://dataframes.juliadata.org/stable/lib/metadata/) -was introduced in `DataFrames.jl`, it becomes possible to use metadata in -`Survey.jl` to reduce space complexity. For example, stratification and clustering -information could be stored as metadata of the `DataFrame` passed through `data`. - -## ReplicateDesign - -## Plotting - -`Survey` uses [`AlgebraOfGraphics`](https://aog.makie.org/stable/) for plotting. -All plotting functions support a variable number of keyword arguments (through -`kwargs...`) that are passed internally to corresponding `AlgebraOfGraphics` -functions. 
See the source code for details: -[`plot`](https://github.com/xKDR/Survey.jl/blob/main/src/plot.jl), -[`hist`](https://github.com/xKDR/Survey.jl/blob/main/src/hist.jl), -[`boxplot`](https://github.com/xKDR/Survey.jl/blob/main/src/boxplot.jl). -This means that all functionality provided by `AlgebraOfGraphics` is supported -in `Survey`. - -Specific functionality might need to be imported from `AlgebraOfGraphics`. -Moreover, in order to choose the preferred -[`Makie backend`](https://docs.makie.org/stable/#makie_ecosystem) you must -explicitly use it: - -```@repl -using AlgebraOfGraphics, CairoMakie -``` - -## Comparison with other languages - -There are multiple languages that offer survey analysis tools, most notably -[SAS/STAT](https://support.sas.com/rnd/app/stat/procedures/SurveyAnalysis.html) -and [R](https://CRAN.R-project.org/package=survey). - -### R comparison - -The inspiration for `Survey.jl` comes from R. Hence the syntax is in most cases -very similar to the syntax in the [`survey` package](https://cran.r-project.org/web/packages/survey/survey.pdf) -from R. To showcase this we will use the `apisrs` dataset found in both R's -`survey` and `Survey.jl`. See the [Tutorial](@ref) section for more details about -the `api` datesets. - -All examples show the R code first, followed by the Julia code. 
- -#### Loading data - -```R -data(api) -# all `api` datasets are loaded globally -``` - -```julia -srs = load_data("apisrs") -# only one dataset is loaded and stored in a variable -``` - -#### Creating a design - -```R -srs = svydesign(id=~1, data=apisrs, weights=~pw) # simple random sample -strat = svydesign(id=~1, data=apistrat, strata=~stype, weights=~pw) # stratified -clus1 = svydesign(id=~dnum, data=apiclus1, weights=~pw) # clustered (one stage) -``` - -```julia -srs = SurveyDesign(apisrs; weights=:pw) # simple random sample -strat = SurveyDesign(apistrat; strata=:stype, weights=:pw) # stratified -clus1 = SurveyDesign(apiclus1; clusters=:dnum, weights=:pw) # clustered (one stage) -``` - -#### Creating a replicate design - -```R -bsrs = as.svrepdesign(srs, type="bootstrap") -``` - -```julia -bsrs = bootweights(srs) -``` - -#### Computing the estimated mean - -```R -svymean(~api00, bsrs) -svymean(~api99+~api00, bsrs) -``` - -```julia -mean(:api00, bsrs) -mean([:api99, :api00], bsrs) -``` - -#### Computing the estimated total - -```R -svytotal(~api00, bsrs) -svytotal(~api99+~api00, bsrs) -``` - -```julia -total(:api00, bsrs) -total([:api99, :api00], bsrs) -``` - -#### Computing quantiles - -```R -svyquantile(~api00, bsrs, 0.5) -svyquantile(~api00, bsrs, c(0.25, 0.5, 0.75)) -``` - -```julia -quantile(:api00, bsrs, 0.5) -quantile(:api00, bsrs, [0.25, 0.5, 0.75]) -``` - -#### Domain estimation - -```R -svyby(~api00, ~cname, bsrs, svymean) -``` - -```julia -mean(:api00, :cname, bsrs) -``` - -## Future plans From e28ab3ee2e1c55d015569b4c7ccb588fe0706836 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Fri, 27 Jan 2023 20:09:21 +0200 Subject: [PATCH 72/80] Remove explicit url --- docs/src/getting_started.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/getting_started.md b/docs/src/getting_started.md index de7a09bc..629f10a0 100644 --- a/docs/src/getting_started.md +++ b/docs/src/getting_started.md @@ -38,7 +38,7 @@ data. !!! 
note The API program has been discontinued at the end of 2018. Information is archived - at [https://www.cde.ca.gov/re/pr/api.asp](https://www.cde.ca.gov/re/pr/api.asp) + [here](https://www.cde.ca.gov/re/pr/api.asp). ```@repl tutorial apisrs = load_data("apisrs") From da66283f1a26731266e759913706788926c8fc16 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Fri, 27 Jan 2023 20:11:44 +0200 Subject: [PATCH 73/80] Rename Comparisons section --- docs/make.jl | 2 +- docs/src/man/comparisons.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/make.jl b/docs/make.jl index 8ef8a61b..9a67b5b0 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -22,7 +22,7 @@ makedocs(; "DataFrames in Survey" => "man/dataframes.md", "ReplicateDesign" => "man/replicate.md", "Plotting" => "man/plotting.md", - "Comparison with other languages" => "man/comparisons.md", + "Comparison with other survey analysis tools" => "man/comparisons.md", "Future plans" => "man/future.md", ], "API reference" => "api.md", diff --git a/docs/src/man/comparisons.md b/docs/src/man/comparisons.md index 232dd905..5a5328fd 100644 --- a/docs/src/man/comparisons.md +++ b/docs/src/man/comparisons.md @@ -1,6 +1,6 @@ -# Comparison with other languages +# Comparison with other survey analysis tools -There are multiple languages that offer survey analysis tools, most notably +There are multiple alternatives that offer survey analysis tools, most notably [SAS/STAT](https://support.sas.com/rnd/app/stat/procedures/SurveyAnalysis.html) and [R](https://CRAN.R-project.org/package=survey). 
From 0fbaf52761bab3075ff5f24c8f90ece2e1a11dc2 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru <84318573+iuliadmtru@users.noreply.github.com> Date: Sun, 29 Jan 2023 10:39:07 +0200 Subject: [PATCH 74/80] Update docs/src/getting_started.md Add replicates Co-authored-by: Ayush Patnaik --- docs/src/getting_started.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/getting_started.md b/docs/src/getting_started.md index 629f10a0..b9020b4f 100644 --- a/docs/src/getting_started.md +++ b/docs/src/getting_started.md @@ -74,7 +74,7 @@ need to convert the [`SurveyDesign`](@ref) to a [`ReplicateDesign`](@ref) using bootstrapping: ```@repl tutorial -bsrs = bootweights(srs) +bsrs = bootweights(srs; replicates = 1000) ``` We do this because [TODO: explain why]. Now we can compute the estimated mean: From 8237c3f8a823a26d1bc51818d958b76b0d0d6daa Mon Sep 17 00:00:00 2001 From: Iulia Dumitru <84318573+iuliadmtru@users.noreply.github.com> Date: Sun, 29 Jan 2023 10:43:14 +0200 Subject: [PATCH 75/80] Update docs/src/man/comparisons.md Add Stata reference --- docs/src/man/comparisons.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/src/man/comparisons.md b/docs/src/man/comparisons.md index 5a5328fd..f76fd264 100644 --- a/docs/src/man/comparisons.md +++ b/docs/src/man/comparisons.md @@ -1,7 +1,8 @@ # Comparison with other survey analysis tools There are multiple alternatives that offer survey analysis tools, most notably -[SAS/STAT](https://support.sas.com/rnd/app/stat/procedures/SurveyAnalysis.html) +[SAS](https://support.sas.com/rnd/app/stat/procedures/SurveyAnalysis.html), +[Stata](https://www.stata.com/features/survey-methods/) and [R](https://CRAN.R-project.org/package=survey). 
## R comparison From 2c8a11ecca32098049b6462e623015860438f5e3 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru <84318573+iuliadmtru@users.noreply.github.com> Date: Sun, 29 Jan 2023 10:43:44 +0200 Subject: [PATCH 76/80] Update docs/src/man/comparisons.md Change type to "subbootstrap" Co-authored-by: Ayush Patnaik --- docs/src/man/comparisons.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/man/comparisons.md b/docs/src/man/comparisons.md index f76fd264..fa9ffb54 100644 --- a/docs/src/man/comparisons.md +++ b/docs/src/man/comparisons.md @@ -44,7 +44,7 @@ clus1 = SurveyDesign(apiclus1; clusters=:dnum, weights=:pw) # clustered (one sta #### Creating a replicate design ```R -bsrs = as.svrepdesign(srs, type="bootstrap") +bsrs = as.svrepdesign(srs, type="subbootstrap") ``` ```julia From 1d4fccb7142362a60bd6989a89c21e7d3ea52087 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru <84318573+iuliadmtru@users.noreply.github.com> Date: Sun, 29 Jan 2023 10:55:07 +0200 Subject: [PATCH 77/80] Update docs/src/man/comparisons.md Co-authored-by: Ayush Patnaik --- docs/src/man/comparisons.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/man/comparisons.md b/docs/src/man/comparisons.md index fa9ffb54..ca0b7716 100644 --- a/docs/src/man/comparisons.md +++ b/docs/src/man/comparisons.md @@ -9,7 +9,7 @@ and [R](https://CRAN.R-project.org/package=survey). The inspiration for `Survey.jl` comes from R. Hence the syntax is in most cases very similar to the syntax in the [`survey` package](https://cran.r-project.org/web/packages/survey/survey.pdf) -from R. To showcase this we will use the `apisrs` dataset found in both R's +from R. To showcase this we will use the `api` datasets found in both R's `survey` and `Survey.jl`. See the [Tutorial](@ref) section for more details about the `api` datesets. 
From 8a5d3672d87a9a5beef10d97b8e7dbaeedb14558 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Sun, 29 Jan 2023 10:54:08 +0200 Subject: [PATCH 78/80] Add `mean` with `SurveyDesign` and rephrase for bootstrap --- docs/src/getting_started.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/src/getting_started.md b/docs/src/getting_started.md index b9020b4f..e9a6e063 100644 --- a/docs/src/getting_started.md +++ b/docs/src/getting_started.md @@ -69,17 +69,19 @@ basic usage of the package. For that, we will stick with a simple random sample. Now you can analyse your design according to your needs using the [functionality](@ref Index) provided by the package. For example, you can compute the estimated mean or population total for a given variable. Let's say we're -interested in the mean Academic Performance Index from the year 1999. First we -need to convert the [`SurveyDesign`](@ref) to a [`ReplicateDesign`](@ref) using -bootstrapping: +interested in the mean Academic Performance Index from the year 1999. If we are +only interested in the estimated mean, then we can directly pass our design to +the [`mean`](@ref) function: ```@repl tutorial -bsrs = bootweights(srs; replicates = 1000) +mean(:api99, srs) ``` -We do this because [TODO: explain why]. 
Now we can compute the estimated mean: +If we also want to know the standard error of the mean, we need to convert the +[`SurveyDesign`](@ref) to a [`ReplicateDesign`](@ref) using bootstrapping: ```@repl tutorial +bsrs = bootweights(srs; replicates = 1000) mean(:api99, bsrs) ``` From ec6e0a3faeaa0bab9897beecc0d8822c3d3e51fa Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Sun, 29 Jan 2023 10:57:38 +0200 Subject: [PATCH 79/80] Minor alignment change --- docs/src/man/comparisons.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/src/man/comparisons.md b/docs/src/man/comparisons.md index ca0b7716..c04829b8 100644 --- a/docs/src/man/comparisons.md +++ b/docs/src/man/comparisons.md @@ -2,8 +2,8 @@ There are multiple alternatives that offer survey analysis tools, most notably [SAS](https://support.sas.com/rnd/app/stat/procedures/SurveyAnalysis.html), -[Stata](https://www.stata.com/features/survey-methods/) -and [R](https://CRAN.R-project.org/package=survey). +[Stata](https://www.stata.com/features/survey-methods/) and +[R](https://CRAN.R-project.org/package=survey). ## R comparison From b9515abbf46c506b5fbfa6c750124421de90adc5 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Sun, 29 Jan 2023 13:17:52 +0200 Subject: [PATCH 80/80] Change "we" to "you" and minor rephrasing --- docs/src/getting_started.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/src/getting_started.md b/docs/src/getting_started.md index e9a6e063..7495e93b 100644 --- a/docs/src/getting_started.md +++ b/docs/src/getting_started.md @@ -68,16 +68,16 @@ basic usage of the package. For that, we will stick with a simple random sample. Now you can analyse your design according to your needs using the [functionality](@ref Index) provided by the package. For example, you can compute -the estimated mean or population total for a given variable. Let's say we're -interested in the mean Academic Performance Index from the year 1999. 
If we are -only interested in the estimated mean, then we can directly pass our design to -the [`mean`](@ref) function: +the estimated mean or population total for a given variable. Let's say you want +to find the mean Academic Performance Index from the year 1999. If you are only +interested in the estimated mean, then you can directly pass your design to the +[`mean`](@ref) function: ```@repl tutorial mean(:api99, srs) ``` -If we also want to know the standard error of the mean, we need to convert the +If you also want to know the standard error of the mean, you need to convert the [`SurveyDesign`](@ref) to a [`ReplicateDesign`](@ref) using bootstrapping: ```@repl tutorial @@ -85,8 +85,8 @@ bsrs = bootweights(srs; replicates = 1000) mean(:api99, bsrs) ``` -We can also find the mean of both the 1999 API and 2000 API for a clear -comparison between students' performance from one year to another: +You can find the mean of both the 1999 API and 2000 API for a clear comparison +between students' performance from one year to another: ```@repl tutorial mean([:api99, :api00], bsrs) @@ -99,8 +99,8 @@ the two APIs: ratio(:api00, :api99, bsrs) ``` -If we're interested in a certain statistic estimated by a specific domain, we -can add the domain as the second parameter to our function. Let's say we want +If you're interested in a certain statistic estimated by a specific domain, you +can add the domain as the second parameter to your function. Let's say you want to find the estimated total number of students enrolled in schools from each county: @@ -108,7 +108,7 @@ county: total(:enroll, :cname, bsrs) ``` -Another way to visualize data is through graphs. We can make a histogram to +Another way to visualize data is through graphs. You can make a histogram to better see the distribution of enrolled students: ```@setup warning