From 734f1bbb78862d0f04c9a4c09cb60cce52c648c2 Mon Sep 17 00:00:00 2001 From: smishr <43640926+smishr@users.noreply.github.com> Date: Tue, 10 Jan 2023 15:52:53 +0530 Subject: [PATCH 01/14] Update survey design, add tests, remove extra quantile --- src/SurveyDesign.jl | 39 +++++++++++++++++++++++++-------------- src/quantile.jl | 9 --------- src/show.jl | 8 ++++---- test/SurveyDesign.jl | 41 +++++++++++++++++++++++++++++++++++++++-- 4 files changed, 68 insertions(+), 29 deletions(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index 1541a3b9..7b68ea86 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -33,15 +33,14 @@ julia> apiclus1[!, :pw] = fill(757/15,(size(apiclus1,1),)); # Correct api mistak julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) SurveyDesign: -data: 183x46 DataFrame +data: 183x44 DataFrame cluster: dnum design.data[!,design.cluster]: 637, 637, 637, ..., 448 popsize: popsize design.data[!,design.popsize]: 9240.0, 9240.0, 9240.0, ..., 9240.0 sampsize: sampsize design.data[!,design.sampsize]: 15, 15, 15, ..., 15 -design.data[!,:probs]: 0.0198, 0.0198, 0.0198, ..., 0.0198 -design.data[!,:allprobs]: 0.0198, 0.0198, 0.0198, ..., 0.0198 +design.data[!,design.allprobs]: 0.0198, 0.0198, 0.0198, ..., 0.0198 ``` """ struct SurveyDesign <: AbstractSurveyDesign @@ -50,9 +49,15 @@ struct SurveyDesign <: AbstractSurveyDesign popsize::Symbol sampsize::Symbol strata::Symbol - pps::Bool + weights::Symbol # Effective weights in case of singlestage approx supported + allprobs::Symbol # Right now only singlestage approx supported + pps::Bool # TODO functionality # Single stage clusters sample, like apiclus1 - function SurveyDesign(data::AbstractDataFrame; strata::Union{Nothing,Symbol} = nothing, weights::Union{Nothing,Symbol}= nothing, clusters::Union{Nothing, Symbol, Vector{Symbol}} = nothing, popsize::Union{Nothing, Int,Symbol}=nothing) + function SurveyDesign(data::AbstractDataFrame; + strata::Union{Nothing,Symbol} = 
nothing, + weights::Union{Nothing,Symbol}= nothing, + clusters::Union{Nothing, Symbol, Vector{Symbol}} = nothing, + popsize::Union{Nothing, Int,Symbol}=nothing) # sampsize here is number of clusters completely sampled, popsize is total clusters in population if typeof(strata) <:Nothing data.false_strata = repeat(["FALSE_STRATA"], nrow(data)) @@ -73,20 +78,26 @@ struct SurveyDesign <: AbstractSurveyDesign sampsize_labels = :sampsize data[!, sampsize_labels] = fill(length(unique(data[!, cluster])),(nrow(data),)) if !(typeof(popsize) <: Nothing) - data[!, :weights] = data[!, popsize] ./ data[!, sampsize_labels] - elseif !(typeof(weights) <: Nothing) - data.weights = data[!, weights] + weights_labels = :weights + data[!, weights_labels] = data[!, popsize] ./ data[!, sampsize_labels] + elseif typeof(weights) <: Symbol + if !(typeof(data[!,weights]) <:Vector{<:Real}) + @show typeof(data[!,weights]) + error("weights column has to be numeric") + end + weights_labels = weights else - data.weights = repeat([1], nrow(data)) + weights_labels = :weights + data[!,weights_labels] = repeat([1], nrow(data)) end - data[!, :probs] = 1 ./ data[!, :weights] # Many formulae are easily defined in terms of sampling probabilties - data[!, :allprobs] = data[!, :probs] # In one-stage cluster sample, allprobs is just probs, no multiplication needed - pps = false + allprobs_labels = :allprobs + data[!, allprobs_labels] = 1 ./ data[!, weights_labels] # In one-stage cluster sample, allprobs is just probs, no multiplication needed + pps = false # for now no explicit pps support if !(typeof(popsize) <: Symbol) - data.popsize = repeat([sum(data.weights)], nrow(data)) + data.popsize = repeat([sum(data[!,weights_labels])], nrow(data)) popsize = :popsize end - new(data, cluster, popsize, sampsize_labels, strata, pps) + new(data, cluster, popsize, sampsize_labels, strata, weights_labels, allprobs_labels, pps) end end diff --git a/src/quantile.jl b/src/quantile.jl index 1cc9646c..d4e399a5 100644 --- 
a/src/quantile.jl +++ b/src/quantile.jl @@ -41,13 +41,4 @@ function quantile(var::Symbol, design::SurveyDesign, p::Union{<:Real,Vector{<:Re df = DataFrame(probability=p, quantile=Statistics.quantile(v, ProbabilityWeights(probs), p)) # TODO: Add CI and SE of the quantile return df -end - -function quantile(var::Symbol, design::SurveyDesign, p::Union{<:Real,Vector{<:Real}}; - alpha::Float64=0.05, ci::Bool=false, se::Bool=false, qrule="hf7",kwargs...) - v = design.data[!, var] - probs = design.data[!, :probs] - df = DataFrame(probability=p, quantile=Statistics.quantile(v, ProbabilityWeights(probs), p)) # Not sure which quantile defintion this returns - # TODO: Add CI and SE of the quantile - return df end \ No newline at end of file diff --git a/src/show.jl b/src/show.jl index 3319e653..bb37059c 100644 --- a/src/show.jl +++ b/src/show.jl @@ -46,8 +46,8 @@ function Base.show(io::IO, ::MIME"text/plain", design::SurveyDesign) printinfo(io, "design.data[!,design.popsize]", makeshort(design.data[!,design.popsize])) printinfo(io, "sampsize", string(design.sampsize); newline=true) printinfo(io, "design.data[!,design.sampsize]", makeshort(design.data[!,design.sampsize])) - printinfo(io, "design.data[!,:probs]", makeshort(design.data.probs)) - printinfo(io, "design.data[!,:allprobs]", makeshort(design.data.allprobs)) + # printinfo(io, "design.data[!,:probs]", makeshort(design.data.probs)) + printinfo(io, "design.data[!,design.allprobs]", makeshort(design.data.allprobs)) end "Print information about a repliocate design." 
@@ -62,8 +62,8 @@ function Base.show(io::IO, ::MIME"text/plain", design::ReplicateDesign) printinfo(io, "design.data[!,design.popsize]", makeshort(design.data[!,design.popsize])) printinfo(io, "sampsize", string(design.sampsize); newline=true) printinfo(io, "design.data[!,design.sampsize]", makeshort(design.data[!,design.sampsize])) - printinfo(io, "design.data[!,:probs]", makeshort(design.data.probs)) - printinfo(io, "design.data[!,:allprobs]", makeshort(design.data.allprobs)) + # printinfo(io, "design.data[!,:probs]", makeshort(design.data.probs)) + printinfo(io, "design.data[!,design.allprobs]", makeshort(design.data.allprobs)) printstyled(io, "replicates: "; bold=true) println(io, design.replicates) end \ No newline at end of file diff --git a/test/SurveyDesign.jl b/test/SurveyDesign.jl index 6851f4e8..adc927e3 100644 --- a/test/SurveyDesign.jl +++ b/test/SurveyDesign.jl @@ -1,4 +1,35 @@ -@testset "SurveyDesign" begin +@testset "SurveyDesign_srs" begin + ##### Simple Random Sample tests + # Load API datasets + apisrs_original = load_data("apisrs") + apisrs_original[!, :derived_probs] = 1 ./ apisrs_original.pw + apisrs_original[!, :derived_sampsize] = fill(200.0, size(apisrs_original, 1)) + ############################## + ### Basic functionality + ### weights as Symbol + apisrs = copy(apisrs_original) + srs_weights = SurveyDesign(apisrs, weights=:pw) + @test srs_weights.data[!,srs_weights.weights][1] ≈ 30.97 atol = 1e-4 + @test srs_weights.data[!,srs_weights.weights] == 1 ./ srs_weights.data[!,srs_weights.allprobs] + ############################## + ### Weights as non-numeric error + apisrs = copy(apisrs_original) + @test_throws ErrorException SurveyDesign(apisrs, weights=:stype) +end + +@testset "SurveyDesign_strat" begin + ### StratifiedSample tests + # Load API datasets + apistrat_original = load_data("apistrat") + apistrat_original[!, :derived_probs] = 1 ./ apistrat_original.pw + apistrat_original[!, :derived_sampsize] = apistrat_original.fpc ./ 
apistrat_original.pw + + apistrat = copy(apistrat_original) + strat = SurveyDesign(apistrat, strata = :stype, weights = :pw) |> bootweights + +end + +@testset "SurveyDesign_multistage" begin # Load API datasets apiclus1_original = load_data("apiclus1") apiclus1_original[!, :pw] = fill(757/15,(size(apiclus1_original,1),)) # Correct api mistake for pw column @@ -9,4 +40,10 @@ @test dclus1.data[!, :weights] ≈ fill(50.4667,size(apiclus1,1)) atol = 1e-3 @test dclus1.data[!,dclus1.sampsize] ≈ fill(15,size(apiclus1,1)) @test dclus1.data[!,:allprobs] ≈ dclus1.data[!,:probs] atol = 1e-4 -end \ No newline at end of file + + ############################## + # Load API datasets + nhanes = load_data("nhanes") + nhanes_design = SurveyDesign(nhanes; clusters = :SDMVPSU, strata = :SDMVSTRA, weights = :WTMEC2YR) +end + From 547725ffb6d111ff0e9ce2de2ddc88bbe2f7b98b Mon Sep 17 00:00:00 2001 From: smishr <43640926+smishr@users.noreply.github.com> Date: Tue, 10 Jan 2023 16:07:28 +0530 Subject: [PATCH 02/14] julia formatter --- src/SurveyDesign.jl | 21 +++++++++++---------- test/SurveyDesign.jl | 6 ++++++ 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index 7b68ea86..69fe3b51 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -54,12 +54,13 @@ struct SurveyDesign <: AbstractSurveyDesign pps::Bool # TODO functionality # Single stage clusters sample, like apiclus1 function SurveyDesign(data::AbstractDataFrame; - strata::Union{Nothing,Symbol} = nothing, - weights::Union{Nothing,Symbol}= nothing, - clusters::Union{Nothing, Symbol, Vector{Symbol}} = nothing, - popsize::Union{Nothing, Int,Symbol}=nothing) + clusters::Union{Nothing,Symbol,Vector{Symbol}}=nothing, + strata::Union{Nothing,Symbol}=nothing, + popsize::Union{Nothing,Int,Symbol}=nothing, + weights::Union{Nothing,Symbol}=nothing + ) # sampsize here is number of clusters completely sampled, popsize is total clusters in population - if typeof(strata) <:Nothing + if 
typeof(strata) <: Nothing data.false_strata = repeat(["FALSE_STRATA"], nrow(data)) strata = :false_strata end @@ -76,25 +77,25 @@ struct SurveyDesign <: AbstractSurveyDesign end # For one-stage sample only one sampsize vector sampsize_labels = :sampsize - data[!, sampsize_labels] = fill(length(unique(data[!, cluster])),(nrow(data),)) + data[!, sampsize_labels] = fill(length(unique(data[!, cluster])), (nrow(data),)) if !(typeof(popsize) <: Nothing) weights_labels = :weights data[!, weights_labels] = data[!, popsize] ./ data[!, sampsize_labels] elseif typeof(weights) <: Symbol - if !(typeof(data[!,weights]) <:Vector{<:Real}) - @show typeof(data[!,weights]) + if !(typeof(data[!, weights]) <: Vector{<:Real}) + @show typeof(data[!, weights]) error("weights column has to be numeric") end weights_labels = weights else weights_labels = :weights - data[!,weights_labels] = repeat([1], nrow(data)) + data[!, weights_labels] = repeat([1], nrow(data)) end allprobs_labels = :allprobs data[!, allprobs_labels] = 1 ./ data[!, weights_labels] # In one-stage cluster sample, allprobs is just probs, no multiplication needed pps = false # for now no explicit pps support if !(typeof(popsize) <: Symbol) - data.popsize = repeat([sum(data[!,weights_labels])], nrow(data)) + data.popsize = repeat([sum(data[!, weights_labels])], nrow(data)) popsize = :popsize end new(data, cluster, popsize, sampsize_labels, strata, weights_labels, allprobs_labels, pps) diff --git a/test/SurveyDesign.jl b/test/SurveyDesign.jl index adc927e3..90989918 100644 --- a/test/SurveyDesign.jl +++ b/test/SurveyDesign.jl @@ -11,10 +11,16 @@ srs_weights = SurveyDesign(apisrs, weights=:pw) @test srs_weights.data[!,srs_weights.weights][1] ≈ 30.97 atol = 1e-4 @test srs_weights.data[!,srs_weights.weights] == 1 ./ srs_weights.data[!,srs_weights.allprobs] + ### popsize as Symbol + apisrs = copy(apisrs_original) + srs_pop = SurveyDesign(apisrs, popsize=:fpc) + @test srs_pop.data[!,srs_pop.weights][1] ≈ 30.97 atol = 1e-4 + @test 
srs_pop.data[!,srs_pop.weights] == 1 ./ srs_pop.data[!,srs_pop.allprobs] ############################## ### Weights as non-numeric error apisrs = copy(apisrs_original) @test_throws ErrorException SurveyDesign(apisrs, weights=:stype) + end @testset "SurveyDesign_strat" begin From b63acd1498ef386194c928a0225f72660bdad54a Mon Sep 17 00:00:00 2001 From: smishr <43640926+smishr@users.noreply.github.com> Date: Tue, 10 Jan 2023 18:05:03 +0530 Subject: [PATCH 03/14] Add tests for stratified sampling SurveyDesign --- src/SurveyDesign.jl | 3 +-- test/SurveyDesign.jl | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index 69fe3b51..0b53fb57 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -83,7 +83,6 @@ struct SurveyDesign <: AbstractSurveyDesign data[!, weights_labels] = data[!, popsize] ./ data[!, sampsize_labels] elseif typeof(weights) <: Symbol if !(typeof(data[!, weights]) <: Vector{<:Real}) - @show typeof(data[!, weights]) error("weights column has to be numeric") end weights_labels = weights @@ -95,8 +94,8 @@ struct SurveyDesign <: AbstractSurveyDesign data[!, allprobs_labels] = 1 ./ data[!, weights_labels] # In one-stage cluster sample, allprobs is just probs, no multiplication needed pps = false # for now no explicit pps support if !(typeof(popsize) <: Symbol) - data.popsize = repeat([sum(data[!, weights_labels])], nrow(data)) popsize = :popsize + data[!,popsize] = repeat([sum(data[!, weights_labels])], nrow(data)) end new(data, cluster, popsize, sampsize_labels, strata, weights_labels, allprobs_labels, pps) end diff --git a/test/SurveyDesign.jl b/test/SurveyDesign.jl index 90989918..01d5baf5 100644 --- a/test/SurveyDesign.jl +++ b/test/SurveyDesign.jl @@ -16,11 +16,12 @@ srs_pop = SurveyDesign(apisrs, popsize=:fpc) @test srs_pop.data[!,srs_pop.weights][1] ≈ 30.97 atol = 1e-4 @test srs_pop.data[!,srs_pop.weights] == 1 ./ srs_pop.data[!,srs_pop.allprobs] + ### Both ways should 
achieve same weights and allprobs! + @test srs_pop.data[!,srs_pop.weights] == srs_weights.data[!,srs_weights.weights] ############################## ### Weights as non-numeric error apisrs = copy(apisrs_original) @test_throws ErrorException SurveyDesign(apisrs, weights=:stype) - end @testset "SurveyDesign_strat" begin @@ -29,10 +30,17 @@ end apistrat_original = load_data("apistrat") apistrat_original[!, :derived_probs] = 1 ./ apistrat_original.pw apistrat_original[!, :derived_sampsize] = apistrat_original.fpc ./ apistrat_original.pw - + ############################## + ### weights as Symbol apistrat = copy(apistrat_original) - strat = SurveyDesign(apistrat, strata = :stype, weights = :pw) |> bootweights - + strat_wt = SurveyDesign(apistrat, strata=:stype, weights=:pw) + @test strat_wt.data[!,strat_wt.weights] == 1 ./ strat_wt.data[!,strat_wt.allprobs] + ### popsize as Symbol + apistrat = copy(apistrat_original) + strat_pop = SurveyDesign(apistrat, strata=:stype, popsize=:fpc) + @test strat_pop.data[!,strat_pop.weights] == 1 ./ strat_pop.data[!,strat_pop.allprobs] + ############################## + # @test strat_pop.data[!,strat_pop.weights] == strat_wt.data[!,strat_wt.weights] end @testset "SurveyDesign_multistage" begin From e371402ff4b3c95f275e511f29c596095d56db0c Mon Sep 17 00:00:00 2001 From: smishr <43640926+smishr@users.noreply.github.com> Date: Wed, 11 Jan 2023 13:23:37 +0530 Subject: [PATCH 04/14] nhanes and yrbs testing --- src/SurveyDesign.jl | 4 ++-- test/SurveyDesign.jl | 13 +++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index b1d26204..81e99767 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -55,7 +55,7 @@ struct SurveyDesign <: AbstractSurveyDesign function SurveyDesign(data::AbstractDataFrame; clusters::Union{Nothing,Symbol,Vector{Symbol}}=nothing, strata::Union{Nothing,Symbol}=nothing, - popsize::Union{Nothing,Int,Symbol}=nothing, + 
popsize::Union{Nothing,Symbol}=nothing, weights::Union{Nothing,Symbol}=nothing ) # sampsize here is number of clusters completely sampled, popsize is total clusters in population @@ -82,7 +82,7 @@ struct SurveyDesign <: AbstractSurveyDesign data[!, weights_labels] = data[!, popsize] ./ data[!, sampsize_labels] elseif typeof(weights) <: Symbol if !(typeof(data[!, weights]) <: Vector{<:Real}) - error("weights column has to be numeric") + error(string("given weights column ", weights , " is not of numeric type")) end weights_labels = weights else diff --git a/test/SurveyDesign.jl b/test/SurveyDesign.jl index 01d5baf5..be515926 100644 --- a/test/SurveyDesign.jl +++ b/test/SurveyDesign.jl @@ -61,3 +61,16 @@ end nhanes_design = SurveyDesign(nhanes; clusters = :SDMVPSU, strata = :SDMVSTRA, weights = :WTMEC2YR) end +@testset "SurveyDesign_realSurveys" begin + # Load API datasets + yrbs_original = load_data("yrbs") + nhanes_original = load_data("nhanes") + ############################## + # NHANES + nhanes = copy(nhanes_original) + dnhanes = SingleStageSurveyDesign(nhanes; cluster = :SDMVPSU, strata=:SDMVSTRA, weights=:WTMEC2YR) + ############################## + # YRBS + yrbs = copy(yrbs_original) + dyrbs = SingleStageSurveyDesign(yrbs; cluster = :psu, strata=:stratum, weights=:weight) +end From bd59a671dcf2154e7378cf0bf5bc37db33dd8ef0 Mon Sep 17 00:00:00 2001 From: smishr Date: Fri, 13 Jan 2023 20:28:09 +0530 Subject: [PATCH 05/14] add ht.jl to gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index e4441b23..494288f6 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ .gitignore .DS_Store *.json +src/ht.jl \ No newline at end of file From 3f4c07ff30d21f9dd204c803ac820c88484e5540 Mon Sep 17 00:00:00 2001 From: smishr Date: Sat, 14 Jan 2023 01:42:05 +0530 Subject: [PATCH 06/14] digits=4 in show, combine not popsize in weight, strat tests --- src/SurveyDesign.jl | 48 +++++++++++++++++++++++++-------------------- 
src/show.jl | 6 +++--- 2 files changed, 30 insertions(+), 24 deletions(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index 596d925a..c2dde058 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -25,21 +25,22 @@ individuals in one cluster are sampled. The clusters are considered disjoint and - `strata::Union{Nothing, Symbol}=nothing`: the stratification variable. - `clusters::Union{Nothing, Symbol, Vector{Symbol}}=nothing`: the clustering variable. - `weights::Union{Nothing, Symbol}=nothing`: the sampling weights. -- `popsize::Union{Nothing, Int, Symbol}=nothing`: the (expected) survey population size. +- `popsize::Union{Nothing, Symbol}=nothing`: the (expected) survey population size. ```jldoctest -julia> apistrat = load_data("apistrat"); +julia> apiclus1 = load_data("apiclus1"); -julia> strat = SurveyDesign(apistrat; strata=:stype, weights=:pw) +julia> dclus1 = SurveyDesign(apiclus1; clusters=:dnum, strata=:stype, weights=:pw) SurveyDesign: -data: 200×46 DataFrame +data: 183×43 DataFrame strata: stype - [E, E, E … H] -cluster: none -popsize: [6190.0, 6190.0, 6190.0 … 6190.0] -sampsize: [200, 200, 200 … 200] -weights: [44.2, 44.2, 44.2 … 15.1] -probs: [0.0226, 0.0226, 0.0226 … 0.0662] + [H, E, E … E] +cluster: dnum + [637, 637, 637 … 448] +popsize: [507.7049, 507.7049, 507.7049 … 507.7049] +sampsize: [15, 15, 15 … 15] +weights: [33.847, 33.847, 33.847 … 33.847] +allprobs: [0.0295, 0.0295, 0.0295 … 0.0295] ``` """ struct SurveyDesign <: AbstractSurveyDesign @@ -74,28 +75,33 @@ struct SurveyDesign <: AbstractSurveyDesign if typeof(clusters) <: Symbol cluster = clusters end - # For one-stage sample only one sampsize vector + # For single-stage approximation only one "effective" sampsize vector sampsize_labels = :sampsize - data[!, sampsize_labels] = fill(length(unique(data[!, cluster])), (nrow(data),)) - if !(typeof(popsize) <: Nothing) + if isa(strata,Symbol) && isnothing(clusters) # If stratified sample then sampsize is inside strata + data[!, 
sampsize_labels] = transform(groupby(data, strata), nrow => :counts).counts + else + data[!, sampsize_labels] = fill(length(unique(data[!, cluster])), (nrow(data),)) + end + if isa(popsize, Symbol) weights_labels = :weights data[!, weights_labels] = data[!, popsize] ./ data[!, sampsize_labels] - elseif typeof(weights) <: Symbol + elseif isa(weights, Symbol) if !(typeof(data[!, weights]) <: Vector{<:Real}) - error(string("given weights column ", weights , " is not of numeric type")) + throw(ArgumentError(string("given weights column ", weights , " is not of numeric type"))) + else + weights_labels = weights + # derive popsize from given `weights` + popsize = :popsize + data[!, popsize] = data[!, sampsize_labels] .* data[!, weights_labels] end - weights_labels = weights else + # neither popsize nor weights given weights_labels = :weights data[!, weights_labels] = repeat([1], nrow(data)) end allprobs_labels = :allprobs data[!, allprobs_labels] = 1 ./ data[!, weights_labels] # In one-stage cluster sample, allprobs is just probs, no multiplication needed - pps = false # for now no explicit pps support - if !(typeof(popsize) <: Symbol) - popsize = :popsize - data[!,popsize] = repeat([sum(data[!, weights_labels])], nrow(data)) - end + pps = false # for now no explicit pps supported faster functions, but they can be added new(data, cluster, popsize, sampsize_labels, strata, weights_labels, allprobs_labels, pps) end end diff --git a/src/show.jl b/src/show.jl index a72c3250..730d506b 100644 --- a/src/show.jl +++ b/src/show.jl @@ -5,7 +5,7 @@ Helper function that transforms a given `Number` or `Vector` into a short-form s """ function makeshort(x) if isa(x[1], Float64) - x = round.(x, sigdigits=3) + x = round.(x, digits=4) # Rounded to 4 digits after the decimal place end # print short vectors or single values as they are, compress otherwise if length(x) > 1 @@ -56,6 +56,6 @@ function surveyshow(io::IO, design::AbstractSurveyDesign) printinfo(io, "popsize", 
makeshort(design.data[!, design.popsize])) printinfo(io, "sampsize", makeshort(design.data[!, design.sampsize])) # weights and probs info - printinfo(io, "weights", makeshort(design.data[!, :weights])) - printinfo(io, "probs", makeshort(design.data[!, :probs]); newline=false) + printinfo(io, "weights", makeshort(design.data[!, design.weights])) + printinfo(io, "allprobs", makeshort(design.data[!, design.allprobs]); newline=false) end From c00d5937d298a90319de903e4eb8c8c9985435bd Mon Sep 17 00:00:00 2001 From: smishr Date: Sat, 14 Jan 2023 01:42:22 +0530 Subject: [PATCH 07/14] stratified tests --- test/SurveyDesign.jl | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/test/SurveyDesign.jl b/test/SurveyDesign.jl index be515926..baa07b45 100644 --- a/test/SurveyDesign.jl +++ b/test/SurveyDesign.jl @@ -21,7 +21,7 @@ ############################## ### Weights as non-numeric error apisrs = copy(apisrs_original) - @test_throws ErrorException SurveyDesign(apisrs, weights=:stype) + @test_throws ArgumentError SurveyDesign(apisrs, weights=:stype) end @testset "SurveyDesign_strat" begin @@ -34,13 +34,25 @@ end ### weights as Symbol apistrat = copy(apistrat_original) strat_wt = SurveyDesign(apistrat, strata=:stype, weights=:pw) + @test strat_wt.data[!,strat_wt.weights][1] ≈ 44.2100 atol = 1e-4 + @test strat_wt.data[!,strat_wt.weights][200] ≈ 15.1000 atol = 1e-4 @test strat_wt.data[!,strat_wt.weights] == 1 ./ strat_wt.data[!,strat_wt.allprobs] - ### popsize as Symbol + ### popsize as Symbol (should be same as above) apistrat = copy(apistrat_original) strat_pop = SurveyDesign(apistrat, strata=:stype, popsize=:fpc) + @test strat_pop.data[!,strat_pop.weights][1] ≈ 44.2100 atol = 1e-4 + @test strat_pop.data[!,strat_pop.weights][200] ≈ 15.1000 atol = 1e-4 @test strat_pop.data[!,strat_pop.weights] == 1 ./ strat_pop.data[!,strat_pop.allprobs] + ### popsize and weights as Symbol (should be same as above two) + apistrat = 
copy(apistrat_original) + dstrat = SurveyDesign(apistrat, strata=:stype, weights=:pw, popsize=:fpc) + @test dstrat.data[!,dstrat.weights][1] ≈ 44.2100 atol = 1e-4 + @test dstrat.data[!,dstrat.weights][200] ≈ 15.1000 atol = 1e-4 + @test dstrat.data[!,dstrat.weights] == 1 ./ dstrat.data[!,dstrat.allprobs] ############################## - # @test strat_pop.data[!,strat_pop.weights] == strat_wt.data[!,strat_wt.weights] + # Check all three ways get equivalent weights + @test strat_pop.data[!,strat_pop.weights] ≈ strat_wt.data[!,strat_wt.weights] rtol = 1e-4 + @test strat_wt.data[!,strat_wt.weights] ≈ strat_wt.data[!,strat_wt.weights] rtol = 1e-4 end @testset "SurveyDesign_multistage" begin @@ -51,10 +63,8 @@ end # one-stage cluster sample with popsize apiclus1 = copy(apiclus1_original) dclus1 = SurveyDesign(apiclus1; clusters = :dnum, popsize =:fpc) - @test dclus1.data[!, :weights] ≈ fill(50.4667,size(apiclus1,1)) atol = 1e-3 - @test dclus1.data[!,dclus1.sampsize] ≈ fill(15,size(apiclus1,1)) - @test dclus1.data[!,:allprobs] ≈ dclus1.data[!,:probs] atol = 1e-4 - + @test dclus1.data[!, dclus1.weights] ≈ fill(50.4667,size(apiclus1,1)) atol = 1e-3 + @test dclus1.data[!, dclus1.sampsize] ≈ fill(15,size(apiclus1,1)) ############################## # Load API datasets nhanes = load_data("nhanes") @@ -68,9 +78,9 @@ end ############################## # NHANES nhanes = copy(nhanes_original) - dnhanes = SingleStageSurveyDesign(nhanes; cluster = :SDMVPSU, strata=:SDMVSTRA, weights=:WTMEC2YR) + dnhanes = SurveyDesign(nhanes; clusters = :SDMVPSU, strata=:SDMVSTRA, weights=:WTMEC2YR) ############################## # YRBS yrbs = copy(yrbs_original) - dyrbs = SingleStageSurveyDesign(yrbs; cluster = :psu, strata=:stratum, weights=:weight) + dyrbs = SurveyDesign(yrbs; clusters = :psu, strata=:stratum, weights=:weight) end From ece7181e9417c595c06639f585a0df685282b550 Mon Sep 17 00:00:00 2001 From: smishr Date: Sun, 15 Jan 2023 11:44:04 +0530 Subject: [PATCH 08/14] Add apiclus2 and sampsize 
testing --- src/SurveyDesign.jl | 2 +- test/SurveyDesign.jl | 67 ++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 63 insertions(+), 6 deletions(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index c2dde058..ff67f682 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -89,8 +89,8 @@ struct SurveyDesign <: AbstractSurveyDesign if !(typeof(data[!, weights]) <: Vector{<:Real}) throw(ArgumentError(string("given weights column ", weights , " is not of numeric type"))) else - weights_labels = weights # derive popsize from given `weights` + weights_labels = weights popsize = :popsize data[!, popsize] = data[!, sampsize_labels] .* data[!, weights_labels] end diff --git a/test/SurveyDesign.jl b/test/SurveyDesign.jl index baa07b45..de74f2f4 100644 --- a/test/SurveyDesign.jl +++ b/test/SurveyDesign.jl @@ -11,11 +11,15 @@ srs_weights = SurveyDesign(apisrs, weights=:pw) @test srs_weights.data[!,srs_weights.weights][1] ≈ 30.97 atol = 1e-4 @test srs_weights.data[!,srs_weights.weights] == 1 ./ srs_weights.data[!,srs_weights.allprobs] + @test srs_weights.data[!,srs_weights.allprobs] ≈ srs_weights.data[!, :derived_probs] atol = 1e-4 + @test srs_weights.data[!,srs_weights.sampsize] ≈ srs_weights.data[!, :derived_sampsize] atol = 1e-4 ### popsize as Symbol apisrs = copy(apisrs_original) srs_pop = SurveyDesign(apisrs, popsize=:fpc) @test srs_pop.data[!,srs_pop.weights][1] ≈ 30.97 atol = 1e-4 @test srs_pop.data[!,srs_pop.weights] == 1 ./ srs_pop.data[!,srs_pop.allprobs] + @test srs_pop.data[!,srs_pop.allprobs] ≈ srs_pop.data[!, :derived_probs] atol = 1e-4 + @test srs_pop.data[!,srs_pop.sampsize] ≈ srs_pop.data[!, :derived_sampsize] atol = 1e-4 ### Both ways should achieve same weights and allprobs! 
@test srs_pop.data[!,srs_pop.weights] == srs_weights.data[!,srs_weights.weights] ############################## @@ -37,38 +41,91 @@ end @test strat_wt.data[!,strat_wt.weights][1] ≈ 44.2100 atol = 1e-4 @test strat_wt.data[!,strat_wt.weights][200] ≈ 15.1000 atol = 1e-4 @test strat_wt.data[!,strat_wt.weights] == 1 ./ strat_wt.data[!,strat_wt.allprobs] - ### popsize as Symbol (should be same as above) + @test strat_wt.data[!,strat_wt.allprobs] ≈ strat_wt.data[!, :derived_probs] atol = 1e-4 + @test strat_wt.data[!,strat_wt.sampsize] ≈ strat_wt.data[!, :derived_sampsize] atol = 1e-4 + ### popsize as Symbol (should be same as above (for now)) apistrat = copy(apistrat_original) strat_pop = SurveyDesign(apistrat, strata=:stype, popsize=:fpc) @test strat_pop.data[!,strat_pop.weights][1] ≈ 44.2100 atol = 1e-4 @test strat_pop.data[!,strat_pop.weights][200] ≈ 15.1000 atol = 1e-4 @test strat_pop.data[!,strat_pop.weights] == 1 ./ strat_pop.data[!,strat_pop.allprobs] + @test strat_pop.data[!,strat_pop.allprobs] ≈ strat_pop.data[!, :derived_probs] atol = 1e-4 + @test strat_pop.data[!,strat_pop.sampsize] ≈ strat_pop.data[!, :derived_sampsize] atol = 1e-4 ### popsize and weights as Symbol (should be same as above two) apistrat = copy(apistrat_original) dstrat = SurveyDesign(apistrat, strata=:stype, weights=:pw, popsize=:fpc) @test dstrat.data[!,dstrat.weights][1] ≈ 44.2100 atol = 1e-4 @test dstrat.data[!,dstrat.weights][200] ≈ 15.1000 atol = 1e-4 @test dstrat.data[!,dstrat.weights] == 1 ./ dstrat.data[!,dstrat.allprobs] + @test dstrat.data[!,dstrat.allprobs] ≈ dstrat.data[!, :derived_probs] atol = 1e-4 + @test dstrat.data[!,dstrat.sampsize] ≈ dstrat.data[!, :derived_sampsize] atol = 1e-4 ############################## # Check all three ways get equivalent weights @test strat_pop.data[!,strat_pop.weights] ≈ strat_wt.data[!,strat_wt.weights] rtol = 1e-4 @test strat_wt.data[!,strat_wt.weights] ≈ strat_wt.data[!,strat_wt.weights] rtol = 1e-4 end -@testset "SurveyDesign_multistage" begin 
+@testset "SurveyDesign_apiclus1" begin # Load API datasets apiclus1_original = load_data("apiclus1") apiclus1_original[!, :pw] = fill(757/15,(size(apiclus1_original,1),)) # Correct api mistake for pw column + apiclus1_original[!, :derived_probs] = 1 ./ apiclus1_original.pw ############################## # one-stage cluster sample with popsize apiclus1 = copy(apiclus1_original) dclus1 = SurveyDesign(apiclus1; clusters = :dnum, popsize =:fpc) @test dclus1.data[!, dclus1.weights] ≈ fill(50.4667,size(apiclus1,1)) atol = 1e-3 @test dclus1.data[!, dclus1.sampsize] ≈ fill(15,size(apiclus1,1)) - ############################## + @test dclus1.data[!, dclus1.allprobs] ≈ dclus1.data[!, :derived_probs] atol = 1e-4 +end + +@testset "SurveyDesign_apiclus2" begin # Load API datasets - nhanes = load_data("nhanes") - nhanes_design = SurveyDesign(nhanes; clusters = :SDMVPSU, strata = :SDMVSTRA, weights = :WTMEC2YR) + apiclus2_original = load_data("apiclus2") + apiclus2_original[!, :derived_probs] = 1 ./ apiclus2_original.pw + ############################## + calculated_probs_R = [0.052840159, 0.052840159, 0.052840159, 0.052840159, 0.052840159, + 0.052840159, 0.052840159, 0.052840159, 0.052840159, 0.052840159, + 0.052840159, 0.052840159, 0.052840159, 0.052840159, 0.052840159, + 0.052840159, 0.052840159, 0.052840159, 0.052840159, 0.052840159, + 0.052840159, 0.024018254, 0.024018254, 0.024018254, 0.024018254, + 0.024018254, 0.052840159, 0.052840159, 0.052840159, 0.052840159, + 0.052840159, 0.052840159, 0.052840159, 0.052840159, 0.052840159, + 0.052840159, 0.052840159, 0.052840159, 0.052840159, 0.052840159, + 0.052840159, 0.052840159, 0.052840159, 0.052840159, 0.052840159, + 0.052840159, 0.052840159, 0.052840159, 0.052840159, 0.052840159, + 0.052840159, 0.052840159, 0.052840159, 0.052840159, 0.052840159, + 0.052840159, 0.052840159, 0.052840159, 0.052840159, 0.052840159, + 0.052840159, 0.052840159, 0.052840159, 0.052840159, 0.052840159, + 0.052840159, 0.052840159, 0.007338911, 
0.007338911, 0.007338911, 0.007338911, + 0.007338911, 0.007338911, 0.052840159, 0.009435743, 0.009435743, + 0.009435743, 0.009435743, 0.009435743, 0.037742970, 0.037742970, + 0.037742970, 0.037742970, 0.037742970, 0.003669455, 0.003669455, + 0.003669455, 0.003669455, 0.003669455, 0.018871485, 0.018871485, + 0.018871485, 0.018871485, 0.018871485, 0.037742970, 0.037742970, + 0.037742970, 0.037742970, 0.037742970, 0.052840159, 0.052840159, + 0.052840159, 0.052840159, 0.052840159, 0.052840159, 0.052840159, + 0.052840159, 0.052840159, 0.052840159, 0.052840159, 0.052840159, + 0.052840159, 0.052840159, 0.029355644, 0.029355644, 0.029355644, + 0.029355644, 0.029355644, 0.052840159, 0.052840159, 0.052840159, + 0.044033465, 0.044033465, 0.044033465, 0.044033465, 0.044033465, + 0.052840159] + + # two stage cluster sampling `with replacement' + apiclus2 = copy(apiclus2_original) + dclus2 = SurveyDesign(apiclus2; clusters = [:dnum,:snum], weights=:pw) # can't pass popsize as Vector + @test dclus2.data[!,dclus2.weights][1] ≈ 1 / calculated_probs_R[1] atol = 1e-4 + @test dclus2.data[!,dclus2.weights][25] ≈ 1 / calculated_probs_R[25] atol = 1e-4 + @test dclus2.data[!,dclus2.weights][121] ≈ 1 / calculated_probs_R[121] atol = 1e-4 + @test dclus2.data[!,dclus2.weights][125] ≈ 1 / calculated_probs_R[125] atol = 1e-4 + + # TODO: sampsize and popsize testing + ## NOT THE SAME AS R object right now + + ######################### + ## Complete multistage sampling (when implemented) should look like + ## weights should theoretically be optional if both clusters and popsize given + # dclus2_complete = SurveyDesign(apiclus2; clusters = [:dnum,:snum], popsize=[:fpc1,:fpc2], {weights=:pw}) end @testset "SurveyDesign_realSurveys" begin From 266fad0c885514ebc1d10d080665daba08cada86 Mon Sep 17 00:00:00 2001 From: smishr Date: Sun, 15 Jan 2023 12:30:33 +0530 Subject: [PATCH 09/14] bootstrap change :weights to design.weights --- src/bootstrap.jl | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/src/bootstrap.jl b/src/bootstrap.jl index 83defc97..a5e9f019 100644 --- a/src/bootstrap.jl +++ b/src/bootstrap.jl @@ -4,18 +4,18 @@ julia> using Random julia> apiclus1 = load_data("apiclus1"); -julia> clus_one_stage = SurveyDesign(apiclus1; clusters = :dnum); +julia> clus_one_stage = SurveyDesign(apiclus1; clusters = :dnum, popsize=:fpc); julia> bootweights(clus_one_stage; replicates=1000, rng=MersenneTwister(111)) # choose a seed for deterministic results ReplicateDesign: -data: 183×1046 DataFrame +data: 183×1044 DataFrame strata: none cluster: dnum [637, 637, 637 … 448] -popsize: [183, 183, 183 … 183] +popsize: [757, 757, 757 … 757] sampsize: [15, 15, 15 … 15] -weights: [1, 1, 1 … 1] -probs: [1.0, 1.0, 1.0 … 1.0] +weights: [50.4667, 50.4667, 50.4667 … 50.4667] +allprobs: [0.0198, 0.0198, 0.0198 … 0.0198] replicates: 1000 ``` """ @@ -34,7 +34,7 @@ function bootweights(design::SurveyDesign; replicates=4000, rng=MersenneTwister( rh = [(count(==(i), randinds)) for i in 1:nh] # main bootstrap algo. 
gdf = groupby(substrata, design.cluster) for i in 1:nh - gdf[i].whij = repeat([rh[i]], nrow(gdf[i])) .* gdf[i].weights .* (nh / (nh - 1)) + gdf[i].whij = repeat([rh[i]], nrow(gdf[i])) .* gdf[i][!,design.weights] .* (nh / (nh - 1)) end stratified[h].whij = transform(gdf).whij @@ -47,5 +47,5 @@ function bootweights(design::SurveyDesign; replicates=4000, rng=MersenneTwister( for i in 2:(replicates) df[!, "replicate_" * string(i)] = disallowmissing(replicate(stratified, H).whij) end - return ReplicateDesign(df, design.cluster, design.popsize, design.sampsize, design.strata, design.pps, replicates) + return ReplicateDesign(df, design.cluster, design.popsize, design.sampsize, design.strata, design.weights, design.allprobs, design.pps, replicates) end From d59b9a48131b86a66ee7c0c79d28a8a79e6512de Mon Sep 17 00:00:00 2001 From: smishr Date: Sun, 15 Jan 2023 12:31:33 +0530 Subject: [PATCH 10/14] Change :weights to design.weights --- src/by.jl | 2 +- src/hist.jl | 2 +- src/mean.jl | 6 +++--- src/plot.jl | 2 +- src/quantile.jl | 2 +- src/total.jl | 2 +- test/plot.jl | 2 +- 7 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/by.jl b/src/by.jl index a4de2f55..cea2187d 100644 --- a/src/by.jl +++ b/src/by.jl @@ -1,7 +1,7 @@ function bydomain(x::Symbol, domain::Symbol, design::ReplicateDesign, func::Function) gdf = groupby(design.data, domain) nd = length(unique(design.data[!, domain])) - X = combine(gdf, [x, :weights] => ((a, b) -> func(a, weights(b))) => :statistic) + X = combine(gdf, [x, design.weights] => ((a, b) -> func(a, weights(b))) => :statistic) Xt_mat = Array{Float64, 2}(undef, (nd, design.replicates)) for i in 1:design.replicates Xt_mat[:, i] = combine(gdf, [x, Symbol("replicate_"*string(i))] => ((a, c) -> func(a, weights(c))) => :statistic).statistic diff --git a/src/hist.jl b/src/hist.jl index 17b54098..40935a1e 100644 --- a/src/hist.jl +++ b/src/hist.jl @@ -74,7 +74,7 @@ function hist(design::AbstractSurveyDesign, var::Symbol, kwargs... 
) hist = histogram(bins = bins, normalization = normalization, kwargs...) - data(design.data) * mapping(var, weights = :weights) * hist |> draw + data(design.data) * mapping(var, weights = design.weights) * hist |> draw end function hist(design::AbstractSurveyDesign, var::Symbol, diff --git a/src/mean.jl b/src/mean.jl index c1d80259..593b1d79 100644 --- a/src/mean.jl +++ b/src/mean.jl @@ -10,8 +10,8 @@ julia> clus_one_stage = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) julia> mean(:api00, clus_one_stage) 1×2 DataFrame - Row │ mean SE - │ Float64 Float64 + Row │ mean SE + │ Float64 Float64 ─────┼────────────────── 1 │ 644.169 23.2919 @@ -25,7 +25,7 @@ julia> mean([:api00, :enroll], clus_one_stage) ``` """ function mean(x::Symbol, design::ReplicateDesign) - X = mean(design.data[!, x], weights(design.data.weights)) + X = mean(design.data[!, x], weights(design.data[!,design.weights])) Xt = [mean(design.data[!, x], weights(design.data[! , "replicate_"*string(i)])) for i in 1:design.replicates] variance = sum((Xt .- X).^2) / design.replicates DataFrame(mean = X, SE = sqrt(variance)) diff --git a/src/plot.jl b/src/plot.jl index 7dd4f555..79f1b97d 100644 --- a/src/plot.jl +++ b/src/plot.jl @@ -16,5 +16,5 @@ save("scatter.png", s); nothing # hide ![](assets/scatter.png) """ function plot(design::AbstractSurveyDesign, x::Symbol, y::Symbol; kwargs...) - data(design.data) * mapping(x, y, markersize = :weights) * visual(Scatter, marker = '○') |> draw + data(design.data) * mapping(x, y, markersize = design.weights) * visual(Scatter, marker = '○') |> draw end diff --git a/src/quantile.jl b/src/quantile.jl index d4e399a5..09ba9326 100644 --- a/src/quantile.jl +++ b/src/quantile.jl @@ -37,7 +37,7 @@ julia> quantile(:enroll,srs,[0.1,0.2,0.5,0.75,0.95]) function quantile(var::Symbol, design::SurveyDesign, p::Union{<:Real,Vector{<:Real}}; alpha::Float64=0.05, ci::Bool=false, se::Bool=false, qrule="hf7",kwargs...) 
v = design.data[!, var] - probs = design.data[!, :probs] + probs = design.data[!, design.allprobs] df = DataFrame(probability=p, quantile=Statistics.quantile(v, ProbabilityWeights(probs), p)) # TODO: Add CI and SE of the quantile return df diff --git a/src/total.jl b/src/total.jl index 0c5001e5..1b200797 100644 --- a/src/total.jl +++ b/src/total.jl @@ -25,7 +25,7 @@ julia> total([:api00, :enroll], clus_one_stage) ``` """ function total(x::Symbol, design::ReplicateDesign) - X = wsum(design.data[!, x], weights(design.data.weights)) + X = wsum(design.data[!, x], weights(design.data[!,design.weights])) Xt = [wsum(design.data[!, x], weights(design.data[! , "replicate_"*string(i)])) for i in 1:design.replicates] variance = sum((Xt .- X).^2) / design.replicates DataFrame(total = X, SE = sqrt(variance)) diff --git a/test/plot.jl b/test/plot.jl index c2476f65..7e31fc74 100644 --- a/test/plot.jl +++ b/test/plot.jl @@ -3,7 +3,7 @@ apisrs = load_data("apisrs") srs = SurveyDesign(apisrs, weights=:pw) s = plot(srs, :api99, :api00) - @test s.grid[1].entries[1].named[:markersize] == srs.data.weights + @test s.grid[1].entries[1].named[:markersize] == srs.data[!,srs.weights] @test s.grid[1].entries[1].positional[1] == srs.data.api99 @test s.grid[1].entries[1].positional[2] == srs.data.api00 # StratifiedSample From 7c4706cce57214552d1796296d258bd104a04546 Mon Sep 17 00:00:00 2001 From: smishr Date: Sun, 15 Jan 2023 12:32:15 +0530 Subject: [PATCH 11/14] Update ReplicateDesign struct and doctest --- src/SurveyDesign.jl | 12 +++++++----- test/SurveyDesign.jl | 4 ++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index ff67f682..a08cd1d9 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -118,14 +118,14 @@ julia> strat = SurveyDesign(apistrat; strata=:stype, weights=:pw); julia> bootstrat = bootweights(strat; replicates=1000) ReplicateDesign: -data: 200×1046 DataFrame +data: 200×1044 DataFrame strata: stype [E, E, E … 
H] cluster: none -popsize: [6190.0, 6190.0, 6190.0 … 6190.0] -sampsize: [200, 200, 200 … 200] -weights: [44.2, 44.2, 44.2 … 15.1] -probs: [0.0226, 0.0226, 0.0226 … 0.0662] +popsize: [4420.9999, 4420.9999, 4420.9999 … 755.0] +sampsize: [100, 100, 100 … 50] +weights: [44.21, 44.21, 44.21 … 15.1] +allprobs: [0.0226, 0.0226, 0.0226 … 0.0662] replicates: 1000 ``` """ @@ -135,6 +135,8 @@ struct ReplicateDesign <: AbstractSurveyDesign popsize::Symbol sampsize::Symbol strata::Symbol + weights::Symbol # Effective weights in case of singlestage approx supported + allprobs::Symbol # Right now only singlestage approx supported pps::Bool replicates::UInt end diff --git a/test/SurveyDesign.jl b/test/SurveyDesign.jl index de74f2f4..89dff3bf 100644 --- a/test/SurveyDesign.jl +++ b/test/SurveyDesign.jl @@ -119,9 +119,9 @@ end @test dclus2.data[!,dclus2.weights][121] ≈ 1 / calculated_probs_R[121] atol = 1e-4 @test dclus2.data[!,dclus2.weights][125] ≈ 1 / calculated_probs_R[125] atol = 1e-4 - # TODO: sampsize and popsize testing + # TODO: sampsize and popsize testing once #178 resolved ## NOT THE SAME AS R object right now - + ######################### ## Complete multistage sampling (when implemented) should look like ## weights should theoretically be optional if both clusters and popsize given From 7eeb1faa779a97e9f908da8fa80e187082b9077a Mon Sep 17 00:00:00 2001 From: smishr Date: Sun, 15 Jan 2023 12:38:16 +0530 Subject: [PATCH 12/14] Update show testing suite --- test/show.jl | 52 ++++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/test/show.jl b/test/show.jl index f81ed2a3..1231a910 100644 --- a/test/show.jl +++ b/test/show.jl @@ -5,13 +5,13 @@ srs = SurveyDesign(apisrs; weights=:pw) refstr = """ SurveyDesign: - data: 200×47 DataFrame + data: 200×45 DataFrame strata: none cluster: none - popsize: [6190.0, 6190.0, 6190.0 … 6190.0] + popsize: [6194.0, 6194.0, 6194.0 … 6194.0] sampsize: [200, 200, 200 … 200] - 
weights: [31.0, 31.0, 31.0 … 31.0] - probs: [0.0323, 0.0323, 0.0323 … 0.0323]""" + weights: [30.97, 30.97, 30.97 … 30.97] + allprobs: [0.0323, 0.0323, 0.0323 … 0.0323]""" show(io, MIME("text/plain"), srs) str = String(take!(io)) @@ -20,13 +20,13 @@ bsrs = srs |> bootweights refstrb = """ ReplicateDesign: - data: 200×4047 DataFrame + data: 200×4045 DataFrame strata: none cluster: none - popsize: [6190.0, 6190.0, 6190.0 … 6190.0] + popsize: [6194.0, 6194.0, 6194.0 … 6194.0] sampsize: [200, 200, 200 … 200] - weights: [31.0, 31.0, 31.0 … 31.0] - probs: [0.0323, 0.0323, 0.0323 … 0.0323] + weights: [30.97, 30.97, 30.97 … 30.97] + allprobs: [0.0323, 0.0323, 0.0323 … 0.0323] replicates: 4000""" show(io, MIME("text/plain"), bsrs) @@ -41,14 +41,14 @@ end strat = SurveyDesign(apistrat; strata=:stype, weights=:pw) refstr = """ SurveyDesign: - data: 200×46 DataFrame + data: 200×44 DataFrame strata: stype [E, E, E … H] cluster: none - popsize: [6190.0, 6190.0, 6190.0 … 6190.0] - sampsize: [200, 200, 200 … 200] - weights: [44.2, 44.2, 44.2 … 15.1] - probs: [0.0226, 0.0226, 0.0226 … 0.0662]""" + popsize: [4420.9999, 4420.9999, 4420.9999 … 755.0] + sampsize: [100, 100, 100 … 50] + weights: [44.21, 44.21, 44.21 … 15.1] + allprobs: [0.0226, 0.0226, 0.0226 … 0.0662]""" show(io, MIME("text/plain"), strat) str = String(take!(io)) @@ -57,14 +57,14 @@ end stratb = strat |> bootweights refstrb = """ ReplicateDesign: - data: 200×4046 DataFrame + data: 200×4044 DataFrame strata: stype [E, E, E … H] cluster: none - popsize: [6190.0, 6190.0, 6190.0 … 6190.0] - sampsize: [200, 200, 200 … 200] - weights: [44.2, 44.2, 44.2 … 15.1] - probs: [0.0226, 0.0226, 0.0226 … 0.0662] + popsize: [4420.9999, 4420.9999, 4420.9999 … 755.0] + sampsize: [100, 100, 100 … 50] + weights: [44.21, 44.21, 44.21 … 15.1] + allprobs: [0.0226, 0.0226, 0.0226 … 0.0662] replicates: 4000""" show(io, MIME("text/plain"), stratb) @@ -79,14 +79,14 @@ end clus_one_stage = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) 
refstr = """ SurveyDesign: - data: 183×46 DataFrame + data: 183×44 DataFrame strata: none cluster: dnum [637, 637, 637 … 448] - popsize: [6190.0, 6190.0, 6190.0 … 6190.0] + popsize: [507.7049, 507.7049, 507.7049 … 507.7049] sampsize: [15, 15, 15 … 15] - weights: [33.8, 33.8, 33.8 … 33.8] - probs: [0.0295, 0.0295, 0.0295 … 0.0295]""" + weights: [33.847, 33.847, 33.847 … 33.847] + allprobs: [0.0295, 0.0295, 0.0295 … 0.0295]""" show(io, MIME("text/plain"), clus_one_stage) str = String(take!(io)) @@ -95,14 +95,14 @@ end clus_one_stageb = clus_one_stage |> bootweights refstrb = """ ReplicateDesign: - data: 183×4046 DataFrame + data: 183×4044 DataFrame strata: none cluster: dnum [637, 637, 637 … 448] - popsize: [6190.0, 6190.0, 6190.0 … 6190.0] + popsize: [507.7049, 507.7049, 507.7049 … 507.7049] sampsize: [15, 15, 15 … 15] - weights: [33.8, 33.8, 33.8 … 33.8] - probs: [0.0295, 0.0295, 0.0295 … 0.0295] + weights: [33.847, 33.847, 33.847 … 33.847] + allprobs: [0.0295, 0.0295, 0.0295 … 0.0295] replicates: 4000""" show(io, MIME("text/plain"), clus_one_stageb) From b79fce0419e476e96c56174cae3622fd1367221c Mon Sep 17 00:00:00 2001 From: smishr <43640926+smishr@users.noreply.github.com> Date: Mon, 16 Jan 2023 13:23:37 +0530 Subject: [PATCH 13/14] Update .gitignore accidentally pushed local gitignore Co-authored-by: Ayush Patnaik --- .gitignore | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 494288f6..1d4d0304 100644 --- a/.gitignore +++ b/.gitignore @@ -8,5 +8,4 @@ /dev/* .gitignore .DS_Store -*.json -src/ht.jl \ No newline at end of file +*.json \ No newline at end of file From 5275be9a14ba6499fb30acd966fffabbeee2dffe Mon Sep 17 00:00:00 2001 From: smishr Date: Mon, 16 Jan 2023 13:52:41 +0530 Subject: [PATCH 14/14] Append _ to :weights :popsize :sampsize --- src/SurveyDesign.jl | 10 +++++----- src/ratio.jl | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index
a08cd1d9..2ee5a9c3 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -76,14 +76,14 @@ struct SurveyDesign <: AbstractSurveyDesign cluster = clusters end # For single-stage approximation only one "effective" sampsize vector - sampsize_labels = :sampsize - if isa(strata,Symbol) && isnothing(clusters) # If stratified sample then sampsize is inside strata + sampsize_labels = :_sampsize + if isa(strata,Symbol) && isnothing(clusters) # If stratified only then sampsize is inside strata data[!, sampsize_labels] = transform(groupby(data, strata), nrow => :counts).counts else data[!, sampsize_labels] = fill(length(unique(data[!, cluster])), (nrow(data),)) end if isa(popsize, Symbol) - weights_labels = :weights + weights_labels = :_weights data[!, weights_labels] = data[!, popsize] ./ data[!, sampsize_labels] elseif isa(weights, Symbol) if !(typeof(data[!, weights]) <: Vector{<:Real}) @@ -91,12 +91,12 @@ struct SurveyDesign <: AbstractSurveyDesign else # derive popsize from given `weights` weights_labels = weights - popsize = :popsize + popsize = :_popsize data[!, popsize] = data[!, sampsize_labels] .* data[!, weights_labels] end else # neither popsize nor weights given - weights_labels = :weights + weights_labels = :_weights data[!, weights_labels] = repeat([1], nrow(data)) end allprobs_labels = :allprobs diff --git a/src/ratio.jl b/src/ratio.jl index 1623eb3a..ebfef889 100644 --- a/src/ratio.jl +++ b/src/ratio.jl @@ -17,14 +17,14 @@ julia> ratio(:api00, :enroll, clus_one_stage) ``` """ function ratio(variable_num:: Symbol, variable_den:: Symbol, design::SurveyDesign) - statistic = wsum(design.data[!,variable_num],design.data.weights)/wsum(design.data[!,variable_den],design.data.weights) + statistic = wsum(design.data[!,variable_num],design.data[!,design.weights])/wsum(design.data[!,variable_den],design.data[!,design.weights]) nh = length(unique(design.data[!,design.cluster])) newv = [] gdf = groupby(design.data, design.cluster) replicates = [filter(n -> n != i, 
1:nh) for i in 1:nh] for i in replicates df = DataFrame(gdf[i]) - push!(newv, wsum(df[!,variable_num],df[!,:weights])/wsum(df[!,variable_den],df[!,:weights])) + push!(newv, wsum(df[!,variable_num],df[!,design.weights])/wsum(df[!,variable_den],df[!,design.weights])) end c = 0 for i in 1:nh