From 5360700432e019c70f80e37f3ca9d7ab09fee76b Mon Sep 17 00:00:00 2001 From: smishr Date: Fri, 30 Sep 2022 12:46:51 +0530 Subject: [PATCH 01/18] changed clean examples --- clean_examples.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clean_examples.jl b/clean_examples.jl index 7bcb4cb7..40634465 100644 --- a/clean_examples.jl +++ b/clean_examples.jl @@ -105,7 +105,7 @@ combine(gdf, [:api00, :pw] => ((a, b) -> svymean(a, srs_design, b)) => AsTable) # return 0 ## 21.09.22 Stratified test 1 -# Ideally you should stratify on a CategoricalArray, alternatively, convert the StringX to categorical value before running stratifiedSample +# Ideally you should stratify on a CategoricalArray, alternatively, convert the StringX to categorical value before running StratifiedSample using Revise using Survey using DataFrames From b330c34a1bcf254378fd4feaefa93dc8bdf97dbe Mon Sep 17 00:00:00 2001 From: smishr Date: Thu, 20 Oct 2022 17:11:20 +0530 Subject: [PATCH 02/18] Update SurveyDesign class, parse Non-cluster designs --- src/SurveyDesign.jl | 151 +++++++++++++++++++++++++++----------------- 1 file changed, 93 insertions(+), 58 deletions(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index f635ff60..e368b060 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -10,7 +10,7 @@ abstract type AbstractSurveyDesign end """ SimpleRandomSample <: AbstractSurveyDesign -Survey design sampled by simple random sampling. + Survey design sampled by simple random sampling. """ struct SimpleRandomSample <: AbstractSurveyDesign data::AbstractDataFrame @@ -49,10 +49,10 @@ struct SimpleRandomSample <: AbstractSurveyDesign if !all(p -> p == first(probs), probs) error("all probability weights must be equal for Simple Random Sample") end - weights = 1 ./ probs + weights = 1 / probs end # estimate population size - popsize = round(sampsize .* first(weights)) |> UInt + popsize = round(sampsize * first(weights)) |> UInt if sampsize > popsize error("sample size cannot be greater than population size") end @@ -66,13 +66,13 @@ struct SimpleRandomSample <: AbstractSurveyDesign error("either population size or frequency/probability weights must be specified") end # set sampling fraction - sampfraction = sampsize ./ popsize + sampfraction = sampsize / popsize # set fpc - fpc = ignorefpc ? 1 : 1 .- (sampsize ./ popsize) + fpc = ignorefpc ? 1 : 1 - (sampsize / popsize) # add columns for frequency and probability weights to `data` data[!, :weights] = weights if isnothing(probs) - probs = 1 ./ data[!, :weights] + probs = 1 ./ data[!, :weights] end data[!, :probs] = probs @@ -88,14 +88,14 @@ Survey design sampled by stratification. struct StratifiedSample <: AbstractSurveyDesign data::AbstractDataFrame strata::Symbol - sampsize::Vector{Union{Nothing,Float64}} - popsize::Vector{Union{Nothing,Float64}} + sampsize::Union{Nothing,Vector{Real}} + popsize::Union{Nothing,Vector{Real}} sampfraction::Vector{Real} fpc::Vector{Real} ignorefpc::Bool function StratifiedSample(data::AbstractDataFrame, strata::Symbol; popsize=nothing, - sampsize= transform(groupby(data,strata), nrow => :counts ).counts , + sampsize=transform(groupby(data, strata), nrow => :counts).counts, weights=nothing, probs=nothing, ignorefpc=false @@ -129,7 +129,7 @@ struct StratifiedSample <: AbstractSurveyDesign elseif typeof(popsize) <: Vector{<:Real} # TODO: change `elseif` condition weights = popsize ./ sampsize # expansion estimator - # TODO: add probability weights + # TODO: add probability weights else error("either population size or frequency/probability weights must be specified") end @@ -150,69 +150,104 @@ struct StratifiedSample <: AbstractSurveyDesign end """ - GeneralSample <: AbstractSurveyDesign - -Survey design sampled by clustering. + SurveyDesign <: AbstractSurveyDesign + + Survey design with arbitrary design weights + + clusters: can be passed as `Symbol`, Vector{Symbol}, Vector{Real} or Nothing + strata: can be passed as `Symbol`, Vector{Symbol}, Vector{Real} or Nothing + sampsize: can be passed as `Symbol`, Vector{Symbol}, Vector{Real} or Nothing + popsize: can be passed as `Symbol`, Vector{Symbol}, Vector{Real} or Nothing """ -struct GeneralSample <: AbstractSurveyDesign +struct SurveyDesign <: AbstractSurveyDesign data::AbstractDataFrame - strata::Symbol - sampsize::Vector{Union{Nothing,Float64}} - popsize::Vector{Union{Nothing,Float64}} - sampfraction::Vector{Real} - fpc::Vector{Real} + clusters::Union{Symbol,Vector{Symbol},Nothing} + strata::Union{Symbol,Vector{Symbol},Nothing} + sampsize::Union{Real,Vector{Real},Nothing} + popsize::Union{Real,Vector{Real},Nothing} + sampfraction::Union{Real,Vector{Real}} + fpc::Union{Real,Vector{Real}} ignorefpc::Bool - # TODO: change entire struct body - function GeneralSample(data::AbstractDataFrame, strata::Symbol; + # TODO: struct body still work in progress + function SurveyDesign(data::AbstractDataFrame; + clusters=nothing, + strata=nothing, popsize=nothing, - sampsize= transform(groupby(data,strata), nrow => :counts ).counts , + sampsize=nothing, weights=nothing, probs=nothing, ignorefpc=false ) - if isa(weights, Symbol) - weights = data[!, weights] - end - if isa(probs, Symbol) - probs = data[!, probs] - end + # TODO: Do the other case where clusters are given + if isnothing(clusters) + if isnothing(sampsize) + if isnothing(strata) + sampsize = nrow(data) + else + sampsize = transform(groupby(data, strata), nrow => :counts).counts + end - if ignorefpc # && (isnothing(popsize) || isnothing(weights) || isnothing(probs)) - @warn "Assuming equal weights" - weights = ones(nrow(data)) - end + end - # set population size if it is not given; `weights` and `sampsize` must be given - if isnothing(popsize) - # TODO: add probability weights if `weights` is not `nothing` - if typeof(probs) <: Vector{<:Real} - weights = 1 ./ probs + if isa(weights, Symbol) + weights = data[!, weights] + end + if isa(probs, Symbol) + probs = data[!, probs] + end + if isa(popsize, Symbol) + popsize = data[!, popsize] end - # estimate population size - popsize = sampsize .* weights - if sampsize > popsize - error("sample size cannot be greater than population size") + if ignorefpc # && (isnothing(popsize) || isnothing(weights) || isnothing(probs)) + @warn "Assuming equal weights" + weights = ones(nrow(data)) end - elseif typeof(popsize) <: Vector{<:Real} - # TODO: change `elseif` condition - weights = popsize ./ sampsize # expansion estimator + + # set population size if it is not given; `weights` and `sampsize` must be given + if isnothing(popsize) + # TODO: add probability weights if `weights` is not `nothing` + if typeof(probs) <: Vector{<:Real} + weights = 1 ./ probs + end + # estimate population size + popsize = sampsize .* weights + + if sampsize > popsize + error("sample size cannot be greater than population size") + end + elseif typeof(popsize) <: Vector{<:Real} + # TODO: change `elseif` condition + weights = popsize ./ sampsize # expansion estimator # TODO: add probability weights + else + error("either population size or frequency/probability weights must be specified") + end + # TODO: for now support SRS within SurveyDesign. Later, just call SimpleRandomSample?? + if typeof(sampsize) <: Real && typeof(popsize) <: Vector{<:Real} # Eg when SRS + if !all(y -> y == first(popsize), popsize) # SRS by definition is equi-weighted + error("Simple Random Sample must be equi-weighted. Different sampling weights detected in vectors") + end + popsize = first(popsize) |> Real + end + + # set sampling fraction + sampfraction = sampsize ./ popsize + # set fpc + fpc = ignorefpc ? 1 : 1 .- (sampsize ./ popsize) + # add columns for frequency and probability weights to `data` + if !isnothing(probs) + data[!, :probs] = probs + data[!, :weights] = 1 ./ data[!, :probs] + else + data[!, :weights] = weights + data[!, :probs] = 1 ./ data[!, :weights] + end + # @show clusters, strata, sampsize,popsize, sampfraction, fpc, ignorefpc + new(data, clusters, strata, sampsize, popsize, sampfraction, fpc, ignorefpc) else - error("either population size or frequency/probability weights must be specified") - end - # set sampling fraction - sampfraction = sampsize ./ popsize - # set fpc - fpc = ignorefpc ? 1 : 1 .- (sampsize ./ popsize) - # add columns for frequency and probability weights to `data` - if !isnothing(probs) - data[!, :probs] = probs - data[!, :weights] = 1 ./ data[!, :probs] - else - data[!, :weights] = weights - data[!, :probs] = 1 ./ data[!, :weights] + print("TODO Cluster sampling") end - new(data, strata, sampsize, popsize, sampfraction, fpc, ignorefpc) + end end From 7c244973f282868941d881ece8230de063acf81b Mon Sep 17 00:00:00 2001 From: smishr Date: Thu, 20 Oct 2022 17:11:50 +0530 Subject: [PATCH 03/18] Add SurveyDesign struct and fns to export --- src/Survey.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Survey.jl b/src/Survey.jl index 40ea51d5..e56e22b6 100644 --- a/src/Survey.jl +++ b/src/Survey.jl @@ -27,6 +27,7 @@ include("ht.jl") export load_data export AbstractSurveyDesign, SimpleRandomSample, StratifiedSample +export SurveyDesign export svydesign export svyglm export svyby @@ -37,6 +38,7 @@ export @formula export svyplot export svyhist, sturges, freedman_diaconis export svyboxplot +export ht_svytotal, ht_svymean export #families Normal, From a3db52e85b15f7c35120188e15e0ce89c67e16ee Mon Sep 17 00:00:00 2001 From: smishr Date: Thu, 20 Oct 2022 17:13:57 +0530 Subject: [PATCH 04/18] Display options for SurveyDesign --- src/show.jl | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/show.jl b/src/show.jl index 5361edd1..f38668c4 100644 --- a/src/show.jl +++ b/src/show.jl @@ -20,11 +20,26 @@ function Base.show(io::IO, ::MIME"text/plain", design::AbstractSurveyDesign) printstyled(io, "$type:\n"; bold=true) printstyled(io, "data: "; bold=true) println(io, size(design.data, 1), "x", size(design.data, 2), " DataFrame") + printinfo(io, "fpc", makeshort(design.data.fpc)) + printinfo(io, "popsize", makeshort(design.popsize)) + printinfo(io, "sampsize", makeshort(design.sampsize)) + printinfo(io, "sampfraction", makeshort(design.sampfraction)) + printinfo(io, "ignorefpc", string(design.ignorefpc); newline=false) +end + +"`show` method for printing information about a survey design" +function Base.show(io::IO, ::MIME"text/plain", design::SurveyDesign) + type = typeof(design) + printstyled(io, "$type:\n"; bold=true) + printstyled(io, "data: "; bold=true) + println(io, size(design.data, 1), "x", size(design.data, 2), " DataFrame") + println(io, "clusters: ", design.clusters) + println(io, "strata: ", design.strata) printinfo(io, "weights", makeshort(design.data.weights)) printinfo(io, "probs", makeshort(design.data.probs)) - printinfo(io, "fpc", makeshort(design.data.fpc)) printinfo(io, "popsize", makeshort(design.popsize)) printinfo(io, "sampsize", makeshort(design.sampsize)) + printinfo(io, "fpc", makeshort(design.data.fpc)) printinfo(io, "sampfraction", makeshort(design.sampfraction)) printinfo(io, "ignorefpc", string(design.ignorefpc); newline=false) end From dc2b4bee18d7ba42e1cc9647edef66115004943b Mon Sep 17 00:00:00 2001 From: smishr Date: Thu, 20 Oct 2022 17:14:34 +0530 Subject: [PATCH 05/18] Horvitz Thomp correct means, HR variance --- src/ht.jl | 46 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 10 deletions(-) diff --git a/src/ht.jl b/src/ht.jl index abe8eceb..2f165a0d 100644 --- a/src/ht.jl +++ b/src/ht.jl @@ -1,17 +1,43 @@ # TODO: add docstrings -function HorwitzThompson_HartleyRaoApproximation(y::AbstractVector, design::AbstractSurveyDesign, HT_total) +""" + Hartley Rao Approximation of the variance of the Horvitz-Thompson Estimator. + Avoids exhaustively calculating joint (inclusion) probabilities πᵢⱼ of the sampling scheme. +""" +function HT_HartleyRaoVarApprox(y::AbstractVector, design::AbstractSurveyDesign, HT_total) # TODO: change function name - # TODO: change variable name (gets mixed up with the constant pi) - pi = design.data.probs - hartley_rao_var = sum((1 .- ((design.sampsize .- 1) ./ design.sampsize) .* pi) .* + πᵢ = design.data.probs + hartley_rao_var = sum((1 .- ((design.sampsize .- 1) ./ design.sampsize) .* πᵢ) .* ((y .* design.data.weights) .- (HT_total ./ design.sampsize)) .^ 2) - return sqrt(hartley_rao_var) + return hartley_rao_var end -# TODO: add docstrings -function ht_calc(x::Symbol, design) - # TODO: change function name +""" + Horvitz-Thompson Estimator of Population Total + For arbitrary sampling probabilities +""" +function ht_svytotal(x::Symbol, design) total = wsum(Float32.(design.data[!, x]), design.data.weights) - se = HorwitzThompson_HartleyRaoApproximation(design.data[!, x], design, total) - return DataFrame(total = total, se = se) + var_total = HT_HartleyRaoVarApprox(design.data[!, x], design, total) + return DataFrame(total = total, se = sqrt(var_total)) end + +# TODO: add docstrings +""" + Horvitz-Thompson Estimator of Population Mean + Scales the Population Total by the relevant +""" +function ht_svymean(x::Symbol, design) + total_df = ht_svytotal(x, design) + total = total_df.total[1] + var_total = total_df.se[1] + + mean = 1.0 ./ sum(design.popsize) .* total + # @show total_df, total, var_total, mean + #1.0 ./ (sum(design.popsize).^2) .* + + se = sqrt( var_total ) + + @show design.popsize + + return DataFrame(mean = mean, se = se) +end \ No newline at end of file From 0dc0d6efa8b6941218f72700fbc0d2839225b8fc Mon Sep 17 00:00:00 2001 From: smishr Date: Thu, 20 Oct 2022 23:14:38 +0530 Subject: [PATCH 06/18] Added comments on testing --- test/SurveyDesign.jl | 41 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/test/SurveyDesign.jl b/test/SurveyDesign.jl index 17994e14..c1b23fd5 100644 --- a/test/SurveyDesign.jl +++ b/test/SurveyDesign.jl @@ -1,7 +1,15 @@ @testset "SurveyDesign.jl" begin - # SimpleRandomSample tests - apisrs_original = load_data("apisrs") - apisrs = copy(apisrs_original) + ##### SimpleRandomSample tests + # Load API datasets + apisrs_original = load_data("apisrs") + apistrat_original = load_data("apistrat") + apiclus1_original = load_data("apiclus1") + apiclus2_original = load_data("apiclus2") + # Work on copy, keep original + apisrs = copy(apisrs_original) + apistrat = copy(apistrat_original) + apiclus1 = copy(apiclus1_original) + apiclus2 = copy(apiclus2_original) srs = SimpleRandomSample(apisrs, popsize = apisrs.fpc) @test srs.data.weights == 1 ./ srs.data.probs # weights should be inverse of probs @@ -11,9 +19,32 @@ @test srs_freq.data.weights[1] == 30.97 @test srs_freq.data.weights == 1 ./ srs_freq.data.probs - # TODO: needs change; this works but isn't what the user is expecting + ##### TODO: needs change; this works but isn't what the user is expecting srs_prob = SimpleRandomSample(apisrs; probs = fill(0.3, size(apisrs_original, 1))) @test srs_prob.data.probs[1] == 0.3 - # StratifiedSample tests + # TODO: StratifiedSample tests + # ... @sayantika @iulia @shikhar + # apistrat examples from R, check the main if-else cases + + # Test with probs = , weight = , and popsize = arguments, as vectors and sybols + + ##### SurveyDesign tests + + # Case 1: Simple Random Sample + + # Case 1b: SRS 'with replacement' approximation ie ignorefpc = true + + # Case 2: Stratified Random Sample + # strat_design = SurveyDesign(apistrat,strata = :stype, popsize =:fpc, ignorefpc = false) + + # Case: Arbitrary weights + + # Case: One-stage cluster sampling, no strata + + # Case: One-stage cluster sampling, with one-stage strata + + # Case: Two cluster, two strata + + # Case: Multi stage stratified design end From c1b32abc5ee39f6c3b92fd4fc55916dd4ecefc48 Mon Sep 17 00:00:00 2001 From: smishr Date: Fri, 21 Oct 2022 14:19:56 +0530 Subject: [PATCH 07/18] Add poststratification and sampling templates --- src/poststratify.jl | 3 +++ src/sampling.jl | 3 +++ test/poststratify.jl | 3 +++ test/sampling.jl | 3 +++ 4 files changed, 12 insertions(+) create mode 100644 src/poststratify.jl create mode 100644 src/sampling.jl create mode 100644 test/poststratify.jl create mode 100644 test/sampling.jl diff --git a/src/poststratify.jl b/src/poststratify.jl new file mode 100644 index 00000000..2dc6c421 --- /dev/null +++ b/src/poststratify.jl @@ -0,0 +1,3 @@ +""" + Postratification functions +""" \ No newline at end of file diff --git a/src/sampling.jl b/src/sampling.jl new file mode 100644 index 00000000..0da65a24 --- /dev/null +++ b/src/sampling.jl @@ -0,0 +1,3 @@ +""" + Sampling functions using various schemes +""" \ No newline at end of file diff --git a/test/poststratify.jl b/test/poststratify.jl new file mode 100644 index 00000000..9af4e1c8 --- /dev/null +++ b/test/poststratify.jl @@ -0,0 +1,3 @@ +""" + Testing suite for postratification functions +""" diff --git a/test/sampling.jl b/test/sampling.jl new file mode 100644 index 00000000..276ed8a1 --- /dev/null +++ b/test/sampling.jl @@ -0,0 +1,3 @@ +""" + Testing suite for sampling functions +""" \ No newline at end of file From 4c31b8b25fbe732a8306f747105b765cfa57808b Mon Sep 17 00:00:00 2001 From: smishr Date: Fri, 21 Oct 2022 14:31:10 +0530 Subject: [PATCH 08/18] Reorder cluster parsing --- src/SurveyDesign.jl | 55 ++++++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 25 deletions(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index e368b060..e0f7c43d 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -178,32 +178,33 @@ struct SurveyDesign <: AbstractSurveyDesign probs=nothing, ignorefpc=false ) - # TODO: Do the other case where clusters are given - if isnothing(clusters) - if isnothing(sampsize) - if isnothing(strata) - sampsize = nrow(data) - else - sampsize = transform(groupby(data, strata), nrow => :counts).counts - end - + if isnothing(sampsize) + if isnothing(strata) + sampsize = nrow(data) + else + sampsize = transform(groupby(data, strata), nrow => :counts).counts end - if isa(weights, Symbol) - weights = data[!, weights] - end - if isa(probs, Symbol) - probs = data[!, probs] - end - if isa(popsize, Symbol) - popsize = data[!, popsize] - end + end - if ignorefpc # && (isnothing(popsize) || isnothing(weights) || isnothing(probs)) - @warn "Assuming equal weights" - weights = ones(nrow(data)) - end + if isa(weights, Symbol) + weights = data[!, weights] + end + if isa(probs, Symbol) + probs = data[!, probs] + end + if isa(popsize, Symbol) + popsize = data[!, popsize] + end + # TODO: Check below, may not be correct for all designs + if ignorefpc # && (isnothing(popsize) || isnothing(weights) || isnothing(probs)) + @warn "Assuming equal weights" + weights = ones(nrow(data)) + end + + # TODO: Do the other case where clusters are given + if isnothing(clusters) # set population size if it is not given; `weights` and `sampsize` must be given if isnothing(popsize) # TODO: add probability weights if `weights` is not `nothing` @@ -245,9 +246,13 @@ struct SurveyDesign <: AbstractSurveyDesign end # @show clusters, strata, sampsize,popsize, sampfraction, fpc, ignorefpc new(data, clusters, strata, sampsize, popsize, sampfraction, fpc, ignorefpc) - else - print("TODO Cluster sampling") + elseif isa(clusters,Symbol) + # One Cluster sampling - PSU chosen with SRS, + print("One stage cluster design with PSU SRS") + elseif typeof(clusters) <: Vector{Symbol} + # TODO "Multistage cluster designs" + print("TODO: Multistage cluster design with PSU SRS") end end -end +end \ No newline at end of file From 5c929040d3bb38430a9cd8bb418369c3b02cf0b0 Mon Sep 17 00:00:00 2001 From: smishr Date: Fri, 21 Oct 2022 22:36:00 +0530 Subject: [PATCH 09/18] Fix err sampsize>popsize, Add C;usterSample class --- src/SurveyDesign.jl | 75 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 74 insertions(+), 1 deletion(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index b666f39a..c080d260 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -53,6 +53,7 @@ struct SimpleRandomSample <: AbstractSurveyDesign end # estimate population size popsize = round(sampsize * first(weights)) |> UInt + @show popsize, sampsize if sampsize > popsize error("sample size cannot be greater than population size") end @@ -83,7 +84,7 @@ end """ StratifiedSample <: AbstractSurveyDesign -Survey design sampled by stratification. + Survey design sampled by stratification. """ struct StratifiedSample <: AbstractSurveyDesign data::AbstractDataFrame @@ -149,6 +150,78 @@ struct StratifiedSample <: AbstractSurveyDesign end end +""" + ClusterSample <: AbstractSurveyDesign + + One stage clusters, Primary Sampling Units (PSU) chosen with SRS. No stratification. + Assuming every individual is in one and only one cluster, and the subsampling probabilities + for any given cluster do not depend on which other clusters were sampled. (Lumley pg41, para2) +""" +struct ClusterSample <: AbstractSurveyDesign + data::AbstractDataFrame + clusters::Union{Symbol,Nothing} + sampsize::Union{Nothing,Vector{Real}} + popsize::Union{Nothing,Vector{Real}} + sampfraction::Vector{Real} + fpc::Vector{Real} + ignorefpc::Bool + function StratifiedSample(data::AbstractDataFrame, strata::Symbol; + popsize=nothing, + sampsize=transform(groupby(data, strata), nrow => :counts).counts, + weights=nothing, + probs=nothing, + ignorefpc=false + ) + if isa(weights, Symbol) + weights = data[!, weights] + end + if isa(probs, Symbol) + probs = data[!, probs] + end + + if ignorefpc + # TODO: change what happens if `ignorepfc == true` or if the user only + # specifies `data` + @warn "assuming equal weights" + weights = ones(nrow(data)) + end + + # set population size if it is not given; `weights` and `sampsize` must be given + if isnothing(popsize) + # TODO: add probability weights if `weights` is not `nothing` + if typeof(probs) <: Vector{<:Real} + weights = 1 ./ probs + end + # estimate population size + popsize = sampsize .* weights + + if sampsize > popsize + error("sample size cannot be greater than population size") + end + elseif typeof(popsize) <: Vector{<:Real} + # TODO: change `elseif` condition + weights = popsize ./ sampsize # expansion estimator + # TODO: add probability weights + else + error("either population size or frequency/probability weights must be specified") + end + # set sampling fraction + sampfraction = sampsize ./ popsize + # set fpc + fpc = ignorefpc ? 1 : 1 .- (sampsize ./ popsize) + # add columns for frequency and probability weights to `data` + if !isnothing(probs) + data[!, :probs] = probs + data[!, :weights] = 1 ./ data[!, :probs] + else + data[!, :weights] = weights + data[!, :probs] = 1 ./ data[!, :weights] + end + new(data, strata, sampsize, popsize, sampfraction, fpc, ignorefpc) + end +end + + """ SurveyDesign <: AbstractSurveyDesign From 590dbe8bbd4027d1f7c23641fe7af7dda59de782 Mon Sep 17 00:00:00 2001 From: smishr Date: Fri, 21 Oct 2022 22:37:01 +0530 Subject: [PATCH 10/18] Remove test line 24 --- test/SurveyDesign.jl | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/test/SurveyDesign.jl b/test/SurveyDesign.jl index c1b23fd5..c4243624 100644 --- a/test/SurveyDesign.jl +++ b/test/SurveyDesign.jl @@ -15,36 +15,36 @@ @test srs.data.weights == 1 ./ srs.data.probs # weights should be inverse of probs @test srs.sampsize > 0 - srs_freq = SimpleRandomSample(apisrs; popsize = apisrs.fpc , weights = fill(0.3, size(apisrs_original, 1))) + srs_freq = SimpleRandomSample(apisrs; weights = apisrs.pw ) @test srs_freq.data.weights[1] == 30.97 @test srs_freq.data.weights == 1 ./ srs_freq.data.probs ##### TODO: needs change; this works but isn't what the user is expecting - srs_prob = SimpleRandomSample(apisrs; probs = fill(0.3, size(apisrs_original, 1))) - @test srs_prob.data.probs[1] == 0.3 + srs_prob = SimpleRandomSample(apisrs; probs = 1 ./ apisrs.pw) + # @test srs_prob.data.probs[1] == 0.3 - # TODO: StratifiedSample tests - # ... @sayantika @iulia @shikhar - # apistrat examples from R, check the main if-else cases + #### TODO: StratifiedSample tests + # ... @sayantika @iulia @shikhar + # apistrat examples from R, check the main if-else cases - # Test with probs = , weight = , and popsize = arguments, as vectors and sybols + # Test with probs = , weight = , and popsize = arguments, as vectors and sybols ##### SurveyDesign tests - # Case 1: Simple Random Sample + # Case 1: Simple Random Sample - # Case 1b: SRS 'with replacement' approximation ie ignorefpc = true + # Case 1b: SRS 'with replacement' approximation ie ignorefpc = true - # Case 2: Stratified Random Sample - # strat_design = SurveyDesign(apistrat,strata = :stype, popsize =:fpc, ignorefpc = false) + # Case 2: Stratified Random Sample + # strat_design = SurveyDesign(apistrat,strata = :stype, popsize =:fpc, ignorefpc = false) - # Case: Arbitrary weights + # Case: Arbitrary weights - # Case: One-stage cluster sampling, no strata + # Case: One-stage cluster sampling, no strata - # Case: One-stage cluster sampling, with one-stage strata + # Case: One-stage cluster sampling, with one-stage strata - # Case: Two cluster, two strata - - # Case: Multi stage stratified design + # Case: Two cluster, two strata + + # Case: Multi stage stratified design end From 61a9e5cdbe16517d111609d9e2f91cdad1899257 Mon Sep 17 00:00:00 2001 From: smishr Date: Fri, 21 Oct 2022 22:38:24 +0530 Subject: [PATCH 11/18] Add svyratio estimator --- src/svyratio.jl | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 src/svyratio.jl diff --git a/src/svyratio.jl b/src/svyratio.jl new file mode 100644 index 00000000..b622e2e6 --- /dev/null +++ b/src/svyratio.jl @@ -0,0 +1,14 @@ +""" + Ratio estimators and Subpopulation estimates. + Domain Estimation is special case of subpopulation estimate. +""" + +function ratio(x::Symbol,y::Symbol,design::AbstractSurveyDesign) + num = design.data[!,x] + denom = design.data[!,y] + # fill(A,) + # A == B + + mean = num / denom + return DataFrame(mean = mean) +end \ No newline at end of file From c14c088783791b58da75e1d7b59c5c2c656f982f Mon Sep 17 00:00:00 2001 From: smishr Date: Fri, 21 Oct 2022 22:39:03 +0530 Subject: [PATCH 12/18] Add strat test2, line 121 --- clean_examples.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/clean_examples.jl b/clean_examples.jl index 40634465..43c0b453 100644 --- a/clean_examples.jl +++ b/clean_examples.jl @@ -118,6 +118,7 @@ apistrat_categ.stype = CategoricalArray(apistrat_categ.stype) eltype(apistrat_categ.stype) strat_categ_design = StratifiedSample(apistrat_categ, :stype ; popsize = apistrat_categ.fpc ) +strat_categ_design = StratifiedSample(apistrat_categ, :stype ; weights = :pw ) svymean(:stype,strat_categ_design) svytotal(:stype,strat_categ_design) From ace6bbe9b607a01fffdb887be363a7e1649bda91 Mon Sep 17 00:00:00 2001 From: smishr Date: Mon, 31 Oct 2022 15:21:11 +0530 Subject: [PATCH 13/18] Add check for invalid probs weights args --- src/SurveyDesign.jl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index c080d260..8724f69d 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -32,7 +32,11 @@ struct SimpleRandomSample <: AbstractSurveyDesign if isa(probs, Symbol) probs = data[!, probs] end - + if !( sum(probs) ≈ 1 ) + error("Sampling probabilities for SRS must sum to 1") + elseif !(sum(1 ./ weights) ≈ 1) + error("Inverse of weights should for SRS must sum to 1") + end if ignorefpc @warn "assuming all weights are equal to 1.0" weights = ones(nrow(data)) From ba00f816b0e0a87342f1873258a600bf528d12ea Mon Sep 17 00:00:00 2001 From: smishr Date: Mon, 31 Oct 2022 15:33:38 +0530 Subject: [PATCH 14/18] Add SurveyDesign test cases SRS --- src/SurveyDesign.jl | 2 +- test/SurveyDesign.jl | 28 +++++++++++++++++----------- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index 8724f69d..f1323329 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -169,7 +169,7 @@ struct ClusterSample <: AbstractSurveyDesign sampfraction::Vector{Real} fpc::Vector{Real} ignorefpc::Bool - function StratifiedSample(data::AbstractDataFrame, strata::Symbol; + function ClusterSample(data::AbstractDataFrame, strata::Symbol; popsize=nothing, sampsize=transform(groupby(data, strata), nrow => :counts).counts, weights=nothing, diff --git a/test/SurveyDesign.jl b/test/SurveyDesign.jl index c4243624..535ef940 100644 --- a/test/SurveyDesign.jl +++ b/test/SurveyDesign.jl @@ -20,7 +20,7 @@ @test srs_freq.data.weights == 1 ./ srs_freq.data.probs ##### TODO: needs change; this works but isn't what the user is expecting - srs_prob = SimpleRandomSample(apisrs; probs = 1 ./ apisrs.pw) + # srs_prob = SimpleRandomSample(apisrs; probs = 1 ./ apisrs.pw) # @test srs_prob.data.probs[1] == 0.3 #### TODO: StratifiedSample tests @@ -31,20 +31,26 @@ ##### SurveyDesign tests - # Case 1: Simple Random Sample + # Case 1: Simple Random Sample + svydesign1 = SurveyDesign(apisrs, popsize = apisrs.fpc) + @test svydesign1.data.weights == 1 ./ svydesign1.data.probs # weights should be inverse of probs + @test svydesign1.sampsize > 0 - # Case 1b: SRS 'with replacement' approximation ie ignorefpc = true + # Case 1b: SRS 'with replacement' approximation ie ignorefpc = true + svydesign2 = SurveyDesign(apisrs, popsize = apisrs.fpc, ignorefpc = true) + @test svydesign2.data.weights == 1 ./ svydesign2.data.probs # weights should be inverse of probs + @test svydesign2.sampsize > 0 - # Case 2: Stratified Random Sample - # strat_design = SurveyDesign(apistrat,strata = :stype, popsize =:fpc, ignorefpc = false) + # Case 2: Stratified Random Sample + # strat_design = SurveyDesign(apistrat,strata = :stype, popsize =:fpc, ignorefpc = false) - # Case: Arbitrary weights + # Case: Arbitrary weights - # Case: One-stage cluster sampling, no strata + # Case: One-stage cluster sampling, no strata - # Case: One-stage cluster sampling, with one-stage strata + # Case: One-stage cluster sampling, with one-stage strata - # Case: Two cluster, two strata - - # Case: Multi stage stratified design + # Case: Two cluster, two strata + + # Case: Multi stage stratified design end From 0c8b988a3a134863a51fd75de8ddebe9c71cc2a5 Mon Sep 17 00:00:00 2001 From: smishr Date: Sat, 5 Nov 2022 23:45:59 +0530 Subject: [PATCH 15/18] Revert "Add check for invalid probs weights args" This reverts commit ace6bbe9b607a01fffdb887be363a7e1649bda91. --- src/SurveyDesign.jl | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index f1323329..56bf2f70 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -32,11 +32,7 @@ struct SimpleRandomSample <: AbstractSurveyDesign if isa(probs, Symbol) probs = data[!, probs] end - if !( sum(probs) ≈ 1 ) - error("Sampling probabilities for SRS must sum to 1") - elseif !(sum(1 ./ weights) ≈ 1) - error("Inverse of weights should for SRS must sum to 1") - end + if ignorefpc @warn "assuming all weights are equal to 1.0" weights = ones(nrow(data)) From ee93e5d5c85b1f16107150c230a2b4fc496e9854 Mon Sep 17 00:00:00 2001 From: smishr Date: Sat, 5 Nov 2022 23:54:40 +0530 Subject: [PATCH 16/18] Remove @show in SimpleRandomSample --- src/SurveyDesign.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index 56bf2f70..d4f657d0 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -53,7 +53,7 @@ struct SimpleRandomSample <: AbstractSurveyDesign end # estimate population size popsize = round(sampsize * first(weights)) |> UInt - @show popsize, sampsize + # @show popsize, sampsize # Check this Line TODO if sampsize > popsize error("sample size cannot be greater than population size") end From cb69bfd8fca9e688881d450ac240c934a1aa8bfd Mon Sep 17 00:00:00 2001 From: smishr Date: Sat, 5 Nov 2022 23:57:07 +0530 Subject: [PATCH 17/18] Remove clean_examples file --- clean_examples.jl | 169 ---------------------------------------------- 1 file changed, 169 deletions(-) delete mode 100644 clean_examples.jl diff --git a/clean_examples.jl b/clean_examples.jl deleted file mode 100644 index 43c0b453..00000000 --- a/clean_examples.jl +++ /dev/null @@ -1,169 +0,0 @@ -# ### Lumley Texbook code, Fig 2.2 pg 20 -using Revise -using Survey -using DataFrames -using CSV - -# Load in dataframe -apisrs = CSV.read("assets/apisrs.csv",DataFrame) - -### Set design (All should give identical results) -srs_design = SimpleRandomSample(apisrs, popsize = apisrs.fpc) # popsize only -srs_design = SimpleRandomSample(apisrs, weights = apisrs.pw) # no popsize, so weights given as Vector -srs_design = SimpleRandomSample(apisrs, weights = :pw) # no popsize, so weights given as Symbol -srs_design = SimpleRandomSample(apisrs, probs = 1 ./ apisrs.pw) # no popsize, so probs given as Vector - -svytotal(:enroll,srs_design) -svymean([:enroll,:api00],srs_design) -svymean(:enroll,srs_design) - -# svytotal error -svytotal(:api00, srs) - -# No fpc example -no_fpc = SimpleRandomSample(apisrs, ignorefpc = true) -svytotal(:enroll,no_fpc) -svytotal(:api00,no_fpc) -svymean(:enroll,no_fpc) - -#### -using Revise -using Survey -using DataFrames -using CSV -using CategoricalArrays -# Test feature for categorical variables -apisrs_categ = CSV.read("assets/apisrs.csv",DataFrame) -eltype(apisrs_categ.stype) -# Convert a column to CategoricalArray -apisrs_categ.stype = CategoricalArray(apisrs_categ.stype) -eltype(apisrs_categ.stype) - -srs_design_categ = SimpleRandomSample(apisrs_categ, popsize = apisrs_categ.fpc) - -# isa(srs_design_categ.data.stype, CategoricalArray) -# isa(srs_design_categ.data[!,:stype], CategoricalArray) - -# Svymean and svytotal example -svymean(:enroll,srs_design_categ) # works -svymean(:stype,srs_design_categ) # no method matching /(::CategoricalValue{String1, UInt32}, ::Int64) -svytotal(:stype,srs_design_categ) - -# way to update -srs_design.data.apidiff = srs_design.data.api00 - srs_design.data.api99 - - -svyquantile(:enroll, srs_design_categ,0.5) - -# isa(srs_design_categ.data.stype, CategoricalArray) - - -# # apisrs = DataFrame(CSV.file("data/apisrs.csv")) -# # Base.format_bytes(Base.summarysize(apisrs.stype)) -# # Base.format_bytes(Base.summarysize(CategoricalArray(apisrs.stype))) - - -# ### Test 10.09.22 - -# gdf = groupby(design.data, by) -# combine(gdf, [formula, :weights] => ((a, b) -> func(a, design, b, params...)) => AsTable) - -# using Revise -# using Survey -# using DataFrames -# using CSV -# using StatsBase - -# apisrs_categ = CSV.read("assets/apisrs.csv",DataFrame) # laod data -# srs_design = SimpleRandomSample(apisrs_categ, popsize = apisrs_categ.fpc) # create design object -# # manually grouby to get result -# gdf = groupby(srs_design.data, :cname ) -# combine(gdf, :api00 => mean) # works -# combine(gdf, (:api00,srs_design) => svymean) - -# combine(gdf, [:api00, :pw] => ((a, b) -> svymean(a, srs_design, b)) => AsTable) - -# Test 12.09.22 -using Revise -using Survey -using DataFrames -using CSV -using StatsBase -apisrs_categ = CSV.read("assets/apisrs.csv",DataFrame) # laod data -srs_design = SimpleRandomSample(apisrs_categ, popsize = apisrs_categ.fpc) # create design object -gdf = groupby(srs_design.data, :cname ) -combine(gdf, [:api00, :pw] => ((a, b) -> svymean(a, srs_design, b)) => AsTable) - - - - - # # print("Yolo") - # test = combine(gdf, x => mean => :mean) # |> DataFrame |> AsTable # , (x , design) => sem => :sem ) |> DataFrame - # @show test - # # show(test) - # # delay(50000) - # return 0 - -## 21.09.22 Stratified test 1 -# Ideally you should stratify on a CategoricalArray, alternatively, convert the StringX to categorical value before running StratifiedSample -using Revise -using Survey -using DataFrames -using CSV -using StatsBase -using CategoricalArrays - -apistrat_categ = CSV.read("assets/apistrat.csv",DataFrame) # load data -apistrat_categ.stype = CategoricalArray(apistrat_categ.stype) -eltype(apistrat_categ.stype) - -strat_categ_design = StratifiedSample(apistrat_categ, :stype ; popsize = apistrat_categ.fpc ) -strat_categ_design = StratifiedSample(apistrat_categ, :stype ; weights = :pw ) -svymean(:stype,strat_categ_design) -svytotal(:stype,strat_categ_design) - -### Strat normal -using Revise -using Survey -using DataFrames -using CSV -using StatsBase - -apistrat = CSV.read("assets/apistrat.csv",DataFrame) # laod data -strat_design = StratifiedSample(apistrat, :stype ; popsize = apistrat.fpc ) -svytotal(:api00,strat_design) -svymean(:api00,strat_design) - -svytotal(:enroll,strat_design) -svymean(:enroll,strat_design) - -# Support for categorical var - -# Test feature for categorical variables - - -srs_design_categ = SimpleRandomSample(apisrs_categ, popsize = apisrs_categ.fpc) - -# V̂ȳₕ = Nₕ .^2 ./ nₕ .* (1 .- fₕ) .* s²ₕ - # V̂Ȳ̂ = 1 ./ sum(Nₕ) .* sum( Nₕ .^2 .* V̂ȳₕ) #(Nₕ .^ 2) .* design.fpc .* s²h ./ design.sampsize # sum(combine(gdf, [x,:weights] => ( (a,b) -> wsum(a,b) ) => :total).total) - - -StratifiedSample(apistrat, :stype ; weights = :pw ) - - -## 26.09.22 HT test -using Revise -using Survey -using DataFrames -using CSV - -# Load in dataframe -apisrs = CSV.read("assets/apisrs.csv",DataFrame) - -### Set design (All should give identical results) -srs_design = SimpleRandomSample(apisrs, popsize = apisrs.fpc) # popsize only - -ht_calc(:api00, srs_design) - - -ht_calc(:api00, strat_design) \ No newline at end of file From dfc26ed1461ddcd6aa64ca48f24ec17698d2e4a4 Mon Sep 17 00:00:00 2001 From: smishr Date: Sun, 6 Nov 2022 00:03:51 +0530 Subject: [PATCH 18/18] tried fixing git merge in show.jl --- src/show.jl | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/src/show.jl b/src/show.jl index f38668c4..0a1f91d0 100644 --- a/src/show.jl +++ b/src/show.jl @@ -1,6 +1,7 @@ -"Helper function for nice printing" +""" +Helper function that transforms a given `Number` or `Vector` into a short-form string. +""" function makeshort(x) - # write floats in short form if isa(x[1], Float64) x = round.(x, sigdigits=3) end @@ -8,18 +9,23 @@ function makeshort(x) x = length(x) < 3 ? join(x, ", ") : join(x[1:3], ", ") * ", ..., " * string(last(x)) end -"Prints title: content" +""" +Print information in the form: + **name:** content[\n] +""" function printinfo(io::IO, name::String, content::String; newline::Bool=true) printstyled(io, name, ": "; bold=true) newline ? println(io, content) : print(io, content) end -"`show` method for printing information about a survey design" +"Print information about a survey design." function Base.show(io::IO, ::MIME"text/plain", design::AbstractSurveyDesign) type = typeof(design) printstyled(io, "$type:\n"; bold=true) printstyled(io, "data: "; bold=true) println(io, size(design.data, 1), "x", size(design.data, 2), " DataFrame") + printinfo(io, "weights", makeshort(design.data.weights)) + printinfo(io, "probs", makeshort(design.data.probs)) printinfo(io, "fpc", makeshort(design.data.fpc)) printinfo(io, "popsize", makeshort(design.popsize)) printinfo(io, "sampsize", makeshort(design.sampsize)) @@ -43,3 +49,19 @@ function Base.show(io::IO, ::MIME"text/plain", design::SurveyDesign) printinfo(io, "sampfraction", makeshort(design.sampfraction)) printinfo(io, "ignorefpc", string(design.ignorefpc); newline=false) end + +"Print information about a survey design initialized using `svydesign`." +function Base.show(io::IO, ::MIME"text/plain", design::svydesign) + printstyled(io, "Survey Design:\n"; bold=true) + printstyled(io, "variables: "; bold=true) + println(io, size(design.variables, 1), "x", size(design.variables, 2), " DataFrame") + printinfo(io, "id", makeshort(design.id)) + printinfo(io, "strata", makeshort(design.variables.strata)) + printinfo(io, "probs", makeshort(design.variables.probs)) + printinfo(io, "fpc:\n popsize", makeshort(design.variables.popsize)) + printinfo(io, " sampsize", makeshort(design.variables.sampsize); newline=false) + printstyled("\nnest: "; bold=true) + print(design.nest) + printstyled("\ncheck_strat: "; bold=true) + print(design.check_strat) +end \ No newline at end of file