diff --git a/src/Survey.jl b/src/Survey.jl index 064262ea..ad5c0b27 100644 --- a/src/Survey.jl +++ b/src/Survey.jl @@ -27,6 +27,7 @@ include("show.jl") export load_data export AbstractSurveyDesign, SimpleRandomSample, StratifiedSample +export SurveyDesign export svydesign export svyglm export svyby @@ -37,6 +38,7 @@ export @formula export svyplot export svyhist, sturges, freedman_diaconis export svyboxplot +export ht_svytotal, ht_svymean export #families Normal, diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index 7adfab91..0e05c858 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -14,7 +14,7 @@ abstract type AbstractSurveyDesign end """ SimpleRandomSample <: AbstractSurveyDesign -Survey design sampled by simple random sampling. + Survey design sampled by simple random sampling. """ struct SimpleRandomSample <: AbstractSurveyDesign data::AbstractDataFrame @@ -53,10 +53,11 @@ struct SimpleRandomSample <: AbstractSurveyDesign if !all(p -> p == first(probs), probs) error("all probability weights must be equal for Simple Random Sample") end - weights = 1 ./ probs + weights = 1 / probs end # estimate population size - popsize = round(sampsize .* first(weights)) |> UInt + popsize = round(sampsize * first(weights)) |> UInt + # @show popsize, sampsize # Check this Line TODO if sampsize > popsize error("sample size cannot be greater than population size") end @@ -70,13 +71,13 @@ struct SimpleRandomSample <: AbstractSurveyDesign error("either population size or frequency/probability weights must be specified") end # set sampling fraction - sampfraction = sampsize ./ popsize + sampfraction = sampsize / popsize # set fpc - fpc = ignorefpc ? 1 : 1 .- (sampsize ./ popsize) + fpc = ignorefpc ? 1 : 1 - (sampsize / popsize) # add columns for frequency and probability weights to `data` data[!, :weights] = weights if isnothing(probs) - probs = 1 ./ data[!, :weights] + probs = 1 ./ data[!, :weights] end data[!, :probs] = probs @@ -87,19 +88,19 @@ end """ StratifiedSample <: AbstractSurveyDesign -Survey design sampled by stratification. + Survey design sampled by stratification. """ struct StratifiedSample <: AbstractSurveyDesign data::AbstractDataFrame strata::Symbol - sampsize::Vector{Union{Nothing,Float64}} - popsize::Vector{Union{Nothing,Float64}} + sampsize::Union{Nothing,Vector{Real}} + popsize::Union{Nothing,Vector{Real}} sampfraction::Vector{Real} fpc::Vector{Real} ignorefpc::Bool function StratifiedSample(data::AbstractDataFrame, strata::Symbol; popsize=nothing, - sampsize= transform(groupby(data,strata), nrow => :counts ).counts , + sampsize=transform(groupby(data, strata), nrow => :counts).counts, weights=nothing, probs=nothing, ignorefpc=false @@ -133,7 +134,7 @@ struct StratifiedSample <: AbstractSurveyDesign elseif typeof(popsize) <: Vector{<:Real} # TODO: change `elseif` condition weights = popsize ./ sampsize # expansion estimator - # TODO: add probability weights + # TODO: add probability weights else error("either population size or frequency/probability weights must be specified") end @@ -156,20 +157,21 @@ end """ ClusterSample <: AbstractSurveyDesign -Survey design sampled by clustering. + One stage clusters, Primary Sampling Units (PSU) chosen with SRS. No stratification. + Assuming every individual is in one and only one cluster, and the subsampling probabilities + for any given cluster do not depend on which other clusters were sampled. (Lumley pg41, para2) """ struct ClusterSample <: AbstractSurveyDesign data::AbstractDataFrame - strata::Symbol - sampsize::Vector{Union{Nothing,Float64}} - popsize::Vector{Union{Nothing,Float64}} + clusters::Union{Symbol,Nothing} + sampsize::Union{Nothing,Vector{Real}} + popsize::Union{Nothing,Vector{Real}} sampfraction::Vector{Real} fpc::Vector{Real} ignorefpc::Bool - # TODO: change entire struct body - function GeneralSample(data::AbstractDataFrame, strata::Symbol; + function ClusterSample(data::AbstractDataFrame, strata::Symbol; popsize=nothing, - sampsize= transform(groupby(data,strata), nrow => :counts ).counts , + sampsize=transform(groupby(data, strata), nrow => :counts).counts, weights=nothing, probs=nothing, ignorefpc=false @@ -181,8 +183,10 @@ struct ClusterSample <: AbstractSurveyDesign probs = data[!, probs] end - if ignorefpc # && (isnothing(popsize) || isnothing(weights) || isnothing(probs)) - @warn "Assuming equal weights" + if ignorefpc + # TODO: change what happens if `ignorepfc == true` or if the user only + # specifies `data` + @warn "assuming equal weights" weights = ones(nrow(data)) end @@ -201,7 +205,7 @@ struct ClusterSample <: AbstractSurveyDesign elseif typeof(popsize) <: Vector{<:Real} # TODO: change `elseif` condition weights = popsize ./ sampsize # expansion estimator - # TODO: add probability weights + # TODO: add probability weights else error("either population size or frequency/probability weights must be specified") end @@ -220,3 +224,111 @@ struct ClusterSample <: AbstractSurveyDesign new(data, strata, sampsize, popsize, sampfraction, fpc, ignorefpc) end end + + +""" + SurveyDesign <: AbstractSurveyDesign + + Survey design with arbitrary design weights + + clusters: can be passed as `Symbol`, Vector{Symbol}, Vector{Real} or Nothing + strata: can be passed as `Symbol`, Vector{Symbol}, Vector{Real} or Nothing + sampsize: can be passed as `Symbol`, Vector{Symbol}, Vector{Real} or Nothing + popsize: can be passed as `Symbol`, Vector{Symbol}, Vector{Real} or Nothing +""" +struct SurveyDesign <: AbstractSurveyDesign + data::AbstractDataFrame + clusters::Union{Symbol,Vector{Symbol},Nothing} + strata::Union{Symbol,Vector{Symbol},Nothing} + sampsize::Union{Real,Vector{Real},Nothing} + popsize::Union{Real,Vector{Real},Nothing} + sampfraction::Union{Real,Vector{Real}} + fpc::Union{Real,Vector{Real}} + ignorefpc::Bool + # TODO: struct body still work in progress + function SurveyDesign(data::AbstractDataFrame; + clusters=nothing, + strata=nothing, + popsize=nothing, + sampsize=nothing, + weights=nothing, + probs=nothing, + ignorefpc=false + ) + if isnothing(sampsize) + if isnothing(strata) + sampsize = nrow(data) + else + sampsize = transform(groupby(data, strata), nrow => :counts).counts + end + + end + + if isa(weights, Symbol) + weights = data[!, weights] + end + if isa(probs, Symbol) + probs = data[!, probs] + end + if isa(popsize, Symbol) + popsize = data[!, popsize] + end + + # TODO: Check below, may not be correct for all designs + if ignorefpc # && (isnothing(popsize) || isnothing(weights) || isnothing(probs)) + @warn "Assuming equal weights" + weights = ones(nrow(data)) + end + + # TODO: Do the other case where clusters are given + if isnothing(clusters) + # set population size if it is not given; `weights` and `sampsize` must be given + if isnothing(popsize) + # TODO: add probability weights if `weights` is not `nothing` + if typeof(probs) <: Vector{<:Real} + weights = 1 ./ probs + end + # estimate population size + popsize = sampsize .* weights + + if sampsize > popsize + error("sample size cannot be greater than population size") + end + elseif typeof(popsize) <: Vector{<:Real} + # TODO: change `elseif` condition + weights = popsize ./ sampsize # expansion estimator + # TODO: add probability weights + else + error("either population size or frequency/probability weights must be specified") + end + # TODO: for now support SRS within SurveyDesign. Later, just call SimpleRandomSample?? + if typeof(sampsize) <: Real && typeof(popsize) <: Vector{<:Real} # Eg when SRS + if !all(y -> y == first(popsize), popsize) # SRS by definition is equi-weighted + error("Simple Random Sample must be equi-weighted. Different sampling weights detected in vectors") + end + popsize = first(popsize) |> Real + end + + # set sampling fraction + sampfraction = sampsize ./ popsize + # set fpc + fpc = ignorefpc ? 1 : 1 .- (sampsize ./ popsize) + # add columns for frequency and probability weights to `data` + if !isnothing(probs) + data[!, :probs] = probs + data[!, :weights] = 1 ./ data[!, :probs] + else + data[!, :weights] = weights + data[!, :probs] = 1 ./ data[!, :weights] + end + # @show clusters, strata, sampsize,popsize, sampfraction, fpc, ignorefpc + new(data, clusters, strata, sampsize, popsize, sampfraction, fpc, ignorefpc) + elseif isa(clusters,Symbol) + # One Cluster sampling - PSU chosen with SRS, + print("One stage cluster design with PSU SRS") + elseif typeof(clusters) <: Vector{Symbol} + # TODO "Multistage cluster designs" + print("TODO: Multistage cluster design with PSU SRS") + end + end +end \ No newline at end of file diff --git a/src/ht.jl b/src/ht.jl index abe8eceb..2f165a0d 100644 --- a/src/ht.jl +++ b/src/ht.jl @@ -1,17 +1,43 @@ # TODO: add docstrings -function HorwitzThompson_HartleyRaoApproximation(y::AbstractVector, design::AbstractSurveyDesign, HT_total) +""" + Hartley Rao Approximation of the variance of the Horvitz-Thompson Estimator. + Avoids exhaustively calculating joint (inclusion) probabilities πᵢⱼ of the sampling scheme. +""" +function HT_HartleyRaoVarApprox(y::AbstractVector, design::AbstractSurveyDesign, HT_total) # TODO: change function name - # TODO: change variable name (gets mixed up with the constant pi) - pi = design.data.probs - hartley_rao_var = sum((1 .- ((design.sampsize .- 1) ./ design.sampsize) .* pi) .* + πᵢ = design.data.probs + hartley_rao_var = sum((1 .- ((design.sampsize .- 1) ./ design.sampsize) .* πᵢ) .* ((y .* design.data.weights) .- (HT_total ./ design.sampsize)) .^ 2) - return sqrt(hartley_rao_var) + return hartley_rao_var end -# TODO: add docstrings -function ht_calc(x::Symbol, design) - # TODO: change function name +""" + Horvitz-Thompson Estimator of Population Total + For arbitrary sampling probabilities +""" +function ht_svytotal(x::Symbol, design) total = wsum(Float32.(design.data[!, x]), design.data.weights) - se = HorwitzThompson_HartleyRaoApproximation(design.data[!, x], design, total) - return DataFrame(total = total, se = se) + var_total = HT_HartleyRaoVarApprox(design.data[!, x], design, total) + return DataFrame(total = total, se = sqrt(var_total)) end + +# TODO: add docstrings +""" + Horvitz-Thompson Estimator of Population Mean + Scales the Population Total by the relevant +""" +function ht_svymean(x::Symbol, design) + total_df = ht_svytotal(x, design) + total = total_df.total[1] + var_total = total_df.se[1] + + mean = 1.0 ./ sum(design.popsize) .* total + # @show total_df, total, var_total, mean + #1.0 ./ (sum(design.popsize).^2) .* + + se = sqrt( var_total ) + + @show design.popsize + + return DataFrame(mean = mean, se = se) +end \ No newline at end of file diff --git a/src/poststratify.jl b/src/poststratify.jl new file mode 100644 index 00000000..2dc6c421 --- /dev/null +++ b/src/poststratify.jl @@ -0,0 +1,3 @@ +""" + Postratification functions +""" \ No newline at end of file diff --git a/src/sampling.jl b/src/sampling.jl new file mode 100644 index 00000000..0da65a24 --- /dev/null +++ b/src/sampling.jl @@ -0,0 +1,3 @@ +""" + Sampling functions using various schemes +""" \ No newline at end of file diff --git a/src/show.jl b/src/show.jl index 9e621d4b..b3eb7876 100644 --- a/src/show.jl +++ b/src/show.jl @@ -33,6 +33,23 @@ function Base.show(io::IO, ::MIME"text/plain", design::AbstractSurveyDesign) printinfo(io, "ignorefpc", string(design.ignorefpc); newline=false) end +"`show` method for printing information about a survey design" +function Base.show(io::IO, ::MIME"text/plain", design::SurveyDesign) + type = typeof(design) + printstyled(io, "$type:\n"; bold=true) + printstyled(io, "data: "; bold=true) + println(io, size(design.data, 1), "x", size(design.data, 2), " DataFrame") + println(io, "clusters: ", design.clusters) + println(io, "strata: ", design.strata) + printinfo(io, "weights", makeshort(design.data.weights)) + printinfo(io, "probs", makeshort(design.data.probs)) + printinfo(io, "popsize", makeshort(design.popsize)) + printinfo(io, "sampsize", makeshort(design.sampsize)) + printinfo(io, "fpc", makeshort(design.data.fpc)) + printinfo(io, "sampfraction", makeshort(design.sampfraction)) + printinfo(io, "ignorefpc", string(design.ignorefpc); newline=false) +end + "Print information about a survey design initialized using `svydesign`." function Base.show(io::IO, ::MIME"text/plain", design::svydesign) printstyled(io, "Survey Design:\n"; bold=true) diff --git a/src/svyratio.jl b/src/svyratio.jl new file mode 100644 index 00000000..b622e2e6 --- /dev/null +++ b/src/svyratio.jl @@ -0,0 +1,14 @@ +""" + Ratio estimators and Subpopulation estimates. + Domain Estimation is special case of subpopulation estimate. +""" + +function ratio(x::Symbol,y::Symbol,design::AbstractSurveyDesign) + num = design.data[!,x] + denom = design.data[!,y] + # fill(A,) + # A == B + + mean = num / denom + return DataFrame(mean = mean) +end \ No newline at end of file diff --git a/test/SurveyDesign.jl b/test/SurveyDesign.jl index 17994e14..535ef940 100644 --- a/test/SurveyDesign.jl +++ b/test/SurveyDesign.jl @@ -1,19 +1,56 @@ @testset "SurveyDesign.jl" begin - # SimpleRandomSample tests - apisrs_original = load_data("apisrs") - apisrs = copy(apisrs_original) + ##### SimpleRandomSample tests + # Load API datasets + apisrs_original = load_data("apisrs") + apistrat_original = load_data("apistrat") + apiclus1_original = load_data("apiclus1") + apiclus2_original = load_data("apiclus2") + # Work on copy, keep original + apisrs = copy(apisrs_original) + apistrat = copy(apistrat_original) + apiclus1 = copy(apiclus1_original) + apiclus2 = copy(apiclus2_original) srs = SimpleRandomSample(apisrs, popsize = apisrs.fpc) @test srs.data.weights == 1 ./ srs.data.probs # weights should be inverse of probs @test srs.sampsize > 0 - srs_freq = SimpleRandomSample(apisrs; popsize = apisrs.fpc , weights = fill(0.3, size(apisrs_original, 1))) + srs_freq = SimpleRandomSample(apisrs; weights = apisrs.pw ) @test srs_freq.data.weights[1] == 30.97 @test srs_freq.data.weights == 1 ./ srs_freq.data.probs - # TODO: needs change; this works but isn't what the user is expecting - srs_prob = SimpleRandomSample(apisrs; probs = fill(0.3, size(apisrs_original, 1))) - @test srs_prob.data.probs[1] == 0.3 + ##### TODO: needs change; this works but isn't what the user is expecting + # srs_prob = SimpleRandomSample(apisrs; probs = 1 ./ apisrs.pw) + # @test srs_prob.data.probs[1] == 0.3 - # StratifiedSample tests + #### TODO: StratifiedSample tests + # ... @sayantika @iulia @shikhar + # apistrat examples from R, check the main if-else cases + + # Test with probs = , weight = , and popsize = arguments, as vectors and sybols + + ##### SurveyDesign tests + + # Case 1: Simple Random Sample + svydesign1 = SurveyDesign(apisrs, popsize = apisrs.fpc) + @test svydesign1.data.weights == 1 ./ svydesign1.data.probs # weights should be inverse of probs + @test svydesign1.sampsize > 0 + + # Case 1b: SRS 'with replacement' approximation ie ignorefpc = true + svydesign2 = SurveyDesign(apisrs, popsize = apisrs.fpc, ignorefpc = true) + @test svydesign2.data.weights == 1 ./ svydesign2.data.probs # weights should be inverse of probs + @test svydesign2.sampsize > 0 + + # Case 2: Stratified Random Sample + # strat_design = SurveyDesign(apistrat,strata = :stype, popsize =:fpc, ignorefpc = false) + + # Case: Arbitrary weights + + # Case: One-stage cluster sampling, no strata + + # Case: One-stage cluster sampling, with one-stage strata + + # Case: Two cluster, two strata + + # Case: Multi stage stratified design end diff --git a/test/poststratify.jl b/test/poststratify.jl new file mode 100644 index 00000000..9af4e1c8 --- /dev/null +++ b/test/poststratify.jl @@ -0,0 +1,3 @@ +""" + Testing suite for postratification functions +""" diff --git a/test/sampling.jl b/test/sampling.jl new file mode 100644 index 00000000..276ed8a1 --- /dev/null +++ b/test/sampling.jl @@ -0,0 +1,3 @@ +""" + Testing suite for sampling functions +""" \ No newline at end of file