diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index 0e05c858..63623f88 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -150,6 +150,10 @@ struct StratifiedSample <: AbstractSurveyDesign data[!, :weights] = weights data[!, :probs] = 1 ./ data[!, :weights] end + data[!, :sampsize] = sampsize + data[!, :popsize] = popsize + data[!, :fpc] = fpc + data[!, :sampfraction] = sampfraction new(data, strata, sampsize, popsize, sampfraction, fpc, ignorefpc) end end diff --git a/src/ht.jl b/src/ht.jl index 2f165a0d..cc62ff1e 100644 --- a/src/ht.jl +++ b/src/ht.jl @@ -1,4 +1,4 @@ -# TODO: add docstrings +# TODO: improve docstrings? """ Hartley Rao Approximation of the variance of the Horvitz-Thompson Estimator. Avoids exhaustively calculating joint (inclusion) probabilities πᵢⱼ of the sampling scheme. @@ -30,14 +30,11 @@ function ht_svymean(x::Symbol, design) total_df = ht_svytotal(x, design) total = total_df.total[1] var_total = total_df.se[1] - mean = 1.0 ./ sum(design.popsize) .* total + # TODO check standard error, it is incorrect? # @show total_df, total, var_total, mean #1.0 ./ (sum(design.popsize).^2) .* - + # @show design.popsize se = sqrt( var_total ) - - @show design.popsize - return DataFrame(mean = mean, se = se) end \ No newline at end of file diff --git a/src/svyby.jl b/src/svyby.jl index 46eef093..1eda74eb 100644 --- a/src/svyby.jl +++ b/src/svyby.jl @@ -37,3 +37,37 @@ function svyby(formula::Symbol, by::Symbol, design::AbstractSurveyDesign, func:: gdf = groupby(design.data, by) return combine(gdf, [formula, :weights] => ((a, b) -> func(a, design, b, params...)) => AsTable) end + +""" + svyby(formula, by, design, function) + +Generate subsets of a StratifiedSample. + +```jldoctest +julia> apistrat = load_data("apistrat"); + +julia> strat = StratifiedSample(apistrat, :stype ; popsize = apistrat.fpc); + +julia> svyby(:api00, :cname, strat, svymean) +40×3 DataFrame + Row │ cname domain_mean domain_mean_se + │ String15 Float64 Float64 +─────┼───────────────────────────────────────────── + 1 │ Los Angeles 633.511 21.3912 + 2 │ Ventura 707.172 31.6856 + 3 │ Kern 678.235 53.1337 + 4 │ San Diego 704.121 32.3311 + 5 │ San Bernardino 567.551 32.0866 + ⋮ │ ⋮ ⋮ ⋮ + 37 │ Napa 660.0 0.0 + 38 │ Mariposa 706.0 0.0 + 39 │ Mendocino 632.018 1.04942 + 40 │ Butte 627.0 0.0 + 31 rows omitted +``` +""" +function svyby(formula::Symbol, by::Symbol, design::StratifiedSample, func::Function) + # TODO: add functionality for `formula::AbstractVector` + gdf_domain = groupby(design.data, by) + return combine(gdf_domain, [formula, :popsize,:sampsize,:sampfraction, design.strata] => ((a,b,c,d,e) -> func(a,b,c,d,e)) => AsTable ) +end \ No newline at end of file diff --git a/src/svymean.jl b/src/svymean.jl index 372b9dc9..09c621a3 100644 --- a/src/svymean.jl +++ b/src/svymean.jl @@ -67,8 +67,11 @@ function svymean(x::Vector{Symbol}, design::SimpleRandomSample) return df end +""" +Inner method for `svyby` +""" # Inner methods for `svyby` -function sem_svyby(x::AbstractVector, design::SimpleRandomSample, _) +function sem_svyby(x::AbstractVector, design::SimpleRandomSample) # domain size dsize = length(x) # sample size @@ -80,13 +83,59 @@ function sem_svyby(x::AbstractVector, design::SimpleRandomSample, _) # return the standard error return sqrt(variance) end + function svymean(x::AbstractVector, design::SimpleRandomSample, weights) - return DataFrame(mean = mean(x), sem = sem_svyby(x, design::SimpleRandomSample, weights)) + return DataFrame(mean = mean(x), sem = sem_svyby(x, design)) end +""" +Inner method for `svyby` for SimpleRandomSample +""" +function svymean(x::AbstractVector, design::SimpleRandomSample, weights) + return DataFrame(mean = mean(x), sem = sem_svyby(x, design)) +end -# StratifiedSample +""" +Inner method for `svyby` for StratifiedSample +Calculates domain mean and its std error, based example 10.3.3 on pg394 Sarndal (1992) +""" +function svymean(x::AbstractVector, popsize::AbstractVector,sampsize::AbstractVector,sampfraction::AbstractVector,strata::AbstractVector) + df = DataFrame(x = x, popsize = popsize, sampsize = sampsize, sampfraction = sampfraction,strata = strata) + nsdh = [] + substrata_domain_totals = [] + Nh = [] + nh = [] + fh = [] + ȳsdh = [] + sigma_ȳsh_squares = [] + grouped_frame = groupby(df,:strata) + for each_strata in keys(grouped_frame) + nsh = nrow(grouped_frame[each_strata])#, nrow=>:nsdh).nsdh + push!(nsdh,nsh) + substrata_domain_total = sum(grouped_frame[each_strata].x) + ȳdh = mean(grouped_frame[each_strata].x) + push!(ȳsdh, ȳdh) + push!(substrata_domain_totals,substrata_domain_total) + popsizes = first(grouped_frame[each_strata].popsize) + push!(Nh,popsizes) + sampsizes = first(grouped_frame[each_strata].sampsize) + push!(nh,sampsizes) + sampfrac = first(grouped_frame[each_strata].sampfraction) + push!(fh,sampfrac) + push!(sigma_ȳsh_squares,sum((grouped_frame[each_strata].x .- ȳdh).^2) ) + end + domain_mean = sum(Nh .* substrata_domain_totals ./ nh)/sum(Nh .* nsdh ./ nh) + pdh = nsdh./nh + N̂d = sum(Nh .* pdh) + domain_var = sum(Nh.^2 .* (1 .- fh) .* (sigma_ȳsh_squares .+ (nsdh .* (1 .-pdh) .* (ȳsdh .- domain_mean).^2)) ./ (nh .* (nh .- 1) )) ./ N̂d.^2 + domain_mean_se = sqrt(domain_var) + return DataFrame(domain_mean = domain_mean, domain_mean_se = domain_mean_se) +end +""" + Survey mean for StratifiedSample objects. + Ref: Cochran (1977) +""" function svymean(x::Symbol, design::StratifiedSample) if x == design.strata gdf = groupby(design.data, x) @@ -104,17 +153,14 @@ function svymean(x::Symbol, design::StratifiedSample) return p end gdf = groupby(design.data, design.strata) - ȳₕ = combine(gdf, x => mean => :mean).mean Nₕ = combine(gdf, :weights => sum => :Nₕ).Nₕ nₕ = combine(gdf, nrow => :nₕ).nₕ fₕ = nₕ ./ Nₕ Wₕ = Nₕ ./ sum(Nₕ) Ȳ̂ = sum(Wₕ .* ȳₕ) - s²ₕ = combine(gdf, x => var => :s²h).s²h V̂Ȳ̂ = sum((Wₕ .^ 2) .* (1 .- fₕ) .* s²ₕ ./ nₕ) SE = sqrt(V̂Ȳ̂) - return DataFrame(Ȳ̂ = Ȳ̂, SE = SE) end diff --git a/test/svyby.jl b/test/svyby.jl new file mode 100644 index 00000000..0797e734 --- /dev/null +++ b/test/svyby.jl @@ -0,0 +1,28 @@ +# TODO testing for svyby +@testset "svyby.jl" begin + #################################################################### + # SimpleRandomSample Test + apisrs = load_data("apisrs") + srs = SimpleRandomSample(apisrs, popsize = apisrs.fpc) + ## Test svymean with svyby + srs_svymean_svyby = svyby(:api00,:cname,srs,svymean) + ###>>> Add tests here + + ## Test svytotal with svyby + # srs_svytotal_svyby = svyby(:api00,:cname,srs,svytotal) + ###>>> Add tests here + + #################################################################### + # StratifiedSample Test + apistrat = load_data("apistrat") # load data + strat = StratifiedSample(apistrat, :stype ; popsize = apistrat.fpc ) + ## Test svymean with svyby + strat_svymean_svyby = svyby(:api00,:cname,strat,svymean) + ###>>> Add tests here + + ## Test svytotal with svyby + # strat_svytotal_svyby = svyby(:api00,:cname,strat,svytotal) + ###>>> Add tests here + + #################################################################### +end \ No newline at end of file