Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Domain stratified sample merge into Shikhar design_update branch #3

Merged
merged 11 commits into from
Nov 22, 2022
4 changes: 4 additions & 0 deletions src/SurveyDesign.jl
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,10 @@ struct StratifiedSample <: AbstractSurveyDesign
data[!, :weights] = weights
data[!, :probs] = 1 ./ data[!, :weights]
end
data[!, :sampsize] = sampsize
data[!, :popsize] = popsize
data[!, :fpc] = fpc
data[!, :sampfraction] = sampfraction
new(data, strata, sampsize, popsize, sampfraction, fpc, ignorefpc)
end
end
Expand Down
9 changes: 3 additions & 6 deletions src/ht.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# TODO: add docstrings
# TODO: improve docstrings?
"""
Hartley Rao Approximation of the variance of the Horvitz-Thompson Estimator.
Avoids exhaustively calculating joint (inclusion) probabilities πᵢⱼ of the sampling scheme.
Expand Down Expand Up @@ -30,14 +30,11 @@ function ht_svymean(x::Symbol, design)
total_df = ht_svytotal(x, design)
total = total_df.total[1]
var_total = total_df.se[1]

mean = 1.0 ./ sum(design.popsize) .* total
# TODO check standard error, it is incorrect?
# @show total_df, total, var_total, mean
#1.0 ./ (sum(design.popsize).^2) .*

# @show design.popsize
se = sqrt( var_total )

@show design.popsize

return DataFrame(mean = mean, se = se)
end
34 changes: 34 additions & 0 deletions src/svyby.jl
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,37 @@ function svyby(formula::Symbol, by::Symbol, design::AbstractSurveyDesign, func::
gdf = groupby(design.data, by)
return combine(gdf, [formula, :weights] => ((a, b) -> func(a, design, b, params...)) => AsTable)
end

"""
svyby(formula, by, design, function)

Generate subsets of a StratifiedSample.

```jldoctest
julia> apistrat = load_data("apistrat");

julia> strat = StratifiedSample(apistrat, :stype ; popsize = apistrat.fpc);

julia> svyby(:api00, :cname, strat, svymean)
40×3 DataFrame
Row │ cname domain_mean domain_mean_se
│ String15 Float64 Float64
─────┼─────────────────────────────────────────────
1 │ Los Angeles 633.511 21.3912
2 │ Ventura 707.172 31.6856
3 │ Kern 678.235 53.1337
4 │ San Diego 704.121 32.3311
5 │ San Bernardino 567.551 32.0866
⋮ │ ⋮ ⋮ ⋮
37 │ Napa 660.0 0.0
38 │ Mariposa 706.0 0.0
39 │ Mendocino 632.018 1.04942
40 │ Butte 627.0 0.0
31 rows omitted
```
"""
function svyby(formula::Symbol, by::Symbol, design::StratifiedSample, func::Function)
# TODO: add functionality for `formula::AbstractVector`
gdf_domain = groupby(design.data, by)
return combine(gdf_domain, [formula, :popsize,:sampsize,:sampfraction, design.strata] => ((a,b,c,d,e) -> func(a,b,c,d,e)) => AsTable )
end
58 changes: 52 additions & 6 deletions src/svymean.jl
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,11 @@ function svymean(x::Vector{Symbol}, design::SimpleRandomSample)
return df
end

"""
Inner method for `svyby`
"""
# Inner methods for `svyby`
function sem_svyby(x::AbstractVector, design::SimpleRandomSample, _)
function sem_svyby(x::AbstractVector, design::SimpleRandomSample)
# domain size
dsize = length(x)
# sample size
Expand All @@ -80,13 +83,59 @@ function sem_svyby(x::AbstractVector, design::SimpleRandomSample, _)
# return the standard error
return sqrt(variance)
end

function svymean(x::AbstractVector, design::SimpleRandomSample, weights)
return DataFrame(mean = mean(x), sem = sem_svyby(x, design::SimpleRandomSample, weights))
return DataFrame(mean = mean(x), sem = sem_svyby(x, design))
end

"""
Inner method for `svyby` for SimpleRandomSample
"""
function svymean(x::AbstractVector, design::SimpleRandomSample, weights)
return DataFrame(mean = mean(x), sem = sem_svyby(x, design))
end

# StratifiedSample
"""
Inner method for `svyby` for StratifiedSample
Calculates domain mean and its std error, based example 10.3.3 on pg394 Sarndal (1992)
"""
function svymean(x::AbstractVector, popsize::AbstractVector,sampsize::AbstractVector,sampfraction::AbstractVector,strata::AbstractVector)
df = DataFrame(x = x, popsize = popsize, sampsize = sampsize, sampfraction = sampfraction,strata = strata)
nsdh = []
substrata_domain_totals = []
Nh = []
nh = []
fh = []
ȳsdh = []
sigma_ȳsh_squares = []
grouped_frame = groupby(df,:strata)
for each_strata in keys(grouped_frame)
nsh = nrow(grouped_frame[each_strata])#, nrow=>:nsdh).nsdh
push!(nsdh,nsh)
substrata_domain_total = sum(grouped_frame[each_strata].x)
ȳdh = mean(grouped_frame[each_strata].x)
push!(ȳsdh, ȳdh)
push!(substrata_domain_totals,substrata_domain_total)
popsizes = first(grouped_frame[each_strata].popsize)
push!(Nh,popsizes)
sampsizes = first(grouped_frame[each_strata].sampsize)
push!(nh,sampsizes)
sampfrac = first(grouped_frame[each_strata].sampfraction)
push!(fh,sampfrac)
push!(sigma_ȳsh_squares,sum((grouped_frame[each_strata].x .- ȳdh).^2) )
end
domain_mean = sum(Nh .* substrata_domain_totals ./ nh)/sum(Nh .* nsdh ./ nh)
pdh = nsdh./nh
N̂d = sum(Nh .* pdh)
domain_var = sum(Nh.^2 .* (1 .- fh) .* (sigma_ȳsh_squares .+ (nsdh .* (1 .-pdh) .* (ȳsdh .- domain_mean).^2)) ./ (nh .* (nh .- 1) )) ./ N̂d.^2
domain_mean_se = sqrt(domain_var)
return DataFrame(domain_mean = domain_mean, domain_mean_se = domain_mean_se)
end

"""
Survey mean for StratifiedSample objects.
Ref: Cochran (1977)
"""
function svymean(x::Symbol, design::StratifiedSample)
if x == design.strata
gdf = groupby(design.data, x)
Expand All @@ -104,17 +153,14 @@ function svymean(x::Symbol, design::StratifiedSample)
return p
end
gdf = groupby(design.data, design.strata)

ȳₕ = combine(gdf, x => mean => :mean).mean
Nₕ = combine(gdf, :weights => sum => :Nₕ).Nₕ
nₕ = combine(gdf, nrow => :nₕ).nₕ
fₕ = nₕ ./ Nₕ
Wₕ = Nₕ ./ sum(Nₕ)
Ȳ̂ = sum(Wₕ .* ȳₕ)

s²ₕ = combine(gdf, x => var => :s²h).s²h
V̂Ȳ̂ = sum((Wₕ .^ 2) .* (1 .- fₕ) .* s²ₕ ./ nₕ)
SE = sqrt(V̂Ȳ̂)

return DataFrame(Ȳ̂ = Ȳ̂, SE = SE)
end
28 changes: 28 additions & 0 deletions test/svyby.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# TODO testing for svyby
@testset "svyby.jl" begin
####################################################################
# SimpleRandomSample Test
apisrs = load_data("apisrs")
srs = SimpleRandomSample(apisrs, popsize = apisrs.fpc)
## Test svymean with svyby
srs_svymean_svyby = svyby(:api00,:cname,srs,svymean)
###>>> Add tests here

## Test svytotal with svyby
# srs_svytotal_svyby = svyby(:api00,:cname,srs,svytotal)
###>>> Add tests here

####################################################################
# StratifiedSample Test
apistrat = load_data("apistrat") # load data
strat = StratifiedSample(apistrat, :stype ; popsize = apistrat.fpc )
## Test svymean with svyby
strat_svymean_svyby = svyby(:api00,:cname,strat,svymean)
###>>> Add tests here

## Test svytotal with svyby
# strat_svytotal_svyby = svyby(:api00,:cname,strat,svytotal)
###>>> Add tests here

####################################################################
end