smishr · smishr · Nov 22, 2022 · Nov 10, 2022 · Nov 10, 2022 · Nov 11, 2022
diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl
@@ -150,6 +150,10 @@ struct StratifiedSample <: AbstractSurveyDesign
             data[!, :weights] = weights
             data[!, :probs] = 1 ./ data[!, :weights]
         end
+        data[!, :sampsize] = sampsize
+        data[!, :popsize] = popsize
+        data[!, :fpc] = fpc
+        data[!, :sampfraction] = sampfraction
         new(data, strata, sampsize, popsize, sampfraction, fpc, ignorefpc)
     end
 end

diff --git a/src/ht.jl b/src/ht.jl
@@ -1,4 +1,4 @@
-# TODO: add docstrings
+# TODO: improve docstrings?
 """
     Hartley Rao Approximation of the variance of the Horvitz-Thompson Estimator.
     Avoids exhaustively calculating joint (inclusion) probabilities πᵢⱼ of the sampling scheme. 
@@ -30,14 +30,11 @@ function ht_svymean(x::Symbol, design)
     total_df = ht_svytotal(x, design)
     total = total_df.total[1]
     var_total = total_df.se[1]
-
     mean = 1.0 ./ sum(design.popsize) .* total
+    # TODO check standard error, it is incorrect?
     # @show total_df, total, var_total, mean
     #1.0 ./ (sum(design.popsize).^2) .* 
-
+    # @show design.popsize
     se = sqrt( var_total )
-
-    @show design.popsize
-
     return DataFrame(mean = mean, se = se)
 end
diff --git a/src/svyby.jl b/src/svyby.jl
@@ -37,3 +37,37 @@ function svyby(formula::Symbol, by::Symbol, design::AbstractSurveyDesign, func::
     gdf = groupby(design.data, by)
     return combine(gdf, [formula, :weights] => ((a, b) -> func(a, design, b, params...)) => AsTable)
 end
+
+"""
+    svyby(formula, by, design, function)
+
+Generate subsets of a StratifiedSample.
+
+```jldoctest
+julia> apistrat = load_data("apistrat");
+
+julia> strat = StratifiedSample(apistrat, :stype ; popsize = apistrat.fpc);
+
+julia> svyby(:api00, :cname, strat, svymean)
+40×3 DataFrame
+ Row │ cname           domain_mean  domain_mean_se 
+     │ String15        Float64      Float64        
+─────┼─────────────────────────────────────────────
+   1 │ Los Angeles         633.511    21.3912
+   2 │ Ventura             707.172    31.6856
+   3 │ Kern                678.235    53.1337
+   4 │ San Diego           704.121    32.3311
+   5 │ San Bernardino      567.551    32.0866
+  ⋮  │       ⋮              ⋮             ⋮
+  37 │ Napa                660.0       0.0
+  38 │ Mariposa            706.0       0.0
+  39 │ Mendocino           632.018     1.04942
+  40 │ Butte               627.0       0.0
+                                    31 rows omitted
+```
+"""
+function svyby(formula::Symbol, by::Symbol, design::StratifiedSample, func::Function)
+    # TODO: add functionality for `formula::AbstractVector`
+    gdf_domain = groupby(design.data, by)
+    return combine(gdf_domain, [formula, :popsize,:sampsize,:sampfraction, design.strata] => ((a,b,c,d,e) -> func(a,b,c,d,e)) => AsTable ) 
+end
diff --git a/src/svymean.jl b/src/svymean.jl
@@ -67,8 +67,11 @@ function svymean(x::Vector{Symbol}, design::SimpleRandomSample)
     return df
 end
 
+"""
+Inner method for `svyby`
+"""
 # Inner methods for `svyby`
-function sem_svyby(x::AbstractVector, design::SimpleRandomSample, _)
+function sem_svyby(x::AbstractVector, design::SimpleRandomSample)
     # domain size
     dsize = length(x)
     # sample size
@@ -80,13 +83,59 @@ function sem_svyby(x::AbstractVector, design::SimpleRandomSample, _)
     # return the standard error
     return sqrt(variance)
 end
+
 function svymean(x::AbstractVector, design::SimpleRandomSample, weights)
-    return DataFrame(mean = mean(x), sem = sem_svyby(x, design::SimpleRandomSample, weights))
+    return DataFrame(mean = mean(x), sem = sem_svyby(x, design))
 end
 
+"""
+Inner method for `svyby` for SimpleRandomSample
+"""
+function svymean(x::AbstractVector, design::SimpleRandomSample, weights)
+    return DataFrame(mean = mean(x), sem = sem_svyby(x, design))
+end
 
-# StratifiedSample
+"""
+Inner method for `svyby` for StratifiedSample
+Calculates domain mean and its std error, based example 10.3.3 on pg394 Sarndal (1992)
+"""
+function svymean(x::AbstractVector, popsize::AbstractVector,sampsize::AbstractVector,sampfraction::AbstractVector,strata::AbstractVector)
+    df = DataFrame(x = x, popsize = popsize, sampsize = sampsize, sampfraction = sampfraction,strata = strata)
+    nsdh = []
+    substrata_domain_totals = []
+    Nh = []
+    nh = []
+    fh = []
+    ȳsdh = []
+    sigma_ȳsh_squares = []
+    grouped_frame = groupby(df,:strata)
+    for each_strata in keys(grouped_frame)
+        nsh = nrow(grouped_frame[each_strata])#, nrow=>:nsdh).nsdh
+        push!(nsdh,nsh)
+        substrata_domain_total = sum(grouped_frame[each_strata].x)
+        ȳdh = mean(grouped_frame[each_strata].x)
+        push!(ȳsdh, ȳdh)
+        push!(substrata_domain_totals,substrata_domain_total)
+        popsizes = first(grouped_frame[each_strata].popsize)
+        push!(Nh,popsizes)
+        sampsizes = first(grouped_frame[each_strata].sampsize)
+        push!(nh,sampsizes)
+        sampfrac = first(grouped_frame[each_strata].sampfraction)
+        push!(fh,sampfrac)
+        push!(sigma_ȳsh_squares,sum((grouped_frame[each_strata].x .- ȳdh).^2) )
+    end
+    domain_mean = sum(Nh .* substrata_domain_totals ./ nh)/sum(Nh .* nsdh ./ nh)
+    pdh = nsdh./nh
+    N̂d = sum(Nh .* pdh)
+    domain_var = sum(Nh.^2 .* (1 .- fh) .*  (sigma_ȳsh_squares .+ (nsdh .* (1 .-pdh) .* (ȳsdh .- domain_mean).^2)) ./ (nh .* (nh .- 1) )) ./ N̂d.^2
+    domain_mean_se = sqrt(domain_var)
+    return DataFrame(domain_mean = domain_mean, domain_mean_se = domain_mean_se)
+end
 
+"""
+    Survey mean for StratifiedSample objects.
+    Ref: Cochran (1977)
+"""
 function svymean(x::Symbol, design::StratifiedSample)
     if x == design.strata
         gdf = groupby(design.data, x)
@@ -104,17 +153,14 @@ function svymean(x::Symbol, design::StratifiedSample)
         return p
     end
     gdf = groupby(design.data, design.strata)
-
     ȳₕ = combine(gdf, x => mean => :mean).mean
     Nₕ = combine(gdf, :weights => sum => :Nₕ).Nₕ
     nₕ = combine(gdf, nrow => :nₕ).nₕ
     fₕ = nₕ ./ Nₕ
     Wₕ = Nₕ ./ sum(Nₕ)
     Ȳ̂ = sum(Wₕ .* ȳₕ)
-
     s²ₕ = combine(gdf, x => var => :s²h).s²h
     V̂Ȳ̂ = sum((Wₕ .^ 2) .* (1 .- fₕ) .* s²ₕ ./ nₕ)
     SE = sqrt(V̂Ȳ̂)
-
     return DataFrame(Ȳ̂ = Ȳ̂, SE = SE)
 end
diff --git a/test/svyby.jl b/test/svyby.jl
@@ -0,0 +1,28 @@
+# TODO testing for svyby
+@testset "svyby.jl" begin
+    ####################################################################
+    # SimpleRandomSample Test
+    apisrs = load_data("apisrs")
+    srs = SimpleRandomSample(apisrs, popsize = apisrs.fpc)
+    ## Test svymean with svyby
+    srs_svymean_svyby = svyby(:api00,:cname,srs,svymean)
+    ###>>> Add tests here
+
+    ## Test svytotal with svyby
+    # srs_svytotal_svyby = svyby(:api00,:cname,srs,svytotal)
+    ###>>> Add tests here
+
+    ####################################################################
+    # StratifiedSample Test
+    apistrat = load_data("apistrat") # load data
+    strat = StratifiedSample(apistrat, :stype ; popsize = apistrat.fpc )
+    ## Test svymean with svyby
+    strat_svymean_svyby = svyby(:api00,:cname,strat,svymean)
+    ###>>> Add tests here
+
+    ## Test svytotal with svyby
+    # strat_svytotal_svyby = svyby(:api00,:cname,strat,svytotal)
+    ###>>> Add tests here
+
+    ####################################################################
+end