xKDR · smishr · Dec 16, 2022 · Dec 9, 2022 · Dec 12, 2022 · Dec 12, 2022
diff --git a/src/Survey.jl b/src/Survey.jl
@@ -24,14 +24,11 @@ include("show.jl")
 
 export load_data
 export AbstractSurveyDesign, SimpleRandomSample, StratifiedSample
-export SurveyDesign
-export by
-export ht_calc
+export OneStageClusterSample
 export dim, colnames, dimnames
 export mean, total, quantile
 export plot
 export hist, sturges, freedman_diaconis
 export boxplot
-export ht_total, ht_mean
 
 end
diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl
@@ -320,4 +320,51 @@ struct StratifiedSample <: AbstractSurveyDesign
         data[!, :sampfraction] = sampfraction
         new(data, strata, ignorefpc)
     end
-end
+end
+
+"""
+    OneStageClusterSample <: AbstractSurveyDesign
+
+Survey design sampled by one stage cluster sampling.
+Clusters chosen by SRS followed by complete sampling of selected clusters.
+Assumes each individual in one and only one cluster; disjoint and nested clusters.
+
+`cluster` must be specified as a Symbol name of a column in `data`.
+
+# Arguments:
+`data::AbstractDataFrame`: the survey dataset (!this gets modified by the constructor).
+`cluster::Symbol`: the stratification variable - must be given as a column in `data`.
+`popsize::Union{Nothing,Symbol,<:Unsigned,Vector{<:Real}}=nothing`: the (expected) survey population size. For 
+
+`weights::Union{Nothing,Symbol,Vector{<:Real}}=nothing`: the sampling weights.
+"""
+struct OneStageClusterSample <: AbstractSurveyDesign
+    data::AbstractDataFrame
+    cluster::Symbol
+    popsize::Symbol
+    sampsize::Symbol
+    pps::Bool
+    has_strata::Bool
+    # Single stage cluster sample, like apiclus1
+    function OneStageClusterSample(data::AbstractDataFrame, cluster::Symbol, popsize::Symbol; kwargs...) # Right now kwargs does nothing, for expansion
+        # sampsize here is number of clusters completely sampled, popsize is total clusters in population
+        if !(typeof(data[!, popsize]) <: Vector{<:Real})
+            error(string("given popsize column ", popsize , " is not of numeric type"))
+        end
+        if !all(w -> w == first(data[!, popsize]), data[!, popsize])
+            error("popsize must be same for all observations within the cluster in ClusterSample")
+        end
+        # For one-stage sample only one sampsize vector
+        sampsize_labels = :sampsize
+        data_groupedby_cluster = groupby(data, cluster)
+        data[!, sampsize_labels] = fill(size(data_groupedby_cluster, 1),(nrow(data),))
+        data[!, :weights] = data[!, popsize] ./ data[!, sampsize_labels]
+        data[!, :probs] = 1 ./ data[!, :weights] # Many formulae are easily defined in terms of sampling probabilties
+        data[!, :allprobs] = data[!, :probs] # In one-stage cluster sample, allprobs is just probs, no multiplication needed
+        data[!, :strata] = ones(nrow(data))
+        pps = false
+        has_strata = false
+        new(data, cluster, popsize, sampsize_labels, pps, has_strata)
+    end
+end
+
diff --git a/src/mean.jl b/src/mean.jl
@@ -6,6 +6,8 @@ Estimate the population mean of a variable of a simple random sample, and the co
 The calculations were done according to the book [Sampling Techniques](https://www.academia.edu/29684662/Cochran_1977_Sampling_Techniques_Third_Edition)
 by William Cochran.
 
+For OneStageClusterSample, formula adapted from Sarndal pg129, section 4.2.2 Simple Random Cluster Sampling
+
 ```jldoctest
 julia> apisrs = load_data("apisrs");
 
@@ -90,6 +92,22 @@ function mean(x::Symbol, design::StratifiedSample)
     return DataFrame(mean=Ȳ̂, SE=SE)
 end
 
+function mean(x::Symbol, design::OneStageClusterSample)
+    ## Based on logical translation of corresponding in total.jl
+    ## Not quite same from R as it rounds of `total`, so division results in difference
+    # > svymean(~api00,dclus1)
+    #     mean     SE
+    # api00 644.17 23.542
+    gdf = groupby(design.data, design.cluster)
+    ȳₜ = combine(gdf, x => mean => :mean).mean
+    Nₜ = first(design.data[!,design.popsize])
+    nₜ = first(design.data[!,design.sampsize])
+    Ȳ = mean(ȳₜ)
+    s²ₜ = var(ȳₜ)
+    VȲ = (1 - nₜ/Nₜ) * s²ₜ / nₜ
+    return DataFrame(mean = Ȳ, SE = sqrt(VȲ))
+end
+
 """
     mean(x, by, design)
 

diff --git a/src/show.jl b/src/show.jl
@@ -46,4 +46,19 @@ function Base.show(io::IO, ::MIME"text/plain", design::StratifiedSample)
     printinfo(io, "sampsize", makeshort(design.data.sampsize))
     printinfo(io, "sampfraction", makeshort(design.data.sampfraction))
     printinfo(io, "ignorefpc", string(design.ignorefpc); newline=false)
+end
+
+"Print information about a survey design."
+function Base.show(io::IO, ::MIME"text/plain", design::OneStageClusterSample)
+    type = typeof(design)
+    printstyled(io, "$type:\n"; bold=true)
+    printstyled(io, "data: "; bold=true)
+    println(io, size(design.data, 1), "x", size(design.data, 2), " DataFrame")
+    printinfo(io, "cluster", string(design.cluster); newline=true)
+    printinfo(io, "popsize", string(design.popsize); newline=true)
+    printinfo(io, "sampsize", string(design.sampsize); newline=true)
+    printinfo(io, "strata", makeshort(design.data.strata))
+    printinfo(io, "weights", makeshort(design.data.weights))
+    printinfo(io, "probs", makeshort(design.data.probs))
+    printinfo(io, "allprobs", makeshort(design.data.allprobs))
 end
diff --git a/src/total.jl b/src/total.jl
@@ -3,6 +3,8 @@
 
 Estimate the population total for the variable specified by `x`.
 
+For OneStageClusterSample, formula adapted from Sarndal pg129, section 4.2.2 Simple Random Cluster Sampling
+
 ```jldoctest
 julia> using Survey;
 
@@ -86,6 +88,17 @@ function total(x::Vector{Symbol}, design::AbstractSurveyDesign)
     return df
 end
 
+function total(x::Symbol, design::OneStageClusterSample)
+    gdf = groupby(design.data, design.cluster)
+    ŷₜ = combine(gdf, x => sum => :sum).sum
+    Nₜ = first(design.data[!,design.popsize])
+    Ȳ = Nₜ * mean(ŷₜ)
+    nₜ = first(design.data[!,design.sampsize])
+    s²ₜ = var(ŷₜ)
+    VȲ = Nₜ^2 * (1 - nₜ/Nₜ) * s²ₜ / nₜ
+    return DataFrame(mean = Ȳ, SE = sqrt(VȲ))
+end
+
 """
     total(x, by, design)
 

diff --git a/test/SurveyDesign.jl b/test/SurveyDesign.jl
@@ -163,4 +163,33 @@ end
     #should throw error because sampsize > popsize
     apistrat = copy(apistrat_original)
     @test_throws ErrorException StratifiedSample(apistrat, :stype; popsize= :pw, sampsize=:fpc) 
-end
+end
+
+@testset "OneStageClusterSample" begin
+    # Load API datasets
+    apiclus1_original = load_data("apiclus1")
+    apiclus1_original[!, :pw] = fill(757/15,(size(apiclus1_original,1),)) # Correct api mistake for pw column
+    ##############################
+    # one-stage cluster sample
+    apiclus1 = copy(apiclus1_original)
+    dclus1 = OneStageClusterSample(apiclus1, :dnum, :fpc)
+    @test dclus1.data[!,:weights] ≈ fill(50.4667,size(apiclus1,1)) atol = 1e-3
+    @test dclus1.data[!,dclus1.sampsize] ≈ fill(15,size(apiclus1,1))
+    @test dclus1.data[!,:allprobs] ≈ dclus1.data[!,:probs] atol = 1e-4
+end
+
+# @testset "ClusterSample" begin
+#     # # Load API datasets
+#     # apiclus1_original = load_data("apiclus1")
+#     # apiclus1_original[!, :pw] = fill(757/15,(size(apiclus1_original,1),)) # Correct api mistake for pw column
+#     # apiclus2_original = load_data("apiclus2")
+#     ##############################
+#     ### TODO when they are implemented
+#     # one-stage cluster sample
+#     # apiclus1 = copy(apiclus1_original)
+#     # dclus2 = ClusterSample(apiclus1, :dnum, :fpc)
+#     # # two-stage cluster sample
+#     # dclus2 = ClusterSample(apiclus2, [:dnum,:snum], [:fpc1,:fpc2])
+#     # # two-stage `with replacement'
+#     # dclus2wr = ClusterSample(apiclus2, [:dnum,:snum]; weights=:pw)
+# end
diff --git a/test/total.jl b/test/total.jl
@@ -92,3 +92,15 @@ end
     # TODO: add functionality in `src/total.jl`
     # TODO: add tests
 end
+
+@testset "total_OneStageClusterSample" begin
+    # Load API datasets
+    apiclus1_original = load_data("apiclus1")
+    apiclus1_original[!, :pw] = fill(757/15,(size(apiclus1_original,1),)) # Correct api mistake for pw column
+    ##############################
+    # one-stage cluster sample
+    apiclus1 = copy(apiclus1_original)
+    dclus1 = OneStageClusterSample(apiclus1, :dnum, :fpc)
+    @test total(:api00,dclus1).mean[1] ≈ 5949162 atol = 1
+    @test total(:api00,dclus1).SE[1] ≈ 1339481 atol = 1
+end