diff --git a/src/Survey.jl b/src/Survey.jl index f902bdc5..095ae659 100644 --- a/src/Survey.jl +++ b/src/Survey.jl @@ -24,14 +24,11 @@ include("show.jl") export load_data export AbstractSurveyDesign, SimpleRandomSample, StratifiedSample -export SurveyDesign -export by -export ht_calc +export OneStageClusterSample export dim, colnames, dimnames export mean, total, quantile export plot export hist, sturges, freedman_diaconis export boxplot -export ht_total, ht_mean end \ No newline at end of file diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index 13003458..5fb91c96 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -320,4 +320,51 @@ struct StratifiedSample <: AbstractSurveyDesign data[!, :sampfraction] = sampfraction new(data, strata, ignorefpc) end -end \ No newline at end of file +end + +""" + OneStageClusterSample <: AbstractSurveyDesign + +Survey design sampled by one stage cluster sampling. +Clusters chosen by SRS followed by complete sampling of selected clusters. +Assumes each individual in one and only one cluster; disjoint and nested clusters. + +`cluster` must be specified as a Symbol name of a column in `data`. + +# Arguments: +`data::AbstractDataFrame`: the survey dataset (!this gets modified by the constructor). +`cluster::Symbol`: the stratification variable - must be given as a column in `data`. +`popsize::Union{Nothing,Symbol,<:Unsigned,Vector{<:Real}}=nothing`: the (expected) survey population size. For + +`weights::Union{Nothing,Symbol,Vector{<:Real}}=nothing`: the sampling weights. +""" +struct OneStageClusterSample <: AbstractSurveyDesign + data::AbstractDataFrame + cluster::Symbol + popsize::Symbol + sampsize::Symbol + pps::Bool + has_strata::Bool + # Single stage cluster sample, like apiclus1 + function OneStageClusterSample(data::AbstractDataFrame, cluster::Symbol, popsize::Symbol; kwargs...) # Right now kwargs does nothing, for expansion + # sampsize here is number of clusters completely sampled, popsize is total clusters in population + if !(typeof(data[!, popsize]) <: Vector{<:Real}) + error(string("given popsize column ", popsize , " is not of numeric type")) + end + if !all(w -> w == first(data[!, popsize]), data[!, popsize]) + error("popsize must be same for all observations within the cluster in ClusterSample") + end + # For one-stage sample only one sampsize vector + sampsize_labels = :sampsize + data_groupedby_cluster = groupby(data, cluster) + data[!, sampsize_labels] = fill(size(data_groupedby_cluster, 1),(nrow(data),)) + data[!, :weights] = data[!, popsize] ./ data[!, sampsize_labels] + data[!, :probs] = 1 ./ data[!, :weights] # Many formulae are easily defined in terms of sampling probabilties + data[!, :allprobs] = data[!, :probs] # In one-stage cluster sample, allprobs is just probs, no multiplication needed + data[!, :strata] = ones(nrow(data)) + pps = false + has_strata = false + new(data, cluster, popsize, sampsize_labels, pps, has_strata) + end +end + diff --git a/src/mean.jl b/src/mean.jl index 95869857..dc04b4a9 100644 --- a/src/mean.jl +++ b/src/mean.jl @@ -6,6 +6,8 @@ Estimate the population mean of a variable of a simple random sample, and the co The calculations were done according to the book [Sampling Techniques](https://www.academia.edu/29684662/Cochran_1977_Sampling_Techniques_Third_Edition) by William Cochran. +For OneStageClusterSample, formula adapted from Sarndal pg129, section 4.2.2 Simple Random Cluster Sampling + ```jldoctest julia> apisrs = load_data("apisrs"); @@ -90,6 +92,22 @@ function mean(x::Symbol, design::StratifiedSample) return DataFrame(mean=Ȳ̂, SE=SE) end +function mean(x::Symbol, design::OneStageClusterSample) + ## Based on logical translation of corresponding in total.jl + ## Not quite same from R as it rounds of `total`, so division results in difference + # > svymean(~api00,dclus1) + # mean SE + # api00 644.17 23.542 + gdf = groupby(design.data, design.cluster) + ȳₜ = combine(gdf, x => mean => :mean).mean + Nₜ = first(design.data[!,design.popsize]) + nₜ = first(design.data[!,design.sampsize]) + Ȳ = mean(ȳₜ) + s²ₜ = var(ȳₜ) + VȲ = (1 - nₜ/Nₜ) * s²ₜ / nₜ + return DataFrame(mean = Ȳ, SE = sqrt(VȲ)) +end + """ mean(x, by, design) diff --git a/src/show.jl b/src/show.jl index 0daffe24..1a7f2d0b 100644 --- a/src/show.jl +++ b/src/show.jl @@ -46,4 +46,19 @@ function Base.show(io::IO, ::MIME"text/plain", design::StratifiedSample) printinfo(io, "sampsize", makeshort(design.data.sampsize)) printinfo(io, "sampfraction", makeshort(design.data.sampfraction)) printinfo(io, "ignorefpc", string(design.ignorefpc); newline=false) +end + +"Print information about a survey design." +function Base.show(io::IO, ::MIME"text/plain", design::OneStageClusterSample) + type = typeof(design) + printstyled(io, "$type:\n"; bold=true) + printstyled(io, "data: "; bold=true) + println(io, size(design.data, 1), "x", size(design.data, 2), " DataFrame") + printinfo(io, "cluster", string(design.cluster); newline=true) + printinfo(io, "popsize", string(design.popsize); newline=true) + printinfo(io, "sampsize", string(design.sampsize); newline=true) + printinfo(io, "strata", makeshort(design.data.strata)) + printinfo(io, "weights", makeshort(design.data.weights)) + printinfo(io, "probs", makeshort(design.data.probs)) + printinfo(io, "allprobs", makeshort(design.data.allprobs)) end \ No newline at end of file diff --git a/src/total.jl b/src/total.jl index be8879cb..c6ad99f7 100644 --- a/src/total.jl +++ b/src/total.jl @@ -3,6 +3,8 @@ Estimate the population total for the variable specified by `x`. +For OneStageClusterSample, formula adapted from Sarndal pg129, section 4.2.2 Simple Random Cluster Sampling + ```jldoctest julia> using Survey; @@ -86,6 +88,17 @@ function total(x::Vector{Symbol}, design::AbstractSurveyDesign) return df end +function total(x::Symbol, design::OneStageClusterSample) + gdf = groupby(design.data, design.cluster) + ŷₜ = combine(gdf, x => sum => :sum).sum + Nₜ = first(design.data[!,design.popsize]) + Ȳ = Nₜ * mean(ŷₜ) + nₜ = first(design.data[!,design.sampsize]) + s²ₜ = var(ŷₜ) + VȲ = Nₜ^2 * (1 - nₜ/Nₜ) * s²ₜ / nₜ + return DataFrame(mean = Ȳ, SE = sqrt(VȲ)) +end + """ total(x, by, design) diff --git a/test/SurveyDesign.jl b/test/SurveyDesign.jl index fa882a31..6065aeb9 100644 --- a/test/SurveyDesign.jl +++ b/test/SurveyDesign.jl @@ -163,4 +163,33 @@ end #should throw error because sampsize > popsize apistrat = copy(apistrat_original) @test_throws ErrorException StratifiedSample(apistrat, :stype; popsize= :pw, sampsize=:fpc) -end \ No newline at end of file +end + +@testset "OneStageClusterSample" begin + # Load API datasets + apiclus1_original = load_data("apiclus1") + apiclus1_original[!, :pw] = fill(757/15,(size(apiclus1_original,1),)) # Correct api mistake for pw column + ############################## + # one-stage cluster sample + apiclus1 = copy(apiclus1_original) + dclus1 = OneStageClusterSample(apiclus1, :dnum, :fpc) + @test dclus1.data[!,:weights] ≈ fill(50.4667,size(apiclus1,1)) atol = 1e-3 + @test dclus1.data[!,dclus1.sampsize] ≈ fill(15,size(apiclus1,1)) + @test dclus1.data[!,:allprobs] ≈ dclus1.data[!,:probs] atol = 1e-4 +end + +# @testset "ClusterSample" begin +# # # Load API datasets +# # apiclus1_original = load_data("apiclus1") +# # apiclus1_original[!, :pw] = fill(757/15,(size(apiclus1_original,1),)) # Correct api mistake for pw column +# # apiclus2_original = load_data("apiclus2") +# ############################## +# ### TODO when they are implemented +# # one-stage cluster sample +# # apiclus1 = copy(apiclus1_original) +# # dclus2 = ClusterSample(apiclus1, :dnum, :fpc) +# # # two-stage cluster sample +# # dclus2 = ClusterSample(apiclus2, [:dnum,:snum], [:fpc1,:fpc2]) +# # # two-stage `with replacement' +# # dclus2wr = ClusterSample(apiclus2, [:dnum,:snum]; weights=:pw) +# end \ No newline at end of file diff --git a/test/total.jl b/test/total.jl index 80e36cba..7714e11b 100644 --- a/test/total.jl +++ b/test/total.jl @@ -92,3 +92,15 @@ end # TODO: add functionality in `src/total.jl` # TODO: add tests end + +@testset "total_OneStageClusterSample" begin + # Load API datasets + apiclus1_original = load_data("apiclus1") + apiclus1_original[!, :pw] = fill(757/15,(size(apiclus1_original,1),)) # Correct api mistake for pw column + ############################## + # one-stage cluster sample + apiclus1 = copy(apiclus1_original) + dclus1 = OneStageClusterSample(apiclus1, :dnum, :fpc) + @test total(:api00,dclus1).mean[1] ≈ 5949162 atol = 1 + @test total(:api00,dclus1).SE[1] ≈ 1339481 atol = 1 +end \ No newline at end of file