Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

One stage cluster sample #146

Merged
merged 12 commits into from
Dec 16, 2022
5 changes: 1 addition & 4 deletions src/Survey.jl
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,11 @@ include("show.jl")

export load_data
export AbstractSurveyDesign, SimpleRandomSample, StratifiedSample
export SurveyDesign
export by
export ht_calc
export OneStageClusterSample
export dim, colnames, dimnames
export mean, total, quantile
export plot
export hist, sturges, freedman_diaconis
export boxplot
export ht_total, ht_mean

end
49 changes: 48 additions & 1 deletion src/SurveyDesign.jl
Original file line number Diff line number Diff line change
Expand Up @@ -320,4 +320,51 @@ struct StratifiedSample <: AbstractSurveyDesign
data[!, :sampfraction] = sampfraction
new(data, strata, ignorefpc)
end
end
end

"""
OneStageClusterSample <: AbstractSurveyDesign

Survey design sampled by one stage cluster sampling.
Clusters chosen by SRS followed by complete sampling of selected clusters.
Assumes each individual in one and only one cluster; disjoint and nested clusters.

`cluster` must be specified as a Symbol name of a column in `data`.

# Arguments:
`data::AbstractDataFrame`: the survey dataset (!this gets modified by the constructor).
`cluster::Symbol`: the stratification variable - must be given as a column in `data`.
`popsize::Union{Nothing,Symbol,<:Unsigned,Vector{<:Real}}=nothing`: the (expected) survey population size. For

`weights::Union{Nothing,Symbol,Vector{<:Real}}=nothing`: the sampling weights.
"""
struct OneStageClusterSample <: AbstractSurveyDesign
data::AbstractDataFrame
cluster::Symbol
popsize::Symbol
sampsize::Symbol
pps::Bool
has_strata::Bool
# Single stage cluster sample, like apiclus1
function OneStageClusterSample(data::AbstractDataFrame, cluster::Symbol, popsize::Symbol; kwargs...) # Right now kwargs does nothing, for expansion
# sampsize here is number of clusters completely sampled, popsize is total clusters in population
if !(typeof(data[!, popsize]) <: Vector{<:Real})
error(string("given popsize column ", popsize , " is not of numeric type"))
end
if !all(w -> w == first(data[!, popsize]), data[!, popsize])
error("popsize must be same for all observations within the cluster in ClusterSample")
end
# For one-stage sample only one sampsize vector
sampsize_labels = :sampsize
data_groupedby_cluster = groupby(data, cluster)
data[!, sampsize_labels] = fill(size(data_groupedby_cluster, 1),(nrow(data),))
data[!, :weights] = data[!, popsize] ./ data[!, sampsize_labels]
data[!, :probs] = 1 ./ data[!, :weights] # Many formulae are easily defined in terms of sampling probabilties
data[!, :allprobs] = data[!, :probs] # In one-stage cluster sample, allprobs is just probs, no multiplication needed
data[!, :strata] = ones(nrow(data))
pps = false
has_strata = false
new(data, cluster, popsize, sampsize_labels, pps, has_strata)
end
end

18 changes: 18 additions & 0 deletions src/mean.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ Estimate the population mean of a variable of a simple random sample, and the co
The calculations were done according to the book [Sampling Techniques](https://www.academia.edu/29684662/Cochran_1977_Sampling_Techniques_Third_Edition)
by William Cochran.

For OneStageClusterSample, formula adapted from Sarndal pg129, section 4.2.2 Simple Random Cluster Sampling

```jldoctest
julia> apisrs = load_data("apisrs");

Expand Down Expand Up @@ -90,6 +92,22 @@ function mean(x::Symbol, design::StratifiedSample)
return DataFrame(mean=Ȳ̂, SE=SE)
end

function mean(x::Symbol, design::OneStageClusterSample)
## Based on logical translation of corresponding in total.jl
## Not quite same from R as it rounds of `total`, so division results in difference
# > svymean(~api00,dclus1)
# mean SE
# api00 644.17 23.542
gdf = groupby(design.data, design.cluster)
ȳₜ = combine(gdf, x => mean => :mean).mean
Nₜ = first(design.data[!,design.popsize])
nₜ = first(design.data[!,design.sampsize])
Ȳ = mean(ȳₜ)
s²ₜ = var(ȳₜ)
VȲ = (1 - nₜ/Nₜ) * s²ₜ / nₜ
return DataFrame(mean = Ȳ, SE = sqrt(VȲ))
end

"""
mean(x, by, design)

Expand Down
15 changes: 15 additions & 0 deletions src/show.jl
Original file line number Diff line number Diff line change
Expand Up @@ -46,4 +46,19 @@ function Base.show(io::IO, ::MIME"text/plain", design::StratifiedSample)
printinfo(io, "sampsize", makeshort(design.data.sampsize))
printinfo(io, "sampfraction", makeshort(design.data.sampfraction))
printinfo(io, "ignorefpc", string(design.ignorefpc); newline=false)
end

"Print information about a survey design."
function Base.show(io::IO, ::MIME"text/plain", design::OneStageClusterSample)
type = typeof(design)
printstyled(io, "$type:\n"; bold=true)
printstyled(io, "data: "; bold=true)
println(io, size(design.data, 1), "x", size(design.data, 2), " DataFrame")
printinfo(io, "cluster", string(design.cluster); newline=true)
printinfo(io, "popsize", string(design.popsize); newline=true)
printinfo(io, "sampsize", string(design.sampsize); newline=true)
printinfo(io, "strata", makeshort(design.data.strata))
printinfo(io, "weights", makeshort(design.data.weights))
printinfo(io, "probs", makeshort(design.data.probs))
printinfo(io, "allprobs", makeshort(design.data.allprobs))
end
13 changes: 13 additions & 0 deletions src/total.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

Estimate the population total for the variable specified by `x`.

For OneStageClusterSample, formula adapted from Sarndal pg129, section 4.2.2 Simple Random Cluster Sampling

```jldoctest
julia> using Survey;

Expand Down Expand Up @@ -86,6 +88,17 @@ function total(x::Vector{Symbol}, design::AbstractSurveyDesign)
return df
end

function total(x::Symbol, design::OneStageClusterSample)
gdf = groupby(design.data, design.cluster)
ŷₜ = combine(gdf, x => sum => :sum).sum
Nₜ = first(design.data[!,design.popsize])
Ȳ = Nₜ * mean(ŷₜ)
nₜ = first(design.data[!,design.sampsize])
s²ₜ = var(ŷₜ)
VȲ = Nₜ^2 * (1 - nₜ/Nₜ) * s²ₜ / nₜ
return DataFrame(mean = Ȳ, SE = sqrt(VȲ))
end

"""
total(x, by, design)

Expand Down
31 changes: 30 additions & 1 deletion test/SurveyDesign.jl
Original file line number Diff line number Diff line change
Expand Up @@ -163,4 +163,33 @@ end
#should throw error because sampsize > popsize
apistrat = copy(apistrat_original)
@test_throws ErrorException StratifiedSample(apistrat, :stype; popsize= :pw, sampsize=:fpc)
end
end

@testset "OneStageClusterSample" begin
# Load API datasets
apiclus1_original = load_data("apiclus1")
apiclus1_original[!, :pw] = fill(757/15,(size(apiclus1_original,1),)) # Correct api mistake for pw column
##############################
# one-stage cluster sample
apiclus1 = copy(apiclus1_original)
dclus1 = OneStageClusterSample(apiclus1, :dnum, :fpc)
@test dclus1.data[!,:weights] ≈ fill(50.4667,size(apiclus1,1)) atol = 1e-3
@test dclus1.data[!,dclus1.sampsize] ≈ fill(15,size(apiclus1,1))
@test dclus1.data[!,:allprobs] ≈ dclus1.data[!,:probs] atol = 1e-4
end

# @testset "ClusterSample" begin
# # # Load API datasets
# # apiclus1_original = load_data("apiclus1")
# # apiclus1_original[!, :pw] = fill(757/15,(size(apiclus1_original,1),)) # Correct api mistake for pw column
# # apiclus2_original = load_data("apiclus2")
# ##############################
# ### TODO when they are implemented
# # one-stage cluster sample
# # apiclus1 = copy(apiclus1_original)
# # dclus2 = ClusterSample(apiclus1, :dnum, :fpc)
# # # two-stage cluster sample
# # dclus2 = ClusterSample(apiclus2, [:dnum,:snum], [:fpc1,:fpc2])
# # # two-stage `with replacement'
# # dclus2wr = ClusterSample(apiclus2, [:dnum,:snum]; weights=:pw)
# end
12 changes: 12 additions & 0 deletions test/total.jl
Original file line number Diff line number Diff line change
Expand Up @@ -92,3 +92,15 @@ end
# TODO: add functionality in `src/total.jl`
# TODO: add tests
end

@testset "total_OneStageClusterSample" begin
# Load API datasets
apiclus1_original = load_data("apiclus1")
apiclus1_original[!, :pw] = fill(757/15,(size(apiclus1_original,1),)) # Correct api mistake for pw column
##############################
# one-stage cluster sample
apiclus1 = copy(apiclus1_original)
dclus1 = OneStageClusterSample(apiclus1, :dnum, :fpc)
@test total(:api00,dclus1).mean[1] ≈ 5949162 atol = 1
@test total(:api00,dclus1).SE[1] ≈ 1339481 atol = 1
end