Skip to content

Commit

Permalink
Merge pull request #146 from smishr/one_stage_cluster_sample
Browse files Browse the repository at this point in the history
One stage cluster sample
  • Loading branch information
smishr authored Dec 16, 2022
2 parents 03d2832 + 675dc89 commit 9ad29fc
Show file tree
Hide file tree
Showing 7 changed files with 137 additions and 6 deletions.
5 changes: 1 addition & 4 deletions src/Survey.jl
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,11 @@ include("show.jl")

export load_data
export AbstractSurveyDesign, SimpleRandomSample, StratifiedSample
export SurveyDesign
export by
export ht_calc
export OneStageClusterSample
export dim, colnames, dimnames
export mean, total, quantile
export plot
export hist, sturges, freedman_diaconis
export boxplot
export ht_total, ht_mean

end
49 changes: 48 additions & 1 deletion src/SurveyDesign.jl
Original file line number Diff line number Diff line change
Expand Up @@ -320,4 +320,51 @@ struct StratifiedSample <: AbstractSurveyDesign
data[!, :sampfraction] = sampfraction
new(data, strata, ignorefpc)
end
end
end

"""
OneStageClusterSample <: AbstractSurveyDesign
Survey design sampled by one stage cluster sampling.
Clusters chosen by SRS followed by complete sampling of selected clusters.
Assumes each individual in one and only one cluster; disjoint and nested clusters.
`cluster` must be specified as a Symbol name of a column in `data`.
# Arguments:
`data::AbstractDataFrame`: the survey dataset (!this gets modified by the constructor).
`cluster::Symbol`: the stratification variable - must be given as a column in `data`.
`popsize::Union{Nothing,Symbol,<:Unsigned,Vector{<:Real}}=nothing`: the (expected) survey population size. For
`weights::Union{Nothing,Symbol,Vector{<:Real}}=nothing`: the sampling weights.
"""
struct OneStageClusterSample <: AbstractSurveyDesign
data::AbstractDataFrame
cluster::Symbol
popsize::Symbol
sampsize::Symbol
pps::Bool
has_strata::Bool
# Single stage cluster sample, like apiclus1
function OneStageClusterSample(data::AbstractDataFrame, cluster::Symbol, popsize::Symbol; kwargs...) # Right now kwargs does nothing, for expansion
# sampsize here is number of clusters completely sampled, popsize is total clusters in population
if !(typeof(data[!, popsize]) <: Vector{<:Real})
error(string("given popsize column ", popsize , " is not of numeric type"))
end
if !all(w -> w == first(data[!, popsize]), data[!, popsize])
error("popsize must be same for all observations within the cluster in ClusterSample")
end
# For one-stage sample only one sampsize vector
sampsize_labels = :sampsize
data_groupedby_cluster = groupby(data, cluster)
data[!, sampsize_labels] = fill(size(data_groupedby_cluster, 1),(nrow(data),))
data[!, :weights] = data[!, popsize] ./ data[!, sampsize_labels]
data[!, :probs] = 1 ./ data[!, :weights] # Many formulae are easily defined in terms of sampling probabilties
data[!, :allprobs] = data[!, :probs] # In one-stage cluster sample, allprobs is just probs, no multiplication needed
data[!, :strata] = ones(nrow(data))
pps = false
has_strata = false
new(data, cluster, popsize, sampsize_labels, pps, has_strata)
end
end

18 changes: 18 additions & 0 deletions src/mean.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ Estimate the population mean of a variable of a simple random sample, and the co
The calculations were done according to the book [Sampling Techniques](https://www.academia.edu/29684662/Cochran_1977_Sampling_Techniques_Third_Edition)
by William Cochran.
For OneStageClusterSample, formula adapted from Sarndal pg129, section 4.2.2 Simple Random Cluster Sampling
```jldoctest
julia> apisrs = load_data("apisrs");
Expand Down Expand Up @@ -90,6 +92,22 @@ function mean(x::Symbol, design::StratifiedSample)
return DataFrame(mean=Ȳ̂, SE=SE)
end

function mean(x::Symbol, design::OneStageClusterSample)
## Based on logical translation of corresponding in total.jl
## Not quite same from R as it rounds of `total`, so division results in difference
# > svymean(~api00,dclus1)
# mean SE
# api00 644.17 23.542
gdf = groupby(design.data, design.cluster)
ȳₜ = combine(gdf, x => mean => :mean).mean
Nₜ = first(design.data[!,design.popsize])
nₜ = first(design.data[!,design.sampsize])
= mean(ȳₜ)
s²ₜ = var(ȳₜ)
VȲ = (1 - nₜ/Nₜ) * s²ₜ / nₜ
return DataFrame(mean = Ȳ, SE = sqrt(VȲ))
end

"""
mean(x, by, design)
Expand Down
15 changes: 15 additions & 0 deletions src/show.jl
Original file line number Diff line number Diff line change
Expand Up @@ -46,4 +46,19 @@ function Base.show(io::IO, ::MIME"text/plain", design::StratifiedSample)
printinfo(io, "sampsize", makeshort(design.data.sampsize))
printinfo(io, "sampfraction", makeshort(design.data.sampfraction))
printinfo(io, "ignorefpc", string(design.ignorefpc); newline=false)
end

"Print information about a survey design."
function Base.show(io::IO, ::MIME"text/plain", design::OneStageClusterSample)
type = typeof(design)
printstyled(io, "$type:\n"; bold=true)
printstyled(io, "data: "; bold=true)
println(io, size(design.data, 1), "x", size(design.data, 2), " DataFrame")
printinfo(io, "cluster", string(design.cluster); newline=true)
printinfo(io, "popsize", string(design.popsize); newline=true)
printinfo(io, "sampsize", string(design.sampsize); newline=true)
printinfo(io, "strata", makeshort(design.data.strata))
printinfo(io, "weights", makeshort(design.data.weights))
printinfo(io, "probs", makeshort(design.data.probs))
printinfo(io, "allprobs", makeshort(design.data.allprobs))
end
13 changes: 13 additions & 0 deletions src/total.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
Estimate the population total for the variable specified by `x`.
For OneStageClusterSample, formula adapted from Sarndal pg129, section 4.2.2 Simple Random Cluster Sampling
```jldoctest
julia> using Survey;
Expand Down Expand Up @@ -86,6 +88,17 @@ function total(x::Vector{Symbol}, design::AbstractSurveyDesign)
return df
end

function total(x::Symbol, design::OneStageClusterSample)
gdf = groupby(design.data, design.cluster)
ŷₜ = combine(gdf, x => sum => :sum).sum
Nₜ = first(design.data[!,design.popsize])
= Nₜ * mean(ŷₜ)
nₜ = first(design.data[!,design.sampsize])
s²ₜ = var(ŷₜ)
VȲ = Nₜ^2 * (1 - nₜ/Nₜ) * s²ₜ / nₜ
return DataFrame(mean = Ȳ, SE = sqrt(VȲ))
end

"""
total(x, by, design)
Expand Down
31 changes: 30 additions & 1 deletion test/SurveyDesign.jl
Original file line number Diff line number Diff line change
Expand Up @@ -163,4 +163,33 @@ end
#should throw error because sampsize > popsize
apistrat = copy(apistrat_original)
@test_throws ErrorException StratifiedSample(apistrat, :stype; popsize= :pw, sampsize=:fpc)
end
end

@testset "OneStageClusterSample" begin
# Load API datasets
apiclus1_original = load_data("apiclus1")
apiclus1_original[!, :pw] = fill(757/15,(size(apiclus1_original,1),)) # Correct api mistake for pw column
##############################
# one-stage cluster sample
apiclus1 = copy(apiclus1_original)
dclus1 = OneStageClusterSample(apiclus1, :dnum, :fpc)
@test dclus1.data[!,:weights] fill(50.4667,size(apiclus1,1)) atol = 1e-3
@test dclus1.data[!,dclus1.sampsize] fill(15,size(apiclus1,1))
@test dclus1.data[!,:allprobs] dclus1.data[!,:probs] atol = 1e-4
end

# @testset "ClusterSample" begin
# # # Load API datasets
# # apiclus1_original = load_data("apiclus1")
# # apiclus1_original[!, :pw] = fill(757/15,(size(apiclus1_original,1),)) # Correct api mistake for pw column
# # apiclus2_original = load_data("apiclus2")
# ##############################
# ### TODO when they are implemented
# # one-stage cluster sample
# # apiclus1 = copy(apiclus1_original)
# # dclus2 = ClusterSample(apiclus1, :dnum, :fpc)
# # # two-stage cluster sample
# # dclus2 = ClusterSample(apiclus2, [:dnum,:snum], [:fpc1,:fpc2])
# # # two-stage `with replacement'
# # dclus2wr = ClusterSample(apiclus2, [:dnum,:snum]; weights=:pw)
# end
12 changes: 12 additions & 0 deletions test/total.jl
Original file line number Diff line number Diff line change
Expand Up @@ -92,3 +92,15 @@ end
# TODO: add functionality in `src/total.jl`
# TODO: add tests
end

@testset "total_OneStageClusterSample" begin
# Load API datasets
apiclus1_original = load_data("apiclus1")
apiclus1_original[!, :pw] = fill(757/15,(size(apiclus1_original,1),)) # Correct api mistake for pw column
##############################
# one-stage cluster sample
apiclus1 = copy(apiclus1_original)
dclus1 = OneStageClusterSample(apiclus1, :dnum, :fpc)
@test total(:api00,dclus1).mean[1] 5949162 atol = 1
@test total(:api00,dclus1).SE[1] 1339481 atol = 1
end

0 comments on commit 9ad29fc

Please sign in to comment.