Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: SingleStageSurveyDesign for one cluster, one strata and one weights #155

Merged
merged 4 commits into from
Jan 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/Survey.jl
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ include("ratio.jl")
export load_data
export AbstractSurveyDesign, SimpleRandomSample, StratifiedSample
export OneStageClusterSample
export SingleStageSurveyDesign
export dim, colnames, dimnames
export mean, total, quantile
export plot
Expand Down
64 changes: 64 additions & 0 deletions src/SurveyDesign.jl
Original file line number Diff line number Diff line change
Expand Up @@ -427,4 +427,68 @@ struct OneStageClusterSample <: AbstractSurveyDesign
has_strata = false
new(data, cluster, popsize, sampsize_labels, weights, pps, has_strata)
end
end

"""
    SingleStageSurveyDesign <: AbstractSurveyDesign

Survey design with a single stage of sampling, described by one cluster variable,
one strata variable and one sampling-weights variable.
Assumes each individual is in one and only one cluster; clusters are disjoint.
`cluster`, `strata` and `weights` must each be specified as the Symbol name of a column in `data`.

# Arguments:
- `data::AbstractDataFrame`: the survey dataset (!this gets modified by the constructor, which
  adds the columns `:sampsize`, `:popsize`, `:probs` and `:allprobs`).
- `cluster::Symbol`: the clustering variable - must be given as a column in `data`.
- `strata::Symbol`: the stratification variable - must be given as a column in `data`.
- `weights::Symbol`: the sampling weights - must be given as a numeric column in `data`.
- `probs::Symbol`: the sampling probabilities; accepted instead of `weights` by the method that
  takes `cluster` as a positional argument (the weights are then computed as `1 ./ probs`).

"""
struct SingleStageSurveyDesign <: AbstractSurveyDesign
    data::AbstractDataFrame
    # The remaining Symbol fields are names of columns in `data`.
    cluster::Symbol
    strata::Symbol
    popsize::Symbol
    sampsize::Symbol
    weights::Symbol
    pps::Bool
    has_strata::Bool

    # Weights constructor.
    #
    # `cluster`, `strata` and `weights` name columns of `data`; all three are required.
    # `strata` and `weights` accept `nothing` only so that omitting them produces the
    # explicit ArgumentError below instead of a TypeError from the keyword type assertion.
    # Mutates `data`: adds/overwrites the columns :sampsize, :popsize, :probs, :allprobs.
    # Throws ErrorException when the weights column is not a `Vector{<:Real}`.
    function SingleStageSurveyDesign(
        data::AbstractDataFrame;
        cluster::Symbol,
        strata::Union{Nothing,Symbol}=nothing,
        weights::Union{Nothing,Symbol}=nothing,
    )
        if isnothing(strata) || isnothing(weights)
            throw(ArgumentError("must specify cluster, strata and weights"))
        end
        if !(data[!, weights] isa Vector{<:Real})
            error(string("given weights column ", weights, " is not of numeric type"))
        end
        # For one-stage sample only one sampsize vector
        sampsize_labels = :sampsize
        data_groupedby_cluster = groupby(data, [cluster, strata])
        ################
        # TODO: This is not the same as sampsize in R
        # Every row receives the number of (cluster, strata) groups.
        data[!, sampsize_labels] = fill(size(data_groupedby_cluster, 1), (nrow(data),))
        ###############
        popsize = :popsize
        data[!, popsize] = data[!, weights] .* data[!, sampsize_labels]
        # Many formulae are easily defined in terms of sampling probabilities
        data[!, :probs] = 1 ./ data[!, weights]
        # With a single stage, allprobs is just probs, no multiplication needed
        data[!, :allprobs] = data[!, :probs]
        pps = false
        # `strata` is mandatory above, so this is currently always true.
        has_strata = !isnothing(strata)
        return new(
            data, cluster, strata, popsize, sampsize_labels, weights, pps, has_strata
        )
    end

    # Probabilities constructor: stores `1 ./ probs` in the column :weights of `data`
    # (overwriting any existing column of that name) and delegates to the weights
    # constructor. `strata` and `probs` are required.
    function SingleStageSurveyDesign(
        data::AbstractDataFrame,
        cluster::Symbol;
        strata::Union{Nothing,Symbol}=nothing,
        probs::Union{Nothing,Symbol}=nothing,
    )
        if isnothing(strata) || isnothing(probs)
            throw(ArgumentError("must specify cluster, strata and probs"))
        end
        weights_label = :weights
        data[!, weights_label] = 1 ./ data[!, probs]
        return SingleStageSurveyDesign(
            data; cluster=cluster, strata=strata, weights=weights_label
        )
    end
end
20 changes: 20 additions & 0 deletions src/show.jl
Original file line number Diff line number Diff line change
Expand Up @@ -65,4 +65,24 @@ function Base.show(io::IO, ::MIME"text/plain", design::OneStageClusterSample)
printinfo(io, "design.data[!,:strata]", makeshort(design.data[!,:strata]))
printinfo(io, "design.data[!,:probs]", makeshort(design.data.probs))
printinfo(io, "design.data[!,:allprobs]", makeshort(design.data.allprobs))
end

"Print information about a survey design."
function Base.show(io::IO, ::MIME"text/plain", design::SingleStageSurveyDesign)
    # Header: the concrete design type followed by the dimensions of the data.
    printstyled(io, string(typeof(design), ":\n"); bold=true)
    printstyled(io, "data: "; bold=true)
    nrows, ncols = size(design.data)
    println(io, nrows, "x", ncols, " DataFrame")

    # Each of these fields holds the name of a column in `design.data`; print the
    # column name first, then a shortened view of the column itself.
    for field in (:cluster, :strata, :popsize, :sampsize, :weights)
        column = getproperty(design, field)
        printinfo(io, string(field), string(column); newline=true)
        printinfo(
            io,
            string("design.data[!,design.", field, "]"),
            makeshort(design.data[!, column]),
        )
    end

    # Probability columns live under fixed names in the data.
    printinfo(io, "design.data[!,:probs]", makeshort(design.data[!, :probs]))
    printinfo(io, "design.data[!,:allprobs]", makeshort(design.data[!, :allprobs]))
end
20 changes: 20 additions & 0 deletions test/SurveyDesign.jl
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,20 @@ end
@test dclus1.data[!,:allprobs] ≈ dclus1.data[!,:probs] atol = 1e-4
end

@testset "SingleStageSurveyDesign" begin
    # Load the YRBS and NHANES datasets; designs are built on copies because the
    # constructor adds columns to the data frame it is given.
    yrbs_original = load_data("yrbs")
    nhanes_original = load_data("nhanes")
    derived_columns = (:sampsize, :popsize, :probs, :allprobs)
    ##############################
    # NHANES
    nhanes = copy(nhanes_original)
    dnhanes = SingleStageSurveyDesign(
        nhanes; cluster=:SDMVPSU, strata=:SDMVSTRA, weights=:WTMEC2YR
    )
    @test dnhanes isa SingleStageSurveyDesign
    @test (dnhanes.cluster, dnhanes.strata, dnhanes.weights) ==
        (:SDMVPSU, :SDMVSTRA, :WTMEC2YR)
    @test dnhanes.has_strata
    @test size(dnhanes.data, 1) == size(nhanes_original, 1)
    @test issubset(derived_columns, propertynames(dnhanes.data))
    ##############################
    # YRBS
    yrbs = copy(yrbs_original)
    dyrbs = SingleStageSurveyDesign(yrbs; cluster=:psu, strata=:stratum, weights=:weight)
    @test dyrbs isa SingleStageSurveyDesign
    @test (dyrbs.cluster, dyrbs.strata, dyrbs.weights) == (:psu, :stratum, :weight)
    @test dyrbs.has_strata
    @test size(dyrbs.data, 1) == size(yrbs_original, 1)
    @test issubset(derived_columns, propertynames(dyrbs.data))
end

# @testset "ClusterSample" begin
# # # Load API datasets
# # apiclus1_original = load_data("apiclus1")
Expand All @@ -200,4 +214,10 @@ end
# # dclus2 = ClusterSample(apiclus2, [:dnum,:snum], [:fpc1,:fpc2])
# # # two-stage `with replacement'
# # dclus2wr = ClusterSample(apiclus2, [:dnum,:snum]; weights=:pw)
# end
# @testset "mu284" begin
# ##############################
# mu284_original = load_data("mu284")
# mu284 = copy(mu284_original)
# dmu284 = SingleStageSurveyDesign(mu284; cluster = :psu, strata=:stratum, weights=:weight)
# end