From 7a87768b2a989ed91501e46b2ba98a36d220dde5 Mon Sep 17 00:00:00 2001
From: smishr <43640926+smishr@users.noreply.github.com>
Date: Tue, 20 Dec 2022 21:21:34 +0530
Subject: [PATCH 1/2] Add SingleStage Design

---
 src/Survey.jl        |  1 +
 src/SurveyDesign.jl  | 63 ++++++++++++++++++++++++++++++++++++++++++++
 src/show.jl          | 20 ++++++++++++++
 test/SurveyDesign.jl | 20 ++++++++++++++
 4 files changed, 104 insertions(+)

diff --git a/src/Survey.jl b/src/Survey.jl
index b7638233..7758fc2f 100644
--- a/src/Survey.jl
+++ b/src/Survey.jl
@@ -28,6 +28,7 @@ include("bootstrap.jl")
 export load_data
 export AbstractSurveyDesign, SimpleRandomSample, StratifiedSample
 export OneStageClusterSample
+export SingleStageSurveyDesign
 export dim, colnames, dimnames
 export mean, total, quantile
 export plot
diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl
index 5fb91c96..9a1f229e 100644
--- a/src/SurveyDesign.jl
+++ b/src/SurveyDesign.jl
@@ -368,3 +368,66 @@ struct OneStageClusterSample <: AbstractSurveyDesign
     end
 end
 
+"""
+    SingleStageSurveyDesign <: AbstractSurveyDesign
+
+Survey design described by a cluster column, a strata column and sampling weights.
+The constructor adds `:sampsize`, `:popsize`, `:probs` and `:allprobs` columns to `data`.
+Assumes each individual in one and only one cluster; disjoint and nested clusters.
+`cluster`, `strata` and `weights` (or `probs`) must be Symbol names of columns in `data`.
+# Arguments:
+`data::AbstractDataFrame`: the survey dataset (!this gets modified by the constructor).
+`cluster::Symbol`: the clustering variable - must be given as a column in `data`.
+`strata::Symbol`: the stratification variable - must be given as a column in `data`.
+`weights::Symbol`: the sampling weights - must be a numeric column in `data`.
+
+"""
+struct SingleStageSurveyDesign <: AbstractSurveyDesign
+    data::AbstractDataFrame
+    cluster::Symbol
+    strata::Symbol
+    popsize::Symbol
+    sampsize::Symbol
+    weights::Symbol
+    pps::Bool
+    has_strata::Bool
+    # Weights constructor: derives the sampsize, popsize, probs and allprobs columns
+    function SingleStageSurveyDesign(data::AbstractDataFrame; cluster::Symbol, strata::Union{Nothing,Symbol}=nothing, weights::Union{Nothing,Symbol}=nothing)
+        # The `nothing` defaults exist only so that an omitted keyword reaches this check
+        if isnothing(strata) || isnothing(weights)
+            error("must specify cluster, strata and weights")
+        end
+        # sampsize here is number of clusters completely sampled, popsize is total clusters in population
+        # Checking the eltype accepts every numeric column type, not only a plain `Vector`
+        if !(eltype(data[!, weights]) <: Real)
+            error(string("given weights column ", weights, " is not of numeric type"))
+        end
+        # For one-stage sample only one sampsize vector
+        sampsize_labels = :sampsize
+        data_groupedby_cluster = groupby(data, [cluster, strata])
+        ################
+        # TODO: This is not the same as sampsize in R
+        data[!, sampsize_labels] = fill(length(data_groupedby_cluster), nrow(data))
+        ###############
+        popsize = :popsize
+        data[!, popsize] = data[!, weights] .* data[!, sampsize_labels]
+        data[!, :probs] = 1 ./ data[!, weights] # Many formulae are easily defined in terms of sampling probabilties
+        # NOTE(review): `:allprobs` is assigned the same vector as `:probs`, not a copy - confirm intended
+        data[!, :allprobs] = data[!, :probs] # In one-stage cluster sample, allprobs is just probs, no multiplication needed
+        pps = false
+        # `strata` is mandatory above, so this is currently always true
+        has_strata = !isnothing(strata)
+        return new(data, cluster, strata, popsize, sampsize_labels, weights, pps, has_strata)
+    end
+    # Constructor for probs, just calls weights constructor where weight=1/probs
+    # NOTE: writes the derived weights into a `:weights` column of `data`, replacing any existing one
+    function SingleStageSurveyDesign(data::AbstractDataFrame, cluster::Symbol; strata::Union{Nothing,Symbol}=nothing, probs::Union{Nothing,Symbol}=nothing)
+        # Return error if any keyword empty
+        if isnothing(strata) || isnothing(probs)
+            error("must specify cluster, strata and probs")
+        end
+        weights_label = :weights
+        data[!, weights_label] = 1 ./ data[!, probs]
+        return SingleStageSurveyDesign(data; cluster, strata, weights=weights_label)
+    end
+end
\ No newline at end of file
diff --git a/src/show.jl b/src/show.jl
index 1a7f2d0b..49f88ff2 100644
--- a/src/show.jl
+++ b/src/show.jl
@@ -61,4 +61,24 @@ function Base.show(io::IO, ::MIME"text/plain", design::OneStageClusterSample)
     printinfo(io, "weights", makeshort(design.data.weights))
     printinfo(io, "probs", makeshort(design.data.probs))
     printinfo(io, "allprobs", makeshort(design.data.allprobs))
+end
+
+"Print information about a survey design."
+function Base.show(io::IO, ::MIME"text/plain", design::SingleStageSurveyDesign)
+    type = typeof(design)
+    printstyled(io, "$type:\n"; bold=true)
+    printstyled(io, "data: "; bold=true)
+    println(io, size(design.data, 1), "x", size(design.data, 2), " DataFrame")
+    printinfo(io, "cluster", string(design.cluster); newline=true)
+    printinfo(io, "design.data[!,design.cluster]", makeshort(design.data[!,design.cluster]))
+    printinfo(io, "strata", string(design.strata); newline=true)
+    printinfo(io, "design.data[!,design.strata]", makeshort(design.data[!,design.strata]))
+    printinfo(io, "popsize", string(design.popsize); newline=true)
+    printinfo(io, "design.data[!,design.popsize]", makeshort(design.data[!,design.popsize]))
+    printinfo(io, "sampsize", string(design.sampsize); newline=true)
+    printinfo(io, "design.data[!,design.sampsize]", makeshort(design.data[!,design.sampsize]))
+    printinfo(io, "weights", string(design.weights); newline=true)
+    printinfo(io, "design.data[!,design.weights]", makeshort(design.data[!,design.weights]))
+    printinfo(io, "design.data[!,:probs]", makeshort(design.data.probs))
+    printinfo(io, "design.data[!,:allprobs]", makeshort(design.data.allprobs))
 end
\ No newline at end of file
diff --git a/test/SurveyDesign.jl b/test/SurveyDesign.jl
index 6065aeb9..21b38e72 100644
--- a/test/SurveyDesign.jl
+++ b/test/SurveyDesign.jl
@@ -178,6 +178,20 @@
     end
     @test dclus1.data[!,:allprobs] ≈ dclus1.data[!,:probs] atol = 1e-4
 end
 
+@testset "SingleStageSurveyDesign" begin
+    # Load API datasets
+    yrbs_original = load_data("yrbs")
+    nhanes_original = load_data("nhanes")
+    ##############################
+    # NHANES
+    nhanes = copy(nhanes_original)
+    dnhanes = SingleStageSurveyDesign(nhanes; cluster = :SDMVPSU, strata=:SDMVSTRA, weights=:WTMEC2YR)
+    ##############################
+    # YRBS
+    yrbs = copy(yrbs_original)
+    dyrbs = SingleStageSurveyDesign(yrbs; cluster = :psu, strata=:stratum, weights=:weight)
+end
+
 # @testset "ClusterSample" begin
 #     # # Load API datasets
 #     # apiclus1_original = load_data("apiclus1")
@@ -192,4 +206,10 @@ end
 #     # dclus2 = ClusterSample(apiclus2, [:dnum,:snum], [:fpc1,:fpc2])
 #     # # two-stage `with replacement'
 #     # dclus2wr = ClusterSample(apiclus2, [:dnum,:snum]; weights=:pw)
+# end
+# @testset "mu284" begin
+#     ##############################
+#     mu284_original = load_data("mu284")
+#     mu284 = copy(mu284_original)
+#     dmu284 = SingleStageSurveyDesign(mu284; cluster = :psu, strata=:stratum, weights=:weight)
 # end
\ No newline at end of file

From 33a48b088881af59c186228a0062f5f84d1fe67d Mon Sep 17 00:00:00 2001
From: smishr <43640926+smishr@users.noreply.github.com>
Date: Tue, 20 Dec 2022 21:29:56 +0530
Subject: [PATCH 2/2] Resolve conflict

---
 src/show.jl | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/src/show.jl b/src/show.jl
index c39dd978..46aa57f6 100644
--- a/src/show.jl
+++ b/src/show.jl
@@ -85,22 +85,4 @@ function Base.show(io::IO, ::MIME"text/plain", design::SingleStageSurveyDesign)
     printinfo(io, "design.data[!,design.weights]", makeshort(design.data[!,design.weights]))
     printinfo(io, "design.data[!,:probs]", makeshort(design.data.probs))
     printinfo(io, "design.data[!,:allprobs]", makeshort(design.data.allprobs))
-end
-"Print information about a survey design."
-function Base.show(io::IO, ::MIME"text/plain", design::OneStageClusterSample)
-    type = typeof(design)
-    printstyled(io, "$type:\n"; bold=true)
-    printstyled(io, "data: "; bold=true)
-    println(io, size(design.data, 1), "x", size(design.data, 2), " DataFrame")
-    printinfo(io, "cluster", string(design.cluster); newline=true)
-    printinfo(io, "design.data[!,design.cluster]", makeshort(design.data[!,design.cluster]))
-    printinfo(io, "popsize", string(design.popsize); newline=true)
-    printinfo(io, "design.data[!,design.popsize]", makeshort(design.data[!,design.popsize]))
-    printinfo(io, "sampsize", string(design.sampsize); newline=true)
-    printinfo(io, "design.data[!,design.sampsize]", makeshort(design.data[!,design.sampsize]))
-    printinfo(io, "weights", string(design.weights); newline=true)
-    printinfo(io, "design.data[!,design.weights]", makeshort(design.data[!,design.weights]))
-    printinfo(io, "design.data[!,:strata]", makeshort(design.data[!,:strata]))
-    printinfo(io, "design.data[!,:probs]", makeshort(design.data.probs))
-    printinfo(io, "design.data[!,:allprobs]", makeshort(design.data.allprobs))
 end
\ No newline at end of file