From 6346972510f12dfcdfd0ef108008fd8173cdf6b3 Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Tue, 2 May 2023 18:18:02 +0530 Subject: [PATCH 01/21] Adding new structs for difference replicate methods, and exporting them. --- src/Survey.jl | 1 + src/SurveyDesign.jl | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/src/Survey.jl b/src/Survey.jl index 6960cfb5..ced437da 100644 --- a/src/Survey.jl +++ b/src/Survey.jl @@ -29,6 +29,7 @@ include("jackknife.jl") export load_data export AbstractSurveyDesign, SurveyDesign, ReplicateDesign +export BootstrapReplicates, JackknifeReplicates export dim, colnames, dimnames export mean, total, quantile export plot diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index f4c28ab4..776c1fe4 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -123,6 +123,15 @@ struct SurveyDesign <: AbstractSurveyDesign end end + +struct BootstrapReplicates + replicates::UInt +end + +struct JackknifeReplicates + replicates::UInt +end + """ ReplicateDesign <: AbstractSurveyDesign From ca964b16b5bab6e3887829f565194e0f37e0e13b Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Tue, 2 May 2023 19:04:05 +0530 Subject: [PATCH 02/21] Incorporating inference method to the `ReplicateDesign` struct, and adding a new abstract type `InferenceMethod`. --- src/SurveyDesign.jl | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index 776c1fe4..028c2959 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -123,12 +123,13 @@ struct SurveyDesign <: AbstractSurveyDesign end end +abstract type InferenceMethod end -struct BootstrapReplicates +struct BootstrapReplicates <: InferenceMethod replicates::UInt end -struct JackknifeReplicates +struct JackknifeReplicates <: InferenceMethod replicates::UInt end @@ -261,7 +262,7 @@ replicates: 1000 ``` """ -struct ReplicateDesign <: AbstractSurveyDesign +struct ReplicateDesign{ReplicateType} <: AbstractSurveyDesign data::AbstractDataFrame cluster::Symbol popsize::Symbol @@ -273,9 +274,10 @@ struct ReplicateDesign <: AbstractSurveyDesign type::String replicates::UInt replicate_weights::Vector{Symbol} + inference_method::ReplicateType # default constructor - function ReplicateDesign( + function ReplicateDesign{ReplicateType}( data::DataFrame, cluster::Symbol, popsize::Symbol, @@ -286,21 +288,21 @@ struct ReplicateDesign <: AbstractSurveyDesign pps::Bool, type::String, replicates::UInt, - replicate_weights::Vector{Symbol} - ) - new(data, cluster, popsize, sampsize, strata, weights, allprobs, - pps, type, replicates, replicate_weights) + replicate_weights::Vector{Symbol}, + ) where {ReplicateType} + new{ReplicateType}(data, cluster, popsize, sampsize, strata, weights, allprobs, + pps, type, replicates, replicate_weights, ReplicateType(replicates)) end # constructor with given replicate_weights - function ReplicateDesign( + function ReplicateDesign{ReplicateType}( data::AbstractDataFrame, replicate_weights::Vector{Symbol}; clusters::Union{Nothing,Symbol,Vector{Symbol}} = nothing, strata::Union{Nothing,Symbol} = nothing, popsize::Union{Nothing,Symbol} = nothing, weights::Union{Nothing,Symbol} = nothing - ) + ) where {ReplicateType} # rename the replicate weights if needed rename!(data, [replicate_weights[index] => "replicate_"*string(index) for index in 1:length(replicate_weights)]) @@ -312,7 +314,7 @@ struct ReplicateDesign <: AbstractSurveyDesign popsize=popsize, weights=weights ) - new( + new{ReplicateType}( base_design.data, base_design.cluster, base_design.popsize, @@ -323,20 +325,21 @@ struct ReplicateDesign <: AbstractSurveyDesign base_design.pps, "bootstrap", length(replicate_weights), - replicate_weights + replicate_weights, + ReplicateType(length(replicate_weights)) ) end # replicate weights given as a range of columns - ReplicateDesign( + ReplicateDesign{ReplicateType}( data::AbstractDataFrame, replicate_weights::UnitRange{Int}; clusters::Union{Nothing,Symbol,Vector{Symbol}} = nothing, strata::Union{Nothing,Symbol} = nothing, popsize::Union{Nothing,Symbol} = nothing, weights::Union{Nothing,Symbol} = nothing - ) = - ReplicateDesign( + ) where {ReplicateType} = + ReplicateDesign{ReplicateType}( data, Symbol.(names(data)[replicate_weights]); clusters=clusters, @@ -346,15 +349,15 @@ struct ReplicateDesign <: AbstractSurveyDesign ) # replicate weights given as regular expression - ReplicateDesign( + ReplicateDesign{ReplicateType}( data::AbstractDataFrame, replicate_weights::Regex; clusters::Union{Nothing,Symbol,Vector{Symbol}} = nothing, strata::Union{Nothing,Symbol} = nothing, popsize::Union{Nothing,Symbol} = nothing, weights::Union{Nothing,Symbol} = nothing - ) = - ReplicateDesign( + ) where {ReplicateType} = + ReplicateDesign{ReplicateType}( data, Symbol.(names(data)[findall(name -> occursin(replicate_weights, name), names(data))]); clusters=clusters, From be94678a96460ba288f77cb562a2743cf367136b Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Tue, 2 May 2023 19:10:03 +0530 Subject: [PATCH 03/21] Incorporating inference method in `bootweights`. --- src/bootstrap.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/bootstrap.jl b/src/bootstrap.jl index ef66ad9f..63c35162 100644 --- a/src/bootstrap.jl +++ b/src/bootstrap.jl @@ -1,5 +1,5 @@ """ -Use bootweights to create replicate weights using Rao-Wu bootstrap. The function accepts a `SurveyDesign` and returns a `ReplicateDesign` which has additional columns for replicate weights. +Use bootweights to create replicate weights using Rao-Wu bootstrap. The function accepts a `SurveyDesign` and returns a `ReplicateDesign{BootstrapReplicates}` which has additional columns for replicate weights. ```jldoctest julia> using Random @@ -37,7 +37,7 @@ function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwis substrata_dfs[h] = cluster_sorted end df = reduce(vcat, substrata_dfs) - return ReplicateDesign( + return ReplicateDesign{BootstrapReplicates}( df, design.cluster, design.popsize, @@ -48,7 +48,7 @@ function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwis design.pps, "bootstrap", UInt(replicates), - [Symbol("replicate_"*string(replicate)) for replicate in 1:replicates] + [Symbol("replicate_"*string(replicate)) for replicate in 1:replicates], ) end From 084c9005d6d759d6e4673610a3a13b38ef42bc25 Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Tue, 2 May 2023 19:11:40 +0530 Subject: [PATCH 04/21] Incorporating inference method in `jackknifeweights`. --- src/jackknife.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/jackknife.jl b/src/jackknife.jl index 315de348..65a94835 100644 --- a/src/jackknife.jl +++ b/src/jackknife.jl @@ -67,7 +67,7 @@ function jackknifeweights(design::SurveyDesign) end end - return ReplicateDesign( + return ReplicateDesign{JackknifeReplicates}( df, design.cluster, design.popsize, From 662d3562f8f2d46dcc94f39006f90cab81da573e Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Tue, 2 May 2023 19:19:42 +0530 Subject: [PATCH 05/21] Forcing `ReplicateType` to be a subtype of `InferenceMethod`. --- src/SurveyDesign.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index 028c2959..d24ed82a 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -289,7 +289,7 @@ struct ReplicateDesign{ReplicateType} <: AbstractSurveyDesign type::String, replicates::UInt, replicate_weights::Vector{Symbol}, - ) where {ReplicateType} + ) where {ReplicateType <: InferenceMethod} new{ReplicateType}(data, cluster, popsize, sampsize, strata, weights, allprobs, pps, type, replicates, replicate_weights, ReplicateType(replicates)) end @@ -302,7 +302,7 @@ struct ReplicateDesign{ReplicateType} <: AbstractSurveyDesign strata::Union{Nothing,Symbol} = nothing, popsize::Union{Nothing,Symbol} = nothing, weights::Union{Nothing,Symbol} = nothing - ) where {ReplicateType} + ) where {ReplicateType <: InferenceMethod} # rename the replicate weights if needed rename!(data, [replicate_weights[index] => "replicate_"*string(index) for index in 1:length(replicate_weights)]) @@ -338,7 +338,7 @@ struct ReplicateDesign{ReplicateType} <: AbstractSurveyDesign strata::Union{Nothing,Symbol} = nothing, popsize::Union{Nothing,Symbol} = nothing, weights::Union{Nothing,Symbol} = nothing - ) where {ReplicateType} = + ) where {ReplicateType <: InferenceMethod} = ReplicateDesign{ReplicateType}( data, Symbol.(names(data)[replicate_weights]); @@ -356,7 +356,7 @@ struct ReplicateDesign{ReplicateType} <: AbstractSurveyDesign strata::Union{Nothing,Symbol} = nothing, popsize::Union{Nothing,Symbol} = nothing, weights::Union{Nothing,Symbol} = nothing - ) where {ReplicateType} = + ) where {ReplicateType <: InferenceMethod} = ReplicateDesign{ReplicateType}( data, Symbol.(names(data)[findall(name -> occursin(replicate_weights, name), names(data))]); From 95803b23e47efd4bba594c76b7470fe19f7ab799 Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Tue, 2 May 2023 20:06:27 +0530 Subject: [PATCH 06/21] Correcting docstrings in `src/SurveyDesign.jl`. --- src/SurveyDesign.jl | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index d24ed82a..4352abe0 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -136,43 +136,43 @@ end """ ReplicateDesign <: AbstractSurveyDesign -Survey design obtained by replicating an original design using [`bootweights`](@ref). If -replicate weights are available, then they can be used to directly create a `ReplicateDesign`. +Survey design obtained by replicating an original design using an inference method like [`bootweights`](@ref) or [`jackknifeweights`](@ref). If +replicate weights are available, then they can be used to directly create a `ReplicateDesign` object. # Constructors ```julia -ReplicateDesign( +ReplicateDesign{ReplicateType}( data::AbstractDataFrame, replicate_weights::Vector{Symbol}; clusters::Union{Nothing,Symbol,Vector{Symbol}} = nothing, strata::Union{Nothing,Symbol} = nothing, popsize::Union{Nothing,Symbol} = nothing, weights::Union{Nothing,Symbol} = nothing -) +) where {ReplicateType <: InferenceType} -ReplicateDesign( +ReplicateDesign{ReplicateType}( data::AbstractDataFrame, replicate_weights::UnitIndex{Int}; clusters::Union{Nothing,Symbol,Vector{Symbol}} = nothing, strata::Union{Nothing,Symbol} = nothing, popsize::Union{Nothing,Symbol} = nothing, weights::Union{Nothing,Symbol} = nothing -) +) where {ReplicateType <: InferenceType} -ReplicateDesign( +ReplicateDesign{ReplicateType}( data::AbstractDataFrame, replicate_weights::Regex; clusters::Union{Nothing,Symbol,Vector{Symbol}} = nothing, strata::Union{Nothing,Symbol} = nothing, popsize::Union{Nothing,Symbol} = nothing, weights::Union{Nothing,Symbol} = nothing -) +) where {ReplicateType <: InferenceType} ``` # Arguments -The constructor has the same arguments as [`SurveyDesign`](@ref). The only additional argument is `replicate_weights`, which can +`ReplicateType` must be one of the supported inference types; currently the package supports [`BootstrapReplicates`](@ref) and [`JackknifeReplicates`](@ref). The constructor has the same arguments as [`SurveyDesign`](@ref). The only additional argument is `replicate_weights`, which can be of one of the following types. - `Vector{Symbol}`: In this case, each `Symbol` in the vector should represent a column of `data` containing the replicate weights. @@ -183,7 +183,7 @@ All the columns containing the replicate weights will be renamed to the form `re # Examples -Here is an example where the [`bootweights`](@ref) function is used to create a `ReplicateDesign`. +Here is an example where the [`bootweights`](@ref) function is used to create a `ReplicateDesign{BootstrapReplicates}`. ```jldoctest replicate-design; setup = :(using Survey, CSV, DataFrames) julia> apistrat = load_data("apistrat"); @@ -191,7 +191,7 @@ julia> apistrat = load_data("apistrat"); julia> dstrat = SurveyDesign(apistrat; strata=:stype, weights=:pw); julia> bootstrat = bootweights(dstrat; replicates=1000) # creating a ReplicateDesign using bootweights -ReplicateDesign: +ReplicateDesign{BootstrapReplicates}: data: 200×1044 DataFrame strata: stype [E, E, E … H] @@ -220,8 +220,8 @@ julia> CSV.write("apistrat_withreplicates.csv", bootstrat.data); We can now pass the replicate weights directly to the `ReplicateDesign` constructor, either as a `Vector{Symbol}`, a `UnitRange` or a `Regex`. ```jldoctest replicate-design -julia> bootstrat_direct = ReplicateDesign(CSV.read("apistrat_withreplicates.csv", DataFrame), [Symbol("r_"*string(replicate)) for replicate in 1:1000]; strata=:stype, weights=:pw) -ReplicateDesign: +julia> bootstrat_direct = ReplicateDesign{BootstrapReplicates}(CSV.read("apistrat_withreplicates.csv", DataFrame), [Symbol("r_"*string(replicate)) for replicate in 1:1000]; strata=:stype, weights=:pw) +ReplicateDesign{BootstrapReplicates}: data: 200×1044 DataFrame strata: stype [E, E, E … H] @@ -233,8 +233,8 @@ allprobs: [0.0226, 0.0226, 0.0226 … 0.0662] type: bootstrap replicates: 1000 -julia> bootstrat_unitrange = ReplicateDesign(CSV.read("apistrat_withreplicates.csv", DataFrame), UnitRange(45:1044);strata=:stype, weights=:pw) -ReplicateDesign: +julia> bootstrat_unitrange = ReplicateDesign{BootstrapReplicates}(CSV.read("apistrat_withreplicates.csv", DataFrame), UnitRange(45:1044);strata=:stype, weights=:pw) +ReplicateDesign{BootstrapReplicates}: data: 200×1044 DataFrame strata: stype [E, E, E … H] @@ -246,8 +246,8 @@ allprobs: [0.0226, 0.0226, 0.0226 … 0.0662] type: bootstrap replicates: 1000 -julia> bootstrat_regex = ReplicateDesign(CSV.read("apistrat_withreplicates.csv", DataFrame), r"r_\\d";strata=:stype, weights=:pw) -ReplicateDesign: +julia> bootstrat_regex = ReplicateDesign{BootstrapReplicates}(CSV.read("apistrat_withreplicates.csv", DataFrame), r"r_\\d";strata=:stype, weights=:pw) +ReplicateDesign{BootstrapReplicates}: data: 200×1044 DataFrame strata: stype [E, E, E … H] From dfc4f4a69f9cb53dfe81373d5f32e74b3a0c29d0 Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Tue, 2 May 2023 20:17:11 +0530 Subject: [PATCH 07/21] Fixing documentation for `bootweights` and `jackknife.jl`. Also forcing inference method `JackknifeReplicates` in `jackknife_variance`. --- src/bootstrap.jl | 3 ++- src/jackknife.jl | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/bootstrap.jl b/src/bootstrap.jl index 63c35162..5569c1ac 100644 --- a/src/bootstrap.jl +++ b/src/bootstrap.jl @@ -9,7 +9,7 @@ julia> apiclus1 = load_data("apiclus1"); julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum, popsize=:fpc); julia> bootweights(dclus1; replicates=1000, rng=MersenneTwister(111)) # choose a seed for deterministic results -ReplicateDesign: +ReplicateDesign{BootstrapReplicates}: data: 183×1044 DataFrame strata: none cluster: dnum @@ -20,6 +20,7 @@ weights: [50.4667, 50.4667, 50.4667 … 50.4667] allprobs: [0.0198, 0.0198, 0.0198 … 0.0198] type: bootstrap replicates: 1000 + ``` """ function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwister(1234)) diff --git a/src/jackknife.jl b/src/jackknife.jl index 65a94835..20623d7b 100644 --- a/src/jackknife.jl +++ b/src/jackknife.jl @@ -22,7 +22,7 @@ julia> apistrat = load_data("apistrat"); julia> dstrat = SurveyDesign(apistrat; strata=:stype, weights=:pw); julia> rstrat = jackknifeweights(dstrat) -ReplicateDesign: +ReplicateDesign{JackknifeReplicates}: data: 200×244 DataFrame strata: stype [E, E, E … M] @@ -83,7 +83,7 @@ function jackknifeweights(design::SurveyDesign) end """ - jackknife_variance(x::Symbol, func::Function, design::ReplicateDesign) + jackknife_variance(x::Symbol, func::Function, design::ReplicateDesign{JackknifeReplicates}) Compute variance of column `x` for the given `func` using the Jackknife method. The formula to compute this variance is the following. @@ -102,7 +102,7 @@ julia> apistrat = load_data("apistrat"); julia> dstrat = SurveyDesign(apistrat; strata=:stype, weights=:pw); julia> rstrat = jackknifeweights(dstrat) -ReplicateDesign: +ReplicateDesign{JackknifeReplicates}: data: 200×244 DataFrame strata: stype [E, E, E … M] @@ -127,7 +127,7 @@ julia> jackknife_variance(:api00, weightedmean, rstrat) # Reference pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010) """ -function jackknife_variance(x::Symbol, func::Function, design::ReplicateDesign) +function jackknife_variance(x::Symbol, func::Function, design::ReplicateDesign{JackknifeReplicates}) df = design.data # sort!(df, [design.strata, design.cluster]) stratified_gdf = groupby(df, design.strata) From b1b3eb55c5485a37d34a19785e2030123e44eee8 Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Tue, 2 May 2023 20:39:59 +0530 Subject: [PATCH 08/21] Adding documentation for the inference method types. --- src/SurveyDesign.jl | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index 4352abe0..64eede5c 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -123,12 +123,27 @@ struct SurveyDesign <: AbstractSurveyDesign end end +""" + InferenceMethod + +Abstract type for inference methods. +""" abstract type InferenceMethod end +""" + BootstrapReplicates <: InferenceMethod + +Type for the bootstrap replicates method. For more details, see [`bootweights`](@ref). +""" struct BootstrapReplicates <: InferenceMethod replicates::UInt end +""" + JackknifeReplicates <: InferenceMethod + +Type for the Jackknife replicates method. For more details, see [`jackknifeweights`](@ref). +""" struct JackknifeReplicates <: InferenceMethod replicates::UInt end From e9acc5deb6be965b5a4566f0ad421027f0f665dd Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Tue, 2 May 2023 20:40:59 +0530 Subject: [PATCH 09/21] Minor fixes in tests to incorporate the inference method types. --- test/runtests.jl | 20 ++++++++++---------- test/show.jl | 6 +++--- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index d709103c..0d466895 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -13,18 +13,18 @@ apisrs = load_data("apisrs") # Load API dataset srs = SurveyDesign(apisrs, weights = :pw) unitrange = UnitRange((length(names(apisrs)) + 1):(TOTAL_REPLICATES + length(names(apisrs)))) bsrs = srs |> bootweights # Create replicate design -bsrs_direct = ReplicateDesign(bsrs.data, REPLICATES_VECTOR, weights = :pw) # using ReplicateDesign constructor -bsrs_unitrange = ReplicateDesign(bsrs.data, unitrange, weights = :pw) # using ReplicateDesign constructor -bsrs_regex = ReplicateDesign(bsrs.data, REPLICATES_REGEX, weights = :pw) # using ReplicateDesign constructor +bsrs_direct = ReplicateDesign{BootstrapReplicates}(bsrs.data, REPLICATES_VECTOR, weights = :pw) # using ReplicateDesign constructor +bsrs_unitrange = ReplicateDesign{BootstrapReplicates}(bsrs.data, unitrange, weights = :pw) # using ReplicateDesign constructor +bsrs_regex = ReplicateDesign{BootstrapReplicates}(bsrs.data, REPLICATES_REGEX, weights = :pw) # using ReplicateDesign constructor # Stratified sample apistrat = load_data("apistrat") # Load API dataset dstrat = SurveyDesign(apistrat, strata = :stype, weights = :pw) # Create SurveyDesign unitrange = UnitRange((length(names(apistrat)) + 1):(TOTAL_REPLICATES + length(names(apistrat)))) bstrat = dstrat |> bootweights # Create replicate design -bstrat_direct = ReplicateDesign(bstrat.data, REPLICATES_VECTOR, strata=:stype, weights=:pw) # using ReplicateDesign constructor -bstrat_unitrange = ReplicateDesign(bstrat.data, unitrange, strata=:stype, weights=:pw) # using ReplicateDesign constructor -bstrat_regex = ReplicateDesign(bstrat.data, REPLICATES_REGEX, strata=:stype, weights=:pw) # using ReplicateDesign constructor +bstrat_direct = ReplicateDesign{BootstrapReplicates}(bstrat.data, REPLICATES_VECTOR, strata=:stype, weights=:pw) # using ReplicateDesign constructor +bstrat_unitrange = ReplicateDesign{BootstrapReplicates}(bstrat.data, unitrange, strata=:stype, weights=:pw) # using ReplicateDesign constructor +bstrat_regex = ReplicateDesign{BootstrapReplicates}(bstrat.data, REPLICATES_REGEX, strata=:stype, weights=:pw) # using ReplicateDesign constructor # One-stage cluster sample apiclus1 = load_data("apiclus1") # Load API dataset @@ -32,9 +32,9 @@ apiclus1[!, :pw] = fill(757 / 15, (size(apiclus1, 1),)) # Correct api mistake fo dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) # Create SurveyDesign unitrange = UnitRange((length(names(apiclus1)) + 1):(TOTAL_REPLICATES + length(names(apiclus1)))) dclus1_boot = dclus1 |> bootweights # Create replicate design -dclus1_boot_direct = ReplicateDesign(dclus1_boot.data, REPLICATES_VECTOR, clusters=:dnum, weights=:pw) # using ReplicateDesign constructor -dclus1_boot_unitrange = ReplicateDesign(dclus1_boot.data, unitrange, clusters=:dnum, weights=:pw) # using ReplicateDesign constructor -dclus1_boot_regex = ReplicateDesign(dclus1_boot.data, REPLICATES_REGEX, clusters=:dnum, weights=:pw) # using ReplicateDesign constructor +dclus1_boot_direct = ReplicateDesign{BootstrapReplicates}(dclus1_boot.data, REPLICATES_VECTOR, clusters=:dnum, weights=:pw) # using ReplicateDesign constructor +dclus1_boot_unitrange = ReplicateDesign{BootstrapReplicates}(dclus1_boot.data, unitrange, clusters=:dnum, weights=:pw) # using ReplicateDesign constructor +dclus1_boot_regex = ReplicateDesign{BootstrapReplicates}(dclus1_boot.data, REPLICATES_REGEX, clusters=:dnum, weights=:pw) # using ReplicateDesign constructor # Two-stage cluster sample apiclus2 = load_data("apiclus2") # Load API dataset @@ -63,4 +63,4 @@ include("hist.jl") include("boxplot.jl") include("ratio.jl") include("show.jl") -include("jackknife.jl") \ No newline at end of file +include("jackknife.jl") diff --git a/test/show.jl b/test/show.jl index 3035190f..56765b7b 100644 --- a/test/show.jl +++ b/test/show.jl @@ -15,7 +15,7 @@ @test str == refstr refstrb = """ - ReplicateDesign: + ReplicateDesign{BootstrapReplicates}: data: 200×4045 DataFrame strata: none cluster: none @@ -50,7 +50,7 @@ end @test str == refstr refstrb = """ - ReplicateDesign: + ReplicateDesign{BootstrapReplicates}: data: 200×4044 DataFrame strata: stype [E, E, E … H] @@ -86,7 +86,7 @@ end @test str == refstr refstrb = """ - ReplicateDesign: + ReplicateDesign{BootstrapReplicates}: data: 183×4044 DataFrame strata: none cluster: dnum From 69b7aff4ea5cecd1a5b417a7e344c5ce054d358c Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Tue, 2 May 2023 20:59:37 +0530 Subject: [PATCH 10/21] Added the inference method types to the API reference. --- docs/src/api.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/src/api.md b/docs/src/api.md index e3dd4f08..1c56fca7 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -12,6 +12,8 @@ Private = false AbstractSurveyDesign SurveyDesign ReplicateDesign +BootstrapReplicates +JackknifeReplicates load_data bootweights jackknifeweights From 67be0b181aab2767c3773cbd4d23bc170fab29c3 Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Wed, 3 May 2023 12:24:24 +0530 Subject: [PATCH 11/21] Renaming `jackknife_variance` to `variance`. --- docs/src/api.md | 2 +- src/Survey.jl | 2 +- src/jackknife.jl | 6 +++--- src/mean.jl | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/src/api.md b/docs/src/api.md index 1c56fca7..629d118d 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -17,7 +17,7 @@ JackknifeReplicates load_data bootweights jackknifeweights -jackknife_variance +variance mean total quantile diff --git a/src/Survey.jl b/src/Survey.jl index ced437da..77bbce4a 100644 --- a/src/Survey.jl +++ b/src/Survey.jl @@ -37,6 +37,6 @@ export hist, sturges, freedman_diaconis export boxplot export bootweights export ratio -export jackknifeweights, jackknife_variance +export jackknifeweights, variance end diff --git a/src/jackknife.jl b/src/jackknife.jl index 20623d7b..008ca4cb 100644 --- a/src/jackknife.jl +++ b/src/jackknife.jl @@ -83,7 +83,7 @@ function jackknifeweights(design::SurveyDesign) end """ - jackknife_variance(x::Symbol, func::Function, design::ReplicateDesign{JackknifeReplicates}) + variance(x::Symbol, func::Function, design::ReplicateDesign{JackknifeReplicates}) Compute variance of column `x` for the given `func` using the Jackknife method. The formula to compute this variance is the following. @@ -116,7 +116,7 @@ replicates: 200 julia> weightedmean(x, y) = mean(x, weights(y)); -julia> jackknife_variance(:api00, weightedmean, rstrat) +julia> variance(:api00, weightedmean, rstrat) 1×2 DataFrame Row │ estimator SE │ Float64 Float64 @@ -127,7 +127,7 @@ julia> jackknife_variance(:api00, weightedmean, rstrat) # Reference pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010) """ -function jackknife_variance(x::Symbol, func::Function, design::ReplicateDesign{JackknifeReplicates}) +function variance(x::Symbol, func::Function, design::ReplicateDesign{JackknifeReplicates}) df = design.data # sort!(df, [design.strata, design.cluster]) stratified_gdf = groupby(df, design.strata) diff --git a/src/mean.jl b/src/mean.jl index 580c63f2..f272bc1d 100644 --- a/src/mean.jl +++ b/src/mean.jl @@ -57,7 +57,7 @@ function mean(x::Symbol, design::ReplicateDesign) # Jackknife integration elseif design.type == "jackknife" weightedmean(x, y) = mean(x, weights(y)) - return jackknife_variance(x, weightedmean, design) + return variance(x, weightedmean, design) end end From f11c6c6897653f760d032dd07cc3a8da696efe52 Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Wed, 3 May 2023 12:53:22 +0530 Subject: [PATCH 12/21] Minor correction; qualifying the `variance` method to `Survey`'s `variance`. --- src/Survey.jl | 2 +- src/mean.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Survey.jl b/src/Survey.jl index 77bbce4a..c0b3db27 100644 --- a/src/Survey.jl +++ b/src/Survey.jl @@ -15,6 +15,7 @@ using Missings include("SurveyDesign.jl") include("bootstrap.jl") +include("jackknife.jl") include("mean.jl") include("quantile.jl") include("total.jl") @@ -25,7 +26,6 @@ include("boxplot.jl") include("show.jl") include("ratio.jl") include("by.jl") -include("jackknife.jl") export load_data export AbstractSurveyDesign, SurveyDesign, ReplicateDesign diff --git a/src/mean.jl b/src/mean.jl index f272bc1d..16856498 100644 --- a/src/mean.jl +++ b/src/mean.jl @@ -57,7 +57,7 @@ function mean(x::Symbol, design::ReplicateDesign) # Jackknife integration elseif design.type == "jackknife" weightedmean(x, y) = mean(x, weights(y)) - return variance(x, weightedmean, design) + return Survey.variance(x, weightedmean, design) end end From 93d23db4741474ecced1b94d70a67c7a568fddee Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Fri, 5 May 2023 13:11:10 +0530 Subject: [PATCH 13/21] Removing redundant `mean` function from `mean.jl`, and moving it to the `variance` functions of `bootstrap.jl` and `jackknife.jl`. --- src/bootstrap.jl | 41 +++++++++++++++++++++++++++++++++++++++++ src/mean.jl | 31 ------------------------------- 2 files changed, 41 insertions(+), 31 deletions(-) diff --git a/src/bootstrap.jl b/src/bootstrap.jl index 5569c1ac..f8d91a7f 100644 --- a/src/bootstrap.jl +++ b/src/bootstrap.jl @@ -53,6 +53,47 @@ function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwis ) end +""" + variance(x::Symbol, func::Function, design::ReplicateDesign{BootstrapReplicates}) + + +Use replicate weights to compute the standard error of the estimated mean using the bootstrap method. The variance is calculated using the formula + +```math +\\hat{V}(\\hat{\\theta}) = \\dfrac{1}{R}\\sum_{i = 1}^R(\\theta_i - \\hat{\\theta})^2 +``` + +where above ``R`` is the number of replicate weights, ``\\theta_i`` is the estimated mean using the ``i``th set of replicate weights, and ``\\hat{\\theta}`` is the estimated mean using the original weights. + +```jldoctest; +julia> using Survey, StatsBase; + +julia> apiclus1 = load_data("apiclus1"); + +julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw); + +julia> bclus1 = dclus1 |> bootweights; + +julia> weightedmean(x, y) = mean(x, weights(y)); + +julia> variance(:api00, weightedmean, bclus1) +1×2 DataFrame + Row │ mean SE + │ Float64 Float64 +─────┼────────────────── + 1 │ 644.169 23.4107 +``` +""" +function variance(x::Symbol, func::Function, design::ReplicateDesign{BootstrapReplicates}) + θ̂ = func(design.data[!, x], design.data[!, design.weights]) + θ̂t = [ + func(design.data[!, x], design.data[!, "replicate_"*string(i)]) for + i = 1:design.replicates + ] + variance = sum((θ̂t .- θ̂) .^ 2) / design.replicates + return DataFrame(mean = θ̂, SE = sqrt(variance)) +end + function _bootweights_cluster_sorted!(cluster_sorted, cluster_weights, cluster_sorted_designcluster, replicates, rng) diff --git a/src/mean.jl b/src/mean.jl index 16856498..cb483710 100644 --- a/src/mean.jl +++ b/src/mean.jl @@ -30,37 +30,6 @@ function mean(x::Symbol, design::SurveyDesign) DataFrame(mean = X) end -""" - -Use replicate weights to compute the standard error of the estimated mean. - -```jldoctest; setup = :(apiclus1 = load_data("apiclus1"); dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw)) -julia> bclus1 = dclus1 |> bootweights; - -julia> mean(:api00, bclus1) -1×2 DataFrame - Row │ mean SE - │ Float64 Float64 -─────┼────────────────── - 1 │ 644.169 23.4107 -``` -""" -function mean(x::Symbol, design::ReplicateDesign) - if design.type == "bootstrap" - θ̂ = mean(design.data[!, x], weights(design.data[!, design.weights])) - θ̂t = [ - mean(design.data[!, x], weights(design.data[!, "replicate_"*string(i)])) for - i = 1:design.replicates - ] - variance = sum((θ̂t .- θ̂) .^ 2) / design.replicates - return DataFrame(mean = θ̂, SE = sqrt(variance)) - # Jackknife integration - elseif design.type == "jackknife" - weightedmean(x, y) = mean(x, weights(y)) - return Survey.variance(x, weightedmean, design) - end -end - """ Estimate the mean of a list of variables. From aa4cd6722876f288142f48b34a26ff27cec59680 Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Fri, 5 May 2023 13:18:33 +0530 Subject: [PATCH 14/21] Minor change; changing column name from `mean` to `estimator` in `variance` function for bootstrap replicate. --- src/bootstrap.jl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/bootstrap.jl b/src/bootstrap.jl index f8d91a7f..b449ef69 100644 --- a/src/bootstrap.jl +++ b/src/bootstrap.jl @@ -78,11 +78,11 @@ julia> weightedmean(x, y) = mean(x, weights(y)); julia> variance(:api00, weightedmean, bclus1) 1×2 DataFrame - Row │ mean SE - │ Float64 Float64 -─────┼────────────────── - 1 │ 644.169 23.4107 -``` + Row │ estimator SE + │ Float64 Float64 +─────┼──────────────────── + 1 │ 644.169 23.4107 + """ function variance(x::Symbol, func::Function, design::ReplicateDesign{BootstrapReplicates}) θ̂ = func(design.data[!, x], design.data[!, design.weights]) @@ -91,7 +91,7 @@ function variance(x::Symbol, func::Function, design::ReplicateDesign{BootstrapRe i = 1:design.replicates ] variance = sum((θ̂t .- θ̂) .^ 2) / design.replicates - return DataFrame(mean = θ̂, SE = sqrt(variance)) + return DataFrame(estimator = θ̂, SE = sqrt(variance)) end function _bootweights_cluster_sorted!(cluster_sorted, From 1d9bf3e25d825e6f5f1a0dfba51437c7bfc66b2d Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Fri, 5 May 2023 13:27:21 +0530 Subject: [PATCH 15/21] Adding the `mean` function back, which now uses the `variance` function. --- src/mean.jl | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/mean.jl b/src/mean.jl index cb483710..6e0e05e0 100644 --- a/src/mean.jl +++ b/src/mean.jl @@ -30,6 +30,29 @@ function mean(x::Symbol, design::SurveyDesign) DataFrame(mean = X) end +""" + mean(x::Symbol, design::ReplicateDesign) + +Use replicate weights to compute the standard error of the estimated mean. + +# Examples + +```jldoctest; setup = :(apiclus1 = load_data("apiclus1"); dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw)) +julia> bclus1 = dclus1 |> bootweights; + +julia> mean(:api00, bclus1) +1×2 DataFrame + Row │ mean SE + │ Float64 Float64 +─────┼────────────────── + 1 │ 644.169 23.4107 +``` +""" +function mean(x::Symbol, design::ReplicateDesign) + weightedmean(x, y) = mean(x, weights(y)) + Survey.variance(x, weightedmean, design) +end + """ Estimate the mean of a list of variables. From cd8554a4fd2a391cbf254886e181784386a9ae10 Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Fri, 5 May 2023 14:09:10 +0530 Subject: [PATCH 16/21] Minor change in `mean` function; renaming `estimator` column to `mean`. --- src/mean.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/mean.jl b/src/mean.jl index 6e0e05e0..4d867669 100644 --- a/src/mean.jl +++ b/src/mean.jl @@ -50,7 +50,9 @@ julia> mean(:api00, bclus1) """ function mean(x::Symbol, design::ReplicateDesign) weightedmean(x, y) = mean(x, weights(y)) - Survey.variance(x, weightedmean, design) + df = Survey.variance(x, weightedmean, design) + rename!(df, :estimator => :mean) + return df end """ From 8910308f3d8ad0a3d357a489ab632e295007ed27 Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Fri, 5 May 2023 14:09:44 +0530 Subject: [PATCH 17/21] Correcting test for `jackknife`. --- test/jackknife.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/jackknife.jl b/test/jackknife.jl index 73ab5b71..ae582d5e 100644 --- a/test/jackknife.jl +++ b/test/jackknife.jl @@ -19,9 +19,9 @@ # Tests using for NHANES mean_nhanes_jk = mean([:seq1, :seq2],dnhanes_jk) - @test mean_nhanes_jk.estimator[1] ≈ 21393.96 atol = 1e-3 + @test mean_nhanes_jk.mean[1] ≈ 21393.96 atol = 1e-3 @test mean_nhanes_jk.SE[1] ≈ 143.371 atol = 1e-3 # R is slightly diff in 2nd decimal place - @test mean_nhanes_jk.estimator[2] ≈ 38508.328 atol = 1e-3 + @test mean_nhanes_jk.mean[2] ≈ 38508.328 atol = 1e-3 @test mean_nhanes_jk.SE[2] ≈ 258.068 atol = 1e-3 # R is slightly diff in 2nd decimal place end @@ -85,4 +85,4 @@ end # > svymean(~seq1+seq2,dnhanes) # mean SE # seq1 21394 143.34 -# seq2 38508 258.01 \ No newline at end of file +# seq2 38508 258.01 From feef09b19c2e5283bbd1522ea29fb391d2f5218e Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Fri, 5 May 2023 14:47:56 +0530 Subject: [PATCH 18/21] Minor change in docstring. --- src/bootstrap.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/bootstrap.jl b/src/bootstrap.jl index b449ef69..ee304aee 100644 --- a/src/bootstrap.jl +++ b/src/bootstrap.jl @@ -63,9 +63,9 @@ Use replicate weights to compute the standard error of the estimated mean using \\hat{V}(\\hat{\\theta}) = \\dfrac{1}{R}\\sum_{i = 1}^R(\\theta_i - \\hat{\\theta})^2 ``` -where above ``R`` is the number of replicate weights, ``\\theta_i`` is the estimated mean using the ``i``th set of replicate weights, and ``\\hat{\\theta}`` is the estimated mean using the original weights. +where above ``R`` is the number of replicate weights, ``\\theta_i`` is the estimator computed using the ``i``th set of replicate weights, and ``\\hat{\\theta}`` is the estimator computed using the original weights. -```jldoctest; +```jldoctest julia> using Survey, StatsBase; julia> apiclus1 = load_data("apiclus1"); @@ -83,6 +83,7 @@ julia> variance(:api00, weightedmean, bclus1) ─────┼──────────────────── 1 │ 644.169 23.4107 +``` """ function variance(x::Symbol, func::Function, design::ReplicateDesign{BootstrapReplicates}) θ̂ = func(design.data[!, x], design.data[!, design.weights]) From 79b4c3981413469cf5de7be0b352246fb25e280c Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Fri, 5 May 2023 14:54:31 +0530 Subject: [PATCH 19/21] Minor fix in docstring. --- src/SurveyDesign.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index 64eede5c..5b7df85a 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -164,7 +164,7 @@ ReplicateDesign{ReplicateType}( strata::Union{Nothing,Symbol} = nothing, popsize::Union{Nothing,Symbol} = nothing, weights::Union{Nothing,Symbol} = nothing -) where {ReplicateType <: InferenceType} +) where {ReplicateType <: InferenceMethod} ReplicateDesign{ReplicateType}( data::AbstractDataFrame, @@ -173,7 +173,7 @@ ReplicateDesign{ReplicateType}( strata::Union{Nothing,Symbol} = nothing, popsize::Union{Nothing,Symbol} = nothing, weights::Union{Nothing,Symbol} = nothing -) where {ReplicateType <: InferenceType} +) where {ReplicateType <: InferenceMethod} ReplicateDesign{ReplicateType}( data::AbstractDataFrame, @@ -182,7 +182,7 @@ ReplicateDesign{ReplicateType}( strata::Union{Nothing,Symbol} = nothing, popsize::Union{Nothing,Symbol} = nothing, weights::Union{Nothing,Symbol} = nothing -) where {ReplicateType <: InferenceType} +) where {ReplicateType <: InferenceMethod} ``` # Arguments From 64019c682c5e21b6ca96da626523a8a4aeaa2d54 Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Mon, 8 May 2023 19:18:27 +0530 Subject: [PATCH 20/21] Rewriting `quantile` using `variance`. --- src/quantile.jl | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/src/quantile.jl b/src/quantile.jl index f2feaa4d..2ffec5aa 100644 --- a/src/quantile.jl +++ b/src/quantile.jl @@ -47,19 +47,9 @@ julia> quantile(:api00, bsrs, 0.5) ``` """ function quantile(var::Symbol, design::ReplicateDesign, p::Real; kwargs...) - v = design.data[!, var] - probs = design.data[!, design.allprobs] - X = Statistics.quantile(v, ProbabilityWeights(probs), p) - Xt = [ - Statistics.quantile( - v, - ProbabilityWeights(design.data[!, "replicate_"*string(i)]), - p, - ) for i = 1:design.replicates - ] - variance = sum((Xt .- X) .^ 2) / design.replicates - df = DataFrame(percentile = X, SE = sqrt(variance)) - rename!(df, :percentile => string(p) * "th percentile") + quantile_func(v, weights) = Statistics.quantile(v, ProbabilityWeights(weights), p) + df = Survey.variance(var, quantile_func, design) + rename!(df, :estimator => string(p) * "th percentile") return df end From d99f38918d3456174a55540387c65a6c4fdcf010 Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Mon, 8 May 2023 19:28:20 +0530 Subject: [PATCH 21/21] Rewriting `total` using `variance`. --- src/total.jl | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/total.jl b/src/total.jl index a451324c..de2fb218 100644 --- a/src/total.jl +++ b/src/total.jl @@ -36,13 +36,10 @@ julia> total(:api00, bclus1) ``` """ function total(x::Symbol, design::ReplicateDesign) - X = wsum(design.data[!, x], weights(design.data[!, design.weights])) - Xt = [ - wsum(design.data[!, x], weights(design.data[!, "replicate_"*string(i)])) for - i = 1:design.replicates - ] - variance = sum((Xt .- X) .^ 2) / design.replicates - DataFrame(total = X, SE = sqrt(variance)) + total_func(x, y) = wsum(x, weights(y)) + df = variance(x, total_func, design) + rename!(df, :estimator => :total) + return df end """