xKDR · ayushpatnaikgit · Mar 1, 2023 · Feb 22, 2023 · Feb 22, 2023 · Feb 22, 2023
diff --git a/README.md b/README.md
@@ -98,7 +98,8 @@ cluster: none
 popsize: [6190.0, 6190.0, 6190.0  …  6190.0]
 sampsize: [200, 200, 200  …  200]
 weights: [31.0, 31.0, 31.0  …  31.0]
-probs: [0.0323, 0.0323, 0.0323  …  0.0323]
+allprobs: [0.0323, 0.0323, 0.0323  …  0.0323]
+type: bootstrap
 replicates: 1000
 
 julia> mean(:api00, bootsrs)
@@ -171,4 +172,4 @@ We gratefully acknowledge the JuliaLab at MIT for financial support for this pro
 
 ## References
 
-[^1]: [Lumley, Thomas. Complex surveys: a guide to analysis using R. John Wiley & Sons, 2011.](https://books.google.co.in/books?hl=en&lr=&id=L96ludyhFBsC&oi=fnd&pg=PP12&dq=complex+surveys+lumley&ots=ie0y1lnzv1&sig=c4UHI3arjspMJ6OYzlX32E9rNRI#v=onepage&q=complex%20surveys%20lumley&f=false) Page 44
+[^1]: [Lumley, Thomas. Complex surveys: a guide to analysis using R. John Wiley & Sons, 2011.](https://books.google.co.in/books?hl=en&lr=&id=L96ludyhFBsC&oi=fnd&pg=PP12&dq=complex+surveys+lumley&ots=ie0y1lnzv1&sig=c4UHI3arjspMJ6OYzlX32E9rNRI#v=onepage&q=complex%20surveys%20lumley&f=false) Page 44
diff --git a/docs/Project.toml b/docs/Project.toml
@@ -1,5 +1,7 @@
 [deps]
+CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
+DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
-Survey = "c1a98b4d-6cd2-47ec-b9e9-69b59c35373c"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
+Survey = "c1a98b4d-6cd2-47ec-b9e9-69b59c35373c"
diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl
@@ -127,14 +127,117 @@ end
 """
     ReplicateDesign <: AbstractSurveyDesign
 
-Survey design obtained by replicating an original design using [`bootweights`](@ref).
+Survey design obtained by replicating an original design using [`bootweights`](@ref). If
+replicate weights are available, then they can be used to directly create a `ReplicateDesign`.
 
-```jldoctest
+# Constructors
+
+```julia
+ReplicateDesign(
+    data::AbstractDataFrame,
+    replicate_weights::Vector{Symbol};
+    clusters::Union{Nothing,Symbol,Vector{Symbol}} = nothing,
+    strata::Union{Nothing,Symbol} = nothing,
+    popsize::Union{Nothing,Symbol} = nothing,
+    weights::Union{Nothing,Symbol} = nothing
+)
+
+ReplicateDesign(
+    data::AbstractDataFrame,
+    replicate_weights::UnitRange{Int};
+    clusters::Union{Nothing,Symbol,Vector{Symbol}} = nothing,
+    strata::Union{Nothing,Symbol} = nothing,
+    popsize::Union{Nothing,Symbol} = nothing,
+    weights::Union{Nothing,Symbol} = nothing
+)
+
+ReplicateDesign(
+    data::AbstractDataFrame,
+    replicate_weights::Regex;
+    clusters::Union{Nothing,Symbol,Vector{Symbol}} = nothing,
+    strata::Union{Nothing,Symbol} = nothing,
+    popsize::Union{Nothing,Symbol} = nothing,
+    weights::Union{Nothing,Symbol} = nothing
+)
+```
+
+# Arguments
+
+The constructor has the same arguments as [`SurveyDesign`](@ref). The only additional argument is `replicate_weights`, which can
+be one of the following types:
+
+- `Vector{Symbol}`: In this case, each `Symbol` in the vector should represent a column of `data` containing the replicate weights.
+- `UnitRange{Int}`: A range of column positions. For instance, `5:10` means that the replicate weights are contained in columns 5 through 10.
+- `Regex`: In this case, all the columns of `data` which match this `Regex` will be treated as the columns containing the replicate weights.
+
+All the columns containing the replicate weights will be renamed to the form `replicate_i`, where `i` ranges from 1 to the number of columns containing the replicate weights. The renaming is done in place, so the `data` passed to the constructor is modified.
+
+# Examples
+
+Here is an example where the [`bootweights`](@ref) function is used to create a `ReplicateDesign`.
+
+```jldoctest replicate-design; setup = :(using Survey, CSV, DataFrames)
 julia> apistrat = load_data("apistrat");
 
 julia> dstrat = SurveyDesign(apistrat; strata=:stype, weights=:pw);
 
-julia> bootstrat = bootweights(dstrat; replicates=1000)
+julia> bootstrat = bootweights(dstrat; replicates=1000)     # creating a ReplicateDesign using bootweights
+ReplicateDesign:
+data: 200×1044 DataFrame
+strata: stype
+    [E, E, E  …  H]
+cluster: none
+popsize: [4420.9999, 4420.9999, 4420.9999  …  755.0]
+sampsize: [100, 100, 100  …  50]
+weights: [44.21, 44.21, 44.21  …  15.1]
+allprobs: [0.0226, 0.0226, 0.0226  …  0.0662]
+type: bootstrap
+replicates: 1000
+
+```
+
+If the replicate weights are given to us already, then we can directly pass them to the `ReplicateDesign` constructor. For instance, in
+the above example, suppose we had the `bootstrat` data as a CSV file (for this example, we also rename the columns containing the replicate weights to the form `r_i`).
+
+```jldoctest replicate-design
+julia> using CSV;
+
+julia> DataFrames.rename!(bootstrat.data, ["replicate_"*string(index) => "r_"*string(index) for index in 1:1000]);
+
+julia> CSV.write("apistrat_withreplicates.csv", bootstrat.data);
+
+```
+
+We can now pass the replicate weights directly to the `ReplicateDesign` constructor, either as a `Vector{Symbol}`, a `UnitRange` or a `Regex`.
+
+```jldoctest replicate-design
+julia> bootstrat_direct = ReplicateDesign(CSV.read("apistrat_withreplicates.csv", DataFrame), [Symbol("r_"*string(replicate)) for replicate in 1:1000]; strata=:stype, weights=:pw)
+ReplicateDesign:
+data: 200×1044 DataFrame
+strata: stype
+    [E, E, E  …  H]
+cluster: none
+popsize: [4420.9999, 4420.9999, 4420.9999  …  755.0]
+sampsize: [100, 100, 100  …  50]
+weights: [44.21, 44.21, 44.21  …  15.1]
+allprobs: [0.0226, 0.0226, 0.0226  …  0.0662]
+type: bootstrap
+replicates: 1000
+
+julia> bootstrat_unitrange = ReplicateDesign(CSV.read("apistrat_withreplicates.csv", DataFrame), UnitRange(45:1044);strata=:stype, weights=:pw)
+ReplicateDesign:
+data: 200×1044 DataFrame
+strata: stype
+    [E, E, E  …  H]
+cluster: none
+popsize: [4420.9999, 4420.9999, 4420.9999  …  755.0]
+sampsize: [100, 100, 100  …  50]
+weights: [44.21, 44.21, 44.21  …  15.1]
+allprobs: [0.0226, 0.0226, 0.0226  …  0.0662]
+type: bootstrap
+replicates: 1000
+
+julia> bootstrat_regex = ReplicateDesign(CSV.read("apistrat_withreplicates.csv", DataFrame), r"r_\\d";strata=:stype, weights=:pw)
 ReplicateDesign:
 data: 200×1044 DataFrame
 strata: stype
@@ -144,8 +247,11 @@ popsize: [4420.9999, 4420.9999, 4420.9999  …  755.0]
 sampsize: [100, 100, 100  …  50]
 weights: [44.21, 44.21, 44.21  …  15.1]
 allprobs: [0.0226, 0.0226, 0.0226  …  0.0662]
+type: bootstrap
 replicates: 1000
+
 ```
+
 """
 struct ReplicateDesign <: AbstractSurveyDesign
     data::AbstractDataFrame
@@ -156,5 +262,96 @@ struct ReplicateDesign <: AbstractSurveyDesign
     weights::Symbol # Effective weights in case of singlestage approx supported
     allprobs::Symbol # Right now only singlestage approx supported
     pps::Bool
+    type::String
     replicates::UInt
+    replicate_weights::Vector{Symbol}
+
+    # Positional constructor.
+    # Takes eleven arguments and forwards them to `new` unchanged and in the same
+    # order; no validation or derived values are added here. The `data` argument
+    # is restricted to a concrete `DataFrame` by this method's signature.
+    function ReplicateDesign(
+        data::DataFrame, cluster::Symbol, popsize::Symbol,
+        sampsize::Symbol, strata::Symbol, weights::Symbol,
+        allprobs::Symbol, pps::Bool, type::String,
+        replicates::UInt, replicate_weights::Vector{Symbol}
+    )
+        return new(
+            data, cluster, popsize,
+            sampsize, strata, weights,
+            allprobs, pps, type,
+            replicates, replicate_weights
+        )
+    end
+
+    # Constructor for replicate weights that are already columns of `data`.
+    #
+    # The columns named in `replicate_weights` are renamed in place (`data` is
+    # mutated) to `replicate_1`, …, `replicate_n`. The remaining fields are taken
+    # from a `SurveyDesign` built with the same keyword arguments, and `type` is
+    # always recorded as "bootstrap".
+    function ReplicateDesign(
+        data::AbstractDataFrame,
+        replicate_weights::Vector{Symbol};
+        clusters::Union{Nothing,Symbol,Vector{Symbol}} = nothing,
+        strata::Union{Nothing,Symbol} = nothing,
+        popsize::Union{Nothing,Symbol} = nothing,
+        weights::Union{Nothing,Symbol} = nothing
+    )
+        # Canonical names, the same form `bootweights` generates. These, not the
+        # caller's original names, are stored on the design: after `rename!` the
+        # original names may no longer exist in `data`.
+        renamed = [Symbol("replicate_", index) for index in eachindex(replicate_weights)]
+        rename!(data, Pair.(replicate_weights, renamed))
+
+        # every other field is taken from the SurveyDesign built on the renamed data
+        base_design = SurveyDesign(
+            data; clusters=clusters, strata=strata, popsize=popsize, weights=weights
+        )
+        return new(
+            base_design.data, base_design.cluster, base_design.popsize,
+            base_design.sampsize, base_design.strata, base_design.weights,
+            base_design.allprobs, base_design.pps,
+            # type, replicates, replicate_weights
+            "bootstrap",
+            length(renamed),
+            renamed
+        )
+    end
+
+    # Replicate weights selected by column position: `replicate_weights` indexes
+    # into `names(data)`; the selected names go to the `Vector{Symbol}` method.
+    function ReplicateDesign(
+        data::AbstractDataFrame,
+        replicate_weights::UnitRange{Int};
+        clusters::Union{Nothing,Symbol,Vector{Symbol}} = nothing,
+        strata::Union{Nothing,Symbol} = nothing,
+        popsize::Union{Nothing,Symbol} = nothing,
+        weights::Union{Nothing,Symbol} = nothing
+    )
+        selected = Symbol.(names(data)[replicate_weights])
+        return ReplicateDesign(
+            data, selected;
+            clusters=clusters, strata=strata,
+            popsize=popsize, weights=weights
+        )
+    end
+
+    # Replicate weights selected by name: every column of `data` whose name
+    # matches the regex (via `occursin`) goes to the `Vector{Symbol}` method.
+    function ReplicateDesign(
+        data::AbstractDataFrame,
+        replicate_weights::Regex;
+        clusters::Union{Nothing,Symbol,Vector{Symbol}} = nothing,
+        strata::Union{Nothing,Symbol} = nothing,
+        popsize::Union{Nothing,Symbol} = nothing,
+        weights::Union{Nothing,Symbol} = nothing
+    )
+        matching = filter(name -> occursin(replicate_weights, name), names(data))
+        return ReplicateDesign(
+            data, Symbol.(matching);
+            clusters=clusters, strata=strata,
+            popsize=popsize, weights=weights
+        )
+    end
 end
diff --git a/src/bootstrap.jl b/src/bootstrap.jl
@@ -18,6 +18,7 @@ popsize: [757, 757, 757  …  757]
 sampsize: [15, 15, 15  …  15]
 weights: [50.4667, 50.4667, 50.4667  …  50.4667]
 allprobs: [0.0198, 0.0198, 0.0198  …  0.0198]
+type: bootstrap
 replicates: 1000
 ```
 """
@@ -54,6 +55,8 @@ function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwis
         design.weights,
         design.allprobs,
         design.pps,
-        replicates,
+        "bootstrap",
+        UInt(replicates),
+        [Symbol("replicate_"*string(replicate)) for replicate in 1:replicates]
     )
 end
diff --git a/src/show.jl b/src/show.jl
@@ -37,6 +37,7 @@ Base.show(io::IO, ::MIME"text/plain", design::SurveyDesign) = surveyshow(io, des
 function Base.show(io::IO, ::MIME"text/plain", design::ReplicateDesign)
     # new_io = IOContext(io, :compact=>true, :limit=>true, :displaysize=>(50, 50))
     surveyshow(io, design)
+    printinfo(io, "\ntype", design.type; newline = false)
     printinfo(io, "\nreplicates", design.replicates; newline = false)
 end
 

diff --git a/test/SurveyDesign.jl b/test/SurveyDesign.jl
@@ -259,3 +259,45 @@ end
     yrbs = copy(yrbs_original)
     dyrbs = SurveyDesign(yrbs; clusters = :psu, strata = :stratum, weights = :weight)
 end
+
+@testset "ReplicateDesign_direct" begin
+    # each bootstrapped design is paired with the design rebuilt from its
+    # replicate columns passed as a vector of symbols
+    design_pairs = [(bsrs, bsrs_direct), (bstrat, bstrat_direct), (dclus1_boot, dclus1_boot_direct)]
+    # fields expected to agree between the two designs of a pair
+    compared = (:data, :popsize, :sampsize, :strata, :weights, :allprobs, :pps,
+                :replicates, :replicate_weights)
+    for (original, rebuilt) in design_pairs
+        for field in compared
+            @test isequal(getproperty(original, field), getproperty(rebuilt, field))
+        end
+    end
+end
+
+@testset "ReplicateDesign_unitrange" begin
+    # each bootstrapped design is paired with the design rebuilt from its
+    # replicate columns passed as a range of column positions
+    design_pairs = [(bsrs, bsrs_unitrange), (bstrat, bstrat_unitrange), (dclus1_boot, dclus1_boot_unitrange)]
+    # fields expected to agree between the two designs of a pair
+    compared = (:data, :popsize, :sampsize, :strata, :weights, :allprobs, :pps,
+                :replicates, :replicate_weights)
+    for (original, rebuilt) in design_pairs
+        for field in compared
+            @test isequal(getproperty(original, field), getproperty(rebuilt, field))
+        end
+    end
+end
+
+@testset "ReplicateDesign_regex" begin
+    # each bootstrapped design is paired with the design rebuilt from its
+    # replicate columns selected with a regular expression
+    design_pairs = [(bsrs, bsrs_regex), (bstrat, bstrat_regex), (dclus1_boot, dclus1_boot_regex)]
+    # fields expected to agree between the two designs of a pair
+    compared = (:data, :popsize, :sampsize, :strata, :weights, :allprobs, :pps,
+                :replicates, :replicate_weights)
+    for (original, rebuilt) in design_pairs
+        for field in compared
+            @test isequal(getproperty(original, field), getproperty(rebuilt, field))
+        end
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -4,21 +4,37 @@ using CategoricalArrays
 
 const STAT_TOL = 1e-5
 const SE_TOL = 1e-1
+const TOTAL_REPLICATES = 4000  # replicate columns per design; `bootweights` is called with its default of 4000
+const REPLICATES_VECTOR = [Symbol("replicate_", i) for i in 1:TOTAL_REPLICATES]  # :replicate_1, …, :replicate_4000
+const REPLICATES_REGEX = r"^replicate_\d+$"  # anchored so that only whole names of the form replicate_<n> match
 
 # Simple random sample
 apisrs = load_data("apisrs") # Load API dataset
 srs = SurveyDesign(apisrs, weights = :pw)
+unitrange = UnitRange((length(names(apisrs)) + 1):(TOTAL_REPLICATES + length(names(apisrs))))
 bsrs = srs |> bootweights # Create replicate design
+bsrs_direct = ReplicateDesign(bsrs.data, REPLICATES_VECTOR, weights = :pw)  # using ReplicateDesign constructor
+bsrs_unitrange = ReplicateDesign(bsrs.data, unitrange, weights = :pw)  # using ReplicateDesign constructor
+bsrs_regex = ReplicateDesign(bsrs.data, REPLICATES_REGEX, weights = :pw)  # using ReplicateDesign constructor
+
 # Stratified sample
 apistrat = load_data("apistrat") # Load API dataset
 dstrat = SurveyDesign(apistrat, strata = :stype, weights = :pw) # Create SurveyDesign
+unitrange = UnitRange((length(names(apistrat)) + 1):(TOTAL_REPLICATES + length(names(apistrat))))
 bstrat = dstrat |> bootweights # Create replicate design
+bstrat_direct = ReplicateDesign(bstrat.data, REPLICATES_VECTOR, strata=:stype, weights=:pw)  # using ReplicateDesign constructor
+bstrat_unitrange = ReplicateDesign(bstrat.data, unitrange, strata=:stype, weights=:pw)  # using ReplicateDesign constructor
+bstrat_regex = ReplicateDesign(bstrat.data, REPLICATES_REGEX, strata=:stype, weights=:pw)  # using ReplicateDesign constructor
 
 # One-stage cluster sample
 apiclus1 = load_data("apiclus1") # Load API dataset
 apiclus1[!, :pw] = fill(757 / 15, (size(apiclus1, 1),)) # Correct api mistake for pw column
 dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) # Create SurveyDesign
+unitrange = UnitRange((length(names(apiclus1)) + 1):(TOTAL_REPLICATES + length(names(apiclus1))))
 dclus1_boot = dclus1 |> bootweights # Create replicate design
+dclus1_boot_direct = ReplicateDesign(dclus1_boot.data, REPLICATES_VECTOR, clusters=:dnum, weights=:pw)  # using ReplicateDesign constructor
+dclus1_boot_unitrange = ReplicateDesign(dclus1_boot.data, unitrange, clusters=:dnum, weights=:pw)  # using ReplicateDesign constructor
+dclus1_boot_regex = ReplicateDesign(dclus1_boot.data, REPLICATES_REGEX, clusters=:dnum, weights=:pw)  # using ReplicateDesign constructor
 
 @testset "Survey.jl" begin
     @test size(load_data("apiclus1")) == (183, 40)

diff --git a/test/show.jl b/test/show.jl
@@ -23,6 +23,7 @@
     sampsize: [200, 200, 200  …  200]
     weights: [30.97, 30.97, 30.97  …  30.97]
     allprobs: [0.0323, 0.0323, 0.0323  …  0.0323]
+    type: bootstrap
     replicates: 4000"""
 
     show(io, MIME("text/plain"), bsrs)
@@ -58,6 +59,7 @@ end
     sampsize: [100, 100, 100  …  50]
     weights: [44.21, 44.21, 44.21  …  15.1]
     allprobs: [0.0226, 0.0226, 0.0226  …  0.0662]
+    type: bootstrap
     replicates: 4000"""
 
     show(io, MIME("text/plain"), bstrat)
@@ -93,6 +95,7 @@ end
     sampsize: [15, 15, 15  …  15]
     weights: [50.4667, 50.4667, 50.4667  …  50.4667]
     allprobs: [0.0198, 0.0198, 0.0198  …  0.0198]
+    type: bootstrap
     replicates: 4000"""
 
     show(io, MIME("text/plain"), dclus1_boot)