xKDR · smishr · Jan 10, 2023 · Jan 4, 2023 · Jan 5, 2023 · Jan 5, 2023
diff --git a/Project.toml b/Project.toml
@@ -10,6 +10,7 @@ CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
 CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"

diff --git a/docs/src/api.md b/docs/src/api.md
@@ -7,35 +7,24 @@ Module = [Survey]
 Order = [:type, :function]
 Private = false
 ```
-Survey data can be loaded from a `DataFrame` into a survey design. The package currently supports simple random sample and stratified sample designs. 
-```@docs
-AbstractSurveyDesign
-SimpleRandomSample
-StratifiedSample
-```
 
 ```@docs
+AbstractSurveyDesign
+SurveyDesign
+ReplicateDesign
 load_data
-Survey.mean(x::Symbol, design::SimpleRandomSample)
-total(x::Symbol, design::SimpleRandomSample)
+bootweights
+mean(x::Symbol, design::ReplicateDesign)
+mean(x::Symbol, domain::Symbol, design::ReplicateDesign)
+total(x::Symbol, design::ReplicateDesign)
+total(x::Symbol, domain::Symbol, design::ReplicateDesign)
 quantile
-```
-
-It is often required to estimate population parameters for sub-populations of interest. For example, you may have a sample of heights, but you want the average heights of males and females separately. 
-```@docs
-mean(x::Symbol, by::Symbol, design::SimpleRandomSample) 
-total(x::Symbol, by::Symbol, design::SimpleRandomSample) 
-```
-```@docs
-ratio(variable_num:: Symbol, variable_den:: Symbol, design::SurveyDesign)
+ratio(variable_num::Symbol, variable_den::Symbol, design::SurveyDesign)
 plot(design::AbstractSurveyDesign, x::Symbol, y::Symbol; kwargs...)
 boxplot(design::AbstractSurveyDesign, x::Symbol, y::Symbol; kwargs...)
 hist(design::AbstractSurveyDesign, var::Symbol,
-				 bins::Union{Integer, AbstractVector} = freedman_diaconis(design, var);
-				 normalization = :density,
-				 kwargs...
-    			)
-dim(design::AbstractSurveyDesign)
-dimnames(design::AbstractSurveyDesign)
-colnames(design::AbstractSurveyDesign)
+                 bins::Union{Integer, AbstractVector} = freedman_diaconis(design, var);
+                 normalization = :density,
+                 kwargs...
+                )
 ```
diff --git a/docs/src/index.md b/docs/src/index.md
@@ -18,45 +18,14 @@ with at least 100 students and for various probability samples of the data.
 The API program has been discontinued at the end of 2018. Information is archived at
 [https://www.cde.ca.gov/re/pr/api.asp](https://www.cde.ca.gov/re/pr/api.asp)
 
-Firstly, a survey design needs a dataset from which to gather information. 
-
-
-The sample datasets provided with the package can be loaded as `DataFrames` using the `load_data` function:
+Firstly, a survey design needs a dataset from which to gather information. The sample
+datasets provided with the package can be loaded as `DataFrame`s using [`load_data`](@ref):
 
 ```julia
 julia> apisrs = load_data("apisrs");
 ```
-`apisrs` is a simple random sample of the Academic Performance Index of Californian schools.
-
-Next, we can build a design. The design corresponding to a simple random sample is [`SimpleRandomSample`](@ref), which can be instantiated by calling the constructor:
-
-```julia
-julia> srs = SimpleRandomSample(apisrs; weights = :pw)
-SimpleRandomSample:
-data: 200x42 DataFrame
-weights: 31.0, 31.0, 31.0, ..., 31.0
-probs: 0.0323, 0.0323, 0.0323, ..., 0.0323
-fpc: 6194, 6194, 6194, ..., 6194
-popsize: 6194
-sampsize: 200
-sampfraction: 0.0323
-ignorefpc: false
-```
 
-With a `SimpleRandomSample` (as well as with any subtype of [`AbstractSurveyDesign`](@ref)) it is possible to calculate estimates of the mean, population total, etc., for a given variable, along with the corresponding standard errors.
+`apisrs` is a simple random sample of the Academic Performance Index of Californian schools.
 
-```julia
-julia> mean(:api00, srs)
-1×2 DataFrame
- Row │ mean     sem     
-     │ Float64  Float64 
-─────┼──────────────────
-   1 │ 656.585  9.24972
-
-julia> total(:api00, srs)
-1×2 DataFrame
- Row │ total      se_total 
-     │ Float64    Float64  
-─────┼─────────────────────
-   1 │ 4.06689e6   57292.8
-```
+Next, we can build a design.
+#TODO: continue tutorial
diff --git a/src/Survey.jl b/src/Survey.jl
@@ -11,6 +11,7 @@ using CairoMakie
 using AlgebraOfGraphics
 using CategoricalArrays
 using Random
+using Missings
 
 include("SurveyDesign.jl")
 include("bootstrap.jl")
@@ -27,12 +28,11 @@ include("ratio.jl")
 include("by.jl")
 
 export load_data
-export AbstractSurveyDesign, SimpleRandomSample, StratifiedSample
-export SurveyDesign
+export AbstractSurveyDesign, SurveyDesign, ReplicateDesign
 export dim, colnames, dimnames
 export mean, total, quantile
 export plot
-export hist
+export hist, sturges, freedman_diaconis
 export boxplot
 export bootweights
 export jkknife

diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl
@@ -13,35 +13,34 @@ abstract type AbstractSurveyDesign end
 """
     SurveyDesign <: AbstractSurveyDesign
 
-Survey design sampled by one stage clusters sampling.
-Clusters chosen by SRS followed by complete sampling of selected clusters.
-Assumes each individual in one and only one clusters; disjoint and nested clusters.
+General survey design encompassing a simple random, stratified, cluster or multi-stage design.
 
-`clusters` must be specified as a Symbol name of a column in `data`.
+In the case of cluster sample, the clusters are chosen by simple random sampling. All
+individuals in one cluster are sampled. The clusters are considered disjoint and nested.
 
-# Arguments:
-`data::AbstractDataFrame`: the survey dataset (!this gets modified by the constructor).
-`clusters::Symbol`: the stratification variable - must be given as a column in `data`.
-`popsize::Union{Nothing,Symbol,<:Unsigned,Vector{<:Real}}=nothing`: the (expected) survey population size. For 
+`strata` and `clusters` must be given as columns in `data`.
 
-`weights::Union{Nothing,Symbol,Vector{<:Real}}=nothing`: the sampling weights.
+# Arguments:
+- `data::AbstractDataFrame`: the survey dataset (!this gets modified by the constructor).
+- `strata::Union{Nothing, Symbol}=nothing`: the stratification variable.
+- `clusters::Union{Nothing, Symbol, Vector{Symbol}}=nothing`: the clustering variable.
+- `weights::Union{Nothing, Symbol}=nothing`: the sampling weights.
+- `popsize::Union{Nothing, Int, Symbol}=nothing`: the (expected) survey population size.
 
 ```jldoctest
 julia> apiclus1 = load_data("apiclus1");
 
-julia> apiclus1[!, :pw] = fill(757/15,(size(apiclus1,1),)); # Correct api mistake for pw column
-
-julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw)
+julia> dclus1 = SurveyDesign(apiclus1; clusters=:dnum, weights=:pw)
 SurveyDesign:
 data: 183x46 DataFrame
 cluster: dnum
 design.data[!,design.cluster]: 637, 637, 637, ..., 448
 popsize: popsize
-design.data[!,design.popsize]: 9240.0, 9240.0, 9240.0, ..., 9240.0
+design.data[!,design.popsize]: 6190.0, 6190.0, 6190.0, ..., 6190.0
 sampsize: sampsize
 design.data[!,design.sampsize]: 15, 15, 15, ..., 15
-design.data[!,:probs]: 0.0198, 0.0198, 0.0198, ..., 0.0198
-design.data[!,:allprobs]: 0.0198, 0.0198, 0.0198, ..., 0.0198
+design.data[!,:probs]: 0.0295, 0.0295, 0.0295, ..., 0.0295
+design.data[!,:allprobs]: 0.0295, 0.0295, 0.0295, ..., 0.0295
 ```
 """
 struct SurveyDesign <: AbstractSurveyDesign
@@ -52,9 +51,15 @@ struct SurveyDesign <: AbstractSurveyDesign
     strata::Symbol
     pps::Bool
     # Single stage clusters sample, like apiclus1
-    function SurveyDesign(data::AbstractDataFrame; strata::Union{Nothing,Symbol} = nothing, weights::Union{Nothing,Symbol}= nothing, clusters::Union{Nothing, Symbol, Vector{Symbol}} = nothing, popsize::Union{Nothing, Int,Symbol}=nothing) 
+    function SurveyDesign(
+		data::AbstractDataFrame;
+        strata::Union{Nothing, Symbol}=nothing,
+        clusters::Union{Nothing, Symbol, Vector{Symbol}}=nothing,
+        weights::Union{Nothing, Symbol}=nothing,
+        popsize::Union{Nothing, Int, Symbol}=nothing
+    )
         # sampsize here is number of clusters completely sampled, popsize is total clusters in population
-        if typeof(strata) <:Nothing
+        if typeof(strata) <: Nothing
             data.false_strata = repeat(["FALSE_STRATA"], nrow(data))
             strata = :false_strata
         end
@@ -71,7 +76,7 @@ struct SurveyDesign <: AbstractSurveyDesign
         end
         # For one-stage sample only one sampsize vector
         sampsize_labels = :sampsize
-        data[!, sampsize_labels] = fill(length(unique(data[!, cluster])),(nrow(data),))
+        data[!, sampsize_labels] = fill(length(unique(data[!, cluster])), (nrow(data),))
         if !(typeof(popsize) <: Nothing)
             data[!, :weights] = data[!, popsize] ./ data[!, sampsize_labels]
         elseif !(typeof(weights) <: Nothing)
@@ -91,24 +96,26 @@ struct SurveyDesign <: AbstractSurveyDesign
 end
 
 """
-```jldoctest
-julia> apiclus1 = load_data("apiclus1");
+    ReplicateDesign <: AbstractSurveyDesign
 
-julia> apiclus1[!, :pw] = fill(757/15,(size(apiclus1,1),)); # Correct api mistake for pw column
+Survey design obtained by replicating an original design using [`bootweights`](@ref).
 
-julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw); 
+```jldoctest
+julia> apistrat = load_data("apistrat");
 
-julia> bclus1 = Survey.bootweights(dclus1; replicates = 1000)
-Survey.ReplicateDesign:
-data: 183x1046 DataFrame
-cluster: dnum
-design.data[!,design.cluster]: 637, 637, 637, ..., 448
+julia> strat = SurveyDesign(apistrat; strata=:stype, weights=:pw);
+
+julia> bootstrat = bootweights(strat; replicates=1000)
+ReplicateDesign:
+data: 200x1046 DataFrame
+cluster: false_cluster
+design.data[!,design.cluster]: 1, 2, 3, ..., 200
 popsize: popsize
-design.data[!,design.popsize]: 9240.0, 9240.0, 9240.0, ..., 9240.0
+design.data[!,design.popsize]: 6190.0, 6190.0, 6190.0, ..., 6190.0
 sampsize: sampsize
-design.data[!,design.sampsize]: 15, 15, 15, ..., 15
-design.data[!,:probs]: 0.0198, 0.0198, 0.0198, ..., 0.0198
-design.data[!,:allprobs]: 0.0198, 0.0198, 0.0198, ..., 0.0198
+design.data[!,design.sampsize]: 200, 200, 200, ..., 200
+design.data[!,:probs]: 0.0226, 0.0226, 0.0226, ..., 0.0662
+design.data[!,:allprobs]: 0.0226, 0.0226, 0.0226, ..., 0.0662
 replicates: 1000
 ```
 """

diff --git a/src/bootstrap.jl b/src/bootstrap.jl
@@ -1,15 +1,15 @@
 """
 ```jldoctest
-julia> using Survey, Random;
+julia> using Random
 
-julia> apiclus1 = load_data("apiclus1"); 
+julia> apiclus1 = load_data("apiclus1");
 
-julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum); 
+julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum);
 
-julia> rng = MersenneTwister(111); 
+julia> rng = MersenneTwister(111);
 
-julia> Survey.bootweights(dclus1; replicates=1000, rng) 
-Survey.ReplicateDesign:
+julia> bootweights(dclus1; replicates=1000, rng)
+ReplicateDesign:
 data: 183x1046 DataFrame
 cluster: dnum
 design.data[!,design.cluster]: 637, 637, 637, ..., 448
@@ -22,31 +22,33 @@ design.data[!,:allprobs]: 1.0, 1.0, 1.0, ..., 1.0
 replicates: 1000
 ```
 """
-function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwister(1234))
+function bootweights(design::SurveyDesign; replicates=4000, rng=MersenneTwister(1234))
     H = length(unique(design.data[!, design.strata]))
     stratified = groupby(design.data, design.strata)
     function replicate(stratified, H)
-        for j in 1:H
-            substrata = DataFrame(stratified[j])
+        for h in 1:H
+            substrata = DataFrame(stratified[h])
             psus = unique(substrata[!, design.cluster])
-            if length(psus) == 1
-                return DataFrame(statistic = X, SE = 0)
+            if length(psus) <= 1
+                stratified[h].whij .= 0 # hasn't been tested yet. 
             end
             nh = length(psus)
             randinds = rand(rng, 1:(nh), (nh-1)) # Main bootstrap algo. Draw nh-1 out of nh, with replacement.  
             rh = [(count(==(i), randinds)) for i in 1:nh] # main bootstrap algo. 
             gdf = groupby(substrata, design.cluster)
             for i in 1:nh
-                gdf[i].rh = repeat([rh[i]], nrow(gdf[i]))
-            end
-            stratified[j].rh = DataFrame(gdf).rh
+                gdf[i].whij = repeat([rh[i]], nrow(gdf[i])) .* gdf[i].weights .* (nh / (nh - 1))
+            end            
+            stratified[h].whij = transform(gdf).whij
+
         end
-        return DataFrame(stratified)
+        return transform(stratified, :whij)
     end
     df = replicate(stratified, H)
-    rename!(df,:rh => :replicate_1)
+    rename!(df, :whij => :replicate_1)
+    df.replicate_1 = disallowmissing(df.replicate_1)
     for i in 2:(replicates)
-        df[!, "replicate_"*string(i)] = Float64.(replicate(stratified, H).rh)
+        df[!, "replicate_" * string(i)] = disallowmissing(replicate(stratified, H).whij)
     end 
     return ReplicateDesign(df, design.cluster, design.popsize, design.sampsize, design.strata, design.pps, replicates) 
 end
diff --git a/src/by.jl b/src/by.jl
@@ -4,7 +4,7 @@ function bydomain(x::Symbol, domain::Symbol, design::ReplicateDesign, func::Func
     X = combine(gdf, [x, :weights] => ((a, b) -> func(a, weights(b))) => :statistic)
     Xt_mat = Array{Float64, 2}(undef, (nd, design.replicates))
     for i in 1:design.replicates
-        Xt_mat[:, i] = combine(gdf, [x, :weights, Symbol("replicate_"*string(i))] => ((a, b, c) -> func(a, weights(b .* c))) => :statistic).statistic
+        Xt_mat[:, i] = combine(gdf, [x, Symbol("replicate_"*string(i))] => ((a, c) -> func(a, weights(c))) => :statistic).statistic
     end
     ses = []
     for i in 1:nd