Merge pull request #172 from smishr/singledesign_testing

Update survey design, add tests, remove extra quantile, add `design.weights`
xKDR · Jan 16, 2023 · e12c9e5 · e12c9e5
2 parents 0fb80c0 + 5275be9
commit e12c9e5
Show file tree

Hide file tree

Showing 14 changed files with 235 additions and 99 deletions.
diff --git a/.gitignore b/.gitignore
@@ -8,4 +8,4 @@
 /dev/*
 .gitignore
 .DS_Store
-*.json
+*.json
diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl
@@ -25,21 +25,22 @@ individuals in one cluster are sampled. The clusters are considered disjoint and
 - `strata::Union{Nothing, Symbol}=nothing`: the stratification variable.
 - `clusters::Union{Nothing, Symbol, Vector{Symbol}}=nothing`: the clustering variable.
 - `weights::Union{Nothing, Symbol}=nothing`: the sampling weights.
-- `popsize::Union{Nothing, Int, Symbol}=nothing`: the (expected) survey population size.
+- `popsize::Union{Nothing, Symbol}=nothing`: the (expected) survey population size.
 
 ```jldoctest
-julia> apistrat = load_data("apistrat");
+julia> apiclus1 = load_data("apiclus1");
 
-julia> strat = SurveyDesign(apistrat; strata=:stype, weights=:pw)
+julia> dclus1 = SurveyDesign(apiclus1; clusters=:dnum, strata=:stype, weights=:pw)
 SurveyDesign:
-data: 200×46 DataFrame
+data: 183×43 DataFrame
 strata: stype
-    [E, E, E  …  H]
-cluster: none
-popsize: [6190.0, 6190.0, 6190.0  …  6190.0]
-sampsize: [200, 200, 200  …  200]
-weights: [44.2, 44.2, 44.2  …  15.1]
-probs: [0.0226, 0.0226, 0.0226  …  0.0662]
+    [H, E, E  …  E]
+cluster: dnum
+    [637, 637, 637  …  448]
+popsize: [507.7049, 507.7049, 507.7049  …  507.7049]
+sampsize: [15, 15, 15  …  15]
+weights: [33.847, 33.847, 33.847  …  33.847]
+allprobs: [0.0295, 0.0295, 0.0295  …  0.0295]
 ```
 """
 struct SurveyDesign <: AbstractSurveyDesign
@@ -48,14 +49,15 @@ struct SurveyDesign <: AbstractSurveyDesign
     popsize::Symbol
     sampsize::Symbol
     strata::Symbol
-    pps::Bool
+    weights::Symbol # Effective weights in case of singlestage approx supported
+    allprobs::Symbol # Right now only singlestage approx supported
+    pps::Bool # TODO functionality
     # Single stage clusters sample, like apiclus1
-    function SurveyDesign(
-		data::AbstractDataFrame;
-        strata::Union{Nothing, Symbol}=nothing,
-        clusters::Union{Nothing, Symbol, Vector{Symbol}}=nothing,
-        weights::Union{Nothing, Symbol}=nothing,
-        popsize::Union{Nothing, Int, Symbol}=nothing
+    function SurveyDesign(data::AbstractDataFrame;
+        clusters::Union{Nothing,Symbol,Vector{Symbol}}=nothing,
+        strata::Union{Nothing,Symbol}=nothing,
+        popsize::Union{Nothing,Symbol}=nothing,
+        weights::Union{Nothing,Symbol}=nothing
     )
         # sampsize here is number of clusters completely sampled, popsize is total clusters in population
         if typeof(strata) <: Nothing
@@ -73,24 +75,34 @@ struct SurveyDesign <: AbstractSurveyDesign
         if typeof(clusters) <: Symbol
             cluster = clusters
         end
-        # For one-stage sample only one sampsize vector
-        sampsize_labels = :sampsize
-        data[!, sampsize_labels] = fill(length(unique(data[!, cluster])), (nrow(data),))
-        if !(typeof(popsize) <: Nothing)
-            data[!, :weights] = data[!, popsize] ./ data[!, sampsize_labels]
-        elseif !(typeof(weights) <: Nothing)
-            data.weights = data[!, weights]
+        # For single-stage approximation only one "effective" sampsize vector
+        sampsize_labels = :_sampsize
+        if isa(strata,Symbol) && isnothing(clusters) # If stratified only then sampsize is inside strata
+            data[!, sampsize_labels] = transform(groupby(data, strata), nrow => :counts).counts
         else
-            data.weights = repeat([1], nrow(data))
+            data[!, sampsize_labels] = fill(length(unique(data[!, cluster])), (nrow(data),))
         end
-        data[!, :probs] = 1 ./ data[!, :weights] # Many formulae are easily defined in terms of sampling probabilties
-        data[!, :allprobs] = data[!, :probs] # In one-stage cluster sample, allprobs is just probs, no multiplication needed
-        pps = false
-        if !(typeof(popsize) <: Symbol)
-            data.popsize = repeat([sum(data.weights)], nrow(data))
-            popsize = :popsize
+        if isa(popsize, Symbol)
+            weights_labels = :_weights
+            data[!, weights_labels] = data[!, popsize] ./ data[!, sampsize_labels]
+        elseif isa(weights, Symbol)
+            if !(typeof(data[!, weights]) <: Vector{<:Real})
+                throw(ArgumentError(string("given weights column ", weights , " is not of numeric type")))
+            else
+                # derive popsize from given `weights`
+                weights_labels = weights
+                popsize = :_popsize
+                data[!, popsize] = data[!, sampsize_labels] .* data[!, weights_labels]
+            end
+        else
+            # neither popsize nor weights given
+            weights_labels = :_weights
+            data[!, weights_labels] = repeat([1], nrow(data))
         end
-        new(data, cluster, popsize, sampsize_labels, strata, pps)
+        allprobs_labels = :allprobs
+        data[!, allprobs_labels] = 1 ./ data[!, weights_labels] # In one-stage cluster sample, allprobs is just probs, no multiplication needed
+        pps = false # for now no explicit pps supported faster functions, but they can be added
+        new(data, cluster, popsize, sampsize_labels, strata, weights_labels, allprobs_labels, pps)
     end
 end
 
@@ -106,14 +118,14 @@ julia> strat = SurveyDesign(apistrat; strata=:stype, weights=:pw);
 
 julia> bootstrat = bootweights(strat; replicates=1000)
 ReplicateDesign:
-data: 200×1046 DataFrame
+data: 200×1044 DataFrame
 strata: stype
     [E, E, E  …  H]
 cluster: none
-popsize: [6190.0, 6190.0, 6190.0  …  6190.0]
-sampsize: [200, 200, 200  …  200]
-weights: [44.2, 44.2, 44.2  …  15.1]
-probs: [0.0226, 0.0226, 0.0226  …  0.0662]
+popsize: [4420.9999, 4420.9999, 4420.9999  …  755.0]
+sampsize: [100, 100, 100  …  50]
+weights: [44.21, 44.21, 44.21  …  15.1]
+allprobs: [0.0226, 0.0226, 0.0226  …  0.0662]
 replicates: 1000
 ```
 """
@@ -123,6 +135,8 @@ struct ReplicateDesign <: AbstractSurveyDesign
     popsize::Symbol
     sampsize::Symbol
     strata::Symbol
+    weights::Symbol # Effective weights in case of singlestage approx supported
+    allprobs::Symbol # Right now only singlestage approx supported
     pps::Bool
     replicates::UInt
 end
diff --git a/src/bootstrap.jl b/src/bootstrap.jl
@@ -4,18 +4,18 @@ julia> using Random
 
 julia> apiclus1 = load_data("apiclus1");
 
-julia> clus_one_stage = SurveyDesign(apiclus1; clusters = :dnum);
+julia> clus_one_stage = SurveyDesign(apiclus1; clusters = :dnum, popsize=:fpc);
 
 julia> bootweights(clus_one_stage; replicates=1000, rng=MersenneTwister(111)) # choose a seed for deterministic results
 ReplicateDesign:
-data: 183×1046 DataFrame
+data: 183×1044 DataFrame
 strata: none
 cluster: dnum
     [637, 637, 637  …  448]
-popsize: [183, 183, 183  …  183]
+popsize: [757, 757, 757  …  757]
 sampsize: [15, 15, 15  …  15]
-weights: [1, 1, 1  …  1]
-probs: [1.0, 1.0, 1.0  …  1.0]
+weights: [50.4667, 50.4667, 50.4667  …  50.4667]
+allprobs: [0.0198, 0.0198, 0.0198  …  0.0198]
 replicates: 1000
 ```
 """
@@ -34,7 +34,7 @@ function bootweights(design::SurveyDesign; replicates=4000, rng=MersenneTwister(
             rh = [(count(==(i), randinds)) for i in 1:nh] # main bootstrap algo. 
             gdf = groupby(substrata, design.cluster)
             for i in 1:nh
-                gdf[i].whij = repeat([rh[i]], nrow(gdf[i])) .* gdf[i].weights .* (nh / (nh - 1))
+                gdf[i].whij = repeat([rh[i]], nrow(gdf[i])) .* gdf[i][!,design.weights] .* (nh / (nh - 1))
             end            
             stratified[h].whij = transform(gdf).whij
 
@@ -47,5 +47,5 @@ function bootweights(design::SurveyDesign; replicates=4000, rng=MersenneTwister(
     for i in 2:(replicates)
         df[!, "replicate_" * string(i)] = disallowmissing(replicate(stratified, H).whij)
     end 
-    return ReplicateDesign(df, design.cluster, design.popsize, design.sampsize, design.strata, design.pps, replicates) 
+    return ReplicateDesign(df, design.cluster, design.popsize, design.sampsize, design.strata, design.weights, design.allprobs, design.pps, replicates) 
 end
diff --git a/src/by.jl b/src/by.jl
@@ -1,7 +1,7 @@
 function bydomain(x::Symbol, domain::Symbol, design::ReplicateDesign, func::Function)
     gdf = groupby(design.data, domain)
     nd = length(unique(design.data[!, domain]))
-    X = combine(gdf, [x, :weights] => ((a, b) -> func(a, weights(b))) => :statistic)
+    X = combine(gdf, [x, design.weights] => ((a, b) -> func(a, weights(b))) => :statistic)
     Xt_mat = Array{Float64, 2}(undef, (nd, design.replicates))
     for i in 1:design.replicates
         Xt_mat[:, i] = combine(gdf, [x, Symbol("replicate_"*string(i))] => ((a, c) -> func(a, weights(c))) => :statistic).statistic

diff --git a/src/hist.jl b/src/hist.jl
@@ -74,7 +74,7 @@ function hist(design::AbstractSurveyDesign, var::Symbol,
 				 kwargs...
     			)
 	hist = histogram(bins = bins, normalization = normalization, kwargs...)
-	data(design.data) * mapping(var, weights = :weights) * hist |> draw
+	data(design.data) * mapping(var, weights = design.weights) * hist |> draw
 end
 
 function hist(design::AbstractSurveyDesign, var::Symbol,

diff --git a/src/mean.jl b/src/mean.jl
@@ -10,8 +10,8 @@ julia> clus_one_stage = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw)
 
 julia> mean(:api00, clus_one_stage)
 1×2 DataFrame
- Row │ mean     SE
-     │ Float64  Float64
+ Row │ mean     SE      
+     │ Float64  Float64 
 ─────┼──────────────────
    1 │ 644.169  23.2919
 
@@ -25,7 +25,7 @@ julia> mean([:api00, :enroll], clus_one_stage)
 ```
 """
 function mean(x::Symbol, design::ReplicateDesign)
-    X = mean(design.data[!, x], weights(design.data.weights))
+    X = mean(design.data[!, x], weights(design.data[!,design.weights]))
     Xt = [mean(design.data[!, x], weights(design.data[! , "replicate_"*string(i)])) for i in 1:design.replicates]
     variance = sum((Xt .- X).^2) / design.replicates
     DataFrame(mean = X, SE = sqrt(variance))

diff --git a/src/plot.jl b/src/plot.jl
@@ -16,5 +16,5 @@ save("scatter.png", s); nothing # hide
 ![](assets/scatter.png)
 """
 function plot(design::AbstractSurveyDesign, x::Symbol, y::Symbol; kwargs...)
-    data(design.data) * mapping(x, y, markersize = :weights) * visual(Scatter, marker = '￮') |> draw
+    data(design.data) * mapping(x, y, markersize = design.weights) * visual(Scatter, marker = '￮') |> draw
 end
diff --git a/src/quantile.jl b/src/quantile.jl
@@ -37,17 +37,8 @@ julia> quantile(:enroll,srs,[0.1,0.2,0.5,0.75,0.95])
 function quantile(var::Symbol, design::SurveyDesign, p::Union{<:Real,Vector{<:Real}}; 
     alpha::Float64=0.05, ci::Bool=false, se::Bool=false, qrule="hf7",kwargs...)
     v = design.data[!, var]
-    probs = design.data[!, :probs]
+    probs = design.data[!, design.allprobs]
     df = DataFrame(probability=p, quantile=Statistics.quantile(v, ProbabilityWeights(probs), p))
     # TODO: Add CI and SE of the quantile
     return df
-end
-
-function quantile(var::Symbol, design::SurveyDesign, p::Union{<:Real,Vector{<:Real}}; 
-    alpha::Float64=0.05, ci::Bool=false, se::Bool=false, qrule="hf7",kwargs...)
-    v = design.data[!, var]
-    probs = design.data[!, :probs]
-    df = DataFrame(probability=p, quantile=Statistics.quantile(v, ProbabilityWeights(probs), p)) # Not sure which quantile defintion this returns
-    # TODO: Add CI and SE of the quantile
-    return df
 end
diff --git a/src/ratio.jl b/src/ratio.jl
@@ -17,14 +17,14 @@ julia> ratio(:api00, :enroll, clus_one_stage)
 ```
 """
 function ratio(variable_num:: Symbol, variable_den:: Symbol, design::SurveyDesign)
-    statistic = wsum(design.data[!,variable_num],design.data.weights)/wsum(design.data[!,variable_den],design.data.weights)
+    statistic = wsum(design.data[!,variable_num],design.data[!,design.weights])/wsum(design.data[!,variable_den],design.data[!,design.weights])
     nh = length(unique(design.data[!,design.cluster]))
     newv = []
     gdf = groupby(design.data, design.cluster)
     replicates = [filter(n -> n != i, 1:nh) for i in 1:nh] 
     for i in replicates
         df = DataFrame(gdf[i])
-        push!(newv, wsum(df[!,variable_num],df[!,:weights])/wsum(df[!,variable_den],df[!,:weights]))
+        push!(newv, wsum(df[!,variable_num],df[!,design.weights])/wsum(df[!,variable_den],df[!,design.weights]))
     end
     c = 0
     for i in 1:nh

diff --git a/src/show.jl b/src/show.jl
@@ -5,7 +5,7 @@ Helper function that transforms a given `Number` or `Vector` into a short-form s
 """
 function makeshort(x)
     if isa(x[1], Float64)
-        x = round.(x, sigdigits=3)
+        x = round.(x, digits=4) # Rounded to 4 digits after the decimal place
     end
     # print short vectors or single values as they are, compress otherwise
     if length(x) > 1
@@ -56,6 +56,6 @@ function surveyshow(io::IO, design::AbstractSurveyDesign)
     printinfo(io, "popsize", makeshort(design.data[!, design.popsize]))
     printinfo(io, "sampsize", makeshort(design.data[!, design.sampsize]))
     # weights and probs info
-    printinfo(io, "weights", makeshort(design.data[!, :weights]))
-    printinfo(io, "probs", makeshort(design.data[!, :probs]); newline=false)
+    printinfo(io, "weights", makeshort(design.data[!, design.weights]))
+    printinfo(io, "allprobs", makeshort(design.data[!, design.allprobs]); newline=false)
 end
diff --git a/src/total.jl b/src/total.jl
@@ -25,7 +25,7 @@ julia> total([:api00, :enroll], clus_one_stage)
 ```
 """
 function total(x::Symbol, design::ReplicateDesign)
-    X = wsum(design.data[!, x], weights(design.data.weights))
+    X = wsum(design.data[!, x], weights(design.data[!,design.weights]))
     Xt = [wsum(design.data[!, x], weights(design.data[! , "replicate_"*string(i)])) for i in 1:design.replicates]
     variance = sum((Xt .- X).^2) / design.replicates
     DataFrame(total = X, SE = sqrt(variance))
-Original file line number
+Diff line change
@@ Expand Up / @@ -8,4 +8,4 @@ @@
     /dev/*
     .gitignore
     .DS_Store
-    *.json
+    *.json