Merge pull request #53 from smishr/design_update

Improved SRS Design with args error checking, better types
xKDR · Sep 21, 2022 · 06f9bbd · 06f9bbd
2 parents aa0715a + 1ef381d
commit 06f9bbd
Show file tree

Hide file tree

Showing 14 changed files with 274 additions and 157 deletions.
diff --git a/Project.toml b/Project.toml
@@ -7,6 +7,7 @@ version = "0.11.1"
 AlgebraOfGraphics = "cbdf2221-f076-402e-a563-3d30da359d67"
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
 CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
+CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 GLM = "38e38edf-8417-5370-95a0-9cbb8c7f171a"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"

diff --git a/src/Survey.jl b/src/Survey.jl
@@ -8,6 +8,7 @@ using GLM
 using LinearAlgebra
 using CairoMakie
 using AlgebraOfGraphics
+using CategoricalArrays # For CategoricalArray support
 
 include("SurveyDesign.jl")
 include("svydesign.jl")

diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl
@@ -2,13 +2,13 @@
 function print_short(x)
     # write floats in short form
     if isa(x[1], Float64)
-        x = round.(x, sigdigits = 3)
+        x = round.(x, sigdigits=3)
     end
     # print short vectors or single values as they are, compress otherwise
     if length(x) < 3
         print(x)
     else
-        print( x[1], ", ", x[2], ", ", x[3], " ... ", last(x))
+        print(x[1], ", ", x[2], ", ", x[3], " ... ", last(x))
     end
 end
 
@@ -20,63 +20,114 @@ The data to a survey constructor is modified. To avoid this pass a copy of the d
 instead of the original.
 """
 abstract type AbstractSurveyDesign end
-
 """
     SimpleRandomSample <: AbstractSurveyDesign
 
 Survey design sampled by simple random sampling.
-
-The population size is equal to the sample size unless `popsize` is explicitly provided.
 """
 struct SimpleRandomSample <: AbstractSurveyDesign
     data::AbstractDataFrame
-    sampsize::UInt
-    popsize::Union{UInt,Nothing}
+    sampsize::Union{Nothing,Unsigned}
+    popsize::Union{Nothing,Unsigned}
     sampfraction::Real
     fpc::Real
     ignorefpc::Bool
     function SimpleRandomSample(data::AbstractDataFrame;
-                                popsize = nothing,
-                                sampsize = nrow(data),
-                                weights = ones(nrow(data)), # Check the defaults
-                                probs = nothing,
-                                ignorefpc = true
-                                )
+        popsize=nothing,
+        sampsize=nrow(data),
+        weights=nothing, # Check the defaults
+        probs=nothing,
+        ignorefpc=false
+    )
+        # Functionality: weights arg can be passed as Symbol instead of vector
         if isa(weights, Symbol)
             weights = data[!, weights]
         end
-        # set population size if it is not given; `weights` and `sampsize` must be given
+        # Set population size if it is not given; `weights` and `sampsize` must be given
+        if ignorefpc # && (isnothing(popsize) || isnothing(weights) || isnothing(probs))
+            @warn "Assuming equal weights"
+            weights = ones(nrow(data))
+        end
         if isnothing(popsize)
-            popsize = round(sum(weights)) |> UInt
+            if typeof(weights) <: Vector{<:Real}
+                if !all(y -> y == first(weights), weights) # SRS by definition is equi-weighted
+                    error("Simple Random Sample must be equi-weighted. Different sampling weights detected in vectors")
+                end
+            elseif isa(weights, Symbol)
+                weights = data[!, weights]
+                if !all(y -> y == first(weights), weights) # SRS by definition is equi-weighted
+                    error("Simple Random Sample must be equi-weighted. Different sampling weights detected in vectors")
+                end
+            elseif typeof(probs) <: Vector{<:Real}
+                if !all(y -> y == first(probs), probs) # SRS by definition is equi-weighted
+                    error("Simple Random Sample must be equi-weighted. Different sampling probabilities detected in vector")
+                end
+                weights = 1 ./ probs
+            elseif isa(probs, Symbol)
+                probs = data[!, probs]
+                if !all(y -> y == first(probs), probs) # SRS by definition is equi-weighted
+                    error("Simple Random Sample must be equi-weighted. Different sampling probabilities detected in vector")
+                end
+                weights = 1 ./ probs
+            end
+            # If all weights are equal then estimate
+            equal_weight = first(weights)
+            popsize = round(sampsize .* equal_weight) |> UInt
+            if sampsize > popsize
+                error("You have either given wrong or not enough keyword args. sampsize cannot be greate than popsize. Check given inputs. eg if weights given then popsize must be given (for now)")
+            end
+        elseif typeof(popsize) <: Vector{<:Real}
+            if !all(y -> y == first(popsize), popsize) # SRS by definition is equi-weighted
+                error("Simple Random Sample must be equi-weighted. Different sampling weights detected in vectors")
+            end
+            weights = popsize ./ sampsize # This line is ratio estimator, we may need to change it when doing compley surveys
+            popsize = first(popsize) |> UInt
+        else
+            error("If popsize not given then either sampling weights or sampling probabilities must be given")
         end
-        # add frequency weights column to `data`
-        data[!, :weights] = weights
-        # add probability weights column to `data`
-        data[!, :probs] = 1 ./ data[!, :weights]
         # set sampling fraction
-        sampfraction = sampsize / popsize
+        sampfraction = sampsize ./ popsize
         # set fpc
-        fpc = ignorefpc ? 1 : 1 - (sampsize / popsize)
-
+        fpc = ignorefpc ? 1 : 1 .- (sampsize ./ popsize)
+        # Add columns for weights and probs in data -- Check if really needed to add them as columns
+        if !isnothing(probs)
+            # add probability weights column to `data`
+            data[!, :probs] = probs
+            # add frequency weights column to `data`
+            data[!, :weights] = 1 ./ data[!, :probs]
+        else # else weights were specified
+            # add frequency weights column to `data`
+            data[!, :weights] = weights
+            # add probability weights column to `data`
+            data[!, :probs] = 1 ./ data[!, :weights]
+        end
         new(data, sampsize, popsize, sampfraction, fpc, ignorefpc)
     end
+    function SimpleRandomSample(data::AbstractDataFrame)
+        ignorefpc = true
+        return SimpleRandomSample(data; popsize=nothing,sampsize=nrow(data), weights=nothing, probs=nothing, ignorefpc=ignorefpc)
+    end
 end
 
 # `show` method for printing information about a `SimpleRandomSample` after construction
 function Base.show(io::IO, ::MIME"text/plain", design::SimpleRandomSample)
-    printstyled("Simple Random Sample:\n"; bold = true)
-    printstyled("data: "; bold = true)
+    printstyled("Simple Random Sample:\n"; bold=true)
+    printstyled("data: "; bold=true)
     print(size(design.data, 1), "x", size(design.data, 2), " DataFrame")
-    printstyled("\nweights: "; bold = true)
+    printstyled("\nweights: "; bold=true)
     print_short(design.data.weights)
-    printstyled("\nprobs: "; bold = true)
+    printstyled("\nprobs: "; bold=true)
     print_short(design.data.probs)
-    printstyled("\nfpc: "; bold = true)
+    printstyled("\nfpc: "; bold=true)
     print_short(design.fpc)
-    printstyled("\n    popsize: "; bold = true)
-    print(design.popsize)
-    printstyled("\n    sampsize: "; bold = true)
-    print(design.sampsize)
+    printstyled("\npopsize: "; bold=true)
+    print_short(design.popsize)
+    printstyled("\nsampsize: "; bold=true)
+    print_short(design.sampsize)
+    printstyled("\nsampfraction: "; bold=true)
+    print_short(design.sampfraction)
+    printstyled("\nignorefpc: "; bold=true)
+    print(design.ignorefpc)
 end
 
 """
@@ -91,7 +142,7 @@ struct StratifiedSample <: AbstractSurveyDesign
     sampfraction::Real
     fpc::Real
     nofpc::Bool
-    function StratifiedSample(data::DataFrame, strata::AbstractVector; weights = ones(nrow(data)), probs = 1 ./ weights)
+    function StratifiedSample(data::DataFrame, strata::AbstractVector; weights=ones(nrow(data)), probs=1 ./ weights)
         # add frequency weights, probability weights and sample size columns
         data[!, :weights] = weights
         data[!, :probs] = probs
@@ -100,26 +151,26 @@ struct StratifiedSample <: AbstractSurveyDesign
         data[!, :sampsize] = repeat([nrow(data)], nrow(data))
         data[!, :strata] = strata
 
-        new(data)
+        new(data,sampsize,popsize,sampfraction,fpc,nofpc)
     end
 end
 
 # `show` method for printing information about a `StratifiedSample` after construction
 function Base.show(io::IO, design::StratifiedSample)
-    printstyled("Stratified Sample:\n"; bold = true)
-    printstyled("data: "; bold = true)
+    printstyled("Stratified Sample:\n"; bold=true)
+    printstyled("data: "; bold=true)
     print(size(design.data, 1), "x", size(design.data, 2), " DataFrame")
-    printstyled("\nweights: "; bold = true)
+    printstyled("\nweights: "; bold=true)
     print_short(design.data.weights)
-    printstyled("\nprobs: "; bold = true)
+    printstyled("\nprobs: "; bold=true)
     print_short(design.data.probs)
-    printstyled("\nstrata: "; bold = true)
+    printstyled("\nstrata: "; bold=true)
     print_short(design.data.strata)
-    printstyled("\nfpc: "; bold = true)
+    printstyled("\nfpc: "; bold=true)
     print_short(design.fpc)
-    printstyled("\n    popsize: "; bold = true)
+    printstyled("\n    popsize: "; bold=true)
     print(design.popsize)
-    printstyled("\n    sampsize: "; bold = true)
+    printstyled("\n    sampsize: "; bold=true)
     print(design.sampsize)
 end
 
@@ -134,17 +185,17 @@ end
 
 # `show` method for printing information about a `ClusterSample` after construction
 function Base.show(io::IO, design::ClusterSample)
-    printstyled("Cluster Sample:\n"; bold = true)
-    printstyled("data: "; bold = true)
+    printstyled("Cluster Sample:\n"; bold=true)
+    printstyled("data: "; bold=true)
     print(size(design.data, 1), "x", size(design.data, 2), " DataFrame")
-    printstyled("\nweights: "; bold = true)
+    printstyled("\nweights: "; bold=true)
     print_short(design.data.weights)
-    printstyled("\nprobs: "; bold = true)
+    printstyled("\nprobs: "; bold=true)
     print_short(design.data.probs)
-    printstyled("\nfpc: "; bold = true)
+    printstyled("\nfpc: "; bold=true)
     print_short(design.fpc)
-    printstyled("\n    popsize: "; bold = true)
+    printstyled("\n    popsize: "; bold=true)
     print(design.popsize)
-    printstyled("\n    sampsize: "; bold = true)
+    printstyled("\n    sampsize: "; bold=true)
     print(design.sampsize)
 end
diff --git a/src/svymean.jl b/src/svymean.jl
@@ -15,14 +15,14 @@ julia> svymean(:enroll, srs)
 ```
 """
 function var_of_mean(x::Symbol, design::SimpleRandomSample)
-    return design.fpc / design.sampsize * var(design.data[!, x])
+    return design.fpc ./ design.sampsize .* var(design.data[!, x])
 end
 
 """
 Inner method for `svyby`.
 """
 function var_of_mean(x::AbstractVector, design::SimpleRandomSample)
-    return design.fpc / design.sampsize * var(x)
+    return design.fpc ./ design.sampsize .* var(x)
 end
 
 function sem(x, design::SimpleRandomSample)
@@ -36,14 +36,62 @@ function sem(x::AbstractVector, design::SimpleRandomSample)
     return sqrt(var_of_mean(x, design))
 end
 
-function svymean(x::Symbol, design::SimpleRandomSample)
+function svymean(x, design::SimpleRandomSample)
+    # Support behaviour like R for CategoricalArray type data
+    if isa(x,Symbol) && isa(design.data[!,x], CategoricalArray)
+        gdf = groupby(design.data, x)
+        p = combine(gdf, nrow => :counts )
+        p.proportion = p.counts ./ sum(p.counts)
+        p.var = design.fpc .* p.proportion .* (1 .- p.proportion) ./ (design.sampsize - 1) # Formula for variance of proportion
+        p.se = sqrt.(p.var)
+        return p
+    end
     return DataFrame(mean = mean(design.data[!, x]), sem = sem(x, design::SimpleRandomSample))
 end
 
 """
 Inner method for `svyby`.
 """
+
+function sem_svyby(x::AbstractVector, design::SimpleRandomSample, weights)
+    # return sqrt( (1 - length(x)/design.popsize) ./ length(x) .* var(x) ) .*  sqrt(1 - 1 / length(x) + 1 / design.sampsize)
+    # f =  sqrt()
+    # pweights = 1 ./ design.data.prob
+    # N = sum(design.data.weights)
+    # Nd = sum(weights)
+    # Nd = length(x)
+    N = sum(weights)
+    Nd = length(x)
+    Pd = Nd/N
+    n = design.sampsize
+    n̄d = n * Pd
+    Ȳd = mean(x)
+    S2d = var(x)
+    Sd = sqrt(S2d)
+    CVd = Sd / Ȳd
+    @show(N,Nd,Pd,n,n̄d,Ȳd,S2d,Sd,CVd)
+    @show((1 ./ n̄d - 1 ./Nd ))
+    V̂_Ȳd = ((1 ./ n̄d) - (1 ./Nd) ) * S2d * (1 + (1 - Pd) /  CVd^2 )
+    print(V̂_Ȳd)
+    print("\n")
+    # print(V̂_Ȳd,"\n")
+    return sqrt(V̂_Ȳd)
+end
+    # vsrs = var_of_mean(x, design)
+
+
+    # vsrs2 = vsrs .* (psum-length(x)) ./ psum
+    # return sqrt(vsrs2)
+    # return sqrt( (1 - 1 / length(x) + 1 / design.sampsize) .* var(x) ./ length(x) ) 
+
 # TODO: results not matching for `sem`
-function svymean(x::AbstractVector , design::SimpleRandomSample, _)
-    return DataFrame(mean = mean(x), sem = sem(x, design::SimpleRandomSample))
+function svymean(x::AbstractVector , design::SimpleRandomSample, weights)
+    return DataFrame(mean = mean(x), sem = sem_svyby(x, design::SimpleRandomSample, weights))
 end
+
+""" mean for Categorical variables 
+"""
+
+# function svymean(x::, design::SimpleRandomSample)
+#     return DataFrame(mean = mean(design.data[!, x]), sem = sem(x, design::SimpleRandomSample))
+# end
diff --git a/src/svyquantile.jl b/src/svyquantile.jl
@@ -15,12 +15,11 @@ julia> svyquantile(:enroll, srs, 0.5)
 ```
 """
 # TODO: modify for SimpleRandomSample
-function svyquantile(var, design::SimpleRandomSample, q)
+function svyquantile(var, design::SimpleRandomSample, q; kwargs...)
     x = design.data[!, var]
-    w = design.data.probs
-    df = DataFrame(tmp = quantile(Float32.(x), weights(w), q))
+    # w = design.data.probs
+    df = DataFrame(tmp = quantile(Float32.(x), q; kwargs...)) # Define Lumley quantile
     rename!(df, :tmp => Symbol(string(q) .* "th percentile"))
-
     return df
 end