Skip to content

Commit

Permalink
Merge pull request #53 from smishr/design_update
Browse files Browse the repository at this point in the history
Improved SRS Design with args error checking, better types
  • Loading branch information
ayushpatnaikgit authored Sep 21, 2022
2 parents aa0715a + 1ef381d commit 06f9bbd
Show file tree
Hide file tree
Showing 14 changed files with 274 additions and 157 deletions.
1 change: 1 addition & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ version = "0.11.1"
AlgebraOfGraphics = "cbdf2221-f076-402e-a563-3d30da359d67"
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
GLM = "38e38edf-8417-5370-95a0-9cbb8c7f171a"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Expand Down
1 change: 1 addition & 0 deletions src/Survey.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ using GLM
using LinearAlgebra
using CairoMakie
using AlgebraOfGraphics
using CategoricalArrays # For CategoricalArray support

include("SurveyDesign.jl")
include("svydesign.jl")
Expand Down
147 changes: 99 additions & 48 deletions src/SurveyDesign.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@
function print_short(x)
# write floats in short form
if isa(x[1], Float64)
x = round.(x, sigdigits = 3)
x = round.(x, sigdigits=3)
end
# print short vectors or single values as they are, compress otherwise
if length(x) < 3
print(x)
else
print( x[1], ", ", x[2], ", ", x[3], " ... ", last(x))
print(x[1], ", ", x[2], ", ", x[3], " ... ", last(x))
end
end

Expand All @@ -20,63 +20,114 @@ The data to a survey constructor is modified. To avoid this pass a copy of the d
instead of the original.
"""
abstract type AbstractSurveyDesign end

"""
SimpleRandomSample <: AbstractSurveyDesign
Survey design sampled by simple random sampling.
The population size is equal to the sample size unless `popsize` is explicitly provided.
"""
struct SimpleRandomSample <: AbstractSurveyDesign
data::AbstractDataFrame
sampsize::UInt
popsize::Union{UInt,Nothing}
sampsize::Union{Nothing,Unsigned}
popsize::Union{Nothing,Unsigned}
sampfraction::Real
fpc::Real
ignorefpc::Bool
function SimpleRandomSample(data::AbstractDataFrame;
popsize = nothing,
sampsize = nrow(data),
weights = ones(nrow(data)), # Check the defaults
probs = nothing,
ignorefpc = true
)
popsize=nothing,
sampsize=nrow(data),
weights=nothing, # Check the defaults
probs=nothing,
ignorefpc=false
)
# Functionality: weights arg can be passed as Symbol instead of vector
if isa(weights, Symbol)
weights = data[!, weights]
end
# set population size if it is not given; `weights` and `sampsize` must be given
# Set population size if it is not given; `weights` and `sampsize` must be given
if ignorefpc # && (isnothing(popsize) || isnothing(weights) || isnothing(probs))
@warn "Assuming equal weights"
weights = ones(nrow(data))
end
if isnothing(popsize)
popsize = round(sum(weights)) |> UInt
if typeof(weights) <: Vector{<:Real}
if !all(y -> y == first(weights), weights) # SRS by definition is equi-weighted
error("Simple Random Sample must be equi-weighted. Different sampling weights detected in vectors")
end
elseif isa(weights, Symbol)
weights = data[!, weights]
if !all(y -> y == first(weights), weights) # SRS by definition is equi-weighted
error("Simple Random Sample must be equi-weighted. Different sampling weights detected in vectors")
end
elseif typeof(probs) <: Vector{<:Real}
if !all(y -> y == first(probs), probs) # SRS by definition is equi-weighted
error("Simple Random Sample must be equi-weighted. Different sampling probabilities detected in vector")
end
weights = 1 ./ probs
elseif isa(probs, Symbol)
probs = data[!, probs]
if !all(y -> y == first(probs), probs) # SRS by definition is equi-weighted
error("Simple Random Sample must be equi-weighted. Different sampling probabilities detected in vector")
end
weights = 1 ./ probs
end
# If all weights are equal then estimate
equal_weight = first(weights)
popsize = round(sampsize .* equal_weight) |> UInt
if sampsize > popsize
error("You have either given wrong or not enough keyword args. sampsize cannot be greate than popsize. Check given inputs. eg if weights given then popsize must be given (for now)")
end
elseif typeof(popsize) <: Vector{<:Real}
if !all(y -> y == first(popsize), popsize) # SRS by definition is equi-weighted
error("Simple Random Sample must be equi-weighted. Different sampling weights detected in vectors")
end
weights = popsize ./ sampsize # This line is ratio estimator, we may need to change it when doing compley surveys
popsize = first(popsize) |> UInt
else
error("If popsize not given then either sampling weights or sampling probabilities must be given")
end
# add frequency weights column to `data`
data[!, :weights] = weights
# add probability weights column to `data`
data[!, :probs] = 1 ./ data[!, :weights]
# set sampling fraction
sampfraction = sampsize / popsize
sampfraction = sampsize ./ popsize
# set fpc
fpc = ignorefpc ? 1 : 1 - (sampsize / popsize)

fpc = ignorefpc ? 1 : 1 .- (sampsize ./ popsize)
# Add columns for weights and probs in data -- Check if really needed to add them as columns
if !isnothing(probs)
# add probability weights column to `data`
data[!, :probs] = probs
# add frequency weights column to `data`
data[!, :weights] = 1 ./ data[!, :probs]
else # else weights were specified
# add frequency weights column to `data`
data[!, :weights] = weights
# add probability weights column to `data`
data[!, :probs] = 1 ./ data[!, :weights]
end
new(data, sampsize, popsize, sampfraction, fpc, ignorefpc)
end
function SimpleRandomSample(data::AbstractDataFrame)
ignorefpc = true
return SimpleRandomSample(data; popsize=nothing,sampsize=nrow(data), weights=nothing, probs=nothing, ignorefpc=ignorefpc)
end
end

# `show` method for printing information about a `SimpleRandomSample` after construction
function Base.show(io::IO, ::MIME"text/plain", design::SimpleRandomSample)
printstyled("Simple Random Sample:\n"; bold = true)
printstyled("data: "; bold = true)
printstyled("Simple Random Sample:\n"; bold=true)
printstyled("data: "; bold=true)
print(size(design.data, 1), "x", size(design.data, 2), " DataFrame")
printstyled("\nweights: "; bold = true)
printstyled("\nweights: "; bold=true)
print_short(design.data.weights)
printstyled("\nprobs: "; bold = true)
printstyled("\nprobs: "; bold=true)
print_short(design.data.probs)
printstyled("\nfpc: "; bold = true)
printstyled("\nfpc: "; bold=true)
print_short(design.fpc)
printstyled("\n popsize: "; bold = true)
print(design.popsize)
printstyled("\n sampsize: "; bold = true)
print(design.sampsize)
printstyled("\npopsize: "; bold=true)
print_short(design.popsize)
printstyled("\nsampsize: "; bold=true)
print_short(design.sampsize)
printstyled("\nsampfraction: "; bold=true)
print_short(design.sampfraction)
printstyled("\nignorefpc: "; bold=true)
print(design.ignorefpc)
end

"""
Expand All @@ -91,7 +142,7 @@ struct StratifiedSample <: AbstractSurveyDesign
sampfraction::Real
fpc::Real
nofpc::Bool
function StratifiedSample(data::DataFrame, strata::AbstractVector; weights = ones(nrow(data)), probs = 1 ./ weights)
function StratifiedSample(data::DataFrame, strata::AbstractVector; weights=ones(nrow(data)), probs=1 ./ weights)
# add frequency weights, probability weights and sample size columns
data[!, :weights] = weights
data[!, :probs] = probs
Expand All @@ -100,26 +151,26 @@ struct StratifiedSample <: AbstractSurveyDesign
data[!, :sampsize] = repeat([nrow(data)], nrow(data))
data[!, :strata] = strata

new(data)
new(data,sampsize,popsize,sampfraction,fpc,nofpc)
end
end

# `show` method for printing information about a `StratifiedSample` after construction
function Base.show(io::IO, design::StratifiedSample)
printstyled("Stratified Sample:\n"; bold = true)
printstyled("data: "; bold = true)
printstyled("Stratified Sample:\n"; bold=true)
printstyled("data: "; bold=true)
print(size(design.data, 1), "x", size(design.data, 2), " DataFrame")
printstyled("\nweights: "; bold = true)
printstyled("\nweights: "; bold=true)
print_short(design.data.weights)
printstyled("\nprobs: "; bold = true)
printstyled("\nprobs: "; bold=true)
print_short(design.data.probs)
printstyled("\nstrata: "; bold = true)
printstyled("\nstrata: "; bold=true)
print_short(design.data.strata)
printstyled("\nfpc: "; bold = true)
printstyled("\nfpc: "; bold=true)
print_short(design.fpc)
printstyled("\n popsize: "; bold = true)
printstyled("\n popsize: "; bold=true)
print(design.popsize)
printstyled("\n sampsize: "; bold = true)
printstyled("\n sampsize: "; bold=true)
print(design.sampsize)
end

Expand All @@ -134,17 +185,17 @@ end

# `show` method for printing information about a `ClusterSample` after construction
function Base.show(io::IO, design::ClusterSample)
printstyled("Cluster Sample:\n"; bold = true)
printstyled("data: "; bold = true)
printstyled("Cluster Sample:\n"; bold=true)
printstyled("data: "; bold=true)
print(size(design.data, 1), "x", size(design.data, 2), " DataFrame")
printstyled("\nweights: "; bold = true)
printstyled("\nweights: "; bold=true)
print_short(design.data.weights)
printstyled("\nprobs: "; bold = true)
printstyled("\nprobs: "; bold=true)
print_short(design.data.probs)
printstyled("\nfpc: "; bold = true)
printstyled("\nfpc: "; bold=true)
print_short(design.fpc)
printstyled("\n popsize: "; bold = true)
printstyled("\n popsize: "; bold=true)
print(design.popsize)
printstyled("\n sampsize: "; bold = true)
printstyled("\n sampsize: "; bold=true)
print(design.sampsize)
end
58 changes: 53 additions & 5 deletions src/svymean.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,14 @@ julia> svymean(:enroll, srs)
```
"""
function var_of_mean(x::Symbol, design::SimpleRandomSample)
return design.fpc / design.sampsize * var(design.data[!, x])
return design.fpc ./ design.sampsize .* var(design.data[!, x])
end

"""
Inner method for `svyby`.
"""
function var_of_mean(x::AbstractVector, design::SimpleRandomSample)
return design.fpc / design.sampsize * var(x)
return design.fpc ./ design.sampsize .* var(x)
end

function sem(x, design::SimpleRandomSample)
Expand All @@ -36,14 +36,62 @@ function sem(x::AbstractVector, design::SimpleRandomSample)
return sqrt(var_of_mean(x, design))
end

function svymean(x::Symbol, design::SimpleRandomSample)
function svymean(x, design::SimpleRandomSample)
# Support behaviour like R for CategoricalArray type data
if isa(x,Symbol) && isa(design.data[!,x], CategoricalArray)
gdf = groupby(design.data, x)
p = combine(gdf, nrow => :counts )
p.proportion = p.counts ./ sum(p.counts)
p.var = design.fpc .* p.proportion .* (1 .- p.proportion) ./ (design.sampsize - 1) # Formula for variance of proportion
p.se = sqrt.(p.var)
return p
end
return DataFrame(mean = mean(design.data[!, x]), sem = sem(x, design::SimpleRandomSample))
end

"""
Inner method for `svyby`.
"""

function sem_svyby(x::AbstractVector, design::SimpleRandomSample, weights)
# return sqrt( (1 - length(x)/design.popsize) ./ length(x) .* var(x) ) .* sqrt(1 - 1 / length(x) + 1 / design.sampsize)
# f = sqrt()
# pweights = 1 ./ design.data.prob
# N = sum(design.data.weights)
# Nd = sum(weights)
# Nd = length(x)
N = sum(weights)
Nd = length(x)
Pd = Nd/N
n = design.sampsize
n̄d = n * Pd
Ȳd = mean(x)
S2d = var(x)
Sd = sqrt(S2d)
CVd = Sd / Ȳd
@show(N,Nd,Pd,n,n̄d,Ȳd,S2d,Sd,CVd)
@show((1 ./ n̄d - 1 ./Nd ))
V̂_Ȳd = ((1 ./ n̄d) - (1 ./Nd) ) * S2d * (1 + (1 - Pd) / CVd^2 )
print(V̂_Ȳd)
print("\n")
# print(V̂_Ȳd,"\n")
return sqrt(V̂_Ȳd)
end
# vsrs = var_of_mean(x, design)


# vsrs2 = vsrs .* (psum-length(x)) ./ psum
# return sqrt(vsrs2)
# return sqrt( (1 - 1 / length(x) + 1 / design.sampsize) .* var(x) ./ length(x) )

# TODO: results not matching for `sem`
function svymean(x::AbstractVector , design::SimpleRandomSample, _)
return DataFrame(mean = mean(x), sem = sem(x, design::SimpleRandomSample))
function svymean(x::AbstractVector , design::SimpleRandomSample, weights)
return DataFrame(mean = mean(x), sem = sem_svyby(x, design::SimpleRandomSample, weights))
end

""" mean for Categorical variables
"""

# function svymean(x::, design::SimpleRandomSample)
# return DataFrame(mean = mean(design.data[!, x]), sem = sem(x, design::SimpleRandomSample))
# end
7 changes: 3 additions & 4 deletions src/svyquantile.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,11 @@ julia> svyquantile(:enroll, srs, 0.5)
```
"""
# TODO: modify for SimpleRandomSample
function svyquantile(var, design::SimpleRandomSample, q)
function svyquantile(var, design::SimpleRandomSample, q; kwargs...)
x = design.data[!, var]
w = design.data.probs
df = DataFrame(tmp = quantile(Float32.(x), weights(w), q))
# w = design.data.probs
df = DataFrame(tmp = quantile(Float32.(x), q; kwargs...)) # Define Lumley quantile
rename!(df, :tmp => Symbol(string(q) .* "th percentile"))

return df
end

Expand Down
Loading

0 comments on commit 06f9bbd

Please sign in to comment.