Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Exhaustively testing SimpleRandomSample, software eng improvements #94

Merged
merged 27 commits into from
Nov 26, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
a34aa52
Merge branch 'xKDR:design_update' into design_update
smishr Nov 10, 2022
4c7bbf6
Merge pull request #3 from smishr/domain_stratified_sample
smishr Nov 22, 2022
e6a90de
Merge branch 'xKDR:design_update' into design_update
smishr Nov 22, 2022
191f84e
Merge branch 'xKDR:design_update' into design_update
smishr Nov 22, 2022
f2dd1d2
Update SRS struct, Add type checking, reorder
smishr Nov 23, 2022
a3936b0
Remove accidental twice fn declaration
smishr Nov 23, 2022
6bc4094
Editing sayantika SRS tests
smishr Nov 23, 2022
fa8a01a
Add and reorder type checks on weights
smishr Nov 23, 2022
3cdf066
Edited weights and probs checking
smishr Nov 23, 2022
94fd34f
Add ErrorException to @test_throws lines
smishr Nov 23, 2022
36b35c0
popsize sampsize Unsigned not Integer
smishr Nov 23, 2022
e1149da
Add testthrows for type checking keywords
smishr Nov 23, 2022
39c060a
ingorefpc improvements still not 100%, revert T,S parametric types
smishr Nov 25, 2022
79af6ab
line by line adding tests SRS
smishr Nov 25, 2022
9331edd
Ran Julia Formatter
smishr Nov 25, 2022
9aca851
Change Float64 to <:Real as sometime Int64
smishr Nov 25, 2022
9a42e84
Add popsize Symbol StratifiedSample
smishr Nov 25, 2022
94586d1
Fixed Strat and SurveyDesign tests for time being
smishr Nov 25, 2022
0eb9f6e
Fix testing suite with ignorefpc=true
smishr Nov 25, 2022
c0c36c2
Update src/SurveyDesign.jl
smishr Nov 26, 2022
8eefd75
Update src/SurveyDesign.jl
smishr Nov 26, 2022
60be750
Update src/SurveyDesign.jl
smishr Nov 26, 2022
d92c63d
Update src/SurveyDesign.jl
smishr Nov 26, 2022
2751ae3
Update src/SurveyDesign.jl
smishr Nov 26, 2022
6a2fff3
Update src/SurveyDesign.jl
smishr Nov 26, 2022
b9f09a1
Update src/SurveyDesign.jl
smishr Nov 26, 2022
3ea908c
Update src/SurveyDesign.jl
smishr Nov 26, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
148 changes: 120 additions & 28 deletions src/SurveyDesign.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,36 +15,102 @@ abstract type AbstractSurveyDesign end
SimpleRandomSample <: AbstractSurveyDesign

Survey design sampled by simple random sampling.
# Required arguments:
data - This is the survey dataset loaded as a DataFrame in memory.
Note: Keeping with Julia conventions, original data object
is modified, not copied. Be careful
# Optional arguments:
sampsize - Sample size of the survey, given as Symbol name of
column in `data`, an `Unsigned` integer, or a Vector
popsize - The (expected) population size of survey, given as Symbol
name of column in `data`, an `Unsigned` integer, or a Vector
weights - Sampling weights, passed as Symbol or Vector
probs - Sampling probabilities, passed as Symbol or Vector
ignorefpc- Ignore finite population correction and assume all weights equal to 1.0

Precedence order of using `popsize`, `weights` and `probs` is `popsize` > `weights` > `probs`
Eg. if `popsize` given then assumed ground truth over `weights` or `probs`

If `popsize` not given, `weights` or `probs` must be given, so that in combination
with `sampsize`, `popsize` can be calculated.
"""
struct SimpleRandomSample <: AbstractSurveyDesign
data::AbstractDataFrame
sampsize::Union{Nothing,Unsigned}
popsize::Union{Nothing,Unsigned}
sampfraction::Real
fpc::Real
sampsize::Union{Unsigned,Nothing}
popsize::Union{Unsigned,Nothing}
sampfraction::Float64
fpc::Float64
ignorefpc::Bool
function SimpleRandomSample(data::AbstractDataFrame;
popsize=nothing,
sampsize=nrow(data),
sampsize=nrow(data) |> UInt,
weights=nothing,
probs=nothing,
ignorefpc=false
)
# Only valid argument types given to constructor
argtypes_weights = Union{Nothing,Symbol,Vector{<:Real}}
argtypes_probs = Union{Nothing,Symbol,Vector{<:Real}}
argtypes_popsize = Union{Nothing,Symbol,<:Unsigned,Vector{<:Real}}
argtypes_sampsize = Union{Nothing,Symbol,<:Unsigned,Vector{<:Real}}
# If any invalid type raise error
if !(isa(weights, argtypes_weights))
error("invalid type of argument given for `weights` argument")
elseif !(isa(probs, argtypes_probs))
error("invalid type of argument given for `probs` argument")
elseif !(isa(popsize, argtypes_popsize))
error("invalid type of argument given for `popsize` argument")
elseif !(isa(sampsize, argtypes_sampsize))
error("invalid type of argument given for `sampsize` argument")
end
# If any of weights or probs given as Symbol,
# find the corresponding column in `data`
if isa(weights, Symbol)
weights = data[!, weights]
end
if isa(probs, Symbol)
probs = data[!, probs]
end

if ignorefpc
@warn "assuming all weights are equal to 1.0"
weights = ones(nrow(data))
# If weights/probs vector not numeric/real, ie. string column passed for weights, then raise error
if !isa(weights, Union{Nothing,Vector{<:Real}})
error("Weights should be Vector{<:Real}. You passed $(typeof(weights))")
elseif !isa(probs, Union{Nothing,Vector{<:Real}})
error("Sampling probabilities should be Vector{<:Real}. You passed $(typeof(probs))")
end

# set population size if it is not given; `weights` and `sampsize` must be given
# If popsize given as Symbol or Vector, check all records equal
if isa(popsize, Symbol)
if !all(w -> w == first(data[!, popsize]), data[!, popsize])
error("popsize must be same for all observations in Simple Random Sample")
end
popsize = first(data[!, popsize]) |> UInt
elseif isa(popsize, Vector{<:Real})
if !all(w -> w == first(popsize), popsize)
error("popsize must be same for all observations in Simple Random Sample")
end
popsize = first(popsize) |> UInt
end
# If sampsize given as Symbol or Vector, check all records equal
if isa(sampsize, Symbol)
if !all(w -> w == first(data[!, sampsize]), data[!, sampsize])
error("sampsize must be same for all observations in Simple Random Sample")
end
sampsize = first(data[!, sampsize]) |> UInt
elseif isa(sampsize, Vector{<:Real})
if !all(w -> w == first(sampsize), sampsize)
error("sampsize must be same for all observations in Simple Random Sample")
end
sampsize = first(sampsize) |> UInt
end
# If both `weights` and `probs` given, then `weights` is assumed to be ground truth for probs.
if !isnothing(weights) && !isnothing(probs)
probs = 1 ./ weights
data[!, :probs] = probs
end
# popsize must be nothing or <:Integer by now
if isnothing(popsize)
# check that all weights are equal (SRS is by definition equi-weighted)
# If popsize not given, fallback to weights, probs and sampsize to estimate `popsize`
@warn "Using weights/probs and sampsize to estimate `popsize`"
# Check that all weights (or probs if weights not given) are equal, as SRS is by definition equi-weighted
if typeof(weights) <: Vector{<:Real}
if !all(w -> w == first(weights), weights)
error("all frequency weights must be equal for Simple Random Sample")
Expand All @@ -53,23 +119,46 @@ struct SimpleRandomSample <: AbstractSurveyDesign
if !all(p -> p == first(probs), probs)
error("all probability weights must be equal for Simple Random Sample")
end
weights = 1 / probs
weights = 1 ./ probs
else
error("either weights or probs must be given if `popsize` not given")
end
# estimate population size
# Estimate population size
popsize = round(sampsize * first(weights)) |> UInt
# @show popsize, sampsize # Check this Line TODO
if sampsize > popsize
error("sample size cannot be greater than population size")
error("population size was estimated to be greater than given sampsize. Please check input arguments.")
end
elseif typeof(popsize) <: Vector{<:Real}
if !all(y -> y == first(popsize), popsize) # SRS by definition is equi-weighted
error("Simple Random Sample must be equi-weighted. Different sampling weights detected in vectors")
end
weights = popsize ./ sampsize # ratio estimator for SRS
popsize = first(popsize) |> UInt
elseif typeof(popsize) <: Unsigned
weights = fill(popsize / sampsize, nrow(data)) # If popsize is given, weights vector is made concordant with popsize and sampsize, regardless of given weights argument
else
error("either population size or frequency/probability weights must be specified")
error("something went wrong, please check validity of inputs.")
end
# If ignorefpc then set weights to 1 ??
# TODO: This works under some cases, but should find better way to process ignoring fpc
if ignorefpc
@warn "assuming all weights are equal to 1.0"
weights = ones(nrow(data))
probs = 1 ./ weights
end
# sum of weights must equal to `popsize` for SRS
if !isnothing(weights) && !(isapprox(sum(weights), popsize; atol=1e-4))
if ignorefpc && !(isapprox(sum(weights), sampsize; atol=1e-4)) # Change if ignorefpc functionality changes
error("sum of sampling weights should be equal to `sampsize` for `SimpleRandomSample` with `ignorefpc`")
elseif !ignorefpc
@show sum(weights)
error("sum of sampling weights must be equal to `popsize` for `SimpleRandomSample`")
end
end
# sum of probs must equal popsize for SRS
if !isnothing(probs) && !(isapprox(sum(1 ./ probs), popsize; atol=1e-4))
if ignorefpc && !(isapprox(sum(1 ./ probs), sampsize; atol=1e-4)) # Change if ignorefpc functionality changes
error("sum of inverse sampling probabilities should be equal to `sampsize` for `SimpleRandomSample` with `ignorefpc`")
elseif !ignorefpc
@show sum(1 ./ probs)
error("Sum of inverse of sampling probabilities must be equal to `popsize` for Simple Random Sample")
end
end
## Set remaining parts of data structure
# set sampling fraction
sampfraction = sampsize / popsize
# set fpc
Expand All @@ -80,7 +169,7 @@ struct SimpleRandomSample <: AbstractSurveyDesign
probs = 1 ./ data[!, :weights]
end
data[!, :probs] = probs

# Initialise the structure
new(data, sampsize, popsize, sampfraction, fpc, ignorefpc)
end
end
Expand All @@ -105,6 +194,9 @@ struct StratifiedSample <: AbstractSurveyDesign
probs=nothing,
ignorefpc=false
)
if isa(popsize, Symbol)
popsize = data[!, popsize]
end
if isa(weights, Symbol)
weights = data[!, weights]
end
Expand Down Expand Up @@ -141,7 +233,7 @@ struct StratifiedSample <: AbstractSurveyDesign
# set sampling fraction
sampfraction = sampsize ./ popsize
# set fpc
fpc = ignorefpc ? fill(1,size(data, 1)) : 1 .- (sampsize ./ popsize)
fpc = ignorefpc ? fill(1, size(data, 1)) : 1 .- (sampsize ./ popsize)
# add columns for frequency and probability weights to `data`
if !isnothing(probs)
data[!, :probs] = probs
Expand Down Expand Up @@ -282,8 +374,8 @@ struct SurveyDesign <: AbstractSurveyDesign
if ignorefpc # && (isnothing(popsize) || isnothing(weights) || isnothing(probs))
@warn "Assuming equal weights"
weights = ones(nrow(data))
end
end

# TODO: Do the other case where clusters are given
if isnothing(clusters)
# set population size if it is not given; `weights` and `sampsize` must be given
Expand Down Expand Up @@ -327,7 +419,7 @@ struct SurveyDesign <: AbstractSurveyDesign
end
# @show clusters, strata, sampsize,popsize, sampfraction, fpc, ignorefpc
new(data, clusters, strata, sampsize, popsize, sampfraction, fpc, ignorefpc)
elseif isa(clusters,Symbol)
elseif isa(clusters, Symbol)
# One Cluster sampling - PSU chosen with SRS,
print("One stage cluster design with PSU SRS")
elseif typeof(clusters) <: Vector{Symbol}
Expand Down
38 changes: 17 additions & 21 deletions src/svymean.jl
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ function svymean(x::Symbol, design::SimpleRandomSample)
p.se = sqrt.(p.var)
return p
end
return DataFrame(mean = mean(design.data[!, x]), sem = sem(x, design::SimpleRandomSample))
return DataFrame(mean=mean(design.data[!, x]), sem=sem(x, design::SimpleRandomSample))
smishr marked this conversation as resolved.
Show resolved Hide resolved
end

function svymean(x::Vector{Symbol}, design::SimpleRandomSample)
Expand Down Expand Up @@ -84,52 +84,48 @@ function sem_svyby(x::AbstractVector, design::SimpleRandomSample)
return sqrt(variance)
end

function svymean(x::AbstractVector, design::SimpleRandomSample, weights)
return DataFrame(mean = mean(x), sem = sem_svyby(x, design))
end

"""
Inner method for `svyby` for SimpleRandomSample
"""
function svymean(x::AbstractVector, design::SimpleRandomSample, weights)
return DataFrame(mean = mean(x), sem = sem_svyby(x, design))
return DataFrame(mean=mean(x), sem=sem_svyby(x, design))
end

"""
Inner method for `svyby` for StratifiedSample
Calculates domain mean and its std error, based example 10.3.3 on pg394 Sarndal (1992)
"""
function svymean(x::AbstractVector, popsize::AbstractVector,sampsize::AbstractVector,sampfraction::AbstractVector,strata::AbstractVector)
df = DataFrame(x = x, popsize = popsize, sampsize = sampsize, sampfraction = sampfraction,strata = strata)
function svymean(x::AbstractVector, popsize::AbstractVector, sampsize::AbstractVector, sampfraction::AbstractVector, strata::AbstractVector)
df = DataFrame(x=x, popsize=popsize, sampsize=sampsize, sampfraction=sampfraction, strata=strata)
nsdh = []
substrata_domain_totals = []
Nh = []
nh = []
fh = []
ȳsdh = []
sigma_ȳsh_squares = []
grouped_frame = groupby(df,:strata)
grouped_frame = groupby(df, :strata)
for each_strata in keys(grouped_frame)
nsh = nrow(grouped_frame[each_strata])#, nrow=>:nsdh).nsdh
push!(nsdh,nsh)
push!(nsdh, nsh)
substrata_domain_total = sum(grouped_frame[each_strata].x)
ȳdh = mean(grouped_frame[each_strata].x)
push!(ȳsdh, ȳdh)
push!(substrata_domain_totals,substrata_domain_total)
push!(substrata_domain_totals, substrata_domain_total)
popsizes = first(grouped_frame[each_strata].popsize)
push!(Nh,popsizes)
push!(Nh, popsizes)
sampsizes = first(grouped_frame[each_strata].sampsize)
push!(nh,sampsizes)
push!(nh, sampsizes)
sampfrac = first(grouped_frame[each_strata].sampfraction)
push!(fh,sampfrac)
push!(sigma_ȳsh_squares,sum((grouped_frame[each_strata].x .- ȳdh).^2) )
push!(fh, sampfrac)
push!(sigma_ȳsh_squares, sum((grouped_frame[each_strata].x .- ȳdh) .^ 2))
end
domain_mean = sum(Nh .* substrata_domain_totals ./ nh)/sum(Nh .* nsdh ./ nh)
pdh = nsdh./nh
domain_mean = sum(Nh .* substrata_domain_totals ./ nh) / sum(Nh .* nsdh ./ nh)
pdh = nsdh ./ nh
N̂d = sum(Nh .* pdh)
domain_var = sum(Nh.^2 .* (1 .- fh) .* (sigma_ȳsh_squares .+ (nsdh .* (1 .-pdh) .* (ȳsdh .- domain_mean).^2)) ./ (nh .* (nh .- 1) )) ./ N̂d.^2
domain_var = sum(Nh .^ 2 .* (1 .- fh) .* (sigma_ȳsh_squares .+ (nsdh .* (1 .- pdh) .* (ȳsdh .- domain_mean) .^ 2)) ./ (nh .* (nh .- 1))) ./ N̂d .^ 2
domain_mean_se = sqrt(domain_var)
return DataFrame(domain_mean = domain_mean, domain_mean_se = domain_mean_se)
return DataFrame(domain_mean=domain_mean, domain_mean_se=domain_mean_se)
end

"""
Expand Down Expand Up @@ -162,12 +158,12 @@ function svymean(x::Symbol, design::StratifiedSample)
s²ₕ = combine(gdf, x => var => :s²h).s²h
V̂Ȳ̂ = sum((Wₕ .^ 2) .* (1 .- fₕ) .* s²ₕ ./ nₕ)
SE = sqrt(V̂Ȳ̂)
return DataFrame(Ȳ̂ = Ȳ̂, SE = SE)
return DataFrame(Ȳ̂=Ȳ̂, SE=SE)
end

function svymean(::Bool; x::Symbol, design::StratifiedSample)
gdf = groupby(design.data, design.strata)
ȳₕ = combine(gdf, x => mean => :mean).mean
s²ₕ = combine(gdf, x => var => :s²h).s²h
return DataFrame(ȳₕ,s²ₕ)
return DataFrame(ȳₕ, s²ₕ)
end
6 changes: 3 additions & 3 deletions src/svytotal.jl
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ function svytotal(x::Symbol, design::SimpleRandomSample)
return p
end
total = design.popsize * mean(design.data[!, x])
return DataFrame(total = total, se_total = se_tot(x, design::SimpleRandomSample))
return DataFrame(total=total, se_total=se_tot(x, design::SimpleRandomSample))
end

# Inner methods for `svyby`
Expand All @@ -57,7 +57,7 @@ function se_total_svyby(x::AbstractVector, design::SimpleRandomSample, _)
end
function svytotal(x::AbstractVector, design::SimpleRandomSample, weights)
total = wsum(x, weights)
return DataFrame(total = total, sem = se_total_svyby(x, design::SimpleRandomSample, weights))
return DataFrame(total=total, sem=se_total_svyby(x, design::SimpleRandomSample, weights))
end

# StratifiedSample
Expand All @@ -82,7 +82,7 @@ function svytotal(x::Symbol, design::StratifiedSample)
# the only difference between total and mean variance is the Nₕ instead of Wₕ
V̂Ȳ̂ = sum((Nₕ .^ 2) .* (1 .- fₕ) .* s²ₕ ./ nₕ)
SE = sqrt(V̂Ȳ̂)
return DataFrame(grand_total = grand_total, SE = SE)
return DataFrame(grand_total=grand_total, SE=SE)
end

function svytotal(x::Vector{Symbol}, design::SimpleRandomSample)
Expand Down
Loading