Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Design update some features #66

Merged
merged 22 commits into from
Nov 5, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/Survey.jl
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ include("show.jl")

export load_data
export AbstractSurveyDesign, SimpleRandomSample, StratifiedSample
export SurveyDesign
export svydesign
export svyglm
export svyby
Expand All @@ -37,6 +38,7 @@ export @formula
export svyplot
export svyhist, sturges, freedman_diaconis
export svyboxplot
export ht_svytotal, ht_svymean
export
#families
Normal,
Expand Down
154 changes: 133 additions & 21 deletions src/SurveyDesign.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ abstract type AbstractSurveyDesign end
"""
SimpleRandomSample <: AbstractSurveyDesign

Survey design sampled by simple random sampling.
Survey design sampled by simple random sampling.
"""
struct SimpleRandomSample <: AbstractSurveyDesign
data::AbstractDataFrame
Expand Down Expand Up @@ -53,10 +53,11 @@ struct SimpleRandomSample <: AbstractSurveyDesign
if !all(p -> p == first(probs), probs)
error("all probability weights must be equal for Simple Random Sample")
end
weights = 1 ./ probs
weights = 1 / probs
end
# estimate population size
popsize = round(sampsize .* first(weights)) |> UInt
popsize = round(sampsize * first(weights)) |> UInt
# @show popsize, sampsize # Check this Line TODO
if sampsize > popsize
error("sample size cannot be greater than population size")
end
Expand All @@ -70,13 +71,13 @@ struct SimpleRandomSample <: AbstractSurveyDesign
error("either population size or frequency/probability weights must be specified")
end
# set sampling fraction
sampfraction = sampsize ./ popsize
sampfraction = sampsize / popsize
# set fpc
fpc = ignorefpc ? 1 : 1 .- (sampsize ./ popsize)
fpc = ignorefpc ? 1 : 1 - (sampsize / popsize)
# add columns for frequency and probability weights to `data`
data[!, :weights] = weights
if isnothing(probs)
probs = 1 ./ data[!, :weights]
probs = 1 ./ data[!, :weights]
end
data[!, :probs] = probs

Expand All @@ -87,19 +88,19 @@ end
"""
StratifiedSample <: AbstractSurveyDesign

Survey design sampled by stratification.
Survey design sampled by stratification.
"""
struct StratifiedSample <: AbstractSurveyDesign
data::AbstractDataFrame
strata::Symbol
sampsize::Vector{Union{Nothing,Float64}}
popsize::Vector{Union{Nothing,Float64}}
sampsize::Union{Nothing,Vector{Real}}
popsize::Union{Nothing,Vector{Real}}
sampfraction::Vector{Real}
fpc::Vector{Real}
ignorefpc::Bool
function StratifiedSample(data::AbstractDataFrame, strata::Symbol;
popsize=nothing,
sampsize= transform(groupby(data,strata), nrow => :counts ).counts ,
sampsize=transform(groupby(data, strata), nrow => :counts).counts,
weights=nothing,
probs=nothing,
ignorefpc=false
Expand Down Expand Up @@ -133,7 +134,7 @@ struct StratifiedSample <: AbstractSurveyDesign
elseif typeof(popsize) <: Vector{<:Real}
# TODO: change `elseif` condition
weights = popsize ./ sampsize # expansion estimator
# TODO: add probability weights
# TODO: add probability weights
else
error("either population size or frequency/probability weights must be specified")
end
Expand All @@ -156,20 +157,21 @@ end
"""
ClusterSample <: AbstractSurveyDesign

Survey design sampled by clustering.
One stage clusters, Primary Sampling Units (PSU) chosen with SRS. No stratification.
Assuming every individual is in one and only one cluster, and the subsampling probabilities
for any given cluster do not depend on which other clusters were sampled. (Lumley pg41, para2)
"""
struct ClusterSample <: AbstractSurveyDesign
data::AbstractDataFrame
strata::Symbol
sampsize::Vector{Union{Nothing,Float64}}
popsize::Vector{Union{Nothing,Float64}}
clusters::Union{Symbol,Nothing}
sampsize::Union{Nothing,Vector{Real}}
popsize::Union{Nothing,Vector{Real}}
sampfraction::Vector{Real}
fpc::Vector{Real}
ignorefpc::Bool
# TODO: change entire struct body
function GeneralSample(data::AbstractDataFrame, strata::Symbol;
function ClusterSample(data::AbstractDataFrame, strata::Symbol;
popsize=nothing,
sampsize= transform(groupby(data,strata), nrow => :counts ).counts ,
sampsize=transform(groupby(data, strata), nrow => :counts).counts,
weights=nothing,
probs=nothing,
ignorefpc=false
Expand All @@ -181,8 +183,10 @@ struct ClusterSample <: AbstractSurveyDesign
probs = data[!, probs]
end

if ignorefpc # && (isnothing(popsize) || isnothing(weights) || isnothing(probs))
@warn "Assuming equal weights"
if ignorefpc
# TODO: change what happens if `ignorepfc == true` or if the user only
# specifies `data`
@warn "assuming equal weights"
weights = ones(nrow(data))
end

Expand All @@ -201,7 +205,7 @@ struct ClusterSample <: AbstractSurveyDesign
elseif typeof(popsize) <: Vector{<:Real}
# TODO: change `elseif` condition
weights = popsize ./ sampsize # expansion estimator
# TODO: add probability weights
# TODO: add probability weights
else
error("either population size or frequency/probability weights must be specified")
end
Expand All @@ -220,3 +224,111 @@ struct ClusterSample <: AbstractSurveyDesign
new(data, strata, sampsize, popsize, sampfraction, fpc, ignorefpc)
end
end


"""
SurveyDesign <: AbstractSurveyDesign

Survey design with arbitrary design weights

clusters: can be passed as `Symbol`, Vector{Symbol}, Vector{Real} or Nothing
strata: can be passed as `Symbol`, Vector{Symbol}, Vector{Real} or Nothing
sampsize: can be passed as `Symbol`, Vector{Symbol}, Vector{Real} or Nothing
popsize: can be passed as `Symbol`, Vector{Symbol}, Vector{Real} or Nothing
"""
struct SurveyDesign <: AbstractSurveyDesign
data::AbstractDataFrame
clusters::Union{Symbol,Vector{Symbol},Nothing}
strata::Union{Symbol,Vector{Symbol},Nothing}
sampsize::Union{Real,Vector{Real},Nothing}
popsize::Union{Real,Vector{Real},Nothing}
sampfraction::Union{Real,Vector{Real}}
fpc::Union{Real,Vector{Real}}
ignorefpc::Bool
# TODO: struct body still work in progress
function SurveyDesign(data::AbstractDataFrame;
clusters=nothing,
strata=nothing,
popsize=nothing,
sampsize=nothing,
weights=nothing,
probs=nothing,
ignorefpc=false
)
if isnothing(sampsize)
if isnothing(strata)
sampsize = nrow(data)
else
sampsize = transform(groupby(data, strata), nrow => :counts).counts
end

end

if isa(weights, Symbol)
weights = data[!, weights]
end
if isa(probs, Symbol)
probs = data[!, probs]
end
if isa(popsize, Symbol)
popsize = data[!, popsize]
end

# TODO: Check below, may not be correct for all designs
if ignorefpc # && (isnothing(popsize) || isnothing(weights) || isnothing(probs))
@warn "Assuming equal weights"
weights = ones(nrow(data))
end

# TODO: Do the other case where clusters are given
if isnothing(clusters)
# set population size if it is not given; `weights` and `sampsize` must be given
if isnothing(popsize)
# TODO: add probability weights if `weights` is not `nothing`
if typeof(probs) <: Vector{<:Real}
weights = 1 ./ probs
end
# estimate population size
popsize = sampsize .* weights

if sampsize > popsize
error("sample size cannot be greater than population size")
end
elseif typeof(popsize) <: Vector{<:Real}
# TODO: change `elseif` condition
weights = popsize ./ sampsize # expansion estimator
# TODO: add probability weights
else
error("either population size or frequency/probability weights must be specified")
end
# TODO: for now support SRS within SurveyDesign. Later, just call SimpleRandomSample??
if typeof(sampsize) <: Real && typeof(popsize) <: Vector{<:Real} # Eg when SRS
if !all(y -> y == first(popsize), popsize) # SRS by definition is equi-weighted
error("Simple Random Sample must be equi-weighted. Different sampling weights detected in vectors")
end
popsize = first(popsize) |> Real
end

# set sampling fraction
sampfraction = sampsize ./ popsize
# set fpc
fpc = ignorefpc ? 1 : 1 .- (sampsize ./ popsize)
# add columns for frequency and probability weights to `data`
if !isnothing(probs)
data[!, :probs] = probs
data[!, :weights] = 1 ./ data[!, :probs]
else
data[!, :weights] = weights
data[!, :probs] = 1 ./ data[!, :weights]
end
# @show clusters, strata, sampsize,popsize, sampfraction, fpc, ignorefpc
new(data, clusters, strata, sampsize, popsize, sampfraction, fpc, ignorefpc)
elseif isa(clusters,Symbol)
# One Cluster sampling - PSU chosen with SRS,
print("One stage cluster design with PSU SRS")
elseif typeof(clusters) <: Vector{Symbol}
# TODO "Multistage cluster designs"
print("TODO: Multistage cluster design with PSU SRS")
end
end
end
46 changes: 36 additions & 10 deletions src/ht.jl
Original file line number Diff line number Diff line change
@@ -1,17 +1,43 @@
# TODO: add docstrings
function HorwitzThompson_HartleyRaoApproximation(y::AbstractVector, design::AbstractSurveyDesign, HT_total)
"""
Hartley Rao Approximation of the variance of the Horvitz-Thompson Estimator.
Avoids exhaustively calculating joint (inclusion) probabilities πᵢⱼ of the sampling scheme.
"""
function HT_HartleyRaoVarApprox(y::AbstractVector, design::AbstractSurveyDesign, HT_total)
# TODO: change function name
# TODO: change variable name (gets mixed up with the constant pi)
pi = design.data.probs
hartley_rao_var = sum((1 .- ((design.sampsize .- 1) ./ design.sampsize) .* pi) .*
πᵢ = design.data.probs
hartley_rao_var = sum((1 .- ((design.sampsize .- 1) ./ design.sampsize) .* πᵢ) .*
((y .* design.data.weights) .- (HT_total ./ design.sampsize)) .^ 2)
return sqrt(hartley_rao_var)
return hartley_rao_var
end

# TODO: add docstrings
function ht_calc(x::Symbol, design)
# TODO: change function name
"""
Horvitz-Thompson Estimator of Population Total
For arbitrary sampling probabilities
"""
function ht_svytotal(x::Symbol, design)
total = wsum(Float32.(design.data[!, x]), design.data.weights)
se = HorwitzThompson_HartleyRaoApproximation(design.data[!, x], design, total)
return DataFrame(total = total, se = se)
var_total = HT_HartleyRaoVarApprox(design.data[!, x], design, total)
return DataFrame(total = total, se = sqrt(var_total))
end

# TODO: add docstrings
"""
Horvitz-Thompson Estimator of Population Mean
Scales the Population Total by the relevant
"""
function ht_svymean(x::Symbol, design)
total_df = ht_svytotal(x, design)
total = total_df.total[1]
var_total = total_df.se[1]

mean = 1.0 ./ sum(design.popsize) .* total
# @show total_df, total, var_total, mean
#1.0 ./ (sum(design.popsize).^2) .*

se = sqrt( var_total )

@show design.popsize

return DataFrame(mean = mean, se = se)
end
3 changes: 3 additions & 0 deletions src/poststratify.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"""
Postratification functions
"""
3 changes: 3 additions & 0 deletions src/sampling.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"""
Sampling functions using various schemes
"""
17 changes: 17 additions & 0 deletions src/show.jl
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,23 @@ function Base.show(io::IO, ::MIME"text/plain", design::AbstractSurveyDesign)
printinfo(io, "ignorefpc", string(design.ignorefpc); newline=false)
end

"`show` method for printing information about a survey design"
function Base.show(io::IO, ::MIME"text/plain", design::SurveyDesign)
type = typeof(design)
printstyled(io, "$type:\n"; bold=true)
printstyled(io, "data: "; bold=true)
println(io, size(design.data, 1), "x", size(design.data, 2), " DataFrame")
println(io, "clusters: ", design.clusters)
println(io, "strata: ", design.strata)
printinfo(io, "weights", makeshort(design.data.weights))
printinfo(io, "probs", makeshort(design.data.probs))
printinfo(io, "popsize", makeshort(design.popsize))
printinfo(io, "sampsize", makeshort(design.sampsize))
printinfo(io, "fpc", makeshort(design.data.fpc))
printinfo(io, "sampfraction", makeshort(design.sampfraction))
printinfo(io, "ignorefpc", string(design.ignorefpc); newline=false)
end

"Print information about a survey design initialized using `svydesign`."
function Base.show(io::IO, ::MIME"text/plain", design::svydesign)
printstyled(io, "Survey Design:\n"; bold=true)
Expand Down
14 changes: 14 additions & 0 deletions src/svyratio.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
"""
Ratio estimators and Subpopulation estimates.
Domain Estimation is special case of subpopulation estimate.
"""

function ratio(x::Symbol,y::Symbol,design::AbstractSurveyDesign)
num = design.data[!,x]
denom = design.data[!,y]
# fill(A,)
# A == B

mean = num / denom
return DataFrame(mean = mean)
end
Loading