Skip to content

Commit

Permalink
Merge pull request #63 from iuliadmtru/minor_corrections
Browse files Browse the repository at this point in the history
Minor corrections
  • Loading branch information
ayushpatnaikgit authored Oct 16, 2022
2 parents d1bf631 + 86d139d commit ffea267
Show file tree
Hide file tree
Showing 17 changed files with 177 additions and 514 deletions.
3 changes: 2 additions & 1 deletion src/Survey.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@ using GLM
using LinearAlgebra
using CairoMakie
using AlgebraOfGraphics
using CategoricalArrays # For CategoricalArray support
using CategoricalArrays

include("SurveyDesign.jl")
include("show.jl")
include("svydesign.jl")
include("svymean.jl")
include("svyquantile.jl")
Expand Down
269 changes: 68 additions & 201 deletions src/SurveyDesign.jl

Large diffs are not rendered by default.

89 changes: 13 additions & 76 deletions src/ht.jl
Original file line number Diff line number Diff line change
@@ -1,80 +1,17 @@
function HorwitzThompson_HartleyRaoApproximation(y::AbstractVector, design, HT_total)
# TODO: add docstrings
function HorwitzThompson_HartleyRaoApproximation(y::AbstractVector, design::AbstractSurveyDesign, HT_total)
# TODO: change function name
# TODO: change variable name (gets mixed up with the constant pi)
pi = design.data.probs
hartley_rao_variance_formula = sum( (1 .- ((design.sampsize .- 1) ./ design.sampsize ) .* pi ) .* ( (y .* design.data.weights) .- (HT_total./design.sampsize) ).^2 )
return sqrt(hartley_rao_variance_formula)
hartley_rao_var = sum((1 .- ((design.sampsize .- 1) ./ design.sampsize) .* pi) .*
((y .* design.data.weights) .- (HT_total ./ design.sampsize)) .^ 2)
return sqrt(hartley_rao_var)
end

function ht_calc(x::Symbol,design)
total = wsum(Float32.( design.data[!,x] ), design.data.weights )
# se = hte_se( design.data[!,x], design ) # works
se = HorwitzThompson_HartleyRaoApproximation( design.data[!,x], design , total) # Also works
return DataFrame(total = total, se = se )
# TODO: add docstrings
function ht_calc(x::Symbol, design)
# TODO: change function name
total = wsum(Float32.(design.data[!, x]), design.data.weights)
se = HorwitzThompson_HartleyRaoApproximation(design.data[!, x], design, total)
return DataFrame(total = total, se = se)
end

# function HansenHurwitzEstimator()

# end

# function HorwitzThompson_HartleyRaoApproximation(y::Vector{Int}, design, HT_total)
# pi = design.data.probs
# hartley_rao_variance_formula = sum( (1 .- ((design.sampsize .- 1) ./ design.sampsize ) .* pi ) .* ( (y./pi) .- (total./design.sampsize) ).^2 )
# return sqrt(hartley_rao_variance_formula)
# end

# # prob = 1 .- ((1 .- pi ) .^ (1/n))
# # For SRS only, in other design, we need to make a function
# # pi_i = design.sampsize / design.popsize
# # pi_j = design.sampsize / design.popsize

# # Use Hartley Rao approximation to calc joint probability of selecting two units, for any sample
# pi_ij = hartley_rao_approx() # design.sampsize .* (design.sampsize .- 1 ) ./ ( design.popsize .* (design.popsize .- 1) )

# first_sum = sum((1 .- pi) ./ (pi .^ 2) .* (y .^ 2))
# @show pi, pi_ij, first_sum
# second_sum = 0 # sum( (pi_ij - ( pi_i * pi_j) ) / (pi_i * pi_j * pi_ij) .* y[i] .* y[j] )

# # pimat = zeros(length(pi),length(pi))
# # coefmat=zeros(length(pi),length(pi))

# for i in 1:length(pi)
# for j in 1:length(pi)
# if i != j
# # pimat[i,j] = pi[i] + pi[j] - (1 - ((1 - prob[i] - prob[j] ) ^ n))
# coefmat = ((pi_ij - ( pi_i * pi_j) ) / (pi_i * pi_j * pi_ij))
# second_sum += coefmat * y[i] * y[j]
# end
# end
# end

# return sqrt(first_sum + second_sum)
# # catch
# # return -1
# # end
# end

# function hte_se(y::AbstractVector, design::SimpleRandomSample)
# pi = design.data.probs

# # For SRS only, in other design, we need to make a function
# pi_i = design.sampsize / design.popsize
# pi_j = design.sampsize / design.popsize
# pi_ij = design.sampsize .* (design.sampsize .- 1 ) ./ ( design.popsize .* (design.popsize .- 1) )

# first_sum = sum((1 .- pi_i) ./ (pi_i .^ 2) .* (y .^ 2))
# @show pi_i , pi_j, pi_ij, first_sum

# second_sum = 0
# coefmat = ((pi_ij - ( pi_i * pi_j) ) / (pi_i * pi_j * pi_ij))
# for i in 1:length(pi)
# for j in 1:length(pi)
# if i != j
# second_sum += coefmat * y[i] * y[j]
# end
# end
# end
# @show second_sum
# return sqrt(first_sum + second_sum)
# end



30 changes: 30 additions & 0 deletions src/show.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"Helper function for nice printing"
function makeshort(x)
# write floats in short form
if isa(x[1], Float64)
x = round.(x, sigdigits=3)
end
# print short vectors or single values as they are, compress otherwise
x = length(x) < 3 ? join(x, ", ") : join(x[1:3], ", ") * ", ..., " * string(last(x))
end

"Prints title: content"
function printinfo(io::IO, name::String, content::String; newline::Bool=true)
printstyled(io, name, ": "; bold=true)
newline ? println(io, content) : print(io, content)
end

"`show` method for printing information about a survey design"
function Base.show(io::IO, ::MIME"text/plain", design::AbstractSurveyDesign)
type = typeof(design)
printstyled(io, "$type:\n"; bold=true)
printstyled(io, "data: "; bold=true)
println(io, size(design.data, 1), "x", size(design.data, 2), " DataFrame")
printinfo(io, "weights", makeshort(design.data.weights))
printinfo(io, "probs", makeshort(design.data.probs))
printinfo(io, "fpc", makeshort(design.data.fpc))
printinfo(io, "popsize", makeshort(design.popsize))
printinfo(io, "sampsize", makeshort(design.sampsize))
printinfo(io, "sampfraction", makeshort(design.sampfraction))
printinfo(io, "ignorefpc", string(design.ignorefpc); newline=false)
end
2 changes: 1 addition & 1 deletion src/svyby.jl
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@ julia> svyby(:api00, :cname, srs, svytotal)
38 │ Merced 595.0 NaN
23 rows omitted
```
TODO: functionality for `formula::AbstractVector`
"""
function svyby(formula::Symbol, by::Symbol, design::AbstractSurveyDesign, func::Function, params = [])
# TODO: add functionality for `formula::AbstractVector`
gdf = groupby(design.data, by)
return combine(gdf, [formula, :weights] => ((a, b) -> func(a, design, b, params...)) => AsTable)
end
9 changes: 0 additions & 9 deletions src/svydesign.jl
Original file line number Diff line number Diff line change
@@ -1,12 +1,3 @@
# # Helper function for nice printing
# function print_short(x)
# if length(x) < 3
# print(x)
# else
# print( x[1], ", ", x[2], ", ", x[3], " ...", " (length = ", length(x), ")")
# end
# end

"""
The `svydesign` object combines a data frame and all the survey design information needed to analyse it.
Expand Down
85 changes: 25 additions & 60 deletions src/svymean.jl
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# SimpleRandomSample

"""
Compute the mean of the survey variable `var`.
Expand Down Expand Up @@ -37,12 +39,12 @@ function sem(x::AbstractVector, design::SimpleRandomSample)
end

function svymean(x::Symbol, design::SimpleRandomSample)
# Support behaviour like R for CategoricalArray type data
if isa(x,Symbol) && isa(design.data[!,x], CategoricalArray)
if isa(x, Symbol) && isa(design.data[!, x], CategoricalArray)
gdf = groupby(design.data, x)
p = combine(gdf, nrow => :counts )
p = combine(gdf, nrow => :counts)
p.proportion = p.counts ./ sum(p.counts)
p.var = design.fpc .* p.proportion .* (1 .- p.proportion) ./ (design.sampsize - 1) # Formula for variance of proportion
# variance of proportion
p.var = design.fpc .* p.proportion .* (1 .- p.proportion) ./ (design.sampsize - 1)
p.se = sqrt.(p.var)
return p
end
Expand All @@ -52,102 +54,65 @@ end
function svymean(x::Vector{Symbol}, design::SimpleRandomSample)
means_list = []
for i in x
push!(means_list, svymean(i,design))
push!(means_list, svymean(i, design))
end
df = reduce(vcat, means_list)
insertcols!(df,1, :names => String.(x))
insertcols!(df, 1, :names => String.(x))
return df
end

"""
Inner method for `svyby`.
"""

function sem_svyby(x::AbstractVector, design::SimpleRandomSample, weights)
# return sqrt( (1 - length(x)/design.popsize) ./ length(x) .* var(x) ) .* sqrt(1 - 1 / length(x) + 1 / design.sampsize)
# f = sqrt()
# pweights = 1 ./ design.data.prob
# N = sum(design.data.weights)
# Nd = sum(weights)
# Nd = length(x)
N = sum(weights)
Nd = length(x)
Pd = Nd/N
Pd = Nd / N
n = design.sampsize
n̄d = n * Pd
Ȳd = mean(x)
S2d = var(x)
Sd = sqrt(S2d)
CVd = Sd / Ȳd
@show(N,Nd,Pd,n,n̄d,Ȳd,S2d,Sd,CVd)
@show((1 ./ n̄d - 1 ./Nd ))
V̂_Ȳd = ((1 ./ n̄d) - (1 ./Nd) ) * S2d * (1 + (1 - Pd) / CVd^2 )
print(V̂_Ȳd)
print("\n")
# print(V̂_Ȳd,"\n")
V̂_Ȳd = ((1 ./ n̄d) - (1 ./ Nd)) * S2d * (1 + (1 - Pd) / CVd^2)
return sqrt(V̂_Ȳd)
end
# vsrs = var_of_mean(x, design)


# vsrs2 = vsrs .* (psum-length(x)) ./ psum
# return sqrt(vsrs2)
# return sqrt( (1 - 1 / length(x) + 1 / design.sampsize) .* var(x) ./ length(x) )

# TODO: results not matching for `sem`
function svymean(x::AbstractVector , design::SimpleRandomSample, weights)
function svymean(x::AbstractVector, design::SimpleRandomSample, weights)
return DataFrame(mean = mean(x), sem = sem_svyby(x, design::SimpleRandomSample, weights))
end

""" mean for Categorical variables
"""

"""
StratifiedSample functions
"""
# StratifiedSample

# function svymean(x::, design::SimpleRandomSample)
# return DataFrame(mean = mean(design.data[!, x]), sem = sem(x, design::SimpleRandomSample))
# end
# function strata_sample_variance(y)
# 1 / ()
# end
function svymean(x::Symbol, design::StratifiedSample)
# Support behaviour like R for CategoricalArray type data
print("Yolo")
if x == design.strata
gdf = groupby(design.data, x)
# nₕ = combine(gdf, nrow => :counts )
p = combine(gdf, :weights => sum => :Nₕ )
p = combine(gdf, :weights => sum => :Nₕ)
p.Wₕ = p.Nₕ ./ sum(p.Nₕ)
p = select!(p, Not(:Nₕ))
# p.proportion = p.counts ./ sum(p.counts)
# @show nₕ , Nₕ , Wₕ
return p
elseif isa(x,Symbol) && isa(design.data[!,x], CategoricalArray)
print("Yolo")
return p
elseif isa(x, Symbol) && isa(design.data[!, x], CategoricalArray)
gdf = groupby(design.data, x)
p = combine(gdf, nrow => :counts )
# @show p
p = combine(gdf, nrow => :counts)
p.proportion = p.counts ./ sum(p.counts)
p.var = design.fpc .* p.proportion .* (1 .- p.proportion) ./ (design.sampsize - 1) # Formula for variance of proportion
# variance of proportion
p.var = design.fpc .* p.proportion .* (1 .- p.proportion) ./ (design.sampsize - 1)
p.se = sqrt.(p.var)
return p
end
gdf = groupby(design.data,design.strata)
gdf = groupby(design.data, design.strata)

ȳₕ = combine(gdf, x => mean => :mean).mean
Nₕ = combine(gdf, :weights => sum => :Nₕ ).Nₕ
Nₕ = combine(gdf, :weights => sum => :Nₕ).Nₕ
nₕ = combine(gdf, nrow => :nₕ).nₕ
fₕ = nₕ ./ Nₕ
Wₕ = Nₕ ./ sum(Nₕ)
Ȳ̂ = sum(Wₕ .* ȳₕ)
# Ȳ̂ = mean(ȳₕ, Wₕ ) # Is also correct

s²ₕ = combine(gdf, x => var => :s²h ).s²h
V̂Ȳ̂ = sum( (Wₕ .^2) .* (1 .- fₕ) .* s²ₕ ./ nₕ )
s²ₕ = combine(gdf, x => var => :s²h).s²h
V̂Ȳ̂ = sum((Wₕ .^ 2) .* (1 .- fₕ) .* s²ₕ ./ nₕ)
SE = sqrt(V̂Ȳ̂)

# @show nₕ , Nₕ , fₕ , ȳₕ , Wₕ , Ȳ̂ , s²ₕ
return DataFrame(Ȳ̂ = Ȳ̂ , SE = SE) # , sem = sem(x, design::SimpleRandomSample))

return DataFrame(Ȳ̂ = Ȳ̂, SE = SE)
end
9 changes: 1 addition & 8 deletions src/svyquantile.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,20 +14,13 @@ julia> svyquantile(:enroll, srs, 0.5)
1 │ 453.0
```
"""
# TODO: modify for SimpleRandomSample
function svyquantile(var, design::AbstractSurveyDesign)
return error("Please specify q quantile as a vector or float")
end

function svyquantile(var, design::SimpleRandomSample, q; kwargs...)
x = design.data[!, var]
# w = design.data.probs
df = DataFrame(tmp = quantile(Float32.(x), q; kwargs...)) # Define Lumley quantile
df = DataFrame(tmp = quantile(Float32.(x), q; kwargs...))
rename!(df, :tmp => Symbol(string(q) .* "th percentile"))
return df
end

# TODO: modify for StratifiedSample
function svyquantile(var, design::StratifiedSample, q)
x = design.data[!, var]
w = design.data.probs
Expand Down
Loading

0 comments on commit ffea267

Please sign in to comment.