Merge pull request #63 from iuliadmtru/minor_corrections

Minor corrections
xKDR · Oct 16, 2022 · ffea267 · ffea267
2 parents d1bf631 + 86d139d
commit ffea267
Show file tree

Hide file tree

Showing 17 changed files with 177 additions and 514 deletions.
diff --git a/src/Survey.jl b/src/Survey.jl
@@ -8,9 +8,10 @@ using GLM
 using LinearAlgebra
 using CairoMakie
 using AlgebraOfGraphics
-using CategoricalArrays # For CategoricalArray support
+using CategoricalArrays
 
 include("SurveyDesign.jl")
+include("show.jl")
 include("svydesign.jl")
 include("svymean.jl")
 include("svyquantile.jl")

diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl
diff --git a/src/ht.jl b/src/ht.jl
@@ -1,80 +1,17 @@
-function HorwitzThompson_HartleyRaoApproximation(y::AbstractVector, design, HT_total)
+# TODO: add docstrings
+function HorwitzThompson_HartleyRaoApproximation(y::AbstractVector, design::AbstractSurveyDesign, HT_total)
+    # TODO: change function name
+    # TODO: change variable name (gets mixed up with the constant pi)
     pi = design.data.probs
-    hartley_rao_variance_formula = sum( (1 .- ((design.sampsize .- 1) ./ design.sampsize ) .* pi ) .* ( (y .* design.data.weights) .- (HT_total./design.sampsize) ).^2 )
-    return sqrt(hartley_rao_variance_formula)
+    hartley_rao_var = sum((1 .- ((design.sampsize .- 1) ./ design.sampsize) .* pi) .*
+                          ((y .* design.data.weights) .- (HT_total ./ design.sampsize)) .^ 2)
+    return sqrt(hartley_rao_var)
 end
 
-function ht_calc(x::Symbol,design)
-    total = wsum(Float32.( design.data[!,x] ), design.data.weights )
-    # se = hte_se( design.data[!,x], design ) # works
-    se = HorwitzThompson_HartleyRaoApproximation( design.data[!,x], design , total) # Also works
-    return DataFrame(total = total, se = se  )
+# TODO: add docstrings
+function ht_calc(x::Symbol, design)
+    # TODO: change function name
+    total = wsum(Float32.(design.data[!, x]), design.data.weights)
+    se = HorwitzThompson_HartleyRaoApproximation(design.data[!, x], design, total)
+    return DataFrame(total = total, se = se)
 end
-
-# function HansenHurwitzEstimator()
-
-# end
-
-# function HorwitzThompson_HartleyRaoApproximation(y::Vector{Int}, design, HT_total)
-#     pi = design.data.probs
-#     hartley_rao_variance_formula = sum( (1 .- ((design.sampsize .- 1) ./ design.sampsize ) .* pi ) .* ( (y./pi) .- (total./design.sampsize) ).^2 )
-#     return sqrt(hartley_rao_variance_formula)
-# end
-
-#     # prob = 1 .- ((1 .- pi ) .^ (1/n))
-#     # For SRS only, in other design, we need to make a function
-#     # pi_i = design.sampsize / design.popsize
-#     # pi_j = design.sampsize / design.popsize
-
-#     # Use Hartley Rao approximation to calc joint probability of selecting two units, for any sample
-#     pi_ij = hartley_rao_approx() #  design.sampsize .* (design.sampsize .- 1 ) ./ ( design.popsize .* (design.popsize .- 1) )
-
-#     first_sum = sum((1 .- pi) ./ (pi .^ 2) .* (y .^ 2))
-#     @show pi, pi_ij, first_sum
-#     second_sum = 0 # sum( (pi_ij - ( pi_i * pi_j) ) / (pi_i * pi_j * pi_ij) .* y[i] .* y[j] )
-
-#     # pimat = zeros(length(pi),length(pi))
-#     # coefmat=zeros(length(pi),length(pi))
-
-#     for i in 1:length(pi)
-#         for j in 1:length(pi)
-#             if i != j
-#                 # pimat[i,j] = pi[i] + pi[j] - (1 - ((1 - prob[i] - prob[j] ) ^ n))
-#                 coefmat = ((pi_ij - ( pi_i * pi_j) ) / (pi_i * pi_j * pi_ij))
-#                 second_sum += coefmat * y[i] * y[j]
-#             end
-#         end
-#     end
-
-#     return sqrt(first_sum + second_sum)
-#     # catch
-#     #     return -1
-#     # end
-# end
-
-# function hte_se(y::AbstractVector, design::SimpleRandomSample)
-#     pi = design.data.probs
-
-#     # For SRS only, in other design, we need to make a function
-#     pi_i = design.sampsize / design.popsize
-#     pi_j = design.sampsize / design.popsize
-#     pi_ij = design.sampsize .* (design.sampsize .- 1 ) ./ ( design.popsize .* (design.popsize .- 1) )
-
-#     first_sum = sum((1 .- pi_i) ./ (pi_i .^ 2) .* (y .^ 2))
-#     @show pi_i , pi_j, pi_ij, first_sum
-
-#     second_sum = 0
-#     coefmat = ((pi_ij - ( pi_i * pi_j) ) / (pi_i * pi_j * pi_ij))
-#     for i in 1:length(pi)
-#         for j in 1:length(pi)
-#             if i != j
-#                 second_sum += coefmat * y[i] * y[j]
-#             end
-#         end
-#     end
-#     @show second_sum
-#     return sqrt(first_sum + second_sum)
-# end
-
-
-
diff --git a/src/show.jl b/src/show.jl
@@ -0,0 +1,30 @@
+"Helper function for nice printing"
+function makeshort(x)
+    # write floats in short form
+    if isa(x[1], Float64)
+        x = round.(x, sigdigits=3)
+    end
+    # print short vectors or single values as they are, compress otherwise
+    x = length(x) < 3 ? join(x, ", ") : join(x[1:3], ", ") * ", ..., " * string(last(x))
+end
+
+"Prints title: content"
+function printinfo(io::IO, name::String, content::String; newline::Bool=true)
+    printstyled(io, name, ": "; bold=true)
+    newline ? println(io, content) : print(io, content)
+end
+
+"`show` method for printing information about a survey design"
+function Base.show(io::IO, ::MIME"text/plain", design::AbstractSurveyDesign)
+    type = typeof(design)
+    printstyled(io, "$type:\n"; bold=true)
+    printstyled(io, "data: "; bold=true)
+    println(io, size(design.data, 1), "x", size(design.data, 2), " DataFrame")
+    printinfo(io, "weights", makeshort(design.data.weights))
+    printinfo(io, "probs", makeshort(design.data.probs))
+    printinfo(io, "fpc", makeshort(design.data.fpc))
+    printinfo(io, "popsize", makeshort(design.popsize))
+    printinfo(io, "sampsize", makeshort(design.sampsize))
+    printinfo(io, "sampfraction", makeshort(design.sampfraction))
+    printinfo(io, "ignorefpc", string(design.ignorefpc); newline=false)
+end
diff --git a/src/svyby.jl b/src/svyby.jl
@@ -31,9 +31,9 @@ julia> svyby(:api00, :cname, srs, svytotal)
   38 │ Merced             595.0   NaN
                            23 rows omitted
 ```
-TODO: functionality for `formula::AbstractVector`
 """
 function svyby(formula::Symbol, by::Symbol, design::AbstractSurveyDesign, func::Function, params = [])
+    # TODO: add functionality for `formula::AbstractVector`
     gdf = groupby(design.data, by)
     return combine(gdf, [formula, :weights] => ((a, b) -> func(a, design, b, params...)) => AsTable)
 end
diff --git a/src/svydesign.jl b/src/svydesign.jl
@@ -1,12 +1,3 @@
-# # Helper function for nice printing
-# function print_short(x)
-#     if length(x) < 3
-#         print(x)
-#     else
-#         print( x[1], ", ", x[2], ", ", x[3], " ...", " (length = ", length(x), ")")
-#     end
-# end
-
 """
 The `svydesign` object combines a data frame and all the survey design information needed to analyse it.
 

diff --git a/src/svymean.jl b/src/svymean.jl
@@ -1,3 +1,5 @@
+# SimpleRandomSample
+
 """
 Compute the mean of the survey variable `var`.
 
@@ -37,12 +39,12 @@ function sem(x::AbstractVector, design::SimpleRandomSample)
 end
 
 function svymean(x::Symbol, design::SimpleRandomSample)
-    # Support behaviour like R for CategoricalArray type data
-    if isa(x,Symbol) && isa(design.data[!,x], CategoricalArray)
+    if isa(x, Symbol) && isa(design.data[!, x], CategoricalArray)
         gdf = groupby(design.data, x)
-        p = combine(gdf, nrow => :counts )
+        p = combine(gdf, nrow => :counts)
         p.proportion = p.counts ./ sum(p.counts)
-        p.var = design.fpc .* p.proportion .* (1 .- p.proportion) ./ (design.sampsize - 1) # Formula for variance of proportion
+        # variance of proportion
+        p.var = design.fpc .* p.proportion .* (1 .- p.proportion) ./ (design.sampsize - 1)
         p.se = sqrt.(p.var)
         return p
     end
@@ -52,102 +54,65 @@ end
 function svymean(x::Vector{Symbol}, design::SimpleRandomSample)
     means_list = []
     for i in x
-        push!(means_list, svymean(i,design))
+        push!(means_list, svymean(i, design))
     end
     df = reduce(vcat, means_list)
-    insertcols!(df,1, :names => String.(x))
+    insertcols!(df, 1, :names => String.(x))
     return df
 end
 
 """
 Inner method for `svyby`.
 """
-
 function sem_svyby(x::AbstractVector, design::SimpleRandomSample, weights)
-    # return sqrt( (1 - length(x)/design.popsize) ./ length(x) .* var(x) ) .*  sqrt(1 - 1 / length(x) + 1 / design.sampsize)
-    # f =  sqrt()
-    # pweights = 1 ./ design.data.prob
-    # N = sum(design.data.weights)
-    # Nd = sum(weights)
-    # Nd = length(x)
     N = sum(weights)
     Nd = length(x)
-    Pd = Nd/N
+    Pd = Nd / N
     n = design.sampsize
     n̄d = n * Pd
     Ȳd = mean(x)
     S2d = var(x)
     Sd = sqrt(S2d)
     CVd = Sd / Ȳd
-    @show(N,Nd,Pd,n,n̄d,Ȳd,S2d,Sd,CVd)
-    @show((1 ./ n̄d - 1 ./Nd ))
-    V̂_Ȳd = ((1 ./ n̄d) - (1 ./Nd) ) * S2d * (1 + (1 - Pd) /  CVd^2 )
-    print(V̂_Ȳd)
-    print("\n")
-    # print(V̂_Ȳd,"\n")
+    V̂_Ȳd = ((1 ./ n̄d) - (1 ./ Nd)) * S2d * (1 + (1 - Pd) / CVd^2)
     return sqrt(V̂_Ȳd)
 end
-    # vsrs = var_of_mean(x, design)
-
-
-    # vsrs2 = vsrs .* (psum-length(x)) ./ psum
-    # return sqrt(vsrs2)
-    # return sqrt( (1 - 1 / length(x) + 1 / design.sampsize) .* var(x) ./ length(x) ) 
 
-# TODO: results not matching for `sem`
-function svymean(x::AbstractVector , design::SimpleRandomSample, weights)
+function svymean(x::AbstractVector, design::SimpleRandomSample, weights)
     return DataFrame(mean = mean(x), sem = sem_svyby(x, design::SimpleRandomSample, weights))
 end
 
-""" mean for Categorical variables 
-"""
 
-"""
-StratifiedSample functions
-"""
+# StratifiedSample
 
-# function svymean(x::, design::SimpleRandomSample)
-#     return DataFrame(mean = mean(design.data[!, x]), sem = sem(x, design::SimpleRandomSample))
-# end
-# function strata_sample_variance(y)
-#     1 / ()
-# end
 function svymean(x::Symbol, design::StratifiedSample)
-    # Support behaviour like R for CategoricalArray type data
-    print("Yolo")
     if x == design.strata
         gdf = groupby(design.data, x)
-        # nₕ = combine(gdf, nrow => :counts )
-        p = combine(gdf, :weights => sum => :Nₕ )
+        p = combine(gdf, :weights => sum => :Nₕ)
         p.Wₕ = p.Nₕ ./ sum(p.Nₕ)
         p = select!(p, Not(:Nₕ))
-        # p.proportion = p.counts ./ sum(p.counts)
-        # @show nₕ , Nₕ , Wₕ
-        return p 
-    elseif isa(x,Symbol) && isa(design.data[!,x], CategoricalArray)
-        print("Yolo")
+        return p
+    elseif isa(x, Symbol) && isa(design.data[!, x], CategoricalArray)
         gdf = groupby(design.data, x)
-        p = combine(gdf, nrow => :counts )
-        # @show p
+        p = combine(gdf, nrow => :counts)
         p.proportion = p.counts ./ sum(p.counts)
-        p.var = design.fpc .* p.proportion .* (1 .- p.proportion) ./ (design.sampsize - 1) # Formula for variance of proportion
+        # variance of proportion
+        p.var = design.fpc .* p.proportion .* (1 .- p.proportion) ./ (design.sampsize - 1)
         p.se = sqrt.(p.var)
         return p
     end
-    gdf = groupby(design.data,design.strata)
-    
+    gdf = groupby(design.data, design.strata)
+
     ȳₕ = combine(gdf, x => mean => :mean).mean
-    Nₕ = combine(gdf, :weights => sum => :Nₕ ).Nₕ
+    Nₕ = combine(gdf, :weights => sum => :Nₕ).Nₕ
     nₕ = combine(gdf, nrow => :nₕ).nₕ
     fₕ = nₕ ./ Nₕ
     Wₕ = Nₕ ./ sum(Nₕ)
     Ȳ̂ = sum(Wₕ .* ȳₕ)
-    # Ȳ̂ = mean(ȳₕ, Wₕ ) # Is also correct 
 
-    s²ₕ = combine(gdf, x => var => :s²h ).s²h
-    V̂Ȳ̂ = sum( (Wₕ .^2) .* (1 .- fₕ) .* s²ₕ ./ nₕ )
+    s²ₕ = combine(gdf, x => var => :s²h).s²h
+    V̂Ȳ̂ = sum((Wₕ .^ 2) .* (1 .- fₕ) .* s²ₕ ./ nₕ)
     SE = sqrt(V̂Ȳ̂)
-
-    # @show nₕ , Nₕ , fₕ , ȳₕ , Wₕ , Ȳ̂ , s²ₕ
-    return DataFrame(Ȳ̂ = Ȳ̂ , SE = SE) # , sem = sem(x, design::SimpleRandomSample))
+
+    return DataFrame(Ȳ̂ = Ȳ̂, SE = SE)
 end
diff --git a/src/svyquantile.jl b/src/svyquantile.jl
@@ -14,20 +14,13 @@ julia> svyquantile(:enroll, srs, 0.5)
    1 │            453.0
 ```
 """
-# TODO: modify for SimpleRandomSample
-function svyquantile(var, design::AbstractSurveyDesign)
-    return error("Please specify q quantile as a vector or float")
-end
-
 function svyquantile(var, design::SimpleRandomSample, q; kwargs...)
     x = design.data[!, var]
-    # w = design.data.probs
-    df = DataFrame(tmp = quantile(Float32.(x), q; kwargs...)) # Define Lumley quantile
+    df = DataFrame(tmp = quantile(Float32.(x), q; kwargs...))
     rename!(df, :tmp => Symbol(string(q) .* "th percentile"))
     return df
 end
 
-# TODO: modify for StratifiedSample
 function svyquantile(var, design::StratifiedSample, q)
     x = design.data[!, var]
     w = design.data.probs