diff --git a/clean_examples.jl b/clean_examples.jl new file mode 100644 index 00000000..7bcb4cb7 --- /dev/null +++ b/clean_examples.jl @@ -0,0 +1,168 @@ +# ### Lumley Texbook code, Fig 2.2 pg 20 +using Revise +using Survey +using DataFrames +using CSV + +# Load in dataframe +apisrs = CSV.read("assets/apisrs.csv",DataFrame) + +### Set design (All should give identical results) +srs_design = SimpleRandomSample(apisrs, popsize = apisrs.fpc) # popsize only +srs_design = SimpleRandomSample(apisrs, weights = apisrs.pw) # no popsize, so weights given as Vector +srs_design = SimpleRandomSample(apisrs, weights = :pw) # no popsize, so weights given as Symbol +srs_design = SimpleRandomSample(apisrs, probs = 1 ./ apisrs.pw) # no popsize, so probs given as Vector + +svytotal(:enroll,srs_design) +svymean([:enroll,:api00],srs_design) +svymean(:enroll,srs_design) + +# svytotal error +svytotal(:api00, srs) + +# No fpc example +no_fpc = SimpleRandomSample(apisrs, ignorefpc = true) +svytotal(:enroll,no_fpc) +svytotal(:api00,no_fpc) +svymean(:enroll,no_fpc) + +#### +using Revise +using Survey +using DataFrames +using CSV +using CategoricalArrays +# Test feature for categorical variables +apisrs_categ = CSV.read("assets/apisrs.csv",DataFrame) +eltype(apisrs_categ.stype) +# Convert a column to CategoricalArray +apisrs_categ.stype = CategoricalArray(apisrs_categ.stype) +eltype(apisrs_categ.stype) + +srs_design_categ = SimpleRandomSample(apisrs_categ, popsize = apisrs_categ.fpc) + +# isa(srs_design_categ.data.stype, CategoricalArray) +# isa(srs_design_categ.data[!,:stype], CategoricalArray) + +# Svymean and svytotal example +svymean(:enroll,srs_design_categ) # works +svymean(:stype,srs_design_categ) # no method matching /(::CategoricalValue{String1, UInt32}, ::Int64) +svytotal(:stype,srs_design_categ) + +# way to update +srs_design.data.apidiff = srs_design.data.api00 - srs_design.data.api99 + + +svyquantile(:enroll, srs_design_categ,0.5) + +# isa(srs_design_categ.data.stype, CategoricalArray) + + +# # apisrs = DataFrame(CSV.file("data/apisrs.csv")) +# # Base.format_bytes(Base.summarysize(apisrs.stype)) +# # Base.format_bytes(Base.summarysize(CategoricalArray(apisrs.stype))) + + +# ### Test 10.09.22 + +# gdf = groupby(design.data, by) +# combine(gdf, [formula, :weights] => ((a, b) -> func(a, design, b, params...)) => AsTable) + +# using Revise +# using Survey +# using DataFrames +# using CSV +# using StatsBase + +# apisrs_categ = CSV.read("assets/apisrs.csv",DataFrame) # laod data +# srs_design = SimpleRandomSample(apisrs_categ, popsize = apisrs_categ.fpc) # create design object +# # manually grouby to get result +# gdf = groupby(srs_design.data, :cname ) +# combine(gdf, :api00 => mean) # works +# combine(gdf, (:api00,srs_design) => svymean) + +# combine(gdf, [:api00, :pw] => ((a, b) -> svymean(a, srs_design, b)) => AsTable) + +# Test 12.09.22 +using Revise +using Survey +using DataFrames +using CSV +using StatsBase +apisrs_categ = CSV.read("assets/apisrs.csv",DataFrame) # laod data +srs_design = SimpleRandomSample(apisrs_categ, popsize = apisrs_categ.fpc) # create design object +gdf = groupby(srs_design.data, :cname ) +combine(gdf, [:api00, :pw] => ((a, b) -> svymean(a, srs_design, b)) => AsTable) + + + + + # # print("Yolo") + # test = combine(gdf, x => mean => :mean) # |> DataFrame |> AsTable # , (x , design) => sem => :sem ) |> DataFrame + # @show test + # # show(test) + # # delay(50000) + # return 0 + +## 21.09.22 Stratified test 1 +# Ideally you should stratify on a CategoricalArray, alternatively, convert the StringX to categorical value before running stratifiedSample +using Revise +using Survey +using DataFrames +using CSV +using StatsBase +using CategoricalArrays + +apistrat_categ = CSV.read("assets/apistrat.csv",DataFrame) # load data +apistrat_categ.stype = CategoricalArray(apistrat_categ.stype) +eltype(apistrat_categ.stype) + +strat_categ_design = StratifiedSample(apistrat_categ, :stype ; popsize = apistrat_categ.fpc ) +svymean(:stype,strat_categ_design) +svytotal(:stype,strat_categ_design) + +### Strat normal +using Revise +using Survey +using DataFrames +using CSV +using StatsBase + +apistrat = CSV.read("assets/apistrat.csv",DataFrame) # laod data +strat_design = StratifiedSample(apistrat, :stype ; popsize = apistrat.fpc ) +svytotal(:api00,strat_design) +svymean(:api00,strat_design) + +svytotal(:enroll,strat_design) +svymean(:enroll,strat_design) + +# Support for categorical var + +# Test feature for categorical variables + + +srs_design_categ = SimpleRandomSample(apisrs_categ, popsize = apisrs_categ.fpc) + +# V̂ȳₕ = Nₕ .^2 ./ nₕ .* (1 .- fₕ) .* s²ₕ + # V̂Ȳ̂ = 1 ./ sum(Nₕ) .* sum( Nₕ .^2 .* V̂ȳₕ) #(Nₕ .^ 2) .* design.fpc .* s²h ./ design.sampsize # sum(combine(gdf, [x,:weights] => ( (a,b) -> wsum(a,b) ) => :total).total) + + +StratifiedSample(apistrat, :stype ; weights = :pw ) + + +## 26.09.22 HT test +using Revise +using Survey +using DataFrames +using CSV + +# Load in dataframe +apisrs = CSV.read("assets/apisrs.csv",DataFrame) + +### Set design (All should give identical results) +srs_design = SimpleRandomSample(apisrs, popsize = apisrs.fpc) # popsize only + +ht_calc(:api00, srs_design) + + +ht_calc(:api00, strat_design) \ No newline at end of file diff --git a/shikharTests.jl b/shikharTests.jl new file mode 100644 index 00000000..e311257b --- /dev/null +++ b/shikharTests.jl @@ -0,0 +1,101 @@ +## Shikhar added test 24.08.22 +using Revise; +using Survey; +apisrs = load_data("apisrs"); +srs = SimpleRandomSample(apisrs, weights = apisrs.pw ); +svymean(:enroll, srs) + +# Test without fpc +using Revise; +using Survey; +apisrs_nofpc = load_data("apisrs"); +srs = SimpleRandomSample(apisrs_nofpc,weights = apisrs.pw,ignorefpc = true); +svytotal(:enroll, srs) + +using Revise; +using Survey; +using DataFrames; +apisrs = load_data("apisrs"); +srs = SimpleRandomSample(apisrs, weights = apisrs.pw ); +svytotal(:enroll, srs) + +srs_design = SimpleRandomSample(apisrs, weights = apisrs.pw ); +factor_variable_test = svytotal(:stype, srs) + +########## +using Survey +srs_design = SimpleRandomSample(apisrs, weights = apisrs.pw ) + + +macro svypipe(design::AbstractSurveyDesign, args...) + # Some definitions +end +@svypipe design |> groupby(:country) |> mean(:height) + +using StatsBase +combine(groupby(x, :country) , :height => mean) + +# Works +@pipe x |> groupby(_, :country) |> combine(_, :height => mean) +#doesnt work +@pipe x |> groupby(:country) |> combine(_, :height => mean) + +using Lazy +import DataFrames.groupby +@> x groupby(:country) combine(:height => mean) + + + + +### Test svyby +svyby(:api00,:cname, srs, svymean ) +groupby(apisrs,:cname) +combine(groupby(apisrs,:cname) , :api00 => mean) +combine(groupby(apisrs,:cname) , :api00 => svymean => AsTable) + + + + +x = DataFrame(country = [1,2,3,4,4], height = [10,20,30,40,20]) + +svyby(srs_desing, [enroll,] , summarise = mean, col = col1) + +(srs_design, enroll) + +# function |> (design::AbstractSurveyDesign ; func) +# design.data |> func(...) +# end + + + +### 5.09.22 Cleaned up tests +using Revise; +using Survey; +apisrs = load_data("apisrs"); +srs = SimpleRandomSample(apisrs, weights = apisrs.pw ); +svymean(:enroll, srs) + + +# New issue: +# Add CategoricalArrays ("Factor") support, multiple dispatch +# Add multiple dispatch methods for `CategoricalArray` type columns in the dataset + +# • Intelligent parsing of `StringX` columns to be read as CategoricalArrays. +# Eg/ if nunique(col) < len(col)/2 + + # # If sampling probabilities given then sampling weights is inverse of probs + # if !isnothing(probs) + # weights = 1 ./ probs + # end + + + # sampsize::Union{Nothing,Vector{<:Real}} + # popsize::Union{Nothing,Vector{<:Real}} + # sampfraction::Vector{<:Real} + # fpc::Vector{<:Real} + # combine(gdf) do sdf + # DataFrame(mean = mean(sdf[!, x], sem = sem(x, design::SimpleRandomSample))) + # end + + # if isa(x,Symbol) && + # return DataFrame(mean = ["Yolo"], sem = ["Yolo"]) \ No newline at end of file diff --git a/src/Survey.jl b/src/Survey.jl index ebfa08fc..0e13c280 100644 --- a/src/Survey.jl +++ b/src/Survey.jl @@ -22,12 +22,14 @@ include("svyplot.jl") include("dimnames.jl") include("svyboxplot.jl") include("svyby.jl") +include("ht.jl") export load_data export AbstractSurveyDesign, SimpleRandomSample, StratifiedSample export svydesign export svyglm export svyby +export ht_calc export dim, colnames, dimnames export svymean, svytotal, svyquantile export @formula diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index abb8b1a0..239f8a55 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -114,9 +114,9 @@ function Base.show(io::IO, ::MIME"text/plain", design::SimpleRandomSample) printstyled("Simple Random Sample:\n"; bold=true) printstyled("data: "; bold=true) print(size(design.data, 1), "x", size(design.data, 2), " DataFrame") - printstyled("\nweights: "; bold=true) + printstyled("\ndata.weights: "; bold=true) print_short(design.data.weights) - printstyled("\nprobs: "; bold=true) + printstyled("\ndata.probs: "; bold=true) print_short(design.data.probs) printstyled("\nfpc: "; bold=true) print_short(design.fpc) @@ -136,22 +136,89 @@ end Survey design sampled by stratification. """ struct StratifiedSample <: AbstractSurveyDesign - data::DataFrame - sampsize::UInt - popsize::Union{UInt,Nothing} - sampfraction::Real - fpc::Real - nofpc::Bool - function StratifiedSample(data::DataFrame, strata::AbstractVector; weights=ones(nrow(data)), probs=1 ./ weights) - # add frequency weights, probability weights and sample size columns - data[!, :weights] = weights - data[!, :probs] = probs - # TODO: change `sampsize` and `popsize` - data[!, :popsize] = repeat([nrow(data)], nrow(data)) - data[!, :sampsize] = repeat([nrow(data)], nrow(data)) - data[!, :strata] = strata + data::AbstractDataFrame + strata::Symbol + sampsize::Vector{Union{Nothing,Float64}} + popsize::Vector{Union{Nothing,Float64}} + sampfraction::Vector{Real} + fpc::Vector{Real} + ignorefpc::Bool + function StratifiedSample(data::AbstractDataFrame, strata::Symbol; + popsize=nothing, + sampsize= transform(groupby(data,strata), nrow => :counts ).counts , + weights=nothing, # Check the defaults + probs=nothing, + ignorefpc=false + ) + # Functionality: weights arg can be passed as Symbol instead of vector + if isa(weights, Symbol) + weights = data[!, weights] + end + # Set population size if it is not given; `weights` and `sampsize` must be given + if ignorefpc # && (isnothing(popsize) || isnothing(weights) || isnothing(probs)) + @warn "Assuming equal weights" + weights = ones(nrow(data)) + end + if isnothing(popsize) + # if typeof(weights) <: Vector{<:Real} + # if !all(y -> y == first(weights), weights) # SRS by definition is equi-weighted + # error("Simple Random Sample must be equi-weighted. Different sampling weights detected in vectors") + # end + if isa(weights, Symbol) + weights = data[!, weights] + # if !all(y -> y == first(weights), weights) # SRS by definition is equi-weighted + # error("Simple Random Sample must be equi-weighted. Different sampling weights detected in vectors") + # end + elseif typeof(probs) <: Vector{<:Real} + # if !all(y -> y == first(probs), probs) # SRS by definition is equi-weighted + # error("Simple Random Sample must be equi-weighted. Different sampling probabilities detected in vector") + # end + weights = 1 ./ probs + elseif isa(probs, Symbol) + probs = data[!, probs] + # if !all(y -> y == first(probs), probs) # SRS by definition is equi-weighted + # error("Simple Random Sample must be equi-weighted. Different sampling probabilities detected in vector") + # end + weights = 1 ./ probs + end + # If all weights are equal then estimate + # equal_weight = first(weights) + popsize = sampsize .* weights # |> Vector{Float64} - new(data,sampsize,popsize,sampfraction,fpc,nofpc) + if sampsize > popsize + error("You have either given wrong or not enough keyword args. sampsize cannot be greate than popsize. Check given inputs. eg if weights given then popsize must be given (for now)") + end + elseif typeof(popsize) <: Vector{<:Real} + # if !all(y -> y == first(popsize), popsize) # SRS by definition is equi-weighted + # error("Simple Random Sample must be equi-weighted. Different sampling weights detected in vectors") + # end + # @show(popsize,sampsize) + weights = popsize ./ sampsize # This line is expansion estimator, we may need to change it when doing compley surveys + # popsize = first(popsize) |> UInt + else + error("If popsize not given then either sampling weights or sampling probabilities must be given") + end + # set sampling fraction + sampfraction = sampsize ./ popsize + # set fpc + fpc = ignorefpc ? 1 : 1 .- (sampsize ./ popsize) + # Add columns for weights and probs in data -- Check if really needed to add them as columns + if !isnothing(probs) + # add probability weights column to `data` + data[!, :probs] = probs + # add frequency weights column to `data` + data[!, :weights] = 1 ./ data[!, :probs] + else # else weights were specified + # add frequency weights column to `data` + data[!, :weights] = weights + # add probability weights column to `data` + data[!, :probs] = 1 ./ data[!, :weights] + end + new(data, strata, sampsize, popsize, sampfraction, fpc, ignorefpc) + end + function StratifiedSample(data::AbstractDataFrame,strata::Symbol) + ignorefpc = true + return StratifiedSample(data,strata; popsize=nothing,sampsize= transform(groupby(data,strata), nrow => :counts ).counts, weights=nothing, probs=nothing, ignorefpc=ignorefpc) end end @@ -160,31 +227,114 @@ function Base.show(io::IO, design::StratifiedSample) printstyled("Stratified Sample:\n"; bold=true) printstyled("data: "; bold=true) print(size(design.data, 1), "x", size(design.data, 2), " DataFrame") - printstyled("\nweights: "; bold=true) + printstyled("\nstrata: "; bold=true) + print(design.strata) + printstyled("\ndata.weights: "; bold=true) print_short(design.data.weights) - printstyled("\nprobs: "; bold=true) + printstyled("\ndata.probs: "; bold=true) print_short(design.data.probs) - printstyled("\nstrata: "; bold=true) - print_short(design.data.strata) printstyled("\nfpc: "; bold=true) print_short(design.fpc) - printstyled("\n popsize: "; bold=true) - print(design.popsize) - printstyled("\n sampsize: "; bold=true) - print(design.sampsize) + printstyled("\npopsize: "; bold=true) + print_short(design.popsize) + printstyled("\nsampsize: "; bold=true) + print_short(design.sampsize) end """ - ClusterSample <: AbstractSurveyDesign + GeneralSample <: AbstractSurveyDesign Survey design sampled by clustering. """ -struct ClusterSample <: AbstractSurveyDesign - data::DataFrame +struct GeneralSample <: AbstractSurveyDesign + data::AbstractDataFrame + strata::Symbol + sampsize::Vector{Union{Nothing,Float64}} + popsize::Vector{Union{Nothing,Float64}} + sampfraction::Vector{Real} + fpc::Vector{Real} + ignorefpc::Bool + function StratifiedSample(data::AbstractDataFrame, strata::Symbol; + popsize=nothing, + sampsize= transform(groupby(data,strata), nrow => :counts ).counts , + weights=nothing, # Check the defaults + probs=nothing, + ignorefpc=false + ) + # Functionality: weights arg can be passed as Symbol instead of vector + if isa(weights, Symbol) + weights = data[!, weights] + end + # Set population size if it is not given; `weights` and `sampsize` must be given + if ignorefpc # && (isnothing(popsize) || isnothing(weights) || isnothing(probs)) + @warn "Assuming equal weights" + weights = ones(nrow(data)) + end + if isnothing(popsize) + # if typeof(weights) <: Vector{<:Real} + # if !all(y -> y == first(weights), weights) # SRS by definition is equi-weighted + # error("Simple Random Sample must be equi-weighted. Different sampling weights detected in vectors") + # end + if isa(weights, Symbol) + weights = data[!, weights] + # if !all(y -> y == first(weights), weights) # SRS by definition is equi-weighted + # error("Simple Random Sample must be equi-weighted. Different sampling weights detected in vectors") + # end + elseif typeof(probs) <: Vector{<:Real} + # if !all(y -> y == first(probs), probs) # SRS by definition is equi-weighted + # error("Simple Random Sample must be equi-weighted. Different sampling probabilities detected in vector") + # end + weights = 1 ./ probs + elseif isa(probs, Symbol) + probs = data[!, probs] + # if !all(y -> y == first(probs), probs) # SRS by definition is equi-weighted + # error("Simple Random Sample must be equi-weighted. Different sampling probabilities detected in vector") + # end + weights = 1 ./ probs + end + # If all weights are equal then estimate + # equal_weight = first(weights) + popsize = sampsize .* weights # |> Vector{Float64} + + if sampsize > popsize + error("You have either given wrong or not enough keyword args. sampsize cannot be greate than popsize. Check given inputs. eg if weights given then popsize must be given (for now)") + end + elseif typeof(popsize) <: Vector{<:Real} + # if !all(y -> y == first(popsize), popsize) # SRS by definition is equi-weighted + # error("Simple Random Sample must be equi-weighted. Different sampling weights detected in vectors") + # end + # @show(popsize,sampsize) + weights = popsize ./ sampsize # This line is expansion estimator, we may need to change it when doing compley surveys + # popsize = first(popsize) |> UInt + else + error("If popsize not given then either sampling weights or sampling probabilities must be given") + end + # set sampling fraction + sampfraction = sampsize ./ popsize + # set fpc + fpc = ignorefpc ? 1 : 1 .- (sampsize ./ popsize) + # Add columns for weights and probs in data -- Check if really needed to add them as columns + if !isnothing(probs) + # add probability weights column to `data` + data[!, :probs] = probs + # add frequency weights column to `data` + data[!, :weights] = 1 ./ data[!, :probs] + else # else weights were specified + # add frequency weights column to `data` + data[!, :weights] = weights + # add probability weights column to `data` + data[!, :probs] = 1 ./ data[!, :weights] + end + new(data, strata, sampsize, popsize, sampfraction, fpc, ignorefpc) + end + function StratifiedSample(data::AbstractDataFrame,strata::Symbol) + ignorefpc = true + return StratifiedSample(data,strata; popsize=nothing,sampsize= transform(groupby(data,strata), nrow => :counts ).counts, weights=nothing, probs=nothing, ignorefpc=ignorefpc) + end end # `show` method for printing information about a `ClusterSample` after construction -function Base.show(io::IO, design::ClusterSample) +function Base.show(io::IO, design::GeneralSample) printstyled("Cluster Sample:\n"; bold=true) printstyled("data: "; bold=true) print(size(design.data, 1), "x", size(design.data, 2), " DataFrame") diff --git a/src/ht.jl b/src/ht.jl new file mode 100644 index 00000000..e4f6adc5 --- /dev/null +++ b/src/ht.jl @@ -0,0 +1,80 @@ +function HorwitzThompson_HartleyRaoApproximation(y::AbstractVector, design, HT_total) + pi = design.data.probs + hartley_rao_variance_formula = sum( (1 .- ((design.sampsize .- 1) ./ design.sampsize ) .* pi ) .* ( (y .* design.data.weights) .- (HT_total./design.sampsize) ).^2 ) + return sqrt(hartley_rao_variance_formula) +end + +function ht_calc(x::Symbol,design) + total = wsum(Float32.( design.data[!,x] ), design.data.weights ) + # se = hte_se( design.data[!,x], design ) # works + se = HorwitzThompson_HartleyRaoApproximation( design.data[!,x], design , total) # Also works + return DataFrame(total = total, se = se ) +end + +# function HansenHurwitzEstimator() + +# end + +# function HorwitzThompson_HartleyRaoApproximation(y::Vector{Int}, design, HT_total) +# pi = design.data.probs +# hartley_rao_variance_formula = sum( (1 .- ((design.sampsize .- 1) ./ design.sampsize ) .* pi ) .* ( (y./pi) .- (total./design.sampsize) ).^2 ) +# return sqrt(hartley_rao_variance_formula) +# end + +# # prob = 1 .- ((1 .- pi ) .^ (1/n)) +# # For SRS only, in other design, we need to make a function +# # pi_i = design.sampsize / design.popsize +# # pi_j = design.sampsize / design.popsize + +# # Use Hartley Rao approximation to calc joint probability of selecting two units, for any sample +# pi_ij = hartley_rao_approx() # design.sampsize .* (design.sampsize .- 1 ) ./ ( design.popsize .* (design.popsize .- 1) ) + +# first_sum = sum((1 .- pi) ./ (pi .^ 2) .* (y .^ 2)) +# @show pi, pi_ij, first_sum +# second_sum = 0 # sum( (pi_ij - ( pi_i * pi_j) ) / (pi_i * pi_j * pi_ij) .* y[i] .* y[j] ) + +# # pimat = zeros(length(pi),length(pi)) +# # coefmat=zeros(length(pi),length(pi)) + +# for i in 1:length(pi) +# for j in 1:length(pi) +# if i != j +# # pimat[i,j] = pi[i] + pi[j] - (1 - ((1 - prob[i] - prob[j] ) ^ n)) +# coefmat = ((pi_ij - ( pi_i * pi_j) ) / (pi_i * pi_j * pi_ij)) +# second_sum += coefmat * y[i] * y[j] +# end +# end +# end + +# return sqrt(first_sum + second_sum) +# # catch +# # return -1 +# # end +# end + +# function hte_se(y::AbstractVector, design::SimpleRandomSample) +# pi = design.data.probs + +# # For SRS only, in other design, we need to make a function +# pi_i = design.sampsize / design.popsize +# pi_j = design.sampsize / design.popsize +# pi_ij = design.sampsize .* (design.sampsize .- 1 ) ./ ( design.popsize .* (design.popsize .- 1) ) + +# first_sum = sum((1 .- pi_i) ./ (pi_i .^ 2) .* (y .^ 2)) +# @show pi_i , pi_j, pi_ij, first_sum + +# second_sum = 0 +# coefmat = ((pi_ij - ( pi_i * pi_j) ) / (pi_i * pi_j * pi_ij)) +# for i in 1:length(pi) +# for j in 1:length(pi) +# if i != j +# second_sum += coefmat * y[i] * y[j] +# end +# end +# end +# @show second_sum +# return sqrt(first_sum + second_sum) +# end + + + diff --git a/src/svymean.jl b/src/svymean.jl index 11e36967..341a0728 100644 --- a/src/svymean.jl +++ b/src/svymean.jl @@ -36,7 +36,7 @@ function sem(x::AbstractVector, design::SimpleRandomSample) return sqrt(var_of_mean(x, design)) end -function svymean(x, design::SimpleRandomSample) +function svymean(x::Symbol, design::SimpleRandomSample) # Support behaviour like R for CategoricalArray type data if isa(x,Symbol) && isa(design.data[!,x], CategoricalArray) gdf = groupby(design.data, x) @@ -49,6 +49,18 @@ function svymean(x, design::SimpleRandomSample) return DataFrame(mean = mean(design.data[!, x]), sem = sem(x, design::SimpleRandomSample)) end +function svymean(x::Vector{Symbol}, design::SimpleRandomSample) + means_list = [] + for i in x + push!(means_list, svymean(i,design)) + end + # df = DataFrame(names = String.(x)) + df = vcat( means_list...) + # df.names = String.(x) + insertcols!(df,1, :names => String.(x)) # df[!,[3,1,2]] + return df +end + """ Inner method for `svyby`. """ @@ -92,6 +104,52 @@ end """ mean for Categorical variables """ +""" +StratifiedSample functions +""" + # function svymean(x::, design::SimpleRandomSample) # return DataFrame(mean = mean(design.data[!, x]), sem = sem(x, design::SimpleRandomSample)) -# end \ No newline at end of file +# end +# function strata_sample_variance(y) +# 1 / () +# end +function svymean(x::Symbol, design::StratifiedSample) + # Support behaviour like R for CategoricalArray type data + print("Yolo") + if x == design.strata + gdf = groupby(design.data, x) + # nₕ = combine(gdf, nrow => :counts ) + p = combine(gdf, :weights => sum => :Nₕ ) + p.Wₕ = p.Nₕ ./ sum(p.Nₕ) + p = select!(p, Not(:Nₕ)) + # p.proportion = p.counts ./ sum(p.counts) + # @show nₕ , Nₕ , Wₕ + return p + elseif isa(x,Symbol) && isa(design.data[!,x], CategoricalArray) + print("Yolo") + gdf = groupby(design.data, x) + p = combine(gdf, nrow => :counts ) + # @show p + p.proportion = p.counts ./ sum(p.counts) + p.var = design.fpc .* p.proportion .* (1 .- p.proportion) ./ (design.sampsize - 1) # Formula for variance of proportion + p.se = sqrt.(p.var) + return p + end + gdf = groupby(design.data,design.strata) + + ȳₕ = combine(gdf, x => mean => :mean).mean + Nₕ = combine(gdf, :weights => sum => :Nₕ ).Nₕ + nₕ = combine(gdf, nrow => :nₕ).nₕ + fₕ = nₕ ./ Nₕ + Wₕ = Nₕ ./ sum(Nₕ) + Ȳ̂ = sum(Wₕ .* ȳₕ) + # Ȳ̂ = mean(ȳₕ, Wₕ ) # Is also correct + + s²ₕ = combine(gdf, x => var => :s²h ).s²h + V̂Ȳ̂ = sum( (Wₕ .^2) .* (1 .- fₕ) .* s²ₕ ./ nₕ ) + SE = sqrt(V̂Ȳ̂) + + # @show nₕ , Nₕ , fₕ , ȳₕ , Wₕ , Ȳ̂ , s²ₕ + return DataFrame(Ȳ̂ = Ȳ̂ , SE = SE) # , sem = sem(x, design::SimpleRandomSample)) +end \ No newline at end of file diff --git a/src/svyquantile.jl b/src/svyquantile.jl index da027b34..f3bdd89d 100644 --- a/src/svyquantile.jl +++ b/src/svyquantile.jl @@ -15,6 +15,10 @@ julia> svyquantile(:enroll, srs, 0.5) ``` """ # TODO: modify for SimpleRandomSample +function svyquantile(var, design::AbstractSurveyDesign) + return error("Please specify q quantile as a vector or float") +end + function svyquantile(var, design::SimpleRandomSample, q; kwargs...) x = design.data[!, var] # w = design.data.probs diff --git a/src/svytotal.jl b/src/svytotal.jl index 6a09d345..82689cf7 100644 --- a/src/svytotal.jl +++ b/src/svytotal.jl @@ -31,9 +31,9 @@ function se_tot(x::Symbol, design::SimpleRandomSample) return sqrt(var_of_total(x, design)) end -function total(x::Symbol, design::SimpleRandomSample) - return wsum(design.data[!, x] , weights(design.data.weights) ) -end +# function total(x::Symbol, design::SimpleRandomSample) +# return wsum(design.data[!, x] , weights(design.data.weights) ) +# end function svytotal(x::Symbol, design::SimpleRandomSample) # Support behaviour like R for CategoricalArray type data @@ -55,3 +55,43 @@ end # function svytotal(x::Symbol, design::svydesign) # # TODO # end + +""" +StratifiedSample functions +""" + +function svytotal(x::Symbol, design::StratifiedSample) + # # Support behaviour like R for CategoricalArray type data + if x == design.strata + gdf = groupby(design.data, x) + return combine(gdf, :weights => sum => :Nₕ ) + end + # if isa(x,Symbol) && isa(design.data[!,x], CategoricalArray) + # gdf = groupby(design.data, x) + # p = combine(gdf, nrow => :count ) + # p.total = design.popsize .* p.count ./ sum(p.count) + # p.proportion = p.total ./ design.popsize + # p = select!(p, Not(:count)) # Drop the count column as not really desired for svytotal + # p.var = design.popsize^2 .* design.fpc .* p.proportion .* (1 .- p.proportion) ./ (design.sampsize - 1) # N^2 .* Formula for variance of proportion + # p.se = sqrt.(p.var) + # return p + # end + gdf = groupby(design.data,design.strata) + # wsum(design.data[!, x] , weights(design.data.weights) ) + grand_total = sum(combine(gdf, [x,:weights] => ( (a,b) -> wsum(a,b) ) => :total).total) # works + # grand_total = wsum( combine(gdf, x => mean => :mean).mean, weights(combine(gdf, :weights => sum => :Nₕ ).Nₕ ) ) # Also works but above is simpler + + ### Variance estimation using closed-form formula + ȳₕ = combine(gdf, x => mean => :mean).mean + Nₕ = combine(gdf, :weights => sum => :Nₕ ).Nₕ + nₕ = combine(gdf, nrow => :nₕ).nₕ + fₕ = nₕ ./ Nₕ + Wₕ = Nₕ ./ sum(Nₕ) + Ȳ̂ = sum(Wₕ .* ȳₕ) + # Ȳ̂ = mean(ȳₕ, Wₕ ) # Is also correct + + s²ₕ = combine(gdf, x => var => :s²h ).s²h + V̂Ȳ̂ = sum( (Nₕ .^2) .* (1 .- fₕ) .* s²ₕ ./ nₕ ) # Only difference between total and mean variance is the Nₕ instead of Wₕ + SE = sqrt(V̂Ȳ̂) + return DataFrame(grand_total = grand_total, SE = SE) # , sem = sem(x, design::SimpleRandomSample)) +end