From 34dddcc721aff354c6b18f244322610aa57a5d8b Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Sun, 4 Dec 2022 17:33:48 +0200 Subject: [PATCH 1/4] Remove extra docstrings and reorder methods --- src/mean.jl | 152 +++++++++++++++++++++++----------------------------- 1 file changed, 66 insertions(+), 86 deletions(-) diff --git a/src/mean.jl b/src/mean.jl index 4999a3bc..51308c31 100644 --- a/src/mean.jl +++ b/src/mean.jl @@ -2,6 +2,9 @@ mean(x, design) Estimate the population mean of a variable of a simple random sample, and the corresponding standard error. +The calculations were done according to the book [Sampling Techniques](https://www.academia.edu/29684662/Cochran_1977_Sampling_Techniques_Third_Edition) +by William Cochran. + ```jldoctest julia> apisrs = load_data("apisrs"); @@ -13,6 +16,25 @@ julia> mean(:enroll, srs) │ Float64 Float64 ─────┼────────────────── 1 │ 584.61 27.3684 + +julia> mean([:api00, :api99], srs) +2×3 DataFrame + Row │ names mean SE + │ String Float64 Float64 +─────┼────────────────────────── + 1 │ api00 656.585 9.24972 + 2 │ api99 624.685 9.5003 + +julia> strat = load_data("apistrat"); + +julia> dstrat = StratifiedSample(strat, :stype; popsize = :fpc); + +julia> mean(:api00, dstrat) +1×2 DataFrame + Row │ mean SE + │ Float64 Float64 +─────┼────────────────── + 1 │ 662.287 9.40894 ``` """ function mean(x::Symbol, design::SimpleRandomSample) @@ -32,34 +54,47 @@ function mean(x::Symbol, design::SimpleRandomSample) return DataFrame(mean=mean(design.data[!, x]), SE=se(x, design)) end -""" - mean(x, design) -Estimate the population mean of a variable of a simple random sample, and the corresponding standard error. - -```jldoctest -julia> apisrs = load_data("apisrs"); - -julia> srs = SimpleRandomSample(apisrs;popsize=:fpc); - -julia> mean([:api00, :api99], srs) -2×3 DataFrame - Row │ names mean SE - │ String Float64 Float64 -─────┼────────────────────────── - 1 │ api00 656.585 9.24972 - 2 │ api99 624.685 9.5003 -``` -""" function mean(x::Vector{Symbol}, design::SimpleRandomSample) df = reduce(vcat, [mean(i, design) for i in x]) insertcols!(df, 1, :names => String.(x)) return df end +function mean(x::Symbol, design::StratifiedSample) + if x == design.strata + gdf = groupby(design.data, x) + p = combine(gdf, :weights => sum => :Nₕ) + p.Wₕ = p.Nₕ ./ sum(p.Nₕ) + p = select!(p, Not(:Nₕ)) + return p + elseif isa(design.data[!, x], CategoricalArray) + gdf = groupby(design.data, x) + p = combine(gdf, nrow => :counts) + p.proportion = p.counts ./ sum(p.counts) + # variance of proportion + p.var = design.fpc .* p.proportion .* (1 .- p.proportion) ./ (design.sampsize - 1) + p.se = sqrt.(p.var) + return p + end + gdf = groupby(design.data, design.strata) + ȳₕ = combine(gdf, x => mean => :mean).mean + Nₕ = combine(gdf, :weights => sum => :Nₕ).Nₕ + nₕ = combine(gdf, nrow => :nₕ).nₕ + fₕ = nₕ ./ Nₕ + Wₕ = Nₕ ./ sum(Nₕ) + Ȳ̂ = sum(Wₕ .* ȳₕ) + s²ₕ = combine(gdf, x => var => :s²h).s²h + V̂Ȳ̂ = sum((Wₕ .^ 2) .* (1 .- fₕ) .* s²ₕ ./ nₕ) + SE = sqrt(V̂Ȳ̂) + return DataFrame(mean=Ȳ̂, SE=SE) +end + """ Estimate domain-wise mean. -The calculations were done according to the book [Calibration Estimators in Survey Sampling] by Jean-Claude Deville and Carl-Erik Sarndal(https://www.tandfonline.com/doi/abs/10.1080/01621459.1992.10475217). +The calculations were done according to the book [Calibration Estimators in Survey Sampling](https://www.tandfonline.com/doi/abs/10.1080/01621459.1992.10475217) +by Jean-Claude Deville and Carl-Erik Sarndal. + ```jldoctest julia> using Survey; @@ -73,6 +108,17 @@ DataFrameRow │ String15 Float64 Float64 ─────┼──────────────────────────── 1 │ Kern 573.6 42.8026 + +julia> strat = load_data("apistrat"); + +julia> dstrat = StratifiedSample(strat, :stype; popsize = :fpc); + +julia> mean(:api00, :cname, dstrat) |> first +DataFrameRow + Row │ cname mean SE + │ String15 Float64 Float64 +─────┼─────────────────────────────── + 1 │ Los Angeles 633.511 21.3912 ``` """ function mean(x::Symbol, by::Symbol, design::SimpleRandomSample) @@ -90,24 +136,6 @@ function mean(x::Symbol, by::Symbol, design::SimpleRandomSample) combine(gdf, [x, :weights] => ((a, b) -> domain_mean(a, design, b)) => AsTable) end -""" -Calculates domain mean and its std error, based example 10.3.3 on pg394 Sarndal (1992) - -```jldoctest -julia> using Survey; - -julia> strat = load_data("apistrat"); - -julia> dstrat = StratifiedSample(strat, :stype; popsize = :fpc); - -julia> mean(:api00, :cname, dstrat) |> first -DataFrameRow - Row │ cname mean SE - │ String15 Float64 Float64 -─────┼─────────────────────────────── - 1 │ Los Angeles 633.511 21.3912 -``` -""" function mean(x::Symbol, by::Symbol, design::StratifiedSample) function domain_mean(x::AbstractVector, popsize::AbstractVector, sampsize::AbstractVector, sampfraction::AbstractVector, strata::AbstractVector) df = DataFrame(x=x, popsize=popsize, sampsize=sampsize, sampfraction=sampfraction, strata=strata) @@ -134,58 +162,10 @@ function mean(x::Symbol, by::Symbol, design::StratifiedSample) gdf_domain = groupby(design.data, by) combine(gdf_domain, [x, :popsize,:sampsize,:sampfraction, design.strata] => domain_mean => AsTable) end -""" -Estimate the population mean of a variable of a stratified sample, and the corresponding standard error. - Ref: Cochran (1977) - -```jldoctest -julia> using Survey; - -julia> strat = load_data("apistrat"); - -julia> dstrat = StratifiedSample(strat, :stype; popsize = :fpc); - -julia> mean(:api00, dstrat) -1×2 DataFrame - Row │ mean SE - │ Float64 Float64 -─────┼────────────────── - 1 │ 662.287 9.40894 - -``` -""" -function mean(x::Symbol, design::StratifiedSample) - if x == design.strata - gdf = groupby(design.data, x) - p = combine(gdf, :weights => sum => :Nₕ) - p.Wₕ = p.Nₕ ./ sum(p.Nₕ) - p = select!(p, Not(:Nₕ)) - return p - elseif isa(design.data[!, x], CategoricalArray) - gdf = groupby(design.data, x) - p = combine(gdf, nrow => :counts) - p.proportion = p.counts ./ sum(p.counts) - # variance of proportion - p.var = design.fpc .* p.proportion .* (1 .- p.proportion) ./ (design.sampsize - 1) - p.se = sqrt.(p.var) - return p - end - gdf = groupby(design.data, design.strata) - ȳₕ = combine(gdf, x => mean => :mean).mean - Nₕ = combine(gdf, :weights => sum => :Nₕ).Nₕ - nₕ = combine(gdf, nrow => :nₕ).nₕ - fₕ = nₕ ./ Nₕ - Wₕ = Nₕ ./ sum(Nₕ) - Ȳ̂ = sum(Wₕ .* ȳₕ) - s²ₕ = combine(gdf, x => var => :s²h).s²h - V̂Ȳ̂ = sum((Wₕ .^ 2) .* (1 .- fₕ) .* s²ₕ ./ nₕ) - SE = sqrt(V̂Ȳ̂) - return DataFrame(mean=Ȳ̂, SE=SE) -end function mean(::Bool; x::Symbol, design::StratifiedSample) gdf = groupby(design.data, design.strata) ȳₕ = combine(gdf, x => mean => :mean).mean s²ₕ = combine(gdf, x => var => :s²h).s²h return DataFrame(ȳₕ, s²ₕ) -end \ No newline at end of file +end From c5b21a3099e2216ece31f3f93172e3f773f8a524 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Sun, 4 Dec 2022 18:20:45 +0200 Subject: [PATCH 2/4] Remove extra docstrings and reorder methods for `total` --- src/total.jl | 121 ++++++++++++++++++++++----------------------------- 1 file changed, 52 insertions(+), 69 deletions(-) diff --git a/src/total.jl b/src/total.jl index 6b9d0a27..a5d4a4a8 100644 --- a/src/total.jl +++ b/src/total.jl @@ -1,5 +1,3 @@ -# SimpleRandomSample - """ total(x, design) @@ -10,7 +8,7 @@ julia> using Survey; julia> apisrs = load_data("apisrs"); -julia> srs = SimpleRandomSample(apisrs;popsize=:fpc); +julia> srs = SimpleRandomSample(apisrs; popsize=:fpc); julia> total(:enroll, srs) 1×2 DataFrame @@ -18,6 +16,25 @@ julia> total(:enroll, srs) │ Float64 Float64 ─────┼───────────────────── 1 │ 3.62107e6 1.6952e5 + +julia> strat = load_data("apistrat"); + +julia> dstrat = StratifiedSample(strat, :stype; popsize=:fpc); + +julia> total(:api00, dstrat) +1×2 DataFrame + Row │ total SE + │ Float64 Float64 +─────┼──────────────────── + 1 │ 4.10221e6 58279.0 + +julia> total([:api00, :enroll], dstrat) +2×3 DataFrame + Row │ names total SE + │ String Float64 Float64 +─────┼────────────────────────────────── + 1 │ api00 4.10221e6 58279.0 + 2 │ enroll 3.68718e6 1.14642e5 ``` """ function total(x::Symbol, design::SimpleRandomSample) @@ -43,57 +60,6 @@ function total(x::Symbol, design::SimpleRandomSample) return DataFrame(total=total, SE=se(x, design)) end -""" -Estimate subpopulation total for a stratified sample. -```jldoctest -julia> using Survey; - -julia> apisrs = load_data("apisrs"); - -julia> srs = SimpleRandomSample(apisrs;popsize=:fpc); - -julia> total(:api00, :cname, srs) |> first -DataFrameRow - Row │ cname total SE - │ String15 Float64 Float64 -─────┼────────────────────────────── - 1 │ Kern 1.77644e5 55600.8 - -``` -""" -function total(x::Symbol, by::Symbol, design::SimpleRandomSample) - function domain_total(x::AbstractVector, design::SimpleRandomSample, weights) - function se(x::AbstractVector, design::SimpleRandomSample, _) - # vector of length equal to `sampsize` containing `x` and zeros - z = cat(zeros(design.sampsize - length(x)), x; dims=1) - variance = design.popsize^2 / design.sampsize * design.fpc * var(z) - return sqrt(variance) - end - total = wsum(x, weights) - return DataFrame(total=total, SE=se(x, design::SimpleRandomSample, weights)) - end - gdf = groupby(design.data, by) - combine(gdf, [x, :weights] => ((a, b) -> domain_total(a, design, b)) => AsTable) -end - -""" -total for StratifiedSample - -```jldoctest -julia> using Survey; - -julia> strat = load_data("apistrat"); - -julia> dstrat = StratifiedSample(strat, :stype; popsize = :fpc); - -julia> total(:api00, dstrat) -1×2 DataFrame - Row │ total SE - │ Float64 Float64 -─────┼──────────────────── - 1 │ 4.10221e6 58279.0 -``` -""" function total(x::Symbol, design::StratifiedSample) # TODO: check if statement if x == design.strata @@ -117,27 +83,44 @@ function total(x::Symbol, design::StratifiedSample) return DataFrame(total=grand_total, SE=SE) end +function total(x::Vector{Symbol}, design::AbstractSurveyDesign) + df = reduce(vcat, [total(i, design) for i in x]) + insertcols!(df, 1, :names => String.(x)) + return df +end + """ -Vectorise total operation over Vector{Symbol} + total(x, by, design) + +Estimate the subpopulation total of a variable `x`. ```jldoctest -julia> using Survey; +julia> using Survey; -julia> strat = load_data("apistrat"); +julia> apisrs = load_data("apisrs"); -julia> dstrat = StratifiedSample(strat, :stype; popsize = :fpc); +julia> srs = SimpleRandomSample(apisrs;popsize=:fpc); + +julia> total(:api00, :cname, srs) |> first +DataFrameRow + Row │ cname total SE + │ String15 Float64 Float64 +─────┼────────────────────────────── + 1 │ Kern 1.77644e5 55600.8 -julia> total([:api00, :enroll], dstrat) -2×3 DataFrame - Row │ names total SE - │ String Float64 Float64 -─────┼────────────────────────────────── - 1 │ api00 4.10221e6 58279.0 - 2 │ enroll 3.68718e6 1.14642e5 ``` """ -function total(x::Vector{Symbol}, design::AbstractSurveyDesign) - df = reduce(vcat, [total(i, design) for i in x]) - insertcols!(df, 1, :names => String.(x)) - return df +function total(x::Symbol, by::Symbol, design::SimpleRandomSample) + function domain_total(x::AbstractVector, design::SimpleRandomSample, weights) + function se(x::AbstractVector, design::SimpleRandomSample, _) + # vector of length equal to `sampsize` containing `x` and zeros + z = cat(zeros(design.sampsize - length(x)), x; dims=1) + variance = design.popsize^2 / design.sampsize * design.fpc * var(z) + return sqrt(variance) + end + total = wsum(x, weights) + return DataFrame(total=total, SE=se(x, design::SimpleRandomSample, weights)) + end + gdf = groupby(design.data, by) + combine(gdf, [x, :weights] => ((a, b) -> domain_total(a, design, b)) => AsTable) end From c2dede67a838b98a6c9bb1ac52ce72ff8120407d Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Sun, 4 Dec 2022 18:22:33 +0200 Subject: [PATCH 3/4] Add new lines, function signature and rephrase docstring --- src/mean.jl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/mean.jl b/src/mean.jl index 51308c31..a29f115a 100644 --- a/src/mean.jl +++ b/src/mean.jl @@ -1,5 +1,6 @@ """ mean(x, design) + Estimate the population mean of a variable of a simple random sample, and the corresponding standard error. The calculations were done according to the book [Sampling Techniques](https://www.academia.edu/29684662/Cochran_1977_Sampling_Techniques_Third_Edition) @@ -90,7 +91,9 @@ function mean(x::Symbol, design::StratifiedSample) end """ -Estimate domain-wise mean. + mean(x, by, design) + +Estimate the subpopulation mean of a variable `x`. The calculations were done according to the book [Calibration Estimators in Survey Sampling](https://www.tandfonline.com/doi/abs/10.1080/01621459.1992.10475217) by Jean-Claude Deville and Carl-Erik Sarndal. From 690a424ce8b51bd022d0b748d383bd111679df53 Mon Sep 17 00:00:00 2001 From: smishr <43640926+smishr@users.noreply.github.com> Date: Mon, 5 Dec 2022 09:18:33 +0530 Subject: [PATCH 4/4] Remove strata means Removed strata means option for now. will reformat and add back at later stage. --- src/mean.jl | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/mean.jl b/src/mean.jl index a29f115a..2acdf0f7 100644 --- a/src/mean.jl +++ b/src/mean.jl @@ -165,10 +165,3 @@ function mean(x::Symbol, by::Symbol, design::StratifiedSample) gdf_domain = groupby(design.data, by) combine(gdf_domain, [x, :popsize,:sampsize,:sampfraction, design.strata] => domain_mean => AsTable) end - -function mean(::Bool; x::Symbol, design::StratifiedSample) - gdf = groupby(design.data, design.strata) - ȳₕ = combine(gdf, x => mean => :mean).mean - s²ₕ = combine(gdf, x => var => :s²h).s²h - return DataFrame(ȳₕ, s²ₕ) -end