Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove extra docstrings and reorder methods for mean and total #117

Merged
merged 4 commits into from
Dec 5, 2022
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
157 changes: 70 additions & 87 deletions src/mean.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
"""
mean(x, design)

Estimate the population mean of a variable of a simple random sample, and the corresponding standard error.

The calculations were done according to the book [Sampling Techniques](https://www.academia.edu/29684662/Cochran_1977_Sampling_Techniques_Third_Edition)
by William Cochran.

```jldoctest
julia> apisrs = load_data("apisrs");

Expand All @@ -13,6 +17,25 @@ julia> mean(:enroll, srs)
│ Float64 Float64
─────┼──────────────────
1 │ 584.61 27.3684

julia> mean([:api00, :api99], srs)
2×3 DataFrame
Row │ names mean SE
│ String Float64 Float64
─────┼──────────────────────────
1 │ api00 656.585 9.24972
2 │ api99 624.685 9.5003

julia> strat = load_data("apistrat");

julia> dstrat = StratifiedSample(strat, :stype; popsize = :fpc);

julia> mean(:api00, dstrat)
1×2 DataFrame
Row │ mean SE
│ Float64 Float64
─────┼──────────────────
1 │ 662.287 9.40894
```
"""
function mean(x::Symbol, design::SimpleRandomSample)
Expand All @@ -32,34 +55,49 @@ function mean(x::Symbol, design::SimpleRandomSample)
return DataFrame(mean=mean(design.data[!, x]), SE=se(x, design))
end

"""
mean(x, design)
Estimate the population mean of a variable of a simple random sample, and the corresponding standard error.

```jldoctest
julia> apisrs = load_data("apisrs");

julia> srs = SimpleRandomSample(apisrs;popsize=:fpc);

julia> mean([:api00, :api99], srs)
2×3 DataFrame
Row │ names mean SE
│ String Float64 Float64
─────┼──────────────────────────
1 │ api00 656.585 9.24972
2 │ api99 624.685 9.5003
```
"""
function mean(x::Vector{Symbol}, design::SimpleRandomSample)
    # Estimate each variable on its own, then stack the per-variable tables.
    per_variable = map(v -> mean(v, design), x)
    stacked = reduce(vcat, per_variable)
    # Label every row with the variable it belongs to, as the first column.
    insertcols!(stacked, 1, :names => String.(x))
    return stacked
end

function mean(x::Symbol, design::StratifiedSample)
    # `x` is the stratification variable itself: report each stratum's share
    # of the summed weights (column `Wₕ`).
    if x == design.strata
        by_level = groupby(design.data, x)
        shares = combine(by_level, :weights => sum => :Nₕ)
        shares.Wₕ = shares.Nₕ ./ sum(shares.Nₕ)
        return select!(shares, Not(:Nₕ))
    end

    # Categorical column: report per-level counts, proportions, their variance
    # and standard error.
    if design.data[!, x] isa CategoricalArray
        by_level = groupby(design.data, x)
        tab = combine(by_level, nrow => :counts)
        tab.proportion = tab.counts ./ sum(tab.counts)
        # variance of proportion
        tab.var = design.fpc .* tab.proportion .* (1 .- tab.proportion) ./
                  (design.sampsize - 1)
        tab.se = sqrt.(tab.var)
        return tab
    end

    # Otherwise: combine per-stratum statistics into one weighted estimate.
    by_stratum = groupby(design.data, design.strata)
    # Run one aggregation over the strata and pull out the resulting column.
    per_stratum(spec, col) = combine(by_stratum, spec)[!, col]

    stratum_mean = per_stratum(x => mean => :mean, :mean)
    stratum_popsize = per_stratum(:weights => sum => :Nₕ, :Nₕ)
    stratum_n = per_stratum(nrow => :nₕ, :nₕ)
    sampling_fraction = stratum_n ./ stratum_popsize
    stratum_weight = stratum_popsize ./ sum(stratum_popsize)
    estimate = sum(stratum_weight .* stratum_mean)
    stratum_var = per_stratum(x => var => :s²h, :s²h)
    variance = sum(
        (stratum_weight .^ 2) .* (1 .- sampling_fraction) .* stratum_var ./ stratum_n
    )
    return DataFrame(mean=estimate, SE=sqrt(variance))
end

"""
Estimate domain-wise mean.
mean(x, by, design)

Estimate the subpopulation mean of a variable `x`.

The calculations were done according to the paper [Calibration Estimators in Survey Sampling](https://www.tandfonline.com/doi/abs/10.1080/01621459.1992.10475217)
by Jean-Claude Deville and Carl-Erik Sarndal.

The calculations were done according to the paper [Calibration Estimators in Survey Sampling](https://www.tandfonline.com/doi/abs/10.1080/01621459.1992.10475217) by Jean-Claude Deville and Carl-Erik Sarndal.
```jldoctest
julia> using Survey;

Expand All @@ -73,6 +111,17 @@ DataFrameRow
│ String15 Float64 Float64
─────┼────────────────────────────
1 │ Kern 573.6 42.8026

julia> strat = load_data("apistrat");

julia> dstrat = StratifiedSample(strat, :stype; popsize = :fpc);

julia> mean(:api00, :cname, dstrat) |> first
DataFrameRow
Row │ cname mean SE
│ String15 Float64 Float64
─────┼───────────────────────────────
1 │ Los Angeles 633.511 21.3912
```
"""
function mean(x::Symbol, by::Symbol, design::SimpleRandomSample)
Expand All @@ -90,24 +139,6 @@ function mean(x::Symbol, by::Symbol, design::SimpleRandomSample)
combine(gdf, [x, :weights] => ((a, b) -> domain_mean(a, design, b)) => AsTable)
end

"""
Calculate the domain mean and its standard error, based on Example 10.3.3 on p. 394 of Sarndal (1992).

```jldoctest
julia> using Survey;

julia> strat = load_data("apistrat");

julia> dstrat = StratifiedSample(strat, :stype; popsize = :fpc);

julia> mean(:api00, :cname, dstrat) |> first
DataFrameRow
Row │ cname mean SE
│ String15 Float64 Float64
─────┼───────────────────────────────
1 │ Los Angeles 633.511 21.3912
```
"""
function mean(x::Symbol, by::Symbol, design::StratifiedSample)
function domain_mean(x::AbstractVector, popsize::AbstractVector, sampsize::AbstractVector, sampfraction::AbstractVector, strata::AbstractVector)
df = DataFrame(x=x, popsize=popsize, sampsize=sampsize, sampfraction=sampfraction, strata=strata)
Expand All @@ -134,58 +165,10 @@ function mean(x::Symbol, by::Symbol, design::StratifiedSample)
gdf_domain = groupby(design.data, by)
combine(gdf_domain, [x, :popsize,:sampsize,:sampfraction, design.strata] => domain_mean => AsTable)
end
"""
Estimate the population mean of a variable of a stratified sample, and the corresponding standard error.
Ref: Cochran (1977)

```jldoctest
julia> using Survey;

julia> strat = load_data("apistrat");

julia> dstrat = StratifiedSample(strat, :stype; popsize = :fpc);

julia> mean(:api00, dstrat)
1×2 DataFrame
Row │ mean SE
│ Float64 Float64
─────┼──────────────────
1 │ 662.287 9.40894

```
"""
function mean(x::Symbol, design::StratifiedSample)
    # Case 1: `x` is the stratification variable. Return each stratum's share
    # of the summed weights in column `Wₕ`.
    if x == design.strata
        levels = groupby(design.data, x)
        weight_share = combine(levels, :weights => sum => :Nₕ)
        weight_share.Wₕ = weight_share.Nₕ ./ sum(weight_share.Nₕ)
        return select!(weight_share, Not(:Nₕ))
    end

    # Case 2: `x` is categorical. Return counts, proportions, variance and SE
    # for every level.
    if design.data[!, x] isa CategoricalArray
        levels = groupby(design.data, x)
        freq = combine(levels, nrow => :counts)
        freq.proportion = freq.counts ./ sum(freq.counts)
        # variance of proportion
        freq.var = design.fpc .* freq.proportion .* (1 .- freq.proportion) ./
                   (design.sampsize - 1)
        freq.se = sqrt.(freq.var)
        return freq
    end

    # Case 3: weighted combination of the per-stratum statistics.
    strata = groupby(design.data, design.strata)
    # Evaluate a single aggregation per stratum and extract its output column.
    stratum_column(spec, col) = combine(strata, spec)[!, col]

    means = stratum_column(x => mean => :mean, :mean)
    popsizes = stratum_column(:weights => sum => :Nₕ, :Nₕ)
    sampsizes = stratum_column(nrow => :nₕ, :nₕ)
    fractions = sampsizes ./ popsizes
    weights = popsizes ./ sum(popsizes)
    point_estimate = sum(weights .* means)
    variances = stratum_column(x => var => :s²h, :s²h)
    estimator_variance = sum((weights .^ 2) .* (1 .- fractions) .* variances ./ sampsizes)
    return DataFrame(mean=point_estimate, SE=sqrt(estimator_variance))
end

"""
    mean(::Bool; x, design)

Return the per-stratum sample mean and sample variance of the variable `x`.

The value of the positional `Bool` argument is ignored; `x` and `design` are
required keyword arguments. The result is a `DataFrame` with one row per stratum
of `design` and the columns `ȳₕ` (stratum means) and `s²ₕ` (stratum variances).
"""
function mean(::Bool; x::Symbol, design::StratifiedSample)
    gdf = groupby(design.data, design.strata)
    ȳₕ = combine(gdf, x => mean => :mean).mean
    s²ₕ = combine(gdf, x => var => :s²h).s²h
    # The two-positional-argument `DataFrame` constructor expects
    # (columns, column names), so `DataFrame(ȳₕ, s²ₕ)` would throw.
    # Name the columns explicitly instead.
    return DataFrame(ȳₕ=ȳₕ, s²ₕ=s²ₕ)
end
end
121 changes: 52 additions & 69 deletions src/total.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
# SimpleRandomSample

"""
total(x, design)

Expand All @@ -10,14 +8,33 @@ julia> using Survey;

julia> apisrs = load_data("apisrs");

julia> srs = SimpleRandomSample(apisrs;popsize=:fpc);
julia> srs = SimpleRandomSample(apisrs; popsize=:fpc);

julia> total(:enroll, srs)
1×2 DataFrame
Row │ total SE
│ Float64 Float64
─────┼─────────────────────
1 │ 3.62107e6 1.6952e5

julia> strat = load_data("apistrat");

julia> dstrat = StratifiedSample(strat, :stype; popsize=:fpc);

julia> total(:api00, dstrat)
1×2 DataFrame
Row │ total SE
│ Float64 Float64
─────┼────────────────────
1 │ 4.10221e6 58279.0

julia> total([:api00, :enroll], dstrat)
2×3 DataFrame
Row │ names total SE
│ String Float64 Float64
─────┼──────────────────────────────────
1 │ api00 4.10221e6 58279.0
2 │ enroll 3.68718e6 1.14642e5
```
"""
function total(x::Symbol, design::SimpleRandomSample)
Expand All @@ -43,57 +60,6 @@ function total(x::Symbol, design::SimpleRandomSample)
return DataFrame(total=total, SE=se(x, design))
end

"""
Estimate subpopulation total for a simple random sample.
```jldoctest
julia> using Survey;

julia> apisrs = load_data("apisrs");

julia> srs = SimpleRandomSample(apisrs;popsize=:fpc);

julia> total(:api00, :cname, srs) |> first
DataFrameRow
Row │ cname total SE
│ String15 Float64 Float64
─────┼──────────────────────────────
1 │ Kern 1.77644e5 55600.8

```
"""
function total(x::Symbol, by::Symbol, design::SimpleRandomSample)
    # Weighted total and its standard error for one domain, given that
    # domain's values and weights.
    function domain_estimate(values::AbstractVector, weights)
        domain_sum = wsum(values, weights)
        # Pad the domain's values with zeros up to `sampsize` entries
        # before taking the variance.
        padded = vcat(zeros(design.sampsize - length(values)), values)
        variance = design.popsize^2 / design.sampsize * design.fpc * var(padded)
        return DataFrame(total=domain_sum, SE=sqrt(variance))
    end

    domains = groupby(design.data, by)
    return combine(domains, [x, :weights] => domain_estimate => AsTable)
end

"""
Estimate the population total of a variable of a stratified sample, and the corresponding standard error.

```jldoctest
julia> using Survey;

julia> strat = load_data("apistrat");

julia> dstrat = StratifiedSample(strat, :stype; popsize = :fpc);

julia> total(:api00, dstrat)
1×2 DataFrame
Row │ total SE
│ Float64 Float64
─────┼────────────────────
1 │ 4.10221e6 58279.0
```
"""
function total(x::Symbol, design::StratifiedSample)
# TODO: check if statement
if x == design.strata
Expand All @@ -117,27 +83,44 @@ function total(x::Symbol, design::StratifiedSample)
return DataFrame(total=grand_total, SE=SE)
end

function total(x::Vector{Symbol}, design::AbstractSurveyDesign)
    # Estimate the total of each variable separately, then stack the tables.
    per_variable = map(v -> total(v, design), x)
    stacked = reduce(vcat, per_variable)
    # Label every row with the variable it belongs to, as the first column.
    insertcols!(stacked, 1, :names => String.(x))
    return stacked
end

"""
Vectorise total operation over Vector{Symbol}
total(x, by, design)

Estimate the subpopulation total of a variable `x`.

```jldoctest
julia> using Survey;
julia> using Survey;

julia> strat = load_data("apistrat");
julia> apisrs = load_data("apisrs");

julia> dstrat = StratifiedSample(strat, :stype; popsize = :fpc);
julia> srs = SimpleRandomSample(apisrs;popsize=:fpc);

julia> total(:api00, :cname, srs) |> first
DataFrameRow
Row │ cname total SE
│ String15 Float64 Float64
─────┼──────────────────────────────
1 │ Kern 1.77644e5 55600.8

julia> total([:api00, :enroll], dstrat)
2×3 DataFrame
Row │ names total SE
│ String Float64 Float64
─────┼──────────────────────────────────
1 │ api00 4.10221e6 58279.0
2 │ enroll 3.68718e6 1.14642e5
```
"""
function total(x::Vector{Symbol}, design::AbstractSurveyDesign)
df = reduce(vcat, [total(i, design) for i in x])
insertcols!(df, 1, :names => String.(x))
return df
function total(x::Symbol, by::Symbol, design::SimpleRandomSample)
    # Compute the weighted total and its standard error for a single domain.
    function estimate_domain(domain_values::AbstractVector, domain_weights)
        weighted_total = wsum(domain_values, domain_weights)
        # vector of length equal to `sampsize` containing the domain values and zeros
        n_missing = design.sampsize - length(domain_values)
        z = vcat(zeros(n_missing), domain_values)
        total_variance = design.popsize^2 / design.sampsize * design.fpc * var(z)
        return DataFrame(total=weighted_total, SE=sqrt(total_variance))
    end

    grouped = groupby(design.data, by)
    return combine(grouped, [x, :weights] => estimate_domain => AsTable)
end