Skip to content

Commit

Permalink
Merge branch 'AnalyticSolution' into two_stage
Browse files Browse the repository at this point in the history
  • Loading branch information
smishr authored Jan 22, 2023
2 parents 33a48b0 + 264dcf2 commit 515eb24
Show file tree
Hide file tree
Showing 13 changed files with 147 additions and 13 deletions.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Analytic Solution branch
This branch has survey designs that calculate statistics and variances using analytical solutions where possible.

# Survey

[![Dev](https://img.shields.io/badge/docs-dev-blue.svg)](https://xKDR.github.io/Survey.jl/dev)
Expand Down
1 change: 1 addition & 0 deletions docs/src/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ mean(x::Symbol, by::Symbol, design::SimpleRandomSample)
total(x::Symbol, by::Symbol, design::SimpleRandomSample)
```
```@docs
ratio(variable_num:: Symbol, variable_den:: Symbol, design::OneStageClusterSample)
plot(design::AbstractSurveyDesign, x::Symbol, y::Symbol; kwargs...)
boxplot(design::AbstractSurveyDesign, x::Symbol, y::Symbol; kwargs...)
hist(design::AbstractSurveyDesign, var::Symbol,
Expand Down
6 changes: 4 additions & 2 deletions src/Survey.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ using CategoricalArrays
using Random

include("SurveyDesign.jl")
include("bootstrap.jl")
include("mean.jl")
include("quantile.jl")
include("jackknife.jl")
Expand All @@ -23,7 +24,7 @@ include("plot.jl")
include("dimnames.jl")
include("boxplot.jl")
include("show.jl")
include("bootstrap.jl")
include("ratio.jl")

export load_data
export AbstractSurveyDesign, SimpleRandomSample, StratifiedSample
Expand All @@ -34,7 +35,8 @@ export mean, total, quantile
export plot
export hist, sturges, freedman_diaconis
export boxplot
export bootstrap
export Bootstrap
export jkknife
export ratio

end
9 changes: 3 additions & 6 deletions src/SurveyDesign.jl
Original file line number Diff line number Diff line change
Expand Up @@ -364,7 +364,7 @@ julia> apiclus1[!, :pw] = fill(757/15,(size(apiclus1,1),)); # Correct api mistak
julia> dclus1 = OneStageClusterSample(apiclus1, :dnum; weights=:pw)
OneStageClusterSample:
data: 183x45 DataFrame
data: 183x46 DataFrame
cluster: dnum
design.data[!,design.cluster]: 637, 637, 637, ..., 448
popsize: popsize
Expand Down Expand Up @@ -400,7 +400,7 @@ struct OneStageClusterSample <: AbstractSurveyDesign
data_groupedby_cluster = groupby(data, cluster)
data[!, sampsize_labels] = fill(size(data_groupedby_cluster, 1),(nrow(data),))
weights = :weights
data[!, weights] = data[!, popsize] ./ data[!, sampsize_labels]
data[!, :weights] = data[!, popsize] ./ data[!, sampsize_labels]
data[!, :probs] = 1 ./ data[!, weights] # Many formulae are easily defined in terms of sampling probabilties
data[!, :allprobs] = data[!, :probs] # In one-stage cluster sample, allprobs is just probs, no multiplication needed
data[!, :strata] = ones(nrow(data))
Expand All @@ -414,16 +414,13 @@ struct OneStageClusterSample <: AbstractSurveyDesign
if !(typeof(data[!, weights]) <: Vector{<:Real})
error(string("given weights column ", weights , " is not of numeric type"))
end
if !all(w -> w == first(data[!, weights]), data[!, weights])
error("weights must be same for all observations for OneStageClusterSample")
end
# For one-stage sample only one sampsize vector
sampsize_labels = :sampsize
data_groupedby_cluster = groupby(data, cluster)
data[!, sampsize_labels] = fill(size(data_groupedby_cluster, 1),(nrow(data),))
popsize = :popsize
data[!, popsize] = data[!, weights] .* data[!, sampsize_labels]
data[!, :probs] = 1 ./ data[!, weights] # Many formulae are easily defined in terms of sampling probabilties
data[!, :weights] = data[!, weights]
data[!, :allprobs] = data[!, :probs] # In one-stage cluster sample, allprobs is just probs, no multiplication needed
data[!, :strata] = ones(nrow(data))
pps = false
Expand Down
10 changes: 9 additions & 1 deletion src/bootstrap.jl
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
struct Bootstrap
replicates
rng
function Bootstrap(; replicates = 100, rng = MersenneTwister(111))
new(replicates, rng)
end
end

"""
```jldoctest
julia> using Survey, Random, StatsBase;
Expand All @@ -10,7 +18,7 @@ julia> rng = MersenneTwister(111);
julia> func = wsum;
julia> bootstrap(:api00, dclus1, func; replicates=1000, rng)
julia> Survey.bootstrap(:api00, dclus1, func; replicates=1000, rng)
1×2 DataFrame
Row │ statistic SE
│ Float64 Float64
Expand Down
22 changes: 22 additions & 0 deletions src/mean.jl
Original file line number Diff line number Diff line change
Expand Up @@ -183,3 +183,25 @@ function mean(x::Symbol, by::Symbol, design::StratifiedSample)
gdf_domain = groupby(design.data, by)
combine(gdf_domain, [x, :popsize,:sampsize,:sampfraction, design.strata] => domain_mean => AsTable)
end

"""
```jldoctest
julia> using Survey, Random, StatsBase;
julia> apiclus1 = load_data("apiclus1");
julia> dclus1 = OneStageClusterSample(apiclus1, :dnum, :fpc);
julia> mean(:api00, dclus1, Bootstrap(replicates = 1000, rng = MersenneTwister(111)))
1×2 DataFrame
Row │ mean SE
│ Float64 Float64
─────┼──────────────────
1 │ 644.169 23.0897
```
"""
function mean(x::Symbol, design::OneStageClusterSample, method::Bootstrap)
weighted_mean(x, w) = mean(x, weights(w))
df = bootstrap(x, design, weighted_mean; method.replicates, method.rng)
df = rename(df, :statistic => :mean)
end
38 changes: 38 additions & 0 deletions src/ratio.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
"""
ratio(numerator, denominator, design)
Estimate the ratio of the columns specified in numerator and denominator
```jldoctest
julia> using Survey;
julia> apiclus1 = load_data("apiclus1");
julia> apiclus1[!, :pw] = fill(757/15,(size(apiclus1,1),)); # Correct api mistake for pw column
julia> dclus1 = OneStageClusterSample(apiclus1, :dnum, :fpc);
julia> ratio(:api00, :enroll, dclus1)
1×2 DataFrame
Row │ Statistic SE
│ Float64 Float64
─────┼─────────────────────
1 │ 1.17182 0.151242
```
"""
function ratio(variable_num:: Symbol, variable_den:: Symbol, design::OneStageClusterSample)
statistic = wsum(design.data[!,variable_num],design.data.weights)/wsum(design.data[!,variable_den],design.data.weights)
nh = length(unique(design.data[!,design.cluster]))
newv = []
gdf = groupby(design.data, design.cluster)
replicates = [filter(n -> n != i, 1:nh) for i in 1:nh]
for i in replicates
df = DataFrame(gdf[i])
push!(newv, wsum(df[!,variable_num],df[!,:weights])/wsum(df[!,variable_den],df[!,:weights]))
end
c = 0
for i in 1:nh
c = c+(newv[i]-statistic)^2
end
var = c*(nh-1)/nh
return DataFrame(Statistic = statistic, SE = sqrt(var))
end
39 changes: 38 additions & 1 deletion src/total.jl
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,22 @@ function total(x::Vector{Symbol}, design::AbstractSurveyDesign)
return df
end

"""
```jldoctest
julia> using Survey
julia> apiclus1 = load_data("apiclus1");
julia> dclus1 = OneStageClusterSample(apiclus1, :dnum, :fpc);
julia> total(:api00, dclus1)
1×2 DataFrame
Row │ total SE
│ Float64 Float64
─────┼──────────────────────
1 │ 5.94916e6 1.33948e6
```
"""
function total(x::Symbol, design::OneStageClusterSample)
gdf = groupby(design.data, design.cluster)
ŷₜ = combine(gdf, x => sum => :sum).sum
Expand All @@ -96,7 +112,7 @@ function total(x::Symbol, design::OneStageClusterSample)
nₜ = first(design.data[!,design.sampsize])
s²ₜ = var(ŷₜ)
VȲ = Nₜ^2 * (1 - nₜ/Nₜ) * s²ₜ / nₜ
return DataFrame(mean = Ȳ, SE = sqrt(VȲ))
return DataFrame(total = Ȳ, SE = sqrt(VȲ))
end

"""
Expand Down Expand Up @@ -134,3 +150,24 @@ function total(x::Symbol, by::Symbol, design::SimpleRandomSample)
gdf = groupby(design.data, by)
combine(gdf, [x, :weights] => ((a, b) -> domain_total(a, design, b)) => AsTable)
end

"""
```jldoctest
julia> using Survey, Random, StatsBase;
julia> apiclus1 = load_data("apiclus1");
julia> dclus1 = OneStageClusterSample(apiclus1, :dnum, :fpc);
julia> total(:api00, dclus1, Bootstrap(replicates = 1000, rng = MersenneTwister(111)))
1×2 DataFrame
Row │ total SE
│ Float64 Float64
─────┼──────────────────────
1 │ 5.94916e6 1.36593e6
```
"""
function total(x::Symbol, design::OneStageClusterSample, method::Bootstrap)
df = bootstrap(x, design, wsum; method.replicates, method.rng)
df = rename(df, :statistic => :total)
end
2 changes: 1 addition & 1 deletion test/bootstrap.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ apiclus1 = load_data("apiclus1")
dclus1 = OneStageClusterSample(apiclus1, :dnum, :fpc);
rng = MersenneTwister(111);
func = wsum;
est = bootstrap(:api00, dclus1, func; replicates=1000, rng)
est = Survey.bootstrap(:api00, dclus1, func; replicates=1000, rng)
@testset "bootstrap.jl" begin
@test est.SE[1] 1.365925776009e6
@test est.statistic[1] 5.9491620666e6
Expand Down
11 changes: 11 additions & 0 deletions test/mean.jl
Original file line number Diff line number Diff line change
Expand Up @@ -73,4 +73,15 @@ end
@test mean_strat_symb.SE[3] 14.9371 atol = 1e-2
end

@testset "mean_OneStageCluster" begin

apiclus1_original = load_data("apiclus1")
apiclus1_original[!, :pw] = fill(757/15,(size(apiclus1_original,1),)) # Correct api mistake for pw column
##############################
# one-stage cluster sample
apiclus1 = copy(apiclus1_original)
dclus1 = OneStageClusterSample(apiclus1, :dnum, :fpc)

@test mean(:api00,dclus1, Bootstrap()).mean[1] 644.17 atol = 1
@test mean(:api00,dclus1, Bootstrap(replicates = 10000)).SE[1] 23.779 atol = 0.5 # without fpc as it hasn't been figured out for bootstrap.
end
10 changes: 10 additions & 0 deletions test/ratio.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
@testset "ratio.jl" begin
apiclus1_original = load_data("apiclus1")
apiclus1_original[!, :pw] = fill(757/15,(size(apiclus1_original,1),)) # Correct api mistake for pw column
##############################
# one-stage cluster sample
apiclus1 = copy(apiclus1_original)
dclus1 = OneStageClusterSample(apiclus1, :dnum, :fpc)
@test ratio(:api00, :enroll, dclus1).SE[1] 0.151242 atol = 1e-4
@test ratio(:api00, :enroll, dclus1).Statistic[1] 1.17182 atol = 1e-4
end
3 changes: 2 additions & 1 deletion test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,5 @@ include("dimnames.jl")
include("plot.jl")
include("hist.jl")
include("boxplot.jl")
include("bootstrap.jl")
include("bootstrap.jl")
include("ratio.jl")
6 changes: 5 additions & 1 deletion test/total.jl
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,10 @@ end
# one-stage cluster sample
apiclus1 = copy(apiclus1_original)
dclus1 = OneStageClusterSample(apiclus1, :dnum, :fpc)
@test total(:api00,dclus1).mean[1] 5949162 atol = 1
@test total(:api00,dclus1).total[1] 5949162 atol = 1
@test total(:api00,dclus1).SE[1] 1339481 atol = 1

@test total(:api00,dclus1, Bootstrap()).total[1] 5949162 atol = 1
@test total(:api00,dclus1, Bootstrap(replicates = 10000)).SE[1] 1352953 atol = 50000 # without fpc as it hasn't been figured out for bootstrap.

end

0 comments on commit 515eb24

Please sign in to comment.