Skip to content

Commit

Permalink
Merge pull request #172 from smishr/singledesign_testing
Browse files Browse the repository at this point in the history
Update survey design, add tests, remove extra quantile, add `design.weights`
  • Loading branch information
smishr authored Jan 16, 2023
2 parents 0fb80c0 + 5275be9 commit e12c9e5
Show file tree
Hide file tree
Showing 14 changed files with 235 additions and 99 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@
/dev/*
.gitignore
.DS_Store
*.json
*.json
88 changes: 51 additions & 37 deletions src/SurveyDesign.jl
Original file line number Diff line number Diff line change
Expand Up @@ -25,21 +25,22 @@ individuals in one cluster are sampled. The clusters are considered disjoint and
- `strata::Union{Nothing, Symbol}=nothing`: the stratification variable.
- `clusters::Union{Nothing, Symbol, Vector{Symbol}}=nothing`: the clustering variable.
- `weights::Union{Nothing, Symbol}=nothing`: the sampling weights.
- `popsize::Union{Nothing, Int, Symbol}=nothing`: the (expected) survey population size.
- `popsize::Union{Nothing, Symbol}=nothing`: the (expected) survey population size.
```jldoctest
julia> apistrat = load_data("apistrat");
julia> apiclus1 = load_data("apiclus1");
julia> strat = SurveyDesign(apistrat; strata=:stype, weights=:pw)
julia> dclus1 = SurveyDesign(apiclus1; clusters=:dnum, strata=:stype, weights=:pw)
SurveyDesign:
data: 200×46 DataFrame
data: 183×43 DataFrame
strata: stype
[E, E, E … H]
cluster: none
popsize: [6190.0, 6190.0, 6190.0 … 6190.0]
sampsize: [200, 200, 200 … 200]
weights: [44.2, 44.2, 44.2 … 15.1]
probs: [0.0226, 0.0226, 0.0226 … 0.0662]
[H, E, E … E]
cluster: dnum
[637, 637, 637 … 448]
popsize: [507.7049, 507.7049, 507.7049 … 507.7049]
sampsize: [15, 15, 15 … 15]
weights: [33.847, 33.847, 33.847 … 33.847]
allprobs: [0.0295, 0.0295, 0.0295 … 0.0295]
```
"""
struct SurveyDesign <: AbstractSurveyDesign
Expand All @@ -48,14 +49,15 @@ struct SurveyDesign <: AbstractSurveyDesign
popsize::Symbol
sampsize::Symbol
strata::Symbol
pps::Bool
weights::Symbol # Effective weights in case of singlestage approx supported
allprobs::Symbol # Right now only singlestage approx supported
pps::Bool # TODO functionality
# Single stage clusters sample, like apiclus1
function SurveyDesign(
data::AbstractDataFrame;
strata::Union{Nothing, Symbol}=nothing,
clusters::Union{Nothing, Symbol, Vector{Symbol}}=nothing,
weights::Union{Nothing, Symbol}=nothing,
popsize::Union{Nothing, Int, Symbol}=nothing
function SurveyDesign(data::AbstractDataFrame;
clusters::Union{Nothing,Symbol,Vector{Symbol}}=nothing,
strata::Union{Nothing,Symbol}=nothing,
popsize::Union{Nothing,Symbol}=nothing,
weights::Union{Nothing,Symbol}=nothing
)
# sampsize here is number of clusters completely sampled, popsize is total clusters in population
if typeof(strata) <: Nothing
Expand All @@ -73,24 +75,34 @@ struct SurveyDesign <: AbstractSurveyDesign
if typeof(clusters) <: Symbol
cluster = clusters
end
# For one-stage sample only one sampsize vector
sampsize_labels = :sampsize
data[!, sampsize_labels] = fill(length(unique(data[!, cluster])), (nrow(data),))
if !(typeof(popsize) <: Nothing)
data[!, :weights] = data[!, popsize] ./ data[!, sampsize_labels]
elseif !(typeof(weights) <: Nothing)
data.weights = data[!, weights]
# For single-stage approximation only one "effective" sampsize vector
sampsize_labels = :_sampsize
if isa(strata,Symbol) && isnothing(clusters) # If stratified only then sampsize is inside strata
data[!, sampsize_labels] = transform(groupby(data, strata), nrow => :counts).counts
else
data.weights = repeat([1], nrow(data))
data[!, sampsize_labels] = fill(length(unique(data[!, cluster])), (nrow(data),))
end
data[!, :probs] = 1 ./ data[!, :weights] # Many formulae are easily defined in terms of sampling probabilties
data[!, :allprobs] = data[!, :probs] # In one-stage cluster sample, allprobs is just probs, no multiplication needed
pps = false
if !(typeof(popsize) <: Symbol)
data.popsize = repeat([sum(data.weights)], nrow(data))
popsize = :popsize
if isa(popsize, Symbol)
weights_labels = :_weights
data[!, weights_labels] = data[!, popsize] ./ data[!, sampsize_labels]
elseif isa(weights, Symbol)
if !(typeof(data[!, weights]) <: Vector{<:Real})
throw(ArgumentError(string("given weights column ", weights , " is not of numeric type")))
else
# derive popsize from given `weights`
weights_labels = weights
popsize = :_popsize
data[!, popsize] = data[!, sampsize_labels] .* data[!, weights_labels]
end
else
# neither popsize nor weights given
weights_labels = :_weights
data[!, weights_labels] = repeat([1], nrow(data))
end
new(data, cluster, popsize, sampsize_labels, strata, pps)
allprobs_labels = :allprobs
data[!, allprobs_labels] = 1 ./ data[!, weights_labels] # In one-stage cluster sample, allprobs is just probs, no multiplication needed
pps = false # for now no explicit pps supported faster functions, but they can be added
new(data, cluster, popsize, sampsize_labels, strata, weights_labels, allprobs_labels, pps)
end
end

Expand All @@ -106,14 +118,14 @@ julia> strat = SurveyDesign(apistrat; strata=:stype, weights=:pw);
julia> bootstrat = bootweights(strat; replicates=1000)
ReplicateDesign:
data: 200×1046 DataFrame
data: 200×1044 DataFrame
strata: stype
[E, E, E … H]
cluster: none
popsize: [6190.0, 6190.0, 6190.06190.0]
sampsize: [200, 200, 200200]
weights: [44.2, 44.2, 44.2 … 15.1]
probs: [0.0226, 0.0226, 0.0226 … 0.0662]
popsize: [4420.9999, 4420.9999, 4420.9999755.0]
sampsize: [100, 100, 10050]
weights: [44.21, 44.21, 44.21 … 15.1]
allprobs: [0.0226, 0.0226, 0.0226 … 0.0662]
replicates: 1000
```
"""
Expand All @@ -123,6 +135,8 @@ struct ReplicateDesign <: AbstractSurveyDesign
popsize::Symbol
sampsize::Symbol
strata::Symbol
weights::Symbol # Effective weights in case of singlestage approx supported
allprobs::Symbol # Right now only singlestage approx supported
pps::Bool
replicates::UInt
end
14 changes: 7 additions & 7 deletions src/bootstrap.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,18 @@ julia> using Random
julia> apiclus1 = load_data("apiclus1");
julia> clus_one_stage = SurveyDesign(apiclus1; clusters = :dnum);
julia> clus_one_stage = SurveyDesign(apiclus1; clusters = :dnum, popsize=:fpc);
julia> bootweights(clus_one_stage; replicates=1000, rng=MersenneTwister(111)) # choose a seed for deterministic results
ReplicateDesign:
data: 183×1046 DataFrame
data: 183×1044 DataFrame
strata: none
cluster: dnum
[637, 637, 637 … 448]
popsize: [183, 183, 183183]
popsize: [757, 757, 757757]
sampsize: [15, 15, 15 … 15]
weights: [1, 1, 11]
probs: [1.0, 1.0, 1.01.0]
weights: [50.4667, 50.4667, 50.466750.4667]
allprobs: [0.0198, 0.0198, 0.01980.0198]
replicates: 1000
```
"""
Expand All @@ -34,7 +34,7 @@ function bootweights(design::SurveyDesign; replicates=4000, rng=MersenneTwister(
rh = [(count(==(i), randinds)) for i in 1:nh] # main bootstrap algo.
gdf = groupby(substrata, design.cluster)
for i in 1:nh
gdf[i].whij = repeat([rh[i]], nrow(gdf[i])) .* gdf[i].weights .* (nh / (nh - 1))
gdf[i].whij = repeat([rh[i]], nrow(gdf[i])) .* gdf[i][!,design.weights] .* (nh / (nh - 1))
end
stratified[h].whij = transform(gdf).whij

Expand All @@ -47,5 +47,5 @@ function bootweights(design::SurveyDesign; replicates=4000, rng=MersenneTwister(
for i in 2:(replicates)
df[!, "replicate_" * string(i)] = disallowmissing(replicate(stratified, H).whij)
end
return ReplicateDesign(df, design.cluster, design.popsize, design.sampsize, design.strata, design.pps, replicates)
return ReplicateDesign(df, design.cluster, design.popsize, design.sampsize, design.strata, design.weights, design.allprobs, design.pps, replicates)
end
2 changes: 1 addition & 1 deletion src/by.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
function bydomain(x::Symbol, domain::Symbol, design::ReplicateDesign, func::Function)
gdf = groupby(design.data, domain)
nd = length(unique(design.data[!, domain]))
X = combine(gdf, [x, :weights] => ((a, b) -> func(a, weights(b))) => :statistic)
X = combine(gdf, [x, design.weights] => ((a, b) -> func(a, weights(b))) => :statistic)
Xt_mat = Array{Float64, 2}(undef, (nd, design.replicates))
for i in 1:design.replicates
Xt_mat[:, i] = combine(gdf, [x, Symbol("replicate_"*string(i))] => ((a, c) -> func(a, weights(c))) => :statistic).statistic
Expand Down
2 changes: 1 addition & 1 deletion src/hist.jl
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ function hist(design::AbstractSurveyDesign, var::Symbol,
kwargs...
)
hist = histogram(bins = bins, normalization = normalization, kwargs...)
data(design.data) * mapping(var, weights = :weights) * hist |> draw
data(design.data) * mapping(var, weights = design.weights) * hist |> draw
end

function hist(design::AbstractSurveyDesign, var::Symbol,
Expand Down
6 changes: 3 additions & 3 deletions src/mean.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ julia> clus_one_stage = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw)
julia> mean(:api00, clus_one_stage)
1×2 DataFrame
Row │ mean SE
│ Float64 Float64
Row │ mean SE
│ Float64 Float64
─────┼──────────────────
1 │ 644.169 23.2919
Expand All @@ -25,7 +25,7 @@ julia> mean([:api00, :enroll], clus_one_stage)
```
"""
function mean(x::Symbol, design::ReplicateDesign)
X = mean(design.data[!, x], weights(design.data.weights))
X = mean(design.data[!, x], weights(design.data[!,design.weights]))
Xt = [mean(design.data[!, x], weights(design.data[! , "replicate_"*string(i)])) for i in 1:design.replicates]
variance = sum((Xt .- X).^2) / design.replicates
DataFrame(mean = X, SE = sqrt(variance))
Expand Down
2 changes: 1 addition & 1 deletion src/plot.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,5 @@ save("scatter.png", s); nothing # hide
![](assets/scatter.png)
"""
function plot(design::AbstractSurveyDesign, x::Symbol, y::Symbol; kwargs...)
data(design.data) * mapping(x, y, markersize = :weights) * visual(Scatter, marker = '') |> draw
data(design.data) * mapping(x, y, markersize = design.weights) * visual(Scatter, marker = '') |> draw
end
11 changes: 1 addition & 10 deletions src/quantile.jl
Original file line number Diff line number Diff line change
Expand Up @@ -37,17 +37,8 @@ julia> quantile(:enroll,srs,[0.1,0.2,0.5,0.75,0.95])
function quantile(var::Symbol, design::SurveyDesign, p::Union{<:Real,Vector{<:Real}};
alpha::Float64=0.05, ci::Bool=false, se::Bool=false, qrule="hf7",kwargs...)
v = design.data[!, var]
probs = design.data[!, :probs]
probs = design.data[!, design.allprobs]
df = DataFrame(probability=p, quantile=Statistics.quantile(v, ProbabilityWeights(probs), p))
# TODO: Add CI and SE of the quantile
return df
end

function quantile(var::Symbol, design::SurveyDesign, p::Union{<:Real,Vector{<:Real}};
alpha::Float64=0.05, ci::Bool=false, se::Bool=false, qrule="hf7",kwargs...)
v = design.data[!, var]
probs = design.data[!, :probs]
df = DataFrame(probability=p, quantile=Statistics.quantile(v, ProbabilityWeights(probs), p)) # Not sure which quantile defintion this returns
# TODO: Add CI and SE of the quantile
return df
end
4 changes: 2 additions & 2 deletions src/ratio.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,14 @@ julia> ratio(:api00, :enroll, clus_one_stage)
```
"""
function ratio(variable_num:: Symbol, variable_den:: Symbol, design::SurveyDesign)
statistic = wsum(design.data[!,variable_num],design.data.weights)/wsum(design.data[!,variable_den],design.data.weights)
statistic = wsum(design.data[!,variable_num],design.data[!,design.weights])/wsum(design.data[!,variable_den],design.data[!,design.weights])
nh = length(unique(design.data[!,design.cluster]))
newv = []
gdf = groupby(design.data, design.cluster)
replicates = [filter(n -> n != i, 1:nh) for i in 1:nh]
for i in replicates
df = DataFrame(gdf[i])
push!(newv, wsum(df[!,variable_num],df[!,:weights])/wsum(df[!,variable_den],df[!,:weights]))
push!(newv, wsum(df[!,variable_num],df[!,design.weights])/wsum(df[!,variable_den],df[!,design.weights]))
end
c = 0
for i in 1:nh
Expand Down
6 changes: 3 additions & 3 deletions src/show.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ Helper function that transforms a given `Number` or `Vector` into a short-form s
"""
function makeshort(x)
if isa(x[1], Float64)
x = round.(x, sigdigits=3)
x = round.(x, digits=4) # Rounded to 4 digits after the decimal place
end
# print short vectors or single values as they are, compress otherwise
if length(x) > 1
Expand Down Expand Up @@ -56,6 +56,6 @@ function surveyshow(io::IO, design::AbstractSurveyDesign)
printinfo(io, "popsize", makeshort(design.data[!, design.popsize]))
printinfo(io, "sampsize", makeshort(design.data[!, design.sampsize]))
# weights and probs info
printinfo(io, "weights", makeshort(design.data[!, :weights]))
printinfo(io, "probs", makeshort(design.data[!, :probs]); newline=false)
printinfo(io, "weights", makeshort(design.data[!, design.weights]))
printinfo(io, "allprobs", makeshort(design.data[!, design.allprobs]); newline=false)
end
2 changes: 1 addition & 1 deletion src/total.jl
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ julia> total([:api00, :enroll], clus_one_stage)
```
"""
function total(x::Symbol, design::ReplicateDesign)
X = wsum(design.data[!, x], weights(design.data.weights))
X = wsum(design.data[!, x], weights(design.data[!,design.weights]))
Xt = [wsum(design.data[!, x], weights(design.data[! , "replicate_"*string(i)])) for i in 1:design.replicates]
variance = sum((Xt .- X).^2) / design.replicates
DataFrame(total = X, SE = sqrt(variance))
Expand Down
Loading

0 comments on commit e12c9e5

Please sign in to comment.