diff --git a/src/Survey.jl b/src/Survey.jl index 331eaa31..f902bdc5 100644 --- a/src/Survey.jl +++ b/src/Survey.jl @@ -2,6 +2,7 @@ module Survey using DataFrames using Statistics +import Statistics: quantile using StatsBase import StatsBase: mean,quantile using CSV diff --git a/src/mean.jl b/src/mean.jl index f24ace7b..95869857 100644 --- a/src/mean.jl +++ b/src/mean.jl @@ -95,8 +95,8 @@ end Estimate the subpopulation mean of a variable `x`. -The calculations were done according to the book [Calibration Estimators in Survey Sampling](https://www.tandfonline.com/doi/abs/10.1080/01621459.1992.10475217) -by Jean-Claude Deville and Carl-Erik Sarndal. +The calculations were done according to the book [Model-Assisted Survey Sampling](https://link.springer.com/book/9780387406206) +by Carl-Erik Sarndal, Bengt Swensson, Jan Wretman, section 3.3 and Chap 10. Assumes popsize is known and subpopulation size is unknown. ```jldoctest julia> using Survey; diff --git a/src/quantile.jl b/src/quantile.jl index f2134e9d..8ee6000c 100644 --- a/src/quantile.jl +++ b/src/quantile.jl @@ -1,39 +1,53 @@ """ - quantile(var, design, q) -Estimate quantiles for `SurveyDesign`s. + quantile(var, design, p; kwargs...) +Estimate quantiles for a complex survey. + +Hyndman and Fan compiled a taxonomy of nine algorithms to estimate quantiles. These are implemented in Statistics.quantile, which this function calls. +The Julia, R and Python-numpy use the same defaults + +# References: +- Hyndman, R.J and Fan, Y. (1996) ["Sample Quantiles in Statistical Packages"](https://www.amherst.edu/media/view/129116/original/Sample+Quantiles.pdf), The American Statistician, Vol. 50, No. 4, pp. 361-365. +- [Quantiles](https://en.m.wikipedia.org/wiki/Quantile) on wikipedia +- [Complex Surveys: a guide to analysis using R](https://r-survey.r-forge.r-project.org/svybook/), Section 2.4.1 and Appendix C.4. ```jldoctest julia> apisrs = load_data("apisrs"); julia> srs = SimpleRandomSample(apisrs;popsize=:fpc); -julia> quantile(:enroll, srs, 0.5) -1×1 DataFrame - Row │ 0.5th percentile - │ Float64 -─────┼────────────────── - 1 │ 453.0 +julia> quantile(:api00,srs,0.5) +1×2 DataFrame + Row │ probability quantile + │ Float64 Float64 +─────┼─────────────────────── + 1 │ 0.5 659.0 + +julia> quantile(:enroll,srs,[0.1,0.2,0.5,0.75,0.95]) +5×2 DataFrame + Row │ probability quantile + │ Float64 Float64 +─────┼─────────────────────── + 1 │ 0.1 245.5 + 2 │ 0.2 317.6 + 3 │ 0.5 453.0 + 4 │ 0.75 668.5 + 5 │ 0.95 1473.1 ``` """ -function quantile(var, design::SimpleRandomSample, q; kwargs...) - x = design.data[!, var] - df = DataFrame(tmp = Statistics.quantile(Float32.(x), q; kwargs...)) - rename!(df, :tmp => Symbol(string(q) .* "th percentile")) - return df -end - -function quantile(var, design::StratifiedSample, q) - x = design.data[!, var] - w = design.data.probs - df = DataFrame(tmp = Statistics.quantile(Float32.(x), weights(w), q)) - rename!(df, :tmp => Symbol(string(q) .* "th percentile")) +function quantile(var::Symbol, design::SimpleRandomSample, p::Union{<:Real,Vector{<:Real}}; + alpha::Float64=0.05, ci::Bool=false, se::Bool=false, qrule="hf7",kwargs...) + v = design.data[!, var] + probs = design.data[!, :probs] + df = DataFrame(probability=p, quantile=Statistics.quantile(v, ProbabilityWeights(probs), p)) + # TODO: Add CI and SE of the quantile return df end -# Inner method for `by` -function quantile(x, w, _, q) - df = DataFrame(tmp = Statistics.quantile(Float32.(x), weights(w), q)) - rename!(df, :tmp => Symbol(string(q) .* "th percentile")) - +function quantile(var::Symbol, design::StratifiedSample, p::Union{<:Real,Vector{<:Real}}; + alpha::Float64=0.05, ci::Bool=false, se::Bool=false, qrule="hf7",kwargs...) + v = design.data[!, var] + probs = design.data[!, :probs] + df = DataFrame(probability=p, quantile=Statistics.quantile(v, ProbabilityWeights(probs), p)) # Not sure which quantile defintion this returns + # TODO: Add CI and SE of the quantile return df -end +end \ No newline at end of file diff --git a/test/quantile.jl b/test/quantile.jl index 3dcedfbb..cab18fdb 100644 --- a/test/quantile.jl +++ b/test/quantile.jl @@ -1,11 +1,34 @@ -@testset "quantile.jl" begin - # SimpleRandomSample +@testset "quantile_SimpleRandomSample" begin + ##### SimpleRandomSample tests + # Load API datasets apisrs_original = load_data("apisrs") + apisrs_original[!, :derived_probs] = 1 ./ apisrs_original.pw + apisrs_original[!, :derived_sampsize] = fill(200.0, size(apisrs_original, 1)) + ############################## + apisrs = copy(apisrs_original) + srs_design = SimpleRandomSample(apisrs,popsize=:fpc) + @test quantile(:api00,srs_design,0.5)[!,2][1] ≈ 659.0 atol=1e-4 + @test quantile(:api00,srs_design,[0.1753,0.25,0.5,0.75,0.975])[!,2] ≈ [512.8847,544,659,752.5,905] atol = 1e-4 + @test quantile(:enroll,srs_design,[0.1,0.2,0.5,0.75,0.95])[!,2] ≈ [245.5,317.6,453.0,668.5,1473.1] atol = 1e-4 +end - apisrs = copy(apisrs_original) - srs_new = SimpleRandomSample(apisrs; popsize=:fpc, ignorefpc=true) - # 0.5th percentile - # 0.25th percentile +@testset "quantile_Stratified" begin + ##### StratifiedSample tests + # Load API datasets + apistrat_original = load_data("apistrat") + apistrat_original[!, :derived_probs] = 1 ./ apistrat_original.pw + apistrat_original[!, :derived_sampsize] = apistrat_original.fpc ./ apistrat_original.pw + # base functionality + apistrat = copy(apistrat_original) + dstrat = StratifiedSample(apistrat, :stype; popsize = :fpc) + # Check which definition of quantile for StratifiedSample + # @test quantile(:enroll,dstrat,[0.1,0.2,0.5,0.75,0.95])[!,2] ≈ [262,309.3366,446.4103,658.8764,1589.7881] atol = 1e-4 +end - # StratifiedSample +@testset "quantile_by_SimpleRandomSample" begin + ## Add tests end + +@testset "quantile_by_Stratified" begin + ## Add tests +end \ No newline at end of file