diff --git a/docs/src/assets/hist.png b/docs/src/assets/hist.png new file mode 100644 index 00000000..622a627b Binary files /dev/null and b/docs/src/assets/hist.png differ diff --git a/src/Survey.jl b/src/Survey.jl index 6316d562..82fb20c9 100644 --- a/src/Survey.jl +++ b/src/Survey.jl @@ -18,7 +18,7 @@ export svydesign, svyby, svyglm export data, api, apiclus1, apiclus2, apipop, apistrat, apisrs export svymean, svytotal, svyquantile export @formula -export svyhist +export svyhist, sturges, freedman_diaconis export #families Normal, diff --git a/src/svyhist.jl b/src/svyhist.jl index 1e9273a5..e4e48402 100644 --- a/src/svyhist.jl +++ b/src/svyhist.jl @@ -1,24 +1,164 @@ -sturges(n::Integer) = ceil(Int, log2(n)) + 1 +""" + sturges(v) + +Calculate the number of bins to use in a histogram using the Sturges rule. +# Examples +```jldoctest +julia> sturges(10) +5 + +julia> sturges([10, 20, 30, 40, 50]) +4 +``` +""" +sturges(n::Integer) = ceil(Int, log2(n)) + 1 sturges(vec::AbstractVector) = ceil(Int, log2(length(vec))) + 1 +""" + sturges(df::DataFrame, var::Symbol) + +Calculate the number of bins for a `DataFrame` variable. + +# Examples +```jldoctest +julia> using DataFrames + +julia> df = DataFrame((a=[1, 2, 3, 4, 5], b=[10, 20, 30, 40, 50])) +5×2 DataFrame + Row │ a b + │ Int64 Int64 +─────┼────────────── + 1 │ 1 10 + 2 │ 2 20 + 3 │ 3 30 + 4 │ 4 40 + 5 │ 5 50 + +julia> sturges(df, :b) +4 +``` +""" sturges(df::DataFrame, var::Symbol) = ceil(Int, log2(size(df[!, var], 1))) + 1 +""" + sturges(design::svydesign, var::Symbol) + +Calculate the number of bins for a survey design variable. + +# Examples +```jldoctest +julia> data(api); + +julia> dstrat = svydesign(data = apistrat, id = :1, strata = :stype, weights = :pw, fpc = :fpc); + +julia> sturges(dstrat, :enroll) +9 +``` +""" sturges(design::svydesign, var::Symbol) = sturges(design.variables, var) +""" + freedman_diaconis(v::AbstractVector) + +Calculate the number of bins to use in a histogram using the Freedman-Diaconis rule. + +# Examples +```jldoctest +julia> freedman_diaconis([10, 20, 30, 40, 50]) +2 +``` +""" +freedman_diaconis(v::AbstractVector) = round(Int, length(v)^(1 / 3) * (maximum(v) - minimum(v)) / (2 * iqr(v))) + +""" + freedman_diaconis(df::DataFrame, var::Symbol) + +Calculate the number of bins for a `DataFrame` variable. + +# Examples +```jldoctest +julia> using DataFrames + +julia> df = DataFrame((a=[1, 2, 3, 4, 5], b=[10, 20, 30, 40, 50])); + +julia> freedman_diaconis(df, :b) +2 +``` +""" +freedman_diaconis(df::DataFrame, var::Symbol) = freedman_diaconis(df[!, var]) + +""" + freedman_diaconis(design::svydesign, var::Symbol) + +Calculate the number of bins for a survey design variable. + +# Examples +```jldoctest +julia> data(api); + +julia> dstrat = svydesign(data = apistrat, id = :1, strata = :stype, weights = :pw, fpc = :fpc); + +julia> freedman_diaconis(dstrat, :enroll) +15 +``` +""" +freedman_diaconis(design::svydesign, var::Symbol) = freedman_diaconis(design.variables[!, var]) """ -Histogram plot of a survey design variable. +```julia +svyhist(design, var, bins = freedman_diaconis; normalization = :density, weights = ones(size(design.variables, 1), ...) +``` +Histogram plot of a survey design variable given by `var`. + +`bins` can be an `Integer` specifying the number of equal-width bins, +an `AbstractVector` specifying the bins intervals, or a `Function` specifying +the function used for calculating the number of bins. The possible functions +are `sturges` and `freedman_diaconis`. + +The normalization can be set to `:none`, `:density`, `:probability` or `:pdf`. +See [Makie.hist](https://makie.juliaplots.org/stable/examples/plotting_functions/hist/) +for more information. + +The `weights` argument can be an `AbstractVector` or a `Symbol` specifying a +design variable. + +For the complete argument list see [Makie.hist](https://makie.juliaplots.org/stable/examples/plotting_functions/hist/). + +```@example e1 +julia> using survey + +julia> data(api); + +julia> dstrat = svydesign(data = apistrat, id = :1, strata = :stype, weights = :pw, fpc = :fpc); + +julia> h = svyhist(dstrat, :enroll) +``` + +![](./assets/hist.png) """ -function svyhist(design::svydesign, var::Symbol; bins = sturges(design, var), - normalization = :density, - weights::AbstractVector = ones(size(design.variables, 1)), kwargs...) +function svyhist(design::svydesign, var::Symbol, + bins::Union{Integer, AbstractVector} = freedman_diaconis(design, var); + normalization = :density, + weights::Union{Symbol, AbstractVector} = ones(size(design.variables, 1)), + kwargs... + ) + if isa(weights, Symbol) + weights = design.variables[!, weights] + end + hist(design.variables[!, var]; bins = bins, normalization = normalization, weights = weights, kwargs...) end -function svyhist(design::svydesign, var::Symbol; bins = sturges(design, var), - normalization = :density, - weights::Symbol, kwargs...) - weights = design.variables[!, weights] - hist(design.variables[!, var]; bins = bins, normalization = normalization, weights = weights, kwargs...) +function svyhist(design::svydesign, var::Symbol, + bins::Function = freedman_diaconis; + normalization = :density, + weights::Union{Symbol, AbstractVector} = ones(size(design.variables, 1)), + kwargs... + ) + if isa(weights, Symbol) + weights = design.variables[!, weights] + end + + hist(design.variables[!, var]; bins = bins(design, var), normalization = normalization, weights = weights, kwargs...) end diff --git a/test/svyhist.jl b/test/svyhist.jl index 690897ae..b297c365 100644 --- a/test/svyhist.jl +++ b/test/svyhist.jl @@ -4,8 +4,27 @@ using Test @testset "svyhist.jl" begin data(api) dstrat = svydesign(data = apistrat, id = :1, strata = :stype, weights = :pw, fpc = :fpc) - h = svyhist(dstrat, :enroll, weights = :pw) + @test Survey.sturges(10) == 5 + @test Survey.sturges([1, 2, 5, 10, 15, 17, 20]) == 4 + + h = svyhist(dstrat, :enroll) + @test getindex(h.plot.bins) == 15 + @test getindex(h.plot.weights) == ones(length(dstrat.variables.pw)) + + h = svyhist(dstrat, :enroll; weights = :pw) + @test getindex(h.plot.bins) == 15 + @test getindex(h.plot.weights) == dstrat.variables.pw + + h = svyhist(dstrat, :enroll, 9; weights = :pw) @test getindex(h.plot.bins) == 9 @test getindex(h.plot.weights) == dstrat.variables.pw + + h = svyhist(dstrat, :enroll, Survey.sturges; weights = :pw) + @test getindex(h.plot.bins) == 9 + @test getindex(h.plot.weights) == dstrat.variables.pw + + h = svyhist(dstrat, :enroll, [0, 1000, 2000, 3000]; weights = :pw) + @test getindex(h.plot.bins) == [0, 1000, 2000, 3000] + @test getindex(h.plot.weights) == dstrat.variables.pw end