diff --git a/docs/src/assets/hist.png b/docs/src/assets/hist.png index 58b3f413..966d3ea2 100644 Binary files a/docs/src/assets/hist.png and b/docs/src/assets/hist.png differ diff --git a/docs/src/assets/scatter.png b/docs/src/assets/scatter.png index 1c1304ec..fff128d7 100644 Binary files a/docs/src/assets/scatter.png and b/docs/src/assets/scatter.png differ diff --git a/docs/src/examples.md b/docs/src/examples.md index 1776ce2d..e9e22338 100644 --- a/docs/src/examples.md +++ b/docs/src/examples.md @@ -1,7 +1,62 @@ # Examples -The following examples use the Academic Performance Index (API) dataset for Californian schools. +The following examples use the +[Academic Performance Index](https://r-survey.r-forge.r-project.org/survey/html/api.html) +(API) dataset for Californian schools. The data sets contain information for all schools +with at least 100 students and for various probability samples of the data. -```@docs -svyby(formula::Symbol, by, design::svydesign, func::Function, params = []) +The API program has been discontinued at the end of 2018. Information is archived at +[https://www.cde.ca.gov/re/pr/api.asp](https://www.cde.ca.gov/re/pr/api.asp) + +## Simple Random Sample + +Firstly, a survey design needs a dataset from which to gather information. A dataset +can be loaded as a `DataFrame` using the `load_data` function: + +```julia +julia> apisrs = load_data("apisrs"); +``` + +Next, we can build a design. The most basic survey design is a simple random sample design. +A [`SimpleRandomSample`](@ref) can be instantianted by calling the constructor: + +```julia +julia> srs = SimpleRandomSample(apisrs; weights = :pw) +SimpleRandomSample: +data: 200x42 DataFrame +weights: 31.0, 31.0, 31.0, ..., 31.0 +probs: 0.0323, 0.0323, 0.0323, ..., 0.0323 +fpc: 6194, 6194, 6194, ..., 6194 +popsize: 6194 +sampsize: 200 +sampfraction: 0.0323 +ignorefpc: false ``` + +With a `SimpleRandomSample` (as well as with any subtype of [`AbstractSurveyDesign`](@ref)) +it is possible to calculate estimates of the mean or population total for a given variable, +along with the corresponding standard errors. + +```julia +julia> svymean(:api00, srs) +1×2 DataFrame + Row │ mean sem + │ Float64 Float64 +─────┼────────────────── + 1 │ 656.585 9.24972 + +julia> svytotal(:api00, srs) +1×2 DataFrame + Row │ total se_total + │ Float64 Float64 +─────┼───────────────────── + 1 │ 4.06689e6 57292.8 +``` + +The design can be tweaked by specifying the population or sample size or whether +or not to account for finite population correction (fpc). By default the weights +are equal to one, the sample size is equal to the number of rows in `data` and the +fpc is not ignored. The population size is calculated from the weights. + +When `ignorefpc` is set to `false` the `fpc` is calculated from the sample and population +sizes. When it is set to `true` it is set to 1. diff --git a/docs/src/index.md b/docs/src/index.md index 72f72983..9d295a4d 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -10,8 +10,32 @@ This package is the Julia implementation of the [Survey package in R](https://cr At [xKDR](https://xkdr.org/) we processed millions of records from household surveys using the survey package in R. This process took hours of computing time. By implementing the code in Julia, we are able to do the processing in seconds. In this package we have implemented the functions `svymean`, `svyquantile` and `svysum`. We have kept the syntax between the two packages similar so that we can easily move our existing code to the new language. -Documentation for [Survey](https://github.com/Survey.jl). +## Index -```@autodocs -Modules = [Survey] +```@index +Module = [Survey] +Private = false +``` + +## API +```@docs +load_data +AbstractSurveyDesign +SimpleRandomSample +StratifiedSample +ClusterSample +dim(design::AbstractSurveyDesign) +colnames(design::AbstractSurveyDesign) +dimnames(design::AbstractSurveyDesign) +svymean(x::Symbol, design::SimpleRandomSample) +svytotal(x::Symbol, design::SimpleRandomSample) +svyby +svyglm +svyplot(design::AbstractSurveyDesign, x::Symbol, y::Symbol; kwargs...) +svyhist(design::AbstractSurveyDesign, var::Symbol, + bins::Union{Integer, AbstractVector} = freedman_diaconis(design, var); + normalization = :density, + kwargs... + ) +svyboxplot(design::AbstractSurveyDesign, x::Symbol, y::Symbol; kwargs...) ``` diff --git a/shikharTests.jl b/shikharTests.jl deleted file mode 100644 index e311257b..00000000 --- a/shikharTests.jl +++ /dev/null @@ -1,101 +0,0 @@ -## Shikhar added test 24.08.22 -using Revise; -using Survey; -apisrs = load_data("apisrs"); -srs = SimpleRandomSample(apisrs, weights = apisrs.pw ); -svymean(:enroll, srs) - -# Test without fpc -using Revise; -using Survey; -apisrs_nofpc = load_data("apisrs"); -srs = SimpleRandomSample(apisrs_nofpc,weights = apisrs.pw,ignorefpc = true); -svytotal(:enroll, srs) - -using Revise; -using Survey; -using DataFrames; -apisrs = load_data("apisrs"); -srs = SimpleRandomSample(apisrs, weights = apisrs.pw ); -svytotal(:enroll, srs) - -srs_design = SimpleRandomSample(apisrs, weights = apisrs.pw ); -factor_variable_test = svytotal(:stype, srs) - -########## -using Survey -srs_design = SimpleRandomSample(apisrs, weights = apisrs.pw ) - - -macro svypipe(design::AbstractSurveyDesign, args...) - # Some definitions -end -@svypipe design |> groupby(:country) |> mean(:height) - -using StatsBase -combine(groupby(x, :country) , :height => mean) - -# Works -@pipe x |> groupby(_, :country) |> combine(_, :height => mean) -#doesnt work -@pipe x |> groupby(:country) |> combine(_, :height => mean) - -using Lazy -import DataFrames.groupby -@> x groupby(:country) combine(:height => mean) - - - - -### Test svyby -svyby(:api00,:cname, srs, svymean ) -groupby(apisrs,:cname) -combine(groupby(apisrs,:cname) , :api00 => mean) -combine(groupby(apisrs,:cname) , :api00 => svymean => AsTable) - - - - -x = DataFrame(country = [1,2,3,4,4], height = [10,20,30,40,20]) - -svyby(srs_desing, [enroll,] , summarise = mean, col = col1) - -(srs_design, enroll) - -# function |> (design::AbstractSurveyDesign ; func) -# design.data |> func(...) -# end - - - -### 5.09.22 Cleaned up tests -using Revise; -using Survey; -apisrs = load_data("apisrs"); -srs = SimpleRandomSample(apisrs, weights = apisrs.pw ); -svymean(:enroll, srs) - - -# New issue: -# Add CategoricalArrays ("Factor") support, multiple dispatch -# Add multiple dispatch methods for `CategoricalArray` type columns in the dataset - -# • Intelligent parsing of `StringX` columns to be read as CategoricalArrays. -# Eg/ if nunique(col) < len(col)/2 - - # # If sampling probabilities given then sampling weights is inverse of probs - # if !isnothing(probs) - # weights = 1 ./ probs - # end - - - # sampsize::Union{Nothing,Vector{<:Real}} - # popsize::Union{Nothing,Vector{<:Real}} - # sampfraction::Vector{<:Real} - # fpc::Vector{<:Real} - # combine(gdf) do sdf - # DataFrame(mean = mean(sdf[!, x], sem = sem(x, design::SimpleRandomSample))) - # end - - # if isa(x,Symbol) && - # return DataFrame(mean = ["Yolo"], sem = ["Yolo"]) \ No newline at end of file diff --git a/src/Survey.jl b/src/Survey.jl index e56e22b6..ad5c0b27 100644 --- a/src/Survey.jl +++ b/src/Survey.jl @@ -11,12 +11,11 @@ using AlgebraOfGraphics using CategoricalArrays include("SurveyDesign.jl") -include("show.jl") include("svydesign.jl") include("svymean.jl") include("svyquantile.jl") include("svytotal.jl") -include("example.jl") +include("load_data.jl") include("svyglm.jl") include("svyhist.jl") include("svyplot.jl") @@ -24,6 +23,7 @@ include("dimnames.jl") include("svyboxplot.jl") include("svyby.jl") include("ht.jl") +include("show.jl") export load_data export AbstractSurveyDesign, SimpleRandomSample, StratifiedSample diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index d4f657d0..0e05c858 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -1,9 +1,13 @@ """ -Supertype for every survey design type: `SimpleRandomSample`, `ClusterSample` -and `StratifiedSample`. + AbstractSurveyDesign -The data to a survey constructor is modified. To avoid this pass a copy of the data -instead of the original. +Supertype for every survey design type: [`SimpleRandomSample`](@ref), [`StratifiedSample`](@ref) +and [`ClusterSample`](@ref). + +!!! note + + The data passed to a survey constructor is modified. To avoid this pass a copy of the data + instead of the original. """ abstract type AbstractSurveyDesign end diff --git a/src/dimnames.jl b/src/dimnames.jl index 09b2ffcf..0a591ee4 100644 --- a/src/dimnames.jl +++ b/src/dimnames.jl @@ -1,11 +1,12 @@ """ dim(design) + Get the dimensions of a `SurveyDesign`. ```jldoctest julia> apisrs = load_data("apisrs"); -julia> srs = SimpleRandomSample(apisrs); +julia> srs = SimpleRandomSample(apisrs; weights = :pw); julia> dim(srs) (200, 42) @@ -14,7 +15,7 @@ julia> dim(srs) dim(design::AbstractSurveyDesign) = size(design.data) """ -Method for `svydesign` object. +Method for `svydesign`. ```jldoctest julia> apistrat = load_data("apistrat"); @@ -29,12 +30,13 @@ dim(design::svydesign) = size(design.variables) """ colnames(design) + Get the column names of a `SurveyDesign`. ```jldoctest julia> apisrs = load_data("apisrs"); -julia> srs = SimpleRandomSample(apisrs); +julia> srs = SimpleRandomSample(apisrs; weights = :pw); julia> colnames(srs) 42-element Vector{String}: @@ -63,7 +65,7 @@ julia> colnames(srs) colnames(design::AbstractSurveyDesign) = names(design.data) """ -Method for `svydesign` objects. +Method for `svydesign`. ```jldoctest julia> apistrat = load_data("apistrat"); @@ -98,12 +100,13 @@ colnames(design::svydesign) = names(design.variables) """ dimnames(design) + Get the names of the rows and columns of a `SurveyDesign`. ```jldoctest julia> apisrs = load_data("apisrs"); -julia> srs = SimpleRandomSample(apisrs); +julia> srs = SimpleRandomSample(apisrs; weights = :pw); julia> dimnames(srs) 2-element Vector{Vector{String}}: @@ -114,7 +117,7 @@ julia> dimnames(srs) dimnames(design::AbstractSurveyDesign) = [string.(1:size(design.data, 1)), names(design.data)] """ -Method for `svydesign` objects. +Method for `svydesign`. ```jldoctest julia> apistrat = load_data("apistrat"); diff --git a/src/example.jl b/src/example.jl deleted file mode 100644 index 0c49e0d4..00000000 --- a/src/example.jl +++ /dev/null @@ -1,26 +0,0 @@ -const PKG_DIR = joinpath(pathof(Survey), "..", "..") |> normpath -asset_path(args...) = joinpath(PKG_DIR, "assets", args...) - -""" -The Academic Performance Index is computed for all California schools based on standardised testing of students. -The data sets contain information for all schools with at least 100 students and for various probability samples of the data. - -Use `load_data(name)` to load API data, with -`name ∈ ["apiclus1", "apiclus2", "apipop", "apistrat", "apisrs"]` -being the name of the dataset. - -```julia -df = load_data("apiclus1") -``` - -Details about the columns of the dataset can be found here: https://r-survey.r-forge.r-project.org/survey/html/api.html - -The API program has been discontinued at the end of 2018. Information is archived at https: -//www.cde.ca.gov/re/pr/api.asp -""" -function load_data(name) - name = name * ".csv" - @assert name ∈ readdir(asset_path()) - - CSV.read(asset_path(name), DataFrame, missingstring="NA") -end diff --git a/src/load_data.jl b/src/load_data.jl new file mode 100644 index 00000000..bfae08d4 --- /dev/null +++ b/src/load_data.jl @@ -0,0 +1,42 @@ +const PKG_DIR = joinpath(pathof(Survey), "..", "..") |> normpath +asset_path(args...) = joinpath(PKG_DIR, "assets", args...) + +""" + load_data(name) + +Load a dataset as a `DataFrame`. + +All available datasets can be found in the [`assets/`](https://github.com/xKDR/Survey.jl/tree/main/assets) +directory. + +```jldoctest +julia> apisrs = load_data("apisrs") +200×40 DataFrame + Row │ Column1 cds stype name sname ⋯ + │ Int64 Int64 String1 String15 String ⋯ +─────┼────────────────────────────────────────────────────────────────────────── + 1 │ 1039 15739081534155 H McFarland High McFarland High ⋯ + 2 │ 1124 19642126066716 E Stowers (Cecil Stowers (Cecil B.) E + 3 │ 2868 30664493030640 H Brea-Olinda Hig Brea-Olinda High + 4 │ 1273 19644516012744 E Alameda Element Alameda Elementary + 5 │ 4926 40688096043293 E Sunnyside Eleme Sunnyside Elementary ⋯ + 6 │ 2463 19734456014278 E Los Molinos Ele Los Molinos Elementa + 7 │ 2031 19647336058200 M Northridge Midd Northridge Middle + 8 │ 1736 19647336017271 E Glassell Park E Glassell Park Elemen + ⋮ │ ⋮ ⋮ ⋮ ⋮ ⋮ ⋱ + 194 │ 4880 39686766042782 E Tyler Skills El Tyler Skills Element ⋯ + 195 │ 993 15636851531987 H Desert Junior/S Desert Junior/Senior + 196 │ 969 15635291534775 H North High North High + 197 │ 1752 19647336017446 E Hammel Street E Hammel Street Elemen + 198 │ 4480 37683386039143 E Audubon Element Audubon Elementary ⋯ + 199 │ 4062 36678196036222 E Edison Elementa Edison Elementary + 200 │ 2683 24657716025621 E Franklin Elemen Franklin Elementary + 36 columns and 185 rows omitted +``` +""" +function load_data(name) + name = name * ".csv" + @assert name ∈ readdir(asset_path()) + + CSV.read(asset_path(name), DataFrame, missingstring="NA") +end diff --git a/src/show.jl b/src/show.jl index 0a1f91d0..b3eb7876 100644 --- a/src/show.jl +++ b/src/show.jl @@ -64,4 +64,4 @@ function Base.show(io::IO, ::MIME"text/plain", design::svydesign) print(design.nest) printstyled("\ncheck_strat: "; bold=true) print(design.check_strat) -end \ No newline at end of file +end diff --git a/src/svyboxplot.jl b/src/svyboxplot.jl index baa78512..68555bfb 100644 --- a/src/svyboxplot.jl +++ b/src/svyboxplot.jl @@ -1,23 +1,21 @@ """ -``` svyboxplot(design, x, y; kwargs...) -``` + Box plot of survey design variable `y` grouped by column `x`. -Weights can be specified by a Symbol using the keyword argument `weights`. +Weights can be specified by a `Symbol` using the keyword argument `weights`. The keyword arguments are all the arguments that can be passed to `mapping` in [AlgebraOfGraphics](https://docs.juliahub.com/AlgebraOfGraphics/CHIaw/0.4.7/). ```@example svyboxplot -julia> apisrs = load_data("apisrs"); - -julia> srs = SimpleRandomSample(apisrs); - -julia> bp = svyboxplot(srs, :stype, :enroll; weights = :pw) +apisrs = load_data("apisrs"); +srs = SimpleRandomSample(apisrs; weights = :pw); +bp = svyboxplot(srs, :stype, :enroll; weights = :pw) +save("boxplot.png", bp); nothing # hide ``` -![](./assets/boxplot.png) +![](assets/boxplot.png) """ function svyboxplot(design::AbstractSurveyDesign, x::Symbol, y::Symbol; kwargs...) map = mapping(x, y; kwargs...) @@ -29,8 +27,8 @@ end """ Method for `svydesign`. """ -# TODO: change function, make it a wrapper function svyboxplot(design::svydesign, x::Symbol, y::Symbol; kwargs...) + # TODO: change function, make it a wrapper map = mapping(x, y; kwargs...) data = AlgebraOfGraphics.data(design.variables) diff --git a/src/svyby.jl b/src/svyby.jl index 76b0b226..46eef093 100644 --- a/src/svyby.jl +++ b/src/svyby.jl @@ -1,34 +1,34 @@ """ -The `svyby` function can be used to generate subsets of a survey design. + svyby(formula, by, design, function, params) -```jldoctest -julia> using Survey +Generate subsets of a survey design. +```jldoctest julia> apisrs = load_data("apisrs"); -julia> srs = SimpleRandomSample(apisrs); +julia> srs = SimpleRandomSample(apisrs; weights = :pw); -julia> svyby(:api00, :cname, srs, svytotal) +julia> svyby(:api00, :cname, srs, svymean) 38×3 DataFrame - Row │ cname total se_total - │ String15 Float64 Float64 + Row │ cname mean sem + │ String15 Float64 Float64 ─────┼──────────────────────────────────── - 1 │ Kern 5736.0 2045.98 - 2 │ Los Angeles 29617.0 2050.04 - 3 │ Orange 6744.0 1234.81 - 4 │ San Luis Obispo 739.0 NaN - 5 │ San Francisco 1675.0 1193.85 - 6 │ Modoc 671.0 NaN - 7 │ Alameda 7437.0 1633.82 - 8 │ Solano 1869.0 1219.59 + 1 │ Kern 573.6 42.8026 + 2 │ Los Angeles 658.156 21.0728 + 3 │ Orange 749.333 27.0613 + 4 │ San Luis Obispo 739.0 NaN + 5 │ San Francisco 558.333 39.2453 + 6 │ Modoc 671.0 NaN + 7 │ Alameda 676.091 32.7536 + 8 │ Solano 623.0 40.0916 ⋮ │ ⋮ ⋮ ⋮ - 32 │ Kings 939.0 1190.0 - 33 │ Shasta 1508.0 1600.0 - 34 │ Yolo 475.0 NaN - 35 │ Calaveras 790.0 NaN - 36 │ Napa 1454.0 1340.0 - 37 │ Lake 804.0 NaN - 38 │ Merced 595.0 NaN + 32 │ Kings 469.5 41.4919 + 33 │ Shasta 754.0 55.7874 + 34 │ Yolo 475.0 NaN + 35 │ Calaveras 790.0 NaN + 36 │ Napa 727.0 46.722 + 37 │ Lake 804.0 NaN + 38 │ Merced 595.0 NaN 23 rows omitted ``` """ diff --git a/src/svydesign.jl b/src/svydesign.jl index d419059b..894920cf 100644 --- a/src/svydesign.jl +++ b/src/svydesign.jl @@ -1,18 +1,20 @@ """ -The `svydesign` object combines a data frame and all the survey design information needed to analyse it. + svydesign + +Type incorporating all necessary information to describe a survey design. ```jldoctest -julia> apiclus1 = load_data("apiclus1"); +julia> apistrat = load_data("apistrat"); -julia> dclus1 = svydesign(id= :dnum, weights= :pw, data = apiclus1, fpc= :fpc) |> print +julia> dstrat = svydesign(data = apistrat, id = :1, strata = :stype, weights = :pw, fpc = :fpc) Survey Design: -variables: 183x45 DataFrame -id: dnum -strata: 1, 1, 1 ... 1 -probs: 0.0295, 0.0295, 0.0295 ... 0.0295 +variables: 200x45 DataFrame +id: 1 +strata: E, E, E, ..., H +probs: 0.0226, 0.0226, 0.0226, ..., 0.0662 fpc: - popsize: 757, 757, 757 ... 757 - sampsize: 183, 183, 183 ... 183 + popsize: 4421, 4421, 4421, ..., 755 + sampsize: 200, 200, 200, ..., 200 nest: false check_strat: true ``` @@ -90,24 +92,3 @@ function svydesign(; data = DataFrame(), id = Symbol(), probs = nothing, strata df.strata = get_strata(data, strata) return svydesign(id, df, nest, check_strat) end - -function Base.show(io::IO, design::svydesign) - printstyled("Survey Design:\n") - printstyled("variables: "; bold = true) - print(size(design.variables)[1], "x", size(design.variables)[2], " DataFrame") - printstyled("\nid: "; bold = true) - print(design.id) - printstyled("\nstrata: "; bold = true) - print_short(design.variables.strata) - printstyled("\nprobs: "; bold = true) - print_short(design.variables.probs) - printstyled("\nfpc: "; bold = true) - print("\n popsize: ") - print_short(design.variables.popsize) - print("\n sampsize: ") - print_short(design.variables.sampsize) - printstyled("\nnest: "; bold = true) - print(design.nest) - printstyled("\ncheck_strat: "; bold = true) - print(design.check_strat) -end diff --git a/src/svyglm.jl b/src/svyglm.jl index d7758489..485bcc41 100644 --- a/src/svyglm.jl +++ b/src/svyglm.jl @@ -9,11 +9,9 @@ mutable struct control end """ -```julia -svyglm(formula, design, dist, link) -``` + svyglm(formula, design, dist, link) -The `svyglm` function can be used to fit glms on svydesign. +Fit Generalized Linear Models (GLMs) on `svydesign`. ```jldoctest julia> apiclus1 = load_data("apiclus1"); @@ -21,7 +19,21 @@ julia> apiclus1 = load_data("apiclus1"); julia> dclus1 = svydesign(id=:dnum, weights=:pw, data = apiclus1); julia> svyglm(@formula(ell~meals),dclus1,Normal(),IdentityLink()) +StatsModels.TableRegressionModel{GLM.GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, Normal{Float64}, IdentityLink}, GLM.DensePredChol{Float64, LinearAlgebra.Cholesky{Float64, Matrix{Float64}}}}, Matrix{Float64}} + +ell ~ 1 + meals +Coefficients: +──────────────────────────────────────────────────────────────────────── + Coef. Std. Error z Pr(>|z|) Lower 95% Upper 95% +──────────────────────────────────────────────────────────────────────── +(Intercept) 6.86665 0.350512 19.59 <1e-84 6.17966 7.55364 +meals 0.410511 0.00613985 66.86 <1e-99 0.398477 0.422545 +──────────────────────────────────────────────────────────────────────── +Degrees of Freedom: 6193.000324249264 (i.e. Null); 6192.000324249264 Residual +Null Deviance: 1.7556928968296547e6 +Residual Deviance: 1.0196009035970895e6 +AIC: 49195.42124574161 ``` """ mutable struct svyglm diff --git a/src/svyhist.jl b/src/svyhist.jl index 9c46ede8..82392edf 100644 --- a/src/svyhist.jl +++ b/src/svyhist.jl @@ -50,7 +50,7 @@ Calculate the number of bins for a `SurveyDesign`. ```jldoctest julia> apisrs = load_data("apisrs"); -julia> srs = SimpleRandomSample(apisrs); +julia> srs = SimpleRandomSample(apisrs; weights = :pw); julia> sturges(srs, :enroll) 9 @@ -114,7 +114,7 @@ Calculate the number of bins for a `SurveyDesign`. ```jldoctest julia> apisrs = load_data("apisrs"); -julia> srs = SimpleRandomSample(apisrs); +julia> srs = SimpleRandomSample(apisrs; weights = :pw); julia> freedman_diaconis(srs, :enroll) 18 @@ -140,9 +140,8 @@ julia> freedman_diaconis(dstrat, :enroll) freedman_diaconis(design::svydesign, var::Symbol) = freedman_diaconis(design.variables[!, var]) """ -```julia -svyhist(design, var, bins = freedman_diaconis; normalization = :density, weights = ones(size(design.variables, 1), ...) -``` + svyhist(design, var, bins = freedman_diaconis; normalization = :density, kwargs...) + Histogram plot of a survey design variable given by `var`. `bins` can be an `Integer` specifying the number of equal-width bins, @@ -151,32 +150,23 @@ the function used for calculating the number of bins. The possible functions are `sturges` and `freedman_diaconis`. The normalization can be set to `:none`, `:density`, `:probability` or `:pdf`. -See [Makie.hist](https://makie.juliaplots.org/stable/examples/plotting_functions/hist/) +See [AlgebraOfGraphics.histogram](https://docs.juliahub.com/AlgebraOfGraphics/CHIaw/0.4.9/generated/datatransformations/#AlgebraOfGraphics.histogram) for more information. -The `weights` argument should be a `Symbol` specifying a design variable. - For the complete argument list see [Makie.hist](https://makie.juliaplots.org/stable/examples/plotting_functions/hist/). -```julia -julia> apisrs = load_data("apisrs"); +!!! note -julia> srs = SimpleRandomSample(apisrs); + The `weights` argument should be a `Symbol` specifying a design variable. -julia> h = svyhist(srs, :enroll) +```@example histogram +apisrs = load_data("apisrs"); +srs = SimpleRandomSample(apisrs; weights = :pw); +h = svyhist(srs, :enroll) +save("hist.png", h); nothing # hide ``` -![](./assets/hist.png) - -The histogram plot also supports the old design. - -```julia -julia> apistrat = load_data("apistrat"); - -julia> dstrat = svydesign(data = apistrat, id = :1, strata = :stype, weights = :pw, fpc = :fpc); - -julia> h_old = svyhist(dstrat, :enroll) -``` +![](assets/hist.png) """ function svyhist(design::AbstractSurveyDesign, var::Symbol, bins::Union{Integer, AbstractVector} = freedman_diaconis(design, var); diff --git a/src/svymean.jl b/src/svymean.jl index 52fc0a58..fbcd4513 100644 --- a/src/svymean.jl +++ b/src/svymean.jl @@ -1,43 +1,49 @@ # SimpleRandomSample """ -Compute the mean of the survey variable `var`. + var_of_mean(x, design) -```jldoctest -julia> apisrs = load_data("apisrs"); - -julia> srs = SimpleRandomSample(apisrs); - -julia> svymean(:enroll, srs) -1×2 DataFrame - Row │ mean sem - │ Float64 Float64 -─────┼────────────────── - 1 │ 584.61 27.8212 -``` +Compute the variance of the mean for the variable `x`. """ function var_of_mean(x::Symbol, design::SimpleRandomSample) return design.fpc ./ design.sampsize .* var(design.data[!, x]) end -""" -Inner method for `svyby`. -""" function var_of_mean(x::AbstractVector, design::SimpleRandomSample) return design.fpc ./ design.sampsize .* var(x) end -function sem(x, design::SimpleRandomSample) +""" + sem(x, design) + +Compute the standard error of the mean for the variable `x`. +""" +function sem(x::Symbol, design::SimpleRandomSample) return sqrt(var_of_mean(x, design)) end -""" -Inner method for `svyby`. -""" function sem(x::AbstractVector, design::SimpleRandomSample) return sqrt(var_of_mean(x, design)) end +""" + svymean(x, design) + +Compute the mean and SEM of the survey variable `x`. + +```jldoctest +julia> apisrs = load_data("apisrs"); + +julia> srs = SimpleRandomSample(apisrs; weights = :pw); + +julia> svymean(:enroll, srs) +1×2 DataFrame + Row │ mean sem + │ Float64 Float64 +─────┼────────────────── + 1 │ 584.61 27.3684 +``` +""" function svymean(x::Symbol, design::SimpleRandomSample) if isa(x, Symbol) && isa(design.data[!, x], CategoricalArray) gdf = groupby(design.data, x) @@ -64,20 +70,22 @@ end """ Inner method for `svyby`. """ -function sem_svyby(x::AbstractVector, design::SimpleRandomSample, weights) - N = sum(weights) - Nd = length(x) - Pd = Nd / N - n = design.sampsize - n̄d = n * Pd - Ȳd = mean(x) - S2d = var(x) - Sd = sqrt(S2d) - CVd = Sd / Ȳd - V̂_Ȳd = ((1 ./ n̄d) - (1 ./ Nd)) * S2d * (1 + (1 - Pd) / CVd^2) - return sqrt(V̂_Ȳd) +function sem_svyby(x::AbstractVector, design::SimpleRandomSample, _) + # domain size + dsize = length(x) + # sample size + ssize = design.sampsize + # fpc + fpc = design.fpc + # variance of the mean + variance = (dsize / ssize)^(-2) / ssize * fpc * ((dsize - 1) / (ssize - 1)) * var(x) + # return the standard error + return sqrt(variance) end +""" +Inner method for `svyby`. +""" function svymean(x::AbstractVector, design::SimpleRandomSample, weights) return DataFrame(mean = mean(x), sem = sem_svyby(x, design::SimpleRandomSample, weights)) end diff --git a/src/svyplot.jl b/src/svyplot.jl index e4457088..259ac586 100644 --- a/src/svyplot.jl +++ b/src/svyplot.jl @@ -1,45 +1,26 @@ """ -``` svyplot(design, x, y; kwargs...) -``` + Scatter plot of survey design variables `x` and `y`. The plot takes into account the frequency weights specified by the user in the design. ```@example svyplot -julia> apisrs = load_data("apisrs"); - -julia> srs = SimpleRandomSample(apisrs); - -julia> s = svyplot(srs, :api99, :api00) +apisrs = load_data("apisrs"); +srs = SimpleRandomSample(apisrs; weights = :pw); +s = svyplot(srs, :api99, :api00) +save("scatter.png", s); nothing # hide ``` -# TODO: change the plot image and example -![](./assets/scatter.png) +![](assets/scatter.png) """ function svyplot(design::AbstractSurveyDesign, x::Symbol, y::Symbol; kwargs...) data(design.data) * mapping(x, y, markersize = :weights) * visual(Scatter, marker = '○') |> draw end """ -``` - svyplot(design, x, y; kwargs...) -``` -Scatter plot of survey design variables `x` and `y`. - -The plot takes into account the frequency weights specified by the user -in the design. - -```@example svyplot -julia> apistrat = load_data("apistrat"); - -julia> dstrat = svydesign(data = apistrat, id = :1, strata = :stype, weights = :pw, fpc = :fpc); - -julia> s = svyplot(dstrat, :api99, :api00) -``` - -![](./assets/scatter.png) +Method for `svydesign`. """ function svyplot(design::svydesign, x::Symbol, y::Symbol; kwargs...) data(design.variables) * mapping(x, y, markersize = :weights) * visual(Scatter, marker = '○') |> draw diff --git a/src/svyquantile.jl b/src/svyquantile.jl index 6db21157..e8fee87e 100644 --- a/src/svyquantile.jl +++ b/src/svyquantile.jl @@ -1,15 +1,16 @@ """ + svyquantile(var, design, q) Estimate quantiles for `SurveyDesign`s. ```jldoctest julia> apisrs = load_data("apisrs"); -julia> srs = SimpleRandomSample(apisrs); +julia> srs = SimpleRandomSample(apisrs; weights = :pw); julia> svyquantile(:enroll, srs, 0.5) 1×1 DataFrame Row │ 0.5th percentile - │ Float32 + │ Float64 ─────┼────────────────── 1 │ 453.0 ``` @@ -31,7 +32,7 @@ function svyquantile(var, design::StratifiedSample, q) end """ -Method for `svydesign` objects. +Method for `svydesign`. """ function svyquantile(var, design::svydesign, q) x = design.variables[!, var] diff --git a/src/svytotal.jl b/src/svytotal.jl index 3f248340..921c466d 100644 --- a/src/svytotal.jl +++ b/src/svytotal.jl @@ -1,5 +1,17 @@ # SimpleRandomSample +function var_of_total(x::Symbol, design::SimpleRandomSample) + return design.popsize^2 * design.fpc * var(design.data[!, x]) / design.sampsize +end + +function var_of_total(x::AbstractVector, design::SimpleRandomSample) + return design.popsize^2 * design.fpc / design.sampsize * var(x) +end + +function se_tot(x::Symbol, design::SimpleRandomSample) + return sqrt(var_of_total(x, design)) +end + """ svytotal(x, design) @@ -8,31 +20,16 @@ Estimate the population total for the variable specified by `x`. ```jldoctest julia> apisrs = load_data("apisrs"); -julia> srs = SimpleRandomSample(apisrs); +julia> srs = SimpleRandomSample(apisrs; weights = :pw); julia> svytotal(:enroll, srs) 1×2 DataFrame - Row │ total se_total - │ Float64 Float64 -─────┼──────────────────── - 1 │ 116922.0 5564.24 + Row │ total se_total + │ Float64 Float64 +─────┼───────────────────── + 1 │ 3.62107e6 1.6952e5 ``` """ -function var_of_total(x::Symbol, design::SimpleRandomSample) - return design.popsize^2 * design.fpc * var(design.data[!, x]) / design.sampsize -end - -""" -Inner method for `svyby`. -""" -function var_of_total(x::AbstractVector, design::SimpleRandomSample) - return design.popsize^2 * design.fpc / design.sampsize * var(x) -end - -function se_tot(x::Symbol, design::SimpleRandomSample) - return sqrt(var_of_total(x, design)) -end - function svytotal(x::Symbol, design::SimpleRandomSample) if isa(x, Symbol) && isa(design.data[!, x], CategoricalArray) gdf = groupby(design.data, x) @@ -49,10 +46,30 @@ function svytotal(x::Symbol, design::SimpleRandomSample) return DataFrame(total = total, se_total = se_tot(x, design::SimpleRandomSample)) end +""" +Inner method for `svyby`. +""" +function se_total_svyby(x::AbstractVector, design::SimpleRandomSample, _) + # vector of length equal to `sampsize` containing `x` and zeros + z = cat(zeros(design.sampsize - length(x)), x; dims=1) + # variance of the total + variance = design.popsize^2 / design.sampsize * design.fpc * var(z) + # return the standard error + return sqrt(variance) +end + +""" +Inner method for `svyby`. +""" +function svytotal(x::AbstractVector, design::SimpleRandomSample, weights) + total = wsum(x, weights) + return DataFrame(total = total, sem = se_total_svyby(x, design::SimpleRandomSample, weights)) +end # StratifiedSample function svytotal(x::Symbol, design::StratifiedSample) + # TODO: check if statement if x == design.strata gdf = groupby(design.data, x) return combine(gdf, :weights => sum => :Nₕ) @@ -74,7 +91,6 @@ function svytotal(x::Symbol, design::StratifiedSample) return DataFrame(grand_total = grand_total, SE = SE) end - function svytotal(x::Vector{Symbol}, design::SimpleRandomSample) totals_list = [] for i in x