From 3e31b1173cfd9a619841cde6463854abfacd96f9 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Fri, 26 Aug 2022 17:07:04 +0300 Subject: [PATCH 1/8] Use three argument method for `Base.show` as suggested in the documentation --- src/SurveyDesign.jl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index 9c4724e6..7da998a8 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -37,8 +37,8 @@ end # `show` method for printing information about a `SimpleRandomSample` after construction # TODO: change `show` to 3 argument method -function Base.show(io::IO, design::SimpleRandomSample) - printstyled("Simple Random Sample:\n") +function Base.show(io::IO, ::MIME"text/plain", design::SimpleRandomSample) + printstyled("Simple Random Sample:\n"; bold = true) printstyled("data: "; bold = true) print(size(design.data)[1], "x", size(design.data)[2], " DataFrame") printstyled("\nprobs: "; bold = true) @@ -71,8 +71,8 @@ struct StratifiedSample <: AbstractSurveyDesign end # `show` method for printing information about a `StratifiedSample` after construction -function Base.show(io::IO, design::StratifiedSample) - printstyled("Stratified Sample:\n") +function Base.show(io::IO, ::MIME"text/plain", design::StratifiedSample) + printstyled("Stratified Sample:\n"; bold = true) printstyled("data: "; bold = true) print(size(design.data)[1], "x", size(design.data)[2], " DataFrame") printstyled("\nprobs: "; bold = true) @@ -96,8 +96,8 @@ struct ClusterSample <: AbstractSurveyDesign end # `show` method for printing information about a `ClusterSample` after construction -function Base.show(io::IO, design::ClusterSample) - printstyled("Simple Random Sample:\n") +function Base.show(io::IO, ::MIME"text/plain", design::ClusterSample) + printstyled("Simple Random Sample:\n"; bold = true) printstyled("data: "; bold = true) print(size(design.data)[1], "x", size(design.data)[2], " DataFrame") printstyled("\nprobs: "; bold = 
true) From f4a363ddd532bbf6d7b1c0303dedc8d3d3541978 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Fri, 26 Aug 2022 17:07:25 +0300 Subject: [PATCH 2/8] Make documentation tests run for all branches --- .github/workflows/documentation.yml | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml index 6a12610d..4d0f4044 100644 --- a/.github/workflows/documentation.yml +++ b/.github/workflows/documentation.yml @@ -1,12 +1,6 @@ name: Documentation -on: - push: - branches: - - main - pull_request: - branches: - - main +on: [push, pull_request] # can add tags if needed jobs: build: From 295708e75405fbcc157201b2e28244098a160e17 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Fri, 26 Aug 2022 17:23:25 +0300 Subject: [PATCH 3/8] Add `ClusterSample` constructor and change `print_short` function slightly --- src/Survey.jl | 2 +- src/SurveyDesign.jl | 28 ++++++++++++++++++++-------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/src/Survey.jl b/src/Survey.jl index 5434133b..1ae73ce6 100644 --- a/src/Survey.jl +++ b/src/Survey.jl @@ -23,7 +23,7 @@ include("dimnames.jl") include("svyboxplot.jl") export load_data -export AbstractSurveyDesign, SimpleRandomSample, StratifiedSample +export AbstractSurveyDesign, SimpleRandomSample, StratifiedSample, ClusterSample export svydesign export svyby, svyglm export dim, colnames, dimnames diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index 7da998a8..d24ea70e 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -3,7 +3,7 @@ function print_short(x::AbstractVector) if length(x) < 3 print(x) else - print( x[1], ", ", x[2], ", ", x[3], " ...", " (length = ", length(x), ")") + print( x[1], ", ", x[2], ", ", x[3], " ... 
", last(x)) end end @@ -36,7 +36,6 @@ struct SimpleRandomSample <: AbstractSurveyDesign end # `show` method for printing information about a `SimpleRandomSample` after construction -# TODO: change `show` to 3 argument method function Base.show(io::IO, ::MIME"text/plain", design::SimpleRandomSample) printstyled("Simple Random Sample:\n"; bold = true) printstyled("data: "; bold = true) @@ -45,9 +44,9 @@ function Base.show(io::IO, ::MIME"text/plain", design::SimpleRandomSample) print_short(design.data.probs) # TODO: change fpc printstyled("\nfpc: "; bold = true) - print("\n popsize: ") + printstyled("\n popsize: "; bold = true) print_short(design.data.popsize) - print("\n sampsize: ") + printstyled("\n sampsize: "; bold = true) print_short(design.data.sampsize) end @@ -81,9 +80,9 @@ function Base.show(io::IO, ::MIME"text/plain", design::StratifiedSample) print_short(design.data.strata) # TODO: change fpc printstyled("\nfpc: "; bold = true) - print("\n popsize: ") + printstyled("\n popsize: "; bold = true) print_short(design.data.popsize) - print("\n sampsize: ") + printstyled("\n sampsize: "; bold = true) print_short(design.data.sampsize) end @@ -93,6 +92,17 @@ clustering. 
""" struct ClusterSample <: AbstractSurveyDesign data::DataFrame + function ClusterSample(data::DataFrame, id::Symbol; weights = ones(nrow(data)), probs = 1 ./ weights) + # add frequency weights, probability weights and sample size columns + data[!, :weights] = weights + data[!, :probs] = probs + # TODO: change `sampsize` and `popsize` + data[!, :popsize] = repeat([nrow(data)], nrow(data)) + data[!, :sampsize] = repeat([nrow(data)], nrow(data)) + data[!, :id] = data[!, id] + + new(data) + end end # `show` method for printing information about a `ClusterSample` after construction @@ -102,10 +112,12 @@ function Base.show(io::IO, ::MIME"text/plain", design::ClusterSample) print(size(design.data)[1], "x", size(design.data)[2], " DataFrame") printstyled("\nprobs: "; bold = true) print_short(design.data.probs) + printstyled("\nid: "; bold = true) + print_short(design.data.id) # TODO: change fpc printstyled("\nfpc: "; bold = true) - print("\n popsize: ") + printstyled("\n popsize: "; bold = true) print_short(design.data.popsize) - print("\n sampsize: ") + printstyled("\n sampsize: "; bold = true) print_short(design.data.sampsize) end From 4ca93015fca3a882fdc895d64c41056399929149 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Fri, 26 Aug 2022 17:59:05 +0300 Subject: [PATCH 4/8] Improve documentation for `AbstractSurveyDesign` --- src/SurveyDesign.jl | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index d24ea70e..702d7b74 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -8,15 +8,22 @@ function print_short(x::AbstractVector) end """ -Supertype for every survey design type: `SimpleRandomSample`, `ClusterSample` -and `StratifiedSample`. + AbstractSurveyDesign + +Supertype for survey designs. `SimpleRandomSample`, `ClusterSample` +and `StratifiedSample` are subtypes of this. + +!!! note + When passing data to a survey design, the user should make a copy of the + data. 
The constructors modify the data passed as argument. """ abstract type AbstractSurveyDesign end """ + SimpleRandomSample + A `SimpleRandomSample` object contains survey design information needed to -analyse surveys sampled by simple random sampling. -TODO: documentation about user making a copy +analyse simple random sample surveys. TODO: add fpc """ struct SimpleRandomSample <: AbstractSurveyDesign From 5355a4bca8929b8484a5d452daedade24863235d Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Tue, 30 Aug 2022 20:35:28 +0300 Subject: [PATCH 5/8] Fix doctest to account for `Float64` compression --- src/svydesign.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/svydesign.jl b/src/svydesign.jl index 161fef32..dd0e6d93 100644 --- a/src/svydesign.jl +++ b/src/svydesign.jl @@ -18,7 +18,7 @@ Survey Design: variables: 183x45 DataFrame id: dnum strata: 1, 1, 1 ... 1 -probs: 0.029544719150814778, 0.029544719150814778, 0.029544719150814778 ... 0.029544719150814778 +probs: 0.0295, 0.0295, 0.0295 ... 0.0295 fpc: popsize: 757, 757, 757 ... 757 sampsize: 183, 183, 183 ... 183 From 6030a81ed7717951284d82329c50f27f703ed5c0 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Tue, 30 Aug 2022 20:58:41 +0300 Subject: [PATCH 6/8] Restructure the documentation --- docs/src/examples.md | 87 ++++++++++++++++++++++++++++++++++++++++++-- docs/src/index.md | 13 ++----- 2 files changed, 88 insertions(+), 12 deletions(-) diff --git a/docs/src/examples.md b/docs/src/examples.md index 1776ce2d..211ac46b 100644 --- a/docs/src/examples.md +++ b/docs/src/examples.md @@ -1,7 +1,88 @@ # Examples -The following examples use the Academic Performance Index (API) dataset for Californian schools. +The following examples use the [Academic Performance Index](https://r-survey.r-forge.r-project.org/survey/html/api.html) (API) dataset for Californian schools. 
-```@docs -svyby(formula::Symbol, by, design::svydesign, func::Function, params = []) +## Simple Random Sample + +The most basic survey design is a simple random sample design. A +[`SimpleRandomSample`](@ref) can be instantiated by calling the constructor: + +```julia +julia> apisrs = load_data("apisrs"); + +julia> srs = SimpleRandomSample(apisrs) +Simple Random Sample: +data: 200x42 DataFrame +probs: 1.0, 1.0, 1.0 ... 1.0 +fpc: 1 + popsize: 200 + sampsize: 200 +``` + +With a `SimpleRandomSample` (as well as with any subtype of [`AbstractSurveyDesign`](@ref)) +it is possible to calculate estimates of the mean or population total for a given variable, +along with the corresponding standard errors. + +```julia +julia> svymean(:api00, srs) +1×2 DataFrame + Row │ mean sem + │ Float64 Float64 +─────┼────────────────── + 1 │ 656.585 9.40277 + +julia> svytotal(:api00, srs) +1×2 DataFrame + Row │ total se_total + │ Float64 Float64 +─────┼──────────────────── + 1 │ 131317.0 1880.55 +``` + +The complexity of the design can be increased by specifying frequency or probability +weights, the population or sample size and whether or not to account for finite +population correction (fpc). By default the weights are equal to one, the sample size is +equal to the number of rows in `data` and the fpc is ignored. The population size is calculated +from the weights. + +```julia +julia> wsrs = SimpleRandomSample(apisrs; weights = :pw) +Simple Random Sample: +data: 200x42 DataFrame +weights: 31.0, 31.0, 31.0 ... 31.0 +probs: 0.0323, 0.0323, 0.0323 ... 0.0323 +fpc: 1 + popsize: 6194 + sampsize: 200 + +julia> fpcwsrs = SimpleRandomSample(apisrs; weights = :pw, ignorefpc = false) +Simple Random Sample: +data: 200x42 DataFrame +weights: 31.0, 31.0, 31.0 ... 31.0 +probs: 0.0323, 0.0323, 0.0323 ... 0.0323 +fpc: 0.968 + popsize: 6194 + sampsize: 200 +``` + +When `ignorefpc` is set to `false` the `fpc` is calculated from the sample and population +sizes.
+ +The statistics for mean and population total are different when the design takes weights +and fpc into account: + +```julia +julia> svymean(:api00, fpcwsrs) +1×2 DataFrame + Row │ mean sem + │ Float64 Float64 +─────┼────────────────── + 1 │ 656.585 9.24972 + +julia> svytotal(:api00, fpcwsrs) +1×2 DataFrame + Row │ total se_total + │ Float64 Float64 +─────┼───────────────────── + 1 │ 4.06689e6 57292.8 ``` diff --git a/docs/src/index.md b/docs/src/index.md index 72f72983..fe4f2363 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -1,17 +1,12 @@ -```@meta -CurrentModule = Survey -``` - -# Survey +# Survey.jl This package is the Julia implementation of the [Survey package in R](https://cran.r-project.org/web/packages/survey/index.html) developed by [Professor Thomas Lumley](https://www.stat.auckland.ac.nz/people/tlum005). -## The need for moving the code to Julia. +## Introduction At [xKDR](https://xkdr.org/) we processed millions of records from household surveys using the survey package in R. This process took hours of computing time. By implementing the code in Julia, we are able to do the processing in seconds. In this package we have implemented the functions `svymean`, `svyquantile` and `svysum`. We have kept the syntax between the two packages similar so that we can easily move our existing code to the new language. -Documentation for [Survey](https://github.com/Survey.jl). 
+## API -```@autodocs -Modules = [Survey] +```@index ``` From 67aeb5ec1c368b56fd9e97d2d12f26d85764cf94 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Mon, 5 Sep 2022 20:56:29 +0300 Subject: [PATCH 7/8] Add type specification (`Symbol`) for first argument of `svymean` --- src/svymean.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/svymean.jl b/src/svymean.jl index 0fb1c7f3..a32c883d 100644 --- a/src/svymean.jl +++ b/src/svymean.jl @@ -22,7 +22,7 @@ function var_of_mean(x::AbstractVector, design::SimpleRandomSample) return design.fpc / design.sampsize * var(x) end -function sem(x, design::SimpleRandomSample) +function sem(x::Symbol, design::SimpleRandomSample) return sqrt(var_of_mean(x, design)) end @@ -30,7 +30,7 @@ function sem(x::AbstractVector, design::SimpleRandomSample) return sqrt(var_of_mean(x, design)) end -function svymean(x, design::SimpleRandomSample) +function svymean(x::Symbol, design::SimpleRandomSample) return DataFrame(mean = mean(design.data[!, x]), sem = sem(x, design::SimpleRandomSample)) end From 62b60aad31ade2ba2c11664d7a81b3b495103328 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Mon, 5 Sep 2022 20:57:17 +0300 Subject: [PATCH 8/8] Add Index and API to the documentation, incomplete --- docs/src/index.md | 26 +++++++++++++++++++++++++- src/svyboxplot.jl | 3 +-- src/svyplot.jl | 3 +-- 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/docs/src/index.md b/docs/src/index.md index fe4f2363..93550619 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -6,7 +6,31 @@ This package is the Julia implementation of the [Survey package in R](https://cr At [xKDR](https://xkdr.org/) we processed millions of records from household surveys using the survey package in R. This process took hours of computing time. By implementing the code in Julia, we are able to do the processing in seconds. In this package we have implemented the functions `svymean`, `svyquantile` and `svysum`. 
We have kept the syntax between the two packages similar so that we can easily move our existing code to the new language. -## API +## Index ```@index +Modules = [Survey] +Private = false +``` + +## API +```@docs +load_data +AbstractSurveyDesign +SimpleRandomSample +StratifiedSample +ClusterSample +dim(design::AbstractSurveyDesign) +colnames(design::AbstractSurveyDesign) +dimnames(design::AbstractSurveyDesign) +svymean(x::Symbol, design::SimpleRandomSample) +svytotal(x::Symbol, design::SimpleRandomSample) +svyby +svyplot(design::AbstractSurveyDesign, x::Symbol, y::Symbol; kwargs...) +svyhist(design::AbstractSurveyDesign, var::Symbol, + bins::Union{Integer, AbstractVector} = freedman_diaconis(design, var); + normalization = :density, + kwargs... + ) +svyboxplot(design::AbstractSurveyDesign, x::Symbol, y::Symbol; kwargs...) ``` diff --git a/src/svyboxplot.jl b/src/svyboxplot.jl index baa78512..54ab703d 100644 --- a/src/svyboxplot.jl +++ b/src/svyboxplot.jl @@ -1,7 +1,6 @@ """ -``` svyboxplot(design, x, y; kwargs...) -``` + Box plot of survey design variable `y` grouped by column `x`. Weights can be specified by a Symbol using the keyword argument `weights`. diff --git a/src/svyplot.jl b/src/svyplot.jl index e4457088..0e95ba0c 100644 --- a/src/svyplot.jl +++ b/src/svyplot.jl @@ -1,7 +1,6 @@ """ -``` svyplot(design, x, y; kwargs...) -``` + Scatter plot of survey design variables `x` and `y`. The plot takes into account the frequency weights specified by the user