xKDR · iuliadmtru · Aug 26, 2022 · Aug 26, 2022 · Aug 26, 2022 · Aug 26, 2022
diff --git a/docs/src/examples.md b/docs/src/examples.md
@@ -1,7 +1,88 @@
 # Examples
 
-The following examples use the Academic Performance Index (API) dataset for Californian schools.
+The following examples use the [Academic Performance Index](https://r-survey.r-forge.r-project.org/survey/html/api.html) (API) dataset for Californian schools.
 
-```@docs
-svyby(formula::Symbol, by, design::svydesign, func::Function, params = [])
+## Simple Random Sample
+
+The most basic survey design is a simple random sample design. A
+[`SimpleRandomSample`](@ref) can be instantiated by calling the constructor:
+
+```julia
+julia> apisrs = load_data("apisrs");
+
+julia> srs = SimpleRandomSample(apisrs)
+Simple Random Sample:
+data: 200x42 DataFrame
+probs: 1.0, 1.0, 1.0 ... 1.0
+fpc: 1
+    popsize: 200
+    sampsize: 200
+```
+
+With a `SimpleRandomSample` (as well as with any subtype of [`AbstractSurveyDesign`](@ref))
+it is possible to calculate estimates of the mean or population total for a given variable,
+along with the corresponding standard errors.
+
+```julia
+julia> svymean(:api00, srs)
+1×2 DataFrame
+ Row │ mean     sem
+     │ Float64  Float64
+─────┼──────────────────
+   1 │ 656.585  9.40277
+
+julia> svytotal(:api00, srs)
+1×2 DataFrame
+ Row │ total     se_total
+     │ Float64   Float64
+─────┼────────────────────
+   1 │ 131317.0   1880.55
+```
+
+The complexity of the design can be increased by specifying frequency or probability
+weights, the population or sample size and whether or not to account for finite
+population correction (fpc). By default the weights are equal to one, the sample size is
+equal to the number of rows in `data`, and the fpc is ignored. The population size is
+calculated from the weights.
+
+```julia
+julia> wsrs = SimpleRandomSample(apisrs; weights = :pw)
+Simple Random Sample:
+data: 200x42 DataFrame
+weights: 31.0, 31.0, 31.0 ... 31.0
+probs: 0.0323, 0.0323, 0.0323 ... 0.0323
+fpc: 1
+    popsize: 6194
+    sampsize: 200
+
+julia> fpcwsrs = SimpleRandomSample(apisrs; weights = :pw, ignorefpc = false)
+Simple Random Sample:
+data: 200x42 DataFrame
+weights: 31.0, 31.0, 31.0 ... 31.0
+probs: 0.0323, 0.0323, 0.0323 ... 0.0323
+fpc: 0.968
+    popsize: 6194
+    sampsize: 200
+```
+
+When `ignorefpc` is set to `false` the `fpc` is calculated from the sample and population
+sizes.
+
+The estimated mean stays the same, but its standard error and the population total
+change when the design takes weights and fpc into account:
+
+```julia
+julia> svymean(:api00, fpcwsrs)
+1×2 DataFrame
+ Row │ mean     sem
+     │ Float64  Float64
+─────┼──────────────────
+   1 │ 656.585  9.24972
+
+julia> svytotal(:api00, fpcwsrs)
+1×2 DataFrame
+ Row │ total      se_total
+     │ Float64    Float64
+─────┼─────────────────────
+   1 │ 4.06689e6   57292.8
 ```
diff --git a/docs/src/index.md b/docs/src/index.md
@@ -1,17 +1,12 @@
-```@meta
-CurrentModule = Survey
-```
-
-# Survey
+# Survey.jl
 
 This package is the Julia implementation of the [Survey package in R](https://cran.r-project.org/web/packages/survey/index.html) developed by [Professor Thomas Lumley](https://www.stat.auckland.ac.nz/people/tlum005).
 
-## The need for moving the code to Julia.
+## Introduction
 
 At [xKDR](https://xkdr.org/) we processed millions of records from household surveys using the survey package in R. This process took hours of computing time. By implementing the code in Julia, we are able to do the processing in seconds. In this package we have implemented the functions `svymean`, `svyquantile` and `svysum`. We have kept the syntax between the two packages similar so that we can easily move our existing code to the new language.
 
-Documentation for [Survey](https://github.com/Survey.jl).
+## API
 
-```@autodocs
-Modules = [Survey]
+```@index
 ```
diff --git a/src/Survey.jl b/src/Survey.jl
@@ -22,7 +22,7 @@ include("dimnames.jl")
 include("svyboxplot.jl")
 
 export load_data
-export AbstractSurveyDesign, SimpleRandomSample, StratifiedSample
+export AbstractSurveyDesign, SimpleRandomSample, StratifiedSample, ClusterSample
 export svydesign
 export svyglm
 export dim, colnames, dimnames

diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl
@@ -13,11 +13,14 @@ function print_short(x)
 end
 
 """
-Supertype for every survey design type: `SimpleRandomSample`, `ClusterSample`
-and `StratifiedSample`.
+    AbstractSurveyDesign
 
-The data to a survey constructor is modified. To avoid this pass a copy of the data
-instead of the original.
+Supertype for survey designs. `SimpleRandomSample`, `ClusterSample`
+and `StratifiedSample` are subtypes of this.
+
+!!! note
+    When passing data to a survey design, the user should make a copy of the
+    data. The constructors modify the data passed as argument.
 """
 abstract type AbstractSurveyDesign end
 
@@ -113,7 +116,7 @@ struct StratifiedSample <: AbstractSurveyDesign
 end
 
 # `show` method for printing information about a `StratifiedSample` after construction
-function Base.show(io::IO, design::StratifiedSample)
+function Base.show(io::IO, ::MIME"text/plain", design::StratifiedSample)
     printstyled("Stratified Sample:\n"; bold = true)
     printstyled("data: "; bold = true)
     print(size(design.data, 1), "x", size(design.data, 2), " DataFrame")
@@ -138,11 +141,22 @@ Survey design sampled by clustering.
 """
 struct ClusterSample <: AbstractSurveyDesign
     data::DataFrame
+    function ClusterSample(data::DataFrame, id::Symbol; weights = ones(nrow(data)), probs = 1 ./ weights)
+        # store the weights and probabilities as columns of `data` (this mutates the input)
+        data[!, :weights] = weights
+        data[!, :probs] = probs
+        # TODO: change `sampsize` and `popsize`; both are placeholders set to the row count
+        data[!, :popsize] = repeat([nrow(data)], nrow(data))
+        data[!, :sampsize] = repeat([nrow(data)], nrow(data))
+        data[!, :id] = data[!, id]  # expose the column named by `id` under the name `:id`
+
+        new(data)
+    end
 end
 
 # `show` method for printing information about a `ClusterSample` after construction
-function Base.show(io::IO, design::ClusterSample)
-    printstyled("Cluster Sample:\n"; bold = true)
+function Base.show(io::IO, ::MIME"text/plain", design::ClusterSample)
+    printstyled("Simple Random Sample:\n"; bold = true)
     printstyled("data: "; bold = true)
     print(size(design.data, 1), "x", size(design.data, 2), " DataFrame")
     printstyled("\nweights: "; bold = true)

diff --git a/src/svydesign.jl b/src/svydesign.jl
@@ -18,7 +18,7 @@ Survey Design:
 variables: 183x45 DataFrame
 id: dnum
 strata: 1, 1, 1 ... 1
-probs: 0.029544719150814778, 0.029544719150814778, 0.029544719150814778 ... 0.029544719150814778
+probs: 0.0295, 0.0295, 0.0295 ... 0.0295
 fpc:
     popsize: 757, 757, 757 ... 757
     sampsize: 183, 183, 183 ... 183