xKDR · nadiaenh · Jul 14, 2023 · Jul 14, 2023 · Jul 14, 2023 · Jul 19, 2023
diff --git a/README.md b/README.md
@@ -92,7 +92,7 @@ to compute the standard errors.
 
 ```julia
 julia> bootsrs = bootweights(srs; replicates=1000)
-ReplicateDesign:
+ReplicateDesign{BootstrapReplicates}:
 data: 200×1047 DataFrame
 strata: none
 cluster: none

diff --git a/docs/src/man/replicate.md b/docs/src/man/replicate.md
@@ -4,9 +4,9 @@ Replicate weights are a method for estimating the standard errors of survey stat
 
 The basic idea behind replicate weights is to create multiple versions of the original sample weights, each with small, randomly generated perturbations. The multiple versions of the sample weights are then used to calculate the survey statistic of interest, such as the mean or total, on multiple replicate samples. The variance of the survey statistic is then estimated by computing the variance across the replicate samples.
 
-Currently, the Rao-Wu bootstrap[^1] is the only method in the package for generating replicate weights. 
+Currently, the Rao-Wu bootstrap[^1] and the Jackknife[^2] are the only methods in the package for generating replicate weights. In the future, the package will support additional types of inference methods, which will be passed when creating a `ReplicateDesign` object.
 
-The `bootweights` function of the package can be used to generate a `ReplicateDesign` from a `SurveyDesign`
+The `bootweights` function of the package can be used to generate a `ReplicateDesign` using the Rao-Wu bootstrap method from a `SurveyDesign`.
 For example: 
 ```@repl bootstrap
 using Survey
@@ -15,7 +15,16 @@ dstrat = SurveyDesign(apistrat; strata=:stype, weights=:pw)
 bstrat = bootweights(dstrat; replicates = 10)
 ```
 
-For each replicate, the `DataFrame` of `ReplicateDesign` has an additional column. The of the column is `replicate_` followed by the replicate number.  
+The `jackknifeweights` function of the package can be used to generate a `ReplicateDesign` using the Jackknife method from a `SurveyDesign`.
+For example: 
+```@repl bootstrap
+using Survey
+apistrat = load_data("apistrat")
+dstrat = SurveyDesign(apistrat; strata=:stype, weights=:pw)
+bstrat = jackknifeweights(dstrat; replicates = 10)
+```
+
+For each replicate, the `DataFrame` of `ReplicateDesign` has an additional column. The name of the column is `replicate_` followed by the replicate number.  
 
 ```@repl bootstrap
 names(bstrat.data)
@@ -38,4 +47,5 @@ For each replicate weight, the statistic is calculated using it instead of the w
 
 ## References
 
-[^1]: [Rust, Keith F., and J. N. K. Rao. "Variance estimation for complex surveys using replication techniques." Statistical methods in medical research 5.3 (1996): 283-310.](https://journals.sagepub.com/doi/abs/10.1177/096228029600500305?journalCode=smma)
+[^1]: [Rust, Keith F., and J. N. K. Rao. "Variance estimation for complex surveys using replication techniques." Statistical methods in medical research 5.3 (1996): 283-310.](https://journals.sagepub.com/doi/abs/10.1177/096228029600500305?journalCode=smma)
+[^2]: [Miller, Rupert G. “The Jackknife--A Review.” Biometrika 61, no. 1 (1974): 1–15. https://doi.org/10.2307/2334280.](https://www.jstor.org/stable/2334280)
diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl
@@ -84,10 +84,8 @@ struct SurveyDesign <: AbstractSurveyDesign
         else
             data[!, sampsize_labels] = fill(length(unique(data[!, cluster])), (nrow(data),))
         end
-        if isa(popsize, Symbol)
-            weights_labels = :_weights
-            data[!, weights_labels] = data[!, popsize] ./ data[!, sampsize_labels]
-        elseif isa(weights, Symbol)
+
+        if isa(weights, Symbol)
             if !(typeof(data[!, weights]) <: Vector{<:Real})
                 throw(
                     ArgumentError(
@@ -100,6 +98,9 @@ struct SurveyDesign <: AbstractSurveyDesign
                 popsize = :_popsize
                 data[!, popsize] = data[!, sampsize_labels] .* data[!, weights_labels]
             end
+        elseif isa(popsize, Symbol)
+                weights_labels = :_weights
+                data[!, weights_labels] = data[!, popsize] ./ data[!, sampsize_labels]
         else
             # neither popsize nor weights given
             weights_labels = :_weights

diff --git a/src/bootstrap.jl b/src/bootstrap.jl
@@ -54,45 +54,68 @@ function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwis
 end
 
 """
-    variance(x::Symbol, func::Function, design::ReplicateDesign{BootstrapReplicates})
+    variance(x::Union{Symbol, Vector{Symbol}}, func::Function, design::ReplicateDesign{BootstrapReplicates}, args...; kwargs...)
 
+Compute the standard error of the estimator(s) returned by `func` using the bootstrap method.
 
-Use replicate weights to compute the standard error of the estimated mean using the bootstrap method. The variance is calculated using the formula
+# Arguments
+- `x::Union{Symbol, Vector{Symbol}}`: Symbol or vector of symbols naming the variable(s) passed to `func`.
+- `func::Function`: Function that computes the estimator(s); it is called as `func(design.data, x, weights_column, args...; kwargs...)`, once with the design weights and once per replicate weight column.
+- `design::ReplicateDesign{BootstrapReplicates}`: Replicate design object.
+- `args...`: Additional positional arguments to be passed to `func`.
+- `kwargs...`: Additional keyword arguments to be passed to `func`.
+
+# Returns
+- `df`: DataFrame containing the estimate(s) in column `estimator` and the corresponding standard error(s) in column `SE`.
+
+The variance is calculated using the formula
 
 ```math
 \\hat{V}(\\hat{\\theta}) = \\dfrac{1}{R}\\sum_{i = 1}^R(\\theta_i - \\hat{\\theta})^2
 ```
 
 where above ``R`` is the number of replicate weights, ``\\theta_i`` is the estimator computed using the ``i``th set of replicate weights, and ``\\hat{\\theta}`` is the estimator computed using the original weights.
 
-```jldoctest
-julia> using Survey, StatsBase;
-
-julia> apiclus1 = load_data("apiclus1");
-
-julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw);
+# Examples
 
-julia> bclus1 = dclus1 |> bootweights;
+```jldoctest; setup = :(using Survey, StatsBase, DataFrames; apiclus1 = load_data("apiclus1"); dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw); bclus1 = dclus1 |> bootweights;)
 
-julia> weightedmean(x, y) = mean(x, weights(y));
+julia> mean(df::DataFrame, column, weights) = StatsBase.mean(df[!, column], StatsBase.weights(df[!, weights]));
 
-julia> variance(:api00, weightedmean, bclus1)
+julia> variance(:api00, mean, bclus1)
 1×2 DataFrame
  Row │ estimator  SE
      │ Float64    Float64
 ─────┼────────────────────
    1 │   644.169  23.4107
-
 ```
 """
-function variance(x::Symbol, func::Function, design::ReplicateDesign{BootstrapReplicates})
-    θ̂ = func(design.data[!, x], design.data[!, design.weights])
-    θ̂t = [
-        func(design.data[!, x], design.data[!, "replicate_"*string(i)]) for
-        i = 1:design.replicates
+function variance(x::Union{Symbol, Vector{Symbol}}, func::Function, design::ReplicateDesign{BootstrapReplicates}, args...; kwargs...)
+
+    # Compute the estimators
+    θs = func(design.data, x, design.weights, args...; kwargs...)
+
+    # Compute the estimators for each replicate
+    θts = [
+        func(design.data, x, "replicate_" * string(i), args...; kwargs...) for i in 1:design.replicates
     ]
-    variance = sum((θ̂t .- θ̂) .^ 2) / design.replicates
-    return DataFrame(estimator = θ̂, SE = sqrt(variance))
+
+    # Convert θs and θts to a vector if they are not already
+    θs = (θs isa Vector) ? θs : [θs]  
+    θts = (θts[1] isa Vector) ? θts : [θts]
+
+    # Calculate variances for each estimator
+    variance = Float64[]
+
+    for i in 1:length(θs)
+        θ = θs[i]
+        θt = θts[i]
+        θt = filter(!isnan, θt)
+        num = sum((θt .- θ) .^ 2) / length(θt)
+        push!(variance, num)
+    end
+
+    return DataFrame(estimator = θs, SE = sqrt.(variance))
 end
 
 function _bootweights_cluster_sorted!(cluster_sorted,

diff --git a/src/by.jl b/src/by.jl
@@ -1,28 +1,23 @@
-function bydomain(x::Symbol, domain, design::SurveyDesign, func::Function)
-    gdf = groupby(design.data, domain)
-    X = combine(gdf, [x, design.weights] => ((a, b) -> func(a, weights(b))) => :statistic)
-    return X
+function subset(group, design::SurveyDesign)
+    return SurveyDesign(DataFrame(group);clusters = design.cluster, strata = design.strata, popsize = design.popsize, weights = design.weights)   
+end 
+
+function subset(group, design::ReplicateDesign)
+    return ReplicateDesign{typeof(design.inference_method)}(DataFrame(group), design.replicate_weights;clusters = design.cluster, strata = design.strata, popsize = design.popsize, weights = design.weights)   
 end
 
-function bydomain(x::Symbol, domain, design::ReplicateDesign, func::Function)
+function bydomain(x::Union{Symbol, Vector{Symbol}}, domain,design::Union{SurveyDesign, ReplicateDesign}, func::Function, args...; kwargs...)
+    domain_names = unique(design.data[!, domain])
     gdf = groupby(design.data, domain)
-    nd = length(gdf)
-    X = combine(gdf, [x, design.weights] => ((a, b) -> func(a, weights(b))) => :statistic)
-    Xt_mat = Array{Float64,2}(undef, (nd, design.replicates))
-    for i = 1:design.replicates
-        Xt_mat[:, i] =
-            combine(
-                gdf,
-                [x, Symbol("replicate_" * string(i))] =>
-                    ((a, c) -> func(a, weights(c))) => :statistic,
-            ).statistic
+    domain_names = [join(collect(keys(gdf)[i]), "-") for i in 1:length(gdf)]
+    vars = DataFrame[]
+    for group in gdf
+        push!(vars, func(x, subset(group, design), args...; kwargs...))
     end
-    ses = Float64[]
-    for i = 1:nd
-        filtered_dx = filter(!isnan, Xt_mat[i, :] .- X.statistic[i])
-        push!(ses, sqrt(sum(filtered_dx .^ 2) / length(filtered_dx)))
+    estimates = vcat(vars...)
+    if isa(domain, Vector{Symbol})
+        domain = join(domain, "_")
     end
-    replace!(ses, NaN => 0)
-    X.SE = ses
-    return X
-end
+    estimates[!, domain] = domain_names
+    return estimates
+end
diff --git a/src/jackknife.jl b/src/jackknife.jl
@@ -94,66 +94,56 @@ Compute variance of column `x` for the given `func` using the Jackknife method.
 Above, ``\\hat{\\theta}`` represents the estimator computed using the original weights, and ``\\hat{\\theta_{(hj)}}`` represents the estimator computed from the replicate weights obtained when PSU ``j`` from cluster ``h`` is removed.
 
 # Examples
-```jldoctest
-julia> using Survey, StatsBase
-
-julia> apistrat = load_data("apistrat");
+```jldoctest; setup = :(using Survey, StatsBase, DataFrames; apistrat = load_data("apistrat"); dstrat = SurveyDesign(apistrat; strata=:stype, weights=:pw); rstrat = jackknifeweights(dstrat);)
 
-julia> dstrat = SurveyDesign(apistrat; strata=:stype, weights=:pw);
+julia> mean(df::DataFrame, column, weights) = StatsBase.mean(df[!, column], StatsBase.weights(df[!, weights]));
 
-julia> rstrat = jackknifeweights(dstrat)
-ReplicateDesign{JackknifeReplicates}:
-data: 200×244 DataFrame
-strata: stype
-    [E, E, E  …  M]
-cluster: none
-popsize: [4420.9999, 4420.9999, 4420.9999  …  1018.0]
-sampsize: [100, 100, 100  …  50]
-weights: [44.21, 44.21, 44.21  …  20.36]
-allprobs: [0.0226, 0.0226, 0.0226  …  0.0491]
-type: jackknife
-replicates: 200
-
-julia> weightedmean(x, y) = mean(x, weights(y));
-
-julia> variance(:api00, weightedmean, rstrat)
+julia> variance(:api00, mean, rstrat)
 1×2 DataFrame
  Row │ estimator  SE
      │ Float64    Float64
 ─────┼────────────────────
    1 │   662.287  9.53613
-
 ```
 # Reference
 pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010)
 """
-function variance(x::Symbol, func::Function, design::ReplicateDesign{JackknifeReplicates})
+function variance(x::Union{Symbol, Vector{Symbol}}, func::Function, design::ReplicateDesign{JackknifeReplicates}, args...; kwargs...)
+
     df = design.data
-    # sort!(df, [design.strata, design.cluster])
     stratified_gdf = groupby(df, design.strata)
 
     # estimator from original weights
-    θ = func(df[!, x], df[!, design.weights])
+    θs = func(design.data, x, design.weights, args...; kwargs...)
 
-    variance = 0
+    # ensure that θs is a vector
+    θs = (θs isa Vector) ? θs : [θs]  
+
+    variance = zeros(length(θs))
     replicate_index = 1
+
     for subgroup in stratified_gdf
+
         psus_in_stratum = unique(subgroup[!, design.cluster])
         nh = length(psus_in_stratum)
-        cluster_variance = 0
+        cluster_variance = zeros(length(θs))
+
         for psu in psus_in_stratum
-            # get replicate weights corresponding to current stratum and psu
-            rep_weights = df[!, "replicate_"*string(replicate_index)]
 
             # estimator from replicate weights
-            θhj = func(df[!, x], rep_weights)
+            θhjs = func(design.data, x, "replicate_" * string(replicate_index), args...; kwargs...)
+
+            # update the cluster variance for each estimator
+            for i in 1:length(θs)
+                cluster_variance[i] += ((nh - 1)/nh) * (θhjs[i] - θs[i])^2
+            end
 
-            cluster_variance += ((nh - 1)/nh)*(θhj - θ)^2
             replicate_index += 1
         end
-        variance += cluster_variance
-    end
 
-    return DataFrame(estimator = θ, SE = sqrt(variance))
-end
+        # update the overall variance
+        variance .+= cluster_variance
+    end
 
+    return DataFrame(estimator = θs, SE = sqrt.(variance))
+end