tweaks

1. change `remove_na` to `show_na` and set default to conditionally show the missing category if any NA are present 2. use the Wilson confidence interval rather than the normal approximation to avoid CIs that escape the [0, 1] bounds 3. namespace ggplot2 functions 4. add arguments for the column and error bar colors, defaulting to blue hues for each
easystats · Aug 31, 2023 · e04947a · e04947a
1 parent 439c2a4
commit e04947a
Show file tree

Hide file tree

Showing 2 changed files with 104 additions and 39 deletions.
diff --git a/R/plot.dw_data_tabulate.R b/R/plot.dw_data_tabulate.R
@@ -1,29 +1,50 @@
 #' Plot tabulated data.
 #'
 #' @param x Object created by `datawizard::data_tabulate()`.
-#' @param value_lab Boolean. Should values and percentages be displayed at the
-#'  top of each bar.
-#' @param remove_na Boolean. Should missing values be dropped?
-#' @param na_label The label given to missing values (only when
-#'  `remove_na = FALSE`).
-#' @param error_bar Boolean. Should error bars be displayed?
+#' @param label_values Logical. Should values and percentages be displayed at the
+#'   top of each bar.
+#' @param show_na Should missing values be dropped? Can be `"if_any"` (default) to show
+#'   the missing category only if any missing values are present, `"always"` to
+#'   always show the missing category, or `"never"` to never show the missing
+#'   category.
+#' @param na_label The label given to missing values when they are shown.
+#' @param error_bar Logical. Should error bars be displayed?
+#'   If `TRUE`, confidence intervals computed using the Wilson method are shown.
+#'   See Brown et al. (2001) for details.
+#' @param ci Confidence Interval (CI) level. Default to `0.95` (⁠95%⁠).
+#' @param fill_col Color to use for category columns (default: "#87CEFA").
+#' @param color_error_bar Color to use for error bars (default: "#607B8B").
 #' @param ... Unused
 #'
+#' @references
+#' Brown, L. D., Cai, T. T., & DasGupta, A. (2001).
+#' Interval estimation for a binomial proportion.
+#' _Statistical Science, 16_(2), 101–133. \doi{10.1214/ss/1009213286}
+#'
 #' @rdname plot.dw_data_tabulate
 #' @export
 
-plot.dw_data_tabulates <- function(x, value_lab = TRUE, remove_na = FALSE,
-                                   na_label = "(Missing)", error_bar = TRUE,
+plot.dw_data_tabulates <- function(x, label_values = TRUE,
+                                   show_na = c("if_any", "always", "never"),
+                                   na_label = "(Missing)",
+                                   error_bar = TRUE, ci = .95,
+                                   fill_col = "#87CEFA",
+                                   color_error_bar = "#607B8B",
                                    ...) {
+  show_na <- match.arg(show_na, choices = c("if_any", "always", "never"))
   if (length(x) == 1) {
     plot.dw_data_tabulate(
-      x[[1]], value_lab = value_lab, remove_na = remove_na,
-      na_label = na_label, error_bar = error_bar
+      x[[1]], label_values = label_values,
+      show_na = show_na, na_label = na_label,
+      error_bar = error_bar, ci = ci,
+      fill_col = fill_col, color_error_bar = color_error_bar
     )
   } else {
     lapply(x, plot.dw_data_tabulate,
-           value_lab = value_lab, remove_na = remove_na,
-           na_label = na_label, error_bar = error_bar
+           label_values = label_values,
+           show_na = show_na, na_label = na_label,
+           error_bar = error_bar, ci = ci,
+           fill_col = fill_col, color_error_bar = color_error_bar
     )
   }
 }
@@ -32,12 +53,25 @@ plot.dw_data_tabulates <- function(x, value_lab = TRUE, remove_na = FALSE,
 #'
 #' @export
 
-plot.dw_data_tabulate <- function(x, value_lab = TRUE, remove_na = FALSE,
-                                  na_label = "(Missing)", error_bar = TRUE,
+plot.dw_data_tabulate <- function(x, label_values = TRUE,
+                                  show_na = c("if_any", "always", "never"),
+                                  na_label = "(Missing)",
+                                  error_bar = TRUE, ci = .95,
+                                  fill_col = "#87CEFA",
+                                  color_error_bar = "#607B8B",
                                   ...) {
+  show_na <- match.arg(show_na, choices = c("if_any", "always", "never"))
   dat <- as.data.frame(x)
 
-  if (isTRUE(remove_na)) {
+  if (show_na == "if_any") {
+    if (any(is.na(dat$Value))) {
+      show_na <- ifelse(dat[is.na(dat$Value), "N"] > 0, "always", "never")
+    } else {
+      show_na <- "never"
+    }
+  }
+
+  if (show_na == "never") {
     dat <- dat[!is.na(dat$Value), ]
     dat$output <- dat[[which(startsWith(names(dat), "Valid"))]]
   } else {
@@ -54,40 +88,50 @@ plot.dw_data_tabulate <- function(x, value_lab = TRUE, remove_na = FALSE,
 
   if (isTRUE(error_bar)) {
     total_n <- sum(dat$N)
-    rel_frq <- dat$output / 100
-    ci <- 1.96 * suppressWarnings(sqrt(rel_frq * (1 - rel_frq) / total_n))
-    dat$upper.ci <- total_n * (rel_frq + ci)
-    dat$lower.ci <- total_n * (rel_frq - ci)
+    props <- dat$output / 100
+    dat <- cbind(dat, CI = ci, .wilson_ci(prop = props, total_n = total_n, ci = ci) * total_n)
     dat$label <- paste0(dat$N, " (", round(dat$output, 2), "%)")
   } else {
     dat$label <- paste0(dat$N, "\n(", round(dat$output, 2), "%)")
   }
 
-  out <- ggplot(dat, aes(x = .data$Value, y = .data$N)) +
-    geom_col() +
-    labs(title = unique(dat$Variable)) +
+  out <- ggplot2::ggplot(dat) +
+    ggplot2::aes(x = .data$Value, y = .data$N) +
+    ggplot2::geom_col(fill = fill_col) +
+    ggplot2::labs(title = unique(dat$Variable)) +
     theme_modern()
 
-  if (isTRUE(value_lab)) {
+  if (isTRUE(label_values)) {
     if (isTRUE(error_bar)) {
       out <- out +
-        geom_text(aes(label = .data$label), vjust = -1, hjust = 1.2) +
-        ylim(c(0, max(dat$N) * 1.5))
+        ggplot2::geom_text(ggplot2::aes(label = .data$label), vjust = -1, hjust = 1.2) +
+        ggplot2::coord_cartesian(ylim = c(0, max(dat$CI_high)))
     } else {
       out <- out +
-        geom_text(aes(label = .data$label), vjust = -0.5) +
-        ylim(c(0, max(dat$N) * 1.2))
+        ggplot2::geom_text(ggplot2::aes(label = .data$label), vjust = -0.5) +
+        ggplot2::coord_cartesian(ylim = c(0, max(dat$N) * 1.2))
     }
   }
 
   # add confidence intervals for frequencies
   if (isTRUE(error_bar)) {
     out <- out +
-      geom_errorbar(
-        aes(ymin = .data$lower.ci, ymax = .data$upper.ci),
-        width = 0.5, color = "darkblue"
+      ggplot2::geom_linerange(
+        ggplot2::aes(ymin = .data$CI_low, ymax = .data$CI_high),
+        color = color_error_bar
       )
   }
 
   out
 }
+
+.wilson_ci <- function(prop, total_n, ci = .95) {
+  z <- qnorm((1 - ci) / 2, lower.tail = FALSE)
+  z2 <- z^2
+  p1 <- prop + 0.5 * z2 / total_n
+  p2 <- z * sqrt((prop * (1 - prop) + 0.25 * z2 / total_n) / total_n)
+  p3 <- 1 + z2 / total_n
+  CI_low <- (p1 - p2) / p3
+  CI_high <- (p1 + p2) / p3
+  return(data.frame(CI_low = CI_low, CI_high = CI_high))
+}
diff --git a/man/plot.dw_data_tabulate.Rd b/man/plot.dw_data_tabulate.Rd