-
Notifications
You must be signed in to change notification settings - Fork 190
/
ggcoefstats.R
493 lines (437 loc) · 17.8 KB
/
ggcoefstats.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
#' @title Dot-and-whisker plots for regression analyses
#' @name ggcoefstats
#'
#' @description
#'
#'
#'
#' Plot with the regression coefficients' point estimates as dots with
#' confidence interval whiskers and other statistical details included as
#' labels.
#'
#' @param x A model object to be tidied, or a tidy data frame containing results
#' from a regression model. Function internally uses
#' `parameters::model_parameters` to get a tidy dataframe. If
#' a data frame is used, it *must* contain columns named `term` (names of
#' predictors) and `estimate` (corresponding estimates of coefficients or
#' other quantities of interest).
#' @param output Character describing the expected output from this function:
#' `"plot"` (visualization of regression coefficients) or `"tidy"` (tidy
#' dataframe of results `parameters::model_parameters`) or `"glance"` (object
#' from `performance::model_performance`).
#' @param statistic Which statistic is to be displayed (either `"t"` or `"f"`or
#' `"z"` or `"chi"`) in the label. This is relevant if the `x` argument is a
#' *dataframe*.
#' @param bf.message Logical that decides whether results from running a
#' Bayesian meta-analysis assuming that the effect size *d* varies across
#' studies with standard deviation *t* (i.e., a random-effects analysis)
#' should be displayed in caption. Defaults to `TRUE`.
#' @param xlab,ylab Labels for `x`- and `y`- axis variables, respectively
#' (Defaults: `"regression coefficient"` and `"term"`).
#' @param subtitle The text for the plot subtitle. The input to this argument
#' will be ignored if `meta.analytic.effect` is set to `TRUE`.
#' @param point.args Additional arguments that will be passed to
#' `ggplot2::geom_point` geom. Please see documentation for that function to
#' know more about these arguments.
#' @param conf.int Logical. Decides whether to display confidence intervals as
#' error bars (Default: `TRUE`).
#' @param conf.level Numeric deciding level of confidence or credible intervals
#' (Default: `0.95`).
#' @param effsize Character describing the effect size to be displayed: `"eta"`
#' (default) or `"omega"`. This argument is relevant only for models objects
#' of class `aov`, `anova`, `aovlist`, `"Gam"`, and `"manova"`.
#' @param meta.analytic.effect Logical that decides whether subtitle for
#' meta-analysis via linear (mixed-effects) models (default: `FALSE`). If
#' `TRUE`, input to argument `subtitle` will be ignored. This will be mostly
#' relevant if a data frame with estimates and their standard errors is
#' entered.
#' @param meta.type Type of statistics used to carry out random-effects
#' meta-analysis. If `"parametric"` (default), `metafor::rma` function will be
#' used. If `"robust"`, `metaplus::metaplus` function will be used. If
#' `"bayes"`, `metaBMA::meta_random` function will be used.
#' @param exclude.intercept Logical that decides whether the intercept should be
#' excluded from the plot (Default: `FALSE`).
#' @param errorbar.args Additional arguments that will be passed to
#' `ggplot2::geom_errorbarh` geom. Please see documentation for that function
#' to know more about these arguments.
#' @param vline Decides whether to display a vertical line (Default: `"TRUE"`).
#' @param vline.args Additional arguments that will be passed to
#' `ggplot2::geom_vline` geom. Please see documentation for that function to
#' know more about these arguments.
#' @param sort If `"none"` (default) do not sort, `"ascending"` sort by
#' increasing coefficient value, or `"descending"` sort by decreasing
#' coefficient value.
#' @param stats.labels Logical. Decides whether the statistic and *p*-values for
#' each coefficient are to be attached to each dot as a text label using
#' `ggrepel` (Default: `TRUE`).
#' @param stats.label.color Color for the labels. If set to `NULL`, colors will
#' be chosen from the specified `package` (Default: `"RColorBrewer"`) and
#' `palette` (Default: `"Dark2"`).
#' @param stats.label.args Additional arguments that will be passed to
#' `ggrepel::geom_label_repel` geom. Please see documentation for that
#' function to know more about these arguments.
#' @param only.significant If `TRUE`, only stats labels for significant effects
#' is shown (Default: `FALSE`). This can be helpful when a large number of
#' regression coefficients are to be displayed in a single plot. Relevant only
#' when the `output` is a plot.
#' @param ... Additional arguments to tidying method. For more, see
#' `parameters::model_parameters`.
#' @inheritParams parameters::model_parameters
#' @inheritParams theme_ggstatsplot
#' @inheritParams statsExpressions::meta_analysis
#' @inheritParams ggbetweenstats
#'
#' @note
#'
#' **Important**: In case you want to carry out meta-analysis using this
#' function, it assumes that you have already downloaded the needed package
#' (`metafor`, `metaplus`, or `metaBMA`) for meta-analysis.
#'
#' 1. All rows of regression estimates where either of the following
#' quantities is `NA` will be removed if labels are requested: `estimate`,
#' `statistic`, `p.value`.
#'
#' 2. Given the rapid pace at which new methods are added to these packages, it
#' is recommended that you install the GitHub versions of `parameters` and
#' `performance` in order to make most of this function.
#'
#' @import ggplot2
#' @importFrom rlang exec !!! !!
#' @importFrom dplyr select mutate matches across row_number last group_by ungroup
#' @importFrom ggrepel geom_label_repel
#' @importFrom tidyr unite
#' @importFrom insight is_model find_statistic format_value
#' @importFrom statsExpressions meta_analysis
#' @importFrom parameters model_parameters standardize_names
#' @importFrom performance model_performance
#'
#' @references
#' \url{https://indrajeetpatil.github.io/ggstatsplot/articles/web_only/ggcoefstats.html}
#'
#'
#' @examples
#' \donttest{
#' # for reproducibility
#' set.seed(123)
#' library(ggstatsplot)
#'
#' # model object
#' mod <- lm(formula = mpg ~ cyl * am, data = mtcars)
#'
#' # to get a plot
#' ggcoefstats(x = mod, output = "plot")
#'
#' # to get a tidy dataframe
#' ggcoefstats(x = mod, output = "tidy")
#'
#' # to get a glance summary
#' ggcoefstats(x = mod, output = "glance")
#' }
#' @export
# function body
ggcoefstats <- function(x,
output = "plot",
statistic = NULL,
conf.int = TRUE,
conf.level = 0.95,
k = 2L,
exclude.intercept = FALSE,
effsize = "eta",
meta.analytic.effect = FALSE,
meta.type = "parametric",
bf.message = TRUE,
sort = "none",
xlab = "regression coefficient",
ylab = "term",
title = NULL,
subtitle = NULL,
caption = NULL,
only.significant = FALSE,
point.args = list(size = 3, color = "blue"),
errorbar.args = list(height = 0),
vline = TRUE,
vline.args = list(size = 1, linetype = "dashed"),
stats.labels = TRUE,
stats.label.color = NULL,
stats.label.args = list(size = 3, direction = "y"),
package = "RColorBrewer",
palette = "Dark2",
ggtheme = ggplot2::theme_bw(),
ggstatsplot.layer = TRUE,
...) {
# ============================= dataframe ===============================
if (isFALSE(insight::is_model(x))) {
# set tidy_df to entered dataframe
tidy_df <- as_tibble(x)
# check that `statistic` is specified
if (rlang::is_null(statistic)) {
# inform the user
if (output == "plot" && isTRUE(stats.labels)) {
message(cat(
"Note: The argument `statistic` must be specified.\n",
"Skipping labels with statistical details.\n"
))
}
# skip labels
stats.labels <- FALSE
}
}
# =========================== tidy it ====================================
if (isTRUE(insight::is_model(x))) {
# which effect size?
eta_squared <- omega_squared <- NULL
if (effsize == "eta") eta_squared <- "partial"
if (effsize == "omega") omega_squared <- "partial"
# converting model object to a tidy dataframe
tidy_df <-
parameters::model_parameters(
model = x,
eta_squared = eta_squared,
omega_squared = omega_squared,
ci = conf.level,
verbose = FALSE,
...
) %>%
parameters::standardize_names(style = "broom") %>%
dplyr::rename_all(~ gsub("omega2.|eta2.", "", .x))
# anova objects need further cleaning
if (class(x)[[1]] %in% c("aov", "aovlist", "anova", "Gam", "manova", "maov")) {
if (dim(dplyr::filter(tidy_df, term == "Residuals"))[[1]] > 0L) {
if ("group" %in% names(tidy_df)) tidy_df %<>% group_by(group)
# creating a new column for residual degrees of freedom
tidy_df %<>% dplyr::mutate(df.error = dplyr::last(df))
}
# renaming the `xlab` according to the estimate chosen
xlab <- paste("partial", " ", effsize, "-squared", sep = "")
# final cleanup
tidy_df %<>%
dplyr::filter(!is.na(statistic)) %>%
dplyr::select(-dplyr::matches("sq$")) %>%
dplyr::mutate(estimate.type = xlab) %>%
dplyr::ungroup()
}
}
# =================== tidy dataframe cleanup ================================
# check for the one necessary column
if (rlang::is_null(tidy_df) || !"estimate" %in% names(tidy_df)) {
stop(message(cat(
"Error: The tidy dataframe *must* contain column called 'estimate'.\n",
"Check the tidy output using argument `output = 'tidy'`."
)),
call. = FALSE
)
}
# remove NAs
if (isTRUE(stats.labels)) {
tidy_df %<>%
dplyr::filter(dplyr::across(
.cols = c(dplyr::matches("estimate|statistic|std.error|p.value")),
.fns = ~ !is.na(.)
))
}
# create a new term column if it's not present
if (!"term" %in% names(tidy_df)) {
tidy_df %<>% dplyr::mutate(term = paste("term", dplyr::row_number(), sep = "_"))
}
# ================ check for duplicate terms and columns ===================
# a check if there are repeated terms
# needed for maov, lqm, lqmm, etc. kind of objects
if (any(duplicated(dplyr::select(tidy_df, term)))) {
tidy_df %<>%
tidyr::unite(
data = .,
col = "term",
dplyr::matches("term|variable|parameter|method|curve|response|component|contrast|group"),
remove = TRUE,
sep = "_"
)
}
# halt if there are still repeated terms
if (any(duplicated(dplyr::select(tidy_df, term)))) {
message("Error: All elements in the column `term` should be unique.")
return(invisible(tidy_df))
}
# if `parameters` output doesn't contain p-value or statistic column
if (sum(c("p.value", "statistic") %in% names(tidy_df)) != 2L) stats.labels <- FALSE
# =========================== CIs and intercepts ===========================
# if `parameters` output doesn't contain CI
if (!"conf.low" %in% names(tidy_df)) {
# add NAs so that only dots will be shown
tidy_df %<>% dplyr::mutate(conf.low = NA_character_, conf.high = NA_character_)
# stop displaying whiskers
conf.int <- FALSE
}
# whether to show model intercept
if (isTRUE(exclude.intercept)) tidy_df %<>% dplyr::filter(!grepl("(Intercept)", term, TRUE))
# ========================== preparing label ================================
# adding a column with labels to be used with `ggrepel`
if (isTRUE(stats.labels)) {
# in case a dataframe was entered, `x` and `tidy_df` are going to be same
if (isTRUE(insight::is_model(x))) statistic <- insight::find_statistic(x)
# adding a column with labels using custom function
tidy_df %<>%
ggcoefstats_label_maker(
tidy_df = .,
statistic = substring(tolower(statistic), 1, 1),
k = k,
effsize = effsize
)
}
# ========================== summary caption ================================
# for non-dataframe objects
if (isTRUE(insight::is_model(x))) {
# creating glance dataframe
glance_df <- performance::model_performance(x, verbose = FALSE)
# rename to `broom` convention
if (!is.null(glance_df)) glance_df %<>% parameters::standardize_names(style = "broom")
# no meta-analysis in this context
meta.analytic.effect <- FALSE
# if glance is not available, inform the user
if (!is.null(glance_df) && all(c("aic", "bic") %in% names(glance_df))) {
# preparing caption with model diagnostics
caption <-
substitute(
expr = atop(displaystyle(top.text), expr = paste("AIC = ", AIC, ", BIC = ", BIC)),
env = list(
top.text = caption,
AIC = format_value(glance_df$aic[[1]], 0L),
BIC = format_value(glance_df$bic[[1]], 0L)
)
)
}
}
# running meta-analysis
if (isTRUE(meta.analytic.effect)) {
# standardizing type of statistics name
meta.type <- ipmisc::stats_type_switch(meta.type)
# results from frequentist random-effects meta-analysis
subtitle_df <- statsExpressions::meta_analysis(tidy_df, type = meta.type, k = k)
subtitle <- subtitle_df$expression[[1]]
# results from Bayesian random-effects meta-analysis (only for parametric)
if (meta.type == "parametric" && isTRUE(bf.message)) {
caption_df <-
statsExpressions::meta_analysis(
top.text = caption,
type = "bayes",
data = tidy_df,
k = k
)
caption <- caption_df$expression[[1]]
}
}
# ========================== sorting ===================================
# whether the term need to be arranged in any specified order
tidy_df %<>% dplyr::mutate(term = as.factor(term), .rowid = dplyr::row_number())
# sorting factor levels
new_order <-
switch(sort,
"none" = order(tidy_df$.rowid, decreasing = FALSE),
"ascending" = order(tidy_df$estimate, decreasing = FALSE),
"descending" = order(tidy_df$estimate, decreasing = TRUE),
order(tidy_df$.rowid, decreasing = FALSE)
)
# sorting `term` factor levels according to new sorting order
tidy_df %<>%
dplyr::mutate(term = as.character(term)) %>%
dplyr::mutate(term = factor(x = term, levels = term[new_order])) %>%
dplyr::select(-.rowid)
# ========================== basic plot ===================================
# palette check is necessary only if output is a plot
if (output == "plot") {
# setting up the basic architecture
plot <- ggplot2::ggplot(data = tidy_df, mapping = ggplot2::aes(x = estimate, y = term))
# if needed, adding the vertical line
if (isTRUE(vline)) {
# adding the line geom
plot <- plot +
rlang::exec(
.fn = ggplot2::geom_vline,
xintercept = 0,
na.rm = TRUE,
!!!vline.args
)
}
# if the confidence intervals are to be displayed on the plot
if (isTRUE(conf.int)) {
plot <- plot +
rlang::exec(
.fn = ggplot2::geom_errorbarh,
data = tidy_df,
mapping = ggplot2::aes(xmin = conf.low, xmax = conf.high),
na.rm = TRUE,
!!!errorbar.args
)
}
# changing the point aesthetics
plot <- plot +
rlang::exec(
.fn = ggplot2::geom_point,
na.rm = TRUE,
!!!point.args
)
# ========================= ggrepel labels ================================
# adding the labels
if (isTRUE(stats.labels)) {
# only significant p-value labels are shown
if (isTRUE(only.significant) && "p.value" %in% names(tidy_df)) {
tidy_df %<>% dplyr::mutate(label = dplyr::case_when(
p.value >= 0.05 ~ NA_character_,
TRUE ~ label
))
}
# ========================== palette check =================================
# if no. of factor levels is greater than the default palette color count
palette_message(package, palette, length(tidy_df$term))
# computing the number of colors in a given palette
palette_df <-
as_tibble(paletteer::palettes_d_names) %>%
dplyr::filter(package == !!package, palette == !!palette) %>%
dplyr::select(length)
# if insufficient number of colors are available in a given palette
if (palette_df$length[[1]] < length(tidy_df$term)) stats.label.color <- "black"
# if user has not specified colors, then use a color palette
if (is.null(stats.label.color)) {
stats.label.color <-
paletteer::paletteer_d(
palette = paste0(package, "::", palette),
n = length(tidy_df$term),
type = "discrete"
)
}
# adding labels
plot <- plot +
rlang::exec(
.fn = ggrepel::geom_label_repel,
data = tidy_df,
mapping = ggplot2::aes(x = estimate, y = term, label = label),
na.rm = TRUE,
show.legend = FALSE,
parse = TRUE,
min.segment.length = 0,
color = stats.label.color,
!!!stats.label.args
)
}
# ========================== annotations =============================
# adding other labels to the plot
plot <- plot +
ggplot2::labs(
x = xlab,
y = ylab,
caption = caption,
subtitle = subtitle,
title = title
) +
theme_ggstatsplot(ggtheme, ggstatsplot.layer) +
ggplot2::theme(plot.caption = ggplot2::element_text(size = 10))
}
# =========================== output =====================================
# what needs to be returned?
switch(output,
"subtitle" = subtitle,
"caption" = caption,
"tidy" = as_tibble(tidy_df),
"glance" = as_tibble(glance_df),
plot
)
}