-
Notifications
You must be signed in to change notification settings - Fork 189
/
ggcorrmat.R
545 lines (508 loc) · 19.1 KB
/
ggcorrmat.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
#' @title Visualization of a correlalogram (or correlation matrix)
#' @name ggcorrmat
#' @aliases ggcorrmat
#' @author Indrajeet Patil
#' @return Correlation matrix plot or correlation coefficient matrix or matrix
#' of p-values.
#'
#' @param data Dataframe from which variables specified are preferentially to be
#' taken.
#' @param cor.vars List of variables for which the correlation matrix is to be
#' computed and visualized. If `NULL` (default), all numeric variables from
#' `data` will be used.
#' @param cor.vars.names Optional list of names to be used for `cor.vars`. The
#' names should be entered in the same order.
#' @param output Character that decides expected output from this function:
#' `"plot"` (for visualization matrix) or `"correlations"` (or `"corr"` or
#' `"r"`; for correlation matrix) or `"p-values"` (or `"p.values"` or `"p"`;
#' for a matrix of *p*-values) or `"ci"` (for a tibble with confidence
#' intervals for unique correlation pairs; not available for robust
#' correlation) or `"n"` (or `"sample.size"` for a tibble with sample sizes
#' for each correlation pair).
#' @param matrix.type Character, `"full"` (default), `"upper"` or `"lower"`,
#' display full matrix, lower triangular or upper triangular matrix.
#' @param method Character argument that decides the visualization method of
#' correlation matrix to be used. Allowed values are `"square"` (default),
#' `"circle"`
#' @param corr.method,type A character string indicating which correlation
#' coefficient is to be computed (`"pearson"` (default) or `"kendall"` or
#' `"spearman"`). `"robust"` can also be entered but only if `output` argument
#' is set to either `"correlations"` or `"p-values"`. The robust correlation
#' used is percentage bend correlation (see `?WRS2::pball`). Abbreviations
#' will also work: `"p"` (for parametric/Pearson's *r*), `"np"`
#' (nonparametric/Spearman's *rho*), `"r"` (robust).
#' @param exact A logical indicating whether an exact *p*-value should be
#' computed. Used for Kendall's *tau* and Spearman's *rho*. For more details,
#' see `?stats::cor.test`.
#' @param continuity A logical. If `TRUE`, a continuity correction is used for
#' Kendall's *tau* and Spearman's *rho* when not computed exactly (Default:
#' `TRUE`).
#' @param beta A numeric bending constant for robust correlation coefficient
#' (Default: `0.1`).
#' @param digits,k Decides the number of decimal digits to be displayed
#' (Default: `2`).
#' @param sig.level Significance level (Default: `0.05`). If the *p*-value in
#' *p*-value matrix is bigger than `sig.level`, then the corresponding
#' correlation coefficient is regarded as insignificant and flagged as such in
#' the plot. This argument is relevant only when `output = "plot"`.
#' @param p.adjust.method What adjustment for multiple tests should be used?
#' (`"holm"`, `"hochberg"`, `"hommel"`, `"bonferroni"`, `"BH"`, `"BY"`,
#' `"fdr"`, `"none"`). See `stats::p.adjust` for details about why to use
#' `"holm"` rather than `"bonferroni"`). Default is `"none"`. If adjusted
#' *p*-values are displayed in the visualization of correlation matrix, the
#' **adjusted** *p*-values will be used for the **upper** triangle, while
#' **unadjusted** *p*-values will be used for the **lower** triangle of the
#' matrix.
#' @param hc.order Logical value. If `TRUE`, correlation matrix will be
#' hc.ordered using `hclust` function (Default is `FALSE`).
#' @param hc.method The agglomeration method to be used in `hclust` (see
#' `?hclust`).
#' @param lab Logical value. If `TRUE`, correlation coefficient values will be
#' displayed in the plot.
#' @param colors A vector of 3 colors for low, mid, and high correlation values.
#' If set to `NULL`, manual specification of colors will be turned off and 3
#' colors from the specified `palette` from `package` will be selected.
#' @param outline.color The outline color of square or circle. Default value is
#' `"gray"`.
#' @param title The text for the plot title.
#' @param subtitle The text for the plot subtitle.
#' @param caption The text for the plot caption. If not specified (if it is
#' `NULL`, i.e.), a default caption will be shown.
#' @param caption.default Logical decides whether the default caption should be
#' shown.
#' @param lab.col Color to be used for the correlation coefficient labels
#' (applicable only when `lab = TRUE`).
#' @param lab.size Size to be used for the correlation coefficient labels
#' (applicable only when `lab = TRUE`).
#' @param axis.text.x.margin.t,axis.text.x.margin.r,axis.text.x.margin.b,axis.text.x.margin.l
#' Margins between x-axis and the variable name texts (t: top, r: right, b:
#' bottom, l:left), especially useful in case the names are slanted, i.e. when
#' the tl.srt is between `45` and `75` (Defaults: `0`, `0`, `0`, `0`, resp.).
#' @param insig Character used to show specialized insignificant correlation
#' coefficients (`"pch"` (default) or `"blank"`). If `"blank"`, the
#' corresponding glyphs will be removed; if "pch" is used, characters (see
#' `?pch` for details) will be added on the corresponding glyphs.
#' @param pch Decides the glyphs (read point shapes) to be used for
#' insignificant correlation coefficients (only valid when `insig = "pch"`).
#' Default value is `pch = 4`.
#' @param pch.col,pch.cex The color and the cex (size) of `pch` (only valid when
#' `insig = "pch"`). Defaults are `pch.col = "#F0E442"` and `pch.cex = 10`.
#' @param tl.cex,tl.col,tl.srt The size, the color, and the string rotation of
#' text label (variable names, i.e.).
#' @param messages Decides whether messages references, notes, and warnings are
#' to be displayed (Default: `TRUE`).
#' @inheritParams theme_ggstatsplot
#' @inheritParams paletteer::paletteer_d
#'
#' @import ggplot2
#'
#' @importFrom ggcorrplot ggcorrplot
#' @importFrom dplyr select group_by summarize n arrange
#' @importFrom dplyr mutate mutate_at mutate_if
#' @importFrom purrr is_bare_double flatten_dbl keep %||%
#' @importFrom stats cor na.omit
#' @importFrom tibble as_tibble rownames_to_column
#' @importFrom rlang !! enquo quo_name
#' @importFrom crayon green blue yellow red
#' @importFrom WRS2 pball
#' @importFrom psych corr.p corr.test
#'
#' @seealso \code{\link{grouped_ggcorrmat}} \code{\link{ggscatterstats}}
#' \code{\link{grouped_ggscatterstats}}
#'
#' @references
#' \url{https://indrajeetpatil.github.io/ggstatsplot/articles/web_only/ggcorrmat.html}
#'
#' @examples
#'
#' # for reproducibility
#' set.seed(123)
#'
#' # if `cor.vars` not specified, all numeric varibles used
#' ggstatsplot::ggcorrmat(data = iris)
#'
#' # to get the correlalogram
#' # note that the function will run even if the vector with variable names is
#' # not of same length as the number of variables
#' ggstatsplot::ggcorrmat(
#' data = ggplot2::msleep,
#' cor.vars = sleep_total:bodywt,
#' cor.vars.names = c("total sleep", "REM sleep")
#' ) + # further modification using `ggplot2`
#' ggplot2::scale_y_discrete(position = "right")
#'
#' # to get the correlation matrix
#' ggstatsplot::ggcorrmat(
#' data = ggplot2::msleep,
#' cor.vars = sleep_total:bodywt,
#' output = "r"
#' )
#'
#' # setting output = "p-values" (or "p") will return the p-value matrix
#' ggstatsplot::ggcorrmat(
#' data = ggplot2::msleep,
#' cor.vars = sleep_total:bodywt,
#' corr.method = "r",
#' p.adjust.method = "bonferroni",
#' output = "p"
#' )
#'
#' # setting `output = "ci"` will return the confidence intervals for unique
#' # correlation pairs
#' ggstatsplot::ggcorrmat(
#' data = ggplot2::msleep,
#' cor.vars = sleep_total:bodywt,
#' p.adjust.method = "BH",
#' output = "ci"
#' )
#'
#' # modifying elements of the correlation matrix by changing function defaults
#' ggstatsplot::ggcorrmat(
#' data = datasets::iris,
#' cor.vars = c(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width),
#' sig.level = 0.01,
#' ggtheme = ggplot2::theme_bw(),
#' hc.order = TRUE,
#' matrix.type = "lower",
#' outline.col = "white",
#' title = "Dataset: Iris"
#' )
#' @export
# defining the function
ggcorrmat <- function(data,
cor.vars = NULL,
cor.vars.names = NULL,
output = "plot",
matrix.type = "full",
method = "square",
corr.method = "pearson",
type = NULL,
exact = FALSE,
continuity = TRUE,
beta = 0.1,
digits = 2,
k = NULL,
sig.level = 0.05,
p.adjust.method = "none",
hc.order = FALSE,
hc.method = "complete",
lab = TRUE,
package = "RColorBrewer",
palette = "Dark2",
direction = 1,
colors = c("#E69F00", "white", "#009E73"),
outline.color = "black",
ggtheme = ggplot2::theme_bw(),
ggstatsplot.layer = TRUE,
title = NULL,
subtitle = NULL,
caption = NULL,
caption.default = TRUE,
lab.col = "black",
lab.size = 5,
insig = "pch",
pch = 4,
pch.col = "black",
pch.cex = 11,
tl.cex = 12,
tl.col = "black",
tl.srt = 45,
axis.text.x.margin.l = 0,
axis.text.x.margin.t = 0,
axis.text.x.margin.r = 0,
axis.text.x.margin.b = 0,
messages = TRUE) {
# ======================= dataframe ========================================
# creating a dataframe out of the entered variables
if (base::missing(cor.vars)) {
df <- purrr::keep(.x = data, .p = purrr::is_bare_numeric)
} else {
# creating a dataframe out of the entered variables
df <- data %>%
dplyr::select(.data = ., !!rlang::enquo(cor.vars))
}
# counting number of NAs present in the dataframe
na_total <- df %>%
purrr::map_df(.x = ., .f = ~ sum(is.na(.))) %>%
purrr::flatten_dbl(.x = .) %>%
sum(., na.rm = TRUE)
# renaming the columns if so desired (must be equal to the number of number
# of cor.vars)
if (!is.null(cor.vars.names)) {
# check if number of cor.vars is equal to the number of names entered
if (length(df) != length(cor.vars.names)) {
# display error message if not
base::message(cat(
crayon::red("Warning: "),
crayon::blue(
"Number of variable names does not equal the number of variables.\n"
),
sep = ""
))
} else {
# otherwise rename the columns with the new names
colnames(df) <- cor.vars.names
}
}
# ============================ checking corr.method =======================
# see which method was used to specify type of correlation
corr.method <- type %||% corr.method
digits <- k %||% digits
# if any of the abbreviations have been entered, change them
if (corr.method == "p") {
corr.method <- "pearson"
} else if (corr.method == "np") {
corr.method <- "spearman"
} else if (corr.method == "r") {
corr.method <- "robust"
}
# create unique name for each method
if (corr.method == "pearson") {
corr.method.text <- "Pearson"
} else if (corr.method == "spearman") {
corr.method.text <- "Spearman"
} else if (corr.method == "robust") {
corr.method.text <- "robust (% bend)"
} else if (corr.method == "kendall") {
corr.method.text <- "Kendall"
}
# ===================== statistics ========================================
#
if (corr.method %in% c("pearson", "spearman", "kendall")) {
# confidence interval computation can take some time and also produce
# warnings, so compute them only when requested by the user
if (output == "ci") {
ci <- TRUE
} else {
ci <- FALSE
}
# computing correlations using `psych` package
corr_df <-
psych::corr.test(
x = base::as.data.frame(df),
y = NULL,
use = "pairwise",
method = corr.method,
adjust = p.adjust.method,
alpha = .05,
ci = ci,
minlength = 20
)
# computing correlations on all included variables
corr.mat <- base::round(x = corr_df$r, digits = digits)
# compute a correlation matrix of p-values
p.mat <- corr_df$p
} else if (corr.method == "robust") {
# get matrix of samples sizes to be used later in `corr.p` function (`n`)
corr_df <-
psych::corr.test(
x = base::as.data.frame(df),
y = NULL,
use = "pairwise",
adjust = "none",
alpha = .05,
ci = FALSE,
minlength = 20
)
# computing the percentage bend correlation matrix
rob_cor <- WRS2::pball(x = df, beta = beta)
# extracting the correlations and formatting them
corr.mat <- base::round(x = rob_cor$pbcorm, digits = digits)
# converting NAs to 1's
rob_cor$p.values[is.na(rob_cor$p.values)] <- 0
# adjusting for multiple comparisons (if needed)
if (p.adjust.method != "none") {
p.mat <-
psych::corr.p(
r = corr.mat,
n = corr_df$n,
adjust = p.adjust.method,
alpha = 0.05,
minlength = 20
)$p
} else {
# creating a correlation matrix of p-values
p.mat <- rob_cor$p.values
}
}
# ========================== plot =========================================
# if user has not specified colors, then use a color palette
if (is.null(colors)) {
colors <- paletteer::paletteer_d(
package = !!package,
palette = !!palette,
n = 3,
direction = direction,
type = "discrete"
)
}
# creating the basic plot
if (output == "plot") {
if (corr.method %in% c("pearson", "spearman", "kendall", "robust")) {
# legend title with information about correlation type and sample
if (na_total == 0) {
legend.title.text <-
bquote(atop(
atop(
scriptstyle(bold("sample size:")),
italic(n) ~ "=" ~ .(nrow(x = data))
),
atop(
scriptstyle(bold("correlation:")),
.(corr.method.text)
)
))
} else {
# in case of NAs, compute minimum and maximum sample sizes of pairs
n_summary <- numdf_summary(corr_df$n)
legend.title.text <-
bquote(atop(
atop(
atop(
scriptstyle(bold("sample size:")),
italic(n)[min] ~ "=" ~ .(n_summary$n_min[[1]])
),
atop(
italic(n)[median] ~ "=" ~ .(n_summary$n_median[[1]]),
italic(n)[max] ~ "=" ~ .(n_summary$n_max[[1]])
)
),
atop(
scriptstyle(bold("correlation:")),
.(corr.method.text)
)
))
}
# plotting the correlalogram
plot <-
ggcorrplot::ggcorrplot(
corr = corr.mat,
method = method,
p.mat = p.mat,
sig.level = sig.level,
type = matrix.type,
hc.method = hc.method,
hc.order = hc.order,
lab = lab,
outline.color = outline.color,
ggtheme = ggtheme,
colors = colors,
legend.title = legend.title.text,
lab_col = lab.col,
lab_size = lab.size,
insig = insig,
pch = pch,
pch.col = pch.col,
pch.cex = pch.cex,
tl.cex = tl.cex,
tl.col = tl.col,
tl.srt = tl.srt,
digits = digits
)
# =========================== labels ==================================
# if `caption` is not specified, use the generic version only if
# `caption.default` is `TRUE`
if (is.null(caption) &&
pch == 4 && isTRUE(caption.default)) {
# p value adjustment method description
p.adjust.method.text <-
paste(
"Adjustment (p-value): ",
p.adjust.method.description(p.adjust.method = p.adjust.method),
sep = ""
)
# preparing the caption
caption <-
base::substitute(
atop(
expr = paste(
bold("X"),
" = correlation non-significant at ",
italic("p"),
" < ",
sig.level,
sep = ""
),
bottom.text
),
env = list(
sig.level = sig.level,
bottom.text = p.adjust.method.text
)
)
}
# adding text details to the plot
plot <- plot +
ggplot2::labs(
title = title,
subtitle = subtitle,
caption = caption,
xlab = NULL,
ylab = NULL
)
# adding ggstatsplot theme for correlation matrix
if (isTRUE(ggstatsplot.layer)) {
plot <- plot +
theme_corrmat()
}
# even if ggstatsplot theme is not overlaid, still make sure there is
# enough distance between the axis and the label
plot <- plot +
ggplot2::theme(
axis.text.x = ggplot2::element_text(
margin = ggplot2::margin(
t = axis.text.x.margin.t,
r = axis.text.x.margin.r,
b = axis.text.x.margin.b,
l = axis.text.x.margin.l,
unit = "pt"
)
)
)
}
}
# =============================== output ==================================
# if p-values were adjusted, notify how they are going to be displayed
if (p.adjust.method != "none" && isTRUE(messages) && corr.method != "robust") {
ggcorrmat_matrix_message()
}
# return the desired result
if (output %in% c("correlations", "corr", "r")) {
# return a tibble
return(matrix_to_tibble(df = corr.mat))
} else if (output %in% c("n", "sample.size")) {
# return a tibble
return(matrix_to_tibble(df = corr_df$n))
} else if (output %in% c("p-values", "p.values", "p")) {
# return a tibble
return(matrix_to_tibble(df = p.mat))
} else if (output == "ci") {
if (corr.method %in% c("pearson", "spearman", "kendall")) {
# compute confidence intervals
ci.mat <- dplyr::full_join(
x = corr_df$ci %>%
tibble::rownames_to_column(.data = ., var = "pair") %>%
tibble::rowid_to_column(.data = ., var = "rowid") %>%
tibble::as_tibble(x = .),
y = corr_df$ci.adj %>%
tibble::rowid_to_column(.data = ., var = "rowid") %>%
tibble::as_tibble(x = .),
by = "rowid"
) %>%
dplyr::select(.data = ., pair, r, dplyr::everything(), -rowid)
# return a tible with CIs
return(ci.mat)
} else {
base::message(cat(
crayon::red("Warning: "),
crayon::blue(
"Confidence intervals currently not supported for robust correlation.\n"
),
sep = ""
))
}
} else if (output == "plot") {
# correlalogram plot
return(plot)
}
}