-
Notifications
You must be signed in to change notification settings - Fork 1
/
analyse_results_draft.R
125 lines (102 loc) · 5.75 KB
/
analyse_results_draft.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# Setup -------------------------------------------------------------------
# NOTE(review): clearing the workspace with rm(list = ls()) in a script is
# generally discouraged (it surprises anyone sourcing this file), but it is
# kept here to preserve the script's original behaviour.
rm(list = ls())
# stringsAsFactors = FALSE stops read.csv() converting strings to factors
# (the default on R < 4.0); scipen = 19 biases printing away from
# scientific notation so thresholds print as plain decimals.
options(stringsAsFactors = FALSE, scipen = 19)
# Introduction ------------------------------------------------------------
# This is a draft script for generating plots from a metaMATE find
# run. It currently works only for non-multiplicative terms, but if you
# have any multiplicative terms don't worry, they'll just be ignored.
# As long as you have the below packages installed, it should just run.
# The plots are exactly the same format as figs 2 and 3 in the paper, see
# https://www.biorxiv.org/content/10.1101/2020.06.17.157347v1.full
# Load libraries ----------------------------------------------------------
library(dplyr)
library(purrr)
library(stringr)
library(tidyr)
library(ggplot2)
# Load variables ----------------------------------------------------------
# Human-readable description for every non-multiplicative metaMATE term;
# used below to label the facet strips of both plot sets. Keys must match
# the 'term' column of the results file exactly.
termdescriptions <- list(
  'library_n' = 'Minimum absolute ASV abundance by library',
  'library_p' = 'Minimum relative ASV abundance by library',
  'library|clade_n' = 'Minimum absolute ASV abundance by library and clade',
  'library|clade_p' = 'Minimum relative ASV abundance by library and clade',
  # Fixed: this key was a duplicate of 'library|clade_n' above, which left
  # 'library|taxon_n' terms without a description (and silently dropped
  # their rows in the unlist() lookup further down the script).
  'library|taxon_n' = 'Minimum absolute ASV abundance by library and taxon',
  'library|taxon_p' = 'Minimum relative ASV abundance by library and taxon',
  'total_n' = 'Minimum absolute ASV abundance',
  'total_p' = 'Minimum relative ASV abundance',
  'total|clade_n' = 'Minimum absolute ASV abundance by clade',
  'total|clade_p' = 'Minimum relative ASV abundance by clade',
  'total|taxon_n' = 'Minimum absolute ASV abundance by taxon',
  'total|taxon_p' = 'Minimum relative ASV abundance by taxon'
)
# Load data ---------------------------------------------------------------
### REPLACE WITH THE PATH TO YOUR RESULTS FILE HERE ###
# check.names = FALSE preserves the raw column names (e.g.
# 'library|clade_n_threshold'), which are not syntactically valid R names
# but are required verbatim for the term matching below.
results <- read.csv("_results.csv",
                    check.names = FALSE) %>%
  # rejects_hash may be parsed as numeric; force character so it is treated
  # as an identifier rather than a quantity
  mutate(rejects_hash = as.character(rejects_hash))
# Review the best runs and communities --------------------------------------------------------
# rejects_hash is a code given to each unique set of rejected ASVs, thus each different value
# corresponds to a different set of retained = output ASVs (after ensuring the validation set
# is always rejected/retained despite the abundance evaluation)
# Get the runs with the highest of each score type.
# Fixed: the original called purrr::map_dfr() and stringr::str_subset()
# without loading either package; base grep()/lapply() plus the already
# loaded dplyr::bind_rows() do the same job with no extra dependencies.
scorecols <- grep("_score", names(results), value = TRUE)
highestscores <- bind_rows(lapply(scorecols, function(sc) {
  # keep every run tied for the maximum of this score column
  filter(results, pull(results, sc) == max(pull(results, sc)))
}))
# Filter this to the unique ASV output sets (printed for interactive review)
highestscores %>% filter(!duplicated(rejects_hash))
# Filter manually to select ideal set according to your thresholds ----------------------------
# Set your ideal maximum false positive and false negative rates
falsepositive <- 0.1 # The proportion of non-authentic ASVs that are erroneously not filtered out
falsenegative <- 0.1 # The proportion of authentic ASVs that are erroneously filtered out
# Show every run meeting both error-rate thresholds (printed, not stored).
# Comma-separated filter() conditions are combined with AND.
results %>%
  filter(verifiedauthentic_rejected_p <= falsenegative,
         verifiednonauthentic_retained_p <= falsepositive)
# Extract non-multiplicative terms and generate the two plot sets -----------------------------
# Reshape the data matrix and separate out the variable details.
# The id columns are everything up to and including 'asvs_total', plus the
# final column; all remaining columns are stacked and their names split on
# '_' into data / part / stat components.
asvscol <- which(colnames(results) == 'asvs_total')
idvars <- colnames(results)[c(seq_len(asvscol), ncol(results))]
results <- results %>%
  pivot_longer(-all_of(idvars),
               names_to = c('data', 'part', 'stat'),
               names_sep = '_',
               values_to = 'value')
# Extract
# Keep only single (non-multiplicative) terms, i.e. term values without '*'
oneterms <- filter(results, !grepl('\\*', term)) %>%
# stack the per-term '*_threshold' columns; each column name such as
# 'library|clade_n_threshold' splits on '_' into category ('library|clade'),
# metric ('n') and the literal suffix 'threshold' (captured as 't')
pivot_longer(contains('_threshold'),
names_to = c('category', 'metric', 't'),
names_sep = '_',
values_to = 'threshold') %>%
# keep only the threshold column that belongs to each row's own term
filter(paste(category, metric, sep = '_') == term) %>%
# 't' is constant ('threshold') and category/metric are now redundant
select(-c(category, metric, t)) %>%
# attach the human-readable facet label; NOTE(review): unlist() silently
# drops rows whose term is missing from termdescriptions — confirm every
# term in the results file has an entry in that list
mutate(termdescription = unlist(termdescriptions[term]))
# To extract the passing ASVs at a certain threshold value, use the
# number shown on the plot as the resultindex for a NUMTdumper dump run
# Plots 1: false-positive and false-negative proportions against each
# term's threshold, one facet per term (cf. fig 2 of the paper).
unite(oneterms, 'part', data, part) %>%
  filter(part %in% c('verifiedauthentic_rejected', 'verifiednonauthentic_retained'), stat == 'p') %>%
  ggplot(aes(x = threshold, y = value, col = part)) +
  geom_line() +
  geom_point() +
  geom_text(aes(label = resultindex), nudge_y = .1, size = 2.5) +
  # Fixed: labels are now named so each legend entry is explicitly tied to
  # its level. The original positional labels were swapped relative to the
  # alphabetical level order ('verifiedauthentic_rejected' sorts first, but
  # was being labelled as the non-authentic/false-positive series).
  scale_colour_discrete(name = '',
                        labels = c('verifiedauthentic_rejected' =
                                     'Verified Authentic ASVs\nRejected (false negatives)',
                                   'verifiednonauthentic_retained' =
                                     'Verified Non-Authentic ASVs\nRetained (false positives)')) +
  labs(y = '% Retained/Rejected') +
  facet_wrap( ~ termdescription, ncol = 1, scales = 'free_x', strip.position = 'bottom') +
  theme_bw() +
  theme(axis.title.x = element_blank(), legend.position = 'bottom',
        strip.placement = 'outside', strip.background = element_blank())
# Plots 2
# Estimated number of retained ASVs against each term's threshold, with a
# ribbon from 0 up to the estimated initial total, one facet per term
# (cf. fig 3 of the paper).
# NOTE(review): scale_colour_discrete()/scale_fill_discrete() assign these
# labels positionally to the alphabetically-sorted levels of 'data', whose
# actual values are not visible from this script — confirm the first level
# really is the non-authentic category, otherwise the legend entries are
# swapped.
filter(oneterms, stat == 'estimate') %>%
pivot_wider(names_from = part, values_from = value) %>%
ggplot(aes(x = threshold, y = retained)) +
geom_ribbon(aes(ymin = 0, ymax = total, fill = data), alpha = .5) +
geom_line(aes(col = data)) +
geom_point(aes(col = data)) +
geom_text(aes(y = retained, label = resultindex), nudge_y = 10, size = 2.5) +
scale_colour_discrete(name = '',
labels = c('Estimated Retained\nNon-Authentic ASVs',
'Estimated Retained\nAuthentic ASVs')) +
scale_fill_discrete(name = '',
labels = c('Estimated Initial\nNon-Authentic ASVs',
'Estimated Initial\nAuthentic ASVs')) +
labs(y = 'Number of ASVs') +
facet_wrap( ~ termdescription, ncol = 1, scales = 'free_x', strip.position = 'bottom') +
theme_bw() +
theme(axis.title.x = element_blank(), legend.position = 'bottom',
strip.placement = 'outside', strip.background = element_blank())