-
Notifications
You must be signed in to change notification settings - Fork 12
/
phyloseq_in_PEMA.R
340 lines (265 loc) · 12 KB
/
phyloseq_in_PEMA.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
#!/usr/bin/env Rscript
## SECTION 0
## Please let Phyloseq know which Silva version you chose.
## ATTENTION! Really important variable to fill!
## Move to SECTION 3 in order to set your phyloseq analysis. Please DO NOT change anything in SECTION 1.
## In SECTION 2 there are some statistics that are the same for each and every analysis.
## However, you are free to change anything you want to in SECTION 2 as well.
## In addition, take really good care of your metadata file. This needs to be called "metadata.csv"!
## If it is not called like this, the phyloseq will fail.
args<-commandArgs(TRUE)
markerGene = args[1]
silvaVersion = args[2]
#------------------------------------------------------------------------------------------------------------------
## SECTION 1
# ATTENTION! Do NOT change the following lines!!
# That is what needs to stay as it is for PEMA to execute this phyloseq script!
# Move into the next section in order to set the analysis the way you want to!
# setting the envrinoment where phyloseq is going to run
# set_working_dir = paste("/home1/haris/metabar_pipeline/PEMA/",analysis_name,"/phyloseq-output", sep="")
# setwd(set_working_dir)
# for the calculation of diversity indices, the bioenv and the permanova
library(vegan)
# for the actual analysis
library(phyloseq)
# to import the tree
library(ape)
# for the beautiful plots
library(ggplot2)
#to create the taxonomy table and play around with the tables
library(dplyr)
library(tidyr)
#to create beautiful colour palettes for your plots
library(RColorBrewer)
# Define a default theme for ggplot graphics
theme_set(theme_bw())
#import the OTU table (or else biotic data)
biotic <- read.csv("finalTable.tsv", sep = "\t", header=TRUE, row.names = 1)
#import the tree of the OTUs
testTree <- "OTUs_tree.tre"
pointer <- "no"
if (file.exists(testTree)) {
tree <- read.tree("OTUs_tree.tre")
pointer <- "yes"
}
#Create taxonomy table from the Classification column of the biotic data
if (markerGene == 'gene_16S' || markerGene == 'gene_18S') {
if (silvaVersion == 'silva_132') {
# Silva 132
taxonomy <- select(biotic, Classification)
colnames <- c("Root", "Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species")
taxonomy <- separate(taxonomy, Classification, colnames, sep = ";", remove = TRUE,
convert = FALSE, extra = "warn", fill = "warn")
getPalette = colorRampPalette(brewer.pal(8, "Set2"))
} else if (silvaVersion == 'silva_128') {
# Silva 128
taxonomy <- select(biotic, Classification)
colnames <- c("Root", "Domain", "Superkingdom", "Superphylum", "Phylum", "Class", "Order", "Family", "Genus", "Species")
taxonomy <- separate(taxonomy, Classification, colnames, sep = ";", remove = TRUE,
convert = FALSE, extra = "warn", fill = "warn")
getPalette = colorRampPalette(brewer.pal(10, "Set2"))
}
} else if (markerGene == 'gene_ITS') {
# UNITE database
taxonomy <- select(biotic, Classification)
colnames <- c("Root", "Superkingdom", "Subgroup", "Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")
taxonomy <- separate(taxonomy, Classification, colnames, sep = ";", remove = TRUE,
convert = FALSE, extra = "warn", fill = "warn")
getPalette = colorRampPalette(brewer.pal(10, "Set2"))
} else if (markerGene == 'gene_COI') {
# MIDORI database
taxonomy <- select(biotic, Classification)
colnames <- c("Superkingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")
taxonomy <- separate(taxonomy, Classification, colnames, sep = ";", remove = TRUE,
convert = FALSE, extra = "warn", fill = "warn")
getPalette = colorRampPalette(brewer.pal(7, "Set2"))
}
#convert the taxonomy data from data frame to matrix
taxonomy_matrix <- as.matrix(taxonomy)
# get index of NA values in the taxonomy
y <- which(is.na(taxonomy)==TRUE)
# replace all NA values with "Unknown"
taxonomy_matrix[y] <- "Unknown"
# prepare the object for the phyloseq object
TAX = tax_table(taxonomy_matrix)
#remove Classification column from biotic data
biotic <- select(biotic, -Classification)
#convert the biotic data from data frame to matrix
biotic_matrix <- as.matrix(biotic)
#import the metadata of the samples, if any
metadata <- read.csv("metadata.csv", header=TRUE, row.names = 1)
# prepare the objects for the phyloseq object
OTU = otu_table(biotic_matrix, taxa_are_rows = TRUE)
META = sample_data(metadata)
if ( pointer == 'yes') {
TREE = phy_tree(tree)
# combine them all to create the phyloseq object
physeq = phyloseq(OTU, TAX, META, TREE)
} else {
physeq = phyloseq(OTU, TAX, META)
}
print("this is where the nice part begins!")
#------------------------------------------------------------------------------------------------------------------
## SECTION 2
# In this section you can change the code however you want to, in order to get the analysis needed.
# You can remove any part you want or you can add anything.
# You are also free to make any changes you want to in the commands that have been included.
# These commands are only an attempt as a "traditional" analysis in microbial community studies.
#tranpose biotic data for the calculation of diversity indices
biotic_transposed <- t(biotic)
# Number of individuals
RelativeAbundance <- rowSums(biotic_transposed)
# Number of OTUs
NumberOfOTUs <- rowSums(biotic_transposed > 0)
# Shannon index
Shannon <- diversity(biotic_transposed, base=2)
# Simpson's index
Simpson <- diversity(biotic_transposed, "simpson")
# Pielou's index
Pielou <- Shannon/log(NumberOfOTUs)
# Margalef's index
Margalef <- (NumberOfOTUs-1)/log(RelativeAbundance)
#Chao1 and ACE indices
MoreIndices <- estimateR(biotic_transposed)
MoreIndices <- t(MoreIndices)
# join them together
div_indices <- data.frame(RelativeAbundance, NumberOfOTUs, Shannon, Simpson, Pielou, Margalef)
div_indices <- cbind.data.frame(div_indices, Chao1=as.factor(MoreIndices[,2]), ACE=as.factor(MoreIndices[,4]))
# save the results in an output file
write.csv(div_indices,"div_indices.csv",quote=F)
#------------------------------------------------------------------------------------------------------------------
## SECTION 3
# create a bar plot, for the taxon rank you like, e.g Phylum
# this bar chart is created with the default colour palette of phyloseq
barchart <- plot_bar(physeq, fill = "Phylum")
pdf("barchart.pdf", width = 12)
print(barchart)
dev.off()
# ATTENTION!
# in the below commands, we chose the "Habitat" variable from our metadata file as an example.
# in addition, we chose "Phylum" as an example.
# you need to set those parameters (metadata variables and taxon level) in order to address your data and your needs.
# create a bar plot for the 100 most abundant taxa, for the Phylum rank per e.g Habitat (info included in the metadata)
top100 <- names(sort(taxa_sums(physeq), decreasing=TRUE))[1:100]
physeq.top100 <- transform_sample_counts(physeq, function(OTU) OTU/sum(OTU))
physeq.top100 <- prune_taxa(top100, physeq.top100)
#here you can change the taxonomic rank and the grouping of samples
barchart100 <- plot_bar(physeq.top100, fill="Phylum") + facet_wrap(~Habitat, scales="free_x")
pdf("barchart100.pdf")
print(barchart100)
dev.off()
# create a bar plot for all the taxa, for the Phylum rank per e.g. Habitat and Location (info included in the metadata)
# this bar chart is created with the default colour palette of phyloseq
barchart_habitat <- plot_bar(physeq, "Location", fill="Phylum") + facet_wrap(~Habitat, scales="free_x")
pdf("barchart_habitat.pdf", width =12)
print(barchart_habitat)
dev.off()
#get the data frame from the phyloseq object
pd <- psmelt(physeq)
#now let's do the bar plot again, but with custom chosen colours
#Count how many Phyla are there in your samples
HowManyPhyla <- length(unique(unlist(pd[,c("Phylum")])))
#OR
HowManyPhyla <- length(levels(as.factor(pd$Phylum)))
# Build a colour palette with number of colours as many as
# the Phyla in your samples by interpolating the palette "Set2".
# "Set2" is one of the colorblind friendly palettes
# Another example of a colorblind friendly palette is "Dark2"
# If you want, by running the command display.brewer.all(colorblindFriendly = TRUE)
# you can see all the colorblind friendly palettes of the RColorBrewer package.
## Now the "getPalette" variable, we set it in the if statement of the marker gene.
#getPalette = colorRampPalette(brewer.pal(9, "Set2"))
PhylaPalette = getPalette(HowManyPhyla)
#and do the actual plotting
barchart_palette <- ggplot(pd, aes(x = Location, y = Abundance, factor(Phylum), fill = factor(Phylum))) +
geom_bar(stat = "identity") +
facet_wrap(~Habitat, scales = "free_x") +
scale_fill_manual(values = PhylaPalette) +
labs(fill = "Phylum") +
guides(fill=guide_legend(ncol=2))
pdf("barchart_palette.pdf", width = 12)
print(barchart_palette)
dev.off()
if ( pointer == 'yes') {
# plot the OTU tree with colour coding by e.g. Habitat (info included in the metadata)
tree1 <- plot_tree(physeq, color="Habitat", label.tips="taxa_names", ladderize="left", justify = "left", size = "Abundance", plot.margin=0.3)
pdf("tree1.pdf")
print(tree1)
dev.off()
tree2 <- plot_tree(physeq, color="Habitat", label.tips="taxa_names", ladderize="left", plot.margin=0.3)
pdf("tree2.pdf")
print(tree2)
dev.off()
tree3 <- plot_tree(physeq, color="Habitat", ladderize="left", plot.margin=0.3)
pdf("tree3.pdf")
print(tree3)
dev.off()
}
# create a heatmap, for the Phylum rank
heatmap <- plot_heatmap(physeq, taxa.label="Phylum")
pdf("heatmap.pdf")
print(heatmap)
dev.off()
# plot the diversity indices with colour coding by e.g. Location (info included in the metadata)
richness <- plot_richness(physeq, measures=c("Shannon", "Simpson", "Chao1"), color="Location")
pdf("richness.pdf", width = 12)
print(richness)
dev.off()
# create the nmds plot colour coded by e.g. Location (info included in the metadata)
ord.nmds.bray <- ordinate(physeq, method="NMDS", distance="bray")
p1 <- plot_ordination(physeq, ord.nmds.bray, color="Location", title="Bray NMDS")
pdf("p1.pdf")
print(p1)
dev.off()
# Split the previous nmds plot per e.g. Habitat (info included in the metadata)
p2 <- p1 + facet_wrap(~Habitat, 3)
pdf("p2.pdf")
print(p2)
dev.off()
# Create an nmds plot per Phylum
p3 <- plot_ordination(physeq, ord.nmds.bray, type="taxa", color="Phylum", title="Bray NMDS")
pdf("p3.pdf", width =12)
print(p3)
dev.off()
# Split an nmds plot per Phylum
p4 <- p3 + facet_wrap(~Phylum, 3)
pdf("p4.pdf", width = 20)
print(p4)
dev.off()
#Run PERMANOVA to check for significance based on the Habitat factor (info included in the metadata)
OTU.adonis.Habitat <- adonis(biotic_transposed ~ metadata$Habitat, data=metadata, permutations = 999, distance = "bray")
OTU.adonis.Habitat
#Run PERMANOVA to check for significance based on the combination (+) of Location & Habitat factors
OTU.adonis <- adonis(biotic_transposed ~ metadata$Location+metadata$Habitat,data=metadata,permutations = 999,distance = "bray")
#summary(OTU.adonis)
OTU.adonis
# Create a bar plot only for the Proteobacteria
physeq.proteobacteria = subset_taxa(physeq, Phylum=="Proteobacteria")
proteo1 <- plot_bar(physeq.proteobacteria, title = "Proteobacteria per sample")
proteo2 <- plot_bar(physeq.proteobacteria, "Class", "Abundance", "Habitat", title = "Proteobacteria per class, colour by Habitat")
proteo3 <- plot_bar(physeq.proteobacteria, "Habitat", title = "Proteobacteria per Habitat")
proteo4 <- plot_bar(physeq.proteobacteria, "Class", title = "Proteobacteria per class")
proteo5 <- plot_bar(physeq.proteobacteria, "Class", "Abundance", "Class", facet_grid="Habitat~.", title = "Proteobacteria per Class, per Habitat")
pdf("proteo1.pdf")
print(proteo1)
dev.off()
pdf("proteo2.pdf")
print(proteo2)
dev.off()
pdf("proteo3.pdf")
print(proteo3)
dev.off()
pdf("proteo4.pdf")
print(proteo4)
dev.off()
pdf("proteo5.pdf", width = 12)
print(proteo5)
dev.off()
#BIO-ENV
# select the columns of the metadata file that contain numeric data
metadata_without_factors <- select_if(metadata, is.numeric)
# run the bioenv test
bioenv <- bioenv(biotic_transposed, metadata_without_factors)
# display the results
summary(bioenv)
## You may check your results!