-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPipeline_RNAseq
258 lines (204 loc) · 12.5 KB
/
Pipeline_RNAseq
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
library ejemplo: RaizCK1
Indice de rfam (ver archivo rfam dentro de esta carpeta)
manager@bl8vbox[manager] bowtie-build input.fa nombre_base (nombre base es el nombre que va a tener el índice)
Limpieza
bioinfo@bioinfo-pc[bioinfo] bowtie -v 2 /home/bioinfo/Agustin/Genoma_Naranjo/RFam_Index/RFamNaranja -1 /media/bioinfo/ToshibaExt/Naranjo_Fastq/Raiz_Xuegan/RaizCK1_1.fastq -2 /media/bioinfo/ToshibaExt/Naranjo_Fastq/Raiz_Xuegan/RaizCK1_2.fastq --un /media/bioinfo/ToshibaExt/Naranjo_Fastq/Raiz_Xuegan/Clean/RaizCK1_Clean_Paired.fastq --al /media/bioinfo/ToshibaExt/Naranjo_Fastq/Raiz_Xuegan/Output_Rfam/RaizCK1_Ribosomales_Paired.fastq
Indice - Hisat2
bioinfo@bioinfo-pc[bioinfo] hisat2-build /home/bioinfo/Agustin/Genoma_Naranjo/csi.chromosome.fa /home/bioinfo/Agustin/Genoma_Naranjo/Csinensis/v1.1/Index_Hisat2_Chinos/csi
Mapeo
bioinfo@bioinfo-pc[bioinfo] hisat2 --no-unal -x /home/bioinfo/Agustin/Genoma_Naranjo/Csinensis/v1.1/Index_Hisat2/Csi -S /media/bioinfo/ToshibaExt/Naranjo_Fastq/Raiz_Xuegan/Output_Mapeo/RaizCK1.sam -1 /media/bioinfo/ToshibaExt/Naranjo_Fastq/Raiz_Xuegan/Clean/RaizCK1_Clean_Paired_1.fastq -2 /media/bioinfo/ToshibaExt/Naranjo_Fastq/Raiz_Xuegan/Clean/RaizCK1_Clean_Paired_2.fastq 2> /media/bioinfo/ToshibaExt/Naranjo_Fastq/Raiz_Xuegan/Output_Mapeo/Rep_Hisat_RaizCK1_Clean.txt
Si vamos a mapear libraries single-ended, en vez de -1 y -2, se usa -U
Si vamos a mapear un archivo fasta, se usa -f en vez de las anteriores
Htseq-counts
htseq-count -f bam -s no -t exon -i transcript_id -m union /media/bioinfo/ToshibaExt/Naranjo_Fastq/Hoja_Xuegan/Output_Mapeo/Hoja_Xck1.bam /home/bioinfo/Agustin/Genoma_Naranjo/Csinensis/v1.1/annotation/exons.gff3 > Hoja_Xck1.counts
Alternativa - Featurecounts (Me gusta más, es mucho mas rápido)
#Ojo con -t y -g, puede variar según el archivo a analizar, yo lo hago por prueba y error
featureCounts -p -t gene -g ID -a /home/bioinfo/Agustin/Genoma_Naranjo/Csinensis/v1.1/annotation/Csinensis_154_v1.1.gene.gff3 -o /media/bioinfo/ToshibaExt/Naranjo_Fastq/Abscission/Output_Mapeo/Archivos_Bam/Counts_Clean/Rh2_R2_Featurecounts.tab /media/bioinfo/ToshibaExt/Naranjo_Fastq/Abscission/Output_Mapeo/Archivos_Bam/Rh2_R2_name.bam
Para DESeq2
Two csv files are generated, move them to DESeq_out and generate separate files for each sample, containing transcript_ID and counts.
Start R in DEseq_out folder
> #setwd('/Users/hernanrosli/Documents/Trabajo/BTI_Backup/EXPERIMENTS/lncRNAs/33_samples/DEseq_analysis/')
> setwd("/media/bioinfo/ToshibaExt/Naranjo_Fastq/HTSeq_Gene")
> #directory<-'~/Documents/Trabajo/BTI_Backup/EXPERIMENTS/lncRNAs/33_samples/DEseq_analysis/'
> directory<-'/media/bioinfo/ToshibaExt/Naranjo_Fastq/HTSeq_Gene/'
> #mock_MAMP_30min_1.txt=read.table('mock_MAMP_30min_1.txt')
> #mock_MAMP_30min_2.txt=read.table('mock_MAMP_30min_2.txt')
> #mock_MAMP_30min_3.txt=read.table('mock_MAMP_30min_3.txt')
> #flg22_30min_1.txt=read.table('flg22_30min_1.txt')
> #flg22_30min_2.txt=read.table('flg22_30min_2.txt')
> #flg22_30min_3.txt=read.table('flg22_30min_3.txt')
> Hoja1_counts=read.table('Hoja1_htseq.counts')
> Hoja2_counts=read.table('Hoja2_htseq.counts')
> Pulpa1_counts=read.table('Pulpa1_htseq.counts')
> Pulpa2_counts=read.table('Pulpa2_htseq.counts')
> #sampleFiles<-c('mock_MAMP_30min_1.txt', 'mock_MAMP_30min_2.txt', 'mock_MAMP_30min_3.txt', 'flg22_30min_1.txt', 'flg22_30min_2.txt', 'flg22_30min_3.txt')
> sampleFiles<-c('Hoja1_htseq.counts', 'Hoja2_htseq.counts', 'Pulpa1_htseq.counts', 'Pulpa2_htseq.counts', 'Pulpa3_htseq.counts')
> #sampleCondition<-c('mock_MAMP_30min', 'mock_MAMP_30min', 'mock_MAMP_30min', 'flg22_30min', 'flg22_30min', 'flg22_30min')
> sampleCondition<-c('Hoja', 'Hoja', 'Pulpa', 'Pulpa', 'Pulpa')
> library(DESeq2)
> sampleTable <- data.frame(sampleName=sampleFiles, fileName=sampleFiles, condition=sampleCondition)
> ddsHTSeq<-DESeqDataSetFromHTSeqCount(sampleTable=sampleTable, directory=directory, design=~condition)
> #colData(ddsHTSeq)$condition<-factor(colData(ddsHTSeq)$condition, levels=c("mock_MAMP_30min","flg22_30min"))
> colData(ddsHTSeq)$condition<-factor(colData(ddsHTSeq)$condition, levels=c("Hoja","Pulpa"))
> dds<-DESeq(ddsHTSeq)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
> res<-results(dds)
> #write.table(res, "./mock_flg22_30min_DESeq_results.txt", sep="\t")
> write.table(res, "HojavsPulpa_Resultado.tab", sep="\t")
> table_counts_normalized <- counts(dds, normalized=TRUE)
> write.table(table_counts_normalized, "Normalizadas.tab", sep="\t")
Comando alternativo:
Extraido de https://gif.biotech.iastate.edu/rnaseq-analysis-walk-through
# Import the data
countdata <- read.table("counts.txt", header=TRUE, row.names=1)
# Remove .bam or .sam from filenames
colnames(countdata) <- gsub("\\.[sb]am$", "", colnames(countdata))
# Convert to matrix
countdata <- as.matrix(countdata)
head(countdata)
# Assign condition (first four are controls, second four and third four contain two different experiments)
(condition <- factor(c(rep("ctl", 4), rep("inf1", 4), rep("inf2", 4))))
# Analysis with DESeq2
library(DESeq2)
# Create a coldata frame and instantiate the DESeqDataSet. See ?DESeqDataSetFromMatrix
(coldata <- data.frame(row.names=colnames(countdata), condition))
dds <- DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition)
dds
# Run the DESeq pipeline
dds <- DESeq(dds)
# Plot dispersions
png("qc-dispersions.png", 1000, 1000, pointsize=20)
plotDispEsts(dds, main="Dispersion plot")
dev.off()
# Regularized log transformation for clustering/heatmaps, etc
rld <- rlogTransformation(dds)
head(assay(rld))
hist(assay(rld))
# Colors for plots below
## Ugly:
## (mycols <- 1:length(unique(condition)))
## Use RColorBrewer, better
library(RColorBrewer)
(mycols <- brewer.pal(8, "Dark2")[1:length(unique(condition))])
# Sample distance heatmap
sampleDists <- as.matrix(dist(t(assay(rld))))
library(gplots)
png("qc-heatmap-samples.png", w=1000, h=1000, pointsize=20)
heatmap.2(as.matrix(sampleDists), key=F, trace="none",
col=colorpanel(100, "black", "white"),
ColSideColors=mycols[condition], RowSideColors=mycols[condition],
margin=c(10, 10), main="Sample Distance Matrix")
dev.off()
# Principal components analysis
## Could do with built-in DESeq2 function:
## DESeq2::plotPCA(rld, intgroup="condition")
## I (Stephen Turner) like mine better:
rld_pca <- function (rld, intgroup = "condition", ntop = 500, colors=NULL, legendpos="bottomleft", main="PCA Biplot", textcx=1, ...) {
require(genefilter)
require(calibrate)
require(RColorBrewer)
rv = rowVars(assay(rld))
select = order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
pca = prcomp(t(assay(rld)[select, ]))
fac = factor(apply(as.data.frame(colData(rld)[, intgroup, drop = FALSE]), 1, paste, collapse = " : "))
if (is.null(colors)) {
if (nlevels(fac) >= 3) {
colors = brewer.pal(nlevels(fac), "Paired")
} else {
colors = c("black", "red")
}
}
pc1var <- round(summary(pca)$importance[2,1]*100, digits=1)
pc2var <- round(summary(pca)$importance[2,2]*100, digits=1)
pc1lab <- paste0("PC1 (",as.character(pc1var),"%)")
pc2lab <- paste0("PC1 (",as.character(pc2var),"%)")
plot(PC2~PC1, data=as.data.frame(pca$x), bg=colors[fac], pch=21, xlab=pc1lab, ylab=pc2lab, main=main, ...)
with(as.data.frame(pca$x), textxy(PC1, PC2, labs=rownames(as.data.frame(pca$x)), cex=textcx))
legend(legendpos, legend=levels(fac), col=colors, pch=20)
# rldyplot(PC2 ~ PC1, groups = fac, data = as.data.frame(pca$rld),
# pch = 16, cerld = 2, aspect = "iso", col = colours, main = draw.key(key = list(rect = list(col = colours),
# terldt = list(levels(fac)), rep = FALSE)))
}
png("qc-pca.png", 1000, 1000, pointsize=20)
rld_pca(rld, colors=mycols, intgroup="condition", xlim=c(-75, 35))
dev.off()
# Get differential expression results
res <- results(dds)
table(res$padj<0.05)
## Order by adjusted p-value
res <- res[order(res$padj), ]
## Merge with normalized count data
resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE)
names(resdata)[1] <- "Gene"
head(resdata)
## Write results
write.csv(resdata, file="diffexpr-results.csv")
## Examine plot of p-values
hist(res$pvalue, breaks=50, col="grey")
## Examine independent filtering
attr(res, "filterThreshold")
plot(attr(res,"filterNumRej"), type="b", xlab="quantiles of baseMean", ylab="number of rejections")
## MA plot
## Could do with built-in DESeq2 function:
## DESeq2::plotMA(dds, ylim=c(-1,1), cex=1)
## I like mine better:
maplot <- function (res, thresh=0.05, labelsig=TRUE, textcx=1, ...) {
with(res, plot(baseMean, log2FoldChange, pch=20, cex=.5, log="x", ...))
with(subset(res, padj<thresh), points(baseMean, log2FoldChange, col="red", pch=20, cex=1.5))
if (labelsig) {
require(calibrate)
with(subset(res, padj<thresh), textxy(baseMean, log2FoldChange, labs=Gene, cex=textcx, col=2))
}
}
png("diffexpr-maplot.png", 1500, 1000, pointsize=20)
maplot(resdata, main="MA Plot")
dev.off()
## Volcano plot with "significant" genes labeled
volcanoplot <- function (res, lfcthresh=2, sigthresh=0.05, main="Volcano Plot", legendpos="bottomright", labelsig=TRUE, textcx=1, ...) {
with(res, plot(log2FoldChange, -log10(pvalue), pch=20, main=main, ...))
with(subset(res, padj<sigthresh ), points(log2FoldChange, -log10(pvalue), pch=20, col="red", ...))
with(subset(res, abs(log2FoldChange)>lfcthresh), points(log2FoldChange, -log10(pvalue), pch=20, col="orange", ...))
with(subset(res, padj<sigthresh & abs(log2FoldChange)>lfcthresh), points(log2FoldChange, -log10(pvalue), pch=20, col="green", ...))
if (labelsig) {
require(calibrate)
with(subset(res, padj<sigthresh & abs(log2FoldChange)>lfcthresh), textxy(log2FoldChange, -log10(pvalue), labs=Gene, cex=textcx, ...))
}
legend(legendpos, xjust=1, yjust=1, legend=c(paste("FDR<",sigthresh,sep=""), paste("|LogFC|>",lfcthresh,sep=""), "both"), pch=20, col=c("red","orange","green"))
}
png("diffexpr-volcanoplot.png", 1200, 1000, pointsize=20)
volcanoplot(resdata, lfcthresh=1, sigthresh=0.05, textcx=.8, xlim=c(-2.3, 2))
dev.off()
Para hacer los heatmaps
library(ggplot2)
library(gplots)
setwd("/media/bioinfo/ToshibaExt/Naranjo_Fastq/HTSeq_Gene/DESeq2/Heatmap")
dge <- read.csv("Completo_Familias.tab", sep="\t")
row.names(dge) <- dge$X
dge <- subset( dge, select = -1 )
dge_matrix <- data.matrix(dge)
my_palette <- colorRampPalette(c("pink", "red"))(n = 25)
heatmap.2(as.matrix(dge_matrix), col=my_palette, breaks = seq(0, n, length.out = 26), density.info="none", cexRow=1,cexCol=1, margins=c(4,10), trace="none", dendrogram='none', symm=F,symkey=T,symbreaks=T, scale="none", Rowv=FALSE, Colv = FALSE)
density.info = etiqueta en la referencia de color
margins = márgenes del gráfico
trace = traza unas lineas en las filas, columnas o ambas (none no traza nada)
dendrogram = agrega o quita el dendrograma
symm = para la simetria del eje X, solo debe ser true si la matriz es cuadrada
scale = tiene que ser none para que respete el valor de las counts de la matriz
breaks = son las divisiones que crea en la leyenda
length.out = siempre tiene que ser mayor por 1 al n seteado en cantidad de colores
symbreaks = hace los cortes simetricos al cero si la matriz tiene valores negativos
symkey = indica si la leyenda de color es o no simetrica al cero
Rowv = indica el orden de las filas. Por defecto TRUE, pero si quiero que mantenga el orden de la matriz, debe ser FALSE
Colv = indica el orden de las columnas. Por defecto TRUE, pero si quiero que mantenga el orden de la matriz, debe ser FALSE
FUNDAMENTAL: CORREGIR MANUALMENTE EL RANGO DE BREAKS, SEGUN EL VALOR MAS GRANDE DE COUNTS EN LA MATRIZ.
> my_palette <- colorRampPalette(c("pink", "red"))(n = 25)
> heatmap.2(as.matrix(dge_matrix), col=my_palette,
+ breaks = seq(0, 1500, length.out = 26), density.info="none", cexRow=1,cexCol=1, margins=c(4,10), trace="none", dendrogram='none', symm=F,symkey=T,symbreaks=T, scale="none")
heatmap.2(as.matrix(dge_matrix), col=my_palette,
breaks = seq(-3, 3, length.out = 101), density.info="none", trace="none",
dendrogram='none', symm=F,symkey=T,symbreaks=T, scale="none")