-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDESeq2.R
130 lines (103 loc) · 4.94 KB
/
DESeq2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
BiocManager::install("DESeq2")
BiocManager::install("genefilter")
install.packages("readxl")
# Load libraries
library(readxl)
library(DESeq2)
library(tidyverse)
library(pheatmap)
library(gplots)
library(RColorBrewer)
library(genefilter)
# Read in metadata
MetaData <- read_excel("SraRunTable1.xlsm")
# Read in count files
count_files <- c("SRR6040092_Aligned.sortedByCoord.out_count.txt",
"SRR6040093_Aligned.sortedByCoord.out_count.txt",
"SRR6040094_Aligned.sortedByCoord.out_count.txt",
"SRR6040096_Aligned.sortedByCoord.out_count.txt",
"SRR6040097_Aligned.sortedByCoord.out_count.txt",
"SRR6156066_Aligned.sortedByCoord.out_count.txt",
"SRR6156067_Aligned.sortedByCoord.out_count.txt",
"SRR6156069_Aligned.sortedByCoord.out_count.txt")
# Create sampleTable
sampleNames <- c("SRR6040092_Leaf", "SRR6040093_Root", "SRR6040094_Aril", "SRR6040096_Stem",
"SRR6040097_Aril", "SRR6156066_Aril", "SRR6156067_Aril", "SRR6156069_Aril")
sampleTable <- matrix(ncol = 8, nrow = 3792)
colnames(sampleTable) <- sampleNames
rownames_data <- read.table(count_files[1], header = FALSE, colClasses = "character")[, 1]
rownames(sampleTable) <- rownames_data
for (i in seq_along(count_files)) {
# Read the counts from the second column of each count file
counts <- read.table(count_files[i], header = FALSE)[, 2]
# Assign the counts to the corresponding column in the sampleTable matrix
sampleTable[, i] <- counts
}
# Create DESeq object
ddsHTSeq_tissue <- DESeqDataSetFromMatrix(countData = sampleTable, colData = MetaData, design = ~ tissue)
ddsHT_seq_cultivar <- DESeqDataSetFromMatrix(countData = sampleTable, colData = MetaData, design = ~ Cultivar)
# Perform DESeq2
dds <- DESeq(ddsHTSeq_tissue)
results <- results(dds)
normalized_counts <- counts(dds, normalized = TRUE)
# Perform PCA
rld <- rlog(dds)
plotPCA(rld, intgroup = c("Cultivar", "tissue"))
# Heatmap
# Extract the row names (gene names) for the top 10 differentially expressed genes
top_10_DE_genes <- head(row.names(results[order(results$padj), ]), 10)
# Extract the row names (gene names) for the top 10 differentially expressed genes
top_10_DE_genes <- head(row.names(results[order(results$padj), ]), 10)
# Subset rlog data to include only the top 10 differentially expressed genes
rlog_data_top_10_DE <- normalized_counts[top_10_DE_genes, ]
# Set row names to the data matrix
rownames(rlog_data_top_10_DE) <- top_10_DE_genes
# Extract the tissue types and cultivar from the metadata
tissue_types <- MetaData$tissue
cultivar <- MetaData$Cultivar
# Create column names without "SRR" names
col_names_without_SRR <- gsub("^SRR[0-9]+_", "", colnames(rlog_data_top_10_DE))
# Set column names to include both sample names, tissue types, and cultivar without "SRR" names
colnames(rlog_data_top_10_DE) <- paste(col_names_without_SRR, cultivar, sep = " - ")
# Create heatmap for the top 10 differentially expressed genes
pheatmap(as.matrix(rlog_data_top_10_DE),
scale = "row",
col = colorRampPalette(rev(brewer.pal(9, "RdBu")))(100),
main = "Top 10 Differentially Expressed Genes for design",
labRow = TRUE, # Show row labels (gene names)
labCol = TRUE) # Show column labels
# Volcano plot
# Calculate the log2 fold change and -log10 adjusted p-values
log2_fold_change <- results$log2FoldChange
adjusted_p_values <- -log10(results$padj)
# Remove non-finite values from both vectors
finite_indices <- is.finite(log2_fold_change) & is.finite(adjusted_p_values)
log2_fold_change <- log2_fold_change[finite_indices]
adjusted_p_values <- adjusted_p_values[finite_indices]
# Create a volcano plot
plot(log2_fold_change, adjusted_p_values,
main = "Volcano Plot for cultivar design",
xlab = "Log2 Fold Change",
ylab = "-Log10 Adjusted P-value",
xlim = c(-max(abs(log2_fold_change)), max(abs(log2_fold_change))),
ylim = c(0, max(adjusted_p_values)),
pch = 20, # Use filled circles as points
col = ifelse(abs(log2_fold_change) > 1 & results$padj[finite_indices] < 0.05, "red", "black")) # Highlight significant points
# Add a horizontal line at -log10(0.05) to indicate significance threshold
abline(h = -log10(0.05), col = "blue", lty = 2)
# Citations
citEntry(entry="article",
title = "Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2",
author = personList( as.person("Michael I. Love"),
as.person("Wolfgang Huber"),
as.person("Simon Anders")),
year = 2014,
journal = "Genome Biology",
doi = "10.1186/s13059-014-0550-8",
volume = 15,
issue = 12,
pages = 550,
textVersion =
paste("Love, M.I., Huber, W., Anders, S.",
"Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2",
"Genome Biology 15(12):550 (2014)" ) )