-
Notifications
You must be signed in to change notification settings - Fork 0
/
snATAC_DARs.Rmd
341 lines (262 loc) · 10.5 KB
/
snATAC_DARs.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
## Seurat DARs
Celltype and cluster DARs
```{r eval=FALSE}
library(Seurat)
library(Signac)
library(tidyverse)
NucSeq.atac <- readRDS(file='data/NucSeq_macs2Peaks_signac.rds')
# compute DARs for celltypes:
Idents(NucSeq.peaks) <- NucSeq.peaks$monocle_clusters_umap_Cell.Type
celltype_peaks <- FindAllMarkers(
object = NucSeq.peaks,
min.pct = 0.05,
logfc.threshold=0,
test.use = 'LR'
)
save(celltype_peaks, file='data/celltype_DARs.rda')
Idents(NucSeq.peaks) <- NucSeq.peaks$monocle_clusters_umap_ID
cluster_peaks <- FindAllMarkers(
object = NucSeq.peaks,
min.pct = 0.05, # min.pct = 0.1 for cluster_DARs2
logfc.threshold=0.0,
test.use = 'LR'
)
save(cluster_peaks, file='data/cluster_DARs.rda')
```
Diagnosis DARs:
```{r eval=FALSE}
# control vs AD in each celltype
diagnosis_celltype_peaks <- data.frame()
for(celltype in unique(NucSeq.peaks$monocle_clusters_umap_Cell.Type)){
print(celltype)
cur_seurat <- NucSeq.peaks[,NucSeq.peaks$monocle_clusters_umap_Cell.Type == celltype]
Idents(cur_seurat) <- cur_seurat$Diagnosis
cur_markers <- FindMarkers(
object = cur_seurat,
ident.1 = 'AD',
ident.2 = 'Control',
min.pct = 0.05,
logfc.threshold=0,
test.use = 'LR'
)
cur_markers$cluster <- celltype
cur_markers$gene <- rownames(cur_markers)
diagnosis_celltype_peaks <- rbind(diagnosis_celltype_peaks, cur_markers)
}
save(diagnosis_celltype_peaks, file='data/diagnosis_celltype_DARs.rda')
# control vs AD in each cluster
diagnosis_cluster_peaks <- data.frame()
for(celltype in unique(NucSeq.peaks$monocle_clusters_umap_ID)){
print(celltype)
cur_seurat <- NucSeq.peaks[,NucSeq.peaks$monocle_clusters_umap_ID == celltype]
Idents(cur_seurat) <- cur_seurat$Diagnosis
cur_markers <- FindMarkers(
object = cur_seurat,
ident.1 = 'AD',
ident.2 = 'Control',
min.pct = 0.05,
logfc.threshold=0,
test.use = 'LR'
)
cur_markers$cluster <- celltype
cur_markers$gene <- rownames(cur_markers)
diagnosis_cluster_peaks <- rbind(diagnosis_cluster_peaks, cur_markers)
print(dim(cur_markers))
}
save(diagnosis_cluster_peaks, file='data/diagnosis_cluster_DARs.rda')
load(file='data/diagnosis_cluster_DARs.rda')
all.equal(rownames(cur_markers), rownames(subset(diagnosis_celltype_peaks, cluster=='OPC')))
```
Write DARs to bed files
```{r eval=FALSE}
library(Signac)
dir.create('data/DARs/cluster')
# load DARs:
load(file='data/celltype_DARs.rda')
load(file='data/cluster_DARs3.rda')
load(file='data/diagnosis_celltype_DARs.rda')
load(file='data/diagnosis_cluster_DARs.rda')
# write to tsv files for the paper!!!
celltype_peaks <- dplyr::rename(celltype_peaks, c(Peak=gene, cell_type=cluster))
cluster_peaks <- dplyr::rename(cluster_peaks, c(Peak=gene))
diagnosis_celltype_peaks <- dplyr::rename(diagnosis_celltype_peaks, c(Peak=gene, cell_type=cluster))
diagnosis_cluster_peaks <- dplyr::rename(diagnosis_cluster_peaks, Peak=gene)
write.table(celltype_peaks, file='data/celltype_DARs.tsv', sep='\t', quote=FALSE, row.names=FALSE)
write.table(cluster_peaks, file='data/cluster_DARs.tsv', sep='\t', quote=FALSE, row.names=FALSE)
write.table(diagnosis_celltype_peaks, file='data/diagnosis_celltype_DARs.tsv', sep='\t', quote=FALSE, row.names=FALSE)
write.table(diagnosis_cluster_peaks, file='data/diagnosis_cluster_DARs.tsv', sep='\t', quote=FALSE, row.names=FALSE)
DAR_list <- list(
"celltype" = celltype_peaks,
"cluster" = cluster_peaks,
"diagnosis_celltype" = diagnosis_celltype_peaks,
"diagnosis_cluster" = diagnosis_cluster_peaks
)
# peak names in ArchR that are same style
proj@peakSet$site_name <- paste0(as.character(seqnames(proj@peakSet)), ':', start(proj@peakSet), '-', end(proj@peakSet))
# write all peaks to file to serve as background set:
peak_ranges <- Signac::StringToGRanges(proj@peakSet$site_name, sep = c(":", "-")) %>% sort
write.table(
as.data.frame(peak_ranges)[,1:3],
file='data/DARs/allPeaks.bed',
row.names=F, col.names=F, sep='\t', quote=F
)
peak_ranges <- Signac::StringToGRanges(subset(proj@peakSet, peakType == 'Distal') %>% .$site_name, sep = c(":", "-")) %>% sort
write.table(
as.data.frame(peak_ranges)[,1:3],
file='data/DARs/distalPeaks.bed',
row.names=F, col.names=F, sep='\t', quote=F
)
peak_ranges <- Signac::StringToGRanges(subset(proj@peakSet, peakType != 'Distal') %>% .$site_name, sep = c(":", "-")) %>% sort
write.table(
as.data.frame(peak_ranges)[,1:3],
file='data/DARs/proximalPeaks.bed',
row.names=F, col.names=F, sep='\t', quote=F
)
# some settings
min_peaks <- 25 # min number of up-regulated peaks per cluster. skips if thresh is not met.
upreg <- TRUE # up-regulated or down-regulated peaks?
peak_outdir <- 'data/DARs/' # directory to dump output files
# loop through DAR list:
for(DARs in names(DAR_list)){
print(DARs)
# crete output directory if it hasn't been made already
dir.create(paste0(peak_outdir,DARs))
# get current set of DARs
cur_DARs <- DAR_list[[DARs]]
# add a column for peak type
cur_DARs$peakType <- as.character(proj@peakSet$peakType)[match(cur_DARs$gene, proj@peakSet$site_name)]
# loop over clusters:
for(clust in unique(cur_DARs$cluster)){
print(clust)
# get peaks for this celltype
cur_peaks <- subset(cur_DARs, cluster == clust)
# get up-regulated peaks
if(upreg){
cur_peaks <- subset(cur_peaks, avg_logFC >= 0)
file_suffix = 'upregulated'
} else{
cur_peaks <- subset(cur_peaks, avg_logFC < 0)
file_suffix = 'downregulated'
}
if(dim(cur_peaks) < min_peaks){
print(paste('too few peaks, skipping', clust))
next
}
# split by distal and gene-proximal peaks and write:
distal_peaks <- subset(cur_peaks, peakType == 'Distal')
proximal_peaks <- subset(cur_peaks, peakType != 'Distal')
# convert to GRanges objects and sort:
distal_ranges <- Signac::StringToGRanges(distal_peaks$gene, sep = c(":", "-")) %>% sort
proximal_ranges <- Signac::StringToGRanges(proximal_peaks$gene, sep = c(":", "-")) %>% sort
cur_peak_ranges <- Signac::StringToGRanges(cur_peaks$gene, sep = c(":", "-")) %>% sort
# write to bed files
write.table(
as.data.frame(distal_ranges)[,1:3],
file=paste0(peak_outdir, DARs, '/', clust, '_',file_suffix, '_distal.bed'),
row.names=F, col.names=F, sep='\t', quote=F
)
write.table(
as.data.frame(proximal_ranges)[,1:3],
file=paste0(peak_outdir, DARs, '/', clust, '_', file_suffix, '_proximal.bed'),
row.names=F, col.names=F, sep='\t', quote=F
)
write.table(
as.data.frame(cur_peak_ranges)[,1:3],
file=paste0(peak_outdir, DARs, '/', clust, '_', file_suffix, '.bed'),
row.names=F, col.names=F, sep='\t', quote=F
)
}
}
```
Plot output from rGREAT (which was run using the above .bed files)
```{r eval=FALSE}
library(tidyverse)
library(cowplot)
theme_set(theme_cowplot())
wrapText <- function(x, len) {
sapply(x, function(y) paste(strwrap(y, len), collapse = "\n"), USE.NAMES = FALSE)
}
great_parent_dir <- 'data/OUTPUTs/'
great_output_dirs <- paste0(great_parent_dir, dir(great_parent_dir), '/All_GOAnnotations/')
output_groups <- c('celltype', 'cluster', 'diagnosis_celltype', 'diagnosis_cluster')
# settings:
n_terms <- 20
n_terms_barplot <- 20
# heatmap of just up-regulated proximal peaks
upregulated <- TRUE
# loop through each output dir
for(i in 1:length(great_output_dirs)){
cur_output_dir <- great_output_dirs[i]
great_group <- output_groups[i]
# split files by proximal, distal, all peaks:
cur_files <- dir(cur_output_dir)
cur_files_list <- list(
'proximal' = cur_files[grepl('proximal', cur_files)],
'distal' = cur_files[grepl('distal', cur_files)]
)
cur_files_list$allPeaks <- cur_files[!(cur_files %in% c(cur_files_list$proximal, cur_files_list$distal))]
# loop through files list:
for(peakType in names(cur_files_list)){
cur_files <- cur_files_list[[peakType]]
fig_name <- paste0(great_group, '_', peakType, '_')
if(upregulated){
cur_files <- cur_files[grepl('upregulated', cur_files)]
fig_name <- paste0(fig_name, 'upregulated')
} else{
cur_files <- cur_files[grepl('downregulated', cur_files)]
fig_name <- paste0(fig_name, 'downregulated')
}
# combine files into one table:
dir.create(paste0('figures/barplots/', great_group))
great_df <- data.frame()
for(file in cur_files){
cur_df <- readRDS(paste0(cur_output_dir, file))[[2]]
# add column for cluster or celltype:
group <- unlist(str_split(file, '_'))[1]
cur_df$group <- group
# add significance levels:
measure <- "Hyper_Adjp_BH"
cur_df$Significance <- ifelse(cur_df[[measure]] > 0.05, '', ifelse(cur_df[[measure]] > 0.005, '*', ifelse(cur_df[[measure]] > 0.0005, '**', '***')))
# add text wrapping:
cur_df$wrap <- wrapText(cur_df$name, 45)
# plot barplot
plot_df <- head(cur_df, n_terms_barplot)
p <- ggplot(plot_df, aes(y=log(Hyper_Fold_Enrichment), x=reorder(wrap, log(Hyper_Fold_Enrichment)))) +
geom_bar(stat='identity', color='black', fill='black') +
xlab('') +
coord_flip() +
ggtitle(paste(unique(plot_df$group), fig_name))
plot_name <- paste0(unique(plot_df$group), '_', fig_name)
pdf(paste0('figures/barplots/', great_group, '/', plot_name, '.pdf'), width=8, height=10)
print(p)
dev.off()
# get top entries by p-value (default sorting of these files):
cur_df <- head(cur_df, n_terms)
great_df <- rbind(great_df, cur_df)
}
# force levels for GO terms:
great_df$name <- factor(great_df$name, levels=rev(unique(great_df$name)))
great_df$wrap <- factor(great_df$wrap, levels=rev(unique(great_df$wrap)))
# plot heatmap
p <- ggplot(great_df, aes(group, wrap, fill=log(Hyper_Fold_Enrichment))) +
geom_tile() +
geom_text(aes(label=Significance)) +
scale_fill_gradient(low = "white", high = "red", space = "Lab",
guide = guide_colorbar(barwidth=.5, barheight=7.5, ticks=FALSE)) +
xlab('') + ylab('') + labs(fill = "log(fold enrichment)") +
theme(
axis.text.x=element_text(angle=90, vjust=0.5),
panel.background=element_blank(),
panel.grid.major=element_blank(),
panel.grid.minor=element_blank(),
plot.background=element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=0.5)
)
pdf(paste0('figures/heatmaps/', fig_name,'.pdf'), width=10, height=16)
print(p)
dev.off()
great_df %>% dplyr::select(-c(Significance, wrap)) %>%
write.table(paste0('data/', fig_name, '.tsv'), quote=FALSE, row.names=FALSE, sep='\t')
}
}
```