-
Notifications
You must be signed in to change notification settings - Fork 0
/
Calculate Ave_pwd.Rmd
199 lines (152 loc) · 5.62 KB
/
Calculate Ave_pwd.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
---
title: "Calculate Ave_pwd"
output: html_document
---
```{r setup, echo=FALSE, warning=FALSE}
library(tidyverse)
library(dplyr)
library(slider)
library(ggplot2)
library(readxl)
library(liftOver)
library(annotatr)
library(GenomicRanges)
library(GenomicFeatures)
library(data.table)
library(xlsx)
library(rtracklayer)
require(readr)
library(ggridges)
library(viridis)
library("ROCR")
```
```{r funs, echo=FALSE, warning=FALSE}
read_pwd <- function(x){
dat <- fread(x, header = F)
colnames(dat) <- c("chr", "start", "end", "pwd", "beta.x", "beta.y")
dat <- dat %>% arrange(chr, start) %>% group_by(chr) %>% mutate(rank = row_number())
#%>% filter(chr=="chr19")
return(dat)
}
# Annotation function
annot <- function(dat, region){
wgpos <- GRanges(dat)
dm_annotated <- annotate_regions(
regions = wgpos,
annotations = region,
ignore.strand = TRUE,
quiet = FALSE)
dat_annot <- data.frame(dm_annotated)
return(dat_annot)
}
```
```{r echo=FALSE, warning=FALSE}
wgdf14pwd <- read_pwd("~/bismarkData/bed/wgdf14cpwdm.bed")
wgdf23pwd <- read_pwd("~/bismarkData/bed/wgdf23cpwdm.bed")
wgdf56pwd <- read_pwd("~/bismarkData/bed/wgdf56cpwdm.bed")
wgdf12pwd <- read_pwd("~/bismarkData/bed/wgdf12pwdm.bed")
wgdf34pwd <- read_pwd("~/bismarkData/bed/wgdf34pwdm.bed")
wgdf56npwd <- read_pwd("~/bismarkData/bed/wgdf56pwdm.bed")
ENJNpwd <- read_pwd("~/bismarkData/bed/ENJNpwd.bed")
ENINpwd <- read_pwd("~/bismarkData/bed/ENINpwd.bed")
EAEBpwd <- read_pwd("~/bismarkData/bed/EABpwd.bed")
FAFBpwd <- read_pwd("~/bismarkData/bed/FABpwd.bed")
KAKBpwd <- read_pwd("~/bismarkData/bed/KABpwd.bed")
PAPBpwd <- read_pwd("~/bismarkData/bed/PABpwd.bed")
SASBpwd <- read_pwd("~/bismarkData/bed/SABpwd.bed")
XAXBpwd <- read_pwd("~/bismarkData/bed/XABpwd.bed")
generef <- readGFF("~/My-Nhi Nguyen/DNAm/Caihong/DepMap/hg38.ncbiRefSeq.gtf.gz")
uniquetranscript <- generef %>% filter(type=="transcript") %>%
mutate(region_length=ifelse(end==start, 2, end-start+1)) %>%
group_by(gene_id) %>%
mutate(maxtran = max(region_length)) %>%
filter(region_length==maxtran) %>% ungroup %>%
group_by(gene_id) %>% mutate(rank=row_number()) %>% filter(rank==1) %>%
dplyr::select(gene_id, transcript_id)
generef2 <- generef %>% right_join(uniquetranscript,by=c("gene_id", "transcript_id")) %>% mutate(region_length=ifelse(end==start, 2, end-start+1)) %>%
dplyr::select(seqid, start, end, region_length, source, type, everything())
wholegene <- generef2 %>% dplyr::filter(type=="transcript") %>%
#mutate(tss_start = ifelse(strand=="+", start - 200, start),
mutate(tss_start = ifelse(strand=="+", start - 2000, start),
#tss_end = ifelse(strand=="+", end, end + 200),
tss_end = ifelse(strand=="+", end, end + 2000),
region_length = tss_end - tss_start +1,
tss_type = "wholegene") %>%
dplyr::select(-start, -end, -type) %>%
mutate(start=tss_start, end=tss_end, type=tss_type) %>%
dplyr::select(-tss_start, -tss_end, -tss_type) %>%
dplyr::select(seqid, start, end, region_length, source, type, everything())
gene_region <- GRanges(wholegene)
```
# N1-N2
```{r ingene, echo=FALSE, warning=FALSE}
summary(wgdf14pwd$pwd)
ingene <- annot(wgdf14pwd, gene_region) %>% distinct(seqnames, start, end, pwd)
summary(ingene$pwd)
notgene <- wgdf14pwd %>% anti_join(ingene, by=c("chr"="seqnames", "start", "end"))
summary(notgene$pwd)
```
# JA-JB
```{r ingene, echo=FALSE, warning=FALSE}
summary(wgdf23pwd$pwd)
ingene <- annot(wgdf23pwd, gene_region) %>% distinct(seqnames, start, end, pwd)
summary(ingene$pwd)
notgene <- wgdf23pwd %>% anti_join(ingene, by=c("chr"="seqnames", "start", "end"))
summary(notgene$pwd)
```
# IA-IB
```{r ingene, echo=FALSE, warning=FALSE}
summary(wgdf56pwd$pwd)
ingene <- annot(wgdf56pwd, gene_region) %>% distinct(seqnames, start, end, pwd)
summary(ingene$pwd)
notgene <- wgdf56pwd %>% anti_join(ingene, by=c("chr"="seqnames", "start", "end"))
summary(notgene$pwd)
```
# DA-DB
```{r ingene, echo=FALSE, warning=FALSE}
summary(wgdf12pwd$pwd)
ingene <- annot(wgdf12pwd, gene_region) %>% distinct(seqnames, start, end, pwd)
summary(ingene$pwd)
notgene <- wgdf12pwd %>% anti_join(ingene, by=c("chr"="seqnames", "start", "end"))
summary(notgene$pwd)
```
# HA-HB
```{r ingene, echo=FALSE, warning=FALSE}
summary(wgdf34pwd$pwd)
ingene <- annot(wgdf34pwd, gene_region) %>% distinct(seqnames, start, end, pwd)
summary(ingene$pwd)
notgene <- wgdf34pwd %>% anti_join(ingene, by=c("chr"="seqnames", "start", "end"))
summary(notgene$pwd)
```
# MA-MB
```{r ingene, echo=FALSE, warning=FALSE}
summary(wgdf56npwd$pwd)
ingene <- annot(wgdf56npwd, gene_region) %>% distinct(seqnames, start, end, pwd)
summary(ingene$pwd)
notgene <- wgdf56npwd %>% anti_join(ingene, by=c("chr"="seqnames", "start", "end"))
summary(notgene$pwd)
```
# EN-JN
```{r ingene, echo=FALSE, warning=FALSE}
summary(ENJNnpwd$pwd)
ingene <- annot(ENJNnpwd, gene_region) %>% distinct(seqnames, start, end, pwd)
summary(ingene$pwd)
notgene <- ENJNnpwd %>% anti_join(ingene, by=c("chr"="seqnames", "start", "end"))
summary(notgene$pwd)
```
# EN-IN
```{r ingene, echo=FALSE, warning=FALSE}
summary(ENINnpwd$pwd)
ingene <- annot(ENINnpwd, gene_region) %>% distinct(seqnames, start, end, pwd)
summary(ingene$pwd)
notgene <- ENINnpwd %>% anti_join(ingene, by=c("chr"="seqnames", "start", "end"))
summary(notgene$pwd)
```
# EAEB
```{r ingene, echo=FALSE, warning=FALSE}
summary(EAEBpwd$pwd)
ingene <- annot(ENINnpwd, gene_region) %>% distinct(seqnames, start, end, pwd)
summary(ingene$pwd)
notgene <- ENINnpwd %>% anti_join(ingene, by=c("chr"="seqnames", "start", "end"))
summary(notgene$pwd)
```