-
Notifications
You must be signed in to change notification settings - Fork 0
/
count_genes.R
78 lines (57 loc) · 2.26 KB
/
count_genes.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
### This script counts genes within the QTL (without candidate gene analysis)
suppressMessages(library(rtracklayer))
suppressMessages(library(readr))
suppressMessages(library(GenomicFeatures))
#' BED to GRanges
#'
#' This function loads a BED-like file and stores it as a GRanges object.
#' The tab-delimited file must be ordered as 'chr', 'start', 'end', 'id', 'score', 'strand'.
#' The minimal BED file must have the 'chr', 'start', 'end' columns.
#' Any columns after the strand column are ignored.
#'
#' @param file Location of your file
#' @keywords BED GRanges
#' @export
#' @examples
#' bed_to_granges('my_bed_file.bed')
bed_to_granges <- function(file){
df <- read.table(file,
header=F,
stringsAsFactors=F)
if(length(df) > 6){
df <- df[,-c(7:length(df))]
}
if(length(df)<3){
stop("File has less than 3 columns")
}
header <- c('chr','start','end','id','score','strand')
names(df) <- header[1:length(names(df))]
if('strand' %in% colnames(df)){
df$strand <- gsub(pattern="[^+-]+", replacement = '*', x = df$strand)
}
library("GenomicRanges")
if(length(df)==3){
gr <- with(df, GRanges(chr, IRanges(start, end)))
} else if (length(df)==4){
gr <- with(df, GRanges(chr, IRanges(start, end), id=id))
} else if (length(df)==5){
gr <- with(df, GRanges(chr, IRanges(start, end), id=id, score=score))
} else if (length(df)==6){
gr <- with(df, GRanges(chr, IRanges(start, end), id=id, score=score, strand=strand))
}
return(gr)
}
#------------------------------------------------------------------------------#
# Download from FTP
# ftp_url <- "https://ftp.ensembl.org/pub/release-79/gtf/mus_musculus/Mus_musculus.GRCm38.79.gtf.gz"
gtf <- rtracklayer::import('source_data/Mus_musculus.GRCm38.79.gtf')
print('Imported Mus_musculus.GRCm38.79.gtf')
gtf_genes <- gtf[gtf$type == 'gene'] # filter down to only genes
qtls <- bed_to_granges('source_data/qtl-regions.bed')
print('Imported qtl-regions.bed')
hits <- findOverlaps(query = qtls, subject = gtf_genes)
qtlids <- paste0('Qpa', seq(1,8))
gene_counts <- data.frame(QTL = qtlids, gene_count = as.table(hits))
print('Genes counted')
write_csv(gene_counts, file = 'results/qtl-gene-counts.csv')
print('Results saved to qtl-gene-counts.csv')