-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcreateListExons.py
181 lines (151 loc) · 6.35 KB
/
createListExons.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
##################################
## ##
## createListExons.py ##
## ##
##################################
#
# Purpose: identifying all exons that may contain targets of interest
# If a gene has two or more isoforms, this means only common exons.
# Otherwise, this means all exons.
#
#
# Inputs:
# - refGene file, that contains all genes
# - optionally, a list of genes for which we want to extract exons
#
#
# Output:
# - an "exons of interest" file, where each line is an exon
#
import re
import sys
import os
import ast
from time import localtime, strftime, sleep
from subprocess import call
dir = "./mm10_input/"
in_ = "refGene.txt"
out_ = "exonList.txt"
pattern = r"\d+\t(N[MR]_\d+)\t(chr[MXY\d]+)\t([\+-])\t\d+\t\d+\t(\d+)\t(\d+)\t\d+\t([\d,]+)\t([\d,]+)\t0\t(.+)\t(cmpl|unk|incmpl)\t(cmpl|unk|incmpl)"
if len(sys.argv) > 2:
print "Usage: python createListExons.py [ID for list of genes]"
quit ()
if len(sys.argv) == 1:
print "Creating a list of exons for the whole genome"
else:
filename = sys.argv[1]+".txt"
print "Creating a list of exons for genes listed in "+filename
if os.path.isfile(dir+filename)==False:
print "Error: file "+filename+" does not exist."
quit()
genes = r""
inFile = open(dir+filename,'r')
for line in inFile:
genes += line.rstrip()+"|"
pattern = r"\d+\t(N[MR]_\d+)\t(chr[MXY\d]+)\t([\+-])\t\d+\t\d+\t(\d+)\t(\d+)\t\d+\t([\d,]+)\t([\d,]+)\t0\t("
pattern += genes[:-1]
pattern += r")\t(cmpl|unk|incmpl)\t(cmpl|unk|incmpl)"
out_ = "exonList.txt"[:-4]+"_"+filename
print strftime("%H:%M:%S", localtime())+": Loading data from "+in_
savedData = []
nb = 0
genes = dict()
# We start by reading the refGene file
inFile = open(dir+in_,'r')
for line in inFile:
match = re.search(pattern,line.rstrip())
if match:
# For each gene isoform, we save the ID, the chromosome and direction, the exon positions, and the gene name
savedData.append(match.group(1)+"\t"+match.group(2)+"\t"+match.group(3)+"\t"+match.group(6)+"\t"+match.group(7)+"\t"+match.group(8)+"\t"+match.group(4)+"\t"+match.group(5))
currentGene = match.group(8)
# We also save the position of this data into a dictionnary. This will help detecting genes with multiple isoforms
if currentGene in genes:
genes[currentGene].append(nb)
else:
genes[currentGene] = []
genes[currentGene].append(nb)
nb+=1
# Next, we process all this data
print strftime("%H:%M:%S", localtime())+": Processing data, and saving to "+out_
outFile = open(dir+out_,'w')
for currentGene in genes:
nbIsoforms = len(genes[currentGene])
# If the gene has a unique isoform, all exons are interesting
if nbIsoforms==1:
# We locate the position of the saved data
offset = genes[currentGene][0]
# We extract the data
split_data = savedData[offset].split("\t")
chr = split_data[1]
direction = split_data[2]
name = split_data[5]
exon_starts = split_data[3].rstrip(",").split(",")
exon_ends = split_data[4].rstrip(",").split(",")
cds_start = split_data[6]
cds_end = split_data[7]
# We save the data, exon by exon
for i in range(0,len(exon_starts)):
exon_data = name+"\t"+chr+"\t"+direction+"\t"+exon_starts[i]+"\t"+exon_ends[i]+"\t"+cds_start+"\t"+cds_end+"\tNo_Isoform"
outFile.write(exon_data+"\n")
# If the gene has two or more isoforms, only the common exons are interesting
else:
exon_overlap = dict()
# First, we prepare the CDS information
cds_start = ""
cds_end = ""
cds_status = ""
for j in range(0,nbIsoforms):
offset = genes[currentGene][j]
# We extract the data
split_data = savedData[offset].split("\t")
# For the first isoform, we simply initialise
if j==0:
cds_start = split_data[6]
cds_end = split_data[7]
cds_status = "Unique_CDS"
# For the next ones, we compare the CDS information
else:
# If the CDS is the same, nothing happens
if cds_start == split_data[6] and cds_end == split_data[7]:
continue
# Otherwise, we save the intersection of the two regions
else:
old_CDS_start = ast.literal_eval(cds_start)
old_CDS_end = ast.literal_eval(cds_end)
new_CDS_start = ast.literal_eval(split_data[6])
new_CDS_end = ast.literal_eval(split_data[7])
cds_start = str(max(old_CDS_start,new_CDS_start))
cds_end = str(min(old_CDS_end,new_CDS_end))
cds_status = "CDS_Intersection"
# We locate the position of the saved data for each isoform
for j in range(0,nbIsoforms):
offset = genes[currentGene][j]
# We extract the data
split_data = savedData[offset].split("\t")
chr = split_data[1]
direction = split_data[2]
name = split_data[5]
exon_starts = split_data[3].rstrip(",").split(",")
exon_ends = split_data[4].rstrip(",").split(",")
# We add the data, exon by exon, to the dictionnary
for i in range(0,len(exon_starts)):
exon_data = name+"\t"+chr+"\t"+direction+"\t"+exon_starts[i]+"\t"+exon_ends[i]+"\t"+cds_start+"\t"+cds_end+"\t"+cds_status
if exon_data in exon_overlap:
exon_overlap[exon_data] += 1
else:
exon_overlap[exon_data] = 1
# Now, only exons that are present all the time are saved
# Note: we only accept total overlap
# If, in one isoform, the start or end codon is different, we reject all versions of this exon
for ex in exon_overlap:
if exon_overlap[ex] == nbIsoforms:
outFile.write(ex+"\n")
outFile.close()
cmd = "sort -k 2,2 -k4,4n "+dir+out_+" > "+dir+"temp.txt && mv "+dir+"temp.txt "+dir+out_
call([cmd],shell=True)
print strftime("%H:%M:%S", localtime())+": Done."
#######################
## ##
## End of File ##
## ##
#######################