-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLrgParser_fq.py
executable file
·198 lines (178 loc) · 9.5 KB
/
LrgParser_fq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
from xml.etree.ElementTree import parse
__author__ = 'mwelland'
__version__ = 0.9
__version_date__ = '11/02/2015'
class LrgParser:
"""
Class version: 0.9
Modified Date: 21/07/2015
Author : Matt Welland
Minimal version of previously developed class
Only requires exon numbers and coordinates, and full sequence
Parses the input file to find all the useful values
This will populate a dictionary to be returned at completion
Dict { filename
genename
transcripts { transcript { exons { exon_number { genomic_start
genomic_stop
padded sequence
length
padded length
"""
def __init__(self, file_name):
self.fileName = file_name
self.padding = 0
self.sequence = ''
# Read in the specified input file into a variable
try:
self.tree = parse(self.fileName)
self.transcriptdict = {'transcripts': {},
'root': self.tree.getroot(),
'filename': file_name}
self.transcriptdict['fixannot'] = self.transcriptdict['root'].find(
'fixed_annotation') # ensures only exons from the fixed annotation will be taken
self.transcriptdict['updatable'] = self.transcriptdict['root'].find(
'updatable_annotation')
self.transcriptdict['genename'] = self.transcriptdict['root'].find(
'updatable_annotation/annotation_set/lrg_locus').text
self.transcriptdict['refseqname'] = self.transcriptdict['root'].find(
'fixed_annotation/sequence_source').text
if self.transcriptdict['root'].attrib['schema_version'] != '1.9':
print 'This LRG file is not the correct version for this script'
print 'This is designed for v.1.8'
print 'This file is v.' + self.transcriptdict['root'].attrib['schema_version']
self.is_matt_awesome = True
except IOError as fileNotPresent:
print "The specified file cannot be located: " + fileNotPresent.filename
exit()
@property
def get_version(self):
"""
Quick function to grab version details for final printing
:return:
"""
return 'Version: {0}, Version Date: {1}'.format(str(__version__), __version_date__)
# Grabs the sequence string from the <sequence/> tagged block
def grab_element(self, path):
""" Grabs specific element from the xml file from a provided path """
try:
for item in self.transcriptdict['root'].findall(path):
return item.text
except:
print "No sequence was identified"
print self.transcriptdict['filename']
exit()
def get_exon_coords(self):
""" Traverses the LRG ETree to find all the useful values
This should allow more robust use of the stored values, and enhances
transparency of the methods put in place. Absolute references should
also make the program more easily extensible
"""
for items in self.transcriptdict['fixannot'].findall('transcript'):
t_number = int(items.attrib['name'][1:])
# print 'first t number = ' + str(t_number)
self.transcriptdict['transcripts'][t_number] = {} # First should be indicated with '1'
self.transcriptdict['transcripts'][t_number]['exons'] = {}
self.transcriptdict['transcripts'][t_number]['list_of_exons'] = []
# Gene sequence main coordinates are required to take introns
# Transcript coordinates wanted for output
genomic_start = 0
genomic_end = 0
for exon in items.iter('exon'):
exon_number = exon.attrib['label']
if exon_number[-1] in ('a', 'b', 'c', 'd'):
exon_number = exon_number[:-1]
else:
exon_number = int(exon_number)
self.transcriptdict['transcripts'][t_number]['list_of_exons'].append(exon_number)
self.transcriptdict['transcripts'][t_number]['exons'][exon_number] = {}
for coordinates in exon:
if coordinates.attrib['coord_system'][-2] not in ['t', 'p']:
genomic_start = int(coordinates.attrib['start'])
genomic_end = int(coordinates.attrib['end'])
assert genomic_start >= 0, "Exon index out of bounds"
try:
self.transcriptdict['transcripts'][t_number]["exons"][exon_number]['genomic_start'] = genomic_start
self.transcriptdict['transcripts'][t_number]["exons"][exon_number]['genomic_end'] = genomic_end
except:
print 'coordinates grabbed'
self.transcriptdict['offset'] = self.padding
try:
exon_seq = list(self.sequence[genomic_start - self.padding: genomic_end + self.padding])
except:
print 'got exon seq'
try:
self.transcriptdict['transcripts'][t_number]["exons"][exon_number]['padded seq'] = exon_seq
self.transcriptdict['transcripts'][t_number]["exons"][exon_number]['length'] = genomic_end-genomic_start
self.transcriptdict['transcripts'][t_number]["exons"][exon_number]['padded length'] = len(exon_seq)
except:
print 'stored lengths'
def find_cds_delay(self, transcript):
""" Method to find the actual start of the translated sequence
introduced to sort out non-coding exon problems """
offset_total = 0
offset = self.transcriptdict['transcripts'][transcript]['old_cds_offset']
for exon in self.transcriptdict['transcripts'][transcript]['list_of_exons']:
g_start = self.transcriptdict['transcripts'][transcript]['exons'][exon]['genomic_start']
g_stop = self.transcriptdict['transcripts'][transcript]['exons'][exon]['genomic_end']
if offset > g_stop:
self.transcriptdict['transcripts'][transcript]['exons'][exon]['cds'] = 'before'
offset_total = offset_total + (g_stop - g_start) + 1
elif g_stop >= offset >= g_start:
self.transcriptdict['transcripts'][transcript]['cds_offset'] = offset_total + (offset - g_start)
self.transcriptdict['transcripts'][transcript]['exons'][exon]['cds'] = 'after'
elif offset < g_start:
self.transcriptdict['transcripts'][transcript]['exons'][exon]['cds'] = 'after'
def get_nm(self):
annotation_sets = self.transcriptdict['updatable'].findall('annotation_set')
for annotation_set in annotation_sets:
if annotation_set.attrib['type'] == 'ncbi':
features = annotation_set.find('features')
genes = features.findall('gene') # Multiple 'genes' includedin LRG
for gene in genes:
transcripts = gene.findall('transcript')
for transcript_block in transcripts:
try:
t_number = transcript_block.attrib['fixed_id'][1:]
# print transcript_block.attrib['accession']
self.transcriptdict['transcripts'][int(t_number)]['NM_number'] = transcript_block.attrib['accession']
except KeyError:
pass
# print 'found redundant transcript'
def get_protein_exons(self):
""" Collects full protein sequence for the appropriate transcript """
for item in self.transcriptdict['fixannot'].findall('transcript'):
p_number = int(item.attrib['name'][1:])
coding_region = item.find('coding_region')
coordinates = coding_region.find('coordinates')
self.transcriptdict['transcripts'][p_number]['old_cds_offset'] = int(coordinates.attrib['start'])
translation = coding_region.find('translation')
sequence = translation.find('sequence').text
self.transcriptdict['transcripts'][p_number]['protein_length'] = len(sequence)*3
def run(self, padding):
self.padding = padding
# Initial sequence grabbing and populating dictionaries
self.sequence = self.grab_element('fixed_annotation/sequence')
self.transcriptdict['full sequence'] = list(self.sequence)
try:
self.get_exon_coords()
except:
print 'Exon coord'
this = raw_input()
try:
self.get_nm()
except:
print 'get NM'
this = raw_input()
try:
self.get_protein_exons()
except:
print 'protein exons'
this = raw_input()
for transcript in self.transcriptdict['transcripts'].keys():
try:
self.find_cds_delay(transcript)
except:
print 'CDS delay for %s' % transcript
self.transcriptdict['transcripts'][transcript]['list_of_exons'].sort(key=float)
return self.transcriptdict