forked from lpryszcz/bin
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathembl2gtf.py
executable file
·135 lines (111 loc) · 5.67 KB
/
embl2gtf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/usr/bin/env python
"""
Convert embl to gtf.
USAGE:
cat in.embl | embl2gtf.py > out.gtf
NOTE:
It's designed to work with gb files coming from EMBL. gene is used as gene_id and transcript_id (locus_tag if gene not present).
Only entries having types in allowedTypes = ['gene','CDS','tRNA','tmRNA','rRNA','ncRNA'] are stored in GTF. Need to include exon processing.
No frame info is processed. Need to be included in order to process genes having introns!
In addition, some embl files misses gene entries - then gene is considered as entire CDS/*RNA
AUTHOR:
Leszek Pryszcz
Version 0.1
"""
import os, sys, urllib
from datetime import datetime
from Bio import SeqIO
def _get_unique_id( gene_id,products,i=1 ):
""" """
new_id = '%s.%s' % ( gene_id,i )
while new_id in products:
i += 1
new_id = '%s.%s' % ( gene_id,i )
return new_id
def embl2gtf( source='embl2gtf',allowedTypes=set(['CDS', 'tRNA', 'tmRNA', 'rRNA', 'ncRNA']) ): #'mRNA', 'gene',
"""
"""
#make sure no duplicated products
products = set()
handle = sys.stdin
for r in SeqIO.parse( handle,'embl' ):
acc = r.name # r.id r.description
skipped = 0
skippedTypes = set()
for f in r.features:
#process only gene and CDS entries
if f.type not in allowedTypes:
skipped += 1
skippedTypes.add( f.type )
continue
#generate comments field
if 'locus_tag' in f.qualifiers:
#use locul tag as gene_id/transcript_id
gene_id = transcript_id = f.qualifiers['locus_tag'][0]
elif 'gene' in f.qualifiers:
#use gene name as gene_id/transcript_id
gene_id = transcript_id = f.qualifiers['gene'][0]
elif 'product' in f.qualifiers:
#use locul tag as gene_id/transcript_id
gene_id = transcript_id = f.qualifiers['product'][0]
else:
sys.stderr.write( "Error: Neither `gene` nor `locus_tag` found for entry: %s\n" % '; '.join( str(f).split('\n') ) )
continue
#make sure no duplicated entries in GTF
if gene_id in products:
#sys.stderr.write( "Warning: Duplicated entry found: %s\n" % '; '.join( str(f).split('\n') ) )
continue
#gene_id = transcript_id = _get_unique_id( gene_id,products ) #continue
products.add(gene_id)
comments = 'gene_id "%s"; transcript_id "%s"' % ( gene_id,transcript_id )
#if not 'systematic_id' in f.qualifiers:
# continue
#comments = 'gene_id "%s"; transcript_id "%s"' % ( f.qualifiers['systematic_id'][0],f.qualifiers['systematic_id'][0] )
if 'protein_id' in f.qualifiers:
comments += '; protein_id "%s"' % f.qualifiers['protein_id'][0]
if 'note' in f.qualifiers:
comments += '; note "%s"' % f.qualifiers['note'][0]
if 'product' in f.qualifiers:
if f.qualifiers['product'][0] != 'hypothetical protein':
comments += '; product "%s"' % urllib.quote(f.qualifiers['product'][0])
#add external IDs
if 'db_xref' in f.qualifiers:
for extData in f.qualifiers['db_xref']:
comments += '; db_xref "%s"' % extData
#code strand as +/- (in genbank 1 or -1)
if int(f.strand)>0: strand = '+'
else: strand = '-'
#define gb
"""
seqname - The name of the sequence. Must be a chromosome or scaffold.
source - The program that generated this feature.
feature - The name of this type of feature. Some examples of standard feature types are "CDS", "start_codon", "stop_codon", and "exon".
start - The starting position of the feature in the sequence. The first base is numbered 1.
end - The ending position of the feature (inclusive).
score - A score between 0 and 1000. If the track line useScore attribute is set to 1 for this annotation data set, the score value will determine the level of gray in which this feature is displayed (higher numbers = darker gray). If there is no score value, enter ".".
strand - Valid entries include '+', '-', or '.' (for don't know/don't care).
frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'.
comments - gene_id "Em:U62317.C22.6.mRNA"; transcript_id "Em:U62317.C22.6.mRNA"; exon_number 1
"""
s,e = f.location.start.position+1,f.location.end.position
gtf = '%s\t%s\t%s\t%s\t%s\t.\t%s\t.\t%s\n' % ( acc,source,'gene',s,e,strand,comments )
#add start and stop codon
if strand=='+':
gtf += '%s\t%s\t%s\t%s\t%s\t.\t%s\t.\t%s\n' % ( acc,source,'start_codon',s,s+3,strand,comments )
gtf += '%s\t%s\t%s\t%s\t%s\t.\t%s\t.\t%s\n' % ( acc,source,'stop_codon', e-3,e,strand,comments )
else:
gtf += '%s\t%s\t%s\t%s\t%s\t.\t%s\t.\t%s\n' % ( acc,source,'stop_codon',s,s+3,strand,comments )
gtf += '%s\t%s\t%s\t%s\t%s\t.\t%s\t.\t%s\n' % ( acc,source,'start_codon', e-3,e,strand,comments )
if f.sub_features:
for subf in f.sub_features:
gtf += '%s\t%s\t%s\t%s\t%s\t.\t%s\t.\t%s\n' % ( acc,source,f.type,subf.location.start.position+1,subf.location.end.position,strand,comments )
else: #use entire entry as CDS
gtf += '%s\t%s\t%s\t%s\t%s\t.\t%s\t0\t%s\n' % ( acc,source,f.type,f.location.start.position+1,f.location.end.position,strand,comments ) #f.frame,
sys.stdout.write( gtf )
sys.stderr.write( "%s\tSkipped %s entries having types: %s.\n" % ( r.name,skipped,', '.join(skippedTypes) ) )
if __name__=='__main__':
t0=datetime.now()
embl2gtf()
dt=datetime.now()-t0
sys.stderr.write( "#Time elapsed: %s\n" % dt )