-
Notifications
You must be signed in to change notification settings - Fork 248
/
convert_augustus_to_gff3.py
executable file
·200 lines (152 loc) · 8.19 KB
/
convert_augustus_to_gff3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
#!/usr/bin/env python3
import argparse
import re
from biocode import gff, things
'''
This script converts native (GTF) or GFF output (via the --gff3 option) of Augustus
into GFF3 format.
Admittedly, when you run augustus with --gff3=on you will get legal GFF3 with lots of
comments. What I often need is GFF3 that also follows the gene model graph conventions
defined in the GFF3, which the default output does not. There are no exon or mRNA
features, for example.
This script reads the native GFF output and makes it conform to the suggested canonical
gene model, defined here:
http://www.sequenceontology.org/gff3.shtml
WARNING: The augustus output format appears to change pretty frequently. I've provided
an example of the expected input below, and the versions I tested with are 2.7, 3.0.1 and 3.3
WARNING: Only the gene/transcript/CDS rows are kept (and exon rows are created). All
others are discarded.
EXPECTED INPUT example:
###
# start gene g2
FO082871 AUGUSTUS gene 16991 18816 0.04 + . ID=g2
FO082871 AUGUSTUS transcript 16991 18816 0.04 + . ID=g2.t1;Parent=g2
FO082871 AUGUSTUS transcription_start_site 16991 16991 . + . Parent=g2.t1
FO082871 AUGUSTUS five_prime_utr 16991 17193 0.04 + . Parent=g2.t1
FO082871 AUGUSTUS start_codon 17194 17196 . + 0 Parent=g2.t1
FO082871 AUGUSTUS intron 17317 17557 0.15 + . Parent=g2.t1
FO082871 AUGUSTUS CDS 17194 17316 0.15 + 0 ID=g2.t1.cds;Parent=g2.t1
FO082871 AUGUSTUS CDS 17558 17647 0.15 + 0 ID=g2.t1.cds;Parent=g2.t1
FO082871 AUGUSTUS stop_codon 17645 17647 . + 0 Parent=g2.t1
FO082871 AUGUSTUS three_prime_utr 17648 18816 0.07 + . Parent=g2.t1
FO082871 AUGUSTUS transcription_end_site 18816 18816 . + . Parent=g2.t1
# protein sequence = [MQDNYTDGQPGIHRAIAKLEAIINEVDQQLYDYLLERQIHLSPPSKYWEISDIDELVAKAYVCFYAGLCC]
# Evidence for and against this transcript:
# % of transcript supported by hints (any source): 80
# CDS exons: 2/2
# W: 2
# CDS introns: 0/1
# 5'UTR exons and introns: 1/1
# W: 1
# 3'UTR exons and introns: 1/1
# W: 1
# hint groups fully obeyed: 148
# W: 148
# incompatible hint groups: 11
# W: 11
# end gene g2
'''
def main():
parser = argparse.ArgumentParser( description='Convert native (GTF) or GFF output from Augustus into GFF3 format')
## output file to be written
parser.add_argument('-i', '--input', type=str, required=True, help='Path to a GFF file created by Augustus' )
parser.add_argument('-o', '--output', type=str, required=True, help='Path to an output file to be created' )
args = parser.parse_args()
assemblies = dict()
current_assembly = None
gene = None
mRNAs = dict()
in_sequence = False
current_sequence = None
current_gene_comment_lines = list()
## Used for tracking the exon count for each gene (for ID purposes)
exon_count_by_mRNA = dict()
fout = open(args.output, mode='wt', encoding='utf-8')
fout.write("##gff-version 3\n")
for line in open(args.input):
if line.startswith("#"):
## Skip any header comments since we printed our own
if line.startswith("##gff-version"):
continue
current_gene_comment_lines.append(line)
if line.startswith("# end gene "):
## purge the comments, then write the gene
fout.write( "".join(current_gene_comment_lines) )
gene.print_as(fh=fout, source='AUGUSTUS', format='gff3')
gene = None
mRNAs = dict()
in_sequence = False
current_sequence = None
current_gene_comment_lines = list()
elif line.startswith("# protein sequence = ["):
pass
elif in_sequence is True:
# build 'current_sequence'
pass
else:
cols = line.split("\t")
if len(cols) != 9:
continue
mol_id = cols[0]
feat_type = cols[2]
if feat_type not in ['gene', 'transcript', 'CDS']:
continue
## The output format is GTF by default and (mostly) GFF if the --gff option is used.
# If GTF is detected, let's start by transforming the 9th column into GFF so the
# libraries can use it
# g1 -> ID=g1
# g1.t1 -> ID=g1.t1;Parent=g1
# transcript_id "g1.t1"; gene_id "g1"; -> ID=g1.t1.cds;Parent=g1.t1
m_gene = re.match('(g\d+)', cols[8])
m_transcript = re.match('((g\d+).t\d+)', cols[8])
m_CDS = re.match('transcript_id "(g\d+.t\d+)"; gene_id "g\d+";', cols[8])
# the input can be in GTF or GFF. We need to reformat the 9th column for the GTF entries
if not cols[8].startswith('ID') and not cols[8].startswith('Parent'):
if feat_type == 'gene':
if m_gene:
cols[8] = "ID={0}".format(m_gene.group(1))
else:
raise Exception("ERROR: GTF detected but gene row has bad 9th column format: {0}".format(cols[8]))
elif feat_type == 'transcript':
if m_transcript:
cols[8] = "ID={0};Parent={1}".format(m_transcript.group(1), m_transcript.group(2))
else:
raise Exception("ERROR: GTF detected but transcript row has bad 9th column format: {0}".format(cols[8]))
elif feat_type == 'CDS':
if m_CDS:
cols[8] = "ID={0}.cds;Parent={0}".format(m_CDS.group(1))
else:
raise Exception("ERROR: GTF detected but CDS row has bad 9th column format: {0}".format(cols[8]))
feat_id = gff.column_9_value(cols[8], 'ID')
## initialize this assembly if we haven't seen it yet
if mol_id not in assemblies:
assemblies[mol_id] = things.Assembly(id=mol_id)
current_assembly = assemblies[mol_id]
if feat_type == "gene":
gene = things.Gene(id=feat_id)
gene.locate_on( target=current_assembly, fmin=int(cols[3]) - 1, fmax=int(cols[4]), strand=cols[6] )
elif feat_type == "transcript":
mRNA = things.mRNA(id=feat_id, parent=gene)
mRNA.locate_on( target=current_assembly, fmin=int(cols[3]) - 1, fmax=int(cols[4]), strand=cols[6] )
gene.add_mRNA(mRNA)
mRNAs[mRNA.id] = mRNA
if feat_id in exon_count_by_mRNA:
raise Exception( "ERROR: two different mRNAs found with same ID: {0}".format(feat_id) )
else:
exon_count_by_mRNA[feat_id] = 0
elif feat_type == "CDS":
parent_id = gff.column_9_value(cols[8], 'Parent')
## sanity check that we've seen this parent
if parent_id not in mRNAs:
raise Exception("ERROR: Found CDS column with parent ({0}) mRNA not yet in the file".format(parent_id))
CDS = things.CDS(id=feat_id, parent=mRNAs[parent_id])
CDS.locate_on( target=current_assembly, fmin=int(cols[3]) - 1, fmax=int(cols[4]), strand=cols[6], phase=int(cols[7]) )
mRNA.add_CDS(CDS)
## exons weren't explicitly defined in the input file, so we need to derive new IDs for them
exon_count_by_mRNA[parent_id] += 1
exon_id = "{0}.exon{1}".format(parent_id, exon_count_by_mRNA[parent_id])
exon = things.Exon(id=exon_id, parent=mRNAs[parent_id])
exon.locate_on( target=current_assembly, fmin=int(cols[3]) - 1, fmax=int(cols[4]), strand=cols[6] )
mRNA.add_exon(exon)
if __name__ == '__main__':
main()