### fasta2gff.py ###
### Boas Pucker ###
### [email protected] ###
### v0.2 ###
# help text handed to sys.exit() when '--fasta' or '--gff3' is missing on the command line
__usage__ = """
python fasta2gff.py\n
--fasta <INPUT_FILE>\n
--gff3 <OUTPUT_FILE>\n
bug reports and feature requests: [email protected]
"""
import os, re, sys
# --- end of imports --- #
def load_seqs_from_mult_fasta( filename ):
    """! @brief load all sequences of a (multiple) FASTA file

    The sequence name is the header text after '>' up to the first space.
    Sequence lines are stripped of surrounding whitespace and concatenated.
    If a name occurs more than once, the last record replaces the earlier one.
    Lines in front of the first header are ignored.

    @param filename path to the FASTA file
    @return dictionary mapping sequence names to sequence strings;
            empty if the file does not contain any FASTA header
    """

    sequences = {}
    header = None
    seq_parts = []
    with open( filename, "r" ) as f:
        for line in f:
            if line.startswith( '>' ):
                # a new record starts: store the one collected so far
                if header is not None:
                    sequences[ header ] = "".join( seq_parts )
                header = line.strip()[1:].split(' ')[0]
                seq_parts = []
            elif header is not None:
                # collect chunks and join once to avoid quadratic string growth
                seq_parts.append( line.strip() )
    # store the last record; skipped for empty files so no nameless entry appears
    if header is not None:
        sequences[ header ] = "".join( seq_parts )
    # parenthesised single-argument form runs under Python 2 and Python 3
    print( "number of sequences in file: " + str( len( sequences ) ) )
    return sequences
def construct_gff( sequences, output_file ):
    """! @brief write one mRNA feature per sequence into a GFF3 file

    Each sequence gets a single tab-separated line that spans it from
    position 1 to its length; lines are ordered by sorted sequence name.

    @param sequences dictionary mapping sequence names to sequence strings
    @param output_file path of the GFF3 file, which is created or overwritten
    """

    with open( output_file, "w" ) as gff:
        for seq_id in sorted( sequences ):
            # feature end equals the sequence length, i.e. the whole sequence is covered
            seq_len = len( sequences[ seq_id ] )
            attributes = "ID=" + seq_id + ";Parent=" + seq_id
            columns = [ seq_id, ".", "mRNA", "1", seq_len, ".", "+", ".", attributes ]
            gff.write( "\t".join( str( field ) for field in columns ) + '\n' )
def main( arguments ):
    """! @brief read the file names from the command line and run the conversion

    @param arguments command line argument list; the element following
           '--fasta' is used as input FASTA file and the element following
           '--gff3' as output GFF3 file
    """

    # the value of each option is the element right behind its flag
    fasta_position = arguments.index( '--fasta' ) + 1
    input_fasta = arguments[ fasta_position ]
    gff_position = arguments.index( '--gff3' ) + 1
    output_gff = arguments[ gff_position ]

    construct_gff( load_seqs_from_mult_fasta( input_fasta ), output_gff )
if __name__ == '__main__':

    # both options are mandatory; otherwise exit with the usage text
    if '--fasta' in sys.argv and '--gff3' in sys.argv:
        main( sys.argv )
    else:
        sys.exit( __usage__ )

    # parenthesised single-argument form runs under Python 2 and Python 3
    print( "all done!" )