-
Notifications
You must be signed in to change notification settings - Fork 0
/
split_scaffolds_at_gaps.py
73 lines (56 loc) · 2.15 KB
/
split_scaffolds_at_gaps.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/usr/bin/env python
### author: Margaret Ho, [email protected]
### last updated: Aug 15 2023
### This script takes fasta input and outputs an expanded fasta that has been ### split at gaps (indicated by any number of N's)
### Scaffolds with a gap are divided into separate entries, each subsequence has a subseq number appended to their fasta file
### Scaffolds without a gap are left alone
import argparse
def main():
parser = argparse.ArgumentParser(description='Process an input fasta file, which will be split into fasta sequences if gaps (indicated by Ns) are detected')
# Add an argument for the input file
parser.add_argument('input_file', type=str, help='Path to the input fasta file')
args = parser.parse_args()
input_file = args.input_file
#print(f"Processing input file: {input_file}")
f = open(input_file)
parsed_seqs = parse_fasta_file(f)
f.close()
return parsed_seqs
def parse_fasta_file(file_handle):
"""Return a dict of {id:gene_seq} pairs based on the sequences in the input FASTA file
input_file -- a file handle for an input fasta file
"""
parsed_seqs = {} #dictionary
curr_seq_id = None
curr_seq = [] #list
for line in file_handle:
line = line.strip()
if line.startswith(">"):
if curr_seq_id is not None:
parsed_seqs[curr_seq_id] = ''.join(curr_seq)
curr_seq_id = line[1:] #store the part of the fasta line header except the ">"
curr_seq = []
continue
curr_seq.append(line)
#Add the final sequence to the dict
parsed_seqs[curr_seq_id] = ''.join(curr_seq)
return parsed_seqs
def split_sequence(sequence, delimiter='N'):
sequences = sequence.split(delimiter)
non_empty_sequences = [s for s in sequences if s]
return non_empty_sequences
if __name__ == '__main__':
parsed_seqs = main()
#print(parsed_seqs)
#for key in parsed_seqs:
# print(key, parsed_seqs[key])
for key in parsed_seqs:
# print(len(split_sequence(parsed_seqs[key])))
if len(split_sequence(parsed_seqs[key])) <= 1:
print(">"+ key + "\n")
print(parsed_seqs[key])
else:
for subseq_n in range(len(split_sequence(parsed_seqs[key]))):
print(">" + key + "_subseq" + str(subseq_n+1))
print(split_sequence(parsed_seqs[key])[subseq_n])
main()