-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathNCBI_proteinisoform_headers.py
56 lines (47 loc) · 2.62 KB
/
NCBI_proteinisoform_headers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
##create the header structure for each gene in the protein file [geneID_genename_proteinID_proteinfunction]
#follows the NCBI_proteinisoform_headers.py script
import re
def extract_protein_info_from_fasta(file_path):
protein_info = {}
with open(file_path, 'r') as fasta_file:
sequence_header = ''
for line in fasta_file:
if line.startswith('>'):
if sequence_header != '':
protein_info[protein_id] = sequence_header
sequence_header = line.strip()
protein_id_match = re.search(r'_proteinID=(\S+)_', sequence_header)
if protein_id_match:
protein_id = protein_id_match.group(1)
else:
# Ignore sequence lines
continue
# Add the last sequence entry to the dictionary
if protein_id and sequence_header:
protein_info[protein_id] = sequence_header
return protein_info
def write_protein_info_to_file(protein_info, file_path):
with open(file_path, 'w') as output_file:
for protein_id, sequence_header in protein_info.items():
output_file.write(f'{protein_id}:{sequence_header}\n')
def replace_headers_in_fasta(file_path, protein_info, output_file_path):
with open(file_path, 'r') as input_file, open(output_file_path, 'w') as output_file:
for line in input_file:
if line.startswith('>'):
header = line.strip()[1:] # Exclude the '>' character
if header in protein_info:
new_header = protein_info[header]
output_file.write(f'>{new_header}\n') # Write the updated header
else:
output_file.write(line) # Write the original header if no replacement found
else:
output_file.write(line) # Write the sequence lines as-is
# Provide the path to your output from the CDS_alternate.py for that species ###alternative file
fasta_file_path = '/Users/rhianah/Documents/Dissertation/species_alternate/Alligatormississippiensis.fasta'
protein_info = extract_protein_info_from_fasta(fasta_file_path)
# Provide the path where you want to save the modified protein fatsa file
output_file_path = '/Users/rhianah/Documents/Dissertation/species_proteinfilter/Alligatormississippiensis2.fasta'
# Provide the path to the FASTA file where you want to replace the headers
#The output from NCBI_proteinisoform.py
input_fasta_path = '/Users/rhianah/Documents/Dissertation/species_protein/Alligatormississippiensisalternate.fasta'
replace_headers_in_fasta(input_fasta_path, protein_info, output_file_path)