-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPFAM_mapping.py
42 lines (32 loc) · 1.72 KB
/
PFAM_mapping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# This script reads a file of eggNOG results (first column: gene name; second column: Pfam annotations)
# and writes out the annotations for the gene names listed in a second file.
def create_dictionary_from_file(file_path):
    """Build a gene-name -> annotation lookup from a two-column text file.

    Each data line is expected to hold a gene name, a single space, and then
    the annotation text (which may itself contain further spaces). Blank or
    whitespace-only lines, and lines whose first non-whitespace character is
    ``#``, are ignored. If a gene name occurs more than once, the last
    occurrence wins.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A dict mapping each gene name to the remainder of its line.

    Raises:
        ValueError: If a non-blank, non-comment line has no space separating
            the gene name from its annotation.
    """
    data_dict = {}
    with open(file_path, 'r') as file:
        for line_number, raw_line in enumerate(file, start=1):
            # Strip before testing so indented comments and whitespace-only
            # lines are recognised as such.
            line = raw_line.strip()
            if not line or line.startswith('#'):
                continue
            # Split on the first space only; the annotation may contain more.
            gene_name, separator, pfam_info = line.partition(' ')
            if not separator:
                raise ValueError(
                    f"{file_path}, line {line_number}: expected "
                    f"'<gene name> <annotation>' but got {line!r}"
                )
            data_dict[gene_name] = pfam_info
    return data_dict
def find_matching_values(keys_file_path, data_dict, output_file_path):
    """Write the annotations of the requested genes to a tab-separated file.

    Every line of ``keys_file_path`` is stripped of surrounding whitespace and
    looked up in ``data_dict``. Names whose lookup yields ``None`` are dropped.
    Each remaining name is written once to ``output_file_path`` as
    ``<name>\\t<value>``, in order of first appearance in the keys file.

    Args:
        keys_file_path: Path of the file listing one lookup name per line.
        data_dict: Mapping from name to annotation text.
        output_file_path: Path of the file to create or overwrite.

    Returns:
        None. The result is the file written at ``output_file_path``.
    """
    # Read all requested names first; the output file is only opened afterwards.
    with open(keys_file_path, 'r') as names_handle:
        requested = [raw.strip() for raw in names_handle]

    # A dict keeps one entry per name, in first-seen order.
    found = {
        name: data_dict[name]
        for name in requested
        if data_dict.get(name) is not None
    }

    with open(output_file_path, 'w') as sink:
        sink.writelines(
            f"{name}\t{annotation}\n" for name, annotation in found.items()
        )
# --- Input and output locations ---
# Gene names to look up, one per line.
keys_file_path = "/gpfs01/home/mbxrs14/motif_mapping2/OG0001255_amphibians_headers.txt"
# eggNOG-derived table: gene name followed by its Pfam annotation.
data_file_path = "/gpfs01/home/mbxrs14/motif_mapping2/new_file.txt"
# Destination for the tab-separated gene -> annotation matches.
output_file_path = "/gpfs01/home/mbxrs14/motif_mapping2/OG0001255_amphibians_headers_output.txt"

# Load the annotation table, then write the entries for the requested genes.
result_dict = create_dictionary_from_file(data_file_path)
find_matching_values(
    keys_file_path=keys_file_path,
    data_dict=result_dict,
    output_file_path=output_file_path,
)