-
Notifications
You must be signed in to change notification settings - Fork 0
/
ko_to_go_dict.py
executable file
·87 lines (49 loc) · 1.58 KB
/
ko_to_go_dict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/python
"""
This script builds a .json file whose keys are KO terms and whose
values hold all the GO Molecular Function terms related to each KO,
stored as a dictionary from a running index ("0", "1", ...) to a GO term.
"""
import sys
import json
# Input/output locations and progress-report settings used by main().
GOMF_IDS_PATH = "/data/databases/mappings/dict_go_molecular_function_ids.tsv"
TRIPLES_PATH = "/data/databases/mappings/GOs_KOs_via_Uniref50.tsv"
OUTPUT_PATH = "/data/databases/mappings/ko_togomfs.json"
PROGRESS_EVERY = 100000
# Total shown in the progress message; hard-coded, not computed from the input.
TOTAL_TRIPLETS_LABEL = "142,771,206"


def load_gomf_ids(lines):
    """Return the set of GO Molecular Function ids, one per input line.

    Args:
        lines: iterable of text lines (e.g. an open file); each whole line,
            minus its line terminator, is taken as one id.

    Returns:
        A set of id strings.
    """
    # Strip only the newline: slicing off the last character would truncate
    # the final id when the file does not end with a line terminator.
    return {line.rstrip("\n") for line in lines}


def build_ko_to_go(triples, gomf_ids):
    """Group GO terms by KO term, keeping only the GO ids in *gomf_ids*.

    Args:
        triples: iterable of tab-separated lines whose first field is a GO
            term and whose second field is a KO term; further fields are
            ignored.
        gomf_ids: container of GO ids to keep (membership-tested per line).

    Returns:
        A dict mapping each KO term to a dict of {"0": go, "1": go, ...},
        in order of appearance. Repeated (KO, GO) pairs are kept, each under
        its own index.

    Side effects:
        Prints a progress line to stdout every PROGRESS_EVERY input lines.
    """
    ko_to_go = {}
    for counter, line in enumerate(triples, 1):
        if counter % PROGRESS_EVERY == 0:
            print(str(counter) + " out of " + TOTAL_TRIPLETS_LABEL + " triplets")
        # Drop the line terminator first so a two-column line does not leave
        # a trailing newline glued to the KO term.
        fields = line.rstrip("\n").split("\t")
        if len(fields) < 2:
            # Blank or malformed line: skip it rather than abort a long run
            # with an IndexError.
            continue
        go, ko = fields[0], fields[1]
        if go in gomf_ids:
            # Direct dict lookup; no intermediate key list is built.
            gos = ko_to_go.setdefault(ko, {})
            gos[str(len(gos))] = go
    return ko_to_go


def main(gomf_path=GOMF_IDS_PATH, triples_path=TRIPLES_PATH,
         output_path=OUTPUT_PATH):
    """Read both mapping files and write the KO -> GO dictionary as JSON.

    Args:
        gomf_path: file with one GO Molecular Function id per line.
        triples_path: tab-separated file read by build_ko_to_go().
        output_path: destination of the resulting JSON file.
    """
    # Context managers guarantee every file handle is closed, including the
    # two input files.
    with open(gomf_path, "r") as gomf_file:
        gomf_ids = load_gomf_ids(gomf_file)
    with open(triples_path, "r") as triples_file:
        ko_to_go = build_ko_to_go(triples_file, gomf_ids)
    with open(output_path, "w") as f:
        json.dump(ko_to_go, f)


if __name__ == "__main__":
    main()
# counter = 0
# uniref_to_go = {}
# for triplet in triples:
# counter += 1
# if counter % 100000 == 0:
# print(str(counter) + " out of 142,771,206 triplets")
# triplet = triplet.split("\t")
# uniref = triplet[2]
# go = triplet[0]
# if go in set_of_gomf:
# if uniref not in uniref_to_go.keys():
# uniref_to_go[uniref] = {}
# uniref_to_go[uniref]['0'] = go
# else:
# index = str(len(uniref_to_go[uniref]))
# uniref_to_go[uniref][index] = go
# with open('/data/databases/mappings/uniref_togomfs.json', 'w') as f:
# json.dump(uniref_to_go, f)