-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsimsearch.py
59 lines (46 loc) · 1.4 KB
/
simsearch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import argparse
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from tqdm import tqdm
def getmaxsmi(smi):
    """Return the highest Tanimoto similarity between *smi* and the reference set.

    The input is coerced to ``str`` and stripped, parsed with RDKit, and its
    radius-2 Morgan fingerprint is compared against every fingerprint in the
    module-level ``fps`` list, which must be populated before this is called.

    Args:
        smi: SMILES of the query molecule; ``str()`` is applied first, so
            non-string cell values are accepted.

    Returns:
        ``np.nan`` when RDKit cannot parse the SMILES; otherwise the largest
        similarity found, or ``0`` when ``fps`` is empty.
    """
    m = Chem.MolFromSmiles(str(smi).strip())
    if m is None:
        return np.nan
    fp2 = AllChem.GetMorganFingerprint(m, 2)
    # A single bulk call compares against all reference fingerprints in native
    # code instead of a Python-level loop. The leading 0 is the floor of the
    # result and also the value returned when ``fps`` is empty.
    return max([0] + list(DataStructs.BulkTanimotoSimilarity(fp2, fps)))
def compute_fp_dict(source="kinasesmiles/john_smiles_kinasei.smi"):
    """Build the list of reference Morgan fingerprints from a SMILES file.

    Each non-blank line contributes its first whitespace-delimited token as a
    SMILES string. Lines that RDKit cannot parse are skipped.

    Args:
        source: Path of the text file to read, one molecule per line.

    Returns:
        A list (not a dict, despite the function name) of radius-2 Morgan
        fingerprints, in file order, one per successfully parsed line.
    """
    fps = []
    with open(source, 'r') as f:
        # Iterate the file directly so the whole file is never held in memory.
        for line in tqdm(f):
            # split() with no argument handles tabs, repeated spaces, leading
            # whitespace and the trailing newline; a blank line yields no
            # tokens and is skipped rather than passed on as an empty SMILES.
            tokens = line.split()
            if not tokens:
                continue
            m = Chem.MolFromSmiles(tokens[0])
            if m is None:
                continue
            fps.append(AllChem.GetMorganFingerprint(m, 2))
    return fps
def get_args():
    """Parse the script's command-line options from ``sys.argv``.

    Options:
        -i: required string.
        -o: required string.
        -n: optional integer, defaults to 1.

    Returns:
        The ``argparse.Namespace`` with attributes ``i``, ``o`` and ``n``.
    """
    cli = argparse.ArgumentParser()
    option_table = (
        ('-i', {'required': True, 'type': str}),
        ('-o', {'required': True, 'type': str}),
        ('-n', {'required': False, 'default': 1, 'type': int}),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
import multiprocessing


def init_worker(reference_fps):
    """Install *reference_fps* as the module-global ``fps`` in this process.

    Used as the ``multiprocessing.Pool`` initializer so every worker has the
    reference fingerprints that ``getmaxsmi`` reads, including under start
    methods where workers do not inherit the parent's globals.

    Args:
        reference_fps: The list of reference fingerprints to expose as ``fps``.
    """
    global fps
    fps = reference_fps


if __name__ == "__main__":
    # Guarded so that worker processes created with the "spawn" or
    # "forkserver" start methods can import this module without re-parsing
    # arguments, reloading fingerprints or trying to start a nested pool.
    args = get_args()
    fps = compute_fp_dict()
    print("loaded fps")
    df = pd.read_csv(args.i)
    print("loaded", df.shape[0], 'smiles')

    smiles = df.loc[:, 'smiles'].tolist()
    with multiprocessing.Pool(
        args.n, initializer=init_worker, initargs=(fps,)
    ) as p:
        # imap is lazy, so the results must be drained before the pool exits.
        it = p.imap(getmaxsmi, smiles)
        # total lets the progress bar show completion, which it cannot infer
        # from the imap iterator.
        res = list(tqdm(it, total=len(smiles)))
    df['sim'] = res
    df.to_csv(args.o, index=False)