-
Notifications
You must be signed in to change notification settings - Fork 28
/
createsmartsdescriptors.py
executable file
·94 lines (82 loc) · 4.22 KB
/
createsmartsdescriptors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/env python3
'''Routines for enumerating SMARTS expressions representing fragments of provided molecules'''
import sys,argparse,collections
import numpy as np
from rdkit.Chem import AllChem as Chem
def computepathsmarts(mol, size):
if size <= 0: size = 7 # default to 7 atoms
ret = set()
for length in range(2,size+1):
paths = Chem.FindAllPathsOfLengthN(mol, length, useBonds=False)
for path in paths:
smi = Chem.MolFragmentToSmiles(mol, path, canonical=True, allBondsExplicit=True)
ret.add(smi)
return ret
def computesubgraphsmarts(mol, size):
'''Given an rdkit mol, extract all the paths up to length size (if zero,
use default length). Return smarts expressions for these paths.'''
if size <= 0: size = 6 #default to 6 bonds
#find all paths of various sizes
paths = Chem.FindAllSubgraphsOfLengthMToN(mol,1,size)
#paths is a list of lists, each of which is the bond indices for paths of a different length
paths = [path for subpath in paths for path in subpath]
ret = set()
for path in paths:
atoms=set()
for bidx in path:
atoms.add(mol.GetBondWithIdx(bidx).GetBeginAtomIdx())
atoms.add(mol.GetBondWithIdx(bidx).GetEndAtomIdx())
smi = Chem.MolFragmentToSmiles(mol, atoms, canonical=True, allBondsExplicit=True)
ret.add(smi)
return ret
def computecircularsmarts(mol, size):
'''Given an rdkit mol, extract all the circular type descriptors up to
radius size (if zero, use default length). Return smarts expressions for these fragments.'''
if size <= 0: size = 2
ret = set()
for a in mol.GetAtoms():
for r in range(1,size+1):
env = Chem.FindAtomEnvironmentOfRadiusN(mol,r,a.GetIdx())
atoms=set()
for bidx in env:
atoms.add(mol.GetBondWithIdx(bidx).GetBeginAtomIdx())
atoms.add(mol.GetBondWithIdx(bidx).GetEndAtomIdx())
smi = Chem.MolFragmentToSmiles(mol, atoms, canonical=True, allBondsExplicit=True)
ret.add(smi)
return ret
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Create SMARTS-based descriptors from set of molecules')
parser.add_argument('smi',help='SMILES input file')
parser.add_argument('-o','--outfile', nargs='?', type=argparse.FileType('w'), default=sys.stdout)
descriptortype = parser.add_mutually_exclusive_group()
descriptortype.add_argument('--path',action='store_const',dest="type",const="path",help='Use path descriptors')
descriptortype.add_argument('--circular',action='store_const',dest="type",const="circular",help='Use circular descriptors')
descriptortype.add_argument('--subgraph',action='store_const',dest="type",const="subgraph",help='Use subgraph descriptors')
parser.set_defaults(type='path')
parser.add_argument('--size',type=int,default=0,help="Max descriptor size (path length or circular radius)")
parser.add_argument('-c','--cutoff',type=int,metavar="N",default=0,help="Remove smarts that appear in <= N molecules")
args = parser.parse_args()
smartcnts = collections.defaultdict(int) #count the number of molecules that have a given smart
with open(args.smi) as f:
for line in f:
try:
if line.startswith('#') or len(line.strip()) == 0:
continue
else:
tokens = line.split()
mol = Chem.MolFromSmiles(tokens[0])
if args.type == 'path':
smarts = computepathsmarts(mol, args.size)
elif args.type == 'circular':
smarts = computecircularsmarts(mol, args.size)
elif args.type == 'subgraph':
smarts = computesubgraphsmarts(mol, args.size)
else: #should never happen
smarts = []
for s in smarts:
smartcnts[s] += 1
except Exception as e:
sys.stderr.write("%s\nProblem with line: %s" % (e,line))
for (smart,cnt) in smartcnts.items():
if cnt > args.cutoff:
args.outfile.write('%s %d\n' % (smart,cnt))