createsmartsdescriptors.py

#!/usr/bin/env python3

'''Routines for enumerating SMARTS expressions representing fragments of provided molecules'''

import sys,argparse,collections
import numpy as np
from rdkit.Chem import AllChem as Chem

def computepathsmarts(mol, size):
    if size <= 0: size = 7 # default to 7 atoms
    ret = set()
    for length in range(2,size+1):
        paths = Chem.FindAllPathsOfLengthN(mol, length, useBonds=False)
        for path in paths:
            smi = Chem.MolFragmentToSmiles(mol, path, canonical=True, allBondsExplicit=True)
            ret.add(smi)
    return ret

def computesubgraphsmarts(mol, size):
    '''Given an rdkit mol, extract all the paths up to length size (if zero,
    use default length).  Return smarts expressions for these paths.'''
    if size <= 0: size = 6 #default to 6 bonds
    #find all paths of various sizes
    paths = Chem.FindAllSubgraphsOfLengthMToN(mol,1,size)
    #paths is a list of lists, each of which is the bond indices for paths of a different length
    paths = [path for subpath in paths for path in subpath]
    ret = set()
    for path in paths:
        atoms=set()
        for bidx in path:
            atoms.add(mol.GetBondWithIdx(bidx).GetBeginAtomIdx())
            atoms.add(mol.GetBondWithIdx(bidx).GetEndAtomIdx())
        smi = Chem.MolFragmentToSmiles(mol, atoms, canonical=True, allBondsExplicit=True)
        ret.add(smi)
    return ret

def computecircularsmarts(mol, size):
    '''Given an rdkit mol, extract all the circular type descriptors up to 
    radius size (if zero, use default length).  Return smarts expressions for these fragments.'''
    if size <= 0: size = 2
    ret = set()
    for a in mol.GetAtoms():
        for r in range(1,size+1):
            env = Chem.FindAtomEnvironmentOfRadiusN(mol,r,a.GetIdx())
            atoms=set()
            for bidx in env:
                atoms.add(mol.GetBondWithIdx(bidx).GetBeginAtomIdx())
                atoms.add(mol.GetBondWithIdx(bidx).GetEndAtomIdx())
            smi = Chem.MolFragmentToSmiles(mol, atoms, canonical=True, allBondsExplicit=True)
            ret.add(smi)
    return ret

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Create SMARTS-based descriptors from set of molecules')
    parser.add_argument('smi',help='SMILES input file')
    parser.add_argument('-o','--outfile', nargs='?', type=argparse.FileType('w'), default=sys.stdout)
    
    descriptortype = parser.add_mutually_exclusive_group()
    descriptortype.add_argument('--path',action='store_const',dest="type",const="path",help='Use path descriptors')
    descriptortype.add_argument('--circular',action='store_const',dest="type",const="circular",help='Use circular descriptors')
    descriptortype.add_argument('--subgraph',action='store_const',dest="type",const="subgraph",help='Use subgraph descriptors')
    parser.set_defaults(type='path')
    
    parser.add_argument('--size',type=int,default=0,help="Max descriptor size (path length or circular radius)")
    parser.add_argument('-c','--cutoff',type=int,metavar="N",default=0,help="Remove smarts that appear in <= N molecules")
 
    args = parser.parse_args()

    smartcnts = collections.defaultdict(int) #count the number of molecules that have a given smart
    with open(args.smi) as f:
        for line in f:
            try:
                if line.startswith('#') or len(line.strip()) == 0:
                    continue
                else:
                    tokens = line.split()
                    mol = Chem.MolFromSmiles(tokens[0])
                    if args.type == 'path':
                        smarts = computepathsmarts(mol, args.size)
                    elif args.type == 'circular':
                        smarts = computecircularsmarts(mol, args.size)
                    elif args.type == 'subgraph':
                        smarts = computesubgraphsmarts(mol, args.size)
                    else: #should never happen
                        smarts = []
                    
                    for s in smarts:
                        smartcnts[s] += 1
            except Exception as e:
                sys.stderr.write("%s\nProblem with line: %s" % (e,line))
    
    for (smart,cnt) in smartcnts.items():
        if cnt > args.cutoff:
            args.outfile.write('%s %d\n' % (smart,cnt))