Skip to content

Commit

Permalink
Add support of SDF file
Browse files Browse the repository at this point in the history
  • Loading branch information
FanwangM committed Nov 9, 2021
1 parent 9c6b49e commit c7df177
Show file tree
Hide file tree
Showing 2 changed files with 102 additions and 29 deletions.
2 changes: 1 addition & 1 deletion padelpy/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from padelpy.wrapper import padeldescriptor
from padelpy.functions import from_mdl, from_smiles
from padelpy.functions import from_mdl, from_smiles, from_sdf
__version__ = '0.1.10'
129 changes: 101 additions & 28 deletions padelpy/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@

def from_smiles(smiles, output_csv: str = None, descriptors: bool = True,
fingerprints: bool = False, timeout: int = 60) -> OrderedDict:
''' from_smiles: converts SMILES string to QSPR descriptors/fingerprints
""" from_smiles: converts SMILES string to QSPR descriptors/fingerprints
Args:
smiles (str, list): SMILES string for a given molecule, or a list of
Expand All @@ -38,30 +38,30 @@ def from_smiles(smiles, output_csv: str = None, descriptors: bool = True,
list of OrderedDicts, else single OrderedDict; each OrderedDict
contains labels and values for each descriptor generated for each
supplied molecule
'''
"""

timestamp = datetime.now().strftime('%Y%m%d%H%M%S%f')[:-3]
timestamp = datetime.now().strftime("%Y%m%d%H%M%S%f")[:-3]

with open('{}.smi'.format(timestamp), 'w') as smi_file:
with open("{}.smi".format(timestamp), "w") as smi_file:
if type(smiles) == str:
smi_file.write(smiles)
elif type(smiles) == list:
smi_file.write('\n'.join(smiles))
smi_file.write("\n".join(smiles))
else:
raise RuntimeError('Unknown input format for `smiles`: {}'.format(
raise RuntimeError("Unknown input format for `smiles`: {}".format(
type(smiles)
))
smi_file.close()

save_csv = True
if output_csv is None:
save_csv = False
output_csv = '{}.csv'.format(timestamp)
output_csv = "{}.csv".format(timestamp)

for attempt in range(3):
try:
padeldescriptor(
mol_dir='{}.smi'.format(timestamp),
mol_dir="{}.smi".format(timestamp),
d_file=output_csv,
convert3d=True,
retain3d=True,
Expand All @@ -74,7 +74,7 @@ def from_smiles(smiles, output_csv: str = None, descriptors: bool = True,
break
except RuntimeError as exception:
if attempt == 2:
remove('{}.smi'.format(timestamp))
remove("{}.smi".format(timestamp))
if not save_csv:
sleep(0.5)
try:
Expand All @@ -85,33 +85,33 @@ def from_smiles(smiles, output_csv: str = None, descriptors: bool = True,
else:
continue

with open(output_csv, 'r', encoding='utf-8') as desc_file:
with open(output_csv, "r", encoding="utf-8") as desc_file:
reader = DictReader(desc_file)
rows = [row for row in reader]
desc_file.close()

remove('{}.smi'.format(timestamp))
remove("{}.smi".format(timestamp))
if not save_csv:
remove(output_csv)

if type(smiles) == list and len(rows) != len(smiles):
raise RuntimeError('PaDEL-Descriptor failed on one or more mols.' +
' Ensure the input structures are correct.')
raise RuntimeError("PaDEL-Descriptor failed on one or more mols." +
" Ensure the input structures are correct.")
elif type(smiles) == str and len(rows) == 0:
raise RuntimeError(
'PaDEL-Descriptor failed on {}.'.format(smiles) +
' Ensure input structure is correct.'
"PaDEL-Descriptor failed on {}.".format(smiles) +
" Ensure input structure is correct."
)

for idx, r in enumerate(rows):
if len(r) == 0:
raise RuntimeError(
'PaDEL-Descriptor failed on {}.'.format(smiles[idx]) +
' Ensure input structure is correct.'
"PaDEL-Descriptor failed on {}.".format(smiles[idx]) +
" Ensure input structure is correct."
)

for idx in range(len(rows)):
del rows[idx]['Name']
del rows[idx]["Name"]

if type(smiles) == str:
return rows[0]
Expand All @@ -120,7 +120,7 @@ def from_smiles(smiles, output_csv: str = None, descriptors: bool = True,

def from_mdl(mdl_file: str, output_csv: str = None, descriptors: bool = True,
fingerprints: bool = False, timeout: int = 60) -> list:
''' from_mdl: converts MDL file into QSPR descriptors/fingerprints;
""" from_mdl: converts MDL file into QSPR descriptors/fingerprints;
multiple molecules may be represented in the MDL file
Args:
Expand All @@ -133,19 +133,19 @@ def from_mdl(mdl_file: str, output_csv: str = None, descriptors: bool = True,
Returns:
list: list of dicts, where each dict corresponds sequentially to a
compound in the supplied MDL file
'''
"""

is_mdl = compile(r'.*\.mdl$', IGNORECASE)
is_mdl = compile(r".*\.mdl$", IGNORECASE)
if is_mdl.match(mdl_file) is None:
raise ValueError('MDL file must have a `.mdl` extension: {}'.format(
raise ValueError("MDL file must have a `.mdl` extension: {}".format(
mdl_file
))

save_csv = True
if output_csv is None:
save_csv = False
output_csv = '{}.csv'.format(
datetime.now().strftime('%Y%m%d%H%M%S%f')[:-3]
output_csv = "{}.csv".format(
datetime.now().strftime("%Y%m%d%H%M%S%f")[:-3]
)

for attempt in range(3):
Expand Down Expand Up @@ -174,15 +174,88 @@ def from_mdl(mdl_file: str, output_csv: str = None, descriptors: bool = True,
else:
continue

with open(output_csv, 'r', encoding='utf-8') as desc_file:
with open(output_csv, "r", encoding="utf-8") as desc_file:
reader = DictReader(desc_file)
rows = [row for row in reader]
desc_file.close()
if not save_csv:
remove(output_csv)
if len(rows) == 0:
raise RuntimeError('PaDEL-Descriptor returned no calculated values.' +
' Ensure the input structure is correct.')
raise RuntimeError("PaDEL-Descriptor returned no calculated values." +
" Ensure the input structure is correct.")
for row in rows:
del row['Name']
del row["Name"]
return rows


def from_sdf(sdf_file: str,
             output_csv: str = None,
             descriptors: bool = True,
             fingerprints: bool = False,
             timeout: int = 60) -> list:
    """ Converts sdf file into QSPR descriptors/fingerprints.

    Multiple molecules may be represented in the sdf file.

    Args:
        sdf_file (str): path to sdf file; must end in `.sdf` (case-insensitive)
        output_csv (str): if supplied, saves descriptors/fingerprints here;
            otherwise a timestamp-named CSV is written to the working
            directory and removed once it has been read
        descriptors (bool): if `True`, calculates descriptors
        fingerprints (bool): if `True`, calculates fingerprints
        timeout (int): maximum time, in seconds, for conversion

    Returns:
        list: list of dicts, where each dict corresponds sequentially to a
            compound in the supplied sdf file (the `Name` column is dropped)

    Raises:
        ValueError: if `sdf_file` does not have a `.sdf` extension
        RuntimeError: if `padeldescriptor` raises `RuntimeError` on all three
            attempts, or if the resulting CSV contains no rows
    """

    is_sdf = compile(r".*\.sdf$", IGNORECASE)
    if is_sdf.match(sdf_file) is None:
        raise ValueError("sdf file must have a `.sdf` extension: {}".format(
            sdf_file
        ))

    # When the caller gives no destination, write to a throwaway CSV that is
    # deleted after its contents are loaded.
    save_csv = output_csv is not None
    if not save_csv:
        output_csv = "{}.csv".format(
            datetime.now().strftime("%Y%m%d%H%M%S%f")[:-3]
        )

    # Up to three attempts; only the final failure is propagated.
    for attempt in range(3):
        try:
            padeldescriptor(
                mol_dir=sdf_file,
                d_file=output_csv,
                convert3d=True,
                retain3d=True,
                retainorder=True,
                d_2d=descriptors,
                d_3d=descriptors,
                fingerprints=fingerprints,
                sp_timeout=timeout
            )
            break
        except RuntimeError as exception:
            if attempt < 2:
                continue
            if not save_csv:
                # Brief pause before deleting the partial temporary output.
                sleep(0.5)
                try:
                    remove(output_csv)
                except FileNotFoundError as e:
                    # warnings.warn expects a str (or Warning) message, not
                    # an arbitrary exception instance.
                    warnings.warn(str(e), RuntimeWarning)
            raise RuntimeError(exception) from exception

    # The `with` block closes the file; no explicit close() is needed.
    with open(output_csv, "r", encoding="utf-8") as desc_file:
        rows = list(DictReader(desc_file))

    if not save_csv:
        remove(output_csv)

    if len(rows) == 0:
        raise RuntimeError("PaDEL-Descriptor returned no calculated values." +
                           " Ensure the input structure is correct.")

    for row in rows:
        del row["Name"]
    return rows

0 comments on commit c7df177

Please sign in to comment.