diff --git a/padelpy/__init__.py b/padelpy/__init__.py index 79627f6..de5b956 100644 --- a/padelpy/__init__.py +++ b/padelpy/__init__.py @@ -1,3 +1,3 @@ from padelpy.wrapper import padeldescriptor -from padelpy.functions import from_mdl, from_smiles +from padelpy.functions import from_mdl, from_smiles, from_sdf __version__ = '0.1.10' diff --git a/padelpy/functions.py b/padelpy/functions.py index d4ea3ee..8ae1ace 100644 --- a/padelpy/functions.py +++ b/padelpy/functions.py @@ -23,7 +23,7 @@ def from_smiles(smiles, output_csv: str = None, descriptors: bool = True, fingerprints: bool = False, timeout: int = 60) -> OrderedDict: - ''' from_smiles: converts SMILES string to QSPR descriptors/fingerprints + """ from_smiles: converts SMILES string to QSPR descriptors/fingerprints Args: smiles (str, list): SMILES string for a given molecule, or a list of @@ -38,17 +38,17 @@ def from_smiles(smiles, output_csv: str = None, descriptors: bool = True, list of OrderedDicts, else single OrderedDict; each OrderedDict contains labels and values for each descriptor generated for each supplied molecule - ''' + """ - timestamp = datetime.now().strftime('%Y%m%d%H%M%S%f')[:-3] + timestamp = datetime.now().strftime("%Y%m%d%H%M%S%f")[:-3] - with open('{}.smi'.format(timestamp), 'w') as smi_file: + with open("{}.smi".format(timestamp), "w") as smi_file: if type(smiles) == str: smi_file.write(smiles) elif type(smiles) == list: - smi_file.write('\n'.join(smiles)) + smi_file.write("\n".join(smiles)) else: - raise RuntimeError('Unknown input format for `smiles`: {}'.format( + raise RuntimeError("Unknown input format for `smiles`: {}".format( type(smiles) )) smi_file.close() @@ -56,12 +56,12 @@ def from_smiles(smiles, output_csv: str = None, descriptors: bool = True, save_csv = True if output_csv is None: save_csv = False - output_csv = '{}.csv'.format(timestamp) + output_csv = "{}.csv".format(timestamp) for attempt in range(3): try: padeldescriptor( - mol_dir='{}.smi'.format(timestamp), + mol_dir="{}.smi".format(timestamp), d_file=output_csv, convert3d=True, retain3d=True, @@ -74,7 +74,7 @@ def from_smiles(smiles, output_csv: str = None, descriptors: bool = True, break except RuntimeError as exception: if attempt == 2: - remove('{}.smi'.format(timestamp)) + remove("{}.smi".format(timestamp)) if not save_csv: sleep(0.5) try: @@ -85,33 +85,33 @@ def from_smiles(smiles, output_csv: str = None, descriptors: bool = True, else: continue - with open(output_csv, 'r', encoding='utf-8') as desc_file: + with open(output_csv, "r", encoding="utf-8") as desc_file: reader = DictReader(desc_file) rows = [row for row in reader] desc_file.close() - remove('{}.smi'.format(timestamp)) + remove("{}.smi".format(timestamp)) if not save_csv: remove(output_csv) if type(smiles) == list and len(rows) != len(smiles): - raise RuntimeError('PaDEL-Descriptor failed on one or more mols.' + - ' Ensure the input structures are correct.') + raise RuntimeError("PaDEL-Descriptor failed on one or more mols." + + " Ensure the input structures are correct.") elif type(smiles) == str and len(rows) == 0: raise RuntimeError( - 'PaDEL-Descriptor failed on {}.'.format(smiles) + - ' Ensure input structure is correct.' + "PaDEL-Descriptor failed on {}.".format(smiles) + + " Ensure input structure is correct." ) for idx, r in enumerate(rows): if len(r) == 0: raise RuntimeError( - 'PaDEL-Descriptor failed on {}.'.format(smiles[idx]) + - ' Ensure input structure is correct.' + "PaDEL-Descriptor failed on {}.".format(smiles[idx]) + + " Ensure input structure is correct." ) for idx in range(len(rows)): - del rows[idx]['Name'] + del rows[idx]["Name"] if type(smiles) == str: return rows[0] @@ -120,7 +120,7 @@ def from_smiles(smiles, output_csv: str = None, descriptors: bool = True, def from_mdl(mdl_file: str, output_csv: str = None, descriptors: bool = True, fingerprints: bool = False, timeout: int = 60) -> list: - ''' from_mdl: converts MDL file into QSPR descriptors/fingerprints; + """ from_mdl: converts MDL file into QSPR descriptors/fingerprints; multiple molecules may be represented in the MDL file Args: @@ -133,19 +133,19 @@ def from_mdl(mdl_file: str, output_csv: str = None, descriptors: bool = True, Returns: list: list of dicts, where each dict corresponds sequentially to a compound in the supplied MDL file - ''' + """ - is_mdl = compile(r'.*\.mdl$', IGNORECASE) + is_mdl = compile(r".*\.mdl$", IGNORECASE) if is_mdl.match(mdl_file) is None: - raise ValueError('MDL file must have a `.mdl` extension: {}'.format( + raise ValueError("MDL file must have a `.mdl` extension: {}".format( mdl_file )) save_csv = True if output_csv is None: save_csv = False - output_csv = '{}.csv'.format( - datetime.now().strftime('%Y%m%d%H%M%S%f')[:-3] + output_csv = "{}.csv".format( + datetime.now().strftime("%Y%m%d%H%M%S%f")[:-3] ) for attempt in range(3): @@ -174,15 +174,88 @@ def from_mdl(mdl_file: str, output_csv: str = None, descriptors: bool = True, else: continue - with open(output_csv, 'r', encoding='utf-8') as desc_file: + with open(output_csv, "r", encoding="utf-8") as desc_file: reader = DictReader(desc_file) rows = [row for row in reader] desc_file.close() if not save_csv: remove(output_csv) if len(rows) == 0: - raise RuntimeError('PaDEL-Descriptor returned no calculated values.' + - ' Ensure the input structure is correct.') + raise RuntimeError("PaDEL-Descriptor returned no calculated values." + + " Ensure the input structure is correct.") for row in rows: - del row['Name'] + del row["Name"] + return rows + + +def from_sdf(sdf_file: str, + output_csv: str = None, + descriptors: bool = True, + fingerprints: bool = False, + timeout: int = 60) -> list: + """ Converts sdf file into QSPR descriptors/fingerprints. + Multiple molecules may be represented in the sdf file + + Args: + sdf_file (str): path to sdf file + output_csv (str): if supplied, saves descriptors/fingerprints here + descriptors (bool): if `True`, calculates descriptors + fingerprints (bool): if `True`, calculates fingerprints + timeout (int): maximum time, in seconds, for conversion + + Returns: + list: list of dicts, where each dict corresponds sequentially to a compound in the + supplied sdf file + """ + + is_sdf = compile(r".*\.sdf$", IGNORECASE) + if is_sdf.match(sdf_file) is None: + raise ValueError("sdf file must have a `.sdf` extension: {}".format( + sdf_file + )) + + save_csv = True + if output_csv is None: + save_csv = False + output_csv = "{}.csv".format( + datetime.now().strftime("%Y%m%d%H%M%S%f")[:-3] + ) + + for attempt in range(3): + try: + padeldescriptor( + mol_dir=sdf_file, + d_file=output_csv, + convert3d=True, + retain3d=True, + retainorder=True, + d_2d=descriptors, + d_3d=descriptors, + fingerprints=fingerprints, + sp_timeout=timeout + ) + break + except RuntimeError as exception: + if attempt == 2: + if not save_csv: + sleep(0.5) + try: + remove(output_csv) + except FileNotFoundError as e: + warnings.warn(e, RuntimeWarning) + raise RuntimeError(exception) + else: + continue + + with open(output_csv, "r", encoding="utf-8") as desc_file: + reader = DictReader(desc_file) + rows = [row for row in reader] + desc_file.close() + if not save_csv: + remove(output_csv) + if len(rows) == 0: + raise RuntimeError("PaDEL-Descriptor returned no calculated values." + + " Ensure the input structure is correct.") + for row in rows: + del row["Name"] return rows