valence-labs · shenoynikhil · Oct 15, 2023 · Oct 4, 2023 · Oct 4, 2023 · Oct 4, 2023
@@ -19,3 +19,31 @@ You can run tests locally with:
 ```bash
 pytest
 ```
+
+# Overview of Datasets
+
+<!-- Create a table with the following columns
+1. Name of Dataset (with reference of paper) [Dataset Name](paper link)
+2. Number of Molecules
+3. Number of Conformers
+4. Average Conformer to Molecule Ratio (in 2 lines)
+5. Labels
+6. QM Level of Theory
+ -->
+
+We provide support for the following publicly available QM Datasets.
+
+| Dataset | # Molecules | # Conformers | Average Conformers per Molecule | Force Labels | Atom Types | QM Level of Theory | Off-Equilibrium Conformations|
+| --- | --- | --- | --- | --- | --- | --- | --- |
+| [GEOM](https://www.nature.com/articles/s41597-022-01288-4) |  450,000 | 37,000,000 | 82 | No | 18 | GFN2-xTB | No |
+| [Molecule3D](https://arxiv.org/abs/2110.01717) |  3,899,647 | 3,899,647 | 1 | No | 5 | B3LYP/6-31G* | No |
+| [NablaDFT](https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D) |  1,000,000 | 5,000,000 | 5 | No | 6 | ωB97X-D/def2-SVP | |
+| [QMugs](https://www.nature.com/articles/s41597-022-01390-7) |  665,000 | 2,000,000 | 3 | No | 10 | GFN2-xTB, ωB97X-D/def2-SVP | No |
+| [Spice](https://arxiv.org/abs/2209.10702) |  19,238 | 1,132,808 | 59 | Yes | 15 | ωB97M-D3(BJ)/def2-TZVPPD | Yes |
+| [ANI](https://pubs.rsc.org/en/content/articlelanding/2017/SC/C6SC05720A) |  57,462 | 20,000,000 | 348 | No | 4 | ωB97X/6-31G(d) | Yes |
+| [tmQM](https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041) |  86,665 | |  | No | | TPSSh-D3BJ/def2-SVP | |
+| [DES370K](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 370,000 | 100 | No | 20 | CCSD(T) | Yes |
+| [DES5M](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 5,000,000 | 1351 | No | 20 | SNS-MP2 | Yes |
+| [OrbNet Denali](https://arxiv.org/abs/2107.00299) | 212,905 | 2,300,000 | 11 | No | 16 | GFN1-xTB | Yes |
+| [SN2RXN](https://pubs.acs.org/doi/10.1021/acs.jctc.9b00181) | 39 | 452,709 | 11,600 | Yes | 6 | DSD-BLYP-D3(BJ)/def2-TZVP | |
+| [QM7X](https://www.nature.com/articles/s41597-021-00812-2) |  6,950 | 4,195,237 | 603 | Yes | 7 | PBE0+MBD | Yes |
@@ -0,0 +1,26 @@
+from .ani import ANI1, ANI1CCX, ANI1X
+from .comp6 import COMP6
+from .gdml import GDML
+from .geom import GEOM
+from .iso_17 import ISO17
+from .molecule3d import Molecule3D
+from .orbnet_denali import OrbnetDenali
+from .qmugs import QMugs
+from .sn2_rxn import SN2RXN
+from .spice import Spice
+
+__all__ = [
+    "ANI1",
+    "ANI1CCX",
+    "ANI1X",
+    "Spice",
+    "GEOM",
+    "QMugs",
+    "ISO17",
+    "COMP6",
+    "GDML",
+    "Molecule3D",
+    "OrbnetDenali",
+    "QMugs",
+    "SN2RXN",
+]
@@ -9,6 +9,22 @@
 
 
 class ANI1(BaseDataset):
+    """
+    The ANI-1 dataset is a collection of 22 x 10^6 structural conformations from 57,000 distinct small
+    organic molecules with energy labels calculated using DFT. The molecules
+    are composed of 4 distinct elements: C, N, O and H.
+
+    Usage
+    ```python
+    from openqdc.datasets import ANI1
+    dataset = ANI1()
+    ```
+
+    References:
+    - ANI-1: https://www.nature.com/articles/sdata2017193
+    - Github: https://github.com/aiqm/ANI1x_datasets
+    """
+
     __name__ = "ani1"
 
     # Energy in hartree, all zeros by default
@@ -42,6 +58,21 @@ def read_raw_entries(self):
 
 
 class ANI1CCX(ANI1):
+    """
+    ANI-1ccx is a dataset of 500k conformers subsampled from the 5.5M conformers of the ANI-1X dataset.
+    The selected conformations are then labelled using a high-accuracy CCSD(T)*/CBS method.
+
+    Usage
+    ```python
+    from openqdc.datasets import ANI1CCX
+    dataset = ANI1CCX()
+    ```
+
+    References:
+    - ANI-1ccx: https://doi.org/10.1038/s41467-019-10827-4
+    - Github: https://github.com/aiqm/ANI1x_datasets
+    """
+
     __name__ = "ani1ccx"
 
     # Energy in hartree, all zeros by default
@@ -69,6 +100,21 @@ def __init__(self) -> None:
 
 
 class ANI1X(ANI1):
+    """
+    The ANI-1X dataset consists of the ANI-1 molecules plus additional molecules added using active learning,
+    giving a total of 5,496,771 conformers from 63,865 unique molecules.
+
+    Usage
+    ```python
+    from openqdc.datasets import ANI1X
+    dataset = ANI1X()
+    ```
+
+    References:
+    - ANI-1x: https://doi.org/10.1063/1.5023802
+    - Github: https://github.com/aiqm/ANI1x_datasets
+    """
+
     __name__ = "ani1x"
 
     # Energy in hartree, all zeros by default

@@ -1,7 +1,9 @@
 import os
 from os.path import join as p_join
+from typing import Dict, List, Optional
 
 import numpy as np
+import pandas as pd
 import torch
 from loguru import logger
 from sklearn.utils import Bunch
@@ -18,7 +20,13 @@
 from openqdc.utils.molecule import atom_table
 
 
-def extract_entry(df, i, subset, energy_target_names, force_target_names=None):
+def extract_entry(
+    df: pd.DataFrame,
+    i: int,
+    subset: str,
+    energy_target_names: List[str],
+    force_target_names: Optional[List[str]] = None,
+) -> Dict[str, np.ndarray]:
     x = np.array([atom_table.GetAtomicNumber(s) for s in df["symbols"][i]])
     xs = np.stack((x, np.zeros_like(x)), axis=-1)
     positions = df["geometry"][i].reshape((-1, 3))
@@ -42,18 +50,12 @@ def extract_entry(df, i, subset, energy_target_names, force_target_names=None):
     return res
 
 
-def read_qc_archive_h5(raw_path, subset, energy_target_names, force_target_names):
+def read_qc_archive_h5(
+    raw_path: str, subset: str, energy_target_names: List[str], force_target_names: List[str]
+) -> List[Dict[str, np.ndarray]]:
     data = load_hdf5_file(raw_path)
     data_t = {k2: data[k1][k2][:] for k1 in data.keys() for k2 in data[k1].keys()}
     n = len(data_t["molecule_id"])
-    # print(f"Reading {n} entries from {raw_path}")
-    # for k in data_t:
-    #     print(f"Loaded {k} with shape {data_t[k].shape}, dtype {data_t[k].dtype}")
-    #     if "Energy" in k:
-    #         print(np.isnan(data_t[k]).mean(), f"{data_t[k][0]}")
-
-    # print('\n'*3)
-    # exit()
 
     samples = [extract_entry(data_t, i, subset, energy_target_names, force_target_names) for i in tqdm(range(n))]
     return samples

@@ -7,6 +7,22 @@
 
 
 class COMP6(BaseDataset):
+    """
+    COMP6 is a benchmark suite consisting of broad regions of bio-chemical and organic space
+    developed for testing the ANI-1x potential. It is curated from 6 benchmark sets:
+    S66x8, ANI Molecular Dynamics, GDB7to9, GDB10to13, DrugBank, and Tripeptides.
+
+    Usage
+    ```python
+    from openqdc.datasets import COMP6
+    dataset = COMP6()
+    ```
+
+    References:
+    - https://aip.scitation.org/doi/abs/10.1063/1.5023802
+    - Github: https://github.com/isayev/COMP6
+    """
+
     __name__ = "comp6"
 
     # Energy in hartree, all zeros by default

@@ -7,6 +7,29 @@
 
 
 class GDML(BaseDataset):
+    """
+    Gradient Domain Machine Learning (GDML) is a dataset consisting of samples from ab initio
+    molecular dynamics (AIMD) trajectories. The dataset consists of:
+    - Benzene: 627000 samples
+    - Uracil: 133000 samples
+    - Naphthalene: 326000 samples
+    - Aspirin: 211000 samples
+    - Salicylic Acid: 320000 samples
+    - Malonaldehyde: 993000 samples
+    - Ethanol: 555000 samples
+    - Toluene: 100000 samples
+
+    Usage
+    ```python
+    from openqdc.datasets import GDML
+    dataset = GDML()
+    ```
+
+    References:
+    - https://www.science.org/doi/10.1126/sciadv.1603015
+    - http://www.sgdml.org/#datasets
+    """
+
     __name__ = "gdml"
 
     # Energy in hartree, all zeros by default

@@ -1,4 +1,5 @@
 from os.path import join as p_join
+from typing import Dict
 
 import datamol as dm
 import numpy as np
@@ -9,7 +10,7 @@
 from openqdc.utils.molecule import get_atomic_number_and_charge
 
 
-def read_mol(mol_id, mol_dict, base_path, partition):
+def read_mol(mol_id: str, mol_dict, base_path: str, partition: str) -> Dict[str, np.ndarray]:
     """Read molecule from pickle file and return dict with conformers and energies
 
     Parameters
@@ -20,15 +21,18 @@ def read_mol(mol_id, mol_dict, base_path, partition):
         Dictionary containing the pickle_path and smiles of the molecule
     base_path: str
         Path to the folder containing the pickle files
+    partition: str
+        Name of the dataset partition, one of ['qm9', 'drugs']
 
     Returns
     -------
     res: dict
         Dictionary containing the following keys:
-            - atomic_inputs: flatten np.ndarray of shape (M, 4) containing the atomic numbers and positions
-            - smiles: np.ndarray of shape (N,) containing the smiles of the molecule
-            - energies: np.ndarray of shape (N,1) containing the energies of the conformers
-            - n_atoms: np.ndarray of shape (N,) containing the number of atoms in each conformer
+        - atomic_inputs: flattened np.ndarray of shape (M, 5) containing the atomic numbers, charges and positions
+        - smiles: np.ndarray of shape (N,) containing the smiles of the molecule
+        - energies: np.ndarray of shape (N,1) containing the energies of the conformers
+        - n_atoms: np.ndarray of shape (N,) containing the number of atoms in each conformer
+        - subset: np.ndarray of shape (N,) containing the name of the dataset partition
     """
 
     try:
@@ -56,6 +60,22 @@ def read_mol(mol_id, mol_dict, base_path, partition):
 
 
 class GEOM(BaseDataset):
+    """
+    The Geometric Ensemble Of Molecules (GEOM) dataset contains 37 million conformers for 133,000 molecules
+    from QM9, and 317,000 molecules with experimental data related to biophysics, physiology,
+    and physical chemistry. The dataset is generated using the GFN2-xTB semi-empirical method.
+
+    Usage:
+    ```python
+    from openqdc.datasets import GEOM
+    dataset = GEOM()
+    ```
+
+    References:
+    - https://www.nature.com/articles/s41597-022-01288-4
+    - https://github.com/learningmatter-mit/geom
+    """
+
     __name__ = "geom"
     __energy_methods__ = ["gfn2_xtb"]
 

@@ -7,6 +7,23 @@
 
 
 class ISO17(BaseDataset):
+    """
+    The ISO17 dataset consists of the largest set of isomers from the QM9 dataset, all sharing a fixed
+    composition of atoms (C7O2H10) arranged in different chemically valid structures. It consists
+    of 129 molecules each containing 5,000 conformational geometries, energies and forces with a resolution
+    of 1 femtosecond in the molecular dynamics trajectories. The simulations were carried out using the
+    Perdew-Burke-Ernzerhof (PBE) functional and the Tkatchenko-Scheffler (TS) van der Waals correction method.
+
+    Usage:
+    ```python
+    from openqdc.datasets import ISO17
+    dataset = ISO17()
+    ```
+
+    References:
+    - https://paperswithcode.com/dataset/iso17
+    """
+
     __name__ = "iso_17"
 
     # Energy in hartree, all zeros by default

@@ -1,5 +1,6 @@
 from glob import glob
 from os.path import join as p_join
+from typing import Dict, List
 
 import datamol as dm
 import numpy as np
@@ -12,7 +13,26 @@
 from openqdc.utils.molecule import get_atomic_number_and_charge
 
 
-def read_mol(mol, energy):
+def read_mol(mol: Chem.rdchem.Mol, energy: float) -> Dict[str, np.ndarray]:
+    """Read molecule (Chem.rdchem.Mol) and energy (float) and return dict with conformers and energies
+
+    Parameters
+    ----------
+    mol: Chem.rdchem.Mol
+        RDKit molecule
+    energy: float
+        Energy of the molecule
+
+    Returns
+    -------
+    res: dict
+        Dictionary containing the following keys:
+        - name: np.ndarray of shape (N,) containing the smiles of the molecule
+        - atomic_inputs: flattened np.ndarray of shape (M, 5) containing the atomic numbers, charges and positions
+        - energies: np.ndarray of shape (1,) containing the energy of the conformer
+        - n_atoms: np.ndarray of shape (1,) containing the number of atoms in the conformer
+        - subset: np.ndarray of shape (1,) containing "molecule3d"
+    """
     smiles = dm.to_smiles(mol, explicit_hs=False)
     # subset = dm.to_smiles(dm.to_scaffold_murcko(mol, make_generic=True), explicit_hs=False)
     x = get_atomic_number_and_charge(mol)
@@ -29,7 +49,8 @@ def read_mol(mol, energy):
     return res
 
 
-def _read_sdf(sdf_path, properties_path):
+def _read_sdf(sdf_path: str, properties_path: str) -> List[Dict[str, np.ndarray]]:
+    """Reads the sdf path and properties file."""
     properties = pd.read_csv(properties_path, dtype={"cid": str})
     properties.drop_duplicates(subset="cid", inplace=True, keep="first")
     xys = properties[["cid", "scf energy"]]
@@ -45,6 +66,22 @@ def _read_sdf(sdf_path, properties_path):
 
 
 class Molecule3D(BaseDataset):
+    """
+    Molecule3D dataset consists of 3,899,647 molecules with ground state geometries and energies
+    calculated at B3LYP/6-31G* level of theory. The molecules are extracted from the
+    PubChem database and cleaned by removing invalid molecule files.
+
+    Usage:
+    ```python
+    from openqdc.datasets import Molecule3D
+    dataset = Molecule3D()
+    ```
+
+    References:
+    - https://arxiv.org/abs/2110.01717
+    - https://github.com/divelab/MoleculeX
+    """
+
     __name__ = "molecule3d"
     __energy_methods__ = ["b3lyp_6-31g*"]