diff --git a/.gitignore b/.gitignore index cbc05b2..7a6dd93 100644 --- a/.gitignore +++ b/.gitignore @@ -148,3 +148,4 @@ cookie.txt *.csv *.txt *.sh +.DS_Store diff --git a/docs/API/available_datasets.md b/docs/API/available_datasets.md new file mode 100644 index 0000000..fa630b8 --- /dev/null +++ b/docs/API/available_datasets.md @@ -0,0 +1,3 @@ +# Available Datasets + +::: openqdc.datasets diff --git a/docs/API/isolated_atom_energies.md b/docs/API/isolated_atom_energies.md new file mode 100644 index 0000000..966b6a8 --- /dev/null +++ b/docs/API/isolated_atom_energies.md @@ -0,0 +1,5 @@ +# Isolated atoms energy + +This page contains the isolated atom energies. + +::: openqdc.utils.atomization_energies diff --git a/docs/_overrides/main.html b/docs/_overrides/main.html new file mode 100644 index 0000000..2eafd76 --- /dev/null +++ b/docs/_overrides/main.html @@ -0,0 +1,46 @@ +{% extends "base.html" %} + +{% block content %} +{{ super() }} + + +{% endblock content %} diff --git a/docs/css/custom.css b/docs/css/custom.css new file mode 100644 index 0000000..65db8ea --- /dev/null +++ b/docs/css/custom.css @@ -0,0 +1,33 @@ +/* Indentation. */ +div.doc-contents:not(.first) { + padding-left: 25px; + border-left: 4px solid rgba(230, 230, 230); + margin-bottom: 80px; + } + + /* Don't capitalize names. */ + h5.doc-heading { + text-transform: none !important; + } + + /* Don't use vertical space on hidden ToC entries. */ + .hidden-toc::before { + margin-top: 0 !important; + padding-top: 0 !important; + } + + /* Don't show permalink of hidden ToC entries. */ + .hidden-toc a.headerlink { + display: none; + } + + /* Avoid breaking parameters name, etc. in table cells. */ + td code { + word-break: normal !important; + } + + /* For pieces of Markdown rendered in table cells. 
*/ + td p { + margin-top: 0 !important; + margin-bottom: 0 !important; + } diff --git a/docs/datasets.md b/docs/datasets.md new file mode 100644 index 0000000..a2323fb --- /dev/null +++ b/docs/datasets.md @@ -0,0 +1,27 @@ +# Overview of Datasets + + + +We provide support for the following publicly available QM Datasets. + +| Dataset | # Molecules | # Conformers | Average Conformers per Molecule | Force Labels | Atom Types | QM Level of Theory | Off-Equilibrium Conformations| +| --- | --- | --- | --- | --- | --- | --- | --- | +| [GEOM](https://www.nature.com/articles/s41597-022-01288-4) | 450,000 | 37,000,000 | 82 | No | 18 | GFN2-xTB | No | +| [Molecule3D](https://arxiv.org/abs/2110.01717) | 3,899,647 | 3,899,647 | 1 | No | 5 | B3LYP/6-31G* | No | +| [NablaDFT](https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D) | 1,000,000 | 5,000,000 | 5 | No | 6 | ωB97X-D/def2-SVP | | +| [QMugs](https://www.nature.com/articles/s41597-022-01390-7) | 665,000 | 2,000,000 | 3 | No | 10 | GFN2-xTB, ωB97X-D/def2-SVP | No | +| [Spice](https://arxiv.org/abs/2209.10702) | 19,238 | 1,132,808 | 59 | Yes | 15 | ωB97M-D3(BJ)/def2-TZVPPD | Yes | +| [ANI](https://pubs.rsc.org/en/content/articlelanding/2017/SC/C6SC05720A) | 57,462 | 20,000,000 | 348 | No | 4 | ωB97x:6-31G(d) | Yes | +| [tmQM](https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041) | 86,665 | | | No | | TPSSh-D3BJ/def2-SVP | | +| [DES370K](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 370,000 | 100 | No | 20 | CCSD(T) | Yes | +| [DES5M](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 5,000,000 | 1351 | No | 20 | SNS-MP2 | Yes | +| [OrbNet Denali](https://arxiv.org/abs/2107.00299) | 212,905 | 2,300,000 | 11 | No | 16 | GFN1-xTB | Yes | +| [SN2RXN](https://pubs.acs.org/doi/10.1021/acs.jctc.9b00181) | 39 | 452709 | 11,600 | Yes | 6 | DSD-BLYP-D3(BJ)/def2-TZVP | | +| [QM7X](https://www.nature.com/articles/s41597-021-00812-2) | 6,950 | 4,195,237 | 603 | Yes | 7 | PBE0+MBD | Yes | diff --git 
a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..264211f --- /dev/null +++ b/docs/index.md @@ -0,0 +1,30 @@ +# openQDC + +Open Quantum Data Commons + +## Setup Datasets + +Use the scripts in `setup/` to download the datasets. For more information, see the [README](setup/README.md) in the `setup/` directory. + +## Install the library in dev mode +```bash +# Install the deps +mamba env create -n qdc -f env.yml + +# Activate the environment +mamba activate qdc + +# Install the qdc library in dev mode +pip install -e . + +``` + +## Development lifecycle + +### Tests + +You can run tests locally with: + +```bash +pytest . +``` diff --git a/docs/tutorials/usage.ipynb b/docs/tutorials/usage.ipynb new file mode 100644 index 0000000..b494396 --- /dev/null +++ b/docs/tutorials/usage.ipynb @@ -0,0 +1,424 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Really Hard example\n", + "\n", + "## Instantiate and GO!\n", + "\n", + "If you don't have the dataset downloaded it will be downloaded automatically and cached. You just instantiate the class and you are ready to go." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/homebrew/Caskroom/miniconda/base/envs/qdc/lib/python3.11/site-packages/google/auth/_default.py:76: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a \"quota exceeded\" or \"API not enabled\" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds. 
\n", + " warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)\n", + "\u001b[32m2023-10-31 11:43:09.510\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mopenqdc.datasets.base\u001b[0m:\u001b[36mread_preprocess\u001b[0m:\u001b[36m236\u001b[0m - \u001b[1mReading preprocessed data\u001b[0m\n", + "\u001b[32m2023-10-31 11:43:09.511\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mopenqdc.datasets.base\u001b[0m:\u001b[36mread_preprocess\u001b[0m:\u001b[36m237\u001b[0m - \u001b[1mspice data with the following units:\n", + " Energy: hartree,\n", + " Distance: bohr,\n", + " Forces: hartree/bohr\u001b[0m\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded atomic_inputs with shape (33175288, 5), dtype float32\n", + "Loaded position_idx_range with shape (1110165, 2), dtype int32\n", + "Loaded energies with shape (1110165, 1), dtype float32\n", + "Loaded forces with shape (33175288, 3, 1), dtype float32\n", + "Loaded name_uniques with shape (19155,), dtype =2.3.1 - - pytorch_sparse >=0.6.17 - - pytorch_cluster >=1.6 - - pytorch_scatter >=2.1 - - torch-ema + #- einops =0.6.0 + - pytorch + - dscribe # other stuffs - h5py >=3.8.0 - - omegaconf #==2.3.0 - gdown #==4.6.4 - - hydra-core #==1.3.1 - - wandb #==0.13.10 # Viz - matplotlib @@ -63,7 +50,14 @@ dependencies: - pre-commit - ruff - ipykernel - - pydantic <= 2.0 - - - pip: - - torch-nl + - isort + + # Doc + - mkdocs + - mkdocs-material + - mkdocs-material-extensions + - mkdocstrings + - mkdocs-click + - mkdocs-jupyter + - markdown-include + - mdx_truly_sane_lists diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000..e174b70 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,76 @@ +site_name: "Open Quantum Data Commons (openQDC)" +site_description: "I don't know... 
Something about data and Quantum stuff I guess :D" +site_url: "https://github.com/OpenDrugDiscovery/openQDC" +repo_url: "https://github.com/OpenDrugDiscovery/openQDC" +repo_name: "openQDC" +copyright: Copyright 2023 Valence Labs + +remote_branch: "privpage" +use_directory_urls: false +docs_dir: "docs" + +nav: + - Overview: index.md + - Available Datasets: datasets.md + - Tutorials: + - Really hard example: tutorials/usage.ipynb + - API: + - Datasets: API/available_datasets.md + - Isolated Atoms Energies: API/isolated_atom_energies.md +theme: + name: material + custom_dir: docs/_overrides + palette: + primary: teal + accent: purple + features: + - navigation.tabs + - navigation.expand + + +extra_css: + - css/custom.css + +extra_javascript: + - javascripts/config.js + - https://polyfill.io/v3/polyfill.min.js?features=es6 + - https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js + +markdown_extensions: + - admonition + - markdown_include.include + - pymdownx.emoji + - pymdownx.highlight + - pymdownx.magiclink + - pymdownx.superfences + - pymdownx.tabbed + - pymdownx.tasklist + # For `tab_length=2` in the markdown extension + # See https://github.com/mkdocs/mkdocs/issues/545 + - mdx_truly_sane_lists + - mkdocs-click + - attr_list + - md_in_html + - toc: + permalink: true + +plugins: + - search + - mkdocstrings: + watch: + - src/ + handlers: + python: + setup_commands: + - import sys + - sys.path.append("docs") + - sys.path.append("src") + selection: + new_path_syntax: yes + rendering: + show_root_heading: yes + heading_level: 3 + show_if_no_docstring: true + - mkdocs-jupyter: + execute: False + # kernel_name: python3 diff --git a/src/openqdc/__init__.py b/src/openqdc/__init__.py index e69de29..1432923 100644 --- a/src/openqdc/__init__.py +++ b/src/openqdc/__init__.py @@ -0,0 +1,41 @@ +import importlib +import os +from typing import TYPE_CHECKING # noqa F401 + +# The below lazy import logic is coming from openff-toolkit: +# 
https://github.com/openforcefield/openff-toolkit/blob/b52879569a0344878c40248ceb3bd0f90348076a/openff/toolkit/__init__.py#L44 + +# Dictionary of objects to lazily import; maps the object's name to its module path + +_lazy_imports_obj = {} + +_lazy_imports_mod = {"datasets": "openqdc.datasets", "utils": "openqdc.utils"} + + +def __getattr__(name): + """Lazily import objects from _lazy_imports_obj or _lazy_imports_mod + + Note that this method is only called by Python if the name cannot be found + in the current module.""" + obj_mod = _lazy_imports_obj.get(name) + if obj_mod is not None: + mod = importlib.import_module(obj_mod) + return mod.__dict__[name] + + lazy_mod = _lazy_imports_mod.get(name) + if lazy_mod is not None: + return importlib.import_module(lazy_mod) + + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + + +def __dir__(): + """Add _lazy_imports_obj and _lazy_imports_mod to dir()""" + keys = (*globals().keys(), *_lazy_imports_obj.keys(), *_lazy_imports_mod.keys()) + return sorted(keys) + + +if TYPE_CHECKING or os.environ.get("OPENQDC_DISABLE_LAZY_LOADING", "0") == "1": + # These types are imported lazily at runtime, but we need to tell type + # checkers what they are. 
+ from .datasets import * diff --git a/src/openqdc/datasets/__init__.py b/src/openqdc/datasets/__init__.py index 5dfe6a1..d989935 100644 --- a/src/openqdc/datasets/__init__.py +++ b/src/openqdc/datasets/__init__.py @@ -1,42 +1,107 @@ -from .ani import ANI1, ANI1CCX, ANI1X -from .comp6 import COMP6 -from .dess import DESS -from .gdml import GDML -from .geom import GEOM -from .iso_17 import ISO17 -from .molecule3d import Molecule3D -from .nabladft import NablaDFT -from .orbnet_denali import OrbnetDenali -from .pcqm import PCQM_B3LYP, PCQM_PM6 -from .qm7x import QM7X -from .qmugs import QMugs -from .sn2_rxn import SN2RXN -from .solvated_peptides import SolvatedPeptides -from .spice import Spice -from .tmqm import TMQM -from .transition1x import Transition1X -from .waterclusters3_30 import WaterClusters - -__all__ = [ - "ANI1", - "ANI1CCX", - "ANI1X", - "COMP6", - "DESS", - "GDML", - "GEOM", - "ISO17", - "Molecule3D", - "NablaDFT", - "OrbnetDenali", - "PCQM_B3LYP", - "PCQM_PM6", - "QM7X", - "QMugs", - "SN2RXN", - "SolvatedPeptides", - "Spice", - "TMQM", - "Transition1X", - "WaterClusters", -] +import importlib +import os +from typing import TYPE_CHECKING # noqa F401 + +# The below lazy import logic is coming from openff-toolkit: +# https://github.com/openforcefield/openff-toolkit/blob/b52879569a0344878c40248ceb3bd0f90348076a/openff/toolkit/__init__.py#L44 + +# Dictionary of objects to lazily import; maps the object's name to its module path + +_lazy_imports_obj = { + "ANI1": "openqdc.datasets.ani", + "ANI1CCX": "openqdc.datasets.ani", + "ANI1X": "openqdc.datasets.ani", + "Spice": "openqdc.datasets.spice", + "GEOM": "openqdc.datasets.geom", + "QMugs": "openqdc.datasets.qmugs", + "ISO17": "openqdc.datasets.iso_17", + "COMP6": "openqdc.datasets.comp6", + "GDML": "openqdc.datasets.gdml", + "Molecule3D": "openqdc.datasets.molecule3d", + "OrbnetDenali": "openqdc.datasets.orbnet_denali", + "SN2RXN": "openqdc.datasets.sn2_rxn", + "QM7X": "openqdc.datasets.qm7x", + "DESS": 
"openqdc.datasets.dess", + "NablaDFT": "openqdc.datasets.nabladft", + "SolvatedPeptides": "openqdc.datasets.solvated_peptides", + "WaterClusters": "openqdc.datasets.waterclusters3_30", + "TMQM": "openqdc.datasets.tmqm", + "Dummy": "openqdc.datasets.dummy", + "PCQM_B3LYP": "openqdc.datasets.pcqm", + "PCQM_PM6": "openqdc.datasets.pcqm", + "Transition1X": "openqdc.datasets.transition1x", +} + +_lazy_imports_mod = {} + + +def __getattr__(name): + """Lazily import objects from _lazy_imports_obj or _lazy_imports_mod + + Note that this method is only called by Python if the name cannot be found + in the current module.""" + obj_mod = _lazy_imports_obj.get(name) + if obj_mod is not None: + mod = importlib.import_module(obj_mod) + return mod.__dict__[name] + + lazy_mod = _lazy_imports_mod.get(name) + if lazy_mod is not None: + return importlib.import_module(lazy_mod) + + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + + +def __dir__(): + """Add _lazy_imports_obj and _lazy_imports_mod to dir()""" + keys = (*globals().keys(), *_lazy_imports_obj.keys(), *_lazy_imports_mod.keys()) + return sorted(keys) + + +if TYPE_CHECKING or os.environ.get("OPENQDC_DISABLE_LAZY_LOADING", "0") == "1": + # These types are imported lazily at runtime, but we need to tell type + # checkers what they are. 
+ from .ani import ANI1, ANI1CCX, ANI1X # noqa + from .comp6 import COMP6 # noqa + from .dess import DESS # noqa + from .dummy import Dummy # noqa + from .gdml import GDML # noqa + from .geom import GEOM # noqa + from .iso_17 import ISO17 # noqa + from .molecule3d import Molecule3D # noqa + from .nabladft import NablaDFT # noqa + from .orbnet_denali import OrbnetDenali # noqa + from .pcqm import PCQM_B3LYP, PCQM_PM6 # noqa + from .qm7x import QM7X # noqa + from .qmugs import QMugs # noqa + from .sn2_rxn import SN2RXN # noqa + from .solvated_peptides import SolvatedPeptides # noqa + from .spice import Spice # noqa + from .tmqm import TMQM # noqa + from .transition1x import Transition1X # noqa + from .waterclusters3_30 import WaterClusters # noqa + + __all__ = [ + "ANI1", + "ANI1X", + "ANI1CCX", + "Spice", + "GEOM", + "QMugs", + "ISO17", + "COMP6", + "GDML", + "Molecule3D", + "OrbnetDenali", + "SN2RXN", + "QM7X", + "DESS", + "NablaDFT", + "SolvatedPeptides", + "WaterClusters", + "TMQM", + "PCQM_B3LYP", + "PCQM_PM6", + "Transition1X", + "Dummy", + ] diff --git a/src/openqdc/datasets/ani.py b/src/openqdc/datasets/ani.py index 73a1ccb..913fb8a 100644 --- a/src/openqdc/datasets/ani.py +++ b/src/openqdc/datasets/ani.py @@ -32,11 +32,8 @@ class ANI1(BaseDataset): "ωB97x:6-31G(d) Energy", ] __energy_unit__ = "hartree" - __distance_unit__ = "ang" - __forces_unit__ = "hartree/ang" - - def __init__(self, energy_unit=None, distance_unit=None) -> None: - super().__init__(energy_unit=energy_unit, distance_unit=distance_unit) + __distance_unit__ = "bohr" + __forces_unit__ = "hartree/bohr" @property def root(self): @@ -71,12 +68,15 @@ class ANI1CCX(ANI1): """ __name__ = "ani1ccx" + __energy_unit__ = "hartree" + __distance_unit__ = "ang" + __forces_unit__ = "hartree/ang" __energy_methods__ = [ - "ccsd(t)_cbs", - "npno_ccsd(t)_dz", - "npno_ccsd(t)_tz", - "tpno_ccsd(t)_dz", + "ccsd(t)/cbs", + "ccsd(t)/cc-pvdz", + "ccsd(t)/cc-pvtz", + "tccsd(t)/cc-pvdz", ] energy_target_names = [ @@ 
-89,9 +89,6 @@ class ANI1CCX(ANI1): __force_methods__ = [] force_target_names = [] - def __init__(self) -> None: - super().__init__() - class ANI1X(ANI1): """ @@ -110,16 +107,19 @@ class ANI1X(ANI1): """ __name__ = "ani1x" + __energy_unit__ = "hartree" + __distance_unit__ = "ang" + __forces_unit__ = "hartree/ang" __energy_methods__ = [ - "hf_dz", - "hf_qz", - "hf_tz", - "mp2_dz", - "mp2_qz", - "mp2_tz", - "wb97x_6-31g(d)", - "wb97x_tz", + "hf/cc-pvdz", + "hf/cc-pvqz", + "hf/cc-pvtz", + "mp2/cc-pvdz", + "mp2/cc-pvqz", + "mp2/cc-pvtz", + "wb97x/6-31g(d)", + "wb97x/cc-pvtz", ] energy_target_names = [ @@ -139,9 +139,9 @@ class ANI1X(ANI1): ] __force_methods__ = [ - "wb97x_6-31g(d)", - "wb97x_tz", + "wb97x/6-31g(d)", + "wb97x/cc-pvtz", ] - def __init__(self) -> None: - super().__init__() + def convert_forces(self, x): + return super().convert_forces(x) * 0.529177249 # correct the Dataset error diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py index 598a0b3..1de6ff1 100644 --- a/src/openqdc/datasets/base.py +++ b/src/openqdc/datasets/base.py @@ -1,23 +1,31 @@ import os from os.path import join as p_join -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Union import numpy as np import pandas as pd import torch +from ase.io.extxyz import write_extxyz from loguru import logger from sklearn.utils import Bunch from tqdm import tqdm +from openqdc.utils.atomization_energies import ( + IsolatedAtomEnergyFactory, + chemical_symbols, +) from openqdc.utils.constants import NB_ATOMIC_FEATURES from openqdc.utils.io import ( copy_exists, + dict_to_atoms, get_local_cache, load_hdf5_file, pull_locally, push_remote, + set_cache_dir, ) from openqdc.utils.molecule import atom_table +from openqdc.utils.package_utils import requires_package from openqdc.utils.units import get_conversion @@ -67,21 +75,44 @@ class BaseDataset(torch.utils.data.Dataset): __force_methods__ = [] energy_target_names = [] force_target_names = [] + 
__isolated_atom_energies__ = [] __energy_unit__ = "hartree" - __distance_unit__ = "bohr" - __forces_unit__ = "hartree/bohr" + __distance_unit__ = "ang" + __forces_unit__ = "hartree/ang" __fn_energy__ = lambda x: x __fn_distance__ = lambda x: x __fn_forces__ = lambda x: x - def __init__(self, energy_unit=None, distance_unit=None, overwrite_local_cache=False) -> None: + def __init__( + self, + energy_unit: Optional[str] = None, + distance_unit: Optional[str] = None, + overwrite_local_cache: bool = False, + cache_dir: Optional[str] = None, + ) -> None: + set_cache_dir(cache_dir) self.data = None self._set_units(energy_unit, distance_unit) if not self.is_preprocessed(): logger.info("This dataset not available. Please open an issue on Github for the team to look into it.") + # entries = self.read_raw_entries() + # res = self.collate_list(entries) + # self.save_preprocess(res) else: self.read_preprocess(overwrite_local_cache=overwrite_local_cache) + self._set_isolated_atom_energies() + + @property + def numbers(self): + if hasattr(self, "_numbers"): + return self._numbers + self._numbers = np.array(list(set(self.data["atomic_inputs"][..., 0])), dtype=np.int32) + return self._numbers + + @property + def chemical_species(self): + return [chemical_symbols[z] for z in self.numbers] @property def energy_unit(self): @@ -140,6 +171,14 @@ def _set_units(self, en, ds): self.__forces_unit__ = self.energy_unit + "/" + self.distance_unit self.__class__.__fn_forces__ = get_conversion(old_en + "/" + old_ds, self.__forces_unit__) + def _set_isolated_atom_energies(self): + if self.__energy_methods__ is None: + logger.error("No energy methods defined for this dataset.") + f = get_conversion("hartree", self.__energy_unit__) + self.__isolated_atom_energies__ = f( + np.array([IsolatedAtomEnergyFactory.get_matrix(en_method) for en_method in self.__energy_methods__]) + ) + def convert_energy(self, x): return self.__class__.__fn_energy__(x) @@ -149,12 +188,18 @@ def convert_distance(self, x): 
def convert_forces(self, x): return self.__class__.__fn_forces__(x) - def set_energy_unit(self, value): + def set_energy_unit(self, value: str): + """ + Set a new energy unit for the dataset. + """ old_unit = self.energy_unit self.__energy_unit__ = value self.__class__.__fn_energy__ = get_conversion(old_unit, value) - def set_distance_unit(self, value): + def set_distance_unit(self, value: str): + """ + Set a new distance unit for the dataset. + """ old_unit = self.distance_unit self.__distance_unit__ = value self.__class__.__fn_distance__ = get_conversion(old_unit, value) @@ -175,11 +220,6 @@ def collate_list(self, list_entries): def save_preprocess(self, data_dict): # save memmaps logger.info("Preprocessing data and saving it to cache.") - logger.info( - f"Dataset {self.__name__} data with the following units:\n" - f"Energy: {self.energy_unit}, Distance: {self.distance_unit}, " - f"Forces: {self.force_unit if self.__force_methods__ else 'None'}" - ) for key in self.data_keys: local_path = p_join(self.preprocess_path, f"{key}.mmap") out = np.memmap(local_path, mode="w+", dtype=data_dict[key].dtype, shape=data_dict[key].shape) @@ -198,10 +238,10 @@ def save_preprocess(self, data_dict): def read_preprocess(self, overwrite_local_cache=False): logger.info("Reading preprocessed data") logger.info( - f"{self.__name__} data with the following units:\ - Energy: {self.energy_unit},\ - Distance: {self.distance_unit},\ - Forces: {self.force_unit}" + f"{self.__name__} data with the following units:\n\ + Energy: {self.energy_unit},\n\ + Distance: {self.distance_unit},\n\ + Forces: {self.force_unit if self.__force_methods__ else 'None'}" ) self.data = {} for key in self.data_keys: @@ -237,10 +277,123 @@ def preprocess(self): res = self.collate_list(entries) self.save_preprocess(res) + def save_xyz(self, idx: int, path: Optional[str] = None): + """ + Save the entry at index idx as an extxyz file. 
+ """ + if path is None: + path = os.getcwd() + at = self.get_ase_atoms(idx, ext=True) + name = at.info["name"] + write_extxyz(p_join(path, f"{name}.xyz"), at) + + def get_ase_atoms(self, idx: int, ext=True): + """ + Get the ASE atoms object for the entry at index idx. + + Parameters + ---------- + idx : int + Index of the entry. + ext : bool, optional + Whether to include additional informations + """ + entry = self[idx] + # _ = entry.pop("forces") + at = dict_to_atoms(entry, ext=ext) + return at + + @requires_package("dscribe") + @requires_package("datamol") + def chemical_space( + self, + n_samples: Optional[Union[List[int], int]] = None, + return_idxs: bool = True, + progress: bool = True, + **soap_kwargs, + ) -> Dict[str, np.ndarray]: + """ + Compute the SOAP descriptors for the dataset. + + Parameters + ---------- + n_samples : Optional[Union[List[int],int]], optional + Number of samples to use for the computation, by default None. If None, all the dataset is used. + If a list of integers is provided, the descriptors are computed for each of the specified idx of samples. + return_idxs : bool, optional + Whether to return the indices of the samples used, by default True. + progress : bool, optional + Whether to show a progress bar, by default True. + **soap_kwargs : dict + Keyword arguments to pass to the SOAP descriptor. 
+ By default, the following values are used: + - r_cut : 5.0 + - n_max : 8 + - l_max : 6 + - average : "inner" + - periodic : False + - compression : {"mode" : "mu1nu1"} + + Returns + ------- + Dict[str, np.ndarray] + Dictionary containing the following keys: + - soap : np.ndarray of shape (N, M) containing the SOAP descriptors for the dataset + - soap_kwargs : dict containing the keyword arguments used for the SOAP descriptor + - idxs : list or np.ndarray of length N containing the indices of the samples used (only present when return_idxs is True) + + """ + import datamol as dm + from dscribe.descriptors import SOAP + + if n_samples is None: + idxs = list(range(len(self))) + elif isinstance(n_samples, int): + idxs = np.random.choice(len(self), size=n_samples, replace=False) + elif isinstance(n_samples, list): + idxs = n_samples + datum = {} + r_cut = soap_kwargs.pop("r_cut", 5.0) + n_max = soap_kwargs.pop("n_max", 8) + l_max = soap_kwargs.pop("l_max", 6) + average = soap_kwargs.pop("average", "inner") + periodic = soap_kwargs.pop("periodic", False) + compression = soap_kwargs.pop("compression", {"mode": "mu1nu1"}) + soap = SOAP( + species=self.chemical_species, + periodic=periodic, + r_cut=r_cut, + n_max=n_max, + l_max=l_max, + average=average, + compression=compression, + ) + datum["soap_kwargs"] = { + "r_cut": r_cut, + "n_max": n_max, + "l_max": l_max, + "average": average, + "compression": compression, + "species": self.chemical_species, + "periodic": periodic, + **soap_kwargs, + } + + def wrapper(idx): + entry = self.get_ase_atoms(idx, ext=False) + return soap.create(entry, centers=entry.positions) + + descr = dm.parallelized(wrapper, idxs, progress=progress, scheduler="threads") + datum["soap"] = np.vstack(descr) + if return_idxs: + datum["idxs"] = idxs + return datum + def __len__(self): return self.data["energies"].shape[0] def __getitem__(self, idx: int): + shift = IsolatedAtomEnergyFactory.max_charge p_start, p_end = self.data["position_idx_range"][idx] input = 
self.data["atomic_inputs"][p_start:p_end] z, c, positions, energies = ( @@ -256,14 +409,19 @@ def __getitem__(self, idx: int): forces = self.convert_forces(np.array(self.data["forces"][p_start:p_end], dtype=np.float32)) else: forces = None - return Bunch( positions=positions, atomic_numbers=z, charges=c, - e0=self.convert_energy(self.atomic_energies[z]), + e0=self.__isolated_atom_energies__[..., z, c + shift].T, energies=energies, name=name, subset=subset, forces=forces, ) + + def __str__(self): + return f"{self.__name__}" + + def __repr__(self): + return f"{self.__name__}" diff --git a/src/openqdc/datasets/comp6.py b/src/openqdc/datasets/comp6.py index 16f43ca..c95ec17 100644 --- a/src/openqdc/datasets/comp6.py +++ b/src/openqdc/datasets/comp6.py @@ -23,20 +23,20 @@ class COMP6(BaseDataset): __name__ = "comp6" # watchout that forces are stored as -grad(E) - __energy_unit__ = "hartree" - __distance_unit__ = "ang" - __forces_unit__ = "hartree/ang" + __energy_unit__ = "kcal/mol" + __distance_unit__ = "bohr" # bohr + __forces_unit__ = "kcal/mol/bohr" __energy_methods__ = [ "wb97x/6-31g*", - "b3lyp-d3m(bj)_tz", - "b3lyp_tz", - "hf_tz", - "pbe-d3(bj)_dz", - "pbe_tz", - "svwm_tz", - "wb97m-d3(bj)_tz", - "wb97m_tz", + "b3lyp-d3mbj/def2-tzvp", + "b3lyp/def2-tzvp", + "hf/def2-tzvp", + "pbe-d3bj/def2-tzvp", + "pbe/def2-tzvp", + "svwn/def2-tzvp", + "wb97m-d3bj/def2-tzvp", + "wb97m/def2-tzvp", ] energy_target_names = [ @@ -59,9 +59,6 @@ class COMP6(BaseDataset): "Gradient", ] - def __init__(self, energy_unit=None, distance_unit=None) -> None: - super().__init__(energy_unit=energy_unit, distance_unit=distance_unit) - def read_raw_entries(self): samples = [] for subset in ["ani_md", "drugbank", "gdb7_9", "gdb10_13", "s66x8", "tripeptides"]: diff --git a/src/openqdc/datasets/dess.py b/src/openqdc/datasets/dess.py index 7136ab1..80b1e1c 100644 --- a/src/openqdc/datasets/dess.py +++ b/src/openqdc/datasets/dess.py @@ -34,15 +34,18 @@ def read_mol(mol_path, smiles, subset, targets): 
class Dummy(BaseDataset):
    """
    Synthetic dataset that fabricates a random molecule on every access.

    No raw data is read: ``read_raw_entries`` is a no-op and every call to
    ``__getitem__`` draws fresh random values from ``np.random``, so two
    accesses with the same index generally return different samples.
    """

    __name__ = "dummy"
    __energy_methods__ = ["I_solved_the_schrodinger_equation_by_hand"]
    __force_methods__ = ["I_made_up_random_forces"]
    __energy_unit__ = "kcal/mol"
    __distance_unit__ = "ang"
    __forces_unit__ = "kcal/mol/ang"

    energy_target_names = ["energy"]

    force_target_names = ["forces"]

    def __init__(self, energy_unit=None, distance_unit=None, cache_dir=None) -> None:
        """
        Try to run the regular ``BaseDataset`` initialisation, ignoring failures.

        Parameters
        ----------
        energy_unit, distance_unit, cache_dir
            Forwarded unchanged to ``BaseDataset.__init__``.
        """
        try:
            super().__init__(energy_unit=energy_unit, distance_unit=distance_unit, cache_dir=cache_dir)
        except Exception:
            # Deliberate best-effort: the base initialisation is allowed to fail
            # (presumably because there is no preprocessed data to load for this
            # dataset -- TODO confirm). Only ``Exception`` is swallowed so that
            # KeyboardInterrupt / SystemExit still propagate.
            pass

    def read_raw_entries(self):
        """Do nothing: there are no raw files to parse for the dummy dataset."""
        pass

    def __len__(self):
        """Return a fixed, very large nominal size."""
        return 999999999

    def __getitem__(self, idx: int):
        """
        Build a random sample.

        Parameters
        ----------
        idx: int
            Only used to build the sample name; it does not seed the random draw.

        Returns
        -------
        Bunch
            positions (size, 3) in [0, 10), atomic_numbers in [1, 100),
            charges in {-1, 0, 1}, e0 all zeros of length ``size``,
            energies of shape (1,) in [0, 100), forces (size, 3) in [0, 100),
            with ``size`` drawn from [1, 250).
        """
        # The draws are kept in the original order so a seeded global RNG
        # produces the same samples as before.
        size = np.random.randint(1, 250)
        z = np.random.randint(1, 100, size)
        return Bunch(
            positions=np.random.rand(size, 3) * 10,
            atomic_numbers=z,
            charges=np.random.randint(-1, 2, size),
            e0=np.zeros(size),
            energies=np.random.rand(1) * 100,
            name=f"dummy_{idx}",
            subset="dummy",
            forces=np.random.rand(size, 3) * 100,
        )
-32,7 +32,7 @@ class ISO17(BaseDataset): ] __force_methods__ = [ - "pbe+vdw-ts", + "pbe/vdw-ts", ] force_target_names = [ @@ -40,11 +40,8 @@ class ISO17(BaseDataset): ] __energy_unit__ = "ev" - __distance_unit__ = "ang" - __forces_unit__ = "ev/ang" - - def __init__(self, energy_unit=None, distance_unit=None) -> None: - super().__init__(energy_unit=energy_unit, distance_unit=distance_unit) + __distance_unit__ = "bohr" # bohr + __forces_unit__ = "ev/bohr" def read_raw_entries(self): raw_path = p_join(self.root, "iso_17.h5") diff --git a/src/openqdc/datasets/molecule3d.py b/src/openqdc/datasets/molecule3d.py index 9a49445..dc47e53 100644 --- a/src/openqdc/datasets/molecule3d.py +++ b/src/openqdc/datasets/molecule3d.py @@ -84,15 +84,12 @@ class Molecule3D(BaseDataset): __name__ = "molecule3d" __energy_methods__ = ["b3lyp/6-31g*"] # UNITS MOST LIKELY WRONG, MUST CHECK THEM MANUALLY - __energy_unit__ = "hartree" + __energy_unit__ = "ev" # CALCULATED __distance_unit__ = "ang" - __forces_unit__ = "hartree/ang" + __forces_unit__ = "ev/ang" energy_target_names = ["b3lyp/6-31g*.energy"] - def __init__(self, energy_unit=None, distance_unit=None) -> None: - super().__init__(energy_unit=energy_unit, distance_unit=distance_unit) - def read_raw_entries(self): raw = p_join(self.root, "data", "raw") sdf_paths = glob(p_join(raw, "*.sdf")) diff --git a/src/openqdc/datasets/nabladft.py b/src/openqdc/datasets/nabladft.py index 58f7839..e7d9eb8 100644 --- a/src/openqdc/datasets/nabladft.py +++ b/src/openqdc/datasets/nabladft.py @@ -4,10 +4,10 @@ import datamol as dm import numpy as np -from nablaDFT.dataset import HamiltonianDatabase from tqdm import tqdm from openqdc.datasets.base import BaseDataset +from openqdc.utils.package_utils import requires_package def to_mol(entry) -> Dict[str, np.ndarray]: @@ -26,7 +26,10 @@ def to_mol(entry) -> Dict[str, np.ndarray]: return res +@requires_package("nablaDFT") def read_chunk_from_db(raw_path, start_idx, stop_idx, step_size=1000): + from 
nablaDFT.dataset import HamiltonianDatabase + print(f"Loading from {start_idx} to {stop_idx}") db = HamiltonianDatabase(raw_path) idxs = list(np.arange(start_idx, stop_idx)) @@ -58,13 +61,13 @@ class NablaDFT(BaseDataset): energy_target_names = ["wb97x-d/def2-svp"] __energy_unit__ = "hartree" - __distance_unit__ = "ang" - __forces_unit__ = "hartree/ang" - - def __init__(self, energy_unit=None, distance_unit=None) -> None: - super().__init__(energy_unit=energy_unit, distance_unit=distance_unit) + __distance_unit__ = "bohr" + __forces_unit__ = "hartree/bohr" + @requires_package("nablaDFT") def read_raw_entries(self): + from nablaDFT.dataset import HamiltonianDatabase + raw_path = p_join(self.root, "dataset_full.db") train = HamiltonianDatabase(raw_path) n, c = len(train), 20 diff --git a/src/openqdc/datasets/orbnet_denali.py b/src/openqdc/datasets/orbnet_denali.py index a39933c..614e252 100644 --- a/src/openqdc/datasets/orbnet_denali.py +++ b/src/openqdc/datasets/orbnet_denali.py @@ -53,15 +53,11 @@ class OrbnetDenali(BaseDataset): __name__ = "orbnet_denali" __energy_methods__ = ["wb97x-d3/def2-tzvp", "gfn1_xtb"] - # not sure probably Hartree ang -> must manually check energy_target_names = ["dft_energy", "xtb1_energy"] __energy_unit__ = "hartree" __distance_unit__ = "ang" __forces_unit__ = "hartree/ang" - def __init__(self, energy_unit=None, distance_unit=None) -> None: - super().__init__(energy_unit=energy_unit, distance_unit=distance_unit) - def read_raw_entries(self): label_path = p_join(self.root, "denali_labels.csv") df = pd.read_csv(label_path, usecols=["sample_id", "mol_id", "subset", "dft_energy", "xtb1_energy"]) diff --git a/src/openqdc/datasets/qm7x.py b/src/openqdc/datasets/qm7x.py index be68794..eb8b015 100644 --- a/src/openqdc/datasets/qm7x.py +++ b/src/openqdc/datasets/qm7x.py @@ -35,11 +35,11 @@ def read_mol(mol_h5, mol_name, energy_target_names, force_target_names): class QM7X(BaseDataset): __name__ = "qm7x" - __energy_methods__ = ["pbe0+mbd", 
"dft3b+mbd"] + __energy_methods__ = ["pbe0/mbd", "dft3b"] energy_target_names = ["ePBE0", "eMBD"] - __force_methods__ = ["pbe0+mbd", "dft3b+mbd"] + __force_methods__ = ["pbe0/mbd", "dft3b"] force_target_names = ["pbe0FOR", "vdwFOR"] diff --git a/src/openqdc/datasets/qmugs.py b/src/openqdc/datasets/qmugs.py index e1ca2c2..c75f8b5 100644 --- a/src/openqdc/datasets/qmugs.py +++ b/src/openqdc/datasets/qmugs.py @@ -52,7 +52,7 @@ class QMugs(BaseDataset): """ __name__ = "qmugs" - __energy_methods__ = ["gfn2_xtb", "b3lyp/6-31g*"] + __energy_methods__ = ["gfn2_xtb", "wb97x-d/def2-svp"] __energy_unit__ = "hartree" __distance_unit__ = "ang" __forces_unit__ = "hartree/ang" @@ -62,9 +62,6 @@ class QMugs(BaseDataset): "DFT:TOTAL_ENERGY", ] - def __init__(self, energy_unit=None, distance_unit=None) -> None: - super().__init__(energy_unit=energy_unit, distance_unit=distance_unit) - def read_raw_entries(self): raw_path = p_join(self.root, "structures") mol_dirs = [p_join(raw_path, d) for d in os.listdir(raw_path)] diff --git a/src/openqdc/datasets/sn2_rxn.py b/src/openqdc/datasets/sn2_rxn.py index fcdcf24..3e75e91 100644 --- a/src/openqdc/datasets/sn2_rxn.py +++ b/src/openqdc/datasets/sn2_rxn.py @@ -7,27 +7,24 @@ class SN2RXN(BaseDataset): __name__ = "sn2_rxn" __energy_methods__ = [ - "dsd-blyp-d3(bj)_tz", + "dsd-blyp-d3(bj)/def2-tzvp", ] __energy_unit__ = "ev" - __distance_unit__ = "ang" - __forces_unit__ = "ev/ang" + __distance_unit__ = "bohr" + __forces_unit__ = "ev/bohr" energy_target_names = [ "DSD-BLYP-D3(BJ):def2-TZVP Atomization Energy", ] __force_methods__ = [ - "dsd-blyp-d3(bj)_tz", + "dsd-blyp-d3(bj)/def2-tzvp", ] force_target_names = [ "DSD-BLYP-D3(BJ):def2-TZVP Gradient", ] - def __init__(self, energy_unit=None, distance_unit=None) -> None: - super().__init__(energy_unit=energy_unit, distance_unit=distance_unit) - def read_raw_entries(self): raw_path = p_join(self.root, "sn2_rxn.h5") samples = read_qc_archive_h5(raw_path, "sn2_rxn", self.energy_target_names, 
self.force_target_names) diff --git a/src/openqdc/datasets/solvated_peptides.py b/src/openqdc/datasets/solvated_peptides.py index 9b44b76..9846bdf 100644 --- a/src/openqdc/datasets/solvated_peptides.py +++ b/src/openqdc/datasets/solvated_peptides.py @@ -27,9 +27,6 @@ class SolvatedPeptides(BaseDataset): __distance_unit__ = "bohr" __forces_unit__ = "hartree/bohr" - def __init__(self, energy_unit=None, distance_unit=None) -> None: - super().__init__(energy_unit=energy_unit, distance_unit=distance_unit) - def read_raw_entries(self): raw_path = p_join(self.root, "solvated_peptides.h5") samples = read_qc_archive_h5(raw_path, "solvated_peptides", self.energy_target_names, self.force_target_names) diff --git a/src/openqdc/datasets/spice.py b/src/openqdc/datasets/spice.py index ace4ecc..974d45f 100644 --- a/src/openqdc/datasets/spice.py +++ b/src/openqdc/datasets/spice.py @@ -6,7 +6,6 @@ from openqdc.datasets.base import BaseDataset from openqdc.utils import load_hdf5_file -from openqdc.utils.constants import MAX_ATOMIC_NUMBER from openqdc.utils.molecule import get_atomic_number_and_charge @@ -21,7 +20,9 @@ def read_record(r): name=np.array([smiles] * n_confs), subset=np.array([Spice.subset_mapping[subset]] * n_confs), energies=r[Spice.energy_target_names[0]][:][:, None].astype(np.float32), - forces=r[Spice.force_target_names[0]][:].reshape(-1, 3, 1) * (-1.0), # forces -ve of energy gradient + forces=r[Spice.force_target_names[0]][:].reshape( + -1, 3, 1 + ), # forces -ve of energy gradient but the -1.0 is done in the convert_forces method atomic_inputs=np.concatenate( (x[None, ...].repeat(n_confs, axis=0), positions), axis=-1, dtype=np.float32 ).reshape(-1, 5), @@ -49,38 +50,16 @@ class Spice(BaseDataset): """ __name__ = "spice" - __energy_methods__ = ["wb97x/def2-tzvp"] - __force_methods__ = ["wb97x/def2-tzvp"] + __energy_methods__ = ["wb97m-d3bj/def2-tzvppd"] + __force_methods__ = ["wb97m-d3bj/def2-tzvppd"] __energy_unit__ = "hartree" - __distance_unit__ = "ang" - 
def convert_forces(self, x):
    """
    Convert SPICE force labels and flip their sign.

    The stored label is the energy gradient ("dft_total_gradient"); forces are
    its negative, so the sign flip is applied here, after the base-class
    conversion, rather than when the raw records are read.
    """
    converted = super().convert_forces(x)
    return (-1.0) * converted
a/src/openqdc/datasets/transition1x.py b/src/openqdc/datasets/transition1x.py index 6a6f844..56ae7e6 100644 --- a/src/openqdc/datasets/transition1x.py +++ b/src/openqdc/datasets/transition1x.py @@ -55,9 +55,6 @@ class Transition1X(BaseDataset): "wB97x_6-31G(d).forces", ] - def __init__(self) -> None: - super().__init__() - def read_raw_entries(self): raw_path = p_join(self.root, "Transition1x.h5") f = load_hdf5_file(raw_path)["data"] diff --git a/src/openqdc/datasets/waterclusters3_30.py b/src/openqdc/datasets/waterclusters3_30.py index 1de2e14..6aa5748 100644 --- a/src/openqdc/datasets/waterclusters3_30.py +++ b/src/openqdc/datasets/waterclusters3_30.py @@ -5,6 +5,7 @@ from tqdm import tqdm from openqdc.datasets.base import BaseDataset +from openqdc.utils.constants import MAX_ATOMIC_NUMBER from openqdc.utils.molecule import atom_table # we could use ase.io.read to read extxyz files @@ -50,14 +51,15 @@ def read_xyz(fname, n_waters): class WaterClusters(BaseDataset): __name__ = "waterclusters3_30" - # need to know where to find the data - __energy_methods__ = ["ttm2.1-f"] + # Energy in hartree, all zeros by default + atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) + __energy_unit__ = "kcal/mol" + __distance_unit__ = "ang" + __forces_unit__ = "kcal/mol/ang" + __energy_methods__ = ["ttm2.1-f"] energy_target_names = ["TTM2.1-F Potential"] - def __init__(self, energy_unit=None, distance_unit=None) -> None: - super().__init__(energy_unit=energy_unit, distance_unit=distance_unit) - def read_raw_entries(self): samples = [] for i in range(3, 31): diff --git a/src/openqdc/utils/__init__.py b/src/openqdc/utils/__init__.py index 92eec25..aeb5321 100644 --- a/src/openqdc/utils/__init__.py +++ b/src/openqdc/utils/__init__.py @@ -1,13 +1,17 @@ from .io import ( check_file, create_hdf5_file, + get_local_cache, + get_remote_cache, load_hdf5_file, load_json, load_pkl, load_torch, makedirs, save_pkl, + set_cache_dir, ) +from .units import get_conversion __all__ = [ 
class IsolatedAtomEnergyFactory:
    """
    Lookup helper returning the isolated atom energies for a level of theory.

    Energies are read from the module-level ``ISOLATED_ATOM_ENERGIES`` table.
    Levels of theory that are not found fall back to the all-zero ``ZEROS``
    table and a warning is logged.
    """

    # Largest absolute atomic charge that has its own column in ``get_matrix``.
    max_charge = 4

    def __init__(self):
        pass

    def __call__(self, level_of_theory: str):
        """
        Wrapper to the get method

        Parameters
        ----------
        level_of_theory: str
            Level of theory in the format "functional/basis" or "functional" if semi empirical
        """
        return self.get(level_of_theory=level_of_theory)

    @staticmethod
    def get(level_of_theory: str) -> Dict[EF_KEY, float]:
        """
        Get the dict of isolated atom energies for a given level of theory

        Parameters
        ----------
        level_of_theory: str
            Level of theory in the format "functional/basis" or "functional" if semi empirical.
            The lookup is case-insensitive.

        Returns
        -------
        dict[tuple[str, int], float]
            Dictionary containing the isolated atom energies for each entry written as a tuple (atom, charge):

            {("H", 1): 0.0, ...}

            The shared ``ZEROS`` table is returned (and a warning logged) when either
            the functional or the basis is unknown.
        """
        level_of_theory = level_of_theory.lower()
        # Exactly one "/" means "functional/basis". Anything else (no slash, or
        # several) is looked up as a single key, which is the semi-empirical case.
        parts = level_of_theory.split("/")
        if len(parts) == 2:
            func, basis = parts
        else:
            func, basis = level_of_theory, None

        functional_dict = ISOLATED_ATOM_ENERGIES.get(func, None)
        if functional_dict is None:
            logger.warning(f"Isolated atom energies not found for {level_of_theory}")
            return ZEROS
        if basis is None:
            return functional_dict
        if basis not in functional_dict:
            # Same diagnostic as for an unknown functional: silently returning
            # zeros here would hide a misspelled or unsupported basis set.
            logger.warning(f"Isolated atom energies not found for {level_of_theory}")
            return ZEROS
        return functional_dict[basis]

    @staticmethod
    def get_matrix(level_of_theory: str) -> np.ndarray:
        """
        Get the matrix of isolated atom energies for a given level of theory

        Parameters
        ----------
        level_of_theory: str
            Level of theory in the format "functional/basis" or "functional" if semi empirical

        Returns
        -------
        np.ndarray of shape (MAX_ATOMIC_NUMBER, 2 * max_charge + 1)
            Matrix containing the isolated atom energies for each atom and charge.
            Rows are indexed by atomic number and columns by ``charge + max_charge``:

            |   | -max_charge | ... | -1 | 0 | +1 | ... | +max_charge | <- charges
            |---|-------------|-----|----|---|----|-----|-------------|
            | 0 |             |     |    |   |    |     |             |
            | 1 |             |     |    |   |    |     |             |
            | 2 |             |     |    |   |    |     |             |

            Entries without a tabulated energy are left at 0.

        Raises
        ------
        IndexError
            If a tabulated charge lies outside [-max_charge, +max_charge].
        """
        shift = IsolatedAtomEnergyFactory.max_charge
        matrix = np.zeros((MAX_ATOMIC_NUMBER, shift * 2 + 1))
        tuple_hashmap = IsolatedAtomEnergyFactory.get(level_of_theory)
        if tuple_hashmap is None:
            return matrix
        for (symbol, charge), energy in tuple_hashmap.items():
            if abs(charge) > shift:
                # Without this check a charge below -max_charge would yield a
                # negative column index and silently land in the wrong column.
                raise IndexError(
                    f"Charge {charge} of {symbol} is outside the supported range [-{shift}, {shift}]"
                )
            # NOTE(review): some tables store None for unavailable energies;
            # confirm that the resulting matrix entry is what callers expect.
            matrix[atomic_numbers[symbol], charge + shift] = energy
        return matrix
-1.7411359557542052, + ("C", 1): -1.1060742863488982, + ("N", -1): -3.128423313087365, + ("N", 0): -2.8988862104065958, + ("N", 1): -2.1782414865973068, + ("O", -1): -4.705386032968986, + ("O", 0): -4.352652340864803, + ("O", 1): -3.3929027848641797, + ("F", -1): -5.322297034311178, + ("F", 0): -4.9969448424630265, + ("Na", 1): 0.12295400000000001, + ("Mg", 2): 1.0016353333333334, + ("Si", 4): 5.448927240930351, + ("Si", 0): -1.625263132618416, + ("Si", -4): -4.503876330547808, + ("P", 0): -2.4250620380497385, + ("P", 1): -1.7319786163576927, + ("S", -1): -3.761566793286506, + ("S", 0): -3.535920743315634, + ("S", 1): -2.772567335542398, + ("Cl", -2): -4.177925186599567, + ("Cl", -1): -4.527948236258716, + ("Cl", 0): -4.166353944016668, + ("Cl", 2): -2.3809951798365505, + ("K", 1): 0.08160976666666667, + ("Ca", 2): 0.5662308, + ("Br", -1): -3.957113536482028, + ("Br", 0): -3.818039553459528, + ("I", -1): -4.043592677461303, + ("I", 0): -3.885757275227844, +} +GFN2 = { + ("H", -1): -0.6107466928548624, + ("H", 0): -0.3934827590437188, + ("H", 1): 0.22955216666666667, + ("Li", 1): 0.1659637, + ("B", -3): 0.4947743711421284, + ("B", -1): -0.8833252789733281, + ("B", 0): -0.9524366145568732, + ("B", 3): 2.886742362272, + ("C", -1): -1.9209221941523813, + ("C", 0): -1.7951105194038206, + ("C", 1): -1.7951105194038206, + ("N", -1): -2.8228473813671173, + ("N", 0): -2.609452454632062, + ("N", 1): -1.9127945803017519, + ("O", -1): -4.0689442489122944, + ("O", 0): -3.769421095414337, + ("O", 1): -2.948538063156781, + ("F", -1): -4.909635517185826, + ("F", 0): -4.619339955465996, + ("Na", 1): 0.19548556666666667, + ("Mg", 2): 1.3160877333333334, + ("Si", 4): 4.473259319583333, + ("Si", 0): -1.5714240856447492, + ("Si", -4): -1.0243162958137662, + ("P", 0): -2.377807088085606, + ("P", 1): -1.8635041144652795, + ("S", -1): -3.4046900452338025, + ("S", 0): -3.1482710158768508, + ("S", 1): -2.5869831371080387, + ("Cl", -2): -4.249780801412338, + ("Cl", -1): -4.785133953760966, + 
("Cl", 2): -2.6084223252074965, + ("Cl", 0): -4.482525134292114, + ("K", 1): 0.19157049999999998, + ("Ca", 2): 1.1759288, + ("Br", -1): -4.332231166471951, + ("Br", 0): -4.048339370569741, + ("I", -1): -4.060355599036047, + ("I", 0): -3.7796302627467933, +} +DFTB = { + ("H", -1): -0.267450800, + ("H", 0): -0.2386004000, + ("H", 1): 0.2097500000, + ("Li", 1): 0.000000000, + ("B", -3): 0.1087536003, + ("B", -1): -0.8108828001, + ("B", 0): -0.8263560001, + ("B", 3): 1.3330350000, + ("C", -1): -1.4104987700, + ("C", 0): -1.3984936602, + ("C", 1): -1.0217885507, + ("N", -1): -2.1474619199, + ("N", 0): -2.1021839400, + ("N", 1): -1.6260059609, + ("O", -1): -3.1706232699, + ("O", 0): -3.0861916005, + ("O", 1): -2.5063599300, + ("F", -1): -4.3647240000, + ("F", 0): -4.2352190003, + ("Na", 1): 0.0825500000, + ("Mg", 2): 0.4492000000, + ("Si", 4): 0.2875390800, + ("Si", 0): -1.0920777201, + ("Si", -4): 1.9808720000, + ("P", 0): -1.6295741400, + ("P", 1): -1.2821088196, + ("S", -1): -2.3857500900, + ("S", 0): -2.2921235603, + ("S", 1): -1.8696970300, + ("Cl", -2): -3.31200000, + ("Cl", -1): -3.2238180000, + ("Cl", 0): -3.0908230002, + ("Cl", 2): -1.7244330000, + ("K", 1): 0.0678210000, + ("Ca", 2): 0.3528980000, + ("Br", -1): -3.0478250000, + ("Br", 0): -2.9228540002, + ("I", -1): -2.6981275000, + ("I", 0): -2.5796080002, +} +PM6 = { + ("H", -1): 0.20069130482, + ("H", 0): 0.08302988483033709, + ("H", 1): 0.49634827548, + ("Li", 1): 0.23429648020984556, + ("B", -3): 1.042845967149475, + ("B", -1): 0.2915413006028599, + ("B", 0): 0.2162518784591137, + ("B", 3): 2.036692812374006, + ("C", -1): 0.3702885058222273, + ("C", 0): 0.34355728762455995, + ("C", 1): 0.5942116527412356, + ("N", -1): 0.29851662685316066, + ("N", 0): 0.3266578327960236, + ("N", 1): 0.8167661499675701, + ("O", -1): 0.06245921572439598, + ("O", 0): 0.2760200570828466, + ("O", 1): 0.6881966155067099, + ("F", -1): -0.09819551592088718, + ("F", 0): 0.030103153898987902, + ("Na", 1): 0.20761332506784766, + 
("Mg", 2): 0.8654790767941177, + ("Si", 4): 2.6874249452995893, + ("Si", 0): 0.19559781612694002, + ("Si", -4): 0.909424581958187, + ("P", 0): 0.1881765839215055, + ("P", 1): 0.5283679118546506, + ("S", -1): 0.00773920374050412, + ("S", 0): 0.15340740929612162, + ("S", 1): 0.5198027279290017, + ("Cl", -2): 3.87282505908, + ("Cl", -1): -0.09598933242391743, + ("Cl", 2): 1.6530454862, + ("Cl", 0): 0.04614458119325779, + ("K", 1): 0.17382321209735638, + ("Ca", 2): 0.6490542924483952, + ("Br", -1): -0.0878626123290662, + ("Br", 0): 0.04068832478896717, + ("I", -1): -0.06868953273976947, + ("I", 0): 0.038916541436059084, +} + + +# tpssh/def2-tzvp +TMQM = { + ("H", -1): -0.5066148831768739, + ("H", 0): -0.4998936035891093, + ("H", 1): 0.0, + ("Li", 1): -7.285942861425713, + ("B", -3): -24.011884397333016, + ("B", -1): -24.671478908940745, + ("B", 0): -24.66555991803692, + ("B", 3): -22.03729209090186, + ("C", -1): -37.902383828698945, + ("C", 0): -37.8619600939805, + ("C", 1): -37.44108173595555, + ("N", -1): -54.58878376740317, + ("N", 0): -54.61011499135528, + ("N", 1): -54.07150720832228, + ("O", -1): -75.12797596615384, + ("O", 0): -75.0993524949928, + ("O", 1): -74.58770047919643, + ("F", -1): -99.86387164958151, + ("F", 0): -99.76596802854195, + ("Na", 1): -162.0916076478938, + ("Mg", 2): -199.24528576913457, + ("Si", 4): -285.59703939232946, + ("Si", 0): -289.3842044105128, + ("Si", -4): -288.1798768489279, + ("P", 0): -341.2798907965112, + ("P", 1): -340.89320025019333, + ("S", -1): -398.19525449701325, + ("S", 0): -398.130358877624, + ("S", 1): -397.7467993687058, + ("Cl", -2): -459.4908872312368, + ("Cl", -1): -460.28412127843484, + ("Cl", 0): -460.1641720279233, + ("Cl", 2): -458.485405333257, + ("K", 1): -599.7644436257333, + ("Ca", 2): -676.9154959968483, + ("Br", -1): -2574.1448096288846, + ("Br", 0): -2574.0232838745055, + ("I", -1): -297.70580680306847, + ("I", 0): -297.5887657326151, +} +# "wb97m-d3bj/def2-TZVPPD" +SPICE = { + ("H", -1): 
-0.5027370838426788, + ("H", 0): -0.4987605100487541, + ("H", 1): 0.0, + ("Li", 1): -7.285254714046117, + ("B", -3): -24.191211616488623, + ("B", -1): -24.677421752607636, + ("B", 0): -24.671520535412856, + ("B", 3): -22.051237471894204, + ("C", -1): -37.914241357934024, + ("C", 0): -37.872645072317844, + ("C", 1): -37.45349214963851, + ("N", -1): -54.602291095940885, + ("N", 0): -54.62327513391132, + ("N", 1): -54.08594142612827, + ("O", -1): -75.17101657361833, + ("O", 0): -75.11317840403545, + ("O", 1): -74.6024151438455, + ("F", -1): -99.9129873233742, + ("F", 0): -99.78611622966918, + ("Na", 1): -162.11366478753402, + ("Mg", 2): -199.26884200420963, + ("Si", 4): -285.6283113353237, + ("Si", 0): -289.413135230185, + ("Si", -4): -288.27589059244787, + ("P", 0): -341.3059197004091, + ("P", 1): -340.92583924542475, + ("S", -1): -398.24053870171247, + ("S", 0): -398.15996366615616, + ("S", 1): -397.7746615960709, + ("Cl", -2): -460.08763805127313, + ("Cl", -1): -460.33502435018204, + ("Cl", 0): -460.1988762286936, + ("Cl", 2): -458.7438528011782, + ("K", 1): -599.8025677532396, + ("Ca", 2): -676.9528465165403, + ("Br", -1): -2574.2451510820465, + ("Br", 0): -2574.1167240800246, + ("I", -1): -297.88138299501395, + ("I", 0): -297.7622891423178, +} +# "revpbe-d3(bj)/def2-tzvp" +SolvatedPeptides = { + ("H", -1): -0.4931715827683033, + ("H", 0): -0.5041476427597161, + ("H", 1): 0.0, + ("Li", 1): -7.280731201437635, + ("B", -3): -24.006372610643076, + ("B", -1): -24.660992037766704, + ("B", 0): -24.652853868669744, + ("B", 3): -22.023688582481086, + ("C", -1): -37.88698396215454, + ("C", 0): -37.845600548516586, + ("C", 1): -37.42375720909004, + ("N", -1): -54.56844448819074, + ("N", 0): -54.58772405988695, + ("N", 1): -54.04957647943518, + ("O", -1): -75.10545816278959, + ("O", 0): -75.07120398742593, + ("O", 1): -74.55841255571633, + ("F", -1): -99.83653702337733, + ("F", 0): -99.7348800787186, + ("Na", 1): -162.04202541023028, + ("Mg", 2): -199.1857779742493, + ("Si", 
4): -285.5196533711662, + ("Si", 0): -289.31537776907356, + ("Si", -4): -288.11458640061954, + ("P", 0): -341.20094262951534, + ("P", 1): -340.81665455610573, + ("S", -1): -398.10497764958086, + ("S", 0): -398.04159371790865, + ("S", 1): -397.6599146755941, + ("Cl", -2): -459.3527862471638, + ("Cl", -1): -460.1836953722962, + ("Cl", 0): -460.0661711540315, + ("Cl", 2): -458.51775405333257, + ("K", 1): -599.6472569880391, + ("Ca", 2): -676.7916386065199, + ("Br", -1): -2574.0081469191155, + ("Br", 0): -2573.890240418883, + ("I", -1): -297.8357436124949, + ("I", 0): -297.72268439613055, +} +# "DSD-BLYP-D3BJ/def2-TZVPPD" +SN2RXN = { + ("H", -1): -0.4931715827683033, + ("H", 0): -0.4990585651127987, + ("H", 1): 0.0, + ("Li", 1): -7.2751828330696995, + ("B", -3): -24.127790514752746, + ("B", -1): -24.62825292497449, + ("B", 0): -24.628518170377323, + ("B", 3): -22.01440439226537, + ("C", -1): -37.85187643574064, + ("C", 0): -37.81800653654633, + ("C", 1): -37.4026616247957, + ("N", -1): -54.529773519860626, + ("N", 0): -54.55929475542038, + ("N", 1): -54.02654716655024, + ("O", -1): -75.08730105751656, + ("O", 0): -75.03632370546934, + ("O", 1): -74.53620016366052, + ("F", -1): -99.82374475663487, + ("F", 0): -99.6990797359127, + ("Na", 1): -161.96633141740327, + ("Mg", 2): -199.1186151803418, + ("Si", 4): -285.4592439444118, + ("Si", 0): -289.2354767511652, + ("Si", -4): -288.12487758144147, + ("P", 0): -341.1278868392075, + ("P", 1): -340.7469511203367, + ("S", -1): -398.0441756257772, + ("S", 0): -397.9705195592595, + ("S", 1): -397.5944122508692, + ("Cl", -2): -459.3527862471638, + ("Cl", -1): -460.13181548141955, + ("Cl", 0): -460.0006937311494, + ("Cl", 2): -458.51775405333257, + ("K", 1): -599.4901238823808, + ("Ca", 2): -676.6456698988475, + ("Br", -1): -2573.604327011817, + ("Br", 0): -2573.477602568216, + ("I", -1): -297.5733470600828, + ("I", 0): -297.4541938789708, +} +# "b3lyp/6-31g*" +QMUGS_DFT = { + ("H", -1): -0.4618190740256503, + ("H", 0): 
-0.5002733301377901, + ("H", 1): 0.0, + ("Li", 1): -7.284546111273075, + ("B", -3): -23.577268753399462, + ("B", -1): -24.614577395156598, + ("B", 0): -24.65435524492553, + ("B", 3): -22.018169862974275, + ("C", -1): -37.844269871879376, + ("C", 0): -37.84628033285479, + ("C", 1): -37.42731164237431, + ("N", -1): -54.52864356359092, + ("N", 0): -54.584488815424095, + ("N", 1): -54.0458621835885, + ("O", -1): -75.05272792994404, + ("O", 0): -75.06062109946738, + ("O", 1): -74.54659271939704, + ("F", -1): -99.75408410035712, + ("F", 0): -99.71553471526475, + ("Na", 1): -162.081235395777, + ("Mg", 2): -199.22734695613283, + ("Si", 4): -285.5564410277949, + ("Si", 0): -289.3717359984153, + ("Si", -4): -288.02795351148654, + ("P", 0): -341.2580911838578, + ("P", 1): -340.8765976669208, + ("S", -1): -398.16568433994024, + ("S", 0): -398.1049932797066, + ("S", 1): -397.7199808615457, + ("Cl", -2): -459.5066184980746, + ("Cl", -1): -460.25223446009306, + ("Cl", 0): -460.13624346967765, + ("Cl", 2): -458.6740467177361, + ("K", 1): -599.7247062673807, + ("Ca", 2): -676.8667395990246, + ("Br", -1): -2573.824201570383, + ("Br", 0): -2573.705283744811, + ("I", -1): None, # not available with this basis set + ("I", 0): None, # not available with this basis set +} +# "wb97x-d3/def2-tzvp" +ORBNET = { + ("H", -1): -0.5051390575292232, + ("H", 0): -0.5025865385814652, + ("H", 1): 0.0, + ("Li", 1): -7.289728176048259, + ("B", -3): -23.984063702375366, + ("B", -1): -24.655892805089884, + ("B", 0): -24.652426319775287, + ("B", 3): -22.068923453406843, + ("C", -1): -37.88249635015094, + ("C", 0): -37.84495506623085, + ("C", 1): -37.42572594563294, + ("N", -1): -54.566013571722955, + ("N", 0): -54.58956332659741, + ("N", 1): -54.053510120855016, + ("O", -1): -75.10770262264376, + ("O", 0): -75.07371685344017, + ("O", 1): -74.56770852466894, + ("F", -1): -99.84730255807874, + ("F", 0): -99.74441357744517, + ("Na", 1): -162.08090997566165, + ("Mg", 2): -199.2423311291131, + ("Si", 4): 
-285.61307018231093, + ("Si", 0): -289.36007009205474, + ("Si", -4): -288.13938913442, + ("P", 0): -341.2535866489386, + ("P", 1): -340.8713081439191, + ("S", -1): -398.17523835330115, + ("S", 0): -398.1081144325829, + ("S", 1): -397.7235371215097, + ("Cl", -2): -459.55571935610567, + ("Cl", -1): -460.26962615981756, + ("Cl", 0): -460.1472726772528, + ("Cl", 2): -458.68793188715097, + ("K", 1): -599.7560426196044, + ("Ca", 2): -676.9122500284535, + ("Br", -1): -2574.293316484485, + ("Br", 0): -2574.1721188129304, + ("I", -1): -297.8647496186801, + ("I", 0): -297.7482461760336, +} +# "wb97x-d/def2-svp" +NABLADFT = { + ("H", -1): -0.487196574630614, + ("H", 0): -0.5024927493280441, + ("H", 1): 0.0, + ("Li", 1): -7.289461512680954, + ("B", -3): -23.76326340520956, + ("B", -1): -24.616565541453497, + ("B", 0): -24.62229041950939, + ("B", 3): -22.05799995059738, + ("C", -1): -37.819977678758974, + ("C", 0): -37.79809943233551, + ("C", 1): -37.37569908192604, + ("N", -1): -54.459277717462086, + ("N", 0): -54.522416758144296, + ("N", 1): -53.98339066860825, + ("O", -1): -74.96664546628877, + ("O", 0): -74.97667950172594, + ("O", 1): -74.47138898492452, + ("F", -1): -99.66683980036512, + ("F", 0): -99.61447206028255, + ("Na", 1): -162.0226698276339, + ("Mg", 2): -199.1739400418112, + ("Si", 4): -285.52441678317916, + ("Si", 0): -289.2630396380861, + ("Si", -4): -287.76522279776617, + ("P", 0): -341.13939934765074, + ("P", 1): -340.75715448577955, + ("S", -1): -398.0129589348639, + ("S", 0): -397.9719510287289, + ("S", 1): -397.58695970543334, + ("Cl", -2): -459.17907026002734, + ("Cl", -1): -460.0809386171713, + ("Cl", 0): -459.9885726673416, + ("Cl", 2): -458.52265869014025, + ("K", 1): -599.6772169304438, + ("Ca", 2): -676.8244048230532, + ("Br", -1): -2573.9600885084546, + ("Br", 0): -2573.856581446253, + ("I", -1): -297.8445820598362, + ("I", 0): -297.7376955031015, +} +# "wb97x/6-31g(d)" +ANI1 = { + ("H", -1): -0.45658037701866955, + ("H", 0): -0.4993457316092281, + 
("H", 1): 0.0, + ("Li", 1): -7.2856300653219614, + ("B", -3): -23.575157416550805, + ("B", -1): -24.603134775026213, + ("B", 0): -24.642610267398982, + ("B", 3): -22.07124234970699, + ("C", -1): -37.834042127064706, + ("C", 0): -37.83384116353608, + ("C", 1): -37.41881056856161, + ("N", -1): -54.513028620185864, + ("N", 0): -54.573313922039716, + ("N", 1): -54.036340248157515, + ("O", -1): -75.03386211245754, + ("O", 0): -75.04249624495868, + ("O", 1): -74.53884510892807, + ("F", -1): -99.7350451879463, + ("F", 0): -99.69494212517318, + ("Na", 1): -162.0682250235374, + ("Mg", 2): -199.22919949102433, + ("Si", 4): -285.5967323489095, + ("Si", 0): -289.3398443488577, + ("Si", -4): -288.0053873657048, + ("P", 0): -341.2319240654614, + ("P", 1): -340.85012602930203, + ("S", -1): -398.14261145000256, + ("S", 0): -398.0814606242194, + ("S", 1): -397.6998359561112, + ("Cl", -2): -459.479319530353, + ("Cl", -1): -460.2341096421279, + ("Cl", 0): -460.1166957612669, + ("Cl", 2): -458.6588365149308, + ("K", 1): -599.7184666927276, + ("Ca", 2): -676.8704088358037, + ("Br", -1): -2573.8502718776604, + ("Br", 0): -2573.733913792756, + ("I", -1): None, # not available with this basis set + ("I", 0): None, # not available with this basis set +} +# "WB97X/6-31g*" +COMP6_1 = { + ("H", -1): -0.4565803770186695, + ("H", 0): -0.4993457316092281, + ("H", 1): 0.0, + ("Li", 1): -7.285630065321961, + ("B", -3): -23.5751574165508, + ("B", -1): -24.603134775026216, + ("B", 0): -24.64261026739898, + ("B", 3): -22.071242349706992, + ("C", -1): -37.834042127064706, + ("C", 0): -37.83384116353608, + ("C", 1): -37.4188105685616, + ("N", -1): -54.5130286201859, + ("N", 0): -54.57331392203972, + ("N", 1): -54.03634024815754, + ("O", -1): -75.03386211245756, + ("O", 0): -75.0424962449587, + ("O", 1): -74.5388451089281, + ("F", -1): -99.7350451879463, + ("F", 0): -99.69494212517317, + ("Na", 1): -162.06822502353745, + ("Mg", 2): -199.2291994910244, + ("Si", 4): -285.5967323489095, + ("Si", 0): 
-289.3398443488578, + ("Si", -4): -288.00538736570485, + ("P", 0): -341.2319240654613, + ("P", 1): -340.85012602930215, + ("S", -1): -398.14261145000256, + ("S", 0): -398.0814606242193, + ("S", 1): -397.6998359561114, + ("Cl", -2): -459.47931953035305, + ("Cl", -1): -460.23410964212803, + ("Cl", 0): -460.1166957612671, + ("Cl", 2): -458.65883651493084, + ("K", 1): -599.7184666927277, + ("Ca", 2): -676.8704088358036, + ("Br", -1): -2573.8502718776604, + ("Br", 0): -2573.7339137927547, + ("I", -1): None, # not available with this basis set + ("I", 0): None, # not available with this basis set +} +# "ccsd/aug-cc-pVDZ" +ccsdaug = { + ("H", -1): -0.5240286252725133, + ("H", 0): -0.49933431543958506, + ("H", 1): 0.0, + ("Li", 1): -7.23623079003172, + ("B", -3): -24.135298809957895, + ("B", -1): -24.595731151135812, + ("B", 0): -24.591070884515084, + ("B", 3): -21.985913735106703, + ("C", -1): -37.80520563794191, + ("C", 0): -37.76484921430014, + ("C", 1): -37.35862660518426, + ("N", -1): -54.46561904421205, + ("N", 0): -54.48723914213882, + ("N", 1): -53.959899854043286, + ("O", -1): -74.96558003564495, + ("O", 0): -74.9255348291028, + ("O", 1): -74.4432579985748, + ("F", -1): -99.66462266282274, + ("F", 0): -99.54960172383534, + ("Na", 1): -161.67194573263333, + ("Mg", 2): -198.8268633109654, + ("Si", 4): -285.1795420310209, + ("Si", 0): -288.9225171059681, + ("Si", -4): -288.13012523255236, + ("P", 0): -340.80119511758613, + ("P", 1): -340.42190068851625, + ("S", -1): -397.67826887815926, + ("S", 0): -397.6146112492681, + ("S", 1): -397.2542253763525, + ("Cl", -2): -459.42201473799554, + ("Cl", -1): -459.7398865093852, + ("Cl", 0): -459.6156482951034, + ("Cl", 2): -458.1975299396907, + ("K", 1): None, # not available with this basis set + ("Ca", 2): None, # not available with this basis set + ("Br", -1): -2572.6265539931533, + ("Br", 0): -2572.5063313966352, + ("I", -1): None, # not available with this basis set + ("I", 0): None, # not available with this basis set +} 
+# "ccsd(t)/aug-cc-pVDZ" +ccsdtaug = { + ("H", -1): -0.489676276755859, + ("H", 0): -0.4993343154395853, + ("H", 1): 0.0, + ("Li", 1): -7.236230790031718, + ("B", -3): -24.14659676027675, + ("B", -1): -24.59834841644963, + ("B", 0): -24.592013924578307, + ("B", 3): -21.98591373510674, + ("C", -1): -37.80822234639533, + ("C", 0): -37.7661399495972, + ("C", 1): -37.3593489962868, + ("N", -1): -54.46970203317129, + ("N", 0): -54.488530163663306, + ("N", 1): -53.96079905255966, + ("O", -1): -74.97107484978555, + ("O", 0): -74.92736838177342, + ("O", 1): -74.44405741349318, + ("F", -1): -99.67058259815346, + ("F", 0): -99.55194323117622, + ("Na", 1): -161.67196199847683, + ("Mg", 2): -198.8269101640321, + ("Si", 4): -285.1796031904412, + ("Si", 0): -288.9239884021825, + ("Si", -4): -288.14250182593497, + ("P", 0): -340.80293105856066, + ("P", 1): -340.4231288782063, + ("S", -1): -397.68239119590464, + ("S", 0): -397.61679149962197, + ("S", 1): -397.2555638941634, + ("Cl", -1): -459.74421517568555, + ("Cl", 0): -459.6181191157645, + ("K", 1): None, # not available with this basis set + ("Ca", 2): None, # not available with this basis set + ("Br", -1): -2572.630606833861, + ("Br", 0): -2572.508930744571, + ("I", -1): None, # not available with this basis set + ("I", 0): None, # not available with this basis set +} +# "mp2/aug-cc-pVDZ" +mp2aug = { + ("H", -1): -0.5118536127440081, + ("H", 0): -0.4993343154395852, + ("H", 1): 0.0, + ("Li", 1): -7.2362434239942885, + ("B", -3): -24.11454063530035, + ("B", -1): -24.57403291869507, + ("B", 0): -24.568723938484855, + ("B", 3): -21.98592739023366, + ("C", -1): -37.78658968444089, + ("C", 0): -37.74289655875525, + ("C", 1): -37.33330128905729, + ("N", -1): -54.44347106000461, + ("N", 0): -54.46985977846849, + ("N", 1): -53.93770877612693, + ("O", -1): -74.95558042845218, + ("O", 0): -74.90882930239204, + ("O", 1): -74.42742702171483, + ("F", -1): -99.66810645703836, + ("F", 0): -99.5377379527871, + ("Na", 1): -161.67200581779124, 
+ ("Mg", 2): -198.8269131203642, + ("Si", 4): -285.17950758651557, + ("Si", 0): -288.90336148257995, + ("Si", -4): -288.12382709478203, + ("P", 0): -340.78346939708916, + ("P", 1): -340.4015180393644, + ("S", -1): -397.6614469463811, + ("S", 0): -397.5953187556735, + ("S", 1): -397.236034450623, + ("Cl", -2): -459.4111711211486, + ("Cl", -1): -459.7293671162834, + ("Cl", 0): -459.5986332871817, + ("Cl", 2): -458.16109262813154, + ("K", 1): None, # not available with this basis set + ("Ca", 2): None, # not available with this basis set + ("Br", -1): -2571.9455214335435, + ("Br", 0): -2571.8203622687925, + ("I", -1): None, # not available with this basis set + ("I", 0): None, # not available with this basis set +} +# "mp2/def2-TZVP" +mp2def2TZVP = { + ("H", -1): -0.48253121006249655, + ("H", 0): -0.4998098322318883, + ("H", 1): 0.0, + ("Li", 1): -7.26625465274989, + ("B", -3): -23.89130329586724, + ("B", -1): -24.58967154224317, + ("B", 0): -24.59074548143485, + ("B", 3): -21.99943494200725, + ("C", -1): -37.81110910609783, + ("C", 0): -37.77471406753249, + ("C", 1): -37.36120515772786, + ("N", -1): -54.474221753525356, + ("N", 0): -54.51486367243164, + ("N", 1): -53.97922862858532, + ("O", -1): -75.00152176187984, + ("O", 0): -74.97513105465687, + ("O", 1): -74.48759502971161, + ("F", -1): -99.73457909250294, + ("F", 0): -99.62808382176112, + ("Na", 1): -161.83073450947992, + ("Mg", 2): -198.9798405609494, + ("Si", 4): -285.26774080524564, + ("Si", 0): -289.0086162111446, + ("Si", -4): -287.737519515362, + ("P", 0): -340.89251993087385, + ("P", 1): -340.5074615537276, + ("S", -1): -397.7717421040001, + ("S", 0): -397.71573728264894, + ("S", 1): -397.34975334831165, + ("Cl", -2): -459.09862455580026, + ("Cl", -1): -459.84969455647206, + ("Cl", 0): -459.7312731162239, + ("Cl", 2): -458.28486559837125, + ("K", 1): -599.1623610013563, + ("Ca", 2): -676.3191334447123, + ("Br", -1): -2572.8329868011315, + ("Br", 0): -2572.7140648042205, + ("I", -1): -297.32915651116025, + 
("I", 0): -297.2135511448063, +} +# SVWN/def2-TZVP +COMP6_7 = { + ("H", -1): -0.5173468733170209, + ("H", 0): -0.4961415246858913, + ("H", 1): 0.0, + ("Li", 1): -7.182160595407815, + ("B", -3): -23.858154175760482, + ("B", -1): -24.477102446655582, + ("B", 0): -24.446672986035107, + ("B", 3): -21.78388674779827, + ("C", -1): -37.648803413486476, + ("C", 0): -37.57960202253736, + ("C", 1): -37.13377025356311, + ("N", -1): -54.268858501552714, + ("N", 0): -54.264236284313675, + ("N", 1): -53.69660297293359, + ("O", -1): -74.75021611814427, + ("O", 0): -74.68022879998783, + ("O", 1): -74.14595350398997, + ("F", -1): -99.4308126971536, + ("F", 0): -99.2855801211432, + ("Na", 1): -161.43940087938617, + ("Mg", 2): -198.482989208704, + ("Si", 4): -284.6095063412437, + ("Si", 0): None, + ("Si", -4): -287.36361152706985, + ("P", 0): -340.28781390909336, + ("P", 1): None, + ("S", -1): -396.74391290562517, + ("S", 0): -397.0472344910708, + ("S", 1): -396.6400428334645, + ("Cl", -2): None, + ("Cl", -1): -459.1427217366059, + ("Cl", 0): -457.029433121817, + ("Cl", 2): -457.5432679710133, + ("K", 1): -598.3826110301004, + ("Ca", 2): -675.4148005786843, + ("Br", -1): -2571.43279407191, + ("Br", 0): None, + ("I", -1): -297.89817894897124, + ("I", 0): None, +} +# "PBE-D3BJ2B/def2-TZVP" +COMP6_5 = { + ("H", -1): -0.4984251407077053, + ("H", 0): -0.49963874688778964, + ("H", 1): 0.0, + ("Li", 1): -7.256644236856915, + ("B", -3): -23.965651173919607, + ("B", -1): -24.61987718656591, + ("B", 0): -24.610084509857693, + ("B", 3): -21.981186468975643, + ("C", -1): -37.839839802893856, + ("C", 0): -37.79597394493031, + ("C", 1): -37.37216480722536, + ("N", -1): -54.51524854184836, + ("N", 0): -54.53214830302369, + ("N", 1): -53.99133373760564, + ("O", -1): -75.04792601078884, + ("O", 0): -75.00968214869428, + ("O", 1): -74.49434051926339, + ("F", -1): -99.77558183886408, + ("F", 0): -99.6691400940838, + ("Na", 1): -161.96413737180777, + ("Mg", 2): -199.10001096170987, + ("Si", 4): 
-285.4180171255296, + ("Si", 0): -289.2228701070572, + ("Si", -4): -288.0227167833236, + ("P", 0): -341.1030537066697, + ("P", 1): -340.7177213193741, + ("S", -1): -398.00391422389356, + ("S", 0): -397.93836821335026, + ("S", 1): -397.5554184472038, + ("Cl", -2): -459.386408262179, + ("Cl", -1): -460.0784728779802, + ("Cl", 0): -459.9584144179813, + ("Cl", 2): -458.5661867317756, + ("K", 1): -599.5277926006078, + ("Ca", 2): -676.665524794864, + ("Br", -1): -2573.8415230490864, + ("Br", 0): -2573.720729522128, + ("I", -1): -297.7815346863239, + ("I", 0): -297.66553802500096, +} +# "B3LYP-D3MBJ2B/def2-TZVP" +COMP6_2 = { + ("H", -1): -0.5104276111528594, + ("H", 0): -0.5021763508982502, + ("H", 1): 0.0, + ("Li", 1): -7.28605166725753, + ("B", -3): -24.00227248681287, + ("B", -1): -24.670150534162623, + ("B", 0): -24.66392221445664, + ("B", 3): -22.020454695632036, + ("C", -1): -37.89817823158867, + ("C", 0): -37.85948152785869, + ("C", 1): -37.43552078960403, + ("N", -1): -54.58873727556918, + ("N", 0): -54.60398141018468, + ("N", 1): -54.065523148633176, + ("O", -1): -75.13521710860505, + ("O", 0): -75.09628346877744, + ("O", 1): -74.57769937644677, + ("F", -1): -99.87634645410799, + ("F", 0): -99.77016379237457, + ("Na", 1): -162.09255440877646, + ("Mg", 2): -199.2394349246892, + ("Si", 4): -285.575845762374, + ("Si", 0): -289.3920722437195, + ("Si", -4): -288.17382798168956, + ("P", 0): -341.28064911053326, + ("P", 1): -340.89904032318145, + ("S", -1): -398.200223492228, + ("S", 0): -398.1324076067549, + ("S", 1): -397.7448455107872, + ("Cl", -2): -459.58678053070076, + ("Cl", -1): -460.2889124003806, + ("Cl", 0): -460.16699382696663, + ("Cl", 2): -458.70493083496865, + ("K", 1): -599.7602668684151, + ("Ca", 2): -676.9064118669689, + ("Br", -1): -2574.264312179195, + ("Br", 0): -2574.140975849301, + ("I", -1): -297.89704873064437, + ("I", 0): -297.7784640477503, +} +# "b3lyp/def2-TZVP" +COMP6_3 = { + ("H", -1): -0.5104276111528594, + ("H", 0): -0.5021763508982502, 
+ ("H", 1): 0.0, + ("Li", 1): -7.2860516672575315, + ("B", -3): -24.002272486812885, + ("B", -1): -24.67015053416263, + ("B", 0): -24.663922214456655, + ("B", 3): -22.020454695632043, + ("C", -1): -37.89817823158866, + ("C", 0): -37.85948152785869, + ("C", 1): -37.435520789604034, + ("N", -1): -54.588737275569194, + ("N", 0): -54.603981410184666, + ("N", 1): -54.065523148633176, + ("O", -1): -75.13521710860508, + ("O", 0): -75.09628346877746, + ("O", 1): -74.57769937644687, + ("F", -1): -99.8763464541079, + ("F", 0): -99.7701637923746, + ("Na", 1): -162.0925544087764, + ("Mg", 2): -199.23943492468925, + ("Si", 4): -285.5758457623741, + ("Si", 0): -289.3920722437192, + ("Si", -4): -288.1738279816895, + ("P", 0): -341.28064911053326, + ("P", 1): -340.8990403231815, + ("S", -1): -398.2002234922283, + ("S", 0): -398.1324076067552, + ("S", 1): -397.744845510787, + ("Cl", -2): -459.58678053070065, + ("Cl", -1): -460.28891240038075, + ("Cl", 0): -460.1669938269668, + ("Cl", 2): -458.70493083496893, + ("K", 1): -599.7602668684153, + ("Ca", 2): -676.9064118669687, + ("Br", -1): -2574.264312179194, + ("Br", 0): -2574.140975849301, + ("I", -1): -297.8970487306444, + ("I", 0): -297.7784640477502, +} + +# ccsd(t)/cc-pVDZ +GDML_2 = { + ("H", -1): -0.489739656382323, + ("H", 0): -0.49927840341958285, + ("H", 1): 0.0, + ("Li", 1): -7.236223739656382, + ("B", -3): -23.61782373835322, + ("B", -1): -24.528388906235705, + ("B", 0): -24.590264050112527, + ("B", 3): -21.98588333987049, + ("C", -1): -37.688228871632006, + ("C", 0): -37.70277208656365, + ("C", 1): -37.3579597779074, + ("N", -1): -54.321974972075715, + ("N", 0): -54.373768477368074, + ("N", 1): -53.87510137954731, + ("O", -1): -74.87516352403559, + ("O", 0): -74.82827800838686, + ("O", 1): -74.30135465859384, + ("F", -1): -99.56030962418485, + ("F", 0): -99.52932183945009, + ("Na", 1): -161.67188329184694, + ("Mg", 2): -198.82669320079302, + ("Si", 4): -285.17919483395195, + ("Si", 0): -288.88085983569533, + ("Si", -4): 
-287.40461285633614, + ("P", 0): -340.7265584017754, + ("P", 1): -340.36984136674585, + ("S", -1): -397.63315120158666, + ("S", 0): -397.55317747510554, + ("S", 1): -397.1659426092399, + ("Cl", -1): -459.69470422539786, + ("Cl", 0): -459.60398876941906, + ("K", 1): None, # not available with this basis set + ("Ca", 2): -676.2271898047749, + ("Br", -1): -2572.584907858833, + ("Br", 0): -2572.4941153123455, + ("I", -1): None, # not available with this basis set + ("I", 0): None, # not available with this basis set +} +# ccsd(t)/cc-pVTZ +ANI1CCX_2 = { + ("H", -1): -0.4963122609799637, + ("H", 0): -0.49980981130184293, + ("H", 1): 0.0, + ("Li", 1): -7.249353374937752, + ("B", -3): -23.793685421585884, + ("B", -1): -24.56648780776967, + ("B", 0): -24.605381789792233, + ("B", 3): -21.991368552278544, + ("C", -1): -37.747141724045164, + ("C", 0): -37.735863889731654, + ("C", 1): -37.37850843579137, + ("N", -1): -54.41337048412563, + ("N", 0): -54.42353049479941, + ("N", 1): -53.91625772121427, + ("O", -1): -74.99249367544891, + ("O", 0): -74.90337716789482, + ("O", 1): -74.36027901195692, + ("F", -1): -99.71046952902925, + ("F", 0): -99.63219230886922, + ("Na", 1): -161.68615285472157, + ("Mg", 2): -198.8436504300981, + ("Si", 4): -285.2290232109956, + ("Si", 0): -288.954195226872, + ("Si", -4): -287.62141587617776, + ("P", 0): -340.79678977311414, + ("P", 1): -340.432199862984, + ("S", -1): -397.7409199255247, + ("S", 0): -397.6361063083311, + ("S", 1): -397.2347675440139, + ("Cl", -2): -459.069378694994, + ("Cl", -1): -459.8163494320064, + ("Cl", 0): -459.70310084056786, + ("Cl", 2): -458.277524056067, + ("K", 1): None, # not available with this basis set + ("Ca", 2): -676.3176100772968, + ("Br", -1): -2572.8167538662433, + ("Br", 0): -2572.702100151291, + ("I", -1): None, # not available with this basis set + ("I", 0): None, # not available with this basis set +} +# ccsd/cc-pVDZ +GDML_1 = { + ("H", -1): -0.49927840341958285, + ("H", 0): -0.49927840341958285, + ("H", 
1): 0.0, + ("Li", 1): -7.236223739656382, + ("B", -3): -23.613877846876942, + ("B", -1): -24.52547666267111, + ("B", 0): -24.589429443373188, + ("B", 3): -21.98588333987049, + ("C", -1): -37.68362301484667, + ("C", 0): -37.69937564411741, + ("C", 1): -37.35727461654343, + ("N", -1): -54.31612564560329, + ("N", 0): -54.3667355223191, + ("N", 1): -53.871756805827864, + ("O", -1): -74.87454456240714, + ("O", 0): -74.82074180638969, + ("O", 1): -74.29143146516834, + ("F", -1): -99.55969095436343, + ("F", 0): -99.5284215563597, + ("Na", 1): -161.67186865791962, + ("Mg", 2): -198.826650230425, + ("Si", 4): -285.17913845059644, + ("Si", 0): -288.87753485972564, + ("Si", -4): -287.40275985231415, + ("P", 0): -340.7210732625289, + ("P", 1): -340.3662836136086, + ("S", -1): -397.631810717651, + ("S", 0): -397.54760940641853, + ("S", 1): -397.15909131565013, + ("Cl", -2): -458.6471183178738, + ("Cl", -1): -459.6933866998589, + ("Cl", 0): -459.60268687745884, + ("Cl", 2): -458.1932998145885, + ("K", 1): None, # not available with this basis set + ("Ca", 2): -676.2265307613668, + ("Br", -1): -2572.5834492880094, + ("Br", 0): -2572.492623348252, + ("I", -1): None, # not available with this basis set + ("I", 0): None, # not available with this basis set +} +# ccsd/cc-pVTZ +CCSD_VTZ = { + ("H", -1): -0.49631226097996367, + ("H", 0): -0.49980981130184293, + ("H", 1): 0.0, + ("Li", 1): -7.249353374937752, + ("B", -3): -23.78682468678494, + ("B", -1): -24.56193370904525, + ("B", 0): -24.60388179904298, + ("B", 3): -21.991368552278544, + ("C", -1): -37.74093800618891, + ("C", 0): -37.73042268826894, + ("C", 1): -37.377165803324715, + ("N", -1): -54.40441588438247, + ("N", 0): -54.4152043962678, + ("N", 1): -53.91038920924042, + ("O", -1): -74.98771409352835, + ("O", 0): -74.89293727915536, + ("O", 1): -74.34899994406153, + ("F", -1): -99.70481088713056, + ("F", 0): -99.62851668514091, + ("Na", 1): -161.68598877560345, + ("Mg", 2): -198.84332758531946, + ("Si", 4): -285.228514965889, + 
("Si", 0): -288.9476846603088, + ("Si", -4): -287.6138873496766, + ("P", 0): -340.78870701737065, + ("P", 1): -340.42522678302885, + ("S", -1): -397.73415929387704, + ("S", 0): -397.62619555322124, + ("S", 1): -397.225460043223, + ("Cl", -2): -459.06087948746443, + ("Cl", -1): -459.80856103622415, + ("Cl", 0): -459.69693046874454, + ("Cl", 2): -458.26687876975234, + ("K", 1): None, # not available with this basis set + ("Ca", 2): -676.3160445414744, + ("Br", -1): -2572.8073946290465, + ("Br", 0): -2572.694327605488, + ("I", -1): None, # not available with this basis set + ("I", 0): None, # not available with this basis set +} +# hf/cc-pVDZ +ANI1X_1 = { + ("H", -1): -0.4488383380351602, + ("H", 0): -0.4992784034195828, + ("H", 1): 0.0, + ("Li", 1): -7.236120435571012, + ("B", -3): -23.517631518350836, + ("B", -1): -24.43849458753095, + ("B", 0): -24.52995828509406, + ("B", 3): -21.98542712791857, + ("C", -1): -37.57949842909864, + ("C", 0): -37.59598618627132, + ("C", 1): -37.28952528470851, + ("N", -1): -54.170756777551894, + ("N", 0): -54.251655645342815, + ("N", 1): -53.75577765594358, + ("O", -1): -74.72122641123744, + ("O", 0): -74.66528700138886, + ("O", 1): -74.16935785917661, + ("F", -1): -99.3660232395006, + ("F", 0): -99.37525020985224, + ("Na", 1): -161.67106997000676, + ("Mg", 2): -198.82420265081305, + ("Si", 4): -285.17413886038224, + ("Si", 0): -288.7869064370983, + ("Si", -4): -287.3055013422455, + ("P", 0): -340.6188035921855, + ("P", 1): -340.26328028589194, + ("S", -1): -397.506997287547, + ("S", 0): -397.4131194811572, + ("S", 1): -397.04821663752654, + ("Cl", -2): -458.49341773983207, + ("Cl", -1): -459.54222556583767, + ("Cl", 0): -459.4711432886898, + ("Cl", 2): -458.07541032143655, + ("K", 1): None, # not available with this basis set + ("Ca", 2): -676.1457625057777, + ("Br", -1): -2571.766685524917, + ("Br", 0): -2571.6943737649776, + ("I", -1): None, # not available with this basis set + ("I", 0): None, # not available with this basis set 
+} +# hf/cc-pVTZ +ANI1X_3 = { + ("H", -1): -0.4668418892599132, + ("H", 0): -0.49980981130184304, + ("H", 1): 0.0, + ("Li", 1): -7.236381928884647, + ("B", -3): -23.654030528094694, + ("B", -1): -24.45440782122731, + ("B", 0): -24.532065412570418, + ("B", 3): -21.985654326745827, + ("C", -1): -37.6036322232934, + ("C", 0): -37.602187116127666, + ("C", 1): -37.294742506720475, + ("N", -1): -54.20897619252452, + ("N", 0): -54.263903101255586, + ("N", 1): -53.765473796977965, + ("O", -1): -74.76618798136187, + ("O", 0): -74.6842428689006, + ("O", 1): -74.18751432538998, + ("F", -1): -99.42428986904464, + ("F", 0): -99.40551931536073, + ("Na", 1): -161.67601880318512, + ("Mg", 2): -198.82947207595663, + ("Si", 4): -285.1793556127226, + ("Si", 0): -288.7945961163259, + ("Si", -4): -287.41256067563575, + ("P", 0): -340.6294583289231, + ("P", 1): -340.2717794204319, + ("S", -1): -397.5319459632172, + ("S", 0): -397.4249161291449, + ("S", 1): -397.06067984991046, + ("Cl", -2): -458.80494925757927, + ("Cl", -1): -459.5646668064105, + ("Cl", 0): -459.4854291853036, + ("Cl", 2): -458.09232019709674, + ("K", 1): None, # not available with this basis set + ("Ca", 2): -676.1540716436532, + ("Br", -1): -2572.528468875192, + ("Br", 0): -2572.445069318686, + ("I", -1): None, # not available with this basis set + ("I", 0): None, # not available with this basis set +} + +# mp2/cc-pVDZ +DES1 = { + ("H", -1): -0.46472136044848017, + ("H", 0): -0.4992784034195828, + ("H", 1): 0.0, + ("Li", 1): -7.236236031279599, + ("B", -3): -23.59075634654498, + ("B", -1): -24.496049160245956, + ("B", 0): -24.56749154944109, + ("B", 3): -21.985897030619704, + ("C", -1): -37.65666509987848, + ("C", 0): -37.66302875884139, + ("C", 1): -37.3321238689667, + ("N", -1): -54.28620525567718, + ("N", 0): -54.334987200983385, + ("N", 1): -53.827357208281775, + ("O", -1): -74.86327217217499, + ("O", 0): -74.78617322485147, + ("O", 1): -74.25332362507456, + ("F", -1): -99.55668287878551, + ("F", 0): 
-99.51775797009576, + ("Na", 1): -161.67192521516694, + ("Mg", 2): -198.82669914019823, + ("Si", 4): -285.1791105165065, + ("Si", 0): -288.8472784365606, + ("Si", -4): -287.3919999801635, + ("P", 0): -340.6925553040255, + ("P", 1): -340.33066918694686, + ("S", -1): -397.61602048346754, + ("S", 0): -397.5157894668129, + ("S", 1): -397.126843359414, + ("Cl", -2): -458.63292301888237, + ("Cl", -1): -459.68240407270594, + ("Cl", 0): -459.5865928328137, + ("Cl", 2): -458.1568260632668, + ("K", 1): None, # not available with this basis set + ("Ca", 2): -676.2188060975801, + ("Br", -1): -2571.903217203978, + ("Br", 0): -2571.8074873037867, + ("I", -1): None, # not available with this basis set + ("I", 0): None, # not available with this basis set +} + +# mp2/cc-pVQZ +DES2 = { + ("H", -1): -0.49885469416811784, + ("H", 0): -0.4999455685829884, + ("H", 1): 0.0, + ("Li", 1): -7.250250946178424, + ("B", -3): -23.881056379140478, + ("B", -1): -24.562769033198762, + ("B", 0): -24.601332055304802, + ("B", 3): -22.00384581220691, + ("C", -1): -37.78757616460555, + ("C", 0): -37.72055375923268, + ("C", 1): -37.374641050923756, + ("N", -1): -54.42675509155296, + ("N", 0): -54.41599555658964, + ("N", 1): -53.89571949369111, + ("O", -1): -75.03532831936059, + ("O", 0): -74.89960636766679, + ("O", 1): -74.42732171580235, + ("F", -1): -99.77773243315134, + ("F", 0): -99.66592682518191, + ("Na", 1): -161.68639387893282, + ("Mg", 2): -198.85342876070732, + ("Si", 4): -285.21266596906895, + ("Si", 0): -288.9153023940409, + ("Si", -4): -287.84995588475664, + ("P", 0): -340.78254912688595, + ("P", 1): -340.41137033923945, + ("S", -1): -397.764457176497, + ("S", 0): -397.63328479696963, + ("S", 1): -397.2291889048987, + ("Cl", -2): -459.276002809114, + ("Cl", -1): -459.85575358503627, + ("Cl", 0): -459.725756402736, + ("Cl", 2): -458.27234841921444, + ("K", 1): None, # not available with this basis set + ("Ca", 2): -676.353471955094, + ("Br", -1): -2572.9216392833405, + ("Br", 0): 
-2572.79376070567, + ("I", -1): None, # not available with this basis set + ("I", 0): None, # not available with this basis set +} +# pbe/def2-tzvp +ISO17 = { + ("H", -1): -0.4984251407077052, + ("H", 0): -0.4996387468896132, + ("H", 1): 0.0, + ("Li", 1): -7.256644236856955, + ("B", -3): -23.935382459402287, + ("B", -1): -24.585965866081416, + ("B", 0): -24.610084509908482, + ("B", 3): -21.98118646897415, + ("C", -1): -37.77594560897306, + ("C", 0): -37.732895049756756, + ("C", 1): -37.38238697233679, + ("N", -1): -54.441487575279545, + ("N", 0): -54.43218609912527, + ("N", 1): -53.89863329199101, + ("O", -1): -75.04792601076215, + ("O", 0): -74.9084975444151, + ("O", 1): -74.35740906502845, + ("F", -1): -99.77558183886431, + ("F", 0): -99.66914009406862, + ("Na", 1): -161.9641373718238, + ("Mg", 2): -199.1000109617099, + ("Si", 4): -285.4180171255296, + ("Si", 0): -289.2015108290971, + ("Si", -4): -288.02271678330254, + ("P", 0): -341.06484223053843, + ("P", 1): -340.68322234698707, + ("S", -1): -398.00391422392744, + ("S", 0): -397.9053091661701, + ("S", 1): -397.5008759502245, + ("Cl", -2): -459.38640826217886, + ("Cl", -1): -460.0784728780043, + ("Cl", 0): -459.95841441797796, + ("Cl", 2): -458.566186731762, + ("K", 1): -599.5277926006352, + ("Ca", 2): -676.6655247948639, + ("Br", -1): -2573.8415230488945, + ("Br", 0): -2573.720729522105, + ("I", -1): -297.7815346863186, + ("I", 0): -297.66553802494457, +} + + +# hf/cc-pVQZ +ANI1X_2 = { + ("H", -1): -0.47386028485392406, + ("H", 0): -0.49994556858298844, + ("H", 1): 0.0, + ("Li", 1): -7.236386237851972, + ("B", -3): -23.74309031828107, + ("B", -1): -24.46286773184739, + ("B", 0): -24.5329645824744, + ("B", 3): -21.986158801102064, + ("C", -1): -37.66896328779905, + ("C", 0): -37.604262031495196, + ("C", 1): -37.29646463702154, + ("N", -1): -54.22426108804101, + ("N", 0): -54.26750374803837, + ("N", 1): -53.76849831230501, + ("O", -1): -74.78286297582162, + ("O", 0): -74.68967002333635, + ("O", 1): 
-74.19286214550267, + ("F", -1): -99.44462949539432, + ("F", 0): -99.41376829607128, + ("Na", 1): -161.67672032176134, + ("Mg", 2): -198.83037897754207, + ("Si", 4): -285.1803724364078, + ("Si", 0): -288.79743501319945, + ("Si", -4): -287.65204471889274, + ("P", 0): -340.63262408709096, + ("P", 1): -340.27442412596326, + ("S", -1): -397.54055244875906, + ("S", 0): -397.42820343953593, + ("S", 1): -397.06412575498064, + ("Cl", -2): -458.978571599394, + ("Cl", -1): -459.57282279413744, + ("Cl", 0): -459.4890928627921, + ("Cl", 2): -458.0963453990511, + ("K", 1): None, # not available with this basis set + ("Ca", 2): -676.1542980250254, + ("Br", -1): -2572.5345236382864, + ("Br", 0): -2572.448003418184, + ("I", -1): None, # not available with this basis set + ("I", 0): None, # not available with this basis set +} + + +# mp2/cc-pVTZ +DES3 = { + ("H", -1): -0.4891625462679369, + ("H", 0): -0.49980981130184304, + ("H", 1): 0.0, + ("Li", 1): -7.24726155786237, + ("B", -3): -23.763643794842856, + ("B", -1): -24.53409654753541, + ("B", 0): -24.583383154203396, + ("B", 3): -21.991094434286477, + ("C", -1): -37.71496709817741, + ("C", 0): -37.69583488009523, + ("C", 1): -37.35364857976649, + ("N", -1): -54.37687246581612, + ("N", 0): -54.38498928095387, + ("N", 1): -53.86758718077272, + ("O", -1): -74.97696880669871, + ("O", 0): -74.85981462857248, + ("O", 1): -74.3128417784704, + ("F", -1): -99.70562180844765, + ("F", 0): -99.61731492045887, + ("Na", 1): -161.68534038705675, + ("Mg", 2): -198.84302024453982, + ("Si", 4): -285.22727858476895, + ("Si", 0): -288.9183509250862, + ("Si", -4): -287.5995448051336, + ("P", 0): -340.75961526664724, + ("P", 1): -340.3904498977919, + ("S", -1): -397.7141036332652, + ("S", 0): -397.5920220310466, + ("S", 1): -397.19206598949114, + ("Cl", -2): -459.0459580553311, + ("Cl", -1): -459.79402765207186, + ("Cl", 0): -459.67567575694216, + ("Cl", 2): -458.22960655909685, + ("K", 1): None, # not available with this basis set + ("Ca", 2): 
-676.3023664599882, + ("Br", -1): -2572.801814668155, + ("Br", 0): -2572.6834739695705, + ("I", -1): None, # not available with this basis set + ("I", 0): None, # not available with this basis set +} + +# pbe0/def2-tzvp +QM7X_DFT = { + ("H", -1): -0.5000012696776297, + ("H", 0): -0.5010619187567116, + ("H", 1): 0.0, + ("Li", 1): -7.262402336780465, + ("B", -3): -23.93538245940231, + ("B", -1): -24.58596586608141, + ("B", 0): -24.618279526937158, + ("B", 3): -21.993880405036222, + ("C", -1): -37.775945608973075, + ("C", 0): -37.73289504975675, + ("C", 1): -37.38238697233677, + ("N", -1): -54.4414875752795, + ("N", 0): -54.43218609912527, + ("N", 1): -53.898633291991025, + ("O", -1): -75.04858314388663, + ("O", 0): -74.9084975444151, + ("O", 1): -74.35740906502848, + ("F", -1): -99.77378866090523, + ("F", 0): -99.67618937527747, + ("Na", 1): -161.98136849490916, + ("Mg", 2): -199.1241396537923, + ("Si", 4): -285.4539026316095, + ("Si", 0): -289.20151082909706, + ("Si", -4): -288.04650100943854, + ("P", 0): -341.06484223053843, + ("P", 1): -340.6832223469869, + ("S", -1): -398.03842612700186, + ("S", 0): -397.90530916617007, + ("S", 1): -397.5008759502245, + ("Cl", -2): -459.4152688089829, + ("Cl", -1): -460.11739716845636, + ("Cl", 0): -459.9974100829532, + ("Cl", 2): -458.6052342125039, + ("K", 1): -599.5783201878277, + ("Ca", 2): -676.7194481655977, + ("Br", -1): -2573.9328383617813, + ("Br", 0): -2573.8118913577364, + ("I", -1): -297.8097622358941, + ("I", 0): -297.6931741613416, +} + +# LEVEL OF THEORY: WB97M-V/def2-tzvp +COMP6_9 = { + ("H", -1): -0.5043034149209957, + ("H", 0): -0.4942304316867456, + ("H", 1): 0.0, + ("Li", 1): -7.275845986964876, + ("B", -3): -23.944386486890433, + ("B", -1): -24.620648350767315, + ("B", 0): -24.649626180737634, + ("B", 3): -22.041679002146115, + ("C", -1): -37.81902657653025, + ("C", 0): -37.78784557278033, + ("C", 1): -37.43099787866309, + ("N", -1): -54.50330209852381, + ("N", 0): -54.48942541262065, + ("N", 1): 
-53.97039551980893, + ("O", -1): -75.10937339867125, + ("O", 0): -74.98274472768641, + ("O", 1): -74.42816465620183, + ("F", -1): -99.8448159370651, + ("F", 0): -99.74528654206127, + ("Na", 1): -162.06872009995914, + ("Mg", 2): -199.22338375053474, + ("Si", 4): -285.5821192636676, + ("Si", 0): -289.31658008917617, + ("Si", -4): -288.11126408870666, + ("P", 0): -341.2109132073535, + ("P", 1): -340.8136624526414, + ("S", -1): -398.1550625555495, + ("S", 0): -398.0362575878335, + ("S", 1): -397.63036775088466, + ("Cl", -2): -459.52873734619544, + ("Cl", -1): -460.24520403058557, + ("Cl", 0): -460.12503955811985, + ("Cl", 2): -458.6770781144964, + ("K", 1): -599.7242257909018, + ("Ca", 2): -676.8737360488551, + ("Br", -1): -2574.0859799330883, + ("Br", 0): -2573.967555604986, + ("I", -1): -297.7777930229968, + ("I", 0): -297.66455265533017, +} + +# hf/def2-tzvp +HF_DEF2 = { + ("H", -1): -0.4668133747908114, + ("H", 0): -0.4998098322318885, + ("H", 1): 0.0, + ("Li", 1): -7.236374246714073, + ("B", -3): -23.74140302512685, + ("B", -1): -24.462195925378662, + ("B", 0): -24.53233202503875, + ("B", 3): -21.985926089783565, + ("C", -1): -37.613473799868544, + ("C", 0): -37.603219252494, + ("C", 1): -37.295541183753926, + ("N", -1): -54.223174834464814, + ("N", 0): -54.266099796938654, + ("N", 1): -53.76717547003795, + ("O", -1): -74.78142147694243, + ("O", 0): -74.68804805190297, + ("O", 1): -74.19115875887655, + ("F", -1): -99.44317910914634, + ("F", 0): -99.41179977280933, + ("Na", 1): -161.67025708598274, + ("Mg", 2): -198.82300763311338, + ("Si", 4): -285.17360760657004, + ("Si", 0): -288.7894100524365, + ("Si", -4): -287.5042786445288, + ("P", 0): -340.6233882863439, + ("P", 1): -340.26541318034015, + ("S", -1): -397.5252097143351, + ("S", 0): -397.4176274212401, + ("S", 1): -397.0534456500219, + ("Cl", -2): -458.7948759929542, + ("Cl", -1): -459.55564984013716, + ("Cl", 0): -459.47680800709793, + ("Cl", 2): -458.0838125597828, + ("K", 1): -599.0060338509219, + ("Ca", 
# Force field ttm2.1-f, calculated with the ttm3-f f90 routine.
# Link: https://www.pnnl.gov/science/ttm3f.asp
# The isolated-atom energy is always 0 for this force field, so there is
# nothing to compute; the entries are listed only for clarity.
TTM2 = {(symbol, 0): 0.0 for symbol in ("H", "O")}
def set_cache_dir(d):
    r"""
    Override the module-level ``_OPENQDC_CACHE_DIR`` setting.

    Args:
        d (str): path to a local folder. A leading ``~`` is expanded to the
            user's home directory. Passing ``None`` leaves the current
            setting unchanged.
    """
    global _OPENQDC_CACHE_DIR
    if d is not None:
        _OPENQDC_CACHE_DIR = os.path.expanduser(d)
def dict_to_atoms(d: dict, ext: bool = False) -> Atoms:
    """
    Converts dictionary to ase atoms object

    The input dictionary is not modified: the required keys are taken from a
    shallow copy, so the same dictionary can be passed again afterwards.

    Args:
        d (dict): dictionary containing keys: positions, atomic_numbers, charges
        ext (bool, optional): Whether to include all the rest of the dictionary in the atoms object info field.
            Defaults to False.

    Returns:
        Atoms: object built from the positions, atomic numbers and charges. When ``ext`` is True its
            ``info`` field holds every remaining entry of ``d``.

    Raises:
        KeyError: if ``positions``, ``atomic_numbers`` or ``charges`` is missing from ``d``.
    """
    # Shallow copy: popping directly from ``d`` would strip the keys from the
    # caller's dictionary as a hidden side effect.
    remaining = dict(d)
    pos = remaining.pop("positions")
    atomic_numbers = remaining.pop("atomic_numbers")
    charges = remaining.pop("charges")
    at = Atoms(positions=pos, numbers=atomic_numbers, charges=charges)
    if ext:
        at.info = remaining
    return at
def has_package(package_name: str) -> bool:
    """
    Helper function to generically check if a Python package is installed.
    Intended to be used to check for optional dependencies.

    The check is performed by actually importing the package. Any
    ``ImportError`` raised by that import (including ``ModuleNotFoundError``)
    is reported as "not available", which matches what ``requires_package``
    treats as a missing dependency.

    Parameters
    ----------
    package_name : str
        The name of the Python package to check the availability of

    Returns
    -------
    package_available : bool
        Boolean indicator if the package is available or not

    Examples
    --------
    >>> has_numpy = has_package('numpy')
    >>> has_numpy
    True
    >>> has_foo = has_package('other_non_installed_package')
    >>> has_foo
    False
    """
    try:
        importlib.import_module(package_name)
    except ImportError:
        # ModuleNotFoundError is a subclass of ImportError, so this covers both
        # "not installed" and "installed but cannot be imported".
        return False
    return True
def set_dir(d):
    r"""
    Optionally set the directory stored in the module-level ``_hub_dir``.

    ``get_dir`` returns this value when it is not ``None``. Previously the
    assignment was commented out, so calling this function had no effect.

    NOTE(review): no module-level initial value for ``_hub_dir`` is visible in
    this file, so ``get_dir`` would raise ``NameError`` until this function has
    stored a path -- consider adding ``_hub_dir = None`` at module level.

    Args:
        d (str): path to a local folder. A leading ``~`` is expanded to the
            user's home directory. Passing ``None`` leaves the current value
            unchanged.
    """
    # Local import: ``os`` is not among this module's top-level imports.
    import os

    global _hub_dir
    if d is None:
        return
    _hub_dir = os.path.expanduser(d)