Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

make input feature invariant to resonance #25

Open
wants to merge 15 commits into
base: main
Choose a base branch
from
37 changes: 37 additions & 0 deletions espaloma_charge/tests/test_resonance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import pytest
# import torch

def test_cooh():
from rdkit import Chem
from espaloma_charge.utils import from_rdkit_mol
import torch
mol = Chem.MolFromSmiles("[O-]C=O")
mol = Chem.AddHs(mol)
g = from_rdkit_mol(mol)
assert torch.unique(g.ndata["h0"][g.ndata["type"] == 8], dim=0).shape[0] == 1

def test_benzamidine():
from rdkit import Chem
from espaloma_charge.utils import from_rdkit_mol
import torch
mol = Chem.MolFromSmiles("C1=CC=C(C=C1)C(=[NH2+])N")
mol = Chem.AddHs(mol)
g = from_rdkit_mol(mol)
assert torch.unique(g.ndata["h0"][g.ndata["type"] == 7], dim=0).shape[0] == 1

mol = Chem.MolFromSmiles("C1=CC=C(C=C1)C(=[N])N")
mol = Chem.AddHs(mol)
g = from_rdkit_mol(mol)
assert torch.unique(g.ndata["h0"][g.ndata["type"] == 7], dim=0).shape[0] == 2

def test_guanidinium():
import torch
from rdkit import Chem
from espaloma_charge.utils import from_rdkit_mol
mol = Chem.MolFromSmiles("C(=[NH2+])(N)N")
mol = Chem.AddHs(mol)
g = from_rdkit_mol(mol)
assert torch.unique(g.ndata["h0"][g.ndata["type"] == 7], dim=0).shape[0] == 1



15 changes: 7 additions & 8 deletions espaloma_charge/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,18 +46,14 @@ def fp_rdkit(atom):
[0, 0, 0, 0, 0],
dtype=torch.get_default_dtype(),
),
Chem.rdchem.HybridizationType.UNSPECIFIED: torch.tensor(
[0, 0, 0, 0, 0],
dtype=torch.get_default_dtype(),
),
}
return torch.cat(
[
torch.tensor(
[
atom.GetTotalDegree(),
atom.GetTotalValence(),
atom.GetExplicitValence(),
# atom.GetTotalValence(),
# atom.GetExplicitValence(),
# atom.GetFormalCharge(),
atom.GetIsAromatic() * 1.0,
atom.GetMass(),
Expand All @@ -76,9 +72,10 @@ def fp_rdkit(atom):
)


def from_rdkit_mol(mol, use_fp=True):
def from_rdkit_mol(mol):
import dgl
from rdkit import Chem
from dgllife.utils import mol_to_bigraph

# initialize graph
g = dgl.DGLGraph()
Expand All @@ -89,9 +86,11 @@ def from_rdkit_mol(mol, use_fp=True):
g.ndata["type"] = torch.Tensor(
[[atom.GetAtomicNum()] for atom in mol.GetAtoms()]
)

g.ndata["q_ref"] = torch.Tensor(
[[atom.GetFormalCharge()] for atom in mol.GetAtoms()]
)

h_v = torch.zeros(g.ndata["type"].shape[0], 100, dtype=torch.float32)

h_v[
Expand All @@ -116,6 +115,6 @@ def from_rdkit_mol(mol, use_fp=True):
g.add_edges(bonds_begin_idxs, bonds_end_idxs)
g.add_edges(bonds_end_idxs, bonds_begin_idxs)

# g.edata["type"] = torch.Tensor(bonds_types)[:, None].repeat(2, 1)
g.edata["type"] = torch.Tensor(bonds_types)[:, None].repeat(2, 1)

return g
48 changes: 48 additions & 0 deletions scripts/characterization/eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import pandas as pd
import numpy as np

def bootstrap_mean(x, n_samples=1000, ci=0.95):
results = []
for _ in range(n_samples):
idxs = np.random.randint(x.shape[0], size=(x.shape[0], ))
results.append(x[idxs].mean())

low = np.percentile(results, 100.0 * 0.5 * (1 - ci))
high = np.percentile(results, (1 - ((1 - ci) * 0.5)) * 100.0)
original = x.mean()
return original, low, high





def run(in_path):
df = pd.read_csv(
in_path,
names=["smiles", "q_amber", "q_esp", "q_oe", "t_amber", "t_esp", "t_oe"],
)

for name in ["q_amber", "q_esp", "q_oe"]:
df[name] = df.apply(lambda x: np.array(eval(x[name])), axis=1)

df["q_rmse_esp"] = ((df["q_esp"] - df["q_oe"]) ** 2).apply(np.mean) ** 0.5
df["q_rmse_amber"] = ((df["q_amber"] - df["q_oe"]) ** 2).apply(np.mean) ** 0.5
df["n_atoms"] = df.apply(lambda x: len(x["q_oe"]), axis=1)

n_mol = "$ %s $" % len(df)
n_atoms = "$ %.2f $" % df["n_atoms"].mean()

q_rmse_esp = "$%.4f_{%.4f}^{%.4f}$" % bootstrap_mean(df["q_rmse_esp"].values.flatten())
q_rmse_amber = "$%.4f_{%.4f}^{%.4f}$" % bootstrap_mean(df["q_rmse_amber"].values.flatten())

t_esp = "$ %.2f $" % df["t_esp"].mean()
t_amber = "$ %.2f $" % df["t_amber"].mean()
t_oe = "$ %.2f $" % df["t_oe"].mean()

print(" & ".join([n_mol, n_atoms, q_rmse_esp, q_rmse_amber, t_esp, t_amber, t_oe]))

if __name__ == "__main__":
import sys
run(sys.argv[1])


6 changes: 6 additions & 0 deletions scripts/characterization/fda.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#BSUB -J "data[1-1038]"
#BSUB -o %J.stdout
#BSUB -W 0:10

python run.py --in_path fda.smi --idx $((LSB_JOBINDEX - 1)) --out fda.csv

240 changes: 240 additions & 0 deletions scripts/characterization/run.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,240 @@
import time
import numpy as np
import torch
import pandas as pd
import scipy
from openff.toolkit.topology import Molecule
from espaloma_charge.openff_wrapper import EspalomaChargeToolkitWrapper
from openff.toolkit.utils.toolkit_registry import AmberToolsToolkitWrapper, OpenEyeToolkitWrapper

from typing import Union

import numpy as np

from openff.units import unit
from openff.toolkit.topology import Molecule
from openff.recharge.grids import GridGenerator, MSKGridSettings


def calculate_esp(
grid_coordinates: unit.Quantity, # N x 3
atom_coordinates: unit.Quantity, # M x 3
charges: unit.Quantity, # M
with_units: bool = False,
) -> Union[float, unit.Quantity]:
"""
Calculate the electrostatic potential at a set of grid points

Parameters
----------
grid_coordinates
The coordinates of the points at which to calculate the ESP
atom_coordinates
The coordinates of the atoms in the molecule
charges
The charges of the atoms in the molecule.
Should be in same order as ``atom_coordinates``
with_units
Whether to return the ESP as a float in atomic units
or as a unit.Quantity

Returns
-------
esp: float or unit.Quantity
The ESP at the grid points
"""
# make sure charges are in correct unit
# charges = unify(charges)
charges = charges * unit.elementary_charge

AU_ESP = unit.atomic_unit_of_energy / unit.elementary_charge
ke = 1 / (4 * np.pi * unit.epsilon_0)

displacement = (
grid_coordinates[:, None, :]
- atom_coordinates[None, :, :] # N x M x 3
)
distance = (displacement ** 2).sum(axis=-1) ** 0.5 # N x M
inv_distance = 1 / distance

esp = ke * (inv_distance @ charges) # N

esp_q = esp.m_as(AU_ESP)
if not with_units:
return esp_q
return esp



def compare_molecule_esp(
molecule: Molecule,
charges1: unit.Quantity,
charges2: unit.Quantity,
) -> float:
"""Calculate the RMSD of the ESPs a molecule with different charges

Parameters
----------
molecule: openff.toolkit.topology.Molecule
The molecule to calculate the ESPs for
charges1: unit.Quantity
The charges to use for the first ESP
charges2: unit.Quantity
The charges to use for the second ESP

Returns
-------
rmsd: float
"""

assert len(molecule.conformers) > 0, "Molecule has no conformers"

settings = MSKGridSettings()
rmses = np.zeros((len(molecule.conformers),), dtype=float)

for i, conformer in enumerate(molecule.conformers):
grid = GridGenerator.generate(molecule, conformer, settings)
esp1 = calculate_esp(grid, conformer, charges1)
esp2 = calculate_esp(grid, conformer, charges2)
delta = esp1 - esp2
rmsd = (delta ** 2).mean() ** 0.5
rmses[i] = rmsd

return rmses.mean()

def generate_elf10_conformers(
molecule: Molecule,
n_conformer_pool: int = 500,
n_conformers: int = 10,
rms_cutoff: Union[float, unit.Quantity] = 0.05,
):
"""Generate ELF conformers for a molecule inplace

Parameters
----------
molecule: openff.toolkit.topology.Molecule
The molecule to generate conformers for
n_conformer_pool: int
The number of conformers to generate
to select ELF conformers from
n_conformers: int
The number of ELF conformers to select.
The molecule will have maximum this number
of conformers
rms_cutoff: float or unit.Quantity
The RMSD cutoff to use when selecting conformers.
If a float is provided, it will be interpreted as
an Angstrom RMSD cutoff

Returns
-------
molecule: openff.toolkit.topology.Molecule
The same molecule, with ELF conformers
"""
if isinstance(rms_cutoff, float):
rms_cutoff = rms_cutoff * unit.angstrom

molecule.generate_conformers(
n_conformers=n_conformer_pool,
rms_cutoff=rms_cutoff,
make_carboxylic_acids_cis=True,
)
molecule.apply_elf_conformer_selection(limit=n_conformers)
return molecule

def compute_zap(molecule):
from openeye import oezap, oegrid, oechem

mol = molecule.to_openeye()
oechem.OEAssignBondiVdWRadii(mol)
## This is only necessary if the molecule doesn't already have 3D
## coordinates
oechem.OEGenerate2DCoordinates(mol)
mol.SetDimension(3)

# ## Already handled automatically when converting to OE
# charges = molecule.partial_charges.magnitude
# for atom in mol.GetAtoms():
# atom.SetPartialCharge(charges[atom.GetIdx()])

zap = oezap.OEZap()
zap.SetInnerDielectric(1.0)
zap.SetGridSpacing(0.5)
zap.SetMolecule(mol)

grid = oegrid.OEScalarGrid()
zap.CalcPotentialGrid(grid)
atom_potentials = oechem.OEFloatArray(mol.GetMaxAtomIdx())
zap.CalcAtomPotentials(atom_potentials)
return np.array(atom_potentials)

def run(args):
smiles = open(args.in_path, "r").readlines()[args.idx].strip()
molecule = Molecule.from_smiles(smiles)

toolkit_registry = EspalomaChargeToolkitWrapper()
molecule.assign_partial_charges('espaloma-am1bcc', toolkit_registry=toolkit_registry)
time0 = time.time()
molecule.assign_partial_charges('espaloma-am1bcc', toolkit_registry=toolkit_registry)
time1 = time.time()
q_esp = molecule.partial_charges.magnitude.tolist()
t_esp = time1 - time0
zap_esp = compute_zap(molecule)

toolkit_registry = AmberToolsToolkitWrapper()
molecule.assign_partial_charges('am1bcc', toolkit_registry=toolkit_registry)
time0 = time.time()
molecule.assign_partial_charges('am1bcc', toolkit_registry=toolkit_registry)
time1 = time.time()
q_amber = molecule.partial_charges.magnitude.tolist()
t_amber = time1 - time0
zap_amber = compute_zap(molecule)

toolkit_registry = OpenEyeToolkitWrapper()
molecule.assign_partial_charges('am1bcc', toolkit_registry=toolkit_registry)
time0 = time.time()
molecule.assign_partial_charges('am1bcc', toolkit_registry=toolkit_registry)
time1 = time.time()
q_oe = molecule.partial_charges.magnitude.tolist()
t_oe = time1 - time0
zap_oe = compute_zap(molecule)

generate_elf10_conformers(molecule)

rmse_esp_amber = compare_molecule_esp(molecule, q_oe, q_amber)
rmse_esp_espaloma = compare_molecule_esp(molecule, q_oe, q_esp)

df = pd.DataFrame.from_dict(
{
smiles:
[
q_amber, q_esp, q_oe,
t_amber, t_esp, t_oe,
zap_amber, zap_esp, zap_oe,
rmse_esp_amber, rmse_esp_espaloma,
]
},
orient="index")

print(df)
return df

if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--idx", type=int)
parser.add_argument("--in_path", type=str)
parser.add_argument("--out", type=str)
args = parser.parse_args()
df = run(args)
df.to_csv(args.out, mode="a", header=None)










Loading
Loading