Merge pull request #110 from wiederm/common_core_search

new find_mcs function for exclusion of hydrogens
cbc-univie · Apr 14, 2023 · 8eb4ceb · 8eb4ceb
2 parents f6e40f9 + 295bf2d
commit 8eb4ceb
Show file tree

Hide file tree

Showing 3 changed files with 327 additions and 73 deletions.
diff --git a/transformato/mutate.py b/transformato/mutate.py
@@ -590,7 +590,7 @@ def _check_for_lp(
         ]
         lp_dict_dummy_region = defaultdict(list)
         lp_dict_common_core = defaultdict(list)
-        print(f"die version {pm.__version__}")
+
         for atom in psf.view[f":{tlc}"].atoms:
             if atom.name.find("LP") == False:
                 print(f"die Atome {atom}")
@@ -959,6 +959,8 @@ def _find_mcs(
         self,
         mol1_name: str,
         mol2_name: str,
+        iterate_over_matches: bool = False,
+        max_matches: int = 10,
     ):
         """
         A class that proposes the mutation route between two molecules with a
@@ -980,15 +982,33 @@ def _find_mcs(
 
         m1, m2 = [deepcopy(self.mols[mol1_name]), deepcopy(self.mols[mol2_name])]
 
+        # second copy of mols - to use as representation with removed hydrogens
+        remmol1 = deepcopy(m1)
+        remmol2 = deepcopy(m2)
+
+        # removal of hydrogens - if not removed, common core for molecule + hydrogens is computed!
+        remmol1 = Chem.rdmolops.RemoveAllHs(remmol1)
+        remmol2 = Chem.rdmolops.RemoveAllHs(remmol2)
+
+        # remmols contains both molecules with removed hydrogens
+        remmols = [remmol1, remmol2]
+
         for m in [m1, m2]:
             logger.debug("Mol in SMILES format: {}.".format(Chem.MolToSmiles(m, True)))
 
         # make copy of mols
         changed_mols = [Chem.Mol(x) for x in [m1, m2]]
 
         # find substructure match (ignore bond order but enforce element matching)
+
+        # findmcs-function is called for mol-objects with removed hydrogens
+
+        # original Transformato-parameters (yield bad / for Transformato not usable results for molecules with cyclic structures, e.g., ccores between 2-CPI and 7-CPI)
+        # especially because completeRingsOnly is set to False
+        """
         mcs = rdFMCS.FindMCS(
-            changed_mols,
+            #changed_mols,
+            remmols,
             bondCompare=self.bondCompare,
             timeout=120,
             atomCompare=self.atomCompare,
@@ -997,27 +1017,180 @@ def _find_mcs(
             completeRingsOnly=self.completeRingsOnly,
             ringMatchesRingOnly=self.ringMatchesRingOnly,
         )
+        """
+
+        # find_mcs-function from tf_routes:
+        # yields more reasonable common cores (e.g. for 2-CPI/7-CPI )
+        # in particular, completeRingsOnly=True is important
+
+        mcs = rdFMCS.FindMCS(
+            remmols,
+            timeout=120,
+            ringMatchesRingOnly=True,
+            completeRingsOnly=True,
+            ringCompare=Chem.rdFMCS.RingCompare.StrictRingFusion,
+            bondCompare=rdFMCS.BondCompare.CompareAny,
+            matchValences=False,
+        )
+
         logger.debug("Substructure match: {}".format(mcs.smartsString))
         # convert from SMARTS
         mcsp = Chem.MolFromSmarts(mcs.smartsString, False)
 
-        s1 = m1.GetSubstructMatch(mcsp)
-        logger.debug("Substructere match idx: {}".format(s1))
+        # iterate_over_matches == False: the common core atoms for a single stubstructure match are determined
+        # possibly a different match yields a bigger ccore - i.e. a ccore with more hydrogens (neopentane - methane)
+        if iterate_over_matches == False:
+            s1 = m1.GetSubstructMatch(mcsp)
+            logger.debug("Substructere match idx: {}".format(s1))
+            self._show_common_core(
+                m1, self.get_common_core_idx_mol1(), show_atom_type=False, internal=True
+            )
+            s2 = m2.GetSubstructMatch(mcsp)
+            logger.debug("Substructere match idx: {}".format(s2))
+            self._show_common_core(
+                m2, self.get_common_core_idx_mol2(), show_atom_type=False, internal=True
+            )
 
-        self._show_common_core(
-            m1, self.get_common_core_idx_mol1(), show_atom_type=False, internal=True
-        )
+            # new code: add hydrogens to both common-core-on-molecule-projections
+            # set with all common core atom indices for both molecules
+            hit_ats1_compl = list(s1)
+            hit_ats2_compl = list(s2)
+
+            # check for each common core atom whether hydrogen atoms are in its neighbourhood
+            # s1/s2 contain the mapping of the common core (without hydrogens) to both molecules
+            # iterating over all mapped atoms, the number of hydrogens attached to the common core atom is determined
+            # the minimum number (i.e. if the atom of molecule 1 has one hydrogen bond, the atom of molecule 2 zero hydrogen bonds, it is zero) gives the number of hydrogen atoms to add to the common core
+
+            for indexpos, indexnr in enumerate(s1):
+                # get mapped atoms
+                atom1 = m1.GetAtomWithIdx(s1[indexpos])
+                atom2 = m2.GetAtomWithIdx(s2[indexpos])
+
+                # determine number of hydrogens in the neighbourhood of the atom from molecule1
+                h_atoms1 = 0
+                for x in atom1.GetNeighbors():
+                    if x.GetSymbol() == "H":
+                        h_atoms1 = h_atoms1 + 1
+
+                # determine number of hydrogens in the neighbourhood of the atom from molecule2
+                h_atoms2 = 0
+                for x in atom2.GetNeighbors():
+                    if x.GetSymbol() == "H":
+                        h_atoms2 = h_atoms2 + 1
+
+                # find minimum number of hydrogens
+                min_h_atoms = min(h_atoms1, h_atoms2)
+
+                # add minimum number of hydrogens to the ccore for molecule1
+                h_atoms1 = 0
+                for x in atom1.GetNeighbors():
+                    if x.GetSymbol() == "H" and h_atoms1 < min_h_atoms:
+                        hit_ats1_compl.append(x.GetIdx())
+                        h_atoms1 = h_atoms1 + 1
+
+                # add minimum number of hydrogens to the ccore for molecule2
+                h_atoms2 = 0
+                for x in atom2.GetNeighbors():
+                    if x.GetSymbol() == "H" and h_atoms2 < min_h_atoms:
+                        hit_ats2_compl.append(x.GetIdx())
+                        h_atoms2 = h_atoms2 + 1
+
+            # create new tuple of common core atom indices with additional hydrogens (molecule 1)
+            hit_ats1 = tuple(hit_ats1_compl)
+
+            # create new tuple of common core atom indices with additional hydrogens (molecule 2)
+            hit_ats2 = tuple(hit_ats2_compl)
+
+            self._substructure_match[mol1_name] = list(hit_ats1)
+            self._substructure_match[mol2_name] = list(hit_ats2)
+
+            # self._substructure_match[mol1_name] = list(s1)
+            # self._substructure_match[mol2_name] = list(s2)
 
-        s2 = m2.GetSubstructMatch(mcsp)
-        logger.debug("Substructere match idx: {}".format(s2))
+            return mcs
 
-        self._show_common_core(
-            m2, self.get_common_core_idx_mol2(), show_atom_type=False, internal=True
-        )
-        self._substructure_match[mol1_name] = list(s1)
-        self._substructure_match[mol2_name] = list(s2)
+        # iterate_over_matches == True: it is iterated over all pairs of substructure matches
+        # the substructure matches with the biggest emering common cores are finally chosen
+        # the common cores for different substructure match pairs contain the same heavy atoms, but differ in the number of hydrogens, i.e. the finally chosen matches have the common cores with most hydrogens
+        else:
+            s1s = m1.GetSubstructMatches(mcsp, maxMatches=max_matches)
+            logger.debug("Substructere match idx: {}".format(s1s))
+            self._show_common_core(
+                m1, self.get_common_core_idx_mol1(), show_atom_type=False, internal=True
+            )
+            s2s = m2.GetSubstructMatches(mcsp, maxMatches=max_matches)
+            logger.debug("Substructere match idx: {}".format(s2s))
+            self._show_common_core(
+                m2, self.get_common_core_idx_mol2(), show_atom_type=False, internal=True
+            )
+
+            curr_size_of_ccores = 0
+            for s1 in s1s:
+                for s2 in s2s:
+                    # new code: add hydrogens to both common-core-on-molecule-projections
+                    # set with all common core atom indices for both molecules
+                    hit_ats1_compl = list(s1)
+                    hit_ats2_compl = list(s2)
+
+                    # check for each common core atom whether hydrogen atoms are in its neighbourhood
+                    # s1/s2 contain the mapping of the common core (without hydrogens) to both molecules
+                    # iterating over all mapped atoms, the number of hydrogens attached to the common core atom is determined
+                    # the minimum number (i.e. if the atom of molecule 1 has one hydrogen bond, the atom of molecule 2 zero hydrogen bonds, it is zero) gives the number of hydrogen atoms to add to the common core
+
+                    for indexpos, indexnr in enumerate(s1):
+                        # get mapped atoms
+                        atom1 = m1.GetAtomWithIdx(s1[indexpos])
+                        atom2 = m2.GetAtomWithIdx(s2[indexpos])
+
+                        # determine number of hydrogens in the neighbourhood of the atom from molecule1
+                        h_atoms1 = 0
+                        for x in atom1.GetNeighbors():
+                            if x.GetSymbol() == "H":
+                                h_atoms1 = h_atoms1 + 1
+
+                        # determine number of hydrogens in the neighbourhood of the atom from molecule2
+                        h_atoms2 = 0
+                        for x in atom2.GetNeighbors():
+                            if x.GetSymbol() == "H":
+                                h_atoms2 = h_atoms2 + 1
+
+                        # find minimum number of hydrogens
+                        min_h_atoms = min(h_atoms1, h_atoms2)
+
+                        # add minimum number of hydrogens to the ccore for molecule1
+                        h_atoms1 = 0
+                        for x in atom1.GetNeighbors():
+                            if x.GetSymbol() == "H" and h_atoms1 < min_h_atoms:
+                                hit_ats1_compl.append(x.GetIdx())
+                                h_atoms1 = h_atoms1 + 1
+
+                        # add minimum number of hydrogens to the ccore for molecule2
+                        h_atoms2 = 0
+                        for x in atom2.GetNeighbors():
+                            if x.GetSymbol() == "H" and h_atoms2 < min_h_atoms:
+                                hit_ats2_compl.append(x.GetIdx())
+                                h_atoms2 = h_atoms2 + 1
+
+                    # count whether the new common cores are bigger (i.e. contain more hydrogens) than the previous common cores
+                    # if this is the case, the current substructure matches are chosen
+                    if len(hit_ats1_compl) > curr_size_of_ccores:
+                        curr_size_of_ccores = len(hit_ats1_compl)
+                        hit_ats1_compl_final = hit_ats1_compl
+                        hit_ats2_compl_final = hit_ats2_compl
+
+            # create new tuple of common core atom indices with additional hydrogens (molecule 1)
+            hit_ats1 = tuple(hit_ats1_compl_final)
+
+            # create new tuple of common core atom indices with additional hydrogens (molecule 2)
+            hit_ats2 = tuple(hit_ats2_compl_final)
+
+            self._substructure_match[mol1_name] = list(hit_ats1)
+            self._substructure_match[mol2_name] = list(hit_ats2)
+
+            # self._substructure_match[mol1_name] = list(s1)
+            # self._substructure_match[mol2_name] = list(s2)
 
-        return mcs
+            return mcs
 
     def _return_atom_idx_from_bond_idx(self, mol: Chem.Mol, bond_idx: int):
         return (

diff --git a/transformato/tests/test_alchemical_path_generation.py b/transformato/tests/test_alchemical_path_generation.py
@@ -230,7 +230,7 @@ def test_generate_alchemical_path_for_2_CPI_to_common_core():
     )
 
     output_files = mutate_2_CPI_to_7_CPI_cc(configuration=configuration)
-    assert len(output_files) == 18
+    assert len(output_files) == 17
 
 
 @pytest.mark.rsfe
@@ -244,7 +244,7 @@ def test_generate_alchemical_path_for_7_CPI_to_common_core():
     )
 
     output_files = mutate_7_CPI_to_2_CPI_cc(configuration=configuration)
-    assert len(output_files) == 13
+    assert len(output_files) == 12
 
 
 @pytest.mark.rsfe
@@ -269,3 +269,42 @@ def test_generate_alchemical_path_for_1a0q_1a07(caplog):
     )
     perform_mutations(configuration=configuration, i=i, mutation_list=mutation_list)
     assert i.current_step == 24
+
+
+@pytest.mark.rbfe
+def test_generate_path_for_ppar_system():
+    conf = f"{get_testsystems_dir()}/config/ppar-cpd31_cpd25.yaml"
+    configuration = load_config_yaml(
+        config=conf, input_dir=get_testsystems_dir(), output_dir=get_test_output_dir()
+    )
+
+    s1 = SystemStructure(configuration, "structure1")
+    s2 = SystemStructure(configuration, "structure2")
+    s1_to_s2 = ProposeMutationRoute(s1, s2)
+    s1_to_s2.propose_common_core()
+    s1_to_s2.finish_common_core()
+
+    assert (
+        len(s1_to_s2.get_common_core_idx_mol1())
+        == len(s1_to_s2.get_common_core_idx_mol2())
+        == 46
+    )
+
+    assert s1_to_s2.get_idx_not_in_common_core_for_mol1() == [31, 32, 33, 49, 50, 51]
+    assert s1_to_s2.get_idx_not_in_common_core_for_mol2() == [0, 1, 40, 49, 50, 51]
+
+    assert s1_to_s2.terminal_real_atom_cc1 == [19, 12]
+    assert s1_to_s2.terminal_real_atom_cc2 == [21, 14]
+
+    assert s1_to_s2.dummy_region_cc1.lj_default == [31, 33]
+    assert s1_to_s2.dummy_region_cc2.lj_default == [40, 1]
+    assert s1_to_s2.dummy_region_cc2.connected_dummy_regions == [
+        [49, 50, 51, 0, 1],
+        [40],
+    ]
+    assert s1_to_s2.dummy_region_cc1.connected_dummy_regions == [
+        [33],
+        [51, 50, 49, 32, 31],
+    ]
+
+    assert s1_to_s2.matching_terminal_atoms_between_cc == [(12, 14), (19, 21)]