fix: support alternative atom names within connect_via_residue_names

biotite-dev · Dec 10, 2024 · c6acc78 · c6acc78
1 parent 3a7437d
commit c6acc78
Show file tree

Hide file tree

Showing 4 changed files with 15,353 additions and 1 deletion.
diff --git a/src/biotite/structure/bonds.pyx b/src/biotite/structure/bonds.pyx
@@ -1620,20 +1620,22 @@ def connect_via_residue_names(atoms, bint inter_residue=True,
     """
     from .info.bonds import bonds_in_residue
     from .residues import get_residue_starts
+    from .info.ccd import get_from_ccd
 
     cdef list bonds = []
     cdef int res_i
     cdef int i, j
     cdef int curr_start_i, next_start_i
     cdef np.ndarray atom_names = atoms.atom_name
     cdef np.ndarray atom_names_in_res
+    cdef np.ndarray std_atom_ids
     cdef np.ndarray res_names = atoms.res_name
     cdef str atom_name1, atom_name2
     cdef int64[:] atom_indices1, atom_indices2
     cdef dict bond_dict_for_res
 
     residue_starts = get_residue_starts(atoms, add_exclusive_stop=True)
-    # Omit exclsive stop in 'residue_starts'
+    # Omit exclusive stop in 'residue_starts'
     for res_i in range(len(residue_starts)-1):
         curr_start_i = residue_starts[res_i]
         next_start_i = residue_starts[res_i+1]
@@ -1646,6 +1648,38 @@ def connect_via_residue_names(atoms, bint inter_residue=True,
             )
 
         atom_names_in_res = atom_names[curr_start_i : next_start_i]
+
+        # Check if we should use alternative atom names
+        std_atom_ids = get_from_ccd(
+            "chem_comp_atom", 
+            res_names[curr_start_i], 
+            "atom_id"
+        ) 
+        if (atom_names_in_res is not None and \
+            std_atom_ids is not None and \
+            not set(atom_names_in_res).issubset(std_atom_ids)):
+            # We do not assume that the order of atoms within 
+            # atom_names_in_res matches that of the CCD
+            alt_atom_ids = get_from_ccd(
+                "chem_comp_atom", 
+                res_names[curr_start_i], 
+                "alt_atom_id"
+            ) 
+            if set(atom_names_in_res).issubset(alt_atom_ids):
+                # Standardize atom IDs
+                mapping = dict(zip(alt_atom_ids, std_atom_ids))
+                mapped_atom_names_in_res = np.vectorize(
+                    mapping.get
+                )(atom_names_in_res)
+                atom_names_in_res = mapped_atom_names_in_res
+
+                # Uncommenting the line below would modify atom_name in place
+                # and thus enforce standardized atom names (which may be unexpected behavior).
+                # TODO: Is that the desired behavior?
+                # atoms.atom_name[curr_start_i : next_start_i] = atom_names_in_res
+
+            # TODO: How to handle cases that do not fit either mapping?
+
         for (atom_name1, atom_name2), bond_type in bond_dict_for_res.items():
             atom_indices1 = np.where(atom_names_in_res == atom_name1)[0] \
                             .astype(np.int64, copy=False)