From f00ee28e0143ab41c75880af2c6eb8c3f0761429 Mon Sep 17 00:00:00 2001
From: Luthaf <luthaf@luthaf.fr>
Date: Wed, 15 May 2019 22:15:41 +0200
Subject: [PATCH] Convert MDAnalysis topology to chemfiles when writing

---
 package/MDAnalysis/coordinates/chemfiles.py   | 125 +++++++++++++++---
 .../coordinates/test_chemfiles.py             |  74 ++++++++++-
 2 files changed, 179 insertions(+), 20 deletions(-)
diff --git a/package/MDAnalysis/coordinates/chemfiles.py b/package/MDAnalysis/coordinates/chemfiles.py
index 83864a4168e..39794598f90 100644
--- a/package/MDAnalysis/coordinates/chemfiles.py
+++ b/package/MDAnalysis/coordinates/chemfiles.py
@@ -27,11 +27,6 @@
 provides C++ implementation of multiple formats readers and writers, the full
 list if available `here <formats>`_.
 
-This module only contains the coordinate reader and writer, which means that the
-topology read by chemfiles is ignored when reading a file, and no topology is
-written either. This can be surprising when using formats where topology and
-coordinates are not separated, such as XYZ or PDB.
-
 .. _chemfiles: https://chemfiles.org
 .. _formats: http://chemfiles.org/chemfiles/latest/formats.html
 
@@ -47,11 +42,12 @@
 from __future__ import print_function, unicode_literals
 
 import numpy as np
+from six import string_types
 
 from . import base, core
 
 import chemfiles
-from chemfiles import Trajectory, Frame, Atom, UnitCell, Residue
+from chemfiles import Trajectory, Frame, Atom, UnitCell, Residue, Topology
 
 
 assert chemfiles.__version__.startswith("0.9.")
@@ -90,7 +86,6 @@ def __init__(self, filename, chemfiles_format="", **kwargs):
         self._open()
         self.ts = self._Timestep(self.n_atoms, **self._ts_kwargs)
         self.next()
-        print(kwargs)
 
     def _open(self):
         self._closed = False
@@ -137,7 +132,7 @@ def _read_next_timestep(self, ts=None):
         self.ts = ts
         frame = self._file.read_step(self._step)
         self._frame_to_ts(frame, ts)
-        ts.frame = self._step
+        ts.frame = frame.step
         self._step += 1
         return ts
 
@@ -145,8 +140,8 @@ def _frame_to_ts(self, frame, ts):
         """convert a chemfiles frame to a :class:`TimeStep`"""
         if len(frame.atoms) != self.n_atoms:
             raise IOError(
-                "The number of atom changed in the trajectory. This is not "
-                "supported by MDAnalysis."
+                "The number of atom changed in the trajectory. "
+                "This is not supported by MDAnalysis."
             )
 
         ts.dimensions[:] = frame.cell.lengths + frame.cell.angles
@@ -155,12 +150,6 @@ def _frame_to_ts(self, frame, ts):
             ts.has_velocities = True
             ts.velocities[:] = frame.velocities[:]
 
-    @property
-    def dimensions(self):
-        """unitcell dimensions (*A*, *B*, *C*, *alpha*, *beta*, *gamma*)
-        """
-        return self.ts.dimensions
-
     def Writer(self, filename, n_atoms=None, **kwargs):
         """Return writer for trajectory format"""
         if n_atoms is None:
@@ -185,7 +174,7 @@ class ChemfilesWriter(base.WriterBase):
     multiframe = True
     units = {'time': 'fs', 'length': 'Angstrom'}
 
-    def __init__(self, filename, n_atoms=0, mode="w", chemfiles_format="", **kwargs):
+    def __init__(self, filename, n_atoms=0, mode="w", chemfiles_format="", topology=None, **kwargs):
         """
         Parameters
         ----------
@@ -202,6 +191,10 @@ def __init__(self, filename, n_atoms=0, mode="w", chemfiles_format="", **kwargs)
             use the given format name instead of guessing from the extension.
             The `list of supported formats <formats>`_ and the associated names
             is available in chemfiles documentation.
+        topology : Universe or AtomGroup (optional)
+            use the topology from this :class:`~MDAnalysis.core.groups.AtomGroup`
+            or :class:`~MDAnalysis.core.universe.Universe` to write all the
+            timesteps to the file
         **kwargs : dict
             General writer arguments.
 
@@ -214,6 +207,12 @@ def __init__(self, filename, n_atoms=0, mode="w", chemfiles_format="", **kwargs)
             raise IOError("Expected 'a' or 'w' as mode in ChemfilesWriter")
         self._file = Trajectory(self.filename, mode, chemfiles_format)
         self._closed = False
+        if topology is not None:
+            if isinstance(topology, string_types):
+                self._file.set_topology(topology)
+            else:
+                topology = self._topology_to_chemfiles(topology, n_atoms)
+                self._file.set_topology(topology)
 
     def close(self):
         """Close the trajectory file and finalize the writing"""
@@ -221,6 +220,42 @@ def close(self):
             self._file.close()
             self._closed = True
 
+    def write(self, obj):
+        """Write object `obj` at current trajectory frame to file.
+
+        Topology for the output is taken from the `obj` or default
+        to the value of the `topology` keyword supplied to the
+        :class:`ChemfilesWriter` constructor.
+
+        Parameters
+        ----------
+        obj : Universe or AtomGroup
+            The :class:`~MDAnalysis.core.groups.AtomGroup` or
+            :class:`~MDAnalysis.core.universe.Universe` to write.
+        """
+        if hasattr(obj, "atoms"):
+            if hasattr(obj, 'universe'):
+                # For AtomGroup and children (Residue, ResidueGroup, Segment)
+                ts_full = obj.universe.trajectory.ts
+                if ts_full.n_atoms == obj.atoms.n_atoms:
+                    ts = ts_full
+                else:
+                    # Only populate a time step with the selected atoms.
+                    ts = ts_full.copy_slice(atoms.indices)
+            elif hasattr(obj, 'trajectory'):
+                # For Universe only --- get everything
+                ts = obj.trajectory.ts
+        else:
+            if isinstance(obj, base.Timestep):
+                ts = obj
+                topology = None
+            else:
+                raise TypeError("No Timestep found in obj argument")
+
+        frame = self._timestep_to_chemfiles(ts)
+        frame.topology = self._topology_to_chemfiles(obj, len(frame.atoms))
+        self._file.write(frame)
+
     def write_next_timestep(self, ts):
         """Write timestep object into trajectory.
 
@@ -228,10 +263,10 @@ def write_next_timestep(self, ts):
         ----------
         ts: TimeStep
         """
-        frame = self._ts_to_frame(ts)
+        frame = self._timestep_to_chemfiles(ts)
         self._file.write(frame)
 
-    def _ts_to_frame(self, ts):
+    def _timestep_to_chemfiles(self, ts):
         """
         Convert a Timestep to a chemfiles Frame
         """
@@ -244,3 +279,55 @@ def _ts_to_frame(self, ts):
             frame.velocities[:] = ts.velocities[:]
         frame.cell = UnitCell(*ts.dimensions)
         return frame
+
+    def _topology_to_chemfiles(self, obj, n_atoms):
+        """
+        Convert an AtomGroup or an Universe to a chemfiles Topology
+        """
+        topology = Topology()
+        if not hasattr(obj, "atoms"):
+            # use an empty topology
+            topology.resize(n_atoms)
+            return topology
+
+        # (1) add all atoms to the topology
+        residues = {}
+        for atom in obj.atoms:
+            name = getattr(atom, 'name', "")
+            type = getattr(atom, 'type', name)
+            chemfiles_atom = Atom(name, type)
+
+            if hasattr(atom, 'altLoc'):
+                chemfiles_atom["altloc"] = str(atom.altLoc)
+
+            if hasattr(atom, 'segid'):
+                # TODO: what is the type of atom.segid?
+                chemfiles_atom["segid"] = atom.segid
+
+            if hasattr(atom, 'resid'):
+                resname = getattr(atom, 'resname', "")
+                if atom.resid not in residues.keys():
+                    residues[atom.resid] = Residue(resname, atom.resid)
+                residue = residues[atom.resid]
+
+                atom_idx = len(topology.atoms)
+                residue.atoms.append(atom_idx)
+
+                if hasattr(atom, "record_type"):
+                    # set corresponding chemfiles residue property
+                    if atom.record_type == "ATOM":
+                        residue["is_standard_pdb"] = True
+                    else:
+                        residue["is_standard_pdb"] = False
+
+            topology.atoms.append(chemfiles_atom)
+
+        # (2) add residues to the topology
+        for residue in residues.values():
+            topology.residues.append(residue)
+
+        # (3) add bonds to the topology
+        for bond in getattr(obj, "bonds", []):
+            topology.add_bond(bond.atoms[0].ix, bond.atoms[1].ix)
+
+        return topology
diff --git a/testsuite/MDAnalysisTests/coordinates/test_chemfiles.py b/testsuite/MDAnalysisTests/coordinates/test_chemfiles.py
index 458dc7fd324..e94e6063454 100644
--- a/testsuite/MDAnalysisTests/coordinates/test_chemfiles.py
+++ b/testsuite/MDAnalysisTests/coordinates/test_chemfiles.py
@@ -29,7 +29,7 @@
 
 from MDAnalysis.coordinates.chemfiles import ChemfilesReader, ChemfilesWriter
 
-from MDAnalysisTests import datafiles
+from MDAnalysisTests import datafiles, tempdir
 from MDAnalysisTests.coordinates.base import (
     MultiframeReaderTest, BaseWriterTest, BaseReference
 )
@@ -78,3 +78,75 @@ def test_read_chemfiles_format(self):
 
         for ts in u.trajectory:
             self.assertEqual(ts.n_atoms, 18364)
+
+    def test_changing_system_size(self):
+        outfile = tempdir.TempDir().name + "chemfiles-changing-size.xyz"
+        with open(outfile, "w") as fd:
+            fd.write(VARYING_XYZ)
+
+        u = mda.Universe(outfile, format="chemfiles", topology_format="XYZ")
+
+        with self.assertRaises(IOError):
+            u.trajectory._read_next_timestep()
+
+    def test_wrong_open_mode(self):
+        with self.assertRaises(IOError):
+            _ = ChemfilesWriter("", mode="r")
+
+    def check_topology(self, reference, file):
+        u = mda.Universe(reference)
+        atoms = set([
+            (atom.name, atom.type, atom.record_type)
+            for atom in u.atoms
+        ])
+        bonds = set([
+            (bond.atoms[0].ix, bond.atoms[1].ix)
+            for bond in u.bonds
+        ])
+
+        check = mda.Universe(file)
+        np.testing.assert_equal(
+            u.trajectory.ts.positions,
+            check.trajectory.ts.positions
+        )
+
+        for atom in check.atoms:
+            self.assertIn((atom.name, atom.type, atom.record_type), atoms)
+
+        for bond in check.bonds:
+            self.assertIn((bond.atoms[0].ix, bond.atoms[1].ix), bonds)
+
+    def test_write_topology(self):
+        u = mda.Universe(datafiles.CONECT)
+        outfile = tempdir.TempDir().name + "chemfiles-write-topology.pdb"
+        with ChemfilesWriter(outfile) as writer:
+            writer.write(u)
+        self.check_topology(datafiles.CONECT, outfile)
+
+        # Manually setting the topology when creating the ChemfilesWriter
+        # (1) from an object
+        with ChemfilesWriter(outfile, topology=u) as writer:
+            writer.write_next_timestep(u.trajectory.ts)
+        self.check_topology(datafiles.CONECT, outfile)
+
+        # (2) from a file
+        with ChemfilesWriter(outfile, topology=datafiles.CONECT) as writer:
+            writer.write_next_timestep(u.trajectory.ts)
+        # FIXME: this does not work, since chemfiles also insert the bonds
+        # which are implicit in PDB format (bewteen standard residues), while
+        # MDAnalysis only read the explicit CONNECT records.
+
+        # self.check_topology(datafiles.CONECT, outfile)
+
+
+VARYING_XYZ = """2
+
+A 0 0 0
+A 0 0 0
+4
+
+A 0 0 0
+A 0 0 0
+A 0 0 0
+A 0 0 0
+"""