diff --git a/package/CHANGELOG b/package/CHANGELOG index 5e53d0df8c2..db4715ecb6a 100644 --- a/package/CHANGELOG +++ b/package/CHANGELOG @@ -46,6 +46,7 @@ Fixes Changes + * PDB parsers/readers/writers replaced by permissive counterparts (Issue #777) * Generalized contact analysis class added. (Issue #702) Deprecations (Issue #599) diff --git a/package/MDAnalysis/coordinates/PDB.py b/package/MDAnalysis/coordinates/PDB.py index 7ec8c6bb9ad..c72411974d0 100644 --- a/package/MDAnalysis/coordinates/PDB.py +++ b/package/MDAnalysis/coordinates/PDB.py @@ -56,44 +56,24 @@ Implementations --------------- -Two different implementations of PDB I/O are available: the -":ref:`permissive`" and the ":ref:`strict`" Reader/Writers. -The default are the "permissive" ones but this can be changed by setting the -flag "permissive_pdb_reader" in :data:`MDAnalysis.core.flags` (see -:ref:`flags-label`) to ``False``:: +PDB I/O is available in the form of the":ref:`permissive`" +Reader/Writers. - MDAnalysis.core.flags["permissive_pdb_reader"] = False - -The *default for MDAnalysis* is to use the -":ref:`permissive`" :class:`PrimitivePDBReader` and -:class:`PrimitivePDBWriter`, corresponding to :: - - MDAnalysis.core.flags["permissive_pdb_reader"] = True - -On a case-by-case basis one kind of reader can be selected with the -*permissive* keyword to :class:`~MDAnalysis.core.AtomGroup.Universe`, e.g. :: - - u = MDAnalysis.Universe(PDB, permissive=False) - -would select :class:`PDBReader` instead of the default -:class:`PrimitivePDBReader`. +..deprecated:: 0.14.1 +Only available in the form of the ":ref:":ref:`permissive`" +Readers and Writers, see below .. _permissive: - Simple (permissive) PDB Reader and Writer ----------------------------------------- - A pure-Python implementation for PDB files commonly encountered in MD -simulations comes under the names :class:`PrimitivePDBReader` and -:class:`PrimitivePDBWriter`. It only implements a subset of the `PDB standard`_ +simulations comes under the names :class:`PDBReader` and +:class:`PDBWriter`. It only implements a subset of the `PDB standard`_ (for instance, it does not deal with insertion codes) and also allows some -typical enhancements such as 4-letter resids (introduced by CHARMM/NAMD). The -"primitive PDB Reader/Writer" are the *default* in MDAnalysis (equivalent to -supplying the *permissive* = ``True`` keyword to -:class:`~MDAnalysis.core.AtomGroup.Universe`). +typical enhancements such as 4-letter resids (introduced by CHARMM/NAMD). -The :class:`PrimitivePDBReader` can read multi-frame PDB files and represents -them as a trajectory. The :class:`PrimitivePDBWriter` can write single and +The :class:`PDBReader` can read multi-frame PDB files and represents +them as a trajectory. The :class:`PDBWriter` can write single and multi-frame PDB files as specified by the *multiframe* keyword. By default, it writes single frames. On the other hand, the :class:`MultiPDBWriter` is set up to write a PDB trajectory by default (equivalent to using *multiframe* = @@ -137,10 +117,10 @@ Classes ~~~~~~~ -.. autoclass:: PrimitivePDBReader +.. autoclass:: PDBReader :members: -.. autoclass:: PrimitivePDBWriter +.. autoclass:: PDBWriter :members: .. automethod:: _check_pdb_coordinates @@ -151,68 +131,20 @@ .. autoclass:: MultiPDBWriter :members: -.. _strict: - -Biopython (strict) PDB Reader and Writer ----------------------------------------- -The :mod:`PDB` module can make use of Biopython's :mod:`Bio.PDB` -[Hamelryck2003]_ but replaces the standard PDB file parser with one that uses -the :class:`MDAnalysis.coordinates.pdb.extensions.SloppyStructureBuilder` to -cope with very large pdb files as commonly encountered in MD simulations. The -Biopython-based :class:`PDBReader` has the advantage that it implements the -`PDB standard`_ rigorously but this comes at the cost of flexibility and -performance. It is also difficult to write out selections using this -implementation (:class:`PDBWriter`) and multi frame PDB files are not -implemented. The Biopython Reader/Writer can be selected when loading data into -a :class:`~MDAnalysis.core.AtomGroup.Universe` by providing the keyword -*permissive* = ``False``. +..deprecated:: 0.14.1 + Setting the flag "permissive_pdb_reader" in :data:`MDAnalysis.core.flags` + (see :ref:`flags-label`) to ``False``:: -The Biopython PDB parser :class:`Bio.PDB.PDBParser` is fairly strict and even -in its own permissive mode (which MDAnalysis employs) typically warns about -missing element names with a -:exc:`Bio.PDB.PDBExceptions.PDBConstructionWarning` . Such warnings, however, -are generally harmless and therefore are filtered (and ignored) by MDAnalysis -with the help of :func:`warnings.filterwarnings`. - - -Classes -~~~~~~~ - -.. autoclass:: PDBReader - :members: - -.. autoclass:: PDBWriter - :members: + MDAnalysis.core.flags["permissive_pdb_reader"] = False -References ----------- - -.. [Hamelryck2003] Hamelryck, T., Manderick, B. (2003) PDB parser and structure - class implemented in Python. Bioinformatics, 19, 2308-2310. - http://biopython.org - -.. _PDB standard: http://www.wwpdb.org/documentation/format32/v3.2.html -.. _END: http://www.wwpdb.org/documentation/format32/sect11.html#END + Now yields a placeholder Primitive PDB Reader/Writer that inherits from + PDBReader and Writer """ from six.moves import range -try: - # BioPython is overkill but potentially extensible (altLoc etc) - import Bio.PDB - from . import pdb - # disable PDBConstructionWarning from picky builder - import warnings - - warnings.filterwarnings('ignore', - category=Bio.PDB.PDBExceptions.PDBConstructionWarning, - message="Could not assign element|Used element .* for Atom") -except ImportError: - # TODO: fall back to PrimitivePDBReader - raise ImportError("No full-feature PDB I/O functionality. Install biopython.") - import os import errno import textwrap @@ -234,167 +166,7 @@ class implemented in Python. Bioinformatics, 19, 2308-2310. # Pairs of residue name / atom name in use to deduce PDB formatted atom names Pair = collections.namedtuple('Atom', 'resname name') - -class PDBReader(base.SingleFrameReader): - """Read a pdb file into a :mod:`BioPython.PDB` structure. - - The coordinates are also supplied as one numpy array and wrapped - into a Timestep object. - - .. Note:: The Biopython.PDB reader does not parse the ``CRYST1`` - record and hence the unitcell dimensions are not set. - Use the :class:`PrimitivePDBReader` instead (i.e. use - the ``primitive=True`` keyword for :class:`Universe`). - - .. versionchanged:: 0.11.0 - * Frames now 0-based instead of 1-based. - * All PDB header metadata parsed by the reader is available in - the dict :attr:`metadata`. - - """ - format = 'PDB' - units = {'time': None, 'length': 'Angstrom'} - - def _read_first_frame(self): - pdb_id = "0UNK" - self.pdb = pdb.extensions.get_structure(self.filename, pdb_id) - pos = np.array([atom.coord for atom in self.pdb.get_atoms()]) - self.n_atoms = pos.shape[0] - self.fixed = 0 # parse B field for fixed atoms? - #self.ts._unitcell[:] = ??? , from CRYST1? --- not implemented in Biopython.PDB - self.ts = self._Timestep.from_coordinates(pos, **self._ts_kwargs) - self.ts.frame = 0 - del pos - if self.convert_units: - self.convert_pos_from_native(self.ts._pos) # in-place ! - # metadata - self.metadata = self.pdb.header - - def Writer(self, filename, **kwargs): - """Returns a strict PDBWriter for *filename*. - - :Arguments: - *filename* - filename of the output PDB file - - :Returns: :class:`PDBWriter` - - .. Note:: - - This :class:`PDBWriter` 's :meth:`~PDBWriter.write` method - always requires a :class:`base.Timestep` as an argument (it is - not optional anymore when the Writer is obtained through - this method of :class:`PDBReader` .) - """ - # This is messy; we cannot get a universe from the Reader, which would - # be also needed to be fed to the PDBWriter (which is a total mess...). - # Hence we ignore the problem and document it in the doc string... --- - # the limitation is simply that PDBWriter.write() must always be called - # with an argument. - kwargs['BioPDBstructure'] = self.pdb # make sure that this Writer is - # always linked to this reader, don't bother with Universe - kwargs.pop('universe', None) - return PDBWriter(filename, **kwargs) - - -class PDBWriter(base.Writer): - """Write out the current time step as a pdb file. - - This is not cleanly implemented at the moment. One must supply a - universe, even though this is nominally an optional argument. The - class behaves slightly differently depending on if the structure - was loaded from a PDB (then the full-fledged :mod:`Bio.PDB` writer is - used) or if this is really only an atom selection (then a less - sophistiocated writer is employed). - - .. Note:: - - The standard PDBWriter can only write the *whole system*. In - order to write a selection, use the :class:`PrimitivePDBWriter` , - which happens automatically when the - :meth:`~MDAnalysis.core.AtomGroup.AtomGroup.write` method of a - :class:`~MDAnalysis.core.AtomGroup.AtomGroup` instance is used. - """ - format = 'PDB' - units = {'time': None, 'length': 'Angstrom'} - - # PDBWriter is a bit more complicated than the DCDWriter in the - # sense that a DCD frame only contains coordinate information. The - # PDB contains atom data as well and hence it MUST access the - # universe. In order to present a unified (and backwards - # compatible) interface we must keep the universe argument an - # optional keyword argument even though it really is required. - - def __init__(self, pdbfilename, universe=None, multi=False, **kwargs): - """pdbwriter = PDBWriter(,universe=universe,**kwargs) - - :Arguments: - pdbfilename filename; if multi=True, embed a %%d formatstring - so that write_next_timestep() can insert the frame number - - universe supply a universe [really REQUIRED; optional only for compatibility] - - multi False: write a single structure to a single pdb - True: write all frames to multiple pdb files - """ - import Bio.PDB.Structure - - self.universe = universe - # hack for PDBReader.Writer() - self.PDBstructure = kwargs.pop('BioPDBstructure', None) - if not self.PDBstructure: - try: - self.PDBstructure = universe.trajectory.pdb - except AttributeError: - pass - self.filename = pdbfilename - self.multi = multi - if self.multi: - raise NotImplementedError('Sorry, multi=True does not work yet.') - if self.PDBstructure is not None \ - and not isinstance(self.PDBstructure, Bio.PDB.Structure.Structure): - raise TypeError('If defined, PDBstructure must be a Bio.PDB.Structure.Structure, eg ' - 'Universe.trajectory.pdb.') - - def write_next_timestep(self, ts=None): - self.write(ts) - - def write(self, ts=None): - """Write timestep as a pdb file. - - If ts=None then we try to get the current one from the universe. - """ - if self.PDBstructure is None: - if self.universe is None: - warnings.warn("PDBWriter: Not writing frame as neither Timestep nor Universe supplied.") - return - # primitive PDB writing (ignores timestep argument) - ppw = PrimitivePDBWriter(self.filename) - ppw.write(self.universe.select_atoms('all')) - ppw.close() - else: - # full fledged PDB writer - # Let's cheat and use universe.pdb.pdb: modify coordinates - # and save... - if ts is None: - try: - ts = self.universe.trajectory.ts - except AttributeError: - warnings.warn("PDBWriter: Not writing frame as neither universe nor timestep supplied.") - return - if not hasattr(ts, '_pos'): - raise TypeError("The PDBWriter can only process a Timestep as " - " optional argument, not e.g. a selection. " - "Use the PrimitivePDBWriter instead and see " - "the docs.") - for a, pos in zip(self.PDBstructure.get_atoms(), ts._pos): - a.set_coord(pos) - io = pdb.extensions.SloppyPDBIO() - io.set_structure(self.PDBstructure) - io.save(self.filename) - - -class PrimitivePDBReader(base.Reader): +class PDBReader(base.Reader): """PDBReader that reads a `PDB-formatted`_ file, no frills. The following *PDB records* are parsed (see `PDB coordinate section`_ for @@ -445,7 +217,7 @@ class PrimitivePDBReader(base.Reader): ============= ============ =========== ============================================= - .. SeeAlso:: :class:`PrimitivePDBWriter`; :class:`PDBReader` + .. SeeAlso:: :class:`PDBWriter`; :class:`PDBReader` implements a larger subset of the header records, which are accessible as :attr:`PDBReader.metadata`. @@ -454,7 +226,7 @@ class PrimitivePDBReader(base.Reader): * New :attr:`title` (list with all TITLE lines). """ - format = 'Permissive_PDB' + format = 'PDB' units = {'time': None, 'length': 'Angstrom'} def __init__(self, filename, **kwargs): @@ -467,12 +239,12 @@ def __init__(self, filename, **kwargs): frame numbers. Therefore, the MODEL numbers must be a sequence of integers (typically starting at 1 or 0). """ - super(PrimitivePDBReader, self).__init__(filename, **kwargs) + super(PDBReader, self).__init__(filename, **kwargs) try: self._n_atoms = kwargs['n_atoms'] except KeyError: - raise ValueError("PrimitivePDBReader requires the n_atoms keyword") + raise ValueError("PDBReader requires the n_atoms keyword") self.model_offset = kwargs.pop("model_offset", 0) @@ -567,11 +339,11 @@ def Writer(self, filename, **kwargs): *filename* filename of the output PDB file - :Returns: :class:`PrimitivePDBWriter` + :Returns: :class:`PDBWriter` """ kwargs.setdefault('multiframe', self.n_frames > 1) - return PrimitivePDBWriter(filename, **kwargs) + return PDBWriter(filename, **kwargs) def rewind(self): self._read_frame(0) @@ -586,7 +358,7 @@ def _read_next_timestep(self, ts=None): ts = self.ts else: # TODO: cleanup _read_frame() to use a "free" Timestep - raise NotImplementedError("PrimitivePDBReader cannot assign to a timestep") + raise NotImplementedError("PDBReader cannot assign to a timestep") # frame is 1-based. Normally would add 1 to frame before calling # self._read_frame to retrieve the subsequent ts. But self._read_frame # assumes it is being passed a 0-based frame, and adjusts. @@ -649,13 +421,13 @@ def _read_frame(self, frame): return self.ts -class PrimitivePDBWriter(base.Writer): +class PDBWriter(base.Writer): """PDB writer that implements a subset of the `PDB 3.2 standard`_ . PDB format as used by NAMD/CHARMM: 4-letter resnames and segID are allowed, altLoc is written. - The :class:`PrimitivePDBWriter` can be used to either dump a coordinate + The :class:`PDBWriter` can be used to either dump a coordinate set to a PDB file (operating as a "single frame writer", selected with the constructor keyword *multiframe* = ``False``, the default) or by writing a PDB "movie" (multi frame mode, *multiframe* = ``True``), consisting of @@ -743,7 +515,7 @@ class PrimitivePDBWriter(base.Writer): Pair('PF5', 'FE2'), Pair('UNL', 'UNL')) def __init__(self, filename, bonds="conect", n_atoms=None, start=0, step=1, - remarks="Created by PrimitivePDBWriter", + remarks="Created by PDBWriter", convert_units=None, multiframe=None): """Create a new PDBWriter @@ -834,12 +606,12 @@ def _write_pdb_header(self): self._write_pdb_title(self) return if self.first_frame_done == True: - return + return self.first_frame_done = True u = self.obj.universe self.HEADER(u.trajectory) - + self._write_pdb_title() self.COMPND(u.trajectory) @@ -907,7 +679,7 @@ def _write_pdb_bonds(self): records for anything smaller than the :class:`Universe` are written. .. versionchanged:: 0.7.6 - Only write CONECT records if :attr:`PrimitivePDBWriter.bonds` ``== True``. + Only write CONECT records if :attr:`PDBWriter.bonds` ``== True``. Raises :exc:`NotImplementedError` if it would produce wrong output. """ @@ -978,7 +750,7 @@ def _update_frame(self, obj): """ if isinstance(obj, base.Timestep): - raise TypeError("PrimitivePDBWriter cannot write Timestep objects " + raise TypeError("PDBWriter cannot write Timestep objects " "directly, since they lack topology information (" "atom names and types) required in PDB files") # remember obj for some of other methods --- NOTE: this is an evil/lazy @@ -995,7 +767,7 @@ def _update_frame(self, obj): traj = obj.trajectory if not (ts and traj): - raise AssertionError("PrimitivePDBWriter couldn't extract " + raise AssertionError("PDBWriter couldn't extract " "trajectory and timestep information " "from an object; inheritance problem.") @@ -1040,18 +812,19 @@ def write_all_timesteps(self, obj): constructor). Thus, if *u* is a Universe then :: u.trajectory[-2] - pdb = PrimitivePDBWriter("out.pdb", u.atoms.n_atoms) + pdb = PDBWriter("out.pdb", u.atoms.n_atoms) pdb.write_all_timesteps(u) will write a PDB trajectory containing the last 2 frames and :: - pdb = PrimitivePDBWriter("out.pdb", u.atoms.n_atoms, start=12, skip=2) + pdb = PDBWriter("out.pdb", u.atoms.n_atoms, start=12, skip=2) pdb.write_all_timesteps(u) will be writing frames 12, 14, 16, ... .. versionchanged:: 0.11.0 Frames now 0-based instead of 1-based + """ self._update_frame(obj) @@ -1136,7 +909,7 @@ def _write_timestep(self, ts, multiframe=False): the moment we do *not* write the NUMMDL_ record.) The *multiframe* = ``False`` keyword signals that the - :class:`PrimitivePDBWriter` is in single frame mode and no MODEL_ + :class:`PDBWriter` is in single frame mode and no MODEL_ records are written. .. _MODEL: http://www.wwpdb.org/documentation/format32/sect9.html#MODEL @@ -1252,7 +1025,7 @@ def END(self): """Write END_ record. Only a single END record is written. Calling END multiple times has no - effect. Because :meth:`~PrimitivePDBWriter.close` also calls this + effect. Because :meth:`~PDBWriter.close` also calls this method right before closing the file it is recommended to *not* call :meth:`~PrimitivePDBWriter.END` explicitly. @@ -1282,8 +1055,19 @@ def CONECT(self, conect): conect = "".join(conect) self.pdbfile.write(self.fmt['CONECT'].format(conect)) +warnings.warn('PrimitivePDBReader is identical to the PDBReader,' + 'it is deprecated in favor of the shorter name', + category=DeprecationWarning) +class PrimitivePDBReader(PDBReader): + format = 'Permissive_PDB' + +warnings.warn('PrimitivePDBWriter is now identical to the PDBWriter,' + 'it is deprecated in favor of the shorter name', + category=DeprecationWarning) +class PrimitivePDBWriter(PDBWriter): + format = 'Permissive_PDB' -class ExtendedPDBReader(PrimitivePDBReader): +class ExtendedPDBReader(PDBReader): """PDBReader that reads a PDB-formatted file with five-digit residue numbers. This reader does not conform to the `PDB standard`_ because it allows @@ -1292,7 +1076,7 @@ class ExtendedPDBReader(PrimitivePDBReader): insertion code in the PDB standard). PDB files in this format are written by popular programs such as VMD_. - .. SeeAlso:: :class:`PrimitivePDBReader` + .. SeeAlso:: :class:`PDBReader` .. _PDB standard: http://www.wwpdb.org/documentation/format32/sect9.html .. _VMD: http://www.ks.uiuc.edu/Research/vmd/ @@ -1302,7 +1086,7 @@ class ExtendedPDBReader(PrimitivePDBReader): format = "XPDB" -class MultiPDBWriter(PrimitivePDBWriter): +class MultiPDBWriter(PDBWriter): """PDB writer that implements a subset of the `PDB 3.2 standard`_ . PDB format as used by NAMD/CHARMM: 4-letter resnames and segID, altLoc @@ -1321,7 +1105,7 @@ class MultiPDBWriter(PrimitivePDBWriter): .. SeeAlso:: - This class is identical to :class:`PrimitivePDBWriter` with the one + This class is identical to :class:`PDBWriter` with the one exception that it defaults to writing multi-frame PDB files instead of single frames. diff --git a/package/MDAnalysis/coordinates/__init__.py b/package/MDAnalysis/coordinates/__init__.py index 106c96fcd19..026c1d70393 100644 --- a/package/MDAnalysis/coordinates/__init__.py +++ b/package/MDAnalysis/coordinates/__init__.py @@ -179,10 +179,8 @@ | | | | optional `netcdf4-python`_ module (coordinates and | | | | | velocities). Module :mod:`MDAnalysis.coordinates.TRJ`| +---------------+-----------+-------+------------------------------------------------------+ - | Brookhaven | pdb | r/w | a simplified PDB format (as used in MD simulations) | - | [#a]_ | | | is read by default; the full format can be read by | - | | | | supplying the `permissive=False` flag to | - | | | | :class:`MDAnalysis.Universe`. Multiple frames (MODEL)| + | Brookhaven | pdb | r/w | a relaxed PDB format (as used in MD simulations) | + | [#a]_ | | | is read by default; Multiple frames (MODEL) | | | | | are supported but require the *multiframe* keyword. | | | | | Module :mod:`MDAnalysis.coordinates.PDB` | +---------------+-----------+-------+------------------------------------------------------+ @@ -265,7 +263,7 @@ - 2015-01-15 Timestep._init_unitcell() method added - 2015-06-11 Reworked Timestep init. Base Timestep now does Vels & Forces - 2015-07-21 Major changes to Timestep and Reader API (release 0.11.0) - +- 2016-04-03 Removed references to Strict Readers for PDBS [jdetle] .. _Issue 49: https://github.com/MDAnalysis/mdanalysis/issues/49 .. _Context Manager: http://docs.python.org/2/reference/datamodel.html#context-managers diff --git a/package/MDAnalysis/coordinates/core.py b/package/MDAnalysis/coordinates/core.py index c741d4757fa..d64bd71023e 100644 --- a/package/MDAnalysis/coordinates/core.py +++ b/package/MDAnalysis/coordinates/core.py @@ -51,7 +51,8 @@ def get_reader_for(filename, permissive=False, format=None): filename of the input trajectory or coordinate file permissive : bool If set to ``True``, a reader is selected that is more tolerant of the - input (currently only implemented for PDB). [``False``] + input (currently a deprecated feature only implemented for PDB). + [``False``] kwargs Keyword arguments for the selected Reader class. @@ -67,7 +68,7 @@ def get_reader_for(filename, permissive=False, format=None): if format is None: format = util.guess_format(filename) format = format.upper() - if permissive and format == 'PDB': + if format == 'PDB' and permissive: return _READERS['Permissive_PDB'] try: return _READERS[format] @@ -101,6 +102,7 @@ def reader(filename, **kwargs): filename : str or tuple filename (or tuple of filenames) of the input coordinate file permissive : bool + If set to ``True``, a reader is selected that is more tolerant of the input (currently only implemented for PDB). [``False``] kwargs @@ -113,8 +115,8 @@ def reader(filename, **kwargs): .. SeeAlso:: For trajectory formats: :class:`~DCD.DCDReader`, :class:`~XTC.XTCReader`, :class:`~TRR.TRRReader`, :class:`~XYZ.XYZReader`. For single frame formats: - :class:`~CRD.CRDReader`, :class:`~PDB.PDBReader` and - :class:`~PDB.PrimitivePDBReader`, :class:`~GRO.GROReader`, + :class:`~CRD.CRDReader`, and + :class:`~PDB.PDBReader`, :class:`~GRO.GROReader`, """ if isinstance(filename, tuple): Reader = get_reader_for(filename[0], diff --git a/package/MDAnalysis/core/AtomGroup.py b/package/MDAnalysis/core/AtomGroup.py index fc896113cf8..eb00c28d541 100644 --- a/package/MDAnalysis/core/AtomGroup.py +++ b/package/MDAnalysis/core/AtomGroup.py @@ -4195,10 +4195,10 @@ def __init__(self, *args, **kwargs): topology) is always required. *permissive* - currently only relevant for PDB files: Set to ``True`` in order to ignore most errors - and read typical MD simulation PDB files; set to ``False`` to read with the Bio.PDB reader, - which can be useful for real Protein Databank PDB files. ``None`` selects the - MDAnalysis default (which is set in :class:`MDAnalysis.core.flags`) [``None``] + currently only relevant to PDBs, Readers and Writers and Parsers + returned by setting ::permisive = False are depecrated in favor + of their ::permissive = True counterparts + *topology_format* provide the file format of the topology file; ``None`` guesses it from the file extension [``None``] @@ -4260,6 +4260,9 @@ def __init__(self, *args, **kwargs): .. versionchanged:: 0.11.0 Added the *is_anchor* and *anchor_name* keywords for finer behavior control when unpickling instances of :class:`MDAnalysis.core.AtomGroup.AtomGroup`. + .. versionchanged:: 0.14.1 + *permissive* set to ``True`` and set to ``False`` both yield permissive + PDB readers, writers and parsers """ from ..topology.core import get_parser_for diff --git a/package/MDAnalysis/core/__init__.py b/package/MDAnalysis/core/__init__.py index 50c3d147183..a5ac338b507 100644 --- a/package/MDAnalysis/core/__init__.py +++ b/package/MDAnalysis/core/__init__.py @@ -1,5 +1,5 @@ # -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*- -# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 +# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 # # MDAnalysis --- http://www.MDAnalysis.org # Copyright (c) 2006-2015 Naveen Michaud-Agrawal, Elizabeth J. Denning, Oliver Beckstein @@ -390,22 +390,21 @@ def __doc__(self): >>> flags['%(name)s'] = value - The Bio.PDB reader (value=``False``) can deal with 'proper' PDB - files from the Protein Databank that contain special PDB features - such as insertion codes and it can auto-correct some common - mistakes; see :mod:`Bio.PDB` for details. However, Bio.PDB has been - known to read some simulation system PDB files **incompletely**; a - sure sign of problems is a warning that an atom has appeared twice - in a residue. - - Therefore, the default for the PDB reader is ``True``, which - selects the "primitive" (or "permissive") reader - :class:`MDAnalysis.coordinates.PDB.PrimitivePDBReader`, which + The default for the PDB reader is ``True``, which + selects the (or "permissive") reader + :class:`MDAnalysis.coordinates.PDB.PDBReader`, which essentially just reads ATOM and HETATM lines and puts atoms in a list. + ``False`` selects the :class: + `MDAnalysis.coordinates.PDB.PrimitivePDBReader` which is a copy of + the :class:`MDAnalysis.coordinates.PDB.PDBReader` deprecated in favor + of its counterpart + One can manually switch between the two by providing the *permissive* keyword to :class:`MDAnalysis.Universe`. + + """ ), _Flag( diff --git a/package/MDAnalysis/topology/ExtendedPDBParser.py b/package/MDAnalysis/topology/ExtendedPDBParser.py index b0285a36292..bbd8c40ae1b 100644 --- a/package/MDAnalysis/topology/ExtendedPDBParser.py +++ b/package/MDAnalysis/topology/ExtendedPDBParser.py @@ -21,7 +21,7 @@ This topology parser uses a PDB file to build a minimum internal structure representation (list of atoms). The only difference from -:mod:`~MDAnalysis.topology.PrimitivePDBParser` is that this parser reads a +:mod:`~MDAnalysis.topology.PDBParser` is that this parser reads a non-standard PDB-like format in which residue numbers can be five digits instead of four. @@ -37,7 +37,7 @@ .. SeeAlso:: - * :mod:`MDAnalysis.topology.PrimitivePDBParser` + * :mod:`MDAnalysis.topology.PDBParser` * :class:`MDAnalysis.coordinates.PDB.ExtendedPDBReader` * :class:`MDAnalysis.core.AtomGroup.Universe` @@ -51,10 +51,10 @@ """ from __future__ import absolute_import -from . import PrimitivePDBParser +from . import PDBParser -class ExtendedPDBParser(PrimitivePDBParser.PrimitivePDBParser): +class ExtendedPDBParser(PDBParser.PDBParser): """Parser that obtains a list of atoms from an non-standard "extended" PDB file. Extended PDB files (MDAnalysis format specifier *XPDB*) may contain residue diff --git a/package/MDAnalysis/topology/PDBParser.py b/package/MDAnalysis/topology/PDBParser.py index 0e2e3e59973..699fc072405 100644 --- a/package/MDAnalysis/topology/PDBParser.py +++ b/package/MDAnalysis/topology/PDBParser.py @@ -1,9 +1,10 @@ + # -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*- -# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 +# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 # # MDAnalysis --- http://www.MDAnalysis.org -# Copyright (c) 2006-2015 Naveen Michaud-Agrawal, Elizabeth J. Denning, Oliver Beckstein -# and contributors (see AUTHORS for the full list) +# Copyright (c) 2006-2015 Naveen Michaud-Agrawal, Elizabeth J. Denning, Oliver +# Beckstein and contributors (see AUTHORS for the full list) # # Released under the GNU Public Licence, v2 or any higher version # @@ -14,20 +15,29 @@ # """ -PDB topology parser -=================== +PDB Topology Parser +========================================================================= + +This topology parser uses a standard PDB file to build a minimum +internal structure representation (list of atoms). -Use a PDB file to build a minimum internal structure representation. +The topology reader reads a PDB file line by line and ignores atom +numbers but only reads residue numbers up to 9,999 correctly. If you +have systems containing at least 10,000 residues then you need to use +a different file format (e.g. the "extended" PDB, *XPDB* format, see +:mod:`~MDAnalysis.topology.ExtendedPDBParser`) that can handle residue +numbers up to 99,999. -.. Note:: Only atoms and their names are read; no bond connectivity of - (partial) charges are deduced. Masses are guessed and set to - 0 if unknown. +.. Note:: -.. SeeAlso:: :mod:`MDAnalysis.coordinates.PDB` and :mod:`Bio.PDB` + The parser processes atoms and their names. Masses are guessed and set to 0 + if unknown. Partial charges are not set. -.. SeeAlso:: :mod:`MDAnalysis.topology.PrimitivePDBParser` (which - *can* guess conectivity but does not support all subleties of the full - PDB format) +.. SeeAlso:: + + * :mod:`MDAnalysis.topology.ExtendedPDBParser` + * :class:`MDAnalysis.coordinates.PDB.PDBReader` + * :class:`MDAnalysis.core.AtomGroup.Universe` Classes ------- @@ -37,62 +47,172 @@ :inherited-members: """ -from __future__ import absolute_import +from __future__ import absolute_import, print_function -try: - # BioPython is overkill but potentially extensible (altLoc etc) - import Bio.PDB -except ImportError: - raise ImportError("Bio.PDB from biopython not found." - "Required for PDB topology parser.") +import numpy as np +import warnings -from .base import TopologyReader from ..core.AtomGroup import Atom -from ..coordinates.pdb.extensions import get_structure -from .core import guess_atom_type, guess_atom_mass, guess_atom_charge +from .core import get_atom_mass, guess_atom_element +from ..lib.util import openany +from .base import TopologyReader class PDBParser(TopologyReader): - """Read minimum topology information from a PDB file.""" + """Parser that obtains a list of atoms from a standard PDB file. + + See Also + -------- + :class:`MDAnalysis.coordinates.PDB.PDBReader` + + .. versionadded:: 0.8 + """ format = 'PDB' def parse(self): - """Parse atom information from PDB file *pdbfile*. + """Parse atom information from PDB file *filename*. - Only reads the list of atoms. + Returns + ------- + MDAnalysis internal *structure* dict - This functions uses the :class:`Bio.PDB.PDBParser` as used by - :func:`MDAnalysis.coordinates.pdb.extensions.get_structure`. + See Also + -------- + The *structure* dict is defined in `MDAnalysis.topology` and the file + is read with :class:`MDAnalysis.coordinates.PDB.PDBReader`. - :Returns: MDAnalysis internal *structure* dict - - .. SeeAlso:: The *structure* dict is defined in `MDAnalysis.topology`. """ + structure = {} + atoms = self._parseatoms() + structure['atoms'] = atoms - structure = {'atoms': atoms} + bonds = self._parsebonds(atoms) + structure['bonds'] = bonds return structure def _parseatoms(self): - # use Sloppy PDB parser to cope with big PDBs! - pdb = get_structure(self.filename, "0UNK") - + iatom = 0 atoms = [] - # translate Bio.PDB atom objects to MDAnalysis Atom. - for iatom, atom in enumerate(pdb.get_atoms()): - residue = atom.parent - chain_id = residue.parent.id - atomname = atom.name - atomtype = guess_atom_type(atomname) - resname = residue.resname - resid = int(residue.id[1]) - # no empty segids (or Universe throws IndexError) - segid = residue.get_segid().strip() or chain_id or "SYSTEM" - mass = guess_atom_mass(atomname) - charge = guess_atom_charge(atomname) - bfactor = atom.bfactor - # occupancy = atom.occupancy - atoms.append(Atom(iatom, atomname, atomtype, resname, resid, segid, - mass, charge, bfactor=bfactor, universe=self._u)) + + with openany(self.filename) as f: + resid_prev = 0 # resid looping hack + for i, line in enumerate(f): + line = line.strip() # Remove extra spaces + if len(line) == 0: # Skip line if empty + continue + record = line[:6].strip() + + if record.startswith('END'): + break + elif line[:6] in ('ATOM ', 'HETATM'): + try: + serial = int(line[6:11]) + except ValueError: + # serial can become '***' when they get too high + self._wrapped_serials = True + serial = None + name = line[12:16].strip() + altLoc = line[16:17].strip() + resName = line[17:21].strip() + # empty chainID is a single space ' '! + chainID = line[21:22].strip() + if self.format == "XPDB": # fugly but keeps code DRY + # extended non-standard format used by VMD + resSeq = int(line[22:27]) + resid = resSeq + else: + resSeq = int(line[22:26]) + resid = resSeq + + while resid - resid_prev < -5000: + resid += 10000 + resid_prev = resid + # insertCode = _c(27, 27, str) # not used + # occupancy = float(line[54:60]) + try: + tempFactor = float(line[60:66]) + except ValueError: + tempFactor = 0.0 + segID = line[66:76].strip() + element = line[76:78].strip() + + segid = segID.strip() or chainID.strip() or "SYSTEM" + + elem = guess_atom_element(name) + + atomtype = element or elem + mass = get_atom_mass(elem) + # charge = guess_atom_charge(name) + charge = 0.0 + + atom = Atom(iatom, name, atomtype, resName, resid, + segid, mass, charge, + bfactor=tempFactor, serial=serial, + altLoc=altLoc, universe=self._u, + resnum=resSeq) + iatom += 1 + atoms.append(atom) + return atoms + + def _parsebonds(self, atoms): + # Could optimise this by saving lines in the main loop + # then doing post processing after all Atoms have been read + # ie do one pass through the file only + # Problem is that in multiframe PDB, the CONECT is at end of file, + # so the "break" call happens before bonds are reached. + + # If the serials wrapped, this won't work + if hasattr(self, '_wrapped_serials'): + warnings.warn("Invalid atom serials were present, bonds will not" + " be parsed") + return tuple([]) + + # Mapping between the atom array indicies a.index and atom ids + # (serial) in the original PDB file + mapping = dict((a.serial, a.index) for a in atoms) + + bonds = set() + with openany(self.filename, "r") as f: + lines = (line for line in f if line[:6] == "CONECT") + for line in lines: + atom, atoms = _parse_conect(line.strip()) + for a in atoms: + bond = tuple([mapping[atom], mapping[a]]) + bonds.add(bond) + + bonds = tuple(bonds) + + return bonds + + +def _parse_conect(conect): + """parse a CONECT record from pdbs + + Parameters + ---------- + conect : str + white space striped CONECT record + + Returns + ------- + atom_id : int + atom index of bond + bonds : set + atom ids of bonded atoms + + Raises + ------ + RuntimeError + Raised if ``conect`` is not a valid CONECT record + """ + atom_id = np.int(conect[6:11]) + n_bond_atoms = len(conect[11:]) // 5 + if len(conect[11:]) % n_bond_atoms != 0: + raise RuntimeError("Bond atoms aren't aligned proberly for CONECT " + "record: {}".format(conect)) + bond_atoms = (int(conect[11 + i * 5: 16 + i * 5]) for i in + range(n_bond_atoms)) + return atom_id, bond_atoms diff --git a/package/MDAnalysis/topology/PrimitivePDBParser.py b/package/MDAnalysis/topology/PrimitivePDBParser.py index 4a5fd9eb841..d39c5324366 100644 --- a/package/MDAnalysis/topology/PrimitivePDBParser.py +++ b/package/MDAnalysis/topology/PrimitivePDBParser.py @@ -35,7 +35,7 @@ .. SeeAlso:: * :mod:`MDAnalysis.topology.ExtendedPDBParser` - * :class:`MDAnalysis.coordinates.PDB.PrimitivePDBReader` + * :class:`MDAnalysis.coordinates.PDB.PDBReader` * :class:`MDAnalysis.core.AtomGroup.Universe` Classes @@ -45,148 +45,33 @@ :members: :inherited-members: +..deprecated:: 0.14.1 + PrimitivePDBParser now has the same behavior as PDBParser, and will be + absent from future versions """ + from __future__ import absolute_import, print_function import numpy as np import warnings +from . import PDBParser from ..core.AtomGroup import Atom from .core import get_atom_mass, guess_atom_element from ..lib.util import openany from .base import TopologyReader -class PrimitivePDBParser(TopologyReader): - """Parser that obtains a list of atoms from a standard PDB file. - - See Also - -------- - :class:`MDAnalysis.coordinates.PDB.PrimitivePDBReader` - - .. versionadded:: 0.8 - """ +warnings.warn('PrimitivePDBParser is identical to the PDBParser,' + 'it is deprecated in favor of the shorter name', + category=DeprecationWarning) +class PrimitivePDBParser(PDBParser.PDBParser): format = 'Permissive_PDB' - def parse(self): - """Parse atom information from PDB file *filename*. - - Returns - ------- - MDAnalysis internal *structure* dict - - See Also - -------- - The *structure* dict is defined in `MDAnalysis.topology` and the file - is read with :class:`MDAnalysis.coordinates.PDB.PrimitivePDBReader`. - - """ - structure = {} - - atoms = self._parseatoms() - structure['atoms'] = atoms - - bonds = self._parsebonds(atoms) - structure['bonds'] = bonds - - return structure - - def _parseatoms(self): - iatom = 0 - atoms = [] - - with openany(self.filename) as f: - resid_prev = 0 # resid looping hack - for i, line in enumerate(f): - line = line.strip() # Remove extra spaces - if len(line) == 0: # Skip line if empty - continue - record = line[:6].strip() - - if record.startswith('END'): - break - elif line[:6] in ('ATOM ', 'HETATM'): - try: - serial = int(line[6:11]) - except ValueError: - # serial can become '***' when they get too high - self._wrapped_serials = True - serial = None - name = line[12:16].strip() - altLoc = line[16:17].strip() - resName = line[17:21].strip() - # empty chainID is a single space ' '! - chainID = line[21:22].strip() - if self.format == "XPDB": # fugly but keeps code DRY - # extended non-standard format used by VMD - resSeq = int(line[22:27]) - resid = resSeq - else: - resSeq = int(line[22:26]) - resid = resSeq - - while resid - resid_prev < -5000: - resid += 10000 - resid_prev = resid - # insertCode = _c(27, 27, str) # not used - # occupancy = float(line[54:60]) - try: - tempFactor = float(line[60:66]) - except ValueError: - tempFactor = 0.0 - segID = line[66:76].strip() - element = line[76:78].strip() - - segid = segID.strip() or chainID.strip() or "SYSTEM" - - elem = guess_atom_element(name) - - atomtype = element or elem - mass = get_atom_mass(elem) - # charge = guess_atom_charge(name) - charge = 0.0 - - atom = Atom(iatom, name, atomtype, resName, resid, - segid, mass, charge, - bfactor=tempFactor, serial=serial, - altLoc=altLoc, universe=self._u, - resnum=resSeq) - iatom += 1 - atoms.append(atom) - - return atoms - - def _parsebonds(self, atoms): - # Could optimise this by saving lines in the main loop - # then doing post processing after all Atoms have been read - # ie do one pass through the file only - # Problem is that in multiframe PDB, the CONECT is at end of file, - # so the "break" call happens before bonds are reached. - - # If the serials wrapped, this won't work - if hasattr(self, '_wrapped_serials'): - warnings.warn("Invalid atom serials were present, bonds will not" - " be parsed") - return tuple([]) - - # Mapping between the atom array indicies a.index and atom ids - # (serial) in the original PDB file - mapping = dict((a.serial, a.index) for a in atoms) - - bonds = set() - with openany(self.filename, "r") as f: - lines = (line for line in f if line[:6] == "CONECT") - for line in lines: - atom, atoms = _parse_conect(line.strip()) - for a in atoms: - bond = tuple([mapping[atom], mapping[a]]) - bonds.add(bond) - - bonds = tuple(bonds) - - return bonds - - +warnings.warn('PrimitivePDBParser is identical to the PDBParser,' + 'it is deprecated in favor of the shorter name,' + 'please use the parse_conect function of that class', + category=DeprecationWarning) def _parse_conect(conect): """parse a CONECT record from pdbs