Skip to content

Commit

Permalink
replaced netCDF4 with scipy.io.netcdf (issue #506)
Browse files Browse the repository at this point in the history
- API is different (less feature rich in the pure python
  implementation) so it is not straightforward to support
  both netCDF4 and fall back on scipy.io.netcdf (or perhaps
  even pupyere, which is a single-file package from which the
  scipy code originated)
- uses float32 for
  - positions
  - velocities
  - forces
  (port of resolution of issue #518)
- change NCDFReader, NCDFWriter and test_netcdf
- see discussion of issue #506 for performance benchmarks
  • Loading branch information
orbeckst committed Dec 22, 2015
1 parent 798f3a7 commit 4f4eef9
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 69 deletions.
2 changes: 2 additions & 0 deletions package/CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ Changes
* The lib._distances and lib_distances_openmp libraries now have a
OPENMP_ENABLED boolean flag which indicates if openmp was used in
compilation. (Issue #530)
* use scipy.io.netcdf pure python implementation for reading of Amber
netcdf3 trajctories instead of netCDF4 + netcdf lib (see also Issue #488)

Fixes

Expand Down
101 changes: 51 additions & 50 deletions package/MDAnalysis/coordinates/TRJ.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,7 +349,7 @@ def open_trajectory(self):
"This is probably not a AMBER trajectory.")
# reset ts
ts = self.ts
ts.frame = -1
ts.frame = -1

return self.trjfile

Expand Down Expand Up @@ -413,19 +413,15 @@ class NCDFReader(base.Reader):

def __init__(self, filename, n_atoms=None, **kwargs):
try:
import netCDF4 as netcdf
from scipy.io import netcdf
except ImportError:
logger.fatal(
"netcdf4-python with the netCDF and HDF5 libraries must be installed for the AMBER ncdf Reader.")
logger.fatal("See installation instructions at https://github.com/MDAnalysis/mdanalysis/wiki/netcdf")
raise ImportError("netCDF4 package missing.\n"
"netcdf4-python with the netCDF and HDF5 libraries must be installed for the AMBER ncdf "
"Reader.\n"
"See installation instructions at https://github.com/MDAnalysis/mdanalysis/wiki/netcdf")
logger.fatal("scip.io.netcdf must be installed for the AMBER ncdf Reader.")
raise ImportError("scipy.io.netcdf package missing but is required "
"for the Amber Reader.")

super(NCDFReader, self).__init__(filename, **kwargs)

self.trjfile = netcdf.Dataset(self.filename)
self.trjfile = netcdf.netcdf_file(self.filename)

if not ('AMBER' in self.trjfile.Conventions.split(',') or
'AMBER' in self.trjfile.Conventions.split()):
Expand All @@ -440,10 +436,15 @@ def __init__(self, filename, n_atoms=None, **kwargs):
warnings.warn(wmsg)
logger.warn(wmsg)

self.n_atoms = len(self.trjfile.dimensions['atom'])
self.n_frames = len(self.trjfile.dimensions['frame'])
# also records time steps in data.variables['time'] and unit
# but my example only has 0
self.n_atoms = self.trjfile.dimensions['atom']
self.n_frames = self.trjfile.dimensions['frame']
# example trajectory when read with scipy.io.netcdf has
# dimensions['frame'] == None (indicating a record dimension that can
# grow) whereas if read with netCDF4 I get len(dimensions['frame']) ==
# 10: in any case, we need to get the number of frames from somewhere
# such as the time variable:
if self.n_frames is None:
self.n_frames = self.trjfile.variables['time'].shape[0]

try:
self.remarks = self.trjfile.title
Expand All @@ -458,21 +459,20 @@ def __init__(self, filename, n_atoms=None, **kwargs):
# checks for not-implemented features (other units would need to be hacked into MDAnalysis.units)
if self.trjfile.variables['time'].units != "picosecond":
raise NotImplementedError(
"NETCDFReader currently assumes that the trajectory was written with a time unit of picoseconds and "
"not {0}.".format(
"NETCDFReader currently assumes that the trajectory was written "
"with a time unit of picoseconds and not {0}.".format(
self.trjfile.variables['time'].units))
if self.trjfile.variables['coordinates'].units != "angstrom":
raise NotImplementedError(
"NETCDFReader currently assumes that the trajectory was written with a length unit of Angstroem and "
"not {0}.".format(
"NETCDFReader currently assumes that the trajectory was written "
"with a length unit of Angstroem and not {0}.".format(
self.trjfile.variables['coordinates'].units))
if hasattr(self.trjfile.variables['coordinates'], 'scale_factor'):
raise NotImplementedError("scale_factors are not implemented")
if n_atoms is not None:
if n_atoms != self.n_atoms:
raise ValueError("Supplied n_atoms (%d) != natom from ncdf (%d). "
"Note: n_atoms can be None and then the ncdf value is used!" % (
n_atoms, self.n_atoms))
if n_atoms is not None and n_atoms != self.n_atoms:
raise ValueError("Supplied n_atoms ({0}) != natom from ncdf ({1}). "
"Note: n_atoms can be None and then the ncdf value "
"is used!".format(n_atoms, self.n_atoms))

self.has_velocities = 'velocities' in self.trjfile.variables
self.has_forces = 'forces' in self.trjfile.variables
Expand All @@ -493,9 +493,10 @@ def _read_frame(self, frame):
raise IOError("Trajectory is closed")
if np.dtype(type(frame)) != np.dtype(int):
# convention... for netcdf could also be a slice
raise TypeError("frame must be a positive integer")
raise TypeError("frame must be a positive integer or zero")
if frame >= self.n_frames or frame < 0:
raise IndexError("frame index must be 0 <= frame < {0}".format(self.n_frames))
raise IndexError("frame index must be 0 <= frame < {0}".format(
self.n_frames))
# note: self.trjfile.variables['coordinates'].shape == (frames, n_atoms, 3)
ts._pos[:] = self.trjfile.variables['coordinates'][frame]
ts.time = self.trjfile.variables['time'][frame]
Expand Down Expand Up @@ -531,7 +532,12 @@ def _read_next_timestep(self, ts=None):
raise IOError

def close(self):
"""Close trajectory; any further access will raise an :exc:`IOError`"""
"""Close trajectory; any further access will raise an :exc:`IOError`.
.. Note:: The underlying :mod:`scipy.io.netcdf` module open netcdf
files with `mmap()`. Hence *any* reference to an array
*must* be removed before the file can be closed.
"""
if not self.trjfile is None:
self.trjfile.close()
self.trjfile = None
Expand Down Expand Up @@ -660,20 +666,16 @@ def _init_netcdf(self, periodic=True):
https://storage.googleapis.com/google-code-attachments/mdanalysis/issue-109/comment-2/netcdf4storage.py
"""
try:
import netCDF4 as netcdf
from scipy.io import netcdf
except ImportError:
logger.fatal(
"netcdf4-python with the netCDF and HDF5 libraries must be installed for the AMBER ncdf Writer.")
logger.fatal("See installation instructions at https://github.com/MDAnalysis/mdanalysis/wiki/netcdf")
raise ImportError("netCDF4 package missing.\n"
"netcdf4-python with the netCDF and HDF5 libraries must be installed for the AMBER ncdf "
"Writer.\n"
"See installation instructions at https://github.com/MDAnalysis/mdanalysis/wiki/netcdf")
logger.fatal("scip.io.netcdf must be installed for the AMBER ncdf Reader.")
raise ImportError("scipy.io.netcdf package missing but is required "
"for the Amber Reader.")

if not self._first_frame:
raise IOError(errno.EIO, "Attempt to write to closed file {0}".format(self.filename))

ncfile = netcdf.Dataset(self.filename, clobber=True, mode='w', format='NETCDF3_64BIT')
ncfile = netcdf.netcdf_file(self.filename, mode='w', version=2)

# Set global attributes.
setattr(ncfile, 'program', 'MDAnalysis.coordinates.TRJ.NCDFWriter')
Expand All @@ -691,44 +693,43 @@ def _init_netcdf(self, periodic=True):
ncfile.createDimension('label', 5) # needed for cell_angular

# Create variables.
coords = ncfile.createVariable('coordinates', 'f4', ('frame', 'atom', 'spatial'),
zlib=self.zlib, complevel=self.cmplevel)
coords = ncfile.createVariable('coordinates', 'f4',
('frame', 'atom', 'spatial'))
setattr(coords, 'units', 'angstrom')

spatial = ncfile.createVariable('spatial', 'c', ('spatial',))
spatial[:] = np.asarray(list('xyz'))

time = ncfile.createVariable('time', 'f4', ('frame',),
zlib=self.zlib, complevel=self.cmplevel)

time = ncfile.createVariable('time', 'f4', ('frame',))
setattr(time, 'units', 'picosecond')

self.periodic = periodic
if self.periodic:
cell_lengths = ncfile.createVariable('cell_lengths', 'f8', ('frame', 'cell_spatial'),
zlib=self.zlib, complevel=self.cmplevel)
cell_lengths = ncfile.createVariable('cell_lengths', 'f8',
('frame', 'cell_spatial'))
setattr(cell_lengths, 'units', 'angstrom')

cell_spatial = ncfile.createVariable('cell_spatial', 'c',
('cell_spatial',))
cell_spatial[:] = np.asarray(list('abc'))
cell_angles = ncfile.createVariable('cell_angles', 'f8', ('frame', 'cell_angular'),
zlib=self.zlib, complevel=self.cmplevel)

cell_angles = ncfile.createVariable('cell_angles', 'f8',
('frame', 'cell_angular'))
setattr(cell_angles, 'units', 'degrees')

cell_angular = ncfile.createVariable('cell_angular', 'c',
('cell_angular', 'label'))
cell_angular[:] = np.asarray([list('alpha'), list('beta '),
list('gamma')])

# These properties are optional, and are specified on Writer creation
if self.has_velocities:
velocs = ncfile.createVariable('velocities', 'f8', ('frame', 'atom', 'spatial'),
zlib=self.zlib, complevel=self.cmplevel)
velocs = ncfile.createVariable('velocities', 'f4',
('frame', 'atom', 'spatial'))
setattr(velocs, 'units', 'angstrom/picosecond')
if self.has_forces:
forces = ncfile.createVariable('forces', 'f8', ('frame', 'atom', 'spatial'),
zlib=self.zlib, complevel=self.cmplevel)
forces = ncfile.createVariable('forces', 'f4',
('frame', 'atom', 'spatial'))
setattr(forces, 'units', 'kilocalorie/mole/angstrom')

ncfile.sync()
Expand Down
12 changes: 2 additions & 10 deletions package/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,15 +35,8 @@
(Note that the group really is called `mdnalysis-discussion' because
Google groups forbids any name that contains the string `anal'.)
By default we use setuptools <http://pypi.python.org/pypi/setuptools>. The
details of such an "EasyInstall" installation procedure are shown on
http://peak.telecommunity.com/DevCenter/EasyInstall
By changing the code below you can also switch to a standard distutils
installation.
"""

from __future__ import print_function
from setuptools import setup, Extension, find_packages
from distutils.ccompiler import new_compiler
Expand Down Expand Up @@ -376,8 +369,7 @@ def extensions(config):
# you might prefer to use the version available through your
# packaging system
extras_require={
'AMBER': ['netCDF4>=1.0'], # for AMBER netcdf, also needs HDF5
# and netcdf-4
'AMBER': ['scipy'], # for AMBER netcdf, (does NOT need HDF5)
'analysis': [
'matplotlib',
'scipy',
Expand Down
17 changes: 8 additions & 9 deletions testsuite/MDAnalysisTests/coordinates/test_netcdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,26 +177,25 @@ def _check_new_traj(self):
dim_new = nc_copy.dimensions[k]
except KeyError:
raise AssertionError("NCDFWriter did not write "
"dimension '{}'".format(k))
"dimension '{0}'".format(k))
else:
assert_equal(len(dim), len(dim_new),
err_msg="Dimension '{0}' size mismatch".format(k))


for k, v in nc_orig.variables.items():
try:
v_new = nc_copy.variables[k]
except KeyError:
raise AssertionError("NCDFWriter did not write "
"variable '{}'".format(k))
"variable '{0}'".format(k))
else:
try:
assert_array_almost_equal(v[:], v_new[:], self.prec,
err_msg="Variable '{}' not "
err_msg="Variable '{0}' not "
"written correctly".format(k))
except TypeError:
assert_array_equal(v[:], v_new[:],
err_msg="Variable {} not written "
err_msg="Variable {0} not written "
"correctly".format(k))

@attr('slow')
Expand All @@ -214,14 +213,14 @@ def test_TRR2NCDF(self):
assert_array_almost_equal(written_ts._pos, orig_ts._pos, self.prec,
err_msg="coordinate mismatch between "
"original and written trajectory at "
"frame %d (orig) vs %d (written)" % (
orig_ts.frame, written_ts.frame))
"frame {0} (orig) vs {1} (written)".format(
orig_ts.frame, written_ts.frame))
assert_array_almost_equal(written_ts._velocities,
orig_ts._velocities, self.prec,
err_msg="velocity mismatch between "
"original and written trajectory at "
"frame %d (orig) vs %d (written)" % (
orig_ts.frame, written_ts.frame))
"frame {0} (orig) vs {1} (written)".format(
orig_ts.frame, written_ts.frame))
assert_almost_equal(orig_ts.time, written_ts.time, self.prec,
err_msg="Time for step {0} are not the "
"same.".format(orig_ts.frame))
Expand Down

0 comments on commit 4f4eef9

Please sign in to comment.