Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

read APBS .dx.gz generated grids #70

Merged
merged 11 commits into from
Nov 7, 2019
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions AUTHORS
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@ Contributors:
* Dominik Mierzejewski <rathann>
* Tyler Luchko <tluchko>
* Giacomo Fiorin <giacomofiorin>
* Eloy Félix <eloyfelix>
14 changes: 10 additions & 4 deletions gridData/OpenDX.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@
import re
from six import next
from six.moves import range
import gzip

import warnings

Expand Down Expand Up @@ -483,7 +484,7 @@ def write(self, filename):
for component,object in self.sorted_components():
outfile.write('component "%s" value %s\n' % (component,str(object.id)))

def read(self,file):
def read(self, file):
"""Read DX field from file.

dx = OpenDX.field.read(dxfile)
Expand Down Expand Up @@ -652,7 +653,7 @@ def __init__(self, filename):
}


def parse(self,DXfield):
def parse(self, DXfield):
"""Parse the dx file and construct a DX field object with component classes.

A :class:`field` instance *DXfield* must be provided to be
Expand All @@ -678,8 +679,13 @@ def parse(self,DXfield):
self.currentobject = None # containers for data
self.objects = [] # |
self.tokens = [] # token buffer
with open(self.filename, 'r') as self.dxfile:
self.use_parser('general') # parse the whole file and populate self.objects

eloyfelix marked this conversation as resolved.
Show resolved Hide resolved
if self.filename.endswith('.gz'):
with gzip.open(self.filename, 'rt') as self.dxfile:
self.use_parser('general')
else:
with open(self.filename, 'r') as self.dxfile:
self.use_parser('general') # parse the whole file and populate self.objects

# assemble field from objects
for o in self.objects:
Expand Down
13 changes: 12 additions & 1 deletion gridData/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@

import os
import errno
import gzip
eloyfelix marked this conversation as resolved.
Show resolved Hide resolved

import numpy

Expand Down Expand Up @@ -380,7 +381,11 @@ def _guess_format(self, filename, file_format=None, export=True):
else:
available = self._loaders
if file_format is None:
file_format = os.path.splitext(filename)[1][1:]
splitted = os.path.splitext(filename)
if splitted[1][1:] in ('gz', ):
file_format = os.path.splitext(splitted[0])[1][1:]
else:
file_format = splitted[1][1:]
file_format = file_format.upper()
if not file_format:
file_format = self.default_format
Expand Down Expand Up @@ -545,6 +550,12 @@ def _export_dx(self, filename, type=None, typequote='"', **kwargs):
)
dx = OpenDX.field('density', components=components, comments=comments)
dx.write(filename)
if ext == '.gz':
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The compressed writing should be handled in dx.write() instead of working on the uncompressed file. I would generally try to avoid any intermediate file writing.

Sorry, I didn't notice on the first round of reviews.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I moved the logic to dx.write but given how it was originally implemented it was a bit tricky. I needed to add few isinstance to check if I needed to write string or bytes depending on the stream

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also needed to implement a simple version of your 'openany' to deal with it:

https://github.com/eloyfelix/GridDataFormats/blob/DXGZ_input_format/gridData/OpenDX.py#L486

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see. I don't particularly like having to repeat the same code everywhere. But that's because of the poor structure of the original code (i.e., my fault ;-) where writing is actually done in each element of the DXfield. Can you think of a better code structure where we only need to do the encoding etc once? E.g., instead of having each element write, return a text representation that is then concatenated at the top level.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added write_line function: https://github.com/eloyfelix/GridDataFormats/blob/DXGZ_input_format/gridData/OpenDX.py#L193

It's probably not optimal but it's the best that I could do without doing major codebase changes (which I'm not sure I'm ready to do)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Your approach is fine, I just added a few minor change requests to clean up the code. Importantly, this is really cleaning up my mess – I would be very grateful if you could do it as part of this PR!

with open(filename, 'rb') as in_file:
with gzip.open(root + ext, 'wb') as out_file:
for l in in_file:
out_file.write(l)
os.remove(filename)

def save(self, filename):
"""Save a grid object to <filename>.pickle
Expand Down
Binary file added gridData/tests/datafiles/test.dx.gz
Binary file not shown.
16 changes: 7 additions & 9 deletions gridData/tests/test_dx.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,9 @@

from . import datafiles

def test_read_dx():
g = Grid(datafiles.DX)
@pytest.mark.parametrize("infile", [datafiles.DX, datafiles.DX+'.gz'])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Be explicit and use datafiles.DXGZ

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you also need to

  • add a test.dx.gz to test files
  • add DXGZ to datafiles/__init__.py

I don't know if one can make a fixture work in which you generate the gzipped file on the fly. I can think of the first part, but not how to use the fixture together with datafiles.DX in a parameterized test or fixture. But in any case, here's the code I can think of (uses tmpdir_factory because the temporary file should be created in a temporary location but the tmpfile fixture is not module level)

@pytest.fixture(scope="module")
def dxgz_file(tmpdir_factory, src=datafiles.DX):
    basename = os.path.basename(src) + '.gz'
    fn = tmpdir_factory.mktemp("compressed").join(basename)
    with open(src, 'rb') as inp:
        with gzip.open(str(fn), "wb") as out:
             out.write(inp.read())
    return str(fn)

# use dxgz_file fixture when needed ... although I am actually not sure how 
# you use a fixture as part of a parameterized fixture.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added test.dx.gz file and DXGZ to datafiles/init.py but I'm not sure about the fixture...

def test_read_dx(infile):
g = Grid(infile)
POINTS = 8
ref = np.ones(POINTS)
ref[4] = 1e-6
Expand All @@ -21,7 +22,7 @@ def test_read_dx():
assert_equal(g.delta, np.ones(3))
assert_equal(g.origin, np.array([20.1, 3., -10.]))


@pytest.mark.parametrize("outfile", ["grid.dx", "grid.dx.gz"])
orbeckst marked this conversation as resolved.
Show resolved Hide resolved
@pytest.mark.parametrize("nptype,dxtype", [
("float16", "float"),
("float32", "float"),
Expand All @@ -35,7 +36,7 @@ def test_read_dx():
("int8", "signed byte"),
("uint8", "byte"),
])
def test_write_dx(tmpdir, nptype, dxtype, counts=100, ndim=3):
def test_write_dx(tmpdir, nptype, dxtype, outfile, counts=100, ndim=3):
# conversion from numpy array to DX file

h, edges = np.histogramdd(np.random.random((counts, ndim)), bins=10)
Expand All @@ -47,7 +48,6 @@ def test_write_dx(tmpdir, nptype, dxtype, counts=100, ndim=3):
assert_equal(g.grid.sum(), counts)

with tmpdir.as_cwd():
outfile = "grid.dx"
g.export(outfile)
g2 = Grid(outfile)

Expand All @@ -66,9 +66,10 @@ def test_write_dx(tmpdir, nptype, dxtype, counts=100, ndim=3):

assert_equal(out_dxtype, dxtype)

@pytest.mark.parametrize("outfile", ["grid.dx", "grid.dx.gz"])
@pytest.mark.parametrize('nptype', ("complex64", "complex128", "bool_"))
@pytest.mark.filterwarnings("ignore:array dtype.name =")
def test_write_dx_ValueError(tmpdir, nptype, counts=100, ndim=3):
def test_write_dx_ValueError(tmpdir, nptype, outfile, counts=100, ndim=3):
h, edges = np.histogramdd(np.random.random((counts, ndim)), bins=10)
g = Grid(h, edges)

Expand All @@ -77,8 +78,5 @@ def test_write_dx_ValueError(tmpdir, nptype, counts=100, ndim=3):

with pytest.raises(ValueError):
with tmpdir.as_cwd():
outfile = "grid.dx"
g.export(outfile)