Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[TEP014] Added HDFWriter class + Unit Tests #744

Merged
merged 6 commits into from
Jun 20, 2017
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 125 additions & 0 deletions tardis/io/tests/test_HDFWriter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
import os

import numpy as np
import pandas as pd
import pandas.util.testing as pdt
import pytest
from astropy import units as u
from astropy.tests.helper import assert_quantity_allclose
from numpy.testing import assert_almost_equal, assert_array_almost_equal

from tardis.io.util import HDFWriter


#Test Cases

#DataFrame
#None
#Numpy Arrays
#Strings
#Numeric Values
#Pandas Series Object
#MultiIndex Object
#Quantity Objects with - Numeric Values, Numpy Arrays, DataFrame, Pandas Series, None objects

class MockHDF(HDFWriter, object):
    """Minimal HDFWriter subclass used to exercise ``to_hdf`` round-trips.

    Holds a single value in the attribute ``property`` and declares it as
    the only HDF-serializable property.
    """
    # NOTE(review): ``property`` shadows the builtin; kept because the
    # parametrized tests below rely on this attribute/parameter name.
    hdf_properties = ['property']
    class_properties = {}

    def __init__(self, property):
        self.property = property

# Scalar values (floats and a string) that should land under .../scalars.
simple_objects = [1.5, 'random_string', 4.2e7]


@pytest.mark.parametrize("attr", simple_objects)
def test_simple_write(tmpdir, attr):
    """Round-trip a scalar through HDFWriter and read it back."""
    hdf_file = str(tmpdir.mkdir('data').join('test.hdf'))
    actual = MockHDF(attr)
    actual.to_hdf(hdf_file, path='test')
    written = pd.read_hdf(hdf_file, key='/test/mock_hdf/scalars')['property']
    assert actual.property == written

# A DataFrame with unequal column indices; column 'one' gains a NaN at 'd'.
_series_one = pd.Series([1., 2., 3.], index=['a', 'b', 'c'])
_series_two = pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])
mock_df = pd.DataFrame({'one': _series_one, 'two': _series_two})

# A 1-D ndarray, a Series and a DataFrame each get their own store group.
complex_objects = [np.array([4.0e14, 2, 2e14, 27.5]),
                   pd.Series([1., 2., 3.]),
                   mock_df]


@pytest.mark.parametrize("attr", complex_objects)
def test_complex_obj_write(tmpdir, attr):
    """Round-trip array-like objects and compare element-wise."""
    hdf_file = str(tmpdir.mkdir('data').join('test.hdf'))
    actual = MockHDF(attr)
    actual.to_hdf(hdf_file, path='test')
    written = pd.read_hdf(hdf_file, key='/test/mock_hdf/property').values
    assert_array_almost_equal(actual.property, written)

# Two parallel label levels used to build a MultiIndex below.
level_one = ['L1', 'L1', 'L2', 'L2', 'L3', 'L3', 'L4', 'L4']
level_two = ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']
arrays = [level_one, level_two]
tuples = [(first, second) for first, second in zip(level_one, level_two)]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is a very complicated way to create these arrays. I'd suggest you use np.array(...) and then simply transpose it.

mock_multiIndex = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])


def test_MultiIndex_write(tmpdir):
    """A MultiIndex is stored as a Series and must rebuild identically."""
    hdf_file = str(tmpdir.mkdir('data').join('test.hdf'))
    actual = MockHDF(mock_multiIndex)
    actual.to_hdf(hdf_file, path='test')
    stored = pd.read_hdf(hdf_file, key='/test/mock_hdf/property')
    rebuilt = pd.MultiIndex.from_tuples(stored.unstack().values,
                                        names=['first', 'second'])
    pdt.assert_almost_equal(actual.property, rebuilt)

#Test Quantity Objects

# Unit-carrying objects: only the CGS magnitude should be persisted.
quantity_objects = [np.array([4.0e14, 2, 2e14, 27.5]), mock_df]


@pytest.mark.parametrize("attr", quantity_objects)
def test_quantity_objects_write(tmpdir, attr):
    """Quantities are stored as their unitless CGS values."""
    hdf_file = str(tmpdir.mkdir('data').join('test.hdf'))
    with_units = u.Quantity(attr, 'g/cm**3')
    actual = MockHDF(with_units)
    actual.to_hdf(hdf_file, path='test')
    written = pd.read_hdf(hdf_file, key='/test/mock_hdf/property')
    assert_array_almost_equal(actual.property.cgs.value, written)

scalar_quantity_objects = [1.5, 4.2e7]


@pytest.mark.parametrize("attr", scalar_quantity_objects)
def test_scalar_quantity_objects_write(tmpdir, attr):
    """Scalar Quantities are stored (as CGS values) under .../scalars."""
    fname = str(tmpdir.mkdir('data').join('test.hdf'))
    attr_quantity = u.Quantity(attr, 'g/cm**3')
    actual = MockHDF(attr_quantity)
    actual.to_hdf(fname, path='test')
    # Key normalized (no trailing slash) for consistency with
    # test_simple_write.
    expected = pd.read_hdf(fname, key='/test/mock_hdf/scalars')['property']
    assert_array_almost_equal(actual.property.cgs.value, expected)

def test_none_write(tmpdir):
    """``None`` is stored as the sentinel string 'none' and mapped back."""
    fname = str(tmpdir.mkdir('data').join('test.hdf'))
    actual = MockHDF(None)
    actual.to_hdf(fname, path='test')
    # Key normalized (no trailing slash) for consistency with
    # test_simple_write.
    expected = pd.read_hdf(fname, key='/test/mock_hdf/scalars')['property']
    if expected == 'none':
        expected = None
    assert actual.property == expected

# Test class_properties parameter (like homologous_density is a class
# instance/object inside Model class)

class MockClass(HDFWriter, object):
    """HDFWriter subclass that nests another HDFWriter instance.

    Mimics the Model class, which holds a homologous_density object;
    ``class_properties`` tells HDFWriter which attributes are themselves
    HDFWriter instances to recurse into.
    """
    hdf_properties = ['property', 'nested_object']
    class_properties = {'nested_object': MockHDF}

    def __init__(self, property, nested_object):
        self.property = property
        self.nested_object = nested_object

@pytest.mark.parametrize("attr", quantity_objects)
def test_objects_write(tmpdir, attr):
    """Nested HDFWriter objects are written under their own sub-path."""
    hdf_file = str(tmpdir.mkdir('data').join('test.hdf'))
    inner = MockHDF(np.array([4.0e14, 2, 2e14, 27.5]))
    outer = MockClass(u.Quantity(attr, 'g/cm**3'), inner)
    outer.to_hdf(hdf_file, path='test')
    written_property = pd.read_hdf(hdf_file, key='/test/mock_class/property')
    assert_array_almost_equal(outer.property.cgs.value, written_property)
    written_nested = pd.read_hdf(
        hdf_file, key='/test/mock_class/nested_object/property')
    assert_array_almost_equal(outer.nested_object.property, written_nested)
101 changes: 101 additions & 0 deletions tardis/io/util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#Utility functions for the IO part of TARDIS

import os
import re
import pandas as pd
import numpy as np
import collections
Expand Down Expand Up @@ -166,6 +167,106 @@ def check_equality(item1, item2):
return True


class HDFWriter(object):

@staticmethod
def to_hdf_util(path_or_buf, path, elements, complevel=9, complib='blosc'):
    """
    A function to uniformly store TARDIS data
    to an HDF file.

    Scalars will be stored in a Series under path/scalars
    1D arrays will be stored under path/property_name as distinct Series
    2D arrays will be stored under path/property_name as distinct DataFrames

    Units will be stored as their CGS value; ``None`` is stored as the
    sentinel string ``'none'``.

    Parameters
    ----------
    path_or_buf:
        Path or buffer to the HDF store
    path: str
        Path inside the HDF store to store the `elements`
    elements: dict
        A dict of property names and their values to be
        stored.
    complevel: int
        Compression level forwarded to `pandas.HDFStore`.
    complib: str
        Compression library forwarded to `pandas.HDFStore`.

    Returns
    -------
    None
    """
    scalars = {}
    # NOTE(review): os.path.join builds HDF *store keys* here; on Windows
    # it would emit backslashes -- confirm this only runs on POSIX or
    # switch to an explicit '/'.join.
    # .items() instead of the Python-2-only .iteritems(); works on 2 and 3.
    for key, value in elements.items():
        if value is None:
            value = 'none'
        if hasattr(value, 'cgs'):
            # Strip units; only the CGS magnitude is persisted.
            value = value.cgs.value
        if np.isscalar(value):
            scalars[key] = value
        elif hasattr(value, 'shape'):
            if value.ndim == 1:
                # This try,except block is only for model.plasma.levels
                try:
                    pd.Series(value).to_hdf(path_or_buf,
                                            os.path.join(path, key))
                except NotImplementedError:
                    pd.DataFrame(value).to_hdf(path_or_buf,
                                               os.path.join(path, key))
            else:
                pd.DataFrame(value).to_hdf(
                    path_or_buf, os.path.join(path, key))
        else:
            # Objects that implement to_hdf themselves (e.g. MultiIndex)
            # write directly; everything else is boxed into a DataFrame.
            try:
                value.to_hdf(path_or_buf, path, name=key)
            except AttributeError:
                data = pd.DataFrame([value])
                data.to_hdf(path_or_buf, os.path.join(path, key))

    if scalars:
        scalars_series = pd.Series(scalars)

        # Unfortunately, with to_hdf we cannot append, so merge beforehand
        scalars_path = os.path.join(path, 'scalars')
        with pd.HDFStore(path_or_buf, complevel=complevel,
                         complib=complib) as store:
            if scalars_path in store:
                scalars_series = store[scalars_path].append(scalars_series)
            scalars_series.to_hdf(path_or_buf, scalars_path)

def get_properties(self):
data = {name: getattr(self, name) for name in self.hdf_properties}
return data

@staticmethod
def convert_to_camel_case(s):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As I understand it, what we are doing here is converting to snake_case. Please rename the function accordingly.

s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', s)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you please explain to me why we need two substitutions?
Additionally I don't understand why you added a [a-z]+ at the end of the capture group. This prevents AB from being converted to a_b, which I think we want?

Copy link
Contributor Author

@vg3095 vg3095 Jun 14, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@yeganer I used this regex from this Stack Overflow thread.

return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As mentioned above, I don't understand why we need this second substitution as the first seems to cover everything?

Maybe you can add some simple tests for some strings like CamelCase, ABC, ABcdefG, snake_Case?


def to_hdf(self, file_path, path='', name=None):
    """Write this object's ``hdf_properties`` to an HDF store.

    Parameters
    ----------
    file_path: str
        Path or buffer to the HDF store
    path: str
        Path inside the HDF store to store the `elements`
    name: str
        Group inside the HDF store to which the `elements` need to be
        saved; defaults to ``self.hdf_name`` or, failing that, the
        snake_cased class name.

    Returns
    -------
    None
    """
    if name is None:
        # Prefer an explicit hdf_name; otherwise derive from the class.
        name = getattr(self, 'hdf_name',
                       self.convert_to_camel_case(self.__class__.__name__))

    self.to_hdf_util(file_path, os.path.join(path, name),
                     self.get_properties())

#Deprecated
def to_hdf(path_or_buf, path, elements, complevel=9, complib='blosc'):
"""
A function to uniformly store TARDIS data
Expand Down