Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow lazy loading of DataContainers #367

Merged
merged 19 commits into from
Aug 23, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 50 additions & 11 deletions pyiron_base/generic/datacontainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@

import numpy as np

from pyiron_base.generic.fileio import read, write
from pyiron_base.generic.hdfstub import HDFStub
from pyiron_base.interfaces.has_groups import HasGroups
from .fileio import read, write

__author__ = "Marvin Poul"
__copyright__ = (
Expand Down Expand Up @@ -48,7 +49,6 @@ def _normalize(key):

return key


class DataContainer(MutableMapping, HasGroups):
"""
Mutable sequence with optional keys.
Expand Down Expand Up @@ -157,7 +157,6 @@ class DataContainer(MutableMapping, HasGroups):
>>> list(pl.keys())
[0, 1, 2, 3]


Implements :class:`.HasGroups`. Groups are nested data containers and nodes are everything else.

>>> p = DataContainer({"a": 42, "b": [0, 1, 2]})
Expand All @@ -166,6 +165,12 @@ class DataContainer(MutableMapping, HasGroups):
>>> p.list_nodes()
['a']

If instantiated with the argument `lazy=True`, data read from HDF5 later via :method:`.from_hdf` are not actually
read, but only earmarked to be read later when actually accessed via :class:`.HDFStub`.  This is largely
transparent, i.e. when accessing an earmarked value it will automatically be loaded and this loaded value is
stored in the container.  The only difference is in the string representation of the container: values not read
yet appear as 'HDFStub(...)' in the output.

.. attention:: Subclasses beware!

DataContainer require some careful treatment when creating subclasses.
Expand All @@ -183,13 +188,14 @@ class DataContainer(MutableMapping, HasGroups):
of attributes it is better to create a new class that has an DataContainer as an attribute and dispatch to the
:meth:`DataContainer.from_hdf`, :meth:`DataContainer.to_hdf` and :meth:`DataContainer._repr_json_`
methods.
4. To allow lazy loading sub classes must accept a `lazy` keyword argument and pass it to `super().__init__`.


A few examples for subclasses

>>> class ExtendedContainer(DataContainer):
... def __init__(self, init=None, my_fancy_field=42, table_name=None):
... super().__init__(init=init, table_name=table_name)
... def __init__(self, init=None, my_fancy_field=42, table_name=None, lazy=False):
... super().__init__(init=init, table_name=table_name, lazy=lazy)
... object.__setattr__(self, "my_fancy_field", my_fancy_field)

After defining it once like this you can access my_fancy_field as a normal attribute, but it will not be stored in
Expand Down Expand Up @@ -245,11 +251,22 @@ def __new__(cls, *args, **kwargs):
object.__setattr__(instance, "_indices", {})
object.__setattr__(instance, "table_name", None)
object.__setattr__(instance, "_read_only", False)
object.__setattr__(instance, "_lazy", False)

return instance

def __init__(self, init=None, table_name=None):
def __init__(self, init=None, table_name=None, lazy=False):
"""
Create new container.

Args:
init (Sequence, Mapping): initial data for the container, nested occurances of Sequence and Mapping are
translated to nested containers
table_name (str): default name of the data container in HDF5
lazy (bool): if True, use :class:`.HDFStub` to load values lazily from HDF5
"""
self.table_name = table_name
self._lazy = lazy
if init is not None:
self.update(init, wrap=True)

Expand All @@ -272,13 +289,23 @@ def __getitem__(self, key):

elif isinstance(key, int):
try:
return self._store[key]
v = self._store[key]
if not isinstance(v, HDFStub):
return v
else:
v = self._store[key] = v.load()
return v
except IndexError:
raise IndexError("list index out of range") from None

elif isinstance(key, str):
try:
return self._store[self._indices[key]]
v = self._store[self._indices[key]]
if not isinstance(v, HDFStub):
return v
else:
v = self._store[self._indices[key]] = v.load()
return v
except KeyError:
raise KeyError(repr(key)) from None

Expand Down Expand Up @@ -383,7 +410,10 @@ def __dir__(self):
def __repr__(self):
name = self.__class__.__name__
if self.has_keys():
return name + "({" + ", ".join("{!r}: {!r}".format(k, v) for k, v in self.items()) + "})"
# access _store and _indices directly to avoid forcing HDFStubs
index2key = {v: k for k, v in self._indices.items()}
return name + "({" + ", ".join("{!r}: {!r}".format(index2key.get(i, i), self._store[i])
for i in range(len(self))) + "})"
else:
return name + "([" + ", ".join("{!r}".format(v) for v in self._store) + "])"

Expand Down Expand Up @@ -740,9 +770,11 @@ def normalize_key(name):
for n in hdf.list_nodes():
if n in _internal_hdf_nodes:
continue
items.append( (*normalize_key(n), hdf[n]))
items.append( (*normalize_key(n), hdf[n] if not self._lazy else HDFStub(hdf, n)) )
for g in hdf.list_groups():
items.append( (*normalize_key(g), hdf[g].to_object()))
items.append( (*normalize_key(g), hdf[g].to_object() if not self._lazy else HDFStub(hdf, g)) )


for _, k, v in sorted(items, key=lambda x: x[0]):
self[k] = v

Expand Down Expand Up @@ -803,3 +835,10 @@ def write(self, file_name):
file_name(str): the name of the file to be writen to.
"""
write(self.to_builtin(), file_name)

def __init_subclass__(cls):
# called whenever a subclass of DataContainer is defined, then register all subclasses with the same function
# that the DataContainer is registered
HDFStub.register(cls, lambda h, g: h[g].to_object(lazy=True))

HDFStub.register(DataContainer, lambda h, g: h[g].to_object(lazy=True))
101 changes: 101 additions & 0 deletions pyiron_base/generic/hdfstub.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
"""
Convenience class to lazily read values from HDF.
"""

# Copyright (c) Max-Planck-Institut für Eisenforschung GmbH - Computational Materials Design (CM) Department
# Distributed under the terms of "New BSD License", see the LICENSE file.

__author__ = "Marvin Poul"
__copyright__ = (
"Copyright 2020, Max-Planck-Institut für Eisenforschung GmbH - "
"Computational Materials Design (CM) Department"
)
__version__ = "1.0"
__maintainer__ = "Marvin Poul"
__email__ = "[email protected]"
__status__ = "production"
__date__ = "Apr 26, 2021"


class HDFStub:
    """
    Provides lazy loading of data from HDF.

    Instead of accessing an HDF group directly

    >>> hdf[group_name]
    ...

    you can wrap this with this class

    >>> stub = HDFStub(hdf, group_name)

    and then later perform this lookup with :method:`.load`

    >>> stub.load() == hdf[group_name]
    True

    For simple datatypes there's not a big advantage to this, but :class:`.DataContainer` uses this to load its
    contents lazily and ensure that nested containers are also lazily loaded.  This is done by customizing what
    happens on :method:`.load` via :method:`.register`.  This class method adds a callback to the class that will be
    called when the specified type name is found in the hdf group that is to be loaded.

    >>> hdf['mytype/NAME']
    MyType
    >>> hdf['mytype/TYPE']
    <class 'my.module.MyType'>
    >>> HDFStub.register(MyType, lambda hdf, group: print(42) or hdf[group].to_object())
    >>> my = HDFStub(hdf, 'mytype').load()
    42
    >>> my
    MyType(...)

    This is intended to allow classes that want to be lazily loaded in a certain way to customize what arguments they
    pass `to_object()` (and therefore to their own initializers).
    """

    # maps str(type) -> callback; shared class-level registry used by load()
    _load_functions = {}

    def __init__(self, hdf, group_name):
        """
        Create new stub.

        Args:
            hdf (:class:`.ProjectHDFio`): hdf object to load from
            group_name (str): node or group name to load from the hdf object
        """
        self._hdf = hdf
        self._group_name = group_name

    @classmethod
    def register(cls, type, load):
        """
        Register call back for a new type.

        Args:
            type (type): class to be registered
            load (function): callback that is called on :method:`.load` when the type matches `type_name`, must
                             accept `hdf` and `group_name` corresponding to the init parameters of this class and
                             return the (lazily) loaded object
        """
        # NOTE: `type` shadows the builtin of the same name; the builtin is not needed in this short method and the
        # parameter name is kept for backwards compatibility with existing keyword callers.
        cls._load_functions[str(type)] = load

    def load(self):
        """
        Read value from HDF.

        If `group_name` is a node in HDF, simply its value will be returned.  If it is a group in HDF and its 'TYPE'
        node matches any of the types registered with :method:`.register`, it will be loaded with the provided
        callback.  Otherwise it will be loaded with :method:`.ProjectHDFio.to_object()`.
        """
        # plain nodes and groups without type information are returned eagerly as-is
        if self._group_name in self._hdf.list_nodes() or 'TYPE' not in self._hdf[self._group_name].list_nodes():
            return self._hdf[self._group_name]

        load = self._load_functions.get(
                self._hdf[self._group_name]['TYPE'],
                # default: generic object reconstruction via to_object()
                lambda h, g: h[g].to_object()
        )
        return load(self._hdf, self._group_name)

    def __repr__(self):
        return f"{self.__class__.__name__}({self._hdf}, {self._group_name})"
51 changes: 49 additions & 2 deletions tests/generic/test_datacontainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# Distributed under the terms of "New BSD License", see the LICENSE file.
from pyiron_base._tests import TestWithCleanProject
from pyiron_base.generic.datacontainer import DataContainer
from pyiron_base.generic.hdfstub import HDFStub
from pyiron_base.generic.inputlist import InputList
from collections import Iterator
import copy
Expand All @@ -12,8 +13,8 @@


class Sub(DataContainer):
    """DataContainer subclass used in the tests to check that subclass behavior (extra attributes, lazy loading)
    is preserved when storing to and reading from HDF."""

    def __init__(self, init=None, table_name=None, lazy=False):
        # forward `lazy` so lazy loading also works for this subclass
        super().__init__(init=init, table_name=table_name, lazy=lazy)
        self.foo = 42


Expand Down Expand Up @@ -459,6 +460,52 @@ def test_subclass_preservation(self):
)
self.pl.pop('subclass')

def test_stub(self):
"""Lazily loaded containers should contain only stubs and only force them when directly accessed."""

self.pl.to_hdf(self.hdf, "lazy")
ll = self.hdf["lazy"].to_object(lazy=True)
self.assertTrue(all(isinstance(v, HDFStub) for v in ll._store),
"Not all values loaded as stubs!")

repr(ll)
self.assertTrue(all(isinstance(v, HDFStub) for v in ll._store),
"Some stubs have been loaded after getting string repr of container!")

ll0 = ll[0]
self.assertTrue(all(isinstance(v, HDFStub) for v in ll0._store),
"Recursive datacontainers not lazily loaded!")

self.assertEqual(ll[0].foo, self.pl[0].foo,
"Lazily loaded list not equal to orignal list!")

self.assertTrue(not isinstance(ll._store[0], HDFStub),
"Loaded value not stored back into container!")

def test_stub_sublasses(self):
"""Sub classes of DataContainer should also be able to be lazily loaded."""

sl = Sub(self.pl.to_builtin())

sl.to_hdf(self.hdf, "lazy_sub")
ll = Sub(lazy=True)
ll.from_hdf(self.hdf, "lazy_sub")
self.assertTrue(all(isinstance(v, HDFStub) for v in ll._store),
"Not all values loaded as stubs!")

repr(ll)
self.assertTrue(all(isinstance(v, HDFStub) for v in ll._store),
"Some stubs have been loaded after getting string repr of container!")

ll0 = ll[0]
self.assertTrue(all(isinstance(v, HDFStub) for v in ll0._store),
"Recursive datacontainers not lazily loaded!")

self.assertEqual(ll[0].foo, sl[0].foo,
"Lazily loaded list not equal to orignal list!")

self.assertTrue(not isinstance(ll._store[0], HDFStub),
"Loaded value not stored back into container!")

class TestInputList(unittest.TestCase):

Expand Down
25 changes: 25 additions & 0 deletions tests/generic/test_hdfstub.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from pyiron_base._tests import TestWithProject
from pyiron_base.generic.datacontainer import DataContainer
from pyiron_base.generic.hdfstub import HDFStub

import numpy as np

class TestHDFStub(TestWithProject):
    """Check that HDFStub reproduces eager HDF reads for simple nodes and containers."""

    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        # populate an HDF file with a plain number, a numpy array and a nested data container
        cls.hdf = cls.project.create_hdf(cls.project.path, "hdf")
        cls.hdf["number"] = 42
        cls.hdf["array"] = np.arange(100)
        cls.data = DataContainer([1, 2, "three", 4.0])
        cls.data.to_hdf(cls.hdf, "data")

    def test_load(self):
        """Lazily and eagerly read values should be the same."""
        number = HDFStub(self.hdf, "number").load()
        self.assertEqual(number, self.hdf["number"],
                         "Simple number read with load() not equal to eager read.")
        array = HDFStub(self.hdf, "array").load()
        self.assertTrue(np.all(array == self.hdf["array"]),
                        "Numpy array read with load() not equal to eager read.")
        loaded = HDFStub(self.hdf, "data").load()
        for v1, v2 in zip(loaded, self.data):
            self.assertEqual(v1, v2, "Data container values read with load() not equal to original container.")