Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow lazy loading of DataContainers #367

Merged
merged 19 commits into from
Aug 23, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 50 additions & 11 deletions pyiron_base/generic/datacontainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@

import numpy as np

from pyiron_base.generic.fileio import read, write
from pyiron_base.generic.hdfstub import HDFStub
from pyiron_base.interfaces.has_groups import HasGroups
from .fileio import read, write

__author__ = "Marvin Poul"
__copyright__ = (
Expand Down Expand Up @@ -48,7 +49,6 @@ def _normalize(key):

return key


class DataContainer(MutableMapping, HasGroups):
"""
Mutable sequence with optional keys.
Expand Down Expand Up @@ -157,7 +157,6 @@ class DataContainer(MutableMapping, HasGroups):
>>> list(pl.keys())
[0, 1, 2, 3]


Implements :class:`.HasGroups`. Groups are nested data containers and nodes are everything else.

>>> p = DataContainer({"a": 42, "b": [0, 1, 2]})
Expand All @@ -166,6 +165,12 @@ class DataContainer(MutableMapping, HasGroups):
>>> p.list_nodes()
['a']

If instantiated with the argument `lazy=True`, data read from HDF5 later via :method:`.from_hdf` are not actually
read, but only earmarked to be read later when actually accessed via :class:`.HDFStub`.  This is largely
transparent, i.e. when accessing an earmarked value it will automatically be loaded and this loaded value is
stored in the container.  The only difference is in the string representation of the container: values not read
yet appear as 'HDFStub(...)' in the output.

.. attention:: Subclasses beware!

DataContainer require some careful treatment when creating subclasses.
Expand All @@ -183,13 +188,14 @@ class DataContainer(MutableMapping, HasGroups):
of attributes it is better to create a new class that has an DataContainer as an attribute and dispatch to the
:meth:`DataContainer.from_hdf`, :meth:`DataContainer.to_hdf` and :meth:`DataContainer._repr_json_`
methods.
4. To allow lazy loading sub classes must accept a `lazy` keyword argument and pass it to `super().__init__`.


A few examples for subclasses

>>> class ExtendedContainer(DataContainer):
... def __init__(self, init=None, my_fancy_field=42, table_name=None):
... super().__init__(init=init, table_name=table_name)
... def __init__(self, init=None, my_fancy_field=42, table_name=None, lazy=False):
... super().__init__(init=init, table_name=table_name, lazy=lazy)
... object.__setattr__(self, "my_fancy_field", my_fancy_field)

After defining it once like this you can access my_fancy_field as a normal attribute, but it will not be stored in
Expand Down Expand Up @@ -245,11 +251,22 @@ def __new__(cls, *args, **kwargs):
object.__setattr__(instance, "_indices", {})
object.__setattr__(instance, "table_name", None)
object.__setattr__(instance, "_read_only", False)
object.__setattr__(instance, "_lazy", False)

return instance

def __init__(self, init=None, table_name=None):
def __init__(self, init=None, table_name=None, lazy=False):
"""
Create new container.

Args:
init (Sequence, Mapping): initial data for the container, nested occurances of Sequence and Mapping are
translated to nested containers
table_name (str): default name of the data container in HDF5
lazy (bool): if True, use :class:`.HDFStub` to load values lazily from HDF5
"""
self.table_name = table_name
self._lazy = lazy
if init is not None:
self.update(init, wrap=True)

Expand All @@ -272,13 +289,23 @@ def __getitem__(self, key):

elif isinstance(key, int):
try:
return self._store[key]
v = self._store[key]
if not isinstance(v, HDFStub):
return v
else:
v = self._store[key] = v.load()
return v
except IndexError:
raise IndexError("list index out of range") from None

elif isinstance(key, str):
try:
return self._store[self._indices[key]]
v = self._store[self._indices[key]]
if not isinstance(v, HDFStub):
return v
else:
v = self._store[self._indices[key]] = v.load()
return v
except KeyError:
raise KeyError(repr(key)) from None

Expand Down Expand Up @@ -383,7 +410,10 @@ def __dir__(self):
def __repr__(self):
name = self.__class__.__name__
if self.has_keys():
return name + "({" + ", ".join("{!r}: {!r}".format(k, v) for k, v in self.items()) + "})"
# access _store and _indices directly to avoid forcing HDFStubs
index2key = {v: k for k, v in self._indices.items()}
return name + "({" + ", ".join("{!r}: {!r}".format(index2key.get(i, i), self._store[i])
for i in range(len(self))) + "})"
else:
return name + "([" + ", ".join("{!r}".format(v) for v in self._store) + "])"

Expand Down Expand Up @@ -740,9 +770,11 @@ def normalize_key(name):
for n in hdf.list_nodes():
if n in _internal_hdf_nodes:
continue
items.append( (*normalize_key(n), hdf[n]))
items.append( (*normalize_key(n), hdf[n] if not self._lazy else HDFStub(hdf, n)) )
for g in hdf.list_groups():
items.append( (*normalize_key(g), hdf[g].to_object()))
items.append( (*normalize_key(g), hdf[g].to_object() if not self._lazy else HDFStub(hdf, g)) )


for _, k, v in sorted(items, key=lambda x: x[0]):
self[k] = v

Expand Down Expand Up @@ -803,3 +835,10 @@ def write(self, file_name):
file_name(str): the name of the file to be writen to.
"""
write(self.to_builtin(), file_name)

def __init_subclass__(cls):
# called whenever a subclass of DataContainer is defined, then register all subclasses with the same function
# that the DataContainer is registered
HDFStub.register(cls, lambda h, g: h[g].to_object(lazy=True))

HDFStub.register(DataContainer, lambda h, g: h[g].to_object(lazy=True))
101 changes: 101 additions & 0 deletions pyiron_base/generic/hdfstub.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
"""
Convenience class to lazily read values from HDF.
"""

# Copyright (c) Max-Planck-Institut für Eisenforschung GmbH - Computational Materials Design (CM) Department
# Distributed under the terms of "New BSD License", see the LICENSE file.

__author__ = "Marvin Poul"
__copyright__ = (
"Copyright 2020, Max-Planck-Institut für Eisenforschung GmbH - "
"Computational Materials Design (CM) Department"
)
__version__ = "1.0"
__maintainer__ = "Marvin Poul"
__email__ = "[email protected]"
__status__ = "production"
__date__ = "Apr 26, 2021"


class HDFStub:
    """
    Provides lazy loading of data from HDF.

    Instead of accessing an HDF group directly

    >>> hdf[group_name]
    ...

    you can wrap this with this class

    >>> stub = HDFStub(hdf, group_name)

    and then later perform this lookup with :method:`.load`

    >>> stub.load() == hdf[group_name]
    True

    For simple datatypes there's not a big advantage to this, but :class:`.DataContainer` uses this to load its
    contents lazily and ensure that nested containers are also lazily loaded.  This is done by customizing what
    happens on :method:`.load` via :method:`.register`.  This class method adds a callback to the class that will be
    called when the specified type name is found in the hdf group that is to be loaded.

    >>> hdf['mytype/NAME']
    MyType
    >>> hdf['mytype/TYPE']
    <class 'my.module.MyType'>
    >>> HDFStub.register(MyType, lambda hdf, group: print(42) or hdf[group].to_object())
    >>> my = HDFStub(hdf, 'mytype').load()
    42
    >>> my
    MyType(...)

    This is intended to allow classes that want to be lazily loaded in a certain way to customize what arguments they
    pass `to_object()` (and therefore to their own initializers).
    """

    # maps str(type) -> callback; shared class-level registry used by load()
    _load_functions = {}

    def __init__(self, hdf, group_name):
        """
        Create new stub.

        Args:
            hdf (:class:`.ProjectHDFio`): hdf object to load from
            group_name (str): node or group name to load from the hdf object
        """
        self._hdf = hdf
        self._group_name = group_name

    @classmethod
    def register(cls, type, load):
        """
        Register call back for a new type.

        Args:
            type (type): class to be registered
            load (function): callback that is called on :method:`.load` when the type matches `type_name`, must
                             accept `hdf` and `group_name` corresponding to the init parameters of this class and
                             return the (lazily) loaded object
        """
        # NOTE: `type` shadows the builtin of the same name; the builtin is not needed in this short method and the
        # parameter name is kept for backwards compatibility with existing keyword callers.
        cls._load_functions[str(type)] = load

    def load(self):
        """
        Read value from HDF.

        If `group_name` is a node in HDF, simply its value will be returned.  If it is a group in HDF and its 'TYPE'
        node matches any of the types registered with :method:`.register`, it will be loaded with the provided
        callback.  Otherwise it will be loaded with :method:`.ProjectHDFio.to_object()`.
        """
        # plain nodes and groups without type information are returned eagerly as-is
        if self._group_name in self._hdf.list_nodes() or 'TYPE' not in self._hdf[self._group_name].list_nodes():
            return self._hdf[self._group_name]

        load = self._load_functions.get(
                self._hdf[self._group_name]['TYPE'],
                # default: generic object reconstruction via to_object()
                lambda h, g: h[g].to_object()
        )
        return load(self._hdf, self._group_name)

    def __repr__(self):
        return f"{self.__class__.__name__}({self._hdf}, {self._group_name})"
51 changes: 49 additions & 2 deletions tests/generic/test_datacontainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# Distributed under the terms of "New BSD License", see the LICENSE file.
from pyiron_base._tests import TestWithCleanProject
from pyiron_base.generic.datacontainer import DataContainer
from pyiron_base.generic.hdfstub import HDFStub
from pyiron_base.generic.inputlist import InputList
from collections import Iterator
import copy
Expand All @@ -12,8 +13,8 @@


class Sub(DataContainer):
    """DataContainer subclass used in the tests to check that subclass behavior (extra attributes, lazy loading)
    is preserved when storing to and reading from HDF."""

    def __init__(self, init=None, table_name=None, lazy=False):
        # forward `lazy` so lazy loading also works for this subclass
        super().__init__(init=init, table_name=table_name, lazy=lazy)
        self.foo = 42


Expand Down Expand Up @@ -459,6 +460,52 @@ def test_subclass_preservation(self):
)
self.pl.pop('subclass')

def test_stub(self):
"""Lazily loaded containers should contain only stubs and only force them when directly accessed."""

self.pl.to_hdf(self.hdf, "lazy")
ll = self.hdf["lazy"].to_object(lazy=True)
self.assertTrue(all(isinstance(v, HDFStub) for v in ll._store),
"Not all values loaded as stubs!")

repr(ll)
self.assertTrue(all(isinstance(v, HDFStub) for v in ll._store),
"Some stubs have been loaded after getting string repr of container!")

ll0 = ll[0]
self.assertTrue(all(isinstance(v, HDFStub) for v in ll0._store),
"Recursive datacontainers not lazily loaded!")

self.assertEqual(ll[0].foo, self.pl[0].foo,
"Lazily loaded list not equal to orignal list!")

self.assertTrue(not isinstance(ll._store[0], HDFStub),
"Loaded value not stored back into container!")

def test_stub_sublasses(self):
"""Sub classes of DataContainer should also be able to be lazily loaded."""

sl = Sub(self.pl.to_builtin())

sl.to_hdf(self.hdf, "lazy_sub")
ll = Sub(lazy=True)
ll.from_hdf(self.hdf, "lazy_sub")
self.assertTrue(all(isinstance(v, HDFStub) for v in ll._store),
"Not all values loaded as stubs!")

repr(ll)
self.assertTrue(all(isinstance(v, HDFStub) for v in ll._store),
"Some stubs have been loaded after getting string repr of container!")

ll0 = ll[0]
self.assertTrue(all(isinstance(v, HDFStub) for v in ll0._store),
"Recursive datacontainers not lazily loaded!")

self.assertEqual(ll[0].foo, sl[0].foo,
"Lazily loaded list not equal to orignal list!")

self.assertTrue(not isinstance(ll._store[0], HDFStub),
"Loaded value not stored back into container!")

class TestInputList(unittest.TestCase):

Expand Down
25 changes: 25 additions & 0 deletions tests/generic/test_hdfstub.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from pyiron_base._tests import TestWithProject
from pyiron_base.generic.datacontainer import DataContainer
from pyiron_base.generic.hdfstub import HDFStub

import numpy as np

class TestHDFStub(TestWithProject):
    """Check that HDFStub reproduces eager HDF reads for simple nodes and containers."""

    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        # populate an HDF file with a plain number, a numpy array and a nested data container
        cls.hdf = cls.project.create_hdf(cls.project.path, "hdf")
        cls.hdf["number"] = 42
        cls.hdf["array"] = np.arange(100)
        cls.data = DataContainer([1, 2, "three", 4.0])
        cls.data.to_hdf(cls.hdf, "data")

    def test_load(self):
        """Lazily and eagerly read values should be the same."""
        number = HDFStub(self.hdf, "number").load()
        self.assertEqual(number, self.hdf["number"],
                         "Simple number read with load() not equal to eager read.")
        array = HDFStub(self.hdf, "array").load()
        self.assertTrue(np.all(array == self.hdf["array"]),
                        "Numpy array read with load() not equal to eager read.")
        loaded = HDFStub(self.hdf, "data").load()
        for v1, v2 in zip(loaded, self.data):
            self.assertEqual(v1, v2, "Data container values read with load() not equal to original container.")