Append References (#203)
* Append References

* ruff updates

* ruff updates

* ruff updates

* test

* test

* test

* test

* test

* Update pyproject.toml

* Update requirements-min.txt

* Update CHANGELOG.md

* ruff check

* ruff check

* clean up

* clear

* clear

* clear

* clear

* Rename ZarrIO.__get_ref to ZarrIO._create_ref

* fixed tests

* clean up

* Add failing check to test

* Fix

---------

Co-authored-by: Ryan Ly <[email protected]>
mavaylon1 and rly authored Jul 30, 2024
1 parent 2ba6bc5 commit 8ca5787
Showing 8 changed files with 99 additions and 22 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -2,6 +2,7 @@

## 0.9.0 (Upcoming)
### Enhancements
* Added support for appending a dataset of references. @mavaylon1 [#203](https://github.com/hdmf-dev/hdmf-zarr/pull/203)
* NWBZarrIO load_namespaces=True by default. @mavaylon1 [#204](https://github.com/hdmf-dev/hdmf-zarr/pull/204)
* Added test for opening file with consolidated metadata from DANDI. @mavaylon1 [#206](https://github.com/hdmf-dev/hdmf-zarr/pull/206)
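
For context, the new append-references capability looks roughly like this (a condensed, illustrative sketch of the test added at the bottom of this diff; the store path and object names are made up):

from zarr.storage import DirectoryStore
from hdmf_zarr.backend import ZarrIO
from tests.unit.utils import Baz, BazData, BazBucket, get_baz_buildmanager

store = DirectoryStore("example_append.zarr")
manager = get_baz_buildmanager()

# Write a file containing a dataset of references.
bazs = [Baz(name="baz0")]
bucket = BazBucket(bazs=bazs, baz_data=BazData(name="baz_data", data=bazs))
with ZarrIO(store, manager=manager, mode="w") as io:
    io.write(container=bucket)

# Re-open in append mode and add a reference to a new, attached object.
with ZarrIO(store, manager=manager, mode="a") as io:
    bucket = io.read()
    new_baz = Baz(name="new")
    bucket.add_baz(new_baz)               # attach first; must not be orphaned
    bucket.baz_data.data.append(new_baz)  # creates and stores a ZarrReference
    io.write(container=bucket)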

2 changes: 1 addition & 1 deletion docs/source/integrating_data_stores.rst
@@ -36,7 +36,7 @@ Updating ZarrIO
:py:meth:`~hdmf_zarr.backend.ZarrIO.get_builder_exists_on_disk`
method may need to be updated to ensure
references are opened correctly on read for files stored with your new store. The
:py:meth:`~hdmf_zarr.backend.ZarrIO.__get_ref` function may also need to be updated, in
:py:meth:`~hdmf_zarr.backend.ZarrIO._create_ref` function may also need to be updated, in
particular in case the links to your store also modify the storage schema for links
(e.g., if you need to store additional metadata in order to resolve links to your store).

6 changes: 3 additions & 3 deletions docs/source/storage.rst
@@ -217,7 +217,7 @@ For example:

In :py:class:`~hdmf_zarr.backend.ZarrIO`, links are written by the
:py:meth:`~hdmf_zarr.backend.ZarrIO.__write_link__` function, which also uses the helper functions
i) :py:meth:`~hdmf_zarr.backend.ZarrIO.__get_ref` to construct :py:class:`~hdmf_zarr.utils.ZarrReference`
i) :py:meth:`~hdmf_zarr.backend.ZarrIO._create_ref` to construct :py:class:`~hdmf_zarr.utils.ZarrReference`
and ii) :py:meth:`~hdmf_zarr.backend.ZarrIO.__add_link__` to add a link to the Zarr file.
:py:meth:`~hdmf_zarr.backend.ZarrIO.__read_links` then parses links and also uses the
:py:meth:`~hdmf_zarr.backend.ZarrIO.__resolve_ref` helper function to resolve the paths stored in links.
@@ -245,7 +245,7 @@ by their location (i.e., index) in the dataset. As such, object references only
the relative path to the target Zarr file, and the ``path`` identifying the object within the source
Zarr file. The individual object references are defined in the
:py:class:`~hdmf_zarr.backend.ZarrIO` as :py:class:`~hdmf_zarr.utils.ZarrReference` objects created via
the :py:meth:`~hdmf_zarr.backend.ZarrIO.__get_ref` helper function.
the :py:meth:`~hdmf_zarr.backend.ZarrIO._create_ref` helper function.

By default, :py:class:`~hdmf_zarr.backend.ZarrIO` uses the ``numcodecs.pickles.Pickle`` codec to
encode object references defined as :py:class:`~hdmf_zarr.utils.ZarrReference` dicts in datasets.
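
As a rough standalone illustration of this encoding (not ZarrIO's actual write path; the dict keys and the store name here are illustrative, following the schema described above):

import zarr
import numcodecs
from zarr.storage import DirectoryStore

# Store one reference-like dict in an object-dtype Zarr dataset using the
# Pickle codec, mirroring how ZarrIO encodes ZarrReference entries.
store = DirectoryStore("ref_demo.zarr")
root = zarr.group(store=store, overwrite=True)
dset = root.create_dataset("refs", shape=(1,), dtype=object, object_codec=numcodecs.Pickle())
dset[0] = {"source": ".", "path": "/baz0"}  # relative source file + object path

print(dset[0]["path"])  # -> '/baz0'; the reference round-trips as a plain dict
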
@@ -297,7 +297,7 @@ store the definition of the ``region`` that is being referenced, e.g., a slice o
To implement region references will require updating:
1) :py:class:`~hdmf_zarr.utils.ZarrReference` to add a ``region`` key to support storing
the selection for the region,
2) :py:meth:`~hdmf_zarr.backend.ZarrIO.__get_ref` to support passing in the region definition to
2) :py:meth:`~hdmf_zarr.backend.ZarrIO._create_ref` to support passing in the region definition to
be added to the :py:class:`~hdmf_zarr.utils.ZarrReference`,
3) :py:meth:`~hdmf_zarr.backend.ZarrIO.write_dataset` already partially implements the required
logic for creating region references by checking for :py:class:`hdmf.build.RegionBuilder` inputs
6 changes: 3 additions & 3 deletions pyproject.toml
@@ -29,7 +29,7 @@ classifiers = [
"Topic :: Scientific/Engineering :: Medical Science Apps."
]
dependencies = [
'hdmf>=3.9.0',
'hdmf>=3.14.2',
'zarr>=2.11.0, <3.0', # pin below 3.0 until HDMF-zarr supports zarr 3.0
'numpy>=1.24, <2.0', # pin below 2.0 until HDMF supports numpy 2.0
'numcodecs>=0.9.1',
@@ -109,7 +109,7 @@ force-exclude = '''
'''

[tool.ruff]
select = ["E", "F", "T100", "T201", "T203"]
lint.select = ["E", "F", "T100", "T201", "T203"]
exclude = [
".git",
".tox",
@@ -128,5 +128,5 @@ line-length = 120
"src/*/__init__.py" = ["F401"]
"test_gallery.py" = ["T201"]

[tool.ruff.mccabe]
[tool.ruff.lint.mccabe]
max-complexity = 17
2 changes: 1 addition & 1 deletion requirements-min.txt
@@ -1,4 +1,4 @@
hdmf==3.12.0
hdmf==3.14.2
zarr==2.11.0
numcodecs==0.9.1
pynwb==2.5.0
29 changes: 15 additions & 14 deletions src/hdmf_zarr/backend.py
@@ -595,13 +595,13 @@ def write_attributes(self, **kwargs):
# TODO: Region References are not yet supported
# if isinstance(value, RegionBuilder):
# type_str = 'region'
# refs = self.__get_ref(value.builder)
# refs = self._create_ref(value.builder)
if isinstance(value, (ReferenceBuilder, Container, Builder)):
type_str = 'object'
if isinstance(value, Builder):
refs = self.__get_ref(value, export_source)
refs = self._create_ref(value, export_source)
else:
refs = self.__get_ref(value.builder, export_source)
refs = self._create_ref(value.builder, export_source)
tmp = {'zarr_dtype': type_str, 'value': refs}
obj.attrs[key] = tmp
# Case 3: Scalar attributes
@@ -742,7 +742,7 @@ def resolve_ref(self, zarr_ref):
# Return the create path
return target_name, target_zarr_obj

def __get_ref(self, ref_object, export_source=None):
def _create_ref(self, ref_object, export_source=None):
"""
Create a ZarrReference object that points to the given container
@@ -761,6 +761,7 @@ def __get_ref(self, ref_object, export_source=None):
builder = ref_object.builder
else:
builder = self.manager.build(ref_object)

path = self.__get_path(builder)
# TODO Add to get region for region references.
# Also add {'name': 'region', 'type': (slice, list, tuple),
Expand Down Expand Up @@ -838,7 +839,7 @@ def write_link(self, **kwargs):
name = builder.name
target_builder = builder.builder
# Get the reference
zarr_ref = self.__get_ref(target_builder)
zarr_ref = self._create_ref(target_builder)
# EXPORT WITH LINKS: Fix link source
# if the target and source are both the same, then we need to ALWAYS use ourselves as a source
# When exporting from one source to another, the LinkBuilders.source are not updated, i.e,. the
@@ -982,7 +983,7 @@ def write_dataset(self, **kwargs): # noqa: C901
elif isinstance(data, HDMFDataset):
# If we have a dataset of containers we need to make the references to the containers
if len(data) > 0 and isinstance(data[0], Container):
ref_data = [self.__get_ref(data[i], export_source=export_source) for i in range(len(data))]
ref_data = [self._create_ref(data[i], export_source=export_source) for i in range(len(data))]
shape = (len(data), )
type_str = 'object'
dset = parent.require_dataset(name,
@@ -1015,7 +1016,7 @@ def write_dataset(self, **kwargs): # noqa: C901
for i, dts in enumerate(options['dtype']):
if self.__is_ref(dts['dtype']):
refs.append(i)
ref_tmp = self.__get_ref(data[0][i], export_source=export_source)
ref_tmp = self._create_ref(data[0][i], export_source=export_source)
if isinstance(ref_tmp, ZarrReference):
dts_str = 'object'
else:
@@ -1035,7 +1036,7 @@ def write_dataset(self, **kwargs): # noqa: C901
for j, item in enumerate(data):
new_item = list(item)
for i in refs:
new_item[i] = self.__get_ref(item[i], export_source=export_source)
new_item[i] = self._create_ref(item[i], export_source=export_source)
new_items.append(tuple(new_item))

# Create dtype for storage, replacing values to match hdmf's hdf5 behavior
@@ -1080,20 +1081,20 @@ def write_dataset(self, **kwargs): # noqa: C901
# if isinstance(data, RegionBuilder):
# shape = (1,)
# type_str = 'region'
# refs = self.__get_ref(data.builder, data.region)
# refs = self._create_ref(data.builder, data.region)
if isinstance(data, ReferenceBuilder):
shape = (1,)
type_str = 'object'
refs = self.__get_ref(data.builder, export_source=export_source)
refs = self._create_ref(data.builder, export_source=export_source)
# TODO: Region References are not yet supported
# elif options['dtype'] == 'region':
# shape = (len(data), )
# type_str = 'region'
# refs = [self.__get_ref(item.builder, item.region) for item in data]
# refs = [self._create_ref(item.builder, item.region) for item in data]
else:
shape = (len(data), )
type_str = 'object'
refs = [self.__get_ref(item, export_source=export_source) for item in data]
refs = [self._create_ref(item, export_source=export_source) for item in data]

dset = parent.require_dataset(name,
shape=shape,
@@ -1283,7 +1284,7 @@ def __list_fill__(self, parent, name, data, options=None): # noqa: C901
dset.attrs['zarr_dtype'] = type_str

# Write the data to file
if dtype == object:
if dtype == object: # noqa: E721
for c in np.ndindex(data_shape):
o = data
for i in c:
@@ -1317,7 +1318,7 @@ def __scalar_fill__(self, parent, name, data, options=None):
except Exception as exc:
msg = 'cannot add %s to %s - could not determine type' % (name, parent.name)
raise Exception(msg) from exc
if dtype == object:
if dtype == object: # noqa: E721
io_settings['object_codec'] = self.__codec_cls()

dset = parent.require_dataset(name, shape=(1, ), dtype=dtype, **io_settings)
24 changes: 24 additions & 0 deletions src/hdmf_zarr/zarr_utils.py
@@ -10,6 +10,7 @@
from zarr import Array as ZarrArray

from hdmf.build import DatasetBuilder
from hdmf.data_utils import append_data
from hdmf.array import Array
from hdmf.query import HDMFDataset, ReferenceResolver, ContainerResolver, BuilderResolver
from hdmf.utils import docval, popargs, get_docval
@@ -73,6 +74,29 @@ def __iter__(self):
def __next__(self):
return self._get_ref(super().__next__())

def append(self, arg):
# Build the root parent first.
# (Doing so correctly sets the parent of the child builder, which is needed to create the reference.)
# Note: If arg is a nested child such that objB is the parent of arg and objA is the parent of objB
# (and objA is not the root), then objA must already have been added to the root as a child. Otherwise,
# the loop below will use objA as the root. This might not raise an error (meaning the path could still
# be correct), but it could lead to an incorrect path for the reference.
# Ensuring objA is NOT an orphaned container guarantees correct behavior.
child = arg
while True:
if child.parent is not None:
parent = child.parent
child = parent
else:
parent = child
break
self.io.manager.build(parent)
builder = self.io.manager.build(arg)

# Create ZarrReference
ref = self.io._create_ref(builder)
append_data(self.dataset, ref)


class BuilderResolverMixin(BuilderResolver): # refactor to backend/utils.py
"""
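
A minimal runnable sketch of the caveat described in the comments of append() (the Node class is a hypothetical stand-in for an HDMF Container's .parent linkage, not part of the repo):

class Node:
    """Illustrative stand-in for an HDMF Container's .parent attribute."""
    def __init__(self, name, parent=None):
        self.name = name
        self.parent = parent

def find_root(container):
    # Walk .parent links up to the root, mirroring the loop in append() above.
    child = container
    while child.parent is not None:
        child = child.parent
    return child

root = Node("root")
attached = Node("new_baz", parent=root)   # added to the file's root container
orphan = Node("orphan")                   # never attached to the root

print(find_root(attached).name)  # 'root'   -> reference path resolves correctly
print(find_root(orphan).name)    # 'orphan' -> walk stops early; stored path may be wrong
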
51 changes: 51 additions & 0 deletions tests/unit/test_zarrio.py
@@ -16,9 +16,12 @@
from zarr.storage import (DirectoryStore,
TempStore,
NestedDirectoryStore)
from tests.unit.utils import (Baz, BazData, BazBucket, get_baz_buildmanager)
import zarr
from hdmf_zarr.backend import ZarrIO
import os
import shutil
import warnings


CUR_DIR = os.path.dirname(os.path.realpath(__file__))
@@ -181,3 +184,51 @@ def test_force_open_without_consolidated_fails(self):
read_io._ZarrIO__open_file_consolidated(store=self.store, mode='r')
except ValueError as e:
self.fail("ZarrIO.__open_file_consolidated raised an unexpected ValueError: {}".format(e))


class TestDatasetofReferences(ZarrStoreTestCase):
def setUp(self):
self.store_path = "test_io.zarr"
self.store = DirectoryStore(self.store_path)

def tearDown(self):
"""
Remove all files and folders defined by self.store_path
"""
paths = self.store_path if isinstance(self.store_path, list) else [self.store_path, ]
for path in paths:
if os.path.exists(path):
if os.path.isdir(path):
shutil.rmtree(path)
elif os.path.isfile(path):
os.remove(path)
else:
warnings.warn("Could not remove: %s" % path)

def test_append_references(self):
# Setup a file container with references
num_bazs = 10
bazs = [] # set up dataset of references
for i in range(num_bazs):
bazs.append(Baz(name='baz%d' % i))
baz_data = BazData(name='baz_data', data=bazs)
container = BazBucket(bazs=bazs, baz_data=baz_data)
manager = get_baz_buildmanager()

with ZarrIO(self.store, manager=manager, mode='w') as writer:
writer.write(container=container)

with ZarrIO(self.store, manager=manager, mode='a') as append_io:
read_container = append_io.read()
new_baz = Baz(name='new')
read_container.add_baz(new_baz)

DoR = read_container.baz_data.data
DoR.append(new_baz)

append_io.write(read_container)

with ZarrIO(self.store, manager=manager, mode='r') as append_io:
read_container = append_io.read()
self.assertEqual(len(read_container.baz_data.data), 11)
self.assertIs(read_container.baz_data.data[10], read_container.bazs["new"])
