hdmf-dev · mavaylon1 · Oct 1, 2023 · Dec 24, 2022 · Dec 24, 2022 · Dec 24, 2022
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,11 @@
 
 ## 0.1.7 (Upcoming)
 
+### Enhancements
+* Updated storage of references to also save the ``object_id`` and ``source_object_id``. While these
+  fields are not strictly necessary to define a link, they are useful for validating references
+  and make the definition of references more rigorous. @oruebel [#57](https://github.com/hdmf-dev/hdmf-zarr/pull/57)
+
 ### Bugs
 * Use path relative to the current Zarr file in the definition of links and references to avoid breaking
   links when moving Zarr files @oruebel [#46](https://github.com/hdmf-dev/hdmf-zarr/pull/46)

diff --git a/docs/source/storage.rst b/docs/source/storage.rst
@@ -156,6 +156,12 @@ as JSON. Each dict (i.e., element) in the list defines a link, with each dict co
   links that point to object in another Zarr file, the value of source will be the path to
   the other Zarr file relative to the root path of the Zarr file containing the link.
 * ``path`` : Path to the linked object within the Zarr file idenfied by the ``source`` key
+* ``object_id``: Object id of the referenced object. May be None in case the referenced object
+  does not have an assigned object_id (e.g., in the case we reference a dataset with a fixed
+  name but without an assigned ``data_type`` (or ``neurodata_type`` in the case of NWB)).
+* ``source_object_id``: Object id of the source Zarr file indicated by the ``source`` key.
+  The ``source`` should always have an ``object_id`` (at least if the ``source`` file is
+  a valid HDMF formatted file).
 
 For example:
 
@@ -164,8 +170,10 @@ For example:
     "zarr_link": [
         {
             "name": "device",
+            "source": ".",
             "path": "/general/devices/array",
-            "source": "."
+            "object_id": "f6685427-3919-4e06-b195-ccb7ab42f0fa",
+            "source_object_id": "6224bb89-578a-4839-b31c-83f11009292c"
         }
     ]
 
@@ -237,9 +245,9 @@ Object references are stored in a attributes as dicts with the following keys:
 * ``zarr_dtype`` : Indicating the data type for the attribute. For object references
   ``zarr_dtype`` is set to ``"object"`` (or ``"region"`` for :ref:`sec-zarr-storage-references-region`)
 * ``value``: The value of the object references, i.e., here the py:class:`~hdmf_zarr.utils.ZarrReference`
-  dictionary with the ``source`` and ``path`` keys defining the object reference (again, ``source`` is
-  here the relative path to the target Zarr file, and ``path`` identifys the object within the source
-   Zarr file).
+  dictionary with the ``source``, ``path``, ``object_id``, and ``source_object_id`` keys defining
+  the object reference, with the definition of the keys being the same as
+  for :ref:`sec-zarr-storage-links`.
 
 For example in NWB, the attribute ``ElectricalSeries.electrodes.table`` would be defined as follows:
 
@@ -248,7 +256,9 @@ For example in NWB, the attribute ``ElectricalSeries.electrodes.table`` would be
     "table": {
         "value": {
             "path": "/general/extracellular_ephys/electrodes",
-            "source": "."
+            "source": ".",
+            "object_id": "f6685427-3919-4e06-b195-ccb7ab42f0fa",
+            "source_object_id": "6224bb89-578a-4839-b31c-83f11009292c"
         },
         "zarr_dtype": "object"
     }

diff --git a/src/hdmf_zarr/backend.py b/src/hdmf_zarr/backend.py
@@ -501,6 +501,24 @@ def __get_ref(self, ref_object):
         # if isinstance(ref_object, RegionBuilder):
         #    region = ref_object.region
 
+        # get the object id if available
+        object_id = builder.get('object_id', None)
+
+        # determine the object_id of the source by following the parents of the builder until we find the root
+        # the root builder should be the same as the source file containing the reference
+        curr = builder
+        while curr is not None and curr.name != ROOT_NAME:
+            curr = curr.parent
+        if curr:
+            source_object_id = curr.get('object_id', None)
+        # We did not find ROOT_NAME as a parent. This should only happen if we have an invalid
+        # file as a source, e.g., if during testing we use an arbitrary builder. We check this
+        # anyways to avoid potential errors just in case
+        else:
+            source_object_id = None
+            warn_msg = "Could not determine source_object_id for builder with path: %s" % path
+            warnings.warn(warn_msg)
+
         # by checking os.isdir makes sure we have a valid link path to a dir for Zarr. For conversion
         # between backends a user should always use export which takes care of creating a clean set of builders.
         source = (builder.source
@@ -509,7 +527,12 @@ def __get_ref(self, ref_object):
         # Make the source relative to the current file
         source = os.path.relpath(os.path.abspath(source), start=self.abspath)
         # Return the ZarrReference object
-        return ZarrReference(source, path)
+        ref = ZarrReference(
+            source=source,
+            path=path,
+            object_id=object_id,
+            source_object_id=source_object_id)
+        return ref
 
     def __add_link__(self, parent, target_source, target_path, link_name):
         """

diff --git a/src/hdmf_zarr/utils.py b/src/hdmf_zarr/utils.py
@@ -232,31 +232,59 @@ class ZarrReference(dict):
 
     @docval({'name': 'source',
              'type': str,
-             'doc': 'Source of referenced object',
+             'doc': 'Source of referenced object. Usually the relative path to the '
+                    'Zarr file containing the referenced object',
              'default': None},
             {'name': 'path',
              'type': str,
-             'doc': 'Path of referenced object',
+             'doc': 'Path of referenced object within the source',
+             'default': None},
+            {'name': 'object_id',
+             'type': str,
+             'doc': 'Object_id of the referenced object (if available)',
+             'default': None},
+            {'name': 'source_object_id',
+             'type': str,
+             'doc': 'Object_id of the source (should always be available)',
              'default': None}
             )
     def __init__(self, **kwargs):
-        dest_source, dest_path = getargs('source', 'path', kwargs)
+        dest_source, dest_path, dest_object_id, dest_source_object_id = getargs(
+            'source', 'path', 'object_id', 'source_object_id', kwargs)
         super(ZarrReference, self).__init__()
-        super(ZarrReference, self).__setitem__('source', dest_source)
-        super(ZarrReference, self).__setitem__('path', dest_path)
+        self.source = dest_source
+        self.path = dest_path
+        self.object_id = dest_object_id
+        self.source_object_id = dest_source_object_id
 
     @property
-    def source(self):
+    def source(self) -> str:
         return super(ZarrReference, self).__getitem__('source')
 
     @property
-    def path(self):
+    def path(self) -> str:
         return super(ZarrReference, self).__getitem__('path')
 
+    @property
+    def object_id(self) -> str:
+        return super(ZarrReference, self).__getitem__('object_id')
+
+    @property
+    def source_object_id(self) -> str:
+        return super(ZarrReference, self).__getitem__('source_object_id')
+
     @source.setter
-    def source(self, s):
-        super(ZarrReference, self).__setitem__('source', s)
+    def source(self, source: str):
+        super(ZarrReference, self).__setitem__('source', source)
 
     @path.setter
-    def path(self, p):
-        super(ZarrReference, self).__setitem__('path', p)
+    def path(self, path: str):
+        super(ZarrReference, self).__setitem__('path', path)
+
+    @object_id.setter
+    def object_id(self, object_id: str):
+        super(ZarrReference, self).__setitem__('object_id', object_id)
+
+    @source_object_id.setter
+    def source_object_id(self, object_id: str):
+        super(ZarrReference, self).__setitem__('source_object_id', object_id)
diff --git a/tests/unit/test_io_zarr.py b/tests/unit/test_io_zarr.py
@@ -504,8 +504,14 @@ def test_write_attributes_write_reference_to_datasetbuilder(self):
         dataset_1 = DatasetBuilder('dataset_1', data_1)
         testgroup = self.io._ZarrIO__file  # For testing we just use our file and create some attributes
         attr = {'attr1': dataset_1}
-        self.io.write_attributes(testgroup, attr)
-        expected_value = {'attr1': {'zarr_dtype': 'object', 'value': {'source': ".", 'path': '/dataset_1'}}}
+        with self.assertWarnsWith(UserWarning,
+                                  "Could not determine source_object_id for builder with path: /dataset_1"):
+            self.io.write_attributes(testgroup, attr)
+        expected_value = {'attr1': {'zarr_dtype': 'object',
+                                    'value': {'source': ".",
+                                              'path': '/dataset_1',
+                                              'object_id': None,
+                                              'source_object_id': None}}}
         self.assertDictEqual(testgroup.attrs.asdict(), expected_value)
 
     def test_write_attributes_write_reference_to_referencebuilder(self):
@@ -514,8 +520,16 @@ def test_write_attributes_write_reference_to_referencebuilder(self):
         ref1 = ReferenceBuilder(dataset_1)
         testgroup = self.io._ZarrIO__file  # For testing we just use our file and create some attributes
         attr = {'attr1': ref1}
-        self.io.write_attributes(testgroup, attr)
-        expected_value = {'attr1': {'zarr_dtype': 'object', 'value': {'source': ".", 'path': '/dataset_1'}}}
+        with self.assertWarnsWith(UserWarning,
+                                  "Could not determine source_object_id for builder with path: /dataset_1"):
+            self.io.write_attributes(testgroup, attr)
+        expected_value = {'attr1': {'zarr_dtype': 'object',
+                                    'value': {'source': ".",
+                                              'path': '/dataset_1',
+                                              'source_object_id': None,
+                                              'object_id': None},
+                                    }
+                          }
         self.assertDictEqual(testgroup.attrs.asdict(), expected_value)
 
     ##########################################
@@ -877,9 +891,16 @@ def test_soft_link_group(self):
         foofile = FooFile(buckets=[foobucket], foo_link=foo1)
         with ZarrIO(self.paths[0], manager=get_foo_buildmanager(), mode='w') as write_io:
             write_io.write(foofile)
+        with open(self.paths[0]+"/.zattrs", 'r') as f:
+            print(f.readlines())
+
         with ZarrIO(self.paths[0], manager=get_foo_buildmanager(), mode='r') as read_io:
             with ZarrIO(self.paths[1], mode='w') as export_io:
-                export_io.export(src_io=read_io, write_args=dict(link_data=False))
+                with self.assertWarnsWith(UserWarning, "Could not determine source_object_id for "
+                                                       "builder with path: /buckets/bucket1/foo_holder/foo1"):
+                    export_io.export(
+                        src_io=read_io,
+                        write_args=dict(link_data=False))
         with ZarrIO(self.paths[1], manager=get_foo_buildmanager(), mode='r') as read_io:
             read_foofile2 = read_io.read()
             # make sure the linked group is within the same file