improve html representation of datasets (#1100)

* improve dev repr * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * address ruff * add changelog * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add table representation for hdf5 info * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add test * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * ruff * handle division by zero * add zarr, array, hdf5 repr tests * generalize array html table description * remove zarr tests * fix nbytes * fix use of nbytes ahead * added TODO * add html test array data type * add array html repr utils * add generate_dataset_html method to io objects * add tests for array html repr * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix import style * update CHANGELOG * add test for base hdmfio --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Steph Prince <[email protected]> Co-authored-by: Ryan Ly <[email protected]>
hdmf-dev · Nov 5, 2024 · be602e5 · be602e5
1 parent 06a62b9
commit be602e5
Show file tree

Hide file tree

Showing 6 changed files with 202 additions and 12 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,7 @@
 
 ### Enhancements
 - Added support for expandable datasets of references for untyped and compound data types. @stephprince [#1188](https://github.com/hdmf-dev/hdmf/pull/1188)
+- Improved html representation of data in `Containers` @h-mayorquin [#1100](https://github.com/hdmf-dev/hdmf/pull/1100)
 
 ### Bug fixes
 - Fixed inaccurate error message when validating reference data types. @stephprince [#1199](https://github.com/hdmf-dev/hdmf/pull/1199)

diff --git a/src/hdmf/backends/hdf5/h5tools.py b/src/hdmf/backends/hdf5/h5tools.py
@@ -19,7 +19,8 @@
 from ...container import Container
 from ...data_utils import AbstractDataChunkIterator
 from ...spec import RefSpec, DtypeSpec, NamespaceCatalog
-from ...utils import docval, getargs, popargs, get_data_shape, get_docval, StrDataset
+from ...utils import (docval, getargs, popargs, get_data_shape, get_docval, StrDataset,
+                      get_basic_array_info, generate_array_html_repr)
 from ..utils import NamespaceToBuilderHelper, WriteStatusTracker
 
 ROOT_NAME = 'root'
@@ -1603,3 +1604,29 @@ def set_dataio(cls, **kwargs):
             data = H5DataIO(data)
         """
         return H5DataIO.__init__(**kwargs)
+
+    @staticmethod
+    def generate_dataset_html(dataset):
+        """Generates an html representation for a dataset for the HDF5IO class"""
+
+        # get info from hdf5 dataset
+        compressed_size = dataset.id.get_storage_size()
+        if hasattr(dataset, "nbytes"):  # TODO: Remove this after h5py minimal version is larger than 3.0
+            uncompressed_size = dataset.nbytes
+        else:
+            uncompressed_size = dataset.size * dataset.dtype.itemsize
+        compression_ratio = uncompressed_size / compressed_size if compressed_size != 0 else "undefined"
+
+        hdf5_info_dict = {"Chunk shape": dataset.chunks,
+                          "Compression": dataset.compression,
+                          "Compression opts": dataset.compression_opts,
+                          "Compression ratio": compression_ratio}
+
+        # get basic array info
+        array_info_dict = get_basic_array_info(dataset)
+        array_info_dict.update(hdf5_info_dict)
+
+        # generate html repr
+        repr_html = generate_array_html_repr(array_info_dict, dataset, "HDF5 dataset")
+
+        return repr_html
diff --git a/src/hdmf/backends/io.py b/src/hdmf/backends/io.py
@@ -5,7 +5,7 @@
 from ..build import BuildManager, GroupBuilder
 from ..container import Container, HERDManager
 from .errors import UnsupportedOperation
-from ..utils import docval, getargs, popargs
+from ..utils import docval, getargs, popargs, get_basic_array_info, generate_array_html_repr
 from warnings import warn
 
 
@@ -188,6 +188,14 @@ def close(self):
         ''' Close this HDMFIO object to further reading/writing'''
         pass
 
+    @staticmethod
+    def generate_dataset_html(dataset):
+        """Generates an html representation for a dataset"""
+        array_info_dict = get_basic_array_info(dataset)
+        repr_html = generate_array_html_repr(array_info_dict, dataset)
+
+        return repr_html
+
     def __enter__(self):
         return self
 

diff --git a/src/hdmf/container.py b/src/hdmf/container.py
@@ -12,7 +12,8 @@
 import pandas as pd
 
 from .data_utils import DataIO, append_data, extend_data, AbstractDataChunkIterator
-from .utils import docval, get_docval, getargs, ExtenderMeta, get_data_shape, popargs, LabelledDict
+from .utils import (docval, get_docval, getargs, ExtenderMeta, get_data_shape, popargs, LabelledDict,
+                    get_basic_array_info, generate_array_html_repr)
 
 from .term_set import TermSet, TermSetWrapper
 
@@ -707,8 +708,6 @@ def _generate_html_repr(self, fields, level=0, access_code="", is_field=False):
             for index, item in enumerate(fields):
                 access_code += f'[{index}]'
                 html_repr += self._generate_field_html(index, item, level, access_code)
-        elif isinstance(fields, np.ndarray):
-            html_repr += self._generate_array_html(fields, level)
         else:
             pass
 
@@ -724,18 +723,23 @@ def _generate_field_html(self, key, value, level, access_code):
             return f'<div style="margin-left: {level * 20}px;" class="container-fields"><span class="field-key"' \
                    f' title="{access_code}">{key}: </span><span class="field-value">{value}</span></div>'
 
-        if hasattr(value, "generate_html_repr"):
-            html_content = value.generate_html_repr(level + 1, access_code)
+        is_array_data = isinstance(value, (np.ndarray, h5py.Dataset, DataIO)) or \
+            (hasattr(value, "store") and hasattr(value, "shape"))  # Duck typing for zarr array
 
+        if is_array_data:
+            html_content = self._generate_array_html(value, level + 1)
+        elif hasattr(value, "generate_html_repr"):
+            html_content = value.generate_html_repr(level + 1, access_code)
         elif hasattr(value, '__repr_html__'):
             html_content = value.__repr_html__()
-
-        elif hasattr(value, "fields"):
+        elif hasattr(value, "fields"):  # Note that h5py.Dataset has a fields attribute so there is an implicit order
             html_content = self._generate_html_repr(value.fields, level + 1, access_code, is_field=True)
         elif isinstance(value, (list, dict, np.ndarray)):
             html_content = self._generate_html_repr(value, level + 1, access_code, is_field=False)
         else:
             html_content = f'<span class="field-key">{value}</span>'
+
+
         html_repr = (
             f'<details><summary style="display: list-item; margin-left: {level * 20}px;" '
             f'class="container-fields field-key" title="{access_code}"><b>{key}</b></summary>'
@@ -745,10 +749,18 @@ def _generate_field_html(self, key, value, level, access_code):
 
         return html_repr
 
+
     def _generate_array_html(self, array, level):
-        """Generates HTML for a NumPy array."""
-        str_ = str(array).replace("\n", "</br>")
-        return f'<div style="margin-left: {level * 20}px;" class="container-fields">{str_}</div>'
+        """Generates HTML for array data"""
+
+        read_io = self.get_read_io()  # if the Container was read from file, get IO object
+        if read_io is not None:
+            repr_html = read_io.generate_dataset_html(array)
+        else:
+            array_info_dict = get_basic_array_info(array)
+            repr_html = generate_array_html_repr(array_info_dict, array, "NumPy array")
+
+        return f'<div style="margin-left: {level * 20}px;" class="container-fields">{repr_html}</div>'
 
     @staticmethod
     def __smart_str(v, num_indent):

diff --git a/src/hdmf/utils.py b/src/hdmf/utils.py
@@ -967,6 +967,54 @@ def is_ragged(data):
 
     return False
 
+def get_basic_array_info(array):
+    def convert_bytes_to_str(bytes_size):
+        suffixes = ['bytes', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB']
+        i = 0
+        while bytes_size >= 1024 and i < len(suffixes)-1:
+            bytes_size /= 1024.
+            i += 1
+        return f"{bytes_size:.2f} {suffixes[i]}"
+
+    if hasattr(array, "nbytes"):  # TODO: Remove this after h5py minimal version is larger than 3.0
+        array_size_in_bytes = array.nbytes
+    else:
+        array_size_in_bytes = array.size * array.dtype.itemsize
+    array_size_repr = convert_bytes_to_str(array_size_in_bytes)
+    basic_array_info_dict = {"Data type": array.dtype, "Shape": array.shape, "Array size": array_size_repr}
+
+    return basic_array_info_dict
+
+def generate_array_html_repr(backend_info_dict, array, dataset_type=None):
+    def html_table(item_dicts) -> str:
+        """
+        Generates an html table from a dictionary
+        """
+        report = '<table class="data-info">'
+        report += "<tbody>"
+        for k, v in item_dicts.items():
+            report += (
+                f"<tr>"
+                f'<th style="text-align: left">{k}</th>'
+                f'<td style="text-align: left">{v}</td>'
+                f"</tr>"
+            )
+        report += "</tbody>"
+        report += "</table>"
+        return report
+
+    array_info_html = html_table(backend_info_dict)
+    repr_html = dataset_type + "<br>" + array_info_html if dataset_type is not None else array_info_html
+
+    if hasattr(array, "nbytes"):  # TODO: Remove this after h5py minimal version is larger than 3.0
+        array_size = array.nbytes
+    else:
+        array_size = array.size * array.dtype.itemsize
+    array_is_small = array_size < 1024 * 0.1 # 10 % a kilobyte to display the array
+    if array_is_small:
+        repr_html += "<br>" + str(np.asarray(array))
+
+    return repr_html
 
 class LabelledDict(dict):
     """A dict wrapper that allows querying by an attribute of the values and running a callable on removed items.

diff --git a/tests/unit/test_container.py b/tests/unit/test_container.py
@@ -8,6 +8,7 @@
 from hdmf.utils import docval
 from hdmf.common import DynamicTable, VectorData, DynamicTableRegion
 from hdmf.backends.hdf5.h5tools import HDF5IO
+from hdmf.backends.io import HDMFIO
 
 
 class Subcontainer(Container):
@@ -423,6 +424,23 @@ def __init__(self, **kwargs):
             self.data = kwargs['data']
             self.str = kwargs['str']
 
+    class ContainerWithData(Container):
+
+        __fields__ = (
+            "data",
+            "str"
+        )
+
+        @docval(
+            {'name': "data", "doc": 'data', 'type': 'array_data', "default": None},
+            {'name': "str", "doc": 'str', 'type': str, "default": None},
+
+        )
+        def __init__(self, **kwargs):
+            super().__init__('test name')
+            self.data = kwargs['data']
+            self.str = kwargs['str']
+
     def test_repr_html_(self):
         child_obj1 = Container('test child 1')
         obj1 = self.ContainerWithChildAndData(child=child_obj1, data=[1, 2, 3], str="hello")
@@ -455,6 +473,82 @@ def test_repr_html_(self):
             'class="field-value">hello</span></div></div>'
         )
 
+    def test_repr_html_array(self):
+        obj = self.ContainerWithData(data=np.array([1, 2, 3, 4], dtype=np.int64), str="hello")
+        expected_html_table = (
+            'class="container-fields">NumPy array<br><table class="data-info"><tbody><tr><th style="text-align: '
+            'left">Data type</th><td style="text-align: left">int64</td></tr><tr><th style="text-align: left">Shape'
+            '</th><td style="text-align: left">(4,)</td></tr><tr><th style="text-align: left">Array size</th><td '
+            'style="text-align: left">32.00 bytes</td></tr></tbody></table><br>[1 2 3 4]'
+        )
+        self.assertIn(expected_html_table, obj._repr_html_())
+
+    def test_repr_html_array_large_arrays_not_displayed(self):
+        obj = self.ContainerWithData(data=np.arange(200, dtype=np.int64), str="hello")
+        expected_html_table = (
+            'class="container-fields">NumPy array<br><table class="data-info"><tbody><tr><th style="text-align: '
+            'left">Data type</th><td style="text-align: left">int64</td></tr><tr><th style="text-align: left">Shape'
+            '</th><td style="text-align: left">(200,)</td></tr><tr><th style="text-align: left">Array size</th><td '
+            'style="text-align: left">1.56 KiB</td></tr></tbody></table></div></details>'
+        )
+        self.assertIn(expected_html_table, obj._repr_html_())
+
+    def test_repr_html_hdf5_dataset(self):
+        with HDF5IO('array_data.h5', mode='w') as io:
+            dataset = io._file.create_dataset(name='my_dataset', data=np.array([1, 2, 3, 4], dtype=np.int64))
+            obj = self.ContainerWithData(data=dataset, str="hello")
+            obj.read_io = io
+
+            expected_html_table = (
+                'class="container-fields">HDF5 dataset<br><table class="data-info"><tbody><tr><th style="text-align: '
+                'left">Data type</th><td style="text-align: left">int64</td></tr><tr><th style="text-align: left">'
+                'Shape</th><td style="text-align: left">(4,)</td></tr><tr><th style="text-align: left">Array size'
+                '</th><td style="text-align: left">32.00 bytes</td></tr><tr><th style="text-align: left">Chunk shape'
+                '</th><td style="text-align: left">None</td></tr><tr><th style="text-align: left">Compression</th><td '
+                'style="text-align: left">None</td></tr><tr><th style="text-align: left">Compression opts</th><td '
+                'style="text-align: left">None</td></tr><tr><th style="text-align: left">Compression ratio</th><td '
+                'style="text-align: left">1.0</td></tr></tbody></table><br>[1 2 3 4]'
+            )
+
+            self.assertIn(expected_html_table, obj._repr_html_())
+
+        os.remove('array_data.h5')
+
+    def test_repr_html_hdmf_io(self):
+        with HDF5IO('array_data.h5', mode='w') as io:
+            dataset = io._file.create_dataset(name='my_dataset', data=np.array([1, 2, 3, 4], dtype=np.int64))
+            obj = self.ContainerWithData(data=dataset, str="hello")
+
+            class OtherIO(HDMFIO):
+
+                @staticmethod
+                def can_read(path):
+                    pass
+
+                def read_builder(self):
+                    pass
+
+                def write_builder(self, **kwargs):
+                    pass
+
+                def open(self):
+                    pass
+
+                def close(self):
+                    pass
+
+            obj.read_io = OtherIO()
+
+            expected_html_table = (
+                'class="container-fields"><table class="data-info"><tbody><tr><th style="text-align: '
+                'left">Data type</th><td style="text-align: left">int64</td></tr><tr><th style="text-align: left">'
+                'Shape</th><td style="text-align: left">(4,)</td></tr><tr><th style="text-align: left">Array size'
+                '</th><td style="text-align: left">32.00 bytes</td></tr></tbody></table><br>[1 2 3 4]'
+            )
+
+            self.assertIn(expected_html_table, obj._repr_html_())
+
+        os.remove('array_data.h5')
 
 class TestData(TestCase):