From be602e51804922e01bcd0e6f89b23873f9e13b4b Mon Sep 17 00:00:00 2001 From: Heberto Mayorquin Date: Tue, 5 Nov 2024 15:43:29 -0600 Subject: [PATCH] improve html representation of datasets (#1100) * improve dev repr * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * address ruff * add changelog * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add table representation for hdf5 info * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add test * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * ruff * handle division by zero * add zarr, array, hdf5 repr tests * generalize array html table description * remove zarr tests * fix nbytes * fix use of nbytes ahead * added TODO * add html test array data type * add array html repr utils * add generate_dataset_html method to io objects * add tests for array html repr * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix import style * update CHANGELOG * add test for base hdmfio --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Steph Prince <40640337+stephprince@users.noreply.github.com> Co-authored-by: Ryan Ly --- CHANGELOG.md | 1 + src/hdmf/backends/hdf5/h5tools.py | 29 +++++++++- src/hdmf/backends/io.py | 10 +++- src/hdmf/container.py | 32 +++++++---- src/hdmf/utils.py | 48 ++++++++++++++++ tests/unit/test_container.py | 94 +++++++++++++++++++++++++++++++ 6 files changed, 202 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c1c490089..b4f0fde80 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ ### Enhancements - Added support for expandable datasets of references for untyped and compound data types. 
@stephprince [#1188](https://github.com/hdmf-dev/hdmf/pull/1188) +- Improved html representation of data in `Containers` @h-mayorquin [#1100](https://github.com/hdmf-dev/hdmf/pull/1100) ### Bug fixes - Fixed inaccurate error message when validating reference data types. @stephprince [#1199](https://github.com/hdmf-dev/hdmf/pull/1199) diff --git a/src/hdmf/backends/hdf5/h5tools.py b/src/hdmf/backends/hdf5/h5tools.py index 36aeb7c8f..e9156dc50 100644 --- a/src/hdmf/backends/hdf5/h5tools.py +++ b/src/hdmf/backends/hdf5/h5tools.py @@ -19,7 +19,8 @@ from ...container import Container from ...data_utils import AbstractDataChunkIterator from ...spec import RefSpec, DtypeSpec, NamespaceCatalog -from ...utils import docval, getargs, popargs, get_data_shape, get_docval, StrDataset +from ...utils import (docval, getargs, popargs, get_data_shape, get_docval, StrDataset, + get_basic_array_info, generate_array_html_repr) from ..utils import NamespaceToBuilderHelper, WriteStatusTracker ROOT_NAME = 'root' @@ -1603,3 +1604,29 @@ def set_dataio(cls, **kwargs): data = H5DataIO(data) """ return H5DataIO.__init__(**kwargs) + + @staticmethod + def generate_dataset_html(dataset): + """Generates an html representation for a dataset for the HDF5IO class""" + + # get info from hdf5 dataset + compressed_size = dataset.id.get_storage_size() + if hasattr(dataset, "nbytes"): # TODO: Remove this after h5py minimal version is larger than 3.0 + uncompressed_size = dataset.nbytes + else: + uncompressed_size = dataset.size * dataset.dtype.itemsize + compression_ratio = uncompressed_size / compressed_size if compressed_size != 0 else "undefined" + + hdf5_info_dict = {"Chunk shape": dataset.chunks, + "Compression": dataset.compression, + "Compression opts": dataset.compression_opts, + "Compression ratio": compression_ratio} + + # get basic array info + array_info_dict = get_basic_array_info(dataset) + array_info_dict.update(hdf5_info_dict) + + # generate html repr + repr_html = 
generate_array_html_repr(array_info_dict, dataset, "HDF5 dataset") + + return repr_html diff --git a/src/hdmf/backends/io.py b/src/hdmf/backends/io.py index 35023066f..86fd25b26 100644 --- a/src/hdmf/backends/io.py +++ b/src/hdmf/backends/io.py @@ -5,7 +5,7 @@ from ..build import BuildManager, GroupBuilder from ..container import Container, HERDManager from .errors import UnsupportedOperation -from ..utils import docval, getargs, popargs +from ..utils import docval, getargs, popargs, get_basic_array_info, generate_array_html_repr from warnings import warn @@ -188,6 +188,14 @@ def close(self): ''' Close this HDMFIO object to further reading/writing''' pass + @staticmethod + def generate_dataset_html(dataset): + """Generates an html representation for a dataset""" + array_info_dict = get_basic_array_info(dataset) + repr_html = generate_array_html_repr(array_info_dict, dataset) + + return repr_html + def __enter__(self): return self diff --git a/src/hdmf/container.py b/src/hdmf/container.py index 7c450770a..8f961936f 100644 --- a/src/hdmf/container.py +++ b/src/hdmf/container.py @@ -12,7 +12,8 @@ import pandas as pd from .data_utils import DataIO, append_data, extend_data, AbstractDataChunkIterator -from .utils import docval, get_docval, getargs, ExtenderMeta, get_data_shape, popargs, LabelledDict +from .utils import (docval, get_docval, getargs, ExtenderMeta, get_data_shape, popargs, LabelledDict, + get_basic_array_info, generate_array_html_repr) from .term_set import TermSet, TermSetWrapper @@ -707,8 +708,6 @@ def _generate_html_repr(self, fields, level=0, access_code="", is_field=False): for index, item in enumerate(fields): access_code += f'[{index}]' html_repr += self._generate_field_html(index, item, level, access_code) - elif isinstance(fields, np.ndarray): - html_repr += self._generate_array_html(fields, level) else: pass @@ -724,18 +723,23 @@ def _generate_field_html(self, key, value, level, access_code): return f'
{key}: {value}
' - if hasattr(value, "generate_html_repr"): - html_content = value.generate_html_repr(level + 1, access_code) + is_array_data = isinstance(value, (np.ndarray, h5py.Dataset, DataIO)) or \ + (hasattr(value, "store") and hasattr(value, "shape")) # Duck typing for zarr array + if is_array_data: + html_content = self._generate_array_html(value, level + 1) + elif hasattr(value, "generate_html_repr"): + html_content = value.generate_html_repr(level + 1, access_code) elif hasattr(value, '__repr_html__'): html_content = value.__repr_html__() - - elif hasattr(value, "fields"): + elif hasattr(value, "fields"): # Note that h5py.Dataset has a fields attribute so there is an implicit order html_content = self._generate_html_repr(value.fields, level + 1, access_code, is_field=True) elif isinstance(value, (list, dict, np.ndarray)): html_content = self._generate_html_repr(value, level + 1, access_code, is_field=False) else: html_content = f'{value}' + + html_repr = ( f'
{key}' @@ -745,10 +749,18 @@ def _generate_field_html(self, key, value, level, access_code): return html_repr + def _generate_array_html(self, array, level): - """Generates HTML for a NumPy array.""" - str_ = str(array).replace("\n", "
") - return f'
{str_}
' + """Generates HTML for array data""" + + read_io = self.get_read_io() # if the Container was read from file, get IO object + if read_io is not None: + repr_html = read_io.generate_dataset_html(array) + else: + array_info_dict = get_basic_array_info(array) + repr_html = generate_array_html_repr(array_info_dict, array, "NumPy array") + + return f'
{repr_html}
' @staticmethod def __smart_str(v, num_indent): diff --git a/src/hdmf/utils.py b/src/hdmf/utils.py index 50db79c40..ccd3f0b0b 100644 --- a/src/hdmf/utils.py +++ b/src/hdmf/utils.py @@ -967,6 +967,54 @@ def is_ragged(data): return False +def get_basic_array_info(array): + def convert_bytes_to_str(bytes_size): + suffixes = ['bytes', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB'] + i = 0 + while bytes_size >= 1024 and i < len(suffixes)-1: + bytes_size /= 1024. + i += 1 + return f"{bytes_size:.2f} {suffixes[i]}" + + if hasattr(array, "nbytes"): # TODO: Remove this after h5py minimal version is larger than 3.0 + array_size_in_bytes = array.nbytes + else: + array_size_in_bytes = array.size * array.dtype.itemsize + array_size_repr = convert_bytes_to_str(array_size_in_bytes) + basic_array_info_dict = {"Data type": array.dtype, "Shape": array.shape, "Array size": array_size_repr} + + return basic_array_info_dict + +def generate_array_html_repr(backend_info_dict, array, dataset_type=None): + def html_table(item_dicts) -> str: + """ + Generates an html table from a dictionary + """ + report = '' + report += "" + for k, v in item_dicts.items(): + report += ( + f"" + f'' + f'' + f"" + ) + report += "" + report += "
{k}{v}
" + return report + + array_info_html = html_table(backend_info_dict) + repr_html = dataset_type + "
" + array_info_html if dataset_type is not None else array_info_html + + if hasattr(array, "nbytes"): # TODO: Remove this after h5py minimal version is larger than 3.0 + array_size = array.nbytes + else: + array_size = array.size * array.dtype.itemsize + array_is_small = array_size < 1024 * 0.1 # 10 % a kilobyte to display the array + if array_is_small: + repr_html += "
" + str(np.asarray(array)) + + return repr_html class LabelledDict(dict): """A dict wrapper that allows querying by an attribute of the values and running a callable on removed items. diff --git a/tests/unit/test_container.py b/tests/unit/test_container.py index 9ac81ba13..35d8e480c 100644 --- a/tests/unit/test_container.py +++ b/tests/unit/test_container.py @@ -8,6 +8,7 @@ from hdmf.utils import docval from hdmf.common import DynamicTable, VectorData, DynamicTableRegion from hdmf.backends.hdf5.h5tools import HDF5IO +from hdmf.backends.io import HDMFIO class Subcontainer(Container): @@ -423,6 +424,23 @@ def __init__(self, **kwargs): self.data = kwargs['data'] self.str = kwargs['str'] + class ContainerWithData(Container): + + __fields__ = ( + "data", + "str" + ) + + @docval( + {'name': "data", "doc": 'data', 'type': 'array_data', "default": None}, + {'name': "str", "doc": 'str', 'type': str, "default": None}, + + ) + def __init__(self, **kwargs): + super().__init__('test name') + self.data = kwargs['data'] + self.str = kwargs['str'] + def test_repr_html_(self): child_obj1 = Container('test child 1') obj1 = self.ContainerWithChildAndData(child=child_obj1, data=[1, 2, 3], str="hello") @@ -455,6 +473,82 @@ def test_repr_html_(self): 'class="field-value">hello' ) + def test_repr_html_array(self): + obj = self.ContainerWithData(data=np.array([1, 2, 3, 4], dtype=np.int64), str="hello") + expected_html_table = ( + 'class="container-fields">NumPy array
Data typeint64
Shape' + '(4,)
Array size32.00 bytes

[1 2 3 4]' + ) + self.assertIn(expected_html_table, obj._repr_html_()) + + def test_repr_html_array_large_arrays_not_displayed(self): + obj = self.ContainerWithData(data=np.arange(200, dtype=np.int64), str="hello") + expected_html_table = ( + 'class="container-fields">NumPy array
Data typeint64
Shape' + '(200,)
Array size1.56 KiB
' + ) + self.assertIn(expected_html_table, obj._repr_html_()) + + def test_repr_html_hdf5_dataset(self): + with HDF5IO('array_data.h5', mode='w') as io: + dataset = io._file.create_dataset(name='my_dataset', data=np.array([1, 2, 3, 4], dtype=np.int64)) + obj = self.ContainerWithData(data=dataset, str="hello") + obj.read_io = io + + expected_html_table = ( + 'class="container-fields">HDF5 dataset
Data typeint64
' + 'Shape(4,)
Array size' + '32.00 bytes
Chunk shape' + 'None
CompressionNone
Compression optsNone
Compression ratio1.0

[1 2 3 4]' + ) + + self.assertIn(expected_html_table, obj._repr_html_()) + + os.remove('array_data.h5') + + def test_repr_html_hdmf_io(self): + with HDF5IO('array_data.h5', mode='w') as io: + dataset = io._file.create_dataset(name='my_dataset', data=np.array([1, 2, 3, 4], dtype=np.int64)) + obj = self.ContainerWithData(data=dataset, str="hello") + + class OtherIO(HDMFIO): + + @staticmethod + def can_read(path): + pass + + def read_builder(self): + pass + + def write_builder(self, **kwargs): + pass + + def open(self): + pass + + def close(self): + pass + + obj.read_io = OtherIO() + + expected_html_table = ( + 'class="container-fields">
Data typeint64
' + 'Shape(4,)
Array size' + '32.00 bytes

[1 2 3 4]' + ) + + self.assertIn(expected_html_table, obj._repr_html_()) + + os.remove('array_data.h5') class TestData(TestCase):