Skip to content

Commit

Permalink
improve html representation of datasets (#1100)
Browse files Browse the repository at this point in the history
* improve dev repr

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* address ruff

* add changelog

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add table representation for hdf5 info

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add test

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* ruff

* handle division by zero

* add zarr, array, hdf5 repr tests

* generalize array html table description

* remove zarr tests

* fix nbytes

* fix use of nbytes ahead

* added TODO

* add html test array data type

* add array html repr utils

* add generate_dataset_html method to io objects

* add tests for array html repr

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix import style

* update CHANGELOG

* add test for base hdmfio

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Steph Prince <[email protected]>
Co-authored-by: Ryan Ly <[email protected]>
  • Loading branch information
4 people authored Nov 5, 2024
1 parent 06a62b9 commit be602e5
Show file tree
Hide file tree
Showing 6 changed files with 202 additions and 12 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

### Enhancements
- Added support for expandable datasets of references for untyped and compound data types. @stephprince [#1188](https://github.com/hdmf-dev/hdmf/pull/1188)
- Improved html representation of data in `Containers` @h-mayorquin [#1100](https://github.com/hdmf-dev/hdmf/pull/1100)

### Bug fixes
- Fixed inaccurate error message when validating reference data types. @stephprince [#1199](https://github.com/hdmf-dev/hdmf/pull/1199)
Expand Down
29 changes: 28 additions & 1 deletion src/hdmf/backends/hdf5/h5tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
from ...container import Container
from ...data_utils import AbstractDataChunkIterator
from ...spec import RefSpec, DtypeSpec, NamespaceCatalog
from ...utils import docval, getargs, popargs, get_data_shape, get_docval, StrDataset
from ...utils import (docval, getargs, popargs, get_data_shape, get_docval, StrDataset,
get_basic_array_info, generate_array_html_repr)
from ..utils import NamespaceToBuilderHelper, WriteStatusTracker

ROOT_NAME = 'root'
Expand Down Expand Up @@ -1603,3 +1604,29 @@ def set_dataio(cls, **kwargs):
data = H5DataIO(data)
"""
return H5DataIO.__init__(**kwargs)

@staticmethod
def generate_dataset_html(dataset):
    """Build the html summary of an HDF5 dataset for the HDF5IO class.

    The table produced from ``get_basic_array_info`` is extended with the
    chunk shape, compression filter, compression options and compression
    ratio read from ``dataset``, and rendered under the "HDF5 dataset" label.
    """
    # Size on disk versus size in memory
    stored_bytes = dataset.id.get_storage_size()
    # TODO: Remove this after h5py minimal version is larger than 3.0
    raw_bytes = dataset.nbytes if hasattr(dataset, "nbytes") else dataset.size * dataset.dtype.itemsize

    # A storage size of zero would make the division undefined
    if stored_bytes != 0:
        ratio = raw_bytes / stored_bytes
    else:
        ratio = "undefined"

    storage_details = {
        "Chunk shape": dataset.chunks,
        "Compression": dataset.compression,
        "Compression opts": dataset.compression_opts,
        "Compression ratio": ratio,
    }

    # Generic rows first, HDF5-specific rows appended after them
    table_entries = get_basic_array_info(dataset)
    table_entries.update(storage_details)

    return generate_array_html_repr(table_entries, dataset, "HDF5 dataset")
10 changes: 9 additions & 1 deletion src/hdmf/backends/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from ..build import BuildManager, GroupBuilder
from ..container import Container, HERDManager
from .errors import UnsupportedOperation
from ..utils import docval, getargs, popargs
from ..utils import docval, getargs, popargs, get_basic_array_info, generate_array_html_repr
from warnings import warn


Expand Down Expand Up @@ -188,6 +188,14 @@ def close(self):
''' Close this HDMFIO object to further reading/writing'''
pass

@staticmethod
def generate_dataset_html(dataset):
    """Return the backend-agnostic html representation of a dataset.

    Only the basic array information is reported and no dataset type label
    is passed to the html generator.
    """
    return generate_array_html_repr(get_basic_array_info(dataset), dataset)

def __enter__(self):
return self

Expand Down
32 changes: 22 additions & 10 deletions src/hdmf/container.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
import pandas as pd

from .data_utils import DataIO, append_data, extend_data, AbstractDataChunkIterator
from .utils import docval, get_docval, getargs, ExtenderMeta, get_data_shape, popargs, LabelledDict
from .utils import (docval, get_docval, getargs, ExtenderMeta, get_data_shape, popargs, LabelledDict,
get_basic_array_info, generate_array_html_repr)

from .term_set import TermSet, TermSetWrapper

Expand Down Expand Up @@ -707,8 +708,6 @@ def _generate_html_repr(self, fields, level=0, access_code="", is_field=False):
for index, item in enumerate(fields):
access_code += f'[{index}]'
html_repr += self._generate_field_html(index, item, level, access_code)
elif isinstance(fields, np.ndarray):
html_repr += self._generate_array_html(fields, level)
else:
pass

Expand All @@ -724,18 +723,23 @@ def _generate_field_html(self, key, value, level, access_code):
return f'<div style="margin-left: {level * 20}px;" class="container-fields"><span class="field-key"' \
f' title="{access_code}">{key}: </span><span class="field-value">{value}</span></div>'

if hasattr(value, "generate_html_repr"):
html_content = value.generate_html_repr(level + 1, access_code)
is_array_data = isinstance(value, (np.ndarray, h5py.Dataset, DataIO)) or \
(hasattr(value, "store") and hasattr(value, "shape")) # Duck typing for zarr array

if is_array_data:
html_content = self._generate_array_html(value, level + 1)
elif hasattr(value, "generate_html_repr"):
html_content = value.generate_html_repr(level + 1, access_code)
elif hasattr(value, '__repr_html__'):
html_content = value.__repr_html__()

elif hasattr(value, "fields"):
elif hasattr(value, "fields"): # Note that h5py.Dataset has a fields attribute so there is an implicit order
html_content = self._generate_html_repr(value.fields, level + 1, access_code, is_field=True)
elif isinstance(value, (list, dict, np.ndarray)):
html_content = self._generate_html_repr(value, level + 1, access_code, is_field=False)
else:
html_content = f'<span class="field-key">{value}</span>'


html_repr = (
f'<details><summary style="display: list-item; margin-left: {level * 20}px;" '
f'class="container-fields field-key" title="{access_code}"><b>{key}</b></summary>'
Expand All @@ -745,10 +749,18 @@ def _generate_field_html(self, key, value, level, access_code):

return html_repr


def _generate_array_html(self, array, level):
    """Generates HTML for array data

    When the Container has a read IO object, the representation is delegated
    to that object's ``generate_dataset_html``. Otherwise the basic array
    information is rendered under the "NumPy array" label.

    :param array: the array data to describe
    :param level: nesting depth; the wrapping div is indented by 20px per level
    :return: html string wrapped in a ``container-fields`` div
    """
    read_io = self.get_read_io()  # if the Container was read from file, get IO object
    if read_io is not None:
        repr_html = read_io.generate_dataset_html(array)
    else:
        array_info_dict = get_basic_array_info(array)
        repr_html = generate_array_html_repr(array_info_dict, array, "NumPy array")

    return f'<div style="margin-left: {level * 20}px;" class="container-fields">{repr_html}</div>'

@staticmethod
def __smart_str(v, num_indent):
Expand Down
48 changes: 48 additions & 0 deletions src/hdmf/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -967,6 +967,54 @@ def is_ragged(data):

return False

def get_basic_array_info(array):
    """Collect the data type, shape and human-readable size of an array.

    :param array: array-like object exposing ``dtype`` and ``shape``, plus either
                  ``nbytes`` or ``size``
    :return: dict with the keys "Data type", "Shape" and "Array size", in that order
    """
    def convert_bytes_to_str(bytes_size):
        """Format a byte count with two decimals using binary (1024-based) units."""
        units = ('bytes', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB')
        unit_index = 0
        last_index = len(units) - 1
        # Step up one unit at a time, stopping at the largest unit available
        while unit_index < last_index and bytes_size >= 1024:
            bytes_size = bytes_size / 1024.
            unit_index += 1
        return f"{bytes_size:.2f} {units[unit_index]}"

    # TODO: Remove this after h5py minimal version is larger than 3.0
    total_bytes = array.nbytes if hasattr(array, "nbytes") else array.size * array.dtype.itemsize

    return {
        "Data type": array.dtype,
        "Shape": array.shape,
        "Array size": convert_bytes_to_str(total_bytes),
    }

def generate_array_html_repr(backend_info_dict, array, dataset_type=None):
    """Generate an html representation of an array and its descriptive info.

    The entries of ``backend_info_dict`` are rendered as a two-column table.
    Arrays smaller than roughly a tenth of a KiB are also printed after the table.
    All values and the array text are HTML-escaped, so strings such as the
    dtype ``<U3`` or data containing ``<``/``&`` cannot corrupt the markup.

    :param backend_info_dict: mapping of row label to value shown in the table
    :param array: array-like object exposing either ``nbytes`` or ``size`` and ``dtype``
    :param dataset_type: optional label placed above the table (inserted as-is)
    :return: html string
    """
    # Local import keeps this function self-contained with respect to module imports
    import html

    def escape(value) -> str:
        """Escape text for use as html element content (quotes are left untouched)."""
        return html.escape(str(value), quote=False)

    def html_table(item_dicts) -> str:
        """
        Generates an html table from a dictionary
        """
        rows = "".join(
            f"<tr>"
            f'<th style="text-align: left">{escape(k)}</th>'
            f'<td style="text-align: left">{escape(v)}</td>'
            f"</tr>"
            for k, v in item_dicts.items()
        )
        return f'<table class="data-info"><tbody>{rows}</tbody></table>'

    array_info_html = html_table(backend_info_dict)
    repr_html = dataset_type + "<br>" + array_info_html if dataset_type is not None else array_info_html

    if hasattr(array, "nbytes"):  # TODO: Remove this after h5py minimal version is larger than 3.0
        array_size = array.nbytes
    else:
        array_size = array.size * array.dtype.itemsize
    array_is_small = array_size < 1024 * 0.1  # 10 % a kilobyte to display the array
    if array_is_small:
        repr_html += "<br>" + escape(np.asarray(array))

    return repr_html

class LabelledDict(dict):
"""A dict wrapper that allows querying by an attribute of the values and running a callable on removed items.
Expand Down
94 changes: 94 additions & 0 deletions tests/unit/test_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from hdmf.utils import docval
from hdmf.common import DynamicTable, VectorData, DynamicTableRegion
from hdmf.backends.hdf5.h5tools import HDF5IO
from hdmf.backends.io import HDMFIO


class Subcontainer(Container):
Expand Down Expand Up @@ -423,6 +424,23 @@ def __init__(self, **kwargs):
self.data = kwargs['data']
self.str = kwargs['str']

class ContainerWithData(Container):
    """Test Container with an optional array field and an optional string field."""

    __fields__ = ("data", "str")

    @docval(
        dict(name="data", doc='data', type='array_data', default=None),
        dict(name="str", doc='str', type=str, default=None),
    )
    def __init__(self, **kwargs):
        super().__init__('test name')
        # Assign the fields in declaration order
        for field_name in ("data", "str"):
            setattr(self, field_name, kwargs[field_name])

def test_repr_html_(self):
child_obj1 = Container('test child 1')
obj1 = self.ContainerWithChildAndData(child=child_obj1, data=[1, 2, 3], str="hello")
Expand Down Expand Up @@ -455,6 +473,82 @@ def test_repr_html_(self):
'class="field-value">hello</span></div></div>'
)

def test_repr_html_array(self):
    """A small in-memory array is described in a table and its values are printed."""
    values = np.array([1, 2, 3, 4], dtype=np.int64)
    container = self.ContainerWithData(data=values, str="hello")
    # One literal per table row; the concatenation is the expected html fragment
    expected_fragment = (
        'class="container-fields">NumPy array<br><table class="data-info"><tbody>'
        '<tr><th style="text-align: left">Data type</th><td style="text-align: left">int64</td></tr>'
        '<tr><th style="text-align: left">Shape</th><td style="text-align: left">(4,)</td></tr>'
        '<tr><th style="text-align: left">Array size</th><td style="text-align: left">32.00 bytes</td></tr>'
        '</tbody></table><br>[1 2 3 4]'
    )
    rendered = container._repr_html_()
    self.assertIn(expected_fragment, rendered)

def test_repr_html_array_large_arrays_not_displayed(self):
    """A larger array is described in a table but its values are not printed."""
    values = np.arange(200, dtype=np.int64)
    container = self.ContainerWithData(data=values, str="hello")
    # The table is immediately followed by the closing tags: no array dump in between
    expected_fragment = (
        'class="container-fields">NumPy array<br><table class="data-info"><tbody>'
        '<tr><th style="text-align: left">Data type</th><td style="text-align: left">int64</td></tr>'
        '<tr><th style="text-align: left">Shape</th><td style="text-align: left">(200,)</td></tr>'
        '<tr><th style="text-align: left">Array size</th><td style="text-align: left">1.56 KiB</td></tr>'
        '</tbody></table></div></details>'
    )
    rendered = container._repr_html_()
    self.assertIn(expected_fragment, rendered)

def test_repr_html_hdf5_dataset(self):
    """An HDF5-backed dataset is rendered through the HDF5IO html representation.

    The scratch file is removed in a ``finally`` clause so that a failing
    assertion does not leave ``array_data.h5`` behind for later tests.
    """
    path = 'array_data.h5'
    try:
        with HDF5IO(path, mode='w') as io:
            dataset = io._file.create_dataset(name='my_dataset', data=np.array([1, 2, 3, 4], dtype=np.int64))
            obj = self.ContainerWithData(data=dataset, str="hello")
            obj.read_io = io

            expected_html_table = (
                'class="container-fields">HDF5 dataset<br><table class="data-info"><tbody><tr><th style="text-align: '
                'left">Data type</th><td style="text-align: left">int64</td></tr><tr><th style="text-align: left">'
                'Shape</th><td style="text-align: left">(4,)</td></tr><tr><th style="text-align: left">Array size'
                '</th><td style="text-align: left">32.00 bytes</td></tr><tr><th style="text-align: left">Chunk shape'
                '</th><td style="text-align: left">None</td></tr><tr><th style="text-align: left">Compression</th><td '
                'style="text-align: left">None</td></tr><tr><th style="text-align: left">Compression opts</th><td '
                'style="text-align: left">None</td></tr><tr><th style="text-align: left">Compression ratio</th><td '
                'style="text-align: left">1.0</td></tr></tbody></table><br>[1 2 3 4]'
            )

            # The repr must be generated while the file (and thus the dataset) is still open
            self.assertIn(expected_html_table, obj._repr_html_())
    finally:
        # Guarded so a failure to create the file is not masked by a removal error
        if os.path.exists(path):
            os.remove(path)

def test_repr_html_hdmf_io(self):
    """A generic HDMFIO subclass falls back to the base dataset html representation.

    The scratch file is removed in a ``finally`` clause so that a failing
    assertion does not leave ``array_data.h5`` behind for later tests.
    """
    path = 'array_data.h5'
    try:
        with HDF5IO(path, mode='w') as io:
            dataset = io._file.create_dataset(name='my_dataset', data=np.array([1, 2, 3, 4], dtype=np.int64))
            obj = self.ContainerWithData(data=dataset, str="hello")

            class OtherIO(HDMFIO):
                """IO stub that does not define its own generate_dataset_html."""

                @staticmethod
                def can_read(path):
                    pass

                def read_builder(self):
                    pass

                def write_builder(self, **kwargs):
                    pass

                def open(self):
                    pass

                def close(self):
                    pass

            obj.read_io = OtherIO()

            # No dataset type label and no HDF5-specific rows are expected here
            expected_html_table = (
                'class="container-fields"><table class="data-info"><tbody><tr><th style="text-align: '
                'left">Data type</th><td style="text-align: left">int64</td></tr><tr><th style="text-align: left">'
                'Shape</th><td style="text-align: left">(4,)</td></tr><tr><th style="text-align: left">Array size'
                '</th><td style="text-align: left">32.00 bytes</td></tr></tbody></table><br>[1 2 3 4]'
            )

            # The repr must be generated while the file (and thus the dataset) is still open
            self.assertIn(expected_html_table, obj._repr_html_())
    finally:
        # Guarded so a failure to create the file is not masked by a removal error
        if os.path.exists(path):
            os.remove(path)

class TestData(TestCase):

Expand Down

0 comments on commit be602e5

Please sign in to comment.