From 8bf4bf93488bf381a584d128958a604dffa4a473 Mon Sep 17 00:00:00 2001 From: Stuart Read Date: Thu, 7 Mar 2024 11:32:38 -0600 Subject: [PATCH 1/9] io: Table-only version of Orange on-disk format (HDF5) --- Orange/data/io.py | 95 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 94 insertions(+), 1 deletion(-) diff --git a/Orange/data/io.py b/Orange/data/io.py index 0959bb725c2..4c3de870043 100644 --- a/Orange/data/io.py +++ b/Orange/data/io.py @@ -1,5 +1,6 @@ import contextlib import csv +import json import locale import pickle import re @@ -18,13 +19,15 @@ from urllib.request import urlopen, Request from pathlib import Path +import h5py import numpy as np import xlrd import xlsxwriter import openpyxl -from Orange.data import _io, Table, Domain, ContinuousVariable, update_origin +from Orange.data import _io, Table, Domain, ContinuousVariable, update_origin, DiscreteVariable, TimeVariable, \ + StringVariable from Orange.data import Compression, open_compressed, detect_encoding, \ isnastr, guess_data_type, sanitize_variable from Orange.data.io_base import FileFormatBase, Flags, DataTableMixin, PICKLE_PROTOCOL @@ -511,3 +514,93 @@ def _suggest_filename(self, content_disposition): matches = re.findall(r"filename\*?=(?:\"|.{0,10}?'[^']*')([^\"]+)", content_disposition or '') return urlunquote(matches[-1]) if matches else default_name + +class HDF5Reader(FileFormat): + """Reader for Orange HDF5 files""" + EXTENSIONS = ('.hdf5',) + DESCRIPTION = 'Orange on-disk data' + SUPPORT_COMPRESSED = False + SUPPORT_SPARSE_DATA = False + + def read(self): + h5file = f = h5py.File(self.filename, "r") + + def read_domain(sub): + d = f['domain'] + subdomain = d[sub].asstr() if sub in d else [] + subdomain_args = d[f'{sub}_args'].asstr() \ + if f'{sub}_args' in d else ['{}'] * len(subdomain) + for attr, args in zip(subdomain, subdomain_args): + yield attr[0], attr[1], json.loads(args) + + def make_var(name, header, args): + var_cls = [var for var in (ContinuousVariable, + DiscreteVariable, + StringVariable, + TimeVariable) if header in var.TYPE_HEADERS][0] + new_var = var_cls(name, **{key: val for key, val in args.items() + if key != "attributes"}) + new_var.attributes = args.get("attributes", {}) + return new_var + + def read_hdf5(name, as_str=False): + if name in f: + if as_str: + return f[name].asstr()[:] + return f[name] + return None + + assert 'domain' in f + + domain = Domain(*[[make_var(*args) for args in read_domain(subdomain)] + for subdomain in ['attributes', 'class_vars', 'metas']]) + + X = read_hdf5("X") + Y = read_hdf5("Y") + + + if len(domain.metas) > 1: + metas = np.hstack([read_hdf5(f'metas/{i}', + isinstance(attr, StringVariable)) + for i, attr in enumerate(domain.metas)]) + elif len(domain.metas) == 1: + metas = read_hdf5('metas/0', + isinstance(domain.metas[0], StringVariable) + ) + else: + metas = None + + table = Table.from_numpy(domain, X, Y, metas) + if isinstance(self.filename, str): + table.name = path.splitext(path.split(self.filename)[-1])[0] + + return table + + @classmethod + def write_file(cls, filename, data): + def parse(attr): + params = (attr.name, attr.TYPE_HEADERS[1], {"attributes": attr.attributes}) + if isinstance(attr, DiscreteVariable): + params[2].update(values=attr.values) + elif isinstance(attr, TimeVariable): + params[2].update(have_date=attr.have_date, + have_time=attr.have_time) + elif isinstance(attr, ContinuousVariable): + params[2].update(number_of_decimals=attr.number_of_decimals) + return params + + with h5py.File(filename, 'w') as f: + for subdomain in ['attributes', 'class_vars', 'metas']: + parsed = [parse(feature) for feature in getattr(data.domain, subdomain)] + domain = np.array([[name, header] for name, header, _ in parsed], 'S') + domain_args = np.array([json.dumps(args) for *_, args in parsed], 'S') + f.create_dataset(f'domain/{subdomain}', data=domain) + f.create_dataset(f'domain/{subdomain}_args', data=domain_args) + f.create_dataset("X", data=data.X) + if data.Y.size: + f.create_dataset("Y", data=data.Y) + if data.metas.size: + for i, attr in enumerate(data.domain.metas): + col_type = 'S' if isinstance(attr, StringVariable) else 'f' + col_data = data.metas[:, [i]].astype(col_type) + f.create_dataset(f'metas/{i}', data=col_data) From e4a870d725d1cba6d2cc71ecedc3d4b64b29859e Mon Sep 17 00:00:00 2001 From: Stuart Read Date: Thu, 7 Mar 2024 11:36:26 -0600 Subject: [PATCH 2/9] Add h5py dependency --- requirements-core.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements-core.txt b/requirements-core.txt index b78f2934e56..a066a2dbc14 100644 --- a/requirements-core.txt +++ b/requirements-core.txt @@ -26,3 +26,5 @@ xgboost>=1.7.4,<2.1 xlrd>=1.2.0 # Writing Excel Files xlsxwriter +# HDF5 binary data format +h5py \ No newline at end of file From 9a52609562acf3539a65ab8f5bfcf3c44ec377f2 Mon Sep 17 00:00:00 2001 From: Stuart Read Date: Thu, 7 Mar 2024 11:55:16 -0600 Subject: [PATCH 3/9] io.test: roundtrip test for Orange HDF5 --- Orange/data/tests/test_io.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/Orange/data/tests/test_io.py b/Orange/data/tests/test_io.py index 01187f26b30..21dec385dfa 100644 --- a/Orange/data/tests/test_io.py +++ b/Orange/data/tests/test_io.py @@ -6,9 +6,10 @@ from Orange.data import ContinuousVariable, DiscreteVariable, StringVariable, \ TimeVariable, Domain, Table -from Orange.data.io import TabReader, ExcelReader +from Orange.data.io import TabReader, ExcelReader, HDF5Reader from Orange.data.io_util import guess_data_type from Orange.misc.collections import natural_sorted +from Orange.tests import named_file class TestTableFilters(unittest.TestCase): @@ -155,6 +156,16 @@ def test_roundtrip_xlsx(self): finally: os.remove(fname) + def test_roundtrip_hdf5(self): + with named_file('', suffix='.hdf5') as fn: + HDF5Reader.write(fn, self.data) + data = HDF5Reader(fn).read() + np.testing.assert_equal(data.X, self.data.X) + np.testing.assert_equal(data.Y, self.data.Y) + np.testing.assert_equal(data.metas[:2], self.data.metas[:2]) + self.assertEqual(data.metas[2, 0], "") + np.testing.assert_equal(data.domain, self.data.domain) + if __name__ == "__main__": unittest.main() From 2ef7b8d41a192870070ef1269f2405e1a39b6cf3 Mon Sep 17 00:00:00 2001 From: Stuart Read Date: Thu, 7 Mar 2024 14:15:24 -0600 Subject: [PATCH 4/9] Fix NaN handling for strings (ref #6670) In fixing this, switched string handling from fixed-length to variable length https://docs.h5py.org/en/stable/special.html#variable-length-strings --- Orange/data/io.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Orange/data/io.py b/Orange/data/io.py index 4c3de870043..8de8a08031d 100644 --- a/Orange/data/io.py +++ b/Orange/data/io.py @@ -21,6 +21,7 @@ import h5py import numpy as np +import pandas as pd import xlrd import xlsxwriter @@ -601,6 +602,8 @@ def parse(attr): f.create_dataset("Y", data=data.Y) if data.metas.size: for i, attr in enumerate(data.domain.metas): - col_type = 'S' if isinstance(attr, StringVariable) else 'f' + col_type = h5py.string_dtype() if isinstance(attr, StringVariable) else 'f' col_data = data.metas[:, [i]].astype(col_type) - f.create_dataset(f'metas/{i}', data=col_data) + if col_type is not 'f': + col_data[pd.isnull(col_data)] = "" + f.create_dataset(f'metas/{i}', data=col_data, dtype=col_type) From 07e6d916f3b59822e902d67a5e7108f1c9109af5 Mon Sep 17 00:00:00 2001 From: Stuart Read Date: Fri, 26 Apr 2024 11:00:07 -0600 Subject: [PATCH 5/9] Add Orange identifier and versions as attributes Fix small 'is not' bug --- Orange/data/io.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/Orange/data/io.py b/Orange/data/io.py index 8de8a08031d..d1f7bfa3a79 100644 --- a/Orange/data/io.py +++ b/Orange/data/io.py @@ -34,7 +34,7 @@ from Orange.data.io_base import FileFormatBase, Flags, DataTableMixin, PICKLE_PROTOCOL from Orange.util import flatten - +from Orange.version import short_version as ORANGE_VERSION # Support values longer than 128K (i.e. text contents features) csv.field_size_limit(100*1024*1024) @@ -551,7 +551,10 @@ def read_hdf5(name, as_str=False): return f[name] return None - assert 'domain' in f + try: + assert f.attrs['creator'] == "Orange" + except KeyError: + assert 'domain' in f domain = Domain(*[[make_var(*args) for args in read_domain(subdomain)] for subdomain in ['attributes', 'class_vars', 'metas']]) @@ -591,6 +594,10 @@ def parse(attr): return params with h5py.File(filename, 'w') as f: + f.attrs['creator'] = "Orange" + f.attrs['Orange_version'] = ORANGE_VERSION + f.attrs['HDF5_Version'] = h5py.version.hdf5_version + f.attrs['h5py_version'] = h5py.version.version for subdomain in ['attributes', 'class_vars', 'metas']: parsed = [parse(feature) for feature in getattr(data.domain, subdomain)] domain = np.array([[name, header] for name, header, _ in parsed], 'S') @@ -604,6 +611,6 @@ def parse(attr): for i, attr in enumerate(data.domain.metas): col_type = h5py.string_dtype() if isinstance(attr, StringVariable) else 'f' col_data = data.metas[:, [i]].astype(col_type) - if col_type is not 'f': + if col_type != 'f': col_data[pd.isnull(col_data)] = "" f.create_dataset(f'metas/{i}', data=col_data, dtype=col_type) From 1f4193a97d0b15790c521264831ec701215b74e3 Mon Sep 17 00:00:00 2001 From: Stuart Read Date: Thu, 2 May 2024 13:55:34 -0600 Subject: [PATCH 6/9] io: Use existing .metadata sidecar to hold Table.attributes --- Orange/data/io.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Orange/data/io.py b/Orange/data/io.py index d1f7bfa3a79..298b4e1ba36 100644 --- a/Orange/data/io.py +++ b/Orange/data/io.py @@ -577,7 +577,7 @@ def read_hdf5(name, as_str=False): table = Table.from_numpy(domain, X, Y, metas) if isinstance(self.filename, str): table.name = path.splitext(path.split(self.filename)[-1])[0] - + self.set_table_metadata(self.filename, table) return table @classmethod @@ -614,3 +614,4 @@ def parse(attr): if col_type != 'f': col_data[pd.isnull(col_data)] = "" f.create_dataset(f'metas/{i}', data=col_data, dtype=col_type) + cls.write_table_metadata(filename, data) From a829dbee8a0f4233a62bbd6622c675aea573cbf1 Mon Sep 17 00:00:00 2001 From: Stuart Read Date: Thu, 2 May 2024 14:01:55 -0600 Subject: [PATCH 7/9] io: use context manager for HDF5 reading --- Orange/data/io.py | 52 +++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/Orange/data/io.py b/Orange/data/io.py index 298b4e1ba36..534fb8b0170 100644 --- a/Orange/data/io.py +++ b/Orange/data/io.py @@ -524,8 +524,6 @@ class HDF5Reader(FileFormat): SUPPORT_SPARSE_DATA = False def read(self): - h5file = f = h5py.File(self.filename, "r") - def read_domain(sub): d = f['domain'] subdomain = d[sub].asstr() if sub in d else [] @@ -551,32 +549,32 @@ def read_hdf5(name, as_str=False): return f[name] return None - try: - assert f.attrs['creator'] == "Orange" - except KeyError: - assert 'domain' in f - - domain = Domain(*[[make_var(*args) for args in read_domain(subdomain)] - for subdomain in ['attributes', 'class_vars', 'metas']]) - - X = read_hdf5("X") - Y = read_hdf5("Y") - - - if len(domain.metas) > 1: - metas = np.hstack([read_hdf5(f'metas/{i}', - isinstance(attr, StringVariable)) - for i, attr in enumerate(domain.metas)]) - elif len(domain.metas) == 1: - metas = read_hdf5('metas/0', - isinstance(domain.metas[0], StringVariable) - ) - else: - metas = None + with h5py.File(self.filename, "r") as f: + try: + assert f.attrs['creator'] == "Orange" + except KeyError: + assert 'domain' in f + + domain = Domain(*[[make_var(*args) for args in read_domain(subdomain)] + for subdomain in ['attributes', 'class_vars', 'metas']]) + + X = read_hdf5("X") + Y = read_hdf5("Y") + + if len(domain.metas) > 1: + metas = np.hstack([read_hdf5(f'metas/{i}', + isinstance(attr, StringVariable)) + for i, attr in enumerate(domain.metas)]) + elif len(domain.metas) == 1: + metas = read_hdf5('metas/0', + isinstance(domain.metas[0], StringVariable) + ) + else: + metas = None - table = Table.from_numpy(domain, X, Y, metas) - if isinstance(self.filename, str): - table.name = path.splitext(path.split(self.filename)[-1])[0] + table = Table.from_numpy(domain, X, Y, metas) + if isinstance(self.filename, str): + table.name = path.splitext(path.split(self.filename)[-1])[0] self.set_table_metadata(self.filename, table) return table From 7db972784da1bdbc7d26800593bc961c3a2bc71e Mon Sep 17 00:00:00 2001 From: Stuart Read Date: Mon, 27 May 2024 09:21:02 -0600 Subject: [PATCH 8/9] lint --- Orange/data/io.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Orange/data/io.py b/Orange/data/io.py index 534fb8b0170..4f0e7b29f05 100644 --- a/Orange/data/io.py +++ b/Orange/data/io.py @@ -27,8 +27,8 @@ import xlsxwriter import openpyxl -from Orange.data import _io, Table, Domain, ContinuousVariable, update_origin, DiscreteVariable, TimeVariable, \ - StringVariable +from Orange.data import _io, Table, Domain, ContinuousVariable, update_origin, DiscreteVariable, \ + TimeVariable, StringVariable from Orange.data import Compression, open_compressed, detect_encoding, \ isnastr, guess_data_type, sanitize_variable from Orange.data.io_base import FileFormatBase, Flags, DataTableMixin, PICKLE_PROTOCOL From c872d574db395c91986425c9439d0fbff9c5f789 Mon Sep 17 00:00:00 2001 From: Stuart Read Date: Fri, 27 Sep 2024 14:36:30 -0600 Subject: [PATCH 9/9] Use h5py.string_dtype() for all string arrays --- Orange/data/io.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Orange/data/io.py b/Orange/data/io.py index 4f0e7b29f05..97251bd3d0a 100644 --- a/Orange/data/io.py +++ b/Orange/data/io.py @@ -596,10 +596,11 @@ def parse(attr): f.attrs['Orange_version'] = ORANGE_VERSION f.attrs['HDF5_Version'] = h5py.version.hdf5_version f.attrs['h5py_version'] = h5py.version.version + str_dtype = h5py.string_dtype() for subdomain in ['attributes', 'class_vars', 'metas']: parsed = [parse(feature) for feature in getattr(data.domain, subdomain)] - domain = np.array([[name, header] for name, header, _ in parsed], 'S') - domain_args = np.array([json.dumps(args) for *_, args in parsed], 'S') + domain = np.array([[name, header] for name, header, _ in parsed], dtype=str_dtype) + domain_args = np.array([json.dumps(args) for *_, args in parsed], dtype=str_dtype) f.create_dataset(f'domain/{subdomain}', data=domain) f.create_dataset(f'domain/{subdomain}_args', data=domain_args) f.create_dataset("X", data=data.X) @@ -607,7 +608,7 @@ def parse(attr): f.create_dataset("Y", data=data.Y) if data.metas.size: for i, attr in enumerate(data.domain.metas): - col_type = h5py.string_dtype() if isinstance(attr, StringVariable) else 'f' + col_type = str_dtype if isinstance(attr, StringVariable) else 'f' col_data = data.metas[:, [i]].astype(col_type) if col_type != 'f': col_data[pd.isnull(col_data)] = ""