Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Orange Table-specific HDF5Reader #6791

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
107 changes: 105 additions & 2 deletions Orange/data/io.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import contextlib
import csv
import json
import locale
import pickle
import re
Expand All @@ -18,19 +19,22 @@
from urllib.request import urlopen, Request
from pathlib import Path

import h5py
import numpy as np
import pandas as pd

import xlrd
import xlsxwriter
import openpyxl

from Orange.data import _io, Table, Domain, ContinuousVariable, update_origin
from Orange.data import _io, Table, Domain, ContinuousVariable, update_origin, DiscreteVariable, \
TimeVariable, StringVariable
from Orange.data import Compression, open_compressed, detect_encoding, \
isnastr, guess_data_type, sanitize_variable
from Orange.data.io_base import FileFormatBase, Flags, DataTableMixin, PICKLE_PROTOCOL

from Orange.util import flatten

from Orange.version import short_version as ORANGE_VERSION

# Support values longer than 128K (i.e. text contents features)
csv.field_size_limit(100*1024*1024)
Expand Down Expand Up @@ -511,3 +515,102 @@ def _suggest_filename(self, content_disposition):
matches = re.findall(r"filename\*?=(?:\"|.{0,10}?'[^']*')([^\"]+)",
content_disposition or '')
return urlunquote(matches[-1]) if matches else default_name

class HDF5Reader(FileFormat):
    """Reader and writer for Orange tables stored in HDF5 files.

    Layout of a file produced by `write_file`:

    - root attributes ``creator`` (always ``"Orange"``), ``Orange_version``,
      ``HDF5_Version`` and ``h5py_version``;
    - ``domain/<part>``: ``[name, type header]`` string pairs and
      ``domain/<part>_args``: JSON-encoded variable arguments, for each
      ``<part>`` in ``attributes``, ``class_vars`` and ``metas``;
    - ``X`` and, when it is not empty, ``Y``;
    - ``metas/<i>``: one single-column dataset per meta variable, written
      only when the meta array is not empty.
    """
    EXTENSIONS = ('.hdf5',)
    DESCRIPTION = 'Orange on-disk data'
    SUPPORT_COMPRESSED = False
    SUPPORT_SPARSE_DATA = False

    def read(self):
        """Read ``self.filename`` and return its contents as a `Table`.

        Raises:
            IOError: if the file is not recognised as an Orange HDF5 file,
                or if it declares a variable with an unknown type header.
        """
        def read_domain(sub):
            # Yield (name, type header, args dict) for each variable stored
            # in the given part of the domain.
            d = f['domain']
            subdomain = d[sub].asstr() if sub in d else []
            # Files without the `<sub>_args` dataset get empty arguments.
            subdomain_args = d[f'{sub}_args'].asstr() \
                if f'{sub}_args' in d else ['{}'] * len(subdomain)
            for attr, args in zip(subdomain, subdomain_args):
                yield attr[0], attr[1], json.loads(args)

        def make_var(name, header, args):
            # Pick the variable class whose TYPE_HEADERS contain `header`.
            var_cls = next((var for var in (ContinuousVariable,
                                            DiscreteVariable,
                                            StringVariable,
                                            TimeVariable)
                            if header in var.TYPE_HEADERS), None)
            if var_cls is None:
                raise IOError(
                    f"Unknown variable type '{header}' for variable '{name}'")
            # "attributes" is not passed to the constructor; it is assigned
            # on the created variable instead.
            new_var = var_cls(name, **{key: val for key, val in args.items()
                                       if key != "attributes"})
            new_var.attributes = args.get("attributes", {})
            return new_var

        def read_hdf5(name, as_str=False):
            # Return the named dataset (decoded to str when `as_str` is
            # set), or None when the file does not contain it.
            if name in f:
                if as_str:
                    return f[name].asstr()[:]
                return f[name]
            return None

        with h5py.File(self.filename, "r") as f:
            # Validate explicitly rather than with `assert`, which is
            # removed when Python runs with -O.
            try:
                is_orange_file = f.attrs['creator'] == "Orange"
            except KeyError:
                # No creator attribute: accept the file if it has a domain.
                is_orange_file = 'domain' in f
            if not is_orange_file:
                raise IOError(
                    f"'{self.filename}' is not an Orange HDF5 file")

            domain = Domain(*[[make_var(*args) for args in read_domain(subdomain)]
                              for subdomain in ['attributes', 'class_vars', 'metas']])

            X = read_hdf5("X")
            Y = read_hdf5("Y")

            if len(domain.metas) > 1:
                metas = np.hstack([read_hdf5(f'metas/{i}',
                                             isinstance(attr, StringVariable))
                                   for i, attr in enumerate(domain.metas)])
            elif len(domain.metas) == 1:
                metas = read_hdf5('metas/0',
                                  isinstance(domain.metas[0], StringVariable))
            else:
                metas = None

            # Build the table while the file is open: X, Y and metas may
            # still be h5py datasets backed by it.
            table = Table.from_numpy(domain, X, Y, metas)
            if isinstance(self.filename, str):
                table.name = path.splitext(path.split(self.filename)[-1])[0]
        self.set_table_metadata(self.filename, table)
        return table

    @classmethod
    def write_file(cls, filename, data):
        """Write the table `data` to the HDF5 file `filename`.

        Args:
            filename: path of the file to create; it is opened in ``'w'``
                mode.
            data: table providing ``domain``, ``X``, ``Y`` and ``metas``.
        """
        def parse(attr):
            # Describe a variable as (name, type header, JSON-able args).
            args = {"attributes": attr.attributes}
            if isinstance(attr, DiscreteVariable):
                args.update(values=attr.values)
            elif isinstance(attr, TimeVariable):
                # Checked before ContinuousVariable so that time variables
                # store their own arguments.
                args.update(have_date=attr.have_date,
                            have_time=attr.have_time)
            elif isinstance(attr, ContinuousVariable):
                args.update(number_of_decimals=attr.number_of_decimals)
            return attr.name, attr.TYPE_HEADERS[1], args

        with h5py.File(filename, 'w') as f:
            f.attrs['creator'] = "Orange"
            f.attrs['Orange_version'] = ORANGE_VERSION
            f.attrs['HDF5_Version'] = h5py.version.hdf5_version
            f.attrs['h5py_version'] = h5py.version.version
            str_dtype = h5py.string_dtype()
            for subdomain in ['attributes', 'class_vars', 'metas']:
                parsed = [parse(feature) for feature in getattr(data.domain, subdomain)]
                domain = np.array([[name, header] for name, header, _ in parsed],
                                  dtype=str_dtype)
                domain_args = np.array([json.dumps(args) for *_, args in parsed],
                                       dtype=str_dtype)
                f.create_dataset(f'domain/{subdomain}', data=domain)
                f.create_dataset(f'domain/{subdomain}_args', data=domain_args)
            f.create_dataset("X", data=data.X)
            if data.Y.size:
                f.create_dataset("Y", data=data.Y)
            if data.metas.size:
                for i, attr in enumerate(data.domain.metas):
                    column = data.metas[:, [i]]
                    if isinstance(attr, StringVariable):
                        col_type = str_dtype
                        col_data = column.astype(col_type)
                        # Missing strings are stored as empty strings.
                        col_data[pd.isnull(col_data)] = ""
                    else:
                        # Double precision: the 'f' type code means float32,
                        # which would truncate meta values (e.g. timestamps).
                        col_type = np.float64
                        col_data = column.astype(col_type)
                    f.create_dataset(f'metas/{i}', data=col_data, dtype=col_type)
        cls.write_table_metadata(filename, data)
13 changes: 12 additions & 1 deletion Orange/data/tests/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@

from Orange.data import ContinuousVariable, DiscreteVariable, StringVariable, \
TimeVariable, Domain, Table
from Orange.data.io import TabReader, ExcelReader
from Orange.data.io import TabReader, ExcelReader, HDF5Reader
from Orange.data.io_util import guess_data_type
from Orange.misc.collections import natural_sorted
from Orange.tests import named_file


class TestTableFilters(unittest.TestCase):
Expand Down Expand Up @@ -155,6 +156,16 @@ def test_roundtrip_xlsx(self):
finally:
os.remove(fname)

def test_roundtrip_hdf5(self):
    """Write the fixture table to an HDF5 file and compare what is read back."""
    expected = self.data
    with named_file('', suffix='.hdf5') as filename:
        HDF5Reader.write(filename, expected)
        restored = HDF5Reader(filename).read()

        # X and Y must come back unchanged.
        np.testing.assert_equal(restored.X, expected.X)
        np.testing.assert_equal(restored.Y, expected.Y)

        # The first two meta rows are compared directly; the first meta
        # value of the third row is expected to read back as "".
        np.testing.assert_equal(restored.metas[:2], expected.metas[:2])
        self.assertEqual(restored.metas[2, 0], "")

        np.testing.assert_equal(restored.domain, expected.domain)


if __name__ == "__main__":
unittest.main()
2 changes: 2 additions & 0 deletions requirements-core.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,5 @@ xgboost>=1.7.4,<2.1
xlrd>=1.2.0
# Writing Excel Files
xlsxwriter
# HDF5 binary data format
h5py