From 53208af32a6c7c018d041f7961a2831ccdcec7cf Mon Sep 17 00:00:00 2001 From: Agisilaos Kounelis <36283973+kounelisagis@users.noreply.github.com> Date: Tue, 29 Oct 2024 12:54:07 +0200 Subject: [PATCH] Fix GroupMetadata backwards compatibility (#2102) --- tiledb/cc/group.cc | 4 +- tiledb/group.py | 15 +++++ tiledb/tests/test_group.py | 132 +++++++++++++++++++++++++++++++++++++ 3 files changed, 150 insertions(+), 1 deletion(-) diff --git a/tiledb/cc/group.cc b/tiledb/cc/group.cc index 130b9266fb..dcdf918e6b 100644 --- a/tiledb/cc/group.cc +++ b/tiledb/cc/group.cc @@ -26,7 +26,9 @@ void put_metadata_numpy(Group &group, const std::string &key, py::array value) { throw py::type_error("Only 1D Numpy arrays can be stored as metadata"); py::size_t ncells = get_ncells(value.dtype()); - if (ncells != 1) + // we can't store multi-cell arrays as metadata + // e.g. an array of strings containing strings of more than one character + if (ncells != 1 && value.size() > 1) throw py::type_error("Unsupported dtype '" + std::string(py::str(value.dtype())) + "' for metadata"); diff --git a/tiledb/group.py b/tiledb/group.py index f32c394228..ff1366adef 100644 --- a/tiledb/group.py +++ b/tiledb/group.py @@ -119,6 +119,16 @@ def __setitem__(self, key: str, value: GroupMetadataValueType): # If the value is not a 1D ndarray, store its associated shape. # The value's shape will be stored as separate metadata with the correct prefix. self.__setitem__(f"{Group._NP_SHAPE_PREFIX}{key}", value.shape) + elif isinstance(value, np.generic): + tiledb_type = DataType.from_numpy(value.dtype).tiledb_type + if tiledb_type in (lt.DataType.BLOB, lt.DataType.CHAR): + put_metadata(key, tiledb_type, len(value), value) + elif tiledb_type == lt.DataType.STRING_UTF8: + put_metadata( + key, lt.DataType.STRING_UTF8, len(value), value.encode("UTF-8") + ) + else: + put_metadata(key, tiledb_type, 1, value) else: from .metadata import pack_metadata_val @@ -141,11 +151,16 @@ def __getitem__(self, key: str, include_type=False) -> GroupMetadataValueType: if self._group._has_metadata(key): data, tdb_type = self._group._get_metadata(key, False) + dtype = DataType.from_tiledb(tdb_type).np_dtype + # we return all int and float values as numpy scalars + if dtype.kind in ("i", "f") and not isinstance(data, tuple): + data = np.dtype(dtype).type(data) elif self._group._has_metadata(f"{Group._NP_DATA_PREFIX}{key}"): data, tdb_type = self._group._get_metadata( f"{Group._NP_DATA_PREFIX}{key}", True ) # reshape numpy array back to original shape, if needed + # this will not be found in any case for TileDB-Py <= 0.32.3. shape_key = f"{Group._NP_SHAPE_PREFIX}{key}" if self._group._has_metadata(shape_key): shape, tdb_type = self._group._get_metadata(shape_key, False) diff --git a/tiledb/tests/test_group.py b/tiledb/tests/test_group.py index 659b4f4826..4fe3536b0e 100644 --- a/tiledb/tests/test_group.py +++ b/tiledb/tests/test_group.py @@ -1,5 +1,8 @@ +import base64 +import io import os import pathlib +import tarfile import numpy as np import pytest @@ -762,3 +765,132 @@ def test_bytes_metadata(self, capfd): grp.meta.dump() assert_captured(capfd, "Type: DataType.BLOB") grp.close() + + def test_group_metadata_backwards_compat(self): + # This test ensures that metadata written with the TileDB-Py 0.32.3 + # will be read correctly in the future versions. + + # === The following code creates a group with metadata using the current version of TileDB-Py === + path_new = self.path("new_group") + tiledb.Group.create(path_new) + group = tiledb.Group(path_new, "w") + + # python primitive types + group.meta["python_int"] = -1234 + group.meta["python_float"] = 3.14 + group.meta["python_str"] = "hello" + group.meta["python_bytes"] = b"hello" + group.meta["python_bool"] = False + + # numpy primitive types + group.meta["numpy_int"] = np.int64(-93) + group.meta["numpy_uint"] = np.uint64(42) + group.meta["numpy_float64"] = np.float64(3.14) + group.meta["numpy_bytes"] = np.bytes_("hello") + group.meta["numpy_str"] = np.str_("hello") + group.meta["numpy_bool"] = np.bool(False) + + # lists/tuples + group.meta["list_int"] = [7] + group.meta["tuple_int"] = (7,) + group.meta["list_ints"] = [1, -2, 3] + group.meta["tuple_ints"] = (1, 2, 3) + group.meta["list_float"] = [1.1] + group.meta["tuple_float"] = (1.1,) + group.meta["list_floats"] = [1.1, 2.2, 3.3] + group.meta["tuple_floats"] = (1.1, 2.2, 3.3) + group.meta["list_empty"] = [] + group.meta["tuple_empty"] = () + + # numpy arrays + group.meta["numpy_int"] = np.array([-11], dtype=np.int64) + group.meta["numpy_ints"] = np.array([1, -2, 3], dtype=np.int64) + group.meta["numpy_uint"] = np.array([22], dtype=np.uint64) + group.meta["numpy_uints"] = np.array([1, 2, 3], dtype=np.uint64) + group.meta["numpy_float"] = np.array([3.14], dtype=np.float64) + group.meta["numpy_floats"] = np.array([1.1, 2.2, 3.3], dtype=np.float64) + group.meta["numpy_byte"] = np.array([b"hello"], dtype="S5") + group.meta["numpy_str"] = np.array(["hello"], dtype="U5") + group.meta["numpy_bool"] = np.array([True, False, True]) + + group.close() + # === End of the code that creates the group with metadata === + + # The following commented out code was used to generate the base64 encoded string of the group + # from the TileDB-Py 0.32.3 after creating the group with metadata in the exact same way as above. + ''' + # Compress the contents of the group folder to tgz + with tarfile.open("test.tar.gz", "w:gz") as tar: + with os.scandir(path_new) as entries: + for entry in entries: + tar.add(entry.path, arcname=entry.name) + + # Read the .tgz file and encode it to base64 + with open("test.tar.gz", 'rb') as f: + s = base64.encodebytes(f.read()) + + # Print the base64 encoded string + group_tgz = f"""{s.decode():>32}""" + print(group_tgz) + ''' + + # The following base64 encoded string is the contents of the group folder compressed + # to a tgz file using TileDB-Py 0.32.3. + group_tgz = b"""H4sICO/+G2cC/3Rlc3QudGFyANPT19N3CEis8EhNTEktYqAJMIAAXLSBgbEJgg0SNzQwMjRiUKhg + oAMoLS5JLAJazzAygZGFQm5JZm6qraG5kaWFhbmlhbGekaGphbGlJRfDKBj2ID4+N7UkUZ+mdoAy + tbmpKYQ2g9AGRqh53tDE3MDM3Nzc2NQcmP8NDc3NGRRM6Zn/E9Mzi/GpAypLSxt+8a83KMp/Y8zy + 33C0/KdL+W+Otfy3NBot/kdS+R8fj4h/YPSj8UxTktOSjQxMjNPMzS0MDCxTjVLNTUwS01IMzMxM + zJMTicj/ZiYmuMp/QwNjM9Ty38jQAFhdKBjQM/+P0PJfDIhfMULYV1khNAsjTFYITDIygAQYQbKM + YBYDQv0xIEcAymdEEqtgbA1x9DtsIBATrJgRpRfwgC18R8GqqqXxD1gDJwZtnTTb5YbtE0YbprhD + 8y0KH7SwVJTnps9d9sorMOX8Met7M8+yMHzas+bz0rgbMet7z3b75kqb3mSdtisqonQnu8GrGvHI + 6WGxX/Jm+7UW7V45+8/OVSZ3+O+Ic/0Sloo+8OKG6hqutaun9NgfXjqDz9ftBZNBwLvXt6+fX94/ + ++EfK0X1S2nBpVv5jQ0cut7nS8T3/wn7rOpq5q9/Jn2XW8OhQ/frZTLrkycxHt1evlKvrtbsXeIX + 2dw33D0fd0yt5vqe8T/k3d3wtO4UI5Vm8yMvspXTJE+ozFY+13ZA7e+avDertDwP+b1mcjq0JPar + QLS26mvFLQH6D97dDbyZlx1b8X/ZHYmHWpqMjTP6QiVvrZX/3nsqxv3WwofHjtgmbk+YGnhC/U1D + v5+z0SvXZ5YfmXhYiw4Ynmi727rZteXvpZULJ/jvNikQV1/tuiM73XDytc2ZVu6PRcy4NN3Cuze9 + 0GJc1KHr+mXOAxexJaUFAv/kVgi/K+FaI+2wZfqOxoYWocQPGzNeG9h9edh+3DfBJMYzOKL2l+em + ezc0Hyq98xaQ8eT40PDoxpYX60KKnogs7Ht2d+cf9lm5m9pGy8fhDvRG+/+j/X+M9p+JqYGJ+WgD + cES0/0oyc1JTkuLTi/JLC/RKUpJok//xtP+w9P+NTUD9v9H232j5P1r+D0j5b2ZoYDZa/o+I8h9c + 8NN0AJiM8V8TA9PR8d9RMApGwSgYBaNgFIyCUTAKRsEooCYAAP1+F2wAKAAA""" + + # Ceate a new group by extracting the contents of the tgz file + path_original = self.path("original_group") + with tarfile.open(fileobj=io.BytesIO(base64.b64decode(group_tgz))) as tf: + try: + tf.extractall(path_original, filter="fully_trusted") + except TypeError: + tf.extractall(path_original) + + # Open both the original and the new group and compare the metadata both in values and types + group_original = tiledb.Group(path_original, "r") + group_new = tiledb.Group(path_new, "r") + + self.assert_metadata_roundtrip(group_new.meta, group_original.meta) + + group_original.close() + group_new.close() + + def test_group_metadata_new_types(self): + # This kind of data was not supported for TileDB-Py <= 0.32.3 + path_new = self.path("new_group") + + tiledb.Group.create(path_new) + group = tiledb.Group(path_new, "w") + test_vals = { + "int64": np.array(-1111, dtype=np.int64), + "uint64": np.array(2, dtype=np.uint64), + "float64": np.array(3.14, dtype=np.float64), + "bool": np.array(True, dtype=bool), + "str": np.array(["a", "b", "c"], dtype="S"), + "unicode": np.array(["a", "b", "c"], dtype="U"), + "bytes": np.array([b"a", b"b", b"c"]), + "datetime": np.array( + [np.datetime64("2021-01-01"), np.datetime64("2021-01-02")] + ), + } + group.meta.update(test_vals) + group.close() + + group = tiledb.Group(path_new, "r") + self.assert_metadata_roundtrip(group.meta, test_vals) + group.close()