Skip to content

Commit

Permalink
Fix GroupMetadata backwards compatibility (#2102)
Browse files Browse the repository at this point in the history
  • Loading branch information
kounelisagis authored Oct 29, 2024
1 parent 8ccbb1c commit 53208af
Show file tree
Hide file tree
Showing 3 changed files with 150 additions and 1 deletion.
4 changes: 3 additions & 1 deletion tiledb/cc/group.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@ void put_metadata_numpy(Group &group, const std::string &key, py::array value) {
throw py::type_error("Only 1D Numpy arrays can be stored as metadata");

py::size_t ncells = get_ncells(value.dtype());
if (ncells != 1)
// we can't store multi-cell arrays as metadata
// e.g. an array of strings containing strings of more than one character
if (ncells != 1 && value.size() > 1)
throw py::type_error("Unsupported dtype '" +
std::string(py::str(value.dtype())) +
"' for metadata");
Expand Down
15 changes: 15 additions & 0 deletions tiledb/group.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,16 @@ def __setitem__(self, key: str, value: GroupMetadataValueType):
# If the value is not a 1D ndarray, store its associated shape.
# The value's shape will be stored as separate metadata with the correct prefix.
self.__setitem__(f"{Group._NP_SHAPE_PREFIX}{key}", value.shape)
elif isinstance(value, np.generic):
tiledb_type = DataType.from_numpy(value.dtype).tiledb_type
if tiledb_type in (lt.DataType.BLOB, lt.DataType.CHAR):
put_metadata(key, tiledb_type, len(value), value)
elif tiledb_type == lt.DataType.STRING_UTF8:
put_metadata(
key, lt.DataType.STRING_UTF8, len(value), value.encode("UTF-8")
)
else:
put_metadata(key, tiledb_type, 1, value)
else:
from .metadata import pack_metadata_val

Expand All @@ -141,11 +151,16 @@ def __getitem__(self, key: str, include_type=False) -> GroupMetadataValueType:

if self._group._has_metadata(key):
data, tdb_type = self._group._get_metadata(key, False)
dtype = DataType.from_tiledb(tdb_type).np_dtype
# we return all int and float values as numpy scalars
if dtype.kind in ("i", "f") and not isinstance(data, tuple):
data = np.dtype(dtype).type(data)
elif self._group._has_metadata(f"{Group._NP_DATA_PREFIX}{key}"):
data, tdb_type = self._group._get_metadata(
f"{Group._NP_DATA_PREFIX}{key}", True
)
# Reshape the numpy array back to its original shape, if needed.
# The shape key is never present in groups written by TileDB-Py <= 0.32.3.
shape_key = f"{Group._NP_SHAPE_PREFIX}{key}"
if self._group._has_metadata(shape_key):
shape, tdb_type = self._group._get_metadata(shape_key, False)
Expand Down
132 changes: 132 additions & 0 deletions tiledb/tests/test_group.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import base64
import io
import os
import pathlib
import tarfile

import numpy as np
import pytest
Expand Down Expand Up @@ -762,3 +765,132 @@ def test_bytes_metadata(self, capfd):
grp.meta.dump()
assert_captured(capfd, "Type: DataType.BLOB")
grp.close()

def test_group_metadata_backwards_compat(self):
    """Check that group metadata written by TileDB-Py 0.32.3 is still read correctly.

    The test writes a group with the current TileDB-Py, unpacks an archived
    group (embedded below as a base64-encoded tgz) that was produced by
    TileDB-Py 0.32.3 from the same sequence of writes, and compares the two
    metadata mappings with ``self.assert_metadata_roundtrip``.
    """
    # === The following code creates a group with metadata using the current version of TileDB-Py ===
    path_new = self.path("new_group")
    tiledb.Group.create(path_new)
    group = tiledb.Group(path_new, "w")

    # python primitive types
    group.meta["python_int"] = -1234
    group.meta["python_float"] = 3.14
    group.meta["python_str"] = "hello"
    group.meta["python_bytes"] = b"hello"
    group.meta["python_bool"] = False

    # numpy primitive types
    # NOTE(review): "numpy_int", "numpy_uint", "numpy_str" and "numpy_bool"
    # are assigned again in the "numpy arrays" section below, so presumably
    # only the later values are persisted. The archived fixture was generated
    # with the same key sequence, so do not rename keys without regenerating it.
    group.meta["numpy_int"] = np.int64(-93)
    group.meta["numpy_uint"] = np.uint64(42)
    group.meta["numpy_float64"] = np.float64(3.14)
    group.meta["numpy_bytes"] = np.bytes_("hello")
    group.meta["numpy_str"] = np.str_("hello")
    # np.bool_ exists in every NumPy release; the bare np.bool alias is
    # missing from NumPy 1.24-1.26 and would raise AttributeError there.
    group.meta["numpy_bool"] = np.bool_(False)

    # lists/tuples
    group.meta["list_int"] = [7]
    group.meta["tuple_int"] = (7,)
    group.meta["list_ints"] = [1, -2, 3]
    group.meta["tuple_ints"] = (1, 2, 3)
    group.meta["list_float"] = [1.1]
    group.meta["tuple_float"] = (1.1,)
    group.meta["list_floats"] = [1.1, 2.2, 3.3]
    group.meta["tuple_floats"] = (1.1, 2.2, 3.3)
    group.meta["list_empty"] = []
    group.meta["tuple_empty"] = ()

    # numpy arrays
    group.meta["numpy_int"] = np.array([-11], dtype=np.int64)
    group.meta["numpy_ints"] = np.array([1, -2, 3], dtype=np.int64)
    group.meta["numpy_uint"] = np.array([22], dtype=np.uint64)
    group.meta["numpy_uints"] = np.array([1, 2, 3], dtype=np.uint64)
    group.meta["numpy_float"] = np.array([3.14], dtype=np.float64)
    group.meta["numpy_floats"] = np.array([1.1, 2.2, 3.3], dtype=np.float64)
    group.meta["numpy_byte"] = np.array([b"hello"], dtype="S5")
    group.meta["numpy_str"] = np.array(["hello"], dtype="U5")
    group.meta["numpy_bool"] = np.array([True, False, True])

    group.close()
    # === End of the code that creates the group with metadata ===

    # The following commented-out code was used to generate the base64-encoded string of the group
    # with TileDB-Py 0.32.3, after creating the group with metadata in exactly the same way as above.
    '''
    # Compress the contents of the group folder to tgz
    with tarfile.open("test.tar.gz", "w:gz") as tar:
        with os.scandir(path_new) as entries:
            for entry in entries:
                tar.add(entry.path, arcname=entry.name)
    # Read the .tgz file and encode it to base64
    with open("test.tar.gz", 'rb') as f:
        s = base64.encodebytes(f.read())
    # Print the base64 encoded string
    group_tgz = f"""{s.decode():>32}"""
    print(group_tgz)
    '''

    # The following base64-encoded string is the contents of the group folder compressed
    # to a tgz file using TileDB-Py 0.32.3.
    group_tgz = b"""H4sICO/+G2cC/3Rlc3QudGFyANPT19N3CEis8EhNTEktYqAJMIAAXLSBgbEJgg0SNzQwMjRiUKhg
oAMoLS5JLAJazzAygZGFQm5JZm6qraG5kaWFhbmlhbGekaGphbGlJRfDKBj2ID4+N7UkUZ+mdoAy
tbmpKYQ2g9AGRqh53tDE3MDM3Nzc2NQcmP8NDc3NGRRM6Zn/E9Mzi/GpAypLSxt+8a83KMp/Y8zy
33C0/KdL+W+Otfy3NBot/kdS+R8fj4h/YPSj8UxTktOSjQxMjNPMzS0MDCxTjVLNTUwS01IMzMxM
zJMTicj/ZiYmuMp/QwNjM9Ty38jQAFhdKBjQM/+P0PJfDIhfMULYV1khNAsjTFYITDIygAQYQbKM
YBYDQv0xIEcAymdEEqtgbA1x9DtsIBATrJgRpRfwgC18R8GqqqXxD1gDJwZtnTTb5YbtE0YbprhD
8y0KH7SwVJTnps9d9sorMOX8Met7M8+yMHzas+bz0rgbMet7z3b75kqb3mSdtisqonQnu8GrGvHI
6WGxX/Jm+7UW7V45+8/OVSZ3+O+Ic/0Sloo+8OKG6hqutaun9NgfXjqDz9ftBZNBwLvXt6+fX94/
++EfK0X1S2nBpVv5jQ0cut7nS8T3/wn7rOpq5q9/Jn2XW8OhQ/frZTLrkycxHt1evlKvrtbsXeIX
2dw33D0fd0yt5vqe8T/k3d3wtO4UI5Vm8yMvspXTJE+ozFY+13ZA7e+avDertDwP+b1mcjq0JPar
QLS26mvFLQH6D97dDbyZlx1b8X/ZHYmHWpqMjTP6QiVvrZX/3nsqxv3WwofHjtgmbk+YGnhC/U1D
v5+z0SvXZ5YfmXhYiw4Ynmi727rZteXvpZULJ/jvNikQV1/tuiM73XDytc2ZVu6PRcy4NN3Cuze9
0GJc1KHr+mXOAxexJaUFAv/kVgi/K+FaI+2wZfqOxoYWocQPGzNeG9h9edh+3DfBJMYzOKL2l+em
ezc0Hyq98xaQ8eT40PDoxpYX60KKnogs7Ht2d+cf9lm5m9pGy8fhDvRG+/+j/X+M9p+JqYGJ+WgD
cES0/0oyc1JTkuLTi/JLC/RKUpJok//xtP+w9P+NTUD9v9H232j5P1r+D0j5b2ZoYDZa/o+I8h9c
8NN0AJiM8V8TA9PR8d9RMApGwSgYBaNgFIyCUTAKRsEooCYAAP1+F2wAKAAA"""

    # Create a new group by extracting the contents of the tgz file
    path_original = self.path("original_group")
    with tarfile.open(fileobj=io.BytesIO(base64.b64decode(group_tgz))) as tf:
        try:
            tf.extractall(path_original, filter="fully_trusted")
        except TypeError:
            # extractall() rejected the ``filter`` keyword; retry without it.
            tf.extractall(path_original)

    # Open both the original and the new group and compare the metadata both in values and types
    group_original = tiledb.Group(path_original, "r")
    group_new = tiledb.Group(path_new, "r")

    self.assert_metadata_roundtrip(group_new.meta, group_original.meta)

    group_original.close()
    group_new.close()

def test_group_metadata_new_types(self):
    """Round-trip metadata values that TileDB-Py <= 0.32.3 did not support.

    The values are written through ``meta.update`` and, after reopening the
    group for reading, compared with ``self.assert_metadata_roundtrip``.
    """
    uri = self.path("new_group")
    tiledb.Group.create(uri)

    first_day = np.datetime64("2021-01-01")
    second_day = np.datetime64("2021-01-02")
    expected = {
        "int64": np.array(-1111, dtype=np.int64),
        "uint64": np.array(2, dtype=np.uint64),
        "float64": np.array(3.14, dtype=np.float64),
        "bool": np.array(True, dtype=bool),
        "str": np.array(["a", "b", "c"], dtype="S"),
        "unicode": np.array(["a", "b", "c"], dtype="U"),
        "bytes": np.array([b"a", b"b", b"c"]),
        "datetime": np.array([first_day, second_day]),
    }

    # Write every value in a single update, then close to persist the metadata.
    writer = tiledb.Group(uri, "w")
    writer.meta.update(expected)
    writer.close()

    # Reopen read-only and verify the values come back unchanged.
    reader = tiledb.Group(uri, "r")
    self.assert_metadata_roundtrip(reader.meta, expected)
    reader.close()

0 comments on commit 53208af

Please sign in to comment.