add legacy vlen-utf8 codec

zarr-developers · Jul 14, 2024 · c05b9d1 · c05b9d1
1 parent b8baa68
commit c05b9d1
Show file tree

Hide file tree

Showing 4 changed files with 101 additions and 1 deletion.
diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py
@@ -283,7 +283,10 @@ class NDBuffer:
 
     def __init__(self, array: NDArrayLike):
         # assert array.ndim > 0
-        assert array.dtype != object
+        # NOTE: `array` is stored as-is; no ndim or dtype validation happens here.
+        # The object-dtype check is disabled because string arrays have dtype object.
+        # TODO: decide how to handle strings (e.g. numpy 2.0 StringDtype)
+        # assert array.dtype != object
         self._data = array
 
     @classmethod

diff --git a/src/zarr/codecs/__init__.py b/src/zarr/codecs/__init__.py
@@ -4,6 +4,7 @@
 from zarr.codecs.bytes import BytesCodec, Endian
 from zarr.codecs.crc32c_ import Crc32cCodec
 from zarr.codecs.gzip import GzipCodec
+from zarr.codecs.legacy_vlen import VLenUTF8Codec
 from zarr.codecs.pipeline import BatchedCodecPipeline
 from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation
 from zarr.codecs.transpose import TransposeCodec
@@ -21,5 +22,6 @@
     "ShardingCodec",
     "ShardingCodecIndexLocation",
     "TransposeCodec",
+    "VLenUTF8Codec",
     "ZstdCodec",
 ]
diff --git a/src/zarr/codecs/legacy_vlen.py b/src/zarr/codecs/legacy_vlen.py
@@ -0,0 +1,68 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+from numcodecs.vlen import VLenUTF8
+
+from zarr.abc.codec import ArrayBytesCodec
+from zarr.array_spec import ArraySpec
+from zarr.buffer import Buffer, NDBuffer
+from zarr.codecs.registry import register_codec
+from zarr.common import JSON, parse_named_configuration
+
+if TYPE_CHECKING:
+    from typing_extensions import Self
+
+
+# VLenUTF8 takes no parameters, so one module-level instance can be shared
+# by every VLenUTF8Codec.
+vlen_utf8_codec = VLenUTF8()
+
+
+@dataclass(frozen=True)
+class VLenUTF8Codec(ArrayBytesCodec):
+    """Array-to-bytes codec for variable-length UTF-8 strings.
+
+    Encoding and decoding are delegated to the shared module-level
+    ``numcodecs.vlen.VLenUTF8`` instance. The codec has no configuration and
+    is serialized as ``{"name": "vlen-utf8"}``.
+    """
+
+    def __init__(self) -> None:
+        # Nothing to initialise: the codec is parameter-free.
+        pass
+
+    @classmethod
+    def from_dict(cls, data: dict[str, JSON]) -> Self:
+        """Build the codec from its metadata dictionary.
+
+        The ``configuration`` entry is not required. Any keys it does contain
+        are forwarded to the constructor, which accepts none, so a non-empty
+        configuration raises ``TypeError``.
+        """
+        _, configuration_parsed = parse_named_configuration(
+            data, "vlen-utf8", require_configuration=False
+        )
+        configuration_parsed = configuration_parsed or {}
+        return cls(**configuration_parsed)
+
+    def to_dict(self) -> dict[str, JSON]:
+        """Return the metadata dictionary for this codec (name only)."""
+        return {"name": "vlen-utf8"}
+
+    def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
+        """Return ``self`` unchanged; nothing here depends on the array spec."""
+        return self
+
+    async def _decode_single(
+        self,
+        chunk_bytes: Buffer,
+        chunk_spec: ArraySpec,
+    ) -> NDBuffer:
+        """Decode one encoded chunk into an array shaped like ``chunk_spec.shape``.
+
+        Raises whatever the underlying decoder or ``reshape`` raises when the
+        payload does not match the chunk shape.
+        """
+        assert isinstance(chunk_bytes, Buffer)
+
+        raw_bytes = chunk_bytes.as_array_like()
+        # Use reshape() rather than assigning to ``.shape``: NumPy discourages
+        # mutating the shape attribute in place.
+        decoded = vlen_utf8_codec.decode(raw_bytes).reshape(chunk_spec.shape)
+        return chunk_spec.prototype.nd_buffer.from_numpy_array(decoded)
+
+    async def _encode_single(
+        self,
+        chunk_array: NDBuffer,
+        chunk_spec: ArraySpec,
+    ) -> Buffer | None:
+        """Encode one chunk array into a byte buffer built by the spec's prototype."""
+        assert isinstance(chunk_array, NDBuffer)
+        return chunk_spec.prototype.buffer.from_bytes(
+            vlen_utf8_codec.encode(chunk_array.as_numpy_array())
+        )
+
+    def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int:
+        """Not supported for this codec; always raises ``NotImplementedError``."""
+        # what is input_byte_length for an object dtype?
+        raise NotImplementedError("compute_encoded_size is not implemented for VLen codecs")
+
+
+# Register the codec class under the name "vlen-utf8", the same name that
+# to_dict() emits and from_dict() expects.
+register_codec("vlen-utf8", VLenUTF8Codec)
diff --git a/tests/v3/test_codecs/test_vlen.py b/tests/v3/test_codecs/test_vlen.py
@@ -0,0 +1,27 @@
+import numpy as np
+import pytest
+
+from zarr.abc.store import Store
+from zarr.array import Array
+from zarr.codecs import VLenUTF8Codec
+from zarr.store.core import StorePath
+
+
+@pytest.mark.parametrize("store", ("local", "memory"), indirect=["store"])
+def test_arrow_vlen_string(store: Store) -> None:
+    """Round-trip a 2x3 array of strings through ``VLenUTF8Codec``."""
+    strings = ["hello", "world", "this", "is", "a", "test"]
+    data = np.array(strings).reshape((2, 3))
+
+    # A single chunk covers the whole array, so one encode/decode is exercised.
+    a = Array.create(
+        StorePath(store, path="arrow"),
+        shape=data.shape,
+        chunk_shape=data.shape,
+        dtype=data.dtype,
+        fill_value=0,
+        codecs=[VLenUTF8Codec()],
+    )
+
+    a[:, :] = data
+    assert np.array_equal(data, a[:, :])