Skip to content

Commit

Permalink
add legacy vlen-utf8 codec
Browse files Browse the repository at this point in the history
  • Loading branch information
rabernat committed Jul 14, 2024
1 parent b8baa68 commit c05b9d1
Show file tree
Hide file tree
Showing 4 changed files with 101 additions and 1 deletion.
5 changes: 4 additions & 1 deletion src/zarr/buffer.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,10 @@ class NDBuffer:

def __init__(self, array: NDArrayLike):
    """Wrap ``array`` as the data backing this buffer.

    Parameters
    ----------
    array
        The array-like object to hold. It is stored by reference in
        ``self._data``; no copy or validation is performed here.
    """
    # assert array.ndim > 0

    # The object-dtype check below is intentionally disabled because string
    # arrays have dtype object and must be accepted here.
    # TODO: decide how to handle strings (e.g. numpy 2.0 StringDtype)
    # assert array.dtype != object
    self._data = array

@classmethod
Expand Down
2 changes: 2 additions & 0 deletions src/zarr/codecs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from zarr.codecs.bytes import BytesCodec, Endian
from zarr.codecs.crc32c_ import Crc32cCodec
from zarr.codecs.gzip import GzipCodec
from zarr.codecs.legacy_vlen import VLenUTF8Codec
from zarr.codecs.pipeline import BatchedCodecPipeline
from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation
from zarr.codecs.transpose import TransposeCodec
Expand All @@ -21,5 +22,6 @@
"ShardingCodec",
"ShardingCodecIndexLocation",
"TransposeCodec",
"VLenUTF8Codec",
"ZstdCodec",
]
68 changes: 68 additions & 0 deletions src/zarr/codecs/legacy_vlen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING

from numcodecs.vlen import VLenUTF8

from zarr.abc.codec import ArrayBytesCodec
from zarr.array_spec import ArraySpec
from zarr.buffer import Buffer, NDBuffer
from zarr.codecs.registry import register_codec
from zarr.common import JSON, parse_named_configuration

if TYPE_CHECKING:
from typing_extensions import Self


# One shared module-level VLenUTF8 instance is sufficient: it is constructed
# with no parameters, so every VLenUTF8Codec can reuse the same object.
vlen_utf8_codec = VLenUTF8()


@dataclass(frozen=True)
class VLenUTF8Codec(ArrayBytesCodec):
    """Array-to-bytes codec for variable-length UTF-8 strings.

    Encoding and decoding are delegated to the module-level
    ``vlen_utf8_codec`` (a ``numcodecs.vlen.VLenUTF8`` instance). The codec
    itself takes no parameters and is serialized as ``{"name": "vlen-utf8"}``.
    """

    def __init__(self) -> None:
        # The codec has no parameters, so there is nothing to initialise.
        pass

    @classmethod
    def from_dict(cls, data: dict[str, JSON]) -> Self:
        """Create the codec from its dict representation.

        ``data`` is handed to ``parse_named_configuration`` together with the
        expected name ``"vlen-utf8"``; a configuration entry is not required.
        Whatever configuration is returned is forwarded to the constructor as
        keyword arguments, so for this class any configuration key raises
        ``TypeError`` because ``__init__`` accepts none.
        """
        _, configuration_parsed = parse_named_configuration(
            data, "vlen-utf8", require_configuration=False
        )
        # A missing (falsy) configuration is treated as "no keyword arguments".
        configuration_parsed = configuration_parsed or {}
        return cls(**configuration_parsed)

    def to_dict(self) -> dict[str, JSON]:
        """Return the dict representation of the codec (name only)."""
        return {"name": "vlen-utf8"}

    def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
        """Return ``self`` unchanged; nothing here depends on the array spec."""
        return self

    async def _decode_single(
        self,
        chunk_bytes: Buffer,
        chunk_spec: ArraySpec,
    ) -> NDBuffer:
        """Decode one chunk of encoded bytes into an N-dimensional buffer.

        Parameters
        ----------
        chunk_bytes
            The encoded chunk.
        chunk_spec
            Supplies the target ``shape`` and the buffer ``prototype`` used
            to wrap the decoded array.

        Returns
        -------
        NDBuffer
            The decoded values reshaped to ``chunk_spec.shape``.
        """
        assert isinstance(chunk_bytes, Buffer)

        raw_bytes = chunk_bytes.as_array_like()
        decoded = vlen_utf8_codec.decode(raw_bytes)
        # Use reshape rather than assigning to ``.shape``: NumPy discourages
        # the in-place shape setter and recommends ``reshape`` instead.
        decoded = decoded.reshape(chunk_spec.shape)
        return chunk_spec.prototype.nd_buffer.from_numpy_array(decoded)

    async def _encode_single(
        self,
        chunk_array: NDBuffer,
        chunk_spec: ArraySpec,
    ) -> Buffer | None:
        """Encode one chunk array into a bytes buffer.

        The chunk is converted to a NumPy array, encoded with
        ``vlen_utf8_codec`` and wrapped with ``chunk_spec.prototype.buffer``.
        """
        assert isinstance(chunk_array, NDBuffer)
        return chunk_spec.prototype.buffer.from_bytes(
            vlen_utf8_codec.encode(chunk_array.as_numpy_array())
        )

    def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int:
        """Always raise ``NotImplementedError``; no encoded size is computed.

        Raises
        ------
        NotImplementedError
            Unconditionally.
        """
        # what is input_byte_length for an object dtype?
        raise NotImplementedError("compute_encoded_size is not implemented for VLen codecs")


# Register the codec class under "vlen-utf8", the same name that
# VLenUTF8Codec.to_dict emits and VLenUTF8Codec.from_dict expects.
register_codec("vlen-utf8", VLenUTF8Codec)
27 changes: 27 additions & 0 deletions tests/v3/test_codecs/test_vlen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import numpy as np
import pytest

from zarr.abc.store import Store
from zarr.array import Array
from zarr.codecs import VLenUTF8Codec
from zarr.store.core import StorePath


@pytest.mark.parametrize("store", ("local", "memory"), indirect=["store"])
def test_arrow_vlen_string(store: Store) -> None:
    """Round-trip a 2x3 array of strings through an array using VLenUTF8Codec.

    The array is created with a single chunk covering the whole shape, the
    data are written, and the values read back must equal the input.
    """
    strings = ["hello", "world", "this", "is", "a", "test"]
    data = np.array(strings).reshape((2, 3))

    # NOTE(review): fill_value=0 is numeric while the data are strings;
    # confirm this is the intended fill value for a string array.
    a = Array.create(
        StorePath(store, path="arrow"),
        shape=data.shape,
        chunk_shape=data.shape,
        dtype=data.dtype,
        fill_value=0,
        codecs=[VLenUTF8Codec()],
    )

    a[:, :] = data
    assert np.array_equal(data, a[:, :])

0 comments on commit c05b9d1

Please sign in to comment.