Skip to content

Commit

Permalink
Add string and bytes dtypes plus vlen-utf8 and vlen-bytes cod…
Browse files Browse the repository at this point in the history
…ecs (#2036)

* add legacy vlen-utf8 codec

* got it working again

* got strings working; broke everything else

* change v3.metadata.data_type type

* fixed tests

* satisfy mypy for tests

* make strings work

* add missing module

* store -> storage

* rename module

* add vlen bytes

* fix type assertions in test

* much better validation of fill value

* retype parse_fill_value

* tests pass but not mypy

* attempted to change parse_fill_value typing

* restore DEFAULT_DTYPE

* fixup

* docstring

* update test

* add better DataType tests

* more progress on typing; still not passing mypy

* fix typing yay!

* make types work with numpy < 2

* Apply suggestions from code review

Co-authored-by: Joe Hamman <[email protected]>

* Apply suggestions from code review

Co-authored-by: Joe Hamman <[email protected]>

* apply Joe's suggestions

* add missing module

* make _STRING_DTYPE private to try to make sphinx happy

---------

Co-authored-by: Davis Bennett <[email protected]>
Co-authored-by: Tom Augspurger <[email protected]>
Co-authored-by: Joe Hamman <[email protected]>
  • Loading branch information
4 people authored Oct 8, 2024
1 parent c258b27 commit 7e2be57
Show file tree
Hide file tree
Showing 12 changed files with 584 additions and 94 deletions.
21 changes: 21 additions & 0 deletions src/zarr/codecs/__init__.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,20 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
import numpy as np

from zarr.codecs.blosc import BloscCname, BloscCodec, BloscShuffle
from zarr.codecs.bytes import BytesCodec, Endian
from zarr.codecs.crc32c_ import Crc32cCodec
from zarr.codecs.gzip import GzipCodec
from zarr.codecs.pipeline import BatchedCodecPipeline
from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation
from zarr.codecs.transpose import TransposeCodec
from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec
from zarr.codecs.zstd import ZstdCodec
from zarr.core.metadata.v3 import DataType

__all__ = [
"BatchedCodecPipeline",
Expand All @@ -21,5 +28,19 @@
"ShardingCodec",
"ShardingCodecIndexLocation",
"TransposeCodec",
"VLenUTF8Codec",
"VLenBytesCodec",
"ZstdCodec",
]


def _get_default_array_bytes_codec(
    np_dtype: np.dtype[Any],
) -> BytesCodec | VLenUTF8Codec | VLenBytesCodec:
    """Choose the default array-to-bytes codec for a NumPy dtype.

    The NumPy dtype is first mapped to a ``DataType`` via
    ``DataType.from_numpy``.

    Parameters
    ----------
    np_dtype
        NumPy dtype of the array being created.

    Returns
    -------
    ``VLenUTF8Codec`` for ``DataType.string``, ``VLenBytesCodec`` for
    ``DataType.bytes``, and ``BytesCodec`` for every other data type.
    """
    zarr_dtype = DataType.from_numpy(np_dtype)
    if zarr_dtype == DataType.string:
        return VLenUTF8Codec()
    if zarr_dtype == DataType.bytes:
        return VLenBytesCodec()
    # every remaining data type uses the plain bytes codec
    return BytesCodec()
117 changes: 117 additions & 0 deletions src/zarr/codecs/vlen_utf8.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING

import numpy as np
from numcodecs.vlen import VLenBytes, VLenUTF8

from zarr.abc.codec import ArrayBytesCodec
from zarr.core.buffer import Buffer, NDBuffer
from zarr.core.common import JSON, parse_named_configuration
from zarr.core.strings import cast_to_string_dtype
from zarr.registry import register_codec

if TYPE_CHECKING:
from typing import Self

from zarr.core.array_spec import ArraySpec


# The numcodecs VLen codecs take no parameters (both are constructed without
# arguments), so one module-level instance of each can be shared by every
# codec object defined in this module.
_vlen_utf8_codec = VLenUTF8()
_vlen_bytes_codec = VLenBytes()


@dataclass(frozen=True)
class VLenUTF8Codec(ArrayBytesCodec):
    """Array-to-bytes codec named ``"vlen-utf8"``.

    Encoding and decoding are delegated to the shared module-level
    ``numcodecs.vlen.VLenUTF8`` instance. The codec has no configuration
    fields of its own.
    """

    @classmethod
    def from_dict(cls, data: dict[str, JSON]) -> Self:
        """Build the codec from its named-configuration dict.

        The ``"configuration"`` entry is optional; when it is absent (or
        empty) the codec is constructed without arguments.
        """
        _name, config = parse_named_configuration(
            data, "vlen-utf8", require_configuration=False
        )
        return cls(**(config or {}))

    def to_dict(self) -> dict[str, JSON]:
        """Return the named-configuration dict for this codec."""
        return {"name": "vlen-utf8", "configuration": {}}

    def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
        """Return ``self`` unchanged; the array spec is not consulted."""
        return self

    async def _decode_single(
        self,
        chunk_bytes: Buffer,
        chunk_spec: ArraySpec,
    ) -> NDBuffer:
        """Decode one encoded chunk into an N-dimensional buffer of strings.

        The decoded object array is given the shape from ``chunk_spec`` and
        then cast with ``cast_to_string_dtype`` before being wrapped in the
        prototype's ND buffer.
        """
        assert isinstance(chunk_bytes, Buffer)

        strings = _vlen_utf8_codec.decode(chunk_bytes.as_array_like())
        assert strings.dtype == np.object_
        strings.shape = chunk_spec.shape
        # The array came straight out of the codec, so the cast is known to
        # be safe; safe=True keeps the cast from issuing a warning.
        string_array = cast_to_string_dtype(strings, safe=True)
        return chunk_spec.prototype.nd_buffer.from_numpy_array(string_array)

    async def _encode_single(
        self,
        chunk_array: NDBuffer,
        chunk_spec: ArraySpec,
    ) -> Buffer | None:
        """Encode one chunk array and wrap the result in the prototype's buffer."""
        assert isinstance(chunk_array, NDBuffer)
        to_buffer = chunk_spec.prototype.buffer.from_bytes
        return to_buffer(_vlen_utf8_codec.encode(chunk_array.as_numpy_array()))

    def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int:
        """Always raise ``NotImplementedError``; not implemented for VLen codecs."""
        # what is input_byte_length for an object dtype?
        raise NotImplementedError("compute_encoded_size is not implemented for VLen codecs")


@dataclass(frozen=True)
class VLenBytesCodec(ArrayBytesCodec):
    """Array-to-bytes codec named ``"vlen-bytes"``.

    Encoding and decoding are delegated to the shared module-level
    ``numcodecs.vlen.VLenBytes`` instance. The codec has no configuration
    fields of its own.
    """

    @classmethod
    def from_dict(cls, data: dict[str, JSON]) -> Self:
        """Build the codec from its named-configuration dict.

        The ``"configuration"`` entry is optional; when it is absent (or
        empty) the codec is constructed without arguments.
        """
        _name, config = parse_named_configuration(
            data, "vlen-bytes", require_configuration=False
        )
        return cls(**(config or {}))

    def to_dict(self) -> dict[str, JSON]:
        """Return the named-configuration dict for this codec."""
        return {"name": "vlen-bytes", "configuration": {}}

    def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
        """Return ``self`` unchanged; the array spec is not consulted."""
        return self

    async def _decode_single(
        self,
        chunk_bytes: Buffer,
        chunk_spec: ArraySpec,
    ) -> NDBuffer:
        """Decode one encoded chunk into an N-dimensional buffer of objects.

        The decoded object array is given the shape from ``chunk_spec`` and
        wrapped in the prototype's ND buffer without any further cast.
        """
        assert isinstance(chunk_bytes, Buffer)

        items = _vlen_bytes_codec.decode(chunk_bytes.as_array_like())
        assert items.dtype == np.object_
        items.shape = chunk_spec.shape
        return chunk_spec.prototype.nd_buffer.from_numpy_array(items)

    async def _encode_single(
        self,
        chunk_array: NDBuffer,
        chunk_spec: ArraySpec,
    ) -> Buffer | None:
        """Encode one chunk array and wrap the result in the prototype's buffer."""
        assert isinstance(chunk_array, NDBuffer)
        to_buffer = chunk_spec.prototype.buffer.from_bytes
        return to_buffer(_vlen_bytes_codec.encode(chunk_array.as_numpy_array()))

    def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int:
        """Always raise ``NotImplementedError``; not implemented for VLen codecs."""
        # what is input_byte_length for an object dtype?
        raise NotImplementedError("compute_encoded_size is not implemented for VLen codecs")


# Register both codecs under the same names their `from_dict`/`to_dict`
# methods use ("vlen-utf8" and "vlen-bytes").
register_codec("vlen-utf8", VLenUTF8Codec)
register_codec("vlen-bytes", VLenBytesCodec)
8 changes: 6 additions & 2 deletions src/zarr/core/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from zarr._compat import _deprecate_positional_args
from zarr.abc.store import Store, set_or_delete
from zarr.codecs import BytesCodec
from zarr.codecs import _get_default_array_bytes_codec
from zarr.codecs._v2 import V2Compressor, V2Filters
from zarr.core.attributes import Attributes
from zarr.core.buffer import (
Expand Down Expand Up @@ -318,7 +318,11 @@ async def _create_v3(
await ensure_no_existing_node(store_path, zarr_format=3)

shape = parse_shapelike(shape)
codecs = list(codecs) if codecs is not None else [BytesCodec()]
codecs = (
list(codecs)
if codecs is not None
else [_get_default_array_bytes_codec(np.dtype(dtype))]
)

if chunk_key_encoding is None:
chunk_key_encoding = ("default", "/")
Expand Down
6 changes: 3 additions & 3 deletions src/zarr/core/buffer/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,8 +313,6 @@ class NDBuffer:
"""

def __init__(self, array: NDArrayLike) -> None:
# assert array.ndim > 0
assert array.dtype != object
self._data = array

@classmethod
Expand Down Expand Up @@ -467,9 +465,11 @@ def all_equal(self, other: Any, equal_nan: bool = True) -> bool:
# Handle None fill_value for Zarr V2
return False
# use array_equal to obtain equal_nan=True functionality
# Since fill-value is a scalar, isn't there a faster path than allocating a new array for fill value
# every single time we have to write data?
_data, other = np.broadcast_arrays(self._data, other)
return np.array_equal(
self._data, other, equal_nan=equal_nan if self._data.dtype.kind not in "US" else False
self._data, other, equal_nan=equal_nan if self._data.dtype.kind not in "USTO" else False
)

def fill(self, value: Any) -> None:
Expand Down
2 changes: 2 additions & 0 deletions src/zarr/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ def reset(self) -> None:
"crc32c": "zarr.codecs.crc32c_.Crc32cCodec",
"sharding_indexed": "zarr.codecs.sharding.ShardingCodec",
"transpose": "zarr.codecs.transpose.TransposeCodec",
"vlen-utf8": "zarr.codecs.vlen_utf8.VLenUTF8Codec",
"vlen-bytes": "zarr.codecs.vlen_utf8.VLenBytesCodec",
},
"buffer": "zarr.core.buffer.cpu.Buffer",
"ndbuffer": "zarr.core.buffer.cpu.NDBuffer",
Expand Down
Loading

0 comments on commit 7e2be57

Please sign in to comment.