Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add string and bytes dtypes plus vlen-utf8 and vlen-bytes codecs #2036

Merged
merged 35 commits into from
Oct 8, 2024
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
c05b9d1
add legacy vlen-utf8 codec
rabernat Jul 14, 2024
c86ddc6
Merge branch 'v3' into ryan/legacy-vlen
rabernat Sep 29, 2024
a322124
got it working again
rabernat Sep 29, 2024
2a1e2e3
got strings working; broke everything else
rabernat Oct 1, 2024
1d3d7a5
change v3.metadata.data_type type
rabernat Oct 1, 2024
cd40b08
merged
rabernat Oct 1, 2024
988f9df
fixed tests
rabernat Oct 1, 2024
507161a
satisfy mypy for tests
rabernat Oct 1, 2024
1ae5e63
make strings work
rabernat Oct 3, 2024
94ecdb5
add missing module
rabernat Oct 3, 2024
2c7d638
Merge branch 'v3' into ryan/legacy-vlen
d-v-b Oct 3, 2024
b1717d8
Merge remote-tracking branch 'upstream/v3' into ryan/legacy-vlen
rabernat Oct 4, 2024
79b7d43
store -> storage
rabernat Oct 4, 2024
a5c2a37
rename module
rabernat Oct 4, 2024
717f0c7
Merge remote-tracking branch 'origin/ryan/legacy-vlen' into ryan/lega…
rabernat Oct 4, 2024
b90d8f3
merged
rabernat Oct 4, 2024
0406ea1
add vlen bytes
rabernat Oct 7, 2024
8e61a18
fix type assertions in test
rabernat Oct 7, 2024
6cf7dde
much better validation of fill value
rabernat Oct 7, 2024
28d58fa
retype parse_fill_value
rabernat Oct 7, 2024
c6de878
tests pass but not mypy
rabernat Oct 7, 2024
4f026db
attempted to change parse_fill_value typing
rabernat Oct 8, 2024
e427c7a
restore DEFAULT_DTYPE
rabernat Oct 8, 2024
7d9d897
fixup
TomAugspurger Oct 8, 2024
0c21994
docstring
TomAugspurger Oct 8, 2024
c12ac41
update test
TomAugspurger Oct 8, 2024
3aeea1e
add better DataType tests
rabernat Oct 8, 2024
cae7055
more progress on typing; still not passing mypy
rabernat Oct 8, 2024
1aeb49a
fix typing yay!
rabernat Oct 8, 2024
6714bad
make types work with numpy <, 2
rabernat Oct 8, 2024
2edf3b8
Apply suggestions from code review
rabernat Oct 8, 2024
12a0d65
Apply suggestions from code review
rabernat Oct 8, 2024
7ba7077
apply Joe's suggestions
rabernat Oct 8, 2024
1e828b4
add missing module
rabernat Oct 8, 2024
ba0f093
make _STRING_DTYPE private to try to make sphinx happy
rabernat Oct 8, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions src/zarr/codecs/__init__.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,20 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
import numpy as np

from zarr.codecs.blosc import BloscCname, BloscCodec, BloscShuffle
from zarr.codecs.bytes import BytesCodec, Endian
from zarr.codecs.crc32c_ import Crc32cCodec
from zarr.codecs.gzip import GzipCodec
from zarr.codecs.pipeline import BatchedCodecPipeline
from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation
from zarr.codecs.transpose import TransposeCodec
from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec
from zarr.codecs.zstd import ZstdCodec
from zarr.core.metadata.v3 import DataType

__all__ = [
"BatchedCodecPipeline",
Expand All @@ -21,5 +28,19 @@
"ShardingCodec",
"ShardingCodecIndexLocation",
"TransposeCodec",
"VLenUTF8Codec",
"VLenBytesCodec",
"ZstdCodec",
]


def get_default_array_bytes_codec(
    np_dtype: np.dtype[Any],
) -> BytesCodec | VLenUTF8Codec | VLenBytesCodec:
    """Return the default array-to-bytes codec for *np_dtype*.

    Variable-length string dtypes map to the vlen-utf8 codec and
    variable-length bytes dtypes map to the vlen-bytes codec; every
    other dtype falls back to the fixed-size ``BytesCodec``.
    """
    zarr_dtype = DataType.from_numpy(np_dtype)
    if zarr_dtype == DataType.string:
        return VLenUTF8Codec()
    if zarr_dtype == DataType.bytes:
        return VLenBytesCodec()
    return BytesCodec()
117 changes: 117 additions & 0 deletions src/zarr/codecs/vlen_utf8.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING

import numpy as np
from numcodecs.vlen import VLenBytes, VLenUTF8

from zarr.abc.codec import ArrayBytesCodec
from zarr.core.buffer import Buffer, NDBuffer
from zarr.core.common import JSON, parse_named_configuration
from zarr.registry import register_codec
from zarr.strings import cast_to_string_dtype

if TYPE_CHECKING:
from typing import Self

from zarr.core.array_spec import ArraySpec


# A single shared instance of each numcodecs codec is safe at module level:
# neither VLenUTF8 nor VLenBytes takes any parameters or carries state.
vlen_utf8_codec = VLenUTF8()
vlen_bytes_codec = VLenBytes()


@dataclass(frozen=True)
class VLenUTF8Codec(ArrayBytesCodec):
    """Array-to-bytes codec for variable-length UTF-8 strings.

    Delegates the actual encoding/decoding to numcodecs' ``VLenUTF8``
    codec; decoded object arrays are cast to the string dtype before
    being wrapped in an ``NDBuffer``.
    """

    @classmethod
    def from_dict(cls, data: dict[str, JSON]) -> Self:
        """Build the codec from its JSON metadata representation."""
        _, config = parse_named_configuration(
            data, "vlen-utf8", require_configuration=False
        )
        return cls(**(config or {}))

    def to_dict(self) -> dict[str, JSON]:
        """Serialize the codec to its JSON metadata representation."""
        return {"name": "vlen-utf8", "configuration": {}}

    def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
        # Stateless codec: nothing needs to adapt to the array spec.
        return self

    async def _decode_single(
        self,
        chunk_bytes: Buffer,
        chunk_spec: ArraySpec,
    ) -> NDBuffer:
        """Decode one chunk of vlen-utf8 bytes into a string-dtype buffer."""
        assert isinstance(chunk_bytes, Buffer)

        obj_array = vlen_utf8_codec.decode(chunk_bytes.as_array_like())
        assert obj_array.dtype == np.object_
        obj_array.shape = chunk_spec.shape
        # coming out of the codec, we know this is safe, so don't issue a warning
        str_array = cast_to_string_dtype(obj_array, safe=True)
        return chunk_spec.prototype.nd_buffer.from_numpy_array(str_array)

    async def _encode_single(
        self,
        chunk_array: NDBuffer,
        chunk_spec: ArraySpec,
    ) -> Buffer | None:
        """Encode one chunk of strings into vlen-utf8 bytes."""
        assert isinstance(chunk_array, NDBuffer)
        encoded = vlen_utf8_codec.encode(chunk_array.as_numpy_array())
        return chunk_spec.prototype.buffer.from_bytes(encoded)

    def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int:
        # what is input_byte_length for an object dtype?
        raise NotImplementedError("compute_encoded_size is not implemented for VLen codecs")


@dataclass(frozen=True)
class VLenBytesCodec(ArrayBytesCodec):
    """Array-to-bytes codec for variable-length byte strings.

    Delegates the actual encoding/decoding to numcodecs' ``VLenBytes``
    codec; decoded chunks remain object-dtype arrays.
    """

    @classmethod
    def from_dict(cls, data: dict[str, JSON]) -> Self:
        """Build the codec from its JSON metadata representation."""
        _, config = parse_named_configuration(
            data, "vlen-bytes", require_configuration=False
        )
        return cls(**(config or {}))

    def to_dict(self) -> dict[str, JSON]:
        """Serialize the codec to its JSON metadata representation."""
        return {"name": "vlen-bytes", "configuration": {}}

    def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
        # Stateless codec: nothing needs to adapt to the array spec.
        return self

    async def _decode_single(
        self,
        chunk_bytes: Buffer,
        chunk_spec: ArraySpec,
    ) -> NDBuffer:
        """Decode one chunk of vlen-bytes data into an object-dtype buffer."""
        assert isinstance(chunk_bytes, Buffer)

        obj_array = vlen_bytes_codec.decode(chunk_bytes.as_array_like())
        assert obj_array.dtype == np.object_
        obj_array.shape = chunk_spec.shape
        return chunk_spec.prototype.nd_buffer.from_numpy_array(obj_array)

    async def _encode_single(
        self,
        chunk_array: NDBuffer,
        chunk_spec: ArraySpec,
    ) -> Buffer | None:
        """Encode one chunk of byte strings into vlen-bytes data."""
        assert isinstance(chunk_array, NDBuffer)
        encoded = vlen_bytes_codec.encode(chunk_array.as_numpy_array())
        return chunk_spec.prototype.buffer.from_bytes(encoded)

    def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int:
        # what is input_byte_length for an object dtype?
        raise NotImplementedError("compute_encoded_size is not implemented for VLen codecs")


# Register both codecs under their metadata names so they can be resolved
# from stored array metadata via the codec registry.
register_codec("vlen-utf8", VLenUTF8Codec)
register_codec("vlen-bytes", VLenBytesCodec)
4 changes: 2 additions & 2 deletions src/zarr/core/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from zarr._compat import _deprecate_positional_args
from zarr.abc.store import Store, set_or_delete
from zarr.codecs import BytesCodec
from zarr.codecs import get_default_array_bytes_codec
from zarr.codecs._v2 import V2Compressor, V2Filters
from zarr.core.attributes import Attributes
from zarr.core.buffer import (
Expand Down Expand Up @@ -318,7 +318,7 @@ async def _create_v3(
await ensure_no_existing_node(store_path, zarr_format=3)

shape = parse_shapelike(shape)
codecs = list(codecs) if codecs is not None else [BytesCodec()]
codecs = list(codecs) if codecs is not None else [get_default_array_bytes_codec(dtype)]

if chunk_key_encoding is None:
chunk_key_encoding = ("default", "/")
Expand Down
8 changes: 5 additions & 3 deletions src/zarr/core/buffer/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,8 +313,7 @@ class NDBuffer:
"""

def __init__(self, array: NDArrayLike) -> None:
# assert array.ndim > 0
assert array.dtype != object
# assert array.dtype != object
rabernat marked this conversation as resolved.
Show resolved Hide resolved
self._data = array

@classmethod
Expand Down Expand Up @@ -467,9 +466,12 @@ def all_equal(self, other: Any, equal_nan: bool = True) -> bool:
# Handle None fill_value for Zarr V2
return False
# use array_equal to obtain equal_nan=True functionality
# Note from Ryan: doesn't this lead to a huge amount of unnecessary memory allocation on every single chunk?
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we're OK.

In [10]: a, b = np.broadcast_arrays(np.ones(10), 2)

In [11]: b.base
Out[11]: array(2)

I think that it just does some tricks with the strides or something?

rabernat marked this conversation as resolved.
Show resolved Hide resolved
# Since fill-value is a scalar, isn't there a faster path than allocating a new array for fill value
# every single time we have to write data?
_data, other = np.broadcast_arrays(self._data, other)
return np.array_equal(
self._data, other, equal_nan=equal_nan if self._data.dtype.kind not in "US" else False
self._data, other, equal_nan=equal_nan if self._data.dtype.kind not in "USTO" else False
TomAugspurger marked this conversation as resolved.
Show resolved Hide resolved
)

def fill(self, value: Any) -> None:
Expand Down
2 changes: 2 additions & 0 deletions src/zarr/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ def reset(self) -> None:
"crc32c": "zarr.codecs.crc32c_.Crc32cCodec",
"sharding_indexed": "zarr.codecs.sharding.ShardingCodec",
"transpose": "zarr.codecs.transpose.TransposeCodec",
"vlen-utf8": "zarr.codecs.vlen_utf8.VLenUTF8Codec",
"vlen-bytes": "zarr.codecs.vlen_utf8.VLenBytesCodec",
},
"buffer": "zarr.core.buffer.cpu.Buffer",
"ndbuffer": "zarr.core.buffer.cpu.NDBuffer",
Expand Down
Loading