zarr-developers · rabernat · Oct 8, 2024 · Jul 14, 2024 · Sep 29, 2024 · Sep 29, 2024
diff --git a/src/zarr/codecs/__init__.py b/src/zarr/codecs/__init__.py
@@ -7,6 +7,7 @@
 from zarr.codecs.pipeline import BatchedCodecPipeline
 from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation
 from zarr.codecs.transpose import TransposeCodec
+from zarr.codecs.vlen_utf8 import VLenUTF8Codec
 from zarr.codecs.zstd import ZstdCodec
 
 __all__ = [
@@ -21,5 +22,6 @@
     "ShardingCodec",
     "ShardingCodecIndexLocation",
     "TransposeCodec",
+    "VLenUTF8Codec",
     "ZstdCodec",
 ]
diff --git a/src/zarr/codecs/vlen_utf8.py b/src/zarr/codecs/vlen_utf8.py
@@ -0,0 +1,71 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+import numpy as np
+from numcodecs.vlen import VLenUTF8
+
+from zarr.abc.codec import ArrayBytesCodec
+from zarr.core.buffer import Buffer, NDBuffer
+from zarr.core.common import JSON, parse_named_configuration
+from zarr.registry import register_codec
+from zarr.strings import cast_to_string_dtype
+
+if TYPE_CHECKING:
+    from typing import Self
+
+    from zarr.core.array_spec import ArraySpec
+
+
+# a single module-level instance can be shared because VLenUTF8 takes no parameters
+vlen_utf8_codec = VLenUTF8()
+
+
+@dataclass(frozen=True)
+class VLenUTF8Codec(ArrayBytesCodec):  # "vlen-utf8" codec backed by numcodecs' VLenUTF8
+    @classmethod
+    def from_dict(cls, data: dict[str, JSON]) -> Self:
+        _, configuration_parsed = parse_named_configuration(
+            data, "vlen-utf8", require_configuration=False
+        )
+        configuration_parsed = configuration_parsed or {}  # configuration is optional
+        return cls(**configuration_parsed)
+
+    def to_dict(self) -> dict[str, JSON]:
+        return {"name": "vlen-utf8", "configuration": {}}
+
+    def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
+        return self  # nothing is adapted to the array spec
+
+    async def _decode_single(
+        self,
+        chunk_bytes: Buffer,
+        chunk_spec: ArraySpec,
+    ) -> NDBuffer:
+        assert isinstance(chunk_bytes, Buffer)
+
+        raw_bytes = chunk_bytes.as_array_like()
+        decoded = vlen_utf8_codec.decode(raw_bytes)
+        assert decoded.dtype == np.object_
+        decoded.shape = chunk_spec.shape  # set the chunk shape on the decoded array in place
+        # coming out of the codec, we know this is safe, so don't issue a warning
+        as_string_dtype = cast_to_string_dtype(decoded, safe=True)
+        return chunk_spec.prototype.nd_buffer.from_numpy_array(as_string_dtype)
+
+    async def _encode_single(
+        self,
+        chunk_array: NDBuffer,
+        chunk_spec: ArraySpec,
+    ) -> Buffer | None:
+        assert isinstance(chunk_array, NDBuffer)
+        return chunk_spec.prototype.buffer.from_bytes(
+            vlen_utf8_codec.encode(chunk_array.as_numpy_array())
+        )
+
+    def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int:
+        # NOTE(review): it is unclear what input_byte_length means for an object dtype
+        raise NotImplementedError("compute_encoded_size is not implemented for VLen codecs")
+
+register_codec("vlen-utf8", VLenUTF8Codec)
diff --git a/src/zarr/core/buffer/core.py b/src/zarr/core/buffer/core.py
@@ -313,8 +313,7 @@ class NDBuffer:
     """
 
     def __init__(self, array: NDArrayLike) -> None:
-        # assert array.ndim > 0
-        assert array.dtype != object
+        # assert array.dtype != object
         self._data = array
 
     @classmethod
@@ -467,9 +466,12 @@ def all_equal(self, other: Any, equal_nan: bool = True) -> bool:
             # Handle None fill_value for Zarr V2
             return False
         # use array_equal to obtain equal_nan=True functionality
+        # Note from Ryan: doesn't this lead to a huge amount of unnecessary memory allocation on every single chunk?
+        # Since the fill value is a scalar, isn't there a faster path than allocating a new array for the fill value
+        # every single time we have to write data?
         _data, other = np.broadcast_arrays(self._data, other)
         return np.array_equal(
-            self._data, other, equal_nan=equal_nan if self._data.dtype.kind not in "US" else False
+            self._data, other, equal_nan=equal_nan if self._data.dtype.kind not in "USTO" else False
         )
 
     def fill(self, value: Any) -> None:

diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py
@@ -58,6 +58,7 @@ def reset(self) -> None:
                 "crc32c": "zarr.codecs.crc32c_.Crc32cCodec",
                 "sharding_indexed": "zarr.codecs.sharding.ShardingCodec",
                 "transpose": "zarr.codecs.transpose.TransposeCodec",
+                "vlen-utf8": "zarr.codecs.vlen_utf8.VLenUTF8Codec",
             },
             "buffer": "zarr.core.buffer.cpu.Buffer",
             "ndbuffer": "zarr.core.buffer.cpu.NDBuffer",

diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py
@@ -29,6 +29,7 @@
 from zarr.core.config import config
 from zarr.core.metadata.common import ArrayMetadata, parse_attributes
 from zarr.registry import get_codec_class
+from zarr.strings import STRING_DTYPE
 
 DEFAULT_DTYPE = "float64"
 
@@ -449,6 +450,7 @@ class DataType(Enum):
     float64 = "float64"
     complex64 = "complex64"
     complex128 = "complex128"
+    string = "string"
 
     @property
     def byte_count(self) -> int:
@@ -495,10 +497,15 @@ def to_numpy_shortname(self) -> str:
         return data_type_to_numpy[self]
 
     def to_numpy(self) -> np.dtype[Any]:
-        return np.dtype(self.to_numpy_shortname())
+        if self == DataType.string:
+            return STRING_DTYPE
+        else:
+            return np.dtype(self.to_numpy_shortname())
 
     @classmethod
     def from_numpy(cls, dtype: np.dtype[Any]) -> DataType:
+        if np.issubdtype(np.str_, dtype):
+            return DataType.string
         dtype_to_data_type = {
             "|b1": "bool",
             "bool": "bool",
@@ -522,17 +529,20 @@ def from_numpy(cls, dtype: np.dtype[Any]) -> DataType:
     def parse(cls, dtype: None | DataType | Any) -> DataType:
         if dtype is None:
             # the default dtype
-            return DataType[DEFAULT_DTYPE]
+            return DataType.float64
         if isinstance(dtype, DataType):
             return dtype
-        else:
-            try:
-                dtype = np.dtype(dtype)
-            except (ValueError, TypeError) as e:
-                raise ValueError(f"Invalid V3 data_type: {dtype}") from e
-            # check that this is a valid v3 data_type
-            try:
-                data_type = DataType.from_numpy(dtype)
-            except KeyError as e:
-                raise ValueError(f"Invalid V3 data_type: {dtype}") from e
-            return data_type
+        try:
+            return DataType(dtype)
+        except ValueError:
+            pass
+        try:
+            dtype = np.dtype(dtype)
+        except (ValueError, TypeError) as e:
+            raise ValueError(f"Invalid V3 data_type: {dtype}") from e
+        # check that this is a valid v3 data_type
+        try:
+            data_type = DataType.from_numpy(dtype)
+        except KeyError as e:
+            raise ValueError(f"Invalid V3 data_type: {dtype}") from e
+        return data_type
diff --git a/src/zarr/strings.py b/src/zarr/strings.py
@@ -0,0 +1,36 @@
+from typing import Any
+from warnings import warn
+
+import numpy as np
+
+try:
+    STRING_DTYPE = np.dtype("T")
+    NUMPY_SUPPORTS_VLEN_STRING = True
+except TypeError:
+    STRING_DTYPE = np.dtype("object")
+    NUMPY_SUPPORTS_VLEN_STRING = False
+
+
+def cast_to_string_dtype(
+    data: np.ndarray[Any, np.dtype[Any]], safe: bool = False
+) -> np.ndarray[Any, np.dtype[Any]]:
+    if np.issubdtype(data.dtype, np.str_):
+        return data  # already a numpy str dtype: returned unchanged
+    if np.issubdtype(data.dtype, np.object_):
+        if NUMPY_SUPPORTS_VLEN_STRING:
+            try:
+                # cast to the variable-length string dtype; fail if the objects contain non-string data
+                # mypy reports: Unexpected keyword argument "coerce" for "StringDType"  [call-arg]
+                return data.astype(np.dtypes.StringDType(coerce=False), copy=False)  # type: ignore[call-arg]
+            except ValueError as e:
+                raise ValueError("Cannot cast object dtype to string dtype") from e
+        else:
+            out = data.astype(np.str_)
+            if not safe:  # safe=True suppresses the warning below
+                warn(
+                    f"Casted object dtype to string dtype {out.dtype}. To avoid this warning, "
+                    "cast the data to a string dtype before passing to Zarr or upgrade to NumPy >= 2.",
+                    stacklevel=2,
+                )
+            return out
+    raise ValueError(f"Cannot cast dtype {data.dtype} to string dtype")
diff --git a/tests/v3/test_codecs/test_vlen.py b/tests/v3/test_codecs/test_vlen.py
@@ -0,0 +1,51 @@
+from typing import Any
+
+import numpy as np
+import pytest
+
+from zarr import Array
+from zarr.abc.store import Store
+from zarr.codecs import VLenUTF8Codec
+from zarr.core.metadata.v3 import ArrayV3Metadata, DataType
+from zarr.storage.common import StorePath
+from zarr.strings import NUMPY_SUPPORTS_VLEN_STRING
+
+numpy_str_dtypes: list[type | None] = [None, str, np.dtypes.StrDType]
+expected_zarr_string_dtype: np.dtype[Any]
+if NUMPY_SUPPORTS_VLEN_STRING:
+    numpy_str_dtypes.append(np.dtypes.StringDType)
+    expected_zarr_string_dtype = np.dtypes.StringDType()
+else:
+    expected_zarr_string_dtype = np.dtype("O")
+
+
+@pytest.mark.parametrize("store", ["memory", "local"], indirect=["store"])
+@pytest.mark.parametrize("dtype", numpy_str_dtypes)
+async def test_vlen_string(store: Store, dtype: None | np.dtype[Any]) -> None:
+    strings = ["hello", "world", "this", "is", "a", "test"]
+    data = np.array(strings).reshape((2, 3))
+    if dtype is not None:  # None keeps the dtype numpy inferred for the string list
+        data = data.astype(dtype)
+
+    sp = StorePath(store, path="string")
+    a = Array.create(
+        sp,
+        shape=data.shape,
+        chunk_shape=data.shape,
+        dtype=data.dtype,
+        fill_value="",
+        codecs=[VLenUTF8Codec()],
+    )
+    assert isinstance(a.metadata, ArrayV3Metadata)  # needed for mypy
+
+    a[:, :] = data
+    assert np.array_equal(data, a[:, :])
+    assert a.metadata.data_type == DataType.string
+    assert a.dtype == expected_zarr_string_dtype
+
+    # round trip: reopen the array from the same store path and repeat the checks on it
+    b = Array.open(sp)
+    assert isinstance(b.metadata, ArrayV3Metadata)  # needed for mypy
+    assert np.array_equal(data, b[:, :])
+    assert b.metadata.data_type == DataType.string
+    assert b.dtype == expected_zarr_string_dtype  # check the reopened array, not `a` again
diff --git a/tests/v3/test_config.py b/tests/v3/test_config.py
@@ -58,6 +58,7 @@ def test_config_defaults_set() -> None:
                 "crc32c": "zarr.codecs.crc32c_.Crc32cCodec",
                 "sharding_indexed": "zarr.codecs.sharding.ShardingCodec",
                 "transpose": "zarr.codecs.transpose.TransposeCodec",
+                "vlen-utf8": "zarr.codecs.vlen_utf8.VLenUTF8Codec",
             },
         }
     ]