zarr-developers · rabernat · Jul 12, 2024 · Jul 12, 2024 · joshmoore · Jul 24, 2024
diff --git a/pyproject.toml b/pyproject.toml
@@ -109,6 +109,7 @@ dependencies = [
     "universal_pathlib"
 ]
 extra-dependencies = [
+    "pyarrow",
     "coverage",
     "pytest",
     "pytest-cov",

diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py
@@ -283,7 +283,10 @@ class NDBuffer:
 
     def __init__(self, array: NDArrayLike):
         # assert array.ndim > 0
-        assert array.dtype != object
+
+        # NOTE: the object-dtype check below is disabled because string arrays have dtype object.
+        # TODO: decide how to handle strings (e.g. numpy 2.0 StringDtype)
+        # assert array.dtype != object
         self._data = array
 
     @classmethod

diff --git a/src/zarr/codecs/__init__.py b/src/zarr/codecs/__init__.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from zarr.codecs.arrow import ArrowRecordBatchCodec
 from zarr.codecs.blosc import BloscCname, BloscCodec, BloscShuffle
 from zarr.codecs.bytes import BytesCodec, Endian
 from zarr.codecs.crc32c_ import Crc32cCodec
@@ -10,6 +11,7 @@
 from zarr.codecs.zstd import ZstdCodec
 
 __all__ = [
+    "ArrowRecordBatchCodec",
     "BatchedCodecPipeline",
     "BloscCname",
     "BloscCodec",

diff --git a/src/zarr/codecs/arrow.py b/src/zarr/codecs/arrow.py
@@ -0,0 +1,81 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+import pyarrow as pa
+
+from zarr.abc.codec import ArrayBytesCodec
+from zarr.array_spec import ArraySpec
+from zarr.buffer import Buffer, NDBuffer
+from zarr.codecs.registry import register_codec
+from zarr.common import JSON, parse_named_configuration
+
+if TYPE_CHECKING:
+    from typing_extensions import Self
+
+CHUNK_FIELD_NAME = "zarr_chunk"  # column name used for the chunk data in each record batch
+
+
+@dataclass(frozen=True)
+class ArrowRecordBatchCodec(ArrayBytesCodec):
+    """Array-to-bytes codec storing each chunk as an Arrow IPC stream of one record batch."""
+
+    def __init__(self) -> None:
+        pass
+
+    @classmethod
+    def from_dict(cls, data: dict[str, JSON]) -> Self:
+        """Build the codec from its "arrow" metadata dict; a configuration is not required."""
+        _, configuration_parsed = parse_named_configuration(
+            data, "arrow", require_configuration=False
+        )
+        configuration_parsed = configuration_parsed or {}
+        return cls(**configuration_parsed)
+
+    def to_dict(self) -> dict[str, JSON]:
+        """Return the metadata dict for this codec, which carries only its name."""
+        return {"name": "arrow"}
+
+    def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
+        """Return ``self`` unchanged; ``array_spec`` is not consulted."""
+        return self
+
+    async def _decode_single(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> NDBuffer:
+        """Decode a one-batch Arrow IPC stream into an NDBuffer; raise ValueError otherwise."""
+        assert isinstance(chunk_bytes, Buffer)
+
+        # TODO: make this compatible with buffer prototype
+        arrow_buffer = memoryview(chunk_bytes.to_bytes())
+        with pa.ipc.open_stream(arrow_buffer) as reader:
+            batches = list(reader)
+        # Validate explicitly: a bare ``assert`` would be stripped under ``python -O``.
+        if len(batches) != 1:
+            raise ValueError(f"Expected exactly 1 Arrow record batch, found {len(batches)}.")
+        arrow_array = batches[0][CHUNK_FIELD_NAME]
+        chunk_array = chunk_spec.prototype.nd_buffer.from_ndarray_like(
+            arrow_array.to_numpy(zero_copy_only=False)
+        )
+
+        # The encoder flattens the chunk, so restore the expected shape if it differs.
+        if chunk_array.shape != chunk_spec.shape:
+            chunk_array = chunk_array.reshape(chunk_spec.shape)
+        return chunk_array
+
+    async def _encode_single(self, chunk_array: NDBuffer, chunk_spec: ArraySpec) -> Buffer | None:
+        """Flatten the chunk and serialize it as an Arrow IPC stream with one record batch."""
+        assert isinstance(chunk_array, NDBuffer)
+        arrow_array = pa.array(chunk_array.as_ndarray_like().ravel())
+        rb = pa.record_batch([arrow_array], names=[CHUNK_FIELD_NAME])
+        # TODO: allocate buffer differently
+        sink = pa.BufferOutputStream()
+        with pa.ipc.new_stream(sink, rb.schema) as writer:
+            writer.write_batch(rb)
+        return chunk_spec.prototype.buffer.from_bytes(memoryview(sink.getvalue()))
+
+    def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int:
+        """Always raise ``ValueError``: this codec does not know its encoded size in advance."""
+        raise ValueError("Don't know how to compute encoded size!")
+
+
+register_codec("arrow", ArrowRecordBatchCodec)  # same "arrow" name as used by to_dict/from_dict
diff --git a/tests/v3/test_codecs/test_arrow.py b/tests/v3/test_codecs/test_arrow.py
@@ -0,0 +1,57 @@
+import numpy as np
+import pytest
+
+from zarr.abc.store import Store
+from zarr.array import Array
+from zarr.codecs import ArrowRecordBatchCodec
+from zarr.store.core import StorePath
+
+
+@pytest.mark.parametrize("store", ("local", "memory"), indirect=["store"])
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        # unsigned, then signed integers, each in increasing width
+        *(f"{kind}{bits}" for kind in ("uint", "int") for bits in (8, 16, 32, 64)),
+        "float32",
+        "float64",
+    ],
+)
+def test_arrow_standard_dtypes(store: Store, dtype) -> None:
+    """Round-trip a 16x16 array of each numeric dtype through the Arrow codec.
+
+    The array is written as a single chunk and read back in full; the values
+    returned must equal the values written.
+    """
+    expected = np.arange(0, 256, dtype=dtype).reshape(16, 16)
+    location = StorePath(store, path="arrow")
+
+    arr = Array.create(
+        location,
+        shape=expected.shape,
+        chunk_shape=(16, 16),
+        dtype=expected.dtype,
+        fill_value=0,
+        codecs=[ArrowRecordBatchCodec()],
+    )
+    arr[:, :] = expected
+
+    assert np.array_equal(expected, arr[:, :])
+
+
+@pytest.mark.parametrize("store", ("local", "memory"), indirect=["store"])
+def test_arrow_vlen_string(store: Store) -> None:
+    """Round-trip a 2x3 array of strings through the Arrow codec as one chunk."""
+    words = ["hello", "world", "this", "is", "a", "test"]
+    expected = np.array(words).reshape(2, 3)
+
+    arr = Array.create(
+        StorePath(store, path="arrow"),
+        shape=expected.shape,
+        chunk_shape=expected.shape,
+        dtype=expected.dtype,
+        fill_value=0,
+        codecs=[ArrowRecordBatchCodec()],
+    )
+    arr[:, :] = expected
+    assert np.array_equal(expected, arr[:, :])