From f040aa29d33c183ae8bda7f6db6bffbbcc5a9785 Mon Sep 17 00:00:00 2001 From: Gregory Lee Date: Sat, 9 Apr 2022 19:40:07 -0400 Subject: [PATCH 01/16] make zarr backend compatible with v3 spec --- xarray/backends/zarr.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 5731b047738..0c3c621b192 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -367,12 +367,19 @@ def open_group( if isinstance(store, os.PathLike): store = os.fspath(store) + zarr_version = getattr(store, '_store_version', 2) + if zarr_version > 2 and group is None: + # v3 stores require a group name: use 'xarray' as a default one. + group = 'xarray' + open_kwargs = dict( mode=mode, synchronizer=synchronizer, path=group, ) open_kwargs["storage_options"] = storage_options + if zarr_version > 2: + open_kwargs["zarr_version"] = zarr_version if chunk_store: open_kwargs["chunk_store"] = chunk_store @@ -447,6 +454,12 @@ def open_store_variable(self, name, zarr_array): zarr_array, DIMENSION_KEY, try_nczarr ) attributes = dict(attributes) + + # TODO: how to properly handle 'filters' for v3 stores + # currently these use a hack to store 'filters' within attributes + # need to drop this here for V3 store tests to succeed + attributes.pop('filters', None) + encoding = { "chunks": zarr_array.chunks, "preferred_chunks": dict(zip(dimensions, zarr_array.chunks)), @@ -574,8 +587,13 @@ def store( self.set_variables( variables_encoded, check_encoding_set, writer, unlimited_dims=unlimited_dims ) + zarr_version = getattr(self.zarr_group.store, '_store_version', 3) + consolidate_kwargs = {} + if zarr_version > 2: + # zarr v3 spec requires providing a path + consolidate_kwargs['path'] = self.zarr_group.path if self._consolidate_on_close: - zarr.consolidate_metadata(self.zarr_group.store) + zarr.consolidate_metadata(self.zarr_group.store, **consolidate_kwargs) def sync(self): pass From 471837617b6652cb05387e5d7008200c8ac47cc3 Mon Sep 17 00:00:00 2001 From: Gregory Lee Date: Sat, 9 Apr 2022 19:40:27 -0400 Subject: [PATCH 02/16] add tests for Zarr v3 stores --- xarray/tests/test_backends.py | 111 +++++++++++++++++++++++++++++++++- 1 file changed, 110 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 862d217b433..0360848ee56 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -101,6 +101,20 @@ except ImportError: pass +have_zarr_kvstore = False +try: + from zarr.storage import KVStore + have_zarr_kvstore = True +except ImportError: + KVStore = None + +have_zarr_v3 = False +try: + from zarr.storage_v3 import DirectoryStoreV3, KVStoreV3 + have_zarr_v3 = True +except ImportError: + KVStoreV3 = None + ON_WINDOWS = sys.platform == "win32" default_value = object() @@ -2470,7 +2484,10 @@ def test_attributes(self, obj): class TestZarrDictStore(ZarrBase): @contextlib.contextmanager def create_zarr_target(self): - yield {} + if have_zarr_kvstore: + yield KVStore({}) + else: + yield {} @requires_zarr @@ -2481,6 +2498,98 @@ def create_zarr_target(self): yield tmp +class ZarrBaseV3(ZarrBase): + def test_roundtrip_coordinates_with_space(self): + original = Dataset(coords={"x": 0, "y z": 1}) + expected = Dataset({"y z": 1}, {"x": 0}) + with pytest.warns(SerializationWarning): + # v3 stores do not allow spaces in the key name + with pytest.raises(ValueError): + with self.roundtrip(original) as actual: + pass + + +@pytest.mark.skipif(not have_zarr_v3, reason=f"requires 
zarr version 3") +class TestZarrKVStoreV3(ZarrBaseV3): + @contextlib.contextmanager + def create_zarr_target(self): + yield KVStoreV3({}) + + +@pytest.mark.skipif(not have_zarr_v3, reason=f"requires zarr version 3") +class TestZarrDirectoryStoreV3(ZarrBaseV3): + @contextlib.contextmanager + def create_zarr_target(self): + with create_tmp_file(suffix=".zr3") as tmp: + yield DirectoryStoreV3(tmp) + + +""" + + +test_roundtrip_object_dtype & +test_write_persistence_modes & +test_check_encoding_is_consistent_after_append & +test_append_with_new_variable & +test_to_zarr_append_compute_false_roundtrip`` fail due to + extra "filters" attribute for "strings" and "strings_nans" datasets + + (Pdb) !expected["strings"] + + array(['ab', 'cdef', 'g'], dtype=object) + Dimensions without coordinates: b + (Pdb) !actual["strings"] + + array(['ab', 'cdef', 'g'], dtype=object) + Dimensions without coordinates: b + Attributes: + filters: [{'id': 'vlen-utf8'}] + + (Pdb) !expected["strings_nans"] + + array(['ab', 'cdef', ''], dtype=object) + Dimensions without coordinates: b + (Pdb) !actual["strings_nans"] + + array(['ab', 'cdef', ''], dtype=object) + Dimensions without coordinates: b + Attributes: + filters: [{'id': 'vlen-utf8'}] + + +test_roundtrip_object_dtype + also appears to have mismatched types in Data variables + + + (Pdb) !expected + + Dimensions: (a: 5, b: 3, c: 2) + Dimensions without coordinates: a, b, c + Data variables: + floats (a) object 0.0 0.0 1.0 2.0 3.0 + floats_nans (a) object nan nan 1.0 2.0 3.0 + bytes (b) object b'ab' b'cdef' b'g' + bytes_nans (b) object b'ab' b'cdef' b'' + strings (b) object 'ab' 'cdef' 'g' + strings_nans (b) object 'ab' 'cdef' '' + all_nans (c) object nan nan + nan float64 nan + (Pdb) !actual + + Dimensions: (c: 2, b: 3, a: 5) + Dimensions without coordinates: c, b, a + Data variables: + all_nans (c) float64 nan nan + bytes (b) |S4 b'ab' b'cdef' b'g' + bytes_nans (b) |S4 b'ab' b'cdef' b'' + floats (a) float64 0.0 0.0 1.0 2.0 3.0 + floats_nans (a) float64 nan nan 1.0 2.0 3.0 + nan float64 nan + strings (b) object ... + strings_nans (b) object ... + +""" + @requires_zarr @requires_fsspec def test_zarr_storage_options(): From d590b76573fb8823b58af4554a90b14bdc6ad49b Mon Sep 17 00:00:00 2001 From: Gregory Lee Date: Wed, 21 Sep 2022 12:42:24 -0400 Subject: [PATCH 03/16] add tests for Zarr v3 stores when the store is not a StoreV3 class In this case where create_zarr_target returns a string, we must specify zarr_version=3 when opening/writing a store to make sure a version 3 store will be created rather than the default of a version 2 store. 
--- xarray/backends/api.py | 2 + xarray/backends/zarr.py | 16 ++- xarray/core/dataset.py | 6 + xarray/tests/test_backends.py | 208 ++++++++++++++-------------------- 4 files changed, 110 insertions(+), 122 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index d2929519852..cdc8dae3523 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -1538,6 +1538,7 @@ def to_zarr( region: Mapping[str, slice] | None = None, safe_chunks: bool = True, storage_options: dict[str, str] | None = None, + zarr_version: int | None = None, ) -> backends.ZarrStore | Delayed: """This function creates an appropriate datastore for writing a dataset to a zarr ztore @@ -1622,6 +1623,7 @@ def to_zarr( write_region=region, safe_chunks=safe_chunks, stacklevel=4, # for Dataset.to_zarr() + zarr_version=zarr_version, ) if mode in ["a", "r+"]: diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 0c3c621b192..a8a5fb45ff0 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -3,6 +3,7 @@ import json import os import warnings +from typing import Optional import numpy as np @@ -361,13 +362,17 @@ def open_group( write_region=None, safe_chunks=True, stacklevel=2, + zarr_version=None, ): # zarr doesn't support pathlib.Path objects yet. zarr-python#601 if isinstance(store, os.PathLike): store = os.fspath(store) - zarr_version = getattr(store, '_store_version', 2) + if zarr_version is None: + # default to 2 if store doesn't specify it's version (e.g. a path) + zarr_version = getattr(store, '_store_version', 2) + if zarr_version > 2 and group is None: # v3 stores require a group name: use 'xarray' as a default one. group = 'xarray' @@ -691,6 +696,7 @@ def open_zarr( storage_options=None, decode_timedelta=None, use_cftime=None, + zarr_version=None, **kwargs, ): """Load and decode a dataset from a Zarr store. @@ -768,6 +774,10 @@ def open_zarr( represented using ``np.datetime64[ns]`` objects. If False, always decode times to ``np.datetime64[ns]`` objects; if this is not possible raise an error. + zarr_version : int or None, optional + The desired zarr spec version to target (currently 2 or 3). The default + of None will attempt to determine the zarr version from ``store`` when + possible, otherwise defaulting to 2. Returns ------- @@ -805,6 +815,7 @@ def open_zarr( "chunk_store": chunk_store, "storage_options": storage_options, "stacklevel": 4, + "zarr_version": zarr_version, } ds = open_dataset( @@ -821,6 +832,7 @@ def open_zarr( backend_kwargs=backend_kwargs, decode_timedelta=decode_timedelta, use_cftime=use_cftime, + zarr_version=zarr_version, ) return ds @@ -852,6 +864,7 @@ def open_dataset( chunk_store=None, storage_options=None, stacklevel=3, + zarr_version: Optional[int] = None, ): filename_or_obj = _normalize_path(filename_or_obj) @@ -865,6 +878,7 @@ def open_dataset( chunk_store=chunk_store, storage_options=storage_options, stacklevel=stacklevel + 1, + zarr_version=zarr_version, ) store_entrypoint = StoreBackendEntrypoint() diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 08a2d04cab8..695cd4831ea 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1916,6 +1916,7 @@ def to_zarr( region: Mapping[str, slice] | None = None, safe_chunks: bool = True, storage_options: dict[str, str] | None = None, + zarr_version: int | None = None, ) -> ZarrStore: ... @@ -2034,6 +2035,10 @@ def to_zarr( storage_options : dict, optional Any additional parameters for the storage backend (ignored for local paths). 
+ zarr_version : int or None, optional + The desired zarr spec version to target (currently 2 or 3). The + default of None will attempt to determine the zarr version from + ``store`` when possible, otherwise defaulting to 2. Returns ------- @@ -2078,6 +2083,7 @@ def to_zarr( append_dim=append_dim, region=region, safe_chunks=safe_chunks, + zarr_version=zarr_version, ) def __repr__(self) -> str: diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 0360848ee56..8e3f9fe160a 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1697,6 +1697,7 @@ def test_write_inconsistent_chunks(self): class ZarrBase(CFEncodedBase): DIMENSION_KEY = "_ARRAY_DIMENSIONS" + version_kwargs = {} def create_zarr_target(self): raise NotImplementedError @@ -1704,14 +1705,14 @@ def create_zarr_target(self): @contextlib.contextmanager def create_store(self): with self.create_zarr_target() as store_target: - yield backends.ZarrStore.open_group(store_target, mode="w") + yield backends.ZarrStore.open_group(store_target, mode="w", **self.version_kwargs) def save(self, dataset, store_target, **kwargs): - return dataset.to_zarr(store=store_target, **kwargs) + return dataset.to_zarr(store=store_target, **kwargs, **self.version_kwargs) @contextlib.contextmanager def open(self, store_target, **kwargs): - with xr.open_dataset(store_target, engine="zarr", **kwargs) as ds: + with xr.open_dataset(store_target, engine="zarr", **kwargs, **self.version_kwargs) as ds: yield ds @contextlib.contextmanager @@ -1741,12 +1742,12 @@ def test_roundtrip_consolidated(self, consolidated): def test_read_non_consolidated_warning(self): expected = create_test_data() with self.create_zarr_target() as store: - expected.to_zarr(store, consolidated=False) + expected.to_zarr(store, consolidated=False, **self.version_kwargs) with pytest.warns( RuntimeWarning, match="Failed to open Zarr store with consolidated", ): - with xr.open_zarr(store) as ds: + with xr.open_zarr(store, **self.version_kwargs) as ds: assert_identical(ds, expected) def test_non_existent_store(self): @@ -2038,10 +2039,10 @@ def test_write_persistence_modes(self, group): # check append mode for append write ds, ds_to_append, _ = create_append_test_data() with self.create_zarr_target() as store_target: - ds.to_zarr(store_target, mode="w", group=group) - ds_to_append.to_zarr(store_target, append_dim="time", group=group) + ds.to_zarr(store_target, mode="w", group=group, **self.version_kwargs) + ds_to_append.to_zarr(store_target, append_dim="time", group=group, **self.version_kwargs) original = xr.concat([ds, ds_to_append], dim="time") - actual = xr.open_dataset(store_target, group=group, engine="zarr") + actual = xr.open_dataset(store_target, group=group, engine="zarr", **self.version_kwargs) assert_identical(original, actual) def test_compressor_encoding(self): @@ -2081,8 +2082,8 @@ def test_append_with_mode_rplus_success(self): original = Dataset({"foo": ("x", [1])}) modified = Dataset({"foo": ("x", [2])}) with self.create_zarr_target() as store: - original.to_zarr(store) - modified.to_zarr(store, mode="r+") + original.to_zarr(store, **self.version_kwargs) + modified.to_zarr(store, mode="r+", **self.version_kwargs) with self.open(store) as actual: assert_identical(actual, modified) @@ -2090,50 +2091,51 @@ def test_append_with_mode_rplus_fails(self): original = Dataset({"foo": ("x", [1])}) modified = Dataset({"bar": ("x", [2])}) with self.create_zarr_target() as store: - original.to_zarr(store) + original.to_zarr(store, 
**self.version_kwargs) with pytest.raises( ValueError, match="dataset contains non-pre-existing variables" ): - modified.to_zarr(store, mode="r+") + modified.to_zarr(store, mode="r+", **self.version_kwargs) def test_append_with_invalid_dim_raises(self): ds, ds_to_append, _ = create_append_test_data() with self.create_zarr_target() as store_target: - ds.to_zarr(store_target, mode="w") + ds.to_zarr(store_target, mode="w", **self.version_kwargs) with pytest.raises( ValueError, match="does not match any existing dataset dimensions" ): - ds_to_append.to_zarr(store_target, append_dim="notvalid") + ds_to_append.to_zarr(store_target, append_dim="notvalid", **self.version_kwargs) def test_append_with_no_dims_raises(self): with self.create_zarr_target() as store_target: - Dataset({"foo": ("x", [1])}).to_zarr(store_target, mode="w") + Dataset({"foo": ("x", [1])}).to_zarr(store_target, mode="w", **self.version_kwargs) with pytest.raises(ValueError, match="different dimension names"): - Dataset({"foo": ("y", [2])}).to_zarr(store_target, mode="a") + Dataset({"foo": ("y", [2])}).to_zarr(store_target, mode="a", **self.version_kwargs) def test_append_with_append_dim_not_set_raises(self): ds, ds_to_append, _ = create_append_test_data() with self.create_zarr_target() as store_target: - ds.to_zarr(store_target, mode="w") + ds.to_zarr(store_target, mode="w", **self.version_kwargs) with pytest.raises(ValueError, match="different dimension sizes"): - ds_to_append.to_zarr(store_target, mode="a") + ds_to_append.to_zarr(store_target, mode="a", **self.version_kwargs) def test_append_with_mode_not_a_raises(self): ds, ds_to_append, _ = create_append_test_data() with self.create_zarr_target() as store_target: - ds.to_zarr(store_target, mode="w") + ds.to_zarr(store_target, mode="w", **self.version_kwargs) with pytest.raises(ValueError, match="cannot set append_dim unless"): - ds_to_append.to_zarr(store_target, mode="w", append_dim="time") + ds_to_append.to_zarr(store_target, mode="w", append_dim="time", **self.version_kwargs) def test_append_with_existing_encoding_raises(self): ds, ds_to_append, _ = create_append_test_data() with self.create_zarr_target() as store_target: - ds.to_zarr(store_target, mode="w") + ds.to_zarr(store_target, mode="w", **self.version_kwargs) with pytest.raises(ValueError, match="but encoding was provided"): ds_to_append.to_zarr( store_target, append_dim="time", encoding={"da": {"compressor": None}}, + **self.version_kwargs, ) @pytest.mark.parametrize("dtype", ["U", "S"]) @@ -2157,13 +2159,13 @@ def test_check_encoding_is_consistent_after_append(self): compressor = zarr.Blosc() encoding = {"da": {"compressor": compressor}} - ds.to_zarr(store_target, mode="w", encoding=encoding) - ds_to_append.to_zarr(store_target, append_dim="time") - actual_ds = xr.open_dataset(store_target, engine="zarr") + ds.to_zarr(store_target, mode="w", encoding=encoding, **self.version_kwargs) + ds_to_append.to_zarr(store_target, append_dim="time", **self.version_kwargs) + actual_ds = xr.open_dataset(store_target, engine="zarr", **self.version_kwargs) actual_encoding = actual_ds["da"].encoding["compressor"] assert actual_encoding.get_config() == compressor.get_config() assert_identical( - xr.open_dataset(store_target, engine="zarr").compute(), + xr.open_dataset(store_target, engine="zarr", **self.version_kwargs).compute(), xr.concat([ds, ds_to_append], dim="time"), ) @@ -2173,11 +2175,15 @@ def test_append_with_new_variable(self): # check append mode for new variable with self.create_zarr_target() as store_target: - 
xr.concat([ds, ds_to_append], dim="time").to_zarr(store_target, mode="w") - ds_with_new_var.to_zarr(store_target, mode="a") + xr.concat([ds, ds_to_append], dim="time").to_zarr( + store_target, mode="w", **self.version_kwargs + ) + ds_with_new_var.to_zarr(store_target, mode="a", **self.version_kwargs) combined = xr.concat([ds, ds_to_append], dim="time") combined["new_var"] = ds_with_new_var["new_var"] - assert_identical(combined, xr.open_dataset(store_target, engine="zarr")) + assert_identical( + combined, xr.open_dataset(store_target, engine="zarr", **self.version_kwargs) + ) @requires_dask def test_to_zarr_compute_false_roundtrip(self): @@ -2273,16 +2279,19 @@ def test_write_region(self, consolidated, compute, use_dask): consolidated=consolidated, compute=compute, encoding={"u": dict(chunks=2)}, + **self.version_kwargs, ) if compute: - with xr.open_zarr(store, consolidated=consolidated) as actual: + with xr.open_zarr( + store, consolidated=consolidated, **self.version_kwargs + ) as actual: assert_identical(actual, zeros) for i in range(0, 10, 2): region = {"x": slice(i, i + 2)} nonzeros.isel(region).to_zarr( - store, region=region, consolidated=consolidated + store, region=region, consolidated=consolidated, **self.version_kwargs, ) - with xr.open_zarr(store, consolidated=consolidated) as actual: + with xr.open_zarr(store, consolidated=consolidated, **self.version_kwargs) as actual: assert_identical(actual, nonzeros) @pytest.mark.parametrize("mode", [None, "r+", "a"]) @@ -2290,10 +2299,10 @@ def test_write_region_mode(self, mode): zeros = Dataset({"u": (("x",), np.zeros(10))}) nonzeros = Dataset({"u": (("x",), np.arange(1, 11))}) with self.create_zarr_target() as store: - zeros.to_zarr(store) + zeros.to_zarr(store, **self.version_kwargs) for region in [{"x": slice(5)}, {"x": slice(5, 10)}]: - nonzeros.isel(region).to_zarr(store, region=region, mode=mode) - with xr.open_zarr(store) as actual: + nonzeros.isel(region).to_zarr(store, region=region, mode=mode, **self.version_kwargs) + with xr.open_zarr(store, **self.version_kwargs) as actual: assert_identical(actual, nonzeros) @requires_dask @@ -2317,8 +2326,8 @@ def test_write_preexisting_override_metadata(self): ) with self.create_zarr_target() as store: - original.to_zarr(store, compute=False) - both_modified.to_zarr(store, mode="a") + original.to_zarr(store, compute=False, **self.version_kwargs) + both_modified.to_zarr(store, mode="a", **self.version_kwargs) with self.open(store) as actual: # NOTE: this arguably incorrect -- we should probably be # overriding the variable metadata, too. 
See the TODO note in @@ -2326,15 +2335,15 @@ def test_write_preexisting_override_metadata(self): assert_identical(actual, global_modified) with self.create_zarr_target() as store: - original.to_zarr(store, compute=False) - both_modified.to_zarr(store, mode="r+") + original.to_zarr(store, compute=False, **self.version_kwargs) + both_modified.to_zarr(store, mode="r+", **self.version_kwargs) with self.open(store) as actual: assert_identical(actual, only_new_data) with self.create_zarr_target() as store: - original.to_zarr(store, compute=False) + original.to_zarr(store, compute=False, **self.version_kwargs) # with region, the default mode becomes r+ - both_modified.to_zarr(store, region={"x": slice(None)}) + both_modified.to_zarr(store, region={"x": slice(None)}, **self.version_kwargs) with self.open(store) as actual: assert_identical(actual, only_new_data) @@ -2345,7 +2354,7 @@ def test_write_region_errors(self): @contextlib.contextmanager def setup_and_verify_store(expected=data): with self.create_zarr_target() as store: - data.to_zarr(store) + data.to_zarr(store, **self.version_kwargs) yield store with self.open(store) as actual: assert_identical(actual, expected) @@ -2353,7 +2362,7 @@ def setup_and_verify_store(expected=data): # verify the base case works expected = Dataset({"u": (("x",), np.array([10, 11, 2, 3, 4]))}) with setup_and_verify_store(expected) as store: - data2.to_zarr(store, region={"x": slice(2)}) + data2.to_zarr(store, region={"x": slice(2)}, **self.version_kwargs) with setup_and_verify_store() as store: with pytest.raises( @@ -2362,46 +2371,46 @@ def setup_and_verify_store(expected=data): "cannot set region unless mode='a', mode='r+' or mode=None" ), ): - data.to_zarr(store, region={"x": slice(None)}, mode="w") + data.to_zarr(store, region={"x": slice(None)}, mode="w", **self.version_kwargs) with setup_and_verify_store() as store: with pytest.raises(TypeError, match=r"must be a dict"): - data.to_zarr(store, region=slice(None)) + data.to_zarr(store, region=slice(None), **self.version_kwargs) with setup_and_verify_store() as store: with pytest.raises(TypeError, match=r"must be slice objects"): - data2.to_zarr(store, region={"x": [0, 1]}) + data2.to_zarr(store, region={"x": [0, 1]}, **self.version_kwargs) with setup_and_verify_store() as store: with pytest.raises(ValueError, match=r"step on all slices"): - data2.to_zarr(store, region={"x": slice(None, None, 2)}) + data2.to_zarr(store, region={"x": slice(None, None, 2)}, **self.version_kwargs) with setup_and_verify_store() as store: with pytest.raises( ValueError, match=r"all keys in ``region`` are not in Dataset dimensions", ): - data.to_zarr(store, region={"y": slice(None)}) + data.to_zarr(store, region={"y": slice(None)}, **self.version_kwargs) with setup_and_verify_store() as store: with pytest.raises( ValueError, match=r"all variables in the dataset to write must have at least one dimension in common", ): - data2.assign(v=2).to_zarr(store, region={"x": slice(2)}) + data2.assign(v=2).to_zarr(store, region={"x": slice(2)}, **self.version_kwargs) with setup_and_verify_store() as store: with pytest.raises( ValueError, match=r"cannot list the same dimension in both" ): - data.to_zarr(store, region={"x": slice(None)}, append_dim="x") + data.to_zarr(store, region={"x": slice(None)}, append_dim="x", **self.version_kwargs) with setup_and_verify_store() as store: with pytest.raises( ValueError, match=r"variable 'u' already exists with different dimension sizes", ): - data2.to_zarr(store, region={"x": slice(3)}) + 
data2.to_zarr(store, region={"x": slice(3)}, **self.version_kwargs) @requires_dask def test_encoding_chunksizes(self): @@ -2443,10 +2452,10 @@ def test_chunk_encoding_with_larger_dask_chunks(self): def test_open_zarr_use_cftime(self): ds = create_test_data() with self.create_zarr_target() as store_target: - ds.to_zarr(store_target) - ds_a = xr.open_zarr(store_target) + ds.to_zarr(store_target, **self.version_kwargs) + ds_a = xr.open_zarr(store_target, **self.version_kwargs) assert_identical(ds, ds_a) - ds_b = xr.open_zarr(store_target, use_cftime=True) + ds_b = xr.open_zarr(store_target, use_cftime=True, **self.version_kwargs) assert xr.coding.times.contains_cftime_datetimes(ds_b.time) def test_write_read_select_write(self): @@ -2455,13 +2464,13 @@ def test_write_read_select_write(self): # NOTE: using self.roundtrip, which uses open_dataset, will not trigger the bug. with self.create_zarr_target() as initial_store: - ds.to_zarr(initial_store, mode="w") - ds1 = xr.open_zarr(initial_store) + ds.to_zarr(initial_store, mode="w", **self.version_kwargs) + ds1 = xr.open_zarr(initial_store, **self.version_kwargs) # Combination of where+squeeze triggers error on write. ds_sel = ds1.where(ds1.coords["dim3"] == "a", drop=True).squeeze("dim3") with self.create_zarr_target() as final_store: - ds_sel.to_zarr(final_store, mode="w") + ds_sel.to_zarr(final_store, mode="w", **self.version_kwargs) @pytest.mark.parametrize("obj", [Dataset(), DataArray(name="foo")]) def test_attributes(self, obj): @@ -2497,6 +2506,17 @@ def create_zarr_target(self): with create_tmp_file(suffix=".zarr") as tmp: yield tmp + @contextlib.contextmanager + def create_store(self): + with self.create_zarr_target() as store_target: + group = backends.ZarrStore.open_group(store_target, mode="w") + # older Zarr versions do not have the _store_version attribute + if have_zarr_v3: + # verify that a v2 store was created + assert group.zarr_group.store._store_version == 2 + yield group + + class ZarrBaseV3(ZarrBase): def test_roundtrip_coordinates_with_space(self): @@ -2524,71 +2544,17 @@ def create_zarr_target(self): yield DirectoryStoreV3(tmp) -""" - - -test_roundtrip_object_dtype & -test_write_persistence_modes & -test_check_encoding_is_consistent_after_append & -test_append_with_new_variable & -test_to_zarr_append_compute_false_roundtrip`` fail due to - extra "filters" attribute for "strings" and "strings_nans" datasets - - (Pdb) !expected["strings"] - - array(['ab', 'cdef', 'g'], dtype=object) - Dimensions without coordinates: b - (Pdb) !actual["strings"] - - array(['ab', 'cdef', 'g'], dtype=object) - Dimensions without coordinates: b - Attributes: - filters: [{'id': 'vlen-utf8'}] - - (Pdb) !expected["strings_nans"] - - array(['ab', 'cdef', ''], dtype=object) - Dimensions without coordinates: b - (Pdb) !actual["strings_nans"] - - array(['ab', 'cdef', ''], dtype=object) - Dimensions without coordinates: b - Attributes: - filters: [{'id': 'vlen-utf8'}] - - -test_roundtrip_object_dtype - also appears to have mismatched types in Data variables - - - (Pdb) !expected - - Dimensions: (a: 5, b: 3, c: 2) - Dimensions without coordinates: a, b, c - Data variables: - floats (a) object 0.0 0.0 1.0 2.0 3.0 - floats_nans (a) object nan nan 1.0 2.0 3.0 - bytes (b) object b'ab' b'cdef' b'g' - bytes_nans (b) object b'ab' b'cdef' b'' - strings (b) object 'ab' 'cdef' 'g' - strings_nans (b) object 'ab' 'cdef' '' - all_nans (c) object nan nan - nan float64 nan - (Pdb) !actual - - Dimensions: (c: 2, b: 3, a: 5) - Dimensions without coordinates: c, b, a - 
Data variables: - all_nans (c) float64 nan nan - bytes (b) |S4 b'ab' b'cdef' b'g' - bytes_nans (b) |S4 b'ab' b'cdef' b'' - floats (a) float64 0.0 0.0 1.0 2.0 3.0 - floats_nans (a) float64 nan nan 1.0 2.0 3.0 - nan float64 nan - strings (b) object ... - strings_nans (b) object ... - -""" +@pytest.mark.skipif(not have_zarr_v3, reason=f"requires zarr version 3") +class TestZarrDirectoryStoreV3FromPath(TestZarrDirectoryStoreV3): + # Must specify zarr_version=3 to get a v3 store because create_zarr_target + # is a string path. + version_kwargs = {'zarr_version': 3} + + @contextlib.contextmanager + def create_zarr_target(self): + with create_tmp_file(suffix=".zr3") as tmp: + yield tmp + @requires_zarr @requires_fsspec From dbdf63f721b397269c3bf77d97ebcced55069946 Mon Sep 17 00:00:00 2001 From: Gregory Lee Date: Wed, 21 Sep 2022 13:31:35 -0400 Subject: [PATCH 04/16] update import path to match Zarr v2.12 and v2.13 experimental API remove path='xarray' default for zarr v3 path=None should work as of Zarr v2.13 --- xarray/backends/zarr.py | 14 ++---- xarray/core/dataset.py | 1 + xarray/tests/test_backends.py | 88 ++++++++++++++++++++++++++--------- 3 files changed, 72 insertions(+), 31 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index a8a5fb45ff0..7312ae4574c 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -371,11 +371,7 @@ def open_group( if zarr_version is None: # default to 2 if store doesn't specify it's version (e.g. a path) - zarr_version = getattr(store, '_store_version', 2) - - if zarr_version > 2 and group is None: - # v3 stores require a group name: use 'xarray' as a default one. - group = 'xarray' + zarr_version = getattr(store, "_store_version", 2) open_kwargs = dict( mode=mode, @@ -463,7 +459,7 @@ def open_store_variable(self, name, zarr_array): # TODO: how to properly handle 'filters' for v3 stores # currently these use a hack to store 'filters' within attributes # need to drop this here for V3 store tests to succeed - attributes.pop('filters', None) + attributes.pop("filters", None) encoding = { "chunks": zarr_array.chunks, @@ -592,11 +588,11 @@ def store( self.set_variables( variables_encoded, check_encoding_set, writer, unlimited_dims=unlimited_dims ) - zarr_version = getattr(self.zarr_group.store, '_store_version', 3) + zarr_version = getattr(self.zarr_group.store, "_store_version", 3) consolidate_kwargs = {} if zarr_version > 2: - # zarr v3 spec requires providing a path - consolidate_kwargs['path'] = self.zarr_group.path + # If the group has a path, it needs to also be passed to consolidate_metadata + consolidate_kwargs["path"] = self.zarr_group.path if self._consolidate_on_close: zarr.consolidate_metadata(self.zarr_group.store, **consolidate_kwargs) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 695cd4831ea..a43fda85752 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1954,6 +1954,7 @@ def to_zarr( region: Mapping[str, slice] | None = None, safe_chunks: bool = True, storage_options: dict[str, str] | None = None, + zarr_version : int | None = None, ) -> ZarrStore | Delayed: """Write dataset contents to a zarr group. 
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 8e3f9fe160a..5b5efa7aa02 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -104,13 +104,17 @@ have_zarr_kvstore = False try: from zarr.storage import KVStore + have_zarr_kvstore = True except ImportError: KVStore = None have_zarr_v3 = False try: - from zarr.storage_v3 import DirectoryStoreV3, KVStoreV3 + # as of Zarr v2.13 these imports require environment variable + # ZARR_V3_EXPERIMENTAL_API=1 + from zarr import DirectoryStoreV3, KVStoreV3 + have_zarr_v3 = True except ImportError: KVStoreV3 = None @@ -1705,14 +1709,18 @@ def create_zarr_target(self): @contextlib.contextmanager def create_store(self): with self.create_zarr_target() as store_target: - yield backends.ZarrStore.open_group(store_target, mode="w", **self.version_kwargs) + yield backends.ZarrStore.open_group( + store_target, mode="w", **self.version_kwargs + ) def save(self, dataset, store_target, **kwargs): return dataset.to_zarr(store=store_target, **kwargs, **self.version_kwargs) @contextlib.contextmanager def open(self, store_target, **kwargs): - with xr.open_dataset(store_target, engine="zarr", **kwargs, **self.version_kwargs) as ds: + with xr.open_dataset( + store_target, engine="zarr", **kwargs, **self.version_kwargs + ) as ds: yield ds @contextlib.contextmanager @@ -2040,9 +2048,13 @@ def test_write_persistence_modes(self, group): ds, ds_to_append, _ = create_append_test_data() with self.create_zarr_target() as store_target: ds.to_zarr(store_target, mode="w", group=group, **self.version_kwargs) - ds_to_append.to_zarr(store_target, append_dim="time", group=group, **self.version_kwargs) + ds_to_append.to_zarr( + store_target, append_dim="time", group=group, **self.version_kwargs + ) original = xr.concat([ds, ds_to_append], dim="time") - actual = xr.open_dataset(store_target, group=group, engine="zarr", **self.version_kwargs) + actual = xr.open_dataset( + store_target, group=group, engine="zarr", **self.version_kwargs + ) assert_identical(original, actual) def test_compressor_encoding(self): @@ -2104,13 +2116,19 @@ def test_append_with_invalid_dim_raises(self): with pytest.raises( ValueError, match="does not match any existing dataset dimensions" ): - ds_to_append.to_zarr(store_target, append_dim="notvalid", **self.version_kwargs) + ds_to_append.to_zarr( + store_target, append_dim="notvalid", **self.version_kwargs + ) def test_append_with_no_dims_raises(self): with self.create_zarr_target() as store_target: - Dataset({"foo": ("x", [1])}).to_zarr(store_target, mode="w", **self.version_kwargs) + Dataset({"foo": ("x", [1])}).to_zarr( + store_target, mode="w", **self.version_kwargs + ) with pytest.raises(ValueError, match="different dimension names"): - Dataset({"foo": ("y", [2])}).to_zarr(store_target, mode="a", **self.version_kwargs) + Dataset({"foo": ("y", [2])}).to_zarr( + store_target, mode="a", **self.version_kwargs + ) def test_append_with_append_dim_not_set_raises(self): ds, ds_to_append, _ = create_append_test_data() @@ -2124,7 +2142,9 @@ def test_append_with_mode_not_a_raises(self): with self.create_zarr_target() as store_target: ds.to_zarr(store_target, mode="w", **self.version_kwargs) with pytest.raises(ValueError, match="cannot set append_dim unless"): - ds_to_append.to_zarr(store_target, mode="w", append_dim="time", **self.version_kwargs) + ds_to_append.to_zarr( + store_target, mode="w", append_dim="time", **self.version_kwargs + ) def test_append_with_existing_encoding_raises(self): ds, ds_to_append, 
_ = create_append_test_data() @@ -2161,11 +2181,15 @@ def test_check_encoding_is_consistent_after_append(self): encoding = {"da": {"compressor": compressor}} ds.to_zarr(store_target, mode="w", encoding=encoding, **self.version_kwargs) ds_to_append.to_zarr(store_target, append_dim="time", **self.version_kwargs) - actual_ds = xr.open_dataset(store_target, engine="zarr", **self.version_kwargs) + actual_ds = xr.open_dataset( + store_target, engine="zarr", **self.version_kwargs + ) actual_encoding = actual_ds["da"].encoding["compressor"] assert actual_encoding.get_config() == compressor.get_config() assert_identical( - xr.open_dataset(store_target, engine="zarr", **self.version_kwargs).compute(), + xr.open_dataset( + store_target, engine="zarr", **self.version_kwargs + ).compute(), xr.concat([ds, ds_to_append], dim="time"), ) @@ -2182,7 +2206,8 @@ def test_append_with_new_variable(self): combined = xr.concat([ds, ds_to_append], dim="time") combined["new_var"] = ds_with_new_var["new_var"] assert_identical( - combined, xr.open_dataset(store_target, engine="zarr", **self.version_kwargs) + combined, + xr.open_dataset(store_target, engine="zarr", **self.version_kwargs), ) @requires_dask @@ -2289,9 +2314,14 @@ def test_write_region(self, consolidated, compute, use_dask): for i in range(0, 10, 2): region = {"x": slice(i, i + 2)} nonzeros.isel(region).to_zarr( - store, region=region, consolidated=consolidated, **self.version_kwargs, + store, + region=region, + consolidated=consolidated, + **self.version_kwargs, ) - with xr.open_zarr(store, consolidated=consolidated, **self.version_kwargs) as actual: + with xr.open_zarr( + store, consolidated=consolidated, **self.version_kwargs + ) as actual: assert_identical(actual, nonzeros) @pytest.mark.parametrize("mode", [None, "r+", "a"]) @@ -2301,7 +2331,9 @@ def test_write_region_mode(self, mode): with self.create_zarr_target() as store: zeros.to_zarr(store, **self.version_kwargs) for region in [{"x": slice(5)}, {"x": slice(5, 10)}]: - nonzeros.isel(region).to_zarr(store, region=region, mode=mode, **self.version_kwargs) + nonzeros.isel(region).to_zarr( + store, region=region, mode=mode, **self.version_kwargs + ) with xr.open_zarr(store, **self.version_kwargs) as actual: assert_identical(actual, nonzeros) @@ -2343,7 +2375,9 @@ def test_write_preexisting_override_metadata(self): with self.create_zarr_target() as store: original.to_zarr(store, compute=False, **self.version_kwargs) # with region, the default mode becomes r+ - both_modified.to_zarr(store, region={"x": slice(None)}, **self.version_kwargs) + both_modified.to_zarr( + store, region={"x": slice(None)}, **self.version_kwargs + ) with self.open(store) as actual: assert_identical(actual, only_new_data) @@ -2371,7 +2405,9 @@ def setup_and_verify_store(expected=data): "cannot set region unless mode='a', mode='r+' or mode=None" ), ): - data.to_zarr(store, region={"x": slice(None)}, mode="w", **self.version_kwargs) + data.to_zarr( + store, region={"x": slice(None)}, mode="w", **self.version_kwargs + ) with setup_and_verify_store() as store: with pytest.raises(TypeError, match=r"must be a dict"): @@ -2383,7 +2419,9 @@ def setup_and_verify_store(expected=data): with setup_and_verify_store() as store: with pytest.raises(ValueError, match=r"step on all slices"): - data2.to_zarr(store, region={"x": slice(None, None, 2)}, **self.version_kwargs) + data2.to_zarr( + store, region={"x": slice(None, None, 2)}, **self.version_kwargs + ) with setup_and_verify_store() as store: with pytest.raises( @@ -2397,13 +2435,20 @@ def 
setup_and_verify_store(expected=data): ValueError, match=r"all variables in the dataset to write must have at least one dimension in common", ): - data2.assign(v=2).to_zarr(store, region={"x": slice(2)}, **self.version_kwargs) + data2.assign(v=2).to_zarr( + store, region={"x": slice(2)}, **self.version_kwargs + ) with setup_and_verify_store() as store: with pytest.raises( ValueError, match=r"cannot list the same dimension in both" ): - data.to_zarr(store, region={"x": slice(None)}, append_dim="x", **self.version_kwargs) + data.to_zarr( + store, + region={"x": slice(None)}, + append_dim="x", + **self.version_kwargs, + ) with setup_and_verify_store() as store: with pytest.raises( @@ -2517,7 +2562,6 @@ def create_store(self): yield group - class ZarrBaseV3(ZarrBase): def test_roundtrip_coordinates_with_space(self): original = Dataset(coords={"x": 0, "y z": 1}) @@ -2548,7 +2592,7 @@ def create_zarr_target(self): class TestZarrDirectoryStoreV3FromPath(TestZarrDirectoryStoreV3): # Must specify zarr_version=3 to get a v3 store because create_zarr_target # is a string path. - version_kwargs = {'zarr_version': 3} + version_kwargs = {"zarr_version": 3} @contextlib.contextmanager def create_zarr_target(self): From f9c14e08a2083cae1b39497d4cd4d789ded5372f Mon Sep 17 00:00:00 2001 From: Gregory Lee Date: Mon, 11 Apr 2022 19:56:58 -0400 Subject: [PATCH 05/16] flake8 fixes --- xarray/tests/test_backends.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 5b5efa7aa02..6cc450e85ce 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2565,22 +2565,21 @@ def create_store(self): class ZarrBaseV3(ZarrBase): def test_roundtrip_coordinates_with_space(self): original = Dataset(coords={"x": 0, "y z": 1}) - expected = Dataset({"y z": 1}, {"x": 0}) with pytest.warns(SerializationWarning): # v3 stores do not allow spaces in the key name with pytest.raises(ValueError): - with self.roundtrip(original) as actual: + with self.roundtrip(original): pass -@pytest.mark.skipif(not have_zarr_v3, reason=f"requires zarr version 3") +@pytest.mark.skipif(not have_zarr_v3, reason="requires zarr version 3") class TestZarrKVStoreV3(ZarrBaseV3): @contextlib.contextmanager def create_zarr_target(self): yield KVStoreV3({}) -@pytest.mark.skipif(not have_zarr_v3, reason=f"requires zarr version 3") +@pytest.mark.skipif(not have_zarr_v3, reason="requires zarr version 3") class TestZarrDirectoryStoreV3(ZarrBaseV3): @contextlib.contextmanager def create_zarr_target(self): @@ -2588,7 +2587,7 @@ def create_zarr_target(self): yield DirectoryStoreV3(tmp) -@pytest.mark.skipif(not have_zarr_v3, reason=f"requires zarr version 3") +@pytest.mark.skipif(not have_zarr_v3, reason="requires zarr version 3") class TestZarrDirectoryStoreV3FromPath(TestZarrDirectoryStoreV3): # Must specify zarr_version=3 to get a v3 store because create_zarr_target # is a string path. 
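The *V3 test classes above only run when zarr's experimental v3 API is enabled; otherwise
the imports fail and the skipif markers take effect. A rough local equivalent of the CI
change made later in this series (patch 09) might be, purely as an illustration (the -k
selection is not part of the patches):

    export ZARR_V3_EXPERIMENTAL_API=1
    python -m pytest xarray/tests/test_backends.py -k V3
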
From 09ddcffcc41e043d27f42f2a1375a946603ef42c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 21 Sep 2022 18:06:42 +0000 Subject: [PATCH 06/16] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/backends/zarr.py | 2 +- xarray/core/dataset.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 7312ae4574c..69673a0a40a 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -860,7 +860,7 @@ def open_dataset( chunk_store=None, storage_options=None, stacklevel=3, - zarr_version: Optional[int] = None, + zarr_version: int | None = None, ): filename_or_obj = _normalize_path(filename_or_obj) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index a43fda85752..e0683124a52 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1954,7 +1954,7 @@ def to_zarr( region: Mapping[str, slice] | None = None, safe_chunks: bool = True, storage_options: dict[str, str] | None = None, - zarr_version : int | None = None, + zarr_version: int | None = None, ) -> ZarrStore | Delayed: """Write dataset contents to a zarr group. From 0f14df5605246523bbb60fa27fa1ae44050269bf Mon Sep 17 00:00:00 2001 From: Joseph Hamman Date: Fri, 28 Oct 2022 16:01:27 -0700 Subject: [PATCH 07/16] disallow consolidated metadata for zarr v3 --- xarray/backends/api.py | 7 +++++++ xarray/backends/zarr.py | 15 ++++++++++++++- xarray/tests/test_backends.py | 28 +++++++++++++++++++--------- 3 files changed, 40 insertions(+), 10 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index b0e1063357c..a7aa65df99f 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -1609,6 +1609,13 @@ def to_zarr( f"``region`` with to_zarr(), got {append_dim} in both" ) + if zarr_version is None: + # default to 2 if store doesn't specify it's version (e.g. a path) + zarr_version = getattr(store, "_store_version", 2) + + if consolidated is None and zarr_version > 2: + consolidated = False + if mode == "r+": already_consolidated = consolidated consolidate_on_close = False diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index ffe6ae1ff71..0a7a0839f43 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -3,7 +3,6 @@ import json import os import warnings -from typing import Optional import numpy as np @@ -375,6 +374,17 @@ def open_group( if zarr_version > 2: open_kwargs["zarr_version"] = zarr_version + if consolidated or consolidate_on_close: + raise ValueError( + "consolidated metadata has not been implemented for zarr " + f"version {zarr_version} yet. Set consolidated=False for " + f"zarr version {zarr_version}. See also " + "https://github.com/zarr-developers/zarr-specs/issues/136" + ) + + if consolidated is None: + consolidated = False + if chunk_store: open_kwargs["chunk_store"] = chunk_store if consolidated is None: @@ -745,6 +755,9 @@ def open_zarr( capability. Only works for stores that have already been consolidated. By default (`consolidate=None`), attempts to read consolidated metadata, falling back to read non-consolidated metadata if that fails. + + When the experimental ``zarr_version=3``, ``consolidated`` must be + either be ``None`` or ``False``. chunk_store : MutableMapping, optional A separate Zarr store only for chunk data. 
storage_options : dict, optional diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index bcd49912339..782739d4f5f 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1753,6 +1753,7 @@ def test_write_inconsistent_chunks(self) -> None: class ZarrBase(CFEncodedBase): DIMENSION_KEY = "_ARRAY_DIMENSIONS" + zarr_version = 2 version_kwargs = {} def create_zarr_target(self): @@ -1790,16 +1791,22 @@ def roundtrip( @pytest.mark.parametrize("consolidated", [False, True, None]) def test_roundtrip_consolidated(self, consolidated) -> None: + if consolidated and self.zarr_version > 2: + pytest.xfail("consolidated metadata is not supported for zarr v3 yet") expected = create_test_data() with self.roundtrip( expected, - save_kwargs={"consolidated": True}, - open_kwargs={"backend_kwargs": {"consolidated": True}}, + save_kwargs={"consolidated": consolidated}, + open_kwargs={"backend_kwargs": {"consolidated": consolidated}}, ) as actual: self.check_dtypes_roundtripped(expected, actual) assert_identical(expected, actual) def test_read_non_consolidated_warning(self) -> None: + + if self.zarr_version > 2: + pytest.xfail("consolidated metadata is not supported for zarr v3 yet") + expected = create_test_data() with self.create_zarr_target() as store: expected.to_zarr(store, consolidated=False, **self.version_kwargs) @@ -2214,11 +2221,10 @@ def test_append_with_existing_encoding_raises(self) -> None: def test_append_string_length_mismatch_raises(self, dtype) -> None: ds, ds_to_append = create_append_string_length_mismatch_test_data(dtype) with self.create_zarr_target() as store_target: - ds.to_zarr(store_target, mode="w") + ds.to_zarr(store_target, mode="w", **self.version_kwargs) with pytest.raises(ValueError, match="Mismatched dtypes for variable"): ds_to_append.to_zarr( - store_target, - append_dim="time", + store_target, append_dim="time", **self.version_kwargs ) def test_check_encoding_is_consistent_after_append(self) -> None: @@ -2336,12 +2342,14 @@ def test_no_warning_from_open_emptydim_with_chunks(self) -> None: with self.roundtrip(ds, open_kwargs=dict(chunks={"a": 1})) as ds_reload: assert_identical(ds, ds_reload) - @pytest.mark.parametrize("consolidated", [False, True]) + @pytest.mark.parametrize("consolidated", [False, True, None]) @pytest.mark.parametrize("compute", [False, True]) @pytest.mark.parametrize("use_dask", [False, True]) def test_write_region(self, consolidated, compute, use_dask) -> None: if (use_dask or not compute) and not has_dask: pytest.skip("requires dask") + if consolidated and self.zarr_version > 2: + pytest.xfail("consolidated metadata is not supported for zarr v3 yet") zeros = Dataset({"u": (("x",), np.zeros(10))}) nonzeros = Dataset({"u": (("x",), np.arange(1, 11))}) @@ -2576,14 +2584,14 @@ def test_attributes(self, obj) -> None: obj.attrs["good"] = {"key": "value"} ds = obj if isinstance(obj, Dataset) else obj.to_dataset() with self.create_zarr_target() as store_target: - ds.to_zarr(store_target) - assert_identical(ds, xr.open_zarr(store_target)) + ds.to_zarr(store_target, **self.version_kwargs) + assert_identical(ds, xr.open_zarr(store_target, **self.version_kwargs)) obj.attrs["bad"] = DataArray() ds = obj if isinstance(obj, Dataset) else obj.to_dataset() with self.create_zarr_target() as store_target: with pytest.raises(TypeError, match=r"Invalid attribute in Dataset.attrs."): - ds.to_zarr(store_target) + ds.to_zarr(store_target, **self.version_kwargs) @requires_zarr @@ -2615,6 +2623,8 @@ def create_store(self): class 
ZarrBaseV3(ZarrBase): + zarr_version = 3 + def test_roundtrip_coordinates_with_space(self): original = Dataset(coords={"x": 0, "y z": 1}) with pytest.warns(SerializationWarning): From 1556ce09582230a602cc6db97fd7f5edf45731ac Mon Sep 17 00:00:00 2001 From: Joseph Hamman Date: Fri, 28 Oct 2022 16:41:31 -0700 Subject: [PATCH 08/16] whats new a + remove more consolidated metadata for v3 --- doc/whats-new.rst | 2 ++ xarray/backends/zarr.py | 7 +------ xarray/core/dataset.py | 3 +++ 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 10de49e23d3..635e4bb646d 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -31,6 +31,8 @@ New Features - Added methods :py:meth:`DataArrayGroupBy.cumprod` and :py:meth:`DatasetGroupBy.cumprod`. (:pull:`5816`) By `Patrick Naylor `_ +- Add experimental support for Zarr's V3 specification. (:pull:`6475`). + By `Gregory Lee `_ and `Joe Hamman `_. Breaking changes ~~~~~~~~~~~~~~~~ diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 0a7a0839f43..9905da0e5f2 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -593,13 +593,8 @@ def store( self.set_variables( variables_encoded, check_encoding_set, writer, unlimited_dims=unlimited_dims ) - zarr_version = getattr(self.zarr_group.store, "_store_version", 3) - consolidate_kwargs = {} - if zarr_version > 2: - # If the group has a path, it needs to also be passed to consolidate_metadata - consolidate_kwargs["path"] = self.zarr_group.path if self._consolidate_on_close: - zarr.consolidate_metadata(self.zarr_group.store, **consolidate_kwargs) + zarr.consolidate_metadata(self.zarr_group.store) def sync(self): pass diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 629e3adbea3..18bb020c2a7 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2019,6 +2019,9 @@ def to_zarr( metadata; if False, do not. The default (`consolidated=None`) means write consolidated metadata and attempt to read consolidated metadata for existing stores (falling back to non-consolidated). + + When the experimental ``zarr_version=3``, ``consolidated`` must be + either be ``None`` or ``False``. append_dim : hashable, optional If set, the dimension along which the data will be appended. All other dimensions on overridden variables must remain the same size. 
From 5885134cae093f183a1d857693824d2bd1bfc4d7 Mon Sep 17 00:00:00 2001 From: Joseph Hamman Date: Fri, 28 Oct 2022 19:45:11 -0700 Subject: [PATCH 09/16] activate upstream dev test for zarr v3 --- .github/workflows/upstream-dev-ci.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/upstream-dev-ci.yaml b/.github/workflows/upstream-dev-ci.yaml index d451f46d32e..7a71eb75a54 100644 --- a/.github/workflows/upstream-dev-ci.yaml +++ b/.github/workflows/upstream-dev-ci.yaml @@ -79,6 +79,7 @@ jobs: if: success() id: status run: | + export ZARR_V3_EXPERIMENTAL_API=1 python -m pytest --timeout=60 -rf \ --report-log output-${{ matrix.python-version }}-log.jsonl - name: Generate and publish the report From 3ca92f0c6e18ef3d82d31af0bfd7b701cba8c229 Mon Sep 17 00:00:00 2001 From: Joseph Hamman Date: Sat, 29 Oct 2022 14:27:34 -0700 Subject: [PATCH 10/16] better typing --- xarray/backends/api.py | 4 +++- xarray/tests/test_backends.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index a7aa65df99f..c0126d5f06f 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -1503,6 +1503,7 @@ def to_zarr( region: Mapping[str, slice] | None = None, safe_chunks: bool = True, storage_options: dict[str, str] | None = None, + zarr_version: int | None = None, ) -> backends.ZarrStore: ... @@ -1524,6 +1525,7 @@ def to_zarr( region: Mapping[str, slice] | None = None, safe_chunks: bool = True, storage_options: dict[str, str] | None = None, + zarr_version: int | None = None, ) -> Delayed: ... @@ -1611,7 +1613,7 @@ def to_zarr( if zarr_version is None: # default to 2 if store doesn't specify it's version (e.g. a path) - zarr_version = getattr(store, "_store_version", 2) + zarr_version = int(getattr(store, "_store_version", 2)) if consolidated is None and zarr_version > 2: consolidated = False diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 782739d4f5f..70987a3dd38 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1754,7 +1754,7 @@ class ZarrBase(CFEncodedBase): DIMENSION_KEY = "_ARRAY_DIMENSIONS" zarr_version = 2 - version_kwargs = {} + version_kwargs: dict[str, Any] = {} def create_zarr_target(self): raise NotImplementedError From 3c42a95bda8af656dac078ac96c82d9740f57d32 Mon Sep 17 00:00:00 2001 From: Joseph Hamman Date: Fri, 4 Nov 2022 16:31:37 -0700 Subject: [PATCH 11/16] untype zarr_version in open_dataset --- xarray/backends/zarr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 9905da0e5f2..b569e9ccb2c 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -876,7 +876,7 @@ def open_dataset( chunk_store=None, storage_options=None, stacklevel=3, - zarr_version: int | None = None, + zarr_version=None, ): filename_or_obj = _normalize_path(filename_or_obj) From 4e29496640088823a663a42c4489e3db27a5172b Mon Sep 17 00:00:00 2001 From: Joseph Hamman Date: Fri, 4 Nov 2022 16:34:11 -0700 Subject: [PATCH 12/16] update whats new --- doc/whats-new.rst | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 1c277e15d5f..e42212802cb 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -14,6 +14,16 @@ What's New np.random.seed(123456) +.. _whats-new.2022.11.1: + +v2022.11.1 (Nov TK, 2022) +------------------------ + +New Features +~~~~~~~~~~~~ + +- Add experimental support for Zarr's V3 specification. 
(:pull:`6475`). + By `Gregory Lee `_ and `Joe Hamman `_. .. _whats-new.2022.11.0: @@ -37,8 +47,6 @@ New Features - Added methods :py:meth:`DataArrayGroupBy.cumprod` and :py:meth:`DatasetGroupBy.cumprod`. (:pull:`5816`) By `Patrick Naylor `_ -- Add experimental support for Zarr's V3 specification. (:pull:`6475`). - By `Gregory Lee `_ and `Joe Hamman `_. Breaking changes ~~~~~~~~~~~~~~~~ From 4a6d5c94bae15c1ef1f77f9d3360211041152ec6 Mon Sep 17 00:00:00 2001 From: Joseph Hamman Date: Sat, 5 Nov 2022 07:40:47 -0700 Subject: [PATCH 13/16] [test-upstream] From 0a848572182beab149724ee9fbc5e3585f75cfa8 Mon Sep 17 00:00:00 2001 From: Joseph Hamman Date: Fri, 18 Nov 2022 09:14:33 -0800 Subject: [PATCH 14/16] update comment --- xarray/backends/zarr.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index b569e9ccb2c..cca2d89678f 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -459,9 +459,8 @@ def open_store_variable(self, name, zarr_array): ) attributes = dict(attributes) - # TODO: how to properly handle 'filters' for v3 stores - # currently these use a hack to store 'filters' within attributes - # need to drop this here for V3 store tests to succeed + # TODO: this should not be needed once + # https://github.com/zarr-developers/zarr-python/issues/1269 is resolved. attributes.pop("filters", None) encoding = { From c953864939946648e45df7572f43d6f0f0b1b22b Mon Sep 17 00:00:00 2001 From: Joseph Hamman Date: Fri, 18 Nov 2022 16:27:27 -0800 Subject: [PATCH 15/16] fix whats new --- doc/whats-new.rst | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index ed6a6caabad..1268b4e730b 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -16,23 +16,14 @@ What's New .. _whats-new.2022.11.1: -v2022.11.1 (Nov TK, 2022) ------------------------- - -New Features -~~~~~~~~~~~~ - -- Add experimental support for Zarr's V3 specification. (:pull:`6475`). - By `Gregory Lee `_ and `Joe Hamman `_. - -.. _whats-new.2022.11.1: - v2022.11.1 (unreleased) ----------------------- New Features ~~~~~~~~~~~~ +- Add experimental support for Zarr's V3 specification. (:pull:`6475`). + By `Gregory Lee `_ and `Joe Hamman `_. Breaking changes ~~~~~~~~~~~~~~~~ From 2d471b3076875ef3f3091cb05b3f099cad38bebd Mon Sep 17 00:00:00 2001 From: Joseph Hamman Date: Sat, 19 Nov 2022 08:49:07 -0800 Subject: [PATCH 16/16] update whats new --- doc/whats-new.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 1268b4e730b..2e90a660701 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -22,7 +22,7 @@ v2022.11.1 (unreleased) New Features ~~~~~~~~~~~~ -- Add experimental support for Zarr's V3 specification. (:pull:`6475`). +- Add experimental support for Zarr's in-progress V3 specification. (:pull:`6475`). By `Gregory Lee `_ and `Joe Hamman `_. Breaking changes