diff --git a/.github/workflows/ci-additional.yaml b/.github/workflows/ci-additional.yaml index d8f2b99acac..1fc58c4f0bf 100644 --- a/.github/workflows/ci-additional.yaml +++ b/.github/workflows/ci-additional.yaml @@ -130,7 +130,7 @@ jobs: python -m mypy --install-types --non-interactive --cobertura-xml-report mypy_report xarray/ - name: Upload mypy coverage to Codecov - uses: codecov/codecov-action@v4.4.1 + uses: codecov/codecov-action@v4.5.0 with: file: mypy_report/cobertura.xml flags: mypy @@ -184,7 +184,7 @@ jobs: python -m mypy --install-types --non-interactive --cobertura-xml-report mypy_report xarray/ - name: Upload mypy coverage to Codecov - uses: codecov/codecov-action@v4.4.1 + uses: codecov/codecov-action@v4.5.0 with: file: mypy_report/cobertura.xml flags: mypy39 @@ -245,7 +245,7 @@ jobs: python -m pyright xarray/ - name: Upload pyright coverage to Codecov - uses: codecov/codecov-action@v4.4.1 + uses: codecov/codecov-action@v4.5.0 with: file: pyright_report/cobertura.xml flags: pyright @@ -304,7 +304,7 @@ jobs: python -m pyright xarray/ - name: Upload pyright coverage to Codecov - uses: codecov/codecov-action@v4.4.1 + uses: codecov/codecov-action@v4.5.0 with: file: pyright_report/cobertura.xml flags: pyright39 diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 2975ba8829f..6f9ba4a440d 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -159,7 +159,7 @@ jobs: path: pytest.xml - name: Upload code coverage to Codecov - uses: codecov/codecov-action@v4.4.1 + uses: codecov/codecov-action@v4.5.0 with: file: ./coverage.xml flags: unittests diff --git a/.github/workflows/pypi-release.yaml b/.github/workflows/pypi-release.yaml index e7aec6e8f39..d50fff220a5 100644 --- a/.github/workflows/pypi-release.yaml +++ b/.github/workflows/pypi-release.yaml @@ -88,7 +88,7 @@ jobs: path: dist - name: Publish package to TestPyPI if: github.event_name == 'push' - uses: pypa/gh-action-pypi-publish@v1.8.14 + uses: pypa/gh-action-pypi-publish@v1.9.0 with: repository_url: https://test.pypi.org/legacy/ verbose: true @@ -111,6 +111,6 @@ jobs: name: releases path: dist - name: Publish package to PyPI - uses: pypa/gh-action-pypi-publish@v1.8.14 + uses: pypa/gh-action-pypi-publish@v1.9.0 with: verbose: true diff --git a/.github/workflows/upstream-dev-ci.yaml b/.github/workflows/upstream-dev-ci.yaml index e2fbabf39c4..4bc1b693a00 100644 --- a/.github/workflows/upstream-dev-ci.yaml +++ b/.github/workflows/upstream-dev-ci.yaml @@ -146,7 +146,7 @@ jobs: run: | python -m mypy --install-types --non-interactive --cobertura-xml-report mypy_report - name: Upload mypy coverage to Codecov - uses: codecov/codecov-action@v4.4.1 + uses: codecov/codecov-action@v4.5.0 with: file: mypy_report/cobertura.xml flags: mypy diff --git a/README.md b/README.md index 432d535d1b1..dcf71217e2c 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ [![Available on pypi](https://img.shields.io/pypi/v/xarray.svg)](https://pypi.python.org/pypi/xarray/) [![Formatted with black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/python/black) [![Checked with mypy](http://www.mypy-lang.org/static/mypy_badge.svg)](http://mypy-lang.org/) -[![Mirror on zendoo](https://zenodo.org/badge/DOI/10.5281/zenodo.598201.svg)](https://doi.org/10.5281/zenodo.598201) +[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.11183201.svg)](https://doi.org/10.5281/zenodo.11183201) [![Examples on binder](https://img.shields.io/badge/launch-binder-579ACA.svg?logo=)](https://mybinder.org/v2/gh/pydata/xarray/main?urlpath=lab/tree/doc/examples/weather-data.ipynb) [![Twitter](https://img.shields.io/twitter/follow/xarray_dev?style=social)](https://twitter.com/xarray_dev) @@ -46,15 +46,15 @@ provide a powerful and concise interface. For example: - Apply operations over dimensions by name: `x.sum('time')`. - Select values by label instead of integer location: - `x.loc['2014-01-01']` or `x.sel(time='2014-01-01')`. + `x.loc['2014-01-01']` or `x.sel(time='2014-01-01')`. - Mathematical operations (e.g., `x - y`) vectorize across multiple - dimensions (array broadcasting) based on dimension names, not shape. + dimensions (array broadcasting) based on dimension names, not shape. - Flexible split-apply-combine operations with groupby: - `x.groupby('time.dayofyear').mean()`. + `x.groupby('time.dayofyear').mean()`. - Database like alignment based on coordinate labels that smoothly - handles missing values: `x, y = xr.align(x, y, join='outer')`. + handles missing values: `x, y = xr.align(x, y, join='outer')`. - Keep track of arbitrary metadata in the form of a Python dictionary: - `x.attrs`. + `x.attrs`. ## Documentation @@ -73,12 +73,12 @@ page](https://docs.xarray.dev/en/stable/contributing.html). ## Get in touch - Ask usage questions ("How do I?") on - [GitHub Discussions](https://github.com/pydata/xarray/discussions). + [GitHub Discussions](https://github.com/pydata/xarray/discussions). - Report bugs, suggest features or view the source code [on - GitHub](https://github.com/pydata/xarray). + GitHub](https://github.com/pydata/xarray). - For less well defined questions or ideas, or to announce other - projects of interest to xarray users, use the [mailing - list](https://groups.google.com/forum/#!forum/xarray). + projects of interest to xarray users, use the [mailing + list](https://groups.google.com/forum/#!forum/xarray). ## NumFOCUS @@ -114,7 +114,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, diff --git a/doc/user-guide/io.rst b/doc/user-guide/io.rst index fd6b7708e48..da414bc383e 100644 --- a/doc/user-guide/io.rst +++ b/doc/user-guide/io.rst @@ -741,6 +741,65 @@ instance and pass this, as follows: .. _Google Cloud Storage: https://cloud.google.com/storage/ .. _gcsfs: https://github.com/fsspec/gcsfs +.. _io.zarr.distributed_writes: + +Distributed writes +~~~~~~~~~~~~~~~~~~ + +Xarray will natively use dask to write in parallel to a zarr store, which should +satisfy most moderately sized datasets. For more flexible parallelization, we +can use ``region`` to write to limited regions of arrays in an existing Zarr +store. + +To scale this up to writing large datasets, first create an initial Zarr store +without writing all of its array data. This can be done by first creating a +``Dataset`` with dummy values stored in :ref:`dask `, and then calling +``to_zarr`` with ``compute=False`` to write only metadata (including ``attrs``) +to Zarr: + +.. ipython:: python + :suppress: + + ! rm -rf path/to/directory.zarr + +.. ipython:: python + + import dask.array + + # The values of this dask array are entirely irrelevant; only the dtype, + # shape and chunks are used + dummies = dask.array.zeros(30, chunks=10) + ds = xr.Dataset({"foo": ("x", dummies)}, coords={"x": np.arange(30)}) + path = "path/to/directory.zarr" + # Now we write the metadata without computing any array values + ds.to_zarr(path, compute=False) + +Now, a Zarr store with the correct variable shapes and attributes exists that +can be filled out by subsequent calls to ``to_zarr``. +Setting ``region="auto"`` will open the existing store and determine the +correct alignment of the new data with the existing dimensions, or as an +explicit mapping from dimension names to Python ``slice`` objects indicating +where the data should be written (in index space, not label space), e.g., + +.. ipython:: python + + # For convenience, we'll slice a single dataset, but in the real use-case + # we would create them separately possibly even from separate processes. + ds = xr.Dataset({"foo": ("x", np.arange(30))}, coords={"x": np.arange(30)}) + # Any of the following region specifications are valid + ds.isel(x=slice(0, 10)).to_zarr(path, region="auto") + ds.isel(x=slice(10, 20)).to_zarr(path, region={"x": "auto"}) + ds.isel(x=slice(20, 30)).to_zarr(path, region={"x": slice(20, 30)}) + +Concurrent writes with ``region`` are safe as long as they modify distinct +chunks in the underlying Zarr arrays (or use an appropriate ``lock``). + +As a safety check to make it harder to inadvertently override existing values, +if you set ``region`` then *all* variables included in a Dataset must have +dimensions included in ``region``. Other variables (typically coordinates) +need to be explicitly dropped and/or written in a separate calls to ``to_zarr`` +with ``mode='a'``. + Zarr Compressors and Filters ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -767,37 +826,6 @@ For example: Not all native zarr compression and filtering options have been tested with xarray. -.. _io.zarr.consolidated_metadata: - -Consolidated Metadata -~~~~~~~~~~~~~~~~~~~~~ - -Xarray needs to read all of the zarr metadata when it opens a dataset. -In some storage mediums, such as with cloud object storage (e.g. `Amazon S3`_), -this can introduce significant overhead, because two separate HTTP calls to the -object store must be made for each variable in the dataset. -By default Xarray uses a feature called -*consolidated metadata*, storing all metadata for the entire dataset with a -single key (by default called ``.zmetadata``). This typically drastically speeds -up opening the store. (For more information on this feature, consult the -`zarr docs on consolidating metadata `_.) - -By default, xarray writes consolidated metadata and attempts to read stores -with consolidated metadata, falling back to use non-consolidated metadata for -reads. Because this fall-back option is so much slower, xarray issues a -``RuntimeWarning`` with guidance when reading with consolidated metadata fails: - - Failed to open Zarr store with consolidated metadata, falling back to try - reading non-consolidated metadata. This is typically much slower for - opening a dataset. To silence this warning, consider: - - 1. Consolidating metadata in this existing store with - :py:func:`zarr.consolidate_metadata`. - 2. Explicitly setting ``consolidated=False``, to avoid trying to read - consolidate metadata. - 3. Explicitly setting ``consolidated=True``, to raise an error in this case - instead of falling back to try reading non-consolidated metadata. - .. _io.zarr.appending: Modifying existing Zarr stores @@ -856,59 +884,6 @@ order, e.g., for time-stepping a simulation: ) ds2.to_zarr("path/to/directory.zarr", append_dim="t") -Finally, you can use ``region`` to write to limited regions of existing arrays -in an existing Zarr store. This is a good option for writing data in parallel -from independent processes. - -To scale this up to writing large datasets, the first step is creating an -initial Zarr store without writing all of its array data. This can be done by -first creating a ``Dataset`` with dummy values stored in :ref:`dask `, -and then calling ``to_zarr`` with ``compute=False`` to write only metadata -(including ``attrs``) to Zarr: - -.. ipython:: python - :suppress: - - ! rm -rf path/to/directory.zarr - -.. ipython:: python - - import dask.array - - # The values of this dask array are entirely irrelevant; only the dtype, - # shape and chunks are used - dummies = dask.array.zeros(30, chunks=10) - ds = xr.Dataset({"foo": ("x", dummies)}, coords={"x": np.arange(30)}) - path = "path/to/directory.zarr" - # Now we write the metadata without computing any array values - ds.to_zarr(path, compute=False) - -Now, a Zarr store with the correct variable shapes and attributes exists that -can be filled out by subsequent calls to ``to_zarr``. -Setting ``region="auto"`` will open the existing store and determine the -correct alignment of the new data with the existing coordinates, or as an -explicit mapping from dimension names to Python ``slice`` objects indicating -where the data should be written (in index space, not label space), e.g., - -.. ipython:: python - - # For convenience, we'll slice a single dataset, but in the real use-case - # we would create them separately possibly even from separate processes. - ds = xr.Dataset({"foo": ("x", np.arange(30))}, coords={"x": np.arange(30)}) - # Any of the following region specifications are valid - ds.isel(x=slice(0, 10)).to_zarr(path, region="auto") - ds.isel(x=slice(10, 20)).to_zarr(path, region={"x": "auto"}) - ds.isel(x=slice(20, 30)).to_zarr(path, region={"x": slice(20, 30)}) - -Concurrent writes with ``region`` are safe as long as they modify distinct -chunks in the underlying Zarr arrays (or use an appropriate ``lock``). - -As a safety check to make it harder to inadvertently override existing values, -if you set ``region`` then *all* variables included in a Dataset must have -dimensions included in ``region``. Other variables (typically coordinates) -need to be explicitly dropped and/or written in a separate calls to ``to_zarr`` -with ``mode='a'``. - .. _io.zarr.writing_chunks: Specifying chunks in a zarr store @@ -978,6 +953,38 @@ length of each dimension by using the shorthand chunk size ``-1``: The number of chunks on Tair matches our dask chunks, while there is now only a single chunk in the directory stores of each coordinate. +.. _io.zarr.consolidated_metadata: + +Consolidated Metadata +~~~~~~~~~~~~~~~~~~~~~ + +Xarray needs to read all of the zarr metadata when it opens a dataset. +In some storage mediums, such as with cloud object storage (e.g. `Amazon S3`_), +this can introduce significant overhead, because two separate HTTP calls to the +object store must be made for each variable in the dataset. +By default Xarray uses a feature called +*consolidated metadata*, storing all metadata for the entire dataset with a +single key (by default called ``.zmetadata``). This typically drastically speeds +up opening the store. (For more information on this feature, consult the +`zarr docs on consolidating metadata `_.) + +By default, xarray writes consolidated metadata and attempts to read stores +with consolidated metadata, falling back to use non-consolidated metadata for +reads. Because this fall-back option is so much slower, xarray issues a +``RuntimeWarning`` with guidance when reading with consolidated metadata fails: + + Failed to open Zarr store with consolidated metadata, falling back to try + reading non-consolidated metadata. This is typically much slower for + opening a dataset. To silence this warning, consider: + + 1. Consolidating metadata in this existing store with + :py:func:`zarr.consolidate_metadata`. + 2. Explicitly setting ``consolidated=False``, to avoid trying to read + consolidate metadata. + 3. Explicitly setting ``consolidated=True``, to raise an error in this case + instead of falling back to try reading non-consolidated metadata. + + .. _io.iris: Iris diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 7ec6e08ef96..e7a48458ae2 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -22,7 +22,8 @@ v2024.06.1 (unreleased) New Features ~~~~~~~~~~~~ - +- Allow chunking for arrays with duplicated dimension names (:issue:`8759`, :pull:`9099`). + By `Martin Raspaud `_. Breaking changes ~~~~~~~~~~~~~~~~ @@ -73,7 +74,6 @@ Bug fixes support arbitrary kwargs such as ``order`` for polynomial interpolation (:issue:`8762`). By `Nicolas Karasiak `_. - Documentation ~~~~~~~~~~~~~ - Add link to CF Conventions on packed data and sentence on type determination in the I/O user guide (:issue:`9041`, :pull:`9045`). diff --git a/xarray/namedarray/core.py b/xarray/namedarray/core.py index 960ab9d4d1d..fe47bf50533 100644 --- a/xarray/namedarray/core.py +++ b/xarray/namedarray/core.py @@ -812,7 +812,12 @@ def chunk( chunks = either_dict_or_kwargs(chunks, chunks_kwargs, "chunk") if is_dict_like(chunks): - chunks = {self.get_axis_num(dim): chunk for dim, chunk in chunks.items()} + # This method of iteration allows for duplicated dimension names, GH8579 + chunks = { + dim_number: chunks[dim] + for dim_number, dim in enumerate(self.dims) + if dim in chunks + } chunkmanager = guess_chunkmanager(chunked_array_type) diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index 3ac638c3c5f..20491eca91a 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -638,6 +638,13 @@ def counting_get(*args, **kwargs): assert count[0] == 1 + def test_duplicate_dims(self): + data = np.random.normal(size=(4, 4)) + arr = DataArray(data, dims=("x", "x")) + chunked_array = arr.chunk({"x": 2}) + assert chunked_array.chunks == ((2, 2), (2, 2)) + assert chunked_array.chunksizes == {"x": (2, 2)} + def test_stack(self): data = da.random.normal(size=(2, 3, 4), chunks=(1, 3, 4)) arr = DataArray(data, dims=("w", "x", "y")) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index ece2ddf8144..659c7c168a5 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -110,13 +110,14 @@ def test_repr(self) -> None: assert expected == repr(data_array) def test_repr_multiindex(self) -> None: + obj_size = np.dtype("O").itemsize expected = dedent( - """\ + f"""\ Size: 32B array([0, 1, 2, 3], dtype=uint64) Coordinates: - * x (x) object 32B MultiIndex - * level_1 (x) object 32B 'a' 'a' 'b' 'b' + * x (x) object {4 * obj_size}B MultiIndex + * level_1 (x) object {4 * obj_size}B 'a' 'a' 'b' 'b' * level_2 (x) int64 32B 1 2 1 2""" ) assert expected == repr(self.mda) @@ -129,15 +130,16 @@ def test_repr_multiindex_long(self) -> None: mda_long = DataArray( list(range(32)), coords={"x": mindex_long}, dims="x" ).astype(np.uint64) + obj_size = np.dtype("O").itemsize expected = dedent( - """\ + f"""\ Size: 256B array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], dtype=uint64) Coordinates: - * x (x) object 256B MultiIndex - * level_1 (x) object 256B 'a' 'a' 'a' 'a' 'a' 'a' ... 'd' 'd' 'd' 'd' 'd' 'd' + * x (x) object {32 * obj_size}B MultiIndex + * level_1 (x) object {32 * obj_size}B 'a' 'a' 'a' 'a' 'a' 'a' ... 'd' 'd' 'd' 'd' 'd' 'd' * level_2 (x) int64 256B 1 2 3 4 5 6 7 8 1 2 3 4 ... 5 6 7 8 1 2 3 4 5 6 7 8""" ) assert expected == repr(mda_long) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 81b27da8d5f..f6829861776 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -335,13 +335,14 @@ def test_repr(self) -> None: def test_repr_multiindex(self) -> None: data = create_test_multiindex() + obj_size = np.dtype("O").itemsize expected = dedent( - """\ - Size: 96B + f"""\ + Size: {8 * obj_size + 32}B Dimensions: (x: 4) Coordinates: - * x (x) object 32B MultiIndex - * level_1 (x) object 32B 'a' 'a' 'b' 'b' + * x (x) object {4 * obj_size}B MultiIndex + * level_1 (x) object {4 * obj_size}B 'a' 'a' 'b' 'b' * level_2 (x) int64 32B 1 2 1 2 Data variables: *empty*""" @@ -357,12 +358,12 @@ def test_repr_multiindex(self) -> None: midx_coords = Coordinates.from_pandas_multiindex(midx, "x") data = Dataset({}, midx_coords) expected = dedent( - """\ - Size: 96B + f"""\ + Size: {8 * obj_size + 32}B Dimensions: (x: 4) Coordinates: - * x (x) object 32B MultiIndex - * a_quite_long_level_name (x) object 32B 'a' 'a' 'b' 'b' + * x (x) object {4 * obj_size}B MultiIndex + * a_quite_long_level_name (x) object {4 * obj_size}B 'a' 'a' 'b' 'b' * level_2 (x) int64 32B 1 2 1 2 Data variables: *empty*""" diff --git a/xarray/tests/test_formatting.py b/xarray/tests/test_formatting.py index 2c40ac88f98..b9d5f401a4a 100644 --- a/xarray/tests/test_formatting.py +++ b/xarray/tests/test_formatting.py @@ -12,8 +12,6 @@ from xarray.core.datatree import DataTree # TODO: Remove when can do xr.DataTree from xarray.tests import requires_cftime, requires_dask, requires_netCDF4 -ON_WINDOWS = sys.platform == "win32" - class TestFormatting: def test_get_indexer_at_least_n_items(self) -> None: @@ -1071,74 +1069,34 @@ def test_array_repr_dtypes(): """.strip() assert actual == expected - -@pytest.mark.skipif( - ON_WINDOWS, - reason="Default numpy's dtypes vary according to OS", -) -def test_array_repr_dtypes_unix() -> None: - # Signed integer dtypes - ds = xr.DataArray(np.array([0]), dims="x") + array = np.array([0]) + ds = xr.DataArray(array, dims="x") actual = repr(ds) - expected = """ - Size: 8B -array([0]) + expected = f""" + Size: {array.dtype.itemsize}B +{repr(array)} Dimensions without coordinates: x """.strip() assert actual == expected - ds = xr.DataArray(np.array([0], dtype="int32"), dims="x") + array = np.array([0], dtype="int32") + ds = xr.DataArray(array, dims="x") actual = repr(ds) - expected = """ + expected = f""" Size: 4B -array([0], dtype=int32) +{repr(array)} Dimensions without coordinates: x """.strip() assert actual == expected - ds = xr.DataArray(np.array([0], dtype="int64"), dims="x") + array = np.array([0], dtype="int64") + ds = xr.DataArray(array, dims="x") actual = repr(ds) - expected = """ - Size: 8B -array([0]) -Dimensions without coordinates: x - """.strip() - assert actual == expected - - -@pytest.mark.skipif( - not ON_WINDOWS, - reason="Default numpy's dtypes vary according to OS", -) -def test_array_repr_dtypes_on_windows() -> None: - - # Integer dtypes - - ds = xr.DataArray(np.array([0]), dims="x") - actual = repr(ds) - expected = """ - Size: 4B -array([0]) -Dimensions without coordinates: x - """.strip() - assert actual == expected - - ds = xr.DataArray(np.array([0], dtype="int32"), dims="x") - actual = repr(ds) - expected = """ - Size: 4B -array([0]) -Dimensions without coordinates: x - """.strip() - assert actual == expected - - ds = xr.DataArray(np.array([0], dtype="int64"), dims="x") - actual = repr(ds) - expected = """ + expected = f""" Size: 8B -array([0], dtype=int64) +{repr(array)} Dimensions without coordinates: x """.strip() assert actual == expected