From 5ac8394526d18d54f79d299064cd73d07188b78d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20G=C3=B3rny?= Date: Sun, 16 Jun 2024 21:04:07 +0200 Subject: [PATCH 1/5] adjust repr tests to account for different platforms (#9127) (#9128) * adjust repr tests to account for different platforms (#9127) Adjust the expectations in repr tests to account for different object sizes and numpy type representations across platforms, particularly fixing the tests on 32-bit platforms. Firstly, this involves getting the object type size from NumPy and using it to adjust the expectations in DataArray and Dataset tests. The tests were already using int64 type consistently, so only the sizes used for Python objects needed to be adjusted. Secondly, this involves fixing `test_array_repr_dtypes_unix`. The test specifically focuses on testing a 32-bit, 64-bit and "native" data type, which affect both size and actual representation (NumPy skips the dtype attribute for the native data type). Get the expected size from NumPy for the native int type, and reuse `repr()` from NumPy for all array types. * Try combining Unix and Windows dtype repr tests --- xarray/tests/test_dataarray.py | 14 ++++--- xarray/tests/test_dataset.py | 17 +++++---- xarray/tests/test_formatting.py | 68 +++++++-------------------------- 3 files changed, 30 insertions(+), 69 deletions(-) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index ece2ddf8144..659c7c168a5 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -110,13 +110,14 @@ def test_repr(self) -> None: assert expected == repr(data_array) def test_repr_multiindex(self) -> None: + obj_size = np.dtype("O").itemsize expected = dedent( - """\ + f"""\ Size: 32B array([0, 1, 2, 3], dtype=uint64) Coordinates: - * x (x) object 32B MultiIndex - * level_1 (x) object 32B 'a' 'a' 'b' 'b' + * x (x) object {4 * obj_size}B MultiIndex + * level_1 (x) object {4 * obj_size}B 'a' 'a' 'b' 'b' * level_2 (x) int64 32B 1 2 1 2""" ) assert expected == repr(self.mda) @@ -129,15 +130,16 @@ def test_repr_multiindex_long(self) -> None: mda_long = DataArray( list(range(32)), coords={"x": mindex_long}, dims="x" ).astype(np.uint64) + obj_size = np.dtype("O").itemsize expected = dedent( - """\ + f"""\ Size: 256B array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], dtype=uint64) Coordinates: - * x (x) object 256B MultiIndex - * level_1 (x) object 256B 'a' 'a' 'a' 'a' 'a' 'a' ... 'd' 'd' 'd' 'd' 'd' 'd' + * x (x) object {32 * obj_size}B MultiIndex + * level_1 (x) object {32 * obj_size}B 'a' 'a' 'a' 'a' 'a' 'a' ... 'd' 'd' 'd' 'd' 'd' 'd' * level_2 (x) int64 256B 1 2 3 4 5 6 7 8 1 2 3 4 ... 
5 6 7 8 1 2 3 4 5 6 7 8""" ) assert expected == repr(mda_long) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 81b27da8d5f..f6829861776 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -335,13 +335,14 @@ def test_repr(self) -> None: def test_repr_multiindex(self) -> None: data = create_test_multiindex() + obj_size = np.dtype("O").itemsize expected = dedent( - """\ - Size: 96B + f"""\ + Size: {8 * obj_size + 32}B Dimensions: (x: 4) Coordinates: - * x (x) object 32B MultiIndex - * level_1 (x) object 32B 'a' 'a' 'b' 'b' + * x (x) object {4 * obj_size}B MultiIndex + * level_1 (x) object {4 * obj_size}B 'a' 'a' 'b' 'b' * level_2 (x) int64 32B 1 2 1 2 Data variables: *empty*""" @@ -357,12 +358,12 @@ def test_repr_multiindex(self) -> None: midx_coords = Coordinates.from_pandas_multiindex(midx, "x") data = Dataset({}, midx_coords) expected = dedent( - """\ - Size: 96B + f"""\ + Size: {8 * obj_size + 32}B Dimensions: (x: 4) Coordinates: - * x (x) object 32B MultiIndex - * a_quite_long_level_name (x) object 32B 'a' 'a' 'b' 'b' + * x (x) object {4 * obj_size}B MultiIndex + * a_quite_long_level_name (x) object {4 * obj_size}B 'a' 'a' 'b' 'b' * level_2 (x) int64 32B 1 2 1 2 Data variables: *empty*""" diff --git a/xarray/tests/test_formatting.py b/xarray/tests/test_formatting.py index 2c40ac88f98..b9d5f401a4a 100644 --- a/xarray/tests/test_formatting.py +++ b/xarray/tests/test_formatting.py @@ -12,8 +12,6 @@ from xarray.core.datatree import DataTree # TODO: Remove when can do xr.DataTree from xarray.tests import requires_cftime, requires_dask, requires_netCDF4 -ON_WINDOWS = sys.platform == "win32" - class TestFormatting: def test_get_indexer_at_least_n_items(self) -> None: @@ -1071,74 +1069,34 @@ def test_array_repr_dtypes(): """.strip() assert actual == expected - -@pytest.mark.skipif( - ON_WINDOWS, - reason="Default numpy's dtypes vary according to OS", -) -def test_array_repr_dtypes_unix() -> None: - # Signed integer dtypes - ds = xr.DataArray(np.array([0]), dims="x") + array = np.array([0]) + ds = xr.DataArray(array, dims="x") actual = repr(ds) - expected = """ - Size: 8B -array([0]) + expected = f""" + Size: {array.dtype.itemsize}B +{repr(array)} Dimensions without coordinates: x """.strip() assert actual == expected - ds = xr.DataArray(np.array([0], dtype="int32"), dims="x") + array = np.array([0], dtype="int32") + ds = xr.DataArray(array, dims="x") actual = repr(ds) - expected = """ + expected = f""" Size: 4B -array([0], dtype=int32) +{repr(array)} Dimensions without coordinates: x """.strip() assert actual == expected - ds = xr.DataArray(np.array([0], dtype="int64"), dims="x") + array = np.array([0], dtype="int64") + ds = xr.DataArray(array, dims="x") actual = repr(ds) - expected = """ - Size: 8B -array([0]) -Dimensions without coordinates: x - """.strip() - assert actual == expected - - -@pytest.mark.skipif( - not ON_WINDOWS, - reason="Default numpy's dtypes vary according to OS", -) -def test_array_repr_dtypes_on_windows() -> None: - - # Integer dtypes - - ds = xr.DataArray(np.array([0]), dims="x") - actual = repr(ds) - expected = """ - Size: 4B -array([0]) -Dimensions without coordinates: x - """.strip() - assert actual == expected - - ds = xr.DataArray(np.array([0], dtype="int32"), dims="x") - actual = repr(ds) - expected = """ - Size: 4B -array([0]) -Dimensions without coordinates: x - """.strip() - assert actual == expected - - ds = xr.DataArray(np.array([0], dtype="int64"), dims="x") - actual = repr(ds) - expected = """ + 
expected = f""" Size: 8B -array([0], dtype=int64) +{repr(array)} Dimensions without coordinates: x """.strip() assert actual == expected From 32e1f336b53e7aead4e7c8f85c3c8bac17d04b57 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 17 Jun 2024 08:24:57 -0600 Subject: [PATCH 2/5] Bump the actions group with 2 updates (#9130) Bumps the actions group with 2 updates: [codecov/codecov-action](https://github.com/codecov/codecov-action) and [pypa/gh-action-pypi-publish](https://github.com/pypa/gh-action-pypi-publish). Updates `codecov/codecov-action` from 4.4.1 to 4.5.0 - [Release notes](https://github.com/codecov/codecov-action/releases) - [Changelog](https://github.com/codecov/codecov-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/codecov/codecov-action/compare/v4.4.1...v4.5.0) Updates `pypa/gh-action-pypi-publish` from 1.8.14 to 1.9.0 - [Release notes](https://github.com/pypa/gh-action-pypi-publish/releases) - [Commits](https://github.com/pypa/gh-action-pypi-publish/compare/v1.8.14...v1.9.0) --- updated-dependencies: - dependency-name: codecov/codecov-action dependency-type: direct:production update-type: version-update:semver-minor dependency-group: actions - dependency-name: pypa/gh-action-pypi-publish dependency-type: direct:production update-type: version-update:semver-minor dependency-group: actions ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/ci-additional.yaml | 8 ++++---- .github/workflows/ci.yaml | 2 +- .github/workflows/pypi-release.yaml | 4 ++-- .github/workflows/upstream-dev-ci.yaml | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ci-additional.yaml b/.github/workflows/ci-additional.yaml index d8f2b99acac..1fc58c4f0bf 100644 --- a/.github/workflows/ci-additional.yaml +++ b/.github/workflows/ci-additional.yaml @@ -130,7 +130,7 @@ jobs: python -m mypy --install-types --non-interactive --cobertura-xml-report mypy_report xarray/ - name: Upload mypy coverage to Codecov - uses: codecov/codecov-action@v4.4.1 + uses: codecov/codecov-action@v4.5.0 with: file: mypy_report/cobertura.xml flags: mypy @@ -184,7 +184,7 @@ jobs: python -m mypy --install-types --non-interactive --cobertura-xml-report mypy_report xarray/ - name: Upload mypy coverage to Codecov - uses: codecov/codecov-action@v4.4.1 + uses: codecov/codecov-action@v4.5.0 with: file: mypy_report/cobertura.xml flags: mypy39 @@ -245,7 +245,7 @@ jobs: python -m pyright xarray/ - name: Upload pyright coverage to Codecov - uses: codecov/codecov-action@v4.4.1 + uses: codecov/codecov-action@v4.5.0 with: file: pyright_report/cobertura.xml flags: pyright @@ -304,7 +304,7 @@ jobs: python -m pyright xarray/ - name: Upload pyright coverage to Codecov - uses: codecov/codecov-action@v4.4.1 + uses: codecov/codecov-action@v4.5.0 with: file: pyright_report/cobertura.xml flags: pyright39 diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 2975ba8829f..6f9ba4a440d 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -159,7 +159,7 @@ jobs: path: pytest.xml - name: Upload code coverage to Codecov - uses: codecov/codecov-action@v4.4.1 + uses: codecov/codecov-action@v4.5.0 with: file: ./coverage.xml flags: unittests diff --git a/.github/workflows/pypi-release.yaml b/.github/workflows/pypi-release.yaml index e7aec6e8f39..d50fff220a5 100644 --- a/.github/workflows/pypi-release.yaml +++ 
b/.github/workflows/pypi-release.yaml @@ -88,7 +88,7 @@ jobs: path: dist - name: Publish package to TestPyPI if: github.event_name == 'push' - uses: pypa/gh-action-pypi-publish@v1.8.14 + uses: pypa/gh-action-pypi-publish@v1.9.0 with: repository_url: https://test.pypi.org/legacy/ verbose: true @@ -111,6 +111,6 @@ jobs: name: releases path: dist - name: Publish package to PyPI - uses: pypa/gh-action-pypi-publish@v1.8.14 + uses: pypa/gh-action-pypi-publish@v1.9.0 with: verbose: true diff --git a/.github/workflows/upstream-dev-ci.yaml b/.github/workflows/upstream-dev-ci.yaml index e2fbabf39c4..4bc1b693a00 100644 --- a/.github/workflows/upstream-dev-ci.yaml +++ b/.github/workflows/upstream-dev-ci.yaml @@ -146,7 +146,7 @@ jobs: run: | python -m mypy --install-types --non-interactive --cobertura-xml-report mypy_report - name: Upload mypy coverage to Codecov - uses: codecov/codecov-action@v4.4.1 + uses: codecov/codecov-action@v4.5.0 with: file: mypy_report/cobertura.xml flags: mypy From be8e17e4dc5da67d7cbb09db87d80c1bbc71a64e Mon Sep 17 00:00:00 2001 From: Martin Raspaud Date: Mon, 17 Jun 2024 16:25:18 +0200 Subject: [PATCH 3/5] Support duplicate dimensions in `.chunk` (#9099) * Allow duplicate dimensions in chunking * Address review comments * fix whats-new * add comment * Update xarray/tests/test_dask.py --------- Co-authored-by: Deepak Cherian Co-authored-by: Deepak Cherian --- doc/whats-new.rst | 4 ++-- xarray/namedarray/core.py | 7 ++++++- xarray/tests/test_dask.py | 7 +++++++ 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 7ec6e08ef96..e7a48458ae2 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -22,7 +22,8 @@ v2024.06.1 (unreleased) New Features ~~~~~~~~~~~~ - +- Allow chunking for arrays with duplicated dimension names (:issue:`8759`, :pull:`9099`). + By `Martin Raspaud `_. Breaking changes ~~~~~~~~~~~~~~~~ @@ -73,7 +74,6 @@ Bug fixes support arbitrary kwargs such as ``order`` for polynomial interpolation (:issue:`8762`). By `Nicolas Karasiak `_. - Documentation ~~~~~~~~~~~~~ - Add link to CF Conventions on packed data and sentence on type determination in the I/O user guide (:issue:`9041`, :pull:`9045`). 
diff --git a/xarray/namedarray/core.py b/xarray/namedarray/core.py index 960ab9d4d1d..fe47bf50533 100644 --- a/xarray/namedarray/core.py +++ b/xarray/namedarray/core.py @@ -812,7 +812,12 @@ def chunk( chunks = either_dict_or_kwargs(chunks, chunks_kwargs, "chunk") if is_dict_like(chunks): - chunks = {self.get_axis_num(dim): chunk for dim, chunk in chunks.items()} + # This method of iteration allows for duplicated dimension names, GH8579 + chunks = { + dim_number: chunks[dim] + for dim_number, dim in enumerate(self.dims) + if dim in chunks + } chunkmanager = guess_chunkmanager(chunked_array_type) diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index 3ac638c3c5f..20491eca91a 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -638,6 +638,13 @@ def counting_get(*args, **kwargs): assert count[0] == 1 + def test_duplicate_dims(self): + data = np.random.normal(size=(4, 4)) + arr = DataArray(data, dims=("x", "x")) + chunked_array = arr.chunk({"x": 2}) + assert chunked_array.chunks == ((2, 2), (2, 2)) + assert chunked_array.chunksizes == {"x": (2, 2)} + def test_stack(self): data = da.random.normal(size=(2, 3, 4), chunks=(1, 3, 4)) arr = DataArray(data, dims=("w", "x", "y")) From b1f3fea467f9387ed35c221205a70524f4caa18b Mon Sep 17 00:00:00 2001 From: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> Date: Mon, 17 Jun 2024 12:16:00 -0700 Subject: [PATCH 4/5] Update zendoo badge link (#9133) * Do we want the zendoo badge? It currently isn't a badge and the link is broken? * Update README.md Co-authored-by: Deepak Cherian --------- Co-authored-by: Deepak Cherian --- README.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 432d535d1b1..dcf71217e2c 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ [![Available on pypi](https://img.shields.io/pypi/v/xarray.svg)](https://pypi.python.org/pypi/xarray/) [![Formatted with black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/python/black) [![Checked with mypy](http://www.mypy-lang.org/static/mypy_badge.svg)](http://mypy-lang.org/) -[![Mirror on zendoo](https://zenodo.org/badge/DOI/10.5281/zenodo.598201.svg)](https://doi.org/10.5281/zenodo.598201) +[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.11183201.svg)](https://doi.org/10.5281/zenodo.11183201) [![Examples on 
binder](https://img.shields.io/badge/launch-binder-579ACA.svg?logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAFkAAABZCAMAAABi1XidAAAB8lBMVEX///9XmsrmZYH1olJXmsr1olJXmsrmZYH1olJXmsr1olJXmsrmZYH1olL1olJXmsr1olJXmsrmZYH1olL1olJXmsrmZYH1olJXmsr1olL1olJXmsrmZYH1olL1olJXmsrmZYH1olL1olL0nFf1olJXmsrmZYH1olJXmsq8dZb1olJXmsrmZYH1olJXmspXmspXmsr1olL1olJXmsrmZYH1olJXmsr1olL1olJXmsrmZYH1olL1olLeaIVXmsrmZYH1olL1olL1olJXmsrmZYH1olLna31Xmsr1olJXmsr1olJXmsrmZYH1olLqoVr1olJXmsr1olJXmsrmZYH1olL1olKkfaPobXvviGabgadXmsqThKuofKHmZ4Dobnr1olJXmsr1olJXmspXmsr1olJXmsrfZ4TuhWn1olL1olJXmsqBi7X1olJXmspZmslbmMhbmsdemsVfl8ZgmsNim8Jpk8F0m7R4m7F5nLB6jbh7jbiDirOEibOGnKaMhq+PnaCVg6qWg6qegKaff6WhnpKofKGtnomxeZy3noG6dZi+n3vCcpPDcpPGn3bLb4/Mb47UbIrVa4rYoGjdaIbeaIXhoWHmZYHobXvpcHjqdHXreHLroVrsfG/uhGnuh2bwj2Hxk17yl1vzmljzm1j0nlX1olL3AJXWAAAAbXRSTlMAEBAQHx8gICAuLjAwMDw9PUBAQEpQUFBXV1hgYGBkcHBwcXl8gICAgoiIkJCQlJicnJ2goKCmqK+wsLC4usDAwMjP0NDQ1NbW3Nzg4ODi5+3v8PDw8/T09PX29vb39/f5+fr7+/z8/Pz9/v7+zczCxgAABC5JREFUeAHN1ul3k0UUBvCb1CTVpmpaitAGSLSpSuKCLWpbTKNJFGlcSMAFF63iUmRccNG6gLbuxkXU66JAUef/9LSpmXnyLr3T5AO/rzl5zj137p136BISy44fKJXuGN/d19PUfYeO67Znqtf2KH33Id1psXoFdW30sPZ1sMvs2D060AHqws4FHeJojLZqnw53cmfvg+XR8mC0OEjuxrXEkX5ydeVJLVIlV0e10PXk5k7dYeHu7Cj1j+49uKg7uLU61tGLw1lq27ugQYlclHC4bgv7VQ+TAyj5Zc/UjsPvs1sd5cWryWObtvWT2EPa4rtnWW3JkpjggEpbOsPr7F7EyNewtpBIslA7p43HCsnwooXTEc3UmPmCNn5lrqTJxy6nRmcavGZVt/3Da2pD5NHvsOHJCrdc1G2r3DITpU7yic7w/7Rxnjc0kt5GC4djiv2Sz3Fb2iEZg41/ddsFDoyuYrIkmFehz0HR2thPgQqMyQYb2OtB0WxsZ3BeG3+wpRb1vzl2UYBog8FfGhttFKjtAclnZYrRo9ryG9uG/FZQU4AEg8ZE9LjGMzTmqKXPLnlWVnIlQQTvxJf8ip7VgjZjyVPrjw1te5otM7RmP7xm+sK2Gv9I8Gi++BRbEkR9EBw8zRUcKxwp73xkaLiqQb+kGduJTNHG72zcW9LoJgqQxpP3/Tj//c3yB0tqzaml05/+orHLksVO+95kX7/7qgJvnjlrfr2Ggsyx0eoy9uPzN5SPd86aXggOsEKW2Prz7du3VID3/tzs/sSRs2w7ovVHKtjrX2pd7ZMlTxAYfBAL9jiDwfLkq55Tm7ifhMlTGPyCAs7RFRhn47JnlcB9RM5T97ASuZXIcVNuUDIndpDbdsfrqsOppeXl5Y+XVKdjFCTh+zGaVuj0d9zy05PPK3QzBamxdwtTCrzyg/2Rvf2EstUjordGwa/kx9mSJLr8mLLtCW8HHGJc2R5hS219IiF6PnTusOqcMl57gm0Z8kanKMAQg0qSyuZfn7zItsbGyO9QlnxY0eCuD1XL2ys/MsrQhltE7Ug0uFOzufJFE2PxBo/YAx8XPPdDwWN0MrDRYIZF0mSMKCNHgaIVFoBbNoLJ7tEQDKxGF0kcLQimojCZopv0OkNOyWCCg9XMVAi7ARJzQdM2QUh0gmBozjc3Skg6dSBRqDGYSUOu66Zg+I2fNZs/M3/f/Grl/XnyF1Gw3VKCez0PN5IUfFLqvgUN4C0qNqYs5YhPL+aVZYDE4IpUk57oSFnJm4FyCqqOE0jhY2SMyLFoo56zyo6becOS5UVDdj7Vih0zp+tcMhwRpBeLyqtIjlJKAIZSbI8SGSF3k0pA3mR5tHuwPFoa7N7reoq2bqCsAk1HqCu5uvI1n6JuRXI+S1Mco54YmYTwcn6Aeic+kssXi8XpXC4V3t7/ADuTNKaQJdScAAAAAElFTkSuQmCC)](https://mybinder.org/v2/gh/pydata/xarray/main?urlpath=lab/tree/doc/examples/weather-data.ipynb) [![Twitter](https://img.shields.io/twitter/follow/xarray_dev?style=social)](https://twitter.com/xarray_dev) @@ -46,15 +46,15 @@ provide a powerful and concise interface. For example: - Apply operations over dimensions by name: `x.sum('time')`. - Select values by label instead of integer location: - `x.loc['2014-01-01']` or `x.sel(time='2014-01-01')`. + `x.loc['2014-01-01']` or `x.sel(time='2014-01-01')`. - Mathematical operations (e.g., `x - y`) vectorize across multiple - dimensions (array broadcasting) based on dimension names, not shape. + dimensions (array broadcasting) based on dimension names, not shape. - Flexible split-apply-combine operations with groupby: - `x.groupby('time.dayofyear').mean()`. + `x.groupby('time.dayofyear').mean()`. - Database like alignment based on coordinate labels that smoothly - handles missing values: `x, y = xr.align(x, y, join='outer')`. + handles missing values: `x, y = xr.align(x, y, join='outer')`. 
- Keep track of arbitrary metadata in the form of a Python dictionary: - `x.attrs`. + `x.attrs`. ## Documentation @@ -73,12 +73,12 @@ page](https://docs.xarray.dev/en/stable/contributing.html). ## Get in touch - Ask usage questions ("How do I?") on - [GitHub Discussions](https://github.com/pydata/xarray/discussions). + [GitHub Discussions](https://github.com/pydata/xarray/discussions). - Report bugs, suggest features or view the source code [on - GitHub](https://github.com/pydata/xarray). + GitHub](https://github.com/pydata/xarray). - For less well defined questions or ideas, or to announce other - projects of interest to xarray users, use the [mailing - list](https://groups.google.com/forum/#!forum/xarray). + projects of interest to xarray users, use the [mailing + list](https://groups.google.com/forum/#!forum/xarray). ## NumFOCUS @@ -114,7 +114,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, From 3fd162e42bb309cfab03c2c18b037d1ad3cd3193 Mon Sep 17 00:00:00 2001 From: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> Date: Tue, 18 Jun 2024 21:02:56 -0700 Subject: [PATCH 5/5] Split out distributed writes in zarr docs (#9132) * Split out distributed writes in zarr docs This was under 'Modifying existing Zarr stores', which I understand from a dev perspective but isn't what folks will be scanning for * And some reorg * clarify dask is generally sufficient * . --- doc/user-guide/io.rst | 175 ++++++++++++++++++++++-------------------- 1 file changed, 91 insertions(+), 84 deletions(-) diff --git a/doc/user-guide/io.rst b/doc/user-guide/io.rst index fd6b7708e48..da414bc383e 100644 --- a/doc/user-guide/io.rst +++ b/doc/user-guide/io.rst @@ -741,6 +741,65 @@ instance and pass this, as follows: .. _Google Cloud Storage: https://cloud.google.com/storage/ .. _gcsfs: https://github.com/fsspec/gcsfs +.. _io.zarr.distributed_writes: + +Distributed writes +~~~~~~~~~~~~~~~~~~ + +Xarray will natively use dask to write in parallel to a zarr store, which should +satisfy most moderately sized datasets. For more flexible parallelization, we +can use ``region`` to write to limited regions of arrays in an existing Zarr +store. + +To scale this up to writing large datasets, first create an initial Zarr store +without writing all of its array data. This can be done by first creating a +``Dataset`` with dummy values stored in :ref:`dask `, and then calling +``to_zarr`` with ``compute=False`` to write only metadata (including ``attrs``) +to Zarr: + +.. ipython:: python + :suppress: + + ! rm -rf path/to/directory.zarr + +.. ipython:: python + + import dask.array + + # The values of this dask array are entirely irrelevant; only the dtype, + # shape and chunks are used + dummies = dask.array.zeros(30, chunks=10) + ds = xr.Dataset({"foo": ("x", dummies)}, coords={"x": np.arange(30)}) + path = "path/to/directory.zarr" + # Now we write the metadata without computing any array values + ds.to_zarr(path, compute=False) + +Now, a Zarr store with the correct variable shapes and attributes exists that +can be filled out by subsequent calls to ``to_zarr``. 
+Setting ``region="auto"`` will open the existing store and determine the +correct alignment of the new data with the existing dimensions, or as an +explicit mapping from dimension names to Python ``slice`` objects indicating +where the data should be written (in index space, not label space), e.g., + +.. ipython:: python + + # For convenience, we'll slice a single dataset, but in the real use-case + # we would create them separately possibly even from separate processes. + ds = xr.Dataset({"foo": ("x", np.arange(30))}, coords={"x": np.arange(30)}) + # Any of the following region specifications are valid + ds.isel(x=slice(0, 10)).to_zarr(path, region="auto") + ds.isel(x=slice(10, 20)).to_zarr(path, region={"x": "auto"}) + ds.isel(x=slice(20, 30)).to_zarr(path, region={"x": slice(20, 30)}) + +Concurrent writes with ``region`` are safe as long as they modify distinct +chunks in the underlying Zarr arrays (or use an appropriate ``lock``). + +As a safety check to make it harder to inadvertently override existing values, +if you set ``region`` then *all* variables included in a Dataset must have +dimensions included in ``region``. Other variables (typically coordinates) +need to be explicitly dropped and/or written in a separate calls to ``to_zarr`` +with ``mode='a'``. + Zarr Compressors and Filters ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -767,37 +826,6 @@ For example: Not all native zarr compression and filtering options have been tested with xarray. -.. _io.zarr.consolidated_metadata: - -Consolidated Metadata -~~~~~~~~~~~~~~~~~~~~~ - -Xarray needs to read all of the zarr metadata when it opens a dataset. -In some storage mediums, such as with cloud object storage (e.g. `Amazon S3`_), -this can introduce significant overhead, because two separate HTTP calls to the -object store must be made for each variable in the dataset. -By default Xarray uses a feature called -*consolidated metadata*, storing all metadata for the entire dataset with a -single key (by default called ``.zmetadata``). This typically drastically speeds -up opening the store. (For more information on this feature, consult the -`zarr docs on consolidating metadata `_.) - -By default, xarray writes consolidated metadata and attempts to read stores -with consolidated metadata, falling back to use non-consolidated metadata for -reads. Because this fall-back option is so much slower, xarray issues a -``RuntimeWarning`` with guidance when reading with consolidated metadata fails: - - Failed to open Zarr store with consolidated metadata, falling back to try - reading non-consolidated metadata. This is typically much slower for - opening a dataset. To silence this warning, consider: - - 1. Consolidating metadata in this existing store with - :py:func:`zarr.consolidate_metadata`. - 2. Explicitly setting ``consolidated=False``, to avoid trying to read - consolidate metadata. - 3. Explicitly setting ``consolidated=True``, to raise an error in this case - instead of falling back to try reading non-consolidated metadata. - .. _io.zarr.appending: Modifying existing Zarr stores @@ -856,59 +884,6 @@ order, e.g., for time-stepping a simulation: ) ds2.to_zarr("path/to/directory.zarr", append_dim="t") -Finally, you can use ``region`` to write to limited regions of existing arrays -in an existing Zarr store. This is a good option for writing data in parallel -from independent processes. - -To scale this up to writing large datasets, the first step is creating an -initial Zarr store without writing all of its array data. 
This can be done by -first creating a ``Dataset`` with dummy values stored in :ref:`dask `, -and then calling ``to_zarr`` with ``compute=False`` to write only metadata -(including ``attrs``) to Zarr: - -.. ipython:: python - :suppress: - - ! rm -rf path/to/directory.zarr - -.. ipython:: python - - import dask.array - - # The values of this dask array are entirely irrelevant; only the dtype, - # shape and chunks are used - dummies = dask.array.zeros(30, chunks=10) - ds = xr.Dataset({"foo": ("x", dummies)}, coords={"x": np.arange(30)}) - path = "path/to/directory.zarr" - # Now we write the metadata without computing any array values - ds.to_zarr(path, compute=False) - -Now, a Zarr store with the correct variable shapes and attributes exists that -can be filled out by subsequent calls to ``to_zarr``. -Setting ``region="auto"`` will open the existing store and determine the -correct alignment of the new data with the existing coordinates, or as an -explicit mapping from dimension names to Python ``slice`` objects indicating -where the data should be written (in index space, not label space), e.g., - -.. ipython:: python - - # For convenience, we'll slice a single dataset, but in the real use-case - # we would create them separately possibly even from separate processes. - ds = xr.Dataset({"foo": ("x", np.arange(30))}, coords={"x": np.arange(30)}) - # Any of the following region specifications are valid - ds.isel(x=slice(0, 10)).to_zarr(path, region="auto") - ds.isel(x=slice(10, 20)).to_zarr(path, region={"x": "auto"}) - ds.isel(x=slice(20, 30)).to_zarr(path, region={"x": slice(20, 30)}) - -Concurrent writes with ``region`` are safe as long as they modify distinct -chunks in the underlying Zarr arrays (or use an appropriate ``lock``). - -As a safety check to make it harder to inadvertently override existing values, -if you set ``region`` then *all* variables included in a Dataset must have -dimensions included in ``region``. Other variables (typically coordinates) -need to be explicitly dropped and/or written in a separate calls to ``to_zarr`` -with ``mode='a'``. - .. _io.zarr.writing_chunks: Specifying chunks in a zarr store @@ -978,6 +953,38 @@ length of each dimension by using the shorthand chunk size ``-1``: The number of chunks on Tair matches our dask chunks, while there is now only a single chunk in the directory stores of each coordinate. +.. _io.zarr.consolidated_metadata: + +Consolidated Metadata +~~~~~~~~~~~~~~~~~~~~~ + +Xarray needs to read all of the zarr metadata when it opens a dataset. +In some storage mediums, such as with cloud object storage (e.g. `Amazon S3`_), +this can introduce significant overhead, because two separate HTTP calls to the +object store must be made for each variable in the dataset. +By default Xarray uses a feature called +*consolidated metadata*, storing all metadata for the entire dataset with a +single key (by default called ``.zmetadata``). This typically drastically speeds +up opening the store. (For more information on this feature, consult the +`zarr docs on consolidating metadata `_.) + +By default, xarray writes consolidated metadata and attempts to read stores +with consolidated metadata, falling back to use non-consolidated metadata for +reads. Because this fall-back option is so much slower, xarray issues a +``RuntimeWarning`` with guidance when reading with consolidated metadata fails: + + Failed to open Zarr store with consolidated metadata, falling back to try + reading non-consolidated metadata. 
This is typically much slower for + opening a dataset. To silence this warning, consider: + + 1. Consolidating metadata in this existing store with + :py:func:`zarr.consolidate_metadata`. + 2. Explicitly setting ``consolidated=False``, to avoid trying to read + consolidate metadata. + 3. Explicitly setting ``consolidated=True``, to raise an error in this case + instead of falling back to try reading non-consolidated metadata. + + .. _io.iris: Iris
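
A minimal usage sketch of the two behaviors touched above — the duplicate-dimension chunking from PATCH 3/5 and the region-based Zarr writes documented in PATCH 5/5. It assumes xarray with dask and zarr installed; the ``example.zarr`` path is purely illustrative::

    import numpy as np
    import xarray as xr

    # Duplicate dimension names can now be chunked (xarray/namedarray/core.py change above).
    arr = xr.DataArray(np.zeros((4, 4)), dims=("x", "x")).chunk({"x": 2})
    print(arr.chunksizes)  # expected: {'x': (2, 2)}

    # Region-based write: initialize the store with metadata only, then fill one slice.
    ds = xr.Dataset({"foo": ("t", np.arange(30))}, coords={"t": np.arange(30)})
    ds.chunk({"t": 10}).to_zarr("example.zarr", compute=False)        # metadata (and coords) only
    ds.isel(t=slice(0, 10)).to_zarr("example.zarr", region="auto")    # fill the first chunk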