From c2b942971e311d6bf589a3f5672ba89cefcf678b Mon Sep 17 00:00:00 2001 From: Mark Harfouche Date: Sun, 5 May 2024 14:05:08 -0400 Subject: [PATCH 01/13] Fix syntax error in test related to cupy (#9000) I suspect the CIs don't have cupy which meant that this line didn't get hit. Recreation: ``` mamba create --name xr_py10 python=3.10 --channel conda-forge --override-channels mamba activate xr_py10 pip install -e . -vv pip install pytest mamba install cupy ``` ``` pytest xarray/tests/test_array_api.py -x ``` Fails on my machine. Happy to provide more info --- xarray/core/duck_array_ops.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index d95dfa566cc..23be37618b0 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -233,9 +233,10 @@ def as_shared_dtype(scalars_or_arrays, xp=np): raise ValueError( f"Cannot cast arrays to shared type, found array types {[x.dtype for x in scalars_or_arrays]}" ) - elif array_type_cupy := array_type("cupy") and any( # noqa: F841 - isinstance(x, array_type_cupy) for x in scalars_or_arrays # noqa: F821 - ): + + # Avoid calling array_type("cupy") repeatidely in the any check + array_type_cupy = array_type("cupy") + if any(isinstance(x, array_type_cupy) for x in scalars_or_arrays): import cupy as cp arrays = [asarray(x, xp=cp) for x in scalars_or_arrays] From 8d728bf253f63c557ab8993d47c1cd0c3113277d Mon Sep 17 00:00:00 2001 From: ignamv Date: Sun, 5 May 2024 20:06:57 +0200 Subject: [PATCH 02/13] Add argument check_dims to assert_allclose to allow transposed inputs (#5733) (#8991) * Add argument check_dims to assert_allclose to allow transposed inputs * Update whats-new.rst * Add `check_dims` argument to assert_equal and assert_identical + tests * Assert that dimensions match before transposing or comparing values * Add docstring for check_dims to assert_equal and assert_identical * Update doc/whats-new.rst Co-authored-by: Tom Nicholas * Undo fat finger Co-authored-by: Tom Nicholas * Add attribution to whats-new.rst * Replace check_dims with bool argument check_dim_order, rename align_dims to maybe_transpose_dims * Remove left-over half-made test * Remove check_dim_order argument from assert_identical * assert_allclose/equal: emit full diff if dimensions don't match * Rename check_dim_order test, test Dataset with different dim orders * Update whats-new.rst * Hide maybe_transpose_dims from Pytest traceback Co-authored-by: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> * Ignore mypy error due to missing functools.partial.__name__ --------- Co-authored-by: Tom Nicholas Co-authored-by: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> --- doc/whats-new.rst | 2 ++ xarray/testing/assertions.py | 29 +++++++++++++++++++++++++---- xarray/tests/test_assertions.py | 19 +++++++++++++++++++ 3 files changed, 46 insertions(+), 4 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 0f79b648187..9a4601a776f 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -29,6 +29,8 @@ New Features for example, will retain the object. However, one cannot do operations that are not possible on the `ExtensionArray` then, such as broadcasting. By `Ilan Gold `_. +- :py:func:`testing.assert_allclose`/:py:func:`testing.assert_equal` now accept a new argument `check_dims="transpose"`, controlling whether a transposed array is considered equal. (:issue:`5733`, :pull:`8991`) + By `Ignacio Martinez Vazquez `_. 
- Added the option to avoid automatically creating 1D pandas indexes in :py:meth:`Dataset.expand_dims()`, by passing the new kwarg `create_index=False`. (:pull:`8960`) By `Tom Nicholas `_. diff --git a/xarray/testing/assertions.py b/xarray/testing/assertions.py index 018874c169e..69885868f83 100644 --- a/xarray/testing/assertions.py +++ b/xarray/testing/assertions.py @@ -95,6 +95,18 @@ def assert_isomorphic(a: DataTree, b: DataTree, from_root: bool = False): raise TypeError(f"{type(a)} not of type DataTree") +def maybe_transpose_dims(a, b, check_dim_order: bool): + """Helper for assert_equal/allclose/identical""" + __tracebackhide__ = True + if not isinstance(a, (Variable, DataArray, Dataset)): + return b + if not check_dim_order and set(a.dims) == set(b.dims): + # Ensure transpose won't fail if a dimension is missing + # If this is the case, the difference will be caught by the caller + return b.transpose(*a.dims) + return b + + @overload def assert_equal(a, b): ... @@ -104,7 +116,7 @@ def assert_equal(a: DataTree, b: DataTree, from_root: bool = True): ... @ensure_warnings -def assert_equal(a, b, from_root=True): +def assert_equal(a, b, from_root=True, check_dim_order: bool = True): """Like :py:func:`numpy.testing.assert_array_equal`, but for xarray objects. @@ -127,6 +139,8 @@ def assert_equal(a, b, from_root=True): Only used when comparing DataTree objects. Indicates whether or not to first traverse to the root of the trees before checking for isomorphism. If a & b have no parents then this has no effect. + check_dim_order : bool, optional, default is True + Whether dimensions must be in the same order. See Also -------- @@ -137,6 +151,7 @@ def assert_equal(a, b, from_root=True): assert ( type(a) == type(b) or isinstance(a, Coordinates) and isinstance(b, Coordinates) ) + b = maybe_transpose_dims(a, b, check_dim_order) if isinstance(a, (Variable, DataArray)): assert a.equals(b), formatting.diff_array_repr(a, b, "equals") elif isinstance(a, Dataset): @@ -182,6 +197,8 @@ def assert_identical(a, b, from_root=True): Only used when comparing DataTree objects. Indicates whether or not to first traverse to the root of the trees before checking for isomorphism. If a & b have no parents then this has no effect. + check_dim_order : bool, optional, default is True + Whether dimensions must be in the same order. See Also -------- @@ -213,7 +230,9 @@ def assert_identical(a, b, from_root=True): @ensure_warnings -def assert_allclose(a, b, rtol=1e-05, atol=1e-08, decode_bytes=True): +def assert_allclose( + a, b, rtol=1e-05, atol=1e-08, decode_bytes=True, check_dim_order: bool = True +): """Like :py:func:`numpy.testing.assert_allclose`, but for xarray objects. Raises an AssertionError if two objects are not equal up to desired @@ -233,6 +252,8 @@ def assert_allclose(a, b, rtol=1e-05, atol=1e-08, decode_bytes=True): Whether byte dtypes should be decoded to strings as UTF-8 or not. This is useful for testing serialization methods on Python 3 that return saved strings as bytes. + check_dim_order : bool, optional, default is True + Whether dimensions must be in the same order. 
See Also -------- @@ -240,16 +261,16 @@ def assert_allclose(a, b, rtol=1e-05, atol=1e-08, decode_bytes=True): """ __tracebackhide__ = True assert type(a) == type(b) + b = maybe_transpose_dims(a, b, check_dim_order) equiv = functools.partial( _data_allclose_or_equiv, rtol=rtol, atol=atol, decode_bytes=decode_bytes ) - equiv.__name__ = "allclose" + equiv.__name__ = "allclose" # type: ignore[attr-defined] def compat_variable(a, b): a = getattr(a, "variable", a) b = getattr(b, "variable", b) - return a.dims == b.dims and (a._data is b._data or equiv(a.data, b.data)) if isinstance(a, Variable): diff --git a/xarray/tests/test_assertions.py b/xarray/tests/test_assertions.py index f7e49a0f3de..aa0ea46f7db 100644 --- a/xarray/tests/test_assertions.py +++ b/xarray/tests/test_assertions.py @@ -57,6 +57,25 @@ def test_allclose_regression() -> None: def test_assert_allclose(obj1, obj2) -> None: with pytest.raises(AssertionError): xr.testing.assert_allclose(obj1, obj2) + with pytest.raises(AssertionError): + xr.testing.assert_allclose(obj1, obj2, check_dim_order=False) + + +@pytest.mark.parametrize("func", ["assert_equal", "assert_allclose"]) +def test_assert_allclose_equal_transpose(func) -> None: + """Transposed DataArray raises assertion unless check_dim_order=False.""" + obj1 = xr.DataArray([[0, 1, 2], [2, 3, 4]], dims=["a", "b"]) + obj2 = xr.DataArray([[0, 2], [1, 3], [2, 4]], dims=["b", "a"]) + with pytest.raises(AssertionError): + getattr(xr.testing, func)(obj1, obj2) + getattr(xr.testing, func)(obj1, obj2, check_dim_order=False) + ds1 = obj1.to_dataset(name="varname") + ds1["var2"] = obj1 + ds2 = obj1.to_dataset(name="varname") + ds2["var2"] = obj1.transpose() + with pytest.raises(AssertionError): + getattr(xr.testing, func)(ds1, ds2) + getattr(xr.testing, func)(ds1, ds2, check_dim_order=False) @pytest.mark.filterwarnings("error") From 50f87269e4c7b51c4afc6f5030760d7f8e584a09 Mon Sep 17 00:00:00 2001 From: Mark Harfouche Date: Sun, 5 May 2024 18:57:36 -0400 Subject: [PATCH 03/13] Simplify fast path (#9001) --- xarray/core/variable.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 2bcee5590f8..f0685882595 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -269,9 +269,8 @@ def as_compatible_data( Finally, wrap it up with an adapter if necessary. 
""" - if fastpath and getattr(data, "ndim", 0) > 0: - # can't use fastpath (yet) for scalars - return cast("T_DuckArray", _maybe_wrap_data(data)) + if fastpath and getattr(data, "ndim", None) is not None: + return cast("T_DuckArray", data) from xarray.core.dataarray import DataArray From faa634579f3240e6fa36694e89cec3d674f784d3 Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Mon, 6 May 2024 07:44:07 +0200 Subject: [PATCH 04/13] Speed up localize (#8536) Co-authored-by: Mathias Hauser --- xarray/core/missing.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/xarray/core/missing.py b/xarray/core/missing.py index 8aa2ff2f042..45abc70c0d3 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -553,11 +553,11 @@ def _localize(var, indexes_coords): """ indexes = {} for dim, [x, new_x] in indexes_coords.items(): - minval = np.nanmin(new_x.values) - maxval = np.nanmax(new_x.values) + new_x_loaded = new_x.values + minval = np.nanmin(new_x_loaded) + maxval = np.nanmax(new_x_loaded) index = x.to_index() - imin = index.get_indexer([minval], method="nearest").item() - imax = index.get_indexer([maxval], method="nearest").item() + imin, imax = index.get_indexer([minval, maxval], method="nearest") indexes[dim] = slice(max(imin - 2, 0), imax + 2) indexes_coords[dim] = (x[indexes[dim]], new_x) return var.isel(**indexes), indexes_coords From c4031cd67c6e56f398414c27aab306656d8af517 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 6 May 2024 10:02:37 -0700 Subject: [PATCH 05/13] Bump codecov/codecov-action from 4.3.0 to 4.3.1 in the actions group (#9004) Bumps the actions group with 1 update: [codecov/codecov-action](https://github.com/codecov/codecov-action). Updates `codecov/codecov-action` from 4.3.0 to 4.3.1 - [Release notes](https://github.com/codecov/codecov-action/releases) - [Changelog](https://github.com/codecov/codecov-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/codecov/codecov-action/compare/v4.3.0...v4.3.1) --- updated-dependencies: - dependency-name: codecov/codecov-action dependency-type: direct:production update-type: version-update:semver-patch dependency-group: actions ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/ci-additional.yaml | 8 ++++---- .github/workflows/ci.yaml | 2 +- .github/workflows/upstream-dev-ci.yaml | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci-additional.yaml b/.github/workflows/ci-additional.yaml index 7b248b14006..f904f259c51 100644 --- a/.github/workflows/ci-additional.yaml +++ b/.github/workflows/ci-additional.yaml @@ -127,7 +127,7 @@ jobs: python -m mypy --install-types --non-interactive --cobertura-xml-report mypy_report xarray/ - name: Upload mypy coverage to Codecov - uses: codecov/codecov-action@v4.3.0 + uses: codecov/codecov-action@v4.3.1 with: file: mypy_report/cobertura.xml flags: mypy @@ -181,7 +181,7 @@ jobs: python -m mypy --install-types --non-interactive --cobertura-xml-report mypy_report xarray/ - name: Upload mypy coverage to Codecov - uses: codecov/codecov-action@v4.3.0 + uses: codecov/codecov-action@v4.3.1 with: file: mypy_report/cobertura.xml flags: mypy39 @@ -242,7 +242,7 @@ jobs: python -m pyright xarray/ - name: Upload pyright coverage to Codecov - uses: codecov/codecov-action@v4.3.0 + uses: codecov/codecov-action@v4.3.1 with: file: pyright_report/cobertura.xml flags: pyright @@ -301,7 +301,7 @@ jobs: python -m pyright xarray/ - name: Upload pyright coverage to Codecov - uses: codecov/codecov-action@v4.3.0 + uses: codecov/codecov-action@v4.3.1 with: file: pyright_report/cobertura.xml flags: pyright39 diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index a577312a7cc..349aa626142 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -156,7 +156,7 @@ jobs: path: pytest.xml - name: Upload code coverage to Codecov - uses: codecov/codecov-action@v4.3.0 + uses: codecov/codecov-action@v4.3.1 with: file: ./coverage.xml flags: unittests diff --git a/.github/workflows/upstream-dev-ci.yaml b/.github/workflows/upstream-dev-ci.yaml index 4f3d199dd2d..a216dfd7428 100644 --- a/.github/workflows/upstream-dev-ci.yaml +++ b/.github/workflows/upstream-dev-ci.yaml @@ -143,7 +143,7 @@ jobs: run: | python -m mypy --install-types --non-interactive --cobertura-xml-report mypy_report - name: Upload mypy coverage to Codecov - uses: codecov/codecov-action@v4.3.0 + uses: codecov/codecov-action@v4.3.1 with: file: mypy_report/cobertura.xml flags: mypy From e0f2ceede29087854edfa498cfd23d36bcf4178a Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Mon, 6 May 2024 13:37:37 -0400 Subject: [PATCH 06/13] Port negative frequency fix for `pandas.date_range` to `cftime_range` (#8999) --- doc/whats-new.rst | 6 ++++++ xarray/coding/cftime_offsets.py | 7 ++++++- xarray/tests/__init__.py | 1 + xarray/tests/test_cftime_offsets.py | 17 ++++++++++++++--- 4 files changed, 27 insertions(+), 4 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 9a4601a776f..a846c1b8a01 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -60,6 +60,12 @@ Breaking changes Bug fixes ~~~~~~~~~ +- Following `an upstream bug fix + `_ to + :py:func:`pandas.date_range`, date ranges produced by + :py:func:`xarray.cftime_range` with negative frequencies will now fall fully + within the bounds of the provided start and end dates (:pull:`8999`). By + `Spencer Clark `_. 
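A rough illustration of the behaviour change described in the entry above, based on the test case updated in this patch (requires ``cftime``; treat the exact dates as indicative only, since they depend on the installed ``pandas``/``cftime`` versions):

```python
import xarray as xr

# With a negative frequency the generated dates now stay inside the requested
# bounds: the first date is rolled *back* from ``start`` (e.g. 2000-12-31)
# instead of being rolled forward past it.
dates = xr.cftime_range(start="2001", end="2000", freq="-2ME", calendar="noleap")
print(dates)
```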
Internal Changes diff --git a/xarray/coding/cftime_offsets.py b/xarray/coding/cftime_offsets.py index 2e594455874..0af75f404a2 100644 --- a/xarray/coding/cftime_offsets.py +++ b/xarray/coding/cftime_offsets.py @@ -845,7 +845,12 @@ def _generate_range(start, end, periods, offset): A generator object """ if start: - start = offset.rollforward(start) + # From pandas GH 56147 / 56832 to account for negative direction and + # range bounds + if offset.n >= 0: + start = offset.rollforward(start) + else: + start = offset.rollback(start) if periods is None and end < start and offset.n >= 0: end = None diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 94c44544fb5..59fcdca8ffa 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -130,6 +130,7 @@ def _importorskip( has_numexpr, requires_numexpr = _importorskip("numexpr") has_flox, requires_flox = _importorskip("flox") has_pandas_ge_2_2, __ = _importorskip("pandas", "2.2") +has_pandas_3, requires_pandas_3 = _importorskip("pandas", "3.0.0.dev0") # some special cases diff --git a/xarray/tests/test_cftime_offsets.py b/xarray/tests/test_cftime_offsets.py index 0110afe40ac..eabb7d2f4d6 100644 --- a/xarray/tests/test_cftime_offsets.py +++ b/xarray/tests/test_cftime_offsets.py @@ -42,6 +42,7 @@ has_cftime, has_pandas_ge_2_2, requires_cftime, + requires_pandas_3, ) cftime = pytest.importorskip("cftime") @@ -1354,7 +1355,7 @@ def test_calendar_specific_month_end_negative_freq( ) -> None: year = 2000 # Use a leap-year to highlight calendar differences result = cftime_range( - start="2000-12", + start="2001", end="2000", freq="-2ME", calendar=calendar, @@ -1464,7 +1465,7 @@ def test_date_range_errors() -> None: ("2020-02-01", "QE-DEC", "noleap", "gregorian", True, "2020-03-31", True), ("2020-02-01", "YS-FEB", "noleap", "gregorian", True, "2020-02-01", True), ("2020-02-01", "YE-FEB", "noleap", "gregorian", True, "2020-02-29", True), - ("2020-02-01", "-1YE-FEB", "noleap", "gregorian", True, "2020-02-29", True), + ("2020-02-01", "-1YE-FEB", "noleap", "gregorian", True, "2019-02-28", True), ("2020-02-28", "3h", "all_leap", "gregorian", False, "2020-02-28", True), ("2020-03-30", "ME", "360_day", "gregorian", False, "2020-03-31", True), ("2020-03-31", "ME", "gregorian", "360_day", None, "2020-03-30", False), @@ -1724,7 +1725,17 @@ def test_new_to_legacy_freq_pd_freq_passthrough(freq, expected): @pytest.mark.parametrize("start", ("2000", "2001")) @pytest.mark.parametrize("end", ("2000", "2001")) @pytest.mark.parametrize( - "freq", ("MS", "-1MS", "YS", "-1YS", "ME", "-1ME", "YE", "-1YE") + "freq", + ( + "MS", + pytest.param("-1MS", marks=requires_pandas_3), + "YS", + pytest.param("-1YS", marks=requires_pandas_3), + "ME", + pytest.param("-1ME", marks=requires_pandas_3), + "YE", + pytest.param("-1YE", marks=requires_pandas_3), + ), ) def test_cftime_range_same_as_pandas(start, end, freq): result = date_range(start, end, freq=freq, calendar="standard", use_cftime=True) From c01de3997cf886b91c8134f17fa086401dbb22a7 Mon Sep 17 00:00:00 2001 From: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> Date: Mon, 6 May 2024 12:17:22 -0700 Subject: [PATCH 07/13] Fix for ruff 0.4.3 (#9007) * Fix for ruff 0.4.3 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- ci/min_deps_check.py | 2 +- xarray/datatree_/datatree/__init__.py | 1 - xarray/datatree_/datatree/common.py | 9 ++++----- 
xarray/tests/test_dataset.py | 7 ++++--- xarray/tests/test_groupby.py | 6 +++--- xarray/tests/test_rolling.py | 6 +++--- 6 files changed, 15 insertions(+), 16 deletions(-) diff --git a/ci/min_deps_check.py b/ci/min_deps_check.py index 48ea323ed81..5ec7bff0a30 100755 --- a/ci/min_deps_check.py +++ b/ci/min_deps_check.py @@ -133,7 +133,7 @@ def process_pkg( - publication date of version suggested by policy (YYYY-MM-DD) - status ("<", "=", "> (!)") """ - print("Analyzing %s..." % pkg) + print(f"Analyzing {pkg}...") versions = query_conda(pkg) try: diff --git a/xarray/datatree_/datatree/__init__.py b/xarray/datatree_/datatree/__init__.py index 3159d612913..51c5f1b3073 100644 --- a/xarray/datatree_/datatree/__init__.py +++ b/xarray/datatree_/datatree/__init__.py @@ -1,7 +1,6 @@ # import public API from xarray.core.treenode import InvalidTreeError, NotFoundInTreeError - __all__ = ( "InvalidTreeError", "NotFoundInTreeError", diff --git a/xarray/datatree_/datatree/common.py b/xarray/datatree_/datatree/common.py index e4d52925ede..f4f74337c50 100644 --- a/xarray/datatree_/datatree/common.py +++ b/xarray/datatree_/datatree/common.py @@ -6,8 +6,9 @@ """ import warnings +from collections.abc import Hashable, Iterable, Mapping from contextlib import suppress -from typing import Any, Hashable, Iterable, List, Mapping +from typing import Any class TreeAttrAccessMixin: @@ -83,16 +84,14 @@ def __setattr__(self, name: str, value: Any) -> None: except AttributeError as e: # Don't accidentally shadow custom AttributeErrors, e.g. # DataArray.dims.setter - if str(e) != "{!r} object has no attribute {!r}".format( - type(self).__name__, name - ): + if str(e) != f"{type(self).__name__!r} object has no attribute {name!r}": raise raise AttributeError( f"cannot set attribute {name!r} on a {type(self).__name__!r} object. Use __setitem__ style" "assignment (e.g., `ds['name'] = ...`) instead of assigning variables." ) from e - def __dir__(self) -> List[str]: + def __dir__(self) -> list[str]: """Provide method name lookup and completion. Only provide 'public' methods. """ diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 301596e032f..59b5b2b9b71 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -284,7 +284,7 @@ def test_repr(self) -> None: Dimensions: (dim2: 9, dim3: 10, time: 20, dim1: 8) Coordinates: * dim2 (dim2) float64 72B 0.0 0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 - * dim3 (dim3) %s 40B 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' + * dim3 (dim3) {} 40B 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' * time (time) datetime64[ns] 160B 2000-01-01 2000-01-02 ... 2000-01-20 numbers (dim3) int64 80B 0 1 2 0 0 1 1 2 2 3 Dimensions without coordinates: dim1 @@ -293,8 +293,9 @@ def test_repr(self) -> None: var2 (dim1, dim2) float64 576B 1.162 -1.097 -2.123 ... 1.267 0.3328 var3 (dim3, dim1) float64 640B 0.5565 -0.2121 0.4563 ... 
-0.2452 -0.3616 Attributes: - foo: bar""" - % data["dim3"].dtype + foo: bar""".format( + data["dim3"].dtype + ) ) actual = "\n".join(x.rstrip() for x in repr(data).split("\n")) print(actual) diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py index e9e4eb1364c..7134fe96d01 100644 --- a/xarray/tests/test_groupby.py +++ b/xarray/tests/test_groupby.py @@ -577,8 +577,8 @@ def test_da_groupby_assign_coords() -> None: def test_groupby_repr(obj, dim) -> None: actual = repr(obj.groupby(dim)) expected = f"{obj.__class__.__name__}GroupBy" - expected += ", grouped over %r" % dim - expected += "\n%r groups with labels " % (len(np.unique(obj[dim]))) + expected += f", grouped over {dim!r}" + expected += f"\n{len(np.unique(obj[dim]))!r} groups with labels " if dim == "x": expected += "1, 2, 3, 4, 5." elif dim == "y": @@ -595,7 +595,7 @@ def test_groupby_repr_datetime(obj) -> None: actual = repr(obj.groupby("t.month")) expected = f"{obj.__class__.__name__}GroupBy" expected += ", grouped over 'month'" - expected += "\n%r groups with labels " % (len(np.unique(obj.t.dt.month))) + expected += f"\n{len(np.unique(obj.t.dt.month))!r} groups with labels " expected += "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12." assert actual == expected diff --git a/xarray/tests/test_rolling.py b/xarray/tests/test_rolling.py index 79a5ba0a667..89f6ebba2c3 100644 --- a/xarray/tests/test_rolling.py +++ b/xarray/tests/test_rolling.py @@ -254,7 +254,7 @@ def test_rolling_reduce( rolling_obj = da.rolling(time=window, center=center, min_periods=min_periods) # add nan prefix to numpy methods to get similar # behavior as bottleneck - actual = rolling_obj.reduce(getattr(np, "nan%s" % name)) + actual = rolling_obj.reduce(getattr(np, f"nan{name}")) expected = getattr(rolling_obj, name)() assert_allclose(actual, expected) assert actual.sizes == expected.sizes @@ -276,7 +276,7 @@ def test_rolling_reduce_nonnumeric( rolling_obj = da.rolling(time=window, center=center, min_periods=min_periods) # add nan prefix to numpy methods to get similar behavior as bottleneck - actual = rolling_obj.reduce(getattr(np, "nan%s" % name)) + actual = rolling_obj.reduce(getattr(np, f"nan{name}")) expected = getattr(rolling_obj, name)() assert_allclose(actual, expected) assert actual.sizes == expected.sizes @@ -741,7 +741,7 @@ def test_rolling_reduce(self, ds, center, min_periods, window, name) -> None: rolling_obj = ds.rolling(time=window, center=center, min_periods=min_periods) # add nan prefix to numpy methods to get similar behavior as bottleneck - actual = rolling_obj.reduce(getattr(np, "nan%s" % name)) + actual = rolling_obj.reduce(getattr(np, f"nan{name}")) expected = getattr(rolling_obj, name)() assert_allclose(actual, expected) assert ds.sizes == actual.sizes From 2ad98b132cf004d908a77c40a7dc7adbd792f668 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 6 May 2024 13:21:14 -0600 Subject: [PATCH 08/13] Trigger CI only if code files are modified. (#9006) * Trigger CI only if code files are modified. 
Fixes #8705 * Apply suggestions from code review Co-authored-by: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> --------- Co-authored-by: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> --- .github/workflows/ci-additional.yaml | 6 ++++++ .github/workflows/ci.yaml | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/.github/workflows/ci-additional.yaml b/.github/workflows/ci-additional.yaml index f904f259c51..c0f978fb0d8 100644 --- a/.github/workflows/ci-additional.yaml +++ b/.github/workflows/ci-additional.yaml @@ -6,6 +6,12 @@ on: pull_request: branches: - "main" + paths: + - 'ci/**' + - '.github/**' + - '/*' # covers files such as `pyproject.toml` + - 'properties/**' + - 'xarray/**' workflow_dispatch: # allows you to trigger manually concurrency: diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 349aa626142..b9b15d867a7 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -6,6 +6,12 @@ on: pull_request: branches: - "main" + paths: + - 'ci/**' + - '.github/**' + - '/*' # covers files such as `pyproject.toml` + - 'properties/**' + - 'xarray/**' workflow_dispatch: # allows you to trigger manually concurrency: From dcf2ac4addb5a92723c6b064fb6546ff02ebd1cd Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Tue, 7 May 2024 09:29:26 -0600 Subject: [PATCH 09/13] Zarr: Optimize `region="auto"` detection (#8997) * Zarr: Optimize region detection * Fix for unindexed dimensions. * Better example * small cleanup --- doc/user-guide/io.rst | 4 +- xarray/backends/api.py | 115 ++++------------------------------------ xarray/backends/zarr.py | 96 ++++++++++++++++++++++++++++++--- 3 files changed, 101 insertions(+), 114 deletions(-) diff --git a/doc/user-guide/io.rst b/doc/user-guide/io.rst index 63bf8b80d81..b73d0fdcb51 100644 --- a/doc/user-guide/io.rst +++ b/doc/user-guide/io.rst @@ -874,7 +874,7 @@ and then calling ``to_zarr`` with ``compute=False`` to write only metadata # The values of this dask array are entirely irrelevant; only the dtype, # shape and chunks are used dummies = dask.array.zeros(30, chunks=10) - ds = xr.Dataset({"foo": ("x", dummies)}) + ds = xr.Dataset({"foo": ("x", dummies)}, coords={"x": np.arange(30)}) path = "path/to/directory.zarr" # Now we write the metadata without computing any array values ds.to_zarr(path, compute=False) @@ -890,7 +890,7 @@ where the data should be written (in index space, not label space), e.g., # For convenience, we'll slice a single dataset, but in the real use-case # we would create them separately possibly even from separate processes. 
- ds = xr.Dataset({"foo": ("x", np.arange(30))}) + ds = xr.Dataset({"foo": ("x", np.arange(30))}, coords={"x": np.arange(30)}) # Any of the following region specifications are valid ds.isel(x=slice(0, 10)).to_zarr(path, region="auto") ds.isel(x=slice(10, 20)).to_zarr(path, region={"x": "auto"}) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 62085fe5e2a..c9a8630a575 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -27,7 +27,6 @@ _normalize_path, ) from xarray.backends.locks import _get_scheduler -from xarray.backends.zarr import open_zarr from xarray.core import indexing from xarray.core.combine import ( _infer_concat_order_from_positions, @@ -1522,92 +1521,6 @@ def save_mfdataset( ) -def _auto_detect_region(ds_new, ds_orig, dim): - # Create a mapping array of coordinates to indices on the original array - coord = ds_orig[dim] - da_map = DataArray(np.arange(coord.size), coords={dim: coord}) - - try: - da_idxs = da_map.sel({dim: ds_new[dim]}) - except KeyError as e: - if "not all values found" in str(e): - raise KeyError( - f"Not all values of coordinate '{dim}' in the new array were" - " found in the original store. Writing to a zarr region slice" - " requires that no dimensions or metadata are changed by the write." - ) - else: - raise e - - if (da_idxs.diff(dim) != 1).any(): - raise ValueError( - f"The auto-detected region of coordinate '{dim}' for writing new data" - " to the original store had non-contiguous indices. Writing to a zarr" - " region slice requires that the new data constitute a contiguous subset" - " of the original store." - ) - - dim_slice = slice(da_idxs.values[0], da_idxs.values[-1] + 1) - - return dim_slice - - -def _auto_detect_regions(ds, region, open_kwargs): - ds_original = open_zarr(**open_kwargs) - for key, val in region.items(): - if val == "auto": - region[key] = _auto_detect_region(ds, ds_original, key) - return region - - -def _validate_and_autodetect_region(ds, region, mode, open_kwargs) -> dict[str, slice]: - if region == "auto": - region = {dim: "auto" for dim in ds.dims} - - if not isinstance(region, dict): - raise TypeError(f"``region`` must be a dict, got {type(region)}") - - if any(v == "auto" for v in region.values()): - if mode != "r+": - raise ValueError( - f"``mode`` must be 'r+' when using ``region='auto'``, got {mode}" - ) - region = _auto_detect_regions(ds, region, open_kwargs) - - for k, v in region.items(): - if k not in ds.dims: - raise ValueError( - f"all keys in ``region`` are not in Dataset dimensions, got " - f"{list(region)} and {list(ds.dims)}" - ) - if not isinstance(v, slice): - raise TypeError( - "all values in ``region`` must be slice objects, got " - f"region={region}" - ) - if v.step not in {1, None}: - raise ValueError( - "step on all slices in ``region`` must be 1 or None, got " - f"region={region}" - ) - - non_matching_vars = [ - k for k, v in ds.variables.items() if not set(region).intersection(v.dims) - ] - if non_matching_vars: - raise ValueError( - f"when setting `region` explicitly in to_zarr(), all " - f"variables in the dataset to write must have at least " - f"one dimension in common with the region's dimensions " - f"{list(region.keys())}, but that is not " - f"the case for some variables here. 
To drop these variables " - f"from this dataset before exporting to zarr, write: " - f".drop_vars({non_matching_vars!r})" - ) - - return region - - def _validate_datatypes_for_zarr_append(zstore, dataset): """If variable exists in the store, confirm dtype of the data to append is compatible with existing dtype. @@ -1768,24 +1681,6 @@ def to_zarr( # validate Dataset keys, DataArray names _validate_dataset_names(dataset) - if region is not None: - open_kwargs = dict( - store=store, - synchronizer=synchronizer, - group=group, - consolidated=consolidated, - storage_options=storage_options, - zarr_version=zarr_version, - ) - region = _validate_and_autodetect_region(dataset, region, mode, open_kwargs) - # can't modify indexed with region writes - dataset = dataset.drop_vars(dataset.indexes) - if append_dim is not None and append_dim in region: - raise ValueError( - f"cannot list the same dimension in both ``append_dim`` and " - f"``region`` with to_zarr(), got {append_dim} in both" - ) - if zarr_version is None: # default to 2 if store doesn't specify it's version (e.g. a path) zarr_version = int(getattr(store, "_store_version", 2)) @@ -1815,6 +1710,16 @@ def to_zarr( write_empty=write_empty_chunks, ) + if region is not None: + zstore._validate_and_autodetect_region(dataset) + # can't modify indexed with region writes + dataset = dataset.drop_vars(dataset.indexes) + if append_dim is not None and append_dim in region: + raise ValueError( + f"cannot list the same dimension in both ``append_dim`` and " + f"``region`` with to_zarr(), got {append_dim} in both" + ) + if mode in ["a", "a-", "r+"]: _validate_datatypes_for_zarr_append(zstore, dataset) if append_dim is not None: diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 3d6baeefe01..e4a684e945d 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any import numpy as np +import pandas as pd from xarray import coding, conventions from xarray.backends.common import ( @@ -509,7 +510,9 @@ def ds(self): # TODO: consider deprecating this in favor of zarr_group return self.zarr_group - def open_store_variable(self, name, zarr_array): + def open_store_variable(self, name, zarr_array=None): + if zarr_array is None: + zarr_array = self.zarr_group[name] data = indexing.LazilyIndexedArray(ZarrArrayWrapper(zarr_array)) try_nczarr = self._mode == "r" dimensions, attributes = _get_zarr_dims_and_attrs( @@ -623,11 +626,7 @@ def store( # avoid needing to load index variables into memory. # TODO: consider making loading indexes lazy again? existing_vars, _, _ = conventions.decode_cf_variables( - { - k: v - for k, v in self.get_variables().items() - if k in existing_variable_names - }, + {k: self.open_store_variable(name=k) for k in existing_variable_names}, self.get_attrs(), ) # Modified variables must use the same encoding as the store. 
@@ -796,10 +795,93 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No region = tuple(write_region[dim] for dim in dims) writer.add(v.data, zarr_array, region) - def close(self): + def close(self) -> None: if self._close_store_on_close: self.zarr_group.store.close() + def _auto_detect_regions(self, ds, region): + for dim, val in region.items(): + if val != "auto": + continue + + if dim not in ds._variables: + # unindexed dimension + region[dim] = slice(0, ds.sizes[dim]) + continue + + variable = conventions.decode_cf_variable( + dim, self.open_store_variable(dim).compute() + ) + assert variable.dims == (dim,) + index = pd.Index(variable.data) + idxs = index.get_indexer(ds[dim].data) + if any(idxs == -1): + raise KeyError( + f"Not all values of coordinate '{dim}' in the new array were" + " found in the original store. Writing to a zarr region slice" + " requires that no dimensions or metadata are changed by the write." + ) + + if (np.diff(idxs) != 1).any(): + raise ValueError( + f"The auto-detected region of coordinate '{dim}' for writing new data" + " to the original store had non-contiguous indices. Writing to a zarr" + " region slice requires that the new data constitute a contiguous subset" + " of the original store." + ) + region[dim] = slice(idxs[0], idxs[-1] + 1) + return region + + def _validate_and_autodetect_region(self, ds) -> None: + region = self._write_region + + if region == "auto": + region = {dim: "auto" for dim in ds.dims} + + if not isinstance(region, dict): + raise TypeError(f"``region`` must be a dict, got {type(region)}") + if any(v == "auto" for v in region.values()): + if self._mode != "r+": + raise ValueError( + f"``mode`` must be 'r+' when using ``region='auto'``, got {self._mode!r}" + ) + region = self._auto_detect_regions(ds, region) + + # validate before attempting to auto-detect since the auto-detection + # should always return a valid slice. + for k, v in region.items(): + if k not in ds.dims: + raise ValueError( + f"all keys in ``region`` are not in Dataset dimensions, got " + f"{list(region)} and {list(ds.dims)}" + ) + if not isinstance(v, slice): + raise TypeError( + "all values in ``region`` must be slice objects, got " + f"region={region}" + ) + if v.step not in {1, None}: + raise ValueError( + "step on all slices in ``region`` must be 1 or None, got " + f"region={region}" + ) + + non_matching_vars = [ + k for k, v in ds.variables.items() if not set(region).intersection(v.dims) + ] + if non_matching_vars: + raise ValueError( + f"when setting `region` explicitly in to_zarr(), all " + f"variables in the dataset to write must have at least " + f"one dimension in common with the region's dimensions " + f"{list(region.keys())}, but that is not " + f"the case for some variables here. 
To drop these variables " + f"from this dataset before exporting to zarr, write: " + f".drop_vars({non_matching_vars!r})" + ) + + self._write_region = region + def open_zarr( store, From 4e9d557d47bf6792937792426559747204fce5ed Mon Sep 17 00:00:00 2001 From: Mark Harfouche Date: Tue, 7 May 2024 11:59:21 -0400 Subject: [PATCH 10/13] Add a benchmark to monitor performance for large dataset indexing (#9012) --- asv_bench/benchmarks/indexing.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 169c8af06e9..892a6cb3758 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -12,6 +12,7 @@ nt = 500 basic_indexes = { + "1scalar": {"x": 0}, "1slice": {"x": slice(0, 3)}, "1slice-1scalar": {"x": 0, "y": slice(None, None, 3)}, "2slicess-1scalar": {"x": slice(3, -3, 3), "y": 1, "t": slice(None, -3, 3)}, @@ -74,6 +75,10 @@ def setup(self, key): "x_coords": ("x", np.linspace(1.1, 2.1, nx)), }, ) + # Benchmark how indexing is slowed down by adding many scalar variable + # to the dataset + # https://github.com/pydata/xarray/pull/9003 + self.ds_large = self.ds.merge({f"extra_var{i}": i for i in range(400)}) class Indexing(Base): @@ -89,6 +94,11 @@ def time_indexing_outer(self, key): def time_indexing_vectorized(self, key): self.ds.isel(**vectorized_indexes[key]).load() + @parameterized(["key"], [list(basic_indexes.keys())]) + def time_indexing_basic_ds_large(self, key): + # https://github.com/pydata/xarray/pull/9003 + self.ds_large.isel(**basic_indexes[key]).load() + class Assignment(Base): @parameterized(["key"], [list(basic_indexes.keys())]) From 322e6706c0d85ec4c0c0763806c4edf158baa58c Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Tue, 7 May 2024 13:28:50 -0600 Subject: [PATCH 11/13] Avoid extra read from disk when creating Pandas Index. (#8893) * Avoid extra read from disk when creating Pandas Index. * Update xarray/core/indexes.py Co-authored-by: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> --------- Co-authored-by: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> --- xarray/core/indexes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py index e71c4a6f073..a005e1ebfe2 100644 --- a/xarray/core/indexes.py +++ b/xarray/core/indexes.py @@ -632,7 +632,8 @@ def from_variables( # the checks below. 
# preserve wrapped pd.Index (if any) - data = getattr(var._data, "array", var.data) + # accessing `.data` can load data from disk, so we only access if needed + data = getattr(var._data, "array") if hasattr(var._data, "array") else var.data # multi-index level variable: get level index if isinstance(var._data, PandasMultiIndexingAdapter): level = var._data.level From 71661d5b17865be426b646136a0d5766290c8d3d Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Tue, 7 May 2024 16:53:26 -0600 Subject: [PATCH 12/13] Fix benchmark CI (#9013) * [skip-ci] Fix benchmark CI * [skip-ci] reduce warnings * Fix indexing benchmark --- .github/workflows/benchmarks.yml | 6 +++--- asv_bench/asv.conf.json | 12 ++++++++---- asv_bench/benchmarks/groupby.py | 17 +++++++++-------- asv_bench/benchmarks/indexing.py | 1 + 4 files changed, 21 insertions(+), 15 deletions(-) diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index 7969847c61f..886bcfbd548 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -28,8 +28,11 @@ jobs: environment-name: xarray-tests cache-environment: true cache-environment-key: "${{runner.os}}-${{runner.arch}}-py${{env.PYTHON_VERSION}}-${{env.TODAY}}-${{hashFiles(env.CONDA_ENV_FILE)}}-benchmark" + # add "build" because of https://github.com/airspeed-velocity/asv/issues/1385 create-args: >- asv + build + mamba - name: Run benchmarks @@ -47,9 +50,6 @@ jobs: asv machine --yes echo "Baseline: ${{ github.event.pull_request.base.sha }} (${{ github.event.pull_request.base.label }})" echo "Contender: ${GITHUB_SHA} (${{ github.event.pull_request.head.label }})" - # Use mamba for env creation - # export CONDA_EXE=$(which mamba) - export CONDA_EXE=$(which conda) # Run benchmarks for current commit against base ASV_OPTIONS="--split --show-stderr --factor $ASV_FACTOR" asv continuous $ASV_OPTIONS ${{ github.event.pull_request.base.sha }} ${GITHUB_SHA} \ diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index a709d0a51a7..9dc86df712d 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -29,7 +29,7 @@ // If missing or the empty string, the tool will be automatically // determined by looking for tools on the PATH environment // variable. - "environment_type": "conda", + "environment_type": "mamba", "conda_channels": ["conda-forge"], // timeout in seconds for installing any dependencies in environment @@ -41,7 +41,7 @@ // The Pythons you'd like to test against. If not provided, defaults // to the current version of Python used to run `asv`. - "pythons": ["3.10"], + "pythons": ["3.11"], // The matrix of dependencies to test. Each key is the name of a // package (in PyPI) and the values are version numbers. An empty @@ -72,8 +72,12 @@ "sparse": [""], "cftime": [""] }, - - + // fix for bad builds + // https://github.com/airspeed-velocity/asv/issues/1389#issuecomment-2076131185 + "build_command": [ + "python -m build", + "python -mpip wheel --no-deps --no-build-isolation --no-index -w {build_cache_dir} {build_dir}" + ], // Combinations of libraries/python versions can be excluded/included // from the set to test. Each entry is a dictionary containing additional // key-value pairs to include/exclude. 
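The benchmark updates that follow also migrate to the frequency aliases expected by current pandas. A quick, non-exhaustive illustration of the mapping (assumes pandas >= 2.2):

```python
import pandas as pd

# Deprecated alias -> alias used in the updated benchmarks:
#   "H"  -> "h"   (hourly),     "48H" -> "48h"
#   "M"  -> "ME"  (month end),  "3M"  -> "3ME"
pd.date_range("2001-01-01", periods=24, freq="h")
pd.date_range("2001-01-01", periods=3, freq="ME")
```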
diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 1b3e55fa659..065c1b3b17f 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -68,6 +68,7 @@ def setup(self, *args, **kwargs): self.ds2d_mean = self.ds2d.groupby("b").mean().compute() +# TODO: These don't work now because we are calling `.compute` explicitly. class GroupByPandasDataFrame(GroupBy): """Run groupby tests using pandas DataFrame.""" @@ -111,11 +112,11 @@ def setup(self, *args, **kwargs): { "b": ("time", np.arange(365.0 * 24)), }, - coords={"time": pd.date_range("2001-01-01", freq="H", periods=365 * 24)}, + coords={"time": pd.date_range("2001-01-01", freq="h", periods=365 * 24)}, ) self.ds2d = self.ds1d.expand_dims(z=10) - self.ds1d_mean = self.ds1d.resample(time="48H").mean() - self.ds2d_mean = self.ds2d.resample(time="48H").mean() + self.ds1d_mean = self.ds1d.resample(time="48h").mean() + self.ds2d_mean = self.ds2d.resample(time="48h").mean() @parameterized(["ndim"], [(1, 2)]) def time_init(self, ndim): @@ -127,7 +128,7 @@ def time_init(self, ndim): def time_agg_small_num_groups(self, method, ndim, use_flox): ds = getattr(self, f"ds{ndim}d") with xr.set_options(use_flox=use_flox): - getattr(ds.resample(time="3M"), method)().compute() + getattr(ds.resample(time="3ME"), method)().compute() @parameterized( ["method", "ndim", "use_flox"], [("sum", "mean"), (1, 2), (True, False)] @@ -135,7 +136,7 @@ def time_agg_small_num_groups(self, method, ndim, use_flox): def time_agg_large_num_groups(self, method, ndim, use_flox): ds = getattr(self, f"ds{ndim}d") with xr.set_options(use_flox=use_flox): - getattr(ds.resample(time="48H"), method)().compute() + getattr(ds.resample(time="48h"), method)().compute() class ResampleDask(Resample): @@ -154,13 +155,13 @@ def setup(self, *args, **kwargs): }, coords={ "time": xr.date_range( - "2001-01-01", freq="H", periods=365 * 24, calendar="noleap" + "2001-01-01", freq="h", periods=365 * 24, calendar="noleap" ) }, ) self.ds2d = self.ds1d.expand_dims(z=10) - self.ds1d_mean = self.ds1d.resample(time="48H").mean() - self.ds2d_mean = self.ds2d.resample(time="48H").mean() + self.ds1d_mean = self.ds1d.resample(time="48h").mean() + self.ds2d_mean = self.ds2d.resample(time="48h").mean() @parameterized(["use_cftime", "use_flox"], [[True, False], [True, False]]) diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 892a6cb3758..529d023daa8 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -19,6 +19,7 @@ } basic_assignment_values = { + "1scalar": 0, "1slice": xr.DataArray(randn((3, ny), frac_nan=0.1), dims=["x", "y"]), "1slice-1scalar": xr.DataArray(randn(int(ny / 3) + 1, frac_nan=0.1), dims=["y"]), "2slicess-1scalar": xr.DataArray( From 6057128b7779611c03a546927955862b1dcd2572 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Wed, 8 May 2024 13:26:43 -0600 Subject: [PATCH 13/13] Avoid auto creation of indexes in concat (#8872) * test not creating indexes on concatenation * construct result dataset using Coordinates object with indexes passed explicitly * remove unnecessary overwriting of indexes * ConcatenatableArray class * use ConcatenableArray in tests * add regression tests * fix by performing check * refactor assert_valid_explicit_coords and rename dims->sizes * Revert "add regression tests" This reverts commit beb665a7109bdb627aa66ee277fd87edc195356d. * Revert "fix by performing check" This reverts commit 22f361dc590a83b2b3660539175a8a7cb1cba051. 
* Revert "refactor assert_valid_explicit_coords and rename dims->sizes" This reverts commit 55166fc7e002fa07d7a84f8d7fc460ddaad9674f. * fix failing test * possible fix for failing groupby test * Revert "possible fix for failing groupby test" This reverts commit 6e9ead6603de73c5ea6bd8f76d973525bb70b417. * test expand_dims doesn't create Index * add option to not create 1D index in expand_dims * refactor tests to consider data variables and coordinate variables separately * test expand_dims doesn't create Index * add option to not create 1D index in expand_dims * refactor tests to consider data variables and coordinate variables separately * fix bug causing new test to fail * test index auto-creation when iterable passed as new coordinate values * make test for iterable pass * added kwarg to dataarray * whatsnew * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Revert "refactor tests to consider data variables and coordinate variables separately" This reverts commit ba5627eebf7b580d0a0b9a171f1f94d7412662e3. * Revert "add option to not create 1D index in expand_dims" This reverts commit 95d453ccff1d2e2746c1970c0157f2de0b582105. * test that concat doesn't raise if create_1d_index=False * make test pass by passing create_1d_index down through concat * assert that an UnexpectedDataAccess error is raised when create_1d_index=True * eliminate possibility of xarray internals bypassing UnexpectedDataAccess error by accessing .array * update tests to use private versions of assertions * create_1d_index->create_index * Update doc/whats-new.rst Co-authored-by: Deepak Cherian * Rename create_1d_index -> create_index * fix ConcatenatableArray * formatting * whatsnew * add new create_index kwarg to overloads * split vars into data_vars and coord_vars in one loop * avoid mypy error by using new variable name * warn if create_index=True but no index created because dimension variable was a data var not a coord * add string marks in warning message * regression test for dtype changing in to_stacked_array * correct doctest * Remove outdated comment * test we can skip creation of indexes during shape promotion * make shape promotion test pass * point to issue in whatsnew * don't create dimension coordinates just to drop them at the end * Remove ToDo about not using Coordinates object to pass indexes Co-authored-by: Deepak Cherian * get rid of unlabeled_dims variable entirely * move ConcatenatableArray and similar to new file * formatting nit Co-authored-by: Justus Magin * renamed create_index -> create_index_for_new_dim in concat * renamed create_index -> create_index_for_new_dim in expand_dims * fix incorrect arg name * add example to docstring * add example of using new kwarg to docstring of expand_dims * add example of using new kwarg to docstring of concat * re-nit the nit Co-authored-by: Justus Magin * more instances of the nit * fix docstring doctest formatting nit --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Deepak Cherian Co-authored-by: Justus Magin --- doc/whats-new.rst | 5 +- xarray/core/concat.py | 67 ++++++++++--- xarray/core/dataarray.py | 12 ++- xarray/core/dataset.py | 43 +++++++-- xarray/tests/__init__.py | 54 ++--------- xarray/tests/arrays.py | 179 +++++++++++++++++++++++++++++++++++ xarray/tests/test_concat.py | 88 +++++++++++++++++ xarray/tests/test_dataset.py | 41 ++++++-- 8 files changed, 405 insertions(+), 84 deletions(-) create mode 100644 xarray/tests/arrays.py 
diff --git a/doc/whats-new.rst b/doc/whats-new.rst index a846c1b8a01..378e6330352 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -32,7 +32,10 @@ New Features - :py:func:`testing.assert_allclose`/:py:func:`testing.assert_equal` now accept a new argument `check_dims="transpose"`, controlling whether a transposed array is considered equal. (:issue:`5733`, :pull:`8991`) By `Ignacio Martinez Vazquez `_. - Added the option to avoid automatically creating 1D pandas indexes in :py:meth:`Dataset.expand_dims()`, by passing the new kwarg - `create_index=False`. (:pull:`8960`) + `create_index_for_new_dim=False`. (:pull:`8960`) + By `Tom Nicholas `_. +- Avoid automatically re-creating 1D pandas indexes in :py:func:`concat()`. Also added option to avoid creating 1D indexes for + new dimension coordinates by passing the new kwarg `create_index_for_new_dim=False`. (:issue:`8871`, :pull:`8872`) By `Tom Nicholas `_. Breaking changes diff --git a/xarray/core/concat.py b/xarray/core/concat.py index d95cbccd36a..b1cca586992 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -8,6 +8,7 @@ from xarray.core import dtypes, utils from xarray.core.alignment import align, reindex_variables +from xarray.core.coordinates import Coordinates from xarray.core.duck_array_ops import lazy_array_equiv from xarray.core.indexes import Index, PandasIndex from xarray.core.merge import ( @@ -42,6 +43,7 @@ def concat( fill_value: object = dtypes.NA, join: JoinOptions = "outer", combine_attrs: CombineAttrsOptions = "override", + create_index_for_new_dim: bool = True, ) -> T_Dataset: ... @@ -56,6 +58,7 @@ def concat( fill_value: object = dtypes.NA, join: JoinOptions = "outer", combine_attrs: CombineAttrsOptions = "override", + create_index_for_new_dim: bool = True, ) -> T_DataArray: ... @@ -69,6 +72,7 @@ def concat( fill_value=dtypes.NA, join: JoinOptions = "outer", combine_attrs: CombineAttrsOptions = "override", + create_index_for_new_dim: bool = True, ): """Concatenate xarray objects along a new or existing dimension. @@ -162,6 +166,8 @@ def concat( If a callable, it must expect a sequence of ``attrs`` dicts and a context object as its only parameters. + create_index_for_new_dim : bool, default: True + Whether to create a new ``PandasIndex`` object when the objects being concatenated contain scalar variables named ``dim``. 
Returns ------- @@ -217,6 +223,25 @@ def concat( x (new_dim) >> ds = xr.Dataset(coords={"x": 0}) + >>> xr.concat([ds, ds], dim="x") + Size: 16B + Dimensions: (x: 2) + Coordinates: + * x (x) int64 16B 0 0 + Data variables: + *empty* + + >>> xr.concat([ds, ds], dim="x").indexes + Indexes: + x Index([0, 0], dtype='int64', name='x') + + >>> xr.concat([ds, ds], dim="x", create_index_for_new_dim=False).indexes + Indexes: + *empty* """ # TODO: add ignore_index arguments copied from pandas.concat # TODO: support concatenating scalar coordinates even if the concatenated @@ -245,6 +270,7 @@ def concat( fill_value=fill_value, join=join, combine_attrs=combine_attrs, + create_index_for_new_dim=create_index_for_new_dim, ) elif isinstance(first_obj, Dataset): return _dataset_concat( @@ -257,6 +283,7 @@ def concat( fill_value=fill_value, join=join, combine_attrs=combine_attrs, + create_index_for_new_dim=create_index_for_new_dim, ) else: raise TypeError( @@ -439,7 +466,7 @@ def _parse_datasets( if dim in dims: continue - if dim not in dim_coords: + if dim in ds.coords and dim not in dim_coords: dim_coords[dim] = ds.coords[dim].variable dims = dims | set(ds.dims) @@ -456,6 +483,7 @@ def _dataset_concat( fill_value: Any = dtypes.NA, join: JoinOptions = "outer", combine_attrs: CombineAttrsOptions = "override", + create_index_for_new_dim: bool = True, ) -> T_Dataset: """ Concatenate a sequence of datasets along a new or existing dimension @@ -489,7 +517,6 @@ def _dataset_concat( datasets ) dim_names = set(dim_coords) - unlabeled_dims = dim_names - coord_names both_data_and_coords = coord_names & data_names if both_data_and_coords: @@ -502,7 +529,10 @@ def _dataset_concat( # case where concat dimension is a coordinate or data_var but not a dimension if (dim in coord_names or dim in data_names) and dim not in dim_names: - datasets = [ds.expand_dims(dim) for ds in datasets] + datasets = [ + ds.expand_dims(dim, create_index_for_new_dim=create_index_for_new_dim) + for ds in datasets + ] # determine which variables to concatenate concat_over, equals, concat_dim_lengths = _calc_concat_over( @@ -510,7 +540,7 @@ def _dataset_concat( ) # determine which variables to merge, and then merge them according to compat - variables_to_merge = (coord_names | data_names) - concat_over - unlabeled_dims + variables_to_merge = (coord_names | data_names) - concat_over result_vars = {} result_indexes = {} @@ -567,7 +597,8 @@ def get_indexes(name): var = ds._variables[name] if not var.dims: data = var.set_dims(dim).values - yield PandasIndex(data, dim, coord_dtype=var.dtype) + if create_index_for_new_dim: + yield PandasIndex(data, dim, coord_dtype=var.dtype) # create concatenation index, needed for later reindexing file_start_indexes = np.append(0, np.cumsum(concat_dim_lengths)) @@ -646,29 +677,33 @@ def get_indexes(name): # preserves original variable order result_vars[name] = result_vars.pop(name) - result = type(datasets[0])(result_vars, attrs=result_attrs) - - absent_coord_names = coord_names - set(result.variables) + absent_coord_names = coord_names - set(result_vars) if absent_coord_names: raise ValueError( f"Variables {absent_coord_names!r} are coordinates in some datasets but not others." 
) - result = result.set_coords(coord_names) - result.encoding = result_encoding - result = result.drop_vars(unlabeled_dims, errors="ignore") + result_data_vars = {} + coord_vars = {} + for name, result_var in result_vars.items(): + if name in coord_names: + coord_vars[name] = result_var + else: + result_data_vars[name] = result_var if index is not None: - # add concat index / coordinate last to ensure that its in the final Dataset if dim_var is not None: index_vars = index.create_variables({dim: dim_var}) else: index_vars = index.create_variables() - result[dim] = index_vars[dim] + + coord_vars[dim] = index_vars[dim] result_indexes[dim] = index - # TODO: add indexes at Dataset creation (when it is supported) - result = result._overwrite_indexes(result_indexes) + coords_obj = Coordinates(coord_vars, indexes=result_indexes) + + result = type(datasets[0])(result_data_vars, coords=coords_obj, attrs=result_attrs) + result.encoding = result_encoding return result @@ -683,6 +718,7 @@ def _dataarray_concat( fill_value: object = dtypes.NA, join: JoinOptions = "outer", combine_attrs: CombineAttrsOptions = "override", + create_index_for_new_dim: bool = True, ) -> T_DataArray: from xarray.core.dataarray import DataArray @@ -719,6 +755,7 @@ def _dataarray_concat( fill_value=fill_value, join=join, combine_attrs=combine_attrs, + create_index_for_new_dim=create_index_for_new_dim, ) merged_attrs = merge_attrs([da.attrs for da in arrays], combine_attrs) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index c89dedf1215..4dc897c1878 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -2558,7 +2558,7 @@ def expand_dims( self, dim: None | Hashable | Sequence[Hashable] | Mapping[Any, Any] = None, axis: None | int | Sequence[int] = None, - create_index: bool = True, + create_index_for_new_dim: bool = True, **dim_kwargs: Any, ) -> Self: """Return a new object with an additional axis (or axes) inserted at @@ -2569,7 +2569,7 @@ def expand_dims( coordinate consisting of a single value. The automatic creation of indexes to back new 1D coordinate variables - controlled by the create_index kwarg. + controlled by the create_index_for_new_dim kwarg. Parameters ---------- @@ -2586,8 +2586,8 @@ def expand_dims( multiple axes are inserted. In this case, dim arguments should be same length list. If axis=None is passed, all the axes will be inserted to the start of the result array. - create_index : bool, default is True - Whether to create new PandasIndex objects for any new 1D coordinate variables. + create_index_for_new_dim : bool, default: True + Whether to create new ``PandasIndex`` objects when the object being expanded contains scalar variables with names in ``dim``. 
**dim_kwargs : int or sequence or ndarray The keywords are arbitrary dimensions being inserted and the values are either the lengths of the new dims (if int is given), or their @@ -2651,7 +2651,9 @@ def expand_dims( dim = {dim: 1} dim = either_dict_or_kwargs(dim, dim_kwargs, "expand_dims") - ds = self._to_temp_dataset().expand_dims(dim, axis, create_index=create_index) + ds = self._to_temp_dataset().expand_dims( + dim, axis, create_index_for_new_dim=create_index_for_new_dim + ) return self._from_temp_dataset(ds) def set_index( diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 2ddcacd2fa0..09597670573 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4513,7 +4513,7 @@ def expand_dims( self, dim: None | Hashable | Sequence[Hashable] | Mapping[Any, Any] = None, axis: None | int | Sequence[int] = None, - create_index: bool = True, + create_index_for_new_dim: bool = True, **dim_kwargs: Any, ) -> Self: """Return a new object with an additional axis (or axes) inserted at @@ -4524,7 +4524,7 @@ def expand_dims( coordinate consisting of a single value. The automatic creation of indexes to back new 1D coordinate variables - controlled by the create_index kwarg. + controlled by the create_index_for_new_dim kwarg. Parameters ---------- @@ -4541,8 +4541,8 @@ def expand_dims( multiple axes are inserted. In this case, dim arguments should be same length list. If axis=None is passed, all the axes will be inserted to the start of the result array. - create_index : bool, default is True - Whether to create new PandasIndex objects for any new 1D coordinate variables. + create_index_for_new_dim : bool, default: True + Whether to create new ``PandasIndex`` objects when the object being expanded contains scalar variables with names in ``dim``. **dim_kwargs : int or sequence or ndarray The keywords are arbitrary dimensions being inserted and the values are either the lengths of the new dims (if int is given), or their @@ -4612,6 +4612,33 @@ def expand_dims( Data variables: temperature (y, x, time) float64 96B 0.5488 0.7152 0.6028 ... 0.7917 0.5289 + # Expand a scalar variable along a new dimension of the same name with and without creating a new index + + >>> ds = xr.Dataset(coords={"x": 0}) + >>> ds + Size: 8B + Dimensions: () + Coordinates: + x int64 8B 0 + Data variables: + *empty* + + >>> ds.expand_dims("x") + Size: 8B + Dimensions: (x: 1) + Coordinates: + * x (x) int64 8B 0 + Data variables: + *empty* + + >>> ds.expand_dims("x").indexes + Indexes: + x Index([0], dtype='int64', name='x') + + >>> ds.expand_dims("x", create_index_for_new_dim=False).indexes + Indexes: + *empty* + See Also -------- DataArray.expand_dims @@ -4663,7 +4690,7 @@ def expand_dims( # value within the dim dict to the length of the iterable # for later use. - if create_index: + if create_index_for_new_dim: index = PandasIndex(v, k) indexes[k] = index name_and_new_1d_var = index.create_variables() @@ -4705,14 +4732,14 @@ def expand_dims( variables[k] = v.set_dims(dict(all_dims)) else: if k not in variables: - if k in coord_names and create_index: + if k in coord_names and create_index_for_new_dim: # If dims includes a label of a non-dimension coordinate, # it will be promoted to a 1D coordinate with a single value. index, index_vars = create_default_index_implicit(v.set_dims(k)) indexes[k] = index variables.update(index_vars) else: - if create_index: + if create_index_for_new_dim: warnings.warn( f"No index created for dimension {k} because variable {k} is not a coordinate. 
" f"To create an index for {k}, please first call `.set_coords('{k}')` on this object.", @@ -5400,7 +5427,7 @@ def to_stacked_array( [3, 4, 5, 7]]) Coordinates: * z (z) object 32B MultiIndex - * variable (z) object 32B 'a' 'a' 'a' 'b' + * variable (z) 1: - raise UnexpectedDataAccess("Tried accessing more than one element.") - return self.array[tuple_idxr] - - -class DuckArrayWrapper(utils.NDArrayMixin): - """Array-like that prevents casting to array. - Modeled after cupy.""" - - def __init__(self, array: np.ndarray): - self.array = array - - def __getitem__(self, key): - return type(self)(self.array[key]) - - def __array__(self, dtype: np.typing.DTypeLike = None): - raise UnexpectedDataAccess("Tried accessing data") - - def __array_namespace__(self): - """Present to satisfy is_duck_array test.""" - - class ReturnItem: def __getitem__(self, key): return key diff --git a/xarray/tests/arrays.py b/xarray/tests/arrays.py new file mode 100644 index 00000000000..983e620d1f0 --- /dev/null +++ b/xarray/tests/arrays.py @@ -0,0 +1,179 @@ +from collections.abc import Iterable +from typing import Any, Callable + +import numpy as np + +from xarray.core import utils +from xarray.core.indexing import ExplicitlyIndexed + +""" +This module contains various lazy array classes which can be wrapped and manipulated by xarray objects but will raise on data access. +""" + + +class UnexpectedDataAccess(Exception): + pass + + +class InaccessibleArray(utils.NDArrayMixin, ExplicitlyIndexed): + """Disallows any loading.""" + + def __init__(self, array): + self.array = array + + def get_duck_array(self): + raise UnexpectedDataAccess("Tried accessing data") + + def __array__(self, dtype: np.typing.DTypeLike = None): + raise UnexpectedDataAccess("Tried accessing data") + + def __getitem__(self, key): + raise UnexpectedDataAccess("Tried accessing data.") + + +class FirstElementAccessibleArray(InaccessibleArray): + def __getitem__(self, key): + tuple_idxr = key.tuple + if len(tuple_idxr) > 1: + raise UnexpectedDataAccess("Tried accessing more than one element.") + return self.array[tuple_idxr] + + +class DuckArrayWrapper(utils.NDArrayMixin): + """Array-like that prevents casting to array. 
+ Modeled after cupy.""" + + def __init__(self, array: np.ndarray): + self.array = array + + def __getitem__(self, key): + return type(self)(self.array[key]) + + def __array__(self, dtype: np.typing.DTypeLike = None): + raise UnexpectedDataAccess("Tried accessing data") + + def __array_namespace__(self): + """Present to satisfy is_duck_array test.""" + + +CONCATENATABLEARRAY_HANDLED_ARRAY_FUNCTIONS: dict[str, Callable] = {} + + +def implements(numpy_function): + """Register an __array_function__ implementation for ConcatenatableArray objects.""" + + def decorator(func): + CONCATENATABLEARRAY_HANDLED_ARRAY_FUNCTIONS[numpy_function] = func + return func + + return decorator + + +@implements(np.concatenate) +def concatenate( + arrays: Iterable["ConcatenatableArray"], /, *, axis=0 +) -> "ConcatenatableArray": + if any(not isinstance(arr, ConcatenatableArray) for arr in arrays): + raise TypeError + + result = np.concatenate([arr._array for arr in arrays], axis=axis) + return ConcatenatableArray(result) + + +@implements(np.stack) +def stack( + arrays: Iterable["ConcatenatableArray"], /, *, axis=0 +) -> "ConcatenatableArray": + if any(not isinstance(arr, ConcatenatableArray) for arr in arrays): + raise TypeError + + result = np.stack([arr._array for arr in arrays], axis=axis) + return ConcatenatableArray(result) + + +@implements(np.result_type) +def result_type(*arrays_and_dtypes) -> np.dtype: + """Called by xarray to ensure all arguments to concat have the same dtype.""" + first_dtype, *other_dtypes = (np.dtype(obj) for obj in arrays_and_dtypes) + for other_dtype in other_dtypes: + if other_dtype != first_dtype: + raise ValueError("dtypes not all consistent") + return first_dtype + + +@implements(np.broadcast_to) +def broadcast_to( + x: "ConcatenatableArray", /, shape: tuple[int, ...] +) -> "ConcatenatableArray": + """ + Broadcasts an array to a specified shape, by either manipulating chunk keys or copying chunk manifest entries. + """ + if not isinstance(x, ConcatenatableArray): + raise TypeError + + result = np.broadcast_to(x._array, shape=shape) + return ConcatenatableArray(result) + + +class ConcatenatableArray: + """Disallows loading or coercing to an index but does support concatenation / stacking.""" + + def __init__(self, array): + # use ._array instead of .array because we don't want this to be accessible even to xarray's internals (e.g. 
create_default_index_implicit) + self._array = array + + @property + def dtype(self: Any) -> np.dtype: + return self._array.dtype + + @property + def shape(self: Any) -> tuple[int, ...]: + return self._array.shape + + @property + def ndim(self: Any) -> int: + return self._array.ndim + + def __repr__(self: Any) -> str: + return f"{type(self).__name__}(array={self._array!r})" + + def get_duck_array(self): + raise UnexpectedDataAccess("Tried accessing data") + + def __array__(self, dtype: np.typing.DTypeLike = None): + raise UnexpectedDataAccess("Tried accessing data") + + def __getitem__(self, key) -> "ConcatenatableArray": + """Some cases of concat require supporting expanding dims by dimensions of size 1""" + # see https://data-apis.org/array-api/2022.12/API_specification/indexing.html#multi-axis-indexing + arr = self._array + for axis, indexer_1d in enumerate(key): + if indexer_1d is None: + arr = np.expand_dims(arr, axis) + elif indexer_1d is Ellipsis: + pass + else: + raise UnexpectedDataAccess("Tried accessing data.") + return ConcatenatableArray(arr) + + def __array_function__(self, func, types, args, kwargs) -> Any: + if func not in CONCATENATABLEARRAY_HANDLED_ARRAY_FUNCTIONS: + return NotImplemented + + # Note: this allows subclasses that don't override + # __array_function__ to handle ManifestArray objects + if not all(issubclass(t, ConcatenatableArray) for t in types): + return NotImplemented + + return CONCATENATABLEARRAY_HANDLED_ARRAY_FUNCTIONS[func](*args, **kwargs) + + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs) -> Any: + """We have to define this in order to convince xarray that this class is a duckarray, even though we will never support ufuncs.""" + return NotImplemented + + def astype(self, dtype: np.dtype, /, *, copy: bool = True) -> "ConcatenatableArray": + """Needed because xarray will call this even when it's a no-op""" + if dtype != self.dtype: + raise NotImplementedError() + else: + return self diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 1ddb5a569bd..0c570de3b52 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -12,7 +12,9 @@ from xarray.core.coordinates import Coordinates from xarray.core.indexes import PandasIndex from xarray.tests import ( + ConcatenatableArray, InaccessibleArray, + UnexpectedDataAccess, assert_array_equal, assert_equal, assert_identical, @@ -999,6 +1001,63 @@ def test_concat_str_dtype(self, dtype, dim) -> None: assert np.issubdtype(actual.x2.dtype, dtype) + def test_concat_avoids_index_auto_creation(self) -> None: + # TODO once passing indexes={} directly to Dataset constructor is allowed then no need to create coords first + coords = Coordinates( + {"x": ConcatenatableArray(np.array([1, 2, 3]))}, indexes={} + ) + datasets = [ + Dataset( + {"a": (["x", "y"], ConcatenatableArray(np.zeros((3, 3))))}, + coords=coords, + ) + for _ in range(2) + ] + # should not raise on concat + combined = concat(datasets, dim="x") + assert combined["a"].shape == (6, 3) + assert combined["a"].dims == ("x", "y") + + # nor have auto-created any indexes + assert combined.indexes == {} + + # should not raise on stack + combined = concat(datasets, dim="z") + assert combined["a"].shape == (2, 3, 3) + assert combined["a"].dims == ("z", "x", "y") + + # nor have auto-created any indexes + assert combined.indexes == {} + + def test_concat_avoids_index_auto_creation_new_1d_coord(self) -> None: + # create 0D coordinates (without indexes) + datasets = [ + Dataset( + coords={"x": 
ConcatenatableArray(np.array(10))}, + ) + for _ in range(2) + ] + + with pytest.raises(UnexpectedDataAccess): + concat(datasets, dim="x", create_index_for_new_dim=True) + + # should not raise on concat iff create_index_for_new_dim=False + combined = concat(datasets, dim="x", create_index_for_new_dim=False) + assert combined["x"].shape == (2,) + assert combined["x"].dims == ("x",) + + # nor have auto-created any indexes + assert combined.indexes == {} + + def test_concat_promote_shape_without_creating_new_index(self) -> None: + # different shapes but neither have indexes + ds1 = Dataset(coords={"x": 0}) + ds2 = Dataset(data_vars={"x": [1]}).drop_indexes("x") + actual = concat([ds1, ds2], dim="x", create_index_for_new_dim=False) + expected = Dataset(data_vars={"x": [0, 1]}).drop_indexes("x") + assert_identical(actual, expected, check_default_indexes=False) + assert actual.indexes == {} + class TestConcatDataArray: def test_concat(self) -> None: @@ -1072,6 +1131,35 @@ def test_concat_lazy(self) -> None: assert combined.shape == (2, 3, 3) assert combined.dims == ("z", "x", "y") + def test_concat_avoids_index_auto_creation(self) -> None: + # TODO once passing indexes={} directly to DataArray constructor is allowed then no need to create coords first + coords = Coordinates( + {"x": ConcatenatableArray(np.array([1, 2, 3]))}, indexes={} + ) + arrays = [ + DataArray( + ConcatenatableArray(np.zeros((3, 3))), + dims=["x", "y"], + coords=coords, + ) + for _ in range(2) + ] + # should not raise on concat + combined = concat(arrays, dim="x") + assert combined.shape == (6, 3) + assert combined.dims == ("x", "y") + + # nor have auto-created any indexes + assert combined.indexes == {} + + # should not raise on stack + combined = concat(arrays, dim="z") + assert combined.shape == (2, 3, 3) + assert combined.dims == ("z", "x", "y") + + # nor have auto-created any indexes + assert combined.indexes == {} + @pytest.mark.parametrize("fill_value", [dtypes.NA, 2, 2.0]) def test_concat_fill_value(self, fill_value) -> None: foo = DataArray([1, 2], coords=[("x", [1, 2])]) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 59b5b2b9b71..584776197e3 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -3431,16 +3431,22 @@ def test_expand_dims_kwargs_python36plus(self) -> None: ) assert_identical(other_way_expected, other_way) - @pytest.mark.parametrize("create_index_flag", [True, False]) - def test_expand_dims_create_index_data_variable(self, create_index_flag): + @pytest.mark.parametrize("create_index_for_new_dim_flag", [True, False]) + def test_expand_dims_create_index_data_variable( + self, create_index_for_new_dim_flag + ): # data variables should not gain an index ever ds = Dataset({"x": 0}) - if create_index_flag: + if create_index_for_new_dim_flag: with pytest.warns(UserWarning, match="No index created"): - expanded = ds.expand_dims("x", create_index=create_index_flag) + expanded = ds.expand_dims( + "x", create_index_for_new_dim=create_index_for_new_dim_flag + ) else: - expanded = ds.expand_dims("x", create_index=create_index_flag) + expanded = ds.expand_dims( + "x", create_index_for_new_dim=create_index_for_new_dim_flag + ) # TODO Can't just create the expected dataset directly using constructor because of GH issue 8959 expected = Dataset({"x": ("x", [0])}).drop_indexes("x").reset_coords("x") @@ -3449,13 +3455,13 @@ def test_expand_dims_create_index_data_variable(self, create_index_flag): assert expanded.indexes == {} def 
test_expand_dims_create_index_coordinate_variable(self): - # coordinate variables should gain an index only if create_index is True (the default) + # coordinate variables should gain an index only if create_index_for_new_dim is True (the default) ds = Dataset(coords={"x": 0}) expanded = ds.expand_dims("x") expected = Dataset({"x": ("x", [0])}) assert_identical(expanded, expected) - expanded_no_index = ds.expand_dims("x", create_index=False) + expanded_no_index = ds.expand_dims("x", create_index_for_new_dim=False) # TODO Can't just create the expected dataset directly using constructor because of GH issue 8959 expected = Dataset(coords={"x": ("x", [0])}).drop_indexes("x") @@ -3469,7 +3475,7 @@ def test_expand_dims_create_index_from_iterable(self): expected = Dataset({"x": ("x", [0, 1])}) assert_identical(expanded, expected) - expanded_no_index = ds.expand_dims(x=[0, 1], create_index=False) + expanded_no_index = ds.expand_dims(x=[0, 1], create_index_for_new_dim=False) # TODO Can't just create the expected dataset directly using constructor because of GH issue 8959 expected = Dataset(coords={"x": ("x", [0, 1])}).drop_indexes("x") @@ -3971,6 +3977,25 @@ def test_to_stacked_array_to_unstacked_dataset_different_dimension(self) -> None x = y.to_unstacked_dataset("features") assert_identical(D, x) + def test_to_stacked_array_preserves_dtype(self) -> None: + # regression test for bug found in https://github.com/pydata/xarray/pull/8872#issuecomment-2081218616 + ds = xr.Dataset( + data_vars={ + "a": (("x", "y"), [[0, 1, 2], [3, 4, 5]]), + "b": ("x", [6, 7]), + }, + coords={"y": ["u", "v", "w"]}, + ) + stacked = ds.to_stacked_array("z", sample_dims=["x"]) + + # coordinate created from variables names should be of string dtype + data = np.array(["a", "a", "a", "b"], dtype=" None: data = create_test_data(seed=0) expected = data.copy()
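
Taken together, a minimal sketch of how ``create_index_for_new_dim`` behaves in ``concat``, mirroring the tests added above (the coordinate values are illustrative):

```
import xarray as xr

# two datasets with a scalar coordinate "x" and no indexes
ds1 = xr.Dataset(coords={"x": 0})
ds2 = xr.Dataset(coords={"x": 1})

# default behaviour: a PandasIndex is created for the new concat dimension
with_index = xr.concat([ds1, ds2], dim="x")
assert "x" in with_index.indexes

# opting out leaves the result entirely index-free
without_index = xr.concat([ds1, ds2], dim="x", create_index_for_new_dim=False)
assert len(without_index.indexes) == 0
```
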