From 22995e95b5a58fa3e880655d69f1d4cc35a6d8b4 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 25 Mar 2024 01:11:36 -0400 Subject: [PATCH 01/64] test not creating indexes on concatenation --- xarray/tests/test_concat.py | 53 +++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 0cf4cc03a09..ed5c8f287c1 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -978,6 +978,32 @@ def test_concat_str_dtype(self, dtype, dim) -> None: assert np.issubdtype(actual.x2.dtype, dtype) + def test_concat_avoids_index_auto_creation(self) -> None: + # TODO once passing indexes={} directly to DataArray constructor is allowed then no need to create coords first + coords = Coordinates({"x": np.array([1, 2, 3])}, indexes={}) + datasets = [ + Dataset( + {"a": (["x", "y"], np.zeros((3, 3)))}, + coords=coords, + ) + for _ in range(2) + ] + # should not raise on concat + combined = concat(datasets, dim="x") + assert combined["a"].shape == (6, 3) + assert combined["a"].dims == ("x", "y") + + # nor have auto-created any indexes + assert combined.indexes == {} + + # should not raise on stack + combined = concat(datasets, dim="z") + assert combined["a"].shape == (2, 3, 3) + assert combined["a"].dims == ("z", "x", "y") + + # nor have auto-created any indexes + assert combined.indexes == {} + class TestConcatDataArray: def test_concat(self) -> None: @@ -1051,6 +1077,33 @@ def test_concat_lazy(self) -> None: assert combined.shape == (2, 3, 3) assert combined.dims == ("z", "x", "y") + def test_concat_avoids_index_auto_creation(self) -> None: + # TODO once passing indexes={} directly to DataArray constructor is allowed then no need to create coords first + coords = Coordinates({"x": np.array([1, 2, 3])}, indexes={}) + arrays = [ + DataArray( + np.zeros((3, 3)), + dims=["x", "y"], + coords=coords, + ) + for _ in range(2) + ] + # should not raise on concat + combined = concat(arrays, dim="x") + assert combined.shape == (6, 3) + assert combined.dims == ("x", "y") + + # nor have auto-created any indexes + assert combined.indexes == {} + + # should not raise on stack + combined = concat(arrays, dim="z") + assert combined.shape == (2, 3, 3) + assert combined.dims == ("z", "x", "y") + + # nor have auto-created any indexes + assert combined.indexes == {} + @pytest.mark.parametrize("fill_value", [dtypes.NA, 2, 2.0]) def test_concat_fill_value(self, fill_value) -> None: foo = DataArray([1, 2], coords=[("x", [1, 2])]) From 7142c9f2cf227abe7eee20b181d0f398a1644789 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 25 Mar 2024 01:12:03 -0400 Subject: [PATCH 02/64] construct result dataset using Coordinates object with indexes passed explicitly --- xarray/core/concat.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index d95cbccd36a..708a395c66b 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -8,6 +8,7 @@ from xarray.core import dtypes, utils from xarray.core.alignment import align, reindex_variables +from xarray.core.coordinates import Coordinates from xarray.core.duck_array_ops import lazy_array_equiv from xarray.core.indexes import Index, PandasIndex from xarray.core.merge import ( @@ -646,14 +647,26 @@ def get_indexes(name): # preserves original variable order result_vars[name] = result_vars.pop(name) - result = type(datasets[0])(result_vars, attrs=result_attrs) - - absent_coord_names = coord_names - set(result.variables) + absent_coord_names = coord_names - set(result_vars) if absent_coord_names: raise ValueError( f"Variables {absent_coord_names!r} are coordinates in some datasets but not others." ) - result = result.set_coords(coord_names) + coord_vars = { + name: result_var + for name, result_var in result_vars.items() + if name in coord_names + } + coords = Coordinates(coord_vars, indexes=result_indexes) + + # TODO: this is just the complement of the set of coord_vars + result_data_vars = { + name: result_var + for name, result_var in result_vars.items() + if name not in coord_names + } + + result = type(datasets[0])(result_data_vars, coords=coords, attrs=result_attrs) result.encoding = result_encoding result = result.drop_vars(unlabeled_dims, errors="ignore") @@ -668,6 +681,7 @@ def get_indexes(name): result_indexes[dim] = index # TODO: add indexes at Dataset creation (when it is supported) + # TODO: do we actually need this step anymore? result = result._overwrite_indexes(result_indexes) return result From 7fb075a8821065bdec76b4e837858a0bc978f845 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 25 Mar 2024 01:28:47 -0400 Subject: [PATCH 03/64] remove unnecessary overwriting of indexes --- xarray/core/concat.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 708a395c66b..468c0595b0a 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -678,11 +678,6 @@ def get_indexes(name): else: index_vars = index.create_variables() result[dim] = index_vars[dim] - result_indexes[dim] = index - - # TODO: add indexes at Dataset creation (when it is supported) - # TODO: do we actually need this step anymore? - result = result._overwrite_indexes(result_indexes) return result From 285c1dee688503c21c233a290b3801bfd6c64a84 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sun, 24 Mar 2024 21:24:10 -0400 Subject: [PATCH 04/64] ConcatenatableArray class --- xarray/tests/__init__.py | 101 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 5007db9eeb2..81beba80cf1 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -4,7 +4,9 @@ import platform import string import warnings +from collections.abc import Iterable from contextlib import contextmanager, nullcontext +from typing import Any, Callable from unittest import mock # noqa: F401 import numpy as np @@ -208,6 +210,105 @@ def __getitem__(self, key): raise UnexpectedDataAccess("Tried accessing data.") +HANDLED_ARRAY_FUNCTIONS: dict[str, Callable] = {} + + +def implements(numpy_function): + """Register an __array_function__ implementation for ManifestArray objects.""" + + def decorator(func): + HANDLED_ARRAY_FUNCTIONS[numpy_function] = func + return func + + return decorator + + +@implements(np.concatenate) +def concatenate( + arrays: Iterable[ConcatenatableArray], /, *, axis=0 +) -> ConcatenatableArray: + if any(not isinstance(arr, ConcatenatableArray) for arr in arrays): + raise TypeError + + result = np.concatenate([arr.array for arr in arrays], axis=axis) + return ConcatenatableArray(result) + + +@implements(np.stack) +def stack(arrays: Iterable[ConcatenatableArray], /, *, axis=0) -> ConcatenatableArray: + if any(not isinstance(arr, ConcatenatableArray) for arr in arrays): + raise TypeError + + result = np.stack([arr.array for arr in arrays], axis=axis) + return ConcatenatableArray(result) + + +@implements(np.result_type) +def result_type(*arrays_and_dtypes) -> np.dtype: + """Called by xarray to ensure all arguments to concat have the same dtype.""" + first_dtype, *other_dtypes = (np.dtype(obj) for obj in arrays_and_dtypes) + for other_dtype in other_dtypes: + if other_dtype != first_dtype: + raise ValueError("dtypes not all consistent") + return first_dtype + + +@implements(np.broadcast_to) +def broadcast_to( + x: ConcatenatableArray, /, shape: tuple[int, ...] +) -> ConcatenatableArray: + """ + Broadcasts an array to a specified shape, by either manipulating chunk keys or copying chunk manifest entries. + """ + if not isinstance(x, ConcatenatableArray): + raise TypeError + + result = np.broadcast_to(x.array, shape=shape) + return ConcatenatableArray(result) + + +class ConcatenatableArray(utils.NDArrayMixin): + """Disallows loading or coercing to an index but does support concatenation / stacking.""" + + # TODO only reason this is different from InaccessibleArray is to avoid it being a subclass of ExplicitlyIndexed + + HANDLED_ARRAY_FUNCTIONS = [concatenate, stack, result_type] + + def __init__(self, array): + self.array = array + + def get_duck_array(self): + raise UnexpectedDataAccess("Tried accessing data") + + def __array__(self, dtype: np.typing.DTypeLike = None): + raise UnexpectedDataAccess("Tried accessing data") + + def __getitem__(self, key): + raise UnexpectedDataAccess("Tried accessing data.") + + def __array_function__(self, func, types, args, kwargs) -> Any: + if func not in HANDLED_ARRAY_FUNCTIONS: + return NotImplemented + + # Note: this allows subclasses that don't override + # __array_function__ to handle ManifestArray objects + if not all(issubclass(t, ConcatenatableArray) for t in types): + return NotImplemented + + return HANDLED_ARRAY_FUNCTIONS[func](*args, **kwargs) + + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs) -> Any: + """We have to define this in order to convince xarray that this class is a duckarray, even though we will never support ufuncs.""" + return NotImplemented + + def astype(self, dtype: np.dtype, /, *, copy: bool = True) -> ConcatenatableArray: + """Needed because xarray will call this even when it's a no-op""" + if dtype != self.dtype: + raise NotImplementedError() + else: + return self + + class FirstElementAccessibleArray(InaccessibleArray): def __getitem__(self, key): tuple_idxr = key.tuple From cc2475703d02063345767627db7fb60f643dbd98 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Wed, 27 Mar 2024 20:50:18 -0400 Subject: [PATCH 05/64] use ConcatenableArray in tests --- xarray/tests/test_concat.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index ed5c8f287c1..a9ed15658f0 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -12,6 +12,7 @@ from xarray.core.coordinates import Coordinates from xarray.core.indexes import PandasIndex from xarray.tests import ( + ConcatenatableArray, InaccessibleArray, assert_array_equal, assert_equal, @@ -979,11 +980,13 @@ def test_concat_str_dtype(self, dtype, dim) -> None: assert np.issubdtype(actual.x2.dtype, dtype) def test_concat_avoids_index_auto_creation(self) -> None: - # TODO once passing indexes={} directly to DataArray constructor is allowed then no need to create coords first - coords = Coordinates({"x": np.array([1, 2, 3])}, indexes={}) + # TODO once passing indexes={} directly to Dataset constructor is allowed then no need to create coords first + coords = Coordinates( + {"x": ConcatenatableArray(np.array([1, 2, 3]))}, indexes={} + ) datasets = [ Dataset( - {"a": (["x", "y"], np.zeros((3, 3)))}, + {"a": (["x", "y"], ConcatenatableArray(np.zeros((3, 3))))}, coords=coords, ) for _ in range(2) @@ -1079,10 +1082,12 @@ def test_concat_lazy(self) -> None: def test_concat_avoids_index_auto_creation(self) -> None: # TODO once passing indexes={} directly to DataArray constructor is allowed then no need to create coords first - coords = Coordinates({"x": np.array([1, 2, 3])}, indexes={}) + coords = Coordinates( + {"x": ConcatenatableArray(np.array([1, 2, 3]))}, indexes={} + ) arrays = [ DataArray( - np.zeros((3, 3)), + ConcatenatableArray(np.zeros((3, 3))), dims=["x", "y"], coords=coords, ) From beb665a7109bdb627aa66ee277fd87edc195356d Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Wed, 27 Mar 2024 22:05:54 -0400 Subject: [PATCH 06/64] add regression tests --- xarray/tests/test_coordinates.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_coordinates.py b/xarray/tests/test_coordinates.py index 40743194ce6..aed2f37700e 100644 --- a/xarray/tests/test_coordinates.py +++ b/xarray/tests/test_coordinates.py @@ -1,5 +1,6 @@ from __future__ import annotations +import numpy as np import pandas as pd import pytest @@ -8,7 +9,7 @@ from xarray.core.dataarray import DataArray from xarray.core.dataset import Dataset from xarray.core.indexes import PandasIndex, PandasMultiIndex -from xarray.core.variable import IndexVariable +from xarray.core.variable import IndexVariable, Variable from xarray.tests import assert_identical, source_ndarray @@ -69,6 +70,12 @@ def test_init_dim_sizes_conflict(self) -> None: with pytest.raises(ValueError): Coordinates(coords={"foo": ("x", [1, 2]), "bar": ("x", [1, 2, 3, 4])}) + def test_init_var_shares_name_with_dim(self) -> None: + # regression test for GH #8883 + var = Variable(data=np.arange(6).reshape(2, 3), dims=["x", "y"]) + with pytest.raises(ValueError): + Coordinates(coords={"x": var}, indexes={}) + def test_from_pandas_multiindex(self) -> None: midx = pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=("one", "two")) coords = Coordinates.from_pandas_multiindex(midx, "x") From 22f361dc590a83b2b3660539175a8a7cb1cba051 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Wed, 27 Mar 2024 22:08:57 -0400 Subject: [PATCH 07/64] fix by performing check --- xarray/core/coordinates.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index 251edd1fc6f..e85aafc4479 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -22,7 +22,11 @@ assert_no_index_corrupted, create_default_index_implicit, ) -from xarray.core.merge import merge_coordinates_without_align, merge_coords +from xarray.core.merge import ( + assert_valid_explicit_coords, + merge_coordinates_without_align, + merge_coords, +) from xarray.core.types import DataVars, Self, T_DataArray, T_Xarray from xarray.core.utils import ( Frozen, @@ -306,6 +310,14 @@ def __init__( else: variables[name] = var + dims = set(d for var in variables.values() for d in var.dims) + sizes = { + d: -1 for d in dims + } # TODO the lengths here are never used, so assert_valid_explicit_coords should be refactored + assert_valid_explicit_coords( + variables, dims=sizes, explicit_coords=list(variables) + ) + if indexes is None: indexes = {} else: From 55166fc7e002fa07d7a84f8d7fc460ddaad9674f Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Wed, 27 Mar 2024 22:12:15 -0400 Subject: [PATCH 08/64] refactor assert_valid_explicit_coords and rename dims->sizes --- xarray/core/coordinates.py | 5 +---- xarray/core/merge.py | 10 +++++----- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index e85aafc4479..653e6be0ca2 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -311,11 +311,8 @@ def __init__( variables[name] = var dims = set(d for var in variables.values() for d in var.dims) - sizes = { - d: -1 for d in dims - } # TODO the lengths here are never used, so assert_valid_explicit_coords should be refactored assert_valid_explicit_coords( - variables, dims=sizes, explicit_coords=list(variables) + variables, dims=dims, explicit_coords=list(variables) ) if indexes is None: diff --git a/xarray/core/merge.py b/xarray/core/merge.py index cbd06c8fdc5..b8afdecec6e 100644 --- a/xarray/core/merge.py +++ b/xarray/core/merge.py @@ -564,7 +564,7 @@ def merge_coords( def assert_valid_explicit_coords( variables: Mapping[Any, Any], - dims: Mapping[Any, int], + dims: Iterable[Any], explicit_coords: Iterable[Hashable], ) -> None: """Validate explicit coordinate names/dims. @@ -721,16 +721,16 @@ def merge_core( collected, prioritized, compat=compat, combine_attrs=combine_attrs ) - dims = calculate_dimensions(variables) + sizes = calculate_dimensions(variables) coord_names, noncoord_names = determine_coords(coerced) if compat == "minimal": # coordinates may be dropped in merged results coord_names.intersection_update(variables) if explicit_coords is not None: - assert_valid_explicit_coords(variables, dims, explicit_coords) + assert_valid_explicit_coords(variables, sizes.keys(), explicit_coords) coord_names.update(explicit_coords) - for dim, size in dims.items(): + for dim, size in sizes.items(): if dim in variables: coord_names.add(dim) ambiguous_coords = coord_names.intersection(noncoord_names) @@ -745,7 +745,7 @@ def merge_core( combine_attrs, ) - return _MergeResult(variables, coord_names, dims, out_indexes, attrs) + return _MergeResult(variables, coord_names, sizes, out_indexes, attrs) def merge( From da6692b3a856af636fd3c447321759abfb366e56 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Thu, 28 Mar 2024 10:48:59 -0400 Subject: [PATCH 09/64] Revert "add regression tests" This reverts commit beb665a7109bdb627aa66ee277fd87edc195356d. --- xarray/tests/test_coordinates.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/xarray/tests/test_coordinates.py b/xarray/tests/test_coordinates.py index aed2f37700e..40743194ce6 100644 --- a/xarray/tests/test_coordinates.py +++ b/xarray/tests/test_coordinates.py @@ -1,6 +1,5 @@ from __future__ import annotations -import numpy as np import pandas as pd import pytest @@ -9,7 +8,7 @@ from xarray.core.dataarray import DataArray from xarray.core.dataset import Dataset from xarray.core.indexes import PandasIndex, PandasMultiIndex -from xarray.core.variable import IndexVariable, Variable +from xarray.core.variable import IndexVariable from xarray.tests import assert_identical, source_ndarray @@ -70,12 +69,6 @@ def test_init_dim_sizes_conflict(self) -> None: with pytest.raises(ValueError): Coordinates(coords={"foo": ("x", [1, 2]), "bar": ("x", [1, 2, 3, 4])}) - def test_init_var_shares_name_with_dim(self) -> None: - # regression test for GH #8883 - var = Variable(data=np.arange(6).reshape(2, 3), dims=["x", "y"]) - with pytest.raises(ValueError): - Coordinates(coords={"x": var}, indexes={}) - def test_from_pandas_multiindex(self) -> None: midx = pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=("one", "two")) coords = Coordinates.from_pandas_multiindex(midx, "x") From 35dfb67beb0b0c4de1cf83d7894f0594d5392d50 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Thu, 28 Mar 2024 10:49:53 -0400 Subject: [PATCH 10/64] Revert "fix by performing check" This reverts commit 22f361dc590a83b2b3660539175a8a7cb1cba051. --- xarray/core/coordinates.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index 653e6be0ca2..251edd1fc6f 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -22,11 +22,7 @@ assert_no_index_corrupted, create_default_index_implicit, ) -from xarray.core.merge import ( - assert_valid_explicit_coords, - merge_coordinates_without_align, - merge_coords, -) +from xarray.core.merge import merge_coordinates_without_align, merge_coords from xarray.core.types import DataVars, Self, T_DataArray, T_Xarray from xarray.core.utils import ( Frozen, @@ -310,11 +306,6 @@ def __init__( else: variables[name] = var - dims = set(d for var in variables.values() for d in var.dims) - assert_valid_explicit_coords( - variables, dims=dims, explicit_coords=list(variables) - ) - if indexes is None: indexes = {} else: From fd3de2be6b5b18f37fad7470e399e12044b22844 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Thu, 28 Mar 2024 10:50:34 -0400 Subject: [PATCH 11/64] Revert "refactor assert_valid_explicit_coords and rename dims->sizes" This reverts commit 55166fc7e002fa07d7a84f8d7fc460ddaad9674f. --- xarray/core/merge.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/xarray/core/merge.py b/xarray/core/merge.py index b8afdecec6e..cbd06c8fdc5 100644 --- a/xarray/core/merge.py +++ b/xarray/core/merge.py @@ -564,7 +564,7 @@ def merge_coords( def assert_valid_explicit_coords( variables: Mapping[Any, Any], - dims: Iterable[Any], + dims: Mapping[Any, int], explicit_coords: Iterable[Hashable], ) -> None: """Validate explicit coordinate names/dims. @@ -721,16 +721,16 @@ def merge_core( collected, prioritized, compat=compat, combine_attrs=combine_attrs ) - sizes = calculate_dimensions(variables) + dims = calculate_dimensions(variables) coord_names, noncoord_names = determine_coords(coerced) if compat == "minimal": # coordinates may be dropped in merged results coord_names.intersection_update(variables) if explicit_coords is not None: - assert_valid_explicit_coords(variables, sizes.keys(), explicit_coords) + assert_valid_explicit_coords(variables, dims, explicit_coords) coord_names.update(explicit_coords) - for dim, size in sizes.items(): + for dim, size in dims.items(): if dim in variables: coord_names.add(dim) ambiguous_coords = coord_names.intersection(noncoord_names) @@ -745,7 +745,7 @@ def merge_core( combine_attrs, ) - return _MergeResult(variables, coord_names, sizes, out_indexes, attrs) + return _MergeResult(variables, coord_names, dims, out_indexes, attrs) def merge( From 21afbb1ee88d44e0193f0ec384dac0c3d3b90f05 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Thu, 28 Mar 2024 16:06:23 -0400 Subject: [PATCH 12/64] fix failing test --- xarray/core/concat.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 468c0595b0a..5ef3c01d427 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -652,14 +652,26 @@ def get_indexes(name): raise ValueError( f"Variables {absent_coord_names!r} are coordinates in some datasets but not others." ) + coord_vars = { name: result_var for name, result_var in result_vars.items() if name in coord_names } + + if index is not None: + if dim_var is not None: + index_vars = index.create_variables({dim: dim_var}) + else: + index_vars = index.create_variables() + + coord_vars[dim] = index_vars[dim] + result_indexes[dim] = index + unlabeled_dims = unlabeled_dims - set([dim]) + + # TODO: add indexes at Dataset creation (when it is supported) coords = Coordinates(coord_vars, indexes=result_indexes) - # TODO: this is just the complement of the set of coord_vars result_data_vars = { name: result_var for name, result_var in result_vars.items() @@ -671,14 +683,6 @@ def get_indexes(name): result = result.drop_vars(unlabeled_dims, errors="ignore") - if index is not None: - # add concat index / coordinate last to ensure that its in the final Dataset - if dim_var is not None: - index_vars = index.create_variables({dim: dim_var}) - else: - index_vars = index.create_variables() - result[dim] = index_vars[dim] - return result From 6e9ead6603de73c5ea6bd8f76d973525bb70b417 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Thu, 28 Mar 2024 16:50:22 -0400 Subject: [PATCH 13/64] possible fix for failing groupby test --- xarray/core/concat.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 5ef3c01d427..8cd75502fa3 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -658,6 +658,11 @@ def get_indexes(name): for name, result_var in result_vars.items() if name in coord_names } + result_data_vars = { + name: result_var + for name, result_var in result_vars.items() + if name not in coord_names + } if index is not None: if dim_var is not None: @@ -668,16 +673,14 @@ def get_indexes(name): coord_vars[dim] = index_vars[dim] result_indexes[dim] = index unlabeled_dims = unlabeled_dims - set([dim]) + else: + if dim in result_data_vars: + coord_vars[dim] = result_data_vars[dim] + result_data_vars.pop(dim) # TODO: add indexes at Dataset creation (when it is supported) coords = Coordinates(coord_vars, indexes=result_indexes) - result_data_vars = { - name: result_var - for name, result_var in result_vars.items() - if name not in coord_names - } - result = type(datasets[0])(result_data_vars, coords=coords, attrs=result_attrs) result.encoding = result_encoding From 2534712041377b7ef934f0f0f2191c27bf0e99c0 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Fri, 29 Mar 2024 11:00:10 -0400 Subject: [PATCH 14/64] Revert "possible fix for failing groupby test" This reverts commit 6e9ead6603de73c5ea6bd8f76d973525bb70b417. --- xarray/core/concat.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 8cd75502fa3..5ef3c01d427 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -658,11 +658,6 @@ def get_indexes(name): for name, result_var in result_vars.items() if name in coord_names } - result_data_vars = { - name: result_var - for name, result_var in result_vars.items() - if name not in coord_names - } if index is not None: if dim_var is not None: @@ -673,14 +668,16 @@ def get_indexes(name): coord_vars[dim] = index_vars[dim] result_indexes[dim] = index unlabeled_dims = unlabeled_dims - set([dim]) - else: - if dim in result_data_vars: - coord_vars[dim] = result_data_vars[dim] - result_data_vars.pop(dim) # TODO: add indexes at Dataset creation (when it is supported) coords = Coordinates(coord_vars, indexes=result_indexes) + result_data_vars = { + name: result_var + for name, result_var in result_vars.items() + if name not in coord_names + } + result = type(datasets[0])(result_data_vars, coords=coords, attrs=result_attrs) result.encoding = result_encoding From 3e848ebdcca5cc641be3ff9b1b9791f74208d487 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Fri, 19 Apr 2024 11:11:25 -0400 Subject: [PATCH 15/64] test expand_dims doesn't create Index --- xarray/tests/test_dataset.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index e2a64964775..020af51a5f8 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -3431,6 +3431,23 @@ def test_expand_dims_kwargs_python36plus(self) -> None: ) assert_identical(other_way_expected, other_way) + def test_expand_dims_creates_indexvariable(self): + # data variables should not gain an index ever + ds = Dataset({"a": 0}) + for flag in [True, False]: + expanded = ds.expand_dims("x", create_1d_index=flag) + expected = Dataset({"a": ("x", [0])}) + assert_identical(expanded, expected) + assert expanded.indexes == {} + + # coordinate variables should gain an index only if create_1d_index is True (the default) + ds = Dataset(coords={"x": 0}) + expanded = ds.expand_dims("x") + expected = Dataset({"x": ("x", [0])}) + assert_identical(expanded, expected) + expanded_no_index = ds.expand_dims("x", create_1d_index=False) + assert expanded_no_index.indexes == {} + @requires_pandas_version_two def test_expand_dims_non_nanosecond_conversion(self) -> None: # Regression test for https://github.com/pydata/xarray/issues/7493#issuecomment-1953091000 From 95d453ccff1d2e2746c1970c0157f2de0b582105 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Fri, 19 Apr 2024 11:40:06 -0400 Subject: [PATCH 16/64] add option to not create 1D index in expand_dims --- xarray/core/dataset.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 10bf1466156..861356f3d19 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4479,6 +4479,7 @@ def expand_dims( self, dim: None | Hashable | Sequence[Hashable] | Mapping[Any, Any] = None, axis: None | int | Sequence[int] = None, + create_1d_index: bool = True, **dim_kwargs: Any, ) -> Self: """Return a new object with an additional axis (or axes) inserted at @@ -4488,6 +4489,8 @@ def expand_dims( If dim is already a scalar coordinate, it will be promoted to a 1D coordinate consisting of a single value. + The + Parameters ---------- dim : hashable, sequence of hashable, mapping, or None @@ -4503,6 +4506,8 @@ def expand_dims( multiple axes are inserted. In this case, dim arguments should be same length list. If axis=None is passed, all the axes will be inserted to the start of the result array. + create_1d_index : bool, default is True + Whether to create new PandasIndex objects for new 1D coordinate variables. **dim_kwargs : int or sequence or ndarray The keywords are arbitrary dimensions being inserted and the values are either the lengths of the new dims (if int is given), or their @@ -4622,6 +4627,8 @@ def expand_dims( # save the coordinates to the variables dict, and set the # value within the dim dict to the length of the iterable # for later use. + + # TODO should we have an option to not create a variable here? index = PandasIndex(v, k) indexes[k] = index variables.update(index.create_variables()) @@ -4660,11 +4667,16 @@ def expand_dims( variables[k] = v.set_dims(dict(all_dims)) else: if k not in variables: - # If dims includes a label of a non-dimension coordinate, - # it will be promoted to a 1D coordinate with a single value. - index, index_vars = create_default_index_implicit(v.set_dims(k)) - indexes[k] = index - variables.update(index_vars) + if create_1d_index: + # If dims includes a label of a non-dimension coordinate, + # it will be promoted to a 1D coordinate with a single value. + index, index_vars = create_default_index_implicit(v.set_dims(k)) + indexes[k] = index + variables.update(index_vars) + else: + # create 1D variable without creating a new index + new_1d_var = v.set_dims(k) + variables.update({k: new_1d_var}) return self._replace_with_new_dims( variables, coord_names=coord_names, indexes=indexes From ba5627eebf7b580d0a0b9a171f1f94d7412662e3 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Fri, 19 Apr 2024 22:47:07 -0400 Subject: [PATCH 17/64] refactor tests to consider data variables and coordinate variables separately --- xarray/tests/test_dataset.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 020af51a5f8..8c76e1ffd24 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -3431,21 +3431,34 @@ def test_expand_dims_kwargs_python36plus(self) -> None: ) assert_identical(other_way_expected, other_way) - def test_expand_dims_creates_indexvariable(self): + @pytest.mark.parametrize("create_1d_index_flag", [True, False]) + def test_expand_dims_create_index_data_variable(self, create_1d_index_flag): # data variables should not gain an index ever - ds = Dataset({"a": 0}) - for flag in [True, False]: - expanded = ds.expand_dims("x", create_1d_index=flag) - expected = Dataset({"a": ("x", [0])}) - assert_identical(expanded, expected) - assert expanded.indexes == {} + ds = Dataset({"x": 0}) + expanded = ds.expand_dims("x", create_1d_index=create_1d_index_flag) + + # TODO I can't just create the expected dataset directly using constructor because of GH issue 8959 + # expected = Dataset(data_vars={"x": ("x", [0])}) + expected = Dataset({"x": ("x", [0])}).drop_indexes("x").reset_coords("x") + + # TODO also can't just assert equivalence because it will fail internal invariants default indexes checks + # assert_identical(expanded, expected) + assert expected.data_vars == {"x": Variable(data=[0], dims=["x"])} + assert expanded.indexes == {} + def test_expand_dims_create_index_coordinate_variable(self): # coordinate variables should gain an index only if create_1d_index is True (the default) ds = Dataset(coords={"x": 0}) expanded = ds.expand_dims("x") expected = Dataset({"x": ("x", [0])}) assert_identical(expanded, expected) + expanded_no_index = ds.expand_dims("x", create_1d_index=False) + expected = Dataset(coords={"x": ("x", [0])}).drop_indexes("x") + + # TODO also can't just assert equivalence because it will fail internal invariants default indexes checks + # assert_identical(expanded, expected) + assert expanded_no_index.coords == {"x": Variable(data=[0], dims=["x"])} assert expanded_no_index.indexes == {} @requires_pandas_version_two From 3719ba7ca6d66dbb1c7a105907beb13e8d826e37 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Fri, 19 Apr 2024 11:11:25 -0400 Subject: [PATCH 18/64] test expand_dims doesn't create Index --- xarray/tests/test_dataset.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index a948fafc815..54a7b0a6f87 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -3431,6 +3431,23 @@ def test_expand_dims_kwargs_python36plus(self) -> None: ) assert_identical(other_way_expected, other_way) + def test_expand_dims_creates_indexvariable(self): + # data variables should not gain an index ever + ds = Dataset({"a": 0}) + for flag in [True, False]: + expanded = ds.expand_dims("x", create_1d_index=flag) + expected = Dataset({"a": ("x", [0])}) + assert_identical(expanded, expected) + assert expanded.indexes == {} + + # coordinate variables should gain an index only if create_1d_index is True (the default) + ds = Dataset(coords={"x": 0}) + expanded = ds.expand_dims("x") + expected = Dataset({"x": ("x", [0])}) + assert_identical(expanded, expected) + expanded_no_index = ds.expand_dims("x", create_1d_index=False) + assert expanded_no_index.indexes == {} + @requires_pandas_version_two def test_expand_dims_non_nanosecond_conversion(self) -> None: # Regression test for https://github.com/pydata/xarray/issues/7493#issuecomment-1953091000 From 018e74bb1c94141cd86502bfa2ac964a44f28860 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Fri, 19 Apr 2024 11:40:06 -0400 Subject: [PATCH 19/64] add option to not create 1D index in expand_dims --- xarray/core/dataset.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 96f3be00995..233b7833785 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4497,6 +4497,7 @@ def expand_dims( self, dim: None | Hashable | Sequence[Hashable] | Mapping[Any, Any] = None, axis: None | int | Sequence[int] = None, + create_1d_index: bool = True, **dim_kwargs: Any, ) -> Self: """Return a new object with an additional axis (or axes) inserted at @@ -4506,6 +4507,8 @@ def expand_dims( If dim is already a scalar coordinate, it will be promoted to a 1D coordinate consisting of a single value. + The + Parameters ---------- dim : hashable, sequence of hashable, mapping, or None @@ -4521,6 +4524,8 @@ def expand_dims( multiple axes are inserted. In this case, dim arguments should be same length list. If axis=None is passed, all the axes will be inserted to the start of the result array. + create_1d_index : bool, default is True + Whether to create new PandasIndex objects for new 1D coordinate variables. **dim_kwargs : int or sequence or ndarray The keywords are arbitrary dimensions being inserted and the values are either the lengths of the new dims (if int is given), or their @@ -4640,6 +4645,8 @@ def expand_dims( # save the coordinates to the variables dict, and set the # value within the dim dict to the length of the iterable # for later use. + + # TODO should we have an option to not create a variable here? index = PandasIndex(v, k) indexes[k] = index variables.update(index.create_variables()) @@ -4678,11 +4685,16 @@ def expand_dims( variables[k] = v.set_dims(dict(all_dims)) else: if k not in variables: - # If dims includes a label of a non-dimension coordinate, - # it will be promoted to a 1D coordinate with a single value. - index, index_vars = create_default_index_implicit(v.set_dims(k)) - indexes[k] = index - variables.update(index_vars) + if create_1d_index: + # If dims includes a label of a non-dimension coordinate, + # it will be promoted to a 1D coordinate with a single value. + index, index_vars = create_default_index_implicit(v.set_dims(k)) + indexes[k] = index + variables.update(index_vars) + else: + # create 1D variable without creating a new index + new_1d_var = v.set_dims(k) + variables.update({k: new_1d_var}) return self._replace_with_new_dims( variables, coord_names=coord_names, indexes=indexes From f680505d3dd630bc8fa0a200e5d9efde7cf8ada9 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Fri, 19 Apr 2024 22:47:07 -0400 Subject: [PATCH 20/64] refactor tests to consider data variables and coordinate variables separately --- xarray/tests/test_dataset.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 54a7b0a6f87..74c4af1790d 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -3431,21 +3431,34 @@ def test_expand_dims_kwargs_python36plus(self) -> None: ) assert_identical(other_way_expected, other_way) - def test_expand_dims_creates_indexvariable(self): + @pytest.mark.parametrize("create_1d_index_flag", [True, False]) + def test_expand_dims_create_index_data_variable(self, create_1d_index_flag): # data variables should not gain an index ever - ds = Dataset({"a": 0}) - for flag in [True, False]: - expanded = ds.expand_dims("x", create_1d_index=flag) - expected = Dataset({"a": ("x", [0])}) - assert_identical(expanded, expected) - assert expanded.indexes == {} + ds = Dataset({"x": 0}) + expanded = ds.expand_dims("x", create_1d_index=create_1d_index_flag) + + # TODO I can't just create the expected dataset directly using constructor because of GH issue 8959 + # expected = Dataset(data_vars={"x": ("x", [0])}) + expected = Dataset({"x": ("x", [0])}).drop_indexes("x").reset_coords("x") + + # TODO also can't just assert equivalence because it will fail internal invariants default indexes checks + # assert_identical(expanded, expected) + assert expected.data_vars == {"x": Variable(data=[0], dims=["x"])} + assert expanded.indexes == {} + def test_expand_dims_create_index_coordinate_variable(self): # coordinate variables should gain an index only if create_1d_index is True (the default) ds = Dataset(coords={"x": 0}) expanded = ds.expand_dims("x") expected = Dataset({"x": ("x", [0])}) assert_identical(expanded, expected) + expanded_no_index = ds.expand_dims("x", create_1d_index=False) + expected = Dataset(coords={"x": ("x", [0])}).drop_indexes("x") + + # TODO also can't just assert equivalence because it will fail internal invariants default indexes checks + # assert_identical(expanded, expected) + assert expanded_no_index.coords == {"x": Variable(data=[0], dims=["x"])} assert expanded_no_index.indexes == {} @requires_pandas_version_two From f10509aa983a4baae66163030b26827ef6c11344 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Fri, 19 Apr 2024 23:22:11 -0400 Subject: [PATCH 21/64] fix bug causing new test to fail --- xarray/core/dataset.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 233b7833785..9752afbd749 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4507,8 +4507,9 @@ def expand_dims( If dim is already a scalar coordinate, it will be promoted to a 1D coordinate consisting of a single value. - The - + The automatic creation of indexes to back new 1D coordinate variables + controlled by the create_1d_index kwarg. + Parameters ---------- dim : hashable, sequence of hashable, mapping, or None @@ -4525,7 +4526,7 @@ def expand_dims( same length list. If axis=None is passed, all the axes will be inserted to the start of the result array. create_1d_index : bool, default is True - Whether to create new PandasIndex objects for new 1D coordinate variables. + Whether to create new PandasIndex objects for any new 1D coordinate variables. **dim_kwargs : int or sequence or ndarray The keywords are arbitrary dimensions being inserted and the values are either the lengths of the new dims (if int is given), or their @@ -4685,7 +4686,7 @@ def expand_dims( variables[k] = v.set_dims(dict(all_dims)) else: if k not in variables: - if create_1d_index: + if k in coord_names and create_1d_index: # If dims includes a label of a non-dimension coordinate, # it will be promoted to a 1D coordinate with a single value. index, index_vars = create_default_index_implicit(v.set_dims(k)) From 8152c0a40c4386eb01cd1974769527ae5c85a570 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Fri, 19 Apr 2024 23:51:28 -0400 Subject: [PATCH 22/64] test index auto-creation when iterable passed as new coordinate values --- xarray/tests/test_dataset.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 74c4af1790d..feee69bf49a 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -11,6 +11,7 @@ from typing import Any, Literal import numpy as np +import numpy.testing as npt import pandas as pd import pytest from pandas.core.indexes.datetimes import DatetimeIndex @@ -3461,6 +3462,22 @@ def test_expand_dims_create_index_coordinate_variable(self): assert expanded_no_index.coords == {"x": Variable(data=[0], dims=["x"])} assert expanded_no_index.indexes == {} + def test_expand_dims_create_index_from_iterable(self): + ds = Dataset(coords={"x": 0}) + expanded = ds.expand_dims(x=[0, 1]) + expected = Dataset({"x": ("x", [0, 1])}) + assert_identical(expanded, expected) + + expanded_no_index = ds.expand_dims(x=[0, 1], create_1d_index=False) + expected = Dataset(coords={"x": ("x", [0, 1])}).drop_indexes("x") + + # TODO also can't just assert equivalence because it will fail internal invariants default indexes checks + # assert_identical(expanded, expected) + assert list(expanded_no_index.coords) == ["x"] + assert isinstance(expanded_no_index.coords["x"].variable, Variable) + npt.assert_array_equal(expanded_no_index.coords["x"].data, np.array([0, 1])) + assert expanded_no_index.indexes == {} + @requires_pandas_version_two def test_expand_dims_non_nanosecond_conversion(self) -> None: # Regression test for https://github.com/pydata/xarray/issues/7493#issuecomment-1953091000 From aa813cff0d6011af81cb4a15c87e9857f35c3d26 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sat, 20 Apr 2024 00:06:27 -0400 Subject: [PATCH 23/64] make test for iterable pass --- xarray/core/dataset.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 9752afbd749..25cf8263bf5 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4647,10 +4647,13 @@ def expand_dims( # value within the dim dict to the length of the iterable # for later use. - # TODO should we have an option to not create a variable here? - index = PandasIndex(v, k) - indexes[k] = index - variables.update(index.create_variables()) + if create_1d_index: + index = PandasIndex(v, k) + indexes[k] = index + name_and_new_1d_var = index.create_variables() + else: + name_and_new_1d_var = {k: Variable(data=v, dims=k)} + variables.update(name_and_new_1d_var) coord_names.add(k) dim[k] = variables[k].size elif isinstance(v, int): From e78de7d5e93649114d9aaa916bf0f4185882be9c Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sat, 20 Apr 2024 00:36:37 -0400 Subject: [PATCH 24/64] added kwarg to dataarray --- xarray/core/dataarray.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 509962ff80d..818275dedd3 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -2557,6 +2557,7 @@ def expand_dims( self, dim: None | Hashable | Sequence[Hashable] | Mapping[Any, Any] = None, axis: None | int | Sequence[int] = None, + create_1d_index: bool = True, **dim_kwargs: Any, ) -> Self: """Return a new object with an additional axis (or axes) inserted at @@ -2566,6 +2567,9 @@ def expand_dims( If dim is already a scalar coordinate, it will be promoted to a 1D coordinate consisting of a single value. + The automatic creation of indexes to back new 1D coordinate variables + controlled by the create_1d_index kwarg. + Parameters ---------- dim : Hashable, sequence of Hashable, dict, or None, optional @@ -2581,6 +2585,8 @@ def expand_dims( multiple axes are inserted. In this case, dim arguments should be same length list. If axis=None is passed, all the axes will be inserted to the start of the result array. + create_1d_index : bool, default is True + Whether to create new PandasIndex objects for any new 1D coordinate variables. **dim_kwargs : int or sequence or ndarray The keywords are arbitrary dimensions being inserted and the values are either the lengths of the new dims (if int is given), or their @@ -2644,7 +2650,9 @@ def expand_dims( dim = {dim: 1} dim = either_dict_or_kwargs(dim, dim_kwargs, "expand_dims") - ds = self._to_temp_dataset().expand_dims(dim, axis) + ds = self._to_temp_dataset().expand_dims( + dim, axis, create_1d_index=create_1d_index + ) return self._from_temp_dataset(ds) def set_index( From b1329cc08c2b08cf032c6a6ddb4da8ff99bbbee7 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sat, 20 Apr 2024 01:22:51 -0400 Subject: [PATCH 25/64] whatsnew --- doc/whats-new.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 2332f7f236b..ffa6f562e41 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -29,6 +29,9 @@ New Features for example, will retain the object. However, one cannot do operations that are not possible on the `ExtensionArray` then, such as broadcasting. By `Ilan Gold `_. +- Added the option to avoid automatically creating 1D pandas indexes in :py:meth:`Dataset.expand_dims()`, by passing the new kwarg + `create_1d_index=False`. (:pull:`8960`) + By `Tom Nicholas `_. Breaking changes ~~~~~~~~~~~~~~~~ From a9f7e0c7c177f16371819963f16388537fead51d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 20 Apr 2024 05:23:22 +0000 Subject: [PATCH 26/64] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- doc/whats-new.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index ffa6f562e41..7ecbc8fc3e6 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -30,7 +30,7 @@ New Features then, such as broadcasting. By `Ilan Gold `_. - Added the option to avoid automatically creating 1D pandas indexes in :py:meth:`Dataset.expand_dims()`, by passing the new kwarg - `create_1d_index=False`. (:pull:`8960`) + `create_1d_index=False`. (:pull:`8960`) By `Tom Nicholas `_. Breaking changes From 2ce3decebb0248ff9b3fe92de38f14a293ef08b7 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sat, 20 Apr 2024 01:32:05 -0400 Subject: [PATCH 27/64] Revert "refactor tests to consider data variables and coordinate variables separately" This reverts commit ba5627eebf7b580d0a0b9a171f1f94d7412662e3. --- xarray/tests/test_dataset.py | 27 +++++++-------------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 8c76e1ffd24..020af51a5f8 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -3431,34 +3431,21 @@ def test_expand_dims_kwargs_python36plus(self) -> None: ) assert_identical(other_way_expected, other_way) - @pytest.mark.parametrize("create_1d_index_flag", [True, False]) - def test_expand_dims_create_index_data_variable(self, create_1d_index_flag): + def test_expand_dims_creates_indexvariable(self): # data variables should not gain an index ever - ds = Dataset({"x": 0}) - expanded = ds.expand_dims("x", create_1d_index=create_1d_index_flag) - - # TODO I can't just create the expected dataset directly using constructor because of GH issue 8959 - # expected = Dataset(data_vars={"x": ("x", [0])}) - expected = Dataset({"x": ("x", [0])}).drop_indexes("x").reset_coords("x") - - # TODO also can't just assert equivalence because it will fail internal invariants default indexes checks - # assert_identical(expanded, expected) - assert expected.data_vars == {"x": Variable(data=[0], dims=["x"])} - assert expanded.indexes == {} + ds = Dataset({"a": 0}) + for flag in [True, False]: + expanded = ds.expand_dims("x", create_1d_index=flag) + expected = Dataset({"a": ("x", [0])}) + assert_identical(expanded, expected) + assert expanded.indexes == {} - def test_expand_dims_create_index_coordinate_variable(self): # coordinate variables should gain an index only if create_1d_index is True (the default) ds = Dataset(coords={"x": 0}) expanded = ds.expand_dims("x") expected = Dataset({"x": ("x", [0])}) assert_identical(expanded, expected) - expanded_no_index = ds.expand_dims("x", create_1d_index=False) - expected = Dataset(coords={"x": ("x", [0])}).drop_indexes("x") - - # TODO also can't just assert equivalence because it will fail internal invariants default indexes checks - # assert_identical(expanded, expected) - assert expanded_no_index.coords == {"x": Variable(data=[0], dims=["x"])} assert expanded_no_index.indexes == {} @requires_pandas_version_two From 87a08b4c36ebd0b4cc5c613d65685e00d1030f11 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sat, 20 Apr 2024 01:32:06 -0400 Subject: [PATCH 28/64] Revert "add option to not create 1D index in expand_dims" This reverts commit 95d453ccff1d2e2746c1970c0157f2de0b582105. --- xarray/core/dataset.py | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 861356f3d19..10bf1466156 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4479,7 +4479,6 @@ def expand_dims( self, dim: None | Hashable | Sequence[Hashable] | Mapping[Any, Any] = None, axis: None | int | Sequence[int] = None, - create_1d_index: bool = True, **dim_kwargs: Any, ) -> Self: """Return a new object with an additional axis (or axes) inserted at @@ -4489,8 +4488,6 @@ def expand_dims( If dim is already a scalar coordinate, it will be promoted to a 1D coordinate consisting of a single value. - The - Parameters ---------- dim : hashable, sequence of hashable, mapping, or None @@ -4506,8 +4503,6 @@ def expand_dims( multiple axes are inserted. In this case, dim arguments should be same length list. If axis=None is passed, all the axes will be inserted to the start of the result array. - create_1d_index : bool, default is True - Whether to create new PandasIndex objects for new 1D coordinate variables. **dim_kwargs : int or sequence or ndarray The keywords are arbitrary dimensions being inserted and the values are either the lengths of the new dims (if int is given), or their @@ -4627,8 +4622,6 @@ def expand_dims( # save the coordinates to the variables dict, and set the # value within the dim dict to the length of the iterable # for later use. - - # TODO should we have an option to not create a variable here? index = PandasIndex(v, k) indexes[k] = index variables.update(index.create_variables()) @@ -4667,16 +4660,11 @@ def expand_dims( variables[k] = v.set_dims(dict(all_dims)) else: if k not in variables: - if create_1d_index: - # If dims includes a label of a non-dimension coordinate, - # it will be promoted to a 1D coordinate with a single value. - index, index_vars = create_default_index_implicit(v.set_dims(k)) - indexes[k] = index - variables.update(index_vars) - else: - # create 1D variable without creating a new index - new_1d_var = v.set_dims(k) - variables.update({k: new_1d_var}) + # If dims includes a label of a non-dimension coordinate, + # it will be promoted to a 1D coordinate with a single value. + index, index_vars = create_default_index_implicit(v.set_dims(k)) + indexes[k] = index + variables.update(index_vars) return self._replace_with_new_dims( variables, coord_names=coord_names, indexes=indexes From 214ed7d22e9a1c06331c9d5be61c9dbee83ee467 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sat, 20 Apr 2024 02:34:54 -0400 Subject: [PATCH 29/64] test that concat doesn't raise if create_1d_index=False --- xarray/tests/test_concat.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 7295941568e..87b7c5093bf 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -14,6 +14,7 @@ from xarray.tests import ( ConcatenatableArray, InaccessibleArray, + UnexpectedDataAccess, assert_array_equal, assert_equal, assert_identical, @@ -1028,6 +1029,22 @@ def test_concat_avoids_index_auto_creation(self) -> None: # nor have auto-created any indexes assert combined.indexes == {} + def test_concat_avoids_index_auto_creation_new_1d_coord(self) -> None: + # create 0D coordinates (without indexes) + datasets = [ + Dataset( + coords={"x": ConcatenatableArray(np.array(0))}, + ) + for _ in range(2) + ] + + # should not raise on concat iff create_1d_index=False + combined = concat(datasets, dim="x", create_1d_index=False) + assert combined["x"].shape == (2,) + assert combined["x"].dims == ("x",) + + # nor have auto-created any indexes + assert combined.indexes == {} class TestConcatDataArray: def test_concat(self) -> None: From 78d279830c79b7a8ddfb8de09666ed242b4b6292 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sat, 20 Apr 2024 02:40:56 -0400 Subject: [PATCH 30/64] make test pass by passing create_1d_index down through concat --- xarray/core/concat.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 5ef3c01d427..bc41ee9327d 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -70,6 +70,7 @@ def concat( fill_value=dtypes.NA, join: JoinOptions = "outer", combine_attrs: CombineAttrsOptions = "override", + create_1d_index: bool = True, ): """Concatenate xarray objects along a new or existing dimension. @@ -163,6 +164,8 @@ def concat( If a callable, it must expect a sequence of ``attrs`` dicts and a context object as its only parameters. + create_1d_index : bool, default is True + Whether to create new PandasIndex objects for any new 1D coordinate variables. Returns ------- @@ -246,6 +249,7 @@ def concat( fill_value=fill_value, join=join, combine_attrs=combine_attrs, + create_1d_index=create_1d_index, ) elif isinstance(first_obj, Dataset): return _dataset_concat( @@ -258,6 +262,7 @@ def concat( fill_value=fill_value, join=join, combine_attrs=combine_attrs, + create_1d_index=create_1d_index, ) else: raise TypeError( @@ -457,6 +462,7 @@ def _dataset_concat( fill_value: Any = dtypes.NA, join: JoinOptions = "outer", combine_attrs: CombineAttrsOptions = "override", + create_1d_index: bool = True, ) -> T_Dataset: """ Concatenate a sequence of datasets along a new or existing dimension @@ -503,7 +509,9 @@ def _dataset_concat( # case where concat dimension is a coordinate or data_var but not a dimension if (dim in coord_names or dim in data_names) and dim not in dim_names: - datasets = [ds.expand_dims(dim) for ds in datasets] + datasets = [ + ds.expand_dims(dim, create_1d_index=create_1d_index) for ds in datasets + ] # determine which variables to concatenate concat_over, equals, concat_dim_lengths = _calc_concat_over( @@ -696,6 +704,7 @@ def _dataarray_concat( fill_value: object = dtypes.NA, join: JoinOptions = "outer", combine_attrs: CombineAttrsOptions = "override", + create_1d_index: bool = True, ) -> T_DataArray: from xarray.core.dataarray import DataArray @@ -732,6 +741,7 @@ def _dataarray_concat( fill_value=fill_value, join=join, combine_attrs=combine_attrs, + create_1d_index=create_1d_index, ) merged_attrs = merge_attrs([da.attrs for da in arrays], combine_attrs) From fc206b093a9951bb167621561d6342ef18b69adf Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sat, 20 Apr 2024 03:23:48 -0400 Subject: [PATCH 31/64] assert that an UnexpectedDataAccess error is raised when create_1d_index=True --- xarray/tests/test_concat.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 87b7c5093bf..dad6e9e965f 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -1033,11 +1033,16 @@ def test_concat_avoids_index_auto_creation_new_1d_coord(self) -> None: # create 0D coordinates (without indexes) datasets = [ Dataset( - coords={"x": ConcatenatableArray(np.array(0))}, + coords={"x": ConcatenatableArray(np.array(10))}, ) for _ in range(2) ] - + + # TODO this should raise an error, but it doesn't + # It's somehow cheating it's way around the UnexpectedDataAccess error that should be raised + with pytest.raises(UnexpectedDataAccess): + combined = concat(datasets, dim="x", create_1d_index=True) + # should not raise on concat iff create_1d_index=False combined = concat(datasets, dim="x", create_1d_index=False) assert combined["x"].shape == (2,) @@ -1046,6 +1051,7 @@ def test_concat_avoids_index_auto_creation_new_1d_coord(self) -> None: # nor have auto-created any indexes assert combined.indexes == {} + class TestConcatDataArray: def test_concat(self) -> None: ds = Dataset( From ce797f126da5edd718f6b2a3b1b43b79c257c95e Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sat, 20 Apr 2024 03:25:20 -0400 Subject: [PATCH 32/64] eliminate possibility of xarray internals bypassing UnexpectedDataAccess error by accessing .array --- xarray/tests/__init__.py | 42 +++++++++++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 8ce2fb44174..e336252d4e1 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -234,7 +234,7 @@ def concatenate( if any(not isinstance(arr, ConcatenatableArray) for arr in arrays): raise TypeError - result = np.concatenate([arr.array for arr in arrays], axis=axis) + result = np.concatenate([arr._array for arr in arrays], axis=axis) return ConcatenatableArray(result) @@ -243,7 +243,7 @@ def stack(arrays: Iterable[ConcatenatableArray], /, *, axis=0) -> Concatenatable if any(not isinstance(arr, ConcatenatableArray) for arr in arrays): raise TypeError - result = np.stack([arr.array for arr in arrays], axis=axis) + result = np.stack([arr._array for arr in arrays], axis=axis) return ConcatenatableArray(result) @@ -267,19 +267,33 @@ def broadcast_to( if not isinstance(x, ConcatenatableArray): raise TypeError - result = np.broadcast_to(x.array, shape=shape) + result = np.broadcast_to(x._array, shape=shape) return ConcatenatableArray(result) -class ConcatenatableArray(utils.NDArrayMixin): +class ConcatenatableArray: """Disallows loading or coercing to an index but does support concatenation / stacking.""" - # TODO only reason this is different from InaccessibleArray is to avoid it being a subclass of ExplicitlyIndexed - HANDLED_ARRAY_FUNCTIONS = [concatenate, stack, result_type] def __init__(self, array): - self.array = array + # use ._array instead of .array because we don't want this to be accessible even to xarray's internals (e.g. create_default_index_implicit) + self._array = array + + @property + def dtype(self: Any) -> np.dtype: + return self._array.dtype + + @property + def shape(self: Any) -> tuple[int, ...]: + return self._array.shape + + @property + def ndim(self: Any) -> int: + return self._array.ndim + + def __repr__(self: Any) -> str: + return f"{type(self).__name__}(array={self._array!r})" def get_duck_array(self): raise UnexpectedDataAccess("Tried accessing data") @@ -287,8 +301,18 @@ def get_duck_array(self): def __array__(self, dtype: np.typing.DTypeLike = None): raise UnexpectedDataAccess("Tried accessing data") - def __getitem__(self, key): - raise UnexpectedDataAccess("Tried accessing data.") + def __getitem__(self, key) -> ConcatenatableArray: + """Some cases of concat require supporting expanding dims by dimensions of size 1""" + # see https://data-apis.org/array-api/2022.12/API_specification/indexing.html#multi-axis-indexing + arr = self._array + for axis, indexer_1d in enumerate(key): + if indexer_1d is None: + arr = np.expand_dims(arr, axis) + elif indexer_1d is Ellipsis: + pass + else: + raise UnexpectedDataAccess("Tried accessing data.") + return arr def __array_function__(self, func, types, args, kwargs) -> Any: if func not in HANDLED_ARRAY_FUNCTIONS: From 62e750f46c55209b904cc14ece174fa58cce546a Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Fri, 26 Apr 2024 11:31:59 -0400 Subject: [PATCH 33/64] update tests to use private versions of assertions --- xarray/tests/test_dataset.py | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index feee69bf49a..4b1e2141750 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -11,7 +11,6 @@ from typing import Any, Literal import numpy as np -import numpy.testing as npt import pandas as pd import pytest from pandas.core.indexes.datetimes import DatetimeIndex @@ -3438,13 +3437,10 @@ def test_expand_dims_create_index_data_variable(self, create_1d_index_flag): ds = Dataset({"x": 0}) expanded = ds.expand_dims("x", create_1d_index=create_1d_index_flag) - # TODO I can't just create the expected dataset directly using constructor because of GH issue 8959 - # expected = Dataset(data_vars={"x": ("x", [0])}) + # TODO Can't just create the expected dataset directly using constructor because of GH issue 8959 expected = Dataset({"x": ("x", [0])}).drop_indexes("x").reset_coords("x") - # TODO also can't just assert equivalence because it will fail internal invariants default indexes checks - # assert_identical(expanded, expected) - assert expected.data_vars == {"x": Variable(data=[0], dims=["x"])} + assert_identical(expanded, expected, check_default_indexes=False) assert expanded.indexes == {} def test_expand_dims_create_index_coordinate_variable(self): @@ -3455,11 +3451,11 @@ def test_expand_dims_create_index_coordinate_variable(self): assert_identical(expanded, expected) expanded_no_index = ds.expand_dims("x", create_1d_index=False) + + # TODO Can't just create the expected dataset directly using constructor because of GH issue 8959 expected = Dataset(coords={"x": ("x", [0])}).drop_indexes("x") - # TODO also can't just assert equivalence because it will fail internal invariants default indexes checks - # assert_identical(expanded, expected) - assert expanded_no_index.coords == {"x": Variable(data=[0], dims=["x"])} + assert_identical(expanded_no_index, expected, check_default_indexes=False) assert expanded_no_index.indexes == {} def test_expand_dims_create_index_from_iterable(self): @@ -3469,13 +3465,11 @@ def test_expand_dims_create_index_from_iterable(self): assert_identical(expanded, expected) expanded_no_index = ds.expand_dims(x=[0, 1], create_1d_index=False) + + # TODO Can't just create the expected dataset directly using constructor because of GH issue 8959 expected = Dataset(coords={"x": ("x", [0, 1])}).drop_indexes("x") - # TODO also can't just assert equivalence because it will fail internal invariants default indexes checks - # assert_identical(expanded, expected) - assert list(expanded_no_index.coords) == ["x"] - assert isinstance(expanded_no_index.coords["x"].variable, Variable) - npt.assert_array_equal(expanded_no_index.coords["x"].data, np.array([0, 1])) + assert_identical(expanded, expected, check_default_indexes=False) assert expanded_no_index.indexes == {} @requires_pandas_version_two From f86c82f1c9efdf0b207f1fb537aea2cefe287c82 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Fri, 26 Apr 2024 11:34:37 -0400 Subject: [PATCH 34/64] create_1d_index->create_index --- xarray/core/dataarray.py | 10 ++++------ xarray/core/dataset.py | 10 +++++----- xarray/tests/test_dataset.py | 12 ++++++------ 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 818275dedd3..41c9af1bb10 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -2557,7 +2557,7 @@ def expand_dims( self, dim: None | Hashable | Sequence[Hashable] | Mapping[Any, Any] = None, axis: None | int | Sequence[int] = None, - create_1d_index: bool = True, + create_index: bool = True, **dim_kwargs: Any, ) -> Self: """Return a new object with an additional axis (or axes) inserted at @@ -2568,7 +2568,7 @@ def expand_dims( coordinate consisting of a single value. The automatic creation of indexes to back new 1D coordinate variables - controlled by the create_1d_index kwarg. + controlled by the create_index kwarg. Parameters ---------- @@ -2585,7 +2585,7 @@ def expand_dims( multiple axes are inserted. In this case, dim arguments should be same length list. If axis=None is passed, all the axes will be inserted to the start of the result array. - create_1d_index : bool, default is True + create_index : bool, default is True Whether to create new PandasIndex objects for any new 1D coordinate variables. **dim_kwargs : int or sequence or ndarray The keywords are arbitrary dimensions being inserted and the values @@ -2650,9 +2650,7 @@ def expand_dims( dim = {dim: 1} dim = either_dict_or_kwargs(dim, dim_kwargs, "expand_dims") - ds = self._to_temp_dataset().expand_dims( - dim, axis, create_1d_index=create_1d_index - ) + ds = self._to_temp_dataset().expand_dims(dim, axis, create_index=create_index) return self._from_temp_dataset(ds) def set_index( diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 25cf8263bf5..a2bc3505366 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4497,7 +4497,7 @@ def expand_dims( self, dim: None | Hashable | Sequence[Hashable] | Mapping[Any, Any] = None, axis: None | int | Sequence[int] = None, - create_1d_index: bool = True, + create_index: bool = True, **dim_kwargs: Any, ) -> Self: """Return a new object with an additional axis (or axes) inserted at @@ -4508,7 +4508,7 @@ def expand_dims( coordinate consisting of a single value. The automatic creation of indexes to back new 1D coordinate variables - controlled by the create_1d_index kwarg. + controlled by the create_index kwarg. Parameters ---------- @@ -4525,7 +4525,7 @@ def expand_dims( multiple axes are inserted. In this case, dim arguments should be same length list. If axis=None is passed, all the axes will be inserted to the start of the result array. - create_1d_index : bool, default is True + create_index : bool, default is True Whether to create new PandasIndex objects for any new 1D coordinate variables. **dim_kwargs : int or sequence or ndarray The keywords are arbitrary dimensions being inserted and the values @@ -4647,7 +4647,7 @@ def expand_dims( # value within the dim dict to the length of the iterable # for later use. - if create_1d_index: + if create_index: index = PandasIndex(v, k) indexes[k] = index name_and_new_1d_var = index.create_variables() @@ -4689,7 +4689,7 @@ def expand_dims( variables[k] = v.set_dims(dict(all_dims)) else: if k not in variables: - if k in coord_names and create_1d_index: + if k in coord_names and create_index: # If dims includes a label of a non-dimension coordinate, # it will be promoted to a 1D coordinate with a single value. index, index_vars = create_default_index_implicit(v.set_dims(k)) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 4b1e2141750..e4251ced7a4 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -3431,11 +3431,11 @@ def test_expand_dims_kwargs_python36plus(self) -> None: ) assert_identical(other_way_expected, other_way) - @pytest.mark.parametrize("create_1d_index_flag", [True, False]) - def test_expand_dims_create_index_data_variable(self, create_1d_index_flag): + @pytest.mark.parametrize("create_index_flag", [True, False]) + def test_expand_dims_create_index_data_variable(self, create_index_flag): # data variables should not gain an index ever ds = Dataset({"x": 0}) - expanded = ds.expand_dims("x", create_1d_index=create_1d_index_flag) + expanded = ds.expand_dims("x", create_index=create_index_flag) # TODO Can't just create the expected dataset directly using constructor because of GH issue 8959 expected = Dataset({"x": ("x", [0])}).drop_indexes("x").reset_coords("x") @@ -3444,13 +3444,13 @@ def test_expand_dims_create_index_data_variable(self, create_1d_index_flag): assert expanded.indexes == {} def test_expand_dims_create_index_coordinate_variable(self): - # coordinate variables should gain an index only if create_1d_index is True (the default) + # coordinate variables should gain an index only if create_index is True (the default) ds = Dataset(coords={"x": 0}) expanded = ds.expand_dims("x") expected = Dataset({"x": ("x", [0])}) assert_identical(expanded, expected) - expanded_no_index = ds.expand_dims("x", create_1d_index=False) + expanded_no_index = ds.expand_dims("x", create_index=False) # TODO Can't just create the expected dataset directly using constructor because of GH issue 8959 expected = Dataset(coords={"x": ("x", [0])}).drop_indexes("x") @@ -3464,7 +3464,7 @@ def test_expand_dims_create_index_from_iterable(self): expected = Dataset({"x": ("x", [0, 1])}) assert_identical(expanded, expected) - expanded_no_index = ds.expand_dims(x=[0, 1], create_1d_index=False) + expanded_no_index = ds.expand_dims(x=[0, 1], create_index=False) # TODO Can't just create the expected dataset directly using constructor because of GH issue 8959 expected = Dataset(coords={"x": ("x", [0, 1])}).drop_indexes("x") From d5d90fdd27673537d47d7acae883571e0604ee91 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 26 Apr 2024 09:38:47 -0600 Subject: [PATCH 35/64] Update doc/whats-new.rst Co-authored-by: Deepak Cherian --- doc/whats-new.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index ce05e41243a..2231f0a7d4c 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -30,7 +30,7 @@ New Features then, such as broadcasting. By `Ilan Gold `_. - Added the option to avoid automatically creating 1D pandas indexes in :py:meth:`Dataset.expand_dims()`, by passing the new kwarg - `create_1d_index=False`. (:pull:`8960`) + `create_index=False`. (:pull:`8960`) By `Tom Nicholas `_. Breaking changes From 5bb88b81d7541ee581e3eff192bfdfb8502cc97e Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Fri, 26 Apr 2024 13:19:28 -0400 Subject: [PATCH 36/64] Rename create_1d_index -> create_index --- xarray/core/concat.py | 16 ++++++++-------- xarray/tests/test_concat.py | 6 +++--- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index bc41ee9327d..b05a524ff01 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -70,7 +70,7 @@ def concat( fill_value=dtypes.NA, join: JoinOptions = "outer", combine_attrs: CombineAttrsOptions = "override", - create_1d_index: bool = True, + create_index: bool = True, ): """Concatenate xarray objects along a new or existing dimension. @@ -164,7 +164,7 @@ def concat( If a callable, it must expect a sequence of ``attrs`` dicts and a context object as its only parameters. - create_1d_index : bool, default is True + create_index : bool, default is True Whether to create new PandasIndex objects for any new 1D coordinate variables. Returns @@ -249,7 +249,7 @@ def concat( fill_value=fill_value, join=join, combine_attrs=combine_attrs, - create_1d_index=create_1d_index, + create_index=create_index, ) elif isinstance(first_obj, Dataset): return _dataset_concat( @@ -262,7 +262,7 @@ def concat( fill_value=fill_value, join=join, combine_attrs=combine_attrs, - create_1d_index=create_1d_index, + create_index=create_index, ) else: raise TypeError( @@ -462,7 +462,7 @@ def _dataset_concat( fill_value: Any = dtypes.NA, join: JoinOptions = "outer", combine_attrs: CombineAttrsOptions = "override", - create_1d_index: bool = True, + create_index: bool = True, ) -> T_Dataset: """ Concatenate a sequence of datasets along a new or existing dimension @@ -510,7 +510,7 @@ def _dataset_concat( # case where concat dimension is a coordinate or data_var but not a dimension if (dim in coord_names or dim in data_names) and dim not in dim_names: datasets = [ - ds.expand_dims(dim, create_1d_index=create_1d_index) for ds in datasets + ds.expand_dims(dim, create_index=create_index) for ds in datasets ] # determine which variables to concatenate @@ -704,7 +704,7 @@ def _dataarray_concat( fill_value: object = dtypes.NA, join: JoinOptions = "outer", combine_attrs: CombineAttrsOptions = "override", - create_1d_index: bool = True, + create_index: bool = True, ) -> T_DataArray: from xarray.core.dataarray import DataArray @@ -741,7 +741,7 @@ def _dataarray_concat( fill_value=fill_value, join=join, combine_attrs=combine_attrs, - create_1d_index=create_1d_index, + create_index=create_index, ) merged_attrs = merge_attrs([da.attrs for da in arrays], combine_attrs) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index dad6e9e965f..e4af4f29358 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -1041,10 +1041,10 @@ def test_concat_avoids_index_auto_creation_new_1d_coord(self) -> None: # TODO this should raise an error, but it doesn't # It's somehow cheating it's way around the UnexpectedDataAccess error that should be raised with pytest.raises(UnexpectedDataAccess): - combined = concat(datasets, dim="x", create_1d_index=True) + combined = concat(datasets, dim="x", create_index=True) - # should not raise on concat iff create_1d_index=False - combined = concat(datasets, dim="x", create_1d_index=False) + # should not raise on concat iff create_index=False + combined = concat(datasets, dim="x", create_index=False) assert combined["x"].shape == (2,) assert combined["x"].dims == ("x",) From 1d471b1197802a0ff6748c04d657936ad9358c45 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Fri, 26 Apr 2024 13:41:17 -0400 Subject: [PATCH 37/64] fix ConcatenatableArray --- xarray/tests/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index f6756c3b21f..6a31c753e22 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -311,7 +311,7 @@ def __getitem__(self, key) -> ConcatenatableArray: pass else: raise UnexpectedDataAccess("Tried accessing data.") - return arr + return ConcatenatableArray(arr) def __array_function__(self, func, types, args, kwargs) -> Any: if func not in HANDLED_ARRAY_FUNCTIONS: From 766605d855443f8c302bc130d0fe47b91db77092 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Fri, 26 Apr 2024 13:41:50 -0400 Subject: [PATCH 38/64] formatting --- xarray/core/concat.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index b05a524ff01..8cb6218687c 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -509,9 +509,7 @@ def _dataset_concat( # case where concat dimension is a coordinate or data_var but not a dimension if (dim in coord_names or dim in data_names) and dim not in dim_names: - datasets = [ - ds.expand_dims(dim, create_index=create_index) for ds in datasets - ] + datasets = [ds.expand_dims(dim, create_index=create_index) for ds in datasets] # determine which variables to concatenate concat_over, equals, concat_dim_lengths = _calc_concat_over( From 10c0ed59c6e64f6adfa53e7bf25fdf3e79f1ce91 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Fri, 26 Apr 2024 13:48:18 -0400 Subject: [PATCH 39/64] whatsnew --- doc/whats-new.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index e71bda46803..4df03becb18 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -32,6 +32,9 @@ New Features - Added the option to avoid automatically creating 1D pandas indexes in :py:meth:`Dataset.expand_dims()`, by passing the new kwarg `create_index=False`. (:pull:`8960`) By `Tom Nicholas `_. +- Avoid automatically re-creating 1D pandas indexes in :py:func:`concat()`. Also optionally avoid creating 1D indexes for + new dimension coordinates by the new kwarg `create_index=False`. (:pull:`8872`) + By `Tom Nicholas `_. Breaking changes ~~~~~~~~~~~~~~~~ From 51eea5d37f3fc5e856eeff3be087d0d26e0c9962 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Fri, 26 Apr 2024 13:50:52 -0400 Subject: [PATCH 40/64] add new create_index kwarg to overloads --- xarray/core/concat.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 8cb6218687c..c615a3c192e 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -43,6 +43,7 @@ def concat( fill_value: object = dtypes.NA, join: JoinOptions = "outer", combine_attrs: CombineAttrsOptions = "override", + create_index: bool = True, ) -> T_Dataset: ... @@ -57,6 +58,7 @@ def concat( fill_value: object = dtypes.NA, join: JoinOptions = "outer", combine_attrs: CombineAttrsOptions = "override", + create_index: bool = True, ) -> T_DataArray: ... From bde9f2bd4a0831166b2cf08604e689a5a2c9184d Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Fri, 26 Apr 2024 13:58:53 -0400 Subject: [PATCH 41/64] split vars into data_vars and coord_vars in one loop --- xarray/core/concat.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index c615a3c192e..f8c8e94c427 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -661,11 +661,13 @@ def get_indexes(name): f"Variables {absent_coord_names!r} are coordinates in some datasets but not others." ) - coord_vars = { - name: result_var - for name, result_var in result_vars.items() - if name in coord_names - } + result_data_vars = {} + coord_vars = {} + for name, result_var in result_vars.items(): + if name in coord_names: + coord_vars[name] = result_var + else: + result_data_vars[name] = result_var if index is not None: if dim_var is not None: @@ -680,12 +682,6 @@ def get_indexes(name): # TODO: add indexes at Dataset creation (when it is supported) coords = Coordinates(coord_vars, indexes=result_indexes) - result_data_vars = { - name: result_var - for name, result_var in result_vars.items() - if name not in coord_names - } - result = type(datasets[0])(result_data_vars, coords=coords, attrs=result_attrs) result.encoding = result_encoding From d5241ce7994625fb647b92f4afd34d2caaeb8948 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Fri, 26 Apr 2024 14:13:35 -0400 Subject: [PATCH 42/64] avoid mypy error by using new variable name --- xarray/core/concat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index f8c8e94c427..7bbb0283d19 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -680,9 +680,9 @@ def get_indexes(name): unlabeled_dims = unlabeled_dims - set([dim]) # TODO: add indexes at Dataset creation (when it is supported) - coords = Coordinates(coord_vars, indexes=result_indexes) + coords_obj = Coordinates(coord_vars, indexes=result_indexes) - result = type(datasets[0])(result_data_vars, coords=coords, attrs=result_attrs) + result = type(datasets[0])(result_data_vars, coords=coords_obj, attrs=result_attrs) result.encoding = result_encoding result = result.drop_vars(unlabeled_dims, errors="ignore") From 7e8f895c2746e2a75979b5675967b434aef09ee6 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Fri, 26 Apr 2024 23:31:08 -0400 Subject: [PATCH 43/64] warn if create_index=True but no index created because dimension variable was a data var not a coord --- xarray/core/dataset.py | 7 +++++++ xarray/tests/test_dataset.py | 7 ++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index a2bc3505366..3985d2cad1a 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4696,6 +4696,13 @@ def expand_dims( indexes[k] = index variables.update(index_vars) else: + if create_index: + warnings.warn( + f"No index created for dimension {k} because variable {k} is not a coordinate. " + f"To create an index for {k}, please first call `.set_coords({k})` on this object.", + UserWarning, + ) + # create 1D variable without creating a new index new_1d_var = v.set_dims(k) variables.update({k: new_1d_var}) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index e4251ced7a4..17ef63d826e 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -3435,7 +3435,12 @@ def test_expand_dims_kwargs_python36plus(self) -> None: def test_expand_dims_create_index_data_variable(self, create_index_flag): # data variables should not gain an index ever ds = Dataset({"x": 0}) - expanded = ds.expand_dims("x", create_index=create_index_flag) + + if create_index_flag: + with pytest.warns(UserWarning, match="No index created"): + expanded = ds.expand_dims("x", create_index=create_index_flag) + else: + expanded = ds.expand_dims("x", create_index=create_index_flag) # TODO Can't just create the expected dataset directly using constructor because of GH issue 8959 expected = Dataset({"x": ("x", [0])}).drop_indexes("x").reset_coords("x") From ed85446d54ea8791f9224b71da833fd0f5d9c1eb Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Fri, 26 Apr 2024 23:32:53 -0400 Subject: [PATCH 44/64] add string marks in warning message --- xarray/core/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 3985d2cad1a..4f9125a1ab0 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4699,7 +4699,7 @@ def expand_dims( if create_index: warnings.warn( f"No index created for dimension {k} because variable {k} is not a coordinate. " - f"To create an index for {k}, please first call `.set_coords({k})` on this object.", + f"To create an index for {k}, please first call `.set_coords('{k}')` on this object.", UserWarning, ) From 5894724df6bf3412396acca05e47cd2aa7dd3fad Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 29 Apr 2024 11:33:35 -0400 Subject: [PATCH 45/64] regression test for dtype changing in to_stacked_array --- xarray/tests/test_dataset.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 40cf85484da..006a876d37f 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -3970,6 +3970,25 @@ def test_to_stacked_array_to_unstacked_dataset_different_dimension(self) -> None x = y.to_unstacked_dataset("features") assert_identical(D, x) + def test_to_stacked_array_preserves_dtype(self) -> None: + # regression test for bug found in https://github.com/pydata/xarray/pull/8872#issuecomment-2081218616 + ds = xr.Dataset( + data_vars={ + "a": (("x", "y"), [[0, 1, 2], [3, 4, 5]]), + "b": ("x", [6, 7]), + }, + coords={"y": ["u", "v", "w"]}, + ) + stacked = ds.to_stacked_array("z", sample_dims=["x"]) + + # coordinate created from variables names should be of string dtype + data = np.array(["a", "a", "a", "b"], dtype=" None: data = create_test_data(seed=0) expected = data.copy() From dad94332f3083a17ec57b5dfea4d93aba3513abc Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 29 Apr 2024 11:34:56 -0400 Subject: [PATCH 46/64] correct doctest --- xarray/core/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 4f9125a1ab0..7e6741257a4 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -5383,7 +5383,7 @@ def to_stacked_array( [3, 4, 5, 7]]) Coordinates: * z (z) object 32B MultiIndex - * variable (z) object 32B 'a' 'a' 'a' 'b' + * variable (z) Date: Mon, 29 Apr 2024 10:27:09 -0600 Subject: [PATCH 47/64] Remove outdated comment --- xarray/tests/test_concat.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index e4af4f29358..24a17ceebc1 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -1038,10 +1038,8 @@ def test_concat_avoids_index_auto_creation_new_1d_coord(self) -> None: for _ in range(2) ] - # TODO this should raise an error, but it doesn't - # It's somehow cheating it's way around the UnexpectedDataAccess error that should be raised with pytest.raises(UnexpectedDataAccess): - combined = concat(datasets, dim="x", create_index=True) + concat(datasets, dim="x", create_index=True) # should not raise on concat iff create_index=False combined = concat(datasets, dim="x", create_index=False) From e17c13f6cb4954a94c037a4e2fb4c73d04ebdec3 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 29 Apr 2024 12:34:14 -0400 Subject: [PATCH 48/64] test we can skip creation of indexes during shape promotion --- xarray/tests/test_concat.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index e4af4f29358..3c11db00739 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -1051,6 +1051,15 @@ def test_concat_avoids_index_auto_creation_new_1d_coord(self) -> None: # nor have auto-created any indexes assert combined.indexes == {} + def test_concat_promote_shape_without_creating_new_index(self) -> None: + # different shapes but neither have indexes + ds1 = Dataset(coords={"x": 0}) + ds2 = Dataset(data_vars={"x": [1]}).drop_indexes("x") + actual = concat([ds1, ds2], dim="x", create_index=False) + expected = Dataset(data_vars={"x": [0, 1]}).drop_indexes("x") + assert_identical(actual, expected, check_default_indexes=False) + assert actual.indexes == {} + class TestConcatDataArray: def test_concat(self) -> None: From e8fa857c0b4bf9ddd7909b6d202eddacc4e21173 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 29 Apr 2024 12:35:44 -0400 Subject: [PATCH 49/64] make shape promotion test pass --- xarray/core/concat.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 7bbb0283d19..b530a1e2ca2 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -576,7 +576,8 @@ def get_indexes(name): var = ds._variables[name] if not var.dims: data = var.set_dims(dim).values - yield PandasIndex(data, dim, coord_dtype=var.dtype) + if create_index: + yield PandasIndex(data, dim, coord_dtype=var.dtype) # create concatenation index, needed for later reindexing file_start_indexes = np.append(0, np.cumsum(concat_dim_lengths)) From 6dd57a976f6d7d98dc5dfccd816ab29fc68dfde0 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 29 Apr 2024 12:48:31 -0400 Subject: [PATCH 50/64] point to issue in whatsnew --- doc/whats-new.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 0ca4253ee2b..c0bea023639 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -32,8 +32,8 @@ New Features - Added the option to avoid automatically creating 1D pandas indexes in :py:meth:`Dataset.expand_dims()`, by passing the new kwarg `create_index=False`. (:pull:`8960`) By `Tom Nicholas `_. -- Avoid automatically re-creating 1D pandas indexes in :py:func:`concat()`. Also optionally avoid creating 1D indexes for - new dimension coordinates by the new kwarg `create_index=False`. (:pull:`8872`) +- Avoid automatically re-creating 1D pandas indexes in :py:func:`concat()`. Also added option to avoid creating 1D indexes for + new dimension coordinates by passing the new kwarg `create_index=False`. (:issue:`8871`, :pull:`8872`) By `Tom Nicholas `_. Breaking changes From b0e36124c04255378ee3f7928fb5d0276f628f89 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Wed, 1 May 2024 14:49:07 -0400 Subject: [PATCH 51/64] don't create dimension coordinates just to drop them at the end --- xarray/core/concat.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index b530a1e2ca2..d4dae1a8657 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -447,7 +447,7 @@ def _parse_datasets( if dim in dims: continue - if dim not in dim_coords: + if dim in ds.coords and dim not in dim_coords: dim_coords[dim] = ds.coords[dim].variable dims = dims | set(ds.dims) @@ -686,8 +686,6 @@ def get_indexes(name): result = type(datasets[0])(result_data_vars, coords=coords_obj, attrs=result_attrs) result.encoding = result_encoding - result = result.drop_vars(unlabeled_dims, errors="ignore") - return result From ff70fc7b1b52f225cba199c838a738d46d71f0cc Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Wed, 1 May 2024 12:57:45 -0600 Subject: [PATCH 52/64] Remove ToDo about not using Coordinates object to pass indexes Co-authored-by: Deepak Cherian --- xarray/core/concat.py | 1 - 1 file changed, 1 deletion(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index d4dae1a8657..d67ceba9f0c 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -680,7 +680,6 @@ def get_indexes(name): result_indexes[dim] = index unlabeled_dims = unlabeled_dims - set([dim]) - # TODO: add indexes at Dataset creation (when it is supported) coords_obj = Coordinates(coord_vars, indexes=result_indexes) result = type(datasets[0])(result_data_vars, coords=coords_obj, attrs=result_attrs) From 2f97a5cfe45677300711fa6e690856e1bbb81768 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Wed, 1 May 2024 15:06:57 -0400 Subject: [PATCH 53/64] get rid of unlabeled_dims variable entirely --- xarray/core/concat.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index d67ceba9f0c..79dc906c099 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -498,7 +498,6 @@ def _dataset_concat( datasets ) dim_names = set(dim_coords) - unlabeled_dims = dim_names - coord_names both_data_and_coords = coord_names & data_names if both_data_and_coords: @@ -519,7 +518,7 @@ def _dataset_concat( ) # determine which variables to merge, and then merge them according to compat - variables_to_merge = (coord_names | data_names) - concat_over - unlabeled_dims + variables_to_merge = (coord_names | data_names) - concat_over result_vars = {} result_indexes = {} @@ -678,7 +677,6 @@ def get_indexes(name): coord_vars[dim] = index_vars[dim] result_indexes[dim] = index - unlabeled_dims = unlabeled_dims - set([dim]) coords_obj = Coordinates(coord_vars, indexes=result_indexes) From 6d825e5d97088037933c3242d136b9c41488fd07 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Wed, 8 May 2024 14:11:38 -0400 Subject: [PATCH 54/64] move ConcatenatableArray and similar to new file --- xarray/tests/arrays.py | 176 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 176 insertions(+) create mode 100644 xarray/tests/arrays.py diff --git a/xarray/tests/arrays.py b/xarray/tests/arrays.py new file mode 100644 index 00000000000..2df945d2b22 --- /dev/null +++ b/xarray/tests/arrays.py @@ -0,0 +1,176 @@ +from typing import Callable, Any +from collections.abc import Iterable + +import numpy as np +from xarray.core import utils +from xarray.core.indexing import ExplicitlyIndexed + +""" +This module contains various lazy array classes which can be wrapped and manipulated by xarray objects but will raise on data access. +""" + + +class UnexpectedDataAccess(Exception): + pass + + +class InaccessibleArray(utils.NDArrayMixin, ExplicitlyIndexed): + """Disallows any loading.""" + + def __init__(self, array): + self.array = array + + def get_duck_array(self): + raise UnexpectedDataAccess("Tried accessing data") + + def __array__(self, dtype: np.typing.DTypeLike = None): + raise UnexpectedDataAccess("Tried accessing data") + + def __getitem__(self, key): + raise UnexpectedDataAccess("Tried accessing data.") + + +class FirstElementAccessibleArray(InaccessibleArray): + def __getitem__(self, key): + tuple_idxr = key.tuple + if len(tuple_idxr) > 1: + raise UnexpectedDataAccess("Tried accessing more than one element.") + return self.array[tuple_idxr] + + +class DuckArrayWrapper(utils.NDArrayMixin): + """Array-like that prevents casting to array. + Modeled after cupy.""" + + def __init__(self, array: np.ndarray): + self.array = array + + def __getitem__(self, key): + return type(self)(self.array[key]) + + def __array__(self, dtype: np.typing.DTypeLike = None): + raise UnexpectedDataAccess("Tried accessing data") + + def __array_namespace__(self): + """Present to satisfy is_duck_array test.""" + + +CONCATENATABLEARRAY_HANDLED_ARRAY_FUNCTIONS: dict[str, Callable] = {} + + +def implements(numpy_function): + """Register an __array_function__ implementation for ConcatenatableArray objects.""" + + def decorator(func): + CONCATENATABLEARRAY_HANDLED_ARRAY_FUNCTIONS[numpy_function] = func + return func + + return decorator + + +@implements(np.concatenate) +def concatenate( + arrays: Iterable["ConcatenatableArray"], /, *, axis=0 +) -> "ConcatenatableArray": + if any(not isinstance(arr, ConcatenatableArray) for arr in arrays): + raise TypeError + + result = np.concatenate([arr._array for arr in arrays], axis=axis) + return ConcatenatableArray(result) + + +@implements(np.stack) +def stack(arrays: Iterable["ConcatenatableArray"], /, *, axis=0) -> "ConcatenatableArray": + if any(not isinstance(arr, ConcatenatableArray) for arr in arrays): + raise TypeError + + result = np.stack([arr._array for arr in arrays], axis=axis) + return ConcatenatableArray(result) + + +@implements(np.result_type) +def result_type(*arrays_and_dtypes) -> np.dtype: + """Called by xarray to ensure all arguments to concat have the same dtype.""" + first_dtype, *other_dtypes = (np.dtype(obj) for obj in arrays_and_dtypes) + for other_dtype in other_dtypes: + if other_dtype != first_dtype: + raise ValueError("dtypes not all consistent") + return first_dtype + + +@implements(np.broadcast_to) +def broadcast_to( + x: "ConcatenatableArray", /, shape: tuple[int, ...] +) -> "ConcatenatableArray": + """ + Broadcasts an array to a specified shape, by either manipulating chunk keys or copying chunk manifest entries. + """ + if not isinstance(x, ConcatenatableArray): + raise TypeError + + result = np.broadcast_to(x._array, shape=shape) + return ConcatenatableArray(result) + + +class ConcatenatableArray: + """Disallows loading or coercing to an index but does support concatenation / stacking.""" + + def __init__(self, array): + # use ._array instead of .array because we don't want this to be accessible even to xarray's internals (e.g. create_default_index_implicit) + self._array = array + + @property + def dtype(self: Any) -> np.dtype: + return self._array.dtype + + @property + def shape(self: Any) -> tuple[int, ...]: + return self._array.shape + + @property + def ndim(self: Any) -> int: + return self._array.ndim + + def __repr__(self: Any) -> str: + return f"{type(self).__name__}(array={self._array!r})" + + def get_duck_array(self): + raise UnexpectedDataAccess("Tried accessing data") + + def __array__(self, dtype: np.typing.DTypeLike = None): + raise UnexpectedDataAccess("Tried accessing data") + + def __getitem__(self, key) -> "ConcatenatableArray": + """Some cases of concat require supporting expanding dims by dimensions of size 1""" + # see https://data-apis.org/array-api/2022.12/API_specification/indexing.html#multi-axis-indexing + arr = self._array + for axis, indexer_1d in enumerate(key): + if indexer_1d is None: + arr = np.expand_dims(arr, axis) + elif indexer_1d is Ellipsis: + pass + else: + raise UnexpectedDataAccess("Tried accessing data.") + return ConcatenatableArray(arr) + + def __array_function__(self, func, types, args, kwargs) -> Any: + if func not in CONCATENATABLEARRAY_HANDLED_ARRAY_FUNCTIONS: + return NotImplemented + + # Note: this allows subclasses that don't override + # __array_function__ to handle ManifestArray objects + if not all(issubclass(t, ConcatenatableArray) for t in types): + return NotImplemented + + return CONCATENATABLEARRAY_HANDLED_ARRAY_FUNCTIONS[func](*args, **kwargs) + + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs) -> Any: + """We have to define this in order to convince xarray that this class is a duckarray, even though we will never support ufuncs.""" + return NotImplemented + + def astype(self, dtype: np.dtype, /, *, copy: bool = True) -> "ConcatenatableArray": + """Needed because xarray will call this even when it's a no-op""" + if dtype != self.dtype: + raise NotImplementedError() + else: + return self From b88b5a6b6be486b2ecff1d1301cf513f8417e0cd Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Wed, 8 May 2024 12:13:07 -0600 Subject: [PATCH 55/64] formatting nit Co-authored-by: Justus Magin --- xarray/core/concat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 79dc906c099..3b981fa1c61 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -166,7 +166,7 @@ def concat( If a callable, it must expect a sequence of ``attrs`` dicts and a context object as its only parameters. - create_index : bool, default is True + create_index : bool, default: True Whether to create new PandasIndex objects for any new 1D coordinate variables. Returns From b243150372bc4969e1f663c74c14424f01f847b7 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Wed, 8 May 2024 14:25:19 -0400 Subject: [PATCH 56/64] renamed create_index -> create_index_for_new_dim in concat --- doc/whats-new.rst | 2 +- xarray/core/concat.py | 29 +++++++++++++++++------------ xarray/tests/test_concat.py | 8 ++++---- 3 files changed, 22 insertions(+), 17 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index ac172137ad3..2e035e5dd07 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -33,7 +33,7 @@ New Features `create_index=False`. (:pull:`8960`) By `Tom Nicholas `_. - Avoid automatically re-creating 1D pandas indexes in :py:func:`concat()`. Also added option to avoid creating 1D indexes for - new dimension coordinates by passing the new kwarg `create_index=False`. (:issue:`8871`, :pull:`8872`) + new dimension coordinates by passing the new kwarg `create_index_for_new_dim=False`. (:issue:`8871`, :pull:`8872`) By `Tom Nicholas `_. Breaking changes diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 3b981fa1c61..28d214881c5 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -43,7 +43,7 @@ def concat( fill_value: object = dtypes.NA, join: JoinOptions = "outer", combine_attrs: CombineAttrsOptions = "override", - create_index: bool = True, + create_index_for_new_dim: bool = True, ) -> T_Dataset: ... @@ -58,7 +58,7 @@ def concat( fill_value: object = dtypes.NA, join: JoinOptions = "outer", combine_attrs: CombineAttrsOptions = "override", - create_index: bool = True, + create_index_for_new_dim: bool = True, ) -> T_DataArray: ... @@ -72,7 +72,7 @@ def concat( fill_value=dtypes.NA, join: JoinOptions = "outer", combine_attrs: CombineAttrsOptions = "override", - create_index: bool = True, + create_index_for_new_dim: bool = True, ): """Concatenate xarray objects along a new or existing dimension. @@ -166,8 +166,8 @@ def concat( If a callable, it must expect a sequence of ``attrs`` dicts and a context object as its only parameters. - create_index : bool, default: True - Whether to create new PandasIndex objects for any new 1D coordinate variables. + create_index_for_new_dim : bool, default is True + Whether to create a new ``PandasIndex`` object when the objects being concatenated contain scalar variables named ``dim``. Returns ------- @@ -223,6 +223,8 @@ def concat( x (new_dim) T_Dataset: """ Concatenate a sequence of datasets along a new or existing dimension @@ -510,7 +512,10 @@ def _dataset_concat( # case where concat dimension is a coordinate or data_var but not a dimension if (dim in coord_names or dim in data_names) and dim not in dim_names: - datasets = [ds.expand_dims(dim, create_index=create_index) for ds in datasets] + datasets = [ + ds.expand_dims(dim, create_index=create_index_for_new_dim) + for ds in datasets + ] # determine which variables to concatenate concat_over, equals, concat_dim_lengths = _calc_concat_over( @@ -575,7 +580,7 @@ def get_indexes(name): var = ds._variables[name] if not var.dims: data = var.set_dims(dim).values - if create_index: + if create_index_for_new_dim: yield PandasIndex(data, dim, coord_dtype=var.dtype) # create concatenation index, needed for later reindexing @@ -696,7 +701,7 @@ def _dataarray_concat( fill_value: object = dtypes.NA, join: JoinOptions = "outer", combine_attrs: CombineAttrsOptions = "override", - create_index: bool = True, + create_index_for_new_dim: bool = True, ) -> T_DataArray: from xarray.core.dataarray import DataArray @@ -733,7 +738,7 @@ def _dataarray_concat( fill_value=fill_value, join=join, combine_attrs=combine_attrs, - create_index=create_index, + create_index_for_new_dim=create_index_for_new_dim, ) merged_attrs = merge_attrs([da.attrs for da in arrays], combine_attrs) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index e44e6150be9..0c570de3b52 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -1039,10 +1039,10 @@ def test_concat_avoids_index_auto_creation_new_1d_coord(self) -> None: ] with pytest.raises(UnexpectedDataAccess): - concat(datasets, dim="x", create_index=True) + concat(datasets, dim="x", create_index_for_new_dim=True) - # should not raise on concat iff create_index=False - combined = concat(datasets, dim="x", create_index=False) + # should not raise on concat iff create_index_for_new_dim=False + combined = concat(datasets, dim="x", create_index_for_new_dim=False) assert combined["x"].shape == (2,) assert combined["x"].dims == ("x",) @@ -1053,7 +1053,7 @@ def test_concat_promote_shape_without_creating_new_index(self) -> None: # different shapes but neither have indexes ds1 = Dataset(coords={"x": 0}) ds2 = Dataset(data_vars={"x": [1]}).drop_indexes("x") - actual = concat([ds1, ds2], dim="x", create_index=False) + actual = concat([ds1, ds2], dim="x", create_index_for_new_dim=False) expected = Dataset(data_vars={"x": [0, 1]}).drop_indexes("x") assert_identical(actual, expected, check_default_indexes=False) assert actual.indexes == {} From 9e9e1683591309567565d940d038c9fd7a65ad5a Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Wed, 8 May 2024 14:29:39 -0400 Subject: [PATCH 57/64] renamed create_index -> create_index_for_new_dim in expand_dims --- doc/whats-new.rst | 2 +- xarray/core/dataarray.py | 12 +++++++----- xarray/core/dataset.py | 14 +++++++------- xarray/tests/test_dataset.py | 22 ++++++++++++++-------- 4 files changed, 29 insertions(+), 21 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 2e035e5dd07..11657045d83 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -30,7 +30,7 @@ New Features then, such as broadcasting. By `Ilan Gold `_. - Added the option to avoid automatically creating 1D pandas indexes in :py:meth:`Dataset.expand_dims()`, by passing the new kwarg - `create_index=False`. (:pull:`8960`) + `create_index_for_new_dim=False`. (:pull:`8960`) By `Tom Nicholas `_. - Avoid automatically re-creating 1D pandas indexes in :py:func:`concat()`. Also added option to avoid creating 1D indexes for new dimension coordinates by passing the new kwarg `create_index_for_new_dim=False`. (:issue:`8871`, :pull:`8872`) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index d5aa4688ce1..9ef5e56152b 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -2558,7 +2558,7 @@ def expand_dims( self, dim: None | Hashable | Sequence[Hashable] | Mapping[Any, Any] = None, axis: None | int | Sequence[int] = None, - create_index: bool = True, + create_index_for_new_dim: bool = True, **dim_kwargs: Any, ) -> Self: """Return a new object with an additional axis (or axes) inserted at @@ -2569,7 +2569,7 @@ def expand_dims( coordinate consisting of a single value. The automatic creation of indexes to back new 1D coordinate variables - controlled by the create_index kwarg. + controlled by the create_index_for_new_dim kwarg. Parameters ---------- @@ -2586,8 +2586,8 @@ def expand_dims( multiple axes are inserted. In this case, dim arguments should be same length list. If axis=None is passed, all the axes will be inserted to the start of the result array. - create_index : bool, default is True - Whether to create new PandasIndex objects for any new 1D coordinate variables. + create_index_for_new_dim : bool, default is True + Whether to create new ``PandasIndex`` objects when the object being expanded contains scalar variables with names in ``dim``. **dim_kwargs : int or sequence or ndarray The keywords are arbitrary dimensions being inserted and the values are either the lengths of the new dims (if int is given), or their @@ -2651,7 +2651,9 @@ def expand_dims( dim = {dim: 1} dim = either_dict_or_kwargs(dim, dim_kwargs, "expand_dims") - ds = self._to_temp_dataset().expand_dims(dim, axis, create_index=create_index) + ds = self._to_temp_dataset().expand_dims( + dim, axis, create_index_for_new_dim=create_index_for_new_dim + ) return self._from_temp_dataset(ds) def set_index( diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 0ff7ce6c7ee..b0c5215570a 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4513,7 +4513,7 @@ def expand_dims( self, dim: None | Hashable | Sequence[Hashable] | Mapping[Any, Any] = None, axis: None | int | Sequence[int] = None, - create_index: bool = True, + create_index_for_new_dim: bool = True, **dim_kwargs: Any, ) -> Self: """Return a new object with an additional axis (or axes) inserted at @@ -4524,7 +4524,7 @@ def expand_dims( coordinate consisting of a single value. The automatic creation of indexes to back new 1D coordinate variables - controlled by the create_index kwarg. + controlled by the create_index_for_new_dim kwarg. Parameters ---------- @@ -4541,8 +4541,8 @@ def expand_dims( multiple axes are inserted. In this case, dim arguments should be same length list. If axis=None is passed, all the axes will be inserted to the start of the result array. - create_index : bool, default is True - Whether to create new PandasIndex objects for any new 1D coordinate variables. + create_index_for_new_dim : bool, default is True + Whether to create new ``PandasIndex`` objects when the object being expanded contains scalar variables with names in ``dim``. **dim_kwargs : int or sequence or ndarray The keywords are arbitrary dimensions being inserted and the values are either the lengths of the new dims (if int is given), or their @@ -4663,7 +4663,7 @@ def expand_dims( # value within the dim dict to the length of the iterable # for later use. - if create_index: + if create_index_for_new_dim: index = PandasIndex(v, k) indexes[k] = index name_and_new_1d_var = index.create_variables() @@ -4705,14 +4705,14 @@ def expand_dims( variables[k] = v.set_dims(dict(all_dims)) else: if k not in variables: - if k in coord_names and create_index: + if k in coord_names and create_index_for_new_dim: # If dims includes a label of a non-dimension coordinate, # it will be promoted to a 1D coordinate with a single value. index, index_vars = create_default_index_implicit(v.set_dims(k)) indexes[k] = index variables.update(index_vars) else: - if create_index: + if create_index_for_new_dim: warnings.warn( f"No index created for dimension {k} because variable {k} is not a coordinate. " f"To create an index for {k}, please first call `.set_coords('{k}')` on this object.", diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 6cb4ecc537b..1aec179a4ad 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -3430,16 +3430,22 @@ def test_expand_dims_kwargs_python36plus(self) -> None: ) assert_identical(other_way_expected, other_way) - @pytest.mark.parametrize("create_index_flag", [True, False]) - def test_expand_dims_create_index_data_variable(self, create_index_flag): + @pytest.mark.parametrize("create_index_for_new_dim_flag", [True, False]) + def test_expand_dims_create_index_data_variable( + self, create_index_for_new_dim_flag + ): # data variables should not gain an index ever ds = Dataset({"x": 0}) - if create_index_flag: + if create_index_for_new_dim_flag: with pytest.warns(UserWarning, match="No index created"): - expanded = ds.expand_dims("x", create_index=create_index_flag) + expanded = ds.expand_dims( + "x", create_index_for_new_dim=create_index_for_new_dim_flag + ) else: - expanded = ds.expand_dims("x", create_index=create_index_flag) + expanded = ds.expand_dims( + "x", create_index_for_new_dim=create_index_for_new_dim_flag + ) # TODO Can't just create the expected dataset directly using constructor because of GH issue 8959 expected = Dataset({"x": ("x", [0])}).drop_indexes("x").reset_coords("x") @@ -3448,13 +3454,13 @@ def test_expand_dims_create_index_data_variable(self, create_index_flag): assert expanded.indexes == {} def test_expand_dims_create_index_coordinate_variable(self): - # coordinate variables should gain an index only if create_index is True (the default) + # coordinate variables should gain an index only if create_index_for_new_dim is True (the default) ds = Dataset(coords={"x": 0}) expanded = ds.expand_dims("x") expected = Dataset({"x": ("x", [0])}) assert_identical(expanded, expected) - expanded_no_index = ds.expand_dims("x", create_index=False) + expanded_no_index = ds.expand_dims("x", create_index_for_new_dim=False) # TODO Can't just create the expected dataset directly using constructor because of GH issue 8959 expected = Dataset(coords={"x": ("x", [0])}).drop_indexes("x") @@ -3468,7 +3474,7 @@ def test_expand_dims_create_index_from_iterable(self): expected = Dataset({"x": ("x", [0, 1])}) assert_identical(expanded, expected) - expanded_no_index = ds.expand_dims(x=[0, 1], create_index=False) + expanded_no_index = ds.expand_dims(x=[0, 1], create_index_for_new_dim=False) # TODO Can't just create the expected dataset directly using constructor because of GH issue 8959 expected = Dataset(coords={"x": ("x", [0, 1])}).drop_indexes("x") From dca2fb9dcd453391bf672b04929b02e97c18bcf6 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Wed, 8 May 2024 14:52:24 -0400 Subject: [PATCH 58/64] fix incorrect arg name --- xarray/core/concat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 28d214881c5..51506a55395 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -513,7 +513,7 @@ def _dataset_concat( # case where concat dimension is a coordinate or data_var but not a dimension if (dim in coord_names or dim in data_names) and dim not in dim_names: datasets = [ - ds.expand_dims(dim, create_index=create_index_for_new_dim) + ds.expand_dims(dim, create_index_for_new_dim=create_index_for_new_dim) for ds in datasets ] From c9796726bcb6413ed690dc8e2d55e572c7bc99d0 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Wed, 8 May 2024 14:52:37 -0400 Subject: [PATCH 59/64] add example to docstring --- xarray/core/dataset.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index b0c5215570a..ab7ab990789 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4612,6 +4612,33 @@ def expand_dims( Data variables: temperature (y, x, time) float64 96B 0.5488 0.7152 0.6028 ... 0.7917 0.5289 + # Expand a scalar variable along a new dimension of the same name with and without creating a new index + + >>> ds = xr.Dataset(coords={"x": 0}) + >>> ds + Size: 8B + Dimensions: () + Coordinates: + x int64 8B 0 + Data variables: + *empty* + + >>> ds.expand_dims('x') + Size: 8B + Dimensions: (x: 1) + Coordinates: + * x (x) int64 8B 0 + Data variables: + *empty* + + >>> ds.expand_dims("x").indexes + Indexes: + x Index([0], dtype='int64', name='x') + + >>> ds.expand_dims("x", create_index_for_new_dim=False).indexes + Indexes: + *empty* + See Also -------- DataArray.expand_dims From ac27ce0a0fc7b3db4577240079343d6db78a7d2a Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Wed, 8 May 2024 14:56:02 -0400 Subject: [PATCH 60/64] add example of using new kwarg to docstring of expand_dims --- xarray/core/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index ab7ab990789..40866dff72a 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4623,7 +4623,7 @@ def expand_dims( Data variables: *empty* - >>> ds.expand_dims('x') + >>> ds.expand_dims("x") Size: 8B Dimensions: (x: 1) Coordinates: From d73ac4802c312a9d95c6f07636c7975a16216ab1 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Wed, 8 May 2024 14:56:11 -0400 Subject: [PATCH 61/64] add example of using new kwarg to docstring of concat --- xarray/core/concat.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 51506a55395..92a7768bfb9 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -224,7 +224,24 @@ def concat( * y (y) int64 24B 10 20 30 * new_dim (new_dim) int64 16B -90 -100 + # Concatenate a scalar variable along a new dimension of the same name with and without creating a new index + >>> ds = xr.Dataset(coords={"x": 0}) + >>> xr.concat([ds, ds], dim="x") + Size: 16B + Dimensions: (x: 2) + Coordinates: + * x (x) int64 16B 0 0 + Data variables: + *empty* + + >>> xr.concat([ds, ds], dim="x").indexes + Indexes: + x Index([0, 0], dtype='int64', name='x') + + >>> xr.concat([ds, ds], dim="x", create_index_for_new_dim=False).indexes + Indexes: + *empty* """ # TODO: add ignore_index arguments copied from pandas.concat # TODO: support concatenating scalar coordinates even if the concatenated From d1b656d74e3f2b78e620b536b74fd4a2c2fd721f Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Wed, 8 May 2024 12:59:10 -0600 Subject: [PATCH 62/64] re-nit the nit Co-authored-by: Justus Magin --- xarray/core/concat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 92a7768bfb9..7f7f6d91650 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -166,7 +166,7 @@ def concat( If a callable, it must expect a sequence of ``attrs`` dicts and a context object as its only parameters. - create_index_for_new_dim : bool, default is True + create_index_for_new_dim : bool, default: True Whether to create a new ``PandasIndex`` object when the objects being concatenated contain scalar variables named ``dim``. Returns From ac998e9ca72d11f33c285ed818b72a4b093f5b2a Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Wed, 8 May 2024 21:05:00 +0200 Subject: [PATCH 63/64] more instances of the nit --- xarray/core/dataarray.py | 2 +- xarray/core/dataset.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 3363b2e6de8..4dc897c1878 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -2586,7 +2586,7 @@ def expand_dims( multiple axes are inserted. In this case, dim arguments should be same length list. If axis=None is passed, all the axes will be inserted to the start of the result array. - create_index_for_new_dim : bool, default is True + create_index_for_new_dim : bool, default: True Whether to create new ``PandasIndex`` objects when the object being expanded contains scalar variables with names in ``dim``. **dim_kwargs : int or sequence or ndarray The keywords are arbitrary dimensions being inserted and the values diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index b87682bb158..4315c0a7ce0 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4541,7 +4541,7 @@ def expand_dims( multiple axes are inserted. In this case, dim arguments should be same length list. If axis=None is passed, all the axes will be inserted to the start of the result array. - create_index_for_new_dim : bool, default is True + create_index_for_new_dim : bool, default: True Whether to create new ``PandasIndex`` objects when the object being expanded contains scalar variables with names in ``dim``. **dim_kwargs : int or sequence or ndarray The keywords are arbitrary dimensions being inserted and the values From 0849b94525fcd1fbfec54e0321f9bda9182814b7 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Wed, 8 May 2024 15:05:06 -0400 Subject: [PATCH 64/64] fix docstring doctest formatting nit --- xarray/core/concat.py | 2 +- xarray/core/dataset.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 92a7768bfb9..4612b140e6c 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -231,7 +231,7 @@ def concat( Size: 16B Dimensions: (x: 2) Coordinates: - * x (x) int64 16B 0 0 + * x (x) int64 16B 0 0 Data variables: *empty* diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index b87682bb158..8395236065b 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4627,7 +4627,7 @@ def expand_dims( Size: 8B Dimensions: (x: 1) Coordinates: - * x (x) int64 8B 0 + * x (x) int64 8B 0 Data variables: *empty*