diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 0266b2dc0f5..1dcdd25ccea 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -25,6 +25,9 @@ New Features
 - Fix :py:meth:`xr.cov` and :py:meth:`xr.corr` now support complex valued arrays
   (:issue:`7340`, :pull:`7392`).
   By `Michael Niklas `_.
+- Allow indexing along unindexed dimensions with dask arrays
+  (:issue:`2511`, :issue:`4276`, :issue:`4663`, :pull:`5873`).
+  By `Abel Aoun `_ and `Deepak Cherian `_.
 - Support dask arrays in ``first`` and ``last`` reductions.
   By `Deepak Cherian `_.
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index 82d5ef088a4..0bd335f3f0a 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -73,7 +73,7 @@
 )
 from xarray.core.missing import get_clean_interp_index
 from xarray.core.options import OPTIONS, _get_keep_attrs
-from xarray.core.pycompat import array_type, is_duck_dask_array
+from xarray.core.pycompat import array_type, is_duck_array, is_duck_dask_array
 from xarray.core.types import QuantileMethods, T_Dataset
 from xarray.core.utils import (
     Default,
@@ -2292,7 +2292,8 @@ def _validate_indexers(
             elif isinstance(v, Sequence) and len(v) == 0:
                 yield k, np.empty((0,), dtype="int64")
             else:
-                v = np.asarray(v)
+                if not is_duck_array(v):
+                    v = np.asarray(v)
 
                 if v.dtype.kind in "US":
                     index = self._indexes[k].to_pandas_index()
diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py
index becf1554453..e39c8fe49ea 100644
--- a/xarray/core/indexing.py
+++ b/xarray/core/indexing.py
@@ -17,7 +17,12 @@
 from xarray.core import duck_array_ops
 from xarray.core.nputils import NumpyVIndexAdapter
 from xarray.core.options import OPTIONS
-from xarray.core.pycompat import array_type, integer_types, is_duck_dask_array
+from xarray.core.pycompat import (
+    array_type,
+    integer_types,
+    is_duck_array,
+    is_duck_dask_array,
+)
 from xarray.core.types import T_Xarray
 from xarray.core.utils import (
     NDArrayMixin,
@@ -368,17 +373,17 @@ def __init__(self, key):
                 k = int(k)
             elif isinstance(k, slice):
                 k = as_integer_slice(k)
-            elif isinstance(k, np.ndarray):
+            elif is_duck_array(k):
                 if not np.issubdtype(k.dtype, np.integer):
                     raise TypeError(
                         f"invalid indexer array, does not have integer dtype: {k!r}"
                     )
-                if k.ndim != 1:
+                if k.ndim > 1:
                     raise TypeError(
-                        f"invalid indexer array for {type(self).__name__}; must have "
-                        f"exactly 1 dimension: {k!r}"
+                        f"invalid indexer array for {type(self).__name__}; must be scalar "
+                        f"or have 1 dimension: {k!r}"
                     )
-                k = np.asarray(k, dtype=np.int64)
+                k = k.astype(np.int64)
             else:
                 raise TypeError(
                     f"unexpected indexer type for {type(self).__name__}: {k!r}"
@@ -409,7 +414,13 @@ def __init__(self, key):
         for k in key:
             if isinstance(k, slice):
                 k = as_integer_slice(k)
-            elif isinstance(k, np.ndarray):
+            elif is_duck_dask_array(k):
+                raise ValueError(
+                    "Vectorized indexing with Dask arrays is not supported. "
+                    "Please pass a numpy array by calling ``.compute``. "
+                    "See https://github.com/dask/dask/issues/8958."
+                )
+            elif is_duck_array(k):
                 if not np.issubdtype(k.dtype, np.integer):
                     raise TypeError(
                         f"invalid indexer array, does not have integer dtype: {k!r}"
                     )
@@ -422,7 +433,7 @@ def __init__(self, key):
                         "invalid indexer key: ndarray arguments "
                         f"have different numbers of dimensions: {ndims}"
                     )
-                k = np.asarray(k, dtype=np.int64)
+                k = k.astype(np.int64)
             else:
                 raise TypeError(
                     f"unexpected indexer type for {type(self).__name__}: {k!r}"
@@ -1351,8 +1362,9 @@ def __getitem__(self, key):
             rewritten_indexer = False
             new_indexer = []
             for idim, k in enumerate(key.tuple):
-                if isinstance(k, Iterable) and duck_array_ops.array_equiv(
-                    k, np.arange(self.array.shape[idim])
+                if isinstance(k, Iterable) and (
+                    not is_duck_dask_array(k)
+                    and duck_array_ops.array_equiv(k, np.arange(self.array.shape[idim]))
                 ):
                     new_indexer.append(slice(None))
                     rewritten_indexer = True
diff --git a/xarray/core/nputils.py b/xarray/core/nputils.py
index 2bc413dc21f..9e17ab93e80 100644
--- a/xarray/core/nputils.py
+++ b/xarray/core/nputils.py
@@ -7,6 +7,7 @@
 from numpy.core.multiarray import normalize_axis_index  # type: ignore[attr-defined]
 
 from xarray.core.options import OPTIONS
+from xarray.core.pycompat import is_duck_array
 
 try:
     import bottleneck as bn
@@ -121,7 +122,10 @@ def _advanced_indexer_subspaces(key):
         return (), ()
 
     non_slices = [k for k in key if not isinstance(k, slice)]
-    ndim = len(np.broadcast(*non_slices).shape)
+    broadcasted_shape = np.broadcast_shapes(
+        *[item.shape if is_duck_array(item) else (0,) for item in non_slices]
+    )
+    ndim = len(broadcasted_shape)
     mixed_positions = advanced_index_positions[0] + np.arange(ndim)
     vindex_positions = np.arange(ndim)
     return mixed_positions, vindex_positions
diff --git a/xarray/core/pycompat.py b/xarray/core/pycompat.py
index 95387523bc8..4a3f3638d14 100644
--- a/xarray/core/pycompat.py
+++ b/xarray/core/pycompat.py
@@ -7,7 +7,7 @@
 import numpy as np
 from packaging.version import Version
 
-from xarray.core.utils import is_duck_array, module_available
+from xarray.core.utils import is_duck_array, is_scalar, module_available
 
 integer_types = (int, np.integer)
 
@@ -79,3 +79,7 @@ def is_dask_collection(x):
 
 def is_duck_dask_array(x):
     return is_duck_array(x) and is_dask_collection(x)
+
+
+def is_0d_dask_array(x):
+    return is_duck_dask_array(x) and is_scalar(x)
diff --git a/xarray/core/variable.py b/xarray/core/variable.py
index 831ece25b67..1c436d49b1d 100644
--- a/xarray/core/variable.py
+++ b/xarray/core/variable.py
@@ -26,7 +26,12 @@
     as_indexable,
 )
 from xarray.core.options import OPTIONS, _get_keep_attrs
-from xarray.core.pycompat import array_type, integer_types, is_duck_dask_array
+from xarray.core.pycompat import (
+    array_type,
+    integer_types,
+    is_0d_dask_array,
+    is_duck_dask_array,
+)
 from xarray.core.utils import (
     Frozen,
     NdimSizeLenMixin,
@@ -687,11 +692,12 @@ def _broadcast_indexes(self, key):
         key = self._item_key_to_tuple(key)  # key is a tuple
         # key is a tuple of full size
         key = indexing.expanded_indexer(key, self.ndim)
-        # Convert a scalar Variable to an integer
+        # Convert a scalar Variable to a 0d-array
         key = tuple(
-            k.data.item() if isinstance(k, Variable) and k.ndim == 0 else k for k in key
+            k.data if isinstance(k, Variable) and k.ndim == 0 else k for k in key
        )
-        # Convert a 0d-array to an integer
+        # Convert 0d numpy arrays to an integer
+        # dask 0d arrays are passed through
         key = tuple(
             k.item() if isinstance(k, np.ndarray) and k.ndim == 0 else k for k in key
         )
@@ -732,7 +738,8 @@ def _validate_indexers(self, key):
         for dim, k in zip(self.dims, key):
             if not isinstance(k, BASIC_INDEXING_TYPES):
                 if not isinstance(k, Variable):
-                    k = np.asarray(k)
+                    if not is_duck_array(k):
+                        k = np.asarray(k)
                     if k.ndim > 1:
                         raise IndexError(
                             "Unlabeled multi-dimensional array cannot be "
@@ -749,6 +756,13 @@ def _validate_indexers(self, key):
                             "{}-dimensional boolean indexing is "
                             "not supported. ".format(k.ndim)
                         )
+                    if is_duck_dask_array(k.data):
+                        raise KeyError(
+                            "Indexing with a boolean dask array is not allowed. "
+                            "This will result in a dask array of unknown shape. "
+                            "Such arrays are unsupported by Xarray. "
+                            "Please compute the indexer first using .compute()"
+                        )
                     if getattr(k, "dims", (dim,)) != (dim,):
                         raise IndexError(
                             "Boolean indexer should be unlabeled or on the "
@@ -759,10 +773,11 @@
                        )
 
     def _broadcast_indexes_outer(self, key):
+        # drop dim if k is an integer or a 0d dask array
         dims = tuple(
             k.dims[0] if isinstance(k, Variable) else dim
             for k, dim in zip(key, self.dims)
-            if not isinstance(k, integer_types)
+            if (not isinstance(k, integer_types) and not is_0d_dask_array(k))
         )
 
         new_key = []
@@ -770,7 +785,8 @@ def _broadcast_indexes_outer(self, key):
             if isinstance(k, Variable):
                 k = k.data
             if not isinstance(k, BASIC_INDEXING_TYPES):
-                k = np.asarray(k)
+                if not is_duck_array(k):
+                    k = np.asarray(k)
                 if k.size == 0:
                     # Slice by empty list; numpy could not infer the dtype
                     k = k.astype(int)
diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py
index 52a41035faf..21f0ab93d78 100644
--- a/xarray/tests/test_dask.py
+++ b/xarray/tests/test_dask.py
@@ -123,14 +123,15 @@ def test_indexing(self):
             (da.array([99, 99, 3, 99]), [0, -1, 1]),
             (da.array([99, 99, 99, 4]), np.arange(3)),
             (da.array([1, 99, 99, 99]), [False, True, True, True]),
-            (da.array([1, 99, 99, 99]), np.arange(4) > 0),
-            (da.array([99, 99, 99, 99]), Variable(("x"), da.array([1, 2, 3, 4])) > 0),
+            (da.array([1, 99, 99, 99]), np.array([False, True, True, True])),
+            (da.array([99, 99, 99, 99]), Variable(("x"), np.array([True] * 4))),
         ],
     )
     def test_setitem_dask_array(self, expected_data, index):
         arr = Variable(("x"), da.array([1, 2, 3, 4]))
         expected = Variable(("x"), expected_data)
-        arr[index] = 99
+        with raise_if_dask_computes():
+            arr[index] = 99
         assert_identical(arr, expected)
 
     def test_squeeze(self):
diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py
index ed1abea5fbe..378d471ba6b 100644
--- a/xarray/tests/test_dataarray.py
+++ b/xarray/tests/test_dataarray.py
@@ -4216,37 +4216,40 @@ def test_query(
         d = np.random.choice(["foo", "bar", "baz"], size=30, replace=True).astype(
             object
         )
-        if backend == "numpy":
-            aa = DataArray(data=a, dims=["x"], name="a")
-            bb = DataArray(data=b, dims=["x"], name="b")
-            cc = DataArray(data=c, dims=["y"], name="c")
-            dd = DataArray(data=d, dims=["z"], name="d")
+        aa = DataArray(data=a, dims=["x"], name="a", coords={"a2": ("x", a)})
+        bb = DataArray(data=b, dims=["x"], name="b", coords={"b2": ("x", b)})
+        cc = DataArray(data=c, dims=["y"], name="c", coords={"c2": ("y", c)})
+        dd = DataArray(data=d, dims=["z"], name="d", coords={"d2": ("z", d)})
 
-        elif backend == "dask":
+        if backend == "dask":
             import dask.array as da
 
-            aa = DataArray(data=da.from_array(a, chunks=3), dims=["x"], name="a")
-            bb = DataArray(data=da.from_array(b, chunks=3), dims=["x"], name="b")
-            cc = DataArray(data=da.from_array(c, chunks=7), dims=["y"], name="c")
-            dd = DataArray(data=da.from_array(d, chunks=12), dims=["z"], name="d")
+            aa = aa.copy(data=da.from_array(a, chunks=3))
+            bb = bb.copy(data=da.from_array(b, chunks=3))
+            cc = cc.copy(data=da.from_array(c, chunks=7))
+            dd = dd.copy(data=da.from_array(d, chunks=12))
 
         # query single dim, single variable
-        actual = aa.query(x="a > 5", engine=engine, parser=parser)
+        with raise_if_dask_computes():
+            actual = aa.query(x="a2 > 5", engine=engine, parser=parser)
         expect = aa.isel(x=(a > 5))
         assert_identical(expect, actual)
 
         # query single dim, single variable, via dict
-        actual = aa.query(dict(x="a > 5"), engine=engine, parser=parser)
+        with raise_if_dask_computes():
+            actual = aa.query(dict(x="a2 > 5"), engine=engine, parser=parser)
         expect = aa.isel(dict(x=(a > 5)))
         assert_identical(expect, actual)
 
         # query single dim, single variable
-        actual = bb.query(x="b > 50", engine=engine, parser=parser)
+        with raise_if_dask_computes():
+            actual = bb.query(x="b2 > 50", engine=engine, parser=parser)
         expect = bb.isel(x=(b > 50))
         assert_identical(expect, actual)
 
         # query single dim, single variable
-        actual = cc.query(y="c < .5", engine=engine, parser=parser)
+        with raise_if_dask_computes():
+            actual = cc.query(y="c2 < .5", engine=engine, parser=parser)
         expect = cc.isel(y=(c < 0.5))
         assert_identical(expect, actual)
 
@@ -4254,7 +4257,8 @@ def test_query(
         if parser == "pandas":
             # N.B., this query currently only works with the pandas parser
             # xref https://github.com/pandas-dev/pandas/issues/40436
-            actual = dd.query(z='d == "bar"', engine=engine, parser=parser)
+            with raise_if_dask_computes():
+                actual = dd.query(z='d2 == "bar"', engine=engine, parser=parser)
             expect = dd.isel(z=(d == "bar"))
             assert_identical(expect, actual)
 
diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py
index f45e41af47d..2e23d02a261 100644
--- a/xarray/tests/test_dataset.py
+++ b/xarray/tests/test_dataset.py
@@ -46,6 +46,7 @@
     create_test_data,
     has_cftime,
     has_dask,
+    raise_if_dask_computes,
     requires_bottleneck,
     requires_cftime,
     requires_cupy,
@@ -6197,7 +6198,15 @@ def test_query(self, backend, engine, parser) -> None:
                     "d": ("z", d),
                     "e": (("x", "y"), e),
                     "f": (("x", "y", "z"), f),
-                }
+                },
+                coords={
+                    "a2": ("x", a),
+                    "b2": ("x", b),
+                    "c2": ("y", c),
+                    "d2": ("z", d),
+                    "e2": (("x", "y"), e),
+                    "f2": (("x", "y", "z"), f),
+                },
             )
         elif backend == "dask":
             ds = Dataset(
@@ -6208,26 +6217,38 @@ def test_query(self, backend, engine, parser) -> None:
                     "d": ("z", da.from_array(d, chunks=12)),
                     "e": (("x", "y"), da.from_array(e, chunks=(3, 7))),
                     "f": (("x", "y", "z"), da.from_array(f, chunks=(3, 7, 12))),
-                }
+                },
+                coords={
+                    "a2": ("x", a),
+                    "b2": ("x", b),
+                    "c2": ("y", c),
+                    "d2": ("z", d),
+                    "e2": (("x", "y"), e),
+                    "f2": (("x", "y", "z"), f),
+                },
             )
 
         # query single dim, single variable
-        actual = ds.query(x="a > 5", engine=engine, parser=parser)
+        with raise_if_dask_computes():
+            actual = ds.query(x="a2 > 5", engine=engine, parser=parser)
         expect = ds.isel(x=(a > 5))
         assert_identical(expect, actual)
 
         # query single dim, single variable, via dict
-        actual = ds.query(dict(x="a > 5"), engine=engine, parser=parser)
+        with raise_if_dask_computes():
+            actual = ds.query(dict(x="a2 > 5"), engine=engine, parser=parser)
         expect = ds.isel(dict(x=(a > 5)))
         assert_identical(expect, actual)
 
         # query single dim, single variable
-        actual = ds.query(x="b > 50", engine=engine, parser=parser)
+        with raise_if_dask_computes():
+            actual = ds.query(x="b2 > 50", engine=engine, parser=parser)
         expect = ds.isel(x=(b > 50))
         assert_identical(expect, actual)
 
         # query single dim, single variable
-        actual = ds.query(y="c < .5", engine=engine, parser=parser)
+        with raise_if_dask_computes():
+            actual = ds.query(y="c2 < .5", engine=engine, parser=parser)
< .5", engine=engine, parser=parser) expect = ds.isel(y=(c < 0.5)) assert_identical(expect, actual) @@ -6235,51 +6256,67 @@ def test_query(self, backend, engine, parser) -> None: if parser == "pandas": # N.B., this query currently only works with the pandas parser # xref https://github.com/pandas-dev/pandas/issues/40436 - actual = ds.query(z='d == "bar"', engine=engine, parser=parser) + with raise_if_dask_computes(): + actual = ds.query(z='d2 == "bar"', engine=engine, parser=parser) expect = ds.isel(z=(d == "bar")) assert_identical(expect, actual) # query single dim, multiple variables - actual = ds.query(x="(a > 5) & (b > 50)", engine=engine, parser=parser) + with raise_if_dask_computes(): + actual = ds.query(x="(a2 > 5) & (b2 > 50)", engine=engine, parser=parser) expect = ds.isel(x=((a > 5) & (b > 50))) assert_identical(expect, actual) # query single dim, multiple variables with computation - actual = ds.query(x="(a * b) > 250", engine=engine, parser=parser) + with raise_if_dask_computes(): + actual = ds.query(x="(a2 * b2) > 250", engine=engine, parser=parser) expect = ds.isel(x=(a * b) > 250) assert_identical(expect, actual) # check pandas query syntax is supported if parser == "pandas": - actual = ds.query(x="(a > 5) and (b > 50)", engine=engine, parser=parser) + with raise_if_dask_computes(): + actual = ds.query( + x="(a2 > 5) and (b2 > 50)", engine=engine, parser=parser + ) expect = ds.isel(x=((a > 5) & (b > 50))) assert_identical(expect, actual) # query multiple dims via kwargs - actual = ds.query(x="a > 5", y="c < .5", engine=engine, parser=parser) + with raise_if_dask_computes(): + actual = ds.query(x="a2 > 5", y="c2 < .5", engine=engine, parser=parser) expect = ds.isel(x=(a > 5), y=(c < 0.5)) assert_identical(expect, actual) # query multiple dims via kwargs if parser == "pandas": - actual = ds.query( - x="a > 5", y="c < .5", z="d == 'bar'", engine=engine, parser=parser - ) + with raise_if_dask_computes(): + actual = ds.query( + x="a2 > 5", + y="c2 < .5", + z="d2 == 'bar'", + engine=engine, + parser=parser, + ) expect = ds.isel(x=(a > 5), y=(c < 0.5), z=(d == "bar")) assert_identical(expect, actual) # query multiple dims via dict - actual = ds.query(dict(x="a > 5", y="c < .5"), engine=engine, parser=parser) + with raise_if_dask_computes(): + actual = ds.query( + dict(x="a2 > 5", y="c2 < .5"), engine=engine, parser=parser + ) expect = ds.isel(dict(x=(a > 5), y=(c < 0.5))) assert_identical(expect, actual) # query multiple dims via dict if parser == "pandas": - actual = ds.query( - dict(x="a > 5", y="c < .5", z="d == 'bar'"), - engine=engine, - parser=parser, - ) + with raise_if_dask_computes(): + actual = ds.query( + dict(x="a2 > 5", y="c2 < .5", z="d2 == 'bar'"), + engine=engine, + parser=parser, + ) expect = ds.isel(dict(x=(a > 5), y=(c < 0.5), z=(d == "bar"))) assert_identical(expect, actual) diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index 8229db0c132..9f57b3b9056 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -11,7 +11,14 @@ from xarray.core import indexing, nputils from xarray.core.indexes import PandasIndex, PandasMultiIndex from xarray.core.types import T_Xarray -from xarray.tests import IndexerMaker, ReturnItem, assert_array_equal +from xarray.tests import ( + IndexerMaker, + ReturnItem, + assert_array_equal, + assert_identical, + raise_if_dask_computes, + requires_dask, +) B = IndexerMaker(indexing.BasicIndexer) @@ -820,3 +827,65 @@ def test_indexing_1d_object_array() -> None: expected = 
 
     assert [actual.data.item()] == [expected.data.item()]
+
+
+@requires_dask
+def test_indexing_dask_array():
+    import dask.array
+
+    da = DataArray(
+        np.ones(10 * 3 * 3).reshape((10, 3, 3)),
+        dims=("time", "x", "y"),
+    ).chunk(dict(time=-1, x=1, y=1))
+    with raise_if_dask_computes():
+        actual = da.isel(time=dask.array.from_array([9], chunks=(1,)))
+    expected = da.isel(time=[9])
+    assert_identical(actual, expected)
+
+
+@requires_dask
+def test_indexing_dask_array_scalar():
+    # GH4276
+    import dask.array
+
+    a = dask.array.from_array(np.linspace(0.0, 1.0))
+    da = DataArray(a, dims="x")
+    x_selector = da.argmax(dim=...)
+    with raise_if_dask_computes():
+        actual = da.isel(x_selector)
+    expected = da.isel(x=-1)
+    assert_identical(actual, expected)
+
+
+@requires_dask
+def test_vectorized_indexing_dask_array():
+    # https://github.com/pydata/xarray/issues/2511#issuecomment-563330352
+    darr = DataArray(data=[0.2, 0.4, 0.6], coords={"z": range(3)}, dims=("z",))
+    indexer = DataArray(
+        data=np.random.randint(0, 3, 8).reshape(4, 2).astype(int),
+        coords={"y": range(4), "x": range(2)},
+        dims=("y", "x"),
+    )
+    with pytest.raises(ValueError, match="Vectorized indexing with Dask arrays"):
+        darr[indexer.chunk({"y": 2})]
+
+
+@requires_dask
+def test_advanced_indexing_dask_array():
+    # GH4663
+    import dask.array as da
+
+    ds = Dataset(
+        dict(
+            a=("x", da.from_array(np.random.randint(0, 100, 100))),
+            b=(("x", "y"), da.random.random((100, 10))),
+        )
+    )
+    expected = ds.b.sel(x=ds.a.compute())
+    with raise_if_dask_computes():
+        actual = ds.b.sel(x=ds.a)
+    assert_identical(expected, actual)
+
+    with raise_if_dask_computes():
+        actual = ds.b.sel(x=ds.a.data)
+    assert_identical(expected, actual)
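
Illustrative usage, not part of the patch: a minimal sketch distilled from the new tests in xarray/tests/test_indexing.py above (outer indexing with a dask-backed indexer, a 0d dask scalar indexer, and the vectorized-indexing guard). It assumes dask is installed; the array shapes and variable names are arbitrary.

    import dask.array
    import numpy as np
    import xarray as xr

    # Outer indexing along a dimension with a dask-backed integer indexer stays lazy.
    arr = xr.DataArray(np.ones((10, 3, 3)), dims=("time", "x", "y")).chunk(
        dict(time=-1, x=1, y=1)
    )
    lazy = arr.isel(time=dask.array.from_array([9], chunks=(1,)))  # no compute triggered
    assert lazy.identical(arr.isel(time=[9]))

    # A 0d dask array (e.g. the result of argmax) is passed through as a scalar indexer.
    line = xr.DataArray(dask.array.from_array(np.linspace(0.0, 1.0)), dims="x")
    selector = line.argmax(dim=...)  # dict holding a 0d dask array
    assert line.isel(selector).identical(line.isel(x=-1))

    # Vectorized (multi-dimensional) dask indexers are still rejected explicitly.
    indexer = xr.DataArray(np.random.randint(0, 3, (4, 2)), dims=("y", "x")).chunk({"y": 2})
    try:
        xr.DataArray([0.2, 0.4, 0.6], dims="z")[indexer]
    except ValueError as err:
        print(err)  # "Vectorized indexing with Dask arrays is not supported. ..."

Indexing with a dask-backed indexer keeps the result lazy; only the final ``identical`` comparisons here force a compute, mirroring how the tests wrap the indexing step in ``raise_if_dask_computes()``.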
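Also illustrative rather than part of the patch: a hedged sketch of label-based selection along an unindexed dimension (the GH4663 case exercised by test_advanced_indexing_dask_array) and of the new guard against boolean dask indexers added to Variable._validate_indexers. The KeyError branch is an assumption based on that guard; names and sizes are arbitrary.

    import dask.array as dsa
    import numpy as np
    import xarray as xr

    ds = xr.Dataset(
        {
            "a": ("x", dsa.from_array(np.random.randint(0, 100, 100), chunks=10)),
            "b": (("x", "y"), dsa.random.random((100, 10), chunks=(10, 10))),
        }
    )

    # "x" has no index, so .sel falls back to positional selection and the
    # dask-backed integer indexer is passed through without being computed.
    lazy = ds.b.sel(x=ds.a)
    assert lazy.identical(ds.b.sel(x=ds.a.compute()))

    # Boolean dask indexers would yield an array of unknown shape and are refused.
    mask = ds.a > 50  # dask-backed boolean DataArray
    try:
        ds.b.sel(x=mask)
    except KeyError as err:
        print(err)  # "Indexing with a boolean dask array is not allowed. ..."

Refusing boolean dask indexers instead of computing them implicitly keeps the result's shape known up front, which is the rationale stated in the error message added above.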