From 848b3519461ad48bb843928ce1ff64e264d890ec Mon Sep 17 00:00:00 2001 From: Tom White Date: Mon, 12 Aug 2024 12:04:18 +0100 Subject: [PATCH] Add an Xarray test suite adapted from xarray/tests/test_dask.py (#19) * Add an Xarray test suite adapted from xarray/tests/test_dask.py * Add dill to the test dependencies --- cubed_xarray/tests/test_xarray.py | 717 ++++++++++++++++++++++++++++++ pyproject.toml | 4 +- 2 files changed, 720 insertions(+), 1 deletion(-) create mode 100644 cubed_xarray/tests/test_xarray.py diff --git a/cubed_xarray/tests/test_xarray.py b/cubed_xarray/tests/test_xarray.py new file mode 100644 index 0000000..55663d1 --- /dev/null +++ b/cubed_xarray/tests/test_xarray.py @@ -0,0 +1,717 @@ +# Adapted from xarray/tests/test_dask.py + +from __future__ import annotations + +import operator +from contextlib import suppress +from textwrap import dedent + +import cubed +import cubed.random +import dill +import numpy as np +import pandas as pd +import pytest +import xarray as xr +from xarray import DataArray, Dataset, Variable +from xarray.tests import ( + assert_allclose, + assert_array_equal, + assert_equal, + assert_identical, + mock, +) + +try: + from cubed.testing import raise_if_computes as raise_if_cubed_computes +except ImportError: + from contextlib import nullcontext + + raise_if_cubed_computes = nullcontext + + +@pytest.mark.xfail(reason="needs https://github.com/cubed-dev/cubed/pull/545") +def test_raise_if_cubed_computes(): + data = cubed.from_array(np.random.RandomState(0).randn(4, 6), chunks=(2, 2)) + with pytest.raises(RuntimeError, match=r"'compute' was called"): + with raise_if_cubed_computes(): + data.compute() + + +class CubedTestCase: + def assertLazyAnd(self, expected, actual, test): + test(actual, expected) + + if isinstance(actual, Dataset): + for k, v in actual.variables.items(): + if k in actual.xindexes: + assert isinstance(v.data, np.ndarray) + else: + assert isinstance(v.data, cubed.Array) + elif isinstance(actual, DataArray): + 
assert isinstance(actual.data, cubed.Array) + for k, v in actual.coords.items(): + if k in actual.xindexes: + assert isinstance(v.data, np.ndarray) + else: + assert isinstance(v.data, cubed.Array) + elif isinstance(actual, Variable): + assert isinstance(actual.data, cubed.Array) + else: + assert False + + +class TestVariable(CubedTestCase): + def assertLazyAndIdentical(self, expected, actual): + self.assertLazyAnd(expected, actual, assert_identical) + + def assertLazyAndAllClose(self, expected, actual): + self.assertLazyAnd(expected, actual, assert_allclose) + + @pytest.fixture(autouse=True) + def setUp(self): + self.values = np.random.RandomState(0).randn(4, 6) + self.data = cubed.from_array(self.values, chunks=(2, 2)) + + self.eager_var = Variable(("x", "y"), self.values) + self.lazy_var = Variable(("x", "y"), self.data) + + def test_basics(self): + v = self.lazy_var + assert self.data is v.data + assert self.data.chunks == v.chunks + assert_array_equal(self.values, v) + + def test_copy(self): + self.assertLazyAndIdentical(self.eager_var, self.lazy_var.copy()) + self.assertLazyAndIdentical(self.eager_var, self.lazy_var.copy(deep=True)) + + @pytest.mark.xfail( + reason="cubed rechunk handles chunks={} incorrectly, see https://github.com/cubed-dev/cubed/pull/546" + ) + def test_chunk(self): + for chunks, expected in [ + ({}, ((2, 2), (2, 2, 2))), + (3, ((3, 1), (3, 3))), + ({"x": 3, "y": 3}, ((3, 1), (3, 3))), + ({"x": 3}, ((3, 1), (2, 2, 2))), + ({"x": (3, 1)}, ((3, 1), (2, 2, 2))), + ]: + rechunked = self.lazy_var.chunk(chunks, chunked_array_type="cubed") + assert rechunked.chunks == expected + self.assertLazyAndIdentical(self.eager_var, rechunked) + + expected_chunksizes = { + dim: chunks for dim, chunks in zip(self.lazy_var.dims, expected) + } + assert rechunked.chunksizes == expected_chunksizes + + def test_indexing(self): + u = self.eager_var + v = self.lazy_var + self.assertLazyAndIdentical(u[0], v[0]) + self.assertLazyAndIdentical(u[:1], v[:1]) + 
self.assertLazyAndIdentical(u[[0, 1], [0, 1, 2]], v[[0, 1], [0, 1, 2]]) + + def test_squeeze(self): + u = self.eager_var + v = self.lazy_var + self.assertLazyAndIdentical(u[0].squeeze(), v[0].squeeze()) + + def test_equals(self): + v = self.lazy_var + assert v.equals(v) + assert isinstance(v.data, cubed.Array) + assert v.identical(v) + assert isinstance(v.data, cubed.Array) + + def test_transpose(self): + u = self.eager_var + v = self.lazy_var + self.assertLazyAndIdentical(u.T, v.T) + + @pytest.mark.xfail( + reason="xarray uses np.pad which is not yet wired up to cubed.pad" + ) + def test_shift(self): + u = self.eager_var + v = self.lazy_var + self.assertLazyAndIdentical(u.shift(x=2), v.shift(x=2)) + self.assertLazyAndIdentical(u.shift(x=-2), v.shift(x=-2)) + assert v.data.chunks == v.shift(x=1).data.chunks + + def test_roll(self): + u = self.eager_var + v = self.lazy_var + self.assertLazyAndIdentical(u.roll(x=2), v.roll(x=2)) + # assert v.data.chunks == v.roll(x=1).data.chunks # TODO: fails + + def test_unary_op(self): + u = self.eager_var + v = self.lazy_var + self.assertLazyAndIdentical(-u, -v) + self.assertLazyAndIdentical(abs(u), abs(v)) + # self.assertLazyAndIdentical(u.round(), v.round()) # TODO: fails, see https://github.com/pydata/xarray/pull/9326 + + def test_binary_op(self): + u = self.eager_var + v = self.lazy_var + self.assertLazyAndIdentical(2 * u, 2 * v) + self.assertLazyAndIdentical(u + u, v + v) + # self.assertLazyAndIdentical(u[0] + u, v[0] + v) # TODO: fails + + def test_binary_op_bitshift(self) -> None: + # bit shifts only work on ints so we need to generate + # new eager and lazy vars + rng = np.random.default_rng(0) + values = rng.integers(low=-10000, high=10000, size=(4, 6)) + data = cubed.from_array(values, chunks=(2, 2)) + u = Variable(("x", "y"), values) + v = Variable(("x", "y"), data) + self.assertLazyAndIdentical(u << 2, v << 2) + self.assertLazyAndIdentical(u << 5, v << 5) + self.assertLazyAndIdentical(u >> 2, v >> 2) + 
self.assertLazyAndIdentical(u >> 5, v >> 5) + + def test_repr(self): + expected = dedent( + f"""\ + Size: 192B + {self.lazy_var.data!r}""" + ) + assert expected == repr(self.lazy_var) + + @pytest.mark.xfail(reason="duck array ops problem in xarray") + def test_pickle(self): + # Test that pickling/unpickling does not convert the cubed + # backend to numpy + # Use dill since pickle can't handle cubed functions + a1 = self.lazy_var + a1.compute() + assert not a1._in_memory + a2 = dill.loads(dill.dumps(a1)) + assert_identical(a1, a2) + assert not a1._in_memory + assert not a2._in_memory + + def test_reduce(self): + u = self.eager_var + v = self.lazy_var + # TODO: remove skipna=False (https://github.com/cubed-dev/cubed/issues/153) + self.assertLazyAndAllClose(u.mean(skipna=False), v.mean(skipna=False)) + # TODO: other reduce functions need work + # self.assertLazyAndAllClose(u.mean(), v.mean()) + # self.assertLazyAndAllClose(u.std(), v.std()) + # with raise_if_cubed_computes(): + # actual = v.argmax(dim="x") + # self.assertLazyAndAllClose(u.argmax(dim="x"), actual) + # with raise_if_cubed_computes(): + # actual = v.argmin(dim="x") + # self.assertLazyAndAllClose(u.argmin(dim="x"), actual) + # self.assertLazyAndAllClose((u > 1).any(), (v > 1).any()) + # self.assertLazyAndAllClose((u < 1).all("x"), (v < 1).all("x")) + # with pytest.raises(NotImplementedError, match=r"only works along an axis"): + # v.median() + # with pytest.raises(NotImplementedError, match=r"only works along an axis"): + # v.median(v.dims) + # with raise_if_cubed_computes(): + # v.reduce(duck_array_ops.mean) + + def test_missing_values(self): + values = np.array([0, 1, np.nan, 3]) + data = cubed.from_array(values, chunks=(2,)) + + eager_var = Variable("x", values) + lazy_var = Variable("x", data) + self.assertLazyAndIdentical(eager_var, lazy_var.fillna(lazy_var)) + self.assertLazyAndIdentical(Variable("x", range(4)), lazy_var.fillna(2)) + # self.assertLazyAndIdentical(eager_var.count(), lazy_var.count()) 
# TODO: doesn't use array API + + def test_concat(self): + u = self.eager_var + v = self.lazy_var + self.assertLazyAndIdentical(u, Variable.concat([v[:2], v[2:]], "x")) + self.assertLazyAndIdentical(u[:2], Variable.concat([v[0], v[1]], "x")) + # TODO: following fail + # self.assertLazyAndIdentical(u[:2], Variable.concat([u[0], v[1]], "x")) + # self.assertLazyAndIdentical(u[:2], Variable.concat([v[0], u[1]], "x")) + # self.assertLazyAndIdentical( + # u[:3], Variable.concat([v[[0, 2]], v[[1]]], "x", positions=[[0, 2], [1]]) + # ) + + def test_missing_methods(self): + v = self.lazy_var + with pytest.raises(AttributeError): + v.argsort() + with pytest.raises(AttributeError): + v[0].item() + + @pytest.mark.xfail(reason="np ufuncs don't delegate to cubed") + def test_univariate_ufunc(self): + u = self.eager_var + v = self.lazy_var + self.assertLazyAndAllClose(np.sin(u), np.sin(v)) + + @pytest.mark.xfail(reason="np ufuncs don't delegate to cubed") + def test_bivariate_ufunc(self): + u = self.eager_var + v = self.lazy_var + self.assertLazyAndAllClose(np.maximum(u, 0), np.maximum(v, 0)) + self.assertLazyAndAllClose(np.maximum(u, 0), np.maximum(0, v)) + + @pytest.mark.skip(reason="can't call cubed.compute on anything except cubed arrays") + def test_compute(self): + u = self.eager_var + v = self.lazy_var + + # assert dask.is_dask_collection(v) + (v2,) = cubed.compute(v + 1) + # assert not dask.is_dask_collection(v2) + + assert ((u + 1).data == v2.data).all() + + +class TestDataArrayAndDataset(CubedTestCase): + def assertLazyAndIdentical(self, expected, actual): + self.assertLazyAnd(expected, actual, assert_identical) + + def assertLazyAndAllClose(self, expected, actual): + self.assertLazyAnd(expected, actual, assert_allclose) + + def assertLazyAndEqual(self, expected, actual): + self.assertLazyAnd(expected, actual, assert_equal) + + @pytest.fixture(autouse=True) + def setUp(self): + self.values = np.random.randn(4, 6) + self.data = cubed.from_array(self.values, chunks=(2, 
2)) + self.eager_array = DataArray( + self.values, coords={"x": range(4)}, dims=("x", "y"), name="foo" + ) + self.lazy_array = DataArray( + self.data, coords={"x": range(4)}, dims=("x", "y"), name="foo" + ) + + @pytest.mark.xfail( + reason="cubed rechunk handles chunks={} incorrectly, see https://github.com/cubed-dev/cubed/pull/546" + ) + def test_chunk(self) -> None: + for chunks, expected in [ + ({}, ((2, 2), (2, 2, 2))), + (3, ((3, 1), (3, 3))), + ({"x": 3, "y": 3}, ((3, 1), (3, 3))), + ({"x": 3}, ((3, 1), (2, 2, 2))), + ({"x": (3, 1)}, ((3, 1), (2, 2, 2))), + ({"x": "16B"}, ((1, 1, 1, 1), (2, 2, 2))), + ("16B", ((1, 1, 1, 1), (1,) * 6)), + ("16MB", ((4,), (6,))), + ]: + # Test DataArray + rechunked = self.lazy_array.chunk(chunks, chunked_array_type="cubed") + assert rechunked.chunks == expected + self.assertLazyAndIdentical(self.eager_array, rechunked) + + expected_chunksizes = { + dim: chunks for dim, chunks in zip(self.lazy_array.dims, expected) + } + assert rechunked.chunksizes == expected_chunksizes + + # Test Dataset + lazy_dataset = self.lazy_array.to_dataset() + eager_dataset = self.eager_array.to_dataset() + expected_chunksizes = { + dim: chunks for dim, chunks in zip(lazy_dataset.dims, expected) + } + rechunked = lazy_dataset.chunk(chunks, chunked_array_type="cubed") + + # Dataset.chunks has a different return type to DataArray.chunks - see issue #5843 + assert rechunked.chunks == expected_chunksizes + self.assertLazyAndIdentical(eager_dataset, rechunked) + + assert rechunked.chunksizes == expected_chunksizes + + @pytest.mark.xfail(reason="cubed rechunk bug") + def test_rechunk(self): + chunked = self.eager_array.chunk({"x": 2}).chunk( + {"y": 2}, chunked_array_type="cubed" + ) + assert chunked.chunks == ((2,) * 2, (2,) * 3) + self.assertLazyAndIdentical(self.lazy_array, chunked) + + def test_new_chunk(self): + chunked = self.eager_array.chunk(chunked_array_type="cubed") + assert chunked.data.name.startswith("array-") + + def test_lazy_dataset(self): + 
lazy_ds = Dataset({"foo": (("x", "y"), self.data)}) + assert isinstance(lazy_ds.foo.variable.data, cubed.Array) + + def test_lazy_array(self): + u = self.eager_array + v = self.lazy_array + + self.assertLazyAndAllClose(u, v) + self.assertLazyAndAllClose(-u, -v) + self.assertLazyAndAllClose(u.T, v.T) + # self.assertLazyAndAllClose(u.mean(), v.mean()) # TODO: isn't lazy + self.assertLazyAndAllClose(1 + u, 1 + v) + + actual = xr.concat([v[:2], v[2:]], "x") + self.assertLazyAndAllClose(u, actual) + + @pytest.mark.skip(reason="can't call cubed.compute on anything except cubed arrays") + def test_compute(self): + u = self.eager_array + v = self.lazy_array + + # assert dask.is_dask_collection(v) + (v2,) = cubed.compute(v + 1) + # assert not dask.is_dask_collection(v2) + + assert ((u + 1).data == v2.data).all() + + @pytest.mark.xfail(reason="isn't lazy") + def test_groupby(self): + u = self.eager_array + v = self.lazy_array + + expected = u.groupby("x").mean(...) + with raise_if_cubed_computes(): + actual = v.groupby("x").mean(...) 
+ self.assertLazyAndAllClose(expected, actual) + + @pytest.mark.xfail(reason="isn't lazy") + def test_rolling(self): + u = self.eager_array + v = self.lazy_array + + expected = u.rolling(x=2).mean() + with raise_if_cubed_computes(): + actual = v.rolling(x=2).mean() + self.assertLazyAndAllClose(expected, actual) + + @pytest.mark.xfail(reason="failure in cubed") + @pytest.mark.parametrize("func", ["first", "last"]) + def test_groupby_first_last(self, func): + method = operator.methodcaller(func) + u = self.eager_array + v = self.lazy_array + + for coords in [u.coords, v.coords]: + coords["ab"] = ("x", ["a", "a", "b", "b"]) + expected = method(u.groupby("ab")) + + with raise_if_cubed_computes(): + actual = method(v.groupby("ab")) + self.assertLazyAndAllClose(expected, actual) + + with raise_if_cubed_computes(): + actual = method(v.groupby("ab")) + self.assertLazyAndAllClose(expected, actual) + + @pytest.mark.xfail(reason="isn't lazy") + def test_reindex(self): + u = self.eager_array.assign_coords(y=range(6)) + v = self.lazy_array.assign_coords(y=range(6)) + + for kwargs in [ + {"x": [2, 3, 4]}, + {"x": [1, 100, 2, 101, 3]}, + {"x": [2.5, 3, 3.5], "y": [2, 2.5, 3]}, + ]: + expected = u.reindex(**kwargs) + actual = v.reindex(**kwargs) + self.assertLazyAndAllClose(expected, actual) + + def test_to_dataset_roundtrip(self): + u = self.eager_array + v = self.lazy_array + + expected = u.assign_coords(x=u["x"]) + self.assertLazyAndEqual(expected, v.to_dataset("x").to_dataarray("x")) + + def test_merge(self): + def duplicate_and_merge(array): + return xr.merge([array, array.rename("bar")]).to_dataarray() + + expected = duplicate_and_merge(self.eager_array) + actual = duplicate_and_merge(self.lazy_array) + self.assertLazyAndEqual(expected, actual) + + @pytest.mark.xfail(reason="np ufuncs don't delegate to cubed") + def test_ufuncs(self): + u = self.eager_array + v = self.lazy_array + self.assertLazyAndAllClose(np.sin(u), np.sin(v)) + + @pytest.mark.xfail(reason="failure in 
cubed") + def test_where_dispatching(self): + a = np.arange(10) + b = a > 3 + x = cubed.from_array(a, 5) + y = cubed.from_array(b, 5) + expected = DataArray(a).where(b) + self.assertLazyAndEqual(expected, DataArray(a).where(y)) + self.assertLazyAndEqual(expected, DataArray(x).where(b)) + self.assertLazyAndEqual(expected, DataArray(x).where(y)) + + def test_duplicate_dims(self): + data = np.random.normal(size=(4, 4)) + arr = DataArray(data, dims=("x", "x")) + chunked_array = arr.chunk({"x": 2}) + assert chunked_array.chunks == ((2, 2), (2, 2)) + assert chunked_array.chunksizes == {"x": (2, 2)} + + @pytest.mark.xfail(reason="duck array ops problem in xarray") + def test_stack(self): + data = cubed.random.random(size=(2, 3, 4), chunks=(1, 3, 4)) + arr = DataArray(data, dims=("w", "x", "y")) + stacked = arr.stack(z=("x", "y")) + z = pd.MultiIndex.from_product([np.arange(3), np.arange(4)], names=["x", "y"]) + expected = DataArray(cubed.reshape(data, (2, -1)), {"z": z}, dims=["w", "z"]) + assert stacked.data.chunks == expected.data.chunks + self.assertLazyAndEqual(expected, stacked) + + @pytest.mark.xfail(reason="relies on np.einsum which is not in cubed") + def test_dot(self): + eager = self.eager_array.dot(self.eager_array[0]) + lazy = self.lazy_array.dot(self.lazy_array[0]) + self.assertLazyAndAllClose(eager, lazy) + + def test_dataarray_repr(self): + data = cubed.asarray([1], chunks=1) + data.name = "array-0" # change name to something fixed for the repr + nonindex_coord = cubed.asarray([1], chunks=1) + a = DataArray(data, dims=["x"], coords={"y": ("x", nonindex_coord)}) + expected = dedent( + f"""\ + Size: 8B + {data!r} + Coordinates: + y (x) int64 8B cubed.Array + Dimensions without coordinates: x""" + ) + assert expected == repr(a) + + def test_dataset_repr(self): + data = cubed.asarray([1], chunks=1) + nonindex_coord = cubed.asarray([1], chunks=1) + ds = Dataset(data_vars={"a": ("x", data)}, coords={"y": ("x", nonindex_coord)}) + expected = dedent( + """\ + Size: 
16B + Dimensions: (x: 1) + Coordinates: + y (x) int64 8B cubed.Array + Dimensions without coordinates: x + Data variables: + a (x) int64 8B cubed.Array""" + ) + assert expected == repr(ds) + + def test_dataarray_pickle(self): + # Test that pickling/unpickling converts the cubed backend + # to numpy in neither the data variable nor the non-index coords + data = cubed.asarray([1], chunks=1) + nonindex_coord = cubed.asarray([1], chunks=1) + a1 = DataArray(data, dims=["x"], coords={"y": ("x", nonindex_coord)}) + a1.compute() + assert not a1._in_memory + assert not a1.coords["y"]._in_memory + a2 = dill.loads(dill.dumps(a1)) + assert_identical(a1, a2) + assert not a1._in_memory + assert not a2._in_memory + assert not a1.coords["y"]._in_memory + assert not a2.coords["y"]._in_memory + + def test_dataset_pickle(self): + # Test that pickling/unpickling converts the cubed backend + # to numpy in neither the data variables nor the non-index coords + data = cubed.asarray([1], chunks=1) + nonindex_coord = cubed.asarray([1], chunks=1) + ds1 = Dataset(data_vars={"a": ("x", data)}, coords={"y": ("x", nonindex_coord)}) + ds1.compute() + assert not ds1["a"]._in_memory + assert not ds1["y"]._in_memory + ds2 = dill.loads(dill.dumps(ds1)) + assert_identical(ds1, ds2) + assert not ds1["a"]._in_memory + assert not ds2["a"]._in_memory + assert not ds1["y"]._in_memory + assert not ds2["y"]._in_memory + + def test_dataarray_getattr(self): + # ipython/jupyter does a long list of getattr() calls when trying to + # represent an object. + # Make sure we're not accidentally computing cubed variables. 
+ data = cubed.asarray([1], chunks=1) + nonindex_coord = cubed.asarray([1], chunks=1) + a = DataArray(data, dims=["x"], coords={"y": ("x", nonindex_coord)}) + with raise_if_cubed_computes(): + with suppress(AttributeError): + getattr(a, "NOTEXIST") + + def test_dataset_getattr(self): + # Test that a failed attribute lookup on a Dataset does not trigger + # computation of the cubed-backed data variables or non-index coords + data = cubed.asarray([1], chunks=1) + nonindex_coord = cubed.asarray([1], chunks=1) + ds = Dataset(data_vars={"a": ("x", data)}, coords={"y": ("x", nonindex_coord)}) + with raise_if_cubed_computes(): + with suppress(AttributeError): + getattr(ds, "NOTEXIST") + + def test_values(self): + # Test that invoking the values property does not convert the cubed + # backend to numpy + a = DataArray([1, 2]).chunk() + assert not a._in_memory + assert a.values.tolist() == [1, 2] + assert not a._in_memory + + def test_from_cubed_variable(self): + # Test array creation from Variable with cubed backend. + # This is used e.g. 
in broadcast() + a = DataArray(self.lazy_array.variable, coords={"x": range(4)}, name="foo") + self.assertLazyAndIdentical(self.lazy_array, a) + + +@pytest.mark.parametrize("method", ["load", "compute"]) +def test_cubed_kwargs_variable(method): + chunked_array = cubed.from_array(np.arange(3), chunks=(2,)) + x = Variable("y", chunked_array) + # args should be passed on to cubed.compute() (via CubedManager.compute()) + with mock.patch.object( + cubed, "compute", return_value=(np.arange(3),) + ) as mock_compute: + getattr(x, method)(foo="bar") + mock_compute.assert_called_with(chunked_array, foo="bar") + + +@pytest.mark.parametrize("method", ["load", "compute"]) +def test_cubed_kwargs_dataarray(method): + data = cubed.from_array(np.arange(3), chunks=(2,)) + x = DataArray(data) + if method in ["load", "compute"]: + cubed_func = "cubed.compute" + # args should be passed on to "cubed_func" + with mock.patch(cubed_func) as mock_func: + getattr(x, method)(foo="bar") + mock_func.assert_called_with(data, foo="bar") + + +@pytest.mark.parametrize("method", ["load", "compute"]) +def test_cubed_kwargs_dataset(method): + data = cubed.from_array(np.arange(3), chunks=(2,)) + x = Dataset({"x": (("y"), data)}) + if method in ["load", "compute"]: + cubed_func = "cubed.compute" + # args should be passed on to "cubed_func" + with mock.patch(cubed_func) as mock_func: + getattr(x, method)(foo="bar") + mock_func.assert_called_with(data, foo="bar") + + +def test_basic_compute(): + ds = Dataset({"foo": ("x", range(5)), "bar": ("x", range(5))}).chunk({"x": 2}) + ds.compute() + ds.foo.compute() + ds.foo.variable.compute() + + +def make_da(): + da = xr.DataArray( + np.ones((10, 20)), + dims=["x", "y"], + coords={"x": np.arange(10), "y": np.arange(100, 120)}, + name="a", + ).chunk({"x": 4, "y": 5}) + da.x.attrs["long_name"] = "x" + da.attrs["test"] = "test" + da.coords["c2"] = 0.5 + da.coords["ndcoord"] = da.x * 2 + da.coords["cxy"] = (da.x * da.y).chunk({"x": 4, "y": 5}) + + return da + + +def 
make_ds(): + map_ds = xr.Dataset() + map_ds["a"] = make_da() + map_ds["b"] = map_ds.a + 50 + map_ds["c"] = map_ds.x + 20 + map_ds = map_ds.chunk({"x": 4, "y": 5}) + map_ds["d"] = ("z", [1, 1, 1, 1]) + map_ds["z"] = [0, 1, 2, 3] + map_ds["e"] = map_ds.x + map_ds.y + map_ds.coords["c1"] = 0.5 + map_ds.coords["cx"] = ("x", np.arange(len(map_ds.x))) + map_ds.coords["cx"].attrs["test2"] = "test2" + map_ds.attrs["test"] = "test" + map_ds.coords["xx"] = map_ds["a"] * map_ds.y + + map_ds.x.attrs["long_name"] = "x" + map_ds.y.attrs["long_name"] = "y" + + return map_ds + + +# fixtures cannot be used in parametrize statements +# instead use this workaround +# https://docs.pytest.org/en/latest/deprecations.html#calling-fixtures-directly +@pytest.fixture +def map_da(): + return make_da() + + +@pytest.fixture +def map_ds(): + return make_ds() + + +def test_unify_chunks(map_ds): + ds_copy = map_ds.copy() + ds_copy["cxy"] = ds_copy.cxy.chunk({"y": 10}) + + with pytest.raises(ValueError, match=r"inconsistent chunks"): + ds_copy.chunks + + expected_chunks = {"x": (4, 4, 2), "y": (5, 5, 5, 5)} + with raise_if_cubed_computes(): + actual_chunks = ds_copy.unify_chunks().chunks + assert actual_chunks == expected_chunks + assert_identical(map_ds, ds_copy.unify_chunks()) + + out_a, out_b = xr.unify_chunks(ds_copy.cxy, ds_copy.drop_vars("cxy")) + assert out_a.chunks == ((4, 4, 2), (5, 5, 5, 5)) + assert out_b.chunks == expected_chunks + + # TODO: following fails + # # Test unordered dims + # da = ds_copy["cxy"] + # out_a, out_b = xr.unify_chunks(da.chunk({"x": -1}), da.T.chunk({"y": -1})) + # assert out_a.chunks == ((4, 4, 2), (5, 5, 5, 5)) + # assert out_b.chunks == ((5, 5, 5, 5), (4, 4, 2)) + + # # Test mismatch + # with pytest.raises(ValueError, match=r"Dimension 'x' size mismatch: 10 != 2"): + # xr.unify_chunks(da, da.isel(x=slice(2))) + + +@pytest.mark.parametrize("obj", [make_ds(), make_da()]) +@pytest.mark.parametrize( + "transform", [lambda x: x.compute(), lambda x: 
x.unify_chunks()] +) +def test_unify_chunks_shallow_copy(obj, transform): + obj = transform(obj) + unified = obj.unify_chunks() + assert_identical(obj, unified) and obj is not obj.unify_chunks() + + +@pytest.mark.parametrize("obj", [make_da()]) +def test_auto_chunk_da(obj): + actual = obj.chunk("auto").data + expected = obj.data.rechunk("auto") + np.testing.assert_array_equal(actual, expected) + assert actual.chunks == expected.chunks diff --git a/pyproject.toml b/pyproject.toml index f3a82cb..9557751 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,11 +30,13 @@ dependencies = [ [project.optional-dependencies] test = [ + "dill", "pre-commit", "ruff", "pytest-mypy", "pytest-cov", - "pytest"] + "pytest", +] [project.urls]