def example_dataarray(shape=(5, 20)):
    """Build a random ``xr.DataArray`` named ``"T"``.

    Parameters
    ----------
    shape : tuple of int
        Shape of the generated array; one dimension is created per entry.

    Returns
    -------
    xr.DataArray
        Samples drawn with ``np.random.randn`` and dimensions named
        ``dim_0``, ``dim_1``, ... in the order of ``shape``.
    """
    data = np.random.randn(*shape)
    dims = [f"dim_{i}" for i in range(len(shape))]
    return xr.DataArray(data, dims=dims, name="T")


def example_dataset(n_dim=2, n_vars=2):
    """Random dataset with every variable having the same shape.

    Parameters
    ----------
    n_dim : int
        Number of dimensions; dimension ``dim_i`` has length ``8 + i``.
    n_vars : int
        Number of data variables to create.

    Returns
    -------
    xr.Dataset
        Dataset whose variables are named with random ``uuid4`` hex strings
        and filled with samples drawn with ``np.random.randn``.
    """
    shape = tuple(range(8, 8 + n_dim))
    dims = [f"dim_{i}" for i in range(n_dim)]

    ds = xr.Dataset()
    for _ in range(n_vars):
        # A fresh random name per variable keeps the names unique, so no
        # variable overwrites an earlier one.
        name = uuid.uuid4().hex
        ds[name] = xr.DataArray(np.random.randn(*shape), dims=dims, name=name)
    return ds
@pytest.mark.parametrize("weights", [False, True])
@pytest.mark.parametrize("chunksize", [1, 2, 3, 10])
@pytest.mark.parametrize("shape", [(10,), (10, 4)])
def test_chunked_weights(chunksize, shape, weights):
    """Compare a chunked histogram, with or without weights, to ``np.histogram``.

    The data (and the weights, when enabled) are chunked with the
    one-element spec ``(chunksize,)``.
    """
    chunk_spec = (chunksize,)
    data = example_dataarray(shape).chunk(chunk_spec)

    # ``weights`` arrives as a boolean switch; build the real weights lazily.
    weight_da = example_dataarray(shape).chunk(chunk_spec) if weights else None
    weight_values = None if weight_da is None else weight_da.values

    n_bins = 6
    edges = np.linspace(-4, 4, n_bins + 1)

    result = histogram(data, bins=[edges], weights=weight_da)
    assert result.shape == (n_bins,)

    expected = np.histogram(data.values, bins=edges, weights=weight_values)[0]
    np.testing.assert_allclose(expected, result.values)


@pytest.mark.parametrize("xchunksize", [1, 2, 3, 10])
@pytest.mark.parametrize("ychunksize", [1, 2, 3, 12])
class TestFixedSize2DChunks:
    """Histogram a (10, 12) array for a grid of chunk sizes per dimension."""

    def test_2d_chunks(self, xchunksize, ychunksize):
        """A histogram with no ``dim`` argument matches ``np.histogram``."""
        chunked = example_dataarray(shape=(10, 12)).chunk((xchunksize, ychunksize))

        n_bins = 8
        edges = np.linspace(-4, 4, n_bins + 1)

        result = histogram(chunked, bins=[edges])
        assert result.shape == (n_bins,)

        expected = np.histogram(chunked.values, bins=edges)[0]
        np.testing.assert_allclose(expected, result.values)

    @pytest.mark.parametrize("reduce_dim", ["dim_0", "dim_1"])
    def test_2d_chunks_broadcast_dim(
        self,
        xchunksize,
        ychunksize,
        reduce_dim,
    ):
        """Reducing along one dimension matches a per-slice ``np.histogram``."""
        chunked = example_dataarray(shape=(10, 12)).chunk((xchunksize, ychunksize))
        dim_names = list(chunked.dims)
        kept_dim = next(name for name in dim_names if name != reduce_dim)

        n_bins = 8
        edges = np.linspace(-4, 4, n_bins + 1)

        result = histogram(chunked, bins=[edges], dim=(reduce_dim,))
        assert result.shape == (chunked.sizes[kept_dim], n_bins)

        def counts_only(values, **hist_kwargs):
            # np.histogram returns (counts, edges); only the counts are compared.
            return np.histogram(values, **hist_kwargs)[0]

        expected = np.apply_along_axis(
            counts_only,
            dim_names.index(reduce_dim),
            chunked.values,
            bins=edges,
        )

        # When dim_0 is reduced the reference has the bin axis first, so the
        # result is transposed before comparing.
        actual = result.transpose() if reduce_dim == "dim_0" else result
        np.testing.assert_allclose(expected, actual.values)

    def test_2d_chunks_2d_hist(self, xchunksize, ychunksize):
        """A joint histogram of two chunked arrays matches ``np.histogram2d``."""
        chunk_spec = (xchunksize, ychunksize)
        first = example_dataarray(shape=(10, 12)).chunk(chunk_spec)
        second = example_dataarray(shape=(10, 12)).chunk(chunk_spec)

        n_bins_first, n_bins_second = 8, 9
        edges = [
            np.linspace(-4, 4, n_bins_first + 1),
            np.linspace(-4, 4, n_bins_second + 1),
        ]

        result = histogram(first, second, bins=edges)
        assert result.shape == (n_bins_first, n_bins_second)

        expected = np.histogram2d(
            first.values.ravel(),
            second.values.ravel(),
            bins=edges,
        )[0]
        np.testing.assert_allclose(expected, result.values)


@pytest.mark.parametrize("xchunksize", [1, 2, 3, 10])
@pytest.mark.parametrize("ychunksize", [1, 2, 3, 12])
class TestUnalignedChunks:
    """Histogram inputs whose chunk specs differ by one along each dimension."""

    def test_unaligned_data_chunks(self, xchunksize, ychunksize):
        """Two data arrays with different chunk specs match ``np.histogram2d``."""
        first = example_dataarray(shape=(10, 12)).chunk((xchunksize, ychunksize))
        second = example_dataarray(shape=(10, 12)).chunk(
            (xchunksize + 1, ychunksize + 1)
        )

        n_bins_first, n_bins_second = 8, 9
        edges = [
            np.linspace(-4, 4, n_bins_first + 1),
            np.linspace(-4, 4, n_bins_second + 1),
        ]

        result = histogram(first, second, bins=edges)
        assert result.shape == (n_bins_first, n_bins_second)

        expected = np.histogram2d(
            first.values.ravel(),
            second.values.ravel(),
            bins=edges,
        )[0]
        np.testing.assert_allclose(expected, result.values)

    def test_unaligned_weights_chunks(self, xchunksize, ychunksize):
        """Weights chunked differently from the data match ``np.histogram``."""
        data = example_dataarray(shape=(10, 12)).chunk((xchunksize, ychunksize))
        weight_da = example_dataarray(shape=(10, 12)).chunk(
            (xchunksize + 1, ychunksize + 1)
        )

        n_bins = 8
        edges = np.linspace(-4, 4, n_bins + 1)

        result = histogram(data, bins=[edges], weights=weight_da)
        assert result.shape == (n_bins,)

        expected = np.histogram(data.values, bins=edges, weights=weight_da.values)[0]
        np.testing.assert_allclose(expected, result.values)
@st.composite
def chunk_shapes(draw, n_dim=3, max_arr_len=10):
    """Generate different chunking patterns for an N-D array of data.

    Parameters
    ----------
    draw : callable
        Draw function supplied by ``st.composite``.
    n_dim : int
        Number of chunk lengths to draw (one per dimension).
    max_arr_len : int
        Upper bound (inclusive) for each drawn chunk length; the lower
        bound is 1.

    Returns
    -------
    tuple of int
        ``n_dim`` independently drawn chunk lengths.
    """
    chunk_length = st.integers(min_value=1, max_value=max_arr_len)
    return tuple(draw(chunk_length) for _ in range(n_dim))


class TestChunkingHypotheses:
    """Property-based checks that chunked histograms match the numpy reference."""

    @given(chunk_shapes(n_dim=1, max_arr_len=20))
    def test_all_chunking_patterns_1d(self, chunks):
        """A 1D histogram of a length-20 array matches ``np.histogram``."""
        data = example_dataarray(shape=(20,)).chunk(chunks)

        nbins_a = 8
        bins = np.linspace(-4, 4, nbins_a + 1)

        h = histogram(data, bins=[bins])

        assert h.shape == (nbins_a,)

        hist, _ = np.histogram(data.values, bins=bins)

        # Compare plain arrays, as every other test in this suite does.
        np.testing.assert_allclose(hist, h.values)

    # TODO mark as slow?
    @given(chunk_shapes(n_dim=2, max_arr_len=8))
    def test_all_chunking_patterns_2d(self, chunks):
        """A joint histogram of two (5, 20) arrays matches ``np.histogram2d``."""
        data_a = example_dataarray(shape=(5, 20)).chunk(chunks)
        data_b = example_dataarray(shape=(5, 20)).chunk(chunks)

        nbins_a = 8
        nbins_b = 9
        bins_a = np.linspace(-4, 4, nbins_a + 1)
        bins_b = np.linspace(-4, 4, nbins_b + 1)

        h = histogram(data_a, data_b, bins=[bins_a, bins_b])

        assert h.shape == (nbins_a, nbins_b)

        hist, _, _ = np.histogram2d(
            data_a.values.ravel(),
            data_b.values.ravel(),
            bins=[bins_a, bins_b],
        )

        np.testing.assert_allclose(hist, h.values)

    # TODO mark as slow?
    @pytest.mark.parametrize("n_vars", [1, 2, 3, 4])
    @given(chunk_shapes(n_dim=2, max_arr_len=7))
    def test_all_chunking_patterns_dd_hist(self, n_vars, chunk_shapes):
        """An N-variable histogram of a 2D dataset matches ``np.histogramdd``.

        NOTE: the ``chunk_shapes`` parameter holds the drawn tuple of chunk
        lengths and shadows the strategy of the same name inside this method.
        """
        ds = example_dataset(n_dim=2, n_vars=n_vars)
        # Iterating ``ds.dims`` yields the dimension names; pair each with
        # its drawn chunk length.
        ds = ds.chunk(dict(zip(ds.dims, chunk_shapes)))

        n_bins = (7, 8, 9, 10)[:n_vars]
        bins = [np.linspace(-4, 4, n + 1) for n in n_bins]

        # Collect the variables once; the same ordering feeds both the
        # histogram under test and the numpy reference.
        variables = list(ds.data_vars.values())

        h = histogram(*variables, bins=bins)

        assert h.shape == n_bins

        # np.histogramdd takes samples as rows: one column per variable.
        input_data = np.stack([da.values.ravel() for da in variables], axis=-1)
        hist, _ = np.histogramdd(input_data, bins=bins)

        np.testing.assert_allclose(hist, h.values)