From fe46e05ca4e972e12a1a5aa42096130341de6d79 Mon Sep 17 00:00:00 2001
From: Ruth Comer <10599679+rcomer@users.noreply.github.com>
Date: Thu, 18 Jan 2024 15:16:18 +0000
Subject: [PATCH] Re-write `pearsonr` to use `Resolve` (#5638)

* Re-write pearsonr to use Resolve

* add test for weight transposing case

* Fix single coordinate case

* use lenient coord handling

* Move coordinate handling tests to separate class

These do not need real and lazy versions

* Make some syntax highlighters happier

Co-authored-by: Martin Yeo <40734014+trexfeathers@users.noreply.github.com>

* Initial review actions

* Clearer comment

Co-authored-by: Martin Yeo <40734014+trexfeathers@users.noreply.github.com>

* assign reviewer blame

* first attempt at benchmarks

* directly use resolve for weight wrangling

* Add note about maths broadcasting rules

* use lhs/rhs instead of 1/2 for consistency

* convert tests to pytest

* check dask's compute is not called

* revert now redundant resolve change

* Improvements to pearsonr docstring.

---------

Co-authored-by: Martin Yeo <40734014+trexfeathers@users.noreply.github.com>
Co-authored-by: Martin Yeo
---
 benchmarks/benchmarks/stats.py                |  38 ++++
 docs/src/whatsnew/latest.rst                  |   3 +
 lib/iris/analysis/stats.py                    | 181 +++++++++---------
 .../unit/analysis/stats/test_pearsonr.py      | 171 ++++++++++++-----
 4 files changed, 252 insertions(+), 141 deletions(-)
 create mode 100644 benchmarks/benchmarks/stats.py

diff --git a/benchmarks/benchmarks/stats.py b/benchmarks/benchmarks/stats.py
new file mode 100644
index 0000000000..0530431900
--- /dev/null
+++ b/benchmarks/benchmarks/stats.py
@@ -0,0 +1,38 @@
+# Copyright Iris contributors
+#
+# This file is part of Iris and is released under the BSD license.
+# See LICENSE in the root of the repository for full licensing details.
+"""Stats benchmark tests."""
+
+import iris
+from iris.analysis.stats import pearsonr
+import iris.tests
+
+
+class PearsonR:
+    def setup(self):
+        cube_temp = iris.load_cube(
+            iris.tests.get_data_path(
+                ("NetCDF", "global", "xyt", "SMALL_total_column_co2.nc")
+            )
+        )
+
+        # Make data non-lazy.
+        cube_temp.data
+
+        self.cube_a = cube_temp[:6]
+        self.cube_b = cube_temp[20:26]
+        self.cube_b.replace_coord(self.cube_a.coord("time"))
+        for name in ["latitude", "longitude"]:
+            self.cube_b.coord(name).guess_bounds()
+        self.weights = iris.analysis.cartography.area_weights(self.cube_b)
+
+    def time_real(self):
+        pearsonr(self.cube_a, self.cube_b, weights=self.weights)
+
+    def time_lazy(self):
+        for cube in self.cube_a, self.cube_b:
+            cube.data = cube.lazy_data()
+
+        result = pearsonr(self.cube_a, self.cube_b, weights=self.weights)
+        result.data
diff --git a/docs/src/whatsnew/latest.rst b/docs/src/whatsnew/latest.rst
index 41f765bfe1..cc6427f9e0 100644
--- a/docs/src/whatsnew/latest.rst
+++ b/docs/src/whatsnew/latest.rst
@@ -104,6 +104,9 @@ This document explains the changes made to Iris for this release
    lazy data from file. This will also speed up coordinate comparison.
    (:pull:`5610`)
 
+#. `@rcomer`_ and `@trexfeathers`_ (reviewer) modified
+   :func:`~iris.analysis.stats.pearsonr` so it preserves lazy data in all cases
+   and also runs a little faster. (:pull:`5638`)
 
 🔥 Deprecations
 ===============
diff --git a/lib/iris/analysis/stats.py b/lib/iris/analysis/stats.py
index 34c37b8b11..f014dd5013 100644
--- a/lib/iris/analysis/stats.py
+++ b/lib/iris/analysis/stats.py
@@ -4,13 +4,16 @@
 # See LICENSE in the root of the repository for full licensing details.
"""Statistical operations between cubes.""" +import dask.array as da import numpy as np -import numpy.ma as ma import iris -from iris.util import broadcast_to_shape +from iris.common import SERVICES, Resolve +from iris.common.lenient import _lenient_client +from iris.util import _mask_array +@_lenient_client(services=SERVICES) def pearsonr( cube_a, cube_b, @@ -23,30 +26,32 @@ def pearsonr( Parameters ---------- - cube_a, cube_b : cubes + cube_a, cube_b : :class:`iris.cube.Cube` Cubes between which the correlation will be calculated. The cubes should either be the same shape and have the same dimension coordinates - or one cube should be broadcastable to the other. + or one cube should be broadcastable to the other. Broadcasting rules + are the same as those for cube arithmetic (see :ref:`cube maths`). corr_coords : str or list of str The cube coordinate name(s) over which to calculate correlations. If no names are provided then correlation will be calculated over all common cube dimensions. weights : :class:`numpy.ndarray`, optional - Weights array of same shape as (the smaller of) cube_a and cube_b. Note - that latitude/longitude area weights can be calculated using + Weights array of same shape as (the smaller of) `cube_a` and `cube_b`. + Note that latitude/longitude area weights can be calculated using :func:`iris.analysis.cartography.area_weights`. mdtol : float, default=1.0 Tolerance of missing data. The missing data fraction is calculated - based on the number of grid cells masked in both cube_a and cube_b. If - this fraction exceed mdtol, the returned value in the corresponding - cell is masked. mdtol=0 means no missing data is tolerated while - mdtol=1 means the resulting element will be masked if and only if all - contributing elements are masked in cube_a or cube_b. Defaults to 1. + based on the number of grid cells masked in both `cube_a` and `cube_b`. + If this fraction exceed `mdtol`, the returned value in the + corresponding cell is masked. `mdtol` =0 means no missing data is + tolerated while `mdtol` =1 means the resulting element will be masked + if and only if all contributing elements are masked in `cube_a` or + `cube_b`. common_mask : bool, default=False - If True, applies a common mask to cube_a and cube_b so only cells which - are unmasked in both cubes contribute to the calculation. If False, the - variance for each cube is calculated from all available cells. Defaults - to False. + If ``True``, applies a common mask to cube_a and cube_b so only cells + which are unmasked in both cubes contribute to the calculation. If + ``False``, the variance for each cube is calculated from all available + cells. Returns ------- @@ -56,19 +61,19 @@ def pearsonr( cubes. For example providing two time/altitude/latitude/longitude cubes and - corr_coords of 'latitude' and 'longitude' will result in a + `corr_coords` of 'latitude' and 'longitude' will result in a time/altitude cube describing the latitude/longitude (i.e. pattern) correlation at each time/altitude point. Notes ----- + If either of the input cubes has lazy data, the result will have lazy data. + Reference: https://en.wikipedia.org/wiki/Pearson_correlation_coefficient - This operation is non-lazy. - """ - # Assign larger cube to cube_1 + # Assign larger cube to cube_1 for simplicity. 
     if cube_b.ndim > cube_a.ndim:
         cube_1 = cube_b
         cube_2 = cube_a
@@ -78,90 +83,88 @@ def pearsonr(
 
     smaller_shape = cube_2.shape
 
-    dim_coords_1 = [coord.name() for coord in cube_1.dim_coords]
-    dim_coords_2 = [coord.name() for coord in cube_2.dim_coords]
-    common_dim_coords = list(set(dim_coords_1) & set(dim_coords_2))
+    # Get broadcast, auto-transposed versions of the cube operands, safe to combine.
+    resolver = Resolve(cube_1, cube_2)
+    lhs_cube_resolved = resolver.lhs_cube_resolved
+    rhs_cube_resolved = resolver.rhs_cube_resolved
+
+    if lhs_cube_resolved.has_lazy_data() or rhs_cube_resolved.has_lazy_data():
+        al = da
+        array_lhs = lhs_cube_resolved.lazy_data()
+        array_rhs = rhs_cube_resolved.lazy_data()
+    else:
+        al = np
+        array_lhs = lhs_cube_resolved.data
+        array_rhs = rhs_cube_resolved.data
+
     # If no coords were passed, default to all the cubes' common dim coords.
     if corr_coords is None:
-        corr_coords = common_dim_coords
-
-    def _ones_like(cube):
-        # Return a copy of cube with the same mask, but all data values set to 1.
-        # The operation is non-lazy.
-        # For safety we also discard any cell-measures and ancillary-variables, to
-        # avoid cube arithmetic possibly objecting to them, or inadvertently retaining
-        # them in the result where they might be inappropriate.
-        ones_cube = cube.copy()
-        ones_cube.data = np.ones_like(cube.data)
-        ones_cube.rename("unknown")
-        ones_cube.units = 1
-        for cm in ones_cube.cell_measures():
-            ones_cube.remove_cell_measure(cm)
-        for av in ones_cube.ancillary_variables():
-            ones_cube.remove_ancillary_variable(av)
-        return ones_cube
+        dim_coords_1 = {coord.name() for coord in lhs_cube_resolved.dim_coords}
+        dim_coords_2 = {coord.name() for coord in rhs_cube_resolved.dim_coords}
+        corr_coords = list(dim_coords_1.intersection(dim_coords_2))
+
+    # Interpret coords as array dimensions.
+    corr_dims = set()
+    if isinstance(corr_coords, str):
+        corr_coords = [corr_coords]
+    for coord in corr_coords:
+        corr_dims.update(lhs_cube_resolved.coord_dims(coord))
+
+    corr_dims = tuple(corr_dims)
 
     # Match up data masks if required.
     if common_mask:
-        # Create a cube of 1's with a common mask.
-        if ma.is_masked(cube_2.data):
-            mask_cube = _ones_like(cube_2)
-        else:
-            mask_cube = 1.0
-        if ma.is_masked(cube_1.data):
-            # Take a slice to avoid unnecessary broadcasting of cube_2.
-            slice_coords = [
-                dim_coords_1[i]
-                for i in range(cube_1.ndim)
-                if dim_coords_1[i] not in common_dim_coords
-                and np.array_equal(
-                    cube_1.data.mask.any(axis=i), cube_1.data.mask.all(axis=i)
-                )
-            ]
-            cube_1_slice = next(cube_1.slices_over(slice_coords))
-            mask_cube = _ones_like(cube_1_slice) * mask_cube
-        # Apply common mask to data.
-        if isinstance(mask_cube, iris.cube.Cube):
-            cube_1 = cube_1 * mask_cube
-            cube_2 = mask_cube * cube_2
-            dim_coords_2 = [coord.name() for coord in cube_2.dim_coords]
-
-    # Broadcast weights to shape of cubes if necessary.
-    if weights is None or cube_1.shape == smaller_shape:
-        weights_1 = weights
-        weights_2 = weights
+        mask_lhs = al.ma.getmaskarray(array_lhs)
+        if al is np:
+            # Reduce all invariant dimensions of mask_lhs to length 1. This avoids
+            # unnecessary broadcasting of array_rhs.
+            index = tuple(
+                slice(0, 1)
+                if np.array_equal(mask_lhs.any(axis=dim), mask_lhs.all(axis=dim))
+                else slice(None)
+                for dim in range(mask_lhs.ndim)
+            )
+            mask_lhs = mask_lhs[index]
+
+        array_rhs = _mask_array(array_rhs, mask_lhs)
+        array_lhs = _mask_array(array_lhs, al.ma.getmaskarray(array_rhs))
+
+    # Broadcast weights to shape of arrays if necessary.
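+    # The weights are wrangled via Resolve below: wrapping the raw weights as
+    # the data payload of a copy of cube_2 means they are transposed and
+    # broadcast in exactly the same way as the rhs cube's data was above.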
+    if weights is None:
+        weights_lhs = weights_rhs = None
     else:
         if weights.shape != smaller_shape:
-            raise ValueError(
-                "weights array should have dimensions {}".format(smaller_shape)
-            )
+            msg = f"weights array should have dimensions {smaller_shape}"
+            raise ValueError(msg)
 
-        dims_1_common = [
-            i for i in range(cube_1.ndim) if dim_coords_1[i] in common_dim_coords
-        ]
-        weights_1 = broadcast_to_shape(weights, cube_1.shape, dims_1_common)
-        if cube_2.shape != smaller_shape:
-            dims_2_common = [
-                i for i in range(cube_2.ndim) if dim_coords_2[i] in common_dim_coords
-            ]
-            weights_2 = broadcast_to_shape(weights, cube_2.shape, dims_2_common)
-        else:
-            weights_2 = weights
+        wt_resolver = Resolve(cube_1, cube_2.copy(weights))
+        weights = wt_resolver.rhs_cube_resolved.data
+        weights_rhs = np.broadcast_to(weights, array_rhs.shape)
+        weights_lhs = np.broadcast_to(weights, array_lhs.shape)
 
     # Calculate correlations.
-    s1 = cube_1 - cube_1.collapsed(corr_coords, iris.analysis.MEAN, weights=weights_1)
-    s2 = cube_2 - cube_2.collapsed(corr_coords, iris.analysis.MEAN, weights=weights_2)
-
-    covar = (s1 * s2).collapsed(
-        corr_coords, iris.analysis.SUM, weights=weights_1, mdtol=mdtol
+    s_lhs = array_lhs - al.ma.average(
+        array_lhs, axis=corr_dims, weights=weights_lhs, keepdims=True
+    )
+    s_rhs = array_rhs - al.ma.average(
+        array_rhs, axis=corr_dims, weights=weights_rhs, keepdims=True
     )
-    var_1 = (s1**2).collapsed(corr_coords, iris.analysis.SUM, weights=weights_1)
-    var_2 = (s2**2).collapsed(corr_coords, iris.analysis.SUM, weights=weights_2)
-    denom = iris.analysis.maths.apply_ufunc(
-        np.sqrt, var_1 * var_2, new_unit=covar.units
+    s_prod = resolver.cube(s_lhs * s_rhs)
+
+    # Use the cube's collapsed method, as it takes care of coordinate collapsing
+    # and missing-data tolerance.
+    covar = s_prod.collapsed(
+        corr_coords, iris.analysis.SUM, weights=weights_lhs, mdtol=mdtol
     )
+
+    var_lhs = iris.analysis._sum(s_lhs**2, axis=corr_dims, weights=weights_lhs)
+    var_rhs = iris.analysis._sum(s_rhs**2, axis=corr_dims, weights=weights_rhs)
+
+    denom = np.sqrt(var_lhs * var_rhs)
+
     corr_cube = covar / denom
     corr_cube.rename("Pearson's r")
+    corr_cube.units = 1
 
     return corr_cube
diff --git a/lib/iris/tests/unit/analysis/stats/test_pearsonr.py b/lib/iris/tests/unit/analysis/stats/test_pearsonr.py
index 50387e1418..d46bcd21ba 100644
--- a/lib/iris/tests/unit/analysis/stats/test_pearsonr.py
+++ b/lib/iris/tests/unit/analysis/stats/test_pearsonr.py
@@ -8,17 +8,22 @@
 # importing anything else.
 import iris.tests as tests  # isort:skip
 
+from unittest import mock
+
+import dask
+import dask.array
 import numpy as np
 import numpy.ma as ma
+import pytest
 
 import iris
+import iris._lazy_data
 import iris.analysis.stats as stats
 from iris.exceptions import CoordinateNotFoundError
 
 
-@tests.skip_data
-class Test(tests.IrisTest):
-    def setUp(self):
+class Mixin:
+    def setup_method(self):
         # 3D cubes:
         cube_temp = iris.load_cube(
             tests.get_data_path(
@@ -33,21 +38,36 @@ def setUp(self):
         cube_temp.coord("longitude").guess_bounds()
         self.weights = iris.analysis.cartography.area_weights(cube_temp)
 
-    def test_perfect_corr(self):
+
+@tests.skip_data
+class TestLazy(Mixin):
+    @pytest.fixture
+    def mocked_compute(self, monkeypatch):
+        m_compute = mock.Mock(wraps=dask.base.compute)
+
+        # The three dask compute functions are all the same function but monkeypatch
+        # does not automatically know that.
+        # https://stackoverflow.com/questions/77820437
+        monkeypatch.setattr(dask.base, dask.base.compute.__name__, m_compute)
+        monkeypatch.setattr(dask, dask.compute.__name__, m_compute)
+        monkeypatch.setattr(dask.array, dask.array.compute.__name__, m_compute)
+
+        return m_compute
+
+    def test_perfect_corr(self, mocked_compute):
         r = stats.pearsonr(self.cube_a, self.cube_a, ["latitude", "longitude"])
-        self.assertArrayEqual(r.data, np.array([1.0] * 6))
+        mocked_compute.assert_not_called()
+        np.testing.assert_array_equal(r.data, np.array([1.0] * 6))
 
-    def test_perfect_corr_all_dims(self):
+    def test_perfect_corr_all_dims(self, mocked_compute):
         r = stats.pearsonr(self.cube_a, self.cube_a)
-        self.assertArrayEqual(r.data, np.array([1.0]))
+        mocked_compute.assert_not_called()
+        np.testing.assert_array_equal(r.data, np.array([1.0]))
 
-    def test_incompatible_cubes(self):
-        with self.assertRaises(ValueError):
-            stats.pearsonr(self.cube_a[:, 0, :], self.cube_b[0, :, :], "longitude")
-
-    def test_compatible_cubes(self):
+    def test_compatible_cubes(self, mocked_compute):
         r = stats.pearsonr(self.cube_a, self.cube_b, ["latitude", "longitude"])
-        self.assertArrayAlmostEqual(
+        mocked_compute.assert_not_called()
+        np.testing.assert_array_almost_equal(
             r.data,
             [
                 0.81114936,
@@ -59,13 +79,15 @@
             ],
         )
 
-    def test_broadcast_cubes(self):
+    def test_broadcast_cubes(self, mocked_compute):
         r1 = stats.pearsonr(
             self.cube_a, self.cube_b[0, :, :], ["latitude", "longitude"]
         )
         r2 = stats.pearsonr(
             self.cube_b[0, :, :], self.cube_a, ["latitude", "longitude"]
         )
+
+        mocked_compute.assert_not_called()
         r_by_slice = [
             stats.pearsonr(
                 self.cube_a[i, :, :],
@@ -74,14 +96,16 @@ def test_broadcast_cubes(self):
             ).data
             for i in range(6)
         ]
-        self.assertArrayEqual(r1.data, np.array(r_by_slice))
-        self.assertArrayEqual(r2.data, np.array(r_by_slice))
+        np.testing.assert_array_equal(r1.data, np.array(r_by_slice))
+        np.testing.assert_array_equal(r2.data, np.array(r_by_slice))
 
-    def test_compatible_cubes_weighted(self):
+    def test_compatible_cubes_weighted(self, mocked_compute):
         r = stats.pearsonr(
             self.cube_a, self.cube_b, ["latitude", "longitude"], self.weights
         )
-        self.assertArrayAlmostEqual(
+
+        mocked_compute.assert_not_called()
+        np.testing.assert_array_almost_equal(
             r.data,
             [
                 0.79105429,
@@ -93,13 +117,15 @@ def test_compatible_cubes_weighted(self):
             ],
         )
 
-    def test_broadcast_cubes_weighted(self):
+    def test_broadcast_cubes_weighted(self, mocked_compute):
         r = stats.pearsonr(
             self.cube_a,
             self.cube_b[0, :, :],
             ["latitude", "longitude"],
             weights=self.weights[0, :, :],
         )
+
+        mocked_compute.assert_not_called()
         r_by_slice = [
             stats.pearsonr(
                 self.cube_a[i, :, :],
@@ -109,10 +135,31 @@ def test_broadcast_cubes_weighted(self):
             ).data
             for i in range(6)
         ]
-        self.assertArrayAlmostEqual(r.data, np.array(r_by_slice))
+        np.testing.assert_array_almost_equal(r.data, np.array(r_by_slice))
+
+    def test_broadcast_transpose_cubes_weighted(self, mocked_compute):
+        # Reference is calculated with no transposition.
+        r_ref = stats.pearsonr(
+            self.cube_a,
+            self.cube_b[0, :, :],
+            ["latitude", "longitude"],
+            weights=self.weights[0, :, :],
+        )
+
+        self.cube_a.transpose()
+        r_test = stats.pearsonr(
+            self.cube_a,
+            self.cube_b[0, :, :],
+            ["latitude", "longitude"],
+            weights=self.weights[0, :, :],
+        )
+
+        mocked_compute.assert_not_called()
+        # Should get the same result, but transposed.
+        np.testing.assert_array_almost_equal(r_test.data, r_ref.data.T)
 
     def test_weight_error(self):
-        with self.assertRaises(ValueError):
+        with pytest.raises(ValueError):
             stats.pearsonr(
                 self.cube_a,
                 self.cube_b[0, :, :],
@@ -120,54 +167,74 @@ def test_weight_error(self):
                 weights=self.weights,
             )
 
-    def test_non_existent_coord(self):
-        with self.assertRaises(CoordinateNotFoundError):
-            stats.pearsonr(self.cube_a, self.cube_b, "bad_coord")
-
-    def test_mdtol(self):
+    def test_mdtol(self, mocked_compute):
         cube_small = self.cube_a[:, 0, 0]
-        cube_small_masked = cube_small.copy()
-        cube_small_masked.data = ma.array(
-            cube_small.data, mask=np.array([0, 0, 0, 1, 1, 1], dtype=bool)
-        )
+        cube_small_masked = iris.util.mask_cube(cube_small, [0, 0, 0, 1, 1, 1])
         r1 = stats.pearsonr(cube_small, cube_small_masked)
         r2 = stats.pearsonr(cube_small, cube_small_masked, mdtol=0.49)
-        self.assertArrayAlmostEqual(r1.data, np.array([0.74586593]))
-        self.assertMaskedArrayEqual(r2.data, ma.array([0], mask=[True]))
 
-    def test_common_mask_simple(self):
+        mocked_compute.assert_not_called()
+        np.testing.assert_array_almost_equal(r1.data, np.array([0.74586593]))
+        tests.assert_masked_array_equal(r2.data, ma.array([0], mask=[True]))
+
+    def test_common_mask_simple(self, mocked_compute):
         cube_small = self.cube_a[:, 0, 0]
-        cube_small_masked = cube_small.copy()
-        cube_small_masked.data = ma.array(
-            cube_small.data, mask=np.array([0, 0, 0, 1, 1, 1], dtype=bool)
-        )
+        cube_small_masked = iris.util.mask_cube(cube_small, [0, 0, 0, 1, 1, 1])
         r = stats.pearsonr(cube_small, cube_small_masked, common_mask=True)
-        self.assertArrayAlmostEqual(r.data, np.array([1.0]))
 
-    def test_common_mask_broadcast(self):
-        cube_small = self.cube_a[:, 0, 0]
+        mocked_compute.assert_not_called()
+        np.testing.assert_array_almost_equal(r.data, np.array([1.0]))
+
+    def test_common_mask_broadcast(self, mocked_compute):
+        cube_small = iris.util.mask_cube(self.cube_a[:, 0, 0], [0, 0, 0, 0, 0, 1])
+        mask_2d = np.zeros((6, 2), dtype=bool)
+        # 2d mask varies on unshared coord:
+        mask_2d[0, 1] = 1
+
+        # Make a (6, 2) cube.
         cube_small_2d = self.cube_a[:, 0:2, 0]
-        cube_small.data = ma.array(
-            cube_small.data, mask=np.array([0, 0, 0, 0, 0, 1], dtype=bool)
-        )
-        cube_small_2d.data = ma.array(
-            np.tile(cube_small.data[:, np.newaxis], 2),
-            mask=np.zeros((6, 2), dtype=bool),
+        # Duplicate data along unshared coord's dimension.
+        new_data = iris.util.broadcast_to_shape(
+            cube_small.core_data(), (6, 2), dim_map=[0]
         )
-        # 2d mask varies on unshared coord:
-        cube_small_2d.data.mask[0, 1] = 1
+        cube_small_2d.data = iris.util._mask_array(new_data, mask_2d)
+
         r = stats.pearsonr(
             cube_small,
             cube_small_2d,
             weights=self.weights[:, 0, 0],
             common_mask=True,
         )
-        self.assertArrayAlmostEqual(r.data, np.array([1.0, 1.0]))
+
+        mocked_compute.assert_not_called()
+        np.testing.assert_array_almost_equal(r.data, np.array([1.0, 1.0]))
 
         # 2d mask does not vary on unshared coord:
         cube_small_2d.data.mask[0, 0] = 1
         r = stats.pearsonr(cube_small, cube_small_2d, common_mask=True)
-        self.assertArrayAlmostEqual(r.data, np.array([1.0, 1.0]))
+        np.testing.assert_array_almost_equal(r.data, np.array([1.0, 1.0]))
+
+
+class TestReal(TestLazy):
+    def setup_method(self):
+        super().setup_method()
+        for cube in [self.cube_a, self.cube_b]:
+            _ = cube.data
+
+
+class TestCoordHandling(Mixin):
+    def test_lenient_handling(self):
+        # Smoke test that mismatched var_name does not prevent operation.
+        self.cube_a.coord("time").var_name = "wibble"
+        stats.pearsonr(self.cube_a, self.cube_b)
 
-if __name__ == "__main__":
-    tests.main()
+    def test_incompatible_cubes(self):
+        with pytest.raises(ValueError):
+            stats.pearsonr(self.cube_a[:, 0, :], self.cube_b[0, :, :], "longitude")
+
+    def test_single_coord(self):
+        # Smoke test that a single coord can be passed as a single string.
+        stats.pearsonr(self.cube_a, self.cube_b, "latitude")
+
+    def test_non_existent_coord(self):
+        with pytest.raises(CoordinateNotFoundError):
+            stats.pearsonr(self.cube_a, self.cube_b, "bad_coord")
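
A quick usage sketch of the behaviour this patch guarantees (illustrative
only: the file name is hypothetical, and any broadcast-compatible pair of
cubes with latitude/longitude coordinates would do)::

    import iris
    from iris.analysis.stats import pearsonr

    cube_a = iris.load_cube("sample.nc")  # hypothetical file; data loads lazily
    cube_b = cube_a.copy(cube_a.core_data() * 2.0)  # illustrative second cube

    # The call itself triggers no dask compute, even with common_mask=True.
    r = pearsonr(cube_a, cube_b, ["latitude", "longitude"], common_mask=True)
    assert r.has_lazy_data()

    # Computation is deferred until the result is realised.
    print(r.data)

The same pattern is exercised by the new ``time_lazy`` benchmark and the
``mocked_compute`` tests in this patch.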