Re-write pearsonr to use Resolve

SciTools · Dec 15, 2023 · ad93a3e · ad93a3e
1 parent d0ee9c2
commit ad93a3e
Show file tree

Hide file tree

Showing 8 changed files with 121 additions and 115 deletions.
diff --git a/docs/src/whatsnew/latest.rst b/docs/src/whatsnew/latest.rst
@@ -96,6 +96,8 @@ This document explains the changes made to Iris for this release
    lazy data from file. This will also speed up coordinate comparison.
    (:pull:`5610`)
 
+#. `@rcomer`_ modified :func:`~iris.analysis.stats.pearsonr` so it preserves
+   lazy data in all cases and also runs a little faster.  (:pull:`5638`)
 
 🔥 Deprecations
 ===============
@@ -106,8 +108,8 @@ This document explains the changes made to Iris for this release
 🔗 Dependencies
 ===============
 
-#. `@bjlittle`_ enforced the minimum pin of ``numpy>1.21`` in accordance with the `NEP29 Drop Schedule`_.
-   (:pull:`5525`)
+#. `@bjlittle`_  and `@rcomer`_ enforced the minimum pin of ``numpy>1.22`` in
+   accordance with the `NEP29 Drop Schedule`_. (:pull:`5525`, :pull:`5638`)
 
 
 📚 Documentation

diff --git a/lib/iris/analysis/stats.py b/lib/iris/analysis/stats.py
@@ -2,15 +2,14 @@
 #
 # This file is part of Iris and is released under the BSD license.
 # See LICENSE in the root of the repository for full licensing details.
-"""Statistical operations between cubes.
-
-"""
+"""Statistical operations between cubes."""
 
+import dask.array as da
 import numpy as np
-import numpy.ma as ma
 
 import iris
-from iris.util import broadcast_to_shape
+from iris.common import Resolve
+from iris.util import _mask_array
 
 
 def pearsonr(
@@ -21,37 +20,37 @@ def pearsonr(
     mdtol=1.0,
     common_mask=False,
 ):
-    """Calculate the Pearson's r correlation coefficient over specified
-    dimensions.
-
-    Args:
+    """Calculate the Pearson's r correlation coefficient over specified dimensions.
 
-    * cube_a, cube_b (cubes):
+    Parameters
+    ----------
+    cube_a,cube_b : iris.cube.Cube
         Cubes between which the correlation will be calculated.  The cubes
         should either be the same shape and have the same dimension coordinates
         or one cube should be broadcastable to the other.
-    * corr_coords (str or list of str):
+    corr_coords : str or list of str
         The cube coordinate name(s) over which to calculate correlations. If no
         names are provided then correlation will be calculated over all common
         cube dimensions.
-    * weights (numpy.ndarray, optional):
+    weights : numpy.ndarray, optional
         Weights array of same shape as (the smaller of) cube_a and cube_b. Note
         that latitude/longitude area weights can be calculated using
         :func:`iris.analysis.cartography.area_weights`.
-    * mdtol (float, optional):
+    mdtol : float, default=1
         Tolerance of missing data. The missing data fraction is calculated
         based on the number of grid cells masked in both cube_a and cube_b. If
         this fraction exceed mdtol, the returned value in the corresponding
         cell is masked. mdtol=0 means no missing data is tolerated while
         mdtol=1 means the resulting element will be masked if and only if all
-        contributing elements are masked in cube_a or cube_b. Defaults to 1.
-    * common_mask (bool):
+        contributing elements are masked in cube_a or cube_b.
+    common_mask : bool, default=False
         If True, applies a common mask to cube_a and cube_b so only cells which
         are unmasked in both cubes contribute to the calculation. If False, the
-        variance for each cube is calculated from all available cells. Defaults
-        to False.
+        variance for each cube is calculated from all available cells.
 
-    Returns:
+    Returns
+    -------
+    iris.cube.Cube
         A cube of the correlation between the two input cubes along the
         specified dimensions, at each point in the remaining dimensions of the
         cubes.
@@ -61,14 +60,15 @@ def pearsonr(
         time/altitude cube describing the latitude/longitude (i.e. pattern)
         correlation at each time/altitude point.
 
+    Notes
+    -----
+    If either of the input cubes has lazy data, the result will have lazy data.
+
     Reference:
         https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
 
-    This operation is non-lazy.
-
     """
-
-    # Assign larger cube to cube_1
+    # Assign larger cube to cube_1 for simplicity.
     if cube_b.ndim > cube_a.ndim:
         cube_1 = cube_b
         cube_2 = cube_a
@@ -78,90 +78,91 @@ def pearsonr(
 
     smaller_shape = cube_2.shape
 
-    dim_coords_1 = [coord.name() for coord in cube_1.dim_coords]
-    dim_coords_2 = [coord.name() for coord in cube_2.dim_coords]
-    common_dim_coords = list(set(dim_coords_1) & set(dim_coords_2))
+    # Get the broadcast, auto-transposed safe versions of the cube operands.
+    resolver = Resolve(cube_1, cube_2)
+    cube_1 = resolver.lhs_cube_resolved
+    cube_2 = resolver.rhs_cube_resolved
+
+    if cube_1.has_lazy_data() or cube_2.has_lazy_data():
+        al = da
+        array_1 = cube_1.lazy_data()
+        array_2 = cube_2.lazy_data()
+    else:
+        al = np
+        array_1 = cube_1.data
+        array_2 = cube_2.data
+
     # If no coords passed then set to all common dimcoords of cubes.
     if corr_coords is None:
-        corr_coords = common_dim_coords
-
-    def _ones_like(cube):
-        # Return a copy of cube with the same mask, but all data values set to 1.
-        # The operation is non-lazy.
-        # For safety we also discard any cell-measures and ancillary-variables, to
-        # avoid cube arithmetic possibly objecting to them, or inadvertently retaining
-        # them in the result where they might be inappropriate.
-        ones_cube = cube.copy()
-        ones_cube.data = np.ones_like(cube.data)
-        ones_cube.rename("unknown")
-        ones_cube.units = 1
-        for cm in ones_cube.cell_measures():
-            ones_cube.remove_cell_measure(cm)
-        for av in ones_cube.ancillary_variables():
-            ones_cube.remove_ancillary_variable(av)
-        return ones_cube
+        dim_coords_1 = {coord.name() for coord in cube_1.dim_coords}
+        dim_coords_2 = {coord.name() for coord in cube_2.dim_coords}
+        corr_coords = list(dim_coords_1.intersection(dim_coords_2))
+
+    # Interpret coords as array dimensions.
+    corr_dims = set()
+    for coord in corr_coords:
+        corr_dims.update(cube_1.coord_dims(coord))
+
+    corr_dims = tuple(corr_dims)
 
     # Match up data masks if required.
     if common_mask:
-        # Create a cube of 1's with a common mask.
-        if ma.is_masked(cube_2.data):
-            mask_cube = _ones_like(cube_2)
-        else:
-            mask_cube = 1.0
-        if ma.is_masked(cube_1.data):
-            # Take a slice to avoid unnecessary broadcasting of cube_2.
-            slice_coords = [
-                dim_coords_1[i]
-                for i in range(cube_1.ndim)
-                if dim_coords_1[i] not in common_dim_coords
-                and np.array_equal(
-                    cube_1.data.mask.any(axis=i), cube_1.data.mask.all(axis=i)
-                )
-            ]
-            cube_1_slice = next(cube_1.slices_over(slice_coords))
-            mask_cube = _ones_like(cube_1_slice) * mask_cube
-        # Apply common mask to data.
-        if isinstance(mask_cube, iris.cube.Cube):
-            cube_1 = cube_1 * mask_cube
-            cube_2 = mask_cube * cube_2
-            dim_coords_2 = [coord.name() for coord in cube_2.dim_coords]
-
-    # Broadcast weights to shape of cubes if necessary.
-    if weights is None or cube_1.shape == smaller_shape:
-        weights_1 = weights
-        weights_2 = weights
+        mask_1 = al.ma.getmaskarray(array_1)
+        if al is np:
+            # Reduce all invariant dimensions of mask_1 to length 1.  This avoids
+            # unnecessary broadcasting of array_2.
+            index = tuple(
+                slice(0, 1)
+                if np.array_equal(mask_1.any(axis=dim), mask_1.all(axis=dim))
+                else slice(None)
+                for dim in range(mask_1.ndim)
+            )
+            mask_1 = mask_1[index]
+
+        array_2 = _mask_array(array_2, mask_1)
+        array_1 = _mask_array(array_1, al.ma.getmaskarray(array_2))
+
+    # Broadcast weights to shape of arrays if necessary.
+    if weights is None:
+        weights_1 = weights_2 = None
     else:
         if weights.shape != smaller_shape:
-            raise ValueError(
-                "weights array should have dimensions {}".format(smaller_shape)
-            )
+            msg = f"weights array should have dimensions {smaller_shape}"
+            raise ValueError(msg)
 
-        dims_1_common = [
-            i for i in range(cube_1.ndim) if dim_coords_1[i] in common_dim_coords
-        ]
-        weights_1 = broadcast_to_shape(weights, cube_1.shape, dims_1_common)
-        if cube_2.shape != smaller_shape:
-            dims_2_common = [
-                i for i in range(cube_2.ndim) if dim_coords_2[i] in common_dim_coords
-            ]
-            weights_2 = broadcast_to_shape(weights, cube_2.shape, dims_2_common)
-        else:
-            weights_2 = weights
+        if resolver.reorder_src_dims is not None:
+            # Apply same transposition as was done to cube_2 within Resolve.
+            weights = weights.transpose(resolver.reorder_src_dims)
+
+        # Reshape to add in any length-1 dimensions needed for broadcasting.
+        weights = weights.reshape(cube_2.shape)
+
+        weights_2 = np.broadcast_to(weights, array_2.shape)
+        weights_1 = np.broadcast_to(weights, array_1.shape)
 
     # Calculate correlations.
-    s1 = cube_1 - cube_1.collapsed(corr_coords, iris.analysis.MEAN, weights=weights_1)
-    s2 = cube_2 - cube_2.collapsed(corr_coords, iris.analysis.MEAN, weights=weights_2)
+    s1 = array_1 - al.ma.average(
+        array_1, axis=corr_dims, weights=weights_1, keepdims=True
+    )
+    s2 = array_2 - al.ma.average(
+        array_2, axis=corr_dims, weights=weights_2, keepdims=True
+    )
+
+    s_prod = resolver.cube(s1 * s2)
 
-    covar = (s1 * s2).collapsed(
+    # Use cube collapsed method as it takes care of coordinate collapsing and missing
+    # data tolerance.
+    covar = s_prod.collapsed(
         corr_coords, iris.analysis.SUM, weights=weights_1, mdtol=mdtol
     )
-    var_1 = (s1**2).collapsed(corr_coords, iris.analysis.SUM, weights=weights_1)
-    var_2 = (s2**2).collapsed(corr_coords, iris.analysis.SUM, weights=weights_2)
 
-    denom = iris.analysis.maths.apply_ufunc(
-        np.sqrt, var_1 * var_2, new_unit=covar.units
-    )
+    var_1 = iris.analysis._sum(s1**2, axis=corr_dims, weights=weights_1)
+    var_2 = iris.analysis._sum(s2**2, axis=corr_dims, weights=weights_2)
+
+    denom = np.sqrt(var_1 * var_2)
+
     corr_cube = covar / denom
     corr_cube.rename("Pearson's r")
+    corr_cube.units = 1
 
     return corr_cube
diff --git a/lib/iris/common/resolve.py b/lib/iris/common/resolve.py
@@ -304,6 +304,7 @@ def __init__(self, lhs=None, rhs=None):
         #: operands, where the direction of the mapping is governed by
         #: :attr:`~iris.common.resolve.Resolve.map_rhs_to_lhs`.
         self.mapping = None  # set in _metadata_mapping
+        self.reorder_src_dims = None  # set in _as_compatible_cubes
 
         #: Cache containing a list of dim, aux and scalar coordinates prepared
         #: and ready for creating and attaching to the resultant resolved
@@ -440,6 +441,7 @@ def _as_compatible_cubes(self):
 
         # Determine whether a transpose of the src cube is necessary.
         if order != sorted(order):
+            self.reorder_src_dims = order
             new_src_data = new_src_data.transpose(order)
             logger.debug(
                 f"transpose src {self._src_cube_position} cube with order {order}"

diff --git a/lib/iris/tests/unit/analysis/stats/test_pearsonr.py b/lib/iris/tests/unit/analysis/stats/test_pearsonr.py
@@ -12,12 +12,13 @@
 import numpy.ma as ma
 
 import iris
+import iris._lazy_data
 import iris.analysis.stats as stats
 from iris.exceptions import CoordinateNotFoundError
 
 
 @tests.skip_data
-class Test(tests.IrisTest):
+class TestLazy(tests.IrisTest):
     def setUp(self):
         # 3D cubes:
         cube_temp = iris.load_cube(
@@ -126,36 +127,29 @@ def test_non_existent_coord(self):
 
     def test_mdtol(self):
         cube_small = self.cube_a[:, 0, 0]
-        cube_small_masked = cube_small.copy()
-        cube_small_masked.data = ma.array(
-            cube_small.data, mask=np.array([0, 0, 0, 1, 1, 1], dtype=bool)
-        )
+        cube_small_masked = iris.util.mask_cube(cube_small, [0, 0, 0, 1, 1, 1])
         r1 = stats.pearsonr(cube_small, cube_small_masked)
         r2 = stats.pearsonr(cube_small, cube_small_masked, mdtol=0.49)
         self.assertArrayAlmostEqual(r1.data, np.array([0.74586593]))
         self.assertMaskedArrayEqual(r2.data, ma.array([0], mask=[True]))
 
     def test_common_mask_simple(self):
         cube_small = self.cube_a[:, 0, 0]
-        cube_small_masked = cube_small.copy()
-        cube_small_masked.data = ma.array(
-            cube_small.data, mask=np.array([0, 0, 0, 1, 1, 1], dtype=bool)
-        )
+        cube_small_masked = iris.util.mask_cube(cube_small, [0, 0, 0, 1, 1, 1])
         r = stats.pearsonr(cube_small, cube_small_masked, common_mask=True)
         self.assertArrayAlmostEqual(r.data, np.array([1.0]))
 
     def test_common_mask_broadcast(self):
-        cube_small = self.cube_a[:, 0, 0]
+        cube_small = iris.util.mask_cube(self.cube_a[:, 0, 0], [0, 0, 0, 0, 0, 1])
+        mask_2d = np.zeros((6, 2), dtype=bool)
+        # 2d mask varies on unshared coord:
+        mask_2d[0, 1] = 1
+
         cube_small_2d = self.cube_a[:, 0:2, 0]
-        cube_small.data = ma.array(
-            cube_small.data, mask=np.array([0, 0, 0, 0, 0, 1], dtype=bool)
-        )
-        cube_small_2d.data = ma.array(
-            np.tile(cube_small.data[:, np.newaxis], 2),
-            mask=np.zeros((6, 2), dtype=bool),
+        cube_small_2d.data = iris.util._mask_array(
+            cube_small.core_data().reshape(6, 1), mask_2d
         )
-        # 2d mask varies on unshared coord:
-        cube_small_2d.data.mask[0, 1] = 1
+
         r = stats.pearsonr(
             cube_small,
             cube_small_2d,
@@ -169,5 +163,12 @@ def test_common_mask_broadcast(self):
         self.assertArrayAlmostEqual(r.data, np.array([1.0, 1.0]))
 
 
+class TestReal(TestLazy):
+    def setUp(self):
+        super().setUp()
+        for cube in [self.cube_a, self.cube_b]:
+            cube.data
+
+
 if __name__ == "__main__":
     tests.main()
diff --git a/requirements/py310.yml b/requirements/py310.yml
@@ -18,7 +18,7 @@ dependencies:
   - libnetcdf !=4.9.1
   - matplotlib-base >=3.5
   - netcdf4
-  - numpy >1.21, !=1.24.3
+  - numpy >1.22, !=1.24.3
   - python-xxhash
   - pyproj
   - scipy

diff --git a/requirements/py311.yml b/requirements/py311.yml
@@ -18,7 +18,7 @@ dependencies:
   - libnetcdf !=4.9.1
   - matplotlib-base >=3.5
   - netcdf4
-  - numpy >1.21, !=1.24.3
+  - numpy >1.22, !=1.24.3
   - python-xxhash
   - pyproj
   - scipy

diff --git a/requirements/py39.yml b/requirements/py39.yml
@@ -18,7 +18,7 @@ dependencies:
   - libnetcdf !=4.9.1
   - matplotlib-base >=3.5
   - netcdf4
-  - numpy >1.21, !=1.24.3
+  - numpy >1.22, !=1.24.3
   - python-xxhash
   - pyproj
   - scipy

diff --git a/requirements/pypi-core.txt b/requirements/pypi-core.txt
@@ -5,7 +5,7 @@ dask[array]>=2022.9.0
 # libnetcdf!=4.9.1 (not available on PyPI)
 matplotlib>=3.5
 netcdf4
-numpy>1.21,!=1.24.3
+numpy>1.22,!=1.24.3
 pyproj
 scipy
 shapely!=1.8.3