Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Re-write pearsonr to use Resolve (#5638)

Merged
merged 18 commits into from
Jan 18, 2024
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/src/whatsnew/latest.rst
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ This document explains the changes made to Iris for this release
lazy data from file. This will also speed up coordinate comparison.
(:pull:`5610`)

#. `@rcomer`_ modified :func:`~iris.analysis.stats.pearsonr` so it preserves
lazy data in all cases and also runs a little faster. (:pull:`5638`)
trexfeathers marked this conversation as resolved.
Show resolved Hide resolved

🔥 Deprecations
===============
Expand Down
154 changes: 80 additions & 74 deletions lib/iris/analysis/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,16 @@
# See LICENSE in the root of the repository for full licensing details.
"""Statistical operations between cubes."""

import dask.array as da
import numpy as np
import numpy.ma as ma

import iris
from iris.util import broadcast_to_shape
from iris.common import SERVICES, Resolve
from iris.common.lenient import _lenient_client
from iris.util import _mask_array


@_lenient_client(services=SERVICES)
trexfeathers marked this conversation as resolved.
Show resolved Hide resolved
def pearsonr(
cube_a,
cube_b,
Expand Down Expand Up @@ -63,13 +66,13 @@ def pearsonr(

Notes
-----
If either of the input cubes has lazy data, the result will have lazy data.

Reference:
https://en.wikipedia.org/wiki/Pearson_correlation_coefficient

This operation is non-lazy.

"""
# Assign larger cube to cube_1
# Assign larger cube to cube_1 for simplicity.
if cube_b.ndim > cube_a.ndim:
cube_1 = cube_b
cube_2 = cube_a
Expand All @@ -79,90 +82,93 @@ def pearsonr(

smaller_shape = cube_2.shape

dim_coords_1 = [coord.name() for coord in cube_1.dim_coords]
dim_coords_2 = [coord.name() for coord in cube_2.dim_coords]
common_dim_coords = list(set(dim_coords_1) & set(dim_coords_2))
# Get the broadcast, auto-transposed safe versions of the cube operands.
resolver = Resolve(cube_1, cube_2)
cube_1 = resolver.lhs_cube_resolved
cube_2 = resolver.rhs_cube_resolved
trexfeathers marked this conversation as resolved.
Show resolved Hide resolved

if cube_1.has_lazy_data() or cube_2.has_lazy_data():
al = da
array_1 = cube_1.lazy_data()
array_2 = cube_2.lazy_data()
else:
al = np
array_1 = cube_1.data
array_2 = cube_2.data

# If no coords passed then set to all common dimcoords of cubes.
if corr_coords is None:
corr_coords = common_dim_coords

def _ones_like(cube):
    """Return a copy of *cube* with the same mask, but all data values set to 1.

    The operation is non-lazy (``cube.data`` is accessed directly, which
    realises any lazy data).
    """
    # For safety we also discard any cell-measures and ancillary-variables, to
    # avoid cube arithmetic possibly objecting to them, or inadvertently retaining
    # them in the result where they might be inappropriate.
    ones_cube = cube.copy()
    # np.ones_like preserves the masked-array type (and hence the mask) while
    # replacing every data value with 1.
    ones_cube.data = np.ones_like(cube.data)
    # Reset metadata: the result is a pure mask-carrier, not a physical quantity.
    ones_cube.rename("unknown")
    ones_cube.units = 1
    for cm in ones_cube.cell_measures():
        ones_cube.remove_cell_measure(cm)
    for av in ones_cube.ancillary_variables():
        ones_cube.remove_ancillary_variable(av)
    return ones_cube
dim_coords_1 = {coord.name() for coord in cube_1.dim_coords}
dim_coords_2 = {coord.name() for coord in cube_2.dim_coords}
corr_coords = list(dim_coords_1.intersection(dim_coords_2))

# Interpret coords as array dimensions.
corr_dims = set()
if isinstance(corr_coords, str):
corr_coords = [corr_coords]
for coord in corr_coords:
corr_dims.update(cube_1.coord_dims(coord))

corr_dims = tuple(corr_dims)

# Match up data masks if required.
if common_mask:
# Create a cube of 1's with a common mask.
if ma.is_masked(cube_2.data):
mask_cube = _ones_like(cube_2)
else:
mask_cube = 1.0
if ma.is_masked(cube_1.data):
# Take a slice to avoid unnecessary broadcasting of cube_2.
slice_coords = [
dim_coords_1[i]
for i in range(cube_1.ndim)
if dim_coords_1[i] not in common_dim_coords
and np.array_equal(
cube_1.data.mask.any(axis=i), cube_1.data.mask.all(axis=i)
)
]
cube_1_slice = next(cube_1.slices_over(slice_coords))
mask_cube = _ones_like(cube_1_slice) * mask_cube
# Apply common mask to data.
if isinstance(mask_cube, iris.cube.Cube):
cube_1 = cube_1 * mask_cube
cube_2 = mask_cube * cube_2
dim_coords_2 = [coord.name() for coord in cube_2.dim_coords]

# Broadcast weights to shape of cubes if necessary.
if weights is None or cube_1.shape == smaller_shape:
weights_1 = weights
weights_2 = weights
mask_1 = al.ma.getmaskarray(array_1)
if al is np:
# Reduce all invariant dimensions of mask_1 to length 1. This avoids
# unnecessary broadcasting of array_2.
index = tuple(
slice(0, 1)
if np.array_equal(mask_1.any(axis=dim), mask_1.all(axis=dim))
else slice(None)
for dim in range(mask_1.ndim)
)
mask_1 = mask_1[index]

array_2 = _mask_array(array_2, mask_1)
array_1 = _mask_array(array_1, al.ma.getmaskarray(array_2))
trexfeathers marked this conversation as resolved.
Show resolved Hide resolved

# Broadcast weights to shape of arrays if necessary.
if weights is None:
weights_1 = weights_2 = None
else:
if weights.shape != smaller_shape:
raise ValueError(
"weights array should have dimensions {}".format(smaller_shape)
)
msg = f"weights array should have dimensions {smaller_shape}"
raise ValueError(msg)

if resolver.reorder_src_dims is not None:
# Apply same transposition as was done to cube_2 within Resolve.
weights = weights.transpose(resolver.reorder_src_dims)

# Reshape to add in any length-1 dimensions needed for broadcasting.
rcomer marked this conversation as resolved.
Show resolved Hide resolved
weights = weights.reshape(cube_2.shape)

dims_1_common = [
i for i in range(cube_1.ndim) if dim_coords_1[i] in common_dim_coords
]
weights_1 = broadcast_to_shape(weights, cube_1.shape, dims_1_common)
if cube_2.shape != smaller_shape:
dims_2_common = [
i for i in range(cube_2.ndim) if dim_coords_2[i] in common_dim_coords
]
weights_2 = broadcast_to_shape(weights, cube_2.shape, dims_2_common)
else:
weights_2 = weights
weights_2 = np.broadcast_to(weights, array_2.shape)
weights_1 = np.broadcast_to(weights, array_1.shape)
trexfeathers marked this conversation as resolved.
Show resolved Hide resolved

# Calculate correlations.
s1 = cube_1 - cube_1.collapsed(corr_coords, iris.analysis.MEAN, weights=weights_1)
s2 = cube_2 - cube_2.collapsed(corr_coords, iris.analysis.MEAN, weights=weights_2)
s1 = array_1 - al.ma.average(
array_1, axis=corr_dims, weights=weights_1, keepdims=True
)
s2 = array_2 - al.ma.average(
array_2, axis=corr_dims, weights=weights_2, keepdims=True
)

covar = (s1 * s2).collapsed(
s_prod = resolver.cube(s1 * s2)

# Use cube collapsed method as it takes care of coordinate collapsing and missing
# data tolerance.
covar = s_prod.collapsed(
corr_coords, iris.analysis.SUM, weights=weights_1, mdtol=mdtol
)
var_1 = (s1**2).collapsed(corr_coords, iris.analysis.SUM, weights=weights_1)
var_2 = (s2**2).collapsed(corr_coords, iris.analysis.SUM, weights=weights_2)

denom = iris.analysis.maths.apply_ufunc(
np.sqrt, var_1 * var_2, new_unit=covar.units
)
var_1 = iris.analysis._sum(s1**2, axis=corr_dims, weights=weights_1)
var_2 = iris.analysis._sum(s2**2, axis=corr_dims, weights=weights_2)

denom = np.sqrt(var_1 * var_2)

corr_cube = covar / denom
corr_cube.rename("Pearson's r")
corr_cube.units = 1

return corr_cube
2 changes: 2 additions & 0 deletions lib/iris/common/resolve.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,7 @@ def __init__(self, lhs=None, rhs=None):
#: operands, where the direction of the mapping is governed by
#: :attr:`~iris.common.resolve.Resolve.map_rhs_to_lhs`.
self.mapping = None # set in _metadata_mapping
self.reorder_src_dims = None # set in _as_compatible_cubes

#: Cache containing a list of dim, aux and scalar coordinates prepared
#: and ready for creating and attaching to the resultant resolved
Expand Down Expand Up @@ -439,6 +440,7 @@ def _as_compatible_cubes(self):

# Determine whether a transpose of the src cube is necessary.
if order != sorted(order):
self.reorder_src_dims = order
new_src_data = new_src_data.transpose(order)
logger.debug(
f"transpose src {self._src_cube_position} cube with order {order}"
Expand Down
91 changes: 64 additions & 27 deletions lib/iris/tests/unit/analysis/stats/test_pearsonr.py
trexfeathers marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@
import numpy.ma as ma

import iris
import iris._lazy_data
import iris.analysis.stats as stats
from iris.exceptions import CoordinateNotFoundError


@tests.skip_data
class Test(tests.IrisTest):
class Mixin:
def setUp(self):
# 3D cubes:
cube_temp = iris.load_cube(
Expand All @@ -33,6 +33,9 @@ def setUp(self):
cube_temp.coord("longitude").guess_bounds()
self.weights = iris.analysis.cartography.area_weights(cube_temp)


@tests.skip_data
class TestLazy(Mixin, tests.IrisTest):
def test_perfect_corr(self):
r = stats.pearsonr(self.cube_a, self.cube_a, ["latitude", "longitude"])
self.assertArrayEqual(r.data, np.array([1.0] * 6))
Expand All @@ -41,10 +44,6 @@ def test_perfect_corr_all_dims(self):
r = stats.pearsonr(self.cube_a, self.cube_a)
self.assertArrayEqual(r.data, np.array([1.0]))

def test_incompatible_cubes(self):
with self.assertRaises(ValueError):
stats.pearsonr(self.cube_a[:, 0, :], self.cube_b[0, :, :], "longitude")

def test_compatible_cubes(self):
r = stats.pearsonr(self.cube_a, self.cube_b, ["latitude", "longitude"])
self.assertArrayAlmostEqual(
Expand Down Expand Up @@ -111,6 +110,26 @@ def test_broadcast_cubes_weighted(self):
]
self.assertArrayAlmostEqual(r.data, np.array(r_by_slice))

def test_broadcast_transpose_cubes_weighted(self):
    """Check that transposing one operand only transposes the result.

    The correlation itself must be unaffected by the dimension order of
    the inputs, so the transposed computation should produce the
    transpose of the reference result.
    """
    # Reference is calculated with no transposition.
    r_ref = stats.pearsonr(
        self.cube_a,
        self.cube_b[0, :, :],
        ["latitude", "longitude"],
        weights=self.weights[0, :, :],
    )

    # Transpose cube_a in place, then repeat the same weighted correlation.
    self.cube_a.transpose()
    r_test = stats.pearsonr(
        self.cube_a,
        self.cube_b[0, :, :],
        ["latitude", "longitude"],
        weights=self.weights[0, :, :],
    )

    # Should get the same result, but transposed.
    self.assertArrayAlmostEqual(r_test.data, r_ref.data.T)

def test_weight_error(self):
with self.assertRaises(ValueError):
stats.pearsonr(
Expand All @@ -120,42 +139,34 @@ def test_weight_error(self):
weights=self.weights,
)

def test_non_existent_coord(self):
with self.assertRaises(CoordinateNotFoundError):
stats.pearsonr(self.cube_a, self.cube_b, "bad_coord")

def test_mdtol(self):
cube_small = self.cube_a[:, 0, 0]
cube_small_masked = cube_small.copy()
cube_small_masked.data = ma.array(
cube_small.data, mask=np.array([0, 0, 0, 1, 1, 1], dtype=bool)
)
cube_small_masked = iris.util.mask_cube(cube_small, [0, 0, 0, 1, 1, 1])
r1 = stats.pearsonr(cube_small, cube_small_masked)
r2 = stats.pearsonr(cube_small, cube_small_masked, mdtol=0.49)
self.assertArrayAlmostEqual(r1.data, np.array([0.74586593]))
self.assertMaskedArrayEqual(r2.data, ma.array([0], mask=[True]))

def test_common_mask_simple(self):
cube_small = self.cube_a[:, 0, 0]
cube_small_masked = cube_small.copy()
cube_small_masked.data = ma.array(
cube_small.data, mask=np.array([0, 0, 0, 1, 1, 1], dtype=bool)
)
cube_small_masked = iris.util.mask_cube(cube_small, [0, 0, 0, 1, 1, 1])
r = stats.pearsonr(cube_small, cube_small_masked, common_mask=True)
self.assertArrayAlmostEqual(r.data, np.array([1.0]))

def test_common_mask_broadcast(self):
cube_small = self.cube_a[:, 0, 0]
cube_small = iris.util.mask_cube(self.cube_a[:, 0, 0], [0, 0, 0, 0, 0, 1])
mask_2d = np.zeros((6, 2), dtype=bool)
# 2d mask varies on unshared coord:
mask_2d[0, 1] = 1

# Make a (6, 2) cube.
cube_small_2d = self.cube_a[:, 0:2, 0]
cube_small.data = ma.array(
cube_small.data, mask=np.array([0, 0, 0, 0, 0, 1], dtype=bool)
# Duplicate data along unshared coord's dimension.
new_data = iris.util.broadcast_to_shape(
cube_small.core_data(), (6, 2), dim_map=[0]
)
cube_small_2d.data = ma.array(
np.tile(cube_small.data[:, np.newaxis], 2),
mask=np.zeros((6, 2), dtype=bool),
)
# 2d mask varies on unshared coord:
cube_small_2d.data.mask[0, 1] = 1
cube_small_2d.data = iris.util._mask_array(new_data, mask_2d)

r = stats.pearsonr(
cube_small,
cube_small_2d,
Expand All @@ -169,5 +180,31 @@ def test_common_mask_broadcast(self):
self.assertArrayAlmostEqual(r.data, np.array([1.0, 1.0]))


class TestReal(TestLazy):
    """Re-run all of the TestLazy cases against non-lazy (real) cube data.

    Only ``setUp`` differs: accessing ``cube.data`` touches the data of each
    input cube up front — presumably to realise any lazy arrays so the
    inherited tests exercise the real-data code path (TODO confirm against
    iris' lazy-data semantics).
    """

    def setUp(self):
        super().setUp()
        for cube in [self.cube_a, self.cube_b]:
            # Touch .data purely for its side effect; the value is discarded.
            _ = cube.data


class TestCoordHandling(Mixin, tests.IrisTest):
def test_lenient_handling(self):
# Smoke test that mismatched var_name does not prevent operation.
self.cube_a.coord("time").var_name = "wibble"
stats.pearsonr(self.cube_a, self.cube_b)

def test_incompatible_cubes(self):
with self.assertRaises(ValueError):
stats.pearsonr(self.cube_a[:, 0, :], self.cube_b[0, :, :], "longitude")
trexfeathers marked this conversation as resolved.
Show resolved Hide resolved

def test_single_coord(self):
# Smoke test that single coord can be passed as single string.
stats.pearsonr(self.cube_a, self.cube_b, "latitude")

def test_non_existent_coord(self):
with self.assertRaises(CoordinateNotFoundError):
stats.pearsonr(self.cube_a, self.cube_b, "bad_coord")


if __name__ == "__main__":
tests.main()