Re-write pearsonr to use Resolve #5638

Merged · 18 commits · Jan 18, 2024
Changes from 11 commits
38 changes: 38 additions & 0 deletions benchmarks/benchmarks/stats.py
@@ -0,0 +1,38 @@
# Copyright Iris contributors
#
# This file is part of Iris and is released under the BSD license.
# See LICENSE in the root of the repository for full licensing details.
"""Stats benchmark tests."""

import iris
from iris.analysis.stats import pearsonr
import iris.tests


class PearsonR:
def setup(self):
cube_temp = iris.load_cube(
iris.tests.get_data_path(
("NetCDF", "global", "xyt", "SMALL_total_column_co2.nc")
)
)

# Make data non-lazy.
cube_temp.data

self.cube_a = cube_temp[:6]
self.cube_b = cube_temp[20:26]
self.cube_b.replace_coord(self.cube_a.coord("time"))
for name in ["latitude", "longitude"]:
self.cube_b.coord(name).guess_bounds()
self.weights = iris.analysis.cartography.area_weights(self.cube_b)

def time_real(self):
pearsonr(self.cube_a, self.cube_b, weights=self.weights)

def time_lazy(self):
for cube in self.cube_a, self.cube_b:
cube.data = cube.lazy_data()

result = pearsonr(self.cube_a, self.cube_b, weights=self.weights)
result.data
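
For orientation, a minimal sketch (file paths hypothetical) of the laziness contract these two benchmarks separate: `time_real` times the eager NumPy path, while `time_lazy` also covers building the Dask graph plus a single compute.

import iris
from iris.analysis.stats import pearsonr

cube_a = iris.load_cube("sample_a.nc")  # hypothetical input file
cube_b = iris.load_cube("sample_b.nc")  # hypothetical input file

result = pearsonr(cube_a, cube_b)
print(result.has_lazy_data())  # True while inputs are lazy: nothing computed yet
_ = result.data                # realising the data triggers the single compute
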
5 changes: 3 additions & 2 deletions docs/src/whatsnew/latest.rst
@@ -100,8 +100,9 @@ This document explains the changes made to Iris for this release
    lazy data from file. This will also speed up coordinate comparison.
    (:pull:`5610`)
 
-#. `@rcomer`_ modified :func:`~iris.analysis.stats.pearsonr` so it preserves
-   lazy data in all cases and also runs a little faster. (:pull:`5638`)
+#. `@rcomer`_ and `@trexfeathers`_ (reviewer) modified
+   :func:`~iris.analysis.stats.pearsonr` so it preserves lazy data in all cases
+   and also runs a little faster. (:pull:`5638`)
 
 🔥 Deprecations
 ===============
72 changes: 34 additions & 38 deletions lib/iris/analysis/stats.py
@@ -30,7 +30,8 @@ def pearsonr(
     cube_a, cube_b : cubes
         Cubes between which the correlation will be calculated. The cubes
         should either be the same shape and have the same dimension coordinates
-        or one cube should be broadcastable to the other.
+        or one cube should be broadcastable to the other. Broadcasting rules
+        are the same as those for cube arithmetic (see :ref:`cube maths`).
     corr_coords : str or list of str
         The cube coordinate name(s) over which to calculate correlations. If no
         names are provided then correlation will be calculated over all common
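
A hedged illustration of the broadcasting rule added above, using small synthetic cubes (all names and values hypothetical): the 2-D cube is broadcast along the 3-D cube's time dimension, exactly as in cube arithmetic.

import numpy as np
import iris.coords
import iris.cube
from iris.analysis.stats import pearsonr

lat = iris.coords.DimCoord([10.0, 20.0, 30.0], standard_name="latitude", units="degrees")
lon = iris.coords.DimCoord([0.0, 90.0, 180.0, 270.0], standard_name="longitude", units="degrees")
time = iris.coords.DimCoord([0.0, 1.0], standard_name="time", units="days since 2000-01-01")

cube_3d = iris.cube.Cube(
    np.random.rand(2, 3, 4),
    dim_coords_and_dims=[(time, 0), (lat, 1), (lon, 2)],
)
cube_2d = iris.cube.Cube(
    np.random.rand(3, 4),
    dim_coords_and_dims=[(lat.copy(), 0), (lon.copy(), 1)],
)

# One spatial pattern correlation per time step.
r = pearsonr(cube_3d, cube_2d, corr_coords=["latitude", "longitude"])
print(r.shape)  # (2,)
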
@@ -84,88 +85,83 @@ def pearsonr(

     # Get the broadcast, auto-transposed safe versions of the cube operands.
     resolver = Resolve(cube_1, cube_2)
-    cube_1 = resolver.lhs_cube_resolved
-    cube_2 = resolver.rhs_cube_resolved
+    lhs_cube_resolved = resolver.lhs_cube_resolved
+    rhs_cube_resolved = resolver.rhs_cube_resolved
 
-    if cube_1.has_lazy_data() or cube_2.has_lazy_data():
+    if lhs_cube_resolved.has_lazy_data() or rhs_cube_resolved.has_lazy_data():
         al = da
-        array_1 = cube_1.lazy_data()
-        array_2 = cube_2.lazy_data()
+        array_lhs = lhs_cube_resolved.lazy_data()
+        array_rhs = rhs_cube_resolved.lazy_data()
     else:
         al = np
-        array_1 = cube_1.data
-        array_2 = cube_2.data
+        array_lhs = lhs_cube_resolved.data
+        array_rhs = rhs_cube_resolved.data
 
     # If no coords passed then set to all common dimcoords of cubes.
     if corr_coords is None:
-        dim_coords_1 = {coord.name() for coord in cube_1.dim_coords}
-        dim_coords_2 = {coord.name() for coord in cube_2.dim_coords}
+        dim_coords_1 = {coord.name() for coord in lhs_cube_resolved.dim_coords}
+        dim_coords_2 = {coord.name() for coord in rhs_cube_resolved.dim_coords}
         corr_coords = list(dim_coords_1.intersection(dim_coords_2))
 
     # Interpret coords as array dimensions.
     corr_dims = set()
     if isinstance(corr_coords, str):
         corr_coords = [corr_coords]
     for coord in corr_coords:
-        corr_dims.update(cube_1.coord_dims(coord))
+        corr_dims.update(lhs_cube_resolved.coord_dims(coord))
 
     corr_dims = tuple(corr_dims)
 
     # Match up data masks if required.
     if common_mask:
-        mask_1 = al.ma.getmaskarray(array_1)
+        mask_lhs = al.ma.getmaskarray(array_lhs)
         if al is np:
-            # Reduce all invariant dimensions of mask_1 to length 1. This avoids
-            # unnecessary broadcasting of array_2.
+            # Reduce all invariant dimensions of mask_lhs to length 1. This avoids
+            # unnecessary broadcasting of array_rhs.
             index = tuple(
                 slice(0, 1)
-                if np.array_equal(mask_1.any(axis=dim), mask_1.all(axis=dim))
+                if np.array_equal(mask_lhs.any(axis=dim), mask_lhs.all(axis=dim))
                 else slice(None)
-                for dim in range(mask_1.ndim)
+                for dim in range(mask_lhs.ndim)
             )
-            mask_1 = mask_1[index]
+            mask_lhs = mask_lhs[index]
 
-        array_2 = _mask_array(array_2, mask_1)
-        array_1 = _mask_array(array_1, al.ma.getmaskarray(array_2))
+        array_rhs = _mask_array(array_rhs, mask_lhs)
+        array_lhs = _mask_array(array_lhs, al.ma.getmaskarray(array_rhs))
 
     # Broadcast weights to shape of arrays if necessary.
     if weights is None:
-        weights_1 = weights_2 = None
+        weights_lhs = weights_rhs = None
     else:
         if weights.shape != smaller_shape:
             msg = f"weights array should have dimensions {smaller_shape}"
             raise ValueError(msg)
 
-        if resolver.reorder_src_dims is not None:
-            # Apply same transposition as was done to cube_2 within Resolve.
-            weights = weights.transpose(resolver.reorder_src_dims)
-
-        # Reshape to add in any length-1 dimensions needed for broadcasting.
-        weights = weights.reshape(cube_2.shape)
-
-        weights_2 = np.broadcast_to(weights, array_2.shape)
-        weights_1 = np.broadcast_to(weights, array_1.shape)
+        wt_resolver = Resolve(cube_1, cube_2.copy(weights))
+        weights = wt_resolver.rhs_cube_resolved.data
+        weights_rhs = np.broadcast_to(weights, array_rhs.shape)
+        weights_lhs = np.broadcast_to(weights, array_lhs.shape)
 
     # Calculate correlations.
-    s1 = array_1 - al.ma.average(
-        array_1, axis=corr_dims, weights=weights_1, keepdims=True
+    s_lhs = array_lhs - al.ma.average(
+        array_lhs, axis=corr_dims, weights=weights_lhs, keepdims=True
     )
-    s2 = array_2 - al.ma.average(
-        array_2, axis=corr_dims, weights=weights_2, keepdims=True
+    s_rhs = array_rhs - al.ma.average(
+        array_rhs, axis=corr_dims, weights=weights_rhs, keepdims=True
     )
 
-    s_prod = resolver.cube(s1 * s2)
+    s_prod = resolver.cube(s_lhs * s_rhs)
 
     # Use cube collapsed method as it takes care of coordinate collapsing and missing
     # data tolerance.
     covar = s_prod.collapsed(
-        corr_coords, iris.analysis.SUM, weights=weights_1, mdtol=mdtol
+        corr_coords, iris.analysis.SUM, weights=weights_lhs, mdtol=mdtol
     )
 
-    var_1 = iris.analysis._sum(s1**2, axis=corr_dims, weights=weights_1)
-    var_2 = iris.analysis._sum(s2**2, axis=corr_dims, weights=weights_2)
+    var_lhs = iris.analysis._sum(s_lhs**2, axis=corr_dims, weights=weights_lhs)
+    var_rhs = iris.analysis._sum(s_rhs**2, axis=corr_dims, weights=weights_rhs)
 
-    denom = np.sqrt(var_1 * var_2)
+    denom = np.sqrt(var_lhs * var_rhs)
 
     corr_cube = covar / denom
     corr_cube.rename("Pearson's r")
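
For reference, the Resolve pattern the rewrite leans on, mirroring the calls visible above (reusing the synthetic `cube_3d` and `cube_2d` sketched earlier):

from iris.common.resolve import Resolve

# Broadcast and auto-transpose two compatible cubes onto a common shape.
resolver = Resolve(cube_3d, cube_2d)
lhs = resolver.lhs_cube_resolved  # cube_3d as broadcast by Resolve
rhs = resolver.rhs_cube_resolved  # cube_2d as broadcast by Resolve

# Compute on the aligned payloads (lazy or real), then let the resolver
# rebuild a cube carrying the resolved metadata.
product = resolver.cube(lhs.core_data() * rhs.core_data())

The same machinery now handles the weights: resolving `cube_2.copy(weights)` against `cube_1` transposes and reshapes the weights array to match, which is what makes the `reorder_src_dims` bookkeeping removed below unnecessary.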
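And a small NumPy illustration of the mask-reduction step in the `common_mask` branch (array contents hypothetical):

import numpy as np

mask = np.zeros((4, 3, 5), dtype=bool)
mask[:, 1, :] = True  # the mask varies along axis 1 only

# Collapse every axis along which the mask is invariant to length 1, so it
# broadcasts against the other operand instead of forcing a full-size copy.
index = tuple(
    slice(0, 1)
    if np.array_equal(mask.any(axis=dim), mask.all(axis=dim))
    else slice(None)
    for dim in range(mask.ndim)
)
print(mask[index].shape)  # (1, 3, 1) -- equivalent to mask under broadcasting
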
2 changes: 0 additions & 2 deletions lib/iris/common/resolve.py
@@ -304,7 +304,6 @@ def __init__(self, lhs=None, rhs=None):
         #: operands, where the direction of the mapping is governed by
         #: :attr:`~iris.common.resolve.Resolve.map_rhs_to_lhs`.
         self.mapping = None  # set in _metadata_mapping
-        self.reorder_src_dims = None  # set in _as_compatible_cubes
 
         #: Cache containing a list of dim, aux and scalar coordinates prepared
         #: and ready for creating and attaching to the resultant resolved
@@ -440,7 +439,6 @@ def _as_compatible_cubes(self):

         # Determine whether a transpose of the src cube is necessary.
         if order != sorted(order):
-            self.reorder_src_dims = order
             new_src_data = new_src_data.transpose(order)
             logger.debug(
                 f"transpose src {self._src_cube_position} cube with order {order}"