From 9c522d7238a76a1895d5f52275e45362a8073a5b Mon Sep 17 00:00:00 2001 From: Ruth Comer Date: Fri, 26 Jun 2015 11:28:42 +0100 Subject: [PATCH] pearsonr broadcasting and kwarg functionality --- ...newfeature_2015-07-03_pearsonr_rewrite.txt | 5 + lib/iris/analysis/stats.py | 193 +++++++----------- lib/iris/tests/analysis/test_stats.py | 107 ---------- .../tests/unit/analysis/stats/__init__.py | 20 ++ .../unit/analysis/stats/test_pearsonr.py | 126 ++++++++++++ 5 files changed, 227 insertions(+), 224 deletions(-) create mode 100644 docs/iris/src/whatsnew/contributions_1.9/newfeature_2015-07-03_pearsonr_rewrite.txt delete mode 100644 lib/iris/tests/analysis/test_stats.py create mode 100644 lib/iris/tests/unit/analysis/stats/__init__.py create mode 100644 lib/iris/tests/unit/analysis/stats/test_pearsonr.py diff --git a/docs/iris/src/whatsnew/contributions_1.9/newfeature_2015-07-03_pearsonr_rewrite.txt b/docs/iris/src/whatsnew/contributions_1.9/newfeature_2015-07-03_pearsonr_rewrite.txt new file mode 100644 index 0000000000..6c6335d3e1 --- /dev/null +++ b/docs/iris/src/whatsnew/contributions_1.9/newfeature_2015-07-03_pearsonr_rewrite.txt @@ -0,0 +1,5 @@ +* :func:`iris.analysis.stats.pearsonr` updates: + * Cubes can now be different shapes, provided one is broadcastable to the + other. + * Accepts weights keyword for weighted correlations. + * Accepts mdtol keyword for missing data tolerance level. diff --git a/lib/iris/analysis/stats.py b/lib/iris/analysis/stats.py index a49d563fc0..840a0ab7fd 100644 --- a/lib/iris/analysis/stats.py +++ b/lib/iris/analysis/stats.py @@ -23,137 +23,96 @@ from six.moves import (filter, input, map, range, zip) # noqa import numpy as np - import iris +from iris.util import broadcast_to_shape -def _get_calc_view(cube_a, cube_b, corr_coords): - """ - This function takes two cubes and returns cubes which are - flattened so that efficient comparisons can be performed - between the two. - - Args: - - * cube_a: - First cube of data - * cube_b: - Second cube of data, compatible with cube_a - * corr_coords: - Names of the dimension coordinates over which - to calculate correlations. - - Returns: - - * reshaped_a/reshaped_b: - The data arrays of cube_a/cube_b reshaped - so that the dimensions to be compared are - flattened into the 0th dimension of the - return array and other dimensions are - preserved. - * res_ind: - The indices of the dimensions that we - are not comparing, in terms of cube_a/cube_b - - """ - - # Following lists to be filled with: - # indices of dimension we are not comparing - res_ind = [] - # indices of dimensions we are comparing - slice_ind = [] - for i, c in enumerate(cube_a.dim_coords): - if not c.name() in corr_coords: - res_ind.append(i) - else: - slice_ind.append(i) - - # sanitise input - dim_coord_names = [c.name() for c in cube_a.dim_coords] - if corr_coords is None: - corr_coords = dim_coord_names - - if ([c.name() for c in cube_a.dim_coords] != - [c.name() for c in cube_b.dim_coords]): - raise ValueError("Cubes are incompatible.") - - for c in corr_coords: - if c not in dim_coord_names: - raise ValueError("%s coord " - "does not exist in cube." % c) - - # Reshape data to be data to correlate in 0th dim and - # other grid points in 1st dim. - # Transpose to group the correlation data dims before the - # grid point dims. - data_a = cube_a.data.view() - data_b = cube_b.data.view() - dim_i_len = np.prod(np.array(cube_a.shape)[slice_ind]) - dim_j_len = np.prod(np.array(cube_a.shape)[res_ind]) - reshaped_a = data_a.transpose(slice_ind+res_ind)\ - .reshape(dim_i_len, dim_j_len) - reshaped_b = data_b.transpose(slice_ind+res_ind)\ - .reshape(dim_i_len, dim_j_len) - - return reshaped_a, reshaped_b, res_ind - - -def pearsonr(cube_a, cube_b, corr_coords=None): +def pearsonr(cube_a, cube_b, corr_coords=None, weights=None, mdtol=1.): """ - Calculates the n-D Pearson's r correlation - cube over the dimensions associated with the - given coordinates. - - Returns a cube of the correlation between the two - cubes along the dimensions of the given - coordinates, at each point in the remaining - dimensions of the cubes. - - For example providing two time/altitude/latitude/longitude - cubes and corr_coords of 'latitude' and 'longitude' will result - in a time/altitude cube describing the latitude/longitude - (i.e. pattern) correlation at each time/altitude point. + Calculate the Pearson's r correlation coefficient over specified + dimensions. Args: * cube_a, cube_b (cubes): - Between which the correlation field will be calculated. - Cubes should be the same shape and have the - same dimension coordinates. - * corr_coords (list of str): - The cube coordinate names over which to calculate - correlations. If no names are provided then - correlation will be calculated over all cube - dimensions. + Cubes between which the correlation will be calculated. The cubes + should either be the same shape and have the same dimension coordinates + or one cube should be broadcastable to the other. + * corr_coords (str or list of str): + The cube coordinate name(s) over which to calculate correlations. If no + names are provided then correlation will be calculated over all common + cube dimensions. + * weights (numpy.ndarray, optional): + Weights array of same shape as (the smaller of) cube_a and cube_b. Note + that latitude/longitude area weights can be calculated using + :func:`iris.analysis.cartography.area_weights`. + * mdtol (float, optional): + Tolerance of missing data. The missing data fraction is calculated + based on the number of grid cells masked in both cube_a and cube_b. If + this fraction exceed mdtol, the returned value in the corresponding + cell is masked. mdtol=0 means no missing data is tolerated while + mdtol=1 means the resulting element will be masked if and only if all + contributing elements are masked in cube_a or cube_b. Defaults to 1. Returns: - Cube of correlations. + A cube of the correlation between the two input cubes along the + specified dimensions, at each point in the remaining dimensions of the + cubes. + + For example providing two time/altitude/latitude/longitude cubes and + corr_coords of 'latitude' and 'longitude' will result in a + time/altitude cube describing the latitude/longitude (i.e. pattern) + correlation at each time/altitude point. Reference: http://www.statsoft.com/textbook/glosp.html#Pearson%20Correlation """ - # If no coords passed then set to all coords of cube. + # Assign larger cube to cube_1 + if cube_b.ndim > cube_a.ndim: + cube_1 = cube_b + cube_2 = cube_a + else: + cube_1 = cube_a + cube_2 = cube_b + + dim_coords_1 = [coord.name() for coord in cube_1.dim_coords] + dim_coords_2 = [coord.name() for coord in cube_2.dim_coords] + common_dim_coords = list(set(dim_coords_1) & set(dim_coords_2)) + # If no coords passed then set to all common dimcoords of cubes. if corr_coords is None: - corr_coords = [c.name() for c in cube_a.dim_coords] - - vec_a, vec_b, res_ind = _get_calc_view(cube_a, - cube_b, - corr_coords) - - sa = vec_a - np.mean(vec_a, 0) - sb = vec_b - np.mean(vec_b, 0) - flat_corrs = np.sum((sa*sb), 0)/np.sqrt(np.sum(sa**2, 0)*np.sum(sb**2, 0)) - - corrs = flat_corrs.reshape([cube_a.shape[i] for i in res_ind]) - - # Construct cube to hold correlation results. - corrs_cube = iris.cube.Cube(corrs) - corrs_cube.long_name = "Pearson's r" - corrs_cube.units = "1" - for i, dim in enumerate(res_ind): - c = cube_a.dim_coords[dim] - corrs_cube.add_dim_coord(c, i) - - return corrs_cube + corr_coords = common_dim_coords + + # Broadcast weights to shape of cube_1 if necessary. + if weights is None or cube_1.shape == cube_2.shape: + weights_1 = weights + weights_2 = weights + else: + if weights.shape != cube_2.shape: + raise ValueError("weights array should have dimensions {}". + format(cube_2.shape)) + dims_1_common = [i for i in range(cube_1.ndim) if + dim_coords_1[i] in common_dim_coords] + weights_1 = broadcast_to_shape(weights, cube_1.shape, dims_1_common) + weights_2 = weights + + # Calculate correlations. + s1 = cube_1 - cube_1.collapsed(corr_coords, iris.analysis.MEAN, + weights=weights_1) + s2 = cube_2 - cube_2.collapsed(corr_coords, iris.analysis.MEAN, + weights=weights_2) + + covar = (s1*s2).collapsed(corr_coords, iris.analysis.SUM, + weights=weights_1, mdtol=mdtol) + var_1 = (s1**2).collapsed(corr_coords, iris.analysis.SUM, + weights=weights_1) + var_2 = (s2**2).collapsed(corr_coords, iris.analysis.SUM, + weights=weights_2) + + denom = iris.analysis.maths.apply_ufunc(np.sqrt, var_1*var_2, + new_unit=covar.units) + corr_cube = covar / denom + corr_cube.rename("Pearson's r") + + return corr_cube diff --git a/lib/iris/tests/analysis/test_stats.py b/lib/iris/tests/analysis/test_stats.py deleted file mode 100644 index bb4dbadcf7..0000000000 --- a/lib/iris/tests/analysis/test_stats.py +++ /dev/null @@ -1,107 +0,0 @@ -# (C) British Crown Copyright 2014 - 2015, Met Office -# -# This file is part of Iris. -# -# Iris is free software: you can redistribute it and/or modify it under -# the terms of the GNU Lesser General Public License as published by the -# Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# Iris is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Lesser General Public License for more details. -# -# You should have received a copy of the GNU Lesser General Public License -# along with Iris. If not, see . -""" -Test the iris.analysis.stats module. - -""" - -from __future__ import (absolute_import, division, print_function) -from six.moves import (filter, input, map, range, zip) # noqa - -# Import iris tests first so that some things can be initialised before -# importing anything else. -import iris.tests as tests - -import numpy as np - -import iris -import iris.analysis.stats as stats - - -@tests.skip_data -class Test_corr(tests.IrisTest): - def setUp(self): - self.cube_a = iris.load_cube(iris.sample_data_path('GloSea4', - 'ensemble_001.pp')) - self.cube_b = iris.load_cube(iris.sample_data_path('GloSea4', - 'ensemble_002.pp')) - - def test_perfect_corr(self): - r = stats.pearsonr(self.cube_a, self.cube_a, - ['latitude', 'longitude']) - self.assertArrayEqual(r.data, np.array([1.]*6)) - - def test_perfect_corr_all_dims(self): - r = stats.pearsonr(self.cube_a, self.cube_a) - self.assertArrayEqual(r.data, np.array([1.])) - - def test_incompatible_cubes(self): - with self.assertRaises(ValueError): - stats.pearsonr(self.cube_a, self.cube_b[0, :, :], 'time') - - def test_compatible_cubes(self): - r = stats.pearsonr(self.cube_a, self.cube_b, ['latitude', 'longitude']) - self.assertArrayAlmostEqual(r.data, [0.99733591, - 0.99501693, - 0.99674225, - 0.99495268, - 0.99217004, - 0.99362189]) - - def test_4d_cube_2_dims(self): - real_0_c = iris.coords.AuxCoord(np.int32(0), 'realization') - real_1_c = iris.coords.AuxCoord(np.int32(1), 'realization') - - # Make cubes merge-able. - self.cube_a.add_aux_coord(real_0_c) - self.cube_b.add_aux_coord(real_1_c) - self.cube_a.remove_coord('forecast_period') - self.cube_a.remove_coord('forecast_reference_time') - self.cube_b.remove_coord('forecast_period') - self.cube_b.remove_coord('forecast_reference_time') - four_d_cube_a = iris.cube\ - .CubeList([self.cube_a, self.cube_b]).merge()[0] - self.cube_a.remove_coord('realization') - self.cube_b.remove_coord('realization') - self.cube_a.add_aux_coord(real_1_c) - self.cube_b.add_aux_coord(real_0_c) - four_d_cube_b = iris.cube\ - .CubeList([self.cube_a, self.cube_b]).merge()[0] - - r = stats.pearsonr(four_d_cube_a, four_d_cube_b, - ['latitude', 'longitude']) - expected_corr = [[0.99733591, - 0.99501693, - 0.99674225, - 0.99495268, - 0.99217004, - 0.99362189], - [0.99733591, - 0.99501693, - 0.99674225, - 0.99495268, - 0.99217004, - 0.99362189]] - self.assertArrayAlmostEqual(r.data, expected_corr) - - def test_non_existent_coord(self): - with self.assertRaises(ValueError): - stats.pearsonr(self.cube_a, self.cube_b, 'bad_coord') - - -if __name__ == '__main__': - tests.main() diff --git a/lib/iris/tests/unit/analysis/stats/__init__.py b/lib/iris/tests/unit/analysis/stats/__init__.py new file mode 100644 index 0000000000..8dd58e46b2 --- /dev/null +++ b/lib/iris/tests/unit/analysis/stats/__init__.py @@ -0,0 +1,20 @@ +# (C) British Crown Copyright 2015, Met Office +# +# This file is part of Iris. +# +# Iris is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License as published by the +# Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Iris is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with Iris. If not, see . +"""Unit tests for the :mod:`iris.analysis.stats` module.""" + +from __future__ import (absolute_import, division, print_function) +from six.moves import (filter, input, map, range, zip) # noqa diff --git a/lib/iris/tests/unit/analysis/stats/test_pearsonr.py b/lib/iris/tests/unit/analysis/stats/test_pearsonr.py new file mode 100644 index 0000000000..7f19a659d0 --- /dev/null +++ b/lib/iris/tests/unit/analysis/stats/test_pearsonr.py @@ -0,0 +1,126 @@ +# (C) British Crown Copyright 2014 - 2015, Met Office +# +# This file is part of Iris. +# +# Iris is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License as published by the +# Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Iris is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with Iris. If not, see . +"""Unit tests for the `iris.analysis.stats.pearsonr` function.""" + +from __future__ import (absolute_import, division, print_function) +from six.moves import (filter, input, map, range, zip) # noqa + +# Import iris tests first so that some things can be initialised before +# importing anything else. +import iris.tests as tests + +import numpy as np +import numpy.ma as ma + +import iris +import iris.analysis.stats as stats +from iris.exceptions import CoordinateNotFoundError + + +@tests.skip_data +class Test(tests.IrisTest): + def setUp(self): + # 3D cubes: + cube_temp = iris.load_cube(tests.get_data_path( + ('NetCDF', 'global', 'xyt', 'SMALL_total_column_co2.nc'))) + self.cube_a = cube_temp[0:6] + self.cube_b = cube_temp[20:26] + cube_temp = self.cube_a.copy() + cube_temp.coord('latitude').guess_bounds() + cube_temp.coord('longitude').guess_bounds() + self.weights = iris.analysis.cartography.area_weights(cube_temp) + + def test_perfect_corr(self): + r = stats.pearsonr(self.cube_a, self.cube_a, + ['latitude', 'longitude']) + self.assertArrayEqual(r.data, np.array([1.]*6)) + + def test_perfect_corr_all_dims(self): + r = stats.pearsonr(self.cube_a, self.cube_a) + self.assertArrayEqual(r.data, np.array([1.])) + + def test_incompatible_cubes(self): + with self.assertRaises(ValueError): + stats.pearsonr(self.cube_a[:, 0, :], self.cube_b[0, :, :], + 'longitude') + + def test_compatible_cubes(self): + r = stats.pearsonr(self.cube_a, self.cube_b, ['latitude', 'longitude']) + self.assertArrayAlmostEqual(r.data, [0.81114936, + 0.81690538, + 0.79833135, + 0.81118674, + 0.79745386, + 0.81278484]) + + def test_broadcast_cubes(self): + r1 = stats.pearsonr(self.cube_a, self.cube_b[0, :, :], + ['latitude', 'longitude']) + r2 = stats.pearsonr(self.cube_b[0, :, :], self.cube_a, + ['latitude', 'longitude']) + r_by_slice = [ + stats.pearsonr(self.cube_a[i, :, :], self.cube_b[0, :, :], + ['latitude', 'longitude']).data for i in range(6) + ] + self.assertArrayEqual(r1.data, np.array(r_by_slice)) + self.assertArrayEqual(r2.data, np.array(r_by_slice)) + + def test_compatible_cubes_weighted(self): + r = stats.pearsonr(self.cube_a, self.cube_b, ['latitude', 'longitude'], + self.weights) + self.assertArrayAlmostEqual(r.data, [0.79106045, + 0.79989169, + 0.78826918, + 0.79925855, + 0.79011544, + 0.80115837]) + + def test_broadcast_cubes_weighted(self): + r = stats.pearsonr(self.cube_a, self.cube_b[0, :, :], + ['latitude', 'longitude'], + weights=self.weights[0, :, :]) + r_by_slice = [ + stats.pearsonr(self.cube_a[i, :, :], self.cube_b[0, :, :], + ['latitude', 'longitude'], + weights=self.weights[0, :, :] + ).data for i in range(6) + ] + self.assertArrayAlmostEqual(r.data, np.array(r_by_slice)) + + def test_weight_error(self): + with self.assertRaises(ValueError): + stats.pearsonr(self.cube_a, self.cube_b[0, :, :], + ['latitude', 'longitude'], + weights=self.weights) + + def test_non_existent_coord(self): + with self.assertRaises(CoordinateNotFoundError): + stats.pearsonr(self.cube_a, self.cube_b, 'bad_coord') + + def test_mdtol(self): + cube_small = self.cube_a[:, 0, 0] + cube_small_masked = cube_small.copy() + cube_small_masked.data = ma.array( + cube_small.data, mask=np.array([0, 0, 0, 1, 1, 1], dtype=bool)) + r1 = stats.pearsonr(cube_small, cube_small_masked) + r2 = stats.pearsonr(cube_small, cube_small_masked, mdtol=0.49) + self.assertArrayAlmostEqual(r1.data, np.array([0.74586593])) + self.assertMaskedArrayEqual(r2.data, ma.array([0], mask=[True])) + + +if __name__ == '__main__': + tests.main()