From 9c522d7238a76a1895d5f52275e45362a8073a5b Mon Sep 17 00:00:00 2001
From: Ruth Comer <ruth.comer@metoffice.gov.uk>
Date: Fri, 26 Jun 2015 11:28:42 +0100
Subject: [PATCH] pearsonr broadcasting and kwarg functionality

---
 ...newfeature_2015-07-03_pearsonr_rewrite.txt |   5 +
 lib/iris/analysis/stats.py                    | 193 +++++++-----------
 lib/iris/tests/analysis/test_stats.py         | 107 ----------
 .../tests/unit/analysis/stats/__init__.py     |  20 ++
 .../unit/analysis/stats/test_pearsonr.py      | 126 ++++++++++++
 5 files changed, 227 insertions(+), 224 deletions(-)
 create mode 100644 docs/iris/src/whatsnew/contributions_1.9/newfeature_2015-07-03_pearsonr_rewrite.txt
 delete mode 100644 lib/iris/tests/analysis/test_stats.py
 create mode 100644 lib/iris/tests/unit/analysis/stats/__init__.py
 create mode 100644 lib/iris/tests/unit/analysis/stats/test_pearsonr.py

diff --git a/docs/iris/src/whatsnew/contributions_1.9/newfeature_2015-07-03_pearsonr_rewrite.txt b/docs/iris/src/whatsnew/contributions_1.9/newfeature_2015-07-03_pearsonr_rewrite.txt
new file mode 100644
index 0000000000..6c6335d3e1
--- /dev/null
+++ b/docs/iris/src/whatsnew/contributions_1.9/newfeature_2015-07-03_pearsonr_rewrite.txt
@@ -0,0 +1,5 @@
+* :func:`iris.analysis.stats.pearsonr` updates:
+  * Cubes can now be different shapes, provided one is broadcastable to the
+  other.
+  * Accepts weights keyword for weighted correlations.
+  * Accepts mdtol keyword for missing data tolerance level.
diff --git a/lib/iris/analysis/stats.py b/lib/iris/analysis/stats.py
index a49d563fc0..840a0ab7fd 100644
--- a/lib/iris/analysis/stats.py
+++ b/lib/iris/analysis/stats.py
@@ -23,137 +23,96 @@
 from six.moves import (filter, input, map, range, zip)  # noqa
 
 import numpy as np
-
 import iris
+from iris.util import broadcast_to_shape
 
 
-def _get_calc_view(cube_a, cube_b, corr_coords):
-    """
-    This function takes two cubes and returns cubes which are
-    flattened so that efficient comparisons can be performed
-    between the two.
-
-    Args:
-
-    * cube_a:
-        First cube of data
-    * cube_b:
-        Second cube of data, compatible with cube_a
-    * corr_coords:
-        Names of the dimension coordinates over which
-        to calculate correlations.
-
-    Returns:
-
-    * reshaped_a/reshaped_b:
-        The data arrays of cube_a/cube_b reshaped
-        so that the dimensions to be compared are
-        flattened into the 0th dimension of the
-        return array and other dimensions are
-        preserved.
-    * res_ind:
-        The indices of the dimensions that we
-        are not comparing, in terms of cube_a/cube_b
-
-    """
-
-    # Following lists to be filled with:
-    # indices of dimension we are not comparing
-    res_ind = []
-    # indices of dimensions we are comparing
-    slice_ind = []
-    for i, c in enumerate(cube_a.dim_coords):
-        if not c.name() in corr_coords:
-            res_ind.append(i)
-        else:
-            slice_ind.append(i)
-
-    # sanitise input
-    dim_coord_names = [c.name() for c in cube_a.dim_coords]
-    if corr_coords is None:
-        corr_coords = dim_coord_names
-
-    if ([c.name() for c in cube_a.dim_coords] !=
-            [c.name() for c in cube_b.dim_coords]):
-        raise ValueError("Cubes are incompatible.")
-
-    for c in corr_coords:
-        if c not in dim_coord_names:
-            raise ValueError("%s coord "
-                             "does not exist in cube." % c)
-
-    # Reshape data to be data to correlate in 0th dim and
-    # other grid points in 1st dim.
-    # Transpose to group the correlation data dims before the
-    # grid point dims.
-    data_a = cube_a.data.view()
-    data_b = cube_b.data.view()
-    dim_i_len = np.prod(np.array(cube_a.shape)[slice_ind])
-    dim_j_len = np.prod(np.array(cube_a.shape)[res_ind])
-    reshaped_a = data_a.transpose(slice_ind+res_ind)\
-                       .reshape(dim_i_len, dim_j_len)
-    reshaped_b = data_b.transpose(slice_ind+res_ind)\
-                       .reshape(dim_i_len, dim_j_len)
-
-    return reshaped_a, reshaped_b, res_ind
-
-
-def pearsonr(cube_a, cube_b, corr_coords=None):
+def pearsonr(cube_a, cube_b, corr_coords=None, weights=None, mdtol=1.):
     """
-    Calculates the n-D Pearson's r correlation
-    cube over the dimensions associated with the
-    given coordinates.
-
-    Returns a cube of the correlation between the two
-    cubes along the dimensions of the given
-    coordinates, at each point in the remaining
-    dimensions of the cubes.
-
-    For example providing two time/altitude/latitude/longitude
-    cubes and corr_coords of 'latitude' and 'longitude' will result
-    in a time/altitude cube describing the latitude/longitude
-    (i.e. pattern) correlation at each time/altitude point.
+    Calculate the Pearson's r correlation coefficient over specified
+    dimensions.
 
     Args:
 
     * cube_a, cube_b (cubes):
-        Between which the correlation field will be calculated.
-        Cubes should be the same shape and have the
-        same dimension coordinates.
-    * corr_coords (list of str):
-        The cube coordinate names over which to calculate
-        correlations. If no names are provided then
-        correlation will be calculated over all cube
-        dimensions.
+        Cubes between which the correlation will be calculated.  The cubes
+        should either be the same shape and have the same dimension coordinates
+        or one cube should be broadcastable to the other.
+    * corr_coords (str or list of str):
+        The cube coordinate name(s) over which to calculate correlations. If no
+        names are provided then correlation will be calculated over all common
+        cube dimensions.
+    * weights (numpy.ndarray, optional):
+        Weights array of same shape as (the smaller of) cube_a and cube_b. Note
+        that latitude/longitude area weights can be calculated using
+        :func:`iris.analysis.cartography.area_weights`.
+    * mdtol (float, optional):
+        Tolerance of missing data. The missing data fraction is calculated
+        based on the number of grid cells masked in both cube_a and cube_b. If
+        this fraction exceed mdtol, the returned value in the corresponding
+        cell is masked. mdtol=0 means no missing data is tolerated while
+        mdtol=1 means the resulting element will be masked if and only if all
+        contributing elements are masked in cube_a or cube_b. Defaults to 1.
 
     Returns:
-        Cube of correlations.
+        A cube of the correlation between the two input cubes along the
+        specified dimensions, at each point in the remaining dimensions of the
+        cubes.
+
+        For example providing two time/altitude/latitude/longitude cubes and
+        corr_coords of 'latitude' and 'longitude' will result in a
+        time/altitude cube describing the latitude/longitude (i.e. pattern)
+        correlation at each time/altitude point.
 
     Reference:
         http://www.statsoft.com/textbook/glosp.html#Pearson%20Correlation
 
     """
 
-    # If no coords passed then set to all coords of cube.
+    # Assign larger cube to cube_1
+    if cube_b.ndim > cube_a.ndim:
+        cube_1 = cube_b
+        cube_2 = cube_a
+    else:
+        cube_1 = cube_a
+        cube_2 = cube_b
+
+    dim_coords_1 = [coord.name() for coord in cube_1.dim_coords]
+    dim_coords_2 = [coord.name() for coord in cube_2.dim_coords]
+    common_dim_coords = list(set(dim_coords_1) & set(dim_coords_2))
+    # If no coords passed then set to all common dimcoords of cubes.
     if corr_coords is None:
-        corr_coords = [c.name() for c in cube_a.dim_coords]
-
-    vec_a, vec_b, res_ind = _get_calc_view(cube_a,
-                                           cube_b,
-                                           corr_coords)
-
-    sa = vec_a - np.mean(vec_a, 0)
-    sb = vec_b - np.mean(vec_b, 0)
-    flat_corrs = np.sum((sa*sb), 0)/np.sqrt(np.sum(sa**2, 0)*np.sum(sb**2, 0))
-
-    corrs = flat_corrs.reshape([cube_a.shape[i] for i in res_ind])
-
-    # Construct cube to hold correlation results.
-    corrs_cube = iris.cube.Cube(corrs)
-    corrs_cube.long_name = "Pearson's r"
-    corrs_cube.units = "1"
-    for i, dim in enumerate(res_ind):
-        c = cube_a.dim_coords[dim]
-        corrs_cube.add_dim_coord(c, i)
-
-    return corrs_cube
+        corr_coords = common_dim_coords
+
+    # Broadcast weights to shape of cube_1 if necessary.
+    if weights is None or cube_1.shape == cube_2.shape:
+        weights_1 = weights
+        weights_2 = weights
+    else:
+        if weights.shape != cube_2.shape:
+            raise ValueError("weights array should have dimensions {}".
+                             format(cube_2.shape))
+        dims_1_common = [i for i in range(cube_1.ndim) if
+                         dim_coords_1[i] in common_dim_coords]
+        weights_1 = broadcast_to_shape(weights, cube_1.shape, dims_1_common)
+        weights_2 = weights
+
+    # Calculate correlations.
+    s1 = cube_1 - cube_1.collapsed(corr_coords, iris.analysis.MEAN,
+                                   weights=weights_1)
+    s2 = cube_2 - cube_2.collapsed(corr_coords, iris.analysis.MEAN,
+                                   weights=weights_2)
+
+    covar = (s1*s2).collapsed(corr_coords, iris.analysis.SUM,
+                              weights=weights_1, mdtol=mdtol)
+    var_1 = (s1**2).collapsed(corr_coords, iris.analysis.SUM,
+                              weights=weights_1)
+    var_2 = (s2**2).collapsed(corr_coords, iris.analysis.SUM,
+                              weights=weights_2)
+
+    denom = iris.analysis.maths.apply_ufunc(np.sqrt, var_1*var_2,
+                                            new_unit=covar.units)
+    corr_cube = covar / denom
+    corr_cube.rename("Pearson's r")
+
+    return corr_cube
diff --git a/lib/iris/tests/analysis/test_stats.py b/lib/iris/tests/analysis/test_stats.py
deleted file mode 100644
index bb4dbadcf7..0000000000
--- a/lib/iris/tests/analysis/test_stats.py
+++ /dev/null
@@ -1,107 +0,0 @@
-# (C) British Crown Copyright 2014 - 2015, Met Office
-#
-# This file is part of Iris.
-#
-# Iris is free software: you can redistribute it and/or modify it under
-# the terms of the GNU Lesser General Public License as published by the
-# Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# Iris is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU Lesser General Public License for more details.
-#
-# You should have received a copy of the GNU Lesser General Public License
-# along with Iris.  If not, see <http://www.gnu.org/licenses/>.
-"""
-Test the iris.analysis.stats module.
-
-"""
-
-from __future__ import (absolute_import, division, print_function)
-from six.moves import (filter, input, map, range, zip)  # noqa
-
-# Import iris tests first so that some things can be initialised before
-# importing anything else.
-import iris.tests as tests
-
-import numpy as np
-
-import iris
-import iris.analysis.stats as stats
-
-
-@tests.skip_data
-class Test_corr(tests.IrisTest):
-    def setUp(self):
-        self.cube_a = iris.load_cube(iris.sample_data_path('GloSea4',
-                                                           'ensemble_001.pp'))
-        self.cube_b = iris.load_cube(iris.sample_data_path('GloSea4',
-                                                           'ensemble_002.pp'))
-
-    def test_perfect_corr(self):
-        r = stats.pearsonr(self.cube_a, self.cube_a,
-                           ['latitude',   'longitude'])
-        self.assertArrayEqual(r.data, np.array([1.]*6))
-
-    def test_perfect_corr_all_dims(self):
-        r = stats.pearsonr(self.cube_a, self.cube_a)
-        self.assertArrayEqual(r.data, np.array([1.]))
-
-    def test_incompatible_cubes(self):
-        with self.assertRaises(ValueError):
-            stats.pearsonr(self.cube_a, self.cube_b[0, :, :], 'time')
-
-    def test_compatible_cubes(self):
-        r = stats.pearsonr(self.cube_a, self.cube_b, ['latitude', 'longitude'])
-        self.assertArrayAlmostEqual(r.data, [0.99733591,
-                                             0.99501693,
-                                             0.99674225,
-                                             0.99495268,
-                                             0.99217004,
-                                             0.99362189])
-
-    def test_4d_cube_2_dims(self):
-        real_0_c = iris.coords.AuxCoord(np.int32(0), 'realization')
-        real_1_c = iris.coords.AuxCoord(np.int32(1), 'realization')
-
-        # Make cubes merge-able.
-        self.cube_a.add_aux_coord(real_0_c)
-        self.cube_b.add_aux_coord(real_1_c)
-        self.cube_a.remove_coord('forecast_period')
-        self.cube_a.remove_coord('forecast_reference_time')
-        self.cube_b.remove_coord('forecast_period')
-        self.cube_b.remove_coord('forecast_reference_time')
-        four_d_cube_a = iris.cube\
-            .CubeList([self.cube_a, self.cube_b]).merge()[0]
-        self.cube_a.remove_coord('realization')
-        self.cube_b.remove_coord('realization')
-        self.cube_a.add_aux_coord(real_1_c)
-        self.cube_b.add_aux_coord(real_0_c)
-        four_d_cube_b = iris.cube\
-            .CubeList([self.cube_a, self.cube_b]).merge()[0]
-
-        r = stats.pearsonr(four_d_cube_a, four_d_cube_b,
-                           ['latitude', 'longitude'])
-        expected_corr = [[0.99733591,
-                          0.99501693,
-                          0.99674225,
-                          0.99495268,
-                          0.99217004,
-                          0.99362189],
-                         [0.99733591,
-                          0.99501693,
-                          0.99674225,
-                          0.99495268,
-                          0.99217004,
-                          0.99362189]]
-        self.assertArrayAlmostEqual(r.data, expected_corr)
-
-    def test_non_existent_coord(self):
-        with self.assertRaises(ValueError):
-            stats.pearsonr(self.cube_a, self.cube_b, 'bad_coord')
-
-
-if __name__ == '__main__':
-    tests.main()
diff --git a/lib/iris/tests/unit/analysis/stats/__init__.py b/lib/iris/tests/unit/analysis/stats/__init__.py
new file mode 100644
index 0000000000..8dd58e46b2
--- /dev/null
+++ b/lib/iris/tests/unit/analysis/stats/__init__.py
@@ -0,0 +1,20 @@
+# (C) British Crown Copyright 2015, Met Office
+#
+# This file is part of Iris.
+#
+# Iris is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the
+# Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Iris is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with Iris.  If not, see <http://www.gnu.org/licenses/>.
+"""Unit tests for the :mod:`iris.analysis.stats` module."""
+
+from __future__ import (absolute_import, division, print_function)
+from six.moves import (filter, input, map, range, zip)  # noqa
diff --git a/lib/iris/tests/unit/analysis/stats/test_pearsonr.py b/lib/iris/tests/unit/analysis/stats/test_pearsonr.py
new file mode 100644
index 0000000000..7f19a659d0
--- /dev/null
+++ b/lib/iris/tests/unit/analysis/stats/test_pearsonr.py
@@ -0,0 +1,126 @@
+# (C) British Crown Copyright 2014 - 2015, Met Office
+#
+# This file is part of Iris.
+#
+# Iris is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the
+# Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Iris is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with Iris.  If not, see <http://www.gnu.org/licenses/>.
+"""Unit tests for the `iris.analysis.stats.pearsonr` function."""
+
+from __future__ import (absolute_import, division, print_function)
+from six.moves import (filter, input, map, range, zip)  # noqa
+
+# Import iris tests first so that some things can be initialised before
+# importing anything else.
+import iris.tests as tests
+
+import numpy as np
+import numpy.ma as ma
+
+import iris
+import iris.analysis.stats as stats
+from iris.exceptions import CoordinateNotFoundError
+
+
+@tests.skip_data
+class Test(tests.IrisTest):
+    def setUp(self):
+        # 3D cubes:
+        cube_temp = iris.load_cube(tests.get_data_path(
+            ('NetCDF', 'global', 'xyt', 'SMALL_total_column_co2.nc')))
+        self.cube_a = cube_temp[0:6]
+        self.cube_b = cube_temp[20:26]
+        cube_temp = self.cube_a.copy()
+        cube_temp.coord('latitude').guess_bounds()
+        cube_temp.coord('longitude').guess_bounds()
+        self.weights = iris.analysis.cartography.area_weights(cube_temp)
+
+    def test_perfect_corr(self):
+        r = stats.pearsonr(self.cube_a, self.cube_a,
+                           ['latitude', 'longitude'])
+        self.assertArrayEqual(r.data, np.array([1.]*6))
+
+    def test_perfect_corr_all_dims(self):
+        r = stats.pearsonr(self.cube_a, self.cube_a)
+        self.assertArrayEqual(r.data, np.array([1.]))
+
+    def test_incompatible_cubes(self):
+        with self.assertRaises(ValueError):
+            stats.pearsonr(self.cube_a[:, 0, :], self.cube_b[0, :, :],
+                           'longitude')
+
+    def test_compatible_cubes(self):
+        r = stats.pearsonr(self.cube_a, self.cube_b, ['latitude', 'longitude'])
+        self.assertArrayAlmostEqual(r.data, [0.81114936,
+                                             0.81690538,
+                                             0.79833135,
+                                             0.81118674,
+                                             0.79745386,
+                                             0.81278484])
+
+    def test_broadcast_cubes(self):
+        r1 = stats.pearsonr(self.cube_a, self.cube_b[0, :, :],
+                            ['latitude', 'longitude'])
+        r2 = stats.pearsonr(self.cube_b[0, :, :], self.cube_a,
+                            ['latitude', 'longitude'])
+        r_by_slice = [
+            stats.pearsonr(self.cube_a[i, :, :], self.cube_b[0, :, :],
+                           ['latitude', 'longitude']).data for i in range(6)
+            ]
+        self.assertArrayEqual(r1.data, np.array(r_by_slice))
+        self.assertArrayEqual(r2.data, np.array(r_by_slice))
+
+    def test_compatible_cubes_weighted(self):
+        r = stats.pearsonr(self.cube_a, self.cube_b, ['latitude', 'longitude'],
+                           self.weights)
+        self.assertArrayAlmostEqual(r.data, [0.79106045,
+                                             0.79989169,
+                                             0.78826918,
+                                             0.79925855,
+                                             0.79011544,
+                                             0.80115837])
+
+    def test_broadcast_cubes_weighted(self):
+        r = stats.pearsonr(self.cube_a, self.cube_b[0, :, :],
+                           ['latitude', 'longitude'],
+                           weights=self.weights[0, :, :])
+        r_by_slice = [
+            stats.pearsonr(self.cube_a[i, :, :], self.cube_b[0, :, :],
+                           ['latitude', 'longitude'],
+                           weights=self.weights[0, :, :]
+                           ).data for i in range(6)
+            ]
+        self.assertArrayAlmostEqual(r.data, np.array(r_by_slice))
+
+    def test_weight_error(self):
+        with self.assertRaises(ValueError):
+            stats.pearsonr(self.cube_a, self.cube_b[0, :, :],
+                           ['latitude', 'longitude'],
+                           weights=self.weights)
+
+    def test_non_existent_coord(self):
+        with self.assertRaises(CoordinateNotFoundError):
+            stats.pearsonr(self.cube_a, self.cube_b, 'bad_coord')
+
+    def test_mdtol(self):
+        cube_small = self.cube_a[:, 0, 0]
+        cube_small_masked = cube_small.copy()
+        cube_small_masked.data = ma.array(
+            cube_small.data, mask=np.array([0, 0, 0, 1, 1, 1], dtype=bool))
+        r1 = stats.pearsonr(cube_small, cube_small_masked)
+        r2 = stats.pearsonr(cube_small, cube_small_masked, mdtol=0.49)
+        self.assertArrayAlmostEqual(r1.data, np.array([0.74586593]))
+        self.assertMaskedArrayEqual(r2.data, ma.array([0], mask=[True]))
+
+
+if __name__ == '__main__':
+    tests.main()