Skip to content

Commit

Permalink
Merge pull request #412 from davidhassell/dask-mean-of-upper-decile
Browse files Browse the repository at this point in the history
dask: `Data.mean_of_upper_decile`
  • Loading branch information
davidhassell authored Jun 22, 2022
2 parents c7ed270 + 4d65691 commit bf6b5f7
Show file tree
Hide file tree
Showing 4 changed files with 131 additions and 100 deletions.
9 changes: 4 additions & 5 deletions cf/data/dask_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,6 @@
instance, as would be passed to `dask.array.map_blocks`.
"""
from functools import reduce
from operator import mul

import dask.array as da
import numpy as np
from dask.core import flatten
Expand Down Expand Up @@ -271,6 +268,8 @@ def cf_percentile(a, q, axis, method, keepdims=False, mtol=1):
`numpy.ndarray`
"""
from math import prod

if np.ma.is_masked(a):
# ------------------------------------------------------------
# Input array is masked: Replace missing values with NaNs and
Expand All @@ -285,8 +284,8 @@ def cf_percentile(a, q, axis, method, keepdims=False, mtol=1):
# Count the number of missing values that contribute to
# each output percentile value and make a corresponding
# mask
full_size = reduce(
mul, [size for i, size in enumerate(a.shape) if i in axis], 1
full_size = prod(
[size for i, size in enumerate(a.shape) if i in axis]
)
n_missing = full_size - np.ma.count(
a, axis=axis, keepdims=keepdims
Expand Down
104 changes: 45 additions & 59 deletions cf/data/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -1782,21 +1782,24 @@ def median(self, axes=None, squeeze=False, mtol=1, inplace=False):
50, axes=axes, squeeze=squeeze, mtol=mtol, inplace=inplace
)

@daskified(_DASKIFIED_VERBOSE)
@_inplace_enabled(default=False)
def mean_of_upper_decile(
self,
axes=None,
weights=None,
method="linear",
squeeze=False,
mtol=1,
include_decile=True,
split_every=None,
inplace=False,
):
"""Calculate means of the upper deciles.
"""Mean of values defined by the upper tenth of their
distribution.
Calculates the mean of the upper decile, or the mean of the
upper decile values along axes.
For the values defined by the upper tenth of their
distribution, calculates their mean, or their mean along axes.
See
https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods
Expand All @@ -1810,20 +1813,25 @@ def mean_of_upper_decile(
{{weights: data_like, `dict`, or `None`, optional}}
TODODASK - note that weights only applies to the
calculation of the mean, not the upper
decile.
.. note:: *weights* only applies to the calculation of
the mean of the values defined by the upper
tenth of their distribution.
{{percentile method: `str`, optional}}
.. versionadded:: TODODASK
{{collapse squeeze: `bool`, optional}}
{{mtol: number, optional}}
TODODASK - note that mtol only applies to the
calculation of the upper decile, not the
mean.
.. note:: *mtol* only applies to the calculation of
the location of the 90th percentile.
include_decile: `bool`, optional
TODODASK
If True (the default) then include in the mean any
values that are equal to the 90th percentile. If
False then exclude them.
{{split_every: `int` or `dict`, optional}}
Expand All @@ -1839,35 +1847,40 @@ def mean_of_upper_decile(
**Examples**
TODODASK
>>> d = cf.Data(np.arange(20).reshape(4, 5), 'm')
>>> print(d.array)
[[ 0 1 2 3 4]
[ 5 6 7 8 9]
[10 11 12 13 14]
[15 16 17 18 19]]
>>> e = d.mean_of_upper_decile()
>>> e
<CF Data(1, 1): [[18.5]] m>
"""

# TODODASK: Some updates off the back of daskifying collapse
# have been done, but still needs looking at. A unit
# test has also been written, but not run. Needs
# __lt__ and __le__.

d = _inplace_enabled_define_and_cleanup(self)

# Find the 90th percentile
p90 = d.percentile(
90, axes=axes, squeeze=False, mtol=mtol, inplace=False
)

with np.testing.suppress_warnings() as sup:
sup.filter(
RuntimeWarning, message=".*invalid value encountered in less.*"
)
if include_decile:
mask = d < p90
else:
mask = d <= p90
# Mask all elements that are less than (or equal to) the 90th
# percentile
if include_decile:
less_than_p90 = d < p90
else:
less_than_p90 = d <= p90

if mtol < 1:
mask.filled(False, inplace=True)
# Set missing values to True to ensure that 'd' gets
# masked at those locations
less_than_p90.filled(True, inplace=True)

d.where(mask, cf_masked, inplace=True)
d.where(less_than_p90, cf_masked, inplace=True)

# Find the mean of elements greater than (or equal to) the
# 90th percentile
d.mean(
axes=axes,
weights=weights,
Expand Down Expand Up @@ -1947,36 +1960,9 @@ def percentile(
By default, if *axes* is `None`, all axes are selected.
method: `str`, optional
Specify the interpolation method to use when the
desired percentile lies between two data values. The
methods are listed here, but their definitions must be
referenced from the documentation for
`numpy.percentile`.
For the default ``'linear'`` method, if the percentile
lies between two adjacent data values ``i < j`` then
the percentile is calculated as ``i+(j-i)*fraction``,
where ``fraction`` is the fractional part of the index
surrounded by ``i`` and ``j``.
===============================
*method*
===============================
``'inverted_cdf'``
``'averaged_inverted_cdf'``
``'closest_observation'``
``'interpolated_inverted_cdf'``
``'hazen'``
``'weibull'``
``'linear'`` (default)
``'median_unbiased'``
``'normal_unbiased'``
``'lower'``
``'higher'``
``'nearest'``
``'midpoint'``
===============================
{{percentile method: `str`, optional}}
.. versionadded:: TODODASK
squeeze: `bool`, optional
If True then all axes over which percentiles are
Expand Down Expand Up @@ -7990,10 +7976,10 @@ def filled(self, fill_value=None, inplace=False):
**Examples**
>>> d = {{package}}.Data([[1, 2, 3]])
>>> d = cf.Data([[1, 2, 3]])
>>> print(d.filled().array)
[[1 2 3]]
>>> d[0, 0] = cfdm.masked
>>> d[0, 0] = cf.masked
>>> print(d.filled().array)
[-9223372036854775806 2 3]
>>> d.set_fill_value(-99)
Expand Down
31 changes: 31 additions & 0 deletions cf/docstring/docstring.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,37 @@
When *weights* is a data_like object then it must have
the same shape as the array.""",
# percentile method
"{{percentile method: `str`, optional}}": """method: `str`, optional
Specify the interpolation method to use when the
percentile lies between two data values. The methods
are listed here, but their definitions must be
referenced from the documentation for
`numpy.percentile`.
For the default ``'linear'`` method, if the percentile
lies between two adjacent data values ``i < j`` then
the percentile is calculated as ``i+(j-i)*fraction``,
where ``fraction`` is the fractional part of the index
surrounded by ``i`` and ``j``.
===============================
*method*
===============================
``'inverted_cdf'``
``'averaged_inverted_cdf'``
``'closest_observation'``
``'interpolated_inverted_cdf'``
``'hazen'``
``'weibull'``
``'linear'`` (default)
``'median_unbiased'``
``'normal_unbiased'``
``'lower'``
``'higher'``
``'nearest'``
``'midpoint'``
===============================""",
# ----------------------------------------------------------------
# Method description substitutions (4 levels of indentation)
# ----------------------------------------------------------------
Expand Down
Loading

0 comments on commit bf6b5f7

Please sign in to comment.