Skip to content

Commit

Permalink
Merge pull request #412 from davidhassell/dask-mean-of-upper-decile
Browse files Browse the repository at this point in the history
dask: `Data.mean_of_upper_decile`
  • Loading branch information
davidhassell authored Jun 22, 2022
2 parents c7ed270 + 4d65691 commit bf6b5f7
Show file tree
Hide file tree
Showing 4 changed files with 131 additions and 100 deletions.
9 changes: 4 additions & 5 deletions cf/data/dask_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,6 @@
instance, as would be passed to `dask.array.map_blocks`.
"""
from functools import reduce
from operator import mul

import dask.array as da
import numpy as np
from dask.core import flatten
Expand Down Expand Up @@ -271,6 +268,8 @@ def cf_percentile(a, q, axis, method, keepdims=False, mtol=1):
`numpy.ndarray`
"""
from math import prod

if np.ma.is_masked(a):
# ------------------------------------------------------------
# Input array is masked: Replace missing values with NaNs and
Expand All @@ -285,8 +284,8 @@ def cf_percentile(a, q, axis, method, keepdims=False, mtol=1):
# Count the number of missing values that contribute to
# each output percentile value and make a corresponding
# mask
full_size = reduce(
mul, [size for i, size in enumerate(a.shape) if i in axis], 1
full_size = prod(
[size for i, size in enumerate(a.shape) if i in axis]
)
n_missing = full_size - np.ma.count(
a, axis=axis, keepdims=keepdims
Expand Down
104 changes: 45 additions & 59 deletions cf/data/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -1782,21 +1782,24 @@ def median(self, axes=None, squeeze=False, mtol=1, inplace=False):
50, axes=axes, squeeze=squeeze, mtol=mtol, inplace=inplace
)

@daskified(_DASKIFIED_VERBOSE)
@_inplace_enabled(default=False)
def mean_of_upper_decile(
self,
axes=None,
weights=None,
method="linear",
squeeze=False,
mtol=1,
include_decile=True,
split_every=None,
inplace=False,
):
"""Calculate means of the upper deciles.
"""Mean of values defined by the upper tenth of their
distribution.
Calculates the mean of the upper decile, or the mean of the
upper decile values along axes.
For the values defined by the upper tenth of their
distribution, calculates their mean, or their mean along axes.
See
https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods
Expand All @@ -1810,20 +1813,25 @@ def mean_of_upper_decile(
{{weights: data_like, `dict`, or `None`, optional}}
TODODASK - note that weights only applies to the
calculation of the mean, not the upper
decile.
.. note:: *weights* only applies to the calculation of
the mean of the values defined by the upper
tenth of their distribution.
{{percentile method: `str`, optional}}
.. versionadded:: TODODASK
{{collapse squeeze: `bool`, optional}}
{{mtol: number, optional}}
TODODASK - note that mtol only applies to the
calculation of the upper decile, not the
mean.
.. note:: *mtol* only applies to the calculation of
the location of the 90th percentile.
include_decile: `bool`, optional
TODODASK
If True (the default) then include in the mean any
values that are equal to the 90th percentile. If
False then exclude them.
{{split_every: `int` or `dict`, optional}}
Expand All @@ -1839,35 +1847,40 @@ def mean_of_upper_decile(
**Examples**
TODODASK
>>> d = cf.Data(np.arange(20).reshape(4, 5), 'm')
>>> print(d.array)
[[ 0 1 2 3 4]
[ 5 6 7 8 9]
[10 11 12 13 14]
[15 16 17 18 19]]
>>> e = d.mean_of_upper_decile()
>>> e
<CF Data(1, 1): [[18.5]] m>
"""

# TODODASK: Some updates off the back of daskifying collapse
# have been done, but still needs looking at. A unit
# test has also been written, but not run. Needs
# __lt__ and __le__.

d = _inplace_enabled_define_and_cleanup(self)

# Find the 90th percentile
p90 = d.percentile(
90, axes=axes, squeeze=False, mtol=mtol, inplace=False
)

with np.testing.suppress_warnings() as sup:
sup.filter(
RuntimeWarning, message=".*invalid value encountered in less.*"
)
if include_decile:
mask = d < p90
else:
mask = d <= p90
# Mask all elements that are less than (or equal to) the 90th
# percentile
if include_decile:
less_than_p90 = d < p90
else:
less_than_p90 = d <= p90

if mtol < 1:
mask.filled(False, inplace=True)
# Set missing values to True to ensure that 'd' gets
# masked at those locations
less_than_p90.filled(True, inplace=True)

d.where(mask, cf_masked, inplace=True)
d.where(less_than_p90, cf_masked, inplace=True)

# Find the mean of elements greater than (or equal to) the
# 90th percentile
d.mean(
axes=axes,
weights=weights,
Expand Down Expand Up @@ -1947,36 +1960,9 @@ def percentile(
By default, if *axes* is `None`, all axes are selected.
method: `str`, optional
Specify the interpolation method to use when the
desired percentile lies between two data values. The
methods are listed here, but their definitions must be
referenced from the documentation for
`numpy.percentile`.
For the default ``'linear'`` method, if the percentile
lies between two adjacent data values ``i < j`` then
the percentile is calculated as ``i+(j-i)*fraction``,
where ``fraction`` is the fractional part of the index
surrounded by ``i`` and ``j``.
===============================
*method*
===============================
``'inverted_cdf'``
``'averaged_inverted_cdf'``
``'closest_observation'``
``'interpolated_inverted_cdf'``
``'hazen'``
``'weibull'``
``'linear'`` (default)
``'median_unbiased'``
``'normal_unbiased'``
``'lower'``
``'higher'``
``'nearest'``
``'midpoint'``
===============================
{{percentile method: `str`, optional}}
.. versionadded:: TODODASK
squeeze: `bool`, optional
If True then all axes over which percentiles are
Expand Down Expand Up @@ -7990,10 +7976,10 @@ def filled(self, fill_value=None, inplace=False):
**Examples**
>>> d = {{package}}.Data([[1, 2, 3]])
>>> d = cf.Data([[1, 2, 3]])
>>> print(d.filled().array)
[[1 2 3]]
>>> d[0, 0] = cfdm.masked
>>> d[0, 0] = cf.masked
>>> print(d.filled().array)
[-9223372036854775806 2 3]
>>> d.set_fill_value(-99)
Expand Down
31 changes: 31 additions & 0 deletions cf/docstring/docstring.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,37 @@
When *weights* is a data_like object then it must have
the same shape as the array.""",
# percentile method
"{{percentile method: `str`, optional}}": """method: `str`, optional
Specify the interpolation method to use when the
percentile lies between two data values. The methods
are listed here, but their definitions must be
referenced from the documentation for
`numpy.percentile`.
For the default ``'linear'`` method, if the percentile
lies between two adjacent data values ``i < j`` then
the percentile is calculated as ``i+(j-i)*fraction``,
where ``fraction`` is the fractional part of the index
surrounded by ``i`` and ``j``.
===============================
*method*
===============================
``'inverted_cdf'``
``'averaged_inverted_cdf'``
``'closest_observation'``
``'interpolated_inverted_cdf'``
``'hazen'``
``'weibull'``
``'linear'`` (default)
``'median_unbiased'``
``'normal_unbiased'``
``'lower'``
``'higher'``
``'nearest'``
``'midpoint'``
===============================""",
# ----------------------------------------------------------------
# Method description substitutions (4 levels of indentation)
# ----------------------------------------------------------------
Expand Down
Loading

0 comments on commit bf6b5f7

Please sign in to comment.