diff --git a/cf/data/dask_utils.py b/cf/data/dask_utils.py
index 0771355e48..ad6c0c33b5 100644
--- a/cf/data/dask_utils.py
+++ b/cf/data/dask_utils.py
@@ -4,9 +4,6 @@ instance, as would be passed to `dask.array.map_blocks`.
 
 """
 
-from functools import reduce
-from operator import mul
-
 import dask.array as da
 import numpy as np
 from dask.core import flatten
@@ -271,6 +268,8 @@ def cf_percentile(a, q, axis, method, keepdims=False, mtol=1):
         `numpy.ndarray`
 
     """
+    from math import prod
+
     if np.ma.is_masked(a):
         # ------------------------------------------------------------
         # Input array is masked: Replace missing values with NaNs and
@@ -285,8 +284,8 @@ def cf_percentile(a, q, axis, method, keepdims=False, mtol=1):
             # Count the number of missing values that contribute to
             # each output percentile value and make a corresponding
             # mask
-            full_size = reduce(
-                mul, [size for i, size in enumerate(a.shape) if i in axis], 1
+            full_size = prod(
+                [size for i, size in enumerate(a.shape) if i in axis]
             )
             n_missing = full_size - np.ma.count(
                 a, axis=axis, keepdims=keepdims
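The `math.prod` swap above is behaviour-preserving. A minimal standalone
check (not part of the patch; the shape and axis selection are made up for
illustration):

    # math.prod matches the reduce(mul, ..., 1) idiom it replaces,
    # including for an empty selection of axes.
    from functools import reduce
    from math import prod
    from operator import mul

    shape = (4, 5, 6)  # hypothetical chunk shape
    for axis in ({0, 2}, set()):
        sizes = [size for i, size in enumerate(shape) if i in axis]
        # Both give 1 when 'sizes' is empty, so dropping reduce's
        # explicit initial value of 1 is safe
        assert prod(sizes) == reduce(mul, sizes, 1)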
diff --git a/cf/data/data.py b/cf/data/data.py
index 3f1281d1db..a2e511e3cb 100644
--- a/cf/data/data.py
+++ b/cf/data/data.py
@@ -1782,21 +1782,24 @@ def median(self, axes=None, squeeze=False, mtol=1, inplace=False):
             50, axes=axes, squeeze=squeeze, mtol=mtol, inplace=inplace
         )
 
+    @daskified(_DASKIFIED_VERBOSE)
     @_inplace_enabled(default=False)
     def mean_of_upper_decile(
         self,
         axes=None,
         weights=None,
+        method="linear",
         squeeze=False,
         mtol=1,
         include_decile=True,
         split_every=None,
         inplace=False,
     ):
-        """Calculate means of the upper deciles.
+        """Mean of values defined by the upper tenth of their
+        distribution.
 
-        Calculates the mean of the upper decile or the mean or the
-        mean of the upper decile values along axes.
+        For the values defined by the upper tenth of their
+        distribution, calculates their mean, or their mean along
+        axes.
 
         See
         https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods
@@ -1810,20 +1813,25 @@ def mean_of_upper_decile(
 
             {{weights: data_like, `dict`, or `None`, optional}}
 
-                TODODASK - note that weights only applies to the
-                           calculation of the mean, not the upper
-                           decile.
+                .. note:: *weights* only applies to the calculation of
+                          the mean defined by the upper tenth of their
+                          distribution.
+
+            {{percentile method: `str`, optional}}
+
+                .. versionadded:: TODODASK
 
             {{collapse squeeze: `bool`, optional}}
 
             {{mtol: number, optional}}
 
-                TODODASK - note that mtol only applies to the
-                           calculation of the upper decile, not the
-                           mean.
+                .. note:: *mtol* only applies to the calculation of
+                          the location of the 90th percentile.
 
             include_decile: `bool`, optional
-                TODODASK
+                If True (the default) then include in the mean any
+                values that are equal to the 90th percentile;
+                otherwise exclude them.
 
             {{split_every: `int` or `dict`, optional}}
 
@@ -1839,35 +1847,40 @@ def mean_of_upper_decile(
 
         **Examples**
 
-        TODODASK
+        >>> d = cf.Data(np.arange(20).reshape(4, 5), 'm')
+        >>> print(d.array)
+        [[ 0  1  2  3  4]
+         [ 5  6  7  8  9]
+         [10 11 12 13 14]
+         [15 16 17 18 19]]
+        >>> e = d.mean_of_upper_decile()
+        >>> e
+        <CF Data(1, 1): [[18.5]] m>
 
        """
-
-        # TODODASK: Some updates off the back of daskifying collapse
-        #           have been done, but still needs looking at. A unit
-        #           test has also been written, but not run. Needs
-        #           __lt__ and __le__.
-
         d = _inplace_enabled_define_and_cleanup(self)
 
-        p90 = d.percentile(
-            90, axes=axes, squeeze=False, mtol=mtol, inplace=False
-        )
+        # Find the 90th percentile
+        p90 = d.percentile(
+            90, axes=axes, method=method, squeeze=False, mtol=mtol,
+            inplace=False
+        )
 
-        with np.testing.suppress_warnings() as sup:
-            sup.filter(
-                RuntimeWarning, message=".*invalid value encountered in less.*"
-            )
-            if include_decile:
-                mask = d < p90
-            else:
-                mask = d <= p90
+        # Mask all elements that are less than (or equal to) the 90th
+        # percentile
+        if include_decile:
+            less_than_p90 = d < p90
+        else:
+            less_than_p90 = d <= p90
 
         if mtol < 1:
-            mask.filled(False, inplace=True)
+            # Set missing values to True to ensure that 'd' gets
+            # masked at those locations
+            less_than_p90.filled(True, inplace=True)
 
-        d.where(mask, cf_masked, inplace=True)
+        d.where(less_than_p90, cf_masked, inplace=True)
 
+        # Find the mean of elements greater than (or equal to) the
+        # 90th percentile
         d.mean(
             axes=axes,
             weights=weights,
@@ -1947,36 +1960,9 @@ def percentile(
 
                 By default, or if *axes* is `None`, all axes are
                 selected.
 
-            method: `str`, optional
-                Specify the interpolation method to use when the
-                desired percentile lies between two data values. The
-                methods are listed here, but their definitions must be
-                referenced from the documentation for
-                `numpy.percentile`.
-
-                For the default ``'linear'`` method, if the percentile
-                lies between two adjacent data values ``i < j`` then
-                the percentile is calculated as ``i+(j-i)*fraction``,
-                where ``fraction`` is the fractional part of the index
-                surrounded by ``i`` and ``j``.
-
-                ===============================
-                *method*
-                ===============================
-                ``'inverted_cdf'``
-                ``'averaged_inverted_cdf'``
-                ``'closest_observation'``
-                ``'interpolated_inverted_cdf'``
-                ``'hazen'``
-                ``'weibull'``
-                ``'linear'`` (default)
-                ``'median_unbiased'``
-                ``'normal_unbiased'``
-                ``'lower'``
-                ``'higher'``
-                ``'nearest'``
-                ``'midpoint'``
-                ===============================
+            {{percentile method: `str`, optional}}
+
+                .. versionadded:: TODODASK
 
             squeeze: `bool`, optional
                 If True then all axes over which percentiles are
@@ -7990,10 +7976,10 @@ def filled(self, fill_value=None, inplace=False):
 
         **Examples**
 
-        >>> d = {{package}}.Data([[1, 2, 3]])
+        >>> d = cf.Data([[1, 2, 3]])
         >>> print(d.filled().array)
         [[1 2 3]]
-        >>> d[0, 0] = cfdm.masked
+        >>> d[0, 0] = cf.masked
         >>> print(d.filled().array)
         [[-9223372036854775806 2 3]]
         >>> d.set_fill_value(-99)
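The daskified method above boils down to "mask everything below the 90th
percentile, then take the mean". A pure-numpy sketch of that recipe (an
illustration of the algorithm, not cf-python code), reproducing the new
docstring example:

    import numpy as np

    a = np.arange(20.0).reshape(4, 5)

    # Locate the 90th percentile (17.1 for the values 0-19)
    p90 = np.percentile(a, 90)

    # include_decile=True keeps values equal to the percentile, so
    # only strictly smaller values are masked
    upper = np.ma.masked_where(a < p90, a)

    print(upper.mean())  # 18.5, as in the docstring example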
diff --git a/cf/docstring/docstring.py b/cf/docstring/docstring.py
index 26537d7cec..364e246434 100644
--- a/cf/docstring/docstring.py
+++ b/cf/docstring/docstring.py
@@ -341,6 +341,37 @@
                 When *weights* is a data_like object then it must have
                 the same shape as the array.""",
+        # percentile method
+        "{{percentile method: `str`, optional}}": """method: `str`, optional
+                Specify the interpolation method to use when the
+                percentile lies between two data values. The methods
+                are listed here, but their definitions must be
+                referenced from the documentation for
+                `numpy.percentile`.
+
+                For the default ``'linear'`` method, if the percentile
+                lies between two adjacent data values ``i < j`` then
+                the percentile is calculated as ``i+(j-i)*fraction``,
+                where ``fraction`` is the fractional part of the index
+                surrounded by ``i`` and ``j``.
+
+                ===============================
+                *method*
+                ===============================
+                ``'inverted_cdf'``
+                ``'averaged_inverted_cdf'``
+                ``'closest_observation'``
+                ``'interpolated_inverted_cdf'``
+                ``'hazen'``
+                ``'weibull'``
+                ``'linear'`` (default)
+                ``'median_unbiased'``
+                ``'normal_unbiased'``
+                ``'lower'``
+                ``'higher'``
+                ``'nearest'``
+                ``'midpoint'``
+                ===============================""",
     # ----------------------------------------------------------------
     # Method description substitutions (4 levels of indentation)
     # ----------------------------------------------------------------
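The tabulated options are the `numpy.percentile` interpolation methods (the
keyword is named `method` from numpy 1.22 onwards). A small illustration,
with the values worked out by hand, of how a few of them differ:

    import numpy as np

    a = np.array([1.0, 2.0, 3.0, 4.0])

    # The 90th percentile falls at fractional index 2.7, between
    # i = 3.0 and j = 4.0, so 'linear' gives i + (j - i)*0.7
    print(np.percentile(a, 90))                     # 3.7
    print(np.percentile(a, 90, method="lower"))     # 3.0
    print(np.percentile(a, 90, method="higher"))    # 4.0
    print(np.percentile(a, 90, method="midpoint"))  # 3.5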
diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py
index 69df26c7f1..af47cd97ca 100644
--- a/cf/test/test_Data.py
+++ b/cf/test/test_Data.py
@@ -2644,7 +2644,7 @@ def test_Data_trigonometric_hyperbolic(self):
                     a = np.sin(a.data)
 
                 c = getattr(np.ma, method)(a)
-                for units in (None, "", "1", "radians", "K"):
+                for units in (None, "", "1", "radians", "m"):
                     d = cf.Data(a, units=units)
                     # Suppress warnings that some values are
                     # invalid (NaN, +/- inf) or there is
@@ -3173,7 +3173,7 @@ def test_Data_integral(self):
         # Masked array, non-masked weights
         a = self.ma
         weights = self.w
-        d = cf.Data(a, "K", chunks=(2, 3, 2, 5))
+        d = cf.Data(a, "m", chunks=(2, 3, 2, 5))
 
         for axis in axis_combinations(a):
             b = reshape_array(a, axis)
@@ -3190,7 +3190,7 @@ def test_Data_max(self):
         # Masked array
         a = self.ma
-        d = cf.Data(a, "K", chunks=(2, 3, 2, 5))
+        d = cf.Data(a, "m", chunks=(2, 3, 2, 5))
 
         for axis in axis_combinations(a):
             b = reshape_array(a, axis)
@@ -3206,7 +3206,7 @@ def test_Data_maximum_absolute_value(self):
         # Masked array
         a = self.ma
-        d = cf.Data(a, "K", chunks=(2, 3, 2, 5))
+        d = cf.Data(a, "m", chunks=(2, 3, 2, 5))
 
         for axis in axis_combinations(a):
             b = reshape_array(a, axis)
@@ -3223,7 +3223,7 @@ def test_Data_mean(self):
         # Masked array, non-masked weights
         a = self.ma
         weights = self.w
-        d = cf.Data(a, "K", chunks=(2, 3, 2, 5))
+        d = cf.Data(a, "m", chunks=(2, 3, 2, 5))
 
         for axis in axis_combinations(a):
             b = reshape_array(a, axis)
@@ -3241,7 +3241,7 @@ def test_Data_mean_absolute_value(self):
         # Masked array, non-masked weights
         a = self.ma
         weights = self.w
-        d = cf.Data(a, "K", chunks=(2, 3, 2, 5))
+        d = cf.Data(a, "m", chunks=(2, 3, 2, 5))
 
         for axis in axis_combinations(a):
             b = reshape_array(a, axis)
@@ -3258,7 +3258,7 @@ def test_Data_mean_absolute_value(self):
     def test_Data_mid_range(self):
         # Masked array, non-masked weights
         a = self.ma
-        d = cf.Data(a, "K", chunks=(2, 3, 2, 5))
+        d = cf.Data(a, "m", chunks=(2, 3, 2, 5))
 
         for axis in axis_combinations(a):
             b = reshape_array(a, axis)
@@ -3277,7 +3277,7 @@ def test_Data_mid_range(self):
     def test_Data_min(self):
         # Masked array
         a = self.ma
-        d = cf.Data(a, "K", chunks=(2, 3, 2, 5))
+        d = cf.Data(a, "m", chunks=(2, 3, 2, 5))
 
         for axis in axis_combinations(a):
             b = reshape_array(a, axis)
@@ -3293,7 +3293,7 @@ def test_Data_min(self):
     def test_Data_minimum_absolute_value(self):
         # Masked array
         a = self.ma
-        d = cf.Data(a, "K", chunks=(2, 3, 2, 5))
+        d = cf.Data(a, "m", chunks=(2, 3, 2, 5))
 
         for axis in axis_combinations(a):
             b = reshape_array(a, axis)
@@ -3310,7 +3310,7 @@ def test_Data_range(self):
         # Masked array
         a = self.ma
-        d = cf.Data(a, "K", chunks=(2, 3, 2, 5))
+        d = cf.Data(a, "m", chunks=(2, 3, 2, 5))
 
         for axis in axis_combinations(a):
             b = reshape_array(a, axis)
@@ -3330,7 +3330,7 @@ def test_Data_root_mean_square(self):
         # Masked array, non-masked weights
         a = self.ma
         weights = self.w
-        d = cf.Data(a, "K", chunks=(2, 3, 2, 5))
+        d = cf.Data(a, "m", chunks=(2, 3, 2, 5))
 
         for axis in axis_combinations(a):
             b = reshape_array(a, axis)
@@ -3347,7 +3347,7 @@ def test_Data_root_mean_square(self):
     def test_Data_sample_size(self):
         # Masked array
         a = self.ma
-        d = cf.Data(a, "K", chunks=(2, 3, 2, 5))
+        d = cf.Data(a, "m", chunks=(2, 3, 2, 5))
 
         for axis in axis_combinations(a):
             b = reshape_array(a, axis)
@@ -3362,7 +3362,7 @@ def test_Data_sample_size(self):
 
         # Non-masked array
         a = self.a
-        d = cf.Data(a, "K", chunks=(2, 3, 2, 5))
+        d = cf.Data(a, "m", chunks=(2, 3, 2, 5))
 
         for axis in axis_combinations(a):
             b = reshape_array(a, axis)
@@ -3378,7 +3378,7 @@ def test_Data_std(self):
         # Masked array, non-masked weights
         a = self.ma
         weights = self.w
-        d = cf.Data(a, "K", chunks=(2, 3, 2, 5))
+        d = cf.Data(a, "m", chunks=(2, 3, 2, 5))
 
         std = d.std(weights=weights, ddof=1)
         var = d.var(weights=weights, ddof=1)
@@ -3389,7 +3389,7 @@ def test_Data_sum(self):
         # Masked array, non-masked weights
         a = self.ma
         weights = self.w
-        d = cf.Data(a, "K", chunks=(2, 3, 2, 5))
+        d = cf.Data(a, "m", chunks=(2, 3, 2, 5))
 
         for axis in axis_combinations(a):
             b = reshape_array(a, axis)
@@ -3407,7 +3407,7 @@ def test_Data_sum_of_squares(self):
         # Masked array, non-masked weights
         a = self.ma
         weights = self.w
-        d = cf.Data(a, "K", chunks=(2, 3, 2, 5))
+        d = cf.Data(a, "m", chunks=(2, 3, 2, 5))
 
         for axis in axis_combinations(a):
             b = reshape_array(a, axis)
@@ -3425,7 +3425,7 @@ def test_Data_sum_of_weights(self):
         # Masked array, non-masked weights
         a = self.ma
         weights = self.w
-        d = cf.Data(a, "K", chunks=(2, 3, 2, 5))
+        d = cf.Data(a, "m", chunks=(2, 3, 2, 5))
 
         # Weights=None
         for axis in axis_combinations(a):
@@ -3456,7 +3456,7 @@ def test_Data_sum_of_weights2(self):
         # Masked array, non-masked weights
         a = self.ma
         weights = self.w
-        d = cf.Data(a, "K", chunks=(2, 3, 2, 5))
+        d = cf.Data(a, "m", chunks=(2, 3, 2, 5))
 
         # Weights=None
         for axis in axis_combinations(a):
@@ -3481,7 +3481,7 @@ def test_Data_var(self):
         # Masked array, non-masked weights
         a = self.ma
         weights = self.w
-        d = cf.Data(a, "K", chunks=(2, 3, 2, 5))
+        d = cf.Data(a, "m", chunks=(2, 3, 2, 5))
 
         # Weighted ddof = 0
         for axis in axis_combinations(a):
@@ -3536,12 +3536,11 @@ def test_Data_var(self):
         self.assertTrue((e.mask == b.mask).all())
         self.assertTrue(np.allclose(e, b))
 
-    @unittest.skipIf(TEST_DASKIFIED_ONLY, "Needs __lt__ and __le__")
     def test_Data_mean_of_upper_decile(self):
         # Masked array, non-masked weights
         a = self.ma
         weights = self.w
-        d = cf.Data(a, "K", chunks=(2, 3, 2, 5))
+        d = cf.Data(a, "m", chunks=(2, 3, 2, 5))
 
         for axis in axis_combinations(a):
             b = reshape_array(a, axis)
@@ -3549,7 +3548,12 @@ def test_Data_mean_of_upper_decile(self):
             b = np.ma.filled(b, np.nan)
             with np.testing.suppress_warnings() as sup:
                 sup.filter(
-                    RuntimeWarning, message=".*All-NaN slice encountered"
+                    category=RuntimeWarning,
+                    message=".*All-NaN slice encountered.*",
+                )
+                sup.filter(
+                    category=UserWarning,
+                    message="Warning: 'partition' will ignore the 'mask' of the MaskedArray.*",
                 )
                 p = np.nanpercentile(b, 90, axis=-1, keepdims=True)
@@ -3558,8 +3562,8 @@
 
             with np.testing.suppress_warnings() as sup:
                 sup.filter(
-                    RuntimeWarning,
-                    message=".*invalid value encountered in less",
+                    category=UserWarning,
+                    message="Warning: 'partition' will ignore the 'mask' of the MaskedArray.*",
                 )
                 b = np.ma.where(b < p, np.ma.masked, b)
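The switch from `e.mask` to `np.ma.getmaskarray` in the assertion below is
the robust spelling: when nothing is masked, `e.array` may come back as a
plain `numpy.ndarray`, which has no `mask` attribute. A small numpy-only
illustration:

    import numpy as np

    b = np.ma.array([1.0, 2.0], mask=[True, False])
    e = np.array([1.0, 2.0])  # plain ndarray: 'e.mask' would raise

    # getmaskarray returns a full boolean mask for masked and
    # unmasked arrays alike
    print(np.ma.getmaskarray(e))  # [False False]
    print(np.ma.getmaskarray(b))  # [ True False]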
@@ -3569,14 +3573,30 @@
             e = d.mean_of_upper_decile(
                 axes=axis, weights=weights, squeeze=True
             )
-            e = np.ma.array(e.array)
+            with np.testing.suppress_warnings() as sup:
+                sup.filter(
+                    category=UserWarning,
+                    message="Warning: 'partition' will ignore the 'mask' of the MaskedArray.*",
+                )
+                e = e.array
 
-            self.assertTrue((e.mask == b.mask).all())
+            self.assertTrue(
+                (np.ma.getmaskarray(e) == np.ma.getmaskarray(b)).all()
+            )
             self.assertTrue(np.allclose(e, b))
 
+        # mtol
+        a[0, 0] = cf.masked
+        d = cf.Data(a, "m", chunks=3)
+        e = d.mean_of_upper_decile(mtol=0)
+        self.assertEqual(e.array.mask, True)
+
+        # Inplace
+        self.assertIsNone(d.mean_of_upper_decile(inplace=True))
+
     def test_Data_collapse_mtol(self):
         # Data with exactly half of its elements masked
-        d = cf.Data(np.arange(6), "K", mask=[0, 1, 0, 1, 0, 1], chunks=2)
+        d = cf.Data(np.arange(6), "m", mask=[0, 1, 0, 1, 0, 1], chunks=2)
 
         for func in (
             d.integral,
@@ -3601,10 +3621,8 @@
             self.assertTrue(func(mtol=0.4).array.mask)
             self.assertFalse(func(mtol=0.5).array.mask)
 
-        # TODODASK - add in mean_of_upper_decile when it's daskified
-
     def test_Data_collapse_units(self):
-        d = cf.Data([1, 2], "K")
+        d = cf.Data([1, 2], "m")
 
         self.assertEqual(d.sample_size().Units, cf.Units())
 
@@ -3622,6 +3640,7 @@ def test_Data_collapse_units(self):
             d.root_mean_square,
             d.std,
             d.sum,
+            d.mean_of_upper_decile,
         ):
             self.assertEqual(func().Units, d.Units)
 
@@ -3644,8 +3663,6 @@
         for func in (d.sum_of_squares, d.var):
             self.assertEqual(func().Units, cf.Units())
 
-        # TODODASK - add in mean_of_upper_decile when it's daskified
-
     def test_Data_collapse_keepdims(self):
         d = cf.Data(np.arange(6).reshape(2, 3))
 
@@ -3668,6 +3685,7 @@
             d.sum_of_weights,
             d.sum_of_weights2,
             d.var,
+            d.mean_of_upper_decile,
         ):
             for axis in axis_combinations(d):
                 e = func(axes=axis, squeeze=False)
@@ -3679,8 +3697,6 @@
                 s = [n for i, n in enumerate(d.shape) if i not in axis]
                 self.assertEqual(e.shape, tuple(s))
 
-        # TODODASK - add in mean_of_upper_decile
-
     def test_Data_collapse_dtype(self):
         d = cf.Data([1, 2, 3, 4], dtype="i4", chunks=2)
         e = cf.Data([1.0, 2, 3, 4], dtype="f4", chunks=2)
@@ -3716,6 +3732,7 @@
             x.root_mean_square,
             x.std,
             x.var,
+            x.mean_of_upper_decile,
         ):
             self.assertEqual(func().dtype, r)
 
@@ -3736,8 +3753,6 @@
         ):
             self.assertTrue(func(weights=w).dtype, r)
 
-        # TODODASK - add in mean_of_upper_decile
-
     def test_Data_get_units(self):
         for units in ("", "m", "days since 2000-01-01"):
             d = cf.Data(1, units)
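For reference, the mtol behaviour asserted in test_Data_collapse_mtol above,
sketched as a usage example (assuming, as the test does, that a result
element is masked when the fraction of missing input values exceeds mtol):

    import numpy as np
    import cf

    # Exactly half of the six elements are masked
    d = cf.Data(np.arange(6), "m", mask=[0, 1, 0, 1, 0, 1], chunks=2)

    print(bool(d.mean(mtol=0.4).array.mask))  # True: 0.5 > 0.4
    print(bool(d.mean(mtol=0.5).array.mask))  # False: 0.5 <= 0.5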