From dd63d091eafdc4602a43371f07b8f8c794fafb09 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 10 Feb 2022 12:27:52 +0000 Subject: [PATCH 1/4] dask: Data.__contains__ --- cf/data/dask_utils.py | 26 ++++++++++++++++++++++ cf/data/data.py | 52 ++++++++++++++++++++++++++++++------------- cf/test/test_Data.py | 17 +++++++------- 3 files changed, 72 insertions(+), 23 deletions(-) diff --git a/cf/data/dask_utils.py b/cf/data/dask_utils.py index d858b8ecf1..766ae576ea 100644 --- a/cf/data/dask_utils.py +++ b/cf/data/dask_utils.py @@ -107,6 +107,32 @@ def allclose(a_blocks, b_blocks, rtol=rtol, atol=atol): ) +def cf_contains(a, value=None): + """Whether or not an array contains a value. + + .. versionadded:: TODODASK + + .. seealso:: `cf.Data.__contains__` + + :Parameters: + + a: `numpy.ndarray` + The array. + + value: array_like + The value. + + :Returns: + + `numpy.ndarray` + A size 1 Boolean array, with the same number of dimensions + as *a*, that indicates whether or not *a* contains the + value. + + """ + return np.array(value in a).reshape((1,) * a.ndim) + + try: from scipy.ndimage.filters import convolve1d except ImportError: diff --git a/cf/data/data.py b/cf/data/data.py index 3ec449b74d..e814126e61 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -105,6 +105,7 @@ ) from .dask_utils import ( _da_ma_allclose, + cf_contains, cf_harden_mask, cf_percentile, cf_soften_mask, @@ -648,37 +649,56 @@ def __contains__(self, value): x.__contains__(y) <==> y in x - Returns True if the value is contained anywhere in the data - array. The value may be a `cf.Data` object. + Returns True if the *value* is contained anywhere in the + data. **Performance** - All delayed operations are exectued, and there is no - short-circuit once the first occurrence is found. + `__contains__` causes all delayed operations to be computed, + unless either *value* has more than one element, or *value* is + a `Data` object with incompatible units. In both cases `False` + is returned without checking the data. - **Examples:** + **Examples** - >>> d = cf.Data([[0.0, 1, 2], [3, 4, 5]], 'm') + >>> d = cf.Data([[0, 1, 2], [3, 4, 5]], 'm') >>> 4 in d True - >>> cf.Data(3) in d + >>> 4.0 in d True - >>> cf.Data([2.5], units='2 m') in d + >>> cf.Data(5) in d + True + >>> cf.Data(5, 'm') in d + True + >>> cf.Data([0.005], 'km') in d True >>> [[2]] in d True - >>> numpy.array([[[2]]]) in d + >>> np.array([[[2]]]) in d True - >>> Data(2, 'seconds') in d + >>> 99 in d + False + >>> [1, 2] in d + False + >>> cf.Data(2, 'seconds') in d False """ + # No need to check the dask array if the size of value is + # greater than 1 + size = getattr(value, "size", None) + if size is None: + try: + size = len(value) + except TypeError: + # value has no len() + pass - def contains_chunk(a, value): - out = value in a - return np.array(out).reshape((1,) * a.ndim) + if size is not None and size > 1: + return False - if isinstance(value, self.__class__): # TODDASK chek aother type stoo + # If value is a Data object then conform its units + if isinstance(value, self.__class__): self_units = self.Units value_units = value.Units if value_units.equivalent(self_units): @@ -686,6 +706,8 @@ def contains_chunk(a, value): value = value.copy() value.Units = self_units elif value_units: + # No need to check the dask array if the value uits + # are incompatible return False value = value._get_dask() @@ -696,7 +718,7 @@ def contains_chunk(a, value): dx_ind = out_ind dx = da.blockwise( - partial(contains_chunk, value=value), + partial(cf_contains, value=value), out_ind, dx, dx_ind, diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index c6ae1d5a2a..f6488c1726 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -1083,22 +1083,23 @@ def test_Data_AUXILIARY_MASK(self): self.assertEqual(f.shape, fm.shape) self.assertTrue((f._auxiliary_mask_return().array == fm).all()) - @unittest.skipIf(TEST_DASKIFIED_ONLY, "TypeError: 'int' is not iterable") - def test_Data___contains__(self): + def test_Data__contains__(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - d = cf.Data([[0.0, 1, 2], [3, 4, 5]], units="m") + d = cf.Data([[0, 1, 2], [3, 4, 5]], units="m", chunks=2) self.assertIn(4, d) - self.assertNotIn(40, d) + self.assertIn(4.0, d) self.assertIn(cf.Data(3), d) self.assertIn(cf.Data([[[[3]]]]), d) - value = d[1, 2] - value.Units *= 2 - value.squeeze(0) - self.assertIn(value, d) + self.assertIn(cf.Data([0.005], "km"), d) self.assertIn(np.array([[[2]]]), d) + self.assertNotIn(99, d) + self.assertNotIn([1, 2], d) + self.assertNotIn(np.array([1, 2]), d) + self.assertNotIn(cf.Data(2, "seconds"), d) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 'partition_configuration'") def test_Data_asdata(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: From 6a0975c4de114e821c67bb5aa5824df71f8f82ee Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 16 Feb 2022 08:30:53 +0000 Subject: [PATCH 2/4] Typo Co-authored-by: Sadie L. Bartholomew --- cf/data/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cf/data/data.py b/cf/data/data.py index e814126e61..68fd85bb90 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -661,7 +661,7 @@ def __contains__(self, value): **Examples** - >>> d = cf.Data([[0, 1, 2], [3, 4, 5]], 'm') + >>> d = cf.Data([[0, 1, 2], [3, 4, 5]], 'm') >>> 4 in d True >>> 4.0 in d From bb73f8423cf5d7c6637e05b6c88782675dfd7268 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 1 Mar 2022 12:11:08 +0000 Subject: [PATCH 3/4] Typo Co-authored-by: Sadie L. Bartholomew --- cf/data/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cf/data/data.py b/cf/data/data.py index 68fd85bb90..af6441ae44 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -706,7 +706,7 @@ def __contains__(self, value): value = value.copy() value.Units = self_units elif value_units: - # No need to check the dask array if the value uits + # No need to check the dask array if the value units # are incompatible return False From 7cef4874a8a1420c6be301ac27f232754f424058 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 1 Mar 2022 13:32:26 +0000 Subject: [PATCH 4/4] only allow scalar comparisons --- cf/data/dask_utils.py | 2 +- cf/data/data.py | 81 ++++++++++++++++++++++++++++--------------- cf/test/test_Data.py | 61 ++++++++++++++++++++++++++------ 3 files changed, 105 insertions(+), 39 deletions(-) diff --git a/cf/data/dask_utils.py b/cf/data/dask_utils.py index 766ae576ea..d3c0341e8a 100644 --- a/cf/data/dask_utils.py +++ b/cf/data/dask_utils.py @@ -107,7 +107,7 @@ def allclose(a_blocks, b_blocks, rtol=rtol, atol=atol): ) -def cf_contains(a, value=None): +def cf_contains(a, value): """Whether or not an array contains a value. .. versionadded:: TODODASK diff --git a/cf/data/data.py b/cf/data/data.py index e814126e61..60303218f6 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -649,15 +649,15 @@ def __contains__(self, value): x.__contains__(y) <==> y in x - Returns True if the *value* is contained anywhere in the - data. + Returns True if the scalar *value* is contained anywhere in + the data. If *value* is not scalar then an exception is + raised. **Performance** - `__contains__` causes all delayed operations to be computed, - unless either *value* has more than one element, or *value* is - a `Data` object with incompatible units. In both cases `False` - is returned without checking the data. + `__contains__` causes all delayed operations to be computed + unless *value* is a `Data` object with incompatible units, in + which case `False` is always returned. **Examples** @@ -670,34 +670,59 @@ def __contains__(self, value): True >>> cf.Data(5, 'm') in d True - >>> cf.Data([0.005], 'km') in d - True - >>> [[2]] in d - True - >>> np.array([[[2]]]) in d + >>> cf.Data(0.005, 'km') in d True + >>> 99 in d False - >>> [1, 2] in d - False >>> cf.Data(2, 'seconds') in d False + >>> [1] in d + Traceback (most recent call last): + ... + TypeError: elementwise comparison failed; must test against a scalar, not [1] + >>> [1, 2] in d + Traceback (most recent call last): + ... + TypeError: elementwise comparison failed; must test against a scalar, not [1, 2] + + >>> d = cf.Data(["foo", "bar"]) + >>> 'foo' in d + True + >>> 'xyz' in d + False + """ - # No need to check the dask array if the size of value is - # greater than 1 - size = getattr(value, "size", None) - if size is None: - try: - size = len(value) - except TypeError: - # value has no len() - pass + # Check that value is scalar by seeing if its shape is () + shape = getattr(value, "shape", None) + if shape is None: + if isinstance(value, str): + # Strings are scalars, even though they have a len(). + shape = () + else: + try: + len(value) + except TypeError: + # value has no len() so assume that it is a scalar + shape = () + else: + # value has a len() so assume that it is not a scalar + shape = True + elif is_dask_collection(value) and math.isnan(value.size): + # value is a dask array with unknown size, so calculate + # the size. This is acceptable, as we're going to compute + # it anyway at the end of this method. + value.compute_chunk_sizes() + shape = value.shape - if size is not None and size > 1: - return False + if shape: + raise TypeError( + "elementwise comparison failed; must test against a scalar, " + f"not {value!r}" + ) - # If value is a Data object then conform its units + # If value is a scalar Data object then conform its units if isinstance(value, self.__class__): self_units = self.Units value_units = value.Units @@ -706,7 +731,7 @@ def __contains__(self, value): value = value.copy() value.Units = self_units elif value_units: - # No need to check the dask array if the value uits + # No need to check the dask array if the value units # are incompatible return False @@ -718,10 +743,12 @@ def __contains__(self, value): dx_ind = out_ind dx = da.blockwise( - partial(cf_contains, value=value), + cf_contains, out_ind, dx, dx_ind, + value, + (), adjust_chunks={i: 1 for i in out_ind}, dtype=bool, ) diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index f6488c1726..a7a1a6e3bd 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -7,6 +7,7 @@ from functools import reduce from operator import mul +import dask.array as da import numpy as np SCIPY_AVAILABLE = False @@ -1088,17 +1089,55 @@ def test_Data__contains__(self): return d = cf.Data([[0, 1, 2], [3, 4, 5]], units="m", chunks=2) - self.assertIn(4, d) - self.assertIn(4.0, d) - self.assertIn(cf.Data(3), d) - self.assertIn(cf.Data([[[[3]]]]), d) - self.assertIn(cf.Data([0.005], "km"), d) - self.assertIn(np.array([[[2]]]), d) - - self.assertNotIn(99, d) - self.assertNotIn([1, 2], d) - self.assertNotIn(np.array([1, 2]), d) - self.assertNotIn(cf.Data(2, "seconds"), d) + + for value in ( + 4, + 4.0, + cf.Data(3), + cf.Data(0.005, "km"), + np.array(2), + da.from_array(2), + ): + self.assertIn(value, d) + + for value in ( + 99, + np.array(99), + da.from_array(99), + cf.Data(99, "km"), + cf.Data(2, "seconds"), + ): + self.assertNotIn(value, d) + + for value in ( + [1], + [[1]], + [1, 2], + [[1, 2]], + np.array([1]), + np.array([[1]]), + np.array([1, 2]), + np.array([[1, 2]]), + da.from_array([1]), + da.from_array([[1]]), + da.from_array([1, 2]), + da.from_array([[1, 2]]), + cf.Data([1]), + cf.Data([[1]]), + cf.Data([1, 2]), + cf.Data([[1, 2]]), + cf.Data([0.005], "km"), + ): + with self.assertRaises(TypeError): + value in d + + # Strings + d = cf.Data(["foo", "bar"]) + self.assertIn("foo", d) + self.assertNotIn("xyz", d) + + with self.assertRaises(TypeError): + ["foo"] in d @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 'partition_configuration'") def test_Data_asdata(self):