From 8f95bb242095d1e925e92ba29599859f8de36541 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 16 Nov 2022 01:04:11 +0000 Subject: [PATCH 001/134] dev --- cf/data/collapse.py | 564 +++++++++++++++++++++++++++++++++-------- cf/data/data.py | 444 +++++++++++--------------------- cf/data/netcdfarray.py | 164 ++++++++++++ cf/data/utils.py | 254 ++++++++++++++++++- cf/test/test_Data.py | 36 +++ 5 files changed, 1054 insertions(+), 408 deletions(-) diff --git a/cf/data/collapse.py b/cf/data/collapse.py index 4856b96d82..0eb28e829a 100644 --- a/cf/data/collapse.py +++ b/cf/data/collapse.py @@ -1,19 +1,261 @@ """Functions used during `Data` object collapses.""" import inspect -from functools import partial, reduce +from functools import partial, reduce, wraps +from numbers import Integral from operator import mul +import dask.array as da import numpy as np from cfdm.core import DocstringRewriteMeta from dask.array import chunk from dask.array.core import _concatenate2 from dask.array.reductions import divide, numel, reduction +from dask.array.utils import validate_axis +from dask.base import collections_to_dsk from dask.core import flatten from dask.utils import deepmap from ..docstring import _docstring_substitution_definitions +def actify( + cls, a, op, axis=None, chunk_function=None, active_storage=False +): + """TODODASKDOCS. + + .. versionadded:: TODODASKVER + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + op: `str` + TODODASKDOCS + + axis: (sequence of) `int`, optional + TODODASKDOCS + + chunk_function: function + TODODASKDOCS + + {{active_storage: `bool`, optional}} + + :Returns: + + `dask.array.Array`, function + TODODASKDOCS + + """ + if not active_storage: + # It has been determined externally that an active storage + # reduction is not possible + return a, chunk_function + + # Still here? Then it is assumed that the dask array is of a form + # which might be able to exploit active storage. In particular, it + # is assumed that all data definitions point to files. + + # Parse axis + if axis is None: + axis = tuple(range(a.ndim)) + else: + if isinstance(axis, Integral): + axis = (axis,) + + if len(axis) != a.ndim: + # Can't (yet) use active storage to collapse a subset + # of the axes + return a, chunk_function + + axis = validate_axis(axis, a.ndim) + + active_chunk_functions = set() + + # Loop round elements of the dask graph, looking for data + # definitions that point to a file and which support active + # storage operations. The elements are traversed in reverse order + # so that the data defintions come out first, allowing for a fast + # short circuit in the common case when using active storage is no + # feasible. + dsk = collections_to_dsk((a,), optimize_graph=True) + for key, value in reversed(dsk.items()): + try: + value.get_filename() + except AttributeError: + # This value is not a data definition (it is assumed that + # all data definitions point to files). + continue + + try: + # Create a new actified data definition value + value = value.set_active_storage_op(op, axis) + except (AttributeError, ValueError): + # This data definition value does not support active + # storage reductions, or does not support the requested + # active storage reduction defined by 'op'. 
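            # (In that case fall back to the standard numpy-based
            # reduction: clear the candidate chunk functions and stop
            # scanning the graph, so the original chunk_function is
            # returned unchanged.)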
+ active_chunk_functions = () + break + + try: + # Get the active storage chunk function + active_chunk_functions.add(value.get_active_chunk_function()) + except AttributeError: + # This data definition value does not support active + # storage reductions + active_chunk_functions = () + break + + # Still here? Then update the dask graph in-place with the + # actified data definition value. + dsk[key] = value + + if len(active_chunk_functions) == 1: + # All data definitions in the dask graph support active + # storage reductions with the same chunk function => redefine + # the array from the actified dask graph, and redefine the + # reduction chunk function. + a = da.Array(dsk, a.name, a.chunks, a.dtype, a._meta) + chunk_function = active_chunk_functions.pop() + + return a, chunk_function + + +def actify_collapse(collapse_method, chunk_function=None): + """A decorator for `Collapse` methods that enables active storage + operations, when the conditions are right. + + """ + def decorator(collapse_method, chunk_function=None): + print (chunk_function) + @wraps(collapse_method) + def wrapper(cls, *args, **kwargs): + print (args, kwargs, cf_max_chunk.op) + if kwargs.get("weights") is None and "axis" in kwargs: + # Collapse is unweighted over defined axes => attempt to + # actify the dask array and chunk function. + chunk_function = kwargs["chunk_function"] + + a, chunk_function = actify( + args[0], + op=chunk_function.op, + axis=kwargs["axis"], + chunk_function=chunk_function, + active_storage=kwargs["active_storage"], + ) + + args = (a,) + kwargs["chunk_function"] = chunk_function + + return collapse_method(cls, *args, **kwargs) + + return wrapper + +# -------------------------------------------------------------------- +# sample size +# -------------------------------------------------------------------- +def cf_sample_size_chunk(x, dtype="i8", computing_meta=False, **kwargs): + """Chunk calculations for the sample size. + + This function is passed to `dask.array.reduction` as its *chunk* + parameter. + + .. versionadded:: TODODASKVER + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + + """ + if computing_meta: + return x + + if np.ma.isMA(x): + N = chunk.sum(np.ones_like(x, dtype=dtype), **kwargs) + else: + if dtype: + kwargs["dtype"] = dtype + + N = numel(x, **kwargs) + + return {"N": N} + +# -------------------------------------------------------------------- +# maximum +# -------------------------------------------------------------------- +def cf_max_chunk(x, dtype=None, computing_meta=False, **kwargs): + """Chunk calculations for the maximum. + + This function is passed to `dask.array.reduction` as its *chunk* + parameter. + + .. versionadded:: TODODASKVER + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + * max: The maximum of `x``. + + """ + if computing_meta: + return x + + return { + "max": chunk.max(x, **kwargs), + "N": cf_sample_size_chunk(x, **kwargs)["N"], + } + +cf_max_chunk.op = "max" + +# -------------------------------------------------------------------- +# minimum +# -------------------------------------------------------------------- +def cf_min_chunk(x, dtype=None, computing_meta=False, **kwargs): + """Chunk calculations for the minimum. + + This function is passed to `dask.array.reduction` as its *chunk* + parameter. + + .. 
versionadded:: TODODASKVER + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + * min: The minimum of ``x``. + + """ + if computing_meta: + return x + + return { + "min": chunk.min(x, **kwargs), + "N": cf_sample_size_chunk(x, **kwargs)["N"], + } + +cf_min_chunk.op = "min" + + class Collapse(metaclass=DocstringRewriteMeta): """Container for functions that collapse dask arrays. @@ -54,7 +296,120 @@ def __docstring_package_depth__(self): return 0 @classmethod - def max(cls, a, axis=None, keepdims=False, mtol=None, split_every=None): + def actify( + cls, a, op, axis=None, chunk_function=None, active_storage=False + ): + """TODODASKDOCS. + + .. versionadded:: TODODASKVER + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + op: `str` + TODODASKDOCS + + axis: (sequence of) `int`, optional + TODODASKDOCS + + chunk_function: function + TODODASKDOCS + + {{active_storage: `bool`, optional}} + + :Returns: + + `dask.array.Array`, function + TODODASKDOCS + + """ + if not active_storage: + # It has been determined externally that an active storage + # reduction is not possible + return a, chunk_function + + # Still here? Then it is assumed that the dask array is of a + # form which might be able to exploit active storage. In + # particular, it is assumed that all data definitions point to + # files. + + # Parse axis + if axis is None: + axis = tuple(range(a.ndim)) + else: + if isinstance(axis, Integral): + axis = (axis,) + + if len(axis) != a.ndim: + # Can't (yet) use active storage to collapse a subset + # of the axes + return a, chunk_function + + axis = validate_axis(axis, a.ndim) + + active_chunk_functions = set() + + # Loop round elements of the dask graph, looking for data + # definitions that point to a file and which support active + # storage operations. The elements are traversed in reverse + # order so that the data defintions come out first, allowing + # for a fast short circuit in the common case when using + # active storage is no feasible. + dsk = collections_to_dsk((a,), optimize_graph=True) + for key, value in reversed(dsk.items()): + try: + value.get_filename() + except AttributeError: + # This value is not a data definition (it is assumed + # that all data definitions point to files). + continue + + try: + # Create a new actified data definition value + value = value.set_active_storage_op(op, axis) + except (AttributeError, ValueError): + # This data definition value does not support active + # storage reductions, or does not support the + # requested active storage reduction defined by 'op'. + active_chunk_functions = () + break + + try: + # Get the active storage chunk function + active_chunk_functions.add(value.get_active_chunk_function()) + except AttributeError: + # This data definition value does not support active + # storage reductions + active_chunk_functions = () + break + + # Still here? Then update the dask graph in-place with the + # actified data definition value. + dsk[key] = value + + if len(active_chunk_functions) == 1: + # All data definitions in the dask graph support active + # storage reductions with the same chunk function => + # redefine the array from the actified dask graph, and + # redefine the reduction chunk function. 
+ a = da.Array(dsk, a.name, a.chunks, a.dtype, a._meta) + chunk_function = active_chunk_functions.pop() + + return a, chunk_function + + @classmethod + @actify_collapse(chunk_function=cf_max_chunk) + def max( + cls, + a, + axis=None, + keepdims=False, + mtol=None, + split_every=None, + active_storage=False, + ): """Return maximum values of an array. Calculates the maximum value of an array or the maximum values @@ -79,6 +434,10 @@ def max(cls, a, axis=None, keepdims=False, mtol=None, split_every=None): {{split_every: `int` or `dict`, optional}} + {{active_storage: `bool`, optional}} + + {{chunk_function: function}} + :Returns: `dask.array.Array` @@ -87,9 +446,14 @@ def max(cls, a, axis=None, keepdims=False, mtol=None, split_every=None): """ check_input_dtype(a) dtype = a.dtype + + # a, cf_max_chunk = cls.actify( + # a, "max", axis, cf_max_chunk, active_storage + # ) + return reduction( a, - cf_max_chunk, + chunk_function, # cf_max_chunk, partial(cf_max_agg, mtol=mtol, original_shape=a.shape), axis=axis, keepdims=keepdims, @@ -102,7 +466,13 @@ def max(cls, a, axis=None, keepdims=False, mtol=None, split_every=None): @classmethod def max_abs( - cls, a, axis=None, keepdims=False, mtol=None, split_every=None + cls, + a, + axis=None, + keepdims=False, + mtol=None, + split_every=None, + active_storage=False, ): """Return maximum absolute values of an array. @@ -128,6 +498,8 @@ def max_abs( {{split_every: `int` or `dict`, optional}} + {{active_storage: `bool`, optional}} + :Returns: `dask.array.Array` @@ -151,6 +523,7 @@ def mean( keepdims=False, mtol=None, split_every=None, + active_storage=False, ): """Return mean values of an array. @@ -178,6 +551,8 @@ def mean( {{split_every: `int` or `dict`, optional}} + {{active_storage: `bool`, optional}} + :Returns: `dask.array.Array` @@ -186,6 +561,12 @@ def mean( """ check_input_dtype(a) dtype = "f8" + +# if weights is None: +# a, cf_mean_chunk = cls.actify( +# a, "mean", axis, cf_mean_chunk, active_storage +# ) + return reduction( a, cf_mean_chunk, @@ -209,6 +590,7 @@ def mean_abs( keepdims=False, mtol=None, split_every=None, + active_storage=False, ): """Return mean absolute values of an array. @@ -236,6 +618,8 @@ def mean_abs( {{split_every: `int` or `dict`, optional}} + {{active_storage: `bool`, optional}} + :Returns: `dask.array.Array` @@ -260,6 +644,7 @@ def mid_range( keepdims=False, mtol=None, split_every=None, + active_storage=False, ): """Return mid-range values of an array. @@ -285,6 +670,8 @@ def mid_range( {{split_every: `int` or `dict`, optional}} + {{active_storage: `bool`, optional}} + :Returns: `dask.array.Array` @@ -307,7 +694,15 @@ def mid_range( ) @classmethod - def min(cls, a, axis=None, keepdims=False, mtol=None, split_every=None): + def min( + cls, + a, + axis=None, + keepdims=False, + mtol=None, + split_every=None, + active_storage=False, + ): """Return minimum values of an array. 
Calculates the minimum value of an array or the minimum values @@ -332,6 +727,8 @@ def min(cls, a, axis=None, keepdims=False, mtol=None, split_every=None): {{split_every: `int` or `dict`, optional}} + {{active_storage: `bool`, optional}} + :Returns: `dask.array.Array` @@ -340,6 +737,11 @@ def min(cls, a, axis=None, keepdims=False, mtol=None, split_every=None): """ check_input_dtype(a) dtype = a.dtype + +# a, cf_min_chunk = cls.actify( +# a, "min", axis, cf_min_chunk, active_storage +# ) + return reduction( a, cf_min_chunk, @@ -355,7 +757,13 @@ def min(cls, a, axis=None, keepdims=False, mtol=None, split_every=None): @classmethod def min_abs( - cls, a, axis=None, keepdims=False, mtol=None, split_every=None + cls, + a, + axis=None, + keepdims=False, + mtol=None, + split_every=None, + active_storage=False, ): """Return minimum absolute values of an array. @@ -381,6 +789,8 @@ def min_abs( {{split_every: `int` or `dict`, optional}} + {{active_storage: `bool`, optional}} + :Returns: `dask.array.Array` @@ -396,7 +806,15 @@ def min_abs( ) @classmethod - def range(cls, a, axis=None, keepdims=False, mtol=None, split_every=None): + def range( + cls, + a, + axis=None, + keepdims=False, + mtol=None, + split_every=None, + active_storage=False, + ): """Return range values of an array. Calculates the range value of an array or the range values @@ -421,6 +839,8 @@ def range(cls, a, axis=None, keepdims=False, mtol=None, split_every=None): {{split_every: `int` or `dict`, optional}} + {{active_storage: `bool`, optional}} + :Returns: `dask.array.Array` @@ -451,6 +871,7 @@ def rms( keepdims=False, mtol=None, split_every=None, + active_storage=False, ): """Return root mean square (RMS) values of an array. @@ -478,6 +899,8 @@ def rms( {{split_every: `int` or `dict`, optional}} + {{active_storage: `bool`, optional}} + :Returns: `dask.array.Array` @@ -502,7 +925,13 @@ def rms( @classmethod def sample_size( - cls, a, axis=None, keepdims=False, mtol=None, split_every=None + cls, + a, + axis=None, + keepdims=False, + mtol=None, + split_every=None, + active_storage=False, ): """Return sample size values of an array. @@ -528,6 +957,8 @@ def sample_size( {{split_every: `int` or `dict`, optional}} + {{active_storage: `bool`, optional}} + :Returns: `dask.array.Array` @@ -558,6 +989,7 @@ def sum( keepdims=False, mtol=None, split_every=None, + active_storage=False, ): """Return sum values of an array. @@ -585,6 +1017,8 @@ def sum( {{split_every: `int` or `dict`, optional}} + {{active_storage: `bool`, optional}} + :Returns: `dask.array.Array` @@ -619,6 +1053,7 @@ def sum_of_weights( keepdims=False, mtol=None, split_every=None, + active_storage=False, ): """Return sum of weights values for an array. @@ -646,6 +1081,8 @@ def sum_of_weights( {{split_every: `int` or `dict`, optional}} + {{active_storage: `bool`, optional}} + :Returns: `dask.array.Array` @@ -677,6 +1114,7 @@ def sum_of_weights2( keepdims=False, mtol=None, split_every=None, + active_storage=False, ): """Return sum of squares of weights values for an array. @@ -704,6 +1142,8 @@ def sum_of_weights2( {{split_every: `int` or `dict`, optional}} + {{active_storage: `bool`, optional}} + :Returns: `dask.array.Array` @@ -736,6 +1176,7 @@ def var( mtol=None, ddof=None, split_every=None, + active_storage=False, ): """Return variances of an array. 
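A small illustration of the `mtol` parameter that all of these collapse
methods accept (a sketch assuming the public `cf.Data` API, which passes
`mtol` through to the `Collapse` methods shown here; exact output
formatting may differ):

    import numpy as np
    import cf

    # Three of the four contributing values are missing (75%)
    d = cf.Data(np.ma.array([1.0, 2.0, 3.0, 4.0], mask=[0, 1, 1, 1]))

    # Default mtol=1: the result is masked only if *all* contributing
    # values are missing, so a value is returned
    print(d.max(mtol=1).array)    # [1.]

    # mtol=0.5: more than 50% of the contributing values are missing,
    # so the result is masked
    print(d.max(mtol=0.5).array)  # [--]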
@@ -765,6 +1206,8 @@ def var( {{split_every: `int` or `dict`, optional}} + {{active_storage: `bool`, optional}} + :Returns: `dask.array.Array` @@ -788,7 +1231,7 @@ def var( ) @classmethod - def unique(cls, a, split_every=None): + def unique(cls, a, split_every=None, active_storage=False): """Return unique elements of the data. .. versionadded:: TODODASKVER @@ -800,6 +1243,8 @@ def unique(cls, a, split_every=None): {{split_every: `int` or `dict`, optional}} + {{active_storage: `bool`, optional}} + :Returns: `dask.array.Array` @@ -1218,39 +1663,6 @@ def cf_mean_agg( return x -# -------------------------------------------------------------------- -# maximum -# -------------------------------------------------------------------- -def cf_max_chunk(x, dtype=None, computing_meta=False, **kwargs): - """Chunk calculations for the maximum. - - This function is passed to `dask.array.reduction` as its *chunk* - parameter. - - .. versionadded:: TODODASKVER - - :Parameters: - - See `dask.array.reductions` for details of the parameters. - - :Returns: - - `dict` - Dictionary with the keys: - - * N: The sample size. - * max: The maximum of `x``. - - """ - if computing_meta: - return x - - return { - "max": chunk.max(x, **kwargs), - "N": cf_sample_size_chunk(x, **kwargs)["N"], - } - - def cf_max_combine(pairs, axis=None, computing_meta=False, **kwargs): """Combination calculations for the maximum. @@ -1369,38 +1781,6 @@ def cf_mid_range_agg( return x -# -------------------------------------------------------------------- -# minimum -# -------------------------------------------------------------------- -def cf_min_chunk(x, dtype=None, computing_meta=False, **kwargs): - """Chunk calculations for the minimum. - - This function is passed to `dask.array.reduction` as its *chunk* - parameter. - - .. versionadded:: TODODASKVER - - :Parameters: - - See `dask.array.reductions` for details of the parameters. - - :Returns: - - `dict` - Dictionary with the keys: - - * N: The sample size. - * min: The minimum of ``x``. - - """ - if computing_meta: - return x - - return { - "min": chunk.min(x, **kwargs), - "N": cf_sample_size_chunk(x, **kwargs)["N"], - } - def cf_min_combine(pairs, axis=None, computing_meta=False, **kwargs): """Combination calculations for the minimum. @@ -1658,42 +2038,6 @@ def cf_rms_agg( return x -# -------------------------------------------------------------------- -# sample size -# -------------------------------------------------------------------- -def cf_sample_size_chunk(x, dtype="i8", computing_meta=False, **kwargs): - """Chunk calculations for the sample size. - - This function is passed to `dask.array.reduction` as its *chunk* - parameter. - - .. versionadded:: TODODASKVER - - :Parameters: - - See `dask.array.reductions` for details of the parameters. - - :Returns: - - `dict` - Dictionary with the keys: - - * N: The sample size. 
- - """ - if computing_meta: - return x - - if np.ma.isMA(x): - N = chunk.sum(np.ones_like(x, dtype=dtype), **kwargs) - else: - if dtype: - kwargs["dtype"] = dtype - - N = numel(x, **kwargs) - - return {"N": N} - def cf_sample_size_combine( pairs, axis=None, dtype="i8", computing_meta=False, **kwargs diff --git a/cf/data/data.py b/cf/data/data.py index 3ef0159549..35fceac6ff 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -55,6 +55,7 @@ from .utils import ( # is_small,; is_very_small, YMDhms, _is_numeric_dtype, + collapse, conform_units, convert_to_datetime, convert_to_reftime, @@ -362,7 +363,7 @@ def __init__( except (AttributeError, TypeError): pass else: - self._set_dask(array, copy=copy, delete_source=False) + self._set_dask(array, copy=copy, conform=False) else: self._del_dask(None) @@ -455,12 +456,14 @@ def __init__( "Use the 'chunks' parameter instead." ) - # Bring the data into memory if to_memory: + # Bring the data into memory try: array = array.to_memory() except AttributeError: pass + elif self._is_abstract_Array_subclass(array): + self._set_active_storage(True) array = to_dask(array, chunks, **kwargs) @@ -492,7 +495,7 @@ def __init__( self._Units = units # Store the dask array - self._set_dask(array, delete_source=False) + self._set_dask(array, conform=False) # Override the data type if dtype is not None: @@ -1121,9 +1124,8 @@ def __setitem__(self, indices, value): shifts = [-shift for shift in shifts] self.roll(shift=shifts, axis=roll_axes, inplace=True) - # Remove a source array, on the grounds that we can't - # guarantee its consistency with the updated dask array. - self._del_Array(None) + # Remove elements made invalid by updating the `dask` array + self._conform_after_dask_update() return @@ -1241,12 +1243,85 @@ def __keepdims_indexing__(self): def __keepdims_indexing__(self, value): self._custom["__keepdims_indexing__"] = bool(value) - def _set_dask(self, array, copy=False, delete_source=True): - """Set the dask array. + def _conform_after_dask_update(self): + """Remove elements made invalid by updating the `dask` array. + + Removes or modifies components that can't be guaranteed to be + consistent with an updated `dask` array`: + + * Deletes a source array. + * Sets "active storage" to `False` + + .. versionadded:: TODODASKVER + + :Returns: + + `None` + + """ + self._del_Array(None) + self._del_active_storage() + + def _del_active_storage(self): + """TODODASKDOCS. + + .. versionadded:: TODODASKVER + + .. seealso:: `_set_active_storage` + + :Returns: + + `None` + + **Examples** + + >>> d = cf.Data([9]) + >>> d.active_storage() + False + >>> d._set_active_storage(True) + >>> d.active_storage() + True + >>> d._del_active_storage() + >>> d.active_storage() + False + + """ + self._custom.pop("active_storage", None) + + def _set_active_storage(self, value): + """TODODASKDOCS. .. versionadded:: TODODASKVER - .. seealso:: `to_dask_array`, `_del_dask` + .. seealso:: `_del_active_storage` + + :Returns: + + `bool` + TODODASKDOCS + + **Examples** + + >>> d = cf.Data([9]) + >>> d.active_storage() + False + >>> d._set_active_storage(True) + >>> d.active_storage() + True + >>> d._del_active_storage() + >>> d.active_storage() + False + + """ + self._custom["active_storage"] = bool(value) + + def _set_dask(self, array, copy=False, conform=True): + """Set the `dask` array. + + .. versionadded:: TODODASKVER + + .. 
seealso:: `to_dask_array`, `_conform_after_dask_update` + `_del_dask` :Parameters: @@ -1257,10 +1332,10 @@ def _set_dask(self, array, copy=False, delete_source=True): If True then copy *array* before setting it. By default it is not copied. - delete_source: `bool`, optional - If False then do not delete a source array, if one - exists, after setting the new dask array. By default a - source array is deleted. + conform: `bool`, optional + If True, the default, then remove elements made + invalid by updating the `dask` array. See + `_conform_after_dask_update` for details. :Returns: @@ -1287,17 +1362,18 @@ def _set_dask(self, array, copy=False, delete_source=True): self._custom["dask"] = array - if delete_source: - # Remove a source array, on the grounds that we can't - # guarantee its consistency with the new dask array. - self._del_Array(None) + if conform: + # Remove elements made invalid by updating the `dask` + # array + self._conform_after_dask_update() - def _del_dask(self, default=ValueError(), delete_source=True): - """Remove the dask array. + def _del_dask(self, default=ValueError(), conform=True): + """Remove the `dask` array. .. versionadded:: TODODASKVER - .. seealso:: `_set_dask`, `to_dask_array` + .. seealso:: `to_dask_array`, `_conform_after_dask_update`, + `_set_dask` :Parameters: @@ -1307,9 +1383,10 @@ def _del_dask(self, default=ValueError(), delete_source=True): {{default Exception}} - delete_source: `bool`, optional - If False then do not delete a compressed source array, - if one exists. + conform: `bool`, optional + If True, the default, then remove elements made + invalid by updating the `dask` array. See + `_conform_after_dask_update` for details. :Returns: @@ -1339,11 +1416,10 @@ def _del_dask(self, default=ValueError(), delete_source=True): default, f"{self.__class__.__name__!r} has no dask array" ) - if delete_source: - # Remove a source array, on the grounds that we can't - # guarantee its consistency with any future new dask - # array. - self._del_Array(None) + if conform: + # Remove elements made invalid by deleting the `dask` + # array + self._conform_after_dask_update() return out @@ -2195,7 +2271,8 @@ def persist(self, inplace=False): dx = self.to_dask_array() dx = dx.persist() - d._set_dask(dx, delete_source=False) + d._set_dask(dx, conform=False) + d._del_active_storage() return d @@ -2765,7 +2842,8 @@ def rechunk( dx = d.to_dask_array() dx = dx.rechunk(chunks, threshold, block_size_limit, balance) - d._set_dask(dx, delete_source=False) + d._set_dask(dx, conform=False) + d._del_active_storage() return d @@ -4077,7 +4155,7 @@ def _axes(self, value): # ---------------------------------------------------------------- @property def chunks(self): - """The chunk sizes for each dimension. + """The `dask` chunk sizes for each dimension. **Examples** @@ -4100,6 +4178,27 @@ def force_compute(self, value): # ---------------------------------------------------------------- # Attributes # ---------------------------------------------------------------- + @property + def active_storage(self): + """Whether or not active storage recductions are possible. + + If the `active_storage` attribute is `True` then reductions + (such as calculating the minimum value of the data) will + attempt to use active storage capabilities, falling back on + the usual (non-active) techniques if an active storage + operation fails for any reason. + + .. 
versionadded:: TODODASKVER + + **Examples** + + >>> d = cf.Data([9]) + >>> d.active_storage + False + + """ + return self._custom.get("active_storage", False) + @property def Units(self): """The `cf.Units` object containing the units of the data array. @@ -5644,7 +5743,7 @@ def max( """ d = _inplace_enabled_define_and_cleanup(self) - d, _ = _collapse( + d, _ = collapse( Collapse.max, d, axis=axes, @@ -5707,7 +5806,7 @@ def maximum_absolute_value( """ d = _inplace_enabled_define_and_cleanup(self) - d, _ = _collapse( + d, _ = collapse( Collapse.max_abs, d, axis=axes, @@ -5776,7 +5875,7 @@ def min( """ d = _inplace_enabled_define_and_cleanup(self) - d, _ = _collapse( + d, _ = collapse( Collapse.min, d, axis=axes, @@ -5839,7 +5938,7 @@ def minimum_absolute_value( """ d = _inplace_enabled_define_and_cleanup(self) - d, _ = _collapse( + d, _ = collapse( Collapse.min_abs, d, axis=axes, @@ -5916,7 +6015,7 @@ def mean( """ d = _inplace_enabled_define_and_cleanup(self) - d, _ = _collapse( + d, _ = collapse( Collapse.mean, d, axis=axes, @@ -5992,7 +6091,7 @@ def mean_absolute_value( """ d = _inplace_enabled_define_and_cleanup(self) - d, _ = _collapse( + d, _ = collapse( Collapse.mean_abs, d, axis=axes, @@ -6072,7 +6171,7 @@ def integral( """ d = _inplace_enabled_define_and_cleanup(self) - d, weights = _collapse( + d, weights = collapse( Collapse.sum, d, axis=axes, @@ -6154,7 +6253,7 @@ def sample_size( """ d = _inplace_enabled_define_and_cleanup(self) - d, _ = _collapse( + d, _ = collapse( Collapse.sample_size, d, axis=axes, @@ -7532,7 +7631,7 @@ def harden_mask(self): """ dx = self.to_dask_array() dx = dx.map_blocks(cf_harden_mask, dtype=self.dtype) - self._set_dask(dx, delete_source=False) + self._set_dask(dx, conform=False) self.hardmask = True def has_calendar(self): @@ -7629,7 +7728,7 @@ def soften_mask(self): """ dx = self.to_dask_array() dx = dx.map_blocks(cf_soften_mask, dtype=self.dtype) - self._set_dask(dx, delete_source=False) + self._set_dask(dx, conform=False) self.hardmask = False @_inplace_enabled(default=False) @@ -8720,7 +8819,7 @@ def mid_range( """ d = _inplace_enabled_define_and_cleanup(self) - d, _ = _collapse( + d, _ = collapse( Collapse.mid_range, d, axis=axes, @@ -9073,7 +9172,7 @@ def root_mean_square( """ d = _inplace_enabled_define_and_cleanup(self) - d, _ = _collapse( + d, _ = collapse( Collapse.rms, d, axis=axes, @@ -10697,7 +10796,7 @@ def range( """ d = _inplace_enabled_define_and_cleanup(self) - d, _ = _collapse( + d, _ = collapse( Collapse.range, d, axis=axes, @@ -10825,7 +10924,7 @@ def sum( """ d = _inplace_enabled_define_and_cleanup(self) - d, _ = _collapse( + d, _ = collapse( Collapse.sum, d, axis=axes, @@ -10990,7 +11089,7 @@ def sum_of_weights( """ d = _inplace_enabled_define_and_cleanup(self) - d, weights = _collapse( + d, weights = collapse( Collapse.sum_of_weights, d, axis=axes, @@ -11088,7 +11187,7 @@ def sum_of_weights2( """ d = _inplace_enabled_define_and_cleanup(self) - d, weights = _collapse( + d, weights = collapse( Collapse.sum_of_weights2, d, axis=axes, @@ -11272,7 +11371,7 @@ def var( """ d = _inplace_enabled_define_and_cleanup(self) - d, _ = _collapse( + d, _ = collapse( Collapse.var, d, axis=axes, @@ -11661,252 +11760,3 @@ def _size_of_index(index, size=None): else: # Index is a list of integers return len(index) - - -def _collapse( - func, - d, - axis=None, - weights=None, - keepdims=True, - mtol=1, - ddof=None, - split_every=None, -): - """Collapse data in-place using a given funcion. - - .. versionadded:: TODODASKVER - - .. 
seealso:: `_parse_weights` - - :Parameters: - - func: callable - The function that collapses the underlying `dask` array of - *d*. Must have the minimum signature (parameters and - default values) ``func(dx, axis=None, keepdims=False, - mtol=None, split_every=None)`` (optionally including - ``weights=None`` or ``ddof=None``), where ``dx`` is a the - dask array contained in *d*. - - d: `Data` - The data to be collapsed. - - axis: (sequence of) int, optional - The axes to be collapsed. By default all axes are - collapsed, resulting in output with size 1. Each axis is - identified by its integer position. If *axes* is an empty - sequence then the collapse is applied to each scalar - element and the reuslt has the same shape as the input - data. - - weights: data_like, `dict`, or `None`, optional - Weights associated with values of the data. By default - *weights* is `None`, meaning that all non-missing elements - of the data have a weight of 1 and all missing elements - have a weight of 0. - - If *weights* is a data_like object then it must be - broadcastable to the array. - - If *weights* is a dictionary then each key specifies axes - of the data (an `int` or `tuple` of `int`), with a - corresponding value of data_like weights for those - axes. The dimensions of a weights value must correspond to - its key axes in the same order. Not all of the axes need - weights assigned to them. The weights that will be used - will be an outer product of the dictionary's values. - - However they are specified, the weights are internally - broadcast to the shape of the data, and those weights that - are missing data, or that correspond to the missing - elements of the data, are assigned a weight of 0. - - For collapse functions that do not have a ``weights`` - parameter, *weights* must be `None`. - - keepdims: `bool`, optional - By default, the axes which are collapsed are left in the - result as dimensions with size one, so that the result - will broadcast correctly against the input array. If set - to False then collapsed axes are removed from the data. - - mtol: number, optional - The sample size threshold below which collapsed values are - set to missing data. It is defined as a fraction (between - 0 and 1 inclusive) of the contributing input data values. - - The default of *mtol* is 1, meaning that a missing datum - in the output array occurs whenever all of its - contributing input array elements are missing data. - - For other values, a missing datum in the output array - occurs whenever more than ``100*mtol%`` of its - contributing input array elements are missing data. - - ddof: number, optional - The delta degrees of freedom. The number of degrees of - freedom used in the calculation is (N-*ddof*) where N - represents the number of non-missing elements. - - For collapse functions that do not have a ``ddof`` - parameter, *ddof* must be `None`. - - split_every: `int` or `dict`, optional - Determines the depth of the recursive aggregation. See - `dask.array.reduction` for details. - - :Returns: - - (`Data`, formatted weights) - The collapsed data and the output of ``_parse_weights(d, - weights, axis)``. 
- - """ - kwargs = { - "axis": axis, - "keepdims": keepdims, - "split_every": split_every, - "mtol": mtol, - } - - weights = _parse_weights(d, weights, axis) - if weights is not None: - kwargs["weights"] = weights - - if ddof is not None: - kwargs["ddof"] = ddof - - dx = d.to_dask_array() - dx = func(dx, **kwargs) - d._set_dask(dx) - - return d, weights - - -def _parse_weights(d, weights, axis=None): - """Parse the weights input to `_collapse`. - - .. versionadded:: TODODASKVER - - .. seealso:: `_collapse` - - :Parameters: - - d: `Data` - The data to be collapsed. - - weights: data_like or `dict` - See `_collapse` for details. - - axis: (sequence of) `int`, optional - See `_collapse` for details. - - :Returns: - - `Data` or `None` - * If *weights* is a data_like object then they are - returned unchanged as a `Data` object. It is up to the - downstream functions to check if the weights can be - broadcast to the data. - - * If *weights* is a dictionary then the dictionary - values', i.e. the weights components, outer product is - returned in `Data` object that is broadcastable to the - data. - - If the dictionary is empty, or none of the axes defined - by the keys correspond to collapse axes defined by - *axis*, then then the collapse is unweighted and `None` - is returned. - - Note that, in all cases, the returned weights are *not* - modified to account for missing values in the data. - - **Examples** - - >>> d = cf.Data(np.arange(12)).reshape(4, 3) - - >>> _parse_weights(d, [1, 2, 1], (0, 1)) - - - >>> _parse_weights(d, [[1, 2, 1]], (0, 1)) - - - >>> _parse_weights(d, {1: [1, 2, 1]}, (0, 1)) - - - >>> print(_parse_weights(d, {0: [1, 2, 3, 4], 1: [1, 2, 1]}, (0, 1))) - [[1 2 1] - [2 4 2] - [3 6 3] - [4 8 4]] - - >>> print(cf.data.data._parse_weights(d, {}, (0, 1))) - None - - >>> print(cf.data.data._parse_weights(d, {1: [1, 2, 1]}, 0)) - None - - """ - if weights is None: - # No weights - return - - if not isinstance(weights, dict): - # Weights is data_like. Don't check broadcastability to d, - # leave that to whatever uses the weights. - return Data.asdata(weights) - - if not weights: - # No weights (empty dictionary) - return - - if axis is None: - axis = tuple(range(d.ndim)) - else: - axis = d._parse_axes(axis) - - weights = weights.copy() - weights_axes = set() - for key, value in tuple(weights.items()): - del weights[key] - key = d._parse_axes(key) - if weights_axes.intersection(key): - raise ValueError("Duplicate weights axis") - - weights[tuple(key)] = value - weights_axes.update(key) - - if not weights_axes.intersection(axis): - # No weights span collapse axes - return - - # For each component, add missing dimensions as size 1. 
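    # (For example, for data with shape (4, 3), a weights component
    # keyed on axis 1 with shape (3,) is reshaped to (1, 3), so that
    # the outer product of all the components broadcasts against the
    # data.)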
- w = [] - shape = d.shape - for key, value in weights.items(): - value = Data.asdata(value) - - # Make sure axes are in ascending order - skey = tuple(sorted(key)) - if key != skey: - value = value.transpose(skey) - key = skey - - if not all( - True if i in (j, 1) else False - for i, j in zip(value.shape, [shape[i] for i in key]) - ): - raise ValueError( - f"Weights component for axes {tuple(key)} with shape " - f"{value.shape} is not broadcastable to data with " - f"shape {shape}" - ) - - new_shape = [n if i in key else 1 for i, n in enumerate(shape)] - w.append(value.reshape(new_shape)) - - # Return the product of the weights components, which will be - # broadcastable to d - return reduce(mul, w) diff --git a/cf/data/netcdfarray.py b/cf/data/netcdfarray.py index 8942638ef0..db9f9c1488 100644 --- a/cf/data/netcdfarray.py +++ b/cf/data/netcdfarray.py @@ -5,3 +5,167 @@ class NetCDFArray(cfdm.NetCDFArray, FileArray): """An array stored in a netCDF file.""" + + def __getitem__(self, indices): + """Returns a subspace of the array as a numpy array. + + x.__getitem__(indices) <==> x[indices] + + The indices that define the subspace must be either `Ellipsis` or + a sequence that contains an index for each dimension. In the + latter case, each dimension's index must either be a `slice` + object or a sequence of two or more integers. + + Indexing is similar to numpy indexing. The only difference to + numpy indexing (given the restrictions on the type of indices + allowed) is: + + * When two or more dimension's indices are sequences of integers + then these indices work independently along each dimension + (similar to the way vector subscripts work in Fortran). + + .. versionadded:: TODODASKVER + + """ + if self.active_storage_op: + # Active storage read. Returns a dictionary. + active = Active(self.filename, self.ncvar) + active.method = self.active_storage_op + active.components = True + + return active[indices] + + # Normal read by local client. Returns a numpy array. + # + # In production code groups, masks, string types, etc. will + # need to be accounted for here. + return super().__getitme__(indices) + + def _active_chunk_functions(self): + return { + "min": self.active_min, + "max": self.active_max, + "mean": self.active_mean, + } + + @property + def active_storage_op(self): + return self._custom.get("active_storage_op") + + @active_storage_op.setter + def active_storage_op(self, value): + self._custom["active_storage_op"] = value + + @property + def op_axis(self): + return self._custom.get("op_axis") + + @op_axis.setter + def op_axis(self, value): + self._custom["op_axis"] = value + + @staticmethod + def active_min(a, **kwargs): + """Chunk calculations for the minimum. + + Assumes that the calculations have already been done, + i.e. that *a* is already the minimum. + + This function is intended to be passed in to + `dask.array.reduction()` as the ``chunk`` parameter. Its + return signature must be the same as the non-active chunks + function that it is replacing. + + .. versionadded:: TODODASKVER + + :Parameters: + + a: `dict` + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + * min: The minimum of `a``. + + """ + return {"N": a["n"], "min": a["min"]} + + @staticmethod + def active_max(a, **kwargs): + """Chunk calculations for the maximum. + + Assumes that the calculations have already been done, + i.e. that *a* is already the maximum. + + This function is intended to be passed in to + `dask.array.reduction()` as the ``chunk`` parameter. 
Its + return signature must be consistent with that expected by the + functions of the ``aggregate`` and ``combine`` parameters. + + .. versionadded:: TODODASKVER + + :Parameters: + + a: `dict` + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + * max: The maximum of `a``. + + """ + return {"N": a["n"], "max": a["max"]} + + @staticmethod + def active_mean(a, **kwargs): + """Chunk calculations for the mean. + + Assumes that the calculations have already been done, + i.e. that *a* is already the mean. + + This function is intended to be passed in to + `dask.array.reduction()` as the ``chunk`` parameter. Its + return signature must be the same as the non-active chunks + function that it is replacing. + + .. versionadded:: TODODASKVER + + :Parameters: + + a: `dict` + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + * V1: The sum of ``weights``. Equal to ``N`` because + weights have not been set. + * sum: The weighted sum of ``x``. + * weighted: True if weights have been set. Always + False. + + """ + return {"N": a["n"], "V1": a["n"], "sum": a["sum"], "weighted": False} + + def get_active_chunk_function(self): + try: + return self._active_chunk_functions()[self.active_storage_op] + except KeyError: + raise ValueError("no active storage operation has been set") + + def set_active_storage_op(self, op, axis=None): + if op not in self._active_chunk_functions(): + raise ValueError(f"Invalid active storage operation: {op!r}") + + a = self.copy() + a.active_storage_op = op + a.op_axis = axis + return a diff --git a/cf/data/utils.py b/cf/data/utils.py index b1b0d37ee3..55d60d68c2 100644 --- a/cf/data/utils.py +++ b/cf/data/utils.py @@ -1,6 +1,7 @@ """General functions useful for `Data` functionality.""" -from functools import lru_cache, partial +from functools import lru_cache, partial, reduce from itertools import product +from operator import mul import dask.array as da import numpy as np @@ -743,3 +744,254 @@ def where_broadcastable(data, x, name): ) return x + + +def collapse( + func, + d, + axis=None, + weights=None, + keepdims=True, + mtol=1, + ddof=None, + split_every=None, +): + """Collapse data in-place using a given funcion. + + .. versionadded:: TODODASKVER + + .. seealso:: `parse_weights` + + :Parameters: + + func: callable + The function that collapses the underlying `dask` array of + *d*. Must have the minimum signature (parameters and + default values) ``func(dx, axis=None, keepdims=False, + mtol=None, split_every=None)`` (optionally including + ``weights=None`` or ``ddof=None``), where ``dx`` is a the + dask array contained in *d*. + + d: `Data` + The data to be collapsed. + + axis: (sequence of) int, optional + The axes to be collapsed. By default all axes are + collapsed, resulting in output with size 1. Each axis is + identified by its integer position. If *axes* is an empty + sequence then the collapse is applied to each scalar + element and the reuslt has the same shape as the input + data. + + weights: data_like, `dict`, or `None`, optional + Weights associated with values of the data. By default + *weights* is `None`, meaning that all non-missing elements + of the data have a weight of 1 and all missing elements + have a weight of 0. + + If *weights* is a data_like object then it must be + broadcastable to the array. + + If *weights* is a dictionary then each key specifies axes + of the data (an `int` or `tuple` of `int`), with a + corresponding value of data_like weights for those + axes. 
The dimensions of a weights value must correspond to + its key axes in the same order. Not all of the axes need + weights assigned to them. The weights that will be used + will be an outer product of the dictionary's values. + + However they are specified, the weights are internally + broadcast to the shape of the data, and those weights that + are missing data, or that correspond to the missing + elements of the data, are assigned a weight of 0. + + For collapse functions that do not have a ``weights`` + parameter, *weights* must be `None`. + + keepdims: `bool`, optional + By default, the axes which are collapsed are left in the + result as dimensions with size one, so that the result + will broadcast correctly against the input array. If set + to False then collapsed axes are removed from the data. + + mtol: number, optional + The sample size threshold below which collapsed values are + set to missing data. It is defined as a fraction (between + 0 and 1 inclusive) of the contributing input data values. + + The default of *mtol* is 1, meaning that a missing datum + in the output array occurs whenever all of its + contributing input array elements are missing data. + + For other values, a missing datum in the output array + occurs whenever more than ``100*mtol%`` of its + contributing input array elements are missing data. + + ddof: number, optional + The delta degrees of freedom. The number of degrees of + freedom used in the calculation is (N-*ddof*) where N + represents the number of non-missing elements. + + For collapse functions that do not have a ``ddof`` + parameter, *ddof* must be `None`. + + split_every: `int` or `dict`, optional + Determines the depth of the recursive aggregation. See + `dask.array.reduction` for details. + + :Returns: + + (`Data`, formatted weights) + The collapsed data and the output of ``parse_weights(d, + weights, axis)``. + + """ + kwargs = { + "axis": axis, + "keepdims": keepdims, + "split_every": split_every, + "mtol": mtol, + "active_storage": d.active_storage, + } + + weights = parse_weights(d, weights, axis) + if weights is not None: + kwargs["weights"] = weights + + if ddof is not None: + kwargs["ddof"] = ddof + + dx = d.to_dask_array() + dx = func(dx, **kwargs) + d._set_dask(dx) + + return d, weights + + +def parse_weights(d, weights, axis=None): + """Parse the weights input to `collapse`. + + .. versionadded:: TODODASKVER + + .. seealso:: `collapse` + + :Parameters: + + d: `Data` + The data to be collapsed. + + weights: data_like or `dict` + See `collapse` for details. + + axis: (sequence of) `int`, optional + See `collapse` for details. + + :Returns: + + `Data` or `None` + * If *weights* is a data_like object then they are + returned unchanged as a `Data` object. It is up to the + downstream functions to check if the weights can be + broadcast to the data. + + * If *weights* is a dictionary then the dictionary + values', i.e. the weights components, outer product is + returned in `Data` object that is broadcastable to the + data. + + If the dictionary is empty, or none of the axes defined + by the keys correspond to collapse axes defined by + *axis*, then then the collapse is unweighted and `None` + is returned. + + Note that, in all cases, the returned weights are *not* + modified to account for missing values in the data. 
+ + **Examples** + + >>> d = cf.Data(np.arange(12)).reshape(4, 3) + + >>> parse_weights(d, [1, 2, 1], (0, 1)) + + + >>> parse_weights(d, [[1, 2, 1]], (0, 1)) + + + >>> parse_weights(d, {1: [1, 2, 1]}, (0, 1)) + + + >>> print(parse_weights(d, {0: [1, 2, 3, 4], 1: [1, 2, 1]}, (0, 1))) + [[1 2 1] + [2 4 2] + [3 6 3] + [4 8 4]] + + >>> print(cf.data.data.parse_weights(d, {}, (0, 1))) + None + + >>> print(cf.data.data.parse_weights(d, {1: [1, 2, 1]}, 0)) + None + + """ + if weights is None: + # No weights + return + + if not isinstance(weights, dict): + # Weights is data_like. Don't check broadcastability to d, + # leave that to whatever uses the weights. + return type(d).asdata(weights) + + if not weights: + # No weights (empty dictionary) + return + + if axis is None: + axis = tuple(range(d.ndim)) + else: + axis = d._parse_axes(axis) + + weights = weights.copy() + weights_axes = set() + for key, value in tuple(weights.items()): + del weights[key] + key = d._parse_axes(key) + if weights_axes.intersection(key): + raise ValueError("Duplicate weights axis") + + weights[tuple(key)] = value + weights_axes.update(key) + + if not weights_axes.intersection(axis): + # No weights span collapse axes + return + + # For each component, add missing dimensions as size 1. + w = [] + shape = d.shape + Data = type(d) + for key, value in weights.items(): + value = Data.asdata(value) + + # Make sure axes are in ascending order + skey = tuple(sorted(key)) + if key != skey: + value = value.transpose(skey) + key = skey + + if not all( + True if i in (j, 1) else False + for i, j in zip(value.shape, [shape[i] for i in key]) + ): + raise ValueError( + f"Weights component for axes {tuple(key)} with shape " + f"{value.shape} is not broadcastable to data with " + f"shape {shape}" + ) + + new_shape = [n if i in key else 1 for i, n in enumerate(shape)] + w.append(value.reshape(new_shape)) + + # Return the product of the weights components, which will be + # broadcastable to d + return reduce(mul, w) diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 0bbd70cf72..3368885495 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -4316,6 +4316,42 @@ def test_Data__init__datetime(self): self.assertTrue((q == d).array.all()) self.assertTrue((d == q).array.all()) + def test_Data_active_storage(self): + """Test `Data.active_storage`.""" + d = cf.Data([[9, 8]]) + self.assertFalse(d.active_storage) + + d._set_active_storage(True) + self.assertTrue(d.active_storage) + + # Check that operations correctly set active_storage to False + d[...] 
= -1 + self.assertFalse(d.active_storage) + + d._set_active_storage(True) + d.transpose(inplace=True) + self.assertFalse(d.active_storage) + + d._set_active_storage(True) + d.persist(inplace=True) + self.assertFalse(d.active_storage) + + d._set_active_storage(True) + d.rechunk(1, inplace=True) + self.assertFalse(d.active_storage) + + # Test with data on disk + n = cf.NetCDFArray( + "test_file.nc", + "eastward_wind", + shape=(1, 9, 10), + dtype=np.dtype(float), + ) + d = cf.Data(n) + self.assertTrue(d.active_storage) + d = cf.Data(n, to_memory=True) + self.assertFalse(d.active_storage) + if __name__ == "__main__": print("Run date:", datetime.datetime.now()) From 815d933bc6b913c9db132cfcf287f75a5c57c8b0 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 16 Nov 2022 11:28:39 +0000 Subject: [PATCH 002/134] dev --- cf/data/collapse.py | 1745 +++---------------------------------- cf/data/collapse_utils.py | 232 +++++ cf/data/dask_collapse.py | 1294 +++++++++++++++++++++++++++ cf/data/data.py | 12 +- cf/data/netcdfarray.py | 210 ++++- 5 files changed, 1799 insertions(+), 1694 deletions(-) create mode 100644 cf/data/collapse_utils.py create mode 100644 cf/data/dask_collapse.py diff --git a/cf/data/collapse.py b/cf/data/collapse.py index 0eb28e829a..a1aa39d903 100644 --- a/cf/data/collapse.py +++ b/cf/data/collapse.py @@ -1,259 +1,12 @@ -"""Functions used during `Data` object collapses.""" -import inspect -from functools import partial, reduce, wraps -from numbers import Integral -from operator import mul +"""Functions used for `Data` object collapses.""" +from functools import partial -import dask.array as da import numpy as np from cfdm.core import DocstringRewriteMeta -from dask.array import chunk -from dask.array.core import _concatenate2 -from dask.array.reductions import divide, numel, reduction -from dask.array.utils import validate_axis -from dask.base import collections_to_dsk -from dask.core import flatten -from dask.utils import deepmap +from dask.array.reductions import reduction from ..docstring import _docstring_substitution_definitions - - -def actify( - cls, a, op, axis=None, chunk_function=None, active_storage=False -): - """TODODASKDOCS. - - .. versionadded:: TODODASKVER - - :Parameters: - - a: `dask.array.Array` - The array to be collapsed. - - op: `str` - TODODASKDOCS - - axis: (sequence of) `int`, optional - TODODASKDOCS - - chunk_function: function - TODODASKDOCS - - {{active_storage: `bool`, optional}} - - :Returns: - - `dask.array.Array`, function - TODODASKDOCS - - """ - if not active_storage: - # It has been determined externally that an active storage - # reduction is not possible - return a, chunk_function - - # Still here? Then it is assumed that the dask array is of a form - # which might be able to exploit active storage. In particular, it - # is assumed that all data definitions point to files. - - # Parse axis - if axis is None: - axis = tuple(range(a.ndim)) - else: - if isinstance(axis, Integral): - axis = (axis,) - - if len(axis) != a.ndim: - # Can't (yet) use active storage to collapse a subset - # of the axes - return a, chunk_function - - axis = validate_axis(axis, a.ndim) - - active_chunk_functions = set() - - # Loop round elements of the dask graph, looking for data - # definitions that point to a file and which support active - # storage operations. The elements are traversed in reverse order - # so that the data defintions come out first, allowing for a fast - # short circuit in the common case when using active storage is no - # feasible. 
- dsk = collections_to_dsk((a,), optimize_graph=True) - for key, value in reversed(dsk.items()): - try: - value.get_filename() - except AttributeError: - # This value is not a data definition (it is assumed that - # all data definitions point to files). - continue - - try: - # Create a new actified data definition value - value = value.set_active_storage_op(op, axis) - except (AttributeError, ValueError): - # This data definition value does not support active - # storage reductions, or does not support the requested - # active storage reduction defined by 'op'. - active_chunk_functions = () - break - - try: - # Get the active storage chunk function - active_chunk_functions.add(value.get_active_chunk_function()) - except AttributeError: - # This data definition value does not support active - # storage reductions - active_chunk_functions = () - break - - # Still here? Then update the dask graph in-place with the - # actified data definition value. - dsk[key] = value - - if len(active_chunk_functions) == 1: - # All data definitions in the dask graph support active - # storage reductions with the same chunk function => redefine - # the array from the actified dask graph, and redefine the - # reduction chunk function. - a = da.Array(dsk, a.name, a.chunks, a.dtype, a._meta) - chunk_function = active_chunk_functions.pop() - - return a, chunk_function - - -def actify_collapse(collapse_method, chunk_function=None): - """A decorator for `Collapse` methods that enables active storage - operations, when the conditions are right. - - """ - def decorator(collapse_method, chunk_function=None): - print (chunk_function) - @wraps(collapse_method) - def wrapper(cls, *args, **kwargs): - print (args, kwargs, cf_max_chunk.op) - if kwargs.get("weights") is None and "axis" in kwargs: - # Collapse is unweighted over defined axes => attempt to - # actify the dask array and chunk function. - chunk_function = kwargs["chunk_function"] - - a, chunk_function = actify( - args[0], - op=chunk_function.op, - axis=kwargs["axis"], - chunk_function=chunk_function, - active_storage=kwargs["active_storage"], - ) - - args = (a,) - kwargs["chunk_function"] = chunk_function - - return collapse_method(cls, *args, **kwargs) - - return wrapper - -# -------------------------------------------------------------------- -# sample size -# -------------------------------------------------------------------- -def cf_sample_size_chunk(x, dtype="i8", computing_meta=False, **kwargs): - """Chunk calculations for the sample size. - - This function is passed to `dask.array.reduction` as its *chunk* - parameter. - - .. versionadded:: TODODASKVER - - :Parameters: - - See `dask.array.reductions` for details of the parameters. - - :Returns: - - `dict` - Dictionary with the keys: - - * N: The sample size. - - """ - if computing_meta: - return x - - if np.ma.isMA(x): - N = chunk.sum(np.ones_like(x, dtype=dtype), **kwargs) - else: - if dtype: - kwargs["dtype"] = dtype - - N = numel(x, **kwargs) - - return {"N": N} - -# -------------------------------------------------------------------- -# maximum -# -------------------------------------------------------------------- -def cf_max_chunk(x, dtype=None, computing_meta=False, **kwargs): - """Chunk calculations for the maximum. - - This function is passed to `dask.array.reduction` as its *chunk* - parameter. - - .. versionadded:: TODODASKVER - - :Parameters: - - See `dask.array.reductions` for details of the parameters. - - :Returns: - - `dict` - Dictionary with the keys: - - * N: The sample size. 
- * max: The maximum of `x``. - - """ - if computing_meta: - return x - - return { - "max": chunk.max(x, **kwargs), - "N": cf_sample_size_chunk(x, **kwargs)["N"], - } - -cf_max_chunk.op = "max" - -# -------------------------------------------------------------------- -# minimum -# -------------------------------------------------------------------- -def cf_min_chunk(x, dtype=None, computing_meta=False, **kwargs): - """Chunk calculations for the minimum. - - This function is passed to `dask.array.reduction` as its *chunk* - parameter. - - .. versionadded:: TODODASKVER - - :Parameters: - - See `dask.array.reductions` for details of the parameters. - - :Returns: - - `dict` - Dictionary with the keys: - - * N: The sample size. - * min: The minimum of ``x``. - - """ - if computing_meta: - return x - - return { - "min": chunk.min(x, **kwargs), - "N": cf_sample_size_chunk(x, **kwargs)["N"], - } - -cf_min_chunk.op = "min" +from .collapse_utils import actify, check_input_dtype, double_precision_dtype class Collapse(metaclass=DocstringRewriteMeta): @@ -296,117 +49,12 @@ def __docstring_package_depth__(self): return 0 @classmethod - def actify( - cls, a, op, axis=None, chunk_function=None, active_storage=False - ): - """TODODASKDOCS. - - .. versionadded:: TODODASKVER - - :Parameters: - - a: `dask.array.Array` - The array to be collapsed. - - op: `str` - TODODASKDOCS - - axis: (sequence of) `int`, optional - TODODASKDOCS - - chunk_function: function - TODODASKDOCS - - {{active_storage: `bool`, optional}} - - :Returns: - - `dask.array.Array`, function - TODODASKDOCS - - """ - if not active_storage: - # It has been determined externally that an active storage - # reduction is not possible - return a, chunk_function - - # Still here? Then it is assumed that the dask array is of a - # form which might be able to exploit active storage. In - # particular, it is assumed that all data definitions point to - # files. - - # Parse axis - if axis is None: - axis = tuple(range(a.ndim)) - else: - if isinstance(axis, Integral): - axis = (axis,) - - if len(axis) != a.ndim: - # Can't (yet) use active storage to collapse a subset - # of the axes - return a, chunk_function - - axis = validate_axis(axis, a.ndim) - - active_chunk_functions = set() - - # Loop round elements of the dask graph, looking for data - # definitions that point to a file and which support active - # storage operations. The elements are traversed in reverse - # order so that the data defintions come out first, allowing - # for a fast short circuit in the common case when using - # active storage is no feasible. - dsk = collections_to_dsk((a,), optimize_graph=True) - for key, value in reversed(dsk.items()): - try: - value.get_filename() - except AttributeError: - # This value is not a data definition (it is assumed - # that all data definitions point to files). - continue - - try: - # Create a new actified data definition value - value = value.set_active_storage_op(op, axis) - except (AttributeError, ValueError): - # This data definition value does not support active - # storage reductions, or does not support the - # requested active storage reduction defined by 'op'. - active_chunk_functions = () - break - - try: - # Get the active storage chunk function - active_chunk_functions.add(value.get_active_chunk_function()) - except AttributeError: - # This data definition value does not support active - # storage reductions - active_chunk_functions = () - break - - # Still here? 
Then update the dask graph in-place with the - # actified data definition value. - dsk[key] = value - - if len(active_chunk_functions) == 1: - # All data definitions in the dask graph support active - # storage reductions with the same chunk function => - # redefine the array from the actified dask graph, and - # redefine the reduction chunk function. - a = da.Array(dsk, a.name, a.chunks, a.dtype, a._meta) - chunk_function = active_chunk_functions.pop() - - return a, chunk_function - - @classmethod - @actify_collapse(chunk_function=cf_max_chunk) def max( cls, a, axis=None, keepdims=False, - mtol=None, + mtol=1, split_every=None, active_storage=False, ): @@ -436,24 +84,30 @@ def max( {{active_storage: `bool`, optional}} - {{chunk_function: function}} - :Returns: `dask.array.Array` The collapsed array. """ + from .dask_collapse import cf_max_agg, cf_max_chunk, cf_max_combine + check_input_dtype(a) dtype = a.dtype - # a, cf_max_chunk = cls.actify( - # a, "max", axis, cf_max_chunk, active_storage - # ) + # Rewrite data and chunk function if active storage operations + # are available. + a, chunk_function = actify( + a, + method="max", + axis=axis, + chunk_function=cf_max_chunk, + active_storage=active_storage, + ) return reduction( a, - chunk_function, # cf_max_chunk, + chunk_function, partial(cf_max_agg, mtol=mtol, original_shape=a.shape), axis=axis, keepdims=keepdims, @@ -470,7 +124,7 @@ def max_abs( a, axis=None, keepdims=False, - mtol=None, + mtol=1, split_every=None, active_storage=False, ): @@ -521,7 +175,7 @@ def mean( axis=None, weights=None, keepdims=False, - mtol=None, + mtol=1, split_every=None, active_storage=False, ): @@ -559,17 +213,24 @@ def mean( The collapsed array. """ + from .dask_collapse import cf_mean_agg, cf_mean_chunk, cf_mean_combine + check_input_dtype(a) dtype = "f8" -# if weights is None: -# a, cf_mean_chunk = cls.actify( -# a, "mean", axis, cf_mean_chunk, active_storage -# ) + # Rewrite data and chunk function if active storage operations + # are available. + a, chunk_function = actify( + a, + method="mean", + axis=axis, + chunk_function=cf_mean_chunk, + active_storage=active_storage, + ) return reduction( a, - cf_mean_chunk, + chunk_function, partial(cf_mean_agg, mtol=mtol, original_shape=a.shape), axis=axis, keepdims=keepdims, @@ -588,7 +249,7 @@ def mean_abs( weights=None, axis=None, keepdims=False, - mtol=None, + mtol=1, split_every=None, active_storage=False, ): @@ -642,7 +303,7 @@ def mid_range( axis=None, dtype=None, keepdims=False, - mtol=None, + mtol=1, split_every=None, active_storage=False, ): @@ -678,6 +339,12 @@ def mid_range( The collapsed array. """ + from .dask_collapse import ( + cf_mid_range_agg, + cf_range_chunk, + cf_range_combine, + ) + check_input_dtype(a, allowed="fi") dtype = "f8" return reduction( @@ -699,7 +366,7 @@ def min( a, axis=None, keepdims=False, - mtol=None, + mtol=1, split_every=None, active_storage=False, ): @@ -735,16 +402,24 @@ def min( The collapsed array. """ + from .dask_collapse import cf_min_agg, cf_min_chunk, cf_min_combine + check_input_dtype(a) dtype = a.dtype -# a, cf_min_chunk = cls.actify( -# a, "min", axis, cf_min_chunk, active_storage -# ) + # Rewrite data and chunk function if active storage operations + # are available. 
+ a, chunk_function = actify( + a, + method="min", + axis=axis, + chunk_function=cf_min_chunk, + active_storage=active_storage, + ) return reduction( a, - cf_min_chunk, + chunk_function, partial(cf_min_agg, mtol=mtol, original_shape=a.shape), axis=axis, keepdims=keepdims, @@ -761,7 +436,7 @@ def min_abs( a, axis=None, keepdims=False, - mtol=None, + mtol=1, split_every=None, active_storage=False, ): @@ -811,7 +486,7 @@ def range( a, axis=None, keepdims=False, - mtol=None, + mtol=1, split_every=None, active_storage=False, ): @@ -847,6 +522,12 @@ def range( The collapsed array. """ + from .dask_collapse import ( + cf_range_agg, + cf_range_chunk, + cf_range_combine, + ) + check_input_dtype(a, allowed="fi") dtype = a.dtype return reduction( @@ -869,7 +550,7 @@ def rms( axis=None, weights=None, keepdims=False, - mtol=None, + mtol=1, split_every=None, active_storage=False, ): @@ -907,6 +588,8 @@ def rms( The collapsed array. """ + from .dask_collapse import cf_mean_combine, cf_rms_agg, cf_rms_chunk + check_input_dtype(a) dtype = "f8" return reduction( @@ -929,7 +612,7 @@ def sample_size( a, axis=None, keepdims=False, - mtol=None, + mtol=1, split_every=None, active_storage=False, ): @@ -965,6 +648,12 @@ def sample_size( The collapsed array. """ + from .dask_collapse import ( + cf_sample_size_agg, + cf_sample_size_chunk, + cf_sample_size_combine, + ) + check_input_dtype(a) dtype = "i8" return reduction( @@ -987,7 +676,7 @@ def sum( axis=None, weights=None, keepdims=False, - mtol=None, + mtol=1, split_every=None, active_storage=False, ): @@ -1025,14 +714,26 @@ def sum( The collapsed array. """ + from .dask_collapse import cf_sum_agg, cf_sum_chunk, cf_sum_combine + check_input_dtype(a) dtype = double_precision_dtype(a) if weights is not None: dtype = np.result_type(double_precision_dtype(weights), dtype) + # Rewrite data and chunk function if active storage operations + # are available. + a, chunk_function = actify( + a, + method="sum", + axis=axis, + chunk_function=cf_sum_chunk, + active_storage=active_storage, + ) + return reduction( a, - cf_sum_chunk, + chunk_function, partial(cf_sum_agg, mtol=mtol, original_shape=a.shape), axis=axis, keepdims=keepdims, @@ -1051,7 +752,7 @@ def sum_of_weights( axis=None, weights=None, keepdims=False, - mtol=None, + mtol=1, split_every=None, active_storage=False, ): @@ -1089,6 +790,12 @@ def sum_of_weights( The collapsed array. """ + from .dask_collapse import ( + cf_sum_agg, + cf_sum_combine, + cf_sum_of_weights_chunk, + ) + check_input_dtype(a) dtype = double_precision_dtype(weights, default="i8") return reduction( @@ -1112,7 +819,7 @@ def sum_of_weights2( axis=None, weights=None, keepdims=False, - mtol=None, + mtol=1, split_every=None, active_storage=False, ): @@ -1150,6 +857,12 @@ def sum_of_weights2( The collapsed array. """ + from .dask_collapse import ( + cf_sum_agg, + cf_sum_combine, + cf_sum_of_weights_chunk, + ) + check_input_dtype(a) dtype = double_precision_dtype(weights, default="i8") return reduction( @@ -1173,7 +886,7 @@ def var( axis=None, weights=None, keepdims=False, - mtol=None, + mtol=1, ddof=None, split_every=None, active_storage=False, @@ -1214,6 +927,8 @@ def var( The collapsed array. 
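Every method above follows the same recipe: check the input dtype, optionally swap in an actified chunk function, and hand the chunk/combine/aggregate trio to `dask.array.reduction`. The following is a minimal, hedged usage sketch of that public interface; it assumes the `cf` package built from this patch is importable, and the data are made up.

import dask.array as da
from cf.data.collapse import Collapse

# Any float dask array will do; active storage is off by default, so
# actify() returns the array and chunk function unchanged.
a = da.arange(24, dtype="f8", chunks=6).reshape(4, 6)

row_max = Collapse.max(a, axis=1)     # lazy per-row maxima
total = Collapse.sum(a, axis=(0, 1))  # lazy unweighted grand total
print(row_max.compute(), total.compute())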
""" + from .dask_collapse import cf_var_agg, cf_var_chunk, cf_var_combine + check_input_dtype(a) dtype = "f8" return reduction( @@ -1243,7 +958,7 @@ def unique(cls, a, split_every=None, active_storage=False): {{split_every: `int` or `dict`, optional}} - {{active_storage: `bool`, optional}} + {{active_storage: `bool`, optional}} :Returns: @@ -1251,6 +966,8 @@ def unique(cls, a, split_every=None, active_storage=False): The unique values in a 1-d array. """ + from .dask_collapse import cf_unique_agg, cf_unique_chunk + check_input_dtype(a, "fibUS") # Flatten the array so that it has the same number of @@ -1272,1257 +989,3 @@ def unique(cls, a, split_every=None, active_storage=False): concatenate=False, meta=np.array((), dtype=dtype), ) - - -def check_input_dtype(a, allowed="fib"): - """Check that data has a data type allowed by a collapse method. - - The collapse method is assumed to be defined by the calling - function. - - :Parameters: - - a: `dask.array.Array` - The data. - - allowed: `str`, optional - The data type kinds allowed by the collapse - method. Defaults to ``'fib'``, meaning that only float, - integer and Boolean data types are allowed. - - :Returns: - - `None` - - """ - if a.dtype.kind not in allowed: - method = inspect.currentframe().f_back.f_code.co_name - raise TypeError(f"Can't calculate {method} of data with {a.dtype!r}") - - -def double_precision_dtype(a, default=None, bool_type="i"): - """Returns the corresponding double precision data type of an array. - - :Parameters: - - a: `dask.array.Array` or `None` - The data. If `None` then the value of *default* is - returned*. - - default: `str`, optional - If *a* is `None`, then return this data type. - - bool_type: `str`, optional - The corresponding double data type kind for Boolean - data. Defaults to ``'i'``, meaning ``'i8'`` is - returned. Set to ``'f'` to return ``'f8'`` instead. - - :Returns: - - `str` - The double precision type. - - **Examples** - - >>> for d in (int, 'int32', float, 'float32', bool): - ... print(double_precision_dtype(np.array(1, dtype=d))) - ... - i8 - i8 - f8 - f8 - i8 - - >>> double_precision_dtype(np.array(1, dtype=bool), bool_type='f') - 'f8' - >>> double_precision_dtype(None, default="i8") - 'i8' - - """ - if a is None: - return default - - kind = a.dtype.kind - if kind == "b": - return bool_type + "8" - - if kind in "fi": - return kind + "8" - - raise TypeError(f"Can't collapse data with {a.dtype!r}") - - -def mask_small_sample_size(x, N, axis, mtol, original_shape): - """Mask elements where the sample size is below a threshold. - - .. versionadded:: TODODASKVER - - :Parameters: - - x: `numpy.ndarray` - The collapsed data. - - N: `numpy.ndarray` - The sample sizes of the collapsed values. - - axis: sequence of `int` - The axes being collapsed. - - mtol: number - The sample size threshold below which collapsed values are - set to missing data. It is defined as a fraction (between - 0 and 1 inclusive) of the contributing input data values. - - The default of *mtol* is 1, meaning that a missing datum - in the output array occurs whenever all of its - contributing input array elements are missing data. - - For other values, a missing datum in the output array - occurs whenever more than ``100*mtol%`` of its - contributing input array elements are missing data. - - Note that for non-zero values of *mtol*, different - collapsed elements may have different sample sizes, - depending on the distribution of missing data in the input - data. 
- - original_shape: `tuple` - The shape of the original, uncollapsed data. - - :Returns: - - `numpy.ndarray` - Array *x* masked where *N* is sufficiently small. Note - that the input *x* might be modified in-place with the - contents of the output. - - """ - if not x.ndim: - # Make sure that we have a numpy array (e.g. as opposed to a - # numpy.float64) - x = np.asanyarray(x) - - if mtol < 1: - # Nmax = total number of elements, including missing values - Nmax = reduce(mul, [original_shape[i] for i in axis], 1) - x = np.ma.masked_where(N < (1 - mtol) * Nmax, x, copy=False) - - return x - - -def sum_weights_chunk(x, weights=None, square=False, N=None, **kwargs): - """Sum the weights. - - .. versionadded:: TODODASKVER - - :Parameters: - - x: `numpy.ndarray` - The data. - - weights: `numpy.ndarray`, optional - The weights associated with values of the data. Must have - the same shape as *x*. By default *weights* is `None`, - meaning that all non-missing elements of the data have a - weight of 1 and all missing elements have a weight of - 0. If given as an array then those weights that are - missing data, or that correspond to the missing elements - of the data, are assigned a weight of 0. - - square: `bool`, optional - If True calculate the sum of the squares of the weights. - - N: `numpy.ndarray`, optional - The sample size. If provided as an array and there are no - weights, then the *N* is returned instead of calculating - the sum (of the squares) of weights. Ignored of *weights* - is not `None`. - - :Returns: - - `numpy.ndarray` - The sum of the weights, with data type "i8" or "f8". - - """ - if weights is None: - # All weights are 1, so the sum of the weights and the sum of - # the squares of the weights are both equal to the sample - # size. - if N is None: - N = cf_sample_size_chunk(x, **kwargs)["N"] - - return N - - dtype = double_precision_dtype(weights) - if square: - weights = np.multiply(weights, weights, dtype=dtype) - - if np.ma.is_masked(x): - weights = np.ma.masked_where(x.mask, weights) - - return chunk.sum(weights, dtype=dtype, **kwargs) - - -def combine_arrays( - pairs, key, func, axis, dtype=None, computing_meta=False, **kwargs -): - """Worker function for Combine callables. - - Select arrays by dictionary key from a nested list of - dictionaries, concatenate the resulting nested list of arrays - along the specified axes, and then apply a function to the result - along those same axes. - - See `dask.array.reductions.mean_combine` for an example. - - .. versionadded:: TODODASKVER - - :Returns: - - `numpy.ndarray` - - """ - x = deepmap(lambda pair: pair[key], pairs) if not computing_meta else pairs - - if dtype: - kwargs["dtype"] = dtype - - x = _concatenate2(x, axes=axis) - return func(x, axis=axis, **kwargs) - - -def sum_arrays(pairs, key, axis, dtype, computing_meta=False, **kwargs): - """Alias of `combine_arrays` with ``func=chunk.sum``. - - .. versionadded:: TODODASKVER - - """ - return combine_arrays( - pairs, key, chunk.sum, axis, dtype, computing_meta, **kwargs - ) - - -def max_arrays(pairs, key, axis, dtype, computing_meta=False, **kwargs): - """Alias of `combine_arrays` with ``func=chunk.max``. - - .. versionadded:: TODODASKVER - - """ - return combine_arrays( - pairs, key, chunk.max, axis, dtype, computing_meta, **kwargs - ) - - -def min_arrays(pairs, key, axis, dtype, computing_meta=False, **kwargs): - """Alias of `combine_arrays` with ``func=chunk.min``. - - .. 
versionadded:: TODODASKVER - - """ - return combine_arrays( - pairs, key, chunk.min, axis, dtype, computing_meta, **kwargs - ) - - -def sum_sample_sizes(pairs, axis, computing_meta=False, **kwargs): - """Alias of `combine_arrays` with ``key="N", func=chunk.sum, - dtype="i8"``. - - .. versionadded:: TODODASKVER - - """ - return combine_arrays( - pairs, - "N", - chunk.sum, - axis, - dtype="i8", - computing_meta=computing_meta, - **kwargs, - ) - - -# -------------------------------------------------------------------- -# mean -# -------------------------------------------------------------------- -def cf_mean_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): - """Chunk calculations for the mean. - - This function is passed to `dask.array.reduction` as its *chunk* - parameter. - - .. versionadded:: TODODASKVER - - :Parameters: - - See `dask.array.reductions` for details of the parameters. - - :Returns: - - `dict` - Dictionary with the keys: - - * N: The sample size. - * V1: The sum of ``weights`` (equal to ``N`` if weights - are not set). - * sum: The weighted sum of ``x``. - * weighted: True if weights have been set. - - """ - if computing_meta: - return x - - # N, sum - d = cf_sum_chunk(x, weights, dtype=dtype, **kwargs) - - d["V1"] = sum_weights_chunk(x, weights, N=d["N"], **kwargs) - d["weighted"] = weights is not None - - return d - - -def cf_mean_combine( - pairs, axis=None, dtype="f8", computing_meta=False, **kwargs -): - """Combination calculations for the mean. - - This function is passed to `dask.array.reduction` as its *combine* - parameter. - - .. versionadded:: TODODASKVER - - :Parameters: - - See `dask.array.reductions` for details of the parameters. - - :Returns: - - As for `cf_mean_chunk`. - - """ - if not isinstance(pairs, list): - pairs = [pairs] - - weighted = next(flatten(pairs))["weighted"] - d = {"weighted": weighted} - - d["sum"] = sum_arrays(pairs, "sum", axis, dtype, computing_meta, **kwargs) - if computing_meta: - return d["sum"] - - d["N"] = sum_sample_sizes(pairs, axis, **kwargs) - if weighted: - d["V1"] = sum_arrays(pairs, "V1", axis, dtype, **kwargs) - else: - d["V1"] = d["N"] - - return d - - -def cf_mean_agg( - pairs, - axis=None, - dtype="f8", - computing_meta=False, - mtol=None, - original_shape=None, - **kwargs, -): - """Aggregation calculations for the mean. - - This function is passed to `dask.array.reduction` as its - *aggregate* parameter. - - .. versionadded:: TODODASKVER - - :Parameters: - - mtol: number, optional - The sample size threshold below which collapsed values are - set to missing data. See `mask_small_sample_size` for - details. - - original_shape: `tuple` - The shape of the original, uncollapsed data. - - See `dask.array.reductions` for details of the other - parameters. - - :Returns: - - `dask.array.Array` - The collapsed array. - - """ - d = cf_mean_combine(pairs, axis, dtype, computing_meta, **kwargs) - if computing_meta: - return d - - x = divide(d["sum"], d["V1"], dtype=dtype) - x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) - return x - - -def cf_max_combine(pairs, axis=None, computing_meta=False, **kwargs): - """Combination calculations for the maximum. - - This function is passed to `dask.array.reduction` as its *combine* - parameter. - - .. versionadded:: TODODASKVER - - :Parameters: - - See `dask.array.reductions` for details of the parameters. - - :Returns: - - As for `cf_max_chunk`. 
- - """ - if not isinstance(pairs, list): - pairs = [pairs] - - mx = max_arrays(pairs, "max", axis, None, computing_meta, **kwargs) - if computing_meta: - return mx - - return {"max": mx, "N": sum_sample_sizes(pairs, axis, **kwargs)} - - -def cf_max_agg( - pairs, - axis=None, - computing_meta=False, - mtol=None, - original_shape=None, - **kwargs, -): - """Aggregation calculations for the maximum. - - This function is passed to `dask.array.reduction` as its - *aggregate* parameter. - - .. versionadded:: TODODASKVER - - :Parameters: - - mtol: number, optional - The sample size threshold below which collapsed values are - set to missing data. See `mask_small_sample_size` for - details. - - original_shape: `tuple` - The shape of the original, uncollapsed data. - - See `dask.array.reductions` for details of the other - parameters. - - :Returns: - - `dask.array.Array` - The collapsed array. - - """ - d = cf_max_combine(pairs, axis, computing_meta, **kwargs) - if computing_meta: - return d - - x = d["max"] - x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) - return x - - -# -------------------------------------------------------------------- -# mid-range -# -------------------------------------------------------------------- -def cf_mid_range_agg( - pairs, - axis=None, - dtype="f8", - computing_meta=False, - mtol=None, - original_shape=None, - **kwargs, -): - """Aggregation calculations for the mid-range. - - This function is passed to `dask.array.reduction` as its - *aggregate* parameter. - - .. versionadded:: TODODASKVER - - :Parameters: - - mtol: number, optional - The sample size threshold below which collapsed values are - set to missing data. See `mask_small_sample_size` for - details. - - original_shape: `tuple` - The shape of the original, uncollapsed data. - - See `dask.array.reductions` for details of the other - parameters. - - :Returns: - - `dask.array.Array` - The collapsed array. - - """ - d = cf_range_combine(pairs, axis, dtype, computing_meta, **kwargs) - if computing_meta: - return d - - # Calculate the mid-range - x = divide(d["max"] + d["min"], 2.0, dtype=dtype) - x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) - return x - - - -def cf_min_combine(pairs, axis=None, computing_meta=False, **kwargs): - """Combination calculations for the minimum. - - This function is passed to `dask.array.reduction` as its *combine* - parameter. - - .. versionadded:: TODODASKVER - - :Parameters: - - See `dask.array.reductions` for details of the parameters. - - :Returns: - - As for `cf_min_chunk`. - - """ - if not isinstance(pairs, list): - pairs = [pairs] - - mn = min_arrays(pairs, "min", axis, None, computing_meta, **kwargs) - if computing_meta: - return mn - - return {"min": mn, "N": sum_sample_sizes(pairs, axis, **kwargs)} - - -def cf_min_agg( - pairs, - axis=None, - computing_meta=False, - mtol=None, - original_shape=None, - **kwargs, -): - """Aggregation calculations for the minimum. - - This function is passed to `dask.array.reduction` as its - *aggregate* parameter. - - .. versionadded:: TODODASKVER - - :Parameters: - - mtol: number, optional - The sample size threshold below which collapsed values are - set to missing data. See `mask_small_sample_size` for - details. - - original_shape: `tuple` - The shape of the original, uncollapsed data. - - See `dask.array.reductions` for details of the other - parameters. - - :Returns: - - `dask.array.Array` - The collapsed array. 
- - """ - d = cf_min_combine(pairs, axis, computing_meta, **kwargs) - if computing_meta: - return d - - x = d["min"] - x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) - return x - - -# -------------------------------------------------------------------- -# range -# -------------------------------------------------------------------- -def cf_range_chunk(x, dtype=None, computing_meta=False, **kwargs): - """Chunk calculations for the range. - - This function is passed to `dask.array.reduction` as its *chunk* - parameter. - - .. versionadded:: TODODASKVER - - :Parameters: - - See `dask.array.reductions` for details of the parameters. - - :Returns: - - `dict` - Dictionary with the keys: - - * N: The sample size. - * min: The minimum of ``x``. - * max: The maximum of ``x`. - - """ - if computing_meta: - return x - - # N, max - d = cf_max_chunk(x, **kwargs) - - d["min"] = chunk.min(x, **kwargs) - return d - - -def cf_range_combine( - pairs, axis=None, dtype=None, computing_meta=False, **kwargs -): - """Combination calculations for the range. - - This function is passed to `dask.array.reduction` as its *combine* - parameter. - - .. versionadded:: TODODASKVER - - :Parameters: - - See `dask.array.reductions` for details of the parameters. - - :Returns: - - As for `cf_range_chunk`. - - """ - if not isinstance(pairs, list): - pairs = [pairs] - - mx = max_arrays(pairs, "max", axis, None, computing_meta, **kwargs) - if computing_meta: - return mx - - mn = min_arrays(pairs, "min", axis, None, **kwargs) - - return {"max": mx, "min": mn, "N": sum_sample_sizes(pairs, axis, **kwargs)} - - -def cf_range_agg( - pairs, - axis=None, - computing_meta=False, - mtol=None, - original_shape=None, - **kwargs, -): - """Aggregation calculations for the range. - - This function is passed to `dask.array.reduction` as its - *aggregate* parameter. - - .. versionadded:: TODODASKVER - - :Parameters: - - mtol: number, optional - The sample size threshold below which collapsed values are - set to missing data. See `mask_small_sample_size` for - details. - - original_shape: `tuple` - The shape of the original, uncollapsed data. - - See `dask.array.reductions` for details of the other - parameters. - - :Returns: - - `dask.array.Array` - The collapsed array. - - """ - d = cf_range_combine(pairs, axis, computing_meta, **kwargs) - if computing_meta: - return d - - # Calculate the range - x = d["max"] - d["min"] - x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) - return x - - -# -------------------------------------------------------------------- -# root mean square -# -------------------------------------------------------------------- -def cf_rms_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): - """Chunk calculations for the root mean square (RMS). - - This function is passed to `dask.array.reduction` as its *chunk* - parameter. - - .. versionadded:: TODODASKVER - - :Parameters: - - See `dask.array.reductions` for details of the parameters. - - :Returns: - - `dict` - Dictionary with the keys: - - * N: The sample size. - * sum: The weighted sum of ``x**2``. - - """ - if computing_meta: - return x - - return cf_mean_chunk( - np.multiply(x, x, dtype=dtype), weights=weights, dtype=dtype, **kwargs - ) - - -def cf_rms_agg( - pairs, - axis=None, - dtype="f8", - computing_meta=False, - mtol=None, - original_shape=None, - **kwargs, -): - """Aggregation calculations for the root mean square (RMS). - - This function is passed to `dask.array.reduction` as its - *aggregate* parameter. 
- - .. versionadded:: TODODASKVER - - :Parameters: - - mtol: number, optional - The sample size threshold below which collapsed values are - set to missing data. See `mask_small_sample_size` for - details. - - original_shape: `tuple` - The shape of the original, uncollapsed data. - - See `dask.array.reductions` for details of the other - parameters. - - :Returns: - - `dask.array.Array` - The collapsed array. - - """ - d = cf_mean_combine(pairs, axis, dtype, computing_meta, **kwargs) - if computing_meta: - return d - - x = np.sqrt(d["sum"] / d["V1"], dtype=dtype) - x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) - return x - - - -def cf_sample_size_combine( - pairs, axis=None, dtype="i8", computing_meta=False, **kwargs -): - """Combination calculations for the sample size. - - This function is passed to `dask.array.reduction` as its *combine* - parameter. - - .. versionadded:: TODODASKVER - - :Parameters: - - See `dask.array.reductions` for details of the parameters. - - :Returns: - - As for `cf_sample_size_chunk`. - - """ - if not isinstance(pairs, list): - pairs = [pairs] - - x = sum_arrays(pairs, "N", axis, dtype, computing_meta, **kwargs) - if computing_meta: - return x - - return {"N": x} - - -def cf_sample_size_agg( - pairs, - axis=None, - computing_meta=False, - dtype="i8", - mtol=None, - original_shape=None, - **kwargs, -): - """Aggregation calculations for the sample size. - - This function is passed to `dask.array.reduction` as its - *aggregate* parameter. - - .. versionadded:: TODODASKVER - - :Parameters: - - mtol: number, optional - The sample size threshold below which collapsed values are - set to missing data. See `mask_small_sample_size` for - details. - - original_shape: `tuple` - The shape of the original, uncollapsed data. - - See `dask.array.reductions` for details of the other - parameters. - - :Returns: - - `dask.array.Array` - The collapsed array. - - """ - d = cf_sample_size_combine(pairs, axis, dtype, computing_meta, **kwargs) - if computing_meta: - return d - - x = d["N"] - x = mask_small_sample_size(x, x, axis, mtol, original_shape) - return x - - -# -------------------------------------------------------------------- -# sum -# -------------------------------------------------------------------- -def cf_sum_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): - """Chunk calculations for the sum. - - This function is passed to `dask.array.reduction` as its *chunk* - parameter. - - .. versionadded:: TODODASKVER - - :Parameters: - - See `dask.array.reductions` for details of the parameters. - - :Returns: - - `dict` - Dictionary with the keys: - - * N: The sample size. - * sum: The weighted sum of ``x`` - - """ - if computing_meta: - return x - - if weights is not None: - x = np.multiply(x, weights, dtype=dtype) - - d = cf_sample_size_chunk(x, **kwargs) - d["sum"] = chunk.sum(x, dtype=dtype, **kwargs) - return d - - -def cf_sum_combine( - pairs, axis=None, dtype="f8", computing_meta=False, **kwargs -): - """Combination calculations for the sum. - - This function is passed to `dask.array.reduction` as its *combine* - parameter. - - .. versionadded:: TODODASKVER - - :Parameters: - - See `dask.array.reductions` for details of the parameters. - - :Returns: - - As for `cf_sum_chunk`. 
- - """ - if not isinstance(pairs, list): - pairs = [pairs] - - x = sum_arrays(pairs, "sum", axis, dtype, computing_meta, **kwargs) - if computing_meta: - return x - - return {"sum": x, "N": sum_sample_sizes(pairs, axis, **kwargs)} - - -def cf_sum_agg( - pairs, - axis=None, - dtype="f8", - computing_meta=False, - mtol=None, - original_shape=None, - **kwargs, -): - """Aggregation calculations for the sum. - - This function is passed to `dask.array.reduction` as its - *aggregate* parameter. - - .. versionadded:: TODODASKVER - - :Parameters: - - mtol: number, optional - The sample size threshold below which collapsed values are - set to missing data. See `mask_small_sample_size` for - details. - - original_shape: `tuple` - The shape of the original, uncollapsed data. - - See `dask.array.reductions` for details of the other - parameters. - - :Returns: - - `dask.array.Array` - The collapsed array. - - """ - d = cf_sum_combine(pairs, axis, dtype, computing_meta, **kwargs) - if computing_meta: - return d - - x = d["sum"] - x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) - return x - - -# -------------------------------------------------------------------- -# sum of weights -# -------------------------------------------------------------------- -def cf_sum_of_weights_chunk( - x, weights=None, dtype="f8", computing_meta=False, square=False, **kwargs -): - """Chunk calculations for the sum of the weights. - - This function is passed to `dask.array.reduction` as its *chunk* - parameter. - - :Parameters: - - square: `bool`, optional - If True then calculate the sum of the squares of the - weights. - - See `dask.array.reductions` for details of the other - parameters. - - :Returns: - - `dict` - Dictionary with the keys: - - * N: The sample size. - * sum: The sum of ``weights``, or the sum of - ``weights**2`` if *square* is True. - - """ - if computing_meta: - return x - - # N - d = cf_sample_size_chunk(x, **kwargs) - - d["sum"] = sum_weights_chunk( - x, weights=weights, square=square, N=d["N"], **kwargs - ) - - return d - - -# -------------------------------------------------------------------- -# unique -# -------------------------------------------------------------------- -def cf_unique_chunk(x, dtype=None, computing_meta=False, **kwargs): - """Chunk calculations for the unique values. - - This function is passed to `dask.array.reduction` as its *chunk* - parameter. - - .. versionadded:: TODODASKVER - - :Parameters: - - See `dask.array.reductions` for details of the parameters. - - :Returns: - - `dict` - Dictionary with the keys: - - * unique: The unique values. - - """ - if computing_meta: - return x - - return {"unique": np.unique(x)} - - -def cf_unique_agg(pairs, axis=None, computing_meta=False, **kwargs): - """Aggregation calculations for the unique values. - - This function is passed to `dask.array.reduction` as its - *aggregate* parameter. - - It is assumed that the arrays are one-dimensional. - - .. versionadded:: TODODASKVER - - :Parameters: - - See `dask.array.reductions` for details of the parameters. - - :Returns: - - `dask.array.Array` - The unique values. 
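The unique collapse described here is the simplest of the chunk/aggregate pairs: each chunk reports its own unique values, and the aggregate step takes the unique of their concatenation. A small numpy-only illustration of that pattern (not the library code):

import numpy as np

# Per-chunk uniques, then one final unique over the concatenated results
chunks = [np.array([1, 2, 2]), np.array([2, 3])]
per_chunk = [np.unique(c) for c in chunks]
print(np.unique(np.concatenate(per_chunk)))  # [1 2 3]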
- - """ - x = ( - deepmap(lambda pair: pair["unique"], pairs) - if not computing_meta - else pairs - ) - if computing_meta: - return x - - x = _concatenate2(x, axes=[0]) - return np.unique(x) - - -# -------------------------------------------------------------------- -# variance -# -------------------------------------------------------------------- -def cf_var_chunk( - x, weights=None, dtype="f8", computing_meta=False, ddof=None, **kwargs -): - """Chunk calculations for the variance. - - This function is passed to `dask.array.reduction` as its *chunk* - parameter. - - See - https://en.wikipedia.org/wiki/Pooled_variance#Sample-based_statistics - for details. - - .. versionadded:: TODODASKVER - - :Parameters: - - ddof: number - The delta degrees of freedom. The number of degrees of - freedom used in the calculation is (N-*ddof*) where N - represents the number of non-missing elements. A value of - 1 applies Bessel's correction. - - See `dask.array.reductions` for details of the other - parameters. - - :Returns: - - `dict` - Dictionary with the keys: - - * N: The sample size. - * V1: The sum of ``weights`` (equal to ``N`` if weights - are not set). - * V2: The sum of ``weights**2``, or `None` of not - required. - * sum: The weighted sum of ``x``. - * part: ``V1 * (sigma**2 + mu**2)``, where ``sigma**2`` is - the weighted biased (i.e. ``ddof=0``) variance of - ``x``, and ``mu`` is the weighted mean of ``x``. - * weighted: True if weights have been set. - * ddof: The delta degrees of freedom. - - """ - if computing_meta: - return x - - weighted = weights is not None - - # N, V1, sum - d = cf_mean_chunk(x, weights, dtype=dtype, **kwargs) - - wsum = d["sum"] - V1 = d["V1"] - - avg = divide(wsum, V1, dtype=dtype) - part = x - avg - part *= part - if weighted: - part = part * weights - - part = chunk.sum(part, dtype=dtype, **kwargs) - part = part + avg * wsum - - d["part"] = part - - if weighted and ddof == 1: - d["V2"] = sum_weights_chunk(x, weights, square=True, **kwargs) - else: - d["V2"] = None - - d["weighted"] = weighted - d["ddof"] = ddof - - return d - - -def cf_var_combine( - pairs, axis=None, dtype="f8", computing_meta=False, **kwargs -): - """Combination calculations for the variance. - - This function is passed to `dask.array.reduction` as its *combine* - parameter. - - .. versionadded:: TODODASKVER - - :Parameters: - - See `dask.array.reductions` for details of the parameters. - - :Returns: - - As for `cf_var_chunk`. - - """ - if not isinstance(pairs, list): - pairs = [pairs] - - d = next(flatten(pairs)) - weighted = d["weighted"] - ddof = d["ddof"] - d = {"weighted": weighted, "ddof": ddof} - - d["part"] = sum_arrays( - pairs, "part", axis, dtype, computing_meta, **kwargs - ) - if computing_meta: - return d["part"] - - d["sum"] = sum_arrays(pairs, "sum", axis, dtype, **kwargs) - - d["N"] = sum_sample_sizes(pairs, axis, **kwargs) - d["V1"] = d["N"] - d["V2"] = None - if weighted: - d["V1"] = sum_arrays(pairs, "V1", axis, dtype, **kwargs) - if ddof == 1: - d["V2"] = sum_arrays(pairs, "V2", axis, dtype, **kwargs) - - return d - - -def cf_var_agg( - pairs, - axis=None, - dtype="f8", - computing_meta=False, - mtol=None, - original_shape=None, - **kwargs, -): - """Aggregation calculations for the variance. - - This function is passed to `dask.array.reduction` as its - *aggregate* parameter. - - .. note:: Weights are interpreted as reliability weights, as - opposed to frequency weights. - - See - https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Reliability_weights - for details. 
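The per-chunk ``part`` term defined above is just the weighted sum of squares, computed in a numerically friendlier form, and the aggregation recovers ``V1 * sigma**2`` as ``part - sum**2 / V1``. A numpy-only sketch (an illustration of the identity, not the library code) checks that this matches a direct weighted ``ddof=0`` variance:

import numpy as np

x = np.array([1.0, 2.0, 4.0, 8.0])
w = np.array([1.0, 2.0, 1.0, 1.0])

V1 = w.sum()             # sum of weights
wsum = (w * x).sum()     # weighted sum
part = (w * x**2).sum()  # V1 * (sigma**2 + mu**2)

var_biased = (part - wsum**2 / V1) / V1  # weighted variance with ddof=0

mu = wsum / V1
print(np.isclose(var_biased, (w * (x - mu) ** 2).sum() / V1))  # True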
- - .. versionadded:: TODODASKVER - - :Parameters: - - mtol: number, optional - The sample size threshold below which collapsed values are - set to missing data. See `mask_small_sample_size` for - details. - - original_shape: `tuple` - The shape of the original, uncollapsed data. - - See `dask.array.reductions` for details of the other - parameters. - - :Returns: - - `dask.array.Array` - The collapsed array. - - """ - d = cf_var_combine(pairs, axis, dtype, computing_meta, **kwargs) - if computing_meta: - return d - - ddof = d["ddof"] - V1 = d["V1"] - wsum = d["sum"] - var = d["part"] - wsum * wsum / V1 - - # Note: var is now the global value of V1 * sigma**2, where sigma - # is the global weighted biased (i.e. ddof=0) variance. - - if ddof is None: - raise ValueError(f"Must set ddof to a numeric value. Got: {ddof!r}") - - if not ddof: - # Weighted or unweighted variance with ddof=0 - f = 1 / V1 - elif not d["weighted"]: - # Unweighted variance with any non-zero value of ddof - f = 1 / (V1 - ddof) - elif ddof == 1: - # Weighted variance with ddof=1 - f = V1 / (V1 * V1 - d["V2"]) - else: - raise ValueError( - "Can only calculate a weighted variance with ddof=0 or ddof=1. " - f"Got: {ddof!r}" - ) - - # Now get the required global variance with the requested ddof - var = f * var - - var = mask_small_sample_size(var, d["N"], axis, mtol, original_shape) - return var diff --git a/cf/data/collapse_utils.py b/cf/data/collapse_utils.py new file mode 100644 index 0000000000..eac6a7005d --- /dev/null +++ b/cf/data/collapse_utils.py @@ -0,0 +1,232 @@ +from numbers import Integral + +import dask.array as da +from dask.array.utils import validate_axis +from dask.base import collections_to_dsk + + +def double_precision_dtype(a, default=None, bool_type="i"): + """Returns the corresponding double precision data type of an array. + + .. versionadded:: TODODASKVER + + :Parameters: + + a: `dask.array.Array` or `None` + The data. If `None` then the value of *default* is + returned*. + + default: `str`, optional + If *a* is `None`, then return this data type. + + bool_type: `str`, optional + The corresponding double data type kind for Boolean + data. Defaults to ``'i'``, meaning ``'i8'`` is + returned. Set to ``'f'` to return ``'f8'`` instead. + + :Returns: + + `str` + The double precision type. + + **Examples** + + >>> for d in (int, 'int32', float, 'float32', bool): + ... print(double_precision_dtype(np.array(1, dtype=d))) + ... + i8 + i8 + f8 + f8 + i8 + + >>> double_precision_dtype(np.array(1, dtype=bool), bool_type='f') + 'f8' + >>> double_precision_dtype(None, default="i8") + 'i8' + + """ + if a is None: + return default + + kind = a.dtype.kind + if kind == "b": + return bool_type + "8" + + if kind in "fi": + return kind + "8" + + raise TypeError(f"Can't collapse data with {a.dtype!r}") + + +def check_input_dtype(a, allowed="fib"): + """Check that data has a data type allowed by a collapse method. + + The collapse method is assumed to be defined by the calling + function. + + .. versionadded:: TODODASKVER + + :Parameters: + + a: `dask.array.Array` + The data. + + allowed: `str`, optional + The data type kinds allowed by the collapse + method. Defaults to ``'fib'``, meaning that only float, + integer and Boolean data types are allowed. 
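A hedged usage sketch of `check_input_dtype` as documented here (the helper name and data are illustrative). Because the error message is built from the caller's frame, the reported method name is whatever function invoked the check:

import dask.array as da
from cf.data.collapse_utils import check_input_dtype

def my_collapse(a):
    # Only float and integer data allowed for this (made-up) collapse
    check_input_dtype(a, allowed="fi")

my_collapse(da.ones((3,), dtype="f8"))  # passes silently
try:
    my_collapse(da.ones((3,), dtype=complex))
except TypeError as error:
    print(error)  # "Can't calculate my_collapse of data with ..."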
+ + :Returns: + + `None` + + """ + if a.dtype.kind not in allowed: + from inspect import currentframe + + method = currentframe().f_back.f_code.co_name + raise TypeError(f"Can't calculate {method} of data with {a.dtype!r}") + + +def actify(a, method, axis=None, chunk_function=None, active_storage=False): + """TODOACTIVEDOCS. + + .. versionadded:: TODOACTIVEVER + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + method: `str` + TODOACTIVEDOCS + + axis: (sequence of) `int`, optional + TODOACTIVEDOCS + + chunk_function: function + TODOACTIVEDOCS + + {{active_storage: `bool`, optional}} + + :Returns: + + `dask.array.Array`, function + TODOACTIVEDOCS + + """ + if not active_storage: + # It has been determined externally that an active storage + # reduction is not possible, so return the input data and + # chunk function unchanged. + return a, chunk_function + + # Still here? Then it is assumed that the dask array is of a form + # which might be able to exploit active storage. In particular, it + # is assumed that all data definitions point to files. + + # Parse axis + if axis is None: + axis = tuple(range(a.ndim)) + else: + if isinstance(axis, Integral): + axis = (axis,) + + if len(axis) != a.ndim: + # Can't (yet) use active storage to collapse a subset of + # the axes, so return the input data and chunk function + # unchanged. + return a, chunk_function + + axis = validate_axis(axis, a.ndim) + + active_chunk_functions = set() + + # Loop round elements of the dask graph, looking for data + # definitions that point to a file and which support active + # storage operations. The elements are traversed in reverse order + # so that the data defintions come out first, allowing for a fast + # short circuit in the common case when using active storage is no + # feasible. + dsk = collections_to_dsk((a,), optimize_graph=True) + for key, value in reversed(dsk.items()): + try: + value.get_filename() + except AttributeError: + # This value is not a data definition (it is assumed that + # all data definitions point to files). + continue + + try: + # Create a new actified data definition value + value = value.actify(method, axis) + except (AttributeError, ValueError): + # This data definition value does not support active + # storage reductions, or does not support the requested + # active storage reduction defined by 'method'. + active_chunk_functions = () + break + + try: + # Get the active storage chunk function + active_chunk_functions.add(value.get_active_chunk_function()) + except AttributeError: + # This data definition value does not support active + # storage reductions + active_chunk_functions = () + break + + # Still here? Then update the dask graph in-place with the + # actified data definition value. + dsk[key] = value + + if len(active_chunk_functions) == 1: + # All data definitions in the dask graph support active + # storage reductions with the same chunk function => redefine + # the array from the actified dask graph, and redefine the + # reduction chunk function. + a = da.Array(dsk, a.name, a.chunks, a.dtype, a._meta) + chunk_function = active_chunk_functions.pop() + + # Return the data and chunk function. These will either be + # identical to the inputs or, if it has been determinded that + # active storage operation is possible, then these the data and + # chunk function will have been replaced with actified versions. + return a, chunk_function + + +# def actify_collapse(method): +# """A wrapper to provide positional arguments to the decorator. 
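The core move in `actify` is a graph round-trip: pull the optimised dask graph out of the collection, swap selected values in place, and rebuild an equivalent `dask.array.Array` from the edited graph. A minimal sketch of that round-trip follows; the `get_filename` probe is only meaningful for cf's file-backed data definitions, so on this toy array nothing is actually replaced.

import dask.array as da
from dask.base import collections_to_dsk

a = da.ones((4,), chunks=2)
dsk = dict(collections_to_dsk((a,), optimize_graph=True))

# Walk the graph values in reverse, as actify does; a real data
# definition would expose get_filename() and be replaced here.
for key, value in reversed(list(dsk.items())):
    if hasattr(value, "get_filename"):
        pass  # dsk[key] = actified value

# Rebuild the array from the (possibly edited) graph
b = da.Array(dsk, a.name, a.chunks, a.dtype, a._meta)
print(bool((b == a).all().compute()))  # True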
+# +# A decorator for `Collapse` methods that enables active storage +# operations, when the conditions are right. +# +# """ +# def decorator(collapse_method): +# @wraps(collapse_method) +# def wrapper(cls, *args, **kwargs): +# if kwargs.get("weights") is None: +# # The collapse is unweighted over defined axes => +# # attempt to actify the dask array and provide a new +# # chunk function. +# a, chunk_function = actify( +# args[0], +# op=method, +# axis=kwargs.get("axis"), +# active_storage=kwargs.get("active_storage", False), +# ) +# args = list(args) +# args[0] = a +# +# if chunk_function is not None: +# kwargs["chunk_function"] = chunk_function +# +# #Create the collapse +# return collapse_method(cls, *args, **kwargs) +# +# return wrapper +# +# +# +# return decorator diff --git a/cf/data/dask_collapse.py b/cf/data/dask_collapse.py new file mode 100644 index 0000000000..3930dc711d --- /dev/null +++ b/cf/data/dask_collapse.py @@ -0,0 +1,1294 @@ +"""Reduction functions intended to be passed to be dask. + +Most of these functions are expected to be set as *chunk*, *combine* and +*aggregate* parameters of `dask.array.reduction` + +""" +from functools import reduce +from operator import mul + +import numpy as np +from dask.array import chunk +from dask.array.core import _concatenate2 +from dask.array.reductions import divide, numel +from dask.core import flatten +from dask.utils import deepmap + +from .collapse_utils import double_precision_dtype + + +def mask_small_sample_size(x, N, axis, mtol, original_shape): + """Mask elements where the sample size is below a threshold. + + .. versionadded:: TODODASKVER + + :Parameters: + + x: `numpy.ndarray` + The collapsed data. + + N: `numpy.ndarray` + The sample sizes of the collapsed values. + + axis: sequence of `int` + The axes being collapsed. + + mtol: number + The sample size threshold below which collapsed values are + set to missing data. It is defined as a fraction (between + 0 and 1 inclusive) of the contributing input data values. + + The default of *mtol* is 1, meaning that a missing datum + in the output array occurs whenever all of its + contributing input array elements are missing data. + + For other values, a missing datum in the output array + occurs whenever more than ``100*mtol%`` of its + contributing input array elements are missing data. + + Note that for non-zero values of *mtol*, different + collapsed elements may have different sample sizes, + depending on the distribution of missing data in the input + data. + + original_shape: `tuple` + The shape of the original, uncollapsed data. + + :Returns: + + `numpy.ndarray` + Array *x* masked where *N* is sufficiently small. Note + that the input *x* might be modified in-place with the + contents of the output. + + """ + if not x.ndim: + # Make sure that we have a numpy array (e.g. as opposed to a + # numpy.float64) + x = np.asanyarray(x) + + if mtol < 1: + # Nmax = total number of elements, including missing values + Nmax = reduce(mul, [original_shape[i] for i in axis], 1) + x = np.ma.masked_where(N < (1 - mtol) * Nmax, x, copy=False) + + return x + + +def sum_weights_chunk(x, weights=None, square=False, N=None, **kwargs): + """Sum the weights. + + .. versionadded:: TODODASKVER + + :Parameters: + + x: `numpy.ndarray` + The data. + + weights: `numpy.ndarray`, optional + The weights associated with values of the data. Must have + the same shape as *x*. 
By default *weights* is `None`, + meaning that all non-missing elements of the data have a + weight of 1 and all missing elements have a weight of + 0. If given as an array then those weights that are + missing data, or that correspond to the missing elements + of the data, are assigned a weight of 0. + + square: `bool`, optional + If True calculate the sum of the squares of the weights. + + N: `numpy.ndarray`, optional + The sample size. If provided as an array and there are no + weights, then the *N* is returned instead of calculating + the sum (of the squares) of weights. Ignored of *weights* + is not `None`. + + :Returns: + + `numpy.ndarray` + The sum of the weights, with data type "i8" or "f8". + + """ + if weights is None: + # All weights are 1, so the sum of the weights and the sum of + # the squares of the weights are both equal to the sample + # size. + if N is None: + N = cf_sample_size_chunk(x, **kwargs)["N"] + + return N + + dtype = double_precision_dtype(weights) + if square: + weights = np.multiply(weights, weights, dtype=dtype) + + if np.ma.is_masked(x): + weights = np.ma.masked_where(x.mask, weights) + + return chunk.sum(weights, dtype=dtype, **kwargs) + + +def combine_arrays( + pairs, key, func, axis, dtype=None, computing_meta=False, **kwargs +): + """Worker function for Combine callables. + + Select arrays by dictionary key from a nested list of + dictionaries, concatenate the resulting nested list of arrays + along the specified axes, and then apply a function to the result + along those same axes. + + See `dask.array.reductions.mean_combine` for an example. + + .. versionadded:: TODODASKVER + + :Returns: + + `numpy.ndarray` + + """ + x = deepmap(lambda pair: pair[key], pairs) if not computing_meta else pairs + + if dtype: + kwargs["dtype"] = dtype + + x = _concatenate2(x, axes=axis) + return func(x, axis=axis, **kwargs) + + +def sum_arrays(pairs, key, axis, dtype, computing_meta=False, **kwargs): + """Alias of `combine_arrays` with ``func=chunk.sum``. + + .. versionadded:: TODODASKVER + + """ + return combine_arrays( + pairs, key, chunk.sum, axis, dtype, computing_meta, **kwargs + ) + + +def max_arrays(pairs, key, axis, dtype, computing_meta=False, **kwargs): + """Alias of `combine_arrays` with ``func=chunk.max``. + + .. versionadded:: TODODASKVER + + """ + return combine_arrays( + pairs, key, chunk.max, axis, dtype, computing_meta, **kwargs + ) + + +def min_arrays(pairs, key, axis, dtype, computing_meta=False, **kwargs): + """Alias of `combine_arrays` with ``func=chunk.min``. + + .. versionadded:: TODODASKVER + + """ + return combine_arrays( + pairs, key, chunk.min, axis, dtype, computing_meta, **kwargs + ) + + +def sum_sample_sizes(pairs, axis, computing_meta=False, **kwargs): + """Alias of `combine_arrays` with ``key="N", func=chunk.sum, + dtype="i8"``. + + .. versionadded:: TODODASKVER + + """ + return combine_arrays( + pairs, + "N", + chunk.sum, + axis, + dtype="i8", + computing_meta=computing_meta, + **kwargs, + ) + + +# -------------------------------------------------------------------- +# mean +# -------------------------------------------------------------------- +def cf_mean_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): + """Chunk calculations for the mean. + + This function is passed to `dask.array.reduction` as its *chunk* + parameter. + + .. versionadded:: TODODASKVER + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. 
+ * V1: The sum of ``weights`` (equal to ``N`` if weights + are not set). + * sum: The weighted sum of ``x``. + * weighted: True if weights have been set. + + """ + if computing_meta: + return x + + # N, sum + d = cf_sum_chunk(x, weights, dtype=dtype, **kwargs) + + d["V1"] = sum_weights_chunk(x, weights, N=d["N"], **kwargs) + d["weighted"] = weights is not None + + return d + + +def cf_mean_combine( + pairs, axis=None, dtype="f8", computing_meta=False, **kwargs +): + """Combination calculations for the mean. + + This function is passed to `dask.array.reduction` as its *combine* + parameter. + + .. versionadded:: TODODASKVER + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + As for `cf_mean_chunk`. + + """ + if not isinstance(pairs, list): + pairs = [pairs] + + weighted = next(flatten(pairs))["weighted"] + d = {"weighted": weighted} + + d["sum"] = sum_arrays(pairs, "sum", axis, dtype, computing_meta, **kwargs) + if computing_meta: + return d["sum"] + + d["N"] = sum_sample_sizes(pairs, axis, **kwargs) + if weighted: + d["V1"] = sum_arrays(pairs, "V1", axis, dtype, **kwargs) + else: + d["V1"] = d["N"] + + return d + + +def cf_mean_agg( + pairs, + axis=None, + dtype="f8", + computing_meta=False, + mtol=1, + original_shape=None, + **kwargs, +): + """Aggregation calculations for the mean. + + This function is passed to `dask.array.reduction` as its + *aggregate* parameter. + + .. versionadded:: TODODASKVER + + :Parameters: + + mtol: number, optional + The sample size threshold below which collapsed values are + set to missing data. See `mask_small_sample_size` for + details. + + original_shape: `tuple` + The shape of the original, uncollapsed data. + + See `dask.array.reductions` for details of the other + parameters. + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + d = cf_mean_combine(pairs, axis, dtype, computing_meta, **kwargs) + if computing_meta: + return d + + x = divide(d["sum"], d["V1"], dtype=dtype) + x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) + return x + + +# -------------------------------------------------------------------- +# maximum +# -------------------------------------------------------------------- +def cf_max_chunk(x, dtype=None, computing_meta=False, **kwargs): + """Chunk calculations for the maximum. + + This function is passed to `dask.array.reduction` as its *chunk* + parameter. + + .. versionadded:: TODODASKVER + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + * max: The maximum of `x``. + + """ + if computing_meta: + return x + + return { + "max": chunk.max(x, **kwargs), + "N": cf_sample_size_chunk(x, **kwargs)["N"], + } + + +def cf_max_combine(pairs, axis=None, computing_meta=False, **kwargs): + """Combination calculations for the maximum. + + This function is passed to `dask.array.reduction` as its *combine* + parameter. + + .. versionadded:: TODODASKVER + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + As for `cf_max_chunk`. + + """ + if not isinstance(pairs, list): + pairs = [pairs] + + mx = max_arrays(pairs, "max", axis, None, computing_meta, **kwargs) + if computing_meta: + return mx + + return {"max": mx, "N": sum_sample_sizes(pairs, axis, **kwargs)} + + +def cf_max_agg( + pairs, + axis=None, + computing_meta=False, + mtol=1, + original_shape=None, + **kwargs, +): + """Aggregation calculations for the maximum. 
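Picking up the mean helpers defined above: each chunk returns the weighted sum, the sum of weights and the sample size, and the aggregate step divides ``sum`` by ``V1``. A hedged sketch of calling the chunk function by hand, with the same keyword arguments dask would pass (it assumes the new ``cf.data.dask_collapse`` module from this patch is importable):

import numpy as np
from cf.data.dask_collapse import cf_mean_chunk

x = np.arange(6.0).reshape(2, 3)
w = np.array([[1.0, 2.0, 1.0], [1.0, 1.0, 2.0]])

# dask calls the chunk function with axis and keepdims for each block
d = cf_mean_chunk(x, weights=w, axis=(0, 1), keepdims=True)
print(d["sum"] / d["V1"])  # weighted mean of this block: [[2.625]]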
+ + This function is passed to `dask.array.reduction` as its + *aggregate* parameter. + + .. versionadded:: TODODASKVER + + :Parameters: + + mtol: number, optional + The sample size threshold below which collapsed values are + set to missing data. See `mask_small_sample_size` for + details. + + original_shape: `tuple` + The shape of the original, uncollapsed data. + + See `dask.array.reductions` for details of the other + parameters. + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + d = cf_max_combine(pairs, axis, computing_meta, **kwargs) + if computing_meta: + return d + + x = d["max"] + x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) + return x + + +# -------------------------------------------------------------------- +# mid-range +# -------------------------------------------------------------------- +def cf_mid_range_agg( + pairs, + axis=None, + dtype="f8", + computing_meta=False, + mtol=1, + original_shape=None, + **kwargs, +): + """Aggregation calculations for the mid-range. + + This function is passed to `dask.array.reduction` as its + *aggregate* parameter. + + .. versionadded:: TODODASKVER + + :Parameters: + + mtol: number, optional + The sample size threshold below which collapsed values are + set to missing data. See `mask_small_sample_size` for + details. + + original_shape: `tuple` + The shape of the original, uncollapsed data. + + See `dask.array.reductions` for details of the other + parameters. + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + d = cf_range_combine(pairs, axis, dtype, computing_meta, **kwargs) + if computing_meta: + return d + + # Calculate the mid-range + x = divide(d["max"] + d["min"], 2.0, dtype=dtype) + x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) + return x + + +# -------------------------------------------------------------------- +# minimum +# -------------------------------------------------------------------- +def cf_min_chunk(x, dtype=None, computing_meta=False, **kwargs): + """Chunk calculations for the minimum. + + This function is passed to `dask.array.reduction` as its *chunk* + parameter. + + .. versionadded:: TODODASKVER + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + * min: The minimum of ``x``. + + """ + if computing_meta: + return x + + return { + "min": chunk.min(x, **kwargs), + "N": cf_sample_size_chunk(x, **kwargs)["N"], + } + + +def cf_min_combine(pairs, axis=None, computing_meta=False, **kwargs): + """Combination calculations for the minimum. + + This function is passed to `dask.array.reduction` as its *combine* + parameter. + + .. versionadded:: TODODASKVER + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + As for `cf_min_chunk`. + + """ + if not isinstance(pairs, list): + pairs = [pairs] + + mn = min_arrays(pairs, "min", axis, None, computing_meta, **kwargs) + if computing_meta: + return mn + + return {"min": mn, "N": sum_sample_sizes(pairs, axis, **kwargs)} + + +def cf_min_agg( + pairs, + axis=None, + computing_meta=False, + mtol=1, + original_shape=None, + **kwargs, +): + """Aggregation calculations for the minimum. + + This function is passed to `dask.array.reduction` as its + *aggregate* parameter. + + .. versionadded:: TODODASKVER + + :Parameters: + + mtol: number, optional + The sample size threshold below which collapsed values are + set to missing data. 
See `mask_small_sample_size` for + details. + + original_shape: `tuple` + The shape of the original, uncollapsed data. + + See `dask.array.reductions` for details of the other + parameters. + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + d = cf_min_combine(pairs, axis, computing_meta, **kwargs) + if computing_meta: + return d + + x = d["min"] + x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) + return x + + +# -------------------------------------------------------------------- +# range +# -------------------------------------------------------------------- +def cf_range_chunk(x, dtype=None, computing_meta=False, **kwargs): + """Chunk calculations for the range. + + This function is passed to `dask.array.reduction` as its *chunk* + parameter. + + .. versionadded:: TODODASKVER + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + * min: The minimum of ``x``. + * max: The maximum of ``x`. + + """ + if computing_meta: + return x + + # N, max + d = cf_max_chunk(x, **kwargs) + + d["min"] = chunk.min(x, **kwargs) + return d + + +def cf_range_combine( + pairs, axis=None, dtype=None, computing_meta=False, **kwargs +): + """Combination calculations for the range. + + This function is passed to `dask.array.reduction` as its *combine* + parameter. + + .. versionadded:: TODODASKVER + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + As for `cf_range_chunk`. + + """ + if not isinstance(pairs, list): + pairs = [pairs] + + mx = max_arrays(pairs, "max", axis, None, computing_meta, **kwargs) + if computing_meta: + return mx + + mn = min_arrays(pairs, "min", axis, None, **kwargs) + + return {"max": mx, "min": mn, "N": sum_sample_sizes(pairs, axis, **kwargs)} + + +def cf_range_agg( + pairs, + axis=None, + computing_meta=False, + mtol=1, + original_shape=None, + **kwargs, +): + """Aggregation calculations for the range. + + This function is passed to `dask.array.reduction` as its + *aggregate* parameter. + + .. versionadded:: TODODASKVER + + :Parameters: + + mtol: number, optional + The sample size threshold below which collapsed values are + set to missing data. See `mask_small_sample_size` for + details. + + original_shape: `tuple` + The shape of the original, uncollapsed data. + + See `dask.array.reductions` for details of the other + parameters. + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + d = cf_range_combine(pairs, axis, computing_meta, **kwargs) + if computing_meta: + return d + + # Calculate the range + x = d["max"] - d["min"] + x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) + return x + + +# -------------------------------------------------------------------- +# root mean square +# -------------------------------------------------------------------- +def cf_rms_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): + """Chunk calculations for the root mean square (RMS). + + This function is passed to `dask.array.reduction` as its *chunk* + parameter. + + .. versionadded:: TODODASKVER + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + * sum: The weighted sum of ``x**2``. 
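As the keys above indicate, the RMS simply reuses the mean machinery on ``x**2`` and takes a square root at the aggregation stage. A numpy-only sketch of the arithmetic (an illustration, not the library code):

import numpy as np

x = np.array([1.0, 2.0, 2.0, 3.0])
w = np.array([1.0, 1.0, 2.0, 1.0])

# Weighted mean of the squares, then the square root
rms = np.sqrt((w * x**2).sum() / w.sum())
print(round(rms, 3))  # 2.098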
+ + """ + if computing_meta: + return x + + return cf_mean_chunk( + np.multiply(x, x, dtype=dtype), weights=weights, dtype=dtype, **kwargs + ) + + +def cf_rms_agg( + pairs, + axis=None, + dtype="f8", + computing_meta=False, + mtol=1, + original_shape=None, + **kwargs, +): + """Aggregation calculations for the root mean square (RMS). + + This function is passed to `dask.array.reduction` as its + *aggregate* parameter. + + .. versionadded:: TODODASKVER + + :Parameters: + + mtol: number, optional + The sample size threshold below which collapsed values are + set to missing data. See `mask_small_sample_size` for + details. + + original_shape: `tuple` + The shape of the original, uncollapsed data. + + See `dask.array.reductions` for details of the other + parameters. + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + d = cf_mean_combine(pairs, axis, dtype, computing_meta, **kwargs) + if computing_meta: + return d + + x = np.sqrt(d["sum"] / d["V1"], dtype=dtype) + x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) + return x + + +# -------------------------------------------------------------------- +# sample size +# -------------------------------------------------------------------- +def cf_sample_size_chunk(x, dtype="i8", computing_meta=False, **kwargs): + """Chunk calculations for the sample size. + + This function is passed to `dask.array.reduction` as its *chunk* + parameter. + + .. versionadded:: TODODASKVER + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + + """ + if computing_meta: + return x + + if np.ma.isMA(x): + N = chunk.sum(np.ones_like(x, dtype=dtype), **kwargs) + else: + if dtype: + kwargs["dtype"] = dtype + + N = numel(x, **kwargs) + + return {"N": N} + + +def cf_sample_size_combine( + pairs, axis=None, dtype="i8", computing_meta=False, **kwargs +): + """Combination calculations for the sample size. + + This function is passed to `dask.array.reduction` as its *combine* + parameter. + + .. versionadded:: TODODASKVER + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + As for `cf_sample_size_chunk`. + + """ + if not isinstance(pairs, list): + pairs = [pairs] + + x = sum_arrays(pairs, "N", axis, dtype, computing_meta, **kwargs) + if computing_meta: + return x + + return {"N": x} + + +def cf_sample_size_agg( + pairs, + axis=None, + computing_meta=False, + dtype="i8", + mtol=1, + original_shape=None, + **kwargs, +): + """Aggregation calculations for the sample size. + + This function is passed to `dask.array.reduction` as its + *aggregate* parameter. + + .. versionadded:: TODODASKVER + + :Parameters: + + mtol: number, optional + The sample size threshold below which collapsed values are + set to missing data. See `mask_small_sample_size` for + details. + + original_shape: `tuple` + The shape of the original, uncollapsed data. + + See `dask.array.reductions` for details of the other + parameters. + + :Returns: + + `dask.array.Array` + The collapsed array. 
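The sample-size reductions count only the non-missing elements. For masked input this can be checked directly with NumPy; the chunk function itself builds the count from ``chunk.sum`` over an array of ones, but the result is the same (stand-alone illustration, not part of the patch):

```python
import numpy as np

x = np.ma.masked_invalid([[1.0, np.nan, 3.0],
                          [4.0, 5.0, np.nan]])

# Count of non-missing elements along the last axis; this is the "N"
# that the sample-size chunk function carries through the reduction
N = np.ma.count(x, axis=1)
print(N)  # [2 2]
```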
+ + """ + d = cf_sample_size_combine(pairs, axis, dtype, computing_meta, **kwargs) + if computing_meta: + return d + + x = d["N"] + x = mask_small_sample_size(x, x, axis, mtol, original_shape) + return x + + +# -------------------------------------------------------------------- +# sum +# -------------------------------------------------------------------- +def cf_sum_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): + """Chunk calculations for the sum. + + This function is passed to `dask.array.reduction` as its *chunk* + parameter. + + .. versionadded:: TODODASKVER + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + * sum: The weighted sum of ``x`` + + """ + if computing_meta: + return x + + if weights is not None: + x = np.multiply(x, weights, dtype=dtype) + + d = cf_sample_size_chunk(x, **kwargs) + d["sum"] = chunk.sum(x, dtype=dtype, **kwargs) + return d + + +def cf_sum_combine( + pairs, axis=None, dtype="f8", computing_meta=False, **kwargs +): + """Combination calculations for the sum. + + This function is passed to `dask.array.reduction` as its *combine* + parameter. + + .. versionadded:: TODODASKVER + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + As for `cf_sum_chunk`. + + """ + if not isinstance(pairs, list): + pairs = [pairs] + + x = sum_arrays(pairs, "sum", axis, dtype, computing_meta, **kwargs) + if computing_meta: + return x + + return {"sum": x, "N": sum_sample_sizes(pairs, axis, **kwargs)} + + +def cf_sum_agg( + pairs, + axis=None, + dtype="f8", + computing_meta=False, + mtol=1, + original_shape=None, + **kwargs, +): + """Aggregation calculations for the sum. + + This function is passed to `dask.array.reduction` as its + *aggregate* parameter. + + .. versionadded:: TODODASKVER + + :Parameters: + + mtol: number, optional + The sample size threshold below which collapsed values are + set to missing data. See `mask_small_sample_size` for + details. + + original_shape: `tuple` + The shape of the original, uncollapsed data. + + See `dask.array.reductions` for details of the other + parameters. + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + d = cf_sum_combine(pairs, axis, dtype, computing_meta, **kwargs) + if computing_meta: + return d + + x = d["sum"] + x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) + return x + + +# -------------------------------------------------------------------- +# sum of weights +# -------------------------------------------------------------------- +def cf_sum_of_weights_chunk( + x, weights=None, dtype="f8", computing_meta=False, square=False, **kwargs +): + """Chunk calculations for the sum of the weights. + + This function is passed to `dask.array.reduction` as its *chunk* + parameter. + + :Parameters: + + square: `bool`, optional + If True then calculate the sum of the squares of the + weights. + + See `dask.array.reductions` for details of the other + parameters. + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + * sum: The sum of ``weights``, or the sum of + ``weights**2`` if *square* is True. 
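The sum chunk function applies the weights before reducing and carries the sample size alongside, so that later stages can both finish the reduction and apply the ``mtol`` masking. A toy version of a single chunk's output, without dask (illustrative only):

```python
import numpy as np

x = np.array([1.0, 2.0, 3.0])
w = np.array([0.5, 1.0, 1.5])

# As in the sum chunk function: weight first, then reduce, keeping "N"
# so that small-sample masking can be applied at the aggregation stage
d = {"N": x.size, "sum": (x * w).sum()}
print(d)  # {'N': 3, 'sum': 7.0}
```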
+ + """ + if computing_meta: + return x + + # N + d = cf_sample_size_chunk(x, **kwargs) + + d["sum"] = sum_weights_chunk( + x, weights=weights, square=square, N=d["N"], **kwargs + ) + + return d + + +# -------------------------------------------------------------------- +# unique +# -------------------------------------------------------------------- +def cf_unique_chunk(x, dtype=None, computing_meta=False, **kwargs): + """Chunk calculations for the unique values. + + This function is passed to `dask.array.reduction` as its *chunk* + parameter. + + .. versionadded:: TODODASKVER + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + `dict` + Dictionary with the keys: + + * unique: The unique values. + + """ + if computing_meta: + return x + + return {"unique": np.unique(x)} + + +def cf_unique_agg(pairs, axis=None, computing_meta=False, **kwargs): + """Aggregation calculations for the unique values. + + This function is passed to `dask.array.reduction` as its + *aggregate* parameter. + + It is assumed that the arrays are one-dimensional. + + .. versionadded:: TODODASKVER + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + `dask.array.Array` + The unique values. + + """ + x = ( + deepmap(lambda pair: pair["unique"], pairs) + if not computing_meta + else pairs + ) + if computing_meta: + return x + + x = _concatenate2(x, axes=[0]) + return np.unique(x) + + +# -------------------------------------------------------------------- +# variance +# -------------------------------------------------------------------- +def cf_var_chunk( + x, weights=None, dtype="f8", computing_meta=False, ddof=None, **kwargs +): + """Chunk calculations for the variance. + + This function is passed to `dask.array.reduction` as its *chunk* + parameter. + + See + https://en.wikipedia.org/wiki/Pooled_variance#Sample-based_statistics + for details. + + .. versionadded:: TODODASKVER + + :Parameters: + + ddof: number + The delta degrees of freedom. The number of degrees of + freedom used in the calculation is (N-*ddof*) where N + represents the number of non-missing elements. A value of + 1 applies Bessel's correction. + + See `dask.array.reductions` for details of the other + parameters. + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + * V1: The sum of ``weights`` (equal to ``N`` if weights + are not set). + * V2: The sum of ``weights**2``, or `None` of not + required. + * sum: The weighted sum of ``x``. + * part: ``V1 * (sigma**2 + mu**2)``, where ``sigma**2`` is + the weighted biased (i.e. ``ddof=0``) variance of + ``x``, and ``mu`` is the weighted mean of ``x``. + * weighted: True if weights have been set. + * ddof: The delta degrees of freedom. + + """ + if computing_meta: + return x + + weighted = weights is not None + + # N, V1, sum + d = cf_mean_chunk(x, weights, dtype=dtype, **kwargs) + + wsum = d["sum"] + V1 = d["V1"] + + avg = divide(wsum, V1, dtype=dtype) + part = x - avg + part *= part + if weighted: + part = part * weights + + part = chunk.sum(part, dtype=dtype, **kwargs) + part = part + avg * wsum + + d["part"] = part + + if weighted and ddof == 1: + d["V2"] = sum_weights_chunk(x, weights, square=True, **kwargs) + else: + d["V2"] = None + + d["weighted"] = weighted + d["ddof"] = ddof + + return d + + +def cf_var_combine( + pairs, axis=None, dtype="f8", computing_meta=False, **kwargs +): + """Combination calculations for the variance. 
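The variance chunk function accumulates ``part = sum(w*(x - avg)**2) + avg*wsum`` per chunk, and the aggregation recovers ``V1 * sigma**2`` as ``part - wsum**2/V1``. A NumPy check of that identity for a single unweighted chunk (illustrative, not part of the patch):

```python
import numpy as np

x = np.array([2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0])
N = x.size
wsum = x.sum()
avg = wsum / N

# "part" as accumulated by the variance chunk function (unweighted, so
# V1 == N and the weights drop out)
part = ((x - avg) ** 2).sum() + avg * wsum

# The aggregation step: part - wsum**2/V1 equals V1 * sigma**2, where
# sigma**2 is the biased (ddof=0) variance
assert np.isclose(part - wsum * wsum / N, N * x.var())
print(x.var())  # 4.0
```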
+ + This function is passed to `dask.array.reduction` as its *combine* + parameter. + + .. versionadded:: TODODASKVER + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + As for `cf_var_chunk`. + + """ + if not isinstance(pairs, list): + pairs = [pairs] + + d = next(flatten(pairs)) + weighted = d["weighted"] + ddof = d["ddof"] + d = {"weighted": weighted, "ddof": ddof} + + d["part"] = sum_arrays( + pairs, "part", axis, dtype, computing_meta, **kwargs + ) + if computing_meta: + return d["part"] + + d["sum"] = sum_arrays(pairs, "sum", axis, dtype, **kwargs) + + d["N"] = sum_sample_sizes(pairs, axis, **kwargs) + d["V1"] = d["N"] + d["V2"] = None + if weighted: + d["V1"] = sum_arrays(pairs, "V1", axis, dtype, **kwargs) + if ddof == 1: + d["V2"] = sum_arrays(pairs, "V2", axis, dtype, **kwargs) + + return d + + +def cf_var_agg( + pairs, + axis=None, + dtype="f8", + computing_meta=False, + mtol=1, + original_shape=None, + **kwargs, +): + """Aggregation calculations for the variance. + + This function is passed to `dask.array.reduction` as its + *aggregate* parameter. + + .. note:: Weights are interpreted as reliability weights, as + opposed to frequency weights. + + See + https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Reliability_weights + for details. + + .. versionadded:: TODODASKVER + + :Parameters: + + mtol: number, optional + The sample size threshold below which collapsed values are + set to missing data. See `mask_small_sample_size` for + details. + + original_shape: `tuple` + The shape of the original, uncollapsed data. + + See `dask.array.reductions` for details of the other + parameters. + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + d = cf_var_combine(pairs, axis, dtype, computing_meta, **kwargs) + if computing_meta: + return d + + ddof = d["ddof"] + V1 = d["V1"] + wsum = d["sum"] + var = d["part"] - wsum * wsum / V1 + + # Note: var is now the global value of V1 * sigma**2, where sigma + # is the global weighted biased (i.e. ddof=0) variance. + + if ddof is None: + raise ValueError(f"Must set ddof to a numeric value. Got: {ddof!r}") + + if not ddof: + # Weighted or unweighted variance with ddof=0 + f = 1 / V1 + elif not d["weighted"]: + # Unweighted variance with any non-zero value of ddof + f = 1 / (V1 - ddof) + elif ddof == 1: + # Weighted variance with ddof=1 + f = V1 / (V1 * V1 - d["V2"]) + else: + raise ValueError( + "Can only calculate a weighted variance with ddof=0 or ddof=1. " + f"Got: {ddof!r}" + ) + + # Now get the required global variance with the requested ddof + var = f * var + + var = mask_small_sample_size(var, d["N"], axis, mtol, original_shape) + return var diff --git a/cf/data/data.py b/cf/data/data.py index 35fceac6ff..cf631ee504 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -1263,9 +1263,9 @@ def _conform_after_dask_update(self): self._del_active_storage() def _del_active_storage(self): - """TODODASKDOCS. + """TODOACTIVEDOCS. - .. versionadded:: TODODASKVER + .. versionadded:: TODOACTIVEVER .. seealso:: `_set_active_storage` @@ -1289,16 +1289,16 @@ def _del_active_storage(self): self._custom.pop("active_storage", None) def _set_active_storage(self, value): - """TODODASKDOCS. + """TODOACTIVEDOCS. - .. versionadded:: TODODASKVER + .. versionadded:: TODOACTIVEVER .. 
seealso:: `_del_active_storage` :Returns: `bool` - TODODASKDOCS + TODOACTIVEDOCS **Examples** @@ -4188,7 +4188,7 @@ def active_storage(self): the usual (non-active) techniques if an active storage operation fails for any reason. - .. versionadded:: TODODASKVER + .. versionadded:: TODOACTIVEVER **Examples** diff --git a/cf/data/netcdfarray.py b/cf/data/netcdfarray.py index db9f9c1488..e58b286d3a 100644 --- a/cf/data/netcdfarray.py +++ b/cf/data/netcdfarray.py @@ -4,7 +4,11 @@ class NetCDFArray(cfdm.NetCDFArray, FileArray): - """An array stored in a netCDF file.""" + """An array stored in a netCDF file. + + TODOACTIVEDOC + + """ def __getitem__(self, indices): """Returns a subspace of the array as a numpy array. @@ -24,45 +28,68 @@ def __getitem__(self, indices): then these indices work independently along each dimension (similar to the way vector subscripts work in Fortran). - .. versionadded:: TODODASKVER + .. versionadded:: TODOACTIVEVER """ - if self.active_storage_op: - # Active storage read. Returns a dictionary. + method = self.get_active_method() + if method: + # Active storage read by server. Returns a dictionary. active = Active(self.filename, self.ncvar) - active.method = self.active_storage_op + active.method = method active.components = True return active[indices] # Normal read by local client. Returns a numpy array. - # - # In production code groups, masks, string types, etc. will - # need to be accounted for here. - return super().__getitme__(indices) + return super().__getitem__(indices) def _active_chunk_functions(self): + """Mapping of method names to active chunk functions. + + .. versionadded:: TODOACTIVEVER + + :Returns: + + `dict` + The mapping. + + """ return { "min": self.active_min, "max": self.active_max, "mean": self.active_mean, + "sum": self.active_sum, } - @property - def active_storage_op(self): - return self._custom.get("active_storage_op") + def actify(self, method, axis=None): + """Return a new actified {{class}} instance. + + The new instance is a deep copy of the original, including the + definitions of the active storage method and axis. - @active_storage_op.setter - def active_storage_op(self, value): - self._custom["active_storage_op"] = value + .. versionadded:: TODOACTIVEVER - @property - def op_axis(self): - return self._custom.get("op_axis") + :Parameters: + + method: `str` + TODOACTIVEDOCS + + axis: (sequence of) `int`, optional + TODOACTIVEDOCS + + :Returns: - @op_axis.setter - def op_axis(self, value): - self._custom["op_axis"] = value + {{class}} + TODOACTIVEDOCS + + """ + if method not in self._active_chunk_functions(): + raise ValueError(f"Invalid active storage operation: {method!r}") + + a = self.copy() + a.set_active_method(method) + a.set_active_axis(axis) + return a @staticmethod def active_min(a, **kwargs): @@ -71,16 +98,17 @@ def active_min(a, **kwargs): Assumes that the calculations have already been done, i.e. that *a* is already the minimum. - This function is intended to be passed in to - `dask.array.reduction()` as the ``chunk`` parameter. Its - return signature must be the same as the non-active chunks - function that it is replacing. + This function is intended to be passed to + `dask.array.reduction` as the ``chunk`` parameter. Its return + signature must be the same as the non-active chunk function + that it is replacing. - .. versionadded:: TODODASKVER + .. 
versionadded:: TODOACTIVEVER :Parameters: a: `dict` + TODOACTIVEDOCS :Returns: @@ -100,16 +128,17 @@ def active_max(a, **kwargs): Assumes that the calculations have already been done, i.e. that *a* is already the maximum. - This function is intended to be passed in to - `dask.array.reduction()` as the ``chunk`` parameter. Its - return signature must be consistent with that expected by the - functions of the ``aggregate`` and ``combine`` parameters. + This function is intended to be passed to + `dask.array.reduction` as the ``chunk`` parameter. Its return + signature must be the same as the non-active chunk function + that it is replacing. - .. versionadded:: TODODASKVER + .. versionadded:: TODOACTIVEVER :Parameters: a: `dict` + TODOACTIVEDOCS :Returns: @@ -124,21 +153,22 @@ def active_max(a, **kwargs): @staticmethod def active_mean(a, **kwargs): - """Chunk calculations for the mean. + """Chunk calculations for the unweighted mean. Assumes that the calculations have already been done, - i.e. that *a* is already the mean. + i.e. that *a* is already the uweighted mean. - This function is intended to be passed in to - `dask.array.reduction()` as the ``chunk`` parameter. Its - return signature must be the same as the non-active chunks - function that it is replacing. + This function is intended to be passed to + `dask.array.reduction` as the ``chunk`` parameter. Its return + signature must be the same as the non-active chunk function + that it is replacing. - .. versionadded:: TODODASKVER + .. versionadded:: TODOACTIVEVER :Parameters: a: `dict` + TODOACTIVEDOCS :Returns: @@ -148,24 +178,110 @@ def active_mean(a, **kwargs): * N: The sample size. * V1: The sum of ``weights``. Equal to ``N`` because weights have not been set. - * sum: The weighted sum of ``x``. + * sum: The weighted sum of ``a``. * weighted: True if weights have been set. Always False. """ return {"N": a["n"], "V1": a["n"], "sum": a["sum"], "weighted": False} + @staticmethod + def active_sum(a, **kwargs): + """Chunk calculations for the unweighted sum. + + Assumes that the calculations have already been done, + i.e. that *a* is already the uweighted sum. + + This function is intended to be passed to + `dask.array.reduction` as the ``chunk`` parameter. Its return + signature must be the same as the non-active chunk function + that it is replacing. + + .. versionadded:: TODOACTIVEVER + + :Parameters: + + a: `dict` + TODOACTIVEDOCS + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + * sum: The weighted sum of ``a`` + + """ + return {"N": a["n"], "sum": a["sum"]} + + def get_active_method(self): + """TODOACTIVEDOC. + + .. versionadded:: TODOACTIVEVER + + :Returns: + + TODOACTIVEDOC + + """ + return self._custom.get("active_method") + + def get_active_axis(self): + """TODOACTIVEDOC. + + .. versionadded:: TODOACTIVEVER + + :Returns: + + TODOACTIVEDOC + + """ + return self._custom.get("active_axis") + def get_active_chunk_function(self): + """TODOACTIVEDOC. + + .. versionadded:: TODOACTIVEVER + + :Returns: + + TODOACTIVEDOC + + """ try: - return self._active_chunk_functions()[self.active_storage_op] + return self._active_chunk_functions()[self.get_active_method()] except KeyError: raise ValueError("no active storage operation has been set") - def set_active_storage_op(self, op, axis=None): - if op not in self._active_chunk_functions(): - raise ValueError(f"Invalid active storage operation: {op!r}") + def set_active_method(self, value): + """TODOACTIVEDOC. 
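The active chunk functions only have to translate the components dictionary returned by the storage system into the dictionary layout that the existing combine and aggregate functions expect. A stand-alone sketch of that translation, with a made-up components dictionary standing in for what an active "min" read might return (the real values would come from indexing an `Active` instance):

```python
# Hypothetical components returned by an active storage "min" read
components = {"n": 1024, "min": 271.3}

# Re-key into the layout produced by the ordinary min chunk function,
# so the downstream combine/aggregate functions need no changes
chunk_result = {"N": components["n"], "min": components["min"]}
print(chunk_result)  # {'N': 1024, 'min': 271.3}
```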
- a = self.copy() - a.active_storage_op = op - a.op_axis = axis - return a + .. versionadded:: TODOACTIVEVER + + :Parameters: + + TODOACTIVEDOCS + + :Returns: + + `None` + + """ + self._custom["active_method"] = value + + def set_active_axis(self, value): + """TODOACTIVEDOC. + + .. versionadded:: TODOACTIVEVER + + :Parameters: + + TODOACTIVEDOCS + + :Returns: + + `None` + + """ + self._custom["active_axis"] = value From 1d8e39b28647ffb0d1d4558e99730abd89895dff Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 17 Nov 2022 16:54:23 +0000 Subject: [PATCH 003/134] actify methods with @active_stage decorator --- cf/data/collapse.py | 171 +++++++++++++++++++++++--------------- cf/data/collapse_utils.py | 74 ++++++++--------- cf/data/data.py | 34 ++++---- 3 files changed, 157 insertions(+), 122 deletions(-) diff --git a/cf/data/collapse.py b/cf/data/collapse.py index a1aa39d903..1c23c5170f 100644 --- a/cf/data/collapse.py +++ b/cf/data/collapse.py @@ -1,4 +1,4 @@ -"""Functions used for `Data` object collapses.""" +"""TODOACTIVEDOCS used for `Data` object collapses.""" from functools import partial import numpy as np @@ -6,7 +6,11 @@ from dask.array.reductions import reduction from ..docstring import _docstring_substitution_definitions -from .collapse_utils import actify, check_input_dtype, double_precision_dtype +from .collapse_utils import ( + active_storage, + check_input_dtype, + double_precision_dtype, +) class Collapse(metaclass=DocstringRewriteMeta): @@ -48,7 +52,7 @@ def __docstring_package_depth__(self): """ return 0 - @classmethod + @active_storage("max") def max( cls, a, @@ -57,6 +61,7 @@ def max( mtol=1, split_every=None, active_storage=False, + chunk_function=None, ): """Return maximum values of an array. @@ -84,6 +89,8 @@ def max( {{active_storage: `bool`, optional}} + {{chunk_function: function, optional}} + :Returns: `dask.array.Array` @@ -92,19 +99,12 @@ def max( """ from .dask_collapse import cf_max_agg, cf_max_chunk, cf_max_combine + if chunk_function is None: + chunk_function = cf_max_chunk + check_input_dtype(a) dtype = a.dtype - # Rewrite data and chunk function if active storage operations - # are available. - a, chunk_function = actify( - a, - method="max", - axis=axis, - chunk_function=cf_max_chunk, - active_storage=active_storage, - ) - return reduction( a, chunk_function, @@ -118,7 +118,6 @@ def max( meta=np.array((), dtype=dtype), ) - @classmethod def max_abs( cls, a, @@ -127,6 +126,7 @@ def max_abs( mtol=1, split_every=None, active_storage=False, + chunk_function=None, ): """Return maximum absolute values of an array. @@ -154,6 +154,8 @@ def max_abs( {{active_storage: `bool`, optional}} + {{chunk_function: function, optional}} + :Returns: `dask.array.Array` @@ -168,7 +170,7 @@ def max_abs( split_every=split_every, ) - @classmethod + @active_storage("mean") def mean( cls, a, @@ -178,6 +180,7 @@ def mean( mtol=1, split_every=None, active_storage=False, + chunk_function=None, ): """Return mean values of an array. @@ -207,6 +210,8 @@ def mean( {{active_storage: `bool`, optional}} + {{chunk_function: function, optional}} + :Returns: `dask.array.Array` @@ -215,19 +220,12 @@ def mean( """ from .dask_collapse import cf_mean_agg, cf_mean_chunk, cf_mean_combine + if chunk_function is None: + chunk_function = cf_mean_chunk + check_input_dtype(a) dtype = "f8" - # Rewrite data and chunk function if active storage operations - # are available. 
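Each `Collapse` method ultimately hands its chunk/combine/aggregate trio to `dask.array.reduction`, and the ``chunk_function`` keyword simply replaces the first of those three stages. A minimal, self-contained use of that machinery with plain array-valued (rather than dictionary-valued) stages, just to show where the chunk function plugs in (illustrative only):

```python
import dask.array as da
import numpy as np

x = da.arange(10, chunks=3)

def chunk(a, axis=None, keepdims=False, **kwargs):
    return np.sum(a, axis=axis, keepdims=keepdims)

def aggregate(a, axis=None, keepdims=False, **kwargs):
    return np.sum(a, axis=axis, keepdims=keepdims)

total = da.reduction(x, chunk, aggregate, dtype="i8")
print(total.compute())  # 45
```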
- a, chunk_function = actify( - a, - method="mean", - axis=axis, - chunk_function=cf_mean_chunk, - active_storage=active_storage, - ) - return reduction( a, chunk_function, @@ -242,7 +240,6 @@ def mean( weights=weights, ) - @classmethod def mean_abs( cls, a, @@ -252,6 +249,7 @@ def mean_abs( mtol=1, split_every=None, active_storage=False, + chunk_function=None, ): """Return mean absolute values of an array. @@ -281,6 +279,8 @@ def mean_abs( {{active_storage: `bool`, optional}} + {{chunk_function: function, optional}} + :Returns: `dask.array.Array` @@ -296,9 +296,8 @@ def mean_abs( split_every=split_every, ) - @classmethod def mid_range( - cls, + self, a, axis=None, dtype=None, @@ -306,6 +305,7 @@ def mid_range( mtol=1, split_every=None, active_storage=False, + chunk_function=None, ): """Return mid-range values of an array. @@ -333,6 +333,8 @@ def mid_range( {{active_storage: `bool`, optional}} + {{chunk_function: function, optional}} + :Returns: `dask.array.Array` @@ -345,11 +347,14 @@ def mid_range( cf_range_combine, ) + if chunk_function is None: + chunk_function = cf_range_chunk + check_input_dtype(a, allowed="fi") dtype = "f8" return reduction( a, - cf_range_chunk, + chunk_function, partial(cf_mid_range_agg, mtol=mtol, original_shape=a.shape), axis=axis, keepdims=keepdims, @@ -360,15 +365,16 @@ def mid_range( meta=np.array((), dtype=dtype), ) - @classmethod + @active_storage("min") def min( - cls, + self, a, axis=None, keepdims=False, mtol=1, split_every=None, active_storage=False, + chunk_function=None, ): """Return minimum values of an array. @@ -396,6 +402,8 @@ def min( {{active_storage: `bool`, optional}} + {{chunk_function: function, optional}} + :Returns: `dask.array.Array` @@ -404,19 +412,12 @@ def min( """ from .dask_collapse import cf_min_agg, cf_min_chunk, cf_min_combine + if chunk_function is None: + chunk_function = cf_min_chunk + check_input_dtype(a) dtype = a.dtype - # Rewrite data and chunk function if active storage operations - # are available. - a, chunk_function = actify( - a, - method="min", - axis=axis, - chunk_function=cf_min_chunk, - active_storage=active_storage, - ) - return reduction( a, chunk_function, @@ -430,7 +431,6 @@ def min( meta=np.array((), dtype=dtype), ) - @classmethod def min_abs( cls, a, @@ -439,6 +439,7 @@ def min_abs( mtol=1, split_every=None, active_storage=False, + chunk_function=None, ): """Return minimum absolute values of an array. @@ -466,6 +467,8 @@ def min_abs( {{active_storage: `bool`, optional}} + {{chunk_function: function, optional}} + :Returns: `dask.array.Array` @@ -480,7 +483,6 @@ def min_abs( split_every=split_every, ) - @classmethod def range( cls, a, @@ -489,6 +491,7 @@ def range( mtol=1, split_every=None, active_storage=False, + chunk_function=None, ): """Return range values of an array. @@ -516,6 +519,8 @@ def range( {{active_storage: `bool`, optional}} + {{chunk_function: function, optional}} + :Returns: `dask.array.Array` @@ -528,11 +533,14 @@ def range( cf_range_combine, ) + if chunk_function is None: + chunk_function = cf_range_chunk + check_input_dtype(a, allowed="fi") dtype = a.dtype return reduction( a, - cf_range_chunk, + chunk_function, partial(cf_range_agg, mtol=mtol, original_shape=a.shape), axis=axis, keepdims=keepdims, @@ -543,7 +551,6 @@ def range( meta=np.array((), dtype=dtype), ) - @classmethod def rms( cls, a, @@ -553,6 +560,7 @@ def rms( mtol=1, split_every=None, active_storage=False, + chunk_function=None, ): """Return root mean square (RMS) values of an array. 
@@ -582,6 +590,8 @@ def rms( {{active_storage: `bool`, optional}} + {{chunk_function: function, optional}} + :Returns: `dask.array.Array` @@ -590,11 +600,14 @@ def rms( """ from .dask_collapse import cf_mean_combine, cf_rms_agg, cf_rms_chunk + if chunk_function is None: + chunk_function = cf_rms_chunk + check_input_dtype(a) dtype = "f8" return reduction( a, - cf_rms_chunk, + chunk_function, partial(cf_rms_agg, mtol=mtol, original_shape=a.shape), axis=axis, keepdims=keepdims, @@ -606,7 +619,6 @@ def rms( weights=weights, ) - @classmethod def sample_size( cls, a, @@ -615,6 +627,7 @@ def sample_size( mtol=1, split_every=None, active_storage=False, + chunk_function=None, ): """Return sample size values of an array. @@ -642,6 +655,8 @@ def sample_size( {{active_storage: `bool`, optional}} + {{chunk_function: function, optional}} + :Returns: `dask.array.Array` @@ -654,11 +669,14 @@ def sample_size( cf_sample_size_combine, ) + if chunk_function is None: + chunk_function = cf_sample_size_chunk + check_input_dtype(a) dtype = "i8" return reduction( a, - cf_sample_size_chunk, + chunk_function, partial(cf_sample_size_agg, mtol=mtol, original_shape=a.shape), axis=axis, keepdims=keepdims, @@ -669,7 +687,7 @@ def sample_size( meta=np.array((), dtype=dtype), ) - @classmethod + @active_storage("sum") def sum( cls, a, @@ -679,6 +697,7 @@ def sum( mtol=1, split_every=None, active_storage=False, + chunk_function=None, ): """Return sum values of an array. @@ -708,6 +727,8 @@ def sum( {{active_storage: `bool`, optional}} + {{chunk_function: function, optional}} + :Returns: `dask.array.Array` @@ -716,21 +737,14 @@ def sum( """ from .dask_collapse import cf_sum_agg, cf_sum_chunk, cf_sum_combine + if chunk_function is None: + chunk_function = cf_sum_chunk + check_input_dtype(a) dtype = double_precision_dtype(a) if weights is not None: dtype = np.result_type(double_precision_dtype(weights), dtype) - # Rewrite data and chunk function if active storage operations - # are available. - a, chunk_function = actify( - a, - method="sum", - axis=axis, - chunk_function=cf_sum_chunk, - active_storage=active_storage, - ) - return reduction( a, chunk_function, @@ -745,7 +759,6 @@ def sum( weights=weights, ) - @classmethod def sum_of_weights( cls, a, @@ -755,6 +768,7 @@ def sum_of_weights( mtol=1, split_every=None, active_storage=False, + chunk_function=None, ): """Return sum of weights values for an array. @@ -784,6 +798,8 @@ def sum_of_weights( {{active_storage: `bool`, optional}} + {{chunk_function: function, optional}} + :Returns: `dask.array.Array` @@ -796,11 +812,14 @@ def sum_of_weights( cf_sum_of_weights_chunk, ) + if chunk_function is None: + chunk_function = cf_sum_of_weights_chunk + check_input_dtype(a) dtype = double_precision_dtype(weights, default="i8") return reduction( a, - cf_sum_of_weights_chunk, + chunk_function, partial(cf_sum_agg, mtol=mtol, original_shape=a.shape), axis=axis, keepdims=keepdims, @@ -812,7 +831,6 @@ def sum_of_weights( weights=weights, ) - @classmethod def sum_of_weights2( cls, a, @@ -822,6 +840,7 @@ def sum_of_weights2( mtol=1, split_every=None, active_storage=False, + chunk_function=None, ): """Return sum of squares of weights values for an array. 
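The sum-of-squares-of-weights collapse reuses the same chunk function as the sum-of-weights collapse by pre-binding ``square=True`` with `functools.partial`, the same trick used elsewhere for ``ddof`` and ``mtol``. In isolation the pattern is just (illustrative names, not part of the patch):

```python
from functools import partial

def sum_weights(w, square=False):
    return sum(x * x for x in w) if square else sum(w)

sum_weights2 = partial(sum_weights, square=True)
print(sum_weights([1, 2, 3]), sum_weights2([1, 2, 3]))  # 6 14
```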
@@ -851,6 +870,8 @@ def sum_of_weights2( {{active_storage: `bool`, optional}} + {{chunk_function: function, optional}} + :Returns: `dask.array.Array` @@ -863,11 +884,14 @@ def sum_of_weights2( cf_sum_of_weights_chunk, ) + if chunk_function is None: + chunk_function = cf_sum_of_weights_chunk + check_input_dtype(a) dtype = double_precision_dtype(weights, default="i8") return reduction( a, - partial(cf_sum_of_weights_chunk, square=True), + partial(chunk_function, square=True), partial(cf_sum_agg, mtol=mtol, original_shape=a.shape), axis=axis, keepdims=keepdims, @@ -879,7 +903,6 @@ def sum_of_weights2( weights=weights, ) - @classmethod def var( cls, a, @@ -890,6 +913,7 @@ def var( ddof=None, split_every=None, active_storage=False, + chunk_function=None, ): """Return variances of an array. @@ -921,6 +945,8 @@ def var( {{active_storage: `bool`, optional}} + {{chunk_function: function, optional}} + :Returns: `dask.array.Array` @@ -929,11 +955,14 @@ def var( """ from .dask_collapse import cf_var_agg, cf_var_chunk, cf_var_combine + if chunk_function is None: + chunk_function = cf_var_chunk + check_input_dtype(a) dtype = "f8" return reduction( a, - partial(cf_var_chunk, ddof=ddof), + partial(chunk_function, ddof=ddof), partial(cf_var_agg, mtol=mtol, original_shape=a.shape), axis=axis, keepdims=keepdims, @@ -945,8 +974,9 @@ def var( weights=weights, ) - @classmethod - def unique(cls, a, split_every=None, active_storage=False): + def unique( + cls, a, split_every=None, active_storage=False, chunk_function=None + ): """Return unique elements of the data. .. versionadded:: TODODASKVER @@ -960,6 +990,8 @@ def unique(cls, a, split_every=None, active_storage=False): {{active_storage: `bool`, optional}} + {{chunk_function: function, optional}} + :Returns: `dask.array.Array` @@ -968,6 +1000,9 @@ def unique(cls, a, split_every=None, active_storage=False): """ from .dask_collapse import cf_unique_agg, cf_unique_chunk + if chunk_function is None: + chunk_function = cf_unique_chunk + check_input_dtype(a, "fibUS") # Flatten the array so that it has the same number of @@ -980,7 +1015,7 @@ def unique(cls, a, split_every=None, active_storage=False): dtype = a.dtype return reduction( a, - cf_unique_chunk, + chunk_function, cf_unique_agg, keepdims=True, output_size=np.nan, diff --git a/cf/data/collapse_utils.py b/cf/data/collapse_utils.py index eac6a7005d..1773f2d027 100644 --- a/cf/data/collapse_utils.py +++ b/cf/data/collapse_utils.py @@ -1,3 +1,4 @@ +from functools import wraps from numbers import Integral import dask.array as da @@ -89,7 +90,7 @@ def check_input_dtype(a, allowed="fib"): raise TypeError(f"Can't calculate {method} of data with {a.dtype!r}") -def actify(a, method, axis=None, chunk_function=None, active_storage=False): +def actify(a, method, axis=None, active_storage=False): """TODOACTIVEDOCS. .. 
versionadded:: TODOACTIVEVER @@ -105,9 +106,6 @@ def actify(a, method, axis=None, chunk_function=None, active_storage=False): axis: (sequence of) `int`, optional TODOACTIVEDOCS - chunk_function: function - TODOACTIVEDOCS - {{active_storage: `bool`, optional}} :Returns: @@ -116,6 +114,7 @@ def actify(a, method, axis=None, chunk_function=None, active_storage=False): TODOACTIVEDOCS """ + chunk_function = None if not active_storage: # It has been determined externally that an active storage # reduction is not possible, so return the input data and @@ -196,37 +195,36 @@ def actify(a, method, axis=None, chunk_function=None, active_storage=False): return a, chunk_function -# def actify_collapse(method): -# """A wrapper to provide positional arguments to the decorator. -# -# A decorator for `Collapse` methods that enables active storage -# operations, when the conditions are right. -# -# """ -# def decorator(collapse_method): -# @wraps(collapse_method) -# def wrapper(cls, *args, **kwargs): -# if kwargs.get("weights") is None: -# # The collapse is unweighted over defined axes => -# # attempt to actify the dask array and provide a new -# # chunk function. -# a, chunk_function = actify( -# args[0], -# op=method, -# axis=kwargs.get("axis"), -# active_storage=kwargs.get("active_storage", False), -# ) -# args = list(args) -# args[0] = a -# -# if chunk_function is not None: -# kwargs["chunk_function"] = chunk_function -# -# #Create the collapse -# return collapse_method(cls, *args, **kwargs) -# -# return wrapper -# -# -# -# return decorator +def active_storage(method): + """A decorator for `Collapse` methods that enables active storage + operations, when the conditions are right.""" + + def decorator(collapse_method): + @wraps(collapse_method) + def wrapper(self, *args, **kwargs): + if ( + kwargs.get("weights") is None + and kwargs.get("chunk_function") is None + ): + # The collapse is unweighted => attempt to actify the + # dask array and provide a new chunk function. + a, chunk_function = actify( + args[0], + method=method, + axis=kwargs.get("axis"), + active_storage=kwargs.get("active_storage", False), + ) + args = list(args) + args[0] = a + + if chunk_function is not None: + # The dask array has been actified, so update the + # chunk function. 
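Stripped of the active storage specifics, the decorator follows a small, reusable pattern: inspect the keyword arguments, optionally rewrite them, then call the wrapped method unchanged. A minimal sketch of that pattern with hypothetical names, and no dask or active storage involved:

```python
from functools import wraps

def swap_chunk_function(replacement):
    """Swap in `replacement` when the caller has not supplied one."""

    def decorator(method):
        @wraps(method)
        def wrapper(self, *args, **kwargs):
            if kwargs.get("active_storage") and kwargs.get("chunk_function") is None:
                kwargs["chunk_function"] = replacement
            return method(self, *args, **kwargs)

        return wrapper

    return decorator
```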
+ kwargs["chunk_function"] = chunk_function + + # Create the collapse + return collapse_method(self, *args, **kwargs) + + return wrapper + + return decorator diff --git a/cf/data/data.py b/cf/data/data.py index cf631ee504..f3bab1280a 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -5744,7 +5744,7 @@ def max( """ d = _inplace_enabled_define_and_cleanup(self) d, _ = collapse( - Collapse.max, + Collapse().max, d, axis=axes, keepdims=not squeeze, @@ -5807,7 +5807,7 @@ def maximum_absolute_value( """ d = _inplace_enabled_define_and_cleanup(self) d, _ = collapse( - Collapse.max_abs, + Collapse().max_abs, d, axis=axes, keepdims=not squeeze, @@ -5876,7 +5876,7 @@ def min( """ d = _inplace_enabled_define_and_cleanup(self) d, _ = collapse( - Collapse.min, + Collapse().min, d, axis=axes, keepdims=not squeeze, @@ -5939,7 +5939,7 @@ def minimum_absolute_value( """ d = _inplace_enabled_define_and_cleanup(self) d, _ = collapse( - Collapse.min_abs, + Collapse().min_abs, d, axis=axes, keepdims=not squeeze, @@ -6016,7 +6016,7 @@ def mean( """ d = _inplace_enabled_define_and_cleanup(self) d, _ = collapse( - Collapse.mean, + Collapse().mean, d, axis=axes, weights=weights, @@ -6092,7 +6092,7 @@ def mean_absolute_value( """ d = _inplace_enabled_define_and_cleanup(self) d, _ = collapse( - Collapse.mean_abs, + Collapse().mean_abs, d, axis=axes, weights=weights, @@ -6172,7 +6172,7 @@ def integral( """ d = _inplace_enabled_define_and_cleanup(self) d, weights = collapse( - Collapse.sum, + Collapse().sum, d, axis=axes, weights=weights, @@ -6254,7 +6254,7 @@ def sample_size( """ d = _inplace_enabled_define_and_cleanup(self) d, _ = collapse( - Collapse.sample_size, + Collapse().sample_size, d, axis=axes, keepdims=not squeeze, @@ -6951,7 +6951,9 @@ def unique(self, split_every=None): d.soften_mask() dx = d.to_dask_array() - dx = Collapse.unique(dx, split_every=split_every) + dx = Collapse().unique( + dx, split_every=split_every, active_storage=d.active_storage + ) d._set_dask(dx) @@ -8820,7 +8822,7 @@ def mid_range( """ d = _inplace_enabled_define_and_cleanup(self) d, _ = collapse( - Collapse.mid_range, + Collapse().mid_range, d, axis=axes, keepdims=not squeeze, @@ -9173,7 +9175,7 @@ def root_mean_square( """ d = _inplace_enabled_define_and_cleanup(self) d, _ = collapse( - Collapse.rms, + Collapse().rms, d, axis=axes, weights=weights, @@ -10797,7 +10799,7 @@ def range( """ d = _inplace_enabled_define_and_cleanup(self) d, _ = collapse( - Collapse.range, + Collapse().range, d, axis=axes, keepdims=not squeeze, @@ -10925,7 +10927,7 @@ def sum( """ d = _inplace_enabled_define_and_cleanup(self) d, _ = collapse( - Collapse.sum, + Collapse().sum, d, axis=axes, weights=weights, @@ -11090,7 +11092,7 @@ def sum_of_weights( """ d = _inplace_enabled_define_and_cleanup(self) d, weights = collapse( - Collapse.sum_of_weights, + Collapse().sum_of_weights, d, axis=axes, weights=weights, @@ -11188,7 +11190,7 @@ def sum_of_weights2( """ d = _inplace_enabled_define_and_cleanup(self) d, weights = collapse( - Collapse.sum_of_weights2, + Collapse().sum_of_weights2, d, axis=axes, weights=weights, @@ -11372,7 +11374,7 @@ def var( """ d = _inplace_enabled_define_and_cleanup(self) d, _ = collapse( - Collapse.var, + Collapse().var, d, axis=axes, weights=weights, From 24ec636b6bd87c35f2b2d9d66a827e049b63d650 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 17 Nov 2022 17:10:03 +0000 Subject: [PATCH 004/134] tidy --- cf/data/collapse.py | 41 +++++++++++++++++++++------------------ cf/data/collapse_utils.py | 30 
+++++++++++++--------------- 2 files changed, 36 insertions(+), 35 deletions(-) diff --git a/cf/data/collapse.py b/cf/data/collapse.py index 1c23c5170f..e0e869746f 100644 --- a/cf/data/collapse.py +++ b/cf/data/collapse.py @@ -54,7 +54,7 @@ def __docstring_package_depth__(self): @active_storage("max") def max( - cls, + self, a, axis=None, keepdims=False, @@ -119,7 +119,7 @@ def max( ) def max_abs( - cls, + self, a, axis=None, keepdims=False, @@ -154,7 +154,7 @@ def max_abs( {{active_storage: `bool`, optional}} - {{chunk_function: function, optional}} + {{chunk_function: function, optional}} ppp :Returns: @@ -162,17 +162,18 @@ def max_abs( The collapsed array. """ - return cls.max( + return self.max( abs(a), axis=axis, keepdims=keepdims, mtol=mtol, split_every=split_every, + active_storage=False, ) @active_storage("mean") def mean( - cls, + self, a, axis=None, weights=None, @@ -241,7 +242,7 @@ def mean( ) def mean_abs( - cls, + self, a, weights=None, axis=None, @@ -279,7 +280,7 @@ def mean_abs( {{active_storage: `bool`, optional}} - {{chunk_function: function, optional}} + {{chunk_function: function, optional}} ppp :Returns: @@ -287,13 +288,14 @@ def mean_abs( The collapsed array. """ - return cls.mean( + return self.mean( abs(a), weights=weights, axis=axis, keepdims=keepdims, mtol=mtol, split_every=split_every, + active_storage=False, ) def mid_range( @@ -432,7 +434,7 @@ def min( ) def min_abs( - cls, + self, a, axis=None, keepdims=False, @@ -467,7 +469,7 @@ def min_abs( {{active_storage: `bool`, optional}} - {{chunk_function: function, optional}} + {{chunk_function: function, optional}} ppp :Returns: @@ -475,16 +477,17 @@ def min_abs( The collapsed array. """ - return cls.min( + return self.min( abs(a), axis=axis, keepdims=keepdims, mtol=mtol, split_every=split_every, + active_storage=False, ) def range( - cls, + self, a, axis=None, keepdims=False, @@ -552,7 +555,7 @@ def range( ) def rms( - cls, + self, a, axis=None, weights=None, @@ -620,7 +623,7 @@ def rms( ) def sample_size( - cls, + self, a, axis=None, keepdims=False, @@ -689,7 +692,7 @@ def sample_size( @active_storage("sum") def sum( - cls, + self, a, axis=None, weights=None, @@ -760,7 +763,7 @@ def sum( ) def sum_of_weights( - cls, + self, a, axis=None, weights=None, @@ -832,7 +835,7 @@ def sum_of_weights( ) def sum_of_weights2( - cls, + self, a, axis=None, weights=None, @@ -904,7 +907,7 @@ def sum_of_weights2( ) def var( - cls, + self, a, axis=None, weights=None, @@ -975,7 +978,7 @@ def var( ) def unique( - cls, a, split_every=None, active_storage=False, chunk_function=None + self, a, split_every=None, active_storage=False, chunk_function=None ): """Return unique elements of the data. diff --git a/cf/data/collapse_utils.py b/cf/data/collapse_utils.py index 1773f2d027..9dc5a77ae2 100644 --- a/cf/data/collapse_utils.py +++ b/cf/data/collapse_utils.py @@ -90,7 +90,7 @@ def check_input_dtype(a, allowed="fib"): raise TypeError(f"Can't calculate {method} of data with {a.dtype!r}") -def actify(a, method, axis=None, active_storage=False): +def actify(a, method, axis=None): """TODOACTIVEDOCS. .. 
versionadded:: TODOACTIVEVER @@ -106,8 +106,6 @@ def actify(a, method, axis=None, active_storage=False): axis: (sequence of) `int`, optional TODOACTIVEDOCS - {{active_storage: `bool`, optional}} - :Returns: `dask.array.Array`, function @@ -115,15 +113,15 @@ def actify(a, method, axis=None, active_storage=False): """ chunk_function = None - if not active_storage: - # It has been determined externally that an active storage - # reduction is not possible, so return the input data and - # chunk function unchanged. - return a, chunk_function - - # Still here? Then it is assumed that the dask array is of a form - # which might be able to exploit active storage. In particular, it - # is assumed that all data definitions point to files. + # if not active_storage: + # # It has been determined externally that an active storage + # # reduction is not possible, so return the input data and + # # chunk function unchanged. + # return a, chunk_function + # + # # Still here? Then it is assumed that the dask array is of a form + # # which might be able to exploit active storage. In particular, it + # # is assumed that all data definitions point to files. # Parse axis if axis is None: @@ -203,16 +201,16 @@ def decorator(collapse_method): @wraps(collapse_method) def wrapper(self, *args, **kwargs): if ( - kwargs.get("weights") is None + kwargs.get("active_storage") + and kwargs.get("weights") is None and kwargs.get("chunk_function") is None ): - # The collapse is unweighted => attempt to actify the - # dask array and provide a new chunk function. + # Attempt to actify the dask array and provide a new + # chunk function a, chunk_function = actify( args[0], method=method, axis=kwargs.get("axis"), - active_storage=kwargs.get("active_storage", False), ) args = list(args) args[0] = a From 54bef6b7a32747d59562b6d2d5268f547448c136 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 18 Nov 2022 15:13:27 +0000 Subject: [PATCH 005/134] refactor --- cf/data/collapse/collapse_active.py~ | 144 --------------------------- 1 file changed, 144 deletions(-) delete mode 100644 cf/data/collapse/collapse_active.py~ diff --git a/cf/data/collapse/collapse_active.py~ b/cf/data/collapse/collapse_active.py~ deleted file mode 100644 index b5f491fbfb..0000000000 --- a/cf/data/collapse/collapse_active.py~ +++ /dev/null @@ -1,144 +0,0 @@ -from functools import wraps -from numbers import Integral - -import dask.array as da -from dask.array.utils import validate_axis -from dask.base import collections_to_dsk - - -def actify(a, method, axis=None): - """TODOACTIVEDOCS. - - .. versionadded:: TODOACTIVEVER - - :Parameters: - - a: `dask.array.Array` - The array to be collapsed. - - method: `str` - TODOACTIVEDOCS - - axis: (sequence of) `int`, optional - TODOACTIVEDOCS - - :Returns: - - `dask.array.Array`, function - TODOACTIVEDOCS - - """ - chunk_function = None - # if not active_storage: - # # It has been determined externally that an active storage - # # reduction is not possible, so return the input data and - # # chunk function unchanged. - # return a, chunk_function - # - # # Still here? Then it is assumed that the dask array is of a form - # # which might be able to exploit active storage. In particular, it - # # is assumed that all data definitions point to files. 
- - # Parse axis - if axis is None: - axis = tuple(range(a.ndim)) - else: - if isinstance(axis, Integral): - axis = (axis,) - - if len(axis) != a.ndim: - # Can't (yet) use active storage to collapse a subset of - # the axes, so return the input data and chunk function - # unchanged. - return a, chunk_function - - axis = validate_axis(axis, a.ndim) - - active_chunk_functions = set() - - # Loop round elements of the dask graph, looking for data - # definitions that point to a file and which support active - # storage operations. The elements are traversed in reverse order - # so that the data defintions come out first, allowing for a fast - # short circuit in the common case when using active storage is no - # feasible. - dsk = collections_to_dsk((a,), optimize_graph=True) - for key, value in reversed(dsk.items()): - try: - value.get_filename() - except AttributeError: - # This value is not a data definition (it is assumed that - # all data definitions point to files). - continue - - try: - # Create a new actified data definition value - value = value.actify(method, axis) - except (AttributeError, ValueError): - # This data definition value does not support active - # storage reductions, or does not support the requested - # active storage reduction defined by 'method'. - active_chunk_functions = () - break - - try: - # Get the active storage chunk function - active_chunk_functions.add(value.get_active_chunk_function()) - except AttributeError: - # This data definition value does not support active - # storage reductions - active_chunk_functions = () - break - - # Still here? Then update the dask graph in-place with the - # actified data definition value. - dsk[key] = value - - if len(active_chunk_functions) == 1: - # All data definitions in the dask graph support active - # storage reductions with the same chunk function => redefine - # the array from the actified dask graph, and redefine the - # reduction chunk function. - a = da.Array(dsk, a.name, a.chunks, a.dtype, a._meta) - chunk_function = active_chunk_functions.pop() - - # Return the data and chunk function. These will either be - # identical to the inputs or, if it has been determinded that - # active storage operation is possible, then these the data and - # chunk function will have been replaced with actified versions. - return a, chunk_function - - -def active_storage(method): - """A decorator for `Collapse` methods that enables active storage - operations, when the conditions are right.""" - - def decorator(collapse_method): - @wraps(collapse_method) - def wrapper(self, *args, **kwargs): - if ( - kwargs.get("active_storage") - and kwargs.get("weights") is None - and kwargs.get("chunk_function") is None - ): - # Attempt to actify the dask array and provide a new - # chunk function - a, chunk_function = actify( - args[0], - method=method, - axis=kwargs.get("axis"), - ) - args = list(args) - args[0] = a - - if chunk_function is not None: - # The dask array has been actified, so update the - # chunk function. 
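The axis handling in `actify` (default to all axes, promote a lone integer to a tuple, then normalise negative values) can be exercised on its own with dask's `validate_axis` helper. A small sketch, separate from the patch:

```python
from numbers import Integral

from dask.array.utils import validate_axis

def parse_axis(axis, ndim):
    # Mirrors the axis handling in actify: default to every axis,
    # promote a lone integer to a tuple, then normalise negative values
    if axis is None:
        axis = tuple(range(ndim))
    elif isinstance(axis, Integral):
        axis = (axis,)
    return validate_axis(axis, ndim)

print(parse_axis(None, 3))  # (0, 1, 2)
print(parse_axis(-1, 3))    # (2,)
```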
- kwargs["chunk_function"] = chunk_function - - # Create the collapse - return collapse_method(self, *args, **kwargs) - - return wrapper - - return decorator From eddd37773f34596ef924f340cbabb29c4e858380 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 21 Nov 2022 17:58:38 +0000 Subject: [PATCH 006/134] dev --- cf/data/collapse/collapse.py | 155 ++++++++++++++++------------ cf/data/collapse/collapse_active.py | 59 ++++++----- cf/docstring/docstring.py | 9 ++ 3 files changed, 134 insertions(+), 89 deletions(-) diff --git a/cf/data/collapse/collapse.py b/cf/data/collapse/collapse.py index 4ee6738777..f4aae2428d 100644 --- a/cf/data/collapse/collapse.py +++ b/cf/data/collapse/collapse.py @@ -13,6 +13,20 @@ class Collapse(metaclass=DocstringRewriteMeta): """Container for functions that collapse dask arrays. + **Active storage** + + Any collapse method (such as `max`, `max_abs`, etc.) will attempt + to make use use of active storage if: + + * The collapse method's `active_storage` parameter is True. + * The method name is recognised by the `Active` class. + * The `Active` class recognioses the storage location as one that + supports active storage operations. + + If all of these conditions are passed but the dask array being + collapsed is, on inspection, not deemed suitable, then the + collapse operation will be executed without active storage. + .. versionadded:: TODODASKVER """ @@ -86,7 +100,7 @@ def max( {{active_storage: `bool`, optional}} - {{chunk_function: function, optional}} + {{chunk_function: callable, optional}} :Returns: @@ -115,6 +129,7 @@ def max( meta=np.array((), dtype=dtype), ) + @active_storage("max_abs") def max_abs( self, a, @@ -151,7 +166,7 @@ def max_abs( {{active_storage: `bool`, optional}} - {{chunk_function: function, optional}} ppp + {{chunk_function: callable, optional}} :Returns: @@ -208,7 +223,7 @@ def mean( {{active_storage: `bool`, optional}} - {{chunk_function: function, optional}} + {{chunk_function: callable, optional}} :Returns: @@ -238,6 +253,7 @@ def mean( weights=weights, ) + @active_storage("mean_abs") def mean_abs( self, a, @@ -277,7 +293,7 @@ def mean_abs( {{active_storage: `bool`, optional}} - {{chunk_function: function, optional}} ppp + {{chunk_function: callable, optional}} :Returns: @@ -295,6 +311,7 @@ def mean_abs( active_storage=False, ) + @active_storage("mid_range") def mid_range( self, a, @@ -332,7 +349,7 @@ def mid_range( {{active_storage: `bool`, optional}} - {{chunk_function: function, optional}} + {{chunk_function: callable, optional}} :Returns: @@ -401,7 +418,7 @@ def min( {{active_storage: `bool`, optional}} - {{chunk_function: function, optional}} + {{chunk_function: callable, optional}} :Returns: @@ -430,6 +447,7 @@ def min( meta=np.array((), dtype=dtype), ) + @active_storage("min_abs") def min_abs( self, a, @@ -466,7 +484,7 @@ def min_abs( {{active_storage: `bool`, optional}} - {{chunk_function: function, optional}} ppp + {{chunk_function: callable, optional}} :Returns: @@ -483,6 +501,7 @@ def min_abs( active_storage=False, ) + @active_storage("range") def range( self, a, @@ -519,7 +538,7 @@ def range( {{active_storage: `bool`, optional}} - {{chunk_function: function, optional}} + {{chunk_function: callable, optional}} :Returns: @@ -551,6 +570,7 @@ def range( meta=np.array((), dtype=dtype), ) + @active_storage("rms") def rms( self, a, @@ -590,7 +610,7 @@ def rms( {{active_storage: `bool`, optional}} - {{chunk_function: function, optional}} + {{chunk_function: callable, optional}} :Returns: @@ -619,6 +639,7 @@ def rms( 
weights=weights, ) + @active_storage("sample_size") def sample_size( self, a, @@ -655,7 +676,7 @@ def sample_size( {{active_storage: `bool`, optional}} - {{chunk_function: function, optional}} + {{chunk_function: callable, optional}} :Returns: @@ -727,7 +748,7 @@ def sum( {{active_storage: `bool`, optional}} - {{chunk_function: function, optional}} + {{chunk_function: callable, optional}} :Returns: @@ -759,6 +780,7 @@ def sum( weights=weights, ) + @active_storage("sum_of_weights") def sum_of_weights( self, a, @@ -798,7 +820,7 @@ def sum_of_weights( {{active_storage: `bool`, optional}} - {{chunk_function: function, optional}} + {{chunk_function: callable, optional}} :Returns: @@ -831,6 +853,7 @@ def sum_of_weights( weights=weights, ) + @active_storage("sum_of_weights2") def sum_of_weights2( self, a, @@ -870,7 +893,7 @@ def sum_of_weights2( {{active_storage: `bool`, optional}} - {{chunk_function: function, optional}} + {{chunk_function: callable, optional}} :Returns: @@ -903,6 +926,59 @@ def sum_of_weights2( weights=weights, ) + @active_storage("unique") + def unique( + self, a, split_every=None, active_storage=False, chunk_function=None + ): + """Return unique elements of the data. + + .. versionadded:: TODODASKVER + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{split_every: `int` or `dict`, optional}} + + {{active_storage: `bool`, optional}} + + {{chunk_function: callable, optional}} + + :Returns: + + `dask.array.Array` + The unique values in a 1-d array. + + """ + from .dask_collapse import cf_unique_agg, cf_unique_chunk + + if chunk_function is None: + chunk_function = cf_unique_chunk + + check_input_dtype(a, "fibUS") + + # Flatten the array so that it has the same number of + # dimensions as the result (i.e. 1). This ensures that the + # combination of `keepdims=True, output_size=np.nan` will + # result in a correct output chunk size `np.nan`. See + # `dask.array.reduction` for details. + a = a.flatten() + + dtype = a.dtype + return reduction( + a, + chunk_function, + cf_unique_agg, + keepdims=True, + output_size=np.nan, + dtype=dtype, + split_every=split_every, + concatenate=False, + meta=np.array((), dtype=dtype), + ) + + @active_storage("var") def var( self, a, @@ -945,7 +1021,7 @@ def var( {{active_storage: `bool`, optional}} - {{chunk_function: function, optional}} + {{chunk_function: callable, optional}} :Returns: @@ -973,54 +1049,3 @@ def var( meta=np.array((), dtype=dtype), weights=weights, ) - - def unique( - self, a, split_every=None, active_storage=False, chunk_function=None - ): - """Return unique elements of the data. - - .. versionadded:: TODODASKVER - - :Parameters: - - a: `dask.array.Array` - The array to be collapsed. - - {{split_every: `int` or `dict`, optional}} - - {{active_storage: `bool`, optional}} - - {{chunk_function: function, optional}} - - :Returns: - - `dask.array.Array` - The unique values in a 1-d array. - - """ - from .dask_collapse import cf_unique_agg, cf_unique_chunk - - if chunk_function is None: - chunk_function = cf_unique_chunk - - check_input_dtype(a, "fibUS") - - # Flatten the array so that it has the same number of - # dimensions as the result (i.e. 1). This ensures that the - # combination of `keepdims=True, output_size=np.nan` will - # result in a correct output chunk size `np.nan`. See - # `dask.array.reduction` for details. 
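The unique collapse relies on the fact that uniqueness composes: the unique values of the concatenated per-chunk unique values are the unique values of the whole array, which is also why the output chunk size can only be declared as ``np.nan``. A NumPy-only illustration (not part of the patch):

```python
import numpy as np

chunks = [np.array([3, 1, 2, 1]), np.array([2, 2, 5])]

# Chunk step: unique values per chunk, as in the unique chunk function
per_chunk = [np.unique(c) for c in chunks]

# Aggregate step: unique values of the concatenated partial results
print(np.unique(np.concatenate(per_chunk)))  # [1 2 3 5]
```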
- a = a.flatten() - - dtype = a.dtype - return reduction( - a, - chunk_function, - cf_unique_agg, - keepdims=True, - output_size=np.nan, - dtype=dtype, - split_every=split_every, - concatenate=False, - meta=np.array((), dtype=dtype), - ) diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index d0b52f60f5..82d2fdbf8b 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -28,16 +28,10 @@ def actify(a, method, axis=None): TODOACTIVEDOCS """ - chunk_function = None - # if not active_storage: - # # It has been determined externally that an active storage - # # reduction is not possible, so return the input data and - # # chunk function unchanged. - # return a, chunk_function - # - # # Still here? Then it is assumed that the dask array is of a form - # # which might be able to exploit active storage. In particular, it - # # is assumed that all data definitions point to files. + if method not in Active.methods: + # The given method is not recognised by `Active`, so return + # the input data unchanged. + return a, None # Parse axis if axis is None: @@ -48,12 +42,12 @@ def actify(a, method, axis=None): if len(axis) != a.ndim: # Can't (yet) use active storage to collapse a subset of - # the axes, so return the input data and chunk function - # unchanged. - return a, chunk_function + # the axes, so return the input data unchanged. + return a, None axis = validate_axis(axis, a.ndim) + filenames = set() active_chunk_functions = set() # Loop round elements of the dask graph, looking for data @@ -62,14 +56,13 @@ def actify(a, method, axis=None): # so that the data defintions come out first, allowing for a # faster short circuit when using active storage is not possible. # - # It is assumed that teh graph doesn't have many laters - i.e. it - # is assumed that this function is called only if has been - # deterimined extermanlly that it is sensible to do so. + # It is assumed that this `actify` has only been called if has + # been deterimined externally that it is sensible to do so. dsk = collections_to_dsk((a,), optimize_graph=True) for key, value in reversed(dsk.items()): try: - value.get_filename() + filenames.add(value.get_filename()) except AttributeError: # This value is not a data definition (it is assumed that # all data definitions point to files). @@ -98,20 +91,33 @@ def actify(a, method, axis=None): # actified data definition value. dsk[key] = value + for filename in filenames: + # TODOACTIVE: Check that Active(filename) supports active + # storage. I don't really know how this will work + # ... + if not OK: + # This file location does not support active storage, so + # return the input data unchanged. + return a, None + + # Still here? if len(active_chunk_functions) == 1: # All data definitions in the dask graph support active # storage reductions with the same chunk function => redefine # the array from the actified dask graph, and define the - # actified reduction chunk function. + # active storage reduction chunk function. a = da.Array(dsk, a.name, a.chunks, a.dtype, a._meta) chunk_function = active_chunk_functions.pop() + else: + chunk_function = None - # Return the dask array and chunk function. The array will either - # be identical to the input or, if it has been determined that - # active storage operation is possible, then it will have been - # replaced by its actified version. The chunk function will either - # be None, or the active storage chunk function provided by the - # data definitions in each chunk. 
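`actify` walks the optimised dask graph looking for values that behave like file-backed data definitions (anything exposing a ``get_filename`` method). The traversal itself can be tried on any dask array; with in-memory chunks, as in this sketch, nothing qualifies and every value is skipped:

```python
import dask.array as da
from dask.base import collections_to_dsk

x = da.ones((4, 4), chunks=2) + 1

dsk = collections_to_dsk((x,), optimize_graph=True)
for key, value in reversed(list(dsk.items())):
    # File-backed data definitions are assumed to expose get_filename();
    # plain in-memory chunks do not, so they would be skipped here
    print(key, hasattr(value, "get_filename"))
```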
+ # Return the dask array and chunk function. + # + # The array will either be identical to the input or, if it has + # been determined that active storage operations are possible, + # then it will have been replaced by its actified version. The + # chunk function will either be None, or the active storage chunk + # function provided by each chunks's data definition. return a, chunk_function @@ -121,6 +127,11 @@ def active_storage(method): .. versionadded:: TODOACTIVEVER + :Parameters: + + method: `str` + TODOACTIVEDOCS + """ def decorator(collapse_method): diff --git a/cf/docstring/docstring.py b/cf/docstring/docstring.py index 96d2ba665f..9e6b450d4f 100644 --- a/cf/docstring/docstring.py +++ b/cf/docstring/docstring.py @@ -332,6 +332,15 @@ value. A default can also be set globally with the ``split_every`` key in `dask.config`. See `dask.array.reduction` for details.""", + "{{active_storage: `bool`, optional}}": """{{active_storage: `bool`, optional}} + If True then attempt to perform the collapse using + active storage. If other necessary conditions are not + met (see `Collapse` for details) then the operation + will be executed without active storage.""", + "{{chunk_function: callable, optional}}": """{{chunk_function: callable, optional}} + Provides the ``chunk`` parameter to + `dask.array.reduction`. If unset then a default + function will be used.""", # Collapse weights "{{Collapse weights: data_like or `None`, optional}}": """weights: data_like or `None`, optional Weights associated with values of the array. By From 0825c565f0113ae591b90b629537a194eb9fc0bf Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 22 Nov 2022 16:42:44 +0000 Subject: [PATCH 007/134] dev --- cf/data/collapse/collapse.py | 10 +++++----- cf/data/collapse/collapse_active.py | 24 ++++++++++-------------- cf/data/collapse/collapse_utils.py | 1 + cf/docstring/docstring.py | 4 ++-- 4 files changed, 18 insertions(+), 21 deletions(-) diff --git a/cf/data/collapse/collapse.py b/cf/data/collapse/collapse.py index f4aae2428d..03fbd22794 100644 --- a/cf/data/collapse/collapse.py +++ b/cf/data/collapse/collapse.py @@ -15,16 +15,16 @@ class Collapse(metaclass=DocstringRewriteMeta): **Active storage** - Any collapse method (such as `max`, `max_abs`, etc.) will attempt - to make use use of active storage if: + A collapse method (such as `max`, `max_abs`, etc.) will attempt to + make use use of active storage if: * The collapse method's `active_storage` parameter is True. - * The method name is recognised by the `Active` class. + * The method's identity is recognised by the `Active` class. * The `Active` class recognioses the storage location as one that supports active storage operations. - If all of these conditions are passed but the dask array being - collapsed is, on inspection, not deemed suitable, then the + However, if all of these conditions are passed but the dask array + being collapsed is, on inspection, not deemed suitable, then the collapse operation will be executed without active storage. .. versionadded:: TODODASKVER diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index 82d2fdbf8b..feffa3e7a0 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -28,7 +28,7 @@ def actify(a, method, axis=None): TODOACTIVEDOCS """ - if method not in Active.methods: + if method not in Active.methods(): # The given method is not recognised by `Active`, so return # the input data unchanged. 
return a, None @@ -64,17 +64,20 @@ def actify(a, method, axis=None): try: filenames.add(value.get_filename()) except AttributeError: - # This value is not a data definition (it is assumed that - # all data definitions point to files). + # This value is not a data definition. + # + # Note: It is assumed that all data definitions point to + # files continue try: - # Create a new actified data definition value + # Create a new actified data definition value = value.actify(method, axis) except (AttributeError, ValueError): - # This data definition value does not support active - # storage reductions, or does not support the requested - # active storage reduction defined by 'method'. + # Either this data definition does not support active + # storage reductions (AttributeError), or it does not + # support the requested active storage reduction defined + # by 'method' (ValueError). active_chunk_functions = () break @@ -111,13 +114,6 @@ def actify(a, method, axis=None): else: chunk_function = None - # Return the dask array and chunk function. - # - # The array will either be identical to the input or, if it has - # been determined that active storage operations are possible, - # then it will have been replaced by its actified version. The - # chunk function will either be None, or the active storage chunk - # function provided by each chunks's data definition. return a, chunk_function diff --git a/cf/data/collapse/collapse_utils.py b/cf/data/collapse/collapse_utils.py index 392b9919ba..3f1009308c 100644 --- a/cf/data/collapse/collapse_utils.py +++ b/cf/data/collapse/collapse_utils.py @@ -1,3 +1,4 @@ +"""TODOACTIVEDOCS""" def double_precision_dtype(a, default=None, bool_type="i"): """Returns the corresponding double precision data type of an array. diff --git a/cf/docstring/docstring.py b/cf/docstring/docstring.py index 9e6b450d4f..eb67287edd 100644 --- a/cf/docstring/docstring.py +++ b/cf/docstring/docstring.py @@ -339,8 +339,8 @@ will be executed without active storage.""", "{{chunk_function: callable, optional}}": """{{chunk_function: callable, optional}} Provides the ``chunk`` parameter to - `dask.array.reduction`. If unset then a default - function will be used.""", + `dask.array.reduction`. If unset then an approriate + default function will be used.""", # Collapse weights "{{Collapse weights: data_like or `None`, optional}}": """weights: data_like or `None`, optional Weights associated with values of the array. By From 044ccc91e8871c03140a0611cd1b873f11f39cd2 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 8 Feb 2023 12:29:43 +0000 Subject: [PATCH 008/134] dev --- cf/data/collapse/collapse.py | 45 ++++++++++---------- cf/data/collapse/collapse_active.py | 27 ++++++------ cf/data/collapse/collapse_utils.py | 4 +- cf/data/collapse/dask_collapse.py | 66 ++++++++++++++--------------- cf/data/data.py | 36 ++++++++++++---- 5 files changed, 99 insertions(+), 79 deletions(-) diff --git a/cf/data/collapse/collapse.py b/cf/data/collapse/collapse.py index 03fbd22794..0676060d72 100644 --- a/cf/data/collapse/collapse.py +++ b/cf/data/collapse/collapse.py @@ -20,14 +20,15 @@ class Collapse(metaclass=DocstringRewriteMeta): * The collapse method's `active_storage` parameter is True. * The method's identity is recognised by the `Active` class. - * The `Active` class recognioses the storage location as one that + * The `Active` class recognises the storage location as one that supports active storage operations. 
- However, if all of these conditions are passed but the dask array - being collapsed is, on inspection, not deemed suitable, then the - collapse operation will be executed without active storage. + However, when all of these conditions are passed, the collapse + operation will *not* be executed with active storage if the + dask array is deemed, on inspection to be unsuitable. See the + `actify` function for details. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 """ @@ -40,7 +41,7 @@ def __docstring_substitutions__(self): See `_docstring_substitutions` for details. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 .. seealso:: `_docstring_substitutions` @@ -58,7 +59,7 @@ def __docstring_package_depth__(self): See `_docstring_package_depth` for details. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 """ return 0 @@ -83,7 +84,7 @@ def max( https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods for mathematical definitions. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: @@ -149,7 +150,7 @@ def max_abs( https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods for mathematical definitions. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: @@ -204,7 +205,7 @@ def mean( https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods for mathematical definitions. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: @@ -274,7 +275,7 @@ def mean_abs( https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods for mathematical definitions. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: @@ -332,7 +333,7 @@ def mid_range( https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods for mathematical definitions. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: @@ -401,7 +402,7 @@ def min( https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods for mathematical definitions. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: @@ -467,7 +468,7 @@ def min_abs( https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods for mathematical definitions. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: @@ -521,7 +522,7 @@ def range( https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods for mathematical definitions. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: @@ -591,7 +592,7 @@ def rms( https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods for mathematical definitions. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: @@ -659,7 +660,7 @@ def sample_size( https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods for mathematical definitions. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: @@ -729,7 +730,7 @@ def sum( https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods for mathematical definitions. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: @@ -801,7 +802,7 @@ def sum_of_weights( https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods for mathematical definitions. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: @@ -874,7 +875,7 @@ def sum_of_weights2( https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods for mathematical definitions. - .. versionadded:: TODODASKVER + .. 
versionadded:: 3.14.0 :Parameters: @@ -932,7 +933,7 @@ def unique( ): """Return unique elements of the data. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: @@ -1000,7 +1001,7 @@ def var( https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods for mathematical definitions. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index feffa3e7a0..dd89347303 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -1,9 +1,4 @@ from functools import wraps -from numbers import Integral - -import dask.array as da -from dask.array.utils import validate_axis -from dask.base import collections_to_dsk def actify(a, method, axis=None): @@ -28,6 +23,12 @@ def actify(a, method, axis=None): TODOACTIVEDOCS """ + from numbers import Integral + + import dask.array as da + from dask.base import collections_to_dsk + from dask.array.utils import validate_axis + if method not in Active.methods(): # The given method is not recognised by `Active`, so return # the input data unchanged. @@ -36,7 +37,7 @@ def actify(a, method, axis=None): # Parse axis if axis is None: axis = tuple(range(a.ndim)) - else: + else if isinstance(axis, Integral): axis = (axis,) @@ -56,18 +57,17 @@ def actify(a, method, axis=None): # so that the data defintions come out first, allowing for a # faster short circuit when using active storage is not possible. # - # It is assumed that this `actify` has only been called if has - # been deterimined externally that it is sensible to do so. - + # It is assumed that `actify` has only been called if has been + # deterimined externally that it is sensible to do so. This will + # be the case if an only if the parent `Data` instance's + # `active_storage` attribute is `True`. dsk = collections_to_dsk((a,), optimize_graph=True) for key, value in reversed(dsk.items()): try: filenames.add(value.get_filename()) except AttributeError: - # This value is not a data definition. - # - # Note: It is assumed that all data definitions point to - # files + # This value is not a data definition. Note: It is assumed + # that all data definitions point to files. continue try: @@ -129,7 +129,6 @@ def active_storage(method): TODOACTIVEDOCS """ - def decorator(collapse_method): @wraps(collapse_method) def wrapper(self, *args, **kwargs): diff --git a/cf/data/collapse/collapse_utils.py b/cf/data/collapse/collapse_utils.py index 3f1009308c..c66bc21e4e 100644 --- a/cf/data/collapse/collapse_utils.py +++ b/cf/data/collapse/collapse_utils.py @@ -2,7 +2,7 @@ def double_precision_dtype(a, default=None, bool_type="i"): """Returns the corresponding double precision data type of an array. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: @@ -59,7 +59,7 @@ def check_input_dtype(a, allowed="fib"): The collapse method is assumed to be defined by the calling function. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: diff --git a/cf/data/collapse/dask_collapse.py b/cf/data/collapse/dask_collapse.py index 3930dc711d..cb7f710c98 100644 --- a/cf/data/collapse/dask_collapse.py +++ b/cf/data/collapse/dask_collapse.py @@ -20,7 +20,7 @@ def mask_small_sample_size(x, N, axis, mtol, original_shape): """Mask elements where the sample size is below a threshold. - .. versionadded:: TODODASKVER + .. 
versionadded:: 3.14.0 :Parameters: @@ -78,7 +78,7 @@ def mask_small_sample_size(x, N, axis, mtol, original_shape): def sum_weights_chunk(x, weights=None, square=False, N=None, **kwargs): """Sum the weights. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: @@ -140,7 +140,7 @@ def combine_arrays( See `dask.array.reductions.mean_combine` for an example. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Returns: @@ -159,7 +159,7 @@ def combine_arrays( def sum_arrays(pairs, key, axis, dtype, computing_meta=False, **kwargs): """Alias of `combine_arrays` with ``func=chunk.sum``. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 """ return combine_arrays( @@ -170,7 +170,7 @@ def sum_arrays(pairs, key, axis, dtype, computing_meta=False, **kwargs): def max_arrays(pairs, key, axis, dtype, computing_meta=False, **kwargs): """Alias of `combine_arrays` with ``func=chunk.max``. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 """ return combine_arrays( @@ -181,7 +181,7 @@ def max_arrays(pairs, key, axis, dtype, computing_meta=False, **kwargs): def min_arrays(pairs, key, axis, dtype, computing_meta=False, **kwargs): """Alias of `combine_arrays` with ``func=chunk.min``. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 """ return combine_arrays( @@ -193,7 +193,7 @@ def sum_sample_sizes(pairs, axis, computing_meta=False, **kwargs): """Alias of `combine_arrays` with ``key="N", func=chunk.sum, dtype="i8"``. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 """ return combine_arrays( @@ -216,7 +216,7 @@ def cf_mean_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): This function is passed to `dask.array.reduction` as its *chunk* parameter. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: @@ -254,7 +254,7 @@ def cf_mean_combine( This function is passed to `dask.array.reduction` as its *combine* parameter. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: @@ -298,7 +298,7 @@ def cf_mean_agg( This function is passed to `dask.array.reduction` as its *aggregate* parameter. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: @@ -337,7 +337,7 @@ def cf_max_chunk(x, dtype=None, computing_meta=False, **kwargs): This function is passed to `dask.array.reduction` as its *chunk* parameter. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: @@ -367,7 +367,7 @@ def cf_max_combine(pairs, axis=None, computing_meta=False, **kwargs): This function is passed to `dask.array.reduction` as its *combine* parameter. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: @@ -401,7 +401,7 @@ def cf_max_agg( This function is passed to `dask.array.reduction` as its *aggregate* parameter. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: @@ -448,7 +448,7 @@ def cf_mid_range_agg( This function is passed to `dask.array.reduction` as its *aggregate* parameter. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: @@ -488,7 +488,7 @@ def cf_min_chunk(x, dtype=None, computing_meta=False, **kwargs): This function is passed to `dask.array.reduction` as its *chunk* parameter. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: @@ -518,7 +518,7 @@ def cf_min_combine(pairs, axis=None, computing_meta=False, **kwargs): This function is passed to `dask.array.reduction` as its *combine* parameter. - .. versionadded:: TODODASKVER + .. 
versionadded:: 3.14.0 :Parameters: @@ -552,7 +552,7 @@ def cf_min_agg( This function is passed to `dask.array.reduction` as its *aggregate* parameter. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: @@ -591,7 +591,7 @@ def cf_range_chunk(x, dtype=None, computing_meta=False, **kwargs): This function is passed to `dask.array.reduction` as its *chunk* parameter. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: @@ -625,7 +625,7 @@ def cf_range_combine( This function is passed to `dask.array.reduction` as its *combine* parameter. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: @@ -661,7 +661,7 @@ def cf_range_agg( This function is passed to `dask.array.reduction` as its *aggregate* parameter. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: @@ -701,7 +701,7 @@ def cf_rms_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): This function is passed to `dask.array.reduction` as its *chunk* parameter. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: @@ -738,7 +738,7 @@ def cf_rms_agg( This function is passed to `dask.array.reduction` as its *aggregate* parameter. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: @@ -777,7 +777,7 @@ def cf_sample_size_chunk(x, dtype="i8", computing_meta=False, **kwargs): This function is passed to `dask.array.reduction` as its *chunk* parameter. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: @@ -813,7 +813,7 @@ def cf_sample_size_combine( This function is passed to `dask.array.reduction` as its *combine* parameter. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: @@ -848,7 +848,7 @@ def cf_sample_size_agg( This function is passed to `dask.array.reduction` as its *aggregate* parameter. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: @@ -887,7 +887,7 @@ def cf_sum_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): This function is passed to `dask.array.reduction` as its *chunk* parameter. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: @@ -921,7 +921,7 @@ def cf_sum_combine( This function is passed to `dask.array.reduction` as its *combine* parameter. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: @@ -956,7 +956,7 @@ def cf_sum_agg( This function is passed to `dask.array.reduction` as its *aggregate* parameter. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: @@ -1038,7 +1038,7 @@ def cf_unique_chunk(x, dtype=None, computing_meta=False, **kwargs): This function is passed to `dask.array.reduction` as its *chunk* parameter. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: @@ -1066,7 +1066,7 @@ def cf_unique_agg(pairs, axis=None, computing_meta=False, **kwargs): It is assumed that the arrays are one-dimensional. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: @@ -1105,7 +1105,7 @@ def cf_var_chunk( https://en.wikipedia.org/wiki/Pooled_variance#Sample-based_statistics for details. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: @@ -1177,7 +1177,7 @@ def cf_var_combine( This function is passed to `dask.array.reduction` as its *combine* parameter. - .. versionadded:: TODODASKVER + .. versionadded:: 3.14.0 :Parameters: @@ -1236,7 +1236,7 @@ def cf_var_agg( https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Reliability_weights for details. - .. versionadded:: TODODASKVER + .. 
versionadded:: 3.14.0 :Parameters: diff --git a/cf/data/data.py b/cf/data/data.py index 1c8b49c7e7..8e54fffa60 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -89,6 +89,14 @@ _DEFAULT_CHUNKS = "auto" _DEFAULT_HARDMASK = True +# +_NONE = 0 +_ARRAY = 1 +_CACHE = 2 +_CFA = 4 +_ACTIVE = 8 +_ALL = _ARRAY | _CACHE | _CFA | _ACTIVE + class Data(DataClassDeprecationsMixin, Container, cfdm.Data): """An N-dimensional data array with units and masked values. @@ -635,13 +643,17 @@ def _rtol(self): def _is_file_array(self, array): """Whether or not an array is stored on disk. + .. versionaddedd: TODOACTIVEVER + :Parameters: array: + TODOACTIVEDOCS :Returns: `bool` + TODOACTIVEDOCS """ return isinstance(array, FileArrayMixin) @@ -1259,7 +1271,7 @@ def _del_active_storage(self): .. versionadded:: TODOACTIVEVER - .. seealso:: `_set_active_storage` + .. seealso:: `active_storage`, `_set_active_storage` :Returns: @@ -1278,14 +1290,14 @@ def _del_active_storage(self): False """ - self._custom.pop("active_storage", None) + self._custom.pop("active_storage", False) def _set_active_storage(self, value): """TODOACTIVEDOCS. .. versionadded:: TODOACTIVEVER - .. seealso:: `_del_active_storage` + .. seealso:: `active_storage`, `_del_active_storage` :Returns: @@ -3649,11 +3661,19 @@ def concatenate(cls, data, axis=0, _preserve=True): processed_data.append(data1) # Get data as dask arrays and apply concatenation operation - dxs = [] - for data1 in processed_data: - dxs.append(data1.to_dask_array()) - - data0._set_dask(da.concatenate(dxs, axis=axis)) + dxs = [d.to_dask_array() for d in processed_data] + dx = da.concatenate(dxs, axis=axis) + + # Set the active storage status + active = _NONE + for d in processed_data: + if not d.active_storage(): + # Set the output active storage status to False when any + # input data instance has False status + active = _ACTIVE + break + + data0._set_dask(dx, conform=_ALL ^ active) # Manage cyclicity of axes: if join axis was cyclic, it is no longer axis = data0._parse_axes(axis)[0] From f5d2834eba83f1fc5e7fd9ae06fcf9956af7967f Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 9 Feb 2023 08:32:46 +0000 Subject: [PATCH 009/134] dev --- cf/data/collapse/collapse_active.py | 7 ++++--- cf/data/collapse/collapse_utils.py | 4 +++- cf/data/data.py | 16 ++++++++-------- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index dd89347303..33fe0f828b 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -24,10 +24,10 @@ def actify(a, method, axis=None): """ from numbers import Integral - + import dask.array as da - from dask.base import collections_to_dsk from dask.array.utils import validate_axis + from dask.base import collections_to_dsk if method not in Active.methods(): # The given method is not recognised by `Active`, so return @@ -37,7 +37,7 @@ def actify(a, method, axis=None): # Parse axis if axis is None: axis = tuple(range(a.ndim)) - else + else: if isinstance(axis, Integral): axis = (axis,) @@ -129,6 +129,7 @@ def active_storage(method): TODOACTIVEDOCS """ + def decorator(collapse_method): @wraps(collapse_method) def wrapper(self, *args, **kwargs): diff --git a/cf/data/collapse/collapse_utils.py b/cf/data/collapse/collapse_utils.py index c66bc21e4e..875d2e3ab7 100644 --- a/cf/data/collapse/collapse_utils.py +++ b/cf/data/collapse/collapse_utils.py @@ -1,4 +1,6 @@ -"""TODOACTIVEDOCS""" +"""TODOACTIVEDOCS.""" + + def double_precision_dtype(a, 
default=None, bool_type="i"): """Returns the corresponding double precision data type of an array. diff --git a/cf/data/data.py b/cf/data/data.py index 8e54fffa60..34d22bddca 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -644,7 +644,7 @@ def _is_file_array(self, array): """Whether or not an array is stored on disk. .. versionaddedd: TODOACTIVEVER - + :Parameters: array: @@ -2348,8 +2348,8 @@ def persist(self, inplace=False): dx = self.to_dask_array() dx = dx.persist() - d._set_dask(dx, conform=False) - d._del_active_storage() + d._set_dask(dx, conform=False) # TODOACTIVE + d._del_active_storage() # TODOACTIVE return d @@ -2919,8 +2919,8 @@ def rechunk( dx = d.to_dask_array() dx = dx.rechunk(chunks, threshold, block_size_limit, balance) - d._set_dask(dx, conform=False) - d._del_active_storage() + d._set_dask(dx, conform=False) # TODOACTIVE + d._del_active_storage() # TODOACTIVE return d @@ -3663,14 +3663,14 @@ def concatenate(cls, data, axis=0, _preserve=True): # Get data as dask arrays and apply concatenation operation dxs = [d.to_dask_array() for d in processed_data] dx = da.concatenate(dxs, axis=axis) - + # Set the active storage status - active = _NONE + active = _ACTIVE for d in processed_data: if not d.active_storage(): # Set the output active storage status to False when any # input data instance has False status - active = _ACTIVE + active = _NONE break data0._set_dask(dx, conform=_ALL ^ active) From 02ce7b759824c68a54dc9a68c1c2c5429405c0d1 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 9 Feb 2023 12:01:44 +0000 Subject: [PATCH 010/134] dev --- cf/data/array/mixin/activestoragemixin.py | 332 +++++++++++----------- cf/data/collapse/collapse_active.py | 193 +++++++++++-- cf/data/fragment/netcdffragmentarray.py | 5 +- 3 files changed, 340 insertions(+), 190 deletions(-) diff --git a/cf/data/array/mixin/activestoragemixin.py b/cf/data/array/mixin/activestoragemixin.py index b705aedea9..a84d29c5bf 100644 --- a/cf/data/array/mixin/activestoragemixin.py +++ b/cf/data/array/mixin/activestoragemixin.py @@ -38,38 +38,40 @@ def __getitem__(self, indices): return active[indices] - def _active_chunk_functions(self): - """Mapping of method names to active chunk functions. - - .. versionadded:: TODOACTIVEVER - - :Returns: - - `dict` - The mapping. - - """ - return { - "min": self.active_min, - "max": self.active_max, - "mean": self.active_mean, - "sum": self.active_sum, - } +# def _active_chunk_functions(self): +# """Mapping of method names to active chunk functions. +# +# .. versionadded:: TODOACTIVEVER +# +# :Returns: +# +# `dict` +# The mapping. +# +# """ +# return { +# "min": self.active_min, +# "max": self.active_max, +# "mean": self.active_mean, +# "sum": self.active_sum, +# } def actify(self, method, axis=None): """Return a new actified `{{class}}` instance. - The new instance is a deep copy of the original, including the - definitions of the active storage method and axis. + The new instance is a deep copy of the original, with the + additional setting of the active storage method and axis. .. versionadded:: TODOACTIVEVER + .. 
seealso:: `set_active_axis`, `set_active_method` + :Parameters: method: `str` TODOACTIVEDOCS - axis: (sequence of) `int`, optional + axis: `None` or (sequence of) `int`, optional TODOACTIVEDOCS :Returns: @@ -78,146 +80,150 @@ def actify(self, method, axis=None): TODOACTIVEDOCS """ - if method not in self._active_chunk_functions(): - raise ValueError(f"Invalid active storage operation: {method!r}") +# if method not in self._active_chunk_functions(): +# raise ValueError(f"Invalid active storage operation: {method!r}") a = self.copy() a.set_active_method(method) a.set_active_axis(axis) return a - @staticmethod - def active_min(a, **kwargs): - """Chunk calculations for the minimum. - - Assumes that the calculations have already been done, - i.e. that *a* is already the minimum. - - This function is intended to be passed to - `dask.array.reduction` as the ``chunk`` parameter. Its return - signature must be the same as the non-active chunk function - that it is replacing. - - .. versionadded:: TODOACTIVEVER - - :Parameters: - - a: `dict` - TODOACTIVEDOCS - - :Returns: - - `dict` - Dictionary with the keys: - - * N: The sample size. - * min: The minimum of `a``. - - """ - return {"N": a["n"], "min": a["min"]} - - @staticmethod - def active_max(a, **kwargs): - """Chunk calculations for the maximum. - - Assumes that the calculations have already been done, - i.e. that *a* is already the maximum. - - This function is intended to be passed to - `dask.array.reduction` as the ``chunk`` parameter. Its return - signature must be the same as the non-active chunk function - that it is replacing. - - .. versionadded:: TODOACTIVEVER - - :Parameters: - - a: `dict` - TODOACTIVEDOCS - - :Returns: - - `dict` - Dictionary with the keys: - - * N: The sample size. - * max: The maximum of `a``. - - """ - return {"N": a["n"], "max": a["max"]} - - @staticmethod - def active_mean(a, **kwargs): - """Chunk calculations for the unweighted mean. - - Assumes that the calculations have already been done, - i.e. that *a* is already the uweighted mean. - - This function is intended to be passed to - `dask.array.reduction` as the ``chunk`` parameter. Its return - signature must be the same as the non-active chunk function - that it is replacing. - - .. versionadded:: TODOACTIVEVER - - :Parameters: - - a: `dict` - TODOACTIVEDOCS - - :Returns: - - `dict` - Dictionary with the keys: - - * N: The sample size. - * V1: The sum of ``weights``. Equal to ``N`` because - weights have not been set. - * sum: The weighted sum of ``a``. - * weighted: True if weights have been set. Always - False. - - """ - return {"N": a["n"], "V1": a["n"], "sum": a["sum"], "weighted": False} - - @staticmethod - def active_sum(a, **kwargs): - """Chunk calculations for the unweighted sum. - - Assumes that the calculations have already been done, - i.e. that *a* is already the uweighted sum. - - This function is intended to be passed to - `dask.array.reduction` as the ``chunk`` parameter. Its return - signature must be the same as the non-active chunk function - that it is replacing. - - .. versionadded:: TODOACTIVEVER - - :Parameters: - - a: `dict` - TODOACTIVEDOCS - - :Returns: - - `dict` - Dictionary with the keys: - - * N: The sample size. - * sum: The weighted sum of ``a`` - - """ - return {"N": a["n"], "sum": a["sum"]} +# @staticmethod +# def active_min(a, **kwargs): +# """Chunk calculations for the minimum. +# +# Assumes that the calculations have already been done, +# i.e. that *a* is already the minimum. 
+# +# This function is intended to be passed to +# `dask.array.reduction` as the ``chunk`` parameter. Its return +# signature must be the same as the non-active chunk function +# that it is replacing. +# +# .. versionadded:: TODOACTIVEVER +# +# :Parameters: +# +# a: `dict` +# TODOACTIVEDOCS +# +# :Returns: +# +# `dict` +# Dictionary with the keys: +# +# * N: The sample size. +# * min: The minimum of `a``. +# +# """ +# return {"N": a["n"], "min": a["min"]} +# +# @staticmethod +# def active_max(a, **kwargs): +# """Chunk calculations for the maximum. +# +# Assumes that the calculations have already been done, +# i.e. that *a* is already the maximum. +# +# This function is intended to be passed to +# `dask.array.reduction` as the ``chunk`` parameter. Its return +# signature must be the same as the non-active chunk function +# that it is replacing. +# +# .. versionadded:: TODOACTIVEVER +# +# :Parameters: +# +# a: `dict` +# TODOACTIVEDOCS +# +# :Returns: +# +# `dict` +# Dictionary with the keys: +# +# * N: The sample size. +# * max: The maximum of `a``. +# +# """ +# return {"N": a["n"], "max": a["max"]} +# +# @staticmethod +# def active_mean(a, **kwargs): +# """Chunk calculations for the unweighted mean. +# +# Assumes that the calculations have already been done, +# i.e. that *a* is already the uweighted mean. +# +# This function is intended to be passed to +# `dask.array.reduction` as the ``chunk`` parameter. Its return +# signature must be the same as the non-active chunk function +# that it is replacing. +# +# .. versionadded:: TODOACTIVEVER +# +# :Parameters: +# +# a: `dict` +# TODOACTIVEDOCS +# +# :Returns: +# +# `dict` +# Dictionary with the keys: +# +# * N: The sample size. +# * V1: The sum of ``weights``. Equal to ``N`` because +# weights have not been set. +# * sum: The weighted sum of ``a``. +# * weighted: True if weights have been set. Always +# False. +# +# """ +# return {"N": a["n"], "V1": a["n"], "sum": a["sum"], "weighted": False} +# +# @staticmethod +# def active_sum(a, **kwargs): +# """Chunk calculations for the unweighted sum. +# +# Assumes that the calculations have already been done, +# i.e. that *a* is already the uweighted sum. +# +# This function is intended to be passed to +# `dask.array.reduction` as the ``chunk`` parameter. Its return +# signature must be the same as the non-active chunk function +# that it is replacing. +# +# .. versionadded:: TODOACTIVEVER +# +# :Parameters: +# +# a: `dict` +# TODOACTIVEDOCS +# +# :Returns: +# +# `dict` +# Dictionary with the keys: +# +# * N: The sample size. +# * sum: The weighted sum of ``a`` +# +# """ +# return {"N": a["n"], "sum": a["sum"]} def get_active_method(self): """TODOACTIVEDOC. .. versionadded:: TODOACTIVEVER + .. seealso:: `set_active_method` + :Returns: - TODOACTIVEDOC + `str` or `None` + The name of the active reduction method, or `None` if + one hasn't been set. """ return self._custom.get("active_method") @@ -227,6 +233,8 @@ def get_active_axis(self): .. versionadded:: TODOACTIVEVER + .. seealso:: `set_active_axis` + :Returns: TODOACTIVEDOC @@ -234,26 +242,28 @@ def get_active_axis(self): """ return self._custom.get("active_axis") - def get_active_chunk_function(self): - """TODOACTIVEDOC. - - .. versionadded:: TODOACTIVEVER - - :Returns: - - TODOACTIVEDOC - - """ - try: - return self._active_chunk_functions()[self.get_active_method()] - except KeyError: - raise ValueError("no active storage operation has been set") +# def get_active_chunk_function(self): +# """TODOACTIVEDOC. +# +# .. 
versionadded:: TODOACTIVEVER +# +# :Returns: +# +# TODOACTIVEDOC +# +# """ +# try: +# return self._active_chunk_functions()[self.get_active_method()] +# except KeyError: +# raise ValueError("no active storage operation has been set") def set_active_method(self, value): """TODOACTIVEDOC. .. versionadded:: TODOACTIVEVER + .. seealso:: `get_active_method` + :Parameters: TODOACTIVEDOCS @@ -270,6 +280,8 @@ def set_active_axis(self, value): .. versionadded:: TODOACTIVEVER + .. seealso:: `get_active_axis` + :Parameters: TODOACTIVEDOCS diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index 33fe0f828b..07bf2be18f 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -1,6 +1,137 @@ from functools import wraps +def active_min(a, **kwargs): + """Chunk calculations for the minimum. + + TODO Assumes that the calculations have already been done, i.e. that + *a* is already the minimum. + + This function is intended to be passed to `dask.array.reduction` + as the ``chunk`` parameter. Its return signature must be the same + as the non-active chunk function that it is replacing. + + .. versionadded:: TODOACTIVEVER + + :Parameters: + + a: `dict` + TODOACTIVEDOCS + + kwargs: optional + TODOACTIVEDOCS + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + * min: The minimum of `a``. + + """ + return {"N": a["n"], "min": a["min"]} + + +def active_max(a, **kwargs): + """Chunk calculations for the maximum. + + TODO Assumes that the calculations have already been done, + i.e. that *a* is already the maximum. + + This function is intended to be passed to `dask.array.reduction` + as the ``chunk`` parameter. Its return signature must be the same + as the non-active chunk function that it is replacing. + + .. versionadded:: TODOACTIVEVER + + :Parameters: + + a: `dict` + TODOACTIVEDOCS + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + * max: The maximum of `a``. + + """ + return {"N": a["n"], "max": a["max"]} + + +def active_mean(a, **kwargs): + """Chunk calculations for the unweighted mean. + + TODO Assumes that the calculations have already been done, + i.e. that *a* is already the uweighted mean. + + This function is intended to be passed to `dask.array.reduction` + as the ``chunk`` parameter. Its return signature must be the same + as the non-active chunk function that it is replacing. + + .. versionadded:: TODOACTIVEVER + + :Parameters: + + a: `dict` + TODOACTIVEDOCS + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + * V1: The sum of ``weights``. Equal to ``N`` because + weights have not been set. + * sum: The weighted sum of ``a``. + * weighted: True if weights have been set. Always + False. + + """ + return {"N": a["n"], "V1": a["n"], "sum": a["sum"], "weighted": False} + + +def active_sum(a, **kwargs): + """Chunk calculations for the unweighted sum. + + TODO Assumes that the calculations have already been done, + i.e. that *a* is already the uweighted sum. + + This function is intended to be passed to `dask.array.reduction` + as the ``chunk`` parameter. Its return signature must be the same + as the non-active chunk function that it is replacing. + + .. versionadded:: TODOACTIVEVER + + :Parameters: + + a: `dict` + TODOACTIVEDOCS + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. 
+ * sum: The weighted sum of ``a`` + + """ + return {"N": a["n"], "sum": a["sum"]} + + +_active_chunk_functions = { + "min": active_min, + "max": active_max, + "mean": active_mean, + "sum": active_sum, +} + + def actify(a, method, axis=None): """TODOACTIVEDOCS. @@ -29,7 +160,7 @@ def actify(a, method, axis=None): from dask.array.utils import validate_axis from dask.base import collections_to_dsk - if method not in Active.methods(): + if not (method in _active_chunk_functions or method in Active.methods()): # The given method is not recognised by `Active`, so return # the input data unchanged. return a, None @@ -49,7 +180,7 @@ def actify(a, method, axis=None): axis = validate_axis(axis, a.ndim) filenames = set() - active_chunk_functions = set() + chunk_functions = set() # Loop round elements of the dask graph, looking for data # definitions that point to a file and which support active @@ -58,38 +189,44 @@ def actify(a, method, axis=None): # faster short circuit when using active storage is not possible. # # It is assumed that `actify` has only been called if has been - # deterimined externally that it is sensible to do so. This will - # be the case if an only if the parent `Data` instance's - # `active_storage` attribute is `True`. + # deterimined externally that it is sensible to do so. A + # necessary, but not sufficient, condition for this will is the + # parent `Data` instance's `active_storage` attribute being `True`. + ok_to_actify = True dsk = collections_to_dsk((a,), optimize_graph=True) for key, value in reversed(dsk.items()): try: filenames.add(value.get_filename()) - except AttributeError: - # This value is not a data definition. Note: It is assumed - # that all data definitions point to files. + except AttributeError: + if hasattr(value, "get_full_value"): + # This value is a constant fragment (such as might + # arise from CFA aggregated data), which precludes the + # use of active stoarge. +# chunk_functions = () + ok_to_actify = False + break + continue + # Still here? Then this value is a file fragment, so try to actify it. try: - # Create a new actified data definition value = value.actify(method, axis) - except (AttributeError, ValueError): - # Either this data definition does not support active - # storage reductions (AttributeError), or it does not - # support the requested active storage reduction defined - # by 'method' (ValueError). - active_chunk_functions = () - break - - try: - # Get the active storage chunk function - active_chunk_functions.add(value.get_active_chunk_function()) except AttributeError: - # This data definition value does not support active - # storage reductions - active_chunk_functions = () + # This file fragment does not support active storage + # reductions +# chunk_functions = () + ok_to_actify = False break +# try: + # Get the active storage chunk function + chunk_functions.add(_active_chunk_functions[method]) +# except AttributeError: +# # This data definition value does not support active +# # storage reductions +# chunk_functions = () +# break + # Still here? Then update the dask graph dictionary with the # actified data definition value. dsk[key] = value @@ -104,13 +241,13 @@ def actify(a, method, axis=None): return a, None # Still here? 
- if len(active_chunk_functions) == 1: + if ok_to_actify: #len(chunk_functions) == 1: # All data definitions in the dask graph support active - # storage reductions with the same chunk function => redefine - # the array from the actified dask graph, and define the - # active storage reduction chunk function. + # storage reductions => redefine the array from the actified + # dask graph, and define the active storage reduction chunk + # function. a = da.Array(dsk, a.name, a.chunks, a.dtype, a._meta) - chunk_function = active_chunk_functions.pop() + chunk_function = _active_chunk_functions[method] #chunk_functions.pop() else: chunk_function = None diff --git a/cf/data/fragment/netcdffragmentarray.py b/cf/data/fragment/netcdffragmentarray.py index 5fbe164398..26de07134a 100644 --- a/cf/data/fragment/netcdffragmentarray.py +++ b/cf/data/fragment/netcdffragmentarray.py @@ -1,8 +1,9 @@ +from ..array.mixin import ActiveStorageMixin from ..array.netcdfarray import NetCDFArray from .abstract import FragmentArray -class NetCDFFragmentArray(FragmentArray): +class NetCDFFragmentArray(ActiveStorageMixin, FragmentArray): """A CFA fragment array stored in a netCDF file. .. versionadded:: TODODASKVER @@ -118,7 +119,7 @@ def __getitem__(self, indices): differences: * A dimension's index can't be rank-reducing, i.e. it can't - be an integer, nor a scalar `numpy` or `dask` array. + be an integer, a scalar `numpy`, nor a `dask` array. * When two or more dimension's indices are sequences of integers then these indices work independently along each From 7dff9a0c8402944e481cdb337ea7cc68671458e7 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 10 Feb 2023 09:04:58 +0000 Subject: [PATCH 011/134] linting --- cf/data/array/mixin/activestoragemixin.py | 312 +++++++++++----------- cf/data/array/netcdfarray.py | 5 +- cf/data/collapse/collapse.py | 4 +- cf/data/collapse/collapse_active.py | 26 +- 4 files changed, 174 insertions(+), 173 deletions(-) diff --git a/cf/data/array/mixin/activestoragemixin.py b/cf/data/array/mixin/activestoragemixin.py index 98e150c96f..4378ee6375 100644 --- a/cf/data/array/mixin/activestoragemixin.py +++ b/cf/data/array/mixin/activestoragemixin.py @@ -37,23 +37,23 @@ def __getitem__(self, indices): active.components = True return active[indices] -# def _active_chunk_functions(self): -# """Mapping of method names to active chunk functions. -# -# .. versionadded:: TODOACTIVEVER -# -# :Returns: -# -# `dict` -# The mapping. -# -# """ -# return { -# "min": self.active_min, -# "max": self.active_max, -# "mean": self.active_mean, -# "sum": self.active_sum, -# } + # def _active_chunk_functions(self): + # """Mapping of method names to active chunk functions. + # + # .. versionadded:: TODOACTIVEVER + # + # :Returns: + # + # `dict` + # The mapping. + # + # """ + # return { + # "min": self.active_min, + # "max": self.active_max, + # "mean": self.active_mean, + # "sum": self.active_sum, + # } def actify(self, method, axis=None): """Return a new actified `{{class}}` instance. @@ -79,137 +79,137 @@ def actify(self, method, axis=None): TODOACTIVEDOCS """ -# if method not in self._active_chunk_functions(): -# raise ValueError(f"Invalid active storage operation: {method!r}") + # if method not in self._active_chunk_functions(): + # raise ValueError(f"Invalid active storage operation: {method!r}") a = self.copy() a.set_active_method(method) a.set_active_axis(axis) return a -# @staticmethod -# def active_min(a, **kwargs): -# """Chunk calculations for the minimum. 
-# -# Assumes that the calculations have already been done, -# i.e. that *a* is already the minimum. -# -# This function is intended to be passed to -# `dask.array.reduction` as the ``chunk`` parameter. Its return -# signature must be the same as the non-active chunk function -# that it is replacing. -# -# .. versionadded:: TODOACTIVEVER -# -# :Parameters: -# -# a: `dict` -# TODOACTIVEDOCS -# -# :Returns: -# -# `dict` -# Dictionary with the keys: -# -# * N: The sample size. -# * min: The minimum of `a``. -# -# """ -# return {"N": a["n"], "min": a["min"]} -# -# @staticmethod -# def active_max(a, **kwargs): -# """Chunk calculations for the maximum. -# -# Assumes that the calculations have already been done, -# i.e. that *a* is already the maximum. -# -# This function is intended to be passed to -# `dask.array.reduction` as the ``chunk`` parameter. Its return -# signature must be the same as the non-active chunk function -# that it is replacing. -# -# .. versionadded:: TODOACTIVEVER -# -# :Parameters: -# -# a: `dict` -# TODOACTIVEDOCS -# -# :Returns: -# -# `dict` -# Dictionary with the keys: -# -# * N: The sample size. -# * max: The maximum of `a``. -# -# """ -# return {"N": a["n"], "max": a["max"]} -# -# @staticmethod -# def active_mean(a, **kwargs): -# """Chunk calculations for the unweighted mean. -# -# Assumes that the calculations have already been done, -# i.e. that *a* is already the uweighted mean. -# -# This function is intended to be passed to -# `dask.array.reduction` as the ``chunk`` parameter. Its return -# signature must be the same as the non-active chunk function -# that it is replacing. -# -# .. versionadded:: TODOACTIVEVER -# -# :Parameters: -# -# a: `dict` -# TODOACTIVEDOCS -# -# :Returns: -# -# `dict` -# Dictionary with the keys: -# -# * N: The sample size. -# * V1: The sum of ``weights``. Equal to ``N`` because -# weights have not been set. -# * sum: The weighted sum of ``a``. -# * weighted: True if weights have been set. Always -# False. -# -# """ -# return {"N": a["n"], "V1": a["n"], "sum": a["sum"], "weighted": False} -# -# @staticmethod -# def active_sum(a, **kwargs): -# """Chunk calculations for the unweighted sum. -# -# Assumes that the calculations have already been done, -# i.e. that *a* is already the uweighted sum. -# -# This function is intended to be passed to -# `dask.array.reduction` as the ``chunk`` parameter. Its return -# signature must be the same as the non-active chunk function -# that it is replacing. -# -# .. versionadded:: TODOACTIVEVER -# -# :Parameters: -# -# a: `dict` -# TODOACTIVEDOCS -# -# :Returns: -# -# `dict` -# Dictionary with the keys: -# -# * N: The sample size. -# * sum: The weighted sum of ``a`` -# -# """ -# return {"N": a["n"], "sum": a["sum"]} + # @staticmethod + # def active_min(a, **kwargs): + # """Chunk calculations for the minimum. + # + # Assumes that the calculations have already been done, + # i.e. that *a* is already the minimum. + # + # This function is intended to be passed to + # `dask.array.reduction` as the ``chunk`` parameter. Its return + # signature must be the same as the non-active chunk function + # that it is replacing. + # + # .. versionadded:: TODOACTIVEVER + # + # :Parameters: + # + # a: `dict` + # TODOACTIVEDOCS + # + # :Returns: + # + # `dict` + # Dictionary with the keys: + # + # * N: The sample size. + # * min: The minimum of `a``. + # + # """ + # return {"N": a["n"], "min": a["min"]} + # + # @staticmethod + # def active_max(a, **kwargs): + # """Chunk calculations for the maximum. 
+ # + # Assumes that the calculations have already been done, + # i.e. that *a* is already the maximum. + # + # This function is intended to be passed to + # `dask.array.reduction` as the ``chunk`` parameter. Its return + # signature must be the same as the non-active chunk function + # that it is replacing. + # + # .. versionadded:: TODOACTIVEVER + # + # :Parameters: + # + # a: `dict` + # TODOACTIVEDOCS + # + # :Returns: + # + # `dict` + # Dictionary with the keys: + # + # * N: The sample size. + # * max: The maximum of `a``. + # + # """ + # return {"N": a["n"], "max": a["max"]} + # + # @staticmethod + # def active_mean(a, **kwargs): + # """Chunk calculations for the unweighted mean. + # + # Assumes that the calculations have already been done, + # i.e. that *a* is already the uweighted mean. + # + # This function is intended to be passed to + # `dask.array.reduction` as the ``chunk`` parameter. Its return + # signature must be the same as the non-active chunk function + # that it is replacing. + # + # .. versionadded:: TODOACTIVEVER + # + # :Parameters: + # + # a: `dict` + # TODOACTIVEDOCS + # + # :Returns: + # + # `dict` + # Dictionary with the keys: + # + # * N: The sample size. + # * V1: The sum of ``weights``. Equal to ``N`` because + # weights have not been set. + # * sum: The weighted sum of ``a``. + # * weighted: True if weights have been set. Always + # False. + # + # """ + # return {"N": a["n"], "V1": a["n"], "sum": a["sum"], "weighted": False} + # + # @staticmethod + # def active_sum(a, **kwargs): + # """Chunk calculations for the unweighted sum. + # + # Assumes that the calculations have already been done, + # i.e. that *a* is already the uweighted sum. + # + # This function is intended to be passed to + # `dask.array.reduction` as the ``chunk`` parameter. Its return + # signature must be the same as the non-active chunk function + # that it is replacing. + # + # .. versionadded:: TODOACTIVEVER + # + # :Parameters: + # + # a: `dict` + # TODOACTIVEDOCS + # + # :Returns: + # + # `dict` + # Dictionary with the keys: + # + # * N: The sample size. + # * sum: The weighted sum of ``a`` + # + # """ + # return {"N": a["n"], "sum": a["sum"]} def get_active_method(self): """TODOACTIVEDOC. @@ -241,20 +241,20 @@ def get_active_axis(self): """ return self._custom.get("active_axis") -# def get_active_chunk_function(self): -# """TODOACTIVEDOC. -# -# .. versionadded:: TODOACTIVEVER -# -# :Returns: -# -# TODOACTIVEDOC -# -# """ -# try: -# return self._active_chunk_functions()[self.get_active_method()] -# except KeyError: -# raise ValueError("no active storage operation has been set") + # def get_active_chunk_function(self): + # """TODOACTIVEDOC. + # + # .. versionadded:: TODOACTIVEVER + # + # :Returns: + # + # TODOACTIVEDOC + # + # """ + # try: + # return self._active_chunk_functions()[self.get_active_method()] + # except KeyError: + # raise ValueError("no active storage operation has been set") def set_active_method(self, value): """TODOACTIVEDOC. 
diff --git a/cf/data/array/netcdfarray.py b/cf/data/array/netcdfarray.py index 4b252fb53b..59effa840e 100644 --- a/cf/data/array/netcdfarray.py +++ b/cf/data/array/netcdfarray.py @@ -4,12 +4,13 @@ from ...mixin_container import Container from .mixin import ActiveStorageMixin, FileArrayMixin - # Global lock for netCDF file access _lock = SerializableLock() -class NetCDFArray(ActiveStorageMixin, FileArrayMixin, Container, cfdm.NetCDFArray): +class NetCDFArray( + ActiveStorageMixin, FileArrayMixin, Container, cfdm.NetCDFArray +): """An array stored in a netCDF file. TODOACTIVEDOCS diff --git a/cf/data/collapse/collapse.py b/cf/data/collapse/collapse.py index 99d20804a0..539d275d5b 100644 --- a/cf/data/collapse/collapse.py +++ b/cf/data/collapse/collapse.py @@ -6,7 +6,6 @@ from dask.array.reductions import reduction from ...docstring import _docstring_substitution_definitions - from .collapse_active import active_storage from .collapse_utils import check_input_dtype, double_precision_dtype @@ -397,7 +396,6 @@ def min( a, axis=None, keepdims=False, - active_storage=False, mtol=None, split_every=None, chunk_function=None, @@ -954,7 +952,7 @@ def sum_of_weights2( @active_storage("unique") def unique( - self, a, split_every=None, chunk_function=None, active_storage=False + self, a, split_every=None, chunk_function=None, active_storage=False ): """Return unique elements of the data. diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index 07bf2be18f..a6ba5fe0a1 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -197,12 +197,12 @@ def actify(a, method, axis=None): for key, value in reversed(dsk.items()): try: filenames.add(value.get_filename()) - except AttributeError: - if hasattr(value, "get_full_value"): + except AttributeError: + if hasattr(value, "get_full_value"): # This value is a constant fragment (such as might # arise from CFA aggregated data), which precludes the # use of active stoarge. -# chunk_functions = () + # chunk_functions = () ok_to_actify = False break @@ -214,18 +214,18 @@ def actify(a, method, axis=None): except AttributeError: # This file fragment does not support active storage # reductions -# chunk_functions = () + # chunk_functions = () ok_to_actify = False break -# try: + # try: # Get the active storage chunk function chunk_functions.add(_active_chunk_functions[method]) -# except AttributeError: -# # This data definition value does not support active -# # storage reductions -# chunk_functions = () -# break + # except AttributeError: + # # This data definition value does not support active + # # storage reductions + # chunk_functions = () + # break # Still here? Then update the dask graph dictionary with the # actified data definition value. @@ -241,13 +241,15 @@ def actify(a, method, axis=None): return a, None # Still here? - if ok_to_actify: #len(chunk_functions) == 1: + if ok_to_actify: # len(chunk_functions) == 1: # All data definitions in the dask graph support active # storage reductions => redefine the array from the actified # dask graph, and define the active storage reduction chunk # function. 
a = da.Array(dsk, a.name, a.chunks, a.dtype, a._meta) - chunk_function = _active_chunk_functions[method] #chunk_functions.pop() + chunk_function = _active_chunk_functions[ + method + ] # chunk_functions.pop() else: chunk_function = None From ede2946bf30a2e3d14ebc7da2766eb2df53c734a Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 10 Feb 2023 09:17:16 +0000 Subject: [PATCH 012/134] dev --- cf/data/collapse/__init__.py | 1 + cf/data/collapse/collapse_active.py | 43 +++++++++++++---------------- cf/docstring/docstring.py | 7 +++-- 3 files changed, 24 insertions(+), 27 deletions(-) diff --git a/cf/data/collapse/__init__.py b/cf/data/collapse/__init__.py index 0de12360ea..47bbd037ce 100644 --- a/cf/data/collapse/__init__.py +++ b/cf/data/collapse/__init__.py @@ -1 +1,2 @@ from .collapse import Collapse +from .collapse_active import actify diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index a6ba5fe0a1..5a6182532a 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -4,8 +4,8 @@ def active_min(a, **kwargs): """Chunk calculations for the minimum. - TODO Assumes that the calculations have already been done, i.e. that - *a* is already the minimum. + TODO Assumes that the calculations have already been done, + i.e. that *a* is already the minimum. This function is intended to be passed to `dask.array.reduction` as the ``chunk`` parameter. Its return signature must be the same @@ -85,8 +85,8 @@ def active_mean(a, **kwargs): Dictionary with the keys: * N: The sample size. - * V1: The sum of ``weights``. Equal to ``N`` because - weights have not been set. + * V1: The sum of ``weights``. Always equal to ``N`` + because weights have not been set. * sum: The weighted sum of ``a``. * weighted: True if weights have been set. Always False. @@ -135,6 +135,8 @@ def active_sum(a, **kwargs): def actify(a, method, axis=None): """TODOACTIVEDOCS. + TODO: Describe the necessary conditions here. + .. versionadded:: TODOACTIVEVER :Parameters: @@ -150,7 +152,7 @@ def actify(a, method, axis=None): :Returns: - `dask.array.Array`, function + (`dask.array.Array`, function) or (`dask.array.Array`, `None`) TODOACTIVEDOCS """ @@ -160,9 +162,9 @@ def actify(a, method, axis=None): from dask.array.utils import validate_axis from dask.base import collections_to_dsk - if not (method in _active_chunk_functions or method in Active.methods()): - # The given method is not recognised by `Active`, so return - # the input data unchanged. + if not (method in _active_chunk_functions and method in Active.methods()): + # The given method is not supported, so return the input data + # unchanged. return a, None # Parse axis @@ -190,8 +192,9 @@ def actify(a, method, axis=None): # # It is assumed that `actify` has only been called if has been # deterimined externally that it is sensible to do so. A - # necessary, but not sufficient, condition for this will is the - # parent `Data` instance's `active_storage` attribute being `True`. + # necessary, but not sufficient, condition for this being the case + # will is the parent `Data` instance's `active_storage` attribute + # being `True`. ok_to_actify = True dsk = collections_to_dsk((a,), optimize_graph=True) for key, value in reversed(dsk.items()): @@ -202,30 +205,22 @@ def actify(a, method, axis=None): # This value is a constant fragment (such as might # arise from CFA aggregated data), which precludes the # use of active stoarge. - # chunk_functions = () ok_to_actify = False break continue - # Still here? 
Then this value is a file fragment, so try to actify it. + # Still here? Then this value is a file fragment, so try to + # actify it. try: value = value.actify(method, axis) except AttributeError: # This file fragment does not support active storage # reductions - # chunk_functions = () ok_to_actify = False break - # try: - # Get the active storage chunk function chunk_functions.add(_active_chunk_functions[method]) - # except AttributeError: - # # This data definition value does not support active - # # storage reductions - # chunk_functions = () - # break # Still here? Then update the dask graph dictionary with the # actified data definition value. @@ -241,15 +236,13 @@ def actify(a, method, axis=None): return a, None # Still here? - if ok_to_actify: # len(chunk_functions) == 1: + if ok_to_actify: # All data definitions in the dask graph support active # storage reductions => redefine the array from the actified # dask graph, and define the active storage reduction chunk # function. a = da.Array(dsk, a.name, a.chunks, a.dtype, a._meta) - chunk_function = _active_chunk_functions[ - method - ] # chunk_functions.pop() + chunk_function = _active_chunk_functions[method] else: chunk_function = None @@ -262,6 +255,8 @@ def active_storage(method): .. versionadded:: TODOACTIVEVER + .. seealso `cf.data.collapse.Collapse` + :Parameters: method: `str` diff --git a/cf/docstring/docstring.py b/cf/docstring/docstring.py index d7dda31407..a3490fe51a 100644 --- a/cf/docstring/docstring.py +++ b/cf/docstring/docstring.py @@ -317,9 +317,10 @@ # active_storage "{{active_storage: `bool`, optional}}": """{{active_storage: `bool`, optional}} If True then attempt to perform the collapse using - active storage. If other necessary conditions are not - met (see `Collapse` for details) then the operation - will be executed without active storage.""", + active storage. However, if other necessary conditions + are not met (see `cf.data.collapse.actify` for + details) then the operation will be executed without + active storage.""", # Collapse chunk_function "{{chunk_function: callable, optional}}": """{{chunk_function: callable, optional}} Provides the ``chunk`` parameter to From a32ced62dd666952712330b596f9f8f9dd1d88ea Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 10 Feb 2023 12:58:22 +0000 Subject: [PATCH 013/134] dev --- cf/data/data.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cf/data/data.py b/cf/data/data.py index 7b76537837..e5da5861df 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -437,14 +437,13 @@ def __init__( except AttributeError: pass elif hasattr(array, "actify"): - # Allow the possibilty of active storage operations on - # data that is wholly on disk + # Allow the possibilty of active storage operations self._set_active_storage(True) if self._is_abstract_Array_subclass(array): # Save the input array in case it's useful later. For - # compressed input arrays this will contain extra information, - # such as a count or index variable. + # compressed input arrays this will contain extra + # information, such as a count or index variable. 
self._set_Array(array) # Cast the input data as a dask array From b24d521e689fe8ed54b91843489f6616d096999e Mon Sep 17 00:00:00 2001 From: David Hassell Date: Sat, 11 Feb 2023 13:57:33 +0000 Subject: [PATCH 014/134] dev --- cf/data/array/mixin/activestoragemixin.py | 34 +++++++++++------------ cf/data/collapse/__init__.py | 2 +- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/cf/data/array/mixin/activestoragemixin.py b/cf/data/array/mixin/activestoragemixin.py index 4378ee6375..8a91049b23 100644 --- a/cf/data/array/mixin/activestoragemixin.py +++ b/cf/data/array/mixin/activestoragemixin.py @@ -75,7 +75,7 @@ def actify(self, method, axis=None): :Returns: - {{class}} + `{{class}}` TODOACTIVEDOCS """ @@ -211,35 +211,35 @@ def actify(self, method, axis=None): # """ # return {"N": a["n"], "sum": a["sum"]} - def get_active_method(self): + def get_active_axis(self): """TODOACTIVEDOC. .. versionadded:: TODOACTIVEVER - .. seealso:: `set_active_method` + .. seealso:: `set_active_axis` :Returns: - `str` or `None` - The name of the active reduction method, or `None` if - one hasn't been set. + TODOACTIVEDOC """ - return self._custom.get("active_method") + return self._custom.get("active_axis") - def get_active_axis(self): + def get_active_method(self): """TODOACTIVEDOC. .. versionadded:: TODOACTIVEVER - .. seealso:: `set_active_axis` + .. seealso:: `set_active_method` :Returns: - TODOACTIVEDOC + `str` or `None` + The name of the active reduction method, or `None` if + one hasn't been set. """ - return self._custom.get("active_axis") + return self._custom.get("active_method") # def get_active_chunk_function(self): # """TODOACTIVEDOC. @@ -256,12 +256,12 @@ def get_active_axis(self): # except KeyError: # raise ValueError("no active storage operation has been set") - def set_active_method(self, value): + def set_active_axis(self, value): """TODOACTIVEDOC. .. versionadded:: TODOACTIVEVER - .. seealso:: `get_active_method` + .. seealso:: `get_active_axis` :Parameters: @@ -272,14 +272,14 @@ def set_active_method(self, value): `None` """ - self._custom["active_method"] = value + self._custom["active_axis"] = value - def set_active_axis(self, value): + def set_active_method(self, value): """TODOACTIVEDOC. .. versionadded:: TODOACTIVEVER - .. seealso:: `get_active_axis` + .. seealso:: `get_active_method` :Parameters: @@ -290,4 +290,4 @@ def set_active_axis(self, value): `None` """ - self._custom["active_axis"] = value + self._custom["active_method"] = value diff --git a/cf/data/collapse/__init__.py b/cf/data/collapse/__init__.py index 47bbd037ce..9ba083601d 100644 --- a/cf/data/collapse/__init__.py +++ b/cf/data/collapse/__init__.py @@ -1,2 +1,2 @@ from .collapse import Collapse -from .collapse_active import actify +from .collapse_active import _active_chunk_functions, actify From 669f3cdb861ee3ce7ee495e9fe89ceac4d178950 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 2 Mar 2023 15:09:31 +0000 Subject: [PATCH 015/134] dev --- cf/data/array/mixin/activestoragemixin.py | 4 ++++ cf/data/array/netcdfarray.py | 4 ++-- cf/data/collapse/collapse_active.py | 2 +- cf/data/utils.py | 8 +++++++- 4 files changed, 14 insertions(+), 4 deletions(-) diff --git a/cf/data/array/mixin/activestoragemixin.py b/cf/data/array/mixin/activestoragemixin.py index 8a91049b23..f5662e50b8 100644 --- a/cf/data/array/mixin/activestoragemixin.py +++ b/cf/data/array/mixin/activestoragemixin.py @@ -1,3 +1,6 @@ +from ...utils import netcdf_lock + + class ActiveStorageMixin: """TODOACTIVEDOCS. 
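The import added above lets the mixin share the single `netcdf_lock` that `cf/data/utils.py` now defines, so that ordinary local reads and the `Active` reductions set up in the next hunk serialise their access to netCDF files in the same way. A minimal sketch of that shared-lock pattern, assuming the netCDF4 package is available; `read_slice` is a hypothetical helper and not part of cf:

from dask.utils import SerializableLock

# One shared lock instance; unlike a plain threading.Lock it survives
# being pickled along with a dask graph.
netcdf_lock = SerializableLock()

def read_slice(filename, ncvar, index):
    import netCDF4

    # Every reader acquires the same lock, so concurrent dask tasks do
    # not touch a netCDF file at the same time.
    with netcdf_lock:
        with netCDF4.Dataset(filename) as nc:
            return nc.variables[ncvar][index]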
@@ -35,6 +38,7 @@ def __getitem__(self, indices): active = Active(self.filename, self.ncvar) active.method = method active.components = True + active.lock = netcdf_lock return active[indices] # def _active_chunk_functions(self): diff --git a/cf/data/array/netcdfarray.py b/cf/data/array/netcdfarray.py index 59effa840e..a03b48a076 100644 --- a/cf/data/array/netcdfarray.py +++ b/cf/data/array/netcdfarray.py @@ -5,7 +5,7 @@ from .mixin import ActiveStorageMixin, FileArrayMixin # Global lock for netCDF file access -_lock = SerializableLock() +from ..utils import netcdf_lock class NetCDFArray( @@ -42,4 +42,4 @@ def _dask_lock(self): if filename is None: return False - return _lock + return netcdf_lock diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index 5a6182532a..25e7683b39 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -123,7 +123,7 @@ def active_sum(a, **kwargs): """ return {"N": a["n"], "sum": a["sum"]} - +# Create a lookup of the active functions _active_chunk_functions = { "min": active_min, "max": active_max, diff --git a/cf/data/utils.py b/cf/data/utils.py index 6f4cc0e98c..2be4e0dd4d 100644 --- a/cf/data/utils.py +++ b/cf/data/utils.py @@ -3,9 +3,11 @@ from itertools import product from operator import mul -import dask.array as da import numpy as np +import dask.array as da +from dask.utils import SerializableLock + from ..cfdatetime import ( canonical_calendar, default_calendar, @@ -959,3 +961,7 @@ def parse_weights(d, weights, axis=None): # Return the product of the weights components, which will be # broadcastable to d return reduce(mul, w) + + +# Global lock for netCDF file access +netcdf_lock = SerializableLock() From b2b0c7e5515c0884d0d24d728c5571ce3045a6e2 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 2 Mar 2023 16:43:42 +0000 Subject: [PATCH 016/134] dev --- cf/data/array/mixin/activestoragemixin.py | 164 +----------------- cf/data/collapse/__init__.py | 1 - ...{collapse_active.py => active_collapse.py} | 76 ++++---- cf/data/collapse/collapse.py | 2 +- 4 files changed, 33 insertions(+), 210 deletions(-) rename cf/data/collapse/{collapse_active.py => active_collapse.py} (79%) diff --git a/cf/data/array/mixin/activestoragemixin.py b/cf/data/array/mixin/activestoragemixin.py index f5662e50b8..5472813ca0 100644 --- a/cf/data/array/mixin/activestoragemixin.py +++ b/cf/data/array/mixin/activestoragemixin.py @@ -34,31 +34,13 @@ def __getitem__(self, indices): # Normal read by local client. Returns a numpy array. return super().__getitem__(indices) - # Active storage read. Returns a dictionary. - active = Active(self.filename, self.ncvar) + # Active storage read and reduction. Returns a dictionary. + active = Active(self.get_filename(), self.get_ncvar()) active.method = method active.components = True active.lock = netcdf_lock return active[indices] - # def _active_chunk_functions(self): - # """Mapping of method names to active chunk functions. - # - # .. versionadded:: TODOACTIVEVER - # - # :Returns: - # - # `dict` - # The mapping. - # - # """ - # return { - # "min": self.active_min, - # "max": self.active_max, - # "mean": self.active_mean, - # "sum": self.active_sum, - # } - def actify(self, method, axis=None): """Return a new actified `{{class}}` instance. 
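Once a fragment has been actified, indexing it no longer yields a numpy array: the reduction is pushed to the storage system and only a small dictionary of components comes back. A condensed sketch of the dispatch that `__getitem__` performs after this change (not the literal cf code; `local_read` stands in for the normal `super().__getitem__` path, and the example values echo the docstrings added later in this series):

from activestorage import Active  # the external PyActiveStorage class

def getitem(fragment, indices):
    method = fragment.get_active_method()
    if method is None:
        # Normal read by the local client: a numpy array comes back
        return local_read(fragment, indices)

    # Active storage reduction: the result is a dict of components,
    # e.g. {"min": array([[[49.5]]], dtype=float32), "n": 1015808}
    active = Active(fragment.get_filename(), fragment.get_ncvar())
    active.method = method      # "min", "max", "mean" or "sum"
    active.components = True
    return active[indices]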
@@ -83,138 +65,11 @@ def actify(self, method, axis=None): TODOACTIVEDOCS """ - # if method not in self._active_chunk_functions(): - # raise ValueError(f"Invalid active storage operation: {method!r}") - a = self.copy() a.set_active_method(method) a.set_active_axis(axis) return a - # @staticmethod - # def active_min(a, **kwargs): - # """Chunk calculations for the minimum. - # - # Assumes that the calculations have already been done, - # i.e. that *a* is already the minimum. - # - # This function is intended to be passed to - # `dask.array.reduction` as the ``chunk`` parameter. Its return - # signature must be the same as the non-active chunk function - # that it is replacing. - # - # .. versionadded:: TODOACTIVEVER - # - # :Parameters: - # - # a: `dict` - # TODOACTIVEDOCS - # - # :Returns: - # - # `dict` - # Dictionary with the keys: - # - # * N: The sample size. - # * min: The minimum of `a``. - # - # """ - # return {"N": a["n"], "min": a["min"]} - # - # @staticmethod - # def active_max(a, **kwargs): - # """Chunk calculations for the maximum. - # - # Assumes that the calculations have already been done, - # i.e. that *a* is already the maximum. - # - # This function is intended to be passed to - # `dask.array.reduction` as the ``chunk`` parameter. Its return - # signature must be the same as the non-active chunk function - # that it is replacing. - # - # .. versionadded:: TODOACTIVEVER - # - # :Parameters: - # - # a: `dict` - # TODOACTIVEDOCS - # - # :Returns: - # - # `dict` - # Dictionary with the keys: - # - # * N: The sample size. - # * max: The maximum of `a``. - # - # """ - # return {"N": a["n"], "max": a["max"]} - # - # @staticmethod - # def active_mean(a, **kwargs): - # """Chunk calculations for the unweighted mean. - # - # Assumes that the calculations have already been done, - # i.e. that *a* is already the uweighted mean. - # - # This function is intended to be passed to - # `dask.array.reduction` as the ``chunk`` parameter. Its return - # signature must be the same as the non-active chunk function - # that it is replacing. - # - # .. versionadded:: TODOACTIVEVER - # - # :Parameters: - # - # a: `dict` - # TODOACTIVEDOCS - # - # :Returns: - # - # `dict` - # Dictionary with the keys: - # - # * N: The sample size. - # * V1: The sum of ``weights``. Equal to ``N`` because - # weights have not been set. - # * sum: The weighted sum of ``a``. - # * weighted: True if weights have been set. Always - # False. - # - # """ - # return {"N": a["n"], "V1": a["n"], "sum": a["sum"], "weighted": False} - # - # @staticmethod - # def active_sum(a, **kwargs): - # """Chunk calculations for the unweighted sum. - # - # Assumes that the calculations have already been done, - # i.e. that *a* is already the uweighted sum. - # - # This function is intended to be passed to - # `dask.array.reduction` as the ``chunk`` parameter. Its return - # signature must be the same as the non-active chunk function - # that it is replacing. - # - # .. versionadded:: TODOACTIVEVER - # - # :Parameters: - # - # a: `dict` - # TODOACTIVEDOCS - # - # :Returns: - # - # `dict` - # Dictionary with the keys: - # - # * N: The sample size. - # * sum: The weighted sum of ``a`` - # - # """ - # return {"N": a["n"], "sum": a["sum"]} - def get_active_axis(self): """TODOACTIVEDOC. @@ -245,21 +100,6 @@ def get_active_method(self): """ return self._custom.get("active_method") - # def get_active_chunk_function(self): - # """TODOACTIVEDOC. - # - # .. 
versionadded:: TODOACTIVEVER - # - # :Returns: - # - # TODOACTIVEDOC - # - # """ - # try: - # return self._active_chunk_functions()[self.get_active_method()] - # except KeyError: - # raise ValueError("no active storage operation has been set") - def set_active_axis(self, value): """TODOACTIVEDOC. diff --git a/cf/data/collapse/__init__.py b/cf/data/collapse/__init__.py index 9ba083601d..0de12360ea 100644 --- a/cf/data/collapse/__init__.py +++ b/cf/data/collapse/__init__.py @@ -1,2 +1 @@ from .collapse import Collapse -from .collapse_active import _active_chunk_functions, actify diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/active_collapse.py similarity index 79% rename from cf/data/collapse/collapse_active.py rename to cf/data/collapse/active_collapse.py index 25e7683b39..29aefa495c 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/active_collapse.py @@ -1,6 +1,9 @@ from functools import wraps +# -------------------------------------------------------------------- +# Define the active functions +# -------------------------------------------------------------------- def active_min(a, **kwargs): """Chunk calculations for the minimum. @@ -123,7 +126,9 @@ def active_sum(a, **kwargs): """ return {"N": a["n"], "sum": a["sum"]} +# -------------------------------------------------------------------- # Create a lookup of the active functions +# -------------------------------------------------------------------- _active_chunk_functions = { "min": active_min, "max": active_max, @@ -181,9 +186,6 @@ def actify(a, method, axis=None): axis = validate_axis(axis, a.ndim) - filenames = set() - chunk_functions = set() - # Loop round elements of the dask graph, looking for data # definitions that point to a file and which support active # storage operations. The elements are traversed in reverse order @@ -191,62 +193,43 @@ def actify(a, method, axis=None): # faster short circuit when using active storage is not possible. # # It is assumed that `actify` has only been called if has been - # deterimined externally that it is sensible to do so. A + # already been deterimined that it is sensible to do so. A # necessary, but not sufficient, condition for this being the case # will is the parent `Data` instance's `active_storage` attribute # being `True`. - ok_to_actify = True + ok_to_actify = False dsk = collections_to_dsk((a,), optimize_graph=True) for key, value in reversed(dsk.items()): try: - filenames.add(value.get_filename()) + value.get_filename() except AttributeError: - if hasattr(value, "get_full_value"): - # This value is a constant fragment (such as might - # arise from CFA aggregated data), which precludes the - # use of active stoarge. - ok_to_actify = False - break - continue - - # Still here? Then this value is a file fragment, so try to - # actify it. + + # Still here? Then this chunk is a data definition that points + # to a file, so try to insert an actified copy into the dask + # graph. try: - value = value.actify(method, axis) + dsk[key] = value.actify(method, axis) except AttributeError: - # This file fragment does not support active storage + # This data definition doesn't support active storage # reductions - ok_to_actify = False break + else: + ok_to_actify = True - chunk_functions.add(_active_chunk_functions[method]) - - # Still here? Then update the dask graph dictionary with the - # actified data definition value. - dsk[key] = value - - for filename in filenames: - # TODOACTIVE: Check that Active(filename) supports active - # storage. 
I don't really know how this will work - # ... - if not OK: - # This file location does not support active storage, so - # return the input data unchanged. - return a, None - - # Still here? - if ok_to_actify: - # All data definitions in the dask graph support active - # storage reductions => redefine the array from the actified - # dask graph, and define the active storage reduction chunk - # function. - a = da.Array(dsk, a.name, a.chunks, a.dtype, a._meta) - chunk_function = _active_chunk_functions[method] - else: - chunk_function = None - - return a, chunk_function + if not ok_to_actify: + # The dask graph is not suitable for active storage + # reductions, so return the input data unchanged. + return a, None + + # Still here? Then all data definitions in the dask graph support + # active storage reductions => redefine the array from the + # actified dask graph, and define the active storage reduction + # chunk function. + return ( + da.Array(dsk, a.name, a.chunks, a.dtype, a._meta), + _active_chunk_functions[method] + ) def active_storage(method): @@ -269,6 +252,7 @@ def decorator(collapse_method): def wrapper(self, *args, **kwargs): if ( kwargs.get("active_storage") + and method in _active_chunk_functions and kwargs.get("weights") is None and kwargs.get("chunk_function") is None ): diff --git a/cf/data/collapse/collapse.py b/cf/data/collapse/collapse.py index 8830c2efb8..214d18639d 100644 --- a/cf/data/collapse/collapse.py +++ b/cf/data/collapse/collapse.py @@ -5,7 +5,7 @@ from dask.array.reductions import reduction from ...docstring import _docstring_substitution_definitions -from .collapse_active import active_storage +from .active_collapse import active_storage from .collapse_utils import check_input_dtype, double_precision_dtype From e95a624ee9bc6035a1e523826c3fec5aa650d237 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 2 Mar 2023 23:40:05 +0000 Subject: [PATCH 017/134] dev --- cf/read_write/netcdf/netcdfread.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/cf/read_write/netcdf/netcdfread.py b/cf/read_write/netcdf/netcdfread.py index b041fdc30c..552ac7ae68 100644 --- a/cf/read_write/netcdf/netcdfread.py +++ b/cf/read_write/netcdf/netcdfread.py @@ -1,25 +1,6 @@ import cfdm import numpy as np -""" -TODOCFA: remove aggregation_* properties from constructs - -TODOCFA: Create auxiliary coordinates from non-standardised terms - -TODOCFA: Reference instruction variables (and/or set as - "do_not_create_field") - -TODOCFA: Create auxiliary coordinates from non-standardised terms - -TODOCFA: Consider scanning for cfa variables to the top (e.g. where - scanning for geometry varables is). This will probably need a - change in cfdm so that a customizable hook can be overlaoded - (like `_customize_read_vars` does). - -TODOCFA: What about groups/netcdf_flattener? - -""" - class NetCDFRead(cfdm.read_write.netcdf.NetCDFRead): """A container for instantiating Fields from a netCDF dataset. 
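For orientation, this is roughly how the `active_storage` decorator defined above gets attached to a `Collapse` method (signature abridged from collapse.py earlier in this series). The wrapper only attempts an active reduction when the call is unweighted, no explicit chunk function was supplied and the *active_storage* keyword is True; in every other case the decorated method runs exactly as before:

class Collapse:
    @active_storage("min")
    def min(self, a, axis=None, keepdims=False, mtol=None,
            split_every=None, chunk_function=None, active_storage=False):
        # By the time this body runs, the wrapper may already have
        # swapped `a` for an actified dask array and filled in
        # chunk_function=active_min.
        ...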
From f74cf7a647bf81561f181ce0eb66800615e57bef Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 3 Mar 2023 09:26:58 +0000 Subject: [PATCH 018/134] dev --- cf/constants.py | 1 + cf/data/array/mixin/activestoragemixin.py | 35 +++++++++----- cf/data/array/netcdfarray.py | 3 +- cf/data/collapse/active_collapse.py | 58 +++++++++++++---------- cf/data/collapse/collapse.py | 30 ++++++------ cf/data/data.py | 34 +++++++------ cf/data/utils.py | 15 +++--- cf/functions.py | 58 +++++++++++++++++++++++ cf/test/test_functions.py | 2 + 9 files changed, 159 insertions(+), 77 deletions(-) diff --git a/cf/constants.py b/cf/constants.py index c6c2ea4f5b..b89eff9e99 100644 --- a/cf/constants.py +++ b/cf/constants.py @@ -63,6 +63,7 @@ "LOG_LEVEL": logging.getLevelName(logging.getLogger().level), "BOUNDS_COMBINATION_MODE": "AND", "CHUNKSIZE": parse_bytes(_CHUNKSIZE), + "ACTIVE_STORAGE": True, } masked = np.ma.masked diff --git a/cf/data/array/mixin/activestoragemixin.py b/cf/data/array/mixin/activestoragemixin.py index 5472813ca0..a88cc151d8 100644 --- a/cf/data/array/mixin/activestoragemixin.py +++ b/cf/data/array/mixin/activestoragemixin.py @@ -1,10 +1,11 @@ +# Global lock for netCDF file access from ...utils import netcdf_lock class ActiveStorageMixin: """TODOACTIVEDOCS. - .. versionadded:: TODOACTIVEVER + .. versionadded:: ACTIVEVERSION """ @@ -26,7 +27,7 @@ def __getitem__(self, indices): then these indices work independently along each dimension (similar to the way vector subscripts work in Fortran). - .. versionadded:: TODOACTIVEVER + .. versionadded:: ACTIVEVERSION """ method = self.get_active_method() @@ -35,10 +36,18 @@ def __getitem__(self, indices): return super().__getitem__(indices) # Active storage read and reduction. Returns a dictionary. - active = Active(self.get_filename(), self.get_ncvar()) + try: + missing_data_indicators = self.get_missing_data_indicators() + except AttributeError: + missing_data_indicators = {} + + active = Active( + self.get_filename(), self.get_ncvar(), **missing_data_indicators + ) active.method = method active.components = True active.lock = netcdf_lock + return active[indices] def actify(self, method, axis=None): @@ -47,7 +56,7 @@ def actify(self, method, axis=None): The new instance is a deep copy of the original, with the additional setting of the active storage method and axis. - .. versionadded:: TODOACTIVEVER + .. versionadded:: ACTIVEVER .. seealso:: `set_active_axis`, `set_active_method` @@ -71,23 +80,23 @@ def actify(self, method, axis=None): return a def get_active_axis(self): - """TODOACTIVEDOC. + """TODOACTIVEDOCS. - .. versionadded:: TODOACTIVEVER + .. versionadded:: ACTIVEVERSION .. seealso:: `set_active_axis` :Returns: - TODOACTIVEDOC + TODOACTIVEDOCS """ return self._custom.get("active_axis") def get_active_method(self): - """TODOACTIVEDOC. + """TODOACTIVEDOCS. - .. versionadded:: TODOACTIVEVER + .. versionadded:: ACTIVEVERSION .. seealso:: `set_active_method` @@ -101,9 +110,9 @@ def get_active_method(self): return self._custom.get("active_method") def set_active_axis(self, value): - """TODOACTIVEDOC. + """TODOACTIVEDOCS. - .. versionadded:: TODOACTIVEVER + .. versionadded:: ACTIVEVERSION .. seealso:: `get_active_axis` @@ -119,9 +128,9 @@ def set_active_axis(self, value): self._custom["active_axis"] = value def set_active_method(self, value): - """TODOACTIVEDOC. + """TODOACTIVEDOCS. - .. versionadded:: TODOACTIVEVER + .. versionadded:: ACTIVEVERSION .. 
seealso:: `get_active_method` diff --git a/cf/data/array/netcdfarray.py b/cf/data/array/netcdfarray.py index 3bab524a94..7dece7d1a3 100644 --- a/cf/data/array/netcdfarray.py +++ b/cf/data/array/netcdfarray.py @@ -1,11 +1,10 @@ import cfdm -from dask.utils import SerializableLock from ...mixin_container import Container -from .mixin import ActiveStorageMixin, FileArrayMixin # Global lock for netCDF file access from ..utils import netcdf_lock +from .mixin import ActiveStorageMixin, FileArrayMixin class NetCDFArray( diff --git a/cf/data/collapse/active_collapse.py b/cf/data/collapse/active_collapse.py index 29aefa495c..8ac3fd2b1c 100644 --- a/cf/data/collapse/active_collapse.py +++ b/cf/data/collapse/active_collapse.py @@ -14,7 +14,7 @@ def active_min(a, **kwargs): as the ``chunk`` parameter. Its return signature must be the same as the non-active chunk function that it is replacing. - .. versionadded:: TODOACTIVEVER + .. versionadded:: ACTIVEVERSION :Parameters: @@ -46,13 +46,16 @@ def active_max(a, **kwargs): as the ``chunk`` parameter. Its return signature must be the same as the non-active chunk function that it is replacing. - .. versionadded:: TODOACTIVEVER + .. versionadded:: ACTIVEVERSION :Parameters: a: `dict` TODOACTIVEDOCS + kwargs: optional + TODOACTIVEDOCS + :Returns: `dict` @@ -75,13 +78,16 @@ def active_mean(a, **kwargs): as the ``chunk`` parameter. Its return signature must be the same as the non-active chunk function that it is replacing. - .. versionadded:: TODOACTIVEVER + .. versionadded:: ACTIVEVERSION :Parameters: a: `dict` TODOACTIVEDOCS + kwargs: optional + TODOACTIVEDOCS + :Returns: `dict` @@ -90,7 +96,7 @@ def active_mean(a, **kwargs): * N: The sample size. * V1: The sum of ``weights``. Always equal to ``N`` because weights have not been set. - * sum: The weighted sum of ``a``. + * sum: The un-weighted sum of ``a``. * weighted: True if weights have been set. Always False. @@ -108,24 +114,28 @@ def active_sum(a, **kwargs): as the ``chunk`` parameter. Its return signature must be the same as the non-active chunk function that it is replacing. - .. versionadded:: TODOACTIVEVER + .. versionadded:: ACTIVEVERSION :Parameters: a: `dict` TODOACTIVEDOCS + kwargs: optional + TODOACTIVEDOCS + :Returns: `dict` Dictionary with the keys: * N: The sample size. - * sum: The weighted sum of ``a`` + * sum: The un-weighted sum of ``a`` """ return {"N": a["n"], "sum": a["sum"]} + # -------------------------------------------------------------------- # Create a lookup of the active functions # -------------------------------------------------------------------- @@ -140,9 +150,15 @@ def active_sum(a, **kwargs): def actify(a, method, axis=None): """TODOACTIVEDOCS. - TODO: Describe the necessary conditions here. + It is assumed that: + + * The *method* has an entry in the `_active_chunk_functions` + dictionary + + * The `!active_storage` attribute of the `Data` object that + provided the dask array *a* is `True`. - .. versionadded:: TODOACTIVEVER + .. versionadded:: ACTIVEVERSION :Parameters: @@ -167,11 +183,6 @@ def actify(a, method, axis=None): from dask.array.utils import validate_axis from dask.base import collections_to_dsk - if not (method in _active_chunk_functions and method in Active.methods()): - # The given method is not supported, so return the input data - # unchanged. 
- return a, None - # Parse axis if axis is None: axis = tuple(range(a.ndim)) @@ -189,14 +200,9 @@ def actify(a, method, axis=None): # Loop round elements of the dask graph, looking for data # definitions that point to a file and which support active # storage operations. The elements are traversed in reverse order - # so that the data defintions come out first, allowing for a - # faster short circuit when using active storage is not possible. - # - # It is assumed that `actify` has only been called if has been - # already been deterimined that it is sensible to do so. A - # necessary, but not sufficient, condition for this being the case - # will is the parent `Data` instance's `active_storage` attribute - # being `True`. + # so that the data defintions come out first, allowing for the + # potential of a faster short circuit when using active storage is + # not possible. ok_to_actify = False dsk = collections_to_dsk((a,), optimize_graph=True) for key, value in reversed(dsk.items()): @@ -204,7 +210,7 @@ def actify(a, method, axis=None): value.get_filename() except AttributeError: continue - + # Still here? Then this chunk is a data definition that points # to a file, so try to insert an actified copy into the dask # graph. @@ -221,14 +227,14 @@ def actify(a, method, axis=None): # The dask graph is not suitable for active storage # reductions, so return the input data unchanged. return a, None - + # Still here? Then all data definitions in the dask graph support # active storage reductions => redefine the array from the # actified dask graph, and define the active storage reduction # chunk function. return ( da.Array(dsk, a.name, a.chunks, a.dtype, a._meta), - _active_chunk_functions[method] + _active_chunk_functions[method], ) @@ -236,7 +242,7 @@ def active_storage(method): """A decorator for `Collapse` methods that enables active storage operations, when the conditions are right. - .. versionadded:: TODOACTIVEVER + .. versionadded:: ACTIVEVERSION .. seealso `cf.data.collapse.Collapse` @@ -252,7 +258,7 @@ def decorator(collapse_method): def wrapper(self, *args, **kwargs): if ( kwargs.get("active_storage") - and method in _active_chunk_functions + and method in _active_chunk_functions and kwargs.get("weights") is None and kwargs.get("chunk_function") is None ): diff --git a/cf/data/collapse/collapse.py b/cf/data/collapse/collapse.py index 214d18639d..82e1d6b016 100644 --- a/cf/data/collapse/collapse.py +++ b/cf/data/collapse/collapse.py @@ -102,7 +102,7 @@ def max( {{active_storage: `bool`, optional}} - .. versionadded:: TODOACTIVEVER + .. versionadded:: ACTIVEVERSION :Returns: @@ -169,7 +169,7 @@ def max_abs( {{active_storage: `bool`, optional}} - .. versionadded:: TODOACTIVEVER + .. versionadded:: ACTIVEVERSION :Returns: @@ -228,7 +228,7 @@ def mean( {{active_storage: `bool`, optional}} - .. versionadded:: TODOACTIVEVER + .. versionadded:: ACTIVEVERSION :Returns: @@ -299,7 +299,7 @@ def mean_abs( {{active_storage: `bool`, optional}} - .. versionadded:: TODOACTIVEVER + .. versionadded:: ACTIVEVERSION :Returns: @@ -357,7 +357,7 @@ def mid_range( {{active_storage: `bool`, optional}} - .. versionadded:: TODOACTIVEVER + .. versionadded:: ACTIVEVERSION :Returns: @@ -428,7 +428,7 @@ def min( {{active_storage: `bool`, optional}} - .. versionadded:: TODOACTIVEVER + .. versionadded:: ACTIVEVERSION :Returns: @@ -495,7 +495,7 @@ def min_abs( {{active_storage: `bool`, optional}} - .. versionadded:: TODOACTIVEVER + .. 
versionadded:: ACTIVEVERSION :Returns: @@ -551,7 +551,7 @@ def range( {{active_storage: `bool`, optional}} - .. versionadded:: TODOACTIVEVER + .. versionadded:: ACTIVEVERSION :Returns: @@ -625,7 +625,7 @@ def rms( {{active_storage: `bool`, optional}} - .. versionadded:: TODOACTIVEVER + .. versionadded:: ACTIVEVERSION :Returns: @@ -693,7 +693,7 @@ def sample_size( {{active_storage: `bool`, optional}} - .. versionadded:: TODOACTIVEVER + .. versionadded:: ACTIVEVERSION :Returns: @@ -767,7 +767,7 @@ def sum( {{active_storage: `bool`, optional}} - .. versionadded:: TODOACTIVEVER + .. versionadded:: ACTIVEVERSION :Returns: @@ -841,7 +841,7 @@ def sum_of_weights( {{active_storage: `bool`, optional}} - .. versionadded:: TODOACTIVEVER + .. versionadded:: ACTIVEVERSION :Returns: @@ -916,7 +916,7 @@ def sum_of_weights2( {{active_storage: `bool`, optional}} - .. versionadded:: TODOACTIVEVER + .. versionadded:: ACTIVEVERSION :Returns: @@ -968,7 +968,7 @@ def unique( {{active_storage: `bool`, optional}} - .. versionadded:: TODOACTIVEVER + .. versionadded:: ACTIVEVERSION :Returns: @@ -1048,7 +1048,7 @@ def var( {{active_storage: `bool`, optional}} - .. versionadded:: TODOACTIVEVER + .. versionadded:: ACTIVEVERSION :Returns: diff --git a/cf/data/data.py b/cf/data/data.py index 53ec1f9b92..6f9e9e869d 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -424,21 +424,13 @@ def __init__( "for compressed input arrays" ) - # Save the input compressed array, as this will contain - # extra information, such as a count or index variable. - self._set_Array(array) - - if self._is_file_array(array): + # Bring the compressed data into memory without + # decompressing it if to_memory: - # Bring the compressed data into memory (without - # decompressing it if it's compressed) try: array = array.to_memory() except AttributeError: pass - elif hasattr(array, "actify"): - # Allow the possibilty of active storage operations - self._set_active_storage(True) if self._is_abstract_Array_subclass(array): # Save the input array in case it's useful later. For @@ -633,7 +625,7 @@ def _rtol(self): def _is_file_array(self, array): """Whether or not an array is stored on disk. - .. versionaddedd: TODOACTIVEVER + .. versionaddedd: ACTIVEVERSION :Parameters: @@ -1284,7 +1276,7 @@ def _conform_after_dask_update(self): def _del_active_storage(self): """TODOACTIVEDOCS. - .. versionadded:: TODOACTIVEVER + .. versionadded:: ACTIVEVERSION .. seealso:: `active_storage`, `_set_active_storage` @@ -1310,7 +1302,7 @@ def _del_active_storage(self): def _set_active_storage(self, value): """TODOACTIVEDOCS. - .. versionadded:: TODOACTIVEVER + .. versionadded:: ACTIVEVERSION .. seealso:: `active_storage`, `_del_active_storage` @@ -1475,6 +1467,20 @@ def _del_cached_elements(self): for element in ("first_element", "second_element", "last_element"): custom.pop(element, None) + def _is_abstract_Array_subclass(self, array): + """Whether or not an array is a type of Array. + + :Parameters: + + array: + + :Returns: + + `bool` + + """ + return isinstance(array, cfdm.Array) + def _set_cached_elements(self, elements): """Cache selected element values. @@ -4350,7 +4356,7 @@ def active_storage(self): the usual (non-active) techniques if an active storage operation fails for any reason. - .. versionadded:: TODOACTIVEVER + .. 
versionadded:: ACTIVEVERSION **Examples** diff --git a/cf/data/utils.py b/cf/data/utils.py index 2be4e0dd4d..360f6312af 100644 --- a/cf/data/utils.py +++ b/cf/data/utils.py @@ -3,9 +3,8 @@ from itertools import product from operator import mul -import numpy as np - import dask.array as da +import numpy as np from dask.utils import SerializableLock from ..cfdatetime import ( @@ -16,11 +15,17 @@ rt2dt, st2rt, ) +from ..functions import active_storage from ..units import Units from .dask_utils import cf_YMDhms _units_None = Units(None) +# -------------------------------------------------------------------- +# Global lock for netCDF file access +# -------------------------------------------------------------------- +netcdf_lock = SerializableLock() + def is_numeric_dtype(array): """True if the given array is of a numeric or boolean data type. @@ -822,7 +827,7 @@ def collapse( "keepdims": keepdims, "split_every": split_every, "mtol": mtol, - "active_storage": d.active_storage, + "active_storage": d.active_storage and active_storage(), } weights = parse_weights(d, weights, axis) @@ -961,7 +966,3 @@ def parse_weights(d, weights, axis=None): # Return the product of the weights components, which will be # broadcastable to d return reduce(mul, w) - - -# Global lock for netCDF file access -netcdf_lock = SerializableLock() diff --git a/cf/functions.py b/cf/functions.py index 3fd6f4bee4..b9aa5ff975 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -173,6 +173,7 @@ def configuration( regrid_logging=None, relaxed_identities=None, bounds_combination_mode=None, + active_storage=None, of_fraction=None, collapse_parallel_mode=None, free_memory_factor=None, @@ -261,6 +262,12 @@ def configuration( construct identity. The default is to not change the current value. + active_storage: `bool` or `Constant`, optional + TODOACTIVEDOCS + + .. versionaddedd:: ACTIVEVERSION + + of_fraction: `float` or `Constant`, optional Deprecated at version 3.14.0 and is no longer available. @@ -376,6 +383,7 @@ def configuration( new_regrid_logging=regrid_logging, new_relaxed_identities=relaxed_identities, bounds_combination_mode=bounds_combination_mode, + active_storage=active_storage, ) @@ -425,6 +433,7 @@ def _configuration(_Configuration, **kwargs): "new_regrid_logging": regrid_logging, "new_relaxed_identities": relaxed_identities, "bounds_combination_mode": bounds_combination_mode, + "active_storage": active_storage, } old_values = {} @@ -1136,6 +1145,55 @@ def _parse(cls, arg): return arg +class active_storage(ConstantAccess): + """TODOACTIVEDOCS + + .. versionadded:: ACTIVEVERSION + + .. seealso:: `configuration` + + :Parameters: + + arg: `bool` or `Constant`, optional + Provide a value that will apply to all subsequent + operations. + + :Returns: + + `Constant` + The value prior to the change, or the current value if no + new value was specified. + + **Examples** + + TODOACTIVEDOCS + + """ + + _name = "ACTIVE_STORAGE" + + def _parse(cls, arg): + """Parse a new constant value. + + .. versionaddedd:: ACTIVEVERSION + + :Parameters: + + cls: + This class. + + arg: + The given new constant value. + + :Returns: + + A version of the new constant value suitable for + insertion into the `CONSTANTS` dictionary. + + """ + return bool(arg) + + def CF(): """The version of the CF conventions. 
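The new ACTIVE_STORAGE constant behaves like the other `ConstantAccess` constants in cf.functions (for example `cf.chunksize`): calling it with no argument reports the current value, and calling it with an argument installs a new value and returns the one in force before the change. An illustrative session, assuming the default of True set in cf/constants.py:

import cf

cf.active_storage()       # the current setting, True by default
cf.active_storage(False)  # globally disable active storage reductions
# cf/data/utils.py now combines this with each Data object's own flag:
#     "active_storage": d.active_storage and active_storage()
cf.active_storage(True)   # switch active storage reductions back on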
diff --git a/cf/test/test_functions.py b/cf/test/test_functions.py index 7a32b0d859..7cc43ce93a 100644 --- a/cf/test/test_functions.py +++ b/cf/test/test_functions.py @@ -64,6 +64,7 @@ def test_configuration(self): self.assertIsInstance(org["bounds_combination_mode"], str) self.assertIsInstance(org["regrid_logging"], bool) self.assertIsInstance(org["tempdir"], str) + self.assertIsInstance(org["active_storage"], bool) # Log level may be input as an int but always given as # equiv. string self.assertIsInstance(org["log_level"], str) @@ -83,6 +84,7 @@ def test_configuration(self): "bounds_combination_mode": "XOR", "log_level": "INFO", "chunksize": 8e9, + "active_storage": True, } # Test the setting of each lone item. From d464a0492300e867ad3123d7397e025840b67731 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 3 Mar 2023 10:36:17 +0000 Subject: [PATCH 019/134] dev --- cf/data/collapse/active_collapse.py | 285 ---------------------------- cf/data/collapse/collapse.py | 21 +- 2 files changed, 11 insertions(+), 295 deletions(-) delete mode 100644 cf/data/collapse/active_collapse.py diff --git a/cf/data/collapse/active_collapse.py b/cf/data/collapse/active_collapse.py deleted file mode 100644 index 8ac3fd2b1c..0000000000 --- a/cf/data/collapse/active_collapse.py +++ /dev/null @@ -1,285 +0,0 @@ -from functools import wraps - - -# -------------------------------------------------------------------- -# Define the active functions -# -------------------------------------------------------------------- -def active_min(a, **kwargs): - """Chunk calculations for the minimum. - - TODO Assumes that the calculations have already been done, - i.e. that *a* is already the minimum. - - This function is intended to be passed to `dask.array.reduction` - as the ``chunk`` parameter. Its return signature must be the same - as the non-active chunk function that it is replacing. - - .. versionadded:: ACTIVEVERSION - - :Parameters: - - a: `dict` - TODOACTIVEDOCS - - kwargs: optional - TODOACTIVEDOCS - - :Returns: - - `dict` - Dictionary with the keys: - - * N: The sample size. - * min: The minimum of `a``. - - """ - return {"N": a["n"], "min": a["min"]} - - -def active_max(a, **kwargs): - """Chunk calculations for the maximum. - - TODO Assumes that the calculations have already been done, - i.e. that *a* is already the maximum. - - This function is intended to be passed to `dask.array.reduction` - as the ``chunk`` parameter. Its return signature must be the same - as the non-active chunk function that it is replacing. - - .. versionadded:: ACTIVEVERSION - - :Parameters: - - a: `dict` - TODOACTIVEDOCS - - kwargs: optional - TODOACTIVEDOCS - - :Returns: - - `dict` - Dictionary with the keys: - - * N: The sample size. - * max: The maximum of `a``. - - """ - return {"N": a["n"], "max": a["max"]} - - -def active_mean(a, **kwargs): - """Chunk calculations for the unweighted mean. - - TODO Assumes that the calculations have already been done, - i.e. that *a* is already the uweighted mean. - - This function is intended to be passed to `dask.array.reduction` - as the ``chunk`` parameter. Its return signature must be the same - as the non-active chunk function that it is replacing. - - .. versionadded:: ACTIVEVERSION - - :Parameters: - - a: `dict` - TODOACTIVEDOCS - - kwargs: optional - TODOACTIVEDOCS - - :Returns: - - `dict` - Dictionary with the keys: - - * N: The sample size. - * V1: The sum of ``weights``. Always equal to ``N`` - because weights have not been set. - * sum: The un-weighted sum of ``a``. 
- * weighted: True if weights have been set. Always - False. - - """ - return {"N": a["n"], "V1": a["n"], "sum": a["sum"], "weighted": False} - - -def active_sum(a, **kwargs): - """Chunk calculations for the unweighted sum. - - TODO Assumes that the calculations have already been done, - i.e. that *a* is already the uweighted sum. - - This function is intended to be passed to `dask.array.reduction` - as the ``chunk`` parameter. Its return signature must be the same - as the non-active chunk function that it is replacing. - - .. versionadded:: ACTIVEVERSION - - :Parameters: - - a: `dict` - TODOACTIVEDOCS - - kwargs: optional - TODOACTIVEDOCS - - :Returns: - - `dict` - Dictionary with the keys: - - * N: The sample size. - * sum: The un-weighted sum of ``a`` - - """ - return {"N": a["n"], "sum": a["sum"]} - - -# -------------------------------------------------------------------- -# Create a lookup of the active functions -# -------------------------------------------------------------------- -_active_chunk_functions = { - "min": active_min, - "max": active_max, - "mean": active_mean, - "sum": active_sum, -} - - -def actify(a, method, axis=None): - """TODOACTIVEDOCS. - - It is assumed that: - - * The *method* has an entry in the `_active_chunk_functions` - dictionary - - * The `!active_storage` attribute of the `Data` object that - provided the dask array *a* is `True`. - - .. versionadded:: ACTIVEVERSION - - :Parameters: - - a: `dask.array.Array` - The array to be collapsed. - - method: `str` - TODOACTIVEDOCS - - axis: (sequence of) `int`, optional - TODOACTIVEDOCS - - :Returns: - - (`dask.array.Array`, function) or (`dask.array.Array`, `None`) - TODOACTIVEDOCS - - """ - from numbers import Integral - - import dask.array as da - from dask.array.utils import validate_axis - from dask.base import collections_to_dsk - - # Parse axis - if axis is None: - axis = tuple(range(a.ndim)) - else: - if isinstance(axis, Integral): - axis = (axis,) - - if len(axis) != a.ndim: - # Can't (yet) use active storage to collapse a subset of - # the axes, so return the input data unchanged. - return a, None - - axis = validate_axis(axis, a.ndim) - - # Loop round elements of the dask graph, looking for data - # definitions that point to a file and which support active - # storage operations. The elements are traversed in reverse order - # so that the data defintions come out first, allowing for the - # potential of a faster short circuit when using active storage is - # not possible. - ok_to_actify = False - dsk = collections_to_dsk((a,), optimize_graph=True) - for key, value in reversed(dsk.items()): - try: - value.get_filename() - except AttributeError: - continue - - # Still here? Then this chunk is a data definition that points - # to a file, so try to insert an actified copy into the dask - # graph. - try: - dsk[key] = value.actify(method, axis) - except AttributeError: - # This data definition doesn't support active storage - # reductions - break - else: - ok_to_actify = True - - if not ok_to_actify: - # The dask graph is not suitable for active storage - # reductions, so return the input data unchanged. - return a, None - - # Still here? Then all data definitions in the dask graph support - # active storage reductions => redefine the array from the - # actified dask graph, and define the active storage reduction - # chunk function. 
- return ( - da.Array(dsk, a.name, a.chunks, a.dtype, a._meta), - _active_chunk_functions[method], - ) - - -def active_storage(method): - """A decorator for `Collapse` methods that enables active storage - operations, when the conditions are right. - - .. versionadded:: ACTIVEVERSION - - .. seealso `cf.data.collapse.Collapse` - - :Parameters: - - method: `str` - TODOACTIVEDOCS - - """ - - def decorator(collapse_method): - @wraps(collapse_method) - def wrapper(self, *args, **kwargs): - if ( - kwargs.get("active_storage") - and method in _active_chunk_functions - and kwargs.get("weights") is None - and kwargs.get("chunk_function") is None - ): - # Attempt to actify the dask array and provide a new - # chunk function - a, chunk_function = actify( - args[0], - method=method, - axis=kwargs.get("axis"), - ) - args = list(args) - args[0] = a - - if chunk_function is not None: - # The dask array has been actified, so update the - # chunk function. - kwargs["chunk_function"] = chunk_function - - # Create the collapse - return collapse_method(self, *args, **kwargs) - - return wrapper - - return decorator diff --git a/cf/data/collapse/collapse.py b/cf/data/collapse/collapse.py index 82e1d6b016..c4bc1ccfc5 100644 --- a/cf/data/collapse/collapse.py +++ b/cf/data/collapse/collapse.py @@ -5,7 +5,7 @@ from dask.array.reductions import reduction from ...docstring import _docstring_substitution_definitions -from .active_collapse import active_storage +from .collapse_active import active_storage from .collapse_utils import check_input_dtype, double_precision_dtype @@ -15,17 +15,18 @@ class Collapse(metaclass=DocstringRewriteMeta): **Active storage** A collapse method (such as `max`, `max_abs`, etc.) will attempt to - make use use of active storage if: + make use of active storage if: - * The collapse method's `active_storage` parameter is True. - * The method's identity is recognised by the `Active` class. - * The `Active` class recognises the storage location as one that - supports active storage operations. + * The collapse method's *active_storage* parameter is True. - However, when all of these conditions are passed, the collapse - operation will *not* be executed with active storage if the - dask array is deemed, on inspection to be unsuitable. See the - `actify` function for details. + * The method has a corresponding active function defined in + `collapse_active`. + + When these conditions are passed, the graph of the `dask` array is + inspected to confirm that making use of active storage is + possible, and if so the `dask` graph is modified to expect the + per-chunk reductions to be carried out externally. See + `collapse_active.actify` for details. .. 
versionadded:: 3.14.0 From 8bc3a92149dbc13b654bb3d4242e9a0e8796c41a Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 3 Mar 2023 14:06:33 +0000 Subject: [PATCH 020/134] dev --- cf/data/array/mixin/activestoragemixin.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/cf/data/array/mixin/activestoragemixin.py b/cf/data/array/mixin/activestoragemixin.py index a88cc151d8..ceef1de416 100644 --- a/cf/data/array/mixin/activestoragemixin.py +++ b/cf/data/array/mixin/activestoragemixin.py @@ -1,3 +1,8 @@ +try: + from activestorage import Active +except ModuleNotFoundError: + Active = None + # Global lock for netCDF file access from ...utils import netcdf_lock @@ -31,18 +36,21 @@ def __getitem__(self, indices): """ method = self.get_active_method() - if method is None: + if method is None or Active is None: # Normal read by local client. Returns a numpy array. return super().__getitem__(indices) # Active storage read and reduction. Returns a dictionary. try: - missing_data_indicators = self.get_missing_data_indicators() + missing_values = self.get_missing_values() except AttributeError: - missing_data_indicators = {} + missing_values = {} + else: + if missing_values is None: + missing_values = {} active = Active( - self.get_filename(), self.get_ncvar(), **missing_data_indicators + self.get_filename(), self.get_ncvar(), **missing_values ) active.method = method active.components = True From 68fb18a6df240233fbb75998e1978d49770672cf Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 17 Mar 2023 15:28:18 +0000 Subject: [PATCH 021/134] dev --- cf/data/array/mixin/activestoragemixin.py | 2 +- cf/data/collapse/collapse_active.py | 300 ++++++++++++++++++++++ 2 files changed, 301 insertions(+), 1 deletion(-) create mode 100644 cf/data/collapse/collapse_active.py diff --git a/cf/data/array/mixin/activestoragemixin.py b/cf/data/array/mixin/activestoragemixin.py index ceef1de416..b4bdfe5ed0 100644 --- a/cf/data/array/mixin/activestoragemixin.py +++ b/cf/data/array/mixin/activestoragemixin.py @@ -40,7 +40,7 @@ def __getitem__(self, indices): # Normal read by local client. Returns a numpy array. return super().__getitem__(indices) - # Active storage read and reduction. Returns a dictionary. + # Active storage reduction. Returns a dictionary. try: missing_values = self.get_missing_values() except AttributeError: diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py new file mode 100644 index 0000000000..57cdf11942 --- /dev/null +++ b/cf/data/collapse/collapse_active.py @@ -0,0 +1,300 @@ +from functools import wraps + + +# -------------------------------------------------------------------- +# Define the active functions +# -------------------------------------------------------------------- +def active_min(a, **kwargs): + """Chunk calculations for the minimum. + + TODO Assumes that the calculations have already been done, + i.e. that *a* is already the minimum. + + This function is intended to be passed to `dask.array.reduction` + as the ``chunk`` parameter. Its return signature must be the same + as the non-active chunk function that it is replacing. + + .. versionadded:: ACTIVEVERSION + + :Parameters: + + a: `dict` + TODOACTIVEDOCS + + kwargs: optional + TODOACTIVEDOCS + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + * min: The minimum of `a``. + + """ + return {"N": a["n"], "min": a["min"]} + + +def active_max(a, **kwargs): + """Chunk calculations for the maximum. 
+ + TODO Assumes that the calculations have already been done, + i.e. that *a* is already the maximum. + + This function is intended to be passed to `dask.array.reduction` + as the ``chunk`` parameter. Its return signature must be the same + as the non-active chunk function that it is replacing. + + .. versionadded:: ACTIVEVERSION + + :Parameters: + + a: `dict` + TODOACTIVEDOCS + + kwargs: optional + TODOACTIVEDOCS + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + * max: The maximum of `a``. + + """ + return {"N": a["n"], "max": a["max"]} + + +def active_mean(a, **kwargs): + """Chunk calculations for the unweighted mean. + + TODO Assumes that the calculations have already been done, + i.e. that *a* is already the uweighted mean. + + This function is intended to be passed to `dask.array.reduction` + as the ``chunk`` parameter. Its return signature must be the same + as the non-active chunk function that it is replacing. + + .. versionadded:: ACTIVEVERSION + + :Parameters: + + a: `dict` + TODOACTIVEDOCS + + kwargs: optional + TODOACTIVEDOCS + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + * V1: The sum of ``weights``. Always equal to ``N`` + because weights have not been set. + * sum: The un-weighted sum of ``a``. + * weighted: True if weights have been set. Always + False. + + """ + return {"N": a["n"], "V1": a["n"], "sum": a["sum"], "weighted": False} + + +def active_sum(a, **kwargs): + """Chunk calculations for the unweighted sum. + + TODO Assumes that the calculations have already been done, + i.e. that *a* is already the uweighted sum. + + This function is intended to be passed to `dask.array.reduction` + as the ``chunk`` parameter. Its return signature must be the same + as the non-active chunk function that it is replacing. + + .. versionadded:: ACTIVEVERSION + + :Parameters: + + a: `dict` + TODOACTIVEDOCS + + kwargs: optional + TODOACTIVEDOCS + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + * sum: The un-weighted sum of ``a`` + + """ + return {"N": a["n"], "sum": a["sum"]} + + +# -------------------------------------------------------------------- +# Create a map of reduction methods to their corresponding active +# functions +# -------------------------------------------------------------------- +_active_chunk_functions = { + "min": active_min, + "max": active_max, + "mean": active_mean, + "sum": active_sum, +} + + +def actify(a, method, axis=None): + """TODOACTIVEDOCS. + + It is assumed that: + + * The *method* has an entry in the `_active_chunk_functions` + dictionary + + * The `!active_storage` attribute of the `Data` object that + provided the dask array *a* is `True`. If this is not the case + then an error at compute time is likely. + + .. versionadded:: ACTIVEVERSION + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + method: `str` + TODOACTIVEDOCS + + axis: (sequence of) `int`, optional + TODOACTIVEDOCS + + :Returns: + + (`dask.array.Array`, function) or (`dask.array.Array`, `None`) + TODOACTIVEDOCS + + """ + from numbers import Integral + + import dask.array as da + from dask.array.utils import validate_axis + from dask.base import collections_to_dsk + + # Parse axis + if axis is None: + axis = tuple(range(a.ndim)) + else: + if isinstance(axis, Integral): + axis = (axis,) + + if len(axis) != a.ndim: + # Can't (yet) use active storage to collapse a subset of + # the axes, so return the input data unchanged. 
+            return a, None
+
+    axis = validate_axis(axis, a.ndim)
+
+    # Loop round elements of the dask graph, looking for data
+    # definitions that point to a file and which support active
+    # storage operations. The elements are traversed in reverse order
+    # so that the data definitions come out first, allowing for the
+    # potential of a faster short circuit when using active storage is
+    # not possible.
+    ok_to_actify = True
+    dsk = collections_to_dsk((a,), optimize_graph=True)
+    for key, value in reversed(dsk.items()):
+        try:
+            filenames = value.get_filenames()
+        except AttributeError:
+            # This dask chunk is not a data definition
+            continue
+
+        if not filenames:
+            # This data definition doesn't have any files, so can't
+            # support active storage reductions
+            ok_to_actify = False
+            break
+
+        # Still here? Then this chunk is a data definition that points
+        # to files, so try to insert an actified copy into the dask
+        # graph.
+        try:
+            new_value = value.actify(method, axis)
+        except AttributeError:
+            # This data definition doesn't support active storage
+            # reductions
+            ok_to_actify = False
+            break
+
+        if new_value is None:
+            # This data definition wasn't actifiable
+            ok_to_actify = False
+            break
+
+        dsk[key] = new_value
+
+    if not ok_to_actify:
+        # The dask graph is not suitable for active storage
+        # reductions, so return the input data unchanged.
+        return a, None
+
+    # Still here? Then all data definitions in the dask graph support
+    # active storage reductions => redefine the array from the
+    # actified dask graph, and define the active storage reduction
+    # chunk function.
+    return (
+        da.Array(dsk, a.name, a.chunks, a.dtype, a._meta),
+        _active_chunk_functions[method],
+    )
+
+
+def active_storage(method):
+    """A decorator for `Collapse` methods that enables active storage
+    operations, when the conditions are right.
+
+    .. versionadded:: ACTIVEVERSION
+
+    .. seealso:: `cf.data.collapse.Collapse`
+
+    :Parameters:
+
+        method: `str`
+            TODOACTIVEDOCS
+
+    """
+
+    def decorator(collapse_method):
+        @wraps(collapse_method)
+        def wrapper(self, *args, **kwargs):
+            if (
+                kwargs.get("active_storage")
+                and method in _active_chunk_functions
+                and kwargs.get("weights") is None
+                and kwargs.get("chunk_function") is None
+            ):
+                # Attempt to actify the dask array and provide a new
+                # chunk function
+                a, chunk_function = actify(
+                    args[0],
+                    method=method,
+                    axis=kwargs.get("axis"),
+                )
+                args = list(args)
+                args[0] = a
+
+                if chunk_function is not None:
+                    # The dask array has been actified, so update the
+                    # chunk function.
+ kwargs["chunk_function"] = chunk_function + + # Create the collapse + return collapse_method(self, *args, **kwargs) + + return wrapper + + return decorator From 9fcc7374e743d52ba278a0423abc55b127235476 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 17 Mar 2023 17:19:00 +0000 Subject: [PATCH 022/134] dev --- cf/data/array/mixin/activestoragemixin.py | 10 +- cf/data/collapse/__init__.py | 1 + cf/data/collapse/collapse.py | 22 +-- cf/data/collapse/collapse_active.py | 171 +++++++++++++--------- cf/data/data.py | 39 ++--- cf/docstring/docstring.py | 8 +- cf/functions.py | 13 +- cf/test/test_functions.py | 2 +- 8 files changed, 145 insertions(+), 121 deletions(-) diff --git a/cf/data/array/mixin/activestoragemixin.py b/cf/data/array/mixin/activestoragemixin.py index b4bdfe5ed0..2c7f6b5d3c 100644 --- a/cf/data/array/mixin/activestoragemixin.py +++ b/cf/data/array/mixin/activestoragemixin.py @@ -3,9 +3,6 @@ except ModuleNotFoundError: Active = None -# Global lock for netCDF file access -from ...utils import netcdf_lock - class ActiveStorageMixin: """TODOACTIVEDOCS. @@ -36,7 +33,7 @@ def __getitem__(self, indices): """ method = self.get_active_method() - if method is None or Active is None: + if method is None: # Normal read by local client. Returns a numpy array. return super().__getitem__(indices) @@ -54,7 +51,10 @@ def __getitem__(self, indices): ) active.method = method active.components = True - active.lock = netcdf_lock + try: + active.lock = self._dask_lock + except AttributeError: + pass return active[indices] diff --git a/cf/data/collapse/__init__.py b/cf/data/collapse/__init__.py index 0de12360ea..47bbd037ce 100644 --- a/cf/data/collapse/__init__.py +++ b/cf/data/collapse/__init__.py @@ -1 +1,2 @@ from .collapse import Collapse +from .collapse_active import actify diff --git a/cf/data/collapse/collapse.py b/cf/data/collapse/collapse.py index c4bc1ccfc5..fc1ccc07cf 100644 --- a/cf/data/collapse/collapse.py +++ b/cf/data/collapse/collapse.py @@ -12,21 +12,23 @@ class Collapse(metaclass=DocstringRewriteMeta): """Container for functions that collapse dask arrays. - **Active storage** + **Active storage reductions** - A collapse method (such as `max`, `max_abs`, etc.) will attempt to - make use of active storage if: + A collapse method (such as `max`, `var`, etc.) will attempt to + make use of active storage reductions if: - * The collapse method's *active_storage* parameter is True. + * The collapse method's *active_storage* parameter is set to True. - * The method has a corresponding active function defined in - `collapse_active`. + * The method has a corresponding active chunk function defined in + the `collapse_active.active_chunk_functions` dictionary. - When these conditions are passed, the graph of the `dask` array is + These conditions alone are not sufficient active storage + reductions to occur. In addition, the graph of the `dask` array is inspected to confirm that making use of active storage is - possible, and if so the `dask` graph is modified to expect the - per-chunk reductions to be carried out externally. See - `collapse_active.actify` for details. + possible, and if so the graph is modified to expect the per-chunk + reductions to be carried out externally. + + See `cf.data.collapse.actify` for details. .. 
versionadded:: 3.14.0 diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index 05d6fe5a07..5684927cef 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -5,24 +5,27 @@ # Define the active functions # -------------------------------------------------------------------- def active_min(a, **kwargs): - """Chunk calculations for the minimum. + """Chunk function for minimum values computed by active storage. - TODO Assumes that the calculations have already been done, - i.e. that *a* is already the minimum. + Converts active storage reduction components to the components + expected by the reduction combine and aggregate functions. This function is intended to be passed to `dask.array.reduction` - as the ``chunk`` parameter. Its return signature must be the same - as the non-active chunk function that it is replacing. + as the ``chunk`` parameter. Its returned value must be the same as + the non-active chunk function that it is replacing. .. versionadded:: ACTIVEVERSION + .. seealso:: `actify` + :Parameters: a: `dict` - TODOACTIVEDOCS + The components output from the active storage + reduction. For instance: - kwargs: optional - TODOACTIVEDOCS + >>> print(a) + {'min': array([[[49.5]]], dtype=float32), 'n': 1015808} :Returns: @@ -30,31 +33,34 @@ def active_min(a, **kwargs): Dictionary with the keys: * N: The sample size. - * min: The minimum of `a``. + * min: The minimum. """ return {"N": a["n"], "min": a["min"]} def active_max(a, **kwargs): - """Chunk calculations for the maximum. + """Chunk function for maximum values computed by active storage. - TODO Assumes that the calculations have already been done, - i.e. that *a* is already the maximum. + Converts active storage reduction components to the components + expected by the reduction combine and aggregate functions. This function is intended to be passed to `dask.array.reduction` - as the ``chunk`` parameter. Its return signature must be the same - as the non-active chunk function that it is replacing. + as the ``chunk`` parameter. Its returned value must be the same as + the non-active chunk function that it is replacing. .. versionadded:: ACTIVEVERSION + .. seealso:: `actify` + :Parameters: a: `dict` - TODOACTIVEDOCS + The components output from the active storage + reduction. For instance: - kwargs: optional - TODOACTIVEDOCS + >>> print(a) + {'max': array([[[2930.4856]]], dtype=float32), 'n': 1015808} :Returns: @@ -62,31 +68,34 @@ def active_max(a, **kwargs): Dictionary with the keys: * N: The sample size. - * max: The maximum of `a``. + * max: The maximum. """ return {"N": a["n"], "max": a["max"]} def active_mean(a, **kwargs): - """Chunk calculations for the unweighted mean. + """Chunk function for mean values computed by active storage. - TODO Assumes that the calculations have already been done, - i.e. that *a* is already the uweighted mean. + Converts active storage reduction components to the components + expected by the reduction combine and aggregate functions. This function is intended to be passed to `dask.array.reduction` - as the ``chunk`` parameter. Its return signature must be the same - as the non-active chunk function that it is replacing. + as the ``chunk`` parameter. Its returned value must be the same as + the non-active chunk function that it is replacing. .. versionadded:: ACTIVEVERSION + .. seealso:: `actify` + :Parameters: a: `dict` - TODOACTIVEDOCS + The components output from the active storage + reduction. 
For instance: - kwargs: optional - TODOACTIVEDOCS + >>> print(a) + {'sum': array([[[1.5131907e+09]]], dtype=float32), 'n': 1015808} :Returns: @@ -96,7 +105,7 @@ def active_mean(a, **kwargs): * N: The sample size. * V1: The sum of ``weights``. Always equal to ``N`` because weights have not been set. - * sum: The un-weighted sum of ``a``. + * sum: The un-weighted sum. * weighted: True if weights have been set. Always False. @@ -105,24 +114,27 @@ def active_mean(a, **kwargs): def active_sum(a, **kwargs): - """Chunk calculations for the unweighted sum. + """Chunk function for sum values computed by active storage. - TODO Assumes that the calculations have already been done, - i.e. that *a* is already the uweighted sum. + Converts active storage reduction components to the components + expected by the reduction combine and aggregate functions. This function is intended to be passed to `dask.array.reduction` - as the ``chunk`` parameter. Its return signature must be the same - as the non-active chunk function that it is replacing. + as the ``chunk`` parameter. Its returned value must be the same as + the non-active chunk function that it is replacing. .. versionadded:: ACTIVEVERSION + .. seealso:: `actify` + :Parameters: a: `dict` - TODOACTIVEDOCS + The components output from the active storage + reduction. For instance: - kwargs: optional - TODOACTIVEDOCS + >>> print(a) + {'sum': array([[[1.5131907e+09]]], dtype=float32), 'n': 1015808} :Returns: @@ -130,7 +142,7 @@ def active_sum(a, **kwargs): Dictionary with the keys: * N: The sample size. - * sum: The un-weighted sum of ``a`` + * sum: The un-weighted sum. """ return {"N": a["n"], "sum": a["sum"]} @@ -140,7 +152,7 @@ def active_sum(a, **kwargs): # Create a map of reduction methods to their corresponding active # functions # -------------------------------------------------------------------- -_active_chunk_functions = { +active_chunk_functions = { "min": active_min, "max": active_max, "mean": active_mean, @@ -149,12 +161,17 @@ def active_sum(a, **kwargs): def actify(a, method, axis=None): - """TODOACTIVEDOCS. + """Modify a dask array to use active storage reductions. + + The dask graph is inspected to ensure that active storage + reductions are possible, and if not then the dask array is + returned unchanged. It is assumed that: - * The *method* has an entry in the `_active_chunk_functions` - dictionary + * The method has a corresponding active function defined in the + `active_chunk_functions` dictionary. If this is not the case + then an error will occur at definition time. * The `!active_storage` attribute of the `Data` object that provided the dask array *a* is `True`. If this is not the case @@ -162,23 +179,37 @@ def actify(a, method, axis=None): .. versionadded:: ACTIVEVERSION + .. seealso:: `active_storage` + :Parameters: a: `dask.array.Array` The array to be collapsed. method: `str` - TODOACTIVEDOCS + The name of the reduction method. Must be a key of the + `active_chunk_functions` dictionary. axis: (sequence of) `int`, optional - TODOACTIVEDOCS + Axis or axes along which to operate. By default, + flattened input is used. :Returns: (`dask.array.Array`, function) or (`dask.array.Array`, `None`) - TODOACTIVEDOCS + If active storage operations are possible then return the + modified dask array and the new chunk reduction + function. Otherwise return the unaltered input array and + `None`. 
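For illustration only, and not part of the patch: a minimal sketch of the calling pattern that this contract implies. Here `dx` is assumed to be a dask array whose chunks are file-backed (for example, built from `NetCDFArray` objects), and `cf_min_chunk` is the existing in-memory chunk function that the collapse would otherwise use.

    from cf.data.collapse import actify
    from cf.data.collapse.dask_collapse import cf_min_chunk

    # Try to push the per-chunk minima out to active storage.
    dx, chunk_function = actify(dx, "min", axis=None)
    if chunk_function is None:
        # The graph could not be actified, so fall back to the normal
        # in-memory chunk function.
        chunk_function = cf_min_chunk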
""" + try: + from activestorage import Active # noqa: F401 + except ModuleNotFoundError: + # The active storage class dependency is not met, so using + # active storage is not possible. + return a, None + from numbers import Integral import dask.array as da @@ -199,24 +230,25 @@ def actify(a, method, axis=None): axis = validate_axis(axis, a.ndim) - # Loop round elements of the dask graph, looking for data - # definitions that point to a file and which support active - # storage operations. The elements are traversed in reverse order - # so that the data defintions come out first, allowing for the - # potential of a faster short circuit when using active storage is - # not possible. + # Loop round the nodes of the dask graph, looking for data + # definitions that point to files and which support active storage + # operations, and modify the dask grpah when we find them. + # + # The elements are traversed in reverse order so that the data + # defintions come out first, allowing for the potential of a + # faster short circuit when using active storage is not possible. ok_to_actify = True dsk = collections_to_dsk((a,), optimize_graph=True) for key, value in reversed(dsk.items()): try: - filenames = value.get_filenames() + filename = value.get_filename() except AttributeError: # This dask chunk is not a data definition continue - if not filenames: + if not filename: # This data definition doesn't have any files, so can't - # support active storage reductions + # support active storage reductions. ok_to_actify = False break @@ -224,47 +256,44 @@ def actify(a, method, axis=None): # to files, so try to insert an actified copy into the dask # graph. try: - new_value = value.actify(method, axis) + dsk[key] = value.actify(method, axis) except AttributeError: # This data definition doesn't support active storage # reductions ok_to_actify = False break - if new_value is None: - # This data definition wasn't actifiable - ok_to_actify = False - break - - dsk[key] = new_value - if not ok_to_actify: - # The dask graph is not suitable for active storage - # reductions, so return the input data unchanged. + # It turns out that the dask graph is not suitable for active + # storage reductions, so return the input data unchanged. return a, None # Still here? Then all data definitions in the dask graph support - # active storage reductions => redefine the array from the - # actified dask graph, and define the active storage reduction - # chunk function. + # active storage reductions => redefine the dask array from the + # actified dask graph, and set the active storage reduction chunk + # function. return ( da.Array(dsk, a.name, a.chunks, a.dtype, a._meta), - _active_chunk_functions[method], + active_chunk_functions[method], ) def active_storage(method): - """A decorator for `Collapse` methods that enables active storage - operations, when the conditions are right. + """A decorator that enables active storage reductions. + + This decorator is intended for `Collapse` methods. Active storage + operations are only carried out when the conditions are right. .. versionadded:: ACTIVEVERSION - .. seealso `cf.data.collapse.Collapse` + .. seealso:: `actify`, `cf.data.collapse.Collapse` :Parameters: method: `str` - TODOACTIVEDOCS + The name of the reduction method. If it is not one of the + keys of the `active_chunk_functions` dictionary then + active storage reductions will not occur. 
""" @@ -273,7 +302,7 @@ def decorator(collapse_method): def wrapper(self, *args, **kwargs): if ( kwargs.get("active_storage") - and method in _active_chunk_functions + and method in active_chunk_functions and kwargs.get("weights") is None and kwargs.get("chunk_function") is None ): diff --git a/cf/data/data.py b/cf/data/data.py index b736a9ec76..fc85a5d8a6 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -29,6 +29,7 @@ from ..functions import ( _DEPRECATION_ERROR_KWARGS, _section, + active_storage, atol, default_netCDF_fillvals, free_memory, @@ -37,7 +38,6 @@ ) from ..mixin_container import Container from ..units import Units -from .array.mixin import FileArrayMixin from .collapse import Collapse from .creation import generate_axis_identifiers, to_dask from .dask_utils import ( @@ -94,7 +94,7 @@ _NONE = 0 # = 0b0000 _ARRAY = 1 # = 0b0001 _CACHE = 2 # = 0b0010 -_ACTIVE = 8 # = 0b0010 +_ACTIVE = 8 # = 0b1000 _ALL = 15 # = 0b1111 @@ -438,6 +438,7 @@ def __init__( # compressed input arrays this will contain extra # information, such as a count or index variable. self._set_Array(array) + # Data files are candidates for active storage reductions self._set_active_storage(True) # Cast the input data as a dask array @@ -624,24 +625,6 @@ def _rtol(self): """Return the current value of the `cf.rtol` function.""" return rtol().value - def _is_file_array(self, array): - """Whether or not an array is stored on disk. - - .. versionaddedd: ACTIVEVERSION - - :Parameters: - - array: - TODOACTIVEDOCS - - :Returns: - - `bool` - TODOACTIVEDOCS - - """ - return isinstance(array, FileArrayMixin) - def __data__(self): """Returns a new reference to self.""" return self @@ -1281,7 +1264,8 @@ def _clear_after_dask_update(self, clear=_ALL): * If ``clear & _CACHE`` is non-zero then cached element values are deleted. - * If ``clear & _ACTIVE`` is non-zero then TODOACTIVE + * If ``clear & _ACTIVE`` is non-zero then set the + active storage status to `False`. By default *clear* is the ``_ALL`` integer-valued constant, which results in all components being @@ -1315,7 +1299,7 @@ def _clear_after_dask_update(self, clear=_ALL): self._del_cached_elements() if clear & _ACTIVE: - # Delete cached element values + # Set active storage to False self._del_active_storage() def _set_dask(self, array, copy=False, clear=_ALL): @@ -1425,7 +1409,7 @@ def _del_dask(self, default=ValueError(), clear=_ALL): return out def _del_active_storage(self): - """TODOACTIVEDOCS. + """Set the active storage reduction status to False. .. versionadded:: ACTIVEVERSION @@ -1510,7 +1494,7 @@ def _is_abstract_Array_subclass(self, array): return isinstance(array, cfdm.Array) def _set_active_storage(self, value): - """TODOACTIVEDOCS. + """Set the active storage reduction status. .. versionadded:: ACTIVEVERSION @@ -1518,8 +1502,7 @@ def _set_active_storage(self, value): :Returns: - `bool` - TODOACTIVEDOCS + `None` **Examples** @@ -7475,7 +7458,9 @@ def unique(self, split_every=None): dx = d.to_dask_array() dx = Collapse().unique( - dx, split_every=split_every, active_storage=d.active_storage + dx, + split_every=split_every, + active_storage=d.active_storage and active_storage(), ) d._set_dask(dx) diff --git a/cf/docstring/docstring.py b/cf/docstring/docstring.py index a3490fe51a..2555e6f59e 100644 --- a/cf/docstring/docstring.py +++ b/cf/docstring/docstring.py @@ -317,10 +317,10 @@ # active_storage "{{active_storage: `bool`, optional}}": """{{active_storage: `bool`, optional}} If True then attempt to perform the collapse using - active storage. 
However, if other necessary conditions - are not met (see `cf.data.collapse.actify` for - details) then the operation will be executed without - active storage.""", + active storage reductions. However, if other necessary + conditions are not met (see `cf.data.collapse.actify` + for details) then the operation will be executed + without active storage.""", # Collapse chunk_function "{{chunk_function: callable, optional}}": """{{chunk_function: callable, optional}} Provides the ``chunk`` parameter to diff --git a/cf/functions.py b/cf/functions.py index 8bfd33d41c..61ce74b547 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -263,7 +263,9 @@ def configuration( current value. active_storage: `bool` or `Constant`, optional - TODOACTIVEDOCS + The new value (either True to enable active storage + reductions or False to disable them). The default is to + not change the current behaviour. .. versionaddedd:: ACTIVEVERSION @@ -1146,7 +1148,7 @@ def _parse(cls, arg): class active_storage(ConstantAccess): - """TODOACTIVEDOCS + """Whether or not to allow active storage reductions. .. versionadded:: ACTIVEVERSION @@ -1166,7 +1168,12 @@ class active_storage(ConstantAccess): **Examples** - TODOACTIVEDOCS + >>> cf.active_storage() + True + >>> cf.active_storage(False) + True + >>> cf.active_storage() + False """ diff --git a/cf/test/test_functions.py b/cf/test/test_functions.py index 7cc43ce93a..bab173dfea 100644 --- a/cf/test/test_functions.py +++ b/cf/test/test_functions.py @@ -54,7 +54,7 @@ def test_configuration(self): self.assertIsInstance(org, dict) # Check all keys that should be there are, with correct value type: - self.assertEqual(len(org), 8) # update expected len if add new key(s) + self.assertEqual(len(org), 9) # update expected len if add new key(s) # Types expected: self.assertIsInstance(org["atol"], float) From c2e7eca5b3392b4efeff62ee518a25ed487a313e Mon Sep 17 00:00:00 2001 From: David Hassell Date: Sat, 18 Mar 2023 08:30:07 +0000 Subject: [PATCH 023/134] move netcdf lock --- cf/data/array/netcdfarray.py | 7 ++++--- cf/data/utils.py | 6 ------ 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/cf/data/array/netcdfarray.py b/cf/data/array/netcdfarray.py index 7dece7d1a3..dcf9a4336e 100644 --- a/cf/data/array/netcdfarray.py +++ b/cf/data/array/netcdfarray.py @@ -1,10 +1,11 @@ import cfdm +from dask.utils import SerializableLock from ...mixin_container import Container +from .mixin import ActiveStorageMixin, FileArrayMixin # Global lock for netCDF file access -from ..utils import netcdf_lock -from .mixin import ActiveStorageMixin, FileArrayMixin +_lock = SerializableLock() class NetCDFArray( @@ -41,4 +42,4 @@ def _dask_lock(self): if filename is None: return False - return netcdf_lock + return _lock diff --git a/cf/data/utils.py b/cf/data/utils.py index 360f6312af..ff73675a56 100644 --- a/cf/data/utils.py +++ b/cf/data/utils.py @@ -5,7 +5,6 @@ import dask.array as da import numpy as np -from dask.utils import SerializableLock from ..cfdatetime import ( canonical_calendar, @@ -21,11 +20,6 @@ _units_None = Units(None) -# -------------------------------------------------------------------- -# Global lock for netCDF file access -# -------------------------------------------------------------------- -netcdf_lock = SerializableLock() - def is_numeric_dtype(array): """True if the given array is of a numeric or boolean data type. 
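The commit above moves the global netCDF lock onto the array class itself. As a self-contained illustration of the locking pattern that the new `_dask_lock` property supports (this sketch is not part of the patch series, and the in-memory array below merely stands in for a file-backed `NetCDFArray`):

    import dask.array as da
    import numpy as np
    from dask.utils import SerializableLock

    # A single serialisable lock shared by every file-backed array makes
    # dask serialise reads through the non-thread-safe netCDF-C/HDF5
    # libraries, and, unlike a plain threading.Lock, it survives being
    # pickled when a graph is shipped to distributed workers.
    _lock = SerializableLock()

    data_on_disk = np.arange(24.0).reshape(4, 6)
    dx = da.from_array(data_on_disk, chunks=(2, 3), lock=_lock)
    print(float(dx.max().compute()))  # 23.0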
From 064de91adb03a0f00b828f260410e494d7bde200 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 25 Sep 2023 14:06:24 +0100 Subject: [PATCH 024/134] dev --- cf/data/array/netcdfarray.py | 6 ++++-- cf/data/data.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/cf/data/array/netcdfarray.py b/cf/data/array/netcdfarray.py index bdd98089ea..f8361708e7 100644 --- a/cf/data/array/netcdfarray.py +++ b/cf/data/array/netcdfarray.py @@ -8,11 +8,13 @@ _lock = SerializableLock() -class NetCDFArray(ActiveStorageMixin, FileArrayMixin, ArrayMixin, Container, cfdm.NetCDFArray): +class NetCDFArray( + ActiveStorageMixin, FileArrayMixin, ArrayMixin, Container, cfdm.NetCDFArray +): """An array stored in a netCDF file. TODOACTIVEDOCS - + """ def __dask_tokenize__(self): diff --git a/cf/data/data.py b/cf/data/data.py index 4f0e6acac5..0d28204ace 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -32,8 +32,8 @@ _DEPRECATION_ERROR_KWARGS, _numpy_allclose, _section, - active_storage, abspath, + active_storage, atol, default_netCDF_fillvals, free_memory, From 78b7269c4f7ab38f658ce5c7a9a78d277a93ac5d Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 22 Jan 2024 23:12:34 +0000 Subject: [PATCH 025/134] dev --- cf/__init__.py | 1 + cf/cfimplementation.py | 3 ++ cf/data/array/__init__.py | 1 + cf/data/array/hdfarray.py | 39 +++++++++++++++++++++++++ cf/data/collapse/collapse.py | 9 +++--- cf/data/collapse/collapse_active.py | 4 +-- cf/data/fragment/netcdffragmentarray.py | 4 +-- cf/docstring/docstring.py | 5 ++-- 8 files changed, 56 insertions(+), 10 deletions(-) create mode 100644 cf/data/array/hdfarray.py diff --git a/cf/__init__.py b/cf/__init__.py index 6ee6c353e2..bd9b83b0ea 100644 --- a/cf/__init__.py +++ b/cf/__init__.py @@ -277,6 +277,7 @@ CFANetCDFArray, FullArray, GatheredArray, + HDFArray, NetCDFArray, PointTopologyArray, RaggedContiguousArray, diff --git a/cf/cfimplementation.py b/cf/cfimplementation.py index 9a4c2090d8..10065081a9 100644 --- a/cf/cfimplementation.py +++ b/cf/cfimplementation.py @@ -31,6 +31,7 @@ CellConnectivityArray, CFANetCDFArray, GatheredArray, + HDFArray, NetCDFArray, PointTopologyArray, RaggedContiguousArray, @@ -175,6 +176,7 @@ def initialise_CFANetCDFArray( BoundsFromNodesArray=BoundsFromNodesArray, CellConnectivityArray=CellConnectivityArray, GatheredArray=GatheredArray, + HDFArray=HDFArray, NetCDFArray=NetCDFArray, PointTopologyArray=PointTopologyArray, RaggedContiguousArray=RaggedContiguousArray, @@ -230,6 +232,7 @@ def implementation(): 'PartNodeCountProperties': cf.partnodecountproperties.PartNodeCountProperties, 'Data': cf.data.data.Data, 'GatheredArray': cf.data.array.gatheredarray.GatheredArray, + 'HDFArray': cf.data.array.hdfarray.HDFArray, 'NetCDFArray': cf.data.array.netcdfarray.NetCDFArray, 'PointTopologyArray': , 'RaggedContiguousArray': cf.data.array.raggedcontiguousarray.RaggedContiguousArray, diff --git a/cf/data/array/__init__.py b/cf/data/array/__init__.py index c21f8916c5..3a35b976fb 100644 --- a/cf/data/array/__init__.py +++ b/cf/data/array/__init__.py @@ -3,6 +3,7 @@ from .cfanetcdfarray import CFANetCDFArray from .fullarray import FullArray from .gatheredarray import GatheredArray +from .hdfarray import HDFArray from .netcdfarray import NetCDFArray from .pointtopologyarray import PointTopologyArray from .raggedcontiguousarray import RaggedContiguousArray diff --git a/cf/data/array/hdfarray.py b/cf/data/array/hdfarray.py new file mode 100644 index 0000000000..f8dbff9fda --- /dev/null +++ b/cf/data/array/hdfarray.py @@ -0,0 
+1,39 @@ +import cfdm +from dask.utils import SerializableLock + +from ...mixin_container import Container +from .mixin import ActiveStorageMixin, ArrayMixin, FileArrayMixin + +# Global lock for netCDF file access +_lock = SerializableLock() + + +class HDFArray(ActiveStorageMixin, FileArrayMixin, ArrayMixin, Container, cfdm.HDFArray): + """An array stored in a netCDF file.] + + .. versionadded:: HDFVER + + """ + + def __dask_tokenize__(self): + """Return a value fully representative of the object. + + .. versionadded:: HDFVER + + """ + return super().__dask_tokenize__() + (self.get_mask(),) + + @property + def _lock(self): + """Set the lock for use in `dask.array.from_array`. + + Returns a lock object because concurrent reads are not + currently supported by the netCDF-C library. The lock object + will be the same for all `NetCDFArray` instances, regardless + of the dataset they access, which means that access to all + netCDF files coordinates around the same lock. + + .. versionadded:: HDFVER + + """ + return _lock diff --git a/cf/data/collapse/collapse.py b/cf/data/collapse/collapse.py index fc1ccc07cf..8970f2371f 100644 --- a/cf/data/collapse/collapse.py +++ b/cf/data/collapse/collapse.py @@ -15,12 +15,13 @@ class Collapse(metaclass=DocstringRewriteMeta): **Active storage reductions** A collapse method (such as `max`, `var`, etc.) will attempt to - make use of active storage reductions if: + make use of active storage reductions if both of the following are + true: - * The collapse method's *active_storage* parameter is set to True. + 1. The collapse method's *active_storage* parameter is True. - * The method has a corresponding active chunk function defined in - the `collapse_active.active_chunk_functions` dictionary. + 2. The method has a corresponding active chunk function defined in + the `collapse_active.active_chunk_functions` dictionary. These conditions alone are not sufficient active storage reductions to occur. In addition, the graph of the `dask` array is diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index 5684927cef..d7102c61b4 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -230,9 +230,9 @@ def actify(a, method, axis=None): axis = validate_axis(axis, a.ndim) - # Loop round the nodes of the dask graph, looking for data + # Loop round the nodes of the dask graph looking for data # definitions that point to files and which support active storage - # operations, and modify the dask grpah when we find them. + # operations, and modify the dask graph when we find them. # # The elements are traversed in reverse order so that the data # defintions come out first, allowing for the potential of a diff --git a/cf/data/fragment/netcdffragmentarray.py b/cf/data/fragment/netcdffragmentarray.py index bd6f9c3683..9b85ed3597 100644 --- a/cf/data/fragment/netcdffragmentarray.py +++ b/cf/data/fragment/netcdffragmentarray.py @@ -1,9 +1,9 @@ -from ..array.mixin import ActiveStorageMixin +#from ..array.mixin import ActiveStorageMixin from ..array.netcdfarray import NetCDFArray from .mixin import FragmentArrayMixin -class NetCDFFragmentArray(ActiveStorageMixin, FragmentArrayMixin, NetCDFArray): +class NetCDFFragmentArray(FragmentArrayMixin, NetCDFArray): """A CFA fragment array stored in a netCDF file. .. 
versionadded:: 3.14.0 diff --git a/cf/docstring/docstring.py b/cf/docstring/docstring.py index 4983b297ec..60dab55a17 100644 --- a/cf/docstring/docstring.py +++ b/cf/docstring/docstring.py @@ -324,8 +324,9 @@ # Collapse chunk_function "{{chunk_function: callable, optional}}": """{{chunk_function: callable, optional}} Provides the ``chunk`` parameter to - `dask.array.reduction`. If unset then an approriate - default function will be used.""", + `dask.array.reduction`. If unset then an appropriate + default function from `cf.data.collapse.dask_collapse` + will be used.""", # Collapse weights "{{Collapse weights: data_like or `None`, optional}}": """weights: data_like or `None`, optional Weights associated with values of the array. By From 39a5a647da999d38c5fd28ea5fab9789bf5b93d1 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 25 Jan 2024 17:39:09 +0000 Subject: [PATCH 026/134] dev --- cf/data/array/hdfarray.py | 12 +++++------- cf/data/array/mixin/activestoragemixin.py | 12 +++++++++--- cf/data/array/netcdfarray.py | 13 ++++++------- cf/data/collapse/collapse.py | 1 + cf/data/collapse/collapse_active.py | 13 +++++++++---- cf/data/data.py | 18 +++++++----------- cf/read_write/read.py | 23 +++++++++++++++++++++-- 7 files changed, 58 insertions(+), 34 deletions(-) diff --git a/cf/data/array/hdfarray.py b/cf/data/array/hdfarray.py index f8dbff9fda..6da6ef2aeb 100644 --- a/cf/data/array/hdfarray.py +++ b/cf/data/array/hdfarray.py @@ -2,11 +2,9 @@ from dask.utils import SerializableLock from ...mixin_container import Container +from .locks import _lock from .mixin import ActiveStorageMixin, ArrayMixin, FileArrayMixin -# Global lock for netCDF file access -_lock = SerializableLock() - class HDFArray(ActiveStorageMixin, FileArrayMixin, ArrayMixin, Container, cfdm.HDFArray): """An array stored in a netCDF file.] @@ -28,10 +26,10 @@ def _lock(self): """Set the lock for use in `dask.array.from_array`. Returns a lock object because concurrent reads are not - currently supported by the netCDF-C library. The lock object - will be the same for all `NetCDFArray` instances, regardless - of the dataset they access, which means that access to all - netCDF files coordinates around the same lock. + currently supported by the HDF5 library. The lock object will + be the same for all `NetCDFArray` and `HDFArray` instances, + regardless of the dataset they access, which means that access + to all netCDF and HDF files coordinates around the same lock. .. versionadded:: HDFVER diff --git a/cf/data/array/mixin/activestoragemixin.py b/cf/data/array/mixin/activestoragemixin.py index 2c7f6b5d3c..751045a456 100644 --- a/cf/data/array/mixin/activestoragemixin.py +++ b/cf/data/array/mixin/activestoragemixin.py @@ -46,13 +46,19 @@ def __getitem__(self, indices): if missing_values is None: missing_values = {} + try: + s3 = self.get_s3() + except AttributeError: + s3 = {} + active = Active( - self.get_filename(), self.get_ncvar(), **missing_values + self.get_filename(), self.get_address(), **missing_values, + # **s3 ) active.method = method active.components = True try: - active.lock = self._dask_lock + active.lock = self._lock except AttributeError: pass @@ -64,7 +70,7 @@ def actify(self, method, axis=None): The new instance is a deep copy of the original, with the additional setting of the active storage method and axis. - .. versionadded:: ACTIVEVER + .. versionadded:: ACTIVEVERSION .. 
seealso:: `set_active_axis`, `set_active_method` diff --git a/cf/data/array/netcdfarray.py b/cf/data/array/netcdfarray.py index 34b64064ad..09ec9192ef 100644 --- a/cf/data/array/netcdfarray.py +++ b/cf/data/array/netcdfarray.py @@ -2,11 +2,9 @@ from dask.utils import SerializableLock from ...mixin_container import Container +from .locks import _lock from .mixin import ActiveStorageMixin, ArrayMixin, FileArrayMixin -# Global lock for netCDF file access -_lock = SerializableLock() - class NetCDFArray( ActiveStorageMixin, FileArrayMixin, ArrayMixin, Container, cfdm.NetCDFArray @@ -30,10 +28,11 @@ def _lock(self): """Set the lock for use in `dask.array.from_array`. Returns a lock object because concurrent reads are not - currently supported by the netCDF-C library. The lock object - will be the same for all `NetCDFArray` instances, regardless - of the dataset they access, which means that access to all - netCDF files coordinates around the same lock. + currently supported by the netCDF and HDF libraries. The lock + object will be the same for all `NetCDFArray` and `HDFArray` + instances, regardless of the dataset they access, which means + that access to all netCDF and HDF files coordinates around the + same lock. .. versionadded:: 3.14.0 diff --git a/cf/data/collapse/collapse.py b/cf/data/collapse/collapse.py index 8970f2371f..a08563d398 100644 --- a/cf/data/collapse/collapse.py +++ b/cf/data/collapse/collapse.py @@ -440,6 +440,7 @@ def min( The collapsed array. """ + print ('min: active_stoege =', active_storage) from .dask_collapse import cf_min_agg, cf_min_chunk, cf_min_combine if chunk_function is None: diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index d7102c61b4..6124b37564 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -1,4 +1,5 @@ from functools import wraps +import logging # -------------------------------------------------------------------- @@ -203,11 +204,13 @@ def actify(a, method, axis=None): `None`. """ + print ('runing actify') try: from activestorage import Active # noqa: F401 except ModuleNotFoundError: # The active storage class dependency is not met, so using # active storage is not possible. + print('oops') return a, None from numbers import Integral @@ -258,8 +261,8 @@ def actify(a, method, axis=None): try: dsk[key] = value.actify(method, axis) except AttributeError: - # This data definition doesn't support active storage - # reductions + # This data definition doesn't have an 'actify' method, + # and so doesn't support active storage reductions. ok_to_actify = False break @@ -281,8 +284,10 @@ def actify(a, method, axis=None): def active_storage(method): """A decorator that enables active storage reductions. - This decorator is intended for `Collapse` methods. Active storage - operations are only carried out when the conditions are right. + This decorator is intended for `Collapse` methods. When a + `Collapse` method is decorated, active storage operations are only + carried out when the conditions are right. See `Collapse` for + details. .. versionadded:: ACTIVEVERSION diff --git a/cf/data/data.py b/cf/data/data.py index e2b9ce48ee..e146584a34 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -96,12 +96,12 @@ # Contstants used to specify which `Data` components should be cleared # when a new dask array is set. See `Data._clear_after_dask_update` # for details. 
-_NONE = 0 # = 0b0000 -_ARRAY = 1 # = 0b0001 -_CACHE = 2 # = 0b0010 -_CFA = 4 # = 0b0100 +_NONE = 0 # = 0b0000 +_ARRAY = 1 # = 0b0001 +_CACHE = 2 # = 0b0010 +_CFA = 4 # = 0b0100 _ACTIVE = 8 # = 0b1000 -_ALL = 15 # = 0b1111 +_ALL = 15 # = 0b1111 class Data(DataClassDeprecationsMixin, CFANetCDF, Container, cfdm.Data): @@ -1328,9 +1328,6 @@ def _clear_after_dask_update(self, clear=_ALL): * If ``clear & _ACTIVE`` is non-zero then set the active storage status to `False`. - * If ``clear`` is non-zero then the CFA term status is - set to `False`. - By default *clear* is the ``_ALL`` integer-valued constant, which results in all components being removed. @@ -1411,8 +1408,8 @@ def _set_dask(self, array, copy=False, clear=_ALL): "suitability (such as data type casting, " "broadcasting, etc.). Note that the exception may be " "difficult to diagnose, as dask will have silently " - "trapped it and returned NotImplemented (for " - "instance, see dask.array.core.elemwise). Print " + "trapped it and returned NotImplemented (see, for " + "instance, dask.array.core.elemwise). Print " "statements in a local copy of dask are possibly the " "way to go if the cause of the error is not obvious." ) @@ -8111,7 +8108,6 @@ def unique(self, split_every=None): dx = Collapse().unique( dx, split_every=split_every, - active_storage=d.active_storage and active_storage(), ) d._set_dask(dx) diff --git a/cf/read_write/read.py b/cf/read_write/read.py index a7ad3f23af..65df0c07aa 100644 --- a/cf/read_write/read.py +++ b/cf/read_write/read.py @@ -62,6 +62,7 @@ def read( chunks="auto", domain=False, cfa=None, + s3=None ): """Read field or domain constructs from files. @@ -664,6 +665,16 @@ def read( .. versionadded:: 3.15.0 + s3: `None` or `dict` , optional + TODOACTIVEDOCS + + .. versionadded:: (cfdm) ACTIVEVERSION + + _no_HDF: `bool`, optional + TODOACTIVEDOCS + + .. versionadded:: (cfdm) ACTIVEVERSION + umversion: deprecated at version 3.0.0 Use the *um* parameter instead. @@ -885,8 +896,8 @@ def read( file_glob = os.path.expanduser(os.path.expandvars(file_glob)) scheme = urlparse(file_glob).scheme - if scheme in ("https", "http"): - # Do not glob a URL + if scheme in ("https", "http", "s3"): + # Do not glob a remote URL files2 = (file_glob,) else: # Glob files on disk @@ -955,6 +966,7 @@ def read( select=select, domain=domain, cfa_options=cfa_options, + s3=s3, ) # -------------------------------------------------------- @@ -1069,6 +1081,7 @@ def _read_a_file( select=None, domain=False, cfa_options=None, + s3=None, ): """Read the contents of a single file into a field list. @@ -1104,6 +1117,11 @@ def _read_a_file( .. versionadded:: 3.15.0 + s3: `dict`, optional + TODOACTIVEDOCS + + .. 
versionadded:: AVTIVEVERSION + :Returns: `FieldList` or `DomainList` @@ -1177,6 +1195,7 @@ def _read_a_file( mask=mask, warn_valid=warn_valid, domain=domain, + s3=s3, ) except MaskError: # Some data required for field interpretation is missing, From 37f8b7f4bb4a82bfa400399e290696e6ddb4a490 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 25 Jan 2024 17:39:26 +0000 Subject: [PATCH 027/134] dev --- cf/data/array/locks.py | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 cf/data/array/locks.py diff --git a/cf/data/array/locks.py b/cf/data/array/locks.py new file mode 100644 index 0000000000..21255ec9b5 --- /dev/null +++ b/cf/data/array/locks.py @@ -0,0 +1,4 @@ +from dask.utils import SerializableLock + +# Global lock for file access +_lock = SerializableLock() From fe429b728baae2bd242c50697b71ddac700032ab Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 26 Jan 2024 10:29:49 +0000 Subject: [PATCH 028/134] dev --- cf/data/array/cfanetcdfarray.py | 16 +++++++++++++--- cf/read_write/netcdf/netcdfread.py | 5 +++-- cf/read_write/read.py | 18 ++++++++++++++---- 3 files changed, 30 insertions(+), 9 deletions(-) diff --git a/cf/data/array/cfanetcdfarray.py b/cf/data/array/cfanetcdfarray.py index 86465309e9..d75a5ed02c 100644 --- a/cf/data/array/cfanetcdfarray.py +++ b/cf/data/array/cfanetcdfarray.py @@ -6,6 +6,7 @@ from ..fragment import FullFragmentArray, NetCDFFragmentArray, UMFragmentArray from ..utils import chunk_locations, chunk_positions +from .mixin import CFAMixin from .netcdfarray import NetCDFArray # Store fragment array classes. @@ -16,7 +17,7 @@ } -class CFANetCDFArray(NetCDFArray): +class CFANetCDFArray(CFAMixin, NetCDFArray): """A CFA aggregated array stored in a netCDF file. .. versionadded:: 3.14.0 @@ -139,8 +140,10 @@ def __init__( location = x["location"] ndim = location.shape[0] - - chunks = [i.compressed().tolist() for i in location] + print("location =", location.shape, repr(location)) + compressed = np.ma.compressed + chunks = [compressed(i).tolist() for i in location] + # print(chunks) shape = [sum(c) for c in chunks] positions = chunk_positions(chunks) locations = chunk_locations(chunks) @@ -179,6 +182,13 @@ def __init__( fmt = np.full(fragment_shape, fmt, dtype=fmt.dtype) if extra_dimension: + print("-----") + import copy + + print(f.shape, repr(f)) + # if f.shape == (780, 1, 1, 2): + # for frag_loc, loc in zip(positions, locations): + # print(frag_loc, loc) aggregated_data = { frag_loc: { "location": loc, diff --git a/cf/read_write/netcdf/netcdfread.py b/cf/read_write/netcdf/netcdfread.py index f8437349c6..dc28ea52dd 100644 --- a/cf/read_write/netcdf/netcdfread.py +++ b/cf/read_write/netcdf/netcdfread.py @@ -654,6 +654,7 @@ def _create_cfanetcdfarray( """ g = self.read_vars + print(g["s3"]) # Get the kwargs needed to instantiate a general NetCDFArray # instance @@ -664,8 +665,8 @@ def _create_cfanetcdfarray( return_kwargs_only=True, ) - # Get rid of the incorrect shape of (). This will end up - # getting set correctly by the CFANetCDFArray instance. + # Get rid of the incorrect shape. This will end up getting set + # correctly by the CFANetCDFArray instance. kwargs.pop("shape", None) aggregated_data = g["cfa_aggregated_data"][ncvar] diff --git a/cf/read_write/read.py b/cf/read_write/read.py index 65df0c07aa..bfc46e0d20 100644 --- a/cf/read_write/read.py +++ b/cf/read_write/read.py @@ -62,7 +62,7 @@ def read( chunks="auto", domain=False, cfa=None, - s3=None + s3=None, ): """Read field or domain constructs from files. @@ -665,8 +665,18 @@ def read( .. 
versionadded:: 3.15.0 - s3: `None` or `dict` , optional - TODOACTIVEDOCS + s3: `dict` or `None`, optional + Keyword parameters to be passed to `s3fs.S3FileSystem` to + control the opening of files in an S3 object store. By + default, or if `None`, then ``s3={'anon': True}``. Ignored + for file names that don't start with ``s3:``. + + If and only if *s3* has no ``'endpoint_url'`` key, then + one will be automatically derived from the *filename*. For + example, if *filename* was + ``'s3://object-store/data/file.nc'``, then an + ``'endpoint_url'`` key with value + ``'https://object-store'`` would be created. .. versionadded:: (cfdm) ACTIVEVERSION @@ -1118,7 +1128,7 @@ def _read_a_file( .. versionadded:: 3.15.0 s3: `dict`, optional - TODOACTIVEDOCS + See `cf.read` for details. .. versionadded:: AVTIVEVERSION From 417a29734688218819e7a3bd8c08e870fd219586 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 26 Jan 2024 10:31:18 +0000 Subject: [PATCH 029/134] dev --- cf/data/array/cfaarray.py | 730 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 730 insertions(+) create mode 100644 cf/data/array/cfaarray.py diff --git a/cf/data/array/cfaarray.py b/cf/data/array/cfaarray.py new file mode 100644 index 0000000000..e53720e8f1 --- /dev/null +++ b/cf/data/array/cfaarray.py @@ -0,0 +1,730 @@ +from copy import deepcopy +from functools import partial +from itertools import accumulate, product + +import numpy as np + +import .abstract +from ..fragment import FullFragmentArray, NetCDFFragmentArray, UMFragmentArray +from ..utils import chunk_locations, chunk_positions +from .mixin import FileArrayMixin +from .netcdfarray import NetCDFArray + +# Store fragment array classes. +_FragmentArray = { +# "h5": H5FragmentArray, + "nc": NetCDFFragmentArray, + "um": UMFragmentArray, + "full": FullFragmentArray, +} + + +class CFAArray(FileArrayMixin, abstract.Array): + """A CFA aggregated array stored in a netCDF file. + + .. versionadded:: 3.14.0 + + """ + + def __init__( + self, + dtype=None, + mask=True, + units=False, + calendar=False, + instructions=None, + substitutions=None, + term=None, + source=None, + copy=True, + x=None, + ): + """**Initialisation** + + :Parameters: + + dtype: `numpy.dtype` + The data type of the aggregated data array. May be + `None` if the numpy data-type is not known (which can + be the case for netCDF string types, for example). + + mask: `bool` + If True (the default) then mask by convention when + reading data from disk. + + A netCDF array is masked depending on the values of any of + the netCDF variable attributes ``valid_min``, + ``valid_max``, ``valid_range``, ``_FillValue`` and + ``missing_value``. + + units: `str` or `None`, optional + The units of the aggregated data. Set to `None` to + indicate that there are no units. + + calendar: `str` or `None`, optional + The calendar of the aggregated data. Set to `None` to + indicate the CF default calendar, if applicable. + + instructions: `str`, optional + The ``aggregated_data`` attribute value as found on + the CFA netCDF variable. If set then this will be used + to improve the performance of `__dask_tokenize__`. + + substitutions: `dict`, optional + A dictionary whose key/value pairs define text + substitutions to be applied to the fragment file + names. Each key must be specified with the ``${...}`` + syntax, for instance ``{'${base}': 'sub'}``. + + .. 
versionadded:: 3.15.0 + + term: `str`, optional + The name of a non-standard aggregation instruction + term from which the array is to be created, instead of + creating the aggregated data in the standard + terms. If set then *address* must be the name of the + term's CFA-netCDF aggregation instruction variable, + which must be defined on the fragment dimensions and + no others. Each value of the aggregation instruction + variable will be broadcast across the shape of the + corresponding fragment. + + *Parameter example:* + ``address='cfa_tracking_id', term='tracking_id'`` + + .. versionadded:: 3.15.0 + + {{init source: optional}} + + {{init copy: `bool`, optional}} + + """ + if source is not None: + try: + shape = source._get_component("shape", None) + except AttributeError: + shape = None + + try: + dtype = source._get_component("dtype", None) + except AttributeError: + dtype = None + + try: + mask = source._get_component("mask", True) + except AttributeError: + mask = True + + try: + units = source._get_component("units", False) + except AttributeError: + units = False + + try: + calendar = source._get_component("calendar", False) + except AttributeError: + calendar = False + + try: + missing_values = source._get_component("missing_values", None) + except AttributeError: + missing_values = None + + try: + fragment_shape = source.get_fragment_shape() + except AttributeError: + fragment_shape = None + + try: + instructions = source._get_component("instructions") + except AttributeError: + instructions = None + + try: + aggregated_data = source.get_aggregated_data(copy=False) + except AttributeError: + aggregated_data = {} + + try: + substitutions = source.get_substitutions() + except AttributeError: + substitutions = None + + try: + term = source.get_term() + except AttributeError: + term = None + + else: + aggregated_data = {} + + location = x["location"] + ndim = location.shape[0] + + compressed = np.ma.compressed + chunks = [compressed(i).tolist() for i in location] + shape = [sum(c) for c in chunks] + positions = chunk_positions(chunks) + locations = chunk_locations(chunks) + + if term is not None: + # ---------------------------------------------------- + # This fragment contains a constant value (as opposed + # to file locations) + # ---------------------------------------------------- + term = x[term] + fragment_shape = term.shape + aggregated_data = { + frag_loc: { + "location": loc, + "fill_value": term[frag_loc].item(), + "format": "full", + } + for frag_loc, loc in zip(positions, locations) + } + else: + # ---------------------------------------------------- + # This fragment contains file locations + # ---------------------------------------------------- + a = x["address"] + f = x["file"] + fmt = x["format"] + + extra_dimension = f.ndim > ndim + if extra_dimension: + # There is an extra non-fragment dimension + fragment_shape = f.shape[:-1] + else: + fragment_shape = f.shape + + if not a.ndim: + a = np.full(f.shape, a, dtype=a.dtype) + + if not fmt.ndim: + fmt = np.full(fragment_shape, fmt, dtype=fmt.dtype) + + if extra_dimension: + aggregated_data = { + frag_loc: { + "location": loc, + "filename": f[frag_loc].tolist(), + "address": a[frag_loc].tolist(), + "format": fmt[frag_loc].item(), + } + for frag_loc, loc in zip(positions, locations) + } + else: + aggregated_data = { + frag_loc: { + "location": loc, + "filename": (f[frag_loc].item(),), + "address": (a[frag_loc].item(),), + "format": fmt[frag_loc].item(), + } + for frag_loc, loc in zip(positions, locations) + } + + # 
Apply string substitutions to the fragment filenames + if substitutions: + for value in aggregated_data.values(): + filenames2 = [] + for filename in value["filename"]: + for base, sub in substitutions.items(): + filename = filename.replace(base, sub) + + filenames2.append(filename) + + value["filename"] = filenames2 + + self._set_component("shape", shape, copy=False) + self._set_component("dtype", dtype, copy=False) + self._set_component("mask", mask, copy=False) + self._set_component("units", units, copy=False) + self._set_component("calendar", calendar, copy=False) + if missing_values is not None: + self._set_component( + "missing_values", missing_values.copy(), copy=False + ) + + self._set_component("fragment_shape", fragment_shape, copy=False) + self._set_component("aggregated_data", aggregated_data, copy=False) + self._set_component("instructions", instructions, copy=False) + self._set_component("term", term, copy=False) + if substitutions is not None: + self._set_component( + "substitutions", substitutions.copy(), copy=False + ) + + def __dask_tokenize__(self): + """Used by `dask.base.tokenize`. + + .. versionadded:: 3.14.0 + + """ + out = super().__dask_tokenize__() + aggregated_data = self._get_component("instructions", None) + if aggregated_data is None: + aggregated_data = self.get_aggregated_data(copy=False) + + return out + (aggregated_data,) + + def __getitem__(self, indices): + """x.__getitem__(indices) <==> x[indices]""" + return NotImplemented # pragma: no cover + + def close(self, dataset): + """Close the dataset containing the data.""" + return NotImplemented # pragma: no cover + + def get_aggregated_data(self, copy=True): + """Get the aggregation data dictionary. + + The aggregation data dictionary contains the definitions of + the fragments and the instructions on how to aggregate them. + The keys are indices of the CFA fragment dimensions, + e.g. ``(1, 0, 0 ,0)``. + + .. versionadded:: 3.14.0 + + :Parameters: + + copy: `bool`, optional + Whether or not to return a copy of the aggregation + dictionary. By default a deep copy is returned. + + .. warning:: If False then changing the returned + dictionary in-place will change the + aggregation dictionary stored in the + {{class}} instance, **as well as in any + copies of it**. + + :Returns: + + `dict` + The aggregation data dictionary. + + **Examples** + + >>> a.shape + (12, 1, 73, 144) + >>> a.get_fragment_shape() + (2, 1, 1, 1) + >>> a.get_aggregated_data() + {(0, 0, 0, 0): {'file': 'January-June.nc', + 'address': 'temp', + 'format': 'nc', + 'location': [(0, 6), (0, 1), (0, 73), (0, 144)]}, + (1, 0, 0, 0): {'file': 'July-December.nc', + 'address': 'temp', + 'format': 'nc', + 'location': [(6, 12), (0, 1), (0, 73), (0, 144)]}} + + """ + aggregated_data = self._get_component("aggregated_data") + if copy: + aggregated_data = deepcopy(aggregated_data) + + return aggregated_data + + def get_fragmented_dimensions(self): + """Get the positions of dimensions that have two or more fragments. + + .. versionadded:: 3.14.0 + + :Returns: + + `list` + The dimension positions. + + **Examples** + + >>> a.get_fragment_shape() + (20, 1, 40, 1) + >>> a.get_fragmented_dimensions() + [0, 2] + + >>> a.get_fragment_shape() + (1, 1, 1) + >>> a.get_fragmented_dimensions() + [] + + """ + return [ + i for i, size in enumerate(self.get_fragment_shape()) if size > 1 + ] + + def get_fragment_shape(self): + """Get the sizes of the fragment dimensions. 
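As context for the fragment bookkeeping above, a small self-contained sketch (not from the patch) of how the per-fragment ``location`` ranges, using the values from the `get_aggregated_data` example, imply both the fragment shape and the dask chunk sizes along each aggregated dimension:

    locations = {
        (0, 0, 0, 0): [(0, 6), (0, 1), (0, 73), (0, 144)],
        (1, 0, 0, 0): [(6, 12), (0, 1), (0, 73), (0, 144)],
    }
    ndim = 4
    fragment_shape = tuple(
        len({loc[dim] for loc in locations.values()}) for dim in range(ndim)
    )
    chunks = tuple(
        tuple(stop - start
              for start, stop in sorted({loc[dim] for loc in locations.values()}))
        for dim in range(ndim)
    )
    print(fragment_shape)  # (2, 1, 1, 1)
    print(chunks)          # ((6, 6), (1,), (73,), (144,))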
+ + The fragment dimension sizes are given in the same order as + the aggregated dimension sizes given by `shape`. + + .. versionadded:: 3.14.0 + + :Returns: + + `tuple` + The shape of the fragment dimensions. + + """ + return self._get_component("fragment_shape") + + def get_term(self, default=ValueError()): + """The CFA aggregation instruction term for the data, if set. + + .. versionadded:: 3.15.0 + + :Parameters: + + default: optional + Return the value of the *default* parameter if the + term has not been set. If set to an `Exception` + instance then it will be raised instead. + + :Returns: + + `str` + The CFA aggregation instruction term name. + + """ + return self._get_component("term", default=default) + + def open(self, func, *args, **kwargs): + """Return a dataset file object and address.""" + return NotImplemented # pragma: no cover + + def subarray_shapes(self, shapes): + """Create the subarray shapes. + + .. versionadded:: 3.14.0 + + .. seealso:: `subarrays` + + :Parameters: + + shapes: `int`, sequence, `dict` or `str`, optional + Define the subarray shapes. + + Any value accepted by the *chunks* parameter of the + `dask.array.from_array` function is allowed. + + The subarray sizes implied by *chunks* for a dimension + that has been fragmented are ignored, so their + specification is arbitrary. + + :Returns: + + `tuple` + The subarray sizes along each dimension. + + **Examples** + + >>> a.shape + (12, 1, 73, 144) + >>> a.get_fragment_shape() + (2, 1, 1, 1) + >>> a.fragmented_dimensions() + [0] + >>> a.subarray_shapes(-1) + ((6, 6), (1,), (73,), (144,)) + >>> a.subarray_shapes(None) + ((6, 6), (1,), (73,), (144,)) + >>> a.subarray_shapes("auto") + ((6, 6), (1,), (73,), (144,)) + >>> a.subarray_shapes((None, 1, 40, 50)) + ((6, 6), (1,), (40, 33), (50, 50, 44)) + >>> a.subarray_shapes((None, None, "auto", 50)) + ((6, 6), (1,), (73,), (50, 50, 44)) + >>> a.subarray_shapes({2: 40}) + ((6, 6), (1,), (40, 33), (144,)) + + """ + from numbers import Number + + from dask.array.core import normalize_chunks + + # Indices of fragmented dimensions + f_dims = self.get_fragmented_dimensions() + + shape = self.shape + aggregated_data = self.get_aggregated_data(copy=False) + + # Create the base chunks. + chunks = [] + ndim = self.ndim + for dim, (n_fragments, size) in enumerate( + zip(self.get_fragment_shape(), self.shape) + ): + if dim in f_dims: + # This aggregated dimension is spanned by more than + # one fragment. + c = [] + index = [0] * ndim + for j in range(n_fragments): + index[dim] = j + loc = aggregated_data[tuple(index)]["location"][dim] + chunk_size = loc[1] - loc[0] + c.append(chunk_size) + + chunks.append(tuple(c)) + else: + # This aggregated dimension is spanned by exactly one + # fragment. Store None, for now, in the expectation + # that it will get overwrittten. + chunks.append(None) + + if isinstance(shapes, (str, Number)) or shapes is None: + chunks = [ + c if i in f_dims else shapes for i, c in enumerate(chunks) + ] + elif isinstance(shapes, dict): + chunks = [ + chunks[i] if i in f_dims else shapes.get(i, "auto") + for i, c in enumerate(chunks) + ] + else: + # chunks is a sequence + if len(shapes) != ndim: + raise ValueError( + f"Wrong number of 'shapes' elements in {shapes}: " + f"Got {len(shapes)}, expected {self.ndim}" + ) + + chunks = [ + c if i in f_dims else shapes[i] for i, c in enumerate(chunks) + ] + + return normalize_chunks(chunks, shape=shape, dtype=self.dtype) + + def subarrays(self, subarray_shapes): + """Return descriptors for every subarray. + + .. 
versionadded:: 3.14.0 + + .. seealso:: `subarray_shapes` + + :Parameters: + + subarray_shapes: `tuple` + The subarray sizes along each dimension, as returned + by a prior call to `subarray_shapes`. + + :Returns: + + 6-`tuple` of iterators + Each iterator iterates over a particular descriptor + from each subarray. + + 1. The indices of the aggregated array that correspond + to each subarray. + + 2. The shape of each subarray. + + 3. The indices of the fragment that corresponds to each + subarray (some subarrays may be represented by a + part of a fragment). + + 4. The location of each subarray. + + 5. The location on the fragment dimensions of the + fragment that corresponds to each subarray. + + 6. The shape of each fragment that overlaps each chunk. + + **Examples** + + An aggregated array with shape (12, 73, 144) has two + fragments, both with with shape (6, 73, 144). + + >>> a.shape + (12, 73, 144) + >>> a.get_fragment_shape() + (2, 1, 1) + >>> a.fragmented_dimensions() + [0] + >>> subarray_shapes = a.subarray_shapes({1: 40}) + >>> print(subarray_shapes) + ((6, 6), (40, 33), (144,)) + >>> ( + ... u_indices, + ... u_shapes, + ... f_indices, + ... s_locations, + ... f_locations, + ... f_shapes, + ... ) = a.subarrays(subarray_shapes) + >>> for i in u_indices: + ... print(i) + ... + (slice(0, 6, None), slice(0, 40, None), slice(0, 144, None)) + (slice(0, 6, None), slice(40, 73, None), slice(0, 144, None)) + (slice(6, 12, None), slice(0, 40, None), slice(0, 144, None)) + (slice(6, 12, None), slice(40, 73, None), slice(0, 144, None)) + + >>> for i in u_shapes + ... print(i) + ... + (6, 40, 144) + (6, 33, 144) + (6, 40, 144) + (6, 33, 144) + >>> for i in f_indices: + ... print(i) + ... + (slice(None, None, None), slice(0, 40, None), slice(0, 144, None)) + (slice(None, None, None), slice(40, 73, None), slice(0, 144, None)) + (slice(None, None, None), slice(0, 40, None), slice(0, 144, None)) + (slice(None, None, None), slice(40, 73, None), slice(0, 144, None)) + >>> for i in s_locations: + ... print(i) + ... + (0, 0, 0) + (0, 1, 0) + (1, 0, 0) + (1, 1, 0) + >>> for i in f_locations: + ... print(i) + ... + (0, 0, 0) + (0, 0, 0) + (1, 0, 0) + (1, 0, 0) + >>> for i in f_shapes: + ... print(i) + ... + (6, 73, 144) + (6, 73, 144) + (6, 73, 144) + (6, 73, 144) + + """ + f_dims = self.get_fragmented_dimensions() + + # The indices of the uncompressed array that correspond to + # each subarray, the shape of each uncompressed subarray, and + # the location of each subarray + s_locations = [] + u_shapes = [] + u_indices = [] + f_locations = [] + for dim, c in enumerate(subarray_shapes): + nc = len(c) + s_locations.append(tuple(range(nc))) + u_shapes.append(c) + + if dim in f_dims: + f_locations.append(tuple(range(nc))) + else: + # No fragmentation along this dimension + f_locations.append((0,) * nc) + + c = tuple(accumulate((0,) + c)) + u_indices.append([slice(i, j) for i, j in zip(c[:-1], c[1:])]) + + # For each subarray, the part of the fragment that corresponds + # to it. + f_indices = [ + (slice(None),) * len(u) if dim in f_dims else u + for dim, u in enumerate(u_indices) + ] + + # For each subarray, the shape of the fragment that + # corresponds to it. 
+ f_shapes = [ + u_shape if dim in f_dims else (size,) * len(u_shape) + for dim, (u_shape, size) in enumerate(zip(u_shapes, self.shape)) + ] + + return ( + product(*u_indices), + product(*u_shapes), + product(*f_indices), + product(*s_locations), + product(*f_locations), + product(*f_shapes), + ) + + def to_dask_array(self, chunks="auto"): + """Create a dask array with `FragmentArray` chunks. + + .. versionadded:: 3.14.0 + + :Parameters: + + chunks: `int`, `tuple`, `dict` or `str`, optional + Specify the chunking of the returned dask array. + + Any value accepted by the *chunks* parameter of the + `dask.array.from_array` function is allowed. + + The chunk sizes implied by *chunks* for a dimension that + has been fragmented are ignored and replaced with values + that are implied by that dimensions fragment sizes. + + :Returns: + + `dask.array.Array` + + """ + import dask.array as da + from dask.array.core import getter + from dask.base import tokenize + + name = (f"{self.__class__.__name__}-{tokenize(self)}",) + + dtype = self.dtype + units = self.get_units() + calendar = self.get_calendar(None) + aggregated_data = self.get_aggregated_data(copy=False) + + # Set the chunk sizes for the dask array + chunks = self.subarray_shapes(chunks) + + if self.get_mask(): + fragment_arrays = _FragmentArray + else: + fragment_arrays = _FragmentArray.copy() + fragment_arrays["nc"] = partial(_FragmentArray["nc"], mask=False) + + dsk = {} + for ( + u_indices, + u_shape, + f_indices, + chunk_location, + fragment_location, + fragment_shape, + ) in zip(*self.subarrays(chunks)): + kwargs = aggregated_data[fragment_location].copy() + kwargs.pop("location", None) + + fragment_format = kwargs.pop("format", None) + try: + FragmentArray = fragment_arrays[fragment_format] + except KeyError: + raise ValueError( + "Can't get FragmentArray class for unknown " + f"fragment dataset format: {fragment_format!r}" + ) + + fragment = FragmentArray( + dtype=dtype, + shape=fragment_shape, + aggregated_units=units, + aggregated_calendar=calendar, + **kwargs, + # pass s3 here TODO + ) + + key = f"{fragment.__class__.__name__}-{tokenize(fragment)}" + dsk[key] = fragment + dsk[name + chunk_location] = ( + getter, + key, + f_indices, + False, + getattr(fragment, "_lock", False), + ) + + # Return the dask array + return da.Array(dsk, name[0], chunks=chunks, dtype=dtype) From 5ef961c9bc808cb6d5278f50f14a8ab58cbd9ec4 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 26 Jan 2024 18:00:02 +0000 Subject: [PATCH 030/134] dev --- cf/data/array/cfaarray.py | 2 +- cf/data/array/cfanetcdfarray.py | 34 ++++-- cf/data/array/mixin/activestoragemixin.py | 3 +- cf/data/fragment/__init__.py | 1 + cf/data/fragment/h5fragmentarray.py | 107 +++++++++++++++++++ cf/data/fragment/mixin/fragmentarraymixin.py | 2 +- cf/data/fragment/netcdffragmentarray.py | 10 +- cf/read_write/netcdf/netcdfread.py | 46 +++++++- cf/read_write/read.py | 4 + 9 files changed, 188 insertions(+), 21 deletions(-) create mode 100644 cf/data/fragment/h5fragmentarray.py diff --git a/cf/data/array/cfaarray.py b/cf/data/array/cfaarray.py index e53720e8f1..51ef503349 100644 --- a/cf/data/array/cfaarray.py +++ b/cf/data/array/cfaarray.py @@ -12,7 +12,7 @@ # Store fragment array classes. 
_FragmentArray = { -# "h5": H5FragmentArray, + "h5": None, #H5FragmentArray, "nc": NetCDFFragmentArray, "um": UMFragmentArray, "full": FullFragmentArray, diff --git a/cf/data/array/cfanetcdfarray.py b/cf/data/array/cfanetcdfarray.py index d75a5ed02c..9d53e88ea0 100644 --- a/cf/data/array/cfanetcdfarray.py +++ b/cf/data/array/cfanetcdfarray.py @@ -6,18 +6,19 @@ from ..fragment import FullFragmentArray, NetCDFFragmentArray, UMFragmentArray from ..utils import chunk_locations, chunk_positions -from .mixin import CFAMixin +#from .mixin import CFAMixin from .netcdfarray import NetCDFArray # Store fragment array classes. _FragmentArray = { +# "nc": H5FragmentArray, "nc": NetCDFFragmentArray, "um": UMFragmentArray, "full": FullFragmentArray, } -class CFANetCDFArray(CFAMixin, NetCDFArray): +class CFANetCDFArray(NetCDFArray): """A CFA aggregated array stored in a netCDF file. .. versionadded:: 3.14.0 @@ -35,6 +36,7 @@ def __init__( instructions=None, substitutions=None, term=None, + s3=None, source=None, copy=True, x=None, @@ -102,6 +104,10 @@ def __init__( .. versionadded:: 3.15.0 + {{s3: `dict` or `None`, optional}} + + .. versionadded:: ACTIVEVERSION + {{init source: optional}} {{init copy: `bool`, optional}} @@ -140,7 +146,7 @@ def __init__( location = x["location"] ndim = location.shape[0] - print("location =", location.shape, repr(location)) +# print("location =", location.shape, repr(location)) compressed = np.ma.compressed chunks = [compressed(i).tolist() for i in location] # print(chunks) @@ -177,23 +183,25 @@ def __init__( if not a.ndim: a = np.full(f.shape, a, dtype=a.dtype) - + if np.ma.is_masked(f): + a = np.ma.array(a, mask=f.mask) + if not fmt.ndim: fmt = np.full(fragment_shape, fmt, dtype=fmt.dtype) if extra_dimension: - print("-----") - import copy - - print(f.shape, repr(f)) +# print("-----") +# import copy +# +# print(f.shape, repr(f)) # if f.shape == (780, 1, 1, 2): # for frag_loc, loc in zip(positions, locations): # print(frag_loc, loc) aggregated_data = { frag_loc: { "location": loc, - "filename": f[frag_loc].tolist(), - "address": a[frag_loc].tolist(), + "filename": compressed(f[frag_loc]).tolist(), + "address": compressed(a[frag_loc]).tolist(), "format": fmt[frag_loc].item(), } for frag_loc, loc in zip(positions, locations) @@ -680,6 +688,8 @@ def to_dask_array(self, chunks="auto"): fragment_arrays = _FragmentArray.copy() fragment_arrays["nc"] = partial(_FragmentArray["nc"], mask=False) + s3 = self.get_s3() + dsk = {} for ( u_indices, @@ -701,6 +711,10 @@ def to_dask_array(self, chunks="auto"): f"fragment dataset format: {fragment_format!r}" ) + if s3 and kwargs['address'] == 'nc': + # Pass on any S3 file system options + kwargs['s3'] = s3 + fragment = FragmentArray( dtype=dtype, shape=fragment_shape, diff --git a/cf/data/array/mixin/activestoragemixin.py b/cf/data/array/mixin/activestoragemixin.py index 751045a456..85cb851e80 100644 --- a/cf/data/array/mixin/activestoragemixin.py +++ b/cf/data/array/mixin/activestoragemixin.py @@ -52,8 +52,7 @@ def __getitem__(self, indices): s3 = {} active = Active( - self.get_filename(), self.get_address(), **missing_values, - # **s3 + self.get_filename(), self.get_address(), s3=s3, **missing_values, ) active.method = method active.components = True diff --git a/cf/data/fragment/__init__.py b/cf/data/fragment/__init__.py index 2ce2dafa60..efa56dc307 100644 --- a/cf/data/fragment/__init__.py +++ b/cf/data/fragment/__init__.py @@ -1,3 +1,4 @@ from .fullfragmentarray import FullFragmentArray +from .h5fragmentarray import H5FragmentArray from 
.netcdffragmentarray import NetCDFFragmentArray from .umfragmentarray import UMFragmentArray diff --git a/cf/data/fragment/h5fragmentarray.py b/cf/data/fragment/h5fragmentarray.py new file mode 100644 index 0000000000..7f027dc478 --- /dev/null +++ b/cf/data/fragment/h5fragmentarray.py @@ -0,0 +1,107 @@ +#from ..array.mixin import ActiveStorageMixin +from ..array.hdfarray import HDFArray +from .mixin import FragmentArrayMixin + + +class H5FragmentArray(FragmentArrayMixin, HDFArray): + """A CFA fragment array stored in a netCDF file. + + .. versionadded:: ACTIVEVERSION + + """ + + def __init__( + self, + filename=None, + address=None, + dtype=None, + shape=None, + aggregated_units=False, + aggregated_calendar=False, + units=False, + calendar=None, + s3=None, + source=None, + copy=True, + ): + """**Initialisation** + + :Parameters: + + filename: (sequence of `str`), optional + The names of the netCDF fragment files containing the + array. + + address: (sequence of `str`), optional + The name of the netCDF variable containing the + fragment array. Required unless *varid* is set. + + dtype: `numpy.dtype`, optional + The data type of the aggregated array. May be `None` + if the numpy data-type is not known (which can be the + case for netCDF string types, for example). This may + differ from the data type of the netCDF fragment + variable. + + shape: `tuple`, optional + The shape of the fragment within the aggregated + array. This may differ from the shape of the netCDF + fragment variable in that the latter may have fewer + size 1 dimensions. + + units: `str` or `None`, optional + The units of the fragment data. Set to `None` to + indicate that there are no units. If unset then the + units will be set during the first `__getitem__` call. + + calendar: `str` or `None`, optional + The calendar of the fragment data. Set to `None` to + indicate the CF default calendar, if applicable. If + unset then the calendar will be set during the first + `__getitem__` call. + + {{aggregated_units: `str` or `None`, optional}} + + {{aggregated_calendar: `str` or `None`, optional}} + + {{s3: `dict` or `None`, optional}} + + .. versionadded:: ACTIVEVERSION + + {{init source: optional}} + + {{init copy: `bool`, optional}} + + """ + super().__init__( + filename=filename, + address=address, + dtype=dtype, + shape=shape, + mask=True, + units=units, + calendar=calendar, + s3=s3, + source=source, + copy=copy, + ) + + if source is not None: + try: + aggregated_units = source._get_component( + "aggregated_units", False + ) + except AttributeError: + aggregated_units = False + + try: + aggregated_calendar = source._get_component( + "aggregated_calendar", False + ) + except AttributeError: + aggregated_calendar = False + + self._set_component("aggregated_units", aggregated_units, copy=False) + self._set_component( + "aggregated_calendar", aggregated_calendar, copy=False + ) diff --git a/cf/data/fragment/mixin/fragmentarraymixin.py b/cf/data/fragment/mixin/fragmentarraymixin.py index 544befd337..d523d88d70 100644 --- a/cf/data/fragment/mixin/fragmentarraymixin.py +++ b/cf/data/fragment/mixin/fragmentarraymixin.py @@ -71,7 +71,7 @@ def __getitem__(self, indices): "\n\n" "Consider re-creating the data with exactly one " "dask compute chunk per fragment (e.g. by setting " - "'chunks=None' as a keyword to cf.read)." + "chunks=None as a keyword to cf.read)." 
) array = array.reshape(self.shape) diff --git a/cf/data/fragment/netcdffragmentarray.py b/cf/data/fragment/netcdffragmentarray.py index 9b85ed3597..2a9c78e639 100644 --- a/cf/data/fragment/netcdffragmentarray.py +++ b/cf/data/fragment/netcdffragmentarray.py @@ -1,9 +1,9 @@ #from ..array.mixin import ActiveStorageMixin -from ..array.netcdfarray import NetCDFArray +from ..array.hdfarray import HDFArray from .mixin import FragmentArrayMixin -class NetCDFFragmentArray(FragmentArrayMixin, NetCDFArray): +class NetCDFFragmentArray(FragmentArrayMixin, HDFArray): """A CFA fragment array stored in a netCDF file. .. versionadded:: 3.14.0 @@ -20,6 +20,7 @@ def __init__( aggregated_calendar=False, units=False, calendar=None, + s3=None, source=None, copy=True, ): @@ -63,6 +64,10 @@ def __init__( {{aggregated_calendar: `str` or `None`, optional}} + {{s3: `dict` or `None`, optional}} + + .. versionadded:: ACTIVEVERSION + {{init source: optional}} {{init copy: `bool`, optional}} @@ -76,6 +81,7 @@ def __init__( mask=True, units=units, calendar=calendar, + s3=s3, source=source, copy=copy, ) diff --git a/cf/read_write/netcdf/netcdfread.py b/cf/read_write/netcdf/netcdfread.py index dc28ea52dd..d30fe057ef 100644 --- a/cf/read_write/netcdf/netcdfread.py +++ b/cf/read_write/netcdf/netcdfread.py @@ -955,10 +955,16 @@ def _cfa_parse_aggregated_data(self, ncvar, aggregated_data): # Already processed this term continue - array = g["variables"][term_ncvar][...] - aggregation_instructions[term_ncvar] = self._cfa_conform_array( - array - ) + array = g["variables"][term_ncvar][...] # DCH HERE no missing +# if g['original_HDF']: +# v = g["variables"][term_ncvar] +# from ...data.array import HDFArray +# array = HDFArray._mask2(array, v.dtype, v.attrs, isvlen=v.dtype.kind == "O") + + array = self._cfa_conform_array(array) # Do we ant to do this? + aggregation_instructions[term_ncvar] = array + + print (term_ncvar, g["variables"][term_ncvar].dtype, repr(array)) if term == "file": # Find URI substitutions that may be stored in the @@ -979,7 +985,7 @@ def _cfa_parse_aggregated_data(self, ncvar, aggregated_data): # precedence over those defined in the file. subs.update(g["cfa_options"].get("substitutions", {})) g["cfa_file_substitutions"][term_ncvar] = subs - + g["cfa_aggregated_data"][ncvar] = out return out @@ -999,6 +1005,36 @@ def _cfa_conform_array(self, array): The conformed array. """ + string_type = isinstance(array, str) + + if string_type: + print (888888) + # -------------------------------------------------------- + # A netCDF string type scalar variable comes out as Python + # str object, so convert it to a numpy array. + # -------------------------------------------------------- + array = np.array(array, dtype=f"U{len(array)}") + + kind = array.dtype.kind + if not string_type and kind in "SU": + # Collapse by concatenation the outermost (fastest + # varying) dimension of char array into + # memory. E.g. 
[['a','b','c']] becomes ['abc'] + if kind == "U": + array = array.astype("S", copy=False) + + array = netCDF4.chartostring(array) + shape = array.shape + array = np.array([x.rstrip() for x in array.flat], dtype="U") + array = np.reshape(array, shape) + array = np.ma.masked_where(array == "", array) + elif not string_type and kind == "O": + array = array.astype("U", copy=False) + print (11111111, repr(array)) + array = np.ma.where(array == "", np.ma.masked, array) + + return array + if isinstance(array, str): # string return np.array(array, dtype=f"S{len(array)}").astype("U") diff --git a/cf/read_write/read.py b/cf/read_write/read.py index bfc46e0d20..ce96e5a6f4 100644 --- a/cf/read_write/read.py +++ b/cf/read_write/read.py @@ -63,6 +63,7 @@ def read( domain=False, cfa=None, s3=None, + _no_HDF=False, ): """Read field or domain constructs from files. @@ -976,6 +977,7 @@ def read( select=select, domain=domain, cfa_options=cfa_options, + _no_HDF=_no_HDF, s3=s3, ) @@ -1091,6 +1093,7 @@ def _read_a_file( select=None, domain=False, cfa_options=None, + _no_HDF=False, s3=None, ): """Read the contents of a single file into a field list. @@ -1206,6 +1209,7 @@ def _read_a_file( warn_valid=warn_valid, domain=domain, s3=s3, + _no_HDF=_no_HDF ) except MaskError: # Some data required for field interpretation is missing, From 2abc8c4765c3948067e877e43702738d78aabc1b Mon Sep 17 00:00:00 2001 From: David Hassell Date: Sun, 28 Jan 2024 18:29:30 +0000 Subject: [PATCH 031/134] dev --- cf/__init__.py | 2 +- cf/cfimplementation.py | 6 +- cf/data/array/__init__.py | 2 +- cf/data/array/cfanetcdfarray.py | 134 ++++++++++----- .../array/{hdfarray.py => h5netcdfarray.py} | 10 +- cf/data/array/mixin/activestoragemixin.py | 7 +- cf/data/array/umarray.py | 1 + cf/data/collapse/collapse.py | 2 +- cf/data/collapse/collapse_active.py | 6 +- cf/data/fragment/h5fragmentarray.py | 8 +- cf/data/fragment/netcdffragmentarray.py | 8 +- cf/read_write/netcdf/netcdfread.py | 155 +++++++++--------- cf/read_write/read.py | 4 +- 13 files changed, 203 insertions(+), 142 deletions(-) rename cf/data/array/{hdfarray.py => h5netcdfarray.py} (86%) diff --git a/cf/__init__.py b/cf/__init__.py index bd9b83b0ea..ae070ca65f 100644 --- a/cf/__init__.py +++ b/cf/__init__.py @@ -277,7 +277,7 @@ CFANetCDFArray, FullArray, GatheredArray, - HDFArray, + H5netcdfArray, NetCDFArray, PointTopologyArray, RaggedContiguousArray, diff --git a/cf/cfimplementation.py b/cf/cfimplementation.py index 10065081a9..3a9980731b 100644 --- a/cf/cfimplementation.py +++ b/cf/cfimplementation.py @@ -31,7 +31,7 @@ CellConnectivityArray, CFANetCDFArray, GatheredArray, - HDFArray, + H5netcdfArray, NetCDFArray, PointTopologyArray, RaggedContiguousArray, @@ -176,7 +176,7 @@ def initialise_CFANetCDFArray( BoundsFromNodesArray=BoundsFromNodesArray, CellConnectivityArray=CellConnectivityArray, GatheredArray=GatheredArray, - HDFArray=HDFArray, + H5netcdfArray=H5netcdfArray, NetCDFArray=NetCDFArray, PointTopologyArray=PointTopologyArray, RaggedContiguousArray=RaggedContiguousArray, @@ -232,7 +232,7 @@ def implementation(): 'PartNodeCountProperties': cf.partnodecountproperties.PartNodeCountProperties, 'Data': cf.data.data.Data, 'GatheredArray': cf.data.array.gatheredarray.GatheredArray, - 'HDFArray': cf.data.array.hdfarray.HDFArray, + 'H5netcdfArray': cf.data.array.h5netcdfarray.H5netcdfArray, 'NetCDFArray': cf.data.array.netcdfarray.NetCDFArray, 'PointTopologyArray': , 'RaggedContiguousArray': cf.data.array.raggedcontiguousarray.RaggedContiguousArray, diff --git 
a/cf/data/array/__init__.py b/cf/data/array/__init__.py index 3a35b976fb..ea828ac8de 100644 --- a/cf/data/array/__init__.py +++ b/cf/data/array/__init__.py @@ -3,7 +3,7 @@ from .cfanetcdfarray import CFANetCDFArray from .fullarray import FullArray from .gatheredarray import GatheredArray -from .hdfarray import HDFArray +from .h5netcdfarray import H5netcdfArray from .netcdfarray import NetCDFArray from .pointtopologyarray import PointTopologyArray from .raggedcontiguousarray import RaggedContiguousArray diff --git a/cf/data/array/cfanetcdfarray.py b/cf/data/array/cfanetcdfarray.py index 9d53e88ea0..b182ad912b 100644 --- a/cf/data/array/cfanetcdfarray.py +++ b/cf/data/array/cfanetcdfarray.py @@ -6,12 +6,13 @@ from ..fragment import FullFragmentArray, NetCDFFragmentArray, UMFragmentArray from ..utils import chunk_locations, chunk_positions -#from .mixin import CFAMixin + +# from .mixin import CFAMixin from .netcdfarray import NetCDFArray # Store fragment array classes. _FragmentArray = { -# "nc": H5FragmentArray, + # "nc": H5FragmentArray, "nc": NetCDFFragmentArray, "um": UMFragmentArray, "full": FullFragmentArray, @@ -36,7 +37,7 @@ def __init__( instructions=None, substitutions=None, term=None, - s3=None, + s3=None, source=None, copy=True, x=None, @@ -146,10 +147,8 @@ def __init__( location = x["location"] ndim = location.shape[0] -# print("location =", location.shape, repr(location)) compressed = np.ma.compressed chunks = [compressed(i).tolist() for i in location] - # print(chunks) shape = [sum(c) for c in chunks] positions = chunk_positions(chunks) locations = chunk_locations(chunks) @@ -172,7 +171,7 @@ def __init__( else: a = x["address"] f = x["file"] - fmt = x["format"] + file_fmt = x["format"] extra_dimension = f.ndim > ndim if extra_dimension: @@ -180,42 +179,91 @@ def __init__( fragment_shape = f.shape[:-1] else: fragment_shape = f.shape - - if not a.ndim: - a = np.full(f.shape, a, dtype=a.dtype) - if np.ma.is_masked(f): - a = np.ma.array(a, mask=f.mask) - if not fmt.ndim: - fmt = np.full(fragment_shape, fmt, dtype=fmt.dtype) + if not a.ndim: + a = (a.item(),) +# a = np.full(f.shape, a, dtype=a.dtype) +# if np.ma.is_masked(f): +# a = np.ma.array(a, mask=f.mask) + scalar_address = True + else: + scalar_address = False - if extra_dimension: -# print("-----") -# import copy -# -# print(f.shape, repr(f)) - # if f.shape == (780, 1, 1, 2): - # for frag_loc, loc in zip(positions, locations): - # print(frag_loc, loc) - aggregated_data = { - frag_loc: { - "location": loc, - "filename": compressed(f[frag_loc]).tolist(), - "address": compressed(a[frag_loc]).tolist(), - "format": fmt[frag_loc].item(), - } - for frag_loc, loc in zip(positions, locations) - } + if not file_fmt.ndim: + # fmt = np.full(fragment_shape, fmt, dtype=fmt.dtype) + file_fmt = (file_fmt.item(),) + scalar_fmt = True else: - aggregated_data = { - frag_loc: { - "location": loc, - "filename": (f[frag_loc].item(),), - "address": (a[frag_loc].item(),), - "format": fmt[frag_loc].item(), - } - for frag_loc, loc in zip(positions, locations) - } + scalar_fmt = False + + #if extra_dimension: + # for frag_loc, loc in zip(positions, locations): + # if not scalar_address: + # address = compressed(a[frag_loc]).tolist() + # else: + # address = a + # + # if not scalar_fmt: + # file_fmt = compressed(fmt[frag_loc].tolist()) + # else: + # file_fmt = fmt + # + # aggregated_data['frag_loc'] = { + # "location": loc, + # "filename": compressed(f[frag_loc]).tolist(), + # "address": address, + # "format": file_fmt, + # } + # #aggregated_data = { + # 
# frag_loc: { + # # "location": loc, + # # "filename": compressed(f[frag_loc]).tolist(), + # # "address": compressed(a[frag_loc]).tolist(), + # # "format": fmt[frag_loc].item(), + # # } + # # for frag_loc, loc in zip(positions, locations) + # #} + #else: + for frag_loc, location in zip(positions, locations): + if extra_dimension: + filename = compressed(f[frag_loc]).tolist() + n_files = len(filenames) + if scalar_address: + address = a * n_files + else: + address = compressed(a[frag_loc].tolist()) + + if not scalar_fmt: + fmt = file_fmt * n_files + else: + fmt = compressed(file_fmt[frag_loc]).tolist() + else: + filename = (f[frag_loc].item(),) + if scalar_address: + address = a + else: + address = (a[frag_loc].item(),) + + if scalar_fmt: + fmt = file_fmt + else: + fmt = file_fmt[frag_loc].item() + + aggregated_data['frag_loc'] = { + "location": location, + "filename": filename, + "address": address, + "format": fmt, + } +# aggregated_data = { +# frag_loc: { +# "location": loc, +# "filename": (f[frag_loc].item(),), +# "address": (a[frag_loc].item(),), +# "format": fmt[frag_loc].item(), +# } +# for frag_loc, loc in zip(positions, locations) +# } # Apply string substitutions to the fragment filenames if substitutions: @@ -689,7 +737,7 @@ def to_dask_array(self, chunks="auto"): fragment_arrays["nc"] = partial(_FragmentArray["nc"], mask=False) s3 = self.get_s3() - + dsk = {} for ( u_indices, @@ -711,9 +759,9 @@ def to_dask_array(self, chunks="auto"): f"fragment dataset format: {fragment_format!r}" ) - if s3 and kwargs['address'] == 'nc': - # Pass on any S3 file system options - kwargs['s3'] = s3 + if s3 and kwargs["address"] == "nc": + # Pass on any S3 file system options + kwargs["s3"] = s3 fragment = FragmentArray( dtype=dtype, diff --git a/cf/data/array/hdfarray.py b/cf/data/array/h5netcdfarray.py similarity index 86% rename from cf/data/array/hdfarray.py rename to cf/data/array/h5netcdfarray.py index 6da6ef2aeb..4fb88effd5 100644 --- a/cf/data/array/hdfarray.py +++ b/cf/data/array/h5netcdfarray.py @@ -6,10 +6,16 @@ from .mixin import ActiveStorageMixin, ArrayMixin, FileArrayMixin -class HDFArray(ActiveStorageMixin, FileArrayMixin, ArrayMixin, Container, cfdm.HDFArray): +class H5netcdfArray( + ActiveStorageMixin, + FileArrayMixin, + ArrayMixin, + Container, + cfdm.H5netcdfArray, +): """An array stored in a netCDF file.] - .. versionadded:: HDFVER + .. versionadded:: HDFVER """ diff --git a/cf/data/array/mixin/activestoragemixin.py b/cf/data/array/mixin/activestoragemixin.py index 85cb851e80..5c0f2be739 100644 --- a/cf/data/array/mixin/activestoragemixin.py +++ b/cf/data/array/mixin/activestoragemixin.py @@ -50,9 +50,12 @@ def __getitem__(self, indices): s3 = self.get_s3() except AttributeError: s3 = {} - + active = Active( - self.get_filename(), self.get_address(), s3=s3, **missing_values, + self.get_filename(), + self.get_address(), + s3=s3, + **missing_values, ) active.method = method active.components = True diff --git a/cf/data/array/umarray.py b/cf/data/array/umarray.py index ab5d0d857f..112753cc75 100644 --- a/cf/data/array/umarray.py +++ b/cf/data/array/umarray.py @@ -303,6 +303,7 @@ def _set_units(self, int_hdr): """ units = self._get_component("units", False) if units is False: + # TODOHDF mocve to def _get_attr units = None if not _stash2standard_name: diff --git a/cf/data/collapse/collapse.py b/cf/data/collapse/collapse.py index a08563d398..3e1b4c6151 100644 --- a/cf/data/collapse/collapse.py +++ b/cf/data/collapse/collapse.py @@ -440,7 +440,7 @@ def min( The collapsed array. 
""" - print ('min: active_stoege =', active_storage) + print("min: active_stoege =", active_storage) from .dask_collapse import cf_min_agg, cf_min_chunk, cf_min_combine if chunk_function is None: diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index 6124b37564..226ed25fca 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -1,5 +1,5 @@ -from functools import wraps import logging +from functools import wraps # -------------------------------------------------------------------- @@ -204,13 +204,13 @@ def actify(a, method, axis=None): `None`. """ - print ('runing actify') + print("runing actify") try: from activestorage import Active # noqa: F401 except ModuleNotFoundError: # The active storage class dependency is not met, so using # active storage is not possible. - print('oops') + print("oops") return a, None from numbers import Integral diff --git a/cf/data/fragment/h5fragmentarray.py b/cf/data/fragment/h5fragmentarray.py index 7f027dc478..2bb832dc3c 100644 --- a/cf/data/fragment/h5fragmentarray.py +++ b/cf/data/fragment/h5fragmentarray.py @@ -1,9 +1,9 @@ -#from ..array.mixin import ActiveStorageMixin -from ..array.hdfarray import HDFArray +# from ..array.mixin import ActiveStorageMixin +from ..array.h5netcdfarray import H5netcdfArray from .mixin import FragmentArrayMixin -class H5FragmentArray(FragmentArrayMixin, HDFArray): +class H5FragmentArray(FragmentArrayMixin, H5netcdfArray): """A CFA fragment array stored in a netCDF file. .. versionadded:: ACTIVEVERSION @@ -20,7 +20,7 @@ def __init__( aggregated_calendar=False, units=False, calendar=None, - s3=None, + s3=None, source=None, copy=True, ): diff --git a/cf/data/fragment/netcdffragmentarray.py b/cf/data/fragment/netcdffragmentarray.py index 2a9c78e639..4a046e4e5b 100644 --- a/cf/data/fragment/netcdffragmentarray.py +++ b/cf/data/fragment/netcdffragmentarray.py @@ -1,9 +1,9 @@ -#from ..array.mixin import ActiveStorageMixin -from ..array.hdfarray import HDFArray +# from ..array.mixin import ActiveStorageMixin +from ..array.h5netcdfarray import H5netcdfArray from .mixin import FragmentArrayMixin -class NetCDFFragmentArray(FragmentArrayMixin, HDFArray): +class NetCDFFragmentArray(FragmentArrayMixin, H5netcdfArray): """A CFA fragment array stored in a netCDF file. .. versionadded:: 3.14.0 @@ -20,7 +20,7 @@ def __init__( aggregated_calendar=False, units=False, calendar=None, - s3=None, + s3=None, source=None, copy=True, ): diff --git a/cf/read_write/netcdf/netcdfread.py b/cf/read_write/netcdf/netcdfread.py index d30fe057ef..2f453d910e 100644 --- a/cf/read_write/netcdf/netcdfread.py +++ b/cf/read_write/netcdf/netcdfread.py @@ -955,16 +955,18 @@ def _cfa_parse_aggregated_data(self, ncvar, aggregated_data): # Already processed this term continue - array = g["variables"][term_ncvar][...] # DCH HERE no missing -# if g['original_HDF']: -# v = g["variables"][term_ncvar] -# from ...data.array import HDFArray -# array = HDFArray._mask2(array, v.dtype, v.attrs, isvlen=v.dtype.kind == "O") + variable = g["variables"][term_ncvar] + if g["original_netCDF"]: + variable.set_auto_maskandscale(False) - array = self._cfa_conform_array(array) # Do we ant to do this? + array = variable[...] + array = cfdm.MaskScale.apply( + variable, array, mask=True, scale=True + ) + # array = self._cfa_conform_array(array) # Do we ant to do this? 
aggregation_instructions[term_ncvar] = array - print (term_ncvar, g["variables"][term_ncvar].dtype, repr(array)) + print(term_ncvar, g["variables"][term_ncvar].dtype, array) if term == "file": # Find URI substitutions that may be stored in the @@ -985,76 +987,77 @@ def _cfa_parse_aggregated_data(self, ncvar, aggregated_data): # precedence over those defined in the file. subs.update(g["cfa_options"].get("substitutions", {})) g["cfa_file_substitutions"][term_ncvar] = subs - + g["cfa_aggregated_data"][ncvar] = out return out - def _cfa_conform_array(self, array): - """Conform an array so that it is suitable for CFA processing. - - .. versionadded: 3.15.0 - - :Parameters: - array: `np.ndarray` - The array to conform. - - :Returns: - - array: `np.ndarray` - The conformed array. - - """ - string_type = isinstance(array, str) - - if string_type: - print (888888) - # -------------------------------------------------------- - # A netCDF string type scalar variable comes out as Python - # str object, so convert it to a numpy array. - # -------------------------------------------------------- - array = np.array(array, dtype=f"U{len(array)}") - - kind = array.dtype.kind - if not string_type and kind in "SU": - # Collapse by concatenation the outermost (fastest - # varying) dimension of char array into - # memory. E.g. [['a','b','c']] becomes ['abc'] - if kind == "U": - array = array.astype("S", copy=False) - - array = netCDF4.chartostring(array) - shape = array.shape - array = np.array([x.rstrip() for x in array.flat], dtype="U") - array = np.reshape(array, shape) - array = np.ma.masked_where(array == "", array) - elif not string_type and kind == "O": - array = array.astype("U", copy=False) - print (11111111, repr(array)) - array = np.ma.where(array == "", np.ma.masked, array) - - return array - - if isinstance(array, str): - # string - return np.array(array, dtype=f"S{len(array)}").astype("U") - - kind = array.dtype.kind - if kind == "O": - # string - return array.astype("U") - - if kind in "SU": - # char - if kind == "U": - array = array.astype("S") - - array = netCDF4.chartostring(array) - shape = array.shape - array = np.array([x.rstrip() for x in array.flat], dtype="S") - array = np.reshape(array, shape) - array = np.ma.masked_where(array == b"", array) - return array.astype("U") - - # number - return array +# def _cfa_conform_array(self, array): +# """Conform an array so that it is suitable for CFA processing. +# +# .. versionadded: 3.15.0 +# +# :Parameters: +# +# array: `np.ndarray` +# The array to conform. +# +# :Returns: +# +# array: `np.ndarray` +# The conformed array. +# +# """ +# string_type = isinstance(array, str) +# +# if string_type: +# print (888888) +# # -------------------------------------------------------- +# # A netCDF string type scalar variable comes out as Python +# # str object, so convert it to a numpy array. +# # -------------------------------------------------------- +# array = np.array(array, dtype=f"U{len(array)}") +# +# kind = array.dtype.kind +# if not string_type and kind in "SU": +# # Collapse by concatenation the outermost (fastest +# # varying) dimension of char array into +# # memory. E.g. 
[['a','b','c']] becomes ['abc'] +# if kind == "U": +# array = array.astype("S", copy=False) +# +# array = netCDF4.chartostring(array) +# shape = array.shape +# array = np.array([x.rstrip() for x in array.flat], dtype="U") +# array = np.reshape(array, shape) +# array = np.ma.masked_where(array == "", array) +# elif not string_type and kind == "O": +# array = array.astype("U", copy=False) +# print (11111111, repr(array)) +# array = np.ma.where(array == "", np.ma.masked, array) +# +# return array +# +# if isinstance(array, str): +# # string +# return np.array(array, dtype=f"S{len(array)}").astype("U") +# +# kind = array.dtype.kind +# if kind == "O": +# # string +# return array.astype("U") +# +# if kind in "SU": +# # char +# if kind == "U": +# array = array.astype("S") +# +# array = netCDF4.chartostring(array) +# shape = array.shape +# array = np.array([x.rstrip() for x in array.flat], dtype="S") +# array = np.reshape(array, shape) +# array = np.ma.masked_where(array == b"", array) +# return array.astype("U") +# +# # number +# return array diff --git a/cf/read_write/read.py b/cf/read_write/read.py index ce96e5a6f4..def1d4f5d9 100644 --- a/cf/read_write/read.py +++ b/cf/read_write/read.py @@ -1093,7 +1093,7 @@ def _read_a_file( select=None, domain=False, cfa_options=None, - _no_HDF=False, + _no_HDF=False, s3=None, ): """Read the contents of a single file into a field list. @@ -1209,7 +1209,7 @@ def _read_a_file( warn_valid=warn_valid, domain=domain, s3=s3, - _no_HDF=_no_HDF + _no_HDF=_no_HDF, ) except MaskError: # Some data required for field interpretation is missing, From 589bd16f5341f84f5f3f167ed8ea1fec8c12ed0e Mon Sep 17 00:00:00 2001 From: David Hassell Date: Sun, 28 Jan 2024 18:29:57 +0000 Subject: [PATCH 032/134] dev --- cf/data/array/cfanetcdfarray.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cf/data/array/cfanetcdfarray.py b/cf/data/array/cfanetcdfarray.py index b182ad912b..de9c8deeaa 100644 --- a/cf/data/array/cfanetcdfarray.py +++ b/cf/data/array/cfanetcdfarray.py @@ -196,6 +196,7 @@ def __init__( else: scalar_fmt = False + #if extra_dimension: # for frag_loc, loc in zip(positions, locations): # if not scalar_address: From 92fc8e2fa2eb040e28e4fe1eedf30e5b6440f6e5 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 30 Jan 2024 22:58:16 +0000 Subject: [PATCH 033/134] dev --- cf/data/array/cfaarray.py | 730 ------------------------ cf/data/array/cfanetcdfarray.py | 59 +- cf/data/array/h5netcdfarray.py | 1 - cf/data/array/netcdfarray.py | 1 - cf/data/collapse/collapse.py | 33 +- cf/data/collapse/collapse_active.py | 31 +- cf/data/data.py | 1 - cf/data/fragment/__init__.py | 1 - cf/data/fragment/h5fragmentarray.py | 107 ---- cf/data/fragment/netcdffragmentarray.py | 2 +- cf/docstring/docstring.py | 3 + cf/read_write/netcdf/netcdfread.py | 4 +- cf/read_write/read.py | 10 +- 13 files changed, 79 insertions(+), 904 deletions(-) delete mode 100644 cf/data/array/cfaarray.py delete mode 100644 cf/data/fragment/h5fragmentarray.py diff --git a/cf/data/array/cfaarray.py b/cf/data/array/cfaarray.py deleted file mode 100644 index 51ef503349..0000000000 --- a/cf/data/array/cfaarray.py +++ /dev/null @@ -1,730 +0,0 @@ -from copy import deepcopy -from functools import partial -from itertools import accumulate, product - -import numpy as np - -import .abstract -from ..fragment import FullFragmentArray, NetCDFFragmentArray, UMFragmentArray -from ..utils import chunk_locations, chunk_positions -from .mixin import FileArrayMixin -from .netcdfarray import NetCDFArray - -# Store fragment array 
classes. -_FragmentArray = { - "h5": None, #H5FragmentArray, - "nc": NetCDFFragmentArray, - "um": UMFragmentArray, - "full": FullFragmentArray, -} - - -class CFAArray(FileArrayMixin, abstract.Array): - """A CFA aggregated array stored in a netCDF file. - - .. versionadded:: 3.14.0 - - """ - - def __init__( - self, - dtype=None, - mask=True, - units=False, - calendar=False, - instructions=None, - substitutions=None, - term=None, - source=None, - copy=True, - x=None, - ): - """**Initialisation** - - :Parameters: - - dtype: `numpy.dtype` - The data type of the aggregated data array. May be - `None` if the numpy data-type is not known (which can - be the case for netCDF string types, for example). - - mask: `bool` - If True (the default) then mask by convention when - reading data from disk. - - A netCDF array is masked depending on the values of any of - the netCDF variable attributes ``valid_min``, - ``valid_max``, ``valid_range``, ``_FillValue`` and - ``missing_value``. - - units: `str` or `None`, optional - The units of the aggregated data. Set to `None` to - indicate that there are no units. - - calendar: `str` or `None`, optional - The calendar of the aggregated data. Set to `None` to - indicate the CF default calendar, if applicable. - - instructions: `str`, optional - The ``aggregated_data`` attribute value as found on - the CFA netCDF variable. If set then this will be used - to improve the performance of `__dask_tokenize__`. - - substitutions: `dict`, optional - A dictionary whose key/value pairs define text - substitutions to be applied to the fragment file - names. Each key must be specified with the ``${...}`` - syntax, for instance ``{'${base}': 'sub'}``. - - .. versionadded:: 3.15.0 - - term: `str`, optional - The name of a non-standard aggregation instruction - term from which the array is to be created, instead of - creating the aggregated data in the standard - terms. If set then *address* must be the name of the - term's CFA-netCDF aggregation instruction variable, - which must be defined on the fragment dimensions and - no others. Each value of the aggregation instruction - variable will be broadcast across the shape of the - corresponding fragment. - - *Parameter example:* - ``address='cfa_tracking_id', term='tracking_id'`` - - .. 
versionadded:: 3.15.0 - - {{init source: optional}} - - {{init copy: `bool`, optional}} - - """ - if source is not None: - try: - shape = source._get_component("shape", None) - except AttributeError: - shape = None - - try: - dtype = source._get_component("dtype", None) - except AttributeError: - dtype = None - - try: - mask = source._get_component("mask", True) - except AttributeError: - mask = True - - try: - units = source._get_component("units", False) - except AttributeError: - units = False - - try: - calendar = source._get_component("calendar", False) - except AttributeError: - calendar = False - - try: - missing_values = source._get_component("missing_values", None) - except AttributeError: - missing_values = None - - try: - fragment_shape = source.get_fragment_shape() - except AttributeError: - fragment_shape = None - - try: - instructions = source._get_component("instructions") - except AttributeError: - instructions = None - - try: - aggregated_data = source.get_aggregated_data(copy=False) - except AttributeError: - aggregated_data = {} - - try: - substitutions = source.get_substitutions() - except AttributeError: - substitutions = None - - try: - term = source.get_term() - except AttributeError: - term = None - - else: - aggregated_data = {} - - location = x["location"] - ndim = location.shape[0] - - compressed = np.ma.compressed - chunks = [compressed(i).tolist() for i in location] - shape = [sum(c) for c in chunks] - positions = chunk_positions(chunks) - locations = chunk_locations(chunks) - - if term is not None: - # ---------------------------------------------------- - # This fragment contains a constant value (as opposed - # to file locations) - # ---------------------------------------------------- - term = x[term] - fragment_shape = term.shape - aggregated_data = { - frag_loc: { - "location": loc, - "fill_value": term[frag_loc].item(), - "format": "full", - } - for frag_loc, loc in zip(positions, locations) - } - else: - # ---------------------------------------------------- - # This fragment contains file locations - # ---------------------------------------------------- - a = x["address"] - f = x["file"] - fmt = x["format"] - - extra_dimension = f.ndim > ndim - if extra_dimension: - # There is an extra non-fragment dimension - fragment_shape = f.shape[:-1] - else: - fragment_shape = f.shape - - if not a.ndim: - a = np.full(f.shape, a, dtype=a.dtype) - - if not fmt.ndim: - fmt = np.full(fragment_shape, fmt, dtype=fmt.dtype) - - if extra_dimension: - aggregated_data = { - frag_loc: { - "location": loc, - "filename": f[frag_loc].tolist(), - "address": a[frag_loc].tolist(), - "format": fmt[frag_loc].item(), - } - for frag_loc, loc in zip(positions, locations) - } - else: - aggregated_data = { - frag_loc: { - "location": loc, - "filename": (f[frag_loc].item(),), - "address": (a[frag_loc].item(),), - "format": fmt[frag_loc].item(), - } - for frag_loc, loc in zip(positions, locations) - } - - # Apply string substitutions to the fragment filenames - if substitutions: - for value in aggregated_data.values(): - filenames2 = [] - for filename in value["filename"]: - for base, sub in substitutions.items(): - filename = filename.replace(base, sub) - - filenames2.append(filename) - - value["filename"] = filenames2 - - self._set_component("shape", shape, copy=False) - self._set_component("dtype", dtype, copy=False) - self._set_component("mask", mask, copy=False) - self._set_component("units", units, copy=False) - self._set_component("calendar", calendar, copy=False) - if 
missing_values is not None: - self._set_component( - "missing_values", missing_values.copy(), copy=False - ) - - self._set_component("fragment_shape", fragment_shape, copy=False) - self._set_component("aggregated_data", aggregated_data, copy=False) - self._set_component("instructions", instructions, copy=False) - self._set_component("term", term, copy=False) - if substitutions is not None: - self._set_component( - "substitutions", substitutions.copy(), copy=False - ) - - def __dask_tokenize__(self): - """Used by `dask.base.tokenize`. - - .. versionadded:: 3.14.0 - - """ - out = super().__dask_tokenize__() - aggregated_data = self._get_component("instructions", None) - if aggregated_data is None: - aggregated_data = self.get_aggregated_data(copy=False) - - return out + (aggregated_data,) - - def __getitem__(self, indices): - """x.__getitem__(indices) <==> x[indices]""" - return NotImplemented # pragma: no cover - - def close(self, dataset): - """Close the dataset containing the data.""" - return NotImplemented # pragma: no cover - - def get_aggregated_data(self, copy=True): - """Get the aggregation data dictionary. - - The aggregation data dictionary contains the definitions of - the fragments and the instructions on how to aggregate them. - The keys are indices of the CFA fragment dimensions, - e.g. ``(1, 0, 0 ,0)``. - - .. versionadded:: 3.14.0 - - :Parameters: - - copy: `bool`, optional - Whether or not to return a copy of the aggregation - dictionary. By default a deep copy is returned. - - .. warning:: If False then changing the returned - dictionary in-place will change the - aggregation dictionary stored in the - {{class}} instance, **as well as in any - copies of it**. - - :Returns: - - `dict` - The aggregation data dictionary. - - **Examples** - - >>> a.shape - (12, 1, 73, 144) - >>> a.get_fragment_shape() - (2, 1, 1, 1) - >>> a.get_aggregated_data() - {(0, 0, 0, 0): {'file': 'January-June.nc', - 'address': 'temp', - 'format': 'nc', - 'location': [(0, 6), (0, 1), (0, 73), (0, 144)]}, - (1, 0, 0, 0): {'file': 'July-December.nc', - 'address': 'temp', - 'format': 'nc', - 'location': [(6, 12), (0, 1), (0, 73), (0, 144)]}} - - """ - aggregated_data = self._get_component("aggregated_data") - if copy: - aggregated_data = deepcopy(aggregated_data) - - return aggregated_data - - def get_fragmented_dimensions(self): - """Get the positions of dimensions that have two or more fragments. - - .. versionadded:: 3.14.0 - - :Returns: - - `list` - The dimension positions. - - **Examples** - - >>> a.get_fragment_shape() - (20, 1, 40, 1) - >>> a.get_fragmented_dimensions() - [0, 2] - - >>> a.get_fragment_shape() - (1, 1, 1) - >>> a.get_fragmented_dimensions() - [] - - """ - return [ - i for i, size in enumerate(self.get_fragment_shape()) if size > 1 - ] - - def get_fragment_shape(self): - """Get the sizes of the fragment dimensions. - - The fragment dimension sizes are given in the same order as - the aggregated dimension sizes given by `shape`. - - .. versionadded:: 3.14.0 - - :Returns: - - `tuple` - The shape of the fragment dimensions. - - """ - return self._get_component("fragment_shape") - - def get_term(self, default=ValueError()): - """The CFA aggregation instruction term for the data, if set. - - .. versionadded:: 3.15.0 - - :Parameters: - - default: optional - Return the value of the *default* parameter if the - term has not been set. If set to an `Exception` - instance then it will be raised instead. - - :Returns: - - `str` - The CFA aggregation instruction term name. 
- - """ - return self._get_component("term", default=default) - - def open(self, func, *args, **kwargs): - """Return a dataset file object and address.""" - return NotImplemented # pragma: no cover - - def subarray_shapes(self, shapes): - """Create the subarray shapes. - - .. versionadded:: 3.14.0 - - .. seealso:: `subarrays` - - :Parameters: - - shapes: `int`, sequence, `dict` or `str`, optional - Define the subarray shapes. - - Any value accepted by the *chunks* parameter of the - `dask.array.from_array` function is allowed. - - The subarray sizes implied by *chunks* for a dimension - that has been fragmented are ignored, so their - specification is arbitrary. - - :Returns: - - `tuple` - The subarray sizes along each dimension. - - **Examples** - - >>> a.shape - (12, 1, 73, 144) - >>> a.get_fragment_shape() - (2, 1, 1, 1) - >>> a.fragmented_dimensions() - [0] - >>> a.subarray_shapes(-1) - ((6, 6), (1,), (73,), (144,)) - >>> a.subarray_shapes(None) - ((6, 6), (1,), (73,), (144,)) - >>> a.subarray_shapes("auto") - ((6, 6), (1,), (73,), (144,)) - >>> a.subarray_shapes((None, 1, 40, 50)) - ((6, 6), (1,), (40, 33), (50, 50, 44)) - >>> a.subarray_shapes((None, None, "auto", 50)) - ((6, 6), (1,), (73,), (50, 50, 44)) - >>> a.subarray_shapes({2: 40}) - ((6, 6), (1,), (40, 33), (144,)) - - """ - from numbers import Number - - from dask.array.core import normalize_chunks - - # Indices of fragmented dimensions - f_dims = self.get_fragmented_dimensions() - - shape = self.shape - aggregated_data = self.get_aggregated_data(copy=False) - - # Create the base chunks. - chunks = [] - ndim = self.ndim - for dim, (n_fragments, size) in enumerate( - zip(self.get_fragment_shape(), self.shape) - ): - if dim in f_dims: - # This aggregated dimension is spanned by more than - # one fragment. - c = [] - index = [0] * ndim - for j in range(n_fragments): - index[dim] = j - loc = aggregated_data[tuple(index)]["location"][dim] - chunk_size = loc[1] - loc[0] - c.append(chunk_size) - - chunks.append(tuple(c)) - else: - # This aggregated dimension is spanned by exactly one - # fragment. Store None, for now, in the expectation - # that it will get overwrittten. - chunks.append(None) - - if isinstance(shapes, (str, Number)) or shapes is None: - chunks = [ - c if i in f_dims else shapes for i, c in enumerate(chunks) - ] - elif isinstance(shapes, dict): - chunks = [ - chunks[i] if i in f_dims else shapes.get(i, "auto") - for i, c in enumerate(chunks) - ] - else: - # chunks is a sequence - if len(shapes) != ndim: - raise ValueError( - f"Wrong number of 'shapes' elements in {shapes}: " - f"Got {len(shapes)}, expected {self.ndim}" - ) - - chunks = [ - c if i in f_dims else shapes[i] for i, c in enumerate(chunks) - ] - - return normalize_chunks(chunks, shape=shape, dtype=self.dtype) - - def subarrays(self, subarray_shapes): - """Return descriptors for every subarray. - - .. versionadded:: 3.14.0 - - .. seealso:: `subarray_shapes` - - :Parameters: - - subarray_shapes: `tuple` - The subarray sizes along each dimension, as returned - by a prior call to `subarray_shapes`. - - :Returns: - - 6-`tuple` of iterators - Each iterator iterates over a particular descriptor - from each subarray. - - 1. The indices of the aggregated array that correspond - to each subarray. - - 2. The shape of each subarray. - - 3. The indices of the fragment that corresponds to each - subarray (some subarrays may be represented by a - part of a fragment). - - 4. The location of each subarray. - - 5. 
The location on the fragment dimensions of the - fragment that corresponds to each subarray. - - 6. The shape of each fragment that overlaps each chunk. - - **Examples** - - An aggregated array with shape (12, 73, 144) has two - fragments, both with with shape (6, 73, 144). - - >>> a.shape - (12, 73, 144) - >>> a.get_fragment_shape() - (2, 1, 1) - >>> a.fragmented_dimensions() - [0] - >>> subarray_shapes = a.subarray_shapes({1: 40}) - >>> print(subarray_shapes) - ((6, 6), (40, 33), (144,)) - >>> ( - ... u_indices, - ... u_shapes, - ... f_indices, - ... s_locations, - ... f_locations, - ... f_shapes, - ... ) = a.subarrays(subarray_shapes) - >>> for i in u_indices: - ... print(i) - ... - (slice(0, 6, None), slice(0, 40, None), slice(0, 144, None)) - (slice(0, 6, None), slice(40, 73, None), slice(0, 144, None)) - (slice(6, 12, None), slice(0, 40, None), slice(0, 144, None)) - (slice(6, 12, None), slice(40, 73, None), slice(0, 144, None)) - - >>> for i in u_shapes - ... print(i) - ... - (6, 40, 144) - (6, 33, 144) - (6, 40, 144) - (6, 33, 144) - >>> for i in f_indices: - ... print(i) - ... - (slice(None, None, None), slice(0, 40, None), slice(0, 144, None)) - (slice(None, None, None), slice(40, 73, None), slice(0, 144, None)) - (slice(None, None, None), slice(0, 40, None), slice(0, 144, None)) - (slice(None, None, None), slice(40, 73, None), slice(0, 144, None)) - >>> for i in s_locations: - ... print(i) - ... - (0, 0, 0) - (0, 1, 0) - (1, 0, 0) - (1, 1, 0) - >>> for i in f_locations: - ... print(i) - ... - (0, 0, 0) - (0, 0, 0) - (1, 0, 0) - (1, 0, 0) - >>> for i in f_shapes: - ... print(i) - ... - (6, 73, 144) - (6, 73, 144) - (6, 73, 144) - (6, 73, 144) - - """ - f_dims = self.get_fragmented_dimensions() - - # The indices of the uncompressed array that correspond to - # each subarray, the shape of each uncompressed subarray, and - # the location of each subarray - s_locations = [] - u_shapes = [] - u_indices = [] - f_locations = [] - for dim, c in enumerate(subarray_shapes): - nc = len(c) - s_locations.append(tuple(range(nc))) - u_shapes.append(c) - - if dim in f_dims: - f_locations.append(tuple(range(nc))) - else: - # No fragmentation along this dimension - f_locations.append((0,) * nc) - - c = tuple(accumulate((0,) + c)) - u_indices.append([slice(i, j) for i, j in zip(c[:-1], c[1:])]) - - # For each subarray, the part of the fragment that corresponds - # to it. - f_indices = [ - (slice(None),) * len(u) if dim in f_dims else u - for dim, u in enumerate(u_indices) - ] - - # For each subarray, the shape of the fragment that - # corresponds to it. - f_shapes = [ - u_shape if dim in f_dims else (size,) * len(u_shape) - for dim, (u_shape, size) in enumerate(zip(u_shapes, self.shape)) - ] - - return ( - product(*u_indices), - product(*u_shapes), - product(*f_indices), - product(*s_locations), - product(*f_locations), - product(*f_shapes), - ) - - def to_dask_array(self, chunks="auto"): - """Create a dask array with `FragmentArray` chunks. - - .. versionadded:: 3.14.0 - - :Parameters: - - chunks: `int`, `tuple`, `dict` or `str`, optional - Specify the chunking of the returned dask array. - - Any value accepted by the *chunks* parameter of the - `dask.array.from_array` function is allowed. - - The chunk sizes implied by *chunks* for a dimension that - has been fragmented are ignored and replaced with values - that are implied by that dimensions fragment sizes. 
- - :Returns: - - `dask.array.Array` - - """ - import dask.array as da - from dask.array.core import getter - from dask.base import tokenize - - name = (f"{self.__class__.__name__}-{tokenize(self)}",) - - dtype = self.dtype - units = self.get_units() - calendar = self.get_calendar(None) - aggregated_data = self.get_aggregated_data(copy=False) - - # Set the chunk sizes for the dask array - chunks = self.subarray_shapes(chunks) - - if self.get_mask(): - fragment_arrays = _FragmentArray - else: - fragment_arrays = _FragmentArray.copy() - fragment_arrays["nc"] = partial(_FragmentArray["nc"], mask=False) - - dsk = {} - for ( - u_indices, - u_shape, - f_indices, - chunk_location, - fragment_location, - fragment_shape, - ) in zip(*self.subarrays(chunks)): - kwargs = aggregated_data[fragment_location].copy() - kwargs.pop("location", None) - - fragment_format = kwargs.pop("format", None) - try: - FragmentArray = fragment_arrays[fragment_format] - except KeyError: - raise ValueError( - "Can't get FragmentArray class for unknown " - f"fragment dataset format: {fragment_format!r}" - ) - - fragment = FragmentArray( - dtype=dtype, - shape=fragment_shape, - aggregated_units=units, - aggregated_calendar=calendar, - **kwargs, - # pass s3 here TODO - ) - - key = f"{fragment.__class__.__name__}-{tokenize(fragment)}" - dsk[key] = fragment - dsk[name + chunk_location] = ( - getter, - key, - f_indices, - False, - getattr(fragment, "_lock", False), - ) - - # Return the dask array - return da.Array(dsk, name[0], chunks=chunks, dtype=dtype) diff --git a/cf/data/array/cfanetcdfarray.py b/cf/data/array/cfanetcdfarray.py index de9c8deeaa..dbadca946d 100644 --- a/cf/data/array/cfanetcdfarray.py +++ b/cf/data/array/cfanetcdfarray.py @@ -105,7 +105,7 @@ def __init__( .. versionadded:: 3.15.0 - {{s3: `dict` or `None`, optional}} + {{init s3: `dict` or `None`, optional}} .. 
versionadded:: ACTIVEVERSION @@ -179,25 +179,24 @@ def __init__( fragment_shape = f.shape[:-1] else: fragment_shape = f.shape - + if not a.ndim: a = (a.item(),) -# a = np.full(f.shape, a, dtype=a.dtype) -# if np.ma.is_masked(f): -# a = np.ma.array(a, mask=f.mask) + # a = np.full(f.shape, a, dtype=a.dtype) + # if np.ma.is_masked(f): + # a = np.ma.array(a, mask=f.mask) scalar_address = True else: scalar_address = False if not file_fmt.ndim: # fmt = np.full(fragment_shape, fmt, dtype=fmt.dtype) - file_fmt = (file_fmt.item(),) + file_fmt = file_fmt.item() scalar_fmt = True else: scalar_fmt = False - - #if extra_dimension: + # if extra_dimension: # for frag_loc, loc in zip(positions, locations): # if not scalar_address: # address = compressed(a[frag_loc]).tolist() @@ -208,11 +207,11 @@ def __init__( # file_fmt = compressed(fmt[frag_loc].tolist()) # else: # file_fmt = fmt - # + # # aggregated_data['frag_loc'] = { # "location": loc, # "filename": compressed(f[frag_loc]).tolist(), - # "address": address, + # "address": address, # "format": file_fmt, # } # #aggregated_data = { @@ -224,47 +223,47 @@ def __init__( # # } # # for frag_loc, loc in zip(positions, locations) # #} - #else: - for frag_loc, location in zip(positions, locations): + # else: + for frag_loc, location in zip(positions, locations): if extra_dimension: filename = compressed(f[frag_loc]).tolist() - n_files = len(filenames) + n_files = len(filename) if scalar_address: address = a * n_files else: address = compressed(a[frag_loc].tolist()) - if not scalar_fmt: - fmt = file_fmt * n_files + if scalar_fmt: + fmt = file_fmt else: fmt = compressed(file_fmt[frag_loc]).tolist() else: - filename = (f[frag_loc].item(),) + filename = (f[frag_loc].item(),) if scalar_address: address = a else: address = (a[frag_loc].item(),) - + if scalar_fmt: fmt = file_fmt else: fmt = file_fmt[frag_loc].item() - - aggregated_data['frag_loc'] = { + + aggregated_data[frag_loc] = { "location": location, "filename": filename, - "address": address, + "address": address, "format": fmt, - } -# aggregated_data = { -# frag_loc: { -# "location": loc, -# "filename": (f[frag_loc].item(),), -# "address": (a[frag_loc].item(),), -# "format": fmt[frag_loc].item(), -# } -# for frag_loc, loc in zip(positions, locations) -# } + } + # aggregated_data = { + # frag_loc: { + # "location": loc, + # "filename": (f[frag_loc].item(),), + # "address": (a[frag_loc].item(),), + # "format": fmt[frag_loc].item(), + # } + # for frag_loc, loc in zip(positions, locations) + # } # Apply string substitutions to the fragment filenames if substitutions: diff --git a/cf/data/array/h5netcdfarray.py b/cf/data/array/h5netcdfarray.py index 4fb88effd5..a757fd2a24 100644 --- a/cf/data/array/h5netcdfarray.py +++ b/cf/data/array/h5netcdfarray.py @@ -1,5 +1,4 @@ import cfdm -from dask.utils import SerializableLock from ...mixin_container import Container from .locks import _lock diff --git a/cf/data/array/netcdfarray.py b/cf/data/array/netcdfarray.py index 09ec9192ef..4e9e6345a6 100644 --- a/cf/data/array/netcdfarray.py +++ b/cf/data/array/netcdfarray.py @@ -1,5 +1,4 @@ import cfdm -from dask.utils import SerializableLock from ...mixin_container import Container from .locks import _lock diff --git a/cf/data/collapse/collapse.py b/cf/data/collapse/collapse.py index 3e1b4c6151..9e460e5c7e 100644 --- a/cf/data/collapse/collapse.py +++ b/cf/data/collapse/collapse.py @@ -15,18 +15,20 @@ class Collapse(metaclass=DocstringRewriteMeta): **Active storage reductions** A collapse method (such as `max`, `var`, etc.) 
will attempt to - make use of active storage reductions if both of the following are - true: + make use of active storage reductions if all of the following + conditions are met: - 1. The collapse method's *active_storage* parameter is True. + 1. it is possible to import the `activestorage.Active` class; - 2. The method has a corresponding active chunk function defined in - the `collapse_active.active_chunk_functions` dictionary. + 2. the collapse method's *active_storage* parameter is True; - These conditions alone are not sufficient active storage - reductions to occur. In addition, the graph of the `dask` array is - inspected to confirm that making use of active storage is - possible, and if so the graph is modified to expect the per-chunk + 3. the method has a corresponding active chunk function defined + in the `collapse_active.active_chunk_functions` dictionary; + + 4. inspection of the graph of the `dask` array confirms that + making use of active storage is possible; + + in which case the Dask graph is modified to expect the per-chunk reductions to be carried out externally. See `cf.data.collapse.actify` for details. @@ -117,6 +119,7 @@ def max( from .dask_collapse import cf_max_agg, cf_max_chunk, cf_max_combine if chunk_function is None: + # Default function for chunk calculations chunk_function = cf_max_chunk check_input_dtype(a) @@ -243,6 +246,7 @@ def mean( from .dask_collapse import cf_mean_agg, cf_mean_chunk, cf_mean_combine if chunk_function is None: + # Default function for chunk calculations chunk_function = cf_mean_chunk check_input_dtype(a) @@ -376,6 +380,7 @@ def mid_range( ) if chunk_function is None: + # Default function for chunk calculations chunk_function = cf_range_chunk check_input_dtype(a, allowed="fi") @@ -440,10 +445,10 @@ def min( The collapsed array. 
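# -- Editor's note: illustrative sketch, not part of this patch ------------
# The default chunk functions selected above plug into
# `dask.array.reductions.reduction`, which applies a chunk function to every
# block and then an aggregate function to the combined partial results.
# This is a simplified, self-contained version of that plumbing, using plain
# arrays rather than the dictionary-valued partial results that the cf chunk
# functions return; `concatenate=False` keeps the partial results as nested
# lists, which the aggregate function concatenates itself.
import dask.array as da
import numpy as np
from dask.array.core import _concatenate2
from dask.array.reductions import reduction

def chunk_sum(x, axis=None, keepdims=False, **kwargs):
    # Partial result for one chunk
    return np.sum(x, axis=axis, keepdims=keepdims)

def agg_sum(pairs, axis=None, keepdims=False, **kwargs):
    # 'pairs' is a (possibly nested) list of per-chunk partial results
    return np.sum(_concatenate2(pairs, axes=axis), axis=axis, keepdims=keepdims)

x = da.ones((10, 10), chunks=(5, 5))
total = reduction(
    x,
    chunk=chunk_sum,
    aggregate=agg_sum,
    combine=agg_sum,
    dtype="f8",
    concatenate=False,
)
print(total.compute())  # 100.0
# --------------------------------------------------------------------------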
""" - print("min: active_stoege =", active_storage) from .dask_collapse import cf_min_agg, cf_min_chunk, cf_min_combine if chunk_function is None: + # Default function for chunk calculations chunk_function = cf_min_chunk check_input_dtype(a) @@ -571,6 +576,7 @@ def range( ) if chunk_function is None: + # Default function for chunk calculations chunk_function = cf_range_chunk check_input_dtype(a, allowed="fi") @@ -641,6 +647,7 @@ def rms( from .dask_collapse import cf_mean_combine, cf_rms_agg, cf_rms_chunk if chunk_function is None: + # Default function for chunk calculations chunk_function = cf_rms_chunk check_input_dtype(a) @@ -713,6 +720,7 @@ def sample_size( ) if chunk_function is None: + # Default function for chunk calculations chunk_function = cf_sample_size_chunk check_input_dtype(a) @@ -783,6 +791,7 @@ def sum( from .dask_collapse import cf_sum_agg, cf_sum_chunk, cf_sum_combine if chunk_function is None: + # Default function for chunk calculations chunk_function = cf_sum_chunk check_input_dtype(a) @@ -861,6 +870,7 @@ def sum_of_weights( ) if chunk_function is None: + # Default function for chunk calculations chunk_function = cf_sum_of_weights_chunk check_input_dtype(a) @@ -936,6 +946,7 @@ def sum_of_weights2( ) if chunk_function is None: + # Default function for chunk calculations chunk_function = cf_sum_of_weights_chunk check_input_dtype(a) @@ -984,6 +995,7 @@ def unique( from .dask_collapse import cf_unique_agg, cf_unique_chunk if chunk_function is None: + # Default function for chunk calculations chunk_function = cf_unique_chunk check_input_dtype(a, "fibUS") @@ -1064,6 +1076,7 @@ def var( from .dask_collapse import cf_var_agg, cf_var_chunk, cf_var_combine if chunk_function is None: + # Default function for chunk calculations chunk_function = cf_var_chunk check_input_dtype(a) diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index 226ed25fca..68af5947f2 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -1,4 +1,3 @@ -import logging from functools import wraps @@ -168,19 +167,18 @@ def actify(a, method, axis=None): reductions are possible, and if not then the dask array is returned unchanged. - It is assumed that: + .. note:: It is assumed that the `!active_storage` attribute of + the `Data` object that provided the dask array *a* is + `True`. If this is not the case then an error at compute + time is likely. - * The method has a corresponding active function defined in the - `active_chunk_functions` dictionary. If this is not the case - then an error will occur at definition time. - - * The `!active_storage` attribute of the `Data` object that - provided the dask array *a* is `True`. If this is not the case - then an error at compute time is likely. + The value of the `!active_storage` attribute is + registered via the *active_storage* parameter of + `Collapse` methods. .. versionadded:: ACTIVEVERSION - .. seealso:: `active_storage` + .. seealso:: `active_storage`, `cf.data.collapse.Collapse` :Parameters: @@ -188,8 +186,10 @@ def actify(a, method, axis=None): The array to be collapsed. method: `str` - The name of the reduction method. Must be a key of the - `active_chunk_functions` dictionary. + The name of the reduction method. If the method does not + have a corresponding active function in the + `active_chunk_functions` dictionary then active + compuations are not carried out. axis: (sequence of) `int`, optional Axis or axes along which to operate. 
By default, @@ -204,13 +204,11 @@ def actify(a, method, axis=None): `None`. """ - print("runing actify") try: from activestorage import Active # noqa: F401 except ModuleNotFoundError: # The active storage class dependency is not met, so using # active storage is not possible. - print("oops") return a, None from numbers import Integral @@ -219,6 +217,11 @@ def actify(a, method, axis=None): from dask.array.utils import validate_axis from dask.base import collections_to_dsk + if method not in active_chunk_functions: + # The method does not have a corresponding active function, so + # return the input data unchanged. + return a, None + # Parse axis if axis is None: axis = tuple(range(a.ndim)) diff --git a/cf/data/data.py b/cf/data/data.py index e146584a34..6208183cf7 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -34,7 +34,6 @@ _numpy_allclose, _section, abspath, - active_storage, atol, default_netCDF_fillvals, free_memory, diff --git a/cf/data/fragment/__init__.py b/cf/data/fragment/__init__.py index efa56dc307..2ce2dafa60 100644 --- a/cf/data/fragment/__init__.py +++ b/cf/data/fragment/__init__.py @@ -1,4 +1,3 @@ from .fullfragmentarray import FullFragmentArray -from .h5fragmentarray import H5FragmentArray from .netcdffragmentarray import NetCDFFragmentArray from .umfragmentarray import UMFragmentArray diff --git a/cf/data/fragment/h5fragmentarray.py b/cf/data/fragment/h5fragmentarray.py deleted file mode 100644 index 2bb832dc3c..0000000000 --- a/cf/data/fragment/h5fragmentarray.py +++ /dev/null @@ -1,107 +0,0 @@ -# from ..array.mixin import ActiveStorageMixin -from ..array.h5netcdfarray import H5netcdfArray -from .mixin import FragmentArrayMixin - - -class H5FragmentArray(FragmentArrayMixin, H5netcdfArray): - """A CFA fragment array stored in a netCDF file. - - .. versionadded:: ACTIVEVERSION - - """ - - def __init__( - self, - filename=None, - address=None, - dtype=None, - shape=None, - aggregated_units=False, - aggregated_calendar=False, - units=False, - calendar=None, - s3=None, - source=None, - copy=True, - ): - """**Initialisation** - - :Parameters: - - filename: (sequence of `str`), optional - The names of the netCDF fragment files containing the - array. - - address: (sequence of `str`), optional - The name of the netCDF variable containing the - fragment array. Required unless *varid* is set. - - dtype: `numpy.dtype`, optional - The data type of the aggregated array. May be `None` - if the numpy data-type is not known (which can be the - case for netCDF string types, for example). This may - differ from the data type of the netCDF fragment - variable. - - shape: `tuple`, optional - The shape of the fragment within the aggregated - array. This may differ from the shape of the netCDF - fragment variable in that the latter may have fewer - size 1 dimensions. - - units: `str` or `None`, optional - The units of the fragment data. Set to `None` to - indicate that there are no units. If unset then the - units will be set during the first `__getitem__` call. - - calendar: `str` or `None`, optional - The calendar of the fragment data. Set to `None` to - indicate the CF default calendar, if applicable. If - unset then the calendar will be set during the first - `__getitem__` call. - - {{aggregated_units: `str` or `None`, optional}} - - {{aggregated_calendar: `str` or `None`, optional}} - - {{s3: `dict` or `None`, optional}} - - .. 
versionadded:: ACTIVEVERSION - - {{init source: optional}} - - {{init copy: `bool`, optional}} - - """ - super().__init__( - filename=filename, - address=address, - dtype=dtype, - shape=shape, - mask=True, - units=units, - calendar=calendar, - s3=s3, - source=source, - copy=copy, - ) - - if source is not None: - try: - aggregated_units = source._get_component( - "aggregated_units", False - ) - except AttributeError: - aggregated_units = False - - try: - aggregated_calendar = source._get_component( - "aggregated_calendar", False - ) - except AttributeError: - aggregated_calendar = False - - self._set_component("aggregated_units", aggregated_units, copy=False) - self._set_component( - "aggregated_calendar", aggregated_calendar, copy=False - ) diff --git a/cf/data/fragment/netcdffragmentarray.py b/cf/data/fragment/netcdffragmentarray.py index 4a046e4e5b..c3730c418b 100644 --- a/cf/data/fragment/netcdffragmentarray.py +++ b/cf/data/fragment/netcdffragmentarray.py @@ -64,7 +64,7 @@ def __init__( {{aggregated_calendar: `str` or `None`, optional}} - {{s3: `dict` or `None`, optional}} + {{init s3: `dict` or `None`, optional}} .. versionadded:: ACTIVEVERSION diff --git a/cf/docstring/docstring.py b/cf/docstring/docstring.py index 60dab55a17..8645660bf2 100644 --- a/cf/docstring/docstring.py +++ b/cf/docstring/docstring.py @@ -595,6 +595,9 @@ "{{weights auto: `bool`, optional}}": """auto: `bool`, optional If True then return `False` if weights can't be found, rather than raising an exception.""", + # init s3 + "{{init s3: `dict` or `None`, optional}}": """s3: `dict` or `None`, optional + TODO""", # ---------------------------------------------------------------- # Method description substitutions (4 levels of indentation) # ---------------------------------------------------------------- diff --git a/cf/read_write/netcdf/netcdfread.py b/cf/read_write/netcdf/netcdfread.py index 2f453d910e..42824f8190 100644 --- a/cf/read_write/netcdf/netcdfread.py +++ b/cf/read_write/netcdf/netcdfread.py @@ -961,13 +961,11 @@ def _cfa_parse_aggregated_data(self, ncvar, aggregated_data): array = variable[...] array = cfdm.MaskScale.apply( - variable, array, mask=True, scale=True + variable, array, mask=True, scale=True, always_mask=False ) # array = self._cfa_conform_array(array) # Do we ant to do this? aggregation_instructions[term_ncvar] = array - print(term_ncvar, g["variables"][term_ncvar].dtype, array) - if term == "file": # Find URI substitutions that may be stored in the # CFA file instruction variable's "substitutions" diff --git a/cf/read_write/read.py b/cf/read_write/read.py index def1d4f5d9..b9347782b0 100644 --- a/cf/read_write/read.py +++ b/cf/read_write/read.py @@ -63,7 +63,7 @@ def read( domain=False, cfa=None, s3=None, - _no_HDF=False, + library=None, ): """Read field or domain constructs from files. @@ -681,7 +681,7 @@ def read( .. versionadded:: (cfdm) ACTIVEVERSION - _no_HDF: `bool`, optional + library: `bool`, optional TODOACTIVEDOCS .. versionadded:: (cfdm) ACTIVEVERSION @@ -977,7 +977,7 @@ def read( select=select, domain=domain, cfa_options=cfa_options, - _no_HDF=_no_HDF, + library=None, s3=s3, ) @@ -1093,7 +1093,7 @@ def _read_a_file( select=None, domain=False, cfa_options=None, - _no_HDF=False, + library=None, s3=None, ): """Read the contents of a single file into a field list. 
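The automatic ``endpoint_url`` derivation described for the *s3* parameter above amounts to reusing the network location of the file name. A minimal sketch of that behaviour (the helper name is hypothetical, and this is not the implementation in the patch)::

    from urllib.parse import urlparse

    def default_s3_options(filename, s3=None):
        # Start from the caller's options, or the documented default
        options = dict(s3) if s3 else {"anon": True}
        if "endpoint_url" not in options:
            url = urlparse(filename)
            if url.scheme == "s3":
                # 's3://object-store/data/file.nc' -> 'https://object-store'
                options["endpoint_url"] = f"https://{url.netloc}"

        return options

so that ``default_s3_options('s3://object-store/data/file.nc')`` would return ``{'anon': True, 'endpoint_url': 'https://object-store'}``.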
@@ -1209,7 +1209,7 @@ def _read_a_file( warn_valid=warn_valid, domain=domain, s3=s3, - _no_HDF=_no_HDF, + library=library, ) except MaskError: # Some data required for field interpretation is missing, From d54fc4031281d9b87c51eb607c38f5c7874d8330 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 1 Feb 2024 23:47:13 +0000 Subject: [PATCH 034/134] dev --- cf/constants.py | 2 +- cf/functions.py | 62 +++++++++++++++++++----------- cf/read_write/netcdf/netcdfread.py | 4 +- cf/read_write/read.py | 45 ++++++++++++++-------- 4 files changed, 73 insertions(+), 40 deletions(-) diff --git a/cf/constants.py b/cf/constants.py index 120d371be8..813a158796 100644 --- a/cf/constants.py +++ b/cf/constants.py @@ -63,7 +63,7 @@ "LOG_LEVEL": logging.getLevelName(logging.getLogger().level), "BOUNDS_COMBINATION_MODE": "AND", "CHUNKSIZE": parse_bytes(_CHUNKSIZE), - "ACTIVE_STORAGE": True, + "ACTIVE_STORAGE": False, } masked = np.ma.masked diff --git a/cf/functions.py b/cf/functions.py index 6d612a60d3..4d21c88399 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -268,7 +268,6 @@ def configuration( .. versionaddedd:: ACTIVEVERSION - of_fraction: `float` or `Constant`, optional Deprecated at version 3.14.0 and is no longer available. @@ -564,11 +563,17 @@ class regrid_logging(ConstantAccess): **Examples** - >>> cf.regrid_logging() + >>> print(cf.regrid_logging()) + False + >>> print(cf.regrid_logging(True)) False - >>> cf.regrid_logging(True) + >>> print(cf.regrid_logging()) + True + >>> with cf.regrid_logging(False): + ... print(cf.regrid_logging()) + ... False - >>> cf.regrid_logging() + >>> print(cf.regrid_logging()) True """ @@ -693,13 +698,19 @@ class relaxed_identities(ConstantAccess): >>> org = cf.relaxed_identities() >>> org False - >>> cf.relaxed_identities(True) + >>> print(cf.relaxed_identities(True)) False - >>> cf.relaxed_identities() + >>> print(cf.relaxed_identities()) + True + >>> print(cf.relaxed_identities(org)) True - >>> cf.relaxed_identities(org) + >>> print(cf.relaxed_identities()) + False + >>> with cf.relaxed_identities(True): + ... print(cf.relaxed_identities()) + ... True - >>> cf.relaxed_identities() + >>> print(cf.relaxed_identities()) False """ @@ -816,18 +827,24 @@ class tempdir(ConstantAccess): :Returns: - `str` - The directory prior to the change, or the current - directory if no new value was specified. + `Constant` + The directory name prior to the change, or the name of the + current directory if no new value was specified. **Examples** - >>> cf.tempdir() + >>> print(cf.tempdir()) '/tmp' >>> old = cf.tempdir('/home/me/tmp') - >>> cf.tempdir(old) + >>> print(cf.tempdir(old)) '/home/me/tmp' - >>> cf.tempdir() + >>> print(cf.tempdir()) + '/tmp' + >>> with cf.tempdir('~/NEW_TMP'): + ... print(cf.tempdir()) + ... + /home/me/NEW_TMP + >>> print(cf.tempdir()) '/tmp' """ @@ -1093,11 +1110,6 @@ class bounds_combination_mode(ConstantAccess): OR >>> print(cf.bounds_combination_mode(old)) OR - >>> print(cf.bounds_combination_mode()) - AND - - Use as a context manager: - >>> print(cf.bounds_combination_mode()) AND >>> with cf.bounds_combination_mode('XOR'): @@ -1168,11 +1180,17 @@ class active_storage(ConstantAccess): **Examples** >>> cf.active_storage() - True - >>> cf.active_storage(False) - True + False + >>> cf.active_storage(True) + False >>> cf.active_storage() + True + >>> with cf.active_storage(False): + ... print(cf.active_storage()) + ... 
False + >>> cf.active_storage() + True """ diff --git a/cf/read_write/netcdf/netcdfread.py b/cf/read_write/netcdf/netcdfread.py index 42824f8190..c5e53b54c8 100644 --- a/cf/read_write/netcdf/netcdfread.py +++ b/cf/read_write/netcdf/netcdfread.py @@ -956,14 +956,14 @@ def _cfa_parse_aggregated_data(self, ncvar, aggregated_data): continue variable = g["variables"][term_ncvar] - if g["original_netCDF"]: + if g["original_netCDF4"]: variable.set_auto_maskandscale(False) array = variable[...] array = cfdm.MaskScale.apply( variable, array, mask=True, scale=True, always_mask=False ) - # array = self._cfa_conform_array(array) # Do we ant to do this? + # array = self._cfa_conform_array(array) aggregation_instructions[term_ncvar] = array if term == "file": diff --git a/cf/read_write/read.py b/cf/read_write/read.py index b9347782b0..49f6edc3c1 100644 --- a/cf/read_write/read.py +++ b/cf/read_write/read.py @@ -68,13 +68,14 @@ def read( """Read field or domain constructs from files. The following file formats are supported: CF-netCDF, CFA-netCDF, - CDL, PP and UM fields datasets. + CDL, UM fields file, and PP. Input datasets are mapped to constructs in memory which are returned as elements of a `FieldList` or if the *domain* parameter is True, a `DomainList`. - NetCDF files may be on disk or on an OPeNDAP server. + NetCDF files may be on disk, on an OPeNDAP server, or in an S3 + object store. Any amount of files of any combination of file types may be read. @@ -669,20 +670,29 @@ def read( s3: `dict` or `None`, optional Keyword parameters to be passed to `s3fs.S3FileSystem` to control the opening of files in an S3 object store. By - default, or if `None`, then ``s3={'anon': True}``. Ignored - for file names that don't start with ``s3:``. - - If and only if *s3* has no ``'endpoint_url'`` key, then - one will be automatically derived from the *filename*. For - example, if *filename* was - ``'s3://object-store/data/file.nc'``, then an + default, or if `None`, then a value of ``{'anon': True}`` + is used. Ignored for file names that don't start with + ``s3:``. + + If and only if *s3* has no ``'endpoint_url'`` key (which + will always be the case when *s3* is `None`), then one + will be automatically derived from the file name and + included in the keyword parameters. For example, for a + file name of ``'s3://object-store/data/file.nc'``, then an ``'endpoint_url'`` key with value - ``'https://object-store'`` would be created. + ``'https://object-store'`` would be created. To disable + this behaviour, assign `None` to the ``'endpoint_url'`` + key. .. versionadded:: (cfdm) ACTIVEVERSION - library: `bool`, optional - TODOACTIVEDOCS + library: `str` or `None`, optional + Specify which library to use for opening input files. By + default, or if `None`, then `netCDF4` will used unless it + fails to open a given file, in which case `h5netcdf` will + be used. Setting *library* to ``'netCDF4'`` or + ``'h5netcdf'`` will force the use of the `netCDF4` or + `h5netcdf` libraries respectively. .. versionadded:: (cfdm) ACTIVEVERSION @@ -977,7 +987,7 @@ def read( select=select, domain=domain, cfa_options=cfa_options, - library=None, + library=library, s3=s3, ) @@ -1130,10 +1140,15 @@ def _read_a_file( .. versionadded:: 3.15.0 - s3: `dict`, optional + s3: `dict` or `None`, optional + See `cf.read` for details. + + .. versionadded:: ACTIVEVERSION + + library: `str` or `None`, optional See `cf.read` for details. - .. versionadded:: AVTIVEVERSION + .. 
versionadded:: ACTIVEVERSION :Returns: From 80ac2e6fd6f3e73edf7690cf0a1a2352e76d31dc Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 2 Feb 2024 09:41:40 +0000 Subject: [PATCH 035/134] dev --- cf/data/array/mixin/activestoragemixin.py | 46 ++++++++++++++++------- cf/data/array/netcdfarray.py | 5 ++- cf/data/collapse/collapse_active.py | 11 ++++-- cf/test/test_NetCDFArray.py | 21 +++++++++++ 4 files changed, 65 insertions(+), 18 deletions(-) diff --git a/cf/data/array/mixin/activestoragemixin.py b/cf/data/array/mixin/activestoragemixin.py index 5c0f2be739..62ebe5b33b 100644 --- a/cf/data/array/mixin/activestoragemixin.py +++ b/cf/data/array/mixin/activestoragemixin.py @@ -5,7 +5,7 @@ class ActiveStorageMixin: - """TODOACTIVEDOCS. + """Mixin class for enabling active storage reductions. .. versionadded:: ACTIVEVERSION @@ -59,6 +59,8 @@ def __getitem__(self, indices): ) active.method = method active.components = True + + # Provide a file lock try: active.lock = self._lock except AttributeError: @@ -79,24 +81,37 @@ def actify(self, method, axis=None): :Parameters: method: `str` - TODOACTIVEDOCS + The name of the reduction method. + + *Parameter example:* + ``'min'`` axis: `None` or (sequence of) `int`, optional - TODOACTIVEDOCS + Axis or axes along which to operate. By default, or if + `None`, flattened input is used. :Returns: `{{class}}` - TODOACTIVEDOCS + The new `{{class}}`` instance that ues an active + storage operation. """ + if Active is None: + # The active storage import dependency is not met, so + # using active storage is not possible. + raise AttributeError( + f"Can't actify {self.__class__.__name__} when " + "activestorage.Active is not available" + ) + a = self.copy() a.set_active_method(method) a.set_active_axis(axis) return a def get_active_axis(self): - """TODOACTIVEDOCS. + """Return the active storage reduction axes. .. versionadded:: ACTIVEVERSION @@ -104,13 +119,15 @@ def get_active_axis(self): :Returns: - TODOACTIVEDOCS + `None` or (sequence of) `int + The active storage reduction axes. `None` signifies + that all axes will be reduced. """ return self._custom.get("active_axis") def get_active_method(self): - """TODOACTIVEDOCS. + """Return the name of the active storage reduction method. .. versionadded:: ACTIVEVERSION @@ -119,14 +136,14 @@ def get_active_method(self): :Returns: `str` or `None` - The name of the active reduction method, or `None` if - one hasn't been set. + The name of the active storage reduction method, or + `None` if one hasn't been set. """ return self._custom.get("active_method") def set_active_axis(self, value): - """TODOACTIVEDOCS. + """Set the active storage reduction axes. .. versionadded:: ACTIVEVERSION @@ -134,7 +151,9 @@ def set_active_axis(self, value): :Parameters: - TODOACTIVEDOCS + value: `None` or (sequence of) `int` + The active storage reduction axes. If `None` then all + axes will be reduced. :Returns: @@ -144,7 +163,7 @@ def set_active_axis(self, value): self._custom["active_axis"] = value def set_active_method(self, value): - """TODOACTIVEDOCS. + """Set the name of the active storage reduction method. .. versionadded:: ACTIVEVERSION @@ -152,7 +171,8 @@ def set_active_method(self, value): :Parameters: - TODOACTIVEDOCS + value: `str` + The active storage reduction method. 
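The getter and setter methods documented above are intended to be used together. A short sketch, assuming a file-backed array class that inherits this mixin (the file and variable names are placeholders)::

    import numpy as np
    import cf

    n = cf.NetCDFArray(
        filename="file.nc", address="tas", shape=(12, 64), dtype=np.dtype(float)
    )

    n.set_active_method("min")  # record the reduction method
    n.set_active_axis(None)     # None means reduce over all axes
    assert n.get_active_method() == "min"
    assert n.get_active_axis() is None

Indexing such an actified instance is then expected to return the per-chunk components computed by `activestorage.Active`, rather than the raw data values.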
:Returns: diff --git a/cf/data/array/netcdfarray.py b/cf/data/array/netcdfarray.py index 4e9e6345a6..f2b4c28a47 100644 --- a/cf/data/array/netcdfarray.py +++ b/cf/data/array/netcdfarray.py @@ -10,7 +10,10 @@ class NetCDFArray( ): """An array stored in a netCDF file. - TODOACTIVEDOCS + **Active storage reductions** + + Active storage reduction may be enabled with the `actify` + method. See `cf.data.collapse.Collapse` for details. """ diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index 68af5947f2..a8309e0db9 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -1,5 +1,10 @@ from functools import wraps +try: + from activestorage import Active +except ModuleNotFoundError: + Active = None + # -------------------------------------------------------------------- # Define the active functions @@ -204,10 +209,8 @@ def actify(a, method, axis=None): `None`. """ - try: - from activestorage import Active # noqa: F401 - except ModuleNotFoundError: - # The active storage class dependency is not met, so using + if Active is None: + # The active storage import dependency is not met, so using # active storage is not possible. return a, None diff --git a/cf/test/test_NetCDFArray.py b/cf/test/test_NetCDFArray.py index c69a4654e7..d1fbcec692 100644 --- a/cf/test/test_NetCDFArray.py +++ b/cf/test/test_NetCDFArray.py @@ -5,6 +5,7 @@ import tempfile import unittest +import numpy as np from dask.base import tokenize faulthandler.enable() # to debug seg faults and timeouts @@ -32,6 +33,13 @@ def _remove_tmpfiles(): class NetCDFArrayTest(unittest.TestCase): + n = cf.NetCDFArray( + filename="filename.nc", + address="x", + shape=(5, 8), + dtype=np.dtype(float), + ) + def test_NetCDFArray_del_file_location(self): a = cf.NetCDFArray(("/data1/file1", "/data2/file2"), ("tas1", "tas2")) b = a.del_file_location("/data1") @@ -121,6 +129,19 @@ def test_NetCDFArray_multiple_files(self): self.assertEqual(len(n.get_filenames()), 2) self.assertTrue((n[...] 
== f.array).all()) + def test_NetCDFArray_active_method(self): + n = self.n + self.assertIsNone(n.get_active_method()) + self.assertIsNone(n.set_active_method("min")) + self.assertEqual(n.get_active_method(), "min") + + def test_NetCDFArray_active_axis(self): + # Create instance with non-existent file + n = self.n + self.assertIsNone(n.get_active_axis()) + self.assertIsNone(n.set_active_axis((1, 2))) + self.assertEqual(n.get_active_axis(), (1, 2)) + if __name__ == "__main__": print("Run date:", datetime.datetime.now()) From 62edeb80e6e508840de8b5a0b06e414d732461b8 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 2 Feb 2024 14:39:40 +0000 Subject: [PATCH 036/134] dependency versions --- Changelog.rst | 2 ++ cf/__init__.py | 6 +++--- docs/source/installation.rst | 6 +++--- requirements.txt | 4 ++-- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/Changelog.rst b/Changelog.rst index 907b1a0132..49a1a60820 100644 --- a/Changelog.rst +++ b/Changelog.rst @@ -6,6 +6,8 @@ version 3.17.0 * Fix bug that caused `cf.Field.del_file_location` to fail when updating its metdata constructs (https://github.com/NCAS-CMS/cf-python/issues/707) +* Changed dependency: ``1.11.1.0<=cfdm<1.11.2.0`` +* Changed dependency: ``cfunits>=3.3.7`` version 3.16.0 -------------- diff --git a/cf/__init__.py b/cf/__init__.py index ae070ca65f..8be27befbb 100644 --- a/cf/__init__.py +++ b/cf/__init__.py @@ -191,7 +191,7 @@ ) # Check the version of cfunits -_minimum_vn = "3.3.6" +_minimum_vn = "3.3.7" if Version(cfunits.__version__) < Version(_minimum_vn): raise RuntimeError( f"Bad cfunits version: cf requires cfunits>={_minimum_vn}. " @@ -199,8 +199,8 @@ ) # Check the version of cfdm -_minimum_vn = "1.11.0.0" -_maximum_vn = "1.11.1.0" +_minimum_vn = "1.11.1.0" +_maximum_vn = "1.11.2.0" _cfdm_version = Version(cfdm.__version__) if not Version(_minimum_vn) <= _cfdm_version < Version(_maximum_vn): raise RuntimeError( diff --git a/docs/source/installation.rst b/docs/source/installation.rst index f94a331318..774972d0aa 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -203,10 +203,10 @@ Required * `scipy `_, version 1.10.0 or newer. -* `cfdm `_, version 1.11.0.0 or up to, - but not including, 1.11.1.0. +* `cfdm `_, version 1.11.1.0 or up to, + but not including, 1.11.2.0. -* `cfunits `_, version 3.3.6 or newer. +* `cfunits `_, version 3.3.7 or newer. * `psutil `_, version 0.6.0 or newer. 
diff --git a/requirements.txt b/requirements.txt index a10ba9f71a..0e64e76a4a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,9 @@ netCDF4>=1.5.4 cftime>=1.6.2 numpy>=1.22 -cfdm>=1.11.0.0, <1.11.1.0 +cfdm>=1.11.1.0, <1.11.2.0 psutil>=0.6.0 -cfunits>=3.3.6 +cfunits>=3.3.7 dask>=2022.12.1 packaging>=20.0 scipy>=1.10.0 From ebb94ccf690140ee1796ef9a3b979566d04fa8f3 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 2 Feb 2024 16:23:02 +0000 Subject: [PATCH 037/134] dev --- cf/data/array/umarray.py | 1 - cf/data/collapse/__init__.py | 2 +- cf/data/collapse/collapse.py | 27 +++++++--- cf/data/collapse/collapse_active.py | 23 +++++--- cf/data/data.py | 9 +++- cf/data/utils.py | 3 +- cf/read_write/netcdf/netcdfread.py | 81 ++--------------------------- cf/test/test_Data.py | 76 ++++++++++++++------------- 8 files changed, 87 insertions(+), 135 deletions(-) diff --git a/cf/data/array/umarray.py b/cf/data/array/umarray.py index 112753cc75..ab5d0d857f 100644 --- a/cf/data/array/umarray.py +++ b/cf/data/array/umarray.py @@ -303,7 +303,6 @@ def _set_units(self, int_hdr): """ units = self._get_component("units", False) if units is False: - # TODOHDF mocve to def _get_attr units = None if not _stash2standard_name: diff --git a/cf/data/collapse/__init__.py b/cf/data/collapse/__init__.py index 47bbd037ce..547689794d 100644 --- a/cf/data/collapse/__init__.py +++ b/cf/data/collapse/__init__.py @@ -1,2 +1,2 @@ from .collapse import Collapse -from .collapse_active import actify +from .collapse_active import actify, active_chunk_functions, active_storage diff --git a/cf/data/collapse/collapse.py b/cf/data/collapse/collapse.py index 9e460e5c7e..74cdb5ffd8 100644 --- a/cf/data/collapse/collapse.py +++ b/cf/data/collapse/collapse.py @@ -15,23 +15,34 @@ class Collapse(metaclass=DocstringRewriteMeta): **Active storage reductions** A collapse method (such as `max`, `var`, etc.) will attempt to - make use of active storage reductions if all of the following + make use of active storage reductions when all of the following conditions are met: - 1. it is possible to import the `activestorage.Active` class; + * the collapse is over all axes; - 2. the collapse method's *active_storage* parameter is True; + * the collapse is unweighted; - 3. the method has a corresponding active chunk function defined - in the `collapse_active.active_chunk_functions` dictionary; + * the data is not compressed by convention (note that netCDF + numeric packing is not considered here to be a compression by + convention techinigue); - 4. inspection of the graph of the `dask` array confirms that - making use of active storage is possible; + * it is possible to import the `activestorage.Active` class; + + * the collapse method's *active_storage* parameter is True; + + * the collapse method's *chunk_function* parameter is `None`; + + * the method has a corresponding active chunk function defined + in the `cf.data.collapse.active_chunk_functions` dictionary; + + * inspection of the graph of the `dask` array with + `cf.data.collapse.actify` confirms that making use of active + storage is possible; in which case the Dask graph is modified to expect the per-chunk reductions to be carried out externally. - See `cf.data.collapse.actify` for details. + See `cf.data.collapse.active_storage` for details. .. 
versionadded:: 3.14.0 diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index a8309e0db9..89b05e3c2c 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -1,3 +1,4 @@ +import logging from functools import wraps try: @@ -5,6 +6,8 @@ except ModuleNotFoundError: Active = None +logger = logging.getLogger(__name__) + # -------------------------------------------------------------------- # Define the active functions @@ -175,11 +178,9 @@ def actify(a, method, axis=None): .. note:: It is assumed that the `!active_storage` attribute of the `Data` object that provided the dask array *a* is `True`. If this is not the case then an error at compute - time is likely. - - The value of the `!active_storage` attribute is - registered via the *active_storage* parameter of - `Collapse` methods. + time is likely. The value of the `Data` object's + `!active_storage` attribute is registered via the + *active_storage* parameter of `Collapse` methods. .. versionadded:: ACTIVEVERSION @@ -281,6 +282,7 @@ def actify(a, method, axis=None): # active storage reductions => redefine the dask array from the # actified dask graph, and set the active storage reduction chunk # function. + logger.warn("Using activestorage.Active to collapse data") return ( da.Array(dsk, a.name, a.chunks, a.dtype, a._meta), active_chunk_functions[method], @@ -319,13 +321,18 @@ def wrapper(self, *args, **kwargs): ): # Attempt to actify the dask array and provide a new # chunk function - a, chunk_function = actify( - args[0], + if args: + dask_array = args[0] + else: + dask_array = kwargs.pop("a") + + dask_array, chunk_function = actify( + dask_array, method=method, axis=kwargs.get("axis"), ) args = list(args) - args[0] = a + args[0] = dask_array if chunk_function is not None: # The dask array has been actified, so update the diff --git a/cf/data/data.py b/cf/data/data.py index 6208183cf7..72c3446c3d 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -34,6 +34,9 @@ _numpy_allclose, _section, abspath, +) +from ..functions import active_storage as cf_active_storage +from ..functions import ( atol, default_netCDF_fillvals, free_memory, @@ -4778,7 +4781,11 @@ def active_storage(self): False """ - return self._custom.get("active_storage", False) + return ( + self._custom.get("active_storage", False) + and bool(cf_active_storage()) + and not self.get_compression_type() + ) @property def Units(self): diff --git a/cf/data/utils.py b/cf/data/utils.py index 51b60597e9..89f1a02928 100644 --- a/cf/data/utils.py +++ b/cf/data/utils.py @@ -14,7 +14,6 @@ rt2dt, st2rt, ) -from ..functions import active_storage from ..units import Units from .dask_utils import cf_YMDhms @@ -863,7 +862,7 @@ def collapse( "keepdims": keepdims, "split_every": split_every, "mtol": mtol, - "active_storage": d.active_storage and active_storage(), + "active_storage": d.active_storage, } weights = parse_weights(d, weights, axis) diff --git a/cf/read_write/netcdf/netcdfread.py b/cf/read_write/netcdf/netcdfread.py index c5e53b54c8..e4044e2da7 100644 --- a/cf/read_write/netcdf/netcdfread.py +++ b/cf/read_write/netcdf/netcdfread.py @@ -956,13 +956,11 @@ def _cfa_parse_aggregated_data(self, ncvar, aggregated_data): continue variable = g["variables"][term_ncvar] - if g["original_netCDF4"]: - variable.set_auto_maskandscale(False) - - array = variable[...] 
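With the change to `Data.active_storage` above, the property now reflects the per-instance flag, the global `cf.active_storage` setting, and whether the data are compressed by convention. A short sketch of the resulting behaviour (illustrative only, using the same private `_set_active_storage` call as the tests)::

    import cf

    d = cf.Data([[9, 8]])
    d._set_active_storage(True)

    with cf.active_storage(False):
        print(d.active_storage)  # False: the global switch is off

    with cf.active_storage(True):
        print(d.active_storage)  # True: instance flag and global switch agree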
- array = cfdm.MaskScale.apply( - variable, array, mask=True, scale=True, always_mask=False + array = cfdm.VariableIndexer( + variable, mask=True, scale=True, always_masked=False ) + array = array[...] + # array = self._cfa_conform_array(array) aggregation_instructions[term_ncvar] = array @@ -988,74 +986,3 @@ def _cfa_parse_aggregated_data(self, ncvar, aggregated_data): g["cfa_aggregated_data"][ncvar] = out return out - - -# def _cfa_conform_array(self, array): -# """Conform an array so that it is suitable for CFA processing. -# -# .. versionadded: 3.15.0 -# -# :Parameters: -# -# array: `np.ndarray` -# The array to conform. -# -# :Returns: -# -# array: `np.ndarray` -# The conformed array. -# -# """ -# string_type = isinstance(array, str) -# -# if string_type: -# print (888888) -# # -------------------------------------------------------- -# # A netCDF string type scalar variable comes out as Python -# # str object, so convert it to a numpy array. -# # -------------------------------------------------------- -# array = np.array(array, dtype=f"U{len(array)}") -# -# kind = array.dtype.kind -# if not string_type and kind in "SU": -# # Collapse by concatenation the outermost (fastest -# # varying) dimension of char array into -# # memory. E.g. [['a','b','c']] becomes ['abc'] -# if kind == "U": -# array = array.astype("S", copy=False) -# -# array = netCDF4.chartostring(array) -# shape = array.shape -# array = np.array([x.rstrip() for x in array.flat], dtype="U") -# array = np.reshape(array, shape) -# array = np.ma.masked_where(array == "", array) -# elif not string_type and kind == "O": -# array = array.astype("U", copy=False) -# print (11111111, repr(array)) -# array = np.ma.where(array == "", np.ma.masked, array) -# -# return array -# -# if isinstance(array, str): -# # string -# return np.array(array, dtype=f"S{len(array)}").astype("U") -# -# kind = array.dtype.kind -# if kind == "O": -# # string -# return array.astype("U") -# -# if kind in "SU": -# # char -# if kind == "U": -# array = array.astype("S") -# -# array = netCDF4.chartostring(array) -# shape = array.shape -# array = np.array([x.rstrip() for x in array.flat], dtype="S") -# array = np.reshape(array, shape) -# array = np.ma.masked_where(array == b"", array) -# return array.astype("U") -# -# # number -# return array diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index ee956f936f..b17119a66d 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -4502,43 +4502,45 @@ def test_Data__str__(self): def test_Data_active_storage(self): """Test `Data.active_storage`.""" - d = cf.Data([[9, 8]]) - self.assertFalse(d.active_storage) - - d._set_active_storage(True) - self.assertTrue(d.active_storage) - d._del_active_storage() - self.assertFalse(d.active_storage) - - # Check that operations correctly set active_storage to False, - # in particular those that do not invokde `Data._set_dask`. - d._set_active_storage(True) - d.transpose(inplace=True) - self.assertFalse(d.active_storage) - - d._set_active_storage(True) - d[...] 
= -1 - self.assertFalse(d.active_storage) - - d._set_active_storage(True) - d.persist(inplace=True) - self.assertFalse(d.active_storage) - - d._set_active_storage(True) - d.rechunk(1, inplace=True) - self.assertFalse(d.active_storage) - - # Test with data on disk - n = cf.NetCDFArray( - "test_file.nc", - "eastward_wind", - shape=(1, 9, 10), - dtype=np.dtype(float), - ) - d = cf.Data(n) - self.assertTrue(d.active_storage) - d = cf.Data(n, to_memory=True) - self.assertFalse(d.active_storage) + with cf.active_storage(True): + d = cf.Data([[9, 8]]) + self.assertFalse(d.active_storage) + + d._set_active_storage(True) + self.assertTrue(d.active_storage) + d._del_active_storage() + self.assertFalse(d.active_storage) + + # Check that operations correctly set active_storage to + # False, in particular those that do not invokde + # `Data._set_dask`. + d._set_active_storage(True) + d.transpose(inplace=True) + self.assertFalse(d.active_storage) + + d._set_active_storage(True) + d[...] = -1 + self.assertFalse(d.active_storage) + + d._set_active_storage(True) + d.persist(inplace=True) + self.assertFalse(d.active_storage) + + d._set_active_storage(True) + d.rechunk(1, inplace=True) + self.assertFalse(d.active_storage) + + # Test with data on disk + n = cf.NetCDFArray( + "test_file.nc", + "eastward_wind", + shape=(1, 9, 10), + dtype=np.dtype(float), + ) + d = cf.Data(n) + self.assertTrue(d.active_storage) + d = cf.Data(n, to_memory=True) + self.assertFalse(d.active_storage) def test_Data_cull_graph(self): """Test `Data.cull`""" From af7c20a7cb55a356c12c8e6a86047a2e493b5994 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Sun, 4 Feb 2024 20:39:14 +0000 Subject: [PATCH 038/134] dev --- cf/data/array/mixin/activestoragemixin.py | 11 +-- cf/data/collapse/collapse_active.py | 4 +- cf/data/data.py | 6 +- cf/read_write/netcdf/netcdfread.py | 2 +- cf/read_write/read.py | 66 ++++++++++-------- cf/test/test_active_storage.py | 81 +++++++++++++++++++++++ 6 files changed, 134 insertions(+), 36 deletions(-) create mode 100644 cf/test/test_active_storage.py diff --git a/cf/data/array/mixin/activestoragemixin.py b/cf/data/array/mixin/activestoragemixin.py index 62ebe5b33b..7bf1b367a4 100644 --- a/cf/data/array/mixin/activestoragemixin.py +++ b/cf/data/array/mixin/activestoragemixin.py @@ -47,14 +47,14 @@ def __getitem__(self, indices): missing_values = {} try: - s3 = self.get_s3() + storage_options = self.get_storage_options() except AttributeError: - s3 = {} + storage_options = {} active = Active( self.get_filename(), self.get_address(), - s3=s3, + # storage_options=storage_options, **missing_values, ) active.method = method @@ -66,7 +66,10 @@ def __getitem__(self, indices): except AttributeError: pass - return active[indices] + print(active.__dict__) + out = active[indices] + print(repr(out)) + return out def actify(self, method, axis=None): """Return a new actified `{{class}}` instance. diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index 89b05e3c2c..f5a8487b4d 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -226,6 +226,7 @@ def actify(a, method, axis=None): # return the input data unchanged. return a, None + print(9992, axis) # Parse axis if axis is None: axis = tuple(range(a.ndim)) @@ -236,6 +237,7 @@ def actify(a, method, axis=None): if len(axis) != a.ndim: # Can't (yet) use active storage to collapse a subset of # the axes, so return the input data unchanged. 
+ print(9993) return a, None axis = validate_axis(axis, a.ndim) @@ -282,7 +284,7 @@ def actify(a, method, axis=None): # active storage reductions => redefine the dask array from the # actified dask graph, and set the active storage reduction chunk # function. - logger.warn("Using activestorage.Active to collapse data") + logger.warning("Using activestorage.Active to collapse chunks") return ( da.Array(dsk, a.name, a.chunks, a.dtype, a._meta), active_chunk_functions[method], diff --git a/cf/data/data.py b/cf/data/data.py index 72c3446c3d..d45dec4210 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -4768,9 +4768,9 @@ def active_storage(self): If the `active_storage` attribute is `True` then reductions (such as calculating the minimum value of the data) will - attempt to use active storage capabilities, falling back on - the usual (non-active) techniques if an active storage - operation fails for any reason. + *attempt* to use active storage capabilities, falling back on + the usual (non-active) techniques if the conditionsa are not + right. .. versionadded:: ACTIVEVERSION diff --git a/cf/read_write/netcdf/netcdfread.py b/cf/read_write/netcdf/netcdfread.py index e4044e2da7..8c9803a43e 100644 --- a/cf/read_write/netcdf/netcdfread.py +++ b/cf/read_write/netcdf/netcdfread.py @@ -654,7 +654,7 @@ def _create_cfanetcdfarray( """ g = self.read_vars - print(g["s3"]) + print(g["storage_options"]) # Get the kwargs needed to instantiate a general NetCDFArray # instance diff --git a/cf/read_write/read.py b/cf/read_write/read.py index 49f6edc3c1..f98ad01710 100644 --- a/cf/read_write/read.py +++ b/cf/read_write/read.py @@ -62,13 +62,13 @@ def read( chunks="auto", domain=False, cfa=None, - s3=None, - library=None, + storage_options=None, + netCDF_backend=None, ): """Read field or domain constructs from files. - The following file formats are supported: CF-netCDF, CFA-netCDF, - CDL, UM fields file, and PP. + The following file formats are supported: netCDF, CFA-netCDF, CDL, + UM fields file, and PP. Input datasets are mapped to constructs in memory which are returned as elements of a `FieldList` or if the *domain* parameter @@ -667,26 +667,38 @@ def read( .. versionadded:: 3.15.0 - s3: `dict` or `None`, optional - Keyword parameters to be passed to `s3fs.S3FileSystem` to - control the opening of files in an S3 object store. By - default, or if `None`, then a value of ``{'anon': True}`` - is used. Ignored for file names that don't start with - ``s3:``. + storage_options: `dict` or `None`, optional + Key/value pairs to be passed on to the `s3fs.S3FileSystem` + file-system backend to control the opening of files in an + S3 object store. By default, or if `None`, then a value of + ``{'anon': True}`` is used. Ignored for file names that + don't start with ``s3:``. - If and only if *s3* has no ``'endpoint_url'`` key (which - will always be the case when *s3* is `None`), then one - will be automatically derived from the file name and + If and only if *s3* has no ``'endpoint_url'`` key, then + one will be automatically derived from the file name and included in the keyword parameters. For example, for a - file name of ``'s3://object-store/data/file.nc'``, then an - ``'endpoint_url'`` key with value - ``'https://object-store'`` would be created. To disable - this behaviour, assign `None` to the ``'endpoint_url'`` - key. + file name of ``'s3://store/data/file.nc'``, an + ``'endpoint_url'`` key with value ``'https://store'`` + would be created. 
To disable this behaviour, assign `None` + to the ``'endpoint_url'`` key. + + *Parameter example:* + ``{'anon': True}`` + + *Parameter example:* + For a file name of ``'s3://store/data/file.nc'``, the + following are equivalent: ``{'anon': True}`` and + ``{'anon': True, 'endpoint_url': 'https://store'}``. + + *Parameter example:* + ``{'key": 'kjhsadf8756', 'secret': '862t3gyebh', + 'client_kwargs': {'endpoint_url': 'http://some-s3.com', + 'config_kwargs': {'s3': {'addressing_style': + 'virtual'}}`` .. versionadded:: (cfdm) ACTIVEVERSION - library: `str` or `None`, optional + netCDF_backend: `str` or `None`, optional Specify which library to use for opening input files. By default, or if `None`, then `netCDF4` will used unless it fails to open a given file, in which case `h5netcdf` will @@ -987,8 +999,8 @@ def read( select=select, domain=domain, cfa_options=cfa_options, - library=library, - s3=s3, + netCDF_backend=netCDF_backend, + storage_options=storage_options, ) # -------------------------------------------------------- @@ -1103,8 +1115,8 @@ def _read_a_file( select=None, domain=False, cfa_options=None, - library=None, - s3=None, + netCDF_backend=None, + storage_options=None, ): """Read the contents of a single file into a field list. @@ -1140,12 +1152,12 @@ def _read_a_file( .. versionadded:: 3.15.0 - s3: `dict` or `None`, optional + storage_options: `dict` or `None`, optional See `cf.read` for details. .. versionadded:: ACTIVEVERSION - library: `str` or `None`, optional + netCDF_backend: `str` or `None`, optional See `cf.read` for details. .. versionadded:: ACTIVEVERSION @@ -1223,8 +1235,8 @@ def _read_a_file( mask=mask, warn_valid=warn_valid, domain=domain, - s3=s3, - library=library, + storage_options=storage_options, + netCDF_backend=netCDF_backend, ) except MaskError: # Some data required for field interpretation is missing, diff --git a/cf/test/test_active_storage.py b/cf/test/test_active_storage.py new file mode 100644 index 0000000000..d401c4e2be --- /dev/null +++ b/cf/test/test_active_storage.py @@ -0,0 +1,81 @@ +import atexit +import datetime +import faulthandler +import os +import tempfile +import unittest + +faulthandler.enable() # to debug seg faults and timeouts + +import numpy as np + +import cf + +try: + from activestorage import Active +except ModuleNotFoundError: + Active = None + +n_tmpfiles = 2 +tmpfiles = [ + tempfile.mkstemp("_test_active_storage.nc", dir=os.getcwd())[1] + for i in range(n_tmpfiles) +] +(tmpfile, tmpfile2) = tmpfiles + + +def _remove_tmpfiles(): + """Try to remove defined temporary files by deleting their paths.""" + for f in tmpfiles: + try: + os.remove(f) + except OSError: + pass + + +atexit.register(_remove_tmpfiles) + + +class ActiveStorageTest(unittest.TestCase): + @unittest.skipUnless(Active is not None, "Requires activestorage package.") + def test_active_storage(self): + # No masked values + f = cf.example_field(0) + cf.write(f, tmpfile) + + f = cf.read(tmpfile, chunks={"latitude": (4, 1), "longitude": (3, 5)}) + f = f[0] + self.assertEqual(f.data.chunks, ((4, 1), (3, 5))) + + cf.active_storage(False) + self.assertFalse(cf.active_storage()) + array = f.collapse("mean", weights=False).array + + with cf.active_storage(True): + self.assertTrue(cf.active_storage()) + self.assertTrue(f.data.active_storage) + active_array = f.collapse("mean").array + + self.assertEqual(array, active_array) + + # Masked values (not yet working) + # self.assertFalse(cf.active_storage()) + # f[0] = cf.masked + # cf.write(f, tmpfile2) + # f = cf.read(tmpfile2, 
chunks={"latitude": (4, 1), "longitude": (3, 5)}) + # f = f[0] + # + # array = f.collapse("mean", weights=False).array + # with cf.active_storage(True): + # self.assertTrue(cf.active_storage()) + # self.assertTrue(f.data.active_storage) + # active_array = f.collapse("mean").array + # + # self.assertEqual(array, active_array) + + +if __name__ == "__main__": + print("Run date:", datetime.datetime.now()) + cf.environment() + print("") + unittest.main(verbosity=2) From 31b2b64be7208fa14c53e05c954815790ba91233 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 5 Feb 2024 16:24:43 +0000 Subject: [PATCH 039/134] dev --- cf/__init__.py | 2 +- cf/cfimplementation.py | 6 +- cf/data/array/__init__.py | 2 +- cf/data/array/cfanetcdfarray.py | 56 +------ cf/data/array/h5netcdfarray.py | 7 +- cf/data/array/netcdf4array.py | 46 ++++++ cf/data/array/netcdfarray.py | 8 +- cf/data/fragment/__init__.py | 2 + cf/data/fragment/netcdf4fragmentarray.py | 105 +++++++++++++ cf/data/fragment/netcdffragmentarray.py | 147 ++++++++++++++++-- cf/read_write/netcdf/netcdfread.py | 2 +- cf/read_write/read.py | 11 +- cf/test/test_Data.py | 2 +- cf/test/test_Field.py | 4 +- ...st_NetCDFArray.py => test_NetCDF4Array.py} | 44 +++--- cf/test/test_active_storage.py | 2 - cf/test/test_read_write.py | 44 +++--- 17 files changed, 359 insertions(+), 131 deletions(-) create mode 100644 cf/data/array/netcdf4array.py create mode 100644 cf/data/fragment/netcdf4fragmentarray.py rename cf/test/{test_NetCDFArray.py => test_NetCDF4Array.py} (74%) diff --git a/cf/__init__.py b/cf/__init__.py index 8be27befbb..c750d697be 100644 --- a/cf/__init__.py +++ b/cf/__init__.py @@ -278,7 +278,7 @@ FullArray, GatheredArray, H5netcdfArray, - NetCDFArray, + NetCDF4Array, PointTopologyArray, RaggedContiguousArray, RaggedIndexedArray, diff --git a/cf/cfimplementation.py b/cf/cfimplementation.py index 3a9980731b..eb39374438 100644 --- a/cf/cfimplementation.py +++ b/cf/cfimplementation.py @@ -32,7 +32,7 @@ CFANetCDFArray, GatheredArray, H5netcdfArray, - NetCDFArray, + NetCDF4Array, PointTopologyArray, RaggedContiguousArray, RaggedIndexedArray, @@ -177,7 +177,7 @@ def initialise_CFANetCDFArray( CellConnectivityArray=CellConnectivityArray, GatheredArray=GatheredArray, H5netcdfArray=H5netcdfArray, - NetCDFArray=NetCDFArray, + NetCDF4Array=NetCDF4Array, PointTopologyArray=PointTopologyArray, RaggedContiguousArray=RaggedContiguousArray, RaggedIndexedArray=RaggedIndexedArray, @@ -233,7 +233,7 @@ def implementation(): 'Data': cf.data.data.Data, 'GatheredArray': cf.data.array.gatheredarray.GatheredArray, 'H5netcdfArray': cf.data.array.h5netcdfarray.H5netcdfArray, - 'NetCDFArray': cf.data.array.netcdfarray.NetCDFArray, + 'NetCDF4Array': cf.data.array.netcdf4array.NetCDF4Array, 'PointTopologyArray': , 'RaggedContiguousArray': cf.data.array.raggedcontiguousarray.RaggedContiguousArray, 'RaggedIndexedArray': cf.data.array.raggedindexedarray.RaggedIndexedArray, diff --git a/cf/data/array/__init__.py b/cf/data/array/__init__.py index ea828ac8de..5006b8a39e 100644 --- a/cf/data/array/__init__.py +++ b/cf/data/array/__init__.py @@ -4,7 +4,7 @@ from .fullarray import FullArray from .gatheredarray import GatheredArray from .h5netcdfarray import H5netcdfArray -from .netcdfarray import NetCDFArray +from .netcdf4array import NetCDF4Array from .pointtopologyarray import PointTopologyArray from .raggedcontiguousarray import RaggedContiguousArray from .raggedindexedarray import RaggedIndexedArray diff --git a/cf/data/array/cfanetcdfarray.py b/cf/data/array/cfanetcdfarray.py 
index dbadca946d..5336c5fcaa 100644 --- a/cf/data/array/cfanetcdfarray.py +++ b/cf/data/array/cfanetcdfarray.py @@ -8,18 +8,17 @@ from ..utils import chunk_locations, chunk_positions # from .mixin import CFAMixin -from .netcdfarray import NetCDFArray +from .netcdf4array import NetCDF4Array # Store fragment array classes. _FragmentArray = { - # "nc": H5FragmentArray, "nc": NetCDFFragmentArray, "um": UMFragmentArray, "full": FullFragmentArray, } -class CFANetCDFArray(NetCDFArray): +class CFANetCDFArray(NetCDF4Array): """A CFA aggregated array stored in a netCDF file. .. versionadded:: 3.14.0 @@ -37,7 +36,7 @@ def __init__( instructions=None, substitutions=None, term=None, - s3=None, + storage_options=None, source=None, copy=True, x=None, @@ -105,7 +104,7 @@ def __init__( .. versionadded:: 3.15.0 - {{init s3: `dict` or `None`, optional}} + {{init storage_options: `dict` or `None`, optional}} .. versionadded:: ACTIVEVERSION @@ -182,48 +181,16 @@ def __init__( if not a.ndim: a = (a.item(),) - # a = np.full(f.shape, a, dtype=a.dtype) - # if np.ma.is_masked(f): - # a = np.ma.array(a, mask=f.mask) scalar_address = True else: scalar_address = False if not file_fmt.ndim: - # fmt = np.full(fragment_shape, fmt, dtype=fmt.dtype) file_fmt = file_fmt.item() scalar_fmt = True else: scalar_fmt = False - # if extra_dimension: - # for frag_loc, loc in zip(positions, locations): - # if not scalar_address: - # address = compressed(a[frag_loc]).tolist() - # else: - # address = a - # - # if not scalar_fmt: - # file_fmt = compressed(fmt[frag_loc].tolist()) - # else: - # file_fmt = fmt - # - # aggregated_data['frag_loc'] = { - # "location": loc, - # "filename": compressed(f[frag_loc]).tolist(), - # "address": address, - # "format": file_fmt, - # } - # #aggregated_data = { - # # frag_loc: { - # # "location": loc, - # # "filename": compressed(f[frag_loc]).tolist(), - # # "address": compressed(a[frag_loc]).tolist(), - # # "format": fmt[frag_loc].item(), - # # } - # # for frag_loc, loc in zip(positions, locations) - # #} - # else: for frag_loc, location in zip(positions, locations): if extra_dimension: filename = compressed(f[frag_loc]).tolist() @@ -255,15 +222,6 @@ def __init__( "address": address, "format": fmt, } - # aggregated_data = { - # frag_loc: { - # "location": loc, - # "filename": (f[frag_loc].item(),), - # "address": (a[frag_loc].item(),), - # "format": fmt[frag_loc].item(), - # } - # for frag_loc, loc in zip(positions, locations) - # } # Apply string substitutions to the fragment filenames if substitutions: @@ -736,7 +694,7 @@ def to_dask_array(self, chunks="auto"): fragment_arrays = _FragmentArray.copy() fragment_arrays["nc"] = partial(_FragmentArray["nc"], mask=False) - s3 = self.get_s3() + storage_options = self.get_storage_options(endpoint_url=False) dsk = {} for ( @@ -759,9 +717,9 @@ def to_dask_array(self, chunks="auto"): f"fragment dataset format: {fragment_format!r}" ) - if s3 and kwargs["address"] == "nc": + if storage_options and kwargs["address"] == "nc": # Pass on any S3 file system options - kwargs["s3"] = s3 + kwargs["storage_options"] = storage_options fragment = FragmentArray( dtype=dtype, diff --git a/cf/data/array/h5netcdfarray.py b/cf/data/array/h5netcdfarray.py index a757fd2a24..370f5e6c6c 100644 --- a/cf/data/array/h5netcdfarray.py +++ b/cf/data/array/h5netcdfarray.py @@ -32,9 +32,10 @@ def _lock(self): Returns a lock object because concurrent reads are not currently supported by the HDF5 library. 
The lock object will - be the same for all `NetCDFArray` and `HDFArray` instances, - regardless of the dataset they access, which means that access - to all netCDF and HDF files coordinates around the same lock. + be the same for all `NetCDF4Array` and `H5netcdfArray` + instances, regardless of the dataset they access, which means + that access to all netCDF and HDF files coordinates around the + same lock. .. versionadded:: HDFVER diff --git a/cf/data/array/netcdf4array.py b/cf/data/array/netcdf4array.py new file mode 100644 index 0000000000..47718e3d9f --- /dev/null +++ b/cf/data/array/netcdf4array.py @@ -0,0 +1,46 @@ +import cfdm + +from ...mixin_container import Container +from .locks import _lock +from .mixin import ActiveStorageMixin, ArrayMixin, FileArrayMixin + + +class NetCDF4Array( + ActiveStorageMixin, + FileArrayMixin, + ArrayMixin, + Container, + cfdm.NetCDF4Array, +): + """An array stored in a netCDF file. + + **Active storage reductions** + + Active storage reduction may be enabled with the `actify` + method. See `cf.data.collapse.Collapse` for details. + + """ + + def __dask_tokenize__(self): + """Return a value fully representative of the object. + + .. versionadded:: 3.15.0 + + """ + return super().__dask_tokenize__() + (self.get_mask(),) + + @property + def _lock(self): + """Set the lock for use in `dask.array.from_array`. + + Returns a lock object because concurrent reads are not + currently supported by the netCDF and HDF libraries. The lock + object will be the same for all `NetCDF4Array` and + `H5netcdfArray` instances, regardless of the dataset they + access, which means that access to all netCDF and HDF files + coordinates around the same lock. + + .. versionadded:: 3.14.0 + + """ + return _lock diff --git a/cf/data/array/netcdfarray.py b/cf/data/array/netcdfarray.py index f2b4c28a47..54b826b79a 100644 --- a/cf/data/array/netcdfarray.py +++ b/cf/data/array/netcdfarray.py @@ -5,8 +5,12 @@ from .mixin import ActiveStorageMixin, ArrayMixin, FileArrayMixin -class NetCDFArray( - ActiveStorageMixin, FileArrayMixin, ArrayMixin, Container, cfdm.NetCDFArray +class NetCDF4Array( + ActiveStorageMixin, + FileArrayMixin, + ArrayMixin, + Container, + cfdm.NetCDF4Array, ): """An array stored in a netCDF file. diff --git a/cf/data/fragment/__init__.py b/cf/data/fragment/__init__.py index 2ce2dafa60..b7315107d4 100644 --- a/cf/data/fragment/__init__.py +++ b/cf/data/fragment/__init__.py @@ -1,3 +1,5 @@ from .fullfragmentarray import FullFragmentArray +from .h5netcdffragmentarray import H5netcdfFragmentArray from .netcdffragmentarray import NetCDFFragmentArray +from .netcdf4fragmentarray import NetCDF4FragmentArray from .umfragmentarray import UMFragmentArray diff --git a/cf/data/fragment/netcdf4fragmentarray.py b/cf/data/fragment/netcdf4fragmentarray.py new file mode 100644 index 0000000000..e14e2ba6db --- /dev/null +++ b/cf/data/fragment/netcdf4fragmentarray.py @@ -0,0 +1,105 @@ +from ..array.netcdf4array import NetCDF4Array +from .mixin import FragmentArrayMixin + + +class NetCDF4FragmentArray(FragmentArrayMixin, NetCDF4Array): + """A CFA fragment array stored in a netCDF file. + + .. 
versionadded:: 3.14.0 + + """ + + def __init__( + self, + filename=None, + address=None, + dtype=None, + shape=None, + aggregated_units=False, + aggregated_calendar=False, + units=False, + calendar=None, + storage_options=None, + source=None, + copy=True, + ): + """**Initialisation** + + :Parameters: + + filename: (sequence of `str`), optional + The names of the netCDF fragment files containing the + array. + + address: (sequence of `str`), optional + The name of the netCDF variable containing the + fragment array. Required unless *varid* is set. + + dtype: `numpy.dtype`, optional + The data type of the aggregated array. May be `None` + if the numpy data-type is not known (which can be the + case for netCDF string types, for example). This may + differ from the data type of the netCDF fragment + variable. + + shape: `tuple`, optional + The shape of the fragment within the aggregated + array. This may differ from the shape of the netCDF + fragment variable in that the latter may have fewer + size 1 dimensions. + + units: `str` or `None`, optional + The units of the fragment data. Set to `None` to + indicate that there are no units. If unset then the + units will be set during the first `__getitem__` call. + + calendar: `str` or `None`, optional + The calendar of the fragment data. Set to `None` to + indicate the CF default calendar, if applicable. If + unset then the calendar will be set during the first + `__getitem__` call. + + {{aggregated_units: `str` or `None`, optional}} + + {{aggregated_calendar: `str` or `None`, optional}} + + {{init storage_options: `dict` or `None`, optional}} + + .. versionadded:: ACTIVEVERSION + + {{init source: optional}} + + {{init copy: `bool`, optional}} + + """ + super().__init__( + filename=filename, + address=address, + dtype=dtype, + shape=shape, + mask=True, + units=units, + calendar=calendar, + source=source, + copy=copy, + ) + + if source is not None: + try: + aggregated_units = source._get_component( + "aggregated_units", False + ) + except AttributeError: + aggregated_units = False + + try: + aggregated_calendar = source._get_component( + "aggregated_calendar", False + ) + except AttributeError: + aggregated_calendar = False + + self._set_component("aggregated_units", aggregated_units, copy=False) + self._set_component( + "aggregated_calendar", aggregated_calendar, copy=False + ) diff --git a/cf/data/fragment/netcdffragmentarray.py b/cf/data/fragment/netcdffragmentarray.py index c3730c418b..62285e9efe 100644 --- a/cf/data/fragment/netcdffragmentarray.py +++ b/cf/data/fragment/netcdffragmentarray.py @@ -1,12 +1,24 @@ -# from ..array.mixin import ActiveStorageMixin -from ..array.h5netcdfarray import H5netcdfArray +from urllib.parse import urlparse + +import cfdm + +from ..array.abstract import Array +from ..array.mixin import FileArrayMixin +from .h5netcdffragmentarray import H5netcdfFragmentArray from .mixin import FragmentArrayMixin +from .netcdf4fragmentarray import NetCDF4FragmentArray -class NetCDFFragmentArray(FragmentArrayMixin, H5netcdfArray): - """A CFA fragment array stored in a netCDF file. +class NetCDFFragmentArray( + FragmentArrayMixin, + cfdm.data.mixin.NetCDFFileMixin, + FileArrayMixin, + cfdm.data.mixin.FileArrayMixin, + Array, +): + """Mixin class for a CFA fragment array. - .. versionadded:: 3.14.0 + .. 
versionadded:: 3.15.0 """ @@ -20,7 +32,7 @@ def __init__( aggregated_calendar=False, units=False, calendar=None, - s3=None, + storage_options=None, source=None, copy=True, ): @@ -64,7 +76,7 @@ def __init__( {{aggregated_calendar: `str` or `None`, optional}} - {{init s3: `dict` or `None`, optional}} + {{init storage_options: `dict` or `None`, optional}} .. versionadded:: ACTIVEVERSION @@ -74,19 +86,41 @@ def __init__( """ super().__init__( - filename=filename, - address=address, - dtype=dtype, - shape=shape, - mask=True, - units=units, - calendar=calendar, - s3=s3, source=source, copy=copy, ) if source is not None: + try: + shape = source._get_component("shape", None) + except AttributeError: + shape = None + + try: + filename = source._get_component("filename", None) + except AttributeError: + filename = None + + try: + address = source._get_component("address", None) + except AttributeError: + address = None + + try: + dtype = source._get_component("dtype", None) + except AttributeError: + dtype = None + + try: + units = source._get_component("units", False) + except AttributeError: + units = False + + try: + calendar = source._get_component("calendar", False) + except AttributeError: + calendar = False + try: aggregated_units = source._get_component( "aggregated_units", False @@ -101,7 +135,90 @@ def __init__( except AttributeError: aggregated_calendar = False + try: + storage_options = source._get_component( + "storage_options", None + ) + except AttributeError: + storage_options = None + + if filename is not None: + if isinstance(filename, str): + filename = (filename,) + else: + filename = tuple(filename) + + self._set_component("filename", filename, copy=False) + + if address is not None: + if isinstance(address, int): + address = (address,) + else: + address = tuple(address) + + self._set_component("address", address, copy=False) + + if storage_options is not None: + self._set_component("storage_options", storage_options, copy=False) + + self._set_component("shape", shape, copy=False) + self._set_component("dtype", dtype, copy=False) + self._set_component("units", units, copy=False) + self._set_component("calendar", calendar, copy=False) + self._set_component("mask", True, copy=False) + self._set_component("aggregated_units", aggregated_units, copy=False) self._set_component( "aggregated_calendar", aggregated_calendar, copy=False ) + + # By default, close the file after data array access + self._set_component("close", True, copy=False) + + def __getitem__(self, indices): + """Returns a subspace of the fragment as a numpy array. + + x.__getitem__(indices) <==> x[indices] + + .. versionadded:: 3.15.0 + + """ + + kwargs = { + "dtype": self.dtype, + "shape": self.shape, + "aggregated_units": self.get_aggregated_units(None), + "aggregated_calendar": self.get_aggregated_calendar(None), + "units": self.get_units(None), + "calendar": self.get_units(None), + "copy": False, + } + + # Loop round the files, returning as soon as we find one that + # works. 
+ filenames = self.get_filenames() + for filename, address in zip(filenames, self.get_addresses()): + kwargs["filename"] = filename + kwargs["address"] = address + + scheme = urlparse(filename).scheme + if scheme == "s3": + kwargs["storage_options"] = self.get_storage_options( + endpoint_url=False + ) + fragment = H5netcdfFragmentArray(**kwargs) + else: + fragment = NetCDF4FragmentArray(**kwargs) + + try: + return fragment[indices] + except FileNotFoundError: + pass + except RuntimeError as error: + raise RuntimeError(f"{error}: {filename}") + + # Still here? + if len(filenames) == 1: + raise FileNotFoundError(f"No such fragment file: {filenames[0]}") + + raise FileNotFoundError(f"No such fragment files: {filenames}") diff --git a/cf/read_write/netcdf/netcdfread.py b/cf/read_write/netcdf/netcdfread.py index 8c9803a43e..52127eb81d 100644 --- a/cf/read_write/netcdf/netcdfread.py +++ b/cf/read_write/netcdf/netcdfread.py @@ -957,7 +957,7 @@ def _cfa_parse_aggregated_data(self, ncvar, aggregated_data): variable = g["variables"][term_ncvar] array = cfdm.VariableIndexer( - variable, mask=True, scale=True, always_masked=False + variable, mask=True, scale=True, always_mask=False ) array = array[...] diff --git a/cf/read_write/read.py b/cf/read_write/read.py index f98ad01710..570e1e0631 100644 --- a/cf/read_write/read.py +++ b/cf/read_write/read.py @@ -682,19 +682,16 @@ def read( would be created. To disable this behaviour, assign `None` to the ``'endpoint_url'`` key. - *Parameter example:* - ``{'anon': True}`` - *Parameter example:* For a file name of ``'s3://store/data/file.nc'``, the following are equivalent: ``{'anon': True}`` and ``{'anon': True, 'endpoint_url': 'https://store'}``. *Parameter example:* - ``{'key": 'kjhsadf8756', 'secret': '862t3gyebh', - 'client_kwargs': {'endpoint_url': 'http://some-s3.com', - 'config_kwargs': {'s3': {'addressing_style': - 'virtual'}}`` + ``{'key": 'kjhsadf8756', 'secret': '862t3gyebh', + 'endpoint_url': None, 'client_kwargs': {'endpoint_url': + 'http://some-s3.com', 'config_kwargs': {'s3': + {'addressing_style': 'virtual'}}}}`` .. 
versionadded:: (cfdm) ACTIVEVERSION diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index b17119a66d..449f07b54a 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -4531,7 +4531,7 @@ def test_Data_active_storage(self): self.assertFalse(d.active_storage) # Test with data on disk - n = cf.NetCDFArray( + n = cf.NetCDF4Array( "test_file.nc", "eastward_wind", shape=(1, 9, 10), diff --git a/cf/test/test_Field.py b/cf/test/test_Field.py index 6927f9813b..88ee6c761e 100644 --- a/cf/test/test_Field.py +++ b/cf/test/test_Field.py @@ -1435,7 +1435,7 @@ def test_Field_indices(self): shape = (1, 1, 1) self.assertEqual(g.shape, shape) - self.assertEqual(g.array.compressed(), 29) + self.assertEqual(np.ma.compressed(g.array), 29) if mode != "full": self.assertEqual(g.construct("longitude").array, 83) @@ -1453,7 +1453,7 @@ def test_Field_indices(self): shape = (1, 2, 2) self.assertEqual(g.shape, shape) - self.assertTrue((g.array.compressed() == [4, 29]).all()) + self.assertTrue((np.ma.compressed(g.array) == [4, 29]).all()) # Add 2-d auxiliary coordinates with bounds, so we can # properly test cf.contains values diff --git a/cf/test/test_NetCDFArray.py b/cf/test/test_NetCDF4Array.py similarity index 74% rename from cf/test/test_NetCDFArray.py rename to cf/test/test_NetCDF4Array.py index d1fbcec692..a26b399808 100644 --- a/cf/test/test_NetCDFArray.py +++ b/cf/test/test_NetCDF4Array.py @@ -14,7 +14,7 @@ n_tmpfiles = 1 tmpfiles = [ - tempfile.mkstemp("_test_NetCDFArray.nc", dir=os.getcwd())[1] + tempfile.mkstemp("_test_NetCDF4Array.nc", dir=os.getcwd())[1] for i in range(n_tmpfiles) ] (tmpfile1,) = tmpfiles @@ -32,22 +32,22 @@ def _remove_tmpfiles(): atexit.register(_remove_tmpfiles) -class NetCDFArrayTest(unittest.TestCase): - n = cf.NetCDFArray( +class NetCDF4ArrayTest(unittest.TestCase): + n = cf.NetCDF4Array( filename="filename.nc", address="x", shape=(5, 8), dtype=np.dtype(float), ) - def test_NetCDFArray_del_file_location(self): - a = cf.NetCDFArray(("/data1/file1", "/data2/file2"), ("tas1", "tas2")) + def test_NetCDF4Array_del_file_location(self): + a = cf.NetCDF4Array(("/data1/file1", "/data2/file2"), ("tas1", "tas2")) b = a.del_file_location("/data1") self.assertIsNot(b, a) self.assertEqual(b.get_filenames(), ("/data2/file2",)) self.assertEqual(b.get_addresses(), ("tas2",)) - a = cf.NetCDFArray( + a = cf.NetCDF4Array( ("/data1/file1", "/data2/file1", "/data2/file2"), ("tas1", "tas1", "tas2"), ) @@ -60,18 +60,18 @@ def test_NetCDFArray_del_file_location(self): with self.assertRaises(ValueError): b.del_file_location("/data1/") - def test_NetCDFArray_file_locations(self): - a = cf.NetCDFArray("/data1/file1") + def test_NetCDF4Array_file_locations(self): + a = cf.NetCDF4Array("/data1/file1") self.assertEqual(a.file_locations(), ("/data1",)) - a = cf.NetCDFArray(("/data1/file1", "/data2/file2")) + a = cf.NetCDF4Array(("/data1/file1", "/data2/file2")) self.assertEqual(a.file_locations(), ("/data1", "/data2")) - a = cf.NetCDFArray(("/data1/file1", "/data2/file2", "/data1/file2")) + a = cf.NetCDF4Array(("/data1/file1", "/data2/file2", "/data1/file2")) self.assertEqual(a.file_locations(), ("/data1", "/data2", "/data1")) - def test_NetCDFArray_add_file_location(self): - a = cf.NetCDFArray("/data1/file1", "tas") + def test_NetCDF4Array_add_file_location(self): + a = cf.NetCDF4Array("/data1/file1", "tas") b = a.add_file_location("/home/user") self.assertIsNot(b, a) self.assertEqual( @@ -79,7 +79,7 @@ def test_NetCDFArray_add_file_location(self): ) self.assertEqual(b.get_addresses(), ("tas", 
"tas")) - a = cf.NetCDFArray(("/data1/file1", "/data2/file2"), ("tas1", "tas2")) + a = cf.NetCDF4Array(("/data1/file1", "/data2/file2"), ("tas1", "tas2")) b = a.add_file_location("/home/user") self.assertEqual( b.get_filenames(), @@ -92,7 +92,7 @@ def test_NetCDFArray_add_file_location(self): ) self.assertEqual(b.get_addresses(), ("tas1", "tas2", "tas1", "tas2")) - a = cf.NetCDFArray(("/data1/file1", "/data2/file1"), ("tas1", "tas2")) + a = cf.NetCDF4Array(("/data1/file1", "/data2/file1"), ("tas1", "tas2")) b = a.add_file_location("/home/user") self.assertEqual( b.get_filenames(), @@ -100,24 +100,24 @@ def test_NetCDFArray_add_file_location(self): ) self.assertEqual(b.get_addresses(), ("tas1", "tas2", "tas1")) - a = cf.NetCDFArray(("/data1/file1", "/data2/file1"), ("tas1", "tas2")) + a = cf.NetCDF4Array(("/data1/file1", "/data2/file1"), ("tas1", "tas2")) b = a.add_file_location("/data1/") self.assertEqual(b.get_filenames(), a.get_filenames()) self.assertEqual(b.get_addresses(), a.get_addresses()) - def test_NetCDFArray__dask_tokenize__(self): - a = cf.NetCDFArray("/data1/file1", "tas", shape=(12, 2), mask=False) + def test_NetCDF4Array__dask_tokenize__(self): + a = cf.NetCDF4Array("/data1/file1", "tas", shape=(12, 2), mask=False) self.assertEqual(tokenize(a), tokenize(a.copy())) - b = cf.NetCDFArray("/home/file2", "tas", shape=(12, 2)) + b = cf.NetCDF4Array("/home/file2", "tas", shape=(12, 2)) self.assertNotEqual(tokenize(a), tokenize(b)) - def test_NetCDFArray_multiple_files(self): + def test_NetCDF4Array_multiple_files(self): f = cf.example_field(0) cf.write(f, tmpfile1) # Create instance with non-existent file - n = cf.NetCDFArray( + n = cf.NetCDF4Array( filename=os.path.join("/bad/location", os.path.basename(tmpfile1)), address=f.nc_get_variable(), shape=f.shape, @@ -129,13 +129,13 @@ def test_NetCDFArray_multiple_files(self): self.assertEqual(len(n.get_filenames()), 2) self.assertTrue((n[...] 
== f.array).all()) - def test_NetCDFArray_active_method(self): + def test_NetCDF4Array_active_method(self): n = self.n self.assertIsNone(n.get_active_method()) self.assertIsNone(n.set_active_method("min")) self.assertEqual(n.get_active_method(), "min") - def test_NetCDFArray_active_axis(self): + def test_NetCDF4Array_active_axis(self): # Create instance with non-existent file n = self.n self.assertIsNone(n.get_active_axis()) diff --git a/cf/test/test_active_storage.py b/cf/test/test_active_storage.py index d401c4e2be..372d6ea185 100644 --- a/cf/test/test_active_storage.py +++ b/cf/test/test_active_storage.py @@ -7,8 +7,6 @@ faulthandler.enable() # to debug seg faults and timeouts -import numpy as np - import cf try: diff --git a/cf/test/test_read_write.py b/cf/test/test_read_write.py index 0eefa1b2ac..758da54f89 100644 --- a/cf/test/test_read_write.py +++ b/cf/test/test_read_write.py @@ -8,7 +8,7 @@ import tempfile import unittest -import numpy +import numpy as np faulthandler.enable() # to debug seg faults and timeouts @@ -93,26 +93,26 @@ def test_read_mask(self): cf.write(f, tmpfile) g = cf.read(tmpfile)[0] - self.assertEqual(numpy.ma.count(g.data.array), N - 2) + self.assertEqual(np.ma.count(g.data.array), N - 2) g = cf.read(tmpfile, mask=False)[0] - self.assertEqual(numpy.ma.count(g.data.array), N) + self.assertEqual(np.ma.count(g.data.array), N) g.apply_masking(inplace=True) - self.assertEqual(numpy.ma.count(g.data.array), N - 2) + self.assertEqual(np.ma.count(g.data.array), N - 2) f.set_property("_FillValue", 999) f.set_property("missing_value", -111) cf.write(f, tmpfile) g = cf.read(tmpfile)[0] - self.assertEqual(numpy.ma.count(g.data.array), N - 2) + self.assertEqual(np.ma.count(g.data.array), N - 2) g = cf.read(tmpfile, mask=False)[0] - self.assertEqual(numpy.ma.count(g.data.array), N) + self.assertEqual(np.ma.count(g.data.array), N) g.apply_masking(inplace=True) - self.assertEqual(numpy.ma.count(g.data.array), N - 2) + self.assertEqual(np.ma.count(g.data.array), N - 2) def test_read_directory(self): pwd = os.getcwd() + "/" @@ -562,38 +562,38 @@ def test_read_write_netCDF4_compress_shuffle(self): def test_write_datatype(self): f = cf.read(self.filename)[0] - self.assertEqual(f.dtype, numpy.dtype(float)) + self.assertEqual(f.dtype, np.dtype(float)) cf.write( f, tmpfile, fmt="NETCDF4", - datatype={numpy.dtype(float): numpy.dtype("float32")}, + datatype={np.dtype(float): np.dtype("float32")}, ) g = cf.read(tmpfile)[0] self.assertEqual( g.dtype, - numpy.dtype("float32"), + np.dtype("float32"), "datatype read in is " + str(g.dtype), ) # Keyword single f = cf.read(self.filename)[0] - self.assertEqual(f.dtype, numpy.dtype(float)) + self.assertEqual(f.dtype, np.dtype(float)) cf.write(f, tmpfile, fmt="NETCDF4", single=True) g = cf.read(tmpfile)[0] self.assertEqual( g.dtype, - numpy.dtype("float32"), + np.dtype("float32"), "datatype read in is " + str(g.dtype), ) # Keyword double f = g - self.assertEqual(f.dtype, numpy.dtype("float32")) + self.assertEqual(f.dtype, np.dtype("float32")) cf.write(f, tmpfile2, fmt="NETCDF4", double=True) g = cf.read(tmpfile2)[0] self.assertEqual( - g.dtype, numpy.dtype(float), "datatype read in is " + str(g.dtype) + g.dtype, np.dtype(float), "datatype read in is " + str(g.dtype) ) for single in (True, False): @@ -601,7 +601,7 @@ def test_write_datatype(self): with self.assertRaises(Exception): cf.write(g, double=double, single=single) - datatype = {numpy.dtype(float): numpy.dtype("float32")} + datatype = {np.dtype(float): np.dtype("float32")} with 
self.assertRaises(Exception): cf.write(g, datatype=datatype, single=True) @@ -898,8 +898,8 @@ def test_write_omit_data(self): g = g[0] # Check that the data are missing - self.assertFalse(g.array.count()) - self.assertFalse(g.construct("grid_latitude").array.count()) + self.assertFalse(np.ma.count(g.array)) + self.assertFalse(np.ma.count(g.construct("grid_latitude").array)) # Check that a dump works g.dump(display=False) @@ -909,16 +909,16 @@ def test_write_omit_data(self): # Check that only the field and dimension coordinate data are # missing - self.assertFalse(g.array.count()) - self.assertFalse(g.construct("grid_latitude").array.count()) - self.assertTrue(g.construct("latitude").array.count()) + self.assertFalse(np.ma.count(g.array)) + self.assertFalse(np.ma.count(g.construct("grid_latitude").array)) + self.assertTrue(np.ma.count(g.construct("latitude").array)) cf.write(f, tmpfile, omit_data="field") g = cf.read(tmpfile)[0] # Check that only the field data are missing - self.assertFalse(g.array.count()) - self.assertTrue(g.construct("grid_latitude").array.count()) + self.assertFalse(np.ma.count(g.array)) + self.assertTrue(np.ma.count(g.construct("grid_latitude").array)) def test_read_url(self): """Test reading urls.""" From 7b6cabe26f375ad18fdd65b94d562f898fe408e0 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 6 Feb 2024 17:58:31 +0000 Subject: [PATCH 040/134] dev --- cf/data/array/cfanetcdfarray.py | 62 +++++++++++++++++- cf/data/array/h5netcdfarray.py | 8 +-- cf/data/array/mixin/activestoragemixin.py | 7 +-- cf/data/array/netcdf4array.py | 2 +- cf/data/collapse/collapse.py | 10 +-- cf/data/collapse/collapse_active.py | 19 +++--- cf/data/data.py | 18 +++--- cf/data/fragment/netcdf4fragmentarray.py | 2 +- cf/data/fragment/netcdffragmentarray.py | 7 ++- cf/field.py | 2 +- cf/functions.py | 2 +- cf/read_write/netcdf/netcdfread.py | 22 +++---- cf/read_write/read.py | 76 ++++++++++++++--------- cf/test/test_active_storage.py | 2 +- 14 files changed, 158 insertions(+), 81 deletions(-) diff --git a/cf/data/array/cfanetcdfarray.py b/cf/data/array/cfanetcdfarray.py index 5336c5fcaa..f44d1ef9c2 100644 --- a/cf/data/array/cfanetcdfarray.py +++ b/cf/data/array/cfanetcdfarray.py @@ -104,7 +104,32 @@ def __init__( .. versionadded:: 3.15.0 - {{init storage_options: `dict` or `None`, optional}} + storage_options: `dict` or `None`, optional + Key/value pairs to be passed on to the creation of + `s3fs.S3FileSystem` file systems to control the + opening of fragment files in an S3 object + stores. Ignored for fragment files not in S3 object + stores, i.e. those whose names do not start with + ``s3:``. + + If an ``'endpoint_url'`` key is not in + *storage_options* then one will be automatically + derived for accessing each S3 fragment file. For + example, for a fragment file name of + ``'s3://store/data/file.nc'``, an ``'endpoint_url'`` + key with value ``'https://store'`` would be created. + + *Parameter example:* + For a fragment file name of + ``'s3://store/data/file.nc'``, the following are + equivalent: ``None``, ``{}`` and ``{'endpoint_url': + 'https://store'}``. + + *Parameter example:* + ``{'key: 'scaleway-api-key...', 'secret': + 'scaleway-secretkey...', 'endpoint_url': + 'https://s3.fr-par.scw.cloud', 'client_kwargs': + {'region_name': 'fr-par'}}`` .. versionadded:: ACTIVEVERSION @@ -381,6 +406,39 @@ def get_fragment_shape(self): """ return self._get_component("fragment_shape") + def get_storage_options(self): + """Return `s3fs.S3FileSystem` options for accessing S3 fragment files. 
+ + If an ``'endpoint_url'`` key is not in the returned options, + then one will be automatically derived for accessing each S3 + fragment file. For example, for a fragment file name of + ``'s3://store/data/file.nc'``, an ``'endpoint_url'`` key with + value ``'https://store'`` would be created. + + .. versionadded:: (cfdm) HDFVER + + :Returns: + + `dict` or `None` + The `s3fs.S3FileSystem` options. + + **Examples** + + >>> f.get_storage_options() + {} + + >>> f.get_storage_options() + {'anon': True} + + >>> f.get_storage_options() + {'key: 'scaleway-api-key...', + 'secret': 'scaleway-secretkey...', + 'endpoint_url': 'https://s3.fr-par.scw.cloud', + 'client_kwargs': {'region_name': 'fr-par'}} + + """ + return super().get_storage_options(create_endpoint_url=False) + def get_term(self, default=ValueError()): """The CFA aggregation instruction term for the data, if set. @@ -694,7 +752,7 @@ def to_dask_array(self, chunks="auto"): fragment_arrays = _FragmentArray.copy() fragment_arrays["nc"] = partial(_FragmentArray["nc"], mask=False) - storage_options = self.get_storage_options(endpoint_url=False) + storage_options = self.get_storage_options() dsk = {} for ( diff --git a/cf/data/array/h5netcdfarray.py b/cf/data/array/h5netcdfarray.py index 370f5e6c6c..8d43242063 100644 --- a/cf/data/array/h5netcdfarray.py +++ b/cf/data/array/h5netcdfarray.py @@ -12,16 +12,16 @@ class H5netcdfArray( Container, cfdm.H5netcdfArray, ): - """An array stored in a netCDF file.] + """A netCDF array accessed with `h5netcdf`. - .. versionadded:: HDFVER + .. versionadded:: ACTIVEVERSION """ def __dask_tokenize__(self): """Return a value fully representative of the object. - .. versionadded:: HDFVER + .. versionadded:: ACTIVEVERSION """ return super().__dask_tokenize__() + (self.get_mask(),) @@ -37,7 +37,7 @@ def _lock(self): that access to all netCDF and HDF files coordinates around the same lock. - .. versionadded:: HDFVER + .. versionadded:: ACTIVEVERSION """ return _lock diff --git a/cf/data/array/mixin/activestoragemixin.py b/cf/data/array/mixin/activestoragemixin.py index 7bf1b367a4..3f1f9f7de9 100644 --- a/cf/data/array/mixin/activestoragemixin.py +++ b/cf/data/array/mixin/activestoragemixin.py @@ -54,7 +54,7 @@ def __getitem__(self, indices): active = Active( self.get_filename(), self.get_address(), - # storage_options=storage_options, + storage_options=storage_options, **missing_values, ) active.method = method @@ -66,10 +66,7 @@ def __getitem__(self, indices): except AttributeError: pass - print(active.__dict__) - out = active[indices] - print(repr(out)) - return out + return active[indices] def actify(self, method, axis=None): """Return a new actified `{{class}}` instance. diff --git a/cf/data/array/netcdf4array.py b/cf/data/array/netcdf4array.py index 47718e3d9f..bccc28603c 100644 --- a/cf/data/array/netcdf4array.py +++ b/cf/data/array/netcdf4array.py @@ -12,7 +12,7 @@ class NetCDF4Array( Container, cfdm.NetCDF4Array, ): - """An array stored in a netCDF file. + """A netCDF array accessed with `netCDF4`. 
**Active storage reductions** diff --git a/cf/data/collapse/collapse.py b/cf/data/collapse/collapse.py index 74cdb5ffd8..fa1974b603 100644 --- a/cf/data/collapse/collapse.py +++ b/cf/data/collapse/collapse.py @@ -18,6 +18,10 @@ class Collapse(metaclass=DocstringRewriteMeta): make use of active storage reductions when all of the following conditions are met: + * `cf.active_storage()` is True; + + * it is possible to import the `activestorage.Active` class; + * the collapse is over all axes; * the collapse is unweighted; @@ -26,11 +30,9 @@ class Collapse(metaclass=DocstringRewriteMeta): numeric packing is not considered here to be a compression by convention techinigue); - * it is possible to import the `activestorage.Active` class; - - * the collapse method's *active_storage* parameter is True; + * the `Collapse` method's *active_storage* parameter is True; - * the collapse method's *chunk_function* parameter is `None`; + * the `Collapse` method's *chunk_function* parameter is `None`; * the method has a corresponding active chunk function defined in the `cf.data.collapse.active_chunk_functions` dictionary; diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index f5a8487b4d..aead9dd2ab 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -6,6 +6,8 @@ except ModuleNotFoundError: Active = None +from ...functions import active_storage as cf_active_storage + logger = logging.getLogger(__name__) @@ -24,7 +26,7 @@ def active_min(a, **kwargs): .. versionadded:: ACTIVEVERSION - .. seealso:: `actify` + .. seealso:: `actify`, `active_storage` :Parameters: @@ -59,7 +61,7 @@ def active_max(a, **kwargs): .. versionadded:: ACTIVEVERSION - .. seealso:: `actify` + .. seealso:: `actify`, `active_storage` :Parameters: @@ -94,7 +96,7 @@ def active_mean(a, **kwargs): .. versionadded:: ACTIVEVERSION - .. seealso:: `actify` + .. seealso:: `actify`, `active_storage` :Parameters: @@ -133,7 +135,7 @@ def active_sum(a, **kwargs): .. versionadded:: ACTIVEVERSION - .. seealso:: `actify` + .. seealso:: `actify`, `active_storage` :Parameters: @@ -226,7 +228,6 @@ def actify(a, method, axis=None): # return the input data unchanged. return a, None - print(9992, axis) # Parse axis if axis is None: axis = tuple(range(a.ndim)) @@ -237,7 +238,6 @@ def actify(a, method, axis=None): if len(axis) != a.ndim: # Can't (yet) use active storage to collapse a subset of # the axes, so return the input data unchanged. - print(9993) return a, None axis = validate_axis(axis, a.ndim) @@ -296,8 +296,7 @@ def active_storage(method): This decorator is intended for `Collapse` methods. When a `Collapse` method is decorated, active storage operations are only - carried out when the conditions are right. See `Collapse` for - details. + carried out when the conditions are right. .. 
versionadded:: ACTIVEVERSION @@ -320,6 +319,7 @@ def wrapper(self, *args, **kwargs): and method in active_chunk_functions and kwargs.get("weights") is None and kwargs.get("chunk_function") is None + and cf_active_storage() ): # Attempt to actify the dask array and provide a new # chunk function @@ -347,3 +347,6 @@ def wrapper(self, *args, **kwargs): return wrapper return decorator + + +2 diff --git a/cf/data/data.py b/cf/data/data.py index d45dec4210..25dfb38e97 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -34,9 +34,6 @@ _numpy_allclose, _section, abspath, -) -from ..functions import active_storage as cf_active_storage -from ..functions import ( atol, default_netCDF_fillvals, free_memory, @@ -4764,13 +4761,15 @@ def chunks(self): # ---------------------------------------------------------------- @property def active_storage(self): - """Whether or not active storage recductions are possible. + """Whether or not active storage reductions are possible. + + When the `active_storage` attribute is False it signifies that + active storage reductions are not available. - If the `active_storage` attribute is `True` then reductions - (such as calculating the minimum value of the data) will - *attempt* to use active storage capabilities, falling back on - the usual (non-active) techniques if the conditionsa are not - right. + When the `active_storage` attribute is True it signifies that + active storage reductions are possible, but only when all of + the conditions described by `cf.data.collapse.Collapse` are + met. .. versionadded:: ACTIVEVERSION @@ -4783,7 +4782,6 @@ def active_storage(self): """ return ( self._custom.get("active_storage", False) - and bool(cf_active_storage()) and not self.get_compression_type() ) diff --git a/cf/data/fragment/netcdf4fragmentarray.py b/cf/data/fragment/netcdf4fragmentarray.py index e14e2ba6db..377170cdaa 100644 --- a/cf/data/fragment/netcdf4fragmentarray.py +++ b/cf/data/fragment/netcdf4fragmentarray.py @@ -3,7 +3,7 @@ class NetCDF4FragmentArray(FragmentArrayMixin, NetCDF4Array): - """A CFA fragment array stored in a netCDF file. + """A netCDF fragment array accessed with `netCDF4`. .. versionadded:: 3.14.0 diff --git a/cf/data/fragment/netcdffragmentarray.py b/cf/data/fragment/netcdffragmentarray.py index 62285e9efe..ed2523bfd3 100644 --- a/cf/data/fragment/netcdffragmentarray.py +++ b/cf/data/fragment/netcdffragmentarray.py @@ -16,9 +16,12 @@ class NetCDFFragmentArray( cfdm.data.mixin.FileArrayMixin, Array, ): - """Mixin class for a CFA fragment array. + """A netCDF fragment array. - .. versionadded:: 3.15.0 + Access will either with `netCDF4` (for local and OPenDAP files) or + `h5netcdf` (for S3 files). + + .. versionadded:: ACTIVEVERSION """ diff --git a/cf/field.py b/cf/field.py index c970460636..30fbc40e2b 100644 --- a/cf/field.py +++ b/cf/field.py @@ -6777,7 +6777,7 @@ def collapse( data_axes = f.get_data_axes() iaxes = [ data_axes.index(axis) - for axis in collapse_axes + for axis in collapse_axes_all_sizes if axis in data_axes ] diff --git a/cf/functions.py b/cf/functions.py index 4d21c88399..901103dcd2 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -266,7 +266,7 @@ def configuration( reductions or False to disable them). The default is to not change the current behaviour. - .. versionaddedd:: ACTIVEVERSION + .. 
versionadded:: ACTIVEVERSION of_fraction: `float` or `Constant`, optional Deprecated at version 3.14.0 and is no longer diff --git a/cf/read_write/netcdf/netcdfread.py b/cf/read_write/netcdf/netcdfread.py index 52127eb81d..dd456f8aee 100644 --- a/cf/read_write/netcdf/netcdfread.py +++ b/cf/read_write/netcdf/netcdfread.py @@ -649,14 +649,13 @@ def _create_cfanetcdfarray( :Returns: (`CFANetCDFArray`, `dict`) - The new `NetCDFArray` instance and dictionary of the - kwargs used to create it. + The new `CFANetCDFArray` instance and dictionary of + the kwargs used to create it. """ g = self.read_vars - print(g["storage_options"]) - # Get the kwargs needed to instantiate a general NetCDFArray + # Get the kwargs needed to instantiate a general netCDF array # instance kwargs = self._create_netcdfarray( ncvar, @@ -728,13 +727,13 @@ def _create_cfanetcdfarray_term( :Returns: (`CFANetCDFArray`, `dict`) - The new `NetCDFArray` instance and dictionary of the - kwargs used to create it. + The new `CFANetCDFArray` instance and dictionary of + the kwargs used to create it. """ g = self.read_vars - # Get the kwargs needed to instantiate a general NetCDFArray + # Get the kwargs needed to instantiate a general netCDF array # instance kwargs = self._create_netcdfarray( ncvar, @@ -956,13 +955,10 @@ def _cfa_parse_aggregated_data(self, ncvar, aggregated_data): continue variable = g["variables"][term_ncvar] - array = cfdm.VariableIndexer( - variable, mask=True, scale=True, always_mask=False + array = cfdm.NetCDFIndexer( + variable, mask=True, unpack=True, always_mask=False ) - array = array[...] - - # array = self._cfa_conform_array(array) - aggregation_instructions[term_ncvar] = array + aggregation_instructions[term_ncvar] = array[...] if term == "file": # Find URI substitutions that may be stored in the diff --git a/cf/read_write/read.py b/cf/read_write/read.py index 570e1e0631..9f3c2ceeb3 100644 --- a/cf/read_write/read.py +++ b/cf/read_write/read.py @@ -58,6 +58,7 @@ def read( select_options=None, follow_symlinks=False, mask=True, + unpack=True, warn_valid=False, chunks="auto", domain=False, @@ -411,14 +412,13 @@ def read( parameter. mask: `bool`, optional - If False then do not mask by convention when reading the - data of field or metadata constructs from disk. By default - data is masked by convention. + If True (the default) then mask by convention the data of + field and metadata constructs. - The masking by convention of a netCDF array depends on the - values of any of the netCDF variable attributes - ``_FillValue``, ``missing_value``, ``valid_min``, - ``valid_max`` and ``valid_range``. + A netCDF array is masked depending on the values of any of + the netCDF attributes ``_FillValue``, ``missing_value``, + ``_Unsigned``, ``valid_min``, ``valid_max``, and + ``valid_range``. The masking by convention of a PP or UM array depends on the value of BMDI in the lookup header. A value other than @@ -430,6 +430,15 @@ def read( .. versionadded:: 3.4.0 + unpack: `bool`, optional + If True (the default) then unpack by convention when + reading data from disk. + + A netCDF array is unpacked depending on the values of the + netCDF attributes ``add_offset`` and ``scale_factor``. + + .. versionadded:: (cfdm) ACTIVEVERSION + warn_valid: `bool`, optional If True then print a warning for the presence of ``valid_min``, ``valid_max`` or ``valid_range`` properties @@ -668,32 +677,33 @@ def read( .. 
versionadded:: 3.15.0 storage_options: `dict` or `None`, optional - Key/value pairs to be passed on to the `s3fs.S3FileSystem` - file-system backend to control the opening of files in an - S3 object store. By default, or if `None`, then a value of - ``{'anon': True}`` is used. Ignored for file names that - don't start with ``s3:``. - - If and only if *s3* has no ``'endpoint_url'`` key, then - one will be automatically derived from the file name and - included in the keyword parameters. For example, for a - file name of ``'s3://store/data/file.nc'``, an - ``'endpoint_url'`` key with value ``'https://store'`` - would be created. To disable this behaviour, assign `None` - to the ``'endpoint_url'`` key. + Key/value pairs to be passed on to the creation of + `s3fs.S3FileSystem` file systems to control the opening of + files in S3 object stores. Ignored for files not in an S3 + object store, i.e. those whose names do not start with + ``s3:``. + + By default, or if `None`, then a value of ``{'anon': + True}`` is used. + + If an ``'endpoint_url'`` key is not in *storage_options* + then one will be automatically derived for accessing each + S3 file. For example, for a file name of + ``'s3://store/data/file.nc'``, an ``'endpoint_url'`` key + with value ``'https://store'`` would be created. *Parameter example:* For a file name of ``'s3://store/data/file.nc'``, the - following are equivalent: ``{'anon': True}`` and - ``{'anon': True, 'endpoint_url': 'https://store'}``. + following are equivalent: ``None``, ``{'anon': True}``, + and ``{'anon': True, 'endpoint_url': 'https://store'}``. *Parameter example:* - ``{'key": 'kjhsadf8756', 'secret': '862t3gyebh', - 'endpoint_url': None, 'client_kwargs': {'endpoint_url': - 'http://some-s3.com', 'config_kwargs': {'s3': - {'addressing_style': 'virtual'}}}}`` + ``{'key: 'scaleway-api-key...', 'secret': + 'scaleway-secretkey...', 'endpoint_url': + 'https://s3.fr-par.scw.cloud', 'client_kwargs': + {'region_name': 'fr-par'}}`` - .. versionadded:: (cfdm) ACTIVEVERSION + .. versionadded:: ACTIVEVERSION netCDF_backend: `str` or `None`, optional Specify which library to use for opening input files. By @@ -703,7 +713,14 @@ def read( ``'h5netcdf'`` will force the use of the `netCDF4` or `h5netcdf` libraries respectively. - .. versionadded:: (cfdm) ACTIVEVERSION + .. note:: The *netCDF_backend* parameter does not affect + the opening of netCDF fragment files that define + the data of aggregated variables. For these, + `netCDF4` is used for local files and those + accessed via OPenDAP, and `h5netcdf` is used for + fragement files in S3 object stores. + + .. versionadded:: ACTIVEVERSION umversion: deprecated at version 3.0.0 Use the *um* parameter instead. 
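
For orientation, the keywords documented in the hunk above combine as in the following minimal sketch. It is illustrative only: the file name, key and secret are invented placeholders, and ``storage_options``, ``netCDF_backend``, ``mask`` and ``unpack`` are used exactly as described in this docstring.

    import cf

    # A minimal sketch, assuming "file.nc" lives in an S3 object store
    # that needs credentials. The 'endpoint_url' is derived from the
    # file name ('https://store') because it is not given explicitly.
    fields = cf.read(
        "s3://store/data/file.nc",
        storage_options={"key": "my-api-key", "secret": "my-secret"},
        netCDF_backend="h5netcdf",  # force h5netcdf rather than netCDF4
        mask=True,    # mask by convention (the default)
        unpack=True,  # unpack by convention (the default)
    )
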
@@ -992,6 +1009,7 @@ def read( height_at_top_of_model=height_at_top_of_model, chunks=chunks, mask=mask, + unpack=unpack, warn_valid=warn_valid, select=select, domain=domain, @@ -1107,6 +1125,7 @@ def _read_a_file( extra=None, height_at_top_of_model=None, mask=True, + unpack=True, warn_valid=False, chunks="auto", select=None, @@ -1230,6 +1249,7 @@ def _read_a_file( warnings=warnings, extra_read_vars=extra_read_vars, mask=mask, + unpack=unpack, warn_valid=warn_valid, domain=domain, storage_options=storage_options, diff --git a/cf/test/test_active_storage.py b/cf/test/test_active_storage.py index 372d6ea185..fb058f287e 100644 --- a/cf/test/test_active_storage.py +++ b/cf/test/test_active_storage.py @@ -52,7 +52,7 @@ def test_active_storage(self): with cf.active_storage(True): self.assertTrue(cf.active_storage()) self.assertTrue(f.data.active_storage) - active_array = f.collapse("mean").array + active_array = f.collapse("mean", weights=False).array self.assertEqual(array, active_array) From a038030eb3a1badb15d9fa348fff2ef9993de7ee Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 7 Feb 2024 11:33:45 +0000 Subject: [PATCH 041/134] dev --- cf/data/array/mixin/activestoragemixin.py | 2 +- cf/data/fragment/h5netcdffragmentarray.py | 106 ++++++++++++++++++++++ cf/data/fragment/netcdf4fragmentarray.py | 1 + 3 files changed, 108 insertions(+), 1 deletion(-) create mode 100644 cf/data/fragment/h5netcdffragmentarray.py diff --git a/cf/data/array/mixin/activestoragemixin.py b/cf/data/array/mixin/activestoragemixin.py index 3f1f9f7de9..1ccacbab6b 100644 --- a/cf/data/array/mixin/activestoragemixin.py +++ b/cf/data/array/mixin/activestoragemixin.py @@ -54,7 +54,7 @@ def __getitem__(self, indices): active = Active( self.get_filename(), self.get_address(), - storage_options=storage_options, + # storage_options=storage_options, **missing_values, ) active.method = method diff --git a/cf/data/fragment/h5netcdffragmentarray.py b/cf/data/fragment/h5netcdffragmentarray.py new file mode 100644 index 0000000000..98a1a5f843 --- /dev/null +++ b/cf/data/fragment/h5netcdffragmentarray.py @@ -0,0 +1,106 @@ +from ..array.h5netcdfarray import H5netcdfArray +from .mixin import FragmentArrayMixin + + +class H5netcdfFragmentArray(FragmentArrayMixin, H5netcdfArray): + """A netCDF fragment array accessed with `h5netcdf`. + + .. versionadded:: ACTIVEVERSION + + """ + + def __init__( + self, + filename=None, + address=None, + dtype=None, + shape=None, + aggregated_units=False, + aggregated_calendar=False, + units=False, + calendar=None, + storage_options=None, + source=None, + copy=True, + ): + """**Initialisation** + + :Parameters: + + filename: (sequence of `str`), optional + The names of the netCDF fragment files containing the + array. + + address: (sequence of `str`), optional + The name of the netCDF variable containing the + fragment array. Required unless *varid* is set. + + dtype: `numpy.dtype`, optional + The data type of the aggregated array. May be `None` + if the numpy data-type is not known (which can be the + case for netCDF string types, for example). This may + differ from the data type of the netCDF fragment + variable. + + shape: `tuple`, optional + The shape of the fragment within the aggregated + array. This may differ from the shape of the netCDF + fragment variable in that the latter may have fewer + size 1 dimensions. + + units: `str` or `None`, optional + The units of the fragment data. Set to `None` to + indicate that there are no units. 
If unset then the + units will be set during the first `__getitem__` call. + + calendar: `str` or `None`, optional + The calendar of the fragment data. Set to `None` to + indicate the CF default calendar, if applicable. If + unset then the calendar will be set during the first + `__getitem__` call. + + {{aggregated_units: `str` or `None`, optional}} + + {{aggregated_calendar: `str` or `None`, optional}} + + {{init storage_options: `dict` or `None`, optional}} + + .. versionadded:: ACTIVEVERSION + + {{init source: optional}} + + {{init copy: `bool`, optional}} + + """ + super().__init__( + filename=filename, + address=address, + dtype=dtype, + shape=shape, + mask=True, + units=units, + calendar=calendar, + storage_options=storage_options, + source=source, + copy=copy, + ) + + if source is not None: + try: + aggregated_units = source._get_component( + "aggregated_units", False + ) + except AttributeError: + aggregated_units = False + + try: + aggregated_calendar = source._get_component( + "aggregated_calendar", False + ) + except AttributeError: + aggregated_calendar = False + + self._set_component("aggregated_units", aggregated_units, copy=False) + self._set_component( + "aggregated_calendar", aggregated_calendar, copy=False + ) diff --git a/cf/data/fragment/netcdf4fragmentarray.py b/cf/data/fragment/netcdf4fragmentarray.py index 377170cdaa..7fdbe79cb9 100644 --- a/cf/data/fragment/netcdf4fragmentarray.py +++ b/cf/data/fragment/netcdf4fragmentarray.py @@ -80,6 +80,7 @@ def __init__( mask=True, units=units, calendar=calendar, + storage_options=storage_options, source=source, copy=copy, ) From c6e94e7b42634ab25f573d9109ce3bb71de981bd Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 8 Feb 2024 18:28:10 +0000 Subject: [PATCH 042/134] dev --- cf/data/array/mixin/activestoragemixin.py | 25 ++++++++--------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/cf/data/array/mixin/activestoragemixin.py b/cf/data/array/mixin/activestoragemixin.py index 1ccacbab6b..81cf90c496 100644 --- a/cf/data/array/mixin/activestoragemixin.py +++ b/cf/data/array/mixin/activestoragemixin.py @@ -34,28 +34,19 @@ def __getitem__(self, indices): """ method = self.get_active_method() if method is None: - # Normal read by local client. Returns a numpy array. + # Do a normal read by local client. Returns an un-reduced + # numpy array. return super().__getitem__(indices) - # Active storage reduction. Returns a dictionary. - try: - missing_values = self.get_missing_values() - except AttributeError: - missing_values = {} - else: - if missing_values is None: - missing_values = {} - - try: - storage_options = self.get_storage_options() - except AttributeError: - storage_options = {} - + # Still here? Then do an active storage reduction. Returns a + # dictionary of reduced values. 
active = Active( self.get_filename(), self.get_address(), - # storage_options=storage_options, - **missing_values, + # dtype=self.dtype, + # missing_values=self.get_missing_values(None), + # storage_options=self.get_storage_options(), + # active_storage_url=None, ) active.method = method active.components = True From 3b8ae98038e884898cb492a86998534fdc4d0f7c Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 9 Feb 2024 09:54:00 +0000 Subject: [PATCH 043/134] dev --- cf/constants.py | 3 +- cf/data/array/mixin/activestoragemixin.py | 56 ++++++++--- cf/data/collapse/collapse.py | 3 + cf/data/collapse/collapse_active.py | 13 ++- cf/functions.py | 110 +++++++++++++++++++--- cf/test/test_active_storage.py | 7 +- cf/test/test_functions.py | 3 +- docs/source/function.rst | 11 +++ 8 files changed, 171 insertions(+), 35 deletions(-) diff --git a/cf/constants.py b/cf/constants.py index 813a158796..4214786c2e 100644 --- a/cf/constants.py +++ b/cf/constants.py @@ -63,7 +63,8 @@ "LOG_LEVEL": logging.getLevelName(logging.getLogger().level), "BOUNDS_COMBINATION_MODE": "AND", "CHUNKSIZE": parse_bytes(_CHUNKSIZE), - "ACTIVE_STORAGE": False, + "active_storage": False, + "active_storage_url": "", } masked = np.ma.masked diff --git a/cf/data/array/mixin/activestoragemixin.py b/cf/data/array/mixin/activestoragemixin.py index 81cf90c496..11a9edebb5 100644 --- a/cf/data/array/mixin/activestoragemixin.py +++ b/cf/data/array/mixin/activestoragemixin.py @@ -33,7 +33,7 @@ def __getitem__(self, indices): """ method = self.get_active_method() - if method is None: + if method is None or Active is None: # Do a normal read by local client. Returns an un-reduced # numpy array. return super().__getitem__(indices) @@ -43,10 +43,8 @@ def __getitem__(self, indices): active = Active( self.get_filename(), self.get_address(), - # dtype=self.dtype, - # missing_values=self.get_missing_values(None), # storage_options=self.get_storage_options(), - # active_storage_url=None, + # active_storage_url=self.get_active_storage_url(), ) active.method = method active.components = True @@ -59,7 +57,7 @@ def __getitem__(self, indices): return active[indices] - def actify(self, method, axis=None): + def actify(self, method, axis=None, active_storage_url=""): """Return a new actified `{{class}}` instance. The new instance is a deep copy of the original, with the @@ -81,6 +79,10 @@ def actify(self, method, axis=None): Axis or axes along which to operate. By default, or if `None`, flattened input is used. + active_storage_url: `str`, optional + Axis or axes along which to operate. By default, or if + `None`, flattened input is used. + :Returns: `{{class}}` @@ -88,17 +90,10 @@ def actify(self, method, axis=None): storage operation. """ - if Active is None: - # The active storage import dependency is not met, so - # using active storage is not possible. - raise AttributeError( - f"Can't actify {self.__class__.__name__} when " - "activestorage.Active is not available" - ) - a = self.copy() a.set_active_method(method) a.set_active_axis(axis) + a.set_active_storage_url(active_storage_url) return a def get_active_axis(self): @@ -133,6 +128,22 @@ def get_active_method(self): """ return self._custom.get("active_method") + def get_active_storage_url(self): + """Return the the active storage URL. + + .. versionadded:: ACTIVEVERSION + + .. seealso:: `set_active_storage_url` + + :Returns: + + `str` + The active storage URL. An empty string specifies no + URL. 
+ + """ + self._custom.get("active_storage_url", "") + def set_active_axis(self, value): """Set the active storage reduction axes. @@ -171,3 +182,22 @@ def set_active_method(self, value): """ self._custom["active_method"] = value + + def set_active_storage_url(self, value): + """Set the active storage URL. + + .. versionadded:: ACTIVEVERSION + + .. seealso:: `get_active_storage_url` + + :Parameters: + + value: `str` + The active storage URL. + + :Returns: + + `None` + + """ + self._custom["active_storage_url"] = value diff --git a/cf/data/collapse/collapse.py b/cf/data/collapse/collapse.py index fa1974b603..1ba168acfd 100644 --- a/cf/data/collapse/collapse.py +++ b/cf/data/collapse/collapse.py @@ -20,6 +20,9 @@ class Collapse(metaclass=DocstringRewriteMeta): * `cf.active_storage()` is True; + * An active storage URL has been set with + `cf.active_storage_url`; + * it is possible to import the `activestorage.Active` class; * the collapse is over all axes; diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index aead9dd2ab..077e15e125 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -7,6 +7,7 @@ Active = None from ...functions import active_storage as cf_active_storage +from ...functions import active_storage_url logger = logging.getLogger(__name__) @@ -249,6 +250,7 @@ def actify(a, method, axis=None): # The elements are traversed in reverse order so that the data # defintions come out first, allowing for the potential of a # faster short circuit when using active storage is not possible. + url = str(active_storage_url()) ok_to_actify = True dsk = collections_to_dsk((a,), optimize_graph=True) for key, value in reversed(dsk.items()): @@ -268,7 +270,7 @@ def actify(a, method, axis=None): # to files, so try to insert an actified copy into the dask # graph. try: - dsk[key] = value.actify(method, axis) + dsk[key] = value.actify(method, axis, active_storage_url=url) except AttributeError: # This data definition doesn't have an 'actify' method, # and so doesn't support active storage reductions. @@ -284,7 +286,10 @@ def actify(a, method, axis=None): # active storage reductions => redefine the dask array from the # actified dask graph, and set the active storage reduction chunk # function. 
- logger.warning("Using activestorage.Active to collapse chunks") + logger.warning( + "At compute time chunks will be collapsed with " + f"active storage URL: {active_storage_url()}" + ) return ( da.Array(dsk, a.name, a.chunks, a.dtype, a._meta), active_chunk_functions[method], @@ -320,6 +325,7 @@ def wrapper(self, *args, **kwargs): and kwargs.get("weights") is None and kwargs.get("chunk_function") is None and cf_active_storage() + and active_storage_url() ): # Attempt to actify the dask array and provide a new # chunk function @@ -347,6 +353,3 @@ def wrapper(self, *args, **kwargs): return wrapper return decorator - - -2 diff --git a/cf/functions.py b/cf/functions.py index 901103dcd2..29e68f7d16 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -173,6 +173,7 @@ def configuration( relaxed_identities=None, bounds_combination_mode=None, active_storage=None, + active_storage_url=None, of_fraction=None, collapse_parallel_mode=None, free_memory_factor=None, @@ -191,6 +192,8 @@ def configuration( * `regrid_logging` * `relaxed_identities` * `bounds_combination_mode` + * `active_storage` + * `active_storage_url` These are all constants that apply throughout cf, except for in specific functions only if overridden by the corresponding keyword @@ -268,6 +271,13 @@ def configuration( .. versionadded:: ACTIVEVERSION + active_storage_url: `str` or `Constant`, optional + The new value TODOACTIVE (either True to enable active + storage reductions or False to disable them). The default + is to not change the current behaviour. + + .. versionadded:: ACTIVEVERSION + of_fraction: `float` or `Constant`, optional Deprecated at version 3.14.0 and is no longer available. @@ -297,7 +307,9 @@ def configuration( 'relaxed_identities': False, 'log_level': 'WARNING', 'bounds_combination_mode': 'AND', - 'chunksize': 82873466.88000001} + 'chunksize': 82873466.88000001, + 'active_storage': False, + 'active_storage_url': ''} >>> cf.chunksize(7.5e7) # any change to one constant... 82873466.88000001 >>> cf.configuration()['chunksize'] # ...is reflected in the configuration @@ -311,7 +323,9 @@ def configuration( 'relaxed_identities': False, 'log_level': 'WARNING', 'bounds_combination_mode': 'AND', - 'chunksize': 75000000.0} + 'chunksize': 75000000.0, + 'active_storage': False, + 'active_storage_url': ''} >>> cf.configuration() # the items set have been updated accordingly {'rtol': 2.220446049250313e-16, 'atol': 2.220446049250313e-16, @@ -320,7 +334,9 @@ def configuration( 'relaxed_identities': False, 'log_level': 'INFO', 'bounds_combination_mode': 'AND', - 'chunksize': 75000000.0} + 'chunksize': 75000000.0, + 'active_storage': False, + 'active_storage_url': ''} Use as a context manager: @@ -332,7 +348,9 @@ def configuration( 'relaxed_identities': False, 'log_level': 'INFO', 'bounds_combination_mode': 'AND', - 'chunksize': 75000000.0} + 'chunksize': 75000000.0, + 'active_storage': False, + 'active_storage_url': ''} >>> with cf.configuration(atol=9, rtol=10): ... print(cf.configuration()) ... 
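
The two constants being threaded through `cf.configuration` above are what the `active_storage` decorator checks, via `cf_active_storage()` and `active_storage_url()`, before attempting to actify a collapse. A hedged usage sketch follows, with a placeholder URL standing in for a real active storage reducer:

    import cf

    f = cf.read("file.nc")[0]

    # Sketch only: with both constants set, an unweighted collapse over
    # all axes may be pushed to the active storage server, subject to
    # the conditions documented in `cf.data.collapse.Collapse`. If any
    # condition fails, the usual local dask reduction runs instead.
    with cf.configuration(
        active_storage=True,
        active_storage_url="https://active.example.org",
    ):
        result = f.collapse("mean", weights=False)
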
@@ -343,7 +361,9 @@ def configuration( 'relaxed_identities': False, 'log_level': 'INFO', 'bounds_combination_mode': 'AND', - 'chunksize': 75000000.0} + 'chunksize': 75000000.0, + 'active_storage': False, + 'active_storage_url': ''} >>> print(cf.configuration()) {'rtol': 2.220446049250313e-16, 'atol': 2.220446049250313e-16, @@ -352,7 +372,9 @@ def configuration( 'relaxed_identities': False, 'log_level': 'INFO', 'bounds_combination_mode': 'AND', - 'chunksize': 75000000.0} + 'chunksize': 75000000.0, + 'active_storage': False, + 'active_storage_url': ''} """ if of_fraction is not None: @@ -384,6 +406,7 @@ def configuration( new_relaxed_identities=relaxed_identities, bounds_combination_mode=bounds_combination_mode, active_storage=active_storage, + active_storage_url=active_storage_url, ) @@ -434,6 +457,7 @@ def _configuration(_Configuration, **kwargs): "new_relaxed_identities": relaxed_identities, "bounds_combination_mode": bounds_combination_mode, "active_storage": active_storage, + "active_storage_url": active_storage_url, } old_values = {} @@ -1159,7 +1183,7 @@ def _parse(cls, arg): class active_storage(ConstantAccess): - """Whether or not to allow active storage reductions. + """Whether or not to attempt active storage reductions. .. versionadded:: ACTIVEVERSION @@ -1181,20 +1205,20 @@ class active_storage(ConstantAccess): >>> cf.active_storage() False - >>> cf.active_storage(True) - False - >>> cf.active_storage() - True - >>> with cf.active_storage(False): + >>> with cf.active_storage(True): ... print(cf.active_storage()) ... + True + >>> cf.active_storage() + False + >>> cf.active_storage(True) False >>> cf.active_storage() True """ - _name = "ACTIVE_STORAGE" + _name = "active_storage" def _parse(cls, arg): """Parse a new constant value. @@ -1218,6 +1242,66 @@ def _parse(cls, arg): return bool(arg) +class active_storage_url(ConstantAccess): + """The URL location of the active storage reducer. + + .. versionadded:: ACTIVEVERSION + + .. seealso:: `configuration` + + :Parameters: + + arg: `str` or `Constant`, optional + Provide a value that will apply to all subsequent + operations. + + :Returns: + + `Constant` + The value prior to the change, or the current value if no + new value was specified. + + **Examples** + + >>> cf.active_storage_url() + '' + >>> with cf.active_storage_url('http://active/storage/location'): + ... print(cf.active_storage_url()) + ... + 'http://active/storage/location' + >>> cf.active_storage_url() + '' + >>> cf.active_storage_url('http://other/location') + '' + >>> cf.active_storage_url() + 'http://other/location' + + """ + + _name = "active_storage_url" + + def _parse(cls, arg): + """Parse a new constant value. + + .. versionaddedd:: ACTIVEVERSION + + :Parameters: + + cls: + This class. + + arg: + The given new constant value. + + :Returns: + + A version of the new constant value suitable for + insertion into the `CONSTANTS` dictionary. + + """ + return str(arg) + + def CF(): """The version of the CF conventions. 
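
Before the test changes below, it may help to sketch what happens at compute time once a netCDF array has been actified by this commit's `ActiveStorageMixin` changes. The calls mirror the `__getitem__` hunk above; the file name, variable name and method are invented for illustration:

    from activestorage import Active  # optional dependency

    # Sketch of the actified read path: one netCDF variable in one
    # file, reduced remotely rather than read back as a full array.
    active = Active("/data/file.nc", "tas")
    active.method = "max"     # as recorded by actify()
    active.components = True

    # Returns a dictionary of reduced values, as noted in the
    # __getitem__ comments, rather than an un-reduced numpy array.
    result = active[...]
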
diff --git a/cf/test/test_active_storage.py b/cf/test/test_active_storage.py index fb058f287e..ac26793b70 100644 --- a/cf/test/test_active_storage.py +++ b/cf/test/test_active_storage.py @@ -49,8 +49,9 @@ def test_active_storage(self): self.assertFalse(cf.active_storage()) array = f.collapse("mean", weights=False).array - with cf.active_storage(True): + with cf.configuration(active_storage=True, active_storage_url="dummy"): self.assertTrue(cf.active_storage()) + self.assertTrue(cf.active_storage_url()) self.assertTrue(f.data.active_storage) active_array = f.collapse("mean", weights=False).array @@ -64,8 +65,10 @@ def test_active_storage(self): # f = f[0] # # array = f.collapse("mean", weights=False).array - # with cf.active_storage(True): + # + # with cf.active_storage(True, active_storage_url="dummy"): # self.assertTrue(cf.active_storage()) + # self.assertTrue(cf.active_storage_url()) # self.assertTrue(f.data.active_storage) # active_array = f.collapse("mean").array # diff --git a/cf/test/test_functions.py b/cf/test/test_functions.py index 917e9cb4dd..791fe3305e 100644 --- a/cf/test/test_functions.py +++ b/cf/test/test_functions.py @@ -54,7 +54,7 @@ def test_configuration(self): self.assertIsInstance(org, dict) # Check all keys that should be there are, with correct value type: - self.assertEqual(len(org), 9) # update expected len if add new key(s) + self.assertEqual(len(org), 10) # update expected len if add new key(s) # Types expected: self.assertIsInstance(org["atol"], float) @@ -85,6 +85,7 @@ def test_configuration(self): "log_level": "INFO", "chunksize": 8e9, "active_storage": True, + "active_storage_url": "", } # Test the setting of each lone item. diff --git a/docs/source/function.rst b/docs/source/function.rst index df6a1bd5ad..107011c9b5 100644 --- a/docs/source/function.rst +++ b/docs/source/function.rst @@ -145,6 +145,17 @@ Resource management cf.TEMPDIR cf.TOTAL_MEMORY +Active storage reductions +------------------------- + +.. autosummary:: + :nosignatures: + :toctree: function/ + :template: function.rst + + cf.active_storage + cf.active_storage_url + Miscellaneous ------------- From 1f90a487ecae26b64a1ddb8acf60e84c0b319859 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 12 Feb 2024 17:49:41 +0000 Subject: [PATCH 044/134] dev --- cf/constants.py | 2 +- cf/data/array/mixin/activestoragemixin.py | 20 ++++++-- cf/data/collapse/collapse.py | 6 +-- cf/data/fragment/mixin/fragmentarraymixin.py | 4 +- cf/field.py | 40 ++++++++++++++++ cf/functions.py | 50 +++++++++++--------- cf/read_write/read.py | 36 +++++++------- cf/test/test_functions.py | 2 +- release_docs | 2 +- 9 files changed, 109 insertions(+), 53 deletions(-) diff --git a/cf/constants.py b/cf/constants.py index 4214786c2e..0b8e12ecfd 100644 --- a/cf/constants.py +++ b/cf/constants.py @@ -64,7 +64,7 @@ "BOUNDS_COMBINATION_MODE": "AND", "CHUNKSIZE": parse_bytes(_CHUNKSIZE), "active_storage": False, - "active_storage_url": "", + "active_storage_url": None, } masked = np.ma.masked diff --git a/cf/data/array/mixin/activestoragemixin.py b/cf/data/array/mixin/activestoragemixin.py index 11a9edebb5..d9d1b21c12 100644 --- a/cf/data/array/mixin/activestoragemixin.py +++ b/cf/data/array/mixin/activestoragemixin.py @@ -57,7 +57,7 @@ def __getitem__(self, indices): return active[indices] - def actify(self, method, axis=None, active_storage_url=""): + def actify(self, method, axis=None, active_storage_url=None): """Return a new actified `{{class}}` instance. 
The new instance is a deep copy of the original, with the @@ -79,9 +79,8 @@ def actify(self, method, axis=None, active_storage_url=""): Axis or axes along which to operate. By default, or if `None`, flattened input is used. - active_storage_url: `str`, optional - Axis or axes along which to operate. By default, or if - `None`, flattened input is used. + active_storage_url: `str` or `None`, optional + The URL of the active storage server. :Returns: @@ -90,6 +89,19 @@ def actify(self, method, axis=None, active_storage_url=""): storage operation. """ + if Active is None: + raise AttributeError( + "Can't actify {self.__class__.__name__} when " + "activestorage.Active is not available" + ) + + attributes = self.get_attributes({}) + if "add_offset" in attributes or "scale_factor" in attributes: + raise AttributeError( + "Can't actify {self.__class__.__name__} when " + "the data has been numerically packed" + ) + a = self.copy() a.set_active_method(method) a.set_active_axis(axis) diff --git a/cf/data/collapse/collapse.py b/cf/data/collapse/collapse.py index 1ba168acfd..71798bd159 100644 --- a/cf/data/collapse/collapse.py +++ b/cf/data/collapse/collapse.py @@ -20,7 +20,7 @@ class Collapse(metaclass=DocstringRewriteMeta): * `cf.active_storage()` is True; - * An active storage URL has been set with + * an active storage URL has been set with `cf.active_storage_url`; * it is possible to import the `activestorage.Active` class; @@ -29,8 +29,8 @@ class Collapse(metaclass=DocstringRewriteMeta): * the collapse is unweighted; - * the data is not compressed by convention (note that netCDF - numeric packing is not considered here to be a compression by + * the data is not compressed by convention (netCDF numeric + packing is not considered here to be a compression by convention techinigue); * the `Collapse` method's *active_storage* parameter is True; diff --git a/cf/data/fragment/mixin/fragmentarraymixin.py b/cf/data/fragment/mixin/fragmentarraymixin.py index d523d88d70..48f9409bfd 100644 --- a/cf/data/fragment/mixin/fragmentarraymixin.py +++ b/cf/data/fragment/mixin/fragmentarraymixin.py @@ -32,8 +32,8 @@ def __getitem__(self, indices): .. versionadded:: 3.15.0 """ - # TODOACTIVE: modify this for the case when - # super().__getitem__(tuple(indices)) returns a + # TODOACTIVE: modify this for the active storage case of + # super().__getitem__(tuple(indices)) returning a # dictionary indices = self._parse_indices(indices) diff --git a/cf/field.py b/cf/field.py index 30fbc40e2b..bda58d2d65 100644 --- a/cf/field.py +++ b/cf/field.py @@ -5314,6 +5314,46 @@ def collapse( ... within_years=cf.seasons(), weights=True) + **Active storage collapses** + + When the data being collapsed are stored remotely, the + collapse calculations may be carried out on a server that is + close (in a network distance sense) to the data, thereby + removing the time and power costs of transfering the entire + un-collapsed data to the local client. Whether or not this + will occur is determined on a case-by-case basis, and will + only be done if all of the following criteria are met: + + * the collapse method is registered as having an active + storage counterpart, i.e. 
one of ``'mean'``, ``'maximum'``, + ``'minimum'``, or ``'sum'``; + + * the collapse is over all axes; + + * the collapse is unweighted; + + * `cf.active_storage()` is True; + + * a URL of the active storage server has been set with + `cf.active_storage_url`; + + * the data values are in netCDF files on disk (rather than in + any other file format, or in memory); + + * the `!active_storage` attribute of the `Data` being + collapsed is `True`. In general, it will only be `True` for + data that is not compressed by convention (including + numerical packing), and has not had any other operations + applied to it; + + * it is possible to import the `activestorage.Active` class. + + The performance improvements from using active storage + operations will increase the closer the active storage server + is to the data storage. If the active storage server is + sufficiently far away from the data then it may be faster to + do a normal, non-active operation. + .. versionadded:: 1.0 .. seealso:: `bin`, `cell_area`, `convolution_filter`, diff --git a/cf/functions.py b/cf/functions.py index 29e68f7d16..70c5dd347d 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -6,7 +6,6 @@ import platform import re import sys -import urllib.parse import warnings from collections.abc import Iterable from itertools import product @@ -19,7 +18,7 @@ from os.path import expandvars as _os_path_expandvars from os.path import join as _os_path_join from os.path import relpath as _os_path_relpath -from urllib.parse import urlparse +from urllib.parse import urljoin, urlparse import cfdm import netCDF4 @@ -271,10 +270,9 @@ def configuration( .. versionadded:: ACTIVEVERSION - active_storage_url: `str` or `Constant`, optional - The new value TODOACTIVE (either True to enable active - storage reductions or False to disable them). The default - is to not change the current behaviour. + active_storage_url: `str` or `None` or `Constant`, optional + The new value (either a new URL string or `None` to remove + the URL). The default is to not change the value. .. versionadded:: ACTIVEVERSION @@ -309,7 +307,7 @@ def configuration( 'bounds_combination_mode': 'AND', 'chunksize': 82873466.88000001, 'active_storage': False, - 'active_storage_url': ''} + 'active_storage_url': None} >>> cf.chunksize(7.5e7) # any change to one constant... 82873466.88000001 >>> cf.configuration()['chunksize'] # ...is reflected in the configuration @@ -325,7 +323,7 @@ def configuration( 'bounds_combination_mode': 'AND', 'chunksize': 75000000.0, 'active_storage': False, - 'active_storage_url': ''} + 'active_storage_url': None} >>> cf.configuration() # the items set have been updated accordingly {'rtol': 2.220446049250313e-16, 'atol': 2.220446049250313e-16, @@ -336,7 +334,7 @@ def configuration( 'bounds_combination_mode': 'AND', 'chunksize': 75000000.0, 'active_storage': False, - 'active_storage_url': ''} + 'active_storage_url': None} Use as a context manager: @@ -350,7 +348,7 @@ def configuration( 'bounds_combination_mode': 'AND', 'chunksize': 75000000.0, 'active_storage': False, - 'active_storage_url': ''} + 'active_storage_url': None} >>> with cf.configuration(atol=9, rtol=10): ... print(cf.configuration()) ... 
@@ -363,7 +361,7 @@ def configuration( 'bounds_combination_mode': 'AND', 'chunksize': 75000000.0, 'active_storage': False, - 'active_storage_url': ''} + 'active_storage_url': None} >>> print(cf.configuration()) {'rtol': 2.220446049250313e-16, 'atol': 2.220446049250313e-16, @@ -374,7 +372,7 @@ def configuration( 'bounds_combination_mode': 'AND', 'chunksize': 75000000.0, 'active_storage': False, - 'active_storage_url': ''} + 'active_storage_url': None} """ if of_fraction is not None: @@ -1251,7 +1249,7 @@ class active_storage_url(ConstantAccess): :Parameters: - arg: `str` or `Constant`, optional + arg: `str` or `None` or `Constant`, optional Provide a value that will apply to all subsequent operations. @@ -1263,16 +1261,16 @@ class active_storage_url(ConstantAccess): **Examples** - >>> cf.active_storage_url() - '' + >>> print(cf.active_storage_url()) + None >>> with cf.active_storage_url('http://active/storage/location'): ... print(cf.active_storage_url()) ... 'http://active/storage/location' - >>> cf.active_storage_url() - '' - >>> cf.active_storage_url('http://other/location') - '' + >>> print(cf.active_storage_url()) + None + >>> print(cf.active_storage_url('http://other/location')) + None >>> cf.active_storage_url() 'http://other/location' @@ -1299,6 +1297,9 @@ def _parse(cls, arg): insertion into the `CONSTANTS` dictionary. """ + if arg is None: + return arg + return str(arg) @@ -2720,7 +2721,7 @@ def relpath(filename, start=None): 'http://data/archive/file.nc' """ - u = urllib.parse.urlparse(filename) + u = urlparse(filename) if u.scheme != "": return filename @@ -2758,7 +2759,7 @@ def dirname(filename): 'http://data/archive' """ - u = urllib.parse.urlparse(filename) + u = urlparse(filename) if u.scheme != "": return filename.rpartition("/")[0] @@ -2797,9 +2798,9 @@ def pathjoin(path1, path2): 'http://data/archive/file.nc' """ - u = urllib.parse.urlparse(path1) + u = urlparse(path1) if u.scheme != "": - return urllib.parse.urljoin(path1, path2) + return urljoin(path1, path2) return _os_path_join(path1, path2) @@ -3246,6 +3247,9 @@ def environment(display=True, paths=True): "dask": _get_module_info("dask"), # Then Python libraries not related to CF "netCDF4": _get_module_info("netCDF4"), + "h5netcdf": _get_module_info("h5netcdf"), + "h5py": _get_module_info("h5py"), + "s3fs": _get_module_info("s3fs"), "psutil": _get_module_info("psutil"), "packaging": _get_module_info("packaging"), "numpy": _get_module_info("numpy"), diff --git a/cf/read_write/read.py b/cf/read_write/read.py index 9f3c2ceeb3..eeaa42f5cb 100644 --- a/cf/read_write/read.py +++ b/cf/read_write/read.py @@ -63,8 +63,8 @@ def read( chunks="auto", domain=False, cfa=None, - storage_options=None, netCDF_backend=None, + storage_options=None, ): """Read field or domain constructs from files. @@ -676,6 +676,23 @@ def read( .. versionadded:: 3.15.0 + netCDF_backend: `str` or `None`, optional + Specify which library to use for opening input files. By + default, or if `None`, then `netCDF4` will used unless it + fails to open a given file, in which case `h5netcdf` will + be used. Setting *library* to ``'netCDF4'`` or + ``'h5netcdf'`` will force the use of the `netCDF4` or + `h5netcdf` libraries respectively. + + .. note:: The *netCDF_backend* parameter does not affect + the opening of netCDF fragment files that define + the data of aggregated variables. For these, + `netCDF4` is used for local files and those + accessed via OPenDAP, and `h5netcdf` is used for + fragement files in S3 object stores. + + .. 
versionadded:: ACTIVEVERSION + storage_options: `dict` or `None`, optional Key/value pairs to be passed on to the creation of `s3fs.S3FileSystem` file systems to control the opening of @@ -705,23 +722,6 @@ def read( .. versionadded:: ACTIVEVERSION - netCDF_backend: `str` or `None`, optional - Specify which library to use for opening input files. By - default, or if `None`, then `netCDF4` will used unless it - fails to open a given file, in which case `h5netcdf` will - be used. Setting *library* to ``'netCDF4'`` or - ``'h5netcdf'`` will force the use of the `netCDF4` or - `h5netcdf` libraries respectively. - - .. note:: The *netCDF_backend* parameter does not affect - the opening of netCDF fragment files that define - the data of aggregated variables. For these, - `netCDF4` is used for local files and those - accessed via OPenDAP, and `h5netcdf` is used for - fragement files in S3 object stores. - - .. versionadded:: ACTIVEVERSION - umversion: deprecated at version 3.0.0 Use the *um* parameter instead. diff --git a/cf/test/test_functions.py b/cf/test/test_functions.py index 791fe3305e..9c015f43ba 100644 --- a/cf/test/test_functions.py +++ b/cf/test/test_functions.py @@ -85,7 +85,7 @@ def test_configuration(self): "log_level": "INFO", "chunksize": 8e9, "active_storage": True, - "active_storage_url": "", + "active_storage_url": None, } # Test the setting of each lone item. diff --git a/release_docs b/release_docs index bd9643b686..eb9c7ace24 100755 --- a/release_docs +++ b/release_docs @@ -12,7 +12,7 @@ version=`python -c "import cf; print(cf.__version__)"` sphinx_version=`python -c "import sphinx; print(sphinx.__version__)"` if [[ $sphinx_version != "2.4.5" ]] ; then - echo "ERROR: Must use sphinx version 2.4.5. Got $sphinx_version" + echo "ERROR: Must (sadly) use sphinx version 2.4.5. Got $sphinx_version" exit 3 fi From 866ccca74c9d89fefb36f6c1ff02226a16e7eeef Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 13 Feb 2024 12:09:37 +0000 Subject: [PATCH 045/134] dev --- cf/data/array/mixin/activestoragemixin.py | 13 ++++++++----- cf/test/test_active_storage.py | 20 +++----------------- 2 files changed, 11 insertions(+), 22 deletions(-) diff --git a/cf/data/array/mixin/activestoragemixin.py b/cf/data/array/mixin/activestoragemixin.py index d9d1b21c12..a8eeb765e7 100644 --- a/cf/data/array/mixin/activestoragemixin.py +++ b/cf/data/array/mixin/activestoragemixin.py @@ -89,17 +89,20 @@ def actify(self, method, axis=None, active_storage_url=None): storage operation. """ - if Active is None: + # Don't actify when the data are packed. Note: There may come + # a time when activestorage.Active can cope with packed data, + # in which cas we can remove this test. 
+ attributes = self.get_attributes({}) + if "add_offset" in attributes or "scale_factor" in attributes: raise AttributeError( "Can't actify {self.__class__.__name__} when " - "activestorage.Active is not available" + "the data has been numerically packed" ) - attributes = self.get_attributes({}) - if "add_offset" in attributes or "scale_factor" in attributes: + if Active is None: raise AttributeError( "Can't actify {self.__class__.__name__} when " - "the data has been numerically packed" + "activestorage.Active is not available" ) a = self.copy() diff --git a/cf/test/test_active_storage.py b/cf/test/test_active_storage.py index ac26793b70..34afc8cee1 100644 --- a/cf/test/test_active_storage.py +++ b/cf/test/test_active_storage.py @@ -51,28 +51,14 @@ def test_active_storage(self): with cf.configuration(active_storage=True, active_storage_url="dummy"): self.assertTrue(cf.active_storage()) - self.assertTrue(cf.active_storage_url()) + self.assertEqual(cf.active_storage_url(), "dummy") self.assertTrue(f.data.active_storage) active_array = f.collapse("mean", weights=False).array self.assertEqual(array, active_array) - # Masked values (not yet working) - # self.assertFalse(cf.active_storage()) - # f[0] = cf.masked - # cf.write(f, tmpfile2) - # f = cf.read(tmpfile2, chunks={"latitude": (4, 1), "longitude": (3, 5)}) - # f = f[0] - # - # array = f.collapse("mean", weights=False).array - # - # with cf.active_storage(True, active_storage_url="dummy"): - # self.assertTrue(cf.active_storage()) - # self.assertTrue(cf.active_storage_url()) - # self.assertTrue(f.data.active_storage) - # active_array = f.collapse("mean").array - # - # self.assertEqual(array, active_array) + # TODOACTIVE: Test with masked values (not yet working in + # activestorage.Active) if __name__ == "__main__": From baee889a36182fb63faae6e258c0f9e181317600 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 13 Feb 2024 18:35:59 +0000 Subject: [PATCH 046/134] dev --- Changelog.rst | 10 + cf/__init__.py | 12 +- cf/cellmethod.py | 2 +- cf/data/array/__init__.py | 1 + cf/data/array/cfanetcdfarray.py | 39 ++-- cf/data/array/h5netcdfarray.py | 11 +- cf/data/array/mixin/activestoragemixin.py | 18 +- cf/data/array/netcdfarray.py | 52 ++--- cf/data/collapse/collapse.py | 30 +-- cf/data/collapse/collapse_active.py | 12 +- cf/data/data.py | 6 +- cf/data/fragment/h5netcdffragmentarray.py | 4 +- cf/data/fragment/mixin/fragmentarraymixin.py | 7 +- cf/data/fragment/netcdf4fragmentarray.py | 4 +- cf/data/fragment/netcdffragmentarray.py | 7 +- cf/domain.py | 2 +- cf/field.py | 41 ++-- cf/functions.py | 21 +- cf/mixin/fielddomain.py | 2 +- cf/read_write/netcdf/netcdfread.py | 5 +- cf/read_write/netcdf/netcdfwrite.py | 72 ++----- cf/read_write/read.py | 12 +- cf/read_write/write.py | 15 +- cf/regrid/regrid.py | 2 +- cf/regrid/regridoperator.py | 6 +- docs/source/class.rst | 54 ++++-- docs/source/class/cf.H5netcdfArray.rst | 193 +++++++++++++++++++ docs/source/class/cf.NetCDF4Array.rst | 70 +++++++ docs/source/class/cf.NetCDFArray.rst | 70 ------- docs/source/conf.py | 1 + docs/source/field_analysis.rst | 56 +++++- docs/source/installation.rst | 9 +- docs/source/introduction.rst | 7 +- docs/source/tutorial.rst | 8 +- requirements.txt | 4 + 35 files changed, 550 insertions(+), 315 deletions(-) create mode 100644 docs/source/class/cf.H5netcdfArray.rst create mode 100644 docs/source/class/cf.NetCDF4Array.rst delete mode 100644 docs/source/class/cf.NetCDFArray.rst diff --git a/Changelog.rst b/Changelog.rst index 49a1a60820..3ecb4d53c1 100644 --- 
a/Changelog.rst +++ b/Changelog.rst @@ -3,12 +3,20 @@ version 3.17.0 **2024-??-??** +* Allow access to netCDF-4 files in S3 object stores + (https://github.com/NCAS-CMS/cf-python/issues/712) +* New class `cfdm.H5netcdfArray` * Fix bug that caused `cf.Field.del_file_location` to fail when updating its metdata constructs (https://github.com/NCAS-CMS/cf-python/issues/707) +* New dependency: ``h5netcdf>=1.3.0`` +* New dependency: ``h5py>=3.10.0`` +* New dependency: ``s3fs>=2024.2.0`` * Changed dependency: ``1.11.1.0<=cfdm<1.11.2.0`` * Changed dependency: ``cfunits>=3.3.7`` +---- + version 3.16.0 -------------- @@ -33,6 +41,8 @@ version 3.16.0 * Changed dependency: ``1.11.0.0<=cfdm<1.11.1.0`` * New dependency: ``scipy>=1.10.0`` +---- + version 3.15.4 -------------- diff --git a/cf/__init__.py b/cf/__init__.py index c750d697be..793e62b6d2 100644 --- a/cf/__init__.py +++ b/cf/__init__.py @@ -13,10 +13,17 @@ * read field constructs from netCDF, CDL, PP and UM datasets, +* read field constructs and domain constructs from netCDF, CDL, PP and + UM datasets with a choice of netCDF backends, + +* read files from OPeNDAP servers and S3 object stores, + * create new field constructs in memory, * write and append field constructs to netCDF datasets on disk, +* read, write, and manipulate UGRID mesh topologies, + * read, write, and create coordinates defined by geometry cells, * read netCDF and CDL datasets containing hierarchical groups, @@ -74,8 +81,8 @@ """ __Conventions__ = "CF-1.11" -__date__ = "2023-12-06" -__version__ = "3.16.0" +__date__ = "2024-??-??" +__version__ = "3.17.0" _requires = ( "numpy", @@ -278,6 +285,7 @@ FullArray, GatheredArray, H5netcdfArray, + NetCDFArray, NetCDF4Array, PointTopologyArray, RaggedContiguousArray, diff --git a/cf/cellmethod.py b/cf/cellmethod.py index 67ebf72cbd..a1a2f3be15 100644 --- a/cf/cellmethod.py +++ b/cf/cellmethod.py @@ -56,7 +56,7 @@ class CellMethod(cfdm.CellMethod): def __new__(cls, *args, **kwargs): """This must be overridden in subclasses. - .. versionadded:: (cfdm) 3.7.0 + .. versionadded:: 3.7.0 """ instance = super().__new__(cls) diff --git a/cf/data/array/__init__.py b/cf/data/array/__init__.py index 5006b8a39e..0b16361f53 100644 --- a/cf/data/array/__init__.py +++ b/cf/data/array/__init__.py @@ -4,6 +4,7 @@ from .fullarray import FullArray from .gatheredarray import GatheredArray from .h5netcdfarray import H5netcdfArray +from .netcdfarray import NetCDFArray from .netcdf4array import NetCDF4Array from .pointtopologyarray import PointTopologyArray from .raggedcontiguousarray import RaggedContiguousArray diff --git a/cf/data/array/cfanetcdfarray.py b/cf/data/array/cfanetcdfarray.py index f44d1ef9c2..868bb1b12d 100644 --- a/cf/data/array/cfanetcdfarray.py +++ b/cf/data/array/cfanetcdfarray.py @@ -6,8 +6,6 @@ from ..fragment import FullFragmentArray, NetCDFFragmentArray, UMFragmentArray from ..utils import chunk_locations, chunk_positions - -# from .mixin import CFAMixin from .netcdf4array import NetCDF4Array # Store fragment array classes. @@ -131,7 +129,7 @@ def __init__( 'https://s3.fr-par.scw.cloud', 'client_kwargs': {'region_name': 'fr-par'}}`` - .. versionadded:: ACTIVEVERSION + .. 
versionadded:: 3.17.0 {{init source: optional}} @@ -219,9 +217,8 @@ def __init__( for frag_loc, location in zip(positions, locations): if extra_dimension: filename = compressed(f[frag_loc]).tolist() - n_files = len(filename) if scalar_address: - address = a * n_files + address = a * len(filename) else: address = compressed(a[frag_loc].tolist()) @@ -230,6 +227,7 @@ def __init__( else: fmt = compressed(file_fmt[frag_loc]).tolist() else: + print(f.shape, frag_loc, address) filename = (f[frag_loc].item(),) if scalar_address: address = a @@ -347,12 +345,14 @@ def get_aggregated_data(self, copy=True): >>> a.get_fragment_shape() (2, 1, 1, 1) >>> a.get_aggregated_data() - {(0, 0, 0, 0): {'file': 'January-June.nc', - 'address': 'temp', + {(0, 0, 0, 0): { + 'file': ('January-June.nc',), + 'address': ('temp',), 'format': 'nc', 'location': [(0, 6), (0, 1), (0, 73), (0, 144)]}, - (1, 0, 0, 0): {'file': 'July-December.nc', - 'address': 'temp', + (1, 0, 0, 0): { + 'file': ('July-December.nc',), + 'address': ('temp',), 'format': 'nc', 'location': [(6, 12), (0, 1), (0, 73), (0, 144)]}} @@ -415,7 +415,7 @@ def get_storage_options(self): ``'s3://store/data/file.nc'``, an ``'endpoint_url'`` key with value ``'https://store'`` would be created. - .. versionadded:: (cfdm) HDFVER + .. versionadded:: 3.17.0 :Returns: @@ -462,13 +462,18 @@ def get_term(self, default=ValueError()): def subarray_shapes(self, shapes): """Create the subarray shapes. + A fragmented dimenion (i.e. one spanned by two or fragments) + will always have a subarray size equal to the size of each of + its fragments, overriding any other size implied by the + *shapes* parameter. + .. versionadded:: 3.14.0 .. seealso:: `subarrays` :Parameters: - shapes: `int`, sequence, `dict` or `str`, optional + shapes: `int`, sequence, `dict` or `str`, optional Define the subarray shapes. Any value accepted by the *chunks* parameter of the @@ -509,7 +514,8 @@ def subarray_shapes(self, shapes): from dask.array.core import normalize_chunks - # Indices of fragmented dimensions + # Positions of fragmented dimensions (i.e. those spanned by + # two or more fragments) f_dims = self.get_fragmented_dimensions() shape = self.shape @@ -522,8 +528,9 @@ def subarray_shapes(self, shapes): zip(self.get_fragment_shape(), self.shape) ): if dim in f_dims: - # This aggregated dimension is spanned by more than - # one fragment. + # This aggregated dimension is spanned by two or more + # fragments => set the chunks to be the same size as + # the each fragment. c = [] index = [0] * ndim for j in range(n_fragments): @@ -535,8 +542,8 @@ def subarray_shapes(self, shapes): chunks.append(tuple(c)) else: # This aggregated dimension is spanned by exactly one - # fragment. Store None, for now, in the expectation - # that it will get overwrittten. + # fragment => store `None` for now. This will get + # overwritten from 'shapes'. chunks.append(None) if isinstance(shapes, (str, Number)) or shapes is None: diff --git a/cf/data/array/h5netcdfarray.py b/cf/data/array/h5netcdfarray.py index 8d43242063..62fb791b3e 100644 --- a/cf/data/array/h5netcdfarray.py +++ b/cf/data/array/h5netcdfarray.py @@ -14,14 +14,19 @@ class H5netcdfArray( ): """A netCDF array accessed with `h5netcdf`. - .. versionadded:: ACTIVEVERSION + **Active storage reductions** + + Active storage reduction may be enabled with the `actify` + method. See `cf.data.collapse.Collapse` for details. + + .. versionadded:: 3.17.0 """ def __dask_tokenize__(self): """Return a value fully representative of the object. - .. 
versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 """ return super().__dask_tokenize__() + (self.get_mask(),) @@ -37,7 +42,7 @@ def _lock(self): that access to all netCDF and HDF files coordinates around the same lock. - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 """ return _lock diff --git a/cf/data/array/mixin/activestoragemixin.py b/cf/data/array/mixin/activestoragemixin.py index a8eeb765e7..a83ac065d2 100644 --- a/cf/data/array/mixin/activestoragemixin.py +++ b/cf/data/array/mixin/activestoragemixin.py @@ -7,7 +7,7 @@ class ActiveStorageMixin: """Mixin class for enabling active storage reductions. - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 """ @@ -29,7 +29,7 @@ def __getitem__(self, indices): then these indices work independently along each dimension (similar to the way vector subscripts work in Fortran). - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 """ method = self.get_active_method() @@ -63,7 +63,7 @@ def actify(self, method, axis=None, active_storage_url=None): The new instance is a deep copy of the original, with the additional setting of the active storage method and axis. - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 .. seealso:: `set_active_axis`, `set_active_method` @@ -114,7 +114,7 @@ def actify(self, method, axis=None, active_storage_url=None): def get_active_axis(self): """Return the active storage reduction axes. - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 .. seealso:: `set_active_axis` @@ -130,7 +130,7 @@ def get_active_axis(self): def get_active_method(self): """Return the name of the active storage reduction method. - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 .. seealso:: `set_active_method` @@ -146,7 +146,7 @@ def get_active_method(self): def get_active_storage_url(self): """Return the the active storage URL. - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 .. seealso:: `set_active_storage_url` @@ -162,7 +162,7 @@ def get_active_storage_url(self): def set_active_axis(self, value): """Set the active storage reduction axes. - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 .. seealso:: `get_active_axis` @@ -182,7 +182,7 @@ def set_active_axis(self, value): def set_active_method(self, value): """Set the name of the active storage reduction method. - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 .. seealso:: `get_active_method` @@ -201,7 +201,7 @@ def set_active_method(self, value): def set_active_storage_url(self, value): """Set the active storage URL. - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 .. seealso:: `get_active_storage_url` diff --git a/cf/data/array/netcdfarray.py b/cf/data/array/netcdfarray.py index 54b826b79a..b921f2c418 100644 --- a/cf/data/array/netcdfarray.py +++ b/cf/data/array/netcdfarray.py @@ -1,46 +1,16 @@ -import cfdm +class NetCDFArray: + """A netCDF array accessed with `netCDF4`. -from ...mixin_container import Container -from .locks import _lock -from .mixin import ActiveStorageMixin, ArrayMixin, FileArrayMixin - - -class NetCDF4Array( - ActiveStorageMixin, - FileArrayMixin, - ArrayMixin, - Container, - cfdm.NetCDF4Array, -): - """An array stored in a netCDF file. - - **Active storage reductions** - - Active storage reduction may be enabled with the `actify` - method. See `cf.data.collapse.Collapse` for details. + Deprecated at version 3.17.0 and is no longer available. Use + `cf.NetCDF4Array` instead. """ - def __dask_tokenize__(self): - """Return a value fully representative of the object. 
- - .. versionadded:: 3.15.0 - - """ - return super().__dask_tokenize__() + (self.get_mask(),) - - @property - def _lock(self): - """Set the lock for use in `dask.array.from_array`. - - Returns a lock object because concurrent reads are not - currently supported by the netCDF and HDF libraries. The lock - object will be the same for all `NetCDFArray` and `HDFArray` - instances, regardless of the dataset they access, which means - that access to all netCDF and HDF files coordinates around the - same lock. - - .. versionadded:: 3.14.0 + def __init__(self, *args, **kwargs): + """**Initialisation**""" + from ..functions import DeprecationError - """ - return _lock + raise DeprecationError( + f"{self.__class__.__name__} was deprecated at version 3.17.0 " + "and is no longer available. Use cf.NetCDF4Array instead." + ) diff --git a/cf/data/collapse/collapse.py b/cf/data/collapse/collapse.py index 71798bd159..f6fdb7693c 100644 --- a/cf/data/collapse/collapse.py +++ b/cf/data/collapse/collapse.py @@ -124,7 +124,7 @@ def max( {{active_storage: `bool`, optional}} - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 :Returns: @@ -192,7 +192,7 @@ def max_abs( {{active_storage: `bool`, optional}} - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 :Returns: @@ -251,7 +251,7 @@ def mean( {{active_storage: `bool`, optional}} - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 :Returns: @@ -323,7 +323,7 @@ def mean_abs( {{active_storage: `bool`, optional}} - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 :Returns: @@ -381,7 +381,7 @@ def mid_range( {{active_storage: `bool`, optional}} - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 :Returns: @@ -453,7 +453,7 @@ def min( {{active_storage: `bool`, optional}} - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 :Returns: @@ -521,7 +521,7 @@ def min_abs( {{active_storage: `bool`, optional}} - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 :Returns: @@ -577,7 +577,7 @@ def range( {{active_storage: `bool`, optional}} - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 :Returns: @@ -652,7 +652,7 @@ def rms( {{active_storage: `bool`, optional}} - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 :Returns: @@ -721,7 +721,7 @@ def sample_size( {{active_storage: `bool`, optional}} - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 :Returns: @@ -796,7 +796,7 @@ def sum( {{active_storage: `bool`, optional}} - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 :Returns: @@ -871,7 +871,7 @@ def sum_of_weights( {{active_storage: `bool`, optional}} - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 :Returns: @@ -947,7 +947,7 @@ def sum_of_weights2( {{active_storage: `bool`, optional}} - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 :Returns: @@ -1000,7 +1000,7 @@ def unique( {{active_storage: `bool`, optional}} - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 :Returns: @@ -1081,7 +1081,7 @@ def var( {{active_storage: `bool`, optional}} - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 :Returns: diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index 077e15e125..53067061b4 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -25,7 +25,7 @@ def active_min(a, **kwargs): as the ``chunk`` parameter. Its returned value must be the same as the non-active chunk function that it is replacing. - .. versionadded:: ACTIVEVERSION + .. 
versionadded:: 3.17.0 .. seealso:: `actify`, `active_storage` @@ -60,7 +60,7 @@ def active_max(a, **kwargs): as the ``chunk`` parameter. Its returned value must be the same as the non-active chunk function that it is replacing. - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 .. seealso:: `actify`, `active_storage` @@ -95,7 +95,7 @@ def active_mean(a, **kwargs): as the ``chunk`` parameter. Its returned value must be the same as the non-active chunk function that it is replacing. - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 .. seealso:: `actify`, `active_storage` @@ -134,7 +134,7 @@ def active_sum(a, **kwargs): as the ``chunk`` parameter. Its returned value must be the same as the non-active chunk function that it is replacing. - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 .. seealso:: `actify`, `active_storage` @@ -185,7 +185,7 @@ def actify(a, method, axis=None): `!active_storage` attribute is registered via the *active_storage* parameter of `Collapse` methods. - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 .. seealso:: `active_storage`, `cf.data.collapse.Collapse` @@ -303,7 +303,7 @@ def active_storage(method): `Collapse` method is decorated, active storage operations are only carried out when the conditions are right. - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 .. seealso:: `actify`, `cf.data.collapse.Collapse` diff --git a/cf/data/data.py b/cf/data/data.py index 25dfb38e97..535a137d47 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -1476,7 +1476,7 @@ def _del_dask(self, default=ValueError(), clear=_ALL): def _del_active_storage(self): """Set the active storage reduction status to False. - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 .. seealso:: `active_storage`, `_set_active_storage` @@ -1561,7 +1561,7 @@ def _is_abstract_Array_subclass(self, array): def _set_active_storage(self, value): """Set the active storage reduction status. - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 .. seealso:: `active_storage`, `_del_active_storage` @@ -4771,7 +4771,7 @@ def active_storage(self): the conditions described by `cf.data.collapse.Collapse` are met. - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 **Examples** diff --git a/cf/data/fragment/h5netcdffragmentarray.py b/cf/data/fragment/h5netcdffragmentarray.py index 98a1a5f843..87688c056e 100644 --- a/cf/data/fragment/h5netcdffragmentarray.py +++ b/cf/data/fragment/h5netcdffragmentarray.py @@ -5,7 +5,7 @@ class H5netcdfFragmentArray(FragmentArrayMixin, H5netcdfArray): """A netCDF fragment array accessed with `h5netcdf`. - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 """ @@ -65,8 +65,6 @@ def __init__( {{init storage_options: `dict` or `None`, optional}} - .. versionadded:: ACTIVEVERSION - {{init source: optional}} {{init copy: `bool`, optional}} diff --git a/cf/data/fragment/mixin/fragmentarraymixin.py b/cf/data/fragment/mixin/fragmentarraymixin.py index 48f9409bfd..01657497f2 100644 --- a/cf/data/fragment/mixin/fragmentarraymixin.py +++ b/cf/data/fragment/mixin/fragmentarraymixin.py @@ -32,7 +32,8 @@ def __getitem__(self, indices): .. versionadded:: 3.15.0 """ - # TODOACTIVE: modify this for the active storage case of + # TODOACTIVE: Placeholder note to modify this for the active + # storage case of # super().__getitem__(tuple(indices)) returning a # dictionary @@ -116,8 +117,8 @@ def _conform_to_aggregated_units(self, array): if isinstance(array, dict): # 'array' is a dictionary. raise ValueError( - "TODOACTIVE. 
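+        As a minimal sketch (assuming a field construct ``f`` with a
+        time axis, and another field construct ``w`` containing
+        weights values), metadata-derived or explicitly given weights
+        could be requested with, for instance:
+
+           >>> f.collapse('T: mean', weights=True)
+           >>> f.collapse('T: mean', weights=w)
+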
This error is notification of an " - "unreplaced placeholder for dealing with active " + "TODOACTIVE. Placeholder notification thatn " + "we can't yet dealing with active " "storage reductions on CFA fragments." ) else: diff --git a/cf/data/fragment/netcdf4fragmentarray.py b/cf/data/fragment/netcdf4fragmentarray.py index 7fdbe79cb9..100ad31c8e 100644 --- a/cf/data/fragment/netcdf4fragmentarray.py +++ b/cf/data/fragment/netcdf4fragmentarray.py @@ -5,7 +5,7 @@ class NetCDF4FragmentArray(FragmentArrayMixin, NetCDF4Array): """A netCDF fragment array accessed with `netCDF4`. - .. versionadded:: 3.14.0 + .. versionadded:: 3.17.0 """ @@ -65,8 +65,6 @@ def __init__( {{init storage_options: `dict` or `None`, optional}} - .. versionadded:: ACTIVEVERSION - {{init source: optional}} {{init copy: `bool`, optional}} diff --git a/cf/data/fragment/netcdffragmentarray.py b/cf/data/fragment/netcdffragmentarray.py index ed2523bfd3..394b859220 100644 --- a/cf/data/fragment/netcdffragmentarray.py +++ b/cf/data/fragment/netcdffragmentarray.py @@ -21,7 +21,8 @@ class NetCDFFragmentArray( Access will either with `netCDF4` (for local and OPenDAP files) or `h5netcdf` (for S3 files). - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.15.0 + """ @@ -81,7 +82,7 @@ def __init__( {{init storage_options: `dict` or `None`, optional}} - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 {{init source: optional}} @@ -198,7 +199,7 @@ def __getitem__(self, indices): } # Loop round the files, returning as soon as we find one that - # works. + # is accessible. filenames = self.get_filenames() for filename, address in zip(filenames, self.get_addresses()): kwargs["filename"] = filename diff --git a/cf/domain.py b/cf/domain.py index 262dd56b7f..8d3afed0ec 100644 --- a/cf/domain.py +++ b/cf/domain.py @@ -707,7 +707,7 @@ def identities(self): equals e.g. ``'foo=bar'``. * The netCDF variable name, preceded by ``'ncvar%'``. - .. versionadded:: (cfdm) 1.9.0.0 + .. versionadded:: 3.8.0 .. seealso:: `identity` diff --git a/cf/field.py b/cf/field.py index bda58d2d65..6f3bac8763 100644 --- a/cf/field.py +++ b/cf/field.py @@ -5113,18 +5113,20 @@ def collapse( **Collapse weights** - The calculations of means, standard deviations and variances are, - by default, **not weighted**. For weights to be incorporated in - the collapse, the axes to be weighted must be identified with the - *weights* keyword. + .. warning:: By default, the collapse calculations are **not** + weighted. + + For weights to be incorporated in the collapse, + the *weights* keyword must be set. Weights are either derived from the field construct's metadata - (such as cell sizes), or may be provided explicitly in the form of - other field constructs containing data of weights values. In - either case, the weights actually used are those derived by the - `weights` method of the field construct with the same weights - keyword value. Collapsed axes that are not identified by the - *weights* keyword are unweighted during the collapse operation. + (such as cell sizes), or may be provided explicitly in the + form of other field constructs containing data of weights + values. In either case, the weights actually used are those + derived by the `weights` method of the field construct with + the same *weights* keyword value. Collapsed axes that are not + identified by the *weights* keyword are unweighted during the + collapse operation. 
*Example:* Create a weighted time average: @@ -5324,27 +5326,28 @@ def collapse( will occur is determined on a case-by-case basis, and will only be done if all of the following criteria are met: - * the collapse method is registered as having an active - storage counterpart, i.e. one of ``'mean'``, ``'maximum'``, + * the collapse method is one of ``'mean'``, ``'maximum'``, ``'minimum'``, or ``'sum'``; * the collapse is over all axes; * the collapse is unweighted; - * `cf.active_storage()` is True; + * `cf.active_storage()` is `True`; * a URL of the active storage server has been set with `cf.active_storage_url`; - * the data values are in netCDF files on disk (rather than in - any other file format, or in memory); + * the data values are in netCDF-4 files on disk (rather than + in any other file format, or in memory) and are not + numerically packed; * the `!active_storage` attribute of the `Data` being - collapsed is `True`. In general, it will only be `True` for - data that is not compressed by convention (including - numerical packing), and has not had any other operations - applied to it; + collapsed is `True`, indicating that active storage + operations may be possible. In general, it will only be + `True` for data that are in files on disk, are not + compressed by convention and have not had any other + operations applied; * it is possible to import the `activestorage.Active` class. diff --git a/cf/functions.py b/cf/functions.py index 70c5dd347d..6c874664e6 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -211,7 +211,8 @@ def configuration( .. seealso:: `atol`, `rtol`, `tempdir`, `chunksize`, `total_memory`, `log_level`, `regrid_logging`, - `relaxed_identities`, `bounds_combination_mode` + `relaxed_identities`, `bounds_combination_mode`, + `active_storage`, `active_storage_url` :Parameters: @@ -268,13 +269,13 @@ def configuration( reductions or False to disable them). The default is to not change the current behaviour. - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 active_storage_url: `str` or `None` or `Constant`, optional The new value (either a new URL string or `None` to remove the URL). The default is to not change the value. - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 of_fraction: `float` or `Constant`, optional Deprecated at version 3.14.0 and is no longer @@ -1183,9 +1184,9 @@ def _parse(cls, arg): class active_storage(ConstantAccess): """Whether or not to attempt active storage reductions. - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 - .. seealso:: `configuration` + .. seealso:: `active_storage_url`, `configuration` :Parameters: @@ -1221,7 +1222,7 @@ class active_storage(ConstantAccess): def _parse(cls, arg): """Parse a new constant value. - .. versionaddedd:: ACTIVEVERSION + .. versionaddedd:: 3.17.0 :Parameters: @@ -1243,9 +1244,9 @@ def _parse(cls, arg): class active_storage_url(ConstantAccess): """The URL location of the active storage reducer. - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 - .. seealso:: `configuration` + .. seealso:: `active_storage`, `configuration` :Parameters: @@ -1263,7 +1264,7 @@ class active_storage_url(ConstantAccess): >>> print(cf.active_storage_url()) None - >>> with cf.active_storage_url('http://active/storage/location'): + >>> with cf.active_storage_url('http://active/storage/location'): ... print(cf.active_storage_url()) ... 'http://active/storage/location' @@ -1281,7 +1282,7 @@ class active_storage_url(ConstantAccess): def _parse(cls, arg): """Parse a new constant value. 
- .. versionaddedd:: ACTIVEVERSION + .. versionaddedd:: 3.17.0 :Parameters: diff --git a/cf/mixin/fielddomain.py b/cf/mixin/fielddomain.py index 638f9aa855..6dab4b9223 100644 --- a/cf/mixin/fielddomain.py +++ b/cf/mixin/fielddomain.py @@ -2035,7 +2035,7 @@ def iscyclic(self, *identity, **filter_kwargs): {{filter_kwargs: optional}} - .. versionadded:: (cfdm) 3.9.0 + .. versionadded:: 3.9.0 :Returns: diff --git a/cf/read_write/netcdf/netcdfread.py b/cf/read_write/netcdf/netcdfread.py index dd456f8aee..a0a8178ca5 100644 --- a/cf/read_write/netcdf/netcdfread.py +++ b/cf/read_write/netcdf/netcdfread.py @@ -260,8 +260,8 @@ def _create_data( # Note: We don't cache elements from CFA variables, because # the data are in fragment files which have not been - # opened; and may not not even be openable, such as - # could be the case if a fragement was on tape storage. + # opened and may not not even be openable (such as could + # be the case if a fragment file was on tape storage). # Set the CFA write status to True iff each non-aggregated # axis has exactly one dask storage chunk @@ -940,6 +940,7 @@ def _cfa_parse_aggregated_data(self, ncvar, aggregated_data): aggregation_instructions = g["cfa_aggregation_instructions"] variable_attributes = g["variable_attributes"] + # Loop round aggregation instruction terms out = {} for x in self._parse_x( ncvar, diff --git a/cf/read_write/netcdf/netcdfwrite.py b/cf/read_write/netcdf/netcdfwrite.py index ad7da4c9cd..50ae69f483 100644 --- a/cf/read_write/netcdf/netcdfwrite.py +++ b/cf/read_write/netcdf/netcdfwrite.py @@ -103,8 +103,15 @@ def _write_as_cfa(self, cfvar, construct_type, domain_axes): raise ValueError( f"Can't write {cfvar!r} as a CFA-netCDF " - "aggregation variable. Consider setting " - "cfa={'strict': False}" + "aggregation variable. If the variable was read " + "from disk then setting chunks=None as an " + "argument to cf.read will likely solve the " + "problem. " + "Alternatively, you could consider setting " + "cfa={'strict': False} as an argument to " + "cf.write, but note the this will create a copy " + "of the data for this variable in the output " + "dataset." ) return cfa_get_write @@ -464,7 +471,7 @@ def _create_cfa_data(self, ncvar, ncdimensions, data, cfvar): ): f_ncdim = f"f_{ncdim}" if f_ncdim not in g["dimensions"]: - # Create a new fragement dimension + # Create a new fragment dimension self._write_dimension(f_ncdim, None, size=size) fragment_ncdimensions.append(f_ncdim) @@ -568,55 +575,6 @@ def _create_cfa_data(self, ncvar, ncdimensions, data, cfvar): }, ) - def _convert_to_builtin_type(self, x): - """Convert a non-JSON-encodable object to a JSON-encodable - built-in type. - - Possible conversions are: - - ============== ============= ====================================== - Input object Output object numpy data types covered - ============== ============= ====================================== - numpy.bool_ bool bool - numpy.integer int int, int8, int16, int32, int64, uint8, - uint16, uint32, uint64 - numpy.floating float float, float16, float32, float64 - ============== ============= ====================================== - - .. 
versionadded:: 3.0.0 - - :Parameters: - - x: - - :Returns: - - 'int' or `float` or `bool` - - **Examples:** - - >>> type(_convert_to_builtin_type(numpy.bool_(True))) - bool - >>> type(_convert_to_builtin_type(numpy.array([1.0])[0])) - double - >>> type(_convert_to_builtin_type(numpy.array([2])[0])) - int - - """ - if isinstance(x, np.bool_): - return bool(x) - - if isinstance(x, np.integer): - return int(x) - - if isinstance(x, np.floating): - return float(x) - - raise TypeError( - f"{type(x)!r} object can't be converted to a JSON serializable " - f"type: {x!r}" - ) - def _check_valid(self, array, cfvar=None, attributes=None): """Checks for array values outside of the valid range. @@ -908,16 +866,14 @@ def _cfa_aggregation_instructions(self, data, cfvar): if file_details: raise ValueError( "Can't write CFA-netCDF aggregation variable from " - f"{cfvar!r} when the " - f"dask storage chunk defined by indices {indices} " - "spans two or more files" + f"{cfvar!r}: Dask storage chunk defined by indices " + f"{indices} spans two or more fragment files" ) raise ValueError( "Can't write CFA-netCDF aggregation variable from " - f"{cfvar!r} when the " - f"dask storage chunk defined by indices {indices} spans " - "zero files" + f"{cfvar!r}: Dask storage chunk defined by indices " + f"{indices} spans zero files" ) filenames, addresses, formats = file_details.pop() diff --git a/cf/read_write/read.py b/cf/read_write/read.py index eeaa42f5cb..d9ca85cebf 100644 --- a/cf/read_write/read.py +++ b/cf/read_write/read.py @@ -437,7 +437,7 @@ def read( A netCDF array is unpacked depending on the values of the netCDF attributes ``add_offset`` and ``scale_factor``. - .. versionadded:: (cfdm) ACTIVEVERSION + .. versionadded:: 3.17.0 warn_valid: `bool`, optional If True then print a warning for the presence of @@ -689,9 +689,9 @@ def read( the data of aggregated variables. For these, `netCDF4` is used for local files and those accessed via OPenDAP, and `h5netcdf` is used for - fragement files in S3 object stores. + fragment files in S3 object stores. - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 storage_options: `dict` or `None`, optional Key/value pairs to be passed on to the creation of @@ -720,7 +720,7 @@ def read( 'https://s3.fr-par.scw.cloud', 'client_kwargs': {'region_name': 'fr-par'}}`` - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 umversion: deprecated at version 3.0.0 Use the *um* parameter instead. @@ -1171,12 +1171,12 @@ def _read_a_file( storage_options: `dict` or `None`, optional See `cf.read` for details. - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 netCDF_backend: `str` or `None`, optional See `cf.read` for details. - .. versionadded:: ACTIVEVERSION + .. versionadded:: 3.17.0 :Returns: diff --git a/cf/read_write/write.py b/cf/read_write/write.py index c3d0edb615..23a8dda3cd 100644 --- a/cf/read_write/write.py +++ b/cf/read_write/write.py @@ -97,22 +97,22 @@ def write( construct. - **NetCDF hierarchical groups** + **NetCDF-4 hierarchical groups** Hierarchical groups in CF provide a mechanism to structure - variables within netCDF4 datasets with well defined rules for + variables within netCDF-4 datasets with well defined rules for resolving references to out-of-group netCDF variables and dimensions. The group structure defined by a field construct's netCDF interface will, by default, be recreated in the output dataset. See the *group* parameter for details. 
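+    As a minimal sketch (``fl`` is an assumed field list read from a
+    netCDF-4 file with groups, and the output file name is
+    hypothetical), the group structure can be discarded so that all
+    output variables are created in the root group:
+
+       >>> cf.write(fl, 'flat.nc', group=False)
+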
- **NetCDF4 HDF chunk sizes** + **NetCDF-4 HDF chunk sizes** HDF5 chunksizes may be set on contruct's data. See the - `~cf.Data.nc_hdf5_chunksizes`, - `~cf.Data.nc_clear_hdf5_chunksizes` and - `~cf.Data.nc_set_hdf5_chunksizes` methods of a `Data` instance. + `~cf.Data.nc_hdf5_chunksizes`, `~cf.Data.nc_clear_hdf5_chunksizes` + and `~cf.Data.nc_set_hdf5_chunksizes` methods of a `Data` + instance. .. seealso:: `cf.read` @@ -121,7 +121,6 @@ def write( fields: (arbitrarily nested sequence of) `Field` or `FieldList` The field constructs to write to the file. - filename: `str` The output netCDF file name. Various type of expansion are applied to the file names. @@ -548,7 +547,7 @@ def write( variables. By default only auxiliary and scalar coordinate variables are included. - .. versionadded:: (cfdm) 3.7.0 + .. versionadded:: 3.7.0 omit_data: (sequence of) `str`, optional Do not write the data of the named construct types. diff --git a/cf/regrid/regrid.py b/cf/regrid/regrid.py index 1748f1829e..b855d98e53 100644 --- a/cf/regrid/regrid.py +++ b/cf/regrid/regrid.py @@ -1923,7 +1923,7 @@ def create_esmpy_weights( from netCDF4 import Dataset from .. import __version__ - from ..data.array.netcdfarray import _lock + from ..data.array.locks import _lock if ( max(dst_esmpy_field.data.size, src_esmpy_field.data.size) diff --git a/cf/regrid/regridoperator.py b/cf/regrid/regridoperator.py index a8f783f696..f3a78d9a22 100644 --- a/cf/regrid/regridoperator.py +++ b/cf/regrid/regridoperator.py @@ -18,6 +18,8 @@ class RegridOperator(mixin_Container, Container): information, such as the grid shapes; the CF metadata for the destination grid; and the source grid coordinates. + .. versionadded:: 3.10.0 + """ def __init__( @@ -550,6 +552,8 @@ def tosparse(self): any further modification of the weights to account for missing values in the source grid will always involve row-slicing. + .. versionadded:: 3.14.0 + :Returns: `None` @@ -572,7 +576,7 @@ def tosparse(self): # Read the weights from the weights file from netCDF4 import Dataset - from ..data.array.netcdfarray import _lock + from ..data.array.locks import _lock _lock.acquire() nc = Dataset(weights_file, "r") diff --git a/docs/source/class.rst b/docs/source/class.rst index dfa24006c8..4b307ecc2b 100644 --- a/docs/source/class.rst +++ b/docs/source/class.rst @@ -77,24 +77,12 @@ Data classes :toctree: class/ cf.Data - cf.NetCDFArray - -Miscellaneous classes ---------------------- - -.. autosummary:: - :nosignatures: - :toctree: class/ - - cf.Flags - cf.Query - cf.TimeDuration - cf.Units - cf.RegridOperator - cf.Constant - cf.Configuration - - + cf.H5netcdfArray + cf.NetCDF4Array + cf.CFANetCDFArray + cf.FullArray + cf.UMAarray + Data compression classes ------------------------ @@ -111,3 +99,33 @@ Classes that support the creation and storage of compressed arrays. cf.RaggedContiguousArray cf.RaggedIndexedArray cf.RaggedIndexedContiguousArray + cf.SubsampledArray + +Data UGRID classes +------------------ + +Classes that support the creation and storage of UGRID-related arrays. + +.. autosummary:: + :nosignatures: + :toctree: class/ + + cf.BoundsFromNodesArray + cf.CellConnectivityArray + cf.PointTopologyArray + +Miscellaneous classes +--------------------- + +.. 
autosummary:: + :nosignatures: + :toctree: class/ + + cf.Flags + cf.Query + cf.TimeDuration + cf.Units + cf.RegridOperator + cf.Constant + cf.Configuration + diff --git a/docs/source/class/cf.H5netcdfArray.rst b/docs/source/class/cf.H5netcdfArray.rst new file mode 100644 index 0000000000..f601692e39 --- /dev/null +++ b/docs/source/class/cf.H5netcdfArray.rst @@ -0,0 +1,193 @@ +.. currentmodule:: cf +.. default-role:: obj + +cf.H5netcdfArray +================ + +---- + +.. autoclass:: cf.H5netcdfArray + :no-members: + :no-inherited-members: + +Inspection +---------- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.H5netcdfArray.get_compression_type + ~cf.H5netcdfArray.get_subspace + ~cf.H5netcdfArray.get_attributes + + +.. rubric:: Attributes + +.. autosummary:: + :nosignatures: + :toctree: ../attribute/ + :template: attribute.rst + + ~cf.H5netcdfArray.array + ~cf.H5netcdfArray.dtype + ~cf.H5netcdfArray.ndim + ~cf.H5netcdfArray.shape + ~cf.H5netcdfArray.size + +Units +----- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.H5netcdfArray.get_calendar + ~cf.H5netcdfArray.get_units + +File +---- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.H5netcdfArray.get_address + ~cf.H5netcdfArray.get_addresses + ~cf.H5netcdfArray.close + ~cf.H5netcdfArray.open + ~cf.H5netcdfArray.get_filename + ~cf.H5netcdfArray.get_filenames + ~cf.H5netcdfArray.get_format + ~cf.H5netcdfArray.get_formats + ~cf.H5netcdfArray.get_groups + ~cf.H5netcdfArray.get_mask + ~cf.H5netcdfArray.get_unpack + ~cf.H5netcdfArray.get_storage_options + ~cf.H5netcdfArray._lock + +Miscellaneous +------------- + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.H5netcdfArray.copy + ~cf.H5netcdfArray.to_memory + +Special +------- + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.H5netcdfArray.__dask_tokenize__ + ~cf.H5netcdfArray.__getitem__ + +Docstring substitutions +----------------------- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.H5netcdfArray._docstring_special_substitutions + ~cf.H5netcdfArray._docstring_substitutions + ~cf.H5netcdfArray._docstring_package_depth + ~cf.H5netcdfArray._docstring_method_exclusions + +Deprecated +---------- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.H5netcdfArray.get_missing_values +---------------------------------------------- +cf.H5netcdfArray +=============== + +---- + +.. autoclass:: cf.H5netcdfArray + :no-members: + :no-inherited-members: + +.. rubric:: Methods + +.. 
autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.H5netcdfArray.add_file_location + ~cf.H5netcdfArray.close + ~cf.H5netcdfArray.copy + ~cf.H5netcdfArray.del_file_location + ~cf.H5netcdfArray.file_locations + ~cf.H5netcdfArray.filename + ~cf.H5netcdfArray.get_address + ~cf.H5netcdfArray.get_addresses + ~cf.H5netcdfArray.get_format + ~cf.H5netcdfArray.get_formats + ~cf.H5netcdfArray.get_calendar + ~cf.H5netcdfArray.get_compression_type + ~cf.H5netcdfArray.get_filename + ~cf.H5netcdfArray.get_filenames + ~cf.H5netcdfArray.get_group + ~cf.H5netcdfArray.get_groups + ~cf.H5netcdfArray.get_mask + ~cf.H5netcdfArray.get_missing_values + ~cf.H5netcdfArray.get_ncvar + ~cf.H5netcdfArray.get_subspace + ~cf.H5netcdfArray.get_units + ~cf.H5netcdfArray.get_varid + ~cf.H5netcdfArray.open + ~cf.H5netcdfArray.to_memory + ~cf.H5netcdfArray.Units + +.. rubric:: Attributes + +.. autosummary:: + :nosignatures: + :toctree: ../attribute/ + :template: attribute.rst + + ~cf.H5netcdfArray.array + ~cf.H5netcdfArray.dtype + ~cf.H5netcdfArray.file_address + ~cf.H5netcdfArray.ndim + ~cf.H5netcdfArray.shape + ~cf.H5netcdfArray.size + +Special +------- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.H5netcdfArray.__getitem__ diff --git a/docs/source/class/cf.NetCDF4Array.rst b/docs/source/class/cf.NetCDF4Array.rst new file mode 100644 index 0000000000..3a00cfaf8c --- /dev/null +++ b/docs/source/class/cf.NetCDF4Array.rst @@ -0,0 +1,70 @@ +.. currentmodule:: cf +.. default-role:: obj + +cf.NetCDF4Array +=============== + +---- + +.. autoclass:: cf.NetCDF4Array + :no-members: + :no-inherited-members: + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.NetCDF4Array.add_file_location + ~cf.NetCDF4Array.close + ~cf.NetCDF4Array.copy + ~cf.NetCDF4Array.del_file_location + ~cf.NetCDF4Array.file_locations + ~cf.NetCDF4Array.filename + ~cf.NetCDF4Array.get_address + ~cf.NetCDF4Array.get_addresses + ~cf.NetCDF4Array.get_format + ~cf.NetCDF4Array.get_formats + ~cf.NetCDF4Array.get_calendar + ~cf.NetCDF4Array.get_compression_type + ~cf.NetCDF4Array.get_filename + ~cf.NetCDF4Array.get_filenames + ~cf.NetCDF4Array.get_group + ~cf.NetCDF4Array.get_groups + ~cf.NetCDF4Array.get_mask + ~cf.NetCDF4Array.get_missing_values + ~cf.NetCDF4Array.get_ncvar + ~cf.NetCDF4Array.get_subspace + ~cf.NetCDF4Array.get_units + ~cf.NetCDF4Array.get_varid + ~cf.NetCDF4Array.open + ~cf.NetCDF4Array.to_memory + ~cf.NetCDF4Array.Units + +.. rubric:: Attributes + +.. autosummary:: + :nosignatures: + :toctree: ../attribute/ + :template: attribute.rst + + ~cf.NetCDF4Array.array + ~cf.NetCDF4Array.dtype + ~cf.NetCDF4Array.file_address + ~cf.NetCDF4Array.ndim + ~cf.NetCDF4Array.shape + ~cf.NetCDF4Array.size + +Special +------- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.NetCDF4Array.__getitem__ diff --git a/docs/source/class/cf.NetCDFArray.rst b/docs/source/class/cf.NetCDFArray.rst deleted file mode 100644 index 34d7bf0d65..0000000000 --- a/docs/source/class/cf.NetCDFArray.rst +++ /dev/null @@ -1,70 +0,0 @@ -.. currentmodule:: cf -.. default-role:: obj - -cf.NetCDFArray -============== - ----- - -.. autoclass:: cf.NetCDFArray - :no-members: - :no-inherited-members: - -.. rubric:: Methods - -.. 
autosummary:: - :nosignatures: - :toctree: ../method/ - :template: method.rst - - ~cf.NetCDFArray.add_file_location - ~cf.NetCDFArray.close - ~cf.NetCDFArray.copy - ~cf.NetCDFArray.del_file_location - ~cf.NetCDFArray.file_locations - ~cf.NetCDFArray.filename - ~cf.NetCDFArray.get_address - ~cf.NetCDFArray.get_addresses - ~cf.NetCDFArray.get_format - ~cf.NetCDFArray.get_formats - ~cf.NetCDFArray.get_calendar - ~cf.NetCDFArray.get_compression_type - ~cf.NetCDFArray.get_filename - ~cf.NetCDFArray.get_filenames - ~cf.NetCDFArray.get_group - ~cf.NetCDFArray.get_groups - ~cf.NetCDFArray.get_mask - ~cf.NetCDFArray.get_missing_values - ~cf.NetCDFArray.get_ncvar - ~cf.NetCDFArray.get_subspace - ~cf.NetCDFArray.get_units - ~cf.NetCDFArray.get_varid - ~cf.NetCDFArray.open - ~cf.NetCDFArray.to_memory - ~cf.NetCDFArray.Units - -.. rubric:: Attributes - -.. autosummary:: - :nosignatures: - :toctree: ../attribute/ - :template: attribute.rst - - ~cf.NetCDFArray.array - ~cf.NetCDFArray.dtype - ~cf.NetCDFArray.file_address - ~cf.NetCDFArray.ndim - ~cf.NetCDFArray.shape - ~cf.NetCDFArray.size - -Special -------- - -.. rubric:: Methods - -.. autosummary:: - :nosignatures: - :toctree: ../method/ - :template: method.rst - - ~cf.NetCDFArray.__getitem__ diff --git a/docs/source/conf.py b/docs/source/conf.py index ac8ca82105..a30fb4d0a3 100755 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -156,6 +156,7 @@ def _get_date(): "cfplot": ("https://ajheaps.github.io/cf-plot", None), "dask": ("https://docs.dask.org/en/latest", None), "matplotlib": ("https://matplotlib.org/stable/", None), + "h5netcdf": ("https://h5netcdf.org", None), } # This extension is meant to help with the common pattern of having diff --git a/docs/source/field_analysis.rst b/docs/source/field_analysis.rst index c3ea848470..3083654e13 100644 --- a/docs/source/field_analysis.rst +++ b/docs/source/field_analysis.rst @@ -342,15 +342,14 @@ construct. Collapse weights ^^^^^^^^^^^^^^^^ -.. The calculations of means, standard deviations and variances are, - by default, not weighted. For weights to be incorporated in the - collapse, the axes to be weighted must be identified with the - *weights* keyword. +.. warning:: By default, the collapse calculations are **not** + weighted. -For weights to be incorporated in the collapse, the axes to be -weighted must be identified with the *weights* keyword. A collapse by -a particular method is either never weighted, or may be weighted, or -is always weighted, as described in the following table: + For weights to be incorporated in the collapse, the + *weights* keyword must be set. + +A collapse by a particular method is either never weighted, or may be +weighted, or is always weighted, as described in the following table: ============================ ============================ ======== Method Description Weighted @@ -853,6 +852,47 @@ method constructs. : longitude(8) = [22.5, ..., 337.5] degrees_east : air_pressure(1) = [850.0] hPa +.. _Active-storage-collapses: + +Active storage collapses +^^^^^^^^^^^^^^^^^^^^^^^^ + +When the data being collapsed are stored remotely, the collapse +calculations may be carried out on a server that is close (in a +network distance sense) to the data, thereby removing the time and +power costs of transfering the entire un-collapsed data to the local +client. 
Whether or not this will occur is determined on a case-by-case +basis, and will only be done if all of the following criteria are met: + +* the collapse method is one of ``'mean'``, ``'maximum'``, + ``'minimum'``, or ``'sum'``; + +* the collapse is over all axes; + +* the collapse is unweighted; + +* `cf.active_storage()` is `True`; + +* a URL of the active storage server has been set with + `cf.active_storage_url`; + +* the data values are in netCDF-4 files on disk (rather than in any + other file format, or in memory) and are not numerically packed; + +* the `~cf.Data.active_storage` attribute of the `Data` being + collapsed is `True`, indicating that active storage operations may + be possible. In general, it will only be `True` for data that are in + files on disk, are not compressed by convention and have not had any + other operations applied; + + +* it is possible to import the `activestorage.Active` class. + +The performance improvements from using active storage operations will +increase the closer the active storage server is to the data +storage. If the active storage server is sufficiently far away from +the data then it may be faster to do a normal, non-active operation. + ---- .. _ Other-statistical-operations: diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 774972d0aa..61215779aa 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -129,7 +129,7 @@ To install from source (without any dependencies): 1. Download the cf package from https://pypi.org/project/cf-python 2. Unpack the library (replacing ```` with the version that - you want to install, e.g. ``3.16.0``): + you want to install, e.g. ``3.17.0``): .. code-block:: console @@ -201,6 +201,13 @@ Required * `cftime `_, version 1.6.2 or newer (note that this package may be installed with netCDF4). +* `h5netcdf `_, version 1.3.0 + newer. + +* `h5py `_, version 3.10.0 or newer. + +* `s3fs `_, version 2024.2.0 or newer. + * `scipy `_, version 1.10.0 or newer. * `cfdm `_, version 1.11.1.0 or up to, diff --git a/docs/source/introduction.rst b/docs/source/introduction.rst index 8076ef9f70..64d4940bda 100644 --- a/docs/source/introduction.rst +++ b/docs/source/introduction.rst @@ -59,12 +59,17 @@ The `cf` package uses :ref:`Dask ` for all of its array manipulation and can: * read :term:`field constructs ` and :term:`domain - constructs ` from netCDF, CDL, PP and UM datasets, + constructs ` from netCDF, CDL, PP and UM datasets + with a choice of netCDF backends, + +* read files from OPeNDAP servers and S3 object stores, * create new field constructs in memory, * write and append field constructs to netCDF datasets on disk, +* read, write, and manipulate UGRID mesh topologies, + * read, write, and create coordinates defined by geometry cells, * read netCDF and CDL datasets containing hierarchical groups, diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst index dbd6b65894..f4f1601c55 100644 --- a/docs/source/tutorial.rst +++ b/docs/source/tutorial.rst @@ -241,6 +241,9 @@ The `cf.read` function has optional parameters to read recursively, and to allow directories which resolve to symbolic links; and +* choose either `netCDF4` or `h5netcdf` backends for accessing netCDF + files. + * configure parameters for :ref:`reading PP and UM fields files `. @@ -4606,8 +4609,9 @@ All the of above examples use arrays in memory to construct the data instances for the field and metadata constructs. It is, however, possible to create data from arrays that reside on disk. 
The `cf.read` function creates data in this manner. A pointer to an array in a -netCDF file can be stored in a `cf.NetCDFArray` instance, which is is -used to initialise a `cf.Data` instance. +netCDF file can be stored in a `cf.NetCDFArray` or +`~cf.H5netcdfAarray` instance, which is is used to initialise a +`cf.Data` instance. .. code-block:: python :caption: *Define a variable from a dataset with the netCDF package diff --git a/requirements.txt b/requirements.txt index 0e64e76a4a..baa0d1b0bc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,7 @@ cfunits>=3.3.7 dask>=2022.12.1 packaging>=20.0 scipy>=1.10.0 +h5netcdf>=1.3.0 +h5py>=3.10.0 +s3fs>=2024.2.0 + From 8108dd63cb5289b5f73c5d3c74a5793a2928e614 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 13 Feb 2024 23:06:20 +0000 Subject: [PATCH 047/134] dev --- Changelog.rst | 10 +- cf/data/array/cfanetcdfarray.py | 1 - cf/data/array/mixin/activestoragemixin.py | 94 ++++------------ cf/data/collapse/collapse_active.py | 2 +- cf/field.py | 7 +- cf/test/test_NetCDF4Array.py | 13 --- docs/source/class/cf.H5netcdfArray.rst | 81 +++----------- docs/source/class/cf.NetCDF4Array.rst | 126 +++++++++++++++++----- docs/source/field_analysis.rst | 12 +-- docs/source/installation.rst | 5 + docs/source/releases.rst | 7 +- 11 files changed, 153 insertions(+), 205 deletions(-) diff --git a/Changelog.rst b/Changelog.rst index 3ecb4d53c1..206077dc6d 100644 --- a/Changelog.rst +++ b/Changelog.rst @@ -5,7 +5,7 @@ version 3.17.0 * Allow access to netCDF-4 files in S3 object stores (https://github.com/NCAS-CMS/cf-python/issues/712) -* New class `cfdm.H5netcdfArray` +* New class `cf.H5netcdfArray` * Fix bug that caused `cf.Field.del_file_location` to fail when updating its metdata constructs (https://github.com/NCAS-CMS/cf-python/issues/707) @@ -181,8 +181,8 @@ version 3.14.1 ---- -version 3.14.0 --------------- +version 3.14.0 (*first Dask version*) +------------------------------------- **2023-01-31** @@ -216,8 +216,8 @@ version 3.14.0 ---- -version 3.13.1 --------------- +version 3.13.1 (*last LAMA version*) +------------------------------------ **2022-10-17** diff --git a/cf/data/array/cfanetcdfarray.py b/cf/data/array/cfanetcdfarray.py index 868bb1b12d..36ec327d0f 100644 --- a/cf/data/array/cfanetcdfarray.py +++ b/cf/data/array/cfanetcdfarray.py @@ -227,7 +227,6 @@ def __init__( else: fmt = compressed(file_fmt[frag_loc]).tolist() else: - print(f.shape, frag_loc, address) filename = (f[frag_loc].item(),) if scalar_address: address = a diff --git a/cf/data/array/mixin/activestoragemixin.py b/cf/data/array/mixin/activestoragemixin.py index a83ac065d2..2aa7b928fd 100644 --- a/cf/data/array/mixin/activestoragemixin.py +++ b/cf/data/array/mixin/activestoragemixin.py @@ -44,7 +44,7 @@ def __getitem__(self, indices): self.get_filename(), self.get_address(), # storage_options=self.get_storage_options(), - # active_storage_url=self.get_active_storage_url(), + # active_storage_url=self.get_active_url(), ) active.method = method active.components = True @@ -57,7 +57,7 @@ def __getitem__(self, indices): return active[indices] - def actify(self, method, axis=None, active_storage_url=None): + def actify(self, method, axis=None, active_url=None): """Return a new actified `{{class}}` instance. The new instance is a deep copy of the original, with the @@ -79,7 +79,7 @@ def actify(self, method, axis=None, active_storage_url=None): Axis or axes along which to operate. By default, or if `None`, flattened input is used. 
- active_storage_url: `str` or `None`, optional + active_url: `str` or `None`, optional The URL of the active storage server. :Returns: @@ -106,9 +106,9 @@ def actify(self, method, axis=None, active_storage_url=None): ) a = self.copy() - a.set_active_method(method) - a.set_active_axis(axis) - a.set_active_storage_url(active_storage_url) + a._custom["active_method"] = method + a._custom["active_axis"] = axis + a._custom["active_url"] = active_url return a def get_active_axis(self): @@ -116,13 +116,13 @@ def get_active_axis(self): .. versionadded:: 3.17.0 - .. seealso:: `set_active_axis` + .. seealso:: `get_active_method`, `get_active_url` :Returns: - `None` or (sequence of) `int - The active storage reduction axes. `None` signifies - that all axes will be reduced. + `None` or (sequence of) `int` + The active storage reduction axes, or `None` if there + is no active storage reduction. """ return self._custom.get("active_axis") @@ -132,87 +132,29 @@ def get_active_method(self): .. versionadded:: 3.17.0 - .. seealso:: `set_active_method` + .. seealso:: `get_active_axis`, `get_active_url` :Returns: `str` or `None` The name of the active storage reduction method, or - `None` if one hasn't been set. + `None` if there is no active storage reduction. """ return self._custom.get("active_method") - def get_active_storage_url(self): + def get_active_url(self): """Return the the active storage URL. .. versionadded:: 3.17.0 - .. seealso:: `set_active_storage_url` + .. seealso:: `get_active_axis`, `get_active_method` :Returns: - `str` - The active storage URL. An empty string specifies no - URL. - - """ - self._custom.get("active_storage_url", "") - - def set_active_axis(self, value): - """Set the active storage reduction axes. - - .. versionadded:: 3.17.0 - - .. seealso:: `get_active_axis` - - :Parameters: - - value: `None` or (sequence of) `int` - The active storage reduction axes. If `None` then all - axes will be reduced. - - :Returns: - - `None` - - """ - self._custom["active_axis"] = value - - def set_active_method(self, value): - """Set the name of the active storage reduction method. - - .. versionadded:: 3.17.0 - - .. seealso:: `get_active_method` - - :Parameters: - - value: `str` - The active storage reduction method. - - :Returns: - - `None` - - """ - self._custom["active_method"] = value - - def set_active_storage_url(self, value): - """Set the active storage URL. - - .. versionadded:: 3.17.0 - - .. seealso:: `get_active_storage_url` - - :Parameters: - - value: `str` - The active storage URL. - - :Returns: - - `None` + `str` or `None` + The active storage URL, or `None` if there is no + active storage reduction. """ - self._custom["active_storage_url"] = value + self._custom.get("active_url") diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index 53067061b4..8728116349 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -270,7 +270,7 @@ def actify(a, method, axis=None): # to files, so try to insert an actified copy into the dask # graph. try: - dsk[key] = value.actify(method, axis, active_storage_url=url) + dsk[key] = value.actify(method, axis, active_url=url) except AttributeError: # This data definition doesn't have an 'actify' method, # and so doesn't support active storage reductions. 
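As a concrete illustration of the interface being reworked above (an
editorial sketch only: the file name, variable name, shape and server
URL are invented; the parameter names are those of this patch, in which
`active_storage_url` becomes `active_url`; the `activestorage` package
must be importable for `actify` to succeed; and note that
`get_active_url` as committed here omits a `return` statement, which a
later commit in this series restores):

    import cf

    # A pointer to a netCDF variable on disk (hypothetical file and names)
    a = cf.NetCDF4Array(filename='file.nc', address='ta',
                        shape=(12, 64, 128), dtype='float64')

    # actify() returns a deep copy with the reduction method, axes and
    # server URL recorded in its private _custom dictionary ...
    b = a.actify('min', axis=None, active_url='https://active.example.org')
    print(b.get_active_method())  # 'min'
    print(b.get_active_axis())    # None, i.e. reduce the flattened input

    # ... so that indexing the actified copy performs the reduction on the
    # active storage server and returns a dictionary of reduced components,
    # rather than a numpy array of the subspace.
    components = b[...]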
diff --git a/cf/field.py b/cf/field.py index 6f3bac8763..4c8145dcf4 100644 --- a/cf/field.py +++ b/cf/field.py @@ -5342,14 +5342,15 @@ def collapse( in any other file format, or in memory) and are not numerically packed; - * the `!active_storage` attribute of the `Data` being - collapsed is `True`, indicating that active storage + * the `!active_storage` attribute of the `cf.Data` object + being collapsed is `True`, indicating that active storage operations may be possible. In general, it will only be `True` for data that are in files on disk, are not compressed by convention and have not had any other operations applied; - * it is possible to import the `activestorage.Active` class. + * it is possible to import the external `activestorage.Active` + class. The performance improvements from using active storage operations will increase the closer the active storage server diff --git a/cf/test/test_NetCDF4Array.py b/cf/test/test_NetCDF4Array.py index a26b399808..4a5de427ad 100644 --- a/cf/test/test_NetCDF4Array.py +++ b/cf/test/test_NetCDF4Array.py @@ -129,19 +129,6 @@ def test_NetCDF4Array_multiple_files(self): self.assertEqual(len(n.get_filenames()), 2) self.assertTrue((n[...] == f.array).all()) - def test_NetCDF4Array_active_method(self): - n = self.n - self.assertIsNone(n.get_active_method()) - self.assertIsNone(n.set_active_method("min")) - self.assertEqual(n.get_active_method(), "min") - - def test_NetCDF4Array_active_axis(self): - # Create instance with non-existent file - n = self.n - self.assertIsNone(n.get_active_axis()) - self.assertIsNone(n.set_active_axis((1, 2))) - self.assertEqual(n.get_active_axis(), (1, 2)) - if __name__ == "__main__": print("Run date:", datetime.datetime.now()) diff --git a/docs/source/class/cf.H5netcdfArray.rst b/docs/source/class/cf.H5netcdfArray.rst index f601692e39..217d0163cd 100644 --- a/docs/source/class/cf.H5netcdfArray.rst +++ b/docs/source/class/cf.H5netcdfArray.rst @@ -86,6 +86,19 @@ Miscellaneous ~cf.H5netcdfArray.copy ~cf.H5netcdfArray.to_memory +Active storage +-------------- + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.H5netcdfArray.actify + ~cf.H5netcdfArray.get_active_url + ~cf.H5netcdfArray.get_active_method + ~cf.H5netcdfArray.get_active_axis + Special ------- @@ -123,71 +136,3 @@ Deprecated :template: method.rst ~cf.H5netcdfArray.get_missing_values ----------------------------------------------- -cf.H5netcdfArray -=============== - ----- - -.. autoclass:: cf.H5netcdfArray - :no-members: - :no-inherited-members: - -.. rubric:: Methods - -.. autosummary:: - :nosignatures: - :toctree: ../method/ - :template: method.rst - - ~cf.H5netcdfArray.add_file_location - ~cf.H5netcdfArray.close - ~cf.H5netcdfArray.copy - ~cf.H5netcdfArray.del_file_location - ~cf.H5netcdfArray.file_locations - ~cf.H5netcdfArray.filename - ~cf.H5netcdfArray.get_address - ~cf.H5netcdfArray.get_addresses - ~cf.H5netcdfArray.get_format - ~cf.H5netcdfArray.get_formats - ~cf.H5netcdfArray.get_calendar - ~cf.H5netcdfArray.get_compression_type - ~cf.H5netcdfArray.get_filename - ~cf.H5netcdfArray.get_filenames - ~cf.H5netcdfArray.get_group - ~cf.H5netcdfArray.get_groups - ~cf.H5netcdfArray.get_mask - ~cf.H5netcdfArray.get_missing_values - ~cf.H5netcdfArray.get_ncvar - ~cf.H5netcdfArray.get_subspace - ~cf.H5netcdfArray.get_units - ~cf.H5netcdfArray.get_varid - ~cf.H5netcdfArray.open - ~cf.H5netcdfArray.to_memory - ~cf.H5netcdfArray.Units - -.. rubric:: Attributes - -.. 
autosummary:: - :nosignatures: - :toctree: ../attribute/ - :template: attribute.rst - - ~cf.H5netcdfArray.array - ~cf.H5netcdfArray.dtype - ~cf.H5netcdfArray.file_address - ~cf.H5netcdfArray.ndim - ~cf.H5netcdfArray.shape - ~cf.H5netcdfArray.size - -Special -------- - -.. rubric:: Methods - -.. autosummary:: - :nosignatures: - :toctree: ../method/ - :template: method.rst - - ~cf.H5netcdfArray.__getitem__ diff --git a/docs/source/class/cf.NetCDF4Array.rst b/docs/source/class/cf.NetCDF4Array.rst index 3a00cfaf8c..ef1da7a8cb 100644 --- a/docs/source/class/cf.NetCDF4Array.rst +++ b/docs/source/class/cf.NetCDF4Array.rst @@ -10,55 +10,36 @@ cf.NetCDF4Array :no-members: :no-inherited-members: +Inspection +---------- + .. rubric:: Methods .. autosummary:: :nosignatures: :toctree: ../method/ :template: method.rst - - ~cf.NetCDF4Array.add_file_location - ~cf.NetCDF4Array.close - ~cf.NetCDF4Array.copy - ~cf.NetCDF4Array.del_file_location - ~cf.NetCDF4Array.file_locations - ~cf.NetCDF4Array.filename - ~cf.NetCDF4Array.get_address - ~cf.NetCDF4Array.get_addresses - ~cf.NetCDF4Array.get_format - ~cf.NetCDF4Array.get_formats - ~cf.NetCDF4Array.get_calendar + ~cf.NetCDF4Array.get_compression_type - ~cf.NetCDF4Array.get_filename - ~cf.NetCDF4Array.get_filenames - ~cf.NetCDF4Array.get_group - ~cf.NetCDF4Array.get_groups - ~cf.NetCDF4Array.get_mask - ~cf.NetCDF4Array.get_missing_values - ~cf.NetCDF4Array.get_ncvar ~cf.NetCDF4Array.get_subspace - ~cf.NetCDF4Array.get_units - ~cf.NetCDF4Array.get_varid - ~cf.NetCDF4Array.open - ~cf.NetCDF4Array.to_memory - ~cf.NetCDF4Array.Units + ~cf.NetCDF4Array.get_attributes + .. rubric:: Attributes .. autosummary:: :nosignatures: - :toctree: ../attribute/ + :toctree: ../attribute/ :template: attribute.rst ~cf.NetCDF4Array.array ~cf.NetCDF4Array.dtype - ~cf.NetCDF4Array.file_address ~cf.NetCDF4Array.ndim ~cf.NetCDF4Array.shape ~cf.NetCDF4Array.size -Special -------- +Units +----- .. rubric:: Methods @@ -66,5 +47,92 @@ Special :nosignatures: :toctree: ../method/ :template: method.rst - + + ~cf.NetCDF4Array.get_calendar + ~cf.NetCDF4Array.get_units + +File +---- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.NetCDF4Array.get_address + ~cf.NetCDF4Array.get_addresses + ~cf.NetCDF4Array.close + ~cf.NetCDF4Array.open + ~cf.NetCDF4Array.get_filename + ~cf.NetCDF4Array.get_filenames + ~cf.NetCDF4Array.get_format + ~cf.NetCDF4Array.get_formats + ~cf.NetCDF4Array.get_groups + ~cf.NetCDF4Array.get_mask + ~cf.NetCDF4Array.get_unpack + ~cf.NetCDF4Array.get_storage_options + ~cf.NetCDF4Array._lock + +Miscellaneous +------------- + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.NetCDF4Array.copy + ~cf.NetCDF4Array.to_memory + +Active storage +-------------- + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.NetCDF4Array.actify + ~cf.NetCDF4Array.get_active_url + ~cf.NetCDF4Array.get_active_method + ~cf.NetCDF4Array.get_active_axis + +Special +------- + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.NetCDF4Array.__dask_tokenize__ ~cf.NetCDF4Array.__getitem__ + +Docstring substitutions +----------------------- + +.. rubric:: Methods + +.. 
autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.NetCDF4Array._docstring_special_substitutions + ~cf.NetCDF4Array._docstring_substitutions + ~cf.NetCDF4Array._docstring_package_depth + ~cf.NetCDF4Array._docstring_method_exclusions + +Deprecated +---------- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.NetCDF4Array.get_missing_values diff --git a/docs/source/field_analysis.rst b/docs/source/field_analysis.rst index 3083654e13..9a42d0b737 100644 --- a/docs/source/field_analysis.rst +++ b/docs/source/field_analysis.rst @@ -879,14 +879,14 @@ basis, and will only be done if all of the following criteria are met: * the data values are in netCDF-4 files on disk (rather than in any other file format, or in memory) and are not numerically packed; -* the `~cf.Data.active_storage` attribute of the `Data` being - collapsed is `True`, indicating that active storage operations may - be possible. In general, it will only be `True` for data that are in - files on disk, are not compressed by convention and have not had any - other operations applied; +* the `~cf.Data.active_storage` attribute of the `cf.Data` object + being collapsed is `True`, indicating that active storage operations + may be possible. In general, it will only be `True` for data that + are in files on disk, are not compressed by convention and have not + had any other operations applied; -* it is possible to import the `activestorage.Active` class. +* it is possible to import the external `activestorage.Active` class. The performance improvements from using active storage operations will increase the closer the active storage server is to the data diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 61215779aa..b0c44cbd43 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -268,6 +268,11 @@ environments for which these features are not required. * `matplotlib `_, version 3.0.0 or newer. +.. rubric:: Active storage collapses + +* `activestorage `_, version 1.0.0 + or newer. + ---- .. _Tests: diff --git a/docs/source/releases.rst b/docs/source/releases.rst index 5f1b2bc9e8..46fcf26a81 100644 --- a/docs/source/releases.rst +++ b/docs/source/releases.rst @@ -16,6 +16,7 @@ Documentation for all versions of cf. **CF-1.11** ----------- +* `Version 3.17.0 `_ (2024-??-??) * `Version 3.16.0 `_ (2023-12-06) **CF-1.10** @@ -25,9 +26,9 @@ Documentation for all versions of cf. 
* `Version 3.15.3 `_ (2023-08-31) * `Version 3.15.2 `_ (2023-07-21) * `Version 3.15.1 `_ (2023-06-09) -* `Version 3.15.0 `_ (2023-04-27) -* `Version 3.14.1 `_ (2023-03-10) -* `Version 3.14.0 `_ (2023-01-31 *first Dask release*) +* `Version 3.15.0 `_ (2023-04-27) +* `Version 3.14.1 `_ (2023-03-10) +* `Version 3.14.0 `_ (2023-01-31 *first Dask release*) ---- From 28fdf10b0602e9ead0c4e1dc3244991b9be6356c Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 14 Feb 2024 10:16:15 +0000 Subject: [PATCH 048/134] dev --- cf/__init__.py | 1 + docs/source/cf_data_model.rst | 110 ---------------------------------- docs/source/class.rst | 3 +- docs/source/class/cf.Data.rst | 12 ++++ docs/source/index.rst | 2 - docs/source/introduction.rst | 101 +++++++++++++++++++++++++++++++ docs/source/performance.rst | 35 ++++++----- docs/source/tutorial.rst | 12 ++-- 8 files changed, 138 insertions(+), 138 deletions(-) delete mode 100644 docs/source/cf_data_model.rst diff --git a/cf/__init__.py b/cf/__init__.py index 793e62b6d2..eda9241e3a 100644 --- a/cf/__init__.py +++ b/cf/__init__.py @@ -292,6 +292,7 @@ RaggedIndexedArray, RaggedIndexedContiguousArray, SubsampledArray, + UMArray, ) from .data.fragment import ( diff --git a/docs/source/cf_data_model.rst b/docs/source/cf_data_model.rst deleted file mode 100644 index 1838fe8a1d..0000000000 --- a/docs/source/cf_data_model.rst +++ /dev/null @@ -1,110 +0,0 @@ -.. currentmodule:: cf -.. default-role:: obj - -.. raw:: html - - - -.. role:: small - -.. _CF-data-model: - -**CF data model** -================= - ----- - -Version |release| for version |version| of the CF conventions. - -The CF (Climate and Forecast) metadata conventions -(http://cfconventions.org) provide a description of the physical -meaning of data and of their spatial and temporal properties and are -designed to promote the creation, processing, and sharing of climate -and forecasting data using netCDF files and libraries -(https://www.unidata.ucar.edu/software/netcdf). - -`The CF data model -`_ -identifies the fundamental elements ("constructs") of the CF -conventions and shows how they relate to each other, independently of -the netCDF encoding. - -The CF data model defines a **field construct** for storing data with -all of its metadata. It is defined as follows: - -.. glossary:: - - field construct - corresponds to a CF-netCDF data variable with all of its - metadata. It consists of - - - descriptive properties that apply to field construct as a whole - (e.g. the standard name), - - - a data array, - - - a **domain construct** that describes the locations of each cell - of the data array (i.e. the "domain"), - - - **metadata constructs** that describe the physical nature of the - data array, defined by - - .. glossary:: - - field ancillary constructs - corresponding to CF-netCDF ancillary variables - - cell method constructs - corresponding to a CF-netCDF cell_methods attribute of data - variable - - domain construct - that describes the locations of each cell of the domain. It may - exist independently of a **field construct** and consists of - - - descriptive properties that apply to domain construct as a whole, - - - **metadata constructs** that describe the locations of each cell - of the domain, defined by - - .. 
glossary:: - - domain axis constructs - corresponding to CF-netCDF dimensions or scalar coordinate - variables - - dimension coordinate constructs - corresponding to CF-netCDF coordinate variables or numeric - scalar coordinate variables - - auxiliary coordinate constructs - corresponding to CF-netCDF auxiliary coordinate variables and - non-numeric scalar coordinate variables - - coordinate reference constructs - corresponding to CF-netCDF grid mapping variables or the - formula_terms attribute of a coordinate variable - - domain ancillary constructs - corresponding to CF-netCDF variables named by the - formula_terms attribute of a coordinate variable - - cell measure constructs - corresponding to CF-netCDF cell measure variables - - domain topology constructs - corresponding to CF-netCDF UGRID mesh topology variables - - cell connectivity constructs - corresponding to CF-netCDF UGRID connectivity variables - ----- - - -| - -.. figure:: images/cfdm_field.svg - - *The constructs of the CF data model described using UML. The field construct corresponds to a CF-netCDF data variable. The domain construct provides the linkage between the field construct and the constructs which describe measurement locations and cell properties. It is useful to define an abstract generic coordinate construct that can be used to refer to coordinates when the their type (dimension or auxiliary coordinate construct) is not an issue.* - ----- diff --git a/docs/source/class.rst b/docs/source/class.rst index 4b307ecc2b..cd236ff624 100644 --- a/docs/source/class.rst +++ b/docs/source/class.rst @@ -79,9 +79,8 @@ Data classes cf.Data cf.H5netcdfArray cf.NetCDF4Array - cf.CFANetCDFArray cf.FullArray - cf.UMAarray + cf.UMArray Data compression classes ------------------------ diff --git a/docs/source/class/cf.Data.rst b/docs/source/class/cf.Data.rst index f71ed20e14..62f9294ba2 100644 --- a/docs/source/class/cf.Data.rst +++ b/docs/source/class/cf.Data.rst @@ -590,6 +590,18 @@ Compression by convention ~cf.Data.compressed_array +Active storage +-------------- + +.. rubric:: Attributes + +.. autosummary:: + :nosignatures: + :toctree: ../attribute/ + :template: attribute.rst + + ~cf.Data.active_storage + Miscellaneous ------------- diff --git a/docs/source/index.rst b/docs/source/index.rst index 2a7bc15e5e..b8b196a8d8 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -20,12 +20,10 @@ :maxdepth: 1 introduction - cf_data_model installation cheat_sheet recipes/index tutorial - analysis api_reference aggregation_rules performance diff --git a/docs/source/introduction.rst b/docs/source/introduction.rst index 64d4940bda..85714e8b6e 100644 --- a/docs/source/introduction.rst +++ b/docs/source/introduction.rst @@ -13,6 +13,12 @@ .. role:: blue +.. raw:: html + + + +.. role:: small + **Introduction** ================ @@ -154,6 +160,101 @@ installed, which ---- +.. _CF-data-model: + +**CF data model** +----------------- + +The CF (Climate and Forecast) metadata conventions +(http://cfconventions.org) provide a description of the physical +meaning of data and of their spatial and temporal properties and are +designed to promote the creation, processing, and sharing of climate +and forecasting data using netCDF files and libraries +(https://www.unidata.ucar.edu/software/netcdf). + +`The CF data model +`_ +identifies the fundamental elements ("constructs") of the CF +conventions and shows how they relate to each other, independently of +the netCDF encoding. 
+ +The CF data model defines a **field construct** for storing data with +all of its metadata. It is defined in CF-|version| as follows: + +.. glossary:: + + field construct + corresponds to a CF-netCDF data variable with all of its + metadata. It consists of + + - descriptive properties that apply to field construct as a whole + (e.g. the standard name), + + - a data array, + + - a **domain construct** that describes the locations of each cell + of the data array (i.e. the "domain"), + + - **metadata constructs** that describe the physical nature of the + data array, defined by + + .. glossary:: + + field ancillary constructs + corresponding to CF-netCDF ancillary variables + + cell method constructs + corresponding to a CF-netCDF cell_methods attribute of data + variable + + domain construct + that describes the locations of each cell of the domain. It may + exist independently of a **field construct** and consists of + + - descriptive properties that apply to domain construct as a whole, + + - **metadata constructs** that describe the locations of each cell + of the domain, defined by + + .. glossary:: + + domain axis constructs + corresponding to CF-netCDF dimensions or scalar coordinate + variables + + dimension coordinate constructs + corresponding to CF-netCDF coordinate variables or numeric + scalar coordinate variables + + auxiliary coordinate constructs + corresponding to CF-netCDF auxiliary coordinate variables and + non-numeric scalar coordinate variables + + coordinate reference constructs + corresponding to CF-netCDF grid mapping variables or the + formula_terms attribute of a coordinate variable + + domain ancillary constructs + corresponding to CF-netCDF variables named by the + formula_terms attribute of a coordinate variable + + cell measure constructs + corresponding to CF-netCDF cell measure variables + + domain topology constructs + corresponding to CF-netCDF UGRID mesh topology variables + + cell connectivity constructs + corresponding to CF-netCDF UGRID connectivity variables + +| + +.. figure:: images/cfdm_field.svg + + *The constructs of the CF data model described using UML. The field construct corresponds to a CF-netCDF data variable. The domain construct provides the linkage between the field construct and the constructs which describe measurement locations and cell properties. It is useful to define an abstract generic coordinate construct that can be used to refer to coordinates when the their type (dimension or auxiliary coordinate construct) is not an issue.* + +---- + **References** -------------- diff --git a/docs/source/performance.rst b/docs/source/performance.rst index 74b3f85c00..9ceb7cd446 100644 --- a/docs/source/performance.rst +++ b/docs/source/performance.rst @@ -83,7 +83,7 @@ Some notable cases where non-lazy computations occur are: The weights may also be stored on disk for re-use in future sessions by using the ``weights_file`` keyword parameter. - + * **Aggregation** When two or more field or domain constructs are aggregated to form a @@ -100,7 +100,8 @@ Some notable cases where non-lazy computations occur are: convention (such as compression by gathering, some discrete sampling geometries, etc.), the compression metadata, such as the "list" array for compression by gathering, are read from disk non-lazily - during the `cf.read` operation. + during the `cf.read` operation. The compressed data themselves are, + however, accessed lazily. ---- @@ -173,15 +174,15 @@ scheduler has been defined will use that scheduler. 
>>> import cf >>> import dask + >>> # To make cf computations use local processes: >>> dask.config.set(scheduler='processes') - >>> # cf computations will now use local processes + >>> # To make cf computations be single-threaded >>> dask.config.set(scheduler='synchronous') - >>> # cf computations will now be single-threaded + >>> # To make cf computations use local threads (the default) >>> dask.config.set(scheduler='threads') - >>> # cf computations will now use local threads (the default) + >>> # To make cf computations use a user-defined distributed cluster >>> from dask.distributed import Client >>> client = Client('127.0.0.1:8786') - >>> # cf computations will now use the defined distributed cluster Operations are stored by Dask in `task graphs `_ where each task @@ -221,26 +222,24 @@ basic data computation over four chunks: [600 650 702 756 812 870]] The image file ``dask_task_graph.png`` contains the visualisation of -the dask task graph, showing the operations on each chunk. The -operations were only executed when their result was requested with the -final ``e.array`` command. The boxes represent the data chunks and the -circles represent the operations to be performed on the chunks. The -five boxes in the bottom row are the starting data (i.e. the four -chunks of ``d`` and the scalar ``2``), and the four boxes in the top -row are the result of the computations which combine to produce the -values in ``e.array``. +the Dask task graph, showing the operations on each chunk: .. figure:: images/dask_task_graph.svg :scale: 8 % - *The dask task graph from dask_task_graph.png* - +The operations were only executed when their result was requested with +the final ``e.array`` command. The boxes in ``dask_task_graph.png`` +represent the data chunks and the circles represent the operations to +be performed on the chunks. The five boxes in the bottom row are the +starting data (i.e. the four chunks of ``d`` and the scalar ``2``), +and the four boxes in the top row are the result of the computations +which combine to produce the values in ``e.array``. NetCDF file access ^^^^^^^^^^^^^^^^^^ -Note that reading from and writing to netCDF files is currently a -serial operation, i.e. only one Dask chunk can access any netCDF file +Note that reading from and writing to netCDF files are currently only +serial operations, i.e. only one Dask chunk can access any netCDF file at any given moment. This situation can result in slower-than-expected performance. When a thread-safe version of the netCDF-C library is available we hope to lift this restriction. diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst index f4f1601c55..76a4f46427 100644 --- a/docs/source/tutorial.rst +++ b/docs/source/tutorial.rst @@ -4609,20 +4609,20 @@ All the of above examples use arrays in memory to construct the data instances for the field and metadata constructs. It is, however, possible to create data from arrays that reside on disk. The `cf.read` function creates data in this manner. A pointer to an array in a -netCDF file can be stored in a `cf.NetCDFArray` or +netCDF file can be stored in a `cf.NetCDF4Array` or `~cf.H5netcdfAarray` instance, which is is used to initialise a `cf.Data` instance. .. 
code-block:: python :caption: *Define a variable from a dataset with the netCDF package - and use it to create a NetCDFArray instance with which to - initialise a Data instance.* + and use it to create a NetCDF4Array instance with which + to initialise a Data instance.* >>> import netCDF4 >>> nc = netCDF4.Dataset('file.nc', 'r') >>> v = nc.variables['ta'] - >>> netcdf_array = cf.NetCDFArray(filename='file.nc', address='ta', - ... dtype=v.dtype, shape=v.shape) + >>> netcdf_array = cf.NetCDF4Array(filename='file.nc', address='ta', + ... dtype=v.dtype, shape=v.shape) >>> data_disk = cf.Data(netcdf_array) @@ -4638,7 +4638,7 @@ netCDF file can be stored in a `cf.NetCDFArray` or Note that data type, number of dimensions, dimension sizes and number of elements of the array on disk that are used to initialise the -`cf.NetCDFArray` instance are those expected by the CF data model, +`cf.NetCDF4Array` instance are those expected by the CF data model, which may be different to those of the netCDF variable in the file (although they are the same in the above example). For example, a netCDF character array of shape ``(12, 9)`` is viewed in cf as a From 4fcb960019669fe76d4bd8c7f38eb0b0e671651f Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 14 Feb 2024 17:01:17 +0000 Subject: [PATCH 049/134] dev --- cf/data/array/mixin/activestoragemixin.py | 4 ++-- cf/data/collapse/collapse_active.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cf/data/array/mixin/activestoragemixin.py b/cf/data/array/mixin/activestoragemixin.py index 2aa7b928fd..14c4343bd3 100644 --- a/cf/data/array/mixin/activestoragemixin.py +++ b/cf/data/array/mixin/activestoragemixin.py @@ -43,8 +43,8 @@ def __getitem__(self, indices): active = Active( self.get_filename(), self.get_address(), - # storage_options=self.get_storage_options(), - # active_storage_url=self.get_active_url(), + storage_options=self.get_storage_options(), + active_storage_url=self.get_active_url(), ) active.method = method active.components = True diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index 8728116349..c4f2866736 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -288,7 +288,7 @@ def actify(a, method, axis=None): # function. logger.warning( "At compute time chunks will be collapsed with " - f"active storage URL: {active_storage_url()}" + f"active storage at URL {active_storage_url()}" ) return ( da.Array(dsk, a.name, a.chunks, a.dtype, a._meta), From 96cdc8fe8257bc2982b7db5d37f069c5b8ca1757 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 16 Feb 2024 16:02:34 +0000 Subject: [PATCH 050/134] dev --- cf/aggregate.py | 12 +++++------ cf/data/array/mixin/activestoragemixin.py | 25 +++++++++++++++-------- cf/data/collapse/collapse_active.py | 4 ++-- cf/data/data.py | 8 ++++++-- 4 files changed, 30 insertions(+), 19 deletions(-) diff --git a/cf/aggregate.py b/cf/aggregate.py index 6fe53c2c9f..8311f04d39 100644 --- a/cf/aggregate.py +++ b/cf/aggregate.py @@ -2856,7 +2856,7 @@ def aggregate( output_meta_append(meta) continue - + # ------------------------------------------------------------ # This field has a structural signature, so append it to the # list of fields with the same structural signature. @@ -3007,7 +3007,7 @@ def aggregate( # Take a shallow copy in case we abandon and want to output # the original, unaggregated fields. 
meta0 = meta[:] - + unaggregatable = False for axis in aggregating_axes: @@ -3178,7 +3178,7 @@ def aggregate( # # 0.0012 , 0.019 , 0.55 , 2.1 # - # compared with current method timings of + # compared with new timings of # # 0.00035, 0.0012, 0.013, 0.064 # ------------------------------------------------ @@ -3194,7 +3194,7 @@ def aggregate( copy=copy, ) field.set_data(data, set_axes=False, copy=False) - + # Concatenate the metadata construct data for construct_type, value in data_concatenation.items(): for (key, iaxis), constructs in value.items(): @@ -4748,7 +4748,7 @@ def _aggregate_2_fields( # Ensure that the axis orders are the same in both fields transpose_axes1 = [dim0_name_map[axis0] for axis0 in data_axes0] - if transpose_axes1 != data_axes1: + if transpose_axes1 != list(data_axes1): parent1.transpose(transpose_axes1, inplace=True) construct_type = parent0.construct_type @@ -4762,7 +4762,7 @@ def _aggregate_2_fields( data_concatenation[construct_type].setdefault( axis, [parent1.get_data()] ).append(parent0.get_data()) - + # Update the size of the aggregating axis in parent0 domain_axis = constructs0[adim0] domain_axis += constructs1[adim1].get_size() diff --git a/cf/data/array/mixin/activestoragemixin.py b/cf/data/array/mixin/activestoragemixin.py index 14c4343bd3..9554a18779 100644 --- a/cf/data/array/mixin/activestoragemixin.py +++ b/cf/data/array/mixin/activestoragemixin.py @@ -38,13 +38,20 @@ def __getitem__(self, indices): # numpy array. return super().__getitem__(indices) + import urllib # Still here? Then do an active storage reduction. Returns a # dictionary of reduced values. + + # Hack for testing! + filename = self.get_filename() + filename = urllib.parse.urlparse(filename).path[1:] + active = Active( - self.get_filename(), + filename, self.get_address(), + storage_type ='s3', # Hack for testing! storage_options=self.get_storage_options(), - active_storage_url=self.get_active_url(), + active_storage_url=self.get_active_storage_url(), ) active.method = method active.components = True @@ -57,7 +64,7 @@ def __getitem__(self, indices): return active[indices] - def actify(self, method, axis=None, active_url=None): + def actify(self, method, axis=None, active_storage_url=None): """Return a new actified `{{class}}` instance. The new instance is a deep copy of the original, with the @@ -79,7 +86,7 @@ def actify(self, method, axis=None, active_url=None): Axis or axes along which to operate. By default, or if `None`, flattened input is used. - active_url: `str` or `None`, optional + active_storage_url: `str` or `None`, optional The URL of the active storage server. :Returns: @@ -108,7 +115,7 @@ def actify(self, method, axis=None, active_url=None): a = self.copy() a._custom["active_method"] = method a._custom["active_axis"] = axis - a._custom["active_url"] = active_url + a._custom["active_storage_url"] = active_storage_url return a def get_active_axis(self): @@ -116,7 +123,7 @@ def get_active_axis(self): .. versionadded:: 3.17.0 - .. seealso:: `get_active_method`, `get_active_url` + .. seealso:: `get_active_method`, `get_active_storage_url` :Returns: @@ -132,7 +139,7 @@ def get_active_method(self): .. versionadded:: 3.17.0 - .. seealso:: `get_active_axis`, `get_active_url` + .. seealso:: `get_active_axis`, `get_active_storage_url` :Returns: @@ -143,7 +150,7 @@ def get_active_method(self): """ return self._custom.get("active_method") - def get_active_url(self): + def get_active_storage_url(self): """Return the the active storage URL. .. 
versionadded:: 3.17.0 @@ -157,4 +164,4 @@ def get_active_url(self): active storage reduction. """ - self._custom.get("active_url") + return self._custom.get("active_storage_url") diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index c4f2866736..f253d06b3e 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -270,7 +270,7 @@ def actify(a, method, axis=None): # to files, so try to insert an actified copy into the dask # graph. try: - dsk[key] = value.actify(method, axis, active_url=url) + dsk[key] = value.actify(method, axis, active_storage_url=url) except AttributeError: # This data definition doesn't have an 'actify' method, # and so doesn't support active storage reductions. @@ -288,7 +288,7 @@ def actify(a, method, axis=None): # function. logger.warning( "At compute time chunks will be collapsed with " - f"active storage at URL {active_storage_url()}" + f"active storage at URL {url}" ) return ( da.Array(dsk, a.name, a.chunks, a.dtype, a._meta), diff --git a/cf/data/data.py b/cf/data/data.py index 535a137d47..404d160f6a 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -4027,15 +4027,17 @@ def concatenate( data0 = data[0] units0 = data0.Units - + print ('data0.a_s=', data0.active_storage) + if copy: data0 = data0.copy() copied = True else: copied = False - + processed_data = [] for index, data1 in enumerate(data): + print ('data1.a_s=', data1.active_storage) # Turn any scalar array into a 1-d array if not data1.ndim: if not copied: @@ -4106,6 +4108,7 @@ def concatenate( if not d.active_storage: # Set the output active storage status to False when # any input data instance has False status + print ('nuking active in concatenate') active = _NONE break @@ -4780,6 +4783,7 @@ def active_storage(self): False """ +# return True return ( self._custom.get("active_storage", False) and not self.get_compression_type() From 16131f83c3a689acf9e713fe5b45f51e56f84e70 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 4 Mar 2024 22:23:39 +0000 Subject: [PATCH 051/134] dev --- cf/__init__.py | 5 +- cf/aggregate.py | 8 +- cf/cfimplementation.py | 69 +-- cf/data/array/__init__.py | 3 +- cf/data/array/cfah5netcdfarray.py | 197 +++++++ cf/data/array/cfanetcdf4array.py | 197 +++++++ cf/data/array/cfanetcdfarray.py | 10 +- cf/data/array/h5netcdfarray.py | 6 +- cf/data/array/mixin/__init__.py | 1 + cf/data/array/mixin/activestoragemixin.py | 32 +- cf/data/array/mixin/cfamixin.py | 639 ++++++++++++++++++++++ cf/data/array/netcdfarray.py | 4 +- cf/data/collapse/collapse.py | 30 +- cf/data/collapse/collapse_active.py | 22 +- cf/data/data.py | 16 +- cf/data/fragment/h5netcdffragmentarray.py | 2 +- cf/data/fragment/netcdf4fragmentarray.py | 2 +- cf/data/fragment/netcdffragmentarray.py | 6 +- cf/functions.py | 12 +- cf/read_write/netcdf/netcdfread.py | 18 +- cf/read_write/read.py | 55 +- cf/regrid/regrid.py | 6 + 22 files changed, 1189 insertions(+), 151 deletions(-) create mode 100644 cf/data/array/cfah5netcdfarray.py create mode 100644 cf/data/array/cfanetcdf4array.py create mode 100644 cf/data/array/mixin/cfamixin.py diff --git a/cf/__init__.py b/cf/__init__.py index eda9241e3a..45e30f54b9 100644 --- a/cf/__init__.py +++ b/cf/__init__.py @@ -206,7 +206,7 @@ ) # Check the version of cfdm -_minimum_vn = "1.11.1.0" +_minimum_vn = "1.11.0.0" # TODO _maximum_vn = "1.11.2.0" _cfdm_version = Version(cfdm.__version__) if not Version(_minimum_vn) <= _cfdm_version < Version(_maximum_vn): @@ -281,7 +281,8 @@ from .data.array import ( 
BoundsFromNodesArray, CellConnectivityArray, - CFANetCDFArray, + CFAH5netcdfArray, + CFANetCDF4Array, FullArray, GatheredArray, H5netcdfArray, diff --git a/cf/aggregate.py b/cf/aggregate.py index 8311f04d39..b45ebbbdde 100644 --- a/cf/aggregate.py +++ b/cf/aggregate.py @@ -2856,7 +2856,7 @@ def aggregate( output_meta_append(meta) continue - + # ------------------------------------------------------------ # This field has a structural signature, so append it to the # list of fields with the same structural signature. @@ -3007,7 +3007,7 @@ def aggregate( # Take a shallow copy in case we abandon and want to output # the original, unaggregated fields. meta0 = meta[:] - + unaggregatable = False for axis in aggregating_axes: @@ -3194,7 +3194,7 @@ def aggregate( copy=copy, ) field.set_data(data, set_axes=False, copy=False) - + # Concatenate the metadata construct data for construct_type, value in data_concatenation.items(): for (key, iaxis), constructs in value.items(): @@ -4762,7 +4762,7 @@ def _aggregate_2_fields( data_concatenation[construct_type].setdefault( axis, [parent1.get_data()] ).append(parent0.get_data()) - + # Update the size of the aggregating axis in parent0 domain_axis = constructs0[adim0] domain_axis += constructs1[adim1].get_size() diff --git a/cf/cfimplementation.py b/cf/cfimplementation.py index eb39374438..435f7621bd 100644 --- a/cf/cfimplementation.py +++ b/cf/cfimplementation.py @@ -29,7 +29,8 @@ from .data.array import ( BoundsFromNodesArray, CellConnectivityArray, - CFANetCDFArray, + CFAH5netcdfArray, + CFANetCDF4Array, GatheredArray, H5netcdfArray, NetCDF4Array, @@ -86,65 +87,39 @@ def set_construct(self, parent, construct, axes=None, copy=True, **kwargs): parent, construct, axes=axes, copy=copy, **kwargs ) - def initialise_CFANetCDFArray( - self, - filename=None, - address=None, - dtype=None, - mask=True, - units=False, - calendar=False, - instructions=None, - substitutions=None, - term=None, - x=None, - **kwargs, - ): - """Return a `CFANetCDFArray` instance. + def initialise_CFANetCDF4Array(self, **kwargs): + """Return a `CFANetCDF4Array` instance. :Parameters: - filename: `str` - - address: (sequence of) `str` or `int` - - dytpe: `numpy.dtype` - - mask: `bool`, optional + kwargs: optional + Initialisation parameters to pass to the new instance. - units: `str` or `None`, optional + :Returns: - calendar: `str` or `None`, optional + `CFANetCDF4Array` - instructions: `str`, optional + """ + cls = self.get_class("CFANetCDF4Array") + return cls(**kwargs) - substitutions: `dict`, optional + def initialise_CFAH5netcdfArray(self, **kwargs): + """Return a `CFAH5netcdfArray` instance. - term: `str`, optional + .. versionadded:: NEXTVERSION - x: `dict`, optional + :Parameters: kwargs: optional - Ignored. + Initialisation parameters to pass to the new instance. 
:Returns: - `CFANetCDFArray` + `CFAH5netcdfArray` """ - cls = self.get_class("CFANetCDFArray") - return cls( - filename=filename, - address=address, - dtype=dtype, - mask=mask, - units=units, - calendar=calendar, - instructions=instructions, - substitutions=substitutions, - term=term, - x=x, - ) + cls = self.get_class("CFAH5netcdfArray") + return cls(**kwargs) _implementation = CFImplementation( @@ -153,7 +128,8 @@ def initialise_CFANetCDFArray( CellConnectivity=CellConnectivity, CellMeasure=CellMeasure, CellMethod=CellMethod, - CFANetCDFArray=CFANetCDFArray, + CFAH5netcdfArray=CFAH5netcdfArray, + CFANetCDF4Array=CFANetCDF4Array, CoordinateReference=CoordinateReference, DimensionCoordinate=DimensionCoordinate, Domain=Domain, @@ -211,7 +187,8 @@ def implementation(): 'CellConnectivityArray': cf.data.array.cellconnectivityarray.CellConnectivityArray, 'CellMeasure': cf.cellmeasure.CellMeasure, 'CellMethod': cf.cellmethod.CellMethod, - 'CFANetCDFArray': cf.data.array.cfanetcdfarray.CFANetCDFArray, + 'CFAH5netcdfArray': cf.data.array.cfah5netcdfarray.CFAH5netcdfArray, + 'CFANetCDF4Array': cf.data.array.cfanetcdf4array.CFANetCDF4Array, 'CoordinateReference': cf.coordinatereference.CoordinateReference, 'DimensionCoordinate': cf.dimensioncoordinate.DimensionCoordinate, 'Domain': cf.domain.Domain, diff --git a/cf/data/array/__init__.py b/cf/data/array/__init__.py index 0b16361f53..cd2c53766b 100644 --- a/cf/data/array/__init__.py +++ b/cf/data/array/__init__.py @@ -1,6 +1,7 @@ from .boundsfromnodesarray import BoundsFromNodesArray from .cellconnectivityarray import CellConnectivityArray -from .cfanetcdfarray import CFANetCDFArray +from .cfah5netcdfarray import CFAH5netcdfArray +from .cfanetcdf4array import CFANetCDF4Array from .fullarray import FullArray from .gatheredarray import GatheredArray from .h5netcdfarray import H5netcdfArray diff --git a/cf/data/array/cfah5netcdfarray.py b/cf/data/array/cfah5netcdfarray.py new file mode 100644 index 0000000000..6b1acc3a09 --- /dev/null +++ b/cf/data/array/cfah5netcdfarray.py @@ -0,0 +1,197 @@ +from .h5netcdfarray import H5netcdfArray +from .mixin import CFAMixin + + +class CFAH5netcdfArray(CFAMixin, H5netcdfArray): + """A CFA-netCDF array accessed with `h5netcdf` + + .. versionadded:: NEXTVERSION + + """ + + def __init__( + self, + filename=None, + address=None, + dtype=None, + mask=True, + unpack=True, + units=False, + calendar=False, + instructions=None, + substitutions=None, + term=None, + attributes=None, + storage_options=None, + source=None, + copy=True, + x=None, + ): + """**Initialisation** + + :Parameters: + + filename: (sequence of) `str`, optional + The name of the CFA-netCDF file containing the + array. If a sequence then it must contain one element. + + address: (sequence of) `str`, optional + The name of the CFA-netCDF aggregation variable for the + array. If a sequence then it must contain one element. + + dtype: `numpy.dtype` + The data type of the aggregated data array. May be + `None` if the numpy data-type is not known (which can + be the case for netCDF string types, for example). + + mask: `bool` + If True (the default) then mask by convention when + reading data from disk. + + A netCDF array is masked depending on the values of any of + the netCDF variable attributes ``valid_min``, + ``valid_max``, ``valid_range``, ``_FillValue`` and + ``missing_value``. + + {{init unpack: `bool`, optional}} + + .. versionadded:: NEXTVERSION + + units: `str` or `None`, optional + The units of the aggregated data. 
Set to `None` to + indicate that there are no units. + + calendar: `str` or `None`, optional + The calendar of the aggregated data. Set to `None` to + indicate the CF default calendar, if applicable. + + instructions: `str`, optional + The ``aggregated_data`` attribute value as found on + the CFA netCDF variable. If set then this will be used + to improve the performance of `__dask_tokenize__`. + + substitutions: `dict`, optional + A dictionary whose key/value pairs define text + substitutions to be applied to the fragment file + names. Each key must be specified with the ``${...}`` + syntax, for instance ``{'${base}': 'sub'}``. + + .. versionadded:: 3.15.0 + + term: `str`, optional + The name of a non-standard aggregation instruction + term from which the array is to be created, instead of + creating the aggregated data in the standard + terms. If set then *address* must be the name of the + term's CFA-netCDF aggregation instruction variable, + which must be defined on the fragment dimensions and + no others. Each value of the aggregation instruction + variable will be broadcast across the shape of the + corresponding fragment. + + *Parameter example:* + ``address='cfa_tracking_id', term='tracking_id'`` + + .. versionadded:: 3.15.0 + + storage_options: `dict` or `None`, optional + Key/value pairs to be passed on to the creation of + `s3fs.S3FileSystem` file systems to control the + opening of fragment files in an S3 object + stores. Ignored for fragment files not in S3 object + stores, i.e. those whose names do not start with + ``s3:``. + + If an ``'endpoint_url'`` key is not in + *storage_options* then one will be automatically + derived for accessing each S3 fragment file. For + example, for a fragment file name of + ``'s3://store/data/file.nc'``, an ``'endpoint_url'`` + key with value ``'https://store'`` would be created. + + *Parameter example:* + For a fragment file name of + ``'s3://store/data/file.nc'``, the following are + equivalent: ``None``, ``{}`` and ``{'endpoint_url': + 'https://store'}``. + + *Parameter example:* + ``{'key: 'scaleway-api-key...', 'secret': + 'scaleway-secretkey...', 'endpoint_url': + 'https://s3.fr-par.scw.cloud', 'client_kwargs': + {'region_name': 'fr-par'}}`` + + .. 
versionadded:: NEXTVERSION + + {{init source: optional}} + + {{init copy: `bool`, optional}} + + """ + if source is not None: + super().__init__(source=source, copy=copy) + + try: + fragment_shape = source.get_fragment_shape() + except AttributeError: + fragment_shape = None + + try: + instructions = source._get_component("instructions") + except AttributeError: + instructions = None + + try: + aggregated_data = source.get_aggregated_data(copy=False) + except AttributeError: + aggregated_data = {} + + try: + substitutions = source.get_substitutions() + except AttributeError: + substitutions = None + + try: + term = source.get_term() + except AttributeError: + term = None + + elif filename is not None: + shape, fragment_shape, aggregated_data = self._parse( + x, term, substitutions + ) + super().__init__( + filename=filename, + address=address, + shape=shape, + dtype=dtype, + mask=mask, + units=units, + calendar=calendar, + copy=copy, + ) + else: + super().__init__( + filename=filename, + address=address, + dtype=dtype, + mask=mask, + units=units, + calendar=calendar, + copy=copy, + ) + + fragment_shape = None + aggregated_data = None + instructions = None + term = None + + self._set_component("fragment_shape", fragment_shape, copy=False) + self._set_component("aggregated_data", aggregated_data, copy=False) + self._set_component("instructions", instructions, copy=False) + self._set_component("term", term, copy=False) + + if substitutions is not None: + self._set_component( + "substitutions", substitutions.copy(), copy=False + ) diff --git a/cf/data/array/cfanetcdf4array.py b/cf/data/array/cfanetcdf4array.py new file mode 100644 index 0000000000..0532bb191d --- /dev/null +++ b/cf/data/array/cfanetcdf4array.py @@ -0,0 +1,197 @@ +from .mixin import CFAMixin +from .netcdf4array import NetCDF4Array + + +class CFANetCDF4Array(CFAMixin, NetCDF4Array): + """A CFA-netCDF array accessed with `netCDF4`. + + .. versionadded:: NEXTVERSION + + """ + + def __init__( + self, + filename=None, + address=None, + dtype=None, + mask=True, + unpack=True, + units=False, + calendar=False, + instructions=None, + substitutions=None, + term=None, + attributes=None, + storage_options=None, + source=None, + copy=True, + x=None, + ): + """**Initialisation** + + :Parameters: + + filename: (sequence of) `str`, optional + The name of the CFA-netCDF file containing the + array. If a sequence then it must contain one element. + + address: (sequence of) `str`, optional + The name of the CFA-netCDF aggregation variable for the + array. If a sequence then it must contain one element. + + dtype: `numpy.dtype` + The data type of the aggregated data array. May be + `None` if the numpy data-type is not known (which can + be the case for netCDF string types, for example). + + mask: `bool` + If True (the default) then mask by convention when + reading data from disk. + + A netCDF array is masked depending on the values of any of + the netCDF variable attributes ``valid_min``, + ``valid_max``, ``valid_range``, ``_FillValue`` and + ``missing_value``. + + {{init unpack: `bool`, optional}} + + .. versionadded:: NEXTVERSION + + units: `str` or `None`, optional + The units of the aggregated data. Set to `None` to + indicate that there are no units. + + calendar: `str` or `None`, optional + The calendar of the aggregated data. Set to `None` to + indicate the CF default calendar, if applicable. + + instructions: `str`, optional + The ``aggregated_data`` attribute value as found on + the CFA netCDF variable. 
If set then this will be used + to improve the performance of `__dask_tokenize__`. + + substitutions: `dict`, optional + A dictionary whose key/value pairs define text + substitutions to be applied to the fragment file + names. Each key must be specified with the ``${...}`` + syntax, for instance ``{'${base}': 'sub'}``. + + .. versionadded:: 3.15.0 + + term: `str`, optional + The name of a non-standard aggregation instruction + term from which the array is to be created, instead of + creating the aggregated data in the standard + terms. If set then *address* must be the name of the + term's CFA-netCDF aggregation instruction variable, + which must be defined on the fragment dimensions and + no others. Each value of the aggregation instruction + variable will be broadcast across the shape of the + corresponding fragment. + + *Parameter example:* + ``address='cfa_tracking_id', term='tracking_id'`` + + .. versionadded:: 3.15.0 + + storage_options: `dict` or `None`, optional + Key/value pairs to be passed on to the creation of + `s3fs.S3FileSystem` file systems to control the + opening of fragment files in an S3 object + stores. Ignored for fragment files not in S3 object + stores, i.e. those whose names do not start with + ``s3:``. + + If an ``'endpoint_url'`` key is not in + *storage_options* then one will be automatically + derived for accessing each S3 fragment file. For + example, for a fragment file name of + ``'s3://store/data/file.nc'``, an ``'endpoint_url'`` + key with value ``'https://store'`` would be created. + + *Parameter example:* + For a fragment file name of + ``'s3://store/data/file.nc'``, the following are + equivalent: ``None``, ``{}`` and ``{'endpoint_url': + 'https://store'}``. + + *Parameter example:* + ``{'key: 'scaleway-api-key...', 'secret': + 'scaleway-secretkey...', 'endpoint_url': + 'https://s3.fr-par.scw.cloud', 'client_kwargs': + {'region_name': 'fr-par'}}`` + + .. 
versionadded:: NEXTVERSION + + {{init source: optional}} + + {{init copy: `bool`, optional}} + + """ + if source is not None: + super().__init__(source=source, copy=copy) + + try: + fragment_shape = source.get_fragment_shape() + except AttributeError: + fragment_shape = None + + try: + instructions = source._get_component("instructions") + except AttributeError: + instructions = None + + try: + aggregated_data = source.get_aggregated_data(copy=False) + except AttributeError: + aggregated_data = {} + + try: + substitutions = source.get_substitutions() + except AttributeError: + substitutions = None + + try: + term = source.get_term() + except AttributeError: + term = None + + elif filename is not None: + shape, fragment_shape, aggregated_data = self._parse_cfa( + x, term, substitutions + ) + super().__init__( + filename=filename, + address=address, + shape=shape, + dtype=dtype, + mask=mask, + units=units, + calendar=calendar, + copy=copy, + ) + else: + super().__init__( + filename=filename, + address=address, + dtype=dtype, + mask=mask, + units=units, + calendar=calendar, + copy=copy, + ) + + fragment_shape = None + aggregated_data = None + instructions = None + term = None + + self._set_component("fragment_shape", fragment_shape, copy=False) + self._set_component("aggregated_data", aggregated_data, copy=False) + self._set_component("instructions", instructions, copy=False) + self._set_component("term", term, copy=False) + + if substitutions is not None: + self._set_component( + "substitutions", substitutions.copy(), copy=False + ) diff --git a/cf/data/array/cfanetcdfarray.py b/cf/data/array/cfanetcdfarray.py index 36ec327d0f..84c14f5ef7 100644 --- a/cf/data/array/cfanetcdfarray.py +++ b/cf/data/array/cfanetcdfarray.py @@ -129,7 +129,7 @@ def __init__( 'https://s3.fr-par.scw.cloud', 'client_kwargs': {'region_name': 'fr-par'}}`` - .. versionadded:: 3.17.0 + .. versionadded:: NEXTVERSION {{init source: optional}} @@ -408,13 +408,7 @@ def get_fragment_shape(self): def get_storage_options(self): """Return `s3fs.S3FileSystem` options for accessing S3 fragment files. - If an ``'endpoint_url'`` key is not in the returned options, - then one will be automatically derived for accessing each S3 - fragment file. For example, for a fragment file name of - ``'s3://store/data/file.nc'``, an ``'endpoint_url'`` key with - value ``'https://store'`` would be created. - - .. versionadded:: 3.17.0 + .. versionadded:: NEXTVERSION :Returns: diff --git a/cf/data/array/h5netcdfarray.py b/cf/data/array/h5netcdfarray.py index 62fb791b3e..6fe943ab18 100644 --- a/cf/data/array/h5netcdfarray.py +++ b/cf/data/array/h5netcdfarray.py @@ -19,14 +19,14 @@ class H5netcdfArray( Active storage reduction may be enabled with the `actify` method. See `cf.data.collapse.Collapse` for details. - .. versionadded:: 3.17.0 + .. versionadded:: NEXTVERSION """ def __dask_tokenize__(self): """Return a value fully representative of the object. - .. versionadded:: 3.17.0 + .. versionadded:: NEXTVERSION """ return super().__dask_tokenize__() + (self.get_mask(),) @@ -42,7 +42,7 @@ def _lock(self): that access to all netCDF and HDF files coordinates around the same lock. - .. versionadded:: 3.17.0 + .. 
versionadded:: NEXTVERSION """ return _lock diff --git a/cf/data/array/mixin/__init__.py b/cf/data/array/mixin/__init__.py index d38ac4a307..af036620cf 100644 --- a/cf/data/array/mixin/__init__.py +++ b/cf/data/array/mixin/__init__.py @@ -1,4 +1,5 @@ from .activestoragemixin import ActiveStorageMixin from .arraymixin import ArrayMixin +from .cfamixin import CFAMixin from .compressedarraymixin import CompressedArrayMixin from .filearraymixin import FileArrayMixin diff --git a/cf/data/array/mixin/activestoragemixin.py b/cf/data/array/mixin/activestoragemixin.py index 9554a18779..9da5acdbb9 100644 --- a/cf/data/array/mixin/activestoragemixin.py +++ b/cf/data/array/mixin/activestoragemixin.py @@ -7,7 +7,7 @@ class ActiveStorageMixin: """Mixin class for enabling active storage reductions. - .. versionadded:: 3.17.0 + .. versionadded:: NEXTVERSION """ @@ -29,7 +29,7 @@ def __getitem__(self, indices): then these indices work independently along each dimension (similar to the way vector subscripts work in Fortran). - .. versionadded:: 3.17.0 + .. versionadded:: NEXTVERSION """ method = self.get_active_method() @@ -38,18 +38,18 @@ def __getitem__(self, indices): # numpy array. return super().__getitem__(indices) - import urllib + # import urllib + # Still here? Then do an active storage reduction. Returns a # dictionary of reduced values. - # Hack for testing! filename = self.get_filename() - filename = urllib.parse.urlparse(filename).path[1:] + # filename = urllib.parse.urlparse(filename).path[1:] active = Active( filename, self.get_address(), - storage_type ='s3', # Hack for testing! + # storage_type ='s3', # Hack for testing! storage_options=self.get_storage_options(), active_storage_url=self.get_active_storage_url(), ) @@ -70,7 +70,10 @@ def actify(self, method, axis=None, active_storage_url=None): The new instance is a deep copy of the original, with the additional setting of the active storage method and axis. - .. versionadded:: 3.17.0 + When the instance is indexed, the result of applying the + active storage method to the subspace will be returned. + + .. versionadded:: NEXTVERSION .. seealso:: `set_active_axis`, `set_active_method` @@ -98,7 +101,7 @@ def actify(self, method, axis=None, active_storage_url=None): """ # Don't actify when the data are packed. Note: There may come # a time when activestorage.Active can cope with packed data, - # in which cas we can remove this test. + # in which case we can remove this test. attributes = self.get_attributes({}) if "add_offset" in attributes or "scale_factor" in attributes: raise AttributeError( @@ -107,6 +110,13 @@ def actify(self, method, axis=None, active_storage_url=None): ) if Active is None: + # Note: We don't really expect to be here because if + # activestorage.Active is not available then we + # wouldn't even attempt to actify the instance + # during a reduction (see + # `cf.data.collapse.active_storage`). However, it's + # worth checking in case `actify` is called by the + # user. raise AttributeError( "Can't actify {self.__class__.__name__} when " "activestorage.Active is not available" @@ -121,7 +131,7 @@ def actify(self, method, axis=None, active_storage_url=None): def get_active_axis(self): """Return the active storage reduction axes. - .. versionadded:: 3.17.0 + .. versionadded:: NEXTVERSION .. seealso:: `get_active_method`, `get_active_storage_url` @@ -137,7 +147,7 @@ def get_active_axis(self): def get_active_method(self): """Return the name of the active storage reduction method. - .. versionadded:: 3.17.0 + .. 
versionadded:: NEXTVERSION .. seealso:: `get_active_axis`, `get_active_storage_url` @@ -153,7 +163,7 @@ def get_active_method(self): def get_active_storage_url(self): """Return the the active storage URL. - .. versionadded:: 3.17.0 + .. versionadded:: NEXTVERSION .. seealso:: `get_active_axis`, `get_active_method` diff --git a/cf/data/array/mixin/cfamixin.py b/cf/data/array/mixin/cfamixin.py new file mode 100644 index 0000000000..cfa8f16903 --- /dev/null +++ b/cf/data/array/mixin/cfamixin.py @@ -0,0 +1,639 @@ +from copy import deepcopy +from functools import partial +from itertools import accumulate, product + +import numpy as np + +from ...utils import chunk_locations, chunk_positions + + +class CFAMixin: + """TODO + + .. versionadded:: NEXTVERSION + + """ + + def __new__(cls, *args, **kwargs): + """Store fragment array classes. + + .. versionadded:: (cfdm) 1.10.0.0 + + """ + # Import fragment array classes. Do this here (as opposed to + # outside the class) to avoid a circular import. + from ...fragment import ( + FullFragmentArray, + NetCDFFragmentArray, + UMFragmentArray, + ) + + instance = super().__new__(cls) + instance._FragmentArray = { + "nc": NetCDFFragmentArray, + "um": UMFragmentArray, + "full": FullFragmentArray, + } + return instance + + def _parse_cfa(self, x, term, substitutions): + """TODO""" + aggregated_data = {} + + location = x["location"] + ndim = location.shape[0] + compressed = np.ma.compressed + chunks = [compressed(i).tolist() for i in location] + shape = [sum(c) for c in chunks] + positions = chunk_positions(chunks) + locations = chunk_locations(chunks) + + if term is not None: + # -------------------------------------------------------- + # This fragment contains a constant value, not file + # locations. + # -------------------------------------------------------- + term = x[term] + fragment_shape = term.shape + aggregated_data = { + frag_loc: { + "location": loc, + "fill_value": term[frag_loc].item(), + "format": "full", + } + for frag_loc, loc in zip(positions, locations) + } + else: + a = x["address"] + f = x["file"] + file_fmt = x["format"] + + extra_dimension = f.ndim > ndim + if extra_dimension: + # There is an extra non-fragment dimension + fragment_shape = f.shape[:-1] + else: + fragment_shape = f.shape + + if not a.ndim: + a = (a.item(),) + scalar_address = True + else: + scalar_address = False + + if not file_fmt.ndim: + file_fmt = file_fmt.item() + scalar_fmt = True + else: + scalar_fmt = False + + for frag_loc, location in zip(positions, locations): + if extra_dimension: + filename = compressed(f[frag_loc]).tolist() + if scalar_address: + address = a * len(filename) + else: + address = compressed(a[frag_loc].tolist()) + + if scalar_fmt: + fmt = file_fmt + else: + fmt = compressed(file_fmt[frag_loc]).tolist() + else: + filename = (f[frag_loc].item(),) + if scalar_address: + address = a + else: + address = (a[frag_loc].item(),) + + if scalar_fmt: + fmt = file_fmt + else: + fmt = file_fmt[frag_loc].item() + + aggregated_data[frag_loc] = { + "location": location, + "filename": filename, + "address": address, + "format": fmt, + } + + # Apply string substitutions to the fragment filenames + if substitutions: + for value in aggregated_data.values(): + filenames2 = [] + for filename in value["filename"]: + for base, sub in substitutions.items(): + filename = filename.replace(base, sub) + + filenames2.append(filename) + + value["filename"] = filenames2 + + return shape, fragment_shape, aggregated_data + + def __dask_tokenize__(self): + """Used by 
`dask.base.tokenize`. + + .. versionadded:: 3.14.0 + + """ + out = super().__dask_tokenize__() + aggregated_data = self._get_component("instructions", None) + if aggregated_data is None: + aggregated_data = self.get_aggregated_data(copy=False) + + return out + (aggregated_data,) + + def __getitem__(self, indices): + """x.__getitem__(indices) <==> x[indices]""" + return NotImplemented # pragma: no cover + + def get_aggregated_data(self, copy=True): + """Get the aggregation data dictionary. + + The aggregation data dictionary contains the definitions of + the fragments and the instructions on how to aggregate them. + The keys are indices of the CFA fragment dimensions, + e.g. ``(1, 0, 0 ,0)``. + + .. versionadded:: 3.14.0 + + :Parameters: + + copy: `bool`, optional + Whether or not to return a copy of the aggregation + dictionary. By default a deep copy is returned. + + .. warning:: If False then changing the returned + dictionary in-place will change the + aggregation dictionary stored in the + {{class}} instance, **as well as in any + copies of it**. + + :Returns: + + `dict` + The aggregation data dictionary. + + **Examples** + + >>> a.shape + (12, 1, 73, 144) + >>> a.get_fragment_shape() + (2, 1, 1, 1) + >>> a.get_aggregated_data() + {(0, 0, 0, 0): { + 'file': ('January-June.nc',), + 'address': ('temp',), + 'format': 'nc', + 'location': [(0, 6), (0, 1), (0, 73), (0, 144)]}, + (1, 0, 0, 0): { + 'file': ('July-December.nc',), + 'address': ('temp',), + 'format': 'nc', + 'location': [(6, 12), (0, 1), (0, 73), (0, 144)]}} + + """ + aggregated_data = self._get_component("aggregated_data") + if copy: + aggregated_data = deepcopy(aggregated_data) + + return aggregated_data + + def get_fragmented_dimensions(self): + """Get the positions of dimensions that have two or more fragments. + + .. versionadded:: 3.14.0 + + :Returns: + + `list` + The dimension positions. + + **Examples** + + >>> a.get_fragment_shape() + (20, 1, 40, 1) + >>> a.get_fragmented_dimensions() + [0, 2] + + >>> a.get_fragment_shape() + (1, 1, 1) + >>> a.get_fragmented_dimensions() + [] + + """ + return [ + i for i, size in enumerate(self.get_fragment_shape()) if size > 1 + ] + + def get_fragment_shape(self): + """Get the sizes of the fragment dimensions. + + The fragment dimension sizes are given in the same order as + the aggregated dimension sizes given by `shape`. + + .. versionadded:: 3.14.0 + + :Returns: + + `tuple` + The shape of the fragment dimensions. + + """ + return self._get_component("fragment_shape") + + def get_storage_options(self): + """Return `s3fs.S3FileSystem` options for accessing S3 fragment files. + + .. versionadded:: NEXTVERSION + + :Returns: + + `dict` or `None` + The `s3fs.S3FileSystem` options. + + **Examples** + + >>> f.get_storage_options() + {} + + >>> f.get_storage_options() + {'anon': True} + + >>> f.get_storage_options() + {'key: 'scaleway-api-key...', + 'secret': 'scaleway-secretkey...', + 'endpoint_url': 'https://s3.fr-par.scw.cloud', + 'client_kwargs': {'region_name': 'fr-par'}} + + """ + return super().get_storage_options(create_endpoint_url=False) + + def get_term(self, default=ValueError()): + """The CFA aggregation instruction term for the data, if set. + + .. versionadded:: 3.15.0 + + :Parameters: + + default: optional + Return the value of the *default* parameter if the + term has not been set. If set to an `Exception` + instance then it will be raised instead. + + :Returns: + + `str` + The CFA aggregation instruction term name. 
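+
+        **Examples**
+
+        A minimal usage sketch (the term name ``'tracking_id'`` is
+        illustrative):
+
+        >>> a.get_term(None)
+        'tracking_id'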
+ + """ + return self._get_component("term", default=default) + + def subarray_shapes(self, shapes): + """Create the subarray shapes. + + A fragmented dimenion (i.e. one spanned by two or fragments) + will always have a subarray size equal to the size of each of + its fragments, overriding any other size implied by the + *shapes* parameter. + + .. versionadded:: 3.14.0 + + .. seealso:: `subarrays` + + :Parameters: + + shapes: `int`, sequence, `dict` or `str`, optional + Define the subarray shapes. + + Any value accepted by the *chunks* parameter of the + `dask.array.from_array` function is allowed. + + The subarray sizes implied by *chunks* for a dimension + that has been fragmented are ignored, so their + specification is arbitrary. + + :Returns: + + `tuple` + The subarray sizes along each dimension. + + **Examples** + + >>> a.shape + (12, 1, 73, 144) + >>> a.get_fragment_shape() + (2, 1, 1, 1) + >>> a.fragmented_dimensions() + [0] + >>> a.subarray_shapes(-1) + ((6, 6), (1,), (73,), (144,)) + >>> a.subarray_shapes(None) + ((6, 6), (1,), (73,), (144,)) + >>> a.subarray_shapes("auto") + ((6, 6), (1,), (73,), (144,)) + >>> a.subarray_shapes((None, 1, 40, 50)) + ((6, 6), (1,), (40, 33), (50, 50, 44)) + >>> a.subarray_shapes((None, None, "auto", 50)) + ((6, 6), (1,), (73,), (50, 50, 44)) + >>> a.subarray_shapes({2: 40}) + ((6, 6), (1,), (40, 33), (144,)) + + """ + from numbers import Number + + from dask.array.core import normalize_chunks + + # Positions of fragmented dimensions (i.e. those spanned by + # two or more fragments) + f_dims = self.get_fragmented_dimensions() + + shape = self.shape + aggregated_data = self.get_aggregated_data(copy=False) + + # Create the base chunks. + chunks = [] + ndim = self.ndim + for dim, (n_fragments, size) in enumerate( + zip(self.get_fragment_shape(), self.shape) + ): + if dim in f_dims: + # This aggregated dimension is spanned by two or more + # fragments => set the chunks to be the same size as + # the each fragment. + c = [] + index = [0] * ndim + for j in range(n_fragments): + index[dim] = j + loc = aggregated_data[tuple(index)]["location"][dim] + chunk_size = loc[1] - loc[0] + c.append(chunk_size) + + chunks.append(tuple(c)) + else: + # This aggregated dimension is spanned by exactly one + # fragment => store `None` for now. This will get + # overwritten from 'shapes'. + chunks.append(None) + + if isinstance(shapes, (str, Number)) or shapes is None: + chunks = [ + c if i in f_dims else shapes for i, c in enumerate(chunks) + ] + elif isinstance(shapes, dict): + chunks = [ + chunks[i] if i in f_dims else shapes.get(i, "auto") + for i, c in enumerate(chunks) + ] + else: + # chunks is a sequence + if len(shapes) != ndim: + raise ValueError( + f"Wrong number of 'shapes' elements in {shapes}: " + f"Got {len(shapes)}, expected {self.ndim}" + ) + + chunks = [ + c if i in f_dims else shapes[i] for i, c in enumerate(chunks) + ] + + return normalize_chunks(chunks, shape=shape, dtype=self.dtype) + + def subarrays(self, subarray_shapes): + """Return descriptors for every subarray. + + .. versionadded:: 3.14.0 + + .. seealso:: `subarray_shapes` + + :Parameters: + + subarray_shapes: `tuple` + The subarray sizes along each dimension, as returned + by a prior call to `subarray_shapes`. + + :Returns: + + 6-`tuple` of iterators + Each iterator iterates over a particular descriptor + from each subarray. + + 1. The indices of the aggregated array that correspond + to each subarray. + + 2. The shape of each subarray. + + 3. 
The indices of the fragment that corresponds to each + subarray (some subarrays may be represented by a + part of a fragment). + + 4. The location of each subarray. + + 5. The location on the fragment dimensions of the + fragment that corresponds to each subarray. + + 6. The shape of each fragment that overlaps each chunk. + + **Examples** + + An aggregated array with shape (12, 73, 144) has two + fragments, both with with shape (6, 73, 144). + + >>> a.shape + (12, 73, 144) + >>> a.get_fragment_shape() + (2, 1, 1) + >>> a.fragmented_dimensions() + [0] + >>> subarray_shapes = a.subarray_shapes({1: 40}) + >>> print(subarray_shapes) + ((6, 6), (40, 33), (144,)) + >>> ( + ... u_indices, + ... u_shapes, + ... f_indices, + ... s_locations, + ... f_locations, + ... f_shapes, + ... ) = a.subarrays(subarray_shapes) + >>> for i in u_indices: + ... print(i) + ... + (slice(0, 6, None), slice(0, 40, None), slice(0, 144, None)) + (slice(0, 6, None), slice(40, 73, None), slice(0, 144, None)) + (slice(6, 12, None), slice(0, 40, None), slice(0, 144, None)) + (slice(6, 12, None), slice(40, 73, None), slice(0, 144, None)) + + >>> for i in u_shapes + ... print(i) + ... + (6, 40, 144) + (6, 33, 144) + (6, 40, 144) + (6, 33, 144) + >>> for i in f_indices: + ... print(i) + ... + (slice(None, None, None), slice(0, 40, None), slice(0, 144, None)) + (slice(None, None, None), slice(40, 73, None), slice(0, 144, None)) + (slice(None, None, None), slice(0, 40, None), slice(0, 144, None)) + (slice(None, None, None), slice(40, 73, None), slice(0, 144, None)) + >>> for i in s_locations: + ... print(i) + ... + (0, 0, 0) + (0, 1, 0) + (1, 0, 0) + (1, 1, 0) + >>> for i in f_locations: + ... print(i) + ... + (0, 0, 0) + (0, 0, 0) + (1, 0, 0) + (1, 0, 0) + >>> for i in f_shapes: + ... print(i) + ... + (6, 73, 144) + (6, 73, 144) + (6, 73, 144) + (6, 73, 144) + + """ + f_dims = self.get_fragmented_dimensions() + + # The indices of the uncompressed array that correspond to + # each subarray, the shape of each uncompressed subarray, and + # the location of each subarray + s_locations = [] + u_shapes = [] + u_indices = [] + f_locations = [] + for dim, c in enumerate(subarray_shapes): + nc = len(c) + s_locations.append(tuple(range(nc))) + u_shapes.append(c) + + if dim in f_dims: + f_locations.append(tuple(range(nc))) + else: + # No fragmentation along this dimension + f_locations.append((0,) * nc) + + c = tuple(accumulate((0,) + c)) + u_indices.append([slice(i, j) for i, j in zip(c[:-1], c[1:])]) + + # For each subarray, the part of the fragment that corresponds + # to it. + f_indices = [ + (slice(None),) * len(u) if dim in f_dims else u + for dim, u in enumerate(u_indices) + ] + + # For each subarray, the shape of the fragment that + # corresponds to it. + f_shapes = [ + u_shape if dim in f_dims else (size,) * len(u_shape) + for dim, (u_shape, size) in enumerate(zip(u_shapes, self.shape)) + ] + + return ( + product(*u_indices), + product(*u_shapes), + product(*f_indices), + product(*s_locations), + product(*f_locations), + product(*f_shapes), + ) + + def to_dask_array(self, chunks="auto"): + """Create a dask array with `FragmentArray` chunks. + + .. versionadded:: 3.14.0 + + :Parameters: + + chunks: `int`, `tuple`, `dict` or `str`, optional + Specify the chunking of the returned dask array. + + Any value accepted by the *chunks* parameter of the + `dask.array.from_array` function is allowed. 
+ + The chunk sizes implied by *chunks* for a dimension that + has been fragmented are ignored and replaced with values + that are implied by that dimensions fragment sizes. + + :Returns: + + `dask.array.Array` + + """ + import dask.array as da + from dask.array.core import getter + from dask.base import tokenize + + name = (f"{self.__class__.__name__}-{tokenize(self)}",) + + dtype = self.dtype + units = self.get_units() + calendar = self.get_calendar(None) + aggregated_data = self.get_aggregated_data(copy=False) + + # Set the chunk sizes for the dask array + chunks = self.subarray_shapes(chunks) + + fragment_arrays = self._FragmentArray + if not self.get_mask(): + fragment_arrays = fragment_arrays.copy() + fragment_arrays["nc"] = partial(fragment_arrays["nc"], mask=False) + + storage_options = self.get_storage_options() + + dsk = {} + for ( + u_indices, + u_shape, + f_indices, + chunk_location, + fragment_location, + fragment_shape, + ) in zip(*self.subarrays(chunks)): + kwargs = aggregated_data[fragment_location].copy() + kwargs.pop("location", None) + + fragment_format = kwargs.pop("format", None) + try: + FragmentArray = fragment_arrays[fragment_format] + except KeyError: + raise ValueError( + "Can't get FragmentArray class for unknown " + f"fragment dataset format: {fragment_format!r}" + ) + + if storage_options and kwargs["address"] == "nc": + # Pass on any S3 file system options + kwargs["storage_options"] = storage_options + + fragment = FragmentArray( + dtype=dtype, + shape=fragment_shape, + aggregated_units=units, + aggregated_calendar=calendar, + **kwargs, + ) + + key = f"{fragment.__class__.__name__}-{tokenize(fragment)}" + dsk[key] = fragment + dsk[name + chunk_location] = ( + getter, + key, + f_indices, + False, + getattr(fragment, "_lock", False), + ) + + # Return the dask array + return da.Array(dsk, name[0], chunks=chunks, dtype=dtype) diff --git a/cf/data/array/netcdfarray.py b/cf/data/array/netcdfarray.py index b921f2c418..6cfef4f939 100644 --- a/cf/data/array/netcdfarray.py +++ b/cf/data/array/netcdfarray.py @@ -1,7 +1,7 @@ class NetCDFArray: """A netCDF array accessed with `netCDF4`. - Deprecated at version 3.17.0 and is no longer available. Use + Deprecated at version NEXTVERSION and is no longer available. Use `cf.NetCDF4Array` instead. """ @@ -11,6 +11,6 @@ def __init__(self, *args, **kwargs): from ..functions import DeprecationError raise DeprecationError( - f"{self.__class__.__name__} was deprecated at version 3.17.0 " + f"{self.__class__.__name__} was deprecated at version NEXTVERSION " "and is no longer available. Use cf.NetCDF4Array instead." ) diff --git a/cf/data/collapse/collapse.py b/cf/data/collapse/collapse.py index f6fdb7693c..53dd316e6c 100644 --- a/cf/data/collapse/collapse.py +++ b/cf/data/collapse/collapse.py @@ -124,7 +124,7 @@ def max( {{active_storage: `bool`, optional}} - .. versionadded:: 3.17.0 + .. versionadded:: NEXTVERSION :Returns: @@ -192,7 +192,7 @@ def max_abs( {{active_storage: `bool`, optional}} - .. versionadded:: 3.17.0 + .. versionadded:: NEXTVERSION :Returns: @@ -251,7 +251,7 @@ def mean( {{active_storage: `bool`, optional}} - .. versionadded:: 3.17.0 + .. versionadded:: NEXTVERSION :Returns: @@ -323,7 +323,7 @@ def mean_abs( {{active_storage: `bool`, optional}} - .. versionadded:: 3.17.0 + .. versionadded:: NEXTVERSION :Returns: @@ -381,7 +381,7 @@ def mid_range( {{active_storage: `bool`, optional}} - .. versionadded:: 3.17.0 + .. 
versionadded:: NEXTVERSION :Returns: @@ -453,7 +453,7 @@ def min( {{active_storage: `bool`, optional}} - .. versionadded:: 3.17.0 + .. versionadded:: NEXTVERSION :Returns: @@ -521,7 +521,7 @@ def min_abs( {{active_storage: `bool`, optional}} - .. versionadded:: 3.17.0 + .. versionadded:: NEXTVERSION :Returns: @@ -577,7 +577,7 @@ def range( {{active_storage: `bool`, optional}} - .. versionadded:: 3.17.0 + .. versionadded:: NEXTVERSION :Returns: @@ -652,7 +652,7 @@ def rms( {{active_storage: `bool`, optional}} - .. versionadded:: 3.17.0 + .. versionadded:: NEXTVERSION :Returns: @@ -721,7 +721,7 @@ def sample_size( {{active_storage: `bool`, optional}} - .. versionadded:: 3.17.0 + .. versionadded:: NEXTVERSION :Returns: @@ -796,7 +796,7 @@ def sum( {{active_storage: `bool`, optional}} - .. versionadded:: 3.17.0 + .. versionadded:: NEXTVERSION :Returns: @@ -871,7 +871,7 @@ def sum_of_weights( {{active_storage: `bool`, optional}} - .. versionadded:: 3.17.0 + .. versionadded:: NEXTVERSION :Returns: @@ -947,7 +947,7 @@ def sum_of_weights2( {{active_storage: `bool`, optional}} - .. versionadded:: 3.17.0 + .. versionadded:: NEXTVERSION :Returns: @@ -1000,7 +1000,7 @@ def unique( {{active_storage: `bool`, optional}} - .. versionadded:: 3.17.0 + .. versionadded:: NEXTVERSION :Returns: @@ -1081,7 +1081,7 @@ def var( {{active_storage: `bool`, optional}} - .. versionadded:: 3.17.0 + .. versionadded:: NEXTVERSION :Returns: diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index f253d06b3e..c4b6575193 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -25,7 +25,7 @@ def active_min(a, **kwargs): as the ``chunk`` parameter. Its returned value must be the same as the non-active chunk function that it is replacing. - .. versionadded:: 3.17.0 + .. versionadded:: NEXTVERSION .. seealso:: `actify`, `active_storage` @@ -60,7 +60,7 @@ def active_max(a, **kwargs): as the ``chunk`` parameter. Its returned value must be the same as the non-active chunk function that it is replacing. - .. versionadded:: 3.17.0 + .. versionadded:: NEXTVERSION .. seealso:: `actify`, `active_storage` @@ -95,7 +95,7 @@ def active_mean(a, **kwargs): as the ``chunk`` parameter. Its returned value must be the same as the non-active chunk function that it is replacing. - .. versionadded:: 3.17.0 + .. versionadded:: NEXTVERSION .. seealso:: `actify`, `active_storage` @@ -134,7 +134,7 @@ def active_sum(a, **kwargs): as the ``chunk`` parameter. Its returned value must be the same as the non-active chunk function that it is replacing. - .. versionadded:: 3.17.0 + .. versionadded:: NEXTVERSION .. seealso:: `actify`, `active_storage` @@ -185,7 +185,7 @@ def actify(a, method, axis=None): `!active_storage` attribute is registered via the *active_storage* parameter of `Collapse` methods. - .. versionadded:: 3.17.0 + .. versionadded:: NEXTVERSION .. seealso:: `active_storage`, `cf.data.collapse.Collapse` @@ -213,11 +213,6 @@ def actify(a, method, axis=None): `None`. """ - if Active is None: - # The active storage import dependency is not met, so using - # active storage is not possible. - return a, None - from numbers import Integral import dask.array as da @@ -303,7 +298,7 @@ def active_storage(method): `Collapse` method is decorated, active storage operations are only carried out when the conditions are right. - .. versionadded:: 3.17.0 + .. versionadded:: NEXTVERSION .. 
seealso:: `actify`, `cf.data.collapse.Collapse`
 
@@ -320,11 +315,12 @@ def decorator(collapse_method):
         @wraps(collapse_method)
         def wrapper(self, *args, **kwargs):
             if (
-                kwargs.get("active_storage")
+                cf_active_storage()
+                and Active is not None
+                and kwargs.get("active_storage")
                 and method in active_chunk_functions
                 and kwargs.get("weights") is None
                 and kwargs.get("chunk_function") is None
-                and cf_active_storage()
                 and active_storage_url()
             ):
                 # Attempt to actify the dask array and provide a new
diff --git a/cf/data/data.py b/cf/data/data.py
index 404d160f6a..cc7074f24c 100644
--- a/cf/data/data.py
+++ b/cf/data/data.py
@@ -1407,7 +1407,7 @@ def _set_dask(self, array, copy=False, clear=_ALL):
                 "suitability (such as data type casting, "
                 "broadcasting, etc.). Note that the exception may be "
                 "difficult to diagnose, as dask will have silently "
-                "trapped it and returned NotImplemented (see, for "
+                "trapped it and returned NotImplemented (see, for "
                 "instance, dask.array.core.elemwise). Print "
                 "statements in a local copy of dask are possibly the "
                 "way to go if the cause of the error is not obvious."
@@ -1476,7 +1476,7 @@ def _del_dask(self, default=ValueError(), clear=_ALL):
     def _del_active_storage(self):
         """Set the active storage reduction status to False.
 
-        .. versionadded:: 3.17.0
+        .. versionadded:: NEXTVERSION
 
         .. seealso:: `active_storage`, `_set_active_storage`
 
@@ -1561,7 +1561,7 @@ def _is_abstract_Array_subclass(self, array):
     def _set_active_storage(self, value):
         """Set the active storage reduction status.
 
-        .. versionadded:: 3.17.0
+        .. versionadded:: NEXTVERSION
 
         .. seealso:: `active_storage`, `_del_active_storage`
 
@@ -4027,17 +4027,15 @@ def concatenate(
         data0 = data[0]
         units0 = data0.Units
 
-        print ('data0.a_s=', data0.active_storage)
-        
+
         if copy:
             data0 = data0.copy()
             copied = True
         else:
             copied = False
-            
+
         processed_data = []
         for index, data1 in enumerate(data):
-            print ('data1.a_s=', data1.active_storage)
             # Turn any scalar array into a 1-d array
             if not data1.ndim:
                 if not copied:
@@ -4108,7 +4106,6 @@ def concatenate(
             if not d.active_storage:
                 # Set the output active storage status to False when
                 # any input data instance has False status
-                print ('nuking active in concatenate')
                 active = _NONE
                 break
@@ -4774,7 +4771,7 @@ def active_storage(self):
         the conditions described by `cf.data.collapse.Collapse` are
         met.
 
-        .. versionadded:: 3.17.0
+        .. versionadded:: NEXTVERSION
 
         **Examples**
 
@@ -4783,7 +4780,6 @@ def active_storage(self):
         False
 
         """
-#        return True
         return (
             self._custom.get("active_storage", False)
             and not self.get_compression_type()
diff --git a/cf/data/fragment/h5netcdffragmentarray.py b/cf/data/fragment/h5netcdffragmentarray.py
index 87688c056e..2f140df6ff 100644
--- a/cf/data/fragment/h5netcdffragmentarray.py
+++ b/cf/data/fragment/h5netcdffragmentarray.py
@@ -5,7 +5,7 @@
 class H5netcdfFragmentArray(FragmentArrayMixin, H5netcdfArray):
     """A netCDF fragment array accessed with `h5netcdf`.
 
-    .. versionadded:: 3.17.0
+    .. versionadded:: NEXTVERSION
 
     """
 
diff --git a/cf/data/fragment/netcdf4fragmentarray.py b/cf/data/fragment/netcdf4fragmentarray.py
index 100ad31c8e..12ae8c201d 100644
--- a/cf/data/fragment/netcdf4fragmentarray.py
+++ b/cf/data/fragment/netcdf4fragmentarray.py
@@ -5,7 +5,7 @@
 class NetCDF4FragmentArray(FragmentArrayMixin, NetCDF4Array):
     """A netCDF fragment array accessed with `netCDF4`.
 
-    .. 
versionadded:: NEXTVERSION """ diff --git a/cf/data/fragment/netcdffragmentarray.py b/cf/data/fragment/netcdffragmentarray.py index 394b859220..b6c0a3206e 100644 --- a/cf/data/fragment/netcdffragmentarray.py +++ b/cf/data/fragment/netcdffragmentarray.py @@ -206,10 +206,10 @@ def __getitem__(self, indices): kwargs["address"] = address scheme = urlparse(filename).scheme + kwargs["storage_options"] = self.get_storage_options( + create_endpoint_url=False + ) if scheme == "s3": - kwargs["storage_options"] = self.get_storage_options( - endpoint_url=False - ) fragment = H5netcdfFragmentArray(**kwargs) else: fragment = NetCDF4FragmentArray(**kwargs) diff --git a/cf/functions.py b/cf/functions.py index 6c874664e6..aebb3267c6 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -269,13 +269,13 @@ def configuration( reductions or False to disable them). The default is to not change the current behaviour. - .. versionadded:: 3.17.0 + .. versionadded:: NEXTVERSION active_storage_url: `str` or `None` or `Constant`, optional The new value (either a new URL string or `None` to remove the URL). The default is to not change the value. - .. versionadded:: 3.17.0 + .. versionadded:: NEXTVERSION of_fraction: `float` or `Constant`, optional Deprecated at version 3.14.0 and is no longer @@ -1184,7 +1184,7 @@ def _parse(cls, arg): class active_storage(ConstantAccess): """Whether or not to attempt active storage reductions. - .. versionadded:: 3.17.0 + .. versionadded:: NEXTVERSION .. seealso:: `active_storage_url`, `configuration` @@ -1222,7 +1222,7 @@ class active_storage(ConstantAccess): def _parse(cls, arg): """Parse a new constant value. - .. versionaddedd:: 3.17.0 + .. versionaddedd:: NEXTVERSION :Parameters: @@ -1244,7 +1244,7 @@ def _parse(cls, arg): class active_storage_url(ConstantAccess): """The URL location of the active storage reducer. - .. versionadded:: 3.17.0 + .. versionadded:: NEXTVERSION .. seealso:: `active_storage`, `configuration` @@ -1282,7 +1282,7 @@ class active_storage_url(ConstantAccess): def _parse(cls, arg): """Parse a new constant value. - .. versionaddedd:: 3.17.0 + .. versionaddedd:: NEXTVERSION :Parameters: diff --git a/cf/read_write/netcdf/netcdfread.py b/cf/read_write/netcdf/netcdfread.py index a0a8178ca5..00f5ad9916 100644 --- a/cf/read_write/netcdf/netcdfread.py +++ b/cf/read_write/netcdf/netcdfread.py @@ -692,7 +692,12 @@ def _create_cfanetcdfarray( kwargs["instructions"] = " ".join(sorted(instructions)) # Use the kwargs to create a CFANetCDFArray instance - array = self.implementation.initialise_CFANetCDFArray(**kwargs) + # array = self.implementation.initialise_CFANetCDFArray(**kwargs) + if g["original_netCDF4"]: + array = self.implementation.initialise_CFANetCDF4Array(**kwargs) + else: + # h5netcdf + array = self.implementation.initialise_CFAH5netcdfArray(**kwargs) return array, kwargs @@ -740,6 +745,10 @@ def _create_cfanetcdfarray_term( return_kwargs_only=True, ) + # Get rid of the incorrect shape. This will end up getting set + # correctly by the CFANetCDFArray instance. 
+ kwargs.pop("shape", None) + instructions = [] aggregation_instructions = {} for t, term_ncvar in g["cfa_aggregated_data"][parent_ncvar].items(): @@ -755,7 +764,12 @@ def _create_cfanetcdfarray_term( kwargs["instructions"] = " ".join(sorted(instructions)) # Use the kwargs to create a CFANetCDFArray instance - array = self.implementation.initialise_CFANetCDFArray(**kwargs) + # array = self.implementation.initialise_CFANetCDFArray(**kwargs) + if g["original_netCDF4"]: + array = self.implementation.initialise_CFANetCDF4Array(**kwargs) + else: + # h5netcdf + array = self.implementation.initialise_CFAH5netcdfArray(**kwargs) return array, kwargs diff --git a/cf/read_write/read.py b/cf/read_write/read.py index d9ca85cebf..ea63d3cee7 100644 --- a/cf/read_write/read.py +++ b/cf/read_write/read.py @@ -431,13 +431,14 @@ def read( .. versionadded:: 3.4.0 unpack: `bool`, optional - If True (the default) then unpack by convention when - reading data from disk. + If True (the default) then unpack arrays by convention + when the data is read from disk. - A netCDF array is unpacked depending on the values of the - netCDF attributes ``add_offset`` and ``scale_factor``. + Unpacking is determined netCDF conventions for the + following attributes: ``add_offset``, ``scale_factor``, + and ``_Unsigned``. - .. versionadded:: 3.17.0 + .. versionadded:: NEXTVERSION warn_valid: `bool`, optional If True then print a warning for the presence of @@ -676,22 +677,22 @@ def read( .. versionadded:: 3.15.0 - netCDF_backend: `str` or `None`, optional - Specify which library to use for opening input files. By + netCDF_backend: `None` or `str`, optional + Specify which library to use for opening netCDF files. By default, or if `None`, then `netCDF4` will used unless it fails to open a given file, in which case `h5netcdf` will - be used. Setting *library* to ``'netCDF4'`` or - ``'h5netcdf'`` will force the use of the `netCDF4` or + be used instead. Setting *netCDF_backend* to ``'netCDF4'`` + or ``'h5netcdf'`` will force the use of the `netCDF4` or `h5netcdf` libraries respectively. .. note:: The *netCDF_backend* parameter does not affect the opening of netCDF fragment files that define the data of aggregated variables. For these, `netCDF4` is used for local files and those - accessed via OPenDAP, and `h5netcdf` is used for + accessed via OPeNDAP, and `h5netcdf` is used for fragment files in S3 object stores. - .. versionadded:: 3.17.0 + .. versionadded:: NEXTVERSION storage_options: `dict` or `None`, optional Key/value pairs to be passed on to the creation of @@ -700,19 +701,22 @@ def read( object store, i.e. those whose names do not start with ``s3:``. - By default, or if `None`, then a value of ``{'anon': - True}`` is used. + By default, or if `None` or ``{}``, then no options are + passed. - If an ``'endpoint_url'`` key is not in *storage_options* - then one will be automatically derived for accessing each - S3 file. For example, for a file name of - ``'s3://store/data/file.nc'``, an ``'endpoint_url'`` key - with value ``'https://store'`` would be created. + If the ``'endpoint_url'`` key is not in *storage_options* + or is not in a dictionary defined by the + ``'client_kwargs`` key (which is always the case when + *storage_options* is `None`), then one will be + automatically inserted for accessing each S3 file. For + example, for a file name of ``'s3://store/data/file.nc'``, + an ``'endpoint_url'`` key with value ``'https://store'`` + would be created. 
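+
+        *Parameter example:*
+          To access files in an S3 object store anonymously, a
+          minimal option set is ``{'anon': True}``.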
*Parameter example:* For a file name of ``'s3://store/data/file.nc'``, the - following are equivalent: ``None``, ``{'anon': True}``, - and ``{'anon': True, 'endpoint_url': 'https://store'}``. + following are equivalent: ``None``, ``{}``, and + ``{'endpoint_url': 'https://store'}``. *Parameter example:* ``{'key: 'scaleway-api-key...', 'secret': @@ -720,7 +724,12 @@ def read( 'https://s3.fr-par.scw.cloud', 'client_kwargs': {'region_name': 'fr-par'}}`` - .. versionadded:: 3.17.0 + *Parameter example:* + The following are equivalent: ``{'endpoint_url': + 'https://store'}`` ``{'client_kwargs': {'endpoint_url': + 'https://store'}}`` + + .. versionadded:: NEXTVERSION umversion: deprecated at version 3.0.0 Use the *um* parameter instead. @@ -1171,12 +1180,12 @@ def _read_a_file( storage_options: `dict` or `None`, optional See `cf.read` for details. - .. versionadded:: 3.17.0 + .. versionadded:: NEXTVERSION netCDF_backend: `str` or `None`, optional See `cf.read` for details. - .. versionadded:: 3.17.0 + .. versionadded:: NEXTVERSION :Returns: diff --git a/cf/regrid/regrid.py b/cf/regrid/regrid.py index b855d98e53..970218cc62 100644 --- a/cf/regrid/regrid.py +++ b/cf/regrid/regrid.py @@ -1707,6 +1707,12 @@ def create_esmpy_mesh(grid, mask=None): node_count = node_ids.size node_owners = np.zeros(node_count) + # Make sure that node IDs are >= 1, as needed by newer versions of + # esmpy + min_id = node_ids.min() + if min_id < 1: + node_ids += min_id + 1 + # Add nodes. This must be done before `add_elements`. esmpy_mesh.add_nodes( node_count=node_count, From 1023ad0abc6adcd613ee5d77af1e306d6c27331e Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 4 Mar 2024 22:59:08 +0000 Subject: [PATCH 052/134] dev --- cf/data/array/cfah5netcdfarray.py | 10 ++++---- cf/data/array/mixin/cfamixin.py | 40 +++++++++++++++++++++++++++---- cf/data/collapse/collapse.py | 18 ++++++++++---- docs/source/field_analysis.rst | 33 +++++++++++++------------ 4 files changed, 72 insertions(+), 29 deletions(-) diff --git a/cf/data/array/cfah5netcdfarray.py b/cf/data/array/cfah5netcdfarray.py index 6b1acc3a09..33f3111aa6 100644 --- a/cf/data/array/cfah5netcdfarray.py +++ b/cf/data/array/cfah5netcdfarray.py @@ -81,11 +81,11 @@ def __init__( term: `str`, optional The name of a non-standard aggregation instruction term from which the array is to be created, instead of - creating the aggregated data in the standard - terms. If set then *address* must be the name of the - term's CFA-netCDF aggregation instruction variable, - which must be defined on the fragment dimensions and - no others. Each value of the aggregation instruction + creating the aggregated data in the standard terms. If + set then *address* must be the name of the term's + CFA-netCDF aggregation instruction variable, which + must be defined on the fragment dimensions and no + others. Each value of the aggregation instruction variable will be broadcast across the shape of the corresponding fragment. diff --git a/cf/data/array/mixin/cfamixin.py b/cf/data/array/mixin/cfamixin.py index cfa8f16903..9e1627b55d 100644 --- a/cf/data/array/mixin/cfamixin.py +++ b/cf/data/array/mixin/cfamixin.py @@ -8,7 +8,7 @@ class CFAMixin: - """TODO + """Mixin class for a CFA-netCDF array. .. versionadded:: NEXTVERSION @@ -17,7 +17,7 @@ class CFAMixin: def __new__(cls, *args, **kwargs): """Store fragment array classes. - .. versionadded:: (cfdm) 1.10.0.0 + .. versionadded:: NEXTVERSION """ # Import fragment array classes. 
Do this here (as opposed to @@ -37,7 +37,36 @@ def __new__(cls, *args, **kwargs): return instance def _parse_cfa(self, x, term, substitutions): - """TODO""" + """Parse the CFA aggregation instructions. + + .. versionadded:: NEXTVERSION + + :Parameters: + + x: `dict` + + term: `str` or `None` + The name of a non-standard aggregation instruction + term from which the array is to be created, instead of + creating the aggregated data in the standard + terms. Each value of the aggregation instruction + variable will be broadcast across the shape of the + corresponding fragment. + + substitutions: `dict` or `None` + A dictionary whose key/value pairs define text + substitutions to be applied to the fragment file + names. Each key must be specified with the ``${...}`` + syntax, for instance ``{'${base}': 'sub'}``. + + :Returns: + + 3-`tuple` + 1. The shape of the aggregated data. + 2. The shape of the array of fragments. + 3. The parsed aggregation instructsions. + + """ aggregated_data = {} location = x["location"] @@ -50,7 +79,7 @@ def _parse_cfa(self, x, term, substitutions): if term is not None: # -------------------------------------------------------- - # This fragment contains a constant value, not file + # Each fragment contains a constant value, not file # locations. # -------------------------------------------------------- term = x[term] @@ -64,6 +93,9 @@ def _parse_cfa(self, x, term, substitutions): for frag_loc, loc in zip(positions, locations) } else: + # -------------------------------------------------------- + # Each fragment contains file locations + # -------------------------------------------------------- a = x["address"] f = x["file"] file_fmt = x["format"] diff --git a/cf/data/collapse/collapse.py b/cf/data/collapse/collapse.py index 53dd316e6c..6dc0c9bf80 100644 --- a/cf/data/collapse/collapse.py +++ b/cf/data/collapse/collapse.py @@ -20,8 +20,8 @@ class Collapse(metaclass=DocstringRewriteMeta): * `cf.active_storage()` is True; - * an active storage URL has been set with - `cf.active_storage_url`; + * ``cf.active_storage_url()`` returns the URL of an active + storage server; * it is possible to import the `activestorage.Active` class; @@ -29,9 +29,10 @@ class Collapse(metaclass=DocstringRewriteMeta): * the collapse is unweighted; - * the data is not compressed by convention (netCDF numeric - packing is not considered here to be a compression by - convention techinigue); + * the data are in netCDF-4 files on disk (rather than in + any other file format, or in memory); + + * the data are not compressed by convention; * the `Collapse` method's *active_storage* parameter is True; @@ -47,6 +48,13 @@ class Collapse(metaclass=DocstringRewriteMeta): in which case the Dask graph is modified to expect the per-chunk reductions to be carried out externally. + .. note:: The performance improvements from using active storage + operations will increase the closer, in a network sense, + the active storage server is to the data storage. If the + active storage server is sufficiently far away from the + data then it may be faster and require less energy to do + a normal, non-active operation. + See `cf.data.collapse.active_storage` for details. .. versionadded:: 3.14.0 diff --git a/docs/source/field_analysis.rst b/docs/source/field_analysis.rst index 9a42d0b737..df78f1ec1a 100644 --- a/docs/source/field_analysis.rst +++ b/docs/source/field_analysis.rst @@ -864,6 +864,13 @@ power costs of transfering the entire un-collapsed data to the local client. 
Whether or not this will occur is determined on a case-by-case basis, and will only be done if all of the following criteria are met: +* ``cf.active_storage()`` is `True`; + +* ``cf.active_storage_url()`` returns the URL of an active storage + server; + +* it is possible to import the external `activestorage.Active` class. + * the collapse method is one of ``'mean'``, ``'maximum'``, ``'minimum'``, or ``'sum'``; @@ -871,27 +878,23 @@ basis, and will only be done if all of the following criteria are met: * the collapse is unweighted; -* `cf.active_storage()` is `True`; - -* a URL of the active storage server has been set with - `cf.active_storage_url`; - * the data values are in netCDF-4 files on disk (rather than in any - other file format, or in memory) and are not numerically packed; + other file format, or in memory); + +* the data are not compressed by convention; * the `~cf.Data.active_storage` attribute of the `cf.Data` object being collapsed is `True`, indicating that active storage operations - may be possible. In general, it will only be `True` for data that - are in files on disk, are not compressed by convention and have not - had any other operations applied; - - -* it is possible to import the external `activestorage.Active` class. + are possible, provided all of the other conditions are also met. In + general, it will only be `True` for data that are in files on disk, + are not compressed by convention, and have not had any other + operations applied. The performance improvements from using active storage operations will -increase the closer the active storage server is to the data -storage. If the active storage server is sufficiently far away from -the data then it may be faster to do a normal, non-active operation. +increase the closer, in a network sense, the active storage server is +to the data storage. If the active storage server is sufficiently far +away from the data then it may be faster and require less energy to do +a normal, non-active operation. 
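+
+For example, the following minimal sketch enables active storage
+reductions and performs an unweighted mean over all axes, which meets
+the above criteria (the URL is a placeholder for a real active storage
+server, and ``file.nc`` stands for any netCDF-4 dataset on disk):
+
+>>> import cf
+>>> cf.active_storage(True)
+>>> cf.active_storage_url('https://active.example.org')
+>>> f = cf.read('file.nc')[0]
+>>> g = f.collapse('mean')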
---- From 6eef10a7798384b3997066603785fc67c0bc3932 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 5 Mar 2024 14:48:46 +0000 Subject: [PATCH 053/134] dev --- Changelog.rst | 2 +- cf/__init__.py | 4 +- cf/data/array/cfah5netcdfarray.py | 28 +- cf/data/array/cfanetcdf4array.py | 28 +- cf/data/array/cfanetcdfarray.py | 801 ------------------------------ cf/docstring/docstring.py | 3 - cf/functions.py | 2 +- cf/read_write/read.py | 65 ++- cf/test/test_functions.py | 2 +- docs/source/installation.rst | 4 +- requirements.txt | 2 +- 11 files changed, 64 insertions(+), 877 deletions(-) delete mode 100644 cf/data/array/cfanetcdfarray.py diff --git a/Changelog.rst b/Changelog.rst index 7f0a0ea735..fbb47274d3 100644 --- a/Changelog.rst +++ b/Changelog.rst @@ -12,7 +12,7 @@ version NEXT * New dependency: ``h5netcdf>=1.3.0`` * New dependency: ``h5py>=3.10.0`` * New dependency: ``s3fs>=2024.2.0`` -* Changed dependency: ``1.11.1.0<=cfdm<1.11.2.0`` +* Changed dependency: ``1.11.2.0<=cfdm<1.11.3.0`` * Changed dependency: ``cfunits>=3.3.7`` ---- diff --git a/cf/__init__.py b/cf/__init__.py index e65ebe7ae6..aa4def8885 100644 --- a/cf/__init__.py +++ b/cf/__init__.py @@ -206,8 +206,8 @@ ) # Check the version of cfdm -_minimum_vn = "1.11.1.0" -_maximum_vn = "1.11.2.0" +_minimum_vn = "1.11.2.0" +_maximum_vn = "1.11.3.0" _cfdm_version = Version(cfdm.__version__) if not Version(_minimum_vn) <= _cfdm_version < Version(_maximum_vn): raise RuntimeError( diff --git a/cf/data/array/cfah5netcdfarray.py b/cf/data/array/cfah5netcdfarray.py index 33f3111aa6..6f4efbdeaf 100644 --- a/cf/data/array/cfah5netcdfarray.py +++ b/cf/data/array/cfah5netcdfarray.py @@ -97,24 +97,22 @@ def __init__( storage_options: `dict` or `None`, optional Key/value pairs to be passed on to the creation of `s3fs.S3FileSystem` file systems to control the - opening of fragment files in an S3 object - stores. Ignored for fragment files not in S3 object - stores, i.e. those whose names do not start with - ``s3:``. - - If an ``'endpoint_url'`` key is not in - *storage_options* then one will be automatically - derived for accessing each S3 fragment file. For - example, for a fragment file name of + opening of fragment files in S3 object stores. Ignored + for files not in an S3 object store, i.e. those whose + names do not start with ``s3:``. + + By default, or if `None`, then *storage_options* is + taken as ``{}``. + + If the ``'endpoint_url'`` key is not in + *storage_options* or is not in a dictionary defined by + the ``'client_kwargs`` key (which is always the case + when *storage_options* is `None`), then one will be + automatically inserted for accessing a fragment S3 + file. For example, for a file name of ``'s3://store/data/file.nc'``, an ``'endpoint_url'`` key with value ``'https://store'`` would be created. - *Parameter example:* - For a fragment file name of - ``'s3://store/data/file.nc'``, the following are - equivalent: ``None``, ``{}`` and ``{'endpoint_url': - 'https://store'}``. - *Parameter example:* ``{'key: 'scaleway-api-key...', 'secret': 'scaleway-secretkey...', 'endpoint_url': diff --git a/cf/data/array/cfanetcdf4array.py b/cf/data/array/cfanetcdf4array.py index 0532bb191d..dddf3411cb 100644 --- a/cf/data/array/cfanetcdf4array.py +++ b/cf/data/array/cfanetcdf4array.py @@ -97,24 +97,22 @@ def __init__( storage_options: `dict` or `None`, optional Key/value pairs to be passed on to the creation of `s3fs.S3FileSystem` file systems to control the - opening of fragment files in an S3 object - stores. 
Ignored for fragment files not in S3 object - stores, i.e. those whose names do not start with - ``s3:``. - - If an ``'endpoint_url'`` key is not in - *storage_options* then one will be automatically - derived for accessing each S3 fragment file. For - example, for a fragment file name of + opening of fragment files in S3 object stores. Ignored + for files not in an S3 object store, i.e. those whose + names do not start with ``s3:``. + + By default, or if `None`, then *storage_options* is + taken as ``{}``. + + If the ``'endpoint_url'`` key is not in + *storage_options* or is not in a dictionary defined by + the ``'client_kwargs`` key (which is always the case + when *storage_options* is `None`), then one will be + automatically inserted for accessing a fragment S3 + file. For example, for a file name of ``'s3://store/data/file.nc'``, an ``'endpoint_url'`` key with value ``'https://store'`` would be created. - *Parameter example:* - For a fragment file name of - ``'s3://store/data/file.nc'``, the following are - equivalent: ``None``, ``{}`` and ``{'endpoint_url': - 'https://store'}``. - *Parameter example:* ``{'key: 'scaleway-api-key...', 'secret': 'scaleway-secretkey...', 'endpoint_url': diff --git a/cf/data/array/cfanetcdfarray.py b/cf/data/array/cfanetcdfarray.py deleted file mode 100644 index 84c14f5ef7..0000000000 --- a/cf/data/array/cfanetcdfarray.py +++ /dev/null @@ -1,801 +0,0 @@ -from copy import deepcopy -from functools import partial -from itertools import accumulate, product - -import numpy as np - -from ..fragment import FullFragmentArray, NetCDFFragmentArray, UMFragmentArray -from ..utils import chunk_locations, chunk_positions -from .netcdf4array import NetCDF4Array - -# Store fragment array classes. -_FragmentArray = { - "nc": NetCDFFragmentArray, - "um": UMFragmentArray, - "full": FullFragmentArray, -} - - -class CFANetCDFArray(NetCDF4Array): - """A CFA aggregated array stored in a netCDF file. - - .. versionadded:: 3.14.0 - - """ - - def __init__( - self, - filename=None, - address=None, - dtype=None, - mask=True, - units=False, - calendar=False, - instructions=None, - substitutions=None, - term=None, - storage_options=None, - source=None, - copy=True, - x=None, - ): - """**Initialisation** - - :Parameters: - - filename: (sequence of) `str`, optional - The name of the CFA-netCDF file containing the - array. If a sequence then it must contain one element. - - address: (sequence of) `str`, optional - The name of the CFA-netCDF aggregation variable for the - array. If a sequence then it must contain one element. - - dtype: `numpy.dtype` - The data type of the aggregated data array. May be - `None` if the numpy data-type is not known (which can - be the case for netCDF string types, for example). - - mask: `bool` - If True (the default) then mask by convention when - reading data from disk. - - A netCDF array is masked depending on the values of any of - the netCDF variable attributes ``valid_min``, - ``valid_max``, ``valid_range``, ``_FillValue`` and - ``missing_value``. - - units: `str` or `None`, optional - The units of the aggregated data. Set to `None` to - indicate that there are no units. - - calendar: `str` or `None`, optional - The calendar of the aggregated data. Set to `None` to - indicate the CF default calendar, if applicable. - - instructions: `str`, optional - The ``aggregated_data`` attribute value as found on - the CFA netCDF variable. If set then this will be used - to improve the performance of `__dask_tokenize__`. 
- - substitutions: `dict`, optional - A dictionary whose key/value pairs define text - substitutions to be applied to the fragment file - names. Each key must be specified with the ``${...}`` - syntax, for instance ``{'${base}': 'sub'}``. - - .. versionadded:: 3.15.0 - - term: `str`, optional - The name of a non-standard aggregation instruction - term from which the array is to be created, instead of - creating the aggregated data in the standard - terms. If set then *address* must be the name of the - term's CFA-netCDF aggregation instruction variable, - which must be defined on the fragment dimensions and - no others. Each value of the aggregation instruction - variable will be broadcast across the shape of the - corresponding fragment. - - *Parameter example:* - ``address='cfa_tracking_id', term='tracking_id'`` - - .. versionadded:: 3.15.0 - - storage_options: `dict` or `None`, optional - Key/value pairs to be passed on to the creation of - `s3fs.S3FileSystem` file systems to control the - opening of fragment files in an S3 object - stores. Ignored for fragment files not in S3 object - stores, i.e. those whose names do not start with - ``s3:``. - - If an ``'endpoint_url'`` key is not in - *storage_options* then one will be automatically - derived for accessing each S3 fragment file. For - example, for a fragment file name of - ``'s3://store/data/file.nc'``, an ``'endpoint_url'`` - key with value ``'https://store'`` would be created. - - *Parameter example:* - For a fragment file name of - ``'s3://store/data/file.nc'``, the following are - equivalent: ``None``, ``{}`` and ``{'endpoint_url': - 'https://store'}``. - - *Parameter example:* - ``{'key: 'scaleway-api-key...', 'secret': - 'scaleway-secretkey...', 'endpoint_url': - 'https://s3.fr-par.scw.cloud', 'client_kwargs': - {'region_name': 'fr-par'}}`` - - .. versionadded:: NEXTVERSION - - {{init source: optional}} - - {{init copy: `bool`, optional}} - - """ - if source is not None: - super().__init__(source=source, copy=copy) - - try: - fragment_shape = source.get_fragment_shape() - except AttributeError: - fragment_shape = None - - try: - instructions = source._get_component("instructions") - except AttributeError: - instructions = None - - try: - aggregated_data = source.get_aggregated_data(copy=False) - except AttributeError: - aggregated_data = {} - - try: - substitutions = source.get_substitutions() - except AttributeError: - substitutions = None - - try: - term = source.get_term() - except AttributeError: - term = None - - elif filename is not None: - aggregated_data = {} - - location = x["location"] - ndim = location.shape[0] - compressed = np.ma.compressed - chunks = [compressed(i).tolist() for i in location] - shape = [sum(c) for c in chunks] - positions = chunk_positions(chunks) - locations = chunk_locations(chunks) - - if term is not None: - # -------------------------------------------------------- - # This fragment contains a constant value, not file - # locations. 
- # -------------------------------------------------------- - term = x[term] - fragment_shape = term.shape - aggregated_data = { - frag_loc: { - "location": loc, - "fill_value": term[frag_loc].item(), - "format": "full", - } - for frag_loc, loc in zip(positions, locations) - } - else: - a = x["address"] - f = x["file"] - file_fmt = x["format"] - - extra_dimension = f.ndim > ndim - if extra_dimension: - # There is an extra non-fragment dimension - fragment_shape = f.shape[:-1] - else: - fragment_shape = f.shape - - if not a.ndim: - a = (a.item(),) - scalar_address = True - else: - scalar_address = False - - if not file_fmt.ndim: - file_fmt = file_fmt.item() - scalar_fmt = True - else: - scalar_fmt = False - - for frag_loc, location in zip(positions, locations): - if extra_dimension: - filename = compressed(f[frag_loc]).tolist() - if scalar_address: - address = a * len(filename) - else: - address = compressed(a[frag_loc].tolist()) - - if scalar_fmt: - fmt = file_fmt - else: - fmt = compressed(file_fmt[frag_loc]).tolist() - else: - filename = (f[frag_loc].item(),) - if scalar_address: - address = a - else: - address = (a[frag_loc].item(),) - - if scalar_fmt: - fmt = file_fmt - else: - fmt = file_fmt[frag_loc].item() - - aggregated_data[frag_loc] = { - "location": location, - "filename": filename, - "address": address, - "format": fmt, - } - - # Apply string substitutions to the fragment filenames - if substitutions: - for value in aggregated_data.values(): - filenames2 = [] - for filename in value["filename"]: - for base, sub in substitutions.items(): - filename = filename.replace(base, sub) - - filenames2.append(filename) - - value["filename"] = filenames2 - - super().__init__( - filename=filename, - address=address, - shape=shape, - dtype=dtype, - mask=mask, - units=units, - calendar=calendar, - copy=copy, - ) - else: - super().__init__( - filename=filename, - address=address, - dtype=dtype, - mask=mask, - units=units, - calendar=calendar, - copy=copy, - ) - - fragment_shape = None - aggregated_data = None - instructions = None - term = None - - self._set_component("fragment_shape", fragment_shape, copy=False) - self._set_component("aggregated_data", aggregated_data, copy=False) - self._set_component("instructions", instructions, copy=False) - self._set_component("term", term, copy=False) - - if substitutions is not None: - self._set_component( - "substitutions", substitutions.copy(), copy=False - ) - - def __dask_tokenize__(self): - """Used by `dask.base.tokenize`. - - .. versionadded:: 3.14.0 - - """ - out = super().__dask_tokenize__() - aggregated_data = self._get_component("instructions", None) - if aggregated_data is None: - aggregated_data = self.get_aggregated_data(copy=False) - - return out + (aggregated_data,) - - def __getitem__(self, indices): - """x.__getitem__(indices) <==> x[indices]""" - return NotImplemented # pragma: no cover - - def get_aggregated_data(self, copy=True): - """Get the aggregation data dictionary. - - The aggregation data dictionary contains the definitions of - the fragments and the instructions on how to aggregate them. - The keys are indices of the CFA fragment dimensions, - e.g. ``(1, 0, 0 ,0)``. - - .. versionadded:: 3.14.0 - - :Parameters: - - copy: `bool`, optional - Whether or not to return a copy of the aggregation - dictionary. By default a deep copy is returned. - - .. 
warning:: If False then changing the returned - dictionary in-place will change the - aggregation dictionary stored in the - {{class}} instance, **as well as in any - copies of it**. - - :Returns: - - `dict` - The aggregation data dictionary. - - **Examples** - - >>> a.shape - (12, 1, 73, 144) - >>> a.get_fragment_shape() - (2, 1, 1, 1) - >>> a.get_aggregated_data() - {(0, 0, 0, 0): { - 'file': ('January-June.nc',), - 'address': ('temp',), - 'format': 'nc', - 'location': [(0, 6), (0, 1), (0, 73), (0, 144)]}, - (1, 0, 0, 0): { - 'file': ('July-December.nc',), - 'address': ('temp',), - 'format': 'nc', - 'location': [(6, 12), (0, 1), (0, 73), (0, 144)]}} - - """ - aggregated_data = self._get_component("aggregated_data") - if copy: - aggregated_data = deepcopy(aggregated_data) - - return aggregated_data - - def get_fragmented_dimensions(self): - """Get the positions of dimensions that have two or more fragments. - - .. versionadded:: 3.14.0 - - :Returns: - - `list` - The dimension positions. - - **Examples** - - >>> a.get_fragment_shape() - (20, 1, 40, 1) - >>> a.get_fragmented_dimensions() - [0, 2] - - >>> a.get_fragment_shape() - (1, 1, 1) - >>> a.get_fragmented_dimensions() - [] - - """ - return [ - i for i, size in enumerate(self.get_fragment_shape()) if size > 1 - ] - - def get_fragment_shape(self): - """Get the sizes of the fragment dimensions. - - The fragment dimension sizes are given in the same order as - the aggregated dimension sizes given by `shape`. - - .. versionadded:: 3.14.0 - - :Returns: - - `tuple` - The shape of the fragment dimensions. - - """ - return self._get_component("fragment_shape") - - def get_storage_options(self): - """Return `s3fs.S3FileSystem` options for accessing S3 fragment files. - - .. versionadded:: NEXTVERSION - - :Returns: - - `dict` or `None` - The `s3fs.S3FileSystem` options. - - **Examples** - - >>> f.get_storage_options() - {} - - >>> f.get_storage_options() - {'anon': True} - - >>> f.get_storage_options() - {'key: 'scaleway-api-key...', - 'secret': 'scaleway-secretkey...', - 'endpoint_url': 'https://s3.fr-par.scw.cloud', - 'client_kwargs': {'region_name': 'fr-par'}} - - """ - return super().get_storage_options(create_endpoint_url=False) - - def get_term(self, default=ValueError()): - """The CFA aggregation instruction term for the data, if set. - - .. versionadded:: 3.15.0 - - :Parameters: - - default: optional - Return the value of the *default* parameter if the - term has not been set. If set to an `Exception` - instance then it will be raised instead. - - :Returns: - - `str` - The CFA aggregation instruction term name. - - """ - return self._get_component("term", default=default) - - def subarray_shapes(self, shapes): - """Create the subarray shapes. - - A fragmented dimenion (i.e. one spanned by two or fragments) - will always have a subarray size equal to the size of each of - its fragments, overriding any other size implied by the - *shapes* parameter. - - .. versionadded:: 3.14.0 - - .. seealso:: `subarrays` - - :Parameters: - - shapes: `int`, sequence, `dict` or `str`, optional - Define the subarray shapes. - - Any value accepted by the *chunks* parameter of the - `dask.array.from_array` function is allowed. - - The subarray sizes implied by *chunks* for a dimension - that has been fragmented are ignored, so their - specification is arbitrary. - - :Returns: - - `tuple` - The subarray sizes along each dimension. 
- - **Examples** - - >>> a.shape - (12, 1, 73, 144) - >>> a.get_fragment_shape() - (2, 1, 1, 1) - >>> a.fragmented_dimensions() - [0] - >>> a.subarray_shapes(-1) - ((6, 6), (1,), (73,), (144,)) - >>> a.subarray_shapes(None) - ((6, 6), (1,), (73,), (144,)) - >>> a.subarray_shapes("auto") - ((6, 6), (1,), (73,), (144,)) - >>> a.subarray_shapes((None, 1, 40, 50)) - ((6, 6), (1,), (40, 33), (50, 50, 44)) - >>> a.subarray_shapes((None, None, "auto", 50)) - ((6, 6), (1,), (73,), (50, 50, 44)) - >>> a.subarray_shapes({2: 40}) - ((6, 6), (1,), (40, 33), (144,)) - - """ - from numbers import Number - - from dask.array.core import normalize_chunks - - # Positions of fragmented dimensions (i.e. those spanned by - # two or more fragments) - f_dims = self.get_fragmented_dimensions() - - shape = self.shape - aggregated_data = self.get_aggregated_data(copy=False) - - # Create the base chunks. - chunks = [] - ndim = self.ndim - for dim, (n_fragments, size) in enumerate( - zip(self.get_fragment_shape(), self.shape) - ): - if dim in f_dims: - # This aggregated dimension is spanned by two or more - # fragments => set the chunks to be the same size as - # the each fragment. - c = [] - index = [0] * ndim - for j in range(n_fragments): - index[dim] = j - loc = aggregated_data[tuple(index)]["location"][dim] - chunk_size = loc[1] - loc[0] - c.append(chunk_size) - - chunks.append(tuple(c)) - else: - # This aggregated dimension is spanned by exactly one - # fragment => store `None` for now. This will get - # overwritten from 'shapes'. - chunks.append(None) - - if isinstance(shapes, (str, Number)) or shapes is None: - chunks = [ - c if i in f_dims else shapes for i, c in enumerate(chunks) - ] - elif isinstance(shapes, dict): - chunks = [ - chunks[i] if i in f_dims else shapes.get(i, "auto") - for i, c in enumerate(chunks) - ] - else: - # chunks is a sequence - if len(shapes) != ndim: - raise ValueError( - f"Wrong number of 'shapes' elements in {shapes}: " - f"Got {len(shapes)}, expected {self.ndim}" - ) - - chunks = [ - c if i in f_dims else shapes[i] for i, c in enumerate(chunks) - ] - - return normalize_chunks(chunks, shape=shape, dtype=self.dtype) - - def subarrays(self, subarray_shapes): - """Return descriptors for every subarray. - - .. versionadded:: 3.14.0 - - .. seealso:: `subarray_shapes` - - :Parameters: - - subarray_shapes: `tuple` - The subarray sizes along each dimension, as returned - by a prior call to `subarray_shapes`. - - :Returns: - - 6-`tuple` of iterators - Each iterator iterates over a particular descriptor - from each subarray. - - 1. The indices of the aggregated array that correspond - to each subarray. - - 2. The shape of each subarray. - - 3. The indices of the fragment that corresponds to each - subarray (some subarrays may be represented by a - part of a fragment). - - 4. The location of each subarray. - - 5. The location on the fragment dimensions of the - fragment that corresponds to each subarray. - - 6. The shape of each fragment that overlaps each chunk. - - **Examples** - - An aggregated array with shape (12, 73, 144) has two - fragments, both with with shape (6, 73, 144). - - >>> a.shape - (12, 73, 144) - >>> a.get_fragment_shape() - (2, 1, 1) - >>> a.fragmented_dimensions() - [0] - >>> subarray_shapes = a.subarray_shapes({1: 40}) - >>> print(subarray_shapes) - ((6, 6), (40, 33), (144,)) - >>> ( - ... u_indices, - ... u_shapes, - ... f_indices, - ... s_locations, - ... f_locations, - ... f_shapes, - ... ) = a.subarrays(subarray_shapes) - >>> for i in u_indices: - ... 
print(i) - ... - (slice(0, 6, None), slice(0, 40, None), slice(0, 144, None)) - (slice(0, 6, None), slice(40, 73, None), slice(0, 144, None)) - (slice(6, 12, None), slice(0, 40, None), slice(0, 144, None)) - (slice(6, 12, None), slice(40, 73, None), slice(0, 144, None)) - - >>> for i in u_shapes - ... print(i) - ... - (6, 40, 144) - (6, 33, 144) - (6, 40, 144) - (6, 33, 144) - >>> for i in f_indices: - ... print(i) - ... - (slice(None, None, None), slice(0, 40, None), slice(0, 144, None)) - (slice(None, None, None), slice(40, 73, None), slice(0, 144, None)) - (slice(None, None, None), slice(0, 40, None), slice(0, 144, None)) - (slice(None, None, None), slice(40, 73, None), slice(0, 144, None)) - >>> for i in s_locations: - ... print(i) - ... - (0, 0, 0) - (0, 1, 0) - (1, 0, 0) - (1, 1, 0) - >>> for i in f_locations: - ... print(i) - ... - (0, 0, 0) - (0, 0, 0) - (1, 0, 0) - (1, 0, 0) - >>> for i in f_shapes: - ... print(i) - ... - (6, 73, 144) - (6, 73, 144) - (6, 73, 144) - (6, 73, 144) - - """ - f_dims = self.get_fragmented_dimensions() - - # The indices of the uncompressed array that correspond to - # each subarray, the shape of each uncompressed subarray, and - # the location of each subarray - s_locations = [] - u_shapes = [] - u_indices = [] - f_locations = [] - for dim, c in enumerate(subarray_shapes): - nc = len(c) - s_locations.append(tuple(range(nc))) - u_shapes.append(c) - - if dim in f_dims: - f_locations.append(tuple(range(nc))) - else: - # No fragmentation along this dimension - f_locations.append((0,) * nc) - - c = tuple(accumulate((0,) + c)) - u_indices.append([slice(i, j) for i, j in zip(c[:-1], c[1:])]) - - # For each subarray, the part of the fragment that corresponds - # to it. - f_indices = [ - (slice(None),) * len(u) if dim in f_dims else u - for dim, u in enumerate(u_indices) - ] - - # For each subarray, the shape of the fragment that - # corresponds to it. - f_shapes = [ - u_shape if dim in f_dims else (size,) * len(u_shape) - for dim, (u_shape, size) in enumerate(zip(u_shapes, self.shape)) - ] - - return ( - product(*u_indices), - product(*u_shapes), - product(*f_indices), - product(*s_locations), - product(*f_locations), - product(*f_shapes), - ) - - def to_dask_array(self, chunks="auto"): - """Create a dask array with `FragmentArray` chunks. - - .. versionadded:: 3.14.0 - - :Parameters: - - chunks: `int`, `tuple`, `dict` or `str`, optional - Specify the chunking of the returned dask array. - - Any value accepted by the *chunks* parameter of the - `dask.array.from_array` function is allowed. - - The chunk sizes implied by *chunks* for a dimension that - has been fragmented are ignored and replaced with values - that are implied by that dimensions fragment sizes. 
- - :Returns: - - `dask.array.Array` - - """ - import dask.array as da - from dask.array.core import getter - from dask.base import tokenize - - name = (f"{self.__class__.__name__}-{tokenize(self)}",) - - dtype = self.dtype - units = self.get_units() - calendar = self.get_calendar(None) - aggregated_data = self.get_aggregated_data(copy=False) - - # Set the chunk sizes for the dask array - chunks = self.subarray_shapes(chunks) - - if self.get_mask(): - fragment_arrays = _FragmentArray - else: - fragment_arrays = _FragmentArray.copy() - fragment_arrays["nc"] = partial(_FragmentArray["nc"], mask=False) - - storage_options = self.get_storage_options() - - dsk = {} - for ( - u_indices, - u_shape, - f_indices, - chunk_location, - fragment_location, - fragment_shape, - ) in zip(*self.subarrays(chunks)): - kwargs = aggregated_data[fragment_location].copy() - kwargs.pop("location", None) - - fragment_format = kwargs.pop("format", None) - try: - FragmentArray = fragment_arrays[fragment_format] - except KeyError: - raise ValueError( - "Can't get FragmentArray class for unknown " - f"fragment dataset format: {fragment_format!r}" - ) - - if storage_options and kwargs["address"] == "nc": - # Pass on any S3 file system options - kwargs["storage_options"] = storage_options - - fragment = FragmentArray( - dtype=dtype, - shape=fragment_shape, - aggregated_units=units, - aggregated_calendar=calendar, - **kwargs, - ) - - key = f"{fragment.__class__.__name__}-{tokenize(fragment)}" - dsk[key] = fragment - dsk[name + chunk_location] = ( - getter, - key, - f_indices, - False, - getattr(fragment, "_lock", False), - ) - - # Return the dask array - return da.Array(dsk, name[0], chunks=chunks, dtype=dtype) diff --git a/cf/docstring/docstring.py b/cf/docstring/docstring.py index 68ef4c99f1..f9b9100f0b 100644 --- a/cf/docstring/docstring.py +++ b/cf/docstring/docstring.py @@ -595,9 +595,6 @@ "{{weights auto: `bool`, optional}}": """auto: `bool`, optional If True then return `False` if weights can't be found, rather than raising an exception.""", - # init s3 - "{{init s3: `dict` or `None`, optional}}": """s3: `dict` or `None`, optional - TODOACTIVE""", # pad_width "{{pad_width: sequence of `int`, optional}}": """pad_width: sequence of `int`, optional Number of values to pad before and after the edges of diff --git a/cf/functions.py b/cf/functions.py index 96779c7af4..001fa66866 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -422,7 +422,7 @@ def configuration( bounds_combination_mode=bounds_combination_mode, active_storage=active_storage, active_storage_url=active_storage_url, - netcdf_lock=netcdf_lock + netcdf_lock=netcdf_lock, ) diff --git a/cf/read_write/read.py b/cf/read_write/read.py index ea63d3cee7..e17be69be9 100644 --- a/cf/read_write/read.py +++ b/cf/read_write/read.py @@ -695,41 +695,36 @@ def read( .. versionadded:: NEXTVERSION storage_options: `dict` or `None`, optional - Key/value pairs to be passed on to the creation of - `s3fs.S3FileSystem` file systems to control the opening of - files in S3 object stores. Ignored for files not in an S3 - object store, i.e. those whose names do not start with - ``s3:``. - - By default, or if `None` or ``{}``, then no options are - passed. - - If the ``'endpoint_url'`` key is not in *storage_options* - or is not in a dictionary defined by the - ``'client_kwargs`` key (which is always the case when - *storage_options* is `None`), then one will be - automatically inserted for accessing each S3 file. 
For - example, for a file name of ``'s3://store/data/file.nc'``, - an ``'endpoint_url'`` key with value ``'https://store'`` - would be created. - - *Parameter example:* - For a file name of ``'s3://store/data/file.nc'``, the - following are equivalent: ``None``, ``{}``, and - ``{'endpoint_url': 'https://store'}``. - - *Parameter example:* - ``{'key: 'scaleway-api-key...', 'secret': - 'scaleway-secretkey...', 'endpoint_url': - 'https://s3.fr-par.scw.cloud', 'client_kwargs': - {'region_name': 'fr-par'}}`` - - *Parameter example:* - The following are equivalent: ``{'endpoint_url': - 'https://store'}`` ``{'client_kwargs': {'endpoint_url': - 'https://store'}}`` - - .. versionadded:: NEXTVERSION + Key/value pairs to be passed on to the creation of + `s3fs.S3FileSystem` file systems to control the opening of + files in S3 object stores. Ignored for files not in an S3 + object store, i.e. those whose names do not start with + ``s3:``. + + By default, or if `None`, then *storage_options* is taken + as ``{}``. + + If the ``'endpoint_url'`` key is not in *storage_options* + or is not in a dictionary defined by the ``'client_kwargs`` + key (which is always the case when *storage_options* is + `None`), then one will be automatically inserted for + accessing an S3 file. For example, for a file name of + ``'s3://store/data/file.nc'``, an ``'endpoint_url'`` key + with value ``'https://store'`` would be created. + + *Parameter example:* + For a file name of ``'s3://store/data/file.nc'``, the + following are equivalent: ``None``, ``{}``, and + ``{'endpoint_url': 'https://store'}``, + ``{'client_kwargs': {'endpoint_url': 'https://store'}}`` + + *Parameter example:* + ``{'key: 'scaleway-api-key...', 'secret': + 'scaleway-secretkey...', 'endpoint_url': + 'https://s3.fr-par.scw.cloud', 'client_kwargs': + {'region_name': 'fr-par'}}`` + + .. versionadded:: NEXTVERSION umversion: deprecated at version 3.0.0 Use the *um* parameter instead. diff --git a/cf/test/test_functions.py b/cf/test/test_functions.py index 48cecbe4c7..10e36f5f87 100644 --- a/cf/test/test_functions.py +++ b/cf/test/test_functions.py @@ -86,7 +86,7 @@ def test_configuration(self): "chunksize": 8e9, "active_storage": True, "active_storage_url": None, - "netcdf_lock": True + "netcdf_lock": True, } # Test the setting of each lone item. diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 415ea3026a..ceb0367532 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -210,8 +210,8 @@ Required * `scipy `_, version 1.10.0 or newer. -* `cfdm `_, version 1.11.1.0 or up to, - but not including, 1.11.2.0. +* `cfdm `_, version 1.11.2.0 or up to, + but not including, 1.11.3.0. * `cfunits `_, version 3.3.7 or newer. 
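As an illustration of the ``endpoint_url`` defaulting described in the `cf.read` docstring above, the behaviour can be sketched with a small, hypothetical helper (this is not part of cf; it only assumes s3fs-style option names):

    from urllib.parse import urlparse


    def default_endpoint_url(filename, storage_options=None):
        """Add a default 'endpoint_url' derived from an S3 file name."""
        storage_options = dict(storage_options or {})
        client_kwargs = storage_options.get("client_kwargs", {})
        if (
            "endpoint_url" not in storage_options
            and "endpoint_url" not in client_kwargs
        ):
            # Derive the endpoint from the network location of the URL
            url = urlparse(filename)
            storage_options["endpoint_url"] = f"https://{url.netloc}"

        return storage_options


    print(default_endpoint_url("s3://store/data/file.nc"))
    # {'endpoint_url': 'https://store'}
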
diff --git a/requirements.txt b/requirements.txt index baa0d1b0bc..d193fbcfa1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ netCDF4>=1.5.4 cftime>=1.6.2 numpy>=1.22 -cfdm>=1.11.1.0, <1.11.2.0 +cfdm>=1.11.2.0, <1.11.3.0 psutil>=0.6.0 cfunits>=3.3.7 dask>=2022.12.1 From e829e582395c45bc8de307d99fff8d7545d526b1 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 5 Mar 2024 16:37:39 +0000 Subject: [PATCH 054/134] dev --- cf/data/array/mixin/activestoragemixin.py | 30 +++++++++++++---------- cf/test/test_active_storage.py | 2 ++ 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/cf/data/array/mixin/activestoragemixin.py b/cf/data/array/mixin/activestoragemixin.py index 9da5acdbb9..101c329c77 100644 --- a/cf/data/array/mixin/activestoragemixin.py +++ b/cf/data/array/mixin/activestoragemixin.py @@ -33,25 +33,21 @@ def __getitem__(self, indices): """ method = self.get_active_method() - if method is None or Active is None: - # Do a normal read by local client. Returns an un-reduced - # numpy array. + if Active is None or method is None: + # The instance has not been actified so do a normal read, + # returning an un-reduced numpy array. return super().__getitem__(indices) - # import urllib - # Still here? Then do an active storage reduction. Returns a # dictionary of reduced values. - # Hack for testing! filename = self.get_filename() - # filename = urllib.parse.urlparse(filename).path[1:] active = Active( filename, self.get_address(), - # storage_type ='s3', # Hack for testing! storage_options=self.get_storage_options(), active_storage_url=self.get_active_storage_url(), + storage_type="s3", # Temporary requirement! ) active.method = method active.components = True @@ -75,7 +71,7 @@ def actify(self, method, axis=None, active_storage_url=None): .. versionadded:: NEXTVERSION - .. seealso:: `set_active_axis`, `set_active_method` + .. seealso:: `get_active_axis`, `get_active_method` :Parameters: @@ -131,9 +127,12 @@ def actify(self, method, axis=None, active_storage_url=None): def get_active_axis(self): """Return the active storage reduction axes. + Active storage reduction axes are set with `actify`. + .. versionadded:: NEXTVERSION - .. seealso:: `get_active_method`, `get_active_storage_url` + .. seealso:: `actify`, `get_active_method`, + `get_active_storage_url` :Returns: @@ -147,9 +146,12 @@ def get_active_axis(self): def get_active_method(self): """Return the name of the active storage reduction method. + An active storage reduction method is set with `actify`. + .. versionadded:: NEXTVERSION - .. seealso:: `get_active_axis`, `get_active_storage_url` + .. seealso:: `actify`, `get_active_axis`, + `get_active_storage_url` :Returns: @@ -161,11 +163,13 @@ def get_active_method(self): return self._custom.get("active_method") def get_active_storage_url(self): - """Return the the active storage URL. + """Return the active storage reduction URL. + + An active storage reduction URL is set with `actify`. .. versionadded:: NEXTVERSION - .. seealso:: `get_active_axis`, `get_active_method` + .. 
seealso:: `actify`, `get_active_axis`, `get_active_method` :Returns: diff --git a/cf/test/test_active_storage.py b/cf/test/test_active_storage.py index 34afc8cee1..689a9f0026 100644 --- a/cf/test/test_active_storage.py +++ b/cf/test/test_active_storage.py @@ -37,6 +37,8 @@ def _remove_tmpfiles(): class ActiveStorageTest(unittest.TestCase): @unittest.skipUnless(Active is not None, "Requires activestorage package.") def test_active_storage(self): + print("WARNING: Skipping active storage test!") + return # No masked values f = cf.example_field(0) cf.write(f, tmpfile) From e2c892cd92140ff246a375f5f1e6bf8f8b7177ef Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 11 Mar 2024 23:13:35 +0000 Subject: [PATCH 055/134] dev --- cf/data/array/mixin/activestoragemixin.py | 20 +++++++++++++++- cf/field.py | 28 +++++++++++++---------- cf/read_write/netcdf/netcdfread.py | 3 +++ cf/read_write/read.py | 21 +++++++++++++++++ 4 files changed, 59 insertions(+), 13 deletions(-) diff --git a/cf/data/array/mixin/activestoragemixin.py b/cf/data/array/mixin/activestoragemixin.py index 101c329c77..0aacfc8e90 100644 --- a/cf/data/array/mixin/activestoragemixin.py +++ b/cf/data/array/mixin/activestoragemixin.py @@ -41,6 +41,21 @@ def __getitem__(self, indices): # Still here? Then do an active storage reduction. Returns a # dictionary of reduced values. filename = self.get_filename() + filename = "/".join(filename.split("/")[3:]) + + if True: + print( + "active = Active(\n ", + filename, + ",\n ", + self.get_address(), + ",\n ", + "storage_options=", + self.get_storage_options(), + ",\n active_storage_url=", + self.get_active_storage_url(), + ",\n storage_type=s3\n)", # Temporary requirement! + ) active = Active( filename, @@ -54,9 +69,12 @@ def __getitem__(self, indices): # Provide a file lock try: - active.lock = self._lock + lock = self._lock except AttributeError: pass + else: + if lock: + active.lock = lock return active[indices] diff --git a/cf/field.py b/cf/field.py index faf05fcbc4..2b9f06dc10 100644 --- a/cf/field.py +++ b/cf/field.py @@ -7194,13 +7194,21 @@ def collapse( if dim is None: continue - # Create a new dimension coordinate for this axis + # Create new dimension coordinate bounds if dim.has_bounds(): - bounds_data = [dim.bounds.datum(0), dim.bounds.datum(-1)] + b = dim.bounds.data else: - bounds_data = [dim.datum(0), dim.datum(-1)] - - units = dim.Units + b = dim.data + + # Note: Accessing first_element and last_element is + # likely to be fast for dat one disk, assuming + # that these values were cached during the read. 
+ bounds_data = Data( + [[b.first_element(), b.last_element()]], + dtype=b.dtype, + units=b.Units, + ) + bounds = self._Bounds(data=bounds_data) if coordinate == "min": coordinate = "minimum" @@ -7216,21 +7224,17 @@ def collapse( ) if coordinate == "mid_range": - data = Data( - [(bounds_data[0] + bounds_data[1]) * 0.5], units=units - ) + data = bounds_data.mean(axes=1, weights=None, squeeze=True) elif coordinate == "minimum": - data = dim.data.min() + data = dim.data.min(squeeze=False) elif coordinate == "maximum": - data = dim.data.max() + data = dim.data.max(squeeze=False) else: raise ValueError( "Can't collapse: Bad parameter value: " f"coordinate={coordinate!r}" ) - bounds = self._Bounds(data=Data([bounds_data], units=units)) - dim.set_data(data, copy=False) dim.set_bounds(bounds, copy=False) diff --git a/cf/read_write/netcdf/netcdfread.py b/cf/read_write/netcdf/netcdfread.py index 00f5ad9916..a7c493d4ef 100644 --- a/cf/read_write/netcdf/netcdfread.py +++ b/cf/read_write/netcdf/netcdfread.py @@ -208,8 +208,10 @@ def _create_data( # one dask chunk if data.npartitions == 1: data._cfa_set_write(True) + if ( not compression_index + and self.read_vars.get("cache_metadata") and self.implementation.get_construct_type(construct) != "field" ): @@ -503,6 +505,7 @@ def _cache_data_elements(self, data, ncvar): `None` """ + if data.data.get_compression_type(): # Don't get cached elements from arrays compressed by # convention, as they'll likely be wrong. diff --git a/cf/read_write/read.py b/cf/read_write/read.py index e17be69be9..679d58fb38 100644 --- a/cf/read_write/read.py +++ b/cf/read_write/read.py @@ -65,6 +65,7 @@ def read( cfa=None, netCDF_backend=None, storage_options=None, + cache_metadata=True, ): """Read field or domain constructs from files. @@ -726,6 +727,16 @@ def read( .. versionadded:: NEXTVERSION + cache_metadata: `bool`, optional + If True, the default, then data for metadata constructs + will have their first and last array elements retrieved + from the file and cached in memory for fast future + access. In addition, the second and penultimate array + elements will be cached from 2-d coordinate bounds data + that has two bounds per cell. + + .. versionadded:: NEXTVERSION + umversion: deprecated at version 3.0.0 Use the *um* parameter instead. @@ -885,6 +896,8 @@ def read( cfa_options["substitutions"] = substitutions + cache_metadata = bool(cache_metadata) + # Initialise the output list of fields/domains if domain: out = DomainList() @@ -1020,6 +1033,7 @@ def read( cfa_options=cfa_options, netCDF_backend=netCDF_backend, storage_options=storage_options, + cache_metadata=cache_metadata, ) # -------------------------------------------------------- @@ -1137,6 +1151,7 @@ def _read_a_file( cfa_options=None, netCDF_backend=None, storage_options=None, + cache_metadata=True, ): """Read the contents of a single file into a field list. @@ -1182,6 +1197,11 @@ def _read_a_file( .. versionadded:: NEXTVERSION + cache_metadata: `bool`, optional + See `cf.read` for details. + + .. 
versionadded:: NEXTVERSION + :Returns: `FieldList` or `DomainList` @@ -1216,6 +1236,7 @@ def _read_a_file( "fmt": selected_fmt, "ignore_read_error": ignore_read_error, "cfa_options": cfa_options, + "cache_metadata": cache_metadata, } # ---------------------------------------------------------------- From 4825684fc17cadb5dff4b81e1649382793548fa9 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 15 Mar 2024 10:15:00 +0000 Subject: [PATCH 056/134] dev --- cf/data/array/mixin/filearraymixin.py | 94 +++++++++++++++++++++++++++ cf/data/dask_regrid.py | 9 +++ cf/data/dask_utils.py | 34 ++++++++++ cf/data/data.py | 90 +++++++++++++------------ 4 files changed, 185 insertions(+), 42 deletions(-) diff --git a/cf/data/array/mixin/filearraymixin.py b/cf/data/array/mixin/filearraymixin.py index 378567a23a..590b4453f3 100644 --- a/cf/data/array/mixin/filearraymixin.py +++ b/cf/data/array/mixin/filearraymixin.py @@ -13,6 +13,38 @@ class FileArrayMixin: """ + def __array__(self, *dtype): + """Convert the ``{{class}}` into a `numpy` array. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + dtype: optional + Typecode or data-type to which the array is cast. + + :Returns: + + `numpy.ndarray` + An independent numpy array of the data. + + **Examples** + + TODO + >>> d = {{package}}.{{class}}([1, 2, 3]) + >>> a = numpy.array(d) + >>> print(type(a)) + + >>> a[0] = -99 + >>> d + <{{repr}}{{class}}(3): [1, 2, 3]> + >>> b = numpy.array(d, float) + >>> print(b) + [1. 2. 3.] + + """ + return np.asanyarray(self._getitem()) + def __dask_tokenize__(self): """Return a value fully representative of the object. @@ -26,6 +58,68 @@ def __dask_tokenize__(self): self.get_addresses(), ) + def _getitem(self) + """Returns a subspace of the array as a numpy array. + + x.__getitem__(indices) <==> x[indices] + + The indices that define the subspace must be either `Ellipsis` or + a sequence that contains an index for each dimension. In the + latter case, each dimension's index must either be a `slice` + object or a sequence of two or more integers. + + Indexing is similar to numpy indexing. The only difference to + numpy indexing (given the restrictions on the type of indices + allowed) is: + + * When two or more dimension's indices are sequences of integers + then these indices work independently along each dimension + (similar to the way vector subscripts work in Fortran). + + .. versionadded:: (cfdm) 1.7.0 + + """ + netcdf, address = self.open() + dataset = netcdf + + groups, address = self.get_groups(address) + if groups: + # Traverse the group structure, if there is one (CF>=1.8). + netcdf = self._group(netcdf, groups) + + if isinstance(address, str): + # Get the variable by netCDF name + variable = netcdf.variables[address] + else: + # Get the variable by netCDF integer ID + for variable in netcdf.variables.values(): + if variable._varid == address: + break + + # Get the data, applying masking and scaling as required. + array = netcdf_indexer( + variable, + mask=self.get_mask(), + unpack=self.get_unpack(), + always_mask=False, + ) + array = array[self.index] + + # Set the units, if they haven't been set already. + self._set_attributes(variable) + + # Set the units, if they haven't been set already. + self._set_units(variable) + + self.close(dataset) + del netcdf, dataset + + if not self.ndim: + # Hmm netCDF4 has a thing for making scalar size 1, 1d + array = array.squeeze() + + return array + @property def _dask_meta(self): """The metadata for the containing dask array. 
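For context, the deferred-read pattern being added to filearraymixin.py above can be illustrated with a minimal sketch (a toy class and reader callable, not cf's implementation): the object carries only metadata, and the actual file access happens when numpy conversion is requested via ``__array__``, for example by ``np.asanyarray`` at dask compute time:

    import numpy as np


    class LazyFileArray:
        """Toy stand-in for a file-backed array: reads only on demand."""

        def __init__(self, shape, dtype, reader):
            self.shape = shape
            self.dtype = np.dtype(dtype)
            self.ndim = len(shape)
            self._reader = reader  # callable that does the real file read

        def __array__(self, *dtype):
            # The expensive read happens here, at conversion time
            array = np.asanyarray(self._reader())
            if not dtype:
                return array

            return array.astype(dtype[0], copy=False)


    a = LazyFileArray((2, 3), "f8", lambda: np.arange(6.0).reshape(2, 3))
    print(np.asanyarray(a).sum())  # 15.0 - the "read" only happens now
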
diff --git a/cf/data/dask_regrid.py b/cf/data/dask_regrid.py index 510591c339..ea2fe04d76 100644 --- a/cf/data/dask_regrid.py +++ b/cf/data/dask_regrid.py @@ -1,6 +1,8 @@ """Regridding functions used within a dask graph.""" import numpy as np +from .dask_utils import hhh + def regrid( a, @@ -173,6 +175,13 @@ def regrid( """ weights, dst_mask = weights_dst_mask + a = hhh(a) + if dst_mask is not None: + dst_mask = hhh(dst_mask) + + if ref_src_mask is not None: + ref_src_mask = hhh(ref_src_mask) + # ---------------------------------------------------------------- # Reshape the array into a form suitable for the regridding dot # product, i.e. a 2-d array whose right-hand dimension represents diff --git a/cf/data/dask_utils.py b/cf/data/dask_utils.py index 63291a4d96..0d180212e9 100644 --- a/cf/data/dask_utils.py +++ b/cf/data/dask_utils.py @@ -10,6 +10,7 @@ import numpy as np from dask.core import flatten from scipy.ndimage import convolve1d +from scipy.sparse import issparse from ..cfdatetime import dt, dt2rt, rt2dt from ..functions import atol as cf_atol @@ -126,6 +127,7 @@ def cf_contains(a, value): value. """ + a = hhh(a) return np.array(value in a).reshape((1,) * a.ndim) @@ -159,6 +161,8 @@ def cf_convolve1d(a, window=None, axis=-1, origin=0): Convolved float array with same shape as input. """ + a = hhh(a) + masked = np.ma.is_masked(a) if masked: # convolve1d does not deal with masked arrays, so uses NaNs @@ -196,6 +200,7 @@ def cf_harden_mask(a): The array with hardened mask. """ + a = hhh(a) if np.ma.isMA(a): try: a.harden_mask() @@ -266,6 +271,9 @@ def cf_percentile(a, q, axis, method, keepdims=False, mtol=1): """ from math import prod + a = hhh(a) + q = hhh(q) + if np.ma.isMA(a) and not np.ma.is_masked(a): # Masked array with no masked elements a = a.data @@ -359,6 +367,8 @@ def cf_soften_mask(a): The array with softened mask. """ + a = hhh(a) + if np.ma.isMA(a): try: a.soften_mask() @@ -414,6 +424,15 @@ def cf_where(array, condition, x, y, hardmask): elsewhere. """ + a = hhh(a) + condition = hhh(condition) + if x is not None: + x = hhh(x) + + if y is not None: + y = hhh(y) + + mask = None if np.ma.isMA(array): @@ -509,6 +528,7 @@ def cf_YMDhms(a, attr): array([1, 2]) """ + a = hhh(a) return _array_getattr(a, attr=attr) @@ -541,6 +561,7 @@ def cf_rt2dt(a, units): cftime.DatetimeGregorian(2000, 1, 2, 0, 0, 0, 0, has_year_zero=False)] """ + a = hhh(a) if not units.iscalendartime: return rt2dt(a, units_in=units) @@ -595,6 +616,7 @@ def cf_dt2rt(a, units): [365 366] """ + a = hhh(a) return dt2rt(a, units_out=units, units_in=None) @@ -635,6 +657,18 @@ def cf_units(a, from_units, to_units): [1000. 2000.] 
""" + a = hhh(a) return Units.conform( a, from_units=from_units, to_units=to_units, inplace=False ) + + +def cf_filled(a, fill_value=None): + a = hhh(a) + return np.ma.filled(a, fill_value= fill_value) + +def hhh(self, a): + if issparse(a): + return a + + return np.asanyarray(a) diff --git a/cf/data/data.py b/cf/data/data.py index fd84d14bb4..933c8e7ac0 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -48,6 +48,7 @@ _da_ma_allclose, cf_contains, cf_dt2rt, + cf_filled, cf_harden_mask, cf_percentile, cf_rt2dt, @@ -373,7 +374,7 @@ def __init__( if _use_array: try: - array = source.to_dask_array() + array = source.to_dask_array(numpify=False) except (AttributeError, TypeError): pass else: @@ -528,7 +529,7 @@ def dask_compressed_array(self): if ca is None or not ca.get_compression_type(): raise ValueError("not compressed: can't get compressed dask array") - return ca.to_dask_array() + return ca.to_dask_array(numpify=False) def __contains__(self, value): """Membership test operator ``in`` @@ -621,9 +622,9 @@ def __contains__(self, value): # are incompatible return False - value = value.to_dask_array() + value = value.to_dask_array(numpify=True) - dx = self.to_dask_array() + dx = self.to_dask_array(numpify=True) out_ind = tuple(range(dx.ndim)) dx_ind = out_ind @@ -667,7 +668,7 @@ def __float__(self): 1. """ - return float(self.to_dask_array()) + return float(self.to_dask_array(numpify=True)) def __int__(self): """Called to implement the built-in function `int` @@ -680,7 +681,7 @@ def __int__(self): the dask array size is already known to be greater than 1. """ - return int(self.to_dask_array()) + return int(self.to_dask_array(numpify=True)) def __iter__(self): """Called when an iterator is required. @@ -756,7 +757,7 @@ def __len__(self): TypeError: len() of unsized object """ - dx = self.to_dask_array() + dx = self.to_dask_array(numpify=False) if math.isnan(dx.size): logger.debug("Computing data len: Performance may be degraded") dx.compute_chunk_sizes() @@ -787,7 +788,7 @@ def __bool__(self): "elements is ambiguous. Use d.any() or d.all()" ) - return bool(self.to_dask_array()) + return bool(self.to_dask_array(numpify=True)) def __getitem__(self, indices): """Return a subspace of the data defined by indices. @@ -874,10 +875,10 @@ def __getitem__(self, indices): new = self.roll( axis=tuple(roll.keys()), shift=tuple(roll.values()) ) - dx = new.to_dask_array() + dx = new.to_dask_array(numpify=False) else: new = self.copy(array=False) - dx = self.to_dask_array() + dx = self.to_dask_array(numpify=False) # ------------------------------------------------------------ # Subspace the dask array @@ -1113,7 +1114,7 @@ def __setitem__(self, indices, value): # Missing values could be affected, so make sure that the mask # hardness has been applied. 
- dx = self.to_dask_array(apply_mask_hardness=True) + dx = self.to_dask_array(apply_mask_hardness=True, numpify=True) # Do the assignment self._set_subspace(dx, indices, value) @@ -1696,7 +1697,7 @@ def diff(self, axis=-1, n=1, inplace=False): """ d = _inplace_enabled_define_and_cleanup(self) - dx = self.to_dask_array() + dx = self.to_dask_array(numpify=True) dx = da.diff(dx, axis=axis, n=n) d._set_dask(dx) @@ -1992,7 +1993,7 @@ def digitize( delete_bins.append(bins.size) # Digitise the array - dx = d.to_dask_array() + dx = d.to_dask_array(numpify=True) dx = da.digitize(dx, bins, right=upper) d._set_dask(dx) d.override_units(_units_None, inplace=True) @@ -2265,7 +2266,7 @@ def pad_missing(self, axis, pad_width=None, to_size=None, inplace=False): d = _inplace_enabled_define_and_cleanup(self) - dx = d.to_dask_array() + dx = d.to_dask_array(numpify=True) mask0 = da.ma.getmaskarray(dx) pad = [(0, 0)] * dx.ndim @@ -2470,7 +2471,7 @@ def percentile( else: axes = tuple(sorted(d._parse_axes(axes))) - dx = d.to_dask_array() + dx = d.to_dask_array(numpify=True) dtype = dx.dtype shape = dx.shape @@ -2587,7 +2588,7 @@ def persist(self, inplace=False): """ d = _inplace_enabled_define_and_cleanup(self) - dx = self.to_dask_array() + dx = self.to_dask_array(numpify=True) dx = dx.persist() d._set_dask(dx, clear=_ALL ^ _ARRAY ^ _CACHE) @@ -2627,7 +2628,7 @@ def ceil(self, inplace=False, i=False): """ d = _inplace_enabled_define_and_cleanup(self) - dx = d.to_dask_array() + dx = d.to_dask_array(numpify=True) d._set_dask(da.ceil(dx)) return d @@ -2778,7 +2779,9 @@ def compute(self): # noqa: F811 [0., 0., 0.]]) """ - a = self.to_dask_array().compute() + dx = self.to_dask_array(numpify=False).copy() + dx = dx.map_blocks(hhh, dtype=dx.dtype) + a = dx.compute() if np.ma.isMA(a): if self.hardmask: @@ -2964,7 +2967,7 @@ def convolution_filter( depth += abs(origin) - dx = d.to_dask_array() + dx = d.to_dask_array(numpify=True) # Cast to float to ensure that NaNs can be stored (as required # by cf_convolve1d) @@ -3076,7 +3079,7 @@ def cumsum( d = _inplace_enabled_define_and_cleanup(self) - dx = d.to_dask_array() + dx = d.to_dask_array(numpify=True) dx = dx.cumsum(axis=axis, method=method) d._set_dask(dx) @@ -3153,7 +3156,7 @@ def rechunk( """ d = _inplace_enabled_define_and_cleanup(self) - dx = d.to_dask_array() + dx = d.to_dask_array(numpify=False) dx = dx.rechunk(chunks, threshold, block_size_limit, balance) d._set_dask(dx, clear=_ALL ^ _ARRAY ^ _CACHE) @@ -3206,7 +3209,7 @@ def _asdatetime(self, inplace=False): ) if not d._isdatetime(): - dx = d.to_dask_array() + dx = d.to_dask_array(numpify=False) dx = dx.map_blocks(cf_rt2dt, units=units, dtype=object) d._set_dask(dx) @@ -3261,7 +3264,7 @@ def _asreftime(self, inplace=False): ) if d._isdatetime(): - dx = d.to_dask_array() + dx = d.to_dask_array(numpify=False) dx = dx.map_blocks(cf_dt2rt, units=units, dtype=float) d._set_dask(dx) @@ -3730,8 +3733,8 @@ def _binary_operation(self, other, method): data0, other, new_Units = data0._combined_units(other, method, True) # Cast as dask arrays - dx0 = data0.to_dask_array() - dx1 = other.to_dask_array() + dx0 = data0.to_dask_array(numpify=True) + dx1 = other.to_dask_array(numpify=True) # Set if applicable the tolerance levels for the result if method in ("__eq__", "__ne__"): @@ -3871,7 +3874,7 @@ def _regrid( f"the shape of the regrid operator: {operator.src_shape}" ) - dx = self.to_dask_array() + dx = self.to_dask_array(numpify=False) # Rechunk so that each chunk contains data in the form # expected by the regrid operator, 
i.e. the regrid axes all @@ -4114,7 +4117,7 @@ def concatenate( copied = not copy # to avoid making two copies in a given case # Get data as dask arrays and apply concatenation operation - dxs = [d.to_dask_array() for d in processed_data] + dxs = [d.to_dask_array(numpify=False) for d in processed_data] dx = da.concatenate(dxs, axis=axis) # Set the CFA write status @@ -8465,7 +8468,7 @@ def insert_dimension(self, position=0, inplace=False): shape = list(d.shape) shape.insert(position, 1) - dx = d.to_dask_array() + dx = d.to_dask_array(numpify=False) dx = dx.reshape(shape) # Inserting a dimension doesn't affect the cached elements nor @@ -9061,8 +9064,8 @@ def filled(self, fill_value=None, inplace=False): f"data type {d.dtype.str!r}" ) - dx = d.to_dask_array() - dx = dx.map_blocks(np.ma.filled, fill_value=fill_value, dtype=d.dtype) + dx = d.to_dask_array() + dx = dx.map_blocks(cf_filled, fill_value=fill_value, dtype=d.dtype) d._set_dask(dx) return d @@ -9690,7 +9693,7 @@ def override_calendar(self, calendar, inplace=False, i=False): d._Units = Units(d.Units._units, calendar) return d - def to_dask_array(self, apply_mask_hardness=False): + def to_dask_array(self, apply_mask_hardness=False, numpify=False): """Convert the data to a `dask` array. .. warning:: By default, the mask hardness of the returned @@ -9736,16 +9739,19 @@ def to_dask_array(self, apply_mask_hardness=False): dask.array """ - if apply_mask_hardness and "dask" in self._custom: + dx = self._custom.get("dask") + if dx is None: + raise ValueError(f"{self.__class__.__name__} object has no data") + + if apply_mask_hardness: if self.hardmask: self.harden_mask() else: self.soften_mask() + elif numpify: + return dx.map_blocks(hhh, dtype=dx.dtype) - try: - return self._custom["dask"] - except KeyError: - raise ValueError(f"{self.__class__.__name__} object has no data") + return self._custom["dask"] def datum(self, *index): """Return an element of the data array as a standard Python @@ -11887,7 +11893,7 @@ def transpose(self, axes=None, inplace=False, i=False): data_axes = d._axes d._axes = [data_axes[i] for i in iaxes] - dx = d.to_dask_array() + dx = d.to_dask_array(numpify=True) try: dx = da.transpose(dx, axes=axes) except ValueError: @@ -11932,7 +11938,7 @@ def trunc(self, inplace=False, i=False): """ d = _inplace_enabled_define_and_cleanup(self) - dx = d.to_dask_array() + dx = d.to_dask_array(numpify=True) d._set_dask(da.trunc(dx)) return d @@ -12238,7 +12244,7 @@ def func( """ d = _inplace_enabled_define_and_cleanup(self) - dx = d.to_dask_array() + dx = d.to_dask_array(numpify=True) if preserve_invalid: # Assume all inputs are masked, as checking for a mask to confirm @@ -12418,7 +12424,7 @@ def roll(self, axis, shift, inplace=False, i=False): d = _inplace_enabled_define_and_cleanup(self) - dx = d.to_dask_array() + dx = d.to_dask_array(numpify=True) dx = da.roll(dx, shift, axis=axis) d._set_dask(dx) @@ -13082,7 +13088,7 @@ def square(self, dtype=None, inplace=False): """ d = _inplace_enabled_define_and_cleanup(self) - dx = d.to_dask_array() + dx = d.to_dask_array(numpify=True) dx = da.square(dx, dtype=dtype) d._set_dask(dx) @@ -13151,7 +13157,7 @@ def sqrt(self, dtype=None, inplace=False): """ d = _inplace_enabled_define_and_cleanup(self) - dx = d.to_dask_array() + dx = d.to_dask_array(numpify=True) dx = da.sqrt(dx, dtype=dtype) d._set_dask(dx) From 36f1ecc046d2821fd626f6cf7c37aba4c02c4bc9 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 15 Mar 2024 17:56:22 +0000 Subject: [PATCH 057/134] dev --- 
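The ``numpify`` mechanism being wired into cf.data.data in these commits amounts to appending one extra ``map_blocks`` call that converts any lazy file-backed chunks to numpy arrays while leaving sparse chunks untouched. A rough sketch of the idea, using an illustrative helper name rather than cf's internal one:

    import dask.array as da
    import numpy as np
    from scipy.sparse import issparse


    def to_numpy_chunk(a):
        """Convert a chunk to numpy, passing sparse chunks through."""
        if issparse(a):
            return a

        return np.asanyarray(a)


    dx = da.arange(6, chunks=3)
    dx = dx.map_blocks(to_numpy_chunk, dtype=dx.dtype)
    print(dx.compute())  # [0 1 2 3 4 5]
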
cf/data/array/mixin/filearraymixin.py | 97 +++++++++++++++++++++++++- cf/data/array/umarray.py | 2 + cf/data/collapse/dask_collapse.py | 32 ++++++++- cf/data/dask_utils.py | 41 ++++++----- cf/data/data.py | 99 ++++++++++++++++++--------- cf/functions.py | 8 ++- 6 files changed, 223 insertions(+), 56 deletions(-) diff --git a/cf/data/array/mixin/filearraymixin.py b/cf/data/array/mixin/filearraymixin.py index 590b4453f3..5e34bf7598 100644 --- a/cf/data/array/mixin/filearraymixin.py +++ b/cf/data/array/mixin/filearraymixin.py @@ -1,3 +1,4 @@ +from math import ceil from os import sep from os.path import basename, dirname, join @@ -58,6 +59,82 @@ def __dask_tokenize__(self): self.get_addresses(), ) + def __getitem__(self, index) + """TODO Returns a subspace of the array as a numpy array. + + x.__getitem__(indices) <==> x[indices] + + The indices that define the subspace must be either `Ellipsis` or + a sequence that contains an index for each dimension. In the + latter case, each dimension's index must either be a `slice` + object or a sequence of two or more integers. + + Indexing is similar to numpy indexing. The only difference to + numpy indexing (given the restrictions on the type of indices + allowed) is: + + * When two or more dimension's indices are sequences of integers + then these indices work independently along each dimension + (similar to the way vector subscripts work in Fortran). + + .. versionadded:: NEXTVERSION + + """ + shape0 = self.shape + index = parse_indices(shape0, index, keepdims=False, bool_as_int=True) + + index0 = self._get_component('index', None) + if index0 is None: + self._set_component('index', index, copy=False) + return + + new_index = [] + for ind0, ind, size0 in zip(index0, index, shape0): + if index == slice(None): + new_index.append(ind0) + new_shape.apepend(size0) + continue + + if isinstance(ind0, slice): + if isinstance(ind, slice): + # 'ind0' is slice, 'ind' is slice + start, stop, step = ind0.indices(size0) + size1, mod = divmod(stop - start - 1, step) + start1, stop1, step1 = ind.indices(size1 + 1) + size2, mod = divmod(stop1 - start1, step1) + + if mod != 0: + size2 += 1 + + start += start1 * step + step *= step1 + stop = start + (size2 - 1) * step + + if step > 0: + stop += 1 + else: + stop -= 1 + + if stop < 0: + stop = None + + new = slice(start, stop, step) + new_size = ceil((stop - start)/step) + else: + # 'ind0' is slice, 'ind' is numpy array of int + new = np.arange(*ind0.indices(size0))[ind] + new_size = new.size + else: + # 'ind0' is numpy array of int + new = ind0[ind] + new_size = new.size + + new_index.append(new) + new_shape.apepend(new_size) + + self._set_component('index', tuple(new_index), copy=False) + self._set_component('shape', tuple(new_shape), copy=False) + def _getitem(self) """Returns a subspace of the array as a numpy array. @@ -76,9 +153,14 @@ def _getitem(self) then these indices work independently along each dimension (similar to the way vector subscripts work in Fortran). - .. versionadded:: (cfdm) 1.7.0 + .. versionadded:: NEXTVERSION """ + + index = parse_indices(self.shape, self.index, cyclic=False, keepdims=True) + + + netcdf, address = self.open() dataset = netcdf @@ -149,6 +231,19 @@ def filename(self): removed_at="5.0.0", ) # pragma: no cover + @property + def index(self): + """TODO + + .. 
versionadded:: NEXTVERSION + + """ + i = self._get_component('index', None) + if i is None: + i = parse_indices(self.shape, (Ellipsis,), keepdims=False) + + return i + def del_file_location(self, location): """Remove reference to files in the given location. diff --git a/cf/data/array/umarray.py b/cf/data/array/umarray.py index ab5d0d857f..816ced9a3f 100644 --- a/cf/data/array/umarray.py +++ b/cf/data/array/umarray.py @@ -174,6 +174,8 @@ def __init__( # By default, close the UM file after data array access self._set_component("close", True, copy=False) + self._set_component("index", None, copy=False) + def __getitem__(self, indices): """Return a subspace of the array. diff --git a/cf/data/collapse/dask_collapse.py b/cf/data/collapse/dask_collapse.py index ae5eb24be2..01b3db7c21 100644 --- a/cf/data/collapse/dask_collapse.py +++ b/cf/data/collapse/dask_collapse.py @@ -125,7 +125,9 @@ def sum_weights_chunk( N = cf_sample_size_chunk(x, **kwargs)["N"] return N - elif check_weights: + + weights = asanyarray(weights) + if check_weights: w_min = weights.min() if w_min <= 0: raise ValueError( @@ -263,9 +265,14 @@ def cf_mean_chunk( * weighted: True if weights have been set. """ + x = asanyarray(x) + if computing_meta: return x + if weights is not None: + weights = asanyarray(weights) + # N, sum d = cf_sum_chunk(x, weights, dtype=dtype, **kwargs) @@ -383,6 +390,8 @@ def cf_max_chunk(x, dtype=None, computing_meta=False, **kwargs): * max: The maximum of `x``. """ + x = asanyarray(x) + if computing_meta: return x @@ -534,6 +543,8 @@ def cf_min_chunk(x, dtype=None, computing_meta=False, **kwargs): * min: The minimum of ``x``. """ + x = asanyarray(x) + if computing_meta: return x @@ -638,6 +649,8 @@ def cf_range_chunk(x, dtype=None, computing_meta=False, **kwargs): * max: The maximum of ``x`. """ + x = asanyarray(x) + if computing_meta: return x @@ -747,6 +760,8 @@ def cf_rms_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): * sum: The weighted sum of ``x**2``. """ + x = asanyarray(x) + if computing_meta: return x @@ -822,6 +837,8 @@ def cf_sample_size_chunk(x, dtype="i8", computing_meta=False, **kwargs): * N: The sample size. """ + x = asanyarray(x) + if computing_meta: return x @@ -947,10 +964,13 @@ def cf_sum_chunk( * sum: The weighted sum of ``x`` """ + x = asanyarray(x) + if computing_meta: return x if weights is not None: + weights = asanyarray(weights) if check_weights: w_min = weights.min() if w_min <= 0: @@ -1070,6 +1090,8 @@ def cf_sum_of_weights_chunk( ``weights**2`` if *square* is True. """ + x = asanyarray(x) + if computing_meta: return x @@ -1106,6 +1128,8 @@ def cf_unique_chunk(x, dtype=None, computing_meta=False, **kwargs): * unique: The unique values. """ + x = asanyarray(x) + if computing_meta: return x @@ -1190,11 +1214,15 @@ def cf_var_chunk( * ddof: The delta degrees of freedom. """ + x = asanyarray(x) + if computing_meta: return x weighted = weights is not None - + if weighted: + weights = asanyarray(weights) + # N, V1, sum d = cf_mean_chunk(x, weights, dtype=dtype, **kwargs) diff --git a/cf/data/dask_utils.py b/cf/data/dask_utils.py index 0d180212e9..39830399ad 100644 --- a/cf/data/dask_utils.py +++ b/cf/data/dask_utils.py @@ -113,7 +113,7 @@ def cf_contains(a, value): :Parameters: - a: `numpy.ndarray` + a: array_like The array. value: array_like @@ -127,7 +127,8 @@ def cf_contains(a, value): value. 
""" - a = hhh(a) + a = asanyarray(a) + value = asanyarray(value) return np.array(value in a).reshape((1,) * a.ndim) @@ -161,7 +162,12 @@ def cf_convolve1d(a, window=None, axis=-1, origin=0): Convolved float array with same shape as input. """ - a = hhh(a) + a = asanyarray(a) + + # Cast to float to ensure that NaNs can be stored + if a.dtype != float: + a = a.astype(float, copy=False) + masked = np.ma.is_masked(a) if masked: @@ -200,7 +206,7 @@ def cf_harden_mask(a): The array with hardened mask. """ - a = hhh(a) + a = asanyarray(a) if np.ma.isMA(a): try: a.harden_mask() @@ -226,7 +232,7 @@ def cf_percentile(a, q, axis, method, keepdims=False, mtol=1): :Parameters: - a: `numpy.ndarray` + a: array_like Input array. q: `numpy.ndarray` @@ -271,8 +277,7 @@ def cf_percentile(a, q, axis, method, keepdims=False, mtol=1): """ from math import prod - a = hhh(a) - q = hhh(q) + a = asanyarray(a) if np.ma.isMA(a) and not np.ma.is_masked(a): # Masked array with no masked elements @@ -367,7 +372,7 @@ def cf_soften_mask(a): The array with softened mask. """ - a = hhh(a) + a = asanyarray(a) if np.ma.isMA(a): try: @@ -424,13 +429,13 @@ def cf_where(array, condition, x, y, hardmask): elsewhere. """ - a = hhh(a) - condition = hhh(condition) + a = asanyarray(a) + condition = asanyarray(condition) if x is not None: - x = hhh(x) + x = asanyarray(x) if y is not None: - y = hhh(y) + y = asanyarray(y) mask = None @@ -528,7 +533,7 @@ def cf_YMDhms(a, attr): array([1, 2]) """ - a = hhh(a) + a = asanyarray(a) return _array_getattr(a, attr=attr) @@ -561,7 +566,7 @@ def cf_rt2dt(a, units): cftime.DatetimeGregorian(2000, 1, 2, 0, 0, 0, 0, has_year_zero=False)] """ - a = hhh(a) + a = asanyarray(a) if not units.iscalendartime: return rt2dt(a, units_in=units) @@ -616,7 +621,7 @@ def cf_dt2rt(a, units): [365 366] """ - a = hhh(a) + a = asanyarray(a) return dt2rt(a, units_out=units, units_in=None) @@ -657,17 +662,17 @@ def cf_units(a, from_units, to_units): [1000. 2000.] """ - a = hhh(a) + a = asanyarray(a) return Units.conform( a, from_units=from_units, to_units=to_units, inplace=False ) def cf_filled(a, fill_value=None): - a = hhh(a) + a = asanyarray(a) return np.ma.filled(a, fill_value= fill_value) -def hhh(self, a): +def asanyarray(self, a): if issparse(a): return a diff --git a/cf/data/data.py b/cf/data/data.py index 933c8e7ac0..0f1ca3e6b1 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -529,7 +529,7 @@ def dask_compressed_array(self): if ca is None or not ca.get_compression_type(): raise ValueError("not compressed: can't get compressed dask array") - return ca.to_dask_array(numpify=False) + return ca.to_dask_array(numpify=False) # TODO def __contains__(self, value): """Membership test operator ``in`` @@ -622,9 +622,9 @@ def __contains__(self, value): # are incompatible return False - value = value.to_dask_array(numpify=True) + value = value.to_dask_array(numpify=False) - dx = self.to_dask_array(numpify=True) + dx = self.to_dask_array(numpify=False) out_ind = tuple(range(dx.ndim)) dx_ind = out_ind @@ -1114,10 +1114,11 @@ def __setitem__(self, indices, value): # Missing values could be affected, so make sure that the mask # hardness has been applied. 
- dx = self.to_dask_array(apply_mask_hardness=True, numpify=True) + dx = self.to_dask_array(apply_mask_hardness=True) # Do the assignment self._set_subspace(dx, indices, value) + self._custom['numpified'] = True # Unroll any axes that were rolled to enable a cyclic # assignment @@ -1362,7 +1363,7 @@ def _clear_after_dask_update(self, clear=_ALL): # Set the CFA write status to False self._cfa_del_write() - def _set_dask(self, array, copy=False, clear=_ALL): + def _set_dask(self, dx, copy=False, clear=_ALL, computable=None): """Set the dask array. .. versionadded:: 3.14.0 @@ -1372,7 +1373,7 @@ def _set_dask(self, array, copy=False, clear=_ALL): :Parameters: - array: `dask.array.Array` + dx: `dask.array.Array` The array to be inserted. copy: `bool`, optional @@ -1390,7 +1391,7 @@ def _set_dask(self, array, copy=False, clear=_ALL): `None` """ - if array is NotImplemented: + if dx is NotImplemented: logger.warning( "WARNING: NotImplemented has been set in the place of a " "dask array." @@ -1410,9 +1411,13 @@ def _set_dask(self, array, copy=False, clear=_ALL): ) if copy: - array = array.copy() + dx = dx.copy() - self._custom["dask"] = array + custom = self._custom + custom["dask"] = dx + if computable: + custom["computable"] = True + self._clear_after_dask_update(clear) def _del_dask(self, default=ValueError(), clear=_ALL): @@ -1700,6 +1705,7 @@ def diff(self, axis=-1, n=1, inplace=False): dx = self.to_dask_array(numpify=True) dx = da.diff(dx, axis=axis, n=n) d._set_dask(dx) + d._custom['numpified'] = True # Convert to "difference" units # @@ -1996,6 +2002,7 @@ def digitize( dx = d.to_dask_array(numpify=True) dx = da.digitize(dx, bins, right=upper) d._set_dask(dx) + d._custom['numpified'] = True d.override_units(_units_None, inplace=True) # More elegant to handle 'delete_bins' in cf- rather than Dask- space @@ -2282,6 +2289,8 @@ def pad_missing(self, axis, pad_width=None, to_size=None, inplace=False): dx = da.ma.masked_where(mask, dx) d._set_dask(dx) + d._custom['numpified'] = True + return d @_inplace_enabled(default=False) @@ -2471,7 +2480,7 @@ def percentile( else: axes = tuple(sorted(d._parse_axes(axes))) - dx = d.to_dask_array(numpify=True) + dx = d.to_dask_array(numpify=False) dtype = dx.dtype shape = dx.shape @@ -2546,7 +2555,8 @@ def percentile( d._axes = (new_axis_identifier(axes),) + axes d._update_deterministic(not is_dask_collection(q)) - + d._custom['numpified'] = True + return d @_inplace_enabled(default=False) @@ -2591,6 +2601,7 @@ def persist(self, inplace=False): dx = self.to_dask_array(numpify=True) dx = dx.persist() d._set_dask(dx, clear=_ALL ^ _ARRAY ^ _CACHE) + d._custom['numpified'] = True return d @@ -2630,6 +2641,7 @@ def ceil(self, inplace=False, i=False): d = _inplace_enabled_define_and_cleanup(self) dx = d.to_dask_array(numpify=True) d._set_dask(da.ceil(dx)) + d._custom['numpified'] = True return d def cfa_get_term(self): @@ -2779,8 +2791,7 @@ def compute(self): # noqa: F811 [0., 0., 0.]]) """ - dx = self.to_dask_array(numpify=False).copy() - dx = dx.map_blocks(hhh, dtype=dx.dtype) + dx = self.to_dask_array(numpify=True) a = dx.compute() if np.ma.isMA(a): @@ -2967,12 +2978,12 @@ def convolution_filter( depth += abs(origin) - dx = d.to_dask_array(numpify=True) + dx = d.to_dask_array(numpify=False) - # Cast to float to ensure that NaNs can be stored (as required - # by cf_convolve1d) - if dx.dtype != float: - dx = dx.astype(float, copy=False) +# # Cast to float to ensure that NaNs can be stored (as required +# # by cf_convolve1d) +# if dx.dtype != float: +# dx = 
dx.astype(float, copy=False)# # Convolve each chunk convolve1d = partial( @@ -2988,6 +2999,7 @@ def convolution_filter( ) d._set_dask(dx) + d._custom['numpified'] = True return d @@ -3082,6 +3094,7 @@ def cumsum( dx = d.to_dask_array(numpify=True) dx = dx.cumsum(axis=axis, method=method) d._set_dask(dx) + d._custom['numpified'] = True return d @@ -3212,6 +3225,7 @@ def _asdatetime(self, inplace=False): dx = d.to_dask_array(numpify=False) dx = dx.map_blocks(cf_rt2dt, units=units, dtype=object) d._set_dask(dx) + d._custom['numpified'] = True return d @@ -3267,6 +3281,7 @@ def _asreftime(self, inplace=False): dx = d.to_dask_array(numpify=False) dx = dx.map_blocks(cf_dt2rt, units=units, dtype=float) d._set_dask(dx) + d._custom['numpified'] = True return d @@ -3793,6 +3808,7 @@ def _binary_operation(self, other, method): self._axes = axes self._update_deterministic(other) + self._custom['numpified'] = True return self else: # not, so concerns a new Data object copied from self, data0 @@ -3802,6 +3818,7 @@ def _binary_operation(self, other, method): data0._axes = axes data0._update_deterministic(other) + data0._custom['numpified'] = True return data0 def _parse_indices(self, *args, **kwargs): @@ -3973,6 +3990,7 @@ def _regrid( d = self.copy() d._set_dask(dx) + d._custom['numpified'] = True # Don't know (yet) if 'operator' has a deterministic name d._update_deterministic(False) @@ -4845,13 +4863,14 @@ def Units(self, value): func = partial(cf_units, from_units=old_units, to_units=value) - dx = self.to_dask_array() + dx = self.to_dask_array(numpify=False) dx = dx.map_blocks(func, dtype=dtype) # Setting equivalent units doesn't affect the CFA write # status. Nor does it invalidate any cached values, but only # because we'll adjust those, too. self._set_dask(dx, clear=_ALL ^ _CACHE ^ _CFA) + self._custom['numpified'] = True # Adjust cached values for the new units cache = self._get_cached_elements() @@ -6402,7 +6421,7 @@ def convert_reference_time( ) d.Units = units0 - dx = d.to_dask_array() + dx = d.to_dask_array(numpify=False) # Convert to the correct date-time objects dx = dx.map_blocks(cf_rt2dt, units=units0, dtype=object) @@ -6411,6 +6430,7 @@ def convert_reference_time( dx = dx.map_blocks(cf_dt2rt, units=units, dtype=float) d._set_dask(dx) + d._custom['numpified'] = True d.override_units(units, inplace=True) return d @@ -8857,9 +8877,10 @@ def harden_mask(self): [1 -- 3] """ - dx = self.to_dask_array() + dx = self.to_dask_array(numpify=False) dx = dx.map_blocks(cf_harden_mask, dtype=self.dtype) self._set_dask(dx, clear=_NONE) + self._custom['numpified'] = True self.hardmask = True def has_calendar(self): @@ -8977,9 +8998,10 @@ def soften_mask(self): [ 1 999 3] """ - dx = self.to_dask_array() + dx = self.to_dask_array(numpify=False) dx = dx.map_blocks(cf_soften_mask, dtype=self.dtype) self._set_dask(dx, clear=_NONE) + self._custom['numpified'] = True self.hardmask = False def file_locations(self): @@ -9064,9 +9086,10 @@ def filled(self, fill_value=None, inplace=False): f"data type {d.dtype.str!r}" ) - dx = d.to_dask_array() + dx = d.to_dask_array(numpify=False) dx = dx.map_blocks(cf_filled, fill_value=fill_value, dtype=d.dtype) d._set_dask(dx) + d._custom['numpified'] = True return d @@ -9693,7 +9716,7 @@ def override_calendar(self, calendar, inplace=False, i=False): d._Units = Units(d.Units._units, calendar) return d - def to_dask_array(self, apply_mask_hardness=False, numpify=False): + def to_dask_array(self, apply_mask_hardness=False, numpify=True): """Convert the data to a `dask` array. .. 
warning:: By default, the mask hardness of the returned @@ -9748,8 +9771,8 @@ def to_dask_array(self, apply_mask_hardness=False, numpify=False): self.harden_mask() else: self.soften_mask() - elif numpify: - return dx.map_blocks(hhh, dtype=dx.dtype) + elif numpify and not self._custom.get('numpified'): + return dx.map_blocks(asanyarray, dtype=dx.dtype) return self._custom["dask"] @@ -9925,9 +9948,10 @@ def masked_invalid(self, inplace=False): """ d = _inplace_enabled_define_and_cleanup(self) - dx = self.to_dask_array() + dx = self.to_dask_array(numpify=True) dx = da.ma.masked_invalid(dx) d._set_dask(dx) + d._custom['numpified'] = True return d def del_calendar(self, default=ValueError()): @@ -10022,9 +10046,10 @@ def del_file_location(self, location): updated = True if updated: - dx = self.to_dask_array() + dx = self.to_dask_array(numpify=False) dx = da.Array(dsk, dx.name, dx.chunks, dx.dtype, dx._meta) self._set_dask(dx, clear=_NONE) + self._custom['numpified'] = True return location @@ -11190,7 +11215,7 @@ def where( # Missing values could be affected, so make sure that the mask # hardness has been applied. - dx = d.to_dask_array(apply_mask_hardness=True) + dx = d.to_dask_array(apply_mask_hardness=True, numpify=False) units = d.Units @@ -11205,7 +11230,8 @@ def where( condition = type(self).asdata(condition) condition = where_broadcastable(d, condition, "condition") - + condition = condition.to_dask_array(numpify=False) + # If x or y is self then change it to None. This prevents an # unnecessary copy; and, at compute time, an unncessary numpy # where. @@ -11249,9 +11275,9 @@ def where( # Apply the where operation dx = da.core.elemwise( - cf_where, dx, da.asanyarray(condition), x, y, d.hardmask + cf_where, dx, condition, x, y, d.hardmask ) - d._set_dask(dx) + d._set_dask(dx, computable=True) # Don't know (yet) if 'x' and 'y' have a deterministic names d._update_deterministic(False) @@ -11781,7 +11807,7 @@ def todict(self, optimize_graph=True): 0), (slice(0, 1, 1),))} """ - dx = self.to_dask_array() + dx = self.to_dask_array(numpify=False) # TODO if optimize_graph: return collections_to_dsk((dx,), optimize_graph=True) @@ -11902,7 +11928,7 @@ def transpose(self, axes=None, inplace=False, i=False): ) d._set_dask(dx) - + d._custom['numpified'] = True return d @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") @@ -11940,6 +11966,7 @@ def trunc(self, inplace=False, i=False): d = _inplace_enabled_define_and_cleanup(self) dx = d.to_dask_array(numpify=True) d._set_dask(da.trunc(dx)) + d._custom['numpified'] = True return d @classmethod @@ -12261,6 +12288,7 @@ def func( dx = da.ma.masked_array(dx, mask=dx_mask) d._set_dask(dx) + d._custom['numpified'] = True if units is not None: d.override_units(units, inplace=True) @@ -12427,6 +12455,7 @@ def roll(self, axis, shift, inplace=False, i=False): dx = d.to_dask_array(numpify=True) dx = da.roll(dx, shift, axis=axis) d._set_dask(dx) + d._custom['numpified'] = True return d @@ -13091,6 +13120,7 @@ def square(self, dtype=None, inplace=False): dx = d.to_dask_array(numpify=True) dx = da.square(dx, dtype=dtype) d._set_dask(dx) + d._custom['numpified'] = True units = d.Units if units: @@ -13160,6 +13190,7 @@ def sqrt(self, dtype=None, inplace=False): dx = d.to_dask_array(numpify=True) dx = da.sqrt(dx, dtype=dtype) d._set_dask(dx) + d._custom['numpified'] = True units = d.Units if units: diff --git a/cf/functions.py b/cf/functions.py index 70eb6a81df..f4964637eb 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -1885,7 +1885,7 @@ def 
indices_shape(indices, full_shape, keepdims=True): return shape -def parse_indices(shape, indices, cyclic=False, keepdims=True): +def parse_indices(shape, indices, cyclic=False, keepdims=True, bool_as_int=False): """Parse indices for array access and assignment. :Parameters: @@ -2042,6 +2042,12 @@ def parse_indices(shape, indices, cyclic=False, keepdims=True): # Replace index with its Dask array index = to_dask_array() + elif bool_as_int: + index = np.asanyarray(index) + if index.dtype == bool: + index = np.arange(size)[index] + + parsed_indices[i] = index if not cyclic: From df2f23b554d626c53dff959e0f34962221742c85 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 15 Mar 2024 20:23:03 +0000 Subject: [PATCH 058/134] dev --- cf/data/array/mixin/filearraymixin.py | 81 +++------------ cf/data/array/mixin/indexmixin.py | 140 ++++++++++++++++++++++++++ cf/data/array/netcdfarray.py | 8 +- cf/data/array/umarray.py | 72 ++++++++++++- 4 files changed, 233 insertions(+), 68 deletions(-) create mode 100644 cf/data/array/mixin/indexmixin.py diff --git a/cf/data/array/mixin/filearraymixin.py b/cf/data/array/mixin/filearraymixin.py index 5e34bf7598..fe906fc22f 100644 --- a/cf/data/array/mixin/filearraymixin.py +++ b/cf/data/array/mixin/filearraymixin.py @@ -17,6 +17,8 @@ class FileArrayMixin: def __array__(self, *dtype): """Convert the ``{{class}}` into a `numpy` array. + TODO stored indices + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -29,23 +31,13 @@ def __array__(self, *dtype): `numpy.ndarray` An independent numpy array of the data. - **Examples** - - TODO - >>> d = {{package}}.{{class}}([1, 2, 3]) - >>> a = numpy.array(d) - >>> print(type(a)) - - >>> a[0] = -99 - >>> d - <{{repr}}{{class}}(3): [1, 2, 3]> - >>> b = numpy.array(d, float) - >>> print(b) - [1. 2. 3.] - """ - return np.asanyarray(self._getitem()) - + array = np.asanyarray(self._get_array()) + if not dtype: + return array + else: + return array.astype(dtype[0], copy=False) + def __dask_tokenize__(self): """Return a value fully representative of the object. @@ -135,7 +127,7 @@ def __getitem__(self, index) self._set_component('index', tuple(new_index), copy=False) self._set_component('shape', tuple(new_shape), copy=False) - def _getitem(self) + def _get_array(self) """Returns a subspace of the array as a numpy array. x.__getitem__(indices) <==> x[indices] @@ -156,51 +148,9 @@ def _getitem(self) .. versionadded:: NEXTVERSION """ - - index = parse_indices(self.shape, self.index, cyclic=False, keepdims=True) - - - - netcdf, address = self.open() - dataset = netcdf - - groups, address = self.get_groups(address) - if groups: - # Traverse the group structure, if there is one (CF>=1.8). - netcdf = self._group(netcdf, groups) - - if isinstance(address, str): - # Get the variable by netCDF name - variable = netcdf.variables[address] - else: - # Get the variable by netCDF integer ID - for variable in netcdf.variables.values(): - if variable._varid == address: - break - - # Get the data, applying masking and scaling as required. - array = netcdf_indexer( - variable, - mask=self.get_mask(), - unpack=self.get_unpack(), - always_mask=False, + return NotImplementedError( + f"Must implement {self.__class__.__name__}._get_array" ) - array = array[self.index] - - # Set the units, if they haven't been set already. - self._set_attributes(variable) - - # Set the units, if they haven't been set already. 
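An aside on the `bool_as_int` branch added to `parse_indices` earlier in this hunk: it only converts a Boolean mask index into the equivalent integer positions. A minimal standalone sketch with illustrative values (not the library code):

import numpy as np

size = 5
index = np.asanyarray([True, False, True, True, False])
if index.dtype == bool:
    # Positions of the True elements, as the patch's np.arange(size)[index] does
    index = np.arange(size)[index]
print(index)  # [0 2 3]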
- self._set_units(variable) - - self.close(dataset) - del netcdf, dataset - - if not self.ndim: - # Hmm netCDF4 has a thing for making scalar size 1, 1d - array = array.squeeze() - - return array @property def _dask_meta(self): @@ -238,11 +188,12 @@ def index(self): .. versionadded:: NEXTVERSION """ - i = self._get_component('index', None) - if i is None: - i = parse_indices(self.shape, (Ellipsis,), keepdims=False) + ind = self._get_component('index', None) + if ind is None: + ind = parse_indices(self.shape, (Ellipsis,), keepdims=False, bool_ti_int=True) + self._set_component('index', ind, copy=False) - return i + return ind def del_file_location(self, location): """Remove reference to files in the given location. diff --git a/cf/data/array/mixin/indexmixin.py b/cf/data/array/mixin/indexmixin.py new file mode 100644 index 0000000000..7cf2a49160 --- /dev/null +++ b/cf/data/array/mixin/indexmixin.py @@ -0,0 +1,140 @@ +from math import ceil +from os import sep +from os.path import basename, dirname, join + +import numpy as np + +from ....functions import _DEPRECATION_ERROR_ATTRIBUTE, abspath + + +class IndexMixin: + """TODO xMixin class for an array stored in a file. + + .. versionadded:: NEXTVERSION + + """ + + def __array__(self, *dtype): + """Convert the ``{{class}}` into a `numpy` array. + + TODO stored indices + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + dtype: optional + Typecode or data-type to which the array is cast. + + :Returns: + + `numpy.ndarray` + An independent numpy array of the data. + + """ + array = np.asanyarray(self._get_array()) + if not dtype: + return array + else: + return array.astype(dtype[0], copy=False) + + def __getitem__(self, index) + """TODO Returns a subspace of the array as a numpy array. + + x.__getitem__(indices) <==> x[indices] + + The indices that define the subspace must be either `Ellipsis` or + a sequence that contains an index for each dimension. In the + latter case, each dimension's index must either be a `slice` + object or a sequence of two or more integers. + + Indexing is similar to numpy indexing. The only difference to + numpy indexing (given the restrictions on the type of indices + allowed) is: + + * When two or more dimension's indices are sequences of integers + then these indices work independently along each dimension + (similar to the way vector subscripts work in Fortran). + + .. 
versionadded:: NEXTVERSION + + """ + shape0 = self.shape + index = parse_indices(shape0, index, keepdims=False, bool_as_int=True) + + index0 = self._get_component('index', None) + if index0 is None: + self._set_component('index', index, copy=False) + return + + new_index = [] + for ind0, ind, size0 in zip(index0, index, shape0): + if index == slice(None): + new_index.append(ind0) + new_shape.apepend(size0) + continue + + if isinstance(ind0, slice): + if isinstance(ind, slice): + # 'ind0' is slice, 'ind' is slice + start, stop, step = ind0.indices(size0) + size1, mod = divmod(stop - start - 1, step) + start1, stop1, step1 = ind.indices(size1 + 1) + size2, mod = divmod(stop1 - start1, step1) + + if mod != 0: + size2 += 1 + + start += start1 * step + step *= step1 + stop = start + (size2 - 1) * step + + if step > 0: + stop += 1 + else: + stop -= 1 + + if stop < 0: + stop = None + + new = slice(start, stop, step) + new_size = ceil((stop - start)/step) + else: + # 'ind0' is slice, 'ind' is numpy array of int + new = np.arange(*ind0.indices(size0))[ind] + new_size = new.size + else: + # 'ind0' is numpy array of int + new = ind0[ind] + new_size = new.size + + new_index.append(new) + new_shape.apepend(new_size) + + self._set_component('index', tuple(new_index), copy=False) + self._set_component('shape', tuple(new_shape), copy=False) + + def _get_array(self) + """Returns a subspace of the array as a numpy array. + + x.__getitem__(indices) <==> x[indices] + + The indices that define the subspace must be either `Ellipsis` or + a sequence that contains an index for each dimension. In the + latter case, each dimension's index must either be a `slice` + object or a sequence of two or more integers. + + Indexing is similar to numpy indexing. The only difference to + numpy indexing (given the restrictions on the type of indices + allowed) is: + + * When two or more dimension's indices are sequences of integers + then these indices work independently along each dimension + (similar to the way vector subscripts work in Fortran). + + .. versionadded:: NEXTVERSION + + """ + return NotImplementedError( + f"Must implement {self.__class__.__name__}._get_array" + ) diff --git a/cf/data/array/netcdfarray.py b/cf/data/array/netcdfarray.py index 6dab56af1a..3cd83552f5 100644 --- a/cf/data/array/netcdfarray.py +++ b/cf/data/array/netcdfarray.py @@ -17,8 +17,8 @@ def __dask_tokenize__(self): .. versionadded:: 3.15.0 """ - return super().__dask_tokenize__() + (self.get_mask(),) - + return super().__dask_tokenize__() + (self.get_mask(),) + @property def _lock(self): """Set the lock for use in `dask.array.from_array`. @@ -33,3 +33,7 @@ def _lock(self): """ return _lock + + def _get_array(self): + """TODO""" + return super(cfdm.NetCDFArray).__getitem__(self.index) diff --git a/cf/data/array/umarray.py b/cf/data/array/umarray.py index 816ced9a3f..796066709f 100644 --- a/cf/data/array/umarray.py +++ b/cf/data/array/umarray.py @@ -174,7 +174,77 @@ def __init__( # By default, close the UM file after data array access self._set_component("close", True, copy=False) - self._set_component("index", None, copy=False) + def _get_array(self): + """Return a subspace of the array. + + Returns a subspace of the array as an independent numpy array. 
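The slice-composition arithmetic in `IndexMixin.__getitem__` above can be sanity-checked against plain numpy: the composed slice must select exactly the elements obtained by applying the two indices in sequence. A small worked example with arbitrarily chosen values:

import numpy as np

a = np.arange(20)
eager = a[2:18:2][1:6:2]   # apply the two slices one after the other
composed = a[4:13:4]       # the single equivalent slice: start 2 + 1*2, step 2*2
assert (eager == composed).all()
print(composed)  # [ 4  8 12]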
+ + """ + f, header_offset = self.open() + rec = self._get_rec(f, header_offset) + + int_hdr = rec.int_hdr + real_hdr = rec.real_hdr + array = rec.get_data().reshape(self.shape) + + self.close(f) + del f, rec + + array = get_subspace(array, self.index) + + # Set the units, if they haven't been set already. + self._set_units(int_hdr) + + LBUSER2 = int_hdr.item(38) + if LBUSER2 == 3: + # Return the numpy array now if it is a boolean array + self._set_component("dtype", np.dtype(bool), copy=False) + return array.astype(bool) + + integer_array = LBUSER2 == 2 + + # ------------------------------------------------------------ + # Convert to a masked array + # ------------------------------------------------------------ + # Set the fill_value from BMDI + fill_value = real_hdr.item(17) + if fill_value != -1.0e30: + # -1.0e30 is the flag for no missing data + if integer_array: + # The fill_value must be of the same type as the data + # values + fill_value = int(fill_value) + + # Mask any missing values + mask = array == fill_value + if mask.any(): + array = np.ma.masked_where(mask, array, copy=False) + + # ------------------------------------------------------------ + # Unpack the array using the scale_factor and add_offset, if + # either is available + # ------------------------------------------------------------ + # Treat BMKS as a scale_factor if it is neither 0 nor 1 + scale_factor = real_hdr.item(18) + if scale_factor != 1.0 and scale_factor != 0.0: + if integer_array: + scale_factor = int(scale_factor) + + array *= scale_factor + + # Treat BDATUM as an add_offset if it is not 0 + add_offset = real_hdr.item(4) + if add_offset != 0.0: + if integer_array: + add_offset = int(add_offset) + + array += add_offset + + # Set the data type + self._set_component("dtype", array.dtype, copy=False) + + # Return the numpy array + return array def __getitem__(self, indices): """Return a subspace of the array. From d01d4276ed492a817d250267b9ce707a5ad1495c Mon Sep 17 00:00:00 2001 From: David Hassell Date: Sun, 17 Mar 2024 00:01:02 +0000 Subject: [PATCH 059/134] dev --- cf/data/array/mixin/__init__.py | 1 + cf/data/array/mixin/filearraymixin.py | 140 ------------- cf/data/array/mixin/indexmixin.py | 125 +++++++----- cf/data/array/netcdfarray.py | 53 ++++- cf/data/array/umarray.py | 161 +++++++-------- cf/data/collapse/dask_collapse.py | 37 ++-- cf/data/dask_regrid.py | 10 +- cf/data/dask_utils.py | 100 ++++++--- cf/data/data.py | 281 ++++++++++++++------------ cf/functions.py | 15 +- 10 files changed, 464 insertions(+), 459 deletions(-) diff --git a/cf/data/array/mixin/__init__.py b/cf/data/array/mixin/__init__.py index a9f7f75cb3..ad630280ef 100644 --- a/cf/data/array/mixin/__init__.py +++ b/cf/data/array/mixin/__init__.py @@ -1,3 +1,4 @@ from .arraymixin import ArrayMixin from .compressedarraymixin import CompressedArrayMixin from .filearraymixin import FileArrayMixin +from .indexmixin import IndexMixin diff --git a/cf/data/array/mixin/filearraymixin.py b/cf/data/array/mixin/filearraymixin.py index fe906fc22f..378567a23a 100644 --- a/cf/data/array/mixin/filearraymixin.py +++ b/cf/data/array/mixin/filearraymixin.py @@ -1,4 +1,3 @@ -from math import ceil from os import sep from os.path import basename, dirname, join @@ -14,30 +13,6 @@ class FileArrayMixin: """ - def __array__(self, *dtype): - """Convert the ``{{class}}` into a `numpy` array. - - TODO stored indices - - .. versionadded:: (cfdm) NEXTVERSION - - :Parameters: - - dtype: optional - Typecode or data-type to which the array is cast. 
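For context, the BMDI/BMKS/BDATUM handling in `UMArray._get_array` above boils down to masking the missing-data indicator and then applying a scale and offset. A rough numpy equivalent with made-up header values (the real code also skips the mask when BMDI is -1.0e30 and skips a scale factor of 0 or 1):

import numpy as np

raw = np.array([10.0, 20.0, -999.0])
fill_value = -999.0   # BMDI: missing-data indicator
scale_factor = 0.5    # BMKS, treated as a scale factor
add_offset = 273.15   # BDATUM, treated as an add offset

data = np.ma.masked_where(raw == fill_value, raw, copy=False)
data = data * scale_factor + add_offset
print(data)  # [278.15 283.15 --]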
- - :Returns: - - `numpy.ndarray` - An independent numpy array of the data. - - """ - array = np.asanyarray(self._get_array()) - if not dtype: - return array - else: - return array.astype(dtype[0], copy=False) - def __dask_tokenize__(self): """Return a value fully representative of the object. @@ -51,107 +26,6 @@ def __dask_tokenize__(self): self.get_addresses(), ) - def __getitem__(self, index) - """TODO Returns a subspace of the array as a numpy array. - - x.__getitem__(indices) <==> x[indices] - - The indices that define the subspace must be either `Ellipsis` or - a sequence that contains an index for each dimension. In the - latter case, each dimension's index must either be a `slice` - object or a sequence of two or more integers. - - Indexing is similar to numpy indexing. The only difference to - numpy indexing (given the restrictions on the type of indices - allowed) is: - - * When two or more dimension's indices are sequences of integers - then these indices work independently along each dimension - (similar to the way vector subscripts work in Fortran). - - .. versionadded:: NEXTVERSION - - """ - shape0 = self.shape - index = parse_indices(shape0, index, keepdims=False, bool_as_int=True) - - index0 = self._get_component('index', None) - if index0 is None: - self._set_component('index', index, copy=False) - return - - new_index = [] - for ind0, ind, size0 in zip(index0, index, shape0): - if index == slice(None): - new_index.append(ind0) - new_shape.apepend(size0) - continue - - if isinstance(ind0, slice): - if isinstance(ind, slice): - # 'ind0' is slice, 'ind' is slice - start, stop, step = ind0.indices(size0) - size1, mod = divmod(stop - start - 1, step) - start1, stop1, step1 = ind.indices(size1 + 1) - size2, mod = divmod(stop1 - start1, step1) - - if mod != 0: - size2 += 1 - - start += start1 * step - step *= step1 - stop = start + (size2 - 1) * step - - if step > 0: - stop += 1 - else: - stop -= 1 - - if stop < 0: - stop = None - - new = slice(start, stop, step) - new_size = ceil((stop - start)/step) - else: - # 'ind0' is slice, 'ind' is numpy array of int - new = np.arange(*ind0.indices(size0))[ind] - new_size = new.size - else: - # 'ind0' is numpy array of int - new = ind0[ind] - new_size = new.size - - new_index.append(new) - new_shape.apepend(new_size) - - self._set_component('index', tuple(new_index), copy=False) - self._set_component('shape', tuple(new_shape), copy=False) - - def _get_array(self) - """Returns a subspace of the array as a numpy array. - - x.__getitem__(indices) <==> x[indices] - - The indices that define the subspace must be either `Ellipsis` or - a sequence that contains an index for each dimension. In the - latter case, each dimension's index must either be a `slice` - object or a sequence of two or more integers. - - Indexing is similar to numpy indexing. The only difference to - numpy indexing (given the restrictions on the type of indices - allowed) is: - - * When two or more dimension's indices are sequences of integers - then these indices work independently along each dimension - (similar to the way vector subscripts work in Fortran). - - .. versionadded:: NEXTVERSION - - """ - return NotImplementedError( - f"Must implement {self.__class__.__name__}._get_array" - ) - @property def _dask_meta(self): """The metadata for the containing dask array. @@ -181,20 +55,6 @@ def filename(self): removed_at="5.0.0", ) # pragma: no cover - @property - def index(self): - """TODO - - .. 
versionadded:: NEXTVERSION - - """ - ind = self._get_component('index', None) - if ind is None: - ind = parse_indices(self.shape, (Ellipsis,), keepdims=False, bool_ti_int=True) - self._set_component('index', ind, copy=False) - - return ind - def del_file_location(self, location): """Remove reference to files in the given location. diff --git a/cf/data/array/mixin/indexmixin.py b/cf/data/array/mixin/indexmixin.py index 7cf2a49160..c94bd7559d 100644 --- a/cf/data/array/mixin/indexmixin.py +++ b/cf/data/array/mixin/indexmixin.py @@ -1,10 +1,9 @@ from math import ceil -from os import sep -from os.path import basename, dirname, join import numpy as np +from dask.base import is_dask_collection -from ....functions import _DEPRECATION_ERROR_ATTRIBUTE, abspath +from ....functions import parse_indices class IndexMixin: @@ -32,13 +31,13 @@ def __array__(self, *dtype): An independent numpy array of the data. """ - array = np.asanyarray(self._get_array()) - if not dtype: - return array - else: + array = self._get_array() + if dtype: return array.astype(dtype[0], copy=False) - def __getitem__(self, index) + return array + + def __getitem__(self, index): """TODO Returns a subspace of the array as a numpy array. x.__getitem__(indices) <==> x[indices] @@ -58,66 +57,88 @@ def __getitem__(self, index) .. versionadded:: NEXTVERSION + :Returns: + + `{{class}}` + TODO + """ + new = self.copy() + shape0 = self.shape - index = parse_indices(shape0, index, keepdims=False, bool_as_int=True) - - index0 = self._get_component('index', None) - if index0 is None: - self._set_component('index', index, copy=False) - return - - new_index = [] - for ind0, ind, size0 in zip(index0, index, shape0): - if index == slice(None): - new_index.append(ind0) - new_shape.apepend(size0) + index0 = self.index + index = parse_indices(shape0, index, keepdims=False) + + new_indices = [] + new_shape = [] + for ind0, ind, size in zip(index0, index, shape0): + if ind == slice(None): + new_indices.append(ind0) + new_shape.append(size) continue - + + if is_dask_collection(ind): + # I think that this will never occur when __getitem__ + # is being called from within a Dask graph. 
Otherwise + # we'll need to run the `compute` inside a `with + # dask.config.set({"scheduler": "synchronous"}):` + ind = ind.compute() + if isinstance(ind0, slice): if isinstance(ind, slice): # 'ind0' is slice, 'ind' is slice - start, stop, step = ind0.indices(size0) - size1, mod = divmod(stop - start - 1, step) - start1, stop1, step1 = ind.indices(size1 + 1) - size2, mod = divmod(stop1 - start1, step1) + start, stop, step = ind0.indices(size) + size0, _ = divmod(stop - start - 1, step) + start1, stop1, step1 = ind.indices(size0 + 1) + size1, mod1 = divmod(stop1 - start1, step1) - if mod != 0: - size2 += 1 + if mod1 != 0: + size1 += 1 start += start1 * step step *= step1 - stop = start + (size2 - 1) * step + stop = start + (size1 - 1) * step if step > 0: stop += 1 else: stop -= 1 - + if stop < 0: stop = None - - new = slice(start, stop, step) - new_size = ceil((stop - start)/step) + + new_index = slice(start, stop, step) + new_size = ceil((stop - start) / step) else: - # 'ind0' is slice, 'ind' is numpy array of int - new = np.arange(*ind0.indices(size0))[ind] + # 'ind0' is slice, 'ind' is (array of) int/bool + new_index = np.arange(*ind0.indices(size0))[ind] new_size = new.size else: - # 'ind0' is numpy array of int - new = ind0[ind] + # 'ind0' is (array of) int + new_index = np.asanyarray(ind0)[ind] new_size = new.size - - new_index.append(new) - new_shape.apepend(new_size) - self._set_component('index', tuple(new_index), copy=False) - self._set_component('shape', tuple(new_shape), copy=False) - - def _get_array(self) - """Returns a subspace of the array as a numpy array. + new_indices.append(new_index) + new_shape.append(new_size) - x.__getitem__(indices) <==> x[indices] + new._set_component("index", tuple(new_indices), copy=False) + new._set_component("shape", tuple(new_shape), copy=False) + + print (index0, index, new_indices) + + return new + + @property + def _dask_asanyarray(self): + """TODO + + .. versionadded:: NEXTVERSION + + """ + return True + + def _get_array(self): + """TODO Returns a subspace of the array as a numpy array. The indices that define the subspace must be either `Ellipsis` or a sequence that contains an index for each dimension. In the @@ -138,3 +159,17 @@ def _get_array(self) return NotImplementedError( f"Must implement {self.__class__.__name__}._get_array" ) + + @property + def index(self): + """TODO + + .. versionadded:: NEXTVERSION + + """ + ind = self._get_component("index", None) + if ind is None: + ind = (slice(None),) * self.ndim + self._set_component("index", ind, copy=False) + + return ind diff --git a/cf/data/array/netcdfarray.py b/cf/data/array/netcdfarray.py index 3cd83552f5..578f7be66e 100644 --- a/cf/data/array/netcdfarray.py +++ b/cf/data/array/netcdfarray.py @@ -2,13 +2,13 @@ from dask.utils import SerializableLock from ...mixin_container import Container -from .mixin import ArrayMixin, FileArrayMixin +from .mixin import ArrayMixin, FileArrayMixin, IndexMixin # Global lock for netCDF file access _lock = SerializableLock() -class NetCDFArray(FileArrayMixin, ArrayMixin, Container, cfdm.NetCDFArray): +class NetCDFArray(IndexMixin, FileArrayMixin, ArrayMixin, Container, cfdm.NetCDFArray): """An array stored in a netCDF file.""" def __dask_tokenize__(self): @@ -17,8 +17,8 @@ def __dask_tokenize__(self): .. versionadded:: 3.15.0 """ - return super().__dask_tokenize__() + (self.get_mask(),) - + return super().__dask_tokenize__() + (self.get_mask(),) + @property def _lock(self): """Set the lock for use in `dask.array.from_array`. 
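The overall shape of the `IndexMixin` pattern introduced in this patch, reduced to a toy class (an illustrative sketch, not the library code): `__getitem__` only records the requested subspace, and the expensive read happens once, when `__array__`/`_get_array` is finally called.

import numpy as np

class LazySubspace:
    """Toy stand-in for an IndexMixin subclass."""

    def __init__(self, shape, index=None):
        self.shape = shape
        self.index = index if index is not None else (slice(None),) * len(shape)

    def __getitem__(self, index):
        # No I/O here: just remember the subspace (the real mixin composes
        # it with any index already stored)
        return LazySubspace(self.shape, index)

    def __array__(self, *dtype):
        # Stand-in for the file read, with the stored index applied at read time
        data = np.arange(np.prod(self.shape)).reshape(self.shape)[self.index]
        return data.astype(dtype[0], copy=False) if dtype else data

a = LazySubspace((4, 6))[::2, 1:4]
print(np.asanyarray(a).shape)  # (2, 3)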
@@ -36,4 +36,47 @@ def _lock(self): def _get_array(self): """TODO""" - return super(cfdm.NetCDFArray).__getitem__(self.index) + print ('cf.NetCDFArray._get_array', self.index) +# return super(cfdm.NetCDFArray, self).__getitem__(self.index) +# return super(cfdm.NetCDFArray, self).__getitem__(self.index) + + netcdf, address = self.open() + dataset = netcdf + + groups, address = self.get_groups(address) + if groups: + # Traverse the group structure, if there is one (CF>=1.8). + netcdf = self._group(netcdf, groups) + + if isinstance(address, str): + # Get the variable by netCDF name + variable = netcdf.variables[address] + else: + # Get the variable by netCDF integer ID + for variable in netcdf.variables.values(): + if variable._varid == address: + break + + # Get the data, applying masking and scaling as required. +# array = cfdm.netcdf_indexer( +# variable, +# mask=self.get_mask(), +# unpack=self.get_unpack(), +# always_mask=False, +# ) + array = variable[self.index] + + # Set the units, if they haven't been set already. +# self._set_attributes(variable) + + # Set the units, if they haven't been set already. + self._set_units(variable) + + self.close(dataset) + del netcdf, dataset + + if not self.ndim: + # Hmm netCDF4 has a thing for making scalar size 1, 1d + array = array.squeeze() + + return array diff --git a/cf/data/array/umarray.py b/cf/data/array/umarray.py index 796066709f..bf0ca67ddc 100644 --- a/cf/data/array/umarray.py +++ b/cf/data/array/umarray.py @@ -2,18 +2,19 @@ import numpy as np from ...constants import _stash2standard_name -from ...functions import ( +from ...functions import ( # parse_indices, _DEPRECATION_ERROR_ATTRIBUTE, get_subspace, load_stash2standard_name, - parse_indices, ) from ...umread_lib.umfile import File from .abstract import Array -from .mixin import FileArrayMixin +from .mixin import FileArrayMixin, IndexMixin -class UMArray(FileArrayMixin, cfdm.data.mixin.FileArrayMixin, Array): +class UMArray( + IndexMixin, FileArrayMixin, cfdm.data.mixin.FileArrayMixin, Array +): """A sub-array stored in a PP or UM fields file.""" def __init__( @@ -179,7 +180,7 @@ def _get_array(self): Returns a subspace of the array as an independent numpy array. - """ + """ f, header_offset = self.open() rec = self._get_rec(f, header_offset) @@ -246,81 +247,81 @@ def _get_array(self): # Return the numpy array return array - def __getitem__(self, indices): - """Return a subspace of the array. - - x.__getitem__(indices) <==> x[indices] - - Returns a subspace of the array as an independent numpy array. - - """ - f, header_offset = self.open() - rec = self._get_rec(f, header_offset) - - int_hdr = rec.int_hdr - real_hdr = rec.real_hdr - array = rec.get_data().reshape(self.shape) - - self.close(f) - del f, rec - - if indices is not Ellipsis: - indices = parse_indices(array.shape, indices) - array = get_subspace(array, indices) - - # Set the units, if they haven't been set already. 
- self._set_units(int_hdr) - - LBUSER2 = int_hdr.item(38) - if LBUSER2 == 3: - # Return the numpy array now if it is a boolean array - self._set_component("dtype", np.dtype(bool), copy=False) - return array.astype(bool) - - integer_array = LBUSER2 == 2 - - # ------------------------------------------------------------ - # Convert to a masked array - # ------------------------------------------------------------ - # Set the fill_value from BMDI - fill_value = real_hdr.item(17) - if fill_value != -1.0e30: - # -1.0e30 is the flag for no missing data - if integer_array: - # The fill_value must be of the same type as the data - # values - fill_value = int(fill_value) - - # Mask any missing values - mask = array == fill_value - if mask.any(): - array = np.ma.masked_where(mask, array, copy=False) - - # ------------------------------------------------------------ - # Unpack the array using the scale_factor and add_offset, if - # either is available - # ------------------------------------------------------------ - # Treat BMKS as a scale_factor if it is neither 0 nor 1 - scale_factor = real_hdr.item(18) - if scale_factor != 1.0 and scale_factor != 0.0: - if integer_array: - scale_factor = int(scale_factor) - - array *= scale_factor - - # Treat BDATUM as an add_offset if it is not 0 - add_offset = real_hdr.item(4) - if add_offset != 0.0: - if integer_array: - add_offset = int(add_offset) - - array += add_offset - - # Set the data type - self._set_component("dtype", array.dtype, copy=False) - - # Return the numpy array - return array + # def __getitem__(self, indices): + # """Return a subspace of the array. + # + # x.__getitem__(indices) <==> x[indices] + # + # Returns a subspace of the array as an independent numpy array. + # + # """ + # f, header_offset = self.open() + # rec = self._get_rec(f, header_offset) + # + # int_hdr = rec.int_hdr + # real_hdr = rec.real_hdr + # array = rec.get_data().reshape(self.shape) + # + # self.close(f) + # del f, rec + # + # if indices is not Ellipsis: + # indices = parse_indices(array.shape, indices) + # array = get_subspace(array, indices) + # + # # Set the units, if they haven't been set already. 
+ # self._set_units(int_hdr) + # + # LBUSER2 = int_hdr.item(38) + # if LBUSER2 == 3: + # # Return the numpy array now if it is a boolean array + # self._set_component("dtype", np.dtype(bool), copy=False) + # return array.astype(bool) + # + # integer_array = LBUSER2 == 2 + # + # # ------------------------------------------------------------ + # # Convert to a masked array + # # ------------------------------------------------------------ + # # Set the fill_value from BMDI + # fill_value = real_hdr.item(17) + # if fill_value != -1.0e30: + # # -1.0e30 is the flag for no missing data + # if integer_array: + # # The fill_value must be of the same type as the data + # # values + # fill_value = int(fill_value) + # + # # Mask any missing values + # mask = array == fill_value + # if mask.any(): + # array = np.ma.masked_where(mask, array, copy=False) + # + # # ------------------------------------------------------------ + # # Unpack the array using the scale_factor and add_offset, if + # # either is available + # # ------------------------------------------------------------ + # # Treat BMKS as a scale_factor if it is neither 0 nor 1 + # scale_factor = real_hdr.item(18) + # if scale_factor != 1.0 and scale_factor != 0.0: + # if integer_array: + # scale_factor = int(scale_factor) + # + # array *= scale_factor + # + # # Treat BDATUM as an add_offset if it is not 0 + # add_offset = real_hdr.item(4) + # if add_offset != 0.0: + # if integer_array: + # add_offset = int(add_offset) + # + # array += add_offset + # + # # Set the data type + # self._set_component("dtype", array.dtype, copy=False) + # + # # Return the numpy array + # return array def _get_rec(self, f, header_offset): """Get a container for a record. diff --git a/cf/data/collapse/dask_collapse.py b/cf/data/collapse/dask_collapse.py index 01b3db7c21..33956b9246 100644 --- a/cf/data/collapse/dask_collapse.py +++ b/cf/data/collapse/dask_collapse.py @@ -14,6 +14,7 @@ from dask.core import flatten from dask.utils import deepmap +from ..dask_utils import cf_asanyarray from .collapse_utils import double_precision_dtype @@ -126,7 +127,7 @@ def sum_weights_chunk( return N - weights = asanyarray(weights) + weights = cf_asanyarray(weights) if check_weights: w_min = weights.min() if w_min <= 0: @@ -265,14 +266,14 @@ def cf_mean_chunk( * weighted: True if weights have been set. """ - x = asanyarray(x) - + x = cf_asanyarray(x) + if computing_meta: return x if weights is not None: - weights = asanyarray(weights) - + weights = cf_asanyarray(weights) + # N, sum d = cf_sum_chunk(x, weights, dtype=dtype, **kwargs) @@ -390,8 +391,8 @@ def cf_max_chunk(x, dtype=None, computing_meta=False, **kwargs): * max: The maximum of `x``. """ - x = asanyarray(x) - + x = cf_asanyarray(x) + if computing_meta: return x @@ -543,7 +544,7 @@ def cf_min_chunk(x, dtype=None, computing_meta=False, **kwargs): * min: The minimum of ``x``. """ - x = asanyarray(x) + x = cf_asanyarray(x) if computing_meta: return x @@ -649,7 +650,7 @@ def cf_range_chunk(x, dtype=None, computing_meta=False, **kwargs): * max: The maximum of ``x`. """ - x = asanyarray(x) + x = cf_asanyarray(x) if computing_meta: return x @@ -760,7 +761,7 @@ def cf_rms_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): * sum: The weighted sum of ``x**2``. """ - x = asanyarray(x) + x = cf_asanyarray(x) if computing_meta: return x @@ -837,7 +838,7 @@ def cf_sample_size_chunk(x, dtype="i8", computing_meta=False, **kwargs): * N: The sample size. 
""" - x = asanyarray(x) + x = cf_asanyarray(x) if computing_meta: return x @@ -964,13 +965,13 @@ def cf_sum_chunk( * sum: The weighted sum of ``x`` """ - x = asanyarray(x) + x = cf_asanyarray(x) if computing_meta: return x if weights is not None: - weights = asanyarray(weights) + weights = cf_asanyarray(weights) if check_weights: w_min = weights.min() if w_min <= 0: @@ -1090,7 +1091,7 @@ def cf_sum_of_weights_chunk( ``weights**2`` if *square* is True. """ - x = asanyarray(x) + x = cf_asanyarray(x) if computing_meta: return x @@ -1128,7 +1129,7 @@ def cf_unique_chunk(x, dtype=None, computing_meta=False, **kwargs): * unique: The unique values. """ - x = asanyarray(x) + x = cf_asanyarray(x) if computing_meta: return x @@ -1214,15 +1215,15 @@ def cf_var_chunk( * ddof: The delta degrees of freedom. """ - x = asanyarray(x) + x = cf_asanyarray(x) if computing_meta: return x weighted = weights is not None if weighted: - weights = asanyarray(weights) - + weights = cf_asanyarray(weights) + # N, V1, sum d = cf_mean_chunk(x, weights, dtype=dtype, **kwargs) diff --git a/cf/data/dask_regrid.py b/cf/data/dask_regrid.py index ea2fe04d76..659418396d 100644 --- a/cf/data/dask_regrid.py +++ b/cf/data/dask_regrid.py @@ -1,7 +1,7 @@ """Regridding functions used within a dask graph.""" import numpy as np -from .dask_utils import hhh +from .dask_utils import cf_asanyarray def regrid( @@ -175,12 +175,12 @@ def regrid( """ weights, dst_mask = weights_dst_mask - a = hhh(a) + a = cf_asanyarray(a) if dst_mask is not None: - dst_mask = hhh(dst_mask) - + dst_mask = cf_asanyarray(dst_mask) + if ref_src_mask is not None: - ref_src_mask = hhh(ref_src_mask) + ref_src_mask = cf_asanyarray(ref_src_mask) # ---------------------------------------------------------------- # Reshape the array into a form suitable for the regridding dot diff --git a/cf/data/dask_utils.py b/cf/data/dask_utils.py index 39830399ad..e018877473 100644 --- a/cf/data/dask_utils.py +++ b/cf/data/dask_utils.py @@ -10,7 +10,6 @@ import numpy as np from dask.core import flatten from scipy.ndimage import convolve1d -from scipy.sparse import issparse from ..cfdatetime import dt, dt2rt, rt2dt from ..functions import atol as cf_atol @@ -127,8 +126,8 @@ def cf_contains(a, value): value. """ - a = asanyarray(a) - value = asanyarray(value) + a = cf_asanyarray(a) + value = cf_asanyarray(value) return np.array(value in a).reshape((1,) * a.ndim) @@ -162,13 +161,12 @@ def cf_convolve1d(a, window=None, axis=-1, origin=0): Convolved float array with same shape as input. """ - a = asanyarray(a) + a = cf_asanyarray(a) # Cast to float to ensure that NaNs can be stored if a.dtype != float: a = a.astype(float, copy=False) - masked = np.ma.is_masked(a) if masked: # convolve1d does not deal with masked arrays, so uses NaNs @@ -206,7 +204,7 @@ def cf_harden_mask(a): The array with hardened mask. """ - a = asanyarray(a) + a = cf_asanyarray(a) if np.ma.isMA(a): try: a.harden_mask() @@ -277,8 +275,8 @@ def cf_percentile(a, q, axis, method, keepdims=False, mtol=1): """ from math import prod - a = asanyarray(a) - + a = cf_asanyarray(a) + if np.ma.isMA(a) and not np.ma.is_masked(a): # Masked array with no masked elements a = a.data @@ -372,7 +370,7 @@ def cf_soften_mask(a): The array with softened mask. """ - a = asanyarray(a) + a = cf_asanyarray(a) if np.ma.isMA(a): try: @@ -429,15 +427,14 @@ def cf_where(array, condition, x, y, hardmask): elsewhere. 
""" - a = asanyarray(a) - condition = asanyarray(condition) + array = cf_asanyarray(array) + condition = cf_asanyarray(condition) if x is not None: - x = asanyarray(x) - + x = cf_asanyarray(x) + if y is not None: - y = asanyarray(y) - - + y = cf_asanyarray(y) + mask = None if np.ma.isMA(array): @@ -533,7 +530,7 @@ def cf_YMDhms(a, attr): array([1, 2]) """ - a = asanyarray(a) + a = cf_asanyarray(a) return _array_getattr(a, attr=attr) @@ -566,7 +563,7 @@ def cf_rt2dt(a, units): cftime.DatetimeGregorian(2000, 1, 2, 0, 0, 0, 0, has_year_zero=False)] """ - a = asanyarray(a) + a = cf_asanyarray(a) if not units.iscalendartime: return rt2dt(a, units_in=units) @@ -621,7 +618,7 @@ def cf_dt2rt(a, units): [365 366] """ - a = asanyarray(a) + a = cf_asanyarray(a) return dt2rt(a, units_out=units, units_in=None) @@ -662,18 +659,69 @@ def cf_units(a, from_units, to_units): [1000. 2000.] """ - a = asanyarray(a) + a = cf_asanyarray(a) return Units.conform( a, from_units=from_units, to_units=to_units, inplace=False ) def cf_filled(a, fill_value=None): - a = asanyarray(a) - return np.ma.filled(a, fill_value= fill_value) + """TODOConvert array values to have different equivalent units. + + .. versionadded:: NEXTVERSION + + :Parameters: + + a: array_like + The array. + + fill_value: + TODO + + :Returns: + + `numpy.ndarray` + TODO An array containing values in the new units. In order to + represent the new units, the returned data type may be + different from that of the input array. For instance, if + *a* has an integer data type, *from_units* are kilometres, + and *to_units* are ``'miles'`` then the returned array + will have a float data type. + + **Examples** + + TODO + >>> import numpy as np + >>> a = np.array([1, 2]) + >>> print(cf.data.dask_utils.cf_units(a, cf.Units('km'), cf.Units('m'))) + [1000. 2000.] + + """ + a = cf_asanyarray(a) + return np.ma.filled(a, fill_value=fill_value) + + +def cf_asanyarray(a): + """TODO + + .. versionadded:: NEXTVERSION + + :Parameters: -def asanyarray(self, a): - if issparse(a): - return a + a: array_like + The array. + + :Returns: + + TODO - return np.asanyarray(a) + **Examples** + + TODO + + """ + if getattr(a, "_dask_asanyarray", False): + print ('cf_asanyarray', repr(a)) + return np.asanyarray(a) + + return a diff --git a/cf/data/data.py b/cf/data/data.py index 0f1ca3e6b1..bf1db9a164 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -46,6 +46,7 @@ from .creation import generate_axis_identifiers, to_dask from .dask_utils import ( _da_ma_allclose, + cf_asanyarray, cf_contains, cf_dt2rt, cf_filled, @@ -372,13 +373,22 @@ def __init__( source=source, _use_array=_use_array and array is not None ) + self._custom.setdefault("asanyarray", True) + if _use_array: try: - array = source.to_dask_array(numpify=False) + array = source.to_dask_array(asanyarray=False) except (AttributeError, TypeError): - pass + try: + array = source.to_dask_array() + except (AttributeError, TypeError): + pass + else: + self._set_dask(array, copy=copy, clear=_NONE) else: - self._set_dask(array, copy=copy, clear=_NONE) + self._set_dask( + array, copy=copy, clear=_NONE, asanyarray=None + ) else: self._del_dask(None) @@ -466,6 +476,10 @@ def __init__( # deterministic name self._custom["deterministic"] = not is_dask_collection(array) + # Set whether or not to call np.asanyarray on chunks to + # convert them to numpy arrays. 
+ self._custom["asanyarray"] = getattr(array, "_dask_asanyarray", False) + dx = to_dask(array, chunks, **kwargs) # Find out if we have an array of date-time objects @@ -529,7 +543,7 @@ def dask_compressed_array(self): if ca is None or not ca.get_compression_type(): raise ValueError("not compressed: can't get compressed dask array") - return ca.to_dask_array(numpify=False) # TODO + return ca.to_dask_array() def __contains__(self, value): """Membership test operator ``in`` @@ -622,9 +636,9 @@ def __contains__(self, value): # are incompatible return False - value = value.to_dask_array(numpify=False) + value = value.to_dask_array() - dx = self.to_dask_array(numpify=False) + dx = self.to_dask_array() out_ind = tuple(range(dx.ndim)) dx_ind = out_ind @@ -668,7 +682,7 @@ def __float__(self): 1. """ - return float(self.to_dask_array(numpify=True)) + return float(self.to_dask_array()) def __int__(self): """Called to implement the built-in function `int` @@ -681,7 +695,7 @@ def __int__(self): the dask array size is already known to be greater than 1. """ - return int(self.to_dask_array(numpify=True)) + return int(self.to_dask_array()) def __iter__(self): """Called when an iterator is required. @@ -757,7 +771,7 @@ def __len__(self): TypeError: len() of unsized object """ - dx = self.to_dask_array(numpify=False) + dx = self.to_dask_array(asanyarray=False) # TODO check if math.isnan(dx.size): logger.debug("Computing data len: Performance may be degraded") dx.compute_chunk_sizes() @@ -788,7 +802,7 @@ def __bool__(self): "elements is ambiguous. Use d.any() or d.all()" ) - return bool(self.to_dask_array(numpify=True)) + return bool(self.to_dask_array()) def __getitem__(self, indices): """Return a subspace of the data defined by indices. @@ -875,10 +889,10 @@ def __getitem__(self, indices): new = self.roll( axis=tuple(roll.keys()), shift=tuple(roll.values()) ) - dx = new.to_dask_array(numpify=False) + dx = new.to_dask_array(asanyarray=False) else: new = self.copy(array=False) - dx = self.to_dask_array(numpify=False) + dx = self.to_dask_array(asanyarray=False) # ------------------------------------------------------------ # Subspace the dask array @@ -925,7 +939,7 @@ def __getitem__(self, indices): # ------------------------------------------------------------ # Set the subspaced dask array # ------------------------------------------------------------ - new._set_dask(dx) + new._set_dask(dx, asanyarray=True) # ------------------------------------------------------------ # Get the axis identifiers for the subspace @@ -1118,7 +1132,7 @@ def __setitem__(self, indices, value): # Do the assignment self._set_subspace(dx, indices, value) - self._custom['numpified'] = True + self._set_dask(dx) # Unroll any axes that were rolled to enable a cyclic # assignment @@ -1138,10 +1152,6 @@ def __setitem__(self, indices, value): self[indices] = reset - # Remove elements made invalid by updating the `dask` array - # in-place - self._clear_after_dask_update(_ALL) - return # ---------------------------------------------------------------- @@ -1363,7 +1373,7 @@ def _clear_after_dask_update(self, clear=_ALL): # Set the CFA write status to False self._cfa_del_write() - def _set_dask(self, dx, copy=False, clear=_ALL, computable=None): + def _set_dask(self, dx, copy=False, clear=_ALL, asanyarray=False): """Set the dask array. .. versionadded:: 3.14.0 @@ -1386,6 +1396,11 @@ def _set_dask(self, dx, copy=False, clear=_ALL, computable=None): results in all components being removed. See `_clear_after_dask_update` for details. 
+ asanyarray: `bool` or `None`, optional + TODO + + .. versionadded:: NEXTRELEASE + :Returns: `None` @@ -1415,9 +1430,9 @@ def _set_dask(self, dx, copy=False, clear=_ALL, computable=None): custom = self._custom custom["dask"] = dx - if computable: - custom["computable"] = True - + if asanyarray is not None: + custom["asanyarray"] = bool(asanyarray) + self._clear_after_dask_update(clear) def _del_dask(self, default=ValueError(), clear=_ALL): @@ -1702,10 +1717,9 @@ def diff(self, axis=-1, n=1, inplace=False): """ d = _inplace_enabled_define_and_cleanup(self) - dx = self.to_dask_array(numpify=True) + dx = self.to_dask_array() dx = da.diff(dx, axis=axis, n=n) d._set_dask(dx) - d._custom['numpified'] = True # Convert to "difference" units # @@ -1999,10 +2013,9 @@ def digitize( delete_bins.append(bins.size) # Digitise the array - dx = d.to_dask_array(numpify=True) + dx = d.to_dask_array() dx = da.digitize(dx, bins, right=upper) d._set_dask(dx) - d._custom['numpified'] = True d.override_units(_units_None, inplace=True) # More elegant to handle 'delete_bins' in cf- rather than Dask- space @@ -2273,7 +2286,7 @@ def pad_missing(self, axis, pad_width=None, to_size=None, inplace=False): d = _inplace_enabled_define_and_cleanup(self) - dx = d.to_dask_array(numpify=True) + dx = d.to_dask_array() mask0 = da.ma.getmaskarray(dx) pad = [(0, 0)] * dx.ndim @@ -2289,8 +2302,6 @@ def pad_missing(self, axis, pad_width=None, to_size=None, inplace=False): dx = da.ma.masked_where(mask, dx) d._set_dask(dx) - d._custom['numpified'] = True - return d @_inplace_enabled(default=False) @@ -2480,7 +2491,7 @@ def percentile( else: axes = tuple(sorted(d._parse_axes(axes))) - dx = d.to_dask_array(numpify=False) + dx = d.to_dask_array() dtype = dx.dtype shape = dx.shape @@ -2555,8 +2566,7 @@ def percentile( d._axes = (new_axis_identifier(axes),) + axes d._update_deterministic(not is_dask_collection(q)) - d._custom['numpified'] = True - + return d @_inplace_enabled(default=False) @@ -2597,12 +2607,9 @@ def persist(self, inplace=False): """ d = _inplace_enabled_define_and_cleanup(self) - - dx = self.to_dask_array(numpify=True) + dx = self.to_dask_array() dx = dx.persist() d._set_dask(dx, clear=_ALL ^ _ARRAY ^ _CACHE) - d._custom['numpified'] = True - return d @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") @@ -2639,9 +2646,9 @@ def ceil(self, inplace=False, i=False): """ d = _inplace_enabled_define_and_cleanup(self) - dx = d.to_dask_array(numpify=True) - d._set_dask(da.ceil(dx)) - d._custom['numpified'] = True + dx = d.to_dask_array() + dx = da.ceil(dx) + d._set_dask(dx) return d def cfa_get_term(self): @@ -2766,7 +2773,8 @@ def compute(self): # noqa: F811 .. versionadded:: 3.14.0 - .. seealso:: `persist`, `array`, `datetime_array` + .. 
seealso:: `persist`, `array`, `datetime_array`, + `sparse_array` :Returns: @@ -2791,7 +2799,7 @@ def compute(self): # noqa: F811 [0., 0., 0.]]) """ - dx = self.to_dask_array(numpify=True) + dx = self.to_dask_array() a = dx.compute() if np.ma.isMA(a): @@ -2978,12 +2986,7 @@ def convolution_filter( depth += abs(origin) - dx = d.to_dask_array(numpify=False) - -# # Cast to float to ensure that NaNs can be stored (as required -# # by cf_convolve1d) -# if dx.dtype != float: -# dx = dx.astype(float, copy=False)# + dx = d.to_dask_array() # Convolve each chunk convolve1d = partial( @@ -2999,7 +3002,6 @@ def convolution_filter( ) d._set_dask(dx) - d._custom['numpified'] = True return d @@ -3091,10 +3093,9 @@ def cumsum( d = _inplace_enabled_define_and_cleanup(self) - dx = d.to_dask_array(numpify=True) + dx = d.to_dask_array() dx = dx.cumsum(axis=axis, method=method) d._set_dask(dx) - d._custom['numpified'] = True return d @@ -3169,7 +3170,7 @@ def rechunk( """ d = _inplace_enabled_define_and_cleanup(self) - dx = d.to_dask_array(numpify=False) + dx = d.to_dask_array(asanyarray=False) # TODO: check that this is OK! dx = dx.rechunk(chunks, threshold, block_size_limit, balance) d._set_dask(dx, clear=_ALL ^ _ARRAY ^ _CACHE) @@ -3222,10 +3223,9 @@ def _asdatetime(self, inplace=False): ) if not d._isdatetime(): - dx = d.to_dask_array(numpify=False) + dx = d.to_dask_array(asanyarray=False) dx = dx.map_blocks(cf_rt2dt, units=units, dtype=object) d._set_dask(dx) - d._custom['numpified'] = True return d @@ -3278,10 +3278,9 @@ def _asreftime(self, inplace=False): ) if d._isdatetime(): - dx = d.to_dask_array(numpify=False) + dx = d.to_dask_array(asanyarray=False) dx = dx.map_blocks(cf_dt2rt, units=units, dtype=float) d._set_dask(dx) - d._custom['numpified'] = True return d @@ -3748,8 +3747,8 @@ def _binary_operation(self, other, method): data0, other, new_Units = data0._combined_units(other, method, True) # Cast as dask arrays - dx0 = data0.to_dask_array(numpify=True) - dx1 = other.to_dask_array(numpify=True) + dx0 = data0.to_dask_array() + dx1 = other.to_dask_array() # Set if applicable the tolerance levels for the result if method in ("__eq__", "__ne__"): @@ -3808,7 +3807,6 @@ def _binary_operation(self, other, method): self._axes = axes self._update_deterministic(other) - self._custom['numpified'] = True return self else: # not, so concerns a new Data object copied from self, data0 @@ -3818,7 +3816,6 @@ def _binary_operation(self, other, method): data0._axes = axes data0._update_deterministic(other) - data0._custom['numpified'] = True return data0 def _parse_indices(self, *args, **kwargs): @@ -3891,7 +3888,7 @@ def _regrid( f"the shape of the regrid operator: {operator.src_shape}" ) - dx = self.to_dask_array(numpify=False) + dx = self.to_dask_array(asanyarray=False) # Rechunk so that each chunk contains data in the form # expected by the regrid operator, i.e. 
the regrid axes all @@ -3959,7 +3956,7 @@ def _regrid( ) # Create a regridding function to apply to each chunk - regrid_func = partial( + cf_regrid_func = partial( regrid, method=method, src_shape=src_shape, @@ -3979,7 +3976,7 @@ def _regrid( # github.com/pangeo-data/pangeo/issues/334#issuecomment-403787663 dx = dx.map_blocks( - regrid_func, + cf_regrid_func, weights_dst_mask=weights_dst_mask, ref_src_mask=src_mask, chunks=regridded_chunks, @@ -3990,7 +3987,6 @@ def _regrid( d = self.copy() d._set_dask(dx) - d._custom['numpified'] = True # Don't know (yet) if 'operator' has a deterministic name d._update_deterministic(False) @@ -4135,7 +4131,7 @@ def concatenate( copied = not copy # to avoid making two copies in a given case # Get data as dask arrays and apply concatenation operation - dxs = [d.to_dask_array(numpify=False) for d in processed_data] + dxs = [d.to_dask_array() for d in processed_data] dx = da.concatenate(dxs, axis=axis) # Set the CFA write status @@ -4808,7 +4804,7 @@ def chunks(self): 6 """ - return self.to_dask_array().chunks + return self.to_dask_array(asanyarray=False).chunks # ---------------------------------------------------------------- # Attributes @@ -4861,22 +4857,21 @@ def Units(self, value): else: dtype = _dtype_float - func = partial(cf_units, from_units=old_units, to_units=value) + cf_func = partial(cf_units, from_units=old_units, to_units=value) - dx = self.to_dask_array(numpify=False) - dx = dx.map_blocks(func, dtype=dtype) + dx = self.to_dask_array(asanyarray=False) + dx = dx.map_blocks(cf_func, dtype=dtype) # Setting equivalent units doesn't affect the CFA write # status. Nor does it invalidate any cached values, but only # because we'll adjust those, too. self._set_dask(dx, clear=_ALL ^ _CACHE ^ _CFA) - self._custom['numpified'] = True # Adjust cached values for the new units cache = self._get_cached_elements() if cache: self._set_cached_elements( - {index: func(value) for index, value in cache.items()} + {index: cf_func(value) for index, value in cache.items()} ) self._Units = value @@ -4931,16 +4926,15 @@ def dtype(self): [1 2 3] """ - dx = self.to_dask_array() + dx = self.to_dask_array(asanyarray=False) return dx.dtype @dtype.setter def dtype(self, value): - dx = self.to_dask_array() - # Only change the datatype if it's different to that of the # dask array - if dx.dtype != value: + if self.dtype != value: + dx = self.to_dask_array() dx = dx.astype(value) self._set_dask(dx) @@ -5093,7 +5087,7 @@ def nbytes(self): 24 """ - dx = self.to_dask_array() + dx = self.to_dask_array(asanyarray=False) # TODO: Check if math.isnan(dx.size): logger.debug("Computing data nbytes: Performance may be degraded") dx.compute_chunk_sizes() @@ -5127,7 +5121,7 @@ def ndim(self): 0 """ - dx = self.to_dask_array() + dx = self.to_dask_array(asanyarray=False) return dx.ndim @property @@ -5149,7 +5143,7 @@ def npartitions(self): 6 """ - return self.to_dask_array().npartitions + return self.to_dask_array(asanyarray=False).npartitions @property def numblocks(self): @@ -5170,7 +5164,7 @@ def numblocks(self): 6 """ - return self.to_dask_array().numblocks + return self.to_dask_array(asanyarray=False).numblocks @property def shape(self): @@ -5200,7 +5194,7 @@ def shape(self): () """ - dx = self.to_dask_array() + dx = self.to_dask_array(asanyarray=False) # TODO: Check if math.isnan(dx.size): logger.debug("Computing data shape: Performance may be degraded") dx.compute_chunk_sizes() @@ -5239,7 +5233,7 @@ def size(self): 1 """ - dx = self.to_dask_array() + dx = 
self.to_dask_array(asanyarray=False) # TODO: Check size = dx.size if math.isnan(size): logger.debug("Computing data size: Performance may be degraded") @@ -5446,7 +5440,8 @@ def arctan(self, inplace=False): d = _inplace_enabled_define_and_cleanup(self) dx = d.to_dask_array() - d._set_dask(da.arctan(dx)) + dx = da.arctan(dx) + d._set_dask(dx) d.override_units(_units_radians, inplace=True) @@ -5601,7 +5596,8 @@ def arcsinh(self, inplace=False): d = _inplace_enabled_define_and_cleanup(self) dx = d.to_dask_array() - d._set_dask(da.arcsinh(dx)) + dx = da.arcsinh(dx) + d._set_dask(dx) d.override_units(_units_radians, inplace=True) @@ -6421,7 +6417,7 @@ def convert_reference_time( ) d.Units = units0 - dx = d.to_dask_array(numpify=False) + dx = d.to_dask_array(asanyarray=False) # Convert to the correct date-time objects dx = dx.map_blocks(cf_rt2dt, units=units0, dtype=object) @@ -6430,7 +6426,6 @@ def convert_reference_time( dx = dx.map_blocks(cf_dt2rt, units=units, dtype=float) d._set_dask(dx) - d._custom['numpified'] = True d.override_units(units, inplace=True) return d @@ -6711,9 +6706,9 @@ def add_file_location(self, location): updated = True if updated: - dx = self.to_dask_array() + dx = self.to_dask_array(asanyarray=False) dx = da.Array(dsk, dx.name, dx.chunks, dx.dtype, dx._meta) - self._set_dask(dx, clear=_NONE) + self._set_dask(dx, clear=_NONE, asanyarray=None) return location @@ -8442,7 +8437,8 @@ def exp(self, inplace=False, i=False): d.Units = _units_1 dx = d.to_dask_array() - d._set_dask(da.exp(dx)) + dx = da.exp(dx) + d._set_dask(dx) return d @@ -8488,7 +8484,7 @@ def insert_dimension(self, position=0, inplace=False): shape = list(d.shape) shape.insert(position, 1) - dx = d.to_dask_array(numpify=False) + dx = d.to_dask_array() dx = dx.reshape(shape) # Inserting a dimension doesn't affect the cached elements nor @@ -8877,10 +8873,9 @@ def harden_mask(self): [1 -- 3] """ - dx = self.to_dask_array(numpify=False) + dx = self.to_dask_array(asanyarray=False) dx = dx.map_blocks(cf_harden_mask, dtype=self.dtype) self._set_dask(dx, clear=_NONE) - self._custom['numpified'] = True self.hardmask = True def has_calendar(self): @@ -8998,10 +8993,9 @@ def soften_mask(self): [ 1 999 3] """ - dx = self.to_dask_array(numpify=False) + dx = self.to_dask_array(asanyarray=False) dx = dx.map_blocks(cf_soften_mask, dtype=self.dtype) self._set_dask(dx, clear=_NONE) - self._custom['numpified'] = True self.hardmask = False def file_locations(self): @@ -9086,10 +9080,9 @@ def filled(self, fill_value=None, inplace=False): f"data type {d.dtype.str!r}" ) - dx = d.to_dask_array(numpify=False) + dx = d.to_dask_array(asanyarray=False) dx = dx.map_blocks(cf_filled, fill_value=fill_value, dtype=d.dtype) d._set_dask(dx) - d._custom['numpified'] = True return d @@ -9716,7 +9709,7 @@ def override_calendar(self, calendar, inplace=False, i=False): d._Units = Units(d.Units._units, calendar) return d - def to_dask_array(self, apply_mask_hardness=False, numpify=True): + def to_dask_array(self, apply_mask_hardness=False, asanyarray=None): """Convert the data to a `dask` array. .. warning:: By default, the mask hardness of the returned @@ -9740,6 +9733,11 @@ def to_dask_array(self, apply_mask_hardness=False, numpify=True): If True then force the mask hardness of the returned array to be that given by the `hardmask` attribute. + asanyarray: `bool` or `None`, optional + TODO + + .. 
versionadded:: NEXTVERSION + :Returns: `dask.array.Array` @@ -9764,17 +9762,23 @@ def to_dask_array(self, apply_mask_hardness=False, numpify=True): """ dx = self._custom.get("dask") if dx is None: - raise ValueError(f"{self.__class__.__name__} object has no data") + raise ValueError(f"{self.__class__.__name__} object has no data") if apply_mask_hardness: if self.hardmask: self.harden_mask() else: self.soften_mask() - elif numpify and not self._custom.get('numpified'): - return dx.map_blocks(asanyarray, dtype=dx.dtype) - return self._custom["dask"] + dx = self._custom["dask"] + else: + if asanyarray is None: + asanyarray = self._custom.get("asanyarray") + + if asanyarray: + dx = dx.map_blocks(cf_asanyarray, dtype=dx.dtype) + + return dx def datum(self, *index): """Return an element of the data array as a standard Python @@ -9948,10 +9952,9 @@ def masked_invalid(self, inplace=False): """ d = _inplace_enabled_define_and_cleanup(self) - dx = self.to_dask_array(numpify=True) + dx = self.to_dask_array() dx = da.ma.masked_invalid(dx) d._set_dask(dx) - d._custom['numpified'] = True return d def del_calendar(self, default=ValueError()): @@ -10046,10 +10049,9 @@ def del_file_location(self, location): updated = True if updated: - dx = self.to_dask_array(numpify=False) + dx = self.to_dask_array(asanyarray=False) dx = da.Array(dsk, dx.name, dx.chunks, dx.dtype, dx._meta) - self._set_dask(dx, clear=_NONE) - self._custom['numpified'] = True + self._set_dask(dx, clear=_NONE, asanyarray=None) return location @@ -11215,7 +11217,7 @@ def where( # Missing values could be affected, so make sure that the mask # hardness has been applied. - dx = d.to_dask_array(apply_mask_hardness=True, numpify=False) + dx = d.to_dask_array(apply_mask_hardness=True, asanyarray=False) units = d.Units @@ -11230,8 +11232,8 @@ def where( condition = type(self).asdata(condition) condition = where_broadcastable(d, condition, "condition") - condition = condition.to_dask_array(numpify=False) - + condition = condition.to_dask_array(asanyarray=False) + # If x or y is self then change it to None. This prevents an # unnecessary copy; and, at compute time, an unncessary numpy # where. 
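Condensing the new `to_dask_array` logic above into a standalone sketch (the mask-hardness branch is omitted, and the `dask_utils` helper is stood in for by a local copy): an explicit `asanyarray` argument wins, otherwise the instance's `_custom['asanyarray']` flag decides whether every chunk is wrapped.

import dask.array as da
import numpy as np

def cf_asanyarray(a):
    # Local stand-in for the dask_utils helper: only objects that opt in
    # via `_dask_asanyarray` are converted to numpy
    return np.asanyarray(a) if getattr(a, "_dask_asanyarray", False) else a

class TinyData:
    def __init__(self, dx, asanyarray=True):
        self._custom = {"dask": dx, "asanyarray": asanyarray}

def to_dask_array_sketch(d, asanyarray=None):
    dx = d._custom["dask"]
    if asanyarray is None:
        asanyarray = d._custom.get("asanyarray")
    if asanyarray:
        # One extra graph layer; chunks are only materialised at compute time
        dx = dx.map_blocks(cf_asanyarray, dtype=dx.dtype)
    return dx

d = TinyData(da.ones((4,), chunks=2))
print(to_dask_array_sketch(d).compute())                    # wrapped graph
print(to_dask_array_sketch(d, asanyarray=False).compute())  # raw graph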
@@ -11274,10 +11276,8 @@ def where( x, y = xy # Apply the where operation - dx = da.core.elemwise( - cf_where, dx, condition, x, y, d.hardmask - ) - d._set_dask(dx, computable=True) + dx = da.core.elemwise(cf_where, dx, condition, x, y, d.hardmask) + d._set_dask(dx) # Don't know (yet) if 'x' and 'y' have a deterministic names d._update_deterministic(False) @@ -11337,7 +11337,8 @@ def sin(self, inplace=False, i=False): d.Units = _units_radians dx = d.to_dask_array() - d._set_dask(da.sin(dx)) + dx = da.sin(dx) + d._set_dask(dx) d.override_units(_units_1, inplace=True) @@ -11397,7 +11398,8 @@ def sinh(self, inplace=False): d.Units = _units_radians dx = d.to_dask_array() - d._set_dask(da.sinh(dx)) + dx = da.sinh(dx) + d._set_dask(dx) d.override_units(_units_1, inplace=True) @@ -11455,7 +11457,8 @@ def cosh(self, inplace=False): d.Units = _units_radians dx = d.to_dask_array() - d._set_dask(da.cosh(dx)) + dx = da.cosh(dx) + d._set_dask(dx) d.override_units(_units_1, inplace=True) @@ -11498,10 +11501,10 @@ def cull_graph(self): ('array-21ea057f160746a3d3f0943bba945460', 0): array([1, 2, 3])} """ - dx = self.to_dask_array() + dx = self.to_dask_array(asanyarray=False) dsk, _ = cull(dx.dask, dx.__dask_keys__()) dx = da.Array(dsk, name=dx.name, chunks=dx.chunks, dtype=dx.dtype) - self._set_dask(dx, clear=_NONE) + self._set_dask(dx, clear=_NONE, asanyarray=None) @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") @_inplace_enabled(default=False) @@ -11558,7 +11561,8 @@ def tanh(self, inplace=False): d.Units = _units_radians dx = d.to_dask_array() - d._set_dask(da.tanh(dx)) + dx = da.tanh(dx) + d._set_dask(dx) d.override_units(_units_1, inplace=True) @@ -11761,13 +11765,16 @@ def tan(self, inplace=False, i=False): d.Units = _units_radians dx = d.to_dask_array() - d._set_dask(da.tan(dx)) + dx = da.tan(dx) + d._set_dask(dx) d.override_units(_units_1, inplace=True) return d - def todict(self, optimize_graph=True): + def todict( + self, optimize_graph=True, apply_mask_hardness=False, asanyarray=False + ): """Return a dictionary of the dask graph key/value pairs. .. versionadded:: 3.15.0 @@ -11782,6 +11789,17 @@ def todict(self, optimize_graph=True): chunks. Note that optimising the graph can add a considerable performance overhead. + apply_mask_hardness: `bool`, optional + If True then force the mask hardness of the returned + array to be that given by the `hardmask` attribute. + + .. versionadded:: NEXTVERSION + + asanyarray: `bool` or `None`, optional + TODO + + .. 
versionadded:: NEXTVERSION + :Returns: `dict` @@ -11807,7 +11825,9 @@ def todict(self, optimize_graph=True): 0), (slice(0, 1, 1),))} """ - dx = self.to_dask_array(numpify=False) # TODO + dx = self.to_dask_array( + apply_mask_hardness=apply_mask_hardness, asanyarray=asanyarray + ) if optimize_graph: return collections_to_dsk((dx,), optimize_graph=True) @@ -11919,7 +11939,7 @@ def transpose(self, axes=None, inplace=False, i=False): data_axes = d._axes d._axes = [data_axes[i] for i in iaxes] - dx = d.to_dask_array(numpify=True) + dx = d.to_dask_array() try: dx = da.transpose(dx, axes=axes) except ValueError: @@ -11928,7 +11948,6 @@ def transpose(self, axes=None, inplace=False, i=False): ) d._set_dask(dx) - d._custom['numpified'] = True return d @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") @@ -11964,9 +11983,9 @@ def trunc(self, inplace=False, i=False): """ d = _inplace_enabled_define_and_cleanup(self) - dx = d.to_dask_array(numpify=True) - d._set_dask(da.trunc(dx)) - d._custom['numpified'] = True + dx = d.to_dask_array() + dx = da.trunc(dx) + d._set_dask(dx) return d @classmethod @@ -12271,7 +12290,7 @@ def func( """ d = _inplace_enabled_define_and_cleanup(self) - dx = d.to_dask_array(numpify=True) + dx = d.to_dask_array() if preserve_invalid: # Assume all inputs are masked, as checking for a mask to confirm @@ -12288,7 +12307,6 @@ def func( dx = da.ma.masked_array(dx, mask=dx_mask) d._set_dask(dx) - d._custom['numpified'] = True if units is not None: d.override_units(units, inplace=True) @@ -12452,10 +12470,9 @@ def roll(self, axis, shift, inplace=False, i=False): d = _inplace_enabled_define_and_cleanup(self) - dx = d.to_dask_array(numpify=True) + dx = d.to_dask_array() dx = da.roll(dx, shift, axis=axis) d._set_dask(dx) - d._custom['numpified'] = True return d @@ -13117,10 +13134,9 @@ def square(self, dtype=None, inplace=False): """ d = _inplace_enabled_define_and_cleanup(self) - dx = d.to_dask_array(numpify=True) + dx = d.to_dask_array() dx = da.square(dx, dtype=dtype) d._set_dask(dx) - d._custom['numpified'] = True units = d.Units if units: @@ -13187,10 +13203,9 @@ def sqrt(self, dtype=None, inplace=False): """ d = _inplace_enabled_define_and_cleanup(self) - dx = d.to_dask_array(numpify=True) + dx = d.to_dask_array() dx = da.sqrt(dx, dtype=dtype) d._set_dask(dx) - d._custom['numpified'] = True units = d.Units if units: diff --git a/cf/functions.py b/cf/functions.py index f4964637eb..30109091c5 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -1885,7 +1885,9 @@ def indices_shape(indices, full_shape, keepdims=True): return shape -def parse_indices(shape, indices, cyclic=False, keepdims=True, bool_as_int=False): +def parse_indices( + shape, indices, cyclic=False, keepdims=True +): # , bool_as_int=False): """Parse indices for array access and assignment. 
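As an aside on the `todict` implementation above: it is essentially a thin wrapper around dask's graph extraction, with the mask-hardness and `asanyarray` switches applied first via `to_dask_array`. A rough sketch using only dask's public API (not the patch itself; the array here is arbitrary):

import dask.array as da
from dask.base import collections_to_dsk

dx = da.ones((4, 4), chunks=2)

# A plain dict of task key/value pairs, one entry per chunk plus any
# intermediate layers that survive optimisation
dsk = collections_to_dsk((dx,), optimize_graph=True)
print(type(dsk), len(dsk))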
:Parameters: @@ -2041,12 +2043,11 @@ def parse_indices(shape, indices, cyclic=False, keepdims=True, bool_as_int=False if callable(to_dask_array): # Replace index with its Dask array index = to_dask_array() - - elif bool_as_int: - index = np.asanyarray(index) - if index.dtype == bool: - index = np.arange(size)[index] - + # + # elif bool_as_int: + # index = np.asanyarray(index) + # if index.dtype == bool: + # index = np.arange(size)[index] parsed_indices[i] = index From 297f33be0781c3d4ba812631aa44082d4a3c3995 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Sun, 17 Mar 2024 14:24:41 +0000 Subject: [PATCH 060/134] dev --- cf/data/array/mixin/indexmixin.py | 73 +++++++++++++++++++++---------- cf/data/array/netcdfarray.py | 48 ++------------------ cf/data/dask_utils.py | 1 - cf/data/data.py | 18 +++++++- 4 files changed, 70 insertions(+), 70 deletions(-) diff --git a/cf/data/array/mixin/indexmixin.py b/cf/data/array/mixin/indexmixin.py index c94bd7559d..f31e2ff785 100644 --- a/cf/data/array/mixin/indexmixin.py +++ b/cf/data/array/mixin/indexmixin.py @@ -3,7 +3,7 @@ import numpy as np from dask.base import is_dask_collection -from ....functions import parse_indices +from ....functions import parse_indices, indices_shape class IndexMixin: @@ -63,25 +63,35 @@ def __getitem__(self, index): TODO """ - new = self.copy() - - shape0 = self.shape + shape = self.shape index0 = self.index - index = parse_indices(shape0, index, keepdims=False) + original_shape = self.original_shape + index = parse_indices(shape, index, keepdims=True) + + new = self.copy() new_indices = [] new_shape = [] - for ind0, ind, size in zip(index0, index, shape0): - if ind == slice(None): + + for ind0, ind, size, original_size in zip( + index0, index, shape, original_shape + ): + if isinstance(ind, slice) and ind == slice(None): new_indices.append(ind0) new_shape.append(size) continue if is_dask_collection(ind): - # I think that this will never occur when __getitem__ - # is being called from within a Dask graph. Otherwise - # we'll need to run the `compute` inside a `with - # dask.config.set({"scheduler": "synchronous"}):` + # Note: This will never occur when __getitem__ is + # being called from within a Dask graph, because + # any lazy indices will have already been + # computed as part of the whole graph execution + # - i.e. we don't have to worry about a + # compute-withn-a-compute situation. (If this + # were not the case then we could get round it + # by wrapping the compute inside a `with + # dask.config.set({"scheduler": + # "synchronous"}):` claus.) 
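`parse_indices` (modified above) is also what the new `IndexMixin.__getitem__` uses to normalise its index argument, so a hedged sketch of a typical call may help orient the reader (illustrative only; the exact normalised form of the returned indices may vary):

from cf.functions import parse_indices

# One parsed index per axis of a (5, 8) array; the Ellipsis is expanded
# into full slices
print(parse_indices((5, 8), (Ellipsis, slice(1, 3))))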
ind = ind.compute() if isinstance(ind0, slice): @@ -108,26 +118,26 @@ def __getitem__(self, index): stop = None new_index = slice(start, stop, step) - new_size = ceil((stop - start) / step) else: - # 'ind0' is slice, 'ind' is (array of) int/bool - new_index = np.arange(*ind0.indices(size0))[ind] - new_size = new.size + # 'ind0' is slice, 'ind' is array of int/bool + new_index = np.arange(*ind0.indices(original_size))[ind] else: - # 'ind0' is (array of) int + # 'ind0' is array of int new_index = np.asanyarray(ind0)[ind] - new_size = new.size new_indices.append(new_index) - new_shape.append(new_size) - new._set_component("index", tuple(new_indices), copy=False) + new_shape = indices_shape(new_indices, original_shape, keepdims=False) new._set_component("shape", tuple(new_shape), copy=False) - - print (index0, index, new_indices) - + + new._custom["index"] = tuple(new_indices) return new + def __repr__(self): + """TODO""" + out = super().__repr__() + return f"{out[:-1]}{self.original_shape}>" + @property def _dask_asanyarray(self): """TODO @@ -167,9 +177,24 @@ def index(self): .. versionadded:: NEXTVERSION """ - ind = self._get_component("index", None) + ind = self._custom.get("index") if ind is None: ind = (slice(None),) * self.ndim - self._set_component("index", ind, copy=False) + self._custom["index"]= ind return ind + + @property + def original_shape(self): + """TODO + + .. versionadded:: NEXTVERSION + + """ + shape = self._custom.get('original_shape') + if shape is None: + shape = self.shape + self._custom["original_shape"] = shape + + return shape + diff --git a/cf/data/array/netcdfarray.py b/cf/data/array/netcdfarray.py index 578f7be66e..2150a97e92 100644 --- a/cf/data/array/netcdfarray.py +++ b/cf/data/array/netcdfarray.py @@ -36,47 +36,7 @@ def _lock(self): def _get_array(self): """TODO""" - print ('cf.NetCDFArray._get_array', self.index) -# return super(cfdm.NetCDFArray, self).__getitem__(self.index) -# return super(cfdm.NetCDFArray, self).__getitem__(self.index) - - netcdf, address = self.open() - dataset = netcdf - - groups, address = self.get_groups(address) - if groups: - # Traverse the group structure, if there is one (CF>=1.8). - netcdf = self._group(netcdf, groups) - - if isinstance(address, str): - # Get the variable by netCDF name - variable = netcdf.variables[address] - else: - # Get the variable by netCDF integer ID - for variable in netcdf.variables.values(): - if variable._varid == address: - break - - # Get the data, applying masking and scaling as required. -# array = cfdm.netcdf_indexer( -# variable, -# mask=self.get_mask(), -# unpack=self.get_unpack(), -# always_mask=False, -# ) - array = variable[self.index] - - # Set the units, if they haven't been set already. -# self._set_attributes(variable) - - # Set the units, if they haven't been set already. - self._set_units(variable) - - self.close(dataset) - del netcdf, dataset - - if not self.ndim: - # Hmm netCDF4 has a thing for making scalar size 1, 1d - array = array.squeeze() - - return array + # Note: Using Container in super because that comes + # immediately before cfdm.NetCDFArray in the method + # resolution order. 
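The `super(Container, self)` call that follows this note relies on Python's method resolution order. A toy illustration (not from the patch, with made-up class names) of why it jumps straight to the class after `Container`:

class A:
    def get(self):
        return "A.get"

class Container(A):
    def get(self):
        return "Container.get"

class NetCDFArray(Container):
    def get(self):
        # Start the MRO search *after* Container, so A.get is found even
        # though Container also defines get()
        return super(Container, self).get()

print(NetCDFArray().get())  # prints "A.get"

In the real `NetCDFArray._get_array` the same pattern reaches `cfdm.NetCDFArray.__getitem__`, the class immediately after `Container` in the MRO, which is where the file access actually happens.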
+ return super(Container, self).__getitem__(self.index) diff --git a/cf/data/dask_utils.py b/cf/data/dask_utils.py index e018877473..b69147db8b 100644 --- a/cf/data/dask_utils.py +++ b/cf/data/dask_utils.py @@ -721,7 +721,6 @@ def cf_asanyarray(a): """ if getattr(a, "_dask_asanyarray", False): - print ('cf_asanyarray', repr(a)) return np.asanyarray(a) return a diff --git a/cf/data/data.py b/cf/data/data.py index bf1db9a164..5932b40bd1 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -1399,6 +1399,12 @@ def _set_dask(self, dx, copy=False, clear=_ALL, asanyarray=False): asanyarray: `bool` or `None`, optional TODO + If True then call `np.asanyarray` on chunks to convert + them to numpy arrays. If False then chunks are passed + through unchanged. If None (default) then we use True + if the ``__array_function__`` method is undefined. + + .. versionadded:: NEXTRELEASE :Returns: @@ -1541,7 +1547,7 @@ def _set_cached_elements(self, elements): within its ``custom`` dictionary. .. warning:: Never change ``_custom['cached_elements']`` - in-place. + in-place. .. versionadded:: 3.14.0 @@ -5290,6 +5296,16 @@ def array(self): elif not isinstance(array, np.ndarray): array = np.asanyarray(array) + # Set cached elements + items = [0, -1] + if array.size >= 3: + items.append(1) + + if array.ndim == 2 and array.shape[-1] == 2: + items.append(-2) + + self._set_cached_elements({i: array.item(i) for i in items}) + return array @property From c7a9cb959bae8336cfb5a31b57c6e887e0bb652e Mon Sep 17 00:00:00 2001 From: David Hassell Date: Sun, 17 Mar 2024 18:50:38 +0000 Subject: [PATCH 061/134] dev --- cf/data/array/mixin/indexmixin.py | 125 +++++++++++++++++----------- cf/data/array/netcdfarray.py | 27 ++++-- cf/data/array/umarray.py | 92 +++----------------- cf/data/data.py | 65 +++++++++------ cf/read_write/netcdf/netcdfwrite.py | 2 +- 5 files changed, 151 insertions(+), 160 deletions(-) diff --git a/cf/data/array/mixin/indexmixin.py b/cf/data/array/mixin/indexmixin.py index f31e2ff785..5ff20b427f 100644 --- a/cf/data/array/mixin/indexmixin.py +++ b/cf/data/array/mixin/indexmixin.py @@ -1,9 +1,7 @@ -from math import ceil - import numpy as np from dask.base import is_dask_collection -from ....functions import parse_indices, indices_shape +from ....functions import indices_shape, parse_indices class IndexMixin: @@ -38,43 +36,55 @@ def __array__(self, *dtype): return array def __getitem__(self, index): - """TODO Returns a subspace of the array as a numpy array. + """Returns a subspace of the array as a new `{{class}}`. x.__getitem__(indices) <==> x[indices] - The indices that define the subspace must be either `Ellipsis` or - a sequence that contains an index for each dimension. In the - latter case, each dimension's index must either be a `slice` - object or a sequence of two or more integers. + The new `{{class}}` may be converted to a `numpy` array with + its `__array__` method. + + Consecutive subspaces are lazy, with only the final data + elements read from the dataset when `__array__` is called. + + For example, if a dataset variable has shape ``(12, 145, + 192)`` and consecutive subspaces of ``[8:9, 10:20:3, [15, 1, + 4, 12]`` and ``[[0], [True, False, True], ::-2]`` are applied + then only the elements defined by subspace ``[[8], [10, 16], + [12, 1]]`` will be retrieved from the dataset when `__array__` + is called. - Indexing is similar to numpy indexing. The only difference to - numpy indexing (given the restrictions on the type of indices - allowed) is: + Indexing is similar to `numpy` indexing. 
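Returning briefly to the element caching added to the `array` property above, a hedged sketch of the selection rule as it stands at this point in the series (the helper name is made up; the rule itself is revised again in a later commit below):

import numpy as np

def items_to_cache(a):
    # First and last elements are always cached; the second element is
    # added for size >= 3, and the second-to-last for (N, 2) bounds-like
    # arrays
    items = [0, -1]
    if a.size >= 3:
        items.append(1)
    if a.ndim == 2 and a.shape[-1] == 2:
        items.append(-2)
    return {i: a.item(i) for i in items}

print(items_to_cache(np.arange(6).reshape(3, 2)))
# {0: 0, -1: 5, 1: 1, -2: 4}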
The only difference + to numpy indexing (given the restrictions on the type of + indices allowed) is: - * When two or more dimension's indices are sequences of integers - then these indices work independently along each dimension - (similar to the way vector subscripts work in Fortran). + * When two or more dimension's indices are sequences of + integers then these indices work independently along each + dimension (similar to the way vector subscripts work in + Fortran). .. versionadded:: NEXTVERSION + .. seealso:: `index`, `original_shape`, `__array__`, + `__getitem__` + :Returns: `{{class}}` - TODO + The subspaced array. """ shape = self.shape index0 = self.index original_shape = self.original_shape - index = parse_indices(shape, index, keepdims=True) - + index = parse_indices(shape, index, keepdims=False) + new = self.copy() new_indices = [] new_shape = [] - + for ind0, ind, size, original_size in zip( - index0, index, shape, original_shape + index0, index, shape, original_shape ): if isinstance(ind, slice) and ind == slice(None): new_indices.append(ind0) @@ -85,20 +95,20 @@ def __getitem__(self, index): # Note: This will never occur when __getitem__ is # being called from within a Dask graph, because # any lazy indices will have already been - # computed as part of the whole graph execution - # - i.e. we don't have to worry about a - # compute-withn-a-compute situation. (If this + # computed as part of the whole graph execution; + # i.e. we don't have to worry about a + # compute-within-a-compute situation. (If this # were not the case then we could get round it # by wrapping the compute inside a `with # dask.config.set({"scheduler": - # "synchronous"}):` claus.) + # "synchronous"}):` clause.) ind = ind.compute() if isinstance(ind0, slice): if isinstance(ind, slice): - # 'ind0' is slice, 'ind' is slice + # 'ind0' is slice; 'ind' is slice start, stop, step = ind0.indices(size) - size0, _ = divmod(stop - start - 1, step) + size0 = indices_shape((ind0,), (original_size,))[0] start1, stop1, step1 = ind.indices(size0 + 1) size1, mod1 = divmod(stop1 - start1, step1) @@ -118,26 +128,31 @@ def __getitem__(self, index): stop = None new_index = slice(start, stop, step) - else: - # 'ind0' is slice, 'ind' is array of int/bool + elif np.iterable(ind): + # 'ind0' is slice; 'ind' is array of int/bool new_index = np.arange(*ind0.indices(original_size))[ind] + else: + raise ValueError( + f"Can't subspace {self!r} with index {ind} that " + "removes a dimension" + ) else: # 'ind0' is array of int new_index = np.asanyarray(ind0)[ind] new_indices.append(new_index) - new_shape = indices_shape(new_indices, original_shape, keepdims=False) + new_shape = indices_shape(new_indices, original_shape, keepdims=True) new._set_component("shape", tuple(new_shape), copy=False) - - new._custom["index"] = tuple(new_indices) + + new._custom["index"] = tuple(new_indices) return new def __repr__(self): """TODO""" out = super().__repr__() return f"{out[:-1]}{self.original_shape}>" - + @property def _dask_asanyarray(self): """TODO @@ -148,22 +163,19 @@ def _dask_asanyarray(self): return True def _get_array(self): - """TODO Returns a subspace of the array as a numpy array. + """Returns a subspace of the dataset variable. - The indices that define the subspace must be either `Ellipsis` or - a sequence that contains an index for each dimension. In the - latter case, each dimension's index must either be a `slice` - object or a sequence of two or more integers. 
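The consecutive-subspace behaviour documented above can be checked per dimension with plain `numpy`, because the orthogonal indices compose independently along each axis. A sketch (not part of the patch; the boolean mask from the docstring example is padded to the size of the intermediate subspace):

import numpy as np

original_shape = (12, 145, 192)
first = (slice(8, 9), slice(10, 20, 3), [15, 1, 4, 12])
second = ([0], [True, False, True, False], slice(None, None, -2))

for size, ind0, ind1 in zip(original_shape, first, second):
    # Positions in the original data selected by applying ind0 then ind1
    print(np.arange(size)[ind0][ind1])
# [8]
# [10 16]
# [12  1]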
+ The subspace is defined by the indices stored in the `index` + attribute. - Indexing is similar to numpy indexing. The only difference to - numpy indexing (given the restrictions on the type of indices - allowed) is: + .. versionadded:: NEXTVERSION - * When two or more dimension's indices are sequences of integers - then these indices work independently along each dimension - (similar to the way vector subscripts work in Fortran). + .. seealso:: `__array__`, `index` - .. versionadded:: NEXTVERSION + :Returns: + + `numpy.ndarray` + The subspace. """ return NotImplementedError( @@ -172,15 +184,31 @@ def _get_array(self): @property def index(self): - """TODO + """The index to be applied when converting to a `numpy` array. .. versionadded:: NEXTVERSION + :Returns: + + `tuple` + + **Examples** + + >>> x.index + (slice(None, None, None), slice(None, None, None), slice(None, None, None)) + >>> x.index + (slice(None, None, None), slice(None, None, None), slice(None, None, None)) + >>> x = x[[0], 10:20:2, :] + >>> x.index + + TODO + + """ ind = self._custom.get("index") if ind is None: ind = (slice(None),) * self.ndim - self._custom["index"]= ind + self._custom["index"] = ind return ind @@ -191,10 +219,9 @@ def original_shape(self): .. versionadded:: NEXTVERSION """ - shape = self._custom.get('original_shape') + shape = self._custom.get("original_shape") if shape is None: - shape = self.shape + shape = self.shape self._custom["original_shape"] = shape - + return shape - diff --git a/cf/data/array/netcdfarray.py b/cf/data/array/netcdfarray.py index 2150a97e92..113fceb46f 100644 --- a/cf/data/array/netcdfarray.py +++ b/cf/data/array/netcdfarray.py @@ -8,7 +8,9 @@ _lock = SerializableLock() -class NetCDFArray(IndexMixin, FileArrayMixin, ArrayMixin, Container, cfdm.NetCDFArray): +class NetCDFArray( + IndexMixin, FileArrayMixin, ArrayMixin, Container, cfdm.NetCDFArray +): """An array stored in a netCDF file.""" def __dask_tokenize__(self): @@ -35,8 +37,23 @@ def _lock(self): return _lock def _get_array(self): - """TODO""" - # Note: Using Container in super because that comes - # immediately before cfdm.NetCDFArray in the method - # resolution order. + """Returns a subspace of the dataset variable. + + The subspace is defined by the indices stored in the `index` + attribute. + + .. versionadded:: NEXTVERSION + + .. seealso:: `__array__`, `index` + + :Returns: + + `numpy.ndarray` + The subspace. + + """ + # Note: It's cfdm.NetCDFArray.__getitem__ that we want to + # call, but we use 'Container' in super because that + # comes immediately before cfdm.NetCDFArray in the + # method resolution order. return super(Container, self).__getitem__(self.index) diff --git a/cf/data/array/umarray.py b/cf/data/array/umarray.py index bf0ca67ddc..128af5e11e 100644 --- a/cf/data/array/umarray.py +++ b/cf/data/array/umarray.py @@ -176,9 +176,19 @@ def __init__( self._set_component("close", True, copy=False) def _get_array(self): - """Return a subspace of the array. + """Returns a subspace of the dataset variable. - Returns a subspace of the array as an independent numpy array. + The subspace is defined by the indices stored in the `index` + attribute. + + .. versionadded:: NEXTVERSION + + .. seealso:: `__array__`, `index` + + :Returns: + + `numpy.ndarray` + The subspace. 
""" f, header_offset = self.open() @@ -186,7 +196,7 @@ def _get_array(self): int_hdr = rec.int_hdr real_hdr = rec.real_hdr - array = rec.get_data().reshape(self.shape) + array = rec.get_data().reshape(self.original_shape) self.close(f) del f, rec @@ -247,82 +257,6 @@ def _get_array(self): # Return the numpy array return array - # def __getitem__(self, indices): - # """Return a subspace of the array. - # - # x.__getitem__(indices) <==> x[indices] - # - # Returns a subspace of the array as an independent numpy array. - # - # """ - # f, header_offset = self.open() - # rec = self._get_rec(f, header_offset) - # - # int_hdr = rec.int_hdr - # real_hdr = rec.real_hdr - # array = rec.get_data().reshape(self.shape) - # - # self.close(f) - # del f, rec - # - # if indices is not Ellipsis: - # indices = parse_indices(array.shape, indices) - # array = get_subspace(array, indices) - # - # # Set the units, if they haven't been set already. - # self._set_units(int_hdr) - # - # LBUSER2 = int_hdr.item(38) - # if LBUSER2 == 3: - # # Return the numpy array now if it is a boolean array - # self._set_component("dtype", np.dtype(bool), copy=False) - # return array.astype(bool) - # - # integer_array = LBUSER2 == 2 - # - # # ------------------------------------------------------------ - # # Convert to a masked array - # # ------------------------------------------------------------ - # # Set the fill_value from BMDI - # fill_value = real_hdr.item(17) - # if fill_value != -1.0e30: - # # -1.0e30 is the flag for no missing data - # if integer_array: - # # The fill_value must be of the same type as the data - # # values - # fill_value = int(fill_value) - # - # # Mask any missing values - # mask = array == fill_value - # if mask.any(): - # array = np.ma.masked_where(mask, array, copy=False) - # - # # ------------------------------------------------------------ - # # Unpack the array using the scale_factor and add_offset, if - # # either is available - # # ------------------------------------------------------------ - # # Treat BMKS as a scale_factor if it is neither 0 nor 1 - # scale_factor = real_hdr.item(18) - # if scale_factor != 1.0 and scale_factor != 0.0: - # if integer_array: - # scale_factor = int(scale_factor) - # - # array *= scale_factor - # - # # Treat BDATUM as an add_offset if it is not 0 - # add_offset = real_hdr.item(4) - # if add_offset != 0.0: - # if integer_array: - # add_offset = int(add_offset) - # - # array += add_offset - # - # # Set the data type - # self._set_component("dtype", array.dtype, copy=False) - # - # # Return the numpy array - # return array - def _get_rec(self, f, header_offset): """Get a container for a record. diff --git a/cf/data/data.py b/cf/data/data.py index 5932b40bd1..c5dd037479 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -1397,13 +1397,11 @@ def _set_dask(self, dx, copy=False, clear=_ALL, asanyarray=False): `_clear_after_dask_update` for details. asanyarray: `bool` or `None`, optional - TODO - - If True then call `np.asanyarray` on chunks to convert - them to numpy arrays. If False then chunks are passed - through unchanged. If None (default) then we use True - if the ``__array_function__`` method is undefined. - + If True then at compute time add a final operation to + the Dask graph that converts chunks to `numpy` + arrays. If False, the default, then do not do this. If + `None` then do not change the current behaviour, which + is defined by the `_asanyarray` attribute. .. 
versionadded:: NEXTRELEASE @@ -5290,23 +5288,26 @@ def array(self): 2000-12-01 00:00:00 """ - array = self.compute().copy() - if issparse(array): - array = array.toarray() - elif not isinstance(array, np.ndarray): - array = np.asanyarray(array) + a = self.compute().copy() + if issparse(a): + a = a.toarray() + elif not isinstance(a, np.ndarray): + a = np.asanyarray(a) + + if not a.size: + return a # Set cached elements items = [0, -1] - if array.size >= 3: + if a.size >= 3: items.append(1) - - if array.ndim == 2 and array.shape[-1] == 2: + + if a.ndim == 2 and a.shape[-1] == 2: items.append(-2) - - self._set_cached_elements({i: array.item(i) for i in items}) - - return array + + self._set_cached_elements({i: a.item(i) for i in items}) + + return a @property def datetime_array(self): @@ -6576,7 +6577,7 @@ def get_filenames(self): """ out = set() - for a in self.todict().values(): + for a in self.todict(asanyarray=False).values(): try: out.update(a.get_filenames()) except AttributeError: @@ -6709,7 +6710,7 @@ def add_file_location(self, location): location = abspath(location).rstrip(sep) updated = False - dsk = self.todict() + dsk = self.todict(asanyarray=False) for key, a in dsk.items(): try: dsk[key] = a.add_file_location(location) @@ -9038,7 +9039,7 @@ def file_locations(self): """ out = set() - for key, a in self.todict().items(): + for key, a in self.todict(asanyarray=False).items(): try: out.update(a.file_locations()) except AttributeError: @@ -9750,7 +9751,15 @@ def to_dask_array(self, apply_mask_hardness=False, asanyarray=None): array to be that given by the `hardmask` attribute. asanyarray: `bool` or `None`, optional - TODO + If True then add a final operation to the Dask graph + that converts chunks to `numpy` arrays. If False then + do not do this. If `None`, the default, then add the + final operation only if the `_asanyarray` attribute is + `True`. + + .. note:: Such a final operation is included in the + returned Dask array, but is not included in + the Dask array stored in the `Data` object. .. versionadded:: NEXTVERSION @@ -10052,7 +10061,7 @@ def del_file_location(self, location): location = abspath(location).rstrip(sep) updated = False - dsk = self.todict() + dsk = self.todict(asanyarray=False) for key, a in dsk.items(): try: dsk[key] = a.del_file_location(location) @@ -11789,7 +11798,7 @@ def tan(self, inplace=False, i=False): return d def todict( - self, optimize_graph=True, apply_mask_hardness=False, asanyarray=False + self, optimize_graph=True, apply_mask_hardness=False, asanyarray=None ): """Return a dictionary of the dask graph key/value pairs. @@ -11812,7 +11821,11 @@ def todict( .. versionadded:: NEXTVERSION asanyarray: `bool` or `None`, optional - TODO + If True then add a final operation to the Dask graph + that converts chunks to `numpy` arrays. If False then + do not do this. If `None`, the default, then add the + final operation only if the `_asanyarray` attribute is + `True`. .. 
versionadded:: NEXTVERSION diff --git a/cf/read_write/netcdf/netcdfwrite.py b/cf/read_write/netcdf/netcdfwrite.py index ad7da4c9cd..8cdc9cf9c7 100644 --- a/cf/read_write/netcdf/netcdfwrite.py +++ b/cf/read_write/netcdf/netcdfwrite.py @@ -1056,7 +1056,7 @@ def _cfa_get_file_details(self, data): """ out = set() - for a in data.todict().values(): + for a in data.todict(asanyarray=False).values(): try: out.update( ((a.get_filenames(), a.get_addresses(), a.get_formats()),) From d48a7cf026ca09c80c6762784d634abe9c103378 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 18 Mar 2024 00:59:20 +0000 Subject: [PATCH 062/134] dev --- cf/data/array/mixin/indexmixin.py | 175 ++++++++++++++++-------------- cf/data/creation.py | 6 +- cf/data/dask_utils.py | 37 +++---- cf/data/data.py | 7 +- 4 files changed, 120 insertions(+), 105 deletions(-) diff --git a/cf/data/array/mixin/indexmixin.py b/cf/data/array/mixin/indexmixin.py index 5ff20b427f..c2cc5921b2 100644 --- a/cf/data/array/mixin/indexmixin.py +++ b/cf/data/array/mixin/indexmixin.py @@ -5,7 +5,7 @@ class IndexMixin: - """TODO xMixin class for an array stored in a file. + """Mixin class for lazy subspacing of a data array. .. versionadded:: NEXTVERSION @@ -14,8 +14,6 @@ class IndexMixin: def __array__(self, *dtype): """Convert the ``{{class}}` into a `numpy` array. - TODO stored indices - .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -26,7 +24,8 @@ def __array__(self, *dtype): :Returns: `numpy.ndarray` - An independent numpy array of the data. + An independent `numpy` array of the subspace of the + data defined by the `indices` attribute. """ array = self._get_array() @@ -44,17 +43,17 @@ def __getitem__(self, index): its `__array__` method. Consecutive subspaces are lazy, with only the final data - elements read from the dataset when `__array__` is called. + elements retrieved from the data when `__array__` is called. - For example, if a dataset variable has shape ``(12, 145, - 192)`` and consecutive subspaces of ``[8:9, 10:20:3, [15, 1, - 4, 12]`` and ``[[0], [True, False, True], ::-2]`` are applied - then only the elements defined by subspace ``[[8], [10, 16], - [12, 1]]`` will be retrieved from the dataset when `__array__` - is called. + For example, if the original data has shape ``(12, 145, 192)`` + and consecutive subspaces of ``[8:9, 10:20:3, [15, 1, 4, 12]`` + and ``[[0], [True, False, True], ::-2]`` are applied, then + only the elements defined by subspace ``[[8], [10, 16], [12, + 1]]`` will be retrieved from the data when `__array__` is + called. Indexing is similar to `numpy` indexing. The only difference - to numpy indexing (given the restrictions on the type of + to `numpy` indexing (given the restrictions on the type of indices allowed) is: * When two or more dimension's indices are sequences of @@ -82,80 +81,91 @@ def __getitem__(self, index): new = self.copy() new_indices = [] new_shape = [] + new_original_shape = [] for ind0, ind, size, original_size in zip( index0, index, shape, original_shape ): + keepdim = True if isinstance(ind, slice) and ind == slice(None): - new_indices.append(ind0) - new_shape.append(size) - continue - - if is_dask_collection(ind): - # Note: This will never occur when __getitem__ is - # being called from within a Dask graph, because - # any lazy indices will have already been - # computed as part of the whole graph execution; - # i.e. we don't have to worry about a - # compute-within-a-compute situation. 
(If this - # were not the case then we could get round it - # by wrapping the compute inside a `with - # dask.config.set({"scheduler": - # "synchronous"}):` clause.) - ind = ind.compute() - - if isinstance(ind0, slice): - if isinstance(ind, slice): - # 'ind0' is slice; 'ind' is slice - start, stop, step = ind0.indices(size) - size0 = indices_shape((ind0,), (original_size,))[0] - start1, stop1, step1 = ind.indices(size0 + 1) - size1, mod1 = divmod(stop1 - start1, step1) - - if mod1 != 0: - size1 += 1 - - start += start1 * step - step *= step1 - stop = start + (size1 - 1) * step - - if step > 0: - stop += 1 + new_index = ind0 + new_size = size + else: + if is_dask_collection(ind): + # Note: This will never occur when __getitem__ is + # being called from within a Dask graph, because + # any lazy indices will have already been + # computed as part of the whole graph execution; + # i.e. we don't have to worry about a + # compute-within-a-compute situation. (If this + # were not the case then we could get round it + # by wrapping the compute inside a `with + # dask.config.set({"scheduler": + # "synchronous"}):` clause.) + ind = ind.compute() + + if isinstance(ind0, slice): + if isinstance(ind, slice): + # 'ind0' is slice; 'ind' is slice + start, stop, step = ind0.indices(size) + size0 = indices_shape((ind0,), (original_size,))[0] + start1, stop1, step1 = ind.indices(size0 + 1) + size1, mod1 = divmod(stop1 - start1, step1) + + if mod1 != 0: + size1 += 1 + + start += start1 * step + step *= step1 + stop = start + (size1 - 1) * step + + if step > 0: + stop += 1 + else: + stop -= 1 + + if stop < 0: + stop = None + + new_index = slice(start, stop, step) + elif np.iterable(ind): + # 'ind0' is slice; 'ind' is array of int/bool + new_index = np.arange(*ind0.indices(original_size))[ind] else: - stop -= 1 - - if stop < 0: - stop = None - - new_index = slice(start, stop, step) - elif np.iterable(ind): - # 'ind0' is slice; 'ind' is array of int/bool - new_index = np.arange(*ind0.indices(original_size))[ind] + # 'ind' is Integral. Remove the dimension. + new_index = ind + keepdim = False else: - raise ValueError( - f"Can't subspace {self!r} with index {ind} that " - "removes a dimension" - ) - else: - # 'ind0' is array of int - new_index = np.asanyarray(ind0)[ind] + # 'ind0' is array of int + new_index = np.asanyarray(ind0)[ind] new_indices.append(new_index) + if keepdim: + new_original_shape.append(original_size) + + new_shape = indices_shape(new_indices, original_shape, keepdims=False) - new_shape = indices_shape(new_indices, original_shape, keepdims=True) new._set_component("shape", tuple(new_shape), copy=False) + new._custom["original_shape"] = tuple(new_original_shape) new._custom["index"] = tuple(new_indices) return new def __repr__(self): - """TODO""" + """Called by the `repr` built-in function. + + x.__repr__() <==> repr(x) + + """ + self.original_shape out = super().__repr__() - return f"{out[:-1]}{self.original_shape}>" + return f"{out[:-1]}{self._custom['original_shape0']}>" @property - def _dask_asanyarray(self): - """TODO + def __asanyarray__(self): + """Whether the array is accessed by conversion to a `numpy` array. + + Always returns `True`. .. versionadded:: NEXTVERSION @@ -163,10 +173,11 @@ def _dask_asanyarray(self): return True def _get_array(self): - """Returns a subspace of the dataset variable. + """Returns a subspace of the data. The subspace is defined by the indices stored in the `index` - attribute. + attribute, and may be the result of multiple `__getitem__` + calls. .. 
versionadded:: NEXTVERSION @@ -188,21 +199,24 @@ def index(self): .. versionadded:: NEXTVERSION - :Returns: - - `tuple` + .. seealso:: `original_shape`, `shape` **Examples** >>> x.index - (slice(None, None, None), slice(None, None, None), slice(None, None, None)) + (slice(None), slice(None), slice(None)) + >>> x.shape + (12, 145, 192) + >>> x = x[8:9, 10:20:3, [15, 1, 4, 12]] >>> x.index - (slice(None, None, None), slice(None, None, None), slice(None, None, None)) - >>> x = x[[0], 10:20:2, :] + (slice(8, 9), slice(10, 20, 3), [15, 1, 4, 12]) + >>> x.shape + (1, 3, 4) + >>> x = x[[0], [True, False, True], ::-2] >>> x.index - - TODO - + ([8], [10, 16], [12, 1]) + >>> x.shape + (1, 2, 2) """ ind = self._custom.get("index") @@ -214,14 +228,17 @@ def index(self): @property def original_shape(self): - """TODO + """The original shape of the data. .. versionadded:: NEXTVERSION + .. seealso:: `index`, `shape` + """ shape = self._custom.get("original_shape") if shape is None: shape = self.shape + self._custom["original_shape0"] = shape self._custom["original_shape"] = shape return shape diff --git a/cf/data/creation.py b/cf/data/creation.py index 980881db80..912878787c 100644 --- a/cf/data/creation.py +++ b/cf/data/creation.py @@ -67,7 +67,11 @@ def to_dask(array, chunks, **from_array_options): try: return array.to_dask_array(chunks=chunks) except TypeError: - return array.to_dask_array() + try: + return array.to_dask_array(asanyarray=False) + except TypeError: + return array.to_dask_array() + if type(array).__module__.split(".")[0] == "xarray": data = getattr(array, "data", None) diff --git a/cf/data/dask_utils.py b/cf/data/dask_utils.py index b69147db8b..93ab5791b9 100644 --- a/cf/data/dask_utils.py +++ b/cf/data/dask_utils.py @@ -666,7 +666,7 @@ def cf_units(a, from_units, to_units): def cf_filled(a, fill_value=None): - """TODOConvert array values to have different equivalent units. + """Replace masked elements with a fill value. .. versionadded:: NEXTVERSION @@ -675,26 +675,22 @@ def cf_filled(a, fill_value=None): a: array_like The array. - fill_value: - TODO + fill_value: scalar + The fill value. :Returns: `numpy.ndarray` - TODO An array containing values in the new units. In order to - represent the new units, the returned data type may be - different from that of the input array. For instance, if - *a* has an integer data type, *from_units* are kilometres, - and *to_units* are ``'miles'`` then the returned array - will have a float data type. + The filled array. **Examples** - TODO - >>> import numpy as np - >>> a = np.array([1, 2]) - >>> print(cf.data.dask_utils.cf_units(a, cf.Units('km'), cf.Units('m'))) - [1000. 2000.] + >>> a = np.array([[1, 2, 3]]) + >>> print(cf.data.dask_utils.cf_filled(a, -999)) + [[1 2 3]] + >>> a = np.ma.array([[1, 2, 3]], mask=[[True, False, False]]) + >>> print(cf.data.dask_utils.cf_filled(a, -999)) + [[-999 2 3]] """ a = cf_asanyarray(a) @@ -702,7 +698,10 @@ def cf_filled(a, fill_value=None): def cf_asanyarray(a): - """TODO + """Convert to a `numpy` array. + + Only do this is the input *a* has an `__asanyarray__` attribute + with value True. .. versionadded:: NEXTVERSION @@ -713,14 +712,10 @@ def cf_asanyarray(a): :Returns: - TODO - - **Examples** - - TODO + The converted array, or the input array unchanged. 
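The `__asanyarray__` protocol that `cf_asanyarray` documents above works as follows: an object opts in to a final `numpy` conversion simply by exposing a truthy `__asanyarray__` attribute. A hedged sketch with made-up names, not part of the patch:

import numpy as np

def asanyarray_if_requested(a):
    # Same test as cf_asanyarray
    if getattr(a, "__asanyarray__", False):
        return np.asanyarray(a)
    return a

class LazySubspace:
    __asanyarray__ = True

    def __array__(self, dtype=None, copy=None):
        # Stand-in for the deferred read performed by _get_array
        return np.arange(3)

print(asanyarray_if_requested(np.arange(3)))    # ndarray: returned unchanged
print(asanyarray_if_requested(LazySubspace()))  # converted via __array__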
""" - if getattr(a, "_dask_asanyarray", False): + if getattr(a, "__asanyarray__", False): return np.asanyarray(a) return a diff --git a/cf/data/data.py b/cf/data/data.py index c5dd037479..9225e670f8 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -478,7 +478,7 @@ def __init__( # Set whether or not to call np.asanyarray on chunks to # convert them to numpy arrays. - self._custom["asanyarray"] = getattr(array, "_dask_asanyarray", False) + self._custom["asanyarray"] = getattr(array, "__asanyarray__", False) dx = to_dask(array, chunks, **kwargs) @@ -502,7 +502,7 @@ def __init__( self._Units = units # Store the dask array - self._set_dask(dx, clear=_NONE) + self._set_dask(dx, clear=_NONE, asanyarray=None) # Override the data type if dtype is not None: @@ -1400,8 +1400,7 @@ def _set_dask(self, dx, copy=False, clear=_ALL, asanyarray=False): If True then at compute time add a final operation to the Dask graph that converts chunks to `numpy` arrays. If False, the default, then do not do this. If - `None` then do not change the current behaviour, which - is defined by the `_asanyarray` attribute. + `None` then do not change the current behaviour. .. versionadded:: NEXTRELEASE From 1c73b890197b5c9caf526285c1563dd2cb30b878 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 18 Mar 2024 21:31:03 +0000 Subject: [PATCH 063/134] dev --- cf/data/array/mixin/indexmixin.py | 167 +++++++++-------- cf/data/array/netcdfarray.py | 24 ++- cf/data/array/umarray.py | 15 +- cf/data/creation.py | 8 +- cf/data/dask_utils.py | 25 ++- cf/data/data.py | 157 +++++++++++----- cf/data/fragment/mixin/fragmentarraymixin.py | 177 +++++-------------- cf/data/utils.py | 4 +- cf/functions.py | 9 +- cf/mixin/propertiesdata.py | 42 ++++- cf/read_write/netcdf/netcdfwrite.py | 13 +- cf/test/test_Data.py | 31 +++- 12 files changed, 386 insertions(+), 286 deletions(-) diff --git a/cf/data/array/mixin/indexmixin.py b/cf/data/array/mixin/indexmixin.py index c2cc5921b2..ebd7e5a41e 100644 --- a/cf/data/array/mixin/indexmixin.py +++ b/cf/data/array/mixin/indexmixin.py @@ -1,3 +1,5 @@ +from numbers import Integral + import numpy as np from dask.base import is_dask_collection @@ -72,83 +74,88 @@ def __getitem__(self, index): The subspaced array. """ - shape = self.shape + shape0 = self.shape index0 = self.index original_shape = self.original_shape - index = parse_indices(shape, index, keepdims=False) + index = parse_indices(shape0, index, keepdims=False) new = self.copy() new_indices = [] new_shape = [] - new_original_shape = [] - - for ind0, ind, size, original_size in zip( - index0, index, shape, original_shape - ): - keepdim = True - if isinstance(ind, slice) and ind == slice(None): - new_index = ind0 - new_size = size - else: - if is_dask_collection(ind): - # Note: This will never occur when __getitem__ is - # being called from within a Dask graph, because - # any lazy indices will have already been - # computed as part of the whole graph execution; - # i.e. we don't have to worry about a - # compute-within-a-compute situation. (If this - # were not the case then we could get round it - # by wrapping the compute inside a `with - # dask.config.set({"scheduler": - # "synchronous"}):` clause.) 
- ind = ind.compute() - - if isinstance(ind0, slice): - if isinstance(ind, slice): - # 'ind0' is slice; 'ind' is slice - start, stop, step = ind0.indices(size) - size0 = indices_shape((ind0,), (original_size,))[0] - start1, stop1, step1 = ind.indices(size0 + 1) - size1, mod1 = divmod(stop1 - start1, step1) - - if mod1 != 0: - size1 += 1 - - start += start1 * step - step *= step1 - stop = start + (size1 - 1) * step - - if step > 0: - stop += 1 - else: - stop -= 1 - - if stop < 0: - stop = None - - new_index = slice(start, stop, step) - elif np.iterable(ind): - # 'ind0' is slice; 'ind' is array of int/bool - new_index = np.arange(*ind0.indices(original_size))[ind] + + i = 0 + for ind0, original_size in zip(index0, original_shape): + if isinstance(ind0, Integral): + # This dimension has been previously removed by the + # integer index 'ind0' + new_indices.append(ind0) + continue + + # 'index' might have fewer elements than 'index0' + ind1 = index[i] + size0 = shape0[i] + i += 1 + + if isinstance(ind1, slice) and ind1 == slice(None): + # This dimension is not subspaced + new_indices.append(ind0) + continue + + # Still here? Then we have to work out the the subspace of + # the full array implied by applying both 'ind0' and + # 'ind1'. + if is_dask_collection(ind1): + # Note: This will never occur when __getitem__ is + # being called from within a Dask graph, because + # any lazy indices will have already been + # computed as part of the whole graph execution; + # i.e. we don't have to worry about a + # compute-within-a-compute situation. (If this + # were not the case then we could get round it + # by wrapping the compute inside a `with + # dask.config.set({"scheduler": + # "synchronous"}):` clause.) + ind1 = ind1.compute() + + if isinstance(ind0, slice): + if isinstance(ind1, slice): + # ind0: slice + # ind1: slice + start, stop, step = ind0.indices(original_size) + start1, stop1, step1 = ind1.indices(size0) + size1, mod1 = divmod(stop1 - start1, step1) + + if mod1 != 0: + size1 += 1 + + start += start1 * step + step *= step1 + stop = start + (size1 - 1) * step + + if step > 0: + stop += 1 else: - # 'ind' is Integral. Remove the dimension. - new_index = ind - keepdim = False + stop -= 1 + + if stop < 0: + stop = None + + new_index = slice(start, stop, step) else: - # 'ind0' is array of int - new_index = np.asanyarray(ind0)[ind] + # ind0: slice + # ind1: int, or array of int/bool + new_index = np.arange(*ind0.indices(original_size))[ind1] + else: + # ind0: array of int + new_index = np.asanyarray(ind0)[ind1] new_indices.append(new_index) - if keepdim: - new_original_shape.append(original_size) - - new_shape = indices_shape(new_indices, original_shape, keepdims=False) + new_shape = indices_shape(new_indices, original_shape, keepdims=False) new._set_component("shape", tuple(new_shape), copy=False) - - new._custom["original_shape"] = tuple(new_original_shape) new._custom["index"] = tuple(new_indices) + return new def __repr__(self): @@ -157,32 +164,35 @@ def __repr__(self): x.__repr__() <==> repr(x) """ - self.original_shape out = super().__repr__() - return f"{out[:-1]}{self._custom['original_shape0']}>" + return f"{out[:-1]}{self.original_shape}>" @property def __asanyarray__(self): """Whether the array is accessed by conversion to a `numpy` array. - Always returns `True`. - .. versionadded:: NEXTVERSION + :Returns: + + `True` + """ return True - def _get_array(self): + def _get_array(self, index=None): """Returns a subspace of the data. 
- The subspace is defined by the indices stored in the `index` - attribute, and may be the result of multiple `__getitem__` - calls. - .. versionadded:: NEXTVERSION .. seealso:: `__array__`, `index` + :Parameters: + + index: `tuple` or `None`, optional + Provide the indices that define the subspace. If `None` + then the `index` attribute is used. + :Returns: `numpy.ndarray` @@ -197,9 +207,12 @@ def _get_array(self): def index(self): """The index to be applied when converting to a `numpy` array. + The `shape` is defined by the `index` applied to the + `original_shape`. + .. versionadded:: NEXTVERSION - .. seealso:: `original_shape`, `shape` + .. seealso:: `shape`, `original_shape` **Examples** @@ -223,6 +236,7 @@ def index(self): if ind is None: ind = (slice(None),) * self.ndim self._custom["index"] = ind + self._custom["original_shape"] = self.shape return ind @@ -230,6 +244,9 @@ def index(self): def original_shape(self): """The original shape of the data. + The `shape` is defined by the `index` applied to the + `original_shape`. + .. versionadded:: NEXTVERSION .. seealso:: `index`, `shape` @@ -237,8 +254,6 @@ def original_shape(self): """ shape = self._custom.get("original_shape") if shape is None: - shape = self.shape - self._custom["original_shape0"] = shape - self._custom["original_shape"] = shape + self._custom["original_shape"] = self.shape return shape diff --git a/cf/data/array/netcdfarray.py b/cf/data/array/netcdfarray.py index 113fceb46f..597a61145f 100644 --- a/cf/data/array/netcdfarray.py +++ b/cf/data/array/netcdfarray.py @@ -36,7 +36,7 @@ def _lock(self): """ return _lock - def _get_array(self): + def _get_array(self, index=None): """Returns a subspace of the dataset variable. The subspace is defined by the indices stored in the `index` @@ -46,14 +46,30 @@ def _get_array(self): .. seealso:: `__array__`, `index` + :Parameters: + + index: `tuple` or `None`, optional + Provide the indices that define the subspace. If `None` + then the `index` attribute is used. + :Returns: `numpy.ndarray` The subspace. """ - # Note: It's cfdm.NetCDFArray.__getitem__ that we want to - # call, but we use 'Container' in super because that + if index is None: + index = self.index + + # Note: We need to use the lock because the netCDF file is + # going to be read. + self._lock.acquire() + + # Note: It's cfdm.NetCDFArray.__getitem__ that we want to call + # here, but we use 'Container' in super because that # comes immediately before cfdm.NetCDFArray in the # method resolution order. - return super(Container, self).__getitem__(self.index) + array = super(Container, self).__getitem__(index) + + self._lock.release() + return array diff --git a/cf/data/array/umarray.py b/cf/data/array/umarray.py index 128af5e11e..3764d6679d 100644 --- a/cf/data/array/umarray.py +++ b/cf/data/array/umarray.py @@ -175,7 +175,7 @@ def __init__( # By default, close the UM file after data array access self._set_component("close", True, copy=False) - def _get_array(self): + def _get_array(self, index=None): """Returns a subspace of the dataset variable. The subspace is defined by the indices stored in the `index` @@ -185,12 +185,23 @@ def _get_array(self): .. seealso:: `__array__`, `index` + :Parameters: + + index: `tuple` or `None`, optional + Provide the indices that define the subspace. If `None` + then the `index` attribute is used. + :Returns: `numpy.ndarray` The subspace. """ + # Note: No need to lock the UM file - concurrent reads are OK. 
+ + if index is None: + index = self.index + f, header_offset = self.open() rec = self._get_rec(f, header_offset) @@ -201,7 +212,7 @@ def _get_array(self): self.close(f) del f, rec - array = get_subspace(array, self.index) + array = get_subspace(array, index) # Set the units, if they haven't been set already. self._set_units(int_hdr) diff --git a/cf/data/creation.py b/cf/data/creation.py index 912878787c..b861281e22 100644 --- a/cf/data/creation.py +++ b/cf/data/creation.py @@ -24,7 +24,7 @@ def to_dask(array, chunks, **from_array_options): value accepted by the *chunks* parameter of the `dask.array.from_array` function is allowed. - Ignored if *array* is a `dask` array, which already + Might be ignored if *array* is a `dask` array that already defines its own chunks. Might get automatically modified if *array* is a @@ -33,10 +33,6 @@ def to_dask(array, chunks, **from_array_options): from_array_options: `dict`, optional Keyword arguments to be passed to `dask.array.from_array`. - If *from_array_options* has no ``'lock'`` key then the - `lock` keyword is set to the `_lock` attribute of *array* - or, if there is no such attribute, `False`. - If *from_array_options* has no ``'meta'`` key then the `meta` keyword is set to the `_dask_meta` attribute of *array* or, if there is no such attribute, `None`. @@ -71,7 +67,6 @@ def to_dask(array, chunks, **from_array_options): return array.to_dask_array(asanyarray=False) except TypeError: return array.to_dask_array() - if type(array).__module__.split(".")[0] == "xarray": data = getattr(array, "data", None) @@ -86,7 +81,6 @@ def to_dask(array, chunks, **from_array_options): array = np.asanyarray(array) kwargs = from_array_options - kwargs.setdefault("lock", getattr(array, "_lock", False)) kwargs.setdefault("meta", getattr(array, "_dask_meta", None)) try: diff --git a/cf/data/dask_utils.py b/cf/data/dask_utils.py index 93ab5791b9..201ef1274b 100644 --- a/cf/data/dask_utils.py +++ b/cf/data/dask_utils.py @@ -121,7 +121,7 @@ def cf_contains(a, value): :Returns: `numpy.ndarray` - A size 1 Boolean array, with the same number of dimensions + A size 1 Boolean array with the same number of dimensions as *a*, that indicates whether or not *a* contains the value. @@ -665,6 +665,29 @@ def cf_units(a, from_units, to_units): ) +def cf_is_masked(a): + """Determine whether an array has masked values. + + .. versionadded:: NEXTVERSION + + :Parameters: + + a: array_like + The array. + + :Returns: + + `numpy.ndarray` + A size 1 Boolean array with the same number of dimensions + as *a*, for which `True` indicates that there are masked + values. + + """ + a = cf_asanyarray(a) + out = np.ma.is_masked(a) + return np.array(out).reshape((1,) * a.ndim) + + def cf_filled(a, fill_value=None): """Replace masked elements with a fill value. 
diff --git a/cf/data/data.py b/cf/data/data.py index 9225e670f8..1bb1f5437e 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -13,7 +13,6 @@ import numpy as np from cfdm import is_log_level_info from dask import compute, delayed # noqa: F401 -from dask.array import Array from dask.array.core import normalize_chunks from dask.base import collections_to_dsk, is_dask_collection, tokenize from dask.highlevelgraph import HighLevelGraph @@ -51,6 +50,7 @@ cf_dt2rt, cf_filled, cf_harden_mask, + cf_is_masked, cf_percentile, cf_rt2dt, cf_soften_mask, @@ -373,7 +373,7 @@ def __init__( source=source, _use_array=_use_array and array is not None ) - self._custom.setdefault("asanyarray", True) + # self._custom.setdefault("__asanyarray__", True) if _use_array: try: @@ -438,6 +438,7 @@ def __init__( return # Still here? Then create a dask array and store it. + custom = self._custom # Find out if the input data is compressed by convention try: @@ -474,11 +475,19 @@ def __init__( # Set whether or not we're sure that the Data instance has a # deterministic name - self._custom["deterministic"] = not is_dask_collection(array) + is_dask = is_dask_collection(array) + custom["deterministic"] = not is_dask # Set whether or not to call np.asanyarray on chunks to # convert them to numpy arrays. - self._custom["asanyarray"] = getattr(array, "__asanyarray__", False) + if is_dask: + # We don't know what's in the dask array, so we should + # assume that it might need converting to a numpy array.x + custom["__asanyarray__"] = True + else: + custom["__asanyarray__"] = bool( + getattr(array, "__asanyarray__", False) + ) dx = to_dask(array, chunks, **kwargs) @@ -636,9 +645,13 @@ def __contains__(self, value): # are incompatible return False - value = value.to_dask_array() + # 'cf_contains' has its own calls to 'cf_asanyarray', so + # we can set 'asanyarray=False'. + value = value.to_dask_array(asanyarray=False) - dx = self.to_dask_array() + # 'cf_contains' has its own calls to 'cf_asanyarray', so we + # can set 'asanyarray=False'. + dx = self.to_dask_array(asanyarray=False) out_ind = tuple(range(dx.ndim)) dx_ind = out_ind @@ -771,7 +784,7 @@ def __len__(self): TypeError: len() of unsized object """ - dx = self.to_dask_array(asanyarray=False) # TODO check + dx = self.to_dask_array(asanyarray=False) if math.isnan(dx.size): logger.debug("Computing data len: Performance may be degraded") dx.compute_chunk_sizes() @@ -937,7 +950,8 @@ def __getitem__(self, indices): ) # ------------------------------------------------------------ - # Set the subspaced dask array + # Set the subspaced dask array. Set 'asanyarray=True' to + # honour truely lazy subspacing. # ------------------------------------------------------------ new._set_dask(dx, asanyarray=True) @@ -1154,9 +1168,24 @@ def __setitem__(self, indices, value): return - # ---------------------------------------------------------------- - # Indexing behaviour attributes - # ---------------------------------------------------------------- + @property + def __asanyarray__(self): + """Whether the chunks need conversion to a `numpy` array. + + .. versionadded:: NEXTVERSION + + :Returns: + + `bool` + If True then at compute time add a final operation to + the Dask graph that converts chunks to `numpy` arrays, + but only if a chunk's data object has an + `__asanyarray__` attribute that is also `True`. If + `False` then do not do this. 
+ + """ + return self._custom.get("__asanyarray__", True) + @property def __orthogonal_indexing__(self): """Flag to indicate that orthogonal indexing is supported. @@ -1398,9 +1427,11 @@ def _set_dask(self, dx, copy=False, clear=_ALL, asanyarray=False): asanyarray: `bool` or `None`, optional If True then at compute time add a final operation to - the Dask graph that converts chunks to `numpy` - arrays. If False, the default, then do not do this. If - `None` then do not change the current behaviour. + the Dask graph that converts chunks to `numpy` arrays, + but only if a chunk's data object has an + `__asanyarray__` attribute that is also `True`. If + False, the default, then do not do this. If `None` + then do not change the current behaviour. .. versionadded:: NEXTRELEASE @@ -1434,7 +1465,7 @@ def _set_dask(self, dx, copy=False, clear=_ALL, asanyarray=False): custom = self._custom custom["dask"] = dx if asanyarray is not None: - custom["asanyarray"] = bool(asanyarray) + custom["__asanyarray__"] = bool(asanyarray) self._clear_after_dask_update(clear) @@ -2494,7 +2525,9 @@ def percentile( else: axes = tuple(sorted(d._parse_axes(axes))) - dx = d.to_dask_array() + # 'cf_percentile' has its own call to 'cf_asanyarray', so we + # can set 'asanyarray=False'. + dx = d.to_dask_array(asanyarray=False) dtype = dx.dtype shape = dx.shape @@ -2559,7 +2592,7 @@ def percentile( name = name[0] graph = HighLevelGraph.from_collections(name, dsk, dependencies=[dx]) - dx = Array(graph, name, chunks=out_chunks, dtype=float) + dx = da.Array(graph, name, chunks=out_chunks, dtype=float) d._set_dask(dx) @@ -2989,7 +3022,14 @@ def convolution_filter( depth += abs(origin) - dx = d.to_dask_array() + # 'cf_convolve1d' has its own call to 'cf_asanyarray', but we + # need to pre-empt that so that the halos can be created. + dx = d.to_dask_array(asanyarray=None) + + # Cast to float to ensure that NaNs can be stored (so + # map_overlap can correctly assign the halos) + if dx.dtype != float: + dx = dx.astype(float, copy=False) # Convolve each chunk convolve1d = partial( @@ -3173,9 +3213,13 @@ def rechunk( """ d = _inplace_enabled_define_and_cleanup(self) - dx = d.to_dask_array(asanyarray=False) # TODO: check that this is OK! + # Dask rechunking is essentially a wrapper for __getitem__ + # calls on the chunks, which allows us to use the same + # 'asanyarray' settings as used in `__gettem__`. + + dx = d.to_dask_array(asanyarray=False) dx = dx.rechunk(chunks, threshold, block_size_limit, balance) - d._set_dask(dx, clear=_ALL ^ _ARRAY ^ _CACHE) + d._set_dask(dx, clear=_ALL ^ _ARRAY ^ _CACHE, asanyarray=True) return d @@ -3226,6 +3270,8 @@ def _asdatetime(self, inplace=False): ) if not d._isdatetime(): + # 'cf_rt2dt' has its own call to 'cf_asanyarray', so we + # can set 'asanyarray=False'. dx = d.to_dask_array(asanyarray=False) dx = dx.map_blocks(cf_rt2dt, units=units, dtype=object) d._set_dask(dx) @@ -3281,6 +3327,8 @@ def _asreftime(self, inplace=False): ) if d._isdatetime(): + # 'cf_dt2rt' has its own call to 'cf_asanyarray', so we + # can set 'asanyarray=False'. dx = d.to_dask_array(asanyarray=False) dx = dx.map_blocks(cf_dt2rt, units=units, dtype=float) d._set_dask(dx) @@ -3891,6 +3939,8 @@ def _regrid( f"the shape of the regrid operator: {operator.src_shape}" ) + # 'regrid' has its own calls to 'cf_asanyarray', so we can set + # 'asanyarray=False'. 
dx = self.to_dask_array(asanyarray=False) # Rechunk so that each chunk contains data in the form @@ -4862,6 +4912,8 @@ def Units(self, value): cf_func = partial(cf_units, from_units=old_units, to_units=value) + # 'cf_units' has its own call to 'cf_asanyarray', so we can + # set 'asanyarray=False'. dx = self.to_dask_array(asanyarray=False) dx = dx.map_blocks(cf_func, dtype=dtype) @@ -5042,18 +5094,15 @@ def is_masked(self): True """ - - def is_masked(a): - out = np.ma.is_masked(a) - return np.array(out).reshape((1,) * a.ndim) - - dx = self.to_dask_array() + # 'cf_is_masked' has its own call to 'cf_asanyarray', so we + # can set 'asanyarray=False'. + dx = self.to_dask_array(asanyarray=False) out_ind = tuple(range(dx.ndim)) dx_ind = out_ind dx = da.blockwise( - is_masked, + cf_is_masked, out_ind, dx, dx_ind, @@ -5090,7 +5139,7 @@ def nbytes(self): 24 """ - dx = self.to_dask_array(asanyarray=False) # TODO: Check + dx = self.to_dask_array(asanyarray=False) if math.isnan(dx.size): logger.debug("Computing data nbytes: Performance may be degraded") dx.compute_chunk_sizes() @@ -5197,7 +5246,7 @@ def shape(self): () """ - dx = self.to_dask_array(asanyarray=False) # TODO: Check + dx = self.to_dask_array(asanyarray=False) if math.isnan(dx.size): logger.debug("Computing data shape: Performance may be degraded") dx.compute_chunk_sizes() @@ -5236,7 +5285,7 @@ def size(self): 1 """ - dx = self.to_dask_array(asanyarray=False) # TODO: Check + dx = self.to_dask_array(asanyarray=False) size = dx.size if math.isnan(size): logger.debug("Computing data size: Performance may be degraded") @@ -5298,11 +5347,10 @@ def array(self): # Set cached elements items = [0, -1] - if a.size >= 3: - items.append(1) - if a.ndim == 2 and a.shape[-1] == 2: - items.append(-2) + items.extend((1, -2)) + elif a.size == 3: + items.append(1) self._set_cached_elements({i: a.item(i) for i in items}) @@ -6433,6 +6481,8 @@ def convert_reference_time( ) d.Units = units0 + # 'cf_rt2dt' its own call to 'cf_asanyarray', so we can set + # 'asanyarray=False'. dx = d.to_dask_array(asanyarray=False) # Convert to the correct date-time objects @@ -6511,7 +6561,7 @@ def get_deterministic_name(self): units = self._Units return tokenize( - self.to_dask_array().name, + self.to_dask_array(asanyarray=None).name, units.formatted(definition=True, names=True), units._canonical_calendar, ) @@ -7706,7 +7756,8 @@ def cos(self, inplace=False, i=False): d.Units = _units_radians dx = d.to_dask_array() - d._set_dask(da.cos(dx)) + dx = da.cos(dx) + d._set_dask(dx) d.override_units(_units_1, inplace=True) @@ -8149,7 +8200,9 @@ def unique(self, split_every=None): # in the result. d.soften_mask() - dx = d.to_dask_array() + # The applicable chunk function will have its own call to + # 'cf_asanyarray', so we can set 'asanyarray=False'. + dx = d.to_dask_array(asanyarray=False) dx = Collapse().unique(dx, split_every=split_every) d._set_dask(dx) @@ -8410,7 +8463,6 @@ def equals( # Apply a (dask) logical 'and' to confirm if both the mask and the # data are equal for the pair of masked arrays: result = da.logical_and(data_comparison, mask_comparison) - if not result.compute(): if is_log_level_info(logger): logger.info( @@ -8889,6 +8941,8 @@ def harden_mask(self): [1 -- 3] """ + # 'cf_harden_mask' has its own call to 'cf_asanyarray', so we + # can set 'asanyarray=False'. 
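The `is_masked` rewrite above reduces each chunk to a single boolean flag and then applies `.any()`; a rough sketch of that pattern with plain dask (illustrative only, with made-up helper names):

import dask.array as da
import numpy as np

def chunk_is_masked(a):
    # One boolean per chunk, kept at the same rank as the chunk
    return np.array(np.ma.is_masked(a)).reshape((1,) * a.ndim)

a = np.ma.masked_array([[1, 2], [3, 4]], mask=[[0, 1], [0, 0]])
dx = da.from_array(a, chunks=1)

flags = da.blockwise(
    chunk_is_masked,
    "ij",
    dx,
    "ij",
    adjust_chunks={"i": 1, "j": 1},
    dtype=bool,
)
print(bool(flags.any().compute()))  # True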
dx = self.to_dask_array(asanyarray=False) dx = dx.map_blocks(cf_harden_mask, dtype=self.dtype) self._set_dask(dx, clear=_NONE) @@ -9009,6 +9063,8 @@ def soften_mask(self): [ 1 999 3] """ + # 'cf_soften_mask' has its own call to 'cf_asanyarray', so we + # can set 'asanyarray=False'. dx = self.to_dask_array(asanyarray=False) dx = dx.map_blocks(cf_soften_mask, dtype=self.dtype) self._set_dask(dx, clear=_NONE) @@ -9096,6 +9152,8 @@ def filled(self, fill_value=None, inplace=False): f"data type {d.dtype.str!r}" ) + # 'cf_filled' has its own call to 'cf_asanyarray', so we can + # set 'asanyarray=False'. dx = d.to_dask_array(asanyarray=False) dx = dx.map_blocks(cf_filled, fill_value=fill_value, dtype=d.dtype) d._set_dask(dx) @@ -9751,9 +9809,11 @@ def to_dask_array(self, apply_mask_hardness=False, asanyarray=None): asanyarray: `bool` or `None`, optional If True then add a final operation to the Dask graph - that converts chunks to `numpy` arrays. If False then - do not do this. If `None`, the default, then add the - final operation only if the `_asanyarray` attribute is + that converts chunks to `numpy` arrays, but only if a + chunk's data object has an `__asanyarray__` attribute + that is also `True`. If False then do not do this. If + `None`, the default, then the final operation is added + if the `Data` object's `__asanyarray__` attribute is `True`. .. note:: Such a final operation is included in the @@ -9797,7 +9857,7 @@ def to_dask_array(self, apply_mask_hardness=False, asanyarray=None): dx = self._custom["dask"] else: if asanyarray is None: - asanyarray = self._custom.get("asanyarray") + asanyarray = self.__asanyarray__ if asanyarray: dx = dx.map_blocks(cf_asanyarray, dtype=dx.dtype) @@ -11241,6 +11301,9 @@ def where( # Missing values could be affected, so make sure that the mask # hardness has been applied. + # + # 'cf_where' has its own calls to 'cf_asanyarray', so we can + # set 'asanyarray=False'. dx = d.to_dask_array(apply_mask_hardness=True, asanyarray=False) units = d.Units @@ -11256,6 +11319,8 @@ def where( condition = type(self).asdata(condition) condition = where_broadcastable(d, condition, "condition") + # 'cf_where' has its own calls to 'cf_asanyarray', so we can + # set 'asanyarray=False'. condition = condition.to_dask_array(asanyarray=False) # If x or y is self then change it to None. This prevents an @@ -11821,9 +11886,11 @@ def todict( asanyarray: `bool` or `None`, optional If True then add a final operation to the Dask graph - that converts chunks to `numpy` arrays. If False then - do not do this. If `None`, the default, then add the - final operation only if the `_asanyarray` attribute is + that converts chunks to `numpy` arrays, but only if + chunk's data object has an `__asanyarray__` attribute + that is also `True`. If False then do not do this. If + `None`, the default, then the final operation is added + if the `Data` object's `__asanyarray__` attribute is `True`. .. versionadded:: NEXTVERSION diff --git a/cf/data/fragment/mixin/fragmentarraymixin.py b/cf/data/fragment/mixin/fragmentarraymixin.py index 544befd337..4b3524e622 100644 --- a/cf/data/fragment/mixin/fragmentarraymixin.py +++ b/cf/data/fragment/mixin/fragmentarraymixin.py @@ -1,4 +1,4 @@ -from numbers import Integral +from math import prod import numpy as np @@ -12,48 +12,53 @@ class FragmentArrayMixin: """ - def __getitem__(self, indices): - """Returns a subspace of the fragment as a numpy array. + def _get_array(self, index=None): + """Returns a subspace of the dataset variable. 
- x.__getitem__(indices) <==> x[indices] + The subspace is defined by the indices stored in the `index` + attribute. - Indexing is similar to numpy indexing, with the following - differences: + .. versionadded:: NEXTVERSION - * A dimension's index can't be rank-reducing, i.e. it can't - be an integer, a scalar `numpy` array, nor a scalar `dask` - array. + .. seealso:: `__array__`, `index` - * When two or more dimension's indices are sequences of - integers then these indices work independently along each - dimension (similar to the way vector subscripts work in - Fortran). + :Parameters: - .. versionadded:: 3.15.0 + index: `tuple` or `None`, optional + Provide the indices that define the subspace. It is + assumed that there is a distinct index for each + fragment dimension. If `None` then the `index` + attribute is used. + + :Returns: + + `numpy.ndarray` + The subspace. """ - # TODOACTIVE: modify this for the case when - # super().__getitem__(tuple(indices)) returns a - # dictionary + # TODOACTIVE: modify this for the case when super()._get_array + # returns a dictionary - indices = self._parse_indices(indices) + if index is None: + index = self.index try: - array = super().__getitem__(tuple(indices)) + array = super()._get_array(index) except ValueError: # A ValueError is expected to be raised when the fragment # variable has fewer than 'self.ndim' dimensions (we know - # this because because 'indices' has 'self.ndim' + # that this is the case because 'index' has 'self.ndim' # elements). - axis = self._size_1_axis(indices) + axis = self._size_1_axis(index) if axis is not None: # There is a unique size 1 index that must correspond # to the missing dimension => Remove it from the # indices, get the fragment array with the new # indices; and then insert the missing size one # dimension. - indices.pop(axis) - array = super().__getitem__(tuple(indices)) + index = list(index) + index.pop(axis) + array = super()._get_array(tuple(index)) array = np.expand_dims(array, axis) else: # There are multiple size 1 indices so we don't know @@ -61,20 +66,21 @@ def __getitem__(self, indices): # their positions => Get the full fragment array and # then reshape it to the shape of the dask compute # chunk. - array = super().__getitem__(Ellipsis) - if array.size != self.size: + array = super()._get_array(Ellipsis) + if array.size > prod(self.original_shape): raise ValueError( f"Can't get CFA fragment data from ({self}) when " "the fragment has two or more missing size 1 " "dimensions, whilst also spanning two or more " - "dask compute chunks." + "Dask compute chunks." "\n\n" "Consider re-creating the data with exactly one " - "dask compute chunk per fragment (e.g. by setting " + "Dask compute chunk per fragment (e.g. by setting " "'chunks=None' as a keyword to cf.read)." ) - array = array.reshape(self.shape) + array = array.reshape(self.original_shape) + array = array[index] array = self._conform_to_aggregated_units(array) return array @@ -128,87 +134,6 @@ def _conform_to_aggregated_units(self, array): return array - def _parse_indices(self, indices): - """Parse the indices that retrieve the fragment data. - - Ellipses are replaced with the approriate number of `slice` - instances, and rank-reducing indices (such as an integer or - scalar array) are disallowed. - - .. versionadded:: 3.15.0 - - :Parameters: - - indices: `tuple` or `Ellipsis` - The array indices to be parsed. - - :Returns: - - `list` - The parsed indices. 
- - **Examples** - - >>> a.shape - (12, 1, 73, 144) - >>> a._parse_indices([2, 4, 5], Ellipsis, slice(45, 67)) - [[2, 4, 5], slice(0, 1), slice(0, 73), slice(45, 67)] - >>> a._parse_indices([2, 4, 5], [0], slice(None), slice(45, 67)) - [[2, 4, 5], [0], slice(0, 73), slice(45, 67)] - - """ - shape = self.shape - if indices is Ellipsis: - return [slice(0, n) for n in shape] - - indices = list(indices) - - # Check indices - has_ellipsis = False - for i, (index, n) in enumerate(zip(indices, shape)): - if isinstance(index, slice): - if index == slice(None): - indices[i] = slice(0, n) - - continue - - if index is Ellipsis: - has_ellipsis = True - continue - - if isinstance(index, Integral) or not getattr(index, "ndim", True): - # TODOCFA: what about [] or np.array([])? - - # 'index' is an integer or a scalar numpy/dask array - raise ValueError( - f"Can't subspace {self.__class__.__name__} with a " - f"rank-reducing index: {index!r}" - ) - - if has_ellipsis: - # Replace Ellipsis with one or more slices - indices2 = [] - length = len(indices) - n = self.ndim - for index in indices: - if index is Ellipsis: - m = n - length + 1 - indices2.extend([slice(None)] * m) - n -= m - else: - indices2.append(index) - n -= 1 - - length -= 1 - - indices = indices2 - - for i, (index, n) in enumerate(zip(indices, shape)): - if index == slice(None): - indices[i] = slice(0, n) - - return indices - def _size_1_axis(self, indices): """Find the position of a unique size 1 index. @@ -216,6 +141,8 @@ def _size_1_axis(self, indices): .. seealso:: `_parse_indices`, `__getitem__` + :Paramealso:: `_parse_indices`, `__getitem__` + :Parameters: indices: sequence of index @@ -244,33 +171,11 @@ def _size_1_axis(self, indices): None """ - axis = None - - n_size_1 = 0 # Number of size 1 indices - for i, (index, n) in enumerate(zip(indices, self.shape)): - try: - x = index.indices(n) - if abs(x[1] - x[0]) == 1: - # Index is a size 1 slice - n_size_1 += 1 - axis = i - except AttributeError: - try: - if index.size == 1: - # Index is a size 1 numpy or dask array - n_size_1 += 1 - axis = i - except AttributeError: - if len(index) == 1: - # Index is a size 1 list - n_size_1 += 1 - axis = i - - if n_size_1 > 1: - # There are two or more size 1 indices - axis = None - - return axis + original_shape = self.original_shape + if original_shape.count(1): + return original_shape.index(1) + + return @property def aggregated_Units(self): diff --git a/cf/data/utils.py b/cf/data/utils.py index 747e703cfe..b08f8a55e3 100644 --- a/cf/data/utils.py +++ b/cf/data/utils.py @@ -871,7 +871,9 @@ def collapse( if ddof is not None: kwargs["ddof"] = ddof - dx = d.to_dask_array() + # The applicable chunk function will have its own call to + # 'cf_asanyarray', so we can set 'asanyarray=False'. + dx = d.to_dask_array(asanyarray=False) dx = func(dx, **kwargs) d._set_dask(dx) diff --git a/cf/functions.py b/cf/functions.py index 30109091c5..70eb6a81df 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -1885,9 +1885,7 @@ def indices_shape(indices, full_shape, keepdims=True): return shape -def parse_indices( - shape, indices, cyclic=False, keepdims=True -): # , bool_as_int=False): +def parse_indices(shape, indices, cyclic=False, keepdims=True): """Parse indices for array access and assignment. 
:Parameters: @@ -2043,11 +2041,6 @@ def parse_indices( if callable(to_dask_array): # Replace index with its Dask array index = to_dask_array() - # - # elif bool_as_int: - # index = np.asanyarray(index) - # if index.dtype == bool: - # index = np.arange(size)[index] parsed_indices[i] = index diff --git a/cf/mixin/propertiesdata.py b/cf/mixin/propertiesdata.py index 80269e11e5..14444d2d36 100644 --- a/cf/mixin/propertiesdata.py +++ b/cf/mixin/propertiesdata.py @@ -4692,13 +4692,49 @@ def log(self, base=None, inplace=False, i=False): delete_props=True, ) - def to_dask_array(self): + def to_dask_array(self, apply_mask_hardness=False, asanyarray=None): """Convert the data to a `dask` array. + .. warning:: By default, the mask hardness of the returned + dask array might not be the same as that + specified by the `hardmask` attribute. + + This could cause problems if a subsequent + operation on the returned dask array involves the + un-masking of masked values (such as by indexed + assignment). + + To guarantee that the mask hardness of the + returned dask array is correct, set the + *apply_mask_hardness* parameter to True. + .. versionadded:: 3.14.0 .. seealso:: `cf.Data.to_dask_array` + :Parameters: + + apply_mask_hardness: `bool`, optional + If True then force the mask hardness of the returned + array to be that given by the `hardmask` attribute. + + .. versionadded:: NEXTVERSION + + asanyarray: `bool` or `None`, optional + If True then add a final operation to the Dask graph + that converts chunks to `numpy` arrays, but only if a + chunk's data object has an `__asanyarray__` attribute + that is `True`. If False then do not do this. If + `None`, the default, then the final operation is added + if the `Data` object's `__asanyarray__` attribute is + `True`. + + .. note:: Such a final operation is included in the + returned Dask array, but is not included in + the Dask array stored in the `Data` object. + + .. versionadded:: NEXTVERSION + :Returns: `dask.array.Array` @@ -4717,7 +4753,9 @@ def to_dask_array(self): if data is None: raise ValueError("Can't get dask array when there is no data") - return data.to_dask_array() + return data.to_dask_array( + apply_mask_hardness=apply_mask_hardness, asanyarray=asanyarray + ) @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") @_inplace_enabled(default=False) diff --git a/cf/read_write/netcdf/netcdfwrite.py b/cf/read_write/netcdf/netcdfwrite.py index 8cdc9cf9c7..3f30cad026 100644 --- a/cf/read_write/netcdf/netcdfwrite.py +++ b/cf/read_write/netcdf/netcdfwrite.py @@ -4,6 +4,7 @@ import dask.array as da import numpy as np +from ...data.dask_utils import cf_asanyarray from .netcdfread import NetCDFRead @@ -784,7 +785,10 @@ def _cfa_write_non_standard_terms( # dimensions, with one value per fragment. If a chunk has # more than one unique value then the fragment's value is # missing data. - dx = data.to_dask_array() + # + # '_cfa_unique' has its own call to 'cf_asanyarray', so + # we can set 'asanyarray=False'. + dx = data.to_dask_array(asanyarray=False) dx_ind = tuple(range(dx.ndim)) out_ind = dx_ind dx = da.blockwise( @@ -840,6 +844,8 @@ def _cfa_unique(cls, a): data if there is not a unique value. 
""" + a = cf_asanyarray(a) + out_shape = (1,) * a.ndim a = np.unique(a) if np.ma.isMA(a): @@ -990,7 +996,10 @@ def _cfa_aggregation_instructions(self, data, cfvar): # Create the location array # ------------------------------------------------------------ dtype = np.dtype(np.int32) - if max(data.to_dask_array().chunksize) > np.iinfo(dtype).max: + if ( + max(data.to_dask_array(asanyarray=False).chunksize) + > np.iinfo(dtype).max + ): dtype = np.dtype(np.int64) ndim = data.ndim diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 53656208ef..8d43eef252 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -1479,6 +1479,18 @@ def test_Data__getitem__(self): f = cf.Data([-999, 35], mask=[True, False]).reshape(2, 1) self.assertTrue(e.equals(f)) + # Chained subspaces reading from disk + f = cf.read(self.filename)[0] + d = f.data + a = d[:1, [1, 3, 4], :][:, [True, False, True], ::-2].array + b = d.array[:1, [1, 3, 4], :][:, [True, False, True], ::-2] + self.assertTrue((a == b).all()) + + d.__keepdims_indexing__ = False + a = d[0, [1, 3, 4], :][[True, False, True], ::-2].array + b = d.array[0, [1, 3, 4], :][[True, False, True], ::-2] + self.assertTrue((a == b).all()) + def test_Data__setitem__(self): """Test the assignment of data elements on Data.""" for hardmask in (False, True): @@ -3279,6 +3291,14 @@ def test_Data_rechunk(self): self.assertEqual(e.chunks, ((4,), (5,))) self.assertTrue(e.equals(d)) + # Test rechunking after a __getitem__ + e = d[:2].rechunk((2, 5)) + self.assertTrue(e.equals(d[:2])) + + d = cf.Data.ones((4, 5), chunks=(4, 5)) + e = d[:2].rechunk((1, 3)) + self.assertTrue(e.equals(d[:2])) + def test_Data_reshape(self): """Test the `reshape` Data method.""" a = np.arange(12).reshape(3, 4) @@ -4504,11 +4524,11 @@ def test_Data_cull_graph(self): """Test `Data.cull`""" d = cf.Data([1, 2, 3, 4, 5], chunks=3) d = d[:2] - self.assertEqual(len(dict(d.to_dask_array().dask)), 3) + self.assertEqual(len(dict(d.to_dask_array().dask)), 4) # Check that there are fewer keys after culling d.cull_graph() - self.assertEqual(len(dict(d.to_dask_array().dask)), 2) + self.assertEqual(len(dict(d.to_dask_array().dask)), 3) def test_Data_npartitions(self): """Test the `npartitions` Data property.""" @@ -4756,6 +4776,13 @@ def test_Data_pad_missing(self): with self.assertRaises(ValueError): d.pad_missing(99, to_size=99) + def test_Data_is_masked(self): + """Test Data.is_masked.""" + d = cf.Data(np.arange(6).reshape(2, 3)) + d[0, 0] = cf.masked + self.assertTrue(d[0].is_masked) + self.assertFalse(d[1].is_masked) + if __name__ == "__main__": print("Run date:", datetime.datetime.now()) From 4bfa673f30db114891a92678b8af0be28abe10fd Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 19 Mar 2024 00:38:45 +0000 Subject: [PATCH 064/134] dev --- cf/data/data.py | 22 +++++++++++++++++----- cf/test/test_Data.py | 7 ++++++- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/cf/data/data.py b/cf/data/data.py index ac74f89941..515b5fb15b 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -391,7 +391,7 @@ def __init__( array, copy=copy, clear=_NONE, asanyarray=None ) else: - self._del_dask(None) + self._del_dask(None, clear=_NONE) # Set the mask hardness self.hardmask = getattr(source, "hardmask", _DEFAULT_HARDMASK) @@ -957,7 +957,7 @@ def __getitem__(self, indices): # Set the subspaced dask array. Set 'asanyarray=True' to # honour truely lazy subspacing. 
# ------------------------------------------------------------ - new._set_dask(dx, asanyarray=True) + new._set_dask(dx, clear=_ALL ^ _ACTIVE, asanyarray=True) # ------------------------------------------------------------ # Get the axis identifiers for the subspace @@ -3293,7 +3293,9 @@ def rechunk( dx = d.to_dask_array(asanyarray=False) dx = dx.rechunk(chunks, threshold, block_size_limit, balance) - d._set_dask(dx, clear=_ALL ^ _ARRAY ^ _CACHE, asanyarray=True) + d._set_dask( + dx, clear=_ALL ^ _ARRAY ^ _CACHE ^ _ACTIVE, asanyarray=True + ) return d @@ -4258,7 +4260,7 @@ def concatenate( copied = not copy # to avoid making two copies in a given case # Get data as dask arrays and apply concatenation operation - dxs = [d.to_dask_array() for d in processed_data] + dxs = [d.to_dask_array(asanyarray=False) for d in processed_data] dx = da.concatenate(dxs, axis=axis) # Set the CFA write status @@ -4295,8 +4297,18 @@ def concatenate( active = _NONE break + # Set the __asanyarray__ status + asanyarray = processed_data[0].__asanyarray__ + for d in processed_data[1:]: + if d.__asanyarray__ != asanyarray: + # If and only if any two input Data objects have + # different __asanyarray__ values, then set + # asanyarray=True on the concatenation. + asanyarray = True + break + # Set the new dask array - data0._set_dask(dx, clear=_ALL ^ cfa ^ active) + data0._set_dask(dx, clear=_ALL ^ cfa ^ active, asanyarray=asanyarray) # Set appropriate cached elements cached_elements = {} diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 37ce938859..c9224443f8 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -4546,9 +4546,14 @@ def test_Data_active_storage(self): d.persist(inplace=True) self.assertFalse(d.active_storage) + # Rechunk should preserve active_storage d._set_active_storage(True) d.rechunk(1, inplace=True) - self.assertFalse(d.active_storage) + self.assertTrue(d.active_storage) + + # __getitem__ should preserve active_storage + d._set_active_storage(True) + self.assertTrue(d[0, 3:].active_storage) # Test with data on disk n = cf.NetCDF4Array( From 82079fd217c4aa429933016fbb509b616b0d9176 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 19 Mar 2024 12:13:51 +0000 Subject: [PATCH 065/134] dev --- cf/data/array/mixin/activestoragemixin.py | 105 +++--- cf/data/array/mixin/filearraymixin.py | 4 +- cf/data/collapse/__init__.py | 2 +- cf/data/collapse/collapse_active.py | 438 ++++++++++++---------- cf/data/collapse/dask_collapse.py | 49 ++- cf/data/creation.py | 6 +- 6 files changed, 353 insertions(+), 251 deletions(-) diff --git a/cf/data/array/mixin/activestoragemixin.py b/cf/data/array/mixin/activestoragemixin.py index c370ef56cf..1432fe91c0 100644 --- a/cf/data/array/mixin/activestoragemixin.py +++ b/cf/data/array/mixin/activestoragemixin.py @@ -69,7 +69,25 @@ class ActiveStorageMixin: # # return active[index] - def actify(self, method, axis=None, active_storage_url=None): + @property + def actified(self): + """Whether active storage operations are possible. + + .. versionadded:: NEXTVERSION + + .. seealso:: `actify`, `get_active_storage_url` + + :Returns: + + `bool` + `True` if active stoage operations are possible, + otherwise `False`. + + """ + return self.get_active_storage_url() is not None + + def actify(self, active_storage_url): +# def actify(self, method, axis=None, active_storage_url=None): """Return a new actified `{{class}}` instance. 
The new instance is a deep copy of the original, with the @@ -80,7 +98,7 @@ def actify(self, method, axis=None, active_storage_url=None): .. versionadded:: NEXTVERSION - .. seealso:: `get_active_axis`, `get_active_method` + .. seealso:: `actified`, `get_active_storage_url` :Parameters: @@ -95,7 +113,8 @@ def actify(self, method, axis=None, active_storage_url=None): `None`, flattened input is used. active_storage_url: `str` or `None`, optional - The URL of the active storage server. + The URL of the active storage server. If `None` then + `actified` will be `False` :Returns: @@ -128,48 +147,48 @@ def actify(self, method, axis=None, active_storage_url=None): ) a = self.copy() - a._custom["active_method"] = method - a._custom["active_axis"] = axis +# a._custom["active_method"] = method +# a._custom["active_axis"] = axis a._custom["active_storage_url"] = active_storage_url return a - def get_active_axis(self): - """Return the active storage reduction axes. - - Active storage reduction axes are set with `actify`. - - .. versionadded:: NEXTVERSION - - .. seealso:: `actify`, `get_active_method`, - `get_active_storage_url` - - :Returns: - - `None` or (sequence of) `int` - The active storage reduction axes, or `None` if there - is no active storage reduction. - - """ - return self._custom.get("active_axis") - - def get_active_method(self): - """Return the name of the active storage reduction method. - - An active storage reduction method is set with `actify`. - - .. versionadded:: NEXTVERSION - - .. seealso:: `actify`, `get_active_axis`, - `get_active_storage_url` - - :Returns: - - `str` or `None` - The name of the active storage reduction method, or - `None` if there is no active storage reduction. - - """ - return self._custom.get("active_method") +# def get_active_axis(self): +# """Return the active storage reduction axes. +# +# Active storage reduction axes are set with `actify`. +# +# .. versionadded:: NEXTVERSION +# +# .. seealso:: `actify`, `get_active_method`, +# `get_active_storage_url` +# +# :Returns: +# +# `None` or (sequence of) `int` +# The active storage reduction axes, or `None` if there +# is no active storage reduction. +# +# """ +# return self._custom.get("active_axis") +# +# def get_active_method(self): +# """Return the name of the active storage reduction method. +# +# An active storage reduction method is set with `actify`. +# +# .. versionadded:: NEXTVERSION +# +# .. seealso:: `actify`, `get_active_axis`, +# `get_active_storage_url` +# +# :Returns: +# +# `str` or `None` +# The name of the active storage reduction method, or +# `None` if there is no active storage reduction. +# +# """ +# return self._custom.get("active_method") def get_active_storage_url(self): """Return the active storage reduction URL. @@ -178,7 +197,7 @@ def get_active_storage_url(self): .. versionadded:: NEXTVERSION - .. seealso:: `actify`, `get_active_axis`, `get_active_method` + .. seealso:: `actified`, `actify` :Returns: diff --git a/cf/data/array/mixin/filearraymixin.py b/cf/data/array/mixin/filearraymixin.py index 378567a23a..04d731d234 100644 --- a/cf/data/array/mixin/filearraymixin.py +++ b/cf/data/array/mixin/filearraymixin.py @@ -27,8 +27,8 @@ def __dask_tokenize__(self): ) @property - def _dask_meta(self): - """The metadata for the containing dask array. + def _meta(self): + """The metadata for the containing Dask array. This is the kind of array that will result from slicing the file array. 
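
The hunks above pare `actify` down to a single `active_storage_url` argument and pair it with a read-only `actified` property, while `_dask_meta` becomes `_meta`. Below is a minimal, self-contained sketch of that actify/actified pattern; the class and attribute names are illustrative stand-ins and are not cf-python's API.

class _ActiveArraySketch:
    """Toy stand-in for an array object that can be 'actified'."""

    def __init__(self, url=None):
        # Per-instance settings, analogous to the mixin's `_custom` dict
        self._custom = {"active_storage_url": url}

    @property
    def actified(self):
        """True if an active storage URL has been set."""
        return self._custom.get("active_storage_url") is not None

    def actify(self, active_storage_url):
        """Return a copy with the active storage URL recorded."""
        new = _ActiveArraySketch()
        new._custom = dict(self._custom)
        new._custom["active_storage_url"] = active_storage_url
        return new

    def get_active_storage_url(self):
        """Return the active storage URL, or None."""
        return self._custom.get("active_storage_url")


a = _ActiveArraySketch()
assert not a.actified
b = a.actify("https://active.example.org")  # hypothetical URL
assert b.actified and not a.actified  # actify returns a modified copy
assert b.get_active_storage_url() == "https://active.example.org"
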
diff --git a/cf/data/collapse/__init__.py b/cf/data/collapse/__init__.py index 547689794d..7902b51936 100644 --- a/cf/data/collapse/__init__.py +++ b/cf/data/collapse/__init__.py @@ -1,2 +1,2 @@ from .collapse import Collapse -from .collapse_active import actify, active_chunk_functions, active_storage +from .collapse_active import actify, active_storage diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index ddc3e60179..d4658c827f 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -11,23 +11,60 @@ logger = logging.getLogger(__name__) +# -------------------------------------------------------------------- +# Specify which reductions are possible with active storage +# -------------------------------------------------------------------- +active_reduction_methods = ("max", "mean", "min", "sum") -def active_collapse(a, method): + +def active_reduction(x, method, axis=None, **kwargs): """Collapse data in a file with `Active`. .. versionadded:: NEXTVERSION - TODOACTIVE + .. seealso:: `actify`, `cf.data.collapse.Collapse` + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + method: `str` + The name of the reduction method. If the method does not + have a corresponding active function in the + `active_chunk_functions` dictionary then active + compuations are not carried out. + + axis: (sequence of) `int`, optional + Axis or axes along which to operate. By default, + flattened input is used. + + kwargs: optional + Extra keyword arguments that define the reduction. + + :Returns: + + `dict` + The reduced data in component form. """ - filename = a.get_filename() + if not getattr(x, 'actified', False): + raise ValueError( + "Can't do active reductions when on non-actified data" + ) + + weighted = kwargs.get("weights") is not None + if weighted: + raise ValueError(f"Can't do weighted {method!r} active reductions") + + filename = x.get_filename() filename = "/".join(filename.split("/")[3:]) active_kwargs = { "uri": filename, - "ncvar": a.get_address(), - "storage_options": a.get_storage_options(), - "active_storage_url": a.get_active_storage_url(), + "ncvar": x.get_address(), + "storage_options": x.get_storage_options(), + "active_storage_url": x.get_active_storage_url(), "storage_type": "s3", # Temporary requirement! } @@ -36,185 +73,202 @@ def active_collapse(a, method): active = Active(**active_kwargs) - if a.get_active_method() != method: - raise ValueError("TODOACTIVE") - - active.method = method - active.components = True - # Provide a file lock try: - lock = a._lock + lock = x._lock except AttributeError: pass else: if lock: active.lock = lock - return active[a.index] + # Create the output dictionary + active.method = method + active.components = True + d = active[x.index] + + # Reformat the output dictionary + if method == 'max': + d = {"N": d["n"], "max": d["max"]} + elif method == "mean": + d = {"N": d["n"], "sum": d["sum"], "V1": d["n"], "weighted": weighted} + elif method == 'min': + d = {"N": d["n"], "min": d["min"]} + elif method == 'sum': + d = {"N": d["n"], "sum": d["sum"]} + + print ('DONE!') + return d # -------------------------------------------------------------------- # Define the active functions # -------------------------------------------------------------------- -def active_min(a, **kwargs): - """Chunk function for minimum values computed by active storage. 
- - Converts active storage reduction components to the components - expected by the reduction combine and aggregate functions. - - This function is intended to be passed to `dask.array.reduction` - as the ``chunk`` parameter. Its returned value must be the same as - the non-active chunk function that it is replacing. - - .. versionadded:: NEXTVERSION - - .. seealso:: `actify`, `active_storage` - - :Parameters: - - a: `dict` - The components output from the active storage - reduction. For instance: - - >>> print(a) - {'min': array([[[49.5]]], dtype=float32), 'n': 1015808} - - :Returns: - - `dict` - Dictionary with the keys: - - * N: The sample size. - * min: The minimum. - - """ - a = active_collapse(a, "min") - return {"N": a["n"], "min": a["min"]} - - -def active_max(a, **kwargs): - """Chunk function for maximum values computed by active storage. - - Converts active storage reduction components to the components - expected by the reduction combine and aggregate functions. - - This function is intended to be passed to `dask.array.reduction` - as the ``chunk`` parameter. Its returned value must be the same as - the non-active chunk function that it is replacing. - - .. versionadded:: NEXTVERSION - - .. seealso:: `actify`, `active_storage` - - :Parameters: - - a: `dict` - The components output from the active storage - reduction. For instance: - - >>> print(a) - {'max': array([[[2930.4856]]], dtype=float32), 'n': 1015808} - - :Returns: - - `dict` - Dictionary with the keys: - - * N: The sample size. - * max: The maximum. - - """ - a = active_collapse(a, "max") - return {"N": a["n"], "max": a["max"]} - - -def active_mean(a, **kwargs): - """Chunk function for mean values computed by active storage. - - Converts active storage reduction components to the components - expected by the reduction combine and aggregate functions. - - This function is intended to be passed to `dask.array.reduction` - as the ``chunk`` parameter. Its returned value must be the same as - the non-active chunk function that it is replacing. - - .. versionadded:: NEXTVERSION - - .. seealso:: `actify`, `active_storage` - - :Parameters: - - a: `dict` - The components output from the active storage - reduction. For instance: - - >>> print(a) - {'sum': array([[[1.5131907e+09]]], dtype=float32), 'n': 1015808} - - :Returns: - - `dict` - Dictionary with the keys: - - * N: The sample size. - * V1: The sum of ``weights``. Always equal to ``N`` - because weights have not been set. - * sum: The un-weighted sum. - * weighted: True if weights have been set. Always - False. - - """ - a = active_collapse(a, "mean") - return {"N": a["n"], "V1": a["n"], "sum": a["sum"], "weighted": False} - - -def active_sum(a, **kwargs): - """Chunk function for sum values computed by active storage. - - Converts active storage reduction components to the components - expected by the reduction combine and aggregate functions. - - This function is intended to be passed to `dask.array.reduction` - as the ``chunk`` parameter. Its returned value must be the same as - the non-active chunk function that it is replacing. - - .. versionadded:: NEXTVERSION - - .. seealso:: `actify`, `active_storage` - - :Parameters: - - a: `dict` - The components output from the active storage - reduction. For instance: - - >>> print(a) - {'sum': array([[[1.5131907e+09]]], dtype=float32), 'n': 1015808} - - :Returns: - - `dict` - Dictionary with the keys: - - * N: The sample size. - * sum: The un-weighted sum. 
- - """ - a = active_collapse(a, "sum") - return {"N": a["n"], "sum": a["sum"]} +#def active_min(x, dtype=None, computing_meta=False, **kwargs): +# """Chunk function for minimum values computed by active storage. +# +# Converts active storage reduction components to the components +# expected by the reduction combine and aggregate functions. +# +# This function is intended to be passed to `dask.array.reduction` +# as the ``chunk`` parameter. Its returned value must be the same as +# the non-active chunk function that it is replacing. +# +# .. versionadded:: NEXTVERSION +# +# .. seealso:: `actify`, `active_storage` +# +# :Parameters: +# +# See `dask.array.reductions` for details of the parameters. +# +# :Returns: +# +# `dict` +# Dictionary with the keys: +# +# * N: The sample size. +# * min: The minimum ``x``. +# +# """ +# if computing_meta: +# return x +# +# x = active_reduction(x, "min", **kwargs) +# return {"N": x["n"], "min": x["min"]} +# +# +#def active_max(a, **kwargs): +# """Chunk function for maximum values computed by active storage. +# +# Converts active storage reduction components to the components +# expected by the reduction combine and aggregate functions. +# +# This function is intended to be passed to `dask.array.reduction` +# as the ``chunk`` parameter. Its returned value must be the same as +# the non-active chunk function that it is replacing. +# +# .. versionadded:: NEXTVERSION +# +# .. seealso:: `actify`, `active_storage` +# +# :Parameters: +# +# a: `dict` +# The components output from the active storage +# reduction. For instance: +# +# >>> print(a) +# {'max': array([[[2930.4856]]], dtype=float32), 'n': 1015808} +# +# :Returns: +# +# `dict` +# Dictionary with the keys: +# +# * N: The sample size. +# * max: The maximum. +# +# """ +# if computing_meta: +# return x +# +# x = active_reduction(x, "max", **kwargs) +# return {"N": a["n"], "max": a["max"]} +# +# +#def active_mean(a, **kwargs): +# """Chunk function for mean values computed by active storage. +# +# Converts active storage reduction components to the components +# expected by the reduction combine and aggregate functions. +# +# This function is intended to be passed to `dask.array.reduction` +# as the ``chunk`` parameter. Its returned value must be the same as +# the non-active chunk function that it is replacing. +# +# .. versionadded:: NEXTVERSION +# +# .. seealso:: `actify`, `active_storage` +# +# :Parameters: +# +# a: `dict` +# The components output from the active storage +# reduction. For instance: +# +# >>> print(a) +# {'sum': array([[[1.5131907e+09]]], dtype=float32), 'n': 1015808} +# +# :Returns: +# +# `dict` +# Dictionary with the keys: +# +# * N: The sample size. +# * V1: The sum of ``weights``. Always equal to ``N`` +# because weights have not been set. +# * sum: The un-weighted sum. +# * weighted: True if weights have been set. Always +# False. +# +# """ +# if computing_meta: +# return x +# +# x = active_reduction(x, "mean", **kwargs) +# return {"N": a["n"], "V1": a["n"], "sum": a["sum"], "weighted": False} +# +# +#def active_sum(a, **kwargs): +# """Chunk function for sum values computed by active storage. +# +# Converts active storage reduction components to the components +# expected by the reduction combine and aggregate functions. +# +# This function is intended to be passed to `dask.array.reduction` +# as the ``chunk`` parameter. Its returned value must be the same as +# the non-active chunk function that it is replacing. +# +# .. versionadded:: NEXTVERSION +# +# .. 
seealso:: `actify`, `active_storage` +# +# :Parameters: +# +# a: `dict` +# The components output from the active storage +# reduction. For instance: +# +# >>> print(a) +# {'sum': array([[[1.5131907e+09]]], dtype=float32), 'n': 1015808} +# +# :Returns: +# +# `dict` +# Dictionary with the keys: +# +# * N: The sample size. +# * sum: The un-weighted sum. +# +# """ +# if computing_meta: +# return x +# +# x = active_reduction(x, "sum", **kwargs) +# return {"N": a["n"], "sum": a["sum"]} # -------------------------------------------------------------------- # Create a map of reduction methods to their corresponding active # functions # -------------------------------------------------------------------- -active_chunk_functions = { - "min": active_min, - "max": active_max, - "mean": active_mean, - "sum": active_sum, -} +#active_chunk_functions = { +# "min": True, #active_min, +# "max": active_max, +# "mean": active_mean, +# "sum": active_sum, +#} def actify(a, method, axis=None): @@ -259,30 +313,36 @@ def actify(a, method, axis=None): `None`. """ - from numbers import Integral - import dask.array as da - from dask.array.utils import validate_axis from dask.base import collections_to_dsk - if method not in active_chunk_functions: - # The method does not have a corresponding active function, so + if Active is None: + raise AttributeError( + "Can't actify {self.__class__.__name__} when " + "activestorage.Active is not available" + ) + + if method not in active_reduction_methods: + # The method cannot be calculated with active storage, so # return the input data unchanged. - return a, None + return a # Parse axis + ndim = a.ndim if axis is None: - axis = tuple(range(a.ndim)) + axis = tuple(range(ndim)) else: + from numbers import Integral + from dask.array.utils import validate_axis + if isinstance(axis, Integral): axis = (axis,) - if len(axis) != a.ndim: + axis = validate_axis(axis, ndim) + if len(axis) != ndim or len(set(axis)) != ndim: # Can't (yet) use active storage to collapse a subset of # the axes, so return the input data unchanged. - return a, None - - axis = validate_axis(axis, a.ndim) + return a # Loop round the nodes of the dask graph looking for data # definitions that point to files and which support active storage @@ -311,7 +371,7 @@ def actify(a, method, axis=None): # to files, so try to insert an actified copy into the dask # graph. try: - dsk[key] = value.actify(method, axis, active_storage_url=url) + dsk[key] = value.actify(url) except AttributeError: # This data definition doesn't support active storage # reductions @@ -321,20 +381,17 @@ def actify(a, method, axis=None): if not ok_to_actify: # It turns out that the dask graph is not suitable for active # storage reductions, so return the input data unchanged. - return a, None + return a # Still here? Then all data definitions in the dask graph support # active storage reductions => redefine the dask array from the # actified dask graph, and set the active storage reduction chunk # function. 
logger.warning( - "At compute time chunks will be collapsed with " + "At compute time, data will be collapsed with " f"active storage at URL {url}" ) - return ( - da.Array(dsk, a.name, a.chunks, a.dtype, a._meta), - active_chunk_functions[method], - ) + return da.Array(dsk, a.name, a.chunks, a.dtype, a._meta) def active_storage(method): @@ -364,7 +421,7 @@ def wrapper(self, *args, **kwargs): cf_active_storage() and Active is not None and kwargs.get("active_storage") - and method in active_chunk_functions + and method in active_reduction_methods and kwargs.get("weights") is None and kwargs.get("chunk_function") is None and active_storage_url() @@ -376,7 +433,8 @@ def wrapper(self, *args, **kwargs): else: dask_array = kwargs.pop("a") - dask_array, chunk_function = actify( +# dask_array, chunk_function = actify( + dask_array = actify( dask_array, method=method, axis=kwargs.get("axis"), @@ -384,10 +442,10 @@ def wrapper(self, *args, **kwargs): args = list(args) args[0] = dask_array - if chunk_function is not None: - # The dask array has been actified, so update the - # chunk function. - kwargs["chunk_function"] = chunk_function + #if chunk_function is not None: + # # The dask array has been actified, so update the + # # chunk function. + # kwargs["chunk_function"] = chunk_function # Create the collapse return collapse_method(self, *args, **kwargs) diff --git a/cf/data/collapse/dask_collapse.py b/cf/data/collapse/dask_collapse.py index 33956b9246..c4fff394ce 100644 --- a/cf/data/collapse/dask_collapse.py +++ b/cf/data/collapse/dask_collapse.py @@ -16,6 +16,7 @@ from ..dask_utils import cf_asanyarray from .collapse_utils import double_precision_dtype +from .collapse_active import active_reduction def mask_small_sample_size(x, N, axis, mtol, original_shape): @@ -239,8 +240,11 @@ def cf_mean_chunk( ): """Chunk calculations for the mean. - This function is passed to `dask.array.reduction` as its *chunk* - parameter. + This function is passed to `dask.array.reduction` as its *chunk* + parameter. + + If ``x.actified`` exists and is `True` then the calculations are + done in active storage. .. versionadded:: 3.14.0 @@ -266,11 +270,17 @@ def cf_mean_chunk( * weighted: True if weights have been set. """ - x = cf_asanyarray(x) - if computing_meta: return x + # if getattr(x, 'actified', False): + try: + print (repr(x)) + return active_reduction(x, "mean", weights=weights, **kwargs) + except ValueError: + pass + + x = cf_asanyarray(x) if weights is not None: weights = cf_asanyarray(weights) @@ -376,6 +386,9 @@ def cf_max_chunk(x, dtype=None, computing_meta=False, **kwargs): This function is passed to `dask.array.reduction` as its *chunk* parameter. + If ``x.actified`` exists and is `True` then the calculations are + done in active storage. + .. versionadded:: 3.14.0 :Parameters: @@ -391,11 +404,13 @@ def cf_max_chunk(x, dtype=None, computing_meta=False, **kwargs): * max: The maximum of `x``. """ - x = cf_asanyarray(x) - if computing_meta: return x - + + if getattr(x, 'actified', False): + return active_reduction(x, "max", **kwargs) + + x = cf_asanyarray(x) return { "max": chunk.max(x, **kwargs), "N": cf_sample_size_chunk(x, **kwargs)["N"], @@ -529,6 +544,9 @@ def cf_min_chunk(x, dtype=None, computing_meta=False, **kwargs): This function is passed to `dask.array.reduction` as its *chunk* parameter. + If ``x.actified`` exists and is `True` then the calculations are + done in active storage. + .. 
versionadded:: 3.14.0 :Parameters: @@ -544,11 +562,13 @@ def cf_min_chunk(x, dtype=None, computing_meta=False, **kwargs): * min: The minimum of ``x``. """ - x = cf_asanyarray(x) - if computing_meta: return x - + + if getattr(x, 'actified', False): + return active_reduction(x, "min", **kwargs) + + x = cf_asanyarray(x) return { "min": chunk.min(x, **kwargs), "N": cf_sample_size_chunk(x, **kwargs)["N"], @@ -943,6 +963,9 @@ def cf_sum_chunk( This function is passed to `dask.array.reduction` as its *chunk* parameter. + If ``x.actified`` exists and is `True` then the calculations are + done in active storage. + .. versionadded:: 3.14.0 :Parameters: @@ -965,11 +988,13 @@ def cf_sum_chunk( * sum: The weighted sum of ``x`` """ - x = cf_asanyarray(x) - if computing_meta: return x + if getattr(x, 'actified', False): + return active_reduction(x, "sum", weights=weights, **kwargs) + + x = cf_asanyarray(x) if weights is not None: weights = cf_asanyarray(weights) if check_weights: diff --git a/cf/data/creation.py b/cf/data/creation.py index b861281e22..a8b90811a7 100644 --- a/cf/data/creation.py +++ b/cf/data/creation.py @@ -34,8 +34,8 @@ def to_dask(array, chunks, **from_array_options): Keyword arguments to be passed to `dask.array.from_array`. If *from_array_options* has no ``'meta'`` key then the - `meta` keyword is set to the `_dask_meta` attribute of - *array* or, if there is no such attribute, `None`. + `meta` keyword is set to the `_meta` attribute of *array* + or, if there is no such attribute, `None`. :Returns: @@ -81,7 +81,7 @@ def to_dask(array, chunks, **from_array_options): array = np.asanyarray(array) kwargs = from_array_options - kwargs.setdefault("meta", getattr(array, "_dask_meta", None)) + kwargs.setdefault("meta", getattr(array, "_meta", None)) try: return da.from_array(array, chunks=chunks, **kwargs) From 9e6d4a2151d2d6dc84ba7345458242a3b9acc624 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 19 Mar 2024 16:55:11 +0000 Subject: [PATCH 066/134] dev --- cf/data/array/mixin/activestoragemixin.py | 86 +++++++++++------------ cf/data/collapse/collapse_active.py | 33 ++++----- cf/data/collapse/dask_collapse.py | 21 +++--- 3 files changed, 70 insertions(+), 70 deletions(-) diff --git a/cf/data/array/mixin/activestoragemixin.py b/cf/data/array/mixin/activestoragemixin.py index 1432fe91c0..472684e28c 100644 --- a/cf/data/array/mixin/activestoragemixin.py +++ b/cf/data/array/mixin/activestoragemixin.py @@ -74,9 +74,9 @@ def actified(self): """Whether active storage operations are possible. .. versionadded:: NEXTVERSION - + .. seealso:: `actify`, `get_active_storage_url` - + :Returns: `bool` @@ -85,9 +85,9 @@ def actified(self): """ return self.get_active_storage_url() is not None - + def actify(self, active_storage_url): -# def actify(self, method, axis=None, active_storage_url=None): + # def actify(self, method, axis=None, active_storage_url=None): """Return a new actified `{{class}}` instance. The new instance is a deep copy of the original, with the @@ -147,48 +147,48 @@ def actify(self, active_storage_url): ) a = self.copy() -# a._custom["active_method"] = method -# a._custom["active_axis"] = axis + # a._custom["active_method"] = method + # a._custom["active_axis"] = axis a._custom["active_storage_url"] = active_storage_url return a -# def get_active_axis(self): -# """Return the active storage reduction axes. -# -# Active storage reduction axes are set with `actify`. -# -# .. versionadded:: NEXTVERSION -# -# .. 
seealso:: `actify`, `get_active_method`, -# `get_active_storage_url` -# -# :Returns: -# -# `None` or (sequence of) `int` -# The active storage reduction axes, or `None` if there -# is no active storage reduction. -# -# """ -# return self._custom.get("active_axis") -# -# def get_active_method(self): -# """Return the name of the active storage reduction method. -# -# An active storage reduction method is set with `actify`. -# -# .. versionadded:: NEXTVERSION -# -# .. seealso:: `actify`, `get_active_axis`, -# `get_active_storage_url` -# -# :Returns: -# -# `str` or `None` -# The name of the active storage reduction method, or -# `None` if there is no active storage reduction. -# -# """ -# return self._custom.get("active_method") + # def get_active_axis(self): + # """Return the active storage reduction axes. + # + # Active storage reduction axes are set with `actify`. + # + # .. versionadded:: NEXTVERSION + # + # .. seealso:: `actify`, `get_active_method`, + # `get_active_storage_url` + # + # :Returns: + # + # `None` or (sequence of) `int` + # The active storage reduction axes, or `None` if there + # is no active storage reduction. + # + # """ + # return self._custom.get("active_axis") + # + # def get_active_method(self): + # """Return the name of the active storage reduction method. + # + # An active storage reduction method is set with `actify`. + # + # .. versionadded:: NEXTVERSION + # + # .. seealso:: `actify`, `get_active_axis`, + # `get_active_storage_url` + # + # :Returns: + # + # `str` or `None` + # The name of the active storage reduction method, or + # `None` if there is no active storage reduction. + # + # """ + # return self._custom.get("active_method") def get_active_storage_url(self): """Return the active storage reduction URL. diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index d4658c827f..e0540ecede 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -48,15 +48,15 @@ def active_reduction(x, method, axis=None, **kwargs): The reduced data in component form. """ - if not getattr(x, 'actified', False): + if not getattr(x, "actified", False): raise ValueError( "Can't do active reductions when on non-actified data" ) - + weighted = kwargs.get("weights") is not None if weighted: raise ValueError(f"Can't do weighted {method!r} active reductions") - + filename = x.get_filename() filename = "/".join(filename.split("/")[3:]) @@ -88,23 +88,23 @@ def active_reduction(x, method, axis=None, **kwargs): d = active[x.index] # Reformat the output dictionary - if method == 'max': + if method == "max": d = {"N": d["n"], "max": d["max"]} elif method == "mean": d = {"N": d["n"], "sum": d["sum"], "V1": d["n"], "weighted": weighted} - elif method == 'min': + elif method == "min": d = {"N": d["n"], "min": d["min"]} - elif method == 'sum': + elif method == "sum": d = {"N": d["n"], "sum": d["sum"]} - print ('DONE!') + print("DONE!") return d # -------------------------------------------------------------------- # Define the active functions # -------------------------------------------------------------------- -#def active_min(x, dtype=None, computing_meta=False, **kwargs): +# def active_min(x, dtype=None, computing_meta=False, **kwargs): # """Chunk function for minimum values computed by active storage. 
# # Converts active storage reduction components to the components @@ -138,7 +138,7 @@ def active_reduction(x, method, axis=None, **kwargs): # return {"N": x["n"], "min": x["min"]} # # -#def active_max(a, **kwargs): +# def active_max(a, **kwargs): # """Chunk function for maximum values computed by active storage. # # Converts active storage reduction components to the components @@ -177,7 +177,7 @@ def active_reduction(x, method, axis=None, **kwargs): # return {"N": a["n"], "max": a["max"]} # # -#def active_mean(a, **kwargs): +# def active_mean(a, **kwargs): # """Chunk function for mean values computed by active storage. # # Converts active storage reduction components to the components @@ -220,7 +220,7 @@ def active_reduction(x, method, axis=None, **kwargs): # return {"N": a["n"], "V1": a["n"], "sum": a["sum"], "weighted": False} # # -#def active_sum(a, **kwargs): +# def active_sum(a, **kwargs): # """Chunk function for sum values computed by active storage. # # Converts active storage reduction components to the components @@ -263,12 +263,12 @@ def active_reduction(x, method, axis=None, **kwargs): # Create a map of reduction methods to their corresponding active # functions # -------------------------------------------------------------------- -#active_chunk_functions = { +# active_chunk_functions = { # "min": True, #active_min, # "max": active_max, # "mean": active_mean, # "sum": active_sum, -#} +# } def actify(a, method, axis=None): @@ -333,8 +333,9 @@ def actify(a, method, axis=None): axis = tuple(range(ndim)) else: from numbers import Integral + from dask.array.utils import validate_axis - + if isinstance(axis, Integral): axis = (axis,) @@ -433,7 +434,7 @@ def wrapper(self, *args, **kwargs): else: dask_array = kwargs.pop("a") -# dask_array, chunk_function = actify( + # dask_array, chunk_function = actify( dask_array = actify( dask_array, method=method, @@ -442,7 +443,7 @@ def wrapper(self, *args, **kwargs): args = list(args) args[0] = dask_array - #if chunk_function is not None: + # if chunk_function is not None: # # The dask array has been actified, so update the # # chunk function. 
# kwargs["chunk_function"] = chunk_function diff --git a/cf/data/collapse/dask_collapse.py b/cf/data/collapse/dask_collapse.py index c4fff394ce..86b9006bc0 100644 --- a/cf/data/collapse/dask_collapse.py +++ b/cf/data/collapse/dask_collapse.py @@ -15,8 +15,8 @@ from dask.utils import deepmap from ..dask_utils import cf_asanyarray -from .collapse_utils import double_precision_dtype from .collapse_active import active_reduction +from .collapse_utils import double_precision_dtype def mask_small_sample_size(x, N, axis, mtol, original_shape): @@ -275,11 +275,10 @@ def cf_mean_chunk( # if getattr(x, 'actified', False): try: - print (repr(x)) return active_reduction(x, "mean", weights=weights, **kwargs) except ValueError: pass - + x = cf_asanyarray(x) if weights is not None: weights = cf_asanyarray(weights) @@ -406,10 +405,10 @@ def cf_max_chunk(x, dtype=None, computing_meta=False, **kwargs): """ if computing_meta: return x - - if getattr(x, 'actified', False): + + if getattr(x, "actified", False): return active_reduction(x, "max", **kwargs) - + x = cf_asanyarray(x) return { "max": chunk.max(x, **kwargs), @@ -564,10 +563,10 @@ def cf_min_chunk(x, dtype=None, computing_meta=False, **kwargs): """ if computing_meta: return x - - if getattr(x, 'actified', False): + + if getattr(x, "actified", False): return active_reduction(x, "min", **kwargs) - + x = cf_asanyarray(x) return { "min": chunk.min(x, **kwargs), @@ -991,9 +990,9 @@ def cf_sum_chunk( if computing_meta: return x - if getattr(x, 'actified', False): + if getattr(x, "actified", False): return active_reduction(x, "sum", weights=weights, **kwargs) - + x = cf_asanyarray(x) if weights is not None: weights = cf_asanyarray(weights) From be63ec73e960be8971b7f13cd341dc4aeabd5faf Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 20 Mar 2024 00:40:31 +0000 Subject: [PATCH 067/134] dev --- cf/data/array/mixin/activestoragemixin.py | 114 +--------- cf/data/collapse/collapse.py | 6 +- cf/data/collapse/collapse_active.py | 264 ++++++---------------- cf/data/collapse/dask_collapse.py | 103 +++++---- cf/data/data.py | 2 +- cf/data/utils.py | 4 +- cf/field.py | 37 ++- 7 files changed, 165 insertions(+), 365 deletions(-) diff --git a/cf/data/array/mixin/activestoragemixin.py b/cf/data/array/mixin/activestoragemixin.py index 472684e28c..eb17095ef6 100644 --- a/cf/data/array/mixin/activestoragemixin.py +++ b/cf/data/array/mixin/activestoragemixin.py @@ -11,64 +11,6 @@ class ActiveStorageMixin: """ - # def _get_array(self, index=None): - # """Returns a subspace of the data. - # - # .. versionadded:: NEXTVERSION - # - # .. seealso:: `__array__`, `index` - # - # :Parameters: - # - # index: `tuple` or `None`, optional - # Provide the indices that define the subspace. If `None` - # then the `index` attribute is used. - # - # :Returns: - # - # `numpy.ndarray` - # - # """ - # method = self.get_active_method() - # if Active is None or method is None: - # # The instance has not been actified so do a normal read, - # # returning an un-reduced numpy array. - # return super()._get_array(index) - # - # # Still here? Then do an active storage reduction. Returns a - # # dictionary of reduced values. - # filename = self.get_filename() - # filename = "/".join(filename.split("/")[3:]) - # - # kwargs ={ - # 'uri': filename, - # 'ncvar': self.get_address(), - # "storage_options": self.get_storage_options(), - # "active_storage_url": self.get_active_storage_url(), - # "storage_type": "s3", # Temporary requirement! 
- # } - # - # if True: - # print(f"Active(**{kwargs})") - # - # active = Active(**kwargs) - # active.method = method - # active.components = True - # - # # Provide a file lock - # try: - # lock = self._lock - # except AttributeError: - # pass - # else: - # if lock: - # active.lock = lock - # - # if index is None: - # index = self.index - # - # return active[index] - @property def actified(self): """Whether active storage operations are possible. @@ -87,14 +29,10 @@ def actified(self): return self.get_active_storage_url() is not None def actify(self, active_storage_url): - # def actify(self, method, axis=None, active_storage_url=None): """Return a new actified `{{class}}` instance. The new instance is a deep copy of the original, with the - additional setting of the active storage method and axis. - - When the instance is indexed, the result of applying the - active storage method to the subspace will be returned. + additional setting of the active storage URL. .. versionadded:: NEXTVERSION @@ -102,16 +40,6 @@ def actify(self, active_storage_url): :Parameters: - method: `str` - The name of the reduction method. - - *Parameter example:* - ``'min'`` - - axis: `None` or (sequence of) `int`, optional - Axis or axes along which to operate. By default, or if - `None`, flattened input is used. - active_storage_url: `str` or `None`, optional The URL of the active storage server. If `None` then `actified` will be `False` @@ -147,49 +75,9 @@ def actify(self, active_storage_url): ) a = self.copy() - # a._custom["active_method"] = method - # a._custom["active_axis"] = axis a._custom["active_storage_url"] = active_storage_url return a - # def get_active_axis(self): - # """Return the active storage reduction axes. - # - # Active storage reduction axes are set with `actify`. - # - # .. versionadded:: NEXTVERSION - # - # .. seealso:: `actify`, `get_active_method`, - # `get_active_storage_url` - # - # :Returns: - # - # `None` or (sequence of) `int` - # The active storage reduction axes, or `None` if there - # is no active storage reduction. - # - # """ - # return self._custom.get("active_axis") - # - # def get_active_method(self): - # """Return the name of the active storage reduction method. - # - # An active storage reduction method is set with `actify`. - # - # .. versionadded:: NEXTVERSION - # - # .. seealso:: `actify`, `get_active_axis`, - # `get_active_storage_url` - # - # :Returns: - # - # `str` or `None` - # The name of the active storage reduction method, or - # `None` if there is no active storage reduction. - # - # """ - # return self._custom.get("active_method") - def get_active_storage_url(self): """Return the active storage reduction URL. 
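
With the old in-mixin reduction path removed above, the mixin now only records the active storage URL; the reduction itself is attempted, and on failure abandoned, at the Dask chunk level by the collapse code that follows. A rough, self-contained sketch of that try-active-then-fall-back shape is given below, using made-up helper names rather than the real cf-python functions.

import numpy as np


def _active_sum(chunk):
    """Stand-in for a server-side (active storage) sum reduction."""
    if not getattr(chunk, "actified", False):
        # Mirrors the ValueError raised for non-actified data
        raise ValueError("data are not actified")
    # A real implementation would ask the active storage server for the
    # reduction components instead of computing them locally.
    return {"N": chunk.size, "sum": chunk.sum()}


def sum_chunk(chunk, **kwargs):
    """Chunk function: try an active reduction, else reduce in memory."""
    try:
        return _active_sum(chunk)
    except ValueError:
        pass

    # Local (in-memory) fallback
    x = np.asanyarray(chunk)
    return {"N": x.size, "sum": x.sum()}


print(sum_chunk(np.arange(6.0)))  # plain numpy input falls back locally
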
diff --git a/cf/data/collapse/collapse.py b/cf/data/collapse/collapse.py index 6dc0c9bf80..0dac65d7c8 100644 --- a/cf/data/collapse/collapse.py +++ b/cf/data/collapse/collapse.py @@ -966,18 +966,18 @@ def sum_of_weights2( from .dask_collapse import ( cf_sum_agg, cf_sum_combine, - cf_sum_of_weights_chunk, + cf_sum_of_weights2_chunk, ) if chunk_function is None: # Default function for chunk calculations - chunk_function = cf_sum_of_weights_chunk + chunk_function = cf_sum_of_weights2_chunk check_input_dtype(a) dtype = double_precision_dtype(weights, default="i8") return reduction( a, - partial(chunk_function, square=True), + chunk_function, partial(cf_sum_agg, mtol=mtol, original_shape=a.shape), axis=axis, keepdims=keepdims, diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index e0540ecede..a5bb12b221 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -17,12 +17,12 @@ active_reduction_methods = ("max", "mean", "min", "sum") -def active_reduction(x, method, axis=None, **kwargs): - """Collapse data in a file with `Active`. +def active_chunk(method, x, **kwargs): + """Collapse a data in a chunk with active storage. .. versionadded:: NEXTVERSION - .. seealso:: `actify`, `cf.data.collapse.Collapse` + .. seealso:: `actify`, `active_storage2`, `cf.data.collapse.Collapse` :Parameters: @@ -48,6 +48,9 @@ def active_reduction(x, method, axis=None, **kwargs): The reduced data in component form. """ + if kwargs.get("computing_meta"): + return x + if not getattr(x, "actified", False): raise ValueError( "Can't do active reductions when on non-actified data" @@ -97,180 +100,10 @@ def active_reduction(x, method, axis=None, **kwargs): elif method == "sum": d = {"N": d["n"], "sum": d["sum"]} - print("DONE!") + print("ACTIVE CHUNK DONE!") return d -# -------------------------------------------------------------------- -# Define the active functions -# -------------------------------------------------------------------- -# def active_min(x, dtype=None, computing_meta=False, **kwargs): -# """Chunk function for minimum values computed by active storage. -# -# Converts active storage reduction components to the components -# expected by the reduction combine and aggregate functions. -# -# This function is intended to be passed to `dask.array.reduction` -# as the ``chunk`` parameter. Its returned value must be the same as -# the non-active chunk function that it is replacing. -# -# .. versionadded:: NEXTVERSION -# -# .. seealso:: `actify`, `active_storage` -# -# :Parameters: -# -# See `dask.array.reductions` for details of the parameters. -# -# :Returns: -# -# `dict` -# Dictionary with the keys: -# -# * N: The sample size. -# * min: The minimum ``x``. -# -# """ -# if computing_meta: -# return x -# -# x = active_reduction(x, "min", **kwargs) -# return {"N": x["n"], "min": x["min"]} -# -# -# def active_max(a, **kwargs): -# """Chunk function for maximum values computed by active storage. -# -# Converts active storage reduction components to the components -# expected by the reduction combine and aggregate functions. -# -# This function is intended to be passed to `dask.array.reduction` -# as the ``chunk`` parameter. Its returned value must be the same as -# the non-active chunk function that it is replacing. -# -# .. versionadded:: NEXTVERSION -# -# .. seealso:: `actify`, `active_storage` -# -# :Parameters: -# -# a: `dict` -# The components output from the active storage -# reduction. 
For instance: -# -# >>> print(a) -# {'max': array([[[2930.4856]]], dtype=float32), 'n': 1015808} -# -# :Returns: -# -# `dict` -# Dictionary with the keys: -# -# * N: The sample size. -# * max: The maximum. -# -# """ -# if computing_meta: -# return x -# -# x = active_reduction(x, "max", **kwargs) -# return {"N": a["n"], "max": a["max"]} -# -# -# def active_mean(a, **kwargs): -# """Chunk function for mean values computed by active storage. -# -# Converts active storage reduction components to the components -# expected by the reduction combine and aggregate functions. -# -# This function is intended to be passed to `dask.array.reduction` -# as the ``chunk`` parameter. Its returned value must be the same as -# the non-active chunk function that it is replacing. -# -# .. versionadded:: NEXTVERSION -# -# .. seealso:: `actify`, `active_storage` -# -# :Parameters: -# -# a: `dict` -# The components output from the active storage -# reduction. For instance: -# -# >>> print(a) -# {'sum': array([[[1.5131907e+09]]], dtype=float32), 'n': 1015808} -# -# :Returns: -# -# `dict` -# Dictionary with the keys: -# -# * N: The sample size. -# * V1: The sum of ``weights``. Always equal to ``N`` -# because weights have not been set. -# * sum: The un-weighted sum. -# * weighted: True if weights have been set. Always -# False. -# -# """ -# if computing_meta: -# return x -# -# x = active_reduction(x, "mean", **kwargs) -# return {"N": a["n"], "V1": a["n"], "sum": a["sum"], "weighted": False} -# -# -# def active_sum(a, **kwargs): -# """Chunk function for sum values computed by active storage. -# -# Converts active storage reduction components to the components -# expected by the reduction combine and aggregate functions. -# -# This function is intended to be passed to `dask.array.reduction` -# as the ``chunk`` parameter. Its returned value must be the same as -# the non-active chunk function that it is replacing. -# -# .. versionadded:: NEXTVERSION -# -# .. seealso:: `actify`, `active_storage` -# -# :Parameters: -# -# a: `dict` -# The components output from the active storage -# reduction. For instance: -# -# >>> print(a) -# {'sum': array([[[1.5131907e+09]]], dtype=float32), 'n': 1015808} -# -# :Returns: -# -# `dict` -# Dictionary with the keys: -# -# * N: The sample size. -# * sum: The un-weighted sum. -# -# """ -# if computing_meta: -# return x -# -# x = active_reduction(x, "sum", **kwargs) -# return {"N": a["n"], "sum": a["sum"]} - - -# -------------------------------------------------------------------- -# Create a map of reduction methods to their corresponding active -# functions -# -------------------------------------------------------------------- -# active_chunk_functions = { -# "min": True, #active_min, -# "max": active_max, -# "mean": active_mean, -# "sum": active_sum, -# } - - def actify(a, method, axis=None): """Modify a dask array to use active storage reductions. @@ -352,6 +185,9 @@ def actify(a, method, axis=None): # The elements are traversed in reverse order so that the data # defintions come out first, allowing for the potential of a # faster short circuit when using active storage is not possible. + # + # Performance: The optimisation is essential, but can be slow for + # complicated graphs. 
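# A standalone sketch of the traversal pattern described above: walk
# the optimised dask graph in reverse and pick out values that look
# like file-backed data definitions. Only `collections_to_dsk` is a
# real dask utility here; the real `actify` additionally short
# circuits and rewrites the matching graph values in place.
import dask.array as da
import numpy as np
from dask.base import collections_to_dsk

def file_backed_values(a):
    """Yield (key, value) graph entries that look like file-backed data."""
    dsk = collections_to_dsk((a,), optimize_graph=True)
    for key, value in reversed(dsk.items()):
        if hasattr(value, "get_filename"):
            yield key, value

x = da.from_array(np.arange(12), chunks=4)
print(list(file_backed_values(x)))  # [] for a purely in-memory array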
url = str(active_storage_url()) ok_to_actify = True dsk = collections_to_dsk((a,), optimize_graph=True) @@ -395,12 +231,14 @@ def actify(a, method, axis=None): return da.Array(dsk, a.name, a.chunks, a.dtype, a._meta) +# -------------------------------------------------------------------- +# Decoators +# -------------------------------------------------------------------- def active_storage(method): - """A decorator that enables active storage reductions. + """Decorator for active storage reductions on `Collapse` methods. - This decorator is intended for `Collapse` methods. When a - `Collapse` method is decorated, active storage operations are only - carried out when the conditions are right. + When a `Collapse` method is decorated, active storage operations + are carried out if the conditions are right. .. versionadded:: NEXTVERSION @@ -409,9 +247,9 @@ def active_storage(method): :Parameters: method: `str` - The name of the reduction method. If it is not one of the - keys of the `active_chunk_functions` dictionary then - active storage reductions will not occur. + The name of the reduction method. If it is one of the + `active_chunk_methods` then active storage reductions + *might* occur. """ @@ -419,38 +257,72 @@ def decorator(collapse_method): @wraps(collapse_method) def wrapper(self, *args, **kwargs): if ( - cf_active_storage() - and Active is not None - and kwargs.get("active_storage") + Active is not None and method in active_reduction_methods + and kwargs.get("active_storage") and kwargs.get("weights") is None and kwargs.get("chunk_function") is None + and cf_active_storage() and active_storage_url() ): - # Attempt to actify the dask array and provide a new - # chunk function + # Attempt to actify the dask array + args = list(args) if args: - dask_array = args[0] + dask_array = args.pop(0) else: dask_array = kwargs.pop("a") - # dask_array, chunk_function = actify( dask_array = actify( dask_array, method=method, axis=kwargs.get("axis"), ) - args = list(args) - args[0] = dask_array + args.insert(0, dask_array) - # if chunk_function is not None: - # # The dask array has been actified, so update the - # # chunk function. - # kwargs["chunk_function"] = chunk_function - - # Create the collapse + # Run the collapse method return collapse_method(self, *args, **kwargs) return wrapper return decorator + + +def active_storage_chunk(method): + """Decorator for active storage reductions on chunks. + + Intended for the ``cf_*_chunk`` methods in + cf.data.collapse.dask_collapse`. + + .. versionadded:: NEXTVERSION + + :Parameters: + + method: `str` + The name of the reduction method. If it is one of the + `active_chunk_methods` then active storage reductions + *might* occur. + + """ + + def decorator(chunk): + @wraps(chunk) + def wrapper(*args, **kwargs): + if ( + Active is not None + and method in active_reduction_methods + and cf_active_storage() + and active_storage_url() + ): + try: + # Try doing an active storage reduction + return active_chunk(method, *args, **kwargs) + except ValueError: + pass + + # Still here? Then we couldn't do an active storage + # reduction, so we'll do a local one. 
+ return chunk(*args, **kwargs) + + return wrapper + + return decorator diff --git a/cf/data/collapse/dask_collapse.py b/cf/data/collapse/dask_collapse.py index 86b9006bc0..a24c9dcd64 100644 --- a/cf/data/collapse/dask_collapse.py +++ b/cf/data/collapse/dask_collapse.py @@ -15,7 +15,7 @@ from dask.utils import deepmap from ..dask_utils import cf_asanyarray -from .collapse_active import active_reduction +from .collapse_active import active_storage_chunk from .collapse_utils import double_precision_dtype @@ -230,6 +230,7 @@ def sum_sample_sizes(pairs, axis, computing_meta=False, **kwargs): # -------------------------------------------------------------------- # mean # -------------------------------------------------------------------- +@active_storage_chunk("mean") def cf_mean_chunk( x, weights=None, @@ -243,9 +244,6 @@ def cf_mean_chunk( This function is passed to `dask.array.reduction` as its *chunk* parameter. - If ``x.actified`` exists and is `True` then the calculations are - done in active storage. - .. versionadded:: 3.14.0 :Parameters: @@ -273,21 +271,15 @@ def cf_mean_chunk( if computing_meta: return x - # if getattr(x, 'actified', False): - try: - return active_reduction(x, "mean", weights=weights, **kwargs) - except ValueError: - pass - x = cf_asanyarray(x) if weights is not None: weights = cf_asanyarray(weights) # N, sum - d = cf_sum_chunk(x, weights, dtype=dtype, **kwargs) + d = cf_sum_chunk(x, weights=weights, dtype=dtype, **kwargs) d["V1"] = sum_weights_chunk( - x, weights, N=d["N"], check_weights=False, **kwargs + x, weights=weights, N=d["N"], check_weights=False, **kwargs ) d["weighted"] = weights is not None @@ -379,15 +371,13 @@ def cf_mean_agg( # -------------------------------------------------------------------- # maximum # -------------------------------------------------------------------- +@active_storage_chunk("max") def cf_max_chunk(x, dtype=None, computing_meta=False, **kwargs): """Chunk calculations for the maximum. This function is passed to `dask.array.reduction` as its *chunk* parameter. - If ``x.actified`` exists and is `True` then the calculations are - done in active storage. - .. versionadded:: 3.14.0 :Parameters: @@ -400,15 +390,12 @@ def cf_max_chunk(x, dtype=None, computing_meta=False, **kwargs): Dictionary with the keys: * N: The sample size. - * max: The maximum of `x``. + * max: The maximum of `x`. """ if computing_meta: return x - if getattr(x, "actified", False): - return active_reduction(x, "max", **kwargs) - x = cf_asanyarray(x) return { "max": chunk.max(x, **kwargs), @@ -537,15 +524,13 @@ def cf_mid_range_agg( # -------------------------------------------------------------------- # minimum # -------------------------------------------------------------------- +@active_storage_chunk("min") def cf_min_chunk(x, dtype=None, computing_meta=False, **kwargs): """Chunk calculations for the minimum. This function is passed to `dask.array.reduction` as its *chunk* parameter. - If ``x.actified`` exists and is `True` then the calculations are - done in active storage. - .. 
versionadded:: 3.14.0 :Parameters: @@ -564,9 +549,6 @@ def cf_min_chunk(x, dtype=None, computing_meta=False, **kwargs): if computing_meta: return x - if getattr(x, "actified", False): - return active_reduction(x, "min", **kwargs) - x = cf_asanyarray(x) return { "min": chunk.min(x, **kwargs), @@ -647,6 +629,7 @@ def cf_min_agg( # -------------------------------------------------------------------- # range # -------------------------------------------------------------------- +@active_storage_chunk("range") def cf_range_chunk(x, dtype=None, computing_meta=False, **kwargs): """Chunk calculations for the range. @@ -666,7 +649,7 @@ def cf_range_chunk(x, dtype=None, computing_meta=False, **kwargs): * N: The sample size. * min: The minimum of ``x``. - * max: The maximum of ``x`. + * max: The maximum of ``x``. """ x = cf_asanyarray(x) @@ -759,6 +742,7 @@ def cf_range_agg( # -------------------------------------------------------------------- # root mean square # -------------------------------------------------------------------- +@active_storage_chunk("rms") def cf_rms_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): """Chunk calculations for the root mean square (RMS). @@ -837,6 +821,7 @@ def cf_rms_agg( # -------------------------------------------------------------------- # sample size # -------------------------------------------------------------------- +@active_storage_chunk("sample_size") def cf_sample_size_chunk(x, dtype="i8", computing_meta=False, **kwargs): """Chunk calculations for the sample size. @@ -949,6 +934,7 @@ def cf_sample_size_agg( # -------------------------------------------------------------------- # sum # -------------------------------------------------------------------- +@active_storage_chunk("sum") def cf_sum_chunk( x, weights=None, @@ -962,9 +948,6 @@ def cf_sum_chunk( This function is passed to `dask.array.reduction` as its *chunk* parameter. - If ``x.actified`` exists and is `True` then the calculations are - done in active storage. - .. versionadded:: 3.14.0 :Parameters: @@ -990,9 +973,6 @@ def cf_sum_chunk( if computing_meta: return x - if getattr(x, "actified", False): - return active_reduction(x, "sum", weights=weights, **kwargs) - x = cf_asanyarray(x) if weights is not None: weights = cf_asanyarray(weights) @@ -1088,8 +1068,9 @@ def cf_sum_agg( # -------------------------------------------------------------------- # sum of weights # -------------------------------------------------------------------- +@active_storage_chunk("sum_of_weights") def cf_sum_of_weights_chunk( - x, weights=None, dtype="f8", computing_meta=False, square=False, **kwargs + x, weights=None, dtype="f8", computing_meta=False, **kwargs ): """Chunk calculations for the sum of the weights. @@ -1098,10 +1079,6 @@ def cf_sum_of_weights_chunk( :Parameters: - square: `bool`, optional - If True then calculate the sum of the squares of the - weights. - See `dask.array.reductions` for details of the other parameters. @@ -1111,12 +1088,52 @@ def cf_sum_of_weights_chunk( Dictionary with the keys: * N: The sample size. - * sum: The sum of ``weights``, or the sum of - ``weights**2`` if *square* is True. + * sum: The sum of ``weights``. 
""" x = cf_asanyarray(x) + if computing_meta: + return x + + # N + d = cf_sample_size_chunk(x, **kwargs) + d["sum"] = sum_weights_chunk( + x, weights=weights, square=False, N=d["N"], **kwargs + ) + + return d + + +# -------------------------------------------------------------------- +# sum of squares of weights +# -------------------------------------------------------------------- +@active_storage_chunk("sum_of_weights2") +def cf_sum_of_weights2_chunk( + x, weights=None, dtype="f8", computing_meta=False, **kwargs +): + """Chunk calculations for the sum of the squares of the weights. + + This function is passed to `dask.array.reduction` as its *chunk* + parameter. + + .. versionadded:: NEXTRELEASE + + :Parameters: + + See `dask.array.reductions` for details of the other + parameters. + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + * sum: The sum of the squares of ``weights``. + + """ + x = cf_asanyarray(x) if computing_meta: return x @@ -1124,7 +1141,7 @@ def cf_sum_of_weights_chunk( d = cf_sample_size_chunk(x, **kwargs) d["sum"] = sum_weights_chunk( - x, weights=weights, square=square, N=d["N"], **kwargs + x, weights=weights, square=True, N=d["N"], **kwargs ) return d @@ -1133,6 +1150,7 @@ def cf_sum_of_weights_chunk( # -------------------------------------------------------------------- # unique # -------------------------------------------------------------------- +@active_storage_chunk("unique") def cf_unique_chunk(x, dtype=None, computing_meta=False, **kwargs): """Chunk calculations for the unique values. @@ -1196,6 +1214,7 @@ def cf_unique_agg(pairs, axis=None, computing_meta=False, **kwargs): # -------------------------------------------------------------------- # variance # -------------------------------------------------------------------- +@active_storage_chunk("var") def cf_var_chunk( x, weights=None, dtype="f8", computing_meta=False, ddof=None, **kwargs ): @@ -1249,7 +1268,7 @@ def cf_var_chunk( weights = cf_asanyarray(weights) # N, V1, sum - d = cf_mean_chunk(x, weights, dtype=dtype, **kwargs) + d = cf_mean_chunk(x, weights=weights, dtype=dtype, **kwargs) wsum = d["sum"] V1 = d["V1"] @@ -1267,7 +1286,7 @@ def cf_var_chunk( if weighted and ddof == 1: d["V2"] = sum_weights_chunk( - x, weights, square=True, check_weights=False, **kwargs + x, weights=weights, square=True, check_weights=False, **kwargs ) else: d["V2"] = None diff --git a/cf/data/data.py b/cf/data/data.py index 515b5fb15b..8e0be09e24 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -908,7 +908,7 @@ def __getitem__(self, indices): ) dx = new.to_dask_array(asanyarray=False) else: - new = self.copy(array=False) + new = self.copy() dx = self.to_dask_array(asanyarray=False) # ------------------------------------------------------------ diff --git a/cf/data/utils.py b/cf/data/utils.py index 1d605ee120..00bb88785b 100644 --- a/cf/data/utils.py +++ b/cf/data/utils.py @@ -873,7 +873,9 @@ def collapse( kwargs["ddof"] = ddof # The applicable chunk function will have its own call to - # 'cf_asanyarray', so we can set 'asanyarray=False'. + # 'cf_asanyarray', so we can set 'asanyarray=False'. Also, setting + # asanyarray=False will ensure that any active storage operations + # are not compromised. 
dx = d.to_dask_array(asanyarray=False) dx = func(dx, **kwargs) d._set_dask(dx) diff --git a/cf/field.py b/cf/field.py index 8f4794f82c..ac4084664b 100644 --- a/cf/field.py +++ b/cf/field.py @@ -7193,22 +7193,41 @@ def collapse( if dim is None: continue - # Create new dimension coordinate bounds + # Create new dimension coordinate bounds (lazily) if dim.has_bounds(): b = dim.bounds.data else: b = dim.data - # Note: Accessing first_element and last_element is - # likely to be fast for dat one disk, assuming - # that these values were cached during the read. - bounds_data = Data( - [[b.first_element(), b.last_element()]], - dtype=b.dtype, - units=b.Units, - ) + try: + # Set the new bounds from cached values + bounds_data = Data( + [ + [ + self._custom["cached_elements"][0], + self._custom["cached_elements"][-1], + ] + ], + dtype=b.dtype, + units=b.Units, + ) + except KeyError: + # Set the new bounds lazily + ndim = b.ndim + bounds_data = Data.concatenate( + [ + b[(slice(0, 1, 1),) * ndim], + b[(slice(-1, None, 1),) * ndim], + ], + axis=-1, + copy=False, + ) + if ndim == 1: + bounds_data.insert_dimension(0, inplace=True) + bounds = self._Bounds(data=bounds_data) + # Create a new dimension coordinate value if coordinate == "min": coordinate = "minimum" print( From 2a162422073db7ddac73642c45619b30f4c6e51b Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 20 Mar 2024 10:22:55 +0000 Subject: [PATCH 068/134] dev --- cf/constants.py | 1 - cf/data/array/mixin/indexmixin.py | 54 +++++++++++------- cf/functions.py | 93 +++---------------------------- cf/read_write/read.py | 20 ++++--- cf/test/test_NetCDF4Array.py | 9 +++ cf/test/test_functions.py | 3 +- 6 files changed, 64 insertions(+), 116 deletions(-) diff --git a/cf/constants.py b/cf/constants.py index f4baaac2d1..0b8e12ecfd 100644 --- a/cf/constants.py +++ b/cf/constants.py @@ -65,7 +65,6 @@ "CHUNKSIZE": parse_bytes(_CHUNKSIZE), "active_storage": False, "active_storage_url": None, - "netcdf_lock": True, } masked = np.ma.masked diff --git a/cf/data/array/mixin/indexmixin.py b/cf/data/array/mixin/indexmixin.py index ebd7e5a41e..01f53e313a 100644 --- a/cf/data/array/mixin/indexmixin.py +++ b/cf/data/array/mixin/indexmixin.py @@ -7,7 +7,28 @@ class IndexMixin: - """Mixin class for lazy subspacing of a data array. + """Mixin class for lazy indexing of a data array. + + A data for a subspace it retrieved by casting the `{{class}}` as a + `numpy` array: + + >>> a = cf.{{class}}(....) + >>> a.shape + (6, 5) + >>> print(np.asanyarray(a) + [[ 0 1 2 3 4]) + [ 5 6 7 8 9] + [10 11 12 13 14] + [15 16 17 18 19] + [20 21 22 23 24] + [25 26 27 28 29]] + >>> a = a[::2, [1, 3, 4]] + >>> a = a[[False, True, True], 1:] + >>> a.shape + (2, 2) + >>> print(np.asanyarray(a)) + [[13 14] + [23 24]] .. versionadded:: NEXTVERSION @@ -54,15 +75,6 @@ def __getitem__(self, index): 1]]`` will be retrieved from the data when `__array__` is called. - Indexing is similar to `numpy` indexing. The only difference - to `numpy` indexing (given the restrictions on the type of - indices allowed) is: - - * When two or more dimension's indices are sequences of - integers then these indices work independently along each - dimension (similar to the way vector subscripts work in - Fortran). - .. versionadded:: NEXTVERSION .. 
seealso:: `index`, `original_shape`, `__array__`, @@ -164,8 +176,10 @@ def __repr__(self): x.__repr__() <==> repr(x) """ - out = super().__repr__() - return f"{out[:-1]}{self.original_shape}>" + return ( + f"" + ) @property def __asanyarray__(self): @@ -181,7 +195,7 @@ def __asanyarray__(self): return True def _get_array(self, index=None): - """Returns a subspace of the data. + """Returns a subspace of the data as a `numpy` array. .. versionadded:: NEXTVERSION @@ -190,8 +204,8 @@ def _get_array(self, index=None): :Parameters: index: `tuple` or `None`, optional - Provide the indices that define the subspace. If `None` - then the `index` attribute is used. + Provide the indices that define the subspace. If + `None` then the `index` attribute is used. :Returns: @@ -252,8 +266,10 @@ def original_shape(self): .. seealso:: `index`, `shape` """ - shape = self._custom.get("original_shape") - if shape is None: - self._custom["original_shape"] = self.shape + out = self._custom.get("original_shape") + if out is None: + # If shape is None then no subspace has been defined + out = self.shape + self._custom["original_shape"] = out - return shape + return out diff --git a/cf/functions.py b/cf/functions.py index 001fa66866..aebb3267c6 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -173,7 +173,6 @@ def configuration( bounds_combination_mode=None, active_storage=None, active_storage_url=None, - netcdf_lock=None, of_fraction=None, collapse_parallel_mode=None, free_memory_factor=None, @@ -194,7 +193,6 @@ def configuration( * `bounds_combination_mode` * `active_storage` * `active_storage_url` - * `netcdf_lock` These are all constants that apply throughout cf, except for in specific functions only if overridden by the corresponding keyword @@ -214,7 +212,7 @@ def configuration( .. seealso:: `atol`, `rtol`, `tempdir`, `chunksize`, `total_memory`, `log_level`, `regrid_logging`, `relaxed_identities`, `bounds_combination_mode`, - `active_storage`, `active_storage_url`, `netcdf_lock` + `active_storage`, `active_storage_url` :Parameters: @@ -279,14 +277,6 @@ def configuration( .. versionadded:: NEXTVERSION - netcdf_lock: `bool` or `Constant`, optional - The new value. If True then all netCDF file access - coordinates around the same lock, thereby preventing - concurrent reads. If False the concurrent reads are - allowed. The default is to not change the current value. - - .. versionadded:: NEXTVERSION - of_fraction: `float` or `Constant`, optional Deprecated at version 3.14.0 and is no longer available. @@ -318,8 +308,7 @@ def configuration( 'bounds_combination_mode': 'AND', 'chunksize': 82873466.88000001, 'active_storage': False, - 'active_storage_url': None, - 'netcdf_lock': True} + 'active_storage_url': None} >>> cf.chunksize(7.5e7) # any change to one constant... 
82873466.88000001 >>> cf.configuration()['chunksize'] # ...is reflected in the configuration @@ -335,8 +324,7 @@ def configuration( 'bounds_combination_mode': 'AND', 'chunksize': 75000000.0, 'active_storage': False, - 'active_storage_url': None, - 'netcdf_lock': True} + 'active_storage_url': None} >>> cf.configuration() # the items set have been updated accordingly {'rtol': 2.220446049250313e-16, 'atol': 2.220446049250313e-16, @@ -347,8 +335,7 @@ def configuration( 'bounds_combination_mode': 'AND', 'chunksize': 75000000.0, 'active_storage': False, - 'active_storage_url': None, - 'netcdf_lock': True} + 'active_storage_url': None} Use as a context manager: @@ -362,8 +349,7 @@ def configuration( 'bounds_combination_mode': 'AND', 'chunksize': 75000000.0, 'active_storage': False, - 'active_storage_url': None, - 'netcdf_lock': True} + 'active_storage_url': None} >>> with cf.configuration(atol=9, rtol=10): ... print(cf.configuration()) ... @@ -376,8 +362,7 @@ def configuration( 'bounds_combination_mode': 'AND', 'chunksize': 75000000.0, 'active_storage': False, - 'active_storage_url': None, - 'netcdf_lock': True} + 'active_storage_url': None} >>> print(cf.configuration()) {'rtol': 2.220446049250313e-16, 'atol': 2.220446049250313e-16, @@ -388,8 +373,7 @@ def configuration( 'bounds_combination_mode': 'AND', 'chunksize': 75000000.0, 'active_storage': False, - 'active_storage_url': None, - 'netcdf_lock': True} + 'active_storage_url': None} """ if of_fraction is not None: @@ -422,7 +406,6 @@ def configuration( bounds_combination_mode=bounds_combination_mode, active_storage=active_storage, active_storage_url=active_storage_url, - netcdf_lock=netcdf_lock, ) @@ -474,7 +457,6 @@ def _configuration(_Configuration, **kwargs): "bounds_combination_mode": bounds_combination_mode, "active_storage": active_storage, "active_storage_url": active_storage_url, - "netcdf_lock": netcdf_lock, } old_values = {} @@ -1322,67 +1304,6 @@ def _parse(cls, arg): return str(arg) -class netcdf_lock(ConstantAccess): - """Whether or not allow concurrent netCDF read access. - - .. versionadded:: NEXTVERSION - - :Parameters: - - arg: `bool` or `Constant`, optional - Provide a value that will apply to all subsequent - operations. - - :Returns: - - `Constant` - The value prior to the change, or the current value if no - new value was specified. - - **Examples** - - >>> print(cf.netcdf_lock()) - True - >>> with cf.netcdf_lock(False): - ... print(cf.netcdf_lock()) - ... - False - >>> print(cf.netcdf_lock()) - True - >>> print(cf.netcdf_lock(False)) - True - >>> cf.netcdf_lock() - False - - """ - - _name = "netcdf_lock" - - def _parse(cls, arg): - """Parse a new constant value. - - .. versionaddedd:: NEXTVERSION - - :Parameters: - - cls: - This class. - - arg: - The given new constant value. - - :Returns: - - A version of the new constant value suitable for - insertion into the `CONSTANTS` dictionary. - - """ - if arg is None: - return arg - - return bool(arg) - - def CF(): """The version of the CF conventions. diff --git a/cf/read_write/read.py b/cf/read_write/read.py index 679d58fb38..d46d2d91ac 100644 --- a/cf/read_write/read.py +++ b/cf/read_write/read.py @@ -678,20 +678,24 @@ def read( .. versionadded:: 3.15.0 - netCDF_backend: `None` or `str`, optional + engine: `None` or `str`, optional + Specify which library to use for opening netCDF files. By default, or if `None`, then `netCDF4` will used unless it - fails to open a given file, in which case `h5netcdf` will - be used instead. 
Setting *netCDF_backend* to ``'netCDF4'`` - or ``'h5netcdf'`` will force the use of the `netCDF4` or - `h5netcdf` libraries respectively. + fails to open a given netCDF file, in which case + `h5netcdf` will be used instead. Setting *engine* to + ``'netCDF4'`` or ``'h5netcdf'`` will force the use of the + `netCDF4` or `h5netcdf` libraries respectively. + + .. note:: Using `h5netcdf` may restrictshas limited https://docs.h5py.org/en/stable/high/dataset.html#reading-writing-data .. note:: The *netCDF_backend* parameter does not affect the opening of netCDF fragment files that define the data of aggregated variables. For these, - `netCDF4` is used for local files and those - accessed via OPeNDAP, and `h5netcdf` is used for - fragment files in S3 object stores. + `netCDF4` is always used for local files and + those accessed via OPeNDAP, and `h5netcdf` is + always used for fragment files in S3 object + stores. .. versionadded:: NEXTVERSION diff --git a/cf/test/test_NetCDF4Array.py b/cf/test/test_NetCDF4Array.py index 4a5de427ad..c3a97b77c8 100644 --- a/cf/test/test_NetCDF4Array.py +++ b/cf/test/test_NetCDF4Array.py @@ -129,6 +129,15 @@ def test_NetCDF4Array_multiple_files(self): self.assertEqual(len(n.get_filenames()), 2) self.assertTrue((n[...] == f.array).all()) + def test_NetCDF4Array_shape(self): + shape = (12, 96, 73) + a = cf.NetCDF4Array("/home/file2", "tas", shape=shape) + self.assertEqual(a.shape, shape) + self.assertEqual(a.original_shape, shape) + a = a[::2] + self.assertEqual(a.shape, (shape[0] // 2,) + shape[1:]) + self.assertEqual(a.original_shape, shape) + if __name__ == "__main__": print("Run date:", datetime.datetime.now()) diff --git a/cf/test/test_functions.py b/cf/test/test_functions.py index 10e36f5f87..9c015f43ba 100644 --- a/cf/test/test_functions.py +++ b/cf/test/test_functions.py @@ -54,7 +54,7 @@ def test_configuration(self): self.assertIsInstance(org, dict) # Check all keys that should be there are, with correct value type: - self.assertEqual(len(org), 11) # update expected len if add new key(s) + self.assertEqual(len(org), 10) # update expected len if add new key(s) # Types expected: self.assertIsInstance(org["atol"], float) @@ -86,7 +86,6 @@ def test_configuration(self): "chunksize": 8e9, "active_storage": True, "active_storage_url": None, - "netcdf_lock": True, } # Test the setting of each lone item. From b3907b28721b1d96cb0dbe9b1d13fb2297022085 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 20 Mar 2024 12:38:57 +0000 Subject: [PATCH 069/134] dev --- cf/data/array/mixin/indexmixin.py | 36 +++++++++++++++++++++---------- cf/field.py | 10 +++------ cf/read_write/read.py | 8 ++++++- 3 files changed, 35 insertions(+), 19 deletions(-) diff --git a/cf/data/array/mixin/indexmixin.py b/cf/data/array/mixin/indexmixin.py index 01f53e313a..d56e9f6922 100644 --- a/cf/data/array/mixin/indexmixin.py +++ b/cf/data/array/mixin/indexmixin.py @@ -90,7 +90,7 @@ def __getitem__(self, index): index0 = self.index original_shape = self.original_shape - index = parse_indices(shape0, index, keepdims=False) + index1 = parse_indices(shape0, index, keepdims=False) new = self.copy() new_indices = [] @@ -98,25 +98,27 @@ def __getitem__(self, index): i = 0 for ind0, original_size in zip(index0, original_shape): + # If a previous call to __getitem__ resulted in a + # dimension being subspaced to and size 1 *and* removed + # (i.e. 
'ind0' is integer-valued) then 'index1' will have + # fewer elements than 'index0' if isinstance(ind0, Integral): - # This dimension has been previously removed by the - # integer index 'ind0' new_indices.append(ind0) continue - # 'index' might have fewer elements than 'index0' - ind1 = index[i] + ind1 = index1[i] size0 = shape0[i] i += 1 + # If this dimension is not subspaced by the new index then + # we don't need to update the old index. if isinstance(ind1, slice) and ind1 == slice(None): - # This dimension is not subspaced new_indices.append(ind0) continue - # Still here? Then we have to work out the the subspace of - # the full array implied by applying both 'ind0' and - # 'ind1'. + # Still here? Then we have to work out the subspace of the + # full array implied by applying both 'ind0' + # and 'ind1'. if is_dask_collection(ind1): # Note: This will never occur when __getitem__ is # being called from within a Dask graph, because @@ -124,7 +126,7 @@ def __getitem__(self, index): # computed as part of the whole graph execution; # i.e. we don't have to worry about a # compute-within-a-compute situation. (If this - # were not the case then we could get round it + # were not the case then we would get round it # by wrapping the compute inside a `with # dask.config.set({"scheduler": # "synchronous"}):` clause.) @@ -159,12 +161,24 @@ def __getitem__(self, index): # ind1: int, or array of int/bool new_index = np.arange(*ind0.indices(original_size))[ind1] else: - # ind0: array of int + # ind0: array of int (if we made it here, then it + # can't be anything else) new_index = np.asanyarray(ind0)[ind1] new_indices.append(new_index) + # Find the shape implied by the new indices new_shape = indices_shape(new_indices, original_shape, keepdims=False) + + # Decreasing slices are not universally accepted (e.g. `h5py` + # doesn't like them), but at least we can convert a size 1 + # decreasing slice to an increasing one. + for i, (size, ind) in enumerate(zip(new_shape, new_indices[:])): + if size == 1 and isinstance(ind, slice): + start, _, step = ind.indices(size) + if step and step < 0: + new_indices[i] = slice(start, start + 1) + new._set_component("shape", tuple(new_shape), copy=False) new._custom["index"] = tuple(new_indices) diff --git a/cf/field.py b/cf/field.py index ac4084664b..af9d0778a8 100644 --- a/cf/field.py +++ b/cf/field.py @@ -7199,15 +7199,11 @@ def collapse( else: b = dim.data + cached_elements = b._get_cached_elements() try: - # Set the new bounds from cached values + # Try to set the new bounds from cached values bounds_data = Data( - [ - [ - self._custom["cached_elements"][0], - self._custom["cached_elements"][-1], - ] - ], + [[cached_elements[0], cached_elements[-1]]], dtype=b.dtype, units=b.Units, ) diff --git a/cf/read_write/read.py b/cf/read_write/read.py index d46d2d91ac..c1dbf92f74 100644 --- a/cf/read_write/read.py +++ b/cf/read_write/read.py @@ -687,7 +687,13 @@ def read( ``'netCDF4'`` or ``'h5netcdf'`` will force the use of the `netCDF4` or `h5netcdf` libraries respectively. - .. note:: Using `h5netcdf` may restrictshas limited https://docs.h5py.org/en/stable/high/dataset.html#reading-writing-data + .. note:: `h5netcdf` restricts the types of indices that + define subspaces of its data. See the `h5py` + documentaiton at https://docs.h5py.org for + details. However, such indices on a returned + `Field` may be possible if they are followed by + further subspaces that imply acceptable indices + to the data in the file. .. 
note:: The *netCDF_backend* parameter does not affect the opening of netCDF fragment files that define From b8b52a745a77e64750e0449c59cd1d04e446993d Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 20 Mar 2024 13:16:53 +0000 Subject: [PATCH 070/134] dev --- cf/data/array/cfah5netcdfarray.py | 185 ----------------------------- cf/data/array/cfanetcdf4array.py | 185 ----------------------------- cf/data/array/h5netcdfarray.py | 12 +- cf/data/array/locks.py | 4 +- cf/data/array/mixin/cfamixin.py | 190 +++++++++++++++++++++++++++++- cf/data/array/netcdf4array.py | 12 +- cf/regrid/regrid.py | 6 +- cf/regrid/regridoperator.py | 6 +- 8 files changed, 203 insertions(+), 397 deletions(-) diff --git a/cf/data/array/cfah5netcdfarray.py b/cf/data/array/cfah5netcdfarray.py index 6f4efbdeaf..47c58bff06 100644 --- a/cf/data/array/cfah5netcdfarray.py +++ b/cf/data/array/cfah5netcdfarray.py @@ -8,188 +8,3 @@ class CFAH5netcdfArray(CFAMixin, H5netcdfArray): .. versionadded:: NEXTVERSION """ - - def __init__( - self, - filename=None, - address=None, - dtype=None, - mask=True, - unpack=True, - units=False, - calendar=False, - instructions=None, - substitutions=None, - term=None, - attributes=None, - storage_options=None, - source=None, - copy=True, - x=None, - ): - """**Initialisation** - - :Parameters: - - filename: (sequence of) `str`, optional - The name of the CFA-netCDF file containing the - array. If a sequence then it must contain one element. - - address: (sequence of) `str`, optional - The name of the CFA-netCDF aggregation variable for the - array. If a sequence then it must contain one element. - - dtype: `numpy.dtype` - The data type of the aggregated data array. May be - `None` if the numpy data-type is not known (which can - be the case for netCDF string types, for example). - - mask: `bool` - If True (the default) then mask by convention when - reading data from disk. - - A netCDF array is masked depending on the values of any of - the netCDF variable attributes ``valid_min``, - ``valid_max``, ``valid_range``, ``_FillValue`` and - ``missing_value``. - - {{init unpack: `bool`, optional}} - - .. versionadded:: NEXTVERSION - - units: `str` or `None`, optional - The units of the aggregated data. Set to `None` to - indicate that there are no units. - - calendar: `str` or `None`, optional - The calendar of the aggregated data. Set to `None` to - indicate the CF default calendar, if applicable. - - instructions: `str`, optional - The ``aggregated_data`` attribute value as found on - the CFA netCDF variable. If set then this will be used - to improve the performance of `__dask_tokenize__`. - - substitutions: `dict`, optional - A dictionary whose key/value pairs define text - substitutions to be applied to the fragment file - names. Each key must be specified with the ``${...}`` - syntax, for instance ``{'${base}': 'sub'}``. - - .. versionadded:: 3.15.0 - - term: `str`, optional - The name of a non-standard aggregation instruction - term from which the array is to be created, instead of - creating the aggregated data in the standard terms. If - set then *address* must be the name of the term's - CFA-netCDF aggregation instruction variable, which - must be defined on the fragment dimensions and no - others. Each value of the aggregation instruction - variable will be broadcast across the shape of the - corresponding fragment. - - *Parameter example:* - ``address='cfa_tracking_id', term='tracking_id'`` - - .. 
versionadded:: 3.15.0 - - storage_options: `dict` or `None`, optional - Key/value pairs to be passed on to the creation of - `s3fs.S3FileSystem` file systems to control the - opening of fragment files in S3 object stores. Ignored - for files not in an S3 object store, i.e. those whose - names do not start with ``s3:``. - - By default, or if `None`, then *storage_options* is - taken as ``{}``. - - If the ``'endpoint_url'`` key is not in - *storage_options* or is not in a dictionary defined by - the ``'client_kwargs`` key (which is always the case - when *storage_options* is `None`), then one will be - automatically inserted for accessing a fragment S3 - file. For example, for a file name of - ``'s3://store/data/file.nc'``, an ``'endpoint_url'`` - key with value ``'https://store'`` would be created. - - *Parameter example:* - ``{'key: 'scaleway-api-key...', 'secret': - 'scaleway-secretkey...', 'endpoint_url': - 'https://s3.fr-par.scw.cloud', 'client_kwargs': - {'region_name': 'fr-par'}}`` - - .. versionadded:: NEXTVERSION - - {{init source: optional}} - - {{init copy: `bool`, optional}} - - """ - if source is not None: - super().__init__(source=source, copy=copy) - - try: - fragment_shape = source.get_fragment_shape() - except AttributeError: - fragment_shape = None - - try: - instructions = source._get_component("instructions") - except AttributeError: - instructions = None - - try: - aggregated_data = source.get_aggregated_data(copy=False) - except AttributeError: - aggregated_data = {} - - try: - substitutions = source.get_substitutions() - except AttributeError: - substitutions = None - - try: - term = source.get_term() - except AttributeError: - term = None - - elif filename is not None: - shape, fragment_shape, aggregated_data = self._parse( - x, term, substitutions - ) - super().__init__( - filename=filename, - address=address, - shape=shape, - dtype=dtype, - mask=mask, - units=units, - calendar=calendar, - copy=copy, - ) - else: - super().__init__( - filename=filename, - address=address, - dtype=dtype, - mask=mask, - units=units, - calendar=calendar, - copy=copy, - ) - - fragment_shape = None - aggregated_data = None - instructions = None - term = None - - self._set_component("fragment_shape", fragment_shape, copy=False) - self._set_component("aggregated_data", aggregated_data, copy=False) - self._set_component("instructions", instructions, copy=False) - self._set_component("term", term, copy=False) - - if substitutions is not None: - self._set_component( - "substitutions", substitutions.copy(), copy=False - ) diff --git a/cf/data/array/cfanetcdf4array.py b/cf/data/array/cfanetcdf4array.py index dddf3411cb..b3b6b69d7a 100644 --- a/cf/data/array/cfanetcdf4array.py +++ b/cf/data/array/cfanetcdf4array.py @@ -8,188 +8,3 @@ class CFANetCDF4Array(CFAMixin, NetCDF4Array): .. versionadded:: NEXTVERSION """ - - def __init__( - self, - filename=None, - address=None, - dtype=None, - mask=True, - unpack=True, - units=False, - calendar=False, - instructions=None, - substitutions=None, - term=None, - attributes=None, - storage_options=None, - source=None, - copy=True, - x=None, - ): - """**Initialisation** - - :Parameters: - - filename: (sequence of) `str`, optional - The name of the CFA-netCDF file containing the - array. If a sequence then it must contain one element. - - address: (sequence of) `str`, optional - The name of the CFA-netCDF aggregation variable for the - array. If a sequence then it must contain one element. - - dtype: `numpy.dtype` - The data type of the aggregated data array. 
May be - `None` if the numpy data-type is not known (which can - be the case for netCDF string types, for example). - - mask: `bool` - If True (the default) then mask by convention when - reading data from disk. - - A netCDF array is masked depending on the values of any of - the netCDF variable attributes ``valid_min``, - ``valid_max``, ``valid_range``, ``_FillValue`` and - ``missing_value``. - - {{init unpack: `bool`, optional}} - - .. versionadded:: NEXTVERSION - - units: `str` or `None`, optional - The units of the aggregated data. Set to `None` to - indicate that there are no units. - - calendar: `str` or `None`, optional - The calendar of the aggregated data. Set to `None` to - indicate the CF default calendar, if applicable. - - instructions: `str`, optional - The ``aggregated_data`` attribute value as found on - the CFA netCDF variable. If set then this will be used - to improve the performance of `__dask_tokenize__`. - - substitutions: `dict`, optional - A dictionary whose key/value pairs define text - substitutions to be applied to the fragment file - names. Each key must be specified with the ``${...}`` - syntax, for instance ``{'${base}': 'sub'}``. - - .. versionadded:: 3.15.0 - - term: `str`, optional - The name of a non-standard aggregation instruction - term from which the array is to be created, instead of - creating the aggregated data in the standard - terms. If set then *address* must be the name of the - term's CFA-netCDF aggregation instruction variable, - which must be defined on the fragment dimensions and - no others. Each value of the aggregation instruction - variable will be broadcast across the shape of the - corresponding fragment. - - *Parameter example:* - ``address='cfa_tracking_id', term='tracking_id'`` - - .. versionadded:: 3.15.0 - - storage_options: `dict` or `None`, optional - Key/value pairs to be passed on to the creation of - `s3fs.S3FileSystem` file systems to control the - opening of fragment files in S3 object stores. Ignored - for files not in an S3 object store, i.e. those whose - names do not start with ``s3:``. - - By default, or if `None`, then *storage_options* is - taken as ``{}``. - - If the ``'endpoint_url'`` key is not in - *storage_options* or is not in a dictionary defined by - the ``'client_kwargs`` key (which is always the case - when *storage_options* is `None`), then one will be - automatically inserted for accessing a fragment S3 - file. For example, for a file name of - ``'s3://store/data/file.nc'``, an ``'endpoint_url'`` - key with value ``'https://store'`` would be created. - - *Parameter example:* - ``{'key: 'scaleway-api-key...', 'secret': - 'scaleway-secretkey...', 'endpoint_url': - 'https://s3.fr-par.scw.cloud', 'client_kwargs': - {'region_name': 'fr-par'}}`` - - .. 
versionadded:: NEXTVERSION - - {{init source: optional}} - - {{init copy: `bool`, optional}} - - """ - if source is not None: - super().__init__(source=source, copy=copy) - - try: - fragment_shape = source.get_fragment_shape() - except AttributeError: - fragment_shape = None - - try: - instructions = source._get_component("instructions") - except AttributeError: - instructions = None - - try: - aggregated_data = source.get_aggregated_data(copy=False) - except AttributeError: - aggregated_data = {} - - try: - substitutions = source.get_substitutions() - except AttributeError: - substitutions = None - - try: - term = source.get_term() - except AttributeError: - term = None - - elif filename is not None: - shape, fragment_shape, aggregated_data = self._parse_cfa( - x, term, substitutions - ) - super().__init__( - filename=filename, - address=address, - shape=shape, - dtype=dtype, - mask=mask, - units=units, - calendar=calendar, - copy=copy, - ) - else: - super().__init__( - filename=filename, - address=address, - dtype=dtype, - mask=mask, - units=units, - calendar=calendar, - copy=copy, - ) - - fragment_shape = None - aggregated_data = None - instructions = None - term = None - - self._set_component("fragment_shape", fragment_shape, copy=False) - self._set_component("aggregated_data", aggregated_data, copy=False) - self._set_component("instructions", instructions, copy=False) - self._set_component("term", term, copy=False) - - if substitutions is not None: - self._set_component( - "substitutions", substitutions.copy(), copy=False - ) diff --git a/cf/data/array/h5netcdfarray.py b/cf/data/array/h5netcdfarray.py index 8803406643..2b21854ff2 100644 --- a/cf/data/array/h5netcdfarray.py +++ b/cf/data/array/h5netcdfarray.py @@ -1,8 +1,7 @@ import cfdm -# from ...functions import netcdf_lock from ...mixin_container import Container -from .locks import _lock +from .locks import netcdf_lock from .mixin import ActiveStorageMixin, ArrayMixin, FileArrayMixin, IndexMixin @@ -47,10 +46,7 @@ def _lock(self): .. versionadded:: NEXTVERSION """ - # if netcdf_lock(): - return _lock - - # return False + return netcdf_lock def _get_array(self, index=None): """Returns a subspace of the dataset variable. @@ -77,8 +73,8 @@ def _get_array(self, index=None): if index is None: index = self.index - # Note: We need to use the lock because the netCDF file is - # going to be read. + # Note: We need to lock because the netCDF file is about to be + # accessed. self._lock.acquire() # Note: It's cfdm.H5netcdfArray.__getitem__ that we want to diff --git a/cf/data/array/locks.py b/cf/data/array/locks.py index 21255ec9b5..5a7b2bd333 100644 --- a/cf/data/array/locks.py +++ b/cf/data/array/locks.py @@ -1,4 +1,4 @@ from dask.utils import SerializableLock -# Global lock for file access -_lock = SerializableLock() +# Global lock for netCDF file access +netcdf_lock = SerializableLock() diff --git a/cf/data/array/mixin/cfamixin.py b/cf/data/array/mixin/cfamixin.py index 9e1627b55d..9ce84e6c20 100644 --- a/cf/data/array/mixin/cfamixin.py +++ b/cf/data/array/mixin/cfamixin.py @@ -8,7 +8,7 @@ class CFAMixin: - """Mixin class for a CFA-netCDF array. + """Mixin class for a CFA array. .. 
versionadded:: NEXTVERSION @@ -36,6 +36,190 @@ def __new__(cls, *args, **kwargs): } return instance + def __init__( + self, + filename=None, + address=None, + dtype=None, + mask=True, + unpack=True, + units=False, + calendar=False, + instructions=None, + substitutions=None, + term=None, + attributes=None, + storage_options=None, + source=None, + copy=True, + x=None, + ): + """**Initialisation** + + :Parameters: + + filename: (sequence of) `str`, optional + The name of the CFA file containing the array. If a + sequence then it must contain one element. + + address: (sequence of) `str`, optional + The name of the CFA aggregation variable for the + array. If a sequence then it must contain one element. + + dtype: `numpy.dtype` + The data type of the aggregated data array. May be + `None` if the numpy data-type is not known (which can + be the case for some string types, for example). + + mask: `bool` + If True (the default) then mask by convention when + reading data from disk. + + A array is masked depending on the values of any of + the variable attributes ``valid_min``, ``valid_max``, + ``valid_range``, ``_FillValue`` and ``missing_value``. + + {{init unpack: `bool`, optional}} + + .. versionadded:: NEXTVERSION + + units: `str` or `None`, optional + The units of the aggregated data. Set to `None` to + indicate that there are no units. + + calendar: `str` or `None`, optional + The calendar of the aggregated data. Set to `None` to + indicate the CF default calendar, if applicable. + + instructions: `str`, optional + The ``aggregated_data`` attribute value as found on + the CFA variable. If set then this will be used to + improve the performance of `__dask_tokenize__`. + + substitutions: `dict`, optional + A dictionary whose key/value pairs define text + substitutions to be applied to the fragment file + names. Each key must be specified with the ``${...}`` + syntax, for instance ``{'${base}': 'sub'}``. + + .. versionadded:: 3.15.0 + + term: `str`, optional + The name of a non-standard aggregation instruction + term from which the array is to be created, instead of + creating the aggregated data in the standard terms. If + set then *address* must be the name of the term's + aggregation instruction variable, which must be + defined on the fragment dimensions and no others. Each + value of the aggregation instruction variable will be + broadcast across the shape of the corresponding + fragment. + + *Parameter example:* + ``address='cfa_tracking_id', term='tracking_id'`` + + .. versionadded:: 3.15.0 + + storage_options: `dict` or `None`, optional + Key/value pairs to be passed on to the creation of + `s3fs.S3FileSystem` file systems to control the + opening of fragment files in S3 object stores. Ignored + for files not in an S3 object store, i.e. those whose + names do not start with ``s3:``. + + By default, or if `None`, then *storage_options* is + taken as ``{}``. + + If the ``'endpoint_url'`` key is not in + *storage_options* or is not in a dictionary defined by + the ``'client_kwargs`` key (which is always the case + when *storage_options* is `None`), then one will be + automatically inserted for accessing a fragment S3 + file. For example, for a file name of + ``'s3://store/data/file.nc'``, an ``'endpoint_url'`` + key with value ``'https://store'`` would be created. + + *Parameter example:* + ``{'key: 'scaleway-api-key...', 'secret': + 'scaleway-secretkey...', 'endpoint_url': + 'https://s3.fr-par.scw.cloud', 'client_kwargs': + {'region_name': 'fr-par'}}`` + + .. 
versionadded:: NEXTVERSION + + {{init source: optional}} + + {{init copy: `bool`, optional}} + + """ + if source is not None: + super().__init__(source=source, copy=copy) + + try: + fragment_shape = source.get_fragment_shape() + except AttributeError: + fragment_shape = None + + try: + instructions = source._get_component("instructions") + except AttributeError: + instructions = None + + try: + aggregated_data = source.get_aggregated_data(copy=False) + except AttributeError: + aggregated_data = {} + + try: + substitutions = source.get_substitutions() + except AttributeError: + substitutions = None + + try: + term = source.get_term() + except AttributeError: + term = None + + elif filename is not None: + shape, fragment_shape, aggregated_data = self._parse_cfa( + x, term, substitutions + ) + super().__init__( + filename=filename, + address=address, + shape=shape, + dtype=dtype, + mask=mask, + units=units, + calendar=calendar, + copy=copy, + ) + else: + super().__init__( + filename=filename, + address=address, + dtype=dtype, + mask=mask, + units=units, + calendar=calendar, + copy=copy, + ) + + fragment_shape = None + aggregated_data = None + instructions = None + term = None + + self._set_component("fragment_shape", fragment_shape, copy=False) + self._set_component("aggregated_data", aggregated_data, copy=False) + self._set_component("instructions", instructions, copy=False) + self._set_component("term", term, copy=False) + + if substitutions is not None: + self._set_component( + "substitutions", substitutions.copy(), copy=False + ) + def _parse_cfa(self, x, term, substitutions): """Parse the CFA aggregation instructions. @@ -646,7 +830,7 @@ def to_dask_array(self, chunks="auto"): ) if storage_options and kwargs["address"] == "nc": - # Pass on any S3 file system options + # Pass on any file system options kwargs["storage_options"] = storage_options fragment = FragmentArray( @@ -664,7 +848,7 @@ def to_dask_array(self, chunks="auto"): key, f_indices, False, - getattr(fragment, "_lock", False), + False, ) # Return the dask array diff --git a/cf/data/array/netcdf4array.py b/cf/data/array/netcdf4array.py index a0f1700e9b..a13a9e3e90 100644 --- a/cf/data/array/netcdf4array.py +++ b/cf/data/array/netcdf4array.py @@ -1,8 +1,7 @@ import cfdm -# from ...functions import netcdf_lock from ...mixin_container import Container -from .locks import _lock +from .locks import netcdf_lock from .mixin import ActiveStorageMixin, ArrayMixin, FileArrayMixin, IndexMixin @@ -45,10 +44,7 @@ def _lock(self): .. versionadded:: 3.14.0 """ - # if netcdf_lock(): - return _lock - - # return False + return netcdf_lock def _get_array(self, index=None): """Returns a subspace of the dataset variable. @@ -75,8 +71,8 @@ def _get_array(self, index=None): if index is None: index = self.index - # Note: We need to use the lock because the netCDF file is - # going to be read. + # Note: We need to lock because the netCDF file is about to be + # accessed. self._lock.acquire() # Note: It's cfdm.NetCDFArray.__getitem__ that we want to call diff --git a/cf/regrid/regrid.py b/cf/regrid/regrid.py index 84d9243ae5..9f2a39eda9 100644 --- a/cf/regrid/regrid.py +++ b/cf/regrid/regrid.py @@ -2465,7 +2465,7 @@ def create_esmpy_weights( from netCDF4 import Dataset from .. 
import __version__ - from ..data.array.locks import _lock + from ..data.array.locks import netcdf_lock if ( max(dst_esmpy_field.data.size, src_esmpy_field.data.size) @@ -2491,7 +2491,7 @@ def create_esmpy_weights( if src_grid.ln_z: regrid_method += f", ln {src_grid.method} in vertical" - _lock.acquire() + netcdf_lock.acquire() nc = Dataset(weights_file, "w", format="NETCDF4") nc.title = ( @@ -2532,7 +2532,7 @@ def create_esmpy_weights( v[...] = col nc.close() - _lock.release() + netcdf_lock.release() if esmpy_regrid_operator is None: # Destroy esmpy objects (the esmpy.Grid objects exist even if diff --git a/cf/regrid/regridoperator.py b/cf/regrid/regridoperator.py index cae895a5eb..fad0f545aa 100644 --- a/cf/regrid/regridoperator.py +++ b/cf/regrid/regridoperator.py @@ -727,9 +727,9 @@ def tosparse(self): # Read the weights from the weights file from netCDF4 import Dataset - from ..data.array.locks import _lock + from ..data.array.locks import netcdf_lock - _lock.acquire() + netcdf_lock.acquire() nc = Dataset(weights_file, "r") weights = nc.variables["S"][...] row = nc.variables["row"][...] @@ -746,7 +746,7 @@ def tosparse(self): row_start_index = 1 nc.close() - _lock.release() + netcdf_lock.release() else: raise ValueError( "Conversion to sparse array format requires at least " From 81f37946d43282059bbb00e4d9b71e069d57f6d1 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 21 Mar 2024 00:58:01 +0000 Subject: [PATCH 071/134] dev --- cf/data/array/h5netcdfarray.py | 4 +- cf/data/array/mixin/indexmixin.py | 104 +++++++++++++------ cf/data/array/netcdf4array.py | 4 +- cf/data/array/umarray.py | 2 +- cf/data/collapse/collapse_active.py | 2 +- cf/data/fragment/mixin/fragmentarraymixin.py | 2 +- cf/test/test_read_write.py | 2 + 7 files changed, 84 insertions(+), 36 deletions(-) diff --git a/cf/data/array/h5netcdfarray.py b/cf/data/array/h5netcdfarray.py index 2b21854ff2..1104b67007 100644 --- a/cf/data/array/h5netcdfarray.py +++ b/cf/data/array/h5netcdfarray.py @@ -17,7 +17,7 @@ class H5netcdfArray( **Active storage reductions** - Active storage reduction may be enabled with the `actify` + An active storage reductionx may be enabled with the `actify` method. See `cf.data.collapse.Collapse` for details. .. versionadded:: NEXTVERSION @@ -71,7 +71,7 @@ def _get_array(self, index=None): """ if index is None: - index = self.index + index = self.index() # Note: We need to lock because the netCDF file is about to be # accessed. diff --git a/cf/data/array/mixin/indexmixin.py b/cf/data/array/mixin/indexmixin.py index d56e9f6922..4940812a62 100644 --- a/cf/data/array/mixin/indexmixin.py +++ b/cf/data/array/mixin/indexmixin.py @@ -9,7 +9,7 @@ class IndexMixin: """Mixin class for lazy indexing of a data array. - A data for a subspace it retrieved by casting the `{{class}}` as a + A data for a subspace it retrieved by casting the object as a `numpy` array: >>> a = cf.{{class}}(....) @@ -22,13 +22,13 @@ class IndexMixin: [15 16 17 18 19] [20 21 22 23 24] [25 26 27 28 29]] - >>> a = a[::2, [1, 3, 4]] - >>> a = a[[False, True, True], 1:] + >>> a = a[::2, [1, 2, 4]] + >>> a = a[[True, False, True], :] >>> a.shape - (2, 2) + (2, 3) >>> print(np.asanyarray(a)) - [[13 14] - [23 24]] + [[ 1, 2, 4], + [21, 22, 24]] .. versionadded:: NEXTVERSION @@ -37,7 +37,7 @@ class IndexMixin: def __array__(self, *dtype): """Convert the ``{{class}}` into a `numpy` array. - .. versionadded:: (cfdm) NEXTVERSION + .. 
versionadded:: NEXTVERSION :Parameters: @@ -60,34 +60,44 @@ def __array__(self, *dtype): def __getitem__(self, index): """Returns a subspace of the array as a new `{{class}}`. - x.__getitem__(indices) <==> x[indices] + x.__getitem__(indices) <==> x[indices] - The new `{{class}}` may be converted to a `numpy` array with - its `__array__` method. + Subspaces created by indexing are lazy and are not applied + until the {{class}} object is converted to a `numpy` array + (via `__array__`), by which time all lazily-defined subspaces + will have been converted to a single index which defines only + the actual elements that need to be retrieved from the + original data. + [::2, [1, 3, 4]] + >>> a = a[[False, True, True], 1 - Consecutive subspaces are lazy, with only the final data - elements retrieved from the data when `__array__` is called. - For example, if the original data has shape ``(12, 145, 192)`` - and consecutive subspaces of ``[8:9, 10:20:3, [15, 1, 4, 12]`` - and ``[[0], [True, False, True], ::-2]`` are applied, then - only the elements defined by subspace ``[[8], [10, 16], [12, - 1]]`` will be retrieved from the data when `__array__` is - called. + For example, if the original data has shape ``(12, 145, 192)`` + and consecutive subspaces of ``[::2, [1, 3, 4]]`` and + ``[[False, True, True], 1:]`` are applied, then only the + elements defined by subspace ``[[8], [10, 16], [12, 1]]`` will + be retrieved from the data when `__array__` is called. - .. versionadded:: NEXTVERSION + For example, if the original data has shape ``(6, 5)`` and + consecutive subspaces of ``[8:9, 10:20:3, [15, 1, 4, 12]]`` + and ``[[0], [True, False, True], ::-2]`` are applied, then + only the elements defined by subspace ``[[8], [10, 16], [12, + 1]]`` will be retrieved from the data when `__array__` is + called. - .. seealso:: `index`, `original_shape`, `__array__`, - `__getitem__` + .. versionadded:: NEXTVERSION - :Returns: + .. seealso:: `index`, `original_shape`, `__array__`, + `__getitem__` + + :Returns: - `{{class}}` - The subspaced array. + `{{class}}` + The subspaced array. """ shape0 = self.shape - index0 = self.index + index0 = self.index(conform=False) original_shape = self.original_shape index1 = parse_indices(shape0, index, keepdims=False) @@ -231,8 +241,7 @@ def _get_array(self, index=None): f"Must implement {self.__class__.__name__}._get_array" ) - @property - def index(self): + def index(self, conform=True): """The index to be applied when converting to a `numpy` array. The `shape` is defined by the `index` applied to the @@ -265,8 +274,45 @@ def index(self): ind = (slice(None),) * self.ndim self._custom["index"] = ind self._custom["original_shape"] = self.shape - - return ind + return ind + + if not conform: + return ind + + # Still here? Then conform the indices. + ind = list(ind) + for n, (i, size) in enumerate(zip(ind[:], self.shape)): + if isinstance(i, slice): + if size == 1: + start, _, step = i.indices(size) + if step and step < 0: + # Decreasing slices are not universally + # accepted (e.g. `h5py` doesn't like them), + # but at least we can convert a size 1 + # decreasing slice into an increasing one. + ind[n] = slice(start, start + 1) + elif np.iterable(i): + if i.size == 1: + # Convert a sequence of one integer into an + # increasing slice + start = i.item() + ind[n] = slice(start, start + 1) + else: + # Convert a sequence of two or more integers into + # a slice, if possible. 
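# A worked instance (with illustrative values) of the conversion
# described in the comment above: a sequence of two or more integers
# with a single common step is equivalent to a slice.
import numpy as np

i = np.array([1, 3, 5])
step = np.unique(np.diff(i))                 # array([2]): one common step
start, stop = i[[0, -1]]                     # 1, 5
s = slice(start, stop + 1, step.item())      # slice(1, 6, 2)
assert np.array_equal(np.arange(10)[s], np.arange(10)[i])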
+ step = np.unique(np.diff(i)) + if step.size == 1: + start, stop = i[[0, -1]] + if stop >= start: + stop += 1 + elif stop: + stop = -1 + else: + stop = None + + ind[n] = slice(start, stop, step.item()) + + return tuple(ind) @property def original_shape(self): diff --git a/cf/data/array/netcdf4array.py b/cf/data/array/netcdf4array.py index a13a9e3e90..c313451cc7 100644 --- a/cf/data/array/netcdf4array.py +++ b/cf/data/array/netcdf4array.py @@ -17,7 +17,7 @@ class NetCDF4Array( **Active storage reductions** - Active storage reduction may be enabled with the `actify` + An active storage reduction may be enabled with the `actify` method. See `cf.data.collapse.Collapse` for details. """ @@ -69,7 +69,7 @@ def _get_array(self, index=None): """ if index is None: - index = self.index + index = self.index() # Note: We need to lock because the netCDF file is about to be # accessed. diff --git a/cf/data/array/umarray.py b/cf/data/array/umarray.py index 470cc11c9f..f706575fbf 100644 --- a/cf/data/array/umarray.py +++ b/cf/data/array/umarray.py @@ -200,7 +200,7 @@ def _get_array(self, index=None): # Note: No need to lock the UM file - concurrent reads are OK. if index is None: - index = self.index + index = self.index() f, header_offset = self.open() rec = self._get_rec(f, header_offset) diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index a5bb12b221..6c5a757f9b 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -88,7 +88,7 @@ def active_chunk(method, x, **kwargs): # Create the output dictionary active.method = method active.components = True - d = active[x.index] + d = active[x.index()] # Reformat the output dictionary if method == "max": diff --git a/cf/data/fragment/mixin/fragmentarraymixin.py b/cf/data/fragment/mixin/fragmentarraymixin.py index 4355a3bf4b..2e89917ad2 100644 --- a/cf/data/fragment/mixin/fragmentarraymixin.py +++ b/cf/data/fragment/mixin/fragmentarraymixin.py @@ -37,7 +37,7 @@ def _get_array(self, index=None): """ if index is None: - index = self.index + index = self.index() try: array = super()._get_array(index) diff --git a/cf/test/test_read_write.py b/cf/test/test_read_write.py index 758da54f89..b7165f3469 100644 --- a/cf/test/test_read_write.py +++ b/cf/test/test_read_write.py @@ -922,6 +922,8 @@ def test_write_omit_data(self): def test_read_url(self): """Test reading urls.""" + print("SKIPPING URL TEST") + return for scheme in ("http", "https"): remote = f"{scheme}://psl.noaa.gov/thredds/dodsC/Datasets/cru/crutem5/Monthlies/air.mon.anom.nobs.nc" # Check that cf can access it From 8c39e35689e0ecfc65c189460d6eb67a82b9c123 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 21 Mar 2024 10:02:41 +0000 Subject: [PATCH 072/134] dev --- cf/data/array/mixin/indexmixin.py | 124 ++++++++++++++++-------------- cf/test/test_NetCDF4Array.py | 27 ++++++- 2 files changed, 91 insertions(+), 60 deletions(-) diff --git a/cf/data/array/mixin/indexmixin.py b/cf/data/array/mixin/indexmixin.py index 4940812a62..5e48e66d36 100644 --- a/cf/data/array/mixin/indexmixin.py +++ b/cf/data/array/mixin/indexmixin.py @@ -1,6 +1,7 @@ from numbers import Integral import numpy as np +from dask.array.slicing import normalize_index from dask.base import is_dask_collection from ....functions import indices_shape, parse_indices @@ -9,8 +10,10 @@ class IndexMixin: """Mixin class for lazy indexing of a data array. 
- A data for a subspace it retrieved by casting the object as a - `numpy` array: + A data for a subspace is retrieved by casting the object as a + `numpy` array. See `__getitem__` for more details. + + **Examples** >>> a = cf.{{class}}(....) >>> a.shape @@ -58,42 +61,39 @@ def __array__(self, *dtype): return array def __getitem__(self, index): - """Returns a subspace of the array as a new `{{class}}`. - - x.__getitem__(indices) <==> x[indices] - - Subspaces created by indexing are lazy and are not applied - until the {{class}} object is converted to a `numpy` array - (via `__array__`), by which time all lazily-defined subspaces - will have been converted to a single index which defines only - the actual elements that need to be retrieved from the - original data. - [::2, [1, 3, 4]] - >>> a = a[[False, True, True], 1 - - - For example, if the original data has shape ``(12, 145, 192)`` - and consecutive subspaces of ``[::2, [1, 3, 4]]`` and - ``[[False, True, True], 1:]`` are applied, then only the - elements defined by subspace ``[[8], [10, 16], [12, 1]]`` will - be retrieved from the data when `__array__` is called. - - For example, if the original data has shape ``(6, 5)`` and - consecutive subspaces of ``[8:9, 10:20:3, [15, 1, 4, 12]]`` - and ``[[0], [True, False, True], ::-2]`` are applied, then - only the elements defined by subspace ``[[8], [10, 16], [12, - 1]]`` will be retrieved from the data when `__array__` is - called. + """Returns a subspace of the data as a new `{{class}}`. + + x.__getitem__(indices) <==> x[indices] + + Subspaces created by indexing are lazy and are not applied + until the {{class}} object is converted to a `numpy` array + (via `__array__`), by which time all lazily-defined subspaces + will have been converted to a single combined index which + defines only the actual elements that need to be retrieved + from the original data. + + The combined index is intended to be treated orthogonally, + meaning that the index for each dimension is to be applied + independently, regardless of how that index was defined. For + instance, the indices ``[[0, 1], [1, 3], 0]`` and ``[:2, 1::2, + 0]`` will give identical results. + + For example, if the original data has shape ``(12, 145, 192)`` + and consecutive subspaces of ``[::2, [1, 3, 4], 96:]`` and + ``[[0, 5], [True, False, True], 0]`` are applied, then only + the elements defined by the combined index``[[0, 10], [1, 4], + 96]`` will be retrieved from the data when `__array__` is + called. - .. versionadded:: NEXTVERSION + .. versionadded:: NEXTVERSION - .. seealso:: `index`, `original_shape`, `__array__`, - `__getitem__` + .. seealso:: `index`, `original_shape`, `__array__`, + `__getitem__` - :Returns: + :Returns: - `{{class}}` - The subspaced array. + `{{class}}` + The subspaced data. """ shape0 = self.shape @@ -177,20 +177,11 @@ def __getitem__(self, index): new_indices.append(new_index) - # Find the shape implied by the new indices - new_shape = indices_shape(new_indices, original_shape, keepdims=False) - - # Decreasing slices are not universally accepted (e.g. `h5py` - # doesn't like them), but at least we can convert a size 1 - # decreasing slice to an increasing one. 
- for i, (size, ind) in enumerate(zip(new_shape, new_indices[:])): - if size == 1 and isinstance(ind, slice): - start, _, step = ind.indices(size) - if step and step < 0: - new_indices[i] = slice(start, start + 1) + new._custom["index"] = tuple(new_indices) + # Find the shape defined by the new indices + new_shape = indices_shape(new_indices, original_shape, keepdims=False) new._set_component("shape", tuple(new_shape), copy=False) - new._custom["index"] = tuple(new_indices) return new @@ -251,22 +242,33 @@ def index(self, conform=True): .. seealso:: `shape`, `original_shape` + :Parameters: + + conform: `bool`, optional + If True, the default, then 1) convert a decreasing + size 1 slice to an increasing one, and 2) where + possible, a convert sequence of integers to a + slice. If False then these transformations are not + done. + + :Returns: + + `tuple` + **Examples** - >>> x.index - (slice(None), slice(None), slice(None)) >>> x.shape (12, 145, 192) - >>> x = x[8:9, 10:20:3, [15, 1, 4, 12]] - >>> x.index - (slice(8, 9), slice(10, 20, 3), [15, 1, 4, 12]) - >>> x.shape - (1, 3, 4) + >>> x.index() + (slice(None), slice(None), slice(None)) + >>> x = x[8:7:-1, 10:19:3, [15, 1, 4, 12]] >>> x = x[[0], [True, False, True], ::-2] - >>> x.index - ([8], [10, 16], [12, 1]) >>> x.shape (1, 2, 2) + >>> x.index() + (slice(8, 9, None), slice(10, 17, 6), slice(12, -1, -11)) + >>> x.index(conform=False) + (array([8]), array([10, 16]), array([12, 1])) """ ind = self._custom.get("index") @@ -279,9 +281,13 @@ def index(self, conform=True): if not conform: return ind - # Still here? Then conform the indices. + # Still here? Then conform the indices by: + # + # 1) Converting decreasing size 1 slices to increasing ones. + # 2) Where possible, converting sequences of integers to + # slices. ind = list(ind) - for n, (i, size) in enumerate(zip(ind[:], self.shape)): + for n, (i, size) in enumerate(zip(ind[:], self.original_shape)): if isinstance(i, slice): if size == 1: start, _, step = i.indices(size) @@ -292,9 +298,9 @@ def index(self, conform=True): # decreasing slice into an increasing one. ind[n] = slice(start, start + 1) elif np.iterable(i): + i = normalize_index((i,), (size,))[0] if i.size == 1: - # Convert a sequence of one integer into an - # increasing slice + # Convert a sequence of one integer into a slice start = i.item() ind[n] = slice(start, start + 1) else: diff --git a/cf/test/test_NetCDF4Array.py b/cf/test/test_NetCDF4Array.py index c3a97b77c8..0d049ff497 100644 --- a/cf/test/test_NetCDF4Array.py +++ b/cf/test/test_NetCDF4Array.py @@ -130,7 +130,7 @@ def test_NetCDF4Array_multiple_files(self): self.assertTrue((n[...] 
== f.array).all()) def test_NetCDF4Array_shape(self): - shape = (12, 96, 73) + shape = (12, 73, 96) a = cf.NetCDF4Array("/home/file2", "tas", shape=shape) self.assertEqual(a.shape, shape) self.assertEqual(a.original_shape, shape) @@ -138,6 +138,31 @@ def test_NetCDF4Array_shape(self): self.assertEqual(a.shape, (shape[0] // 2,) + shape[1:]) self.assertEqual(a.original_shape, shape) + def test_NetCDF4Array_index(self): + shape = (12, 73, 96) + a = cf.NetCDF4Array("/home/file2", "tas", shape=shape) + self.assertEqual( + a.index(), + ( + slice( + None, + ), + ) + * len(shape), + ) + a = a[8:7:-1, 10:19:3, [15, 1, 4, 12]] + a = a[[0], [True, False, True], ::-2] + self.assertEqual(a.shape, (1, 2, 2)) + self.assertEqual( + a.index(), + (slice(8, 9, None), slice(10, 17, 6), slice(12, -1, -11)), + ) + + index = a.index(conform=False) + self.assertTrue((index[0] == [8]).all()) + self.assertTrue((index[1] == [10, 16]).all()) + self.assertTrue((index[2] == [12, 1]).all()) + if __name__ == "__main__": print("Run date:", datetime.datetime.now()) From 7e633e6c8d1ddaaad50c08abcf4d6ae868e70785 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 22 Mar 2024 11:28:02 +0000 Subject: [PATCH 073/134] dev --- cf/data/collapse/collapse_active.py | 38 +++++++++++++------ cf/data/fragment/netcdffragmentarray.py | 12 ++---- cf/read_write/read.py | 49 ++++++++++++------------- 3 files changed, 53 insertions(+), 46 deletions(-) diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index 6c5a757f9b..07365bed13 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -63,32 +63,49 @@ def active_chunk(method, x, **kwargs): filename = x.get_filename() filename = "/".join(filename.split("/")[3:]) + max_threads = 100 + active_kwargs = { "uri": filename, "ncvar": x.get_address(), "storage_options": x.get_storage_options(), "active_storage_url": x.get_active_storage_url(), "storage_type": "s3", # Temporary requirement! 
+ "max_threads": max_threads, } - if True: + if False: print(f"Active(**{active_kwargs})") active = Active(**active_kwargs) - # Provide a file lock - try: - lock = x._lock - except AttributeError: - pass - else: - if lock: - active.lock = lock +# # Provide a file lock +# try: +# lock = x._lock +# except AttributeError: +# pass +# else: +# if lock: +# active.lock = lock # Create the output dictionary active.method = method active.components = True - d = active[x.index()] + + import time, datetime + lock = False #True #False + if lock: + x._lock.acquire() + start = time.time() + print ("START LOCKED", x.index(), datetime.datetime.now()) + d = active[x.index()] + print ("FINISH LOCKED", x.index(), datetime.datetime.now(), time.time()-start, f"maxT={max_threads}") + x._lock.release() + else: + start = time.time() + print ("START unlocked", x.index(), datetime.datetime.now()) + d = active[x.index()] + print ("FINISH unlocked", x.index(), datetime.datetime.now(), time.time()-start, f"maxT={max_threads}") # Reformat the output dictionary if method == "max": @@ -100,7 +117,6 @@ def active_chunk(method, x, **kwargs): elif method == "sum": d = {"N": d["n"], "sum": d["sum"]} - print("ACTIVE CHUNK DONE!") return d diff --git a/cf/data/fragment/netcdffragmentarray.py b/cf/data/fragment/netcdffragmentarray.py index 93e1e0fdac..e33de32c0d 100644 --- a/cf/data/fragment/netcdffragmentarray.py +++ b/cf/data/fragment/netcdffragmentarray.py @@ -204,22 +204,16 @@ def __getitem__(self, indices): for filename, address in zip(filenames, self.get_addresses()): kwargs["filename"] = filename kwargs["address"] = address - - scheme = urlparse(filename).scheme kwargs["storage_options"] = self.get_storage_options( create_endpoint_url=False ) - if scheme == "s3": - fragment = H5netcdfFragmentArray(**kwargs) - else: - fragment = NetCDF4FragmentArray(**kwargs) try: - return fragment[indices] + return NetCDF4FragmentArray(**kwargs)[indices] except FileNotFoundError: pass - except RuntimeError as error: - raise RuntimeError(f"{error}: {filename}") + except Exception: + return H5netcdfFragmentArray(**kwargs)[indices] # Still here? if len(filenames) == 1: diff --git a/cf/read_write/read.py b/cf/read_write/read.py index c1dbf92f74..04af080a47 100644 --- a/cf/read_write/read.py +++ b/cf/read_write/read.py @@ -63,7 +63,7 @@ def read( chunks="auto", domain=False, cfa=None, - netCDF_backend=None, + netcdf_engine=None, storage_options=None, cache_metadata=True, ): @@ -678,30 +678,27 @@ def read( .. versionadded:: 3.15.0 - engine: `None` or `str`, optional + netcdf_engine: `None` or `str`, optional + Specify which library to use for opening and reading + netCDF files. By default, or if `None`, then the first one + of `netCDF4` and `h5netcdf` to successfully open the file + netCDF file is used. Setting *netcdf_engine* to one of + ``'netCDF4'`` and ``'h5netcdf'`` will force the use of + that library. - Specify which library to use for opening netCDF files. By - default, or if `None`, then `netCDF4` will used unless it - fails to open a given netCDF file, in which case - `h5netcdf` will be used instead. Setting *engine* to - ``'netCDF4'`` or ``'h5netcdf'`` will force the use of the - `netCDF4` or `h5netcdf` libraries respectively. + .. note:: The *netcdf_engine* parameter does not affect + the opening of netCDF fragment files that define + the data of aggregated variables. For these, the + first one of `netCDF4` and `h5netcdf` to + successfully open the file netCDF file is always + be used. .. 
note:: `h5netcdf` restricts the types of indices that - define subspaces of its data. See the `h5py` - documentaiton at https://docs.h5py.org for - details. However, such indices on a returned - `Field` may be possible if they are followed by - further subspaces that imply acceptable indices - to the data in the file. - - .. note:: The *netCDF_backend* parameter does not affect - the opening of netCDF fragment files that define - the data of aggregated variables. For these, - `netCDF4` is always used for local files and - those accessed via OPeNDAP, and `h5netcdf` is - always used for fragment files in S3 object - stores. + define subspaces of its data. See + https://docs.h5py.org for details. However, such + indices on a returned `Field` are possible if + they are followed by further subspaces that + imply acceptable indices. .. versionadded:: NEXTVERSION @@ -1041,7 +1038,7 @@ def read( select=select, domain=domain, cfa_options=cfa_options, - netCDF_backend=netCDF_backend, + netcdf_engine=netcdf_engine, storage_options=storage_options, cache_metadata=cache_metadata, ) @@ -1159,7 +1156,7 @@ def _read_a_file( select=None, domain=False, cfa_options=None, - netCDF_backend=None, + netcdf_engine=None, storage_options=None, cache_metadata=True, ): @@ -1202,7 +1199,7 @@ def _read_a_file( .. versionadded:: NEXTVERSION - netCDF_backend: `str` or `None`, optional + netcdf_engine: `str` or `None`, optional See `cf.read` for details. .. versionadded:: NEXTVERSION @@ -1288,7 +1285,7 @@ def _read_a_file( warn_valid=warn_valid, domain=domain, storage_options=storage_options, - netCDF_backend=netCDF_backend, + netcdf_engine=netcdf_engine, ) except MaskError: # Some data required for field interpretation is missing, From 2ac6cbd1b2bb69ad225cb68e28ff4242740a74d3 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 22 Mar 2024 15:25:14 +0000 Subject: [PATCH 074/134] dev --- cf/data/collapse/collapse_active.py | 46 ++++++++++++++++--------- cf/data/fragment/netcdffragmentarray.py | 2 -- cf/field.py | 21 +++++++++-- cf/mixin/fielddomain.py | 7 +++- cf/test/test_Field.py | 8 +++++ 5 files changed, 63 insertions(+), 21 deletions(-) diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index 07365bed13..bbe177d22b 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -64,7 +64,7 @@ def active_chunk(method, x, **kwargs): filename = "/".join(filename.split("/")[3:]) max_threads = 100 - + active_kwargs = { "uri": filename, "ncvar": x.get_address(), @@ -79,33 +79,47 @@ def active_chunk(method, x, **kwargs): active = Active(**active_kwargs) -# # Provide a file lock -# try: -# lock = x._lock -# except AttributeError: -# pass -# else: -# if lock: -# active.lock = lock + # # Provide a file lock + # try: + # lock = x._lock + # except AttributeError: + # pass + # else: + # if lock: + # active.lock = lock # Create the output dictionary active.method = method active.components = True - import time, datetime - lock = False #True #False - if lock: + import datetime + import time + + lock = False # True #False + if lock: x._lock.acquire() start = time.time() - print ("START LOCKED", x.index(), datetime.datetime.now()) + print("START LOCKED", x.index(), datetime.datetime.now()) d = active[x.index()] - print ("FINISH LOCKED", x.index(), datetime.datetime.now(), time.time()-start, f"maxT={max_threads}") + print( + "FINISH LOCKED", + x.index(), + datetime.datetime.now(), + time.time() - start, + f"maxT={max_threads}", + ) x._lock.release() else: start = 
time.time() - print ("START unlocked", x.index(), datetime.datetime.now()) + print("START unlocked", x.index(), datetime.datetime.now()) d = active[x.index()] - print ("FINISH unlocked", x.index(), datetime.datetime.now(), time.time()-start, f"maxT={max_threads}") + print( + "FINISH unlocked", + x.index(), + datetime.datetime.now(), + time.time() - start, + f"maxT={max_threads}", + ) # Reformat the output dictionary if method == "max": diff --git a/cf/data/fragment/netcdffragmentarray.py b/cf/data/fragment/netcdffragmentarray.py index e33de32c0d..699cf790ab 100644 --- a/cf/data/fragment/netcdffragmentarray.py +++ b/cf/data/fragment/netcdffragmentarray.py @@ -1,5 +1,3 @@ -from urllib.parse import urlparse - import cfdm from ..array.abstract import Array diff --git a/cf/field.py b/cf/field.py index af9d0778a8..8e2acbd49f 100644 --- a/cf/field.py +++ b/cf/field.py @@ -441,6 +441,23 @@ def __getitem__(self, indices): for axis, size in zip(data_axes, new_data.shape): domain_axes[axis].set_size(size) + # Record which axes were cyclic before the subspace + org_cyclic = [data_axes.index(axis) for axis in new.cyclic()] + + # Se the subspaced data + new.set_data(new_data, axes=data_axes, copy=False) + + # Update axis cylcicity. Note that this can only entail + # setting an originally cyclic axis to be non-cyclic. Doing + # this now it enables us to disable the (possibly very slow) + # automatic check for cyclicity on the 'set_construct' calls + # below. + if org_cyclic: + new_cyclic = new_data.cyclic() + for i in org_cyclic: + if i not in new_cyclic: + new.cyclic(i, iscyclic=False) + # ------------------------------------------------------------ # Subspace constructs with data # ------------------------------------------------------------ @@ -502,15 +519,15 @@ def __getitem__(self, indices): # Replace existing construct with its subspace if needs_slicing: + print ('setting ',repr(construct)) new.set_construct( construct[tuple(dice)], key=key, axes=construct_axes, copy=False, + autocyclic={"no-op": True}, ) - new.set_data(new_data, axes=data_axes, copy=False) - return new def __setitem__(self, indices, value): diff --git a/cf/mixin/fielddomain.py b/cf/mixin/fielddomain.py index 6dab4b9223..5146c2554d 100644 --- a/cf/mixin/fielddomain.py +++ b/cf/mixin/fielddomain.py @@ -1109,10 +1109,15 @@ def autocyclic(self, key=None, coord=None, verbose=None, config={}): :Returns: - `bool` + `bool` or `None` + `True` if the dimension is cycle, `False` if it isn't, + or `None` no checks were done. 
""" noop = config.get("no-op") + if noop: + # Don't do anything + return if "cyclic" in config: if not config["cyclic"]: diff --git a/cf/test/test_Field.py b/cf/test/test_Field.py index 313defb6d3..e012d535fe 100644 --- a/cf/test/test_Field.py +++ b/cf/test/test_Field.py @@ -663,6 +663,14 @@ def test_Field__getitem__(self): self.assertTrue(np.allclose(f[:, -3:].array, g[:, :3].array)) self.assertTrue(f[:, :4].equals(g[:, 3:])) + # Test setting of axis cyclicity + f.cyclic("grid_longitude", iscyclic=True) + self.assertEqual(f.data.cyclic(), {1}) + g = f[0, :] + self.assertEqual(g.data.cyclic(), {1}) + g = f[:, 0] + self.assertEqual(g.data.cyclic(), set()) + def test_Field__setitem__(self): f = self.f.copy().squeeze() From 9b373ae6f0f8971b921ffcf1a07fdd309f6403b4 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 22 Mar 2024 15:44:32 +0000 Subject: [PATCH 075/134] dev --- cf/field.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cf/field.py b/cf/field.py index 8e2acbd49f..9ed2e4edf1 100644 --- a/cf/field.py +++ b/cf/field.py @@ -519,7 +519,6 @@ def __getitem__(self, indices): # Replace existing construct with its subspace if needs_slicing: - print ('setting ',repr(construct)) new.set_construct( construct[tuple(dice)], key=key, From 080f2276d1493d3832e0ef2b89aa473fb09c5e20 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 26 Mar 2024 23:31:47 +0000 Subject: [PATCH 076/134] dev --- Changelog.rst | 22 +++++++++++++++------- cf/read_write/netcdf/netcdfread.py | 2 +- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/Changelog.rst b/Changelog.rst index 9302158ca1..7451338f8b 100644 --- a/Changelog.rst +++ b/Changelog.rst @@ -1,5 +1,5 @@ -version NEXT ------------- +version NEXT + 1 +---------------- **2024-??-??** @@ -9,6 +9,19 @@ version NEXT * New class `cf.NetCDF4Array` * New class `cf.CFAH5netcdfArray` * New class `cf.CFANetCDF4Array` +* New dependency: ``h5netcdf>=1.3.0`` +* New dependency: ``h5py>=3.10.0`` +* New dependency: ``s3fs>=2024.2.0`` +* Changed dependency: ``1.11.2.0<=cfdm<1.11.3.0`` +* Changed dependency: ``cfunits>=3.3.7`` + +---- + +version NEXT +------------ + +**2024-??-??** + * Added spherical regridding to discrete sampling geometry destination grids (https://github.com/NCAS-CMS/cf-python/issues/716) * Added 3-d spherical regridding to `cf.Field.regrids`, and the option @@ -30,11 +43,6 @@ version NEXT * Fix bug in `cf.read` when reading UM files that caused LBPROC value 131072 (Mean over an ensemble of parallel runs) to be ignored (https://github.com/NCAS-CMS/cf-python/issues/737) -* New dependency: ``h5netcdf>=1.3.0`` -* New dependency: ``h5py>=3.10.0`` -* New dependency: ``s3fs>=2024.2.0`` -* Changed dependency: ``1.11.2.0<=cfdm<1.11.3.0`` -* Changed dependency: ``cfunits>=3.3.7`` ---- diff --git a/cf/read_write/netcdf/netcdfread.py b/cf/read_write/netcdf/netcdfread.py index 33e0145a8d..40eb2e460b 100644 --- a/cf/read_write/netcdf/netcdfread.py +++ b/cf/read_write/netcdf/netcdfread.py @@ -974,7 +974,7 @@ def _cfa_parse_aggregated_data(self, ncvar, aggregated_data): variable = g["variables"][term_ncvar] array = cfdm.netcdf_indexer( - variable, mask=True, unpack=True, always_mask=False + variable, mask=True, unpack=True, always_masked_array=False ) aggregation_instructions[term_ncvar] = array[...] 
From b127508ff70aab5f6beef7035f64a88c1af0dd3b Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 3 Apr 2024 11:05:48 +0100 Subject: [PATCH 077/134] dev --- cf/data/array/fullarray.py | 107 +++++++------------ cf/data/array/h5netcdfarray.py | 3 - cf/data/array/mixin/indexmixin.py | 28 +++-- cf/data/array/netcdf4array.py | 3 - cf/data/array/umarray.py | 3 - cf/data/fragment/mixin/fragmentarraymixin.py | 3 - 6 files changed, 54 insertions(+), 93 deletions(-) diff --git a/cf/data/array/fullarray.py b/cf/data/array/fullarray.py index f6c29858fb..52eb0af86b 100644 --- a/cf/data/array/fullarray.py +++ b/cf/data/array/fullarray.py @@ -1,11 +1,12 @@ import numpy as np from .abstract import Array +from .mixin import IndexMixin _FULLARRAY_HANDLED_FUNCTIONS = {} -class FullArray(Array): +class FullArray(IndexMixin, Array): """A array filled with a given value. The array may be empty or all missing values. @@ -90,27 +91,7 @@ def __init__( self._set_component("units", units, copy=False) self._set_component("calendar", calendar, copy=False) - def __array__(self, *dtype): - """The numpy array interface. - - .. versionadded:: 3.15.0 - - :Parameters: - - dtype: optional - Typecode or data-type to which the array is cast. - - :Returns: - - `numpy.ndarray` - An independent numpy array of the data. - - """ - array = self[...] - if not dtype: - return array - else: - return array.astype(dtype[0], copy=False) + self._set_units() def __array_function__(self, func, types, args, kwargs): """The `numpy` `__array_function__` protocol. @@ -128,54 +109,6 @@ def __array_function__(self, func, types, args, kwargs): return _FULLARRAY_HANDLED_FUNCTIONS[func](*args, **kwargs) - def __getitem__(self, indices): - """x.__getitem__(indices) <==> x[indices] - - Returns a numpy array. - - """ - # If 'apply_indices' is False then we can make a filled numpy - # array that already has the correct shape. Otherwise we need - # to make one with shape 'self.shape' and then apply the - # indices to that. - apply_indices = False - - if indices is Ellipsis: - array_shape = self.shape - else: - array_shape = [] - for i, size in zip(indices, self.shape): - if not isinstance(i, slice): - continue - - start, stop, step = i.indices(size) - a, b = divmod(stop - start, step) - if b: - a += 1 - - array_shape.append(a) - - if len(array_shape) != self.ndim: - apply_indices = True - array_shape = self.shape - - fill_value = self.get_full_value() - if fill_value is np.ma.masked: - array = np.ma.masked_all(array_shape, dtype=self.dtype) - elif fill_value is not None: - array = np.full( - array_shape, fill_value=fill_value, dtype=self.dtype - ) - else: - array = np.empty(array_shape, dtype=self.dtype) - - if apply_indices: - array = self.get_subspace(array, indices) - - self._set_units() - - return array - def __repr__(self): """Called by the `repr` built-in function. @@ -196,6 +129,40 @@ def __str__(self): return f"Filled with {fill_value!r}" + def _get_array(self, index=None): + """Returns a subspace of the dataset variable. + + .. versionadded:: NEXTVERSION + + .. seealso:: `__array__`, `index` + + :Parameters: + + index: `tuple` or `None`, optional + Provide the indices that define the subspace. If `None` + then the `index` attribute is used. + + :Returns: + + `numpy.ndarray` + The subspace. 
+ + """ + if index is None: + index = self.index() + + fill_value = self.get_full_value() + if fill_value is np.ma.masked: + array = np.ma.masked_all(self.shape, dtype=self.dtype) + elif fill_value is not None: + array = np.full( + self.shape, fill_value=fill_value, dtype=self.dtype + ) + else: + array = np.empty(self.shape, dtype=self.dtype) + + return array + @property def dtype(self): """Data-type of the data elements.""" diff --git a/cf/data/array/h5netcdfarray.py b/cf/data/array/h5netcdfarray.py index 1104b67007..01dac019d9 100644 --- a/cf/data/array/h5netcdfarray.py +++ b/cf/data/array/h5netcdfarray.py @@ -51,9 +51,6 @@ def _lock(self): def _get_array(self, index=None): """Returns a subspace of the dataset variable. - The subspace is defined by the indices stored in the `index` - attribute. - .. versionadded:: NEXTVERSION .. seealso:: `__array__`, `index` diff --git a/cf/data/array/mixin/indexmixin.py b/cf/data/array/mixin/indexmixin.py index 5e48e66d36..3bbc74481c 100644 --- a/cf/data/array/mixin/indexmixin.py +++ b/cf/data/array/mixin/indexmixin.py @@ -127,10 +127,10 @@ def __getitem__(self, index): continue # Still here? Then we have to work out the subspace of the - # full array implied by applying both 'ind0' - # and 'ind1'. + # full array implied by applying 'ind0' + # followed by 'ind1'. if is_dask_collection(ind1): - # Note: This will never occur when __getitem__ is + # Note: This will never occur when this __getitem__ is # being called from within a Dask graph, because # any lazy indices will have already been # computed as part of the whole graph execution; @@ -139,7 +139,7 @@ def __getitem__(self, index): # were not the case then we would get round it # by wrapping the compute inside a `with # dask.config.set({"scheduler": - # "synchronous"}):` clause.) + # "synchronous"}):`.) ind1 = ind1.compute() if isinstance(ind0, slice): @@ -172,7 +172,13 @@ def __getitem__(self, index): new_index = np.arange(*ind0.indices(original_size))[ind1] else: # ind0: array of int (if we made it here, then it - # can't be anything else) + # can't be anything else, because + # we've dealt with ind0 being an + # int, and a previous ind1 that + # was an array of bool will have + # resulted in this ind0 being an + # array of int) + # ind1: anything new_index = np.asanyarray(ind0)[ind1] new_indices.append(new_index) @@ -284,6 +290,7 @@ def index(self, conform=True): # Still here? Then conform the indices by: # # 1) Converting decreasing size 1 slices to increasing ones. + # # 2) Where possible, converting sequences of integers to # slices. ind = list(ind) @@ -294,8 +301,7 @@ def index(self, conform=True): if step and step < 0: # Decreasing slices are not universally # accepted (e.g. `h5py` doesn't like them), - # but at least we can convert a size 1 - # decreasing slice into an increasing one. + # but we can convert them to increasing ones. ind[n] = slice(start, start + 1) elif np.iterable(i): i = normalize_index((i,), (size,))[0] @@ -322,10 +328,10 @@ def index(self, conform=True): @property def original_shape(self): - """The original shape of the data. + """The original shape of the data, before any subspacing. - The `shape` is defined by the `index` applied to the - `original_shape`. + The `shape` is defined by the result of subspacing the data in + its original shape with the indices defined by `index`. .. 
versionadded:: NEXTVERSION @@ -334,7 +340,7 @@ def original_shape(self): """ out = self._custom.get("original_shape") if out is None: - # If shape is None then no subspace has been defined + # If None then no subspace has been defined out = self.shape self._custom["original_shape"] = out diff --git a/cf/data/array/netcdf4array.py b/cf/data/array/netcdf4array.py index c313451cc7..2d7fa45d99 100644 --- a/cf/data/array/netcdf4array.py +++ b/cf/data/array/netcdf4array.py @@ -49,9 +49,6 @@ def _lock(self): def _get_array(self, index=None): """Returns a subspace of the dataset variable. - The subspace is defined by the indices stored in the `index` - attribute. - .. versionadded:: NEXTVERSION .. seealso:: `__array__`, `index` diff --git a/cf/data/array/umarray.py b/cf/data/array/umarray.py index 7a36e286e2..2f79df475a 100644 --- a/cf/data/array/umarray.py +++ b/cf/data/array/umarray.py @@ -178,9 +178,6 @@ def __init__( def _get_array(self, index=None): """Returns a subspace of the dataset variable. - The subspace is defined by the indices stored in the `index` - attribute. - .. versionadded:: NEXTVERSION .. seealso:: `__array__`, `index` diff --git a/cf/data/fragment/mixin/fragmentarraymixin.py b/cf/data/fragment/mixin/fragmentarraymixin.py index 2e89917ad2..118baea2f6 100644 --- a/cf/data/fragment/mixin/fragmentarraymixin.py +++ b/cf/data/fragment/mixin/fragmentarraymixin.py @@ -15,9 +15,6 @@ class FragmentArrayMixin: def _get_array(self, index=None): """Returns a subspace of the dataset variable. - The subspace is defined by the indices stored in the `index` - attribute. - .. versionadded:: NEXTVERSION .. seealso:: `__array__`, `index` From 3a2ad829d21ae08ec805432b9e5e83da11814e62 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 3 Apr 2024 15:02:28 +0100 Subject: [PATCH 078/134] dev --- cf/__init__.py | 12 +++--- cf/data/array/fullarray.py | 30 +++++++++++---- cf/data/array/mixin/arraymixin.py | 18 +++++++++ cf/data/array/mixin/filearraymixin.py | 16 -------- cf/test/test_FullArray.py | 55 +++++++++++++++++++++++++++ 5 files changed, 102 insertions(+), 29 deletions(-) create mode 100644 cf/test/test_FullArray.py diff --git a/cf/__init__.py b/cf/__init__.py index aa4def8885..8dce28a25a 100644 --- a/cf/__init__.py +++ b/cf/__init__.py @@ -216,12 +216,12 @@ ) # Check the version of dask -_minimum_vn = "2022.12.1" -if Version(dask.__version__) < Version(_minimum_vn): - raise RuntimeError( - f"Bad dask version: cf requires dask>={_minimum_vn}. " - f"Got {dask.__version__} at {dask.__file__}" - ) +# _minimum_vn = "2022.12.1" +# if Version(dask.__version__) < Version(_minimum_vn): +# raise RuntimeError( +# f"Bad dask version: cf requires dask>={_minimum_vn}. " +# f"Got {dask.__version__} at {dask.__file__}" +# ) # Check the version of Python _minimum_vn = "3.8.0" diff --git a/cf/data/array/fullarray.py b/cf/data/array/fullarray.py index 52eb0af86b..48b43442fa 100644 --- a/cf/data/array/fullarray.py +++ b/cf/data/array/fullarray.py @@ -1,5 +1,6 @@ import numpy as np +from ...functions import indices_shape, parse_indices from .abstract import Array from .mixin import IndexMixin @@ -130,7 +131,7 @@ def __str__(self): return f"Filled with {fill_value!r}" def _get_array(self, index=None): - """Returns a subspace of the dataset variable. + """Returns the full array. .. 
versionadded:: NEXTVERSION @@ -149,20 +150,35 @@ def _get_array(self, index=None): """ if index is None: - index = self.index() + shape = self.shape + else: + original_shape = self.original_shape + index = parse_indices(original_shape, index, keepdims=False) + shape = indices_shape(index, original_shape, keepdims=False) fill_value = self.get_full_value() if fill_value is np.ma.masked: - array = np.ma.masked_all(self.shape, dtype=self.dtype) + array = np.ma.masked_all(shape, dtype=self.dtype) elif fill_value is not None: - array = np.full( - self.shape, fill_value=fill_value, dtype=self.dtype - ) + array = np.full(shape, fill_value=fill_value, dtype=self.dtype) else: - array = np.empty(self.shape, dtype=self.dtype) + array = np.empty(shape, dtype=self.dtype) return array + @property + def array(self): + """Return an independent numpy array containing the data. + + .. versionadded:: NEXTRELEASE + + :Returns: + + `numpy.ndarray` + An independent numpy array of the data. + """ + return np.asanyarray(self) + @property def dtype(self): """Data-type of the data elements.""" diff --git a/cf/data/array/mixin/arraymixin.py b/cf/data/array/mixin/arraymixin.py index a167abec50..6b7412d9ee 100644 --- a/cf/data/array/mixin/arraymixin.py +++ b/cf/data/array/mixin/arraymixin.py @@ -1,3 +1,5 @@ +import numpy as np + from ....units import Units @@ -16,6 +18,22 @@ def __array_function__(self, func, types, args, kwargs): """ return NotImplemented + @property + def _meta(self): + """Normalize the array to an appropriate Dask meta object. + + The Dask meta can be thought of as a suggestion to Dask. Dask + uses this meta to generate the task graph until it can infer + the actual metadata from the values. It does not force the + output to have the structure or dtype of the specified meta. + + .. versionadded:: NEXTVERSION + + .. seealso:: `dask.utils.meta_from_array` + + """ + return np.array((), dtype=self.dtype) + @property def Units(self): """The `cf.Units` object containing the units of the array. diff --git a/cf/data/array/mixin/filearraymixin.py b/cf/data/array/mixin/filearraymixin.py index 04d731d234..b5b314b9e2 100644 --- a/cf/data/array/mixin/filearraymixin.py +++ b/cf/data/array/mixin/filearraymixin.py @@ -1,8 +1,6 @@ from os import sep from os.path import basename, dirname, join -import numpy as np - from ....functions import _DEPRECATION_ERROR_ATTRIBUTE, abspath @@ -26,20 +24,6 @@ def __dask_tokenize__(self): self.get_addresses(), ) - @property - def _meta(self): - """The metadata for the containing Dask array. - - This is the kind of array that will result from slicing the - file array. - - .. versionadded:: 3.14.0 - - .. seealso:: `dask.array.from_array` - - """ - return np.array((), dtype=self.dtype) - @property def filename(self): """The name of the file containing the array. 
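With `FullArray` now built on `IndexMixin`, subspacing it is lazy and the filled array is only created, at its already-subspaced shape, when the object is converted to `numpy`. A small usage sketch along the lines of the new tests added below (the printed shape and values are illustrative, assuming an integer fill value of 9):

    # Sketch only: lazy subspacing of a FullArray; no array is created
    # until conversion to numpy.
    import numpy as np
    import cf

    f = cf.FullArray(9, np.dtype(int), shape=(2, 3, 4))
    f = f[0, [True, False, True], ::3]  # lazy subspace
    print(f.shape)           # (2, 2)
    print(np.asanyarray(f))  # [[9 9]
                             #  [9 9]]
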
diff --git a/cf/test/test_FullArray.py b/cf/test/test_FullArray.py new file mode 100644 index 0000000000..63dcb84f34 --- /dev/null +++ b/cf/test/test_FullArray.py @@ -0,0 +1,55 @@ +import datetime +import faulthandler +import unittest + +import numpy as np + +faulthandler.enable() # to debug seg faults and timeouts + +import cf + + +class FullArrayTest(unittest.TestCase): + def test_FullValue_inspection(self): + full = 9 + f = cf.FullArray(full, np.dtype(int), shape=(2, 3, 4)) + self.assertEqual(f.get_full_value(), full) + self.assertEqual(f.shape, (2, 3, 4)) + self.assertEqual(f.dtype, np.dtype(int)) + self.assertIsNone(f.set_full_value(10)) + self.assertEqual(f.get_full_value(), 10) + + def test_FullValue_array(self): + full = 9 + f = cf.FullArray(full, np.dtype(int), shape=(2, 3, 4)) + self.assertTrue((f.array == np.full(f.shape, full)).all()) + + f = f[0, [True, False, True], ::3] + self.assertTrue((f.array == np.full((2, 1), full)).all()) + + def test_FullValue_masked_array(self): + full = np.ma.masked + f = cf.FullArray(full, np.dtype(int), shape=(2, 3)) + + a = np.ma.masked_all(f.shape, dtype=np.dtype(int)) + array = f.array + self.assertEqual(array.dtype, a.dtype) + self.assertTrue( + (np.ma.getmaskarray(array) == np.ma.getmaskarray(a)).all() + ) + + def test_FullValue_get_array(self): + full = 9 + f = cf.FullArray(full, np.dtype(int), shape=(2, 3)) + f = f[0, 1] + self.assertEqual(f.shape, ()) + + array = f._get_array(index=Ellipsis) + self.assertTrue((array == np.full((2, 3), full)).all()) + + +if __name__ == "__main__": + print("Run date:", datetime.datetime.now()) + cf.environment() + print() + unittest.main(verbosity=2) From 157eeea51b5f2bfcb409ee9d73fd65f0602e4975 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 4 Apr 2024 15:06:14 +0100 Subject: [PATCH 079/134] dev --- cf/data/array/mixin/activestoragemixin.py | 18 +- cf/data/collapse/__init__.py | 2 +- cf/data/collapse/collapse.py | 51 +++--- cf/data/collapse/collapse_active.py | 209 +++++++++++----------- cf/data/collapse/dask_collapse.py | 78 +++++--- cf/docstring/docstring.py | 19 +- cf/test/test_active_storage.py | 15 +- docs/source/field_analysis.rst | 4 +- 8 files changed, 225 insertions(+), 171 deletions(-) diff --git a/cf/data/array/mixin/activestoragemixin.py b/cf/data/array/mixin/activestoragemixin.py index eb17095ef6..4493aaf621 100644 --- a/cf/data/array/mixin/activestoragemixin.py +++ b/cf/data/array/mixin/activestoragemixin.py @@ -58,17 +58,10 @@ def actify(self, active_storage_url): if "add_offset" in attributes or "scale_factor" in attributes: raise AttributeError( "Can't actify {self.__class__.__name__} when " - "the data has been numerically packed" + "the data have been numerically packed" ) if Active is None: - # Note: We don't really expect to be here because if - # activestorage.Active is not available then we - # wouldn't even attempt to actify the instance - # during a reduction (see - # `cf.data.collapse.active_storage`). However, it's - # worth checking in case `actify` is called from - # elsewhere. raise AttributeError( "Can't actify {self.__class__.__name__} when " "activestorage.Active is not available" @@ -90,8 +83,13 @@ def get_active_storage_url(self): :Returns: `str` or `None` - The active storage URL, or `None` if there is no - active storage reduction. + The active storage URL, or `None` if no active storage + reduction is possible. 
+ + **Examples** + + >>> a.get_active_storage() + 'https://183.175.143.286:8080' """ return self._custom.get("active_storage_url") diff --git a/cf/data/collapse/__init__.py b/cf/data/collapse/__init__.py index 7902b51936..30bef911c6 100644 --- a/cf/data/collapse/__init__.py +++ b/cf/data/collapse/__init__.py @@ -1,2 +1,2 @@ from .collapse import Collapse -from .collapse_active import actify, active_storage +from .collapse_active import actify, active_reduction_methods, active_storage diff --git a/cf/data/collapse/collapse.py b/cf/data/collapse/collapse.py index 0dac65d7c8..45a53a3bb7 100644 --- a/cf/data/collapse/collapse.py +++ b/cf/data/collapse/collapse.py @@ -25,6 +25,9 @@ class Collapse(metaclass=DocstringRewriteMeta): * it is possible to import the `activestorage.Active` class; + * the method is one of those specified by + `cf.data.collapse.active_reduction_methods`; + * the collapse is over all axes; * the collapse is unweighted; @@ -38,12 +41,13 @@ class Collapse(metaclass=DocstringRewriteMeta): * the `Collapse` method's *chunk_function* parameter is `None`; - * the method has a corresponding active chunk function defined - in the `cf.data.collapse.active_chunk_functions` dictionary; - - * inspection of the graph of the `dask` array with - `cf.data.collapse.actify` confirms that making use of active - storage is possible; + * the `active_storage` attribute of the `Data` object being + collapsed is `True`, indicating that active storage operations + are possible, provided all of the other conditions are also + met. In general, it will only be `True` for data that are in + files on disk, are not compressed by convention, and have not + been previously operated on, apart from by subspacing + operations. in which case the Dask graph is modified to expect the per-chunk reductions to be carried out externally. 
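Most of these conditions are tested up front by the reworked `active_storage` decorator (see cf/data/collapse/collapse_active.py later in this patch), with the remaining, data-dependent ones handled by `actify`. An illustrative, self-contained sketch of the decorator's gate is given below; the module-level names are passed in as parameters rather than imported, and merely stand in for the real attributes of collapse_active.py:

    def eligible_for_active_storage(
        method, kwargs, Active, active_reduction_methods, cf_active_storage
    ):
        """Illustrative only: True if a collapse may try active storage.

        'Active', 'active_reduction_methods' and 'cf_active_storage'
        mirror the names used in cf/data/collapse/collapse_active.py.
        """
        return (
            Active is not None                        # activestorage.Active importable
            and method in active_reduction_methods    # "max", "mean", "min" or "sum"
            and bool(kwargs.get("active_storage"))    # Data.active_storage is True
            and bool(cf_active_storage())             # not disabled globally
            and kwargs.get("weights") is None         # collapse is unweighted
            and kwargs.get("chunk_function") is None  # no user chunk function
        )
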
@@ -128,7 +132,7 @@ def max( {{split_every: `int` or `dict`, optional}} - {{chunk_function: callable, optional}} + {{chunk_function: callable or `None`, optional}} {{active_storage: `bool`, optional}} @@ -196,7 +200,7 @@ def max_abs( {{split_every: `int` or `dict`, optional}} - {{chunk_function: callable, optional}} + {{chunk_function: callable or `None`, optional}} {{active_storage: `bool`, optional}} @@ -255,7 +259,7 @@ def mean( {{split_every: `int` or `dict`, optional}} - {{chunk_function: callable, optional}} + {{chunk_function: callable or `None`, optional}} {{active_storage: `bool`, optional}} @@ -327,7 +331,7 @@ def mean_abs( {{split_every: `int` or `dict`, optional}} - {{chunk_function: callable, optional}} + {{chunk_function: callable or `None`, optional}} {{active_storage: `bool`, optional}} @@ -385,7 +389,7 @@ def mid_range( {{split_every: `int` or `dict`, optional}} - {{chunk_function: callable, optional}} + {{chunk_function: callable or `None`, optional}} {{active_storage: `bool`, optional}} @@ -457,7 +461,7 @@ def min( {{split_every: `int` or `dict`, optional}} - {{chunk_function: callable, optional}} + {{chunk_function: callable or `None`, optional}} {{active_storage: `bool`, optional}} @@ -525,7 +529,7 @@ def min_abs( {{split_every: `int` or `dict`, optional}} - {{chunk_function: callable, optional}} + {{chunk_function: callable or `None`, optional}} {{active_storage: `bool`, optional}} @@ -581,7 +585,7 @@ def range( {{split_every: `int` or `dict`, optional}} - {{chunk_function: callable, optional}} + {{chunk_function: callable or `None`, optional}} {{active_storage: `bool`, optional}} @@ -656,7 +660,7 @@ def rms( {{split_every: `int` or `dict`, optional}} - {{chunk_function: callable, optional}} + {{chunk_function: callable or `None`, optional}} {{active_storage: `bool`, optional}} @@ -725,7 +729,7 @@ def sample_size( {{split_every: `int` or `dict`, optional}} - {{chunk_function: callable, optional}} + {{chunk_function: callable or `None`, optional}} {{active_storage: `bool`, optional}} @@ -800,7 +804,7 @@ def sum( {{split_every: `int` or `dict`, optional}} - {{chunk_function: callable, optional}} + {{chunk_function: callable or `None`, optional}} {{active_storage: `bool`, optional}} @@ -875,7 +879,7 @@ def sum_of_weights( {{split_every: `int` or `dict`, optional}} - {{chunk_function: callable, optional}} + {{chunk_function: callable or `None`, optional}} {{active_storage: `bool`, optional}} @@ -951,7 +955,7 @@ def sum_of_weights2( {{split_every: `int` or `dict`, optional}} - {{chunk_function: callable, optional}} + {{chunk_function: callable or `None`, optional}} {{active_storage: `bool`, optional}} @@ -1004,7 +1008,7 @@ def unique( {{split_every: `int` or `dict`, optional}} - {{chunk_function: callable, optional}} + {{chunk_function: callable or `None`, optional}} {{active_storage: `bool`, optional}} @@ -1085,7 +1089,12 @@ def var( {{split_every: `int` or `dict`, optional}} - {{chunk_function: callable, optional}} + {{chunk_function: callable or `None`, optional}} + + A callable function must accept a *ddof* keyword + parameter that sets the delta degrees of freedom. See + `cf.data.collapse.dask_collapse.cf_var_chunk` for + details. 
{{active_storage: `bool`, optional}} diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index bbe177d22b..b05604484f 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -12,28 +12,32 @@ logger = logging.getLogger(__name__) # -------------------------------------------------------------------- -# Specify which reductions are possible with active storage +# Specify which reduction methods are possible with active storage # -------------------------------------------------------------------- active_reduction_methods = ("max", "mean", "min", "sum") +class ActiveStorageError(Exception): + pass + + def active_chunk(method, x, **kwargs): - """Collapse a data in a chunk with active storage. + """Collapse data in a chunk with active storage. .. versionadded:: NEXTVERSION - .. seealso:: `actify`, `active_storage2`, `cf.data.collapse.Collapse` + .. seealso:: `actify`, `active_storage`, `cf.data.collapse.Collapse` :Parameters: - a: `dask.array.Array` - The array to be collapsed. + a: array_like + The data to be collapsed. method: `str` The name of the reduction method. If the method does not have a corresponding active function in the - `active_chunk_functions` dictionary then active - compuations are not carried out. + `active_chunk_functions` dictionary then active storage + computations are not carried out. axis: (sequence of) `int`, optional Axis or axes along which to operate. By default, @@ -47,15 +51,16 @@ def active_chunk(method, x, **kwargs): `dict` The reduced data in component form. + **Examples** + + >>> d = active_chunk('sum', x) + >>> d + {'N': 7008, 'sum': 7006221.66903949} + """ if kwargs.get("computing_meta"): return x - if not getattr(x, "actified", False): - raise ValueError( - "Can't do active reductions when on non-actified data" - ) - weighted = kwargs.get("weights") is not None if weighted: raise ValueError(f"Can't do weighted {method!r} active reductions") @@ -78,71 +83,72 @@ def active_chunk(method, x, **kwargs): print(f"Active(**{active_kwargs})") active = Active(**active_kwargs) - - # # Provide a file lock - # try: - # lock = x._lock - # except AttributeError: - # pass - # else: - # if lock: - # active.lock = lock - - # Create the output dictionary active.method = method active.components = True import datetime import time - lock = False # True #False - if lock: - x._lock.acquire() - start = time.time() - print("START LOCKED", x.index(), datetime.datetime.now()) - d = active[x.index()] - print( - "FINISH LOCKED", - x.index(), - datetime.datetime.now(), - time.time() - start, - f"maxT={max_threads}", - ) - x._lock.release() - else: - start = time.time() - print("START unlocked", x.index(), datetime.datetime.now()) - d = active[x.index()] - print( - "FINISH unlocked", - x.index(), - datetime.datetime.now(), - time.time() - start, - f"maxT={max_threads}", - ) - - # Reformat the output dictionary + try: + lock = False # True #False + if lock: + x._lock.acquire() + start = time.time() + print("START LOCKED", x.index(), datetime.datetime.now()) + d = active[x.index()] + print( + "FINISH LOCKED", + x.index(), + datetime.datetime.now(), + time.time() - start, + f"maxT={max_threads}", + ) + x._lock.release() + else: + start = time.time() + print("START unlocked", x.index(), datetime.datetime.now()) + d = active[x.index()] + print( + "FINISH unlocked", + x.index(), + datetime.datetime.now(), + time.time() - start, + f"maxT={max_threads}", + ) + except Exception as error: + raise ActiveStorageError(error) + + 
# Reformat the components dictionary to match the output of the + # corresponding local chunk function if method == "max": + # Local chunk function `cf_max_chunk` d = {"N": d["n"], "max": d["max"]} elif method == "mean": - d = {"N": d["n"], "sum": d["sum"], "V1": d["n"], "weighted": weighted} + # Local chunk function `cf_mean_chunk` + d = {"N": d["n"], "sum": d["sum"], "V1": d["n"], "weighted": False} elif method == "min": + # Local chunk function `cf_min_chunk` d = {"N": d["n"], "min": d["min"]} elif method == "sum": + # Local chunk function `cf_sum_chunk` d = {"N": d["n"], "sum": d["sum"]} + else: + raise ActiveStorageError( + f"Don't know how to reformat {method!r} components" + ) return d def actify(a, method, axis=None): - """Modify a dask array to use active storage reductions. + """Modify a Dask array to use active storage reductions. - The dask graph is inspected to ensure that active storage - reductions are possible, and if not then the dask array is + The Dask graph is inspected to ensure that active storage + reductions are possible, and if not then the Dask array is returned unchanged. .. note:: It is assumed that the `!active_storage` attribute of - the `Data` object that provided the dask array *a* is + the `Data` object that provided the Dask array *a* is `True`. If this is not the case then an error at compute time is likely. The value of the `Data` object's `!active_storage` attribute is registered via the @@ -160,8 +166,8 @@ def actify(a, method, axis=None): method: `str` The name of the reduction method. If the method does not have a corresponding active function in the - `active_chunk_functions` dictionary then active - compuations are not carried out. + `active_chunk_functions` dictionary then active storage + computations are not carried out. axis: (sequence of) `int`, optional Axis or axes along which to operate. By default, @@ -171,7 +177,7 @@ def actify(a, method, axis=None): (`dask.array.Array`, function) or (`dask.array.Array`, `None`) If active storage operations are possible then return the - modified dask array and the new chunk reduction + modified Dask array and the new chunk reduction function. Otherwise return the unaltered input array and `None`. @@ -190,6 +196,11 @@ def actify(a, method, axis=None): # return the input data unchanged. return a + url = active_storage_url().value + if url is None: + # TODOACTIVE + return a + # Parse axis ndim = a.ndim if axis is None: @@ -208,24 +219,24 @@ def actify(a, method, axis=None): # the axes, so return the input data unchanged. return a - # Loop round the nodes of the dask graph looking for data + # Loop round the nodes of the Dask graph looking for data # definitions that point to files and which support active storage - # operations, and modify the dask graph when we find them. + # operations, and then modify the Dask graph when we find them. # # The elements are traversed in reverse order so that the data - # defintions come out first, allowing for the potential of a - # faster short circuit when using active storage is not possible. + # definitions will tend to come out first, allowing for the + # potential of a faster short circuit when using active storage is + # not possible. # - # Performance: The optimisation is essential, but can be slow for - # complicated graphs. - url = str(active_storage_url()) + # Performance: The optimising the graph can be slow for + # complicated graphs, but is nonetheless essential. 
ok_to_actify = True dsk = collections_to_dsk((a,), optimize_graph=True) for key, value in reversed(dsk.items()): try: filename = value.get_filename() except AttributeError: - # This dask chunk is not a data definition + # This Dask chunk is not a data definition continue if not filename: @@ -235,7 +246,7 @@ def actify(a, method, axis=None): break # Still here? Then this chunk is a data definition that points - # to files, so try to insert an actified copy into the dask + # to files, so try to insert an actified copy into the Dask # graph. try: dsk[key] = value.actify(url) @@ -246,23 +257,22 @@ def actify(a, method, axis=None): break if not ok_to_actify: - # It turns out that the dask graph is not suitable for active + # It turns out that the Dask graph is not suitable for active # storage reductions, so return the input data unchanged. return a - # Still here? Then all data definitions in the dask graph support - # active storage reductions => redefine the dask array from the - # actified dask graph, and set the active storage reduction chunk - # function. + # Still here? Then the Dask graph supports active storage + # reductions => redefine the Dask array from the + # actified Dask graph. logger.warning( - "At compute time, data will be collapsed with " - f"active storage at URL {url}" + "At compute time, the collapse will be attempted with active " + f"storage at URL {url}" ) return da.Array(dsk, a.name, a.chunks, a.dtype, a._meta) # -------------------------------------------------------------------- -# Decoators +# Decorators # -------------------------------------------------------------------- def active_storage(method): """Decorator for active storage reductions on `Collapse` methods. @@ -288,26 +298,22 @@ def decorator(collapse_method): def wrapper(self, *args, **kwargs): if ( Active is not None - and method in active_reduction_methods and kwargs.get("active_storage") + and cf_active_storage() + # and active_storage_url() + and method in active_reduction_methods and kwargs.get("weights") is None and kwargs.get("chunk_function") is None - and cf_active_storage() - and active_storage_url() ): - # Attempt to actify the dask array + # Attempt to actify the Dask array args = list(args) if args: - dask_array = args.pop(0) + dx = args.pop(0) else: - dask_array = kwargs.pop("a") + dx = kwargs.pop("a") - dask_array = actify( - dask_array, - method=method, - axis=kwargs.get("axis"), - ) - args.insert(0, dask_array) + dx = actify(dx, method=method, axis=kwargs.get("axis")) + args.insert(0, dx) # Run the collapse method return collapse_method(self, *args, **kwargs) @@ -334,24 +340,25 @@ def active_storage_chunk(method): """ - def decorator(chunk): - @wraps(chunk) + def decorator(chunk_function): + @wraps(chunk_function) def wrapper(*args, **kwargs): - if ( - Active is not None - and method in active_reduction_methods - and cf_active_storage() - and active_storage_url() - ): + if args: + x = args[0] + else: + x = kwargs["x"] + + if getattr(x, "actified", False): try: - # Try doing an active storage reduction + # Try doing an active storage reduction on + # actified chunk data return active_chunk(method, *args, **kwargs) - except ValueError: - pass + except ActiveStorageError as error: + # The active storage reduction failed + logger.warning(f"{error}. Reverting to local reduction.") - # Still here? Then we couldn't do an active storage - # reduction, so we'll do a local one. - return chunk(*args, **kwargs) + # Still here? Then do a local reduction. 
+ return chunk_function(*args, **kwargs) return wrapper diff --git a/cf/data/collapse/dask_collapse.py b/cf/data/collapse/dask_collapse.py index a24c9dcd64..c18a8e1118 100644 --- a/cf/data/collapse/dask_collapse.py +++ b/cf/data/collapse/dask_collapse.py @@ -1,7 +1,8 @@ """Reduction functions intended to be passed to be dask. -Most of these functions are expected to be set as *chunk*, *combine* and -*aggregate* parameters of `dask.array.reduction` +Most of these functions are expected to be passed to +`dask.array.reduction` as its *chunk*, *combine* and *aggregate* +parameters. """ from functools import reduce @@ -128,7 +129,6 @@ def sum_weights_chunk( return N - weights = cf_asanyarray(weights) if check_weights: w_min = weights.min() if w_min <= 0: @@ -244,6 +244,11 @@ def cf_mean_chunk( This function is passed to `dask.array.reduction` as its *chunk* parameter. + Weights are interpreted as reliability weights, as opposed to + frequency weights. See + https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Reliability_weights + for details. + .. versionadded:: 3.14.0 :Parameters: @@ -390,13 +395,14 @@ def cf_max_chunk(x, dtype=None, computing_meta=False, **kwargs): Dictionary with the keys: * N: The sample size. - * max: The maximum of `x`. + * max: The maximum of ``x``. """ if computing_meta: return x x = cf_asanyarray(x) + return { "max": chunk.max(x, **kwargs), "N": cf_sample_size_chunk(x, **kwargs)["N"], @@ -550,6 +556,7 @@ def cf_min_chunk(x, dtype=None, computing_meta=False, **kwargs): return x x = cf_asanyarray(x) + return { "min": chunk.min(x, **kwargs), "N": cf_sample_size_chunk(x, **kwargs)["N"], @@ -652,11 +659,11 @@ def cf_range_chunk(x, dtype=None, computing_meta=False, **kwargs): * max: The maximum of ``x``. """ - x = cf_asanyarray(x) - if computing_meta: return x + x = cf_asanyarray(x) + # N, max d = cf_max_chunk(x, **kwargs) @@ -749,6 +756,11 @@ def cf_rms_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): This function is passed to `dask.array.reduction` as its *chunk* parameter. + Weights are interpreted as reliability weights, as opposed to + frequency weights. See + https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Reliability_weights + for details. + .. versionadded:: 3.14.0 :Parameters: @@ -764,11 +776,11 @@ def cf_rms_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): * sum: The weighted sum of ``x**2``. """ - x = cf_asanyarray(x) - if computing_meta: return x + x = cf_asanyarray(x) + return cf_mean_chunk( np.multiply(x, x, dtype=dtype), weights=weights, dtype=dtype, **kwargs ) @@ -842,11 +854,11 @@ def cf_sample_size_chunk(x, dtype="i8", computing_meta=False, **kwargs): * N: The sample size. """ - x = cf_asanyarray(x) - if computing_meta: return x + x = cf_asanyarray(x) + if np.ma.isMA(x): N = chunk.sum(np.ones_like(x, dtype=dtype), **kwargs) else: @@ -974,6 +986,7 @@ def cf_sum_chunk( return x x = cf_asanyarray(x) + if weights is not None: weights = cf_asanyarray(weights) if check_weights: @@ -1091,10 +1104,13 @@ def cf_sum_of_weights_chunk( * sum: The sum of ``weights``. """ - x = cf_asanyarray(x) if computing_meta: return x + x = cf_asanyarray(x) + if weights is not None: + weights = cf_asanyarray(weights) + # N d = cf_sample_size_chunk(x, **kwargs) @@ -1133,10 +1149,13 @@ def cf_sum_of_weights2_chunk( * sum: The sum of the squares of ``weights``. 
""" - x = cf_asanyarray(x) if computing_meta: return x + x = cf_asanyarray(x) + if weights is not None: + weights = cf_asanyarray(weights) + # N d = cf_sample_size_chunk(x, **kwargs) @@ -1171,11 +1190,11 @@ def cf_unique_chunk(x, dtype=None, computing_meta=False, **kwargs): * unique: The unique values. """ - x = cf_asanyarray(x) - if computing_meta: return x + x = cf_asanyarray(x) + return {"unique": np.unique(x)} @@ -1223,10 +1242,28 @@ def cf_var_chunk( This function is passed to `dask.array.reduction` as its *chunk* parameter. + For non-overlapping data sets, X_{i}, making up the aggregate data + set X=\bigcup _{i}X_{i}, the unweighted variance \sigma ^{2} is + + \mu &={\frac {1}{\sum _{i}{N_{X_{i}}}}}\left(\sum + _{i}{N_{X_{i}}\mu _{X_{i}}}\right) + + \sigma ^{2}&={\sqrt {{\frac {1}{\sum + _{i}{N_{X_{i}}-ddof}}}\left(\sum _{i}{\left[(N_{X_{i}}-1)\sigma + _{X_{i}}^{2}+N_{X_{i}}\mu _{X_{i}}^{2}\right]}-\left[\sum + _{i}{N_{X_{i}}}\right]\mu _{X}^{2}\right)}} + + where X_{i}\cap X_{j}=\varnothing , \forall i Date: Thu, 4 Apr 2024 23:50:07 +0100 Subject: [PATCH 080/134] dev --- Changelog.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Changelog.rst b/Changelog.rst index d6ae65c15f..bdacad3b81 100644 --- a/Changelog.rst +++ b/Changelog.rst @@ -44,7 +44,7 @@ version NEXT 131072 (Mean over an ensemble of parallel runs) to be ignored (https://github.com/NCAS-CMS/cf-python/issues/737) * Fix bug in `cf.aggregate` that sometimes put a null transpose - operation into the Dask grpah when one was not needed + operation into the Dask graph when one was not needed (https://github.com/NCAS-CMS/cf-python/issues/754) ---- From a3f805c54a37517418645ec111abe926e94f0c8f Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 5 Apr 2024 11:47:07 +0100 Subject: [PATCH 081/134] dev --- cf/data/array/fullarray.py | 35 +++-- cf/data/array/mixin/arraymixin.py | 2 +- cf/data/array/mixin/cfamixin.py | 16 +- cf/data/array/umarray.py | 182 ++++++++++++++++------ cf/data/fragment/fullfragmentarray.py | 14 +- cf/data/fragment/h5netcdffragmentarray.py | 18 ++- cf/data/fragment/netcdf4fragmentarray.py | 18 ++- cf/data/fragment/netcdffragmentarray.py | 42 +++-- cf/data/fragment/umfragmentarray.py | 18 ++- cf/read_write/netcdf/netcdfread.py | 7 +- cf/read_write/um/umread.py | 23 ++- 11 files changed, 270 insertions(+), 105 deletions(-) diff --git a/cf/data/array/fullarray.py b/cf/data/array/fullarray.py index 48b43442fa..931d57adfb 100644 --- a/cf/data/array/fullarray.py +++ b/cf/data/array/fullarray.py @@ -21,8 +21,9 @@ def __init__( fill_value=None, dtype=None, shape=None, - units=False, - calendar=False, + # units=False, + # calendar=False, + attributes=None, source=None, copy=True, ): @@ -53,6 +54,10 @@ def __init__( will be set to `None` during the first `__getitem__` call. + {{attributes: `dict` or `None`, optional}} + + .. 
versionadded:: NEXTRELEASE + {{init source: optional}} {{init copy: `bool`, optional}} @@ -77,22 +82,28 @@ def __init__( shape = None try: - units = source._get_component("units", False) - except AttributeError: - units = False - - try: - calendar = source._get_component("calendar", None) + attributes = source._get_component("attributes", False) except AttributeError: - calendar = None + attributes = None + + # try: + # units = source._get_component("units", False) + # except AttributeError: + # units = False + # + # try: + # calendar = source._get_component("calendar", None) + # except AttributeError: + # calendar = None self._set_component("full_value", fill_value, copy=False) self._set_component("dtype", dtype, copy=False) self._set_component("shape", shape, copy=False) - self._set_component("units", units, copy=False) - self._set_component("calendar", calendar, copy=False) + # self._set_component("units", units, copy=False) + # self._set_component("calendar", calendar, copy=False) + self._set_component("attributes", attributes, copy=False) - self._set_units() + # self._set_units() def __array_function__(self, func, types, args, kwargs): """The `numpy` `__array_function__` protocol. diff --git a/cf/data/array/mixin/arraymixin.py b/cf/data/array/mixin/arraymixin.py index 6b7412d9ee..d5a5aaa861 100644 --- a/cf/data/array/mixin/arraymixin.py +++ b/cf/data/array/mixin/arraymixin.py @@ -41,4 +41,4 @@ def Units(self): .. versionadded:: 3.14.0 """ - return Units(self.get_units(), self.get_calendar(None)) + return Units(self.get_units(None), self.get_calendar(None)) diff --git a/cf/data/array/mixin/cfamixin.py b/cf/data/array/mixin/cfamixin.py index 9ce84e6c20..525404389a 100644 --- a/cf/data/array/mixin/cfamixin.py +++ b/cf/data/array/mixin/cfamixin.py @@ -43,8 +43,8 @@ def __init__( dtype=None, mask=True, unpack=True, - units=False, - calendar=False, + # units=False, + # calendar=False, instructions=None, substitutions=None, term=None, @@ -190,8 +190,9 @@ def __init__( shape=shape, dtype=dtype, mask=mask, - units=units, - calendar=calendar, + # units=units, + # calendar=calendar, + attributes=attributes, copy=copy, ) else: @@ -200,8 +201,9 @@ def __init__( address=address, dtype=dtype, mask=mask, - units=units, - calendar=calendar, + # units=units, + # calendar=calendar, + attributes=attributes, copy=copy, ) @@ -794,7 +796,7 @@ def to_dask_array(self, chunks="auto"): name = (f"{self.__class__.__name__}-{tokenize(self)}",) dtype = self.dtype - units = self.get_units() + units = self.get_units(None) calendar = self.get_calendar(None) aggregated_data = self.get_aggregated_data(copy=False) diff --git a/cf/data/array/umarray.py b/cf/data/array/umarray.py index 2f79df475a..c35b8606da 100644 --- a/cf/data/array/umarray.py +++ b/cf/data/array/umarray.py @@ -26,8 +26,9 @@ def __init__( fmt=None, word_size=None, byte_ordering=None, - units=False, - calendar=False, + # units=False, + # calendar=False, + attributes=None, source=None, copy=True, ): @@ -73,6 +74,14 @@ def __init__( unset then the calendar will be set during the first `__getitem__` call. + {{attributes: `dict` or `None`, optional}} + + If *attributes* is `None`, the default, then + attributes will be set from the file during the first + `__getitem__` call. + + .. 
versionadded:: NEXTRELEASE + {{init source: optional}} {{init copy: `bool`, optional}} @@ -133,14 +142,19 @@ def __init__( byte_ordering = None try: - units = source._get_component("units", False) - except AttributeError: - units = False - - try: - calendar = source._get_component("calendar", False) + attributes = source._get_component("attributes", None) except AttributeError: - calendar = False + attributes = None + + # try: + # units = source._get_component("units", False) + # except AttributeError: + # units = False + # + # try: + # calendar = source._get_component("calendar", False) + # except AttributeError: + # calendar = False if filename is not None: if isinstance(filename, str): @@ -160,8 +174,9 @@ def __init__( self._set_component("shape", shape, copy=False) self._set_component("dtype", dtype, copy=False) - self._set_component("units", units, copy=False) - self._set_component("calendar", calendar, copy=False) + # self._set_component("units", units, copy=False) + # self._set_component("calendar", calendar, copy=False) + self._set_component("attributes", attributes, copy=False) if fmt is not None: self._set_component("fmt", fmt, copy=False) @@ -211,8 +226,10 @@ def _get_array(self, index=None): array = get_subspace(array, index) + attributes = self.get_attributes({}) + # Set the units, if they haven't been set already. - self._set_units(int_hdr) + self._set_units(int_hdr, attributes) LBUSER2 = int_hdr.item(38) if LBUSER2 == 3: @@ -220,45 +237,48 @@ def _get_array(self, index=None): self._set_component("dtype", np.dtype(bool), copy=False) return array.astype(bool) - integer_array = LBUSER2 == 2 + # integer_array = LBUSER2 == 2 # ------------------------------------------------------------ # Convert to a masked array # ------------------------------------------------------------ # Set the fill_value from BMDI - fill_value = real_hdr.item(17) - if fill_value != -1.0e30: - # -1.0e30 is the flag for no missing data - if integer_array: - # The fill_value must be of the same type as the data - # values - fill_value = int(fill_value) - + self._set_FillValue(int_hdr, real_hdr, attributes) + _FillValue = attributes.get("_FillValue") + if _FillValue is not None: # Mask any missing values - mask = array == fill_value + mask = array == _FillValue if mask.any(): array = np.ma.masked_where(mask, array, copy=False) + # fill_value = real_hdr.item(17) + # if fill_value != -1.0e30: + # # -1.0e30 is the flag for no missing data + # if integer_array: + # # The fill_value must be of the same type as the data + # # values + # fill_value = int(fill_value) + # + # # Mask any missing values + # mask = array == fill_value + # if mask.any(): + # array = np.ma.masked_where(mask, array, copy=False) + # ------------------------------------------------------------ # Unpack the array using the scale_factor and add_offset, if # either is available # ------------------------------------------------------------ - # Treat BMKS as a scale_factor if it is neither 0 nor 1 - scale_factor = real_hdr.item(18) - if scale_factor != 1.0 and scale_factor != 0.0: - if integer_array: - scale_factor = int(scale_factor) - + self._set_unpack(int_hdr, real_hdr, attributes) + scale_factor = attributes.get("scale_factor") + if scale_factor is not None: array *= scale_factor - # Treat BDATUM as an add_offset if it is not 0 - add_offset = real_hdr.item(4) - if add_offset != 0.0: - if integer_array: - add_offset = int(add_offset) - + add_offset = attributes.get("add_offset") + if add_offset is not None: array += add_offset + 
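# In plain numpy terms, the masking and unpacking steps above amount to the
# following; the BMDI, BMKS and BDATUM values are invented examples rather
# than values read from a real lookup header.
import numpy as np

raw = np.array([[1.0, 2.0, -1.0e30], [4.0, -1.0e30, 6.0]])
BMDI, BMKS, BDATUM = -1.0e30, 0.5, 100.0  # missing-data flag, scale, offset

data = np.ma.masked_where(raw == BMDI, raw)  # mask points flagged by BMDI
data = data * BMKS + BDATUM                  # unpack: scale, then offset

print(data)  # masked points print as --; the rest are scaled and offset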
self._set_component("attributes", attributes, copy=False) + # Set the data type self._set_component("dtype", array.dtype, copy=False) @@ -304,7 +324,44 @@ def _get_rec(self, f, header_offset): # if r.hdr_offset == header_offset: # return r - def _set_units(self, int_hdr): + def _set_FillValue(self, int_hdr, real_hdr, attributes): + """TODO""" + if "FillValue" in attributes: + return + + # Set the fill_value from BMDI + _FillValue = real_hdr.item(17) + if _FillValue != -1.0e30: + # -1.0e30 is the flag for no missing data + if int_hdr.item(38) == 2: + # Must have an integer _FillValue for integer data + _FillValue = int(_FillValue) + + attributes["_FillValue"] = _FillValue + + def _set_unpack(self, int_hdr, real_hdr, attributes): + """TODO""" + if "scale_factor" not in attributes: + # Treat BMKS as a scale_factor if it is neither 0 nor 1 + scale_factor = real_hdr.item(18) + if scale_factor != 1.0 and scale_factor != 0.0: + if int_hdr.item(38) == 2: + # Must have an integer scale_factor for integer data + scale_factor = int(scale_factor) + + attributes["scale_factor"] = scale_factor + + if "add_offset" not in attributes: + # Treat BDATUM as an add_offset if it is not 0 + add_offset = real_hdr.item(4) + if add_offset != 0.0: + if int_hdr.item(38) == 2: + # Must have an integer add_offset for integer data + add_offset = int(add_offset) + + attributes["add_offset"] = add_offset + + def _set_units(self, int_hdr, attributes): """The units and calendar properties. These are set from inpection of the integer header, but only @@ -325,10 +382,8 @@ def _set_units(self, int_hdr): `None`. """ - units = self._get_component("units", False) - if units is False: + if "units" not in attributes: units = None - if not _stash2standard_name: load_stash2standard_name() @@ -358,14 +413,49 @@ def _set_units(self, int_hdr): units = units0 break - self._set_component("units", units, copy=False) - - calendar = self._get_component("calendar", False) - if calendar is False: - calendar = None - self._set_component("calendar", calendar, copy=False) - - return units, calendar + attributes["units"] = units + + # units = self._get_component("units", False) + # if units is False: + # units = None + # + # if not _stash2standard_name: + # load_stash2standard_name() + # + # submodel = int_hdr[44] + # stash = int_hdr[41] + # records = _stash2standard_name.get((submodel, stash)) + # if records: + # LBSRCE = int_hdr[37] + # version, source = divmod(LBSRCE, 10000) + # if version <= 0: + # version = 405.0 + # + # for ( + # long_name, + # units0, + # valid_from, + # valid_to, + # standard_name, + # cf_info, + # condition, + # ) in records: + # if not self._test_version( + # valid_from, valid_to, version + # ) or not self._test_condition(condition, int_hdr): + # continue + # + # units = units0 + # break + # + # self._set_component("units", units, copy=False) + # + # calendar = self._get_component("calendar", False) + # if calendar is False: + # calendar = None + # self._set_component("calendar", calendar, copy=False) + # + # return units, calendar def _test_condition(self, condition, int_hdr): """Return `True` if a field satisfies a condition for a STASH diff --git a/cf/data/fragment/fullfragmentarray.py b/cf/data/fragment/fullfragmentarray.py index eecb50ef16..9197f8ef0b 100644 --- a/cf/data/fragment/fullfragmentarray.py +++ b/cf/data/fragment/fullfragmentarray.py @@ -16,8 +16,9 @@ def __init__( shape=None, aggregated_units=False, aggregated_calendar=False, - units=False, - calendar=False, + # units=False, + # calendar=False, + 
attributes=None, source=None, copy=True, ): @@ -53,6 +54,10 @@ def __init__( unset then the calendar will be set to `None` during the first `__getitem__` call. + {{attributes: `dict` or `None`, optional}} + + .. versionadded:: NEXTRELEASE + {{aggregated_units: `str` or `None`, optional}} {{aggregated_calendar: `str` or `None`, optional}} @@ -66,8 +71,9 @@ def __init__( fill_value=fill_value, dtype=dtype, shape=shape, - units=units, - calendar=calendar, + # units=units, + # calendar=calendar, + attributes=attributes, source=source, copy=False, ) diff --git a/cf/data/fragment/h5netcdffragmentarray.py b/cf/data/fragment/h5netcdffragmentarray.py index 2f140df6ff..796da15d2e 100644 --- a/cf/data/fragment/h5netcdffragmentarray.py +++ b/cf/data/fragment/h5netcdffragmentarray.py @@ -17,8 +17,9 @@ def __init__( shape=None, aggregated_units=False, aggregated_calendar=False, - units=False, - calendar=None, + # units=False, + # calendar=None, + attributes=None, storage_options=None, source=None, copy=True, @@ -59,6 +60,14 @@ def __init__( unset then the calendar will be set during the first `__getitem__` call. + {{attributes: `dict` or `None`, optional}} + + If *attributes* is `None`, the default, then the + attributes will be set from the file during the first + `__getitem__` call. + + .. versionadded:: NEXTRELEASE + {{aggregated_units: `str` or `None`, optional}} {{aggregated_calendar: `str` or `None`, optional}} @@ -76,8 +85,9 @@ def __init__( dtype=dtype, shape=shape, mask=True, - units=units, - calendar=calendar, + # units=units, + # calendar=calendar, + attributes=attributes, storage_options=storage_options, source=source, copy=copy, diff --git a/cf/data/fragment/netcdf4fragmentarray.py b/cf/data/fragment/netcdf4fragmentarray.py index 12ae8c201d..5001df2a61 100644 --- a/cf/data/fragment/netcdf4fragmentarray.py +++ b/cf/data/fragment/netcdf4fragmentarray.py @@ -17,8 +17,9 @@ def __init__( shape=None, aggregated_units=False, aggregated_calendar=False, - units=False, - calendar=None, + # units=False, + # calendar=None, + attributes=None, storage_options=None, source=None, copy=True, @@ -59,6 +60,14 @@ def __init__( unset then the calendar will be set during the first `__getitem__` call. + {{attributes: `dict` or `None`, optional}} + + If *attributes* is `None`, the default, then the + attributes will be set from the file during the first + `__getitem__` call. + + .. versionadded:: NEXTRELEASE + {{aggregated_units: `str` or `None`, optional}} {{aggregated_calendar: `str` or `None`, optional}} @@ -76,8 +85,9 @@ def __init__( dtype=dtype, shape=shape, mask=True, - units=units, - calendar=calendar, + # units=units, + # calendar=calendar, + attributes=attributes, storage_options=storage_options, source=source, copy=copy, diff --git a/cf/data/fragment/netcdffragmentarray.py b/cf/data/fragment/netcdffragmentarray.py index 699cf790ab..6db4cd439e 100644 --- a/cf/data/fragment/netcdffragmentarray.py +++ b/cf/data/fragment/netcdffragmentarray.py @@ -32,8 +32,9 @@ def __init__( shape=None, aggregated_units=False, aggregated_calendar=False, - units=False, - calendar=None, + # units=False, + # calendar=None, + attributes=None, storage_options=None, source=None, copy=True, @@ -74,6 +75,14 @@ def __init__( unset then the calendar will be set during the first `__getitem__` call. + {{attributes: `dict` or `None`, optional}} + + If *attributes* is `None`, the default, then the + attributes will be set from the file during the first + `__getitem__` call. + + .. 
versionadded:: NEXTRELEASE + {{aggregated_units: `str` or `None`, optional}} {{aggregated_calendar: `str` or `None`, optional}} @@ -114,14 +123,19 @@ def __init__( dtype = None try: - units = source._get_component("units", False) - except AttributeError: - units = False - - try: - calendar = source._get_component("calendar", False) + attributes = source._get_component("attributes", None) except AttributeError: - calendar = False + attributes = None + + # try: + # units = source._get_component("units", False) + # except AttributeError: + # units = False + # + # try: + # calendar = source._get_component("calendar", False) + # except AttributeError: + # calendar = False try: aggregated_units = source._get_component( @@ -165,8 +179,9 @@ def __init__( self._set_component("shape", shape, copy=False) self._set_component("dtype", dtype, copy=False) - self._set_component("units", units, copy=False) - self._set_component("calendar", calendar, copy=False) + # self._set_component("units", units, copy=False) + # self._set_component("calendar", calendar, copy=False) + self._set_component("attributes", attributes, copy=False) self._set_component("mask", True, copy=False) self._set_component("aggregated_units", aggregated_units, copy=False) @@ -191,8 +206,9 @@ def __getitem__(self, indices): "shape": self.shape, "aggregated_units": self.get_aggregated_units(None), "aggregated_calendar": self.get_aggregated_calendar(None), - "units": self.get_units(None), - "calendar": self.get_units(None), + # "units": self.get_units(None), + # "calendar": self.get_units(None), + "attributes": self.get_attributes(None), "copy": False, } diff --git a/cf/data/fragment/umfragmentarray.py b/cf/data/fragment/umfragmentarray.py index a30737f46d..d99a26326a 100644 --- a/cf/data/fragment/umfragmentarray.py +++ b/cf/data/fragment/umfragmentarray.py @@ -17,8 +17,9 @@ def __init__( shape=None, aggregated_units=False, aggregated_calendar=False, - units=False, - calendar=False, + # units=False, + # calendar=False, + attributes=None, source=None, copy=True, ): @@ -56,6 +57,14 @@ def __init__( unset then the calendar will be set during the first `__getitem__` call. + {{attributes: `dict` or `None`, optional}} + + If *attributes* is `None`, the default, then the + attributes will be set from the file during the first + `__getitem__` call. + + .. 
versionadded:: NEXTRELEASE + {{aggregated_units: `str` or `None`, optional}} {{aggregated_calendar: `str` or `None`, optional}} @@ -70,8 +79,9 @@ def __init__( address=address, dtype=dtype, shape=shape, - units=units, - calendar=calendar, + # units=units, + # calendar=calendar, + attributes=attributes, source=source, copy=False, ) diff --git a/cf/read_write/netcdf/netcdfread.py b/cf/read_write/netcdf/netcdfread.py index 40eb2e460b..901f348343 100644 --- a/cf/read_write/netcdf/netcdfread.py +++ b/cf/read_write/netcdf/netcdfread.py @@ -253,11 +253,14 @@ def _create_data( coord_ncvar=coord_ncvar, ) + attributes = kwargs["attributes"] data = self._create_Data( cfa_array, ncvar, - units=kwargs["units"], - calendar=kwargs["calendar"], + units=attributes.get("units"), # units=kwargs["units"], + calendar=attributes.get( + "calendar" + ), # calendar=kwargs["calendar"], ) # Note: We don't cache elements from CFA variables, because diff --git a/cf/read_write/um/umread.py b/cf/read_write/um/umread.py index a409ba2bd6..29535235f5 100644 --- a/cf/read_write/um/umread.py +++ b/cf/read_write/um/umread.py @@ -1957,8 +1957,12 @@ def create_data(self): recs = self.recs um_Units = self.um_Units - units = getattr(um_Units, "units", None) - calendar = getattr(um_Units, "calendar", None) + # units = getattr(um_Units, "units", None) + # calendar = getattr(um_Units, "calendar", None) + attributes = { + "units": getattr(um_Units, "units", None), + "calendar": getattr(um_Units, "calendar", None), + } data_type_in_file = self.data_type_in_file @@ -1999,8 +2003,9 @@ def create_data(self): fmt=fmt, word_size=self.word_size, byte_ordering=self.byte_ordering, - units=units, - calendar=calendar, + # units=units, + # calendar=calendar, + attributes=attributes, ) key = f"{klass_name}-{tokenize(subarray)}" @@ -2053,8 +2058,9 @@ def create_data(self): fmt=fmt, word_size=word_size, byte_ordering=byte_ordering, - units=units, - calendar=calendar, + # units=units, + # calendar=calendar, + attributes=attributes, ) key = f"{klass_name}-{tokenize(subarray)}" @@ -2104,8 +2110,9 @@ def create_data(self): fmt=fmt, word_size=word_size, byte_ordering=byte_ordering, - units=units, - calendar=calendar, + # units=units, + # calendar=calendar, + attributes=attributes, ) key = f"{klass_name}-{tokenize(subarray)}" From 75e48978b35e3df26d40f88ee7e601205fe84327 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 5 Apr 2024 15:53:34 +0100 Subject: [PATCH 082/134] dev --- cf/data/array/fullarray.py | 39 +-- cf/data/array/umarray.py | 320 ++++++++++-------------- cf/data/fragment/netcdffragmentarray.py | 27 +- cf/data/fragment/umfragmentarray.py | 10 +- cf/read_write/um/umread.py | 8 - cf/test/test_regrid_mesh.py | 2 + 6 files changed, 167 insertions(+), 239 deletions(-) diff --git a/cf/data/array/fullarray.py b/cf/data/array/fullarray.py index 931d57adfb..eb2460f401 100644 --- a/cf/data/array/fullarray.py +++ b/cf/data/array/fullarray.py @@ -21,8 +21,6 @@ def __init__( fill_value=None, dtype=None, shape=None, - # units=False, - # calendar=False, attributes=None, source=None, copy=True, @@ -41,20 +39,7 @@ def __init__( shape: `tuple` The array dimension sizes. - units: `str` or `None`, optional - The units of the netCDF variable. Set to `None` to - indicate that there are no units. If unset then the - units will be set to `None` during the first - `__getitem__` call. - - calendar: `str` or `None`, optional - The calendar of the netCDF variable. By default, or if - set to `None`, then the CF default calendar is - assumed, if applicable. 
If unset then the calendar - will be set to `None` during the first `__getitem__` - call. - - {{attributes: `dict` or `None`, optional}} + {{init attributes: `dict` or `None`, optional}} .. versionadded:: NEXTRELEASE @@ -62,6 +47,14 @@ def __init__( {{init copy: `bool`, optional}} + units: `str` or `None`, optional + Deprecated at version NEXTRELEASE. Use the + *attributes* parameter instead. + + calendar: `str` or `None`, optional + Deprecated at version NEXTRELEASE. Use the + *attributes* parameter instead. + """ super().__init__(source=source, copy=copy) @@ -86,25 +79,11 @@ def __init__( except AttributeError: attributes = None - # try: - # units = source._get_component("units", False) - # except AttributeError: - # units = False - # - # try: - # calendar = source._get_component("calendar", None) - # except AttributeError: - # calendar = None - self._set_component("full_value", fill_value, copy=False) self._set_component("dtype", dtype, copy=False) self._set_component("shape", shape, copy=False) - # self._set_component("units", units, copy=False) - # self._set_component("calendar", calendar, copy=False) self._set_component("attributes", attributes, copy=False) - # self._set_units() - def __array_function__(self, func, types, args, kwargs): """The `numpy` `__array_function__` protocol. diff --git a/cf/data/array/umarray.py b/cf/data/array/umarray.py index c35b8606da..96ce6ad763 100644 --- a/cf/data/array/umarray.py +++ b/cf/data/array/umarray.py @@ -1,12 +1,7 @@ import cfdm -import numpy as np from ...constants import _stash2standard_name -from ...functions import ( - _DEPRECATION_ERROR_ATTRIBUTE, - get_subspace, - load_stash2standard_name, -) +from ...functions import _DEPRECATION_ERROR_ATTRIBUTE, load_stash2standard_name from ...umread_lib.umfile import File, Rec from .abstract import Array from .mixin import FileArrayMixin, IndexMixin @@ -26,8 +21,6 @@ def __init__( fmt=None, word_size=None, byte_ordering=None, - # units=False, - # calendar=False, attributes=None, source=None, copy=True, @@ -63,22 +56,13 @@ def __init__( byte_ordering: `str`, optional ``'little_endian'`` or ``'big_endian'`` - units: `str` or `None`, optional - The units of the fragment data. Set to `None` to - indicate that there are no units. If unset then the - units will be set during the first `__getitem__` call. - - calendar: `str` or `None`, optional - The calendar of the fragment data. Set to `None` to - indicate the CF default calendar, if applicable. If - unset then the calendar will be set during the first - `__getitem__` call. - {{attributes: `dict` or `None`, optional}} - If *attributes* is `None`, the default, then - attributes will be set from the file during the first - `__getitem__` call. + During the first `__getitem__` call, any of the + ``_FillValue``, ``add_offset``, ``scale_factor``, + ``units``, and ``calendar`` attributes which haven't + already been set will be inferred from the lookup + header and cached for future use. .. versionadded:: NEXTRELEASE @@ -93,7 +77,7 @@ def __init__( Deprecated at version 3.14.0. header_offset: `int` - Deprecated at version 3.15.0. use the *address* + Deprecated at version 3.15.0. Use the *address* parameter instead. data_offset: `int`, optional @@ -102,6 +86,14 @@ def __init__( disk_length: `int`, optional Deprecated at version 3.15.0. + units: `str` or `None`, optional + Deprecated at version NEXTRELEASE. Use the + *attributes* parameter instead. + + calendar: `str` or `None`, optional + Deprecated at version NEXTRELEASE. 
Use the + *attributes* parameter instead. + """ super().__init__(source=source, copy=copy) @@ -146,16 +138,6 @@ def __init__( except AttributeError: attributes = None - # try: - # units = source._get_component("units", False) - # except AttributeError: - # units = False - # - # try: - # calendar = source._get_component("calendar", False) - # except AttributeError: - # calendar = False - if filename is not None: if isinstance(filename, str): filename = (filename,) @@ -174,8 +156,6 @@ def __init__( self._set_component("shape", shape, copy=False) self._set_component("dtype", dtype, copy=False) - # self._set_component("units", units, copy=False) - # self._set_component("calendar", calendar, copy=False) self._set_component("attributes", attributes, copy=False) if fmt is not None: @@ -224,60 +204,26 @@ def _get_array(self, index=None): self.close(f) del f, rec - array = get_subspace(array, index) - + # Set the netCDF attributes for the data attributes = self.get_attributes({}) - - # Set the units, if they haven't been set already. self._set_units(int_hdr, attributes) - - LBUSER2 = int_hdr.item(38) - if LBUSER2 == 3: - # Return the numpy array now if it is a boolean array - self._set_component("dtype", np.dtype(bool), copy=False) - return array.astype(bool) - - # integer_array = LBUSER2 == 2 - - # ------------------------------------------------------------ - # Convert to a masked array - # ------------------------------------------------------------ - # Set the fill_value from BMDI self._set_FillValue(int_hdr, real_hdr, attributes) - _FillValue = attributes.get("_FillValue") - if _FillValue is not None: - # Mask any missing values - mask = array == _FillValue - if mask.any(): - array = np.ma.masked_where(mask, array, copy=False) - - # fill_value = real_hdr.item(17) - # if fill_value != -1.0e30: - # # -1.0e30 is the flag for no missing data - # if integer_array: - # # The fill_value must be of the same type as the data - # # values - # fill_value = int(fill_value) - # - # # Mask any missing values - # mask = array == fill_value - # if mask.any(): - # array = np.ma.masked_where(mask, array, copy=False) - - # ------------------------------------------------------------ - # Unpack the array using the scale_factor and add_offset, if - # either is available - # ------------------------------------------------------------ self._set_unpack(int_hdr, real_hdr, attributes) - scale_factor = attributes.get("scale_factor") - if scale_factor is not None: - array *= scale_factor + self._set_component("attributes", attributes, copy=False) - add_offset = attributes.get("add_offset") - if add_offset is not None: - array += add_offset + # Get the data subspace, applying any masking and unpacking + array = cfdm.netcdf_indexer( + array, + mask=True, + unpack=True, + always_masked_array=False, + attributes=attributes, + ) + array = array[index] - self._set_component("attributes", attributes, copy=False) + if int_hdr.item(38) == 3: + # Convert the data to a boolean array + array = array.astype(bool) # Set the data type self._set_component("dtype", array.dtype, copy=False) @@ -325,7 +271,28 @@ def _get_rec(self, f, header_offset): # return r def _set_FillValue(self, int_hdr, real_hdr, attributes): - """TODO""" + """Set the ``_FillValue`` attribute. + + .. versionadded:: NEXTRELEASE + + :Parameters: + + int_hdr: `numpy.ndarray` + The integer header of the data. + + real_header: `numpy.ndarray` + The real header of the data. + + attributes: `dict` + The dictionary in which to store the new + attributes. 
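The `cfdm.netcdf_indexer` call used above can also be exercised on its own.
In the sketch below the raw values and the CF packing attributes are
invented; the wrapper applies the masking and unpacking only when the
subspace is actually taken.

import cfdm
import numpy as np

raw = np.array([[1, 2, -99], [4, -99, 6]], dtype="int16")
attributes = {"_FillValue": -99, "scale_factor": 0.5, "add_offset": 100.0}

wrapped = cfdm.netcdf_indexer(
    raw,
    mask=True,
    unpack=True,
    always_masked_array=False,
    attributes=attributes,
)
print(wrapped[...])  # masked where raw == -99, otherwise raw*0.5 + 100.0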
If a new attribute exists then + *attributes* is updated in-place. + + :Returns: + + `None + + """ if "FillValue" in attributes: return @@ -339,8 +306,89 @@ def _set_FillValue(self, int_hdr, real_hdr, attributes): attributes["_FillValue"] = _FillValue + def _set_units(self, int_hdr, attributes): + """Set the ``units`` attribute. + + .. versionadded:: 3.14.0 + + .. versionadded:: NEXTRELEASE + + :Parameters: + + int_hdr: `numpy.ndarray` + The integer header of the data. + + real_header: `numpy.ndarray` + The real header of the data. + + attributes: `dict` + The dictionary in which to store the new + attributes. If a new attribute exists then + *attributes* is updated in-place. + + :Returns: + + `None` + + """ + if "units" in attributes: + return + + units = None + if not _stash2standard_name: + load_stash2standard_name() + + submodel = int_hdr.item(44) + stash = int_hdr.item(41) + records = _stash2standard_name.get((submodel, stash)) + if records: + LBSRCE = int_hdr.item(37) + version, source = divmod(LBSRCE, 10000) + if version <= 0: + version = 405.0 + + for ( + long_name, + units0, + valid_from, + valid_to, + standard_name, + cf_info, + condition, + ) in records: + if not self._test_version( + valid_from, valid_to, version + ) or not self._test_condition(condition, int_hdr): + continue + + units = units0 + break + + attributes["units"] = units + def _set_unpack(self, int_hdr, real_hdr, attributes): - """TODO""" + """Set the ``add_offset`` and ``scale_factor`` attributes. + + .. versionadded:: NEXTRELEASE + + :Parameters: + + int_hdr: `numpy.ndarray` + The integer header of the data. + + real_header: `numpy.ndarray` + The real header of the data. + + attributes: `dict` + The dictionary in which to store the new + attributes. If any new attributes exist then + *attributes* is updated in-place. + + :Returns: + + `None + + """ if "scale_factor" not in attributes: # Treat BMKS as a scale_factor if it is neither 0 nor 1 scale_factor = real_hdr.item(18) @@ -361,102 +409,6 @@ def _set_unpack(self, int_hdr, real_hdr, attributes): attributes["add_offset"] = add_offset - def _set_units(self, int_hdr, attributes): - """The units and calendar properties. - - These are set from inpection of the integer header, but only - if they have already not been defined, either during {{class}} - instantiation or by a previous call to `_set_units`. - - .. versionadded:: 3.14.0 - - :Parameters: - - int_hdr: `numpy.ndarray` - The integer header of the data. - - :Returns: - - `tuple` - The units and calendar values, either of which may be - `None`. 
- - """ - if "units" not in attributes: - units = None - if not _stash2standard_name: - load_stash2standard_name() - - submodel = int_hdr[44] - stash = int_hdr[41] - records = _stash2standard_name.get((submodel, stash)) - if records: - LBSRCE = int_hdr[37] - version, source = divmod(LBSRCE, 10000) - if version <= 0: - version = 405.0 - - for ( - long_name, - units0, - valid_from, - valid_to, - standard_name, - cf_info, - condition, - ) in records: - if not self._test_version( - valid_from, valid_to, version - ) or not self._test_condition(condition, int_hdr): - continue - - units = units0 - break - - attributes["units"] = units - - # units = self._get_component("units", False) - # if units is False: - # units = None - # - # if not _stash2standard_name: - # load_stash2standard_name() - # - # submodel = int_hdr[44] - # stash = int_hdr[41] - # records = _stash2standard_name.get((submodel, stash)) - # if records: - # LBSRCE = int_hdr[37] - # version, source = divmod(LBSRCE, 10000) - # if version <= 0: - # version = 405.0 - # - # for ( - # long_name, - # units0, - # valid_from, - # valid_to, - # standard_name, - # cf_info, - # condition, - # ) in records: - # if not self._test_version( - # valid_from, valid_to, version - # ) or not self._test_condition(condition, int_hdr): - # continue - # - # units = units0 - # break - # - # self._set_component("units", units, copy=False) - # - # calendar = self._get_component("calendar", False) - # if calendar is False: - # calendar = None - # self._set_component("calendar", calendar, copy=False) - # - # return units, calendar - def _test_condition(self, condition, int_hdr): """Return `True` if a field satisfies a condition for a STASH code to standard name conversion. @@ -486,14 +438,14 @@ def _test_condition(self, condition, int_hdr): return True if condition == "true_latitude_longitude": - LBCODE = int_hdr[15] + LBCODE = int_hdr.item(15) # LBCODE 1: Unrotated regular lat/long grid # LBCODE 2 = Regular lat/lon grid boxes (grid points are # box centres) if LBCODE in (1, 2): return True elif condition == "rotated_latitude_longitude": - LBCODE = int_hdr[15] + LBCODE = int_hdr.item(15) # LBCODE 101: Rotated regular lat/long grid # LBCODE 102: Rotated regular lat/lon grid boxes (grid # points are box centres) @@ -791,7 +743,7 @@ def open(self): **Examples** >>> f.open() - (, 4) + (, 4) """ return super().open( diff --git a/cf/data/fragment/netcdffragmentarray.py b/cf/data/fragment/netcdffragmentarray.py index 6db4cd439e..637f0174d3 100644 --- a/cf/data/fragment/netcdffragmentarray.py +++ b/cf/data/fragment/netcdffragmentarray.py @@ -32,8 +32,6 @@ def __init__( shape=None, aggregated_units=False, aggregated_calendar=False, - # units=False, - # calendar=None, attributes=None, storage_options=None, source=None, @@ -64,22 +62,11 @@ def __init__( fragment variable in that the latter may have fewer size 1 dimensions. - units: `str` or `None`, optional - The units of the fragment data. Set to `None` to - indicate that there are no units. If unset then the - units will be set during the first `__getitem__` call. - - calendar: `str` or `None`, optional - The calendar of the fragment data. Set to `None` to - indicate the CF default calendar, if applicable. If - unset then the calendar will be set during the first - `__getitem__` call. - - {{attributes: `dict` or `None`, optional}} + {{init attributes: `dict` or `None`, optional}} If *attributes* is `None`, the default, then the - attributes will be set from the file during the first - `__getitem__` call. 
+ netCDF attributes will be set from the netCDF variable + during the first `__getitem__` call. .. versionadded:: NEXTRELEASE @@ -95,6 +82,14 @@ def __init__( {{init copy: `bool`, optional}} + units: `str` or `None`, optional + Deprecated at version NEXTRELEASE. Use the + *attributes* parameter instead. + + calendar: `str` or `None`, optional + Deprecated at version NEXTRELEASE. Use the + *attributes* parameter instead. + """ super().__init__( source=source, diff --git a/cf/data/fragment/umfragmentarray.py b/cf/data/fragment/umfragmentarray.py index d99a26326a..dbc0e88361 100644 --- a/cf/data/fragment/umfragmentarray.py +++ b/cf/data/fragment/umfragmentarray.py @@ -57,7 +57,7 @@ def __init__( unset then the calendar will be set during the first `__getitem__` call. - {{attributes: `dict` or `None`, optional}} + {{init attributes: `dict` or `None`, optional}} If *attributes* is `None`, the default, then the attributes will be set from the file during the first @@ -73,6 +73,14 @@ def __init__( {{init copy: `bool`, optional}} + units: `str` or `None`, optional + Deprecated at version NEXTRELEASE. Use the + *attributes* parameter instead. + + calendar: `str` or `None`, optional + Deprecated at version NEXTRELEASE. Use the + *attributes* parameter instead. + """ super().__init__( filename=filename, diff --git a/cf/read_write/um/umread.py b/cf/read_write/um/umread.py index 29535235f5..849529e39d 100644 --- a/cf/read_write/um/umread.py +++ b/cf/read_write/um/umread.py @@ -1957,8 +1957,6 @@ def create_data(self): recs = self.recs um_Units = self.um_Units - # units = getattr(um_Units, "units", None) - # calendar = getattr(um_Units, "calendar", None) attributes = { "units": getattr(um_Units, "units", None), "calendar": getattr(um_Units, "calendar", None), @@ -2003,8 +2001,6 @@ def create_data(self): fmt=fmt, word_size=self.word_size, byte_ordering=self.byte_ordering, - # units=units, - # calendar=calendar, attributes=attributes, ) @@ -2058,8 +2054,6 @@ def create_data(self): fmt=fmt, word_size=word_size, byte_ordering=byte_ordering, - # units=units, - # calendar=calendar, attributes=attributes, ) @@ -2110,8 +2104,6 @@ def create_data(self): fmt=fmt, word_size=word_size, byte_ordering=byte_ordering, - # units=units, - # calendar=calendar, attributes=attributes, ) diff --git a/cf/test/test_regrid_mesh.py b/cf/test/test_regrid_mesh.py index 3095640135..da070395fc 100644 --- a/cf/test/test_regrid_mesh.py +++ b/cf/test/test_regrid_mesh.py @@ -161,6 +161,8 @@ def test_Field_regrid_mesh_to_mesh(self): use_dst_mask=use_dst_mask, ) + print("\ny=", y) + # print ('a=', a) self.assertTrue(np.allclose(y, a, atol=atol, rtol=rtol)) if isinstance(a, np.ma.MaskedArray): From 222a18be1e113963998cb12ea2c0ce1e2a2db63c Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 5 Apr 2024 17:07:17 +0100 Subject: [PATCH 083/134] dev --- cf/data/fragment/h5netcdffragmentarray.py | 19 ++----------------- cf/data/fragment/netcdffragmentarray.py | 4 ++-- cf/data/fragment/umfragmentarray.py | 23 +++++------------------ 3 files changed, 9 insertions(+), 37 deletions(-) diff --git a/cf/data/fragment/h5netcdffragmentarray.py b/cf/data/fragment/h5netcdffragmentarray.py index 796da15d2e..8ea98761f4 100644 --- a/cf/data/fragment/h5netcdffragmentarray.py +++ b/cf/data/fragment/h5netcdffragmentarray.py @@ -17,8 +17,6 @@ def __init__( shape=None, aggregated_units=False, aggregated_calendar=False, - # units=False, - # calendar=None, attributes=None, storage_options=None, source=None, @@ -49,22 +47,11 @@ def __init__( fragment variable in 
that the latter may have fewer size 1 dimensions. - units: `str` or `None`, optional - The units of the fragment data. Set to `None` to - indicate that there are no units. If unset then the - units will be set during the first `__getitem__` call. - - calendar: `str` or `None`, optional - The calendar of the fragment data. Set to `None` to - indicate the CF default calendar, if applicable. If - unset then the calendar will be set during the first - `__getitem__` call. - {{attributes: `dict` or `None`, optional}} If *attributes* is `None`, the default, then the - attributes will be set from the file during the first - `__getitem__` call. + attributes will be set from the netCDF variable during + the first `__getitem__` call. .. versionadded:: NEXTRELEASE @@ -85,8 +72,6 @@ def __init__( dtype=dtype, shape=shape, mask=True, - # units=units, - # calendar=calendar, attributes=attributes, storage_options=storage_options, source=source, diff --git a/cf/data/fragment/netcdffragmentarray.py b/cf/data/fragment/netcdffragmentarray.py index 637f0174d3..c38710cfdb 100644 --- a/cf/data/fragment/netcdffragmentarray.py +++ b/cf/data/fragment/netcdffragmentarray.py @@ -65,8 +65,8 @@ def __init__( {{init attributes: `dict` or `None`, optional}} If *attributes* is `None`, the default, then the - netCDF attributes will be set from the netCDF variable - during the first `__getitem__` call. + attributes will be set from the netCDF variable during + the first `__getitem__` call. .. versionadded:: NEXTRELEASE diff --git a/cf/data/fragment/umfragmentarray.py b/cf/data/fragment/umfragmentarray.py index dbc0e88361..9c5dbed13a 100644 --- a/cf/data/fragment/umfragmentarray.py +++ b/cf/data/fragment/umfragmentarray.py @@ -17,8 +17,6 @@ def __init__( shape=None, aggregated_units=False, aggregated_calendar=False, - # units=False, - # calendar=False, attributes=None, source=None, copy=True, @@ -46,22 +44,13 @@ def __init__( fragment variable in that the latter may have fewer size 1 dimensions. - units: `str` or `None`, optional - The units of the fragment data. Set to `None` to - indicate that there are no units. If unset then the - units will be set during the first `__getitem__` call. - - calendar: `str` or `None`, optional - The calendar of the fragment data. Set to `None` to - indicate the CF default calendar, if applicable. If - unset then the calendar will be set during the first - `__getitem__` call. - {{init attributes: `dict` or `None`, optional}} - If *attributes* is `None`, the default, then the - attributes will be set from the file during the first - `__getitem__` call. + During the first `__getitem__` call, any of the + ``_FillValue``, ``add_offset``, ``scale_factor``, + ``units``, and ``calendar`` attributes which haven't + already been set will be inferred from the lookup + header and cached for future use. .. 
versionadded:: NEXTRELEASE @@ -87,8 +76,6 @@ def __init__( address=address, dtype=dtype, shape=shape, - # units=units, - # calendar=calendar, attributes=attributes, source=source, copy=False, From bdbbd6c4372386365c7a392c5ca85e4aa279825e Mon Sep 17 00:00:00 2001 From: David Hassell Date: Sat, 6 Apr 2024 11:38:45 +0100 Subject: [PATCH 084/134] dev --- cf/data/array/fullarray.py | 4 ++++ cf/data/array/h5netcdfarray.py | 4 ++++ cf/data/array/netcdf4array.py | 4 ++++ cf/data/array/umarray.py | 12 ++++++++++- cf/data/fragment/fullfragmentarray.py | 26 ++++++++--------------- cf/data/fragment/h5netcdffragmentarray.py | 4 +--- cf/data/fragment/netcdf4fragmentarray.py | 10 +++------ cf/data/fragment/netcdffragmentarray.py | 19 ++--------------- cf/test/test_regrid_mesh.py | 2 -- 9 files changed, 38 insertions(+), 47 deletions(-) diff --git a/cf/data/array/fullarray.py b/cf/data/array/fullarray.py index eb2460f401..f900b221b7 100644 --- a/cf/data/array/fullarray.py +++ b/cf/data/array/fullarray.py @@ -139,6 +139,8 @@ def _get_array(self, index=None): The subspace. """ + # REVIEW: getitem + if index is None: shape = self.shape else: @@ -167,6 +169,8 @@ def array(self): `numpy.ndarray` An independent numpy array of the data. """ + # REVIEW: getitem + return np.asanyarray(self) @property diff --git a/cf/data/array/h5netcdfarray.py b/cf/data/array/h5netcdfarray.py index 01dac019d9..c6691ce97a 100644 --- a/cf/data/array/h5netcdfarray.py +++ b/cf/data/array/h5netcdfarray.py @@ -24,6 +24,8 @@ class H5netcdfArray( """ + # REVIEW: h5 + def __dask_tokenize__(self): """Return a value fully representative of the object. @@ -67,6 +69,8 @@ def _get_array(self, index=None): The subspace. """ + # REVIEW: getitem + if index is None: index = self.index() diff --git a/cf/data/array/netcdf4array.py b/cf/data/array/netcdf4array.py index 2d7fa45d99..b800363f38 100644 --- a/cf/data/array/netcdf4array.py +++ b/cf/data/array/netcdf4array.py @@ -22,6 +22,8 @@ class NetCDF4Array( """ + # REVIEW: h5 + def __dask_tokenize__(self): """Return a value fully representative of the object. @@ -65,6 +67,8 @@ def _get_array(self, index=None): The subspace. """ + # REVIEW: getitem + if index is None: index = self.index() diff --git a/cf/data/array/umarray.py b/cf/data/array/umarray.py index 96ce6ad763..74209f9195 100644 --- a/cf/data/array/umarray.py +++ b/cf/data/array/umarray.py @@ -56,7 +56,7 @@ def __init__( byte_ordering: `str`, optional ``'little_endian'`` or ``'big_endian'`` - {{attributes: `dict` or `None`, optional}} + {{init attributes: `dict` or `None`, optional}} During the first `__getitem__` call, any of the ``_FillValue``, ``add_offset``, ``scale_factor``, @@ -95,6 +95,8 @@ def __init__( *attributes* parameter instead. """ + # REVIEW: h5, getitem + super().__init__(source=source, copy=copy) if source is not None: @@ -189,6 +191,8 @@ def _get_array(self, index=None): The subspace. """ + # REVIEW: getitem + # Note: No need to lock the UM file - concurrent reads are OK. 
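# The index()/_get_array() pattern seen here defers the file read until the
# values are actually needed. A toy, numpy-backed illustration of the idea
# follows; it is not the IndexMixin implementation, and composition of
# successive indices is deliberately omitted.
import numpy as np


class LazySlice:
    """Remember the requested index; only read when values are needed."""

    def __init__(self, source, index=Ellipsis):
        self._source = source  # here a numpy array; in cf, data in a file
        self._index = index

    def __getitem__(self, index):
        # Cheap: no data are read at indexing time
        return LazySlice(self._source, index)

    def compute(self):
        # Only now is the subspace actually taken
        return np.asanyarray(self._source)[self._index]


lazy = LazySlice(np.arange(20).reshape(4, 5))[1:3]
print(lazy.compute())  # rows 1 and 2 of the source array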
if index is None: @@ -293,6 +297,8 @@ def _set_FillValue(self, int_hdr, real_hdr, attributes): `None """ + # REVIEW: getitem + if "FillValue" in attributes: return @@ -331,6 +337,8 @@ def _set_units(self, int_hdr, attributes): `None` """ + # REVIEW: getitem + if "units" in attributes: return @@ -389,6 +397,8 @@ def _set_unpack(self, int_hdr, real_hdr, attributes): `None """ + # REVIEW: getitem + if "scale_factor" not in attributes: # Treat BMKS as a scale_factor if it is neither 0 nor 1 scale_factor = real_hdr.item(18) diff --git a/cf/data/fragment/fullfragmentarray.py b/cf/data/fragment/fullfragmentarray.py index 9197f8ef0b..fa0cc22adb 100644 --- a/cf/data/fragment/fullfragmentarray.py +++ b/cf/data/fragment/fullfragmentarray.py @@ -16,8 +16,6 @@ def __init__( shape=None, aggregated_units=False, aggregated_calendar=False, - # units=False, - # calendar=False, attributes=None, source=None, copy=True, @@ -42,19 +40,7 @@ def __init__( fragment variable in that the latter may have fewer size 1 dimensions. - units: `str` or `None`, optional - The units of the fragment data. Set to `None` to - indicate that there are no units. If unset then the - units will be set to `None` during the first - `__getitem__` call. - - calendar: `str` or `None`, optional - The calendar of the fragment data. Set to `None` to - indicate the CF default calendar, if applicable. If - unset then the calendar will be set to `None` during - the first `__getitem__` call. - - {{attributes: `dict` or `None`, optional}} + {{init attributes: `dict` or `None`, optional}} .. versionadded:: NEXTRELEASE @@ -66,13 +52,19 @@ def __init__( {{init copy: `bool`, optional}} + units: `str` or `None`, optional + Deprecated at version NEXTRELEASE. Use the + *attributes* parameter instead. + + calendar: `str` or `None`, optional + Deprecated at version NEXTRELEASE. Use the + *attributes* parameter instead. + """ super().__init__( fill_value=fill_value, dtype=dtype, shape=shape, - # units=units, - # calendar=calendar, attributes=attributes, source=source, copy=False, diff --git a/cf/data/fragment/h5netcdffragmentarray.py b/cf/data/fragment/h5netcdffragmentarray.py index 8ea98761f4..0b70976c7f 100644 --- a/cf/data/fragment/h5netcdffragmentarray.py +++ b/cf/data/fragment/h5netcdffragmentarray.py @@ -47,14 +47,12 @@ def __init__( fragment variable in that the latter may have fewer size 1 dimensions. - {{attributes: `dict` or `None`, optional}} + {{init attributes: `dict` or `None`, optional}} If *attributes* is `None`, the default, then the attributes will be set from the netCDF variable during the first `__getitem__` call. - .. versionadded:: NEXTRELEASE - {{aggregated_units: `str` or `None`, optional}} {{aggregated_calendar: `str` or `None`, optional}} diff --git a/cf/data/fragment/netcdf4fragmentarray.py b/cf/data/fragment/netcdf4fragmentarray.py index 5001df2a61..54f3e4240d 100644 --- a/cf/data/fragment/netcdf4fragmentarray.py +++ b/cf/data/fragment/netcdf4fragmentarray.py @@ -17,8 +17,6 @@ def __init__( shape=None, aggregated_units=False, aggregated_calendar=False, - # units=False, - # calendar=None, attributes=None, storage_options=None, source=None, @@ -60,13 +58,11 @@ def __init__( unset then the calendar will be set during the first `__getitem__` call. - {{attributes: `dict` or `None`, optional}} + {{init attributes: `dict` or `None`, optional}} If *attributes* is `None`, the default, then the - attributes will be set from the file during the first - `__getitem__` call. - - .. 
versionadded:: NEXTRELEASE + attributes will be set from the netCDF variable during + the first `__getitem__` call. {{aggregated_units: `str` or `None`, optional}} diff --git a/cf/data/fragment/netcdffragmentarray.py b/cf/data/fragment/netcdffragmentarray.py index c38710cfdb..324207db7e 100644 --- a/cf/data/fragment/netcdffragmentarray.py +++ b/cf/data/fragment/netcdffragmentarray.py @@ -16,12 +16,11 @@ class NetCDFFragmentArray( ): """A netCDF fragment array. - Access will either with `netCDF4` (for local and OPenDAP files) or - `h5netcdf` (for S3 files). + Access will be with either `netCDF4` (for local and OPenDAP files) + or `h5netcdf` (for S3 files). .. versionadded:: 3.15.0 - """ def __init__( @@ -122,16 +121,6 @@ def __init__( except AttributeError: attributes = None - # try: - # units = source._get_component("units", False) - # except AttributeError: - # units = False - # - # try: - # calendar = source._get_component("calendar", False) - # except AttributeError: - # calendar = False - try: aggregated_units = source._get_component( "aggregated_units", False @@ -174,8 +163,6 @@ def __init__( self._set_component("shape", shape, copy=False) self._set_component("dtype", dtype, copy=False) - # self._set_component("units", units, copy=False) - # self._set_component("calendar", calendar, copy=False) self._set_component("attributes", attributes, copy=False) self._set_component("mask", True, copy=False) @@ -201,8 +188,6 @@ def __getitem__(self, indices): "shape": self.shape, "aggregated_units": self.get_aggregated_units(None), "aggregated_calendar": self.get_aggregated_calendar(None), - # "units": self.get_units(None), - # "calendar": self.get_units(None), "attributes": self.get_attributes(None), "copy": False, } diff --git a/cf/test/test_regrid_mesh.py b/cf/test/test_regrid_mesh.py index da070395fc..3095640135 100644 --- a/cf/test/test_regrid_mesh.py +++ b/cf/test/test_regrid_mesh.py @@ -161,8 +161,6 @@ def test_Field_regrid_mesh_to_mesh(self): use_dst_mask=use_dst_mask, ) - print("\ny=", y) - # print ('a=', a) self.assertTrue(np.allclose(y, a, atol=atol, rtol=rtol)) if isinstance(a, np.ma.MaskedArray): From d4ec9745ea839cdb120ef520726db8e5ddb2ff75 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 8 Apr 2024 09:51:31 +0100 Subject: [PATCH 085/134] dev --- cf/read_write/netcdf/netcdfwrite.py | 33 ++++++--- cf/read_write/read.py | 109 +++++++++++++++------------- cf/read_write/um/umread.py | 1 + cf/read_write/write.py | 1 + cf/regrid/regrid.py | 1 + cf/regrid/regridoperator.py | 1 + docs/source/conf.py | 1 + 7 files changed, 86 insertions(+), 61 deletions(-) diff --git a/cf/read_write/netcdf/netcdfwrite.py b/cf/read_write/netcdf/netcdfwrite.py index e9f1f7b61a..460b758089 100644 --- a/cf/read_write/netcdf/netcdfwrite.py +++ b/cf/read_write/netcdf/netcdfwrite.py @@ -104,15 +104,17 @@ def _write_as_cfa(self, cfvar, construct_type, domain_axes): raise ValueError( f"Can't write {cfvar!r} as a CFA-netCDF " - "aggregation variable. If the variable was read " - "from disk then setting chunks=None as an " - "argument to cf.read will likely solve the " - "problem. " - "Alternatively, you could consider setting " - "cfa={'strict': False} as an argument to " - "cf.write, but note the this will create a copy " - "of the data for this variable in the output " - "dataset." + "aggregation variable. Possible reasons for this " + "include 1) there is more than one Dask chunk " + "per fragment file, and 2) data values have been " + "changed relative to those in the fragment files. 
" + "\n\n" + "In case 1), setting chunks=None as an " + "argument to cf.read may solve the problem. " + "You could consider setting cfa={'strict': False} " + "as an argument to cf.write, but note the this " + "will create a copy of the data for this " + "variable in the output dataset." ) return cfa_get_write @@ -584,6 +586,8 @@ def _create_cfa_data(self, ncvar, ncdimensions, data, cfvar): }, ) + # REVIEW: h5: Function _convert_to_builtin_type was a CFA-0.4 thing + def _check_valid(self, array, cfvar=None, attributes=None): """Checks for array values outside of the valid range. @@ -752,6 +756,8 @@ def _cfa_write_non_standard_terms( # more than one unique value then the fragment's value is # missing data. # + # REVIEW: getitem: asanyarray parameter + # # '_cfa_unique' has its own call to 'cf_asanyarray', so # we can set 'asanyarray=False'. dx = data.to_dask_array(asanyarray=False) @@ -812,6 +818,7 @@ def _cfa_unique(cls, a): data if there is not a unique value. """ + # REVIEW: getitem: make sure that 'a' is usable data a = cf_asanyarray(a) out_shape = (1,) * a.ndim @@ -883,13 +890,15 @@ def _cfa_aggregation_instructions(self, data, cfvar): if file_details: raise ValueError( "Can't write CFA-netCDF aggregation variable from " - f"{cfvar!r}: Dask storage chunk defined by indices " - f"{indices} spans two or more fragment files" + f"{cfvar!r}: Dask chunk defined by indices " + f"{indices} spans two or more fragment files." + "A possible fix for this is to set chunks=None as an " + "argument to cf.read" ) raise ValueError( "Can't write CFA-netCDF aggregation variable from " - f"{cfvar!r}: Dask storage chunk defined by indices " + f"{cfvar!r}: Dask chunk defined by indices " f"{indices} spans zero files" ) diff --git a/cf/read_write/read.py b/cf/read_write/read.py index 04af080a47..670ab19219 100644 --- a/cf/read_write/read.py +++ b/cf/read_write/read.py @@ -33,7 +33,6 @@ logger = logging.getLogger(__name__) - @_manage_log_level_via_verbosity def read( files, @@ -58,14 +57,18 @@ def read( select_options=None, follow_symlinks=False, mask=True, + # REVIEW: h5: new unpack parameter to control auto-unpacking (previously always True) unpack=True, warn_valid=False, chunks="auto", domain=False, cfa=None, + # REVIEW: h5: new netcdf_engine parameter to control how to read files netcdf_engine=None, + # REVIEW: h5: new storage_options parameter to control access to S3 storage_options=None, - cache_metadata=True, + # REVIEW: h5: cache_metadata parameter to control whethe or not to get to caache selected data elements + cache_metadata=True, ): """Read field or domain constructs from files. @@ -432,12 +435,12 @@ def read( .. versionadded:: 3.4.0 unpack: `bool`, optional - If True (the default) then unpack arrays by convention + If True, the default, then unpack arrays by convention when the data is read from disk. - Unpacking is determined netCDF conventions for the - following attributes: ``add_offset``, ``scale_factor``, - and ``_Unsigned``. + Unpacking is determined by netCDF conventions for the + following variable attributes: ``add_offset``, + ``scale_factor``, and ``_Unsigned``. .. versionadded:: NEXTVERSION @@ -690,57 +693,62 @@ def read( the opening of netCDF fragment files that define the data of aggregated variables. For these, the first one of `netCDF4` and `h5netcdf` to - successfully open the file netCDF file is always - be used. - - .. note:: `h5netcdf` restricts the types of indices that - define subspaces of its data. See - https://docs.h5py.org for details. 
However, such - indices on a returned `Field` are possible if - they are followed by further subspaces that - imply acceptable indices. + successfully open the file is used. .. versionadded:: NEXTVERSION storage_options: `dict` or `None`, optional - Key/value pairs to be passed on to the creation of - `s3fs.S3FileSystem` file systems to control the opening of - files in S3 object stores. Ignored for files not in an S3 - object store, i.e. those whose names do not start with - ``s3:``. - - By default, or if `None`, then *storage_options* is taken - as ``{}``. - - If the ``'endpoint_url'`` key is not in *storage_options* - or is not in a dictionary defined by the ``'client_kwargs`` - key (which is always the case when *storage_options* is - `None`), then one will be automatically inserted for - accessing an S3 file. For example, for a file name of - ``'s3://store/data/file.nc'``, an ``'endpoint_url'`` key - with value ``'https://store'`` would be created. - - *Parameter example:* - For a file name of ``'s3://store/data/file.nc'``, the - following are equivalent: ``None``, ``{}``, and - ``{'endpoint_url': 'https://store'}``, - ``{'client_kwargs': {'endpoint_url': 'https://store'}}`` - - *Parameter example:* - ``{'key: 'scaleway-api-key...', 'secret': - 'scaleway-secretkey...', 'endpoint_url': - 'https://s3.fr-par.scw.cloud', 'client_kwargs': - {'region_name': 'fr-par'}}`` + Pass parameters to the backend file system driver, such as + username, password, server, port, etc. How the storage + options are interpreted depends on the location of the + file: + + **Local File System** + + Storage options are ignored for local files. + + **HTTP(S)** + + Storage options are ignored for files available across the + network via OPeNDAP. + + **S3-compatible services** + + The backend used is `s3fs`, and the storage options are + used to initialise an `s3fs.S3FileSystem` file system + object. By default, or if `None`, then *storage_options* + is taken as ``{}``. + + If the ``'endpoint_url'`` key is not in *storage_options*, + nor in a dictionary defined by the ``'client_kwargs'`` key + (both of which are the case when *storage_options* is + `None`), then one will be automatically inserted for + accessing an S3 file. For example, for a file name of + ``'s3://store/data/file.nc'``, an ``'endpoint_url'`` key + with value ``'https://store'`` would be created. To + disable this, set ``'endpoint_url'`` to `None`. + + *Parameter example:* + For a file name of ``'s3://store/data/file.nc'``, the + following are equivalent: ``None``, ``{}``, + ``{'endpoint_url': 'https://store'}``, and + ``{'client_kwargs': {'endpoint_url': 'https://store'}}`` + + *Parameter example:* + ``{'key: 'scaleway-api-key...', 'secret': + 'scaleway-secretkey...', 'endpoint_url': + 'https://s3.fr-par.scw.cloud', 'client_kwargs': + {'region_name': 'fr-par'}}`` .. versionadded:: NEXTVERSION cache_metadata: `bool`, optional - If True, the default, then data for metadata constructs - will have their first and last array elements retrieved - from the file and cached in memory for fast future - access. In addition, the second and penultimate array - elements will be cached from 2-d coordinate bounds data - that has two bounds per cell. + If True, the default, then cache the first and last array + elements of metadata constructs for fast future access. In + addition, the second and penultimate array elements will + be cached from coordinate bounds when there are two bounds + per cell. 
For remote data, setting *cache_metadata* to + False may speed up the parsing of the file. .. versionadded:: NEXTVERSION @@ -1180,6 +1188,9 @@ def _read_a_file( mask: `bool`, optional See `cf.read` for details. + unpack: `bool`, optional + See `cf.read` for details. + verbose: `int` or `str` or `None`, optional See `cf.read` for details. diff --git a/cf/read_write/um/umread.py b/cf/read_write/um/umread.py index 849529e39d..0a40d45812 100644 --- a/cf/read_write/um/umread.py +++ b/cf/read_write/um/umread.py @@ -1957,6 +1957,7 @@ def create_data(self): recs = self.recs um_Units = self.um_Units + # REVIEW: h5: replace units/calendar API with attributes attributes = { "units": getattr(um_Units, "units", None), "calendar": getattr(um_Units, "calendar", None), diff --git a/cf/read_write/write.py b/cf/read_write/write.py index 23a8dda3cd..44cab26398 100644 --- a/cf/read_write/write.py +++ b/cf/read_write/write.py @@ -12,6 +12,7 @@ netcdf = NetCDFWrite(implementation()) +# REVIEW: h5: docstring improvements @_manage_log_level_via_verbosity def write( diff --git a/cf/regrid/regrid.py b/cf/regrid/regrid.py index e1dd0f71a6..4e8ff58bfa 100644 --- a/cf/regrid/regrid.py +++ b/cf/regrid/regrid.py @@ -2464,6 +2464,7 @@ def create_esmpy_weights( from netCDF4 import Dataset from .. import __version__ + # REVIEW: h5: new name and location of file lock from ..data.array.locks import netcdf_lock if ( diff --git a/cf/regrid/regridoperator.py b/cf/regrid/regridoperator.py index a20754203d..c4104a49ab 100644 --- a/cf/regrid/regridoperator.py +++ b/cf/regrid/regridoperator.py @@ -727,6 +727,7 @@ def tosparse(self): # Read the weights from the weights file from netCDF4 import Dataset + # REVIEW: h5: new name and location of file lock from ..data.array.locks import netcdf_lock netcdf_lock.acquire() diff --git a/docs/source/conf.py b/docs/source/conf.py index a30fb4d0a3..5e39feaf2b 100755 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -156,6 +156,7 @@ def _get_date(): "cfplot": ("https://ajheaps.github.io/cf-plot", None), "dask": ("https://docs.dask.org/en/latest", None), "matplotlib": ("https://matplotlib.org/stable/", None), + # REVIEW: h5: new intersphinx mapping "h5netcdf": ("https://h5netcdf.org", None), } From 20dc35844707d1e02a5198a669b96c35e6e729d0 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 8 Apr 2024 19:12:21 +0100 Subject: [PATCH 086/134] dev --- cf/aggregate.py | 2 +- cf/cellmethod.py | 1 + cf/cfimplementation.py | 3 + cf/constants.py | 1 + cf/data/array/__init__.py | 1 + cf/data/array/cfah5netcdfarray.py | 1 + cf/data/array/cfanetcdf4array.py | 1 + cf/data/array/fullarray.py | 5 +- cf/data/array/h5netcdfarray.py | 6 +- cf/data/array/mixin/__init__.py | 3 + cf/data/array/mixin/activestoragemixin.py | 1 + cf/data/array/mixin/arraymixin.py | 1 + cf/data/array/mixin/cfamixin.py | 33 +++--- cf/data/array/mixin/indexmixin.py | 90 ++++++++++------- cf/data/array/netcdf4array.py | 6 +- cf/data/array/netcdfarray.py | 1 + cf/data/array/umarray.py | 15 ++- cf/data/collapse/__init__.py | 2 + cf/data/collapse/collapse.py | 1 + cf/data/collapse/collapse_active.py | 1 + cf/data/collapse/dask_collapse.py | 1 + cf/data/creation.py | 5 + cf/data/dask_regrid.py | 2 + cf/data/dask_utils.py | 13 +++ cf/data/data.py | 101 +++++++++++++++---- cf/data/fragment/__init__.py | 1 + cf/data/fragment/fullfragmentarray.py | 1 + cf/data/fragment/h5netcdffragmentarray.py | 1 + cf/data/fragment/mixin/fragmentarraymixin.py | 2 + cf/data/fragment/netcdf4fragmentarray.py | 3 +- cf/data/fragment/netcdffragmentarray.py | 2 
+ cf/data/fragment/umfragmentarray.py | 1 + cf/data/utils.py | 3 + cf/domain.py | 1 + cf/field.py | 4 + cf/functions.py | 67 +++--------- cf/mixin/fielddomain.py | 1 + cf/mixin/propertiesdata.py | 3 +- cf/read_write/netcdf/netcdfread.py | 21 ++-- cf/read_write/netcdf/netcdfwrite.py | 37 +++---- cf/read_write/read.py | 3 +- cf/read_write/write.py | 1 + cf/regrid/regrid.py | 1 + cf/test/test_Data.py | 11 +- cf/test/test_Field.py | 1 + cf/test/test_FullArray.py | 1 + cf/test/test_NetCDF4Array.py | 8 ++ cf/test/test_active_storage.py | 8 +- cf/test/test_functions.py | 1 + cf/test/test_read_write.py | 4 +- 50 files changed, 298 insertions(+), 186 deletions(-) diff --git a/cf/aggregate.py b/cf/aggregate.py index 300ded16c5..39f2c9bfa1 100644 --- a/cf/aggregate.py +++ b/cf/aggregate.py @@ -3207,7 +3207,7 @@ def aggregate( # # 0.0012 , 0.019 , 0.55 , 2.1 # - # compared with new timings of + # compared with current method timings of # # 0.00035, 0.0012, 0.013, 0.064 # ------------------------------------------------ diff --git a/cf/cellmethod.py b/cf/cellmethod.py index a1a2f3be15..1b257da76b 100644 --- a/cf/cellmethod.py +++ b/cf/cellmethod.py @@ -53,6 +53,7 @@ class CellMethod(cfdm.CellMethod): """ + # REVIEW: h5 def __new__(cls, *args, **kwargs): """This must be overridden in subclasses. diff --git a/cf/cfimplementation.py b/cf/cfimplementation.py index acbb11ad97..868c39d71f 100644 --- a/cf/cfimplementation.py +++ b/cf/cfimplementation.py @@ -43,6 +43,7 @@ from .functions import CF +# REVIEW: h5 class CFImplementation(cfdm.CFDMImplementation): """A container for the CF data model implementation for `cf`. @@ -114,6 +115,7 @@ def set_construct(self, parent, construct, axes=None, copy=True, **kwargs): parent, construct, axes=axes, copy=copy, **kwargs ) + # REVIEW: h5 def initialise_CFANetCDF4Array(self, **kwargs): """Return a `CFANetCDF4Array` instance. @@ -130,6 +132,7 @@ def initialise_CFANetCDF4Array(self, **kwargs): cls = self.get_class("CFANetCDF4Array") return cls(**kwargs) + # REVIEW: h5 def initialise_CFAH5netcdfArray(self, **kwargs): """Return a `CFAH5netcdfArray` instance. diff --git a/cf/constants.py b/cf/constants.py index 0b8e12ecfd..83554d5596 100644 --- a/cf/constants.py +++ b/cf/constants.py @@ -63,6 +63,7 @@ "LOG_LEVEL": logging.getLevelName(logging.getLogger().level), "BOUNDS_COMBINATION_MODE": "AND", "CHUNKSIZE": parse_bytes(_CHUNKSIZE), + # REVIEW: active "active_storage": False, "active_storage_url": None, } diff --git a/cf/data/array/__init__.py b/cf/data/array/__init__.py index cd2c53766b..20924e6433 100644 --- a/cf/data/array/__init__.py +++ b/cf/data/array/__init__.py @@ -1,3 +1,4 @@ +# REVIEW: h5 from .boundsfromnodesarray import BoundsFromNodesArray from .cellconnectivityarray import CellConnectivityArray from .cfah5netcdfarray import CFAH5netcdfArray diff --git a/cf/data/array/cfah5netcdfarray.py b/cf/data/array/cfah5netcdfarray.py index 47c58bff06..9a3244f550 100644 --- a/cf/data/array/cfah5netcdfarray.py +++ b/cf/data/array/cfah5netcdfarray.py @@ -2,6 +2,7 @@ from .mixin import CFAMixin +# REVIEW: h5 class CFAH5netcdfArray(CFAMixin, H5netcdfArray): """A CFA-netCDF array accessed with `h5netcdf` diff --git a/cf/data/array/cfanetcdf4array.py b/cf/data/array/cfanetcdf4array.py index b3b6b69d7a..2b4edb91c8 100644 --- a/cf/data/array/cfanetcdf4array.py +++ b/cf/data/array/cfanetcdf4array.py @@ -2,6 +2,7 @@ from .netcdf4array import NetCDF4Array +# REVIEW: h5 class CFANetCDF4Array(CFAMixin, NetCDF4Array): """A CFA-netCDF array accessed with `netCDF4`. 
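[Editor's note] A minimal sketch of how the new `cf.read` keywords documented earlier in this patch (`storage_options`, `cache_metadata`) might be combined for a hypothetical S3 dataset. The file name, credentials and endpoint below are placeholders taken from the docstring examples, not real values:

    import cf

    # Hypothetical S3 file; the storage_options keys mirror the examples
    # given in the cf.read docstring and are passed to s3fs.S3FileSystem.
    fields = cf.read(
        "s3://store/data/file.nc",
        storage_options={
            "key": "my-api-key",
            "secret": "my-secret-key",
            "endpoint_url": "https://store",
        },
        # Skip caching of first/last data elements, which the docstring
        # notes may speed up parsing of remote files.
        cache_metadata=False,
    )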
diff --git a/cf/data/array/fullarray.py b/cf/data/array/fullarray.py index f900b221b7..d73a5aaf04 100644 --- a/cf/data/array/fullarray.py +++ b/cf/data/array/fullarray.py @@ -16,6 +16,7 @@ class FullArray(IndexMixin, Array): """ + # REVIEW: h5: Replace "units/calendar" API with "attributes" def __init__( self, fill_value=None, @@ -120,6 +121,7 @@ def __str__(self): return f"Filled with {fill_value!r}" + # REVIEW: getitem def _get_array(self, index=None): """Returns the full array. @@ -158,6 +160,7 @@ def _get_array(self, index=None): return array + # REVIEW: getitem @property def array(self): """Return an independent numpy array containing the data. @@ -169,8 +172,6 @@ def array(self): `numpy.ndarray` An independent numpy array of the data. """ - # REVIEW: getitem - return np.asanyarray(self) @property diff --git a/cf/data/array/h5netcdfarray.py b/cf/data/array/h5netcdfarray.py index c6691ce97a..a52f65f814 100644 --- a/cf/data/array/h5netcdfarray.py +++ b/cf/data/array/h5netcdfarray.py @@ -5,6 +5,7 @@ from .mixin import ActiveStorageMixin, ArrayMixin, FileArrayMixin, IndexMixin +# REVIEW: h5 class H5netcdfArray( ActiveStorageMixin, IndexMixin, @@ -24,8 +25,6 @@ class H5netcdfArray( """ - # REVIEW: h5 - def __dask_tokenize__(self): """Return a value fully representative of the object. @@ -50,6 +49,7 @@ def _lock(self): """ return netcdf_lock + # REVIEW: getitem def _get_array(self, index=None): """Returns a subspace of the dataset variable. @@ -69,8 +69,6 @@ def _get_array(self, index=None): The subspace. """ - # REVIEW: getitem - if index is None: index = self.index() diff --git a/cf/data/array/mixin/__init__.py b/cf/data/array/mixin/__init__.py index 8e5dd7690d..5b7fc33cf9 100644 --- a/cf/data/array/mixin/__init__.py +++ b/cf/data/array/mixin/__init__.py @@ -1,6 +1,9 @@ +# REVIEW: active from .activestoragemixin import ActiveStorageMixin from .arraymixin import ArrayMixin from .cfamixin import CFAMixin from .compressedarraymixin import CompressedArrayMixin from .filearraymixin import FileArrayMixin + +# REVIEW: getitem from .indexmixin import IndexMixin diff --git a/cf/data/array/mixin/activestoragemixin.py b/cf/data/array/mixin/activestoragemixin.py index 4493aaf621..abad445b76 100644 --- a/cf/data/array/mixin/activestoragemixin.py +++ b/cf/data/array/mixin/activestoragemixin.py @@ -1,3 +1,4 @@ +# REVIEW: active try: from activestorage import Active except ModuleNotFoundError: diff --git a/cf/data/array/mixin/arraymixin.py b/cf/data/array/mixin/arraymixin.py index d5a5aaa861..f17687bdcf 100644 --- a/cf/data/array/mixin/arraymixin.py +++ b/cf/data/array/mixin/arraymixin.py @@ -18,6 +18,7 @@ def __array_function__(self, func, types, args, kwargs): """ return NotImplemented + # REVIEW: active @property def _meta(self): """Normalize the array to an appropriate Dask meta object. diff --git a/cf/data/array/mixin/cfamixin.py b/cf/data/array/mixin/cfamixin.py index 525404389a..c3a54ef8fa 100644 --- a/cf/data/array/mixin/cfamixin.py +++ b/cf/data/array/mixin/cfamixin.py @@ -36,6 +36,7 @@ def __new__(cls, *args, **kwargs): } return instance + # REVIEW: h5: Replace "units/calendar" API with "attributes" def __init__( self, filename=None, @@ -43,8 +44,6 @@ def __init__( dtype=None, mask=True, unpack=True, - # units=False, - # calendar=False, instructions=None, substitutions=None, term=None, @@ -83,14 +82,6 @@ def __init__( .. versionadded:: NEXTVERSION - units: `str` or `None`, optional - The units of the aggregated data. Set to `None` to - indicate that there are no units. 
- - calendar: `str` or `None`, optional - The calendar of the aggregated data. Set to `None` to - indicate the CF default calendar, if applicable. - instructions: `str`, optional The ``aggregated_data`` attribute value as found on the CFA variable. If set then this will be used to @@ -147,10 +138,26 @@ def __init__( .. versionadded:: NEXTVERSION + {{init attributes: `dict` or `None`, optional}} + + If *attributes* is `None`, the default, then the + attributes will be set from the netCDF variable during + the first `__getitem__` call. + + .. versionaddedd:: NEXTVERSION + {{init source: optional}} {{init copy: `bool`, optional}} + units: `str` or `None`, optional + Deprecated at version NEXTRELEASE. Use the + *attributes* parameter instead. + + calendar: `str` or `None`, optional + Deprecated at version NEXTRELEASE. Use the + *attributes* parameter instead. + """ if source is not None: super().__init__(source=source, copy=copy) @@ -190,8 +197,6 @@ def __init__( shape=shape, dtype=dtype, mask=mask, - # units=units, - # calendar=calendar, attributes=attributes, copy=copy, ) @@ -201,8 +206,6 @@ def __init__( address=address, dtype=dtype, mask=mask, - # units=units, - # calendar=calendar, attributes=attributes, copy=copy, ) @@ -222,6 +225,7 @@ def __init__( "substitutions", substitutions.copy(), copy=False ) + # REVIEW: h5 def _parse_cfa(self, x, term, substitutions): """Parse the CFA aggregation instructions. @@ -462,6 +466,7 @@ def get_fragment_shape(self): """ return self._get_component("fragment_shape") + # REVIEW: h5 def get_storage_options(self): """Return `s3fs.S3FileSystem` options for accessing S3 fragment files. diff --git a/cf/data/array/mixin/indexmixin.py b/cf/data/array/mixin/indexmixin.py index 3bbc74481c..5799782b00 100644 --- a/cf/data/array/mixin/indexmixin.py +++ b/cf/data/array/mixin/indexmixin.py @@ -7,6 +7,7 @@ from ....functions import indices_shape, parse_indices +# REVIEW: getitem class IndexMixin: """Mixin class for lazy indexing of a data array. @@ -38,7 +39,7 @@ class IndexMixin: """ def __array__(self, *dtype): - """Convert the ``{{class}}` into a `numpy` array. + """Convert the `{{class}}` into a `numpy` array. .. versionadded:: NEXTVERSION @@ -66,17 +67,17 @@ def __getitem__(self, index): x.__getitem__(indices) <==> x[indices] Subspaces created by indexing are lazy and are not applied - until the {{class}} object is converted to a `numpy` array - (via `__array__`), by which time all lazily-defined subspaces - will have been converted to a single combined index which - defines only the actual elements that need to be retrieved - from the original data. - - The combined index is intended to be treated orthogonally, - meaning that the index for each dimension is to be applied - independently, regardless of how that index was defined. For - instance, the indices ``[[0, 1], [1, 3], 0]`` and ``[:2, 1::2, - 0]`` will give identical results. + until the `{{class}}` object is converted to a `numpy` array, + by which time all lazily-defined subspaces will have been + converted to a single combined index which defines only the + actual elements that need to be retrieved from the original + data. + + The combined index is orthogonal, meaning that the index for + each dimension is to be applied independently, regardless of + how that index was defined. For instance, the indices ``[[0, + 1], [1, 3], 0]`` and ``[:2, 1::2, 0]`` will give identical + results. 
For example, if the original data has shape ``(12, 145, 192)`` and consecutive subspaces of ``[::2, [1, 3, 4], 96:]`` and @@ -108,11 +109,13 @@ def __getitem__(self, index): i = 0 for ind0, original_size in zip(index0, original_shape): - # If a previous call to __getitem__ resulted in a - # dimension being subspaced to and size 1 *and* removed - # (i.e. 'ind0' is integer-valued) then 'index1' will have - # fewer elements than 'index0' if isinstance(ind0, Integral): + # The previous call to __getitem__ resulted in a + # dimension being removed (i.e. 'ind0' is + # integer-valued). Therefore 'index1' must have have + # fewer elements than 'index0', so we need to "carry + # forward" the integer-valued index so that it is + # available at evaluation time. new_indices.append(ind0) continue @@ -126,9 +129,9 @@ def __getitem__(self, index): new_indices.append(ind0) continue - # Still here? Then we have to work out the subspace of the - # full array implied by applying 'ind0' - # followed by 'ind1'. + # Still here? Then we have to work out the index of the + # full array that is equivalent to applying + # 'ind0' followed by 'ind1'. if is_dask_collection(ind1): # Note: This will never occur when this __getitem__ is # being called from within a Dask graph, because @@ -136,7 +139,7 @@ def __getitem__(self, index): # computed as part of the whole graph execution; # i.e. we don't have to worry about a # compute-within-a-compute situation. (If this - # were not the case then we would get round it + # were not the case then we could get round it # by wrapping the compute inside a `with # dask.config.set({"scheduler": # "synchronous"}):`.) @@ -171,13 +174,16 @@ def __getitem__(self, index): # ind1: int, or array of int/bool new_index = np.arange(*ind0.indices(original_size))[ind1] else: - # ind0: array of int (if we made it here, then it - # can't be anything else, because - # we've dealt with ind0 being an - # int, and a previous ind1 that - # was an array of bool will have - # resulted in this ind0 being an - # array of int) + # ind0: array of int. If we made it here then it can't + # be anything else. This is + # because we've dealt with ind0 + # being a slice or an int, the + # very first ind0 is always + # slice(None), and a previous ind1 + # that was an array of bool will + # have resulted in this ind0 being + # an array of int. + # # ind1: anything new_index = np.asanyarray(ind0)[ind1] @@ -185,7 +191,7 @@ def __getitem__(self, index): new._custom["index"] = tuple(new_indices) - # Find the shape defined by the new indices + # Find the shape defined by the new index new_shape = indices_shape(new_indices, original_shape, keepdims=False) new._set_component("shape", tuple(new_shape), copy=False) @@ -251,11 +257,19 @@ def index(self, conform=True): :Parameters: conform: `bool`, optional - If True, the default, then 1) convert a decreasing - size 1 slice to an increasing one, and 2) where - possible, a convert sequence of integers to a - slice. If False then these transformations are not - done. + If True, the default, then + + * Convert a decreasing size 1 slice to an increasing + one. + + * Convert, where possible, a sequence of integers to a + slice. + + These transformations are to allow subspacing on data + objects that have restricted indexing functionality, + such as `h5py.Variable` objects. + + If False then these transformations are not done. 
:Returns: @@ -279,6 +293,8 @@ def index(self, conform=True): """ ind = self._custom.get("index") if ind is None: + # No indices have been applied yet, so define indices that + # are equivalent to Ellipsis, and set the original shape. ind = (slice(None),) * self.ndim self._custom["index"] = ind self._custom["original_shape"] = self.shape @@ -291,7 +307,7 @@ def index(self, conform=True): # # 1) Converting decreasing size 1 slices to increasing ones. # - # 2) Where possible, converting sequences of integers to + # 2) Converting, where possible, sequences of integers to # slices. ind = list(ind) for n, (i, size) in enumerate(zip(ind[:], self.original_shape)): @@ -310,8 +326,8 @@ def index(self, conform=True): start = i.item() ind[n] = slice(start, start + 1) else: - # Convert a sequence of two or more integers into - # a slice, if possible. + # Convert a sequence of two or more evenly spaced + # integers into a slice. step = np.unique(np.diff(i)) if step.size == 1: start, stop = i[[0, -1]] @@ -331,7 +347,7 @@ def original_shape(self): """The original shape of the data, before any subspacing. The `shape` is defined by the result of subspacing the data in - its original shape with the indices defined by `index`. + its original shape with the indices given by `index`. .. versionadded:: NEXTVERSION diff --git a/cf/data/array/netcdf4array.py b/cf/data/array/netcdf4array.py index b800363f38..4c2ff57417 100644 --- a/cf/data/array/netcdf4array.py +++ b/cf/data/array/netcdf4array.py @@ -5,6 +5,7 @@ from .mixin import ActiveStorageMixin, ArrayMixin, FileArrayMixin, IndexMixin +# REVIEW: h5 class NetCDF4Array( ActiveStorageMixin, IndexMixin, @@ -22,8 +23,6 @@ class NetCDF4Array( """ - # REVIEW: h5 - def __dask_tokenize__(self): """Return a value fully representative of the object. @@ -48,6 +47,7 @@ def _lock(self): """ return netcdf_lock + # REVIEW: getitem def _get_array(self, index=None): """Returns a subspace of the dataset variable. @@ -67,8 +67,6 @@ def _get_array(self, index=None): The subspace. """ - # REVIEW: getitem - if index is None: index = self.index() diff --git a/cf/data/array/netcdfarray.py b/cf/data/array/netcdfarray.py index 6cfef4f939..e2b1d850e7 100644 --- a/cf/data/array/netcdfarray.py +++ b/cf/data/array/netcdfarray.py @@ -1,3 +1,4 @@ +# REVIEW: h5 class NetCDFArray: """A netCDF array accessed with `netCDF4`. diff --git a/cf/data/array/umarray.py b/cf/data/array/umarray.py index 74209f9195..07723fd8b4 100644 --- a/cf/data/array/umarray.py +++ b/cf/data/array/umarray.py @@ -7,6 +7,7 @@ from .mixin import FileArrayMixin, IndexMixin +# REVIEW: h5: Replace "units/calendar" API with "attributes" class UMArray( IndexMixin, FileArrayMixin, cfdm.data.mixin.FileArrayMixin, Array ): @@ -95,8 +96,6 @@ def __init__( *attributes* parameter instead. """ - # REVIEW: h5, getitem - super().__init__(source=source, copy=copy) if source is not None: @@ -172,6 +171,7 @@ def __init__( # By default, close the UM file after data array access self._set_component("close", True, copy=False) + # REVIEW: getitem def _get_array(self, index=None): """Returns a subspace of the dataset variable. @@ -191,8 +191,6 @@ def _get_array(self, index=None): The subspace. """ - # REVIEW: getitem - # Note: No need to lock the UM file - concurrent reads are OK. 
if index is None: @@ -221,7 +219,9 @@ def _get_array(self, index=None): mask=True, unpack=True, always_masked_array=False, + orthogonal_indexing=True, attributes=attributes, + copy=False, ) array = array[index] @@ -312,6 +312,7 @@ def _set_FillValue(self, int_hdr, real_hdr, attributes): attributes["_FillValue"] = _FillValue + # REVIEW: getitem def _set_units(self, int_hdr, attributes): """Set the ``units`` attribute. @@ -337,8 +338,6 @@ def _set_units(self, int_hdr, attributes): `None` """ - # REVIEW: getitem - if "units" in attributes: return @@ -374,6 +373,8 @@ def _set_units(self, int_hdr, attributes): attributes["units"] = units + # REVIEW: h5 + # REVIEW: getitem def _set_unpack(self, int_hdr, real_hdr, attributes): """Set the ``add_offset`` and ``scale_factor`` attributes. @@ -397,8 +398,6 @@ def _set_unpack(self, int_hdr, real_hdr, attributes): `None """ - # REVIEW: getitem - if "scale_factor" not in attributes: # Treat BMKS as a scale_factor if it is neither 0 nor 1 scale_factor = real_hdr.item(18) diff --git a/cf/data/collapse/__init__.py b/cf/data/collapse/__init__.py index 30bef911c6..a2842da447 100644 --- a/cf/data/collapse/__init__.py +++ b/cf/data/collapse/__init__.py @@ -1,2 +1,4 @@ from .collapse import Collapse + +# REVIEW: active from .collapse_active import actify, active_reduction_methods, active_storage diff --git a/cf/data/collapse/collapse.py b/cf/data/collapse/collapse.py index 45a53a3bb7..5eef6efa35 100644 --- a/cf/data/collapse/collapse.py +++ b/cf/data/collapse/collapse.py @@ -9,6 +9,7 @@ from .collapse_utils import check_input_dtype, double_precision_dtype +# REVIEW: active class Collapse(metaclass=DocstringRewriteMeta): """Container for functions that collapse dask arrays. diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index b05604484f..3e69c64464 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -1,3 +1,4 @@ +# REVIEW: active import logging from functools import wraps diff --git a/cf/data/collapse/dask_collapse.py b/cf/data/collapse/dask_collapse.py index c18a8e1118..9544f3bedd 100644 --- a/cf/data/collapse/dask_collapse.py +++ b/cf/data/collapse/dask_collapse.py @@ -1,3 +1,4 @@ +# REVIEW: active """Reduction functions intended to be passed to be dask. Most of these functions are expected to be passed to diff --git a/cf/data/creation.py b/cf/data/creation.py index a8b90811a7..71bd000d2a 100644 --- a/cf/data/creation.py +++ b/cf/data/creation.py @@ -59,6 +59,7 @@ def to_dask(array, chunks, **from_array_options): if is_dask_collection(array): return array + # REVIEW: getitem if hasattr(array, "to_dask_array"): try: return array.to_dask_array(chunks=chunks) @@ -81,6 +82,10 @@ def to_dask(array, chunks, **from_array_options): array = np.asanyarray(array) kwargs = from_array_options + # REVIEW: active: + # REVIEW: getitem: The file lock has been push onto an `Array` + # object (in its `_get_array` method), rather + # than being set on the Dask array itself. 
kwargs.setdefault("meta", getattr(array, "_meta", None)) try: diff --git a/cf/data/dask_regrid.py b/cf/data/dask_regrid.py index 659418396d..a09826450d 100644 --- a/cf/data/dask_regrid.py +++ b/cf/data/dask_regrid.py @@ -1,6 +1,7 @@ """Regridding functions used within a dask graph.""" import numpy as np +# REVIEW: getitem from .dask_utils import cf_asanyarray @@ -175,6 +176,7 @@ def regrid( """ weights, dst_mask = weights_dst_mask + # REVIEW: getitem a = cf_asanyarray(a) if dst_mask is not None: dst_mask = cf_asanyarray(dst_mask) diff --git a/cf/data/dask_utils.py b/cf/data/dask_utils.py index 201ef1274b..bf9f73d444 100644 --- a/cf/data/dask_utils.py +++ b/cf/data/dask_utils.py @@ -103,6 +103,7 @@ def allclose(a_blocks, b_blocks, rtol=rtol, atol=atol): ) +# REVIEW: getitem def cf_contains(a, value): """Whether or not an array contains a value. @@ -131,6 +132,7 @@ def cf_contains(a, value): return np.array(value in a).reshape((1,) * a.ndim) +# REVIEW: getitem def cf_convolve1d(a, window=None, axis=-1, origin=0): """Calculate a 1-d convolution along the given axis. @@ -184,6 +186,7 @@ def cf_convolve1d(a, window=None, axis=-1, origin=0): return c +# REVIEW: getitem def cf_harden_mask(a): """Harden the mask of a masked `numpy` array. @@ -216,6 +219,7 @@ def cf_harden_mask(a): return a +# REVIEW: getitem def cf_percentile(a, q, axis, method, keepdims=False, mtol=1): """Compute percentiles of the data along the specified axes. @@ -350,6 +354,7 @@ def cf_percentile(a, q, axis, method, keepdims=False, mtol=1): return p +# REVIEW: getitem def cf_soften_mask(a): """Soften the mask of a masked `numpy` array. @@ -383,6 +388,7 @@ def cf_soften_mask(a): return a +# REVIEW: getitem def cf_where(array, condition, x, y, hardmask): """Set elements of *array* from *x* or *y* depending on *condition*. @@ -493,6 +499,7 @@ def _getattr(x, attr): _array_getattr = np.vectorize(_getattr, excluded="attr") +# REVIEW: getitem def cf_YMDhms(a, attr): """Return a date-time component from an array of date-time objects. @@ -534,6 +541,7 @@ def cf_YMDhms(a, attr): return _array_getattr(a, attr=attr) +# REVIEW: getitem def cf_rt2dt(a, units): """Convert an array of reference times to date-time objects. @@ -587,6 +595,7 @@ def _convert(x, units, reftime): )(a) +# REVIEW: getitem def cf_dt2rt(a, units): """Convert an array of date-time objects to reference times. @@ -622,6 +631,7 @@ def cf_dt2rt(a, units): return dt2rt(a, units_out=units, units_in=None) +# REVIEW: getitem def cf_units(a, from_units, to_units): """Convert array values to have different equivalent units. @@ -665,6 +675,7 @@ def cf_units(a, from_units, to_units): ) +# REVIEW: getitem def cf_is_masked(a): """Determine whether an array has masked values. @@ -688,6 +699,7 @@ def cf_is_masked(a): return np.array(out).reshape((1,) * a.ndim) +# REVIEW: getitem def cf_filled(a, fill_value=None): """Replace masked elements with a fill value. @@ -720,6 +732,7 @@ def cf_filled(a, fill_value=None): return np.ma.filled(a, fill_value=fill_value) +# REVIEW: getitem def cf_asanyarray(a): """Convert to a `numpy` array. 
diff --git a/cf/data/data.py b/cf/data/data.py index 8e0be09e24..38cf2c3124 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -43,6 +43,8 @@ from ..units import Units from .collapse import Collapse from .creation import generate_axis_identifiers, to_dask + +# REVIEW: getitem from .dask_utils import ( _da_ma_allclose, cf_asanyarray, @@ -101,6 +103,7 @@ _ARRAY = 1 # = 0b0001 _CACHE = 2 # = 0b0010 _CFA = 4 # = 0b0100 +# REVIEW: active: Set the active storage status bit mask _ACTIVE = 8 # = 0b1000 _ALL = 15 # = 0b1111 @@ -373,10 +376,8 @@ def __init__( super().__init__( source=source, _use_array=_use_array and array is not None ) - - # self._custom.setdefault("__asanyarray__", True) - if _use_array: + # REVIEW: getitem try: array = source.to_dask_array(asanyarray=False) except (AttributeError, TypeError): @@ -461,6 +462,7 @@ def __init__( except AttributeError: pass + # REVIEW: active: set the active storage status to True only for Array subclasses if self._is_abstract_Array_subclass(array): # Save the input array in case it's useful later. For # compressed input arrays this will contain extra @@ -482,13 +484,15 @@ def __init__( is_dask = is_dask_collection(array) custom["deterministic"] = not is_dask - # Set whether or not to call np.asanyarray on chunks to + # REVIEW: getitem + # Set whether or not to call `np.asanyarray` on chunks to # convert them to numpy arrays. if is_dask: # We don't know what's in the dask array, so we should - # assume that it might need converting to a numpy array.x + # assume that it might need converting to a numpy array. custom["__asanyarray__"] = True else: + # Use the array's __asanyarray__ value, if it has one. custom["__asanyarray__"] = bool( getattr(array, "__asanyarray__", False) ) @@ -514,6 +518,7 @@ def __init__( # Reset the units self._Units = units + # REVIEW: getitem # Store the dask array self._set_dask(dx, clear=_NONE, asanyarray=None) @@ -608,6 +613,7 @@ def __contains__(self, value): False """ + # REVIEW: getitem # Check that value is scalar by seeing if its shape is () shape = getattr(value, "shape", None) if shape is None: @@ -788,6 +794,7 @@ def __len__(self): TypeError: len() of unsized object """ + # REVIEW: getitem dx = self.to_dask_array(asanyarray=False) if math.isnan(dx.size): logger.debug("Computing data len: Performance may be degraded") @@ -894,6 +901,7 @@ def __getitem__(self, indices): # ------------------------------------------------------------ # Roll axes with cyclic slices # ------------------------------------------------------------ + # REVIEW: getitem if roll: # For example, if slice(-2, 3) has been requested on a # cyclic axis, then we roll that axis by two points and @@ -953,9 +961,16 @@ def __getitem__(self, indices): "Non-orthogonal indexing has not yet been implemented" ) + # REVIEW: active + # REVIEW: getitem # ------------------------------------------------------------ - # Set the subspaced dask array. Set 'asanyarray=True' to - # honour truely lazy subspacing. + # Set the subspaced dask array + # + # * A subpspace chunk may not result in an array in memory, so + # we need to set asanyarray=True + # + # * Subspacing the data does not affect the active storage + # status # ------------------------------------------------------------ new._set_dask(dx, clear=_ALL ^ _ACTIVE, asanyarray=True) @@ -1172,6 +1187,7 @@ def __setitem__(self, indices, value): return + # REVIEW: getitem @property def __asanyarray__(self): """Whether the chunks need conversion to a `numpy` array. 
@@ -1183,7 +1199,7 @@ def __asanyarray__(self): `bool` If True then at compute time add a final operation to the Dask graph that converts chunks to `numpy` arrays, - but only if a chunk's data object has an + but only if a chunk's array object has an `__asanyarray__` attribute that is also `True`. If `False` then do not do this. @@ -1406,10 +1422,12 @@ def _clear_after_dask_update(self, clear=_ALL): # Set the CFA write status to False self._cfa_del_write() + # REVIEW: active: update the active storage status if clear & _ACTIVE: # Set active storage to False self._del_active_storage() + # REVIEW: getitem def _set_dask(self, dx, copy=False, clear=_ALL, asanyarray=False): """Set the dask array. @@ -1435,11 +1453,12 @@ def _set_dask(self, dx, copy=False, clear=_ALL, asanyarray=False): asanyarray: `bool` or `None`, optional If True then at compute time add a final operation to - the Dask graph that converts chunks to `numpy` arrays, - but only if a chunk's data object has an - `__asanyarray__` attribute that is also `True`. If - False, the default, then do not do this. If `None` - then do not change the current behaviour. + the Dask graph (not in-place) that converts chunks to + `numpy` arrays, but only for chunks whose array + objects have an `__asanyarray__` attribute that is + also `True`. If False, the default, then do not do + this. If `None` then do not change the current + behaviour. .. versionadded:: NEXTRELEASE @@ -1531,6 +1550,7 @@ def _del_dask(self, default=ValueError(), clear=_ALL): self._clear_after_dask_update(clear) return out + # REVIEW: active: Set the active storage status to False def _del_active_storage(self): """Set the active storage reduction status to False. @@ -1616,6 +1636,7 @@ def _is_abstract_Array_subclass(self, array): """ return isinstance(array, cfdm.Array) + # REVIEW: active: set the active storage status def _set_active_storage(self, value): """Set the active storage reduction status. @@ -2412,6 +2433,7 @@ def pad_missing(self, axis, pad_width=None, to_size=None, inplace=False): d._set_dask(dx) return d + # REVIEW: getitem @_inplace_enabled(default=False) def percentile( self, @@ -2922,6 +2944,7 @@ def compute(self): # noqa: F811 return a + # REVIEW: getitem @_inplace_enabled(default=False) def convolution_filter( self, @@ -3287,9 +3310,13 @@ def rechunk( """ d = _inplace_enabled_define_and_cleanup(self) + # REVIEW: getitem: set asanyarray for rechunk + # REVIEW: active: No need to change active storage status after a rechunk + # Dask rechunking is essentially a wrapper for __getitem__ - # calls on the chunks, which allows us to use the same - # 'asanyarray' settings as used in `__gettem__`. + # calls on the chunks, which means that we can use the same + # 'asanyarray' and active-storage-clear settings to + # `_set_dask` as are used in `__gettem__`. dx = d.to_dask_array(asanyarray=False) dx = dx.rechunk(chunks, threshold, block_size_limit, balance) @@ -3299,6 +3326,7 @@ def rechunk( return d + # REVIEW: getitem @_inplace_enabled(default=False) def _asdatetime(self, inplace=False): """Change the internal representation of data array elements @@ -3358,6 +3386,7 @@ def _isdatetime(self): """True if the internal representation is a datetime object.""" return self.dtype.kind == "O" and self.Units.isreftime + # REVIEW: getitem @_inplace_enabled(default=False) def _asreftime(self, inplace=False): """Change the internal representation of data array elements @@ -3956,6 +3985,7 @@ def _parse_indices(self, *args, **kwargs): "Use function 'cf.parse_indices' instead." 
) + # REVIEW: getitem def _regrid( self, method=None, @@ -4259,6 +4289,7 @@ def concatenate( processed_data.append(data1) copied = not copy # to avoid making two copies in a given case + # REVIEW: getitem # Get data as dask arrays and apply concatenation operation dxs = [d.to_dask_array(asanyarray=False) for d in processed_data] dx = da.concatenate(dxs, axis=axis) @@ -4288,6 +4319,8 @@ def concatenate( cfa = _NONE break + # REVIEW: active: For the concatenated data, decide whether or not to clear the active_storage status during _set_dask + # Set the active storage status active = _ACTIVE for d in processed_data: @@ -4297,6 +4330,8 @@ def concatenate( active = _NONE break + # REVIEW: getitem: For the concatenated data, set the asanyarray argument to _set_dask + # Set the __asanyarray__ status asanyarray = processed_data[0].__asanyarray__ for d in processed_data[1:]: @@ -4933,6 +4968,7 @@ def _axes(self, value): # ---------------------------------------------------------------- # Dask attributes # ---------------------------------------------------------------- + # REVIEW: getitem @property def chunks(self): """The `dask` chunk sizes for each dimension. @@ -4957,6 +4993,7 @@ def chunks(self): # ---------------------------------------------------------------- # Attributes # ---------------------------------------------------------------- + # REVIEW: active: return the active storage status @property def active_storage(self): """Whether or not active storage reductions are possible. @@ -4967,7 +5004,7 @@ def active_storage(self): When the `active_storage` attribute is True it signifies that active storage reductions are possible, but only when all of the conditions described by `cf.data.collapse.Collapse` are - met. + also met. .. versionadded:: NEXTVERSION @@ -5006,6 +5043,7 @@ def Units(self): """ return self._Units + # REVIEW: getitem @Units.setter def Units(self, value): try: @@ -5072,6 +5110,7 @@ def data(self): """ return self + # REVIEW: getitem @property def dtype(self): """The `numpy` data-type of the data. @@ -5105,6 +5144,7 @@ def dtype(self): dx = self.to_dask_array(asanyarray=False) return dx.dtype + # REVIEW: getitem @dtype.setter def dtype(self, value): # Only change the datatype if it's different to that of the @@ -5197,6 +5237,7 @@ def hardmask(self): def hardmask(self, value): self._custom["hardmask"] = value + # REVIEW: getitem @property def is_masked(self): """True if the data array has any masked values. @@ -5233,6 +5274,7 @@ def is_masked(self): return bool(dx.any()) + # REVIEW: getitem @property def nbytes(self): """Total number of bytes consumed by the elements of the array. @@ -5267,6 +5309,7 @@ def nbytes(self): return dx.nbytes + # REVIEW: getitem @property def ndim(self): """Number of dimensions in the data array. @@ -5297,6 +5340,7 @@ def ndim(self): dx = self.to_dask_array(asanyarray=False) return dx.ndim + # REVIEW: getitem @property def npartitions(self): """The total number of chunks. @@ -5318,6 +5362,7 @@ def npartitions(self): """ return self.to_dask_array(asanyarray=False).npartitions + # REVIEW: getitem @property def numblocks(self): """The number of chunks along each dimension. @@ -5339,6 +5384,7 @@ def numblocks(self): """ return self.to_dask_array(asanyarray=False).numblocks + # REVIEW: getitem @property def shape(self): """Tuple of the data array's dimension sizes. @@ -5374,6 +5420,7 @@ def shape(self): return dx.shape + # REVIEW: getitem @property def size(self): """Number of elements in the data array. 
@@ -6444,6 +6491,7 @@ def argmin(self, axis=None, unravel=False): return type(self)(a) + # REVIEW: getitem @_inplace_enabled(default=False) def convert_reference_time( self, @@ -6629,6 +6677,7 @@ def get_data(self, default=ValueError(), _units=None, _fill_value=None): """ return self + # REVIEW: getitem def get_deterministic_name(self): """Get the deterministic name for the data. @@ -6687,6 +6736,7 @@ def get_deterministic_name(self): units._canonical_calendar, ) + # REVIEW: getitem def get_filenames(self): """The names of files containing parts of the data array. @@ -6850,6 +6900,7 @@ def set_calendar(self, calendar): """ self.Units = Units(self.get_units(default=None), calendar) + # REVIEW: getitem def add_file_location(self, location): """Add a new file location in-place. @@ -8278,6 +8329,7 @@ def uncompress(self, inplace=False): return d + # REVIEW: getitem def unique(self, split_every=None): """The unique elements of the data. @@ -9034,6 +9086,7 @@ def halo( return d + # REVIEW: getitem def harden_mask(self): """Force the mask to hard. @@ -9156,6 +9209,7 @@ def has_units(self): """ return hasattr(self.Units, "units") + # REVIEW: getitem def soften_mask(self): """Force the mask to soft. @@ -9191,6 +9245,7 @@ def soften_mask(self): self._set_dask(dx, clear=_NONE) self.hardmask = False + # REVIEW: getitem def file_locations(self): """The locations of files containing parts of the data. @@ -9224,6 +9279,7 @@ def file_locations(self): return out + # REVIEW: getitem @_inplace_enabled(default=False) def filled(self, fill_value=None, inplace=False): """Replace masked elements with a fill value. @@ -9904,6 +9960,7 @@ def override_calendar(self, calendar, inplace=False, i=False): d._Units = Units(d.Units._units, calendar) return d + # REVIEW: getitem def to_dask_array(self, apply_mask_hardness=False, asanyarray=None): """Convert the data to a `dask` array. @@ -9931,7 +9988,7 @@ def to_dask_array(self, apply_mask_hardness=False, asanyarray=None): asanyarray: `bool` or `None`, optional If True then add a final operation to the Dask graph that converts chunks to `numpy` arrays, but only if a - chunk's data object has an `__asanyarray__` attribute + chunk's array object has an `__asanyarray__` attribute that is also `True`. If False then do not do this. If `None`, the default, then the final operation is added if the `Data` object's `__asanyarray__` attribute is @@ -9976,11 +10033,15 @@ def to_dask_array(self, apply_mask_hardness=False, asanyarray=None): self.soften_mask() dx = self._custom["dask"] + # Note: The mask hardness functions have their own calls + # to cf_asanyarray, so we can don't need worry about + # setting another one. else: if asanyarray is None: asanyarray = self.__asanyarray__ if asanyarray: + # Add a new cf_asanyarray layer to the output graph dx = dx.map_blocks(cf_asanyarray, dtype=dx.dtype) return dx @@ -10211,6 +10272,7 @@ def del_calendar(self, default=ValueError()): self.override_calendar(None, inplace=True) return calendar + # REVIEW: getitem def del_file_location(self, location): """Remove a file location in-place. @@ -11213,6 +11275,7 @@ def fits_in_memory(self): """ return self.size * (self.dtype.itemsize + 1) <= free_memory() + # REVIEW: getitem @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") @_inplace_enabled(default=False) @_manage_log_level_via_verbosity @@ -11674,6 +11737,7 @@ def cosh(self, inplace=False): return d + # REVIEW: getitem def cull_graph(self): """Remove unnecessary tasks from the dask graph in-place. 
@@ -11982,6 +12046,7 @@ def tan(self, inplace=False, i=False): return d + # REVIEW: getitem def todict( self, optimize_graph=True, apply_mask_hardness=False, asanyarray=None ): @@ -12008,7 +12073,7 @@ def todict( asanyarray: `bool` or `None`, optional If True then add a final operation to the Dask graph that converts chunks to `numpy` arrays, but only if - chunk's data object has an `__asanyarray__` attribute + chunk's array object has an `__asanyarray__` attribute that is also `True`. If False then do not do this. If `None`, the default, then the final operation is added if the `Data` object's `__asanyarray__` attribute is diff --git a/cf/data/fragment/__init__.py b/cf/data/fragment/__init__.py index b7315107d4..38522d3958 100644 --- a/cf/data/fragment/__init__.py +++ b/cf/data/fragment/__init__.py @@ -1,3 +1,4 @@ +# REVIEW: h5 from .fullfragmentarray import FullFragmentArray from .h5netcdffragmentarray import H5netcdfFragmentArray from .netcdffragmentarray import NetCDFFragmentArray diff --git a/cf/data/fragment/fullfragmentarray.py b/cf/data/fragment/fullfragmentarray.py index fa0cc22adb..1212b27c29 100644 --- a/cf/data/fragment/fullfragmentarray.py +++ b/cf/data/fragment/fullfragmentarray.py @@ -9,6 +9,7 @@ class FullFragmentArray(FragmentArrayMixin, FullArray): """ + # REVIEW: h5: Replace "units/calendar" API with "attributes" def __init__( self, fill_value=None, diff --git a/cf/data/fragment/h5netcdffragmentarray.py b/cf/data/fragment/h5netcdffragmentarray.py index 0b70976c7f..c339cebb18 100644 --- a/cf/data/fragment/h5netcdffragmentarray.py +++ b/cf/data/fragment/h5netcdffragmentarray.py @@ -2,6 +2,7 @@ from .mixin import FragmentArrayMixin +# REVIEW: h5 class H5netcdfFragmentArray(FragmentArrayMixin, H5netcdfArray): """A netCDF fragment array accessed with `h5netcdf`. diff --git a/cf/data/fragment/mixin/fragmentarraymixin.py b/cf/data/fragment/mixin/fragmentarraymixin.py index 118baea2f6..c541c3094f 100644 --- a/cf/data/fragment/mixin/fragmentarraymixin.py +++ b/cf/data/fragment/mixin/fragmentarraymixin.py @@ -12,6 +12,7 @@ class FragmentArrayMixin: """ + # REVIEW: getitem def _get_array(self, index=None): """Returns a subspace of the dataset variable. @@ -128,6 +129,7 @@ def _conform_to_aggregated_units(self, array): return array + # REVIEW: getitem def _size_1_axis(self, indices): """Find the position of a unique size 1 index. diff --git a/cf/data/fragment/netcdf4fragmentarray.py b/cf/data/fragment/netcdf4fragmentarray.py index 54f3e4240d..e2f8fa62fb 100644 --- a/cf/data/fragment/netcdf4fragmentarray.py +++ b/cf/data/fragment/netcdf4fragmentarray.py @@ -2,6 +2,7 @@ from .mixin import FragmentArrayMixin +# REVIEW: h5 class NetCDF4FragmentArray(FragmentArrayMixin, NetCDF4Array): """A netCDF fragment array accessed with `netCDF4`. 
@@ -81,8 +82,6 @@ def __init__( dtype=dtype, shape=shape, mask=True, - # units=units, - # calendar=calendar, attributes=attributes, storage_options=storage_options, source=source, diff --git a/cf/data/fragment/netcdffragmentarray.py b/cf/data/fragment/netcdffragmentarray.py index 324207db7e..3ffd17a786 100644 --- a/cf/data/fragment/netcdffragmentarray.py +++ b/cf/data/fragment/netcdffragmentarray.py @@ -23,6 +23,7 @@ class NetCDFFragmentArray( """ + # REVIEW: h5: Replace "units/calendar" API with "attributes" def __init__( self, filename=None, @@ -174,6 +175,7 @@ def __init__( # By default, close the file after data array access self._set_component("close", True, copy=False) + # REVIEW: h5 def __getitem__(self, indices): """Returns a subspace of the fragment as a numpy array. diff --git a/cf/data/fragment/umfragmentarray.py b/cf/data/fragment/umfragmentarray.py index 9c5dbed13a..8e03ad1bdd 100644 --- a/cf/data/fragment/umfragmentarray.py +++ b/cf/data/fragment/umfragmentarray.py @@ -9,6 +9,7 @@ class UMFragmentArray(FragmentArrayMixin, UMArray): """ + # REVIEW: h5: Replace "units/calendar" API with "attributes" def __init__( self, filename=None, diff --git a/cf/data/utils.py b/cf/data/utils.py index 00bb88785b..7b6e3494b0 100644 --- a/cf/data/utils.py +++ b/cf/data/utils.py @@ -862,6 +862,7 @@ def collapse( "keepdims": keepdims, "split_every": split_every, "mtol": mtol, + # REVIEW: active: pass the active storage status onto the collapse functions "active_storage": d.active_storage, } @@ -872,6 +873,7 @@ def collapse( if ddof is not None: kwargs["ddof"] = ddof + # REVIEW: getitem # The applicable chunk function will have its own call to # 'cf_asanyarray', so we can set 'asanyarray=False'. Also, setting # asanyarray=False will ensure that any active storage operations @@ -991,6 +993,7 @@ def parse_weights(d, weights, axis=None): axes = d._axes Data = type(d) for key, value in weights.items(): + # REVIEW: active value = Data.asdata(value) # Make sure axes are in ascending order diff --git a/cf/domain.py b/cf/domain.py index 8d3afed0ec..cf3567d03b 100644 --- a/cf/domain.py +++ b/cf/domain.py @@ -695,6 +695,7 @@ def identity(self, default="", strict=False, relaxed=False, nc_only=False): return default + # REVIEW: h5 def identities(self): """Return all possible identities. diff --git a/cf/field.py b/cf/field.py index 56c2ef0ff8..0ba922d488 100644 --- a/cf/field.py +++ b/cf/field.py @@ -5113,6 +5113,7 @@ def histogram(self, digitized): """ raise RuntimeError("Use cf.histogram instead.") + # REVIEW: active @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") @_manage_log_level_via_verbosity def collapse( @@ -6993,6 +6994,9 @@ def collapse( "collapse" ) + # Note: It is important that size 1 axes are also passed + # on to the Data collapse, because active storage + # collapses get confused if they're not there. data_axes = f.get_data_axes() iaxes = [ data_axes.index(axis) diff --git a/cf/functions.py b/cf/functions.py index aebb3267c6..71ae42ff0f 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -162,6 +162,7 @@ def _free_memory(): return float(virtual_memory().available) +# REVIEW: active def configuration( atol=None, rtol=None, @@ -409,6 +410,7 @@ def configuration( ) +# REVIEW: active def _configuration(_Configuration, **kwargs): """Internal helper function to provide the logic for `cf.configuration`. 
@@ -566,6 +568,7 @@ class log_level(ConstantAccess, cfdm.log_level): _reset_log_emergence_level = _reset_log_emergence_level +# REVIEW: active class regrid_logging(ConstantAccess): """Whether or not to enable `esmpy` regridding logging. @@ -697,6 +700,7 @@ def _parse(cls, arg): ) # pragma: no cover +# REVIEW: active class relaxed_identities(ConstantAccess): """Use 'relaxed' mode when getting a construct identity. @@ -830,6 +834,7 @@ def _parse(cls, arg): return parse_bytes(arg) +# REVIEW: active class tempdir(ConstantAccess): """The directory for internally generated temporary files. @@ -1181,6 +1186,7 @@ def _parse(cls, arg): return arg +# REVIEW: active class active_storage(ConstantAccess): """Whether or not to attempt active storage reductions. @@ -1241,6 +1247,7 @@ def _parse(cls, arg): return bool(arg) +# REVIEW: active class active_storage_url(ConstantAccess): """The URL location of the active storage reducer. @@ -2219,61 +2226,6 @@ def parse_indices(shape, indices, cyclic=False, keepdims=True): return parsed_indices, roll -def get_subspace(array, indices): - """Return a subspace defined by the given indices of an array. - - Subset the input numpy array with the given indices. Indexing is - similar to that of a numpy array. The differences to numpy array - indexing are: - - 1. An integer index i takes the i-th element but does not reduce - the rank of the output array by one. - - 2. When more than one dimension's slice is a 1-d boolean array or - 1-d sequence of integers then these indices work independently - along each dimension (similar to the way vector subscripts work - in Fortran). - - Indices must contain an index for each dimension of the input array. - - :Parameters: - - array: `numpy.ndarray` - - indices: `list` - - """ - gg = [i for i, x in enumerate(indices) if not isinstance(x, slice)] - len_gg = len(gg) - - if len_gg < 2: - # ------------------------------------------------------------ - # At most one axis has a list-of-integers index so we can do a - # normal numpy subspace - # ------------------------------------------------------------ - return array[tuple(indices)] - - else: - # ------------------------------------------------------------ - # At least two axes have list-of-integers indices so we can't - # do a normal numpy subspace - # ------------------------------------------------------------ - if np.ma.isMA(array): - take = np.ma.take - else: - take = np.take - - indices = indices[:] - for axis in gg: - array = take(array, indices[axis], axis=axis) - indices[axis] = slice(None) - - if len_gg < len(indices): - array = array[tuple(indices)] - - return array - - _equals = cfdm.Data()._equals @@ -2644,6 +2596,7 @@ def flat(x): yield a +# REVIEW: h5 def abspath(filename): """Return a normalized absolute version of a file name. @@ -2688,6 +2641,7 @@ def abspath(filename): return filename +# REVIEW: h5 def relpath(filename, start=None): """Return a relative filepath to a file. @@ -2732,6 +2686,7 @@ def relpath(filename, start=None): return _os_path_relpath(filename) +# REVIEW: h5 def dirname(filename): """Return the directory name of a file. @@ -2767,6 +2722,7 @@ def dirname(filename): return _os_path_dirname(filename) +# REVIEW: h5 def pathjoin(path1, path2): """Join two file path components intelligently. @@ -3167,6 +3123,7 @@ def _get_module_info(module, alternative_name=False, try_except=False): ) +# REVIEW: h5 def environment(display=True, paths=True): """Return the names and versions of the cf package and its dependencies. 
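[Editor's note] The new `active_storage` and `active_storage_url` constants added to `cf/functions.py` above follow the existing `ConstantAccess` pattern. Assuming the usual query/set/context-manager behaviour (as exercised by `test_Data_active_storage` later in this patch), usage would look roughly like this; the reducer URL is a placeholder:

    import cf

    cf.active_storage()      # -> False, the default set in cf/constants.py
    cf.active_storage_url()  # -> None

    # Temporarily permit active storage reductions, as in the test suite
    with cf.active_storage(True), cf.active_storage_url(
        "https://reducer.example.org"
    ):
        # Collapses of suitable data may now be attempted in active
        # storage, subject to the conditions described by
        # cf.data.collapse.Collapse.
        ...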
diff --git a/cf/mixin/fielddomain.py b/cf/mixin/fielddomain.py index d81dbdf800..6c82147652 100644 --- a/cf/mixin/fielddomain.py +++ b/cf/mixin/fielddomain.py @@ -2022,6 +2022,7 @@ def get_coordinate_reference( return out + # REVIEW: h5 def iscyclic(self, *identity, **filter_kwargs): """Returns True if the given axis is cyclic. diff --git a/cf/mixin/propertiesdata.py b/cf/mixin/propertiesdata.py index 14444d2d36..279402bdf2 100644 --- a/cf/mixin/propertiesdata.py +++ b/cf/mixin/propertiesdata.py @@ -4692,6 +4692,7 @@ def log(self, base=None, inplace=False, i=False): delete_props=True, ) + # REVIEW: getitem def to_dask_array(self, apply_mask_hardness=False, asanyarray=None): """Convert the data to a `dask` array. @@ -4723,7 +4724,7 @@ def to_dask_array(self, apply_mask_hardness=False, asanyarray=None): asanyarray: `bool` or `None`, optional If True then add a final operation to the Dask graph that converts chunks to `numpy` arrays, but only if a - chunk's data object has an `__asanyarray__` attribute + chunk's array object has an `__asanyarray__` attribute that is `True`. If False then do not do this. If `None`, the default, then the final operation is added if the `Data` object's `__asanyarray__` attribute is diff --git a/cf/read_write/netcdf/netcdfread.py b/cf/read_write/netcdf/netcdfread.py index 901f348343..5a8892b56b 100644 --- a/cf/read_write/netcdf/netcdfread.py +++ b/cf/read_write/netcdf/netcdfread.py @@ -209,6 +209,7 @@ def _create_data( if data.npartitions == 1: data._cfa_set_write(True) + # REVIEW: h5 if ( not compression_index and self.read_vars.get("cache_metadata") @@ -253,16 +254,16 @@ def _create_data( coord_ncvar=coord_ncvar, ) + # REVIEW: h5: Replace "units/calendar" API with "attributes" attributes = kwargs["attributes"] data = self._create_Data( cfa_array, ncvar, - units=attributes.get("units"), # units=kwargs["units"], - calendar=attributes.get( - "calendar" - ), # calendar=kwargs["calendar"], + units=attributes.get("units"), + calendar=attributes.get("calendar"), ) + # REVIEW: h5 # Note: We don't cache elements from CFA variables, because # the data are in fragment files which have not been # opened and may not not even be openable (such as could @@ -623,6 +624,7 @@ def _cache_data_elements(self, data, ncvar): # Store the elements in the data object data._set_cached_elements(elements) + # REVIEW: h5 def _create_cfanetcdfarray( self, ncvar, @@ -707,6 +709,7 @@ def _create_cfanetcdfarray( return array, kwargs + # REVIEW: h5 def _create_cfanetcdfarray_term( self, parent_ncvar, @@ -769,8 +772,6 @@ def _create_cfanetcdfarray_term( kwargs["x"] = aggregation_instructions kwargs["instructions"] = " ".join(sorted(instructions)) - # Use the kwargs to create a CFANetCDFArray instance - # array = self.implementation.initialise_CFANetCDFArray(**kwargs) if g["original_netCDF4"]: array = self.implementation.initialise_CFANetCDF4Array(**kwargs) else: @@ -934,6 +935,7 @@ def _customise_field_ancillaries(self, parent_ncvar, f): return out + # REVIEW: h5 def _cfa_parse_aggregated_data(self, ncvar, aggregated_data): """Parse a CFA-netCDF ``aggregated_data`` attribute. @@ -977,7 +979,12 @@ def _cfa_parse_aggregated_data(self, ncvar, aggregated_data): variable = g["variables"][term_ncvar] array = cfdm.netcdf_indexer( - variable, mask=True, unpack=True, always_masked_array=False + variable, + mask=True, + unpack=True, + always_masked_array=False, + orthogonal_indexing=False, + copy=False, ) aggregation_instructions[term_ncvar] = array[...] 
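[Editor's note] Several changes above pass an `orthogonal_indexing` flag to `cfdm.netcdf_indexer`, and the `IndexMixin` docstring earlier in this patch states that combined lazy indices are applied orthogonally. A small plain-`numpy` illustration of the distinction, independent of this patch:

    import numpy as np

    a = np.arange(12).reshape(3, 4)

    # NumPy fancy indexing pairs the integer sequences element-wise:
    a[[0, 2], [1, 3]]            # array([ 1, 11])

    # Orthogonal indexing applies each sequence independently per
    # dimension, which is the convention assumed for the lazy index
    # handling here:
    a[np.ix_([0, 2], [1, 3])]    # array([[ 1,  3],
                                 #        [ 9, 11]])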
diff --git a/cf/read_write/netcdf/netcdfwrite.py b/cf/read_write/netcdf/netcdfwrite.py index 460b758089..6b035568a2 100644 --- a/cf/read_write/netcdf/netcdfwrite.py +++ b/cf/read_write/netcdf/netcdfwrite.py @@ -106,15 +106,8 @@ def _write_as_cfa(self, cfvar, construct_type, domain_axes): f"Can't write {cfvar!r} as a CFA-netCDF " "aggregation variable. Possible reasons for this " "include 1) there is more than one Dask chunk " - "per fragment file, and 2) data values have been " - "changed relative to those in the fragment files. " - "\n\n" - "In case 1), setting chunks=None as an " - "argument to cf.read may solve the problem. " - "You could consider setting cfa={'strict': False} " - "as an argument to cf.write, but note the this " - "will create a copy of the data for this " - "variable in the output dataset." + "per fragment, and 2) data values have been " + "changed relative to those in the fragments." ) return cfa_get_write @@ -586,8 +579,8 @@ def _create_cfa_data(self, ncvar, ncdimensions, data, cfvar): }, ) - # REVIEW: h5: Function _convert_to_builtin_type was a CFA-0.4 thing - + # REVIEW: h5: Deleted function _convert_to_builtin_type was a CFA-0.4 thing + def _check_valid(self, array, cfvar=None, attributes=None): """Checks for array values outside of the valid range. @@ -713,6 +706,7 @@ def _cfa_write_term_variable( return ncvar + # REVIEW: getitem: def _cfa_write_non_standard_terms( self, field, fragment_ncdimensions, aggregated_data ): @@ -756,8 +750,6 @@ def _cfa_write_non_standard_terms( # more than one unique value then the fragment's value is # missing data. # - # REVIEW: getitem: asanyarray parameter - # # '_cfa_unique' has its own call to 'cf_asanyarray', so # we can set 'asanyarray=False'. dx = data.to_dask_array(asanyarray=False) @@ -797,6 +789,7 @@ def _cfa_write_non_standard_terms( return aggregated_data_attr + # REVIEW: getitem @classmethod def _cfa_unique(cls, a): """Return the unique value of an array. @@ -818,7 +811,6 @@ def _cfa_unique(cls, a): data if there is not a unique value. """ - # REVIEW: getitem: make sure that 'a' is usable data a = cf_asanyarray(a) out_shape = (1,) * a.ndim @@ -832,6 +824,7 @@ def _cfa_unique(cls, a): return np.ma.masked_all(out_shape, dtype=a.dtype) + # REVIEW: getitem def _cfa_aggregation_instructions(self, data, cfvar): """Convert data to standardised CFA aggregation instruction terms. @@ -889,17 +882,17 @@ def _cfa_aggregation_instructions(self, data, cfvar): if len(file_details) != 1: if file_details: raise ValueError( - "Can't write CFA-netCDF aggregation variable from " - f"{cfvar!r}: Dask chunk defined by indices " - f"{indices} spans two or more fragment files." - "A possible fix for this is to set chunks=None as an " - "argument to cf.read" + f"Can't write {cfvar!r} as a CFA-netCDF " + "aggregation variable: Dask chunk defined by index " + f"{indices} spans two or more fragments." + "A possible fix for this is to set chunks=None as " + "an argument of a prior call to cf.read" ) raise ValueError( - "Can't write CFA-netCDF aggregation variable from " - f"{cfvar!r}: Dask chunk defined by indices " - f"{indices} spans zero files" + f"Can't write {cfvar!r} as a CFA-netCDF " + "aggregation variable: Dask chunk defined by index " + f"{indices} spans zero fragments." 
) filenames, addresses, formats = file_details.pop() diff --git a/cf/read_write/read.py b/cf/read_write/read.py index 670ab19219..6a007892f4 100644 --- a/cf/read_write/read.py +++ b/cf/read_write/read.py @@ -33,6 +33,7 @@ logger = logging.getLogger(__name__) + @_manage_log_level_via_verbosity def read( files, @@ -68,7 +69,7 @@ def read( # REVIEW: h5: new storage_options parameter to control access to S3 storage_options=None, # REVIEW: h5: cache_metadata parameter to control whethe or not to get to caache selected data elements - cache_metadata=True, + cache_metadata=True, ): """Read field or domain constructs from files. diff --git a/cf/read_write/write.py b/cf/read_write/write.py index 44cab26398..fdfe6f7fbb 100644 --- a/cf/read_write/write.py +++ b/cf/read_write/write.py @@ -14,6 +14,7 @@ # REVIEW: h5: docstring improvements + @_manage_log_level_via_verbosity def write( fields, diff --git a/cf/regrid/regrid.py b/cf/regrid/regrid.py index 4e8ff58bfa..e99d07f9bd 100644 --- a/cf/regrid/regrid.py +++ b/cf/regrid/regrid.py @@ -2464,6 +2464,7 @@ def create_esmpy_weights( from netCDF4 import Dataset from .. import __version__ + # REVIEW: h5: new name and location of file lock from ..data.array.locks import netcdf_lock diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index c9224443f8..85e6f253d4 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -1479,6 +1479,7 @@ def test_Data__getitem__(self): f = cf.Data([-999, 35], mask=[True, False]).reshape(2, 1) self.assertTrue(e.equals(f)) + # REVIEW: getitem # Chained subspaces reading from disk f = cf.read(self.filename)[0] d = f.data @@ -3291,6 +3292,7 @@ def test_Data_rechunk(self): self.assertEqual(e.chunks, ((4,), (5,))) self.assertTrue(e.equals(d)) + # REVIEW: getitem # Test rechunking after a __getitem__ e = d[:2].rechunk((2, 5)) self.assertTrue(e.equals(d[:2])) @@ -4520,6 +4522,7 @@ def test_Data__str__(self): for element in elements0: self.assertNotIn(element, d._get_cached_elements()) + # REVIEW: active def test_Data_active_storage(self): """Test `Data.active_storage`.""" with cf.active_storage(True): @@ -4567,15 +4570,18 @@ def test_Data_active_storage(self): d = cf.Data(n, to_memory=True) self.assertFalse(d.active_storage) + # REVIEW: getitem def test_Data_cull_graph(self): """Test `Data.cull`""" + # Note: The number of layers in the culled graphs include a + # `cf_asanyarray` layer d = cf.Data([1, 2, 3, 4, 5], chunks=3) d = d[:2] - self.assertEqual(len(dict(d.to_dask_array().dask)), 4) + self.assertEqual(len(dict(d.to_dask_array(asanyarray=False).dask)), 3) # Check that there are fewer keys after culling d.cull_graph() - self.assertEqual(len(dict(d.to_dask_array().dask)), 3) + self.assertEqual(len(dict(d.to_dask_array(asanyarray=False).dask)), 2) def test_Data_npartitions(self): """Test the `npartitions` Data property.""" @@ -4823,6 +4829,7 @@ def test_Data_pad_missing(self): with self.assertRaises(ValueError): d.pad_missing(99, to_size=99) + # REVIEW: getitem def test_Data_is_masked(self): """Test Data.is_masked.""" d = cf.Data(np.arange(6).reshape(2, 3)) diff --git a/cf/test/test_Field.py b/cf/test/test_Field.py index e012d535fe..b819b0d0e8 100644 --- a/cf/test/test_Field.py +++ b/cf/test/test_Field.py @@ -1158,6 +1158,7 @@ def test_Field_insert_dimension(self): with self.assertRaises(ValueError): f.insert_dimension(1, "qwerty") + # REVIEW: getitem def test_Field_indices(self): f = cf.read(self.filename)[0] diff --git a/cf/test/test_FullArray.py b/cf/test/test_FullArray.py index 63dcb84f34..5a8faf1d6c 100644 --- 
a/cf/test/test_FullArray.py +++ b/cf/test/test_FullArray.py @@ -9,6 +9,7 @@ import cf +# REVIEW: getitem class FullArrayTest(unittest.TestCase): def test_FullValue_inspection(self): full = 9 diff --git a/cf/test/test_NetCDF4Array.py b/cf/test/test_NetCDF4Array.py index 0d049ff497..2124424a77 100644 --- a/cf/test/test_NetCDF4Array.py +++ b/cf/test/test_NetCDF4Array.py @@ -32,6 +32,7 @@ def _remove_tmpfiles(): atexit.register(_remove_tmpfiles) +# REVIEW: h5 class NetCDF4ArrayTest(unittest.TestCase): n = cf.NetCDF4Array( filename="filename.nc", @@ -40,6 +41,7 @@ class NetCDF4ArrayTest(unittest.TestCase): dtype=np.dtype(float), ) + # REVIEW: h5 def test_NetCDF4Array_del_file_location(self): a = cf.NetCDF4Array(("/data1/file1", "/data2/file2"), ("tas1", "tas2")) b = a.del_file_location("/data1") @@ -60,6 +62,7 @@ def test_NetCDF4Array_del_file_location(self): with self.assertRaises(ValueError): b.del_file_location("/data1/") + # REVIEW: h5 def test_NetCDF4Array_file_locations(self): a = cf.NetCDF4Array("/data1/file1") self.assertEqual(a.file_locations(), ("/data1",)) @@ -70,6 +73,7 @@ def test_NetCDF4Array_file_locations(self): a = cf.NetCDF4Array(("/data1/file1", "/data2/file2", "/data1/file2")) self.assertEqual(a.file_locations(), ("/data1", "/data2", "/data1")) + # REVIEW: h5 def test_NetCDF4Array_add_file_location(self): a = cf.NetCDF4Array("/data1/file1", "tas") b = a.add_file_location("/home/user") @@ -105,6 +109,7 @@ def test_NetCDF4Array_add_file_location(self): self.assertEqual(b.get_filenames(), a.get_filenames()) self.assertEqual(b.get_addresses(), a.get_addresses()) + # REVIEW: h5 def test_NetCDF4Array__dask_tokenize__(self): a = cf.NetCDF4Array("/data1/file1", "tas", shape=(12, 2), mask=False) self.assertEqual(tokenize(a), tokenize(a.copy())) @@ -112,6 +117,7 @@ def test_NetCDF4Array__dask_tokenize__(self): b = cf.NetCDF4Array("/home/file2", "tas", shape=(12, 2)) self.assertNotEqual(tokenize(a), tokenize(b)) + # REVIEW: h5 def test_NetCDF4Array_multiple_files(self): f = cf.example_field(0) cf.write(f, tmpfile1) @@ -129,6 +135,7 @@ def test_NetCDF4Array_multiple_files(self): self.assertEqual(len(n.get_filenames()), 2) self.assertTrue((n[...] 
== f.array).all()) + # REVIEW: getitem def test_NetCDF4Array_shape(self): shape = (12, 73, 96) a = cf.NetCDF4Array("/home/file2", "tas", shape=shape) @@ -138,6 +145,7 @@ def test_NetCDF4Array_shape(self): self.assertEqual(a.shape, (shape[0] // 2,) + shape[1:]) self.assertEqual(a.original_shape, shape) + # REVIEW: getitem def test_NetCDF4Array_index(self): shape = (12, 73, 96) a = cf.NetCDF4Array("/home/file2", "tas", shape=shape) diff --git a/cf/test/test_active_storage.py b/cf/test/test_active_storage.py index 985d55654a..b2d166f7a8 100644 --- a/cf/test/test_active_storage.py +++ b/cf/test/test_active_storage.py @@ -34,11 +34,10 @@ def _remove_tmpfiles(): atexit.register(_remove_tmpfiles) +# REVIEW: active class ActiveStorageTest(unittest.TestCase): @unittest.skipUnless(Active is not None, "Requires activestorage.Active") def test_active_storage(self): - # print("WARNING: Skipping active storage test!") - # return # No masked values f = cf.example_field(0) cf.write(f, tmpfile) @@ -50,17 +49,14 @@ def test_active_storage(self): cf.active_storage(False) self.assertFalse(cf.active_storage()) f.collapse("mean", weights=False) - print("=========") + local_array = f.collapse("mean", weights=False).array - print("=========") with cf.configuration(active_storage=True, active_storage_url="dummy"): self.assertTrue(cf.active_storage()) self.assertEqual(cf.active_storage_url(), "dummy") self.assertTrue(f.data.active_storage) active_array = f.collapse("mean", weights=False).array - print(active_array) - print(local_array) self.assertEqual(active_array, local_array) diff --git a/cf/test/test_functions.py b/cf/test/test_functions.py index 9c015f43ba..4f32bdbdc5 100644 --- a/cf/test/test_functions.py +++ b/cf/test/test_functions.py @@ -43,6 +43,7 @@ def test_aliases(self): self.assertEqual(cf.tempdir(), cf.TEMPDIR()) self.assertEqual(cf.chunksize(), cf.CHUNKSIZE()) + # REVIEW: active def test_configuration(self): # This test assumes 'total_memory' remains constant throughout # the test run, which should be true generally in any diff --git a/cf/test/test_read_write.py b/cf/test/test_read_write.py index 8cdf0fb505..4fc3fb4aa4 100644 --- a/cf/test/test_read_write.py +++ b/cf/test/test_read_write.py @@ -79,6 +79,7 @@ def test_write_filename(self): self.assertTrue((a == g[0].array).all()) + # REVIEW: h5 def test_read_mask(self): f = self.f0.copy() @@ -560,6 +561,7 @@ def test_read_write_netCDF4_compress_shuffle(self): f"Bad read/write with lossless compression: {fmt}", ) + # REVIEW: h5 def test_write_datatype(self): f = cf.read(self.filename)[0] self.assertEqual(f.dtype, np.dtype(float)) @@ -925,8 +927,6 @@ def test_write_omit_data(self): ) def test_read_url(self): """Test reading urls.""" - print("SKIPPING URL TEST") - return for scheme in ("http", "https"): remote = f"{scheme}://psl.noaa.gov/thredds/dodsC/Datasets/cru/crutem5/Monthlies/air.mon.anom.nobs.nc" # Check that cf can access it From b3dc1bd632daa74df4c67d37322c1ffc3ecbd111 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Sat, 20 Apr 2024 13:55:44 +0100 Subject: [PATCH 087/134] dev --- cf/data/dask_utils.py | 25 ++++--- cf/data/data.py | 162 ++++++++++++++++++++++++++---------------- 2 files changed, 112 insertions(+), 75 deletions(-) diff --git a/cf/data/dask_utils.py b/cf/data/dask_utils.py index bf9f73d444..c5b110c07a 100644 --- a/cf/data/dask_utils.py +++ b/cf/data/dask_utils.py @@ -103,7 +103,6 @@ def allclose(a_blocks, b_blocks, rtol=rtol, atol=atol): ) -# REVIEW: getitem def cf_contains(a, value): """Whether or not an array contains a 
value. @@ -127,12 +126,12 @@ def cf_contains(a, value): value. """ + # REVIEW: getitem: convert a to a usable array a = cf_asanyarray(a) value = cf_asanyarray(value) return np.array(value in a).reshape((1,) * a.ndim) -# REVIEW: getitem def cf_convolve1d(a, window=None, axis=-1, origin=0): """Calculate a 1-d convolution along the given axis. @@ -163,6 +162,7 @@ def cf_convolve1d(a, window=None, axis=-1, origin=0): Convolved float array with same shape as input. """ + # REVIEW: getitem: convert a to a usable array a = cf_asanyarray(a) # Cast to float to ensure that NaNs can be stored @@ -186,7 +186,6 @@ def cf_convolve1d(a, window=None, axis=-1, origin=0): return c -# REVIEW: getitem def cf_harden_mask(a): """Harden the mask of a masked `numpy` array. @@ -207,6 +206,7 @@ def cf_harden_mask(a): The array with hardened mask. """ + # REVIEW: getitem: convert a to a usable array a = cf_asanyarray(a) if np.ma.isMA(a): try: @@ -219,7 +219,6 @@ def cf_harden_mask(a): return a -# REVIEW: getitem def cf_percentile(a, q, axis, method, keepdims=False, mtol=1): """Compute percentiles of the data along the specified axes. @@ -279,6 +278,7 @@ def cf_percentile(a, q, axis, method, keepdims=False, mtol=1): """ from math import prod + # REVIEW: getitem: convert a to a usable array a = cf_asanyarray(a) if np.ma.isMA(a) and not np.ma.is_masked(a): @@ -354,7 +354,6 @@ def cf_percentile(a, q, axis, method, keepdims=False, mtol=1): return p -# REVIEW: getitem def cf_soften_mask(a): """Soften the mask of a masked `numpy` array. @@ -375,6 +374,7 @@ def cf_soften_mask(a): The array with softened mask. """ + # REVIEW: getitem: convert a to a usable array a = cf_asanyarray(a) if np.ma.isMA(a): @@ -388,7 +388,6 @@ def cf_soften_mask(a): return a -# REVIEW: getitem def cf_where(array, condition, x, y, hardmask): """Set elements of *array* from *x* or *y* depending on *condition*. @@ -433,6 +432,7 @@ def cf_where(array, condition, x, y, hardmask): elsewhere. """ + # REVIEW: getitem: convert array, condition, x, y to usable arrays array = cf_asanyarray(array) condition = cf_asanyarray(condition) if x is not None: @@ -499,7 +499,6 @@ def _getattr(x, attr): _array_getattr = np.vectorize(_getattr, excluded="attr") -# REVIEW: getitem def cf_YMDhms(a, attr): """Return a date-time component from an array of date-time objects. @@ -541,7 +540,6 @@ def cf_YMDhms(a, attr): return _array_getattr(a, attr=attr) -# REVIEW: getitem def cf_rt2dt(a, units): """Convert an array of reference times to date-time objects. @@ -571,6 +569,7 @@ def cf_rt2dt(a, units): cftime.DatetimeGregorian(2000, 1, 2, 0, 0, 0, 0, has_year_zero=False)] """ + # REVIEW: getitem: convert a to a usable array a = cf_asanyarray(a) if not units.iscalendartime: return rt2dt(a, units_in=units) @@ -595,7 +594,6 @@ def _convert(x, units, reftime): )(a) -# REVIEW: getitem def cf_dt2rt(a, units): """Convert an array of date-time objects to reference times. @@ -627,11 +625,11 @@ def cf_dt2rt(a, units): [365 366] """ + # REVIEW: getitem: convert a to a usable array a = cf_asanyarray(a) return dt2rt(a, units_out=units, units_in=None) -# REVIEW: getitem def cf_units(a, from_units, to_units): """Convert array values to have different equivalent units. @@ -669,13 +667,13 @@ def cf_units(a, from_units, to_units): [1000. 2000.] """ + # REVIEW: getitem: convert a to a usable array a = cf_asanyarray(a) return Units.conform( a, from_units=from_units, to_units=to_units, inplace=False ) -# REVIEW: getitem def cf_is_masked(a): """Determine whether an array has masked values. 
@@ -694,12 +692,12 @@ def cf_is_masked(a): values. """ + # REVIEW: getitem: convert a to a usable array a = cf_asanyarray(a) out = np.ma.is_masked(a) return np.array(out).reshape((1,) * a.ndim) -# REVIEW: getitem def cf_filled(a, fill_value=None): """Replace masked elements with a fill value. @@ -728,11 +726,11 @@ def cf_filled(a, fill_value=None): [[-999 2 3]] """ + # REVIEW: getitem: convert a to a usable array a = cf_asanyarray(a) return np.ma.filled(a, fill_value=fill_value) -# REVIEW: getitem def cf_asanyarray(a): """Convert to a `numpy` array. @@ -751,6 +749,7 @@ def cf_asanyarray(a): The converted array, or the input array unchanged. """ + # REVIEW: getitem: convert a to a usable array if getattr(a, "__asanyarray__", False): return np.asanyarray(a) diff --git a/cf/data/data.py b/cf/data/data.py index 38cf2c3124..73cb804a0b 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -44,7 +44,7 @@ from .collapse import Collapse from .creation import generate_axis_identifiers, to_dask -# REVIEW: getitem +# REVIEW: getitem: import cf_asanyarray, cf_filled, cf_is_masked from .dask_utils import ( _da_ma_allclose, cf_asanyarray, @@ -377,7 +377,7 @@ def __init__( source=source, _use_array=_use_array and array is not None ) if _use_array: - # REVIEW: getitem + # REVIEW: getitem: set new asanyarray keyword in to_dask_array try: array = source.to_dask_array(asanyarray=False) except (AttributeError, TypeError): @@ -484,7 +484,7 @@ def __init__( is_dask = is_dask_collection(array) custom["deterministic"] = not is_dask - # REVIEW: getitem + # REVIEW: getitem: Set whether or not to call `np.asanyarray` on chunks to convert them to numpy arrays. # Set whether or not to call `np.asanyarray` on chunks to # convert them to numpy arrays. if is_dask: @@ -518,7 +518,7 @@ def __init__( # Reset the units self._Units = units - # REVIEW: getitem + # REVIEW: getitem: set new asanyarray keyword in _set_dask # Store the dask array self._set_dask(dx, clear=_NONE, asanyarray=None) @@ -613,7 +613,6 @@ def __contains__(self, value): False """ - # REVIEW: getitem # Check that value is scalar by seeing if its shape is () shape = getattr(value, "shape", None) if shape is None: @@ -642,6 +641,7 @@ def __contains__(self, value): f"not {value!r}" ) + # REVIEW: getitem: `cf_contains`: set 'asanyarray' # If value is a scalar Data object then conform its units if isinstance(value, self.__class__): self_units = self.Units @@ -794,7 +794,7 @@ def __len__(self): TypeError: len() of unsized object """ - # REVIEW: getitem + # REVIEW: getitem: set new asanyarray keyword in to_dask_array dx = self.to_dask_array(asanyarray=False) if math.isnan(dx.size): logger.debug("Computing data len: Performance may be degraded") @@ -901,7 +901,7 @@ def __getitem__(self, indices): # ------------------------------------------------------------ # Roll axes with cyclic slices # ------------------------------------------------------------ - # REVIEW: getitem + # REVIEW: getitem: TODO if roll: # For example, if slice(-2, 3) has been requested on a # cyclic axis, then we roll that axis by two points and @@ -966,8 +966,8 @@ def __getitem__(self, indices): # ------------------------------------------------------------ # Set the subspaced dask array # - # * A subpspace chunk may not result in an array in memory, so - # we need to set asanyarray=True + # * A subpspaced chunk might not result in an array in memory, + # so we need to set asanyarray=True # # * Subspacing the data does not affect the active storage # status @@ -1187,7 +1187,6 @@ def 
__setitem__(self, indices, value): return - # REVIEW: getitem @property def __asanyarray__(self): """Whether the chunks need conversion to a `numpy` array. @@ -1204,6 +1203,7 @@ def __asanyarray__(self): `False` then do not do this. """ + # REVIEW: getitem: New __asanyarray__ property. return self._custom.get("__asanyarray__", True) @property @@ -1427,7 +1427,7 @@ def _clear_after_dask_update(self, clear=_ALL): # Set active storage to False self._del_active_storage() - # REVIEW: getitem + # REVIEW: getitem: Include new asanyarray keyword to _set_dask def _set_dask(self, dx, copy=False, clear=_ALL, asanyarray=False): """Set the dask array. @@ -1454,7 +1454,7 @@ def _set_dask(self, dx, copy=False, clear=_ALL, asanyarray=False): asanyarray: `bool` or `None`, optional If True then at compute time add a final operation to the Dask graph (not in-place) that converts chunks to - `numpy` arrays, but only for chunks whose array + `numpy` arrays, but only for those chunks whose array objects have an `__asanyarray__` attribute that is also `True`. If False, the default, then do not do this. If `None` then do not change the current @@ -1491,6 +1491,7 @@ def _set_dask(self, dx, copy=False, clear=_ALL, asanyarray=False): custom = self._custom custom["dask"] = dx + # REVIEW: getitem: Set __asanyarray__ from within _set_dask. if asanyarray is not None: custom["__asanyarray__"] = bool(asanyarray) @@ -2433,7 +2434,6 @@ def pad_missing(self, axis, pad_width=None, to_size=None, inplace=False): d._set_dask(dx) return d - # REVIEW: getitem @_inplace_enabled(default=False) def percentile( self, @@ -2621,6 +2621,7 @@ def percentile( else: axes = tuple(sorted(d._parse_axes(axes))) + # REVIEW: getitem: 'cf_percentile' has its own call to 'cf_asanyarray', so we can set 'asanyarray=False'. # 'cf_percentile' has its own call to 'cf_asanyarray', so we # can set 'asanyarray=False'. dx = d.to_dask_array(asanyarray=False) @@ -2944,7 +2945,6 @@ def compute(self): # noqa: F811 return a - # REVIEW: getitem @_inplace_enabled(default=False) def convolution_filter( self, @@ -3119,10 +3119,10 @@ def convolution_filter( depth += abs(origin) - # 'cf_convolve1d' has its own call to 'cf_asanyarray', but we - # need to pre-empt that so that the halos can be created. - dx = d.to_dask_array(asanyarray=None) + # TODO: check that this is OK + dx = d.to_dask_array() + # REVIEW: getitem: rectify comment # Cast to float to ensure that NaNs can be stored (so # map_overlap can correctly assign the halos) if dx.dtype != float: @@ -3310,13 +3310,13 @@ def rechunk( """ d = _inplace_enabled_define_and_cleanup(self) - # REVIEW: getitem: set asanyarray for rechunk - # REVIEW: active: No need to change active storage status after a rechunk + # REVIEW: getitem: set asanyarray keyword for rechunk + # REVIEW: active: Do not change active storage status after a rechunk # Dask rechunking is essentially a wrapper for __getitem__ # calls on the chunks, which means that we can use the same - # 'asanyarray' and active-storage-clear settings to - # `_set_dask` as are used in `__gettem__`. + # 'asanyarray' and 'clear' keywords to `_set_dask` as are used + # in `__gettem__`. 
dx = d.to_dask_array(asanyarray=False) dx = dx.rechunk(chunks, threshold, block_size_limit, balance) @@ -3326,7 +3326,6 @@ def rechunk( return d - # REVIEW: getitem @_inplace_enabled(default=False) def _asdatetime(self, inplace=False): """Change the internal representation of data array elements @@ -3373,6 +3372,7 @@ def _asdatetime(self, inplace=False): f"Can't convert {units!r} values to date-time objects" ) + # REVIEW: getitem: set asanyarray keyword for _asdatetime if not d._isdatetime(): # 'cf_rt2dt' has its own call to 'cf_asanyarray', so we # can set 'asanyarray=False'. @@ -3386,7 +3386,6 @@ def _isdatetime(self): """True if the internal representation is a datetime object.""" return self.dtype.kind == "O" and self.Units.isreftime - # REVIEW: getitem @_inplace_enabled(default=False) def _asreftime(self, inplace=False): """Change the internal representation of data array elements @@ -3431,6 +3430,7 @@ def _asreftime(self, inplace=False): f"Can't convert {units!r} values to numeric reference times" ) + # REVIEW: getitem: set asanyarray keyword for _asreftime if d._isdatetime(): # 'cf_dt2rt' has its own call to 'cf_asanyarray', so we # can set 'asanyarray=False'. @@ -3985,7 +3985,6 @@ def _parse_indices(self, *args, **kwargs): "Use function 'cf.parse_indices' instead." ) - # REVIEW: getitem def _regrid( self, method=None, @@ -4045,6 +4044,7 @@ def _regrid( f"the shape of the regrid operator: {operator.src_shape}" ) + # REVIEW: getitem: set asanyarray keyword for _regrid # 'regrid' has its own calls to 'cf_asanyarray', so we can set # 'asanyarray=False'. dx = self.to_dask_array(asanyarray=False) @@ -4289,8 +4289,11 @@ def concatenate( processed_data.append(data1) copied = not copy # to avoid making two copies in a given case - # REVIEW: getitem - # Get data as dask arrays and apply concatenation operation + # REVIEW: getitem: `concatenate`: set 'asanyarray' + # Get data as dask arrays and apply concatenation + # operation. We can set 'asanyarray=False' because at compute + # time the concatenation operation does not need to access the + # actual data. dxs = [d.to_dask_array(asanyarray=False) for d in processed_data] dx = da.concatenate(dxs, axis=axis) @@ -4319,9 +4322,8 @@ def concatenate( cfa = _NONE break - # REVIEW: active: For the concatenated data, decide whether or not to clear the active_storage status during _set_dask - - # Set the active storage status + # REVIEW: active: `concatenate`: define the active_storage status + # Define the active_storage status active = _ACTIVE for d in processed_data: if not d.active_storage: @@ -4330,9 +4332,8 @@ def concatenate( active = _NONE break - # REVIEW: getitem: For the concatenated data, set the asanyarray argument to _set_dask - - # Set the __asanyarray__ status + # REVIEW: getitem: `concatenate`: define the asanyarray status + # Define the __asanyarray__ status asanyarray = processed_data[0].__asanyarray__ for d in processed_data[1:]: if d.__asanyarray__ != asanyarray: @@ -4342,6 +4343,8 @@ def concatenate( asanyarray = True break + # REVIEW: getitem: `concatenate`: set 'asanyarray' + # REVIEW: active: `concatenate`: set 'clear' # Set the new dask array data0._set_dask(dx, clear=_ALL ^ cfa ^ active, asanyarray=asanyarray) @@ -4968,7 +4971,6 @@ def _axes(self, value): # ---------------------------------------------------------------- # Dask attributes # ---------------------------------------------------------------- - # REVIEW: getitem @property def chunks(self): """The `dask` chunk sizes for each dimension. 
@@ -4988,6 +4990,9 @@ def chunks(self): 6 """ + # REVIEW: getitem: `chunks`: set 'asanyarray' + # The dask graph is never going to be computed, so we can set + # 'asanyarray=False'. return self.to_dask_array(asanyarray=False).chunks # ---------------------------------------------------------------- @@ -5043,7 +5048,6 @@ def Units(self): """ return self._Units - # REVIEW: getitem @Units.setter def Units(self, value): try: @@ -5071,6 +5075,8 @@ def Units(self, value): cf_func = partial(cf_units, from_units=old_units, to_units=value) + + # REVIEW: getitem: `Units`: set 'asanyarray' # 'cf_units' has its own call to 'cf_asanyarray', so we can # set 'asanyarray=False'. dx = self.to_dask_array(asanyarray=False) @@ -5110,7 +5116,6 @@ def data(self): """ return self - # REVIEW: getitem @property def dtype(self): """The `numpy` data-type of the data. @@ -5141,10 +5146,12 @@ def dtype(self): [1 2 3] """ + # REVIEW: getitem: `dtype`: set 'asanyarray' + # The dask graph is never going to be computed, so we can set + # 'asanyarray=False'. dx = self.to_dask_array(asanyarray=False) return dx.dtype - # REVIEW: getitem @dtype.setter def dtype(self, value): # Only change the datatype if it's different to that of the @@ -5237,7 +5244,6 @@ def hardmask(self): def hardmask(self, value): self._custom["hardmask"] = value - # REVIEW: getitem @property def is_masked(self): """True if the data array has any masked values. @@ -5256,6 +5262,7 @@ def is_masked(self): True """ + # REVIEW: getitem: `is_masked`: set 'asanyarray' # 'cf_is_masked' has its own call to 'cf_asanyarray', so we # can set 'asanyarray=False'. dx = self.to_dask_array(asanyarray=False) @@ -5274,7 +5281,6 @@ def is_masked(self): return bool(dx.any()) - # REVIEW: getitem @property def nbytes(self): """Total number of bytes consumed by the elements of the array. @@ -5302,6 +5308,9 @@ def nbytes(self): 24 """ + # REVIEW: getitem: `nbytes`: set 'asanyarray' + # The dask graph is never going to be computed, so we can set + # 'asanyarray=False'. dx = self.to_dask_array(asanyarray=False) if math.isnan(dx.size): logger.debug("Computing data nbytes: Performance may be degraded") @@ -5309,7 +5318,6 @@ def nbytes(self): return dx.nbytes - # REVIEW: getitem @property def ndim(self): """Number of dimensions in the data array. @@ -5337,10 +5345,12 @@ def ndim(self): 0 """ + # REVIEW: getitem: `ndim`: set 'asanyarray' + # The dask graph is never going to be computed, so we can set + # 'asanyarray=False'. dx = self.to_dask_array(asanyarray=False) return dx.ndim - # REVIEW: getitem @property def npartitions(self): """The total number of chunks. @@ -5360,9 +5370,11 @@ def npartitions(self): 6 """ + # REVIEW: getitem: `npartitions`: set 'asanyarray' + # The dask graph is never going to be computed, so we can set + # 'asanyarray=False'. return self.to_dask_array(asanyarray=False).npartitions - # REVIEW: getitem @property def numblocks(self): """The number of chunks along each dimension. @@ -5382,9 +5394,11 @@ def numblocks(self): 6 """ + # REVIEW: getitem: `numblocks` set 'asanyarray' + # The dask graph is never going to be computed, so we can set + # 'asanyarray=False'. return self.to_dask_array(asanyarray=False).numblocks - # REVIEW: getitem @property def shape(self): """Tuple of the data array's dimension sizes. @@ -5413,6 +5427,9 @@ def shape(self): () """ + # REVIEW: getitem: `shape`: set 'asanyarray' + # The dask graph is never going to be computed, so we can set + # 'asanyarray=False'. 
dx = self.to_dask_array(asanyarray=False) if math.isnan(dx.size): logger.debug("Computing data shape: Performance may be degraded") @@ -5420,7 +5437,6 @@ def shape(self): return dx.shape - # REVIEW: getitem @property def size(self): """Number of elements in the data array. @@ -5453,6 +5469,9 @@ def size(self): 1 """ + # REVIEW: getitem: `size` set 'asanyarray' + # The dask graph is never going to be computed, so we can set + # 'asanyarray=False'. dx = self.to_dask_array(asanyarray=False) size = dx.size if math.isnan(size): @@ -6491,7 +6510,6 @@ def argmin(self, axis=None, unravel=False): return type(self)(a) - # REVIEW: getitem @_inplace_enabled(default=False) def convert_reference_time( self, @@ -6650,6 +6668,7 @@ def convert_reference_time( ) d.Units = units0 + # REVIEW: getitem: `convert_reference_time`: set 'asanyarray' # 'cf_rt2dt' its own call to 'cf_asanyarray', so we can set # 'asanyarray=False'. dx = d.to_dask_array(asanyarray=False) @@ -6677,7 +6696,6 @@ def get_data(self, default=ValueError(), _units=None, _fill_value=None): """ return self - # REVIEW: getitem def get_deterministic_name(self): """Get the deterministic name for the data. @@ -6730,13 +6748,16 @@ def get_deterministic_name(self): raise ValueError() units = self._Units + + # REVIEW: getitem: `get_deterministic_name`: set 'asanyarray' + # The dask graph is never going to be computed, so we can set + # 'asanyarray=False'. return tokenize( self.to_dask_array(asanyarray=None).name, units.formatted(definition=True, names=True), units._canonical_calendar, ) - # REVIEW: getitem def get_filenames(self): """The names of files containing parts of the data array. @@ -6797,6 +6818,10 @@ def get_filenames(self): """ out = set() + + # REVIEW: getitem: `get_filenames`: set 'asanyarray' + # The dask graph is never going to be computed, so we can set + # 'asanyarray=False'. for a in self.todict(asanyarray=False).values(): try: out.update(a.get_filenames()) @@ -6900,7 +6925,6 @@ def set_calendar(self, calendar): """ self.Units = Units(self.get_units(default=None), calendar) - # REVIEW: getitem def add_file_location(self, location): """Add a new file location in-place. @@ -6931,6 +6955,10 @@ def add_file_location(self, location): location = abspath(location).rstrip(sep) updated = False + + # REVIEW: getitem: `add_file_location`: set 'asanyarray' + # The dask graph is never going to be computed, so we can set + # 'asanyarray=False'. dsk = self.todict(asanyarray=False) for key, a in dsk.items(): try: @@ -8329,7 +8357,6 @@ def uncompress(self, inplace=False): return d - # REVIEW: getitem def unique(self, split_every=None): """The unique elements of the data. @@ -8373,6 +8400,7 @@ def unique(self, split_every=None): # in the result. d.soften_mask() + # REVIEW: getitem: `unique`: set 'asanyarray' # The applicable chunk function will have its own call to # 'cf_asanyarray', so we can set 'asanyarray=False'. dx = d.to_dask_array(asanyarray=False) @@ -9086,7 +9114,6 @@ def halo( return d - # REVIEW: getitem def harden_mask(self): """Force the mask to hard. @@ -9115,6 +9142,7 @@ def harden_mask(self): [1 -- 3] """ + # REVIEW: getitem: `hardmask`: set 'asanyarray' # 'cf_harden_mask' has its own call to 'cf_asanyarray', so we # can set 'asanyarray=False'. dx = self.to_dask_array(asanyarray=False) @@ -9209,7 +9237,6 @@ def has_units(self): """ return hasattr(self.Units, "units") - # REVIEW: getitem def soften_mask(self): """Force the mask to soft. 
@@ -9238,6 +9265,7 @@ def soften_mask(self): [ 1 999 3] """ + # REVIEW: getitem: `soften_mask`: set 'asanyarray' # 'cf_soften_mask' has its own call to 'cf_asanyarray', so we # can set 'asanyarray=False'. dx = self.to_dask_array(asanyarray=False) @@ -9245,7 +9273,6 @@ def soften_mask(self): self._set_dask(dx, clear=_NONE) self.hardmask = False - # REVIEW: getitem def file_locations(self): """The locations of files containing parts of the data. @@ -9270,6 +9297,9 @@ def file_locations(self): """ out = set() + # REVIEW: getitem: `file_locations`: set 'asanyarray' + # The dask graph is never going to be computed, so we can set + # 'asanyarray=False'. for key, a in self.todict(asanyarray=False).items(): try: out.update(a.file_locations()) @@ -9279,7 +9309,6 @@ def file_locations(self): return out - # REVIEW: getitem @_inplace_enabled(default=False) def filled(self, fill_value=None, inplace=False): """Replace masked elements with a fill value. @@ -9329,6 +9358,7 @@ def filled(self, fill_value=None, inplace=False): f"data type {d.dtype.str!r}" ) + # REVIEW: getitem: `filled`: set 'asanyarray' # 'cf_filled' has its own call to 'cf_asanyarray', so we can # set 'asanyarray=False'. dx = d.to_dask_array(asanyarray=False) @@ -9960,7 +9990,7 @@ def override_calendar(self, calendar, inplace=False, i=False): d._Units = Units(d.Units._units, calendar) return d - # REVIEW: getitem + # REVIEW: getitem: `to_dask_array`: new keyword 'asanyarray' def to_dask_array(self, apply_mask_hardness=False, asanyarray=None): """Convert the data to a `dask` array. @@ -9986,13 +10016,15 @@ def to_dask_array(self, apply_mask_hardness=False, asanyarray=None): array to be that given by the `hardmask` attribute. asanyarray: `bool` or `None`, optional - If True then add a final operation to the Dask graph - that converts chunks to `numpy` arrays, but only if a - chunk's array object has an `__asanyarray__` attribute - that is also `True`. If False then do not do this. If - `None`, the default, then the final operation is added - if the `Data` object's `__asanyarray__` attribute is - `True`. + If True then add a final operation to the returned + Dask graph that converts chunks to `numpy` arrays, but + only if a chunk's array object has an `__asanyarray__` + attribute that is also `True`. If False then do not do + this. If `None`, the default, then the final operation + is added if the `Data` object's `__asanyarray__` + attribute is `True`. I.e. by default `to_dask_array` + always returns a computable Dask graph, although it + may have a extra final layer that is not needed. .. note:: Such a final operation is included in the returned Dask array, but is not included in @@ -10034,7 +10066,7 @@ def to_dask_array(self, apply_mask_hardness=False, asanyarray=None): dx = self._custom["dask"] # Note: The mask hardness functions have their own calls - # to cf_asanyarray, so we can don't need worry about + # to 'cf_asanyarray', so we can don't need worry about # setting another one. else: if asanyarray is None: @@ -10303,6 +10335,10 @@ def del_file_location(self, location): location = abspath(location).rstrip(sep) updated = False + + # REVIEW: getitem: `del_file_location`: set 'asanyarray' + # The dask graph is never going to be computed, so we can set + # 'asanyarray=False'. 
dsk = self.todict(asanyarray=False) for key, a in dsk.items(): try: @@ -11275,7 +11311,6 @@ def fits_in_memory(self): """ return self.size * (self.dtype.itemsize + 1) <= free_memory() - # REVIEW: getitem @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") @_inplace_enabled(default=False) @_manage_log_level_via_verbosity @@ -11486,6 +11521,7 @@ def where( # Missing values could be affected, so make sure that the mask # hardness has been applied. # + # REVIEW: getitem: `where`: set 'asanyarray' # 'cf_where' has its own calls to 'cf_asanyarray', so we can # set 'asanyarray=False'. dx = d.to_dask_array(apply_mask_hardness=True, asanyarray=False) @@ -11503,6 +11539,7 @@ def where( condition = type(self).asdata(condition) condition = where_broadcastable(d, condition, "condition") + # REVIEW: getitem: `where`: set 'asanyarray' # 'cf_where' has its own calls to 'cf_asanyarray', so we can # set 'asanyarray=False'. condition = condition.to_dask_array(asanyarray=False) @@ -11548,6 +11585,7 @@ def where( x, y = xy + # REVIEW: getitem: `where`: 'da.asanyarray' is no longer required # Apply the where operation dx = da.core.elemwise(cf_where, dx, condition, x, y, d.hardmask) d._set_dask(dx) From 7987bde52ca25edfcdb5fd6d44d1eb29da0e3099 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Sun, 21 Apr 2024 11:15:32 +0100 Subject: [PATCH 088/134] dev --- cf/data/array/cfah5netcdfarray.py | 2 +- cf/data/array/cfanetcdf4array.py | 2 +- cf/data/array/fullarray.py | 8 +-- cf/data/array/h5netcdfarray.py | 4 +- cf/data/array/mixin/activestoragemixin.py | 2 +- cf/data/array/mixin/arraymixin.py | 2 +- cf/data/array/mixin/cfamixin.py | 6 +- cf/data/array/mixin/indexmixin.py | 2 +- cf/data/array/netcdf4array.py | 4 +- cf/data/array/netcdfarray.py | 2 +- cf/data/array/umarray.py | 6 +- cf/data/creation.py | 8 +-- cf/data/dask_regrid.py | 4 +- cf/data/dask_utils.py | 24 +++---- cf/data/data.py | 77 +++++++++++------------ cf/data/utils.py | 6 +- 16 files changed, 77 insertions(+), 82 deletions(-) diff --git a/cf/data/array/cfah5netcdfarray.py b/cf/data/array/cfah5netcdfarray.py index 9a3244f550..edcdfceeda 100644 --- a/cf/data/array/cfah5netcdfarray.py +++ b/cf/data/array/cfah5netcdfarray.py @@ -1,8 +1,8 @@ +# REVIEW: h5: `CFAH5netcdfArray`: New class for accessing CFA with `h5netcdf` from .h5netcdfarray import H5netcdfArray from .mixin import CFAMixin -# REVIEW: h5 class CFAH5netcdfArray(CFAMixin, H5netcdfArray): """A CFA-netCDF array accessed with `h5netcdf` diff --git a/cf/data/array/cfanetcdf4array.py b/cf/data/array/cfanetcdf4array.py index 2b4edb91c8..73d926b44b 100644 --- a/cf/data/array/cfanetcdf4array.py +++ b/cf/data/array/cfanetcdf4array.py @@ -1,8 +1,8 @@ +# REVIEW: h5: `CFAnetCDF4Array`: New class for accessing CFA with `netCDF4` from .mixin import CFAMixin from .netcdf4array import NetCDF4Array -# REVIEW: h5 class CFANetCDF4Array(CFAMixin, NetCDF4Array): """A CFA-netCDF array accessed with `netCDF4`. 
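A recurring pattern in the data.py changes above: properties and methods that only inspect graph metadata (`chunks`, `dtype`, `shape`, `nbytes`, `npartitions`, and so on) now request the Dask array with `asanyarray=False`, because their graphs are never computed and the extra `cf_asanyarray` layer would be wasted work. A rough usage sketch of the two modes, assuming the API as it stands after this series:

    import cf

    d = cf.Data([1, 2, 3, 4, 5], chunks=3)

    # Metadata-only inspection: skip the final conversion layer.
    dx = d.to_dask_array(asanyarray=False)
    print(dx.chunks, dx.npartitions, dx.dtype)

    # Default behaviour: always a computable graph, possibly with an
    # extra (harmless) conversion layer appended.
    print(d.to_dask_array().compute())
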
diff --git a/cf/data/array/fullarray.py b/cf/data/array/fullarray.py index d73a5aaf04..c4ab489937 100644 --- a/cf/data/array/fullarray.py +++ b/cf/data/array/fullarray.py @@ -16,7 +16,7 @@ class FullArray(IndexMixin, Array): """ - # REVIEW: h5: Replace "units/calendar" API with "attributes" + # REVIEW: h5: `__init__`: Replace units/calendar API with 'attributes' def __init__( self, fill_value=None, @@ -121,7 +121,7 @@ def __str__(self): return f"Filled with {fill_value!r}" - # REVIEW: getitem + # REVIEW: getitem: `_get_array`: new method to convert subspace to numpy array def _get_array(self, index=None): """Returns the full array. @@ -141,8 +141,6 @@ def _get_array(self, index=None): The subspace. """ - # REVIEW: getitem - if index is None: shape = self.shape else: @@ -160,7 +158,7 @@ def _get_array(self, index=None): return array - # REVIEW: getitem + # REVIEW: getitem: `array`: New property to convert subspace to numpy array @property def array(self): """Return an independent numpy array containing the data. diff --git a/cf/data/array/h5netcdfarray.py b/cf/data/array/h5netcdfarray.py index a52f65f814..96d220fb6a 100644 --- a/cf/data/array/h5netcdfarray.py +++ b/cf/data/array/h5netcdfarray.py @@ -1,3 +1,4 @@ +# REVIEW: h5: `H5netcdfArray`: New class to access netCDF with `h5netcdf` import cfdm from ...mixin_container import Container @@ -5,7 +6,6 @@ from .mixin import ActiveStorageMixin, ArrayMixin, FileArrayMixin, IndexMixin -# REVIEW: h5 class H5netcdfArray( ActiveStorageMixin, IndexMixin, @@ -49,7 +49,7 @@ def _lock(self): """ return netcdf_lock - # REVIEW: getitem + # REVIEW: getitem: `_get_array`: new method to convert subspace to numpy array def _get_array(self, index=None): """Returns a subspace of the dataset variable. diff --git a/cf/data/array/mixin/activestoragemixin.py b/cf/data/array/mixin/activestoragemixin.py index abad445b76..ab41ef7fb9 100644 --- a/cf/data/array/mixin/activestoragemixin.py +++ b/cf/data/array/mixin/activestoragemixin.py @@ -1,4 +1,4 @@ -# REVIEW: active +# REVIEW: active: `ActiveStorageMixin`: new mixin class `ActiveStorageMixin` try: from activestorage import Active except ModuleNotFoundError: diff --git a/cf/data/array/mixin/arraymixin.py b/cf/data/array/mixin/arraymixin.py index f17687bdcf..b68c596668 100644 --- a/cf/data/array/mixin/arraymixin.py +++ b/cf/data/array/mixin/arraymixin.py @@ -18,7 +18,7 @@ def __array_function__(self, func, types, args, kwargs): """ return NotImplemented - # REVIEW: active + # REVIEW: active: `_meta`: Moved to here from `FileArrayMixin` @property def _meta(self): """Normalize the array to an appropriate Dask meta object. diff --git a/cf/data/array/mixin/cfamixin.py b/cf/data/array/mixin/cfamixin.py index c3a54ef8fa..f0cf64e972 100644 --- a/cf/data/array/mixin/cfamixin.py +++ b/cf/data/array/mixin/cfamixin.py @@ -36,7 +36,7 @@ def __new__(cls, *args, **kwargs): } return instance - # REVIEW: h5: Replace "units/calendar" API with "attributes" + # REVIEW: h5: `__init__`: Replace units/calendar API with 'attributes' def __init__( self, filename=None, @@ -225,7 +225,7 @@ def __init__( "substitutions", substitutions.copy(), copy=False ) - # REVIEW: h5 + # REVIEW: h5: `_parse_cfa`: Refactoring of code that used to be in `__init__` def _parse_cfa(self, x, term, substitutions): """Parse the CFA aggregation instructions. 
@@ -466,7 +466,7 @@ def get_fragment_shape(self): """ return self._get_component("fragment_shape") - # REVIEW: h5 + # REVIEW: h5: `get_storage_options` def get_storage_options(self): """Return `s3fs.S3FileSystem` options for accessing S3 fragment files. diff --git a/cf/data/array/mixin/indexmixin.py b/cf/data/array/mixin/indexmixin.py index 5799782b00..92306be0be 100644 --- a/cf/data/array/mixin/indexmixin.py +++ b/cf/data/array/mixin/indexmixin.py @@ -7,7 +7,7 @@ from ....functions import indices_shape, parse_indices -# REVIEW: getitem +# REVIEW: getitem: `IndexMixin`: new mixin class `IndexMixin` class IndexMixin: """Mixin class for lazy indexing of a data array. diff --git a/cf/data/array/netcdf4array.py b/cf/data/array/netcdf4array.py index 4c2ff57417..011cd28328 100644 --- a/cf/data/array/netcdf4array.py +++ b/cf/data/array/netcdf4array.py @@ -1,3 +1,4 @@ +# REVIEW: h5: `NetCDF4Array`: New class to access netCDF with `netCDF4`, replaces `NetCDFArray` import cfdm from ...mixin_container import Container @@ -5,7 +6,6 @@ from .mixin import ActiveStorageMixin, ArrayMixin, FileArrayMixin, IndexMixin -# REVIEW: h5 class NetCDF4Array( ActiveStorageMixin, IndexMixin, @@ -47,7 +47,7 @@ def _lock(self): """ return netcdf_lock - # REVIEW: getitem + # REVIEW: getitem: `_get_array`: new method to convert subspace to numpy array def _get_array(self, index=None): """Returns a subspace of the dataset variable. diff --git a/cf/data/array/netcdfarray.py b/cf/data/array/netcdfarray.py index e2b1d850e7..5c382bf123 100644 --- a/cf/data/array/netcdfarray.py +++ b/cf/data/array/netcdfarray.py @@ -1,4 +1,4 @@ -# REVIEW: h5 +# REVIEW: h5: `NetCDFArray`: Replaced by `NetCDF4Array` class NetCDFArray: """A netCDF array accessed with `netCDF4`. diff --git a/cf/data/array/umarray.py b/cf/data/array/umarray.py index 07723fd8b4..adaac9528f 100644 --- a/cf/data/array/umarray.py +++ b/cf/data/array/umarray.py @@ -7,7 +7,7 @@ from .mixin import FileArrayMixin, IndexMixin -# REVIEW: h5: Replace "units/calendar" API with "attributes" +# REVIEW: h5: `UMArray`: Replace units/calendar API with 'attributes' class UMArray( IndexMixin, FileArrayMixin, cfdm.data.mixin.FileArrayMixin, Array ): @@ -171,7 +171,7 @@ def __init__( # By default, close the UM file after data array access self._set_component("close", True, copy=False) - # REVIEW: getitem + # REVIEW: getitem: `_get_array`: new method to convert subspace to numpy array def _get_array(self, index=None): """Returns a subspace of the dataset variable. @@ -297,7 +297,7 @@ def _set_FillValue(self, int_hdr, real_hdr, attributes): `None """ - # REVIEW: getitem + # REVIEW: getitem: `_set_FillValue` if "FillValue" in attributes: return diff --git a/cf/data/creation.py b/cf/data/creation.py index 71bd000d2a..773d396c3a 100644 --- a/cf/data/creation.py +++ b/cf/data/creation.py @@ -59,7 +59,7 @@ def to_dask(array, chunks, **from_array_options): if is_dask_collection(array): return array - # REVIEW: getitem + # REVIEW: getitem: `to_dask`: set 'asanyarray' if hasattr(array, "to_dask_array"): try: return array.to_dask_array(chunks=chunks) @@ -82,10 +82,8 @@ def to_dask(array, chunks, **from_array_options): array = np.asanyarray(array) kwargs = from_array_options - # REVIEW: active: - # REVIEW: getitem: The file lock has been push onto an `Array` - # object (in its `_get_array` method), rather - # than being set on the Dask array itself. 
+ # REVIEW: active: `to_dask`: + # REVIEW: getitem: `to_dask`: The file lock is now on the `Array` object (in its `_get_array` method), rather than being set on the Dask array itself. kwargs.setdefault("meta", getattr(array, "_meta", None)) try: diff --git a/cf/data/dask_regrid.py b/cf/data/dask_regrid.py index a09826450d..9db487a019 100644 --- a/cf/data/dask_regrid.py +++ b/cf/data/dask_regrid.py @@ -1,7 +1,7 @@ """Regridding functions used within a dask graph.""" import numpy as np -# REVIEW: getitem +# REVIEW: getitem: `regrid.py` from .dask_utils import cf_asanyarray @@ -176,7 +176,7 @@ def regrid( """ weights, dst_mask = weights_dst_mask - # REVIEW: getitem + # REVIEW: getitem: `regrid`: convert a to a usable array a = cf_asanyarray(a) if dst_mask is not None: dst_mask = cf_asanyarray(dst_mask) diff --git a/cf/data/dask_utils.py b/cf/data/dask_utils.py index c5b110c07a..290eb8003f 100644 --- a/cf/data/dask_utils.py +++ b/cf/data/dask_utils.py @@ -126,7 +126,7 @@ def cf_contains(a, value): value. """ - # REVIEW: getitem: convert a to a usable array + # REVIEW: getitem: `cf_contains`: convert a to a usable array a = cf_asanyarray(a) value = cf_asanyarray(value) return np.array(value in a).reshape((1,) * a.ndim) @@ -162,7 +162,7 @@ def cf_convolve1d(a, window=None, axis=-1, origin=0): Convolved float array with same shape as input. """ - # REVIEW: getitem: convert a to a usable array + # REVIEW: getitem: `cf_convolve1d`: convert a to a usable array a = cf_asanyarray(a) # Cast to float to ensure that NaNs can be stored @@ -206,7 +206,7 @@ def cf_harden_mask(a): The array with hardened mask. """ - # REVIEW: getitem: convert a to a usable array + # REVIEW: getitem: `cf_harden_mask`: convert a to a usable array a = cf_asanyarray(a) if np.ma.isMA(a): try: @@ -278,7 +278,7 @@ def cf_percentile(a, q, axis, method, keepdims=False, mtol=1): """ from math import prod - # REVIEW: getitem: convert a to a usable array + # REVIEW: getitem: `cf_percentile`: convert a to a usable array a = cf_asanyarray(a) if np.ma.isMA(a) and not np.ma.is_masked(a): @@ -374,7 +374,7 @@ def cf_soften_mask(a): The array with softened mask. """ - # REVIEW: getitem: convert a to a usable array + # REVIEW: getitem: `cf_soften_mask`: convert a to a usable array a = cf_asanyarray(a) if np.ma.isMA(a): @@ -432,7 +432,7 @@ def cf_where(array, condition, x, y, hardmask): elsewhere. """ - # REVIEW: getitem: convert array, condition, x, y to usable arrays + # REVIEW: getitem: `cf_where`: convert array, condition, x, y to usable arrays array = cf_asanyarray(array) condition = cf_asanyarray(condition) if x is not None: @@ -569,7 +569,7 @@ def cf_rt2dt(a, units): cftime.DatetimeGregorian(2000, 1, 2, 0, 0, 0, 0, has_year_zero=False)] """ - # REVIEW: getitem: convert a to a usable array + # REVIEW: getitem: `cf_rt2dt`: convert a to a usable array a = cf_asanyarray(a) if not units.iscalendartime: return rt2dt(a, units_in=units) @@ -625,7 +625,7 @@ def cf_dt2rt(a, units): [365 366] """ - # REVIEW: getitem: convert a to a usable array + # REVIEW: getitem: `cf_dt2rt`: convert a to a usable array a = cf_asanyarray(a) return dt2rt(a, units_out=units, units_in=None) @@ -667,7 +667,7 @@ def cf_units(a, from_units, to_units): [1000. 2000.] """ - # REVIEW: getitem: convert a to a usable array + # REVIEW: getitem: `cf_units`: convert a to a usable array a = cf_asanyarray(a) return Units.conform( a, from_units=from_units, to_units=to_units, inplace=False @@ -692,7 +692,7 @@ def cf_is_masked(a): values. 
""" - # REVIEW: getitem: convert a to a usable array + # REVIEW: getitem: `cf_is_masked`: convert a to a usable array a = cf_asanyarray(a) out = np.ma.is_masked(a) return np.array(out).reshape((1,) * a.ndim) @@ -726,7 +726,7 @@ def cf_filled(a, fill_value=None): [[-999 2 3]] """ - # REVIEW: getitem: convert a to a usable array + # REVIEW: getitem: `cf_filled`: convert a to a usable array a = cf_asanyarray(a) return np.ma.filled(a, fill_value=fill_value) @@ -749,7 +749,7 @@ def cf_asanyarray(a): The converted array, or the input array unchanged. """ - # REVIEW: getitem: convert a to a usable array + # REVIEW: getitem: `cf_asanyarray`: convert a to a usable array if getattr(a, "__asanyarray__", False): return np.asanyarray(a) diff --git a/cf/data/data.py b/cf/data/data.py index 73cb804a0b..dba08e8f4e 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -44,7 +44,7 @@ from .collapse import Collapse from .creation import generate_axis_identifiers, to_dask -# REVIEW: getitem: import cf_asanyarray, cf_filled, cf_is_masked +# REVIEW: getitem: `data.py`: import cf_asanyarray, cf_filled, cf_is_masked from .dask_utils import ( _da_ma_allclose, cf_asanyarray, @@ -103,7 +103,7 @@ _ARRAY = 1 # = 0b0001 _CACHE = 2 # = 0b0010 _CFA = 4 # = 0b0100 -# REVIEW: active: Set the active storage status bit mask +# REVIEW: active: `data.py`: Set the active storage status bit mask _ACTIVE = 8 # = 0b1000 _ALL = 15 # = 0b1111 @@ -377,7 +377,7 @@ def __init__( source=source, _use_array=_use_array and array is not None ) if _use_array: - # REVIEW: getitem: set new asanyarray keyword in to_dask_array + # REVIEW: getitem: `__init__`: set 'asanyarray' try: array = source.to_dask_array(asanyarray=False) except (AttributeError, TypeError): @@ -462,7 +462,7 @@ def __init__( except AttributeError: pass - # REVIEW: active: set the active storage status to True only for Array subclasses + # REVIEW: active: `__init__`: set the active storage status to True for Array subclasses if self._is_abstract_Array_subclass(array): # Save the input array in case it's useful later. For # compressed input arrays this will contain extra @@ -484,7 +484,7 @@ def __init__( is_dask = is_dask_collection(array) custom["deterministic"] = not is_dask - # REVIEW: getitem: Set whether or not to call `np.asanyarray` on chunks to convert them to numpy arrays. + # REVIEW: getitem: `__init__`: Set whether or not to call `np.asanyarray` on chunks to convert them to numpy arrays. # Set whether or not to call `np.asanyarray` on chunks to # convert them to numpy arrays. if is_dask: @@ -518,7 +518,7 @@ def __init__( # Reset the units self._Units = units - # REVIEW: getitem: set new asanyarray keyword in _set_dask + # REVIEW: getitem: `__init__`: set 'asanyarray' # Store the dask array self._set_dask(dx, clear=_NONE, asanyarray=None) @@ -794,7 +794,9 @@ def __len__(self): TypeError: len() of unsized object """ - # REVIEW: getitem: set new asanyarray keyword in to_dask_array + # REVIEW: getitem: `__len__`: set 'asanyarray' + # The dask graph is never going to be computed, so we can set + # 'asanyarray=False'. 
dx = self.to_dask_array(asanyarray=False) if math.isnan(dx.size): logger.debug("Computing data len: Performance may be degraded") @@ -901,7 +903,7 @@ def __getitem__(self, indices): # ------------------------------------------------------------ # Roll axes with cyclic slices # ------------------------------------------------------------ - # REVIEW: getitem: TODO + # REVIEW: getitem: `__getitem__`: set 'asanyarray' if roll: # For example, if slice(-2, 3) has been requested on a # cyclic axis, then we roll that axis by two points and @@ -961,8 +963,8 @@ def __getitem__(self, indices): "Non-orthogonal indexing has not yet been implemented" ) - # REVIEW: active - # REVIEW: getitem + # REVIEW: active `__getitem__` + # REVIEW: getitem: `__getitem__` # ------------------------------------------------------------ # Set the subspaced dask array # @@ -1187,6 +1189,7 @@ def __setitem__(self, indices, value): return + # REVIEW: getitem: `__asanyarray__`: new property `__asanyarray__` @property def __asanyarray__(self): """Whether the chunks need conversion to a `numpy` array. @@ -1203,7 +1206,6 @@ def __asanyarray__(self): `False` then do not do this. """ - # REVIEW: getitem: New __asanyarray__ property. return self._custom.get("__asanyarray__", True) @property @@ -1422,12 +1424,12 @@ def _clear_after_dask_update(self, clear=_ALL): # Set the CFA write status to False self._cfa_del_write() - # REVIEW: active: update the active storage status + # REVIEW: active: `_clear_after_dask_update`: update active storage status if clear & _ACTIVE: # Set active storage to False self._del_active_storage() - # REVIEW: getitem: Include new asanyarray keyword to _set_dask + # REVIEW: getitem: `_set_dask`: new keyword 'asanyarray' def _set_dask(self, dx, copy=False, clear=_ALL, asanyarray=False): """Set the dask array. @@ -1491,7 +1493,7 @@ def _set_dask(self, dx, copy=False, clear=_ALL, asanyarray=False): custom = self._custom custom["dask"] = dx - # REVIEW: getitem: Set __asanyarray__ from within _set_dask. + # REVIEW: getitem: `_set_dask`: set '__asanyarray__' if asanyarray is not None: custom["__asanyarray__"] = bool(asanyarray) @@ -1551,7 +1553,7 @@ def _del_dask(self, default=ValueError(), clear=_ALL): self._clear_after_dask_update(clear) return out - # REVIEW: active: Set the active storage status to False + # REVIEW: active: `_del_active_storage`: new method `_del_active_storage` def _del_active_storage(self): """Set the active storage reduction status to False. @@ -1637,7 +1639,7 @@ def _is_abstract_Array_subclass(self, array): """ return isinstance(array, cfdm.Array) - # REVIEW: active: set the active storage status + # REVIEW: active: `_set_active_storage`: new method `_set_active_storage` def _set_active_storage(self, value): """Set the active storage reduction status. @@ -2621,7 +2623,7 @@ def percentile( else: axes = tuple(sorted(d._parse_axes(axes))) - # REVIEW: getitem: 'cf_percentile' has its own call to 'cf_asanyarray', so we can set 'asanyarray=False'. + # REVIEW: getitem: `percentile`: set 'asanyarray' # 'cf_percentile' has its own call to 'cf_asanyarray', so we # can set 'asanyarray=False'. 
dx = d.to_dask_array(asanyarray=False) @@ -3122,7 +3124,7 @@ def convolution_filter( # TODO: check that this is OK dx = d.to_dask_array() - # REVIEW: getitem: rectify comment + # REVIEW: getitem: `percentile`: rectify comment # Cast to float to ensure that NaNs can be stored (so # map_overlap can correctly assign the halos) if dx.dtype != float: @@ -3310,9 +3312,7 @@ def rechunk( """ d = _inplace_enabled_define_and_cleanup(self) - # REVIEW: getitem: set asanyarray keyword for rechunk - # REVIEW: active: Do not change active storage status after a rechunk - + # REVIEW: getitem: `rechunk`: set 'asanyarray' # Dask rechunking is essentially a wrapper for __getitem__ # calls on the chunks, which means that we can use the same # 'asanyarray' and 'clear' keywords to `_set_dask` as are used @@ -3320,6 +3320,7 @@ def rechunk( dx = d.to_dask_array(asanyarray=False) dx = dx.rechunk(chunks, threshold, block_size_limit, balance) + # REVIEW: active: `rechunk`: Do not change active storage status after a rechunk d._set_dask( dx, clear=_ALL ^ _ARRAY ^ _CACHE ^ _ACTIVE, asanyarray=True ) @@ -3372,8 +3373,8 @@ def _asdatetime(self, inplace=False): f"Can't convert {units!r} values to date-time objects" ) - # REVIEW: getitem: set asanyarray keyword for _asdatetime if not d._isdatetime(): + # REVIEW: getitem: `_asdatetime`: set 'asanyarray' # 'cf_rt2dt' has its own call to 'cf_asanyarray', so we # can set 'asanyarray=False'. dx = d.to_dask_array(asanyarray=False) @@ -3430,8 +3431,8 @@ def _asreftime(self, inplace=False): f"Can't convert {units!r} values to numeric reference times" ) - # REVIEW: getitem: set asanyarray keyword for _asreftime if d._isdatetime(): + # REVIEW: getitem: `_asreftime`: set 'asanyarray' # 'cf_dt2rt' has its own call to 'cf_asanyarray', so we # can set 'asanyarray=False'. dx = d.to_dask_array(asanyarray=False) @@ -4044,7 +4045,7 @@ def _regrid( f"the shape of the regrid operator: {operator.src_shape}" ) - # REVIEW: getitem: set asanyarray keyword for _regrid + # REVIEW: getitem: `_regrid`: set 'asanyarray' # 'regrid' has its own calls to 'cf_asanyarray', so we can set # 'asanyarray=False'. dx = self.to_dask_array(asanyarray=False) @@ -4998,7 +4999,7 @@ def chunks(self): # ---------------------------------------------------------------- # Attributes # ---------------------------------------------------------------- - # REVIEW: active: return the active storage status + # REVIEW: active: `active_storage`: new property `active_storage` @property def active_storage(self): """Whether or not active storage reductions are possible. @@ -5075,7 +5076,6 @@ def Units(self, value): cf_func = partial(cf_units, from_units=old_units, to_units=value) - # REVIEW: getitem: `Units`: set 'asanyarray' # 'cf_units' has its own call to 'cf_asanyarray', so we can # set 'asanyarray=False'. @@ -5148,7 +5148,7 @@ def dtype(self): """ # REVIEW: getitem: `dtype`: set 'asanyarray' # The dask graph is never going to be computed, so we can set - # 'asanyarray=False'. + # 'asanyarray=False'. dx = self.to_dask_array(asanyarray=False) return dx.dtype @@ -5310,7 +5310,7 @@ def nbytes(self): """ # REVIEW: getitem: `nbytes`: set 'asanyarray' # The dask graph is never going to be computed, so we can set - # 'asanyarray=False'. + # 'asanyarray=False'. 
dx = self.to_dask_array(asanyarray=False) if math.isnan(dx.size): logger.debug("Computing data nbytes: Performance may be degraded") @@ -5347,7 +5347,7 @@ def ndim(self): """ # REVIEW: getitem: `ndim`: set 'asanyarray' # The dask graph is never going to be computed, so we can set - # 'asanyarray=False'. + # 'asanyarray=False'. dx = self.to_dask_array(asanyarray=False) return dx.ndim @@ -5372,7 +5372,7 @@ def npartitions(self): """ # REVIEW: getitem: `npartitions`: set 'asanyarray' # The dask graph is never going to be computed, so we can set - # 'asanyarray=False'. + # 'asanyarray=False'. return self.to_dask_array(asanyarray=False).npartitions @property @@ -5396,7 +5396,7 @@ def numblocks(self): """ # REVIEW: getitem: `numblocks` set 'asanyarray' # The dask graph is never going to be computed, so we can set - # 'asanyarray=False'. + # 'asanyarray=False'. return self.to_dask_array(asanyarray=False).numblocks @property @@ -5471,7 +5471,7 @@ def size(self): """ # REVIEW: getitem: `size` set 'asanyarray' # The dask graph is never going to be computed, so we can set - # 'asanyarray=False'. + # 'asanyarray=False'. dx = self.to_dask_array(asanyarray=False) size = dx.size if math.isnan(size): @@ -6748,7 +6748,7 @@ def get_deterministic_name(self): raise ValueError() units = self._Units - + # REVIEW: getitem: `get_deterministic_name`: set 'asanyarray' # The dask graph is never going to be computed, so we can set # 'asanyarray=False'. @@ -6818,7 +6818,7 @@ def get_filenames(self): """ out = set() - + # REVIEW: getitem: `get_filenames`: set 'asanyarray' # The dask graph is never going to be computed, so we can set # 'asanyarray=False'. @@ -8400,7 +8400,7 @@ def unique(self, split_every=None): # in the result. d.soften_mask() - # REVIEW: getitem: `unique`: set 'asanyarray' + # REVIEW: getitem: `unique`: set 'asanyarray' # The applicable chunk function will have its own call to # 'cf_asanyarray', so we can set 'asanyarray=False'. dx = d.to_dask_array(asanyarray=False) @@ -10304,7 +10304,6 @@ def del_calendar(self, default=ValueError()): self.override_calendar(None, inplace=True) return calendar - # REVIEW: getitem def del_file_location(self, location): """Remove a file location in-place. @@ -10338,7 +10337,7 @@ def del_file_location(self, location): # REVIEW: getitem: `del_file_location`: set 'asanyarray' # The dask graph is never going to be computed, so we can set - # 'asanyarray=False'. + # 'asanyarray=False'. dsk = self.todict(asanyarray=False) for key, a in dsk.items(): try: @@ -11775,7 +11774,6 @@ def cosh(self, inplace=False): return d - # REVIEW: getitem def cull_graph(self): """Remove unnecessary tasks from the dask graph in-place. 
@@ -11813,6 +11811,7 @@ def cull_graph(self): ('array-21ea057f160746a3d3f0943bba945460', 0): array([1, 2, 3])} """ + # REVIEW: getitem: `cull_graph`: set 'asanyarray' dx = self.to_dask_array(asanyarray=False) dsk, _ = cull(dx.dask, dx.__dask_keys__()) dx = da.Array(dsk, name=dx.name, chunks=dx.chunks, dtype=dx.dtype) @@ -12084,7 +12083,7 @@ def tan(self, inplace=False, i=False): return d - # REVIEW: getitem + # REVIEW: getitem: `todict`: new keywords 'apply_mask_hardness', 'asanyarray' def todict( self, optimize_graph=True, apply_mask_hardness=False, asanyarray=None ): diff --git a/cf/data/utils.py b/cf/data/utils.py index 7b6e3494b0..c9e50bfb47 100644 --- a/cf/data/utils.py +++ b/cf/data/utils.py @@ -862,7 +862,7 @@ def collapse( "keepdims": keepdims, "split_every": split_every, "mtol": mtol, - # REVIEW: active: pass the active storage status onto the collapse functions + # REVIEW: active: `collapse` :pass the active storage status onto the collapse functions "active_storage": d.active_storage, } @@ -873,7 +873,7 @@ def collapse( if ddof is not None: kwargs["ddof"] = ddof - # REVIEW: getitem + # REVIEW: getitem: `collapse`: set 'asanyarray' # The applicable chunk function will have its own call to # 'cf_asanyarray', so we can set 'asanyarray=False'. Also, setting # asanyarray=False will ensure that any active storage operations @@ -993,7 +993,7 @@ def parse_weights(d, weights, axis=None): axes = d._axes Data = type(d) for key, value in weights.items(): - # REVIEW: active + # REVIEW: active: `parse_weights` value = Data.asdata(value) # Make sure axes are in ascending order From 18b3e09031dcebc0becc4c798c1c30e5f7d60b66 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 22 Apr 2024 17:29:00 +0100 Subject: [PATCH 089/134] dev --- cf/cellmethod.py | 1 - cf/cfimplementation.py | 7 ++-- cf/constants.py | 2 +- cf/data/array/__init__.py | 9 ++++- cf/data/array/fullarray.py | 2 +- cf/data/array/mixin/__init__.py | 4 +- cf/data/array/mixin/cfamixin.py | 4 +- cf/data/array/umarray.py | 9 +++-- cf/data/collapse/__init__.py | 2 +- cf/data/collapse/collapse.py | 16 +++++++- cf/data/collapse/collapse_active.py | 9 +++-- cf/data/collapse/dask_collapse.py | 15 ++++++- cf/data/creation.py | 2 +- cf/data/dask_regrid.py | 2 +- cf/data/dask_utils.py | 4 +- cf/data/data.py | 42 ++++++-------------- cf/data/fragment/__init__.py | 5 ++- cf/data/fragment/fullfragmentarray.py | 2 +- cf/data/fragment/h5netcdffragmentarray.py | 2 +- cf/data/fragment/mixin/fragmentarraymixin.py | 4 +- cf/data/fragment/netcdf4fragmentarray.py | 2 +- cf/data/fragment/netcdffragmentarray.py | 4 +- cf/data/fragment/umfragmentarray.py | 2 +- cf/data/utils.py | 4 +- cf/docstring/docstring.py | 21 ++++++++++ cf/domain.py | 1 - cf/field.py | 29 +++++++------- cf/functions.py | 24 +++++------ cf/mixin/fielddomain.py | 1 - cf/mixin/propertiesdata.py | 15 +------ cf/read_write/netcdf/netcdfread.py | 17 ++++---- cf/read_write/netcdf/netcdfwrite.py | 6 +-- cf/read_write/read.py | 33 +++++++-------- cf/read_write/um/umread.py | 2 +- cf/read_write/write.py | 3 +- cf/test/test_Data.py | 10 ++--- cf/test/test_Field.py | 3 +- cf/test/test_FullArray.py | 2 +- cf/test/test_NetCDF4Array.py | 16 ++++---- cf/test/test_active_storage.py | 2 +- cf/test/test_functions.py | 2 +- cf/test/test_read_write.py | 4 +- 42 files changed, 190 insertions(+), 156 deletions(-) diff --git a/cf/cellmethod.py b/cf/cellmethod.py index 1b257da76b..a1a2f3be15 100644 --- a/cf/cellmethod.py +++ b/cf/cellmethod.py @@ -53,7 +53,6 @@ class CellMethod(cfdm.CellMethod): """ - # 
REVIEW: h5 def __new__(cls, *args, **kwargs): """This must be overridden in subclasses. diff --git a/cf/cfimplementation.py b/cf/cfimplementation.py index 868c39d71f..b08a71bc62 100644 --- a/cf/cfimplementation.py +++ b/cf/cfimplementation.py @@ -26,6 +26,8 @@ TiePointIndex, ) from .data import Data + +# REVIEW: h5: `cfimplementation.py`: import `CFAH5netcdfArray`, `CFANetCDF4Array`, `H5netcdfArray`,`NetCDF4Array` from .data.array import ( BoundsFromNodesArray, CellConnectivityArray, @@ -43,7 +45,6 @@ from .functions import CF -# REVIEW: h5 class CFImplementation(cfdm.CFDMImplementation): """A container for the CF data model implementation for `cf`. @@ -115,7 +116,7 @@ def set_construct(self, parent, construct, axes=None, copy=True, **kwargs): parent, construct, axes=axes, copy=copy, **kwargs ) - # REVIEW: h5 + # REVIEW: h5: `initialise_CFANetCDF4Array`: new method to initialise `CFANetCDF4Array` def initialise_CFANetCDF4Array(self, **kwargs): """Return a `CFANetCDF4Array` instance. @@ -132,7 +133,7 @@ def initialise_CFANetCDF4Array(self, **kwargs): cls = self.get_class("CFANetCDF4Array") return cls(**kwargs) - # REVIEW: h5 + # REVIEW: h5: `initialise_CFAH5netcdfArray`: new method to initialise `CFAH5netcdfArray` def initialise_CFAH5netcdfArray(self, **kwargs): """Return a `CFAH5netcdfArray` instance. diff --git a/cf/constants.py b/cf/constants.py index 83554d5596..3828ae6e42 100644 --- a/cf/constants.py +++ b/cf/constants.py @@ -63,7 +63,7 @@ "LOG_LEVEL": logging.getLevelName(logging.getLogger().level), "BOUNDS_COMBINATION_MODE": "AND", "CHUNKSIZE": parse_bytes(_CHUNKSIZE), - # REVIEW: active + # REVIEW: active: `CONSTANTS`: new constants 'active_storage', 'active_storage_url' "active_storage": False, "active_storage_url": None, } diff --git a/cf/data/array/__init__.py b/cf/data/array/__init__.py index 20924e6433..c57a72081a 100644 --- a/cf/data/array/__init__.py +++ b/cf/data/array/__init__.py @@ -1,12 +1,19 @@ -# REVIEW: h5 from .boundsfromnodesarray import BoundsFromNodesArray from .cellconnectivityarray import CellConnectivityArray + +# REVIEW: h5: `__init__.py`: import `CFAH5netcdfArray` from .cfah5netcdfarray import CFAH5netcdfArray + +# REVIEW: h5: `__init__.py`: import `CFAH5netcdfArray` from .cfanetcdf4array import CFANetCDF4Array from .fullarray import FullArray from .gatheredarray import GatheredArray + +# REVIEW: h5: `__init__.py`: import `H5netcdfArray` from .h5netcdfarray import H5netcdfArray from .netcdfarray import NetCDFArray + +# REVIEW: h5: `__init__.py`: import `NetCDF4Array` from .netcdf4array import NetCDF4Array from .pointtopologyarray import PointTopologyArray from .raggedcontiguousarray import RaggedContiguousArray diff --git a/cf/data/array/fullarray.py b/cf/data/array/fullarray.py index c4ab489937..b52e6480d9 100644 --- a/cf/data/array/fullarray.py +++ b/cf/data/array/fullarray.py @@ -16,7 +16,7 @@ class FullArray(IndexMixin, Array): """ - # REVIEW: h5: `__init__`: Replace units/calendar API with 'attributes' + # REVIEW: h5: `__init__`: replace units/calendar API with attributes def __init__( self, fill_value=None, diff --git a/cf/data/array/mixin/__init__.py b/cf/data/array/mixin/__init__.py index 5b7fc33cf9..7db2cd73e8 100644 --- a/cf/data/array/mixin/__init__.py +++ b/cf/data/array/mixin/__init__.py @@ -1,9 +1,9 @@ -# REVIEW: active +# REVIEW: active: `__init__.py`: import `ActiveStorageMixin` from .activestoragemixin import ActiveStorageMixin from .arraymixin import ArrayMixin from .cfamixin import CFAMixin from .compressedarraymixin import 
CompressedArrayMixin from .filearraymixin import FileArrayMixin -# REVIEW: getitem +# REVIEW: getitem: `__init__.py`: import `IndexMixin` from .indexmixin import IndexMixin diff --git a/cf/data/array/mixin/cfamixin.py b/cf/data/array/mixin/cfamixin.py index f0cf64e972..9464c9a639 100644 --- a/cf/data/array/mixin/cfamixin.py +++ b/cf/data/array/mixin/cfamixin.py @@ -36,7 +36,7 @@ def __new__(cls, *args, **kwargs): } return instance - # REVIEW: h5: `__init__`: Replace units/calendar API with 'attributes' + # REVIEW: h5: `__init__`: replace units/calendar API with attributes def __init__( self, filename=None, @@ -466,7 +466,7 @@ def get_fragment_shape(self): """ return self._get_component("fragment_shape") - # REVIEW: h5: `get_storage_options` + # REVIEW: h5: `get_storage_options`: new method to get file access options def get_storage_options(self): """Return `s3fs.S3FileSystem` options for accessing S3 fragment files. diff --git a/cf/data/array/umarray.py b/cf/data/array/umarray.py index adaac9528f..3b385bc72c 100644 --- a/cf/data/array/umarray.py +++ b/cf/data/array/umarray.py @@ -7,12 +7,12 @@ from .mixin import FileArrayMixin, IndexMixin -# REVIEW: h5: `UMArray`: Replace units/calendar API with 'attributes' class UMArray( IndexMixin, FileArrayMixin, cfdm.data.mixin.FileArrayMixin, Array ): """A sub-array stored in a PP or UM fields file.""" + # REVIEW: h5: `__init__`: replace units/calendar API with attributes def __init__( self, filename=None, @@ -206,6 +206,7 @@ def _get_array(self, index=None): self.close(f) del f, rec + # REVIEW: h5: `_get_array`: refactor for use of `netcdf_indexer` # Set the netCDF attributes for the data attributes = self.get_attributes({}) self._set_units(int_hdr, attributes) @@ -312,7 +313,6 @@ def _set_FillValue(self, int_hdr, real_hdr, attributes): attributes["_FillValue"] = _FillValue - # REVIEW: getitem def _set_units(self, int_hdr, attributes): """Set the ``units`` attribute. @@ -371,10 +371,11 @@ def _set_units(self, int_hdr, attributes): units = units0 break + # REVIEW: getitem: `_set_units`: record units in attributes attributes["units"] = units - # REVIEW: h5 - # REVIEW: getitem + # REVIEW: h5: `_set_unpack`: record unpack in attributes + # REVIEW: getitem: `_set_unpack`: record unpack in attributes def _set_unpack(self, int_hdr, real_hdr, attributes): """Set the ``add_offset`` and ``scale_factor`` attributes. diff --git a/cf/data/collapse/__init__.py b/cf/data/collapse/__init__.py index a2842da447..0fd44052f9 100644 --- a/cf/data/collapse/__init__.py +++ b/cf/data/collapse/__init__.py @@ -1,4 +1,4 @@ from .collapse import Collapse -# REVIEW: active +# REVIEW: active: import active storage functions from .collapse_active import actify, active_reduction_methods, active_storage diff --git a/cf/data/collapse/collapse.py b/cf/data/collapse/collapse.py index 5eef6efa35..587a80a33b 100644 --- a/cf/data/collapse/collapse.py +++ b/cf/data/collapse/collapse.py @@ -9,7 +9,6 @@ from .collapse_utils import check_input_dtype, double_precision_dtype -# REVIEW: active class Collapse(metaclass=DocstringRewriteMeta): """Container for functions that collapse dask arrays. 
@@ -98,6 +97,7 @@ def __docstring_package_depth__(self):
         """
         return 0
 
+    # REVIEW: active: `max`: active storage decoration, new keyword 'active_storage'
     @active_storage("max")
     def max(
         self,
@@ -166,6 +166,7 @@ def max(
             meta=np.array((), dtype=dtype),
         )
 
+    # REVIEW: active: `max_abs`: active storage decoration, new keyword 'active_storage'
     @active_storage("max_abs")
     def max_abs(
         self,
@@ -222,6 +223,7 @@ def max_abs(
             active_storage=False,
         )
 
+    # REVIEW: active: `mean`: active storage decoration, new keyword 'active_storage'
     @active_storage("mean")
     def mean(
         self,
@@ -294,6 +296,7 @@ def mean(
             weights=weights,
         )
 
+    # REVIEW: active: `mean_abs`: active storage decoration, new keyword 'active_storage'
     @active_storage("mean_abs")
     def mean_abs(
         self,
@@ -354,6 +357,7 @@ def mean_abs(
             active_storage=False,
         )
 
+    # REVIEW: active: `mid_range`: active storage decoration, new keyword 'active_storage'
     @active_storage("mid_range")
     def mid_range(
         self,
@@ -427,6 +431,7 @@ def mid_range(
             meta=np.array((), dtype=dtype),
         )
 
+    # REVIEW: active: `min`: active storage decoration, new keyword 'active_storage'
     @active_storage("min")
     def min(
         self,
@@ -495,6 +500,7 @@ def min(
             meta=np.array((), dtype=dtype),
         )
 
+    # REVIEW: active: `min_abs`: active storage decoration, new keyword 'active_storage'
     @active_storage("min_abs")
     def min_abs(
         self,
@@ -551,6 +557,7 @@ def min_abs(
             active_storage=False,
         )
 
+    # REVIEW: active: `range`: active storage decoration, new keyword 'active_storage'
     @active_storage("range")
     def range(
         self,
@@ -623,6 +630,7 @@ def range(
             meta=np.array((), dtype=dtype),
         )
 
+    # REVIEW: active: `rms`: active storage decoration, new keyword 'active_storage'
     @active_storage("rms")
     def rms(
         self,
@@ -695,6 +703,7 @@ def rms(
             weights=weights,
         )
 
+    # REVIEW: active: `sample_size`: active storage decoration, new keyword 'active_storage'
     @active_storage("sample_size")
     def sample_size(
         self,
@@ -767,6 +776,7 @@ def sample_size(
             meta=np.array((), dtype=dtype),
         )
 
+    # REVIEW: active: `sum`: active storage decoration, new keyword 'active_storage'
     @active_storage("sum")
     def sum(
         self,
@@ -842,6 +852,7 @@ def sum(
             weights=weights,
         )
 
+    # REVIEW: active: `sum_of_weights`: active storage decoration, new keyword 'active_storage'
     @active_storage("sum_of_weights")
     def sum_of_weights(
         self,
@@ -918,6 +929,7 @@ def sum_of_weights(
             weights=weights,
         )
 
+    # REVIEW: active: `sum_of_weights2`: active storage decoration, new keyword 'active_storage'
     @active_storage("sum_of_weights2")
     def sum_of_weights2(
         self,
@@ -994,6 +1006,7 @@ def sum_of_weights2(
             weights=weights,
         )
 
+    # REVIEW: active: `unique`: active storage decoration, new keyword 'active_storage'
     @active_storage("unique")
     def unique(
         self, a, split_every=None, chunk_function=None, active_storage=False
@@ -1049,6 +1062,7 @@ def unique(
             meta=np.array((), dtype=dtype),
         )
 
+    # REVIEW: active: `var`: active storage decoration, new keyword 'active_storage'
     @active_storage("var")
     def var(
         self,
diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py
index 3e69c64464..3422a74f43 100644
--- a/cf/data/collapse/collapse_active.py
+++ b/cf/data/collapse/collapse_active.py
@@ -1,4 +1,4 @@
-# REVIEW: active
+# REVIEW: active: `collapse_active.py`: new module for active storage functionality
 import logging
 from functools import wraps
 
@@ -221,8 +221,8 @@ def actify(a, method, axis=None):
         return a
 
     # Loop round the nodes of the Dask graph looking for data
-    # definitions that point to files and which support active storage
-    # operations, and then 
modify the Dask graph when we find them. + # definitions that i) point to files, and ii) which support active + # storage operations; and modify the Dask graph when we find them. # # The elements are traversed in reverse order so that the data # definitions will tend to come out first, allowing for the @@ -230,7 +230,8 @@ def actify(a, method, axis=None): # not possible. # # Performance: The optimising the graph can be slow for - # complicated graphs, but is nonetheless essential. + # complicated graphs, but nonetheless is essential to + # ensure that unused nodes are not considered. ok_to_actify = True dsk = collections_to_dsk((a,), optimize_graph=True) for key, value in reversed(dsk.items()): diff --git a/cf/data/collapse/dask_collapse.py b/cf/data/collapse/dask_collapse.py index 9544f3bedd..1514fc57e7 100644 --- a/cf/data/collapse/dask_collapse.py +++ b/cf/data/collapse/dask_collapse.py @@ -1,4 +1,4 @@ -# REVIEW: active +# REVIEW: active: `dask_collapse.py`: all unlabelled changes in this module are general tidying, and should be reviewed at the same time as active storage """Reduction functions intended to be passed to be dask. Most of these functions are expected to be passed to @@ -231,6 +231,7 @@ def sum_sample_sizes(pairs, axis, computing_meta=False, **kwargs): # -------------------------------------------------------------------- # mean # -------------------------------------------------------------------- +# REVIEW: active: `cf_mean_chunk`: active storage decoration @active_storage_chunk("mean") def cf_mean_chunk( x, @@ -377,6 +378,7 @@ def cf_mean_agg( # -------------------------------------------------------------------- # maximum # -------------------------------------------------------------------- +# REVIEW: active: `cf_max_chunk`: active storage decoration @active_storage_chunk("max") def cf_max_chunk(x, dtype=None, computing_meta=False, **kwargs): """Chunk calculations for the maximum. @@ -531,6 +533,7 @@ def cf_mid_range_agg( # -------------------------------------------------------------------- # minimum # -------------------------------------------------------------------- +# REVIEW: active: `cf_min_chunk`: active storage decoration @active_storage_chunk("min") def cf_min_chunk(x, dtype=None, computing_meta=False, **kwargs): """Chunk calculations for the minimum. @@ -637,6 +640,7 @@ def cf_min_agg( # -------------------------------------------------------------------- # range # -------------------------------------------------------------------- +# REVIEW: active: `cf_range_chunk`: active storage decoration @active_storage_chunk("range") def cf_range_chunk(x, dtype=None, computing_meta=False, **kwargs): """Chunk calculations for the range. @@ -750,6 +754,7 @@ def cf_range_agg( # -------------------------------------------------------------------- # root mean square # -------------------------------------------------------------------- +# REVIEW: active: `cf_rms_chunk`: active storage decoration @active_storage_chunk("rms") def cf_rms_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): """Chunk calculations for the root mean square (RMS). @@ -834,6 +839,7 @@ def cf_rms_agg( # -------------------------------------------------------------------- # sample size # -------------------------------------------------------------------- +# REVIEW: active: `cf_sample_size_chunk`: active storage decoration @active_storage_chunk("sample_size") def cf_sample_size_chunk(x, dtype="i8", computing_meta=False, **kwargs): """Chunk calculations for the sample size. 
@@ -947,6 +953,7 @@ def cf_sample_size_agg( # -------------------------------------------------------------------- # sum # -------------------------------------------------------------------- +# REVIEW: active: `cf_sum_chunk`: active storage decoration @active_storage_chunk("sum") def cf_sum_chunk( x, @@ -1082,6 +1089,7 @@ def cf_sum_agg( # -------------------------------------------------------------------- # sum of weights # -------------------------------------------------------------------- +# REVIEW: active: `cf_sum_of_weights_chunk`: active storage decoration @active_storage_chunk("sum_of_weights") def cf_sum_of_weights_chunk( x, weights=None, dtype="f8", computing_meta=False, **kwargs @@ -1125,6 +1133,7 @@ def cf_sum_of_weights_chunk( # -------------------------------------------------------------------- # sum of squares of weights # -------------------------------------------------------------------- +# REVIEW: active: `cf_sum_of_weights2_chunk`: active storage decoration @active_storage_chunk("sum_of_weights2") def cf_sum_of_weights2_chunk( x, weights=None, dtype="f8", computing_meta=False, **kwargs @@ -1170,6 +1179,7 @@ def cf_sum_of_weights2_chunk( # -------------------------------------------------------------------- # unique # -------------------------------------------------------------------- +# REVIEW: active: `cf_unique_chunk`: active storage decoration @active_storage_chunk("unique") def cf_unique_chunk(x, dtype=None, computing_meta=False, **kwargs): """Chunk calculations for the unique values. @@ -1234,11 +1244,12 @@ def cf_unique_agg(pairs, axis=None, computing_meta=False, **kwargs): # -------------------------------------------------------------------- # variance # -------------------------------------------------------------------- +# REVIEW: active: `cf_var_chunk`: active storage decoration @active_storage_chunk("var") def cf_var_chunk( x, weights=None, dtype="f8", computing_meta=False, ddof=None, **kwargs ): - """Chunk calculations for the variance. + r"""Chunk calculations for the variance. This function is passed to `dask.array.reduction` as its *chunk* parameter. diff --git a/cf/data/creation.py b/cf/data/creation.py index 773d396c3a..aa86e5786b 100644 --- a/cf/data/creation.py +++ b/cf/data/creation.py @@ -82,7 +82,7 @@ def to_dask(array, chunks, **from_array_options): array = np.asanyarray(array) kwargs = from_array_options - # REVIEW: active: `to_dask`: + # REVIEW: active: `to_dask`: '_dask_meta' renamed to '_meta' for consistency with Dask # REVIEW: getitem: `to_dask`: The file lock is now on the `Array` object (in its `_get_array` method), rather than being set on the Dask array itself. kwargs.setdefault("meta", getattr(array, "_meta", None)) diff --git a/cf/data/dask_regrid.py b/cf/data/dask_regrid.py index 9db487a019..8fb88159c5 100644 --- a/cf/data/dask_regrid.py +++ b/cf/data/dask_regrid.py @@ -1,7 +1,7 @@ """Regridding functions used within a dask graph.""" import numpy as np -# REVIEW: getitem: `regrid.py` +# REVIEW: getitem: `regrid.py`: import `cf_asanyarray` from .dask_utils import cf_asanyarray diff --git a/cf/data/dask_utils.py b/cf/data/dask_utils.py index 290eb8003f..591cb24582 100644 --- a/cf/data/dask_utils.py +++ b/cf/data/dask_utils.py @@ -731,6 +731,7 @@ def cf_filled(a, fill_value=None): return np.ma.filled(a, fill_value=fill_value) +# REVIEW: getitem: `cf_asanyarray`: convert a to a usable array def cf_asanyarray(a): """Convert to a `numpy` array. 
@@ -746,7 +747,8 @@ def cf_asanyarray(a): :Returns: - The converted array, or the input array unchanged. + The array converted to a `numpy` array, or the input array + unchanged if ``a.__asanyarray__`` False. """ # REVIEW: getitem: `cf_asanyarray`: convert a to a usable array diff --git a/cf/data/data.py b/cf/data/data.py index dba08e8f4e..1dd9b7ae0f 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -963,13 +963,14 @@ def __getitem__(self, indices): "Non-orthogonal indexing has not yet been implemented" ) - # REVIEW: active `__getitem__` - # REVIEW: getitem: `__getitem__` + # REVIEW: active `__getitem__`: subspacing does not affect active storage status + # REVIEW: getitem: `__getitem__`: set 'asanyarray=True' because subspaced chunks might not be in memory # ------------------------------------------------------------ # Set the subspaced dask array # # * A subpspaced chunk might not result in an array in memory, - # so we need to set asanyarray=True + # so we set asanyarray=True to ensure that, if required, + # they are converted at compute time. # # * Subspacing the data does not affect the active storage # status @@ -1199,11 +1200,12 @@ def __asanyarray__(self): :Returns: `bool` - If True then at compute time add a final operation to - the Dask graph that converts chunks to `numpy` arrays, - but only if a chunk's array object has an - `__asanyarray__` attribute that is also `True`. If - `False` then do not do this. + If True then at compute time add a final operation + (not in-place) to the Dask graph that converts a + chunk's array object to a `numpy` array if the array + object has an `__asanyarray__` attribute that is + `True`, or else does nothing. If False then do not add + this operation. """ return self._custom.get("__asanyarray__", True) @@ -1454,13 +1456,8 @@ def _set_dask(self, dx, copy=False, clear=_ALL, asanyarray=False): `_clear_after_dask_update` for details. asanyarray: `bool` or `None`, optional - If True then at compute time add a final operation to - the Dask graph (not in-place) that converts chunks to - `numpy` arrays, but only for those chunks whose array - objects have an `__asanyarray__` attribute that is - also `True`. If False, the default, then do not do - this. If `None` then do not change the current - behaviour. + If `None` then do nothing. Otherwise set + `__asanyarray__` to the Boolean value of *asanyarray*. .. versionadded:: NEXTRELEASE @@ -10015,20 +10012,7 @@ def to_dask_array(self, apply_mask_hardness=False, asanyarray=None): If True then force the mask hardness of the returned array to be that given by the `hardmask` attribute. - asanyarray: `bool` or `None`, optional - If True then add a final operation to the returned - Dask graph that converts chunks to `numpy` arrays, but - only if a chunk's array object has an `__asanyarray__` - attribute that is also `True`. If False then do not do - this. If `None`, the default, then the final operation - is added if the `Data` object's `__asanyarray__` - attribute is `True`. I.e. by default `to_dask_array` - always returns a computable Dask graph, although it - may have a extra final layer that is not needed. - - .. note:: Such a final operation is included in the - returned Dask array, but is not included in - the Dask array stored in the `Data` object. + {{asanyarray: `bool` or `None`, optional}} .. 
versionadded:: NEXTVERSION diff --git a/cf/data/fragment/__init__.py b/cf/data/fragment/__init__.py index 38522d3958..fd82cfa9cf 100644 --- a/cf/data/fragment/__init__.py +++ b/cf/data/fragment/__init__.py @@ -1,6 +1,9 @@ -# REVIEW: h5 from .fullfragmentarray import FullFragmentArray + +# REVIEW: h5: `__init__.py`: import `H5netcdfFragmentArray` from .h5netcdffragmentarray import H5netcdfFragmentArray from .netcdffragmentarray import NetCDFFragmentArray + +# REVIEW: h5: `__init__.py`: import `NetCDF4FragmentArray` from .netcdf4fragmentarray import NetCDF4FragmentArray from .umfragmentarray import UMFragmentArray diff --git a/cf/data/fragment/fullfragmentarray.py b/cf/data/fragment/fullfragmentarray.py index 1212b27c29..f6dad9ce95 100644 --- a/cf/data/fragment/fullfragmentarray.py +++ b/cf/data/fragment/fullfragmentarray.py @@ -9,7 +9,7 @@ class FullFragmentArray(FragmentArrayMixin, FullArray): """ - # REVIEW: h5: Replace "units/calendar" API with "attributes" + # REVIEW: h5: `__init__`: replace units/calendar API with attributes def __init__( self, fill_value=None, diff --git a/cf/data/fragment/h5netcdffragmentarray.py b/cf/data/fragment/h5netcdffragmentarray.py index c339cebb18..6ae379c984 100644 --- a/cf/data/fragment/h5netcdffragmentarray.py +++ b/cf/data/fragment/h5netcdffragmentarray.py @@ -2,7 +2,7 @@ from .mixin import FragmentArrayMixin -# REVIEW: h5 +# REVIEW: h5: `H5netcdfFragmentArray`: New class to access netCDF fragment with `h5netcdf` class H5netcdfFragmentArray(FragmentArrayMixin, H5netcdfArray): """A netCDF fragment array accessed with `h5netcdf`. diff --git a/cf/data/fragment/mixin/fragmentarraymixin.py b/cf/data/fragment/mixin/fragmentarraymixin.py index c541c3094f..d81d4cb453 100644 --- a/cf/data/fragment/mixin/fragmentarraymixin.py +++ b/cf/data/fragment/mixin/fragmentarraymixin.py @@ -12,7 +12,7 @@ class FragmentArrayMixin: """ - # REVIEW: getitem + # REVIEW: getitem: `_get_array`: new method to convert subspace to numpy array def _get_array(self, index=None): """Returns a subspace of the dataset variable. @@ -129,7 +129,6 @@ def _conform_to_aggregated_units(self, array): return array - # REVIEW: getitem def _size_1_axis(self, indices): """Find the position of a unique size 1 index. @@ -167,6 +166,7 @@ def _size_1_axis(self, indices): None """ + # REVIEW: getitem: `_size_1_axis`: refactor to use `original_shape` original_shape = self.original_shape if original_shape.count(1): return original_shape.index(1) diff --git a/cf/data/fragment/netcdf4fragmentarray.py b/cf/data/fragment/netcdf4fragmentarray.py index e2f8fa62fb..869c083676 100644 --- a/cf/data/fragment/netcdf4fragmentarray.py +++ b/cf/data/fragment/netcdf4fragmentarray.py @@ -2,7 +2,7 @@ from .mixin import FragmentArrayMixin -# REVIEW: h5 +# REVIEW: h5: `NetCDF4FragmentArray`: New class to access netCDF fragment with `netCDF4` class NetCDF4FragmentArray(FragmentArrayMixin, NetCDF4Array): """A netCDF fragment array accessed with `netCDF4`. 
diff --git a/cf/data/fragment/netcdffragmentarray.py b/cf/data/fragment/netcdffragmentarray.py
index 3ffd17a786..31da929126 100644
--- a/cf/data/fragment/netcdffragmentarray.py
+++ b/cf/data/fragment/netcdffragmentarray.py
@@ -23,7 +23,7 @@ class NetCDFFragmentArray(
 
     """
 
-    # REVIEW: h5: Replace "units/calendar" API with "attributes"
+    # REVIEW: h5: `__init__`: replace units/calendar API with attributes
     def __init__(
         self,
         filename=None,
@@ -175,7 +175,7 @@ def __init__(
         # By default, close the file after data array access
         self._set_component("close", True, copy=False)
 
-    # REVIEW: h5
+    # REVIEW: h5: `__getitem__`: new factory method to choose backend
     def __getitem__(self, indices):
         """Returns a subspace of the fragment as a numpy array.
 
diff --git a/cf/data/fragment/umfragmentarray.py b/cf/data/fragment/umfragmentarray.py
index 8e03ad1bdd..52b99397c7 100644
--- a/cf/data/fragment/umfragmentarray.py
+++ b/cf/data/fragment/umfragmentarray.py
@@ -9,7 +9,7 @@ class UMFragmentArray(FragmentArrayMixin, UMArray):
 
     """
 
-    # REVIEW: h5: Replace "units/calendar" API with "attributes"
+    # REVIEW: h5: `__init__`: replace units/calendar API with attributes
     def __init__(
         self,
         filename=None,
diff --git a/cf/data/utils.py b/cf/data/utils.py
index c9e50bfb47..165e641950 100644
--- a/cf/data/utils.py
+++ b/cf/data/utils.py
@@ -862,7 +862,7 @@ def collapse(
         "keepdims": keepdims,
         "split_every": split_every,
         "mtol": mtol,
-        # REVIEW: active: `collapse` :pass the active storage status onto the collapse functions
+        # REVIEW: active: `collapse`: pass the active storage status onto the collapse functions
         "active_storage": d.active_storage,
     }
 
@@ -991,9 +991,9 @@ def parse_weights(d, weights, axis=None):
     w = []
     shape = d.shape
     axes = d._axes
+    # REVIEW: active: `parse_weights`: minor refactor
     Data = type(d)
     for key, value in weights.items():
-        # REVIEW: active: `parse_weights`
         value = Data.asdata(value)
 
         # Make sure axes are in ascending order
diff --git a/cf/docstring/docstring.py b/cf/docstring/docstring.py
index 9e56e1589d..2b1d53d388 100644
--- a/cf/docstring/docstring.py
+++ b/cf/docstring/docstring.py
@@ -626,6 +626,27 @@
     "{{to_size: `int`, optional}}": """to_size: `int`, optional
                 Pad the axis after so that the new axis has the given
                 size.""",
+    # REVIEW: getitem: `_docstring_substitution_definitions`: 'asanyarray'
+    # asanyarray
+    "{{asanyarray: `bool` or `None`, optional}}": """asanyarray: `bool` or `None`, optional
+                If True then add a final operation (not in-place) to
+                the graph of the returned Dask array that converts a
+                chunk's array object to a `numpy` array if the array
+                object has an `__asanyarray__` attribute that is
+                `True`, or else does nothing. If False then do not add
+                this operation. If `None`, the default, then the final
+                operation is added only if the `Data` object's
+                `__asanyarray__` attribute is `True`.
+
+                By default or if *asanyarray* is True, the returned
+                Dask array will always provide the expected result
+                when computed, although if *asanyarray* is True then
+                the Dask graph may have an extra null operation layer
+                that is not required. 
Setting *asanyarray* to False
+                should only be done if the returned Dask array will
+                undergo further operations that are guaranteed to
+                negate the need for the extra layer in the Dask
+                graph.""",
     # ----------------------------------------------------------------
     # Method description substitutions (4 levels of indentation)
     # ----------------------------------------------------------------
diff --git a/cf/domain.py b/cf/domain.py
index cf3567d03b..8d3afed0ec 100644
--- a/cf/domain.py
+++ b/cf/domain.py
@@ -695,7 +695,6 @@ def identity(self, default="", strict=False, relaxed=False, nc_only=False):
 
         return default
 
-    # REVIEW: h5
     def identities(self):
         """Return all possible identities.
 
diff --git a/cf/field.py b/cf/field.py
index 0ba922d488..250d98af20 100644
--- a/cf/field.py
+++ b/cf/field.py
@@ -5113,7 +5113,7 @@ def histogram(self, digitized):
         """
         raise RuntimeError("Use cf.histogram instead.")
 
-    # REVIEW: active
+    # REVIEW: active: active storage docstring
     @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0")
     @_manage_log_level_via_verbosity
     def collapse(
@@ -5495,12 +5495,12 @@ def collapse(
         **Active storage collapses**
 
         When the data being collapsed are stored remotely, the
-        collapse calculations may be carried out on a server that is
-        close (in a network distance sense) to the data, thereby
-        removing the time and power costs of transfering the entire
-        un-collapsed data to the local client. Whether or not this
-        will occur is determined on a case-by-case basis, and will
-        only be done if all of the following criteria are met:
+        collapse calculations may be carried out on a server (ideally
+        one that is close, in a network distance sense, to the data),
+        thereby removing the time and energy costs of transferring the
+        entire un-collapsed data to the local client. Whether or not
+        this will occur is determined on a case-by-case basis, and
+        will only be done if all of the following criteria are met:
 
         * the collapse method is one of ``'mean'``, ``'maximum'``,
           ``'minimum'``, or ``'sum'``;
@@ -5509,7 +5509,7 @@ def collapse(
 
         * the collapse is unweighted;
 
-        * `cf.active_storage()` is `True`;
+        * ``cf.active_storage()`` is `True`;
 
         * a URL of the active storage server has been set with
           `cf.active_storage_url`;
@@ -5518,12 +5518,12 @@ def collapse(
           in any other file format, or in memory) and are not
           numerically packed;
 
-        * the `!active_storage` attribute of the `cf.Data` object
-          being collapsed is `True`, indicating that active storage
-          operations may be possible. In general, it will only be
-          `True` for data that are in files on disk, are not
-          compressed by convention and have not had any other
-          operations applied;
+        * the `!active_storage` attribute of the field's `Data` object
+          is `True`, indicating that active storage operations may be
+          possible. In general, it will only be `True` for data that
+          are in files on disk, are not compressed by convention and
+          have not had any other operations applied, apart from
+          subspacing;
 
         * it is possible to import the external `activestorage.Active`
           class.
@@ -6994,6 +6994,7 @@ def collapse(
                 "collapse"
             )
 
+            # REVIEW: active: `collapse`: include size 1 axes in collapse
             # Note: It is important that size 1 axes are also passed
             #       on to the Data collapse, because active storage
             #       collapses get confused if they're not there.
diff --git a/cf/functions.py b/cf/functions.py index 71ae42ff0f..3e77b57b1b 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -162,7 +162,7 @@ def _free_memory(): return float(virtual_memory().available) -# REVIEW: active +# REVIEW: active: `configuration`: new keywords 'active_storage', 'active_storage_url' def configuration( atol=None, rtol=None, @@ -410,7 +410,7 @@ def configuration( ) -# REVIEW: active +# REVIEW: active: `_configuration`: new keywords 'active_storage', 'active_storage_url' def _configuration(_Configuration, **kwargs): """Internal helper function to provide the logic for `cf.configuration`. @@ -568,7 +568,7 @@ class log_level(ConstantAccess, cfdm.log_level): _reset_log_emergence_level = _reset_log_emergence_level -# REVIEW: active +# REVIEW: active: `regrid_logging`: new examples class regrid_logging(ConstantAccess): """Whether or not to enable `esmpy` regridding logging. @@ -700,7 +700,7 @@ def _parse(cls, arg): ) # pragma: no cover -# REVIEW: active +# REVIEW: active: `relaxed_identities`: new examples class relaxed_identities(ConstantAccess): """Use 'relaxed' mode when getting a construct identity. @@ -834,7 +834,7 @@ def _parse(cls, arg): return parse_bytes(arg) -# REVIEW: active +# REVIEW: active: `tempdir`: new examples class tempdir(ConstantAccess): """The directory for internally generated temporary files. @@ -1186,7 +1186,7 @@ def _parse(cls, arg): return arg -# REVIEW: active +# REVIEW: active: `active_storage`: new function class active_storage(ConstantAccess): """Whether or not to attempt active storage reductions. @@ -1247,7 +1247,7 @@ def _parse(cls, arg): return bool(arg) -# REVIEW: active +# REVIEW: active: `active_storage_url`: new function class active_storage_url(ConstantAccess): """The URL location of the active storage reducer. @@ -2226,6 +2226,7 @@ def parse_indices(shape, indices, cyclic=False, keepdims=True): return parsed_indices, roll +# REVIEW: getitem: `get_subspace`: remove deprecated function _equals = cfdm.Data()._equals @@ -2596,7 +2597,6 @@ def flat(x): yield a -# REVIEW: h5 def abspath(filename): """Return a normalized absolute version of a file name. @@ -2641,7 +2641,6 @@ def abspath(filename): return filename -# REVIEW: h5 def relpath(filename, start=None): """Return a relative filepath to a file. @@ -2676,6 +2675,7 @@ def relpath(filename, start=None): 'http://data/archive/file.nc' """ + # REVIEW: h5: `relpath`: minor refactor u = urlparse(filename) if u.scheme != "": return filename @@ -2686,7 +2686,6 @@ def relpath(filename, start=None): return _os_path_relpath(filename) -# REVIEW: h5 def dirname(filename): """Return the directory name of a file. @@ -2715,6 +2714,7 @@ def dirname(filename): 'http://data/archive' """ + # REVIEW: h5: `relpath`: minor refactor u = urlparse(filename) if u.scheme != "": return filename.rpartition("/")[0] @@ -2722,7 +2722,6 @@ def dirname(filename): return _os_path_dirname(filename) -# REVIEW: h5 def pathjoin(path1, path2): """Join two file path components intelligently. @@ -2755,6 +2754,7 @@ def pathjoin(path1, path2): 'http://data/archive/file.nc' """ + # REVIEW: h5: `relpath`: minor refactor u = urlparse(path1) if u.scheme != "": return urljoin(path1, path2) @@ -3123,7 +3123,7 @@ def _get_module_info(module, alternative_name=False, try_except=False): ) -# REVIEW: h5 +# REVIEW: h5: `environment`: new dependencies def environment(display=True, paths=True): """Return the names and versions of the cf package and its dependencies. 
diff --git a/cf/mixin/fielddomain.py b/cf/mixin/fielddomain.py index 6c82147652..d81dbdf800 100644 --- a/cf/mixin/fielddomain.py +++ b/cf/mixin/fielddomain.py @@ -2022,7 +2022,6 @@ def get_coordinate_reference( return out - # REVIEW: h5 def iscyclic(self, *identity, **filter_kwargs): """Returns True if the given axis is cyclic. diff --git a/cf/mixin/propertiesdata.py b/cf/mixin/propertiesdata.py index 279402bdf2..247c70c6d1 100644 --- a/cf/mixin/propertiesdata.py +++ b/cf/mixin/propertiesdata.py @@ -4692,7 +4692,7 @@ def log(self, base=None, inplace=False, i=False): delete_props=True, ) - # REVIEW: getitem + # REVIEW: getitem: `to_dask_array`: new keyword 'asanyarray' def to_dask_array(self, apply_mask_hardness=False, asanyarray=None): """Convert the data to a `dask` array. @@ -4721,18 +4721,7 @@ def to_dask_array(self, apply_mask_hardness=False, asanyarray=None): .. versionadded:: NEXTVERSION - asanyarray: `bool` or `None`, optional - If True then add a final operation to the Dask graph - that converts chunks to `numpy` arrays, but only if a - chunk's array object has an `__asanyarray__` attribute - that is `True`. If False then do not do this. If - `None`, the default, then the final operation is added - if the `Data` object's `__asanyarray__` attribute is - `True`. - - .. note:: Such a final operation is included in the - returned Dask array, but is not included in - the Dask array stored in the `Data` object. + {{asanyarray: `bool` or `None`, optional}} .. versionadded:: NEXTVERSION diff --git a/cf/read_write/netcdf/netcdfread.py b/cf/read_write/netcdf/netcdfread.py index 5a8892b56b..dd0fb89c1e 100644 --- a/cf/read_write/netcdf/netcdfread.py +++ b/cf/read_write/netcdf/netcdfread.py @@ -209,10 +209,10 @@ def _create_data( if data.npartitions == 1: data._cfa_set_write(True) - # REVIEW: h5 + # REVIEW: h5: `_create_data`: control caching if ( not compression_index - and self.read_vars.get("cache_metadata") + and self.read_vars.get("cache") and self.implementation.get_construct_type(construct) != "field" ): @@ -254,7 +254,7 @@ def _create_data( coord_ncvar=coord_ncvar, ) - # REVIEW: h5: Replace "units/calendar" API with "attributes" + # REVIEW: h5: `_create_data`: replace units/calendar API with attributes attributes = kwargs["attributes"] data = self._create_Data( cfa_array, @@ -263,7 +263,7 @@ def _create_data( calendar=attributes.get("calendar"), ) - # REVIEW: h5 + # REVIEW: h5: `_create_data`: don't cache data from CFA variables # Note: We don't cache elements from CFA variables, because # the data are in fragment files which have not been # opened and may not not even be openable (such as could @@ -624,7 +624,7 @@ def _cache_data_elements(self, data, ncvar): # Store the elements in the data object data._set_cached_elements(elements) - # REVIEW: h5 + # REVIEW: h5: `_create_cfanetcdfarray`: docstring/comment improvements def _create_cfanetcdfarray( self, ncvar, @@ -699,8 +699,8 @@ def _create_cfanetcdfarray( kwargs["x"] = aggregation_instructions kwargs["instructions"] = " ".join(sorted(instructions)) + # REVIEW: h5: `_create_cfanetcdfarray`: choose the correct netCDF backend # Use the kwargs to create a CFANetCDFArray instance - # array = self.implementation.initialise_CFANetCDFArray(**kwargs) if g["original_netCDF4"]: array = self.implementation.initialise_CFANetCDF4Array(**kwargs) else: @@ -709,7 +709,6 @@ def _create_cfanetcdfarray( return array, kwargs - # REVIEW: h5 def _create_cfanetcdfarray_term( self, parent_ncvar, @@ -754,6 +753,7 @@ def _create_cfanetcdfarray_term( 
return_kwargs_only=True, ) + # REVIEW: h5: `_create_cfanetcdfarray_term`: fix unknown fragment shape # Get rid of the incorrect shape. This will end up getting set # correctly by the CFANetCDFArray instance. kwargs.pop("shape", None) @@ -772,6 +772,7 @@ def _create_cfanetcdfarray_term( kwargs["x"] = aggregation_instructions kwargs["instructions"] = " ".join(sorted(instructions)) + # REVIEW: h5: `_create_cfanetcdfarray_term`: choose the correct netCDF backend if g["original_netCDF4"]: array = self.implementation.initialise_CFANetCDF4Array(**kwargs) else: @@ -935,7 +936,6 @@ def _customise_field_ancillaries(self, parent_ncvar, f): return out - # REVIEW: h5 def _cfa_parse_aggregated_data(self, ncvar, aggregated_data): """Parse a CFA-netCDF ``aggregated_data`` attribute. @@ -962,6 +962,7 @@ def _cfa_parse_aggregated_data(self, ncvar, aggregated_data): aggregation_instructions = g["cfa_aggregation_instructions"] variable_attributes = g["variable_attributes"] + # REVIEW: h5: `_cfa_parse_aggregated_data`: use `cfdm.netcdf_indexer` to get data # Loop round aggregation instruction terms out = {} for x in self._parse_x( diff --git a/cf/read_write/netcdf/netcdfwrite.py b/cf/read_write/netcdf/netcdfwrite.py index 6b035568a2..930a6d3e0f 100644 --- a/cf/read_write/netcdf/netcdfwrite.py +++ b/cf/read_write/netcdf/netcdfwrite.py @@ -706,7 +706,6 @@ def _cfa_write_term_variable( return ncvar - # REVIEW: getitem: def _cfa_write_non_standard_terms( self, field, fragment_ncdimensions, aggregated_data ): @@ -750,6 +749,7 @@ def _cfa_write_non_standard_terms( # more than one unique value then the fragment's value is # missing data. # + # REVIEW: getitem: `_cfa_write_non_standard_terms`: set 'asanyarray' # '_cfa_unique' has its own call to 'cf_asanyarray', so # we can set 'asanyarray=False'. dx = data.to_dask_array(asanyarray=False) @@ -789,7 +789,6 @@ def _cfa_write_non_standard_terms( return aggregated_data_attr - # REVIEW: getitem @classmethod def _cfa_unique(cls, a): """Return the unique value of an array. @@ -811,6 +810,7 @@ def _cfa_unique(cls, a): data if there is not a unique value. """ + # REVIEW: getitem: `_cfa_unique`: convert a to a usable array a = cf_asanyarray(a) out_shape = (1,) * a.ndim @@ -824,7 +824,6 @@ def _cfa_unique(cls, a): return np.ma.masked_all(out_shape, dtype=a.dtype) - # REVIEW: getitem def _cfa_aggregation_instructions(self, data, cfvar): """Convert data to standardised CFA aggregation instruction terms. 
@@ -967,6 +966,7 @@ def _cfa_aggregation_instructions(self, data, cfvar): # Create the location array # ------------------------------------------------------------ dtype = np.dtype(np.int32) + # REVIEW: getitem: `_cfa_aggregation_instructions`: set 'asanyarray' if ( max(data.to_dask_array(asanyarray=False).chunksize) > np.iinfo(dtype).max diff --git a/cf/read_write/read.py b/cf/read_write/read.py index 6a007892f4..88e5abb7c6 100644 --- a/cf/read_write/read.py +++ b/cf/read_write/read.py @@ -58,18 +58,18 @@ def read( select_options=None, follow_symlinks=False, mask=True, - # REVIEW: h5: new unpack parameter to control auto-unpacking (previously always True) + # REVIEW: h5: `read`: new 'unpack' parameter to control auto-unpacking (previously always True) unpack=True, warn_valid=False, chunks="auto", domain=False, cfa=None, - # REVIEW: h5: new netcdf_engine parameter to control how to read files + # REVIEW: h5: `read`: new 'netcdf_engine' parameter to control how to read files netcdf_engine=None, - # REVIEW: h5: new storage_options parameter to control access to S3 + # REVIEW: h5: `read`: new 'storage_options' parameter to control access to S3 storage_options=None, - # REVIEW: h5: cache_metadata parameter to control whethe or not to get to caache selected data elements - cache_metadata=True, + # REVIEW: h5: `read`: 'cache' parameter to control whether or not to get to cache selected data elements + cache=True, ): """Read field or domain constructs from files. @@ -743,13 +743,14 @@ def read( .. versionadded:: NEXTVERSION - cache_metadata: `bool`, optional + cache: `bool`, optional If True, the default, then cache the first and last array - elements of metadata constructs for fast future access. In - addition, the second and penultimate array elements will - be cached from coordinate bounds when there are two bounds - per cell. For remote data, setting *cache_metadata* to - False may speed up the parsing of the file. + elements of metadata constructs (not field constructs) for + fast future access. In addition, the second and + penultimate array elements will be cached from coordinate + bounds when there are two bounds per cell. For remote + data, setting *cache* to False may speed up the parsing of + the file. .. versionadded:: NEXTVERSION @@ -912,7 +913,7 @@ def read( cfa_options["substitutions"] = substitutions - cache_metadata = bool(cache_metadata) + cache = bool(cache) # Initialise the output list of fields/domains if domain: @@ -1049,7 +1050,7 @@ def read( cfa_options=cfa_options, netcdf_engine=netcdf_engine, storage_options=storage_options, - cache_metadata=cache_metadata, + cache=cache, ) # -------------------------------------------------------- @@ -1167,7 +1168,7 @@ def _read_a_file( cfa_options=None, netcdf_engine=None, storage_options=None, - cache_metadata=True, + cache=True, ): """Read the contents of a single file into a field list. @@ -1216,7 +1217,7 @@ def _read_a_file( .. versionadded:: NEXTVERSION - cache_metadata: `bool`, optional + cache: `bool`, optional See `cf.read` for details. .. 
versionadded:: NEXTVERSION @@ -1255,7 +1256,7 @@ def _read_a_file( "fmt": selected_fmt, "ignore_read_error": ignore_read_error, "cfa_options": cfa_options, - "cache_metadata": cache_metadata, + "cache": cache, } # ---------------------------------------------------------------- diff --git a/cf/read_write/um/umread.py b/cf/read_write/um/umread.py index 0a40d45812..215d0872b4 100644 --- a/cf/read_write/um/umread.py +++ b/cf/read_write/um/umread.py @@ -1957,7 +1957,7 @@ def create_data(self): recs = self.recs um_Units = self.um_Units - # REVIEW: h5: replace units/calendar API with attributes + # REVIEW: h5: `create_data`: replace units/calendar API with attributes attributes = { "units": getattr(um_Units, "units", None), "calendar": getattr(um_Units, "calendar", None), diff --git a/cf/read_write/write.py b/cf/read_write/write.py index fdfe6f7fbb..a2b7ed114b 100644 --- a/cf/read_write/write.py +++ b/cf/read_write/write.py @@ -12,9 +12,8 @@ netcdf = NetCDFWrite(implementation()) -# REVIEW: h5: docstring improvements - +# REVIEW: h5: `write`: docstring improvements @_manage_log_level_via_verbosity def write( fields, diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 85e6f253d4..4e5ae0c05d 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -1479,7 +1479,7 @@ def test_Data__getitem__(self): f = cf.Data([-999, 35], mask=[True, False]).reshape(2, 1) self.assertTrue(e.equals(f)) - # REVIEW: getitem + # REVIEW: getitem: `test_Data__getitem__`: Chained subspaces reading from disk # Chained subspaces reading from disk f = cf.read(self.filename)[0] d = f.data @@ -3292,7 +3292,7 @@ def test_Data_rechunk(self): self.assertEqual(e.chunks, ((4,), (5,))) self.assertTrue(e.equals(d)) - # REVIEW: getitem + # REVIEW: getitem: `test_Data_rechunk`: rechunking after a __getitem__ # Test rechunking after a __getitem__ e = d[:2].rechunk((2, 5)) self.assertTrue(e.equals(d[:2])) @@ -4522,7 +4522,7 @@ def test_Data__str__(self): for element in elements0: self.assertNotIn(element, d._get_cached_elements()) - # REVIEW: active + # REVIEW: getitem: `test_Data_active_storage`: test `Data.active_storage` def test_Data_active_storage(self): """Test `Data.active_storage`.""" with cf.active_storage(True): @@ -4570,7 +4570,7 @@ def test_Data_active_storage(self): d = cf.Data(n, to_memory=True) self.assertFalse(d.active_storage) - # REVIEW: getitem + # REVIEW: getitem: `test_Data_cull_graph`: prevent new asanyarray layer def test_Data_cull_graph(self): """Test `Data.cull`""" # Note: The number of layers in the culled graphs include a @@ -4829,7 +4829,7 @@ def test_Data_pad_missing(self): with self.assertRaises(ValueError): d.pad_missing(99, to_size=99) - # REVIEW: getitem + # REVIEW: getitem: `test_Data_is_masked`: test `Data.is_masked` def test_Data_is_masked(self): """Test Data.is_masked.""" d = cf.Data(np.arange(6).reshape(2, 3)) diff --git a/cf/test/test_Field.py b/cf/test/test_Field.py index b819b0d0e8..70851f5d62 100644 --- a/cf/test/test_Field.py +++ b/cf/test/test_Field.py @@ -1158,7 +1158,6 @@ def test_Field_insert_dimension(self): with self.assertRaises(ValueError): f.insert_dimension(1, "qwerty") - # REVIEW: getitem def test_Field_indices(self): f = cf.read(self.filename)[0] @@ -1462,6 +1461,7 @@ def test_Field_indices(self): shape = (1, 1, 1) self.assertEqual(g.shape, shape) + # REVIEW: getitem: `test_Field_indices`: make sure works when 'g.array' is not masked self.assertEqual(np.ma.compressed(g.array), 29) if mode != "full": self.assertEqual(g.construct("longitude").array, 83) @@ -1480,6 
+1480,7 @@ def test_Field_indices(self): shape = (1, 2, 2) self.assertEqual(g.shape, shape) + # REVIEW: getitem: `test_Field_indices`: make sure works when 'g.array' is not masked self.assertTrue((np.ma.compressed(g.array) == [4, 29]).all()) # Add 2-d auxiliary coordinates with bounds, so we can diff --git a/cf/test/test_FullArray.py b/cf/test/test_FullArray.py index 5a8faf1d6c..8b25642686 100644 --- a/cf/test/test_FullArray.py +++ b/cf/test/test_FullArray.py @@ -1,3 +1,4 @@ +# REVIEW: getitem: `test_FullArray`: new test module import datetime import faulthandler import unittest @@ -9,7 +10,6 @@ import cf -# REVIEW: getitem class FullArrayTest(unittest.TestCase): def test_FullValue_inspection(self): full = 9 diff --git a/cf/test/test_NetCDF4Array.py b/cf/test/test_NetCDF4Array.py index 2124424a77..35d76581be 100644 --- a/cf/test/test_NetCDF4Array.py +++ b/cf/test/test_NetCDF4Array.py @@ -12,6 +12,7 @@ import cf +# REVIEW: h5: `test_NetCDF4Array.py`: renamed 'NetCDFArray' to 'NetCDF4Array' n_tmpfiles = 1 tmpfiles = [ tempfile.mkstemp("_test_NetCDF4Array.nc", dir=os.getcwd())[1] @@ -32,7 +33,6 @@ def _remove_tmpfiles(): atexit.register(_remove_tmpfiles) -# REVIEW: h5 class NetCDF4ArrayTest(unittest.TestCase): n = cf.NetCDF4Array( filename="filename.nc", @@ -41,7 +41,7 @@ class NetCDF4ArrayTest(unittest.TestCase): dtype=np.dtype(float), ) - # REVIEW: h5 + # REVIEW: h5: `test_NetCDF4Array`: renamed 'NetCDFArray' to 'NetCDF4Array' def test_NetCDF4Array_del_file_location(self): a = cf.NetCDF4Array(("/data1/file1", "/data2/file2"), ("tas1", "tas2")) b = a.del_file_location("/data1") @@ -62,7 +62,7 @@ def test_NetCDF4Array_del_file_location(self): with self.assertRaises(ValueError): b.del_file_location("/data1/") - # REVIEW: h5 + # REVIEW: h5: `test_NetCDF4Array`: renamed 'NetCDFArray' to 'NetCDF4Array' def test_NetCDF4Array_file_locations(self): a = cf.NetCDF4Array("/data1/file1") self.assertEqual(a.file_locations(), ("/data1",)) @@ -73,7 +73,7 @@ def test_NetCDF4Array_file_locations(self): a = cf.NetCDF4Array(("/data1/file1", "/data2/file2", "/data1/file2")) self.assertEqual(a.file_locations(), ("/data1", "/data2", "/data1")) - # REVIEW: h5 + # REVIEW: h5: `test_NetCDF4Array`: renamed 'NetCDFArray' to 'NetCDF4Array' def test_NetCDF4Array_add_file_location(self): a = cf.NetCDF4Array("/data1/file1", "tas") b = a.add_file_location("/home/user") @@ -109,7 +109,7 @@ def test_NetCDF4Array_add_file_location(self): self.assertEqual(b.get_filenames(), a.get_filenames()) self.assertEqual(b.get_addresses(), a.get_addresses()) - # REVIEW: h5 + # REVIEW: h5: `test_NetCDF4Array`: renamed 'NetCDFArray' to 'NetCDF4Array' def test_NetCDF4Array__dask_tokenize__(self): a = cf.NetCDF4Array("/data1/file1", "tas", shape=(12, 2), mask=False) self.assertEqual(tokenize(a), tokenize(a.copy())) @@ -117,7 +117,7 @@ def test_NetCDF4Array__dask_tokenize__(self): b = cf.NetCDF4Array("/home/file2", "tas", shape=(12, 2)) self.assertNotEqual(tokenize(a), tokenize(b)) - # REVIEW: h5 + # REVIEW: h5: `test_NetCDF4Array`: renamed 'NetCDFArray' to 'NetCDF4Array' def test_NetCDF4Array_multiple_files(self): f = cf.example_field(0) cf.write(f, tmpfile1) @@ -135,7 +135,7 @@ def test_NetCDF4Array_multiple_files(self): self.assertEqual(len(n.get_filenames()), 2) self.assertTrue((n[...] 
== f.array).all()) - # REVIEW: getitem + # REVIEW: getitem: `test_NetCDF4Array`: test `NetCDF4Array.shape` def test_NetCDF4Array_shape(self): shape = (12, 73, 96) a = cf.NetCDF4Array("/home/file2", "tas", shape=shape) @@ -145,7 +145,7 @@ def test_NetCDF4Array_shape(self): self.assertEqual(a.shape, (shape[0] // 2,) + shape[1:]) self.assertEqual(a.original_shape, shape) - # REVIEW: getitem + # REVIEW: getitem: `test_NetCDF4Array`: test `NetCDF4Array.index` def test_NetCDF4Array_index(self): shape = (12, 73, 96) a = cf.NetCDF4Array("/home/file2", "tas", shape=shape) diff --git a/cf/test/test_active_storage.py b/cf/test/test_active_storage.py index b2d166f7a8..5b4fa1645a 100644 --- a/cf/test/test_active_storage.py +++ b/cf/test/test_active_storage.py @@ -1,3 +1,4 @@ +# REVIEW: h5: `test_active_storage.py`: new test module import atexit import datetime import faulthandler @@ -34,7 +35,6 @@ def _remove_tmpfiles(): atexit.register(_remove_tmpfiles) -# REVIEW: active class ActiveStorageTest(unittest.TestCase): @unittest.skipUnless(Active is not None, "Requires activestorage.Active") def test_active_storage(self): diff --git a/cf/test/test_functions.py b/cf/test/test_functions.py index 4f32bdbdc5..f0830caef5 100644 --- a/cf/test/test_functions.py +++ b/cf/test/test_functions.py @@ -43,7 +43,7 @@ def test_aliases(self): self.assertEqual(cf.tempdir(), cf.TEMPDIR()) self.assertEqual(cf.chunksize(), cf.CHUNKSIZE()) - # REVIEW: active + # REVIEW: active: `test_configuration`: test `cf.active_storage`, cf.active_storage_url` def test_configuration(self): # This test assumes 'total_memory' remains constant throughout # the test run, which should be true generally in any diff --git a/cf/test/test_read_write.py b/cf/test/test_read_write.py index 4fc3fb4aa4..fb0b88055f 100644 --- a/cf/test/test_read_write.py +++ b/cf/test/test_read_write.py @@ -79,7 +79,7 @@ def test_write_filename(self): self.assertTrue((a == g[0].array).all()) - # REVIEW: h5 + # REVIEW: h5: `test_read_mask`: rename numpy to np def test_read_mask(self): f = self.f0.copy() @@ -561,7 +561,7 @@ def test_read_write_netCDF4_compress_shuffle(self): f"Bad read/write with lossless compression: {fmt}", ) - # REVIEW: h5 + # REVIEW: h5: `test_write_datatype`: rename numpy to np def test_write_datatype(self): f = cf.read(self.filename)[0] self.assertEqual(f.dtype, np.dtype(float)) From 87e249e58689c83cfc5df5a6c2a3590a822de75f Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 23 Apr 2024 09:05:58 +0100 Subject: [PATCH 090/134] 2-d np index --- cf/mixin/fielddomain.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/cf/mixin/fielddomain.py b/cf/mixin/fielddomain.py index d81dbdf800..2a3e09231b 100644 --- a/cf/mixin/fielddomain.py +++ b/cf/mixin/fielddomain.py @@ -4,6 +4,7 @@ import dask.array as da import numpy as np from cfdm import is_log_level_debug, is_log_level_info +from dask.array.slicing import normalize_index from ..data import Data from ..decorators import ( @@ -458,9 +459,18 @@ def _indices(self, mode, data_axes, ancillary_mask, kwargs): logger.debug(" 1-d CASE 3:") # pragma: no cover index = item == value - index = index.data.to_dask_array() + + # Performance: Convert the 1-d 'index' to a numpy + # array of bool. + # + # This is beacuse Dask can be *very* slow at + # instantiation time when the 'index' is a Dask + # array, in which case contents of 'index' are + # unknown. 
+ index = np.asanyarray(index) if envelope or full: + # Set ind index = np.asanyarray(index) if np.ma.isMA(index): ind = np.ma.where(index) @@ -468,6 +478,10 @@ def _indices(self, mode, data_axes, ancillary_mask, kwargs): ind = np.where(index) index = slice(None) + else: + # Convert bool to int, to save memory. + size = domain_axes[axis].get_size() + index = normalize_index(index, (size,))[0] else: raise ValueError( From 69731771fc22960cbd93120b627fd295e2d7ef8e Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 23 Apr 2024 09:19:32 +0100 Subject: [PATCH 091/134] dask vn --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d193fbcfa1..e7e1c24b17 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ numpy>=1.22 cfdm>=1.11.2.0, <1.11.3.0 psutil>=0.6.0 cfunits>=3.3.7 -dask>=2022.12.1 +dask>=2024.4.0 packaging>=20.0 scipy>=1.10.0 h5netcdf>=1.3.0 From bac1cc85667fe29e228092af9663027bce188500 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 24 Apr 2024 15:27:08 +0100 Subject: [PATCH 092/134] fragment get_array --- cf/data/array/fullarray.py | 4 +- cf/data/array/h5netcdfarray.py | 7 ++- cf/data/array/locks.py | 1 + cf/data/array/netcdf4array.py | 5 +- cf/data/array/umarray.py | 9 +--- cf/data/fragment/mixin/fragmentarraymixin.py | 12 ++--- cf/data/fragment/netcdffragmentarray.py | 43 ++++++++++++----- cf/docstring/docstring.py | 4 ++ cf/read_write/read.py | 49 ++++++++++---------- 9 files changed, 76 insertions(+), 58 deletions(-) diff --git a/cf/data/array/fullarray.py b/cf/data/array/fullarray.py index b52e6480d9..92fa7eea67 100644 --- a/cf/data/array/fullarray.py +++ b/cf/data/array/fullarray.py @@ -131,9 +131,7 @@ def _get_array(self, index=None): :Parameters: - index: `tuple` or `None`, optional - Provide the indices that define the subspace. If `None` - then the `index` attribute is used. + {{index: `tuple` or `None`, optional}} :Returns: diff --git a/cf/data/array/h5netcdfarray.py b/cf/data/array/h5netcdfarray.py index 96d220fb6a..2101899f41 100644 --- a/cf/data/array/h5netcdfarray.py +++ b/cf/data/array/h5netcdfarray.py @@ -49,7 +49,8 @@ def _lock(self): """ return netcdf_lock - # REVIEW: getitem: `_get_array`: new method to convert subspace to numpy array + # REVIEW: h5: `_get_array`: Ignore this for h5 review + # REVIEW: getitem: `_get_array`: new method to convert subspace to numpy array. def _get_array(self, index=None): """Returns a subspace of the dataset variable. @@ -59,9 +60,7 @@ def _get_array(self, index=None): :Parameters: - index: `tuple` or `None`, optional - Provide the indices that define the subspace. If `None` - then the `index` attribute is used. + {{index: `tuple` or `None`, optional}} :Returns: diff --git a/cf/data/array/locks.py b/cf/data/array/locks.py index 5a7b2bd333..efa0114699 100644 --- a/cf/data/array/locks.py +++ b/cf/data/array/locks.py @@ -1,3 +1,4 @@ +# REVIEW: h5: `locks.py`: New module to provide file locks from dask.utils import SerializableLock # Global lock for netCDF file access diff --git a/cf/data/array/netcdf4array.py b/cf/data/array/netcdf4array.py index 011cd28328..ece5f3d3c4 100644 --- a/cf/data/array/netcdf4array.py +++ b/cf/data/array/netcdf4array.py @@ -47,6 +47,7 @@ def _lock(self): """ return netcdf_lock + # REVIEW: getitem: `_get_array`: Ignore this for h5 review # REVIEW: getitem: `_get_array`: new method to convert subspace to numpy array def _get_array(self, index=None): """Returns a subspace of the dataset variable. 
@@ -57,9 +58,7 @@ def _get_array(self, index=None): :Parameters: - index: `tuple` or `None`, optional - Provide the indices that define the subspace. If `None` - then the `index` attribute is used. + {{index: `tuple` or `None`, optional}} :Returns: diff --git a/cf/data/array/umarray.py b/cf/data/array/umarray.py index 3b385bc72c..c8d1ef0d02 100644 --- a/cf/data/array/umarray.py +++ b/cf/data/array/umarray.py @@ -181,9 +181,7 @@ def _get_array(self, index=None): :Parameters: - index: `tuple` or `None`, optional - Provide the indices that define the subspace. If `None` - then the `index` attribute is used. + {{index: `tuple` or `None`, optional}} :Returns: @@ -206,7 +204,6 @@ def _get_array(self, index=None): self.close(f) del f, rec - # REVIEW: h5: `_get_array`: refactor for use of `netcdf_indexer` # Set the netCDF attributes for the data attributes = self.get_attributes({}) self._set_units(int_hdr, attributes) @@ -275,6 +272,7 @@ def _get_rec(self, f, header_offset): # if r.hdr_offset == header_offset: # return r + # REVIEW: getitem: `_set_FillValue`: record _FillValue in attributes def _set_FillValue(self, int_hdr, real_hdr, attributes): """Set the ``_FillValue`` attribute. @@ -298,8 +296,6 @@ def _set_FillValue(self, int_hdr, real_hdr, attributes): `None """ - # REVIEW: getitem: `_set_FillValue` - if "FillValue" in attributes: return @@ -374,7 +370,6 @@ def _set_units(self, int_hdr, attributes): # REVIEW: getitem: `_set_units`: record units in attributes attributes["units"] = units - # REVIEW: h5: `_set_unpack`: record unpack in attributes # REVIEW: getitem: `_set_unpack`: record unpack in attributes def _set_unpack(self, int_hdr, real_hdr, attributes): """Set the ``add_offset`` and ``scale_factor`` attributes. diff --git a/cf/data/fragment/mixin/fragmentarraymixin.py b/cf/data/fragment/mixin/fragmentarraymixin.py index d81d4cb453..45c7dcf160 100644 --- a/cf/data/fragment/mixin/fragmentarraymixin.py +++ b/cf/data/fragment/mixin/fragmentarraymixin.py @@ -22,11 +22,11 @@ def _get_array(self, index=None): :Parameters: - index: `tuple` or `None`, optional - Provide the indices that define the subspace. It is - assumed that there is a distinct index for each - fragment dimension. If `None` then the `index` - attribute is used. + {{index: `tuple` or `None`, optional}} + + It is important that there is a distinct value for each + fragment dimension, which is guaranteed when the + default of the `index` attribute is being used. :Returns: @@ -60,7 +60,7 @@ def _get_array(self, index=None): # how many missing dimensions the fragment has, nor # their positions => Get the full fragment array and # then reshape it to the shape of the dask compute - # chunk. + # chunk; and then apply the index. 
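+            # For example (an illustrative sketch with made-up
+            # shapes): a fragment stored with shape (6, 73, 96) that
+            # fills a Dask compute chunk of shape (1, 6, 73, 96) is
+            # read in full, reshaped to (1, 6, 73, 96), and only then
+            # subspaced with the index.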
array = super()._get_array(Ellipsis) if array.size > prod(self.original_shape): raise ValueError( diff --git a/cf/data/fragment/netcdffragmentarray.py b/cf/data/fragment/netcdffragmentarray.py index 31da929126..a8bcb4b3df 100644 --- a/cf/data/fragment/netcdffragmentarray.py +++ b/cf/data/fragment/netcdffragmentarray.py @@ -1,14 +1,16 @@ import cfdm from ..array.abstract import Array -from ..array.mixin import FileArrayMixin +from ..array.mixin import FileArrayMixin, IndexMixin from .h5netcdffragmentarray import H5netcdfFragmentArray from .mixin import FragmentArrayMixin from .netcdf4fragmentarray import NetCDF4FragmentArray +# REVIEW: getitem: `NetCDFFragmentArray`: new inheritance to allow for different netCDF backends class NetCDFFragmentArray( FragmentArrayMixin, + IndexMixin, cfdm.data.mixin.NetCDFFileMixin, FileArrayMixin, cfdm.data.mixin.FileArrayMixin, @@ -16,8 +18,7 @@ class NetCDFFragmentArray( ): """A netCDF fragment array. - Access will be with either `netCDF4` (for local and OPenDAP files) - or `h5netcdf` (for S3 files). + Access will be with either `netCDF4` or `h5netcdf`. .. versionadded:: 3.15.0 @@ -175,16 +176,36 @@ def __init__( # By default, close the file after data array access self._set_component("close", True, copy=False) - # REVIEW: h5: `__getitem__`: new factory method to choose backend - def __getitem__(self, indices): - """Returns a subspace of the fragment as a numpy array. + # REVIEW: getitem: `_get_array`: new method to convert subspace to numpy array + def _get_array(self, index=None): + """Returns a subspace of the dataset variable. - x.__getitem__(indices) <==> x[indices] + The method acts as a factory for either a + `NetCDF4FragmentArray` or a `H5netcdfFragmentArray` class, and + it is the result of calling `!_get_array` on the newly created + instance that is returned. - .. versionadded:: 3.15.0 + `H5netcdfFragmentArray` will only be used if + `NetCDF4FragmentArray` returns a `FileNotFoundError` exception. - """ + .. versionadded:: NEXTVERSION + + .. seealso:: `__array__`, `index` + + :Parameters: + + {{index: `tuple` or `None`, optional}} + It is important that there is a distinct value for each + fragment dimension, which is guaranteed when the + default of the `index` attribute is being used. + + :Returns: + + `numpy.ndarray` + The subspace. + + """ kwargs = { "dtype": self.dtype, "shape": self.shape, @@ -205,11 +226,11 @@ def __getitem__(self, indices): ) try: - return NetCDF4FragmentArray(**kwargs)[indices] + return NetCDF4FragmentArray(**kwargs)._get_array(index) except FileNotFoundError: pass except Exception: - return H5netcdfFragmentArray(**kwargs)[indices] + return H5netcdfFragmentArray(**kwargs)._get_array(index) # Still here? if len(filenames) == 1: diff --git a/cf/docstring/docstring.py b/cf/docstring/docstring.py index 2b1d53d388..77dc33614f 100644 --- a/cf/docstring/docstring.py +++ b/cf/docstring/docstring.py @@ -647,6 +647,10 @@ Array will get further operations which are guaranteed to negate the need for the extra layer in the Dask graph.""", + # _get_array index + "{{index: `tuple` or `None`, optional}}": """index: `tuple` or `None`, optional + Provide the indices that define the subspace. 
If `None`
+                then the `index` attribute is used.""",
    # ----------------------------------------------------------------
    # Method description substitutions (4 levels of indentation)
    # ----------------------------------------------------------------
diff --git a/cf/read_write/read.py b/cf/read_write/read.py
index 88e5abb7c6..d9c5ecfac2 100644
--- a/cf/read_write/read.py
+++ b/cf/read_write/read.py
@@ -64,8 +64,8 @@ def read(
     chunks="auto",
     domain=False,
     cfa=None,
-    # REVIEW: h5: `read`: new 'netcdf_engine' parameter to control how to read files
-    netcdf_engine=None,
+    # REVIEW: h5: `read`: new 'netcdf_backend' parameter to control how to read files
+    netcdf_backend=None,
     # REVIEW: h5: `read`: new 'storage_options' parameter to control access to S3
     storage_options=None,
     # REVIEW: h5: `read`: 'cache' parameter to control whether or not to get to cache selected data elements
@@ -670,31 +670,32 @@ def read(
            A dictionary whose key/value pairs define text
            substitutions to be applied to the fragment file
            names. Each key may be specified with or without the
-           ``${...}`` syntax. For instance, the following are
-           equivalent: ``{'base': 'sub'}``, ``{'${base}': 'sub'}``.
-           The substitutions are used in conjunction with, and take
-           precedence over, any that are stored in the CFA-netCDF
-           file by the ``substitutions`` attribute of the ``file``
-           CFA aggregation instruction variable.
+           ``${*}`` syntax (where `*` represents any number of
+           characters). For instance, ``{'substitution':
+           'replacement'}`` and ``{'${substitution}':
+           'replacement'}`` are equivalent. The substitutions are
+           used in conjunction with, and take precedence over, any
+           that are stored in the CFA-netCDF file by the
+           ``substitutions`` attribute of the ``file`` fragment
+           array variable.

            *Example:*
-             ``{'base': 'file:///data/'}``
+             ``{'replacement': 'file:///data/'}``

            .. versionadded:: 3.15.0

-       netcdf_engine: `None` or `str`, optional
-           Specify which library to use for opening and reading
-           netCDF files. By default, or if `None`, then the first one
-           of `netCDF4` and `h5netcdf` to successfully open the file
-           netCDF file is used. Setting *netcdf_engine* to one of
-           ``'netCDF4'`` and ``'h5netcdf'`` will force the use of
-           that library.
+       netcdf_backend: `None` or `str`, optional
+           Specify which library to use for reading netCDF files. By
+           default, or if `None`, then the first one of `netCDF4` and
+           `h5netcdf` to successfully open the netCDF file is used.
+           Setting *netcdf_backend* to one of ``'netCDF4'`` and
+           ``'h5netcdf'`` will force the use of that library.

-           .. note:: The *netcdf_engine* parameter does not affect
+           .. note:: The *netcdf_backend* parameter does not affect
                      the opening of netCDF fragment files that define
-                     the data of aggregated variables. For these, the
-                     first one of `netCDF4` and `h5netcdf` to
-                     successfully open the file is used.
+                     the data of aggregation variables. For these, it
+                     is always the case that the first one of
+                     `netCDF4` and `h5netcdf` to successfully open
+                     the file is used.

            .. versionadded:: NEXTVERSION

@@ -1048,7 +1049,7 @@ def read(
                 select=select,
                 domain=domain,
                 cfa_options=cfa_options,
-                netcdf_engine=netcdf_engine,
+                netcdf_backend=netcdf_backend,
                 storage_options=storage_options,
                 cache=cache,
             )
@@ -1166,7 +1167,7 @@ def _read_a_file(
     select=None,
     domain=False,
     cfa_options=None,
-    netcdf_engine=None,
+    netcdf_backend=None,
     storage_options=None,
     cache=True,
 ):
@@ -1212,7 +1213,7 @@ def _read_a_file(
        ..
versionadded:: NEXTVERSION - netcdf_engine: `str` or `None`, optional + netcdf_backend: `str` or `None`, optional See `cf.read` for details. .. versionadded:: NEXTVERSION @@ -1298,7 +1299,7 @@ def _read_a_file( warn_valid=warn_valid, domain=domain, storage_options=storage_options, - netcdf_engine=netcdf_engine, + netcdf_engine=netcdf_backend, ) except MaskError: # Some data required for field interpretation is missing, From bd625f5cacee90cc7fe8f60b2eb724bc1f600154 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 25 Apr 2024 21:01:51 +0100 Subject: [PATCH 093/134] dev --- cf/data/array/mixin/indexmixin.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/cf/data/array/mixin/indexmixin.py b/cf/data/array/mixin/indexmixin.py index 92306be0be..6dbd56c624 100644 --- a/cf/data/array/mixin/indexmixin.py +++ b/cf/data/array/mixin/indexmixin.py @@ -112,8 +112,8 @@ def __getitem__(self, index): if isinstance(ind0, Integral): # The previous call to __getitem__ resulted in a # dimension being removed (i.e. 'ind0' is - # integer-valued). Therefore 'index1' must have have - # fewer elements than 'index0', so we need to "carry + # integer-valued). Therefore 'index1' must have fewer + # elements than 'index0', so we need to "carry # forward" the integer-valued index so that it is # available at evaluation time. new_indices.append(ind0) @@ -139,10 +139,9 @@ def __getitem__(self, index): # computed as part of the whole graph execution; # i.e. we don't have to worry about a # compute-within-a-compute situation. (If this - # were not the case then we could get round it - # by wrapping the compute inside a `with - # dask.config.set({"scheduler": - # "synchronous"}):`.) + # were not the case then we could add + # `scheduler="synchronous"` to the compute + # call.) ind1 = ind1.compute() if isinstance(ind0, slice): @@ -174,8 +173,8 @@ def __getitem__(self, index): # ind1: int, or array of int/bool new_index = np.arange(*ind0.indices(original_size))[ind1] else: - # ind0: array of int. If we made it here then it can't - # be anything else. This is + # ind0: array of int. If we made it to here then it + # can't be anything else. This is # because we've dealt with ind0 # being a slice or an int, the # very first ind0 is always @@ -305,10 +304,13 @@ def index(self, conform=True): # Still here? Then conform the indices by: # - # 1) Converting decreasing size 1 slices to increasing ones. + # 1) Converting decreasing size 1 slices to increasing + # ones. This helps when the parent class can't cope with + # decreasing slices. # # 2) Converting, where possible, sequences of integers to - # slices. + # slices. This helps when the parent class can't cope with + # indices that are sequences of integers. 
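+        # For example (an illustrative sketch): with an original
+        # axis size of 6, the decreasing size 1 slice slice(5, 4, -1)
+        # is converted to the increasing slice slice(5, 6, 1), and
+        # the integer sequence [2, 3, 4] is converted to
+        # slice(2, 5, 1).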
        ind = list(ind)
        for n, (i, size) in enumerate(zip(ind[:], self.original_shape)):
            if isinstance(i, slice):
@@ -356,7 +358,7 @@ def original_shape(self):
         """
         out = self._custom.get("original_shape")
         if out is None:
-            # If None then no subspace has been defined
+            # No subspace has been defined yet
             out = self.shape
             self._custom["original_shape"] = out

From d05c50b98d8429707cabea166a0d3156a0dda36e Mon Sep 17 00:00:00 2001
From: David Hassell
Date: Thu, 2 May 2024 09:52:04 +0100
Subject: [PATCH 094/134] dev

---
 cf/data/fragment/fragmentarray.py | 226 ++++++++++++++++++++++++++++++
 1 file changed, 226 insertions(+)
 create mode 100644 cf/data/fragment/fragmentarray.py

diff --git a/cf/data/fragment/fragmentarray.py b/cf/data/fragment/fragmentarray.py
new file mode 100644
index 0000000000..0de45546c0
--- /dev/null
+++ b/cf/data/fragment/fragmentarray.py
@@ -0,0 +1,226 @@
+import cfdm
+
+from ..array.abstract import Array
+from ..array.mixin import FileArrayMixin, IndexMixin
+from .h5netcdffragmentarray import H5netcdfFragmentArray
+from .mixin import FragmentArrayMixin
+from .netcdf4fragmentarray import NetCDF4FragmentArray
+from .umfragmentarray import UMFragmentArray
+
+
+_fragment = {'netCDF4': NetCDF4FragmentArray,
+             'h5netcdf': H5netcdfFragmentArray,
+             'um': UMFragmentArray}
+
+# REVIEW: TODO getitem: `NetCDFFragmentArray`: new inheritance to allow for different netCDF backends
+class FragmentArray(
+    FragmentArrayMixin,
+    IndexMixin,
+    FileArrayMixin,
+    cfdm.data.mixin.FileArrayMixin,
+    Array,
+):
+    """A fragment array.
+
+    Access will be with `netCDF4`, `h5netcdf`, or the UM backend.
+
+    .. versionadded:: NEXTVERSION
+
+    """
+
+    # REVIEW: h5: `__init__`: replace units/calendar API with attributes
+    def __init__(
+        self,
+        filename=None,
+        address=None,
+        dtype=None,
+        shape=None,
+        aggregated_units=False,
+        aggregated_calendar=False,
+        attributes=None,
+        storage_options=None,
+        source=None,
+        copy=True,
+    ):
+        """**Initialisation**
+
+        :Parameters:
+
+            filename: (sequence of `str`), optional
+                The locations of the fragment datasets containing the
+                array.
+
+            address: (sequence of `str`), optional
+                How to find the fragments in the fragment datasets.
+
+            dtype: `numpy.dtype`, optional
+                The data type of the aggregated array. May be `None`
+                if it is not known. This may differ from the data
+                type of the fragment's data.
+
+            shape: `tuple`, optional
+                The shape of the fragment in its canonical form.
+
+            {{init attributes: `dict` or `None`, optional}}
+
+                If *attributes* is `None`, the default, then the
+                attributes will be set from the fragment dataset
+                during the first `__getitem__` call.
+ + {{aggregated_units: `str` or `None`, optional}} + + {{aggregated_calendar: `str` or `None`, optional}} + + {{init storage_options: `dict` or `None`, optional}} + + {{init source: optional}} + + {{init copy: `bool`, optional}} + + """ + super().__init__( + source=source, + copy=copy, + ) + + if source is not None: + try: + shape = source._get_component("shape", None) + except AttributeError: + shape = None + + try: + filename = source._get_component("filename", None) + except AttributeError: + filename = None + + try: + address = source._get_component("address", None) + except AttributeError: + address = None + + try: + dtype = source._get_component("dtype", None) + except AttributeError: + dtype = None + + try: + attributes = source._get_component("attributes", None) + except AttributeError: + attributes = None + + try: + aggregated_units = source._get_component( + "aggregated_units", False + ) + except AttributeError: + aggregated_units = False + + try: + aggregated_calendar = source._get_component( + "aggregated_calendar", False + ) + except AttributeError: + aggregated_calendar = False + + try: + storage_options = source._get_component( + "storage_options", None + ) + except AttributeError: + storage_options = None + + if filename is not None: + if isinstance(filename, str): + filename = (filename,) + else: + filename = tuple(filename) + + self._set_component("filename", filename, copy=False) + + if address is not None: + if isinstance(address, int): + address = (address,) + else: + address = tuple(address) + + self._set_component("address", address, copy=False) + + if storage_options is not None: + self._set_component("storage_options", storage_options, copy=False) + + self._set_component("shape", shape, copy=False) + self._set_component("dtype", dtype, copy=False) + self._set_component("attributes", attributes, copy=False) + self._set_component("mask", True, copy=False) + + self._set_component("aggregated_units", aggregated_units, copy=False) + self._set_component( + "aggregated_calendar", aggregated_calendar, copy=False + ) + + # By default, close the file after data array access + self._set_component("close", True, copy=False) + + # REVIEW: getitem: `_get_array`: new method to convert subspace to numpy array + def _get_array(self, index=None): + """Returns a subspace of the dataset variable. + + The method acts as a factory for either a + `NetCDF4FragmentArray`, `H5netcdfFragmentArray`, or + `UMFragmentArray` class, and it is the result of calling + `!_get_array` on the newly created instance that is returned. + + `H5netcdfFragmentArray` will only be used if + `NetCDF4FragmentArray` returns a `FileNotFoundError` + exception; and `UMFragmentArray` will only be used + if `H5netcdfFragmentArray` returns an `Exception`. + + .. versionadded:: NEXTVERSION + + .. seealso:: `__array__`, `index` + + :Parameters: + + {{index: `tuple` or `None`, optional}} + + When a `tuple`, there must be a distinct entry for each + fragment dimension. + + :Returns: + + `numpy.ndarray` + The subspace. + + """ + kwargs = { + "dtype": self.dtype, + "shape": self.shape, + "aggregated_units": self.get_aggregated_units(None), + "aggregated_calendar": self.get_aggregated_calendar(None), + "attributes": self.get_attributes(None), + "copy": False, + } + + # Loop round the files, returning as soon as we find one that + # is accessible. 
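+        # For each file, every backend named in 'dataset_backends'
+        # (assumed here to be a sequence of keys of the '_fragment'
+        # mapping defined above, e.g. ('netCDF4', 'h5netcdf', 'um'))
+        # is tried in turn, and the result from the first
+        # backend/file combination that succeeds is returned.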
+ filenames = self.get_filenames() + for filename, address in zip(filenames, self.get_addresses()): + kwargs["filename"] = filename + kwargs["address"] = address + kwargs["storage_options"] = self.get_storage_options( + create_endpoint_url=False + ) + + for backend in dataset_backends: + try: + return _fragment[backend](**kwargs)._get_array(index) + except FileNotFoundError: + pass + except KeyError: + raise ValueError("unknown backend: T sadasds TODO") + + # Still here? + if len(filenames) == 1: + raise FileNotFoundError(f"No such fragment file: {filenames[0]}") + + raise FileNotFoundError(f"No such fragment files: {filenames}") From 9b56aaead32a24c63d79f9705c31612221e59d9f Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 10 Jul 2024 11:30:29 +0100 Subject: [PATCH 095/134] engine -> backend --- cf/read_write/read.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cf/read_write/read.py b/cf/read_write/read.py index d9c5ecfac2..1baf002358 100644 --- a/cf/read_write/read.py +++ b/cf/read_write/read.py @@ -1299,7 +1299,7 @@ def _read_a_file( warn_valid=warn_valid, domain=domain, storage_options=storage_options, - netcdf_engine=netcdf_backend, + netcdf_backend=netcdf_backend, ) except MaskError: # Some data required for field interpretation is missing, From 88cdbe6bbf66a170bda3d9ae173fab8b19f7fe6a Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 15 Jul 2024 10:07:58 +0100 Subject: [PATCH 096/134] dev --- cf/data/fragment/fragmentarray.py | 5 ++--- cf/test/individual_tests.sh | 6 +++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/cf/data/fragment/fragmentarray.py b/cf/data/fragment/fragmentarray.py index 0de45546c0..6836eeeed9 100644 --- a/cf/data/fragment/fragmentarray.py +++ b/cf/data/fragment/fragmentarray.py @@ -7,7 +7,6 @@ from .netcdf4fragmentarray import NetCDF4FragmentArray from .umfragmentarray import UMFragmentArray - _fragment = {'netCDF4': NetCDF4FragmentArray 'h5netcdf': H5netcdfFragmentArray, 'um': UMFragmentArray} @@ -210,7 +209,7 @@ def _get_array(self, index=None): kwargs["storage_options"] = self.get_storage_options( create_endpoint_url=False ) - + for backend in dataset_backends: try: return _fragment[backend](**kwargs)._get_array(index) @@ -218,7 +217,7 @@ def _get_array(self, index=None): pass except KeyError: raise ValueError("unknown backend: T sadasds TODO") - + # Still here? if len(filenames) == 1: raise FileNotFoundError(f"No such fragment file: {filenames[0]}") diff --git a/cf/test/individual_tests.sh b/cf/test/individual_tests.sh index fea95ef58b..425c7dd435 100755 --- a/cf/test/individual_tests.sh +++ b/cf/test/individual_tests.sh @@ -5,9 +5,9 @@ do echo "Running $file" python $file rc=$? 
- if [[ $rc != 0 ]]; then - exit $rc - fi +# if [[ $rc != 0 ]]; then +# exit $rc +# fi done file=setup_create_field.py From a1dc78f32929faf0adb8d5723d5c24cf0e96e879 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 19 Jul 2024 08:26:51 +0100 Subject: [PATCH 097/134] new non-dask code start --- cf/data/collapse/collapse_active.py | 71 ++++++++++++++++++++++------- 1 file changed, 54 insertions(+), 17 deletions(-) diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index 3422a74f43..bd0fed8412 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -1,6 +1,7 @@ # REVIEW: active: `collapse_active.py`: new module for active storage functionality import logging from functools import wraps +from numbers import Integral try: from activestorage import Active @@ -59,14 +60,47 @@ def active_chunk(method, x, **kwargs): {'N': 7008, 'sum': 7006221.66903949} """ - if kwargs.get("computing_meta"): - return x + # Return None if active storage reduction is not approriate, or + # raise an ActiveStorageError it is appropriate but can't/didn't + # work + if not cf_active_storage(): + return weighted = kwargs.get("weights") is not None if weighted: - raise ValueError(f"Can't do weighted {method!r} active reductions") + return + + axis = kwargs.get("axis") + if axis is not None: + if isinstance(axis, Integral): + axis = (axis,) + + if len(axis) < x.ndim: + return + + try: + filename = x.get_filename() + except AttributeError: + # This Dask chunk is not a data definition + return + else: + if not filename: + # This data definition doesn't have any files, so can't + # support active storage reductions. + return + + if hasattr(x, "actify"): + url = active_storage_url().value + if url is None: + raise ActiveStorageError("No active storage URL") + + x = x.actify(url) + + # Still here? Then do active storage reduction + if kwargs.get("computing_meta"): + return x - filename = x.get_filename() + # filename = x.get_filename() filename = "/".join(filename.split("/")[3:]) max_threads = 100 @@ -75,7 +109,7 @@ def active_chunk(method, x, **kwargs): "uri": filename, "ncvar": x.get_address(), "storage_options": x.get_storage_options(), - "active_storage_url": x.get_active_storage_url(), + "active_storage_url": url, # x.get_active_storage_url(), "storage_type": "s3", # Temporary requirement! "max_threads": max_threads, } @@ -345,19 +379,22 @@ def active_storage_chunk(method): def decorator(chunk_function): @wraps(chunk_function) def wrapper(*args, **kwargs): - if args: - x = args[0] + # if args: + # x = args[0] + # else: + # x = kwargs["x"] + # + # if getattr(x, "actified", False): + try: + # Try doing an active storage reduction on + # actified chunk data + out = active_chunk(method, *args, **kwargs) + except ActiveStorageError as warning: + # The active storage reduction failed + logger.warning(f"{warning}. Reverting to local reduction.") else: - x = kwargs["x"] - - if getattr(x, "actified", False): - try: - # Try doing an active storage reduction on - # actified chunk data - return active_chunk(method, *args, **kwargs) - except ActiveStorageError as error: - # The active storage reduction failed - logger.warning(f"{error}. Reverting to local reduction.") + if out is not None: + return out # Still here? Then do a local reduction. 
return chunk_function(*args, **kwargs) From dc4ce6ffbb98d0c8dd14123e832b8df56ebd54c7 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 19 Jul 2024 09:27:36 +0100 Subject: [PATCH 098/134] dev --- cf/data/array/mixin/activestoragemixin.py | 17 +++++++ cf/data/collapse/collapse.py | 3 +- cf/data/collapse/collapse_active.py | 58 ++++++++++++++--------- 3 files changed, 55 insertions(+), 23 deletions(-) diff --git a/cf/data/array/mixin/activestoragemixin.py b/cf/data/array/mixin/activestoragemixin.py index ab41ef7fb9..bc14c6d518 100644 --- a/cf/data/array/mixin/activestoragemixin.py +++ b/cf/data/array/mixin/activestoragemixin.py @@ -12,6 +12,23 @@ class ActiveStorageMixin: """ + @property + def is_actifiable(self): + """Whether active storage operations are possible. + + .. versionadded:: NEXTVERSION + + .. seealso:: `actify`, `get_active_storage_url` + + :Returns: + + `bool` + `True` if active stoage operations are possible, + otherwise `False`. + + """ + return self.get_filename(None) is not None + @property def actified(self): """Whether active storage operations are possible. diff --git a/cf/data/collapse/collapse.py b/cf/data/collapse/collapse.py index 587a80a33b..196608ee94 100644 --- a/cf/data/collapse/collapse.py +++ b/cf/data/collapse/collapse.py @@ -224,7 +224,7 @@ def max_abs( ) # REVIEW: active: `mean`: active storage decoration, new keyword 'active_stoarage' - @active_storage("mean") +# @active_storage("mean") def mean( self, a, @@ -274,6 +274,7 @@ def mean( The collapsed array. """ + print ('KKKKKKKKKK') from .dask_collapse import cf_mean_agg, cf_mean_chunk, cf_mean_combine if chunk_function is None: diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index bd0fed8412..fce3f6d4a3 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -60,9 +60,13 @@ def active_chunk(method, x, **kwargs): {'N': 7008, 'sum': 7006221.66903949} """ - # Return None if active storage reduction is not approriate, or - # raise an ActiveStorageError it is appropriate but can't/didn't - # work + # Return None if active storage reduction is not approriate: + # + # * The reduction is weighted + # * The reduction is over a subset of axes + # * x.is_actifiable is False + if not kwargs.get("computing_meta"): + print(method, repr(x), kwargs) if not cf_active_storage(): return @@ -78,29 +82,34 @@ def active_chunk(method, x, **kwargs): if len(axis) < x.ndim: return - try: - filename = x.get_filename() - except AttributeError: - # This Dask chunk is not a data definition +# try: +# filename = x.get_filename() +# except AttributeError: +# # This Dask chunk is not a data definition +# return +# else: +# if not filename: +# # This data definition doesn't have any files, so can't +# # support active storage reductions. +# return + + if not getattr(x, "is_actifiable", False): + print ('not is act', axis) return - else: - if not filename: - # This data definition doesn't have any files, so can't - # support active storage reductions. - return - - if hasattr(x, "actify"): - url = active_storage_url().value - if url is None: - raise ActiveStorageError("No active storage URL") - - x = x.actify(url) - + + # Raise an ActiveStorageError the active storage reduction can't + # happen + url = active_storage_url().value + if url is None: + raise ActiveStorageError("No active storage URL") + + x = x.actify(url) + # Still here? 
Then do active storage reduction if kwargs.get("computing_meta"): return x - # filename = x.get_filename() + filename = x.get_filename() filename = "/".join(filename.split("/")[3:]) max_threads = 100 @@ -124,6 +133,8 @@ def active_chunk(method, x, **kwargs): import datetime import time + # Raise an ActiveStorageError active storage reductions fail + # whilst happening try: lock = False # True #False if lock: @@ -151,6 +162,7 @@ def active_chunk(method, x, **kwargs): f"maxT={max_threads}", ) except Exception as error: + print ('565') raise ActiveStorageError(error) # Reformat the components dictionary to match the output of the @@ -391,7 +403,9 @@ def wrapper(*args, **kwargs): out = active_chunk(method, *args, **kwargs) except ActiveStorageError as warning: # The active storage reduction failed - logger.warning(f"{warning}. Reverting to local reduction.") + logger.warning( + f"Dask chunk reverting to local reduction: {warning}" + ) else: if out is not None: return out From eff61c1c7aa575fe74c21d4968dbbb13c1c38804 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 19 Jul 2024 11:00:06 +0100 Subject: [PATCH 099/134] dev --- cf/data/collapse/collapse.py | 64 +++++++++--------- cf/data/data.py | 125 ++++++++++++++++++----------------- cf/data/utils.py | 2 +- 3 files changed, 97 insertions(+), 94 deletions(-) diff --git a/cf/data/collapse/collapse.py b/cf/data/collapse/collapse.py index 196608ee94..312f9b8c81 100644 --- a/cf/data/collapse/collapse.py +++ b/cf/data/collapse/collapse.py @@ -98,7 +98,7 @@ def __docstring_package_depth__(self): return 0 # REVIEW: active: `max`: active storage decoration, new keyword 'active_stoarage' - @active_storage("max") +# @active_storage("max") def max( self, a, @@ -107,7 +107,7 @@ def max( mtol=None, split_every=None, chunk_function=None, - active_storage=False, + # active_storage=False, ): """Return maximum values of an array. @@ -167,7 +167,7 @@ def max( ) # REVIEW: active: `max_abs`: active storage decoration, new keyword 'active_stoarage' - @active_storage("max_abs") +# @active_storage("max_abs") def max_abs( self, a, @@ -176,7 +176,7 @@ def max_abs( mtol=1, split_every=None, chunk_function=None, - active_storage=False, + # active_storage=False, ): """Return maximum absolute values of an array. @@ -220,7 +220,7 @@ def max_abs( keepdims=keepdims, mtol=mtol, split_every=split_every, - active_storage=False, + # active_storage=False, ) # REVIEW: active: `mean`: active storage decoration, new keyword 'active_stoarage' @@ -234,7 +234,7 @@ def mean( mtol=None, split_every=None, chunk_function=None, - active_storage=False, + # active_storage=False, ): """Return mean values of an array. @@ -298,7 +298,7 @@ def mean( ) # REVIEW: active: `mean_abs`: active storage decoration, new keyword 'active_stoarage' - @active_storage("mean_abs") +# @active_storage("mean_abs") def mean_abs( self, a, @@ -308,7 +308,7 @@ def mean_abs( mtol=None, split_every=None, chunk_function=None, - active_storage=False, + # active_storage=False, ): """Return mean absolute values of an array. @@ -355,11 +355,11 @@ def mean_abs( keepdims=keepdims, mtol=mtol, split_every=split_every, - active_storage=False, + # active_storage=False, ) # REVIEW: active: `mid_range`: active storage decoration, new keyword 'active_stoarage' - @active_storage("mid_range") +# @active_storage("mid_range") def mid_range( self, a, @@ -369,7 +369,7 @@ def mid_range( mtol=None, split_every=None, chunk_function=None, - active_storage=False, + # active_storage=False, ): """Return mid-range values of an array. 
@@ -433,7 +433,7 @@ def mid_range( ) # REVIEW: active: `min`: active storage decoration, new keyword 'active_stoarage' - @active_storage("min") +# @active_storage("min") def min( self, a, @@ -442,7 +442,7 @@ def min( mtol=None, split_every=None, chunk_function=None, - active_storage=False, + # active_storage=False, ): """Return minimum values of an array. @@ -502,7 +502,7 @@ def min( ) # REVIEW: active: `min_abs`: active storage decoration, new keyword 'active_stoarage' - @active_storage("min_abs") +# @active_storage("min_abs") def min_abs( self, a, @@ -511,7 +511,7 @@ def min_abs( mtol=None, split_every=None, chunk_function=None, - active_storage=False, + # active_storage=False, ): """Return minimum absolute values of an array. @@ -555,11 +555,11 @@ def min_abs( keepdims=keepdims, mtol=mtol, split_every=split_every, - active_storage=False, + # active_storage=False, ) # REVIEW: active: `range`: active storage decoration, new keyword 'active_stoarage' - @active_storage("range") +# @active_storage("range") def range( self, a, @@ -568,7 +568,7 @@ def range( mtol=None, split_every=None, chunk_function=None, - active_storage=False, + # active_storage=False, ): """Return range values of an array. @@ -632,7 +632,7 @@ def range( ) # REVIEW: active: `rms`: active storage decoration, new keyword 'active_stoarage' - @active_storage("rms") +# @active_storage("rms") def rms( self, a, @@ -642,7 +642,7 @@ def rms( mtol=None, split_every=None, chunk_function=None, - active_storage=False, + # active_storage=False, ): """Return root mean square (RMS) values of an array. @@ -705,7 +705,7 @@ def rms( ) # REVIEW: active: `sample_size`: active storage decoration, new keyword 'active_stoarage' - @active_storage("sample_size") +# @active_storage("sample_size") def sample_size( self, a, @@ -714,7 +714,7 @@ def sample_size( mtol=None, split_every=None, chunk_function=None, - active_storage=False, + # active_storage=False, ): """Return sample size values of an array. @@ -778,7 +778,7 @@ def sample_size( ) # REVIEW: active: `sum`: active storage decoration, new keyword 'active_stoarage' - @active_storage("sum") +# @active_storage("sum") def sum( self, a, @@ -788,7 +788,7 @@ def sum( mtol=None, split_every=None, chunk_function=None, - active_storage=False, + # active_storage=False, ): """Return sum values of an array. @@ -854,7 +854,7 @@ def sum( ) # REVIEW: active: `sum_of_weights`: active storage decoration, new keyword 'active_stoarage' - @active_storage("sum_of_weights") +# @active_storage("sum_of_weights") def sum_of_weights( self, a, @@ -864,7 +864,7 @@ def sum_of_weights( mtol=None, split_every=None, chunk_function=None, - active_storage=False, + # active_storage=False, ): """Return sum of weights values for an array. @@ -931,7 +931,7 @@ def sum_of_weights( ) # REVIEW: active: `sum_of_weights2`: active storage decoration, new keyword 'active_stoarage' - @active_storage("sum_of_weights2") +# @active_storage("sum_of_weights2") def sum_of_weights2( self, a, @@ -941,7 +941,7 @@ def sum_of_weights2( mtol=None, split_every=None, chunk_function=None, - active_storage=False, + # active_storage=False, ): """Return sum of squares of weights values for an array. 
@@ -1008,9 +1008,9 @@ def sum_of_weights2( ) # REVIEW: active: `unique`: active storage decoration, new keyword 'active_stoarage' - @active_storage("unique") +# @active_storage("unique") def unique( - self, a, split_every=None, chunk_function=None, active_storage=False + self, a, split_every=None, chunk_function=None, # active_storage=False ): """Return unique elements of the data. @@ -1064,7 +1064,7 @@ def unique( ) # REVIEW: active: `var`: active storage decoration, new keyword 'active_stoarage' - @active_storage("var") +# @active_storage("var") def var( self, a, @@ -1075,7 +1075,7 @@ def var( ddof=None, split_every=None, chunk_function=None, - active_storage=False, + # active_storage=False, ): """Return variances of an array. diff --git a/cf/data/data.py b/cf/data/data.py index c6f3a2fdba..7f79fc7af1 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -104,7 +104,7 @@ _CACHE = 2 # = 0b0010 _CFA = 4 # = 0b0100 # REVIEW: active: `data.py`: Set the active storage status bit mask -_ACTIVE = 8 # = 0b1000 +#_ACTIVE = 8 # = 0b1000 _ALL = 15 # = 0b1111 @@ -468,8 +468,8 @@ def __init__( # compressed input arrays this will contain extra # information, such as a count or index variable. self._set_Array(array) - # Data files are candidates for active storage reductions - self._set_active_storage(True) +# # Data files are candidates for active storage reductions +# self._set_active_storage(True) # Cast the input data as a dask array kwargs = init_options.get("from_array", {}) @@ -1426,10 +1426,10 @@ def _clear_after_dask_update(self, clear=_ALL): # Set the CFA write status to False self._cfa_del_write() - # REVIEW: active: `_clear_after_dask_update`: update active storage status - if clear & _ACTIVE: - # Set active storage to False - self._del_active_storage() +# # REVIEW: active: `_clear_after_dask_update`: update active storage status +# if clear & _ACTIVE: +# # Set active storage to False +# self._del_active_storage() # REVIEW: getitem: `_set_dask`: new keyword 'asanyarray' def _set_dask(self, dx, copy=False, clear=_ALL, asanyarray=False): @@ -1550,32 +1550,32 @@ def _del_dask(self, default=ValueError(), clear=_ALL): self._clear_after_dask_update(clear) return out - # REVIEW: active: `_del_active_storage`: new method `_del_active_storage` - def _del_active_storage(self): - """Set the active storage reduction status to False. - - .. versionadded:: NEXTVERSION - - .. seealso:: `active_storage`, `_set_active_storage` - - :Returns: - - `None` - - **Examples** - - >>> d = cf.Data([9]) - >>> d.active_storage() - False - >>> d._set_active_storage(True) - >>> d.active_storage() - True - >>> d._del_active_storage() - >>> d.active_storage() - False - - """ - self._custom.pop("active_storage", False) +# # REVIEW: active: `_del_active_storage`: new method `_del_active_storage` +# def _del_active_storage(self): +# """Set the active storage reduction status to False. +# +# .. versionadded:: NEXTVERSION +# +# .. seealso:: `active_storage`, `_set_active_storage` +# +# :Returns: +# +# `None` +# +# **Examples** +# +# >>> d = cf.Data([9]) +# >>> d.active_storage() +# False +# >>> d._set_active_storage(True) +# >>> d.active_storage() +# True +# >>> d._del_active_storage() +# >>> d.active_storage() +# False +# +# """ +# self._custom.pop("active_storage", False) def _del_cached_elements(self): """Delete any cached element values. 
@@ -1636,32 +1636,32 @@ def _is_abstract_Array_subclass(self, array): """ return isinstance(array, cfdm.Array) - # REVIEW: active: `_set_active_storage`: new method `_set_active_storage` - def _set_active_storage(self, value): - """Set the active storage reduction status. - - .. versionadded:: NEXTVERSION - - .. seealso:: `active_storage`, `_del_active_storage` - - :Returns: - - `None` - - **Examples** - - >>> d = cf.Data([9]) - >>> d.active_storage() - False - >>> d._set_active_storage(True) - >>> d.active_storage() - True - >>> d._del_active_storage() - >>> d.active_storage() - False - - """ - self._custom["active_storage"] = bool(value) +# # REVIEW: active: `_set_active_storage`: new method `_set_active_storage` +# def _set_active_storage(self, value): +# """Set the active storage reduction status. +# +# .. versionadded:: NEXTVERSION +# +# .. seealso:: `active_storage`, `_del_active_storage` +# +# :Returns: +# +# `None` +# +# **Examples** +# +# >>> d = cf.Data([9]) +# >>> d.active_storage() +# False +# >>> d._set_active_storage(True) +# >>> d.active_storage() +# True +# >>> d._del_active_storage() +# >>> d.active_storage() +# False +# +# """ +# self._custom["active_storage"] = bool(value) def _set_cached_elements(self, elements): """Cache selected element values. @@ -3317,10 +3317,13 @@ def rechunk( dx = d.to_dask_array(asanyarray=False) dx = dx.rechunk(chunks, threshold, block_size_limit, balance) - # REVIEW: active: `rechunk`: Do not change active storage status after a rechunk +# # REVIEW: active: `rechunk`: Do not change active storage status after a rechunk d._set_dask( - dx, clear=_ALL ^ _ARRAY ^ _CACHE ^ _ACTIVE, asanyarray=True + dx, clear=_ALL ^ _ARRAY ^ _CACHE, asanyarray=True ) +# d._set_dask( +# dx, clear=_ALL ^ _ARRAY ^ _CACHE ^ _ACTIVE, asanyarray=True +# ) return d diff --git a/cf/data/utils.py b/cf/data/utils.py index 942f3425f9..66fb65c8ad 100644 --- a/cf/data/utils.py +++ b/cf/data/utils.py @@ -864,7 +864,7 @@ def collapse( "split_every": split_every, "mtol": mtol, # REVIEW: active: `collapse`: pass the active storage status onto the collapse functions - "active_storage": d.active_storage, +# "active_storage": d.active_storage, } weights = parse_weights(d, weights, axis) From 03eeb8cc0b539ff693946c629f4621c406e1c14f Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 19 Jul 2024 13:54:32 +0100 Subject: [PATCH 100/134] dev --- cf/data/array/mixin/activestoragemixin.py | 188 +++++------ cf/data/collapse/__init__.py | 3 - cf/data/collapse/collapse.py | 110 +------ cf/data/collapse/collapse_active.py | 376 ++++++---------------- cf/data/collapse/dask_collapse.py | 24 +- cf/data/data.py | 117 +------ cf/functions.py | 28 ++ cf/test/test_active_storage.py | 2 +- 8 files changed, 238 insertions(+), 610 deletions(-) diff --git a/cf/data/array/mixin/activestoragemixin.py b/cf/data/array/mixin/activestoragemixin.py index bc14c6d518..571efaf065 100644 --- a/cf/data/array/mixin/activestoragemixin.py +++ b/cf/data/array/mixin/activestoragemixin.py @@ -1,113 +1,119 @@ # REVIEW: active: `ActiveStorageMixin`: new mixin class `ActiveStorageMixin` -try: - from activestorage import Active -except ModuleNotFoundError: - Active = None +#try: +# from activestorage import Active +#except ModuleNotFoundError: +# Active = None class ActiveStorageMixin: - """Mixin class for enabling active storage reductions. + """Mixin class for enabling active storage operations. .. versionadded:: NEXTVERSION """ @property - def is_actifiable(self): - """Whether active storage operations are possible. 
+ def active_storage(self): + """Whether active storage operations are allowed. - .. versionadded:: NEXTVERSION - - .. seealso:: `actify`, `get_active_storage_url` - - :Returns: - - `bool` - `True` if active stoage operations are possible, - otherwise `False`. - - """ - return self.get_filename(None) is not None - - @property - def actified(self): - """Whether active storage operations are possible. + Currently, active storage operations are allowed unless the + data are numerically packed. .. versionadded:: NEXTVERSION - .. seealso:: `actify`, `get_active_storage_url` - :Returns: `bool` - `True` if active stoage operations are possible, + `True` if active storage operations are allowed, otherwise `False`. """ - return self.get_active_storage_url() is not None - - def actify(self, active_storage_url): - """Return a new actified `{{class}}` instance. - - The new instance is a deep copy of the original, with the - additional setting of the active storage URL. - - .. versionadded:: NEXTVERSION - - .. seealso:: `actified`, `get_active_storage_url` - - :Parameters: - - active_storage_url: `str` or `None`, optional - The URL of the active storage server. If `None` then - `actified` will be `False` - - :Returns: - - `{{class}}` - The new `{{class}}`` instance that ues an active - storage operation. - - """ - # Don't actify when the data are packed. Note: There may come - # a time when activestorage.Active can cope with packed data, - # in which case we can remove this test. attributes = self.get_attributes({}) if "add_offset" in attributes or "scale_factor" in attributes: - raise AttributeError( - "Can't actify {self.__class__.__name__} when " - "the data have been numerically packed" - ) - - if Active is None: - raise AttributeError( - "Can't actify {self.__class__.__name__} when " - "activestorage.Active is not available" - ) - - a = self.copy() - a._custom["active_storage_url"] = active_storage_url - return a - - def get_active_storage_url(self): - """Return the active storage reduction URL. - - An active storage reduction URL is set with `actify`. - - .. versionadded:: NEXTVERSION - - .. seealso:: `actified`, `actify` - - :Returns: - - `str` or `None` - The active storage URL, or `None` if no active storage - reduction is possible. - - **Examples** - - >>> a.get_active_storage() - 'https://183.175.143.286:8080' - - """ - return self._custom.get("active_storage_url") + return False + + return True +# return self.get_filename(None) is not None + +# @property +# def actified(self): +# """Whether active storage operations are possible. +# +# .. versionadded:: NEXTVERSION +# +# .. seealso:: `actify`, `get_active_storage_url` +# +# :Returns: +# +# `bool` +# `True` if active stoage operations are possible, +# otherwise `False`. +# +# """ +# return self.get_active_storage_url() is not None +# +# def actify(self, active_storage_url): +# """Return a new actified `{{class}}` instance. +# +# The new instance is a deep copy of the original, with the +# additional setting of the active storage URL. +# +# .. versionadded:: NEXTVERSION +# +# .. seealso:: `actified`, `get_active_storage_url` +# +# :Parameters: +# +# active_storage_url: `str` or `None`, optional +# The URL of the active storage server. If `None` then +# `actified` will be `False` +# +# :Returns: +# +# `{{class}}` +# The new `{{class}}`` instance that ues an active +# storage operation. +# +# """ +# # Don't actify when the data are packed. 
Note: There may come +# # a time when activestorage.Active can cope with packed data, +# # in which case we can remove this test. +# attributes = self.get_attributes({}) +# if "add_offset" in attributes or "scale_factor" in attributes: +# raise AttributeError( +# "Can't actify {self.__class__.__name__} when " +# "the data have been numerically packed" +# ) +# +# if Active is None: +# raise AttributeError( +# "Can't actify {self.__class__.__name__} when " +# "activestorage.Active is not available" +# ) +# +# a = self.copy() +# a._custom["active_storage_url"] = active_storage_url +# return a +# +# def get_active_storage_url(self): +# """Return the active storage reduction URL. +# +# An active storage reduction URL is set with `actify`. +# +# .. versionadded:: NEXTVERSION +# +# .. seealso:: `actified`, `actify` +# +# :Returns: +# +# `str` or `None` +# The active storage URL, or `None` if no active storage +# reduction is possible. +# +# **Examples** +# +# >>> a.get_active_storage() +# 'https://183.175.143.286:8080' +# +# """ +# return self._custom.get("active_storage_url") diff --git a/cf/data/collapse/__init__.py b/cf/data/collapse/__init__.py index 0fd44052f9..0de12360ea 100644 --- a/cf/data/collapse/__init__.py +++ b/cf/data/collapse/__init__.py @@ -1,4 +1 @@ from .collapse import Collapse - -# REVIEW: active: import active storage functions -from .collapse_active import actify, active_reduction_methods, active_storage diff --git a/cf/data/collapse/collapse.py b/cf/data/collapse/collapse.py index 312f9b8c81..416eecd963 100644 --- a/cf/data/collapse/collapse.py +++ b/cf/data/collapse/collapse.py @@ -5,7 +5,6 @@ from dask.array.reductions import reduction from ...docstring import _docstring_substitution_definitions -from .collapse_active import active_storage from .collapse_utils import check_input_dtype, double_precision_dtype @@ -97,8 +96,6 @@ def __docstring_package_depth__(self): """ return 0 - # REVIEW: active: `max`: active storage decoration, new keyword 'active_stoarage' -# @active_storage("max") def max( self, a, @@ -107,7 +104,6 @@ def max( mtol=None, split_every=None, chunk_function=None, - # active_storage=False, ): """Return maximum values of an array. @@ -135,10 +131,6 @@ def max( {{chunk_function: callable or `None`, optional}} - {{active_storage: `bool`, optional}} - - .. versionadded:: NEXTVERSION - :Returns: `dask.array.Array` @@ -166,8 +158,6 @@ def max( meta=np.array((), dtype=dtype), ) - # REVIEW: active: `max_abs`: active storage decoration, new keyword 'active_stoarage' -# @active_storage("max_abs") def max_abs( self, a, @@ -176,7 +166,6 @@ def max_abs( mtol=1, split_every=None, chunk_function=None, - # active_storage=False, ): """Return maximum absolute values of an array. @@ -204,9 +193,6 @@ def max_abs( {{chunk_function: callable or `None`, optional}} - {{active_storage: `bool`, optional}} - - .. versionadded:: NEXTVERSION :Returns: @@ -220,11 +206,8 @@ def max_abs( keepdims=keepdims, mtol=mtol, split_every=split_every, - # active_storage=False, ) - # REVIEW: active: `mean`: active storage decoration, new keyword 'active_stoarage' -# @active_storage("mean") def mean( self, a, @@ -234,7 +217,6 @@ def mean( mtol=None, split_every=None, chunk_function=None, - # active_storage=False, ): """Return mean values of an array. @@ -264,10 +246,6 @@ def mean( {{chunk_function: callable or `None`, optional}} - {{active_storage: `bool`, optional}} - - .. 
versionadded:: NEXTVERSION - :Returns: `dask.array.Array` @@ -297,8 +275,6 @@ def mean( weights=weights, ) - # REVIEW: active: `mean_abs`: active storage decoration, new keyword 'active_stoarage' -# @active_storage("mean_abs") def mean_abs( self, a, @@ -308,7 +284,6 @@ def mean_abs( mtol=None, split_every=None, chunk_function=None, - # active_storage=False, ): """Return mean absolute values of an array. @@ -338,10 +313,6 @@ def mean_abs( {{chunk_function: callable or `None`, optional}} - {{active_storage: `bool`, optional}} - - .. versionadded:: NEXTVERSION - :Returns: `dask.array.Array` @@ -355,11 +326,8 @@ def mean_abs( keepdims=keepdims, mtol=mtol, split_every=split_every, - # active_storage=False, ) - # REVIEW: active: `mid_range`: active storage decoration, new keyword 'active_stoarage' -# @active_storage("mid_range") def mid_range( self, a, @@ -369,7 +337,6 @@ def mid_range( mtol=None, split_every=None, chunk_function=None, - # active_storage=False, ): """Return mid-range values of an array. @@ -397,10 +364,6 @@ def mid_range( {{chunk_function: callable or `None`, optional}} - {{active_storage: `bool`, optional}} - - .. versionadded:: NEXTVERSION - :Returns: `dask.array.Array` @@ -432,8 +395,6 @@ def mid_range( meta=np.array((), dtype=dtype), ) - # REVIEW: active: `min`: active storage decoration, new keyword 'active_stoarage' -# @active_storage("min") def min( self, a, @@ -442,7 +403,6 @@ def min( mtol=None, split_every=None, chunk_function=None, - # active_storage=False, ): """Return minimum values of an array. @@ -470,10 +430,6 @@ def min( {{chunk_function: callable or `None`, optional}} - {{active_storage: `bool`, optional}} - - .. versionadded:: NEXTVERSION - :Returns: `dask.array.Array` @@ -501,8 +457,6 @@ def min( meta=np.array((), dtype=dtype), ) - # REVIEW: active: `min_abs`: active storage decoration, new keyword 'active_stoarage' -# @active_storage("min_abs") def min_abs( self, a, @@ -511,7 +465,6 @@ def min_abs( mtol=None, split_every=None, chunk_function=None, - # active_storage=False, ): """Return minimum absolute values of an array. @@ -539,10 +492,6 @@ def min_abs( {{chunk_function: callable or `None`, optional}} - {{active_storage: `bool`, optional}} - - .. versionadded:: NEXTVERSION - :Returns: `dask.array.Array` @@ -555,11 +504,8 @@ def min_abs( keepdims=keepdims, mtol=mtol, split_every=split_every, - # active_storage=False, ) - # REVIEW: active: `range`: active storage decoration, new keyword 'active_stoarage' -# @active_storage("range") def range( self, a, @@ -568,7 +514,6 @@ def range( mtol=None, split_every=None, chunk_function=None, - # active_storage=False, ): """Return range values of an array. @@ -596,10 +541,6 @@ def range( {{chunk_function: callable or `None`, optional}} - {{active_storage: `bool`, optional}} - - .. versionadded:: NEXTVERSION - :Returns: `dask.array.Array` @@ -631,8 +572,6 @@ def range( meta=np.array((), dtype=dtype), ) - # REVIEW: active: `rms`: active storage decoration, new keyword 'active_stoarage' -# @active_storage("rms") def rms( self, a, @@ -642,7 +581,6 @@ def rms( mtol=None, split_every=None, chunk_function=None, - # active_storage=False, ): """Return root mean square (RMS) values of an array. @@ -672,9 +610,6 @@ def rms( {{chunk_function: callable or `None`, optional}} - {{active_storage: `bool`, optional}} - - .. 
versionadded:: NEXTVERSION :Returns: @@ -704,8 +639,6 @@ def rms( weights=weights, ) - # REVIEW: active: `sample_size`: active storage decoration, new keyword 'active_stoarage' -# @active_storage("sample_size") def sample_size( self, a, @@ -714,7 +647,6 @@ def sample_size( mtol=None, split_every=None, chunk_function=None, - # active_storage=False, ): """Return sample size values of an array. @@ -742,10 +674,6 @@ def sample_size( {{chunk_function: callable or `None`, optional}} - {{active_storage: `bool`, optional}} - - .. versionadded:: NEXTVERSION - :Returns: `dask.array.Array` @@ -777,8 +705,6 @@ def sample_size( meta=np.array((), dtype=dtype), ) - # REVIEW: active: `sum`: active storage decoration, new keyword 'active_stoarage' -# @active_storage("sum") def sum( self, a, @@ -788,7 +714,6 @@ def sum( mtol=None, split_every=None, chunk_function=None, - # active_storage=False, ): """Return sum values of an array. @@ -818,10 +743,6 @@ def sum( {{chunk_function: callable or `None`, optional}} - {{active_storage: `bool`, optional}} - - .. versionadded:: NEXTVERSION - :Returns: `dask.array.Array` @@ -853,8 +774,6 @@ def sum( weights=weights, ) - # REVIEW: active: `sum_of_weights`: active storage decoration, new keyword 'active_stoarage' -# @active_storage("sum_of_weights") def sum_of_weights( self, a, @@ -864,7 +783,6 @@ def sum_of_weights( mtol=None, split_every=None, chunk_function=None, - # active_storage=False, ): """Return sum of weights values for an array. @@ -894,10 +812,6 @@ def sum_of_weights( {{chunk_function: callable or `None`, optional}} - {{active_storage: `bool`, optional}} - - .. versionadded:: NEXTVERSION - :Returns: `dask.array.Array` @@ -930,8 +844,6 @@ def sum_of_weights( weights=weights, ) - # REVIEW: active: `sum_of_weights2`: active storage decoration, new keyword 'active_stoarage' -# @active_storage("sum_of_weights2") def sum_of_weights2( self, a, @@ -941,7 +853,6 @@ def sum_of_weights2( mtol=None, split_every=None, chunk_function=None, - # active_storage=False, ): """Return sum of squares of weights values for an array. @@ -971,10 +882,6 @@ def sum_of_weights2( {{chunk_function: callable or `None`, optional}} - {{active_storage: `bool`, optional}} - - .. versionadded:: NEXTVERSION - :Returns: `dask.array.Array` @@ -1007,11 +914,7 @@ def sum_of_weights2( weights=weights, ) - # REVIEW: active: `unique`: active storage decoration, new keyword 'active_stoarage' -# @active_storage("unique") - def unique( - self, a, split_every=None, chunk_function=None, # active_storage=False - ): + def unique(self, a, split_every=None, chunk_function=None): """Return unique elements of the data. .. versionadded:: 3.14.0 @@ -1025,10 +928,6 @@ def unique( {{chunk_function: callable or `None`, optional}} - {{active_storage: `bool`, optional}} - - .. versionadded:: NEXTVERSION - :Returns: `dask.array.Array` @@ -1063,8 +962,6 @@ def unique( meta=np.array((), dtype=dtype), ) - # REVIEW: active: `var`: active storage decoration, new keyword 'active_stoarage' -# @active_storage("var") def var( self, a, @@ -1075,7 +972,6 @@ def var( ddof=None, split_every=None, chunk_function=None, - # active_storage=False, ): """Return variances of an array. @@ -1112,10 +1008,6 @@ def var( `cf.data.collapse.dask_collapse.cf_var_chunk` for details. - {{active_storage: `bool`, optional}} - - .. 
versionadded:: NEXTVERSION - :Returns: `dask.array.Array` diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index fce3f6d4a3..31a363c878 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -6,10 +6,10 @@ try: from activestorage import Active except ModuleNotFoundError: - Active = None + pass from ...functions import active_storage as cf_active_storage -from ...functions import active_storage_url +from ...functions import active_storage_url, is_log_level_debug logger = logging.getLogger(__name__) @@ -23,55 +23,72 @@ class ActiveStorageError(Exception): pass -def active_chunk(method, x, **kwargs): +def active_chunk_function(method, x, **kwargs): """Collapse data in a chunk with active storage. + If an active storage reduction is not approriate then `None` is + returned, or else an ActiveStorageError is raised if the active + storage operation fails. + .. versionadded:: NEXTVERSION - .. seealso:: `actify`, `active_storage`, `cf.data.collapse.Collapse` + .. seealso:: `actify` :Parameters: - a: array_like - The data to be collapsed. - method: `str` - The name of the reduction method. If the method does not - have a corresponding active function in the - `active_chunk_functions` dictionary then active storage - computations are not carried out. + The name of the reduction method (e.g. ``'mean'``). - axis: (sequence of) `int`, optional - Axis or axes along which to operate. By default, - flattened input is used. + x: array_like + The data to be collapsed. kwargs: optional Extra keyword arguments that define the reduction. :Returns: - `dict` - The reduced data in component form. + `dict` or `None` + The reduced data in component form, or else `None` if an + active storage reduction is not approriate. **Examples** - >>> d = active_chunk('sum', x) + >>> d = active_chunk_function('sum', x) >>> d {'N': 7008, 'sum': 7006221.66903949} + Active storage reduction is not yet possible for variances: + + >>> d = active_chunk_function('variance', x) + >>> print(d) + None + """ - # Return None if active storage reduction is not approriate: - # - # * The reduction is weighted - # * The reduction is over a subset of axes - # * x.is_actifiable is False - if not kwargs.get("computing_meta"): - print(method, repr(x), kwargs) + if kwargs.get("computing_meta"): + print("COMPUTING_META", method, repr(x), kwargs) + return x + + # Return None if active storage reduction is not appropriate + print(method, repr(x), kwargs) + if not cf_active_storage(): + # Active storage is turned off => do a local reduction return + if method not in active_reduction_methods: + # Active storage is not available for this method => do a + # local reduction + return + + if not getattr(x, "active_storage", False): + # Active storage operations are not allowed on 'x' => do a + # local reduction + return + weighted = kwargs.get("weights") is not None if weighted: + # Active storage is not allowed for weighted reductions => do + # a local reduction return axis = kwargs.get("axis") @@ -80,36 +97,25 @@ def active_chunk(method, x, **kwargs): axis = (axis,) if len(axis) < x.ndim: + # Active storage is not allowed for reductions over a + # subset of the axes => do a local reduction return -# try: -# filename = x.get_filename() -# except AttributeError: -# # This Dask chunk is not a data definition -# return -# else: -# if not filename: -# # This data definition doesn't have any files, so can't -# # support active storage reductions. 
-# return - - if not getattr(x, "is_actifiable", False): - print ('not is act', axis) - return - - # Raise an ActiveStorageError the active storage reduction can't - # happen + # Raise an ActiveStorageError if the active storage reduction can't + # happen or fails url = active_storage_url().value if url is None: + # Active storage operations are not possible when an active + # storage URL has not been set => do a local reduction raise ActiveStorageError("No active storage URL") - - x = x.actify(url) - - # Still here? Then do active storage reduction - if kwargs.get("computing_meta"): - return x - filename = x.get_filename() + # ---------------------------------------------------------------- + # Still here? Set up an Active instance that will carry out the + # active storage operation. + # ---------------------------------------------------------------- + index = x.index() + + filename = x.get_filename() filename = "/".join(filename.split("/")[3:]) max_threads = 100 @@ -123,50 +129,39 @@ def active_chunk(method, x, **kwargs): "max_threads": max_threads, } - if False: - print(f"Active(**{active_kwargs})") - active = Active(**active_kwargs) active.method = method active.components = True + if is_log_level_debug: + logger.debug(f"Active call: Active(**{active_kwargs})[{index}]") + import datetime import time - # Raise an ActiveStorageError active storage reductions fail - # whilst happening + # ---------------------------------------------------------------- + # Execute the active storage operation + # ---------------------------------------------------------------- try: - lock = False # True #False - if lock: - x._lock.acquire() - start = time.time() - print("START LOCKED", x.index(), datetime.datetime.now()) - d = active[x.index()] - print( - "FINISH LOCKED", - x.index(), - datetime.datetime.now(), - time.time() - start, - f"maxT={max_threads}", - ) - x._lock.release() - else: - start = time.time() - print("START unlocked", x.index(), datetime.datetime.now()) - d = active[x.index()] - print( - "FINISH unlocked", - x.index(), - datetime.datetime.now(), - time.time() - start, - f"maxT={max_threads}", - ) + start = time.time() + print("START unlocked", index, datetime.datetime.now()) + d = active[index] + print( + "FINISH unlocked", + datetime.datetime.now(), + time.time() - start, + f"maxT={max_threads}", + ) except Exception as error: + # Something went wrong with the active storage operations => + # do a local reduction print ('565') raise ActiveStorageError(error) + # ---------------------------------------------------------------- # Reformat the components dictionary to match the output of the # corresponding local chunk function + # ---------------------------------------------------------------- if method == "max": # Local chunk function `cf_max_chunk` d = {"N": d["n"], "max": d["max"]} @@ -179,238 +174,61 @@ def active_chunk(method, x, **kwargs): elif method == "sum": # Local chunk function `cf_sum_chunk` d = {"N": d["n"], "sum": d["sum"]} - else: - raise ActiveStorageError( - f"Don't know how to reformat {method!r} components" - ) return d -def actify(a, method, axis=None): - """Modify a Dask array to use active storage reductions. - - The Dask graph is inspected to ensure that active storage - reductions are possible, and if not then the Dask array is - returned unchanged. - - .. note:: It is assumed that the `!active_storage` attribute of - the `Data` object that provided the Dask array *a* is - `True`. If this is not the case then an error at compute - time is likely. 
The value of the `Data` object's - `!active_storage` attribute is registered via the - *active_storage* parameter of `Collapse` methods. - - .. versionadded:: NEXTVERSION - - .. seealso:: `active_storage`, `cf.data.collapse.Collapse` - - :Parameters: - - a: `dask.array.Array` - The array to be collapsed. - - method: `str` - The name of the reduction method. If the method does not - have a corresponding active function in the - `active_chunk_functions` dictionary then active storage - computations are not carried out. - - axis: (sequence of) `int`, optional - Axis or axes along which to operate. By default, - flattened input is used. - - :Returns: - - (`dask.array.Array`, function) or (`dask.array.Array`, `None`) - If active storage operations are possible then return the - modified Dask array and the new chunk reduction - function. Otherwise return the unaltered input array and - `None`. - - """ - import dask.array as da - from dask.base import collections_to_dsk - - if Active is None: - raise AttributeError( - "Can't actify {self.__class__.__name__} when " - "activestorage.Active is not available" - ) - - if method not in active_reduction_methods: - # The method cannot be calculated with active storage, so - # return the input data unchanged. - return a - - url = active_storage_url().value - if url is None: - # TODOACTIVE - return a - - # Parse axis - ndim = a.ndim - if axis is None: - axis = tuple(range(ndim)) - else: - from numbers import Integral - - from dask.array.utils import validate_axis - - if isinstance(axis, Integral): - axis = (axis,) - - axis = validate_axis(axis, ndim) - if len(axis) != ndim or len(set(axis)) != ndim: - # Can't (yet) use active storage to collapse a subset of - # the axes, so return the input data unchanged. - return a - - # Loop round the nodes of the Dask graph looking for data - # definitions that i) point to files, and ii) which support active - # storage operations; and modify the Dask graph when we find them. - # - # The elements are traversed in reverse order so that the data - # definitions will tend to come out first, allowing for the - # potential of a faster short circuit when using active storage is - # not possible. - # - # Performance: The optimising the graph can be slow for - # complicated graphs, but nonetheless is essential to - # ensure that unused nodes are not considered. - ok_to_actify = True - dsk = collections_to_dsk((a,), optimize_graph=True) - for key, value in reversed(dsk.items()): - try: - filename = value.get_filename() - except AttributeError: - # This Dask chunk is not a data definition - continue - - if not filename: - # This data definition doesn't have any files, so can't - # support active storage reductions. - ok_to_actify = False - break - - # Still here? Then this chunk is a data definition that points - # to files, so try to insert an actified copy into the Dask - # graph. - try: - dsk[key] = value.actify(url) - except AttributeError: - # This data definition doesn't support active storage - # reductions - ok_to_actify = False - break - - if not ok_to_actify: - # It turns out that the Dask graph is not suitable for active - # storage reductions, so return the input data unchanged. - return a - - # Still here? Then the Dask graph supports active storage - # reductions => redefine the Dask array from the - # actified Dask graph. 
- logger.warning( - "At compute time, the collapse will be attempted with active " - f"storage at URL {url}" - ) - return da.Array(dsk, a.name, a.chunks, a.dtype, a._meta) - - # -------------------------------------------------------------------- # Decorators # -------------------------------------------------------------------- -def active_storage(method): - """Decorator for active storage reductions on `Collapse` methods. - - When a `Collapse` method is decorated, active storage operations - are carried out if the conditions are right. - - .. versionadded:: NEXTVERSION - - .. seealso:: `actify`, `cf.data.collapse.Collapse` - - :Parameters: - - method: `str` - The name of the reduction method. If it is one of the - `active_chunk_methods` then active storage reductions - *might* occur. - - """ - - def decorator(collapse_method): - @wraps(collapse_method) - def wrapper(self, *args, **kwargs): - if ( - Active is not None - and kwargs.get("active_storage") - and cf_active_storage() - # and active_storage_url() - and method in active_reduction_methods - and kwargs.get("weights") is None - and kwargs.get("chunk_function") is None - ): - # Attempt to actify the Dask array - args = list(args) - if args: - dx = args.pop(0) - else: - dx = kwargs.pop("a") - - dx = actify(dx, method=method, axis=kwargs.get("axis")) - args.insert(0, dx) - - # Run the collapse method - return collapse_method(self, *args, **kwargs) - - return wrapper - - return decorator - - -def active_storage_chunk(method): +def actify(method): """Decorator for active storage reductions on chunks. - Intended for the ``cf_*_chunk`` methods in + Intended for to decorate the ``cf_*_chunk`` methods in cf.data.collapse.dask_collapse`. + When a ``cf_*_chunk`` method is decorated, then its computations + will be carried out in active storage, if that is appropriate and + possible. Whether or not computations are done in active storage + is determined by `active_chunk_function`. + .. versionadded:: NEXTVERSION + .. seealso:: `active_chunk_function` + :Parameters: method: `str` - The name of the reduction method. If it is one of the - `active_chunk_methods` then active storage reductions - *might* occur. + The name of the reduction method. """ def decorator(chunk_function): @wraps(chunk_function) def wrapper(*args, **kwargs): - # if args: - # x = args[0] - # else: - # x = kwargs["x"] - # - # if getattr(x, "actified", False): + + #if args: TODO + # x = args[0] + #else: + # x = kwargs["x"] try: - # Try doing an active storage reduction on - # actified chunk data - out = active_chunk(method, *args, **kwargs) + # Try doing an active storage reduction + print (method, args, kwargs) + out = active_chunk_function(method, *args, **kwargs) except ActiveStorageError as warning: # The active storage reduction failed logger.warning( - f"Dask chunk reverting to local reduction: {warning}" + "Dask chunk failed in active storage reduction => " + f"reverting to local computation: {warning}" ) else: if out is not None: + # The active storage reduction succeeded return out - # Still here? Then do a local reduction. + # Still here? Then using active storage is not + # appropriate, or else doing the active storage operation + # failed => do a local computation. 
return chunk_function(*args, **kwargs) return wrapper diff --git a/cf/data/collapse/dask_collapse.py b/cf/data/collapse/dask_collapse.py index b5eb3add0d..f868bab905 100644 --- a/cf/data/collapse/dask_collapse.py +++ b/cf/data/collapse/dask_collapse.py @@ -18,7 +18,7 @@ from dask.utils import deepmap from ..dask_utils import cf_asanyarray -from .collapse_active import active_storage_chunk +from .collapse_active import actify from .collapse_utils import double_precision_dtype @@ -233,7 +233,7 @@ def sum_sample_sizes(pairs, axis, computing_meta=False, **kwargs): # mean # -------------------------------------------------------------------- # REVIEW: active: `cf_mean_chunk`: active storage decoration -@active_storage_chunk("mean") +@actify("mean") def cf_mean_chunk( x, weights=None, @@ -380,7 +380,7 @@ def cf_mean_agg( # maximum # -------------------------------------------------------------------- # REVIEW: active: `cf_max_chunk`: active storage decoration -@active_storage_chunk("max") +@actify("max") def cf_max_chunk(x, dtype=None, computing_meta=False, **kwargs): """Chunk calculations for the maximum. @@ -535,7 +535,7 @@ def cf_mid_range_agg( # minimum # -------------------------------------------------------------------- # REVIEW: active: `cf_min_chunk`: active storage decoration -@active_storage_chunk("min") +@actify("min") def cf_min_chunk(x, dtype=None, computing_meta=False, **kwargs): """Chunk calculations for the minimum. @@ -642,7 +642,7 @@ def cf_min_agg( # range # -------------------------------------------------------------------- # REVIEW: active: `cf_range_chunk`: active storage decoration -@active_storage_chunk("range") +@actify("range") def cf_range_chunk(x, dtype=None, computing_meta=False, **kwargs): """Chunk calculations for the range. @@ -756,7 +756,7 @@ def cf_range_agg( # root mean square # -------------------------------------------------------------------- # REVIEW: active: `cf_rms_chunk`: active storage decoration -@active_storage_chunk("rms") +@actify("rms") def cf_rms_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): """Chunk calculations for the root mean square (RMS). @@ -841,7 +841,7 @@ def cf_rms_agg( # sample size # -------------------------------------------------------------------- # REVIEW: active: `cf_sample_size_chunk`: active storage decoration -@active_storage_chunk("sample_size") +@actify("sample_size") def cf_sample_size_chunk(x, dtype="i8", computing_meta=False, **kwargs): """Chunk calculations for the sample size. 
@@ -955,7 +955,7 @@ def cf_sample_size_agg( # sum # -------------------------------------------------------------------- # REVIEW: active: `cf_sum_chunk`: active storage decoration -@active_storage_chunk("sum") +@actify("sum") def cf_sum_chunk( x, weights=None, @@ -1091,7 +1091,7 @@ def cf_sum_agg( # sum of weights # -------------------------------------------------------------------- # REVIEW: active: `cf_sum_of_weights_chunk`: active storage decoration -@active_storage_chunk("sum_of_weights") +@actify("sum_of_weights") def cf_sum_of_weights_chunk( x, weights=None, dtype="f8", computing_meta=False, **kwargs ): @@ -1135,7 +1135,7 @@ def cf_sum_of_weights_chunk( # sum of squares of weights # -------------------------------------------------------------------- # REVIEW: active: `cf_sum_of_weights2_chunk`: active storage decoration -@active_storage_chunk("sum_of_weights2") +@actify("sum_of_weights2") def cf_sum_of_weights2_chunk( x, weights=None, dtype="f8", computing_meta=False, **kwargs ): @@ -1181,7 +1181,7 @@ def cf_sum_of_weights2_chunk( # unique # -------------------------------------------------------------------- # REVIEW: active: `cf_unique_chunk`: active storage decoration -@active_storage_chunk("unique") +@actify("unique") def cf_unique_chunk(x, dtype=None, computing_meta=False, **kwargs): """Chunk calculations for the unique values. @@ -1246,7 +1246,7 @@ def cf_unique_agg(pairs, axis=None, computing_meta=False, **kwargs): # variance # -------------------------------------------------------------------- # REVIEW: active: `cf_var_chunk`: active storage decoration -@active_storage_chunk("var") +@actify("var") def cf_var_chunk( x, weights=None, dtype="f8", computing_meta=False, ddof=None, **kwargs ): diff --git a/cf/data/data.py b/cf/data/data.py index 7f79fc7af1..b22689f5d8 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -103,8 +103,6 @@ _ARRAY = 1 # = 0b0001 _CACHE = 2 # = 0b0010 _CFA = 4 # = 0b0100 -# REVIEW: active: `data.py`: Set the active storage status bit mask -#_ACTIVE = 8 # = 0b1000 _ALL = 15 # = 0b1111 @@ -462,14 +460,11 @@ def __init__( except AttributeError: pass - # REVIEW: active: `__init__`: set the active storage status to True for Array subclasses if self._is_abstract_Array_subclass(array): # Save the input array in case it's useful later. For # compressed input arrays this will contain extra # information, such as a count or index variable. self._set_Array(array) -# # Data files are candidates for active storage reductions -# self._set_active_storage(True) # Cast the input data as a dask array kwargs = init_options.get("from_array", {}) @@ -963,7 +958,6 @@ def __getitem__(self, indices): "Non-orthogonal indexing has not yet been implemented" ) - # REVIEW: active `__getitem__`: subspacing does not affect active storage status # REVIEW: getitem: `__getitem__`: set 'asanyarray=True' because subspaced chunks might not be in memory # ------------------------------------------------------------ # Set the subspaced dask array @@ -971,11 +965,8 @@ def __getitem__(self, indices): # * A subpspaced chunk might not result in an array in memory, # so we set asanyarray=True to ensure that, if required, # they are converted at compute time. 
- # - # * Subspacing the data does not affect the active storage - # status # ------------------------------------------------------------ - new._set_dask(dx, clear=_ALL ^ _ACTIVE, asanyarray=True) + new._set_dask(dx, clear=_ALL, asanyarray=True) # ------------------------------------------------------------ # Get the axis identifiers for the subspace @@ -1388,9 +1379,6 @@ def _clear_after_dask_update(self, clear=_ALL): * If ``clear & _CFA`` is non-zero then the CFA write status is set to `False`. - * If ``clear & _ACTIVE`` is non-zero then set the - active storage status to `False`. - By default *clear* is the ``_ALL`` integer-valued constant, which results in all components being removed. @@ -1426,11 +1414,6 @@ def _clear_after_dask_update(self, clear=_ALL): # Set the CFA write status to False self._cfa_del_write() -# # REVIEW: active: `_clear_after_dask_update`: update active storage status -# if clear & _ACTIVE: -# # Set active storage to False -# self._del_active_storage() - # REVIEW: getitem: `_set_dask`: new keyword 'asanyarray' def _set_dask(self, dx, copy=False, clear=_ALL, asanyarray=False): """Set the dask array. @@ -1550,33 +1533,6 @@ def _del_dask(self, default=ValueError(), clear=_ALL): self._clear_after_dask_update(clear) return out -# # REVIEW: active: `_del_active_storage`: new method `_del_active_storage` -# def _del_active_storage(self): -# """Set the active storage reduction status to False. -# -# .. versionadded:: NEXTVERSION -# -# .. seealso:: `active_storage`, `_set_active_storage` -# -# :Returns: -# -# `None` -# -# **Examples** -# -# >>> d = cf.Data([9]) -# >>> d.active_storage() -# False -# >>> d._set_active_storage(True) -# >>> d.active_storage() -# True -# >>> d._del_active_storage() -# >>> d.active_storage() -# False -# -# """ -# self._custom.pop("active_storage", False) - def _del_cached_elements(self): """Delete any cached element values. @@ -1636,33 +1592,6 @@ def _is_abstract_Array_subclass(self, array): """ return isinstance(array, cfdm.Array) -# # REVIEW: active: `_set_active_storage`: new method `_set_active_storage` -# def _set_active_storage(self, value): -# """Set the active storage reduction status. -# -# .. versionadded:: NEXTVERSION -# -# .. seealso:: `active_storage`, `_del_active_storage` -# -# :Returns: -# -# `None` -# -# **Examples** -# -# >>> d = cf.Data([9]) -# >>> d.active_storage() -# False -# >>> d._set_active_storage(True) -# >>> d.active_storage() -# True -# >>> d._del_active_storage() -# >>> d.active_storage() -# False -# -# """ -# self._custom["active_storage"] = bool(value) - def _set_cached_elements(self, elements): """Cache selected element values. 
@@ -3317,13 +3246,9 @@ def rechunk( dx = d.to_dask_array(asanyarray=False) dx = dx.rechunk(chunks, threshold, block_size_limit, balance) -# # REVIEW: active: `rechunk`: Do not change active storage status after a rechunk d._set_dask( dx, clear=_ALL ^ _ARRAY ^ _CACHE, asanyarray=True ) -# d._set_dask( -# dx, clear=_ALL ^ _ARRAY ^ _CACHE ^ _ACTIVE, asanyarray=True -# ) return d @@ -4323,16 +4248,6 @@ def concatenate( cfa = _NONE break - # REVIEW: active: `concatenate`: define the active_storage status - # Define the active_storage status - active = _ACTIVE - for d in processed_data: - if not d.active_storage: - # Set the output active storage status to False when - # any input data instance has False status - active = _NONE - break - # REVIEW: getitem: `concatenate`: define the asanyarray status # Define the __asanyarray__ status asanyarray = processed_data[0].__asanyarray__ @@ -4345,9 +4260,8 @@ def concatenate( break # REVIEW: getitem: `concatenate`: set 'asanyarray' - # REVIEW: active: `concatenate`: set 'clear' # Set the new dask array - data0._set_dask(dx, clear=_ALL ^ cfa ^ active, asanyarray=asanyarray) + data0._set_dask(dx, clear=_ALL ^ cfa, asanyarray=asanyarray) # Set appropriate cached elements cached_elements = {} @@ -4999,33 +4913,6 @@ def chunks(self): # ---------------------------------------------------------------- # Attributes # ---------------------------------------------------------------- - # REVIEW: active: `active_storage`: new property `active_storage` - @property - def active_storage(self): - """Whether or not active storage reductions are possible. - - When the `active_storage` attribute is False it signifies that - active storage reductions are not available. - - When the `active_storage` attribute is True it signifies that - active storage reductions are possible, but only when all of - the conditions described by `cf.data.collapse.Collapse` are - also met. - - .. versionadded:: NEXTVERSION - - **Examples** - - >>> d = cf.Data([9]) - >>> d.active_storage - False - - """ - return ( - self._custom.get("active_storage", False) - and not self.get_compression_type() - ) - @property def Units(self): """The `cf.Units` object containing the units of the data array. diff --git a/cf/functions.py b/cf/functions.py index 9dfddbc063..8bed07e40a 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -1244,6 +1244,14 @@ def _parse(cls, arg): insertion into the `CONSTANTS` dictionary. """ + try: + from activestorage import Active + except ModuleNotFoundError as error: + if arg: + raise ModuleNotFoundError( + f"Can't enable active storage operations: {error}" + ) + return bool(arg) @@ -1451,6 +1459,26 @@ def total_memory(): return CONSTANTS["TOTAL_MEMORY"] +def is_log_level_debug(logger): + """Return True if and only if log level is at least DEBUG. + + .. versionadded:: NEXTVERSION + + .. seealso:: `log_level` + + :Parameters: + + logger: `logging.Logger` + The logger in use. + + :Returns: + + `bool` + Whether or not the log level is at least DEBUG. 
+ + """ + return logger.parent.level <= logging.DEBUG + # -------------------------------------------------------------------- # Aliases (for back-compatibility etc.): # -------------------------------------------------------------------- diff --git a/cf/test/test_active_storage.py b/cf/test/test_active_storage.py index 5b4fa1645a..90555ec4b5 100644 --- a/cf/test/test_active_storage.py +++ b/cf/test/test_active_storage.py @@ -55,7 +55,7 @@ def test_active_storage(self): with cf.configuration(active_storage=True, active_storage_url="dummy"): self.assertTrue(cf.active_storage()) self.assertEqual(cf.active_storage_url(), "dummy") - self.assertTrue(f.data.active_storage) +# self.assertTrue(f.data.active_storage) active_array = f.collapse("mean", weights=False).array self.assertEqual(active_array, local_array) From 4c6adad57af4de6c78724bf6dc357463cb4a5c3d Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 19 Jul 2024 16:22:27 +0100 Subject: [PATCH 101/134] dev --- cf/data/collapse/collapse_active.py | 8 +-- cf/test/test_Data.py | 94 ++++++++++++++--------------- 2 files changed, 50 insertions(+), 52 deletions(-) diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index 31a363c878..2ed02d5059 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -23,7 +23,7 @@ class ActiveStorageError(Exception): pass -def active_chunk_function(method, x, **kwargs): +def active_chunk_function(method, x, weights=None, axis=None, keepdims=True, **kwargs): """Collapse data in a chunk with active storage. If an active storage reduction is not approriate then `None` is @@ -85,13 +85,11 @@ def active_chunk_function(method, x, **kwargs): # local reduction return - weighted = kwargs.get("weights") is not None - if weighted: + if weights is not None: # Active storage is not allowed for weighted reductions => do # a local reduction return - axis = kwargs.get("axis") if axis is not None: if isinstance(axis, Integral): axis = (axis,) @@ -213,7 +211,7 @@ def wrapper(*args, **kwargs): # x = kwargs["x"] try: # Try doing an active storage reduction - print (method, args, kwargs) +# print (method, args, kwargs) out = active_chunk_function(method, *args, **kwargs) except ActiveStorageError as warning: # The active storage reduction failed diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index ca4ac109e9..67ae62abc5 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -4520,53 +4520,53 @@ def test_Data__str__(self): for element in elements0: self.assertNotIn(element, d._get_cached_elements()) - # REVIEW: getitem: `test_Data_active_storage`: test `Data.active_storage` - def test_Data_active_storage(self): - """Test `Data.active_storage`.""" - with cf.active_storage(True): - d = cf.Data([[9, 8]]) - self.assertFalse(d.active_storage) - - d._set_active_storage(True) - self.assertTrue(d.active_storage) - d._del_active_storage() - self.assertFalse(d.active_storage) - - # Check that operations correctly set active_storage to - # False, in particular those that do not invokde - # `Data._set_dask`. - d._set_active_storage(True) - d.transpose(inplace=True) - self.assertFalse(d.active_storage) - - d._set_active_storage(True) - d[...] 
= -1 - self.assertFalse(d.active_storage) - - d._set_active_storage(True) - d.persist(inplace=True) - self.assertFalse(d.active_storage) - - # Rechunk should preserve active_storage - d._set_active_storage(True) - d.rechunk(1, inplace=True) - self.assertTrue(d.active_storage) - - # __getitem__ should preserve active_storage - d._set_active_storage(True) - self.assertTrue(d[0, 3:].active_storage) - - # Test with data on disk - n = cf.NetCDF4Array( - "test_file.nc", - "eastward_wind", - shape=(1, 9, 10), - dtype=np.dtype(float), - ) - d = cf.Data(n) - self.assertTrue(d.active_storage) - d = cf.Data(n, to_memory=True) - self.assertFalse(d.active_storage) +# # REVIEW: getitem: `test_Data_active_storage`: test `Data.active_storage` +# def test_Data_active_storage(self): +# """Test `Data.active_storage`.""" +# with cf.active_storage(True): +# d = cf.Data([[9, 8]]) +# self.assertFalse(d.active_storage) +# +# d._set_active_storage(True) +# self.assertTrue(d.active_storage) +# d._del_active_storage() +# self.assertFalse(d.active_storage) +# +# # Check that operations correctly set active_storage to +# # False, in particular those that do not invokde +# # `Data._set_dask`. +# d._set_active_storage(True) +# d.transpose(inplace=True) +# self.assertFalse(d.active_storage) +# +# d._set_active_storage(True) +# d[...] = -1 +# self.assertFalse(d.active_storage) +# +# d._set_active_storage(True) +# d.persist(inplace=True) +# self.assertFalse(d.active_storage) +# +# # Rechunk should preserve active_storage +# d._set_active_storage(True) +# d.rechunk(1, inplace=True) +# self.assertTrue(d.active_storage) +# +# # __getitem__ should preserve active_storage +# d._set_active_storage(True) +# self.assertTrue(d[0, 3:].active_storage) +# +# # Test with data on disk +# n = cf.NetCDF4Array( +# "test_file.nc", +# "eastward_wind", +# shape=(1, 9, 10), +# dtype=np.dtype(float), +# ) +# d = cf.Data(n) +# self.assertTrue(d.active_storage) +# d = cf.Data(n, to_memory=True) +# self.assertFalse(d.active_storage) # REVIEW: getitem: `test_Data_cull_graph`: prevent new asanyarray layer def test_Data_cull_graph(self): From 81255108fadb1d984b51a6b8e6072ea776f30cfb Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 22 Jul 2024 23:16:41 +0100 Subject: [PATCH 102/134] dev --- Changelog.rst | 2 +- cf/constants.py | 3 +- cf/data/array/mixin/activestoragemixin.py | 6 +- cf/data/collapse/__init__.py | 1 + cf/data/collapse/collapse.py | 44 ++--- cf/data/collapse/collapse_active.py | 182 +++++++++-------- cf/data/dask_utils.py | 2 +- cf/data/data.py | 17 +- cf/data/fragment/fragmentarray.py | 225 ---------------------- cf/data/utils.py | 2 +- cf/functions.py | 121 ++++++++++-- cf/test/test_Data.py | 94 ++++----- cf/test/test_active_storage.py | 2 +- cf/test/test_functions.py | 6 +- docs/source/field_analysis.rst | 39 ++-- docs/source/function.rst | 1 + 16 files changed, 320 insertions(+), 427 deletions(-) delete mode 100644 cf/data/fragment/fragmentarray.py diff --git a/Changelog.rst b/Changelog.rst index e3be9b4c67..992e4661c6 100644 --- a/Changelog.rst +++ b/Changelog.rst @@ -150,7 +150,7 @@ version 3.16.0 * New dependency: ``scipy>=1.10.0`` ---- - + version 3.15.4 -------------- diff --git a/cf/constants.py b/cf/constants.py index 3828ae6e42..3275a834f4 100644 --- a/cf/constants.py +++ b/cf/constants.py @@ -63,9 +63,10 @@ "LOG_LEVEL": logging.getLevelName(logging.getLogger().level), "BOUNDS_COMBINATION_MODE": "AND", "CHUNKSIZE": parse_bytes(_CHUNKSIZE), - # REVIEW: active: `CONSTANTS`: new constants 'active_storage', 
'active_storage_url' + # REVIEW: active: `CONSTANTS`: new constants 'active_storage', 'active_storage_url', 'active_storage_max_requests' "active_storage": False, "active_storage_url": None, + "active_storage_max_requests": 100, } masked = np.ma.masked diff --git a/cf/data/array/mixin/activestoragemixin.py b/cf/data/array/mixin/activestoragemixin.py index 571efaf065..16b3f4fcb1 100644 --- a/cf/data/array/mixin/activestoragemixin.py +++ b/cf/data/array/mixin/activestoragemixin.py @@ -1,7 +1,7 @@ # REVIEW: active: `ActiveStorageMixin`: new mixin class `ActiveStorageMixin` -#try: +# try: # from activestorage import Active -#except ModuleNotFoundError: +# except ModuleNotFoundError: # Active = None @@ -33,6 +33,8 @@ def active_storage(self): return False return True + + # return self.get_filename(None) is not None # @property diff --git a/cf/data/collapse/__init__.py b/cf/data/collapse/__init__.py index 0de12360ea..ec720438f0 100644 --- a/cf/data/collapse/__init__.py +++ b/cf/data/collapse/__init__.py @@ -1 +1,2 @@ from .collapse import Collapse +from .collapse_active import active_reduction_methods diff --git a/cf/data/collapse/collapse.py b/cf/data/collapse/collapse.py index 416eecd963..04dfcaa29a 100644 --- a/cf/data/collapse/collapse.py +++ b/cf/data/collapse/collapse.py @@ -14,14 +14,17 @@ class Collapse(metaclass=DocstringRewriteMeta): **Active storage reductions** A collapse method (such as `max`, `var`, etc.) will attempt to - make use of active storage reductions when all of the following - conditions are met: + make use of active storage reduction on an individual `dask` chunk + when all of the following conditions are met: * `cf.active_storage()` is True; - * ``cf.active_storage_url()`` returns the URL of an active + * ``cf.active_storage_url()`` returns the URL of a valid active storage server; + * the `dask` chunk's data are defined by a netCDF-4 file on disk + (rather than in any other file format, or in memory); + * it is possible to import the `activestorage.Active` class; * the method is one of those specified by @@ -31,34 +34,24 @@ class Collapse(metaclass=DocstringRewriteMeta): * the collapse is unweighted; - * the data are in netCDF-4 files on disk (rather than in - any other file format, or in memory); - - * the data are not compressed by convention; - - * the `Collapse` method's *active_storage* parameter is True; + * the data are not numerically packed. - * the `Collapse` method's *chunk_function* parameter is `None`; - - * the `active_storage` attribute of the `Data` object being - collapsed is `True`, indicating that active storage operations - are possible, provided all of the other conditions are also - met. In general, it will only be `True` for data that are in - files on disk, are not compressed by convention, and have not - been previously operated on, apart from by subspacing - operations. - - in which case the Dask graph is modified to expect the per-chunk - reductions to be carried out externally. + If any of these conditions are not met then the `dask` chunk will + be collapsed "as usual", i.e. by retrieving the data to memory (if + it is not already there) and using the local client to perform the + collapse calculations. .. note:: The performance improvements from using active storage operations will increase the closer, in a network sense, the active storage server is to the data storage. If the active storage server is sufficiently far away from the - data then it may be faster and require less energy to do - a normal, non-active operation. 
- - See `cf.data.collapse.active_storage` for details. + data then it could even be faster and require less + energy to do non-active operation of the local client. + The performance improvements from using active storage + + See `cf.data.collapse.collapse_active.actify` and + `cf.data.collapse.collapse_active.active_chunk_function` for + further details. .. versionadded:: 3.14.0 @@ -252,7 +245,6 @@ def mean( The collapsed array. """ - print ('KKKKKKKKKK') from .dask_collapse import cf_mean_agg, cf_mean_chunk, cf_mean_combine if chunk_function is None: diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index 2ed02d5059..892ae2d49c 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -1,5 +1,7 @@ # REVIEW: active: `collapse_active.py`: new module for active storage functionality +import datetime import logging +import time from functools import wraps from numbers import Integral @@ -8,8 +10,12 @@ except ModuleNotFoundError: pass -from ...functions import active_storage as cf_active_storage -from ...functions import active_storage_url, is_log_level_debug +from ...functions import ( + active_storage, + active_storage_max_requests, + active_storage_url, + is_log_level_debug, +) logger = logging.getLogger(__name__) @@ -23,7 +29,7 @@ class ActiveStorageError(Exception): pass -def active_chunk_function(method, x, weights=None, axis=None, keepdims=True, **kwargs): +def active_chunk_function(method, *args, **kwargs): """Collapse data in a chunk with active storage. If an active storage reduction is not approriate then `None` is @@ -54,111 +60,135 @@ def active_chunk_function(method, x, weights=None, axis=None, keepdims=True, **k **Examples** >>> d = active_chunk_function('sum', x) - >>> d + >>> print(d) {'N': 7008, 'sum': 7006221.66903949} - Active storage reduction is not yet possible for variances: + Active storage reduction is not (yet) possible for variances: - >>> d = active_chunk_function('variance', x) + >>> d = active_chunk_function('variance', x, weights) >>> print(d) None """ + x = args[0] if kwargs.get("computing_meta"): - print("COMPUTING_META", method, repr(x), kwargs) return x - - # Return None if active storage reduction is not appropriate - print(method, repr(x), kwargs) - - if not cf_active_storage(): - # Active storage is turned off => do a local reduction + + # ---------------------------------------------------------------- + # Return None if active storage reduction is not + # appropriate. Inside `actify` this will trigger a local reduction + # to be carried out instead. + # ---------------------------------------------------------------- + if not active_storage(): + # Active storage is turned off return + url = kwargs.get("active_storage_url") + if url is None: + url = active_storage_url().value + if url is None: + return + if method not in active_reduction_methods: - # Active storage is not available for this method => do a - # local reduction + # Active storage is not available for this method return if not getattr(x, "active_storage", False): - # Active storage operations are not allowed on 'x' => do a - # local reduction - return - - if weights is not None: - # Active storage is not allowed for weighted reductions => do - # a local reduction + # Active storage operations are not allowed on 'x' return + if len(args) == 2: + # Weights, if present, are always passed in as a positional + # parameter, never as a keyword parameter. See + # `dask.array.reductions.reduction`. 
+ weights = args[1] + if weights is not None: + # Active storage is not allowed for weighted reductions + return + + axis = kwargs.get("axis") if axis is not None: if isinstance(axis, Integral): axis = (axis,) if len(axis) < x.ndim: # Active storage is not allowed for reductions over a - # subset of the axes => do a local reduction + # subset of the axes return - # Raise an ActiveStorageError if the active storage reduction can't - # happen or fails - url = active_storage_url().value - if url is None: - # Active storage operations are not possible when an active - # storage URL has not been set => do a local reduction - raise ActiveStorageError("No active storage URL") - # ---------------------------------------------------------------- # Still here? Set up an Active instance that will carry out the - # active storage operation. + # active storage operation. If it fails then this will trigger + # (inside `actify`) a local reduction being carried out instead. # ---------------------------------------------------------------- - index = x.index() - - filename = x.get_filename() - filename = "/".join(filename.split("/")[3:]) - - max_threads = 100 + filename = x.get_filename() + address = x.get_address() + max_requests = active_storage_max_requests() active_kwargs = { - "uri": filename, - "ncvar": x.get_address(), + "uri": "/".join(filename.split("/")[3:]), + "ncvar": address, "storage_options": x.get_storage_options(), - "active_storage_url": url, # x.get_active_storage_url(), - "storage_type": "s3", # Temporary requirement! - "max_threads": max_threads, + "active_storage_url": url, + "storage_type": "s3", # Temporary requirement to Active! + "max_threads": max_requests, } + index = x.index() + + debug = is_log_level_debug(logger) + debug = True + if debug: + start = time.time() + details = ( + f"{method!r} (file={filename}, address={address}, url={url}, " + f"max_requests={max_requests}, chunk={index})" + ) +# logger.debug( + print( + f"INITIATING active storage reduction {details}: " + f"{datetime.datetime.now()}" + ) # prgama: no cover + active = Active(**active_kwargs) active.method = method active.components = True - if is_log_level_debug: - logger.debug(f"Active call: Active(**{active_kwargs})[{index}]") - - import datetime - import time + # Force active storage reduction on remote server + active._version = 2 # ---------------------------------------------------------------- - # Execute the active storage operation + # Execute the active storage operation by indexing the Active + # instance # ---------------------------------------------------------------- try: - start = time.time() - print("START unlocked", index, datetime.datetime.now()) d = active[index] - print( - "FINISH unlocked", - datetime.datetime.now(), - time.time() - start, - f"maxT={max_threads}", - ) + print ("active.metric_data =",active.metric_data) except Exception as error: # Something went wrong with the active storage operations => - # do a local reduction - print ('565') - raise ActiveStorageError(error) - + # Raise an ActiveStorageError that will trigger (inside + # `actify`) a local reduction to be carried out instead. 
+ if debug: + print( +# logger.debug( + f"FAILED in active storage reduction {details} ({error}): " + f"{round(time.time() - start, 6):.6f}s " + "=> reverting to local computation" + ) # prgama: no cover + + raise + raise ActiveStorageError() + else: + if debug: + print( +# logger.debug( + f"FINISHED active storage reduction {details}: " + f"{round(time.time() - start, 6):.6f}s" + ) # prgama: no cover # ---------------------------------------------------------------- - # Reformat the components dictionary to match the output of the - # corresponding local chunk function + # Active storage reduction was a success. Reformat the resulting + # components dictionary to match the output of the corresponding + # local chunk function (e.g. `cf_mean_chunk`). # ---------------------------------------------------------------- if method == "max": # Local chunk function `cf_max_chunk` @@ -185,10 +215,11 @@ def actify(method): Intended for to decorate the ``cf_*_chunk`` methods in cf.data.collapse.dask_collapse`. - When a ``cf_*_chunk`` method is decorated, then its computations - will be carried out in active storage, if that is appropriate and - possible. Whether or not computations are done in active storage - is determined by `active_chunk_function`. + When a ``cf_*_chunk`` method is decorated, its computations will + be attempted in active storage. If that is not possible (due to + configuration settings, limitations on the type of reduction that + can be done in active storage, or the active storage reduction + failed) then the computations will be done locally "as usual". .. versionadded:: NEXTVERSION @@ -204,24 +235,15 @@ def actify(method): def decorator(chunk_function): @wraps(chunk_function) def wrapper(*args, **kwargs): - - #if args: TODO - # x = args[0] - #else: - # x = kwargs["x"] try: - # Try doing an active storage reduction -# print (method, args, kwargs) + # Try doing an active storage reduction out = active_chunk_function(method, *args, **kwargs) - except ActiveStorageError as warning: + except ActiveStorageError: # The active storage reduction failed - logger.warning( - "Dask chunk failed in active storage reduction => " - f"reverting to local computation: {warning}" - ) + pass else: if out is not None: - # The active storage reduction succeeded + # The active storage reduction succeeded return out # Still here? Then using active storage is not diff --git a/cf/data/dask_utils.py b/cf/data/dask_utils.py index 9c0b1aeb99..3b9cc17ea9 100644 --- a/cf/data/dask_utils.py +++ b/cf/data/dask_utils.py @@ -122,7 +122,7 @@ def cf_contains(a, value): :Returns: `numpy.ndarray` - A size 1 Boolean array with the same number of dimensions + A size 1 Boolean array, with the same number of dimensions as *a*, that indicates whether or not *a* contains the value. diff --git a/cf/data/data.py b/cf/data/data.py index b22689f5d8..9a7afe9850 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -99,11 +99,11 @@ # Contstants used to specify which `Data` components should be cleared # when a new dask array is set. See `Data._clear_after_dask_update` # for details. -_NONE = 0 # = 0b0000 -_ARRAY = 1 # = 0b0001 -_CACHE = 2 # = 0b0010 -_CFA = 4 # = 0b0100 -_ALL = 15 # = 0b1111 +_NONE = 0 # = 0b0000 +_ARRAY = 1 # = 0b0001 +_CACHE = 2 # = 0b0010 +_CFA = 4 # = 0b0100 +_ALL = 15 # = 0b1111 class Data(DataClassDeprecationsMixin, CFANetCDF, Container, cfdm.Data): @@ -966,7 +966,7 @@ def __getitem__(self, indices): # so we set asanyarray=True to ensure that, if required, # they are converted at compute time. 
# ------------------------------------------------------------ - new._set_dask(dx, clear=_ALL, asanyarray=True) + new._set_dask(dx, asanyarray=True) # ------------------------------------------------------------ # Get the axis identifiers for the subspace @@ -3047,7 +3047,6 @@ def convolution_filter( depth += abs(origin) - # TODO: check that this is OK dx = d.to_dask_array() # REVIEW: getitem: `percentile`: rectify comment @@ -3246,9 +3245,7 @@ def rechunk( dx = d.to_dask_array(asanyarray=False) dx = dx.rechunk(chunks, threshold, block_size_limit, balance) - d._set_dask( - dx, clear=_ALL ^ _ARRAY ^ _CACHE, asanyarray=True - ) + d._set_dask(dx, clear=_ALL ^ _ARRAY ^ _CACHE, asanyarray=True) return d diff --git a/cf/data/fragment/fragmentarray.py b/cf/data/fragment/fragmentarray.py deleted file mode 100644 index 6836eeeed9..0000000000 --- a/cf/data/fragment/fragmentarray.py +++ /dev/null @@ -1,225 +0,0 @@ -import cfdm - -from ..array.abstract import Array -from ..array.mixin import FileArrayMixin, IndexMixin -from .h5netcdffragmentarray import H5netcdfFragmentArray -from .mixin import FragmentArrayMixin -from .netcdf4fragmentarray import NetCDF4FragmentArray -from .umfragmentarray import UMFragmentArray - -_fragment = {'netCDF4': NetCDF4FragmentArray - 'h5netcdf': H5netcdfFragmentArray, - 'um': UMFragmentArray} - -# REVIEW: TODO getitem: `NetCDFFragmentArray`: new inheritance to allow for different netCDF backends -class FragmentArray( - FragmentArrayMixin, - IndexMixin, - FileArrayMixin, - cfdm.data.mixin.FileArrayMixin, - Array, -): - """A netCDF fragment array. - - Access will be with either `netCDF4` or `h5netcdf`. - - .. versionadded:: NEXTVERSION - - """ - - # REVIEW: h5: `__init__`: replace units/calendar API with attributes - def __init__( - self, - filename=None, - address=None, - dtype=None, - shape=None, - aggregated_units=False, - aggregated_calendar=False, - attributes=None, - storage_options=None, - source=None, - copy=True, - ): - """**Initialisation** - - :Parameters: - - filename: (sequence of `str`), optional - The locations fragment datasets containing the array. - - address: (sequence of `str`), optional - How to find the fragments in the fragment datasets. - - dtype: `numpy.dtype`, optional - The data type of the aggregated array. May be `None` - if is not known. This may differ from the data type of - the fragment's data. - - shape: `tuple`, optional - The shape of the fragment in its canonical form. - - {{init attributes: `dict` or `None`, optional}} - - If *attributes* is `None`, the default, then the - attributes will be set from the fragment dataset - during the first `__getitem__` call. 
- - {{aggregated_units: `str` or `None`, optional}} - - {{aggregated_calendar: `str` or `None`, optional}} - - {{init storage_options: `dict` or `None`, optional}} - - {{init source: optional}} - - {{init copy: `bool`, optional}} - - """ - super().__init__( - source=source, - copy=copy, - ) - - if source is not None: - try: - shape = source._get_component("shape", None) - except AttributeError: - shape = None - - try: - filename = source._get_component("filename", None) - except AttributeError: - filename = None - - try: - address = source._get_component("address", None) - except AttributeError: - address = None - - try: - dtype = source._get_component("dtype", None) - except AttributeError: - dtype = None - - try: - attributes = source._get_component("attributes", None) - except AttributeError: - attributes = None - - try: - aggregated_units = source._get_component( - "aggregated_units", False - ) - except AttributeError: - aggregated_units = False - - try: - aggregated_calendar = source._get_component( - "aggregated_calendar", False - ) - except AttributeError: - aggregated_calendar = False - - try: - storage_options = source._get_component( - "storage_options", None - ) - except AttributeError: - storage_options = None - - if filename is not None: - if isinstance(filename, str): - filename = (filename,) - else: - filename = tuple(filename) - - self._set_component("filename", filename, copy=False) - - if address is not None: - if isinstance(address, int): - address = (address,) - else: - address = tuple(address) - - self._set_component("address", address, copy=False) - - if storage_options is not None: - self._set_component("storage_options", storage_options, copy=False) - - self._set_component("shape", shape, copy=False) - self._set_component("dtype", dtype, copy=False) - self._set_component("attributes", attributes, copy=False) - self._set_component("mask", True, copy=False) - - self._set_component("aggregated_units", aggregated_units, copy=False) - self._set_component( - "aggregated_calendar", aggregated_calendar, copy=False - ) - - # By default, close the file after data array access - self._set_component("close", True, copy=False) - - # REVIEW: getitem: `_get_array`: new method to convert subspace to numpy array - def _get_array(self, index=None): - """Returns a subspace of the dataset variable. - - The method acts as a factory for either a - `NetCDF4FragmentArray`, `H5netcdfFragmentArray`, or - `UMFragmentArray` class, and it is the result of calling - `!_get_array` on the newly created instance that is returned. - - `H5netcdfFragmentArray` will only be used if - `NetCDF4FragmentArray` returns a `FileNotFoundError` - exception; and `UMFragmentArray` will only be used - if `H5netcdfFragmentArray` returns an `Exception`. - - .. versionadded:: NEXTVERSION - - .. seealso:: `__array__`, `index` - - :Parameters: - - {{index: `tuple` or `None`, optional}} - - When a `tuple`, there must be a distinct entry for each - fragment dimension. - - :Returns: - - `numpy.ndarray` - The subspace. - - """ - kwargs = { - "dtype": self.dtype, - "shape": self.shape, - "aggregated_units": self.get_aggregated_units(None), - "aggregated_calendar": self.get_aggregated_calendar(None), - "attributes": self.get_attributes(None), - "copy": False, - } - - # Loop round the files, returning as soon as we find one that - # is accessible. 
- filenames = self.get_filenames() - for filename, address in zip(filenames, self.get_addresses()): - kwargs["filename"] = filename - kwargs["address"] = address - kwargs["storage_options"] = self.get_storage_options( - create_endpoint_url=False - ) - - for backend in dataset_backends: - try: - return _fragment[backend](**kwargs)._get_array(index) - except FileNotFoundError: - pass - except KeyError: - raise ValueError("unknown backend: T sadasds TODO") - - # Still here? - if len(filenames) == 1: - raise FileNotFoundError(f"No such fragment file: {filenames[0]}") - - raise FileNotFoundError(f"No such fragment files: {filenames}") diff --git a/cf/data/utils.py b/cf/data/utils.py index 66fb65c8ad..4e12781aeb 100644 --- a/cf/data/utils.py +++ b/cf/data/utils.py @@ -864,7 +864,7 @@ def collapse( "split_every": split_every, "mtol": mtol, # REVIEW: active: `collapse`: pass the active storage status onto the collapse functions -# "active_storage": d.active_storage, + # "active_storage": d.active_storage, } weights = parse_weights(d, weights, axis) diff --git a/cf/functions.py b/cf/functions.py index 8bed07e40a..81c9f4dded 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -2,6 +2,7 @@ import csv import ctypes.util import importlib +import logging import os import platform import re @@ -162,7 +163,7 @@ def _free_memory(): return float(virtual_memory().available) -# REVIEW: active: `configuration`: new keywords 'active_storage', 'active_storage_url' +# REVIEW: active: `configuration`: new keywords 'active_storage', 'active_storage_url', 'active_storage_max_requests' def configuration( atol=None, rtol=None, @@ -174,6 +175,7 @@ def configuration( bounds_combination_mode=None, active_storage=None, active_storage_url=None, + active_storage_max_requests=None, of_fraction=None, collapse_parallel_mode=None, free_memory_factor=None, @@ -194,6 +196,7 @@ def configuration( * `bounds_combination_mode` * `active_storage` * `active_storage_url` + * `active_storage_max_requests` These are all constants that apply throughout cf, except for in specific functions only if overridden by the corresponding keyword @@ -213,7 +216,8 @@ def configuration( .. seealso:: `atol`, `rtol`, `tempdir`, `chunksize`, `total_memory`, `log_level`, `regrid_logging`, `relaxed_identities`, `bounds_combination_mode`, - `active_storage`, `active_storage_url` + `active_storage`, `active_storage_url`, + `active_storage_max_requests` :Parameters: @@ -278,6 +282,11 @@ def configuration( .. versionadded:: NEXTVERSION + active_storage_max_requests: `int` or `Constant`, optional + The new value. The default is to not change the value. + + .. versionadded:: NEXTVERSION + of_fraction: `float` or `Constant`, optional Deprecated at version 3.14.0 and is no longer available. @@ -309,7 +318,8 @@ def configuration( 'bounds_combination_mode': 'AND', 'chunksize': 82873466.88000001, 'active_storage': False, - 'active_storage_url': None} + 'active_storage_url': None, + 'active_storage_max_requests': 100} >>> cf.chunksize(7.5e7) # any change to one constant... 
82873466.88000001 >>> cf.configuration()['chunksize'] # ...is reflected in the configuration @@ -325,7 +335,8 @@ def configuration( 'bounds_combination_mode': 'AND', 'chunksize': 75000000.0, 'active_storage': False, - 'active_storage_url': None} + 'active_storage_url': None, + 'active_storage_max_requests': 100} >>> cf.configuration() # the items set have been updated accordingly {'rtol': 2.220446049250313e-16, 'atol': 2.220446049250313e-16, @@ -336,7 +347,8 @@ def configuration( 'bounds_combination_mode': 'AND', 'chunksize': 75000000.0, 'active_storage': False, - 'active_storage_url': None} + 'active_storage_url': None, + 'active_storage_max_requests': 100} Use as a context manager: @@ -363,7 +375,8 @@ def configuration( 'bounds_combination_mode': 'AND', 'chunksize': 75000000.0, 'active_storage': False, - 'active_storage_url': None} + 'active_storage_url': None, + 'active_storage_max_requests': 100} >>> print(cf.configuration()) {'rtol': 2.220446049250313e-16, 'atol': 2.220446049250313e-16, @@ -374,7 +387,8 @@ def configuration( 'bounds_combination_mode': 'AND', 'chunksize': 75000000.0, 'active_storage': False, - 'active_storage_url': None} + 'active_storage_url': None, + 'active_storage_max_requests': 100} """ if of_fraction is not None: @@ -407,10 +421,11 @@ def configuration( bounds_combination_mode=bounds_combination_mode, active_storage=active_storage, active_storage_url=active_storage_url, + active_storage_max_requests=active_storage_max_requests, ) -# REVIEW: active: `_configuration`: new keywords 'active_storage', 'active_storage_url' +# REVIEW: active: `_configuration`: new keywords 'active_storage', 'active_storage_url', 'active_storage_max_requests' def _configuration(_Configuration, **kwargs): """Internal helper function to provide the logic for `cf.configuration`. @@ -459,6 +474,7 @@ def _configuration(_Configuration, **kwargs): "bounds_combination_mode": bounds_combination_mode, "active_storage": active_storage, "active_storage_url": active_storage_url, + "active_storage_max_requests": active_storage_max_requests, } old_values = {} @@ -1192,7 +1208,8 @@ class active_storage(ConstantAccess): .. versionadded:: NEXTVERSION - .. seealso:: `active_storage_url`, `configuration` + .. seealso:: `active_storage_max_requests`, `active_storage_url`, + `configuration` :Parameters: @@ -1251,7 +1268,7 @@ def _parse(cls, arg): raise ModuleNotFoundError( f"Can't enable active storage operations: {error}" ) - + return bool(arg) @@ -1261,7 +1278,8 @@ class active_storage_url(ConstantAccess): .. versionadded:: NEXTVERSION - .. seealso:: `active_storage`, `configuration` + .. seealso:: `active_storage`, `active_storage_max_requests`, + `configuration` :Parameters: @@ -1319,6 +1337,86 @@ def _parse(cls, arg): return str(arg) +# REVIEW: active: `active_storage_max_requests`: new function +class active_storage_max_requests(ConstantAccess): + """Cconcurrent active storage server requests per `dask` chunk. + + This is the maximum number of concurrent requests per `dask` chunk + that are sent to the active storage server by an `Active` + instance. The default is ``100``. The optimum number may be + estimated by :math:`N = N_{max} / min(N_{PE}, N_{chunk})`, where + + * :math:`N_{max}` is the maximum number of requests that the + active storage server can process concurrently before its + performance is degraded; + + * :math:`N_{PE}` the number of available PEs on the local client. + + * :math:`N_{chunk}` the number of `dask` chunks being reduced with + active storage. 
+ + This formula only applies to cases where all `dask` chunks for the + collapse operation are utilising active storage. If some are not + then :math:`N` will likely be underestimated. + + .. versionadded:: NEXTVERSION + + .. seealso:: `active_storage`, `active_storage_url`, + `configuration` + + :Parameters: + + arg: `int` or `Constant`, optional + Provide a value that will apply to all subsequent + operations. + + :Returns: + + `Constant` + The value prior to the change, or the current value if no + new value was specified. + + **Examples** + + >>> print(cf.active_storage_max_requests()) + 100 + >>> with cf.active_storage_max_requests(25): + ... print(cf.active_storage_max_requests()) + ... + 25 + >>> print(cf.active_storage_max_requests()) + None + >>> print(cf.active_storage_max_requests(12) + None + >>> cf.active_storage_max_requests() + 12 + + """ + + _name = "active_storage_max_requests" + + def _parse(cls, arg): + """Parse a new constant value. + + .. versionaddedd:: NEXTVERSION + + :Parameters: + + cls: + This class. + + arg: + The given new constant value. + + :Returns: + + A version of the new constant value suitable for + insertion into the `CONSTANTS` dictionary. + + """ + return int(arg) + + def CF(): """The version of the CF conventions. @@ -1479,6 +1577,7 @@ def is_log_level_debug(logger): """ return logger.parent.level <= logging.DEBUG + # -------------------------------------------------------------------- # Aliases (for back-compatibility etc.): # -------------------------------------------------------------------- diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 67ae62abc5..7f4816e95b 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -4520,53 +4520,53 @@ def test_Data__str__(self): for element in elements0: self.assertNotIn(element, d._get_cached_elements()) -# # REVIEW: getitem: `test_Data_active_storage`: test `Data.active_storage` -# def test_Data_active_storage(self): -# """Test `Data.active_storage`.""" -# with cf.active_storage(True): -# d = cf.Data([[9, 8]]) -# self.assertFalse(d.active_storage) -# -# d._set_active_storage(True) -# self.assertTrue(d.active_storage) -# d._del_active_storage() -# self.assertFalse(d.active_storage) -# -# # Check that operations correctly set active_storage to -# # False, in particular those that do not invokde -# # `Data._set_dask`. -# d._set_active_storage(True) -# d.transpose(inplace=True) -# self.assertFalse(d.active_storage) -# -# d._set_active_storage(True) -# d[...] 
= -1 -# self.assertFalse(d.active_storage) -# -# d._set_active_storage(True) -# d.persist(inplace=True) -# self.assertFalse(d.active_storage) -# -# # Rechunk should preserve active_storage -# d._set_active_storage(True) -# d.rechunk(1, inplace=True) -# self.assertTrue(d.active_storage) -# -# # __getitem__ should preserve active_storage -# d._set_active_storage(True) -# self.assertTrue(d[0, 3:].active_storage) -# -# # Test with data on disk -# n = cf.NetCDF4Array( -# "test_file.nc", -# "eastward_wind", -# shape=(1, 9, 10), -# dtype=np.dtype(float), -# ) -# d = cf.Data(n) -# self.assertTrue(d.active_storage) -# d = cf.Data(n, to_memory=True) -# self.assertFalse(d.active_storage) + # # REVIEW: getitem: `test_Data_active_storage`: test `Data.active_storage` + # def test_Data_active_storage(self): + # """Test `Data.active_storage`.""" + # with cf.active_storage(True): + # d = cf.Data([[9, 8]]) + # self.assertFalse(d.active_storage) + # + # d._set_active_storage(True) + # self.assertTrue(d.active_storage) + # d._del_active_storage() + # self.assertFalse(d.active_storage) + # + # # Check that operations correctly set active_storage to + # # False, in particular those that do not invokde + # # `Data._set_dask`. + # d._set_active_storage(True) + # d.transpose(inplace=True) + # self.assertFalse(d.active_storage) + # + # d._set_active_storage(True) + # d[...] = -1 + # self.assertFalse(d.active_storage) + # + # d._set_active_storage(True) + # d.persist(inplace=True) + # self.assertFalse(d.active_storage) + # + # # Rechunk should preserve active_storage + # d._set_active_storage(True) + # d.rechunk(1, inplace=True) + # self.assertTrue(d.active_storage) + # + # # __getitem__ should preserve active_storage + # d._set_active_storage(True) + # self.assertTrue(d[0, 3:].active_storage) + # + # # Test with data on disk + # n = cf.NetCDF4Array( + # "test_file.nc", + # "eastward_wind", + # shape=(1, 9, 10), + # dtype=np.dtype(float), + # ) + # d = cf.Data(n) + # self.assertTrue(d.active_storage) + # d = cf.Data(n, to_memory=True) + # self.assertFalse(d.active_storage) # REVIEW: getitem: `test_Data_cull_graph`: prevent new asanyarray layer def test_Data_cull_graph(self): diff --git a/cf/test/test_active_storage.py b/cf/test/test_active_storage.py index 90555ec4b5..b88075ea74 100644 --- a/cf/test/test_active_storage.py +++ b/cf/test/test_active_storage.py @@ -55,7 +55,7 @@ def test_active_storage(self): with cf.configuration(active_storage=True, active_storage_url="dummy"): self.assertTrue(cf.active_storage()) self.assertEqual(cf.active_storage_url(), "dummy") -# self.assertTrue(f.data.active_storage) + # self.assertTrue(f.data.active_storage) active_array = f.collapse("mean", weights=False).array self.assertEqual(active_array, local_array) diff --git a/cf/test/test_functions.py b/cf/test/test_functions.py index 9f28f42f1d..0582244210 100644 --- a/cf/test/test_functions.py +++ b/cf/test/test_functions.py @@ -47,7 +47,7 @@ def test_aliases(self): self.assertEqual(cf.tempdir(), cf.TEMPDIR()) self.assertEqual(cf.chunksize(), cf.CHUNKSIZE()) - # REVIEW: active: `test_configuration`: test `cf.active_storage`, cf.active_storage_url` + # REVIEW: active: `test_configuration`: test `cf.active_storage`, cf.active_storage_url`, cf.active_storage_max_requests` def test_configuration(self): # This test assumes 'total_memory' remains constant throughout # the test run, which should be true generally in any @@ -59,7 +59,7 @@ def test_configuration(self): self.assertIsInstance(org, dict) # Check all keys that should be there 
are, with correct value type: - self.assertEqual(len(org), 10) # update expected len if add new key(s) + self.assertEqual(len(org), 11) # update expected len if add new key(s) # Types expected: self.assertIsInstance(org["atol"], float) @@ -70,6 +70,7 @@ def test_configuration(self): self.assertIsInstance(org["regrid_logging"], bool) self.assertIsInstance(org["tempdir"], str) self.assertIsInstance(org["active_storage"], bool) + self.assertIsInstance(org["active_storage_max_requests"], int) # Log level may be input as an int but always given as # equiv. string self.assertIsInstance(org["log_level"], str) @@ -91,6 +92,7 @@ def test_configuration(self): "chunksize": 8e9, "active_storage": True, "active_storage_url": None, + "active_storage_max_requests": 100, } # Test the setting of each lone item. diff --git a/docs/source/field_analysis.rst b/docs/source/field_analysis.rst index 8bdd82c579..f81bfd9965 100644 --- a/docs/source/field_analysis.rst +++ b/docs/source/field_analysis.rst @@ -861,40 +861,41 @@ When the data being collapsed are stored remotely, the collapse calculations may be carried out on a server that is close (in a network distance sense) to the data, thereby removing the time and power costs of transfering the entire un-collapsed data to the local -client. Whether or not this will occur is determined on a case-by-case -basis, and will only be done if all of the following criteria are met: +client. + +Whether or not this will occur for an individual `dask` chunk is +determined on a case-by-case basis, and will only be done if all of +the following criteria are met: * ``cf.active_storage()`` is `True`; -* ``cf.active_storage_url()`` returns the URL of an active storage - server; +* ``cf.active_storage_url()`` returns the URL of a valid active + storage server; + +* the `dask` chunk's data are defined by a netCDF-4 file on disk + (rather than in any other file format, or in memory); -* it is possible to import the external `activestorage.Active` class. +* it is possible to import the `activestorage.Active` class; -* the collapse method is one of ``'mean'``, ``'maximum'``, - ``'minimum'``, or ``'sum'``; +* the method is one of those specified by + `cf.data.collapse.active_reduction_methods`; * the collapse is over all axes; * the collapse is unweighted; -* the data values are in netCDF-4 files on disk (rather than in any - other file format, or in memory); - -* the data are not compressed by convention; +* the data are not numerically packed. -* the `~cf.Data.active_storage` attribute of the `cf.Data` object - being collapsed is `True`, indicating that active storage operations - are possible, provided all of the other conditions are also met. In - general, it will only be `True` for data that are in files on disk, - are not compressed by convention, and have not been previously - operated on, apart from by subspacing operations. +If any of these conditions are not met then the `dask` chunk will be +collapsed "as usual", i.e. by retrieving the data to memory (if it is +not already there) and using the local client to perform the collapse +calculations. The performance improvements from using active storage operations will increase the closer, in a network sense, the active storage server is to the data storage. If the active storage server is sufficiently far -away from the data then it may be faster and require less energy to do -a normal, non-active operation. +away from the data then it could even be faster and require less +energy to do non-active operation of the local client. 
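As a sketch of how these settings fit together (assuming that ``file.nc`` is a netCDF-4 dataset on disk and that ``https://as.example.org`` stands in for a real active storage server URL):

>>> import cf
>>> f = cf.read('file.nc')[0]
>>> with cf.configuration(active_storage=True,
...                       active_storage_url='https://as.example.org'):
...     result = f.collapse('mean', weights=False)

If any of the criteria listed above are not met for a given `dask` chunk, the same call falls back to a local computation for that chunk, so the collapsed values are the same either way.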
---- diff --git a/docs/source/function.rst b/docs/source/function.rst index d48452eeae..8892f325c1 100644 --- a/docs/source/function.rst +++ b/docs/source/function.rst @@ -156,6 +156,7 @@ Active storage reductions cf.active_storage cf.active_storage_url + cf.active_storage_max_requests cf.netcdf_lock Miscellaneous From 57561a06d9c21ae537d72f65bbb38eea49cc9bf4 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 6 Aug 2024 16:37:07 +0100 Subject: [PATCH 103/134] dev --- cf/data/collapse/collapse_active.py | 123 +++++++++++++++++----------- cf/functions.py | 8 +- 2 files changed, 77 insertions(+), 54 deletions(-) diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index 892ae2d49c..b7dcd14eff 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -14,7 +14,7 @@ active_storage, active_storage_max_requests, active_storage_url, - is_log_level_debug, + is_log_level_info, ) logger = logging.getLogger(__name__) @@ -32,9 +32,17 @@ class ActiveStorageError(Exception): def active_chunk_function(method, *args, **kwargs): """Collapse data in a chunk with active storage. + Called by the `actify` decorator function. + If an active storage reduction is not approriate then `None` is - returned, or else an ActiveStorageError is raised if the active - storage operation fails. + returned. + + If the active storage operation fails then ActiveStorageError is + raised. + + If the active storage operation is successful then a dictionary of + redcution components, similar to that returned by a ``cf_*_chunk`` + method, is returned. .. versionadded:: NEXTVERSION @@ -54,8 +62,8 @@ def active_chunk_function(method, *args, **kwargs): :Returns: `dict` or `None` - The reduced data in component form, or else `None` if an - active storage reduction is not approriate. + The reduced data in component form, or `None` if an active + storage reduction is not approriate. **Examples** @@ -71,13 +79,15 @@ def active_chunk_function(method, *args, **kwargs): """ x = args[0] + + # Dask reduction machinery if kwargs.get("computing_meta"): return x # ---------------------------------------------------------------- - # Return None if active storage reduction is not - # appropriate. Inside `actify` this will trigger a local reduction - # to be carried out instead. + # Return None if active storage reduction is not appropriate. + # Inside `actify`, this will trigger a local reduction to be + # carried out instead. # ---------------------------------------------------------------- if not active_storage(): # Active storage is turned off @@ -87,23 +97,27 @@ def active_chunk_function(method, *args, **kwargs): if url is None: url = active_storage_url().value if url is None: + # Active storage is not possible when no active storage + # server URL has been provided return if method not in active_reduction_methods: - # Active storage is not available for this method + # Active storage is not (yet) available for this method return if not getattr(x, "active_storage", False): - # Active storage operations are not allowed on 'x' + # The data data object 'x' is incompatible with active storage + # operations. E.g. it is a UMArray object, a numpy array, etc. return if len(args) == 2: # Weights, if present, are always passed in as a positional - # parameter, never as a keyword parameter. See - # `dask.array.reductions.reduction`. + # parameter, never as a keyword parameter (see + # `dask.array.reductions.reduction` for details). 
weights = args[1] if weights is not None: - # Active storage is not allowed for weighted reductions + # Active storage is not (yet) allowed for weighted + # reductions return axis = kwargs.get("axis") @@ -112,14 +126,15 @@ def active_chunk_function(method, *args, **kwargs): axis = (axis,) if len(axis) < x.ndim: - # Active storage is not allowed for reductions over a - # subset of the axes + # Active storage is not (yet) allowed for reductions over + # a subset of the axes return # ---------------------------------------------------------------- # Still here? Set up an Active instance that will carry out the - # active storage operation. If it fails then this will trigger - # (inside `actify`) a local reduction being carried out instead. + # active storage operation. If the operation fails, for any + # reason, then this will trigger (inside `actify`) a local + # reduction being carried out instead. # ---------------------------------------------------------------- filename = x.get_filename() address = x.get_address() @@ -136,25 +151,23 @@ def active_chunk_function(method, *args, **kwargs): index = x.index() - debug = is_log_level_debug(logger) - debug = True - if debug: + info = is_log_level_info(logger) + if info: + # Do some detailed logging start = time.time() details = ( f"{method!r} (file={filename}, address={address}, url={url}, " - f"max_requests={max_requests}, chunk={index})" + f"chunk={index})" ) -# logger.debug( - print( - f"INITIATING active storage reduction {details}: " - f"{datetime.datetime.now()}" - ) # prgama: no cover + logger.info( + f"STARTED active storage {details}: {datetime.datetime.now()}" + ) # pragma: no cover active = Active(**active_kwargs) active.method = method active.components = True - # Force active storage reduction on remote server + # Force an active storage reduction on the remote server active._version = 2 # ---------------------------------------------------------------- @@ -163,32 +176,40 @@ def active_chunk_function(method, *args, **kwargs): # ---------------------------------------------------------------- try: d = active[index] - print ("active.metric_data =",active.metric_data) except Exception as error: # Something went wrong with the active storage operations => - # Raise an ActiveStorageError that will trigger (inside - # `actify`) a local reduction to be carried out instead. - if debug: - print( -# logger.debug( - f"FAILED in active storage reduction {details} ({error}): " - f"{round(time.time() - start, 6):.6f}s " - "=> reverting to local computation" - ) # prgama: no cover - - raise - raise ActiveStorageError() + # Raise an ActiveStorageError that will in tuen trigger + # (inside `actify`) a local reduction to be carried out + # instead. 
+ raise ActiveStorageError( + f"FAILED in active storage {details} ({error}))" + ) else: - if debug: - print( -# logger.debug( - f"FINISHED active storage reduction {details}: " - f"{round(time.time() - start, 6):.6f}s" - ) # prgama: no cover + # Active storage reduction was successful + if info: + # Do some detailed logging + try: + md = active.metric_data + except AttributeError: + logger.info( + f"FINISHED active storage {details}: " + f"{time.time() - start:6.2f}s" + ) # pragma: no cover + else: + logger.info( + f"FINISHED active storage {details}: " + f"dataset chunks: {md['dataset chunks']}, " + f"load nc (s): {md['load nc time']:6.2f}, " + f"indexing (s): {md['indexing time (s)']:6.2f}, " + f"reduction (s): {md['reduction time (s)']:6.2f}, " + f"selection 2 (s): {md['selection 2 time (s)']:6.2f}, " + f"Total: {(time.time() - start):6.2f}s" + ) # pragma: no cover + # ---------------------------------------------------------------- # Active storage reduction was a success. Reformat the resulting - # components dictionary to match the output of the corresponding - # local chunk function (e.g. `cf_mean_chunk`). + # components dictionary 'd' to match the output of the + # corresponding local chunk function (e.g. `cf_mean_chunk`). # ---------------------------------------------------------------- if method == "max": # Local chunk function `cf_max_chunk` @@ -238,9 +259,11 @@ def wrapper(*args, **kwargs): try: # Try doing an active storage reduction out = active_chunk_function(method, *args, **kwargs) - except ActiveStorageError: + except ActiveStorageError as error: # The active storage reduction failed - pass + logger.warning( + f"{error} => reverting to local computation" + ) # pragma: no cover else: if out is not None: # The active storage reduction succeeded diff --git a/cf/functions.py b/cf/functions.py index 81c9f4dded..aaea1c7a7f 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -1557,8 +1557,8 @@ def total_memory(): return CONSTANTS["TOTAL_MEMORY"] -def is_log_level_debug(logger): - """Return True if and only if log level is at least DEBUG. +def is_log_level_info(logger): + """Return True if and only if log level is at least INFO. .. versionadded:: NEXTVERSION @@ -1572,10 +1572,10 @@ def is_log_level_debug(logger): :Returns: `bool` - Whether or not the log level is at least DEBUG. + Whether or not the log level is at least INFO. """ - return logger.parent.level <= logging.DEBUG + return logger.parent.level <= logging.INFO # -------------------------------------------------------------------- From 581648d86f304f42b4d99d75d318a86653c32f81 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 6 Aug 2024 16:51:58 +0100 Subject: [PATCH 104/134] dev --- cf/data/collapse/collapse.py | 2 +- cf/data/collapse/collapse_active.py | 128 ++++++++++++++-------------- cf/functions.py | 4 +- 3 files changed, 66 insertions(+), 68 deletions(-) diff --git a/cf/data/collapse/collapse.py b/cf/data/collapse/collapse.py index 04dfcaa29a..57b48a875e 100644 --- a/cf/data/collapse/collapse.py +++ b/cf/data/collapse/collapse.py @@ -48,7 +48,7 @@ class Collapse(metaclass=DocstringRewriteMeta): data then it could even be faster and require less energy to do non-active operation of the local client. The performance improvements from using active storage - + See `cf.data.collapse.collapse_active.actify` and `cf.data.collapse.collapse_active.active_chunk_function` for further details. 
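A rough sketch of how a local chunk function might be hooked into this machinery (the names and decoration shown here are illustrative; the actual decorations are applied in `cf.data.collapse.dask_collapse` and may differ in detail):

from dask.array import chunk
from dask.array.reductions import numel

from cf.data.collapse.collapse_active import actify

@actify("max")
def cf_max_chunk(x, dtype=None, computing_meta=False, **kwargs):
    """Simplified local chunk calculation for the maximum."""
    if computing_meta:
        return x

    # Local computation: `actify` only falls through to this body
    # when an active storage reduction was not appropriate, or when
    # the active storage operation failed.
    return {"max": chunk.max(x, **kwargs), "N": numel(x, **kwargs)}

Because the fallback is the undecorated chunk function itself, enabling or disabling active storage changes where the reduction is computed, not its result.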
diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index b7dcd14eff..597d08469d 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -29,19 +29,68 @@ class ActiveStorageError(Exception): pass +def actify(method): + """Decorator for active storage reductions on chunks. + + Intended for to decorate the ``cf_*_chunk`` methods in + cf.data.collapse.dask_collapse`. + + When a ``cf_*_chunk`` method is decorated, its computations will + be attempted in active storage. If that is not possible (due to + configuration settings, limitations on the type of reduction that + can be done in active storage, or the active storage reduction + failed) then the computations will be done locally "as usual". + + .. versionadded:: NEXTVERSION + + .. seealso:: `active_chunk_function` + + :Parameters: + + method: `str` + The name of the reduction method. + + """ + + def decorator(chunk_function): + @wraps(chunk_function) + def wrapper(*args, **kwargs): + try: + # Try doing an active storage reduction + out = active_chunk_function(method, *args, **kwargs) + except ActiveStorageError as error: + # The active storage reduction failed + logger.warning( + f"{error} => reverting to local computation" + ) # pragma: no cover + else: + if out is not None: + # The active storage reduction succeeded + return out + + # Still here? Then using active storage was not + # appropriate, or else doing the active storage operation + # failed => do a local computation. + return chunk_function(*args, **kwargs) + + return wrapper + + return decorator + + def active_chunk_function(method, *args, **kwargs): """Collapse data in a chunk with active storage. Called by the `actify` decorator function. - If an active storage reduction is not approriate then `None` is + If an active storage reduction is not appropriate then `None` is returned. - If the active storage operation fails then ActiveStorageError is - raised. + If the active storage operation fails then an ActiveStorageError + is raised. If the active storage operation is successful then a dictionary of - redcution components, similar to that returned by a ``cf_*_chunk`` + reduction components, similar to that returned by a ``cf_*_chunk`` method, is returned. .. versionadded:: NEXTVERSION @@ -63,7 +112,7 @@ def active_chunk_function(method, *args, **kwargs): `dict` or `None` The reduced data in component form, or `None` if an active - storage reduction is not approriate. + storage reduction is not appropriate. **Examples** @@ -150,15 +199,16 @@ def active_chunk_function(method, *args, **kwargs): } index = x.index() - + + details = ( + f"{method!r} (file={filename}, address={address}, url={url}, " + f"Dask chunk={index})" + ) + info = is_log_level_info(logger) if info: # Do some detailed logging start = time.time() - details = ( - f"{method!r} (file={filename}, address={address}, url={url}, " - f"chunk={index})" - ) logger.info( f"STARTED active storage {details}: {datetime.datetime.now()}" ) # pragma: no cover @@ -178,14 +228,14 @@ def active_chunk_function(method, *args, **kwargs): d = active[index] except Exception as error: # Something went wrong with the active storage operations => - # Raise an ActiveStorageError that will in tuen trigger + # Raise an ActiveStorageError that will in turn trigger # (inside `actify`) a local reduction to be carried out # instead. 
raise ActiveStorageError( f"FAILED in active storage {details} ({error}))" ) else: - # Active storage reduction was successful + # Active storage reduction was successful if info: # Do some detailed logging try: @@ -205,7 +255,7 @@ def active_chunk_function(method, *args, **kwargs): f"selection 2 (s): {md['selection 2 time (s)']:6.2f}, " f"Total: {(time.time() - start):6.2f}s" ) # pragma: no cover - + # ---------------------------------------------------------------- # Active storage reduction was a success. Reformat the resulting # components dictionary 'd' to match the output of the @@ -225,55 +275,3 @@ def active_chunk_function(method, *args, **kwargs): d = {"N": d["n"], "sum": d["sum"]} return d - - -# -------------------------------------------------------------------- -# Decorators -# -------------------------------------------------------------------- -def actify(method): - """Decorator for active storage reductions on chunks. - - Intended for to decorate the ``cf_*_chunk`` methods in - cf.data.collapse.dask_collapse`. - - When a ``cf_*_chunk`` method is decorated, its computations will - be attempted in active storage. If that is not possible (due to - configuration settings, limitations on the type of reduction that - can be done in active storage, or the active storage reduction - failed) then the computations will be done locally "as usual". - - .. versionadded:: NEXTVERSION - - .. seealso:: `active_chunk_function` - - :Parameters: - - method: `str` - The name of the reduction method. - - """ - - def decorator(chunk_function): - @wraps(chunk_function) - def wrapper(*args, **kwargs): - try: - # Try doing an active storage reduction - out = active_chunk_function(method, *args, **kwargs) - except ActiveStorageError as error: - # The active storage reduction failed - logger.warning( - f"{error} => reverting to local computation" - ) # pragma: no cover - else: - if out is not None: - # The active storage reduction succeeded - return out - - # Still here? Then using active storage is not - # appropriate, or else doing the active storage operation - # failed => do a local computation. - return chunk_function(*args, **kwargs) - - return wrapper - - return decorator diff --git a/cf/functions.py b/cf/functions.py index aaea1c7a7f..bb973d3187 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -1262,7 +1262,7 @@ def _parse(cls, arg): """ try: - from activestorage import Active + from activestorage import Active # noqa: F401 except ModuleNotFoundError as error: if arg: raise ModuleNotFoundError( @@ -1358,7 +1358,7 @@ class active_storage_max_requests(ConstantAccess): This formula only applies to cases where all `dask` chunks for the collapse operation are utilising active storage. If some are not then :math:`N` will likely be underestimated. - + .. versionadded:: NEXTVERSION .. seealso:: `active_storage`, `active_storage_url`, From baf9898bec99cd5be04bac3651be835185d48068 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 21 Oct 2024 16:38:55 +0100 Subject: [PATCH 105/134] Fix typos Co-authored-by: Sadie L. 
Bartholomew --- cf/data/array/h5netcdfarray.py | 2 +- cf/data/array/mixin/arraymixin.py | 2 +- cf/data/array/mixin/cfamixin.py | 12 ++++++------ cf/data/array/mixin/indexmixin.py | 4 ++-- cf/data/collapse/collapse_active.py | 6 +++--- cf/data/dask_utils.py | 2 +- cf/data/data.py | 4 ++-- cf/read_write/read.py | 2 +- 8 files changed, 17 insertions(+), 17 deletions(-) diff --git a/cf/data/array/h5netcdfarray.py b/cf/data/array/h5netcdfarray.py index 2101899f41..cda5f20838 100644 --- a/cf/data/array/h5netcdfarray.py +++ b/cf/data/array/h5netcdfarray.py @@ -18,7 +18,7 @@ class H5netcdfArray( **Active storage reductions** - An active storage reductionx may be enabled with the `actify` + An active storage reduction may be enabled with the `actify` method. See `cf.data.collapse.Collapse` for details. .. versionadded:: NEXTVERSION diff --git a/cf/data/array/mixin/arraymixin.py b/cf/data/array/mixin/arraymixin.py index b68c596668..41ad6b5d6b 100644 --- a/cf/data/array/mixin/arraymixin.py +++ b/cf/data/array/mixin/arraymixin.py @@ -21,7 +21,7 @@ def __array_function__(self, func, types, args, kwargs): # REVIEW: active: `_meta`: Moved to here from `FileArrayMixin` @property def _meta(self): - """Normalize the array to an appropriate Dask meta object. + """Normalise the array to an appropriate Dask meta object. The Dask meta can be thought of as a suggestion to Dask. Dask uses this meta to generate the task graph until it can infer diff --git a/cf/data/array/mixin/cfamixin.py b/cf/data/array/mixin/cfamixin.py index 46299dc9e5..43fc23cf85 100644 --- a/cf/data/array/mixin/cfamixin.py +++ b/cf/data/array/mixin/cfamixin.py @@ -254,7 +254,7 @@ def _parse_cfa(self, x, term, substitutions): 3-`tuple` 1. The shape of the aggregated data. 2. The shape of the array of fragments. - 3. The parsed aggregation instructsions. + 3. The parsed aggregation instructions. """ aggregated_data = {} @@ -517,10 +517,10 @@ def get_term(self, default=ValueError()): def subarray_shapes(self, shapes): """Create the subarray shapes. - A fragmented dimenion (i.e. one spanned by two or fragments) - will always have a subarray size equal to the size of each of - its fragments, overriding any other size implied by the - *shapes* parameter. + A fragmented dimension (i.e. one spanned by two or more + fragments) will always have a subarray size equal to the + size of each of its fragments, overriding any other size + implied by the *shapes* parameter. .. versionadded:: 3.14.0 @@ -585,7 +585,7 @@ def subarray_shapes(self, shapes): if dim in f_dims: # This aggregated dimension is spanned by two or more # fragments => set the chunks to be the same size as - # the each fragment. + # each fragment. c = [] index = [0] * ndim for j in range(n_fragments): diff --git a/cf/data/array/mixin/indexmixin.py b/cf/data/array/mixin/indexmixin.py index 6dbd56c624..4cf2ad18b1 100644 --- a/cf/data/array/mixin/indexmixin.py +++ b/cf/data/array/mixin/indexmixin.py @@ -16,10 +16,10 @@ class IndexMixin: **Examples** - >>> a = cf.{{class}}(....) + >>> a = cf.{{class}}(...) >>> a.shape (6, 5) - >>> print(np.asanyarray(a) + >>> print(np.asanyarray(a)) [[ 0 1 2 3 4]) [ 5 6 7 8 9] [10 11 12 13 14] diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index 597d08469d..8614ce8960 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -32,8 +32,8 @@ class ActiveStorageError(Exception): def actify(method): """Decorator for active storage reductions on chunks. 
- Intended for to decorate the ``cf_*_chunk`` methods in - cf.data.collapse.dask_collapse`. + Intended to decorate the ``cf_*_chunk`` methods in + `cf.data.collapse.dask_collapse`. When a ``cf_*_chunk`` method is decorated, its computations will be attempted in active storage. If that is not possible (due to @@ -155,7 +155,7 @@ def active_chunk_function(method, *args, **kwargs): return if not getattr(x, "active_storage", False): - # The data data object 'x' is incompatible with active storage + # The data object 'x' is incompatible with active storage # operations. E.g. it is a UMArray object, a numpy array, etc. return diff --git a/cf/data/dask_utils.py b/cf/data/dask_utils.py index 3b9cc17ea9..c50e16d85f 100644 --- a/cf/data/dask_utils.py +++ b/cf/data/dask_utils.py @@ -736,7 +736,7 @@ def cf_filled(a, fill_value=None): def cf_asanyarray(a): """Convert to a `numpy` array. - Only do this is the input *a* has an `__asanyarray__` attribute + Only do this if the input *a* has an `__asanyarray__` attribute with value True. .. versionadded:: NEXTVERSION diff --git a/cf/data/data.py b/cf/data/data.py index 9a7afe9850..07372beb3c 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -1462,7 +1462,7 @@ def _set_dask(self, dx, copy=False, clear=_ALL, asanyarray=False): "suitability (such as data type casting, " "broadcasting, etc.). Note that the exception may be " "difficult to diagnose, as dask will have silently " - "trapped it and returned NotImplemented (seeprint , for " + "trapped it and returned NotImplemented (see, for " "instance, dask.array.core.elemwise). Print " "statements in a local copy of dask are possibly the " "way to go if the cause of the error is not obvious." @@ -9937,7 +9937,7 @@ def to_dask_array(self, apply_mask_hardness=False, asanyarray=None): dx = self._custom["dask"] # Note: The mask hardness functions have their own calls - # to 'cf_asanyarray', so we can don't need worry about + # to 'cf_asanyarray', so we don't need to worry about # setting another one. else: if asanyarray is None: diff --git a/cf/read_write/read.py b/cf/read_write/read.py index 1baf002358..1492e226b6 100644 --- a/cf/read_write/read.py +++ b/cf/read_write/read.py @@ -737,7 +737,7 @@ def read( ``{'client_kwargs': {'endpoint_url': 'https://store'}}`` *Parameter example:* - ``{'key: 'scaleway-api-key...', 'secret': + ``{'key': 'scaleway-api-key...', 'secret': 'scaleway-secretkey...', 'endpoint_url': 'https://s3.fr-par.scw.cloud', 'client_kwargs': {'region_name': 'fr-par'}}`` From 8697288d8726ca7fc262b7ba9aef1830ee6826c0 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 21 Oct 2024 16:40:09 +0100 Subject: [PATCH 106/134] Remove dead code Co-authored-by: Sadie L. Bartholomew --- cf/__init__.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/cf/__init__.py b/cf/__init__.py index 05b5efde60..9e630d86ea 100644 --- a/cf/__init__.py +++ b/cf/__init__.py @@ -216,12 +216,6 @@ ) # Check the version of dask -# _minimum_vn = "2022.12.1" -# if Version(dask.__version__) < Version(_minimum_vn): -# raise RuntimeError( -# f"Bad dask version: cf requires dask>={_minimum_vn}. " -# f"Got {dask.__version__} at {dask.__file__}" -# ) # Check the version of Python _minimum_vn = "3.8.0" From 03067a2bb7383b169a28035d00ccfe5922c34469 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 21 Oct 2024 16:41:20 +0100 Subject: [PATCH 107/134] Remove dead code Co-authored-by: Sadie L. 
Bartholomew --- cf/data/array/mixin/activestoragemixin.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/cf/data/array/mixin/activestoragemixin.py b/cf/data/array/mixin/activestoragemixin.py index 16b3f4fcb1..75e0c52be2 100644 --- a/cf/data/array/mixin/activestoragemixin.py +++ b/cf/data/array/mixin/activestoragemixin.py @@ -1,8 +1,4 @@ # REVIEW: active: `ActiveStorageMixin`: new mixin class `ActiveStorageMixin` -# try: -# from activestorage import Active -# except ModuleNotFoundError: -# Active = None class ActiveStorageMixin: From 20fe071a7f8b26c953a4a797178ef8ae817b2427 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 21 Oct 2024 16:43:18 +0100 Subject: [PATCH 108/134] When a note isn't a note Co-authored-by: Sadie L. Bartholomew --- cf/data/array/h5netcdfarray.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/cf/data/array/h5netcdfarray.py b/cf/data/array/h5netcdfarray.py index cda5f20838..f4355ac4f0 100644 --- a/cf/data/array/h5netcdfarray.py +++ b/cf/data/array/h5netcdfarray.py @@ -71,14 +71,13 @@ def _get_array(self, index=None): if index is None: index = self.index() - # Note: We need to lock because the netCDF file is about to be - # accessed. + # We need to lock because the netCDF file is about to be accessed. self._lock.acquire() - # Note: It's cfdm.H5netcdfArray.__getitem__ that we want to - # call here, but we use 'Container' in super because - # that comes immediately before cfdm.H5netcdfArray in - # the method resolution order. + # It's cfdm.H5netcdfArray.__getitem__ that we want to + # call here, but we use 'Container' in super because + # that comes immediately before cfdm.H5netcdfArray in + # the method resolution order. array = super(Container, self).__getitem__(index) self._lock.release() From ef8d9ae3a97dbef60f8a922f5055b701602c2c05 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 21 Oct 2024 17:13:25 +0100 Subject: [PATCH 109/134] trap no fragment files --- cf/data/fragment/netcdffragmentarray.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cf/data/fragment/netcdffragmentarray.py b/cf/data/fragment/netcdffragmentarray.py index 8cf7de8623..8cf8d86465 100644 --- a/cf/data/fragment/netcdffragmentarray.py +++ b/cf/data/fragment/netcdffragmentarray.py @@ -233,7 +233,10 @@ def _get_array(self, index=None): return H5netcdfFragmentArray(**kwargs)._get_array(index) # Still here? + if not filenames: + raise FileNotFoundError(f"No fragment files") + if len(filenames) == 1: raise FileNotFoundError(f"No such fragment file: {filenames[0]}") - + raise FileNotFoundError(f"No such fragment files: {filenames}") From 8b0086e63b99b9a3d0034425f0d789a46fd7bb7a Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 21 Oct 2024 17:14:33 +0100 Subject: [PATCH 110/134] Typo Co-authored-by: Sadie L. Bartholomew --- cf/field.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cf/field.py b/cf/field.py index 55c77fb4cd..4e38fda7e4 100644 --- a/cf/field.py +++ b/cf/field.py @@ -5592,7 +5592,7 @@ def collapse( When the data being collapsed are stored remotely, the collapse calculations may be carried out on a server (ideally one that is close in a network distance sense) to the data, - thereby removing the time and energy costs of transfering the + thereby removing the time and energy costs of transferring the entire un-collapsed data to the local client. 
Whether or not this will occur is determined on a case-by-case basis, and will only be done if all of the following criteria are met: From bd45bda11c77ff82bb6d5b59f2c8a4974af31db5 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 21 Oct 2024 17:19:34 +0100 Subject: [PATCH 111/134] Update cf.environment docs --- cf/functions.py | 78 ++++++++++++++++++++++++++----------------------- 1 file changed, 42 insertions(+), 36 deletions(-) diff --git a/cf/functions.py b/cf/functions.py index bb973d3187..f019031b74 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -3373,44 +3373,50 @@ def environment(display=True, paths=True): **Examples** >>> cf.environment() - Platform: Linux-4.15.0-54-generic-x86_64-with-glibc2.10 - HDF5 library: 1.10.6 - netcdf library: 4.8.0 - udunits2 library: /home/username/anaconda3/envs/cf-env/lib/libudunits2.so.0 - esmpy/ESMF: 8.4.1 /home/username/anaconda3/envs/cf-env/lib/python3.8/site-packages/esmpy/__init__.py - Python: 3.8.10 /home/username/anaconda3/envs/cf-env/bin/python - dask: 2022.6.0 /home/username/anaconda3/envs/cf-env/lib/python3.8/site-packages/dask/__init__.py - netCDF4: 1.5.6 /home/username/anaconda3/envs/cf-env/lib/python3.8/site-packages/netCDF4/__init__.py - psutil: 5.9.0 /home/username/anaconda3/envs/cf-env/lib/python3.8/site-packages/psutil/__init__.py - packaging: 21.3 /home/username/anaconda3/envs/cf-env/lib/python3.8/site-packages/packaging/__init__.py - numpy: 1.22.2 /home/username/anaconda3/envs/cf-env/lib/python3.8/site-packages/numpy/__init__.py - scipy: 1.10.0 /home/username/anaconda3/envs/cf-env/lib/python3.8/site-packages/scipy/__init__.py - matplotlib: 3.4.3 /home/username/anaconda3/envs/cf-env/lib/python3.8/site-packages/matplotlib/__init__.py - cftime: 1.6.0 /home/username/anaconda3/envs/cf-env/lib/python3.8/site-packages/cftime/__init__.py - cfunits: 3.3.6 /home/username/cfunits/cfunits/__init__.py - cfplot: 3.1.18 /home/username/anaconda3/envs/cf-env/lib/python3.8/site-packages/cfplot/__init__.py - cfdm: 1.10.1.0 /home/username/anaconda3/envs/cf-env/lib/python3.8/site-packages/cfdm/__init__.py - cf: 3.14.0 /home/username/anaconda3/envs/cf-env/lib/python3.8/site-packages/cf/__init__.py + Platform: Linux-5.15.0-122-generic-x86_64-with-glibc2.35 + HDF5 library: 1.12.2 + netcdf library: 4.9.3-development + udunits2 library: /home/user/lib/libudunits2.so.0 + esmpy/ESMF: 8.6.1 /home/user/lib/python3.12/site-packages/esmpy/__init__.py + Python: 3.12.2 /home/user/bin/python + dask: 2024.6.0 /home/user/lib/python3.12/site-packages/dask/__init__.py + netCDF4: 1.6.5 /home/user/lib/python3.12/site-packages/netCDF4/__init__.py + h5netcdf: 1.3.0 /home/user/lib/python3.12/site-packages/h5netcdf/__init__.py + h5py: 3.11.0 /home/user/lib/python3.12/site-packages/h5py/__init__.py + s3fs: 2024.6.0 /home/user/lib/python3.12/site-packages/s3fs/__init__.py + psutil: 5.9.8 /home/user/lib/python3.12/site-packages/psutil/__init__.py + packaging: 23.2 /home/user/lib/python3.12/site-packages/packaging/__init__.py + numpy: 1.26.4 /home/user/lib/python3.12/site-packages/numpy/__init__.py + scipy: 1.13.0 /home/user/lib/python3.12/site-packages/scipy/__init__.py + matplotlib: 3.8.4 /home/user/lib/python3.12/site-packages/matplotlib/__init__.py + cftime: 1.6.3 /home/user/lib/python3.12/site-packages/cftime/__init__.py + cfunits: 3.3.7 /home/user/lib/python3.12/site-packages/cfunits/__init__.py + cfplot: 3.3.0 /home/user/lib/python3.12/site-packages/cfplot/__init__.py + cfdm: 1.11.2.0 /home/user/cfdm/cfdm/__init__.py + cf: NEXTVERSION 
/home/user/cf-python/cf/__init__.py >>> cf.environment(paths=False) - Platform: Linux-4.15.0-54-generic-x86_64-with-glibc2.10 - HDF5 library: 1.10.6 - netcdf library: 4.8.0 - udunits2 library: libudunits2.so.0 - esmpy/ESMF: 8.4.1 - Python: 3.8.10 - dask: 2022.6.0 - netCDF4: 1.5.6 - psutil: 5.9.0 - packaging: 21.3 - numpy: 1.22.2 - scipy: 1.10.0 - matplotlib: 3.4.3 - cftime: 1.6.0 - cfunits: 3.3.6 - cfplot: 3.1.18 - cfdm: 1.10.1.0 - cf: 3.14.0 + Platform: Linux-5.15.0-122-generic-x86_64-with-glibc2.35 + HDF5 library: 1.12.2 + netcdf library: 4.9.3-development + udunits2 library: /home/user/lib/libudunits2.so.0 + esmpy/ESMF: 8.6.1 + Python: 3.12.2 + dask: 2024.6.0 + netCDF4: 1.6.5 + h5netcdf: 1.3.0 + h5py: 3.11.0 + s3fs: 2024.6.0 + psutil: 5.9.8 + packaging: 23.2 + numpy: 1.26.4 + scipy: 1.13.0 + matplotlib: 3.8.4 + cftime: 1.6.3 + cfunits: 3.3.7 + cfplot: 3.3.0 + cfdm: 1.11.2.0 + cf: NEXTVERSION """ dependency_version_paths_mapping = { From e2bdf644ad8c219bfd7ae8a49205a06bfaf4a098 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 21 Oct 2024 17:20:58 +0100 Subject: [PATCH 112/134] Clarify is_log_level_info docs Co-authored-by: Sadie L. Bartholomew --- cf/functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cf/functions.py b/cf/functions.py index f019031b74..b4967a8907 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -1558,7 +1558,7 @@ def total_memory(): def is_log_level_info(logger): - """Return True if and only if log level is at least INFO. + """Return True if and only if log level is at least as verbose as INFO. .. versionadded:: NEXTVERSION From af54bd141fad17daf96c8546e845e92b5effedf4 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 21 Oct 2024 19:23:10 +0100 Subject: [PATCH 113/134] dev --- cf/data/array/mixin/filearraymixin.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cf/data/array/mixin/filearraymixin.py b/cf/data/array/mixin/filearraymixin.py index b5b314b9e2..4817e17688 100644 --- a/cf/data/array/mixin/filearraymixin.py +++ b/cf/data/array/mixin/filearraymixin.py @@ -17,6 +17,7 @@ def __dask_tokenize__(self): .. versionadded:: 3.15.0 """ + print(6666) return ( self.__class__, self.shape, From df7a6728c98990f2338580b58f6146f7cdbc588f Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 22 Oct 2024 08:44:15 +0100 Subject: [PATCH 114/134] Fix typos Co-authored-by: Sadie L. Bartholomew --- cf/read_write/netcdf/netcdfwrite.py | 2 +- cf/read_write/read.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cf/read_write/netcdf/netcdfwrite.py b/cf/read_write/netcdf/netcdfwrite.py index 930a6d3e0f..a857ce89d8 100644 --- a/cf/read_write/netcdf/netcdfwrite.py +++ b/cf/read_write/netcdf/netcdfwrite.py @@ -883,7 +883,7 @@ def _cfa_aggregation_instructions(self, data, cfvar): raise ValueError( f"Can't write {cfvar!r} as a CFA-netCDF " "aggregation variable: Dask chunk defined by index " - f"{indices} spans two or more fragments." + f"{indices} spans two or more fragments. " "A possible fix for this is to set chunks=None as " "an argument of a prior call to cf.read" ) diff --git a/cf/read_write/read.py b/cf/read_write/read.py index 1492e226b6..3e1f2ec6ae 100644 --- a/cf/read_write/read.py +++ b/cf/read_write/read.py @@ -672,8 +672,8 @@ def read( names. Each key may be specified with or without the ``${*}`` syntax (where `*` represents any amount of any characters). For instance, ``{'substitution': - 'replacement'}`` and ``{'${substitution}' are equivalent - 'replacement'}``. 
The substitutions are used in + 'replacement'}`` and ``{'${substitution}': 'replacement'}``' + are equivalent. The substitutions are used in conjunction with, and take precedence over, any that are stored in the CFA-netCDF file by the ``substitutions`` attribute of the ``file`` fragement array variable. @@ -742,7 +742,7 @@ def read( 'https://s3.fr-par.scw.cloud', 'client_kwargs': {'region_name': 'fr-par'}}`` - .. versionadded:: NEXTVERSION + .. versionadded:: NEXTVERSION cache: `bool`, optional If True, the default, then cache the first and last array From 9b4f721f10d0a7316c4466b48be5500dede29632 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 22 Oct 2024 08:47:31 +0100 Subject: [PATCH 115/134] Fix typos Co-authored-by: Sadie L. Bartholomew --- cf/functions.py | 4 ++-- docs/source/tutorial.rst | 2 +- requirements.txt | 1 - 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/cf/functions.py b/cf/functions.py index b4967a8907..315aba09de 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -1339,7 +1339,7 @@ def _parse(cls, arg): # REVIEW: active: `active_storage_max_requests`: new function class active_storage_max_requests(ConstantAccess): - """Cconcurrent active storage server requests per `dask` chunk. + """Concurrent active storage server requests per `dask` chunk. This is the maximum number of concurrent requests per `dask` chunk that are sent to the active storage server by an `Active` @@ -1350,7 +1350,7 @@ class active_storage_max_requests(ConstantAccess): active storage server can process concurrently before its performance is degraded; - * :math:`N_{PE}` the number of available PEs on the local client. + * :math:`N_{PE}` the number of available PEs on the local client; * :math:`N_{chunk}` the number of `dask` chunks being reduced with active storage. diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst index 2951668cad..43f76b3151 100644 --- a/docs/source/tutorial.rst +++ b/docs/source/tutorial.rst @@ -4610,7 +4610,7 @@ instances for the field and metadata constructs. It is, however, possible to create data from arrays that reside on disk. The `cf.read` function creates data in this manner. A pointer to an array in a netCDF file can be stored in a `cf.NetCDF4Array` or -`~cf.H5netcdfAarray` instance, which is is used to initialise a +`~cf.H5netcdfAarray` instance, which is used to initialise a `cf.Data` instance. .. code-block:: python diff --git a/requirements.txt b/requirements.txt index f467069169..8b01daddca 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,4 +10,3 @@ scipy>=1.10.0 h5netcdf>=1.3.0 h5py>=3.10.0 s3fs>=2024.2.0 - From 7dd8ff53fd7a54d24dbcd3d31aeddf559f67ce11 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 22 Oct 2024 08:52:02 +0100 Subject: [PATCH 116/134] Typo Co-authored-by: Sadie L. 
Bartholomew --- cf/data/array/netcdfarray.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cf/data/array/netcdfarray.py b/cf/data/array/netcdfarray.py index 5c382bf123..fab088f6f3 100644 --- a/cf/data/array/netcdfarray.py +++ b/cf/data/array/netcdfarray.py @@ -9,7 +9,7 @@ class NetCDFArray: def __init__(self, *args, **kwargs): """**Initialisation**""" - from ..functions import DeprecationError + from ...functions import DeprecationError raise DeprecationError( f"{self.__class__.__name__} was deprecated at version NEXTVERSION " From 96eb691ec9176a3eb5e2dce2d5158b428cf8078f Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 22 Oct 2024 09:00:29 +0100 Subject: [PATCH 117/134] activestorage installation instructions --- docs/source/installation.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/source/installation.rst b/docs/source/installation.rst index a864b25e6b..601fc487fc 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -270,8 +270,10 @@ environments for which these features are not required. .. rubric:: Active storage collapses -* `activestorage `_, version 1.0.0 - or newer. +* `activestorage `_. This + library is not yet in PyPi (it will be soon), so in the interim you + must use a copy of the ``pyfive`` branch of the + https://github.com/NCAS-CMS/PyActiveStorage repository. ---- From 2f9a47fa9f18e60fcddcd42707f50a4ca77a6206 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 22 Oct 2024 09:07:33 +0100 Subject: [PATCH 118/134] Typo Co-authored-by: Sadie L. Bartholomew --- docs/source/field_analysis.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/field_analysis.rst b/docs/source/field_analysis.rst index f81bfd9965..638ee2d073 100644 --- a/docs/source/field_analysis.rst +++ b/docs/source/field_analysis.rst @@ -860,7 +860,7 @@ Active storage collapses When the data being collapsed are stored remotely, the collapse calculations may be carried out on a server that is close (in a network distance sense) to the data, thereby removing the time and -power costs of transfering the entire un-collapsed data to the local +power costs of transferring the entire un-collapsed data to the local client. Whether or not this will occur for an individual `dask` chunk is From 9b0e8a614b74d1c5ee5a3914b6c2b7fbfaaeadd9 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 22 Oct 2024 09:37:45 +0100 Subject: [PATCH 119/134] dask_task_graph.png -> dask_task_graph.svg --- docs/source/performance.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/performance.rst b/docs/source/performance.rst index 9ceb7cd446..a799af42a4 100644 --- a/docs/source/performance.rst +++ b/docs/source/performance.rst @@ -213,7 +213,7 @@ basic data computation over four chunks: [18 19 20 21 22 23] [24 25 26 27 28 29]] >>> e = d ** 2 + d - >>> e.to_dask_array().visualize('dask_task_graph.png') + >>> e.to_dask_array().visualize('dask_task_graph.svg') >>> print(e.array) [[ 0 2 6 12 20 30] [ 42 56 72 90 110 132] @@ -221,14 +221,14 @@ basic data computation over four chunks: [342 380 420 462 506 552] [600 650 702 756 812 870]] -The image file ``dask_task_graph.png`` contains the visualisation of +The image file ``dask_task_graph.svg`` contains the visualisation of the Dask task graph, showing the operations on each chunk: .. figure:: images/dask_task_graph.svg :scale: 8 % The operations were only executed when their result was requested with -the final ``e.array`` command. 
The boxes in ``dask_task_graph.png`` +the final ``e.array`` command. The boxes in ``dask_task_graph.svg`` represent the data chunks and the circles represent the operations to be performed on the chunks. The five boxes in the bottom row are the starting data (i.e. the four chunks of ``d`` and the scalar ``2``), From 6af723cdd6c8c4db7dad9cff66007907925b8b4e Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 22 Oct 2024 09:53:53 +0100 Subject: [PATCH 120/134] remove redundant active_storage test --- cf/test/test_Data.py | 48 -------------------------------------------- 1 file changed, 48 deletions(-) diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 7f4816e95b..c9d32b04c6 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -4520,54 +4520,6 @@ def test_Data__str__(self): for element in elements0: self.assertNotIn(element, d._get_cached_elements()) - # # REVIEW: getitem: `test_Data_active_storage`: test `Data.active_storage` - # def test_Data_active_storage(self): - # """Test `Data.active_storage`.""" - # with cf.active_storage(True): - # d = cf.Data([[9, 8]]) - # self.assertFalse(d.active_storage) - # - # d._set_active_storage(True) - # self.assertTrue(d.active_storage) - # d._del_active_storage() - # self.assertFalse(d.active_storage) - # - # # Check that operations correctly set active_storage to - # # False, in particular those that do not invokde - # # `Data._set_dask`. - # d._set_active_storage(True) - # d.transpose(inplace=True) - # self.assertFalse(d.active_storage) - # - # d._set_active_storage(True) - # d[...] = -1 - # self.assertFalse(d.active_storage) - # - # d._set_active_storage(True) - # d.persist(inplace=True) - # self.assertFalse(d.active_storage) - # - # # Rechunk should preserve active_storage - # d._set_active_storage(True) - # d.rechunk(1, inplace=True) - # self.assertTrue(d.active_storage) - # - # # __getitem__ should preserve active_storage - # d._set_active_storage(True) - # self.assertTrue(d[0, 3:].active_storage) - # - # # Test with data on disk - # n = cf.NetCDF4Array( - # "test_file.nc", - # "eastward_wind", - # shape=(1, 9, 10), - # dtype=np.dtype(float), - # ) - # d = cf.Data(n) - # self.assertTrue(d.active_storage) - # d = cf.Data(n, to_memory=True) - # self.assertFalse(d.active_storage) - # REVIEW: getitem: `test_Data_cull_graph`: prevent new asanyarray layer def test_Data_cull_graph(self): """Test `Data.cull`""" From dc173a38a237730aa171d53f56d8463fa8fbf3a0 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 22 Oct 2024 09:58:46 +0100 Subject: [PATCH 121/134] fix active doc string --- cf/data/collapse/collapse.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cf/data/collapse/collapse.py b/cf/data/collapse/collapse.py index 57b48a875e..58069a2576 100644 --- a/cf/data/collapse/collapse.py +++ b/cf/data/collapse/collapse.py @@ -47,7 +47,6 @@ class Collapse(metaclass=DocstringRewriteMeta): active storage server is sufficiently far away from the data then it could even be faster and require less energy to do non-active operation of the local client. 
- The performance improvements from using active storage See `cf.data.collapse.collapse_active.actify` and `cf.data.collapse.collapse_active.active_chunk_function` for From 050913581485d628a4bce914bd1da1b0c22e638f Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 22 Oct 2024 10:05:41 +0100 Subject: [PATCH 122/134] trap: No module named 'activestorage' --- cf/test/test_functions.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cf/test/test_functions.py b/cf/test/test_functions.py index 0582244210..0ff8995d1c 100644 --- a/cf/test/test_functions.py +++ b/cf/test/test_functions.py @@ -98,7 +98,12 @@ def test_configuration(self): # Test the setting of each lone item. expected_post_set = dict(org) # copy for safety with mutable dict for setting, value in reset_values.items(): - cf.configuration(**{setting: value}) + try: + cf.configuration(**{setting: value}) + except ModuleNotFoundError as error: + print(f"WARNING: not testing {setting!r} due to: {error}") + continue + post_set = cf.configuration() # Expect a dict that is identical to the original to start From b72c17bde48bb5f9f7f5e0d60ebbca2a0f29195e Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 22 Oct 2024 11:50:03 +0100 Subject: [PATCH 123/134] correct hdf5 chunks after data operations --- cf/data/array/mixin/filearraymixin.py | 1 - cf/data/data.py | 22 ++++++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/cf/data/array/mixin/filearraymixin.py b/cf/data/array/mixin/filearraymixin.py index 4817e17688..b5b314b9e2 100644 --- a/cf/data/array/mixin/filearraymixin.py +++ b/cf/data/array/mixin/filearraymixin.py @@ -17,7 +17,6 @@ def __dask_tokenize__(self): .. versionadded:: 3.15.0 """ - print(6666) return ( self.__class__, self.shape, diff --git a/cf/data/data.py b/cf/data/data.py index 07372beb3c..b46da64674 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -8650,6 +8650,13 @@ def insert_dimension(self, position=0, inplace=False): data_axes.insert(position, axis) d._axes = data_axes + # Update the HDF5 chunking strategy + chunksizes = d.nc_hdf5_chunksizes() + if isinstance(chunksizes, tuple): + chunksizes = list(chunksizes) + chunksizes.insert(position, 1) + d.nc_set_hdf5_chunksizes(chunksizes) + return d @_deprecated_kwarg_check("size", version="3.14.0", removed_at="5.0.0") @@ -11891,6 +11898,14 @@ def squeeze(self, axes=None, inplace=False, i=False): # Remove the squeezed axes names d._axes = [axis for i, axis in enumerate(d._axes) if i not in iaxes] + # Update the HDF5 chunking strategy + chunksizes = d.nc_hdf5_chunksizes() + if isinstance(chunksizes, tuple): + chunksizes = [ + size for i, size in enumerate(chunksizes) if i not in iaxes + ] + d.nc_set_hdf5_chunksizes(chunksizes) + return d @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") @@ -12137,6 +12152,13 @@ def transpose(self, axes=None, inplace=False, i=False): ) d._set_dask(dx) + + # Update the HDF5 chunking strategy + chunksizes = d.nc_hdf5_chunksizes() + if isinstance(chunksizes, tuple): + chunksizes = [chunksizes[i] for i in axes] + d.nc_set_hdf5_chunksizes(chunksizes) + return d @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") From dd92b55d343f7f341f7fdbf78bf12f1e988c3929 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 22 Oct 2024 12:27:39 +0100 Subject: [PATCH 124/134] set default mtol=1 everywhere, and update docstrings --- cf/data/collapse/collapse.py | 26 +++++++++++++------------- cf/data/collapse/dask_collapse.py | 11 +++++------ cf/data/utils.py | 30 
++++++++++++++++++------------ cf/docstring/docstring.py | 18 ++++++++---------- cf/field.py | 2 +- 5 files changed, 45 insertions(+), 42 deletions(-) diff --git a/cf/data/collapse/collapse.py b/cf/data/collapse/collapse.py index 58069a2576..550c14959c 100644 --- a/cf/data/collapse/collapse.py +++ b/cf/data/collapse/collapse.py @@ -93,7 +93,7 @@ def max( a, axis=None, keepdims=False, - mtol=None, + mtol=1, split_every=None, chunk_function=None, ): @@ -206,7 +206,7 @@ def mean( axis=None, weights=None, keepdims=False, - mtol=None, + mtol=1, split_every=None, chunk_function=None, ): @@ -272,7 +272,7 @@ def mean_abs( weights=None, axis=None, keepdims=False, - mtol=None, + mtol=1, split_every=None, chunk_function=None, ): @@ -325,7 +325,7 @@ def mid_range( axis=None, dtype=None, keepdims=False, - mtol=None, + mtol=1, split_every=None, chunk_function=None, ): @@ -391,7 +391,7 @@ def min( a, axis=None, keepdims=False, - mtol=None, + mtol=1, split_every=None, chunk_function=None, ): @@ -453,7 +453,7 @@ def min_abs( a, axis=None, keepdims=False, - mtol=None, + mtol=1, split_every=None, chunk_function=None, ): @@ -502,7 +502,7 @@ def range( a, axis=None, keepdims=False, - mtol=None, + mtol=1, split_every=None, chunk_function=None, ): @@ -569,7 +569,7 @@ def rms( axis=None, weights=None, keepdims=False, - mtol=None, + mtol=1, split_every=None, chunk_function=None, ): @@ -635,7 +635,7 @@ def sample_size( a, axis=None, keepdims=False, - mtol=None, + mtol=1, split_every=None, chunk_function=None, ): @@ -702,7 +702,7 @@ def sum( axis=None, weights=None, keepdims=False, - mtol=None, + mtol=1, split_every=None, chunk_function=None, ): @@ -771,7 +771,7 @@ def sum_of_weights( axis=None, weights=None, keepdims=False, - mtol=None, + mtol=1, split_every=None, chunk_function=None, ): @@ -841,7 +841,7 @@ def sum_of_weights2( axis=None, weights=None, keepdims=False, - mtol=None, + mtol=1, split_every=None, chunk_function=None, ): @@ -959,7 +959,7 @@ def var( axis=None, weights=None, keepdims=False, - mtol=None, + mtol=1, ddof=None, split_every=None, chunk_function=None, diff --git a/cf/data/collapse/dask_collapse.py b/cf/data/collapse/dask_collapse.py index f868bab905..8610d43a32 100644 --- a/cf/data/collapse/dask_collapse.py +++ b/cf/data/collapse/dask_collapse.py @@ -41,16 +41,15 @@ def mask_small_sample_size(x, N, axis, mtol, original_shape): mtol: number The sample size threshold below which collapsed values are set to missing data. It is defined as a fraction (between - 0 and 1 inclusive) of the contributing input data values. + 0 and 1 inclusive) of the contributing input data + values. A missing value in the output array occurs + whenever more than ``100*mtol%`` of its contributing input + array elements are missing data. - The default of *mtol* is 1, meaning that a missing datum + The default of *mtol* is 1, meaning that a missing value in the output array occurs whenever all of its contributing input array elements are missing data. - For other values, a missing datum in the output array - occurs whenever more than ``100*mtol%`` of its - contributing input array elements are missing data. - Note that for non-zero values of *mtol*, different collapsed elements may have different sample sizes, depending on the distribution of missing data in the input diff --git a/cf/data/utils.py b/cf/data/utils.py index 4e12781aeb..a026dba58f 100644 --- a/cf/data/utils.py +++ b/cf/data/utils.py @@ -780,7 +780,7 @@ def collapse( The function that collapses the underlying `dask` array of *d*. 
Must have the minimum signature (parameters and default values) ``func(dx, axis=None, keepdims=False, - mtol=None, split_every=None)`` (optionally including + mtol=1, split_every=None)`` (optionally including ``weights=None`` or ``ddof=None``), where ``dx`` is a the dask array contained in *d*. @@ -829,23 +829,29 @@ def collapse( mtol: number, optional The sample size threshold below which collapsed values are set to missing data. It is defined as a fraction (between - 0 and 1 inclusive) of the contributing input data values. + 0 and 1 inclusive) of the contributing input data + values. A missing value in the output array occurs + whenever more than ``100*mtol%`` of its contributing input + array elements are missing data. - The default of *mtol* is 1, meaning that a missing datum + The default of *mtol* is 1, meaning that a missing value in the output array occurs whenever all of its contributing input array elements are missing data. - For other values, a missing datum in the output array - occurs whenever more than ``100*mtol%`` of its - contributing input array elements are missing data. + Note that for non-zero values of *mtol*, different + collapsed elements may have different sample sizes, + depending on the distribution of missing data in the input + data. ddof: number, optional - The delta degrees of freedom. The number of degrees of - freedom used in the calculation is (N-*ddof*) where N - represents the number of non-missing elements. - - For collapse functions that do not have a ``ddof`` - parameter, *ddof* must be `None`. + The delta degrees of freedom, a non-negative number. The + number of degrees of freedom used in the calculation is + ``N-ddof`` where ``N`` is the number of non-missing + elements. A value of 1 applies Bessel's correction. If the + calculation is weighted then *ddof* can only be 0 or 1. + + For collapse functions for which delta degrees of freedom + is not applicable (such as `max`), *ddof* must be `None`. split_every: `int` or `dict`, optional Determines the depth of the recursive aggregation. See diff --git a/cf/docstring/docstring.py b/cf/docstring/docstring.py index bf332d9642..bceb78dd4d 100644 --- a/cf/docstring/docstring.py +++ b/cf/docstring/docstring.py @@ -282,14 +282,12 @@ The sample size threshold below which collapsed values are set to missing data. It is defined as a fraction (between 0 and 1 inclusive) of the contributing input - data values. - - The default of *mtol* is 1, meaning that a missing - datum in the output array occurs whenever all of its + data values. A missing value in the output array + occurs whenever more than ``100*mtol%`` of its contributing input array elements are missing data. - For other values, a missing datum in the output array - occurs whenever more than ``100*mtol%`` of its + The default of *mtol* is 1, meaning that a missing + value in the output array occurs whenever all of its contributing input array elements are missing data. Note that for non-zero values of *mtol*, different @@ -300,10 +298,10 @@ "{{ddof: number}}": """ddof: number The delta degrees of freedom, a non-negative number. The number of degrees of freedom used in the - calculation is (N-*ddof*) where N represents the - number of non-missing elements. A value of 1 applies - Bessel's correction. If the calculation is weighted - then *ddof* can only be 0 or 1.""", + calculation is ``N-ddof`` where ``N`` is the number of + non-missing elements. A value of 1 applies Bessel's + correction. 
If the calculation is weighted then *ddof* + can only be 0 or 1.""", # split_every "{{split_every: `int` or `dict`, optional}}": """split_every: `int` or `dict`, optional Determines the depth of the recursive aggregation. If diff --git a/cf/field.py b/cf/field.py index 4e38fda7e4..d536dc6d16 100644 --- a/cf/field.py +++ b/cf/field.py @@ -7402,7 +7402,7 @@ def _collapse_grouped( group=None, group_span=None, group_contiguous=False, - mtol=None, + mtol=1, ddof=None, regroup=None, coordinate=None, From 66b84aed39492fcc529dc2edefe861bf1e4443d2 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 22 Oct 2024 12:47:48 +0100 Subject: [PATCH 125/134] warning note about current and futiue Active class APIs --- cf/data/collapse/collapse_active.py | 9 +++++++-- cf/test/test_active_storage.py | 1 - 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index 8614ce8960..0f6b61a09e 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -188,15 +188,20 @@ def active_chunk_function(method, *args, **kwargs): filename = x.get_filename() address = x.get_address() max_requests = active_storage_max_requests() - active_kwargs = { "uri": "/".join(filename.split("/")[3:]), "ncvar": address, "storage_options": x.get_storage_options(), "active_storage_url": url, - "storage_type": "s3", # Temporary requirement to Active! + "storage_type": "s3", "max_threads": max_requests, } + # WARNING: The "uri", "storage_options", and "storage_type" keys + # of the `active_kwargs` dictionary are currently + # formatted according to the whims of the `Active` class + # (i.e. the pyfive branch of PyActiveStorage). Future + # versions of `Active` will have a better API, that will + # require improvements to `active_kwargs`. index = x.index() diff --git a/cf/test/test_active_storage.py b/cf/test/test_active_storage.py index b88075ea74..8c7af64bdc 100644 --- a/cf/test/test_active_storage.py +++ b/cf/test/test_active_storage.py @@ -55,7 +55,6 @@ def test_active_storage(self): with cf.configuration(active_storage=True, active_storage_url="dummy"): self.assertTrue(cf.active_storage()) self.assertEqual(cf.active_storage_url(), "dummy") - # self.assertTrue(f.data.active_storage) active_array = f.collapse("mean", weights=False).array self.assertEqual(active_array, local_array) From edc51cd119b0af5005349a1fc28a2020b134b3da Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 22 Oct 2024 12:51:32 +0100 Subject: [PATCH 126/134] warning note about current and future Active class APIs --- cf/data/collapse/collapse_active.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index 0f6b61a09e..120cf15259 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -222,7 +222,12 @@ def active_chunk_function(method, *args, **kwargs): active.method = method active.components = True - # Force an active storage reduction on the remote server + # Instruct the `Active` class to do attempt active storage + # reduction on the remote server + # + # WARNING: The `_version` API of `Active` is likely to change from + # the current version (i.e. 
the pyfive branch of + # PyActiveStorage) active._version = 2 # ---------------------------------------------------------------- From 9e0b44612e7cd1b05eb6e65457886adaee0df467 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 22 Oct 2024 12:54:41 +0100 Subject: [PATCH 127/134] linting --- cf/data/collapse/collapse_active.py | 2 +- cf/data/fragment/netcdffragmentarray.py | 6 +++--- cf/data/utils.py | 2 +- cf/functions.py | 8 ++++---- cf/test/test_functions.py | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index 120cf15259..97dc955d38 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -222,7 +222,7 @@ def active_chunk_function(method, *args, **kwargs): active.method = method active.components = True - # Instruct the `Active` class to do attempt active storage + # Instruct the `Active` class to attempt an active storage # reduction on the remote server # # WARNING: The `_version` API of `Active` is likely to change from diff --git a/cf/data/fragment/netcdffragmentarray.py b/cf/data/fragment/netcdffragmentarray.py index 8cf8d86465..c5365098c9 100644 --- a/cf/data/fragment/netcdffragmentarray.py +++ b/cf/data/fragment/netcdffragmentarray.py @@ -234,9 +234,9 @@ def _get_array(self, index=None): # Still here? if not filenames: - raise FileNotFoundError(f"No fragment files") - + raise FileNotFoundError("No fragment files") + if len(filenames) == 1: raise FileNotFoundError(f"No such fragment file: {filenames[0]}") - + raise FileNotFoundError(f"No such fragment files: {filenames}") diff --git a/cf/data/utils.py b/cf/data/utils.py index a026dba58f..454b811337 100644 --- a/cf/data/utils.py +++ b/cf/data/utils.py @@ -849,7 +849,7 @@ def collapse( ``N-ddof`` where ``N`` is the number of non-missing elements. A value of 1 applies Bessel's correction. If the calculation is weighted then *ddof* can only be 0 or 1. - + For collapse functions for which delta degrees of freedom is not applicable (such as `max`), *ddof* must be `None`. 
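A minimal sketch of the mtol and ddof behaviour documented in the docstrings above (the data values and method calls are illustrative assumptions for a small in-memory cf.Data object, not taken from the patches):

    import numpy as np
    import cf

    d = cf.Data(np.ma.masked_values([1.0, 2.0, 4.0, -99.0], -99.0))

    # mtol=1 (the default): a collapsed value is missing only when all of
    # its contributing input elements are missing, so this mean over a
    # partly masked array is not masked.
    print(d.mean(mtol=1).array)

    # mtol=0: a collapsed value is missing whenever any contributing input
    # element is missing, so the same collapse now returns a masked result.
    print(d.mean(mtol=0).array)

    # ddof=1: the variance uses N - 1 degrees of freedom (Bessel's
    # correction); ddof is not applicable to collapses such as max.
    print(d.var(ddof=1).array)
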
diff --git a/cf/functions.py b/cf/functions.py index 315aba09de..da04b18f58 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -3373,10 +3373,10 @@ def environment(display=True, paths=True): **Examples** >>> cf.environment() - Platform: Linux-5.15.0-122-generic-x86_64-with-glibc2.35 - HDF5 library: 1.12.2 - netcdf library: 4.9.3-development - udunits2 library: /home/user/lib/libudunits2.so.0 + Platform: Linux-5.15.0-122-generic-x86_64-with-glibc2.35 + HDF5 library: 1.12.2 + netcdf library: 4.9.3-development + udunits2 library: /home/user/lib/libudunits2.so.0 esmpy/ESMF: 8.6.1 /home/user/lib/python3.12/site-packages/esmpy/__init__.py Python: 3.12.2 /home/user/bin/python dask: 2024.6.0 /home/user/lib/python3.12/site-packages/dask/__init__.py diff --git a/cf/test/test_functions.py b/cf/test/test_functions.py index 0ff8995d1c..32bc3c4bd1 100644 --- a/cf/test/test_functions.py +++ b/cf/test/test_functions.py @@ -103,7 +103,7 @@ def test_configuration(self): except ModuleNotFoundError as error: print(f"WARNING: not testing {setting!r} due to: {error}") continue - + post_set = cf.configuration() # Expect a dict that is identical to the original to start From 885a67d44b658b071e971beea288a0935b782275 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 23 Oct 2024 09:44:51 +0100 Subject: [PATCH 128/134] Fix missing methods in on-line API docs --- docs/source/class/cf.AuxiliaryCoordinate.rst | 3 +++ docs/source/class/cf.Bounds.rst | 3 +++ docs/source/class/cf.CellMeasure.rst | 3 +++ docs/source/class/cf.Count.rst | 5 ++++- docs/source/class/cf.DimensionCoordinate.rst | 5 ++++- docs/source/class/cf.DomainAncillary.rst | 3 +++ docs/source/class/cf.Field.rst | 3 +++ docs/source/class/cf.FieldAncillary.rst | 3 +++ docs/source/class/cf.GatheredArray.rst | 3 ++- docs/source/class/cf.H5netcdfArray.rst | 12 +++++++---- docs/source/class/cf.Index.rst | 3 +++ docs/source/class/cf.List.rst | 3 +++ docs/source/class/cf.NetCDF4Array.rst | 20 ++++++++++++------- .../source/class/cf.RaggedContiguousArray.rst | 3 ++- docs/source/class/cf.RaggedIndexedArray.rst | 3 ++- .../class/cf.RaggedIndexedContiguousArray.rst | 3 ++- 16 files changed, 61 insertions(+), 17 deletions(-) diff --git a/docs/source/class/cf.AuxiliaryCoordinate.rst b/docs/source/class/cf.AuxiliaryCoordinate.rst index 3cc8f983d8..29cea86865 100644 --- a/docs/source/class/cf.AuxiliaryCoordinate.rst +++ b/docs/source/class/cf.AuxiliaryCoordinate.rst @@ -501,6 +501,9 @@ NetCDF ~cf.AuxiliaryCoordinate.nc_set_node_coordinate_variable ~cf.AuxiliaryCoordinate.nc_set_node_coordinate_variable_groups ~cf.AuxiliaryCoordinate.nc_clear_node_coordinate_variable_groups + ~cf.AuxiliaryCoordinate.nc_clear_hdf5_chunksizes + ~cf.AuxiliaryCoordinate.nc_hdf5_chunksizes + ~cf.AuxiliaryCoordinate.nc_set_hdf5_chunksizes Groups ^^^^^^ diff --git a/docs/source/class/cf.Bounds.rst b/docs/source/class/cf.Bounds.rst index 9f93594073..e963174b90 100644 --- a/docs/source/class/cf.Bounds.rst +++ b/docs/source/class/cf.Bounds.rst @@ -413,6 +413,9 @@ NetCDF ~cf.Bounds.nc_get_dimension ~cf.Bounds.nc_has_dimension ~cf.Bounds.nc_set_dimension + ~cf.Bounds.nc_clear_hdf5_chunksizes + ~cf.Bounds.nc_hdf5_chunksizes + ~cf.Bounds.nc_set_hdf5_chunksizes CFA --- diff --git a/docs/source/class/cf.CellMeasure.rst b/docs/source/class/cf.CellMeasure.rst index d3384285b5..e683526f04 100644 --- a/docs/source/class/cf.CellMeasure.rst +++ b/docs/source/class/cf.CellMeasure.rst @@ -433,6 +433,9 @@ NetCDF ~cf.CellMeasure.nc_set_variable ~cf.CellMeasure.nc_get_external ~cf.CellMeasure.nc_set_external 
+ ~cf.CellMeasure.nc_clear_hdf5_chunksizes + ~cf.CellMeasure.nc_hdf5_chunksizes + ~cf.CellMeasure.nc_set_hdf5_chunksizes CFA --- diff --git a/docs/source/class/cf.Count.rst b/docs/source/class/cf.Count.rst index f67d610506..c43ad10a44 100644 --- a/docs/source/class/cf.Count.rst +++ b/docs/source/class/cf.Count.rst @@ -410,7 +410,10 @@ NetCDF ~cf.Count.nc_get_sample_dimension ~cf.Count.nc_has_sample_dimension ~cf.Count.nc_set_sample_dimension - + ~cf.Count.nc_clear_hdf5_chunksizes + ~cf.Count.nc_hdf5_chunksizes + ~cf.Count.nc_set_hdf5_chunksizes + CFA --- diff --git a/docs/source/class/cf.DimensionCoordinate.rst b/docs/source/class/cf.DimensionCoordinate.rst index 267ca8934e..596217e697 100644 --- a/docs/source/class/cf.DimensionCoordinate.rst +++ b/docs/source/class/cf.DimensionCoordinate.rst @@ -505,7 +505,10 @@ NetCDF ~cf.DimensionCoordinate.nc_del_variable ~cf.DimensionCoordinate.nc_get_variable ~cf.DimensionCoordinate.nc_has_variable - ~cf.DimensionCoordinate.nc_set_variable + ~cf.DimensionCoordinate.nc_set_variable + ~cf.DimensionCoordinate.nc_clear_hdf5_chunksizes + ~cf.DimensionCoordinate.nc_hdf5_chunksizes + ~cf.DimensionCoordinate.nc_set_hdf5_chunksizes Groups ^^^^^^ diff --git a/docs/source/class/cf.DomainAncillary.rst b/docs/source/class/cf.DomainAncillary.rst index 7db4a52c75..3656a91fec 100644 --- a/docs/source/class/cf.DomainAncillary.rst +++ b/docs/source/class/cf.DomainAncillary.rst @@ -462,6 +462,9 @@ NetCDF ~cf.DomainAncillary.nc_get_variable ~cf.DomainAncillary.nc_has_variable ~cf.DomainAncillary.nc_set_variable + ~cf.DomainAncillary.nc_clear_hdf5_chunksizes + ~cf.DomainAncillary.nc_hdf5_chunksizes + ~cf.DomainAncillary.nc_set_hdf5_chunksizes CFA --- diff --git a/docs/source/class/cf.Field.rst b/docs/source/class/cf.Field.rst index 8bbe0f5743..0167e18cbe 100644 --- a/docs/source/class/cf.Field.rst +++ b/docs/source/class/cf.Field.rst @@ -415,6 +415,9 @@ NetCDF ~cf.Field.nc_set_global_attribute ~cf.Field.nc_set_global_attributes ~cf.Field.ncdimensions + ~cf.Field.nc_clear_hdf5_chunksizes + ~cf.Field.nc_hdf5_chunksizes + ~cf.Field.nc_set_hdf5_chunksizes Groups ^^^^^^ diff --git a/docs/source/class/cf.FieldAncillary.rst b/docs/source/class/cf.FieldAncillary.rst index 6187fa4716..2531ed7b0f 100644 --- a/docs/source/class/cf.FieldAncillary.rst +++ b/docs/source/class/cf.FieldAncillary.rst @@ -407,6 +407,9 @@ NetCDF ~cf.FieldAncillary.nc_get_variable ~cf.FieldAncillary.nc_has_variable ~cf.FieldAncillary.nc_set_variable + ~cf.FieldAncillary.nc_clear_hdf5_chunksizes + ~cf.FieldAncillary.nc_hdf5_chunksizes + ~cf.FieldAncillary.nc_set_hdf5_chunksizes CFA --- diff --git a/docs/source/class/cf.GatheredArray.rst b/docs/source/class/cf.GatheredArray.rst index f6693a2a33..8f2d3997b7 100644 --- a/docs/source/class/cf.GatheredArray.rst +++ b/docs/source/class/cf.GatheredArray.rst @@ -20,11 +20,12 @@ cf.GatheredArray ~cf.GatheredArray.compressed_dimensions ~cf.GatheredArray.conformed_data ~cf.GatheredArray.copy + ~cf.GatheredArray.get_attributes ~cf.GatheredArray.get_calendar ~cf.GatheredArray.get_compressed_axes ~cf.GatheredArray.get_compressed_dimension ~cf.GatheredArray.get_compression_type - ~cf.GatheredArray.get_filename' + ~cf.GatheredArray.get_filename ~cf.GatheredArray.get_filenames ~cf.GatheredArray.get_list ~cf.GatheredArray.get_Subarray diff --git a/docs/source/class/cf.H5netcdfArray.rst b/docs/source/class/cf.H5netcdfArray.rst index 217d0163cd..6b51c6bc5a 100644 --- a/docs/source/class/cf.H5netcdfArray.rst +++ b/docs/source/class/cf.H5netcdfArray.rst @@ -23,6 +23,7 @@ 
Inspection ~cf.H5netcdfArray.get_compression_type ~cf.H5netcdfArray.get_subspace ~cf.H5netcdfArray.get_attributes + ~cf.H5netcdfArray.index .. rubric:: Attributes @@ -37,6 +38,7 @@ Inspection ~cf.H5netcdfArray.ndim ~cf.H5netcdfArray.shape ~cf.H5netcdfArray.size + ~cf.H5netcdfArray.original_shape Units ----- @@ -50,6 +52,7 @@ Units ~cf.H5netcdfArray.get_calendar ~cf.H5netcdfArray.get_units + ~cf.H5netcdfArray.Units File ---- @@ -73,6 +76,9 @@ File ~cf.H5netcdfArray.get_mask ~cf.H5netcdfArray.get_unpack ~cf.H5netcdfArray.get_storage_options + ~cf.H5netcdfArray.add_file_location + ~cf.H5netcdfArray.del_file_location + ~cf.H5netcdfArray.file_locations ~cf.H5netcdfArray._lock Miscellaneous @@ -94,10 +100,7 @@ Active storage :toctree: ../method/ :template: method.rst - ~cf.H5netcdfArray.actify - ~cf.H5netcdfArray.get_active_url - ~cf.H5netcdfArray.get_active_method - ~cf.H5netcdfArray.get_active_axis + ~cf.H5netcdfArray.active_storage Special ------- @@ -135,4 +138,5 @@ Deprecated :toctree: ../method/ :template: method.rst + ~cf.H5netcdfArray.filename ~cf.H5netcdfArray.get_missing_values diff --git a/docs/source/class/cf.Index.rst b/docs/source/class/cf.Index.rst index 8fa4ec05b6..0ea02443f9 100644 --- a/docs/source/class/cf.Index.rst +++ b/docs/source/class/cf.Index.rst @@ -411,6 +411,9 @@ NetCDF ~cf.Index.nc_get_sample_dimension ~cf.Index.nc_has_sample_dimension ~cf.Index.nc_set_sample_dimension + ~cf.Index.nc_clear_hdf5_chunksizes + ~cf.Index.nc_hdf5_chunksizes + ~cf.Index.nc_set_hdf5_chunksizes CFA --- diff --git a/docs/source/class/cf.List.rst b/docs/source/class/cf.List.rst index fd033f946b..470a6085de 100644 --- a/docs/source/class/cf.List.rst +++ b/docs/source/class/cf.List.rst @@ -403,6 +403,9 @@ NetCDF ~cf.List.nc_get_variable ~cf.List.nc_has_variable ~cf.List.nc_set_variable + ~cf.List.nc_clear_hdf5_chunksizes + ~cf.List.nc_hdf5_chunksizes + ~cf.List.nc_set_hdf5_chunksizes CFA --- diff --git a/docs/source/class/cf.NetCDF4Array.rst b/docs/source/class/cf.NetCDF4Array.rst index ef1da7a8cb..f3e668ec92 100644 --- a/docs/source/class/cf.NetCDF4Array.rst +++ b/docs/source/class/cf.NetCDF4Array.rst @@ -23,6 +23,7 @@ Inspection ~cf.NetCDF4Array.get_compression_type ~cf.NetCDF4Array.get_subspace ~cf.NetCDF4Array.get_attributes + ~cf.NetCDF4Array.index .. 
rubric:: Attributes @@ -37,6 +38,7 @@ Inspection ~cf.NetCDF4Array.ndim ~cf.NetCDF4Array.shape ~cf.NetCDF4Array.size + ~cf.NetCDF4Array.original_shape Units ----- @@ -50,7 +52,10 @@ Units ~cf.NetCDF4Array.get_calendar ~cf.NetCDF4Array.get_units - + ~cf.NetCDF4Array.Units + + + File ---- @@ -73,6 +78,9 @@ File ~cf.NetCDF4Array.get_mask ~cf.NetCDF4Array.get_unpack ~cf.NetCDF4Array.get_storage_options + ~cf.NetCDF4Array.add_file_location + ~cf.NetCDF4Array.del_file_location + ~cf.NetCDF4Array.file_locations ~cf.NetCDF4Array._lock Miscellaneous @@ -93,11 +101,8 @@ Active storage :nosignatures: :toctree: ../method/ :template: method.rst - - ~cf.NetCDF4Array.actify - ~cf.NetCDF4Array.get_active_url - ~cf.NetCDF4Array.get_active_method - ~cf.NetCDF4Array.get_active_axis + + ~cf.NetCDF4Array.active_storage Special ------- @@ -134,5 +139,6 @@ Deprecated :nosignatures: :toctree: ../method/ :template: method.rst - + + ~cf.NetCDF4Array.filename ~cf.NetCDF4Array.get_missing_values diff --git a/docs/source/class/cf.RaggedContiguousArray.rst b/docs/source/class/cf.RaggedContiguousArray.rst index e1f6fd8dc1..1543586451 100644 --- a/docs/source/class/cf.RaggedContiguousArray.rst +++ b/docs/source/class/cf.RaggedContiguousArray.rst @@ -21,12 +21,13 @@ cf.RaggedContiguousArray ~cf.RaggedContiguousArray.compressed_dimensions ~cf.RaggedContiguousArray.conformed_data ~cf.RaggedContiguousArray.copy + ~cf.RaggedContiguousArray.get_attributes ~cf.RaggedContiguousArray.get_calendar ~cf.RaggedContiguousArray.get_compressed_axes ~cf.RaggedContiguousArray.get_compressed_dimension ~cf.RaggedContiguousArray.get_compression_type ~cf.RaggedContiguousArray.get_count - ~cf.RaggedContiguousArray.get_filename' + ~cf.RaggedContiguousArray.get_filename ~cf.RaggedContiguousArray.get_filenames ~cf.RaggedContiguousArray.get_index ~cf.RaggedContiguousArray.get_Subarray diff --git a/docs/source/class/cf.RaggedIndexedArray.rst b/docs/source/class/cf.RaggedIndexedArray.rst index abc4edc74a..2b0fb76075 100644 --- a/docs/source/class/cf.RaggedIndexedArray.rst +++ b/docs/source/class/cf.RaggedIndexedArray.rst @@ -20,12 +20,13 @@ cf.RaggedIndexedArray ~cf.RaggedIndexedArray.compressed_dimensions ~cf.RaggedIndexedArray.conformed_data ~cf.RaggedIndexedArray.copy + ~cf.RaggedIndexedArray.get_attributes ~cf.RaggedIndexedArray.get_calendar ~cf.RaggedIndexedArray.get_compressed_axes ~cf.RaggedIndexedArray.get_compressed_dimension ~cf.RaggedIndexedArray.get_compression_type ~cf.RaggedIndexedArray.get_count - ~cf.RaggedIndexedArray.get_filename' + ~cf.RaggedIndexedArray.get_filename ~cf.RaggedIndexedArray.get_filenames ~cf.RaggedIndexedArray.get_index ~cf.RaggedIndexedArray.get_Subarray diff --git a/docs/source/class/cf.RaggedIndexedContiguousArray.rst b/docs/source/class/cf.RaggedIndexedContiguousArray.rst index af27e95277..f8041c5a31 100644 --- a/docs/source/class/cf.RaggedIndexedContiguousArray.rst +++ b/docs/source/class/cf.RaggedIndexedContiguousArray.rst @@ -20,12 +20,13 @@ cf.RaggedIndexedContiguousArray ~cf.RaggedIndexedContiguousArray.compressed_dimensions ~cf.RaggedIndexedContiguousArray.conformed_data ~cf.RaggedIndexedContiguousArray.copy + ~cf.RaggedIndexedContiguousArray.get_attributes ~cf.RaggedIndexedContiguousArray.get_calendar ~cf.RaggedIndexedContiguousArray.get_compressed_axes ~cf.RaggedIndexedContiguousArray.get_compressed_dimension ~cf.RaggedIndexedContiguousArray.get_compression_type ~cf.RaggedIndexedContiguousArray.get_count - ~cf.RaggedIndexedContiguousArray.get_filename' + ~cf.RaggedIndexedContiguousArray.get_filename 
~cf.RaggedIndexedContiguousArray.get_filenames ~cf.RaggedIndexedContiguousArray.get_index ~cf.RaggedIndexedContiguousArray.get_Subarray From 5d03edfdcb72f95ca0b09f8f2896124388e118c4 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 23 Oct 2024 10:12:21 +0100 Subject: [PATCH 129/134] Remove dead code --- cf/data/array/mixin/activestoragemixin.py | 86 ----------------------- 1 file changed, 86 deletions(-) diff --git a/cf/data/array/mixin/activestoragemixin.py b/cf/data/array/mixin/activestoragemixin.py index 75e0c52be2..9843d09858 100644 --- a/cf/data/array/mixin/activestoragemixin.py +++ b/cf/data/array/mixin/activestoragemixin.py @@ -29,89 +29,3 @@ def active_storage(self): return False return True - - -# return self.get_filename(None) is not None - -# @property -# def actified(self): -# """Whether active storage operations are possible. -# -# .. versionadded:: NEXTVERSION -# -# .. seealso:: `actify`, `get_active_storage_url` -# -# :Returns: -# -# `bool` -# `True` if active stoage operations are possible, -# otherwise `False`. -# -# """ -# return self.get_active_storage_url() is not None -# -# def actify(self, active_storage_url): -# """Return a new actified `{{class}}` instance. -# -# The new instance is a deep copy of the original, with the -# additional setting of the active storage URL. -# -# .. versionadded:: NEXTVERSION -# -# .. seealso:: `actified`, `get_active_storage_url` -# -# :Parameters: -# -# active_storage_url: `str` or `None`, optional -# The URL of the active storage server. If `None` then -# `actified` will be `False` -# -# :Returns: -# -# `{{class}}` -# The new `{{class}}`` instance that ues an active -# storage operation. -# -# """ -# # Don't actify when the data are packed. Note: There may come -# # a time when activestorage.Active can cope with packed data, -# # in which case we can remove this test. -# attributes = self.get_attributes({}) -# if "add_offset" in attributes or "scale_factor" in attributes: -# raise AttributeError( -# "Can't actify {self.__class__.__name__} when " -# "the data have been numerically packed" -# ) -# -# if Active is None: -# raise AttributeError( -# "Can't actify {self.__class__.__name__} when " -# "activestorage.Active is not available" -# ) -# -# a = self.copy() -# a._custom["active_storage_url"] = active_storage_url -# return a -# -# def get_active_storage_url(self): -# """Return the active storage reduction URL. -# -# An active storage reduction URL is set with `actify`. -# -# .. versionadded:: NEXTVERSION -# -# .. seealso:: `actified`, `actify` -# -# :Returns: -# -# `str` or `None` -# The active storage URL, or `None` if no active storage -# reduction is possible. 
-# -# **Examples** -# -# >>> a.get_active_storage() -# 'https://183.175.143.286:8080' -# -# """ -# return self._custom.get("active_storage_url") From e90430a773c529e4301de1f4727f426cc1319abe Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 23 Oct 2024 10:15:25 +0100 Subject: [PATCH 130/134] \emptyset --- cf/data/collapse/dask_collapse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cf/data/collapse/dask_collapse.py b/cf/data/collapse/dask_collapse.py index 8610d43a32..b58f9daf00 100644 --- a/cf/data/collapse/dask_collapse.py +++ b/cf/data/collapse/dask_collapse.py @@ -1265,7 +1265,7 @@ def cf_var_chunk( _{X_{i}}^{2}+N_{X_{i}}\mu _{X_{i}}^{2}\right]}-\left[\sum _{i}{N_{X_{i}}}\right]\mu _{X}^{2}\right)}} - where X_{i}\cap X_{j}=\varnothing , \forall i Date: Wed, 23 Oct 2024 17:02:09 +0100 Subject: [PATCH 131/134] asanyarray0 --- cf/data/data.py | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/cf/data/data.py b/cf/data/data.py index b46da64674..1b0ee34e98 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -387,7 +387,7 @@ def __init__( self._set_dask(array, copy=copy, clear=_NONE) else: self._set_dask( - array, copy=copy, clear=_NONE, asanyarray=None + array, copy=copy, clear=_NONE, __asanyarray__=None ) else: self._del_dask(None, clear=_NONE) @@ -515,7 +515,7 @@ def __init__( # REVIEW: getitem: `__init__`: set 'asanyarray' # Store the dask array - self._set_dask(dx, clear=_NONE, asanyarray=None) + self._set_dask(dx, clear=_NONE, __asanyarray__=None) # Override the data type if dtype is not None: @@ -966,7 +966,7 @@ def __getitem__(self, indices): # so we set asanyarray=True to ensure that, if required, # they are converted at compute time. # ------------------------------------------------------------ - new._set_dask(dx, asanyarray=True) + new._set_dask(dx, __asanyarray__=True) # ------------------------------------------------------------ # Get the axis identifiers for the subspace @@ -1184,19 +1184,19 @@ def __setitem__(self, indices, value): # REVIEW: getitem: `__asanyarray__`: new property `__asanyarray__` @property def __asanyarray__(self): - """Whether the chunks need conversion to a `numpy` array. + """Whether the chunks need conversion to `numpy` arrays. .. versionadded:: NEXTVERSION :Returns: `bool` - If True then at compute time add a final operation - (not in-place) to the Dask graph that converts a - chunk's array object to a `numpy` array if the array - object has an `__asanyarray__` attribute that is - `True`, or else does nothing. If False then do not add - this operation. + If True then at compute time add to the Dask graph + (not in-place) a `cf_asanyarray` operation, which + converts a chunk's array object to a `numpy` array, + but only if the array object itself has an + `__asanyarray__` attribute that is `True`. If False + then this operation is not added to the Dask graph. """ return self._custom.get("__asanyarray__", True) @@ -1415,7 +1415,7 @@ def _clear_after_dask_update(self, clear=_ALL): self._cfa_del_write() # REVIEW: getitem: `_set_dask`: new keyword 'asanyarray' - def _set_dask(self, dx, copy=False, clear=_ALL, asanyarray=False): + def _set_dask(self, dx, copy=False, clear=_ALL, __asanyarray__=False): """Set the dask array. .. versionadded:: 3.14.0 @@ -1438,9 +1438,9 @@ def _set_dask(self, dx, copy=False, clear=_ALL, asanyarray=False): results in all components being removed. See `_clear_after_dask_update` for details. 
- asanyarray: `bool` or `None`, optional - If `None` then do nothing. Otherwise set - `__asanyarray__` to the Boolean value of *asanyarray*. + __asanyarray__: `bool` or `None`, optional + If `None` then do nothing. Otherwise set the + `__asanyarray__` attribute to *__asanyarray__*. .. versionadded:: NEXTVERSION @@ -1474,8 +1474,8 @@ def _set_dask(self, dx, copy=False, clear=_ALL, asanyarray=False): custom = self._custom custom["dask"] = dx # REVIEW: getitem: `_set_dask`: set '__asanyarray__' - if asanyarray is not None: - custom["__asanyarray__"] = bool(asanyarray) + if __asanyarray__ is not None: + custom["__asanyarray__"] = bool(__asanyarray__) self._clear_after_dask_update(clear) @@ -3245,7 +3245,7 @@ def rechunk( dx = d.to_dask_array(asanyarray=False) dx = dx.rechunk(chunks, threshold, block_size_limit, balance) - d._set_dask(dx, clear=_ALL ^ _ARRAY ^ _CACHE, asanyarray=True) + d._set_dask(dx, clear=_ALL ^ _ARRAY ^ _CACHE, __asanyarray__=True) return d @@ -4258,7 +4258,7 @@ def concatenate( # REVIEW: getitem: `concatenate`: set 'asanyarray' # Set the new dask array - data0._set_dask(dx, clear=_ALL ^ cfa, asanyarray=asanyarray) + data0._set_dask(dx, clear=_ALL ^ cfa, __asanyarray__=asanyarray) # Set appropriate cached elements cached_elements = {} @@ -6858,7 +6858,7 @@ def add_file_location(self, location): if updated: dx = self.to_dask_array(asanyarray=False) dx = da.Array(dsk, dx.name, dx.chunks, dx.dtype, dx._meta) - self._set_dask(dx, clear=_NONE, asanyarray=None) + self._set_dask(dx, clear=_NONE, __asanyarray__=None) return location @@ -10231,7 +10231,7 @@ def del_file_location(self, location): if updated: dx = self.to_dask_array(asanyarray=False) dx = da.Array(dsk, dx.name, dx.chunks, dx.dtype, dx._meta) - self._set_dask(dx, clear=_NONE, asanyarray=None) + self._set_dask(dx, clear=_NONE, __asanyarray__=None) return location @@ -11693,7 +11693,7 @@ def cull_graph(self): dx = self.to_dask_array(asanyarray=False) dsk, _ = cull(dx.dask, dx.__dask_keys__()) dx = da.Array(dsk, name=dx.name, chunks=dx.chunks, dtype=dx.dtype) - self._set_dask(dx, clear=_NONE, asanyarray=None) + self._set_dask(dx, clear=_NONE, __asanyarray__=None) @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") @_inplace_enabled(default=False) From d75dcd1a7cb38c64a22ed0cc9737046d6b68bb75 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 25 Oct 2024 16:48:01 +0100 Subject: [PATCH 132/134] asanyarray changes --- cf/data/creation.py | 4 +- cf/data/data.py | 206 +++++++++++++--------------- cf/data/utils.py | 6 +- cf/docstring/docstring.py | 32 ++--- cf/mixin/propertiesdata.py | 6 +- cf/read_write/netcdf/netcdfwrite.py | 8 +- cf/test/test_Data.py | 4 +- 7 files changed, 122 insertions(+), 144 deletions(-) diff --git a/cf/data/creation.py b/cf/data/creation.py index 5989691f23..0d4067a373 100644 --- a/cf/data/creation.py +++ b/cf/data/creation.py @@ -60,13 +60,13 @@ def to_dask(array, chunks, **from_array_options): if is_dask_collection(array): return array - # REVIEW: getitem: `to_dask`: set 'asanyarray' + # REVIEW: getitem: `to_dask`: set '_asanyarray' if hasattr(array, "to_dask_array"): try: return array.to_dask_array(chunks=chunks) except TypeError: try: - return array.to_dask_array(asanyarray=False) + return array.to_dask_array(_asanyarray=False) except TypeError: return array.to_dask_array() diff --git a/cf/data/data.py b/cf/data/data.py index 1b0ee34e98..8ee04cb3b3 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -377,7 +377,7 @@ def __init__( if _use_array: # REVIEW: getitem: `__init__`: 
set 'asanyarray' try: - array = source.to_dask_array(asanyarray=False) + array = source.to_dask_array(_asanyarray=False) except (AttributeError, TypeError): try: array = source.to_dask_array() @@ -387,7 +387,7 @@ def __init__( self._set_dask(array, copy=copy, clear=_NONE) else: self._set_dask( - array, copy=copy, clear=_NONE, __asanyarray__=None + array, copy=copy, clear=_NONE, asanyarray=None ) else: self._del_dask(None, clear=_NONE) @@ -515,7 +515,7 @@ def __init__( # REVIEW: getitem: `__init__`: set 'asanyarray' # Store the dask array - self._set_dask(dx, clear=_NONE, __asanyarray__=None) + self._set_dask(dx, clear=_NONE, asanyarray=None) # Override the data type if dtype is not None: @@ -651,12 +651,12 @@ def __contains__(self, value): return False # 'cf_contains' has its own calls to 'cf_asanyarray', so - # we can set 'asanyarray=False'. - value = value.to_dask_array(asanyarray=False) + # we can set '_asanyarray=False'. + value = value.to_dask_array(_asanyarray=False) # 'cf_contains' has its own calls to 'cf_asanyarray', so we - # can set 'asanyarray=False'. - dx = self.to_dask_array(asanyarray=False) + # can set '_asanyarray=False'. + dx = self.to_dask_array(_asanyarray=False) out_ind = tuple(range(dx.ndim)) dx_ind = out_ind @@ -791,8 +791,8 @@ def __len__(self): """ # REVIEW: getitem: `__len__`: set 'asanyarray' # The dask graph is never going to be computed, so we can set - # 'asanyarray=False'. - dx = self.to_dask_array(asanyarray=False) + # '_asanyarray=False'. + dx = self.to_dask_array(_asanyarray=False) if math.isnan(dx.size): logger.debug("Computing data len: Performance may be degraded") dx.compute_chunk_sizes() @@ -911,10 +911,10 @@ def __getitem__(self, indices): new = self.roll( axis=tuple(roll.keys()), shift=tuple(roll.values()) ) - dx = new.to_dask_array(asanyarray=False) + dx = new.to_dask_array(_asanyarray=False) else: new = self.copy() - dx = self.to_dask_array(asanyarray=False) + dx = self.to_dask_array(_asanyarray=False) # ------------------------------------------------------------ # Subspace the dask array @@ -962,11 +962,11 @@ def __getitem__(self, indices): # ------------------------------------------------------------ # Set the subspaced dask array # - # * A subpspaced chunk might not result in an array in memory, + # * A subspaced chunk might not result in an array in memory, # so we set asanyarray=True to ensure that, if required, # they are converted at compute time. # ------------------------------------------------------------ - new._set_dask(dx, __asanyarray__=True) + new._set_dask(dx, asanyarray=True) # ------------------------------------------------------------ # Get the axis identifiers for the subspace @@ -1184,19 +1184,15 @@ def __setitem__(self, indices, value): # REVIEW: getitem: `__asanyarray__`: new property `__asanyarray__` @property def __asanyarray__(self): - """Whether the chunks need conversion to `numpy` arrays. + """Whether or not chunks need conversion to `numpy` arrays. .. versionadded:: NEXTVERSION + ..seealso:: `to_dask_array`, `todict`, `_set_dask` + :Returns: `bool` - If True then at compute time add to the Dask graph - (not in-place) a `cf_asanyarray` operation, which - converts a chunk's array object to a `numpy` array, - but only if the array object itself has an - `__asanyarray__` attribute that is `True`. If False - then this operation is not added to the Dask graph. 
""" return self._custom.get("__asanyarray__", True) @@ -1415,7 +1411,7 @@ def _clear_after_dask_update(self, clear=_ALL): self._cfa_del_write() # REVIEW: getitem: `_set_dask`: new keyword 'asanyarray' - def _set_dask(self, dx, copy=False, clear=_ALL, __asanyarray__=False): + def _set_dask(self, dx, copy=False, clear=_ALL, asanyarray=False): """Set the dask array. .. versionadded:: 3.14.0 @@ -1438,9 +1434,9 @@ def _set_dask(self, dx, copy=False, clear=_ALL, __asanyarray__=False): results in all components being removed. See `_clear_after_dask_update` for details. - __asanyarray__: `bool` or `None`, optional + asanyarray: `None` or `bool`, optional If `None` then do nothing. Otherwise set the - `__asanyarray__` attribute to *__asanyarray__*. + `__asanyarray__` attribute to *asanyarray*. .. versionadded:: NEXTVERSION @@ -1474,8 +1470,8 @@ def _set_dask(self, dx, copy=False, clear=_ALL, __asanyarray__=False): custom = self._custom custom["dask"] = dx # REVIEW: getitem: `_set_dask`: set '__asanyarray__' - if __asanyarray__ is not None: - custom["__asanyarray__"] = bool(__asanyarray__) + if asanyarray is not None: + custom["__asanyarray__"] = bool(asanyarray) self._clear_after_dask_update(clear) @@ -2551,8 +2547,8 @@ def percentile( # REVIEW: getitem: `percentile`: set 'asanyarray' # 'cf_percentile' has its own call to 'cf_asanyarray', so we - # can set 'asanyarray=False'. - dx = d.to_dask_array(asanyarray=False) + # can set '_asanyarray=False'. + dx = d.to_dask_array(_asanyarray=False) dtype = dx.dtype shape = dx.shape @@ -3238,14 +3234,13 @@ def rechunk( d = _inplace_enabled_define_and_cleanup(self) # REVIEW: getitem: `rechunk`: set 'asanyarray' + dx = d.to_dask_array(_asanyarray=False) + dx = dx.rechunk(chunks, threshold, block_size_limit, balance) # Dask rechunking is essentially a wrapper for __getitem__ # calls on the chunks, which means that we can use the same - # 'asanyarray' and 'clear' keywords to `_set_dask` as are used - # in `__gettem__`. - - dx = d.to_dask_array(asanyarray=False) - dx = dx.rechunk(chunks, threshold, block_size_limit, balance) - d._set_dask(dx, clear=_ALL ^ _ARRAY ^ _CACHE, __asanyarray__=True) + # 'asanyarray' and 'clear' keyword values to `_set_dask` as + # are used in `__gettem__`. + d._set_dask(dx, clear=_ALL ^ _ARRAY ^ _CACHE, asanyarray=True) return d @@ -3298,8 +3293,8 @@ def _asdatetime(self, inplace=False): if not d._isdatetime(): # REVIEW: getitem: `_asdatetime`: set 'asanyarray' # 'cf_rt2dt' has its own call to 'cf_asanyarray', so we - # can set 'asanyarray=False'. - dx = d.to_dask_array(asanyarray=False) + # can set '_asanyarray=False'. + dx = d.to_dask_array(_asanyarray=False) dx = dx.map_blocks(cf_rt2dt, units=units, dtype=object) d._set_dask(dx) @@ -3356,8 +3351,8 @@ def _asreftime(self, inplace=False): if d._isdatetime(): # REVIEW: getitem: `_asreftime`: set 'asanyarray' # 'cf_dt2rt' has its own call to 'cf_asanyarray', so we - # can set 'asanyarray=False'. - dx = d.to_dask_array(asanyarray=False) + # can set '_asanyarray=False'. + dx = d.to_dask_array(_asanyarray=False) dx = dx.map_blocks(cf_dt2rt, units=units, dtype=float) d._set_dask(dx) @@ -3969,8 +3964,8 @@ def _regrid( # REVIEW: getitem: `_regrid`: set 'asanyarray' # 'regrid' has its own calls to 'cf_asanyarray', so we can set - # 'asanyarray=False'. - dx = self.to_dask_array(asanyarray=False) + # '_asanyarray=False'. + dx = self.to_dask_array(_asanyarray=False) # Rechunk so that each chunk contains data in the form # expected by the regrid operator, i.e. 
the regrid axes all @@ -4214,10 +4209,10 @@ def concatenate( # REVIEW: getitem: `concatenate`: set 'asanyarray' # Get data as dask arrays and apply concatenation - # operation. We can set 'asanyarray=False' because at compute + # operation. We can set '_asanyarray=False' because at compute # time the concatenation operation does not need to access the # actual data. - dxs = [d.to_dask_array(asanyarray=False) for d in processed_data] + dxs = [d.to_dask_array(_asanyarray=False) for d in processed_data] dx = da.concatenate(dxs, axis=axis) # Set the CFA write status @@ -4258,7 +4253,7 @@ def concatenate( # REVIEW: getitem: `concatenate`: set 'asanyarray' # Set the new dask array - data0._set_dask(dx, clear=_ALL ^ cfa, __asanyarray__=asanyarray) + data0._set_dask(dx, clear=_ALL ^ cfa, asanyarray=asanyarray) # Set appropriate cached elements cached_elements = {} @@ -4904,8 +4899,8 @@ def chunks(self): """ # REVIEW: getitem: `chunks`: set 'asanyarray' # The dask graph is never going to be computed, so we can set - # 'asanyarray=False'. - return self.to_dask_array(asanyarray=False).chunks + # '_asanyarray=False'. + return self.to_dask_array(_asanyarray=False).chunks # ---------------------------------------------------------------- # Attributes @@ -4962,8 +4957,8 @@ def Units(self, value): # REVIEW: getitem: `Units`: set 'asanyarray' # 'cf_units' has its own call to 'cf_asanyarray', so we can - # set 'asanyarray=False'. - dx = self.to_dask_array(asanyarray=False) + # set '_asanyarray=False'. + dx = self.to_dask_array(_asanyarray=False) dx = dx.map_blocks(cf_func, dtype=dtype) # Setting equivalent units doesn't affect the CFA write @@ -5032,8 +5027,8 @@ def dtype(self): """ # REVIEW: getitem: `dtype`: set 'asanyarray' # The dask graph is never going to be computed, so we can set - # 'asanyarray=False'. - dx = self.to_dask_array(asanyarray=False) + # '_asanyarray=False'. + dx = self.to_dask_array(_asanyarray=False) return dx.dtype @dtype.setter @@ -5148,8 +5143,8 @@ def is_masked(self): """ # REVIEW: getitem: `is_masked`: set 'asanyarray' # 'cf_is_masked' has its own call to 'cf_asanyarray', so we - # can set 'asanyarray=False'. - dx = self.to_dask_array(asanyarray=False) + # can set '_asanyarray=False'. + dx = self.to_dask_array(_asanyarray=False) out_ind = tuple(range(dx.ndim)) dx_ind = out_ind @@ -5194,8 +5189,8 @@ def nbytes(self): """ # REVIEW: getitem: `nbytes`: set 'asanyarray' # The dask graph is never going to be computed, so we can set - # 'asanyarray=False'. - dx = self.to_dask_array(asanyarray=False) + # '_asanyarray=False'. + dx = self.to_dask_array(_asanyarray=False) if math.isnan(dx.size): logger.debug("Computing data nbytes: Performance may be degraded") dx.compute_chunk_sizes() @@ -5231,8 +5226,8 @@ def ndim(self): """ # REVIEW: getitem: `ndim`: set 'asanyarray' # The dask graph is never going to be computed, so we can set - # 'asanyarray=False'. - dx = self.to_dask_array(asanyarray=False) + # '_asanyarray=False'. + dx = self.to_dask_array(_asanyarray=False) return dx.ndim @property @@ -5256,8 +5251,8 @@ def npartitions(self): """ # REVIEW: getitem: `npartitions`: set 'asanyarray' # The dask graph is never going to be computed, so we can set - # 'asanyarray=False'. - return self.to_dask_array(asanyarray=False).npartitions + # '_asanyarray=False'. 
+ return self.to_dask_array(_asanyarray=False).npartitions @property def numblocks(self): @@ -5280,8 +5275,8 @@ def numblocks(self): """ # REVIEW: getitem: `numblocks` set 'asanyarray' # The dask graph is never going to be computed, so we can set - # 'asanyarray=False'. - return self.to_dask_array(asanyarray=False).numblocks + # '_asanyarray=False'. + return self.to_dask_array(_asanyarray=False).numblocks @property def shape(self): @@ -5313,8 +5308,8 @@ def shape(self): """ # REVIEW: getitem: `shape`: set 'asanyarray' # The dask graph is never going to be computed, so we can set - # 'asanyarray=False'. - dx = self.to_dask_array(asanyarray=False) + # '_asanyarray=False'. + dx = self.to_dask_array(_asanyarray=False) if math.isnan(dx.size): logger.debug("Computing data shape: Performance may be degraded") dx.compute_chunk_sizes() @@ -5355,8 +5350,8 @@ def size(self): """ # REVIEW: getitem: `size` set 'asanyarray' # The dask graph is never going to be computed, so we can set - # 'asanyarray=False'. - dx = self.to_dask_array(asanyarray=False) + # '_asanyarray=False'. + dx = self.to_dask_array(_asanyarray=False) size = dx.size if math.isnan(size): logger.debug("Computing data size: Performance may be degraded") @@ -6554,8 +6549,8 @@ def convert_reference_time( # REVIEW: getitem: `convert_reference_time`: set 'asanyarray' # 'cf_rt2dt' its own call to 'cf_asanyarray', so we can set - # 'asanyarray=False'. - dx = d.to_dask_array(asanyarray=False) + # '_asanyarray=False'. + dx = d.to_dask_array(_asanyarray=False) # Convert to the correct date-time objects dx = dx.map_blocks(cf_rt2dt, units=units0, dtype=object) @@ -6635,9 +6630,9 @@ def get_deterministic_name(self): # REVIEW: getitem: `get_deterministic_name`: set 'asanyarray' # The dask graph is never going to be computed, so we can set - # 'asanyarray=False'. + # '_asanyarray=False'. return tokenize( - self.to_dask_array(asanyarray=None).name, + self.to_dask_array(_asanyarray=False).name, units.formatted(definition=True, names=True), units._canonical_calendar, ) @@ -6705,8 +6700,8 @@ def get_filenames(self): # REVIEW: getitem: `get_filenames`: set 'asanyarray' # The dask graph is never going to be computed, so we can set - # 'asanyarray=False'. - for a in self.todict(asanyarray=False).values(): + # '_asanyarray=False'. + for a in self.todict(_asanyarray=False).values(): try: out.update(a.get_filenames()) except AttributeError: @@ -6842,8 +6837,8 @@ def add_file_location(self, location): # REVIEW: getitem: `add_file_location`: set 'asanyarray' # The dask graph is never going to be computed, so we can set - # 'asanyarray=False'. - dsk = self.todict(asanyarray=False) + # '_asanyarray=False'. + dsk = self.todict(_asanyarray=False) for key, a in dsk.items(): try: dsk[key] = a.add_file_location(location) @@ -6856,9 +6851,9 @@ def add_file_location(self, location): updated = True if updated: - dx = self.to_dask_array(asanyarray=False) + dx = self.to_dask_array(_asanyarray=False) dx = da.Array(dsk, dx.name, dx.chunks, dx.dtype, dx._meta) - self._set_dask(dx, clear=_NONE, __asanyarray__=None) + self._set_dask(dx, clear=_NONE, asanyarray=None) return location @@ -8286,8 +8281,8 @@ def unique(self, split_every=None): # REVIEW: getitem: `unique`: set 'asanyarray' # The applicable chunk function will have its own call to - # 'cf_asanyarray', so we can set 'asanyarray=False'. - dx = d.to_dask_array(asanyarray=False) + # 'cf_asanyarray', so we can set '_asanyarray=False'. 
+ dx = d.to_dask_array(_asanyarray=False) dx = Collapse().unique(dx, split_every=split_every) d._set_dask(dx) @@ -9035,8 +9030,8 @@ def harden_mask(self): """ # REVIEW: getitem: `hardmask`: set 'asanyarray' # 'cf_harden_mask' has its own call to 'cf_asanyarray', so we - # can set 'asanyarray=False'. - dx = self.to_dask_array(asanyarray=False) + # can set '_asanyarray=False'. + dx = self.to_dask_array(_asanyarray=False) dx = dx.map_blocks(cf_harden_mask, dtype=self.dtype) self._set_dask(dx, clear=_NONE) self.hardmask = True @@ -9158,8 +9153,8 @@ def soften_mask(self): """ # REVIEW: getitem: `soften_mask`: set 'asanyarray' # 'cf_soften_mask' has its own call to 'cf_asanyarray', so we - # can set 'asanyarray=False'. - dx = self.to_dask_array(asanyarray=False) + # can set '_asanyarray=False'. + dx = self.to_dask_array(_asanyarray=False) dx = dx.map_blocks(cf_soften_mask, dtype=self.dtype) self._set_dask(dx, clear=_NONE) self.hardmask = False @@ -9190,8 +9185,8 @@ def file_locations(self): # REVIEW: getitem: `file_locations`: set 'asanyarray' # The dask graph is never going to be computed, so we can set - # 'asanyarray=False'. - for key, a in self.todict(asanyarray=False).items(): + # '_asanyarray=False'. + for key, a in self.todict(_asanyarray=False).items(): try: out.update(a.file_locations()) except AttributeError: @@ -9251,8 +9246,8 @@ def filled(self, fill_value=None, inplace=False): # REVIEW: getitem: `filled`: set 'asanyarray' # 'cf_filled' has its own call to 'cf_asanyarray', so we can - # set 'asanyarray=False'. - dx = d.to_dask_array(asanyarray=False) + # set '_asanyarray=False'. + dx = d.to_dask_array(_asanyarray=False) dx = dx.map_blocks(cf_filled, fill_value=fill_value, dtype=d.dtype) d._set_dask(dx) @@ -9882,7 +9877,7 @@ def override_calendar(self, calendar, inplace=False, i=False): return d # REVIEW: getitem: `to_dask_array`: new keyword 'asanyarray' - def to_dask_array(self, apply_mask_hardness=False, asanyarray=None): + def to_dask_array(self, apply_mask_hardness=False, _asanyarray=True): """Convert the data to a `dask` array. .. warning:: By default, the mask hardness of the returned @@ -9906,7 +9901,7 @@ def to_dask_array(self, apply_mask_hardness=False, asanyarray=None): If True then force the mask hardness of the returned array to be that given by the `hardmask` attribute. - {{asanyarray: `bool` or `None`, optional}} + {{_asanyarray: `bool`, optional}} .. versionadded:: NEXTVERSION @@ -9946,13 +9941,9 @@ def to_dask_array(self, apply_mask_hardness=False, asanyarray=None): # Note: The mask hardness functions have their own calls # to 'cf_asanyarray', so we don't need to worry about # setting another one. - else: - if asanyarray is None: - asanyarray = self.__asanyarray__ - - if asanyarray: - # Add a new cf_asanyarray layer to the output graph - dx = dx.map_blocks(cf_asanyarray, dtype=dx.dtype) + elif _asanyarray and self.__asanyarray__: + # Add a new cf_asanyarray layer to the output graph + dx = dx.map_blocks(cf_asanyarray, dtype=dx.dtype) return dx @@ -10215,8 +10206,8 @@ def del_file_location(self, location): # REVIEW: getitem: `del_file_location`: set 'asanyarray' # The dask graph is never going to be computed, so we can set - # 'asanyarray=False'. - dsk = self.todict(asanyarray=False) + # '_asanyarray=False'. 
+ dsk = self.todict(_asanyarray=False) for key, a in dsk.items(): try: dsk[key] = a.del_file_location(location) @@ -10229,9 +10220,9 @@ def del_file_location(self, location): updated = True if updated: - dx = self.to_dask_array(asanyarray=False) + dx = self.to_dask_array(_asanyarray=False) dx = da.Array(dsk, dx.name, dx.chunks, dx.dtype, dx._meta) - self._set_dask(dx, clear=_NONE, __asanyarray__=None) + self._set_dask(dx, clear=_NONE, asanyarray=None) return location @@ -11400,8 +11391,8 @@ def where( # # REVIEW: getitem: `where`: set 'asanyarray' # 'cf_where' has its own calls to 'cf_asanyarray', so we can - # set 'asanyarray=False'. - dx = d.to_dask_array(apply_mask_hardness=True, asanyarray=False) + # set '_asanyarray=False'. + dx = d.to_dask_array(apply_mask_hardness=True, _asanyarray=False) units = d.Units @@ -11418,8 +11409,8 @@ def where( condition = where_broadcastable(d, condition, "condition") # REVIEW: getitem: `where`: set 'asanyarray' # 'cf_where' has its own calls to 'cf_asanyarray', so we can - # set 'asanyarray=False'. - condition = condition.to_dask_array(asanyarray=False) + # set '_asanyarray=False'. + condition = condition.to_dask_array(_asanyarray=False) # If x or y is self then change it to None. This prevents an # unnecessary copy; and, at compute time, an unncessary numpy @@ -11690,10 +11681,10 @@ def cull_graph(self): """ # REVIEW: getitem: `cull_graph`: set 'asanyarray' - dx = self.to_dask_array(asanyarray=False) + dx = self.to_dask_array(_asanyarray=False) dsk, _ = cull(dx.dask, dx.__dask_keys__()) dx = da.Array(dsk, name=dx.name, chunks=dx.chunks, dtype=dx.dtype) - self._set_dask(dx, clear=_NONE, __asanyarray__=None) + self._set_dask(dx, clear=_NONE, asanyarray=None) @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") @_inplace_enabled(default=False) @@ -11971,7 +11962,7 @@ def tan(self, inplace=False, i=False): # REVIEW: getitem: `todict`: new keywords 'apply_mask_hardness', 'asanyarray' def todict( - self, optimize_graph=True, apply_mask_hardness=False, asanyarray=None + self, optimize_graph=True, apply_mask_hardness=False, _asanyarray=True ): """Return a dictionary of the dask graph key/value pairs. @@ -11993,14 +11984,7 @@ def todict( .. versionadded:: NEXTVERSION - asanyarray: `bool` or `None`, optional - If True then add a final operation to the Dask graph - that converts chunks to `numpy` arrays, but only if - chunk's array object has an `__asanyarray__` attribute - that is also `True`. If False then do not do this. If - `None`, the default, then the final operation is added - if the `Data` object's `__asanyarray__` attribute is - `True`. + {{_asanyarray: `bool`, optional}} .. versionadded:: NEXTVERSION @@ -12030,7 +12014,7 @@ def todict( """ dx = self.to_dask_array( - apply_mask_hardness=apply_mask_hardness, asanyarray=asanyarray + apply_mask_hardness=apply_mask_hardness, _asanyarray=_asanyarray ) if optimize_graph: diff --git a/cf/data/utils.py b/cf/data/utils.py index 454b811337..2c34757c22 100644 --- a/cf/data/utils.py +++ b/cf/data/utils.py @@ -882,10 +882,10 @@ def collapse( # REVIEW: getitem: `collapse`: set 'asanyarray' # The applicable chunk function will have its own call to - # 'cf_asanyarray', so we can set 'asanyarray=False'. Also, setting - # asanyarray=False will ensure that any active storage operations + # 'cf_asanyarray', so we can set '_asanyarray=False'. Also, setting + # _asanyarray=False will ensure that any active storage operations # are not compromised. 
- dx = d.to_dask_array(asanyarray=False) + dx = d.to_dask_array(_asanyarray=False) dx = func(dx, **kwargs) d._set_dask(dx) diff --git a/cf/docstring/docstring.py b/cf/docstring/docstring.py index bceb78dd4d..74bb32bca1 100644 --- a/cf/docstring/docstring.py +++ b/cf/docstring/docstring.py @@ -634,25 +634,19 @@ size.""", # REVIEW: getitem: `_docstring_substitution_definitions`: 'asanyarray' # asanyarray - "{{asanyarray: `bool` or `None`, optional}": """asanyarray: `bool` or `None`, optional - If True then add a final operation (not in-place) to - the graph of the returned Dask array that converts a - chunk's array object to a `numpy` array if the array - object has an `__asanyarray__` attribute that is - `True`, or else does nothing. If False then do not add - this operation. If `None`, the default, then the final - operation is added only if the `Data` object's - `__asanyarray__` attribute is `True`. - - By default or if *asanyarray* is True, the returned - Dask array will always provide the expected result - when computed, although if *asanyarray* is True then - the Dask graph may have an extra null operation layer - that is not requred. Setting *asanyarray* to False - should only be done in the case that the returned Dask - Array will get further operations which are guaranteed - to negate the need for the extra layer in the Dask - graph.""", + "{{_asanyarray: `bool`, optional}": """_asanyarray: `bool`, optional + If True (the default) and the `__asanyarray__` + attribute is also `True`, then add a `cf_asanyarray` + operation to the graph of the returned Dask array. If + False then this operation is not added. Setting + *_asanyarray* to False should only be done if it is + known that a) the returned Dask array is never going to + be computed; or b) it is not necessary to add a + `cf_asanyarray` operation in lieu of its functionality + being implemented by a new Dask graph layer that is + going to be created at a later stage. See + `cf.data.dask_utils.cf_asanyarray` for further + details.""", # _get_array index "{{index: `tuple` or `None`, optional}}": """index: `tuple` or `None`, optional Provide the indices that define the subspace. If `None` diff --git a/cf/mixin/propertiesdata.py b/cf/mixin/propertiesdata.py index a94cf62371..194bf8655b 100644 --- a/cf/mixin/propertiesdata.py +++ b/cf/mixin/propertiesdata.py @@ -4693,7 +4693,7 @@ def log(self, base=None, inplace=False, i=False): ) # REVIEW: getitem: `to_dask_array`: new keyword 'asanyarray' - def to_dask_array(self, apply_mask_hardness=False, asanyarray=None): + def to_dask_array(self, apply_mask_hardness=False, _asanyarray=None): """Convert the data to a `dask` array. .. warning:: By default, the mask hardness of the returned @@ -4721,7 +4721,7 @@ def to_dask_array(self, apply_mask_hardness=False, asanyarray=None): .. versionadded:: NEXTVERSION - {{asanyarray: `bool` or `None`, optional}} + {{_asanyarray: `bool`, optional}} .. 
versionadded:: NEXTVERSION @@ -4744,7 +4744,7 @@ def to_dask_array(self, apply_mask_hardness=False, asanyarray=None): raise ValueError("Can't get dask array when there is no data") return data.to_dask_array( - apply_mask_hardness=apply_mask_hardness, asanyarray=asanyarray + apply_mask_hardness=apply_mask_hardness, _asanyarray=_asanyarray ) @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") diff --git a/cf/read_write/netcdf/netcdfwrite.py b/cf/read_write/netcdf/netcdfwrite.py index a857ce89d8..256c4b1392 100644 --- a/cf/read_write/netcdf/netcdfwrite.py +++ b/cf/read_write/netcdf/netcdfwrite.py @@ -749,10 +749,10 @@ def _cfa_write_non_standard_terms( # more than one unique value then the fragment's value is # missing data. # - # REVIEW: getitem: `_cfa_write_non_standard_terms`: set 'asanyarray' + # REVIEW: getitem: `_cfa_write_non_standard_terms`: set '_asanyarray' # '_cfa_unique' has its own call to 'cf_asanyarray', so - # we can set 'asanyarray=False'. - dx = data.to_dask_array(asanyarray=False) + # we can set '_asanyarray=False'. + dx = data.to_dask_array(_asanyarray=False) dx_ind = tuple(range(dx.ndim)) out_ind = dx_ind dx = da.blockwise( @@ -968,7 +968,7 @@ def _cfa_aggregation_instructions(self, data, cfvar): dtype = np.dtype(np.int32) # REVIEW: getitem: `_cfa_aggregation_instructions`: set 'asanyarray' if ( - max(data.to_dask_array(asanyarray=False).chunksize) + max(data.to_dask_array(_asanyarray=False).chunksize) > np.iinfo(dtype).max ): dtype = np.dtype(np.int64) diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index c9d32b04c6..a62b49b76e 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -4527,11 +4527,11 @@ def test_Data_cull_graph(self): # `cf_asanyarray` layer d = cf.Data([1, 2, 3, 4, 5], chunks=3) d = d[:2] - self.assertEqual(len(dict(d.to_dask_array(asanyarray=False).dask)), 3) + self.assertEqual(len(dict(d.to_dask_array(_asanyarray=False).dask)), 3) # Check that there are fewer keys after culling d.cull_graph() - self.assertEqual(len(dict(d.to_dask_array(asanyarray=False).dask)), 2) + self.assertEqual(len(dict(d.to_dask_array(_asanyarray=False).dask)), 2) def test_Data_npartitions(self): """Test the `npartitions` Data property.""" From 9d8f8bb7605d9e6d579cecfdfdafa66a18ba464b Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 28 Oct 2024 13:51:21 +0000 Subject: [PATCH 133/134] dev --- cf/data/data.py | 25 +++++++++++++++++++++---- cf/docstring/docstring.py | 15 --------------- cf/mixin/propertiesdata.py | 32 ++------------------------------ 3 files changed, 23 insertions(+), 49 deletions(-) diff --git a/cf/data/data.py b/cf/data/data.py index 8ee04cb3b3..d310e37aec 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -9901,7 +9901,19 @@ def to_dask_array(self, apply_mask_hardness=False, _asanyarray=True): If True then force the mask hardness of the returned array to be that given by the `hardmask` attribute. - {{_asanyarray: `bool`, optional}} + _asanyarray: `bool`, optional + If True (the default) and the `__asanyarray__` + attribute is also `True`, then a `cf_asanyarray` + operation is added to the graph of the returned Dask + array. If False then this operation is not added. + + In general, setting *_asanyarray* to False should only + be done if it is known that a) the returned Dask array + is never going to be computed; or b) it is not + necessary to add a `cf_asanyarray` operation in lieu of + its functionality being implemented by a new Dask graph + layer that is going to be created at a later stage. 
See + `cf.data.dask_utils.cf_asanyarray` for further details. .. versionadded:: NEXTVERSION @@ -11968,11 +11980,11 @@ def todict( .. versionadded:: 3.15.0 - .. seealso:: `to_dask_array`, `tolist` + .. seealso:: `to_dask_array` :Parameters: - `optimize_graph`: `bool` + optimize_graph: `bool` If True, the default, then prior to being converted to a dictionary, the graph is optimised to remove unused chunks. Note that optimising the graph can add a @@ -11984,7 +11996,12 @@ def todict( .. versionadded:: NEXTVERSION - {{_asanyarray: `bool`, optional}} + _asanyarray: `bool`, optional + If True (the default) and the `__asanyarray__` + attribute is also `True`, then a `cf_asanyarray` + operation is added to the dictionary representation of + the Dask graph. If False then this operation is not + added. See `to_dask_array` for details. .. versionadded:: NEXTVERSION diff --git a/cf/docstring/docstring.py b/cf/docstring/docstring.py index 74bb32bca1..a11e0129b9 100644 --- a/cf/docstring/docstring.py +++ b/cf/docstring/docstring.py @@ -632,21 +632,6 @@ "{{to_size: `int`, optional}}": """to_size: `int`, optional Pad the axis after so that the new axis has the given size.""", - # REVIEW: getitem: `_docstring_substitution_definitions`: 'asanyarray' - # asanyarray - "{{_asanyarray: `bool`, optional}": """_asanyarray: `bool`, optional - If True (the default) and the `__asanyarray__` - attribute is also `True`, then add a `cf_asanyarray` - operation to the graph of the returned Dask array. If - False then this operation is not added. Setting - *_asanyarray* to False should only be done if it is - known that a) the returned Dask array is never going to - be computed; or b) it is not necessary to add a - `cf_asanyarray` operation in lieu of its functionality - being implemented by a new Dask graph layer that is - going to be created at a later stage. See - `cf.data.dask_utils.cf_asanyarray` for further - details.""", # _get_array index "{{index: `tuple` or `None`, optional}}": """index: `tuple` or `None`, optional Provide the indices that define the subspace. If `None` diff --git a/cf/mixin/propertiesdata.py b/cf/mixin/propertiesdata.py index 194bf8655b..a756b4aafe 100644 --- a/cf/mixin/propertiesdata.py +++ b/cf/mixin/propertiesdata.py @@ -4692,39 +4692,13 @@ def log(self, base=None, inplace=False, i=False): delete_props=True, ) - # REVIEW: getitem: `to_dask_array`: new keyword 'asanyarray' - def to_dask_array(self, apply_mask_hardness=False, _asanyarray=None): + def to_dask_array(self): """Convert the data to a `dask` array. - .. warning:: By default, the mask hardness of the returned - dask array might not be the same as that - specified by the `hardmask` attribute. - - This could cause problems if a subsequent - operation on the returned dask array involves the - un-masking of masked values (such as by indexed - assignment). - - To guarantee that the mask hardness of the - returned dask array is correct, set the - *apply_mask_hardness* parameter to True. - .. versionadded:: 3.14.0 .. seealso:: `cf.Data.to_dask_array` - :Parameters: - - apply_mask_hardness: `bool`, optional - If True then force the mask hardness of the returned - array to be that given by the `hardmask` attribute. - - .. versionadded:: NEXTVERSION - - {{_asanyarray: `bool`, optional}} - - .. 
versionadded:: NEXTVERSION - :Returns: `dask.array.Array` @@ -4743,9 +4717,7 @@ def to_dask_array(self, apply_mask_hardness=False, _asanyarray=None): if data is None: raise ValueError("Can't get dask array when there is no data") - return data.to_dask_array( - apply_mask_hardness=apply_mask_hardness, _asanyarray=_asanyarray - ) + return data.to_dask_array() @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") @_inplace_enabled(default=False) From 93fa1f036f0c50374ebb02b38403df84349516c1 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 28 Oct 2024 13:52:32 +0000 Subject: [PATCH 134/134] Active storage placeholder Co-authored-by: Sadie L. Bartholomew --- cf/data/fragment/mixin/fragmentarraymixin.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cf/data/fragment/mixin/fragmentarraymixin.py b/cf/data/fragment/mixin/fragmentarraymixin.py index 45c7dcf160..85e844201e 100644 --- a/cf/data/fragment/mixin/fragmentarraymixin.py +++ b/cf/data/fragment/mixin/fragmentarraymixin.py @@ -117,8 +117,8 @@ def _conform_to_aggregated_units(self, array): if isinstance(array, dict): # 'array' is a dictionary. raise ValueError( - "TODOACTIVE. Placeholder notification thatn " - "we can't yet dealing with active " + "TODOACTIVE. Placeholder notification that " + "we can't yet deal with active " "storage reductions on CFA fragments." ) else:
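# --------------------------------------------------------------------
# Illustration (not part of the preceding patches): a minimal sketch
# of how the now-private `_asanyarray` keyword is exercised when
# inspecting a `Data` object's Dask graph. It mirrors the
# `test_Data_cull_graph` test shown above; the graph-key counts in
# the comments are the values asserted by that test, and everything
# else is assumed `cf.Data` behaviour rather than a definitive recipe.
# --------------------------------------------------------------------
import cf

# Build a small chunked Data object and take a subspace, exactly as
# in test_Data_cull_graph.
d = cf.Data([1, 2, 3, 4, 5], chunks=3)
d = d[:2]

# Ask for the Dask array without appending a `cf_asanyarray` layer.
# Before culling, the test asserts that this graph has 3 keys ...
dx = d.to_dask_array(_asanyarray=False)
print(len(dict(dx.dask)))  # 3

# ... and only 2 keys once the unused chunks have been culled.
d.cull_graph()
dx = d.to_dask_array(_asanyarray=False)
print(len(dict(dx.dask)))  # 2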