diff --git a/doc/faq.rst b/doc/faq.rst index 9313481f50a..44bc021024b 100644 --- a/doc/faq.rst +++ b/doc/faq.rst @@ -119,7 +119,8 @@ conventions`_. (An exception is serialization to and from netCDF files.) An implication of this choice is that we do not propagate ``attrs`` through most operations unless explicitly flagged (some methods have a ``keep_attrs`` -option). Similarly, xarray does not check for conflicts between ``attrs`` when +option, and there is a global flag for setting this to be always True or +False). Similarly, xarray does not check for conflicts between ``attrs`` when combining arrays and datasets, unless explicitly requested with the option ``compat='identical'``. The guiding principle is that metadata should not be allowed to get in the way. diff --git a/doc/whats-new.rst b/doc/whats-new.rst index e1744e28077..19b50797d24 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -79,7 +79,12 @@ Enhancements :py:meth:`~xarray.Dataset.differentiate`, :py:meth:`~xarray.DataArray.interp`, and :py:meth:`~xarray.Dataset.interp`. - By `Spencer Clark `_. + By `Spencer Clark `_. +- There is now a global option to either always keep or always discard + dataset and dataarray attrs upon operations. The option is set with + ``xarray.set_options(keep_attrs=True)``, and the default is to use the old + behaviour. + By `Tom Nicholas `_. - Added a new backend for the GRIB file format based on ECMWF *cfgrib* python driver and *ecCodes* C-library.
(:issue:`2475`) By `Alessandro Amici `_, diff --git a/xarray/core/common.py b/xarray/core/common.py index 6c03775a5dd..e303c485523 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -11,6 +11,7 @@ from .arithmetic import SupportsArithmetic from .pycompat import OrderedDict, basestring, dask_array_type, suppress from .utils import Frozen, ReprObject, SortedKeysDict, either_dict_or_kwargs +from .options import _get_keep_attrs # Used as a sentinel value to indicate a all dimensions ALL_DIMS = ReprObject('') @@ -21,13 +22,13 @@ class ImplementsArrayReduce(object): def _reduce_method(cls, func, include_skipna, numeric_only): if include_skipna: def wrapped_func(self, dim=None, axis=None, skipna=None, - keep_attrs=False, **kwargs): - return self.reduce(func, dim, axis, keep_attrs=keep_attrs, + **kwargs): + return self.reduce(func, dim, axis, skipna=skipna, allow_lazy=True, **kwargs) else: - def wrapped_func(self, dim=None, axis=None, keep_attrs=False, + def wrapped_func(self, dim=None, axis=None, **kwargs): - return self.reduce(func, dim, axis, keep_attrs=keep_attrs, + return self.reduce(func, dim, axis, allow_lazy=True, **kwargs) return wrapped_func @@ -51,14 +52,14 @@ class ImplementsDatasetReduce(object): @classmethod def _reduce_method(cls, func, include_skipna, numeric_only): if include_skipna: - def wrapped_func(self, dim=None, keep_attrs=False, skipna=None, + def wrapped_func(self, dim=None, skipna=None, **kwargs): - return self.reduce(func, dim, keep_attrs, skipna=skipna, + return self.reduce(func, dim, skipna=skipna, numeric_only=numeric_only, allow_lazy=True, **kwargs) else: - def wrapped_func(self, dim=None, keep_attrs=False, **kwargs): - return self.reduce(func, dim, keep_attrs, + def wrapped_func(self, dim=None, **kwargs): + return self.reduce(func, dim, numeric_only=numeric_only, allow_lazy=True, **kwargs) return wrapped_func @@ -591,7 +592,7 @@ def rolling(self, dim=None, min_periods=None, center=False, **dim_kwargs): center=center) def 
resample(self, freq=None, dim=None, how=None, skipna=None, - closed=None, label=None, base=0, keep_attrs=False, **indexer): + closed=None, label=None, base=0, keep_attrs=None, **indexer): """Returns a Resample object for performing resampling operations. Handles both downsampling and upsampling. If any intervals contain no @@ -659,6 +660,9 @@ def resample(self, freq=None, dim=None, how=None, skipna=None, from .dataarray import DataArray from .resample import RESAMPLE_DIM + if keep_attrs is None: + keep_attrs = _get_keep_attrs(default=False) + if dim is not None: if how is None: how = 'mean' diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index f131b003a69..61e0e709c36 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -16,7 +16,7 @@ assert_coordinate_consistent, remap_label_indexers) from .dataset import Dataset, merge_indexes, split_indexes from .formatting import format_item -from .options import OPTIONS +from .options import OPTIONS, _get_keep_attrs from .pycompat import OrderedDict, basestring, iteritems, range, zip from .utils import ( decode_numpy_dict_values, either_dict_or_kwargs, ensure_us_time_resolution) @@ -1559,7 +1559,7 @@ def combine_first(self, other): """ return ops.fillna(self, other, join="outer") - def reduce(self, func, dim=None, axis=None, keep_attrs=False, **kwargs): + def reduce(self, func, dim=None, axis=None, keep_attrs=None, **kwargs): """Reduce this array by applying `func` along some dimension(s). Parameters @@ -1588,6 +1588,7 @@ def reduce(self, func, dim=None, axis=None, keep_attrs=False, **kwargs): DataArray with this object's array replaced with an array with summarized data and the indicated dimension(s) removed. 
""" + var = self.variable.reduce(func, dim, axis, keep_attrs, **kwargs) return self._replace_maybe_drop_dims(var) @@ -2270,7 +2271,7 @@ def sortby(self, variables, ascending=True): ds = self._to_temp_dataset().sortby(variables, ascending=ascending) return self._from_temp_dataset(ds) - def quantile(self, q, dim=None, interpolation='linear', keep_attrs=False): + def quantile(self, q, dim=None, interpolation='linear', keep_attrs=None): """Compute the qth quantile of the data along the specified dimension. Returns the qth quantiles(s) of the array elements. @@ -2316,7 +2317,7 @@ def quantile(self, q, dim=None, interpolation='linear', keep_attrs=False): q, dim=dim, keep_attrs=keep_attrs, interpolation=interpolation) return self._from_temp_dataset(ds) - def rank(self, dim, pct=False, keep_attrs=False): + def rank(self, dim, pct=False, keep_attrs=None): """Ranks the data. Equal values are assigned a rank that is the average of the ranks that @@ -2352,6 +2353,7 @@ def rank(self, dim, pct=False, keep_attrs=False): array([ 1., 2., 3.]) Dimensions without coordinates: x """ + ds = self._to_temp_dataset().rank(dim, pct=pct, keep_attrs=keep_attrs) return self._from_temp_dataset(ds) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 983270cf425..7bd99968ebb 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -28,7 +28,7 @@ from .merge import ( dataset_merge_method, dataset_update_method, merge_data_and_coords, merge_variables) -from .options import OPTIONS +from .options import OPTIONS, _get_keep_attrs from .pycompat import ( OrderedDict, basestring, dask_array_type, integer_types, iteritems, range) from .utils import ( @@ -2842,7 +2842,7 @@ def combine_first(self, other): out = ops.fillna(self, other, join="outer", dataset_join="outer") return out - def reduce(self, func, dim=None, keep_attrs=False, numeric_only=False, + def reduce(self, func, dim=None, keep_attrs=None, numeric_only=False, allow_lazy=False, **kwargs): """Reduce this dataset by 
applying `func` along some dimension(s). @@ -2884,6 +2884,9 @@ def reduce(self, func, dim=None, keep_attrs=False, numeric_only=False, raise ValueError('Dataset does not contain the dimensions: %s' % missing_dimensions) + if keep_attrs is None: + keep_attrs = _get_keep_attrs(default=False) + variables = OrderedDict() for name, var in iteritems(self._variables): reduce_dims = [dim for dim in var.dims if dim in dims] @@ -2912,7 +2915,7 @@ def reduce(self, func, dim=None, keep_attrs=False, numeric_only=False, attrs = self.attrs if keep_attrs else None return self._replace_vars_and_dims(variables, coord_names, attrs=attrs) - def apply(self, func, keep_attrs=False, args=(), **kwargs): + def apply(self, func, keep_attrs=None, args=(), **kwargs): """Apply a function over the data variables in this dataset. Parameters @@ -2957,6 +2960,8 @@ def apply(self, func, keep_attrs=False, args=(), **kwargs): variables = OrderedDict( (k, maybe_wrap_array(v, func(v, *args, **kwargs))) for k, v in iteritems(self.data_vars)) + if keep_attrs is None: + keep_attrs = _get_keep_attrs(default=False) attrs = self.attrs if keep_attrs else None return type(self)(variables, attrs=attrs) @@ -3621,7 +3626,7 @@ def sortby(self, variables, ascending=True): return aligned_self.isel(**indices) def quantile(self, q, dim=None, interpolation='linear', - numeric_only=False, keep_attrs=False): + numeric_only=False, keep_attrs=None): """Compute the qth quantile of the data along the specified dimension. 
Returns the qth quantiles(s) of the array elements for each variable @@ -3699,6 +3704,8 @@ def quantile(self, q, dim=None, interpolation='linear', # construct the new dataset coord_names = set(k for k in self.coords if k in variables) + if keep_attrs is None: + keep_attrs = _get_keep_attrs(default=False) attrs = self.attrs if keep_attrs else None new = self._replace_vars_and_dims(variables, coord_names, attrs=attrs) if 'quantile' in new.dims: @@ -3707,7 +3714,7 @@ def quantile(self, q, dim=None, interpolation='linear', new.coords['quantile'] = q return new - def rank(self, dim, pct=False, keep_attrs=False): + def rank(self, dim, pct=False, keep_attrs=None): """Ranks the data. Equal values are assigned a rank that is the average of the ranks that @@ -3747,6 +3754,8 @@ def rank(self, dim, pct=False, keep_attrs=False): variables[name] = var coord_names = set(self.coords) + if keep_attrs is None: + keep_attrs = _get_keep_attrs(default=False) attrs = self.attrs if keep_attrs else None return self._replace_vars_and_dims(variables, coord_names, attrs=attrs) @@ -3810,11 +3819,13 @@ def differentiate(self, coord, edge_order=1, datetime_unit=None): @property def real(self): - return self._unary_op(lambda x: x.real, keep_attrs=True)(self) + return self._unary_op(lambda x: x.real, + keep_attrs=True)(self) @property def imag(self): - return self._unary_op(lambda x: x.imag, keep_attrs=True)(self) + return self._unary_op(lambda x: x.imag, + keep_attrs=True)(self) def filter_by_attrs(self, **kwargs): """Returns a ``Dataset`` with variables that match specific conditions. 
diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index dc23eae8b76..defe72ab3ee 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -13,6 +13,7 @@ from .pycompat import integer_types, range, zip from .utils import hashable, maybe_wrap_array, peek_at, safe_cast_to_index from .variable import IndexVariable, Variable, as_variable +from .options import _get_keep_attrs def unique_value_groups(ar, sort=True): @@ -404,15 +405,17 @@ def _first_or_last(self, op, skipna, keep_attrs): # NB. this is currently only used for reductions along an existing # dimension return self._obj + if keep_attrs is None: + keep_attrs = _get_keep_attrs(default=True) return self.reduce(op, self._group_dim, skipna=skipna, keep_attrs=keep_attrs, allow_lazy=True) - def first(self, skipna=None, keep_attrs=True): + def first(self, skipna=None, keep_attrs=None): """Return the first element of each group along the group dimension """ return self._first_or_last(duck_array_ops.first, skipna, keep_attrs) - def last(self, skipna=None, keep_attrs=True): + def last(self, skipna=None, keep_attrs=None): """Return the last element of each group along the group dimension """ return self._first_or_last(duck_array_ops.last, skipna, keep_attrs) @@ -539,8 +542,8 @@ def _combine(self, applied, shortcut=False): combined = self._maybe_unstack(combined) return combined - def reduce(self, func, dim=None, axis=None, keep_attrs=False, - shortcut=True, **kwargs): + def reduce(self, func, dim=None, axis=None, + keep_attrs=None, shortcut=True, **kwargs): """Reduce the items in this group by applying `func` along some dimension(s). 
@@ -580,6 +583,9 @@ def reduce(self, func, dim=None, axis=None, keep_attrs=False, "warning, pass dim=xarray.ALL_DIMS explicitly.", FutureWarning, stacklevel=2) + if keep_attrs is None: + keep_attrs = _get_keep_attrs(default=False) + def reduce_array(ar): return ar.reduce(func, dim, axis, keep_attrs=keep_attrs, **kwargs) return self.apply(reduce_array, shortcut=shortcut) @@ -590,12 +596,12 @@ def reduce_array(ar): def _reduce_method(cls, func, include_skipna, numeric_only): if include_skipna: def wrapped_func(self, dim=DEFAULT_DIMS, axis=None, skipna=None, - keep_attrs=False, **kwargs): + keep_attrs=None, **kwargs): return self.reduce(func, dim, axis, keep_attrs=keep_attrs, skipna=skipna, allow_lazy=True, **kwargs) else: def wrapped_func(self, dim=DEFAULT_DIMS, axis=None, - keep_attrs=False, **kwargs): + keep_attrs=None, **kwargs): return self.reduce(func, dim, axis, keep_attrs=keep_attrs, allow_lazy=True, **kwargs) return wrapped_func @@ -651,7 +657,7 @@ def _combine(self, applied): combined = self._maybe_unstack(combined) return combined - def reduce(self, func, dim=None, keep_attrs=False, **kwargs): + def reduce(self, func, dim=None, keep_attrs=None, **kwargs): """Reduce the items in this group by applying `func` along some dimension(s). 
@@ -692,6 +698,9 @@ def reduce(self, func, dim=None, keep_attrs=False, **kwargs): elif dim is None: dim = self._group_dim + if keep_attrs is None: + keep_attrs = _get_keep_attrs(default=False) + def reduce_dataset(ds): return ds.reduce(func, dim, keep_attrs, **kwargs) return self.apply(reduce_dataset) @@ -701,15 +710,15 @@ def reduce_dataset(ds): @classmethod def _reduce_method(cls, func, include_skipna, numeric_only): if include_skipna: - def wrapped_func(self, dim=DEFAULT_DIMS, keep_attrs=False, + def wrapped_func(self, dim=DEFAULT_DIMS, skipna=None, **kwargs): - return self.reduce(func, dim, keep_attrs, skipna=skipna, - numeric_only=numeric_only, allow_lazy=True, - **kwargs) + return self.reduce(func, dim, + skipna=skipna, numeric_only=numeric_only, + allow_lazy=True, **kwargs) else: - def wrapped_func(self, dim=DEFAULT_DIMS, keep_attrs=False, + def wrapped_func(self, dim=DEFAULT_DIMS, **kwargs): - return self.reduce(func, dim, keep_attrs, + return self.reduce(func, dim, numeric_only=numeric_only, allow_lazy=True, **kwargs) return wrapped_func diff --git a/xarray/core/options.py b/xarray/core/options.py index 04ea0be7172..eb3013d5233 100644 --- a/xarray/core/options.py +++ b/xarray/core/options.py @@ -6,6 +6,8 @@ FILE_CACHE_MAXSIZE = 'file_cache_maxsize' CMAP_SEQUENTIAL = 'cmap_sequential' CMAP_DIVERGENT = 'cmap_divergent' +KEEP_ATTRS = 'keep_attrs' + OPTIONS = { DISPLAY_WIDTH: 80, @@ -14,6 +16,7 @@ FILE_CACHE_MAXSIZE: 128, CMAP_SEQUENTIAL: 'viridis', CMAP_DIVERGENT: 'RdBu_r', + KEEP_ATTRS: 'default' } _JOIN_OPTIONS = frozenset(['inner', 'outer', 'left', 'right', 'exact']) @@ -28,6 +31,7 @@ def _positive_integer(value): ARITHMETIC_JOIN: _JOIN_OPTIONS.__contains__, ENABLE_CFTIMEINDEX: lambda value: isinstance(value, bool), FILE_CACHE_MAXSIZE: _positive_integer, + KEEP_ATTRS: lambda choice: choice in [True, False, 'default'] } @@ -41,6 +45,17 @@ def _set_file_cache_maxsize(value): } +def _get_keep_attrs(default): + global_choice = OPTIONS['keep_attrs'] + + if 
global_choice == 'default': + return default + elif global_choice in [True, False]: + return global_choice + else: + raise ValueError("The global option keep_attrs must be one of True, False or 'default'.") + + class set_options(object): """Set options for xarray in a controlled context. @@ -63,8 +78,13 @@ class set_options(object): - ``cmap_divergent``: colormap to use for divergent data plots. Default: ``RdBu_r``. If string, must be matplotlib built-in colormap. Can also be a Colormap object (e.g. mpl.cm.magma) + - ``keep_attrs``: rule for whether to keep attributes on xarray + Datasets/DataArrays after operations. Either ``True`` to always keep + attrs, ``False`` to always discard them, or ``'default'`` to use original + logic that attrs should only be kept in unambiguous circumstances. + Default: ``'default'``. -f You can use ``set_options`` either as a context manager: + You can use ``set_options`` either as a context manager: >>> ds = xr.Dataset({'x': np.arange(1000)}) >>> with xr.set_options(display_width=40): diff --git a/xarray/core/resample.py b/xarray/core/resample.py index bd84e04487e..edf7dfc3d41 100644 --- a/xarray/core/resample.py +++ b/xarray/core/resample.py @@ -273,7 +273,7 @@ def apply(self, func, **kwargs): return combined.rename({self._resample_dim: self._dim}) - def reduce(self, func, dim=None, keep_attrs=False, **kwargs): + def reduce(self, func, dim=None, keep_attrs=None, **kwargs): """Reduce the items in this group by applying `func` along the pre-defined resampling dimension.
diff --git a/xarray/core/variable.py b/xarray/core/variable.py index c003d52aab2..271f00102e0 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -18,6 +18,7 @@ from .pycompat import ( OrderedDict, basestring, dask_array_type, integer_types, zip) from .utils import OrderedSet, either_dict_or_kwargs +from .options import _get_keep_attrs try: import dask.array as da @@ -1303,8 +1304,8 @@ def fillna(self, value): def where(self, cond, other=dtypes.NA): return ops.where_method(self, cond, other) - def reduce(self, func, dim=None, axis=None, keep_attrs=False, - allow_lazy=False, **kwargs): + def reduce(self, func, dim=None, axis=None, + keep_attrs=None, allow_lazy=False, **kwargs): """Reduce this array by applying `func` along some dimension(s). Parameters @@ -1351,6 +1352,8 @@ def reduce(self, func, dim=None, axis=None, keep_attrs=False, dims = [adim for n, adim in enumerate(self.dims) if n not in removed_axes] + if keep_attrs is None: + keep_attrs = _get_keep_attrs(default=False) attrs = self._attrs if keep_attrs else None return Variable(dims, data, attrs=attrs) diff --git a/xarray/tests/test_options.py b/xarray/tests/test_options.py index 4441375a1b1..a21ea3e6b64 100644 --- a/xarray/tests/test_options.py +++ b/xarray/tests/test_options.py @@ -3,8 +3,10 @@ import pytest import xarray -from xarray.core.options import OPTIONS +from xarray.core.options import OPTIONS, _get_keep_attrs from xarray.backends.file_manager import FILE_CACHE +from xarray.tests.test_dataset import create_test_data +from xarray import concat, merge def test_invalid_option_raises(): @@ -44,6 +46,18 @@ def test_file_cache_maxsize(): assert FILE_CACHE.maxsize == original_size +def test_keep_attrs(): + with pytest.raises(ValueError): + xarray.set_options(keep_attrs='invalid_str') + with xarray.set_options(keep_attrs=True): + assert OPTIONS['keep_attrs'] + with xarray.set_options(keep_attrs=False): + assert not OPTIONS['keep_attrs'] + with xarray.set_options(keep_attrs='default'): + 
assert _get_keep_attrs(default=True) + assert not _get_keep_attrs(default=False) + + def test_nested_options(): original = OPTIONS['display_width'] with xarray.set_options(display_width=1): @@ -52,3 +66,105 @@ def test_nested_options(): assert OPTIONS['display_width'] == 2 assert OPTIONS['display_width'] == 1 assert OPTIONS['display_width'] == original + + +def create_test_dataset_attrs(seed=0): + ds = create_test_data(seed) + ds.attrs = {'attr1': 5, 'attr2': 'history', + 'attr3': {'nested': 'more_info'}} + return ds + + +def create_test_dataarray_attrs(seed=0, var='var1'): + da = create_test_data(seed)[var] + da.attrs = {'attr1': 5, 'attr2': 'history', + 'attr3': {'nested': 'more_info'}} + return da + + +class TestAttrRetention(object): + def test_dataset_attr_retention(self): + # Use .mean() for all tests: a typical reduction operation + ds = create_test_dataset_attrs() + original_attrs = ds.attrs + + # Test default behaviour + result = ds.mean() + assert result.attrs == {} + with xarray.set_options(keep_attrs='default'): + result = ds.mean() + assert result.attrs == {} + + with xarray.set_options(keep_attrs=True): + result = ds.mean() + assert result.attrs == original_attrs + + with xarray.set_options(keep_attrs=False): + result = ds.mean() + assert result.attrs == {} + + def test_dataarray_attr_retention(self): + # Use .mean() for all tests: a typical reduction operation + da = create_test_dataarray_attrs() + original_attrs = da.attrs + + # Test default behaviour + result = da.mean() + assert result.attrs == {} + with xarray.set_options(keep_attrs='default'): + result = da.mean() + assert result.attrs == {} + + with xarray.set_options(keep_attrs=True): + result = da.mean() + assert result.attrs == original_attrs + + with xarray.set_options(keep_attrs=False): + result = da.mean() + assert result.attrs == {} + + def test_groupby_attr_retention(self): + da = xarray.DataArray([1, 2, 3], [('x', [1, 1, 2])]) + da.attrs = {'attr1': 5, 'attr2': 'history', + 'attr3': 
{'nested': 'more_info'}} + original_attrs = da.attrs + + # Test explicit keep_attrs=True with the global option at its default + result = da.groupby('x').sum(keep_attrs=True) + assert result.attrs == original_attrs + with xarray.set_options(keep_attrs='default'): + result = da.groupby('x').sum(keep_attrs=True) + assert result.attrs == original_attrs + + with xarray.set_options(keep_attrs=True): + result1 = da.groupby('x') + result = result1.sum() + assert result.attrs == original_attrs + + with xarray.set_options(keep_attrs=False): + result = da.groupby('x').sum() + assert result.attrs == {} + + def test_concat_attr_retention(self): + ds1 = create_test_dataset_attrs() + ds2 = create_test_dataset_attrs() + ds2.attrs = {'wrong': 'attributes'} + original_attrs = ds1.attrs + + # Test default behaviour of keeping the attrs of the first + # dataset in the supplied list + # global keep_attrs option currently doesn't affect concat + result = concat([ds1, ds2], dim='dim1') + assert result.attrs == original_attrs + + @pytest.mark.xfail + def test_merge_attr_retention(self): + da1 = create_test_dataarray_attrs(var='var1') + da2 = create_test_dataarray_attrs(var='var2') + da2.attrs = {'wrong': 'attributes'} + original_attrs = da1.attrs + + # merge currently discards attrs, and the global keep_attrs + # option doesn't affect this + result = merge([da1, da2]) + assert result.attrs == original_attrs