From db36c5c0cdee2f5313a81fdeca8a8ae5491d1c8f Mon Sep 17 00:00:00 2001
From: hazbottles
Date: Fri, 3 Jan 2020 12:16:44 +0000
Subject: [PATCH 01/14] add multiindex level name checking to .rename() (#3658)

* add multiindex level name checking to .rename()

* update whats-new.rst
---
 doc/whats-new.rst            | 2 ++
 xarray/core/dataset.py       | 9 ++++++++-
 xarray/tests/test_dataset.py | 8 ++++++++
 3 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 00d1c50780e..70853dbb730 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -62,6 +62,8 @@ Bug fixes
   By `Tom Augspurger `_.
 - Ensure :py:meth:`Dataset.quantile`, :py:meth:`DataArray.quantile` issue the correct
   error when ``q`` is out of bounds (:issue:`3634`) by `Mathias Hauser `_.
+- :py:meth:`Dataset.rename`, :py:meth:`DataArray.rename` now check for conflicts with
+  MultiIndex level names.
 
 Documentation
 ~~~~~~~~~~~~~
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index 6be06fed117..032c66d3778 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -89,7 +89,13 @@
     is_scalar,
     maybe_wrap_array,
 )
-from .variable import IndexVariable, Variable, as_variable, broadcast_variables
+from .variable import (
+    IndexVariable,
+    Variable,
+    as_variable,
+    broadcast_variables,
+    assert_unique_multiindex_level_names,
+)
 
 if TYPE_CHECKING:
     from ..backends import AbstractDataStore, ZarrStore
@@ -2780,6 +2786,7 @@ def rename(
         variables, coord_names, dims, indexes = self._rename_all(
             name_dict=name_dict, dims_dict=name_dict
         )
+        assert_unique_multiindex_level_names(variables)
         return self._replace(variables, coord_names, dims=dims, indexes=indexes)
 
     def rename_dims(
diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py
index 7db1911621b..edda4cb2a58 100644
--- a/xarray/tests/test_dataset.py
+++ b/xarray/tests/test_dataset.py
@@ -2461,6 +2461,14 @@ def test_rename_vars(self):
         with pytest.raises(ValueError):
             original.rename_vars(names_dict_bad)
 
+    def test_rename_multiindex(self):
+        mindex = pd.MultiIndex.from_tuples(
+            [([1, 2]), ([3, 4])], names=["level0", "level1"]
+        )
+        data = Dataset({}, {"x": mindex})
+        with raises_regex(ValueError, "conflicting MultiIndex"):
+            data.rename({"x": "level0"})
+
     @requires_cftime
     def test_rename_does_not_change_CFTimeIndex_type(self):
         # make sure CFTimeIndex is not converted to DatetimeIndex #3522

From 8fc9ecedc87b5d878363b233e260c87fd632fa0f Mon Sep 17 00:00:00 2001
From: Riley Brady
Date: Wed, 8 Jan 2020 10:50:03 -0700
Subject: [PATCH 02/14] Add map_blocks example to docs. (#3667)

---
 xarray/core/parallel.py | 42 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/xarray/core/parallel.py b/xarray/core/parallel.py
index dd6c67338d8..e4fb5803191 100644
--- a/xarray/core/parallel.py
+++ b/xarray/core/parallel.py
@@ -154,6 +154,48 @@ def map_blocks(
     --------
     dask.array.map_blocks, xarray.apply_ufunc, xarray.Dataset.map_blocks,
     xarray.DataArray.map_blocks
+
+    Examples
+    --------
+
+    Calculate an anomaly from climatology using ``.groupby()``. Using
+    ``xr.map_blocks()`` allows for parallel operations with knowledge of ``xarray``,
+    its indices, and its methods like ``.groupby()``.
+
+    >>> def calculate_anomaly(da, groupby_type='time.month'):
+    ...     # Necessary workaround to xarray's check with zero dimensions
+    ...     # https://github.com/pydata/xarray/issues/3575
+    ...     if sum(da.shape) == 0:
+    ...         return da
+    ...     gb = da.groupby(groupby_type)
+    ...     clim = gb.mean(dim='time')
+    ...     return gb - clim
+
+    >>> time = xr.cftime_range('1990-01', '1992-01', freq='M')
+    >>> np.random.seed(123)
+    >>> array = xr.DataArray(np.random.rand(len(time)),
+    ...                      dims="time", coords=[time]).chunk()
+    >>> xr.map_blocks(calculate_anomaly, array).compute()
+    <xarray.DataArray (time: 24)>
+    array([ 0.12894847,  0.11323072, -0.0855964 , -0.09334032,  0.26848862,
+            0.12382735,  0.22460641,  0.07650108, -0.07673453, -0.22865714,
+           -0.19063865,  0.0590131 , -0.12894847, -0.11323072,  0.0855964 ,
+            0.09334032, -0.26848862, -0.12382735, -0.22460641, -0.07650108,
+            0.07673453,  0.22865714,  0.19063865, -0.0590131 ])
+    Coordinates:
+      * time     (time) object 1990-01-31 00:00:00 ... 1991-12-31 00:00:00
+
+    Note that one must explicitly use ``args=[]`` and ``kwargs={}`` to pass arguments
+    to the function being applied in ``xr.map_blocks()``:
+
+    >>> xr.map_blocks(calculate_anomaly, array, kwargs={'groupby_type': 'time.year'})
+    <xarray.DataArray (time: 24)>
+    array([ 0.15361741, -0.25671244, -0.31600032,  0.008463  ,  0.1766172 ,
+           -0.11974531,  0.43791243,  0.14197797, -0.06191987, -0.15073425,
+           -0.19967375,  0.18619794, -0.05100474, -0.42989909, -0.09153273,
+            0.24841842, -0.30708526, -0.31412523,  0.04197439,  0.0422506 ,
+            0.14482397,  0.35985481,  0.23487834,  0.12144652])
+    Coordinates:
+      * time     (time) object 1990-01-31 00:00:00 ... 1991-12-31 00:00:00
     """
 
     def _wrapper(func, obj, to_array, args, kwargs):

From 080caf4246fe2f4d6aa0c5dcb65a99b376fa669b Mon Sep 17 00:00:00 2001
From: keewis
Date: Wed, 8 Jan 2020 19:27:29 +0100
Subject: [PATCH 03/14] Support swap_dims to dimension names that are not existing variables (#3636)

* test that swapping to a non-existing name works

* don't try to get a variable if the variable does not exist

* don't add dimensions to coord_names if they are not existing variables

* add whats-new.rst entry

* update the documentation
---
 doc/whats-new.rst              |  3 +++
 xarray/core/dataarray.py       | 10 ++++++++--
 xarray/core/dataset.py         | 17 +++++++++++++----
 xarray/tests/test_dataarray.py |  5 +++++
 xarray/tests/test_dataset.py   |  6 ++++++
 5 files changed, 35 insertions(+), 6 deletions(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 70853dbb730..351424fbb9f 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -37,6 +37,9 @@ New Features
 - Added the ``count`` reduction method to both :py:class:`~core.rolling.DatasetCoarsen`
   and :py:class:`~core.rolling.DataArrayCoarsen` objects. (:pull:`3500`)
   By `Deepak Cherian `_
+- :py:meth:`Dataset.swap_dims` and :py:meth:`DataArray.swap_dims`
+  now allow swapping to dimension names that don't exist yet. (:pull:`3636`)
+  By `Justus Magin `_.
 - Extend :py:class:`core.accessor_dt.DatetimeAccessor` properties
   and support `.dt` accessor for timedelta
   via :py:class:`core.accessor_dt.TimedeltaAccessor` (:pull:`3612`)
diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
index 31aa4da57b2..cbd8d243385 100644
--- a/xarray/core/dataarray.py
+++ b/xarray/core/dataarray.py
@@ -1480,8 +1480,7 @@ def swap_dims(self, dims_dict: Mapping[Hashable, Hashable]) -> "DataArray":
         ----------
         dims_dict : dict-like
             Dictionary whose keys are current dimension names and whose values
-            are new names. Each value must already be a coordinate on this
-            array.
+            are new names.
 
         Returns
         -------
         renamed : DataArray
             DataArray with swapped dimensions.
@@ -1504,6 +1503,13 @@ def swap_dims(self, dims_dict: Mapping[Hashable, Hashable]) -> "DataArray":
         Coordinates:
             x        (y) <U1 'a' 'b'
           * y        (y) int64 0 1
 
+        >>> arr.swap_dims({"x": "z"})
+        <xarray.DataArray (z: 2)>
+        array([0, 1])
+        Coordinates:
+            x        (z) <U1 'a' 'b'

+        >>> ds.swap_dims({"x": "z"})
+        <xarray.Dataset>
+        Dimensions:  (z: 2)
+        Coordinates:
+            x        (z) <U1 'a' 'b'

From: keewis
Date: Thu, 9 Jan 2020 02:46:45 +0100
Subject: [PATCH 04/14] raise an error when renaming dimensions to existing names (#3645)

* check we raise an error if the name already exists

* raise if the new name already exists and point to swap_dims

* update the documentation

* whats-new.rst

* fix the docstring of rename_dims

Co-Authored-By: Deepak Cherian
---
 doc/whats-new.rst            |  3 +++
 xarray/core/dataset.py       | 10 ++++++++--
 xarray/tests/test_dataset.py |  3 +++
 3 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 351424fbb9f..5a9f2497ed6 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -65,6 +65,9 @@ Bug fixes
   By `Tom Augspurger `_.
 - Ensure :py:meth:`Dataset.quantile`, :py:meth:`DataArray.quantile` issue the correct
   error when ``q`` is out of bounds (:issue:`3634`) by `Mathias Hauser `_.
+- Raise an error when trying to use :py:meth:`Dataset.rename_dims` to
+  rename to an existing name (:issue:`3438`, :pull:`3645`)
+  By `Justus Magin `_.
 - :py:meth:`Dataset.rename`, :py:meth:`DataArray.rename` now check for conflicts with
   MultiIndex level names.
 
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index 607350c8101..ac0a923db78 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -2798,7 +2798,8 @@ def rename_dims(
         ----------
         dims_dict : dict-like, optional
             Dictionary whose keys are current dimension names and
-            whose values are the desired names.
+            whose values are the desired names. The desired names must
+            not be the name of an existing dimension or Variable in the Dataset.
         **dims, optional
             Keyword form of ``dims_dict``.
             One of dims_dict or dims must be provided.
@@ -2816,12 +2817,17 @@ def rename_dims(
         DataArray.rename
         """
         dims_dict = either_dict_or_kwargs(dims_dict, dims, "rename_dims")
-        for k in dims_dict:
+        for k, v in dims_dict.items():
             if k not in self.dims:
                 raise ValueError(
                     "cannot rename %r because it is not a "
                     "dimension in this dataset" % k
                 )
+            if v in self.dims or v in self:
+                raise ValueError(
+                    f"Cannot rename {k} to {v} because {v} already exists. "
+                    "Try using swap_dims instead."
+                )
 
         variables, coord_names, sizes, indexes = self._rename_all(
             name_dict={}, dims_dict=dims_dict
diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py
index 48d8c25b810..2220abbef31 100644
--- a/xarray/tests/test_dataset.py
+++ b/xarray/tests/test_dataset.py
@@ -2444,6 +2444,9 @@ def test_rename_dims(self):
         with pytest.raises(ValueError):
             original.rename_dims(dims_dict_bad)
 
+        with pytest.raises(ValueError):
+            original.rename_dims({"x": "z"})
+
     def test_rename_vars(self):
         original = Dataset({"x": ("x", [0, 1, 2]), "y": ("x", [10, 11, 12]), "z": 42})
         expected = Dataset(

From 24f9292114894621d8eb7a4eade6347538ce0d23 Mon Sep 17 00:00:00 2001
From: Deepak Cherian
Date: Fri, 10 Jan 2020 16:10:56 +0000
Subject: [PATCH 05/14] Make dask names change when chunking Variables by different amounts. (#3584)

* Make dask tokens change when chunking Variables by different amounts.

When rechunking by the current chunk size, the dask token should not change.

Add a __dask_tokenize__ method for ReprObject so that this behaviour is
present when DataArrays are converted to temporary Datasets and back.
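
For example, a minimal sketch of the behaviour the new tests assert (the
variable name, dimension names, and chunk sizes below are illustrative):

    import numpy as np
    import xarray as xr

    ds = xr.Dataset({"a": (("x", "y"), np.zeros((8, 8)))}).chunk({"x": 4})

    # rechunking by a different amount yields a new dask name ...
    assert ds.chunk({"x": 2})["a"].data.name != ds["a"].data.name
    # ... while rechunking by the same amount keeps the name stable
    assert ds.chunk({"x": 4})["a"].data.name == ds["a"].data.name
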
Co-Authored-By: crusaderky
Co-authored-by: crusaderky
---
 doc/whats-new.rst              |  4 ++++
 xarray/core/dataset.py         |  5 ++++-
 xarray/core/utils.py           |  7 ++++++-
 xarray/tests/test_dask.py      | 18 +++++++++---------
 xarray/tests/test_dataarray.py |  7 +++++++
 xarray/tests/test_dataset.py   | 16 ++++++++++++++++
 6 files changed, 46 insertions(+), 11 deletions(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 5a9f2497ed6..e1c4e3dd9ac 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -47,6 +47,7 @@ New Features
 
 Bug fixes
 ~~~~~~~~~
+
 - Fix :py:meth:`xarray.combine_by_coords` to allow for combining incomplete
   hypercubes of Datasets (:issue:`3648`). By `Ian Bolliger `_.
@@ -91,6 +92,9 @@ Documentation
 
 Internal Changes
 ~~~~~~~~~~~~~~~~
+- Make sure dask names change when rechunking by different chunk sizes. Conversely, make sure they
+  stay the same when rechunking by the same chunk size. (:issue:`3350`)
+  By `Deepak Cherian `_.
 - 2x to 5x speed boost (on small arrays) for :py:meth:`Dataset.isel`,
   :py:meth:`DataArray.isel`, and :py:meth:`DataArray.__getitem__` when indexing by int,
   slice, list of int, scalar ndarray, or 1-dimensional ndarray.
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index ac0a923db78..129f0c0f7a7 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -1754,7 +1754,10 @@ def maybe_chunk(name, var, chunks):
             if not chunks:
                 chunks = None
             if var.ndim > 0:
-                token2 = tokenize(name, token if token else var._data)
+                # when rechunking by different amounts, make sure dask names change
+                # by providing chunks as an input to tokenize.
+                # subtle bugs result otherwise; see GH3350
+                token2 = tokenize(name, token if token else var._data, chunks)
                 name2 = f"{name_prefix}{name}-{token2}"
                 return var.chunk(chunks, name=name2, lock=lock)
             else:
diff --git a/xarray/core/utils.py b/xarray/core/utils.py
index 6681375c18e..e335365d5ca 100644
--- a/xarray/core/utils.py
+++ b/xarray/core/utils.py
@@ -547,7 +547,12 @@ def __eq__(self, other) -> bool:
         return False
 
     def __hash__(self) -> int:
-        return hash((ReprObject, self._value))
+        return hash((type(self), self._value))
+
+    def __dask_tokenize__(self):
+        from dask.base import normalize_token
+
+        return normalize_token((type(self), self._value))
 
 
 @contextlib.contextmanager
diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py
index d0e2654eed3..cc554850839 100644
--- a/xarray/tests/test_dask.py
+++ b/xarray/tests/test_dask.py
@@ -1083,7 +1083,7 @@ def func(obj):
     actual = xr.map_blocks(func, obj)
     expected = func(obj)
     assert_chunks_equal(expected.chunk(), actual)
-    xr.testing.assert_identical(actual.compute(), expected.compute())
+    assert_identical(actual, expected)
 
 
 @pytest.mark.parametrize("obj", [make_da(), make_ds()])
@@ -1092,7 +1092,7 @@ def test_map_blocks_convert_args_to_list(obj):
     with raise_if_dask_computes():
         actual = xr.map_blocks(operator.add, obj, [10])
     assert_chunks_equal(expected.chunk(), actual)
-    xr.testing.assert_identical(actual.compute(), expected.compute())
+    assert_identical(actual, expected)
 
 
 @pytest.mark.parametrize("obj", [make_da(), make_ds()])
@@ -1107,7 +1107,7 @@ def add_attrs(obj):
 
     with raise_if_dask_computes():
         actual = xr.map_blocks(add_attrs, obj)
-    xr.testing.assert_identical(actual.compute(), expected.compute())
+    assert_identical(actual, expected)
 
 
 def test_map_blocks_change_name(map_da):
@@ -1120,7 +1120,7 @@ def change_name(obj):
 
     with raise_if_dask_computes():
         actual = xr.map_blocks(change_name, map_da)
-    xr.testing.assert_identical(actual.compute(), expected.compute())
+    assert_identical(actual, expected)
 
 
 @pytest.mark.parametrize("obj", [make_da(), make_ds()])
@@ -1129,7 +1129,7 @@ def test_map_blocks_kwargs(obj):
     with raise_if_dask_computes():
         actual = xr.map_blocks(xr.full_like, obj, kwargs=dict(fill_value=np.nan))
     assert_chunks_equal(expected.chunk(), actual)
-    xr.testing.assert_identical(actual.compute(), expected.compute())
+    assert_identical(actual, expected)
 
 
 def test_map_blocks_to_array(map_ds):
@@ -1137,7 +1137,7 @@ def test_map_blocks_to_array(map_ds):
         actual = xr.map_blocks(lambda x: x.to_array(), map_ds)
 
     # to_array does not preserve name, so cannot use assert_identical
-    assert_equal(actual.compute(), map_ds.to_array().compute())
+    assert_equal(actual, map_ds.to_array())
 
 
 @pytest.mark.parametrize(
@@ -1156,7 +1156,7 @@ def test_map_blocks_da_transformations(func, map_da):
     with raise_if_dask_computes():
         actual = xr.map_blocks(func, map_da)
 
-    assert_identical(actual.compute(), func(map_da).compute())
+    assert_identical(actual, func(map_da))
 
 
 @pytest.mark.parametrize(
@@ -1175,7 +1175,7 @@ def test_map_blocks_ds_transformations(func, map_ds):
     with raise_if_dask_computes():
         actual = xr.map_blocks(func, map_ds)
 
-    assert_identical(actual.compute(), func(map_ds).compute())
+    assert_identical(actual, func(map_ds))
 
 
 @pytest.mark.parametrize("obj", [make_da(), make_ds()])
@@ -1188,7 +1188,7 @@ def func(obj):
     expected = xr.map_blocks(func, obj)
     actual = obj.map_blocks(func)
 
-    assert_identical(expected.compute(), actual.compute())
+    assert_identical(expected, actual)
 
 
 def test_map_blocks_hlg_layers():
diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py
index 4189c3b504a..786eb5007a6 100644
--- a/xarray/tests/test_dataarray.py
+++ b/xarray/tests/test_dataarray.py
@@ -752,12 +752,19 @@ def test_chunk(self):
         blocked = unblocked.chunk()
         assert blocked.chunks == ((3,), (4,))
+        first_dask_name = blocked.data.name
 
         blocked = unblocked.chunk(chunks=((2, 1), (2, 2)))
         assert blocked.chunks == ((2, 1), (2, 2))
+        assert blocked.data.name != first_dask_name
 
         blocked = unblocked.chunk(chunks=(3, 3))
         assert blocked.chunks == ((3,), (3, 1))
+        assert blocked.data.name != first_dask_name
+
+        # name doesn't change when rechunking by same amount
+        # this fails if ReprObject doesn't have __dask_tokenize__ defined
+        assert unblocked.chunk(2).data.name == unblocked.chunk(2).data.name
 
         assert blocked.load().chunks is None
diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py
index 2220abbef31..c953f5d22e9 100644
--- a/xarray/tests/test_dataset.py
+++ b/xarray/tests/test_dataset.py
@@ -936,19 +936,35 @@ def test_chunk(self):
         expected_chunks = {"dim1": (8,), "dim2": (9,), "dim3": (10,)}
         assert reblocked.chunks == expected_chunks
 
+        def get_dask_names(ds):
+            return {k: v.data.name for k, v in ds.items()}
+
+        orig_dask_names = get_dask_names(reblocked)
+
         reblocked = data.chunk({"time": 5, "dim1": 5, "dim2": 5, "dim3": 5})
         # time is not a dim in any of the data_vars, so it
         # doesn't get chunked
         expected_chunks = {"dim1": (5, 3), "dim2": (5, 4), "dim3": (5, 5)}
         assert reblocked.chunks == expected_chunks
 
+        # make sure dask names change when rechunking by different amounts
+        # regression test for GH3350
+        new_dask_names = get_dask_names(reblocked)
+        for k, v in new_dask_names.items():
+            assert v != orig_dask_names[k]
+
         reblocked = data.chunk(expected_chunks)
         assert reblocked.chunks == expected_chunks
 
         # reblock on already blocked data
+        orig_dask_names = get_dask_names(reblocked)
         reblocked = reblocked.chunk(expected_chunks)
+        new_dask_names = get_dask_names(reblocked)
         assert reblocked.chunks == expected_chunks
         assert_identical(reblocked, data)
+        # rechunking with same chunk sizes should not change names
+        for k, v in new_dask_names.items():
+            assert v == orig_dask_names[k]
 
         with raises_regex(ValueError, "some chunks"):
             data.chunk({"foo": 10})

From e8dbe6ee8cf3d94af0a35e92f7d77b08dc4b95df Mon Sep 17 00:00:00 2001
From: Riley Brady
Date: Fri, 10 Jan 2020 09:48:31 -0700
Subject: [PATCH 06/14] Add map_blocks example to whats-new (#3682)

---
 doc/whats-new.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index e1c4e3dd9ac..93df2b6569a 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -89,6 +89,8 @@ Documentation
 - Added examples for :py:meth:`DataArray.quantile`, :py:meth:`Dataset.quantile` and
   ``GroupBy.quantile``. (:pull:`3576`)
   By `Justus Magin `_.
+- Added example for :py:func:`~xarray.map_blocks`. (:pull:`3667`)
+  By `Riley X. Brady `_.
 
 Internal Changes
 ~~~~~~~~~~~~~~~~

From ff75081304eb2e2784dcb229cc48a532da557896 Mon Sep 17 00:00:00 2001
From: rpgoldman
Date: Fri, 10 Jan 2020 14:02:22 -0600
Subject: [PATCH 07/14] How do I add a new variable to dataset. (#3679)

* How do I add a new variable to dataset.

* Update doc/howdoi.rst

Improvement from dcherian

Co-Authored-By: Deepak Cherian

* Add cross-reference per suggestion.

* Fix cross-reference.

* Update doc/howdoi.rst

Suggestion from keewis

Co-Authored-By: keewis

Co-authored-by: Deepak Cherian
Co-authored-by: keewis
---
 doc/data-structures.rst | 2 ++
 doc/howdoi.rst          | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/doc/data-structures.rst b/doc/data-structures.rst
index 504d820a234..70e34adabed 100644
--- a/doc/data-structures.rst
+++ b/doc/data-structures.rst
@@ -353,6 +353,8 @@ setting) variables and attributes:
 This is particularly useful in an exploratory context, because you can
 tab-complete these variable names with tools like IPython.
 
+.. _dictionary_like_methods:
+
 Dictionary like methods
 ~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/doc/howdoi.rst b/doc/howdoi.rst
index 80266bd3b84..84c0c786027 100644
--- a/doc/howdoi.rst
+++ b/doc/howdoi.rst
@@ -11,6 +11,8 @@ How do I ...
 
    * - How do I...
      - Solution
+   * - add a DataArray to my dataset as a new variable
+     - ``my_dataset[varname] = my_dataArray`` or :py:meth:`Dataset.assign` (see also :ref:`dictionary_like_methods`)
    * - add variables from other datasets to my dataset
      - :py:meth:`Dataset.merge`
    * - add a new dimension and/or coordinate
@@ -57,3 +59,4 @@ How do I ...
      - ``obj.dt.ceil``, ``obj.dt.floor``, ``obj.dt.round``. See :ref:`dt_accessor` for more.
    * - make a mask that is ``True`` where an object contains any of the values in a array
      - :py:meth:`Dataset.isin`, :py:meth:`DataArray.isin`
+

From 099c0901e29927935440692b1363ef08fe6d2e42 Mon Sep 17 00:00:00 2001
From: Julien Seguinot
Date: Sat, 11 Jan 2020 16:22:55 +0100
Subject: [PATCH 08/14] Add option to choose mfdataset attributes source. (#3498)

* Add 'master_file' kwarg in open_mfdataset, which can be a str or Path to
  a particular data file.
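
For example, a sketch of the intended usage with the final keyword name,
``attrs_file`` (the file names below are hypothetical):

    import xarray as xr

    # global attributes of the combined dataset are read from "b.nc"
    # rather than from the first file in the list
    ds = xr.open_mfdataset(
        ["a.nc", "b.nc"], concat_dim="x", combine="nested", attrs_file="b.nc"
    )
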
---
 doc/whats-new.rst             |  3 +++
 xarray/backends/api.py        | 19 +++++++++++++++---
 xarray/tests/test_backends.py | 36 +++++++++++++++++++++++++++++++++
 3 files changed, 55 insertions(+), 3 deletions(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 93df2b6569a..eacf8433c0a 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -37,6 +37,9 @@ New Features
 - Added the ``count`` reduction method to both :py:class:`~core.rolling.DatasetCoarsen`
   and :py:class:`~core.rolling.DataArrayCoarsen` objects. (:pull:`3500`)
   By `Deepak Cherian `_
+- Add ``attrs_file`` option in :py:func:`~xarray.open_mfdataset` to choose the
+  source file for global attributes in a multi-file dataset (:issue:`2382`,
+  :pull:`3498`) by `Julien Seguinot `_.
 - :py:meth:`Dataset.swap_dims` and :py:meth:`DataArray.swap_dims`
   now allow swapping to dimension names that don't exist yet. (:pull:`3636`)
   By `Justus Magin `_.
diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index 23d09ba5e33..eea1fb9ddce 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -718,6 +718,7 @@ def open_mfdataset(
     autoclose=None,
     parallel=False,
     join="outer",
+    attrs_file=None,
     **kwargs,
 ):
     """Open multiple files as a single dataset.
@@ -729,8 +730,8 @@ def open_mfdataset(
     ``combine_by_coords`` and ``combine_nested``. By default the old (now deprecated)
     ``auto_combine`` will be used, please specify either ``combine='by_coords'`` or
     ``combine='nested'`` in future. Requires dask to be installed. See documentation for
-    details on dask [1]_. Attributes from the first dataset file are used for the
-    combined dataset.
+    details on dask [1]_. Global attributes from the ``attrs_file`` are used
+    for the combined dataset.
 
     Parameters
     ----------
@@ -827,6 +828,10 @@ def open_mfdataset(
         - 'override': if indexes are of same size, rewrite indexes to be
           those of the first object with that dimension. Indexes for the same
           dimension must have the same size in all objects.
+    attrs_file : str or pathlib.Path, optional
+        Path of the file used to read global attributes from.
+        By default global attributes are read from the first file provided,
+        with wildcard matches sorted by filename.
     **kwargs : optional
         Additional arguments passed on to :py:func:`xarray.open_dataset`.
@@ -961,7 +966,15 @@ def open_mfdataset(
             raise
 
     combined._file_obj = _MultiFileCloser(file_objs)
-    combined.attrs = datasets[0].attrs
+
+    # read global attributes from the attrs_file or from the first dataset
+    if attrs_file is not None:
+        if isinstance(attrs_file, Path):
+            attrs_file = str(attrs_file)
+        combined.attrs = datasets[paths.index(attrs_file)].attrs
+    else:
+        combined.attrs = datasets[0].attrs
+
     return combined
 
 
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index a23527bd49a..4fccdf2dd6c 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -2832,6 +2832,42 @@ def test_attrs_mfdataset(self):
             with raises_regex(AttributeError, "no attribute"):
                 actual.test2
 
+    def test_open_mfdataset_attrs_file(self):
+        original = Dataset({"foo": ("x", np.random.randn(10))})
+        with create_tmp_files(2) as (tmp1, tmp2):
+            ds1 = original.isel(x=slice(5))
+            ds2 = original.isel(x=slice(5, 10))
+            ds1.attrs["test1"] = "foo"
+            ds2.attrs["test2"] = "bar"
+            ds1.to_netcdf(tmp1)
+            ds2.to_netcdf(tmp2)
+            with open_mfdataset(
+                [tmp1, tmp2], concat_dim="x", combine="nested", attrs_file=tmp2
+            ) as actual:
+                # attributes are inherited from the master file
+                assert actual.attrs["test2"] == ds2.attrs["test2"]
+                # attributes from ds1 are not retained, e.g.,
+                assert "test1" not in actual.attrs
+
+    def test_open_mfdataset_attrs_file_path(self):
+        original = Dataset({"foo": ("x", np.random.randn(10))})
+        with create_tmp_files(2) as (tmp1, tmp2):
+            tmp1 = Path(tmp1)
+            tmp2 = Path(tmp2)
+            ds1 = original.isel(x=slice(5))
+            ds2 = original.isel(x=slice(5, 10))
+            ds1.attrs["test1"] = "foo"
+            ds2.attrs["test2"] = "bar"
+            ds1.to_netcdf(tmp1)
+            ds2.to_netcdf(tmp2)
+            with open_mfdataset(
+                [tmp1, tmp2], concat_dim="x", combine="nested", attrs_file=tmp2
+            ) as actual:
+                # attributes are inherited from the master file
+                assert actual.attrs["test2"] == ds2.attrs["test2"]
+                # attributes from ds1 are not retained, e.g.,
+                assert "test1" not in actual.attrs
+
     def test_open_mfdataset_auto_combine(self):
         original = Dataset({"foo": ("x", np.random.randn(10)), "x": np.arange(10)})
         with create_tmp_file() as tmp1:

From 40fab1b303d40c37d7307342cebbceb1ea8cc608 Mon Sep 17 00:00:00 2001
From: David Caron
Date: Sat, 11 Jan 2020 18:15:44 -0500
Subject: [PATCH 09/14] fix docstring for combine_first: returns a Dataset (#3683)

---
 xarray/core/dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index 129f0c0f7a7..c314eb41458 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -4152,7 +4152,7 @@ def combine_first(self, other: "Dataset") -> "Dataset":
 
         Returns
         -------
-        DataArray
+        Dataset
         """
         out = ops.fillna(self, other, join="outer", dataset_join="outer")
         return out

From 1689db493f10262555196f658c52e370aacb4a33 Mon Sep 17 00:00:00 2001
From: Tom Nicholas <35968931+TomNicholas@users.noreply.github.com>
Date: Sun, 12 Jan 2020 13:04:01 +0000
Subject: [PATCH 10/14] ds.merge(da) bugfix (#3677)

* Added mwe as test

* Cast to Dataset

* Updated what's new

* black formatted

* Use assert_identical
---
 doc/whats-new.rst          | 2 ++
 xarray/core/dataset.py     | 1 +
 xarray/tests/test_merge.py | 7 +++++++
 3 files changed, 10 insertions(+)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index eacf8433c0a..6eeb55c23cb 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -74,6 +74,8 @@ Bug fixes
   By `Justus Magin `_.
 - :py:meth:`Dataset.rename`, :py:meth:`DataArray.rename` now check for conflicts with
   MultiIndex level names.
+- :py:meth:`Dataset.merge` no longer fails when passed a `DataArray` instead of a `Dataset` object.
+  By `Tom Nicholas `_.
 
 Documentation
 ~~~~~~~~~~~~~
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index c314eb41458..6ecd9b59f8e 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -3607,6 +3607,7 @@ def merge(
             If any variables conflict (see ``compat``).
         """
         _check_inplace(inplace)
+        other = other.to_dataset() if isinstance(other, xr.DataArray) else other
         merge_result = dataset_merge_method(
             self,
             other,
diff --git a/xarray/tests/test_merge.py b/xarray/tests/test_merge.py
index c1e6c7a5ce8..6c8f3f65657 100644
--- a/xarray/tests/test_merge.py
+++ b/xarray/tests/test_merge.py
@@ -3,6 +3,7 @@
 
 import xarray as xr
 from xarray.core import dtypes, merge
+from xarray.testing import assert_identical
 
 from . import raises_regex
 from .test_dataset import create_test_data
@@ -253,3 +254,9 @@ def test_merge_no_conflicts(self):
         with pytest.raises(xr.MergeError):
             ds3 = xr.Dataset({"a": ("y", [2, 3]), "y": [1, 2]})
             ds1.merge(ds3, compat="no_conflicts")
+
+    def test_merge_dataarray(self):
+        ds = xr.Dataset({"a": 0})
+        da = xr.DataArray(data=1, name="b")
+
+        assert_identical(ds.merge(da), xr.merge([ds, da]))

From 59d3ba5e938bafb4a1981c1a56d42aa31041df0a Mon Sep 17 00:00:00 2001
From: Spencer Clark
Date: Mon, 13 Jan 2020 11:31:37 -0500
Subject: [PATCH 11/14] Explicitly convert result of pd.to_datetime to a timezone-naive type (#3688)

---
 xarray/tests/test_coding_times.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py
index d012fb36c35..00c34940ce4 100644
--- a/xarray/tests/test_coding_times.py
+++ b/xarray/tests/test_coding_times.py
@@ -451,7 +451,7 @@ def test_cf_datetime_nan(num_dates, units, expected_list):
         warnings.filterwarnings("ignore", "All-NaN")
         actual = coding.times.decode_cf_datetime(num_dates, units)
     # use pandas because numpy will deprecate timezone-aware conversions
-    expected = pd.to_datetime(expected_list)
+    expected = pd.to_datetime(expected_list).to_numpy(dtype="datetime64[ns]")
     assert_array_equal(expected, actual)
 
 

From 8a650a11d1f859a88cc91b8815c16597203892aa Mon Sep 17 00:00:00 2001
From: Tom Nicholas <35968931+TomNicholas@users.noreply.github.com>
Date: Mon, 13 Jan 2020 16:33:04 +0000
Subject: [PATCH 12/14] Fix mypy type checking tests failure in ds.merge (#3690)

* Added DataArray as valid type input to ds.merge method

* Fix import error by specifying type as string
---
 xarray/core/dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index 6ecd9b59f8e..2d15ff586e4 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -3550,7 +3550,7 @@ def update(self, other: "CoercibleMapping", inplace: bool = None) -> "Dataset":
 
     def merge(
         self,
-        other: "CoercibleMapping",
+        other: Union["CoercibleMapping", "DataArray"],
         inplace: bool = None,
         overwrite_vars: Union[Hashable, Iterable[Hashable]] = frozenset(),
         compat: str = "no_conflicts",

From 40423457928245d216f973530904df1c93110f6c Mon Sep 17 00:00:00 2001
From: Emmanuel Roux <15956441+fesaille@users.noreply.github.com>
Date: Mon, 13 Jan 2020 21:32:17 +0100
Subject: [PATCH 13/14] Typo on DataSet/DataArray.to_dict documentation (#3692)

---
 xarray/core/dataarray.py | 2 +-
 xarray/core/dataset.py   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
index cbd8d243385..15db6ed468e 100644
--- a/xarray/core/dataarray.py
+++ b/xarray/core/dataarray.py
@@ -2368,7 +2368,7 @@ def to_dict(self, data: bool = True) -> dict:
         naming conventions.
 
         Converts all variables and attributes to native Python objects.
-        Useful for coverting to json. To avoid datetime incompatibility
+        Useful for converting to json. To avoid datetime incompatibility
         use decode_times=False kwarg in xarrray.open_dataset.
 
         Parameters
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index 2d15ff586e4..1aaad02b470 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -4667,7 +4667,7 @@ def to_dict(self, data=True):
         conventions.
 
         Converts all variables and attributes to native Python objects
-        Useful for coverting to json. To avoid datetime incompatibility
+        Useful for converting to json. To avoid datetime incompatibility
         use decode_times=False kwarg in xarrray.open_dataset.
 
         Parameters

From e0fd48052dbda34ee35d2491e4fe856495c9621b Mon Sep 17 00:00:00 2001
From: keewis
Date: Tue, 14 Jan 2020 17:13:23 +0100
Subject: [PATCH 14/14] allow passing any iterable to drop when dropping variables (#3693)

* allow passing any iterable to drop when dropping variables

* whats-new.rst

* update whats-new.rst
---
 doc/whats-new.rst            | 3 +++
 xarray/core/dataset.py       | 3 +--
 xarray/tests/test_dataset.py | 4 ++++
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 6eeb55c23cb..1ad344a208d 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -76,6 +76,9 @@ Bug fixes
   MultiIndex level names.
 - :py:meth:`Dataset.merge` no longer fails when passed a `DataArray` instead of a `Dataset` object.
   By `Tom Nicholas `_.
+- Fix a regression in :py:meth:`Dataset.drop`: allow passing any
+  iterable when dropping variables (:issue:`3552`, :pull:`3693`)
+  By `Justus Magin `_.
 
 Documentation
 ~~~~~~~~~~~~~
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index 1aaad02b470..79f1030fabe 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -85,7 +85,6 @@
     either_dict_or_kwargs,
     hashable,
     is_dict_like,
-    is_list_like,
     is_scalar,
     maybe_wrap_array,
 )
@@ -3690,7 +3689,7 @@ def drop(self, labels=None, dim=None, *, errors="raise", **labels_kwargs):
             raise ValueError("cannot specify dim and dict-like arguments.")
         labels = either_dict_or_kwargs(labels, labels_kwargs, "drop")
 
-        if dim is None and (is_list_like(labels) or is_scalar(labels)):
+        if dim is None and (is_scalar(labels) or isinstance(labels, Iterable)):
             warnings.warn(
                 "dropping variables using `drop` will be deprecated; using drop_vars is encouraged.",
                 PendingDeprecationWarning,
diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py
index c953f5d22e9..f9eb37dbf2f 100644
--- a/xarray/tests/test_dataset.py
+++ b/xarray/tests/test_dataset.py
@@ -2167,6 +2167,10 @@ def test_drop_variables(self):
         actual = data.drop(["time", "not_found_here"], errors="ignore")
         assert_identical(expected, actual)
 
+        with pytest.warns(PendingDeprecationWarning):
+            actual = data.drop({"time", "not_found_here"}, errors="ignore")
+            assert_identical(expected, actual)
+
     def test_drop_index_labels(self):
         data = Dataset({"A": (["x", "y"], np.random.randn(2, 3)), "x": ["a", "b"]})