From 16a2c03bb23757a92f3f9b8e74c4d489e892e6d6 Mon Sep 17 00:00:00 2001 From: Martin Pletcher Date: Mon, 25 Mar 2019 19:41:47 -0700 Subject: [PATCH] Allow expand_dims() method to support inserting/broadcasting dimensions with size>1 (#2757) * Quarter offset implemented (base is now latest pydata-master). (#2721) * Quarter offset implemented (base is now latest pydata-master). * Fixed issues raised in review (https://github.com/pydata/xarray/pull/2721#pullrequestreview-199346642) * Updated whats-new.rst with info on quarter offset support. * Updated whats-new.rst with info on quarter offset support. * Update doc/whats-new.rst Co-Authored-By: jwenfai * Added support for quarter frequencies when resampling CFTimeIndex. Less redundancy in CFTimeIndex resampling tests. * Removed normalization code (unnecessary for cftime_range) in cftime_offsets.py. Removed redundant lines in whats-new.rst. * Removed invalid option from _get_day_of_month docstring. Added tests back in that raises ValueError when resampling (base=24 when resampling to daily freq, e.g., '8D'). * Minor edits to docstrings/comments * lint * Add `Dataset.drop_dims` (#2767) * ENH: Add Dataset.drop_dims() * Drops full dimensions and any corresponding variables in a Dataset * Fixes GH1949 * DOC: Add Dataset.drop_dims() documentation * Improve name concat (#2792) * Added tests of desired name inferring behaviour * Infers names * updated what's new * Don't use deprecated np.asscalar() (#2800) It got deprecated in numpy 1.16 and throws a ton of warnings due to that. All the function does is returning .item() anyway, which is why it got deprecated. * Add support for cftime.datetime coordinates with coarsen (#2778) * some docs updates (#2746) * Friendlier io title. * Fix lists. * Fix *args, **kwargs "inline emphasis..." * misc * Reference xarray_extras for csv writing. Closes #2289 * Add metpy accessor. Closes #461 * fix transpose docstring. Closes #2576 * Revert "Fix lists." 
This reverts commit 39983a5835612d7158ae91a9cce7196a03742983. * Revert "Fix *args, **kwargs" This reverts commit 1b9da35ef43e44ce7855f2ab8406a781c9a68933. * Add MetPy to related projects. * Add Weather and Climate specific page. * Add hvplot. * Note open_dataset, mfdataset open files as read-only (closes #2345). * Update metpy 1 Co-Authored-By: dcherian * Update doc/weather-climate.rst Co-Authored-By: dcherian * Drop failing tests writing multi-dimensional arrays as attributes (#2810) These aren't valid for netCDF files. Fixes GH2803 * Push back finalizing deprecations for 0.12 (#2809) 0.12 will already have a big change in dropping Python 2.7 support. I'd rather wait a bit longer to finalize these deprecations to minimize the impact on users. * enable loading remote hdf5 files (#2782) * attempt at loading remote hdf5 * added a couple tests * rewind bytes after reading header * addressed comments for tests and error message * fixed pep8 formatting * created _get_engine_from_magic_number function, new tests * added description in whats-new * fixed test failure on windows * same error on windows and nix * Release 0.12.0 * Add whats-new for 0.12.1 * Rework whats-new for 0.12 * DOC: Update donation links * DOC: remove outdated warning (#2818) * Allow expand_dims() method to support inserting/broadcasting dimensions with size>1 (#2757) * Make using dim_kwargs for python 3.5 illegal -- a ValueError is thrown * dataset.expand_dims() method takes a dict-like object where values represent length of dimensions or coordinates of dimensions * dataarray.expand_dims() method takes a dict-like object where values represent length of dimensions or coordinates of dimensions * Add alternative option to passing a dict to the dim argument, which is now an optional kwarg, passing in each new dimension as its own kwarg * Add expand_dims enhancement from issue 2710 to whats-new.rst * Fix test_dataarray.TestDataArray.test_expand_dims_with_greater_dim_size tests to pass in python 3.5 using 
ordered dicts instead of regular dicts. This was needed because python 3.5 and earlier did not maintain insertion order for dicts * Restrict core logic to use 'dim' as a dict--it will be converted into a dict on entry if it is a str or a sequence of str * Don't cast dim values (coords) as a list since IndexVariable/Variable will internally convert it into a numpy.ndarray. So just use IndexVariable((k,), v) * TypeErrors should be raised for invalid input types, rather than ValueErrors. * Force 'dim' to be OrderedDict for python 3.5 * Allow expand_dims() method to support inserting/broadcasting dimensions with size>1 (#2757) * use .size attribute to determine the size of a dimension, rather than converting to a list, which can be slow for large iterables * Make using dim_kwargs for python 3.5 illegal -- a ValueError is thrown * dataset.expand_dims() method takes a dict-like object where values represent length of dimensions or coordinates of dimensions * dataarray.expand_dims() method takes a dict-like object where values represent length of dimensions or coordinates of dimensions * Add alternative option to passing a dict to the dim argument, which is now an optional kwarg, passing in each new dimension as its own kwarg * Add expand_dims enhancement from issue 2710 to whats-new.rst * Fix test_dataarray.TestDataArray.test_expand_dims_with_greater_dim_size tests to pass in python 3.5 using ordered dicts instead of regular dicts. This was needed because python 3.5 and earlier did not maintain insertion order for dicts * Restrict core logic to use 'dim' as a dict--it will be converted into a dict on entry if it is a str or a sequence of str * Don't cast dim values (coords) as a list since IndexVariable/Variable will internally convert it into a numpy.ndarray. So just use IndexVariable((k,), v) * TypeErrors should be raised for invalid input types, rather than ValueErrors. 
* Force 'dim' to be OrderedDict for python 3.5 * Allow expand_dims() method to support inserting/broadcasting dimensions with size>1 (#2757) * Move enhancement description up to 0.12.1 * use .size attribute to determine the size of a dimension, rather than converting to a list, which can be slow for large iterables * Make using dim_kwargs for python 3.5 illegal -- a ValueError is thrown * dataset.expand_dims() method takes a dict-like object where values represent length of dimensions or coordinates of dimensions * dataarray.expand_dims() method takes a dict-like object where values represent length of dimensions or coordinates of dimensions * Add alternative option to passing a dict to the dim argument, which is now an optional kwarg, passing in each new dimension as its own kwarg * Add expand_dims enhancement from issue 2710 to whats-new.rst * Fix test_dataarray.TestDataArray.test_expand_dims_with_greater_dim_size tests to pass in python 3.5 using ordered dicts instead of regular dicts. This was needed because python 3.5 and earlier did not maintain insertion order for dicts * Restrict core logic to use 'dim' as a dict--it will be converted into a dict on entry if it is a str or a sequence of str * Don't cast dim values (coords) as a list since IndexVariable/Variable will internally convert it into a numpy.ndarray. So just use IndexVariable((k,), v) * TypeErrors should be raised for invalid input types, rather than ValueErrors. 
* Force 'dim' to be OrderedDict for python 3.5 --- doc/whats-new.rst | 4 ++ xarray/core/dataarray.py | 39 ++++++++++++++++-- xarray/core/dataset.py | 73 +++++++++++++++++++++++++++------- xarray/tests/test_dataarray.py | 53 +++++++++++++++++++++++- xarray/tests/test_dataset.py | 68 +++++++++++++++++++++++++++++++ 5 files changed, 218 insertions(+), 19 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 6cf2720a033..ecdb4676b4f 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -21,6 +21,10 @@ v0.12.1 (unreleased) Enhancements ~~~~~~~~~~~~ +- Allow ``expand_dims`` method to support inserting/broadcasting dimensions + with size > 1. (:issue:`2710`) + By `Martin Pletcher `_. + Bug fixes ~~~~~~~~~ diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 7cd856db5b4..c24703f5384 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1,4 +1,5 @@ import functools +import sys import warnings from collections import OrderedDict @@ -1138,7 +1139,7 @@ def swap_dims(self, dims_dict): ds = self._to_temp_dataset().swap_dims(dims_dict) return self._from_temp_dataset(ds) - def expand_dims(self, dim, axis=None): + def expand_dims(self, dim=None, axis=None, **dim_kwargs): """Return a new object with an additional axis (or axes) inserted at the corresponding position in the array shape. @@ -1147,21 +1148,53 @@ def expand_dims(self, dim, axis=None): Parameters ---------- - dim : str or sequence of str. + dim : str, sequence of str, dict, or None Dimensions to include on the new variable. - dimensions are inserted with length 1. + If provided as str or sequence of str, then dimensions are inserted + with length 1. If provided as a dict, then the keys are the new + dimensions and the values are either integers (giving the length of + the new dimensions) or sequence/ndarray (giving the coordinates of + the new dimensions). **WARNING** for python 3.5, if ``dim`` is + dict-like, then it must be an ``OrderedDict``. 
This is to ensure + that the order in which the dims are given is maintained. axis : integer, list (or tuple) of integers, or None Axis position(s) where new axis is to be inserted (position(s) on the result array). If a list (or tuple) of integers is passed, multiple axes are inserted. In this case, dim arguments should be same length list. If axis=None is passed, all the axes will be inserted to the start of the result array. + **dim_kwargs : int or sequence/ndarray + The keywords are arbitrary dimensions being inserted and the values + are either the lengths of the new dims (if int is given), or their + coordinates. Note, this is an alternative to passing a dict to the + dim kwarg and will only be used if dim is None. **WARNING** for + python 3.5 ``dim_kwargs`` is not available. Returns ------- expanded : same type as caller This object, but with an additional dimension(s). """ + if isinstance(dim, int): + raise TypeError('dim should be str or sequence of strs or dict') + elif isinstance(dim, str): + dim = OrderedDict(((dim, 1),)) + elif isinstance(dim, (list, tuple)): + if len(dim) != len(set(dim)): + raise ValueError('dims should not contain duplicate values.') + dim = OrderedDict(((d, 1) for d in dim)) + + # TODO: get rid of the below code block when python 3.5 is no longer + # supported. 
+ python36_plus = sys.version_info[0] == 3 and sys.version_info[1] > 5 + not_ordereddict = dim is not None and not isinstance(dim, OrderedDict) + if not python36_plus and not_ordereddict: + raise TypeError("dim must be an OrderedDict for python <3.6") + elif not python36_plus and dim_kwargs: + raise ValueError("dim_kwargs isn't available for python <3.6") + dim_kwargs = OrderedDict(dim_kwargs) + + dim = either_dict_or_kwargs(dim, dim_kwargs, 'expand_dims') ds = self._to_temp_dataset().expand_dims(dim, axis) return self._from_temp_dataset(ds) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 3bb54e80456..ec57a33f5c2 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2324,7 +2324,7 @@ def swap_dims(self, dims_dict, inplace=None): return self._replace_with_new_dims(variables, coord_names, indexes=indexes, inplace=inplace) - def expand_dims(self, dim, axis=None): + def expand_dims(self, dim=None, axis=None, **dim_kwargs): """Return a new object with an additional axis (or axes) inserted at the corresponding position in the array shape. @@ -2333,15 +2333,27 @@ def expand_dims(self, dim, axis=None): Parameters ---------- - dim : str or sequence of str. + dim : str, sequence of str, dict, or None Dimensions to include on the new variable. - dimensions are inserted with length 1. + If provided as str or sequence of str, then dimensions are inserted + with length 1. If provided as a dict, then the keys are the new + dimensions and the values are either integers (giving the length of + the new dimensions) or sequence/ndarray (giving the coordinates of + the new dimensions). **WARNING** for python 3.5, if ``dim`` is + dict-like, then it must be an ``OrderedDict``. This is to ensure + that the order in which the dims are given is maintained. axis : integer, list (or tuple) of integers, or None Axis position(s) where new axis is to be inserted (position(s) on the result array). 
If a list (or tuple) of integers is passed, multiple axes are inserted. In this case, dim arguments should be - the same length list. If axis=None is passed, all the axes will - be inserted to the start of the result array. + same length list. If axis=None is passed, all the axes will be + inserted to the start of the result array. + **dim_kwargs : int or sequence/ndarray + The keywords are arbitrary dimensions being inserted and the values + are either the lengths of the new dims (if int is given), or their + coordinates. Note, this is an alternative to passing a dict to the + dim kwarg and will only be used if dim is None. **WARNING** for + python 3.5 ``dim_kwargs`` is not available. Returns ------- @@ -2349,10 +2361,25 @@ def expand_dims(self, dim, axis=None): This object, but with an additional dimension(s). """ if isinstance(dim, int): - raise ValueError('dim should be str or sequence of strs or dict') + raise TypeError('dim should be str or sequence of strs or dict') + elif isinstance(dim, str): + dim = OrderedDict(((dim, 1),)) + elif isinstance(dim, (list, tuple)): + if len(dim) != len(set(dim)): + raise ValueError('dims should not contain duplicate values.') + dim = OrderedDict(((d, 1) for d in dim)) + + # TODO: get rid of the below code block when python 3.5 is no longer + # supported. 
+ python36_plus = sys.version_info[0] == 3 and sys.version_info[1] > 5 + not_ordereddict = dim is not None and not isinstance(dim, OrderedDict) + if not python36_plus and not_ordereddict: + raise TypeError("dim must be an OrderedDict for python <3.6") + elif not python36_plus and dim_kwargs: + raise ValueError("dim_kwargs isn't available for python <3.6") + + dim = either_dict_or_kwargs(dim, dim_kwargs, 'expand_dims') - if isinstance(dim, str): - dim = [dim] if axis is not None and not isinstance(axis, (list, tuple)): axis = [axis] @@ -2371,10 +2398,24 @@ def expand_dims(self, dim, axis=None): '{dim} already exists as coordinate or' ' variable name.'.format(dim=d)) - if len(dim) != len(set(dim)): - raise ValueError('dims should not contain duplicate values.') - variables = OrderedDict() + # If dim is a dict, then ensure that the values are either integers + # or iterables. + for k, v in dim.items(): + if hasattr(v, "__iter__"): + # If the value for the new dimension is an iterable, then + # save the coordinates to the variables dict, and set the + # value within the dim dict to the length of the iterable + # for later use. + variables[k] = xr.IndexVariable((k,), v) + self._coord_names.add(k) + dim[k] = variables[k].size + elif isinstance(v, int): + pass # Do nothing if the dimensions value is just an int + else: + raise TypeError('The value of new dimension {k} must be ' + 'an iterable or an int'.format(k=k)) + for k, v in self._variables.items(): if k not in dim: if k in self._coord_names: # Do not change coordinates @@ -2395,11 +2436,13 @@ def expand_dims(self, dim, axis=None): ' values.') # We need to sort them to make sure `axis` equals to the # axis positions of the result array. 
- zip_axis_dim = sorted(zip(axis_pos, dim)) + zip_axis_dim = sorted(zip(axis_pos, dim.items())) + + all_dims = list(zip(v.dims, v.shape)) + for d, c in zip_axis_dim: + all_dims.insert(d, c) + all_dims = OrderedDict(all_dims) - all_dims = list(v.dims) - for a, d in zip_axis_dim: - all_dims.insert(a, d) variables[k] = v.set_dims(all_dims) else: # If dims includes a label of a non-dimension coordinate, diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 4975071dad8..b1ecf160533 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3,6 +3,7 @@ from collections import OrderedDict from copy import deepcopy from textwrap import dedent +import sys import numpy as np import pandas as pd @@ -1303,7 +1304,7 @@ def test_expand_dims_error(self): coords={'x': np.linspace(0.0, 1.0, 3)}, attrs={'key': 'entry'}) - with raises_regex(ValueError, 'dim should be str or'): + with raises_regex(TypeError, 'dim should be str or'): array.expand_dims(0) with raises_regex(ValueError, 'lengths of dim and axis'): # dims and axis argument should be the same length @@ -1328,6 +1329,16 @@ def test_expand_dims_error(self): array.expand_dims(dim=['y', 'z'], axis=[2, -4]) array.expand_dims(dim=['y', 'z'], axis=[2, 3]) + array = DataArray(np.random.randn(3, 4), dims=['x', 'dim_0'], + coords={'x': np.linspace(0.0, 1.0, 3)}, + attrs={'key': 'entry'}) + with pytest.raises(TypeError): + array.expand_dims(OrderedDict((("new_dim", 3.2),))) + + # Attempt to use both dim and kwargs + with pytest.raises(ValueError): + array.expand_dims(OrderedDict((("d", 4),)), e=4) + def test_expand_dims(self): array = DataArray(np.random.randn(3, 4), dims=['x', 'dim_0'], coords={'x': np.linspace(0.0, 1.0, 3)}, @@ -1392,6 +1403,46 @@ def test_expand_dims_with_scalar_coordinate(self): roundtripped = actual.squeeze(['z'], drop=False) assert_identical(array, roundtripped) + def test_expand_dims_with_greater_dim_size(self): + array = DataArray(np.random.randn(3, 4), 
dims=['x', 'dim_0'], + coords={'x': np.linspace(0.0, 1.0, 3), 'z': 1.0}, + attrs={'key': 'entry'}) + # For python 3.5 and earlier this has to be an ordered dict, to + # maintain insertion order. + actual = array.expand_dims( + OrderedDict((('y', 2), ('z', 1), ('dim_1', ['a', 'b', 'c'])))) + + expected_coords = OrderedDict(( + ('y', [0, 1]), ('z', [1.0]), ('dim_1', ['a', 'b', 'c']), + ('x', np.linspace(0, 1, 3)), ('dim_0', range(4)))) + expected = DataArray(array.values * np.ones([2, 1, 3, 3, 4]), + coords=expected_coords, + dims=list(expected_coords.keys()), + attrs={'key': 'entry'} + ).drop(['y', 'dim_0']) + assert_identical(expected, actual) + + # Test with kwargs instead of passing dict to dim arg. + + # TODO: only the code under the if-statement is needed when python 3.5 + # is no longer supported. + python36_plus = sys.version_info[0] == 3 and sys.version_info[1] > 5 + if python36_plus: + other_way = array.expand_dims(dim_1=['a', 'b', 'c']) + + other_way_expected = DataArray( + array.values * np.ones([3, 3, 4]), + coords={'dim_1': ['a', 'b', 'c'], + 'x': np.linspace(0, 1, 3), + 'dim_0': range(4), 'z': 1.0}, + dims=['dim_1', 'x', 'dim_0'], + attrs={'key': 'entry'}).drop('dim_0') + assert_identical(other_way_expected, other_way) + else: + # In python 3.5, using dim_kwargs should raise a ValueError. 
+ with raises_regex(ValueError, "dim_kwargs isn't"): + array.expand_dims(e=["l", "m", "n"]) + def test_set_index(self): indexes = [self.mindex.get_level_values(n) for n in self.mindex.names] coords = {idx.name: ('x', idx) for idx in indexes} diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 777a8e84a3f..f45908c5f5a 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -2030,6 +2030,27 @@ def test_expand_dims_error(self): with raises_regex(ValueError, 'already exists'): original.expand_dims(dim=['z']) + original = Dataset({'x': ('a', np.random.randn(3)), + 'y': (['b', 'a'], np.random.randn(4, 3)), + 'z': ('a', np.random.randn(3))}, + coords={'a': np.linspace(0, 1, 3), + 'b': np.linspace(0, 1, 4), + 'c': np.linspace(0, 1, 5)}, + attrs={'key': 'entry'}) + with raises_regex(TypeError, 'value of new dimension'): + original.expand_dims(OrderedDict((("d", 3.2),))) + + # TODO: only the code under the if-statement is needed when python 3.5 + # is no longer supported. + python36_plus = sys.version_info[0] == 3 and sys.version_info[1] > 5 + if python36_plus: + with raises_regex(ValueError, 'both keyword and positional'): + original.expand_dims(OrderedDict((("d", 4),)), e=4) + else: + # In python 3.5, using dim_kwargs should raise a ValueError. + with raises_regex(ValueError, "dim_kwargs isn't"): + original.expand_dims(OrderedDict((("d", 4),)), e=4) + def test_expand_dims(self): original = Dataset({'x': ('a', np.random.randn(3)), 'y': (['b', 'a'], np.random.randn(4, 3))}, @@ -2063,6 +2084,53 @@ def test_expand_dims(self): roundtripped = actual.squeeze('z') assert_identical(original, roundtripped) + # Test expanding one dimension to have size > 1 that doesn't have + # coordinates, and also expanding another dimension to have size > 1 + # that DOES have coordinates. 
+ actual = original.expand_dims( + OrderedDict((("d", 4), ("e", ["l", "m", "n"])))) + + expected = Dataset( + {'x': xr.DataArray(original['x'].values * np.ones([4, 3, 3]), + coords=dict(d=range(4), + e=['l', 'm', 'n'], + a=np.linspace(0, 1, 3)), + dims=['d', 'e', 'a']).drop('d'), + 'y': xr.DataArray(original['y'].values * np.ones([4, 3, 4, 3]), + coords=dict(d=range(4), + e=['l', 'm', 'n'], + b=np.linspace(0, 1, 4), + a=np.linspace(0, 1, 3)), + dims=['d', 'e', 'b', 'a']).drop('d')}, + coords={'c': np.linspace(0, 1, 5)}, + attrs={'key': 'entry'}) + assert_identical(actual, expected) + + # Test with kwargs instead of passing dict to dim arg. + + # TODO: only the code under the if-statement is needed when python 3.5 + # is no longer supported. + python36_plus = sys.version_info[0] == 3 and sys.version_info[1] > 5 + if python36_plus: + other_way = original.expand_dims(e=["l", "m", "n"]) + other_way_expected = Dataset( + {'x': xr.DataArray(original['x'].values * np.ones([3, 3]), + coords=dict(e=['l', 'm', 'n'], + a=np.linspace(0, 1, 3)), + dims=['e', 'a']), + 'y': xr.DataArray(original['y'].values * np.ones([3, 4, 3]), + coords=dict(e=['l', 'm', 'n'], + b=np.linspace(0, 1, 4), + a=np.linspace(0, 1, 3)), + dims=['e', 'b', 'a'])}, + coords={'c': np.linspace(0, 1, 5)}, + attrs={'key': 'entry'}) + assert_identical(other_way_expected, other_way) + else: + # In python 3.5, using dim_kwargs should raise a ValueError. + with raises_regex(ValueError, "dim_kwargs isn't"): + original.expand_dims(e=["l", "m", "n"]) + def test_set_index(self): expected = create_test_multiindex() mindex = expected['x'].to_index()