diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 1946b5dd67b..fe9ea36660c 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -59,6 +59,10 @@ Bug fixes ``keep_attrs=True`` option. By `Jeremy McGibbon `_. +- Concatenating xarray objects along an axis with a MultiIndex or PeriodIndex + preserves the nature of the index (:issue:`875`). By + `Stephan Hoyer `_. + - ``decode_cf_timedelta`` now accepts arrays with ``ndim`` >1 (:issue:`842`). This fixes issue :issue:`665`. `Filipe Fernandes `_. diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 1fb22c7b4db..1ce480241fa 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -4,7 +4,7 @@ from . import utils from .pycompat import iteritems, reduce, OrderedDict, basestring -from .variable import Variable, as_variable, Coordinate +from .variable import Variable, as_variable, Coordinate, concat as concat_vars def concat(objs, dim=None, data_vars='all', coords='different', @@ -265,7 +265,7 @@ def ensure_common_dims(vars): # stack up each variable to fill-out the dataset for k in concat_over: vars = ensure_common_dims([ds.variables[k] for ds in datasets]) - combined = Variable.concat(vars, dim, positions) + combined = concat_vars(vars, dim, positions) insert_result_variable(k, combined) result = Dataset(result_vars, attrs=result_attrs) diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index 0eff8dd56a3..7cab6aaa376 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -2,6 +2,7 @@ import numpy as np import pandas as pd +from . import nputils from . import ops from .combine import concat from .common import ( @@ -66,6 +67,52 @@ def _dummy_copy(xarray_obj): raise AssertionError return res +def _is_one_or_none(obj): + return obj == 1 or obj is None + + +def _consolidate_slices(slices): + """Consolidate adjacent slices in a list of slices. + """ + result = [] + for slice_ in slices: + if not isinstance(slice_, slice): + raise ValueError('list element is not a slice: %r' % slice_) + if (result and last_slice.stop == slice_.start + and _is_one_or_none(last_slice.step) + and _is_one_or_none(slice_.step)): + last_slice = slice(last_slice.start, slice_.stop, slice_.step) + result[-1] = last_slice + else: + result.append(slice_) + last_slice = slice_ + return result + + +def _inverse_permutation_indices(positions): + """Like inverse_permutation, but also handles slices. + + Parameters + ---------- + positions : list of np.ndarray or slice objects. + If slice objects, all are assumed to be slices. + + Returns + ------- + np.ndarray of indices or None, if no permutation is necessary. + """ + if not positions: + return None + + if isinstance(positions[0], slice): + positions = _consolidate_slices(positions) + if positions == slice(None): + return None + positions = [np.arange(sl.start, sl.stop, sl.step) for sl in positions] + + indices = nputils.inverse_permutation(np.concatenate(positions)) + return indices + class GroupBy(object): """A object that implements the split-apply-combine pattern. @@ -302,6 +349,16 @@ def assign_coords(self, **kwargs): return self.apply(lambda ds: ds.assign_coords(**kwargs)) +def _maybe_reorder(xarray_obj, concat_dim, positions): + order = _inverse_permutation_indices(positions) + + if order is None: + return xarray_obj + else: + dim, = concat_dim.dims + return xarray_obj[{dim: order}] + + class DataArrayGroupBy(GroupBy, ImplementsArrayReduce): """GroupBy object specialized to grouping DataArray objects """ @@ -313,14 +370,14 @@ def _iter_grouped_shortcut(self): for indices in self.group_indices: yield var[{self.group_dim: indices}] - def _concat_shortcut(self, applied, concat_dim, positions): + def _concat_shortcut(self, applied, concat_dim, positions=None): # nb. don't worry too much about maintaining this method -- it does # speed things up, but it's not very interpretable and there are much # faster alternatives (e.g., doing the grouped aggregation in a # compiled language) - stacked = Variable.concat( - applied, concat_dim, positions, shortcut=True) - result = self.obj._replace_maybe_drop_dims(stacked) + stacked = Variable.concat(applied, concat_dim, shortcut=True) + reordered = _maybe_reorder(stacked, concat_dim, positions) + result = self.obj._replace_maybe_drop_dims(reordered) result._coords[concat_dim.name] = as_variable(concat_dim, copy=True) return result @@ -391,7 +448,8 @@ def _concat(self, applied, shortcut=False): if shortcut: combined = self._concat_shortcut(applied, concat_dim, positions) else: - combined = concat(applied, concat_dim, positions=positions) + combined = concat(applied, concat_dim) + combined = _maybe_reorder(combined, concat_dim, positions) if isinstance(combined, type(self.obj)): combined = self._restore_dim_order(combined) @@ -472,8 +530,10 @@ def apply(self, func, **kwargs): def _concat(self, applied): applied_example, applied = peek_at(applied) concat_dim, positions = self._infer_concat_args(applied_example) - combined = concat(applied, concat_dim, positions=positions) - return combined + + combined = concat(applied, concat_dim) + reordered = _maybe_reorder(combined, concat_dim, positions) + return reordered def reduce(self, func, dim=None, keep_attrs=False, **kwargs): """Reduce the items in this group by applying `func` along some diff --git a/xarray/core/nputils.py b/xarray/core/nputils.py index ccbadaa49f0..c19e46093db 100644 --- a/xarray/core/nputils.py +++ b/xarray/core/nputils.py @@ -34,24 +34,24 @@ def nanlast(values, axis): return _select_along_axis(values, idx_last, axis) -def _calc_concat_shape(arrays, axis=0): - first_shape = arrays[0].shape - length = builtins.sum(a.shape[axis] for a in arrays) - result_shape = first_shape[:axis] + (length,) + first_shape[(axis + 1):] - return result_shape - - -def interleaved_concat(arrays, indices, axis=0): - arrays = [np.asarray(a) for a in arrays] - axis = _validate_axis(arrays[0], axis) - result_shape = _calc_concat_shape(arrays, axis=axis) - dtype = reduce(np.promote_types, [a.dtype for a in arrays]) - result = np.empty(result_shape, dtype) - key = [slice(None)] * result.ndim - for a, ind in zip(arrays, indices): - key[axis] = ind - result[key] = a - return result +def inverse_permutation(indices): + """Return indices for an inverse permutation. + + Parameters + ---------- + indices : 1D np.ndarray with dtype=int + Integer positions to assign elements to. + + Returns + ------- + inverse_permutation : 1D np.ndarray with dtype=int + Integer indices to take from the original array to create the + permutation. + """ + # use intp instead of int64 because of windows :( + inverse_permutation = np.empty(len(indices), dtype=np.intp) + inverse_permutation[indices] = np.arange(len(indices), dtype=np.intp) + return inverse_permutation def _ensure_bool_is_ndarray(result, *args): diff --git a/xarray/core/ops.py b/xarray/core/ops.py index 3fc9539892c..fbc5c4555e9 100644 --- a/xarray/core/ops.py +++ b/xarray/core/ops.py @@ -9,10 +9,7 @@ from . import npcompat from .pycompat import PY3, dask_array_type -from .nputils import ( - nanfirst, nanlast, interleaved_concat as _interleaved_concat_numpy, - array_eq, array_ne, _validate_axis, _calc_concat_shape -) +from .nputils import nanfirst, nanlast, array_eq, array_ne try: @@ -100,67 +97,6 @@ def _fail_on_dask_array_input(values, msg=None, func_name=None): tensordot = _dask_or_eager_func('tensordot', n_array_args=2) -def _interleaved_indices_required(indices): - """With dask, we care about data locality and would rather avoid splitting - splitting up each arrays into single elements. This routine checks to see - if we really need the "interleaved" part of interleaved_concat. - - We don't use for the pure numpy version of interleaved_concat, because it's - just as fast or faster to directly do the interleaved concatenate rather - than check if we could simply it. - """ - next_expected = 0 - for ind in indices: - if isinstance(ind, slice): - if ((ind.start or 0) != next_expected or - ind.step not in (1, None)): - return True - next_expected = ind.stop - else: - ind = np.asarray(ind) - expected = np.arange(next_expected, next_expected + ind.size) - if (ind != expected).any(): - return True - next_expected = ind[-1] + 1 - return False - - -def _interleaved_concat_slow(arrays, indices, axis=0): - """A slow version of interleaved_concat that also works on dask arrays - """ - axis = _validate_axis(arrays[0], axis) - - result_shape = _calc_concat_shape(arrays, axis=axis) - length = result_shape[axis] - array_lookup = np.empty(length, dtype=int) - element_lookup = np.empty(length, dtype=int) - - for n, ind in enumerate(indices): - if isinstance(ind, slice): - ind = np.arange(*ind.indices(length)) - for m, i in enumerate(ind): - array_lookup[i] = n - element_lookup[i] = m - - split_arrays = [arrays[n][(slice(None),) * axis + (slice(m, m + 1),)] - for (n, m) in zip(array_lookup, element_lookup)] - return concatenate(split_arrays, axis) - - -def interleaved_concat(arrays, indices, axis=0): - """Concatenate each array along the given axis, but also assign each array - element into the location given by indices. This operation is used for - groupby.transform. - """ - if has_dask and isinstance(arrays[0], da.Array): - if not _interleaved_indices_required(indices): - return da.concatenate(arrays, axis) - else: - return _interleaved_concat_slow(arrays, indices, axis) - else: - return _interleaved_concat_numpy(arrays, indices, axis) - - def asarray(data): return data if isinstance(data, dask_array_type) else np.asarray(data) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index cb3041ddcd8..81c322aa626 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -10,6 +10,7 @@ from . import indexing from . import ops from . import utils +from . import nputils from .pycompat import basestring, OrderedDict, zip, dask_array_type from .indexing import (PandasIndexAdapter, orthogonally_indexable, LazyIntegerRange) @@ -711,15 +712,19 @@ def expand_dims(self, dims, shape=None): self_dims = set(self.dims) expanded_dims = tuple( d for d in dims if d not in self_dims) + self.dims - if shape is not None: - dims_map = dict(zip(dims, shape)) - tmp_shape = [dims_map[d] for d in expanded_dims] - expanded_data = ops.broadcast_to(self.data, tmp_shape) + + if expanded_dims == self.dims: + expanded_var = self else: - expanded_data = self.data[ - (None,) * (len(expanded_dims) - self.ndim)] - expanded_var = Variable(expanded_dims, expanded_data, self._attrs, - self._encoding, fastpath=True) + if shape is not None: + dims_map = dict(zip(dims, shape)) + tmp_shape = [dims_map[d] for d in expanded_dims] + expanded_data = ops.broadcast_to(self.data, tmp_shape) + else: + expanded_data = self.data[ + (None,) * (len(expanded_dims) - self.ndim)] + expanded_var = Variable(expanded_dims, expanded_data, self._attrs, + self._encoding, fastpath=True) return expanded_var.transpose(*dims) def _stack_once(self, dims, new_dim): @@ -926,10 +931,12 @@ def concat(cls, variables, dim='concat_dim', positions=None, if dim in first_var.dims: axis = first_var.get_axis_num(dim) dims = first_var.dims - if positions is None: - data = ops.concatenate(arrays, axis=axis) - else: - data = ops.interleaved_concat(arrays, positions, axis=axis) + data = ops.concatenate(arrays, axis=axis) + if positions is not None: + # TODO: deprecate this option -- we don't need it for groupby + # any more. + indices = nputils.inverse_permutation(np.concatenate(positions)) + data = ops.take(data, indices, axis=axis) else: axis = 0 dims = (dim,) + first_var.dims @@ -1071,6 +1078,44 @@ def __getitem__(self, key): def __setitem__(self, key, value): raise TypeError('%s values cannot be modified' % type(self).__name__) + @classmethod + def concat(cls, variables, dim='concat_dim', positions=None, + shortcut=False): + """Specialized version of Variable.concat for Coordinate variables. + + This exists because we want to avoid converting Index objects to NumPy + arrays, if possible. + """ + if not isinstance(dim, basestring): + dim, = dim.dims + + variables = list(variables) + first_var = variables[0] + + if any(not isinstance(v, cls) for v in variables): + raise TypeError('Coordinate.concat requires that all input ' + 'variables be Coordinate objects') + + arrays = [v._data_cached().array for v in variables] + + if not arrays: + data = [] + else: + data = arrays[0].append(arrays[1:]) + + if positions is not None: + indices = nputils.inverse_permutation(np.concatenate(positions)) + data = data.take(indices) + + attrs = OrderedDict(first_var.attrs) + if not shortcut: + for var in variables: + if var.dims != first_var.dims: + raise ValueError('inconsistent dimensions') + utils.remove_incompatible_items(attrs, var.attrs) + + return cls(first_var.dims, data, attrs) + def copy(self, deep=True): """Returns a copy of this object. @@ -1163,3 +1208,40 @@ def _broadcast_compat_data(self, other): other_data = other dims = self.dims return self_data, other_data, dims + + +def concat(variables, dim='concat_dim', positions=None, shortcut=False): + """Concatenate variables along a new or existing dimension. + + Parameters + ---------- + variables : iterable of Array + Arrays to stack together. Each variable is expected to have + matching dimensions and shape except for along the stacked + dimension. + dim : str or DataArray, optional + Name of the dimension to stack along. This can either be a new + dimension name, in which case it is added along axis=0, or an + existing dimension name, in which case the location of the + dimension is unchanged. Where to insert the new dimension is + determined by the first variable. + positions : None or list of integer arrays, optional + List of integer arrays which specifies the integer positions to which + to assign each dataset along the concatenated dimension. If not + supplied, objects are concatenated in the provided order. + shortcut : bool, optional + This option is used internally to speed-up groupby operations. + If `shortcut` is True, some checks of internal consistency between + arrays to concatenate are skipped. + + Returns + ------- + stacked : Variable + Concatenated Variable formed by stacking all the supplied variables + along the given dimension. + """ + variables = list(variables) + if all(isinstance(v, Coordinate) for v in variables): + return Coordinate.concat(variables, dim, positions, shortcut) + else: + return Variable.concat(variables, dim, positions, shortcut) diff --git a/xarray/test/test_combine.py b/xarray/test/test_combine.py index 0933c98333e..4ad8e11de43 100644 --- a/xarray/test/test_combine.py +++ b/xarray/test/test_combine.py @@ -214,6 +214,14 @@ def test_concat_dim_is_variable(self): actual = concat(objs, coord) self.assertDatasetIdentical(actual, expected) + def test_concat_multiindex(self): + x = pd.MultiIndex.from_product([[1, 2, 3], ['a', 'b']]) + expected = Dataset({'x': x}) + actual = concat([expected.isel(x=slice(2)), + expected.isel(x=slice(2, None))], 'x') + assert expected.equals(actual) + assert isinstance(actual.x.to_index(), pd.MultiIndex) + @requires_dask # only for toolz def test_auto_combine(self): objs = [Dataset({'x': [0]}), Dataset({'x': [1]})] diff --git a/xarray/test/test_dataarray.py b/xarray/test/test_dataarray.py index 20f2d01bdec..804c8b9104c 100644 --- a/xarray/test/test_dataarray.py +++ b/xarray/test/test_dataarray.py @@ -1066,6 +1066,7 @@ def test_groupby_apply_identity(self): def identity(x): return x + for g in ['x', 'y', 'abc', idx]: for shortcut in [False, True]: for squeeze in [False, True]: @@ -1818,29 +1819,29 @@ def test_full_like(self): actual = _full_like(DataArray([1, 2, 3]), fill_value=np.nan) self.assertEqual(actual.dtype, np.float) np.testing.assert_equal(actual.values, np.nan) - + def test_dot(self): x = np.linspace(-3, 3, 6) y = np.linspace(-3, 3, 5) - z = range(4) + z = range(4) da_vals = np.arange(6 * 5 * 4).reshape((6, 5, 4)) da = DataArray(da_vals, coords=[x, y, z], dims=['x', 'y', 'z']) - + dm_vals = range(4) dm = DataArray(dm_vals, coords=[z], dims=['z']) - + # nd dot 1d actual = da.dot(dm) expected_vals = np.tensordot(da_vals, dm_vals, [2, 0]) expected = DataArray(expected_vals, coords=[x, y], dims=['x', 'y']) self.assertDataArrayEqual(expected, actual) - + # all shared dims actual = da.dot(da) expected_vals = np.tensordot(da_vals, da_vals, axes=([0, 1, 2], [0, 1, 2])) expected = DataArray(expected_vals) self.assertDataArrayEqual(expected, actual) - + # multiple shared dims dm_vals = np.arange(20 * 5 * 4).reshape((20, 5, 4)) j = np.linspace(-3, 3, 20) @@ -1849,7 +1850,7 @@ def test_dot(self): expected_vals = np.tensordot(da_vals, dm_vals, axes=([1, 2], [1, 2])) expected = DataArray(expected_vals, coords=[x, j], dims=['x', 'j']) self.assertDataArrayEqual(expected, actual) - + with self.assertRaises(NotImplementedError): da.dot(dm.to_dataset(name='dm')) with self.assertRaises(TypeError): diff --git a/xarray/test/test_ops.py b/xarray/test/test_ops.py index 7801f246a66..f4e818fa747 100644 --- a/xarray/test/test_ops.py +++ b/xarray/test/test_ops.py @@ -4,7 +4,6 @@ from xarray.core.ops import ( first, last, count, mean ) -from xarray.core.nputils import interleaved_concat as interleaved_concat_numpy from . import TestCase @@ -75,60 +74,3 @@ def test_count(self): def test_all_nan_arrays(self): assert np.isnan(mean([np.nan, np.nan])) - - def test_interleaved_concat(self): - for interleaved_concat in [interleaved_concat_numpy, - ops._interleaved_concat_slow, - ops.interleaved_concat]: - x = np.arange(5) - self.assertArrayEqual(x, interleaved_concat([x], [x])) - - arrays = np.arange(10).reshape(2, -1) - indices = np.arange(10).reshape(2, -1, order='F') - actual = interleaved_concat(arrays, indices) - expected = np.array([0, 5, 1, 6, 2, 7, 3, 8, 4, 9]) - self.assertArrayEqual(expected, actual) - - arrays2 = arrays.reshape(2, 5, 1) - - actual = interleaved_concat(arrays2, indices, axis=0) - self.assertArrayEqual(expected.reshape(10, 1), actual) - - actual = interleaved_concat(arrays2, [[0], [1]], axis=1) - self.assertArrayEqual(arrays.T, actual) - - actual = interleaved_concat(arrays2, [slice(1), slice(1, 2)], axis=-1) - self.assertArrayEqual(arrays.T, actual) - - with self.assertRaises(IndexError): - interleaved_concat(arrays, indices, axis=1) - with self.assertRaises(IndexError): - interleaved_concat(arrays, indices, axis=-2) - with self.assertRaises(IndexError): - interleaved_concat(arrays2, [0, 1], axis=2) - - def test_interleaved_concat_dtypes(self): - for interleaved_concat in [interleaved_concat_numpy, - ops._interleaved_concat_slow, - ops.interleaved_concat]: - a = np.array(['a']) - b = np.array(['bc']) - actual = interleaved_concat([a, b], [[0], [1]]) - expected = np.array(['a', 'bc']) - self.assertArrayEqual(expected, actual) - - c = np.array([np.nan], dtype=object) - actual = interleaved_concat([a, b, c], [[0], [1], [2]]) - expected = np.array(['a', 'bc', np.nan], dtype=object) - self.assertArrayEqual(expected, actual) - - def test_interleaved_indices_required(self): - self.assertFalse(ops._interleaved_indices_required([[0]])) - self.assertFalse(ops._interleaved_indices_required([[0, 1], [2, 3, 4]])) - self.assertFalse(ops._interleaved_indices_required([slice(3), slice(3, 4)])) - self.assertFalse(ops._interleaved_indices_required([slice(0, 2, 1)])) - self.assertTrue(ops._interleaved_indices_required([[0], [2]])) - self.assertTrue(ops._interleaved_indices_required([[1], [2, 3]])) - self.assertTrue(ops._interleaved_indices_required([[0, 1], [2, 4]])) - self.assertTrue(ops._interleaved_indices_required([[0, 1], [3.5, 4]])) - self.assertTrue(ops._interleaved_indices_required([slice(None, None, 2)])) diff --git a/xarray/test/test_variable.py b/xarray/test/test_variable.py index 8304b50c315..9fae30d6f8c 100644 --- a/xarray/test/test_variable.py +++ b/xarray/test/test_variable.py @@ -321,8 +321,10 @@ def test_concat(self): with self.assertRaisesRegexp(ValueError, 'inconsistent dimensions'): Variable.concat([v, Variable(['c'], y)], 'b') # test indexers - actual = Variable.concat([v, w], positions=[range(0, 10, 2), - range(1, 10, 2)], dim='a') + actual = Variable.concat( + [v, w], + positions=[np.arange(0, 10, 2), np.arange(1, 10, 2)], + dim='a') expected = Variable('a', np.array([x, y]).ravel(order='F')) self.assertVariableIdentical(expected, actual) # test concatenating along a dimension @@ -990,6 +992,27 @@ def test_name(self): with self.assertRaises(AttributeError): coord.name = 'y' + def test_concat_periods(self): + periods = pd.period_range('2000-01-01', periods=10) + coords = [Coordinate('t', periods[:5]), Coordinate('t', periods[5:])] + expected = Coordinate('t', periods) + actual = Coordinate.concat(coords, dim='t') + assert actual.identical(expected) + assert isinstance(actual.to_index(), pd.PeriodIndex) + + positions = [list(range(5)), list(range(5, 10))] + actual = Coordinate.concat(coords, dim='t', positions=positions) + assert actual.identical(expected) + assert isinstance(actual.to_index(), pd.PeriodIndex) + + def test_concat_multiindex(self): + idx = pd.MultiIndex.from_product([[0, 1, 2], ['a', 'b']]) + coords = [Coordinate('x', idx[:2]), Coordinate('x', idx[2:])] + expected = Coordinate('x', idx) + actual = Coordinate.concat(coords, dim='x') + assert actual.identical(expected) + assert isinstance(actual.to_index(), pd.MultiIndex) + class TestAsCompatibleData(TestCase): def test_unchanged_types(self):