combine_by_coordinates to handle unnamed data arrays. (#4696)
* Added test for combine_by_coords changes.

* Modified test case to expect a dataset instead of a DataArray. Added a converter to combine_by_coords that checks for the all-DataArray case and converts to datasets.

* Added tests to check that combine_by_coords raises an exception on mixed DataArray and Dataset input, and to cover empty-list input.

* Formatting changes after running black

* Added underscore to helper function to label as private.

* Black formatting changes for whats-new doc file.

* Removed imports in docstring that were automatically added by code styling tools to match the other docstrings.

* Removed duplicate new item line in whats-new.

* Combine methods now accept unnamed DataArrays as input.

* combine_nested test checks nested lists of unnamed DataArrays.

* Made combine_by_coords more readable.

* Cosmetic changes to code style.

* Removed extra test from merge with previous PR.

* Updated test to use pytest.raises instead of raises_regex.

* Added breaking-change entry to whats new page.

* Added deprecation warning to combine_by_coords.

* Removed index monotonicity checking temporarily.

* Removed duplicate entries from whats new page.

* Removed TODO message

* Added test for combine_nested.

* Added check to combine methods to clarify parameter requirements.

* Reassigned description of changes to bug fixes category.

* Minor style changes.

* Added blank line for style purposes.
aijams authored Jul 2, 2021
1 parent 2f8623d commit 3d1d134
Showing 5 changed files with 230 additions and 43 deletions.
16 changes: 14 additions & 2 deletions doc/whats-new.rst
@@ -130,7 +130,6 @@ Thomas Nicholas, Tom Nicholas, Zachary Moon.

New Features
~~~~~~~~~~~~

- Implement :py:meth:`DataArray.drop_duplicates`
to remove duplicate dimension values (:pull:`5239`).
By `Andrew Huang <https://github.com/ahuang11>`_.
@@ -143,9 +142,22 @@ New Features
- Raise more informative error when decoding time variables with invalid reference dates.
(:issue:`5199`, :pull:`5288`). By `Giacomo Caria <https://github.com/gcaria>`_.

Breaking changes
~~~~~~~~~~~~~~~~
- The main parameter of :py:func:`combine_by_coords` has been renamed from
  ``datasets`` to ``data_objects``, so anyone calling this function with a
  named parameter will need to update the name accordingly (:issue:`3248`, :pull:`4696`).
  By `Augustus Ijams <https://github.com/aijams>`_.

Deprecations
~~~~~~~~~~~~


Bug fixes
~~~~~~~~~

- :py:func:`combine_by_coords` can now combine a list of unnamed
  ``DataArray`` objects as input (:issue:`3248`, :pull:`4696`).
  By `Augustus Ijams <https://github.com/aijams>`_.
- Opening netCDF files from a path that doesn't end in ``.nc`` without supplying
an explicit ``engine`` works again (:issue:`5295`), fixing a bug introduced in
0.18.0.
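In practice the rename plays out as below (a minimal sketch against this patch; ds1 and ds2 are placeholder single-variable datasets, not taken from the diff):

import xarray as xr

ds1 = xr.Dataset({"a": ("x", [1.0, 2.0])}, coords={"x": [0, 1]})
ds2 = xr.Dataset({"a": ("x", [3.0, 4.0])}, coords={"x": [2, 3]})

# Positional calls are unaffected by the rename:
combined = xr.combine_by_coords([ds1, ds2])

# Keyword callers switch from `datasets` to `data_objects`:
combined = xr.combine_by_coords(data_objects=[ds1, ds2])

# The old keyword still works for now, but emits a warning and is
# slated for removal after version 0.19:
combined = xr.combine_by_coords(datasets=[ds1, ds2])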
164 changes: 124 additions & 40 deletions xarray/core/combine.py
@@ -1,4 +1,5 @@
import itertools
import warnings
from collections import Counter

import pandas as pd
@@ -8,6 +9,7 @@
from .dataarray import DataArray
from .dataset import Dataset
from .merge import merge
from .utils import iterate_nested


def _infer_concat_order_from_positions(datasets):
@@ -544,6 +546,15 @@ def combine_nested(
concat
merge
"""
mixed_datasets_and_arrays = any(
isinstance(obj, Dataset) for obj in iterate_nested(datasets)
) and any(
isinstance(obj, DataArray) and obj.name is None
for obj in iterate_nested(datasets)
)
if mixed_datasets_and_arrays:
raise ValueError("Can't combine datasets with unnamed arrays.")

if isinstance(concat_dim, (str, DataArray)) or concat_dim is None:
concat_dim = [concat_dim]

@@ -565,18 +576,79 @@ def vars_as_keys(ds):
return tuple(sorted(ds))


-def combine_by_coords(
+def _combine_single_variable_hypercube(
datasets,
fill_value=dtypes.NA,
data_vars="all",
coords="different",
compat="no_conflicts",
join="outer",
combine_attrs="no_conflicts",
):
"""
Attempt to combine a list of Datasets into a hypercube using their
coordinates.
    All provided Datasets must belong to a single variable, i.e. they must all
    be assigned the same variable name. This precondition is not checked by
    this function, so the caller is assumed to know what it's doing.
This function is NOT part of the public API.
"""
if len(datasets) == 0:
raise ValueError(
"At least one Dataset is required to resolve variable names "
"for combined hypercube."
)

combined_ids, concat_dims = _infer_concat_order_from_coords(list(datasets))

if fill_value is None:
# check that datasets form complete hypercube
_check_shape_tile_ids(combined_ids)
else:
# check only that all datasets have same dimension depth for these
# vars
_check_dimension_depth_tile_ids(combined_ids)

# Concatenate along all of concat_dims one by one to create single ds
concatenated = _combine_nd(
combined_ids,
concat_dims=concat_dims,
data_vars=data_vars,
coords=coords,
compat=compat,
fill_value=fill_value,
join=join,
combine_attrs=combine_attrs,
)

# Check the overall coordinates are monotonically increasing
for dim in concat_dims:
indexes = concatenated.indexes.get(dim)
if not (indexes.is_monotonic_increasing or indexes.is_monotonic_decreasing):
raise ValueError(
"Resulting object does not have monotonic"
" global indexes along dimension {}".format(dim)
)

return concatenated


# TODO remove empty list default param after version 0.19, see PR4696
def combine_by_coords(
data_objects=[],
compat="no_conflicts",
data_vars="all",
coords="different",
fill_value=dtypes.NA,
join="outer",
combine_attrs="no_conflicts",
datasets=None,
):
"""
-    Attempt to auto-magically combine the given datasets into one by using
-    dimension coordinates.
+    Attempt to auto-magically combine the given datasets (or data arrays)
+    into one by using dimension coordinates.
This method attempts to combine a group of datasets along any number of
dimensions into a single entity by inspecting coords and metadata and using
Expand All @@ -600,8 +672,9 @@ def combine_by_coords(
Parameters
----------
-    datasets : sequence of xarray.Dataset
-        Dataset objects to combine.
+    data_objects : sequence of xarray.Dataset or sequence of xarray.DataArray
+        Data objects to combine.
compat : {"identical", "equals", "broadcast_equals", "no_conflicts", "override"}, optional
String indicating how to compare variables of the same name for
potential conflicts:
@@ -776,51 +849,62 @@ def combine_by_coords(
precipitation (y, x) float64 0.4376 0.8918 0.9637 ... 0.5684 0.01879 0.6176
"""

-    # Group by data vars
-    sorted_datasets = sorted(datasets, key=vars_as_keys)
-    grouped_by_vars = itertools.groupby(sorted_datasets, key=vars_as_keys)
-
-    # Perform the multidimensional combine on each group of data variables
-    # before merging back together
-    concatenated_grouped_by_data_vars = []
-    for vars, datasets_with_same_vars in grouped_by_vars:
-        combined_ids, concat_dims = _infer_concat_order_from_coords(
-            list(datasets_with_same_vars)
-        )
+    # TODO remove after version 0.19, see PR4696
+    if datasets is not None:
+        warnings.warn(
+            "The datasets argument has been renamed to `data_objects`."
+            " In future passing a value for datasets will raise an error."
+        )
+        data_objects = datasets

-        if fill_value is None:
-            # check that datasets form complete hypercube
-            _check_shape_tile_ids(combined_ids)
-        else:
-            # check only that all datasets have same dimension depth for these
-            # vars
-            _check_dimension_depth_tile_ids(combined_ids)
+    if not data_objects:
+        return Dataset()

-        # Concatenate along all of concat_dims one by one to create single ds
-        concatenated = _combine_nd(
-            combined_ids,
-            concat_dims=concat_dims,
-            data_vars=data_vars,
-            coords=coords,
-            compat=compat,
-            fill_value=fill_value,
-            join=join,
-            combine_attrs=combine_attrs,
-        )
+    mixed_arrays_and_datasets = any(
+        isinstance(data_object, DataArray) and data_object.name is None
+        for data_object in data_objects
+    ) and any(isinstance(data_object, Dataset) for data_object in data_objects)
+    if mixed_arrays_and_datasets:
+        raise ValueError("Can't automatically combine datasets with unnamed arrays.")
+
+    all_unnamed_data_arrays = all(
+        isinstance(data_object, DataArray) and data_object.name is None
+        for data_object in data_objects
+    )
+    if all_unnamed_data_arrays:
+        unnamed_arrays = data_objects
+        temp_datasets = [data_array._to_temp_dataset() for data_array in unnamed_arrays]
+
+        combined_temp_dataset = _combine_single_variable_hypercube(
+            temp_datasets,
+            fill_value=fill_value,
+            data_vars=data_vars,
+            coords=coords,
+            compat=compat,
+            join=join,
+            combine_attrs=combine_attrs,
+        )
+        return DataArray()._from_temp_dataset(combined_temp_dataset)

-        # Check the overall coordinates are monotonically increasing
-        # TODO (benbovy - flexible indexes): only with pandas.Index?
-        for dim in concat_dims:
-            indexes = concatenated.xindexes.get(dim)
-            if not (
-                indexes.array.is_monotonic_increasing
-                or indexes.array.is_monotonic_decreasing
-            ):
-                raise ValueError(
-                    "Resulting object does not have monotonic"
-                    " global indexes along dimension {}".format(dim)
-                )
-        concatenated_grouped_by_data_vars.append(concatenated)
+    else:
+        # Group by data vars
+        sorted_datasets = sorted(data_objects, key=vars_as_keys)
+        grouped_by_vars = itertools.groupby(sorted_datasets, key=vars_as_keys)
+
+        # Perform the multidimensional combine on each group of data variables
+        # before merging back together
+        concatenated_grouped_by_data_vars = []
+        for vars, datasets_with_same_vars in grouped_by_vars:
+            concatenated = _combine_single_variable_hypercube(
+                list(datasets_with_same_vars),
+                fill_value=fill_value,
+                data_vars=data_vars,
+                coords=coords,
+                compat=compat,
+                join=join,
+                combine_attrs=combine_attrs,
+            )
+            concatenated_grouped_by_data_vars.append(concatenated)

return merge(
concatenated_grouped_by_data_vars,
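Taken together, the new combine_by_coords path converts a list of purely unnamed DataArrays to temporary datasets, combines those, and converts the result back, while mixed input is rejected up front. A short sketch of the resulting behaviour (array values here are illustrative, not from the diff):

import xarray as xr

arr1 = xr.DataArray([1.0, 2.0], coords={"x": [0, 1]}, dims="x")  # unnamed
arr2 = xr.DataArray([3.0, 4.0], coords={"x": [2, 3]}, dims="x")  # unnamed

# All-unnamed input comes back as a single DataArray spanning x = 0..3:
combined = xr.combine_by_coords([arr1, arr2])

# Mixing an unnamed array with a Dataset raises, since there is no
# variable name to line the inputs up on:
ds = xr.Dataset({"b": ("x", [5.0, 6.0])}, coords={"x": [4, 5]})
xr.combine_by_coords([arr1, ds])
# ValueError: Can't automatically combine datasets with unnamed arrays.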
8 changes: 8 additions & 0 deletions xarray/core/utils.py
@@ -907,3 +907,11 @@ class Default(Enum):


_default = Default.token


def iterate_nested(nested_list):
for item in nested_list:
if isinstance(item, list):
yield from iterate_nested(item)
else:
yield item
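The new helper yields the leaves of arbitrarily nested lists depth-first, recursing only into list instances; a small usage sketch:

from xarray.core.utils import iterate_nested

assert list(iterate_nested([[1, [2, 3]], 4, [[5]]])) == [1, 2, 3, 4, 5]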
68 changes: 68 additions & 0 deletions xarray/tests/test_combine.py
@@ -646,6 +646,47 @@ def test_combine_nested_fill_value(self, fill_value):
actual = combine_nested(datasets, concat_dim="t", fill_value=fill_value)
assert_identical(expected, actual)

def test_combine_nested_unnamed_data_arrays(self):
unnamed_array = DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")

actual = combine_nested([unnamed_array], concat_dim="x")
expected = unnamed_array
assert_identical(expected, actual)

unnamed_array1 = DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
unnamed_array2 = DataArray(data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x")

actual = combine_nested([unnamed_array1, unnamed_array2], concat_dim="x")
expected = DataArray(
data=[1.0, 2.0, 3.0, 4.0], coords={"x": [0, 1, 2, 3]}, dims="x"
)
assert_identical(expected, actual)

da1 = DataArray(data=[[0.0]], coords={"x": [0], "y": [0]}, dims=["x", "y"])
da2 = DataArray(data=[[1.0]], coords={"x": [0], "y": [1]}, dims=["x", "y"])
da3 = DataArray(data=[[2.0]], coords={"x": [1], "y": [0]}, dims=["x", "y"])
da4 = DataArray(data=[[3.0]], coords={"x": [1], "y": [1]}, dims=["x", "y"])
objs = [[da1, da2], [da3, da4]]

expected = DataArray(
data=[[0.0, 1.0], [2.0, 3.0]],
coords={"x": [0, 1], "y": [0, 1]},
dims=["x", "y"],
)
actual = combine_nested(objs, concat_dim=["x", "y"])
assert_identical(expected, actual)

# TODO aijams - Determine if this test is appropriate.
def test_nested_combine_mixed_datasets_arrays(self):
objs = [
DataArray([0, 1], dims=("x"), coords=({"x": [0, 1]})),
Dataset({"x": [2, 3]}),
]
with pytest.raises(
ValueError, match=r"Can't combine datasets with unnamed arrays."
):
combine_nested(objs, "x")


class TestCombineAuto:
def test_combine_by_coords(self):
@@ -689,6 +730,17 @@ def test_combine_by_coords(self):
def test_empty_input(self):
assert_identical(Dataset(), combine_by_coords([]))

def test_combine_coords_mixed_datasets_arrays(self):
objs = [
DataArray([0, 1], dims=("x"), coords=({"x": [0, 1]})),
Dataset({"x": [2, 3]}),
]
with pytest.raises(
ValueError,
match=r"Can't automatically combine datasets with unnamed arrays.",
):
combine_by_coords(objs)

@pytest.mark.parametrize(
"join, expected",
[
@@ -992,6 +1044,22 @@ def test_combine_by_coords_incomplete_hypercube(self):
with pytest.raises(ValueError):
combine_by_coords([x1, x2, x3], fill_value=None)

def test_combine_by_coords_unnamed_arrays(self):
unnamed_array = DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")

actual = combine_by_coords([unnamed_array])
expected = unnamed_array
assert_identical(expected, actual)

unnamed_array1 = DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
unnamed_array2 = DataArray(data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x")

actual = combine_by_coords([unnamed_array1, unnamed_array2])
expected = DataArray(
data=[1.0, 2.0, 3.0, 4.0], coords={"x": [0, 1, 2, 3]}, dims="x"
)
assert_identical(expected, actual)


@requires_cftime
def test_combine_by_coords_distant_cftime_dates():
17 changes: 16 additions & 1 deletion xarray/tests/test_utils.py
@@ -8,7 +8,7 @@
from xarray.coding.cftimeindex import CFTimeIndex
from xarray.core import duck_array_ops, utils
from xarray.core.indexes import PandasIndex
from xarray.core.utils import either_dict_or_kwargs
from xarray.core.utils import either_dict_or_kwargs, iterate_nested

from . import assert_array_equal, requires_cftime, requires_dask
from .test_coding_times import _all_cftime_date_types
@@ -318,3 +318,18 @@ def test_infix_dims(supplied, all_, expected):
def test_infix_dims_errors(supplied, all_):
with pytest.raises(ValueError):
list(utils.infix_dims(supplied, all_))


@pytest.mark.parametrize(
"nested_list, expected",
[
([], []),
([1], [1]),
([1, 2, 3], [1, 2, 3]),
([[1]], [1]),
([[1, 2], [3, 4]], [1, 2, 3, 4]),
([[[1, 2, 3], [4]], [5, 6]], [1, 2, 3, 4, 5, 6]),
],
)
def test_iterate_nested(nested_list, expected):
assert list(iterate_nested(nested_list)) == expected
