Skip to content

Commit

Permalink
Introduce Grouper objects internally (#7561)
Browse files (browse the repository at this point in the history)
* Introduce Grouper objects.

* Remove a copy after stacking for a groupby.

Upstream bug pandas-dev/pandas#12813 is fixed

* Fix typing

* [WIP] typing

* Cleanup

* [WIP]

* group as Variable?

* Revert "group as Variable?"

This reverts commit 2a36e21a031b9e061b932682758551956f3f06d2.

* Small cleanup

* De-duplicate alignment check

* Fix resampling

* Bugfix

* Partially reverts commit 22ad7fa.

* fix tests

* small cleanup

* more cleanup

* Apply suggestions from code review

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Add ResolvedGrouper class

* GroupBy only handles ResolvedGrouper objects.

Much cleaner!

* review feedback

* minimize diff

* dataclass

* moar dataclass

Co-authored-by: Illviljan <[email protected]>

* Add typing

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Ignore type checking error.

* Update groupby.py

* Move factorize to _factorize

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update groupby.py

* Update xarray/core/groupby.py

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Calculate group_indices only when necessary

* Revert "Calculate group_indices only when necessary"

This reverts commit 917c77efb05bacffcf901e61eabb9defc9a429d7.

* Fix regression from deep copy

---------

Co-authored-by: Illviljan <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
3 people authored May 4, 2023
1 parent da8746b commit fde773e
Show file tree
Hide file tree
Showing 7 changed files with 550 additions and 354 deletions.
8 changes: 5 additions & 3 deletions xarray/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -949,7 +949,7 @@ def _resample(
# TODO support non-string indexer after removing the old API.

from xarray.core.dataarray import DataArray
from xarray.core.groupby import TimeResampleGrouper
from xarray.core.groupby import ResolvedTimeResampleGrouper, TimeResampleGrouper
from xarray.core.resample import RESAMPLE_DIM

if keep_attrs is not None:
Expand Down Expand Up @@ -1012,11 +1012,13 @@ def _resample(
group = DataArray(
dim_coord, coords=dim_coord.coords, dims=dim_coord.dims, name=RESAMPLE_DIM
)

rgrouper = ResolvedTimeResampleGrouper(grouper, group, self)

return resample_cls(
self,
group=group,
(rgrouper,),
dim=dim_name,
grouper=grouper,
resample_dim=RESAMPLE_DIM,
restore_coord_dims=restore_coord_dims,
)
Expand Down
7 changes: 4 additions & 3 deletions xarray/core/computation.py
Original file line number Diff line number Diff line change
Expand Up @@ -515,15 +515,16 @@ def apply_groupby_func(func, *args):
groupbys = [arg for arg in args if isinstance(arg, GroupBy)]
assert groupbys, "must have at least one groupby to iterate over"
first_groupby = groupbys[0]
if any(not first_groupby._group.equals(gb._group) for gb in groupbys[1:]):
(grouper,) = first_groupby.groupers
if any(not grouper.group.equals(gb.groupers[0].group) for gb in groupbys[1:]):
raise ValueError(
"apply_ufunc can only perform operations over "
"multiple GroupBy objects at once if they are all "
"grouped the same way"
)

grouped_dim = first_groupby._group.name
unique_values = first_groupby._unique_coord.values
grouped_dim = grouper.name
unique_values = grouper.unique_coord.values

iterators = []
for arg in args:
Expand Down
47 changes: 28 additions & 19 deletions xarray/core/dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -6478,21 +6478,20 @@ def groupby(
core.groupby.DataArrayGroupBy
pandas.DataFrame.groupby
"""
from xarray.core.groupby import DataArrayGroupBy

# While we don't generally check the type of every arg, passing
# multiple dimensions as multiple arguments is common enough, and the
# consequences hidden enough (strings evaluate as true) to warrant
# checking here.
# A future version could make squeeze kwarg only, but would face
# backward-compat issues.
if not isinstance(squeeze, bool):
raise TypeError(
f"`squeeze` must be True or False, but {squeeze} was supplied"
)
from xarray.core.groupby import (
DataArrayGroupBy,
ResolvedUniqueGrouper,
UniqueGrouper,
_validate_groupby_squeeze,
)

_validate_groupby_squeeze(squeeze)
rgrouper = ResolvedUniqueGrouper(UniqueGrouper(), group, self)
return DataArrayGroupBy(
self, group, squeeze=squeeze, restore_coord_dims=restore_coord_dims
self,
(rgrouper,),
squeeze=squeeze,
restore_coord_dims=restore_coord_dims,
)

def groupby_bins(
Expand Down Expand Up @@ -6563,21 +6562,31 @@ def groupby_bins(
----------
.. [1] http://pandas.pydata.org/pandas-docs/stable/generated/pandas.cut.html
"""
from xarray.core.groupby import DataArrayGroupBy
from xarray.core.groupby import (
BinGrouper,
DataArrayGroupBy,
ResolvedBinGrouper,
_validate_groupby_squeeze,
)

return DataArrayGroupBy(
self,
group,
squeeze=squeeze,
_validate_groupby_squeeze(squeeze)
grouper = BinGrouper(
bins=bins,
restore_coord_dims=restore_coord_dims,
cut_kwargs={
"right": right,
"labels": labels,
"precision": precision,
"include_lowest": include_lowest,
},
)
rgrouper = ResolvedBinGrouper(grouper, group, self)

return DataArrayGroupBy(
self,
(rgrouper,),
squeeze=squeeze,
restore_coord_dims=restore_coord_dims,
)

def weighted(self, weights: DataArray) -> DataArrayWeighted:
"""
Expand Down
48 changes: 29 additions & 19 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -8958,21 +8958,21 @@ def groupby(
Dataset.resample
DataArray.resample
"""
from xarray.core.groupby import DatasetGroupBy

# While we don't generally check the type of every arg, passing
# multiple dimensions as multiple arguments is common enough, and the
# consequences hidden enough (strings evaluate as true) to warrant
# checking here.
# A future version could make squeeze kwarg only, but would face
# backward-compat issues.
if not isinstance(squeeze, bool):
raise TypeError(
f"`squeeze` must be True or False, but {squeeze} was supplied"
)
from xarray.core.groupby import (
DatasetGroupBy,
ResolvedUniqueGrouper,
UniqueGrouper,
_validate_groupby_squeeze,
)

_validate_groupby_squeeze(squeeze)
rgrouper = ResolvedUniqueGrouper(UniqueGrouper(), group, self)

return DatasetGroupBy(
self, group, squeeze=squeeze, restore_coord_dims=restore_coord_dims
self,
(rgrouper,),
squeeze=squeeze,
restore_coord_dims=restore_coord_dims,
)

def groupby_bins(
Expand Down Expand Up @@ -9043,21 +9043,31 @@ def groupby_bins(
----------
.. [1] http://pandas.pydata.org/pandas-docs/stable/generated/pandas.cut.html
"""
from xarray.core.groupby import DatasetGroupBy
from xarray.core.groupby import (
BinGrouper,
DatasetGroupBy,
ResolvedBinGrouper,
_validate_groupby_squeeze,
)

return DatasetGroupBy(
self,
group,
squeeze=squeeze,
_validate_groupby_squeeze(squeeze)
grouper = BinGrouper(
bins=bins,
restore_coord_dims=restore_coord_dims,
cut_kwargs={
"right": right,
"labels": labels,
"precision": precision,
"include_lowest": include_lowest,
},
)
rgrouper = ResolvedBinGrouper(grouper, group, self)

return DatasetGroupBy(
self,
(rgrouper,),
squeeze=squeeze,
restore_coord_dims=restore_coord_dims,
)

def weighted(self, weights: DataArray) -> DatasetWeighted:
"""
Expand Down
Loading

0 comments on commit fde773e

Please sign in to comment.