From fc85ba2d4384c1a8f4a935ba6c97da47d7a6ca70 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Thu, 13 Jun 2024 17:08:28 -0600 Subject: [PATCH 1/4] Remove internal use of IndexVariable --- xarray/core/groupby.py | 38 +++++++++++++++++++++++++++++--------- xarray/core/groupers.py | 37 ++++++++++++++++++++++++++----------- 2 files changed, 55 insertions(+), 20 deletions(-) diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index eceb4e62199..75919a2c915 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -299,7 +299,7 @@ class ResolvedGrouper(Generic[T_DataWithCoords]): codes: DataArray = field(init=False) full_index: pd.Index = field(init=False) group_indices: T_GroupIndices = field(init=False) - unique_coord: IndexVariable | _DummyGroup = field(init=False) + unique_coord: Variable | _DummyGroup = field(init=False) # _ensure_1d: group1d: T_Group = field(init=False) @@ -328,14 +328,18 @@ def __post_init__(self) -> None: @property def name(self) -> Hashable: + """Name for the grouped coordinate after reduction.""" # the name has to come from unique_coord because we need `_bins` suffix for BinGrouper - return self.unique_coord.name + (name,) = self.unique_coord.dims + return name @property def size(self) -> int: + """Number of groups.""" return len(self) def __len__(self) -> int: + """Number of groups.""" return len(self.full_index) @property @@ -358,8 +362,8 @@ def factorize(self) -> None: ] if encoded.unique_coord is None: unique_values = self.full_index[np.unique(encoded.codes)] - self.unique_coord = IndexVariable( - self.codes.name, unique_values, attrs=self.group.attrs + self.unique_coord = Variable( + dims=self.codes.name, data=unique_values, attrs=self.group.attrs ) else: self.unique_coord = encoded.unique_coord @@ -620,6 +624,8 @@ def _iter_grouped(self, warn_squeeze=True) -> Iterator[T_Xarray]: yield self._obj.isel({self._group_dim: indices}) def _infer_concat_args(self, applied_example): + from xarray.core.groupers import BinGrouper + (grouper,) = self.groupers if self._group_dim in applied_example.dims: coord = grouper.group1d @@ -628,7 +634,10 @@ def _infer_concat_args(self, applied_example): coord = grouper.unique_coord positions = None (dim,) = coord.dims - if isinstance(coord, _DummyGroup): + if isinstance(grouper.group, _DummyGroup) and not isinstance( + grouper.grouper, BinGrouper + ): + # When binning we actually do set the index coord = None coord = getattr(coord, "variable", coord) return coord, dim, positions @@ -645,12 +654,20 @@ def _binary_op(self, other, f, reflexive=False): codes = self._codes dims = group.dims - if isinstance(group, _DummyGroup): + if isinstance(grouper.group, _DummyGroup): group = coord = group.to_dataarray() else: coord = grouper.unique_coord - if not isinstance(coord, DataArray): - coord = DataArray(grouper.unique_coord) + if isinstance(coord, Variable): + assert coord.ndim == 1 + (coord_dim,) = coord.dims + # TODO: explicitly create Index here + coord = DataArray( + dims=coord_dim, + data=coord.data, + attrs=coord.attrs, + coords={coord_dim: coord.data}, + ) name = grouper.name if not isinstance(other, (Dataset, DataArray)): @@ -848,7 +865,10 @@ def _flox_reduce( # in the grouped variable group_dims = grouper.group.dims if set(group_dims).issubset(set(parsed_dim)): - result[grouper.name] = output_index + new_coord = Variable( + dims=grouper.name, data=np.array(output_index), attrs=self._codes.attrs + ) + result = result.assign_coords({grouper.name: new_coord}) result = result.drop_vars(unindexed_dims) # broadcast and restore non-numeric data variables (backcompat) diff --git a/xarray/core/groupers.py b/xarray/core/groupers.py index e33cd3ad99f..12f30d25a92 100644 --- a/xarray/core/groupers.py +++ b/xarray/core/groupers.py @@ -23,7 +23,7 @@ from xarray.core.resample_cftime import CFTimeGrouper from xarray.core.types import DatetimeLike, SideOptions, T_GroupIndices from xarray.core.utils import emit_user_level_warning -from xarray.core.variable import IndexVariable +from xarray.core.variable import Variable __all__ = [ "EncodedGroups", @@ -55,7 +55,17 @@ class EncodedGroups: codes: DataArray full_index: pd.Index group_indices: T_GroupIndices | None = field(default=None) - unique_coord: IndexVariable | _DummyGroup | None = field(default=None) + unique_coord: Variable | _DummyGroup | None = field(default=None) + + def __post_init__(self): + assert isinstance(self.codes, DataArray) + if self.codes.name is None: + raise ValueError("Please set a name on the array you are grouping by.") + assert isinstance(self.full_index, pd.Index) + assert ( + isinstance(self.unique_coord, (Variable, _DummyGroup)) + or self.unique_coord is None + ) class Grouper(ABC): @@ -134,10 +144,10 @@ def _factorize_unique(self) -> EncodedGroups: "Failed to group data. Are you grouping by a variable that is all NaN?" ) codes = self.group.copy(data=codes_) - unique_coord = IndexVariable( - self.group.name, unique_values, attrs=self.group.attrs + unique_coord = Variable( + dims=codes.name, data=unique_values, attrs=self.group.attrs ) - full_index = unique_coord + full_index = pd.Index(unique_values) return EncodedGroups( codes=codes, full_index=full_index, unique_coord=unique_coord @@ -152,12 +162,13 @@ def _factorize_dummy(self) -> EncodedGroups: size_range = np.arange(size) if isinstance(self.group, _DummyGroup): codes = self.group.to_dataarray().copy(data=size_range) + unique_coord = self.group + full_index = pd.RangeIndex(self.group.size) else: codes = self.group.copy(data=size_range) - unique_coord = self.group - full_index = IndexVariable( - self.group.name, unique_coord.values, self.group.attrs - ) + unique_coord = self.group.variable + full_index = pd.Index(unique_coord.data) + return EncodedGroups( codes=codes, group_indices=group_indices, @@ -201,7 +212,9 @@ def factorize(self, group) -> EncodedGroups: codes = DataArray( binned_codes, getattr(group, "coords", None), name=new_dim_name ) - unique_coord = IndexVariable(new_dim_name, pd.Index(unique_values), group.attrs) + unique_coord = Variable( + dims=new_dim_name, data=pd.Index(unique_values), attrs=group.attrs + ) return EncodedGroups( codes=codes, full_index=full_index, unique_coord=unique_coord ) @@ -318,7 +331,9 @@ def factorize(self, group) -> EncodedGroups: ] group_indices += [slice(sbins[-1], None)] - unique_coord = IndexVariable(group.name, first_items.index, group.attrs) + unique_coord = Variable( + dims=group.name, data=first_items.index, attrs=group.attrs + ) codes = group.copy(data=codes_) return EncodedGroups( From 2649f7675d1b4ac0d7873ea768150eddb801624c Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 21 Jun 2024 15:28:34 -0600 Subject: [PATCH 2/4] cleanup --- xarray/core/groupby.py | 14 +++++++------- xarray/core/groupers.py | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index 75919a2c915..f14d14077d7 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -246,7 +246,7 @@ def to_array(self) -> DataArray: return self.to_dataarray() -T_Group = Union["T_DataArray", "IndexVariable", _DummyGroup] +T_Group = Union["T_DataArray", _DummyGroup] def _ensure_1d(group: T_Group, obj: T_DataWithCoords) -> tuple[ @@ -256,7 +256,7 @@ def _ensure_1d(group: T_Group, obj: T_DataWithCoords) -> tuple[ list[Hashable], ]: # 1D cases: do nothing - if isinstance(group, (IndexVariable, _DummyGroup)) or group.ndim == 1: + if isinstance(group, _DummyGroup) or group.ndim == 1: return group, obj, None, [] from xarray.core.dataarray import DataArray @@ -271,9 +271,7 @@ def _ensure_1d(group: T_Group, obj: T_DataWithCoords) -> tuple[ newobj = obj.stack({stacked_dim: orig_dims}) return newgroup, newobj, stacked_dim, inserted_dims - raise TypeError( - f"group must be DataArray, IndexVariable or _DummyGroup, got {type(group)!r}." - ) + raise TypeError(f"group must be DataArray or _DummyGroup, got {type(group)!r}.") @dataclass @@ -315,7 +313,7 @@ def __post_init__(self) -> None: # might be used multiple times. self.grouper = copy.deepcopy(self.grouper) - self.group: T_Group = _resolve_group(self.obj, self.group) + self.group = _resolve_group(self.obj, self.group) ( self.group1d, @@ -382,7 +380,9 @@ def _validate_groupby_squeeze(squeeze: bool | None) -> None: ) -def _resolve_group(obj: T_DataWithCoords, group: T_Group | Hashable) -> T_Group: +def _resolve_group( + obj: T_DataWithCoords, group: T_Group | Hashable | IndexVariable +) -> T_Group: from xarray.core.dataarray import DataArray error_msg = ( diff --git a/xarray/core/groupers.py b/xarray/core/groupers.py index 12f30d25a92..92758e46121 100644 --- a/xarray/core/groupers.py +++ b/xarray/core/groupers.py @@ -213,7 +213,7 @@ def factorize(self, group) -> EncodedGroups: binned_codes, getattr(group, "coords", None), name=new_dim_name ) unique_coord = Variable( - dims=new_dim_name, data=pd.Index(unique_values), attrs=group.attrs + dims=new_dim_name, data=unique_values, attrs=group.attrs ) return EncodedGroups( codes=codes, full_index=full_index, unique_coord=unique_coord From 6c60cf7e936aea514bb339bcf4438c61e50c6edb Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 21 Jun 2024 15:58:04 -0600 Subject: [PATCH 3/4] cleanup more --- xarray/core/groupby.py | 29 +++++++++++++++-------------- xarray/core/groupers.py | 2 +- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index f14d14077d7..697d7e9f8ee 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -19,8 +19,10 @@ from xarray.core.arithmetic import DataArrayGroupbyArithmetic, DatasetGroupbyArithmetic from xarray.core.common import ImplementsArrayReduce, ImplementsDatasetReduce from xarray.core.concat import concat +from xarray.core.coordinates import Coordinates from xarray.core.formatting import format_array_flat from xarray.core.indexes import ( + PandasIndex, create_default_index_implicit, filter_indexes_from_coords, ) @@ -650,11 +652,12 @@ def _binary_op(self, other, f, reflexive=False): (grouper,) = self.groupers obj = self._original_obj + name = grouper.name group = grouper.group codes = self._codes dims = group.dims - if isinstance(grouper.group, _DummyGroup): + if isinstance(group, _DummyGroup): group = coord = group.to_dataarray() else: coord = grouper.unique_coord @@ -662,12 +665,7 @@ def _binary_op(self, other, f, reflexive=False): assert coord.ndim == 1 (coord_dim,) = coord.dims # TODO: explicitly create Index here - coord = DataArray( - dims=coord_dim, - data=coord.data, - attrs=coord.attrs, - coords={coord_dim: coord.data}, - ) + coord = DataArray(coord, coords={coord_dim: coord.data}) name = grouper.name if not isinstance(other, (Dataset, DataArray)): @@ -783,6 +781,7 @@ def _flox_reduce( obj = self._original_obj (grouper,) = self.groupers + name = grouper.name isbin = isinstance(grouper.grouper, BinGrouper) if keep_attrs is None: @@ -814,14 +813,14 @@ def _flox_reduce( # weird backcompat # reducing along a unique indexed dimension with squeeze=True # should raise an error - if (dim is None or dim == grouper.name) and grouper.name in obj.xindexes: - index = obj.indexes[grouper.name] + if (dim is None or dim == name) and grouper.name in obj.xindexes: + index = obj.indexes[name] if index.is_unique and self._squeeze: - raise ValueError(f"cannot reduce over dimensions {grouper.name!r}") + raise ValueError(f"cannot reduce over dimensions {name!r}") unindexed_dims: tuple[Hashable, ...] = tuple() if isinstance(grouper.group, _DummyGroup) and not isbin: - unindexed_dims = (grouper.name,) + unindexed_dims = (name,) parsed_dim: tuple[Hashable, ...] if isinstance(dim, str): @@ -865,10 +864,12 @@ def _flox_reduce( # in the grouped variable group_dims = grouper.group.dims if set(group_dims).issubset(set(parsed_dim)): - new_coord = Variable( - dims=grouper.name, data=np.array(output_index), attrs=self._codes.attrs + result = result.assign_coords( + Coordinates( + coords={name: (name, np.array(output_index))}, + indexes={name: PandasIndex(output_index, dim=name)}, + ) ) - result = result.assign_coords({grouper.name: new_coord}) result = result.drop_vars(unindexed_dims) # broadcast and restore non-numeric data variables (backcompat) diff --git a/xarray/core/groupers.py b/xarray/core/groupers.py index 92758e46121..075afd9f62f 100644 --- a/xarray/core/groupers.py +++ b/xarray/core/groupers.py @@ -166,7 +166,7 @@ def _factorize_dummy(self) -> EncodedGroups: full_index = pd.RangeIndex(self.group.size) else: codes = self.group.copy(data=size_range) - unique_coord = self.group.variable + unique_coord = self.group.variable.to_base_variable() full_index = pd.Index(unique_coord.data) return EncodedGroups( From b1a73144b58a5f6e427bc2b2d9cdf8e31ca17c93 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 21 Jun 2024 16:03:39 -0600 Subject: [PATCH 4/4] cleanup --- xarray/core/groupby.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index 697d7e9f8ee..42e7f01a526 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -666,7 +666,6 @@ def _binary_op(self, other, f, reflexive=False): (coord_dim,) = coord.dims # TODO: explicitly create Index here coord = DataArray(coord, coords={coord_dim: coord.data}) - name = grouper.name if not isinstance(other, (Dataset, DataArray)): raise TypeError( @@ -813,7 +812,7 @@ def _flox_reduce( # weird backcompat # reducing along a unique indexed dimension with squeeze=True # should raise an error - if (dim is None or dim == name) and grouper.name in obj.xindexes: + if (dim is None or dim == name) and name in obj.xindexes: index = obj.indexes[name] if index.is_unique and self._squeeze: raise ValueError(f"cannot reduce over dimensions {name!r}") @@ -876,8 +875,7 @@ def _flox_reduce( for name, var in non_numeric.items(): if all(d not in var.dims for d in parsed_dim): result[name] = var.variable.set_dims( - (grouper.name,) + var.dims, - (result.sizes[grouper.name],) + var.shape, + (name,) + var.dims, (result.sizes[name],) + var.shape ) if not isinstance(result, Dataset):