From 6c1969a35fe720cf3a804006bcc9046ba554fcc3 Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Mon, 18 Dec 2023 12:34:37 -0800 Subject: [PATCH] feat: Add IntervalIndex support to bigframes.pandas.cut (#254) * feature: Add IntervalIndex support to bigframes.pandas.cut * add bins <= 0 error in CutOp * add type ignore * add type ignore to session --------- Co-authored-by: Shobhit Singh --- bigframes/core/reshape/__init__.py | 20 +++++-- bigframes/operations/aggregations.py | 41 ++++++++++----- bigframes/series.py | 2 +- bigframes/session/__init__.py | 4 +- tests/system/small/test_pandas.py | 34 ++++++++++++ .../pandas/core/reshape/tile.py | 52 +++++++++++++++---- 6 files changed, 124 insertions(+), 29 deletions(-) diff --git a/bigframes/core/reshape/__init__.py b/bigframes/core/reshape/__init__.py index 24c1bff309..d9cc99a036 100644 --- a/bigframes/core/reshape/__init__.py +++ b/bigframes/core/reshape/__init__.py @@ -14,7 +14,9 @@ from __future__ import annotations import typing -from typing import Iterable, Literal, Optional, Union +from typing import Iterable, Literal, Optional, Tuple, Union + +import pandas as pd import bigframes.constants as constants import bigframes.core as core @@ -108,17 +110,29 @@ def concat( def cut( x: bigframes.series.Series, - bins: int, + bins: Union[ + int, + pd.IntervalIndex, + Iterable[Tuple[Union[int, float], Union[int, float]]], + ], *, labels: Optional[bool] = None, ) -> bigframes.series.Series: - if bins <= 0: + if isinstance(bins, int) and bins <= 0: raise ValueError("`bins` should be a positive integer.") + if isinstance(bins, Iterable): + if not isinstance(bins, pd.IntervalIndex): + bins = pd.IntervalIndex.from_tuples(list(bins)) + + if bins.is_overlapping: + raise ValueError("Overlapping IntervalIndex is not accepted.") + if labels is not False: raise NotImplementedError( f"Only labels=False is supported in BigQuery DataFrames so far. {constants.FEEDBACK_LINK}" ) + return x._apply_window_op(agg_ops.CutOp(bins), window_spec=core.WindowSpec()) diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 363dfe819d..8178ebfaea 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -20,6 +20,7 @@ import ibis.expr.datatypes as ibis_dtypes import ibis.expr.types as ibis_types from pandas import Int64Dtype +import pandas as pd import bigframes.constants as constants import bigframes.dtypes as dtypes @@ -228,21 +229,37 @@ def skips_nulls(self): class CutOp(WindowOp): - def __init__(self, bins: int): - self._bins_ibis = dtypes.literal_to_ibis_scalar(bins, force_dtype=Int64Dtype()) - self._bins_int = bins + def __init__(self, bins: typing.Union[int, pd.IntervalIndex]): + if isinstance(bins, int): + if not bins > 0: + raise ValueError("`bins` should be a positive integer.") + self._bins_int = bins + self._bins = dtypes.literal_to_ibis_scalar(bins, force_dtype=Int64Dtype()) + else: + self._bins_int = 0 + self._bins = bins def _as_ibis(self, x: ibis_types.Column, window=None): - col_min = _apply_window_if_present(x.min(), window) - col_max = _apply_window_if_present(x.max(), window) - bin_width = (col_max - col_min) / self._bins_ibis out = ibis.case() - for this_bin in range(self._bins_int - 1): - out = out.when( - x <= (col_min + (this_bin + 1) * bin_width), - dtypes.literal_to_ibis_scalar(this_bin, force_dtype=Int64Dtype()), - ) - out = out.when(x.notnull(), self._bins_ibis - 1) + + if self._bins_int > 0: + col_min = _apply_window_if_present(x.min(), window) + col_max = _apply_window_if_present(x.max(), window) + bin_width = (col_max - col_min) / self._bins + + for this_bin in range(self._bins_int - 1): + out = out.when( + x <= (col_min + (this_bin + 1) * bin_width), + dtypes.literal_to_ibis_scalar(this_bin, force_dtype=Int64Dtype()), + ) + out = out.when(x.notnull(), self._bins - 1) + else: + for interval in self._bins: + condition = (x > interval.left) & (x <= interval.right) + interval_struct = ibis.struct( + {"left_exclusive": interval.left, "right_inclusive": interval.right} + ) + out = out.when(condition, interval_struct) return out.end() @property diff --git a/bigframes/series.py b/bigframes/series.py index c929775a00..8d8c711c92 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1324,7 +1324,7 @@ def to_csv(self, path_or_buf=None, **kwargs) -> typing.Optional[str]: return self.to_pandas().to_csv(path_or_buf, **kwargs) def to_dict(self, into: type[dict] = dict) -> typing.Mapping: - return typing.cast(dict, self.to_pandas().to_dict(into)) + return typing.cast(dict, self.to_pandas().to_dict(into)) # type: ignore def to_excel(self, excel_writer, sheet_name="Sheet1", **kwargs) -> None: return self.to_pandas().to_excel(excel_writer, sheet_name, **kwargs) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index a57f7b94c5..fbe900106a 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1017,13 +1017,13 @@ def read_csv( header=header, names=names, index_col=index_col, - usecols=usecols, + usecols=usecols, # type: ignore dtype=dtype, engine=engine, encoding=encoding, **kwargs, ) - return self.read_pandas(pandas_df) + return self.read_pandas(pandas_df) # type: ignore def read_pickle( self, diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index a1079288cf..282c0d68eb 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -365,6 +365,40 @@ def test_cut(scalars_dfs): pd.testing.assert_series_equal(bf_result, pd_result) +@pytest.mark.parametrize( + ("bins",), + [ + ([(-5, 2), (2, 3), (-3000, -10)],), + (pd.IntervalIndex.from_tuples([(1, 2), (2, 3), (4, 5)]),), + ], +) +def test_cut_with_interval(scalars_dfs, bins): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = bpd.cut(scalars_df["int64_too"], bins, labels=False).to_pandas() + + if isinstance(bins, list): + bins = pd.IntervalIndex.from_tuples(bins) + pd_result = pd.cut(scalars_pandas_df["int64_too"], bins, labels=False) + + # Convert to match data format + pd_result_converted = pd.Series( + [ + {"left_exclusive": interval.left, "right_inclusive": interval.right} + if pd.notna(val) + else pd.NA + for val, interval in zip( + pd_result, pd_result.cat.categories[pd_result.cat.codes] + ) + ], + name=pd_result.name, + ) + pd_result.index = pd_result.index.astype("Int64") + + pd.testing.assert_series_equal( + bf_result, pd_result_converted, check_index=False, check_dtype=False + ) + + @pytest.mark.parametrize( ("q",), [ diff --git a/third_party/bigframes_vendored/pandas/core/reshape/tile.py b/third_party/bigframes_vendored/pandas/core/reshape/tile.py index d4471ed68e..55975c3fc1 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/tile.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/tile.py @@ -24,31 +24,61 @@ def cut( ``labels=False`` implies you just want the bins back. - Examples: - - .. code-block:: - - import bigframes.pandas as pd + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([0, 1, 5, 10]) + >>> s + 0 0 + 1 1 + 2 5 + 3 10 + dtype: Int64 - pd.options.display.progress_bar = None - s = pd.Series([0, 1, 1, 2]) - pd.cut(s, bins=4, labels=False) + Cut with an integer (equal-width bins): + >>> bpd.cut(s, bins=4, labels=False) 0 0 - 1 1 + 1 0 2 1 3 3 dtype: Int64 + Cut with pd.IntervalIndex, requires importing pandas for IntervalIndex: + + >>> import pandas as pd + + >>> interval_index = pd.IntervalIndex.from_tuples([(0, 1), (1, 5), (5, 20)]) + >>> bpd.cut(s, bins=interval_index, labels=False) + 0 + 1 {'left_exclusive': 0, 'right_inclusive': 1} + 2 {'left_exclusive': 1, 'right_inclusive': 5} + 3 {'left_exclusive': 5, 'right_inclusive': 20} + dtype: struct[pyarrow] + + Cut with an iterable of tuples: + + >>> bins_tuples = [(0, 1), (1, 4), (5, 20)] + >>> bpd.cut(s, bins=bins_tuples, labels=False) + 0 + 1 {'left_exclusive': 0, 'right_inclusive': 1} + 2 + 3 {'left_exclusive': 5, 'right_inclusive': 20} + dtype: struct[pyarrow] + Args: x (Series): The input Series to be binned. Must be 1-dimensional. - bins (int): + bins (int, pd.IntervalIndex, Iterable[Tuple[Union[int, float], Union[int, float]]]): The criteria to bin by. - int : Defines the number of equal-width bins in the range of `x`. The + int: Defines the number of equal-width bins in the range of `x`. The range of `x` is extended by .1% on each side to include the minimum and maximum values of `x`. + + pd.IntervalIndex or Iterable of tuples: Defines the exact bins to be used. + It's important to ensure that these bins are non-overlapping. labels (None): Specifies the labels for the returned bins. Must be the same length as the resulting bins. If False, returns only integer indicators of the