Skip to content

Commit

Permalink
feat: Add IntervalIndex support to bigframes.pandas.cut (#254)
Browse files Browse the repository at this point in the history
* feature: Add IntervalIndex support to bigframes.pandas.cut

* add bins <= 0 error in CutOp

* add type ignore

* add type ignore to session

---------

Co-authored-by: Shobhit Singh <[email protected]>
  • Loading branch information
Genesis929 and shobsi authored Dec 18, 2023
1 parent 02f7ab6 commit 6c1969a
Show file tree
Hide file tree
Showing 6 changed files with 124 additions and 29 deletions.
20 changes: 17 additions & 3 deletions bigframes/core/reshape/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@
from __future__ import annotations

import typing
from typing import Iterable, Literal, Optional, Union
from typing import Iterable, Literal, Optional, Tuple, Union

import pandas as pd

import bigframes.constants as constants
import bigframes.core as core
Expand Down Expand Up @@ -108,17 +110,29 @@ def concat(

def cut(
    x: bigframes.series.Series,
    bins: Union[
        int,
        pd.IntervalIndex,
        Iterable[Tuple[Union[int, float], Union[int, float]]],
    ],
    *,
    labels: Optional[bool] = None,
) -> bigframes.series.Series:
    """Bin the values of ``x`` into discrete intervals.

    Args:
        x: The series whose values are binned.
        bins: Either a positive integer (number of equal-width bins), a
            ``pd.IntervalIndex``, or an iterable of ``(left, right)`` tuples
            that is converted with ``pd.IntervalIndex.from_tuples``.
        labels: Only ``False`` is supported.

    Returns:
        The series produced by applying ``agg_ops.CutOp(bins)`` to ``x`` as a
        window operation over a default ``core.WindowSpec()``.

    Raises:
        ValueError: If ``bins`` is a non-positive integer, if the intervals
            are not right-closed, or if the intervals overlap.
        NotImplementedError: If ``labels`` is anything other than ``False``.
    """
    if isinstance(bins, int) and bins <= 0:
        raise ValueError("`bins` should be a positive integer.")

    if isinstance(bins, Iterable):
        if not isinstance(bins, pd.IntervalIndex):
            # `list(...)` also lets one-shot iterables (generators) be used.
            bins = pd.IntervalIndex.from_tuples(list(bins))

        # CutOp compares with `x > left` and `x <= right` and labels the result
        # "left_exclusive"/"right_inclusive", so any other closedness would be
        # binned incorrectly without warning. Reject it up front instead.
        if bins.closed != "right":
            raise ValueError(
                "Only right-closed intervals are supported, "
                f"got closed={bins.closed!r}."
            )

        if bins.is_overlapping:
            raise ValueError("Overlapping IntervalIndex is not accepted.")

    if labels is not False:
        raise NotImplementedError(
            f"Only labels=False is supported in BigQuery DataFrames so far. {constants.FEEDBACK_LINK}"
        )

    return x._apply_window_op(agg_ops.CutOp(bins), window_spec=core.WindowSpec())


Expand Down
41 changes: 29 additions & 12 deletions bigframes/operations/aggregations.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import ibis.expr.datatypes as ibis_dtypes
import ibis.expr.types as ibis_types
from pandas import Int64Dtype
import pandas as pd

import bigframes.constants as constants
import bigframes.dtypes as dtypes
Expand Down Expand Up @@ -228,21 +229,37 @@ def skips_nulls(self):


class CutOp(WindowOp):
def __init__(self, bins: typing.Union[int, pd.IntervalIndex]):
    """Store the binning specification.

    Args:
        bins: A positive integer (number of equal-width bins) or a
            ``pd.IntervalIndex`` giving explicit bins.

    Raises:
        ValueError: If ``bins`` is an integer that is not positive.
    """
    if isinstance(bins, int):
        if bins <= 0:
            raise ValueError("`bins` should be a positive integer.")
        # Keep both forms: the Python int drives the loop count in
        # `_as_ibis`, the ibis scalar takes part in the expression itself.
        self._bins_int = bins
        self._bins = dtypes.literal_to_ibis_scalar(bins, force_dtype=Int64Dtype())
    else:
        # `_bins_int == 0` is the marker `_as_ibis` uses to select the
        # explicit-interval branch.
        self._bins_int = 0
        self._bins = bins

def _as_ibis(self, x: ibis_types.Column, window=None):
    """Build the ibis case expression that assigns each value of ``x`` to a bin.

    Args:
        x: The ibis column to bin.
        window: Optional window forwarded to ``_apply_window_if_present`` when
            computing the column minimum and maximum (integer bins only).

    Returns:
        The finished case expression (``out.end()``). With integer bins each
        branch yields an Int64 bin number; with explicit intervals each branch
        yields a struct with ``left_exclusive`` and ``right_inclusive`` fields.
        Rows matching no branch get whatever ``end()`` produces without an
        else clause (presumably NULL -- confirm against ibis).
    """
    out = ibis.case()

    if self._bins_int > 0:
        # Equal-width bins spanning [min, max] of the column.
        col_min = _apply_window_if_present(x.min(), window)
        col_max = _apply_window_if_present(x.max(), window)
        bin_width = (col_max - col_min) / self._bins

        # The first `bins - 1` branches test the upper edge of each bin in
        # ascending order, so the first match is the correct bin.
        for this_bin in range(self._bins_int - 1):
            out = out.when(
                x <= (col_min + (this_bin + 1) * bin_width),
                dtypes.literal_to_ibis_scalar(this_bin, force_dtype=Int64Dtype()),
            )
        # Every remaining non-null value belongs to the last bin.
        out = out.when(x.notnull(), self._bins - 1)
    else:
        # Explicit intervals. The comparison is always (left, right]; the
        # interval's own `closed` attribute is not consulted here.
        for interval in self._bins:
            condition = (x > interval.left) & (x <= interval.right)
            interval_struct = ibis.struct(
                {"left_exclusive": interval.left, "right_inclusive": interval.right}
            )
            out = out.when(condition, interval_struct)
    return out.end()

@property
Expand Down
2 changes: 1 addition & 1 deletion bigframes/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1324,7 +1324,7 @@ def to_csv(self, path_or_buf=None, **kwargs) -> typing.Optional[str]:
return self.to_pandas().to_csv(path_or_buf, **kwargs)

def to_dict(self, into: type[dict] = dict) -> typing.Mapping:
    """Materialize the series with ``to_pandas()`` and convert it to a mapping.

    Args:
        into: The mapping class to build, forwarded to
            ``pandas.Series.to_dict``.

    Returns:
        The mapping returned by ``pandas.Series.to_dict``.
    """
    # `into` is forwarded by keyword so the call does not depend on pandas
    # accepting it positionally.
    return typing.cast(dict, self.to_pandas().to_dict(into=into))  # type: ignore

def to_excel(self, excel_writer, sheet_name="Sheet1", **kwargs) -> None:
return self.to_pandas().to_excel(excel_writer, sheet_name, **kwargs)
Expand Down
4 changes: 2 additions & 2 deletions bigframes/session/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1017,13 +1017,13 @@ def read_csv(
header=header,
names=names,
index_col=index_col,
usecols=usecols,
usecols=usecols, # type: ignore
dtype=dtype,
engine=engine,
encoding=encoding,
**kwargs,
)
return self.read_pandas(pandas_df)
return self.read_pandas(pandas_df) # type: ignore

def read_pickle(
self,
Expand Down
34 changes: 34 additions & 0 deletions tests/system/small/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,40 @@ def test_cut(scalars_dfs):
pd.testing.assert_series_equal(bf_result, pd_result)


@pytest.mark.parametrize(
    ("bins",),
    [
        ([(-5, 2), (2, 3), (-3000, -10)],),
        (pd.IntervalIndex.from_tuples([(1, 2), (2, 3), (4, 5)]),),
    ],
)
def test_cut_with_interval(scalars_dfs, bins):
    """bpd.cut with explicit intervals should agree with pandas.cut."""
    scalars_df, scalars_pandas_df = scalars_dfs
    bf_result = bpd.cut(scalars_df["int64_too"], bins, labels=False).to_pandas()

    if isinstance(bins, list):
        bins = pd.IntervalIndex.from_tuples(bins)
    pd_result = pd.cut(scalars_pandas_df["int64_too"], bins, labels=False)

    # Convert the pandas result to the struct-like dicts used on the
    # BigQuery DataFrames side. Missing values carry code -1, which indexes
    # the last category, so the `pd.notna(val)` guard is what maps them to
    # pd.NA.
    pd_result_converted = pd.Series(
        [
            {"left_exclusive": interval.left, "right_inclusive": interval.right}
            if pd.notna(val)
            else pd.NA
            for val, interval in zip(
                pd_result, pd_result.cat.categories[pd_result.cat.codes]
            )
        ],
        name=pd_result.name,
    )

    # The index is deliberately not compared: `pd_result_converted` is built
    # from a plain list and carries a default RangeIndex.
    pd.testing.assert_series_equal(
        bf_result, pd_result_converted, check_index=False, check_dtype=False
    )


@pytest.mark.parametrize(
("q",),
[
Expand Down
52 changes: 41 additions & 11 deletions third_party/bigframes_vendored/pandas/core/reshape/tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,31 +24,61 @@ def cut(
``labels=False`` implies you just want the bins back.
Examples:
.. code-block::
import bigframes.pandas as pd
**Examples:**
>>> import bigframes.pandas as bpd
>>> bpd.options.display.progress_bar = None
>>> s = bpd.Series([0, 1, 5, 10])
>>> s
0 0
1 1
2 5
3 10
dtype: Int64
pd.options.display.progress_bar = None
s = pd.Series([0, 1, 1, 2])
pd.cut(s, bins=4, labels=False)
Cut with an integer (equal-width bins):
>>> bpd.cut(s, bins=4, labels=False)
0 0
1 1
1 0
2 1
3 3
dtype: Int64
Cut with pd.IntervalIndex (requires importing pandas to build the IntervalIndex):
>>> import pandas as pd
>>> interval_index = pd.IntervalIndex.from_tuples([(0, 1), (1, 5), (5, 20)])
>>> bpd.cut(s, bins=interval_index, labels=False)
0 <NA>
1 {'left_exclusive': 0, 'right_inclusive': 1}
2 {'left_exclusive': 1, 'right_inclusive': 5}
3 {'left_exclusive': 5, 'right_inclusive': 20}
dtype: struct<left_exclusive: int64, right_inclusive: int64>[pyarrow]
Cut with an iterable of tuples:
>>> bins_tuples = [(0, 1), (1, 4), (5, 20)]
>>> bpd.cut(s, bins=bins_tuples, labels=False)
0 <NA>
1 {'left_exclusive': 0, 'right_inclusive': 1}
2 <NA>
3 {'left_exclusive': 5, 'right_inclusive': 20}
dtype: struct<left_exclusive: int64, right_inclusive: int64>[pyarrow]
Args:
x (Series):
The input Series to be binned. Must be 1-dimensional.
bins (int):
bins (int, pd.IntervalIndex, Iterable[Tuple[Union[int, float], Union[int, float]]]):
The criteria to bin by.
int : Defines the number of equal-width bins in the range of `x`. The
int: Defines the number of equal-width bins in the range of `x`. The
range of `x` is extended by .1% on each side to include the minimum
and maximum values of `x`.
pd.IntervalIndex or Iterable of tuples: Defines the exact bins to be used.
The bins must not overlap; overlapping bins raise a ValueError.
labels (None):
Specifies the labels for the returned bins. Must be the same length as
the resulting bins. If False, returns only integer indicators of the
Expand Down

0 comments on commit 6c1969a

Please sign in to comment.