Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add IntervalIndex support to bigframes.pandas.cut #254

Merged
merged 7 commits into from
Dec 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 17 additions & 3 deletions bigframes/core/reshape/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@
from __future__ import annotations

import typing
from typing import Iterable, Literal, Optional, Union
from typing import Iterable, Literal, Optional, Tuple, Union

import pandas as pd

import bigframes.constants as constants
import bigframes.core as core
Expand Down Expand Up @@ -108,17 +110,29 @@ def concat(

def cut(
x: bigframes.series.Series,
bins: int,
bins: Union[
int,
pd.IntervalIndex,
Iterable[Tuple[Union[int, float], Union[int, float]]],
],
*,
labels: Optional[bool] = None,
) -> bigframes.series.Series:
if bins <= 0:
if isinstance(bins, int) and bins <= 0:
raise ValueError("`bins` should be a positive integer.")

if isinstance(bins, Iterable):
if not isinstance(bins, pd.IntervalIndex):
bins = pd.IntervalIndex.from_tuples(list(bins))

if bins.is_overlapping:
raise ValueError("Overlapping IntervalIndex is not accepted.")

if labels is not False:
raise NotImplementedError(
f"Only labels=False is supported in BigQuery DataFrames so far. {constants.FEEDBACK_LINK}"
)

return x._apply_window_op(agg_ops.CutOp(bins), window_spec=core.WindowSpec())


Expand Down
41 changes: 29 additions & 12 deletions bigframes/operations/aggregations.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import ibis.expr.datatypes as ibis_dtypes
import ibis.expr.types as ibis_types
from pandas import Int64Dtype
import pandas as pd

import bigframes.constants as constants
import bigframes.dtypes as dtypes
Expand Down Expand Up @@ -228,21 +229,37 @@ def skips_nulls(self):


class CutOp(WindowOp):
def __init__(self, bins: int):
self._bins_ibis = dtypes.literal_to_ibis_scalar(bins, force_dtype=Int64Dtype())
self._bins_int = bins
def __init__(self, bins: typing.Union[int, pd.IntervalIndex]):
if isinstance(bins, int):
if not bins > 0:
raise ValueError("`bins` should be a positive integer.")
self._bins_int = bins
Copy link
Contributor

@shobsi shobsi Dec 12, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This might be a good place to check bins>0. I know we are doing it in the cut method, but this is a public class as well, and also contains the business logic if-else at line 243.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done, added a check here.

self._bins = dtypes.literal_to_ibis_scalar(bins, force_dtype=Int64Dtype())
else:
self._bins_int = 0
self._bins = bins

def _as_ibis(self, x: ibis_types.Column, window=None):
col_min = _apply_window_if_present(x.min(), window)
col_max = _apply_window_if_present(x.max(), window)
bin_width = (col_max - col_min) / self._bins_ibis
out = ibis.case()
for this_bin in range(self._bins_int - 1):
out = out.when(
x <= (col_min + (this_bin + 1) * bin_width),
dtypes.literal_to_ibis_scalar(this_bin, force_dtype=Int64Dtype()),
)
out = out.when(x.notnull(), self._bins_ibis - 1)

if self._bins_int > 0:
col_min = _apply_window_if_present(x.min(), window)
col_max = _apply_window_if_present(x.max(), window)
bin_width = (col_max - col_min) / self._bins

for this_bin in range(self._bins_int - 1):
out = out.when(
x <= (col_min + (this_bin + 1) * bin_width),
dtypes.literal_to_ibis_scalar(this_bin, force_dtype=Int64Dtype()),
)
out = out.when(x.notnull(), self._bins - 1)
else:
for interval in self._bins:
condition = (x > interval.left) & (x <= interval.right)
interval_struct = ibis.struct(
{"left_exclusive": interval.left, "right_inclusive": interval.right}
)
out = out.when(condition, interval_struct)
return out.end()

@property
Expand Down
2 changes: 1 addition & 1 deletion bigframes/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1324,7 +1324,7 @@ def to_csv(self, path_or_buf=None, **kwargs) -> typing.Optional[str]:
return self.to_pandas().to_csv(path_or_buf, **kwargs)

def to_dict(self, into: type[dict] = dict) -> typing.Mapping:
return typing.cast(dict, self.to_pandas().to_dict(into))
return typing.cast(dict, self.to_pandas().to_dict(into)) # type: ignore

def to_excel(self, excel_writer, sheet_name="Sheet1", **kwargs) -> None:
return self.to_pandas().to_excel(excel_writer, sheet_name, **kwargs)
Expand Down
4 changes: 2 additions & 2 deletions bigframes/session/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1017,13 +1017,13 @@ def read_csv(
header=header,
names=names,
index_col=index_col,
usecols=usecols,
usecols=usecols, # type: ignore
dtype=dtype,
engine=engine,
encoding=encoding,
**kwargs,
)
return self.read_pandas(pandas_df)
return self.read_pandas(pandas_df) # type: ignore

def read_pickle(
self,
Expand Down
34 changes: 34 additions & 0 deletions tests/system/small/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,40 @@ def test_cut(scalars_dfs):
pd.testing.assert_series_equal(bf_result, pd_result)


@pytest.mark.parametrize(
    ("bins",),
    [
        ([(-5, 2), (2, 3), (-3000, -10)],),
        (pd.IntervalIndex.from_tuples([(1, 2), (2, 3), (4, 5)]),),
    ],
)
def test_cut_with_interval(scalars_dfs, bins):
    """Check that ``bpd.cut`` with explicit intervals agrees with ``pd.cut``.

    ``bins`` is supplied either as a list of ``(left, right)`` tuples or as a
    ``pd.IntervalIndex``; both forms are passed to ``bpd.cut`` unchanged.
    """
    scalars_df, scalars_pandas_df = scalars_dfs
    bf_result = bpd.cut(scalars_df["int64_too"], bins, labels=False).to_pandas()

    # The pandas reference call always receives an IntervalIndex, so the
    # tuple-list form is converted first.
    if isinstance(bins, list):
        bins = pd.IntervalIndex.from_tuples(bins)
    pd_result = pd.cut(scalars_pandas_df["int64_too"], bins, labels=False)

    # Convert to match data format: each matched interval becomes a dict with
    # its bounds, and rows that fall in no interval become pd.NA.
    pd_result_converted = pd.Series(
        [
            {"left_exclusive": interval.left, "right_inclusive": interval.right}
            if pd.notna(val)
            else pd.NA
            for val, interval in zip(
                pd_result, pd_result.cat.categories[pd_result.cat.codes]
            )
        ],
        name=pd_result.name,
    )

    # The index is deliberately excluded from the comparison
    # (check_index=False), so no index dtype conversion is performed here.
    pd.testing.assert_series_equal(
        bf_result, pd_result_converted, check_index=False, check_dtype=False
    )


@pytest.mark.parametrize(
("q",),
[
Expand Down
52 changes: 41 additions & 11 deletions third_party/bigframes_vendored/pandas/core/reshape/tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,31 +24,61 @@ def cut(

``labels=False`` implies you just want the bins back.

Examples:

.. code-block::

import bigframes.pandas as pd
**Examples:**

>>> import bigframes.pandas as bpd
>>> bpd.options.display.progress_bar = None
>>> s = bpd.Series([0, 1, 5, 10])
>>> s
0 0
1 1
2 5
3 10
dtype: Int64

pd.options.display.progress_bar = None
s = pd.Series([0, 1, 1, 2])
pd.cut(s, bins=4, labels=False)
Cut with an integer (equal-width bins):

>>> bpd.cut(s, bins=4, labels=False)
0 0
1 1
1 0
2 1
3 3
dtype: Int64

Cut with a pd.IntervalIndex (this requires importing pandas for IntervalIndex):

>>> import pandas as pd

>>> interval_index = pd.IntervalIndex.from_tuples([(0, 1), (1, 5), (5, 20)])
>>> bpd.cut(s, bins=interval_index, labels=False)
0 <NA>
1 {'left_exclusive': 0, 'right_inclusive': 1}
2 {'left_exclusive': 1, 'right_inclusive': 5}
3 {'left_exclusive': 5, 'right_inclusive': 20}
dtype: struct<left_exclusive: int64, right_inclusive: int64>[pyarrow]

Cut with an iterable of tuples:

>>> bins_tuples = [(0, 1), (1, 4), (5, 20)]
>>> bpd.cut(s, bins=bins_tuples, labels=False)
0 <NA>
1 {'left_exclusive': 0, 'right_inclusive': 1}
2 <NA>
3 {'left_exclusive': 5, 'right_inclusive': 20}
dtype: struct<left_exclusive: int64, right_inclusive: int64>[pyarrow]

Args:
x (Series):
The input Series to be binned. Must be 1-dimensional.
bins (int):
bins (int, pd.IntervalIndex, Iterable[Tuple[Union[int, float], Union[int, float]]]):
The criteria to bin by.

int : Defines the number of equal-width bins in the range of `x`. The
int: Defines the number of equal-width bins in the range of `x`. The
range of `x` is extended by .1% on each side to include the minimum
and maximum values of `x`.

pd.IntervalIndex or Iterable of tuples: Defines the exact bins to be used.
These bins must be non-overlapping; overlapping bins raise a ValueError.
labels (None):
Specifies the labels for the returned bins. Must be the same length as
the resulting bins. If False, returns only integer indicators of the
Expand Down