Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add cudf python groupby.diff #9446

Merged
merged 4 commits into from
Oct 21, 2021
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -860,6 +860,38 @@ def last(self):
"""Get the last non-null value in each group."""
return self.agg("last")

def diff(self, periods=1, axis=0):
    """Get the difference between the values in each group.

    The result is the grouped values minus the output of the group-wise
    ``shift(periods=periods)``.

    Parameters
    ----------
    periods : int, default 1
        Periods to shift for calculating difference,
        accepts negative values.
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Take difference over rows (0) or columns (1).
        Only row-wise (0 or 'index') diff is supported.

    Returns
    -------
    Series or DataFrame
        First differences of the Series or DataFrame.

    Raises
    ------
    NotImplementedError
        If ``axis`` is anything other than 0 or 'index'.
    """
    # 'index' is the documented alias of axis 0, so accept both spellings
    # instead of rejecting a value the docstring advertises as valid.
    if axis not in (0, "index"):
        raise NotImplementedError("Only axis=0 is supported.")

    # Run the grouping's value columns through the groupby machinery; of
    # the returned triple only the (data, index) pair is needed here.
    value_columns = self.grouping.values
    _, (data, index), _ = self._groupby.groups(
        cudf.core.frame.Frame(value_columns._data)
    )
    # Rebuild an object of the same class as the one that was grouped.
    grouped = self.obj.__class__._from_data(data, index)
    # NOTE(review): presumably restores the caller's row order so the rows
    # line up with what ``shift`` returns -- confirm against the helper.
    grouped = self._mimic_pandas_order(grouped)

    result = grouped - self.shift(periods=periods)
    # NOTE(review): presumably carries dtype metadata over from the source
    # value columns onto the subtraction result -- confirm.
    return result._copy_type_metadata(value_columns)

def _scan_fill(self, method: str, limit: int) -> DataFrameOrSeries:
"""Internal implementation for `ffill` and `bfill`
"""
Expand Down
83 changes: 83 additions & 0 deletions python/cudf/cudf/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1913,6 +1913,89 @@ def test_groupby_shift_row_zero_shift(nelem, fill_value):
)


@pytest.mark.parametrize("nelem", [2, 3, 100, 1000])
@pytest.mark.parametrize("shift_perc", [0.5, 1.0, 1.5])
@pytest.mark.parametrize("direction", [1, -1])
def test_groupby_diff_row(nelem, shift_perc, direction):
    """Check that grouped ``diff`` in cudf matches pandas.

    The number of periods is ``nelem`` scaled by ``shift_perc`` and signed
    by ``direction``, so both forward and backward diffs are exercised,
    including offsets at and beyond the frame length.
    """
    group_keys = ["x", "y"]
    checked_columns = ["val", "val2"]

    pandas_frame = make_frame(
        pd.DataFrame, nelem=nelem, extra_vals=["val2"]
    )
    cudf_frame = cudf.from_pandas(pandas_frame)
    periods = direction * int(nelem * shift_perc)

    expected = pandas_frame.groupby(group_keys).diff(periods=periods)
    got = cudf_frame.groupby(group_keys).diff(periods=periods)

    assert_groupby_results_equal(
        expected[checked_columns], got[checked_columns]
    )


@pytest.mark.parametrize("nelem", [10, 50, 100, 1000])
@pytest.mark.parametrize("shift_perc", [0.5, 1.0, 1.5])
@pytest.mark.parametrize("direction", [1, -1])
def test_groupby_diff_row_mixed_numerics(nelem, shift_perc, direction):
    """Check grouped ``diff`` against pandas on mixed-dtype columns.

    Column "0" is the grouping key; columns "1" through "5" hold the
    nullable int64, float32, decimal64, datetime64[ns] and
    timedelta64[ns] values whose differences are compared.
    """
    t = rand_dataframe(
        dtypes_meta=[
            {"dtype": "int64", "null_frequency": 0, "cardinality": 10},
            {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10},
            {"dtype": "float32", "null_frequency": 0.4, "cardinality": 10},
            {"dtype": "decimal64", "null_frequency": 0.4, "cardinality": 10},
            {
                "dtype": "datetime64[ns]",
                "null_frequency": 0.4,
                "cardinality": 10,
            },
            {
                "dtype": "timedelta64[ns]",
                "null_frequency": 0.4,
                "cardinality": 10,
            },
        ],
        rows=nelem,
        use_threads=False,
    )
    pdf = t.to_pandas()
    gdf = cudf.from_pandas(pdf)
    n_shift = int(nelem * shift_perc) * direction

    expected = pdf.groupby(["0"]).diff(periods=n_shift)
    got = gdf.groupby(["0"]).diff(periods=n_shift)

    # Every non-key column is checked, including the timedelta column "5",
    # which was generated but previously left out of the comparison.
    value_columns = ["1", "2", "3", "4", "5"]
    assert_groupby_results_equal(
        expected[value_columns], got[value_columns]
    )


@pytest.mark.parametrize("nelem", [10, 50, 100, 1000])
def test_groupby_diff_row_zero_shift(nelem):
    """Check grouped ``diff`` with ``periods=0`` against pandas.

    Column "0" is the grouping key; columns "1" through "4" are the value
    columns whose zero-period differences are compared.
    """
    t = rand_dataframe(
        dtypes_meta=[
            {"dtype": "int64", "null_frequency": 0, "cardinality": 10},
            {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10},
            {"dtype": "float32", "null_frequency": 0.4, "cardinality": 10},
            {
                "dtype": "datetime64[ns]",
                "null_frequency": 0.4,
                "cardinality": 10,
            },
            {
                "dtype": "timedelta64[ns]",
                "null_frequency": 0.4,
                "cardinality": 10,
            },
        ],
        rows=nelem,
        use_threads=False,
    )
    pdf = t.to_pandas()
    gdf = cudf.from_pandas(pdf)

    # Exercise ``diff`` itself: the earlier version called ``shift`` here,
    # so the zero-period diff path was never actually tested.
    expected = pdf.groupby(["0"]).diff(periods=0)
    got = gdf.groupby(["0"]).diff(periods=0)

    value_columns = ["1", "2", "3", "4"]
    assert_groupby_results_equal(
        expected[value_columns], got[value_columns]
    )


# TODO: test for category columns when cudf.Scalar supports category type
@pytest.mark.parametrize("nelem", [10, 100, 1000])
def test_groupby_fillna_multi_value(nelem):
Expand Down