Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix RollingGroupby and ExpandingGroupby to handle agg_columns. #1546

Merged
merged 3 commits on
May 29, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions databricks/koalas/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2024,7 +2024,7 @@ def rolling(self, window, min_periods=None):
Series.groupby
DataFrame.groupby
"""
return RollingGroupby(self, self._groupkeys, window, min_periods=min_periods)
return RollingGroupby(self, window, min_periods=min_periods)

def expanding(self, min_periods=1):
"""
Expand All @@ -2046,7 +2046,7 @@ def expanding(self, min_periods=1):
Series.groupby
DataFrame.groupby
"""
return ExpandingGroupby(self, self._groupkeys, min_periods=min_periods)
return ExpandingGroupby(self, min_periods=min_periods)

def _reduce_for_stat_function(self, sfun, only_numeric, should_include_groupkeys=False):
if should_include_groupkeys:
Expand Down
175 changes: 117 additions & 58 deletions databricks/koalas/tests/test_expanding.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,26 +26,30 @@

class ExpandingTest(ReusedSQLTestCase, TestUtils):
def _test_expanding_func(self, f):
kser = ks.Series([1, 2, 3], index=np.random.rand(3))
pser = kser.to_pandas()
self.assert_eq(repr(getattr(kser.expanding(2), f)()), repr(getattr(pser.expanding(2), f)()))
pser = pd.Series([1, 2, 3], index=np.random.rand(3))
kser = ks.from_pandas(pser)
self.assert_eq(
getattr(kser.expanding(2), f)(), getattr(pser.expanding(2), f)(), almost=True
)

# Multiindex
kser = ks.Series(
pser = pd.Series(
[1, 2, 3], index=pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")])
)
pser = kser.to_pandas()
self.assert_eq(repr(getattr(kser.expanding(2), f)()), repr(getattr(pser.expanding(2), f)()))
kser = ks.from_pandas(pser)
self.assert_eq(
getattr(kser.expanding(2), f)(), getattr(pser.expanding(2), f)(), almost=True
)

kdf = ks.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]})
pdf = kdf.to_pandas()
self.assert_eq(repr(getattr(kdf.expanding(2), f)()), repr(getattr(pdf.expanding(2), f)()))
pdf = pd.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]}, index=np.random.rand(4))
kdf = ks.from_pandas(pdf)
self.assert_eq(getattr(kdf.expanding(2), f)(), getattr(pdf.expanding(2), f)(), almost=True)

# Multiindex column
kdf = ks.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]}, index=np.random.rand(4))
kdf.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])
pdf = kdf.to_pandas()
self.assert_eq(repr(getattr(kdf.expanding(2), f)()), repr(getattr(pdf.expanding(2), f)()))
columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])
pdf.columns = columns
kdf.columns = columns
self.assert_eq(getattr(kdf.expanding(2), f)(), getattr(pdf.expanding(2), f)(), almost=True)

def test_expanding_error(self):
with self.assertRaisesRegex(ValueError, "min_periods must be >= 0"):
Expand All @@ -66,39 +70,36 @@ def test_expanding_count(self):
self._test_expanding_func("count")
else:
# Series
kser = ks.Series([1, 2, 3], index=np.random.rand(3))
expected_result = ks.Series([None, 2.0, 3.0], index=kser.index.to_pandas())
idx = np.random.rand(3)
kser = ks.Series([1, 2, 3], index=idx, name="a")
expected_result = pd.Series([None, 2.0, 3.0], index=idx, name="a")
self.assert_eq(
repr(kser.expanding(2).count().sort_index()), repr(expected_result.sort_index())
kser.expanding(2).count().sort_index(), expected_result.sort_index(), almost=True
)
# MultiIndex
kser = ks.Series(
[1, 2, 3], index=pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")])
)
expected_result = ks.Series([None, 2.0, 3.0], index=kser.index.to_pandas())
midx = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")])
kser = ks.Series([1, 2, 3], index=midx, name="a")
expected_result = pd.Series([None, 2.0, 3.0], index=midx, name="a")
self.assert_eq(
repr(kser.expanding(2).count().sort_index()), repr(expected_result.sort_index())
kser.expanding(2).count().sort_index(), expected_result.sort_index(), almost=True
)

# DataFrame
kdf = ks.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]})
expected_result = ks.DataFrame({"a": [None, 2.0, 3.0, 4.0], "b": [None, 2.0, 3.0, 4.0]})
expected_result = pd.DataFrame({"a": [None, 2.0, 3.0, 4.0], "b": [None, 2.0, 3.0, 4.0]})
self.assert_eq(
repr(kdf.expanding(2).count().sort_index()), repr(expected_result.sort_index())
kdf.expanding(2).count().sort_index(), expected_result.sort_index(), almost=True
)

# MultiIndex columns
kdf = ks.DataFrame(
{"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]}, index=np.random.rand(4)
)
idx = np.random.rand(4)
kdf = ks.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]}, index=idx)
kdf.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])
expected_result = ks.DataFrame(
{"a": [None, 2.0, 3.0, 4.0], "b": [None, 2.0, 3.0, 4.0]},
index=kdf.index.to_pandas(),
expected_result = pd.DataFrame(
{("a", "x"): [None, 2.0, 3.0, 4.0], ("a", "y"): [None, 2.0, 3.0, 4.0]}, index=idx,
)
expected_result.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])
self.assert_eq(
repr(kdf.expanding(2).count().sort_index()), repr(expected_result.sort_index())
kdf.expanding(2).count().sort_index(), expected_result.sort_index(), almost=True
)

def test_expanding_min(self):
Expand All @@ -120,42 +121,69 @@ def test_expanding_var(self):
self._test_expanding_func("var")

def _test_groupby_expanding_func(self, f):
kser = ks.Series([1, 2, 3], index=np.random.rand(3))
pser = kser.to_pandas()
pser = pd.Series([1, 2, 3], index=np.random.rand(3), name="a")
kser = ks.from_pandas(pser)
self.assert_eq(
repr(getattr(kser.groupby(kser).expanding(2), f)().sort_index()),
repr(getattr(pser.groupby(pser).expanding(2), f)().sort_index()),
getattr(kser.groupby(kser).expanding(2), f)().sort_index(),
getattr(pser.groupby(pser).expanding(2), f)().sort_index(),
almost=True,
)

# Multiindex
kser = ks.Series(
[1, 2, 3], index=pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")])
pser = pd.Series(
[1, 2, 3],
index=pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")]),
name="a",
)
pser = kser.to_pandas()
kser = ks.from_pandas(pser)
self.assert_eq(
repr(getattr(kser.groupby(kser).expanding(2), f)().sort_index()),
repr(getattr(pser.groupby(pser).expanding(2), f)().sort_index()),
getattr(kser.groupby(kser).expanding(2), f)().sort_index(),
getattr(pser.groupby(pser).expanding(2), f)().sort_index(),
almost=True,
)

kdf = ks.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]})
pdf = kdf.to_pandas()
pdf = pd.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]})
kdf = ks.from_pandas(pdf)
self.assert_eq(
getattr(kdf.groupby(kdf.a).expanding(2), f)().sort_index(),
getattr(pdf.groupby(pdf.a).expanding(2), f)().sort_index(),
almost=True,
)
self.assert_eq(
getattr(kdf.groupby(kdf.a + 1).expanding(2), f)().sort_index(),
getattr(pdf.groupby(pdf.a + 1).expanding(2), f)().sort_index(),
almost=True,
)
self.assert_eq(
repr(getattr(kdf.groupby(kdf.a).expanding(2), f)().sort_index()),
repr(getattr(pdf.groupby(pdf.a).expanding(2), f)().sort_index()),
getattr(kdf.b.groupby(kdf.a).expanding(2), f)().sort_index(),
getattr(pdf.b.groupby(pdf.a).expanding(2), f)().sort_index(),
almost=True,
)
self.assert_eq(
getattr(kdf.groupby(kdf.a)["b"].expanding(2), f)().sort_index(),
getattr(pdf.groupby(pdf.a)["b"].expanding(2), f)().sort_index(),
almost=True,
)
self.assert_eq(
getattr(kdf.groupby(kdf.a)[["b"]].expanding(2), f)().sort_index(),
getattr(pdf.groupby(pdf.a)[["b"]].expanding(2), f)().sort_index(),
almost=True,
)

# Multiindex column
kdf = ks.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]})
kdf.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])
pdf = kdf.to_pandas()
columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])
pdf.columns = columns
kdf.columns = columns
self.assert_eq(
repr(getattr(kdf.groupby(("a", "x")).expanding(2), f)().sort_index()),
repr(getattr(pdf.groupby(("a", "x")).expanding(2), f)().sort_index()),
getattr(kdf.groupby(("a", "x")).expanding(2), f)().sort_index(),
getattr(pdf.groupby(("a", "x")).expanding(2), f)().sort_index(),
almost=True,
)

self.assert_eq(
repr(getattr(kdf.groupby([("a", "x"), ("a", "y")]).expanding(2), f)().sort_index()),
repr(getattr(pdf.groupby([("a", "x"), ("a", "y")]).expanding(2), f)().sort_index()),
getattr(kdf.groupby([("a", "x"), ("a", "y")]).expanding(2), f)().sort_index(),
getattr(pdf.groupby([("a", "x"), ("a", "y")]).expanding(2), f)().sort_index(),
almost=True,
)

def test_groupby_expanding_count(self):
Expand All @@ -169,39 +197,68 @@ def test_groupby_expanding_count(self):
midx = pd.MultiIndex.from_tuples(
list(zip(kser.to_pandas().values, kser.index.to_pandas().values))
)
expected_result = ks.Series([np.nan, np.nan, np.nan], index=midx)
expected_result = pd.Series([np.nan, np.nan, np.nan], index=midx)
self.assert_eq(
kser.groupby(kser).expanding(2).count().sort_index(),
expected_result.sort_index(),
almost=True,
)

# MultiIndex
kser = ks.Series(
[1, 2, 3], index=pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")])
)
midx = pd.MultiIndex.from_tuples([(1, "a", "x"), (2, "a", "y"), (3, "b", "z")])
expected_result = ks.Series([np.nan, np.nan, np.nan], index=midx)
expected_result = pd.Series([np.nan, np.nan, np.nan], index=midx)
self.assert_eq(
kser.groupby(kser).expanding(2).count().sort_index(),
expected_result.sort_index(),
almost=True,
)

# DataFrame
kdf = ks.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]})
midx = pd.MultiIndex.from_tuples([(1, 0), (2, 1), (2, 3), (3, 2)])
expected_result = ks.DataFrame(
expected_result = pd.DataFrame(
{"a": [None, None, 2.0, None], "b": [None, None, 2.0, None]}, index=midx
)
self.assert_eq(
kdf.groupby(kdf.a).expanding(2).count().sort_index(),
expected_result.sort_index(),
almost=True,
)
expected_result = pd.DataFrame(
{"a": [None, None, 2.0, None], "b": [None, None, 2.0, None]},
index=pd.MultiIndex.from_tuples([(2, 0), (3, 1), (3, 3), (4, 2)]),
)
self.assert_eq(
kdf.groupby(kdf.a + 1).expanding(2).count().sort_index(),
expected_result.sort_index(),
almost=True,
)
expected_result = pd.Series([None, None, 2.0, None], index=midx, name="b")
self.assert_eq(
kdf.b.groupby(kdf.a).expanding(2).count().sort_index(),
expected_result.sort_index(),
almost=True,
)
self.assert_eq(
kdf.groupby(kdf.a)["b"].expanding(2).count().sort_index(),
expected_result.sort_index(),
almost=True,
)
expected_result = pd.DataFrame({"b": [None, None, 2.0, None]}, index=midx)
self.assert_eq(
kdf.groupby(kdf.a)[["b"]].expanding(2).count().sort_index(),
expected_result.sort_index(),
almost=True,
)

# MultiIndex column
kdf = ks.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]})
kdf.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])
midx = pd.MultiIndex.from_tuples([(1, 0), (2, 1), (2, 3), (3, 2)])
expected_result = ks.DataFrame(
expected_result = pd.DataFrame(
{"a": [None, None, 2.0, None], "b": [None, None, 2.0, None]}, index=midx
)
expected_result.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])
Expand All @@ -211,11 +268,13 @@ def test_groupby_expanding_count(self):
almost=True,
)
midx = pd.MultiIndex.from_tuples([(1, 4.0, 0), (2, 1.0, 3), (2, 2.0, 1), (3, 3.0, 2)])
expected_result = ks.DataFrame(
{"a": [np.nan, np.nan, np.nan, np.nan], "b": [np.nan, np.nan, np.nan, np.nan]},
expected_result = pd.DataFrame(
{
("a", "x"): [np.nan, np.nan, np.nan, np.nan],
("a", "y"): [np.nan, np.nan, np.nan, np.nan],
},
index=midx,
)
expected_result.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])
self.assert_eq(
kdf.groupby([("a", "x"), ("a", "y")]).expanding(2).count().sort_index(),
expected_result.sort_index(),
Expand Down
Loading