Skip to content

Commit

Permalink
Add missing tests for groupby (#709)
Browse files Browse the repository at this point in the history
Added missing tests for groupby, and fixed existing groupby tests that weren't running properly.

I would be grateful if someone could review these changes when possible.

Thanks.
  • Loading branch information
itholic authored and HyukjinKwon committed Aug 28, 2019
1 parent 19a32f0 commit 8b62fcc
Showing 1 changed file with 20 additions and 1 deletion.
21 changes: 20 additions & 1 deletion databricks/koalas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,11 @@ def test_aggregate(self):
pdf.groupby('A', as_index=as_index).agg({'B': ['min', 'max'],
'C': 'sum'}))

expected_error_message = (r"aggs must be a dict mapping from column name \(string\) "
r"to aggregate functions \(string or list of strings\).")
with self.assertRaisesRegex(ValueError, expected_error_message):
kdf.groupby('A', as_index=as_index).agg(0)

def test_all_any(self):
pdf = pd.DataFrame({'A': [1, 1, 2, 2, 3, 3, 4, 4, 5, 5],
'B': [True, True, True, False, False, False, None, True, None, False]})
Expand Down Expand Up @@ -168,6 +173,10 @@ def test_value_counts(self):
.value_counts(sort=True, ascending=False).sort_index()),
repr(pdf.groupby("A")['B']
.value_counts(sort=True, ascending=False).sort_index()))
self.assert_eq(repr(kdf.groupby("A")['B']
.value_counts(sort=True, ascending=True).sort_index()),
repr(pdf.groupby("A")['B']
.value_counts(sort=True, ascending=True).sort_index()))

def test_size(self):
pdf = pd.DataFrame({'A': [1, 2, 2, 3, 3, 3],
Expand Down Expand Up @@ -275,7 +284,7 @@ def test_nsmallest(self):
self.assert_eq(repr(kdf.groupby(['a'])['b'].nsmallest(2).sort_index()),
repr(pdf.groupby(['a'])['b'].nsmallest(2).sort_index()))
with self.assertRaisesRegex(ValueError, "idxmax do not support multi-index now"):
kdf.set_index(['a', 'b']).groupby(['c'])['d'].nlargest(1)
kdf.set_index(['a', 'b']).groupby(['c'])['d'].nsmallest(1)

def test_nlargest(self):
pdf = pd.DataFrame({'a': [1, 1, 1, 2, 2, 2, 3, 3, 3],
Expand Down Expand Up @@ -357,6 +366,8 @@ def test_apply(self):
pdf.groupby(['a', 'b']).apply(lambda x: x * x).sort_index())
self.assert_eq(kdf.groupby(['b'])['a'].apply(lambda x: x).sort_index(),
pdf.groupby(['b'])['a'].apply(lambda x: x).sort_index())
with self.assertRaisesRegex(TypeError, "<class 'int'> object is not callable"):
kdf.groupby("b").apply(1)

def test_apply_with_new_dataframe(self):
# Less than 1000 records will execute a shortcut by using collected pandas dataframe
Expand Down Expand Up @@ -405,6 +416,8 @@ def test_transform(self):
pdf.groupby(['a', 'b']).transform(lambda x: x * x).sort_index())
self.assert_eq(kdf.groupby(['b'])['a'].transform(lambda x: x).sort_index(),
pdf.groupby(['b'])['a'].transform(lambda x: x).sort_index())
with self.assertRaisesRegex(TypeError, "<class 'int'> object is not callable"):
kdf.groupby("b").transform(1)

def test_filter(self):
pdf = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6],
Expand All @@ -425,6 +438,8 @@ def test_filter(self):
pdf.groupby("b").filter(lambda x: x.b.mean() < 4).sort_index())
self.assert_eq(kdf.groupby(['a', 'b']).filter(lambda x: any(x.a == 2)).sort_index(),
pdf.groupby(['a', 'b']).filter(lambda x: any(x.a == 2)).sort_index())
with self.assertRaisesRegex(TypeError, "<class 'int'> object is not callable"):
kdf.groupby("b").filter(1)

def test_idxmax(self):
pdf = pd.DataFrame({'a': [1, 1, 2, 2, 3],
Expand All @@ -433,6 +448,8 @@ def test_idxmax(self):
kdf = koalas.DataFrame(pdf)
self.assert_eq(pdf.groupby(['a']).idxmax(),
kdf.groupby(['a']).idxmax().sort_index())
self.assert_eq(pdf.groupby(['a']).idxmax(skipna=False),
kdf.groupby(['a']).idxmax(skipna=False).sort_index())
with self.assertRaisesRegex(ValueError, 'idxmax only support one-level index now'):
kdf.set_index(['a', 'b']).groupby(['c']).idxmax()

Expand All @@ -443,6 +460,8 @@ def test_idxmin(self):
kdf = koalas.DataFrame(pdf)
self.assert_eq(pdf.groupby(['a']).idxmin(),
kdf.groupby(['a']).idxmin().sort_index())
self.assert_eq(pdf.groupby(['a']).idxmin(skipna=False),
kdf.groupby(['a']).idxmin(skipna=False).sort_index())
with self.assertRaisesRegex(ValueError, 'idxmin only support one-level index now'):
kdf.set_index(['a', 'b']).groupby(['c']).idxmin()

Expand Down

0 comments on commit 8b62fcc

Please sign in to comment.