Skip to content

Commit

Permalink
Matching behavior with pandas 1.1.2 (#1688)
Browse files Browse the repository at this point in the history
This should resolve #1685 

- [x] DataFrame.truncate
- [x] AtIndexer with MultiIndex
- [x] GroupBy.nunique
- [x] Index.monotonic
- [x] GroupByRolling.max (Resolved in pandas-dev/pandas#36152)
- [x] GroupByRolling.mean (ditto)
- [x] GroupByRolling.min (ditto)
- [x] GroupByRolling.std (ditto)
- [x] GroupByRolling.sum (ditto)
- [x] GroupByRolling.var (ditto)
- [x] Series.truncate
  • Loading branch information
itholic authored Sep 21, 2020
1 parent 91210a2 commit 1ccb7ea
Show file tree
Hide file tree
Showing 10 changed files with 123 additions and 77 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/master.yml
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ jobs:
pyarrow-version: 0.15.1
- python-version: 3.8
spark-version: 3.0.1
pandas-version: 1.0.5
pandas-version: 1.1.2
pyarrow-version: 1.0.1
default-index-type: 'distributed-sequence'
env:
Expand Down
10 changes: 8 additions & 2 deletions databricks/koalas/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -2246,10 +2246,16 @@ def truncate(self, before=None, after=None, axis=None, copy=True):
raise ValueError("Truncate: %s must be after %s" % (after, before))

if isinstance(self, ks.Series):
result = first_series(self.to_frame().loc[before:after]).rename(self.name)
if indexes_increasing:
result = first_series(self.to_frame().loc[before:after]).rename(self.name)
else:
result = first_series(self.to_frame().loc[after:before]).rename(self.name)
elif isinstance(self, ks.DataFrame):
if axis == 0:
result = self.loc[before:after]
if indexes_increasing:
result = self.loc[before:after]
else:
result = self.loc[after:before]
elif axis == 1:
result = self.loc[:, before:after]

Expand Down
32 changes: 11 additions & 21 deletions databricks/koalas/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2082,12 +2082,12 @@ def nunique(self, dropna=True):
4 ham 5 x
5 ham 5 y
>>> df.groupby('id').nunique().sort_index() # doctest: +NORMALIZE_WHITESPACE
id value1 value2
>>> df.groupby('id').nunique().sort_index() # doctest: +SKIP
value1 value2
id
egg 1 1 1
ham 1 1 2
spam 1 2 1
egg 1 1
ham 1 2
spam 2 1
>>> df.groupby('id')['value1'].nunique().sort_index() # doctest: +NORMALIZE_WHITESPACE
id
Expand All @@ -2104,10 +2104,7 @@ def nunique(self, dropna=True):
+ F.when(F.count(F.when(col.isNull(), 1).otherwise(None)) >= 1, 1).otherwise(0)
)

should_include_groupkeys = isinstance(self, DataFrameGroupBy)
return self._reduce_for_stat_function(
stat_function, only_numeric=False, should_include_groupkeys=should_include_groupkeys
)
return self._reduce_for_stat_function(stat_function, only_numeric=False)

def rolling(self, window, min_periods=None):
"""
Expand Down Expand Up @@ -2158,13 +2155,9 @@ def expanding(self, min_periods=1):
"""
return ExpandingGroupby(self, min_periods=min_periods)

def _reduce_for_stat_function(self, sfun, only_numeric, should_include_groupkeys=False):
if should_include_groupkeys:
agg_columns = self._groupkeys + self._agg_columns
agg_columns_scols = self._groupkeys_scols + self._agg_columns_scols
else:
agg_columns = self._agg_columns
agg_columns_scols = self._agg_columns_scols
def _reduce_for_stat_function(self, sfun, only_numeric):
agg_columns = self._agg_columns
agg_columns_scols = self._agg_columns_scols

groupkey_names = [SPARK_INDEX_NAME_FORMAT(i) for i in range(len(self._groupkeys))]
groupkey_scols = [s.alias(name) for s, name in zip(self._groupkeys_scols, groupkey_names)]
Expand Down Expand Up @@ -2541,11 +2534,8 @@ def _kdf(self) -> DataFrame:
def _agg_columns(self):
return [self._kser]

def _reduce_for_stat_function(self, sfun, only_numeric, should_include_groupkeys=False):
assert not should_include_groupkeys, should_include_groupkeys
return first_series(
super()._reduce_for_stat_function(sfun, only_numeric, should_include_groupkeys)
)
def _reduce_for_stat_function(self, sfun, only_numeric):
return first_series(super()._reduce_for_stat_function(sfun, only_numeric))

def agg(self, *args, **kwargs):
return MissingPandasLikeSeriesGroupBy.agg(self, *args, **kwargs)
Expand Down
30 changes: 24 additions & 6 deletions databricks/koalas/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -3630,16 +3630,27 @@ def test_truncate(self):
self.assert_eq(kdf1.truncate(after=400), pdf1.truncate(after=400))
self.assert_eq(kdf1.truncate(copy=False), pdf1.truncate(copy=False))
self.assert_eq(kdf1.truncate(-20, 400, copy=False), pdf1.truncate(-20, 400, copy=False))
self.assert_eq(kdf2.truncate(0, 550), pdf2.truncate(0, 550))
self.assert_eq(kdf2.truncate(0, 550, copy=False), pdf2.truncate(0, 550, copy=False))
# The bug for these tests has been fixed in pandas 1.1.0.
if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"):
self.assert_eq(kdf2.truncate(0, 550), pdf2.truncate(0, 550))
self.assert_eq(kdf2.truncate(0, 550, copy=False), pdf2.truncate(0, 550, copy=False))
else:
expected_kdf = ks.DataFrame(
{"A": ["b", "c", "d"], "B": ["i", "j", "k"], "C": ["p", "q", "r"],},
index=[550, 400, 0],
)
self.assert_eq(kdf2.truncate(0, 550), expected_kdf)
self.assert_eq(kdf2.truncate(0, 550, copy=False), expected_kdf)

# axis = 1
self.assert_eq(kdf1.truncate(axis=1), pdf1.truncate(axis=1))
self.assert_eq(kdf1.truncate(before="B", axis=1), pdf1.truncate(before="B", axis=1))
self.assert_eq(kdf1.truncate(after="A", axis=1), pdf1.truncate(after="A", axis=1))
self.assert_eq(kdf1.truncate(copy=False, axis=1), pdf1.truncate(copy=False, axis=1))
self.assert_eq(kdf2.truncate("B", "C", axis=1), pdf2.truncate("B", "C", axis=1))
self.assert_eq(
kdf1.truncate("B", "C", copy=False, axis=1), pdf1.truncate("B", "C", copy=False, axis=1)
kdf1.truncate("B", "C", copy=False, axis=1),
pdf1.truncate("B", "C", copy=False, axis=1),
)

# MultiIndex columns
Expand All @@ -3654,16 +3665,23 @@ def test_truncate(self):
self.assert_eq(kdf1.truncate(after=400), pdf1.truncate(after=400))
self.assert_eq(kdf1.truncate(copy=False), pdf1.truncate(copy=False))
self.assert_eq(kdf1.truncate(-20, 400, copy=False), pdf1.truncate(-20, 400, copy=False))
self.assert_eq(kdf2.truncate(0, 550), pdf2.truncate(0, 550))
self.assert_eq(kdf2.truncate(0, 550, copy=False), pdf2.truncate(0, 550, copy=False))
# The bug for these tests has been fixed in pandas 1.1.0.
if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"):
self.assert_eq(kdf2.truncate(0, 550), pdf2.truncate(0, 550))
self.assert_eq(kdf2.truncate(0, 550, copy=False), pdf2.truncate(0, 550, copy=False))
else:
expected_kdf.columns = columns
self.assert_eq(kdf2.truncate(0, 550), expected_kdf)
self.assert_eq(kdf2.truncate(0, 550, copy=False), expected_kdf)
# axis = 1
self.assert_eq(kdf1.truncate(axis=1), pdf1.truncate(axis=1))
self.assert_eq(kdf1.truncate(before="B", axis=1), pdf1.truncate(before="B", axis=1))
self.assert_eq(kdf1.truncate(after="A", axis=1), pdf1.truncate(after="A", axis=1))
self.assert_eq(kdf1.truncate(copy=False, axis=1), pdf1.truncate(copy=False, axis=1))
self.assert_eq(kdf2.truncate("B", "C", axis=1), pdf2.truncate("B", "C", axis=1))
self.assert_eq(
kdf1.truncate("B", "C", copy=False, axis=1), pdf1.truncate("B", "C", copy=False, axis=1)
kdf1.truncate("B", "C", copy=False, axis=1),
pdf1.truncate("B", "C", copy=False, axis=1),
)

# Exceptions
Expand Down
46 changes: 31 additions & 15 deletions databricks/koalas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -674,13 +674,20 @@ def test_nunique(self):
kdf.groupby("a").agg({"b": "nunique"}).sort_index(),
pdf.groupby("a").agg({"b": "nunique"}).sort_index(),
)
self.assert_eq(
kdf.groupby("a").nunique().sort_index(), pdf.groupby("a").nunique().sort_index()
)
self.assert_eq(
kdf.groupby("a").nunique(dropna=False).sort_index(),
pdf.groupby("a").nunique(dropna=False).sort_index(),
)
if LooseVersion(pd.__version__) < LooseVersion("1.1.0"):
expected = ks.DataFrame({"b": [2, 2]}, index=pd.Index([0, 1], name="a"))
self.assert_eq(kdf.groupby("a").nunique().sort_index(), expected)
self.assert_eq(
kdf.groupby("a").nunique(dropna=False).sort_index(), expected,
)
else:
self.assert_eq(
kdf.groupby("a").nunique().sort_index(), pdf.groupby("a").nunique().sort_index()
)
self.assert_eq(
kdf.groupby("a").nunique(dropna=False).sort_index(),
pdf.groupby("a").nunique(dropna=False).sort_index(),
)
self.assert_eq(
kdf.groupby("a")["b"].nunique().sort_index(),
pdf.groupby("a")["b"].nunique().sort_index(),
Expand All @@ -702,14 +709,23 @@ def test_nunique(self):
pdf.columns = columns
kdf.columns = columns

self.assert_eq(
kdf.groupby(("x", "a")).nunique().sort_index(),
pdf.groupby(("x", "a")).nunique().sort_index(),
)
self.assert_eq(
kdf.groupby(("x", "a")).nunique(dropna=False).sort_index(),
pdf.groupby(("x", "a")).nunique(dropna=False).sort_index(),
)
if LooseVersion(pd.__version__) < LooseVersion("1.1.0"):
expected = ks.DataFrame({("y", "b"): [2, 2]}, index=pd.Index([0, 1], name=("x", "a")))
self.assert_eq(
kdf.groupby(("x", "a")).nunique().sort_index(), expected,
)
self.assert_eq(
kdf.groupby(("x", "a")).nunique(dropna=False).sort_index(), expected,
)
else:
self.assert_eq(
kdf.groupby(("x", "a")).nunique().sort_index(),
pdf.groupby(("x", "a")).nunique().sort_index(),
)
self.assert_eq(
kdf.groupby(("x", "a")).nunique(dropna=False).sort_index(),
pdf.groupby(("x", "a")).nunique(dropna=False).sort_index(),
)

def test_unique(self):
for pdf in [
Expand Down
9 changes: 6 additions & 3 deletions databricks/koalas/tests/test_indexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -916,9 +916,12 @@ def test_monotonic(self):
datas.append([(5, 100), (4, 200), (3, None), (2, 400), (1, 500)])
datas.append([(5, 100), (4, 200), (3, 300), (2, 400), (1, None)])
datas.append([(1, 100), (2, 200), (None, None), (4, 400), (5, 500)])
datas.append([(-5, None), (-4, None), (-3, None), (-2, None), (-1, None)])
datas.append([(None, "e"), (None, "c"), (None, "b"), (None, "d"), (None, "a")])
datas.append([(None, None), (None, None), (None, None), (None, None), (None, None)])
# The data below cannot be used as arguments for `MultiIndex.from_tuples` in pandas >= 1.1.0.
# Refer to https://github.com/databricks/koalas/pull/1688#issuecomment-667156560 for details.
if LooseVersion(pd.__version__) < LooseVersion("1.1.0"):
datas.append([(-5, None), (-4, None), (-3, None), (-2, None), (-1, None)])
datas.append([(None, "e"), (None, "c"), (None, "b"), (None, "d"), (None, "a")])
datas.append([(None, None), (None, None), (None, None), (None, None), (None, None)])

# duplicated index value tests
datas.append([("x", "d"), ("y", "c"), ("y", "b"), ("z", "a")])
Expand Down
57 changes: 32 additions & 25 deletions databricks/koalas/tests/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,10 +183,17 @@ def test_at_multiindex(self):
pdf = self.pdf.set_index("b", append=True)
kdf = self.kdf.set_index("b", append=True)

self.assert_eq(kdf.at[(3, 6), "a"], pdf.at[(3, 6), "a"])
self.assert_eq(kdf.at[(3,), "a"], pdf.at[(3,), "a"])
self.assert_eq(list(kdf.at[(9, 0), "a"]), list(pdf.at[(9, 0), "a"]))
self.assert_eq(list(kdf.at[(9,), "a"]), list(pdf.at[(9,), "a"]))
# TODO: this seems to be a pandas bug in pandas >= 1.1.0
if LooseVersion(pd.__version__) < LooseVersion("1.1.0"):
self.assert_eq(kdf.at[(3, 6), "a"], pdf.at[(3, 6), "a"])
self.assert_eq(kdf.at[(3,), "a"], pdf.at[(3,), "a"])
self.assert_eq(list(kdf.at[(9, 0), "a"]), list(pdf.at[(9, 0), "a"]))
self.assert_eq(list(kdf.at[(9,), "a"]), list(pdf.at[(9,), "a"]))
else:
self.assert_eq(kdf.at[(3, 6), "a"], 3)
self.assert_eq(kdf.at[(3,), "a"], np.array([3]))
self.assert_eq(list(kdf.at[(9, 0), "a"]), [7, 8, 9])
self.assert_eq(list(kdf.at[(9,), "a"]), [7, 8, 9])

with self.assertRaises(ValueError):
kdf.at[3, "a"]
Expand Down Expand Up @@ -1127,38 +1134,38 @@ def test_index_operator_datetime(self):
kdf = ks.from_pandas(pdf)

# Positional iloc search
self.assert_eq(kdf[:4], pdf[:4])
self.assert_eq(kdf[:3], pdf[:3])
self.assert_eq(kdf[3:], pdf[3:])
self.assert_eq(kdf[2:], pdf[2:])
self.assert_eq(kdf[2:3], pdf[2:3])
self.assert_eq(kdf[2:-1], pdf[2:-1])
self.assert_eq(kdf[10:3], pdf[10:3])
self.assert_eq(kdf[:4], pdf[:4], almost=True)
self.assert_eq(kdf[:3], pdf[:3], almost=True)
self.assert_eq(kdf[3:], pdf[3:], almost=True)
self.assert_eq(kdf[2:], pdf[2:], almost=True)
self.assert_eq(kdf[2:3], pdf[2:3], almost=True)
self.assert_eq(kdf[2:-1], pdf[2:-1], almost=True)
self.assert_eq(kdf[10:3], pdf[10:3], almost=True)

# Index loc search
self.assert_eq(kdf.A[4], pdf.A[4])
self.assert_eq(kdf.A[3], pdf.A[3])

# Positional iloc search
self.assert_eq(kdf.A[:4], pdf.A[:4])
self.assert_eq(kdf.A[:3], pdf.A[:3])
self.assert_eq(kdf.A[3:], pdf.A[3:])
self.assert_eq(kdf.A[2:], pdf.A[2:])
self.assert_eq(kdf.A[2:3], pdf.A[2:3])
self.assert_eq(kdf.A[2:-1], pdf.A[2:-1])
self.assert_eq(kdf.A[10:3], pdf.A[10:3])
self.assert_eq(kdf.A[:4], pdf.A[:4], almost=True)
self.assert_eq(kdf.A[:3], pdf.A[:3], almost=True)
self.assert_eq(kdf.A[3:], pdf.A[3:], almost=True)
self.assert_eq(kdf.A[2:], pdf.A[2:], almost=True)
self.assert_eq(kdf.A[2:3], pdf.A[2:3], almost=True)
self.assert_eq(kdf.A[2:-1], pdf.A[2:-1], almost=True)
self.assert_eq(kdf.A[10:3], pdf.A[10:3], almost=True)

dt1 = datetime.datetime.strptime("2013-01-02", "%Y-%m-%d")
dt2 = datetime.datetime.strptime("2013-01-04", "%Y-%m-%d")

# Index loc search
self.assert_eq(kdf[:dt2], pdf[:dt2])
self.assert_eq(kdf[dt1:], pdf[dt1:])
self.assert_eq(kdf[dt1:dt2], pdf[dt1:dt2])
self.assert_eq(kdf.A[dt2], pdf.A[dt2])
self.assert_eq(kdf.A[:dt2], pdf.A[:dt2])
self.assert_eq(kdf.A[dt1:], pdf.A[dt1:])
self.assert_eq(kdf.A[dt1:dt2], pdf.A[dt1:dt2])
self.assert_eq(kdf[:dt2], pdf[:dt2], almost=True)
self.assert_eq(kdf[dt1:], pdf[dt1:], almost=True)
self.assert_eq(kdf[dt1:dt2], pdf[dt1:dt2], almost=True)
self.assert_eq(kdf.A[dt2], pdf.A[dt2], almost=True)
self.assert_eq(kdf.A[:dt2], pdf.A[:dt2], almost=True)
self.assert_eq(kdf.A[dt1:], pdf.A[dt1:], almost=True)
self.assert_eq(kdf.A[dt1:dt2], pdf.A[dt1:dt2], almost=True)

def test_index_operator_int(self):
pdf = pd.DataFrame(np.random.randn(6, 4), index=[1, 3, 5, 7, 9, 11], columns=list("ABCD"))
Expand Down
10 changes: 8 additions & 2 deletions databricks/koalas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1268,8 +1268,14 @@ def test_truncate(self):
self.assert_eq(kser1.truncate(after=5), pser1.truncate(after=5))
self.assert_eq(kser1.truncate(copy=False), pser1.truncate(copy=False))
self.assert_eq(kser1.truncate(2, 5, copy=False), pser1.truncate(2, 5, copy=False))
self.assert_eq(kser2.truncate(4, 6), pser2.truncate(4, 6))
self.assert_eq(kser2.truncate(4, 6, copy=False), pser2.truncate(4, 6, copy=False))
# The bug for these tests has been fixed in pandas 1.1.0.
if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"):
self.assert_eq(kser2.truncate(4, 6), pser2.truncate(4, 6))
self.assert_eq(kser2.truncate(4, 6, copy=False), pser2.truncate(4, 6, copy=False))
else:
expected_kser = ks.Series([20, 30, 40], index=[6, 5, 4])
self.assert_eq(kser2.truncate(4, 6), expected_kser)
self.assert_eq(kser2.truncate(4, 6, copy=False), expected_kser)

kser = ks.Series([10, 20, 30, 40, 50, 60, 70], index=[1, 2, 3, 4, 3, 2, 1])
msg = "truncate requires a sorted index"
Expand Down
2 changes: 1 addition & 1 deletion requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Dependencies in Koalas. When you update don't forget to update setup.py and install.rst in docs.
pandas>=0.23.2,<1.1.0
pandas>=0.23.2
pyarrow>=0.10
matplotlib>=3.0.0,<3.3.0
numpy>=1.14,<1.19.0
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
},
python_requires='>=3.5,<3.9',
install_requires=[
'pandas>=0.23.2,<1.1.0',
'pandas>=0.23.2',
'pyarrow>=0.10',
'numpy>=1.14,<1.19.0',
'matplotlib>=3.0.0,<3.3.0',
Expand Down

0 comments on commit 1ccb7ea

Please sign in to comment.