From 361159205e01658e0da1f2151fd6ab51055127a9 Mon Sep 17 00:00:00 2001 From: itholic Date: Thu, 12 Sep 2019 13:46:30 +0900 Subject: [PATCH 1/3] Add axis parameter to dataframe.diff --- databricks/koalas/frame.py | 15 ++++++++++----- databricks/koalas/tests/test_dataframe.py | 8 ++++++++ 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py index 9faa28bcc3..624d7cbb38 100644 --- a/databricks/koalas/frame.py +++ b/databricks/koalas/frame.py @@ -2192,8 +2192,8 @@ def shift(self, periods=1, fill_value=None): internal = self._internal.copy(sdf=sdf, data_columns=[c.name for c in applied]) return DataFrame(internal) - # TODO: add axis parameter - def diff(self, periods=1): + # TODO: axis should support 1 or 'columns' either + def diff(self, periods: int = 1, axis: int = 0): """ First discrete difference of element. @@ -2209,6 +2209,8 @@ def diff(self, periods=1): ---------- periods : int, default 1 Periods to shift for calculating difference, accepts negative values. + axis : int, default 0 or 'index' + Can only be set to 0 at the moment. Returns ------- @@ -2259,6 +2261,8 @@ def diff(self, periods=1): 4 -1.0 -3.0 -11.0 5 NaN NaN NaN """ + if axis not in [0, 'index']: + raise ValueError('axis should be either 0 or "index" currently.') applied = [] for column in self._internal.data_columns: applied.append(self[column].diff(periods)) @@ -2267,6 +2271,7 @@ def diff(self, periods=1): internal = self._internal.copy(sdf=sdf, data_columns=[c.name for c in applied]) return DataFrame(internal) + # TODO: axis should support 1 or 'columns' either def nunique(self, axis: int = 0, dropna: bool = True, approx: bool = False, rsd: float = 0.05) -> pd.Series: """ @@ -2276,7 +2281,7 @@ def nunique(self, axis: int = 0, dropna: bool = True, approx: bool = False, Parameters ---------- - axis : int, default 0 + axis : int, default 0 or 'index' Can only be set to 0 at the moment. 
dropna : bool, default True Don’t include NaN in the count. @@ -2314,8 +2319,8 @@ def nunique(self, axis: int = 0, dropna: bool = True, approx: bool = False, B 1 Name: 0, dtype: int64 """ - if axis != 0: - raise ValueError("The 'nunique' method only works with axis=0 at the moment") + if axis not in [0, 'index']: + raise ValueError('axis should be either 0 or "index" currently.') res = self._sdf.select([self[column]._nunique(dropna, approx, rsd) for column in self.columns]) return res.toPandas().T.iloc[:, 0] diff --git a/databricks/koalas/tests/test_dataframe.py b/databricks/koalas/tests/test_dataframe.py index 3397cded62..e68cb68805 100644 --- a/databricks/koalas/tests/test_dataframe.py +++ b/databricks/koalas/tests/test_dataframe.py @@ -515,6 +515,11 @@ def test_nunique(self): self.assert_eq(ks.DataFrame({'A': range(100)}).nunique(approx=True, rsd=0.01), pd.Series([100], index=['A'], name='0')) + # Assert that an axis value which is not supported yet raises ValueError + msg = 'axis should be either 0 or "index" currently.' + with self.assertRaisesRegex(ValueError, msg): + kdf.nunique(axis=1) + def test_sort_values(self): pdf = pd.DataFrame({'a': [1, 2, 3, 4, 5, None, 7], 'b': [7, 6, 5, 4, 3, 2, 1]}) @@ -1472,6 +1477,9 @@ def test_diff(self): msg = "should be an int" with self.assertRaisesRegex(ValueError, msg): kdf.diff(1.5) + msg = 'axis should be either 0 or "index" currently.' 
+ with self.assertRaisesRegex(ValueError, msg): + kdf.diff(axis=1) def test_duplicated(self): pdf = pd.DataFrame({'a': [1, 1, 1, 3], 'b': [1, 1, 1, 4], 'c': [1, 1, 1, 5]}) From 023ff22fe711364bd8d376d226a760053ff3dd58 Mon Sep 17 00:00:00 2001 From: itholic Date: Fri, 13 Sep 2019 08:55:53 +0900 Subject: [PATCH 2/3] Fix type hint & some comment --- databricks/koalas/frame.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py index 624d7cbb38..4829c9bc73 100644 --- a/databricks/koalas/frame.py +++ b/databricks/koalas/frame.py @@ -2192,8 +2192,8 @@ def shift(self, periods=1, fill_value=None): internal = self._internal.copy(sdf=sdf, data_columns=[c.name for c in applied]) return DataFrame(internal) - # TODO: axis should support 1 or 'columns' either + # TODO: axis should also support 1 or 'columns' def diff(self, periods: int = 1, axis: Union[int, str] = 0): """ First discrete difference of element. @@ -2272,7 +2272,7 @@ def diff(self, periods: int = 1, axis: int = 0): return DataFrame(internal) # TODO: axis should support 1 or 'columns' either - def nunique(self, axis: int = 0, dropna: bool = True, approx: bool = False, + def nunique(self, axis: Union[int, str] = 0, dropna: bool = True, approx: bool = False, rsd: float = 0.05) -> pd.Series: """ Return number of unique elements in the object. 
From de9ecdcda0f9bb5ae51232f5762ebfe194706773 Mon Sep 17 00:00:00 2001 From: itholic Date: Fri, 13 Sep 2019 15:39:19 +0900 Subject: [PATCH 3/3] Add comments --- databricks/koalas/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py index 4829c9bc73..cdb4138fc8 100644 --- a/databricks/koalas/frame.py +++ b/databricks/koalas/frame.py @@ -2271,7 +2271,7 @@ def diff(self, periods: int = 1, axis: Union[int, str] = 0): internal = self._internal.copy(sdf=sdf, data_columns=[c.name for c in applied]) return DataFrame(internal) - # TODO: axis should support 1 or 'columns' either + # TODO: axis should also support 1 or 'columns' def nunique(self, axis: Union[int, str] = 0, dropna: bool = True, approx: bool = False, rsd: float = 0.05) -> pd.Series: """