From 4f649e6fc19cb6f639f4a80f79c3e3c43689d287 Mon Sep 17 00:00:00 2001 From: itholic Date: Thu, 12 Dec 2019 12:19:09 +0900 Subject: [PATCH 1/4] Implement sort_values for Index/MultiIndex --- databricks/koalas/indexes.py | 77 +++++++++++++++++++++++++ databricks/koalas/missing/indexes.py | 2 - databricks/koalas/tests/test_indexes.py | 22 +++++++ docs/source/reference/indexing.rst | 9 ++- 4 files changed, 107 insertions(+), 3 deletions(-) diff --git a/databricks/koalas/indexes.py b/databricks/koalas/indexes.py index f501af20cf..b425f2ee4b 100644 --- a/databricks/koalas/indexes.py +++ b/databricks/koalas/indexes.py @@ -711,6 +711,83 @@ def symmetric_difference(self, other, result_name=None, sort=None): return result + # TODO: return_indexer + def sort_values(self, ascending=True): + """ + Return a sorted copy of the index. + + .. note:: pandas does not support this method when the index contains NaN values. + pandas raises an unexpected TypeError, but Koalas supports it, treating NaN + as the smallest value. + + Parameters + ---------- + ascending : bool, default True + Whether to sort the index values in ascending order. + + Returns + ------- + sorted_index : ks.Index or ks.MultiIndex + Sorted copy of the index. + + See Also + -------- + Series.sort_values : Sort values of a Series. + DataFrame.sort_values : Sort values in a DataFrame. + + Examples + -------- + >>> idx = ks.Index([10, 100, 1, 1000]) + >>> idx + Int64Index([10, 100, 1, 1000], dtype='int64') + + Sort values in ascending order (default behavior). + + >>> idx.sort_values() + Int64Index([1, 10, 100, 1000], dtype='int64') + + Sort values in descending order. + + >>> idx.sort_values(ascending=False) + Int64Index([1000, 100, 10, 1], dtype='int64') + + Support for MultiIndex. 
+ + >>> kidx = ks.MultiIndex.from_tuples([('a', 'x', 1), ('c', 'y', 2), ('b', 'z', 3)]) + >>> kidx # doctest: +SKIP + MultiIndex([('a', 'x', 1), + ('c', 'y', 2), + ('b', 'z', 3)], + ) + + >>> kidx.sort_values() # doctest: +SKIP + MultiIndex([('a', 'x', 1), + ('b', 'z', 3), + ('c', 'y', 2)], + ) + + >>> kidx.sort_values(ascending=False) # doctest: +SKIP + MultiIndex([('c', 'y', 2), + ('b', 'z', 3), + ('a', 'x', 1)], + ) + """ + sdf = self._internal.sdf + sdf = sdf.orderBy(self._internal.index_scols, ascending=ascending) + + internal = _InternalFrame( + sdf=sdf.select(self._internal.index_scols), + index_map=self._internal.index_map) + + result = DataFrame(internal).index + + if isinstance(self, MultiIndex): + result.names = self.names + else: + result.name = self.name + + return result + def __getattr__(self, item: str) -> Any: if hasattr(_MissingPandasLikeIndex, item): property_or_func = getattr(_MissingPandasLikeIndex, item) diff --git a/databricks/koalas/missing/indexes.py b/databricks/koalas/missing/indexes.py index 0a17f194a5..1bf2add285 100644 --- a/databricks/koalas/missing/indexes.py +++ b/databricks/koalas/missing/indexes.py @@ -86,7 +86,6 @@ class _MissingPandasLikeIndex(object): slice_indexer = unsupported_function('slice_indexer') slice_locs = unsupported_function('slice_locs') sort = unsupported_function('sort') - sort_values = unsupported_function('sort_values') sortlevel = unsupported_function('sortlevel') take = unsupported_function('take') to_flat_index = unsupported_function('to_flat_index') @@ -183,7 +182,6 @@ class _MissingPandasLikeMultiIndex(object): slice_indexer = unsupported_function('slice_indexer') slice_locs = unsupported_function('slice_locs') sort = unsupported_function('sort') - sort_values = unsupported_function('sort_values') sortlevel = unsupported_function('sortlevel') swaplevel = unsupported_function('swaplevel') take = unsupported_function('take') diff --git a/databricks/koalas/tests/test_indexes.py 
b/databricks/koalas/tests/test_indexes.py index 3f2d2cf4d0..3636b6c464 100644 --- a/databricks/koalas/tests/test_indexes.py +++ b/databricks/koalas/tests/test_indexes.py @@ -334,3 +334,25 @@ def test_index_fillna(self): with self.assertRaisesRegex(TypeError, "Unsupported type "): kidx.fillna([1, 2]) + + def test_sort_values(self): + pidx = pd.Index([-10, -100, 200, 100]) + kidx = ks.Index([-10, -100, 200, 100]) + + self.assert_eq(pidx.sort_values(), kidx.sort_values()) + self.assert_eq(pidx.sort_values(ascending=False), kidx.sort_values(ascending=False)) + + pidx.name = 'koalas' + kidx.name = 'koalas' + + self.assert_eq(pidx.sort_values(), kidx.sort_values()) + self.assert_eq(pidx.sort_values(ascending=False), kidx.sort_values(ascending=False)) + + pidx = pd.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'y', 2), ('c', 'z', 3)]) + kidx = ks.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'y', 2), ('c', 'z', 3)]) + + pidx.names = ['hello', 'koalas', 'goodbye'] + kidx.names = ['hello', 'koalas', 'goodbye'] + + self.assert_eq(pidx.sort_values(), kidx.sort_values()) + self.assert_eq(pidx.sort_values(ascending=False), kidx.sort_values(ascending=False)) diff --git a/docs/source/reference/indexing.rst b/docs/source/reference/indexing.rst index 6e45552951..43f00431e4 100644 --- a/docs/source/reference/indexing.rst +++ b/docs/source/reference/indexing.rst @@ -61,7 +61,12 @@ Conversion Index.to_series Index.to_numpy -.. _api.multiindex: +Sorting +~~~~~~~ +.. autosummary:: + :toctree: api/ + + Index.sort_values Combining / joining / set operations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -77,6 +82,8 @@ Selecting Index.isin +.. _api.multiindex: + MultiIndex ---------- .. 
autosummary:: From 9d75621662bd5609f3b0e5c434f4591c804c7e1f Mon Sep 17 00:00:00 2001 From: itholic Date: Fri, 13 Dec 2019 14:48:25 +0900 Subject: [PATCH 2/4] fix --- databricks/koalas/indexes.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/databricks/koalas/indexes.py b/databricks/koalas/indexes.py index b635b40385..a80d81d0ac 100644 --- a/databricks/koalas/indexes.py +++ b/databricks/koalas/indexes.py @@ -806,15 +806,10 @@ def sort_values(self, ascending=True): internal = _InternalFrame( sdf=sdf.select(self._internal.index_scols), - index_map=self._internal.index_map) + index_map=self._kdf._internal.index_map) result = DataFrame(internal).index - if isinstance(self, MultiIndex): - result.names = self.names - else: - result.name = self.name - return result def min(self): From 7e717cc5917021ce140bd8e291d2b73ac87b6703 Mon Sep 17 00:00:00 2001 From: itholic Date: Thu, 19 Dec 2019 10:51:30 +0900 Subject: [PATCH 3/4] fix --- databricks/koalas/missing/indexes.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/databricks/koalas/missing/indexes.py b/databricks/koalas/missing/indexes.py index 0619959be5..cce2a86636 100644 --- a/databricks/koalas/missing/indexes.py +++ b/databricks/koalas/missing/indexes.py @@ -81,8 +81,6 @@ class _MissingPandasLikeIndex(object): set_value = unsupported_function('set_value') slice_indexer = unsupported_function('slice_indexer') slice_locs = unsupported_function('slice_locs') - sort = unsupported_function('sort') - sort_values = unsupported_function('sort_values') sortlevel = unsupported_function('sortlevel') take = unsupported_function('take') to_flat_index = unsupported_function('to_flat_index') @@ -174,8 +172,6 @@ class _MissingPandasLikeMultiIndex(object): set_value = unsupported_function('set_value') slice_indexer = unsupported_function('slice_indexer') slice_locs = unsupported_function('slice_locs') - sort = unsupported_function('sort') - sort_values = unsupported_function('sort_values') sortlevel = 
unsupported_function('sortlevel') swaplevel = unsupported_function('swaplevel') take = unsupported_function('take') From a8dda7d833a146fcfe2d7b23e47adc9b918bd0bc Mon Sep 17 00:00:00 2001 From: itholic Date: Thu, 19 Dec 2019 10:54:08 +0900 Subject: [PATCH 4/4] add doc multiindex --- docs/source/reference/indexing.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/source/reference/indexing.rst b/docs/source/reference/indexing.rst index 4aeede60bb..eaa9e57ad6 100644 --- a/docs/source/reference/indexing.rst +++ b/docs/source/reference/indexing.rst @@ -170,3 +170,10 @@ MultiIndex Conversion MultiIndex.astype MultiIndex.to_numpy + +MultiIndex Sorting +~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + MultiIndex.sort_values