# TODO: return_indexer
def sort_values(self, ascending=True):
    """
    Return a copy of the index with its values in sorted order.

    .. note:: pandas does not support this method when the index holds a NaN value;
        it raises an unexpected TypeError. Here NaN is supported and is treated
        as the smallest value.

    Parameters
    ----------
    ascending : bool, default True
        Whether the index values are sorted in ascending order.

    Returns
    -------
    sorted_index : ks.Index or ks.MultiIndex
        Sorted copy of the index.

    See Also
    --------
    Series.sort_values : Sort values of a Series.
    DataFrame.sort_values : Sort values in a DataFrame.

    Examples
    --------
    >>> idx = ks.Index([10, 100, 1, 1000])
    >>> idx
    Int64Index([10, 100, 1, 1000], dtype='int64')

    Sort values in ascending order (default behavior).

    >>> idx.sort_values()
    Int64Index([1, 10, 100, 1000], dtype='int64')

    Sort values in descending order.

    >>> idx.sort_values(ascending=False)
    Int64Index([1000, 100, 10, 1], dtype='int64')

    Support for MultiIndex.

    >>> kidx = ks.MultiIndex.from_tuples([('a', 'x', 1), ('c', 'y', 2), ('b', 'z', 3)])
    >>> kidx  # doctest: +SKIP
    MultiIndex([('a', 'x', 1),
                ('c', 'y', 2),
                ('b', 'z', 3)],
               )

    >>> kidx.sort_values()  # doctest: +SKIP
    MultiIndex([('a', 'x', 1),
                ('b', 'z', 3),
                ('c', 'y', 2)],
               )

    >>> kidx.sort_values(ascending=False)  # doctest: +SKIP
    MultiIndex([('c', 'y', 2),
                ('b', 'z', 3),
                ('a', 'x', 1)],
               )
    """
    # Order the backing Spark DataFrame by every index column at once, so a
    # MultiIndex is sorted level by level with the same direction.
    ordered_sdf = self._internal.sdf.orderBy(
        self._internal.index_scols, ascending=ascending)

    # Keep only the index columns; the index_map is taken from the frame
    # behind this index (self._kdf) so the new frame exposes the same index.
    sorted_internal = _InternalFrame(
        sdf=ordered_sdf.select(self._internal.index_scols),
        index_map=self._kdf._internal.index_map)

    return DataFrame(sorted_internal).index
def test_sort_values(self):
    """Check that sort_values on Index and MultiIndex agrees with pandas.

    Each fixture is compared against pandas in both ascending and descending
    order, first without and then with index name(s) set.
    """
    pidx = pd.Index([-10, -100, 200, 100])
    kidx = ks.Index([-10, -100, 200, 100])

    self.assert_eq(pidx.sort_values(), kidx.sort_values())
    self.assert_eq(pidx.sort_values(ascending=False), kidx.sort_values(ascending=False))

    pidx.name = 'koalas'
    kidx.name = 'koalas'

    self.assert_eq(pidx.sort_values(), kidx.sort_values())
    self.assert_eq(pidx.sort_values(ascending=False), kidx.sort_values(ascending=False))

    # The tuples are deliberately NOT in sorted order: with an already-sorted
    # fixture the ascending check would pass even if sort_values did nothing.
    pidx = pd.MultiIndex.from_tuples([('a', 'x', 1), ('c', 'y', 2), ('b', 'z', 3)])
    kidx = ks.MultiIndex.from_tuples([('a', 'x', 1), ('c', 'y', 2), ('b', 'z', 3)])

    pidx.names = ['hello', 'koalas', 'goodbye']
    kidx.names = ['hello', 'koalas', 'goodbye']

    self.assert_eq(pidx.sort_values(), kidx.sort_values())
    self.assert_eq(pidx.sort_values(ascending=False), kidx.sort_values(ascending=False))