Skip to content

Commit

Permalink
BUG, DEP, DOC: Patch and Align Categorical's Sorting API
Browse files Browse the repository at this point in the history
Clarifies that 'sort' in the context of `Categorical` means
'organization' rather than 'order', as it is possible to call
this method (as well as `sort_values`) when the `Categorical` is
unordered. Also patches a bug in `Categorical.sort_values` in which
`na_position` was not being respected when `ascending` was set to
`True`. This commit aligns the behaviour with that of `Series`.
Finally, deprecates `sort` in favor of `sort_values`, which is in
alignment with what was done with `Series` back in #10726.

Closes #12785

Author: gfyoung <[email protected]>

Closes #12882 from gfyoung/categorical-sort-doc and squashes the following commits:

f324a9c [gfyoung] BUG, DOC, DEP: Patch and Align Categorical's Sorting API
  • Loading branch information
gfyoung authored and jreback committed Apr 14, 2016
1 parent 6f1ade1 commit 2ea0601
Show file tree
Hide file tree
Showing 3 changed files with 129 additions and 99 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.18.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,7 @@ Deprecations
^^^^^^^^^^^^

- The method name ``Index.sym_diff()`` is deprecated and can be replaced by ``Index.symmetric_difference()`` (:issue:`12591`)
- The method name ``Categorical.sort()`` is deprecated in favor of ``Categorical.sort_values()`` (:issue:`12882`)



Expand Down
120 changes: 67 additions & 53 deletions pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1157,30 +1157,76 @@ def argsort(self, ascending=True, **kwargs):
return result

def sort_values(self, inplace=False, ascending=True, na_position='last'):
""" Sorts the Category by category value returning a new Categorical by
default.
""" Sorts the Categorical by category value returning a new
Categorical by default.
Only ordered Categoricals can be sorted!
Categorical.sort is the equivalent but sorts the Categorical inplace.
While an ordering is applied to the category values, sorting in this
context refers more to organizing and grouping together based on
matching category values. Thus, this function can be called on an
unordered Categorical instance unlike the functions 'Categorical.min'
and 'Categorical.max'.
Parameters
----------
inplace : boolean, default False
Do operation in place.
ascending : boolean, default True
Sort ascending. Passing False sorts descending
Order ascending. Passing False orders descending. The
ordering parameter provides the method by which the
category values are organized.
na_position : {'first', 'last'} (optional, default='last')
'first' puts NaNs at the beginning
'last' puts NaNs at the end
Returns
-------
y : Category or None
y : Categorical or None
See Also
--------
Category.sort
Categorical.sort
Examples
--------
>>> c = pd.Categorical([1, 2, 2, 1, 5])
>>> c
[1, 2, 2, 1, 5]
Categories (3, int64): [1, 2, 5]
>>> c.sort_values()
[1, 1, 2, 2, 5]
Categories (3, int64): [1, 2, 5]
>>> c.sort_values(ascending=False)
[5, 2, 2, 1, 1]
Categories (3, int64): [1, 2, 5]
Inplace sorting can be done as well:
>>> c.sort_values(inplace=True)
>>> c
[1, 1, 2, 2, 5]
Categories (3, int64): [1, 2, 5]
>>>
>>> c = pd.Categorical([1, 2, 2, 1, 5])
'sort_values' behaviour with NaNs. Note that 'na_position'
is independent of the 'ascending' parameter:
>>> c = pd.Categorical([np.nan, 2, 2, np.nan, 5])
>>> c
[NaN, 2.0, 2.0, NaN, 5.0]
Categories (2, int64): [2, 5]
>>> c.sort_values()
[2.0, 2.0, 5.0, NaN, NaN]
Categories (2, int64): [2, 5]
>>> c.sort_values(ascending=False)
[5.0, 2.0, 2.0, NaN, NaN]
Categories (2, int64): [2, 5]
>>> c.sort_values(na_position='first')
[NaN, NaN, 2.0, 2.0, 5.0]
Categories (2, int64): [2, 5]
>>> c.sort_values(ascending=False, na_position='first')
[NaN, NaN, 5.0, 2.0, 2.0]
Categories (2, int64): [2, 5]
"""
if na_position not in ['last', 'first']:
raise ValueError('invalid na_position: {!r}'.format(na_position))
Expand All @@ -1193,13 +1239,13 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'):
na_mask = (codes == -1)
if na_mask.any():
n_nans = len(codes[na_mask])
if na_position == "first" and not ascending:
if na_position == "first":
# in this case sort to the front
new_codes = codes.copy()
new_codes[0:n_nans] = -1
new_codes[n_nans:] = codes[~na_mask]
codes = new_codes
elif na_position == "last" and not ascending:
elif na_position == "last":
# ... and to the end
new_codes = codes.copy()
pos = len(codes) - n_nans
Expand All @@ -1215,63 +1261,31 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'):

def order(self, inplace=False, ascending=True, na_position='last'):
"""
DEPRECATED: use :meth:`Categorical.sort_values`
Sorts the Category by category value returning a new Categorical by
default.
Only ordered Categoricals can be sorted!
Categorical.sort is the equivalent but sorts the Categorical inplace.
Parameters
----------
inplace : boolean, default False
Do operation in place.
ascending : boolean, default True
Sort ascending. Passing False sorts descending
na_position : {'first', 'last'} (optional, default='last')
'first' puts NaNs at the beginning
'last' puts NaNs at the end
Returns
-------
y : Category or None
DEPRECATED: use :meth:`Categorical.sort_values`. That function
is entirely equivalent to this one.
See Also
--------
Category.sort
Categorical.sort_values
"""
warn("order is deprecated, use sort_values(...)", FutureWarning,
stacklevel=2)
return self.sort_values(inplace=inplace, ascending=ascending,
na_position=na_position)

def sort(self, inplace=True, ascending=True, na_position='last'):
""" Sorts the Category inplace by category value.
Only ordered Categoricals can be sorted!
Catgorical.order is the equivalent but returns a new Categorical.
Parameters
----------
ascending : boolean, default True
Sort ascending. Passing False sorts descending
inplace : boolean, default False
Do operation in place.
na_position : {'first', 'last'} (optional, default='last')
'first' puts NaNs at the beginning
'last' puts NaNs at the end
Returns
-------
y : Category or None
"""
DEPRECATED: use :meth:`Categorical.sort_values`. That function
is just like this one, except that a new Categorical is returned
by default, so make sure to pass in 'inplace=True' to get
inplace sorting.
See Also
--------
Category.sort_values
Categorical.sort_values
"""
warn("sort is deprecated, use sort_values(...)", FutureWarning,
stacklevel=2)
return self.sort_values(inplace=inplace, ascending=ascending,
na_position=na_position)

Expand Down
107 changes: 61 additions & 46 deletions pandas/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1277,12 +1277,11 @@ def test_mode(self):
exp = Categorical([4], categories=[5, 4, 3, 2, 1], ordered=True)
self.assertTrue(res.equals(exp))

def test_sort(self):
def test_sort_values(self):

# unordered cats are sortable
cat = Categorical(["a", "b", "b", "a"], ordered=False)
cat.sort_values()
cat.sort()

cat = Categorical(["a", "c", "b", "d"], ordered=True)

Expand All @@ -1303,10 +1302,62 @@ def test_sort(self):

# sort (inplace order)
cat1 = cat.copy()
cat1.sort()
cat1.sort_values(inplace=True)
exp = np.array(["a", "b", "c", "d"], dtype=object)
self.assert_numpy_array_equal(cat1.__array__(), exp)

# reverse
cat = Categorical(["a", "c", "c", "b", "d"], ordered=True)
res = cat.sort_values(ascending=False)
exp_val = np.array(["d", "c", "c", "b", "a"], dtype=object)
exp_categories = np.array(["a", "b", "c", "d"], dtype=object)
self.assert_numpy_array_equal(res.__array__(), exp_val)
self.assert_numpy_array_equal(res.categories, exp_categories)

def test_sort_values_na_position(self):
# see gh-12882
cat = Categorical([5, 2, np.nan, 2, np.nan], ordered=True)
exp_categories = np.array([2, 5])

exp = np.array([2.0, 2.0, 5.0, np.nan, np.nan])
res = cat.sort_values() # default arguments
self.assert_numpy_array_equal(res.__array__(), exp)
self.assert_numpy_array_equal(res.categories, exp_categories)

exp = np.array([np.nan, np.nan, 2.0, 2.0, 5.0])
res = cat.sort_values(ascending=True, na_position='first')
self.assert_numpy_array_equal(res.__array__(), exp)
self.assert_numpy_array_equal(res.categories, exp_categories)

exp = np.array([np.nan, np.nan, 5.0, 2.0, 2.0])
res = cat.sort_values(ascending=False, na_position='first')
self.assert_numpy_array_equal(res.__array__(), exp)
self.assert_numpy_array_equal(res.categories, exp_categories)

exp = np.array([2.0, 2.0, 5.0, np.nan, np.nan])
res = cat.sort_values(ascending=True, na_position='last')
self.assert_numpy_array_equal(res.__array__(), exp)
self.assert_numpy_array_equal(res.categories, exp_categories)

exp = np.array([5.0, 2.0, 2.0, np.nan, np.nan])
res = cat.sort_values(ascending=False, na_position='last')
self.assert_numpy_array_equal(res.__array__(), exp)
self.assert_numpy_array_equal(res.categories, exp_categories)

cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True)
res = cat.sort_values(ascending=False, na_position='last')
exp_val = np.array(["d", "c", "b", "a", np.nan], dtype=object)
exp_categories = np.array(["a", "b", "c", "d"], dtype=object)
self.assert_numpy_array_equal(res.__array__(), exp_val)
self.assert_numpy_array_equal(res.categories, exp_categories)

cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True)
res = cat.sort_values(ascending=False, na_position='first')
exp_val = np.array([np.nan, "d", "c", "b", "a"], dtype=object)
exp_categories = np.array(["a", "b", "c", "d"], dtype=object)
self.assert_numpy_array_equal(res.__array__(), exp_val)
self.assert_numpy_array_equal(res.categories, exp_categories)

def test_slicing_directly(self):
cat = Categorical(["a", "b", "c", "d", "a", "b", "c"])
sliced = cat[3]
Expand Down Expand Up @@ -2951,14 +3002,16 @@ def test_count(self):
result = s.count()
self.assertEqual(result, 2)

def test_sort(self):
def test_sort_values(self):

c = Categorical(["a", "b", "b", "a"], ordered=False)
cat = Series(c)
cat = Series(c.copy())

# 9816 deprecated
with tm.assert_produces_warning(FutureWarning):
c.order()
# 'order' was deprecated in gh-10726
# 'sort' was deprecated in gh-12882
for func in ('order', 'sort'):
with tm.assert_produces_warning(FutureWarning):
getattr(c, func)()

# sort in the categories order
expected = Series(
Expand Down Expand Up @@ -3024,44 +3077,6 @@ def test_sort(self):
expected = df.iloc[[2, 1, 5, 4, 3, 0]]
tm.assert_frame_equal(result, expected)

# reverse
cat = Categorical(["a", "c", "c", "b", "d"], ordered=True)
res = cat.sort_values(ascending=False)
exp_val = np.array(["d", "c", "c", "b", "a"], dtype=object)
exp_categories = np.array(["a", "b", "c", "d"], dtype=object)
self.assert_numpy_array_equal(res.__array__(), exp_val)
self.assert_numpy_array_equal(res.categories, exp_categories)

# some NaN positions

cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True)
res = cat.sort_values(ascending=False, na_position='last')
exp_val = np.array(["d", "c", "b", "a", np.nan], dtype=object)
exp_categories = np.array(["a", "b", "c", "d"], dtype=object)
self.assert_numpy_array_equal(res.__array__(), exp_val)
self.assert_numpy_array_equal(res.categories, exp_categories)

cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True)
res = cat.sort_values(ascending=False, na_position='first')
exp_val = np.array([np.nan, "d", "c", "b", "a"], dtype=object)
exp_categories = np.array(["a", "b", "c", "d"], dtype=object)
self.assert_numpy_array_equal(res.__array__(), exp_val)
self.assert_numpy_array_equal(res.categories, exp_categories)

cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True)
res = cat.sort_values(ascending=False, na_position='first')
exp_val = np.array([np.nan, "d", "c", "b", "a"], dtype=object)
exp_categories = np.array(["a", "b", "c", "d"], dtype=object)
self.assert_numpy_array_equal(res.__array__(), exp_val)
self.assert_numpy_array_equal(res.categories, exp_categories)

cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True)
res = cat.sort_values(ascending=False, na_position='last')
exp_val = np.array(["d", "c", "b", "a", np.nan], dtype=object)
exp_categories = np.array(["a", "b", "c", "d"], dtype=object)
self.assert_numpy_array_equal(res.__array__(), exp_val)
self.assert_numpy_array_equal(res.categories, exp_categories)

def test_slicing(self):
cat = Series(Categorical([1, 2, 3, 4]))
reversed = cat[::-1]
Expand Down

0 comments on commit 2ea0601

Please sign in to comment.