Skip to content

Commit

Permalink
ENH/FIX: Add na_position parameter to DataFrame.sort. Fixes GH3917
Browse files Browse the repository at this point in the history
TST: Skip mergesort test if `np.argsort` raises TypeError on any mergesort.
  • Loading branch information
unutbu committed Mar 27, 2014
1 parent 110406c commit 3230ed4
Show file tree
Hide file tree
Showing 12 changed files with 352 additions and 65 deletions.
4 changes: 2 additions & 2 deletions doc/source/basics.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1286,14 +1286,14 @@ The ``by`` argument can take a list of column names, e.g.:
Series has the method ``order`` (analogous to `R's order function
<http://stat.ethz.ch/R-manual/R-patched/library/base/html/order.html>`__) which
sorts by value, with special treatment of NA values via the ``na_last``
sorts by value, with special treatment of NA values via the ``na_position``
argument:

.. ipython:: python
s[2] = np.nan
s.order()
s.order(na_last=False)
s.order(na_position='first')
Some other sorting notes / nuances:

Expand Down
2 changes: 2 additions & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,8 @@ API Changes
- Define and document the order of column vs index names in query/eval
(:issue:`6676`)

- ``DataFrame.sort`` now places NaNs at the beginning or end of the sort according to the ``na_position`` parameter. (:issue:`3917`)

Deprecations
~~~~~~~~~~~~

Expand Down
6 changes: 3 additions & 3 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,9 +316,9 @@ def array_equivalent(left, right):
# NaNs occur only in object arrays, float or complex arrays.
if issubclass(left.dtype.type, np.object_):
return ((left == right) | (pd.isnull(left) & pd.isnull(right))).all()
if not issubclass(left.dtype.type, (np.floating, np.complexfloating)):
return np.array_equal(left, right)
return ((left == right) | (np.isnan(left) & np.isnan(right))).all()
if issubclass(left.dtype.type, (np.floating, np.complexfloating)):
return ((left == right) | (np.isnan(left) & np.isnan(right))).all()
return np.array_equal(left, right)

def _iterable_not_string(x):
return (isinstance(x, collections.Iterable) and
Expand Down
57 changes: 30 additions & 27 deletions pandas/core/frame.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -2522,7 +2522,7 @@ def _m8_to_i8(x):
# Sorting

def sort(self, columns=None, axis=0, ascending=True,
inplace=False):
inplace=False, kind='quicksort', na_position='last'):
"""
Sort DataFrame either by labels (along either axis) or by the values in
column(s)
Expand All @@ -2540,6 +2540,11 @@ def sort(self, columns=None, axis=0, ascending=True,
Sort index/rows versus columns
inplace : boolean, default False
Sort the DataFrame without creating a new instance
kind : {'quicksort', 'mergesort', 'heapsort'}, optional
This option is only applied when sorting on a single column or label.
na_position : {'first', 'last'} (optional, default='last')
'first' puts NaNs at the beginning
'last' puts NaNs at the end
Examples
--------
Expand All @@ -2550,10 +2555,10 @@ def sort(self, columns=None, axis=0, ascending=True,
sorted : DataFrame
"""
return self.sort_index(by=columns, axis=axis, ascending=ascending,
inplace=inplace)
inplace=inplace, kind=kind, na_position=na_position)

def sort_index(self, axis=0, by=None, ascending=True, inplace=False,
kind='quicksort'):
kind='quicksort', na_position='last'):
"""
Sort DataFrame either by labels (along either axis) or by the values in
a column
Expand All @@ -2571,6 +2576,11 @@ def sort_index(self, axis=0, by=None, ascending=True, inplace=False,
orders
inplace : boolean, default False
Sort the DataFrame without creating a new instance
na_position : {'first', 'last'} (optional, default='last')
'first' puts NaNs at the beginning
'last' puts NaNs at the end
kind : {'quicksort', 'mergesort', 'heapsort'}, optional
This option is only applied when sorting on a single column or label.
Examples
--------
Expand All @@ -2580,8 +2590,8 @@ def sort_index(self, axis=0, by=None, ascending=True, inplace=False,
-------
sorted : DataFrame
"""
from pandas.core.groupby import _lexsort_indexer


from pandas.core.groupby import _lexsort_indexer, _nargsort
axis = self._get_axis_number(axis)
if axis not in [0, 1]: # pragma: no cover
raise AssertionError('Axis must be 0 or 1, got %s' % str(axis))
Expand All @@ -2597,23 +2607,19 @@ def sort_index(self, axis=0, by=None, ascending=True, inplace=False,
if com._is_sequence(ascending) and len(by) != len(ascending):
raise ValueError('Length of ascending (%d) != length of by'
' (%d)' % (len(ascending), len(by)))

if len(by) > 1:
keys = []
for x in by:
k = self[x].values
if k.ndim == 2:
raise ValueError('Cannot sort by duplicate column %s'
% str(x))
keys.append(k)

def trans(v):
if com.needs_i8_conversion(v):
return v.view('i8')
return v

keys = [trans(self[x].values) for x in by]
indexer = _lexsort_indexer(keys, orders=ascending)
keys = []
for x in by:
k = self[x].values
if k.ndim == 2:
raise ValueError('Cannot sort by duplicate column %s' % str(x))
keys.append(trans(k))
indexer = _lexsort_indexer(keys, orders=ascending,
na_position=na_position)
indexer = com._ensure_platform_int(indexer)
else:
by = by[0]
Expand All @@ -2630,20 +2636,17 @@ def trans(v):
% str(by))
if isinstance(ascending, (tuple, list)):
ascending = ascending[0]
indexer = _nargsort(k, kind=kind, ascending=ascending,
na_position=na_position)

if not ascending:
k = k[::-1]
indexer = k.argsort(kind=kind)
if not ascending:
indexer = indexer.max() - indexer[::-1]
elif isinstance(labels, MultiIndex):
indexer = _lexsort_indexer(labels.labels, orders=ascending)
indexer = _lexsort_indexer(labels.labels, orders=ascending,
na_position=na_position)
indexer = com._ensure_platform_int(indexer)
else:
indexer = labels.argsort(kind=kind)
if not ascending:
indexer = indexer[::-1]

indexer = _nargsort(labels, kind=kind, ascending=ascending,
na_position=na_position)

if inplace:
if axis == 1:
new_data = self._data.reindex_items(
Expand Down
55 changes: 47 additions & 8 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -3145,33 +3145,72 @@ def _indexer_from_factorized(labels, shape, compress=True):
return indexer


def _lexsort_indexer(keys, orders=None):
def _lexsort_indexer(keys, orders=None, na_position='last'):
labels = []
shape = []

if isinstance(orders, bool):
orders = [orders] * len(keys)
elif orders is None:
orders = [True] * len(keys)

for key, order in zip(keys, orders):
key = np.asanyarray(key)
rizer = _hash.Factorizer(len(key))

if not key.dtype == np.object_:
key = key.astype('O')

# factorize maps nans to na_sentinel=-1
ids = rizer.factorize(key, sort=True)

n = len(rizer.uniques)
mask = (ids == -1)
if order: # ascending
if na_position == 'last':
ids = np.where(mask, n, ids)
elif na_position == 'first':
ids += 1
else:
raise ValueError('invalid na_position: {!r}'.format(na_position))
else: # not order means descending
if na_position == 'last':
ids = np.where(mask, n, n-ids-1)
elif na_position == 'first':
ids = np.where(mask, 0, n-ids)
else:
raise ValueError('invalid na_position: {!r}'.format(na_position))
if mask.any():
n += 1
shape.append(n)
if not order:
mask = ids == -1
ids = np.where(mask, -1, n - ids)

labels.append(ids)

return _indexer_from_factorized(labels, shape)

def _nargsort(items, kind='quicksort', ascending=True, na_position='last'):
"""
This is intended to be a drop-in replacement for np.argsort which handles NaNs
It adds ascending and na_position parameters.
GH #6399, #5231
"""
items = np.asanyarray(items)
idx = np.arange(len(items))
mask = isnull(items)
non_nans = items[~mask]
non_nan_idx = idx[~mask]
nan_idx = np.nonzero(mask)[0]
if not ascending:
non_nans = non_nans[::-1]
non_nan_idx = non_nan_idx[::-1]
indexer = non_nan_idx[non_nans.argsort(kind=kind)]
if not ascending:
indexer = indexer[::-1]
# Finally, place the NaNs at the end or the beginning according to na_position
if na_position == 'last':
indexer = np.concatenate([indexer, nan_idx])
elif na_position == 'first':
indexer = np.concatenate([nan_idx, indexer])
else:
raise ValueError('invalid na_position: {!r}'.format(na_position))
return indexer


class _KeyMapper(object):

Expand Down
10 changes: 5 additions & 5 deletions pandas/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from pandas.core.base import FrozenList, FrozenNDArray, IndexOpsMixin

from pandas.util.decorators import cache_readonly, deprecate
from pandas.core.common import isnull
from pandas.core.common import isnull, array_equivalent
import pandas.core.common as com
from pandas.core.common import _values_from_object, is_float, is_integer, ABCSeries
from pandas.core.config import get_option
Expand Down Expand Up @@ -800,7 +800,7 @@ def equals(self, other):
if type(other) != Index:
return other.equals(self)

return np.array_equal(self, other)
return array_equivalent(self, other)

def identical(self, other):
"""Similar to equals, but check that other comparable attributes are
Expand Down Expand Up @@ -1872,7 +1872,7 @@ def equals(self, other):
# return False

try:
return np.array_equal(self, other)
return array_equivalent(self, other)
except TypeError:
# e.g. fails in numpy 1.6 with DatetimeIndex #1681
return False
Expand Down Expand Up @@ -3533,7 +3533,7 @@ def equals(self, other):
return True

if not isinstance(other, MultiIndex):
return np.array_equal(self.values, _ensure_index(other))
return array_equivalent(self.values, _ensure_index(other))

if self.nlevels != other.nlevels:
return False
Expand All @@ -3546,7 +3546,7 @@ def equals(self, other):
allow_fill=False)
ovalues = com.take_nd(other.levels[i].values, other.labels[i],
allow_fill=False)
if not np.array_equal(svalues, ovalues):
if not array_equivalent(svalues, ovalues):
return False

return True
Expand Down
19 changes: 14 additions & 5 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1743,24 +1743,32 @@ def rank(self, method='average', na_option='keep', ascending=True,
ascending=ascending, pct=pct)
return self._constructor(ranks, index=self.index).__finalize__(self)

def order(self, na_last=True, ascending=True, kind='mergesort'):
def order(self, na_last=None, ascending=True, kind='mergesort', na_position='last'):
"""
Sorts Series object, by value, maintaining index-value link
Parameters
----------
na_last : boolean (optional, default=True)
na_last : boolean (optional, default=True) (DEPRECATED; use na_position)
Put NaN's at beginning or end
ascending : boolean, default True
Sort ascending. Passing False sorts descending
kind : {'mergesort', 'quicksort', 'heapsort'}, default 'mergesort'
Choice of sorting algorithm. See np.sort for more
information. 'mergesort' is the only stable algorithm
na_position : {'first', 'last'} (optional, default='last')
'first' puts NaNs at the beginning
'last' puts NaNs at the end
Returns
-------
y : Series
"""
if na_last is not None:
warnings.warn(("na_last is deprecated. Please use na_position instead"),
FutureWarning)
na_position = 'last' if na_last else 'first'

def _try_kind_sort(arr):
# easier to ask forgiveness than permission
try:
Expand All @@ -1784,15 +1792,16 @@ def _try_kind_sort(arr):
if not ascending:
argsorted = argsorted[::-1]

if na_last:
if na_position == 'last':
n = good.sum()
sortedIdx[:n] = idx[good][argsorted]
sortedIdx[n:] = idx[bad]
else:
elif na_position == 'first':
n = bad.sum()
sortedIdx[n:] = idx[good][argsorted]
sortedIdx[:n] = idx[bad]

else:
raise ValueError('invalid na_position: {!r}'.format(na_position))
return self._constructor(arr[sortedIdx], index=self.index[sortedIdx])\
.__finalize__(self)

Expand Down
13 changes: 8 additions & 5 deletions pandas/hashtable.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -835,20 +835,23 @@ cdef class Factorizer:
return self.count

def factorize(self, ndarray[object] values, sort=False, na_sentinel=-1):
"""
Factorize values with nans replaced by na_sentinel
>>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20)
array([ 0, 1, 20])
"""
labels = self.table.get_labels(values, self.uniques,
self.count, na_sentinel)

mask = (labels == na_sentinel)
# sort on
if sort:
if labels.dtype != np.int_:
labels = labels.astype(np.int_)

sorter = self.uniques.to_array().argsort()
reverse_indexer = np.empty(len(sorter), dtype=np.int_)
reverse_indexer.put(sorter, np.arange(len(sorter)))

labels = reverse_indexer.take(labels)

labels = reverse_indexer.take(labels, mode='clip')
labels[mask] = na_sentinel
self.count = len(self.uniques)
return labels

Expand Down
Loading

0 comments on commit 3230ed4

Please sign in to comment.