From 0d199e4e8bb2b9ce73a35889b49d847283fadce5 Mon Sep 17 00:00:00 2001 From: Wenhuan Date: Sun, 22 Apr 2018 02:23:54 +0800 Subject: [PATCH] BUG: Fix problems in group rank when both nans and infinity are present #20561 (#20681) --- doc/source/whatsnew/v0.23.0.txt | 8 +++-- pandas/_libs/groupby_helper.pxi.in | 31 +++++++++++------- pandas/tests/groupby/test_groupby.py | 49 ++++++++++++++++++++++++++++ 3 files changed, 75 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index e340acc17fe9f..e19aedac80213 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -221,6 +221,12 @@ Current Behavior: s.rank(na_option='top') +These bugs were squashed: + +- Bug in :meth:`DataFrame.rank` and :meth:`Series.rank` when ``method='dense'`` and ``pct=True`` in which percentile ranks were not being used with the number of distinct observations (:issue:`15630`) +- Bug in :meth:`Series.rank` and :meth:`DataFrame.rank` when ``ascending='False'`` failed to return correct ranks for infinity if ``NaN`` were present (:issue:`19538`) +- Bug in :func:`DataFrameGroupBy.rank` where ranks were incorrect when both infinity and ``NaN`` were present (:issue:`20561`) + .. _whatsnew_0230.enhancements.round-trippable_json: JSON read/write round-trippable with ``orient='table'`` @@ -1082,14 +1088,12 @@ Offsets Numeric ^^^^^^^ -- Bug in :meth:`DataFrame.rank` and :meth:`Series.rank` when ``method='dense'`` and ``pct=True`` in which percentile ranks were not being used with the number of distinct observations (:issue:`15630`) - Bug in :class:`Series` constructor with an int or float list where specifying ``dtype=str``, ``dtype='str'`` or ``dtype='U'`` failed to convert the data elements to strings (:issue:`16605`) - Bug in :class:`Index` multiplication and division methods where operating with a ``Series`` would return an ``Index`` object instead of a ``Series`` object (:issue:`19042`) - Bug in the :class:`DataFrame` constructor in which data containing very large positive or very large negative numbers was causing ``OverflowError`` (:issue:`18584`) - Bug in :class:`Index` constructor with ``dtype='uint64'`` where int-like floats were not coerced to :class:`UInt64Index` (:issue:`18400`) - Bug in :class:`DataFrame` flex arithmetic (e.g. ``df.add(other, fill_value=foo)``) with a ``fill_value`` other than ``None`` failed to raise ``NotImplementedError`` in corner cases where either the frame or ``other`` has length zero (:issue:`19522`) - Multiplication and division of numeric-dtyped :class:`Index` objects with timedelta-like scalars returns ``TimedeltaIndex`` instead of raising ``TypeError`` (:issue:`19333`) -- Bug in :meth:`Series.rank` and :meth:`DataFrame.rank` when ``ascending='False'`` failed to return correct ranks for infinity if ``NaN`` were present (:issue:`19538`) - Bug where ``NaN`` was returned instead of 0 by :func:`Series.pct_change` and :func:`DataFrame.pct_change` when ``fill_method`` is not ``None`` (:issue:`19873`) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index de802f4a72277..6a33e4a09476d 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -417,7 +417,8 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, ndarray[int64_t] labels, bint is_datetimelike, object ties_method, bint ascending, bint pct, object na_option): - """Provides the rank of values within each group + """ + Provides the rank of values within each group. Parameters ---------- @@ -425,17 +426,24 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, values : array of {{c_type}} values to be ranked labels : array containing unique label for each group, with its ordering matching up to the corresponding record in `values` - is_datetimelike : bool + is_datetimelike : bool, default False unused in this method but provided for call compatibility with other Cython transformations - ties_method : {'keep', 'top', 'bottom'} + ties_method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' + * average: average rank of group + * min: lowest rank in group + * max: highest rank in group + * first: ranks assigned in order they appear in the array + * dense: like 'min', but rank always increases by 1 between groups + ascending : boolean, default True + False for ranks by high (1) to low (N) + na_option : {'keep', 'top', 'bottom'}, default 'keep' + pct : boolean, default False + Compute percentage rank of data within each group + na_option : {'keep', 'top', 'bottom'}, default 'keep' * keep: leave NA values where they are * top: smallest rank if ascending * bottom: smallest rank if descending - ascending : boolean - False for ranks by high (1) to low (N) - pct : boolean - Compute percentage rank of data within each group Notes ----- @@ -508,7 +516,8 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, # if keep_na, check for missing values and assign back # to the result where appropriate - if keep_na and masked_vals[_as[i]] == nan_fill_val: + + if keep_na and mask[_as[i]]: grp_na_count += 1 out[_as[i], 0] = nan else: @@ -548,9 +557,9 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, # reset the dups and sum_ranks, knowing that a new value is coming # up. the conditional also needs to handle nan equality and the # end of iteration - if (i == N - 1 or ( - (masked_vals[_as[i]] != masked_vals[_as[i+1]]) and not - (mask[_as[i]] and mask[_as[i+1]]))): + if (i == N - 1 or + (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or + (mask[_as[i]] ^ mask[_as[i+1]])): dups = sum_ranks = 0 val_start = i grp_vals_seen += 1 diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index cdb4e3072c65d..c3400b6b710e5 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1965,6 +1965,55 @@ def test_rank_args(self, grps, vals, ties_method, ascending, pct, exp): exp_df = DataFrame(exp * len(grps), columns=['val']) assert_frame_equal(result, exp_df) + @pytest.mark.parametrize("grps", [ + ['qux'], ['qux', 'quux']]) + @pytest.mark.parametrize("vals", [ + [-np.inf, -np.inf, np.nan, 1., np.nan, np.inf, np.inf], + ]) + @pytest.mark.parametrize("ties_method,ascending,na_option,exp", [ + ('average', True, 'keep', [1.5, 1.5, np.nan, 3, np.nan, 4.5, 4.5]), + ('average', True, 'top', [3.5, 3.5, 1.5, 5., 1.5, 6.5, 6.5]), + ('average', True, 'bottom', [1.5, 1.5, 6.5, 3., 6.5, 4.5, 4.5]), + ('average', False, 'keep', [4.5, 4.5, np.nan, 3, np.nan, 1.5, 1.5]), + ('average', False, 'top', [6.5, 6.5, 1.5, 5., 1.5, 3.5, 3.5]), + ('average', False, 'bottom', [4.5, 4.5, 6.5, 3., 6.5, 1.5, 1.5]), + ('min', True, 'keep', [1., 1., np.nan, 3., np.nan, 4., 4.]), + ('min', True, 'top', [3., 3., 1., 5., 1., 6., 6.]), + ('min', True, 'bottom', [1., 1., 6., 3., 6., 4., 4.]), + ('min', False, 'keep', [4., 4., np.nan, 3., np.nan, 1., 1.]), + ('min', False, 'top', [6., 6., 1., 5., 1., 3., 3.]), + ('min', False, 'bottom', [4., 4., 6., 3., 6., 1., 1.]), + ('max', True, 'keep', [2., 2., np.nan, 3., np.nan, 5., 5.]), + ('max', True, 'top', [4., 4., 2., 5., 2., 7., 7.]), + ('max', True, 'bottom', [2., 2., 7., 3., 7., 5., 5.]), + ('max', False, 'keep', [5., 5., np.nan, 3., np.nan, 2., 2.]), + ('max', False, 'top', [7., 7., 2., 5., 2., 4., 4.]), + ('max', False, 'bottom', [5., 5., 7., 3., 7., 2., 2.]), + ('first', True, 'keep', [1., 2., np.nan, 3., np.nan, 4., 5.]), + ('first', True, 'top', [3., 4., 1., 5., 2., 6., 7.]), + ('first', True, 'bottom', [1., 2., 6., 3., 7., 4., 5.]), + ('first', False, 'keep', [4., 5., np.nan, 3., np.nan, 1., 2.]), + ('first', False, 'top', [6., 7., 1., 5., 2., 3., 4.]), + ('first', False, 'bottom', [4., 5., 6., 3., 7., 1., 2.]), + ('dense', True, 'keep', [1., 1., np.nan, 2., np.nan, 3., 3.]), + ('dense', True, 'top', [2., 2., 1., 3., 1., 4., 4.]), + ('dense', True, 'bottom', [1., 1., 4., 2., 4., 3., 3.]), + ('dense', False, 'keep', [3., 3., np.nan, 2., np.nan, 1., 1.]), + ('dense', False, 'top', [4., 4., 1., 3., 1., 2., 2.]), + ('dense', False, 'bottom', [3., 3., 4., 2., 4., 1., 1.]) + ]) + def test_infs_n_nans(self, grps, vals, ties_method, ascending, na_option, + exp): + # GH 20561 + key = np.repeat(grps, len(vals)) + vals = vals * len(grps) + df = DataFrame({'key': key, 'val': vals}) + result = df.groupby('key').rank(method=ties_method, + ascending=ascending, + na_option=na_option) + exp_df = DataFrame(exp * len(grps), columns=['val']) + assert_frame_equal(result, exp_df) + @pytest.mark.parametrize("grps", [ ['qux'], ['qux', 'quux']]) @pytest.mark.parametrize("vals", [