From 8386a565fb122d5b46f9afa6e47f4853099e3fa0 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 16 Oct 2019 05:42:47 -0700 Subject: [PATCH] add uint64 support for some libgroupby funcs (#28931) --- pandas/_libs/groupby_helper.pxi.in | 62 ++++++++++++++++++++++++++- pandas/core/groupby/groupby.py | 8 ++++ pandas/tests/groupby/test_function.py | 2 +- 3 files changed, 69 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 6b434b64705811..f052feea0bbf35 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -16,6 +16,7 @@ ctypedef fused rank_t: float64_t float32_t int64_t + uint64_t object @@ -34,6 +35,7 @@ def group_last(rank_t[:, :] out, rank_t val ndarray[rank_t, ndim=2] resx ndarray[int64_t, ndim=2] nobs + bint runtime_error = False assert min_count == -1, "'min_count' only used in add and prod" @@ -106,11 +108,20 @@ def group_last(rank_t[:, :] out, if nobs[i, j] == 0: if rank_t is int64_t: out[i, j] = NPY_NAT + elif rank_t is uint64_t: + runtime_error = True + break else: out[i, j] = NAN else: out[i, j] = resx[i, j] + if runtime_error: + # We cannot raise directly above because that is within a nogil + # block. + raise RuntimeError("empty group with uint64_t") + + group_last_float64 = group_last["float64_t"] group_last_float32 = group_last["float32_t"] group_last_int64 = group_last["int64_t"] @@ -132,6 +143,7 @@ def group_nth(rank_t[:, :] out, rank_t val ndarray[rank_t, ndim=2] resx ndarray[int64_t, ndim=2] nobs + bint runtime_error = False assert min_count == -1, "'min_count' only used in add and prod" @@ -199,11 +211,19 @@ def group_nth(rank_t[:, :] out, if nobs[i, j] == 0: if rank_t is int64_t: out[i, j] = NPY_NAT + elif rank_t is uint64_t: + runtime_error = True + break else: out[i, j] = NAN else: out[i, j] = resx[i, j] + if runtime_error: + # We cannot raise directly above because that is within a nogil + # block. + raise RuntimeError("empty group with uint64_t") + group_nth_float64 = group_nth["float64_t"] group_nth_float32 = group_nth["float32_t"] @@ -282,12 +302,16 @@ def group_rank(float64_t[:, :] out, if ascending ^ (na_option == 'top'): if rank_t is int64_t: nan_fill_val = np.iinfo(np.int64).max + elif rank_t is uint64_t: + nan_fill_val = np.iinfo(np.uint64).max else: nan_fill_val = np.inf order = (masked_vals, mask, labels) else: if rank_t is int64_t: nan_fill_val = np.iinfo(np.int64).min + elif rank_t is uint64_t: + nan_fill_val = 0 else: nan_fill_val = -np.inf @@ -397,6 +421,7 @@ def group_rank(float64_t[:, :] out, group_rank_float64 = group_rank["float64_t"] group_rank_float32 = group_rank["float32_t"] group_rank_int64 = group_rank["int64_t"] +group_rank_uint64 = group_rank["uint64_t"] # Note: we do not have a group_rank_object because that would require a # not-nogil implementation, see GH#19560 @@ -410,6 +435,7 @@ ctypedef fused groupby_t: float64_t float32_t int64_t + uint64_t @cython.wraparound(False) @@ -426,6 +452,7 @@ def group_max(groupby_t[:, :] out, Py_ssize_t i, j, N, K, lab, ncounts = len(counts) groupby_t val, count, nan_val ndarray[groupby_t, ndim=2] maxx, nobs + bint runtime_error = False assert min_count == -1, "'min_count' only used in add and prod" @@ -439,6 +466,11 @@ def group_max(groupby_t[:, :] out, # Note: evaluated at compile-time maxx[:] = -_int64_max nan_val = NPY_NAT + elif groupby_t is uint64_t: + # NB: We do not define nan_val because there is no such thing + # for uint64_t. We carefully avoid having to reference it in this + # case. + maxx[:] = 0 else: maxx[:] = -np.inf nan_val = NAN @@ -462,7 +494,7 @@ def group_max(groupby_t[:, :] out, if val > maxx[lab, j]: maxx[lab, j] = val else: - if val == val and val != nan_val: + if val == val: nobs[lab, j] += 1 if val > maxx[lab, j]: maxx[lab, j] = val @@ -470,10 +502,18 @@ def group_max(groupby_t[:, :] out, for i in range(ncounts): for j in range(K): if nobs[i, j] == 0: + if groupby_t is uint64_t: + runtime_error = True + break out[i, j] = nan_val else: out[i, j] = maxx[i, j] + if runtime_error: + # We cannot raise directly above because that is within a nogil + # block. + raise RuntimeError("empty group with uint64_t") + @cython.wraparound(False) @cython.boundscheck(False) @@ -489,6 +529,7 @@ def group_min(groupby_t[:, :] out, Py_ssize_t i, j, N, K, lab, ncounts = len(counts) groupby_t val, count, nan_val ndarray[groupby_t, ndim=2] minx, nobs + bint runtime_error = False assert min_count == -1, "'min_count' only used in add and prod" @@ -501,6 +542,11 @@ def group_min(groupby_t[:, :] out, if groupby_t is int64_t: minx[:] = _int64_max nan_val = NPY_NAT + elif groupby_t is uint64_t: + # NB: We do not define nan_val because there is no such thing + # for uint64_t. We carefully avoid having to reference it in this + # case. + minx[:] = np.iinfo(np.uint64).max else: minx[:] = np.inf nan_val = NAN @@ -524,7 +570,7 @@ def group_min(groupby_t[:, :] out, if val < minx[lab, j]: minx[lab, j] = val else: - if val == val and val != nan_val: + if val == val: nobs[lab, j] += 1 if val < minx[lab, j]: minx[lab, j] = val @@ -532,10 +578,18 @@ def group_min(groupby_t[:, :] out, for i in range(ncounts): for j in range(K): if nobs[i, j] == 0: + if groupby_t is uint64_t: + runtime_error = True + break out[i, j] = nan_val else: out[i, j] = minx[i, j] + if runtime_error: + # We cannot raise directly above because that is within a nogil + # block. + raise RuntimeError("empty group with uint64_t") + @cython.boundscheck(False) @cython.wraparound(False) @@ -575,6 +629,8 @@ def group_cummin(groupby_t[:, :] out, accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype) if groupby_t is int64_t: accum[:] = _int64_max + elif groupby_t is uint64_t: + accum[:] = np.iinfo(np.uint64).max else: accum[:] = np.inf @@ -642,6 +698,8 @@ def group_cummax(groupby_t[:, :] out, accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype) if groupby_t is int64_t: accum[:] = -_int64_max + elif groupby_t is uint64_t: + accum[:] = 0 else: accum[:] = -np.inf diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 6379e27e55d2e9..92ea733cc34475 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1355,7 +1355,15 @@ def f(self, **kwargs): return self._cython_agg_general(alias, alt=npfunc, **kwargs) except AssertionError as e: raise SpecificationError(str(e)) + except DataError: + pass except Exception: + # TODO: the remaining test cases that get here are from: + # - AttributeError from _cython_agg_blocks bug passing + # DataFrame to make_block; see GH#28275 + # - TypeError in _cython_operation calling ensure_float64 + # on object array containing complex numbers; + # see test_groupby_complex, test_max_nan_bug pass # apply a non-cython aggregation diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index afb22a732691cd..571e710ba8928f 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -378,7 +378,7 @@ def test_median_empty_bins(observed): @pytest.mark.parametrize( - "dtype", ["int8", "int16", "int32", "int64", "float32", "float64"] + "dtype", ["int8", "int16", "int32", "int64", "float32", "float64", "uint64"] ) @pytest.mark.parametrize( "method,data",