From d49305c75c1b1d0220c7a4a96722d28fd1b754aa Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Thu, 18 Aug 2022 12:09:09 -0400 Subject: [PATCH] BUG: algorithms.factorize moves null values when sort=False (#46601) * BUG: algos.factorizes moves null values when sort=False * Implementation for pyarrow < 4.0 * fixup * fixups * test fixup * type-hints * improvements * remove override of _from_factorized in string_.py * Rework na_sentinel/dropna/ignore_na * fixups * fixup for pyarrow < 4.0 * whatsnew note * doc fixup * fixups * fixup whatsnew note * whatsnew note; comment on old vs new API Co-authored-by: asv-bot --- doc/source/whatsnew/v1.5.0.rst | 3 + pandas/_libs/hashtable_class_helper.pxi.in | 12 ++-- pandas/core/algorithms.py | 67 +++++++++++++------ pandas/core/arrays/arrow/array.py | 22 ++++-- pandas/core/arrays/base.py | 6 +- pandas/core/arrays/masked.py | 32 +++++++-- pandas/core/arrays/sparse/array.py | 4 ++ pandas/core/arrays/string_.py | 4 +- pandas/core/groupby/grouper.py | 5 +- pandas/tests/groupby/test_groupby_dropna.py | 60 +++++++++++++++++ .../tests/groupby/transform/test_transform.py | 6 +- pandas/tests/test_algos.py | 16 ++--- 12 files changed, 179 insertions(+), 58 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index a7ed072fb0aacb..a9d0f75cc605de 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -299,6 +299,7 @@ Other enhancements - Added ``copy`` keyword to :meth:`Series.set_axis` and :meth:`DataFrame.set_axis` to allow user to set axis on a new object without necessarily copying the underlying data (:issue:`47932`) - :meth:`Series.add_suffix`, :meth:`DataFrame.add_suffix`, :meth:`Series.add_prefix` and :meth:`DataFrame.add_prefix` support a ``copy`` argument. If ``False``, the underlying data is not copied in the returned object (:issue:`47934`) - :meth:`DataFrame.set_index` now supports a ``copy`` keyword. If ``False``, the underlying data is not copied when a new :class:`DataFrame` is returned (:issue:`48043`) +- The method :meth:`.ExtensionArray.factorize` accepts ``use_na_sentinel=False`` for determining how null values are to be treated (:issue:`46601`) .. --------------------------------------------------------------------------- .. _whatsnew_150.notable_bug_fixes: @@ -922,6 +923,7 @@ Numeric - Bug in division, ``pow`` and ``mod`` operations on array-likes with ``dtype="boolean"`` not being like their ``np.bool_`` counterparts (:issue:`46063`) - Bug in multiplying a :class:`Series` with ``IntegerDtype`` or ``FloatingDtype`` by an array-like with ``timedelta64[ns]`` dtype incorrectly raising (:issue:`45622`) - Bug in :meth:`mean` where the optional dependency ``bottleneck`` causes precision loss linear in the length of the array. ``bottleneck`` has been disabled for :meth:`mean` improving the loss to log-linear but may result in a performance decrease. (:issue:`42878`) +- Bug in :func:`factorize` would convert the value ``None`` to ``np.nan`` (:issue:`46601`) Conversion ^^^^^^^^^^ @@ -1093,6 +1095,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` would not respect ``dropna=False`` when the input DataFrame/Series had a NaN values in a :class:`MultiIndex` (:issue:`46783`) - Bug in :meth:`DataFrameGroupBy.resample` raises ``KeyError`` when getting the result from a key list which misses the resample key (:issue:`47362`) - Bug in :meth:`DataFrame.groupby` would lose index columns when the DataFrame is empty for transforms, like fillna (:issue:`47787`) +- Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` with ``dropna=False`` and ``sort=False`` would put any null groups at the end instead the order that they are encountered (:issue:`46584`) - Reshaping diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 3c9a1f86ad2a15..54260a9a90964f 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -711,7 +711,7 @@ cdef class {{name}}HashTable(HashTable): return_inverse=return_inverse, mask=mask, use_result_mask=use_result_mask) def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1, - object na_value=None, object mask=None): + object na_value=None, object mask=None, ignore_na=True): """ Calculate unique values and labels (no sorting!) @@ -743,7 +743,7 @@ cdef class {{name}}HashTable(HashTable): """ uniques_vector = {{name}}Vector() return self._unique(values, uniques_vector, na_sentinel=na_sentinel, - na_value=na_value, ignore_na=True, mask=mask, + na_value=na_value, ignore_na=ignore_na, mask=mask, return_inverse=True) def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, @@ -1092,7 +1092,7 @@ cdef class StringHashTable(HashTable): return_inverse=return_inverse) def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, - object na_value=None, object mask=None): + object na_value=None, object mask=None, ignore_na=True): """ Calculate unique values and labels (no sorting!) @@ -1122,7 +1122,7 @@ cdef class StringHashTable(HashTable): """ uniques_vector = ObjectVector() return self._unique(values, uniques_vector, na_sentinel=na_sentinel, - na_value=na_value, ignore_na=True, + na_value=na_value, ignore_na=ignore_na, return_inverse=True) def get_labels(self, ndarray[object] values, ObjectVector uniques, @@ -1347,7 +1347,7 @@ cdef class PyObjectHashTable(HashTable): return_inverse=return_inverse) def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, - object na_value=None, object mask=None): + object na_value=None, object mask=None, ignore_na=True): """ Calculate unique values and labels (no sorting!) @@ -1377,7 +1377,7 @@ cdef class PyObjectHashTable(HashTable): """ uniques_vector = ObjectVector() return self._unique(values, uniques_vector, na_sentinel=na_sentinel, - na_value=na_value, ignore_na=True, + na_value=na_value, ignore_na=ignore_na, return_inverse=True) def get_labels(self, ndarray[object] values, ObjectVector uniques, diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index ddf4b69bed8c67..2e6737b2e61aae 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -521,7 +521,7 @@ def f(c, v): def factorize_array( values: np.ndarray, - na_sentinel: int = -1, + na_sentinel: int | None = -1, size_hint: int | None = None, na_value: object = None, mask: npt.NDArray[np.bool_] | None = None, @@ -552,6 +552,10 @@ def factorize_array( codes : ndarray[np.intp] uniques : ndarray """ + ignore_na = na_sentinel is not None + if not ignore_na: + na_sentinel = -1 + original = values if values.dtype.kind in ["m", "M"]: # _get_hashtable_algo will cast dt64/td64 to i8 via _ensure_data, so we @@ -564,7 +568,11 @@ def factorize_array( table = hash_klass(size_hint or len(values)) uniques, codes = table.factorize( - values, na_sentinel=na_sentinel, na_value=na_value, mask=mask + values, + na_sentinel=na_sentinel, + na_value=na_value, + mask=mask, + ignore_na=ignore_na, ) # re-cast e.g. i8->dt64/td64, uint8->bool @@ -738,6 +746,10 @@ def factorize( # responsible only for factorization. All data coercion, sorting and boxing # should happen here. + # GH#46910 deprecated na_sentinel in favor of use_na_sentinel: + # na_sentinel=None corresponds to use_na_sentinel=False + # na_sentinel=-1 correspond to use_na_sentinel=True + # Other na_sentinel values will not be supported when the deprecation is enforced. na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel) if isinstance(values, ABCRangeIndex): return values.factorize(sort=sort) @@ -749,10 +761,7 @@ def factorize( # GH35667, if na_sentinel=None, we will not dropna NaNs from the uniques # of values, assign na_sentinel=-1 to replace code value for NaN. - dropna = True - if na_sentinel is None: - na_sentinel = -1 - dropna = False + dropna = na_sentinel is not None if ( isinstance(values, (ABCDatetimeArray, ABCTimedeltaArray)) @@ -765,38 +774,58 @@ def factorize( elif not isinstance(values.dtype, np.dtype): if ( - na_sentinel == -1 - and "use_na_sentinel" in inspect.signature(values.factorize).parameters - ): + na_sentinel == -1 or na_sentinel is None + ) and "use_na_sentinel" in inspect.signature(values.factorize).parameters: # Avoid using catch_warnings when possible # GH#46910 - TimelikeOps has deprecated signature codes, uniques = values.factorize( # type: ignore[call-arg] - use_na_sentinel=True + use_na_sentinel=na_sentinel is not None ) else: + na_sentinel_arg = -1 if na_sentinel is None else na_sentinel with warnings.catch_warnings(): # We've already warned above warnings.filterwarnings("ignore", ".*use_na_sentinel.*", FutureWarning) - codes, uniques = values.factorize(na_sentinel=na_sentinel) + codes, uniques = values.factorize(na_sentinel=na_sentinel_arg) else: values = np.asarray(values) # convert DTA/TDA/MultiIndex + # TODO: pass na_sentinel=na_sentinel to factorize_array. When sort is True and + # na_sentinel is None we append NA on the end because safe_sort does not + # handle null values in uniques. + if na_sentinel is None and sort: + na_sentinel_arg = -1 + elif na_sentinel is None: + na_sentinel_arg = None + else: + na_sentinel_arg = na_sentinel codes, uniques = factorize_array( - values, na_sentinel=na_sentinel, size_hint=size_hint + values, + na_sentinel=na_sentinel_arg, + size_hint=size_hint, ) if sort and len(uniques) > 0: + if na_sentinel is None: + # TODO: Can remove when na_sentinel=na_sentinel as in TODO above + na_sentinel = -1 uniques, codes = safe_sort( uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False ) - code_is_na = codes == na_sentinel - if not dropna and code_is_na.any(): - # na_value is set based on the dtype of uniques, and compat set to False is - # because we do not want na_value to be 0 for integers - na_value = na_value_for_dtype(uniques.dtype, compat=False) - uniques = np.append(uniques, [na_value]) - codes = np.where(code_is_na, len(uniques) - 1, codes) + if not dropna and sort: + # TODO: Can remove entire block when na_sentinel=na_sentinel as in TODO above + if na_sentinel is None: + na_sentinel_arg = -1 + else: + na_sentinel_arg = na_sentinel + code_is_na = codes == na_sentinel_arg + if code_is_na.any(): + # na_value is set based on the dtype of uniques, and compat set to False is + # because we do not want na_value to be 0 for integers + na_value = na_value_for_dtype(uniques.dtype, compat=False) + uniques = np.append(uniques, [na_value]) + codes = np.where(code_is_na, len(uniques) - 1, codes) uniques = _reconstruct_data(uniques, original.dtype, original) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index a3b2003b0caf3a..8d80cb18d21dbb 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -551,20 +551,32 @@ def factorize( use_na_sentinel: bool | lib.NoDefault = lib.no_default, ) -> tuple[np.ndarray, ExtensionArray]: resolved_na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel) - if resolved_na_sentinel is None: - raise NotImplementedError("Encoding NaN values is not yet implemented") + if pa_version_under4p0: + encoded = self._data.dictionary_encode() else: - na_sentinel = resolved_na_sentinel - encoded = self._data.dictionary_encode() + null_encoding = "mask" if resolved_na_sentinel is not None else "encode" + encoded = self._data.dictionary_encode(null_encoding=null_encoding) indices = pa.chunked_array( [c.indices for c in encoded.chunks], type=encoded.type.index_type ).to_pandas() if indices.dtype.kind == "f": - indices[np.isnan(indices)] = na_sentinel + indices[np.isnan(indices)] = ( + resolved_na_sentinel if resolved_na_sentinel is not None else -1 + ) indices = indices.astype(np.int64, copy=False) if encoded.num_chunks: uniques = type(self)(encoded.chunk(0).dictionary) + if resolved_na_sentinel is None and pa_version_under4p0: + # TODO: share logic with BaseMaskedArray.factorize + # Insert na with the proper code + na_mask = indices.values == -1 + na_index = na_mask.argmax() + if na_mask[na_index]: + uniques = uniques.insert(na_index, self.dtype.na_value) + na_code = 0 if na_index == 0 else indices[:na_index].argmax() + 1 + indices[indices >= na_code] += 1 + indices[indices == -1] = na_code else: uniques = type(self)(pa.array([], type=encoded.type.value_type)) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 043917376b8c10..1e3b1371846602 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1081,14 +1081,10 @@ def factorize( # 2. ExtensionArray.factorize. # Complete control over factorization. resolved_na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel) - if resolved_na_sentinel is None: - raise NotImplementedError("Encoding NaN values is not yet implemented") - else: - na_sentinel = resolved_na_sentinel arr, na_value = self._values_for_factorize() codes, uniques = factorize_array( - arr, na_sentinel=na_sentinel, na_value=na_value + arr, na_sentinel=resolved_na_sentinel, na_value=na_value ) uniques_ea = self._from_factorized(uniques, self) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 15946ab9ce80d9..c5f6dea7157abd 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -886,19 +886,39 @@ def factorize( use_na_sentinel: bool | lib.NoDefault = lib.no_default, ) -> tuple[np.ndarray, ExtensionArray]: resolved_na_sentinel = algos.resolve_na_sentinel(na_sentinel, use_na_sentinel) - if resolved_na_sentinel is None: - raise NotImplementedError("Encoding NaN values is not yet implemented") - else: - na_sentinel = resolved_na_sentinel arr = self._data mask = self._mask - codes, uniques = factorize_array(arr, na_sentinel=na_sentinel, mask=mask) + # Pass non-None na_sentinel; recode and add NA to uniques if necessary below + na_sentinel_arg = -1 if resolved_na_sentinel is None else resolved_na_sentinel + codes, uniques = factorize_array(arr, na_sentinel=na_sentinel_arg, mask=mask) # check that factorize_array correctly preserves dtype. assert uniques.dtype == self.dtype.numpy_dtype, (uniques.dtype, self.dtype) - uniques_ea = type(self)(uniques, np.zeros(len(uniques), dtype=bool)) + has_na = mask.any() + if resolved_na_sentinel is not None or not has_na: + size = len(uniques) + else: + # Make room for an NA value + size = len(uniques) + 1 + uniques_mask = np.zeros(size, dtype=bool) + if resolved_na_sentinel is None and has_na: + na_index = mask.argmax() + # Insert na with the proper code + if na_index == 0: + na_code = np.intp(0) + else: + # mypy error: Slice index must be an integer or None + # https://github.com/python/mypy/issues/2410 + na_code = codes[:na_index].argmax() + 1 # type: ignore[misc] + codes[codes >= na_code] += 1 + codes[codes == -1] = na_code + # dummy value for uniques; not used since uniques_mask will be True + uniques = np.insert(uniques, na_code, 0) + uniques_mask[na_code] = True + uniques_ea = type(self)(uniques, uniques_mask) + return codes, uniques_ea @doc(ExtensionArray._values_for_argsort) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 13ebd2b25e9499..e2a0740dd214e5 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -879,6 +879,10 @@ def factorize( codes, uniques = algos.factorize( np.asarray(self), na_sentinel=na_sentinel, use_na_sentinel=use_na_sentinel ) + if na_sentinel is lib.no_default: + na_sentinel = -1 + if use_na_sentinel is lib.no_default or use_na_sentinel: + codes[codes == -1] = na_sentinel uniques_sp = SparseArray(uniques, dtype=self.dtype) return codes, uniques_sp diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index e2bc08c03ed3ca..0e2df9f7cf7286 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -380,8 +380,8 @@ def __arrow_array__(self, type=None): def _values_for_factorize(self): arr = self._ndarray.copy() mask = self.isna() - arr[mask] = -1 - return arr, -1 + arr[mask] = None + return arr, None def __setitem__(self, key, value): value = extract_array(value, extract_numpy=True) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 04ebc00b8e9645..72f54abdced274 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -658,9 +658,10 @@ def group_index(self) -> Index: @cache_readonly def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: - if self._passed_categorical: + if self._dropna and self._passed_categorical: # we make a CategoricalIndex out of the cat grouper - # preserving the categories / ordered attributes + # preserving the categories / ordered attributes; + # doesn't (yet - GH#46909) handle dropna=False cat = self.grouping_vector categories = cat.categories diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 515c96780e7316..ee08307c953110 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.compat.pyarrow import pa_version_under1p01 + import pandas as pd import pandas._testing as tm @@ -387,3 +389,61 @@ def test_groupby_drop_nan_with_multi_index(): result = df.groupby(["a", "b"], dropna=False).first() expected = df tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "values, dtype", + [ + ([2, np.nan, 1, 2], None), + ([2, np.nan, 1, 2], "UInt8"), + ([2, np.nan, 1, 2], "Int8"), + ([2, np.nan, 1, 2], "UInt16"), + ([2, np.nan, 1, 2], "Int16"), + ([2, np.nan, 1, 2], "UInt32"), + ([2, np.nan, 1, 2], "Int32"), + ([2, np.nan, 1, 2], "UInt64"), + ([2, np.nan, 1, 2], "Int64"), + ([2, np.nan, 1, 2], "Float32"), + ([2, np.nan, 1, 2], "Int64"), + ([2, np.nan, 1, 2], "Float64"), + (["y", None, "x", "y"], "category"), + (["y", pd.NA, "x", "y"], "string"), + pytest.param( + ["y", pd.NA, "x", "y"], + "string[pyarrow]", + marks=pytest.mark.skipif( + pa_version_under1p01, reason="pyarrow is not installed" + ), + ), + ( + ["2016-01-01", np.datetime64("NaT"), "2017-01-01", "2016-01-01"], + "datetime64[ns]", + ), + ( + [ + pd.Period("2012-02-01", freq="D"), + pd.NA, + pd.Period("2012-01-01", freq="D"), + pd.Period("2012-02-01", freq="D"), + ], + None, + ), + (pd.arrays.SparseArray([2, np.nan, 1, 2]), None), + ], +) +@pytest.mark.parametrize("test_series", [True, False]) +def test_no_sort_keep_na(values, dtype, test_series): + # GH#46584 + key = pd.Series(values, dtype=dtype) + df = pd.DataFrame({"key": key, "a": [1, 2, 3, 4]}) + gb = df.groupby("key", dropna=False, sort=False) + if test_series: + gb = gb["a"] + result = gb.sum() + expected = pd.DataFrame({"a": [5, 2, 3]}, index=key[:-1].rename("key")) + if test_series: + expected = expected["a"] + if expected.index.is_categorical(): + # TODO: Slicing reorders categories? + expected.index = expected.index.reorder_categories(["y", "x"]) + tm.assert_equal(result, expected) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index d2928c52c33e2a..113bd4a0c4c651 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1326,12 +1326,8 @@ def test_transform_cumcount(): @pytest.mark.parametrize("keys", [["A1"], ["A1", "A2"]]) -def test_null_group_lambda_self(request, sort, dropna, keys): +def test_null_group_lambda_self(sort, dropna, keys): # GH 17093 - if not sort and not dropna: - msg = "GH#46584: null values get sorted when sort=False" - request.node.add_marker(pytest.mark.xfail(reason=msg, strict=False)) - size = 50 nulls1 = np.random.choice([False, True], size) nulls2 = np.random.choice([False, True], size) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index b3e9a0f6498b3c..6cdbdb24e716db 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -455,13 +455,13 @@ def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques): [ ( ["a", None, "b", "a"], - np.array([0, 2, 1, 0], dtype=np.dtype("intp")), - np.array(["a", "b", np.nan], dtype=object), + np.array([0, 1, 2, 0], dtype=np.dtype("intp")), + np.array(["a", None, "b"], dtype=object), ), ( ["a", np.nan, "b", "a"], - np.array([0, 2, 1, 0], dtype=np.dtype("intp")), - np.array(["a", "b", np.nan], dtype=object), + np.array([0, 1, 2, 0], dtype=np.dtype("intp")), + np.array(["a", np.nan, "b"], dtype=object), ), ], ) @@ -478,13 +478,13 @@ def test_object_factorize_use_na_sentinel_false( [ ( [1, None, 1, 2], - np.array([0, 2, 0, 1], dtype=np.dtype("intp")), - np.array([1, 2, np.nan], dtype="O"), + np.array([0, 1, 0, 2], dtype=np.dtype("intp")), + np.array([1, None, 2], dtype="O"), ), ( [1, np.nan, 1, 2], - np.array([0, 2, 0, 1], dtype=np.dtype("intp")), - np.array([1, 2, np.nan], dtype=np.float64), + np.array([0, 1, 0, 2], dtype=np.dtype("intp")), + np.array([1, np.nan, 2], dtype=np.float64), ), ], )