From 552775fcd4deeeb6af9dc3e9fe4d85185b91a6da Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 27 May 2022 14:44:03 -0400 Subject: [PATCH 01/17] DEPR: na_sentinel in factorize --- doc/source/whatsnew/v1.5.0.rst | 2 +- pandas/core/algorithms.py | 44 +++++++++++++++++--- pandas/core/arrays/arrow/array.py | 13 +++++- pandas/core/arrays/base.py | 66 +++++++++++++++++++----------- pandas/core/arrays/datetimelike.py | 11 ++++- pandas/core/arrays/masked.py | 12 +++++- pandas/core/arrays/sparse/array.py | 11 ++++- pandas/core/base.py | 11 ++++- pandas/core/common.py | 49 ++++++++++++++++++++++ pandas/core/groupby/grouper.py | 8 +--- pandas/core/indexes/range.py | 7 +++- pandas/tests/test_algos.py | 33 +++++++++++---- 12 files changed, 217 insertions(+), 50 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 4a0b9a97a9d11..26bff90180c88 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -669,7 +669,7 @@ Other Deprecations - Deprecated the methods :meth:`DataFrame.mad`, :meth:`Series.mad`, and the corresponding groupby methods (:issue:`11787`) - Deprecated positional arguments to :meth:`Index.join` except for ``other``, use keyword-only arguments instead of positional arguments (:issue:`46518`) - Deprecated indexing on a timezone-naive :class:`DatetimeIndex` using a string representing a timezone-aware datetime (:issue:`46903`, :issue:`36148`) -- +- Deprecated the argument ``na_sentinel`` in :func:`pd.factorize`, :meth:`Index.factorize`, and :meth:`ExtensionArray.factorize`; use ``use_na_sentinel`` instead of an integer to use the sentinel ``-1`` for NaN values and ``False`` instead of ``None`` to encode NaN values (:issue:`46910`) .. --------------------------------------------------------------------------- .. 
_whatsnew_150.performance: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 888e943488953..6b7ac0e12f10c 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -80,6 +80,7 @@ na_value_for_dtype, ) +from pandas.core import common as com from pandas.core.array_algos.take import take_nd from pandas.core.construction import ( array as pd_array, @@ -580,7 +581,8 @@ def factorize_array( def factorize( values, sort: bool = False, - na_sentinel: int | None = -1, + na_sentinel: int | None | lib.NoDefault = lib.no_default, + use_na_sentinel: bool | lib.NoDefault = lib.no_default, size_hint: int | None = None, ) -> tuple[np.ndarray, np.ndarray | Index]: """ @@ -595,10 +597,21 @@ def factorize( ---------- {values}{sort} na_sentinel : int or None, default -1 - Value to mark "not found". If None, will not drop the NaN - from the uniques of the values. + Value to mark "not found". If None, NaN values will be encoded as positive + integers and will not drop the NaN from the uniques of the values. + + .. deprecated:: 1.5.0 + Specifying the specific value to use for na_sentinel is deprecated and + will be removed in a future version of pandas. Specify use_na_sentinel as + either True or False. .. versionchanged:: 1.1.2 + + use_na_sentinel : bool, default True + If True, the sentinel -1 will be used for NaN values. If False, + NaN values will be encoded as non-negative integers and will not drop the + NaN from the uniques of the values. + {size_hint}\ Returns @@ -706,7 +719,13 @@ def factorize( # responsible only for factorization. All data coercion, sorting and boxing # should happen here. + # Can't always warn here because EA's factorize will warn too; warn for each + # path below. 
+ passed_na_sentinel = na_sentinel + na_sentinel = com.resolve_na_sentinel(na_sentinel, use_na_sentinel, warn=False) if isinstance(values, ABCRangeIndex): + # Emit warning if appropriate + _ = com.resolve_na_sentinel(passed_na_sentinel, use_na_sentinel) return values.factorize(sort=sort) values = _ensure_arraylike(values) @@ -725,15 +744,30 @@ def factorize( isinstance(values, (ABCDatetimeArray, ABCTimedeltaArray)) and values.freq is not None ): + # Emit warning if appropriate + _ = com.resolve_na_sentinel(passed_na_sentinel, use_na_sentinel) # The presence of 'freq' means we can fast-path sorting and know there # aren't NAs codes, uniques = values.factorize(sort=sort) return _re_wrap_factorize(original, uniques, codes) - if not isinstance(values.dtype, np.dtype): + elif not isinstance(values.dtype, np.dtype): # i.e. ExtensionDtype - codes, uniques = values.factorize(na_sentinel=na_sentinel) + if passed_na_sentinel is lib.no_default: + # User didn't specify na_sentinel; avoid warning. Note EA path always + # uses a na_sentinel value. 
+ codes, uniques = values.factorize(use_na_sentinel=True) + elif passed_na_sentinel is None: + # Emit the appropriate warning message for None + _ = com.resolve_na_sentinel(passed_na_sentinel, use_na_sentinel) + codes, uniques = values.factorize(use_na_sentinel=True) + else: + # EA.factorize will warn + codes, uniques = values.factorize(na_sentinel=na_sentinel) + else: + # Generate warning for na_sentinel if appropriate + _ = com.resolve_na_sentinel(passed_na_sentinel, use_na_sentinel) values = np.asarray(values) # convert DTA/TDA/MultiIndex codes, uniques = factorize_array( values, na_sentinel=na_sentinel, size_hint=size_hint diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index fdd505e259dd9..2679b0bf8fd81 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -8,6 +8,7 @@ import numpy as np +from pandas._libs import lib from pandas._typing import ( TakeIndexer, npt, @@ -28,6 +29,7 @@ ) from pandas.core.dtypes.missing import isna +from pandas.core import common as com from pandas.core.arrays.base import ExtensionArray from pandas.core.indexers import ( check_array_indexer, @@ -122,7 +124,16 @@ def dropna(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: return type(self)(pc.drop_null(self._data)) @doc(ExtensionArray.factorize) - def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: + def factorize( + self, + na_sentinel: int | lib.NoDefault = lib.no_default, + use_na_sentinel: bool | lib.NoDefault = lib.no_default, + ) -> tuple[np.ndarray, ExtensionArray]: + resolved_na_sentinel = com.resolve_na_sentinel(na_sentinel, use_na_sentinel) + if resolved_na_sentinel is None: + raise NotImplementedError("Encoding NaN values is not yet implemented") + else: + na_sentinel = resolved_na_sentinel encoded = self._data.dictionary_encode() indices = pa.chunked_array( [c.indices for c in encoded.chunks], type=encoded.type.index_type diff --git a/pandas/core/arrays/base.py
b/pandas/core/arrays/base.py index eb3c6d6d26101..2a0f461cab168 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -68,6 +68,7 @@ from pandas.core import ( arraylike, + common as com, missing, roperator, ) @@ -1002,36 +1003,50 @@ def _values_for_factorize(self) -> tuple[np.ndarray, Any]: """ return self.astype(object), np.nan - def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: + def factorize( + self, + na_sentinel: int | lib.NoDefault = lib.no_default, + use_na_sentinel: bool | lib.NoDefault = lib.no_default, + ) -> tuple[np.ndarray, ExtensionArray]: """ - Encode the extension array as an enumerated type. + Encode the extension array as an enumerated type. - Parameters - ---------- - na_sentinel : int, default -1 - Value to use in the `codes` array to indicate missing values. + Parameters + ---------- + na_sentinel : int, default -1 + Value to use in the `codes` array to indicate missing values. - Returns - ------- - codes : ndarray - An integer NumPy array that's an indexer into the original - ExtensionArray. - uniques : ExtensionArray - An ExtensionArray containing the unique values of `self`. + .. deprecated:: 1.5.0 + Specifying the specific value to use for na_sentinel is deprecated and + will be removed in a future version of pandas. Specify use_na_sentinel + as either True or False. - .. note:: + use_na_sentinel : bool, default True + If True, the sentinel -1 will be used for NaN values. If False, + NaN values will be encoded as non-negative integers and will not drop the + NaN from the uniques of the values. - uniques will *not* contain an entry for the NA value of - the ExtensionArray if there are any missing values present - in `self`. + Returns + ------- + codes : ndarray + An integer NumPy array that's an indexer into the original + ExtensionArray. + uniques : ExtensionArray + An ExtensionArray containing the unique values of `self`. 
- See Also - -------- - factorize : Top-level factorize method that dispatches here. + .. note:: - Notes - ----- - :meth:`pandas.factorize` offers a `sort` keyword as well. + uniques will *not* contain an entry for the NA value of + the ExtensionArray if there are any missing values present + in `self`. + + See Also + -------- + factorize : Top-level factorize method that dispatches here. + + Notes + ----- + :meth:`pandas.factorize` offers a `sort` keyword as well. """ # Implementer note: There are two ways to override the behavior of # pandas.factorize @@ -1041,6 +1056,11 @@ def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: # original ExtensionArray. # 2. ExtensionArray.factorize. # Complete control over factorization. + resolved_na_sentinel = com.resolve_na_sentinel(na_sentinel, use_na_sentinel) + if resolved_na_sentinel is None: + raise NotImplementedError("Encoding NaN values is not yet implemented") + else: + na_sentinel = resolved_na_sentinel arr, na_value = self._values_for_factorize() codes, uniques = factorize_array( diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 98012bfa31943..d771851a3c971 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1899,7 +1899,12 @@ def _with_freq(self, freq): # -------------------------------------------------------------- - def factorize(self, na_sentinel=-1, sort: bool = False): + def factorize( + self, + na_sentinel: int | lib.NoDefault = lib.no_default, + use_na_sentinel: bool | lib.NoDefault = lib.no_default, + sort: bool = False, + ): if self.freq is not None: # We must be unique, so can short-circuit (and retain freq) codes = np.arange(len(self), dtype=np.intp) @@ -1909,7 +1914,9 @@ def factorize(self, na_sentinel=-1, sort: bool = False): uniques = uniques[::-1] return codes, uniques # FIXME: shouldn't get here; we are ignoring sort - return super().factorize(na_sentinel=na_sentinel) + return 
super().factorize( + na_sentinel=na_sentinel, use_na_sentinel=use_na_sentinel + ) # ------------------------------------------------------------------- diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 3616e3512c6fe..1ba4d7a451d89 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -59,6 +59,7 @@ from pandas.core import ( algorithms as algos, arraylike, + common as com, missing, nanops, ops, @@ -869,7 +870,16 @@ def searchsorted( return self._data.searchsorted(value, side=side, sorter=sorter) @doc(ExtensionArray.factorize) - def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: + def factorize( + self, + na_sentinel: int | lib.NoDefault = lib.no_default, + use_na_sentinel: bool | lib.NoDefault = lib.no_default, + ) -> tuple[np.ndarray, ExtensionArray]: + resolved_na_sentinel = com.resolve_na_sentinel(na_sentinel, use_na_sentinel) + if resolved_na_sentinel is None: + raise NotImplementedError("Encoding NaN values is not yet implemented") + else: + na_sentinel = resolved_na_sentinel arr = self._data mask = self._mask diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 427bf50ca7424..d91edc15e6d66 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -848,12 +848,21 @@ def _values_for_factorize(self): # Still override this for hash_pandas_object return np.asarray(self), self.fill_value - def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, SparseArray]: + def factorize( + self, + na_sentinel: int | lib.NoDefault = lib.no_default, + use_na_sentinel: bool | lib.NoDefault = lib.no_default, + ) -> tuple[np.ndarray, SparseArray]: # Currently, ExtensionArray.factorize -> Tuple[ndarray, EA] # The sparsity on this is backwards from what Sparse would want. Want # ExtensionArray.factorize -> Tuple[EA, EA] # Given that we have to return a dense array of codes, why bother # implementing an efficient factorize? 
+ resolved_na_sentinel = com.resolve_na_sentinel(na_sentinel, use_na_sentinel) + if resolved_na_sentinel is None: + raise NotImplementedError("Encoding NaN values is not yet implemented") + else: + na_sentinel = resolved_na_sentinel codes, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel) uniques_sp = SparseArray(uniques, dtype=self.dtype) return codes, uniques_sp diff --git a/pandas/core/base.py b/pandas/core/base.py index b4c2c81ee666f..7541eff9a11d4 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1136,8 +1136,15 @@ def _memory_usage(self, deep: bool = False) -> int: """ ), ) - def factorize(self, sort: bool = False, na_sentinel: int | None = -1): - return algorithms.factorize(self, sort=sort, na_sentinel=na_sentinel) + def factorize( + self, + sort: bool = False, + na_sentinel: int | lib.NoDefault = lib.no_default, + use_na_sentinel: bool | lib.NoDefault = lib.no_default, + ): + return algorithms.factorize( + self, sort=sort, na_sentinel=na_sentinel, use_na_sentinel=use_na_sentinel + ) _shared_docs[ "searchsorted" diff --git a/pandas/core/common.py b/pandas/core/common.py index eeb18759fc72c..87f00485dada2 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -708,3 +708,52 @@ def deprecate_numeric_only_default(cls: type, name: str, deprecate_none: bool = ) warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) + + +def resolve_na_sentinel( + na_sentinel: int | None | lib.NoDefault, + use_na_sentinel: bool | lib.NoDefault, + warn: bool = True, +) -> int | None: + """Determine value of na_sentinel for factorize methods. + + See GH#46910 for details on the deprecation. + + Parameters + ---------- + na_sentinel : bool, int, None, or lib.no_default + Value passed to the method. + + Returns + ------- + Resolved value of na_sentinel. 
+ """ + if na_sentinel is not lib.no_default and use_na_sentinel is not lib.no_default: + raise ValueError( + "Cannot specify both na_sentinel and use_na_sentinel; " + f"got na_sentinel={na_sentinel} and use_na_sentinel={use_na_sentinel}" + ) + if na_sentinel is lib.no_default: + result = -1 if use_na_sentinel is lib.no_default or use_na_sentinel else None + else: + if warn: + if na_sentinel is None: + msg = ( + "Specifying na_sentinel=None is deprecated, specify " + "use_na_sentinel=False instead." + ) + elif na_sentinel == -1: + msg = ( + "Specifying na_sentinel=-1 is deprecated, specify " + "use_na_sentinel=True instead." + ) + else: + msg = ( + "Specifying the specific value to use for na_sentinel is " + "deprecated and will be removed in a future version of pandas. " + "Specify use_na_sentinel=True to use the sentinel value -1, and " + "use_na_sentinel=False to encode NaN values." + ) + warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) + result = na_sentinel + return result diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 3f37c5f0c6df9..e36a24fcd84c6 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -681,13 +681,9 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: codes = self.grouping_vector.codes_info uniques = self.grouping_vector.result_index._values else: - # GH35667, replace dropna=False with na_sentinel=None - if not self._dropna: - na_sentinel = None - else: - na_sentinel = -1 + # GH35667, replace dropna=False with use_na_sentinel=False codes, uniques = algorithms.factorize( - self.grouping_vector, sort=self._sort, na_sentinel=na_sentinel + self.grouping_vector, sort=self._sort, use_na_sentinel=self._dropna ) return codes, uniques diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index fdb1ee754a7e6..3f5bf8d40c7ee 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -510,8 +510,13 @@ def
argsort(self, *args, **kwargs) -> npt.NDArray[np.intp]: return result def factorize( - self, sort: bool = False, na_sentinel: int | None = -1 + self, + sort: bool = False, + na_sentinel: int | lib.NoDefault = lib.no_default, + use_na_sentinel: bool | lib.NoDefault = lib.no_default, ) -> tuple[npt.NDArray[np.intp], RangeIndex]: + # resolve to emit warning if appropriate + _ = com.resolve_na_sentinel(na_sentinel, use_na_sentinel) codes = np.arange(len(self), dtype=np.intp) uniques = self if sort and self.step < 0: diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 2d73b8e91e831..19703d2c622f7 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -75,11 +75,11 @@ def test_factorize(self, index_or_series_obj, sort): tm.assert_numpy_array_equal(result_codes, expected_codes) tm.assert_index_equal(result_uniques, expected_uniques, exact=True) - def test_series_factorize_na_sentinel_none(self): + def test_series_factorize_use_na_sentinel_false(self): # GH#35667 values = np.array([1, 2, 1, np.nan]) ser = Series(values) - codes, uniques = ser.factorize(na_sentinel=None) + codes, uniques = ser.factorize(use_na_sentinel=False) expected_codes = np.array([0, 1, 0, 2], dtype=np.intp) expected_uniques = Index([1.0, 2.0, np.nan]) @@ -87,6 +87,20 @@ def test_series_factorize_na_sentinel_none(self): tm.assert_numpy_array_equal(codes, expected_codes) tm.assert_index_equal(uniques, expected_uniques) + @pytest.mark.parametrize("na_sentinel", [None, -1, -10]) + def test_depr_na_sentinel(self, na_sentinel, index_or_series_obj): + # GH#46910 + if na_sentinel is None: + msg = "Specifying na_sentinel=None is deprecated" + elif na_sentinel == -1: + msg = "Specifying na_sentinel=-1 is deprecated" + else: + msg = "Specifying the specific value to use for na_sentinel is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + _ = pd.factorize(index_or_series_obj, na_sentinel=na_sentinel) + with 
tm.assert_produces_warning(FutureWarning, match=msg): + _ = index_or_series_obj.factorize(na_sentinel=na_sentinel) + def test_basic(self): codes, uniques = algos.factorize(["a", "b", "b", "a", "a", "c", "c", "c"]) @@ -418,7 +432,12 @@ def test_parametrized_factorize_na_value(self, data, na_value): ids=["numpy_array", "extension_array"], ) def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques): - codes, uniques = algos.factorize(data, sort=sort, na_sentinel=na_sentinel) + if na_sentinel == -1: + msg = "Specifying na_sentinel=-1 is deprecated" + else: + msg = "the specific value to use for na_sentinel is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + codes, uniques = algos.factorize(data, sort=sort, na_sentinel=na_sentinel) if sort: expected_codes = np.array([1, 0, na_sentinel, 1], dtype=np.intp) expected_uniques = algos.safe_sort(uniques) @@ -446,10 +465,10 @@ def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques): ), ], ) - def test_object_factorize_na_sentinel_none( + def test_object_factorize_use_na_sentinel_false( self, data, expected_codes, expected_uniques ): - codes, uniques = algos.factorize(data, na_sentinel=None) + codes, uniques = algos.factorize(data, use_na_sentinel=False) tm.assert_numpy_array_equal(uniques, expected_uniques) tm.assert_numpy_array_equal(codes, expected_codes) @@ -469,10 +488,10 @@ def test_object_factorize_na_sentinel_none( ), ], ) - def test_int_factorize_na_sentinel_none( + def test_int_factorize_use_na_sentinel_false( self, data, expected_codes, expected_uniques ): - codes, uniques = algos.factorize(data, na_sentinel=None) + codes, uniques = algos.factorize(data, use_na_sentinel=False) tm.assert_numpy_array_equal(uniques, expected_uniques) tm.assert_numpy_array_equal(codes, expected_codes) From 79231e795c0a91f6f734427a88f85b65b1ff6cba Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 27 May 2022 14:50:11 -0400 Subject: [PATCH 02/17] WIP --- 
pandas/tests/extension/base/methods.py | 16 +++++++++++++--- pandas/tests/extension/test_boolean.py | 7 ++++++- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index b829b017d5fb1..a18a557429caf 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -213,7 +213,12 @@ def test_unique(self, data, box, method): @pytest.mark.parametrize("na_sentinel", [-1, -2]) def test_factorize(self, data_for_grouping, na_sentinel): - codes, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) + if na_sentinel == -1: + msg = "Specifying na_sentinel=-1 is deprecated" + else: + msg = "Specifying the specific value to use for na_sentinel is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + codes, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) expected_codes = np.array( [0, 0, na_sentinel, na_sentinel, 1, 1, 0, 2], dtype=np.intp ) @@ -224,8 +229,13 @@ def test_factorize(self, data_for_grouping, na_sentinel): @pytest.mark.parametrize("na_sentinel", [-1, -2]) def test_factorize_equivalence(self, data_for_grouping, na_sentinel): - codes_1, uniques_1 = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) - codes_2, uniques_2 = data_for_grouping.factorize(na_sentinel=na_sentinel) + if na_sentinel == -1: + msg = "Specifying na_sentinel=-1 is deprecated" + else: + msg = "Specifying the specific value to use for na_sentinel is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + codes_1, uniques_1 = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) + codes_2, uniques_2 = data_for_grouping.factorize(na_sentinel=na_sentinel) tm.assert_numpy_array_equal(codes_1, codes_2) self.assert_extension_array_equal(uniques_1, uniques_2) diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index e45bffba944c0..89d074108f881 100644 --- 
a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -177,7 +177,12 @@ class TestMethods(base.BaseMethodsTests): @pytest.mark.parametrize("na_sentinel", [-1, -2]) def test_factorize(self, data_for_grouping, na_sentinel): # override because we only have 2 unique values - labels, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) + if na_sentinel == -1: + msg = "Specifying na_sentinel=-1 is deprecated" + else: + msg = "Specifying the specific value to use for na_sentinel is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + labels, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) expected_labels = np.array( [0, 0, na_sentinel, na_sentinel, 1, 1, 0], dtype=np.intp ) From c82228256bcd08afdcb6c0fcd9339ebf620831c6 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 27 May 2022 14:44:03 -0400 Subject: [PATCH 03/17] DEPR: na_sentinel in factorize --- doc/source/whatsnew/v1.5.0.rst | 2 +- pandas/core/algorithms.py | 45 ++++++++++++++++++++--- pandas/core/arrays/arrow/array.py | 13 ++++++- pandas/core/arrays/base.py | 24 ++++++++++++- pandas/core/arrays/datetimelike.py | 11 ++++-- pandas/core/arrays/masked.py | 12 ++++++- pandas/core/arrays/sparse/array.py | 11 ++++-- pandas/core/base.py | 11 ++++-- pandas/core/common.py | 49 ++++++++++++++++++++++++++ pandas/core/groupby/grouper.py | 8 ++--- pandas/core/indexes/range.py | 7 +++- pandas/tests/extension/base/methods.py | 16 +++++++-- pandas/tests/extension/test_boolean.py | 7 +++- pandas/tests/test_algos.py | 33 +++++++++++++---- 14 files changed, 216 insertions(+), 33 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 4ceb833214a79..a5beb08778da8 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -669,7 +669,7 @@ Other Deprecations - Deprecated the methods :meth:`DataFrame.mad`, :meth:`Series.mad`, and the corresponding groupby methods 
(:issue:`11787`) - Deprecated positional arguments to :meth:`Index.join` except for ``other``, use keyword-only arguments instead of positional arguments (:issue:`46518`) - Deprecated indexing on a timezone-naive :class:`DatetimeIndex` using a string representing a timezone-aware datetime (:issue:`46903`, :issue:`36148`) -- +- Deprecated the argument ``na_sentinel`` in :func:`factorize`, :meth:`Index.factorize`, and :meth:`.ExtensionArray.factorize`; use ``use_na_sentinel`` instead of an integer to use the sentinel ``-1`` for NaN values and ``False`` instead of ``None`` to encode NaN values (:issue:`46910`) .. --------------------------------------------------------------------------- .. _whatsnew_150.performance: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 888e943488953..bb3bb8a79951c 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -80,6 +80,7 @@ na_value_for_dtype, ) +from pandas.core import common as com from pandas.core.array_algos.take import take_nd from pandas.core.construction import ( array as pd_array, @@ -580,7 +581,8 @@ def factorize_array( def factorize( values, sort: bool = False, - na_sentinel: int | None = -1, + na_sentinel: int | None | lib.NoDefault = lib.no_default, + use_na_sentinel: bool | lib.NoDefault = lib.no_default, size_hint: int | None = None, ) -> tuple[np.ndarray, np.ndarray | Index]: """ @@ -595,10 +597,22 @@ def factorize( ---------- {values}{sort} na_sentinel : int or None, default -1 - Value to mark "not found". If None, will not drop the NaN - from the uniques of the values. + Value to mark "not found". If None, NaN values will be encoded as positive + integers and will not drop the NaN from the uniques of the values. + + .. deprecated:: 1.5.0 + Specifying the specific value to use for na_sentinel is deprecated and + will be removed in a future version of pandas. Specify use_na_sentinel as + either True or False. .. 
versionchanged:: 1.1.2 + + use_na_sentinel : bool, default True + If True, the sentinel -1 will be used for NaN values. If False, + NaN values will be encoded as non-negative integers and will not drop the + NaN from the uniques of the values. + + .. versionadded:: 1.5.0 {size_hint}\ Returns @@ -706,7 +720,13 @@ def factorize( # responsible only for factorization. All data coercion, sorting and boxing # should happen here. + # Can't always warn here because EA's factorize will warn too; warn for each + # path below. + passed_na_sentinel = na_sentinel + na_sentinel = com.resolve_na_sentinel(na_sentinel, use_na_sentinel, warn=False) if isinstance(values, ABCRangeIndex): + # Emit warning if appropriate + _ = com.resolve_na_sentinel(passed_na_sentinel, use_na_sentinel) return values.factorize(sort=sort) values = _ensure_arraylike(values) @@ -725,15 +745,30 @@ def factorize( isinstance(values, (ABCDatetimeArray, ABCTimedeltaArray)) and values.freq is not None ): + # Emit warning if appropriate + _ = com.resolve_na_sentinel(passed_na_sentinel, use_na_sentinel) # The presence of 'freq' means we can fast-path sorting and know there # aren't NAs codes, uniques = values.factorize(sort=sort) return _re_wrap_factorize(original, uniques, codes) - if not isinstance(values.dtype, np.dtype): + elif not isinstance(values.dtype, np.dtype): # i.e. ExtensionDtype - codes, uniques = values.factorize(na_sentinel=na_sentinel) + if passed_na_sentinel is lib.no_default: + # User didn't specify na_sentinel; avoid warning. Note EA path always + # uses a na_sentinel value. 
+ codes, uniques = values.factorize(use_na_sentinel=True) + elif passed_na_sentinel is None: + # Emit the appropriate warning message for None + _ = com.resolve_na_sentinel(passed_na_sentinel, use_na_sentinel) + codes, uniques = values.factorize(use_na_sentinel=True) + else: + # EA.factorize will warn + codes, uniques = values.factorize(na_sentinel=na_sentinel) + else: + # Generate warning for na_sentinel if appropriate + _ = com.resolve_na_sentinel(passed_na_sentinel, use_na_sentinel) values = np.asarray(values) # convert DTA/TDA/MultiIndex codes, uniques = factorize_array( values, na_sentinel=na_sentinel, size_hint=size_hint diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index fdd505e259dd9..2679b0bf8fd81 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -8,6 +8,7 @@ import numpy as np +from pandas._libs import lib from pandas._typing import ( TakeIndexer, npt, @@ -28,6 +29,7 @@ ) from pandas.core.dtypes.missing import isna +from pandas.core import common as com from pandas.core.arrays.base import ExtensionArray from pandas.core.indexers import ( check_array_indexer, @@ -122,7 +124,16 @@ def dropna(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: return type(self)(pc.drop_null(self._data)) @doc(ExtensionArray.factorize) - def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: + def factorize( + self, + na_sentinel: int | lib.NoDefault = lib.no_default, + use_na_sentinel: bool | lib.NoDefault = lib.no_default, + ) -> tuple[np.ndarray, ExtensionArray]: + resolved_na_sentinel = com.resolve_na_sentinel(na_sentinel, use_na_sentinel) + if resolved_na_sentinel is None: + raise NotImplementedError("Encoding NaN values is not yet implemented") + else: + na_sentinel = resolved_na_sentinel encoded = self._data.dictionary_encode() indices = pa.chunked_array( [c.indices for c in encoded.chunks], type=encoded.type.index_type diff --git a/pandas/core/arrays/base.py
b/pandas/core/arrays/base.py index eb3c6d6d26101..3e39653f207fd 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -68,6 +68,7 @@ from pandas.core import ( arraylike, + common as com, missing, roperator, ) @@ -1002,7 +1003,11 @@ def _values_for_factorize(self) -> tuple[np.ndarray, Any]: """ return self.astype(object), np.nan - def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: + def factorize( + self, + na_sentinel: int | lib.NoDefault = lib.no_default, + use_na_sentinel: bool | lib.NoDefault = lib.no_default, + ) -> tuple[np.ndarray, ExtensionArray]: """ Encode the extension array as an enumerated type. @@ -1011,6 +1016,18 @@ def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: na_sentinel : int, default -1 Value to use in the `codes` array to indicate missing values. + .. deprecated:: 1.5.0 + Specifying the specific value to use for na_sentinel is deprecated and + will be removed in a future version of pandas. Specify use_na_sentinel + as either True or False. + + use_na_sentinel : bool, default True + If True, the sentinel -1 will be used for NaN values. If False, + NaN values will be encoded as non-negative integers and will not drop the + NaN from the uniques of the values. + + .. versionadded:: 1.5.0 + Returns ------- codes : ndarray @@ -1041,6 +1058,11 @@ def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: # original ExtensionArray. # 2. ExtensionArray.factorize. # Complete control over factorization. 
+ resolved_na_sentinel = com.resolve_na_sentinel(na_sentinel, use_na_sentinel) + if resolved_na_sentinel is None: + raise NotImplementedError("Encoding NaN values is not yet implemented") + else: + na_sentinel = resolved_na_sentinel arr, na_value = self._values_for_factorize() codes, uniques = factorize_array( diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 98012bfa31943..d771851a3c971 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1899,7 +1899,12 @@ def _with_freq(self, freq): # -------------------------------------------------------------- - def factorize(self, na_sentinel=-1, sort: bool = False): + def factorize( + self, + na_sentinel: int | lib.NoDefault = lib.no_default, + use_na_sentinel: bool | lib.NoDefault = lib.no_default, + sort: bool = False, + ): if self.freq is not None: # We must be unique, so can short-circuit (and retain freq) codes = np.arange(len(self), dtype=np.intp) @@ -1909,7 +1914,9 @@ def factorize(self, na_sentinel=-1, sort: bool = False): uniques = uniques[::-1] return codes, uniques # FIXME: shouldn't get here; we are ignoring sort - return super().factorize(na_sentinel=na_sentinel) + return super().factorize( + na_sentinel=na_sentinel, use_na_sentinel=use_na_sentinel + ) # ------------------------------------------------------------------- diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 3616e3512c6fe..1ba4d7a451d89 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -59,6 +59,7 @@ from pandas.core import ( algorithms as algos, arraylike, + common as com, missing, nanops, ops, @@ -869,7 +870,16 @@ def searchsorted( return self._data.searchsorted(value, side=side, sorter=sorter) @doc(ExtensionArray.factorize) - def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: + def factorize( + self, + na_sentinel: int | lib.NoDefault = lib.no_default, + use_na_sentinel: bool | 
lib.NoDefault = lib.no_default, + ) -> tuple[np.ndarray, ExtensionArray]: + resolved_na_sentinel = com.resolve_na_sentinel(na_sentinel, use_na_sentinel) + if resolved_na_sentinel is None: + raise NotImplementedError("Encoding NaN values is not yet implemented") + else: + na_sentinel = resolved_na_sentinel arr = self._data mask = self._mask diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 427bf50ca7424..c1873a1b7c18d 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -848,13 +848,20 @@ def _values_for_factorize(self): # Still override this for hash_pandas_object return np.asarray(self), self.fill_value - def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, SparseArray]: + def factorize( + self, + na_sentinel: int | lib.NoDefault = lib.no_default, + use_na_sentinel: bool | lib.NoDefault = lib.no_default, + ) -> tuple[np.ndarray, SparseArray]: # Currently, ExtensionArray.factorize -> Tuple[ndarray, EA] # The sparsity on this is backwards from what Sparse would want. Want # ExtensionArray.factorize -> Tuple[EA, EA] # Given that we have to return a dense array of codes, why bother # implementing an efficient factorize? 
- codes, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel) + # resolved_na_sentinel = com.resolve_na_sentinel(na_sentinel, use_na_sentinel) + codes, uniques = algos.factorize( + np.asarray(self), na_sentinel=na_sentinel, use_na_sentinel=use_na_sentinel + ) uniques_sp = SparseArray(uniques, dtype=self.dtype) return codes, uniques_sp diff --git a/pandas/core/base.py b/pandas/core/base.py index b4c2c81ee666f..7541eff9a11d4 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1136,8 +1136,15 @@ def _memory_usage(self, deep: bool = False) -> int: """ ), ) - def factorize(self, sort: bool = False, na_sentinel: int | None = -1): - return algorithms.factorize(self, sort=sort, na_sentinel=na_sentinel) + def factorize( + self, + sort: bool = False, + na_sentinel: int | lib.NoDefault = lib.no_default, + use_na_sentinel: bool | lib.NoDefault = lib.no_default, + ): + return algorithms.factorize( + self, sort=sort, na_sentinel=na_sentinel, use_na_sentinel=use_na_sentinel + ) _shared_docs[ "searchsorted" diff --git a/pandas/core/common.py b/pandas/core/common.py index 05eb101dabb98..13ccaa7679684 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -704,3 +704,52 @@ def deprecate_numeric_only_default(cls: type, name: str, deprecate_none: bool = ) warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) + + +def resolve_na_sentinel( + na_sentinel: int | None | lib.NoDefault, + use_na_sentinel: bool | lib.NoDefault, + warn: bool = True, +) -> int | None: + """Determine value of na_sentinel for factorize methods. + + See GH#46910 for details on the deprecation. + + Parameters + ---------- + na_sentinel : bool, int, None, or lib.no_default + Value passed to the method. + + Returns + ------- + Resolved value of na_sentinel. 
+ """ + if na_sentinel is not lib.no_default and use_na_sentinel is not lib.no_default: + raise ValueError( + "Cannot specify both na_sentinel and use_na_sentile; " + f"got na_sentinel={na_sentinel} and use_na_sentinel={use_na_sentinel}" + ) + if na_sentinel is lib.no_default: + result = -1 if use_na_sentinel is lib.no_default or use_na_sentinel else None + else: + if warn: + if na_sentinel is None: + msg = ( + "Specifying na_sentinel=None is deprecated, specify " + "use_na_sentinel=False instead." + ) + elif na_sentinel == -1: + msg = ( + "Specifying na_sentinel=-1 is` deprecated, specify " + "use_na_sentinel=True instead." + ) + else: + msg = ( + "Specifying the specific value to use for na_sentinel is " + "deprecated and will be removed in a future version of pandas. " + "Specify na_sentinel=True to use the sentinel value -1, and " + "na_sentinel=False to encode NaN values." + ) + warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) + result = na_sentinel + return result diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 3f37c5f0c6df9..e36a24fcd84c6 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -681,13 +681,9 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: codes = self.grouping_vector.codes_info uniques = self.grouping_vector.result_index._values else: - # GH35667, replace dropna=False with na_sentinel=None - if not self._dropna: - na_sentinel = None - else: - na_sentinel = -1 + # GH35667, replace dropna=False with use_na_sentinel=False codes, uniques = algorithms.factorize( - self.grouping_vector, sort=self._sort, na_sentinel=na_sentinel + self.grouping_vector, sort=self._sort, use_na_sentinel=self._dropna ) return codes, uniques diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index fdb1ee754a7e6..3f5bf8d40c7ee 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -510,8 +510,13 @@ def 
argsort(self, *args, **kwargs) -> npt.NDArray[np.intp]: return result def factorize( - self, sort: bool = False, na_sentinel: int | None = -1 + self, + sort: bool = False, + na_sentinel: int | lib.NoDefault = lib.no_default, + use_na_sentinel: bool | lib.NoDefault = lib.no_default, ) -> tuple[npt.NDArray[np.intp], RangeIndex]: + # resolve to emit warning if appropriate + _ = com.resolve_na_sentinel(na_sentinel, use_na_sentinel) codes = np.arange(len(self), dtype=np.intp) uniques = self if sort and self.step < 0: diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index b829b017d5fb1..a18a557429caf 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -213,7 +213,12 @@ def test_unique(self, data, box, method): @pytest.mark.parametrize("na_sentinel", [-1, -2]) def test_factorize(self, data_for_grouping, na_sentinel): - codes, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) + if na_sentinel == -1: + msg = "Specifying na_sentinel=-1 is deprecated" + else: + msg = "Specifying the specific value to use for na_sentinel is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + codes, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) expected_codes = np.array( [0, 0, na_sentinel, na_sentinel, 1, 1, 0, 2], dtype=np.intp ) @@ -224,8 +229,13 @@ def test_factorize(self, data_for_grouping, na_sentinel): @pytest.mark.parametrize("na_sentinel", [-1, -2]) def test_factorize_equivalence(self, data_for_grouping, na_sentinel): - codes_1, uniques_1 = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) - codes_2, uniques_2 = data_for_grouping.factorize(na_sentinel=na_sentinel) + if na_sentinel == -1: + msg = "Specifying na_sentinel=-1 is deprecated" + else: + msg = "Specifying the specific value to use for na_sentinel is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + codes_1, uniques_1 = 
pd.factorize(data_for_grouping, na_sentinel=na_sentinel) + codes_2, uniques_2 = data_for_grouping.factorize(na_sentinel=na_sentinel) tm.assert_numpy_array_equal(codes_1, codes_2) self.assert_extension_array_equal(uniques_1, uniques_2) diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index e45bffba944c0..89d074108f881 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -177,7 +177,12 @@ class TestMethods(base.BaseMethodsTests): @pytest.mark.parametrize("na_sentinel", [-1, -2]) def test_factorize(self, data_for_grouping, na_sentinel): # override because we only have 2 unique values - labels, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) + if na_sentinel == -1: + msg = "Specifying na_sentinel=-1 is deprecated" + else: + msg = "Specifying the specific value to use for na_sentinel is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + labels, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) expected_labels = np.array( [0, 0, na_sentinel, na_sentinel, 1, 1, 0], dtype=np.intp ) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 2d73b8e91e831..19703d2c622f7 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -75,11 +75,11 @@ def test_factorize(self, index_or_series_obj, sort): tm.assert_numpy_array_equal(result_codes, expected_codes) tm.assert_index_equal(result_uniques, expected_uniques, exact=True) - def test_series_factorize_na_sentinel_none(self): + def test_series_factorize_use_na_sentinel_false(self): # GH#35667 values = np.array([1, 2, 1, np.nan]) ser = Series(values) - codes, uniques = ser.factorize(na_sentinel=None) + codes, uniques = ser.factorize(use_na_sentinel=False) expected_codes = np.array([0, 1, 0, 2], dtype=np.intp) expected_uniques = Index([1.0, 2.0, np.nan]) @@ -87,6 +87,20 @@ def test_series_factorize_na_sentinel_none(self): 
tm.assert_numpy_array_equal(codes, expected_codes) tm.assert_index_equal(uniques, expected_uniques) + @pytest.mark.parametrize("na_sentinel", [None, -1, -10]) + def test_depr_na_sentinel(self, na_sentinel, index_or_series_obj): + # GH#46910 + if na_sentinel is None: + msg = "Specifying na_sentinel=None is deprecated" + elif na_sentinel == -1: + msg = "Specifying na_sentinel=-1 is deprecated" + else: + msg = "Specifying the specific value to use for na_sentinel is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + _ = pd.factorize(index_or_series_obj, na_sentinel=na_sentinel) + with tm.assert_produces_warning(FutureWarning, match=msg): + _ = index_or_series_obj.factorize(na_sentinel=na_sentinel) + def test_basic(self): codes, uniques = algos.factorize(["a", "b", "b", "a", "a", "c", "c", "c"]) @@ -418,7 +432,12 @@ def test_parametrized_factorize_na_value(self, data, na_value): ids=["numpy_array", "extension_array"], ) def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques): - codes, uniques = algos.factorize(data, sort=sort, na_sentinel=na_sentinel) + if na_sentinel == -1: + msg = "Specifying na_sentinel=-1 is deprecated" + else: + msg = "the specific value to use for na_sentinel is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + codes, uniques = algos.factorize(data, sort=sort, na_sentinel=na_sentinel) if sort: expected_codes = np.array([1, 0, na_sentinel, 1], dtype=np.intp) expected_uniques = algos.safe_sort(uniques) @@ -446,10 +465,10 @@ def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques): ), ], ) - def test_object_factorize_na_sentinel_none( + def test_object_factorize_use_na_sentinel_false( self, data, expected_codes, expected_uniques ): - codes, uniques = algos.factorize(data, na_sentinel=None) + codes, uniques = algos.factorize(data, use_na_sentinel=False) tm.assert_numpy_array_equal(uniques, expected_uniques) tm.assert_numpy_array_equal(codes, expected_codes) @@ -469,10 
+488,10 @@ def test_object_factorize_na_sentinel_none( ), ], ) - def test_int_factorize_na_sentinel_none( + def test_int_factorize_use_na_sentinel_false( self, data, expected_codes, expected_uniques ): - codes, uniques = algos.factorize(data, na_sentinel=None) + codes, uniques = algos.factorize(data, use_na_sentinel=False) tm.assert_numpy_array_equal(uniques, expected_uniques) tm.assert_numpy_array_equal(codes, expected_codes) From 05fa0caebd0337839f6dd4c0dbad6bae37cee604 Mon Sep 17 00:00:00 2001 From: richard Date: Sat, 28 May 2022 01:01:16 -0400 Subject: [PATCH 04/17] Fixups --- doc/source/whatsnew/v1.5.0.rst | 2 +- pandas/core/algorithms.py | 4 ++-- pandas/core/arrays/sparse/array.py | 1 - pandas/core/common.py | 6 +++++- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index a5beb08778da8..db4916d775e33 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -669,7 +669,7 @@ Other Deprecations - Deprecated the methods :meth:`DataFrame.mad`, :meth:`Series.mad`, and the corresponding groupby methods (:issue:`11787`) - Deprecated positional arguments to :meth:`Index.join` except for ``other``, use keyword-only arguments instead of positional arguments (:issue:`46518`) - Deprecated indexing on a timezone-naive :class:`DatetimeIndex` using a string representing a timezone-aware datetime (:issue:`46903`, :issue:`36148`) -- Deprecated the argument ``na_sentinel`` in :func:`factorize`, :meth:`Index.factorize`, and :meth:`.ExtensionArray.factorize`; use ``use_na_sentinel`` instead of an integer to use the sentinel ``-1`` for NaN values and ``False`` instead of ``None`` to encode NaN values (:issue:`46910`) +- Deprecated the argument ``na_sentinel`` in :func:`factorize`, :meth:`Index.factorize`, and :meth:`.ExtensionArray.factorize`; pass ``use_na_sentinel=True`` instead to use the sentinel ``-1`` for NaN values and ``use_na_sentinel=False`` instead of 
``na_sentinel=None`` to encode NaN values (:issue:`46910`) .. --------------------------------------------------------------------------- .. _whatsnew_150.performance: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index bb3bb8a79951c..c58471810a7e2 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -597,8 +597,8 @@ def factorize( ---------- {values}{sort} na_sentinel : int or None, default -1 - Value to mark "not found". If None, NaN values will be encoded as positive - integers and will not drop the NaN from the uniques of the values. + Value to mark "not found". If None, will not drop the NaN + from the uniques of the values. .. deprecated:: 1.5.0 Specifying the specific value to use for na_sentinel is deprecated and diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index c1873a1b7c18d..6c2cb7e84fa25 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -858,7 +858,6 @@ def factorize( # ExtensionArray.factorize -> Tuple[EA, EA] # Given that we have to return a dense array of codes, why bother # implementing an efficient factorize? - # resolved_na_sentinel = com.resolve_na_sentinel(na_sentinel, use_na_sentinel) codes, uniques = algos.factorize( np.asarray(self), na_sentinel=na_sentinel, use_na_sentinel=use_na_sentinel ) diff --git a/pandas/core/common.py b/pandas/core/common.py index 13ccaa7679684..855fb66e69670 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -717,8 +717,12 @@ def resolve_na_sentinel( Parameters ---------- - na_sentinel : bool, int, None, or lib.no_default + na_sentinel : int, None, or lib.no_default Value passed to the method. + use_na_sentinel : bool or lib.no_default + Value passed to the method. + warn : bool, default True + Whether to emit a warning if a deprecated use is detected. 
Returns ------- From f626dd81f31e8c9fcf33943208bebeef10111ed7 Mon Sep 17 00:00:00 2001 From: richard Date: Sat, 28 May 2022 01:06:33 -0400 Subject: [PATCH 05/17] Fixups --- pandas/core/algorithms.py | 2 +- pandas/core/arrays/base.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c58471810a7e2..768901841e138 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -601,7 +601,7 @@ def factorize( from the uniques of the values. .. deprecated:: 1.5.0 - Specifying the specific value to use for na_sentinel is deprecated and + The na_sentinel argument is deprecated and will be removed in a future version of pandas. Specify use_na_sentinel as either True or False. diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 3e39653f207fd..23f0f1ceec2d3 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1017,7 +1017,7 @@ def factorize( Value to use in the `codes` array to indicate missing values. .. deprecated:: 1.5.0 - Specifying the specific value to use for na_sentinel is deprecated and + The na_sentinel argument is deprecated and will be removed in a future version of pandas. Specify use_na_sentinel as either True or False. 
From a15e43a0147aa0dd704048a9d28652f28987d21f Mon Sep 17 00:00:00 2001 From: richard Date: Sat, 28 May 2022 01:10:18 -0400 Subject: [PATCH 06/17] black --- pandas/tests/extension/base/methods.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index a18a557429caf..f5a4b5c3f3939 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -234,7 +234,9 @@ def test_factorize_equivalence(self, data_for_grouping, na_sentinel): else: msg = "Specifying the specific value to use for na_sentinel is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): - codes_1, uniques_1 = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) + codes_1, uniques_1 = pd.factorize( + data_for_grouping, na_sentinel=na_sentinel + ) codes_2, uniques_2 = data_for_grouping.factorize(na_sentinel=na_sentinel) tm.assert_numpy_array_equal(codes_1, codes_2) From 9a33637822e865a7b01d2dcb2526b32a6c5866a3 Mon Sep 17 00:00:00 2001 From: richard Date: Sat, 28 May 2022 08:08:24 -0400 Subject: [PATCH 07/17] fixup --- pandas/core/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index 855fb66e69670..16e7bbd0ec456 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -744,7 +744,7 @@ def resolve_na_sentinel( ) elif na_sentinel == -1: msg = ( - "Specifying na_sentinel=-1 is` deprecated, specify " + "Specifying na_sentinel=-1 is deprecated, specify " "use_na_sentinel=True instead." 
) else: From 46e7a8d322e729142d05ad8ad8686c27cd5d0642 Mon Sep 17 00:00:00 2001 From: richard Date: Sat, 28 May 2022 09:26:23 -0400 Subject: [PATCH 08/17] docs --- pandas/core/algorithms.py | 10 +++++----- pandas/core/arrays/base.py | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 768901841e138..a9a8616f1c7b4 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -660,8 +660,8 @@ def factorize( >>> uniques array(['a', 'b', 'c'], dtype=object) - Missing values are indicated in `codes` with `na_sentinel` - (``-1`` by default). Note that missing values are never + When ``use_na_sentinel=True`` (the default), missing values are indicated in + the `codes` with the sentinel value ``-1`` and missing values are not included in `uniques`. >>> codes, uniques = pd.factorize(['b', None, 'a', 'c', 'b']) @@ -696,16 +696,16 @@ def factorize( Index(['a', 'c'], dtype='object') If NaN is in the values, and we want to include NaN in the uniques of the - values, it can be achieved by setting ``na_sentinel=None``. + values, it can be achieved by setting ``use_na_sentinel=False``. >>> values = np.array([1, 2, 1, np.nan]) - >>> codes, uniques = pd.factorize(values) # default: na_sentinel=-1 + >>> codes, uniques = pd.factorize(values) # default: use_na_sentinel=True >>> codes array([ 0, 1, 0, -1]) >>> uniques array([1., 2.]) - >>> codes, uniques = pd.factorize(values, na_sentinel=None) + >>> codes, uniques = pd.factorize(values, use_na_sentinel=False) >>> codes array([0, 1, 0, 2]) >>> uniques diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 23f0f1ceec2d3..f69fe4bc7f5b6 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1016,10 +1016,10 @@ def factorize( na_sentinel : int, default -1 Value to use in the `codes` array to indicate missing values. - .. 
deprecated:: 1.5.0 - The na_sentinel argument is deprecated and - will be removed in a future version of pandas. Specify use_na_sentinel - as either True or False. + .. deprecated:: 1.5.0 + The na_sentinel argument is deprecated and + will be removed in a future version of pandas. Specify use_na_sentinel + as either True or False. use_na_sentinel : bool, default True If True, the sentinel -1 will be used for NaN values. If False, From 0fd1ea795d64f9f1ad69965b9ff367be86ee6b21 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 31 May 2022 17:00:48 -0400 Subject: [PATCH 09/17] newline --- pandas/core/common.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index 16e7bbd0ec456..3afc3de390460 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -711,7 +711,8 @@ def resolve_na_sentinel( use_na_sentinel: bool | lib.NoDefault, warn: bool = True, ) -> int | None: - """Determine value of na_sentinel for factorize methods. + """ + Determine value of na_sentinel for factorize methods. See GH#46910 for details on the deprecation. From 465ab2be3c19f6f2d63d41fb2efc9d727585d37d Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 31 May 2022 21:14:55 -0400 Subject: [PATCH 10/17] Warn on class construction, rework pd.factorize warnings --- pandas/core/algorithms.py | 29 +++++------------ pandas/core/arrays/base.py | 17 ++++++++++ pandas/core/common.py | 40 +++++++++++------------- pandas/tests/extension/test_extension.py | 14 +++++++++ 4 files changed, 56 insertions(+), 44 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index a9a8616f1c7b4..4a5cff96395e3 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -14,7 +14,7 @@ cast, final, ) -from warnings import warn +import warnings import numpy as np @@ -722,11 +722,8 @@ def factorize( # Can't always warn here because EA's factorize will warn too; warn for each # path below. 
- passed_na_sentinel = na_sentinel - na_sentinel = com.resolve_na_sentinel(na_sentinel, use_na_sentinel, warn=False) + na_sentinel = com.resolve_na_sentinel(na_sentinel, use_na_sentinel) if isinstance(values, ABCRangeIndex): - # Emit warning if appropriate - _ = com.resolve_na_sentinel(passed_na_sentinel, use_na_sentinel) return values.factorize(sort=sort) values = _ensure_arraylike(values) @@ -745,30 +742,18 @@ def factorize( isinstance(values, (ABCDatetimeArray, ABCTimedeltaArray)) and values.freq is not None ): - # Emit warning if appropriate - _ = com.resolve_na_sentinel(passed_na_sentinel, use_na_sentinel) # The presence of 'freq' means we can fast-path sorting and know there # aren't NAs codes, uniques = values.factorize(sort=sort) return _re_wrap_factorize(original, uniques, codes) elif not isinstance(values.dtype, np.dtype): - # i.e. ExtensionDtype - if passed_na_sentinel is lib.no_default: - # User didn't specify na_sentinel; avoid warning. Note EA path always - # uses a na_sentinel value. 
- codes, uniques = values.factorize(use_na_sentinel=True) - elif passed_na_sentinel is None: - # Emit the appropriate warning message for None - _ = com.resolve_na_sentinel(passed_na_sentinel, use_na_sentinel) - codes, uniques = values.factorize(use_na_sentinel=True) - else: - # EA.factorize will warn + with warnings.catch_warnings(): + # We've already warned above + warnings.filterwarnings("ignore", ".*use_na_sentinel.*", FutureWarning) codes, uniques = values.factorize(na_sentinel=na_sentinel) else: - # Generate warning for na_sentile if appropriate - _ = com.resolve_na_sentinel(passed_na_sentinel, use_na_sentinel) values = np.asarray(values) # convert DTA/TDA/MultiIndex codes, uniques = factorize_array( values, na_sentinel=na_sentinel, size_hint=size_hint @@ -985,7 +970,7 @@ def mode( try: npresult = np.sort(npresult) except TypeError as err: - warn(f"Unable to sort modes: {err}") + warnings.warn(f"Unable to sort modes: {err}") result = _reconstruct_data(npresult, original.dtype, original) return result @@ -1599,7 +1584,7 @@ def diff(arr, n: int, axis: int = 0): raise ValueError(f"cannot diff {type(arr).__name__} on axis={axis}") return op(arr, arr.shift(n)) else: - warn( + warnings.warn( "dtype lost in 'diff()'. In the future this will raise a " "TypeError. 
Convert to a suitable dtype prior to calling 'diff'.", FutureWarning, diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index f69fe4bc7f5b6..7ce9694dbf376 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -8,6 +8,7 @@ """ from __future__ import annotations +import inspect import operator from typing import ( TYPE_CHECKING, @@ -20,6 +21,7 @@ cast, overload, ) +import warnings import numpy as np @@ -45,6 +47,7 @@ cache_readonly, deprecate_nonkeyword_arguments, ) +from pandas.util._exceptions import find_stack_level from pandas.util._validators import ( validate_bool_kwarg, validate_fillna_kwargs, @@ -457,6 +460,20 @@ def __ne__(self, other: Any) -> ArrayLike: # type: ignore[override] """ return ~(self == other) + def __init_subclass__(cls, **kwargs): + factorize = getattr(cls, "factorize") + if "use_na_sentinel" not in inspect.signature(factorize).parameters: + # See GH#46910 for details on the deprecation + name = cls.__name__ + warnings.warn( + f"The na_sentinel argument of {name}.factorize is deprecated. " + f"In the future, pandas will use the use_na_sentinel argument instead. " + f"Add this argument to {name}.factorize to be compatible with future" + f"versions of pandas and silence this warning.", + FutureWarning, + stacklevel=find_stack_level(), + ) + def to_numpy( self, dtype: npt.DTypeLike | None = None, diff --git a/pandas/core/common.py b/pandas/core/common.py index 3afc3de390460..7fd1aca19df23 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -709,7 +709,6 @@ def deprecate_numeric_only_default(cls: type, name: str, deprecate_none: bool = def resolve_na_sentinel( na_sentinel: int | None | lib.NoDefault, use_na_sentinel: bool | lib.NoDefault, - warn: bool = True, ) -> int | None: """ Determine value of na_sentinel for factorize methods. @@ -722,8 +721,6 @@ def resolve_na_sentinel( Value passed to the method. use_na_sentinel : bool or lib.no_default Value passed to the method. 
- warn : bool, default True - Whether to emit a warning if a deprecated use is detected. Returns ------- @@ -737,24 +734,23 @@ def resolve_na_sentinel( if na_sentinel is lib.no_default: result = -1 if use_na_sentinel is lib.no_default or use_na_sentinel else None else: - if warn: - if na_sentinel is None: - msg = ( - "Specifying na_sentinel=None is deprecated, specify " - "use_na_sentinel=False instead." - ) - elif na_sentinel == -1: - msg = ( - "Specifying na_sentinel=-1 is deprecated, specify " - "use_na_sentinel=True instead." - ) - else: - msg = ( - "Specifying the specific value to use for na_sentinel is " - "deprecated and will be removed in a future version of pandas. " - "Specify na_sentinel=True to use the sentinel value -1, and " - "na_sentinel=False to encode NaN values." - ) - warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) + if na_sentinel is None: + msg = ( + "Specifying na_sentinel=None is deprecated, specify " + "use_na_sentinel=False instead." + ) + elif na_sentinel == -1: + msg = ( + "Specifying na_sentinel=-1 is deprecated, specify " + "use_na_sentinel=True instead." + ) + else: + msg = ( + "Specifying the specific value to use for na_sentinel is " + "deprecated and will be removed in a future version of pandas. " + "Specify use_na_sentinel=True to use the sentinel value -1, and " + "use_na_sentinel=False to encode NaN values." 
+ ) + warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) result = na_sentinel return result diff --git a/pandas/tests/extension/test_extension.py b/pandas/tests/extension/test_extension.py index 1ed626cd51080..fcf1f439db702 100644 --- a/pandas/tests/extension/test_extension.py +++ b/pandas/tests/extension/test_extension.py @@ -4,6 +4,7 @@ import numpy as np import pytest +import pandas._testing as tm from pandas.core.arrays import ExtensionArray @@ -24,3 +25,16 @@ def test_errors(self, data, all_arithmetic_operators): op_name = all_arithmetic_operators with pytest.raises(AttributeError): getattr(data, op_name) + + +def test_depr_na_sentinel(): + # GH#46910 + msg = "The na_sentinel argument of MyEA.factorize is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + + class MyEA(ExtensionArray): + def factorize(self, na_sentinel=-1): + pass + + with tm.assert_produces_warning(None): + MyEA() From 6b4917cca3042bf71490c47912a6d6ce4d78b64b Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 31 May 2022 21:23:56 -0400 Subject: [PATCH 11/17] FutureWarning -> DeprecationWarning --- pandas/core/arrays/base.py | 2 +- pandas/tests/extension/test_extension.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 7ce9694dbf376..86645b0726eff 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -470,7 +470,7 @@ def __init_subclass__(cls, **kwargs): f"In the future, pandas will use the use_na_sentinel argument instead. 
" f"Add this argument to {name}.factorize to be compatible with future" f"versions of pandas and silence this warning.", - FutureWarning, + DeprecationWarning, stacklevel=find_stack_level(), ) diff --git a/pandas/tests/extension/test_extension.py b/pandas/tests/extension/test_extension.py index fcf1f439db702..a9b4eea2ae8f6 100644 --- a/pandas/tests/extension/test_extension.py +++ b/pandas/tests/extension/test_extension.py @@ -30,7 +30,7 @@ def test_errors(self, data, all_arithmetic_operators): def test_depr_na_sentinel(): # GH#46910 msg = "The na_sentinel argument of MyEA.factorize is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): class MyEA(ExtensionArray): def factorize(self, na_sentinel=-1): From d8e3d6b413b00a28a510694fe80c2486292563bf Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 31 May 2022 21:37:23 -0400 Subject: [PATCH 12/17] Remove old comment --- pandas/core/algorithms.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 4a5cff96395e3..106ad8d9a587a 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -720,8 +720,6 @@ def factorize( # responsible only for factorization. All data coercion, sorting and boxing # should happen here. - # Can't always warn here because EA's factorize will warn too; warn for each - # path below. 
na_sentinel = com.resolve_na_sentinel(na_sentinel, use_na_sentinel) if isinstance(values, ABCRangeIndex): return values.factorize(sort=sort) From 39b37477246b1f99f976509d0a376e2e9758f65a Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 10 Jun 2022 17:07:54 -0400 Subject: [PATCH 13/17] backticks in warnings, revert datetimelike, avoid catch_warnings --- pandas/core/algorithms.py | 16 ++++++++++++---- pandas/core/arrays/base.py | 8 ++++---- pandas/core/arrays/datetimelike.py | 8 +++----- pandas/core/common.py | 18 +++++++++--------- pandas/core/indexes/multi.py | 3 +-- pandas/tests/extension/base/methods.py | 8 ++++---- pandas/tests/extension/test_boolean.py | 4 ++-- pandas/tests/extension/test_extension.py | 2 +- pandas/tests/test_algos.py | 10 +++++----- 9 files changed, 41 insertions(+), 36 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 106ad8d9a587a..1c22f60547f6a 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -4,6 +4,7 @@ """ from __future__ import annotations +import inspect import operator from textwrap import dedent from typing import ( @@ -746,10 +747,17 @@ def factorize( return _re_wrap_factorize(original, uniques, codes) elif not isinstance(values.dtype, np.dtype): - with warnings.catch_warnings(): - # We've already warned above - warnings.filterwarnings("ignore", ".*use_na_sentinel.*", FutureWarning) - codes, uniques = values.factorize(na_sentinel=na_sentinel) + if ( + na_sentinel == -1 + and "use_na_sentinel" in inspect.signature(values.factorize).parameters + ): + # Avoid using catch_warnings when possible + codes, uniques = values.factorize(use_na_sentinel=True) + else: + with warnings.catch_warnings(): + # We've already warned above + warnings.filterwarnings("ignore", ".*use_na_sentinel.*", FutureWarning) + codes, uniques = values.factorize(na_sentinel=na_sentinel) else: values = np.asarray(values) # convert DTA/TDA/MultiIndex diff --git a/pandas/core/arrays/base.py 
b/pandas/core/arrays/base.py index 86645b0726eff..cd232d7432134 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -466,10 +466,10 @@ def __init_subclass__(cls, **kwargs): # See GH#46910 for details on the deprecation name = cls.__name__ warnings.warn( - f"The na_sentinel argument of {name}.factorize is deprecated. " - f"In the future, pandas will use the use_na_sentinel argument instead. " - f"Add this argument to {name}.factorize to be compatible with future" - f"versions of pandas and silence this warning.", + f"The `na_sentinel` argument of `{name}.factorize` is deprecated. " + f"In the future, pandas will use the `use_na_sentinel` argument " + f"instead. Add this argument to `{name}.factorize` to be compatible " + f"with future versions of pandas and silence this warning.", DeprecationWarning, stacklevel=find_stack_level(), ) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index b5fc28cb1d18f..fef84fc74c75b 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1998,10 +1998,10 @@ def _with_freq(self, freq): # -------------------------------------------------------------- + # GH#46910 - Keep old signature to test we don't break things for EA library authors def factorize( self, - na_sentinel: int | lib.NoDefault = lib.no_default, - use_na_sentinel: bool | lib.NoDefault = lib.no_default, + na_sentinel: int = -1, sort: bool = False, ): if self.freq is not None: @@ -2013,9 +2013,7 @@ def factorize( uniques = uniques[::-1] return codes, uniques # FIXME: shouldn't get here; we are ignoring sort - return super().factorize( - na_sentinel=na_sentinel, use_na_sentinel=use_na_sentinel - ) + return super().factorize(na_sentinel=na_sentinel) # ------------------------------------------------------------------- diff --git a/pandas/core/common.py b/pandas/core/common.py index f7b3c675240b6..34edb57b8fb5e 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ 
-724,28 +724,28 @@ def resolve_na_sentinel( """ if na_sentinel is not lib.no_default and use_na_sentinel is not lib.no_default: raise ValueError( - "Cannot specify both na_sentinel and use_na_sentile; " - f"got na_sentinel={na_sentinel} and use_na_sentinel={use_na_sentinel}" + "Cannot specify both `na_sentinel` and `use_na_sentile`; " + f"got `na_sentinel={na_sentinel}` and `use_na_sentinel={use_na_sentinel}`" ) if na_sentinel is lib.no_default: result = -1 if use_na_sentinel is lib.no_default or use_na_sentinel else None else: if na_sentinel is None: msg = ( - "Specifying na_sentinel=None is deprecated, specify " - "use_na_sentinel=False instead." + "Specifying `na_sentinel=None` is deprecated, specify " + "`use_na_sentinel=False` instead." ) elif na_sentinel == -1: msg = ( - "Specifying na_sentinel=-1 is deprecated, specify " - "use_na_sentinel=True instead." + "Specifying `na_sentinel=-1` is deprecated, specify " + "`use_na_sentinel=True` instead." ) else: msg = ( - "Specifying the specific value to use for na_sentinel is " + "Specifying the specific value to use for `na_sentinel` is " "deprecated and will be removed in a future version of pandas. " - "Specify use_na_sentinel=True to use the sentinel value -1, and " - "use_na_sentinel=False to encode NaN values." + "Specify `use_na_sentinel=True` to use the sentinel value -1, and " + "`use_na_sentinel=False` to encode NaN values." 
) warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) result = na_sentinel diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 351cae6816ace..858c213566ad6 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1517,8 +1517,7 @@ def _get_grouper_for_level( return grouper, None, None values = self.get_level_values(level) - na_sentinel = -1 if dropna else None - codes, uniques = algos.factorize(values, sort=True, na_sentinel=na_sentinel) + codes, uniques = algos.factorize(values, sort=True, use_na_sentinel=dropna) assert isinstance(uniques, Index) if self.levels[level]._can_hold_na: diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index f5a4b5c3f3939..6e9130b18e94f 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -214,9 +214,9 @@ def test_unique(self, data, box, method): @pytest.mark.parametrize("na_sentinel", [-1, -2]) def test_factorize(self, data_for_grouping, na_sentinel): if na_sentinel == -1: - msg = "Specifying na_sentinel=-1 is deprecated" + msg = "Specifying `na_sentinel=-1` is deprecated" else: - msg = "Specifying the specific value to use for na_sentinel is deprecated" + msg = "Specifying the specific value to use for `na_sentinel` is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): codes, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) expected_codes = np.array( @@ -230,9 +230,9 @@ def test_factorize(self, data_for_grouping, na_sentinel): @pytest.mark.parametrize("na_sentinel", [-1, -2]) def test_factorize_equivalence(self, data_for_grouping, na_sentinel): if na_sentinel == -1: - msg = "Specifying na_sentinel=-1 is deprecated" + msg = "Specifying `na_sentinel=-1` is deprecated" else: - msg = "Specifying the specific value to use for na_sentinel is deprecated" + msg = "Specifying the specific value to use for `na_sentinel` is deprecated" with 
tm.assert_produces_warning(FutureWarning, match=msg): codes_1, uniques_1 = pd.factorize( data_for_grouping, na_sentinel=na_sentinel diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index 89d074108f881..dd067102aba6c 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -178,9 +178,9 @@ class TestMethods(base.BaseMethodsTests): def test_factorize(self, data_for_grouping, na_sentinel): # override because we only have 2 unique values if na_sentinel == -1: - msg = "Specifying na_sentinel=-1 is deprecated" + msg = "Specifying `na_sentinel=-1` is deprecated" else: - msg = "Specifying the specific value to use for na_sentinel is deprecated" + msg = "Specifying the specific value to use for `na_sentinel` is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): labels, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) expected_labels = np.array( diff --git a/pandas/tests/extension/test_extension.py b/pandas/tests/extension/test_extension.py index a9b4eea2ae8f6..a4b1a4b43ef2b 100644 --- a/pandas/tests/extension/test_extension.py +++ b/pandas/tests/extension/test_extension.py @@ -29,7 +29,7 @@ def test_errors(self, data, all_arithmetic_operators): def test_depr_na_sentinel(): # GH#46910 - msg = "The na_sentinel argument of MyEA.factorize is deprecated" + msg = "The `na_sentinel` argument of `MyEA.factorize` is deprecated" with tm.assert_produces_warning(DeprecationWarning, match=msg): class MyEA(ExtensionArray): diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index c7e0c8cb42d33..04b8c370cbe4a 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -91,11 +91,11 @@ def test_series_factorize_use_na_sentinel_false(self): def test_depr_na_sentinel(self, na_sentinel, index_or_series_obj): # GH#46910 if na_sentinel is None: - msg = "Specifying na_sentinel=None is deprecated" + msg = "Specifying `na_sentinel=None` is 
deprecated" elif na_sentinel == -1: - msg = "Specifying na_sentinel=-1 is deprecated" + msg = "Specifying `na_sentinel=-1` is deprecated" else: - msg = "Specifying the specific value to use for na_sentinel is deprecated" + msg = "Specifying the specific value to use for `na_sentinel` is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): _ = pd.factorize(index_or_series_obj, na_sentinel=na_sentinel) with tm.assert_produces_warning(FutureWarning, match=msg): @@ -433,9 +433,9 @@ def test_parametrized_factorize_na_value(self, data, na_value): ) def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques): if na_sentinel == -1: - msg = "Specifying na_sentinel=-1 is deprecated" + msg = "Specifying `na_sentinel=-1` is deprecated" else: - msg = "the specific value to use for na_sentinel is deprecated" + msg = "the specific value to use for `na_sentinel` is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): codes, uniques = algos.factorize(data, sort=sort, na_sentinel=na_sentinel) if sort: From 58420531b52c8d5284c5e63544e968f7f6f77db5 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 10 Jun 2022 17:31:52 -0400 Subject: [PATCH 14/17] fixup for warnings --- pandas/core/arrays/base.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index cd232d7432134..a2606e53fbe39 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -462,7 +462,11 @@ def __ne__(self, other: Any) -> ArrayLike: # type: ignore[override] def __init_subclass__(cls, **kwargs): factorize = getattr(cls, "factorize") - if "use_na_sentinel" not in inspect.signature(factorize).parameters: + if ( + "use_na_sentinel" not in inspect.signature(factorize).parameters + # TimelikeOps uses old factorize args to ensure we don't break things + and cls.__name__ not in ("TimelikeOps", "DatetimeArray", "TimedeltaArray") + ): # See GH#46910 for details on the deprecation 
name = cls.__name__ warnings.warn( From 945bb04faf583aaf77e8e3e76945715b1fd20586 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 11 Jun 2022 09:01:00 -0400 Subject: [PATCH 15/17] mypy fixups --- pandas/core/algorithms.py | 5 ++++- pandas/core/arrays/datetimelike.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 1c22f60547f6a..ec5a18fa9d5db 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -752,7 +752,10 @@ def factorize( and "use_na_sentinel" in inspect.signature(values.factorize).parameters ): # Avoid using catch_warnings when possible - codes, uniques = values.factorize(use_na_sentinel=True) + # GH#46910 - TimelikeOps has deprecated signature + codes, uniques = values.factorize( # type: ignore[call-arg] + use_na_sentinel=True + ) else: with warnings.catch_warnings(): # We've already warned above diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index fef84fc74c75b..2c099c253f80f 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1999,7 +1999,7 @@ def _with_freq(self, freq): # -------------------------------------------------------------- # GH#46910 - Keep old signature to test we don't break things for EA library authors - def factorize( + def factorize( # type:ignore[override] self, na_sentinel: int = -1, sort: bool = False, From 5524d539ca84c23ee3f2bc32cb5090557d9a64ef Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 12 Jun 2022 08:26:11 -0400 Subject: [PATCH 16/17] Move resolve_na_sentinel --- pandas/core/algorithms.py | 53 +++++++++++++++++++++++++++++-- pandas/core/arrays/arrow/array.py | 4 +-- pandas/core/arrays/base.py | 4 +-- pandas/core/arrays/masked.py | 3 +- pandas/core/common.py | 50 ----------------------------- pandas/core/indexes/range.py | 3 +- 6 files changed, 58 insertions(+), 59 deletions(-) diff --git a/pandas/core/algorithms.py 
b/pandas/core/algorithms.py index ec5a18fa9d5db..1aef8e3d369bb 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -81,7 +81,6 @@ na_value_for_dtype, ) -from pandas.core import common as com from pandas.core.array_algos.take import take_nd from pandas.core.construction import ( array as pd_array, @@ -721,7 +720,7 @@ def factorize( # responsible only for factorization. All data coercion, sorting and boxing # should happen here. - na_sentinel = com.resolve_na_sentinel(na_sentinel, use_na_sentinel) + na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel) if isinstance(values, ABCRangeIndex): return values.factorize(sort=sort) @@ -786,6 +785,56 @@ def factorize( return _re_wrap_factorize(original, uniques, codes) +def resolve_na_sentinel( + na_sentinel: int | None | lib.NoDefault, + use_na_sentinel: bool | lib.NoDefault, +) -> int | None: + """ + Determine value of na_sentinel for factorize methods. + + See GH#46910 for details on the deprecation. + + Parameters + ---------- + na_sentinel : int, None, or lib.no_default + Value passed to the method. + use_na_sentinel : bool or lib.no_default + Value passed to the method. + + Returns + ------- + Resolved value of na_sentinel. + """ + if na_sentinel is not lib.no_default and use_na_sentinel is not lib.no_default: + raise ValueError( + "Cannot specify both `na_sentinel` and `use_na_sentile`; " + f"got `na_sentinel={na_sentinel}` and `use_na_sentinel={use_na_sentinel}`" + ) + if na_sentinel is lib.no_default: + result = -1 if use_na_sentinel is lib.no_default or use_na_sentinel else None + else: + if na_sentinel is None: + msg = ( + "Specifying `na_sentinel=None` is deprecated, specify " + "`use_na_sentinel=False` instead." + ) + elif na_sentinel == -1: + msg = ( + "Specifying `na_sentinel=-1` is deprecated, specify " + "`use_na_sentinel=True` instead." 
+ ) + else: + msg = ( + "Specifying the specific value to use for `na_sentinel` is " + "deprecated and will be removed in a future version of pandas. " + "Specify `use_na_sentinel=True` to use the sentinel value -1, and " + "`use_na_sentinel=False` to encode NaN values." + ) + warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) + result = na_sentinel + return result + + def _re_wrap_factorize(original, uniques, codes: np.ndarray): """ Wrap factorize results in Series or Index depending on original type. diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index d6f8ce0878339..15fa3667064e6 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -32,7 +32,7 @@ ) from pandas.core.dtypes.missing import isna -from pandas.core import common as com +from pandas.core.algorithms import resolve_na_sentinel from pandas.core.arrays.base import ExtensionArray from pandas.core.indexers import ( check_array_indexer, @@ -255,7 +255,7 @@ def factorize( na_sentinel: int | lib.NoDefault = lib.no_default, use_na_sentinel: bool | lib.NoDefault = lib.no_default, ) -> tuple[np.ndarray, ExtensionArray]: - resolved_na_sentinel = com.resolve_na_sentinel(na_sentinel, use_na_sentinel) + resolved_na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel) if resolved_na_sentinel is None: raise NotImplementedError("Encoding NaN values is not yet implemented") else: diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index a2606e53fbe39..4274e6e5a911c 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -71,7 +71,6 @@ from pandas.core import ( arraylike, - common as com, missing, roperator, ) @@ -80,6 +79,7 @@ isin, mode, rank, + resolve_na_sentinel, unique, ) from pandas.core.array_algos.quantile import quantile_with_mask @@ -1079,7 +1079,7 @@ def factorize( # original ExtensionArray. # 2. ExtensionArray.factorize. # Complete control over factorization. 
- resolved_na_sentinel = com.resolve_na_sentinel(na_sentinel, use_na_sentinel) + resolved_na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel) if resolved_na_sentinel is None: raise NotImplementedError("Encoding NaN values is not yet implemented") else: diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 1ba4d7a451d89..4cedc4160e7af 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -59,7 +59,6 @@ from pandas.core import ( algorithms as algos, arraylike, - common as com, missing, nanops, ops, @@ -875,7 +874,7 @@ def factorize( na_sentinel: int | lib.NoDefault = lib.no_default, use_na_sentinel: bool | lib.NoDefault = lib.no_default, ) -> tuple[np.ndarray, ExtensionArray]: - resolved_na_sentinel = com.resolve_na_sentinel(na_sentinel, use_na_sentinel) + resolved_na_sentinel = algos.resolve_na_sentinel(na_sentinel, use_na_sentinel) if resolved_na_sentinel is None: raise NotImplementedError("Encoding NaN values is not yet implemented") else: diff --git a/pandas/core/common.py b/pandas/core/common.py index 34edb57b8fb5e..7225b26a910dd 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -700,53 +700,3 @@ def deprecate_numeric_only_default(cls: type, name: str, deprecate_none: bool = ) warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) - - -def resolve_na_sentinel( - na_sentinel: int | None | lib.NoDefault, - use_na_sentinel: bool | lib.NoDefault, -) -> int | None: - """ - Determine value of na_sentinel for factorize methods. - - See GH#46910 for details on the deprecation. - - Parameters - ---------- - na_sentinel : int, None, or lib.no_default - Value passed to the method. - use_na_sentinel : bool or lib.no_default - Value passed to the method. - - Returns - ------- - Resolved value of na_sentinel. 
- """ - if na_sentinel is not lib.no_default and use_na_sentinel is not lib.no_default: - raise ValueError( - "Cannot specify both `na_sentinel` and `use_na_sentile`; " - f"got `na_sentinel={na_sentinel}` and `use_na_sentinel={use_na_sentinel}`" - ) - if na_sentinel is lib.no_default: - result = -1 if use_na_sentinel is lib.no_default or use_na_sentinel else None - else: - if na_sentinel is None: - msg = ( - "Specifying `na_sentinel=None` is deprecated, specify " - "`use_na_sentinel=False` instead." - ) - elif na_sentinel == -1: - msg = ( - "Specifying `na_sentinel=-1` is deprecated, specify " - "`use_na_sentinel=True` instead." - ) - else: - msg = ( - "Specifying the specific value to use for `na_sentinel` is " - "deprecated and will be removed in a future version of pandas. " - "Specify `use_na_sentinel=True` to use the sentinel value -1, and " - "`use_na_sentinel=False` to encode NaN values." - ) - warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) - result = na_sentinel - return result diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 3f5bf8d40c7ee..5c05a63ea3e58 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -43,6 +43,7 @@ from pandas.core.dtypes.generic import ABCTimedeltaIndex from pandas.core import ops +from pandas.core.algorithms import resolve_na_sentinel import pandas.core.common as com from pandas.core.construction import extract_array import pandas.core.indexes.base as ibase @@ -516,7 +517,7 @@ def factorize( use_na_sentinel: bool | lib.NoDefault = lib.no_default, ) -> tuple[npt.NDArray[np.intp], RangeIndex]: # resolve to emit warning if appropriate - _ = com.resolve_na_sentinel(na_sentinel, use_na_sentinel) + _ = resolve_na_sentinel(na_sentinel, use_na_sentinel) codes = np.arange(len(self), dtype=np.intp) uniques = self if sort and self.step < 0: From dc97a7bb3329f36ced64170370d1e804347c2e16 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 24 Jun 2022 14:31:58 -0400 
Subject: [PATCH 17/17] Remove underscores --- pandas/core/indexes/range.py | 2 +- pandas/tests/test_algos.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 5c05a63ea3e58..5b384fbc97c1a 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -517,7 +517,7 @@ def factorize( use_na_sentinel: bool | lib.NoDefault = lib.no_default, ) -> tuple[npt.NDArray[np.intp], RangeIndex]: # resolve to emit warning if appropriate - _ = resolve_na_sentinel(na_sentinel, use_na_sentinel) + resolve_na_sentinel(na_sentinel, use_na_sentinel) codes = np.arange(len(self), dtype=np.intp) uniques = self if sort and self.step < 0: diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 04b8c370cbe4a..357b6ea8a4b64 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -97,9 +97,9 @@ def test_depr_na_sentinel(self, na_sentinel, index_or_series_obj): else: msg = "Specifying the specific value to use for `na_sentinel` is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): - _ = pd.factorize(index_or_series_obj, na_sentinel=na_sentinel) + pd.factorize(index_or_series_obj, na_sentinel=na_sentinel) with tm.assert_produces_warning(FutureWarning, match=msg): - _ = index_or_series_obj.factorize(na_sentinel=na_sentinel) + index_or_series_obj.factorize(na_sentinel=na_sentinel) def test_basic(self):