DEPR: na_sentinel in factorize (#47157)

* DEPR: na_sentinel in factorize * WIP * DEPR: na_sentinel in factorize * Fixups * Fixups * black * fixup * docs * newline * Warn on class construction, rework pd.factorize warnings * FutureWarning -> DeprecationWarning * Remove old comment * backticks in warnings, revert datetimelike, avoid catch_warnings * fixup for warnings * mypy fixups * Move resolve_na_sentinel * Remove underscores Co-authored-by: Jeff Reback <[email protected]>
pandas-dev · Jun 24, 2022 · d580826 · d580826
1 parent 6786ab2
commit d580826
Show file tree

Hide file tree

Showing 15 changed files with 252 additions and 41 deletions.
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -725,8 +725,9 @@ Other Deprecations
 - Deprecated the ``closed`` argument in :class:`ArrowInterval` in favor of ``inclusive`` argument; In a future version passing ``closed`` will raise (:issue:`40245`)
 - Deprecated allowing ``unit="M"`` or ``unit="Y"`` in :class:`Timestamp` constructor with a non-round float value (:issue:`47267`)
 - Deprecated the ``display.column_space`` global configuration option (:issue:`7576`)
+- Deprecated the argument ``na_sentinel`` in :func:`factorize`, :meth:`Index.factorize`, and :meth:`.ExtensionArray.factorize`; pass ``use_na_sentinel=True`` to use the sentinel ``-1`` for NaN values, and pass ``use_na_sentinel=False`` instead of ``na_sentinel=None`` to encode NaN values (:issue:`46910`)
 - Deprecated :meth:`DataFrameGroupBy.transform` not aligning the result when the UDF returned DataFrame (:issue:`45648`)
--
+
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_150.performance:

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -4,6 +4,7 @@
 """
 from __future__ import annotations
 
+import inspect
 import operator
 from textwrap import dedent
 from typing import (
@@ -14,7 +15,7 @@
     cast,
     final,
 )
-from warnings import warn
+import warnings
 
 import numpy as np
 
@@ -586,7 +587,8 @@ def factorize_array(
 def factorize(
     values,
     sort: bool = False,
-    na_sentinel: int | None = -1,
+    na_sentinel: int | None | lib.NoDefault = lib.no_default,
+    use_na_sentinel: bool | lib.NoDefault = lib.no_default,
     size_hint: int | None = None,
 ) -> tuple[np.ndarray, np.ndarray | Index]:
     """
@@ -604,7 +606,19 @@ def factorize(
         Value to mark "not found". If None, will not drop the NaN
         from the uniques of the values.
 
+        .. deprecated:: 1.5.0
+            The na_sentinel argument is deprecated and
+            will be removed in a future version of pandas. Specify use_na_sentinel as
+            either True or False.
+
         .. versionchanged:: 1.1.2
+
+    use_na_sentinel : bool, default True
+        If True, the sentinel -1 will be used for NaN values. If False,
+        NaN values will be encoded as non-negative integers and NaN will not be
+        dropped from the uniques of the values.
+
+        .. versionadded:: 1.5.0
     {size_hint}\
 
     Returns
@@ -652,8 +666,8 @@ def factorize(
     >>> uniques
     array(['a', 'b', 'c'], dtype=object)
 
-    Missing values are indicated in `codes` with `na_sentinel`
-    (``-1`` by default). Note that missing values are never
+    When ``use_na_sentinel=True`` (the default), missing values are indicated in
+    the `codes` with the sentinel value ``-1`` and missing values are not
     included in `uniques`.
 
     >>> codes, uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
@@ -688,16 +702,16 @@ def factorize(
     Index(['a', 'c'], dtype='object')
 
     If NaN is in the values, and we want to include NaN in the uniques of the
-    values, it can be achieved by setting ``na_sentinel=None``.
+    values, it can be achieved by setting ``use_na_sentinel=False``.
 
     >>> values = np.array([1, 2, 1, np.nan])
-    >>> codes, uniques = pd.factorize(values)  # default: na_sentinel=-1
+    >>> codes, uniques = pd.factorize(values)  # default: use_na_sentinel=True
     >>> codes
     array([ 0,  1,  0, -1])
     >>> uniques
     array([1., 2.])
 
-    >>> codes, uniques = pd.factorize(values, na_sentinel=None)
+    >>> codes, uniques = pd.factorize(values, use_na_sentinel=False)
     >>> codes
     array([0, 1, 0, 2])
     >>> uniques
@@ -712,6 +726,7 @@ def factorize(
     # responsible only for factorization. All data coercion, sorting and boxing
     # should happen here.
 
+    na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel)
     if isinstance(values, ABCRangeIndex):
         return values.factorize(sort=sort)
 
@@ -736,9 +751,22 @@ def factorize(
         codes, uniques = values.factorize(sort=sort)
         return _re_wrap_factorize(original, uniques, codes)
 
-    if not isinstance(values.dtype, np.dtype):
-        # i.e. ExtensionDtype
-        codes, uniques = values.factorize(na_sentinel=na_sentinel)
+    elif not isinstance(values.dtype, np.dtype):
+        if (
+            na_sentinel == -1
+            and "use_na_sentinel" in inspect.signature(values.factorize).parameters
+        ):
+            # Avoid using catch_warnings when possible
+            # GH#46910 - TimelikeOps has deprecated signature
+            codes, uniques = values.factorize(  # type: ignore[call-arg]
+                use_na_sentinel=True
+            )
+        else:
+            with warnings.catch_warnings():
+                # We've already warned above
+                warnings.filterwarnings("ignore", ".*use_na_sentinel.*", FutureWarning)
+                codes, uniques = values.factorize(na_sentinel=na_sentinel)
+
     else:
         values = np.asarray(values)  # convert DTA/TDA/MultiIndex
         codes, uniques = factorize_array(
@@ -763,6 +791,56 @@ def factorize(
     return _re_wrap_factorize(original, uniques, codes)
 
 
+def resolve_na_sentinel(
+    na_sentinel: int | None | lib.NoDefault,
+    use_na_sentinel: bool | lib.NoDefault,
+) -> int | None:
+    """
+    Determine value of na_sentinel for factorize methods.
+
+    See GH#46910 for details on the deprecation.
+
+    Parameters
+    ----------
+    na_sentinel : int, None, or lib.no_default
+        Value passed to the method.
+    use_na_sentinel : bool or lib.no_default
+        Value passed to the method.
+
+    Returns
+    -------
+    Resolved value of na_sentinel.
+    """
+    if na_sentinel is not lib.no_default and use_na_sentinel is not lib.no_default:
+        raise ValueError(
+            "Cannot specify both `na_sentinel` and `use_na_sentinel`; "
+            f"got `na_sentinel={na_sentinel}` and `use_na_sentinel={use_na_sentinel}`"
+        )
+    if na_sentinel is lib.no_default:
+        result = -1 if use_na_sentinel is lib.no_default or use_na_sentinel else None
+    else:
+        if na_sentinel is None:
+            msg = (
+                "Specifying `na_sentinel=None` is deprecated, specify "
+                "`use_na_sentinel=False` instead."
+            )
+        elif na_sentinel == -1:
+            msg = (
+                "Specifying `na_sentinel=-1` is deprecated, specify "
+                "`use_na_sentinel=True` instead."
+            )
+        else:
+            msg = (
+                "Specifying the specific value to use for `na_sentinel` is "
+                "deprecated and will be removed in a future version of pandas. "
+                "Specify `use_na_sentinel=True` to use the sentinel value -1, and "
+                "`use_na_sentinel=False` to encode NaN values."
+            )
+        warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())
+        result = na_sentinel
+    return result
+
+
 def _re_wrap_factorize(original, uniques, codes: np.ndarray):
     """
     Wrap factorize results in Series or Index depending on original type.
@@ -956,7 +1034,7 @@ def mode(
     try:
         npresult = np.sort(npresult)
     except TypeError as err:
-        warn(f"Unable to sort modes: {err}")
+        warnings.warn(f"Unable to sort modes: {err}")
 
     result = _reconstruct_data(npresult, original.dtype, original)
     return result
@@ -1576,7 +1654,7 @@ def diff(arr, n: int, axis: int = 0):
                 raise ValueError(f"cannot diff {type(arr).__name__} on axis={axis}")
             return op(arr, arr.shift(n))
         else:
-            warn(
+            warnings.warn(
                 "dtype lost in 'diff()'. In the future this will raise a "
                 "TypeError. Convert to a suitable dtype prior to calling 'diff'.",
                 FutureWarning,

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -8,6 +8,7 @@
 
 import numpy as np
 
+from pandas._libs import lib
 from pandas._typing import (
     Dtype,
     PositionalIndexer,
@@ -31,6 +32,7 @@
 )
 from pandas.core.dtypes.missing import isna
 
+from pandas.core.algorithms import resolve_na_sentinel
 from pandas.core.arraylike import OpsMixin
 from pandas.core.arrays.base import ExtensionArray
 from pandas.core.indexers import (
@@ -286,7 +288,16 @@ def dropna(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
             return type(self)(pc.drop_null(self._data))
 
     @doc(ExtensionArray.factorize)
-    def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
+    def factorize(
+        self,
+        na_sentinel: int | lib.NoDefault = lib.no_default,
+        use_na_sentinel: bool | lib.NoDefault = lib.no_default,
+    ) -> tuple[np.ndarray, ExtensionArray]:
+        resolved_na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel)
+        if resolved_na_sentinel is None:
+            raise NotImplementedError("Encoding NaN values is not yet implemented")
+        else:
+            na_sentinel = resolved_na_sentinel
         encoded = self._data.dictionary_encode()
         indices = pa.chunked_array(
             [c.indices for c in encoded.chunks], type=encoded.type.index_type

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -8,6 +8,7 @@
 """
 from __future__ import annotations
 
+import inspect
 import operator
 from typing import (
     TYPE_CHECKING,
@@ -20,6 +21,7 @@
     cast,
     overload,
 )
+import warnings
 
 import numpy as np
 
@@ -45,6 +47,7 @@
     cache_readonly,
     deprecate_nonkeyword_arguments,
 )
+from pandas.util._exceptions import find_stack_level
 from pandas.util._validators import (
     validate_bool_kwarg,
     validate_fillna_kwargs,
@@ -76,6 +79,7 @@
     isin,
     mode,
     rank,
+    resolve_na_sentinel,
     unique,
 )
 from pandas.core.array_algos.quantile import quantile_with_mask
@@ -456,6 +460,24 @@ def __ne__(self, other: Any) -> ArrayLike:  # type: ignore[override]
         """
         return ~(self == other)
 
+    def __init_subclass__(cls, **kwargs):
+        factorize = getattr(cls, "factorize")
+        if (
+            "use_na_sentinel" not in inspect.signature(factorize).parameters
+            # TimelikeOps uses old factorize args to ensure we don't break things
+            and cls.__name__ not in ("TimelikeOps", "DatetimeArray", "TimedeltaArray")
+        ):
+            # See GH#46910 for details on the deprecation
+            name = cls.__name__
+            warnings.warn(
+                f"The `na_sentinel` argument of `{name}.factorize` is deprecated. "
+                f"In the future, pandas will use the `use_na_sentinel` argument "
+                f"instead.  Add this argument to `{name}.factorize` to be compatible "
+                f"with future versions of pandas and silence this warning.",
+                DeprecationWarning,
+                stacklevel=find_stack_level(),
+            )
+
     def to_numpy(
         self,
         dtype: npt.DTypeLike | None = None,
@@ -1002,7 +1024,11 @@ def _values_for_factorize(self) -> tuple[np.ndarray, Any]:
         """
         return self.astype(object), np.nan
 
-    def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
+    def factorize(
+        self,
+        na_sentinel: int | lib.NoDefault = lib.no_default,
+        use_na_sentinel: bool | lib.NoDefault = lib.no_default,
+    ) -> tuple[np.ndarray, ExtensionArray]:
         """
         Encode the extension array as an enumerated type.
 
@@ -1011,6 +1037,18 @@ def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
         na_sentinel : int, default -1
             Value to use in the `codes` array to indicate missing values.
 
+            .. deprecated:: 1.5.0
+                The na_sentinel argument is deprecated and
+                will be removed in a future version of pandas. Specify use_na_sentinel
+                as either True or False.
+
+        use_na_sentinel : bool, default True
+            If True, the sentinel -1 will be used for NaN values. If False,
+            NaN values will be encoded as non-negative integers and NaN will not be
+            dropped from the uniques of the values.
+
+            .. versionadded:: 1.5.0
+
         Returns
         -------
         codes : ndarray
@@ -1041,6 +1079,11 @@ def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
         #    original ExtensionArray.
         # 2. ExtensionArray.factorize.
         #    Complete control over factorization.
+        resolved_na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel)
+        if resolved_na_sentinel is None:
+            raise NotImplementedError("Encoding NaN values is not yet implemented")
+        else:
+            na_sentinel = resolved_na_sentinel
         arr, na_value = self._values_for_factorize()
 
         codes, uniques = factorize_array(

diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
@@ -1996,7 +1996,12 @@ def _with_freq(self, freq):
 
     # --------------------------------------------------------------
 
-    def factorize(self, na_sentinel=-1, sort: bool = False):
+    # GH#46910 - Keep old signature to test we don't break things for EA library authors
+    def factorize(  # type:ignore[override]
+        self,
+        na_sentinel: int = -1,
+        sort: bool = False,
+    ):
         if self.freq is not None:
             # We must be unique, so can short-circuit (and retain freq)
             codes = np.arange(len(self), dtype=np.intp)

diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
@@ -869,7 +869,16 @@ def searchsorted(
         return self._data.searchsorted(value, side=side, sorter=sorter)
 
     @doc(ExtensionArray.factorize)
-    def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
+    def factorize(
+        self,
+        na_sentinel: int | lib.NoDefault = lib.no_default,
+        use_na_sentinel: bool | lib.NoDefault = lib.no_default,
+    ) -> tuple[np.ndarray, ExtensionArray]:
+        resolved_na_sentinel = algos.resolve_na_sentinel(na_sentinel, use_na_sentinel)
+        if resolved_na_sentinel is None:
+            raise NotImplementedError("Encoding NaN values is not yet implemented")
+        else:
+            na_sentinel = resolved_na_sentinel
         arr = self._data
         mask = self._mask
 

diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
@@ -848,13 +848,19 @@ def _values_for_factorize(self):
         # Still override this for hash_pandas_object
         return np.asarray(self), self.fill_value
 
-    def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, SparseArray]:
+    def factorize(
+        self,
+        na_sentinel: int | lib.NoDefault = lib.no_default,
+        use_na_sentinel: bool | lib.NoDefault = lib.no_default,
+    ) -> tuple[np.ndarray, SparseArray]:
         # Currently, ExtensionArray.factorize -> Tuple[ndarray, EA]
         # The sparsity on this is backwards from what Sparse would want. Want
         # ExtensionArray.factorize -> Tuple[EA, EA]
         # Given that we have to return a dense array of codes, why bother
         # implementing an efficient factorize?
-        codes, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel)
+        codes, uniques = algos.factorize(
+            np.asarray(self), na_sentinel=na_sentinel, use_na_sentinel=use_na_sentinel
+        )
         uniques_sp = SparseArray(uniques, dtype=self.dtype)
         return codes, uniques_sp