Skip to content

Commit

Permalink
DEPR: na_sentinel in factorize (#47157)
Browse files Browse the repository at this point in the history
* DEPR: na_sentinel in factorize

* WIP

* DEPR: na_sentinel in factorize

* Fixups

* Fixups

* black

* fixup

* docs

* newline

* Warn on class construction, rework pd.factorize warnings

* FutureWarning -> DeprecationWarning

* Remove old comment

* backticks in warnings, revert datetimelike, avoid catch_warnings

* fixup for warnings

* mypy fixups

* Move resolve_na_sentinel

* Remove underscores

Co-authored-by: Jeff Reback <[email protected]>
  • Loading branch information
rhshadrach and jreback authored Jun 24, 2022
1 parent 6786ab2 commit d580826
Show file tree
Hide file tree
Showing 15 changed files with 252 additions and 41 deletions.
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -725,8 +725,9 @@ Other Deprecations
- Deprecated the ``closed`` argument in :class:`ArrowInterval` in favor of ``inclusive`` argument; In a future version passing ``closed`` will raise (:issue:`40245`)
- Deprecated allowing ``unit="M"`` or ``unit="Y"`` in :class:`Timestamp` constructor with a non-round float value (:issue:`47267`)
- Deprecated the ``display.column_space`` global configuration option (:issue:`7576`)
- Deprecated the argument ``na_sentinel`` in :func:`factorize`, :meth:`Index.factorize`, and :meth:`.ExtensionArray.factorize`; pass ``use_na_sentinel=True`` to use the sentinel ``-1`` for NaN values, or pass ``use_na_sentinel=False`` (instead of ``na_sentinel=None``) to encode NaN values (:issue:`46910`)
- Deprecated :meth:`DataFrameGroupBy.transform` not aligning the result when the UDF returned DataFrame (:issue:`45648`)
-


.. ---------------------------------------------------------------------------
.. _whatsnew_150.performance:
Expand Down
102 changes: 90 additions & 12 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"""
from __future__ import annotations

import inspect
import operator
from textwrap import dedent
from typing import (
Expand All @@ -14,7 +15,7 @@
cast,
final,
)
from warnings import warn
import warnings

import numpy as np

Expand Down Expand Up @@ -586,7 +587,8 @@ def factorize_array(
def factorize(
values,
sort: bool = False,
na_sentinel: int | None = -1,
na_sentinel: int | None | lib.NoDefault = lib.no_default,
use_na_sentinel: bool | lib.NoDefault = lib.no_default,
size_hint: int | None = None,
) -> tuple[np.ndarray, np.ndarray | Index]:
"""
Expand All @@ -604,7 +606,19 @@ def factorize(
Value to mark "not found". If None, will not drop the NaN
from the uniques of the values.
.. deprecated:: 1.5.0
The na_sentinel argument is deprecated and
will be removed in a future version of pandas. Specify use_na_sentinel as
either True or False.
.. versionchanged:: 1.1.2
use_na_sentinel : bool, default True
If True, the sentinel -1 will be used for NaN values. If False,
NaN values will be encoded as non-negative integers and will not drop the
NaN from the uniques of the values.
.. versionadded:: 1.5.0
{size_hint}\
Returns
Expand Down Expand Up @@ -652,8 +666,8 @@ def factorize(
>>> uniques
array(['a', 'b', 'c'], dtype=object)
Missing values are indicated in `codes` with `na_sentinel`
(``-1`` by default). Note that missing values are never
When ``use_na_sentinel=True`` (the default), missing values are indicated in
the `codes` with the sentinel value ``-1`` and missing values are not
included in `uniques`.
>>> codes, uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
Expand Down Expand Up @@ -688,16 +702,16 @@ def factorize(
Index(['a', 'c'], dtype='object')
If NaN is in the values, and we want to include NaN in the uniques of the
values, it can be achieved by setting ``na_sentinel=None``.
values, it can be achieved by setting ``use_na_sentinel=False``.
>>> values = np.array([1, 2, 1, np.nan])
>>> codes, uniques = pd.factorize(values) # default: na_sentinel=-1
>>> codes, uniques = pd.factorize(values) # default: use_na_sentinel=True
>>> codes
array([ 0, 1, 0, -1])
>>> uniques
array([1., 2.])
>>> codes, uniques = pd.factorize(values, na_sentinel=None)
>>> codes, uniques = pd.factorize(values, use_na_sentinel=False)
>>> codes
array([0, 1, 0, 2])
>>> uniques
Expand All @@ -712,6 +726,7 @@ def factorize(
# responsible only for factorization. All data coercion, sorting and boxing
# should happen here.

na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel)
if isinstance(values, ABCRangeIndex):
return values.factorize(sort=sort)

Expand All @@ -736,9 +751,22 @@ def factorize(
codes, uniques = values.factorize(sort=sort)
return _re_wrap_factorize(original, uniques, codes)

if not isinstance(values.dtype, np.dtype):
# i.e. ExtensionDtype
codes, uniques = values.factorize(na_sentinel=na_sentinel)
elif not isinstance(values.dtype, np.dtype):
if (
na_sentinel == -1
and "use_na_sentinel" in inspect.signature(values.factorize).parameters
):
# Avoid using catch_warnings when possible
# GH#46910 - TimelikeOps has deprecated signature
codes, uniques = values.factorize( # type: ignore[call-arg]
use_na_sentinel=True
)
else:
with warnings.catch_warnings():
# We've already warned above
warnings.filterwarnings("ignore", ".*use_na_sentinel.*", FutureWarning)
codes, uniques = values.factorize(na_sentinel=na_sentinel)

else:
values = np.asarray(values) # convert DTA/TDA/MultiIndex
codes, uniques = factorize_array(
Expand All @@ -763,6 +791,56 @@ def factorize(
return _re_wrap_factorize(original, uniques, codes)


def resolve_na_sentinel(
    na_sentinel: int | None | lib.NoDefault,
    use_na_sentinel: bool | lib.NoDefault,
) -> int | None:
    """
    Determine value of na_sentinel for factorize methods.

    See GH#46910 for details on the deprecation.

    Parameters
    ----------
    na_sentinel : int, None, or lib.no_default
        Value passed to the method.
    use_na_sentinel : bool or lib.no_default
        Value passed to the method.

    Returns
    -------
    int or None
        Resolved value of na_sentinel. When ``na_sentinel`` is not given,
        this is ``-1`` if ``use_na_sentinel`` is not given or is truthy and
        ``None`` otherwise. When ``na_sentinel`` is given, it is returned
        unchanged.

    Raises
    ------
    ValueError
        If both ``na_sentinel`` and ``use_na_sentinel`` are specified.

    Warns
    -----
    FutureWarning
        If ``na_sentinel`` is specified (with any value).
    """
    if na_sentinel is not lib.no_default and use_na_sentinel is not lib.no_default:
        raise ValueError(
            "Cannot specify both `na_sentinel` and `use_na_sentinel`; "
            f"got `na_sentinel={na_sentinel}` and `use_na_sentinel={use_na_sentinel}`"
        )
    if na_sentinel is lib.no_default:
        # Only the new-style argument (or nothing) was given: no deprecation
        # warning, just translate the boolean into the sentinel value.
        result = -1 if use_na_sentinel is lib.no_default or use_na_sentinel else None
    else:
        # The deprecated argument was given explicitly; pick the message that
        # tells the caller exactly which replacement to use.
        if na_sentinel is None:
            msg = (
                "Specifying `na_sentinel=None` is deprecated, specify "
                "`use_na_sentinel=False` instead."
            )
        elif na_sentinel == -1:
            msg = (
                "Specifying `na_sentinel=-1` is deprecated, specify "
                "`use_na_sentinel=True` instead."
            )
        else:
            msg = (
                "Specifying the specific value to use for `na_sentinel` is "
                "deprecated and will be removed in a future version of pandas. "
                "Specify `use_na_sentinel=True` to use the sentinel value -1, and "
                "`use_na_sentinel=False` to encode NaN values."
            )
        warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())
        result = na_sentinel
    return result


def _re_wrap_factorize(original, uniques, codes: np.ndarray):
"""
Wrap factorize results in Series or Index depending on original type.
Expand Down Expand Up @@ -956,7 +1034,7 @@ def mode(
try:
npresult = np.sort(npresult)
except TypeError as err:
warn(f"Unable to sort modes: {err}")
warnings.warn(f"Unable to sort modes: {err}")

result = _reconstruct_data(npresult, original.dtype, original)
return result
Expand Down Expand Up @@ -1576,7 +1654,7 @@ def diff(arr, n: int, axis: int = 0):
raise ValueError(f"cannot diff {type(arr).__name__} on axis={axis}")
return op(arr, arr.shift(n))
else:
warn(
warnings.warn(
"dtype lost in 'diff()'. In the future this will raise a "
"TypeError. Convert to a suitable dtype prior to calling 'diff'.",
FutureWarning,
Expand Down
13 changes: 12 additions & 1 deletion pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import numpy as np

from pandas._libs import lib
from pandas._typing import (
Dtype,
PositionalIndexer,
Expand All @@ -31,6 +32,7 @@
)
from pandas.core.dtypes.missing import isna

from pandas.core.algorithms import resolve_na_sentinel
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays.base import ExtensionArray
from pandas.core.indexers import (
Expand Down Expand Up @@ -286,7 +288,16 @@ def dropna(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
return type(self)(pc.drop_null(self._data))

@doc(ExtensionArray.factorize)
def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
def factorize(
self,
na_sentinel: int | lib.NoDefault = lib.no_default,
use_na_sentinel: bool | lib.NoDefault = lib.no_default,
) -> tuple[np.ndarray, ExtensionArray]:
resolved_na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel)
if resolved_na_sentinel is None:
raise NotImplementedError("Encoding NaN values is not yet implemented")
else:
na_sentinel = resolved_na_sentinel
encoded = self._data.dictionary_encode()
indices = pa.chunked_array(
[c.indices for c in encoded.chunks], type=encoded.type.index_type
Expand Down
45 changes: 44 additions & 1 deletion pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"""
from __future__ import annotations

import inspect
import operator
from typing import (
TYPE_CHECKING,
Expand All @@ -20,6 +21,7 @@
cast,
overload,
)
import warnings

import numpy as np

Expand All @@ -45,6 +47,7 @@
cache_readonly,
deprecate_nonkeyword_arguments,
)
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import (
validate_bool_kwarg,
validate_fillna_kwargs,
Expand Down Expand Up @@ -76,6 +79,7 @@
isin,
mode,
rank,
resolve_na_sentinel,
unique,
)
from pandas.core.array_algos.quantile import quantile_with_mask
Expand Down Expand Up @@ -456,6 +460,24 @@ def __ne__(self, other: Any) -> ArrayLike: # type: ignore[override]
"""
return ~(self == other)

def __init_subclass__(cls, **kwargs):
    """
    Warn when a new subclass's ``factorize`` lacks ``use_na_sentinel``.

    Runs when a subclass is created. The signature of the subclass's
    ``factorize`` is inspected and a ``DeprecationWarning`` is emitted if
    it has no ``use_na_sentinel`` parameter. See GH#46910 for details on
    the deprecation.

    Parameters
    ----------
    **kwargs
        Class keyword arguments, forwarded to the next
        ``__init_subclass__`` in the MRO.
    """
    # Cooperate with any other __init_subclass__ hooks in the MRO instead of
    # silently swallowing the class keyword arguments.
    super().__init_subclass__(**kwargs)
    factorize = cls.factorize
    if (
        "use_na_sentinel" not in inspect.signature(factorize).parameters
        # TimelikeOps uses old factorize args to ensure we don't break things
        and cls.__name__ not in ("TimelikeOps", "DatetimeArray", "TimedeltaArray")
    ):
        # See GH#46910 for details on the deprecation
        name = cls.__name__
        warnings.warn(
            f"The `na_sentinel` argument of `{name}.factorize` is deprecated. "
            f"In the future, pandas will use the `use_na_sentinel` argument "
            f"instead. Add this argument to `{name}.factorize` to be compatible "
            f"with future versions of pandas and silence this warning.",
            DeprecationWarning,
            stacklevel=find_stack_level(),
        )

def to_numpy(
self,
dtype: npt.DTypeLike | None = None,
Expand Down Expand Up @@ -1002,7 +1024,11 @@ def _values_for_factorize(self) -> tuple[np.ndarray, Any]:
"""
return self.astype(object), np.nan

def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
def factorize(
self,
na_sentinel: int | lib.NoDefault = lib.no_default,
use_na_sentinel: bool | lib.NoDefault = lib.no_default,
) -> tuple[np.ndarray, ExtensionArray]:
"""
Encode the extension array as an enumerated type.
Expand All @@ -1011,6 +1037,18 @@ def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
na_sentinel : int, default -1
Value to use in the `codes` array to indicate missing values.
.. deprecated:: 1.5.0
The na_sentinel argument is deprecated and
will be removed in a future version of pandas. Specify use_na_sentinel
as either True or False.
use_na_sentinel : bool, default True
If True, the sentinel -1 will be used for NaN values. If False,
NaN values will be encoded as non-negative integers and will not drop the
NaN from the uniques of the values.
.. versionadded:: 1.5.0
Returns
-------
codes : ndarray
Expand Down Expand Up @@ -1041,6 +1079,11 @@ def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
# original ExtensionArray.
# 2. ExtensionArray.factorize.
# Complete control over factorization.
resolved_na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel)
if resolved_na_sentinel is None:
raise NotImplementedError("Encoding NaN values is not yet implemented")
else:
na_sentinel = resolved_na_sentinel
arr, na_value = self._values_for_factorize()

codes, uniques = factorize_array(
Expand Down
7 changes: 6 additions & 1 deletion pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -1996,7 +1996,12 @@ def _with_freq(self, freq):

# --------------------------------------------------------------

def factorize(self, na_sentinel=-1, sort: bool = False):
# GH#46910 - Keep old signature to test we don't break things for EA library authors
def factorize( # type:ignore[override]
self,
na_sentinel: int = -1,
sort: bool = False,
):
if self.freq is not None:
# We must be unique, so can short-circuit (and retain freq)
codes = np.arange(len(self), dtype=np.intp)
Expand Down
11 changes: 10 additions & 1 deletion pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -869,7 +869,16 @@ def searchsorted(
return self._data.searchsorted(value, side=side, sorter=sorter)

@doc(ExtensionArray.factorize)
def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
def factorize(
self,
na_sentinel: int | lib.NoDefault = lib.no_default,
use_na_sentinel: bool | lib.NoDefault = lib.no_default,
) -> tuple[np.ndarray, ExtensionArray]:
resolved_na_sentinel = algos.resolve_na_sentinel(na_sentinel, use_na_sentinel)
if resolved_na_sentinel is None:
raise NotImplementedError("Encoding NaN values is not yet implemented")
else:
na_sentinel = resolved_na_sentinel
arr = self._data
mask = self._mask

Expand Down
10 changes: 8 additions & 2 deletions pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -848,13 +848,19 @@ def _values_for_factorize(self):
# Still override this for hash_pandas_object
return np.asarray(self), self.fill_value

def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, SparseArray]:
def factorize(
self,
na_sentinel: int | lib.NoDefault = lib.no_default,
use_na_sentinel: bool | lib.NoDefault = lib.no_default,
) -> tuple[np.ndarray, SparseArray]:
# Currently, ExtensionArray.factorize -> Tuple[ndarray, EA]
# The sparsity on this is backwards from what Sparse would want. Want
# ExtensionArray.factorize -> Tuple[EA, EA]
# Given that we have to return a dense array of codes, why bother
# implementing an efficient factorize?
codes, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel)
codes, uniques = algos.factorize(
np.asarray(self), na_sentinel=na_sentinel, use_na_sentinel=use_na_sentinel
)
uniques_sp = SparseArray(uniques, dtype=self.dtype)
return codes, uniques_sp

Expand Down
Loading

0 comments on commit d580826

Please sign in to comment.