-
-
Notifications
You must be signed in to change notification settings - Fork 18.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
DEPR: na_sentinel in factorize #47157
Changes from 24 commits
552775f
79231e7
c822282
05fa0ca
f626dd8
a15e43a
9a33637
46e7a8d
b1edc89
c8d6fa2
0fd1ea7
465ab2b
6b4917c
d8e3d6b
0111eef
39b3747
5842053
945bb04
8f8cb18
9630edc
5524d53
fcd8a75
c615de4
80aaf5e
dc97a7b
eb6d5a1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,6 +4,7 @@ | |
""" | ||
from __future__ import annotations | ||
|
||
import inspect | ||
import operator | ||
from textwrap import dedent | ||
from typing import ( | ||
|
@@ -14,7 +15,7 @@ | |
cast, | ||
final, | ||
) | ||
from warnings import warn | ||
import warnings | ||
|
||
import numpy as np | ||
|
||
|
@@ -580,7 +581,8 @@ def factorize_array( | |
def factorize( | ||
values, | ||
sort: bool = False, | ||
na_sentinel: int | None = -1, | ||
na_sentinel: int | None | lib.NoDefault = lib.no_default, | ||
use_na_sentinel: bool | lib.NoDefault = lib.no_default, | ||
size_hint: int | None = None, | ||
) -> tuple[np.ndarray, np.ndarray | Index]: | ||
""" | ||
|
@@ -598,7 +600,19 @@ def factorize( | |
Value to mark "not found". If None, will not drop the NaN | ||
from the uniques of the values. | ||
|
||
.. deprecated:: 1.5.0 | ||
The na_sentinel argument is deprecated and | ||
will be removed in a future version of pandas. Specify use_na_sentinel as | ||
either True or False. | ||
|
||
.. versionchanged:: 1.1.2 | ||
|
||
use_na_sentinel : bool, default True | ||
If True, the sentinel -1 will be used for NaN values. If False, | ||
NaN values will be encoded as non-negative integers and will not drop the | ||
NaN from the uniques of the values. | ||
|
||
.. versionadded:: 1.5.0 | ||
{size_hint}\ | ||
|
||
Returns | ||
|
@@ -646,8 +660,8 @@ def factorize( | |
>>> uniques | ||
array(['a', 'b', 'c'], dtype=object) | ||
|
||
Missing values are indicated in `codes` with `na_sentinel` | ||
(``-1`` by default). Note that missing values are never | ||
When ``use_na_sentinel=True`` (the default), missing values are indicated in | ||
the `codes` with the sentinel value ``-1`` and missing values are not | ||
included in `uniques`. | ||
|
||
>>> codes, uniques = pd.factorize(['b', None, 'a', 'c', 'b']) | ||
|
@@ -682,16 +696,16 @@ def factorize( | |
Index(['a', 'c'], dtype='object') | ||
|
||
If NaN is in the values, and we want to include NaN in the uniques of the | ||
values, it can be achieved by setting ``na_sentinel=None``. | ||
values, it can be achieved by setting ``use_na_sentinel=False``. | ||
|
||
>>> values = np.array([1, 2, 1, np.nan]) | ||
>>> codes, uniques = pd.factorize(values) # default: na_sentinel=-1 | ||
>>> codes, uniques = pd.factorize(values) # default: use_na_sentinel=True | ||
>>> codes | ||
array([ 0, 1, 0, -1]) | ||
>>> uniques | ||
array([1., 2.]) | ||
|
||
>>> codes, uniques = pd.factorize(values, na_sentinel=None) | ||
>>> codes, uniques = pd.factorize(values, use_na_sentinel=False) | ||
>>> codes | ||
array([0, 1, 0, 2]) | ||
>>> uniques | ||
|
@@ -706,6 +720,7 @@ def factorize( | |
# responsible only for factorization. All data coercion, sorting and boxing | ||
# should happen here. | ||
|
||
na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel) | ||
if isinstance(values, ABCRangeIndex): | ||
return values.factorize(sort=sort) | ||
|
||
|
@@ -730,9 +745,22 @@ def factorize( | |
codes, uniques = values.factorize(sort=sort) | ||
return _re_wrap_factorize(original, uniques, codes) | ||
|
||
if not isinstance(values.dtype, np.dtype): | ||
# i.e. ExtensionDtype | ||
codes, uniques = values.factorize(na_sentinel=na_sentinel) | ||
elif not isinstance(values.dtype, np.dtype): | ||
if ( | ||
na_sentinel == -1 | ||
and "use_na_sentinel" in inspect.signature(values.factorize).parameters | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why are you inspecting like this? its always passed (with no_default maybe) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We're inspecting whether the EA There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Might be good to add a comment to that effect |
||
): | ||
# Avoid using catch_warnings when possible | ||
# GH#46910 - TimelikeOps has deprecated signature | ||
codes, uniques = values.factorize( # type: ignore[call-arg] | ||
use_na_sentinel=True | ||
) | ||
else: | ||
with warnings.catch_warnings(): | ||
# We've already warned above | ||
warnings.filterwarnings("ignore", ".*use_na_sentinel.*", FutureWarning) | ||
codes, uniques = values.factorize(na_sentinel=na_sentinel) | ||
|
||
else: | ||
values = np.asarray(values) # convert DTA/TDA/MultiIndex | ||
codes, uniques = factorize_array( | ||
|
@@ -757,6 +785,56 @@ def factorize( | |
return _re_wrap_factorize(original, uniques, codes) | ||
|
||
|
||
def resolve_na_sentinel( | ||
na_sentinel: int | None | lib.NoDefault, | ||
use_na_sentinel: bool | lib.NoDefault, | ||
) -> int | None: | ||
""" | ||
Determine value of na_sentinel for factorize methods. | ||
|
||
See GH#46910 for details on the deprecation. | ||
|
||
Parameters | ||
---------- | ||
na_sentinel : int, None, or lib.no_default | ||
Value passed to the method. | ||
use_na_sentinel : bool or lib.no_default | ||
Value passed to the method. | ||
|
||
Returns | ||
------- | ||
Resolved value of na_sentinel. | ||
""" | ||
if na_sentinel is not lib.no_default and use_na_sentinel is not lib.no_default: | ||
raise ValueError( | ||
"Cannot specify both `na_sentinel` and `use_na_sentile`; " | ||
f"got `na_sentinel={na_sentinel}` and `use_na_sentinel={use_na_sentinel}`" | ||
) | ||
if na_sentinel is lib.no_default: | ||
result = -1 if use_na_sentinel is lib.no_default or use_na_sentinel else None | ||
else: | ||
if na_sentinel is None: | ||
msg = ( | ||
"Specifying `na_sentinel=None` is deprecated, specify " | ||
"`use_na_sentinel=False` instead." | ||
) | ||
elif na_sentinel == -1: | ||
msg = ( | ||
"Specifying `na_sentinel=-1` is deprecated, specify " | ||
"`use_na_sentinel=True` instead." | ||
) | ||
else: | ||
msg = ( | ||
"Specifying the specific value to use for `na_sentinel` is " | ||
"deprecated and will be removed in a future version of pandas. " | ||
"Specify `use_na_sentinel=True` to use the sentinel value -1, and " | ||
"`use_na_sentinel=False` to encode NaN values." | ||
) | ||
warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) | ||
result = na_sentinel | ||
return result | ||
|
||
|
||
def _re_wrap_factorize(original, uniques, codes: np.ndarray): | ||
""" | ||
Wrap factorize results in Series or Index depending on original type. | ||
|
@@ -950,7 +1028,7 @@ def mode( | |
try: | ||
npresult = np.sort(npresult) | ||
except TypeError as err: | ||
warn(f"Unable to sort modes: {err}") | ||
warnings.warn(f"Unable to sort modes: {err}") | ||
|
||
result = _reconstruct_data(npresult, original.dtype, original) | ||
return result | ||
|
@@ -1570,7 +1648,7 @@ def diff(arr, n: int, axis: int = 0): | |
raise ValueError(f"cannot diff {type(arr).__name__} on axis={axis}") | ||
return op(arr, arr.shift(n)) | ||
else: | ||
warn( | ||
warnings.warn( | ||
"dtype lost in 'diff()'. In the future this will raise a " | ||
"TypeError. Convert to a suitable dtype prior to calling 'diff'.", | ||
FutureWarning, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I took this to mean use backticks for any argument / code in the warnings. I went and implemented that for all warnings.