-
-
Notifications
You must be signed in to change notification settings - Fork 18.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
DEPR: na_sentinel in factorize #47157
Changes from 6 commits
552775f
79231e7
c822282
05fa0ca
f626dd8
a15e43a
9a33637
46e7a8d
b1edc89
c8d6fa2
0fd1ea7
465ab2b
6b4917c
d8e3d6b
0111eef
39b3747
5842053
945bb04
8f8cb18
9630edc
5524d53
fcd8a75
c615de4
80aaf5e
dc97a7b
eb6d5a1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -80,6 +80,7 @@ | |
na_value_for_dtype, | ||
) | ||
|
||
from pandas.core import common as com | ||
from pandas.core.array_algos.take import take_nd | ||
from pandas.core.construction import ( | ||
array as pd_array, | ||
|
@@ -580,7 +581,8 @@ def factorize_array( | |
def factorize( | ||
values, | ||
sort: bool = False, | ||
na_sentinel: int | None = -1, | ||
na_sentinel: int | None | lib.NoDefault = lib.no_default, | ||
use_na_sentinel: bool | lib.NoDefault = lib.no_default, | ||
size_hint: int | None = None, | ||
) -> tuple[np.ndarray, np.ndarray | Index]: | ||
""" | ||
|
@@ -598,7 +600,19 @@ def factorize( | |
Value to mark "not found". If None, will not drop the NaN | ||
from the uniques of the values. | ||
|
||
.. deprecated:: 1.5.0 | ||
The na_sentinel argument is deprecated and | ||
will be removed in a future version of pandas. Specify use_na_sentinel as | ||
either True or False. | ||
|
||
.. versionchanged:: 1.1.2 | ||
|
||
use_na_sentinel : bool, default True | ||
If True, the sentinel -1 will be used for NaN values. If False, | ||
NaN values will be encoded as non-negative integers and will not drop the | ||
NaN from the uniques of the values. | ||
|
||
.. versionadded:: 1.5.0 | ||
{size_hint}\ | ||
|
||
Returns | ||
|
@@ -646,8 +660,8 @@ def factorize( | |
>>> uniques | ||
array(['a', 'b', 'c'], dtype=object) | ||
|
||
Missing values are indicated in `codes` with `na_sentinel` | ||
(``-1`` by default). Note that missing values are never | ||
When ``use_na_sentinel=True`` (the default), missing values are indicated in | ||
the `codes` with the sentinel value ``-1`` and missing values are not | ||
included in `uniques`. | ||
|
||
>>> codes, uniques = pd.factorize(['b', None, 'a', 'c', 'b']) | ||
|
@@ -682,16 +696,16 @@ def factorize( | |
Index(['a', 'c'], dtype='object') | ||
|
||
If NaN is in the values, and we want to include NaN in the uniques of the | ||
values, it can be achieved by setting ``na_sentinel=None``. | ||
values, it can be achieved by setting ``use_na_sentinel=False``. | ||
|
||
>>> values = np.array([1, 2, 1, np.nan]) | ||
>>> codes, uniques = pd.factorize(values) # default: na_sentinel=-1 | ||
>>> codes, uniques = pd.factorize(values) # default: use_na_sentinel=True | ||
>>> codes | ||
array([ 0, 1, 0, -1]) | ||
>>> uniques | ||
array([1., 2.]) | ||
|
||
>>> codes, uniques = pd.factorize(values, na_sentinel=None) | ||
>>> codes, uniques = pd.factorize(values, use_na_sentinel=False) | ||
>>> codes | ||
array([0, 1, 0, 2]) | ||
>>> uniques | ||
|
@@ -706,7 +720,13 @@ def factorize( | |
# responsible only for factorization. All data coercion, sorting and boxing | ||
# should happen here. | ||
|
||
# Can't always warn here because EA's factorize will warn too; warn for each | ||
# path below. | ||
passed_na_sentinel = na_sentinel | ||
na_sentinel = com.resolve_na_sentinel(na_sentinel, use_na_sentinel, warn=False) | ||
if isinstance(values, ABCRangeIndex): | ||
# Emit warning if appropriate | ||
_ = com.resolve_na_sentinel(passed_na_sentinel, use_na_sentinel) | ||
return values.factorize(sort=sort) | ||
|
||
values = _ensure_arraylike(values) | ||
|
@@ -725,15 +745,30 @@ def factorize( | |
isinstance(values, (ABCDatetimeArray, ABCTimedeltaArray)) | ||
and values.freq is not None | ||
): | ||
# Emit warning if appropriate | ||
_ = com.resolve_na_sentinel(passed_na_sentinel, use_na_sentinel) | ||
# The presence of 'freq' means we can fast-path sorting and know there | ||
# aren't NAs | ||
codes, uniques = values.factorize(sort=sort) | ||
return _re_wrap_factorize(original, uniques, codes) | ||
|
||
if not isinstance(values.dtype, np.dtype): | ||
elif not isinstance(values.dtype, np.dtype): | ||
# i.e. ExtensionDtype | ||
codes, uniques = values.factorize(na_sentinel=na_sentinel) | ||
if passed_na_sentinel is lib.no_default: | ||
# User didn't specify na_sentinel; avoid warning. Note EA path always | ||
# uses a na_sentinel value. | ||
codes, uniques = values.factorize(use_na_sentinel=True) | ||
elif passed_na_sentinel is None: | ||
# Emit the appropriate warning message for None | ||
_ = com.resolve_na_sentinel(passed_na_sentinel, use_na_sentinel) | ||
codes, uniques = values.factorize(use_na_sentinel=True) | ||
else: | ||
# EA.factorize will warn | ||
codes, uniques = values.factorize(na_sentinel=na_sentinel) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we need to distinguish deprecating the keyword that the user passes (I think that is handled above?) vs deprecating what the EA can accept. Currently, if you have an external EA that has a custom There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i guess also worth clarifying is whether we expect the user to call EA.factorize directly vs pd.factorize There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Generally we don't "expect" users to directly call EA.factorize, but it is a public method on a public object, so it falls under a deprecation policy (as is being done in this PR already). So I think for EA authors that have a custom EA.factorize, we should recommend doing a similar deprecation in their custom method? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks @jorisvandenbossche. I think anyone using a mypy will be getting errors that their signature does not match the base class. I guess this not sufficient? It does seem noisy to add warnings for users, so I added the warning via This also caught a defect in my previous implementation - pd.factorize was passing There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, I didn't notice you mentioned deprecation warning instead of future warning. That makes sense and will be less noisy. |
||
|
||
else: | ||
# Generate warning for na_sentile if appropriate | ||
_ = com.resolve_na_sentinel(passed_na_sentinel, use_na_sentinel) | ||
values = np.asarray(values) # convert DTA/TDA/MultiIndex | ||
codes, uniques = factorize_array( | ||
values, na_sentinel=na_sentinel, size_hint=size_hint | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -704,3 +704,56 @@ def deprecate_numeric_only_default(cls: type, name: str, deprecate_none: bool = | |
) | ||
|
||
warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) | ||
|
||
|
||
def resolve_na_sentinel( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you put this somewhere else, closer to where its used There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Moved to algorithms. |
||
na_sentinel: int | None | lib.NoDefault, | ||
use_na_sentinel: bool | lib.NoDefault, | ||
warn: bool = True, | ||
) -> int | None: | ||
"""Determine value of na_sentinel for factorize methods. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nitpick newline before Determine |
||
|
||
See GH#46910 for details on the deprecation. | ||
|
||
Parameters | ||
---------- | ||
na_sentinel : int, None, or lib.no_default | ||
Value passed to the method. | ||
use_na_sentinel : bool or lib.no_default | ||
Value passed to the method. | ||
warn : bool, default True | ||
Whether to emit a warning if a deprecated use is detected. | ||
|
||
Returns | ||
------- | ||
Resolved value of na_sentinel. | ||
""" | ||
if na_sentinel is not lib.no_default and use_na_sentinel is not lib.no_default: | ||
raise ValueError( | ||
"Cannot specify both na_sentinel and use_na_sentile; " | ||
f"got na_sentinel={na_sentinel} and use_na_sentinel={use_na_sentinel}" | ||
) | ||
if na_sentinel is lib.no_default: | ||
result = -1 if use_na_sentinel is lib.no_default or use_na_sentinel else None | ||
else: | ||
if warn: | ||
if na_sentinel is None: | ||
msg = ( | ||
"Specifying na_sentinel=None is deprecated, specify " | ||
"use_na_sentinel=False instead." | ||
) | ||
elif na_sentinel == -1: | ||
msg = ( | ||
"Specifying na_sentinel=-1 is deprecated, specify " | ||
"use_na_sentinel=True instead." | ||
) | ||
else: | ||
msg = ( | ||
"Specifying the specific value to use for na_sentinel is " | ||
"deprecated and will be removed in a future version of pandas. " | ||
"Specify na_sentinel=True to use the sentinel value -1, and " | ||
"na_sentinel=False to encode NaN values." | ||
) | ||
warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) | ||
result = na_sentinel | ||
return result |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I took this to mean use backticks for any argument / code in the warnings. I went and implemented that for all warnings.