Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Backport PR #35852 on branch 1.1.x (API: replace dropna=False option with na_sentinel=None in factorize) #36071

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions doc/source/whatsnew/v1.1.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,14 @@ Bug fixes

.. ---------------------------------------------------------------------------

.. _whatsnew_112.other:

Other
~~~~~
- :meth:`factorize` now supports ``na_sentinel=None`` to include NaN in the uniques of the values; the ``dropna`` keyword, which was unintentionally exposed in the public API in version 1.1, has been removed from :meth:`factorize` (:issue:`35667`)

.. ---------------------------------------------------------------------------

.. _whatsnew_112.contributors:

Contributors
Expand Down
33 changes: 29 additions & 4 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -525,9 +525,8 @@ def _factorize_array(
def factorize(
values,
sort: bool = False,
na_sentinel: int = -1,
na_sentinel: Optional[int] = -1,
size_hint: Optional[int] = None,
dropna: bool = True,
) -> Tuple[np.ndarray, Union[np.ndarray, ABCIndex]]:
"""
Encode the object as an enumerated type or categorical variable.
Expand All @@ -540,8 +539,11 @@ def factorize(
Parameters
----------
{values}{sort}
na_sentinel : int, default -1
Value to mark "not found".
na_sentinel : int or None, default -1
    Value to mark "not found". If None, NaN values are not dropped and
    are included in the uniques of the values.

.. versionchanged:: 1.1.2
{size_hint}\

Returns
Expand Down Expand Up @@ -619,6 +621,22 @@ def factorize(
array([0, 0, 1]...)
>>> uniques
Index(['a', 'c'], dtype='object')

If NaN is in the values, and we want to include NaN in the uniques of the
values, it can be achieved by setting ``na_sentinel=None``.

>>> values = np.array([1, 2, 1, np.nan])
>>> codes, uniques = pd.factorize(values) # default: na_sentinel=-1
>>> codes
array([ 0, 1, 0, -1])
>>> uniques
array([1., 2.])

>>> codes, uniques = pd.factorize(values, na_sentinel=None)
>>> codes
array([0, 1, 0, 2])
>>> uniques
array([ 1., 2., nan])
"""
# Implementation notes: This method is responsible for 3 things
# 1.) coercing data to array-like (ndarray, Index, extension array)
Expand All @@ -632,6 +650,13 @@ def factorize(
values = _ensure_arraylike(values)
original = values

# GH35667, if na_sentinel=None, we will not drop NaNs from the
# uniques of the values; internally use na_sentinel=-1 as the code for NaN.
dropna = True
if na_sentinel is None:
na_sentinel = -1
dropna = False

if is_extension_array_dtype(values.dtype):
values = extract_array(values)
codes, uniques = values.factorize(na_sentinel=na_sentinel)
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1398,7 +1398,7 @@ def memory_usage(self, deep=False):
"""
),
)
def factorize(self, sort=False, na_sentinel=-1):
def factorize(self, sort: bool = False, na_sentinel: Optional[int] = -1):
return algorithms.factorize(self, sort=sort, na_sentinel=na_sentinel)

_shared_docs[
Expand Down
7 changes: 6 additions & 1 deletion pandas/core/groupby/grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -585,8 +585,13 @@ def _make_codes(self) -> None:
codes = self.grouper.codes_info
uniques = self.grouper.result_index
else:
# GH35667, replace dropna=False with na_sentinel=None
if not self.dropna:
na_sentinel = None
else:
na_sentinel = -1
codes, uniques = algorithms.factorize(
self.grouper, sort=self.sort, dropna=self.dropna
self.grouper, sort=self.sort, na_sentinel=na_sentinel
)
uniques = Index(uniques, name=self.name)
self._codes = codes
Expand Down
13 changes: 13 additions & 0 deletions pandas/tests/base/test_factorize.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,16 @@ def test_factorize(index_or_series_obj, sort):

tm.assert_numpy_array_equal(result_codes, expected_codes)
tm.assert_index_equal(result_uniques, expected_uniques)


def test_series_factorize_na_sentinel_none():
# GH35667
values = np.array([1, 2, 1, np.nan])
ser = pd.Series(values)
codes, uniques = ser.factorize(na_sentinel=None)

expected_codes = np.array([0, 1, 0, 2], dtype="int64")
expected_uniques = pd.Index([1.0, 2.0, np.nan])

tm.assert_numpy_array_equal(codes, expected_codes)
tm.assert_index_equal(uniques, expected_uniques)
44 changes: 9 additions & 35 deletions pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,73 +326,47 @@ def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques):
tm.assert_extension_array_equal(uniques, expected_uniques)

@pytest.mark.parametrize(
"data, dropna, expected_codes, expected_uniques",
"data, expected_codes, expected_uniques",
[
(
["a", None, "b", "a"],
True,
np.array([0, -1, 1, 0], dtype=np.dtype("intp")),
np.array(["a", "b"], dtype=object),
),
(
["a", np.nan, "b", "a"],
True,
np.array([0, -1, 1, 0], dtype=np.dtype("intp")),
np.array(["a", "b"], dtype=object),
),
(
["a", None, "b", "a"],
False,
np.array([0, 2, 1, 0], dtype=np.dtype("intp")),
np.array(["a", "b", np.nan], dtype=object),
),
(
["a", np.nan, "b", "a"],
False,
np.array([0, 2, 1, 0], dtype=np.dtype("intp")),
np.array(["a", "b", np.nan], dtype=object),
),
],
)
def test_object_factorize_dropna(
self, data, dropna, expected_codes, expected_uniques
def test_object_factorize_na_sentinel_none(
self, data, expected_codes, expected_uniques
):
codes, uniques = algos.factorize(data, dropna=dropna)
codes, uniques = algos.factorize(data, na_sentinel=None)

tm.assert_numpy_array_equal(uniques, expected_uniques)
tm.assert_numpy_array_equal(codes, expected_codes)

@pytest.mark.parametrize(
"data, dropna, expected_codes, expected_uniques",
"data, expected_codes, expected_uniques",
[
(
[1, None, 1, 2],
True,
np.array([0, -1, 0, 1], dtype=np.dtype("intp")),
np.array([1, 2], dtype="O"),
),
(
[1, np.nan, 1, 2],
True,
np.array([0, -1, 0, 1], dtype=np.dtype("intp")),
np.array([1, 2], dtype=np.float64),
),
(
[1, None, 1, 2],
False,
np.array([0, 2, 0, 1], dtype=np.dtype("intp")),
np.array([1, 2, np.nan], dtype="O"),
),
(
[1, np.nan, 1, 2],
False,
np.array([0, 2, 0, 1], dtype=np.dtype("intp")),
np.array([1, 2, np.nan], dtype=np.float64),
),
],
)
def test_int_factorize_dropna(self, data, dropna, expected_codes, expected_uniques):
codes, uniques = algos.factorize(data, dropna=dropna)
def test_int_factorize_na_sentinel_none(
self, data, expected_codes, expected_uniques
):
codes, uniques = algos.factorize(data, na_sentinel=None)

tm.assert_numpy_array_equal(uniques, expected_uniques)
tm.assert_numpy_array_equal(codes, expected_codes)
Expand Down