Backport PR #35852: API: replace dropna=False option with na_sentinel=None in factorize (#36071)

Co-authored-by: Kaiqi Dong <[email protected]>
meeseeksmachine and charlesdong1991 authored Sep 2, 2020
1 parent f9fed6a commit 9bc223e
Showing 6 changed files with 66 additions and 41 deletions.
8 changes: 8 additions & 0 deletions doc/source/whatsnew/v1.1.2.rst
@@ -34,6 +34,14 @@ Bug fixes

.. ---------------------------------------------------------------------------
.. _whatsnew_112.other:

Other
~~~~~
- :meth:`factorize` now supports ``na_sentinel=None`` to include NaN in the uniques of the values; the ``dropna`` keyword, which was unintentionally exposed in the public API of :meth:`factorize` in version 1.1, has been removed (:issue:`35667`)

.. ---------------------------------------------------------------------------
.. _whatsnew_112.contributors:

Contributors
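A quick sketch of the user-facing behavior described in the whatsnew entry above. The input and expected output are taken from the tests added in this commit; the exact array repr spacing may vary slightly by NumPy version:

>>> import pandas as pd
>>> codes, uniques = pd.factorize(["a", None, "b", "a"])  # default: na_sentinel=-1
>>> codes
array([ 0, -1,  1,  0])
>>> uniques
array(['a', 'b'], dtype=object)
>>> codes, uniques = pd.factorize(["a", None, "b", "a"], na_sentinel=None)
>>> codes
array([0, 2, 1, 0])
>>> uniques
array(['a', 'b', nan], dtype=object)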
33 changes: 29 additions & 4 deletions pandas/core/algorithms.py
Expand Up @@ -525,9 +525,8 @@ def _factorize_array(
def factorize(
values,
sort: bool = False,
na_sentinel: int = -1,
na_sentinel: Optional[int] = -1,
size_hint: Optional[int] = None,
dropna: bool = True,
) -> Tuple[np.ndarray, Union[np.ndarray, ABCIndex]]:
"""
Encode the object as an enumerated type or categorical variable.
@@ -540,8 +539,11 @@ def factorize(
Parameters
----------
{values}{sort}
na_sentinel : int, default -1
Value to mark "not found".
na_sentinel : int or None, default -1
Value to mark "not found". If None, NaN will not be dropped from
the uniques of the values.
.. versionchanged:: 1.1.2
{size_hint}\
Returns
@@ -619,6 +621,22 @@ def factorize(
array([0, 0, 1]...)
>>> uniques
Index(['a', 'c'], dtype='object')
If NaN is in the values, and we want to include NaN in the uniques of the
values, it can be achieved by setting ``na_sentinel=None``.
>>> values = np.array([1, 2, 1, np.nan])
>>> codes, uniques = pd.factorize(values) # default: na_sentinel=-1
>>> codes
array([ 0,  1,  0, -1])
>>> uniques
array([1., 2.])
>>> codes, uniques = pd.factorize(values, na_sentinel=None)
>>> codes
array([0, 1, 0, 2])
>>> uniques
array([ 1.,  2., nan])
"""
# Implementation notes: This method is responsible for 3 things
# 1.) coercing data to array-like (ndarray, Index, extension array)
@@ -632,6 +650,13 @@ def factorize(
values = _ensure_arraylike(values)
original = values

# GH35667: if na_sentinel=None, we will not drop NaNs from the uniques
# of values; assign na_sentinel=-1 as the code value for NaN.
dropna = True
if na_sentinel is None:
na_sentinel = -1
dropna = False

if is_extension_array_dtype(values.dtype):
values = extract_array(values)
codes, uniques = values.factorize(na_sentinel=na_sentinel)
2 changes: 1 addition & 1 deletion pandas/core/base.py
@@ -1398,7 +1398,7 @@ def memory_usage(self, deep=False):
"""
),
)
def factorize(self, sort=False, na_sentinel=-1):
def factorize(self, sort: bool = False, na_sentinel: Optional[int] = -1):
return algorithms.factorize(self, sort=sort, na_sentinel=na_sentinel)

_shared_docs[
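With the widened signature above, Series.factorize simply forwards na_sentinel=None to algorithms.factorize. A minimal sketch mirroring the new test_series_factorize_na_sentinel_none test added below; the Float64Index repr assumes pandas 1.1.x:

>>> import numpy as np
>>> import pandas as pd
>>> ser = pd.Series([1, 2, 1, np.nan])
>>> codes, uniques = ser.factorize(na_sentinel=None)
>>> codes
array([0, 1, 0, 2])
>>> uniques
Float64Index([1.0, 2.0, nan], dtype='float64')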
7 changes: 6 additions & 1 deletion pandas/core/groupby/grouper.py
@@ -585,8 +585,13 @@ def _make_codes(self) -> None:
codes = self.grouper.codes_info
uniques = self.grouper.result_index
else:
# GH35667, replace dropna=False with na_sentinel=None
if not self.dropna:
na_sentinel = None
else:
na_sentinel = -1
codes, uniques = algorithms.factorize(
self.grouper, sort=self.sort, dropna=self.dropna
self.grouper, sort=self.sort, na_sentinel=na_sentinel
)
uniques = Index(uniques, name=self.name)
self._codes = codes
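The grouper change above is what lets ``groupby(..., dropna=False)`` keep the NaN group without relying on the ``dropna`` keyword that was unintentionally exposed on factorize (GH 35667). A hedged illustration of that user-facing path, not part of this diff; output shown approximately for pandas 1.1.x:

>>> import numpy as np
>>> import pandas as pd
>>> df = pd.DataFrame({"a": [1, 1, np.nan], "b": [2, 3, 4]})
>>> df.groupby("a", dropna=False)["b"].sum()
a
1.0    5
NaN    4
Name: b, dtype: int64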
13 changes: 13 additions & 0 deletions pandas/tests/base/test_factorize.py
@@ -26,3 +26,16 @@ def test_factorize(index_or_series_obj, sort):

tm.assert_numpy_array_equal(result_codes, expected_codes)
tm.assert_index_equal(result_uniques, expected_uniques)


def test_series_factorize_na_sentinel_none():
# GH35667
values = np.array([1, 2, 1, np.nan])
ser = pd.Series(values)
codes, uniques = ser.factorize(na_sentinel=None)

expected_codes = np.array([0, 1, 0, 2], dtype="int64")
expected_uniques = pd.Index([1.0, 2.0, np.nan])

tm.assert_numpy_array_equal(codes, expected_codes)
tm.assert_index_equal(uniques, expected_uniques)
44 changes: 9 additions & 35 deletions pandas/tests/test_algos.py
@@ -326,73 +326,47 @@ def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques):
tm.assert_extension_array_equal(uniques, expected_uniques)

@pytest.mark.parametrize(
"data, dropna, expected_codes, expected_uniques",
"data, expected_codes, expected_uniques",
[
(
["a", None, "b", "a"],
True,
np.array([0, -1, 1, 0], dtype=np.dtype("intp")),
np.array(["a", "b"], dtype=object),
),
(
["a", np.nan, "b", "a"],
True,
np.array([0, -1, 1, 0], dtype=np.dtype("intp")),
np.array(["a", "b"], dtype=object),
),
(
["a", None, "b", "a"],
False,
np.array([0, 2, 1, 0], dtype=np.dtype("intp")),
np.array(["a", "b", np.nan], dtype=object),
),
(
["a", np.nan, "b", "a"],
False,
np.array([0, 2, 1, 0], dtype=np.dtype("intp")),
np.array(["a", "b", np.nan], dtype=object),
),
],
)
def test_object_factorize_dropna(
self, data, dropna, expected_codes, expected_uniques
def test_object_factorize_na_sentinel_none(
self, data, expected_codes, expected_uniques
):
codes, uniques = algos.factorize(data, dropna=dropna)
codes, uniques = algos.factorize(data, na_sentinel=None)

tm.assert_numpy_array_equal(uniques, expected_uniques)
tm.assert_numpy_array_equal(codes, expected_codes)

@pytest.mark.parametrize(
"data, dropna, expected_codes, expected_uniques",
"data, expected_codes, expected_uniques",
[
(
[1, None, 1, 2],
True,
np.array([0, -1, 0, 1], dtype=np.dtype("intp")),
np.array([1, 2], dtype="O"),
),
(
[1, np.nan, 1, 2],
True,
np.array([0, -1, 0, 1], dtype=np.dtype("intp")),
np.array([1, 2], dtype=np.float64),
),
(
[1, None, 1, 2],
False,
np.array([0, 2, 0, 1], dtype=np.dtype("intp")),
np.array([1, 2, np.nan], dtype="O"),
),
(
[1, np.nan, 1, 2],
False,
np.array([0, 2, 0, 1], dtype=np.dtype("intp")),
np.array([1, 2, np.nan], dtype=np.float64),
),
],
)
def test_int_factorize_dropna(self, data, dropna, expected_codes, expected_uniques):
codes, uniques = algos.factorize(data, dropna=dropna)
def test_int_factorize_na_sentinel_none(
self, data, expected_codes, expected_uniques
):
codes, uniques = algos.factorize(data, na_sentinel=None)

tm.assert_numpy_array_equal(uniques, expected_uniques)
tm.assert_numpy_array_equal(codes, expected_codes)
