diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index 9be5b5f0ad2dc..dc7adf6d9d00e 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -34,6 +34,14 @@ Bug fixes .. --------------------------------------------------------------------------- +.. _whatsnew_112.other: + +Other +~~~~~ +- :meth:`factorize` now supports ``na_sentinel=None`` to include NaN in the uniques of the values. The ``dropna`` keyword, which was unintentionally exposed in the public API of :meth:`factorize` in version 1.1, has been removed (:issue:`35667`) + +.. --------------------------------------------------------------------------- + .. _whatsnew_112.contributors: Contributors diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 9e3ca4cc53363..856b4ead3f3cc 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -525,9 +525,8 @@ def _factorize_array( def factorize( values, sort: bool = False, - na_sentinel: int = -1, + na_sentinel: Optional[int] = -1, size_hint: Optional[int] = None, - dropna: bool = True, ) -> Tuple[np.ndarray, Union[np.ndarray, ABCIndex]]: """ Encode the object as an enumerated type or categorical variable. @@ -540,8 +539,11 @@ def factorize( Parameters ---------- {values}{sort} - na_sentinel : int, default -1 - Value to mark "not found". + na_sentinel : int or None, default -1 + Value to mark "not found". If None, will not drop the NaN + from the uniques of the values. + + .. versionchanged:: 1.1.2 {size_hint}\ Returns @@ -619,6 +621,22 @@ def factorize( array([0, 0, 1]...) >>> uniques Index(['a', 'c'], dtype='object') + + If NaN is in the values, and we want to include NaN in the uniques of the + values, it can be achieved by setting ``na_sentinel=None``. 
+ + >>> values = np.array([1, 2, 1, np.nan]) + >>> codes, uniques = pd.factorize(values) # default: na_sentinel=-1 + >>> codes + array([ 0, 1, 0, -1]) + >>> uniques + array([1., 2.]) + + >>> codes, uniques = pd.factorize(values, na_sentinel=None) + >>> codes + array([0, 1, 0, 2]) + >>> uniques + array([ 1., 2., nan]) """ # Implementation notes: This method is responsible for 3 things # 1.) coercing data to array-like (ndarray, Index, extension array) @@ -632,6 +650,13 @@ def factorize( values = _ensure_arraylike(values) original = values + # GH35667, if na_sentinel=None, we will not dropna NaNs from the uniques + # of values, assign na_sentinel=-1 to replace code value for NaN. + dropna = True + if na_sentinel is None: + na_sentinel = -1 + dropna = False + if is_extension_array_dtype(values.dtype): values = extract_array(values) codes, uniques = values.factorize(na_sentinel=na_sentinel) diff --git a/pandas/core/base.py b/pandas/core/base.py index b62ef668df5e1..1926803d8f04b 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1398,7 +1398,7 @@ def memory_usage(self, deep=False): """ ), ) - def factorize(self, sort=False, na_sentinel=-1): + def factorize(self, sort: bool = False, na_sentinel: Optional[int] = -1): return algorithms.factorize(self, sort=sort, na_sentinel=na_sentinel) _shared_docs[ diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 8239a792c65dd..272afe7335c6a 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -585,8 +585,13 @@ def _make_codes(self) -> None: codes = self.grouper.codes_info uniques = self.grouper.result_index else: + # GH35667, replace dropna=False with na_sentinel=None + if not self.dropna: + na_sentinel = None + else: + na_sentinel = -1 codes, uniques = algorithms.factorize( - self.grouper, sort=self.sort, dropna=self.dropna + self.grouper, sort=self.sort, na_sentinel=na_sentinel ) uniques = Index(uniques, name=self.name) self._codes = codes diff --git 
a/pandas/tests/base/test_factorize.py b/pandas/tests/base/test_factorize.py index 415a8b7e4362f..9fad9856d53cc 100644 --- a/pandas/tests/base/test_factorize.py +++ b/pandas/tests/base/test_factorize.py @@ -26,3 +26,16 @@ def test_factorize(index_or_series_obj, sort): tm.assert_numpy_array_equal(result_codes, expected_codes) tm.assert_index_equal(result_uniques, expected_uniques) + + +def test_series_factorize_na_sentinel_none(): + # GH35667 + values = np.array([1, 2, 1, np.nan]) + ser = pd.Series(values) + codes, uniques = ser.factorize(na_sentinel=None) + + expected_codes = np.array([0, 1, 0, 2], dtype="int64") + expected_uniques = pd.Index([1.0, 2.0, np.nan]) + + tm.assert_numpy_array_equal(codes, expected_codes) + tm.assert_index_equal(uniques, expected_uniques) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index a080bf0feaebc..326c926238f89 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -326,73 +326,47 @@ def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques): tm.assert_extension_array_equal(uniques, expected_uniques) @pytest.mark.parametrize( - "data, dropna, expected_codes, expected_uniques", + "data, expected_codes, expected_uniques", [ ( ["a", None, "b", "a"], - True, - np.array([0, -1, 1, 0], dtype=np.dtype("intp")), - np.array(["a", "b"], dtype=object), - ), - ( - ["a", np.nan, "b", "a"], - True, - np.array([0, -1, 1, 0], dtype=np.dtype("intp")), - np.array(["a", "b"], dtype=object), - ), - ( - ["a", None, "b", "a"], - False, np.array([0, 2, 1, 0], dtype=np.dtype("intp")), np.array(["a", "b", np.nan], dtype=object), ), ( ["a", np.nan, "b", "a"], - False, np.array([0, 2, 1, 0], dtype=np.dtype("intp")), np.array(["a", "b", np.nan], dtype=object), ), ], ) - def test_object_factorize_dropna( - self, data, dropna, expected_codes, expected_uniques + def test_object_factorize_na_sentinel_none( + self, data, expected_codes, expected_uniques ): - codes, uniques = algos.factorize(data, 
dropna=dropna) + codes, uniques = algos.factorize(data, na_sentinel=None) tm.assert_numpy_array_equal(uniques, expected_uniques) tm.assert_numpy_array_equal(codes, expected_codes) @pytest.mark.parametrize( - "data, dropna, expected_codes, expected_uniques", + "data, expected_codes, expected_uniques", [ ( [1, None, 1, 2], - True, - np.array([0, -1, 0, 1], dtype=np.dtype("intp")), - np.array([1, 2], dtype="O"), - ), - ( - [1, np.nan, 1, 2], - True, - np.array([0, -1, 0, 1], dtype=np.dtype("intp")), - np.array([1, 2], dtype=np.float64), - ), - ( - [1, None, 1, 2], - False, np.array([0, 2, 0, 1], dtype=np.dtype("intp")), np.array([1, 2, np.nan], dtype="O"), ), ( [1, np.nan, 1, 2], - False, np.array([0, 2, 0, 1], dtype=np.dtype("intp")), np.array([1, 2, np.nan], dtype=np.float64), ), ], ) - def test_int_factorize_dropna(self, data, dropna, expected_codes, expected_uniques): - codes, uniques = algos.factorize(data, dropna=dropna) + def test_int_factorize_na_sentinel_none( + self, data, expected_codes, expected_uniques + ): + codes, uniques = algos.factorize(data, na_sentinel=None) tm.assert_numpy_array_equal(uniques, expected_uniques) tm.assert_numpy_array_equal(codes, expected_codes)