From 670c2e8dc2f370ff847b9ce3242dcf643072f922 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 1 Apr 2022 10:14:36 -0400 Subject: [PATCH 01/16] BUG: algos.factorize moves null values when sort=False --- doc/source/whatsnew/v1.5.0.rst | 2 + pandas/_libs/hashtable_class_helper.pxi.in | 12 ++--- pandas/core/algorithms.py | 40 +++++++++++---- pandas/core/arrays/arrow/array.py | 7 ++- pandas/core/arrays/base.py | 11 ++++- pandas/core/arrays/datetimelike.py | 5 +- pandas/core/arrays/masked.py | 28 +++++++++-- pandas/core/arrays/sparse/array.py | 9 +++- pandas/core/arrays/string_.py | 7 +++ pandas/core/groupby/grouper.py | 5 +- pandas/tests/groupby/test_groupby_dropna.py | 49 +++++++++++++++++++ .../tests/groupby/transform/test_transform.py | 6 +-- pandas/tests/test_algos.py | 16 +++--- 13 files changed, 155 insertions(+), 42 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 08500019143ed..74ab37f856144 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -96,6 +96,7 @@ Other enhancements - :meth:`pd.concat` now raises when ``levels`` contains duplicate values (:issue:`46653`) - Added ``numeric_only`` argument to :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith`, and :meth:`DataFrame.cov` (:issue:`46560`) - A :class:`errors.PerformanceWarning` is now thrown when using ``string[pyarrow]`` dtype with methods that don't dispatch to ``pyarrow.compute`` methods (:issue:`42613`) +- The method :meth:`.ExtensionArray.factorize` has gained the argument ``dropna`` for determining how null values are to be treated. (:issue:`46601`) .. --------------------------------------------------------------------------- .. _whatsnew_150.notable_bug_fixes: @@ -605,6 +606,7 @@ Groupby/resample/rolling - Bug in :meth:`SeriesGroupBy.apply` would incorrectly name its result when there was a unique group (:issue:`46369`) - Bug in :meth:`.Rolling.var` would segfault calculating weighted variance when window size was larger than data size (:issue:`46760`) - Bug in :meth:`Grouper.__repr__` where ``dropna`` was not included. Now it is (:issue:`46754`) +- Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` with ``dropna=False`` and ``sort=False`` would put any null groups at the end instead of in the order in which they are encountered (:issue:`46584`) Reshaping ^^^^^^^^^ diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 8a2b9c2f77627..f25e4ad009153 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -658,7 +658,7 @@ cdef class {{name}}HashTable(HashTable): return_inverse=return_inverse) def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1, - object na_value=None, object mask=None): + object na_value=None, object mask=None, ignore_na=True): """ Calculate unique values and labels (no sorting!) 
@@ -690,7 +690,7 @@ cdef class {{name}}HashTable(HashTable): """ uniques_vector = {{name}}Vector() return self._unique(values, uniques_vector, na_sentinel=na_sentinel, - na_value=na_value, ignore_na=True, mask=mask, + na_value=na_value, ignore_na=ignore_na, mask=mask, return_inverse=True) def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, @@ -1037,7 +1037,7 @@ cdef class StringHashTable(HashTable): return_inverse=return_inverse) def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, - object na_value=None, object mask=None): + object na_value=None, object mask=None, ignore_na=True): """ Calculate unique values and labels (no sorting!) @@ -1067,7 +1067,7 @@ cdef class StringHashTable(HashTable): """ uniques_vector = ObjectVector() return self._unique(values, uniques_vector, na_sentinel=na_sentinel, - na_value=na_value, ignore_na=True, + na_value=na_value, ignore_na=ignore_na, return_inverse=True) def get_labels(self, ndarray[object] values, ObjectVector uniques, @@ -1290,7 +1290,7 @@ cdef class PyObjectHashTable(HashTable): return_inverse=return_inverse) def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, - object na_value=None, object mask=None): + object na_value=None, object mask=None, ignore_na=True): """ Calculate unique values and labels (no sorting!) @@ -1320,7 +1320,7 @@ cdef class PyObjectHashTable(HashTable): """ uniques_vector = ObjectVector() return self._unique(values, uniques_vector, na_sentinel=na_sentinel, - na_value=na_value, ignore_na=True, + na_value=na_value, ignore_na=ignore_na, return_inverse=True) def get_labels(self, ndarray[object] values, ObjectVector uniques, diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 0c0b93f41c657..88969502b99fd 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -502,6 +502,7 @@ def factorize_array( size_hint: int | None = None, na_value=None, mask: npt.NDArray[np.bool_] | None = None, + dropna: bool = True, ) -> tuple[npt.NDArray[np.intp], np.ndarray]: """ Factorize a numpy array to codes and uniques. @@ -523,6 +524,11 @@ def factorize_array( If not None, the mask is used as indicator for missing values (True = missing, False = valid) instead of `na_value` or condition "val != val". + dropna: bool, default True + Whether null values will appear in uniques. When False, null values + will receive a nonnegative code instead of na_sentinel. + + ..versionadded:: 1.5.0 Returns ------- @@ -541,7 +547,11 @@ def factorize_array( table = hash_klass(size_hint or len(values)) uniques, codes = table.factorize( - values, na_sentinel=na_sentinel, na_value=na_value, mask=mask + values, + na_sentinel=na_sentinel, + na_value=na_value, + mask=mask, + ignore_na=dropna, ) # re-cast e.g. i8->dt64/td64, uint8->bool @@ -728,11 +738,19 @@ def factorize( if not isinstance(values.dtype, np.dtype): # i.e. ExtensionDtype - codes, uniques = values.factorize(na_sentinel=na_sentinel) + # TODO: pass ignore_na=dropna. 
When sort is True we ignore_na here and append + # on the end because safe_sort does not handle null values in uniques + codes, uniques = values.factorize( + na_sentinel=na_sentinel, dropna=dropna or sort + ) else: values = np.asarray(values) # convert DTA/TDA/MultiIndex + # TODO: pass ignore_na=dropna; see above codes, uniques = factorize_array( - values, na_sentinel=na_sentinel, size_hint=size_hint + values, + na_sentinel=na_sentinel, + size_hint=size_hint, + dropna=dropna or sort, ) if sort and len(uniques) > 0: @@ -740,13 +758,15 @@ def factorize( uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False ) - code_is_na = codes == na_sentinel - if not dropna and code_is_na.any(): - # na_value is set based on the dtype of uniques, and compat set to False is - # because we do not want na_value to be 0 for integers - na_value = na_value_for_dtype(uniques.dtype, compat=False) - uniques = np.append(uniques, [na_value]) - codes = np.where(code_is_na, len(uniques) - 1, codes) + if not dropna and sort: + # TODO: Can remove if we pass ignore_na=dropna; see above + code_is_na = codes == na_sentinel + if code_is_na.any(): + # na_value is set based on the dtype of uniques, and compat set to False is + # because we do not want na_value to be 0 for integers + na_value = na_value_for_dtype(uniques.dtype, compat=False) + uniques = np.append(uniques, [na_value]) + codes = np.where(code_is_na, len(uniques) - 1, codes) uniques = _reconstruct_data(uniques, original.dtype, original) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 0a48638f5cf05..eeff227ed9ba5 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -105,8 +105,11 @@ def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: return type(self)(self._data) @doc(ExtensionArray.factorize) - def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: - encoded = self._data.dictionary_encode() + def factorize( + self, na_sentinel: int = -1, dropna=True + ) -> tuple[np.ndarray, ExtensionArray]: + null_encoding = "mask" if dropna else "encode" + encoded = self._data.dictionary_encode(null_encoding=null_encoding) indices = pa.chunked_array( [c.indices for c in encoded.chunks], type=encoded.type.index_type ).to_pandas() diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index b06a46dfd1447..e057c1596765c 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1002,7 +1002,9 @@ def _values_for_factorize(self) -> tuple[np.ndarray, Any]: """ return self.astype(object), np.nan - def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: + def factorize( + self, na_sentinel: int = -1, dropna: bool = True + ) -> tuple[np.ndarray, ExtensionArray]: """ Encode the extension array as an enumerated type. @@ -1010,6 +1012,11 @@ def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: ---------- na_sentinel : int, default -1 Value to use in the `codes` array to indicate missing values. + dropna: bool, default True + Whether null values will appear in uniques. When False, null values + will receive a nonnegative code instead of na_sentinel. 
+ + ..versionadded:: 1.5.0 Returns ------- @@ -1044,7 +1051,7 @@ def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: arr, na_value = self._values_for_factorize() codes, uniques = factorize_array( - arr, na_sentinel=na_sentinel, na_value=na_value + arr, na_sentinel=na_sentinel, na_value=na_value, dropna=dropna ) uniques_ea = self._from_factorized(uniques, self) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 27260f8ed62ca..dc0aecafea56b 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1878,7 +1878,7 @@ def _with_freq(self, freq): # -------------------------------------------------------------- - def factorize(self, na_sentinel=-1, sort: bool = False): + def factorize(self, na_sentinel=-1, sort: bool = False, dropna: bool = True): if self.freq is not None: # We must be unique, so can short-circuit (and retain freq) codes = np.arange(len(self), dtype=np.intp) @@ -1888,7 +1888,8 @@ def factorize(self, na_sentinel=-1, sort: bool = False): uniques = uniques[::-1] return codes, uniques # FIXME: shouldn't get here; we are ignoring sort - return super().factorize(na_sentinel=na_sentinel) + result = super().factorize(na_sentinel=na_sentinel, dropna=dropna) + return result # ------------------------------------------------------------------- diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 95363e598a06c..537d59f2cd779 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -868,16 +868,38 @@ def searchsorted( return self._data.searchsorted(value, side=side, sorter=sorter) @doc(ExtensionArray.factorize) - def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: + def factorize( + self, na_sentinel: int = -1, dropna: bool = True + ) -> tuple[np.ndarray, ExtensionArray]: arr = self._data mask = self._mask - codes, uniques = factorize_array(arr, na_sentinel=na_sentinel, mask=mask) + codes, uniques = factorize_array( + arr, na_sentinel=na_sentinel, mask=mask, dropna=True + ) # check that factorize_array correctly preserves dtype. 
assert uniques.dtype == self.dtype.numpy_dtype, (uniques.dtype, self.dtype) - uniques_ea = type(self)(uniques, np.zeros(len(uniques), dtype=bool)) + # Make room for a null value if we're not ignoring it and it exists + size = len(uniques) if dropna or not mask.any() else len(uniques) + 1 + uniques_mask = np.zeros(size, dtype=bool) + if not dropna: + na_index = mask.argmax() + if mask[na_index]: + # Insert na with the proper code + na_code = 0 if na_index == 0 else codes[:na_index].argmax() + 1 + if na_sentinel < 0: + # codes can never equal na_sentinel and be >= na_code + codes[codes >= na_code] += 1 + else: + codes[(codes >= na_code) & (codes != na_sentinel)] += 1 + codes[codes == na_sentinel] = na_code + # dummy value for uniques; not used since uniques_mask will be True + uniques = np.insert(uniques, na_code, 0) + uniques_mask[na_code] = True + uniques_ea = type(self)(uniques, uniques_mask) + return codes, uniques_ea @doc(ExtensionArray._values_for_argsort) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index c2a13961687fd..c38baab1d9ae1 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -847,13 +847,18 @@ def _values_for_factorize(self): # Still override this for hash_pandas_object return np.asarray(self), self.fill_value - def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, SparseArray]: + def factorize( + self, na_sentinel: int = -1, dropna: bool = True + ) -> tuple[np.ndarray, SparseArray]: # Currently, ExtensionArray.factorize -> Tuple[ndarray, EA] # The sparsity on this is backwards from what Sparse would want. Want # ExtensionArray.factorize -> Tuple[EA, EA] # Given that we have to return a dense array of codes, why bother # implementing an efficient factorize? - codes, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel) + na_sentinel_arg = na_sentinel if dropna else None + codes, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel_arg) + if not dropna: + codes[codes == -1] = na_sentinel uniques_sp = SparseArray(uniques, dtype=self.dtype) return codes, uniques_sp diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 21b5dc625956e..304bc7a768bcc 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -382,6 +382,13 @@ def _values_for_factorize(self): arr[mask] = -1 return arr, -1 + @classmethod + def _from_factorized(cls, values, original): + assert values.dtype == original._ndarray.dtype + # When dropna (i.e. 
ignore_na) is False, can get -1 from nulls + values[values == -1] = None + return original._from_backing_data(values) + def __setitem__(self, key, value): value = extract_array(value, extract_numpy=True) if isinstance(value, type(self)): diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 3f37c5f0c6df9..e146e9462c6f2 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -657,9 +657,10 @@ def group_index(self) -> Index: @cache_readonly def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: - if self._passed_categorical: + if self._dropna and self._passed_categorical: # we make a CategoricalIndex out of the cat grouper - # preserving the categories / ordered attributes + # preserving the categories / ordered attributes; + # doesn't (yet) handle dropna=False cat = self.grouping_vector categories = cat.categories diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 9fcc28d8f65a9..d79935538a24b 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -348,3 +348,52 @@ def test_groupby_nan_included(): tm.assert_numpy_array_equal(result_values, expected_values) assert np.isnan(list(result.keys())[2]) assert list(result.keys())[0:2] == ["g1", "g2"] + + +@pytest.mark.parametrize( + "key", + [ + pd.Series([2, np.nan, 1, 2]), + pd.Series([2, np.nan, 1, 2], dtype="UInt8"), + pd.Series([2, np.nan, 1, 2], dtype="Int8"), + pd.Series([2, np.nan, 1, 2], dtype="UInt16"), + pd.Series([2, np.nan, 1, 2], dtype="Int16"), + pd.Series([2, np.nan, 1, 2], dtype="UInt32"), + pd.Series([2, np.nan, 1, 2], dtype="Int32"), + pd.Series([2, np.nan, 1, 2], dtype="UInt64"), + pd.Series([2, np.nan, 1, 2], dtype="Int64"), + pd.Series([2, np.nan, 1, 2], dtype="Float32"), + pd.Series([2, np.nan, 1, 2], dtype="Int64"), + pd.Series([2, np.nan, 1, 2], dtype="Float64"), + pd.Series(["y", None, "x", "y"], dtype="category"), + pd.Series(["y", pd.NA, "x", "y"], dtype="string"), + pd.Series(["y", pd.NA, "x", "y"], dtype="string[pyarrow]"), + pd.Series( + ["2016-01-01", np.datetime64("NaT"), "2017-01-01", "2016-01-01"], + dtype="datetime64[ns]", + ), + pd.Series( + [ + pd.Period("2012-02-01", freq="D"), + pd.NA, + pd.Period("2012-01-01", freq="D"), + pd.Period("2012-02-01", freq="D"), + ] + ), + pd.Series(pd.arrays.SparseArray([2, np.nan, 1, 2])), + ], +) +@pytest.mark.parametrize("test_series", [True, False]) +def test_no_sort_keep_na(key, test_series): + df = pd.DataFrame({"key": key, "a": [1, 2, 3, 4]}) + gb = df.groupby("key", dropna=False, sort=False) + if test_series: + gb = gb["a"] + result = gb.sum() + expected = pd.DataFrame({"a": [5, 2, 3]}, index=key[:-1].rename("key")) + if test_series: + expected = expected["a"] + if expected.index.is_categorical(): + # TODO: Slicing reorders categories? 
+ expected.index = expected.index.reorder_categories(["y", "x"]) + tm.assert_equal(result, expected) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 237d6a96f39ec..779964a5da28a 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1311,12 +1311,8 @@ def test_transform_cumcount(): @pytest.mark.parametrize("keys", [["A1"], ["A1", "A2"]]) -def test_null_group_lambda_self(request, sort, dropna, keys): +def test_null_group_lambda_self(sort, dropna, keys): # GH 17093 - if not sort and not dropna: - msg = "GH#46584: null values get sorted when sort=False" - request.node.add_marker(pytest.mark.xfail(reason=msg, strict=False)) - size = 50 nulls1 = np.random.choice([False, True], size) nulls2 = np.random.choice([False, True], size) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 2d73b8e91e831..e0b18bd607da1 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -436,13 +436,13 @@ def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques): [ ( ["a", None, "b", "a"], - np.array([0, 2, 1, 0], dtype=np.dtype("intp")), - np.array(["a", "b", np.nan], dtype=object), + np.array([0, 1, 2, 0], dtype=np.dtype("intp")), + np.array(["a", None, "b"], dtype=object), ), ( ["a", np.nan, "b", "a"], - np.array([0, 2, 1, 0], dtype=np.dtype("intp")), - np.array(["a", "b", np.nan], dtype=object), + np.array([0, 1, 2, 0], dtype=np.dtype("intp")), + np.array(["a", np.nan, "b"], dtype=object), ), ], ) @@ -459,13 +459,13 @@ def test_object_factorize_na_sentinel_none( [ ( [1, None, 1, 2], - np.array([0, 2, 0, 1], dtype=np.dtype("intp")), - np.array([1, 2, np.nan], dtype="O"), + np.array([0, 1, 0, 2], dtype=np.dtype("intp")), + np.array([1, None, 2], dtype="O"), ), ( [1, np.nan, 1, 2], - np.array([0, 2, 0, 1], dtype=np.dtype("intp")), - np.array([1, 2, np.nan], dtype=np.float64), + np.array([0, 1, 0, 2], dtype=np.dtype("intp")), + np.array([1, np.nan, 2], dtype=np.float64), ), ], ) From 98c6c18aa4e82d4fe20f068aaf5086f18beaabf0 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 19 Apr 2022 19:25:35 -0400 Subject: [PATCH 02/16] Implementation for pyarrow < 4.0 --- pandas/core/arrays/arrow/array.py | 21 +++++++++++++++++++-- pandas/core/arrays/base.py | 2 +- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index eeff227ed9ba5..6d694683f3fd4 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -15,6 +15,7 @@ from pandas.compat import ( pa_version_under1p01, pa_version_under2p0, + pa_version_under4p0, pa_version_under5p0, ) from pandas.util._decorators import doc @@ -108,8 +109,11 @@ def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: def factorize( self, na_sentinel: int = -1, dropna=True ) -> tuple[np.ndarray, ExtensionArray]: - null_encoding = "mask" if dropna else "encode" - encoded = self._data.dictionary_encode(null_encoding=null_encoding) + if pa_version_under4p0: + encoded = self._data.dictionary_encode() + else: + null_encoding = "mask" if dropna else "encode" + encoded = self._data.dictionary_encode(null_encoding=null_encoding) indices = pa.chunked_array( [c.indices for c in encoded.chunks], type=encoded.type.index_type ).to_pandas() @@ -119,6 +123,19 @@ def factorize( if encoded.num_chunks: uniques = type(self)(encoded.chunk(0).dictionary) + if pa_version_under4p0: + # Insert na with the 
proper code + na_mask = indices.values == na_sentinel + na_index = na_mask.argmax() + if na_mask[na_index]: + uniques = uniques.insert(na_index, self.dtype.na_value) + na_code = 0 if na_index == 0 else indices[:na_index].argmax() + 1 + if na_sentinel < 0: + # codes can never equal na_sentinel and be >= na_code + indices[indices >= na_code] += 1 + else: + indices[(indices >= na_code) & (indices != na_sentinel)] += 1 + indices[indices == na_sentinel] = na_code else: uniques = type(self)(pa.array([], type=encoded.type.value_type)) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index e057c1596765c..642ad07887d9f 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1012,7 +1012,7 @@ def factorize( ---------- na_sentinel : int, default -1 Value to use in the `codes` array to indicate missing values. - dropna: bool, default True + dropna : bool, default True Whether null values will appear in uniques. When False, null values will receive a nonnegative code instead of na_sentinel. From 007329badc93d101ae85fc4360a085581a5ad5e1 Mon Sep 17 00:00:00 2001 From: richard Date: Tue, 19 Apr 2022 22:21:13 -0400 Subject: [PATCH 03/16] fixup --- pandas/core/arrays/arrow/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 6d694683f3fd4..54f183bc2c493 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -123,7 +123,7 @@ def factorize( if encoded.num_chunks: uniques = type(self)(encoded.chunk(0).dictionary) - if pa_version_under4p0: + if not dropna and pa_version_under4p0: # Insert na with the proper code na_mask = indices.values == na_sentinel na_index = na_mask.argmax() From ffaf20c00c8d0762c19bb097c7cda64e9dc999e0 Mon Sep 17 00:00:00 2001 From: richard Date: Tue, 19 Apr 2022 22:39:33 -0400 Subject: [PATCH 04/16] fixups --- pandas/core/algorithms.py | 6 +++--- pandas/core/arrays/base.py | 4 ++-- pandas/tests/groupby/test_groupby_dropna.py | 9 ++++++++- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 88969502b99fd..2271c8939e400 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -524,11 +524,11 @@ def factorize_array( If not None, the mask is used as indicator for missing values (True = missing, False = valid) instead of `na_value` or condition "val != val". - dropna: bool, default True - Whether null values will appear in uniques. When False, null values + dropna : bool, default True + Whether NA values will appear in uniques. When False, NA values will receive a nonnegative code instead of na_sentinel. - ..versionadded:: 1.5.0 + .. versionadded:: 1.5.0 Returns ------- diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 642ad07887d9f..2eff7dffd3f0d 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1013,10 +1013,10 @@ def factorize( na_sentinel : int, default -1 Value to use in the `codes` array to indicate missing values. dropna : bool, default True - Whether null values will appear in uniques. When False, null values + Whether NA values will appear in uniques. When False, NA values will receive a nonnegative code instead of na_sentinel. - ..versionadded:: 1.5.0 + .. 
versionadded:: 1.5.0 Returns ------- diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index d79935538a24b..30b6a9d43cb44 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.compat.pyarrow import pa_version_under1p01 + import pandas as pd import pandas._testing as tm @@ -367,7 +369,12 @@ def test_groupby_nan_included(): pd.Series([2, np.nan, 1, 2], dtype="Float64"), pd.Series(["y", None, "x", "y"], dtype="category"), pd.Series(["y", pd.NA, "x", "y"], dtype="string"), - pd.Series(["y", pd.NA, "x", "y"], dtype="string[pyarrow]"), + pytest.param( + pd.Series(["y", pd.NA, "x", "y"], dtype="string[pyarrow]"), + marks=pytest.mark.skipif( + pa_version_under1p01, reason="pyarrow is not installed" + ), + ), pd.Series( ["2016-01-01", np.datetime64("NaT"), "2017-01-01", "2016-01-01"], dtype="datetime64[ns]", From f7326bdeea2c98cde44d95d8baff654f2226c5ca Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 23 Apr 2022 10:58:13 -0400 Subject: [PATCH 05/16] test fixup --- pandas/tests/groupby/test_groupby_dropna.py | 48 +++++++++++---------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 30b6a9d43cb44..f89c08e91793c 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -353,45 +353,49 @@ def test_groupby_nan_included(): @pytest.mark.parametrize( - "key", + "values, dtype", [ - pd.Series([2, np.nan, 1, 2]), - pd.Series([2, np.nan, 1, 2], dtype="UInt8"), - pd.Series([2, np.nan, 1, 2], dtype="Int8"), - pd.Series([2, np.nan, 1, 2], dtype="UInt16"), - pd.Series([2, np.nan, 1, 2], dtype="Int16"), - pd.Series([2, np.nan, 1, 2], dtype="UInt32"), - pd.Series([2, np.nan, 1, 2], dtype="Int32"), - pd.Series([2, np.nan, 1, 2], dtype="UInt64"), - pd.Series([2, np.nan, 1, 2], dtype="Int64"), - pd.Series([2, np.nan, 1, 2], dtype="Float32"), - pd.Series([2, np.nan, 1, 2], dtype="Int64"), - pd.Series([2, np.nan, 1, 2], dtype="Float64"), - pd.Series(["y", None, "x", "y"], dtype="category"), - pd.Series(["y", pd.NA, "x", "y"], dtype="string"), + ([2, np.nan, 1, 2], None), + ([2, np.nan, 1, 2], "UInt8"), + ([2, np.nan, 1, 2], "Int8"), + ([2, np.nan, 1, 2], "UInt16"), + ([2, np.nan, 1, 2], "Int16"), + ([2, np.nan, 1, 2], "UInt32"), + ([2, np.nan, 1, 2], "Int32"), + ([2, np.nan, 1, 2], "UInt64"), + ([2, np.nan, 1, 2], "Int64"), + ([2, np.nan, 1, 2], "Float32"), + ([2, np.nan, 1, 2], "Int64"), + ([2, np.nan, 1, 2], "Float64"), + (["y", None, "x", "y"], "category"), + (["y", pd.NA, "x", "y"], "string"), pytest.param( - pd.Series(["y", pd.NA, "x", "y"], dtype="string[pyarrow]"), + ["y", pd.NA, "x", "y"], + "string[pyarrow]", marks=pytest.mark.skipif( pa_version_under1p01, reason="pyarrow is not installed" ), ), - pd.Series( + ( ["2016-01-01", np.datetime64("NaT"), "2017-01-01", "2016-01-01"], - dtype="datetime64[ns]", + "datetime64[ns]", ), - pd.Series( + ( [ pd.Period("2012-02-01", freq="D"), pd.NA, pd.Period("2012-01-01", freq="D"), pd.Period("2012-02-01", freq="D"), - ] + ], + None, ), - pd.Series(pd.arrays.SparseArray([2, np.nan, 1, 2])), + (pd.arrays.SparseArray([2, np.nan, 1, 2]), None), ], ) @pytest.mark.parametrize("test_series", [True, False]) -def test_no_sort_keep_na(key, test_series): +def test_no_sort_keep_na(values, dtype, test_series): + # GH#46584 + key = pd.Series(values, 
dtype=dtype) df = pd.DataFrame({"key": key, "a": [1, 2, 3, 4]}) gb = df.groupby("key", dropna=False, sort=False) if test_series: From b0ec48adab544ec27af1f3dd189ac33679f8cbb7 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 24 Apr 2022 10:19:09 -0400 Subject: [PATCH 06/16] type-hints --- pandas/core/arrays/datetimelike.py | 2 +- pandas/core/arrays/masked.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index dc0aecafea56b..4a3fa0659b86e 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1878,7 +1878,7 @@ def _with_freq(self, freq): # -------------------------------------------------------------- - def factorize(self, na_sentinel=-1, sort: bool = False, dropna: bool = True): + def factorize(self, na_sentinel: int = -1, dropna: bool = True, sort: bool = False): if self.freq is not None: # We must be unique, so can short-circuit (and retain freq) codes = np.arange(len(self), dtype=np.intp) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 537d59f2cd779..bebd92168ebdc 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -888,7 +888,12 @@ def factorize( na_index = mask.argmax() if mask[na_index]: # Insert na with the proper code - na_code = 0 if na_index == 0 else codes[:na_index].argmax() + 1 + if na_index == 0: + na_code = np.intp(0) + else: + # error: Slice index must be an integer or None + # https://github.com/python/mypy/issues/2410 + na_code = codes[:na_index].argmax() + 1 # type: ignore[misc] if na_sentinel < 0: # codes can never equal na_sentinel and be >= na_code codes[codes >= na_code] += 1 From cadab86a6818a2ad678130b84cdf9f12ed93f87c Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 30 Apr 2022 19:00:46 -0400 Subject: [PATCH 07/16] improvements --- pandas/core/arrays/arrow/array.py | 3 ++- pandas/core/arrays/string_.py | 6 ++---- pandas/core/groupby/grouper.py | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index f7aae4780305a..2fcee9e261404 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -124,7 +124,7 @@ def dropna(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: @doc(ExtensionArray.factorize) def factorize( - self, na_sentinel: int = -1, dropna=True + self, na_sentinel: int = -1, dropna: bool = True ) -> tuple[np.ndarray, ExtensionArray]: if pa_version_under4p0: encoded = self._data.dictionary_encode() @@ -141,6 +141,7 @@ def factorize( if encoded.num_chunks: uniques = type(self)(encoded.chunk(0).dictionary) if not dropna and pa_version_under4p0: + # TODO: share logic with BaseMaskedArray.factorize # Insert na with the proper code na_mask = indices.values == na_sentinel na_index = na_mask.argmax() diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 304bc7a768bcc..86f8828f2aea5 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -379,14 +379,12 @@ def __arrow_array__(self, type=None): def _values_for_factorize(self): arr = self._ndarray.copy() mask = self.isna() - arr[mask] = -1 - return arr, -1 + arr[mask] = None + return arr, None @classmethod def _from_factorized(cls, values, original): assert values.dtype == original._ndarray.dtype - # When dropna (i.e. 
ignore_na) is False, can get -1 from nulls - values[values == -1] = None return original._from_backing_data(values) def __setitem__(self, key, value): diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index e146e9462c6f2..623b1d6a5aa4c 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -660,7 +660,7 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: if self._dropna and self._passed_categorical: # we make a CategoricalIndex out of the cat grouper # preserving the categories / ordered attributes; - # doesn't (yet) handle dropna=False + # doesn't (yet - GH#46909) handle dropna=False cat = self.grouping_vector categories = cat.categories From f44b7f3483e74e7346775a98e4b71e075c21053c Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 30 Apr 2022 19:29:12 -0400 Subject: [PATCH 08/16] remove override of _from_factorized in string_.py --- pandas/core/arrays/string_.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 86f8828f2aea5..0593edeb6645b 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -382,11 +382,6 @@ def _values_for_factorize(self): arr[mask] = None return arr, None - @classmethod - def _from_factorized(cls, values, original): - assert values.dtype == original._ndarray.dtype - return original._from_backing_data(values) - def __setitem__(self, key, value): value = extract_array(value, extract_numpy=True) if isinstance(value, type(self)): From ef49c74e7a51a240fcd594d50ef0a3b5f4d315e7 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 5 May 2022 17:28:53 -0400 Subject: [PATCH 09/16] Rework na_sentinel/dropna/ignore_na --- pandas/core/algorithms.py | 56 +++++++++++++++++------------- pandas/core/arrays/arrow/array.py | 16 ++++----- pandas/core/arrays/base.py | 13 ++++--- pandas/core/arrays/datetimelike.py | 5 ++- pandas/core/arrays/masked.py | 47 ++++++++++++------------- pandas/core/arrays/sparse/array.py | 9 ++--- 6 files changed, 71 insertions(+), 75 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c4d123d1e2cd9..831bb70b25615 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -498,11 +498,10 @@ def f(c, v): def factorize_array( values: np.ndarray, - na_sentinel: int = -1, + na_sentinel: int | None = -1, size_hint: int | None = None, na_value: object = None, mask: npt.NDArray[np.bool_] | None = None, - dropna: bool = True, ) -> tuple[npt.NDArray[np.intp], np.ndarray]: """ Factorize a numpy array to codes and uniques. @@ -512,7 +511,10 @@ def factorize_array( Parameters ---------- values : ndarray - na_sentinel : int, default -1 + na_sentinel : int or None, default -1 + Code to use for NA values. When None, no sentinel will be used. Instead, NA + values will be coded as a non-negative integer and the NA value will appear in + uniques. size_hint : int, optional Passed through to the hashtable's 'get_labels' method na_value : object, optional @@ -524,17 +526,16 @@ def factorize_array( If not None, the mask is used as indicator for missing values (True = missing, False = valid) instead of `na_value` or condition "val != val". - dropna : bool, default True - Whether NA values will appear in uniques. When False, NA values - will receive a nonnegative code instead of na_sentinel. - - .. 
versionadded:: 1.5.0 Returns ------- codes : ndarray[np.intp] uniques : ndarray """ + ignore_na = na_sentinel is not None + if not ignore_na: + na_sentinel = -1 + original = values if values.dtype.kind in ["m", "M"]: # _get_hashtable_algo will cast dt64/td64 to i8 via _ensure_data, so we @@ -551,7 +552,7 @@ def factorize_array( na_sentinel=na_sentinel, na_value=na_value, mask=mask, - ignore_na=dropna, + ignore_na=ignore_na, ) # re-cast e.g. i8->dt64/td64, uint8->bool @@ -720,12 +721,7 @@ def factorize( if not isinstance(values, ABCMultiIndex): values = extract_array(values, extract_numpy=True) - # GH35667, if na_sentinel=None, we will not dropna NaNs from the uniques - # of values, assign na_sentinel=-1 to replace code value for NaN. - dropna = True - if na_sentinel is None: - na_sentinel = -1 - dropna = False + dropna = na_sentinel is not None if ( isinstance(values, (ABCDatetimeArray, ABCTimedeltaArray)) @@ -738,29 +734,39 @@ def factorize( if not isinstance(values.dtype, np.dtype): # i.e. ExtensionDtype - # TODO: pass ignore_na=dropna. When sort is True we ignore_na here and append - # on the end because safe_sort does not handle null values in uniques - codes, uniques = values.factorize( - na_sentinel=na_sentinel, dropna=dropna or sort - ) + codes, uniques = values.factorize(na_sentinel=na_sentinel) else: values = np.asarray(values) # convert DTA/TDA/MultiIndex - # TODO: pass ignore_na=dropna; see above + # TODO: pass na_sentinel=na_sentinel to factorize_array. When sort is True and + # na_sentinel is None we append NA on the end because safe_sort does not + # handle null values in uniques. + if na_sentinel is None and sort: + na_sentinel_arg = -1 + elif na_sentinel is None: + na_sentinel_arg = None + else: + na_sentinel_arg = na_sentinel codes, uniques = factorize_array( values, - na_sentinel=na_sentinel, + na_sentinel=na_sentinel_arg, size_hint=size_hint, - dropna=dropna or sort, ) if sort and len(uniques) > 0: + if na_sentinel is None: + # TODO: Can remove when na_sentinel=na_sentinel in TODO above + na_sentinel = -1 uniques, codes = safe_sort( uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False ) if not dropna and sort: - # TODO: Can remove if we pass ignore_na=dropna; see above - code_is_na = codes == na_sentinel + # TODO: Can remove entire block when na_sentinel=na_sentinel in TODO above + if na_sentinel is None: + na_sentinel_arg = -1 + else: + na_sentinel_arg = na_sentinel + code_is_na = codes == na_sentinel_arg if code_is_na.any(): # na_value is set based on the dtype of uniques, and compat set to False is # because we do not want na_value to be 0 for integers diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 2fcee9e261404..b3880ece10983 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -124,12 +124,12 @@ def dropna(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: @doc(ExtensionArray.factorize) def factorize( - self, na_sentinel: int = -1, dropna: bool = True + self, na_sentinel: int | None = -1 ) -> tuple[np.ndarray, ExtensionArray]: if pa_version_under4p0: encoded = self._data.dictionary_encode() else: - null_encoding = "mask" if dropna else "encode" + null_encoding = "mask" if na_sentinel is not None else "encode" encoded = self._data.dictionary_encode(null_encoding=null_encoding) indices = pa.chunked_array( [c.indices for c in encoded.chunks], type=encoded.type.index_type @@ -140,20 +140,16 @@ def factorize( if encoded.num_chunks: uniques = 
type(self)(encoded.chunk(0).dictionary) - if not dropna and pa_version_under4p0: + if na_sentinel is None and pa_version_under4p0: # TODO: share logic with BaseMaskedArray.factorize # Insert na with the proper code - na_mask = indices.values == na_sentinel + na_mask = indices.values == -1 na_index = na_mask.argmax() if na_mask[na_index]: uniques = uniques.insert(na_index, self.dtype.na_value) na_code = 0 if na_index == 0 else indices[:na_index].argmax() + 1 - if na_sentinel < 0: - # codes can never equal na_sentinel and be >= na_code - indices[indices >= na_code] += 1 - else: - indices[(indices >= na_code) & (indices != na_sentinel)] += 1 - indices[indices == na_sentinel] = na_code + indices[indices >= na_code] += 1 + indices[indices == -1] = na_code else: uniques = type(self)(pa.array([], type=encoded.type.value_type)) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 8524e28353e80..a834c54a3a82f 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1003,20 +1003,19 @@ def _values_for_factorize(self) -> tuple[np.ndarray, Any]: return self.astype(object), np.nan def factorize( - self, na_sentinel: int = -1, dropna: bool = True + self, na_sentinel: int | None = -1 ) -> tuple[np.ndarray, ExtensionArray]: """ Encode the extension array as an enumerated type. Parameters ---------- - na_sentinel : int, default -1 + na_sentinel : int or None, default -1 Value to use in the `codes` array to indicate missing values. - dropna : bool, default True - Whether NA values will appear in uniques. When False, NA values - will receive a nonnegative code instead of na_sentinel. + When None, no sentinel will be used. Instead, NA values will be coded as a + non-negative integer and the NA value will appear in uniques. - .. versionadded:: 1.5.0 + .. 
versionchanged:: 1.5.0 Returns ------- @@ -1051,7 +1050,7 @@ def factorize( arr, na_value = self._values_for_factorize() codes, uniques = factorize_array( - arr, na_sentinel=na_sentinel, na_value=na_value, dropna=dropna + arr, na_sentinel=na_sentinel, na_value=na_value ) uniques_ea = self._from_factorized(uniques, self) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 6135b78580233..4ef9377d56afd 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1878,7 +1878,7 @@ def _with_freq(self, freq): # -------------------------------------------------------------- - def factorize(self, na_sentinel: int = -1, dropna: bool = True, sort: bool = False): + def factorize(self, na_sentinel: int | None = -1, sort: bool = False): if self.freq is not None: # We must be unique, so can short-circuit (and retain freq) codes = np.arange(len(self), dtype=np.intp) @@ -1888,8 +1888,7 @@ def factorize(self, na_sentinel: int = -1, dropna: bool = True, sort: bool = Fal uniques = uniques[::-1] return codes, uniques # FIXME: shouldn't get here; we are ignoring sort - result = super().factorize(na_sentinel=na_sentinel, dropna=dropna) - return result + return super().factorize(na_sentinel=na_sentinel) # ------------------------------------------------------------------- diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 17b44c9de6e99..69f7a034176bb 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -870,40 +870,39 @@ def searchsorted( @doc(ExtensionArray.factorize) def factorize( - self, na_sentinel: int = -1, dropna: bool = True + self, na_sentinel: int | None = -1 ) -> tuple[np.ndarray, ExtensionArray]: arr = self._data mask = self._mask - codes, uniques = factorize_array( - arr, na_sentinel=na_sentinel, mask=mask, dropna=True - ) + # Pass non-None na_sentinel; recode and add NA to uniques if necessary below + na_sentinel_arg = -1 if na_sentinel is None else na_sentinel + codes, uniques = factorize_array(arr, na_sentinel=na_sentinel_arg, mask=mask) # check that factorize_array correctly preserves dtype. 
assert uniques.dtype == self.dtype.numpy_dtype, (uniques.dtype, self.dtype) - # Make room for a null value if we're not ignoring it and it exists - size = len(uniques) if dropna or not mask.any() else len(uniques) + 1 + has_na = mask.any() + if na_sentinel is not None or not has_na: + size = len(uniques) + else: + # Make room for an NA value + size = len(uniques) + 1 uniques_mask = np.zeros(size, dtype=bool) - if not dropna: + if na_sentinel is None and has_na: na_index = mask.argmax() - if mask[na_index]: - # Insert na with the proper code - if na_index == 0: - na_code = np.intp(0) - else: - # error: Slice index must be an integer or None - # https://github.com/python/mypy/issues/2410 - na_code = codes[:na_index].argmax() + 1 # type: ignore[misc] - if na_sentinel < 0: - # codes can never equal na_sentinel and be >= na_code - codes[codes >= na_code] += 1 - else: - codes[(codes >= na_code) & (codes != na_sentinel)] += 1 - codes[codes == na_sentinel] = na_code - # dummy value for uniques; not used since uniques_mask will be True - uniques = np.insert(uniques, na_code, 0) - uniques_mask[na_code] = True + # Insert na with the proper code + if na_index == 0: + na_code = np.intp(0) + else: + # mypy error: Slice index must be an integer or None + # https://github.com/python/mypy/issues/2410 + na_code = codes[:na_index].argmax() + 1 # type: ignore[misc] + codes[codes >= na_code] += 1 + codes[codes == -1] = na_code + # dummy value for uniques; not used since uniques_mask will be True + uniques = np.insert(uniques, na_code, 0) + uniques_mask[na_code] = True uniques_ea = type(self)(uniques, uniques_mask) return codes, uniques_ea diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index c38baab1d9ae1..3a655541ddee4 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -847,17 +847,14 @@ def _values_for_factorize(self): # Still override this for hash_pandas_object return np.asarray(self), self.fill_value - def factorize( - self, na_sentinel: int = -1, dropna: bool = True - ) -> tuple[np.ndarray, SparseArray]: + def factorize(self, na_sentinel: int | None = -1) -> tuple[np.ndarray, SparseArray]: # Currently, ExtensionArray.factorize -> Tuple[ndarray, EA] # The sparsity on this is backwards from what Sparse would want. Want # ExtensionArray.factorize -> Tuple[EA, EA] # Given that we have to return a dense array of codes, why bother # implementing an efficient factorize? - na_sentinel_arg = na_sentinel if dropna else None - codes, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel_arg) - if not dropna: + codes, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel) + if na_sentinel is not None: codes[codes == -1] = na_sentinel uniques_sp = SparseArray(uniques, dtype=self.dtype) return codes, uniques_sp From 8378ba047edf521846d6c7ea1b99e8edf5350785 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 5 May 2022 17:33:36 -0400 Subject: [PATCH 10/16] fixups --- pandas/core/algorithms.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 831bb70b25615..f3bb4e6157da7 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -721,6 +721,8 @@ def factorize( if not isinstance(values, ABCMultiIndex): values = extract_array(values, extract_numpy=True) + # GH35667, if na_sentinel=None, we will not dropna NaNs from the uniques + # of values, assign na_sentinel=-1 to replace code value for NaN. 
dropna = na_sentinel is not None if ( @@ -754,14 +756,14 @@ def factorize( if sort and len(uniques) > 0: if na_sentinel is None: - # TODO: Can remove when na_sentinel=na_sentinel in TODO above + # TODO: Can remove when na_sentinel=na_sentinel as in TODO above na_sentinel = -1 uniques, codes = safe_sort( uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False ) if not dropna and sort: - # TODO: Can remove entire block when na_sentinel=na_sentinel in TODO above + # TODO: Can remove entire block when na_sentinel=na_sentinel as in TODO above if na_sentinel is None: na_sentinel_arg = -1 else: From f2e24dfd3d7022528d3e5e72ac0a494945fabacb Mon Sep 17 00:00:00 2001 From: richard Date: Thu, 5 May 2022 21:56:00 -0400 Subject: [PATCH 11/16] fixup for pyarrow < 4.0 --- pandas/core/arrays/arrow/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index b3880ece10983..62f16b75ba503 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -135,7 +135,7 @@ def factorize( [c.indices for c in encoded.chunks], type=encoded.type.index_type ).to_pandas() if indices.dtype.kind == "f": - indices[np.isnan(indices)] = na_sentinel + indices[np.isnan(indices)] = na_sentinel if na_sentinel is not None else -1 indices = indices.astype(np.int64, copy=False) if encoded.num_chunks: From b51e88ff375d0785420149a4fb92a6f986a8c00b Mon Sep 17 00:00:00 2001 From: richard Date: Fri, 6 May 2022 06:10:20 -0400 Subject: [PATCH 12/16] whatsnew note --- doc/source/whatsnew/v1.5.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index e74dadd43f9da..ce8aeb449573a 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -125,7 +125,7 @@ Other enhancements - Added ``validate`` argument to :meth:`DataFrame.join` (:issue:`46622`) - A :class:`errors.PerformanceWarning` is now thrown when using ``string[pyarrow]`` dtype with methods that don't dispatch to ``pyarrow.compute`` methods (:issue:`42613`) - Added ``numeric_only`` argument to :meth:`Resampler.sum`, :meth:`Resampler.prod`, :meth:`Resampler.min`, :meth:`Resampler.max`, :meth:`Resampler.first`, and :meth:`Resampler.last` (:issue:`46442`) -- The method :meth:`.ExtensionArray.factorize` has gained the argument ``dropna`` for determining how null values are to be treated. (:issue:`46601`) +- The method :meth:`.ExtensionArray.factorize` can now be passed `na_sentinel=None` for determining how null values are to be treated. (:issue:`46601`) .. --------------------------------------------------------------------------- .. 
_whatsnew_150.notable_bug_fixes: From 4a36bf07d2b5b15f47eccd45d3402563ff55b279 Mon Sep 17 00:00:00 2001 From: richard Date: Sat, 7 May 2022 09:35:23 -0400 Subject: [PATCH 13/16] doc fixup --- doc/source/whatsnew/v1.5.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index ce8aeb449573a..7d0d2fb4253fb 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -125,7 +125,7 @@ Other enhancements - Added ``validate`` argument to :meth:`DataFrame.join` (:issue:`46622`) - A :class:`errors.PerformanceWarning` is now thrown when using ``string[pyarrow]`` dtype with methods that don't dispatch to ``pyarrow.compute`` methods (:issue:`42613`) - Added ``numeric_only`` argument to :meth:`Resampler.sum`, :meth:`Resampler.prod`, :meth:`Resampler.min`, :meth:`Resampler.max`, :meth:`Resampler.first`, and :meth:`Resampler.last` (:issue:`46442`) -- The method :meth:`.ExtensionArray.factorize` can now be passed `na_sentinel=None` for determining how null values are to be treated. (:issue:`46601`) +- The method :meth:`.ExtensionArray.factorize` can now be passed ``na_sentinel=None`` for determining how null values are to be treated. (:issue:`46601`) .. --------------------------------------------------------------------------- .. _whatsnew_150.notable_bug_fixes: From cf561351c21caf4686122fa5a4fb1031bdc0bf57 Mon Sep 17 00:00:00 2001 From: richard Date: Sun, 26 Jun 2022 22:43:09 -0400 Subject: [PATCH 14/16] fixups --- pandas/core/algorithms.py | 15 ++++++--------- pandas/core/arrays/arrow/array.py | 10 +++++----- pandas/core/arrays/base.py | 12 ++---------- pandas/core/arrays/masked.py | 10 +++------- pandas/core/arrays/sparse/array.py | 2 ++ 5 files changed, 18 insertions(+), 31 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index db158e9cf9daf..9234bf8e65ff0 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -522,10 +522,7 @@ def factorize_array( Parameters ---------- values : ndarray - na_sentinel : int or None, default -1 - Code to use for NA values. When None, no sentinel will be used. Instead, NA - values will be coded as a non-negative integer and the NA value will appear in - uniques. 
+ na_sentinel : int, default -1 size_hint : int, optional Passed through to the hashtable's 'get_labels' method na_value : object, optional @@ -761,19 +758,19 @@ def factorize( elif not isinstance(values.dtype, np.dtype): if ( - na_sentinel == -1 - and "use_na_sentinel" in inspect.signature(values.factorize).parameters - ): + na_sentinel == -1 or na_sentinel is None + ) and "use_na_sentinel" in inspect.signature(values.factorize).parameters: # Avoid using catch_warnings when possible # GH#46910 - TimelikeOps has deprecated signature codes, uniques = values.factorize( # type: ignore[call-arg] - use_na_sentinel=True + use_na_sentinel=na_sentinel is not None ) else: + na_sentinel_arg = -1 if na_sentinel is None else na_sentinel with warnings.catch_warnings(): # We've already warned above warnings.filterwarnings("ignore", ".*use_na_sentinel.*", FutureWarning) - codes, uniques = values.factorize(na_sentinel=na_sentinel) + codes, uniques = values.factorize(na_sentinel=na_sentinel_arg) else: values = np.asarray(values) # convert DTA/TDA/MultiIndex diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index f28a47c08af9c..046ede0a331dd 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -298,20 +298,20 @@ def factorize( if pa_version_under4p0: encoded = self._data.dictionary_encode() else: - null_encoding = "mask" if resolved_na_sentinel else "encode" + null_encoding = "mask" if resolved_na_sentinel is not None else "encode" encoded = self._data.dictionary_encode(null_encoding=null_encoding) - - encoded = self._data.dictionary_encode() indices = pa.chunked_array( [c.indices for c in encoded.chunks], type=encoded.type.index_type ).to_pandas() if indices.dtype.kind == "f": - indices[np.isnan(indices)] = na_sentinel if na_sentinel is not None else -1 + indices[np.isnan(indices)] = ( + resolved_na_sentinel if resolved_na_sentinel is not None else -1 + ) indices = indices.astype(np.int64, copy=False) if encoded.num_chunks: uniques = type(self)(encoded.chunk(0).dictionary) - if not resolved_na_sentinel and pa_version_under4p0: + if resolved_na_sentinel is None and pa_version_under4p0: # TODO: share logic with BaseMaskedArray.factorize # Insert na with the proper code na_mask = indices.values == -1 diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 02893a272363b..35e2c759277cc 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1034,12 +1034,8 @@ def factorize( Parameters ---------- - na_sentinel : int or None, default -1 + na_sentinel : int, default -1 Value to use in the `codes` array to indicate missing values. - When None, no sentinel will be used. Instead, NA values will be coded as a - non-negative integer and the NA value will appear in uniques. - - .. versionchanged:: 1.5.0 .. deprecated:: 1.5.0 The na_sentinel argument is deprecated and @@ -1084,14 +1080,10 @@ def factorize( # 2. ExtensionArray.factorize. # Complete control over factorization. 
resolved_na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel) - if resolved_na_sentinel is None: - raise NotImplementedError("Encoding NaN values is not yet implemented") - else: - na_sentinel = resolved_na_sentinel arr, na_value = self._values_for_factorize() codes, uniques = factorize_array( - arr, na_sentinel=na_sentinel, na_value=na_value + arr, na_sentinel=resolved_na_sentinel, na_value=na_value ) uniques_ea = self._from_factorized(uniques, self) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 8f36ea34dc092..0126efa9bd347 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -875,28 +875,24 @@ def factorize( use_na_sentinel: bool | lib.NoDefault = lib.no_default, ) -> tuple[np.ndarray, ExtensionArray]: resolved_na_sentinel = algos.resolve_na_sentinel(na_sentinel, use_na_sentinel) - if resolved_na_sentinel is None: - raise NotImplementedError("Encoding NaN values is not yet implemented") - else: - na_sentinel = resolved_na_sentinel arr = self._data mask = self._mask # Pass non-None na_sentinel; recode and add NA to uniques if necessary below - na_sentinel_arg = -1 if na_sentinel is None else na_sentinel + na_sentinel_arg = -1 if resolved_na_sentinel is None else resolved_na_sentinel codes, uniques = factorize_array(arr, na_sentinel=na_sentinel_arg, mask=mask) # check that factorize_array correctly preserves dtype. assert uniques.dtype == self.dtype.numpy_dtype, (uniques.dtype, self.dtype) has_na = mask.any() - if na_sentinel is not None or not has_na: + if resolved_na_sentinel is not None or not has_na: size = len(uniques) else: # Make room for an NA value size = len(uniques) + 1 uniques_mask = np.zeros(size, dtype=bool) - if na_sentinel is None and has_na: + if resolved_na_sentinel is None and has_na: na_index = mask.argmax() # Insert na with the proper code if na_index == 0: diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 37c4b8b29fc0c..0218f674c3ddf 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -861,6 +861,8 @@ def factorize( codes, uniques = algos.factorize( np.asarray(self), na_sentinel=na_sentinel, use_na_sentinel=use_na_sentinel ) + if na_sentinel is lib.no_default: + na_sentinel = -1 if use_na_sentinel is lib.no_default or use_na_sentinel: codes[codes == -1] = na_sentinel uniques_sp = SparseArray(uniques, dtype=self.dtype) From 9c35dd0844b1cfc24330857547b8db7388ce6457 Mon Sep 17 00:00:00 2001 From: richard Date: Mon, 11 Jul 2022 16:34:12 -0400 Subject: [PATCH 15/16] fixup whatsnew note --- doc/source/whatsnew/v1.5.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 8e54e58efdbcf..65baef94cc9e3 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -278,7 +278,7 @@ Other enhancements - :meth:`DatetimeIndex.astype` now supports casting timezone-naive indexes to ``datetime64[s]``, ``datetime64[ms]``, and ``datetime64[us]``, and timezone-aware indexes to the corresponding ``datetime64[unit, tzname]`` dtypes (:issue:`47579`) - :class:`Series` reducers (e.g. 
``min``, ``max``, ``sum``, ``mean``) will now successfully operate when the dtype is numeric and ``numeric_only=True`` is provided; previously this would raise a ``NotImplementedError`` (:issue:`47500`) - :meth:`RangeIndex.union` now can return a :class:`RangeIndex` instead of a :class:`Int64Index` if the resulting values are equally spaced (:issue:`47557`, :issue:`43885`) -- The method :meth:`.ExtensionArray.factorize` can now be passed ``use_na_sentinel=False`` for determining how null values are to be treated. (:issue:`46601`) +- The method :meth:`.ExtensionArray.factorize` accepts ``use_na_sentinel=False`` for determining how null values are to be treated (:issue:`46601`) - .. --------------------------------------------------------------------------- From 7143a526896ea473e3079968637d0f76641337f1 Mon Sep 17 00:00:00 2001 From: richard Date: Tue, 16 Aug 2022 21:17:39 -0400 Subject: [PATCH 16/16] whatsnew note; comment on old vs new API --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/core/algorithms.py | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 9068c4b4c23b1..df58fde15c8bf 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -926,6 +926,7 @@ Numeric - Bug in division, ``pow`` and ``mod`` operations on array-likes with ``dtype="boolean"`` not being like their ``np.bool_`` counterparts (:issue:`46063`) - Bug in multiplying a :class:`Series` with ``IntegerDtype`` or ``FloatingDtype`` by an array-like with ``timedelta64[ns]`` dtype incorrectly raising (:issue:`45622`) - Bug in :meth:`mean` where the optional dependency ``bottleneck`` causes precision loss linear in the length of the array. ``bottleneck`` has been disabled for :meth:`mean` improving the loss to log-linear but may result in a performance decrease. (:issue:`42878`) +- Bug in :func:`factorize` would convert the value ``None`` to ``np.nan`` (:issue:`46601`) Conversion ^^^^^^^^^^ diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 593efb4a50ec7..9b031fc9517c7 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -734,6 +734,10 @@ def factorize( # responsible only for factorization. All data coercion, sorting and boxing # should happen here. + # GH#46910 deprecated na_sentinel in favor of use_na_sentinel: + # na_sentinel=None corresponds to use_na_sentinel=False + # na_sentinel=-1 corresponds to use_na_sentinel=True + # Other na_sentinel values will not be supported when the deprecation is enforced. na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel) if isinstance(values, ABCRangeIndex): return values.factorize(sort=sort)
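A minimal usage sketch of the behavior this series implements (not part of the patch; it assumes the end state of the branch, where ``na_sentinel`` is deprecated in favor of ``use_na_sentinel`` per GH#46910). The expected output mirrors the updated expectations in ``test_object_factorize_na_sentinel_none``::

    import numpy as np
    import pandas as pd

    values = np.array(["a", None, "b", "a"], dtype=object)

    # With use_na_sentinel=False the null value keeps its position of first
    # appearance in `uniques` and receives a regular non-negative code, and
    # None is no longer converted to np.nan.
    codes, uniques = pd.factorize(values, use_na_sentinel=False)
    print(codes)    # [0 1 2 0]
    print(uniques)  # ['a' None 'b']

    # The default behavior is unchanged: nulls are dropped from `uniques` and
    # coded with the sentinel -1.
    codes, uniques = pd.factorize(values)
    print(codes)    # [ 0 -1  1  0]
    print(uniques)  # ['a' 'b']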
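The groupby change described in the whatsnew note and exercised by ``test_no_sort_keep_na`` can be illustrated with a second small sketch (also not part of the patch and assuming the final state of the branch): with ``dropna=False`` and ``sort=False``, a null group now stays where it is first encountered instead of being moved to the end::

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"key": [2.0, np.nan, 1.0, 2.0], "a": [1, 2, 3, 4]})

    # Groups appear in encounter order: 2.0, NaN, 1.0
    print(df.groupby("key", dropna=False, sort=False)["a"].sum())
    # key
    # 2.0    5
    # NaN    2
    # 1.0    3
    # Name: a, dtype: int64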