From 670c2e8dc2f370ff847b9ce3242dcf643072f922 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 1 Apr 2022 10:14:36 -0400 Subject: [PATCH 01/16] BUG: algos.factorize moves null values when sort=False --- doc/source/whatsnew/v1.5.0.rst | 2 + pandas/_libs/hashtable_class_helper.pxi.in | 12 ++--- pandas/core/algorithms.py | 40 +++++++++++---- pandas/core/arrays/arrow/array.py | 7 ++- pandas/core/arrays/base.py | 11 ++++- pandas/core/arrays/datetimelike.py | 5 +- pandas/core/arrays/masked.py | 28 +++++++++-- pandas/core/arrays/sparse/array.py | 9 +++- pandas/core/arrays/string_.py | 7 +++ pandas/core/groupby/grouper.py | 5 +- pandas/tests/groupby/test_groupby_dropna.py | 49 +++++++++++++++++++ .../tests/groupby/transform/test_transform.py | 6 +-- pandas/tests/test_algos.py | 16 +++--- 13 files changed, 155 insertions(+), 42 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 08500019143ed..74ab37f856144 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -96,6 +96,7 @@ Other enhancements - :meth:`pd.concat` now raises when ``levels`` contains duplicate values (:issue:`46653`) - Added ``numeric_only`` argument to :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith`, and :meth:`DataFrame.cov` (:issue:`46560`) - A :class:`errors.PerformanceWarning` is now thrown when using ``string[pyarrow]`` dtype with methods that don't dispatch to ``pyarrow.compute`` methods (:issue:`42613`) +- The method :meth:`.ExtensionArray.factorize` has gained the argument ``dropna`` for determining how null values are to be treated. (:issue:`46601`) .. --------------------------------------------------------------------------- .. _whatsnew_150.notable_bug_fixes: @@ -605,6 +606,7 @@ Groupby/resample/rolling - Bug in :meth:`SeriesGroupBy.apply` would incorrectly name its result when there was a unique group (:issue:`46369`) - Bug in :meth:`.Rolling.var` would segfault calculating weighted variance when window size was larger than data size (:issue:`46760`) - Bug in :meth:`Grouper.__repr__` where ``dropna`` was not included. Now it is (:issue:`46754`) +- Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` with ``dropna=False`` and ``sort=False`` would put any null groups at the end instead of in the order in which they are encountered (:issue:`46584`) Reshaping ^^^^^^^^^ diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 8a2b9c2f77627..f25e4ad009153 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -658,7 +658,7 @@ cdef class {{name}}HashTable(HashTable): return_inverse=return_inverse) def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1, - object na_value=None, object mask=None): + object na_value=None, object mask=None, ignore_na=True): """ Calculate unique values and labels (no sorting!) 
@@ -690,7 +690,7 @@ cdef class {{name}}HashTable(HashTable): """ uniques_vector = {{name}}Vector() return self._unique(values, uniques_vector, na_sentinel=na_sentinel, - na_value=na_value, ignore_na=True, mask=mask, + na_value=na_value, ignore_na=ignore_na, mask=mask, return_inverse=True) def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, @@ -1037,7 +1037,7 @@ cdef class StringHashTable(HashTable): return_inverse=return_inverse) def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, - object na_value=None, object mask=None): + object na_value=None, object mask=None, ignore_na=True): """ Calculate unique values and labels (no sorting!) @@ -1067,7 +1067,7 @@ cdef class StringHashTable(HashTable): """ uniques_vector = ObjectVector() return self._unique(values, uniques_vector, na_sentinel=na_sentinel, - na_value=na_value, ignore_na=True, + na_value=na_value, ignore_na=ignore_na, return_inverse=True) def get_labels(self, ndarray[object] values, ObjectVector uniques, @@ -1290,7 +1290,7 @@ cdef class PyObjectHashTable(HashTable): return_inverse=return_inverse) def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, - object na_value=None, object mask=None): + object na_value=None, object mask=None, ignore_na=True): """ Calculate unique values and labels (no sorting!) @@ -1320,7 +1320,7 @@ cdef class PyObjectHashTable(HashTable): """ uniques_vector = ObjectVector() return self._unique(values, uniques_vector, na_sentinel=na_sentinel, - na_value=na_value, ignore_na=True, + na_value=na_value, ignore_na=ignore_na, return_inverse=True) def get_labels(self, ndarray[object] values, ObjectVector uniques, diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 0c0b93f41c657..88969502b99fd 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -502,6 +502,7 @@ def factorize_array( size_hint: int | None = None, na_value=None, mask: npt.NDArray[np.bool_] | None = None, + dropna: bool = True, ) -> tuple[npt.NDArray[np.intp], np.ndarray]: """ Factorize a numpy array to codes and uniques. @@ -523,6 +524,11 @@ def factorize_array( If not None, the mask is used as indicator for missing values (True = missing, False = valid) instead of `na_value` or condition "val != val". + dropna: bool, default True + Whether null values will appear in uniques. When False, null values + will receive a nonnegative code instead of na_sentinel. + + ..versionadded:: 1.5.0 Returns ------- @@ -541,7 +547,11 @@ def factorize_array( table = hash_klass(size_hint or len(values)) uniques, codes = table.factorize( - values, na_sentinel=na_sentinel, na_value=na_value, mask=mask + values, + na_sentinel=na_sentinel, + na_value=na_value, + mask=mask, + ignore_na=dropna, ) # re-cast e.g. i8->dt64/td64, uint8->bool @@ -728,11 +738,19 @@ def factorize( if not isinstance(values.dtype, np.dtype): # i.e. ExtensionDtype - codes, uniques = values.factorize(na_sentinel=na_sentinel) + # TODO: pass ignore_na=dropna. 
When sort is True we ignore_na here and append + # on the end because safe_sort does not handle null values in uniques + codes, uniques = values.factorize( + na_sentinel=na_sentinel, dropna=dropna or sort + ) else: values = np.asarray(values) # convert DTA/TDA/MultiIndex + # TODO: pass ignore_na=dropna; see above codes, uniques = factorize_array( - values, na_sentinel=na_sentinel, size_hint=size_hint + values, + na_sentinel=na_sentinel, + size_hint=size_hint, + dropna=dropna or sort, ) if sort and len(uniques) > 0: @@ -740,13 +758,15 @@ def factorize( uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False ) - code_is_na = codes == na_sentinel - if not dropna and code_is_na.any(): - # na_value is set based on the dtype of uniques, and compat set to False is - # because we do not want na_value to be 0 for integers - na_value = na_value_for_dtype(uniques.dtype, compat=False) - uniques = np.append(uniques, [na_value]) - codes = np.where(code_is_na, len(uniques) - 1, codes) + if not dropna and sort: + # TODO: Can remove if we pass ignore_na=dropna; see above + code_is_na = codes == na_sentinel + if code_is_na.any(): + # na_value is set based on the dtype of uniques, and compat set to False is + # because we do not want na_value to be 0 for integers + na_value = na_value_for_dtype(uniques.dtype, compat=False) + uniques = np.append(uniques, [na_value]) + codes = np.where(code_is_na, len(uniques) - 1, codes) uniques = _reconstruct_data(uniques, original.dtype, original) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 0a48638f5cf05..eeff227ed9ba5 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -105,8 +105,11 @@ def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: return type(self)(self._data) @doc(ExtensionArray.factorize) - def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: - encoded = self._data.dictionary_encode() + def factorize( + self, na_sentinel: int = -1, dropna=True + ) -> tuple[np.ndarray, ExtensionArray]: + null_encoding = "mask" if dropna else "encode" + encoded = self._data.dictionary_encode(null_encoding=null_encoding) indices = pa.chunked_array( [c.indices for c in encoded.chunks], type=encoded.type.index_type ).to_pandas() diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index b06a46dfd1447..e057c1596765c 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1002,7 +1002,9 @@ def _values_for_factorize(self) -> tuple[np.ndarray, Any]: """ return self.astype(object), np.nan - def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: + def factorize( + self, na_sentinel: int = -1, dropna: bool = True + ) -> tuple[np.ndarray, ExtensionArray]: """ Encode the extension array as an enumerated type. @@ -1010,6 +1012,11 @@ def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: ---------- na_sentinel : int, default -1 Value to use in the `codes` array to indicate missing values. + dropna: bool, default True + Whether null values will appear in uniques. When False, null values + will receive a nonnegative code instead of na_sentinel. 
+ + ..versionadded:: 1.5.0 Returns ------- @@ -1044,7 +1051,7 @@ def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: arr, na_value = self._values_for_factorize() codes, uniques = factorize_array( - arr, na_sentinel=na_sentinel, na_value=na_value + arr, na_sentinel=na_sentinel, na_value=na_value, dropna=dropna ) uniques_ea = self._from_factorized(uniques, self) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 27260f8ed62ca..dc0aecafea56b 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1878,7 +1878,7 @@ def _with_freq(self, freq): # -------------------------------------------------------------- - def factorize(self, na_sentinel=-1, sort: bool = False): + def factorize(self, na_sentinel=-1, sort: bool = False, dropna: bool = True): if self.freq is not None: # We must be unique, so can short-circuit (and retain freq) codes = np.arange(len(self), dtype=np.intp) @@ -1888,7 +1888,8 @@ def factorize(self, na_sentinel=-1, sort: bool = False): uniques = uniques[::-1] return codes, uniques # FIXME: shouldn't get here; we are ignoring sort - return super().factorize(na_sentinel=na_sentinel) + result = super().factorize(na_sentinel=na_sentinel, dropna=dropna) + return result # ------------------------------------------------------------------- diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 95363e598a06c..537d59f2cd779 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -868,16 +868,38 @@ def searchsorted( return self._data.searchsorted(value, side=side, sorter=sorter) @doc(ExtensionArray.factorize) - def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: + def factorize( + self, na_sentinel: int = -1, dropna: bool = True + ) -> tuple[np.ndarray, ExtensionArray]: arr = self._data mask = self._mask - codes, uniques = factorize_array(arr, na_sentinel=na_sentinel, mask=mask) + codes, uniques = factorize_array( + arr, na_sentinel=na_sentinel, mask=mask, dropna=True + ) # check that factorize_array correctly preserves dtype. 
assert uniques.dtype == self.dtype.numpy_dtype, (uniques.dtype, self.dtype) - uniques_ea = type(self)(uniques, np.zeros(len(uniques), dtype=bool)) + # Make room for a null value if we're not ignoring it and it exists + size = len(uniques) if dropna or not mask.any() else len(uniques) + 1 + uniques_mask = np.zeros(size, dtype=bool) + if not dropna: + na_index = mask.argmax() + if mask[na_index]: + # Insert na with the proper code + na_code = 0 if na_index == 0 else codes[:na_index].argmax() + 1 + if na_sentinel < 0: + # codes can never equal na_sentinel and be >= na_code + codes[codes >= na_code] += 1 + else: + codes[(codes >= na_code) & (codes != na_sentinel)] += 1 + codes[codes == na_sentinel] = na_code + # dummy value for uniques; not used since uniques_mask will be True + uniques = np.insert(uniques, na_code, 0) + uniques_mask[na_code] = True + uniques_ea = type(self)(uniques, uniques_mask) + return codes, uniques_ea @doc(ExtensionArray._values_for_argsort) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index c2a13961687fd..c38baab1d9ae1 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -847,13 +847,18 @@ def _values_for_factorize(self): # Still override this for hash_pandas_object return np.asarray(self), self.fill_value - def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, SparseArray]: + def factorize( + self, na_sentinel: int = -1, dropna: bool = True + ) -> tuple[np.ndarray, SparseArray]: # Currently, ExtensionArray.factorize -> Tuple[ndarray, EA] # The sparsity on this is backwards from what Sparse would want. Want # ExtensionArray.factorize -> Tuple[EA, EA] # Given that we have to return a dense array of codes, why bother # implementing an efficient factorize? - codes, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel) + na_sentinel_arg = na_sentinel if dropna else None + codes, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel_arg) + if not dropna: + codes[codes == -1] = na_sentinel uniques_sp = SparseArray(uniques, dtype=self.dtype) return codes, uniques_sp diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 21b5dc625956e..304bc7a768bcc 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -382,6 +382,13 @@ def _values_for_factorize(self): arr[mask] = -1 return arr, -1 + @classmethod + def _from_factorized(cls, values, original): + assert values.dtype == original._ndarray.dtype + # When dropna (i.e. 
ignore_na) is False, can get -1 from nulls + values[values == -1] = None + return original._from_backing_data(values) + def __setitem__(self, key, value): value = extract_array(value, extract_numpy=True) if isinstance(value, type(self)): diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 3f37c5f0c6df9..e146e9462c6f2 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -657,9 +657,10 @@ def group_index(self) -> Index: @cache_readonly def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: - if self._passed_categorical: + if self._dropna and self._passed_categorical: # we make a CategoricalIndex out of the cat grouper - # preserving the categories / ordered attributes + # preserving the categories / ordered attributes; + # doesn't (yet) handle dropna=False cat = self.grouping_vector categories = cat.categories diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 9fcc28d8f65a9..d79935538a24b 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -348,3 +348,52 @@ def test_groupby_nan_included(): tm.assert_numpy_array_equal(result_values, expected_values) assert np.isnan(list(result.keys())[2]) assert list(result.keys())[0:2] == ["g1", "g2"] + + +@pytest.mark.parametrize( + "key", + [ + pd.Series([2, np.nan, 1, 2]), + pd.Series([2, np.nan, 1, 2], dtype="UInt8"), + pd.Series([2, np.nan, 1, 2], dtype="Int8"), + pd.Series([2, np.nan, 1, 2], dtype="UInt16"), + pd.Series([2, np.nan, 1, 2], dtype="Int16"), + pd.Series([2, np.nan, 1, 2], dtype="UInt32"), + pd.Series([2, np.nan, 1, 2], dtype="Int32"), + pd.Series([2, np.nan, 1, 2], dtype="UInt64"), + pd.Series([2, np.nan, 1, 2], dtype="Int64"), + pd.Series([2, np.nan, 1, 2], dtype="Float32"), + pd.Series([2, np.nan, 1, 2], dtype="Int64"), + pd.Series([2, np.nan, 1, 2], dtype="Float64"), + pd.Series(["y", None, "x", "y"], dtype="category"), + pd.Series(["y", pd.NA, "x", "y"], dtype="string"), + pd.Series(["y", pd.NA, "x", "y"], dtype="string[pyarrow]"), + pd.Series( + ["2016-01-01", np.datetime64("NaT"), "2017-01-01", "2016-01-01"], + dtype="datetime64[ns]", + ), + pd.Series( + [ + pd.Period("2012-02-01", freq="D"), + pd.NA, + pd.Period("2012-01-01", freq="D"), + pd.Period("2012-02-01", freq="D"), + ] + ), + pd.Series(pd.arrays.SparseArray([2, np.nan, 1, 2])), + ], +) +@pytest.mark.parametrize("test_series", [True, False]) +def test_no_sort_keep_na(key, test_series): + df = pd.DataFrame({"key": key, "a": [1, 2, 3, 4]}) + gb = df.groupby("key", dropna=False, sort=False) + if test_series: + gb = gb["a"] + result = gb.sum() + expected = pd.DataFrame({"a": [5, 2, 3]}, index=key[:-1].rename("key")) + if test_series: + expected = expected["a"] + if expected.index.is_categorical(): + # TODO: Slicing reorders categories? 
+ expected.index = expected.index.reorder_categories(["y", "x"]) + tm.assert_equal(result, expected) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 237d6a96f39ec..779964a5da28a 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1311,12 +1311,8 @@ def test_transform_cumcount(): @pytest.mark.parametrize("keys", [["A1"], ["A1", "A2"]]) -def test_null_group_lambda_self(request, sort, dropna, keys): +def test_null_group_lambda_self(sort, dropna, keys): # GH 17093 - if not sort and not dropna: - msg = "GH#46584: null values get sorted when sort=False" - request.node.add_marker(pytest.mark.xfail(reason=msg, strict=False)) - size = 50 nulls1 = np.random.choice([False, True], size) nulls2 = np.random.choice([False, True], size) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 2d73b8e91e831..e0b18bd607da1 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -436,13 +436,13 @@ def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques): [ ( ["a", None, "b", "a"], - np.array([0, 2, 1, 0], dtype=np.dtype("intp")), - np.array(["a", "b", np.nan], dtype=object), + np.array([0, 1, 2, 0], dtype=np.dtype("intp")), + np.array(["a", None, "b"], dtype=object), ), ( ["a", np.nan, "b", "a"], - np.array([0, 2, 1, 0], dtype=np.dtype("intp")), - np.array(["a", "b", np.nan], dtype=object), + np.array([0, 1, 2, 0], dtype=np.dtype("intp")), + np.array(["a", np.nan, "b"], dtype=object), ), ], ) @@ -459,13 +459,13 @@ def test_object_factorize_na_sentinel_none( [ ( [1, None, 1, 2], - np.array([0, 2, 0, 1], dtype=np.dtype("intp")), - np.array([1, 2, np.nan], dtype="O"), + np.array([0, 1, 0, 2], dtype=np.dtype("intp")), + np.array([1, None, 2], dtype="O"), ), ( [1, np.nan, 1, 2], - np.array([0, 2, 0, 1], dtype=np.dtype("intp")), - np.array([1, 2, np.nan], dtype=np.float64), + np.array([0, 1, 0, 2], dtype=np.dtype("intp")), + np.array([1, np.nan, 2], dtype=np.float64), ), ], ) From 98c6c18aa4e82d4fe20f068aaf5086f18beaabf0 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 19 Apr 2022 19:25:35 -0400 Subject: [PATCH 02/16] Implementation for pyarrow < 4.0 --- pandas/core/arrays/arrow/array.py | 21 +++++++++++++++++++-- pandas/core/arrays/base.py | 2 +- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index eeff227ed9ba5..6d694683f3fd4 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -15,6 +15,7 @@ from pandas.compat import ( pa_version_under1p01, pa_version_under2p0, + pa_version_under4p0, pa_version_under5p0, ) from pandas.util._decorators import doc @@ -108,8 +109,11 @@ def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: def factorize( self, na_sentinel: int = -1, dropna=True ) -> tuple[np.ndarray, ExtensionArray]: - null_encoding = "mask" if dropna else "encode" - encoded = self._data.dictionary_encode(null_encoding=null_encoding) + if pa_version_under4p0: + encoded = self._data.dictionary_encode() + else: + null_encoding = "mask" if dropna else "encode" + encoded = self._data.dictionary_encode(null_encoding=null_encoding) indices = pa.chunked_array( [c.indices for c in encoded.chunks], type=encoded.type.index_type ).to_pandas() @@ -119,6 +123,19 @@ def factorize( if encoded.num_chunks: uniques = type(self)(encoded.chunk(0).dictionary) + if pa_version_under4p0: + # Insert na with the 
proper code + na_mask = indices.values == na_sentinel + na_index = na_mask.argmax() + if na_mask[na_index]: + uniques = uniques.insert(na_index, self.dtype.na_value) + na_code = 0 if na_index == 0 else indices[:na_index].argmax() + 1 + if na_sentinel < 0: + # codes can never equal na_sentinel and be >= na_code + indices[indices >= na_code] += 1 + else: + indices[(indices >= na_code) & (indices != na_sentinel)] += 1 + indices[indices == na_sentinel] = na_code else: uniques = type(self)(pa.array([], type=encoded.type.value_type)) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index e057c1596765c..642ad07887d9f 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1012,7 +1012,7 @@ def factorize( ---------- na_sentinel : int, default -1 Value to use in the `codes` array to indicate missing values. - dropna: bool, default True + dropna : bool, default True Whether null values will appear in uniques. When False, null values will receive a nonnegative code instead of na_sentinel. From 007329badc93d101ae85fc4360a085581a5ad5e1 Mon Sep 17 00:00:00 2001 From: richard Date: Tue, 19 Apr 2022 22:21:13 -0400 Subject: [PATCH 03/16] fixup --- pandas/core/arrays/arrow/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 6d694683f3fd4..54f183bc2c493 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -123,7 +123,7 @@ def factorize( if encoded.num_chunks: uniques = type(self)(encoded.chunk(0).dictionary) - if pa_version_under4p0: + if not dropna and pa_version_under4p0: # Insert na with the proper code na_mask = indices.values == na_sentinel na_index = na_mask.argmax() From ffaf20c00c8d0762c19bb097c7cda64e9dc999e0 Mon Sep 17 00:00:00 2001 From: richard Date: Tue, 19 Apr 2022 22:39:33 -0400 Subject: [PATCH 04/16] fixups --- pandas/core/algorithms.py | 6 +++--- pandas/core/arrays/base.py | 4 ++-- pandas/tests/groupby/test_groupby_dropna.py | 9 ++++++++- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 88969502b99fd..2271c8939e400 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -524,11 +524,11 @@ def factorize_array( If not None, the mask is used as indicator for missing values (True = missing, False = valid) instead of `na_value` or condition "val != val". - dropna: bool, default True - Whether null values will appear in uniques. When False, null values + dropna : bool, default True + Whether NA values will appear in uniques. When False, NA values will receive a nonnegative code instead of na_sentinel. - ..versionadded:: 1.5.0 + .. versionadded:: 1.5.0 Returns ------- diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 642ad07887d9f..2eff7dffd3f0d 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1013,10 +1013,10 @@ def factorize( na_sentinel : int, default -1 Value to use in the `codes` array to indicate missing values. dropna : bool, default True - Whether null values will appear in uniques. When False, null values + Whether NA values will appear in uniques. When False, NA values will receive a nonnegative code instead of na_sentinel. - ..versionadded:: 1.5.0 + .. 
versionadded:: 1.5.0 Returns ------- diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index d79935538a24b..30b6a9d43cb44 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.compat.pyarrow import pa_version_under1p01 + import pandas as pd import pandas._testing as tm @@ -367,7 +369,12 @@ def test_groupby_nan_included(): pd.Series([2, np.nan, 1, 2], dtype="Float64"), pd.Series(["y", None, "x", "y"], dtype="category"), pd.Series(["y", pd.NA, "x", "y"], dtype="string"), - pd.Series(["y", pd.NA, "x", "y"], dtype="string[pyarrow]"), + pytest.param( + pd.Series(["y", pd.NA, "x", "y"], dtype="string[pyarrow]"), + marks=pytest.mark.skipif( + pa_version_under1p01, reason="pyarrow is not installed" + ), + ), pd.Series( ["2016-01-01", np.datetime64("NaT"), "2017-01-01", "2016-01-01"], dtype="datetime64[ns]", From f7326bdeea2c98cde44d95d8baff654f2226c5ca Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 23 Apr 2022 10:58:13 -0400 Subject: [PATCH 05/16] test fixup --- pandas/tests/groupby/test_groupby_dropna.py | 48 +++++++++++---------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 30b6a9d43cb44..f89c08e91793c 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -353,45 +353,49 @@ def test_groupby_nan_included(): @pytest.mark.parametrize( - "key", + "values, dtype", [ - pd.Series([2, np.nan, 1, 2]), - pd.Series([2, np.nan, 1, 2], dtype="UInt8"), - pd.Series([2, np.nan, 1, 2], dtype="Int8"), - pd.Series([2, np.nan, 1, 2], dtype="UInt16"), - pd.Series([2, np.nan, 1, 2], dtype="Int16"), - pd.Series([2, np.nan, 1, 2], dtype="UInt32"), - pd.Series([2, np.nan, 1, 2], dtype="Int32"), - pd.Series([2, np.nan, 1, 2], dtype="UInt64"), - pd.Series([2, np.nan, 1, 2], dtype="Int64"), - pd.Series([2, np.nan, 1, 2], dtype="Float32"), - pd.Series([2, np.nan, 1, 2], dtype="Int64"), - pd.Series([2, np.nan, 1, 2], dtype="Float64"), - pd.Series(["y", None, "x", "y"], dtype="category"), - pd.Series(["y", pd.NA, "x", "y"], dtype="string"), + ([2, np.nan, 1, 2], None), + ([2, np.nan, 1, 2], "UInt8"), + ([2, np.nan, 1, 2], "Int8"), + ([2, np.nan, 1, 2], "UInt16"), + ([2, np.nan, 1, 2], "Int16"), + ([2, np.nan, 1, 2], "UInt32"), + ([2, np.nan, 1, 2], "Int32"), + ([2, np.nan, 1, 2], "UInt64"), + ([2, np.nan, 1, 2], "Int64"), + ([2, np.nan, 1, 2], "Float32"), + ([2, np.nan, 1, 2], "Int64"), + ([2, np.nan, 1, 2], "Float64"), + (["y", None, "x", "y"], "category"), + (["y", pd.NA, "x", "y"], "string"), pytest.param( - pd.Series(["y", pd.NA, "x", "y"], dtype="string[pyarrow]"), + ["y", pd.NA, "x", "y"], + "string[pyarrow]", marks=pytest.mark.skipif( pa_version_under1p01, reason="pyarrow is not installed" ), ), - pd.Series( + ( ["2016-01-01", np.datetime64("NaT"), "2017-01-01", "2016-01-01"], - dtype="datetime64[ns]", + "datetime64[ns]", ), - pd.Series( + ( [ pd.Period("2012-02-01", freq="D"), pd.NA, pd.Period("2012-01-01", freq="D"), pd.Period("2012-02-01", freq="D"), - ] + ], + None, ), - pd.Series(pd.arrays.SparseArray([2, np.nan, 1, 2])), + (pd.arrays.SparseArray([2, np.nan, 1, 2]), None), ], ) @pytest.mark.parametrize("test_series", [True, False]) -def test_no_sort_keep_na(key, test_series): +def test_no_sort_keep_na(values, dtype, test_series): + # GH#46584 + key = pd.Series(values, 
dtype=dtype) df = pd.DataFrame({"key": key, "a": [1, 2, 3, 4]}) gb = df.groupby("key", dropna=False, sort=False) if test_series: From b0ec48adab544ec27af1f3dd189ac33679f8cbb7 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 24 Apr 2022 10:19:09 -0400 Subject: [PATCH 06/16] type-hints --- pandas/core/arrays/datetimelike.py | 2 +- pandas/core/arrays/masked.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index dc0aecafea56b..4a3fa0659b86e 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1878,7 +1878,7 @@ def _with_freq(self, freq): # -------------------------------------------------------------- - def factorize(self, na_sentinel=-1, sort: bool = False, dropna: bool = True): + def factorize(self, na_sentinel: int = -1, dropna: bool = True, sort: bool = False): if self.freq is not None: # We must be unique, so can short-circuit (and retain freq) codes = np.arange(len(self), dtype=np.intp) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 537d59f2cd779..bebd92168ebdc 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -888,7 +888,12 @@ def factorize( na_index = mask.argmax() if mask[na_index]: # Insert na with the proper code - na_code = 0 if na_index == 0 else codes[:na_index].argmax() + 1 + if na_index == 0: + na_code = np.intp(0) + else: + # error: Slice index must be an integer or None + # https://github.com/python/mypy/issues/2410 + na_code = codes[:na_index].argmax() + 1 # type: ignore[misc] if na_sentinel < 0: # codes can never equal na_sentinel and be >= na_code codes[codes >= na_code] += 1 From cadab86a6818a2ad678130b84cdf9f12ed93f87c Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 30 Apr 2022 19:00:46 -0400 Subject: [PATCH 07/16] improvements --- pandas/core/arrays/arrow/array.py | 3 ++- pandas/core/arrays/string_.py | 6 ++---- pandas/core/groupby/grouper.py | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index f7aae4780305a..2fcee9e261404 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -124,7 +124,7 @@ def dropna(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: @doc(ExtensionArray.factorize) def factorize( - self, na_sentinel: int = -1, dropna=True + self, na_sentinel: int = -1, dropna: bool = True ) -> tuple[np.ndarray, ExtensionArray]: if pa_version_under4p0: encoded = self._data.dictionary_encode() @@ -141,6 +141,7 @@ def factorize( if encoded.num_chunks: uniques = type(self)(encoded.chunk(0).dictionary) if not dropna and pa_version_under4p0: + # TODO: share logic with BaseMaskedArray.factorize # Insert na with the proper code na_mask = indices.values == na_sentinel na_index = na_mask.argmax() diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 304bc7a768bcc..86f8828f2aea5 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -379,14 +379,12 @@ def __arrow_array__(self, type=None): def _values_for_factorize(self): arr = self._ndarray.copy() mask = self.isna() - arr[mask] = -1 - return arr, -1 + arr[mask] = None + return arr, None @classmethod def _from_factorized(cls, values, original): assert values.dtype == original._ndarray.dtype - # When dropna (i.e. 
ignore_na) is False, can get -1 from nulls - values[values == -1] = None return original._from_backing_data(values) def __setitem__(self, key, value): diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index e146e9462c6f2..623b1d6a5aa4c 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -660,7 +660,7 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: if self._dropna and self._passed_categorical: # we make a CategoricalIndex out of the cat grouper # preserving the categories / ordered attributes; - # doesn't (yet) handle dropna=False + # doesn't (yet - GH#46909) handle dropna=False cat = self.grouping_vector categories = cat.categories From f44b7f3483e74e7346775a98e4b71e075c21053c Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 30 Apr 2022 19:29:12 -0400 Subject: [PATCH 08/16] remove override of _from_factorized in string_.py --- pandas/core/arrays/string_.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 86f8828f2aea5..0593edeb6645b 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -382,11 +382,6 @@ def _values_for_factorize(self): arr[mask] = None return arr, None - @classmethod - def _from_factorized(cls, values, original): - assert values.dtype == original._ndarray.dtype - return original._from_backing_data(values) - def __setitem__(self, key, value): value = extract_array(value, extract_numpy=True) if isinstance(value, type(self)): From ef49c74e7a51a240fcd594d50ef0a3b5f4d315e7 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 5 May 2022 17:28:53 -0400 Subject: [PATCH 09/16] Rework na_sentinel/dropna/ignore_na --- pandas/core/algorithms.py | 56 +++++++++++++++++------------- pandas/core/arrays/arrow/array.py | 16 ++++----- pandas/core/arrays/base.py | 13 ++++--- pandas/core/arrays/datetimelike.py | 5 ++- pandas/core/arrays/masked.py | 47 ++++++++++++------------- pandas/core/arrays/sparse/array.py | 9 ++--- 6 files changed, 71 insertions(+), 75 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c4d123d1e2cd9..831bb70b25615 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -498,11 +498,10 @@ def f(c, v): def factorize_array( values: np.ndarray, - na_sentinel: int = -1, + na_sentinel: int | None = -1, size_hint: int | None = None, na_value: object = None, mask: npt.NDArray[np.bool_] | None = None, - dropna: bool = True, ) -> tuple[npt.NDArray[np.intp], np.ndarray]: """ Factorize a numpy array to codes and uniques. @@ -512,7 +511,10 @@ def factorize_array( Parameters ---------- values : ndarray - na_sentinel : int, default -1 + na_sentinel : int or None, default -1 + Code to use for NA values. When None, no sentinel will be used. Instead, NA + values will be coded as a non-negative integer and the NA value will appear in + uniques. size_hint : int, optional Passed through to the hashtable's 'get_labels' method na_value : object, optional @@ -524,17 +526,16 @@ def factorize_array( If not None, the mask is used as indicator for missing values (True = missing, False = valid) instead of `na_value` or condition "val != val". - dropna : bool, default True - Whether NA values will appear in uniques. When False, NA values - will receive a nonnegative code instead of na_sentinel. - - .. 
versionadded:: 1.5.0 Returns ------- codes : ndarray[np.intp] uniques : ndarray """ + ignore_na = na_sentinel is not None + if not ignore_na: + na_sentinel = -1 + original = values if values.dtype.kind in ["m", "M"]: # _get_hashtable_algo will cast dt64/td64 to i8 via _ensure_data, so we @@ -551,7 +552,7 @@ def factorize_array( na_sentinel=na_sentinel, na_value=na_value, mask=mask, - ignore_na=dropna, + ignore_na=ignore_na, ) # re-cast e.g. i8->dt64/td64, uint8->bool @@ -720,12 +721,7 @@ def factorize( if not isinstance(values, ABCMultiIndex): values = extract_array(values, extract_numpy=True) - # GH35667, if na_sentinel=None, we will not dropna NaNs from the uniques - # of values, assign na_sentinel=-1 to replace code value for NaN. - dropna = True - if na_sentinel is None: - na_sentinel = -1 - dropna = False + dropna = na_sentinel is not None if ( isinstance(values, (ABCDatetimeArray, ABCTimedeltaArray)) @@ -738,29 +734,39 @@ def factorize( if not isinstance(values.dtype, np.dtype): # i.e. ExtensionDtype - # TODO: pass ignore_na=dropna. When sort is True we ignore_na here and append - # on the end because safe_sort does not handle null values in uniques - codes, uniques = values.factorize( - na_sentinel=na_sentinel, dropna=dropna or sort - ) + codes, uniques = values.factorize(na_sentinel=na_sentinel) else: values = np.asarray(values) # convert DTA/TDA/MultiIndex - # TODO: pass ignore_na=dropna; see above + # TODO: pass na_sentinel=na_sentinel to factorize_array. When sort is True and + # na_sentinel is None we append NA on the end because safe_sort does not + # handle null values in uniques. + if na_sentinel is None and sort: + na_sentinel_arg = -1 + elif na_sentinel is None: + na_sentinel_arg = None + else: + na_sentinel_arg = na_sentinel codes, uniques = factorize_array( values, - na_sentinel=na_sentinel, + na_sentinel=na_sentinel_arg, size_hint=size_hint, - dropna=dropna or sort, ) if sort and len(uniques) > 0: + if na_sentinel is None: + # TODO: Can remove when na_sentinel=na_sentinel in TODO above + na_sentinel = -1 uniques, codes = safe_sort( uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False ) if not dropna and sort: - # TODO: Can remove if we pass ignore_na=dropna; see above - code_is_na = codes == na_sentinel + # TODO: Can remove entire block when na_sentinel=na_sentinel in TODO above + if na_sentinel is None: + na_sentinel_arg = -1 + else: + na_sentinel_arg = na_sentinel + code_is_na = codes == na_sentinel_arg if code_is_na.any(): # na_value is set based on the dtype of uniques, and compat set to False is # because we do not want na_value to be 0 for integers diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 2fcee9e261404..b3880ece10983 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -124,12 +124,12 @@ def dropna(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: @doc(ExtensionArray.factorize) def factorize( - self, na_sentinel: int = -1, dropna: bool = True + self, na_sentinel: int | None = -1 ) -> tuple[np.ndarray, ExtensionArray]: if pa_version_under4p0: encoded = self._data.dictionary_encode() else: - null_encoding = "mask" if dropna else "encode" + null_encoding = "mask" if na_sentinel is not None else "encode" encoded = self._data.dictionary_encode(null_encoding=null_encoding) indices = pa.chunked_array( [c.indices for c in encoded.chunks], type=encoded.type.index_type @@ -140,20 +140,16 @@ def factorize( if encoded.num_chunks: uniques = 
type(self)(encoded.chunk(0).dictionary) - if not dropna and pa_version_under4p0: + if na_sentinel is None and pa_version_under4p0: # TODO: share logic with BaseMaskedArray.factorize # Insert na with the proper code - na_mask = indices.values == na_sentinel + na_mask = indices.values == -1 na_index = na_mask.argmax() if na_mask[na_index]: uniques = uniques.insert(na_index, self.dtype.na_value) na_code = 0 if na_index == 0 else indices[:na_index].argmax() + 1 - if na_sentinel < 0: - # codes can never equal na_sentinel and be >= na_code - indices[indices >= na_code] += 1 - else: - indices[(indices >= na_code) & (indices != na_sentinel)] += 1 - indices[indices == na_sentinel] = na_code + indices[indices >= na_code] += 1 + indices[indices == -1] = na_code else: uniques = type(self)(pa.array([], type=encoded.type.value_type)) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 8524e28353e80..a834c54a3a82f 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1003,20 +1003,19 @@ def _values_for_factorize(self) -> tuple[np.ndarray, Any]: return self.astype(object), np.nan def factorize( - self, na_sentinel: int = -1, dropna: bool = True + self, na_sentinel: int | None = -1 ) -> tuple[np.ndarray, ExtensionArray]: """ Encode the extension array as an enumerated type. Parameters ---------- - na_sentinel : int, default -1 + na_sentinel : int or None, default -1 Value to use in the `codes` array to indicate missing values. - dropna : bool, default True - Whether NA values will appear in uniques. When False, NA values - will receive a nonnegative code instead of na_sentinel. + When None, no sentinel will be used. Instead, NA values will be coded as a + non-negative integer and the NA value will appear in uniques. - .. versionadded:: 1.5.0 + .. 
versionchanged:: 1.5.0 Returns ------- @@ -1051,7 +1050,7 @@ def factorize( arr, na_value = self._values_for_factorize() codes, uniques = factorize_array( - arr, na_sentinel=na_sentinel, na_value=na_value, dropna=dropna + arr, na_sentinel=na_sentinel, na_value=na_value ) uniques_ea = self._from_factorized(uniques, self) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 6135b78580233..4ef9377d56afd 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1878,7 +1878,7 @@ def _with_freq(self, freq): # -------------------------------------------------------------- - def factorize(self, na_sentinel: int = -1, dropna: bool = True, sort: bool = False): + def factorize(self, na_sentinel: int | None = -1, sort: bool = False): if self.freq is not None: # We must be unique, so can short-circuit (and retain freq) codes = np.arange(len(self), dtype=np.intp) @@ -1888,8 +1888,7 @@ def factorize(self, na_sentinel: int = -1, dropna: bool = True, sort: bool = Fal uniques = uniques[::-1] return codes, uniques # FIXME: shouldn't get here; we are ignoring sort - result = super().factorize(na_sentinel=na_sentinel, dropna=dropna) - return result + return super().factorize(na_sentinel=na_sentinel) # ------------------------------------------------------------------- diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 17b44c9de6e99..69f7a034176bb 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -870,40 +870,39 @@ def searchsorted( @doc(ExtensionArray.factorize) def factorize( - self, na_sentinel: int = -1, dropna: bool = True + self, na_sentinel: int | None = -1 ) -> tuple[np.ndarray, ExtensionArray]: arr = self._data mask = self._mask - codes, uniques = factorize_array( - arr, na_sentinel=na_sentinel, mask=mask, dropna=True - ) + # Pass non-None na_sentinel; recode and add NA to uniques if necessary below + na_sentinel_arg = -1 if na_sentinel is None else na_sentinel + codes, uniques = factorize_array(arr, na_sentinel=na_sentinel_arg, mask=mask) # check that factorize_array correctly preserves dtype. 
assert uniques.dtype == self.dtype.numpy_dtype, (uniques.dtype, self.dtype) - # Make room for a null value if we're not ignoring it and it exists - size = len(uniques) if dropna or not mask.any() else len(uniques) + 1 + has_na = mask.any() + if na_sentinel is not None or not has_na: + size = len(uniques) + else: + # Make room for an NA value + size = len(uniques) + 1 uniques_mask = np.zeros(size, dtype=bool) - if not dropna: + if na_sentinel is None and has_na: na_index = mask.argmax() - if mask[na_index]: - # Insert na with the proper code - if na_index == 0: - na_code = np.intp(0) - else: - # error: Slice index must be an integer or None - # https://github.com/python/mypy/issues/2410 - na_code = codes[:na_index].argmax() + 1 # type: ignore[misc] - if na_sentinel < 0: - # codes can never equal na_sentinel and be >= na_code - codes[codes >= na_code] += 1 - else: - codes[(codes >= na_code) & (codes != na_sentinel)] += 1 - codes[codes == na_sentinel] = na_code - # dummy value for uniques; not used since uniques_mask will be True - uniques = np.insert(uniques, na_code, 0) - uniques_mask[na_code] = True + # Insert na with the proper code + if na_index == 0: + na_code = np.intp(0) + else: + # mypy error: Slice index must be an integer or None + # https://github.com/python/mypy/issues/2410 + na_code = codes[:na_index].argmax() + 1 # type: ignore[misc] + codes[codes >= na_code] += 1 + codes[codes == -1] = na_code + # dummy value for uniques; not used since uniques_mask will be True + uniques = np.insert(uniques, na_code, 0) + uniques_mask[na_code] = True uniques_ea = type(self)(uniques, uniques_mask) return codes, uniques_ea diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index c38baab1d9ae1..3a655541ddee4 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -847,17 +847,14 @@ def _values_for_factorize(self): # Still override this for hash_pandas_object return np.asarray(self), self.fill_value - def factorize( - self, na_sentinel: int = -1, dropna: bool = True - ) -> tuple[np.ndarray, SparseArray]: + def factorize(self, na_sentinel: int | None = -1) -> tuple[np.ndarray, SparseArray]: # Currently, ExtensionArray.factorize -> Tuple[ndarray, EA] # The sparsity on this is backwards from what Sparse would want. Want # ExtensionArray.factorize -> Tuple[EA, EA] # Given that we have to return a dense array of codes, why bother # implementing an efficient factorize? - na_sentinel_arg = na_sentinel if dropna else None - codes, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel_arg) - if not dropna: + codes, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel) + if na_sentinel is not None: codes[codes == -1] = na_sentinel uniques_sp = SparseArray(uniques, dtype=self.dtype) return codes, uniques_sp From 8378ba047edf521846d6c7ea1b99e8edf5350785 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 5 May 2022 17:33:36 -0400 Subject: [PATCH 10/16] fixups --- pandas/core/algorithms.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 831bb70b25615..f3bb4e6157da7 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -721,6 +721,8 @@ def factorize( if not isinstance(values, ABCMultiIndex): values = extract_array(values, extract_numpy=True) + # GH35667, if na_sentinel=None, we will not dropna NaNs from the uniques + # of values, assign na_sentinel=-1 to replace code value for NaN. 
dropna = na_sentinel is not None if ( @@ -754,14 +756,14 @@ def factorize( if sort and len(uniques) > 0: if na_sentinel is None: - # TODO: Can remove when na_sentinel=na_sentinel in TODO above + # TODO: Can remove when na_sentinel=na_sentinel as in TODO above na_sentinel = -1 uniques, codes = safe_sort( uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False ) if not dropna and sort: - # TODO: Can remove entire block when na_sentinel=na_sentinel in TODO above + # TODO: Can remove entire block when na_sentinel=na_sentinel as in TODO above if na_sentinel is None: na_sentinel_arg = -1 else: From f2e24dfd3d7022528d3e5e72ac0a494945fabacb Mon Sep 17 00:00:00 2001 From: richard Date: Thu, 5 May 2022 21:56:00 -0400 Subject: [PATCH 11/16] fixup for pyarrow < 4.0 --- pandas/core/arrays/arrow/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index b3880ece10983..62f16b75ba503 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -135,7 +135,7 @@ def factorize( [c.indices for c in encoded.chunks], type=encoded.type.index_type ).to_pandas() if indices.dtype.kind == "f": - indices[np.isnan(indices)] = na_sentinel + indices[np.isnan(indices)] = na_sentinel if na_sentinel is not None else -1 indices = indices.astype(np.int64, copy=False) if encoded.num_chunks: From b51e88ff375d0785420149a4fb92a6f986a8c00b Mon Sep 17 00:00:00 2001 From: richard Date: Fri, 6 May 2022 06:10:20 -0400 Subject: [PATCH 12/16] whatsnew note --- doc/source/whatsnew/v1.5.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index e74dadd43f9da..ce8aeb449573a 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -125,7 +125,7 @@ Other enhancements - Added ``validate`` argument to :meth:`DataFrame.join` (:issue:`46622`) - A :class:`errors.PerformanceWarning` is now thrown when using ``string[pyarrow]`` dtype with methods that don't dispatch to ``pyarrow.compute`` methods (:issue:`42613`) - Added ``numeric_only`` argument to :meth:`Resampler.sum`, :meth:`Resampler.prod`, :meth:`Resampler.min`, :meth:`Resampler.max`, :meth:`Resampler.first`, and :meth:`Resampler.last` (:issue:`46442`) -- The method :meth:`.ExtensionArray.factorize` has gained the argument ``dropna`` for determining how null values are to be treated. (:issue:`46601`) +- The method :meth:`.ExtensionArray.factorize` can now be passed `na_sentinel=None` for determining how null values are to be treated. (:issue:`46601`) .. --------------------------------------------------------------------------- .. 
_whatsnew_150.notable_bug_fixes: From 4a36bf07d2b5b15f47eccd45d3402563ff55b279 Mon Sep 17 00:00:00 2001 From: richard Date: Sat, 7 May 2022 09:35:23 -0400 Subject: [PATCH 13/16] doc fixup --- doc/source/whatsnew/v1.5.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index ce8aeb449573a..7d0d2fb4253fb 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -125,7 +125,7 @@ Other enhancements - Added ``validate`` argument to :meth:`DataFrame.join` (:issue:`46622`) - A :class:`errors.PerformanceWarning` is now thrown when using ``string[pyarrow]`` dtype with methods that don't dispatch to ``pyarrow.compute`` methods (:issue:`42613`) - Added ``numeric_only`` argument to :meth:`Resampler.sum`, :meth:`Resampler.prod`, :meth:`Resampler.min`, :meth:`Resampler.max`, :meth:`Resampler.first`, and :meth:`Resampler.last` (:issue:`46442`) -- The method :meth:`.ExtensionArray.factorize` can now be passed `na_sentinel=None` for determining how null values are to be treated. (:issue:`46601`) +- The method :meth:`.ExtensionArray.factorize` can now be passed ``na_sentinel=None`` for determining how null values are to be treated. (:issue:`46601`) .. --------------------------------------------------------------------------- .. _whatsnew_150.notable_bug_fixes: From cf561351c21caf4686122fa5a4fb1031bdc0bf57 Mon Sep 17 00:00:00 2001 From: richard Date: Sun, 26 Jun 2022 22:43:09 -0400 Subject: [PATCH 14/16] fixups --- pandas/core/algorithms.py | 15 ++++++--------- pandas/core/arrays/arrow/array.py | 10 +++++----- pandas/core/arrays/base.py | 12 ++---------- pandas/core/arrays/masked.py | 10 +++------- pandas/core/arrays/sparse/array.py | 2 ++ 5 files changed, 18 insertions(+), 31 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index db158e9cf9daf..9234bf8e65ff0 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -522,10 +522,7 @@ def factorize_array( Parameters ---------- values : ndarray - na_sentinel : int or None, default -1 - Code to use for NA values. When None, no sentinel will be used. Instead, NA - values will be coded as a non-negative integer and the NA value will appear in - uniques. 
+ na_sentinel : int, default -1 size_hint : int, optional Passed through to the hashtable's 'get_labels' method na_value : object, optional @@ -761,19 +758,19 @@ def factorize( elif not isinstance(values.dtype, np.dtype): if ( - na_sentinel == -1 - and "use_na_sentinel" in inspect.signature(values.factorize).parameters - ): + na_sentinel == -1 or na_sentinel is None + ) and "use_na_sentinel" in inspect.signature(values.factorize).parameters: # Avoid using catch_warnings when possible # GH#46910 - TimelikeOps has deprecated signature codes, uniques = values.factorize( # type: ignore[call-arg] - use_na_sentinel=True + use_na_sentinel=na_sentinel is not None ) else: + na_sentinel_arg = -1 if na_sentinel is None else na_sentinel with warnings.catch_warnings(): # We've already warned above warnings.filterwarnings("ignore", ".*use_na_sentinel.*", FutureWarning) - codes, uniques = values.factorize(na_sentinel=na_sentinel) + codes, uniques = values.factorize(na_sentinel=na_sentinel_arg) else: values = np.asarray(values) # convert DTA/TDA/MultiIndex diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index f28a47c08af9c..046ede0a331dd 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -298,20 +298,20 @@ def factorize( if pa_version_under4p0: encoded = self._data.dictionary_encode() else: - null_encoding = "mask" if resolved_na_sentinel else "encode" + null_encoding = "mask" if resolved_na_sentinel is not None else "encode" encoded = self._data.dictionary_encode(null_encoding=null_encoding) - - encoded = self._data.dictionary_encode() indices = pa.chunked_array( [c.indices for c in encoded.chunks], type=encoded.type.index_type ).to_pandas() if indices.dtype.kind == "f": - indices[np.isnan(indices)] = na_sentinel if na_sentinel is not None else -1 + indices[np.isnan(indices)] = ( + resolved_na_sentinel if resolved_na_sentinel is not None else -1 + ) indices = indices.astype(np.int64, copy=False) if encoded.num_chunks: uniques = type(self)(encoded.chunk(0).dictionary) - if not resolved_na_sentinel and pa_version_under4p0: + if resolved_na_sentinel is None and pa_version_under4p0: # TODO: share logic with BaseMaskedArray.factorize # Insert na with the proper code na_mask = indices.values == -1 diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 02893a272363b..35e2c759277cc 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1034,12 +1034,8 @@ def factorize( Parameters ---------- - na_sentinel : int or None, default -1 + na_sentinel : int, default -1 Value to use in the `codes` array to indicate missing values. - When None, no sentinel will be used. Instead, NA values will be coded as a - non-negative integer and the NA value will appear in uniques. - - .. versionchanged:: 1.5.0 .. deprecated:: 1.5.0 The na_sentinel argument is deprecated and @@ -1084,14 +1080,10 @@ def factorize( # 2. ExtensionArray.factorize. # Complete control over factorization. 
resolved_na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel) - if resolved_na_sentinel is None: - raise NotImplementedError("Encoding NaN values is not yet implemented") - else: - na_sentinel = resolved_na_sentinel arr, na_value = self._values_for_factorize() codes, uniques = factorize_array( - arr, na_sentinel=na_sentinel, na_value=na_value + arr, na_sentinel=resolved_na_sentinel, na_value=na_value ) uniques_ea = self._from_factorized(uniques, self) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 8f36ea34dc092..0126efa9bd347 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -875,28 +875,24 @@ def factorize( use_na_sentinel: bool | lib.NoDefault = lib.no_default, ) -> tuple[np.ndarray, ExtensionArray]: resolved_na_sentinel = algos.resolve_na_sentinel(na_sentinel, use_na_sentinel) - if resolved_na_sentinel is None: - raise NotImplementedError("Encoding NaN values is not yet implemented") - else: - na_sentinel = resolved_na_sentinel arr = self._data mask = self._mask # Pass non-None na_sentinel; recode and add NA to uniques if necessary below - na_sentinel_arg = -1 if na_sentinel is None else na_sentinel + na_sentinel_arg = -1 if resolved_na_sentinel is None else resolved_na_sentinel codes, uniques = factorize_array(arr, na_sentinel=na_sentinel_arg, mask=mask) # check that factorize_array correctly preserves dtype. assert uniques.dtype == self.dtype.numpy_dtype, (uniques.dtype, self.dtype) has_na = mask.any() - if na_sentinel is not None or not has_na: + if resolved_na_sentinel is not None or not has_na: size = len(uniques) else: # Make room for an NA value size = len(uniques) + 1 uniques_mask = np.zeros(size, dtype=bool) - if na_sentinel is None and has_na: + if resolved_na_sentinel is None and has_na: na_index = mask.argmax() # Insert na with the proper code if na_index == 0: diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 37c4b8b29fc0c..0218f674c3ddf 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -861,6 +861,8 @@ def factorize( codes, uniques = algos.factorize( np.asarray(self), na_sentinel=na_sentinel, use_na_sentinel=use_na_sentinel ) + if na_sentinel is lib.no_default: + na_sentinel = -1 if use_na_sentinel is lib.no_default or use_na_sentinel: codes[codes == -1] = na_sentinel uniques_sp = SparseArray(uniques, dtype=self.dtype) From 9c35dd0844b1cfc24330857547b8db7388ce6457 Mon Sep 17 00:00:00 2001 From: richard Date: Mon, 11 Jul 2022 16:34:12 -0400 Subject: [PATCH 15/16] fixup whatsnew note --- doc/source/whatsnew/v1.5.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 8e54e58efdbcf..65baef94cc9e3 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -278,7 +278,7 @@ Other enhancements - :meth:`DatetimeIndex.astype` now supports casting timezone-naive indexes to ``datetime64[s]``, ``datetime64[ms]``, and ``datetime64[us]``, and timezone-aware indexes to the corresponding ``datetime64[unit, tzname]`` dtypes (:issue:`47579`) - :class:`Series` reducers (e.g. 
``min``, ``max``, ``sum``, ``mean``) will now successfully operate when the dtype is numeric and ``numeric_only=True`` is provided; previously this would raise a ``NotImplementedError`` (:issue:`47500`) - :meth:`RangeIndex.union` now can return a :class:`RangeIndex` instead of a :class:`Int64Index` if the resulting values are equally spaced (:issue:`47557`, :issue:`43885`) -- The method :meth:`.ExtensionArray.factorize` can now be passed ``use_na_sentinel=False`` for determining how null values are to be treated. (:issue:`46601`) +- The method :meth:`.ExtensionArray.factorize` accepts ``use_na_sentinel=False`` for determining how null values are to be treated (:issue:`46601`) - .. --------------------------------------------------------------------------- From 7143a526896ea473e3079968637d0f76641337f1 Mon Sep 17 00:00:00 2001 From: richard Date: Tue, 16 Aug 2022 21:17:39 -0400 Subject: [PATCH 16/16] whatsnew note; comment on old vs new API --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/core/algorithms.py | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 9068c4b4c23b1..df58fde15c8bf 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -926,6 +926,7 @@ Numeric - Bug in division, ``pow`` and ``mod`` operations on array-likes with ``dtype="boolean"`` not being like their ``np.bool_`` counterparts (:issue:`46063`) - Bug in multiplying a :class:`Series` with ``IntegerDtype`` or ``FloatingDtype`` by an array-like with ``timedelta64[ns]`` dtype incorrectly raising (:issue:`45622`) - Bug in :meth:`mean` where the optional dependency ``bottleneck`` causes precision loss linear in the length of the array. ``bottleneck`` has been disabled for :meth:`mean` improving the loss to log-linear but may result in a performance decrease. (:issue:`42878`) +- Bug in :func:`factorize` would convert the value ``None`` to ``np.nan`` (:issue:`46601`) Conversion ^^^^^^^^^^ diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 593efb4a50ec7..9b031fc9517c7 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -734,6 +734,10 @@ def factorize( # responsible only for factorization. All data coercion, sorting and boxing # should happen here. + # GH#46910 deprecated na_sentinel in favor of use_na_sentinel: + # na_sentinel=None corresponds to use_na_sentinel=False + # na_sentinel=-1 corresponds to use_na_sentinel=True + # Other na_sentinel values will not be supported when the deprecation is enforced. na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel) if isinstance(values, ABCRangeIndex): return values.factorize(sort=sort)
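A minimal usage sketch of the behavior this series implements (not part of the patch; it assumes the end state of the branch, where ``na_sentinel`` is deprecated in favor of ``use_na_sentinel`` per GH#46910). The expected output mirrors the updated expectations in ``test_object_factorize_na_sentinel_none``::

    import numpy as np
    import pandas as pd

    values = np.array(["a", None, "b", "a"], dtype=object)

    # With use_na_sentinel=False the null value keeps its position of first
    # appearance in `uniques` and receives a regular non-negative code, and
    # None is no longer converted to np.nan.
    codes, uniques = pd.factorize(values, use_na_sentinel=False)
    print(codes)    # [0 1 2 0]
    print(uniques)  # ['a' None 'b']

    # The default behavior is unchanged: nulls are dropped from `uniques` and
    # coded with the sentinel -1.
    codes, uniques = pd.factorize(values)
    print(codes)    # [ 0 -1  1  0]
    print(uniques)  # ['a' 'b']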
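The groupby change described in the whatsnew note and exercised by ``test_no_sort_keep_na`` can be illustrated with a second small sketch (also not part of the patch and assuming the final state of the branch): with ``dropna=False`` and ``sort=False``, a null group now stays where it is first encountered instead of being moved to the end::

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"key": [2.0, np.nan, 1.0, 2.0], "a": [1, 2, 3, 4]})

    # Groups appear in encounter order: 2.0, NaN, 1.0
    print(df.groupby("key", dropna=False, sort=False)["a"].sum())
    # key
    # 2.0    5
    # NaN    2
    # 1.0    3
    # Name: a, dtype: int64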