BUG: Replace on Series/DataFrame stops replacing after first NA (#57865)

* update test for GH#56599
* bug: ser/df.replace only replaces first occurrence with NAs
* [pre-commit.ci] auto fixes from pre-commit.com hooks (for more information, see https://pre-commit.ci)
* add whatsnew
* fmt fix

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
pandas-dev · Mar 20, 2024 · 0f7ded2 · 0f7ded2
1 parent 114a84d
commit 0f7ded2
Show file tree

Hide file tree

Showing 3 changed files with 17 additions and 12 deletions.
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -298,6 +298,7 @@ Bug fixes
 - Fixed bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`)
 - Fixed bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`)
 - Fixed bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`)
+- Fixed bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`)
 
 Categorical
 ^^^^^^^^^^^

diff --git a/pandas/core/array_algos/replace.py b/pandas/core/array_algos/replace.py
@@ -93,17 +93,18 @@ def _check_comparison_types(
         )
 
     # GH#32621 use mask to avoid comparing to NAs
-    if isinstance(a, np.ndarray):
+    if isinstance(a, np.ndarray) and mask is not None:
         a = a[mask]
-
-    result = op(a)
-
-    if isinstance(result, np.ndarray) and mask is not None:
-        # The shape of the mask can differ to that of the result
-        # since we may compare only a subset of a's or b's elements
-        tmp = np.zeros(mask.shape, dtype=np.bool_)
-        np.place(tmp, mask, result)
-        result = tmp
+        result = op(a)
+
+        if isinstance(result, np.ndarray):
+            # The shape of the mask can differ to that of the result
+            # since we may compare only a subset of a's or b's elements
+            tmp = np.zeros(mask.shape, dtype=np.bool_)
+            np.place(tmp, mask, result)
+            result = tmp
+    else:
+        result = op(a)
 
     _check_comparison_types(result, a, b)
     return result

diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py
@@ -616,15 +616,18 @@ def test_replace_with_compiled_regex(self):
 
     def test_pandas_replace_na(self):
         # GH#43344
-        ser = pd.Series(["AA", "BB", "CC", "DD", "EE", "", pd.NA], dtype="string")
+        # GH#56599
+        ser = pd.Series(["AA", "BB", "CC", "DD", "EE", "", pd.NA, "AA"], dtype="string")
         regex_mapping = {
             "AA": "CC",
             "BB": "CC",
             "EE": "CC",
             "CC": "CC-REPL",
         }
         result = ser.replace(regex_mapping, regex=True)
-        exp = pd.Series(["CC", "CC", "CC-REPL", "DD", "CC", "", pd.NA], dtype="string")
+        exp = pd.Series(
+            ["CC", "CC", "CC-REPL", "DD", "CC", "", pd.NA, "CC"], dtype="string"
+        )
         tm.assert_series_equal(result, exp)
 
     @pytest.mark.parametrize(