rapidsai · rapids-bot · Jul 20, 2021 · Jul 14, 2021 · Jul 15, 2021 · Jul 16, 2021
@@ -1065,20 +1065,52 @@ def find_and_replace(
         Return col with *to_replace* replaced with *replacement*.
         """
         to_replace_col = column.as_column(to_replace)
+        if len(to_replace_col) == to_replace_col.null_count:
+            to_replace_col = to_replace_col.astype(self.categories.dtype)
         replacement_col = column.as_column(replacement)
+        if len(replacement_col) == replacement_col.null_count:
+            replacement_col = replacement_col.astype(self.categories.dtype)
 
         if type(to_replace_col) != type(replacement_col):
             raise TypeError(
                 f"to_replace and value should be of same types,"
                 f"got to_replace dtype: {to_replace_col.dtype} and "
                 f"value dtype: {replacement_col.dtype}"
             )
+        df = cudf.DataFrame({"old": to_replace_col, "new": replacement_col})
+        df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True)
+        if df._data["old"].null_count == 1:
+            fill_value = df._data["new"][df._data["old"].isna()][0]
+            if fill_value in self.categories:
+                replaced = self.fillna(fill_value)
+            else:
+                new_categories = self.categories.append(
+                    column.as_column([fill_value])
+                )
+                replaced = self.copy()
+                replaced = replaced._set_categories(new_categories)
+                replaced = replaced.fillna(fill_value)
+            df = df.dropna(subset=["old"])
+            to_replace_col = df["old"]._column
+            replacement_col = df["new"]._column
+        else:
+            replaced = self
+        if df._data["new"].null_count > 0:
+            drop_values = df._data["old"][df._data["new"].isna()]
+            cur_categories = replaced.categories
+            new_categories = cur_categories[
+                ~cudf.Series(cur_categories.isin(drop_values))
+            ]
+            replaced = replaced._set_categories(new_categories)
+            df = df.dropna(subset=["new"])
+            to_replace_col = df["old"]._column
+            replacement_col = df["new"]._column
 
         # create a dataframe containing the pre-replacement categories
         # and a copy of them to work with. The index of this dataframe
         # represents the original ints that map to the categories
         old_cats = cudf.DataFrame()
-        old_cats["cats"] = column.as_column(self.dtype.categories)
+        old_cats["cats"] = column.as_column(replaced.dtype.categories)
         new_cats = old_cats.copy(deep=True)
 
         # Create a column with the appropriate labels replaced
@@ -1116,11 +1148,11 @@ def find_and_replace(
         # named 'index', which came from the filtered categories,
         # contains the new ints that we need to map to
         to_replace_col = column.as_column(catmap.index).astype(
-            self.codes.dtype
+            replaced.codes.dtype
         )
-        replacement_col = catmap["index"]._column.astype(self.codes.dtype)
+        replacement_col = catmap["index"]._column.astype(replaced.codes.dtype)
 
-        replaced = column.as_column(self.codes)
+        replaced = column.as_column(replaced.codes)
         output = libcudf.replace.replace(
             replaced, to_replace_col, replacement_col
         )

@@ -298,8 +298,8 @@ def find_and_replace(
         """
         Return col with *to_replace* replaced with *value*.
         """
-        to_replace_col = as_column(to_replace)
-        replacement_col = as_column(replacement)
+        to_replace_col = column.as_column(to_replace)
+        replacement_col = column.as_column(replacement)
 
         if type(to_replace_col) != type(replacement_col):
             raise TypeError(
@@ -334,8 +334,16 @@ def find_and_replace(
         to_replace_col, replacement_col, replaced = numeric_normalize_types(
             to_replace_col, replacement_col, replaced
         )
+        df = cudf.DataFrame({"old": to_replace_col, "new": replacement_col})
+        df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True)
+        if df._data["old"].null_count == 1:
+            replaced = replaced.fillna(
+                df._data["new"][df._data["old"].isna()][0]
+            )
+            df = df.dropna(subset=["old"])
+
         return libcudf.replace.replace(
-            replaced, to_replace_col, replacement_col
+            replaced, df["old"]._column, df["new"]._column
         )
 
     def fillna(
@@ -618,9 +626,12 @@ def _normalize_find_and_replace_input(
         )
         # Scalar case
         if len(col_to_normalize) == 1:
-            col_to_normalize_casted = input_column_dtype.type(
-                col_to_normalize[0]
-            )
+            if cudf._lib.scalar._is_null_host_scalar(col_to_normalize[0]):
+                return normalized_column.astype(input_column_dtype)
+            else:
+                col_to_normalize_casted = input_column_dtype.type(
+                    col_to_normalize[0]
+                )
             if not np.isnan(col_to_normalize_casted) and (
                 col_to_normalize_casted != col_to_normalize[0]
             ):

@@ -5277,8 +5277,16 @@ def find_and_replace(
             and replacement_col.dtype != self.dtype
         ):
             return self.copy()
-
-        return libcudf.replace.replace(self, to_replace_col, replacement_col)
+        df = cudf.DataFrame({"old": to_replace_col, "new": replacement_col})
+        df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True)
+        if df._data["old"].null_count == 1:
+            res = self.fillna(df._data["new"][df._data["old"].isna()][0])
+            df = df.dropna(subset=["old"])
+        else:
+            res = self
+        return libcudf.replace.replace(
+            res, df["old"]._column, df["new"]._column
+        )
 
     def fillna(
         self,
@@ -5289,15 +5297,18 @@ def fillna(
         if fill_value is not None:
             if not is_scalar(fill_value):
                 fill_value = column.as_column(fill_value, dtype=self.dtype)
+            elif cudf._lib.scalar._is_null_host_scalar(fill_value):
+                # Trying to fill <NA> with <NA> value? Return copy.
+                return self.copy(deep=True)
             return super().fillna(value=fill_value, dtype="object")
         else:
             return super().fillna(method=method)
 
     def _find_first_and_last(self, value: ScalarLike) -> Tuple[int, int]:
         found_indices = libstrings.contains_re(self, f"^{value}$")
         found_indices = libcudf.unary.cast(found_indices, dtype=np.int32)
-        first = column.as_column(found_indices).find_first_value(1)
-        last = column.as_column(found_indices).find_last_value(1)
+        first = column.as_column(found_indices).find_first_value(np.int32(1))
+        last = column.as_column(found_indices).find_last_value(np.int32(1))
         return first, last
 
     def find_first_value(

@@ -1275,3 +1275,69 @@ def test_series_replace_errors():
         lfunc_args_and_kwargs=([{"a": 1}, object()],),
         rfunc_args_and_kwargs=([{"a": 1}, object()],),
     )
+
+
+@pytest.mark.parametrize(
+    "gsr,old,new,expected",
+    [
+        (
+            cudf.Series(["a", "b", "c", None]),
+            None,
+            "a",
+            cudf.Series(["a", "b", "c", "a"]),
+        ),
+        (
+            cudf.Series(["a", "b", "c", None]),
+            [None, "a", "a"],
+            ["c", "b", "d"],
+            cudf.Series(["d", "b", "c", "c"]),
+        ),
+        (
+            cudf.Series(["a", "b", "c", None]),
+            [None, "a"],
+            ["b", None],
+            cudf.Series([None, "b", "c", "b"]),
+        ),
+        (
+            cudf.Series(["a", "b", "c", None]),
+            [None, None],
+            [None, None],
+            cudf.Series(["a", "b", "c", None]),
+        ),
+        (cudf.Series([1, 2, None, 3]), None, 10, cudf.Series([1, 2, 10, 3])),
+        (
+            cudf.Series([1, 2, None, 3]),
+            [None, 1, 1],
+            [3, 2, 4],
+            cudf.Series([4, 2, 3, 3]),
+        ),
+        (
+            cudf.Series([1, 2, None, 3]),
+            [None, 1],
+            [2, None],
+            cudf.Series([None, 2, 2, 3]),
+        ),
+        (
+            cudf.Series(["a", "q", "t", None], dtype="category"),
+            None,
+            "z",
+            cudf.Series(["a", "q", "t", "z"], dtype="category"),
+        ),
+        (
+            cudf.Series(["a", "q", "t", None], dtype="category"),
+            [None, "a", "q"],
+            ["z", None, None],
+            cudf.Series([None, None, "t", "z"], dtype="category"),
+        ),
+        (
+            cudf.Series(["a", None, "t", None], dtype="category"),
+            [None, "t"],
+            ["p", None],
+            cudf.Series(["a", "p", None, "p"], dtype="category"),
+        ),
+    ],
+)
+def test_replace_nulls(gsr, old, new, expected):
+
+    actual = gsr.replace(old, new)
+    assert_eq(expected, actual)