Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Fix replace to handle null values correctly #8744

Merged
merged 8 commits into from
Jul 20, 2021
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 36 additions & 4 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1065,20 +1065,52 @@ def find_and_replace(
Return col with *to_replace* replaced with *replacement*.
"""
to_replace_col = column.as_column(to_replace)
if len(to_replace_col) == to_replace_col.null_count:
to_replace_col = to_replace_col.astype(self.categories.dtype)
replacement_col = column.as_column(replacement)
if len(replacement_col) == replacement_col.null_count:
replacement_col = replacement_col.astype(self.categories.dtype)

if type(to_replace_col) != type(replacement_col):
raise TypeError(
f"to_replace and value should be of same types,"
f"got to_replace dtype: {to_replace_col.dtype} and "
f"value dtype: {replacement_col.dtype}"
)
df = cudf.DataFrame({"old": to_replace_col, "new": replacement_col})
df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True)
if df._data["old"].null_count == 1:
fill_value = df._data["new"][df._data["old"].isna()][0]
if fill_value in self.categories:
replaced = self.fillna(fill_value)
else:
new_categories = self.categories.append(
column.as_column([fill_value])
)
replaced = self.copy()
replaced = replaced._set_categories(new_categories)
replaced = replaced.fillna(fill_value)
df = df.dropna(subset=["old"])
to_replace_col = df["old"]._column
replacement_col = df["new"]._column
else:
replaced = self
if df._data["new"].null_count > 0:
drop_values = df._data["old"][df._data["new"].isna()]
cur_categories = replaced.categories
new_categories = cur_categories[
~cudf.Series(cur_categories.isin(drop_values))
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved
]
replaced = replaced._set_categories(new_categories)
df = df.dropna(subset=["new"])
to_replace_col = df["old"]._column
replacement_col = df["new"]._column

# create a dataframe containing the pre-replacement categories
# and a copy of them to work with. The index of this dataframe
# represents the original ints that map to the categories
old_cats = cudf.DataFrame()
old_cats["cats"] = column.as_column(self.dtype.categories)
old_cats["cats"] = column.as_column(replaced.dtype.categories)
new_cats = old_cats.copy(deep=True)

# Create a column with the appropriate labels replaced
Expand Down Expand Up @@ -1116,11 +1148,11 @@ def find_and_replace(
# named 'index', which came from the filtered categories,
# contains the new ints that we need to map to
to_replace_col = column.as_column(catmap.index).astype(
self.codes.dtype
replaced.codes.dtype
)
replacement_col = catmap["index"]._column.astype(self.codes.dtype)
replacement_col = catmap["index"]._column.astype(replaced.codes.dtype)

replaced = column.as_column(self.codes)
replaced = column.as_column(replaced.codes)
output = libcudf.replace.replace(
replaced, to_replace_col, replacement_col
)
Expand Down
23 changes: 17 additions & 6 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,8 +298,8 @@ def find_and_replace(
"""
Return col with *to_replace* replaced with *value*.
"""
to_replace_col = as_column(to_replace)
replacement_col = as_column(replacement)
to_replace_col = column.as_column(to_replace)
replacement_col = column.as_column(replacement)

if type(to_replace_col) != type(replacement_col):
raise TypeError(
Expand Down Expand Up @@ -334,8 +334,16 @@ def find_and_replace(
to_replace_col, replacement_col, replaced = numeric_normalize_types(
to_replace_col, replacement_col, replaced
)
df = cudf.DataFrame({"old": to_replace_col, "new": replacement_col})
df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True)
if df._data["old"].null_count == 1:
replaced = replaced.fillna(
df._data["new"][df._data["old"].isna()][0]
)
df = df.dropna(subset=["old"])

return libcudf.replace.replace(
replaced, to_replace_col, replacement_col
replaced, df["old"]._column, df["new"]._column
)

def fillna(
Expand Down Expand Up @@ -618,9 +626,12 @@ def _normalize_find_and_replace_input(
)
# Scalar case
if len(col_to_normalize) == 1:
col_to_normalize_casted = input_column_dtype.type(
col_to_normalize[0]
)
if cudf._lib.scalar._is_null_host_scalar(col_to_normalize[0]):
return normalized_column.astype(input_column_dtype)
else:
col_to_normalize_casted = input_column_dtype.type(
col_to_normalize[0]
)
if not np.isnan(col_to_normalize_casted) and (
col_to_normalize_casted != col_to_normalize[0]
):
Expand Down
19 changes: 15 additions & 4 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -5277,8 +5277,16 @@ def find_and_replace(
and replacement_col.dtype != self.dtype
):
return self.copy()

return libcudf.replace.replace(self, to_replace_col, replacement_col)
df = cudf.DataFrame({"old": to_replace_col, "new": replacement_col})
df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True)
if df._data["old"].null_count == 1:
res = self.fillna(df._data["new"][df._data["old"].isna()][0])
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved
df = df.dropna(subset=["old"])
else:
res = self
return libcudf.replace.replace(
res, df["old"]._column, df["new"]._column
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved
)

def fillna(
self,
Expand All @@ -5289,15 +5297,18 @@ def fillna(
if fill_value is not None:
if not is_scalar(fill_value):
fill_value = column.as_column(fill_value, dtype=self.dtype)
elif cudf._lib.scalar._is_null_host_scalar(fill_value):
# Trying to fill <NA> with <NA> value? Return copy.
return self.copy(deep=True)
return super().fillna(value=fill_value, dtype="object")
else:
return super().fillna(method=method)

def _find_first_and_last(self, value: ScalarLike) -> Tuple[int, int]:
found_indices = libstrings.contains_re(self, f"^{value}$")
found_indices = libcudf.unary.cast(found_indices, dtype=np.int32)
first = column.as_column(found_indices).find_first_value(1)
last = column.as_column(found_indices).find_last_value(1)
first = column.as_column(found_indices).find_first_value(np.int32(1))
last = column.as_column(found_indices).find_last_value(np.int32(1))
return first, last

def find_first_value(
Expand Down
66 changes: 66 additions & 0 deletions python/cudf/cudf/tests/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -1275,3 +1275,69 @@ def test_series_replace_errors():
lfunc_args_and_kwargs=([{"a": 1}, object()],),
rfunc_args_and_kwargs=([{"a": 1}, object()],),
)


@pytest.mark.parametrize(
"gsr,old,new,expected",
[
(
cudf.Series(["a", "b", "c", None]),
None,
"a",
cudf.Series(["a", "b", "c", "a"]),
),
(
cudf.Series(["a", "b", "c", None]),
[None, "a", "a"],
["c", "b", "d"],
cudf.Series(["d", "b", "c", "c"]),
),
(
cudf.Series(["a", "b", "c", None]),
[None, "a"],
["b", None],
cudf.Series([None, "b", "c", "b"]),
),
(
cudf.Series(["a", "b", "c", None]),
[None, None],
[None, None],
cudf.Series(["a", "b", "c", None]),
),
(cudf.Series([1, 2, None, 3]), None, 10, cudf.Series([1, 2, 10, 3])),
(
cudf.Series([1, 2, None, 3]),
[None, 1, 1],
[3, 2, 4],
cudf.Series([4, 2, 3, 3]),
),
(
cudf.Series([1, 2, None, 3]),
[None, 1],
[2, None],
cudf.Series([None, 2, 2, 3]),
),
(
cudf.Series(["a", "q", "t", None], dtype="category"),
None,
"z",
cudf.Series(["a", "q", "t", "z"], dtype="category"),
),
(
cudf.Series(["a", "q", "t", None], dtype="category"),
[None, "a", "q"],
["z", None, None],
cudf.Series([None, None, "t", "z"], dtype="category"),
),
(
cudf.Series(["a", None, "t", None], dtype="category"),
[None, "t"],
["p", None],
cudf.Series(["a", "p", None, "p"], dtype="category"),
),
],
)
def test_replace_nulls(gsr, old, new, expected):

actual = gsr.replace(old, new)
assert_eq(expected, actual)