From fae3e8034faf66eb8ef00bcbed73d48e4ef791d3 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 15 Nov 2024 19:50:49 +0100 Subject: [PATCH] TST (string dtype): resolve xfails for frame fillna and replace tests + fix bug in replace for string (#60295) * TST (string dtype): resolve xfails for frame fillna and replace tests + fix bug in replace for string * fix fillna upcast issue * fix reshaping of condition in where - only do for 2d blocks --- pandas/core/array_algos/replace.py | 2 + pandas/core/internals/blocks.py | 7 ++ pandas/tests/frame/methods/test_fillna.py | 57 +++++-------- pandas/tests/frame/methods/test_replace.py | 94 +++++++++++----------- 4 files changed, 80 insertions(+), 80 deletions(-) diff --git a/pandas/core/array_algos/replace.py b/pandas/core/array_algos/replace.py index f946c5adcbb0b..a9ad66b7cb2e5 100644 --- a/pandas/core/array_algos/replace.py +++ b/pandas/core/array_algos/replace.py @@ -151,4 +151,6 @@ def re_replacer(s): if mask is None: values[:] = f(values) else: + if values.ndim != mask.ndim: + mask = np.broadcast_to(mask, values.shape) values[mask] = f(values[mask]) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 3c207e8c14b5b..54273ff89f1af 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1688,6 +1688,13 @@ def where(self, other, cond) -> list[Block]: if isinstance(self.dtype, (IntervalDtype, StringDtype)): # TestSetitemFloatIntervalWithIntIntervalValues blk = self.coerce_to_target_dtype(orig_other, raise_on_upcast=False) + if ( + self.ndim == 2 + and isinstance(orig_cond, np.ndarray) + and orig_cond.ndim == 1 + and not is_1d_only_ea_dtype(blk.dtype) + ): + orig_cond = orig_cond[:, None] return blk.where(orig_other, orig_cond) elif isinstance(self, NDArrayBackedExtensionBlock): diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index ad1a37916e381..67d1d45af1cb3 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( Categorical, DataFrame, @@ -65,15 +63,20 @@ def test_fillna_datetime(self, datetime_frame): with pytest.raises(TypeError, match=msg): datetime_frame.fillna() - # TODO(infer_string) test as actual error instead of xfail - @pytest.mark.xfail(using_string_dtype(), reason="can't fill 0 in string") - def test_fillna_mixed_type(self, float_string_frame): + def test_fillna_mixed_type(self, float_string_frame, using_infer_string): mf = float_string_frame mf.loc[mf.index[5:20], "foo"] = np.nan mf.loc[mf.index[-10:], "A"] = np.nan - # TODO: make stronger assertion here, GH 25640 - mf.fillna(value=0) - mf.ffill() + + result = mf.ffill() + assert ( + result.loc[result.index[-10:], "A"] == result.loc[result.index[-11], "A"] + ).all() + assert (result.loc[result.index[5:20], "foo"] == "bar").all() + + result = mf.fillna(value=0) + assert (result.loc[result.index[-10:], "A"] == 0).all() + assert (result.loc[result.index[5:20], "foo"] == 0).all() def test_fillna_mixed_float(self, mixed_float_frame): # mixed numeric (but no float16) @@ -84,28 +87,21 @@ def test_fillna_mixed_float(self, mixed_float_frame): result = mf.ffill() _check_mixed_float(result, dtype={"C": None}) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_fillna_different_dtype(self, using_infer_string): + def test_fillna_different_dtype(self): # with different dtype (GH#3386) df = DataFrame( [["a", "a", np.nan, "a"], ["b", "b", np.nan, "b"], ["c", "c", np.nan, "c"]] ) - if using_infer_string: - with tm.assert_produces_warning(FutureWarning, match="Downcasting"): - result = df.fillna({2: "foo"}) - else: - result = df.fillna({2: "foo"}) + result = df.fillna({2: "foo"}) expected = DataFrame( [["a", "a", "foo", "a"], ["b", "b", "foo", "b"], ["c", "c", "foo", "c"]] ) + # column is originally float (all-NaN) -> filling with string gives object dtype + expected[2] = expected[2].astype("object") tm.assert_frame_equal(result, expected) - if using_infer_string: - with tm.assert_produces_warning(FutureWarning, match="Downcasting"): - return_value = df.fillna({2: "foo"}, inplace=True) - else: - return_value = df.fillna({2: "foo"}, inplace=True) + return_value = df.fillna({2: "foo"}, inplace=True) tm.assert_frame_equal(df, expected) assert return_value is None @@ -276,8 +272,7 @@ def test_fillna_dictlike_value_duplicate_colnames(self, columns): expected["A"] = 0.0 tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_fillna_dtype_conversion(self, using_infer_string): + def test_fillna_dtype_conversion(self): # make sure that fillna on an empty frame works df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5]) result = df.dtypes @@ -292,7 +287,7 @@ def test_fillna_dtype_conversion(self, using_infer_string): # empty block df = DataFrame(index=range(3), columns=["A", "B"], dtype="float64") result = df.fillna("nan") - expected = DataFrame("nan", index=range(3), columns=["A", "B"]) + expected = DataFrame("nan", dtype="object", index=range(3), columns=["A", "B"]) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("val", ["", 1, np.nan, 1.0]) @@ -540,18 +535,10 @@ def test_fillna_col_reordering(self): filled = df.ffill() assert df.columns.tolist() == filled.columns.tolist() - # TODO(infer_string) test as actual error instead of xfail - @pytest.mark.xfail(using_string_dtype(), reason="can't fill 0 in string") - def test_fill_corner(self, float_frame, float_string_frame): - mf = float_string_frame - mf.loc[mf.index[5:20], "foo"] = np.nan - mf.loc[mf.index[-10:], "A"] = np.nan - - filled = float_string_frame.fillna(value=0) - assert (filled.loc[filled.index[5:20], "foo"] == 0).all() - del float_string_frame["foo"] - - float_frame.reindex(columns=[]).fillna(value=0) + def test_fill_empty(self, float_frame): + df = float_frame.reindex(columns=[]) + result = df.fillna(value=0) + tm.assert_frame_equal(result, df) def test_fillna_with_columns_and_limit(self): # GH40989 diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 73f44bcc6657e..b2320798ea9a2 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -30,7 +28,6 @@ def mix_abc() -> dict[str, list[float | str]]: class TestDataFrameReplace: - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_inplace(self, datetime_frame, float_string_frame): datetime_frame.loc[datetime_frame.index[:5], "A"] = np.nan datetime_frame.loc[datetime_frame.index[-5:], "A"] = np.nan @@ -46,7 +43,9 @@ def test_replace_inplace(self, datetime_frame, float_string_frame): mf.iloc[-10:, mf.columns.get_loc("A")] = np.nan result = float_string_frame.replace(np.nan, 0) - expected = float_string_frame.fillna(value=0) + expected = float_string_frame.copy() + expected["foo"] = expected["foo"].astype(object) + expected = expected.fillna(value=0) tm.assert_frame_equal(result, expected) tsframe = datetime_frame.copy() @@ -291,22 +290,20 @@ def test_regex_replace_dict_nested_non_first_character( expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_regex_replace_dict_nested_gh4115(self): - df = DataFrame({"Type": ["Q", "T", "Q", "Q", "T"], "tmp": 2}) - expected = DataFrame( - {"Type": Series([0, 1, 0, 0, 1], dtype=df.Type.dtype), "tmp": 2} + df = DataFrame( + {"Type": Series(["Q", "T", "Q", "Q", "T"], dtype=object), "tmp": 2} ) + expected = DataFrame({"Type": Series([0, 1, 0, 0, 1], dtype=object), "tmp": 2}) result = df.replace({"Type": {"Q": 0, "T": 1}}) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_regex_replace_list_to_scalar(self, mix_abc): df = DataFrame(mix_abc) expec = DataFrame( { "a": mix_abc["a"], - "b": np.array([np.nan] * 4, dtype=object), + "b": Series([np.nan] * 4, dtype="str"), "c": [np.nan, np.nan, np.nan, "d"], } ) @@ -326,7 +323,6 @@ def test_regex_replace_list_to_scalar(self, mix_abc): tm.assert_frame_equal(res2, expec) tm.assert_frame_equal(res3, expec) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_regex_replace_str_to_numeric(self, mix_abc): # what happens when you try to replace a numeric value with a regex? df = DataFrame(mix_abc) @@ -338,11 +334,12 @@ def test_regex_replace_str_to_numeric(self, mix_abc): return_value = res3.replace(regex=r"\s*\.\s*", value=0, inplace=True) assert return_value is None expec = DataFrame({"a": mix_abc["a"], "b": ["a", "b", 0, 0], "c": mix_abc["c"]}) + # TODO(infer_string) + expec["c"] = expec["c"].astype(object) tm.assert_frame_equal(res, expec) tm.assert_frame_equal(res2, expec) tm.assert_frame_equal(res3, expec) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_regex_replace_regex_list_to_numeric(self, mix_abc): df = DataFrame(mix_abc) res = df.replace([r"\s*\.\s*", "b"], 0, regex=True) @@ -535,31 +532,37 @@ def test_replace_series_dict(self): result = df.replace(s, df.mean()) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") - def test_replace_convert(self): - # gh 3907 - df = DataFrame([["foo", "bar", "bah"], ["bar", "foo", "bah"]]) + def test_replace_convert(self, any_string_dtype): + # gh 3907 (pandas >= 3.0 no longer converts dtypes) + df = DataFrame( + [["foo", "bar", "bah"], ["bar", "foo", "bah"]], dtype=any_string_dtype + ) m = {"foo": 1, "bar": 2, "bah": 3} rep = df.replace(m) - expec = df.dtypes - res = rep.dtypes - tm.assert_series_equal(expec, res) + assert (rep.dtypes == object).all() - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_mixed(self, float_string_frame): mf = float_string_frame mf.iloc[5:20, mf.columns.get_loc("foo")] = np.nan mf.iloc[-10:, mf.columns.get_loc("A")] = np.nan result = float_string_frame.replace(np.nan, -18) - expected = float_string_frame.fillna(value=-18) + expected = float_string_frame.copy() + expected["foo"] = expected["foo"].astype(object) + expected = expected.fillna(value=-18) tm.assert_frame_equal(result, expected) - tm.assert_frame_equal(result.replace(-18, np.nan), float_string_frame) + expected2 = float_string_frame.copy() + expected2["foo"] = expected2["foo"].astype(object) + tm.assert_frame_equal(result.replace(-18, np.nan), expected2) result = float_string_frame.replace(np.nan, -1e8) - expected = float_string_frame.fillna(value=-1e8) + expected = float_string_frame.copy() + expected["foo"] = expected["foo"].astype(object) + expected = expected.fillna(value=-1e8) tm.assert_frame_equal(result, expected) - tm.assert_frame_equal(result.replace(-1e8, np.nan), float_string_frame) + expected2 = float_string_frame.copy() + expected2["foo"] = expected2["foo"].astype(object) + tm.assert_frame_equal(result.replace(-1e8, np.nan), expected2) def test_replace_mixed_int_block_upcasting(self): # int block upcasting @@ -601,8 +604,7 @@ def test_replace_mixed_int_block_splitting(self): result = df.replace(0, 0.5) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_replace_mixed2(self, using_infer_string): + def test_replace_mixed2(self): # to object block upcasting df = DataFrame( { @@ -621,7 +623,7 @@ def test_replace_mixed2(self, using_infer_string): expected = DataFrame( { - "A": Series(["foo", "bar"]), + "A": Series(["foo", "bar"], dtype="object"), "B": Series([0, "foo"], dtype="object"), } ) @@ -917,8 +919,7 @@ def test_replace_limit(self): # TODO pass - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") - def test_replace_dict_no_regex(self): + def test_replace_dict_no_regex(self, any_string_dtype): answer = Series( { 0: "Strongly Agree", @@ -926,7 +927,8 @@ def test_replace_dict_no_regex(self): 2: "Neutral", 3: "Disagree", 4: "Strongly Disagree", - } + }, + dtype=any_string_dtype, ) weights = { "Agree": 4, @@ -935,11 +937,11 @@ def test_replace_dict_no_regex(self): "Strongly Agree": 5, "Strongly Disagree": 1, } - expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1}, dtype=answer.dtype) + expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1}, dtype=object) result = answer.replace(weights) tm.assert_series_equal(result, expected) - def test_replace_series_no_regex(self): + def test_replace_series_no_regex(self, any_string_dtype): answer = Series( { 0: "Strongly Agree", @@ -947,7 +949,8 @@ def test_replace_series_no_regex(self): 2: "Neutral", 3: "Disagree", 4: "Strongly Disagree", - } + }, + dtype=any_string_dtype, ) weights = Series( { @@ -1043,16 +1046,15 @@ def test_nested_dict_overlapping_keys_replace_str(self): expected = df.replace({"a": dict(zip(astr, bstr))}) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") - def test_replace_swapping_bug(self, using_infer_string): + def test_replace_swapping_bug(self): df = DataFrame({"a": [True, False, True]}) res = df.replace({"a": {True: "Y", False: "N"}}) - expect = DataFrame({"a": ["Y", "N", "Y"]}) + expect = DataFrame({"a": ["Y", "N", "Y"]}, dtype=object) tm.assert_frame_equal(res, expect) df = DataFrame({"a": [0, 1, 0]}) res = df.replace({"a": {0: "Y", 1: "N"}}) - expect = DataFrame({"a": ["Y", "N", "Y"]}) + expect = DataFrame({"a": ["Y", "N", "Y"]}, dtype=object) tm.assert_frame_equal(res, expect) def test_replace_datetimetz(self): @@ -1186,7 +1188,7 @@ def test_replace_commutative(self, df, to_replace, exp): ) def test_replace_replacer_dtype(self, replacer): # GH26632 - df = DataFrame(["a"]) + df = DataFrame(["a"], dtype=object) result = df.replace({"a": replacer, "b": replacer}) expected = DataFrame([replacer], dtype=object) tm.assert_frame_equal(result, expected) @@ -1266,7 +1268,6 @@ def test_categorical_replace_with_dict(self, replace_dict, final_data): assert return_value is None tm.assert_frame_equal(df, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_replace_value_category_type(self): """ Test for #23305: to ensure category dtypes are maintained @@ -1322,7 +1323,7 @@ def test_replace_value_category_type(self): lambda x: x.astype("category").cat.rename_categories({"cat2": "catX"}) ) - result = result.astype({"col1": "int64", "col3": "float64", "col5": "object"}) + result = result.astype({"col1": "int64", "col3": "float64", "col5": "str"}) tm.assert_frame_equal(result, expected) def test_replace_dict_category_type(self): @@ -1363,12 +1364,11 @@ def test_replace_with_compiled_regex(self): expected = DataFrame(["z", "b", "c"]) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_replace_intervals(self): # https://github.com/pandas-dev/pandas/issues/35931 df = DataFrame({"a": [pd.Interval(0, 1), pd.Interval(0, 1)]}) result = df.replace({"a": {pd.Interval(0, 1): "x"}}) - expected = DataFrame({"a": ["x", "x"]}) + expected = DataFrame({"a": ["x", "x"]}, dtype=object) tm.assert_frame_equal(result, expected) def test_replace_unicode(self): @@ -1468,17 +1468,21 @@ def test_regex_replace_scalar( expected.loc[expected["a"] == ".", "a"] = expected_replace_val tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") @pytest.mark.parametrize("regex", [False, True]) def test_replace_regex_dtype_frame(self, regex): # GH-48644 df1 = DataFrame({"A": ["0"], "B": ["0"]}) - expected_df1 = DataFrame({"A": [1], "B": [1]}, dtype=df1.dtypes.iloc[0]) + expected_df1 = DataFrame({"A": [1], "B": [1]}, dtype=object) result_df1 = df1.replace(to_replace="0", value=1, regex=regex) tm.assert_frame_equal(result_df1, expected_df1) df2 = DataFrame({"A": ["0"], "B": ["1"]}) - expected_df2 = DataFrame({"A": [1], "B": ["1"]}, dtype=df2.dtypes.iloc[0]) + if regex: + # TODO(infer_string): both string columns get cast to object, + # while only needed for column A + expected_df2 = DataFrame({"A": [1], "B": ["1"]}, dtype=object) + else: + expected_df2 = DataFrame({"A": Series([1], dtype=object), "B": ["1"]}) result_df2 = df2.replace(to_replace="0", value=1, regex=regex) tm.assert_frame_equal(result_df2, expected_df2)