diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 3aa53d4b07aa5..5fa1a984b8aea 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -12,6 +12,7 @@ import numpy as np +from pandas._config import using_string_dtype from pandas._config.localization import ( can_set_locale, get_locales, @@ -106,7 +107,10 @@ ALL_FLOAT_DTYPES: list[Dtype] = [*FLOAT_NUMPY_DTYPES, *FLOAT_EA_DTYPES] COMPLEX_DTYPES: list[Dtype] = [complex, "complex64", "complex128"] -STRING_DTYPES: list[Dtype] = [str, "str", "U"] +if using_string_dtype(): + STRING_DTYPES: list[Dtype] = [str, "U"] +else: + STRING_DTYPES: list[Dtype] = [str, "str", "U"] # type: ignore[no-redef] COMPLEX_FLOAT_DTYPES: list[Dtype] = [*COMPLEX_DTYPES, *FLOAT_NUMPY_DTYPES] DATETIME64_DTYPES: list[Dtype] = ["datetime64[ns]", "M8[ns]"] diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index a17056b51a014..d07bfeda50e1d 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -575,7 +575,10 @@ def __getitem__(self, item: PositionalIndexer): if isinstance(item, np.ndarray): if not len(item): # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string] - if self._dtype.name == "string" and self._dtype.storage == "pyarrow": + if ( + isinstance(self._dtype, StringDtype) + and self._dtype.storage == "pyarrow" + ): # TODO(infer_string) should this be large_string? pa_dtype = pa.string() else: diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index b75dbaa3a15e8..8a4fd9fc1b34d 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -4,7 +4,6 @@ from typing import ( TYPE_CHECKING, Any, - ClassVar, Literal, cast, ) @@ -118,9 +117,12 @@ class StringDtype(StorageExtensionDtype): string[pyarrow] """ - # error: Cannot override instance variable (previously declared on - # base class "StorageExtensionDtype") with class variable - name: ClassVar[str] = "string" # type: ignore[misc] + @property + def name(self) -> str: # type: ignore[override] + if self._na_value is libmissing.NA: + return "string" + else: + return "str" #: StringDtype().na_value uses pandas.NA except the implementation that # follows NumPy semantics, which uses nan. @@ -137,7 +139,7 @@ def __init__( ) -> None: # infer defaults if storage is None: - if using_string_dtype() and na_value is not libmissing.NA: + if na_value is not libmissing.NA: if HAS_PYARROW: storage = "pyarrow" else: @@ -170,11 +172,19 @@ def __init__( self.storage = storage self._na_value = na_value + def __repr__(self) -> str: + if self._na_value is libmissing.NA: + return f"{self.name}[{self.storage}]" + else: + # TODO add more informative repr + return self.name + def __eq__(self, other: object) -> bool: # we need to override the base class __eq__ because na_value (NA or NaN) # cannot be checked with normal `==` if isinstance(other, str): - if other == self.name: + # TODO should dtype == "string" work for the NaN variant? + if other == "string" or other == self.name: # noqa: PLR1714 return True try: other = self.construct_from_string(other) @@ -231,6 +241,8 @@ def construct_from_string(cls, string) -> Self: ) if string == "string": return cls() + elif string == "str" and using_string_dtype(): + return cls(na_value=np.nan) elif string == "string[python]": return cls(storage="python") elif string == "string[pyarrow]": diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ea91046f4b8e4..b8039746d9952 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4807,7 +4807,9 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame: ----- * To select all *numeric* types, use ``np.number`` or ``'number'`` * To select strings you must use the ``object`` dtype, but note that - this will return *all* object dtype columns + this will return *all* object dtype columns. With + ``pd.options.future.infer_string`` enabled, using ``"str"`` will + work to select all string columns. * See the `numpy dtype hierarchy `__ * To select datetimes, use ``np.datetime64``, ``'datetime'`` or diff --git a/pandas/core/interchange/utils.py b/pandas/core/interchange/utils.py index fd1c7c9639242..035a1f8abdbc5 100644 --- a/pandas/core/interchange/utils.py +++ b/pandas/core/interchange/utils.py @@ -135,7 +135,12 @@ def dtype_to_arrow_c_fmt(dtype: DtypeObj) -> str: if format_str is not None: return format_str - if lib.is_np_dtype(dtype, "M"): + if isinstance(dtype, pd.StringDtype): + # TODO(infer_string) this should be LARGE_STRING for pyarrow storage, + # but current tests don't cover this distinction + return ArrowCTypes.STRING + + elif lib.is_np_dtype(dtype, "M"): # Selecting the first char of resolution string: # dtype.str -> ' 'n' resolution = np.datetime_data(dtype)[0][0] diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index aee9100702350..6ac0b49f0e4e7 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -110,7 +110,7 @@ def test_numba_unsupported_dtypes(apply_axis): with pytest.raises( ValueError, - match="Column b must have a numeric dtype. Found 'object|string' instead", + match="Column b must have a numeric dtype. Found 'object|str' instead", ): df.apply(f, engine="numba", axis=apply_axis) diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index 53fc135e77780..76704de6f2d10 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -224,7 +224,7 @@ def test_apply_categorical(by_row, using_infer_string): result = ser.apply(lambda x: "A") exp = Series(["A"] * 7, name="XX", index=list("abcdefg")) tm.assert_series_equal(result, exp) - assert result.dtype == object if not using_infer_string else "string[pyarrow_numpy]" + assert result.dtype == object if not using_infer_string else "str" @pytest.mark.parametrize("series", [["1-1", "1-1", np.nan], ["1-1", "1-2", np.nan]]) diff --git a/pandas/tests/arrays/boolean/test_astype.py b/pandas/tests/arrays/boolean/test_astype.py index 932e903c0e448..8c2672218f273 100644 --- a/pandas/tests/arrays/boolean/test_astype.py +++ b/pandas/tests/arrays/boolean/test_astype.py @@ -5,7 +5,7 @@ import pandas._testing as tm -def test_astype(): +def test_astype(using_infer_string): # with missing values arr = pd.array([True, False, None], dtype="boolean") @@ -20,8 +20,14 @@ def test_astype(): tm.assert_numpy_array_equal(result, expected) result = arr.astype("str") - expected = np.array(["True", "False", ""], dtype=f"{tm.ENDIAN}U5") - tm.assert_numpy_array_equal(result, expected) + if using_infer_string: + expected = pd.array( + ["True", "False", None], dtype=pd.StringDtype(na_value=np.nan) + ) + tm.assert_extension_array_equal(result, expected) + else: + expected = np.array(["True", "False", ""], dtype=f"{tm.ENDIAN}U5") + tm.assert_numpy_array_equal(result, expected) # no missing values arr = pd.array([True, False, True], dtype="boolean") diff --git a/pandas/tests/arrays/categorical/test_astype.py b/pandas/tests/arrays/categorical/test_astype.py index 7cfd8ec8dfadf..00999d491b242 100644 --- a/pandas/tests/arrays/categorical/test_astype.py +++ b/pandas/tests/arrays/categorical/test_astype.py @@ -88,7 +88,7 @@ def test_astype(self, ordered): expected = np.array(cat) tm.assert_numpy_array_equal(result, expected) - msg = r"Cannot cast object|string dtype to float64" + msg = r"Cannot cast object|str dtype to float64" with pytest.raises(ValueError, match=msg): cat.astype(float) diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index e2e5d47f50209..3a2c489920eb0 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -22,7 +22,7 @@ def test_print(self, using_infer_string): if using_infer_string: expected = [ "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']", - "Categories (3, string): [a < b < c]", + "Categories (3, str): [a < b < c]", ] else: expected = [ diff --git a/pandas/tests/arrays/floating/test_astype.py b/pandas/tests/arrays/floating/test_astype.py index ade3dbd2c99da..ccf644b34051d 100644 --- a/pandas/tests/arrays/floating/test_astype.py +++ b/pandas/tests/arrays/floating/test_astype.py @@ -63,12 +63,21 @@ def test_astype_to_integer_array(): tm.assert_extension_array_equal(result, expected) -def test_astype_str(): +def test_astype_str(using_infer_string): a = pd.array([0.1, 0.2, None], dtype="Float64") - expected = np.array(["0.1", "0.2", ""], dtype="U32") - tm.assert_numpy_array_equal(a.astype(str), expected) - tm.assert_numpy_array_equal(a.astype("str"), expected) + if using_infer_string: + expected = pd.array(["0.1", "0.2", None], dtype=pd.StringDtype(na_value=np.nan)) + tm.assert_extension_array_equal(a.astype("str"), expected) + + # TODO(infer_string) this should also be a string array like above + expected = np.array(["0.1", "0.2", ""], dtype="U32") + tm.assert_numpy_array_equal(a.astype(str), expected) + else: + expected = np.array(["0.1", "0.2", ""], dtype="U32") + + tm.assert_numpy_array_equal(a.astype(str), expected) + tm.assert_numpy_array_equal(a.astype("str"), expected) def test_astype_copy(): diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py index 8ed6dbcd32d3d..fadd7ac67b58d 100644 --- a/pandas/tests/arrays/integer/test_dtypes.py +++ b/pandas/tests/arrays/integer/test_dtypes.py @@ -276,12 +276,21 @@ def test_to_numpy_na_raises(dtype): a.to_numpy(dtype=dtype) -def test_astype_str(): +def test_astype_str(using_infer_string): a = pd.array([1, 2, None], dtype="Int64") - expected = np.array(["1", "2", ""], dtype=f"{tm.ENDIAN}U21") - tm.assert_numpy_array_equal(a.astype(str), expected) - tm.assert_numpy_array_equal(a.astype("str"), expected) + if using_infer_string: + expected = pd.array(["1", "2", None], dtype=pd.StringDtype(na_value=np.nan)) + tm.assert_extension_array_equal(a.astype("str"), expected) + + # TODO(infer_string) this should also be a string array like above + expected = np.array(["1", "2", ""], dtype=f"{tm.ENDIAN}U21") + tm.assert_numpy_array_equal(a.astype(str), expected) + else: + expected = np.array(["1", "2", ""], dtype=f"{tm.ENDIAN}U21") + + tm.assert_numpy_array_equal(a.astype(str), expected) + tm.assert_numpy_array_equal(a.astype("str"), expected) def test_astype_boolean(): diff --git a/pandas/tests/arrays/interval/test_interval_pyarrow.py b/pandas/tests/arrays/interval/test_interval_pyarrow.py index be87d5d3ef7ba..ef8701be81e2b 100644 --- a/pandas/tests/arrays/interval/test_interval_pyarrow.py +++ b/pandas/tests/arrays/interval/test_interval_pyarrow.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd import pandas._testing as tm from pandas.core.arrays import IntervalArray @@ -82,7 +80,6 @@ def test_arrow_array_missing(): assert result.storage.equals(expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.filterwarnings( "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) diff --git a/pandas/tests/arrays/period/test_arrow_compat.py b/pandas/tests/arrays/period/test_arrow_compat.py index ff86b696c8403..431309aca0df2 100644 --- a/pandas/tests/arrays/period/test_arrow_compat.py +++ b/pandas/tests/arrays/period/test_arrow_compat.py @@ -1,7 +1,5 @@ import pytest -from pandas._config import using_string_dtype - from pandas.compat.pyarrow import pa_version_under10p1 from pandas.core.dtypes.dtypes import PeriodDtype @@ -79,7 +77,6 @@ def test_arrow_array_missing(): assert result.storage.equals(expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_arrow_table_roundtrip(): from pandas.core.arrays.arrow.extension_types import ArrowPeriodType @@ -99,7 +96,6 @@ def test_arrow_table_roundtrip(): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_arrow_load_from_zero_chunks(): # GH-41040 diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 3688d2998b3c7..91ad01a2fb0eb 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -66,7 +66,7 @@ def test_repr(dtype): assert repr(df) == expected if dtype.na_value is np.nan: - expected = "0 a\n1 NaN\n2 b\nName: A, dtype: string" + expected = "0 a\n1 NaN\n2 b\nName: A, dtype: str" else: expected = "0 a\n1 \n2 b\nName: A, dtype: string" assert repr(df.A) == expected @@ -76,10 +76,10 @@ def test_repr(dtype): expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string" elif dtype.storage == "pyarrow" and dtype.na_value is np.nan: arr_name = "ArrowStringArrayNumpySemantics" - expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string" + expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: str" elif dtype.storage == "python" and dtype.na_value is np.nan: arr_name = "StringArrayNumpySemantics" - expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string" + expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: str" else: arr_name = "StringArray" expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string" @@ -500,7 +500,7 @@ def test_fillna_args(dtype): tm.assert_extension_array_equal(res, expected) if dtype.storage == "pyarrow": - msg = "Invalid value '1' for dtype string" + msg = "Invalid value '1' for dtype str" else: msg = "Cannot set non-string value '1' into a StringArray." with pytest.raises(TypeError, match=msg): @@ -522,7 +522,7 @@ def test_arrow_array(dtype): assert arr.equals(expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_arrow_roundtrip(dtype, string_storage, using_infer_string): # roundtrip possible from arrow 1.0.0 @@ -537,14 +537,17 @@ def test_arrow_roundtrip(dtype, string_storage, using_infer_string): assert table.field("a").type == "large_string" with pd.option_context("string_storage", string_storage): result = table.to_pandas() - assert isinstance(result["a"].dtype, pd.StringDtype) - expected = df.astype(f"string[{string_storage}]") - tm.assert_frame_equal(result, expected) - # ensure the missing value is represented by NA and not np.nan or None - assert result.loc[2, "a"] is result["a"].dtype.na_value + if dtype.na_value is np.nan and not using_string_dtype(): + assert result["a"].dtype == "object" + else: + assert isinstance(result["a"].dtype, pd.StringDtype) + expected = df.astype(f"string[{string_storage}]") + tm.assert_frame_equal(result, expected) + # ensure the missing value is represented by NA and not np.nan or None + assert result.loc[2, "a"] is result["a"].dtype.na_value -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string): # GH-41040 @@ -561,9 +564,13 @@ def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string): table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema) with pd.option_context("string_storage", string_storage): result = table.to_pandas() - assert isinstance(result["a"].dtype, pd.StringDtype) - expected = df.astype(f"string[{string_storage}]") - tm.assert_frame_equal(result, expected) + + if dtype.na_value is np.nan and not using_string_dtype(): + assert result["a"].dtype == "object" + else: + assert isinstance(result["a"].dtype, pd.StringDtype) + expected = df.astype(f"string[{string_storage}]") + tm.assert_frame_equal(result, expected) def test_value_counts_na(dtype): diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 65c6ce8e9cd08..7d4aae0f7bb4e 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -4,6 +4,7 @@ import numpy as np import pytest +from pandas.compat import HAS_PYARROW import pandas.util._test_decorators as td import pandas as pd @@ -27,8 +28,9 @@ def test_eq_all_na(): def test_config(string_storage, request, using_infer_string): - if using_infer_string and string_storage == "python": - # python string storage with na_value=NaN is not yet implemented + if using_infer_string and string_storage == "python" and HAS_PYARROW: + # string storage with na_value=NaN always uses pyarrow if available + # -> does not yet honor the option request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) with pd.option_context("string_storage", string_storage): diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 3d8f8d791b763..5834b268be2be 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -297,7 +297,9 @@ def test_searchsorted(self): assert result == 10 @pytest.mark.parametrize("box", [None, "index", "series"]) - def test_searchsorted_castable_strings(self, arr1d, box, string_storage): + def test_searchsorted_castable_strings( + self, arr1d, box, string_storage, using_infer_string + ): arr = arr1d if box is None: pass @@ -333,7 +335,8 @@ def test_searchsorted_castable_strings(self, arr1d, box, string_storage): TypeError, match=re.escape( f"value should be a '{arr1d._scalar_type.__name__}', 'NaT', " - "or array of those. Got string array instead." + "or array of those. Got " + f"{'str' if using_infer_string else 'string'} array instead." ), ): arr.searchsorted([str(arr[1]), "baz"]) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index f47815ee059af..4bf97b1fd8494 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -799,3 +799,22 @@ def test_pandas_dtype_ea_not_instance(): # GH 31356 GH 54592 with tm.assert_produces_warning(UserWarning, match="without any arguments"): assert pandas_dtype(CategoricalDtype) == CategoricalDtype() + + +def test_pandas_dtype_string_dtypes(string_storage): + # TODO(infer_string) remove skip if "python" is supported + pytest.importorskip("pyarrow") + with pd.option_context("future.infer_string", True): + with pd.option_context("string_storage", string_storage): + result = pandas_dtype("str") + # TODO(infer_string) hardcoded to pyarrow until python is supported + assert result == pd.StringDtype("pyarrow", na_value=np.nan) + + with pd.option_context("future.infer_string", False): + with pd.option_context("string_storage", string_storage): + result = pandas_dtype("str") + assert result == np.dtype("U") + + with pd.option_context("string_storage", string_storage): + result = pandas_dtype("string") + assert result == pd.StringDtype(string_storage, na_value=pd.NA) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index b6c5becf49fa0..7c7da41124b83 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -1061,7 +1061,7 @@ def test_str_vs_repr(self, ordered, using_infer_string): c1 = CategoricalDtype(["a", "b"], ordered=ordered) assert str(c1) == "category" # Py2 will have unicode prefixes - dtype = "string" if using_infer_string else "object" + dtype = "str" if using_infer_string else "object" pat = ( r"CategoricalDtype\(categories=\[.*\], ordered={ordered}, " rf"categories_dtype={dtype}\)" diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 2ab248787a1cf..b59c10824c5c4 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -117,6 +117,20 @@ def test_is_not_string_type(self, dtype): # because StringDtype is a string type assert is_string_dtype(dtype) + def test_is_dtype_from_name(self, dtype, using_infer_string): + if dtype.na_value is np.nan and not using_infer_string: + result = type(dtype).is_dtype(dtype.name) + assert result is False + else: + super().test_is_dtype_from_name(dtype) + + def test_construct_from_string_own_name(self, dtype, using_infer_string): + if dtype.na_value is np.nan and not using_infer_string: + with pytest.raises(TypeError, match="Cannot construct a 'StringDtype'"): + dtype.construct_from_string(dtype.name) + else: + super().test_construct_from_string_own_name(dtype) + def test_view(self, data): if data.dtype.storage == "pyarrow": pytest.skip(reason="2D support not implemented for ArrowStringArray") diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index b0b33b4a565ec..826ac2be3339b 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -323,7 +323,7 @@ def test_setitem(self, float_frame, using_infer_string): smaller["col10"] = ["1", "2"] if using_infer_string: - assert smaller["col10"].dtype == "string" + assert smaller["col10"].dtype == "str" else: assert smaller["col10"].dtype == np.object_ assert (smaller["col10"] == ["1", "2"]).all() @@ -458,13 +458,13 @@ def test_setitem_corner(self, float_frame, using_infer_string): del dm["foo"] dm["foo"] = "bar" if using_infer_string: - assert dm["foo"].dtype == "string" + assert dm["foo"].dtype == "str" else: assert dm["foo"].dtype == np.object_ dm["coercible"] = ["1", "2", "3"] if using_infer_string: - assert dm["coercible"].dtype == "string" + assert dm["coercible"].dtype == "str" else: assert dm["coercible"].dtype == np.object_ @@ -500,7 +500,7 @@ def test_setitem_ambig(self, using_infer_string): dm[2] = uncoercable_series assert len(dm.columns) == 3 if using_infer_string: - assert dm[2].dtype == "string" + assert dm[2].dtype == "str" else: assert dm[2].dtype == np.object_ diff --git a/pandas/tests/frame/indexing/test_set_value.py b/pandas/tests/frame/indexing/test_set_value.py index aaf95daf232e2..124505f440e6c 100644 --- a/pandas/tests/frame/indexing/test_set_value.py +++ b/pandas/tests/frame/indexing/test_set_value.py @@ -28,7 +28,7 @@ def test_set_value_resize(self, float_frame, using_infer_string): res = float_frame.copy() res._set_value("foobar", "baz", "sam") if using_infer_string: - assert res["baz"].dtype == "string" + assert res["baz"].dtype == "str" else: assert res["baz"].dtype == np.object_ res = float_frame.copy() diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index c6c702a1a0b1b..8647df0e8ad96 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -201,7 +201,7 @@ def test_astype_dict_like(self, dtype_class): expected = DataFrame( { "a": a, - "b": Series(["0", "1", "2", "3", "4"], dtype="object"), + "b": Series(["0", "1", "2", "3", "4"], dtype="str"), "c": c, "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float32"), } @@ -262,9 +262,9 @@ def test_astype_duplicate_col(self): a2 = Series([0, 1, 2, 3, 4], name="a") df = concat([a1, b, a2], axis=1) - result = df.astype(str) + result = df.astype("str") a1_str = Series(["1", "2", "3", "4", "5"], dtype="str", name="a") - b_str = Series(["0.1", "0.2", "0.4", "0.6", "0.8"], dtype=str, name="b") + b_str = Series(["0.1", "0.2", "0.4", "0.6", "0.8"], dtype="str", name="b") a2_str = Series(["0", "1", "2", "3", "4"], dtype="str", name="a") expected = concat([a1_str, b_str, a2_str], axis=1) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_get_numeric_data.py b/pandas/tests/frame/methods/test_get_numeric_data.py index c5d32d56d03c1..6d097e75f6703 100644 --- a/pandas/tests/frame/methods/test_get_numeric_data.py +++ b/pandas/tests/frame/methods/test_get_numeric_data.py @@ -33,7 +33,9 @@ def test_get_numeric_data(self, using_infer_string): [ np.dtype("float64"), np.dtype("int64"), - np.dtype(objectname) if not using_infer_string else "string", + np.dtype(objectname) + if not using_infer_string + else pd.StringDtype(na_value=np.nan), np.dtype(datetime64name), ], index=["a", "b", "c", "f"], diff --git a/pandas/tests/frame/methods/test_nlargest.py b/pandas/tests/frame/methods/test_nlargest.py index 56bb3126455a5..52e871cc795b4 100644 --- a/pandas/tests/frame/methods/test_nlargest.py +++ b/pandas/tests/frame/methods/test_nlargest.py @@ -74,7 +74,7 @@ def test_nlargest_n(self, nselect_method, n, order): ) if "b" in order: error_msg = ( - f"Column 'b' has dtype (object|string), " + f"Column 'b' has dtype (object|str), " f"cannot use method '{nselect_method}' with this dtype" ) with pytest.raises(TypeError, match=error_msg): diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index c487bc4cfb89a..88e43b678a7e4 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -662,7 +662,7 @@ def test_reset_index_dtypes_on_empty_frame_with_multiindex( idx = MultiIndex.from_product([[0, 1], [0.5, 1.0], array]) result = DataFrame(index=idx)[:0].reset_index().dtypes if using_infer_string and dtype == object: - dtype = "string" + dtype = pd.StringDtype(na_value=np.nan) expected = Series({"level_0": np.int64, "level_1": np.float64, "level_2": dtype}) tm.assert_series_equal(result, expected) @@ -695,7 +695,7 @@ def test_reset_index_empty_frame_with_datetime64_multiindex_from_groupby( expected["c3"] = expected["c3"].astype("datetime64[ns]") expected["c1"] = expected["c1"].astype("float64") if using_infer_string: - expected["c2"] = expected["c2"].astype("string[pyarrow_numpy]") + expected["c2"] = expected["c2"].astype("str") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py index d1bee6a3de613..875dca321635f 100644 --- a/pandas/tests/frame/methods/test_select_dtypes.py +++ b/pandas/tests/frame/methods/test_select_dtypes.py @@ -50,7 +50,7 @@ def copy(self): class TestSelectDtypes: - def test_select_dtypes_include_using_list_like(self): + def test_select_dtypes_include_using_list_like(self, using_infer_string): df = DataFrame( { "a": list("abc"), @@ -94,6 +94,11 @@ def test_select_dtypes_include_using_list_like(self): with pytest.raises(NotImplementedError, match=r"^$"): df.select_dtypes(include=["period"]) + if using_infer_string: + ri = df.select_dtypes(include=["str"]) + ei = df[["a"]] + tm.assert_frame_equal(ri, ei) + def test_select_dtypes_exclude_using_list_like(self): df = DataFrame( { @@ -151,7 +156,7 @@ def test_select_dtypes_exclude_include_int(self, include): expected = df[["b", "c", "e"]] tm.assert_frame_equal(result, expected) - def test_select_dtypes_include_using_scalars(self): + def test_select_dtypes_include_using_scalars(self, using_infer_string): df = DataFrame( { "a": list("abc"), @@ -187,6 +192,11 @@ def test_select_dtypes_include_using_scalars(self): with pytest.raises(NotImplementedError, match=r"^$"): df.select_dtypes(include="period") + if using_infer_string: + ri = df.select_dtypes(include="str") + ei = df[["a"]] + tm.assert_frame_equal(ri, ei) + def test_select_dtypes_exclude_using_scalars(self): df = DataFrame( { @@ -347,7 +357,10 @@ def test_select_dtypes_datetime_with_tz(self): @pytest.mark.parametrize("dtype", [str, "str", np.bytes_, "S1", np.str_, "U1"]) @pytest.mark.parametrize("arg", ["include", "exclude"]) - def test_select_dtypes_str_raises(self, dtype, arg): + def test_select_dtypes_str_raises(self, dtype, arg, using_infer_string): + if using_infer_string and dtype == "str": + # this is tested below + pytest.skip("Selecting string columns works with future strings") df = DataFrame( { "a": list("abc"), diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 7fb1658394632..adb327e90bb76 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -714,10 +714,7 @@ def test_to_csv_interval_index(self, temp_file, using_infer_string): # can't roundtrip intervalindex via read_csv so check string repr (GH 23595) expected = df.copy() - if using_infer_string: - expected.index = expected.index.astype("string[pyarrow_numpy]") - else: - expected.index = expected.index.astype(str) + expected.index = expected.index.astype("str") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index c95c382bb5131..47eb387abc8e8 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -186,7 +186,9 @@ def test_construction_with_mixed(self, float_string_frame, using_infer_string): expected = Series( [np.dtype("float64")] * 4 + [ - np.dtype("object") if not using_infer_string else "string", + np.dtype("object") + if not using_infer_string + else pd.StringDtype(na_value=np.nan), np.dtype("datetime64[us]"), np.dtype("timedelta64[us]"), ], diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 607e333d82823..a210af94561f9 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -256,7 +256,7 @@ def test_emptylike_constructor(self, emptylike, expected_index, expected_columns tm.assert_frame_equal(result, expected) def test_constructor_mixed(self, float_string_frame, using_infer_string): - dtype = "string" if using_infer_string else np.object_ + dtype = "str" if using_infer_string else np.object_ assert float_string_frame["foo"].dtype == dtype def test_constructor_cast_failure(self): @@ -760,7 +760,7 @@ def test_constructor_dict_cast(self, using_infer_string): frame = DataFrame(test_data) assert len(frame) == 3 - assert frame["B"].dtype == np.object_ if not using_infer_string else "string" + assert frame["B"].dtype == np.object_ if not using_infer_string else "str" assert frame["A"].dtype == np.float64 def test_constructor_dict_cast2(self): @@ -1182,7 +1182,7 @@ def test_constructor_scalar_inference(self, using_infer_string): assert df["bool"].dtype == np.bool_ assert df["float"].dtype == np.float64 assert df["complex"].dtype == np.complex128 - assert df["object"].dtype == np.object_ if not using_infer_string else "string" + assert df["object"].dtype == np.object_ if not using_infer_string else "str" def test_constructor_arrays_and_scalars(self): df = DataFrame({"a": np.random.default_rng(2).standard_normal(10), "b": True}) @@ -1265,7 +1265,7 @@ def test_constructor_list_of_lists(self, using_infer_string): # GH #484 df = DataFrame(data=[[1, "a"], [2, "b"]], columns=["num", "str"]) assert is_integer_dtype(df["num"]) - assert df["str"].dtype == np.object_ if not using_infer_string else "string" + assert df["str"].dtype == np.object_ if not using_infer_string else "str" # GH 4851 # list of 0-dim ndarrays @@ -1833,7 +1833,12 @@ def test_constructor_with_datetimes(self, using_infer_string): result = df.dtypes expected = Series( [np.dtype("int64")] - + [np.dtype(objectname) if not using_infer_string else "string"] * 2 + + [ + np.dtype(objectname) + if not using_infer_string + else pd.StringDtype(na_value=np.nan) + ] + * 2 + [np.dtype("M8[s]"), np.dtype("M8[us]")], index=list("ABCDE"), ) @@ -1855,7 +1860,11 @@ def test_constructor_with_datetimes(self, using_infer_string): expected = Series( [np.dtype("float64")] + [np.dtype("int64")] - + [np.dtype("object") if not using_infer_string else "string"] + + [ + np.dtype("object") + if not using_infer_string + else pd.StringDtype(na_value=np.nan) + ] + [np.dtype("float64")] + [np.dtype(intname)], index=["a", "b", "c", floatname, intname], @@ -1877,7 +1886,11 @@ def test_constructor_with_datetimes(self, using_infer_string): expected = Series( [np.dtype("float64")] + [np.dtype("int64")] - + [np.dtype("object") if not using_infer_string else "string"] + + [ + np.dtype("object") + if not using_infer_string + else pd.StringDtype(na_value=np.nan) + ] + [np.dtype("float64")] + [np.dtype(intname)], index=["a", "b", "c", floatname, intname], @@ -2103,7 +2116,9 @@ def test_constructor_for_list_with_dtypes(self, using_infer_string): [ np.dtype("int64"), np.dtype("float64"), - np.dtype("object") if not using_infer_string else "string", + np.dtype("object") + if not using_infer_string + else pd.StringDtype(na_value=np.nan), np.dtype("datetime64[us]"), np.dtype("float64"), ], diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 92bcd6f0c7d0c..b4f02b6f81b6f 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -658,7 +658,11 @@ def test_unstack_dtypes(self, using_infer_string): df2["D"] = "foo" df3 = df2.unstack("B") result = df3.dtypes - dtype = "string" if using_infer_string else np.dtype("object") + dtype = ( + pd.StringDtype(na_value=np.nan) + if using_infer_string + else np.dtype("object") + ) expected = Series( [np.dtype("float64")] * 2 + [dtype] * 2, index=MultiIndex.from_arrays( diff --git a/pandas/tests/generic/test_to_xarray.py b/pandas/tests/generic/test_to_xarray.py index d8401a8b2ae3f..9fe9bca8abdc9 100644 --- a/pandas/tests/generic/test_to_xarray.py +++ b/pandas/tests/generic/test_to_xarray.py @@ -52,7 +52,7 @@ def test_to_xarray_index_types(self, index_flat, df, using_infer_string): # column names are lost expected = df.copy() expected["f"] = expected["f"].astype( - object if not using_infer_string else "string[pyarrow_numpy]" + object if not using_infer_string else "str" ) expected.columns.name = None tm.assert_frame_equal(result.to_dataframe(), expected) @@ -81,7 +81,7 @@ def test_to_xarray_with_multiindex(self, df, using_infer_string): result = result.to_dataframe() expected = df.copy() expected["f"] = expected["f"].astype( - object if not using_infer_string else "string[pyarrow_numpy]" + object if not using_infer_string else "str" ) expected.columns.name = None tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 644f93a37a3a3..1a4127ab49b0e 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -79,7 +77,7 @@ def test_apply_index_date(using_infer_string): tm.assert_frame_equal(result, expected) -def test_apply_index_date_object(using_infer_string): +def test_apply_index_date_object(): # GH 5789 # don't auto coerce dates ts = [ @@ -111,10 +109,7 @@ def test_apply_index_date_object(using_infer_string): 1.40750, 1.40649, ] - dtype = "string[pyarrow_numpy]" if using_infer_string else object - exp_idx = Index( - ["2011-05-16", "2011-05-17", "2011-05-18"], dtype=dtype, name="date" - ) + exp_idx = Index(["2011-05-16", "2011-05-17", "2011-05-18"], name="date") expected = Series(["00:00", "02:00", "02:00"], index=exp_idx) msg = "DataFrameGroupBy.apply operated on the grouping columns" with tm.assert_produces_warning(DeprecationWarning, match=msg): @@ -922,12 +917,11 @@ def test_func_returns_object(): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "group_column_dtlike", [datetime.today(), datetime.today().date(), datetime.today().time()], ) -def test_apply_datetime_issue(group_column_dtlike, using_infer_string): +def test_apply_datetime_issue(group_column_dtlike): # GH-28247 # groupby-apply throws an error if one of the columns in the DataFrame # is a datetime object and the column labels are different from @@ -938,8 +932,7 @@ def test_apply_datetime_issue(group_column_dtlike, using_infer_string): with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42])) - dtype = "string" if using_infer_string else "object" - expected = DataFrame(["spam"], Index(["foo"], dtype=dtype, name="a"), columns=[42]) + expected = DataFrame(["spam"], Index(["foo"], dtype="str", name="a"), columns=[42]) tm.assert_frame_equal(result, expected) @@ -1020,7 +1013,7 @@ def test_groupby_apply_datetime_result_dtypes(using_infer_string): msg = "DataFrameGroupBy.apply operated on the grouping columns" with tm.assert_produces_warning(DeprecationWarning, match=msg): result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes - dtype = "string" if using_infer_string else object + dtype = pd.StringDtype(na_value=np.nan) if using_infer_string else object expected = Series( [np.dtype("datetime64[us]"), dtype, dtype, np.int64, dtype], index=["observation", "color", "mood", "intensity", "score"], diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index c35f5d2bc26e8..1e86b5401ee09 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -134,7 +134,7 @@ def f(x): result = g.apply(f) expected = x.iloc[[0, 1]].copy() expected.index = Index([1, 2], name="person_id") - dtype = "string[pyarrow_numpy]" if using_infer_string else object + dtype = "str" if using_infer_string else object expected["person_name"] = expected["person_name"].astype(dtype) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 5ac6dc990c092..791f279bffc94 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -889,7 +889,7 @@ def test_groupby_complex_mean(): tm.assert_frame_equal(result, expected) -def test_groupby_complex_numbers(using_infer_string): +def test_groupby_complex_numbers(): # GH 17927 df = DataFrame( [ @@ -898,11 +898,10 @@ def test_groupby_complex_numbers(using_infer_string): {"a": 4, "b": 1}, ] ) - dtype = "string[pyarrow_numpy]" if using_infer_string else object expected = DataFrame( np.array([1, 1, 1], dtype=np.int64), index=Index([(1 + 1j), (1 + 2j), (1 + 0j)], name="b"), - columns=Index(["a"], dtype=dtype), + columns=Index(["a"]), ) result = df.groupby("b", sort=False).count() tm.assert_frame_equal(result, expected) @@ -1759,7 +1758,7 @@ def get_categorical_invalid_expected(): idx = Index(lev, name=keys[0]) if using_infer_string: - columns = Index([], dtype="string[pyarrow_numpy]") + columns = Index([], dtype="str") else: columns = [] expected = DataFrame([], columns=columns, index=idx) diff --git a/pandas/tests/groupby/test_numeric_only.py b/pandas/tests/groupby/test_numeric_only.py index 7e7c84fa2b390..41e00f8121b14 100644 --- a/pandas/tests/groupby/test_numeric_only.py +++ b/pandas/tests/groupby/test_numeric_only.py @@ -180,6 +180,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): "category type does not support sum operations", re.escape(f"agg function failed [how->{method},dtype->object]"), re.escape(f"agg function failed [how->{method},dtype->string]"), + re.escape(f"agg function failed [how->{method},dtype->str]"), ] ) with pytest.raises(exception, match=msg): @@ -197,6 +198,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): f"Cannot perform {method} with non-ordered Categorical", re.escape(f"agg function failed [how->{method},dtype->object]"), re.escape(f"agg function failed [how->{method},dtype->string]"), + re.escape(f"agg function failed [how->{method},dtype->str]"), ] ) with pytest.raises(exception, match=msg): diff --git a/pandas/tests/indexes/base_class/test_formats.py b/pandas/tests/indexes/base_class/test_formats.py index dc4763d96bc71..260b4203a4f04 100644 --- a/pandas/tests/indexes/base_class/test_formats.py +++ b/pandas/tests/indexes/base_class/test_formats.py @@ -9,7 +9,6 @@ class TestIndexRendering: - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_repr_is_valid_construction_code(self): # for the case of Index, where the repr is traditional rather than # stylized diff --git a/pandas/tests/indexes/multi/test_constructors.py b/pandas/tests/indexes/multi/test_constructors.py index 38e0920b7004e..b2867d4ac8e68 100644 --- a/pandas/tests/indexes/multi/test_constructors.py +++ b/pandas/tests/indexes/multi/test_constructors.py @@ -850,7 +850,7 @@ def test_dtype_representation(using_infer_string): # GH#46900 pmidx = MultiIndex.from_arrays([[1], ["a"]], names=[("a", "b"), ("c", "d")]) result = pmidx.dtypes - exp = "object" if not using_infer_string else "string" + exp = "object" if not using_infer_string else pd.StringDtype(na_value=np.nan) expected = Series( ["int64", exp], index=MultiIndex.from_tuples([("a", "b"), ("c", "d")]), diff --git a/pandas/tests/indexes/multi/test_get_set.py b/pandas/tests/indexes/multi/test_get_set.py index dd4bba42eda6f..7f292aacc39ca 100644 --- a/pandas/tests/indexes/multi/test_get_set.py +++ b/pandas/tests/indexes/multi/test_get_set.py @@ -41,7 +41,7 @@ def test_get_dtypes(using_infer_string): names=["int", "string", "dt"], ) - exp = "object" if not using_infer_string else "string" + exp = "object" if not using_infer_string else pd.StringDtype(na_value=np.nan) expected = pd.Series( { "int": np.dtype("int64"), @@ -61,7 +61,7 @@ def test_get_dtypes_no_level_name(using_infer_string): pd.date_range("20200101", periods=2, tz="UTC"), ], ) - exp = "object" if not using_infer_string else "string" + exp = "object" if not using_infer_string else pd.StringDtype(na_value=np.nan) expected = pd.Series( { "level_0": np.dtype("int64"), @@ -82,7 +82,7 @@ def test_get_dtypes_duplicate_level_names(using_infer_string): ], names=["A", "A", "A"], ).dtypes - exp = "object" if not using_infer_string else "string" + exp = "object" if not using_infer_string else pd.StringDtype(na_value=np.nan) expected = pd.Series( [np.dtype("int64"), exp, DatetimeTZDtype(tz="utc")], index=["A", "A", "A"], diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index 039836da75cd5..1eeeebd6b8ca9 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -171,6 +171,7 @@ def test_get_indexer_non_unique_np_nats(self, np_nat_fixture, np_nat_fixture2): class TestSliceLocs: + # TODO(infer_string) parametrize over multiple string dtypes @pytest.mark.parametrize( "dtype", [ @@ -209,6 +210,7 @@ def test_slice_locs_negative_step(self, in_slice, expected, dtype): expected = Index(list(expected), dtype=dtype) tm.assert_index_equal(result, expected) + # TODO(infer_string) parametrize over multiple string dtypes @td.skip_if_no("pyarrow") def test_slice_locs_negative_step_oob(self): index = Index(list("bcdxy"), dtype="string[pyarrow_numpy]") diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 8d2245d0d9978..0911f2aec74d6 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -79,7 +79,7 @@ def test_constructor_copy(self, using_infer_string): assert new_index.name == "name" if using_infer_string: tm.assert_extension_array_equal( - new_index.values, pd.array(arr, dtype="string[pyarrow_numpy]") + new_index.values, pd.array(arr, dtype="str") ) else: tm.assert_numpy_array_equal(arr, new_index.values) @@ -157,7 +157,7 @@ def test_constructor_from_frame_series_freq(self, using_infer_string): df = DataFrame(np.random.default_rng(2).random((5, 3))) df["date"] = dts result = DatetimeIndex(df["date"], freq="MS") - dtype = object if not using_infer_string else "string" + dtype = object if not using_infer_string else "str" assert df["date"].dtype == dtype expected.name = "date" tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexing/multiindex/test_loc.py b/pandas/tests/indexing/multiindex/test_loc.py index d482d20591a8a..ec9767aa4bab4 100644 --- a/pandas/tests/indexing/multiindex/test_loc.py +++ b/pandas/tests/indexing/multiindex/test_loc.py @@ -601,7 +601,7 @@ def test_loc_nan_multiindex(using_infer_string): np.ones((1, 4)), index=Index( [np.nan], - dtype="object" if not using_infer_string else "string[pyarrow_numpy]", + dtype="object" if not using_infer_string else "str", name="u3", ), columns=Index(["d1", "d2", "d3", "d4"]), diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 6b072bc27ed81..ef8c0e432ca49 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -259,7 +259,7 @@ def test_dups_fancy_indexing_only_missing_label(self, using_infer_string): with pytest.raises( KeyError, match=re.escape( - "\"None of [Index(['E'], dtype='string')] are in the [index]\"" + "\"None of [Index(['E'], dtype='str')] are in the [index]\"" ), ): dfnu.loc[["E"]] diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 1b2dc0819006c..247501f1504e7 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -62,12 +62,17 @@ def test_not_change_nan_loc(series, new_series, expected_ser): class TestLoc: - def test_none_values_on_string_columns(self): + def test_none_values_on_string_columns(self, using_infer_string): # Issue #32218 - df = DataFrame(["1", "2", None], columns=["a"], dtype="str") - + df = DataFrame(["1", "2", None], columns=["a"], dtype=object) assert df.loc[2, "a"] is None + df = DataFrame(["1", "2", None], columns=["a"], dtype="str") + if using_infer_string: + assert np.isnan(df.loc[2, "a"]) + else: + assert df.loc[2, "a"] is None + def test_loc_getitem_int(self, frame_or_series): # int label obj = frame_or_series(range(3), index=Index(list("abc"), dtype=object)) @@ -1394,7 +1399,7 @@ def test_loc_setitem_single_row_categorical(self, using_infer_string): result = df["Alpha"] expected = Series(categories, index=df.index, name="Alpha").astype( - object if not using_infer_string else "string[pyarrow_numpy]" + object if not using_infer_string else "str" ) tm.assert_series_equal(result, expected) @@ -1563,7 +1568,7 @@ def test_loc_setitem_single_column_mixed(self, using_infer_string): df.loc[df.index[::2], "str"] = np.nan expected = Series( [np.nan, "qux", np.nan, "qux", np.nan], - dtype=object if not using_infer_string else "string[pyarrow_numpy]", + dtype=object if not using_infer_string else "str", ).values tm.assert_almost_equal(df["str"].values, expected) diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index 4d232d5ed1312..3dbdedbb94618 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -227,7 +227,7 @@ def test_partial_set_empty_frame_empty_consistencies(self, using_infer_string): { "x": Series( ["1", "2"], - dtype=object if not using_infer_string else "string[pyarrow_numpy]", + dtype=object if not using_infer_string else "str", ), "y": Series([np.nan, np.nan], dtype=object), } diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 579d3fbfb3435..c1f71c6de92dd 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -621,7 +621,7 @@ def _compare(old_mgr, new_mgr): mgr.iset(1, np.array(["2."] * N, dtype=np.object_)) mgr.iset(2, np.array(["foo."] * N, dtype=np.object_)) new_mgr = mgr.convert() - dtype = "string[pyarrow_numpy]" if using_infer_string else np.object_ + dtype = "str" if using_infer_string else np.object_ assert new_mgr.iget(0).dtype == dtype assert new_mgr.iget(1).dtype == dtype assert new_mgr.iget(2).dtype == dtype diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 0d753cb871c64..e1cdfb8bfa7e3 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -775,7 +775,7 @@ def test_to_excel_interval_no_labels(self, tmp_excel, using_infer_string): df["new"] = pd.cut(df[0], 10) expected["new"] = pd.cut(expected[0], 10).astype( - str if not using_infer_string else "string[pyarrow_numpy]" + str if not using_infer_string else "str" ) df.to_excel(tmp_excel, sheet_name="test1") diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index bddd71d2bd5f6..7f367ded39863 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -76,7 +76,7 @@ def test_build_table_schema(self, df_schema, using_infer_string): "primaryKey": ["idx"], } if using_infer_string: - expected["fields"][2] = {"name": "B", "type": "any", "extDtype": "string"} + expected["fields"][2] = {"name": "B", "type": "any", "extDtype": "str"} assert result == expected result = build_table_schema(df_schema) assert "pandas_version" in result @@ -129,7 +129,7 @@ def test_multiindex(self, df_schema, using_infer_string): "type": "any", "extDtype": "string", } - expected["fields"][3] = {"name": "B", "type": "any", "extDtype": "string"} + expected["fields"][3] = {"name": "B", "type": "any", "extDtype": "str"} assert result == expected df.index.names = ["idx0", None] @@ -309,7 +309,7 @@ def test_to_json(self, df_table, using_infer_string): ] if using_infer_string: - fields[2] = {"name": "B", "type": "any", "extDtype": "string"} + fields[2] = {"name": "B", "type": "any", "extDtype": "str"} schema = {"fields": fields, "primaryKey": ["idx"]} data = [ diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index d281729e9704c..1bc227369a968 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -255,7 +255,7 @@ def test_roundtrip_categorical( expected = categorical_frame.copy() expected.index = expected.index.astype( - str if not using_infer_string else "string[pyarrow_numpy]" + str if not using_infer_string else "str" ) # Categorical not preserved expected.index.name = None # index names aren't preserved in JSON assert_json_roundtrip_equal(result, expected, orient) @@ -610,7 +610,7 @@ def test_blocks_compat_GH9037(self, using_infer_string): # JSON deserialisation always creates unicode strings df_mixed.columns = df_mixed.columns.astype( - np.str_ if not using_infer_string else "string[pyarrow_numpy]" + np.str_ if not using_infer_string else "str" ) data = StringIO(df_mixed.to_json(orient="split")) df_roundtrip = read_json(data, orient="split") @@ -695,7 +695,7 @@ def test_series_roundtrip_simple(self, orient, string_series, using_infer_string expected = string_series if using_infer_string and orient in ("split", "index", "columns"): # These schemas don't contain dtypes, so we infer string - expected.index = expected.index.astype("string[pyarrow_numpy]") + expected.index = expected.index.astype("str") if orient in ("values", "records"): expected = expected.reset_index(drop=True) if orient != "split": @@ -1573,7 +1573,6 @@ def test_from_json_to_json_table_index_and_columns(self, index, columns): result = read_json(StringIO(dfjson), orient="table") tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_from_json_to_json_table_dtypes(self): # GH21345 expected = DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["5", "6"]}) diff --git a/pandas/tests/io/pytables/test_keys.py b/pandas/tests/io/pytables/test_keys.py index 55bd3f0d5a03a..7d0802dcf2e47 100644 --- a/pandas/tests/io/pytables/test_keys.py +++ b/pandas/tests/io/pytables/test_keys.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( DataFrame, HDFStore, @@ -13,7 +15,10 @@ tables, ) -pytestmark = pytest.mark.single_cpu +pytestmark = [ + pytest.mark.single_cpu, + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] def test_keys(setup_path): diff --git a/pandas/tests/io/pytables/test_subclass.py b/pandas/tests/io/pytables/test_subclass.py index 03622faa2b5a8..bbe1cd77e0d9f 100644 --- a/pandas/tests/io/pytables/test_subclass.py +++ b/pandas/tests/io/pytables/test_subclass.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( DataFrame, Series, @@ -17,6 +19,7 @@ class TestHDFStoreSubclass: # GH 33748 + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_supported_for_subclass_dataframe(self, tmp_path): data = {"a": [1, 2], "b": [3, 4]} sdf = tm.SubclassedDataFrame(data, dtype=np.intp) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index c583f9b2c4f99..10e3af601b7ef 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -364,6 +364,7 @@ def test_write_fspath_all(self, writer_name, writer_kwargs, module): expected = f_path.read() assert result == expected + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_write_fspath_hdf5(self): # Same test as write_fspath_all, except HDF5 files aren't # necessarily byte-for-byte identical for a given dataframe, so we'll diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 7ffee9ea78ddc..45e0cab2165a7 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -176,6 +176,7 @@ def test_excel_options(fsspectest): assert fsspectest.test[0] == "read" +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_parquet_new_file(cleared_fs, df1): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") @@ -283,7 +284,6 @@ def test_not_present_exception(): read_csv("memory://test/test.csv") -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_feather_options(fsspectest): pytest.importorskip("pyarrow") df = DataFrame({"a": [0]}) diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index e113fa25b2a3f..bf56a5781f7cd 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -208,6 +208,7 @@ def test_to_csv_compression_encoding_gcs( tm.assert_frame_equal(df, read_df) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_parquet_gcs_new_file(monkeypatch, tmpdir): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") diff --git a/pandas/tests/io/xml/test_xml_dtypes.py b/pandas/tests/io/xml/test_xml_dtypes.py index 409aafee58e49..96ef50f9d7149 100644 --- a/pandas/tests/io/xml/test_xml_dtypes.py +++ b/pandas/tests/io/xml/test_xml_dtypes.py @@ -4,8 +4,6 @@ import pytest -from pandas._config import using_string_dtype - from pandas.errors import ParserWarning import pandas.util._test_decorators as td @@ -85,7 +83,6 @@ def read_xml_iterparse(data, **kwargs): # DTYPE -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_dtype_single_str(parser): df_result = read_xml(StringIO(xml_types), dtype={"degrees": "str"}, parser=parser) df_iter = read_xml_iterparse( @@ -211,7 +208,6 @@ def test_wrong_dtype(xml_books, parser, iterparse): ) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_both_dtype_converters(parser): df_expected = DataFrame( { diff --git a/pandas/tests/reshape/concat/test_categorical.py b/pandas/tests/reshape/concat/test_categorical.py index bbaaf0abecfbd..8e6a14e6bfb8f 100644 --- a/pandas/tests/reshape/concat/test_categorical.py +++ b/pandas/tests/reshape/concat/test_categorical.py @@ -59,9 +59,7 @@ def test_categorical_concat_dtypes(self, using_infer_string): num = Series([1, 2, 3]) df = pd.concat([Series(cat), obj, num], axis=1, keys=index) - result = df.dtypes == ( - object if not using_infer_string else "string[pyarrow_numpy]" - ) + result = df.dtypes == (object if not using_infer_string else "str") expected = Series([False, True, False], index=index) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/reshape/concat/test_empty.py b/pandas/tests/reshape/concat/test_empty.py index 06d57c48df817..4869cfbf3a556 100644 --- a/pandas/tests/reshape/concat/test_empty.py +++ b/pandas/tests/reshape/concat/test_empty.py @@ -27,7 +27,7 @@ def test_handle_empty_objects(self, sort, using_infer_string): expected = df.reindex(columns=["a", "b", "c", "d", "foo"]) expected["foo"] = expected["foo"].astype( - object if not using_infer_string else "string[pyarrow_numpy]" + object if not using_infer_string else "str" ) expected.loc[0:4, "foo"] = "bar" @@ -282,7 +282,7 @@ def test_concat_empty_dataframe_different_dtypes(self, using_infer_string): result = concat([df1[:0], df2[:0]]) assert result["a"].dtype == np.int64 - assert result["b"].dtype == np.object_ if not using_infer_string else "string" + assert result["b"].dtype == np.object_ if not using_infer_string else "str" def test_concat_to_empty_ea(self): """48510 `concat` to an empty EA should maintain type EA dtype.""" diff --git a/pandas/tests/reshape/concat/test_index.py b/pandas/tests/reshape/concat/test_index.py index e13b042192fc6..17f1a9d4ecbf1 100644 --- a/pandas/tests/reshape/concat/test_index.py +++ b/pandas/tests/reshape/concat/test_index.py @@ -449,9 +449,7 @@ def test_concat_axis_1_sort_false_rangeindex(self, using_infer_string): s1 = Series(["a", "b", "c"]) s2 = Series(["a", "b"]) s3 = Series(["a", "b", "c", "d"]) - s4 = Series( - [], dtype=object if not using_infer_string else "string[pyarrow_numpy]" - ) + s4 = Series([], dtype=object if not using_infer_string else "str") result = concat( [s1, s2, s3, s4], sort=False, join="outer", ignore_index=False, axis=1 ) @@ -462,7 +460,7 @@ def test_concat_axis_1_sort_false_rangeindex(self, using_infer_string): ["c", np.nan] * 2, [np.nan] * 2 + ["d"] + [np.nan], ], - dtype=object if not using_infer_string else "string[pyarrow_numpy]", + dtype=object if not using_infer_string else "str", ) tm.assert_frame_equal( result, expected, check_index_type=True, check_column_type=True diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 883a50a0c1399..f090ded06119a 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -155,7 +155,7 @@ def test_join_on(self, target_source, infer_string): # overlap msg = ( - "You are trying to merge on float64 and object|string columns for key " + "You are trying to merge on float64 and object|str columns for key " "'A'. If you wish to proceed you should use pd.concat" ) with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 4a6228e47eba0..ad704d87a491b 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -812,7 +812,7 @@ def test_overlapping_columns_error_message(self): # #2649, #10639 df2.columns = ["key1", "foo", "foo"] - msg = r"Data columns not unique: Index\(\['foo'\], dtype='object|string'\)" + msg = r"Data columns not unique: Index\(\['foo'\], dtype='object|str'\)" with pytest.raises(MergeError, match=msg): merge(df, df2) @@ -1859,7 +1859,7 @@ def test_identical(self, left, using_infer_string): # merging on the same, should preserve dtypes merged = merge(left, left, on="X") result = merged.dtypes.sort_index() - dtype = np.dtype("O") if not using_infer_string else "string" + dtype = np.dtype("O") if not using_infer_string else "str" expected = Series( [CategoricalDtype(categories=["foo", "bar"]), dtype, dtype], index=["X", "Y_x", "Y_y"], @@ -1871,7 +1871,7 @@ def test_basic(self, left, right, using_infer_string): # so should preserve the merged column merged = merge(left, right, on="X") result = merged.dtypes.sort_index() - dtype = np.dtype("O") if not using_infer_string else "string" + dtype = np.dtype("O") if not using_infer_string else "str" expected = Series( [ CategoricalDtype(categories=["foo", "bar"]), @@ -1985,7 +1985,7 @@ def test_other_columns(self, left, right, using_infer_string): merged = merge(left, right, on="X") result = merged.dtypes.sort_index() - dtype = np.dtype("O") if not using_infer_string else "string" + dtype = np.dtype("O") if not using_infer_string else "str" expected = Series( [ CategoricalDtype(categories=["foo", "bar"]), @@ -2022,7 +2022,7 @@ def test_dtype_on_merged_different( merged = merge(left, right, on="X", how=join_type) result = merged.dtypes.sort_index() - dtype = np.dtype("O") if not using_infer_string else "string" + dtype = np.dtype("O") if not using_infer_string else "str" expected = Series([dtype, dtype, np.dtype("int64")], index=["X", "Y", "Z"]) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 62fd8c5a7e231..8d972087b0dff 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -3164,7 +3164,7 @@ def test_by_nullable(self, any_numeric_ea_dtype, using_infer_string): ) expected["value_y"] = np.array([np.nan, np.nan, np.nan], dtype=object) if using_infer_string: - expected["value_y"] = expected["value_y"].astype("string[pyarrow_numpy]") + expected["value_y"] = expected["value_y"].astype("str") tm.assert_frame_equal(result, expected) def test_merge_by_col_tz_aware(self): @@ -3215,7 +3215,7 @@ def test_by_mixed_tz_aware(self, using_infer_string): ) expected["value_y"] = np.array([np.nan], dtype=object) if using_infer_string: - expected["value_y"] = expected["value_y"].astype("string[pyarrow_numpy]") + expected["value_y"] = expected["value_y"].astype("str") tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype", ["float64", "int16", "m8[ns]", "M8[us]"]) diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index bfb6a3c0167c8..da1930323f464 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -337,7 +337,7 @@ def test_no_prefix_string_cats_default_category( result = from_dummies(dummies, default_category=default_category) expected = DataFrame(expected) if using_infer_string: - expected[""] = expected[""].astype("string[pyarrow_numpy]") + expected[""] = expected[""].astype("str") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py index 304ba65f38d3c..27a34decae7b0 100644 --- a/pandas/tests/reshape/test_get_dummies.py +++ b/pandas/tests/reshape/test_get_dummies.py @@ -122,7 +122,7 @@ def test_get_dummies_basic_types(self, sparse, dtype, using_infer_string): result = get_dummies(s_df, columns=["a"], sparse=sparse, dtype=dtype) - key = "string" if using_infer_string else "object" + key = "str" if using_infer_string else "object" expected_counts = {"int64": 1, key: 1} expected_counts[dtype_name] = 3 + expected_counts.get(dtype_name, 0) diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index 03e823ce607fb..3e1ece6b7f59e 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -585,7 +585,7 @@ def test_strftime_period_days(self, using_infer_string): dtype="=U10", ) if using_infer_string: - expected = expected.astype("string[pyarrow_numpy]") + expected = expected.astype("str") tm.assert_index_equal(result, expected) @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") diff --git a/pandas/tests/series/indexing/test_delitem.py b/pandas/tests/series/indexing/test_delitem.py index 3d1082c3d040b..7440ef2692c47 100644 --- a/pandas/tests/series/indexing/test_delitem.py +++ b/pandas/tests/series/indexing/test_delitem.py @@ -31,16 +31,15 @@ def test_delitem(self): del s[0] tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="int64"))) - def test_delitem_object_index(self, using_infer_string): + def test_delitem_object_index(self): # Index(dtype=object) - dtype = "string[pyarrow_numpy]" if using_infer_string else object - s = Series(1, index=Index(["a"], dtype=dtype)) + s = Series(1, index=Index(["a"], dtype="str")) del s["a"] - tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype=dtype))) + tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="str"))) s["a"] = 1 - tm.assert_series_equal(s, Series(1, index=Index(["a"], dtype=dtype))) + tm.assert_series_equal(s, Series(1, index=Index(["a"], dtype="str"))) del s["a"] - tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype=dtype))) + tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="str"))) def test_delitem_missing_key(self): # empty diff --git a/pandas/tests/series/indexing/test_getitem.py b/pandas/tests/series/indexing/test_getitem.py index ede39ba61dfeb..8ba5e8452711d 100644 --- a/pandas/tests/series/indexing/test_getitem.py +++ b/pandas/tests/series/indexing/test_getitem.py @@ -360,9 +360,7 @@ def test_getitem_no_matches(self, box): key = Series(["C"], dtype=object) key = box(key) - msg = ( - r"None of \[Index\(\['C'\], dtype='object|string'\)\] are in the \[index\]" - ) + msg = r"None of \[Index\(\['C'\], dtype='object|str'\)\] are in the \[index\]" with pytest.raises(KeyError, match=msg): ser[key] diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 3fcf664c3f01b..742091d761d62 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -594,7 +594,7 @@ def test_setitem_enlargement_object_none(self, nulls_fixture, using_infer_string ser = Series(["a", "b"]) ser[3] = nulls_fixture dtype = ( - "string[pyarrow_numpy]" + "str" if using_infer_string and not isinstance(nulls_fixture, Decimal) else object ) diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index d2d92d7273d3d..579d41f964df0 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -532,12 +532,12 @@ def test_astype_categorical_to_other(self): expected = ser tm.assert_series_equal(ser.astype("category"), expected) tm.assert_series_equal(ser.astype(CategoricalDtype()), expected) - msg = r"Cannot cast object|string dtype to float64" + msg = r"Cannot cast object|str dtype to float64" with pytest.raises(ValueError, match=msg): ser.astype("float64") cat = Series(Categorical(["a", "b", "b", "a", "a", "c", "c", "c"])) - exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"], dtype=object) + exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"], dtype="str") tm.assert_series_equal(cat.astype("str"), exp) s2 = Series(Categorical(["1", "2", "3", "4"])) exp2 = Series([1, 2, 3, 4]).astype("int") diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index f4f72854e50d3..fe84ffafa70b4 100644 --- a/pandas/tests/series/methods/test_map.py +++ b/pandas/tests/series/methods/test_map.py @@ -101,16 +101,16 @@ def test_map_series_stringdtype(any_string_dtype, using_infer_string): expected = Series(data=["rabbit", "dog", "cat", item], dtype=any_string_dtype) if using_infer_string and any_string_dtype == "object": - expected = expected.astype("string[pyarrow_numpy]") + expected = expected.astype("str") tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "data, expected_dtype", - [(["1-1", "1-1", np.nan], "category"), (["1-1", "1-2", np.nan], object)], + [(["1-1", "1-1", np.nan], "category"), (["1-1", "1-2", np.nan], "str")], ) -def test_map_categorical_with_nan_values(data, expected_dtype, using_infer_string): +def test_map_categorical_with_nan_values(data, expected_dtype): # GH 20714 bug fixed in: GH 24275 def func(val): return val.split("-")[0] @@ -118,8 +118,6 @@ def func(val): s = Series(data, dtype="category") result = s.map(func, na_action="ignore") - if using_infer_string and expected_dtype == object: - expected_dtype = "string[pyarrow_numpy]" expected = Series(["1", "1", np.nan], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -145,9 +143,7 @@ def test_map_simple_str_callables_same_as_astype( # test that we are evaluating row-by-row first # before vectorized evaluation result = string_series.map(func) - expected = string_series.astype( - str if not using_infer_string else "string[pyarrow_numpy]" - ) + expected = string_series.astype(str if not using_infer_string else "str") tm.assert_series_equal(result, expected) @@ -493,7 +489,7 @@ def test_map_categorical(na_action, using_infer_string): result = s.map(lambda x: "A", na_action=na_action) exp = Series(["A"] * 7, name="XX", index=list("abcdefg")) tm.assert_series_equal(result, exp) - assert result.dtype == object if not using_infer_string else "string" + assert result.dtype == object if not using_infer_string else "str" @pytest.mark.parametrize( diff --git a/pandas/tests/series/methods/test_rename.py b/pandas/tests/series/methods/test_rename.py index 1da98b3a273be..f5a97d61990a4 100644 --- a/pandas/tests/series/methods/test_rename.py +++ b/pandas/tests/series/methods/test_rename.py @@ -64,7 +64,7 @@ def test_rename_set_name_inplace(self, using_infer_string): assert ser.name == name exp = np.array(["a", "b", "c"], dtype=np.object_) if using_infer_string: - exp = array(exp, dtype="string[pyarrow_numpy]") + exp = array(exp, dtype="str") tm.assert_extension_array_equal(ser.index.values, exp) else: tm.assert_numpy_array_equal(ser.index.values, exp) diff --git a/pandas/tests/series/methods/test_reset_index.py b/pandas/tests/series/methods/test_reset_index.py index 32b6cbd5acce3..d42aafc001680 100644 --- a/pandas/tests/series/methods/test_reset_index.py +++ b/pandas/tests/series/methods/test_reset_index.py @@ -193,7 +193,7 @@ def test_reset_index_dtypes_on_empty_series_with_multiindex( # GH 19602 - Preserve dtype on empty Series with MultiIndex idx = MultiIndex.from_product([[0, 1], [0.5, 1.0], array]) result = Series(dtype=object, index=idx)[:0].reset_index().dtypes - exp = "string" if using_infer_string else object + exp = "str" if using_infer_string else object expected = Series( { "level_0": np.int64, diff --git a/pandas/tests/series/methods/test_to_csv.py b/pandas/tests/series/methods/test_to_csv.py index 0bcad49847291..6eb7c74d2eca0 100644 --- a/pandas/tests/series/methods/test_to_csv.py +++ b/pandas/tests/series/methods/test_to_csv.py @@ -176,8 +176,5 @@ def test_to_csv_interval_index(self, using_infer_string, temp_file): # can't roundtrip intervalindex via read_csv so check string repr (GH 23595) expected = s - if using_infer_string: - expected.index = expected.index.astype("string[pyarrow_numpy]") - else: - expected.index = expected.index.astype(str) + expected.index = expected.index.astype("str") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index ff6ece4de9ec4..57b14d4b82a63 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -166,7 +166,7 @@ def test_constructor(self, datetime_series, using_infer_string): # Mixed type Series mixed = Series(["hello", np.nan], index=[0, 1]) - assert mixed.dtype == np.object_ if not using_infer_string else "string" + assert mixed.dtype == np.object_ if not using_infer_string else "str" assert np.isnan(mixed[1]) assert not empty_series.index._is_all_dates @@ -1454,7 +1454,7 @@ def test_fromDict(self, using_infer_string): data = {"a": 0, "b": "1", "c": "2", "d": "3"} series = Series(data) - assert series.dtype == np.object_ if not using_infer_string else "string" + assert series.dtype == np.object_ if not using_infer_string else "str" data = {"a": "0", "b": "1"} series = Series(data, dtype=float) @@ -1466,7 +1466,7 @@ def test_fromValue(self, datetime_series, using_infer_string): assert len(nans) == len(datetime_series) strings = Series("foo", index=datetime_series.index) - assert strings.dtype == np.object_ if not using_infer_string else "string" + assert strings.dtype == np.object_ if not using_infer_string else "str" assert len(strings) == len(datetime_series) d = datetime.now() @@ -2121,6 +2121,11 @@ def test_series_string_inference_storage_definition(self): result = Series(["a", "b"], dtype="string") tm.assert_series_equal(result, expected) + expected = Series(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)) + with pd.option_context("future.infer_string", True): + result = Series(["a", "b"], dtype="str") + tm.assert_series_equal(result, expected) + def test_series_constructor_infer_string_scalar(self): # GH#55537 with pd.option_context("future.infer_string", True): diff --git a/pandas/tests/series/test_formats.py b/pandas/tests/series/test_formats.py index 1d95fbf8dccb8..ab083d5c58b35 100644 --- a/pandas/tests/series/test_formats.py +++ b/pandas/tests/series/test_formats.py @@ -314,7 +314,7 @@ def test_categorical_repr(self, using_infer_string): "0 a\n1 b\n" " ..\n" "48 a\n49 b\n" - "Length: 50, dtype: category\nCategories (2, string): [a, b]" + "Length: 50, dtype: category\nCategories (2, str): [a, b]" ) else: exp = ( @@ -332,7 +332,7 @@ def test_categorical_repr(self, using_infer_string): exp = ( "0 a\n1 b\n" "dtype: category\n" - "Categories (26, string): [a < b < c < d ... w < x < y < z]" + "Categories (26, str): [a < b < c < d ... w < x < y < z]" ) else: exp = ( diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index 1e6538ca5a8fb..ee26fdae74960 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -10,8 +10,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import IntCastingNaNError import pandas as pd @@ -166,7 +164,6 @@ def test_pandas_datareader(): pytest.importorskip("pandas_datareader") -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_pyarrow(df): pyarrow = pytest.importorskip("pyarrow") diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py index a3f4b091713f9..28d96ea25cba7 100644 --- a/pandas/tests/util/test_assert_frame_equal.py +++ b/pandas/tests/util/test_assert_frame_equal.py @@ -112,7 +112,7 @@ def test_empty_dtypes(check_dtype): @pytest.mark.parametrize("check_like", [True, False]) def test_frame_equal_index_mismatch(check_like, frame_or_series, using_infer_string): if using_infer_string: - dtype = "string" + dtype = "str" else: dtype = "object" msg = f"""{frame_or_series.__name__}\\.index are different @@ -134,7 +134,7 @@ def test_frame_equal_index_mismatch(check_like, frame_or_series, using_infer_str @pytest.mark.parametrize("check_like", [True, False]) def test_frame_equal_columns_mismatch(check_like, frame_or_series, using_infer_string): if using_infer_string: - dtype = "string" + dtype = "str" else: dtype = "object" msg = f"""{frame_or_series.__name__}\\.columns are different diff --git a/pandas/tests/util/test_assert_index_equal.py b/pandas/tests/util/test_assert_index_equal.py index dc6efdcec380e..ab52d6c8e9f39 100644 --- a/pandas/tests/util/test_assert_index_equal.py +++ b/pandas/tests/util/test_assert_index_equal.py @@ -207,7 +207,7 @@ def test_index_equal_names(name1, name2): def test_index_equal_category_mismatch(check_categorical, using_infer_string): if using_infer_string: - dtype = "string" + dtype = "str" else: dtype = "object" msg = f"""Index are different diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index f75f48157aad2..a3b24c029fbac 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -220,9 +220,9 @@ def test_series_equal_categorical_values_mismatch(rtol, using_infer_string): Series values are different \\(66\\.66667 %\\) \\[index\\]: \\[0, 1, 2\\] \\[left\\]: \\['a', 'b', 'c'\\] -Categories \\(3, string\\): \\[a, b, c\\] +Categories \\(3, str\\): \\[a, b, c\\] \\[right\\]: \\['a', 'c', 'b'\\] -Categories \\(3, string\\): \\[a, b, c\\]""" +Categories \\(3, str\\): \\[a, b, c\\]""" else: msg = """Series are different @@ -257,7 +257,7 @@ def test_series_equal_datetime_values_mismatch(rtol): def test_series_equal_categorical_mismatch(check_categorical, using_infer_string): if using_infer_string: - dtype = "string" + dtype = "str" else: dtype = "object" msg = f"""Attributes of Series are different diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index 5ba08ac13fcee..15eaa8c167487 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -71,7 +71,7 @@ def test_sum_object_str_raises(step): df = DataFrame({"A": range(5), "B": range(5, 10), "C": "foo"}) r = df.rolling(window=3, step=step) with pytest.raises( - DataError, match="Cannot aggregate non-numeric type: object|string" + DataError, match="Cannot aggregate non-numeric type: object|str" ): # GH#42738, enforced in 2.0 r.sum()