From 50d31248a798d6067a249e77bfecc745b41a44d7 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 20 Sep 2024 13:54:03 -0400 Subject: [PATCH] String dtype: use 'str' string alias and representation for NaN-variant of the dtype (#59388) --- pandas/_testing/__init__.py | 6 +++- pandas/core/arrays/arrow/array.py | 5 ++- pandas/core/arrays/string_.py | 24 +++++++++---- pandas/core/frame.py | 4 ++- pandas/core/interchange/utils.py | 7 +++- pandas/tests/apply/test_numba.py | 2 +- pandas/tests/apply/test_series_apply.py | 2 +- pandas/tests/arrays/boolean/test_astype.py | 12 +++++-- .../tests/arrays/categorical/test_astype.py | 2 +- pandas/tests/arrays/categorical/test_repr.py | 2 +- pandas/tests/arrays/floating/test_astype.py | 17 ++++++--- pandas/tests/arrays/integer/test_dtypes.py | 17 ++++++--- .../arrays/interval/test_interval_pyarrow.py | 3 -- .../tests/arrays/period/test_arrow_compat.py | 4 --- pandas/tests/arrays/string_/test_string.py | 35 +++++++++++-------- .../tests/arrays/string_/test_string_arrow.py | 6 ++-- pandas/tests/arrays/test_datetimelike.py | 7 ++-- pandas/tests/dtypes/test_common.py | 19 ++++++++++ pandas/tests/dtypes/test_dtypes.py | 2 +- pandas/tests/extension/test_string.py | 14 ++++++++ pandas/tests/frame/indexing/test_indexing.py | 8 ++--- pandas/tests/frame/indexing/test_set_value.py | 2 +- pandas/tests/frame/methods/test_astype.py | 6 ++-- .../frame/methods/test_get_numeric_data.py | 4 ++- pandas/tests/frame/methods/test_nlargest.py | 2 +- .../tests/frame/methods/test_reset_index.py | 4 +-- .../tests/frame/methods/test_select_dtypes.py | 19 ++++++++-- pandas/tests/frame/methods/test_to_csv.py | 5 +-- pandas/tests/frame/test_block_internals.py | 4 ++- pandas/tests/frame/test_constructors.py | 31 +++++++++++----- pandas/tests/frame/test_stack_unstack.py | 6 +++- pandas/tests/generic/test_to_xarray.py | 4 +-- pandas/tests/groupby/test_apply.py | 17 +++------ pandas/tests/groupby/test_categorical.py | 2 +- pandas/tests/groupby/test_groupby.py | 7 ++-- pandas/tests/groupby/test_numeric_only.py | 2 ++ .../tests/indexes/base_class/test_formats.py | 1 - .../tests/indexes/multi/test_constructors.py | 2 +- pandas/tests/indexes/multi/test_get_set.py | 6 ++-- pandas/tests/indexes/object/test_indexing.py | 2 ++ pandas/tests/indexes/test_base.py | 4 +-- pandas/tests/indexing/multiindex/test_loc.py | 2 +- pandas/tests/indexing/test_indexing.py | 2 +- pandas/tests/indexing/test_loc.py | 15 +++++--- pandas/tests/indexing/test_partial.py | 2 +- pandas/tests/internals/test_internals.py | 2 +- pandas/tests/io/excel/test_writers.py | 2 +- .../tests/io/json/test_json_table_schema.py | 6 ++-- pandas/tests/io/json/test_pandas.py | 7 ++-- pandas/tests/io/pytables/test_keys.py | 7 +++- pandas/tests/io/pytables/test_subclass.py | 3 ++ pandas/tests/io/test_common.py | 1 + pandas/tests/io/test_fsspec.py | 2 +- pandas/tests/io/test_gcs.py | 1 + pandas/tests/io/xml/test_xml_dtypes.py | 4 --- .../tests/reshape/concat/test_categorical.py | 4 +-- pandas/tests/reshape/concat/test_empty.py | 4 +-- pandas/tests/reshape/concat/test_index.py | 6 ++-- pandas/tests/reshape/merge/test_join.py | 2 +- pandas/tests/reshape/merge/test_merge.py | 10 +++--- pandas/tests/reshape/merge/test_merge_asof.py | 4 +-- pandas/tests/reshape/test_from_dummies.py | 2 +- pandas/tests/reshape/test_get_dummies.py | 2 +- .../series/accessors/test_dt_accessor.py | 2 +- pandas/tests/series/indexing/test_delitem.py | 11 +++--- pandas/tests/series/indexing/test_getitem.py | 4 +-- pandas/tests/series/indexing/test_setitem.py | 2 +- pandas/tests/series/methods/test_astype.py | 4 +-- pandas/tests/series/methods/test_map.py | 14 +++----- pandas/tests/series/methods/test_rename.py | 2 +- .../tests/series/methods/test_reset_index.py | 2 +- pandas/tests/series/methods/test_to_csv.py | 7 ++-- pandas/tests/series/test_constructors.py | 11 ++++-- pandas/tests/series/test_formats.py | 4 +-- pandas/tests/test_downstream.py | 3 -- pandas/tests/util/test_assert_frame_equal.py | 4 +-- pandas/tests/util/test_assert_index_equal.py | 2 +- pandas/tests/util/test_assert_series_equal.py | 6 ++-- pandas/tests/window/test_api.py | 2 +- 79 files changed, 306 insertions(+), 192 deletions(-) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 994b351acf42c5..10c1c490551fb5 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -14,6 +14,7 @@ import numpy as np +from pandas._config import using_string_dtype from pandas._config.localization import ( can_set_locale, get_locales, @@ -110,7 +111,10 @@ ALL_FLOAT_DTYPES: list[Dtype] = [*FLOAT_NUMPY_DTYPES, *FLOAT_EA_DTYPES] COMPLEX_DTYPES: list[Dtype] = [complex, "complex64", "complex128"] -STRING_DTYPES: list[Dtype] = [str, "str", "U"] +if using_string_dtype(): + STRING_DTYPES: list[Dtype] = [str, "U"] +else: + STRING_DTYPES: list[Dtype] = [str, "str", "U"] # type: ignore[no-redef] COMPLEX_FLOAT_DTYPES: list[Dtype] = [*COMPLEX_DTYPES, *FLOAT_NUMPY_DTYPES] DATETIME64_DTYPES: list[Dtype] = ["datetime64[ns]", "M8[ns]"] diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index a156042ac0c0e6..6c44b7759f0e29 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -570,7 +570,10 @@ def __getitem__(self, item: PositionalIndexer): if isinstance(item, np.ndarray): if not len(item): # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string] - if self._dtype.name == "string" and self._dtype.storage == "pyarrow": + if ( + isinstance(self._dtype, StringDtype) + and self._dtype.storage == "pyarrow" + ): # TODO(infer_string) should this be large_string? pa_dtype = pa.string() else: diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 3cbacec9d411da..0929791ded58c3 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -4,7 +4,6 @@ from typing import ( TYPE_CHECKING, Any, - ClassVar, Literal, cast, ) @@ -114,9 +113,12 @@ class StringDtype(StorageExtensionDtype): string[pyarrow] """ - # error: Cannot override instance variable (previously declared on - # base class "StorageExtensionDtype") with class variable - name: ClassVar[str] = "string" # type: ignore[misc] + @property + def name(self) -> str: # type: ignore[override] + if self._na_value is libmissing.NA: + return "string" + else: + return "str" #: StringDtype().na_value uses pandas.NA except the implementation that # follows NumPy semantics, which uses nan. @@ -133,7 +135,7 @@ def __init__( ) -> None: # infer defaults if storage is None: - if using_string_dtype() and na_value is not libmissing.NA: + if na_value is not libmissing.NA: if HAS_PYARROW: storage = "pyarrow" else: @@ -166,11 +168,19 @@ def __init__( self.storage = storage self._na_value = na_value + def __repr__(self) -> str: + if self._na_value is libmissing.NA: + return f"{self.name}[{self.storage}]" + else: + # TODO add more informative repr + return self.name + def __eq__(self, other: object) -> bool: # we need to override the base class __eq__ because na_value (NA or NaN) # cannot be checked with normal `==` if isinstance(other, str): - if other == self.name: + # TODO should dtype == "string" work for the NaN variant? + if other == "string" or other == self.name: # noqa: PLR1714 return True try: other = self.construct_from_string(other) @@ -227,6 +237,8 @@ def construct_from_string(cls, string) -> Self: ) if string == "string": return cls() + elif string == "str" and using_string_dtype(): + return cls(na_value=np.nan) elif string == "string[python]": return cls(storage="python") elif string == "string[pyarrow]": diff --git a/pandas/core/frame.py b/pandas/core/frame.py index afcd4d014316e3..1403fc2ceaaf8d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4979,7 +4979,9 @@ def select_dtypes(self, include=None, exclude=None) -> Self: ----- * To select all *numeric* types, use ``np.number`` or ``'number'`` * To select strings you must use the ``object`` dtype, but note that - this will return *all* object dtype columns + this will return *all* object dtype columns. With + ``pd.options.future.infer_string`` enabled, using ``"str"`` will + work to select all string columns. * See the `numpy dtype hierarchy `__ * To select datetimes, use ``np.datetime64``, ``'datetime'`` or diff --git a/pandas/core/interchange/utils.py b/pandas/core/interchange/utils.py index fd1c7c96392428..035a1f8abdbc5d 100644 --- a/pandas/core/interchange/utils.py +++ b/pandas/core/interchange/utils.py @@ -135,7 +135,12 @@ def dtype_to_arrow_c_fmt(dtype: DtypeObj) -> str: if format_str is not None: return format_str - if lib.is_np_dtype(dtype, "M"): + if isinstance(dtype, pd.StringDtype): + # TODO(infer_string) this should be LARGE_STRING for pyarrow storage, + # but current tests don't cover this distinction + return ArrowCTypes.STRING + + elif lib.is_np_dtype(dtype, "M"): # Selecting the first char of resolution string: # dtype.str -> ' 'n' resolution = np.datetime_data(dtype)[0][0] diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index aee9100702350f..6ac0b49f0e4e76 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -110,7 +110,7 @@ def test_numba_unsupported_dtypes(apply_axis): with pytest.raises( ValueError, - match="Column b must have a numeric dtype. Found 'object|string' instead", + match="Column b must have a numeric dtype. Found 'object|str' instead", ): df.apply(f, engine="numba", axis=apply_axis) diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index df24fa08f48e15..69f84ca74ab0b4 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -244,7 +244,7 @@ def test_apply_categorical(by_row, using_infer_string): result = ser.apply(lambda x: "A") exp = Series(["A"] * 7, name="XX", index=list("abcdefg")) tm.assert_series_equal(result, exp) - assert result.dtype == object if not using_infer_string else "string[pyarrow_numpy]" + assert result.dtype == object if not using_infer_string else "str" @pytest.mark.parametrize("series", [["1-1", "1-1", np.nan], ["1-1", "1-2", np.nan]]) diff --git a/pandas/tests/arrays/boolean/test_astype.py b/pandas/tests/arrays/boolean/test_astype.py index 932e903c0e4481..8c2672218f273c 100644 --- a/pandas/tests/arrays/boolean/test_astype.py +++ b/pandas/tests/arrays/boolean/test_astype.py @@ -5,7 +5,7 @@ import pandas._testing as tm -def test_astype(): +def test_astype(using_infer_string): # with missing values arr = pd.array([True, False, None], dtype="boolean") @@ -20,8 +20,14 @@ def test_astype(): tm.assert_numpy_array_equal(result, expected) result = arr.astype("str") - expected = np.array(["True", "False", ""], dtype=f"{tm.ENDIAN}U5") - tm.assert_numpy_array_equal(result, expected) + if using_infer_string: + expected = pd.array( + ["True", "False", None], dtype=pd.StringDtype(na_value=np.nan) + ) + tm.assert_extension_array_equal(result, expected) + else: + expected = np.array(["True", "False", ""], dtype=f"{tm.ENDIAN}U5") + tm.assert_numpy_array_equal(result, expected) # no missing values arr = pd.array([True, False, True], dtype="boolean") diff --git a/pandas/tests/arrays/categorical/test_astype.py b/pandas/tests/arrays/categorical/test_astype.py index a2a53af6ab1ad3..ee930ac84aaf24 100644 --- a/pandas/tests/arrays/categorical/test_astype.py +++ b/pandas/tests/arrays/categorical/test_astype.py @@ -89,7 +89,7 @@ def test_astype(self, ordered): expected = np.array(cat) tm.assert_numpy_array_equal(result, expected) - msg = r"Cannot cast object|string dtype to float64" + msg = r"Cannot cast object|str dtype to float64" with pytest.raises(ValueError, match=msg): cat.astype(float) diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index e2e5d47f502093..3a2c489920eb05 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -22,7 +22,7 @@ def test_print(self, using_infer_string): if using_infer_string: expected = [ "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']", - "Categories (3, string): [a < b < c]", + "Categories (3, str): [a < b < c]", ] else: expected = [ diff --git a/pandas/tests/arrays/floating/test_astype.py b/pandas/tests/arrays/floating/test_astype.py index ade3dbd2c99da3..ccf644b34051d7 100644 --- a/pandas/tests/arrays/floating/test_astype.py +++ b/pandas/tests/arrays/floating/test_astype.py @@ -63,12 +63,21 @@ def test_astype_to_integer_array(): tm.assert_extension_array_equal(result, expected) -def test_astype_str(): +def test_astype_str(using_infer_string): a = pd.array([0.1, 0.2, None], dtype="Float64") - expected = np.array(["0.1", "0.2", ""], dtype="U32") - tm.assert_numpy_array_equal(a.astype(str), expected) - tm.assert_numpy_array_equal(a.astype("str"), expected) + if using_infer_string: + expected = pd.array(["0.1", "0.2", None], dtype=pd.StringDtype(na_value=np.nan)) + tm.assert_extension_array_equal(a.astype("str"), expected) + + # TODO(infer_string) this should also be a string array like above + expected = np.array(["0.1", "0.2", ""], dtype="U32") + tm.assert_numpy_array_equal(a.astype(str), expected) + else: + expected = np.array(["0.1", "0.2", ""], dtype="U32") + + tm.assert_numpy_array_equal(a.astype(str), expected) + tm.assert_numpy_array_equal(a.astype("str"), expected) def test_astype_copy(): diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py index 8620763988e067..7be00e569b3fe6 100644 --- a/pandas/tests/arrays/integer/test_dtypes.py +++ b/pandas/tests/arrays/integer/test_dtypes.py @@ -278,12 +278,21 @@ def test_to_numpy_na_raises(dtype): a.to_numpy(dtype=dtype) -def test_astype_str(): +def test_astype_str(using_infer_string): a = pd.array([1, 2, None], dtype="Int64") - expected = np.array(["1", "2", ""], dtype=f"{tm.ENDIAN}U21") - tm.assert_numpy_array_equal(a.astype(str), expected) - tm.assert_numpy_array_equal(a.astype("str"), expected) + if using_infer_string: + expected = pd.array(["1", "2", None], dtype=pd.StringDtype(na_value=np.nan)) + tm.assert_extension_array_equal(a.astype("str"), expected) + + # TODO(infer_string) this should also be a string array like above + expected = np.array(["1", "2", ""], dtype=f"{tm.ENDIAN}U21") + tm.assert_numpy_array_equal(a.astype(str), expected) + else: + expected = np.array(["1", "2", ""], dtype=f"{tm.ENDIAN}U21") + + tm.assert_numpy_array_equal(a.astype(str), expected) + tm.assert_numpy_array_equal(a.astype("str"), expected) def test_astype_boolean(): diff --git a/pandas/tests/arrays/interval/test_interval_pyarrow.py b/pandas/tests/arrays/interval/test_interval_pyarrow.py index be87d5d3ef7bab..ef8701be81e2b9 100644 --- a/pandas/tests/arrays/interval/test_interval_pyarrow.py +++ b/pandas/tests/arrays/interval/test_interval_pyarrow.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd import pandas._testing as tm from pandas.core.arrays import IntervalArray @@ -82,7 +80,6 @@ def test_arrow_array_missing(): assert result.storage.equals(expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.filterwarnings( "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) diff --git a/pandas/tests/arrays/period/test_arrow_compat.py b/pandas/tests/arrays/period/test_arrow_compat.py index ff86b696c84037..431309aca0df21 100644 --- a/pandas/tests/arrays/period/test_arrow_compat.py +++ b/pandas/tests/arrays/period/test_arrow_compat.py @@ -1,7 +1,5 @@ import pytest -from pandas._config import using_string_dtype - from pandas.compat.pyarrow import pa_version_under10p1 from pandas.core.dtypes.dtypes import PeriodDtype @@ -79,7 +77,6 @@ def test_arrow_array_missing(): assert result.storage.equals(expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_arrow_table_roundtrip(): from pandas.core.arrays.arrow.extension_types import ArrowPeriodType @@ -99,7 +96,6 @@ def test_arrow_table_roundtrip(): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_arrow_load_from_zero_chunks(): # GH-41040 diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index b51b01c2b51685..1296cc3b5a4941 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -65,7 +65,7 @@ def test_repr(dtype): assert repr(df) == expected if dtype.na_value is np.nan: - expected = "0 a\n1 NaN\n2 b\nName: A, dtype: string" + expected = "0 a\n1 NaN\n2 b\nName: A, dtype: str" else: expected = "0 a\n1 \n2 b\nName: A, dtype: string" assert repr(df.A) == expected @@ -75,10 +75,10 @@ def test_repr(dtype): expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string" elif dtype.storage == "pyarrow" and dtype.na_value is np.nan: arr_name = "ArrowStringArrayNumpySemantics" - expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string" + expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: str" elif dtype.storage == "python" and dtype.na_value is np.nan: arr_name = "StringArrayNumpySemantics" - expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string" + expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: str" else: arr_name = "StringArray" expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string" @@ -502,7 +502,7 @@ def test_fillna_args(dtype): tm.assert_extension_array_equal(res, expected) if dtype.storage == "pyarrow": - msg = "Invalid value '1' for dtype string" + msg = "Invalid value '1' for dtype str" else: msg = "Cannot set non-string value '1' into a StringArray." with pytest.raises(TypeError, match=msg): @@ -524,7 +524,7 @@ def test_arrow_array(dtype): assert arr.equals(expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_arrow_roundtrip(dtype, string_storage, using_infer_string): # roundtrip possible from arrow 1.0.0 @@ -539,14 +539,17 @@ def test_arrow_roundtrip(dtype, string_storage, using_infer_string): assert table.field("a").type == "large_string" with pd.option_context("string_storage", string_storage): result = table.to_pandas() - assert isinstance(result["a"].dtype, pd.StringDtype) - expected = df.astype(f"string[{string_storage}]") - tm.assert_frame_equal(result, expected) - # ensure the missing value is represented by NA and not np.nan or None - assert result.loc[2, "a"] is result["a"].dtype.na_value + if dtype.na_value is np.nan and not using_string_dtype(): + assert result["a"].dtype == "object" + else: + assert isinstance(result["a"].dtype, pd.StringDtype) + expected = df.astype(f"string[{string_storage}]") + tm.assert_frame_equal(result, expected) + # ensure the missing value is represented by NA and not np.nan or None + assert result.loc[2, "a"] is result["a"].dtype.na_value -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string): # GH-41040 @@ -563,9 +566,13 @@ def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string): table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema) with pd.option_context("string_storage", string_storage): result = table.to_pandas() - assert isinstance(result["a"].dtype, pd.StringDtype) - expected = df.astype(f"string[{string_storage}]") - tm.assert_frame_equal(result, expected) + + if dtype.na_value is np.nan and not using_string_dtype(): + assert result["a"].dtype == "object" + else: + assert isinstance(result["a"].dtype, pd.StringDtype) + expected = df.astype(f"string[{string_storage}]") + tm.assert_frame_equal(result, expected) def test_value_counts_na(dtype): diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index d38b728aaf1205..8d5c16e448ceee 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -4,6 +4,7 @@ import numpy as np import pytest +from pandas.compat import HAS_PYARROW import pandas.util._test_decorators as td import pandas as pd @@ -27,8 +28,9 @@ def test_eq_all_na(): def test_config(string_storage, request, using_infer_string): - if using_infer_string and string_storage == "python": - # python string storage with na_value=NaN is not yet implemented + if using_infer_string and string_storage == "python" and HAS_PYARROW: + # string storage with na_value=NaN always uses pyarrow if available + # -> does not yet honor the option request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) with pd.option_context("string_storage", string_storage): diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 4961123a7ca079..360ab960088eda 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -295,7 +295,9 @@ def test_searchsorted(self): assert result == 10 @pytest.mark.parametrize("box", [None, "index", "series"]) - def test_searchsorted_castable_strings(self, arr1d, box, string_storage): + def test_searchsorted_castable_strings( + self, arr1d, box, string_storage, using_infer_string + ): arr = arr1d if box is None: pass @@ -331,7 +333,8 @@ def test_searchsorted_castable_strings(self, arr1d, box, string_storage): TypeError, match=re.escape( f"value should be a '{arr1d._scalar_type.__name__}', 'NaT', " - "or array of those. Got string array instead." + "or array of those. Got " + f"{'str' if using_infer_string else 'string'} array instead." ), ): arr.searchsorted([str(arr[1]), "baz"]) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index c34c97b6e4f048..e0232bb292d6eb 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -799,3 +799,22 @@ def test_pandas_dtype_ea_not_instance(): # GH 31356 GH 54592 with tm.assert_produces_warning(UserWarning): assert pandas_dtype(CategoricalDtype) == CategoricalDtype() + + +def test_pandas_dtype_string_dtypes(string_storage): + # TODO(infer_string) remove skip if "python" is supported + pytest.importorskip("pyarrow") + with pd.option_context("future.infer_string", True): + with pd.option_context("string_storage", string_storage): + result = pandas_dtype("str") + # TODO(infer_string) hardcoded to pyarrow until python is supported + assert result == pd.StringDtype("pyarrow", na_value=np.nan) + + with pd.option_context("future.infer_string", False): + with pd.option_context("string_storage", string_storage): + result = pandas_dtype("str") + assert result == np.dtype("U") + + with pd.option_context("string_storage", string_storage): + result = pandas_dtype("string") + assert result == pd.StringDtype(string_storage, na_value=pd.NA) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index e522d2666a2dca..a4916ed1bbd8ac 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -1062,7 +1062,7 @@ def test_str_vs_repr(self, ordered, using_infer_string): c1 = CategoricalDtype(["a", "b"], ordered=ordered) assert str(c1) == "category" # Py2 will have unicode prefixes - dtype = "string" if using_infer_string else "object" + dtype = "str" if using_infer_string else "object" pat = ( r"CategoricalDtype\(categories=\[.*\], ordered={ordered}, " rf"categories_dtype={dtype}\)" diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 1102d9d9416634..f800f734ec9d96 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -116,6 +116,20 @@ def test_is_not_string_type(self, dtype): # because StringDtype is a string type assert is_string_dtype(dtype) + def test_is_dtype_from_name(self, dtype, using_infer_string): + if dtype.na_value is np.nan and not using_infer_string: + result = type(dtype).is_dtype(dtype.name) + assert result is False + else: + super().test_is_dtype_from_name(dtype) + + def test_construct_from_string_own_name(self, dtype, using_infer_string): + if dtype.na_value is np.nan and not using_infer_string: + with pytest.raises(TypeError, match="Cannot construct a 'StringDtype'"): + dtype.construct_from_string(dtype.name) + else: + super().test_construct_from_string_own_name(dtype) + def test_view(self, data): if data.dtype.storage == "pyarrow": pytest.skip(reason="2D support not implemented for ArrowStringArray") diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 09f359df37dd19..ec00044b84c49e 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -337,7 +337,7 @@ def test_setitem( smaller["col10"] = ["1", "2"] if using_infer_string: - assert smaller["col10"].dtype == "string" + assert smaller["col10"].dtype == "str" else: assert smaller["col10"].dtype == np.object_ assert (smaller["col10"] == ["1", "2"]).all() @@ -472,13 +472,13 @@ def test_setitem_corner(self, float_frame, using_infer_string): del dm["foo"] dm["foo"] = "bar" if using_infer_string: - assert dm["foo"].dtype == "string" + assert dm["foo"].dtype == "str" else: assert dm["foo"].dtype == np.object_ dm["coercible"] = ["1", "2", "3"] if using_infer_string: - assert dm["coercible"].dtype == "string" + assert dm["coercible"].dtype == "str" else: assert dm["coercible"].dtype == np.object_ @@ -514,7 +514,7 @@ def test_setitem_ambig(self, using_infer_string): dm[2] = uncoercable_series assert len(dm.columns) == 3 if using_infer_string: - assert dm[2].dtype == "string" + assert dm[2].dtype == "str" else: assert dm[2].dtype == np.object_ diff --git a/pandas/tests/frame/indexing/test_set_value.py b/pandas/tests/frame/indexing/test_set_value.py index ce771280bc2649..3d23e13264911c 100644 --- a/pandas/tests/frame/indexing/test_set_value.py +++ b/pandas/tests/frame/indexing/test_set_value.py @@ -28,7 +28,7 @@ def test_set_value_resize(self, float_frame, using_infer_string): res = float_frame.copy() res._set_value("foobar", "baz", "sam") if using_infer_string: - assert res["baz"].dtype == "string" + assert res["baz"].dtype == "str" else: assert res["baz"].dtype == np.object_ res = float_frame.copy() diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index ea9cc22d937580..9c27e76de91b20 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -202,7 +202,7 @@ def test_astype_dict_like(self, dtype_class): expected = DataFrame( { "a": a, - "b": Series(["0", "1", "2", "3", "4"], dtype="object"), + "b": Series(["0", "1", "2", "3", "4"], dtype="str"), "c": c, "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float32"), } @@ -263,9 +263,9 @@ def test_astype_duplicate_col(self): a2 = Series([0, 1, 2, 3, 4], name="a") df = concat([a1, b, a2], axis=1) - result = df.astype(str) + result = df.astype("str") a1_str = Series(["1", "2", "3", "4", "5"], dtype="str", name="a") - b_str = Series(["0.1", "0.2", "0.4", "0.6", "0.8"], dtype=str, name="b") + b_str = Series(["0.1", "0.2", "0.4", "0.6", "0.8"], dtype="str", name="b") a2_str = Series(["0", "1", "2", "3", "4"], dtype="str", name="a") expected = concat([a1_str, b_str, a2_str], axis=1) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_get_numeric_data.py b/pandas/tests/frame/methods/test_get_numeric_data.py index c5d32d56d03c11..6d097e75f6703c 100644 --- a/pandas/tests/frame/methods/test_get_numeric_data.py +++ b/pandas/tests/frame/methods/test_get_numeric_data.py @@ -33,7 +33,9 @@ def test_get_numeric_data(self, using_infer_string): [ np.dtype("float64"), np.dtype("int64"), - np.dtype(objectname) if not using_infer_string else "string", + np.dtype(objectname) + if not using_infer_string + else pd.StringDtype(na_value=np.nan), np.dtype(datetime64name), ], index=["a", "b", "c", "f"], diff --git a/pandas/tests/frame/methods/test_nlargest.py b/pandas/tests/frame/methods/test_nlargest.py index 3ba893501914ac..54f2e45488b788 100644 --- a/pandas/tests/frame/methods/test_nlargest.py +++ b/pandas/tests/frame/methods/test_nlargest.py @@ -86,7 +86,7 @@ def test_nlargest_n(self, df_strings, nselect_method, n, order): df = df_strings if "b" in order: error_msg = ( - f"Column 'b' has dtype (object|string), " + f"Column 'b' has dtype (object|str), " f"cannot use method '{nselect_method}' with this dtype" ) with pytest.raises(TypeError, match=error_msg): diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index 44d7bbf57fe0a2..8d93c97b6b68ac 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -664,7 +664,7 @@ def test_reset_index_dtypes_on_empty_frame_with_multiindex( idx = MultiIndex.from_product([[0, 1], [0.5, 1.0], array]) result = DataFrame(index=idx)[:0].reset_index().dtypes if using_infer_string and dtype == object: - dtype = "string" + dtype = pd.StringDtype(na_value=np.nan) expected = Series({"level_0": np.int64, "level_1": np.float64, "level_2": dtype}) tm.assert_series_equal(result, expected) @@ -697,7 +697,7 @@ def test_reset_index_empty_frame_with_datetime64_multiindex_from_groupby( expected["c3"] = expected["c3"].astype("datetime64[ns]") expected["c1"] = expected["c1"].astype("float64") if using_infer_string: - expected["c2"] = expected["c2"].astype("string[pyarrow_numpy]") + expected["c2"] = expected["c2"].astype("str") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py index d1bee6a3de6139..875dca321635f4 100644 --- a/pandas/tests/frame/methods/test_select_dtypes.py +++ b/pandas/tests/frame/methods/test_select_dtypes.py @@ -50,7 +50,7 @@ def copy(self): class TestSelectDtypes: - def test_select_dtypes_include_using_list_like(self): + def test_select_dtypes_include_using_list_like(self, using_infer_string): df = DataFrame( { "a": list("abc"), @@ -94,6 +94,11 @@ def test_select_dtypes_include_using_list_like(self): with pytest.raises(NotImplementedError, match=r"^$"): df.select_dtypes(include=["period"]) + if using_infer_string: + ri = df.select_dtypes(include=["str"]) + ei = df[["a"]] + tm.assert_frame_equal(ri, ei) + def test_select_dtypes_exclude_using_list_like(self): df = DataFrame( { @@ -151,7 +156,7 @@ def test_select_dtypes_exclude_include_int(self, include): expected = df[["b", "c", "e"]] tm.assert_frame_equal(result, expected) - def test_select_dtypes_include_using_scalars(self): + def test_select_dtypes_include_using_scalars(self, using_infer_string): df = DataFrame( { "a": list("abc"), @@ -187,6 +192,11 @@ def test_select_dtypes_include_using_scalars(self): with pytest.raises(NotImplementedError, match=r"^$"): df.select_dtypes(include="period") + if using_infer_string: + ri = df.select_dtypes(include="str") + ei = df[["a"]] + tm.assert_frame_equal(ri, ei) + def test_select_dtypes_exclude_using_scalars(self): df = DataFrame( { @@ -347,7 +357,10 @@ def test_select_dtypes_datetime_with_tz(self): @pytest.mark.parametrize("dtype", [str, "str", np.bytes_, "S1", np.str_, "U1"]) @pytest.mark.parametrize("arg", ["include", "exclude"]) - def test_select_dtypes_str_raises(self, dtype, arg): + def test_select_dtypes_str_raises(self, dtype, arg, using_infer_string): + if using_infer_string and dtype == "str": + # this is tested below + pytest.skip("Selecting string columns works with future strings") df = DataFrame( { "a": list("abc"), diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index bed8b030bc72a8..20a8e95f990ecc 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -697,10 +697,7 @@ def test_to_csv_interval_index(self, using_infer_string): # can't roundtrip intervalindex via read_csv so check string repr (GH 23595) expected = df.copy() - if using_infer_string: - expected.index = expected.index.astype("string[pyarrow_numpy]") - else: - expected.index = expected.index.astype(str) + expected.index = expected.index.astype("str") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 9bd61736624cab..0766e927a64a9a 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -209,7 +209,9 @@ def test_construction_with_mixed(self, float_string_frame, using_infer_string): expected = Series( [np.dtype("float64")] * 4 + [ - np.dtype("object") if not using_infer_string else "string", + np.dtype("object") + if not using_infer_string + else pd.StringDtype(na_value=np.nan), np.dtype("datetime64[us]"), np.dtype("timedelta64[us]"), ], diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index aab900f6eef473..c9eb2d5ca7be43 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -265,7 +265,7 @@ def test_emptylike_constructor(self, emptylike, expected_index, expected_columns tm.assert_frame_equal(result, expected) def test_constructor_mixed(self, float_string_frame, using_infer_string): - dtype = "string" if using_infer_string else np.object_ + dtype = "str" if using_infer_string else np.object_ assert float_string_frame["foo"].dtype == dtype def test_constructor_cast_failure(self): @@ -789,7 +789,7 @@ def test_constructor_dict_cast(self, using_infer_string): frame = DataFrame(test_data) assert len(frame) == 3 - assert frame["B"].dtype == np.object_ if not using_infer_string else "string" + assert frame["B"].dtype == np.object_ if not using_infer_string else "str" assert frame["A"].dtype == np.float64 def test_constructor_dict_cast2(self): @@ -1209,7 +1209,7 @@ def test_constructor_scalar_inference(self, using_infer_string): assert df["bool"].dtype == np.bool_ assert df["float"].dtype == np.float64 assert df["complex"].dtype == np.complex128 - assert df["object"].dtype == np.object_ if not using_infer_string else "string" + assert df["object"].dtype == np.object_ if not using_infer_string else "str" def test_constructor_arrays_and_scalars(self): df = DataFrame({"a": np.random.default_rng(2).standard_normal(10), "b": True}) @@ -1292,7 +1292,7 @@ def test_constructor_list_of_lists(self, using_infer_string): # GH #484 df = DataFrame(data=[[1, "a"], [2, "b"]], columns=["num", "str"]) assert is_integer_dtype(df["num"]) - assert df["str"].dtype == np.object_ if not using_infer_string else "string" + assert df["str"].dtype == np.object_ if not using_infer_string else "str" # GH 4851 # list of 0-dim ndarrays @@ -1860,7 +1860,12 @@ def test_constructor_with_datetimes(self, using_infer_string): result = df.dtypes expected = Series( [np.dtype("int64")] - + [np.dtype(objectname) if not using_infer_string else "string"] * 2 + + [ + np.dtype(objectname) + if not using_infer_string + else pd.StringDtype(na_value=np.nan) + ] + * 2 + [np.dtype("M8[s]"), np.dtype("M8[us]")], index=list("ABCDE"), ) @@ -1882,7 +1887,11 @@ def test_constructor_with_datetimes(self, using_infer_string): expected = Series( [np.dtype("float64")] + [np.dtype("int64")] - + [np.dtype("object") if not using_infer_string else "string"] + + [ + np.dtype("object") + if not using_infer_string + else pd.StringDtype(na_value=np.nan) + ] + [np.dtype("float64")] + [np.dtype(intname)], index=["a", "b", "c", floatname, intname], @@ -1904,7 +1913,11 @@ def test_constructor_with_datetimes(self, using_infer_string): expected = Series( [np.dtype("float64")] + [np.dtype("int64")] - + [np.dtype("object") if not using_infer_string else "string"] + + [ + np.dtype("object") + if not using_infer_string + else pd.StringDtype(na_value=np.nan) + ] + [np.dtype("float64")] + [np.dtype(intname)], index=["a", "b", "c", floatname, intname], @@ -2124,7 +2137,9 @@ def test_constructor_for_list_with_dtypes(self, using_infer_string): [ np.dtype("int64"), np.dtype("float64"), - np.dtype("object") if not using_infer_string else "string", + np.dtype("object") + if not using_infer_string + else pd.StringDtype(na_value=np.nan), np.dtype("datetime64[ns]"), np.dtype("float64"), ], diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 75ef348b75debb..2c3e9c1d5e327c 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -657,7 +657,11 @@ def test_unstack_dtypes(self, using_infer_string): df2["D"] = "foo" df3 = df2.unstack("B") result = df3.dtypes - dtype = "string" if using_infer_string else np.dtype("object") + dtype = ( + pd.StringDtype(na_value=np.nan) + if using_infer_string + else np.dtype("object") + ) expected = Series( [np.dtype("float64")] * 2 + [dtype] * 2, index=MultiIndex.from_arrays( diff --git a/pandas/tests/generic/test_to_xarray.py b/pandas/tests/generic/test_to_xarray.py index d8401a8b2ae3f3..9fe9bca8abdc9b 100644 --- a/pandas/tests/generic/test_to_xarray.py +++ b/pandas/tests/generic/test_to_xarray.py @@ -52,7 +52,7 @@ def test_to_xarray_index_types(self, index_flat, df, using_infer_string): # column names are lost expected = df.copy() expected["f"] = expected["f"].astype( - object if not using_infer_string else "string[pyarrow_numpy]" + object if not using_infer_string else "str" ) expected.columns.name = None tm.assert_frame_equal(result.to_dataframe(), expected) @@ -81,7 +81,7 @@ def test_to_xarray_with_multiindex(self, df, using_infer_string): result = result.to_dataframe() expected = df.copy() expected["f"] = expected["f"].astype( - object if not using_infer_string else "string[pyarrow_numpy]" + object if not using_infer_string else "str" ) expected.columns.name = None tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 4972a6b3afa177..d91510d834e6c8 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -79,7 +77,7 @@ def test_apply_index_date(using_infer_string): tm.assert_frame_equal(result, expected) -def test_apply_index_date_object(using_infer_string): +def test_apply_index_date_object(): # GH 5789 # don't auto coerce dates ts = [ @@ -111,10 +109,7 @@ def test_apply_index_date_object(using_infer_string): 1.40750, 1.40649, ] - dtype = "string[pyarrow_numpy]" if using_infer_string else object - exp_idx = Index( - ["2011-05-16", "2011-05-17", "2011-05-18"], dtype=dtype, name="date" - ) + exp_idx = Index(["2011-05-16", "2011-05-17", "2011-05-18"], name="date") expected = Series(["00:00", "02:00", "02:00"], index=exp_idx) msg = "DataFrameGroupBy.apply operated on the grouping columns" with tm.assert_produces_warning(DeprecationWarning, match=msg): @@ -942,12 +937,11 @@ def test_func_returns_object(): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "group_column_dtlike", [datetime.today(), datetime.today().date(), datetime.today().time()], ) -def test_apply_datetime_issue(group_column_dtlike, using_infer_string): +def test_apply_datetime_issue(group_column_dtlike): # GH-28247 # groupby-apply throws an error if one of the columns in the DataFrame # is a datetime object and the column labels are different from @@ -958,8 +952,7 @@ def test_apply_datetime_issue(group_column_dtlike, using_infer_string): with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42])) - dtype = "string" if using_infer_string else "object" - expected = DataFrame(["spam"], Index(["foo"], dtype=dtype, name="a"), columns=[42]) + expected = DataFrame(["spam"], Index(["foo"], dtype="str", name="a"), columns=[42]) tm.assert_frame_equal(result, expected) @@ -1040,7 +1033,7 @@ def test_groupby_apply_datetime_result_dtypes(using_infer_string): msg = "DataFrameGroupBy.apply operated on the grouping columns" with tm.assert_produces_warning(DeprecationWarning, match=msg): result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes - dtype = "string" if using_infer_string else object + dtype = pd.StringDtype(na_value=np.nan) if using_infer_string else object expected = Series( [np.dtype("datetime64[ns]"), dtype, dtype, np.int64, dtype], index=["observation", "color", "mood", "intensity", "score"], diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 1073dda954563e..c70995de7b3b27 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -131,7 +131,7 @@ def f(x): result = g.apply(f) expected = x.iloc[[0, 1]].copy() expected.index = Index([1, 2], name="person_id") - dtype = "string[pyarrow_numpy]" if using_infer_string else object + dtype = "str" if using_infer_string else object expected["person_name"] = expected["person_name"].astype(dtype) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 4381b36b0b73aa..dc1658e9acf3b0 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1216,7 +1216,7 @@ def test_groupby_complex_mean(): tm.assert_frame_equal(result, expected) -def test_groupby_complex_numbers(using_infer_string): +def test_groupby_complex_numbers(): # GH 17927 df = DataFrame( [ @@ -1225,11 +1225,10 @@ def test_groupby_complex_numbers(using_infer_string): {"a": 4, "b": 1}, ] ) - dtype = "string[pyarrow_numpy]" if using_infer_string else object expected = DataFrame( np.array([1, 1, 1], dtype=np.int64), index=Index([(1 + 1j), (1 + 2j), (1 + 0j)], name="b"), - columns=Index(["a"], dtype=dtype), + columns=Index(["a"]), ) result = df.groupby("b", sort=False).count() tm.assert_frame_equal(result, expected) @@ -2097,7 +2096,7 @@ def get_categorical_invalid_expected(): idx = Index(lev, name=keys[0]) if using_infer_string: - columns = Index([], dtype="string[pyarrow_numpy]") + columns = Index([], dtype="str") else: columns = [] expected = DataFrame([], columns=columns, index=idx) diff --git a/pandas/tests/groupby/test_numeric_only.py b/pandas/tests/groupby/test_numeric_only.py index ff4685b1e412d9..029d322e4fdc3b 100644 --- a/pandas/tests/groupby/test_numeric_only.py +++ b/pandas/tests/groupby/test_numeric_only.py @@ -181,6 +181,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): "category type does not support sum operations", re.escape(f"agg function failed [how->{method},dtype->object]"), re.escape(f"agg function failed [how->{method},dtype->string]"), + re.escape(f"agg function failed [how->{method},dtype->str]"), ] ) with pytest.raises(exception, match=msg): @@ -198,6 +199,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): f"Cannot perform {method} with non-ordered Categorical", re.escape(f"agg function failed [how->{method},dtype->object]"), re.escape(f"agg function failed [how->{method},dtype->string]"), + re.escape(f"agg function failed [how->{method},dtype->str]"), ] ) with pytest.raises(exception, match=msg): diff --git a/pandas/tests/indexes/base_class/test_formats.py b/pandas/tests/indexes/base_class/test_formats.py index b2f345e5e6f774..955e3be107f751 100644 --- a/pandas/tests/indexes/base_class/test_formats.py +++ b/pandas/tests/indexes/base_class/test_formats.py @@ -9,7 +9,6 @@ class TestIndexRendering: - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_repr_is_valid_construction_code(self): # for the case of Index, where the repr is traditional rather than # stylized diff --git a/pandas/tests/indexes/multi/test_constructors.py b/pandas/tests/indexes/multi/test_constructors.py index 8456e6a7acba5a..b1180f2d7af145 100644 --- a/pandas/tests/indexes/multi/test_constructors.py +++ b/pandas/tests/indexes/multi/test_constructors.py @@ -851,7 +851,7 @@ def test_dtype_representation(using_infer_string): # GH#46900 pmidx = MultiIndex.from_arrays([[1], ["a"]], names=[("a", "b"), ("c", "d")]) result = pmidx.dtypes - exp = "object" if not using_infer_string else "string" + exp = "object" if not using_infer_string else pd.StringDtype(na_value=np.nan) expected = Series( ["int64", exp], index=MultiIndex.from_tuples([("a", "b"), ("c", "d")]), diff --git a/pandas/tests/indexes/multi/test_get_set.py b/pandas/tests/indexes/multi/test_get_set.py index 6eeaeb6711d03c..17ca8764873302 100644 --- a/pandas/tests/indexes/multi/test_get_set.py +++ b/pandas/tests/indexes/multi/test_get_set.py @@ -41,7 +41,7 @@ def test_get_dtypes(using_infer_string): names=["int", "string", "dt"], ) - exp = "object" if not using_infer_string else "string" + exp = "object" if not using_infer_string else pd.StringDtype(na_value=np.nan) expected = pd.Series( { "int": np.dtype("int64"), @@ -61,7 +61,7 @@ def test_get_dtypes_no_level_name(using_infer_string): pd.date_range("20200101", periods=2, tz="UTC"), ], ) - exp = "object" if not using_infer_string else "string" + exp = "object" if not using_infer_string else pd.StringDtype(na_value=np.nan) expected = pd.Series( { "level_0": np.dtype("int64"), @@ -82,7 +82,7 @@ def test_get_dtypes_duplicate_level_names(using_infer_string): ], names=["A", "A", "A"], ).dtypes - exp = "object" if not using_infer_string else "string" + exp = "object" if not using_infer_string else pd.StringDtype(na_value=np.nan) expected = pd.Series( [np.dtype("int64"), exp, DatetimeTZDtype(tz="utc")], index=["A", "A", "A"], diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index ebf9dac715f8d1..493a5be735d1a2 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -170,6 +170,7 @@ def test_get_indexer_non_unique_np_nats(self, np_nat_fixture, np_nat_fixture2): class TestSliceLocs: + # TODO(infer_string) parametrize over multiple string dtypes @pytest.mark.parametrize( "dtype", [ @@ -208,6 +209,7 @@ def test_slice_locs_negative_step(self, in_slice, expected, dtype): expected = Index(list(expected), dtype=dtype) tm.assert_index_equal(result, expected) + # TODO(infer_string) parametrize over multiple string dtypes @td.skip_if_no("pyarrow") def test_slice_locs_negative_step_oob(self): index = Index(list("bcdxy"), dtype="string[pyarrow_numpy]") diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 7eeb626d91dc8e..4d02ec853e0daa 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -79,7 +79,7 @@ def test_constructor_copy(self, using_infer_string): assert new_index.name == "name" if using_infer_string: tm.assert_extension_array_equal( - new_index.values, pd.array(arr, dtype="string[pyarrow_numpy]") + new_index.values, pd.array(arr, dtype="str") ) else: tm.assert_numpy_array_equal(arr, new_index.values) @@ -160,7 +160,7 @@ def test_constructor_from_frame_series_freq(self, using_infer_string): df = DataFrame(np.random.default_rng(2).random((5, 3))) df["date"] = dts result = DatetimeIndex(df["date"], freq="MS") - dtype = object if not using_infer_string else "string" + dtype = object if not using_infer_string else "str" assert df["date"].dtype == dtype expected.name = "date" tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexing/multiindex/test_loc.py b/pandas/tests/indexing/multiindex/test_loc.py index 5508153322adba..fa5ec63dd32fe5 100644 --- a/pandas/tests/indexing/multiindex/test_loc.py +++ b/pandas/tests/indexing/multiindex/test_loc.py @@ -588,7 +588,7 @@ def test_loc_nan_multiindex(using_infer_string): np.ones((1, 4)), index=Index( [np.nan], - dtype="object" if not using_infer_string else "string[pyarrow_numpy]", + dtype="object" if not using_infer_string else "str", name="u3", ), columns=Index(["d1", "d2", "d3", "d4"]), diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index d2c8454019a5e1..908e95accfb0f4 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -294,7 +294,7 @@ def test_dups_fancy_indexing_only_missing_label(self, using_infer_string): with pytest.raises( KeyError, match=re.escape( - "\"None of [Index(['E'], dtype='string')] are in the [index]\"" + "\"None of [Index(['E'], dtype='str')] are in the [index]\"" ), ): dfnu.loc[["E"]] diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 41431c0e2813b7..34d827a209daea 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -63,12 +63,17 @@ def test_not_change_nan_loc(series, new_series, expected_ser): class TestLoc: - def test_none_values_on_string_columns(self): + def test_none_values_on_string_columns(self, using_infer_string): # Issue #32218 - df = DataFrame(["1", "2", None], columns=["a"], dtype="str") - + df = DataFrame(["1", "2", None], columns=["a"], dtype=object) assert df.loc[2, "a"] is None + df = DataFrame(["1", "2", None], columns=["a"], dtype="str") + if using_infer_string: + assert np.isnan(df.loc[2, "a"]) + else: + assert df.loc[2, "a"] is None + @pytest.mark.parametrize("kind", ["series", "frame"]) def test_loc_getitem_int(self, kind, request): # int label @@ -1460,7 +1465,7 @@ def test_loc_setitem_single_row_categorical(self, using_infer_string): result = df["Alpha"] expected = Series(categories, index=df.index, name="Alpha").astype( - object if not using_infer_string else "string[pyarrow_numpy]" + object if not using_infer_string else "str" ) tm.assert_series_equal(result, expected) @@ -1635,7 +1640,7 @@ def test_loc_setitem_single_column_mixed(self, using_infer_string): df.loc[df.index[::2], "str"] = np.nan expected = Series( [np.nan, "qux", np.nan, "qux", np.nan], - dtype=object if not using_infer_string else "string[pyarrow_numpy]", + dtype=object if not using_infer_string else "str", ).values tm.assert_almost_equal(df["str"].values, expected) diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index ca551024b4c1fe..5fcb71d0186a68 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -227,7 +227,7 @@ def test_partial_set_empty_frame_empty_consistencies(self, using_infer_string): { "x": Series( ["1", "2"], - dtype=object if not using_infer_string else "string[pyarrow_numpy]", + dtype=object if not using_infer_string else "str", ), "y": Series([np.nan, np.nan], dtype=object), } diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index ce88bae6e02f28..30c5d3177c5a56 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -626,7 +626,7 @@ def _compare(old_mgr, new_mgr): mgr.iset(1, np.array(["2."] * N, dtype=np.object_)) mgr.iset(2, np.array(["foo."] * N, dtype=np.object_)) new_mgr = mgr.convert(copy=True) - dtype = "string[pyarrow_numpy]" if using_infer_string else np.object_ + dtype = "str" if using_infer_string else np.object_ assert new_mgr.iget(0).dtype == dtype assert new_mgr.iget(1).dtype == dtype assert new_mgr.iget(2).dtype == dtype diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 7ecddb18a61ece..57091b268a9db0 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -766,7 +766,7 @@ def test_to_excel_interval_no_labels(self, path, using_infer_string): df["new"] = pd.cut(df[0], 10) expected["new"] = pd.cut(expected[0], 10).astype( - str if not using_infer_string else "string[pyarrow_numpy]" + str if not using_infer_string else "str" ) df.to_excel(path, sheet_name="test1") diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index 53e819ac5eaff6..1e47b3bc38737c 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -75,7 +75,7 @@ def test_build_table_schema(self, df_schema, using_infer_string): "primaryKey": ["idx"], } if using_infer_string: - expected["fields"][2] = {"name": "B", "type": "any", "extDtype": "string"} + expected["fields"][2] = {"name": "B", "type": "any", "extDtype": "str"} assert result == expected result = build_table_schema(df_schema) assert "pandas_version" in result @@ -128,7 +128,7 @@ def test_multiindex(self, df_schema, using_infer_string): "type": "any", "extDtype": "string", } - expected["fields"][3] = {"name": "B", "type": "any", "extDtype": "string"} + expected["fields"][3] = {"name": "B", "type": "any", "extDtype": "str"} assert result == expected df.index.names = ["idx0", None] @@ -311,7 +311,7 @@ def test_to_json(self, df_table, using_infer_string): ] if using_infer_string: - fields[2] = {"name": "B", "type": "any", "extDtype": "string"} + fields[2] = {"name": "B", "type": "any", "extDtype": "str"} schema = {"fields": fields, "primaryKey": ["idx"]} data = [ diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index a1d2e93e7c5236..cb94111aedffdb 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -266,7 +266,7 @@ def test_roundtrip_categorical( expected = categorical_frame.copy() expected.index = expected.index.astype( - str if not using_infer_string else "string[pyarrow_numpy]" + str if not using_infer_string else "str" ) # Categorical not preserved expected.index.name = None # index names aren't preserved in JSON assert_json_roundtrip_equal(result, expected, orient) @@ -621,7 +621,7 @@ def test_blocks_compat_GH9037(self, using_infer_string): # JSON deserialisation always creates unicode strings df_mixed.columns = df_mixed.columns.astype( - np.str_ if not using_infer_string else "string[pyarrow_numpy]" + np.str_ if not using_infer_string else "str" ) data = StringIO(df_mixed.to_json(orient="split")) df_roundtrip = read_json(data, orient="split") @@ -706,7 +706,7 @@ def test_series_roundtrip_simple(self, orient, string_series, using_infer_string expected = string_series if using_infer_string and orient in ("split", "index", "columns"): # These schemas don't contain dtypes, so we infer string - expected.index = expected.index.astype("string[pyarrow_numpy]") + expected.index = expected.index.astype("str") if orient in ("values", "records"): expected = expected.reset_index(drop=True) if orient != "split": @@ -1492,7 +1492,6 @@ def test_from_json_to_json_table_index_and_columns(self, index, columns): result = read_json(StringIO(dfjson), orient="table") tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_from_json_to_json_table_dtypes(self): # GH21345 expected = DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["5", "6"]}) diff --git a/pandas/tests/io/pytables/test_keys.py b/pandas/tests/io/pytables/test_keys.py index 55bd3f0d5a03a1..7d0802dcf2e471 100644 --- a/pandas/tests/io/pytables/test_keys.py +++ b/pandas/tests/io/pytables/test_keys.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( DataFrame, HDFStore, @@ -13,7 +15,10 @@ tables, ) -pytestmark = pytest.mark.single_cpu +pytestmark = [ + pytest.mark.single_cpu, + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] def test_keys(setup_path): diff --git a/pandas/tests/io/pytables/test_subclass.py b/pandas/tests/io/pytables/test_subclass.py index 03622faa2b5a8f..bbe1cd77e0d9fa 100644 --- a/pandas/tests/io/pytables/test_subclass.py +++ b/pandas/tests/io/pytables/test_subclass.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( DataFrame, Series, @@ -17,6 +19,7 @@ class TestHDFStoreSubclass: # GH 33748 + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_supported_for_subclass_dataframe(self, tmp_path): data = {"a": [1, 2], "b": [3, 4]} sdf = tm.SubclassedDataFrame(data, dtype=np.intp) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 56707560c2fda8..75ecd1d929d586 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -377,6 +377,7 @@ def test_write_fspath_all(self, writer_name, writer_kwargs, module): expected = f_path.read() assert result == expected + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_write_fspath_hdf5(self): # Same test as write_fspath_all, except HDF5 files aren't # necessarily byte-for-byte identical for a given dataframe, so we'll diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 65f4156cedf492..19b60e17d3a924 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -168,6 +168,7 @@ def test_excel_options(fsspectest): assert fsspectest.test[0] == "read" +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_parquet_new_file(cleared_fs, df1): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") @@ -277,7 +278,6 @@ def test_not_present_exception(): read_csv("memory://test/test.csv") -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_feather_options(fsspectest): pytest.importorskip("pyarrow") df = DataFrame({"a": [0]}) diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index a7ae9c70497021..96bc0326b23aba 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -197,6 +197,7 @@ def test_to_csv_compression_encoding_gcs( tm.assert_frame_equal(df, read_df) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_parquet_gcs_new_file(monkeypatch, tmpdir): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") diff --git a/pandas/tests/io/xml/test_xml_dtypes.py b/pandas/tests/io/xml/test_xml_dtypes.py index b2d96cb1d91335..a85576ff13f5c1 100644 --- a/pandas/tests/io/xml/test_xml_dtypes.py +++ b/pandas/tests/io/xml/test_xml_dtypes.py @@ -4,8 +4,6 @@ import pytest -from pandas._config import using_string_dtype - from pandas.errors import ParserWarning import pandas.util._test_decorators as td @@ -85,7 +83,6 @@ def read_xml_iterparse(data, **kwargs): # DTYPE -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_dtype_single_str(parser): df_result = read_xml(StringIO(xml_types), dtype={"degrees": "str"}, parser=parser) df_iter = read_xml_iterparse( @@ -211,7 +208,6 @@ def test_wrong_dtype(xml_books, parser, iterparse): ) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_both_dtype_converters(parser): df_expected = DataFrame( { diff --git a/pandas/tests/reshape/concat/test_categorical.py b/pandas/tests/reshape/concat/test_categorical.py index bbaaf0abecfbdf..8e6a14e6bfb8f1 100644 --- a/pandas/tests/reshape/concat/test_categorical.py +++ b/pandas/tests/reshape/concat/test_categorical.py @@ -59,9 +59,7 @@ def test_categorical_concat_dtypes(self, using_infer_string): num = Series([1, 2, 3]) df = pd.concat([Series(cat), obj, num], axis=1, keys=index) - result = df.dtypes == ( - object if not using_infer_string else "string[pyarrow_numpy]" - ) + result = df.dtypes == (object if not using_infer_string else "str") expected = Series([False, True, False], index=index) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/reshape/concat/test_empty.py b/pandas/tests/reshape/concat/test_empty.py index 30ef0a934157ba..95600876151236 100644 --- a/pandas/tests/reshape/concat/test_empty.py +++ b/pandas/tests/reshape/concat/test_empty.py @@ -27,7 +27,7 @@ def test_handle_empty_objects(self, sort, using_infer_string): expected = df.reindex(columns=["a", "b", "c", "d", "foo"]) expected["foo"] = expected["foo"].astype( - object if not using_infer_string else "string[pyarrow_numpy]" + object if not using_infer_string else "str" ) expected.loc[0:4, "foo"] = "bar" @@ -284,7 +284,7 @@ def test_concat_empty_dataframe_different_dtypes(self, using_infer_string): result = concat([df1[:0], df2[:0]]) assert result["a"].dtype == np.int64 - assert result["b"].dtype == np.object_ if not using_infer_string else "string" + assert result["b"].dtype == np.object_ if not using_infer_string else "str" def test_concat_to_empty_ea(self): """48510 `concat` to an empty EA should maintain type EA dtype.""" diff --git a/pandas/tests/reshape/concat/test_index.py b/pandas/tests/reshape/concat/test_index.py index 52bb9fa0f151bb..49c94168d203e4 100644 --- a/pandas/tests/reshape/concat/test_index.py +++ b/pandas/tests/reshape/concat/test_index.py @@ -452,9 +452,7 @@ def test_concat_axis_1_sort_false_rangeindex(self, using_infer_string): s1 = Series(["a", "b", "c"]) s2 = Series(["a", "b"]) s3 = Series(["a", "b", "c", "d"]) - s4 = Series( - [], dtype=object if not using_infer_string else "string[pyarrow_numpy]" - ) + s4 = Series([], dtype=object if not using_infer_string else "str") result = concat( [s1, s2, s3, s4], sort=False, join="outer", ignore_index=False, axis=1 ) @@ -465,7 +463,7 @@ def test_concat_axis_1_sort_false_rangeindex(self, using_infer_string): ["c", np.nan] * 2, [np.nan] * 2 + ["d"] + [np.nan], ], - dtype=object if not using_infer_string else "string[pyarrow_numpy]", + dtype=object if not using_infer_string else "str", ) tm.assert_frame_equal( result, expected, check_index_type=True, check_column_type=True diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index db5a0437a14f0c..91f0cf6c310852 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -156,7 +156,7 @@ def test_join_on(self, target_source, infer_string): # overlap source_copy = source.copy() msg = ( - "You are trying to merge on float64 and object|string columns for key " + "You are trying to merge on float64 and object|str columns for key " "'A'. If you wish to proceed you should use pd.concat" ) with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index ed49f3b758cc5c..8a9fe9f3e2cfde 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -826,7 +826,7 @@ def test_overlapping_columns_error_message(self): # #2649, #10639 df2.columns = ["key1", "foo", "foo"] - msg = r"Data columns not unique: Index\(\['foo'\], dtype='object|string'\)" + msg = r"Data columns not unique: Index\(\['foo'\], dtype='object|str'\)" with pytest.raises(MergeError, match=msg): merge(df, df2) @@ -1877,7 +1877,7 @@ def test_identical(self, left, using_infer_string): # merging on the same, should preserve dtypes merged = merge(left, left, on="X") result = merged.dtypes.sort_index() - dtype = np.dtype("O") if not using_infer_string else "string" + dtype = np.dtype("O") if not using_infer_string else "str" expected = Series( [CategoricalDtype(categories=["foo", "bar"]), dtype, dtype], index=["X", "Y_x", "Y_y"], @@ -1889,7 +1889,7 @@ def test_basic(self, left, right, using_infer_string): # so should preserve the merged column merged = merge(left, right, on="X") result = merged.dtypes.sort_index() - dtype = np.dtype("O") if not using_infer_string else "string" + dtype = np.dtype("O") if not using_infer_string else "str" expected = Series( [ CategoricalDtype(categories=["foo", "bar"]), @@ -2003,7 +2003,7 @@ def test_other_columns(self, left, right, using_infer_string): merged = merge(left, right, on="X") result = merged.dtypes.sort_index() - dtype = np.dtype("O") if not using_infer_string else "string" + dtype = np.dtype("O") if not using_infer_string else "str" expected = Series( [ CategoricalDtype(categories=["foo", "bar"]), @@ -2040,7 +2040,7 @@ def test_dtype_on_merged_different( merged = merge(left, right, on="X", how=join_type) result = merged.dtypes.sort_index() - dtype = np.dtype("O") if not using_infer_string else "string" + dtype = np.dtype("O") if not using_infer_string else "str" expected = Series([dtype, dtype, np.dtype("int64")], index=["X", "Y", "Z"]) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 0865e3cfa81493..11e29f4e10dc42 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -3183,7 +3183,7 @@ def test_by_nullable(self, any_numeric_ea_dtype, using_infer_string): ) expected["value_y"] = np.array([np.nan, np.nan, np.nan], dtype=object) if using_infer_string: - expected["value_y"] = expected["value_y"].astype("string[pyarrow_numpy]") + expected["value_y"] = expected["value_y"].astype("str") tm.assert_frame_equal(result, expected) def test_merge_by_col_tz_aware(self): @@ -3234,7 +3234,7 @@ def test_by_mixed_tz_aware(self, using_infer_string): ) expected["value_y"] = np.array([np.nan], dtype=object) if using_infer_string: - expected["value_y"] = expected["value_y"].astype("string[pyarrow_numpy]") + expected["value_y"] = expected["value_y"].astype("str") tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype", ["float64", "int16", "m8[ns]", "M8[us]"]) diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index b12438b6327ad4..6009b263a83c56 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -336,7 +336,7 @@ def test_no_prefix_string_cats_default_category( dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]}) result = from_dummies(dummies, default_category=default_category) if using_infer_string: - expected[""] = expected[""].astype("string[pyarrow_numpy]") + expected[""] = expected[""].astype("str") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py index 3d9b3a6d1c7a23..2c17b7f6a5a475 100644 --- a/pandas/tests/reshape/test_get_dummies.py +++ b/pandas/tests/reshape/test_get_dummies.py @@ -122,7 +122,7 @@ def test_get_dummies_basic_types(self, sparse, dtype, using_infer_string): result = get_dummies(s_df, columns=["a"], sparse=sparse, dtype=dtype) - key = "string" if using_infer_string else "object" + key = "str" if using_infer_string else "object" expected_counts = {"int64": 1, key: 1} expected_counts[dtype_name] = 3 + expected_counts.get(dtype_name, 0) diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index f0803ac2f2a30c..0dd2c227d6aa7a 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -599,7 +599,7 @@ def test_strftime_period_days(self, using_infer_string): dtype="=U10", ) if using_infer_string: - expected = expected.astype("string[pyarrow_numpy]") + expected = expected.astype("str") tm.assert_index_equal(result, expected) @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") diff --git a/pandas/tests/series/indexing/test_delitem.py b/pandas/tests/series/indexing/test_delitem.py index 3d1082c3d040b9..7440ef2692c47f 100644 --- a/pandas/tests/series/indexing/test_delitem.py +++ b/pandas/tests/series/indexing/test_delitem.py @@ -31,16 +31,15 @@ def test_delitem(self): del s[0] tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="int64"))) - def test_delitem_object_index(self, using_infer_string): + def test_delitem_object_index(self): # Index(dtype=object) - dtype = "string[pyarrow_numpy]" if using_infer_string else object - s = Series(1, index=Index(["a"], dtype=dtype)) + s = Series(1, index=Index(["a"], dtype="str")) del s["a"] - tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype=dtype))) + tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="str"))) s["a"] = 1 - tm.assert_series_equal(s, Series(1, index=Index(["a"], dtype=dtype))) + tm.assert_series_equal(s, Series(1, index=Index(["a"], dtype="str"))) del s["a"] - tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype=dtype))) + tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="str"))) def test_delitem_missing_key(self): # empty diff --git a/pandas/tests/series/indexing/test_getitem.py b/pandas/tests/series/indexing/test_getitem.py index 596a225c288b89..9783dcd2fea070 100644 --- a/pandas/tests/series/indexing/test_getitem.py +++ b/pandas/tests/series/indexing/test_getitem.py @@ -363,9 +363,7 @@ def test_getitem_no_matches(self, box): key = Series(["C"], dtype=object) key = box(key) - msg = ( - r"None of \[Index\(\['C'\], dtype='object|string'\)\] are in the \[index\]" - ) + msg = r"None of \[Index\(\['C'\], dtype='object|str'\)\] are in the \[index\]" with pytest.raises(KeyError, match=msg): ser[key] diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index fb8e5c31929b26..e2c27fe5575dbc 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -624,7 +624,7 @@ def test_setitem_enlargement_object_none(self, nulls_fixture, using_infer_string ser = Series(["a", "b"]) ser[3] = nulls_fixture dtype = ( - "string[pyarrow_numpy]" + "str" if using_infer_string and not isinstance(nulls_fixture, Decimal) else object ) diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index 4c8028e74ee551..ef0757ffe4aa8c 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -538,12 +538,12 @@ def test_astype_categorical_to_other(self): expected = ser tm.assert_series_equal(ser.astype("category"), expected) tm.assert_series_equal(ser.astype(CategoricalDtype()), expected) - msg = r"Cannot cast object|string dtype to float64" + msg = r"Cannot cast object|str dtype to float64" with pytest.raises(ValueError, match=msg): ser.astype("float64") cat = Series(Categorical(["a", "b", "b", "a", "a", "c", "c", "c"])) - exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"], dtype=object) + exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"], dtype="str") tm.assert_series_equal(cat.astype("str"), exp) s2 = Series(Categorical(["1", "2", "3", "4"])) exp2 = Series([1, 2, 3, 4]).astype("int") diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index 251d4063008b96..ac489b2579e053 100644 --- a/pandas/tests/series/methods/test_map.py +++ b/pandas/tests/series/methods/test_map.py @@ -101,16 +101,16 @@ def test_map_series_stringdtype(any_string_dtype, using_infer_string): expected = Series(data=["rabbit", "dog", "cat", item], dtype=any_string_dtype) if using_infer_string and any_string_dtype == "object": - expected = expected.astype("string[pyarrow_numpy]") + expected = expected.astype("str") tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "data, expected_dtype", - [(["1-1", "1-1", np.nan], "category"), (["1-1", "1-2", np.nan], object)], + [(["1-1", "1-1", np.nan], "category"), (["1-1", "1-2", np.nan], "str")], ) -def test_map_categorical_with_nan_values(data, expected_dtype, using_infer_string): +def test_map_categorical_with_nan_values(data, expected_dtype): # GH 20714 bug fixed in: GH 24275 def func(val): return val.split("-")[0] @@ -118,8 +118,6 @@ def func(val): s = Series(data, dtype="category") result = s.map(func, na_action="ignore") - if using_infer_string and expected_dtype == object: - expected_dtype = "string[pyarrow_numpy]" expected = Series(["1", "1", np.nan], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -145,9 +143,7 @@ def test_map_simple_str_callables_same_as_astype( # test that we are evaluating row-by-row first # before vectorized evaluation result = string_series.map(func) - expected = string_series.astype( - str if not using_infer_string else "string[pyarrow_numpy]" - ) + expected = string_series.astype(str if not using_infer_string else "str") tm.assert_series_equal(result, expected) @@ -497,7 +493,7 @@ def test_map_categorical(na_action, using_infer_string): result = s.map(lambda x: "A", na_action=na_action) exp = Series(["A"] * 7, name="XX", index=list("abcdefg")) tm.assert_series_equal(result, exp) - assert result.dtype == object if not using_infer_string else "string" + assert result.dtype == object if not using_infer_string else "str" @pytest.mark.parametrize( diff --git a/pandas/tests/series/methods/test_rename.py b/pandas/tests/series/methods/test_rename.py index 119654bd19b3fa..a8f3862d39f07f 100644 --- a/pandas/tests/series/methods/test_rename.py +++ b/pandas/tests/series/methods/test_rename.py @@ -64,7 +64,7 @@ def test_rename_set_name_inplace(self, using_infer_string): assert ser.name == name exp = np.array(["a", "b", "c"], dtype=np.object_) if using_infer_string: - exp = array(exp, dtype="string[pyarrow_numpy]") + exp = array(exp, dtype="str") tm.assert_extension_array_equal(ser.index.values, exp) else: tm.assert_numpy_array_equal(ser.index.values, exp) diff --git a/pandas/tests/series/methods/test_reset_index.py b/pandas/tests/series/methods/test_reset_index.py index 48e2608a1032a0..fa571fa126b382 100644 --- a/pandas/tests/series/methods/test_reset_index.py +++ b/pandas/tests/series/methods/test_reset_index.py @@ -193,7 +193,7 @@ def test_reset_index_dtypes_on_empty_series_with_multiindex( # GH 19602 - Preserve dtype on empty Series with MultiIndex idx = MultiIndex.from_product([[0, 1], [0.5, 1.0], array]) result = Series(dtype=object, index=idx)[:0].reset_index().dtypes - exp = "string" if using_infer_string else object + exp = "str" if using_infer_string else object expected = Series( { "level_0": np.int64, diff --git a/pandas/tests/series/methods/test_to_csv.py b/pandas/tests/series/methods/test_to_csv.py index 999dd90d337d9d..efb249fdedf3db 100644 --- a/pandas/tests/series/methods/test_to_csv.py +++ b/pandas/tests/series/methods/test_to_csv.py @@ -177,9 +177,6 @@ def test_to_csv_interval_index(self, using_infer_string): result = self.read_csv(path, index_col=0) # can't roundtrip intervalindex via read_csv so check string repr (GH 23595) - expected = s.copy() - if using_infer_string: - expected.index = expected.index.astype("string[pyarrow_numpy]") - else: - expected.index = expected.index.astype(str) + expected = s + expected.index = expected.index.astype("str") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 0aaa8ddcfda0c4..6efe0bcb8b45d4 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -167,7 +167,7 @@ def test_constructor(self, datetime_series, using_infer_string): # Mixed type Series mixed = Series(["hello", np.nan], index=[0, 1]) - assert mixed.dtype == np.object_ if not using_infer_string else "string" + assert mixed.dtype == np.object_ if not using_infer_string else "str" assert np.isnan(mixed[1]) assert not empty_series.index._is_all_dates @@ -1469,7 +1469,7 @@ def test_fromDict(self, using_infer_string): data = {"a": 0, "b": "1", "c": "2", "d": "3"} series = Series(data) - assert series.dtype == np.object_ if not using_infer_string else "string" + assert series.dtype == np.object_ if not using_infer_string else "str" data = {"a": "0", "b": "1"} series = Series(data, dtype=float) @@ -1481,7 +1481,7 @@ def test_fromValue(self, datetime_series, using_infer_string): assert len(nans) == len(datetime_series) strings = Series("foo", index=datetime_series.index) - assert strings.dtype == np.object_ if not using_infer_string else "string" + assert strings.dtype == np.object_ if not using_infer_string else "str" assert len(strings) == len(datetime_series) d = datetime.now() @@ -2141,6 +2141,11 @@ def test_series_string_inference_storage_definition(self): result = Series(["a", "b"], dtype="string") tm.assert_series_equal(result, expected) + expected = Series(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)) + with pd.option_context("future.infer_string", True): + result = Series(["a", "b"], dtype="str") + tm.assert_series_equal(result, expected) + def test_series_constructor_infer_string_scalar(self): # GH#55537 with pd.option_context("future.infer_string", True): diff --git a/pandas/tests/series/test_formats.py b/pandas/tests/series/test_formats.py index 4939f3221d268e..77e77a9337d63a 100644 --- a/pandas/tests/series/test_formats.py +++ b/pandas/tests/series/test_formats.py @@ -323,7 +323,7 @@ def test_categorical_repr(self, using_infer_string): "0 a\n1 b\n" " ..\n" "48 a\n49 b\n" - "Length: 50, dtype: category\nCategories (2, string): [a, b]" + "Length: 50, dtype: category\nCategories (2, str): [a, b]" ) else: exp = ( @@ -341,7 +341,7 @@ def test_categorical_repr(self, using_infer_string): exp = ( "0 a\n1 b\n" "dtype: category\n" - "Categories (26, string): [a < b < c < d ... w < x < y < z]" + "Categories (26, str): [a < b < c < d ... w < x < y < z]" ) else: exp = ( diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index bf88da04b73ffb..51ce73ef54300c 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import IntCastingNaNError import pandas.util._test_decorators as td @@ -167,7 +165,6 @@ def test_pandas_datareader(): pytest.importorskip("pandas_datareader") -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_pyarrow(df): pyarrow = pytest.importorskip("pyarrow") diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py index 79132591b15b3d..dd5218ab9404f3 100644 --- a/pandas/tests/util/test_assert_frame_equal.py +++ b/pandas/tests/util/test_assert_frame_equal.py @@ -111,7 +111,7 @@ def test_empty_dtypes(check_dtype): @pytest.mark.parametrize("check_like", [True, False]) def test_frame_equal_index_mismatch(check_like, obj_fixture, using_infer_string): if using_infer_string: - dtype = "string" + dtype = "str" else: dtype = "object" msg = f"""{obj_fixture}\\.index are different @@ -131,7 +131,7 @@ def test_frame_equal_index_mismatch(check_like, obj_fixture, using_infer_string) @pytest.mark.parametrize("check_like", [True, False]) def test_frame_equal_columns_mismatch(check_like, obj_fixture, using_infer_string): if using_infer_string: - dtype = "string" + dtype = "str" else: dtype = "object" msg = f"""{obj_fixture}\\.columns are different diff --git a/pandas/tests/util/test_assert_index_equal.py b/pandas/tests/util/test_assert_index_equal.py index dc6efdcec380e3..ab52d6c8e9f392 100644 --- a/pandas/tests/util/test_assert_index_equal.py +++ b/pandas/tests/util/test_assert_index_equal.py @@ -207,7 +207,7 @@ def test_index_equal_names(name1, name2): def test_index_equal_category_mismatch(check_categorical, using_infer_string): if using_infer_string: - dtype = "string" + dtype = "str" else: dtype = "object" msg = f"""Index are different diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index 1878e7d8380648..0d56885a1cb847 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -221,9 +221,9 @@ def test_series_equal_categorical_values_mismatch(rtol, using_infer_string): Series values are different \\(66\\.66667 %\\) \\[index\\]: \\[0, 1, 2\\] \\[left\\]: \\['a', 'b', 'c'\\] -Categories \\(3, string\\): \\[a, b, c\\] +Categories \\(3, str\\): \\[a, b, c\\] \\[right\\]: \\['a', 'c', 'b'\\] -Categories \\(3, string\\): \\[a, b, c\\]""" +Categories \\(3, str\\): \\[a, b, c\\]""" else: msg = """Series are different @@ -258,7 +258,7 @@ def test_series_equal_datetime_values_mismatch(rtol): def test_series_equal_categorical_mismatch(check_categorical, using_infer_string): if using_infer_string: - dtype = "string" + dtype = "str" else: dtype = "object" msg = f"""Attributes of Series are different diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index fe2da210c6fe9d..948565be36b5be 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -71,7 +71,7 @@ def test_sum_object_str_raises(step): df = DataFrame({"A": range(5), "B": range(5, 10), "C": "foo"}) r = df.rolling(window=3, step=step) with pytest.raises( - DataError, match="Cannot aggregate non-numeric type: object|string" + DataError, match="Cannot aggregate non-numeric type: object|str" ): # GH#42738, enforced in 2.0 r.sum()