Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

String dtype: use 'str' string alias and representation for NaN-variant of the dtype #59388

Merged
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -575,7 +575,10 @@ def __getitem__(self, item: PositionalIndexer):
if isinstance(item, np.ndarray):
if not len(item):
# Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string]
if self._dtype.name == "string" and self._dtype.storage == "pyarrow":
if (
isinstance(self._dtype, StringDtype)
and self._dtype.storage == "pyarrow"
):
# TODO(infer_string) should this be large_string?
pa_dtype = pa.string()
else:
Expand Down
24 changes: 18 additions & 6 deletions pandas/core/arrays/string_.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

from typing import (
TYPE_CHECKING,
ClassVar,
Literal,
cast,
)
Expand Down Expand Up @@ -110,9 +109,12 @@ class StringDtype(StorageExtensionDtype):
string[pyarrow]
"""

# error: Cannot override instance variable (previously declared on
# base class "StorageExtensionDtype") with class variable
name: ClassVar[str] = "string" # type: ignore[misc]
@property
def name(self) -> str:
if self._na_value is libmissing.NA:
return "string"
else:
return "str"

#: StringDtype().na_value uses pandas.NA except the implementation that
# follows NumPy semantics, which uses nan.
Expand All @@ -129,7 +131,7 @@ def __init__(
) -> None:
# infer defaults
if storage is None:
if using_string_dtype() and na_value is not libmissing.NA:
if na_value is not libmissing.NA:
storage = "pyarrow"
else:
storage = get_option("mode.string_storage")
Expand Down Expand Up @@ -159,11 +161,19 @@ def __init__(
self.storage = storage
self._na_value = na_value

def __repr__(self) -> str:
if self._na_value is libmissing.NA:
return f"{self.name}[{self.storage}]"
else:
# TODO add more informative repr
return self.name
Comment on lines +179 to +180
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To discuss in #59342


def __eq__(self, other: object) -> bool:
# we need to override the base class __eq__ because na_value (NA or NaN)
# cannot be checked with normal `==`
if isinstance(other, str):
if other == self.name:
# TODO should dtype == "string" work for the NaN variant?
if other == "string" or other == self.name: # noqa: PLR1714
Comment on lines +186 to +187
Copy link
Member Author

@jorisvandenbossche jorisvandenbossche Aug 2, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This other == "string" is added as special case to (for now) keep dtype == "string" working also for the NaN-variant. See the top-post of this PR for some context.

return True
try:
other = self.construct_from_string(other)
Expand Down Expand Up @@ -220,6 +230,8 @@ def construct_from_string(cls, string) -> Self:
)
if string == "string":
return cls()
elif string == "str" and using_string_dtype():
return cls(na_value=np.nan)
elif string == "string[python]":
return cls(storage="python")
elif string == "string[pyarrow]":
Expand Down
1 change: 1 addition & 0 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -500,6 +500,7 @@ def __init__(
elif copy:
data = data.copy()
else:
# breakpoint()
data = sanitize_array(data, index, dtype, copy)
data = SingleBlockManager.from_array(data, index, refs=refs)

Expand Down
33 changes: 20 additions & 13 deletions pandas/tests/arrays/string_/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def test_repr(dtype):
assert repr(df) == expected

if dtype.na_value is np.nan:
expected = "0 a\n1 NaN\n2 b\nName: A, dtype: string"
expected = "0 a\n1 NaN\n2 b\nName: A, dtype: str"
else:
expected = "0 a\n1 <NA>\n2 b\nName: A, dtype: string"
assert repr(df.A) == expected
Expand All @@ -75,7 +75,7 @@ def test_repr(dtype):
expected = f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string"
elif dtype.storage == "pyarrow" and dtype.na_value is np.nan:
arr_name = "ArrowStringArrayNumpySemantics"
expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string"
expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: str"
else:
arr_name = "StringArray"
expected = f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string"
Expand Down Expand Up @@ -492,7 +492,7 @@ def test_fillna_args(dtype):
tm.assert_extension_array_equal(res, expected)

if dtype.storage == "pyarrow":
msg = "Invalid value '1' for dtype string"
msg = "Invalid value '1' for dtype str"
else:
msg = "Cannot set non-string value '1' into a StringArray."
with pytest.raises(TypeError, match=msg):
Expand All @@ -514,7 +514,7 @@ def test_arrow_array(dtype):
assert arr.equals(expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
def test_arrow_roundtrip(dtype, string_storage, using_infer_string):
# roundtrip possible from arrow 1.0.0
Expand All @@ -529,14 +529,17 @@ def test_arrow_roundtrip(dtype, string_storage, using_infer_string):
assert table.field("a").type == "large_string"
with pd.option_context("string_storage", string_storage):
result = table.to_pandas()
assert isinstance(result["a"].dtype, pd.StringDtype)
expected = df.astype(f"string[{string_storage}]")
tm.assert_frame_equal(result, expected)
# ensure the missing value is represented by NA and not np.nan or None
assert result.loc[2, "a"] is result["a"].dtype.na_value
if dtype.na_value is np.nan and not using_string_dtype():
assert result["a"].dtype == "object"
else:
assert isinstance(result["a"].dtype, pd.StringDtype)
expected = df.astype(f"string[{string_storage}]")
tm.assert_frame_equal(result, expected)
# ensure the missing value is represented by NA and not np.nan or None
assert result.loc[2, "a"] is result["a"].dtype.na_value


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string):
# GH-41040
Expand All @@ -553,9 +556,13 @@ def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string):
table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema)
with pd.option_context("string_storage", string_storage):
result = table.to_pandas()
assert isinstance(result["a"].dtype, pd.StringDtype)
expected = df.astype(f"string[{string_storage}]")
tm.assert_frame_equal(result, expected)

if dtype.na_value is np.nan and not using_string_dtype():
assert result["a"].dtype == "object"
else:
assert isinstance(result["a"].dtype, pd.StringDtype)
expected = df.astype(f"string[{string_storage}]")
tm.assert_frame_equal(result, expected)


def test_value_counts_na(dtype):
Expand Down
17 changes: 17 additions & 0 deletions pandas/tests/dtypes/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -799,3 +799,20 @@ def test_pandas_dtype_ea_not_instance():
# GH 31356 GH 54592
with tm.assert_produces_warning(UserWarning, match="without any arguments"):
assert pandas_dtype(CategoricalDtype) == CategoricalDtype()


def test_pandas_dtype_string_dtypes(string_storage):
with pd.option_context("future.infer_string", True):
with pd.option_context("string_storage", string_storage):
result = pandas_dtype("str")
# TODO(infer_string) hardcoded to pyarrow until python is supported
assert result == pd.StringDtype("pyarrow", na_value=np.nan)

with pd.option_context("future.infer_string", False):
with pd.option_context("string_storage", string_storage):
result = pandas_dtype("str")
assert result == np.dtype("U")

with pd.option_context("string_storage", string_storage):
result = pandas_dtype("string")
assert result == pd.StringDtype(string_storage, na_value=pd.NA)
14 changes: 14 additions & 0 deletions pandas/tests/extension/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,20 @@ def test_is_not_string_type(self, dtype):
# because StringDtype is a string type
assert is_string_dtype(dtype)

def test_is_dtype_from_name(self, dtype, using_infer_string):
if dtype.na_value is np.nan and not using_infer_string:
result = type(dtype).is_dtype(dtype.name)
assert result is False
else:
super().test_is_dtype_from_name(dtype)

def test_construct_from_string_own_name(self, dtype, using_infer_string):
if dtype.na_value is np.nan and not using_infer_string:
with pytest.raises(TypeError, match="Cannot construct a 'StringDtype'"):
dtype.construct_from_string(dtype.name)
else:
super().test_construct_from_string_own_name(dtype)

def test_view(self, data):
if data.dtype.storage == "pyarrow":
pytest.skip(reason="2D support not implemented for ArrowStringArray")
Expand Down
11 changes: 8 additions & 3 deletions pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -2117,10 +2117,15 @@ def test_series_string_inference_storage_definition(self):
# but after PDEP-14 (string dtype), it was decided to keep dtype="string"
# returning the NA string dtype, so expected is changed from
# "string[pyarrow_numpy]" to "string[pyarrow]"
pytest.importorskip("pyarrow")
expected = Series(["a", "b"], dtype="string[python]")
# pytest.importorskip("pyarrow")
# expected = Series(["a", "b"], dtype="string[python]")
# with pd.option_context("future.infer_string", True):
# result = Series(["a", "b"], dtype="string")
# tm.assert_series_equal(result, expected)

expected = Series(["a", "b"], dtype=pd.StringDtype(na_value=np.nan))
with pd.option_context("future.infer_string", True):
result = Series(["a", "b"], dtype="string")
result = Series(["a", "b"], dtype="str")
tm.assert_series_equal(result, expected)

def test_series_constructor_infer_string_scalar(self):
Expand Down
Loading