-
-
Notifications
You must be signed in to change notification settings - Fork 18.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
String dtype: use 'str' string alias and representation for NaN-variant of the dtype #59388
Changes from 11 commits
6e8d76a
7c00808
1ea64cf
a12b345
476aa44
757ea8a
220960f
d75f89d
61ec243
7399ae5
f2ff5db
c445a36
d816476
39662d2
53c8f75
8ddb6cc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,7 +4,6 @@ | |
from typing import ( | ||
TYPE_CHECKING, | ||
Any, | ||
ClassVar, | ||
Literal, | ||
cast, | ||
) | ||
|
@@ -118,9 +117,12 @@ class StringDtype(StorageExtensionDtype): | |
string[pyarrow] | ||
""" | ||
|
||
# error: Cannot override instance variable (previously declared on | ||
# base class "StorageExtensionDtype") with class variable | ||
name: ClassVar[str] = "string" # type: ignore[misc] | ||
@property | ||
def name(self) -> str: # type: ignore[override] | ||
if self._na_value is libmissing.NA: | ||
return "string" | ||
else: | ||
return "str" | ||
|
||
#: StringDtype().na_value uses pandas.NA except the implementation that | ||
# follows NumPy semantics, which uses nan. | ||
|
@@ -137,7 +139,7 @@ def __init__( | |
) -> None: | ||
# infer defaults | ||
if storage is None: | ||
if using_string_dtype() and na_value is not libmissing.NA: | ||
if na_value is not libmissing.NA: | ||
if HAS_PYARROW: | ||
storage = "pyarrow" | ||
else: | ||
|
@@ -170,11 +172,19 @@ def __init__( | |
self.storage = storage | ||
self._na_value = na_value | ||
|
||
def __repr__(self) -> str: | ||
if self._na_value is libmissing.NA: | ||
return f"{self.name}[{self.storage}]" | ||
else: | ||
# TODO add more informative repr | ||
return self.name | ||
|
||
def __eq__(self, other: object) -> bool: | ||
# we need to override the base class __eq__ because na_value (NA or NaN) | ||
# cannot be checked with normal `==` | ||
if isinstance(other, str): | ||
if other == self.name: | ||
# TODO should dtype == "string" work for the NaN variant? | ||
if other == "string" or other == self.name: # noqa: PLR1714 | ||
Comment on lines
+186
to
+187
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This |
||
return True | ||
try: | ||
other = self.construct_from_string(other) | ||
|
@@ -231,6 +241,8 @@ def construct_from_string(cls, string) -> Self: | |
) | ||
if string == "string": | ||
return cls() | ||
elif string == "str" and using_string_dtype(): | ||
return cls(na_value=np.nan) | ||
elif string == "string[python]": | ||
return cls(storage="python") | ||
elif string == "string[pyarrow]": | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -224,7 +224,7 @@ def test_apply_categorical(by_row, using_infer_string): | |
result = ser.apply(lambda x: "A") | ||
exp = Series(["A"] * 7, name="XX", index=list("abcdefg")) | ||
tm.assert_series_equal(result, exp) | ||
assert result.dtype == object if not using_infer_string else "string[pyarrow_numpy]" | ||
assert result.dtype == object if not using_infer_string else "str" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe we should use this as an opportunity to move away from string comparisons and use the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I changed it in some places to an explicit StringDtype(..) creation where we were testing a series of dtypes, but for now kept using the string alias for convenience in other places. |
||
|
||
|
||
@pytest.mark.parametrize("series", [["1-1", "1-1", np.nan], ["1-1", "1-2", np.nan]]) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
To discuss in #59342