-
-
Notifications
You must be signed in to change notification settings - Fork 17.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
CoW: __array__ not recognizing ea dtypes #51966
Changes from 8 commits
3a08a45
38453c4
ee751b4
a4eb612
c515870
5568ec2
e9641ad
bf1f9de
8922467
413fc7c
9681861
2d8a2f2
42368e0
5c0f1bb
4572e4a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -107,6 +107,7 @@ | |
ensure_object, | ||
ensure_platform_int, | ||
ensure_str, | ||
is_1d_only_ea_dtype, | ||
is_bool, | ||
is_bool_dtype, | ||
is_datetime64_any_dtype, | ||
|
@@ -1995,10 +1996,13 @@ def empty(self) -> bool_t: | |
def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray: | ||
values = self._values | ||
arr = np.asarray(values, dtype=dtype) | ||
if arr is values and using_copy_on_write(): | ||
# TODO(CoW) also properly handle extension dtypes | ||
arr = arr.view() | ||
arr.flags.writeable = False | ||
if arr is values and using_copy_on_write() and self._mgr.is_single_block: | ||
# Check if self._values coerced data | ||
if not is_1d_only_ea_dtype(self.dtypes.iloc[0]) or not is_numeric_dtype( | ||
self.dtypes.iloc[0] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why the Can you expand the comment a bit more to explain those checks? (from just reading the code, I find it hard to reason about, also with the "not .. or not ..") There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Something else, could we also use the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We have 2 different cases here:
astype_is_view would work when we get rid of the conversion to object in the middle. I'll clarify the comment though. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So for the EA dtypes, you rely on the assumption that numeric dtypes were cast to object. But in theory, someone could implement its own EA which is indicated to be "numeric" but does not do this conversion to object dtype. Now, in general I think we are lacking a part of the story around having proper information about copies/views with generic EAs (that was also the case when doing the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's actually what our numeric arrow dtypes do, but since those rely on pyarrow to convert itself to numpy arrays, those are already set to readonly in case they are a view:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fair point, there is actual a more elegant solution. We can check if both steps can be done without copying via astype_is_view. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah I ran into this as well (that they set themselves to read only) |
||
): | ||
arr = arr.view() | ||
arr.flags.writeable = False | ||
return arr | ||
|
||
@final | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -110,3 +110,55 @@ def test_series_to_numpy(using_copy_on_write): | |
arr = ser.to_numpy(dtype="float64") | ||
assert not np.shares_memory(arr, get_array(ser, "name")) | ||
assert arr.flags.writeable is True | ||
|
||
|
||
def test_series_array_ea_dtypes(using_copy_on_write): | ||
ser = Series([1, 2, 3], dtype="Int64") | ||
arr = np.asarray(ser, dtype="int64") | ||
assert np.shares_memory(arr, get_array(ser)) | ||
if using_copy_on_write: | ||
assert arr.flags.writeable is False | ||
else: | ||
assert arr.flags.writeable is True | ||
|
||
arr = np.asarray(ser) | ||
assert not np.shares_memory(arr, get_array(ser)) | ||
if using_copy_on_write: | ||
# TODO(CoW): This should be True | ||
assert arr.flags.writeable is False | ||
else: | ||
assert arr.flags.writeable is True | ||
|
||
|
||
def test_dataframe_array_ea_dtypes(using_copy_on_write): | ||
df = DataFrame({"a": [1, 2, 3]}, dtype="Int64") | ||
arr = np.asarray(df, dtype="int64") | ||
# TODO: This should be able to share memory, but we are roundtripping | ||
# through object | ||
assert not np.shares_memory(arr, get_array(df, "a")) | ||
assert arr.flags.writeable is True | ||
|
||
arr = np.asarray(df) | ||
if using_copy_on_write: | ||
# TODO(CoW): This should be True | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not this one, because without specifying There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes exactly, this triggers a copy and hence the array should be writeable? |
||
assert arr.flags.writeable is False | ||
else: | ||
assert arr.flags.writeable is True | ||
|
||
|
||
def test_dataframe_array_string_dtype(using_copy_on_write, using_array_manager): | ||
df = DataFrame({"a": ["a", "b"]}, dtype="string") | ||
arr = np.asarray(df) | ||
if not using_array_manager: | ||
assert np.shares_memory(arr, get_array(df, "a")) | ||
if using_copy_on_write: | ||
assert arr.flags.writeable is False | ||
else: | ||
assert arr.flags.writeable is True | ||
|
||
|
||
def test_dataframe_multiple_numpy_dtypes(): | ||
df = DataFrame({"a": [1, 2, 3], "b": 1.5}) | ||
arr = np.asarray(df) | ||
assert not np.shares_memory(arr, get_array(df, "a")) | ||
assert arr.flags.writeable is True |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The
arr is values
might prevent catching some EA cases, since it seems that_values
can still return an EA (so the EA->ndarray conversion only happens inarr = np.asarray(values ..)
).Example (running with this PR):
For series you left out this check, so maybe can be done here as well?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
good point, thx