Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CoW: __array__ not recognizing ea dtypes #51966

Merged
merged 15 commits into from
Apr 2, 2023
12 changes: 8 additions & 4 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@
validate_inclusive,
)

from pandas.core.dtypes.astype import astype_is_view
from pandas.core.dtypes.common import (
ensure_object,
ensure_platform_int,
Expand Down Expand Up @@ -2005,10 +2006,13 @@ def empty(self) -> bool_t:
def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray:
values = self._values
arr = np.asarray(values, dtype=dtype)
if arr is values and using_copy_on_write():
# TODO(CoW) also properly handle extension dtypes
arr = arr.view()
arr.flags.writeable = False
if arr is values and using_copy_on_write() and self._mgr.is_single_block:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The arr is values might prevent catching some EA cases, since it seems that _values can still return an EA (so the EA->ndarray conversion only happens in arr = np.asarray(values ..)).

Example (running with this PR):

In [1]: pd.options.mode.copy_on_write = True

In [2]: df = pd.DataFrame({"a": pd.date_range("2012-01-01", periods=3)})

In [3]: arr = np.asarray(df)

In [4]: arr.flags.writeable
Out[4]: True

In [5]: arr[0] = 0

In [6]: df
Out[6]: 
           a
0 1970-01-01
1 2012-01-02
2 2012-01-03

For series you left out this check, so maybe can be done here as well?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good point, thx

# Check if both conversions can be done without a copy
if astype_is_view(self.dtypes.iloc[0], values.dtype) and astype_is_view(
values.dtype, arr.dtype
):
arr = arr.view()
arr.flags.writeable = False
return arr

@final
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
validate_percentile,
)

from pandas.core.dtypes.astype import astype_is_view
from pandas.core.dtypes.cast import (
LossySetitemError,
convert_dtypes,
Expand Down Expand Up @@ -913,8 +914,7 @@ def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray:
"""
values = self._values
arr = np.asarray(values, dtype=dtype)
if arr is values and using_copy_on_write():
# TODO(CoW) also properly handle extension dtypes
if using_copy_on_write() and astype_is_view(values.dtype, arr.dtype):
arr = arr.view()
arr.flags.writeable = False
return arr
Expand Down
54 changes: 54 additions & 0 deletions pandas/tests/copy_view/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,3 +119,57 @@ def test_ravel_read_only(using_copy_on_write, order):
if using_copy_on_write:
assert arr.flags.writeable is False
assert np.shares_memory(get_array(ser), arr)


def test_series_array_ea_dtypes(using_copy_on_write):
ser = Series([1, 2, 3], dtype="Int64")
arr = np.asarray(ser, dtype="int64")
assert np.shares_memory(arr, get_array(ser))
if using_copy_on_write:
assert arr.flags.writeable is False
else:
assert arr.flags.writeable is True

arr = np.asarray(ser)
assert not np.shares_memory(arr, get_array(ser))
assert arr.flags.writeable is True


def test_dataframe_array_ea_dtypes(using_copy_on_write):
df = DataFrame({"a": [1, 2, 3]}, dtype="Int64")
arr = np.asarray(df, dtype="int64")
# TODO: This should be able to share memory, but we are roundtripping
# through object
assert not np.shares_memory(arr, get_array(df, "a"))
assert arr.flags.writeable is True

arr = np.asarray(df)
if using_copy_on_write:
# TODO(CoW): This should be True
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not this one, because without specifying dtype="int64" we create an object dtype array?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes exactly, this triggers a copy and hence the array should be writeable?

assert arr.flags.writeable is False
else:
assert arr.flags.writeable is True


def test_dataframe_array_string_dtype(using_copy_on_write, using_array_manager):
df = DataFrame({"a": ["a", "b"]}, dtype="string")
arr = np.asarray(df)
if not using_array_manager:
assert np.shares_memory(arr, get_array(df, "a"))
if using_copy_on_write:
assert arr.flags.writeable is False
else:
assert arr.flags.writeable is True


def test_dataframe_multiple_numpy_dtypes():
df = DataFrame({"a": [1, 2, 3], "b": 1.5})
arr = np.asarray(df)
assert not np.shares_memory(arr, get_array(df, "a"))
assert arr.flags.writeable is True


def test_empty_dataframe():
df = DataFrame()
arr = np.asarray(df)
assert arr.flags.writeable is True