Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-41098: [Python] Add copy keyword in Array.__array__ for numpy 2.0+ compatibility #41071

Merged
merged 8 commits into from
Apr 15, 2024
21 changes: 19 additions & 2 deletions python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -1516,11 +1516,28 @@ cdef class Array(_PandasConvertible):
def _to_pandas(self, options, types_mapper=None, **kwargs):
return _array_like_to_pandas(self, options, types_mapper=types_mapper)

def __array__(self, dtype=None):
def __array__(self, dtype=None, copy=None):
if copy is False:
try:
values = self.to_numpy(zero_copy_only=True)
except ArrowInvalid:
raise ValueError(
"Unable to avoid a copy while creating a numpy array as requested.\n"
"If using `np.array(obj, copy=False)` replace it with "
"`np.asarray(obj)` to allow a copy when needed"
)
# values is already a numpy array at this point, but calling np.array(..)
# again to handle the `dtype` keyword with a no-copy guarantee
return np.array(values, dtype=dtype, copy=False)

values = self.to_numpy(zero_copy_only=False)
if copy is True and is_primitive(self.type.id) and self.null_count == 0:
# to_numpy did not yet make a copy
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will this make a second copy for e.g. boolean?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, boolean is not considered as primitive (although you might expect that). So a boolean array takes the code path below just returning the numpy values is no dtype was required, so no second copy.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, sorry, boolean is of course "primitive", what I meant to say is that it is not "numeric", and the code above was meant to use is_numeric (I added that to libarrow.pxd for that purpose). Good catch ..

return np.array(values, dtype=dtype, copy=True)

if dtype is None:
return values
return values.astype(dtype)
return np.asarray(values, dtype=dtype)

def to_numpy(self, zero_copy_only=True, writable=False):
"""
Expand Down
1 change: 1 addition & 0 deletions python/pyarrow/includes/libarrow.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
c_string ToString()

c_bool is_primitive(Type type)
c_bool is_numeric(Type type)

cdef cppclass CArrayData" arrow::ArrayData":
shared_ptr[CDataType] type
Expand Down
23 changes: 20 additions & 3 deletions python/pyarrow/table.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -525,11 +525,19 @@ cdef class ChunkedArray(_PandasConvertible):

return values

def __array__(self, dtype=None):
def __array__(self, dtype=None, copy=None):
if copy is False:
raise ValueError(
"Unable to avoid a copy while creating a numpy array as requested "
"(converting a pyarrow.ChunkedArray always results in a copy).\n"
"If using `np.array(obj, copy=False)` replace it with "
"`np.asarray(obj)` to allow a copy when needed"
)
# 'copy' can further be ignored because to_numpy() already returns a copy
values = self.to_numpy()
if dtype is None:
return values
return values.astype(dtype)
return values.astype(dtype, copy=False)

def cast(self, object target_type=None, safe=None, options=None):
"""
Expand Down Expand Up @@ -1533,7 +1541,16 @@ cdef class _Tabular(_PandasConvertible):
raise TypeError(f"Do not call {self.__class__.__name__}'s constructor directly, use "
f"one of the `{self.__class__.__name__}.from_*` functions instead.")

def __array__(self, dtype=None):
def __array__(self, dtype=None, copy=None):
if copy is False:
raise ValueError(
"Unable to avoid a copy while creating a numpy array as requested "
f"(converting a pyarrow.{self.__class__.__name__} always results "
"in a copy).\n"
"If using `np.array(obj, copy=False)` replace it with "
"`np.asarray(obj)` to allow a copy when needed"
)
# 'copy' can further be ignored because stacking will result in a copy
column_arrays = [
np.asarray(self.column(i), dtype=dtype) for i in range(self.num_columns)
]
Expand Down
47 changes: 47 additions & 0 deletions python/pyarrow/tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@

import pyarrow as pa
import pyarrow.tests.strategies as past
from pyarrow.vendored.version import Version


def test_total_bytes_allocated():
Expand Down Expand Up @@ -3302,6 +3303,52 @@ def test_array_from_large_pyints():
pa.array([int(2 ** 63)])


def test_numpy_array_protocol():
# test the __array__ method on pyarrow.Array
arr = pa.array([1, 2, 3])
result = np.asarray(arr)
expected = np.array([1, 2, 3], dtype="int64")
np.testing.assert_array_equal(result, expected)

# this should not raise a deprecation warning with numpy 2.0+
result = np.array(arr, copy=False)
np.testing.assert_array_equal(result, expected)

result = np.array(arr, dtype="int64", copy=False)
np.testing.assert_array_equal(result, expected)

# no zero-copy is possible
arr = pa.array([1, 2, None])
expected = np.array([1, 2, np.nan], dtype="float64")
result = np.asarray(arr)
np.testing.assert_array_equal(result, expected)

if Version(np.__version__) < Version("2.0"):
# copy keyword is not strict and not passed down to __array__
result = np.array(arr, copy=False)
np.testing.assert_array_equal(result, expected)

result = np.array(arr, dtype="float64", copy=False)
np.testing.assert_array_equal(result, expected)
else:
# starting with numpy 2.0, the copy=False keyword is assumed to be strict
with pytest.raises(ValueError, match="Unable to avoid a copy"):
np.array(arr, copy=False)

arr = pa.array([1, 2, 3])
with pytest.raises(ValueError):
np.array(arr, dtype="float64", copy=False)

# copy=True -> not yet passed by numpy, so we have to call this directly to test
arr = pa.array([1, 2, 3])
result = arr.__array__(copy=True)
assert result.flags.writeable

arr = pa.array([1, 2, 3])
result = arr.__array__(dtype=np.dtype("float64"), copy=True)
assert result.dtype == "float64"


def test_array_protocol():

class MyArray:
Expand Down
16 changes: 16 additions & 0 deletions python/pyarrow/tests/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import pytest
import pyarrow as pa
import pyarrow.compute as pc
from pyarrow.vendored.version import Version


def test_chunked_array_basics():
Expand Down Expand Up @@ -3197,6 +3198,21 @@ def test_numpy_asarray(constructor):
assert result.dtype == "int32"


@pytest.mark.parametrize("constructor", [pa.table, pa.record_batch])
def test_numpy_array_protocol(constructor):
table = constructor([[1, 2, 3], [4.0, 5.0, 6.0]], names=["a", "b"])
expected = np.array([[1, 4], [2, 5], [3, 6]], dtype="float64")

if Version(np.__version__) < Version("2.0"):
# copy keyword is not strict and not passed down to __array__
result = np.array(table, copy=False)
np.testing.assert_array_equal(result, expected)
else:
# starting with numpy 2.0, the copy=False keyword is assumed to be strict
with pytest.raises(ValueError, match="Unable to avoid a copy"):
np.array(table, copy=False)


@pytest.mark.acero
def test_invalid_non_join_column():
NUM_ITEMS = 30
Expand Down
Loading