Skip to content

Commit

Permalink
apacheGH-41098: [Python] Add copy keyword in Array.__array__ for nump…
Browse files Browse the repository at this point in the history
…y 2.0+ compatibility (apache#41071)

### Rationale for this change

Adapting for changes in numpy 2.0 as decribed at https://numpy.org/devdocs/numpy_2_0_migration_guide.html#adapting-to-changes-in-the-copy-keyword and future changes to pass copy=True (numpy/numpy#26208)

### What changes are included in this PR?

Add a `copy=None` to the signatures of our `__array__` methods.

This does have impact on the user facing behaviour, though. Questioning that upstream at numpy/numpy#25941 (comment)

### Are these changes tested?

Yes

### Are there any user-facing changes?

No (compared to usage with numpy<2)
* GitHub Issue: apache#39532
* GitHub Issue: apache#41098

Authored-by: Joris Van den Bossche <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
  • Loading branch information
jorisvandenbossche authored and tolleybot committed May 2, 2024
1 parent 9e58ac3 commit 6034816
Show file tree
Hide file tree
Showing 5 changed files with 103 additions and 5 deletions.
21 changes: 19 additions & 2 deletions python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -1516,11 +1516,28 @@ cdef class Array(_PandasConvertible):
def _to_pandas(self, options, types_mapper=None, **kwargs):
return _array_like_to_pandas(self, options, types_mapper=types_mapper)

def __array__(self, dtype=None):
def __array__(self, dtype=None, copy=None):
if copy is False:
try:
values = self.to_numpy(zero_copy_only=True)
except ArrowInvalid:
raise ValueError(
"Unable to avoid a copy while creating a numpy array as requested.\n"
"If using `np.array(obj, copy=False)` replace it with "
"`np.asarray(obj)` to allow a copy when needed"
)
# values is already a numpy array at this point, but calling np.array(..)
# again to handle the `dtype` keyword with a no-copy guarantee
return np.array(values, dtype=dtype, copy=False)

values = self.to_numpy(zero_copy_only=False)
if copy is True and is_numeric(self.type.id) and self.null_count == 0:
# to_numpy did not yet make a copy (is_numeric = integer/floats, no decimal)
return np.array(values, dtype=dtype, copy=True)

if dtype is None:
return values
return values.astype(dtype)
return np.asarray(values, dtype=dtype)

def to_numpy(self, zero_copy_only=True, writable=False):
"""
Expand Down
1 change: 1 addition & 0 deletions python/pyarrow/includes/libarrow.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
c_string ToString()

c_bool is_primitive(Type type)
c_bool is_numeric(Type type)

cdef cppclass CArrayData" arrow::ArrayData":
shared_ptr[CDataType] type
Expand Down
23 changes: 20 additions & 3 deletions python/pyarrow/table.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -525,11 +525,19 @@ cdef class ChunkedArray(_PandasConvertible):

return values

def __array__(self, dtype=None):
def __array__(self, dtype=None, copy=None):
if copy is False:
raise ValueError(
"Unable to avoid a copy while creating a numpy array as requested "
"(converting a pyarrow.ChunkedArray always results in a copy).\n"
"If using `np.array(obj, copy=False)` replace it with "
"`np.asarray(obj)` to allow a copy when needed"
)
# 'copy' can further be ignored because to_numpy() already returns a copy
values = self.to_numpy()
if dtype is None:
return values
return values.astype(dtype)
return values.astype(dtype, copy=False)

def cast(self, object target_type=None, safe=None, options=None):
"""
Expand Down Expand Up @@ -1562,7 +1570,16 @@ cdef class _Tabular(_PandasConvertible):
raise TypeError(f"Do not call {self.__class__.__name__}'s constructor directly, use "
f"one of the `{self.__class__.__name__}.from_*` functions instead.")

def __array__(self, dtype=None):
def __array__(self, dtype=None, copy=None):
if copy is False:
raise ValueError(
"Unable to avoid a copy while creating a numpy array as requested "
f"(converting a pyarrow.{self.__class__.__name__} always results "
"in a copy).\n"
"If using `np.array(obj, copy=False)` replace it with "
"`np.asarray(obj)` to allow a copy when needed"
)
# 'copy' can further be ignored because stacking will result in a copy
column_arrays = [
np.asarray(self.column(i), dtype=dtype) for i in range(self.num_columns)
]
Expand Down
47 changes: 47 additions & 0 deletions python/pyarrow/tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@

import pyarrow as pa
import pyarrow.tests.strategies as past
from pyarrow.vendored.version import Version


def test_total_bytes_allocated():
Expand Down Expand Up @@ -3302,6 +3303,52 @@ def test_array_from_large_pyints():
pa.array([int(2 ** 63)])


def test_numpy_array_protocol():
# test the __array__ method on pyarrow.Array
arr = pa.array([1, 2, 3])
result = np.asarray(arr)
expected = np.array([1, 2, 3], dtype="int64")
np.testing.assert_array_equal(result, expected)

# this should not raise a deprecation warning with numpy 2.0+
result = np.array(arr, copy=False)
np.testing.assert_array_equal(result, expected)

result = np.array(arr, dtype="int64", copy=False)
np.testing.assert_array_equal(result, expected)

# no zero-copy is possible
arr = pa.array([1, 2, None])
expected = np.array([1, 2, np.nan], dtype="float64")
result = np.asarray(arr)
np.testing.assert_array_equal(result, expected)

if Version(np.__version__) < Version("2.0"):
# copy keyword is not strict and not passed down to __array__
result = np.array(arr, copy=False)
np.testing.assert_array_equal(result, expected)

result = np.array(arr, dtype="float64", copy=False)
np.testing.assert_array_equal(result, expected)
else:
# starting with numpy 2.0, the copy=False keyword is assumed to be strict
with pytest.raises(ValueError, match="Unable to avoid a copy"):
np.array(arr, copy=False)

arr = pa.array([1, 2, 3])
with pytest.raises(ValueError):
np.array(arr, dtype="float64", copy=False)

# copy=True -> not yet passed by numpy, so we have to call this directly to test
arr = pa.array([1, 2, 3])
result = arr.__array__(copy=True)
assert result.flags.writeable

arr = pa.array([1, 2, 3])
result = arr.__array__(dtype=np.dtype("float64"), copy=True)
assert result.dtype == "float64"


def test_array_protocol():

class MyArray:
Expand Down
16 changes: 16 additions & 0 deletions python/pyarrow/tests/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import pytest
import pyarrow as pa
import pyarrow.compute as pc
from pyarrow.vendored.version import Version


def test_chunked_array_basics():
Expand Down Expand Up @@ -3275,6 +3276,21 @@ def test_numpy_asarray(constructor):
assert result.dtype == "int32"


@pytest.mark.parametrize("constructor", [pa.table, pa.record_batch])
def test_numpy_array_protocol(constructor):
table = constructor([[1, 2, 3], [4.0, 5.0, 6.0]], names=["a", "b"])
expected = np.array([[1, 4], [2, 5], [3, 6]], dtype="float64")

if Version(np.__version__) < Version("2.0"):
# copy keyword is not strict and not passed down to __array__
result = np.array(table, copy=False)
np.testing.assert_array_equal(result, expected)
else:
# starting with numpy 2.0, the copy=False keyword is assumed to be strict
with pytest.raises(ValueError, match="Unable to avoid a copy"):
np.array(table, copy=False)


@pytest.mark.acero
def test_invalid_non_join_column():
NUM_ITEMS = 30
Expand Down

0 comments on commit 6034816

Please sign in to comment.