Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

String dtype: avoid surfacing pyarrow exception in binary operations #59610

Merged
31 changes: 26 additions & 5 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -736,8 +736,19 @@ def _cmp_method(self, other, op) -> ArrowExtensionArray:
)
return ArrowExtensionArray(result)

def _op_method_error_message(self, other, op) -> str:
if hasattr(other, "dtype"):
other_type = f"dtype '{other.dtype}'"
else:
other_type = f"object of type {type(other)}"
return (
f"operation '{op.__name__}' not supported for "
f"dtype '{self.dtype}' with {other_type}"
)

def _evaluate_op_method(self, other, op, arrow_funcs) -> Self:
pa_type = self._pa_array.type
other_original = other
other = self._box_pa(other)

if (
Expand All @@ -747,10 +758,15 @@ def _evaluate_op_method(self, other, op, arrow_funcs) -> Self:
):
if op in [operator.add, roperator.radd]:
sep = pa.scalar("", type=pa_type)
if op is operator.add:
result = pc.binary_join_element_wise(self._pa_array, other, sep)
elif op is roperator.radd:
result = pc.binary_join_element_wise(other, self._pa_array, sep)
try:
if op is operator.add:
result = pc.binary_join_element_wise(self._pa_array, other, sep)
elif op is roperator.radd:
result = pc.binary_join_element_wise(other, self._pa_array, sep)
except pa.lib.ArrowNotImplementedError as err:
raise TypeError(
self._op_method_error_message(other_original, op)
) from err
return type(self)(result)
elif op in [operator.mul, roperator.rmul]:
binary = self._pa_array
Expand Down Expand Up @@ -782,9 +798,14 @@ def _evaluate_op_method(self, other, op, arrow_funcs) -> Self:

pc_func = arrow_funcs[op.__name__]
if pc_func is NotImplemented:
if pa.types.is_string(pa_type) or pa.types.is_large_string(pa_type):
raise TypeError(self._op_method_error_message(other_original, op))
raise NotImplementedError(f"{op.__name__} not implemented.")

result = pc_func(self._pa_array, other)
try:
result = pc_func(self._pa_array, other)
except pa.lib.ArrowNotImplementedError as err:
raise TypeError(self._op_method_error_message(other_original, op)) from err
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am doing from err here, so the original pyarrow message like "ArrowNotImplementedError: Function 'binary_join_element_wise' has no kernel matching input types (large_string, int64, large_string)" is still visible up in the traceback.

For the new string dtype, I would actually be fine with suppressing this (using from None) to limit the length of the error traceback. But for usage of ArrowDtype in general, I suppose this information is still very useful to see.

But as I mentioned in the top post, I could also only do this try/except in case we are in ArrowStringArray, and leave the generic ArrowExtensionArray as is (raising the pyarrow error directly)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like the from err - I don't see a huge harm in including all of that information

return type(self)(result)

def _logical_method(self, other, op) -> Self:
Expand Down
26 changes: 7 additions & 19 deletions pandas/tests/arrays/boolean/test_arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.compat import HAS_PYARROW

import pandas as pd
import pandas._testing as tm

Expand Down Expand Up @@ -94,19 +90,8 @@ def test_op_int8(left_array, right_array, opname):
# -----------------------------------------------------------------------------


@pytest.mark.xfail(
using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
)
def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string):
def test_error_invalid_values(data, all_arithmetic_operators):
# invalid ops

if using_infer_string:
import pyarrow as pa

err = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError)
else:
err = TypeError

op = all_arithmetic_operators
s = pd.Series(data)
ops = getattr(s, op)
Expand All @@ -116,7 +101,8 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string
"did not contain a loop with signature matching types|"
"BooleanArray cannot perform the operation|"
"not supported for the input types, and the inputs could not be safely coerced "
"to any supported types according to the casting rule ''safe''"
"to any supported types according to the casting rule ''safe''|"
"not supported for dtype"
)
with pytest.raises(TypeError, match=msg):
ops("foo")
Expand All @@ -125,9 +111,10 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string
r"unsupported operand type\(s\) for",
"Concatenation operation is not implemented for NumPy arrays",
"has no kernel",
"not supported for dtype",
]
)
with pytest.raises(err, match=msg):
with pytest.raises(TypeError, match=msg):
ops(pd.Timestamp("20180101"))

# invalid array-likes
Expand All @@ -140,7 +127,8 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string
"not all arguments converted during string formatting",
"has no kernel",
"not implemented",
"not supported for dtype",
]
)
with pytest.raises(err, match=msg):
with pytest.raises(TypeError, match=msg):
ops(pd.Series("foo", index=s.index))
23 changes: 8 additions & 15 deletions pandas/tests/arrays/floating/test_arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import FloatingArray
Expand Down Expand Up @@ -124,19 +122,11 @@ def test_arith_zero_dim_ndarray(other):
# -----------------------------------------------------------------------------


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string):
def test_error_invalid_values(data, all_arithmetic_operators):
op = all_arithmetic_operators
s = pd.Series(data)
ops = getattr(s, op)

if using_infer_string:
import pyarrow as pa

errs = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError)
else:
errs = TypeError

# invalid scalars
msg = "|".join(
[
Expand All @@ -152,15 +142,17 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string
"Concatenation operation is not implemented for NumPy arrays",
"has no kernel",
"not implemented",
"not supported for dtype",
"Can only string multiply by an integer",
]
)
with pytest.raises(errs, match=msg):
with pytest.raises(TypeError, match=msg):
ops("foo")
with pytest.raises(errs, match=msg):
with pytest.raises(TypeError, match=msg):
ops(pd.Timestamp("20180101"))

# invalid array-likes
with pytest.raises(errs, match=msg):
with pytest.raises(TypeError, match=msg):
ops(pd.Series("foo", index=s.index))

msg = "|".join(
Expand All @@ -181,9 +173,10 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string
"cannot subtract DatetimeArray from ndarray",
"has no kernel",
"not implemented",
"not supported for dtype",
]
)
with pytest.raises(errs, match=msg):
with pytest.raises(TypeError, match=msg):
ops(pd.Series(pd.date_range("20180101", periods=len(s))))


Expand Down
32 changes: 10 additions & 22 deletions pandas/tests/arrays/integer/test_arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

import pandas as pd
import pandas._testing as tm
from pandas.core import ops
Expand Down Expand Up @@ -174,19 +172,11 @@ def test_numpy_zero_dim_ndarray(other):
# -----------------------------------------------------------------------------


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string):
op = all_arithmetic_operators
s = pd.Series(data)
ops = getattr(s, op)

if using_infer_string:
import pyarrow as pa

errs = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError)
else:
errs = TypeError

# invalid scalars
msg = "|".join(
[
Expand All @@ -201,24 +191,21 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string
"has no kernel",
"not implemented",
"The 'out' kwarg is necessary. Use numpy.strings.multiply without it.",
"not supported for dtype",
]
)
with pytest.raises(errs, match=msg):
with pytest.raises(TypeError, match=msg):
ops("foo")
with pytest.raises(errs, match=msg):
with pytest.raises(TypeError, match=msg):
ops(pd.Timestamp("20180101"))

# invalid array-likes
str_ser = pd.Series("foo", index=s.index)
# with pytest.raises(TypeError, match=msg):
if (
all_arithmetic_operators
in [
"__mul__",
"__rmul__",
]
and not using_infer_string
): # (data[~data.isna()] >= 0).all():
if all_arithmetic_operators in [
"__mul__",
"__rmul__",
]: # (data[~data.isna()] >= 0).all():
res = ops(str_ser)
expected = pd.Series(["foo" * x for x in data], index=s.index)
expected = expected.fillna(np.nan)
Expand All @@ -227,7 +214,7 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string
# more-correct than np.nan here.
tm.assert_series_equal(res, expected)
else:
with pytest.raises(errs, match=msg):
with pytest.raises(TypeError, match=msg):
ops(str_ser)

msg = "|".join(
Expand All @@ -242,9 +229,10 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string
"cannot subtract DatetimeArray from ndarray",
"has no kernel",
"not implemented",
"not supported for dtype",
]
)
with pytest.raises(errs, match=msg):
with pytest.raises(TypeError, match=msg):
ops(pd.Series(pd.date_range("20180101", periods=len(s))))


Expand Down
Loading