Skip to content

Commit

Permalink
ARROW-7663: [Python] Raise better error message when passing mixed-ty…
Browse files Browse the repository at this point in the history
…pe (int/string) Pandas dataframe to pyarrow Table

This PR homogenizes error messages for mixed-type `Pandas` inputs to `pa.Table`.

The message for a `Pandas` column with an `int` followed by a `string` is now
```
In [2]: table = pa.Table.from_pandas(pd.DataFrame({'a': [ 19, 'a']}))
(... traceback...)
ArrowInvalid: ('Could not convert a with type str: tried to convert to int', 'Conversion failed for column a with type object')
```
the same as for `double` followed by `string`:
```
In [3]: table = pa.Table.from_pandas(pd.DataFrame({'a': [ 19.0, 'a']}))
(... traceback...)
ArrowInvalid: ('Could not convert a with type str: tried to convert to double', 'Conversion failed for column a with type object')
```

As a side effect, this snippet [xref #5866, ARROW-7168] now raises an `ArrowInvalid` (it has emitted a `FutureWarning` since 0.16):
```
In [8]: cat = pd.Categorical.from_codes(np.array([0, 1], dtype='int8'), np.array(['a', 'b'], dtype=object))
   ...: typ = pa.dictionary(index_type=pa.int8(), value_type=pa.int64())
   ...: result = pa.array(cat, type=typ)
(... traceback...)
ArrowInvalid: Could not convert a with type str: tried to convert to int
```
Finally, this *does* break a test [xref #4484, ARROW-4036]; see the code comment.

Closes #8044 from arw2019/ARROW-7663

Authored-by: arw2019 <[email protected]>
Signed-off-by: Joris Van den Bossche <[email protected]>
  • Loading branch information
arw2019 authored and jorisvandenbossche committed Sep 15, 2020
1 parent 2d3046f commit 8800a22
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 24 deletions.
7 changes: 6 additions & 1 deletion cpp/src/arrow/python/python_to_arrow.cc
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,12 @@ struct ValueConverter<Type, enable_if_integer<Type>> {

static inline Result<ValueType> FromPython(PyObject* obj) {
ValueType value;
RETURN_NOT_OK(internal::CIntFromPython(obj, &value));
arrow::Status s_ = internal::CIntFromPython(obj, &value);
if (!s_.ok() && !internal::PyIntScalar_Check(obj)) {
return internal::InvalidValue(obj, "tried to convert to int");
} else {
RETURN_NOT_OK(s_);
}
return value;
}
};
Expand Down
2 changes: 1 addition & 1 deletion python/pyarrow/tests/test_compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -736,7 +736,7 @@ def test_is_null():
def test_fill_null():
arr = pa.array([1, 2, None, 4], type=pa.int8())
fill_value = pa.array([5], type=pa.int8())
with pytest.raises(TypeError):
with pytest.raises(pa.ArrowInvalid, match="tried to convert to int"):
arr.fill_null(fill_value)

arr = pa.array([None, None, None, None], type=pa.null())
Expand Down
8 changes: 2 additions & 6 deletions python/pyarrow/tests/test_convert_builtin.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
import decimal
import itertools
import math
import traceback

import numpy as np
import pytz
Expand Down Expand Up @@ -382,11 +381,8 @@ def test_sequence_custom_integers(seq):
@parametrize_with_iterable_types
def test_broken_integers(seq):
data = [MyBrokenInt()]
with pytest.raises(ZeroDivisionError) as exc_info:
with pytest.raises(pa.ArrowInvalid):
pa.array(seq(data), type=pa.int64())
# Original traceback is kept
tb_lines = traceback.format_tb(exc_info.tb)
assert "# MARKER" in tb_lines[-1]


def test_numpy_scalars_mixed_type():
Expand Down Expand Up @@ -1643,7 +1639,7 @@ def test_map_from_dicts():

# Invalid dictionary types
for entry in [[{'key': '1', 'value': 5}], [{'key': {'value': 2}}]]:
with pytest.raises(TypeError, match="integer is required"):
with pytest.raises(pa.ArrowInvalid, match="tried to convert to int"):
pa.array([entry], type=pa.map_('i4', 'i4'))


Expand Down
34 changes: 18 additions & 16 deletions python/pyarrow/tests/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -2447,19 +2447,22 @@ def test_category_zero_chunks(self):
expected = pd.DataFrame({'a': expected})
tm.assert_frame_equal(result, expected)

def test_mixed_types_fails(self):
data = pd.DataFrame({'a': ['a', 1, 2.0]})
with pytest.raises(pa.ArrowTypeError):
pa.Table.from_pandas(data)

data = pd.DataFrame({'a': [1, True]})
with pytest.raises(pa.ArrowTypeError):
pa.Table.from_pandas(data)

data = pd.DataFrame({'a': ['a', 1, 2.0]})
expected_msg = 'Conversion failed for column a'
with pytest.raises(pa.ArrowTypeError, match=expected_msg):
pa.Table.from_pandas(data)
@pytest.mark.parametrize(
"data,error_type",
[
({"a": ["a", 1, 2.0]}, pa.ArrowTypeError),
({"a": ["a", 1, 2.0]}, pa.ArrowTypeError),
({"a": [1, True]}, pa.ArrowTypeError),
({"a": [True, "a"]}, pa.ArrowInvalid),
({"a": [1, "a"]}, pa.ArrowInvalid),
({"a": [1.0, "a"]}, pa.ArrowInvalid),
],
)
def test_mixed_types_fails(self, data, error_type):
df = pd.DataFrame(data)
msg = "Conversion failed for column a with type object"
with pytest.raises(error_type, match=msg):
pa.Table.from_pandas(df)

def test_strided_data_import(self):
cases = []
Expand Down Expand Up @@ -3531,11 +3534,10 @@ def test_dictionary_from_pandas_specified_type():
assert result.type.equals(typ)
assert result.to_pylist() == ['a', 'b']

# mismatching values type -> raise error (for now a deprecation warning)
# mismatching values type -> raise error
typ = pa.dictionary(index_type=pa.int8(), value_type=pa.int64())
with pytest.warns(FutureWarning):
with pytest.raises(pa.ArrowInvalid):
result = pa.array(cat, type=typ)
assert result.to_pylist() == ['a', 'b']

# mismatching order -> raise error (for now a deprecation warning)
typ = pa.dictionary(
Expand Down

0 comments on commit 8800a22

Please sign in to comment.