From 8800a222d8f3d63f0a783b64ab69ab7e252603c5 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 15 Sep 2020 16:00:42 +0200 Subject: [PATCH] ARROW-7663: [Python] Raise better error message when passing mixed-type (int/string) Pandas dataframe to pyarrow Table This PR homogenizes error messages for mixed-type `Pandas` inputs to `pa.Table`. The message for a `Pandas` column with an `int` followed by a `string` is now ``` In [2]: table = pa.Table.from_pandas(pd.DataFrame({'a': [ 19, 'a']})) (... traceback...) ArrowInvalid: ('Could not convert a with type str: tried to convert to int', 'Conversion failed for column a with type object') ``` the same as for a `double` followed by a `string`: ``` In [3]: table = pa.Table.from_pandas(pd.DataFrame({'a': [ 19.0, 'a']})) (... traceback...) ArrowInvalid: ('Could not convert a with type str: tried to convert to double', 'Conversion failed for column a with type object') ``` As a side effect, this snippet [xref #5866, ARROW-7168] now raises an `ArrowInvalid` (it has been a `FutureWarning` since 0.16): ``` In [8]: cat = pd.Categorical.from_codes(np.array([0, 1], dtype='int8'), np.array(['a', 'b'], dtype=object)) ...: typ = pa.dictionary(index_type=pa.int8(), value_type=pa.int64()) ...: result = pa.array(cat, type=typ) (... traceback...) 
ArrowInvalid: Could not convert a with type str: tried to convert to int ``` Finally, this *does* break a test [xref #4484, ARROW-4036] - see code comment Closes #8044 from arw2019/ARROW-7663 Authored-by: arw2019 Signed-off-by: Joris Van den Bossche --- cpp/src/arrow/python/python_to_arrow.cc | 7 +++- python/pyarrow/tests/test_compute.py | 2 +- python/pyarrow/tests/test_convert_builtin.py | 8 ++--- python/pyarrow/tests/test_pandas.py | 34 +++++++++++--------- 4 files changed, 27 insertions(+), 24 deletions(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 949213f4bb25c..849c474ded395 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -106,7 +106,12 @@ struct ValueConverter> { static inline Result FromPython(PyObject* obj) { ValueType value; - RETURN_NOT_OK(internal::CIntFromPython(obj, &value)); + arrow::Status s_ = internal::CIntFromPython(obj, &value); + if (!s_.ok() && !internal::PyIntScalar_Check(obj)) { + return internal::InvalidValue(obj, "tried to convert to int"); + } else { + RETURN_NOT_OK(s_); + } return value; } }; diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 3c2e0865f8d05..ce45fc6f1bdee 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -736,7 +736,7 @@ def test_is_null(): def test_fill_null(): arr = pa.array([1, 2, None, 4], type=pa.int8()) fill_value = pa.array([5], type=pa.int8()) - with pytest.raises(TypeError): + with pytest.raises(pa.ArrowInvalid, match="tried to convert to int"): arr.fill_null(fill_value) arr = pa.array([None, None, None, None], type=pa.null()) diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index f62a9414e1eff..b8050f964689c 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -25,7 +25,6 @@ import decimal import 
itertools import math -import traceback import numpy as np import pytz @@ -382,11 +381,8 @@ def test_sequence_custom_integers(seq): @parametrize_with_iterable_types def test_broken_integers(seq): data = [MyBrokenInt()] - with pytest.raises(ZeroDivisionError) as exc_info: + with pytest.raises(pa.ArrowInvalid): pa.array(seq(data), type=pa.int64()) - # Original traceback is kept - tb_lines = traceback.format_tb(exc_info.tb) - assert "# MARKER" in tb_lines[-1] def test_numpy_scalars_mixed_type(): @@ -1643,7 +1639,7 @@ def test_map_from_dicts(): # Invalid dictionary types for entry in [[{'key': '1', 'value': 5}], [{'key': {'value': 2}}]]: - with pytest.raises(TypeError, match="integer is required"): + with pytest.raises(pa.ArrowInvalid, match="tried to convert to int"): pa.array([entry], type=pa.map_('i4', 'i4')) diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 2d66a320481bd..03407521c12ad 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -2447,19 +2447,22 @@ def test_category_zero_chunks(self): expected = pd.DataFrame({'a': expected}) tm.assert_frame_equal(result, expected) - def test_mixed_types_fails(self): - data = pd.DataFrame({'a': ['a', 1, 2.0]}) - with pytest.raises(pa.ArrowTypeError): - pa.Table.from_pandas(data) - - data = pd.DataFrame({'a': [1, True]}) - with pytest.raises(pa.ArrowTypeError): - pa.Table.from_pandas(data) - - data = pd.DataFrame({'a': ['a', 1, 2.0]}) - expected_msg = 'Conversion failed for column a' - with pytest.raises(pa.ArrowTypeError, match=expected_msg): - pa.Table.from_pandas(data) + @pytest.mark.parametrize( + "data,error_type", + [ + ({"a": ["a", 1, 2.0]}, pa.ArrowTypeError), + ({"a": ["a", 1, 2.0]}, pa.ArrowTypeError), + ({"a": [1, True]}, pa.ArrowTypeError), + ({"a": [True, "a"]}, pa.ArrowInvalid), + ({"a": [1, "a"]}, pa.ArrowInvalid), + ({"a": [1.0, "a"]}, pa.ArrowInvalid), + ], + ) + def test_mixed_types_fails(self, data, error_type): + df = 
pd.DataFrame(data) + msg = "Conversion failed for column a with type object" + with pytest.raises(error_type, match=msg): + pa.Table.from_pandas(df) def test_strided_data_import(self): cases = [] @@ -3531,11 +3534,10 @@ def test_dictionary_from_pandas_specified_type(): assert result.type.equals(typ) assert result.to_pylist() == ['a', 'b'] - # mismatching values type -> raise error (for now a deprecation warning) + # mismatching values type -> raise error typ = pa.dictionary(index_type=pa.int8(), value_type=pa.int64()) - with pytest.warns(FutureWarning): + with pytest.raises(pa.ArrowInvalid): result = pa.array(cat, type=typ) - assert result.to_pylist() == ['a', 'b'] # mismatching order -> raise error (for now a deprecation warning) typ = pa.dictionary(