ARROW-7663: [Python] Raise better error message when passing mixed-type (int/string) Pandas dataframe to pyarrow Table

This PR homogenizes error messages for mixed-type `Pandas` inputs to `pa.Table`.

The message for a `Pandas` column with an `int` followed by a `string` is now

```
In [2]: table = pa.Table.from_pandas(pd.DataFrame({'a': [ 19, 'a']}))
(... traceback...)
ArrowInvalid: ('Could not convert a with type str: tried to convert to int', 'Conversion failed for column a with type object')
```

the same as for a `double` followed by a `string`:

```
In [3]: table = pa.Table.from_pandas(pd.DataFrame({'a': [ 19.0, 'a']}))
(... traceback...)
ArrowInvalid: ('Could not convert a with type str: tried to convert to double', 'Conversion failed for column a with type object')
```

As a side effect, this snippet [xref #5866, ARROW-7168] now throws an `ArrowInvalid` (it has raised a `FutureWarning` since 0.16):

```
In [8]: cat = pd.Categorical.from_codes(np.array([0, 1], dtype='int8'), np.array(['a', 'b'], dtype=object))
   ...: typ = pa.dictionary(index_type=pa.int8(), value_type=pa.int64())
   ...: result = pa.array(cat, type=typ)
(... traceback...)
ArrowInvalid: Could not convert a with type str: tried to convert to int
```

Finally, this *does* break a test [xref #4484, ARROW-4036] - see the code comment.

Closes #8044 from arw2019/ARROW-7663

Authored-by: arw2019 <[email protected]>
Signed-off-by: Joris Van den Bossche <[email protected]>
apache · Sep 15, 2020 · 8800a22 · 8800a22
1 parent 2d3046f
commit 8800a22
Show file tree

Hide file tree

Showing 4 changed files with 27 additions and 24 deletions.
diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc
@@ -106,7 +106,12 @@ struct ValueConverter<Type, enable_if_integer<Type>> {
 
   static inline Result<ValueType> FromPython(PyObject* obj) {
     ValueType value;
-    RETURN_NOT_OK(internal::CIntFromPython(obj, &value));
+    arrow::Status s_ = internal::CIntFromPython(obj, &value);
+    if (!s_.ok() && !internal::PyIntScalar_Check(obj)) {
+      return internal::InvalidValue(obj, "tried to convert to int");
+    } else {
+      RETURN_NOT_OK(s_);
+    }
     return value;
   }
 };

diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py
@@ -736,7 +736,7 @@ def test_is_null():
 def test_fill_null():
     arr = pa.array([1, 2, None, 4], type=pa.int8())
     fill_value = pa.array([5], type=pa.int8())
-    with pytest.raises(TypeError):
+    with pytest.raises(pa.ArrowInvalid, match="tried to convert to int"):
         arr.fill_null(fill_value)
 
     arr = pa.array([None, None, None, None], type=pa.null())

diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py
@@ -25,7 +25,6 @@
 import decimal
 import itertools
 import math
-import traceback
 
 import numpy as np
 import pytz
@@ -382,11 +381,8 @@ def test_sequence_custom_integers(seq):
 @parametrize_with_iterable_types
 def test_broken_integers(seq):
     data = [MyBrokenInt()]
-    with pytest.raises(ZeroDivisionError) as exc_info:
+    with pytest.raises(pa.ArrowInvalid):
         pa.array(seq(data), type=pa.int64())
-    # Original traceback is kept
-    tb_lines = traceback.format_tb(exc_info.tb)
-    assert "# MARKER" in tb_lines[-1]
 
 
 def test_numpy_scalars_mixed_type():
@@ -1643,7 +1639,7 @@ def test_map_from_dicts():
 
     # Invalid dictionary types
     for entry in [[{'key': '1', 'value': 5}], [{'key': {'value': 2}}]]:
-        with pytest.raises(TypeError, match="integer is required"):
+        with pytest.raises(pa.ArrowInvalid, match="tried to convert to int"):
             pa.array([entry], type=pa.map_('i4', 'i4'))
 
 

diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
@@ -2447,19 +2447,22 @@ def test_category_zero_chunks(self):
             expected = pd.DataFrame({'a': expected})
             tm.assert_frame_equal(result, expected)
 
-    def test_mixed_types_fails(self):
-        data = pd.DataFrame({'a': ['a', 1, 2.0]})
-        with pytest.raises(pa.ArrowTypeError):
-            pa.Table.from_pandas(data)
-
-        data = pd.DataFrame({'a': [1, True]})
-        with pytest.raises(pa.ArrowTypeError):
-            pa.Table.from_pandas(data)
-
-        data = pd.DataFrame({'a': ['a', 1, 2.0]})
-        expected_msg = 'Conversion failed for column a'
-        with pytest.raises(pa.ArrowTypeError, match=expected_msg):
-            pa.Table.from_pandas(data)
+    @pytest.mark.parametrize(
+        "data,error_type",
+        [
+            ({"a": ["a", 1, 2.0]}, pa.ArrowTypeError),
+            ({"a": ["a", 1, 2.0]}, pa.ArrowTypeError),
+            ({"a": [1, True]}, pa.ArrowTypeError),
+            ({"a": [True, "a"]}, pa.ArrowInvalid),
+            ({"a": [1, "a"]}, pa.ArrowInvalid),
+            ({"a": [1.0, "a"]}, pa.ArrowInvalid),
+        ],
+    )
+    def test_mixed_types_fails(self, data, error_type):
+        df = pd.DataFrame(data)
+        msg = "Conversion failed for column a with type object"
+        with pytest.raises(error_type, match=msg):
+            pa.Table.from_pandas(df)
 
     def test_strided_data_import(self):
         cases = []
@@ -3531,11 +3534,10 @@ def test_dictionary_from_pandas_specified_type():
     assert result.type.equals(typ)
     assert result.to_pylist() == ['a', 'b']
 
-    # mismatching values type -> raise error (for now a deprecation warning)
+    # mismatching values type -> raise error
     typ = pa.dictionary(index_type=pa.int8(), value_type=pa.int64())
-    with pytest.warns(FutureWarning):
+    with pytest.raises(pa.ArrowInvalid):
         result = pa.array(cat, type=typ)
-    assert result.to_pylist() == ['a', 'b']
 
     # mismatching order -> raise error (for now a deprecation warning)
     typ = pa.dictionary(