Skip to content

Commit

Permalink
TST: change pyarrow skips to xfails (#55576)
Browse files Browse the repository at this point in the history
* TST: change pyarrow skips to xfails

* revert edits where CI is different from local

---------

Co-authored-by: Matthew Roeschke <[email protected]>
  • Loading branch information
jbrockmendel and mroeschke authored Oct 23, 2023
1 parent 0c7d303 commit f64c608
Show file tree
Hide file tree
Showing 15 changed files with 268 additions and 239 deletions.
11 changes: 7 additions & 4 deletions pandas/tests/io/parser/common/test_common_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@
)

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")


def test_override_set_noconvert_columns():
Expand Down Expand Up @@ -515,8 +514,6 @@ def test_single_char_leading_whitespace(all_parsers, delim_whitespace):
tm.assert_frame_equal(result, expected)


# Skip for now, actually only one test fails though, but its tricky to xfail
@skip_pyarrow
@pytest.mark.parametrize(
"sep,skip_blank_lines,exp_data",
[
Expand All @@ -536,7 +533,7 @@ def test_single_char_leading_whitespace(all_parsers, delim_whitespace):
),
],
)
def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data):
def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data, request):
parser = all_parsers
data = """\
A,B,C
Expand All @@ -550,6 +547,12 @@ def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data):

if sep == r"\s+":
data = data.replace(",", " ")
if parser.engine == "pyarrow":
mark = pytest.mark.xfail(
raises=ValueError,
reason="the 'pyarrow' engine does not support regex separators",
)
request.applymarker(mark)

result = parser.read_csv(StringIO(data), sep=sep, skip_blank_lines=skip_blank_lines)
expected = DataFrame(exp_data, columns=["A", "B", "C"])
Expand Down
10 changes: 4 additions & 6 deletions pandas/tests/io/parser/common/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,6 @@

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")

# GH#43650: Some expected failures with the pyarrow engine can occasionally
# cause a deadlock instead, so we skip these instead of xfailing
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")


@pytest.mark.parametrize(
"data,kwargs,expected",
Expand Down Expand Up @@ -278,7 +274,8 @@ def test_empty_with_index(all_parsers):
tm.assert_frame_equal(result, expected)


@skip_pyarrow
# CSV parse error: Empty CSV file or block: cannot infer number of columns
@xfail_pyarrow
def test_empty_with_multi_index(all_parsers):
# see gh-10467
data = "x,y,z"
Expand All @@ -291,7 +288,8 @@ def test_empty_with_multi_index(all_parsers):
tm.assert_frame_equal(result, expected)


@skip_pyarrow
# CSV parse error: Empty CSV file or block: cannot infer number of columns
@xfail_pyarrow
def test_empty_with_reversed_multi_index(all_parsers):
data = "x,y,z"
parser = all_parsers
Expand Down
19 changes: 11 additions & 8 deletions pandas/tests/io/parser/common/test_ints.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,7 @@
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)

# GH#43650: Some expected failures with the pyarrow engine can occasionally
# cause a deadlock instead, so we skip these instead of xfailing
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")


def test_int_conversion(all_parsers):
Expand Down Expand Up @@ -102,12 +100,16 @@ def test_parse_integers_above_fp_precision(all_parsers):
tm.assert_frame_equal(result, expected)


@skip_pyarrow # Flaky
@pytest.mark.parametrize("sep", [" ", r"\s+"])
def test_integer_overflow_bug(all_parsers, sep):
# see gh-2601
data = "65248E10 11\n55555E55 22\n"
parser = all_parsers
if parser.engine == "pyarrow" and sep != " ":
msg = "the 'pyarrow' engine does not support regex separators"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), header=None, sep=sep)
return

result = parser.read_csv(StringIO(data), header=None, sep=sep)
expected = DataFrame([[6.5248e14, 11], [5.5555e59, 22]])
Expand All @@ -124,7 +126,8 @@ def test_int64_min_issues(all_parsers):
tm.assert_frame_equal(result, expected)


@skip_pyarrow
# ValueError: The 'converters' option is not supported with the 'pyarrow' engine
@xfail_pyarrow
@pytest.mark.parametrize("conv", [None, np.int64, np.uint64])
def test_int64_overflow(all_parsers, conv):
data = """ID
Expand Down Expand Up @@ -168,7 +171,7 @@ def test_int64_overflow(all_parsers, conv):
parser.read_csv(StringIO(data), converters={"ID": conv})


@skip_pyarrow
@xfail_pyarrow # CSV parse error: Empty CSV file or block
@pytest.mark.parametrize(
"val", [np.iinfo(np.uint64).max, np.iinfo(np.int64).max, np.iinfo(np.int64).min]
)
Expand All @@ -182,7 +185,7 @@ def test_int64_uint64_range(all_parsers, val):
tm.assert_frame_equal(result, expected)


@skip_pyarrow
@xfail_pyarrow # CSV parse error: Empty CSV file or block
@pytest.mark.parametrize(
"val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1]
)
Expand All @@ -196,7 +199,7 @@ def test_outside_int64_uint64_range(all_parsers, val):
tm.assert_frame_equal(result, expected)


@skip_pyarrow
@xfail_pyarrow # gets float64 dtype instead of object
@pytest.mark.parametrize("exp_data", [[str(-1), str(2**63)], [str(2**63), str(-1)]])
def test_numeric_range_too_wide(all_parsers, exp_data):
# No numerical dtype can hold both negative and uint64
Expand Down
31 changes: 20 additions & 11 deletions pandas/tests/io/parser/common/test_read_errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
import pandas._testing as tm

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")


def test_empty_decimal_marker(all_parsers):
Expand All @@ -44,7 +43,6 @@ def test_empty_decimal_marker(all_parsers):
parser.read_csv(StringIO(data), decimal="")


@skip_pyarrow
def test_bad_stream_exception(all_parsers, csv_dir_path):
# see gh-13652
#
Expand All @@ -65,7 +63,7 @@ def test_bad_stream_exception(all_parsers, csv_dir_path):
parser.read_csv(stream)


@skip_pyarrow
@xfail_pyarrow # ValueError: The 'comment' option is not supported
def test_malformed(all_parsers):
# see gh-6607
parser = all_parsers
Expand All @@ -80,7 +78,7 @@ def test_malformed(all_parsers):
parser.read_csv(StringIO(data), header=1, comment="#")


@skip_pyarrow
@xfail_pyarrow # ValueError: The 'iterator' option is not supported
@pytest.mark.parametrize("nrows", [5, 3, None])
def test_malformed_chunks(all_parsers, nrows):
data = """ignore
Expand All @@ -100,7 +98,7 @@ def test_malformed_chunks(all_parsers, nrows):
reader.read(nrows)


@skip_pyarrow
@xfail_pyarrow # does not raise
def test_catch_too_many_names(all_parsers):
# see gh-5156
data = """\
Expand All @@ -115,12 +113,17 @@ def test_catch_too_many_names(all_parsers):
else "Number of passed names did not match "
"number of header fields in the file"
)
depr_msg = "Passing a BlockManager to DataFrame is deprecated"
warn = None
if parser.engine == "pyarrow":
warn = DeprecationWarning

with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"])
with tm.assert_produces_warning(warn, match=depr_msg, check_stacklevel=False):
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"])


@skip_pyarrow
@xfail_pyarrow # CSV parse error: Empty CSV file or block
@pytest.mark.parametrize("nrows", [0, 1, 2, 3, 4, 5])
def test_raise_on_no_columns(all_parsers, nrows):
parser = all_parsers
Expand Down Expand Up @@ -208,7 +211,6 @@ def test_read_csv_wrong_num_columns(all_parsers):
parser.read_csv(StringIO(data))


@skip_pyarrow
def test_null_byte_char(request, all_parsers):
# see gh-2741
data = "\x00,foo"
Expand All @@ -226,12 +228,19 @@ def test_null_byte_char(request, all_parsers):
out = parser.read_csv(StringIO(data), names=names)
tm.assert_frame_equal(out, expected)
else:
msg = "NULL byte detected"
if parser.engine == "pyarrow":
msg = (
"CSV parse error: Empty CSV file or block: "
"cannot infer number of columns"
)
else:
msg = "NULL byte detected"
with pytest.raises(ParserError, match=msg):
parser.read_csv(StringIO(data), names=names)


@skip_pyarrow
# ValueError: the 'pyarrow' engine does not support sep=None with delim_whitespace=False
@xfail_pyarrow
@pytest.mark.filterwarnings("always::ResourceWarning")
def test_open_file(request, all_parsers):
# GH 39024
Expand Down
16 changes: 0 additions & 16 deletions pandas/tests/io/parser/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,19 +279,3 @@ def pyarrow_xfail(request):
if parser.engine == "pyarrow":
mark = pytest.mark.xfail(reason="pyarrow doesn't support this.")
request.applymarker(mark)


@pytest.fixture
def pyarrow_skip(request):
    """
    Fixture that skips a test if the engine is pyarrow.

    The parser under test is resolved from whichever parser fixture the
    test requested (``all_parsers`` or ``all_parsers_all_precisions``);
    if neither is present, the test is left to run unmodified.
    """
    if "all_parsers" in request.fixturenames:
        parser = request.getfixturevalue("all_parsers")
    elif "all_parsers_all_precisions" in request.fixturenames:
        # Return value is tuple of (engine, precision)
        parser = request.getfixturevalue("all_parsers_all_precisions")[0]
    else:
        # No parser fixture in use -> nothing to inspect, nothing to skip.
        return
    if parser.engine == "pyarrow":
        pytest.skip("pyarrow doesn't support this.")
14 changes: 10 additions & 4 deletions pandas/tests/io/parser/dtypes/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
)

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")


@xfail_pyarrow
Expand Down Expand Up @@ -55,9 +54,8 @@ def test_categorical_dtype(all_parsers, dtype):
tm.assert_frame_equal(actual, expected)


@skip_pyarrow # Flaky
@pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}])
def test_categorical_dtype_single(all_parsers, dtype):
def test_categorical_dtype_single(all_parsers, dtype, request):
# see gh-10153
parser = all_parsers
data = """a,b,c
Expand All @@ -67,6 +65,13 @@ def test_categorical_dtype_single(all_parsers, dtype):
expected = DataFrame(
{"a": [1, 1, 2], "b": Categorical(["a", "a", "b"]), "c": [3.4, 3.4, 4.5]}
)
if parser.engine == "pyarrow":
mark = pytest.mark.xfail(
strict=False,
reason="Flaky test sometimes gives object dtype instead of Categorical",
)
request.applymarker(mark)

actual = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(actual, expected)

Expand Down Expand Up @@ -141,6 +146,7 @@ def test_categorical_dtype_utf16(all_parsers, csv_dir_path):
tm.assert_frame_equal(actual, expected)


# ValueError: The 'chunksize' option is not supported with the 'pyarrow' engine
@xfail_pyarrow
def test_categorical_dtype_chunksize_infer_categories(all_parsers):
# see gh-10153
Expand All @@ -161,6 +167,7 @@ def test_categorical_dtype_chunksize_infer_categories(all_parsers):
tm.assert_frame_equal(actual, expected)


# ValueError: The 'chunksize' option is not supported with the 'pyarrow' engine
@xfail_pyarrow
def test_categorical_dtype_chunksize_explicit_categories(all_parsers):
# see gh-10153
Expand Down Expand Up @@ -253,7 +260,6 @@ def test_categorical_coerces_numeric(all_parsers):
tm.assert_frame_equal(result, expected)


@skip_pyarrow # Flaky
def test_categorical_coerces_datetime(all_parsers):
parser = all_parsers
dti = pd.DatetimeIndex(["2017-01-01", "2018-01-01", "2019-01-01"], freq=None)
Expand Down
12 changes: 0 additions & 12 deletions pandas/tests/io/parser/test_compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,6 @@
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)

skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")


@pytest.fixture(params=[True, False])
def buffer(request):
Expand All @@ -36,7 +34,6 @@ def parser_and_data(all_parsers, csv1):
return parser, data, expected


@skip_pyarrow
@pytest.mark.parametrize("compression", ["zip", "infer", "zip2"])
def test_zip(parser_and_data, compression):
parser, data, expected = parser_and_data
Expand All @@ -54,7 +51,6 @@ def test_zip(parser_and_data, compression):
tm.assert_frame_equal(result, expected)


@skip_pyarrow
@pytest.mark.parametrize("compression", ["zip", "infer"])
def test_zip_error_multiple_files(parser_and_data, compression):
parser, data, expected = parser_and_data
Expand All @@ -70,7 +66,6 @@ def test_zip_error_multiple_files(parser_and_data, compression):
parser.read_csv(path, compression=compression)


@skip_pyarrow
def test_zip_error_no_files(parser_and_data):
parser, _, _ = parser_and_data

Expand All @@ -82,7 +77,6 @@ def test_zip_error_no_files(parser_and_data):
parser.read_csv(path, compression="zip")


@skip_pyarrow
def test_zip_error_invalid_zip(parser_and_data):
parser, _, _ = parser_and_data

Expand All @@ -92,7 +86,6 @@ def test_zip_error_invalid_zip(parser_and_data):
parser.read_csv(f, compression="zip")


@skip_pyarrow
@pytest.mark.parametrize("filename", [None, "test.{ext}"])
def test_compression(
request,
Expand Down Expand Up @@ -128,7 +121,6 @@ def test_compression(
tm.assert_frame_equal(result, expected)


@skip_pyarrow
@pytest.mark.parametrize("ext", [None, "gz", "bz2"])
def test_infer_compression(all_parsers, csv1, buffer, ext):
# see gh-9770
Expand All @@ -148,7 +140,6 @@ def test_infer_compression(all_parsers, csv1, buffer, ext):
tm.assert_frame_equal(result, expected)


@skip_pyarrow
def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding_fmt):
# see gh-18071, gh-24130
parser = all_parsers
Expand All @@ -166,7 +157,6 @@ def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding
tm.assert_frame_equal(result, expected)


@skip_pyarrow
@pytest.mark.parametrize("invalid_compression", ["sfark", "bz3", "zipper"])
def test_invalid_compression(all_parsers, invalid_compression):
parser = all_parsers
Expand All @@ -178,7 +168,6 @@ def test_invalid_compression(all_parsers, invalid_compression):
parser.read_csv("test_file.zip", **compress_kwargs)


@skip_pyarrow
def test_compression_tar_archive(all_parsers, csv_dir_path):
parser = all_parsers
path = os.path.join(csv_dir_path, "tar_csv.tar.gz")
Expand All @@ -200,7 +189,6 @@ def test_ignore_compression_extension(all_parsers):
tm.assert_frame_equal(parser.read_csv(path_zip, compression=None), df)


@skip_pyarrow
def test_writes_tar_gz(all_parsers):
parser = all_parsers
data = DataFrame(
Expand Down
Loading

0 comments on commit f64c608

Please sign in to comment.