From 788200a434462325c9feff4b52203520a90694e4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 13 Mar 2024 14:20:52 +0100 Subject: [PATCH] GH-40428: [Python][CI] Fix dataset partition filter tests with pandas nightly (#40429) ### Rationale for this change From debugging the failure, it seems this is due to pandas changing a filter operation to sometimes preserve a RangeIndex now instead of returning an integer index. The conversion to Arrow changes based on that (a RangeIndex is metadata only by default, whereas an integer index becomes a column). Therefore, this change makes the tests more robust by ensuring there is always at least one non-partition column in the DataFrame, so that whether the result is empty no longer depends on the index. * GitHub Issue: #40428 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/tests/parquet/test_dataset.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/pyarrow/tests/parquet/test_dataset.py b/python/pyarrow/tests/parquet/test_dataset.py index 30dae05124f5d..47e608a1404ff 100644 --- a/python/pyarrow/tests/parquet/test_dataset.py +++ b/python/pyarrow/tests/parquet/test_dataset.py @@ -107,9 +107,9 @@ def test_filters_equivalency(tempdir): df = pd.DataFrame({ 'integer': np.array(integer_keys, dtype='i4').repeat(15), 'string': np.tile(np.tile(np.array(string_keys, dtype=object), 5), 2), - 'boolean': np.tile(np.tile(np.array(boolean_keys, dtype='bool'), 5), - 3), - }, columns=['integer', 'string', 'boolean']) + 'boolean': np.tile(np.tile(np.array(boolean_keys, dtype='bool'), 5), 3), + 'values': np.arange(30), + }) _generate_partition_directories(local, base_path, partition_spec, df) @@ -312,9 +312,9 @@ def test_filters_inclusive_set(tempdir): df = pd.DataFrame({ 'integer': np.array(integer_keys, dtype='i4').repeat(15), 'string': np.tile(np.tile(np.array(string_keys, dtype=object), 5), 2), - 'boolean': np.tile(np.tile(np.array(boolean_keys, dtype='bool'), 5), - 
3), - }, columns=['integer', 'string', 'boolean']) + 'boolean': np.tile(np.tile(np.array(boolean_keys, dtype='bool'), 5), 3), + 'values': np.arange(30), + }) _generate_partition_directories(local, base_path, partition_spec, df)