diff --git a/python/pyarrow/dataset.py b/python/pyarrow/dataset.py
index 2518e37ec6f3d..326b37ec6e1ae 100644
--- a/python/pyarrow/dataset.py
+++ b/python/pyarrow/dataset.py
@@ -964,7 +964,7 @@ def file_visitor(written_file):
     # was converted to one of those two. So we can grab the schema
     # to build the partitioning object from Dataset.
     if isinstance(data, Scanner):
-        partitioning_schema = data.dataset_schema
+        partitioning_schema = data.projected_schema
     else:
         partitioning_schema = data.schema
     partitioning = _ensure_write_partitioning(partitioning,
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index b900e694a91da..3dc9c3beb6ee1 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -4706,3 +4706,28 @@ def test_dataset_filter(tempdir):
         "colA": [1, 2],
         "col2": ["a", "b"]
     })
+
+
+def test_write_dataset_with_scanner_use_projected_schema(tempdir):
+    """
+    Ensure the projected schema is used to validate partitions for scanner
+
+    https://issues.apache.org/jira/browse/ARROW-17228
+    """
+    table = pa.table([pa.array(range(20))], names=["original_column"])
+    table_dataset = ds.dataset(table)
+    columns = {
+        "renamed_column": ds.field("original_column"),
+    }
+    scanner = table_dataset.scanner(columns=columns)
+
+    ds.write_dataset(
+        scanner, tempdir, partitioning=["renamed_column"], format="ipc")
+    with (
+        pytest.raises(
+            KeyError, match=r"'Column original_column does not exist in schema"
+        )
+    ):
+        ds.write_dataset(
+            scanner, tempdir, partitioning=["original_column"], format="ipc"
+        )
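
Note on the fix: a Scanner's projection can rename or compute columns, so the schema the writer actually emits is the Scanner's projected_schema, not its dataset_schema. Validating partition fields against dataset_schema therefore rejected columns that only exist after projection. A minimal standalone sketch of the distinction (not part of the patch; the column names here are illustrative):

    import pyarrow as pa
    import pyarrow.dataset as ds

    table = pa.table({"original_column": pa.array(range(20))})
    scanner = ds.dataset(table).scanner(
        columns={"renamed_column": ds.field("original_column")})

    # Schema of the underlying dataset: still has the original name.
    print(scanner.dataset_schema)    # original_column: int64
    # Schema after projection: what write_dataset actually writes,
    # and (after this patch) what partition columns are checked against.
    print(scanner.projected_schema)  # renamed_column: int64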