From 2e6a338caf892741b6dc54813ea74c9276e743ba Mon Sep 17 00:00:00 2001 From: Ion Koutsouris <15728914+ion-elgreco@users.noreply.github.com> Date: Tue, 6 Feb 2024 19:06:46 +0100 Subject: [PATCH 1/3] skip row groups with 0 records --- python/deltalake/writer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/deltalake/writer.py b/python/deltalake/writer.py index d3b956cbfc..29ff262677 100644 --- a/python/deltalake/writer.py +++ b/python/deltalake/writer.py @@ -666,7 +666,8 @@ def get_file_stats_from_metadata( def iter_groups(metadata: Any) -> Iterator[Any]: for i in range(metadata.num_row_groups): - yield metadata.row_group(i) + if metadata.row_group(i).num_rows > 0: + yield metadata.row_group(i) for column_idx in range(metadata.num_columns): name = metadata.row_group(0).column(column_idx).path_in_schema From 723f91f0c50aff89d3f0e0ad464500d8d4d065be Mon Sep 17 00:00:00 2001 From: Ion Koutsouris <15728914+ion-elgreco@users.noreply.github.com> Date: Tue, 6 Feb 2024 19:17:11 +0100 Subject: [PATCH 2/3] add test --- python/tests/test_writer.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/python/tests/test_writer.py b/python/tests/test_writer.py index 9252dfdd41..e210f0d4a0 100644 --- a/python/tests/test_writer.py +++ b/python/tests/test_writer.py @@ -1251,3 +1251,24 @@ def test_with_deltalake_schema(tmp_path: pathlib.Path, sample_data: pa.Table): ) delta_table = DeltaTable(tmp_path) assert delta_table.schema().to_pyarrow() == sample_data.schema + + +def test_write_stats_empty_rowgroups_2169(tmp_path: pathlib.Path): + data = pa.table( + { + "data": pa.array(["B"] * 1024 * 33), + } + ) + write_deltalake( + tmp_path, + data, + max_rows_per_file=1024 * 32, + max_rows_per_group=1024 * 16, + min_rows_per_group=8 * 1024, + mode="overwrite", + ) + dt = DeltaTable(tmp_path) + assert ( + dt.to_pyarrow_dataset().to_table(filter=(pc.field("data") == "B")).shape[0] + == 33792 + ) From d9c212d343b7203843b9d291ac629a0b0c4691ef Mon Sep 17 00:00:00 2001 From: Ion Koutsouris <15728914+ion-elgreco@users.noreply.github.com> Date: Tue, 6 Feb 2024 19:28:08 +0100 Subject: [PATCH 3/3] Update python/tests/test_writer.py Co-authored-by: Will Jones --- python/tests/test_writer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/tests/test_writer.py b/python/tests/test_writer.py index e210f0d4a0..186eae0b64 100644 --- a/python/tests/test_writer.py +++ b/python/tests/test_writer.py @@ -1253,7 +1253,8 @@ def test_with_deltalake_schema(tmp_path: pathlib.Path, sample_data: pa.Table): assert delta_table.schema().to_pyarrow() == sample_data.schema -def test_write_stats_empty_rowgroups_2169(tmp_path: pathlib.Path): +def test_write_stats_empty_rowgroups(tmp_path: pathlib.Path): + # https://github.com/delta-io/delta-rs/issues/2169 data = pa.table( { "data": pa.array(["B"] * 1024 * 33),