Skip to content

Commit

Permalink
Disallow writing empty Manifest files (apache#876)
Browse files Browse the repository at this point in the history
* Disallow writing empty Avro files/blocks

Raising an exception when doing this might look extreme, but
there is no really good reason to allow it.

* Relax the constraints a bit
  • Loading branch information
Fokko authored Jul 9, 2024
1 parent b68e109 commit cdc3e54
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 1 deletion.
6 changes: 6 additions & 0 deletions pyiceberg/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -685,6 +685,10 @@ def __exit__(
traceback: Optional[TracebackType],
) -> None:
"""Close the writer."""
if (self._added_files + self._existing_files + self._deleted_files) == 0:
# This is just a guard to ensure that we don't write empty manifest files
raise ValueError("An empty manifest file has been written")

self.closed = True
self._writer.__exit__(exc_type, exc_value, traceback)

Expand Down Expand Up @@ -757,6 +761,8 @@ def add_entry(self, entry: ManifestEntry) -> ManifestWriter:
elif entry.status == ManifestEntryStatus.DELETED:
self._deleted_files += 1
self._deleted_rows += entry.data_file.record_count
else:
raise ValueError(f"Unknown entry: {entry.status}")

self._partitions.append(entry.data_file.partition)

Expand Down
19 changes: 18 additions & 1 deletion tests/utils/test_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
write_manifest,
write_manifest_list,
)
from pyiceberg.partitioning import PartitionField, PartitionSpec
from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionField, PartitionSpec
from pyiceberg.schema import Schema
from pyiceberg.table.snapshots import Operation, Snapshot, Summary
from pyiceberg.transforms import IdentityTransform
Expand Down Expand Up @@ -306,6 +306,23 @@ def test_read_manifest_v2(generated_manifest_file_file_v2: str) -> None:
assert entry.status == ManifestEntryStatus.ADDED


def test_write_empty_manifest() -> None:
    """Check that closing a manifest writer with no entries raises ``ValueError``.

    The writer is opened against a throwaway Avro path and exited without
    any entry being added; the exit is expected to fail with the
    "An empty manifest file has been written" error.
    """
    file_io = load_file_io()
    single_column_schema = Schema(NestedField(1, "foo", IntegerType(), False))

    with TemporaryDirectory() as scratch_dir:
        avro_path = f"{scratch_dir}/test_write_manifest.avro"

        with pytest.raises(ValueError, match="An empty manifest file has been written"):
            # The output file is created inside the ``raises`` scope on purpose,
            # and the writer body is intentionally left empty.
            with write_manifest(
                format_version=1,
                spec=UNPARTITIONED_PARTITION_SPEC,
                schema=single_column_schema,
                output_file=file_io.new_output(avro_path),
                snapshot_id=8744736658442914487,
            ) as _:
                pass


@pytest.mark.parametrize("format_version", [1, 2])
def test_write_manifest(
generated_manifest_file_file_v1: str, generated_manifest_file_file_v2: str, format_version: TableVersion
Expand Down

0 comments on commit cdc3e54

Please sign in to comment.