From e6f4e2077479393cd3b43188df185555d8e597ad Mon Sep 17 00:00:00 2001
From: Miles
Date: Mon, 12 Feb 2024 14:29:34 +0100
Subject: [PATCH] GH-39780: [Python][Parquet] Support hashing for FileMetaData
 and ParquetSchema (#39781)

I think the hash, especially for `FileMetaData`, could be better; maybe just
use the return value of `__repr__`, even though that won't include row group
info?

### Rationale for this change

Helpful for dependent projects.

### What changes are included in this PR?

Implement `__hash__` for `ParquetSchema` and `FileMetaData`.

### Are these changes tested?

Yes

### Are there any user-facing changes?

Supports hashing metadata:

```python
In [1]: import pyarrow.parquet as pq

In [2]: f = pq.ParquetFile('test.parquet')

In [3]: hash(f.metadata)
Out[3]: 4816453453708427907

In [4]: hash(f.metadata.schema)
Out[4]: 2300988959078172540
```

* Closes: #39780

Authored-by: Miles Granger
Signed-off-by: Antoine Pitrou
---
 python/pyarrow/_parquet.pyx                   | 10 +++++++
 python/pyarrow/tests/parquet/test_metadata.py | 26 +++++++++++++++++++
 2 files changed, 36 insertions(+)

diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index 0b685245655a2..7bc68a288aa78 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -849,6 +849,13 @@ cdef class FileMetaData(_Weakrefable):
         cdef Buffer buffer = sink.getvalue()
         return _reconstruct_filemetadata, (buffer,)
 
+    def __hash__(self):
+        return hash((self.schema,
+                     self.num_rows,
+                     self.num_row_groups,
+                     self.format_version,
+                     self.serialized_size))
+
     def __repr__(self):
         return """{0}
   created_by: {1}
@@ -1071,6 +1078,9 @@ cdef class ParquetSchema(_Weakrefable):
     def __getitem__(self, i):
         return self.column(i)
 
+    def __hash__(self):
+        return hash(self.schema.ToString())
+
     @property
     def names(self):
         """Name of each field (list of str)."""
diff --git a/python/pyarrow/tests/parquet/test_metadata.py b/python/pyarrow/tests/parquet/test_metadata.py
index 73284d2e53b9e..bf186bd923c4f 100644
--- a/python/pyarrow/tests/parquet/test_metadata.py
+++ b/python/pyarrow/tests/parquet/test_metadata.py
@@ -499,6 +499,32 @@ def test_multi_dataset_metadata(tempdir):
     assert md['serialized_size'] > 0
 
 
+def test_metadata_hashing(tempdir):
+    path1 = str(tempdir / "metadata1")
+    schema1 = pa.schema([("a", "int64"), ("b", "float64")])
+    pq.write_metadata(schema1, path1)
+    parquet_meta1 = pq.read_metadata(path1)
+
+    # Same as 1, just different path
+    path2 = str(tempdir / "metadata2")
+    schema2 = pa.schema([("a", "int64"), ("b", "float64")])
+    pq.write_metadata(schema2, path2)
+    parquet_meta2 = pq.read_metadata(path2)
+
+    # Different schema
+    path3 = str(tempdir / "metadata3")
+    schema3 = pa.schema([("a", "int64"), ("b", "float32")])
+    pq.write_metadata(schema3, path3)
+    parquet_meta3 = pq.read_metadata(path3)
+
+    # Deterministic
+    assert hash(parquet_meta1) == hash(parquet_meta1)  # equal w/ same instance
+    assert hash(parquet_meta1) == hash(parquet_meta2)  # equal w/ different instance
+
+    # Not the same as other metadata with a different schema
+    assert hash(parquet_meta1) != hash(parquet_meta3)
+
+
 @pytest.mark.filterwarnings("ignore:Parquet format:FutureWarning")
 def test_write_metadata(tempdir):
     path = str(tempdir / "metadata")
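
A minimal usage sketch, not part of the patch itself: hashability is what lets
dependent projects use `FileMetaData` and `ParquetSchema` as dict keys or set
members. Only public pyarrow APIs (`pq.write_metadata`, `pq.read_metadata`)
are used below; the cache is a hypothetical downstream structure.

```python
# Illustrative sketch (not part of this patch): with __hash__ implemented,
# metadata objects can key a cache keyed by file metadata.
import os
import tempfile

import pyarrow as pa
import pyarrow.parquet as pq

with tempfile.TemporaryDirectory() as d:
    path = os.path.join(d, "metadata")
    pq.write_metadata(pa.schema([("a", "int64")]), path)

    meta = pq.read_metadata(path)

    # A hypothetical cache keyed by FileMetaData; the payload is a stand-in.
    cache = {meta: {"num_rows": meta.num_rows}}

    # Re-reading the same file produces an instance with an equal hash
    # (this mirrors the assertions in test_metadata_hashing above).
    assert hash(meta) == hash(pq.read_metadata(path))
    assert hash(meta.schema) == hash(pq.read_metadata(path).schema)
```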