From 40ea1632a4d6e1c9e9654eac3d9da8b1226eb0e6 Mon Sep 17 00:00:00 2001 From: coufon Date: Fri, 22 Dec 2023 05:48:36 +0000 Subject: [PATCH] Add unit test for record manifest writer --- python/src/space/core/manifests/utils.py | 3 +- python/tests/core/manifests/test_index.py | 13 ++--- python/tests/core/manifests/test_record.py | 62 ++++++++++++++++++++++ python/tests/core/manifests/test_utils.py | 1 - 4 files changed, 68 insertions(+), 11 deletions(-) create mode 100644 python/tests/core/manifests/test_record.py diff --git a/python/src/space/core/manifests/utils.py b/python/src/space/core/manifests/utils.py index cc99418..5ff32c4 100644 --- a/python/src/space/core/manifests/utils.py +++ b/python/src/space/core/manifests/utils.py @@ -19,11 +19,10 @@ def write_parquet_file(file_path: str, schema: pa.Schema, - data: pa.Table) -> str: + data: pa.Table) -> None: """Materialize a single Parquet file.""" # TODO: currently assume this file is small, so always write a single file. writer = pq.ParquetWriter(file_path, schema) writer.write_table(data) writer.close() - return file_path diff --git a/python/tests/core/manifests/test_index.py b/python/tests/core/manifests/test_index.py index fe6cfc8..fc15891 100644 --- a/python/tests/core/manifests/test_index.py +++ b/python/tests/core/manifests/test_index.py @@ -53,11 +53,10 @@ def test_write_all_types(self, tmp_path): schema=schema, primary_keys=["int64", "float64", "bool", "string"]) - file_path = str(data_dir / "file0") # TODO: the test should cover all types supported by column stats. manifest_writer.write( - file_path, - _write_parquet_file(file_path, schema, [{ + "data/file0", + _write_parquet_file(str(data_dir / "file0"), schema, [{ "int64": [1, 2, 3], "float64": [0.1, 0.2, 0.3], "bool": [True, False, False], @@ -68,10 +67,9 @@ def test_write_all_types(self, tmp_path): "bool": [False, False], "string": ["A", "z"] }])) - file_path = str(data_dir / "file1") manifest_writer.write( - file_path, - _write_parquet_file(file_path, schema, [{ + "data/file1", + _write_parquet_file(str(data_dir / "file1"), schema, [{ "int64": [1000, 1000000], "float64": [-0.001, 0.001], "bool": [False, False], @@ -80,10 +78,9 @@ def test_write_all_types(self, tmp_path): manifest_path = manifest_writer.finish() - data_dir_str = str(data_dir) assert manifest_path is not None assert pq.read_table(manifest_path).to_pydict() == { - "_FILE": [f"{data_dir_str}/file0", f"{data_dir_str}/file1"], + "_FILE": ["data/file0", "data/file1"], "_INDEX_COMPRESSED_BYTES": [645, 334], "_INDEX_UNCOMPRESSED_BYTES": [624, 320], "_NUM_ROWS": [5, 2], diff --git a/python/tests/core/manifests/test_record.py b/python/tests/core/manifests/test_record.py new file mode 100644 index 0000000..369c1a6 --- /dev/null +++ b/python/tests/core/manifests/test_record.py @@ -0,0 +1,62 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Dict, List + +import pyarrow as pa +import pyarrow.parquet as pq + +from space.core.manifests import RecordManifestWriter +import space.core.proto.metadata_pb2 as meta +from space.core.schema.arrow import field_metadata + + +class TestRecordManifestWriter: + + def test_write(self, tmp_path): + data_dir = tmp_path / "dataset" / "data" + metadata_dir = tmp_path / "dataset" / "metadata" + metadata_dir.mkdir(parents=True) + + manifest_writer = RecordManifestWriter(metadata_dir=str(metadata_dir)) + + manifest_writer.write( + "data/file0.arrayrecord", 0, + meta.StorageStatistics(num_rows=123, + index_compressed_bytes=10, + index_uncompressed_bytes=20, + record_uncompressed_bytes=30)) + manifest_writer.write( + "data/file1.arrayrecord", 1, + meta.StorageStatistics(num_rows=456, + index_compressed_bytes=10, + index_uncompressed_bytes=20, + record_uncompressed_bytes=100)) + + manifest_path = manifest_writer.finish() + + data_dir_str = str(data_dir) + assert manifest_path is not None + assert pq.read_table(manifest_path).to_pydict() == { + "_FILE": ["data/file0.arrayrecord", "data/file1.arrayrecord"], + "_FIELD_ID": [0, 1], + "_NUM_ROWS": [123, 456], + "_UNCOMPRESSED_BYTES": [30, 100] + } + + def test_empty_manifest_should_return_none(self, tmp_path): + metadata_dir = tmp_path / "dataset" / "metadata" + manifest_writer = RecordManifestWriter(metadata_dir=str(metadata_dir)) + + assert manifest_writer.finish() is None diff --git a/python/tests/core/manifests/test_utils.py b/python/tests/core/manifests/test_utils.py index 030098c..50e20eb 100644 --- a/python/tests/core/manifests/test_utils.py +++ b/python/tests/core/manifests/test_utils.py @@ -26,4 +26,3 @@ def test_write_parquet_file(tmp_path): pa.Table.from_pydict({"int64": [1, 2]})) assert data_dir.exists() - assert returned_path == file_path