unit test refactoring
coufon committed Dec 24, 2023
1 parent f7b90d7 commit 7759cab
Showing 3 changed files with 106 additions and 122 deletions.
81 changes: 81 additions & 0 deletions python/tests/core/ops/conftest.py
@@ -0,0 +1,81 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import pyarrow as pa
import pytest
from tensorflow_datasets import features # type: ignore[import-untyped]

from space.core.schema.types import TfFeatures


# TODO: the test should cover all types supported by column stats.
@pytest.fixture
def all_types_schema():
return pa.schema([
pa.field("int64", pa.int64()),
pa.field("float64", pa.float64()),
pa.field("bool", pa.bool_()),
pa.field("string", pa.string())
])


@pytest.fixture
def all_types_input_data():
return [{
"int64": [1, 2, 3],
"float64": [0.1, 0.2, 0.3],
"bool": [True, False, False],
"string": ["a", "b", "c"]
}, {
"int64": [0, 10],
"float64": [-0.1, 100.0],
"bool": [False, False],
"string": ["A", "z"]
}]


@pytest.fixture
def record_fields_schema():
tf_features_images = features.FeaturesDict(
{"images": features.Image(shape=(None, None, 3), dtype=np.uint8)})
tf_features_objects = features.FeaturesDict({
"objects":
features.Sequence({
"bbox": features.BBoxFeature(),
"id": np.int64
}),
})

return pa.schema([
pa.field("int64", pa.int64()),
pa.field("string", pa.string()),
pa.field("images", TfFeatures(tf_features_images)),
pa.field("objects", TfFeatures(tf_features_objects))
])


@pytest.fixture
def record_fields_input_data():
return [{
"int64": [1, 2, 3],
"string": ["a", "b", "c"],
"images": [b"images0", b"images1", b"images2"],
"objects": [b"objects0", b"objects1", b"objects2"]
}, {
"int64": [0, 10],
"string": ["A", "z"],
"images": [b"images3", b"images4"],
"objects": [b"objects3", b"objects4"]
}]
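
A note on how these fixtures are wired up: pytest discovers conftest.py automatically, so any test under python/tests/core/ops/ can receive the fixtures above simply by naming them as parameters. A minimal sketch of a consumer (illustrative only, not part of this commit; the test name is hypothetical):

import pyarrow as pa


def test_fixture_shapes_agree(all_types_schema, all_types_input_data):
  # Every input batch should convert cleanly into a table carrying the
  # shared schema, so tests can mix pydict and Arrow-table inputs freely.
  for batch in all_types_input_data:
    table = pa.Table.from_pydict(batch, schema=all_types_schema)
    assert table.schema == all_types_schema
    assert table.num_rows == len(batch["int64"])
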
66 changes: 11 additions & 55 deletions python/tests/core/ops/test_append.py
@@ -16,46 +16,27 @@
-import numpy as np
 import pyarrow as pa
 import pyarrow.parquet as pq
-from tensorflow_datasets import features  # type: ignore[import-untyped]

 from space.core.ops import LocalAppendOp
 import space.core.proto.metadata_pb2 as meta
-from space.core.schema.types import TfFeatures
 from space.core.storage import Storage


 class TestLocalAppendOp:

   # TODO: to add tests using Arrow table input.

-  def test_write_pydict_all_types(self, tmp_path):
+  def test_write_pydict_all_types(self, tmp_path, all_types_schema,
+                                  all_types_input_data):
     location = tmp_path / "dataset"
-    schema = pa.schema([
-        pa.field("int64", pa.int64()),
-        pa.field("float64", pa.float64()),
-        pa.field("bool", pa.bool_()),
-        pa.field("string", pa.string())
-    ])
     storage = Storage.create(location=str(location),
-                             schema=schema,
+                             schema=all_types_schema,
                              primary_keys=["int64"],
                              record_fields=[])

     op = LocalAppendOp(str(location), storage.metadata)

-    # TODO: the test should cover all types supported by column stats.
-    op.write({
-        "int64": [1, 2, 3],
-        "float64": [0.1, 0.2, 0.3],
-        "bool": [True, False, False],
-        "string": ["a", "b", "c"]
-    })
-    op.write({
-        "int64": [0, 10],
-        "float64": [-0.1, 100.0],
-        "bool": [False, False],
-        "string": ["A", "z"]
-    })
+    for batch in all_types_input_data:
+      op.write(batch)

     patch = op.finish()
     assert patch is not None
@@ -81,43 +62,18 @@ def test_write_pydict_all_types(self, tmp_path):
     assert patch.storage_statistics_update == meta.StorageStatistics(
         num_rows=5, index_compressed_bytes=114, index_uncompressed_bytes=126)

-  def test_write_pydict_with_record_fields(self, tmp_path):
-    tf_features_images = features.FeaturesDict(
-        {"images": features.Image(shape=(None, None, 3), dtype=np.uint8)})
-    tf_features_objects = features.FeaturesDict({
-        "objects":
-            features.Sequence({
-                "bbox": features.BBoxFeature(),
-                "id": np.int64
-            }),
-    })
-
+  def test_write_pydict_with_record_fields(self, tmp_path,
+                                           record_fields_schema,
+                                           record_fields_input_data):
     location = tmp_path / "dataset"
-    schema = pa.schema([
-        pa.field("int64", pa.int64()),
-        pa.field("string", pa.string()),
-        pa.field("images", TfFeatures(tf_features_images)),
-        pa.field("objects", TfFeatures(tf_features_objects))
-    ])
     storage = Storage.create(location=str(location),
-                             schema=schema,
+                             schema=record_fields_schema,
                              primary_keys=["int64"],
                              record_fields=["images", "objects"])

     op = LocalAppendOp(str(location), storage.metadata)

-    op.write({
-        "int64": [1, 2, 3],
-        "string": ["a", "b", "c"],
-        "images": [b"images0", b"images1", b"images2"],
-        "objects": [b"objects0", b"objects1", b"objects2"]
-    })
-    op.write({
-        "int64": [0, 10],
-        "string": ["A", "z"],
-        "images": [b"images3", b"images4"],
-        "objects": [b"objects3", b"objects4"]
-    })
+    for batch in record_fields_input_data:
+      op.write(batch)

     patch = op.finish()
     assert patch is not None
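
The TODO above ("to add tests using Arrow table input") could reuse the same fixtures. A hedged sketch, assuming LocalAppendOp.write accepts a pa.Table the way the read tests below already feed it (the test name is hypothetical):

import pyarrow as pa

from space.core.ops import LocalAppendOp
from space.core.storage import Storage


def test_write_arrow_table_all_types(tmp_path, all_types_schema,
                                     all_types_input_data):
  location = tmp_path / "dataset"
  storage = Storage.create(location=str(location),
                           schema=all_types_schema,
                           primary_keys=["int64"],
                           record_fields=[])

  op = LocalAppendOp(str(location), storage.metadata)
  # Convert each pydict batch to an Arrow table before writing.
  for batch in all_types_input_data:
    op.write(pa.Table.from_pydict(batch))

  assert op.finish() is not None
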
81 changes: 14 additions & 67 deletions python/tests/core/ops/test_read.py
@@ -12,54 +12,29 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import List
-import numpy as np
 import pyarrow as pa
 import pyarrow.compute as pc
 import pyarrow.parquet as pq
-from tensorflow_datasets import features  # type: ignore[import-untyped]

 from space.core.ops import LocalAppendOp
 from space.core.ops import FileSetReadOp
 import space.core.proto.metadata_pb2 as meta
-from space.core.schema.types import TfFeatures
 from space.core.storage import Storage


 class TestFileSetReadOp:

-  # TODO: to add tests using Arrow table input.
-  def test_read_all_types(self, tmp_path):
+  def test_read_all_types(self, tmp_path, all_types_schema,
+                          all_types_input_data):
     location = tmp_path / "dataset"
-    schema = pa.schema([
-        pa.field("int64", pa.int64()),
-        pa.field("float64", pa.float64()),
-        pa.field("bool", pa.bool_()),
-        pa.field("string", pa.string())
-    ])
     storage = Storage.create(location=str(location),
-                             schema=schema,
+                             schema=all_types_schema,
                              primary_keys=["int64"],
                              record_fields=[])

     append_op = LocalAppendOp(str(location), storage.metadata)

-    # TODO: the test should cover all types supported by column stats.
-    input_data = [
-        pa.Table.from_pydict({
-            "int64": [1, 2, 3],
-            "float64": [0.1, 0.2, 0.3],
-            "bool": [True, False, False],
-            "string": ["a", "b", "c"]
-        }),
-        pa.Table.from_pydict({
-            "int64": [0, 10],
-            "float64": [-0.1, 100.0],
-            "bool": [False, False],
-            "string": ["A", "z"]
-        })
-    ]
-
+    input_data = [pa.Table.from_pydict(d) for d in all_types_input_data]
     for batch in input_data:
       append_op.write(batch)

@@ -72,10 +47,12 @@ def test_read_all_types(self, tmp_path):
     assert list(iter(read_op))[0] == pa.concat_tables(input_data)

     # Test FileSetReadOp with filters.
-    read_op = FileSetReadOp(str(location),
-                            storage.metadata,
-                            data_files,
-                            filter_=(pc.field("bool") == True))
+    read_op = FileSetReadOp(
+        str(location),
+        storage.metadata,
+        data_files,
+        # pylint: disable=singleton-comparison
+        filter_=pc.field("bool") == True)
     results = list(iter(read_op))
     assert len(results) == 1
     assert list(iter(read_op))[0] == pa.Table.from_pydict({
@@ -85,46 +62,16 @@ def test_read_all_types(self, tmp_path):
"string": ["a"]
})

def test_read_with_record_filters(self, tmp_path):
tf_features_images = features.FeaturesDict(
{"images": features.Image(shape=(None, None, 3), dtype=np.uint8)})
tf_features_objects = features.FeaturesDict({
"objects":
features.Sequence({
"bbox": features.BBoxFeature(),
"id": np.int64
}),
})

def test_read_with_record_filters(self, tmp_path, record_fields_schema,
record_fields_input_data):
location = tmp_path / "dataset"
schema = pa.schema([
pa.field("int64", pa.int64()),
pa.field("string", pa.string()),
pa.field("images", TfFeatures(tf_features_images)),
pa.field("objects", TfFeatures(tf_features_objects))
])
storage = Storage.create(location=str(location),
schema=schema,
schema=record_fields_schema,
primary_keys=["int64"],
record_fields=["images", "objects"])

append_op = LocalAppendOp(str(location), storage.metadata)

input_data = [
pa.Table.from_pydict({
"int64": [1, 2, 3],
"string": ["a", "b", "c"],
"images": [b"images0", b"images1", b"images2"],
"objects": [b"objects0", b"objects1", b"objects2"]
}),
pa.Table.from_pydict({
"int64": [0, 10],
"string": ["A", "z"],
"images": [b"images3", b"images4"],
"objects": [b"objects3", b"objects4"]
})
]

input_data = [pa.Table.from_pydict(d) for d in record_fields_input_data]
for batch in input_data:
append_op.write(batch)

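
A side note on the filter rewrite above: pc.field("bool") == True builds a pyarrow compute Expression rather than a Python bool, so the comparison cannot be spelled `is True`; suppressing pylint's singleton-comparison warning is the usual workaround. A small sketch (assuming the filter is ultimately evaluated by pyarrow, which accepts any boolean-typed Expression):

import pyarrow.compute as pc

# Both expressions select rows where the "bool" column is true.
filter_eq = pc.field("bool") == True  # pylint: disable=singleton-comparison
filter_bare = pc.field("bool")  # equivalent predicate, no lint suppression
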
