unit test refactoring
coufon committed Dec 24, 2023
1 parent f7b90d7 commit 7759cab
Showing 3 changed files with 106 additions and 122 deletions.
81 changes: 81 additions & 0 deletions python/tests/core/ops/conftest.py
@@ -0,0 +1,81 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import pyarrow as pa
import pytest
from tensorflow_datasets import features # type: ignore[import-untyped]

from space.core.schema.types import TfFeatures


# TODO: the test should cover all types supported by column stats.
@pytest.fixture
def all_types_schema():
return pa.schema([
pa.field("int64", pa.int64()),
pa.field("float64", pa.float64()),
pa.field("bool", pa.bool_()),
pa.field("string", pa.string())
])


@pytest.fixture
def all_types_input_data():
return [{
"int64": [1, 2, 3],
"float64": [0.1, 0.2, 0.3],
"bool": [True, False, False],
"string": ["a", "b", "c"]
}, {
"int64": [0, 10],
"float64": [-0.1, 100.0],
"bool": [False, False],
"string": ["A", "z"]
}]


@pytest.fixture
def record_fields_schema():
tf_features_images = features.FeaturesDict(
{"images": features.Image(shape=(None, None, 3), dtype=np.uint8)})
tf_features_objects = features.FeaturesDict({
"objects":
features.Sequence({
"bbox": features.BBoxFeature(),
"id": np.int64
}),
})

return pa.schema([
pa.field("int64", pa.int64()),
pa.field("string", pa.string()),
pa.field("images", TfFeatures(tf_features_images)),
pa.field("objects", TfFeatures(tf_features_objects))
])


@pytest.fixture
def record_fields_input_data():
return [{
"int64": [1, 2, 3],
"string": ["a", "b", "c"],
"images": [b"images0", b"images1", b"images2"],
"objects": [b"objects0", b"objects1", b"objects2"]
}, {
"int64": [0, 10],
"string": ["A", "z"],
"images": [b"images3", b"images4"],
"objects": [b"objects3", b"objects4"]
}]
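
A note on how these fixtures are wired up: pytest discovers conftest.py automatically, so any test under python/tests/core/ops/ can receive the fixtures above simply by naming them as parameters. A minimal sketch of a consumer (illustrative only, not part of this commit; the test name is hypothetical):

import pyarrow as pa


def test_fixture_shapes_agree(all_types_schema, all_types_input_data):
  # Every input batch should convert cleanly into a table carrying the
  # shared schema, so tests can mix pydict and Arrow-table inputs freely.
  for batch in all_types_input_data:
    table = pa.Table.from_pydict(batch, schema=all_types_schema)
    assert table.schema == all_types_schema
    assert table.num_rows == len(batch["int64"])
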
66 changes: 11 additions & 55 deletions python/tests/core/ops/test_append.py
@@ -16,46 +16,27 @@
-import numpy as np
 import pyarrow as pa
 import pyarrow.parquet as pq
-from tensorflow_datasets import features  # type: ignore[import-untyped]

 from space.core.ops import LocalAppendOp
 import space.core.proto.metadata_pb2 as meta
-from space.core.schema.types import TfFeatures
 from space.core.storage import Storage


 class TestLocalAppendOp:

   # TODO: to add tests using Arrow table input.

-  def test_write_pydict_all_types(self, tmp_path):
+  def test_write_pydict_all_types(self, tmp_path, all_types_schema,
+                                  all_types_input_data):
     location = tmp_path / "dataset"
-    schema = pa.schema([
-        pa.field("int64", pa.int64()),
-        pa.field("float64", pa.float64()),
-        pa.field("bool", pa.bool_()),
-        pa.field("string", pa.string())
-    ])
     storage = Storage.create(location=str(location),
-                             schema=schema,
+                             schema=all_types_schema,
                              primary_keys=["int64"],
                              record_fields=[])

     op = LocalAppendOp(str(location), storage.metadata)

-    # TODO: the test should cover all types supported by column stats.
-    op.write({
-        "int64": [1, 2, 3],
-        "float64": [0.1, 0.2, 0.3],
-        "bool": [True, False, False],
-        "string": ["a", "b", "c"]
-    })
-    op.write({
-        "int64": [0, 10],
-        "float64": [-0.1, 100.0],
-        "bool": [False, False],
-        "string": ["A", "z"]
-    })
+    for batch in all_types_input_data:
+      op.write(batch)

     patch = op.finish()
     assert patch is not None
@@ -81,43 +62,18 @@ def test_write_pydict_all_types(self, tmp_path):
     assert patch.storage_statistics_update == meta.StorageStatistics(
         num_rows=5, index_compressed_bytes=114, index_uncompressed_bytes=126)

-  def test_write_pydict_with_record_fields(self, tmp_path):
-    tf_features_images = features.FeaturesDict(
-        {"images": features.Image(shape=(None, None, 3), dtype=np.uint8)})
-    tf_features_objects = features.FeaturesDict({
-        "objects":
-            features.Sequence({
-                "bbox": features.BBoxFeature(),
-                "id": np.int64
-            }),
-    })
-
+  def test_write_pydict_with_record_fields(self, tmp_path,
+                                           record_fields_schema,
+                                           record_fields_input_data):
     location = tmp_path / "dataset"
-    schema = pa.schema([
-        pa.field("int64", pa.int64()),
-        pa.field("string", pa.string()),
-        pa.field("images", TfFeatures(tf_features_images)),
-        pa.field("objects", TfFeatures(tf_features_objects))
-    ])
     storage = Storage.create(location=str(location),
-                             schema=schema,
+                             schema=record_fields_schema,
                              primary_keys=["int64"],
                              record_fields=["images", "objects"])

     op = LocalAppendOp(str(location), storage.metadata)

-    op.write({
-        "int64": [1, 2, 3],
-        "string": ["a", "b", "c"],
-        "images": [b"images0", b"images1", b"images2"],
-        "objects": [b"objects0", b"objects1", b"objects2"]
-    })
-    op.write({
-        "int64": [0, 10],
-        "string": ["A", "z"],
-        "images": [b"images3", b"images4"],
-        "objects": [b"objects3", b"objects4"]
-    })
+    for batch in record_fields_input_data:
+      op.write(batch)

     patch = op.finish()
     assert patch is not None
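
The TODO above ("to add tests using Arrow table input") could reuse the same fixtures. A hedged sketch, assuming LocalAppendOp.write accepts a pa.Table the way the read tests below already feed it (the test name is hypothetical):

import pyarrow as pa

from space.core.ops import LocalAppendOp
from space.core.storage import Storage


def test_write_arrow_table_all_types(tmp_path, all_types_schema,
                                     all_types_input_data):
  location = tmp_path / "dataset"
  storage = Storage.create(location=str(location),
                           schema=all_types_schema,
                           primary_keys=["int64"],
                           record_fields=[])

  op = LocalAppendOp(str(location), storage.metadata)
  # Convert each pydict batch to an Arrow table before writing.
  for batch in all_types_input_data:
    op.write(pa.Table.from_pydict(batch))

  assert op.finish() is not None
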
81 changes: 14 additions & 67 deletions python/tests/core/ops/test_read.py
@@ -12,54 +12,29 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import List
-import numpy as np
 import pyarrow as pa
 import pyarrow.compute as pc
 import pyarrow.parquet as pq
-from tensorflow_datasets import features  # type: ignore[import-untyped]

 from space.core.ops import LocalAppendOp
 from space.core.ops import FileSetReadOp
 import space.core.proto.metadata_pb2 as meta
-from space.core.schema.types import TfFeatures
 from space.core.storage import Storage


 class TestFileSetReadOp:

-  # TODO: to add tests using Arrow table input.
-  def test_read_all_types(self, tmp_path):
+  def test_read_all_types(self, tmp_path, all_types_schema,
+                          all_types_input_data):
     location = tmp_path / "dataset"
-    schema = pa.schema([
-        pa.field("int64", pa.int64()),
-        pa.field("float64", pa.float64()),
-        pa.field("bool", pa.bool_()),
-        pa.field("string", pa.string())
-    ])
     storage = Storage.create(location=str(location),
-                             schema=schema,
+                             schema=all_types_schema,
                              primary_keys=["int64"],
                              record_fields=[])

     append_op = LocalAppendOp(str(location), storage.metadata)

-    # TODO: the test should cover all types supported by column stats.
-    input_data = [
-        pa.Table.from_pydict({
-            "int64": [1, 2, 3],
-            "float64": [0.1, 0.2, 0.3],
-            "bool": [True, False, False],
-            "string": ["a", "b", "c"]
-        }),
-        pa.Table.from_pydict({
-            "int64": [0, 10],
-            "float64": [-0.1, 100.0],
-            "bool": [False, False],
-            "string": ["A", "z"]
-        })
-    ]
-
+    input_data = [pa.Table.from_pydict(d) for d in all_types_input_data]
     for batch in input_data:
       append_op.write(batch)

@@ -72,10 +47,12 @@ def test_read_all_types(self, tmp_path):
     assert list(iter(read_op))[0] == pa.concat_tables(input_data)

     # Test FileSetReadOp with filters.
-    read_op = FileSetReadOp(str(location),
-                            storage.metadata,
-                            data_files,
-                            filter_=(pc.field("bool") == True))
+    read_op = FileSetReadOp(
+        str(location),
+        storage.metadata,
+        data_files,
+        # pylint: disable=singleton-comparison
+        filter_=pc.field("bool") == True)
     results = list(iter(read_op))
     assert len(results) == 1
     assert list(iter(read_op))[0] == pa.Table.from_pydict({
@@ -85,46 +62,16 @@ def test_read_all_types(self, tmp_path):
"string": ["a"]
})

def test_read_with_record_filters(self, tmp_path):
tf_features_images = features.FeaturesDict(
{"images": features.Image(shape=(None, None, 3), dtype=np.uint8)})
tf_features_objects = features.FeaturesDict({
"objects":
features.Sequence({
"bbox": features.BBoxFeature(),
"id": np.int64
}),
})

def test_read_with_record_filters(self, tmp_path, record_fields_schema,
record_fields_input_data):
location = tmp_path / "dataset"
schema = pa.schema([
pa.field("int64", pa.int64()),
pa.field("string", pa.string()),
pa.field("images", TfFeatures(tf_features_images)),
pa.field("objects", TfFeatures(tf_features_objects))
])
storage = Storage.create(location=str(location),
schema=schema,
schema=record_fields_schema,
primary_keys=["int64"],
record_fields=["images", "objects"])

append_op = LocalAppendOp(str(location), storage.metadata)

input_data = [
pa.Table.from_pydict({
"int64": [1, 2, 3],
"string": ["a", "b", "c"],
"images": [b"images0", b"images1", b"images2"],
"objects": [b"objects0", b"objects1", b"objects2"]
}),
pa.Table.from_pydict({
"int64": [0, 10],
"string": ["A", "z"],
"images": [b"images3", b"images4"],
"objects": [b"objects3", b"objects4"]
})
]

input_data = [pa.Table.from_pydict(d) for d in record_fields_input_data]
for batch in input_data:
append_op.write(batch)

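
A side note on the filter rewrite above: pc.field("bool") == True builds a pyarrow compute Expression rather than a Python bool, so the comparison cannot be spelled `is True`; suppressing pylint's singleton-comparison warning is the usual workaround. A small sketch (assuming the filter is ultimately evaluated by pyarrow, which accepts any boolean-typed Expression):

import pyarrow.compute as pc

# Both expressions select rows where the "bool" column is true.
filter_eq = pc.field("bool") == True  # pylint: disable=singleton-comparison
filter_bare = pc.field("bool")  # equivalent predicate, no lint suppression
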
