From bd6a33f9ca51f2f6a47d886fca8025db7f948bc0 Mon Sep 17 00:00:00 2001 From: coufon Date: Sun, 17 Dec 2023 06:50:12 +0000 Subject: [PATCH] Add init metadata proto, substrait proto, utilities, and storage class --- .github/workflows/python-ci.yml | 4 +- .gitignore | 4 + README.md | 2 +- python/build_proto.sh | 8 + python/pyproject.toml | 48 +++++ python/src/space/__init__.py | 13 ++ python/src/space/core/__init__.py | 13 ++ python/src/space/core/fs/__init__.py | 13 ++ python/src/space/core/fs/arrow.py | 63 +++++++ python/src/space/core/fs/base.py | 36 ++++ python/src/space/core/fs/factory.py | 24 +++ python/src/space/core/proto/__init__.py | 13 ++ python/src/space/core/proto/metadata.proto | 78 ++++++++ python/src/space/core/proto/metadata_pb2.py | 39 ++++ python/src/space/core/proto/metadata_pb2.pyi | 188 +++++++++++++++++++ python/src/space/core/storage.py | 91 +++++++++ python/src/space/core/utils/__init__.py | 13 ++ python/src/space/core/utils/paths.py | 96 ++++++++++ python/src/space/core/utils/protos.py | 31 +++ python/src/space/core/utils/uuids.py | 27 +++ python/tests/core/fs/test_arrow.py | 59 ++++++ python/tests/core/test_storage.py | 77 ++++++++ python/tests/core/utils/test_paths.py | 94 ++++++++++ python/tests/core/utils/test_protos.py | 21 +++ python/tests/core/utils/test_uuids.py | 25 +++ 25 files changed, 1077 insertions(+), 3 deletions(-) create mode 100644 python/pyproject.toml create mode 100644 python/src/space/__init__.py create mode 100644 python/src/space/core/__init__.py create mode 100644 python/src/space/core/fs/__init__.py create mode 100644 python/src/space/core/fs/arrow.py create mode 100644 python/src/space/core/fs/base.py create mode 100644 python/src/space/core/fs/factory.py create mode 100644 python/src/space/core/proto/__init__.py create mode 100644 python/src/space/core/proto/metadata.proto create mode 100644 python/src/space/core/proto/metadata_pb2.py create mode 100644 python/src/space/core/proto/metadata_pb2.pyi create mode 100644 python/src/space/core/storage.py create mode 100644 python/src/space/core/utils/__init__.py create mode 100644 python/src/space/core/utils/paths.py create mode 100644 python/src/space/core/utils/protos.py create mode 100644 python/src/space/core/utils/uuids.py create mode 100644 python/tests/core/fs/test_arrow.py create mode 100644 python/tests/core/test_storage.py create mode 100644 python/tests/core/utils/test_paths.py create mode 100644 python/tests/core/utils/test_protos.py create mode 100644 python/tests/core/utils/test_uuids.py diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml index 2ff1b86..b1cf6df 100644 --- a/.github/workflows/python-ci.yml +++ b/.github/workflows/python-ci.yml @@ -35,11 +35,11 @@ jobs: - name: Install test dependencies run: | python -m pip install --upgrade pip - pip install mypy pylint pytest pyarrow-stubs + pip install mypy pylint pytest mock - name: Install runtime dependencies working-directory: ./python run: | - pip install . + pip install .[dev] - name: Analysing the code with pylint working-directory: ./python/src run: | diff --git a/.gitignore b/.gitignore index 312771b..11402ca 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,10 @@ __pycache__/ *.egg-info/ build/ dist/ +.pytype/ +out/ +.mypy_cache/ +.pytest_cache/ # VSCode files .vscode/ diff --git a/README.md b/README.md index 8d8e1f1..11ecb3c 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Space: LakeHouse for Machine Learning Datasets +# Space: Storage Framework for Machine Learning Datasets [![Python CI](https://github.com/google/space/actions/workflows/python-ci.yml/badge.svg?branch=main)](https://github.com/google/space/actions/workflows/python-ci.yml) diff --git a/python/build_proto.sh b/python/build_proto.sh index eac3119..a3023cc 100644 --- a/python/build_proto.sh +++ b/python/build_proto.sh @@ -25,3 +25,11 @@ protoc --python_out="${SRC_FOLDER}" \ --mypy_out="${SRC_FOLDER}" \ substrait/*.proto substrait/extensions/*.proto \ --proto_path=. + +# Build Space protos. +cd "${SRC_FOLDER}" +protoc --python_out=. \ + --mypy_out=. \ + space/core/proto/*.proto \ + --proto_path=. \ + --proto_path=../../substrait/proto diff --git a/python/pyproject.toml b/python/pyproject.toml new file mode 100644 index 0000000..7db728d --- /dev/null +++ b/python/pyproject.toml @@ -0,0 +1,48 @@ +[project] +name = "space" +version = "0.0.1" +authors = [ + { name="Space team", email="no-reply@google.com" }, +] +description = "A storage framework for machine learning datasets" +license = {text = "Apache-2.0"} +classifiers = [ + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11" +] +requires-python = ">=3.8" +dependencies = [ + "protobuf", + "pyarrow >= 14.0.0", +] + +[project.optional-dependencies] +dev = [ + "pyarrow-stubs", + "types-protobuf" +] + +[project.urls] +Homepage = "https://github.com/google/space" +Issues = "https://github.com/google/space/issues" + +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" + +[tool.pytest.ini_options] +addopts = ["--import-mode=importlib"] +pythonpath = ["src"] + +[tool.pylint.format] +max-line-length = 80 +indent-string = ' ' +disable = ['fixme'] + +[tool.pylint.MAIN] +ignore = 'space/core/proto' +ignored-modules = ['space.core.proto', 'google.protobuf'] diff --git a/python/src/space/__init__.py b/python/src/space/__init__.py new file mode 100644 index 0000000..5678014 --- /dev/null +++ b/python/src/space/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/src/space/core/__init__.py b/python/src/space/core/__init__.py new file mode 100644 index 0000000..5678014 --- /dev/null +++ b/python/src/space/core/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/src/space/core/fs/__init__.py b/python/src/space/core/fs/__init__.py new file mode 100644 index 0000000..5678014 --- /dev/null +++ b/python/src/space/core/fs/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/src/space/core/fs/arrow.py b/python/src/space/core/fs/arrow.py new file mode 100644 index 0000000..a1b22f9 --- /dev/null +++ b/python/src/space/core/fs/arrow.py @@ -0,0 +1,63 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Arrow file system implementation.""" + +from abc import abstractmethod + +from google.protobuf import message +from google.protobuf import text_format +from pyarrow import fs + +from space.core.fs.base import BaseFileSystem +from space.core.utils.protos import proto_to_text +from space.core.utils.uuids import random_id + + +class ArrowFileSystem(BaseFileSystem): + """Abstract Arrow file system.""" + + def __init__(self): + super().__init__() + self._fs = self.create_fs() + + @abstractmethod + def create_fs(self) -> fs.FileSystem: + """Create a new underlying Arrow file system.""" + + def create_dir(self, dir_path: str) -> None: + self._fs.create_dir(dir_path) + + def write_proto(self, file_path: str, msg: message.Message) -> None: + # TODO: the current implement overwrite an existing file; to support an + # to disallow overwrite. + tmp_file_path = f"{file_path}.{random_id()}.tmp" + + with self._fs.open_output_stream(tmp_file_path) as f: + f.write(proto_to_text(msg)) + + self._fs.move(tmp_file_path, file_path) + + def read_proto(self, file_path: str, + empty_msg: message.Message) -> message.Message: + with self._fs.open_input_file(file_path) as f: + result = text_format.Parse(f.readall(), empty_msg) + return result + + +class ArrowLocalFileSystem(ArrowFileSystem): + """Arrow local file system implementation.""" + + def create_fs(self) -> fs.FileSystem: + return fs.LocalFileSystem() diff --git a/python/src/space/core/fs/base.py b/python/src/space/core/fs/base.py new file mode 100644 index 0000000..266f9a8 --- /dev/null +++ b/python/src/space/core/fs/base.py @@ -0,0 +1,36 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Abstract base file system.""" + +from abc import ABC, abstractmethod + +from google.protobuf import message + + +class BaseFileSystem(ABC): + """Abstract file system.""" + + @abstractmethod + def create_dir(self, dir_path: str) -> None: + """Create a new directory.""" + + @abstractmethod + def write_proto(self, file_path: str, msg: message.Message) -> None: + """Write a proto message in text format to a file.""" + + @abstractmethod + def read_proto(self, file_path: str, + empty_msg: message.Message) -> message.Message: + """Read a proto message in text format from a file.""" diff --git a/python/src/space/core/fs/factory.py b/python/src/space/core/fs/factory.py new file mode 100644 index 0000000..fe1e92e --- /dev/null +++ b/python/src/space/core/fs/factory.py @@ -0,0 +1,24 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""File system factory implementation.""" + +from space.core.fs.arrow import ArrowLocalFileSystem +from space.core.fs.base import BaseFileSystem + + +def create_fs(path: str) -> BaseFileSystem: # pylint: disable=unused-argument + """Create a file system based on the path.""" + # TODO: to support more file systems. + return ArrowLocalFileSystem() diff --git a/python/src/space/core/proto/__init__.py b/python/src/space/core/proto/__init__.py new file mode 100644 index 0000000..5678014 --- /dev/null +++ b/python/src/space/core/proto/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/src/space/core/proto/metadata.proto b/python/src/space/core/proto/metadata.proto new file mode 100644 index 0000000..ecc4739 --- /dev/null +++ b/python/src/space/core/proto/metadata.proto @@ -0,0 +1,78 @@ +// Copyright 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package space.proto; + +import "google/protobuf/timestamp.proto"; +import "substrait/type.proto"; + +// Record the current storage metadata path in a static local file. +// A mutation to storage generates a new metadata file. The current metadata +// file path is either persisted in the entry point file, or an external +// catalog (not implemented yet). +// NEXT_ID: 2 +message EntryPoint { + // File path of the current storage metadata file. + string metadata_file = 1; +} + +// Metadata persisting the current status of a storage, including logical +// metadata such as schema, and physical metadata persisted as a history of +// snapshots +// NEXT_ID: 7 +message StorageMetadata { + // Create time of the storage. + google.protobuf.Timestamp create_time = 1; + + // Last update time of the storage. + google.protobuf.Timestamp last_update_time = 2; + + // The storage type. + enum Type { + TYPE_UNSPECIFIED = 0; + // A Space dataset. + DATASET = 1; + } + Type type = 3; + + // The dataset schema. + Schema schema = 4; + + // The current snapshot ID. + int64 current_snapshot_id = 5; + + // All alive snapshots. + map snapshots = 6; +} + +// The storage logical schema where user provided types are persisted instead +// of their physical storage format. +// NEXT_ID: 2 +message Schema { + // Fields persisted as Substrait named struct. + substrait.NamedStruct fields = 1; +} + +// Storage snapshot persisting physical metadata such as manifest file paths. +// It is used for obtaining all alive data file paths for a given snapshot. +// NEXT_ID: 3 +message Snapshot { + // The snapshot ID. + int64 snapshot_id = 1; + + // The create time of the snapshot. + google.protobuf.Timestamp create_time = 2; +} diff --git a/python/src/space/core/proto/metadata_pb2.py b/python/src/space/core/proto/metadata_pb2.py new file mode 100644 index 0000000..7121cf9 --- /dev/null +++ b/python/src/space/core/proto/metadata_pb2.py @@ -0,0 +1,39 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: space/core/proto/metadata.proto +"""Generated protocol buffer code.""" +from google.protobuf.internal import builder as _builder +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import symbol_database as _symbol_database +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__pb2 +from substrait import type_pb2 as substrait_dot_type__pb2 + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x1fspace/core/proto/metadata.proto\x12\x0bspace.proto\x1a\x1fgoogle/protobuf/timestamp.proto\x1a\x14substrait/type.proto\"#\n\nEntryPoint\x12\x15\n\rmetadata_file\x18\x01 \x01(\t\"\x9f\x03\n\x0fStorageMetadata\x12/\n\x0b\x63reate_time\x18\x01 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12\x34\n\x10last_update_time\x18\x02 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12/\n\x04type\x18\x03 \x01(\x0e\x32!.space.proto.StorageMetadata.Type\x12#\n\x06schema\x18\x04 \x01(\x0b\x32\x13.space.proto.Schema\x12\x1b\n\x13\x63urrent_snapshot_id\x18\x05 \x01(\x03\x12>\n\tsnapshots\x18\x06 \x03(\x0b\x32+.space.proto.StorageMetadata.SnapshotsEntry\x1aG\n\x0eSnapshotsEntry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12$\n\x05value\x18\x02 \x01(\x0b\x32\x15.space.proto.Snapshot:\x02\x38\x01\")\n\x04Type\x12\x14\n\x10TYPE_UNSPECIFIED\x10\x00\x12\x0b\n\x07\x44\x41TASET\x10\x01\"0\n\x06Schema\x12&\n\x06\x66ields\x18\x01 \x01(\x0b\x32\x16.substrait.NamedStruct\"P\n\x08Snapshot\x12\x13\n\x0bsnapshot_id\x18\x01 \x01(\x03\x12/\n\x0b\x63reate_time\x18\x02 \x01(\x0b\x32\x1a.google.protobuf.Timestampb\x06proto3') + +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'space.core.proto.metadata_pb2', globals()) +if _descriptor._USE_C_DESCRIPTORS == False: + + DESCRIPTOR._options = None + _STORAGEMETADATA_SNAPSHOTSENTRY._options = None + _STORAGEMETADATA_SNAPSHOTSENTRY._serialized_options = b'8\001' + _ENTRYPOINT._serialized_start=103 + _ENTRYPOINT._serialized_end=138 + _STORAGEMETADATA._serialized_start=141 + _STORAGEMETADATA._serialized_end=556 + _STORAGEMETADATA_SNAPSHOTSENTRY._serialized_start=442 + _STORAGEMETADATA_SNAPSHOTSENTRY._serialized_end=513 + _STORAGEMETADATA_TYPE._serialized_start=515 + _STORAGEMETADATA_TYPE._serialized_end=556 + _SCHEMA._serialized_start=558 + _SCHEMA._serialized_end=606 + _SNAPSHOT._serialized_start=608 + _SNAPSHOT._serialized_end=688 +# @@protoc_insertion_point(module_scope) diff --git a/python/src/space/core/proto/metadata_pb2.pyi b/python/src/space/core/proto/metadata_pb2.pyi new file mode 100644 index 0000000..c72d4cd --- /dev/null +++ b/python/src/space/core/proto/metadata_pb2.pyi @@ -0,0 +1,188 @@ +""" +@generated by mypy-protobuf. Do not edit manually! +isort:skip_file +Copyright 2023 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" +import builtins +import collections.abc +import google.protobuf.descriptor +import google.protobuf.internal.containers +import google.protobuf.internal.enum_type_wrapper +import google.protobuf.message +import google.protobuf.timestamp_pb2 +import substrait.type_pb2 +import sys +import typing + +if sys.version_info >= (3, 10): + import typing as typing_extensions +else: + import typing_extensions + +DESCRIPTOR: google.protobuf.descriptor.FileDescriptor + +@typing_extensions.final +class EntryPoint(google.protobuf.message.Message): + """Record the current storage metadata path in a static local file. + A mutation to storage generates a new metadata file. The current metadata + file path is either persisted in the entry point file, or an external + catalog (not implemented yet). + NEXT_ID: 2 + """ + + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + METADATA_FILE_FIELD_NUMBER: builtins.int + metadata_file: builtins.str + """File path of the current storage metadata file.""" + def __init__( + self, + *, + metadata_file: builtins.str = ..., + ) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["metadata_file", b"metadata_file"]) -> None: ... + +global___EntryPoint = EntryPoint + +@typing_extensions.final +class StorageMetadata(google.protobuf.message.Message): + """Metadata persisting the current status of a storage, including logical + metadata such as schema, and physical metadata persisted as a history of + snapshots + NEXT_ID: 7 + """ + + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + class _Type: + ValueType = typing.NewType("ValueType", builtins.int) + V: typing_extensions.TypeAlias = ValueType + + class _TypeEnumTypeWrapper(google.protobuf.internal.enum_type_wrapper._EnumTypeWrapper[StorageMetadata._Type.ValueType], builtins.type): + DESCRIPTOR: google.protobuf.descriptor.EnumDescriptor + TYPE_UNSPECIFIED: StorageMetadata._Type.ValueType # 0 + DATASET: StorageMetadata._Type.ValueType # 1 + """A Space dataset.""" + + class Type(_Type, metaclass=_TypeEnumTypeWrapper): + """The storage type.""" + + TYPE_UNSPECIFIED: StorageMetadata.Type.ValueType # 0 + DATASET: StorageMetadata.Type.ValueType # 1 + """A Space dataset.""" + + @typing_extensions.final + class SnapshotsEntry(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + KEY_FIELD_NUMBER: builtins.int + VALUE_FIELD_NUMBER: builtins.int + key: builtins.int + @property + def value(self) -> global___Snapshot: ... + def __init__( + self, + *, + key: builtins.int = ..., + value: global___Snapshot | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["value", b"value"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["key", b"key", "value", b"value"]) -> None: ... + + CREATE_TIME_FIELD_NUMBER: builtins.int + LAST_UPDATE_TIME_FIELD_NUMBER: builtins.int + TYPE_FIELD_NUMBER: builtins.int + SCHEMA_FIELD_NUMBER: builtins.int + CURRENT_SNAPSHOT_ID_FIELD_NUMBER: builtins.int + SNAPSHOTS_FIELD_NUMBER: builtins.int + @property + def create_time(self) -> google.protobuf.timestamp_pb2.Timestamp: + """Create time of the storage.""" + @property + def last_update_time(self) -> google.protobuf.timestamp_pb2.Timestamp: + """Last update time of the storage.""" + type: global___StorageMetadata.Type.ValueType + @property + def schema(self) -> global___Schema: + """The dataset schema.""" + current_snapshot_id: builtins.int + """The current snapshot ID.""" + @property + def snapshots(self) -> google.protobuf.internal.containers.MessageMap[builtins.int, global___Snapshot]: + """All alive snapshots.""" + def __init__( + self, + *, + create_time: google.protobuf.timestamp_pb2.Timestamp | None = ..., + last_update_time: google.protobuf.timestamp_pb2.Timestamp | None = ..., + type: global___StorageMetadata.Type.ValueType = ..., + schema: global___Schema | None = ..., + current_snapshot_id: builtins.int = ..., + snapshots: collections.abc.Mapping[builtins.int, global___Snapshot] | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["create_time", b"create_time", "last_update_time", b"last_update_time", "schema", b"schema"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["create_time", b"create_time", "current_snapshot_id", b"current_snapshot_id", "last_update_time", b"last_update_time", "schema", b"schema", "snapshots", b"snapshots", "type", b"type"]) -> None: ... + +global___StorageMetadata = StorageMetadata + +@typing_extensions.final +class Schema(google.protobuf.message.Message): + """The storage logical schema where user provided types are persisted instead + of their physical storage format. + NEXT_ID: 2 + """ + + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + FIELDS_FIELD_NUMBER: builtins.int + @property + def fields(self) -> substrait.type_pb2.NamedStruct: + """Fields persisted as Substrait named struct.""" + def __init__( + self, + *, + fields: substrait.type_pb2.NamedStruct | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["fields", b"fields"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["fields", b"fields"]) -> None: ... + +global___Schema = Schema + +@typing_extensions.final +class Snapshot(google.protobuf.message.Message): + """Storage snapshot persisting physical metadata such as manifest file paths. + It is used for obtaining all alive data file paths for a given snapshot. + NEXT_ID: 3 + """ + + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + SNAPSHOT_ID_FIELD_NUMBER: builtins.int + CREATE_TIME_FIELD_NUMBER: builtins.int + snapshot_id: builtins.int + """The snapshot ID.""" + @property + def create_time(self) -> google.protobuf.timestamp_pb2.Timestamp: + """The create time of the snapshot.""" + def __init__( + self, + *, + snapshot_id: builtins.int = ..., + create_time: google.protobuf.timestamp_pb2.Timestamp | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["create_time", b"create_time"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["create_time", b"create_time", "snapshot_id", b"snapshot_id"]) -> None: ... + +global___Snapshot = Snapshot diff --git a/python/src/space/core/storage.py b/python/src/space/core/storage.py new file mode 100644 index 0000000..abc4a83 --- /dev/null +++ b/python/src/space/core/storage.py @@ -0,0 +1,91 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Underlying storage implementation for datasets.""" + +from __future__ import annotations +from typing import Optional + +import pyarrow as pa + +from space.core.fs.factory import create_fs +import space.core.proto.metadata_pb2 as meta +from space.core.utils import paths +from space.core.utils.protos import proto_now + +# Initial snapshot ID. +_INIT_SNAPSHOT_ID = 0 + + +class Storage(paths.StoragePaths): + """Storage manages data files by metadata using the Space format.""" + + def __init__(self, location: str, metadata: meta.StorageMetadata): + super().__init__(location) + self._metadata = metadata + self._fs = create_fs(location) + + def _initialize(self, metadata_path: str) -> None: + """Initialize a new storage by creating folders and files.""" + self._fs.create_dir(self._data_dir) + self._fs.create_dir(self._metadata_dir) + self._write_metadata(metadata_path, self._metadata) + + def _write_metadata( + self, + metadata_path: str, + metadata: meta.StorageMetadata, + ) -> None: + """Persist a StorageMetadata to files.""" + self._fs.write_proto(metadata_path, metadata) + self._fs.write_proto( + self._entry_point_file, + meta.EntryPoint(metadata_file=self.short_path(metadata_path))) + + @property + def metadata(self) -> meta.StorageMetadata: + """Return the storage metadata.""" + return self._metadata + + def snapshot(self, snapshot_id: Optional[int] = None) -> meta.Snapshot: + """Return the snapshot specified by a snapshot ID.""" + if snapshot_id is None: + snapshot_id = self._metadata.current_snapshot_id + + if snapshot_id in self._metadata.snapshots: + return self._metadata.snapshots[snapshot_id] + + raise RuntimeError(f"Snapshot {snapshot_id} is not found") + + @classmethod + def create(cls, location: str, logical_schema: pa.Schema) -> Storage: # pylint: disable=unused-argument + """Create a new empty dataset.""" + # TODO: to verify that location is an empty directory. + # TODO: to assign field IDs to schema fields. + + now = proto_now() + # TODO: to convert Arrow schema to Substrait schema. + metadata = meta.StorageMetadata(create_time=now, + last_update_time=now, + current_snapshot_id=_INIT_SNAPSHOT_ID, + type=meta.StorageMetadata.DATASET) + + new_metadata_path = paths.new_metadata_path(paths.metadata_dir(location)) + + snapshot = meta.Snapshot(snapshot_id=_INIT_SNAPSHOT_ID, create_time=now) + metadata.snapshots[metadata.current_snapshot_id].CopyFrom(snapshot) + + storage = Storage(location, metadata) + storage._initialize(new_metadata_path) + return storage diff --git a/python/src/space/core/utils/__init__.py b/python/src/space/core/utils/__init__.py new file mode 100644 index 0000000..5678014 --- /dev/null +++ b/python/src/space/core/utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/src/space/core/utils/paths.py b/python/src/space/core/utils/paths.py new file mode 100644 index 0000000..950069b --- /dev/null +++ b/python/src/space/core/utils/paths.py @@ -0,0 +1,96 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Utility methods for file paths.""" + +from os import path + +from space.core.utils.uuids import uuid_ + +_ENTRY_POINT_FILE = "entrypoint.txtpb" +_DATA_DIR = "data" +_METADATA_DIR = "metadata" + + +def new_index_file_path(data_dir_: str): + """Return a random index file path in a given data directory..""" + return path.join(data_dir_, f"index_{uuid_()}.parquet") + + +def new_record_file_path(data_dir_: str, field_name: str): + """Return a random record file path in a given data directory..""" + return path.join(data_dir_, f"{field_name}_{uuid_()}.arrowrecord") + + +def new_index_manifest_path(metadata_dir_: str): + """Return a random index manifest file path in a given metadata directory.""" + return path.join(metadata_dir_, f"index_manifest_{uuid_()}.parquet") + + +def new_record_manifest_path(metadata_dir_: str): + """Return a random record manifest file path in a given metadata directory.""" + return path.join(metadata_dir_, f"record_manifest_{uuid_()}.parquet") + + +def data_dir(location: str) -> str: + """Return the data directory path in a given location.""" + return path.join(location, _DATA_DIR) + + +def metadata_dir(location: str) -> str: + """Return the metadata directory path in a given location.""" + return path.join(location, _METADATA_DIR) + + +def entry_point_path(location: str) -> str: + """Return the static entry point file path in a given location.""" + return path.join(location, _METADATA_DIR, _ENTRY_POINT_FILE) + + +def new_metadata_path(metadata_dir_: str) -> str: + """Return a random metadata file path in a given metadata directory.""" + return path.join(metadata_dir_, f"metadata_{uuid_()}.txtpb") + + +class StoragePaths: + """Provides util methods for file and directory paths.""" + + def __init__(self, location: str): + self._location = location + + self._data_dir = data_dir(self._location) + self._metadata_dir = metadata_dir(self._location) + self._entry_point_file = entry_point_path(self._location) + + @property + def data_dir(self) -> str: + """Return the data directory.""" + return self._data_dir + + @property + def metadata_dir(self) -> str: + """Return the metadata directory.""" + return self._metadata_dir + + def short_path(self, full_path: str) -> str: + """Return the short relative path from a full path.""" + return path.relpath(full_path, self._location) + + def full_path(self, short_path: str) -> str: + """Return the full path from a full or short path.""" + return path.join(self._location, short_path) + + def new_metadata_path(self) -> str: + """Return a random metadata file path.""" + return new_metadata_path(self._metadata_dir) diff --git a/python/src/space/core/utils/protos.py b/python/src/space/core/utils/protos.py new file mode 100644 index 0000000..433dac4 --- /dev/null +++ b/python/src/space/core/utils/protos.py @@ -0,0 +1,31 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Utility methods for protos.""" + +from google.protobuf import message +from google.protobuf import text_format +from google.protobuf.timestamp_pb2 import Timestamp + + +def proto_to_text(msg: message.Message) -> bytes: + """Return the text format of a proto.""" + return text_format.MessageToString(msg).encode("utf-8") + + +def proto_now() -> Timestamp: + """Return the current time in the proto format.""" + timestamp = Timestamp() + timestamp.GetCurrentTime() + return timestamp diff --git a/python/src/space/core/utils/uuids.py b/python/src/space/core/utils/uuids.py new file mode 100644 index 0000000..64df970 --- /dev/null +++ b/python/src/space/core/utils/uuids.py @@ -0,0 +1,27 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Utility methods for UUIDs.""" + +import uuid + + +def uuid_() -> str: + """Return a new UUID.""" + return str(uuid.uuid4()) + + +def random_id() -> str: + """Return a short random ID.""" + return uuid_().split("-", maxsplit=1)[0] diff --git a/python/tests/core/fs/test_arrow.py b/python/tests/core/fs/test_arrow.py new file mode 100644 index 0000000..5a7a079 --- /dev/null +++ b/python/tests/core/fs/test_arrow.py @@ -0,0 +1,59 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from space.core.fs.arrow import ArrowLocalFileSystem +import space.core.proto.metadata_pb2 as meta + + +class TestArrowLocalFileSystem: + + @pytest.fixture + def fs(self): + return ArrowLocalFileSystem() + + def test_create_dir(self, tmp_path, fs): + dir_path = tmp_path / "test_create_dir" + fs.create_dir(str(dir_path)) + assert dir_path.exists() + + def _read_proto(self, fs, file_path): + read_msg = meta.StorageMetadata() + fs.read_proto(file_path, read_msg) + return read_msg + + def test_write_read_proto(self, tmp_path, fs): + dir_path = tmp_path / "test_write_read_proto" + fs.create_dir(str(dir_path)) + + file_path = str(dir_path / "output.txtpb") + write_msg = meta.StorageMetadata(current_snapshot_id=100) + fs.write_proto(file_path, write_msg) + assert dir_path.exists() + + assert self._read_proto(fs, file_path) == write_msg + + def test_overwrite_proto_file(self, tmp_path, fs): + dir_path = tmp_path / "test_write_read_proto" + fs.create_dir(str(dir_path)) + + file_path = str(dir_path / "output.txtpb") + write_msg = meta.StorageMetadata(current_snapshot_id=100) + fs.write_proto(file_path, write_msg) + assert self._read_proto(fs, file_path).current_snapshot_id == 100 + + write_msg = meta.StorageMetadata(current_snapshot_id=200) + fs.write_proto(file_path, write_msg) + assert self._read_proto(fs, file_path).current_snapshot_id == 200 diff --git a/python/tests/core/test_storage.py b/python/tests/core/test_storage.py new file mode 100644 index 0000000..f969752 --- /dev/null +++ b/python/tests/core/test_storage.py @@ -0,0 +1,77 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from google.protobuf.timestamp_pb2 import Timestamp +import pytest +from pathlib import Path + +from space.core.fs.arrow import ArrowLocalFileSystem +import space.core.proto.metadata_pb2 as meta +from space.core.storage import Storage +from space.core.utils.paths import _ENTRY_POINT_FILE + + +class TestStorage: + + _LOCATION = "location" + _SNAPSHOT_ID = 100 + + @pytest.fixture + def metadata(self): + current_snapshot_id = self._SNAPSHOT_ID + now = Timestamp(seconds=123456) + metadata = meta.StorageMetadata(create_time=now, + last_update_time=now, + current_snapshot_id=current_snapshot_id, + type=meta.StorageMetadata.DATASET) + metadata.snapshots[current_snapshot_id].CopyFrom( + meta.Snapshot(snapshot_id=current_snapshot_id, create_time=now)) + + # Add a previous snapshot for testing. + previous_snapshot_id = 10 + metadata.snapshots[previous_snapshot_id].CopyFrom( + meta.Snapshot(snapshot_id=previous_snapshot_id, + create_time=Timestamp(seconds=123000))) + return metadata + + @pytest.fixture + def storage(self, metadata): + return Storage(self._LOCATION, metadata) + + def test_create_dir(self, storage, metadata): + assert storage.metadata == metadata + + def test_snapshot(self, storage): + # Test current snapshot ID. + current_snapshot_id = self._SNAPSHOT_ID + assert storage.snapshot().snapshot_id == current_snapshot_id + assert storage.snapshot( + snapshot_id=current_snapshot_id).snapshot_id == current_snapshot_id + + # Test previous snapshot ID. + assert storage.snapshot(snapshot_id=10).snapshot_id == 10 + + def test_create_storage(self, tmp_path): + dir_path = tmp_path / "test_create_storage" / "dataset" + storage = Storage.create(location=str(dir_path), logical_schema=None) + + entry_point_file = dir_path / "metadata" / _ENTRY_POINT_FILE + assert entry_point_file.exists() + + metadata = storage.metadata + snapshot = storage.snapshot() + assert metadata.current_snapshot_id == 0 + assert metadata.type == meta.StorageMetadata.DATASET + assert snapshot.snapshot_id == 0 + assert metadata.create_time == metadata.last_update_time == snapshot.create_time diff --git a/python/tests/core/utils/test_paths.py b/python/tests/core/utils/test_paths.py new file mode 100644 index 0000000..4e11018 --- /dev/null +++ b/python/tests/core/utils/test_paths.py @@ -0,0 +1,94 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from mock import patch +import pytest + +from space.core.utils import paths + +_UUID_PATH = "space.core.utils.paths.uuid_" + + +def _mocked_uuid() -> str: + return "" + + +@patch(_UUID_PATH, side_effect=_mocked_uuid) +def test_new_index_file_path(mock_uuid): + assert paths.new_index_file_path("data") == "data/index_.parquet" + + +@patch(_UUID_PATH, side_effect=_mocked_uuid) +def test_new_record_file_path(mock_uuid): + assert paths.new_record_file_path("data", + "field") == "data/field_.arrowrecord" + + +@patch(_UUID_PATH, side_effect=_mocked_uuid) +def test_new_index_manifest_path(mock_uuid): + assert paths.new_index_manifest_path( + "metadata") == "metadata/index_manifest_.parquet" + + +@patch(_UUID_PATH, side_effect=_mocked_uuid) +def test_new_record_manifest_path(mock_uuid): + assert paths.new_record_manifest_path( + "metadata") == "metadata/record_manifest_.parquet" + + +@patch(_UUID_PATH, side_effect=_mocked_uuid) +def test_data_dir(mock_uuid): + assert paths.data_dir("location") == "location/data" + + +@patch(_UUID_PATH, side_effect=_mocked_uuid) +def test_metadata_dir(mock_uuid): + assert paths.metadata_dir("location") == "location/metadata" + + +@patch(_UUID_PATH, side_effect=_mocked_uuid) +def test_entry_point_path(mock_uuid): + assert paths.entry_point_path( + "location") == "location/metadata/entrypoint.txtpb" + + +@patch(_UUID_PATH, side_effect=_mocked_uuid) +def test_new_metadata_path(mock_uuid): + assert paths.new_metadata_path( + "metadata") == "metadata/metadata_.txtpb" + + +class TestStoragePaths: + + _LOCATION = "location" + + @pytest.fixture + def storage_paths(self): + return paths.StoragePaths(self._LOCATION) + + def test_data_dir(self, storage_paths): + assert storage_paths.data_dir == f"{self._LOCATION}/data" + + def test_short_path(self, storage_paths): + assert storage_paths.short_path( + f"{self._LOCATION}/metadata/file.parquet") == "metadata/file.parquet" + + def test_full_path(self, storage_paths): + assert storage_paths.full_path( + "data/file.parquet") == f"{self._LOCATION}/data/file.parquet" + + def test_new_metadata_path(self, storage_paths): + with patch(_UUID_PATH, side_effect=_mocked_uuid): + assert storage_paths.new_metadata_path( + ) == f"{self._LOCATION}/metadata/metadata_.txtpb" diff --git a/python/tests/core/utils/test_protos.py b/python/tests/core/utils/test_protos.py new file mode 100644 index 0000000..7449375 --- /dev/null +++ b/python/tests/core/utils/test_protos.py @@ -0,0 +1,21 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from space.core.utils import protos +import space.core.proto.metadata_pb2 as meta + + +def test_proto_to_text(): + text = protos.proto_to_text(meta.StorageMetadata(current_snapshot_id=100)) + assert text.decode("utf-8") == "current_snapshot_id: 100\n" diff --git a/python/tests/core/utils/test_uuids.py b/python/tests/core/utils/test_uuids.py new file mode 100644 index 0000000..531f9c9 --- /dev/null +++ b/python/tests/core/utils/test_uuids.py @@ -0,0 +1,25 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import uuid + +from space.core.utils import uuids + + +def test_uuid_(): + assert uuid.UUID(uuids.uuid_()).version == 4 + + +def test_random_id(): + assert len(uuids.random_id()) == 8