-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from google/coufon-dev
Add init metadata proto, substrait proto, utilities, and storage class.
- Loading branch information
Showing
25 changed files
with
1,106 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,6 +6,10 @@ __pycache__/ | |
*.egg-info/ | ||
build/ | ||
dist/ | ||
.pytype/ | ||
out/ | ||
.mypy_cache/ | ||
.pytest_cache/ | ||
|
||
# VSCode files | ||
.vscode/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
[project] | ||
name = "space" | ||
version = "0.0.1" | ||
authors = [ | ||
{ name="Space team", email="[email protected]" }, | ||
] | ||
description = "A storage framework for machine learning datasets" | ||
license = {text = "Apache-2.0"} | ||
classifiers = [ | ||
"License :: OSI Approved :: Apache Software License", | ||
"Operating System :: OS Independent", | ||
"Programming Language :: Python :: 3.8", | ||
"Programming Language :: Python :: 3.9", | ||
"Programming Language :: Python :: 3.10", | ||
"Programming Language :: Python :: 3.11" | ||
] | ||
requires-python = ">=3.8" | ||
dependencies = [ | ||
"protobuf", | ||
"pyarrow >= 14.0.0", | ||
] | ||
|
||
[project.optional-dependencies] | ||
dev = [ | ||
"pyarrow-stubs", | ||
"types-protobuf" | ||
] | ||
|
||
[project.urls] | ||
Homepage = "https://github.com/google/space" | ||
Issues = "https://github.com/google/space/issues" | ||
|
||
[build-system] | ||
requires = ["setuptools"] | ||
build-backend = "setuptools.build_meta" | ||
|
||
[tool.pytest.ini_options] | ||
addopts = ["--import-mode=importlib"] | ||
pythonpath = ["src"] | ||
|
||
[tool.pylint.format] | ||
max-line-length = 80 | ||
indent-string = ' ' | ||
disable = ['fixme'] | ||
|
||
[tool.pylint.MAIN] | ||
ignore = 'space/core/proto' | ||
ignored-modules = ['space.core.proto', 'google.protobuf'] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
# Copyright 2023 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# https://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
# Copyright 2023 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# https://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
# Copyright 2023 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# https://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
# Copyright 2023 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# https://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# | ||
"""Arrow file system implementation.""" | ||
|
||
from abc import abstractmethod | ||
|
||
from google.protobuf import text_format | ||
from pyarrow import fs | ||
|
||
from space.core.fs.base import BaseFileSystem, ProtoT | ||
from space.core.utils.protos import proto_to_text | ||
from space.core.utils.uuids import random_id | ||
|
||
|
||
class ArrowFileSystem(BaseFileSystem):
  """Base class for file systems that delegate to an Arrow `fs.FileSystem`.

  Subclasses provide the concrete Arrow file system through `create_fs`. It is
  created once in the constructor and kept in `self._fs`.
  """

  def __init__(self):
    super().__init__()
    self._fs = self.create_fs()

  @abstractmethod
  def create_fs(self) -> fs.FileSystem:
    """Build the Arrow file system that this object delegates to."""

  def create_dir(self, dir_path: str) -> None:
    """Create the directory `dir_path` on the underlying file system."""
    self._fs.create_dir(dir_path)

  def write_proto(self, file_path: str, msg: ProtoT) -> None:
    """Write `msg` to `file_path` by way of a temporary file.

    The output of `proto_to_text(msg)` is written to a temporary file named
    `<file_path>.<random id>.tmp`, which is then moved onto `file_path`.

    Args:
      file_path: destination path of the proto file.
      msg: the proto message to persist.
    """
    # TODO: the current implementation overwrites an existing file; add an
    # option to disallow overwriting.
    suffix = random_id()
    staging_path = f"{file_path}.{suffix}.tmp"

    with self._fs.open_output_stream(staging_path) as out:
      payload = proto_to_text(msg)
      out.write(payload)

    # Only move the file into place once the stream has been closed.
    self._fs.move(staging_path, file_path)

  def read_proto(self, file_path: str, empty_msg: ProtoT) -> ProtoT:
    """Read the file at `file_path` and parse it as a text format proto.

    Args:
      file_path: path of the file to read.
      empty_msg: message instance handed to `text_format.Parse` as the parse
        target.

    Returns:
      The value returned by `text_format.Parse`.
    """
    with self._fs.open_input_file(file_path) as src:
      return text_format.Parse(src.readall(), empty_msg)
|
||
|
||
class ArrowLocalFileSystem(ArrowFileSystem):
  """`ArrowFileSystem` whose underlying file system is `fs.LocalFileSystem`."""

  def create_fs(self) -> fs.FileSystem:
    """Return a newly constructed `fs.LocalFileSystem`."""
    local_fs = fs.LocalFileSystem()
    return local_fs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
# Copyright 2023 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# https://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# | ||
"""Abstract base file system.""" | ||
|
||
from abc import ABC, abstractmethod | ||
from typing import TypeVar | ||
|
||
from google.protobuf import message | ||
|
||
ProtoT = TypeVar("ProtoT", bound=message.Message) | ||
|
||
|
||
class BaseFileSystem(ABC):
  """Interface that every file system implementation must provide.

  All methods are abstract; concrete subclasses decide how directories and
  proto files are actually stored.
  """

  @abstractmethod
  def create_dir(self, dir_path: str) -> None:
    """Create a new directory.

    Args:
      dir_path: path of the directory to create.
    """

  @abstractmethod
  def write_proto(self, file_path: str, msg: ProtoT) -> None:
    """Write a proto message in text format to a file.

    Args:
      file_path: path of the file to write.
      msg: the proto message to write.
    """

  @abstractmethod
  def read_proto(self, file_path: str, empty_msg: ProtoT) -> ProtoT:
    """Read a proto message in text format from a file.

    Args:
      file_path: path of the file to read.
      empty_msg: an empty message of the proto type expected in the file.

    Returns:
      The proto message read from the file.
    """
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# Copyright 2023 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# https://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# | ||
"""File system factory implementation.""" | ||
|
||
from space.core.fs.arrow import ArrowLocalFileSystem | ||
from space.core.fs.base import BaseFileSystem | ||
|
||
|
||
def create_fs(path: str) -> BaseFileSystem:  # pylint: disable=unused-argument
  """Create the file system to use for `path`.

  `path` is currently ignored: an `ArrowLocalFileSystem` is always returned.
  """
  # TODO: to support more file systems.
  local_fs = ArrowLocalFileSystem()
  return local_fs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
# Copyright 2023 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# https://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
// Copyright 2023 Google LLC | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// https://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
syntax = "proto3"; | ||
|
||
package space.proto; | ||
|
||
import "google/protobuf/timestamp.proto"; | ||
import "substrait/type.proto"; | ||
|
||
// Entry point of a storage, kept in a static local file, recording where the
// current storage metadata lives.
//
// Every mutation to the storage generates a new metadata file. The path of
// the current metadata file is persisted either in the entry point file or in
// an external catalog (not implemented yet).
//
// NEXT_ID: 2
message EntryPoint {
  // Path of the file holding the current storage metadata.
  string metadata_file = 1;
}
|
||
// Metadata persisting the current status of a storage. It combines logical
// metadata, such as the schema, with physical metadata persisted as a history
// of snapshots.
//
// NEXT_ID: 7
message StorageMetadata {
  // The kinds of storage.
  enum Type {
    // Zero value of the enum; no storage type has been specified.
    TYPE_UNSPECIFIED = 0;

    // The dataset type supports fully managed storage features.
    DATASET = 1;
  }

  // Time at which the storage was created.
  google.protobuf.Timestamp create_time = 1;

  // Time at which the storage was last updated.
  google.protobuf.Timestamp last_update_time = 2;

  // The storage type.
  Type type = 3;

  // The storage schema.
  Schema schema = 4;

  // ID of the current snapshot.
  int64 current_snapshot_id = 5;

  // All alive snapshots, keyed by snapshot ID.
  map<int64, Snapshot> snapshots = 6;
}
|
||
// Logical schema of a storage. It persists the types provided by the user
// rather than their physical storage format.
//
// NEXT_ID: 2
message Schema {
  // The schema fields, persisted as a Substrait named struct.
  substrait.NamedStruct fields = 1;
}
|
||
// A storage snapshot, persisting physical metadata such as manifest file
// paths. It is used for obtaining all alive data file paths for a given
// snapshot.
//
// NEXT_ID: 3
message Snapshot {
  // ID of the snapshot.
  int64 snapshot_id = 1;

  // Time at which the snapshot was created.
  google.protobuf.Timestamp create_time = 2;
}
Oops, something went wrong.