Skip to content

Commit

Permalink
Add init metadata proto, substrait proto, utilities, and storage class
Browse files Browse the repository at this point in the history
  • Loading branch information
coufon committed Dec 17, 2023
1 parent c98ba24 commit bd6a33f
Show file tree
Hide file tree
Showing 25 changed files with 1,077 additions and 3 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/python-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,11 @@ jobs:
- name: Install test dependencies
run: |
python -m pip install --upgrade pip
pip install mypy pylint pytest pyarrow-stubs
pip install mypy pylint pytest mock
- name: Install runtime dependencies
working-directory: ./python
run: |
pip install .
pip install .[dev]
- name: Analysing the code with pylint
working-directory: ./python/src
run: |
Expand Down
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ __pycache__/
*.egg-info/
build/
dist/
.pytype/
out/
.mypy_cache/
.pytest_cache/

# VSCode files
.vscode/
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Space: LakeHouse for Machine Learning Datasets
# Space: Storage Framework for Machine Learning Datasets

[![Python CI](https://github.com/google/space/actions/workflows/python-ci.yml/badge.svg?branch=main)](https://github.com/google/space/actions/workflows/python-ci.yml)

Expand Down
8 changes: 8 additions & 0 deletions python/build_proto.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,11 @@ protoc --python_out="${SRC_FOLDER}" \
--mypy_out="${SRC_FOLDER}" \
substrait/*.proto substrait/extensions/*.proto \
--proto_path=.

# Build Space protos.
# protoc is run from inside ${SRC_FOLDER} with "." as both the output folder
# and the first proto path, so the generated Python/mypy files are written
# under space/core/proto/ alongside the .proto sources.
# The second --proto_path is needed because Space protos import Substrait
# protos (e.g. "substrait/type.proto"); it is presumably the folder holding
# the Substrait proto sources relative to ${SRC_FOLDER} -- confirm.
cd "${SRC_FOLDER}"
protoc --python_out=. \
--mypy_out=. \
space/core/proto/*.proto \
--proto_path=. \
--proto_path=../../substrait/proto
48 changes: 48 additions & 0 deletions python/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
[project]
name = "space"
version = "0.0.1"
authors = [
{ name="Space team", email="[email protected]" },
]
description = "A storage framework for machine learning datasets"
license = {text = "Apache-2.0"}
classifiers = [
"License :: OSI Approved :: Apache Software License",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11"
]
requires-python = ">=3.8"
dependencies = [
"protobuf",
"pyarrow >= 14.0.0",
]

[project.optional-dependencies]
dev = [
"pyarrow-stubs",
"types-protobuf"
]

[project.urls]
Homepage = "https://github.com/google/space"
Issues = "https://github.com/google/space/issues"

[build-system]
requires = ["setuptools"]
build-backend = "setuptools.build_meta"

[tool.pytest.ini_options]
addopts = ["--import-mode=importlib"]
pythonpath = ["src"]

[tool.pylint.format]
max-line-length = 80
indent-string = ' '
disable = ['fixme']

[tool.pylint.MAIN]
ignore = 'space/core/proto'
ignored-modules = ['space.core.proto', 'google.protobuf']
13 changes: 13 additions & 0 deletions python/src/space/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
13 changes: 13 additions & 0 deletions python/src/space/core/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
13 changes: 13 additions & 0 deletions python/src/space/core/fs/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
63 changes: 63 additions & 0 deletions python/src/space/core/fs/arrow.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Arrow file system implementation."""

from abc import abstractmethod

from google.protobuf import message
from google.protobuf import text_format
from pyarrow import fs

from space.core.fs.base import BaseFileSystem
from space.core.utils.protos import proto_to_text
from space.core.utils.uuids import random_id


class ArrowFileSystem(BaseFileSystem):
  """Abstract file system backed by an Arrow `fs.FileSystem`.

  Subclasses implement `create_fs` to supply the underlying Arrow file
  system. It is called once by the constructor and the returned object is
  used by all other methods.
  """

  def __init__(self):
    super().__init__()
    self._fs = self.create_fs()

  @abstractmethod
  def create_fs(self) -> fs.FileSystem:
    """Create a new underlying Arrow file system."""

  def create_dir(self, dir_path: str) -> None:
    """Create the directory `dir_path` on the underlying file system."""
    self._fs.create_dir(dir_path)

  def write_proto(self, file_path: str, msg: message.Message) -> None:
    """Write a proto message to `file_path` through a temporary file.

    The output of `proto_to_text(msg)` is written to a temporary file whose
    name is `file_path` suffixed with `random_id()` and `.tmp`; that file is
    then moved to `file_path`. If writing or moving fails, the temporary
    file is removed on a best-effort basis and the original error is
    re-raised.

    Args:
      file_path: destination path of the proto file.
      msg: the proto message to write.
    """
    # TODO: the current implementation overwrites an existing file; add an
    # option to disallow overwriting.
    tmp_file_path = f"{file_path}.{random_id()}.tmp"

    try:
      with self._fs.open_output_stream(tmp_file_path) as f:
        f.write(proto_to_text(msg))

      self._fs.move(tmp_file_path, file_path)
    except Exception:
      # Do not leave an orphaned temporary file behind. A failure of the
      # cleanup itself (e.g., the file was never created) is ignored so that
      # it cannot mask the original error re-raised below.
      try:
        self._fs.delete_file(tmp_file_path)
      except Exception:  # pylint: disable=broad-except
        pass
      raise

  def read_proto(self, file_path: str,
                 empty_msg: message.Message) -> message.Message:
    """Read a text format proto message from `file_path`.

    Args:
      file_path: path of the file to read.
      empty_msg: the message object passed to `text_format.Parse` to receive
        the parsed content.

    Returns:
      The value returned by `text_format.Parse`.
    """
    with self._fs.open_input_file(file_path) as f:
      return text_format.Parse(f.readall(), empty_msg)


class ArrowLocalFileSystem(ArrowFileSystem):
  """Arrow file system implementation for the local file system."""

  def create_fs(self) -> fs.FileSystem:
    """Create the underlying Arrow `LocalFileSystem`."""
    local_fs = fs.LocalFileSystem()
    return local_fs
36 changes: 36 additions & 0 deletions python/src/space/core/fs/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Abstract base file system."""

from abc import ABC, abstractmethod

from google.protobuf import message


class BaseFileSystem(ABC):
  """Abstract interface of a file system.

  Implementations provide directory creation, and reading and writing of
  proto messages stored in text format.
  """

  @abstractmethod
  def create_dir(self, dir_path: str) -> None:
    """Create a new directory at `dir_path`."""

  @abstractmethod
  def write_proto(
      self,
      file_path: str,
      msg: message.Message,
  ) -> None:
    """Write the proto message `msg` in text format to `file_path`."""

  @abstractmethod
  def read_proto(
      self,
      file_path: str,
      empty_msg: message.Message,
  ) -> message.Message:
    """Read a proto message in text format from `file_path`.

    Args:
      file_path: path of the file to read.
      empty_msg: a message object of the type stored in the file.

    Returns:
      The proto message read from the file.
    """
24 changes: 24 additions & 0 deletions python/src/space/core/fs/factory.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""File system factory implementation."""

from space.core.fs.arrow import ArrowLocalFileSystem
from space.core.fs.base import BaseFileSystem


def create_fs(path: str) -> BaseFileSystem:  # pylint: disable=unused-argument
  """Create a file system based on the path.

  `path` is not inspected yet: an `ArrowLocalFileSystem` is returned for
  every input.
  """
  # TODO: to support more file systems.
  local_fs = ArrowLocalFileSystem()
  return local_fs
13 changes: 13 additions & 0 deletions python/src/space/core/proto/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
78 changes: 78 additions & 0 deletions python/src/space/core/proto/metadata.proto
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package space.proto;

import "google/protobuf/timestamp.proto";
import "substrait/type.proto";

// Entry point of a storage: it records the path of the current storage
// metadata file in a static local file.
// Every mutation to the storage generates a new metadata file. The path of
// the current metadata file is persisted either in the entry point file or in
// an external catalog (not implemented yet).
// NEXT_ID: 2
message EntryPoint {
  // File path of the current storage metadata file.
  string metadata_file = 1;
}

// Metadata persisting the current status of a storage. It includes logical
// metadata such as the schema, and physical metadata persisted as a history
// of snapshots.
// NEXT_ID: 7
message StorageMetadata {
  // The supported storage types.
  enum Type {
    // Default value: the storage type is not specified.
    TYPE_UNSPECIFIED = 0;
    // A Space dataset.
    DATASET = 1;
  }

  // Time at which the storage was created.
  google.protobuf.Timestamp create_time = 1;

  // Time at which the storage was last updated.
  google.protobuf.Timestamp last_update_time = 2;

  // The storage type.
  Type type = 3;

  // The dataset schema.
  Schema schema = 4;

  // ID of the current snapshot.
  int64 current_snapshot_id = 5;

  // All alive snapshots.
  // NOTE(review): the map key is presumably the snapshot ID -- confirm.
  map<int64, Snapshot> snapshots = 6;
}

// Logical schema of a storage. It persists the types provided by the user
// rather than their physical storage format.
// NEXT_ID: 2
message Schema {
  // The schema fields, persisted as a Substrait named struct.
  substrait.NamedStruct fields = 1;
}

// A snapshot of a storage, persisting physical metadata such as manifest file
// paths. It is used for obtaining the paths of all alive data files for a
// given snapshot.
// NEXT_ID: 3
message Snapshot {
  // ID of the snapshot.
  int64 snapshot_id = 1;

  // Time at which the snapshot was created.
  google.protobuf.Timestamp create_time = 2;
}
39 changes: 39 additions & 0 deletions python/src/space/core/proto/metadata_pb2.py

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit bd6a33f

Please sign in to comment.