-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Support loading external Parquet files into Space
- Loading branch information
coufon committed Dec 27, 2023 · 1 parent 9a87356 · commit 5038dec
Showing 9 changed files with 231 additions and 35 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
# Copyright 2023 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# https://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
# Copyright 2023 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# https://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# | ||
"""Load Parquet files into Space datasets.""" | ||
|
||
from typing import Optional | ||
|
||
import pyarrow.parquet as pq | ||
|
||
from space.core.manifests import IndexManifestWriter | ||
from space.core.loaders.utils import list_files | ||
from space.core.proto import metadata_pb2 as meta | ||
from space.core.proto import runtime_pb2 as runtime | ||
from space.core.ops import utils | ||
from space.core.schema import arrow | ||
from space.core.utils.paths import StoragePathsMixin | ||
|
||
|
||
class LocalParquetLoadOp(StoragePathsMixin):
  """Load Parquet files into Space without copying data.

  The operation lists the `.parquet` files found directly under an input
  directory and writes an index manifest describing them.
  """

  def __init__(self, location: str, metadata: meta.StorageMetadata,
               input_dir: str):
    """Prepare the load operation.

    Args:
      location: dataset location, forwarded to `StoragePathsMixin`.
      metadata: storage metadata of the target dataset; its schema must not
        declare any record fields.
      input_dir: directory holding the Parquet files to load.
    """
    StoragePathsMixin.__init__(self, location)

    self._metadata = metadata
    self._input_dir = input_dir

    schema = self._metadata.schema
    # Datasets that declare record fields are rejected by this loader.
    assert len(schema.record_fields) == 0

    # No record fields are passed (empty set); the physical form is requested.
    self._physical_schema = arrow.arrow_schema(schema.fields,
                                               set(),
                                               physical=True)
    self._input_files = list_files(input_dir, suffix=".parquet")

  def write(self) -> Optional[runtime.Patch]:
    """Write metadata files to load Parquet files to Space dataset.

    Returns:
      A patch carrying the accumulated storage statistics and, when the
      manifest writer produced a file, the short path of that index manifest.
    """
    manifest_writer = IndexManifestWriter(
        self._metadata_dir, self._physical_schema,
        self._metadata.schema.primary_keys)  # type: ignore[arg-type]
    patch = runtime.Patch()

    for parquet_path in self._input_files:
      # Record the file in the manifest, then fold its statistics into the
      # running totals kept on the patch.
      utils.update_index_storage_stats(
          update=_write_index_manifest(manifest_writer, parquet_path),
          base=patch.storage_statistics_update)

    manifest_path = manifest_writer.finish()
    if manifest_path is None:
      # The writer produced no manifest file; nothing to register.
      return patch

    patch.addition.index_manifest_files.append(self.short_path(manifest_path))
    return patch
|
||
|
||
def _write_index_manifest(manifest_writer: IndexManifestWriter,
                          file_path: str) -> meta.StorageStatistics:
  """Add one Parquet file to the index manifest.

  Reads the Parquet metadata of `file_path` and hands it to the manifest
  writer, returning whatever statistics the writer reports for that file.
  """
  # TODO: to verify that file schemas are compatible with dataset.
  parquet_metadata = pq.read_metadata(file_path)
  file_stats = manifest_writer.write(file_path, parquet_metadata)
  return file_stats
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
# Copyright 2023 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# https://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# | ||
"""Utilities for loaders.""" | ||
|
||
import os | ||
from typing import List, Optional | ||
|
||
|
||
def list_files(directory: str,
               substr: Optional[str] = None,
               suffix: Optional[str] = None) -> List[str]:
  """List regular files directly inside a directory, in sorted order.

  Sub-directories are not descended into and are never returned. The result
  is sorted by file name so that callers see a deterministic order;
  `os.listdir` alone returns entries in arbitrary order.

  Args:
    directory: directory to scan.
    substr: when given, keep only files whose name contains this string.
    suffix: when given, keep only files whose name ends with this string.

  Returns:
    Paths of the matching files, each joined with `directory`.

  Raises:
    OSError: propagated from `os.listdir`, e.g. when `directory` does not
      exist or is not a directory.
  """
  files: List[str] = []
  for name in sorted(os.listdir(directory)):
    # Apply the cheap name filters first to avoid a stat call on entries
    # that would be discarded anyway.
    if substr is not None and substr not in name:
      continue
    if suffix is not None and not name.endswith(suffix):
      continue

    full_path = os.path.join(directory, name)
    if os.path.isfile(full_path):
      files.append(full_path)

  return files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.