From 945a0cfe8c5340629840abf5ddd646460fcfa260 Mon Sep 17 00:00:00 2001 From: Kevin Su Date: Tue, 1 Mar 2022 17:57:43 +0800 Subject: [PATCH] Add GCS protocol in the structured dataset Signed-off-by: Kevin Su --- flytekit/types/structured/basic_dfs.py | 3 ++- flytekit/types/structured/structured_dataset.py | 1 + .../flytekit-data-fsspec/flytekitplugins/fsspec/__init__.py | 5 ++++- plugins/flytekit-data-fsspec/setup.py | 1 + 4 files changed, 8 insertions(+), 2 deletions(-) diff --git a/flytekit/types/structured/basic_dfs.py b/flytekit/types/structured/basic_dfs.py index 49b2f13ed9..2e263ed133 100644 --- a/flytekit/types/structured/basic_dfs.py +++ b/flytekit/types/structured/basic_dfs.py @@ -12,6 +12,7 @@ from flytekit.models.literals import StructuredDatasetMetadata from flytekit.models.types import StructuredDatasetType from flytekit.types.structured.structured_dataset import ( + GCS, LOCAL, PARQUET, S3, @@ -106,7 +107,7 @@ def decode( return pq.read_table(local_dir) -for protocol in [LOCAL, S3]: # Should we add GCS +for protocol in [LOCAL, S3, GCS]: StructuredDatasetTransformerEngine.register(PandasToParquetEncodingHandler(protocol), default_for_type=True) StructuredDatasetTransformerEngine.register(ParquetToPandasDecodingHandler(protocol), default_for_type=True) StructuredDatasetTransformerEngine.register(ArrowToParquetEncodingHandler(protocol), default_for_type=True) diff --git a/flytekit/types/structured/structured_dataset.py b/flytekit/types/structured/structured_dataset.py index 819bc012cc..bdacc6b913 100644 --- a/flytekit/types/structured/structured_dataset.py +++ b/flytekit/types/structured/structured_dataset.py @@ -37,6 +37,7 @@ # Protocols BIGQUERY = "bq" S3 = "s3" +GCS = "gs" LOCAL = "/" # For specifying the storage formats of StructuredDatasets. It's just a string, nothing fancy. diff --git a/plugins/flytekit-data-fsspec/flytekitplugins/fsspec/__init__.py b/plugins/flytekit-data-fsspec/flytekitplugins/fsspec/__init__.py index 85a65d17b8..4ec7c6e6ca 100644 --- a/plugins/flytekit-data-fsspec/flytekitplugins/fsspec/__init__.py +++ b/plugins/flytekit-data-fsspec/flytekitplugins/fsspec/__init__.py @@ -1,7 +1,7 @@ import importlib from flytekit import USE_STRUCTURED_DATASET, StructuredDatasetTransformerEngine, logger -from flytekit.types.structured.structured_dataset import S3 +from flytekit.types.structured.structured_dataset import GCS, S3 from .persist import FSSpecPersistence @@ -18,3 +18,6 @@ def _register(protocol: str): if importlib.util.find_spec("s3fs"): _register(S3) + + if importlib.util.find_spec("gcsfs"): + _register(GCS) diff --git a/plugins/flytekit-data-fsspec/setup.py b/plugins/flytekit-data-fsspec/setup.py index a0986a2e82..a20cc8a550 100644 --- a/plugins/flytekit-data-fsspec/setup.py +++ b/plugins/flytekit-data-fsspec/setup.py @@ -23,6 +23,7 @@ extras_require={ # https://github.com/fsspec/filesystem_spec/blob/master/setup.py#L36 "aws": ["s3fs>=2021.7.0"], + "gcp": ["gcsfs>=2021.7.0"], }, license="apache2", python_requires=">=3.7",