[#5188] feat(python-client): Support s3 fileset in python client #5209
Changes from 118 commits
d2447a2
7e5a8b5
f53c5ef
36fedcd
e93fba5
b1e04b6
db00e65
c793582
013f5cb
dba5753
16dfc73
278fcd8
3fb55ad
cd04666
d0bf13e
ffaa064
32d7f3d
d82bf76
dfdb772
8708a8a
ba9f8fa
dae99f7
e22053b
4fb89e0
e5746c0
b2d7bed
380717b
f4041ec
66247ab
3cfb94f
7d1150f
608081b
9edfe82
3079bf0
da49e60
b621d89
05dd006
9d5b8dc
e58f9a0
c521daf
46e996a
da0b7ca
e9ccda4
ba1fe5f
4ffe389
7c44a57
992ba0a
5dbca5f
f27520a
e29e47b
bc1e76f
8a9d3bf
2115e31
c2e55d4
557aa02
5c3fa5c
408eca7
a02065d
8762bae
dc7a915
c230991
dc54880
7ecc040
da46321
27bc2ab
017c42e
9dc0f5a
41ff00d
1fee1e4
1789bd2
2ee1709
05e5d20
8f28211
35cba1e
bcf2f12
f25a37d
a3da011
3517996
27a911a
e34dbea
6bae7e5
fe13f5e
0181632
3ff9eef
2ce660c
f0fa87b
d2921a8
dc68dd1
6431e2f
242888f
3ec2dcc
f754997
2cdfb35
67dbc3a
70a545e
4d54acf
15bbf99
cfcc544
acf51e1
11f9992
4f00a2f
b9ef8f0
9f65fb5
4defcc6
3a907f4
76912b7
4478673
5a194df
c8b5c7c
6ff9353
1dac0f0
f592289
7069b6b
1281e72
f022d6e
0d2ccab
55633e8
217cc5f
6958aa8
8b9b8d7
b4d5728
0306ac5
a35a40d
cfd8a89
a131564
34e3ff4
2cc234a
ba0237e
4df5ea1
4e49e6e
015b788
c414b4d
798b4d1
aa96f63
804622c
@@ -49,6 +49,8 @@ class StorageType(Enum):
    HDFS = "hdfs"
    LOCAL = "file"
    GCS = "gs"
    S3A = "s3a"
    S3 = "s3"


class FilesetContextPair:
@@ -314,7 +316,12 @@ def mv(self, path1, path2, recursive=False, maxdepth=None, **kwargs):

        # convert the following to in

        if storage_type in [StorageType.HDFS, StorageType.GCS]:
        if storage_type in [
            StorageType.HDFS,
            StorageType.GCS,
            StorageType.S3,
            StorageType.S3A,
        ]:
            src_context_pair.filesystem().mv(
                self._strip_storage_protocol(storage_type, src_actual_path),
                self._strip_storage_protocol(storage_type, dst_actual_path),
@@ -547,9 +554,12 @@ def _convert_actual_path(
        """

        # If the storage path starts with hdfs, gcs, we should use the path as the prefix.
        if storage_location.startswith(
            f"{StorageType.HDFS.value}://"
        ) or storage_location.startswith(f"{StorageType.GCS.value}://"):
        if (
            storage_location.startswith(f"{StorageType.HDFS.value}://")
            or storage_location.startswith(f"{StorageType.GCS.value}://")
            or storage_location.startswith(f"{StorageType.S3.value}://")
            or storage_location.startswith(f"{StorageType.S3A.value}://")
        ):
            actual_prefix = infer_storage_options(storage_location)["path"]
        elif storage_location.startswith(f"{StorageType.LOCAL.value}:/"):
            actual_prefix = storage_location[len(f"{StorageType.LOCAL.value}:") :]
@@ -692,6 +702,10 @@ def _recognize_storage_type(path: str):
            return StorageType.LOCAL
        if path.startswith(f"{StorageType.GCS.value}://"):
            return StorageType.GCS
        if path.startswith(f"{StorageType.S3A.value}://"):
            return StorageType.S3A
        if path.startswith(f"{StorageType.S3.value}://"):
            return StorageType.S3
        raise GravitinoRuntimeException(
            f"Storage type doesn't support now. Path:{path}"
        )
@@ -716,7 +730,12 @@ def _strip_storage_protocol(storage_type: StorageType, path: str):
        :param path: The path
        :return: The stripped path
        """
        if storage_type in (StorageType.HDFS, StorageType.GCS):
        if storage_type in (
            StorageType.HDFS,
            StorageType.GCS,
            StorageType.S3A,
            StorageType.S3,
        ):
            return path
        if storage_type == StorageType.LOCAL:
            return path[len(f"{StorageType.LOCAL.value}:") :]
@@ -792,6 +811,8 @@ def _get_filesystem(self, actual_file_location: str):
            fs = LocalFileSystem()
        elif storage_type == StorageType.GCS:
            fs = ArrowFSWrapper(self._get_gcs_filesystem())
        elif storage_type in (StorageType.S3A, StorageType.S3):
            fs = ArrowFSWrapper(self._get_s3_filesystem())
        else:
            raise GravitinoRuntimeException(
                f"Storage type: `{storage_type}` doesn't support now."
@@ -819,5 +840,40 @@ def _get_gcs_filesystem(self):

        return importlib.import_module("pyarrow.fs").GcsFileSystem()

    def _get_s3_filesystem(self):
        # get All keys from the options that start with 'gravitino.bypass.s3.' and remove the prefix
        s3_options = {
            key[len(GVFSConfig.GVFS_FILESYSTEM_BY_PASS_S3) :]: value
            for key, value in self._options.items()
            if key.startswith(GVFSConfig.GVFS_FILESYSTEM_BY_PASS_S3)
        }

        # get 'aws_access_key_id' from s3_options, if the key is not found, throw an exception
        aws_access_key_id = s3_options.get(GVFSConfig.GVFS_FILESYSTEM_S3_ACCESS_KEY)
        if aws_access_key_id is None:
            raise GravitinoRuntimeException(
                "AWS access key id is not found in the options."
            )

        # get 'aws_secret_access_key' from s3_options, if the key is not found, throw an exception
        aws_secret_access_key = s3_options.get(GVFSConfig.GVFS_FILESYSTEM_S3_SECRET_KEY)
        if aws_secret_access_key is None:
            raise GravitinoRuntimeException(
                "AWS secret access key is not found in the options."
            )

        # get 'aws_endpoint_url' from s3_options, if the key is not found, throw an exception
        aws_endpoint_url = s3_options.get(GVFSConfig.GVFS_FILESYSTEM_S3_ENDPOINT)
        if aws_endpoint_url is None:
            raise GravitinoRuntimeException(
                "AWS endpoint url is not found in the options."
            )

        return importlib.import_module("pyarrow.fs").S3FileSystem(
Review thread on this line:

Sorry I didn't notice this before: GCS and S3 also have fsspec implementations (https://github.com/fsspec/gcsfs, https://github.com/fsspec/s3fs). How did you weigh the choice of PyArrow's implementation here?

PyArrow's implementation provides a uniform API to users, for example, combined with […]. I have viewed the implementation by […]. Considering the efficiency brought by […].

In fact, PyArrow officially supports only a limited number of storage systems. If you need to add a storage system, you need to modify the Arrow source code. HDFS uses PyArrow because fsspec actually also calls PyArrow there, so it is almost the only choice. For other storage, PyArrow may not be the only choice. My advice is not to be restricted by the current selection; we should make the best choice in terms of performance and interface adaptability.

I agree with this point, and I also noticed that the set of filesystems PyArrow supports is very limited. Due to time limitations, I have not completed a comprehensive survey of it. Thanks for the suggestion; I will modify the code accordingly.

@xloya
            access_key=aws_access_key_id,
            secret_key=aws_secret_access_key,
            endpoint_override=aws_endpoint_url,
        )


fsspec.register_implementation(PROTOCOL_NAME, GravitinoVirtualFileSystem)
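For the fsspec-vs-PyArrow discussion above, here is a minimal, hypothetical sketch of what the same credential wiring could look like with fsspec's s3fs instead of pyarrow.fs.S3FileSystem. It is not part of this PR; the option keys ("access-key", "secret-key", "endpoint") mirror the GVFSConfig constants in the diff, and an s3fs filesystem would plug into gvfs directly as an fsspec implementation, without ArrowFSWrapper.

# Hypothetical alternative, not the PR's implementation: build an fsspec s3fs
# filesystem from the same stripped option keys used by _get_s3_filesystem.
import importlib


def build_s3fs_filesystem(s3_options: dict):
    # s3_options is assumed to already have the "gravitino.bypass.s3." prefix removed.
    s3fs = importlib.import_module("s3fs")
    return s3fs.S3FileSystem(
        key=s3_options.get("access-key"),
        secret=s3_options.get("secret-key"),
        client_kwargs={"endpoint_url": s3_options.get("endpoint")},
    )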
@@ -35,3 +35,8 @@ class GVFSConfig:
    GVFS_FILESYSTEM_BY_PASS = "gravitino.bypass"
    GVFS_FILESYSTEM_BY_PASS_GCS = "gravitino.bypass.gcs."
    GVFS_FILESYSTEM_KEY_FILE = "service-account-key-path"

    GVFS_FILESYSTEM_BY_PASS_S3 = "gravitino.bypass.s3."
Review thread on this line:

I was wondering why we need a "gravitino.bypass.s3." prefix for the GCS and AWS configurations. The configuration keys mentioned above are essential for the client to work, so it may not be good to put them behind a bypass prefix. What do you think?

OK, let me think a bit.

I have removed the prefix […].
    GVFS_FILESYSTEM_S3_ACCESS_KEY = "access-key"
    GVFS_FILESYSTEM_S3_SECRET_KEY = "secret-key"
    GVFS_FILESYSTEM_S3_ENDPOINT = "endpoint"
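As a usage note, here is a hedged sketch (not taken from the PR) of how a client would pass these keys as they stand at this point in the review, i.e. still behind the gravitino.bypass.s3. prefix; the credentials, endpoint, metalake, and fileset names below are placeholders.

from gravitino import gvfs
from gravitino.filesystem.gvfs_config import GVFSConfig

# Placeholder credentials and endpoint; keys use the prefixed form from this revision.
options = {
    f"{GVFSConfig.GVFS_FILESYSTEM_BY_PASS_S3}{GVFSConfig.GVFS_FILESYSTEM_S3_ACCESS_KEY}": "my-access-key",
    f"{GVFSConfig.GVFS_FILESYSTEM_BY_PASS_S3}{GVFSConfig.GVFS_FILESYSTEM_S3_SECRET_KEY}": "my-secret-key",
    f"{GVFSConfig.GVFS_FILESYSTEM_BY_PASS_S3}{GVFSConfig.GVFS_FILESYSTEM_S3_ENDPOINT}": "http://s3.example.com",
}

fs = gvfs.GravitinoVirtualFileSystem(
    server_uri="http://localhost:8090",
    metalake_name="my_metalake",
    options=options,
)
fs.ls("gvfs://fileset/my_catalog/my_schema/my_fileset")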
@@ -0,0 +1,166 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import logging
import os
from random import randint
import unittest

from fsspec.implementations.arrow import ArrowFSWrapper
from pyarrow.fs import S3FileSystem

from tests.integration.test_gvfs_with_hdfs import TestGvfsWithHDFS
from gravitino import (
    gvfs,
    GravitinoClient,
    Catalog,
    Fileset,
)
from gravitino.exceptions.base import GravitinoRuntimeException
from gravitino.filesystem.gvfs_config import GVFSConfig

logger = logging.getLogger(__name__)

@unittest.skip("This test requires an S3 service account")
class TestGvfsWithS3(TestGvfsWithHDFS):
    # Before running this test, please make sure aws-bundle-x.jar has been
    # copied to the $GRAVITINO_HOME/catalogs/hadoop/libs/ directory
    s3_access_key = "your_access_key"
    s3_secret_key = "your_secret_key"
    s3_endpoint = "your_endpoint"
    bucket_name = "your_bucket_name"

    metalake_name: str = "TestGvfsWithS3_metalake" + str(randint(1, 10000))

    def setUp(self):
        self.options = {
            f"{GVFSConfig.GVFS_FILESYSTEM_BY_PASS_S3}{GVFSConfig.GVFS_FILESYSTEM_S3_ACCESS_KEY}": self.s3_access_key,
            f"{GVFSConfig.GVFS_FILESYSTEM_BY_PASS_S3}{GVFSConfig.GVFS_FILESYSTEM_S3_SECRET_KEY}": self.s3_secret_key,
            f"{GVFSConfig.GVFS_FILESYSTEM_BY_PASS_S3}{GVFSConfig.GVFS_FILESYSTEM_S3_ENDPOINT}": self.s3_endpoint,
        }

    def tearDown(self):
        self.options = {}

    @classmethod
    def setUpClass(cls):
        cls._get_gravitino_home()

        cls.hadoop_conf_path = f"{cls.gravitino_home}/catalogs/hadoop/conf/hadoop.conf"
        # restart the server
        cls.restart_server()
        # create entity
        cls._init_test_entities()

    @classmethod
    def tearDownClass(cls):
        cls._clean_test_data()
        # reset server conf in case of other ITs like HDFS has changed it and fail
        # to reset it
        cls._reset_conf(cls.config, cls.hadoop_conf_path)
        # restart server
        cls.restart_server()

    # clear all config in the conf_path
    @classmethod
    def _reset_conf(cls, config, conf_path):
        logger.info("Reset %s.", conf_path)
        if not os.path.exists(conf_path):
            raise GravitinoRuntimeException(f"Conf file is not found at `{conf_path}`.")
        filtered_lines = []
        with open(conf_path, mode="r", encoding="utf-8") as file:
            origin_lines = file.readlines()

        for line in origin_lines:
            line = line.strip()
            if line.startswith("#"):
                # append annotations directly
                filtered_lines.append(line + "\n")

        with open(conf_path, mode="w", encoding="utf-8") as file:
            for line in filtered_lines:
                file.write(line)

    @classmethod
    def _init_test_entities(cls):
        cls.gravitino_admin_client.create_metalake(
            name=cls.metalake_name, comment="", properties={}
        )
        cls.gravitino_client = GravitinoClient(
            uri="http://localhost:8090", metalake_name=cls.metalake_name
        )

        cls.config = {}
        cls.conf = {}
        catalog = cls.gravitino_client.create_catalog(
            name=cls.catalog_name,
            catalog_type=Catalog.Type.FILESET,
            provider=cls.catalog_provider,
            comment="",
            properties={
                "filesystem-providers": "s3",
                "gravitino.bypass.fs.s3a.access.key": cls.s3_access_key,
Review thread on this line:

Also for the server side, maybe we should clearly define some configurations rather than using "gravitino.bypass." for everything. I have to think a bit about this; can you please also think about it from the user's side?

@jerryshao
"gravitino.bypass.fs.s3a.secret.key": cls.s3_secret_key, | ||
"gravitino.bypass.fs.s3a.endpoint": cls.s3_endpoint, | ||
}, | ||
) | ||
catalog.as_schemas().create_schema( | ||
schema_name=cls.schema_name, comment="", properties={} | ||
) | ||
|
||
cls.fileset_storage_location: str = ( | ||
f"s3a://{cls.bucket_name}/{cls.catalog_name}/{cls.schema_name}/{cls.fileset_name}" | ||
) | ||
cls.fileset_gvfs_location = ( | ||
f"gvfs://fileset/{cls.catalog_name}/{cls.schema_name}/{cls.fileset_name}" | ||
) | ||
catalog.as_fileset_catalog().create_fileset( | ||
ident=cls.fileset_ident, | ||
fileset_type=Fileset.Type.MANAGED, | ||
comment=cls.fileset_comment, | ||
storage_location=cls.fileset_storage_location, | ||
properties=cls.fileset_properties, | ||
) | ||
|
||
arrow_s3_fs = S3FileSystem( | ||
access_key=cls.s3_access_key, | ||
secret_key=cls.s3_secret_key, | ||
endpoint_override=cls.s3_endpoint, | ||
) | ||
cls.fs = ArrowFSWrapper(arrow_s3_fs) | ||
|
||
def test_modified(self): | ||
modified_dir = self.fileset_gvfs_location + "/test_modified" | ||
modified_actual_dir = self.fileset_storage_location + "/test_modified" | ||
fs = gvfs.GravitinoVirtualFileSystem( | ||
server_uri="http://localhost:8090", | ||
metalake_name=self.metalake_name, | ||
options=self.options, | ||
**self.conf, | ||
) | ||
self.fs.mkdir(modified_actual_dir) | ||
self.assertTrue(self.fs.exists(modified_actual_dir)) | ||
self.assertTrue(fs.exists(modified_dir)) | ||
|
||
self.assertIsNone(fs.modified(modified_dir)) | ||
|
||
# create a file under the dir 'modified_dir'. | ||
file_path = modified_dir + "/test.txt" | ||
fs.touch(file_path) | ||
self.assertTrue(fs.exists(file_path)) | ||
self.assertIsNotNone(fs.modified(file_path)) |
Review thread:

Why do we add two schemes, "s3a" and "s3"?

I have the same question. We only use the s3a scheme in the S3FileSystemProvider (https://github.com/apache/gravitino/blob/main/bundles/aws-bundle/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java#L44); is there any case that will use the s3 scheme?

I was concerned about instances where the location starts with s3, not s3a. As clarified by @xloya, there seems to be only one entrance, and Gravitino is the only way a fileset can be created, so we can safely remove s3 here.
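To make the s3-versus-s3a point concrete, below is a small self-contained sketch; it is a standalone re-implementation that mirrors the StorageType enum and the _recognize_storage_type logic from the diff above, not the PR's actual module. Because matching includes the "://" separator, the two entries never overlap, and dropping StorageType.S3 would simply make plain s3:// locations raise.

from enum import Enum


class StorageType(Enum):
    HDFS = "hdfs"
    LOCAL = "file"
    GCS = "gs"
    S3A = "s3a"
    S3 = "s3"


def recognize_storage_type(path: str) -> StorageType:
    # "s3a://..." only matches S3A and "s3://..." only matches S3, since the
    # scheme check includes "://"; the order of the two checks does not matter.
    for storage_type in (StorageType.HDFS, StorageType.GCS, StorageType.S3A, StorageType.S3):
        if path.startswith(f"{storage_type.value}://"):
            return storage_type
    raise ValueError(f"Unsupported storage path: {path}")


print(recognize_storage_type("s3a://bucket/catalog/schema/fileset"))  # StorageType.S3A
print(recognize_storage_type("s3://bucket/catalog/schema/fileset"))   # StorageType.S3
# If fileset locations are only ever created through Gravitino's S3FileSystemProvider
# (which uses s3a://), the StorageType.S3 entry is never hit and could be removed,
# in which case the second call above would raise instead.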