From c424d8e9cce013dc031b92cdb7575ee2a405ec39 Mon Sep 17 00:00:00 2001 From: yuqi Date: Tue, 24 Dec 2024 21:10:52 +0800 Subject: [PATCH 01/59] Support using dynamic credential --- bundles/aliyun-bundle/build.gradle.kts | 1 + .../oss/fs/OSSFileSystemProvider.java | 8 + .../oss/fs/OSSSessionCredentialProvider.java | 95 +++++++++++ bundles/aws-bundle/build.gradle.kts | 1 + .../gravitino/s3/fs/S3FileSystemProvider.java | 10 +- .../s3/fs/S3SessionCredentialProvider.java | 96 +++++++++++ bundles/azure-bundle/build.gradle.kts | 1 + .../abs/fs/AzureFileSystemProvider.java | 16 ++ .../abs/fs/AzureSasCredentialProvider.java | 104 ++++++++++++ .../integration/test/HadoopABSCatalogIT.java | 2 + .../catalog/hadoop/common/Properties.java | 30 ++++ .../hadoop/GravitinoVirtualFileSystem.java | 13 +- .../filesystem/hadoop/TestGvfsBase.java | 9 +- .../GravitinoVirtualFileSystemRealS3IT.java | 152 ++++++++++++++++++ .../test/GravitinoVirtualFileSystemS3IT.java | 2 + 15 files changed, 534 insertions(+), 6 deletions(-) create mode 100644 bundles/aliyun-bundle/src/main/java/org/apache/gravitino/oss/fs/OSSSessionCredentialProvider.java create mode 100644 bundles/aws-bundle/src/main/java/org/apache/gravitino/s3/fs/S3SessionCredentialProvider.java create mode 100644 bundles/azure-bundle/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialProvider.java create mode 100644 catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/common/Properties.java create mode 100644 clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemRealS3IT.java diff --git a/bundles/aliyun-bundle/build.gradle.kts b/bundles/aliyun-bundle/build.gradle.kts index bc2d21a6851..79926e7de0b 100644 --- a/bundles/aliyun-bundle/build.gradle.kts +++ b/bundles/aliyun-bundle/build.gradle.kts @@ -51,6 +51,7 @@ dependencies { implementation(project(":catalogs:catalog-common")) { exclude("*") } + implementation(project(":clients:client-java-runtime", configuration = "shadow")) } tasks.withType(ShadowJar::class.java) { diff --git a/bundles/aliyun-bundle/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java b/bundles/aliyun-bundle/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java index b47d25335cd..4b5328de544 100644 --- a/bundles/aliyun-bundle/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java +++ b/bundles/aliyun-bundle/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java @@ -22,6 +22,7 @@ import com.google.common.collect.ImmutableMap; import java.io.IOException; import java.util.Map; +import org.apache.gravitino.catalog.hadoop.common.Properties; import org.apache.gravitino.catalog.hadoop.fs.FileSystemProvider; import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; import org.apache.gravitino.storage.OSSProperties; @@ -61,6 +62,13 @@ public FileSystem getFileSystem(Path path, Map config) throws IO } hadoopConfMap.forEach(configuration::set); + + if (config.containsKey(Properties.USE_GRAVITINO_CLOUD_STORE_CREDENTIAL) + && Boolean.parseBoolean(config.get(Properties.USE_GRAVITINO_CLOUD_STORE_CREDENTIAL))) { + configuration.set( + "fs.oss.credentials.provider", OSSSessionCredentialProvider.class.getName()); + } + return AliyunOSSFileSystem.newInstance(path.toUri(), configuration); } diff --git a/bundles/aliyun-bundle/src/main/java/org/apache/gravitino/oss/fs/OSSSessionCredentialProvider.java b/bundles/aliyun-bundle/src/main/java/org/apache/gravitino/oss/fs/OSSSessionCredentialProvider.java new 
file mode 100644 index 00000000000..c8dffbd7a11 --- /dev/null +++ b/bundles/aliyun-bundle/src/main/java/org/apache/gravitino/oss/fs/OSSSessionCredentialProvider.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.gravitino.oss.fs; + +import static org.apache.gravitino.credential.OSSTokenCredential.GRAVITINO_OSS_SESSION_ACCESS_KEY_ID; +import static org.apache.gravitino.credential.OSSTokenCredential.GRAVITINO_OSS_SESSION_SECRET_ACCESS_KEY; +import static org.apache.gravitino.credential.OSSTokenCredential.GRAVITINO_OSS_TOKEN; + +import com.aliyun.oss.common.auth.BasicCredentials; +import com.aliyun.oss.common.auth.Credentials; +import com.aliyun.oss.common.auth.CredentialsProvider; +import java.net.URI; +import java.util.Map; +import org.apache.gravitino.NameIdentifier; +import org.apache.gravitino.client.GravitinoClient; +import org.apache.gravitino.credential.Credential; +import org.apache.gravitino.credential.S3TokenCredential; +import org.apache.gravitino.file.Fileset; +import org.apache.gravitino.file.FilesetCatalog; +import org.apache.hadoop.conf.Configuration; + +public class OSSSessionCredentialProvider implements CredentialsProvider { + + private BasicCredentials basicCredentials; + private String filesetIdentifier; + private long expirationTime; + private GravitinoClient client; + + public OSSSessionCredentialProvider(URI uri, Configuration conf) { + + // extra value and init Gravitino client here + this.filesetIdentifier = conf.get("gravitino.fileset.identifier"); + String metalake = conf.get("fs.gravitino.client.metalake"); + String gravitinoServer = conf.get("fs.gravitino.server.uri"); + + this.client = + GravitinoClient.builder(gravitinoServer).withMetalake(metalake).withSimpleAuth().build(); + } + + @Override + public void setCredentials(Credentials credentials) {} + + @Override + public Credentials getCredentials() { + // If the credentials are null or about to expire, refresh the credentials. 
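Note: in the check that follows, the expiry test runs outside the synchronized block, so two threads that race past it will both call refresh(). A minimal sketch of the double-checked variant (illustrative names, not part of this patch):

    private static final long EXPIRATION_BUFFER_MS = 5 * 60 * 1000;

    private boolean needsRefresh() {
      return basicCredentials == null
          || System.currentTimeMillis() > expirationTime - EXPIRATION_BUFFER_MS;
    }

    @Override
    public Credentials getCredentials() {
      if (needsRefresh()) {
        synchronized (this) {
          if (needsRefresh()) { // re-check after acquiring the lock
            refresh();
          }
        }
      }
      return basicCredentials;
    }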
+ if (basicCredentials == null || System.currentTimeMillis() > expirationTime - 5 * 60 * 1000) { + synchronized (this) { + refresh(); + } + } + + return basicCredentials; + } + + private void refresh() { + // Refresh the credentials + String[] idents = filesetIdentifier.split("\\."); + String catalog = idents[1]; + + FilesetCatalog filesetCatalog = client.loadCatalog(catalog).asFilesetCatalog(); + + @SuppressWarnings("unused") + Fileset fileset = filesetCatalog.loadFileset(NameIdentifier.of(idents[2], idents[3])); + // Should mock + // Credential credentials = fileset.supportsCredentials().getCredential("s3-token"); + + Credential credentials = + new S3TokenCredential("AS", "NF", "FwoGZXIvYXdzEDMaDBf3ltl7HG6K7Ne7QS", 1735033800000L); + + Map credentialMap = credentials.credentialInfo(); + String accessKeyId = credentialMap.get(GRAVITINO_OSS_SESSION_ACCESS_KEY_ID); + String secretAccessKey = credentialMap.get(GRAVITINO_OSS_SESSION_SECRET_ACCESS_KEY); + String sessionToken = credentialMap.get(GRAVITINO_OSS_TOKEN); + + this.basicCredentials = new BasicCredentials(accessKeyId, secretAccessKey, sessionToken); + this.expirationTime = credentials.expireTimeInMs(); + } +} diff --git a/bundles/aws-bundle/build.gradle.kts b/bundles/aws-bundle/build.gradle.kts index 94c7d1cb2ce..31b5a40c09f 100644 --- a/bundles/aws-bundle/build.gradle.kts +++ b/bundles/aws-bundle/build.gradle.kts @@ -41,6 +41,7 @@ dependencies { implementation(project(":catalogs:catalog-common")) { exclude("*") } + implementation(project(":clients:client-java-runtime", configuration = "shadow")) } tasks.withType(ShadowJar::class.java) { diff --git a/bundles/aws-bundle/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java b/bundles/aws-bundle/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java index 0d755c1f564..900c281b408 100644 --- a/bundles/aws-bundle/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java +++ b/bundles/aws-bundle/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java @@ -23,6 +23,7 @@ import com.google.common.collect.ImmutableMap; import java.io.IOException; import java.util.Map; +import org.apache.gravitino.catalog.hadoop.common.Properties; import org.apache.gravitino.catalog.hadoop.fs.FileSystemProvider; import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; import org.apache.gravitino.storage.S3Properties; @@ -48,10 +49,17 @@ public FileSystem getFileSystem(Path path, Map config) throws IO FileSystemUtils.toHadoopConfigMap(config, GRAVITINO_KEY_TO_S3_HADOOP_KEY); if (!hadoopConfMap.containsKey(Constants.AWS_CREDENTIALS_PROVIDER)) { - configuration.set( + hadoopConfMap.put( Constants.AWS_CREDENTIALS_PROVIDER, Constants.ASSUMED_ROLE_CREDENTIALS_DEFAULT); } hadoopConfMap.forEach(configuration::set); + + if (config.containsKey(Properties.USE_GRAVITINO_CLOUD_STORE_CREDENTIAL) + && Boolean.parseBoolean(config.get(Properties.USE_GRAVITINO_CLOUD_STORE_CREDENTIAL))) { + configuration.set( + "fs.s3a.aws.credentials.provider", S3SessionCredentialProvider.class.getName()); + } + return S3AFileSystem.newInstance(path.toUri(), configuration); } diff --git a/bundles/aws-bundle/src/main/java/org/apache/gravitino/s3/fs/S3SessionCredentialProvider.java b/bundles/aws-bundle/src/main/java/org/apache/gravitino/s3/fs/S3SessionCredentialProvider.java new file mode 100644 index 00000000000..ab848c40712 --- /dev/null +++ b/bundles/aws-bundle/src/main/java/org/apache/gravitino/s3/fs/S3SessionCredentialProvider.java @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software 
Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.gravitino.s3.fs; + +import static org.apache.gravitino.credential.S3TokenCredential.GRAVITINO_S3_SESSION_ACCESS_KEY_ID; +import static org.apache.gravitino.credential.S3TokenCredential.GRAVITINO_S3_SESSION_SECRET_ACCESS_KEY; +import static org.apache.gravitino.credential.S3TokenCredential.GRAVITINO_S3_TOKEN; + +import com.amazonaws.auth.AWSCredentials; +import com.amazonaws.auth.AWSCredentialsProvider; +import com.amazonaws.auth.BasicSessionCredentials; +import java.net.URI; +import java.util.Map; +import org.apache.gravitino.NameIdentifier; +import org.apache.gravitino.client.GravitinoClient; +import org.apache.gravitino.credential.Credential; +import org.apache.gravitino.credential.S3TokenCredential; +import org.apache.gravitino.file.Fileset; +import org.apache.gravitino.file.FilesetCatalog; +import org.apache.hadoop.conf.Configuration; + +public class S3SessionCredentialProvider implements AWSCredentialsProvider { + + private final GravitinoClient client; + private final String filesetIdentifier; + + private BasicSessionCredentials basicSessionCredentials; + private long expirationTime; + + public S3SessionCredentialProvider(final URI uri, final Configuration conf) { + // extra value and init Gravitino client here + this.filesetIdentifier = conf.get("gravitino.fileset.identifier"); + String metalake = conf.get("fs.gravitino.client.metalake"); + String gravitinoServer = conf.get("fs.gravitino.server.uri"); + + // TODO, support auth between client and server. 
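Note: the (URI, Configuration) constructor above is the signature hadoop-aws looks for when it reflectively instantiates the class named in fs.s3a.aws.credentials.provider. A client-side wiring sketch (the endpoint, metalake, and identifier values are assumptions):

    import java.net.URI;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;

    public class S3SessionCredentialProviderWiring {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.s3a.aws.credentials.provider",
            "org.apache.gravitino.s3.fs.S3SessionCredentialProvider");
        // The three keys read by the constructor above; the values are assumptions.
        conf.set("gravitino.fileset.identifier", "test_metalake.catalog.schema.fileset");
        conf.set("fs.gravitino.client.metalake", "test_metalake");
        conf.set("fs.gravitino.server.uri", "http://localhost:8090");
        FileSystem fs = FileSystem.get(URI.create("s3a://my-bucket/some/path"), conf);
      }
    }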
+ this.client = + GravitinoClient.builder(gravitinoServer).withMetalake(metalake).withSimpleAuth().build(); + } + + @Override + public AWSCredentials getCredentials() { + + // Refresh credentials if they are null or about to expire in 5 minutes + if (basicSessionCredentials == null + || System.currentTimeMillis() > expirationTime - 5 * 60 * 1000) { + synchronized (this) { + refresh(); + } + } + + return basicSessionCredentials; + } + + @Override + public void refresh() { + // The format of filesetIdentifier is "metalake.catalog.fileset.schema" + String[] idents = filesetIdentifier.split("\\."); + String catalog = idents[1]; + + FilesetCatalog filesetCatalog = client.loadCatalog(catalog).asFilesetCatalog(); + + @SuppressWarnings("unused") + Fileset fileset = filesetCatalog.loadFileset(NameIdentifier.of(idents[2], idents[3])); + // Should mock + // Credential credentials = fileset.supportsCredentials().getCredential("s3-token"); + + Credential credentials = new S3TokenCredential("ASIAZ6", "NFzd", "xx", 1735033800000L); + + Map credentialMap = credentials.credentialInfo(); + String accessKeyId = credentialMap.get(GRAVITINO_S3_SESSION_ACCESS_KEY_ID); + String secretAccessKey = credentialMap.get(GRAVITINO_S3_SESSION_SECRET_ACCESS_KEY); + String sessionToken = credentialMap.get(GRAVITINO_S3_TOKEN); + + this.basicSessionCredentials = + new BasicSessionCredentials(accessKeyId, secretAccessKey, sessionToken); + this.expirationTime = credentials.expireTimeInMs(); + } +} diff --git a/bundles/azure-bundle/build.gradle.kts b/bundles/azure-bundle/build.gradle.kts index 9e4a4add54e..fbce5252643 100644 --- a/bundles/azure-bundle/build.gradle.kts +++ b/bundles/azure-bundle/build.gradle.kts @@ -45,6 +45,7 @@ dependencies { implementation(project(":catalogs:catalog-common")) { exclude("*") } + implementation(project(":clients:client-java-runtime", configuration = "shadow")) } tasks.withType(ShadowJar::class.java) { diff --git a/bundles/azure-bundle/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java b/bundles/azure-bundle/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java index f8924044176..7c407f8f4f2 100644 --- a/bundles/azure-bundle/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java +++ b/bundles/azure-bundle/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java @@ -19,17 +19,22 @@ package org.apache.gravitino.abs.fs; +import static org.apache.hadoop.fs.azurebfs.constants.ConfigurationKeys.FS_AZURE_ACCOUNT_AUTH_TYPE_PROPERTY_NAME; +import static org.apache.hadoop.fs.azurebfs.constants.ConfigurationKeys.FS_AZURE_SAS_TOKEN_PROVIDER_TYPE; + import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableMap; import java.io.IOException; import java.util.Map; import javax.annotation.Nonnull; +import org.apache.gravitino.catalog.hadoop.common.Properties; import org.apache.gravitino.catalog.hadoop.fs.FileSystemProvider; import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; import org.apache.gravitino.storage.AzureProperties; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.azurebfs.services.AuthType; public class AzureFileSystemProvider implements FileSystemProvider { @@ -62,6 +67,17 @@ public FileSystem getFileSystem(@Nonnull Path path, @Nonnull Map configuration.set(ABFS_IMPL_KEY, ABFS_IMPL); } + if (config.containsKey(Properties.USE_GRAVITINO_CLOUD_STORE_CREDENTIAL) + && 
Boolean.parseBoolean(config.get(Properties.USE_GRAVITINO_CLOUD_STORE_CREDENTIAL))) { + String pathString = path.toString(); + String accountSuffix = pathString.split("@")[1].split("/")[0]; + + configuration.set(FS_AZURE_ACCOUNT_AUTH_TYPE_PROPERTY_NAME, AuthType.SAS.name()); + configuration.set( + FS_AZURE_SAS_TOKEN_PROVIDER_TYPE + "." + accountSuffix, + AzureSasCredentialProvider.class.getName()); + } + hadoopConfMap.forEach(configuration::set); return FileSystem.get(path.toUri(), configuration); diff --git a/bundles/azure-bundle/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialProvider.java b/bundles/azure-bundle/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialProvider.java new file mode 100644 index 00000000000..d7e4eddf655 --- /dev/null +++ b/bundles/azure-bundle/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialProvider.java @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.gravitino.abs.fs; + +import static org.apache.gravitino.credential.ADLSTokenCredential.GRAVITINO_ADLS_SAS_TOKEN; + +import java.io.IOException; +import java.util.Map; +import org.apache.gravitino.NameIdentifier; +import org.apache.gravitino.client.GravitinoClient; +import org.apache.gravitino.credential.ADLSTokenCredential; +import org.apache.gravitino.credential.Credential; +import org.apache.gravitino.file.Fileset; +import org.apache.gravitino.file.FilesetCatalog; +import org.apache.hadoop.conf.Configurable; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.azurebfs.extensions.SASTokenProvider; +import org.apache.hadoop.security.AccessControlException; + +public class AzureSasCredentialProvider implements SASTokenProvider, Configurable { + + private Configuration configuration; + + @SuppressWarnings("unused") + private String filesetIdentifier; + + @SuppressWarnings("unused") + private GravitinoClient client; + + private String sasToken; + private long expirationTime; + + @Override + public void setConf(Configuration configuration) { + this.configuration = configuration; + } + + @Override + public Configuration getConf() { + return configuration; + } + + @Override + public void initialize(Configuration conf, String accountName) throws IOException { + this.filesetIdentifier = conf.get("gravitino.fileset.identifier"); + + @SuppressWarnings("unused") + String metalake = conf.get("fs.gravitino.client.metalake"); + @SuppressWarnings("unused") + String gravitinoServer = conf.get("fs.gravitino.server.uri"); + + // TODO, support auth between client and server. 
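Note: for context, the hadoop-azure contract behind this class: when fs.azure.account.auth.type is SAS, ABFS instantiates the class named by fs.azure.sas.token.provider.type, calls initialize(conf, accountName) once, and then getSASToken(account, fileSystem, path, operation) per request. A configuration sketch (the account host below is an assumption):

    Configuration conf = new Configuration();
    conf.set("fs.azure.account.auth.type.myaccount.dfs.core.windows.net", "SAS");
    conf.set("fs.azure.sas.token.provider.type.myaccount.dfs.core.windows.net",
        "org.apache.gravitino.abs.fs.AzureSasCredentialProvider");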
+ this.client = + GravitinoClient.builder(gravitinoServer).withMetalake(metalake).withSimpleAuth().build(); + } + + @Override + public String getSASToken(String account, String fileSystem, String path, String operation) + throws IOException, AccessControlException { + // Refresh credentials if they are null or about to expire in 5 minutes + if (sasToken == null || System.currentTimeMillis() > expirationTime - 5 * 60 * 1000) { + synchronized (this) { + refresh(); + } + } + return sasToken; + } + + private void refresh() { + // The format of filesetIdentifier is "metalake.catalog.fileset.schema" + String[] idents = filesetIdentifier.split("\\."); + String catalog = idents[1]; + + FilesetCatalog filesetCatalog = client.loadCatalog(catalog).asFilesetCatalog(); + + @SuppressWarnings("unused") + Fileset fileset = filesetCatalog.loadFileset(NameIdentifier.of(idents[2], idents[3])); + // Should mock + // Credential credentials = fileset.supportsCredentials().getCredential("s3-token"); + + Credential credential = new ADLSTokenCredential("xxx", "xxx", 1L); + + Map credentialMap = credential.credentialInfo(); + this.sasToken = credentialMap.get(GRAVITINO_ADLS_SAS_TOKEN); + this.expirationTime = credential.expireTimeInMs(); + } +} diff --git a/catalogs/catalog-hadoop/src/test/java/org/apache/gravitino/catalog/hadoop/integration/test/HadoopABSCatalogIT.java b/catalogs/catalog-hadoop/src/test/java/org/apache/gravitino/catalog/hadoop/integration/test/HadoopABSCatalogIT.java index 482daba2e3c..ec5b5bd4d5b 100644 --- a/catalogs/catalog-hadoop/src/test/java/org/apache/gravitino/catalog/hadoop/integration/test/HadoopABSCatalogIT.java +++ b/catalogs/catalog-hadoop/src/test/java/org/apache/gravitino/catalog/hadoop/integration/test/HadoopABSCatalogIT.java @@ -140,6 +140,8 @@ public void testCreateSchemaAndFilesetWithSpecialLocation() { catalogProps.put("location", ossLocation); catalogProps.put(AzureProperties.GRAVITINO_AZURE_STORAGE_ACCOUNT_NAME, ABS_ACCOUNT_NAME); catalogProps.put(AzureProperties.GRAVITINO_AZURE_STORAGE_ACCOUNT_KEY, ABS_ACCOUNT_KEY); + catalogProps.put("gravitino.client.useCloudStoreCredential", "true"); + catalogProps.put(FILESYSTEM_PROVIDERS, AzureFileSystemProvider.ABS_PROVIDER_NAME); Catalog localCatalog = diff --git a/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/common/Properties.java b/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/common/Properties.java new file mode 100644 index 00000000000..d1f9650a774 --- /dev/null +++ b/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/common/Properties.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + + package org.apache.gravitino.catalog.hadoop.common; + + public class Properties { + + // The key that indicates whether to use the Gravitino Cloud Store credential. + public static final String USE_GRAVITINO_CLOUD_STORE_CREDENTIAL = + "fs.gravitino.client.useCloudStoreCredential"; + + // The default value of the key that indicates whether to use the Gravitino Cloud Store credential. + public static final boolean DEFAULT_USE_GRAVITINO_CLOUD_STORE_CREDENTIAL = true; +} diff --git a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java index e18e376b46c..4984191bd2c 100644 --- a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java +++ b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java @@ -45,6 +45,7 @@ import org.apache.gravitino.audit.FilesetAuditConstants; import org.apache.gravitino.audit.FilesetDataOperation; import org.apache.gravitino.audit.InternalClientType; +import org.apache.gravitino.catalog.hadoop.common.Properties; import org.apache.gravitino.catalog.hadoop.fs.FileSystemProvider; import org.apache.gravitino.client.DefaultOAuth2TokenProvider; import org.apache.gravitino.client.GravitinoClient; @@ -78,7 +79,7 @@ public class GravitinoVirtualFileSystem extends FileSystem { private String metalakeName; private Cache catalogCache; private ScheduledThreadPoolExecutor catalogCleanScheduler; - private Cache internalFileSystemCache; + private Cache internalFileSystemCache; private ScheduledThreadPoolExecutor internalFileSystemCleanScheduler; // The pattern is used to match gvfs path. The scheme prefix (gvfs://fileset) is optional. @@ -144,7 +145,7 @@ public void initialize(URI name, Configuration configuration) throws IOException } @VisibleForTesting - Cache internalFileSystemCache() { + Cache internalFileSystemCache() { return internalFileSystemCache; } @@ -382,7 +383,7 @@ private FilesetContextPair getFilesetContext(Path virtualPath, FilesetDataOperat StringUtils.isNotBlank(scheme), "Scheme of the actual file location cannot be null."); FileSystem fs = internalFileSystemCache.get( - scheme, + identifier, str -> { try { FileSystemProvider provider = fileSystemProvidersMap.get(scheme); @@ -393,6 +394,12 @@ private FilesetContextPair getFilesetContext(Path virtualPath, FilesetDataOperat } Map maps = getConfigMap(getConf()); + if (maps.containsKey(Properties.USE_GRAVITINO_CLOUD_STORE_CREDENTIAL) + && maps.get(Properties.USE_GRAVITINO_CLOUD_STORE_CREDENTIAL).equals("true")) { + // If the cloud store credential is enabled, pass the fileset identifier through the configuration here. 
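Note: the identifier passed down in the maps.put(...) below is the fileset's full name, in the order metalake.catalog.schema.fileset; the credential providers re-parse it with split("\\."). Comments elsewhere in this patch describe the order as "metalake.catalog.fileset.schema", which does not match the parsing. A sketch of that parsing, with an assumed example value:

    String filesetIdentifier = "test_metalake.catalog.schema.fileset"; // assumed example
    String[] idents = filesetIdentifier.split("\\.");
    String catalog = idents[1];                                        // "catalog"
    NameIdentifier fileset = NameIdentifier.of(idents[2], idents[3]);  // schema, fileset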
+ maps.put("gravitino.fileset.identifier", identifier.toString()); + } + return provider.getFileSystem(filePath, maps); } catch (IOException ioe) { throw new GravitinoRuntimeException( diff --git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestGvfsBase.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestGvfsBase.java index e7e3b7857f5..5b10accb2de 100644 --- a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestGvfsBase.java +++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestGvfsBase.java @@ -99,6 +99,7 @@ public void init() { } @Test + @Disabled public void testFSCache() throws IOException { String filesetName = "testFSCache"; Path managedFilesetPath = @@ -149,7 +150,7 @@ public void testFSCache() throws IOException { Objects.requireNonNull( ((GravitinoVirtualFileSystem) gravitinoFileSystem) .internalFileSystemCache() - .getIfPresent("file")); + .getIfPresent(NameIdentifier.of("file"))); String anotherFilesetName = "test_new_fs"; Path diffLocalPath = @@ -162,6 +163,7 @@ public void testFSCache() throws IOException { } @Test + @Disabled public void testInternalCache() throws IOException { Path localPath1 = FileSystemTestUtils.createLocalDirPrefix(catalogName, schemaName, "fileset1"); Path filesetPath1 = @@ -199,7 +201,10 @@ public void testInternalCache() throws IOException { 0, ((GravitinoVirtualFileSystem) fs).internalFileSystemCache().asMap().size())); - assertNull(((GravitinoVirtualFileSystem) fs).internalFileSystemCache().getIfPresent("file")); + assertNull( + ((GravitinoVirtualFileSystem) fs) + .internalFileSystemCache() + .getIfPresent(NameIdentifier.of("file"))); } } diff --git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemRealS3IT.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemRealS3IT.java new file mode 100644 index 00000000000..3c39a172bc7 --- /dev/null +++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemRealS3IT.java @@ -0,0 +1,152 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + + package org.apache.gravitino.filesystem.hadoop.integration.test; + + import static org.apache.gravitino.catalog.hadoop.HadoopCatalogPropertiesMetadata.FILESYSTEM_PROVIDERS; + + import com.google.common.collect.Maps; + import java.io.IOException; + import java.util.Collections; + import java.util.Map; + import org.apache.gravitino.Catalog; + import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; + import org.apache.gravitino.integration.test.util.GravitinoITUtils; + import org.apache.gravitino.s3.fs.S3FileSystemProvider; + import org.apache.gravitino.storage.S3Properties; + import org.apache.hadoop.conf.Configuration; + import org.junit.jupiter.api.AfterAll; + import org.junit.jupiter.api.Assertions; + import org.junit.jupiter.api.BeforeAll; + import org.junit.jupiter.api.Disabled; + import org.slf4j.Logger; + import org.slf4j.LoggerFactory; + + public class GravitinoVirtualFileSystemRealS3IT extends GravitinoVirtualFileSystemIT { + private static final Logger LOG = + LoggerFactory.getLogger(GravitinoVirtualFileSystemRealS3IT.class); + + public static final String BUCKET_NAME = System.getenv("S3_BUCKET_NAME"); + public static final String S3_ACCESS_KEY = System.getenv("S3_ACCESS_KEY_ID"); + public static final String S3_SECRET_KEY = System.getenv("S3_SECRET_ACCESS_KEY"); + public static final String S3_ENDPOINT = System.getenv("S3_ENDPOINT"); + + @BeforeAll + public void startIntegrationTest() { + // Do nothing + } + + @BeforeAll + public void startUp() throws Exception { + copyBundleJarsToHadoop("aws-bundle"); + + // Need to download jars to the Gravitino server + super.startIntegrationTest(); + + // This value can be tuned by the user; please change it accordingly. + defaultBockSize = 32 * 1024 * 1024; + + // The replication factor is 1 for S3 + defaultReplication = 1; + + metalakeName = GravitinoITUtils.genRandomName("gvfs_it_metalake"); + catalogName = GravitinoITUtils.genRandomName("catalog"); + schemaName = GravitinoITUtils.genRandomName("schema"); + + Assertions.assertFalse(client.metalakeExists(metalakeName)); + metalake = client.createMetalake(metalakeName, "metalake comment", Collections.emptyMap()); + Assertions.assertTrue(client.metalakeExists(metalakeName)); + + Map properties = Maps.newHashMap(); + properties.put("gravitino.bypass.fs.s3a.access.key", S3_ACCESS_KEY); + properties.put("gravitino.bypass.fs.s3a.secret.key", S3_SECRET_KEY); + properties.put("gravitino.bypass.fs.s3a.endpoint", S3_ENDPOINT); + properties.put( + "gravitino.bypass.fs.s3a.aws.credentials.provider", + "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider"); + properties.put(FILESYSTEM_PROVIDERS, "s3"); + + Catalog catalog = + metalake.createCatalog( + catalogName, Catalog.Type.FILESET, "hadoop", "catalog comment", properties); + Assertions.assertTrue(metalake.catalogExists(catalogName)); + + catalog.asSchemas().createSchema(schemaName, "schema comment", properties); + Assertions.assertTrue(catalog.asSchemas().schemaExists(schemaName)); + + conf.set("fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem"); + conf.set("fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs"); + conf.set("fs.gvfs.impl.disable.cache", "true"); + conf.set("fs.gravitino.server.uri", serverUri); + conf.set("fs.gravitino.client.metalake", metalakeName); + + // Pass this configuration to the real file system + conf.set(S3Properties.GRAVITINO_S3_SECRET_ACCESS_KEY, S3_SECRET_KEY); + conf.set(S3Properties.GRAVITINO_S3_ACCESS_KEY_ID, S3_ACCESS_KEY); + conf.set(S3Properties.GRAVITINO_S3_ENDPOINT, 
S3_ENDPOINT); + + conf.set("fs.gravitino.client.useCloudStoreCredential", "true"); + } + + @AfterAll + public void tearDown() throws IOException { + Catalog catalog = metalake.loadCatalog(catalogName); + catalog.asSchemas().dropSchema(schemaName, true); + metalake.dropCatalog(catalogName, true); + client.dropMetalake(metalakeName, true); + + if (client != null) { + client.close(); + client = null; + } + + try { + closer.close(); + } catch (Exception e) { + LOG.error("Exception in closing CloseableGroup", e); + } + } + + /** + * Remove the `gravitino.bypass` prefix from the configuration and pass it to the real file system. + * This method corresponds to the method org.apache.gravitino.filesystem.hadoop + * .GravitinoVirtualFileSystem#getConfigMap(Configuration) in the original code. + */ + protected Configuration convertGvfsConfigToRealFileSystemConfig(Configuration gvfsConf) { + Configuration s3Conf = new Configuration(); + Map map = Maps.newHashMap(); + + gvfsConf.forEach(entry -> map.put(entry.getKey(), entry.getValue())); + + Map hadoopConfMap = + FileSystemUtils.toHadoopConfigMap(map, S3FileSystemProvider.GRAVITINO_KEY_TO_S3_HADOOP_KEY); + + hadoopConfMap.forEach(s3Conf::set); + + return s3Conf; + } + + protected String genStorageLocation(String fileset) { + return String.format("s3a://%s/%s", BUCKET_NAME, fileset); + } + + @Disabled( + "S3A does not support append, java.io.IOException: The append operation is not supported") + public void testAppend() throws IOException {} +} diff --git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemS3IT.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemS3IT.java index 4bb6ad38dcd..f45e4d3b6b1 100644 --- a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemS3IT.java +++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemS3IT.java @@ -156,6 +156,8 @@ public void startUp() throws Exception { conf.set(S3Properties.GRAVITINO_S3_SECRET_ACCESS_KEY, accessKey); conf.set(S3Properties.GRAVITINO_S3_ACCESS_KEY_ID, secretKey); conf.set(S3Properties.GRAVITINO_S3_ENDPOINT, s3Endpoint); + + conf.set("fs.gravitino.client.useCloudStoreCredential", "true"); } @AfterAll From 5b648e803cfec82fa7035480ce070e3636eb77be Mon Sep 17 00:00:00 2001 From: yuqi Date: Thu, 26 Dec 2024 20:21:56 +0800 Subject: [PATCH 02/59] Fix again. 
--- .../credential/ADLSTokenCredential.java | 1 + bundles/aliyun-bundle/build.gradle.kts | 1 + .../oss/fs/OSSFileSystemProvider.java | 16 +- .../oss/fs/OSSSessionCredentialProvider.java | 61 +++--- bundles/aws-bundle/build.gradle.kts | 1 + .../gravitino/s3/fs/S3FileSystemProvider.java | 16 +- .../s3/fs/S3SessionCredentialProvider.java | 55 ++++-- bundles/azure-bundle/build.gradle.kts | 2 + .../abs/fs/AzureFileSystemProvider.java | 43 +++-- .../abs/fs/AzureSasCredentialProvider.java | 72 ++++--- .../hadoop/GravitinoVirtualFileSystem.java | 41 ++-- ...avitinoVirtualFileSystemConfiguration.java | 2 + ...itinoVirtualFileSystemABSCredentialIT.java | 176 ++++++++++++++++++ .../test/GravitinoVirtualFileSystemIT.java | 2 +- ...itinoVirtualFileSystemOSSCredentialIT.java | 166 +++++++++++++++++ .../GravitinoVirtualFileSystemRealS3IT.java | 26 ++- .../test/GravitinoVirtualFileSystemS3IT.java | 2 - 17 files changed, 561 insertions(+), 122 deletions(-) create mode 100644 clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemABSCredentialIT.java create mode 100644 clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemOSSCredentialIT.java diff --git a/api/src/main/java/org/apache/gravitino/credential/ADLSTokenCredential.java b/api/src/main/java/org/apache/gravitino/credential/ADLSTokenCredential.java index 25c83c2f7cc..5abe6239a3d 100644 --- a/api/src/main/java/org/apache/gravitino/credential/ADLSTokenCredential.java +++ b/api/src/main/java/org/apache/gravitino/credential/ADLSTokenCredential.java @@ -74,6 +74,7 @@ public long expireTimeInMs() { public Map credentialInfo() { return (new ImmutableMap.Builder()) .put(GRAVITINO_ADLS_SAS_TOKEN, sasToken) + .put(GRAVITINO_AZURE_STORAGE_ACCOUNT_NAME, accountName) .build(); } diff --git a/bundles/aliyun-bundle/build.gradle.kts b/bundles/aliyun-bundle/build.gradle.kts index 79926e7de0b..39883feef7a 100644 --- a/bundles/aliyun-bundle/build.gradle.kts +++ b/bundles/aliyun-bundle/build.gradle.kts @@ -52,6 +52,7 @@ dependencies { exclude("*") } implementation(project(":clients:client-java-runtime", configuration = "shadow")) + implementation(project(":clients:filesystem-hadoop3-runtime", configuration = "shadow")) } tasks.withType(ShadowJar::class.java) { diff --git a/bundles/aliyun-bundle/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java b/bundles/aliyun-bundle/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java index 4b5328de544..4c3ba0d19b8 100644 --- a/bundles/aliyun-bundle/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java +++ b/bundles/aliyun-bundle/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java @@ -22,9 +22,9 @@ import com.google.common.collect.ImmutableMap; import java.io.IOException; import java.util.Map; -import org.apache.gravitino.catalog.hadoop.common.Properties; import org.apache.gravitino.catalog.hadoop.fs.FileSystemProvider; import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; +import org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystemConfiguration; import org.apache.gravitino.storage.OSSProperties; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; @@ -61,14 +61,16 @@ public FileSystem getFileSystem(Path path, Map config) throws IO hadoopConfMap.put(OSS_FILESYSTEM_IMPL, AliyunOSSFileSystem.class.getCanonicalName()); } - hadoopConfMap.forEach(configuration::set); - - if 
(config.containsKey(Properties.USE_GRAVITINO_CLOUD_STORE_CREDENTIAL) - && Boolean.parseBoolean(config.get(Properties.USE_GRAVITINO_CLOUD_STORE_CREDENTIAL))) { - configuration.set( - "fs.oss.credentials.provider", OSSSessionCredentialProvider.class.getName()); + if (!hadoopConfMap.containsKey(Constants.CREDENTIALS_PROVIDER_KEY) + && config.containsKey( + GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_SERVER_URI_KEY)) { + hadoopConfMap.put( + Constants.CREDENTIALS_PROVIDER_KEY, + OSSSessionCredentialProvider.class.getCanonicalName()); } + hadoopConfMap.forEach(configuration::set); + return AliyunOSSFileSystem.newInstance(path.toUri(), configuration); } diff --git a/bundles/aliyun-bundle/src/main/java/org/apache/gravitino/oss/fs/OSSSessionCredentialProvider.java b/bundles/aliyun-bundle/src/main/java/org/apache/gravitino/oss/fs/OSSSessionCredentialProvider.java index c8dffbd7a11..3a67e6f48f1 100644 --- a/bundles/aliyun-bundle/src/main/java/org/apache/gravitino/oss/fs/OSSSessionCredentialProvider.java +++ b/bundles/aliyun-bundle/src/main/java/org/apache/gravitino/oss/fs/OSSSessionCredentialProvider.java @@ -26,32 +26,35 @@ import com.aliyun.oss.common.auth.BasicCredentials; import com.aliyun.oss.common.auth.Credentials; import com.aliyun.oss.common.auth.CredentialsProvider; +import com.aliyun.oss.common.auth.DefaultCredentials; import java.net.URI; import java.util.Map; import org.apache.gravitino.NameIdentifier; import org.apache.gravitino.client.GravitinoClient; import org.apache.gravitino.credential.Credential; -import org.apache.gravitino.credential.S3TokenCredential; +import org.apache.gravitino.credential.OSSTokenCredential; import org.apache.gravitino.file.Fileset; import org.apache.gravitino.file.FilesetCatalog; +import org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem; +import org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystemConfiguration; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.aliyun.oss.Constants; public class OSSSessionCredentialProvider implements CredentialsProvider { - private BasicCredentials basicCredentials; - private String filesetIdentifier; + private Credentials basicCredentials; + private final String filesetIdentifier; private long expirationTime; - private GravitinoClient client; + private final GravitinoClient client; + private final Configuration configuration; public OSSSessionCredentialProvider(URI uri, Configuration conf) { - + this.filesetIdentifier = + conf.get(GravitinoVirtualFileSystemConfiguration.GVFS_FILESET_IDENTIFIER); // extra value and init Gravitino client here - this.filesetIdentifier = conf.get("gravitino.fileset.identifier"); - String metalake = conf.get("fs.gravitino.client.metalake"); - String gravitinoServer = conf.get("fs.gravitino.server.uri"); - - this.client = - GravitinoClient.builder(gravitinoServer).withMetalake(metalake).withSimpleAuth().build(); + GravitinoVirtualFileSystem gravitinoVirtualFileSystem = new GravitinoVirtualFileSystem(); + this.client = gravitinoVirtualFileSystem.initializeClient(conf); + this.configuration = conf; } @Override @@ -70,26 +73,42 @@ public Credentials getCredentials() { } private void refresh() { - // Refresh the credentials String[] idents = filesetIdentifier.split("\\."); String catalog = idents[1]; FilesetCatalog filesetCatalog = client.loadCatalog(catalog).asFilesetCatalog(); - @SuppressWarnings("unused") Fileset fileset = filesetCatalog.loadFileset(NameIdentifier.of(idents[2], idents[3])); - // Should mock - // Credential credentials 
= fileset.supportsCredentials().getCredential("s3-token"); + // Use dynamic credential by default. + + Credential[] credentials = fileset.supportsCredentials().getCredentials(); + if (credentials.length == 0) { + expirationTime = Long.MAX_VALUE; + this.basicCredentials = + new DefaultCredentials( + configuration.get(Constants.ACCESS_KEY_ID), + configuration.get(Constants.ACCESS_KEY_SECRET)); + return; + } - Credential credentials = - new S3TokenCredential("AS", "NF", "FwoGZXIvYXdzEDMaDBf3ltl7HG6K7Ne7QS", 1735033800000L); + // Use the first one. + Credential credential = credentials[0]; + Map credentialMap = credential.toProperties(); - Map credentialMap = credentials.credentialInfo(); String accessKeyId = credentialMap.get(GRAVITINO_OSS_SESSION_ACCESS_KEY_ID); String secretAccessKey = credentialMap.get(GRAVITINO_OSS_SESSION_SECRET_ACCESS_KEY); - String sessionToken = credentialMap.get(GRAVITINO_OSS_TOKEN); - this.basicCredentials = new BasicCredentials(accessKeyId, secretAccessKey, sessionToken); - this.expirationTime = credentials.expireTimeInMs(); + if (OSSTokenCredential.OSS_TOKEN_CREDENTIAL_TYPE.equals( + credentialMap.get(Credential.CREDENTIAL_TYPE))) { + String sessionToken = credentialMap.get(GRAVITINO_OSS_TOKEN); + this.basicCredentials = new BasicCredentials(accessKeyId, secretAccessKey, sessionToken); + } else { + this.basicCredentials = new DefaultCredentials(accessKeyId, secretAccessKey); + } + + this.expirationTime = credential.expireTimeInMs(); + if (expirationTime <= 0) { + expirationTime = Long.MAX_VALUE; + } } } diff --git a/bundles/aws-bundle/build.gradle.kts b/bundles/aws-bundle/build.gradle.kts index 5b1c810dcc4..3c2a6a867c1 100644 --- a/bundles/aws-bundle/build.gradle.kts +++ b/bundles/aws-bundle/build.gradle.kts @@ -43,6 +43,7 @@ dependencies { exclude("*") } implementation(project(":clients:client-java-runtime", configuration = "shadow")) + implementation(project(":clients:filesystem-hadoop3-runtime", configuration = "shadow")) } tasks.withType(ShadowJar::class.java) { diff --git a/bundles/aws-bundle/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java b/bundles/aws-bundle/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java index 900c281b408..152442a86d4 100644 --- a/bundles/aws-bundle/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java +++ b/bundles/aws-bundle/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java @@ -23,9 +23,9 @@ import com.google.common.collect.ImmutableMap; import java.io.IOException; import java.util.Map; -import org.apache.gravitino.catalog.hadoop.common.Properties; import org.apache.gravitino.catalog.hadoop.fs.FileSystemProvider; import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; +import org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystemConfiguration; import org.apache.gravitino.storage.S3Properties; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; @@ -48,18 +48,14 @@ public FileSystem getFileSystem(Path path, Map config) throws IO Map hadoopConfMap = FileSystemUtils.toHadoopConfigMap(config, GRAVITINO_KEY_TO_S3_HADOOP_KEY); - if (!hadoopConfMap.containsKey(Constants.AWS_CREDENTIALS_PROVIDER)) { + if (!hadoopConfMap.containsKey(Constants.AWS_CREDENTIALS_PROVIDER) + && config.containsKey( + GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_SERVER_URI_KEY)) { hadoopConfMap.put( - Constants.AWS_CREDENTIALS_PROVIDER, Constants.ASSUMED_ROLE_CREDENTIALS_DEFAULT); - } - hadoopConfMap.forEach(configuration::set); - - if 
(config.containsKey(Properties.USE_GRAVITINO_CLOUD_STORE_CREDENTIAL) - && Boolean.parseBoolean(config.get(Properties.USE_GRAVITINO_CLOUD_STORE_CREDENTIAL))) { - configuration.set( - "fs.s3a.aws.credentials.provider", S3SessionCredentialProvider.class.getName()); + Constants.AWS_CREDENTIALS_PROVIDER, S3SessionCredentialProvider.class.getCanonicalName()); } + hadoopConfMap.forEach(configuration::set); return S3AFileSystem.newInstance(path.toUri(), configuration); } diff --git a/bundles/aws-bundle/src/main/java/org/apache/gravitino/s3/fs/S3SessionCredentialProvider.java b/bundles/aws-bundle/src/main/java/org/apache/gravitino/s3/fs/S3SessionCredentialProvider.java index ab848c40712..1a0e3a9c444 100644 --- a/bundles/aws-bundle/src/main/java/org/apache/gravitino/s3/fs/S3SessionCredentialProvider.java +++ b/bundles/aws-bundle/src/main/java/org/apache/gravitino/s3/fs/S3SessionCredentialProvider.java @@ -25,6 +25,7 @@ import com.amazonaws.auth.AWSCredentials; import com.amazonaws.auth.AWSCredentialsProvider; +import com.amazonaws.auth.BasicAWSCredentials; import com.amazonaws.auth.BasicSessionCredentials; import java.net.URI; import java.util.Map; @@ -34,30 +35,32 @@ import org.apache.gravitino.credential.S3TokenCredential; import org.apache.gravitino.file.Fileset; import org.apache.gravitino.file.FilesetCatalog; +import org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem; +import org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystemConfiguration; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.s3a.Constants; public class S3SessionCredentialProvider implements AWSCredentialsProvider { private final GravitinoClient client; private final String filesetIdentifier; + private final Configuration configuration; - private BasicSessionCredentials basicSessionCredentials; + private AWSCredentials basicSessionCredentials; private long expirationTime; public S3SessionCredentialProvider(final URI uri, final Configuration conf) { - // extra value and init Gravitino client here - this.filesetIdentifier = conf.get("gravitino.fileset.identifier"); - String metalake = conf.get("fs.gravitino.client.metalake"); - String gravitinoServer = conf.get("fs.gravitino.server.uri"); + this.filesetIdentifier = + conf.get(GravitinoVirtualFileSystemConfiguration.GVFS_FILESET_IDENTIFIER); + this.configuration = conf; - // TODO, support auth between client and server. - this.client = - GravitinoClient.builder(gravitinoServer).withMetalake(metalake).withSimpleAuth().build(); + // extra value and init Gravitino client here + GravitinoVirtualFileSystem gravitinoVirtualFileSystem = new GravitinoVirtualFileSystem(); + this.client = gravitinoVirtualFileSystem.initializeClient(conf); } @Override public AWSCredentials getCredentials() { - // Refresh credentials if they are null or about to expire in 5 minutes if (basicSessionCredentials == null || System.currentTimeMillis() > expirationTime - 5 * 60 * 1000) { @@ -77,20 +80,36 @@ public void refresh() { FilesetCatalog filesetCatalog = client.loadCatalog(catalog).asFilesetCatalog(); - @SuppressWarnings("unused") Fileset fileset = filesetCatalog.loadFileset(NameIdentifier.of(idents[2], idents[3])); - // Should mock - // Credential credentials = fileset.supportsCredentials().getCredential("s3-token"); + Credential[] credentials = fileset.supportsCredentials().getCredentials(); + + // Can't find any credential, use the default one. 
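Note: in the fallback below, when the fileset exposes no credentials, the provider pins the static keys from the Hadoop configuration and pushes the expiry to Long.MAX_VALUE so getCredentials() never re-enters refresh(). Spelled out with the literal key names behind the hadoop-aws constants:

    AWSCredentials fallback =
        new BasicAWSCredentials(
            configuration.get("fs.s3a.access.key"),   // Constants.ACCESS_KEY
            configuration.get("fs.s3a.secret.key"));  // Constants.SECRET_KEY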
+ if (credentials.length == 0) { + expirationTime = Long.MAX_VALUE; + this.basicSessionCredentials = + new BasicAWSCredentials( + configuration.get(Constants.ACCESS_KEY), configuration.get(Constants.SECRET_KEY)); + return; + } - Credential credentials = new S3TokenCredential("ASIAZ6", "NFzd", "xx", 1735033800000L); + Credential credential = credentials[0]; + Map credentialMap = credential.toProperties(); - Map credentialMap = credentials.credentialInfo(); String accessKeyId = credentialMap.get(GRAVITINO_S3_SESSION_ACCESS_KEY_ID); String secretAccessKey = credentialMap.get(GRAVITINO_S3_SESSION_SECRET_ACCESS_KEY); - String sessionToken = credentialMap.get(GRAVITINO_S3_TOKEN); - this.basicSessionCredentials = - new BasicSessionCredentials(accessKeyId, secretAccessKey, sessionToken); - this.expirationTime = credentials.expireTimeInMs(); + if (S3TokenCredential.S3_TOKEN_CREDENTIAL_TYPE.equals( + credentialMap.get(Credential.CREDENTIAL_TYPE))) { + String sessionToken = credentialMap.get(GRAVITINO_S3_TOKEN); + this.basicSessionCredentials = + new BasicSessionCredentials(accessKeyId, secretAccessKey, sessionToken); + } else { + this.basicSessionCredentials = new BasicAWSCredentials(accessKeyId, secretAccessKey); + } + + this.expirationTime = credential.expireTimeInMs(); + if (expirationTime <= 0) { + expirationTime = Long.MAX_VALUE; + } } } diff --git a/bundles/azure-bundle/build.gradle.kts b/bundles/azure-bundle/build.gradle.kts index fbce5252643..f9ce722b807 100644 --- a/bundles/azure-bundle/build.gradle.kts +++ b/bundles/azure-bundle/build.gradle.kts @@ -46,6 +46,8 @@ dependencies { exclude("*") } implementation(project(":clients:client-java-runtime", configuration = "shadow")) + + implementation(project(":clients:filesystem-hadoop3-runtime", configuration = "shadow")) } tasks.withType(ShadowJar::class.java) { diff --git a/bundles/azure-bundle/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java b/bundles/azure-bundle/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java index 7c407f8f4f2..e37cf75d94c 100644 --- a/bundles/azure-bundle/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java +++ b/bundles/azure-bundle/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java @@ -27,17 +27,21 @@ import java.io.IOException; import java.util.Map; import javax.annotation.Nonnull; -import org.apache.gravitino.catalog.hadoop.common.Properties; import org.apache.gravitino.catalog.hadoop.fs.FileSystemProvider; import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; +import org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystemConfiguration; import org.apache.gravitino.storage.AzureProperties; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.azurebfs.services.AuthType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class AzureFileSystemProvider implements FileSystemProvider { + private static final Logger LOGGER = LoggerFactory.getLogger(AzureFileSystemProvider.class); + @VisibleForTesting public static final String ABS_PROVIDER_SCHEME = "abfss"; @VisibleForTesting public static final String ABS_PROVIDER_NAME = "abs"; @@ -67,19 +71,36 @@ public FileSystem getFileSystem(@Nonnull Path path, @Nonnull Map configuration.set(ABFS_IMPL_KEY, ABFS_IMPL); } - if (config.containsKey(Properties.USE_GRAVITINO_CLOUD_STORE_CREDENTIAL) - && Boolean.parseBoolean(config.get(Properties.USE_GRAVITINO_CLOUD_STORE_CREDENTIAL))) { - 
String pathString = path.toString(); - String accountSuffix = pathString.split("@")[1].split("/")[0]; + hadoopConfMap.forEach(configuration::set); - configuration.set(FS_AZURE_ACCOUNT_AUTH_TYPE_PROPERTY_NAME, AuthType.SAS.name()); - configuration.set( - FS_AZURE_SAS_TOKEN_PROVIDER_TYPE + "." + accountSuffix, - AzureSasCredentialProvider.class.getName()); + // Check whether this is from GVFS client. + if (config.containsKey(GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_SERVER_URI_KEY)) { + // Test whether SAS works + try { + AzureSasCredentialProvider azureSasCredentialProvider = new AzureSasCredentialProvider(); + azureSasCredentialProvider.initialize(configuration, null); + String sas = azureSasCredentialProvider.getSASToken(null, null, null, null); + if (sas != null) { + configuration.set(FS_AZURE_ACCOUNT_AUTH_TYPE_PROPERTY_NAME, AuthType.SAS.name()); + configuration.set( + FS_AZURE_SAS_TOKEN_PROVIDER_TYPE + ".dfs.core.windows.net", + AzureSasCredentialProvider.class.getName()); + } else if (azureSasCredentialProvider.getAzureStorageAccountKey() != null + && azureSasCredentialProvider.getAzureStorageAccountName() != null) { + configuration.set( + String.format( + "fs.azure.account.key.%s.dfs.core.windows.net", + azureSasCredentialProvider.getAzureStorageAccountName()), + azureSasCredentialProvider.getAzureStorageAccountKey()); + } + } catch (Exception e) { + // Can't use SAS, use account key and account key instead + LOGGER.warn( + "Failed to use SAS token and user account from credential provider, use default conf. ", + e); + } } - hadoopConfMap.forEach(configuration::set); - return FileSystem.get(path.toUri(), configuration); } diff --git a/bundles/azure-bundle/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialProvider.java b/bundles/azure-bundle/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialProvider.java index d7e4eddf655..aa32cb39c6d 100644 --- a/bundles/azure-bundle/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialProvider.java +++ b/bundles/azure-bundle/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialProvider.java @@ -20,6 +20,8 @@ package org.apache.gravitino.abs.fs; import static org.apache.gravitino.credential.ADLSTokenCredential.GRAVITINO_ADLS_SAS_TOKEN; +import static org.apache.gravitino.credential.AzureAccountKeyCredential.GRAVITINO_AZURE_STORAGE_ACCOUNT_KEY; +import static org.apache.gravitino.credential.AzureAccountKeyCredential.GRAVITINO_AZURE_STORAGE_ACCOUNT_NAME; import java.io.IOException; import java.util.Map; @@ -29,24 +31,43 @@ import org.apache.gravitino.credential.Credential; import org.apache.gravitino.file.Fileset; import org.apache.gravitino.file.FilesetCatalog; +import org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem; +import org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystemConfiguration; import org.apache.hadoop.conf.Configurable; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.azurebfs.extensions.SASTokenProvider; -import org.apache.hadoop.security.AccessControlException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class AzureSasCredentialProvider implements SASTokenProvider, Configurable { + private static final Logger LOGGER = LoggerFactory.getLogger(AzureSasCredentialProvider.class); + private Configuration configuration; - @SuppressWarnings("unused") private String filesetIdentifier; - @SuppressWarnings("unused") private GravitinoClient client; private String sasToken; + + private String azureStorageAccountName; + private 
String azureStorageAccountKey; + private long expirationTime; + public String getSasToken() { + return sasToken; + } + + public String getAzureStorageAccountName() { + return azureStorageAccountName; + } + + public String getAzureStorageAccountKey() { + return azureStorageAccountKey; + } + @Override public void setConf(Configuration configuration) { this.configuration = configuration; @@ -59,21 +80,16 @@ public Configuration getConf() { @Override public void initialize(Configuration conf, String accountName) throws IOException { - this.filesetIdentifier = conf.get("gravitino.fileset.identifier"); - - @SuppressWarnings("unused") - String metalake = conf.get("fs.gravitino.client.metalake"); - @SuppressWarnings("unused") - String gravitinoServer = conf.get("fs.gravitino.server.uri"); + this.filesetIdentifier = + conf.get(GravitinoVirtualFileSystemConfiguration.GVFS_FILESET_IDENTIFIER); - // TODO, support auth between client and server. - this.client = - GravitinoClient.builder(gravitinoServer).withMetalake(metalake).withSimpleAuth().build(); + // extra value and init Gravitino client here + GravitinoVirtualFileSystem gravitinoVirtualFileSystem = new GravitinoVirtualFileSystem(); + this.client = gravitinoVirtualFileSystem.initializeClient(conf); } @Override - public String getSASToken(String account, String fileSystem, String path, String operation) - throws IOException, AccessControlException { + public String getSASToken(String account, String fileSystem, String path, String operation) { // Refresh credentials if they are null or about to expire in 5 minutes if (sasToken == null || System.currentTimeMillis() > expirationTime - 5 * 60 * 1000) { synchronized (this) { @@ -84,21 +100,33 @@ public String getSASToken(String account, String fileSystem, String path, String } private void refresh() { - // The format of filesetIdentifier is "metalake.catalog.fileset.schema" String[] idents = filesetIdentifier.split("\\."); String catalog = idents[1]; FilesetCatalog filesetCatalog = client.loadCatalog(catalog).asFilesetCatalog(); - - @SuppressWarnings("unused") Fileset fileset = filesetCatalog.loadFileset(NameIdentifier.of(idents[2], idents[3])); - // Should mock - // Credential credentials = fileset.supportsCredentials().getCredential("s3-token"); - Credential credential = new ADLSTokenCredential("xxx", "xxx", 1L); + Credential[] credentials = fileset.supportsCredentials().getCredentials(); + if (credentials.length == 0) { + LOGGER.warn("No credentials found for fileset {}", filesetIdentifier); + return; + } + + // Use the first one. 
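Note: the code below takes credentials[0] unconditionally, which assumes the server returns at most one usable credential per fileset. If a fileset can expose both a SAS token and an account key, a type-aware pick would be safer; a sketch using the same credential API as this patch:

    private static Credential pickPreferred(Credential[] credentials) {
      for (Credential c : credentials) {
        if (ADLSTokenCredential.ADLS_SAS_TOKEN_CREDENTIAL_TYPE.equals(
            c.toProperties().get(Credential.CREDENTIAL_TYPE))) {
          return c; // prefer the short-lived SAS token over a static account key
        }
      }
      return credentials[0];
    }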
+ Credential credential = credentials[0]; + Map credentialMap = credential.toProperties(); + + if (ADLSTokenCredential.ADLS_SAS_TOKEN_CREDENTIAL_TYPE.equals( + credentialMap.get(Credential.CREDENTIAL_TYPE))) { + this.sasToken = credentialMap.get(GRAVITINO_ADLS_SAS_TOKEN); + } else { + this.azureStorageAccountName = credentialMap.get(GRAVITINO_AZURE_STORAGE_ACCOUNT_NAME); + this.azureStorageAccountKey = credentialMap.get(GRAVITINO_AZURE_STORAGE_ACCOUNT_KEY); + } - Map credentialMap = credential.credentialInfo(); - this.sasToken = credentialMap.get(GRAVITINO_ADLS_SAS_TOKEN); this.expirationTime = credential.expireTimeInMs(); + if (expirationTime <= 0) { + expirationTime = Long.MAX_VALUE; + } } } diff --git a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java index 4984191bd2c..a9a4c8ac621 100644 --- a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java +++ b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java @@ -18,6 +18,8 @@ */ package org.apache.gravitino.filesystem.hadoop; +import static org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystemConfiguration.GVFS_FILESET_IDENTIFIER; + import com.github.benmanes.caffeine.cache.Cache; import com.github.benmanes.caffeine.cache.Caffeine; import com.github.benmanes.caffeine.cache.Scheduler; @@ -45,7 +47,6 @@ import org.apache.gravitino.audit.FilesetAuditConstants; import org.apache.gravitino.audit.FilesetDataOperation; import org.apache.gravitino.audit.InternalClientType; -import org.apache.gravitino.catalog.hadoop.common.Properties; import org.apache.gravitino.catalog.hadoop.fs.FileSystemProvider; import org.apache.gravitino.client.DefaultOAuth2TokenProvider; import org.apache.gravitino.client.GravitinoClient; @@ -132,7 +133,7 @@ public void initialize(URI name, Configuration configuration) throws IOException "'%s' is not set in the configuration", GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_CLIENT_METALAKE_KEY); - initializeClient(configuration); + this.client = initializeClient(configuration); // Register the default local and HDFS FileSystemProvider fileSystemProvidersMap.putAll(getFileSystemProviders()); @@ -193,10 +194,12 @@ private ThreadFactory newDaemonThreadFactory(String name) { return new ThreadFactoryBuilder().setDaemon(true).setNameFormat(name + "-%d").build(); } - private void initializeClient(Configuration configuration) { + public GravitinoClient initializeClient(Configuration configuration) { // initialize the Gravitino client String serverUri = configuration.get(GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_SERVER_URI_KEY); + String metalakeValue = + configuration.get(GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_CLIENT_METALAKE_KEY); Preconditions.checkArgument( StringUtils.isNotBlank(serverUri), "'%s' is not set in the configuration", @@ -207,8 +210,10 @@ private void initializeClient(Configuration configuration) { GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_CLIENT_AUTH_TYPE_KEY, GravitinoVirtualFileSystemConfiguration.SIMPLE_AUTH_TYPE); if (authType.equalsIgnoreCase(GravitinoVirtualFileSystemConfiguration.SIMPLE_AUTH_TYPE)) { - this.client = - GravitinoClient.builder(serverUri).withMetalake(metalakeName).withSimpleAuth().build(); + return GravitinoClient.builder(serverUri) + 
.withMetalake(metalakeValue) + .withSimpleAuth() + .build(); } else if (authType.equalsIgnoreCase( GravitinoVirtualFileSystemConfiguration.OAUTH2_AUTH_TYPE)) { String authServerUri = @@ -251,11 +256,10 @@ private void initializeClient(Configuration configuration) { .withScope(scope) .build(); - this.client = - GravitinoClient.builder(serverUri) - .withMetalake(metalakeName) - .withOAuth(authDataProvider) - .build(); + return GravitinoClient.builder(serverUri) + .withMetalake(metalakeValue) + .withOAuth(authDataProvider) + .build(); } else if (authType.equalsIgnoreCase( GravitinoVirtualFileSystemConfiguration.KERBEROS_AUTH_TYPE)) { String principal = @@ -281,11 +285,11 @@ private void initializeClient(Configuration configuration) { // Using ticket cache to create auth provider authDataProvider = KerberosTokenProvider.builder().withClientPrincipal(principal).build(); } - this.client = - GravitinoClient.builder(serverUri) - .withMetalake(metalakeName) - .withKerberosAuth(authDataProvider) - .build(); + + return GravitinoClient.builder(serverUri) + .withMetalake(metalakeValue) + .withKerberosAuth(authDataProvider) + .build(); } else { throw new IllegalArgumentException( String.format( @@ -394,11 +398,8 @@ private FilesetContextPair getFilesetContext(Path virtualPath, FilesetDataOperat } Map maps = getConfigMap(getConf()); - if (maps.containsKey(Properties.USE_GRAVITINO_CLOUD_STORE_CREDENTIAL) - && maps.get(Properties.USE_GRAVITINO_CLOUD_STORE_CREDENTIAL).equals("true")) { - // If enable the cloud store credential, we should pass the configuration here. - maps.put("gravitino.fileset.identifier", identifier.toString()); - } + // Always pass the fileset identifier down so that cloud store credential providers can locate the fileset. + maps.put(GVFS_FILESET_IDENTIFIER, identifier.toString()); return provider.getFileSystem(filePath, maps); } catch (IOException ioe) { diff --git a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystemConfiguration.java b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystemConfiguration.java index e2bce734531..ef2b8de852a 100644 --- a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystemConfiguration.java +++ b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystemConfiguration.java @@ -98,5 +98,7 @@ public class GravitinoVirtualFileSystemConfiguration { public static final long FS_GRAVITINO_FILESET_CACHE_EVICTION_MILLS_AFTER_ACCESS_DEFAULT = 1000L * 60 * 60; + public static final String GVFS_FILESET_IDENTIFIER = "fs.gvfs.fileset.identifier"; + private GravitinoVirtualFileSystemConfiguration() {} } diff --git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemABSCredentialIT.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemABSCredentialIT.java new file mode 100644 index 00000000000..02f9499a34c --- /dev/null +++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemABSCredentialIT.java @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.gravitino.filesystem.hadoop.integration.test; + +import static org.apache.gravitino.catalog.hadoop.HadoopCatalogPropertiesMetadata.FILESYSTEM_PROVIDERS; + +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Maps; +import java.io.IOException; +import java.util.Collections; +import java.util.Map; +import org.apache.gravitino.Catalog; +import org.apache.gravitino.abs.fs.AzureFileSystemProvider; +import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; +import org.apache.gravitino.credential.CredentialConstants; +import org.apache.gravitino.integration.test.util.GravitinoITUtils; +import org.apache.gravitino.storage.AzureProperties; +import org.apache.hadoop.conf.Configuration; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.condition.EnabledIf; +import org.junit.platform.commons.util.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +@EnabledIf("absIsConfigured") +public class GravitinoVirtualFileSystemABSCredentialIT extends GravitinoVirtualFileSystemIT { + private static final Logger LOG = + LoggerFactory.getLogger(GravitinoVirtualFileSystemABSCredentialIT.class); + + public static final String ABS_ACCOUNT_NAME = System.getenv("ABS_STS_ACCOUNT_NAME"); + public static final String ABS_ACCOUNT_KEY = System.getenv("ABS_STS_ACCOUNT_KEY"); + public static final String ABS_CONTAINER_NAME = System.getenv("ABS_STS_CONTAINER_NAME"); + public static final String ABS_TENANT_ID = System.getenv("ABS_STS_TENANT_ID"); + public static final String ABS_CLIENT_ID = System.getenv("ABS_STS_CLIENT_ID"); + public static final String ABS_CLIENT_SECRET = System.getenv("ABS_STS_CLIENT_SECRET"); + + @BeforeAll + public void startIntegrationTest() { + // Do nothing + } + + @BeforeAll + public void startUp() throws Exception { + // Copy the Azure jars to the gravitino server if in deploy mode. + copyBundleJarsToHadoop("azure-bundle"); + // Need to download jars to gravitino server + super.startIntegrationTest(); + + // This value can be tuned by the user; please change it accordingly. + defaultBockSize = 32 * 1024 * 1024; + + // This value is 1 for ABS, 3 for GCS, and 1 for S3A. 
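+ // Object stores report a fixed nominal replication factor rather than a real copy count.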
+ defaultReplication = 1; + + metalakeName = GravitinoITUtils.genRandomName("gvfs_it_metalake"); + catalogName = GravitinoITUtils.genRandomName("catalog"); + schemaName = GravitinoITUtils.genRandomName("schema"); + + Assertions.assertFalse(client.metalakeExists(metalakeName)); + metalake = client.createMetalake(metalakeName, "metalake comment", Collections.emptyMap()); + Assertions.assertTrue(client.metalakeExists(metalakeName)); + + Map<String, String> properties = Maps.newHashMap(); + + properties.put(AzureProperties.GRAVITINO_AZURE_STORAGE_ACCOUNT_NAME, ABS_ACCOUNT_NAME); + properties.put(AzureProperties.GRAVITINO_AZURE_STORAGE_ACCOUNT_KEY, ABS_ACCOUNT_KEY); + properties.put(AzureProperties.GRAVITINO_AZURE_CLIENT_ID, ABS_CLIENT_ID); + properties.put(AzureProperties.GRAVITINO_AZURE_CLIENT_SECRET, ABS_CLIENT_SECRET); + properties.put(AzureProperties.GRAVITINO_AZURE_TENANT_ID, ABS_TENANT_ID); + properties.put(CredentialConstants.CREDENTIAL_PROVIDERS, "adls-token"); + + properties.put(FILESYSTEM_PROVIDERS, AzureFileSystemProvider.ABS_PROVIDER_NAME); + + Catalog catalog = + metalake.createCatalog( + catalogName, Catalog.Type.FILESET, "hadoop", "catalog comment", properties); + Assertions.assertTrue(metalake.catalogExists(catalogName)); + + catalog.asSchemas().createSchema(schemaName, "schema comment", properties); + Assertions.assertTrue(catalog.asSchemas().schemaExists(schemaName)); + + conf.set("fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem"); + conf.set("fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs"); + conf.set("fs.gvfs.impl.disable.cache", "true"); + conf.set("fs.gravitino.server.uri", serverUri); + conf.set("fs.gravitino.client.metalake", metalakeName); + + // Pass this configuration to the real file system + conf.set(AzureProperties.GRAVITINO_AZURE_STORAGE_ACCOUNT_NAME, ABS_ACCOUNT_NAME); + conf.set(AzureProperties.GRAVITINO_AZURE_STORAGE_ACCOUNT_KEY, ABS_ACCOUNT_KEY); + conf.set("fs.abfss.impl", "org.apache.hadoop.fs.azurebfs.SecureAzureBlobFileSystem"); + + conf.set("fs.gravitino.client.useCloudStoreCredential", "true"); + } + + @AfterAll + public void tearDown() throws IOException { + Catalog catalog = metalake.loadCatalog(catalogName); + catalog.asSchemas().dropSchema(schemaName, true); + metalake.dropCatalog(catalogName, true); + client.dropMetalake(metalakeName, true); + + if (client != null) { + client.close(); + client = null; + } + + try { + closer.close(); + } catch (Exception e) { + LOG.error("Exception in closing CloseableGroup", e); + } + } + + /** + * Remove the `gravitino.bypass` prefix from the configuration and pass the result to the real file system. + * This method corresponds to the method org.apache.gravitino.filesystem.hadoop + * .GravitinoVirtualFileSystem#getConfigMap(Configuration) in the original code. 
+ */ + protected Configuration convertGvfsConfigToRealFileSystemConfig(Configuration gvfsConf) { + Configuration absConf = new Configuration(); + Map map = Maps.newHashMap(); + + gvfsConf.forEach(entry -> map.put(entry.getKey(), entry.getValue())); + + Map hadoopConfMap = FileSystemUtils.toHadoopConfigMap(map, ImmutableMap.of()); + + if (gvfsConf.get(AzureProperties.GRAVITINO_AZURE_STORAGE_ACCOUNT_NAME) != null + && gvfsConf.get(AzureProperties.GRAVITINO_AZURE_STORAGE_ACCOUNT_KEY) != null) { + hadoopConfMap.put( + String.format( + "fs.azure.account.key.%s.dfs.core.windows.net", + gvfsConf.get(AzureProperties.GRAVITINO_AZURE_STORAGE_ACCOUNT_NAME)), + gvfsConf.get(AzureProperties.GRAVITINO_AZURE_STORAGE_ACCOUNT_KEY)); + } + + hadoopConfMap.forEach(absConf::set); + + return absConf; + } + + protected String genStorageLocation(String fileset) { + return String.format( + "%s://%s@%s.dfs.core.windows.net/%s", + AzureFileSystemProvider.ABS_PROVIDER_SCHEME, ABS_CONTAINER_NAME, ABS_ACCOUNT_NAME, fileset); + } + + @Disabled("java.lang.UnsupportedOperationException: Append Support not enabled") + public void testAppend() throws IOException {} + + private static boolean absIsConfigured() { + return StringUtils.isNotBlank(System.getenv("ABS_STS_ACCOUNT_NAME")) + && StringUtils.isNotBlank(System.getenv("ABS_STS_ACCOUNT_KEY")) + && StringUtils.isNotBlank(System.getenv("ABS_STS_CONTAINER_NAME")); + } +} diff --git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemIT.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemIT.java index b971ab918d2..a77081adfa0 100644 --- a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemIT.java +++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemIT.java @@ -241,7 +241,7 @@ public void testDelete() throws IOException { String fileName = "test.txt"; Path deletePath = new Path(gvfsPath + "/" + fileName); try (FileSystem gvfs = gvfsPath.getFileSystem(conf)) { - Assertions.assertTrue(gvfs.exists(gvfsPath)); + // Assertions.assertTrue(gvfs.exists(gvfsPath)); gvfs.create(deletePath).close(); Assertions.assertTrue(gvfs.exists(deletePath)); Assertions.assertTrue(gvfs.getFileStatus(deletePath).isFile()); diff --git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemOSSCredentialIT.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemOSSCredentialIT.java new file mode 100644 index 00000000000..b5e1a5418a3 --- /dev/null +++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemOSSCredentialIT.java @@ -0,0 +1,166 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.gravitino.filesystem.hadoop.integration.test; + +import static org.apache.gravitino.catalog.hadoop.HadoopCatalogPropertiesMetadata.FILESYSTEM_PROVIDERS; + +import com.google.common.collect.Maps; +import java.io.IOException; +import java.util.Collections; +import java.util.Map; +import org.apache.gravitino.Catalog; +import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; +import org.apache.gravitino.credential.CredentialConstants; +import org.apache.gravitino.credential.OSSTokenCredential; +import org.apache.gravitino.integration.test.util.GravitinoITUtils; +import org.apache.gravitino.oss.fs.OSSFileSystemProvider; +import org.apache.gravitino.storage.OSSProperties; +import org.apache.hadoop.conf.Configuration; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.condition.EnabledIf; +import org.junit.platform.commons.util.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +@EnabledIf(value = "ossIsConfigured", disabledReason = "OSS is not prepared") +public class GravitinoVirtualFileSystemOSSCredentialIT extends GravitinoVirtualFileSystemIT { + private static final Logger LOG = + LoggerFactory.getLogger(GravitinoVirtualFileSystemOSSCredentialIT.class); + + public static final String BUCKET_NAME = System.getenv("OSS_STS_BUCKET_NAME"); + public static final String OSS_ACCESS_KEY = System.getenv("OSS_STS_ACCESS_KEY_ID"); + public static final String OSS_SECRET_KEY = System.getenv("OSS_STS_SECRET_ACCESS_KEY"); + public static final String OSS_ENDPOINT = System.getenv("OSS_STS_ENDPOINT"); + public static final String OSS_REGION = System.getenv("OSS_STS_REGION"); + public static final String OSS_ROLE_ARN = System.getenv("OSS_STS_ROLE_ARN"); + + @BeforeAll + public void startIntegrationTest() { + // Do nothing + } + + @BeforeAll + public void startUp() throws Exception { + copyBundleJarsToHadoop("aliyun-bundle"); + // Need to download jars to gravitino server + super.startIntegrationTest(); + + // This value can be tuned by the user; please change it accordingly. + defaultBockSize = 64 * 1024 * 1024; + + // The default replication factor is 1. 
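+ // AliyunOSSFileSystem reports this nominal value regardless of the bucket's actual durability settings.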
+ defaultReplication = 1; + + metalakeName = GravitinoITUtils.genRandomName("gvfs_it_metalake"); + catalogName = GravitinoITUtils.genRandomName("catalog"); + schemaName = GravitinoITUtils.genRandomName("schema"); + + Assertions.assertFalse(client.metalakeExists(metalakeName)); + metalake = client.createMetalake(metalakeName, "metalake comment", Collections.emptyMap()); + Assertions.assertTrue(client.metalakeExists(metalakeName)); + + Map<String, String> properties = Maps.newHashMap(); + properties.put(FILESYSTEM_PROVIDERS, "oss"); + properties.put(OSSProperties.GRAVITINO_OSS_ACCESS_KEY_ID, OSS_ACCESS_KEY); + properties.put(OSSProperties.GRAVITINO_OSS_ACCESS_KEY_SECRET, OSS_SECRET_KEY); + properties.put(OSSProperties.GRAVITINO_OSS_ENDPOINT, OSS_ENDPOINT); + properties.put(OSSProperties.GRAVITINO_OSS_REGION, OSS_REGION); + properties.put(OSSProperties.GRAVITINO_OSS_ROLE_ARN, OSS_ROLE_ARN); + properties.put( + CredentialConstants.CREDENTIAL_PROVIDERS, OSSTokenCredential.OSS_TOKEN_CREDENTIAL_TYPE); + + Catalog catalog = + metalake.createCatalog( + catalogName, Catalog.Type.FILESET, "hadoop", "catalog comment", properties); + Assertions.assertTrue(metalake.catalogExists(catalogName)); + + catalog.asSchemas().createSchema(schemaName, "schema comment", properties); + Assertions.assertTrue(catalog.asSchemas().schemaExists(schemaName)); + + conf.set("fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem"); + conf.set("fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs"); + conf.set("fs.gvfs.impl.disable.cache", "true"); + conf.set("fs.gravitino.server.uri", serverUri); + conf.set("fs.gravitino.client.metalake", metalakeName); + + // Pass this configuration to the real file system + conf.set(OSSProperties.GRAVITINO_OSS_ACCESS_KEY_ID, OSS_ACCESS_KEY); + conf.set(OSSProperties.GRAVITINO_OSS_ACCESS_KEY_SECRET, OSS_SECRET_KEY); + conf.set(OSSProperties.GRAVITINO_OSS_ENDPOINT, OSS_ENDPOINT); + conf.set("fs.oss.impl", "org.apache.hadoop.fs.aliyun.oss.AliyunOSSFileSystem"); + } + + @AfterAll + public void tearDown() throws IOException { + Catalog catalog = metalake.loadCatalog(catalogName); + catalog.asSchemas().dropSchema(schemaName, true); + metalake.dropCatalog(catalogName, true); + client.dropMetalake(metalakeName, true); + + if (client != null) { + client.close(); + client = null; + } + + try { + closer.close(); + } catch (Exception e) { + LOG.error("Exception in closing CloseableGroup", e); + } + } + + /** + * Remove the `gravitino.bypass` prefix from the configuration and pass the result to the real file system. + * This method corresponds to the method org.apache.gravitino.filesystem.hadoop + * .GravitinoVirtualFileSystem#getConfigMap(Configuration) in the original code. 
+ */ + protected Configuration convertGvfsConfigToRealFileSystemConfig(Configuration gvfsConf) { + Configuration ossConf = new Configuration(); + Map map = Maps.newHashMap(); + + gvfsConf.forEach(entry -> map.put(entry.getKey(), entry.getValue())); + + Map hadoopConfMap = + FileSystemUtils.toHadoopConfigMap( + map, OSSFileSystemProvider.GRAVITINO_KEY_TO_OSS_HADOOP_KEY); + + hadoopConfMap.forEach(ossConf::set); + + return ossConf; + } + + protected String genStorageLocation(String fileset) { + return String.format("oss://%s/%s", BUCKET_NAME, fileset); + } + + @Disabled( + "OSS does not support append, java.io.IOException: The append operation is not supported") + public void testAppend() throws IOException {} + + protected static boolean ossIsConfigured() { + return StringUtils.isNotBlank(System.getenv("OSS_STS_ACCESS_KEY_ID")) + && StringUtils.isNotBlank(System.getenv("OSS_STS_SECRET_ACCESS_KEY")) + && StringUtils.isNotBlank(System.getenv("OSS_STS_ENDPOINT")) + && StringUtils.isNotBlank(System.getenv("OSS_STS_BUCKET_NAME")); + } +} diff --git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemRealS3IT.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemRealS3IT.java index 3c39a172bc7..e5d775b6d7d 100644 --- a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemRealS3IT.java +++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemRealS3IT.java @@ -27,6 +27,8 @@ import java.util.Map; import org.apache.gravitino.Catalog; import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; +import org.apache.gravitino.credential.CredentialConstants; +import org.apache.gravitino.credential.S3TokenCredential; import org.apache.gravitino.integration.test.util.GravitinoITUtils; import org.apache.gravitino.s3.fs.S3FileSystemProvider; import org.apache.gravitino.storage.S3Properties; @@ -42,10 +44,11 @@ public class GravitinoVirtualFileSystemRealS3IT extends GravitinoVirtualFileSyst private static final Logger LOG = LoggerFactory.getLogger(GravitinoVirtualFileSystemRealS3IT.class); - public static final String BUCKET_NAME = System.getenv("S3_BUCKET_NAME"); - public static final String S3_ACCESS_KEY = System.getenv("S3_ACCESS_KEY_ID"); - public static final String S3_SECRET_KEY = System.getenv("S3_SECRET_ACCESS_KEY"); - public static final String S3_ENDPOINT = System.getenv("S3_ENDPOINT"); + public static final String BUCKET_NAME = System.getenv("S3_STS_BUCKET_NAME"); + public static final String S3_ACCESS_KEY = System.getenv("S3_STS_ACCESS_KEY_ID"); + public static final String S3_SECRET_KEY = System.getenv("S3_STS_SECRET_ACCESS_KEY"); + public static final String S3_REGION = System.getenv("S3_STS_REGION"); + public static final String S3_ROLE_ARN = System.getenv("S3_STS_ROLE_ARN"); @BeforeAll public void startIntegrationTest() { @@ -74,14 +77,18 @@ public void startUp() throws Exception { Assertions.assertTrue(client.metalakeExists(metalakeName)); Map properties = Maps.newHashMap(); - properties.put("gravitino.bypass.fs.s3a.access.key", S3_ACCESS_KEY); - properties.put("gravitino.bypass.fs.s3a.secret.key", S3_SECRET_KEY); - properties.put("gravitino.bypass.fs.s3a.endpoint", S3_ENDPOINT); + properties.put(S3Properties.GRAVITINO_S3_ACCESS_KEY_ID, S3_ACCESS_KEY); + 
properties.put(S3Properties.GRAVITINO_S3_SECRET_ACCESS_KEY, S3_SECRET_KEY); properties.put( "gravitino.bypass.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider"); properties.put(FILESYSTEM_PROVIDERS, "s3"); + properties.put(S3Properties.GRAVITINO_S3_REGION, S3_REGION); + properties.put(S3Properties.GRAVITINO_S3_ROLE_ARN, S3_ROLE_ARN); + properties.put( + CredentialConstants.CREDENTIAL_PROVIDERS, S3TokenCredential.S3_TOKEN_CREDENTIAL_TYPE); + Catalog catalog = metalake.createCatalog( catalogName, Catalog.Type.FILESET, "hadoop", "catalog comment", properties); @@ -99,9 +106,8 @@ public void startUp() throws Exception { // Pass this configuration to the real file system conf.set(S3Properties.GRAVITINO_S3_SECRET_ACCESS_KEY, S3_SECRET_KEY); conf.set(S3Properties.GRAVITINO_S3_ACCESS_KEY_ID, S3_ACCESS_KEY); - conf.set(S3Properties.GRAVITINO_S3_ENDPOINT, S3_ENDPOINT); - - conf.set("fs.gravitino.client.useCloudStoreCredential", "true"); + conf.set(S3Properties.GRAVITINO_S3_REGION, S3_REGION); + conf.set(S3Properties.GRAVITINO_S3_ROLE_ARN, S3_ROLE_ARN); } @AfterAll diff --git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemS3IT.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemS3IT.java index f45e4d3b6b1..4bb6ad38dcd 100644 --- a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemS3IT.java +++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemS3IT.java @@ -156,8 +156,6 @@ public void startUp() throws Exception { conf.set(S3Properties.GRAVITINO_S3_SECRET_ACCESS_KEY, accessKey); conf.set(S3Properties.GRAVITINO_S3_ACCESS_KEY_ID, secretKey); conf.set(S3Properties.GRAVITINO_S3_ENDPOINT, s3Endpoint); - - conf.set("fs.gravitino.client.useCloudStoreCredential", "true"); } @AfterAll From 0c61a4857cc8aec67c2ea43ecf1fc08cb1e48035 Mon Sep 17 00:00:00 2001 From: yuqi Date: Fri, 27 Dec 2024 18:32:29 +0800 Subject: [PATCH 03/59] fix --- .../oss/fs/OSSSessionCredentialProvider.java | 6 +- .../s3/fs/S3SessionCredentialProvider.java | 4 + bundles/gcp-bundle/build.gradle.kts | 1 + .../gcs/fs/GCSCredentialProvider.java | 107 +++++++++++++ .../gcs/fs/GCSFileSystemProvider.java | 15 +- ...itinoVirtualFileSystemGCSCredentialIT.java | 151 ++++++++++++++++++ ...itinoVirtualFileSystemS3CredentialIT.java} | 4 +- 7 files changed, 283 insertions(+), 5 deletions(-) create mode 100644 bundles/gcp-bundle/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialProvider.java create mode 100644 clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemGCSCredentialIT.java rename clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/{GravitinoVirtualFileSystemRealS3IT.java => GravitinoVirtualFileSystemS3CredentialIT.java} (97%) diff --git a/bundles/aliyun-bundle/src/main/java/org/apache/gravitino/oss/fs/OSSSessionCredentialProvider.java b/bundles/aliyun-bundle/src/main/java/org/apache/gravitino/oss/fs/OSSSessionCredentialProvider.java index 3a67e6f48f1..e2225be27fb 100644 --- a/bundles/aliyun-bundle/src/main/java/org/apache/gravitino/oss/fs/OSSSessionCredentialProvider.java +++ b/bundles/aliyun-bundle/src/main/java/org/apache/gravitino/oss/fs/OSSSessionCredentialProvider.java @@ -39,9 +39,12 
@@ import org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystemConfiguration; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.aliyun.oss.Constants; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class OSSSessionCredentialProvider implements CredentialsProvider { + private static final Logger LOGGER = LoggerFactory.getLogger(OSSSessionCredentialProvider.class); private Credentials basicCredentials; private final String filesetIdentifier; private long expirationTime; @@ -79,10 +82,9 @@ private void refresh() { FilesetCatalog filesetCatalog = client.loadCatalog(catalog).asFilesetCatalog(); Fileset fileset = filesetCatalog.loadFileset(NameIdentifier.of(idents[2], idents[3])); - // Use dynamic credential by default. - Credential[] credentials = fileset.supportsCredentials().getCredentials(); if (credentials.length == 0) { + LOGGER.warn("No credential found for fileset: {}, falling back to the static AK/SK", filesetIdentifier); expirationTime = Long.MAX_VALUE; this.basicCredentials = new DefaultCredentials( diff --git a/bundles/aws-bundle/src/main/java/org/apache/gravitino/s3/fs/S3SessionCredentialProvider.java b/bundles/aws-bundle/src/main/java/org/apache/gravitino/s3/fs/S3SessionCredentialProvider.java index 1a0e3a9c444..49c3c09b144 100644 --- a/bundles/aws-bundle/src/main/java/org/apache/gravitino/s3/fs/S3SessionCredentialProvider.java +++ b/bundles/aws-bundle/src/main/java/org/apache/gravitino/s3/fs/S3SessionCredentialProvider.java @@ -39,9 +39,12 @@ import org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystemConfiguration; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.s3a.Constants; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class S3SessionCredentialProvider implements AWSCredentialsProvider { + private static final Logger LOGGER = LoggerFactory.getLogger(S3SessionCredentialProvider.class); private final GravitinoClient client; private final String filesetIdentifier; private final Configuration configuration; @@ -85,6 +88,7 @@ public void refresh() { // Can't find any credential, use the default one. if (credentials.length == 0) { + LOGGER.warn("No credential found for fileset: {}, falling back to the static AK/SK", filesetIdentifier); expirationTime = Long.MAX_VALUE; this.basicSessionCredentials = new BasicAWSCredentials( diff --git a/bundles/gcp-bundle/build.gradle.kts b/bundles/gcp-bundle/build.gradle.kts index bae7411c75e..05c0c8d9146 100644 --- a/bundles/gcp-bundle/build.gradle.kts +++ b/bundles/gcp-bundle/build.gradle.kts @@ -44,6 +44,7 @@ dependencies { } implementation(libs.google.auth.http) implementation(libs.google.auth.credentials) + implementation(project(":clients:filesystem-hadoop3-runtime", configuration = "shadow")) } tasks.withType(ShadowJar::class.java) { diff --git a/bundles/gcp-bundle/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialProvider.java b/bundles/gcp-bundle/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialProvider.java new file mode 100644 index 00000000000..dfb2207b181 --- /dev/null +++ b/bundles/gcp-bundle/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialProvider.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.gravitino.gcs.fs; + +import com.google.cloud.hadoop.util.AccessTokenProvider; +import java.io.IOException; +import java.util.Map; +import org.apache.gravitino.NameIdentifier; +import org.apache.gravitino.client.GravitinoClient; +import org.apache.gravitino.credential.Credential; +import org.apache.gravitino.credential.GCSTokenCredential; +import org.apache.gravitino.file.Fileset; +import org.apache.gravitino.file.FilesetCatalog; +import org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem; +import org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystemConfiguration; +import org.apache.hadoop.conf.Configuration; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class GCSCredentialProvider implements AccessTokenProvider { + private static final Logger LOGGER = LoggerFactory.getLogger(GCSCredentialProvider.class); + private Configuration configuration; + private GravitinoClient client; + private String filesetIdentifier; + + private AccessToken accessToken; + private long expirationTime; + + @Override + public AccessToken getAccessToken() { + if (accessToken == null || expirationTime < System.currentTimeMillis() + 5 * 60 * 1000) { + try { + refresh(); + } catch (IOException e) { + LOGGER.error("Failed to refresh the access token", e); + } + } + return accessToken; + } + + @Override + public void refresh() throws IOException { + // Refresh credentials if they are null or about to expire in 5 minutes + // The format of filesetIdentifier is "metalake.catalog.fileset.schema" + String[] idents = filesetIdentifier.split("\\."); + String catalog = idents[1]; + + FilesetCatalog filesetCatalog = client.loadCatalog(catalog).asFilesetCatalog(); + + Fileset fileset = filesetCatalog.loadFileset(NameIdentifier.of(idents[2], idents[3])); + Credential[] credentials = fileset.supportsCredentials().getCredentials(); + + // Can't find any credential, use the default one. 
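+ // If the server returns no credentials, accessToken stays null; GCSFileSystemProvider then skips installing this provider and the connector falls back to the configured service account JSON.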
+ if (credentials.length == 0) { + LOGGER.warn( + "No credential found for fileset: {}, falling back to the static service account JSON file", filesetIdentifier); + return; + } + + Credential credential = credentials[0]; + Map<String, String> credentialMap = credential.toProperties(); + + // Determine the expiration first so that the AccessToken below carries the fresh value rather than a stale one. + this.expirationTime = credential.expireTimeInMs(); + if (expirationTime <= 0) { + expirationTime = Long.MAX_VALUE; + } + + if (GCSTokenCredential.GCS_TOKEN_CREDENTIAL_TYPE.equals( + credentialMap.get(Credential.CREDENTIAL_TYPE))) { + String sessionToken = credentialMap.get(GCSTokenCredential.GCS_TOKEN_NAME); + accessToken = new AccessToken(sessionToken, expirationTime); + } + } + + @Override + public void setConf(Configuration configuration) { + this.configuration = configuration; + + this.filesetIdentifier = + configuration.get(GravitinoVirtualFileSystemConfiguration.GVFS_FILESET_IDENTIFIER); + // Extract the configuration values and initialize the Gravitino client here. + GravitinoVirtualFileSystem gravitinoVirtualFileSystem = new GravitinoVirtualFileSystem(); + this.client = gravitinoVirtualFileSystem.initializeClient(configuration); + } + + @Override + public Configuration getConf() { + return this.configuration; + } +} diff --git a/bundles/gcp-bundle/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java b/bundles/gcp-bundle/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java index a07ff3d6ece..fa26015971f 100644 --- a/bundles/gcp-bundle/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java +++ b/bundles/gcp-bundle/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java @@ -19,12 +19,14 @@ package org.apache.gravitino.gcs.fs; import com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem; +import com.google.cloud.hadoop.util.AccessTokenProvider; import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableMap; import java.io.IOException; import java.util.Map; import org.apache.gravitino.catalog.hadoop.fs.FileSystemProvider; import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; +import org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystemConfiguration; import org.apache.gravitino.storage.GCSProperties; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; @@ -46,7 +48,18 @@ public FileSystem getFileSystem(Path path, Map config) throws IO Configuration configuration = new Configuration(); FileSystemUtils.toHadoopConfigMap(config, GRAVITINO_KEY_TO_GCS_HADOOP_KEY) .forEach(configuration::set); - LOGGER.info("Creating GCS file system with config: {}", config); + + if (config.containsKey(GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_SERVER_URI_KEY)) { + AccessTokenProvider accessTokenProvider = new GCSCredentialProvider(); + accessTokenProvider.setConf(configuration); + // This check is necessary: if Gravitino fails to return any credentials, we fall back to + // the default behavior of the GoogleHadoopFileSystem, which uses service account credentials. 
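+ // Calling getAccessToken() here forces an immediate refresh against the Gravitino server, so a missing credential is detected before the file system is created.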
+ if (accessTokenProvider.getAccessToken() != null) { + LOGGER.info("Creating GCS file system with credential provider: {}", config); + configuration.set( + "fs.gs.auth.access.token.provider.impl", GCSCredentialProvider.class.getName()); + } + } return GoogleHadoopFileSystem.newInstance(path.toUri(), configuration); } diff --git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemGCSCredentialIT.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemGCSCredentialIT.java new file mode 100644 index 00000000000..c2460b75f7b --- /dev/null +++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemGCSCredentialIT.java @@ -0,0 +1,151 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.gravitino.filesystem.hadoop.integration.test; + +import static org.apache.gravitino.catalog.hadoop.HadoopCatalogPropertiesMetadata.FILESYSTEM_PROVIDERS; + +import com.google.common.collect.Maps; +import java.io.IOException; +import java.util.Collections; +import java.util.Map; +import org.apache.commons.lang3.StringUtils; +import org.apache.gravitino.Catalog; +import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; +import org.apache.gravitino.credential.CredentialConstants; +import org.apache.gravitino.gcs.fs.GCSFileSystemProvider; +import org.apache.gravitino.integration.test.util.GravitinoITUtils; +import org.apache.gravitino.storage.GCSProperties; +import org.apache.hadoop.conf.Configuration; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.condition.EnabledIf; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +@EnabledIf(value = "isGCPConfigured", disabledReason = "GCP is not configured") +public class GravitinoVirtualFileSystemGCSCredentialIT extends GravitinoVirtualFileSystemIT { + private static final Logger LOG = + LoggerFactory.getLogger(GravitinoVirtualFileSystemGCSCredentialIT.class); + + public static final String BUCKET_NAME = System.getenv("GCS_STS_BUCKET_NAME"); + public static final String SERVICE_ACCOUNT_FILE = + System.getenv("GCS_STS_SERVICE_ACCOUNT_JSON_PATH"); + + @BeforeAll + public void startIntegrationTest() { + // Do nothing + } + + @BeforeAll + public void startUp() throws Exception { + // Copy the GCP jars to the gravitino server if in deploy mode. 
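+ // The bundle jar has to be on the Gravitino server classpath before the Hadoop catalog creates the GCS file system.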
+ copyBundleJarsToHadoop("gcp-bundle"); + // Need to download jars to gravitino server + super.startIntegrationTest(); + + // This value can be by tune by the user, please change it accordingly. + defaultBockSize = 64 * 1024 * 1024; + + metalakeName = GravitinoITUtils.genRandomName("gvfs_it_metalake"); + catalogName = GravitinoITUtils.genRandomName("catalog"); + schemaName = GravitinoITUtils.genRandomName("schema"); + + Assertions.assertFalse(client.metalakeExists(metalakeName)); + metalake = client.createMetalake(metalakeName, "metalake comment", Collections.emptyMap()); + Assertions.assertTrue(client.metalakeExists(metalakeName)); + + Map properties = Maps.newHashMap(); + properties.put(FILESYSTEM_PROVIDERS, "gcs"); + properties.put(GCSProperties.GCS_SERVICE_ACCOUNT_JSON_PATH, SERVICE_ACCOUNT_FILE); + properties.put("gcs-credential-file-path", SERVICE_ACCOUNT_FILE); + properties.put(CredentialConstants.CREDENTIAL_PROVIDERS, "gcs-token"); + + Catalog catalog = + metalake.createCatalog( + catalogName, Catalog.Type.FILESET, "hadoop", "catalog comment", properties); + Assertions.assertTrue(metalake.catalogExists(catalogName)); + + catalog.asSchemas().createSchema(schemaName, "schema comment", properties); + Assertions.assertTrue(catalog.asSchemas().schemaExists(schemaName)); + + conf.set("fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem"); + conf.set("fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs"); + conf.set("fs.gvfs.impl.disable.cache", "true"); + conf.set("fs.gravitino.server.uri", serverUri); + conf.set("fs.gravitino.client.metalake", metalakeName); + + // Pass this configuration to the real file system + conf.set(GCSProperties.GCS_SERVICE_ACCOUNT_JSON_PATH, SERVICE_ACCOUNT_FILE); + } + + @AfterAll + public void tearDown() throws IOException { + Catalog catalog = metalake.loadCatalog(catalogName); + catalog.asSchemas().dropSchema(schemaName, true); + metalake.dropCatalog(catalogName, true); + client.dropMetalake(metalakeName, true); + + if (client != null) { + client.close(); + client = null; + } + + try { + closer.close(); + } catch (Exception e) { + LOG.error("Exception in closing CloseableGroup", e); + } + } + + /** + * Remove the `gravitino.bypass` prefix from the configuration and pass it to the real file system + * This method corresponds to the method org.apache.gravitino.filesystem.hadoop + * .GravitinoVirtualFileSystem#getConfigMap(Configuration) in the original code. 
+ */ + protected Configuration convertGvfsConfigToRealFileSystemConfig(Configuration gvfsConf) { + Configuration gcsConf = new Configuration(); + Map map = Maps.newHashMap(); + + gvfsConf.forEach(entry -> map.put(entry.getKey(), entry.getValue())); + + Map hadoopConfMap = + FileSystemUtils.toHadoopConfigMap( + map, GCSFileSystemProvider.GRAVITINO_KEY_TO_GCS_HADOOP_KEY); + + hadoopConfMap.forEach(gcsConf::set); + + return gcsConf; + } + + protected String genStorageLocation(String fileset) { + return String.format("gs://%s/%s", BUCKET_NAME, fileset); + } + + @Disabled( + "GCS does not support append, java.io.IOException: The append operation is not supported") + public void testAppend() throws IOException {} + + private static boolean isGCPConfigured() { + return StringUtils.isNotBlank(System.getenv("GCS_STS_SERVICE_ACCOUNT_JSON_PATH")) + && StringUtils.isNotBlank(System.getenv("GCS_STS_BUCKET_NAME")); + } +} diff --git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemRealS3IT.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemS3CredentialIT.java similarity index 97% rename from clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemRealS3IT.java rename to clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemS3CredentialIT.java index e5d775b6d7d..0490399bc55 100644 --- a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemRealS3IT.java +++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemS3CredentialIT.java @@ -40,9 +40,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class GravitinoVirtualFileSystemRealS3IT extends GravitinoVirtualFileSystemIT { +public class GravitinoVirtualFileSystemS3CredentialIT extends GravitinoVirtualFileSystemIT { private static final Logger LOG = - LoggerFactory.getLogger(GravitinoVirtualFileSystemRealS3IT.class); + LoggerFactory.getLogger(GravitinoVirtualFileSystemS3CredentialIT.class); public static final String BUCKET_NAME = System.getenv("S3_STS_BUCKET_NAME"); public static final String S3_ACCESS_KEY = System.getenv("S3_STS_ACCESS_KEY_ID"); From 18f6ff6a7bfd6b73d50681b91f19f4154837a848 Mon Sep 17 00:00:00 2001 From: yuqi Date: Fri, 27 Dec 2024 19:44:33 +0800 Subject: [PATCH 04/59] fix --- bundles/aliyun-bundle/build.gradle.kts | 1 + .../integration/test/HadoopABSCatalogIT.java | 1 - .../catalog/hadoop/common/Properties.java | 30 ------------------- 3 files changed, 1 insertion(+), 31 deletions(-) delete mode 100644 catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/common/Properties.java diff --git a/bundles/aliyun-bundle/build.gradle.kts b/bundles/aliyun-bundle/build.gradle.kts index bd0841ecb0d..c8377285599 100644 --- a/bundles/aliyun-bundle/build.gradle.kts +++ b/bundles/aliyun-bundle/build.gradle.kts @@ -30,6 +30,7 @@ dependencies { implementation(libs.hadoop3.client.api) implementation(libs.hadoop3.client.runtime) implementation(libs.hadoop3.oss) + implementation(libs.httpclient) } tasks.withType(ShadowJar::class.java) { diff --git a/catalogs/catalog-hadoop/src/test/java/org/apache/gravitino/catalog/hadoop/integration/test/HadoopABSCatalogIT.java 
b/catalogs/catalog-hadoop/src/test/java/org/apache/gravitino/catalog/hadoop/integration/test/HadoopABSCatalogIT.java index ec5b5bd4d5b..8d068f37ad4 100644 --- a/catalogs/catalog-hadoop/src/test/java/org/apache/gravitino/catalog/hadoop/integration/test/HadoopABSCatalogIT.java +++ b/catalogs/catalog-hadoop/src/test/java/org/apache/gravitino/catalog/hadoop/integration/test/HadoopABSCatalogIT.java @@ -140,7 +140,6 @@ public void testCreateSchemaAndFilesetWithSpecialLocation() { catalogProps.put("location", ossLocation); catalogProps.put(AzureProperties.GRAVITINO_AZURE_STORAGE_ACCOUNT_NAME, ABS_ACCOUNT_NAME); catalogProps.put(AzureProperties.GRAVITINO_AZURE_STORAGE_ACCOUNT_KEY, ABS_ACCOUNT_KEY); - catalogProps.put("gravitino.client.useCloudStoreCredential", "true"); catalogProps.put(FILESYSTEM_PROVIDERS, AzureFileSystemProvider.ABS_PROVIDER_NAME); diff --git a/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/common/Properties.java b/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/common/Properties.java deleted file mode 100644 index d1f9650a774..00000000000 --- a/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/common/Properties.java +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.gravitino.catalog.hadoop.common; - -public class Properties { - - // The key that show whether to use Gravitino Cloud Store credential. - public static final String USE_GRAVITINO_CLOUD_STORE_CREDENTIAL = - "fs.gravitino.client.useCloudStoreCredential"; - - // The default value of the key that show whether to use Gravitino Cloud Store credential. 
- public static final boolean DEFAULT_USE_GRAVITINO_CLOUD_STORE_CREDENTIAL = true; -} From 8fc56aeed25a9c962407be00a91f6816f076f26f Mon Sep 17 00:00:00 2001 From: yuqi Date: Fri, 27 Dec 2024 19:50:22 +0800 Subject: [PATCH 05/59] Fix --- .../filesystem/hadoop/GravitinoVirtualFileSystem.java | 5 +++++ .../hadoop/GravitinoVirtualFileSystemConfiguration.java | 1 + 2 files changed, 6 insertions(+) diff --git a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java index 5b422da1327..230489953dd 100644 --- a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java +++ b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java @@ -195,6 +195,11 @@ private ThreadFactory newDaemonThreadFactory(String name) { return new ThreadFactoryBuilder().setDaemon(true).setNameFormat(name + "-%d").build(); } + /** + * Get Gravitino client by the configuration. + * @param configuration The configuration for the Gravitino client. + * @return The Gravitino client. + */ public GravitinoClient initializeClient(Configuration configuration) { // initialize the Gravitino client String serverUri = diff --git a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystemConfiguration.java b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystemConfiguration.java index ef2b8de852a..8b83dc002b3 100644 --- a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystemConfiguration.java +++ b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystemConfiguration.java @@ -98,6 +98,7 @@ public class GravitinoVirtualFileSystemConfiguration { public static final long FS_GRAVITINO_FILESET_CACHE_EVICTION_MILLS_AFTER_ACCESS_DEFAULT = 1000L * 60 * 60; + /** The configuration key for the fileset identifier. 
*/ public static final String GVFS_FILESET_IDENTIFIER = "fs.gvfs.fileset.identifier"; private GravitinoVirtualFileSystemConfiguration() {} From 00fa0983cd60868037ff9d2cfeedb54e48efcb68 Mon Sep 17 00:00:00 2001 From: yuqi Date: Fri, 27 Dec 2024 19:53:30 +0800 Subject: [PATCH 06/59] Fix --- .../hadoop/integration/test/GravitinoVirtualFileSystemIT.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemIT.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemIT.java index a77081adfa0..b971ab918d2 100644 --- a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemIT.java +++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemIT.java @@ -241,7 +241,7 @@ public void testDelete() throws IOException { String fileName = "test.txt"; Path deletePath = new Path(gvfsPath + "/" + fileName); try (FileSystem gvfs = gvfsPath.getFileSystem(conf)) { - // Assertions.assertTrue(gvfs.exists(gvfsPath)); + Assertions.assertTrue(gvfs.exists(gvfsPath)); gvfs.create(deletePath).close(); Assertions.assertTrue(gvfs.exists(deletePath)); Assertions.assertTrue(gvfs.getFileStatus(deletePath).isFile()); From 682705d528183e1ae2be034b4816970180d831e2 Mon Sep 17 00:00:00 2001 From: yuqi Date: Fri, 27 Dec 2024 20:17:37 +0800 Subject: [PATCH 07/59] Fix --- .../gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java | 1 + 1 file changed, 1 insertion(+) diff --git a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java index 230489953dd..505d8429e9e 100644 --- a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java +++ b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java @@ -197,6 +197,7 @@ private ThreadFactory newDaemonThreadFactory(String name) { /** * Get Gravitino client by the configuration. + * * @param configuration The configuration for the Gravitino client. * @return The Gravitino client. 
*/ From 50a4d15e5b3765c9884d43cff0571f657c270d76 Mon Sep 17 00:00:00 2001 From: yuqi Date: Sat, 28 Dec 2024 10:08:38 +0800 Subject: [PATCH 08/59] Fix --- .../org/apache/gravitino/filesystem/hadoop/TestGvfsBase.java | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestGvfsBase.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestGvfsBase.java index 5b10accb2de..16fb4e1282c 100644 --- a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestGvfsBase.java +++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestGvfsBase.java @@ -99,7 +99,6 @@ public void init() { } @Test - @Disabled public void testFSCache() throws IOException { String filesetName = "testFSCache"; Path managedFilesetPath = @@ -150,7 +149,8 @@ public void testFSCache() throws IOException { Objects.requireNonNull( ((GravitinoVirtualFileSystem) gravitinoFileSystem) .internalFileSystemCache() - .getIfPresent(NameIdentifier.of("file"))); + .getIfPresent( + NameIdentifier.of(metalakeName, catalogName, schemaName, "testFSCache"))); String anotherFilesetName = "test_new_fs"; Path diffLocalPath = @@ -163,7 +163,6 @@ public void testFSCache() throws IOException { } @Test - @Disabled public void testInternalCache() throws IOException { Path localPath1 = FileSystemTestUtils.createLocalDirPrefix(catalogName, schemaName, "fileset1"); Path filesetPath1 = From 7f0a99b7dcd7571f2f1c121cbbc67477bc1276f5 Mon Sep 17 00:00:00 2001 From: yuqi Date: Sun, 29 Dec 2024 18:34:54 +0800 Subject: [PATCH 09/59] Fix --- .../gravitino/oss/credential/OSSTokenProvider.java | 2 +- .../gravitino/s3/fs/S3FileSystemProvider.java | 13 +++++++------ .../gravitino/gcs/fs/GCSFileSystemProvider.java | 7 +++++++ .../GravitinoVirtualFileSystemGCSCredentialIT.java | 1 + .../test/GravitinoVirtualFileSystemGCSIT.java | 1 + .../GravitinoVirtualFileSystemS3CredentialIT.java | 3 +++ 6 files changed, 20 insertions(+), 7 deletions(-) diff --git a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/credential/OSSTokenProvider.java b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/credential/OSSTokenProvider.java index 04ef0022a10..660dd614f63 100644 --- a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/credential/OSSTokenProvider.java +++ b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/credential/OSSTokenProvider.java @@ -156,7 +156,7 @@ private String createPolicy(Set readLocations, Set writeLocation key -> Statement.builder() .effect(Effect.ALLOW) - .addAction("oss:ListBucket") + .addAction("oss:ListObjects") .addResource(key) .condition(getCondition(uri))); // GetBucketLocation diff --git a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java index 625ba462ee1..a54b828106e 100644 --- a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java +++ b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java @@ -62,14 +62,15 @@ public FileSystem getFileSystem(Path path, Map config) throws IO Map hadoopConfMap = FileSystemUtils.toHadoopConfigMap(config, GRAVITINO_KEY_TO_S3_HADOOP_KEY); - if (!hadoopConfMap.containsKey(S3_CREDENTIAL_KEY) - && config.containsKey( - GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_SERVER_URI_KEY)) { - hadoopConfMap.put( - Constants.AWS_CREDENTIALS_PROVIDER, 
S3SessionCredentialProvider.class.getCanonicalName()); + hadoopConfMap.forEach(configuration::set); + if (!hadoopConfMap.containsKey(S3_CREDENTIAL_KEY)) { + configuration.set(S3_CREDENTIAL_KEY, S3_SIMPLE_CREDENTIAL); } - hadoopConfMap.forEach(configuration::set); + if (config.containsKey(GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_SERVER_URI_KEY)) { + configuration.set( + Constants.AWS_CREDENTIALS_PROVIDER, S3SessionCredentialProvider.class.getCanonicalName()); + } // Hadoop-aws 2 does not support IAMInstanceCredentialsProvider checkAndSetCredentialProvider(configuration); diff --git a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java index 3ac0fc4a63a..376d4285649 100644 --- a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java +++ b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java @@ -18,11 +18,13 @@ */ package org.apache.gravitino.gcs.fs; +import com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem; import com.google.cloud.hadoop.util.AccessTokenProvider; import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableMap; import java.io.IOException; import java.util.Map; +import org.apache.commons.lang3.StringUtils; import org.apache.gravitino.catalog.hadoop.fs.FileSystemProvider; import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; import org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystemConfiguration; @@ -55,6 +57,11 @@ public FileSystem getFileSystem(Path path, Map config) throws IO "fs.gs.auth.access.token.provider.impl", GCSCredentialProvider.class.getName()); } } + + if (StringUtils.isBlank(configuration.get("fs.gs.impl"))) { + configuration.set("fs.gs.impl", GoogleHadoopFileSystem.class.getName()); + } + return FileSystem.newInstance(path.toUri(), configuration); } diff --git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemGCSCredentialIT.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemGCSCredentialIT.java index c2460b75f7b..32486b48c47 100644 --- a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemGCSCredentialIT.java +++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemGCSCredentialIT.java @@ -92,6 +92,7 @@ public void startUp() throws Exception { conf.set("fs.gvfs.impl.disable.cache", "true"); conf.set("fs.gravitino.server.uri", serverUri); conf.set("fs.gravitino.client.metalake", metalakeName); + conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"); // Pass this configuration to the real file system conf.set(GCSProperties.GCS_SERVICE_ACCOUNT_JSON_PATH, SERVICE_ACCOUNT_FILE); diff --git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemGCSIT.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemGCSIT.java index f273708810c..33f8f910ae3 100644 --- a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemGCSIT.java +++ 
b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemGCSIT.java @@ -88,6 +88,7 @@ public void startUp() throws Exception { conf.set("fs.gvfs.impl.disable.cache", "true"); conf.set("fs.gravitino.server.uri", serverUri); conf.set("fs.gravitino.client.metalake", metalakeName); + conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"); // Pass this configuration to the real file system conf.set(GCSProperties.GCS_SERVICE_ACCOUNT_JSON_PATH, SERVICE_ACCOUNT_FILE); diff --git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemS3CredentialIT.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemS3CredentialIT.java index 19297b131ab..e424615b79a 100644 --- a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemS3CredentialIT.java +++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemS3CredentialIT.java @@ -50,6 +50,7 @@ public class GravitinoVirtualFileSystemS3CredentialIT extends GravitinoVirtualFi public static final String BUCKET_NAME = System.getenv("S3_STS_BUCKET_NAME"); public static final String S3_ACCESS_KEY = System.getenv("S3_STS_ACCESS_KEY_ID"); public static final String S3_SECRET_KEY = System.getenv("S3_STS_SECRET_ACCESS_KEY"); + public static final String S3_ENDPOINT = System.getenv("S3_STS_ENDPOINT"); public static final String S3_REGION = System.getenv("S3_STS_REGION"); public static final String S3_ROLE_ARN = System.getenv("S3_STS_ROLE_ARN"); @@ -82,6 +83,7 @@ public void startUp() throws Exception { Map properties = Maps.newHashMap(); properties.put(S3Properties.GRAVITINO_S3_ACCESS_KEY_ID, S3_ACCESS_KEY); properties.put(S3Properties.GRAVITINO_S3_SECRET_ACCESS_KEY, S3_SECRET_KEY); + properties.put(S3Properties.GRAVITINO_S3_ENDPOINT, S3_ENDPOINT); properties.put( "gravitino.bypass.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider"); @@ -109,6 +111,7 @@ public void startUp() throws Exception { // Pass this configuration to the real file system conf.set(S3Properties.GRAVITINO_S3_SECRET_ACCESS_KEY, S3_SECRET_KEY); conf.set(S3Properties.GRAVITINO_S3_ACCESS_KEY_ID, S3_ACCESS_KEY); + conf.set(S3Properties.GRAVITINO_S3_ENDPOINT, S3_ENDPOINT); conf.set(S3Properties.GRAVITINO_S3_REGION, S3_REGION); conf.set(S3Properties.GRAVITINO_S3_ROLE_ARN, S3_ROLE_ARN); } From 22bea5cb28b1eb515706b79e14c9e8ef1233c665 Mon Sep 17 00:00:00 2001 From: yuqi Date: Sun, 29 Dec 2024 19:22:37 +0800 Subject: [PATCH 10/59] fix conflict --- .../org/apache/gravitino/abs/fs/AzureSasCredentialProvider.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialProvider.java b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialProvider.java index aa32cb39c6d..5c2900ab5a1 100644 --- a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialProvider.java +++ b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialProvider.java @@ -116,7 +116,7 @@ private void refresh() { Credential credential = credentials[0]; Map credentialMap = credential.toProperties(); - if (ADLSTokenCredential.ADLS_SAS_TOKEN_CREDENTIAL_TYPE.equals( + if 
(ADLSTokenCredential.ADLS_TOKEN_CREDENTIAL_TYPE.equals( credentialMap.get(Credential.CREDENTIAL_TYPE))) { this.sasToken = credentialMap.get(GRAVITINO_ADLS_SAS_TOKEN); } else { From f4f22876903afd9297fa5f235380d2b20b8678d1 Mon Sep 17 00:00:00 2001 From: yuqi Date: Mon, 30 Dec 2024 11:59:24 +0800 Subject: [PATCH 11/59] fix --- .../gravitino/abs/fs/AzureFileSystemProvider.java | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java index e37cf75d94c..7cead5b8854 100644 --- a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java +++ b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java @@ -81,9 +81,15 @@ public FileSystem getFileSystem(@Nonnull Path path, @Nonnull Map azureSasCredentialProvider.initialize(configuration, null); String sas = azureSasCredentialProvider.getSASToken(null, null, null, null); if (sas != null) { - configuration.set(FS_AZURE_ACCOUNT_AUTH_TYPE_PROPERTY_NAME, AuthType.SAS.name()); + String accountName = + String.format( + "%s.dfs.core.windows.net", + config.get(AzureProperties.GRAVITINO_AZURE_STORAGE_ACCOUNT_NAME)); + + configuration.set( + FS_AZURE_ACCOUNT_AUTH_TYPE_PROPERTY_NAME + "." + accountName, AuthType.SAS.name()); configuration.set( - FS_AZURE_SAS_TOKEN_PROVIDER_TYPE + ".dfs.core.windows.net", + FS_AZURE_SAS_TOKEN_PROVIDER_TYPE + "." + accountName, AzureSasCredentialProvider.class.getName()); } else if (azureSasCredentialProvider.getAzureStorageAccountKey() != null && azureSasCredentialProvider.getAzureStorageAccountName() != null) { From 20f7ec6b144f8ed9dcf5dfa4e32ccde46abf3ba9 Mon Sep 17 00:00:00 2001 From: yuqi Date: Mon, 30 Dec 2024 20:00:56 +0800 Subject: [PATCH 12/59] fix --- .../gravitino/oss/credential/OSSTokenProvider.java | 1 + .../gravitino/s3/credential/S3TokenProvider.java | 11 +++++++++-- .../gravitino/gcs/credential/GCSTokenProvider.java | 6 ++++++ .../filesystem/hadoop/GravitinoVirtualFileSystem.java | 6 ++++++ 4 files changed, 22 insertions(+), 2 deletions(-) diff --git a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/credential/OSSTokenProvider.java b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/credential/OSSTokenProvider.java index 660dd614f63..e1ea92daeb1 100644 --- a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/credential/OSSTokenProvider.java +++ b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/credential/OSSTokenProvider.java @@ -166,6 +166,7 @@ private String createPolicy(Set readLocations, Set writeLocation Statement.builder() .effect(Effect.ALLOW) .addAction("oss:GetBucketLocation") + .addAction("oss:GetBucketInfo") .addResource(key)); }); diff --git a/bundles/aws/src/main/java/org/apache/gravitino/s3/credential/S3TokenProvider.java b/bundles/aws/src/main/java/org/apache/gravitino/s3/credential/S3TokenProvider.java index 24b88875de9..18f82794740 100644 --- a/bundles/aws/src/main/java/org/apache/gravitino/s3/credential/S3TokenProvider.java +++ b/bundles/aws/src/main/java/org/apache/gravitino/s3/credential/S3TokenProvider.java @@ -20,6 +20,7 @@ package org.apache.gravitino.s3.credential; import java.net.URI; +import java.util.Arrays; import java.util.HashMap; import java.util.Map; import java.util.Objects; @@ -121,6 +122,7 @@ private IamPolicy createPolicy( IamStatement.builder() .effect(IamEffect.ALLOW) .addAction("s3:GetObject") + 
.addAction("s3:GetObjectAttributes") .addAction("s3:GetObjectVersion"); Map bucketListStatmentBuilder = new HashMap<>(); Map bucketGetLocationStatmentBuilder = new HashMap<>(); @@ -134,6 +136,7 @@ private IamPolicy createPolicy( allowGetObjectStatementBuilder.addResource( IamResource.create(getS3UriWithArn(arnPrefix, uri))); String bucketArn = arnPrefix + getBucketName(uri); + String rawPath = trimLeadingSlash(uri.getPath()); bucketListStatmentBuilder .computeIfAbsent( bucketArn, @@ -142,10 +145,14 @@ private IamPolicy createPolicy( .effect(IamEffect.ALLOW) .addAction("s3:ListBucket") .addResource(key)) - .addCondition( + .addConditions( IamConditionOperator.STRING_LIKE, "s3:prefix", - concatPathWithSep(trimLeadingSlash(uri.getPath()), "*", "/")); + Arrays.asList( + // Get raw path metadata information for AWS hadoop connector + rawPath, + // Listing objects in raw path + concatPathWithSep(rawPath, "*", "/"))); bucketGetLocationStatmentBuilder.computeIfAbsent( bucketArn, key -> diff --git a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/credential/GCSTokenProvider.java b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/credential/GCSTokenProvider.java index 3f7d5bcfaa3..2b0e2437d30 100644 --- a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/credential/GCSTokenProvider.java +++ b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/credential/GCSTokenProvider.java @@ -146,6 +146,12 @@ private CredentialAccessBoundary getAccessBoundary( CredentialAccessBoundary.newBuilder(); readBuckets.forEach( bucket -> { + AccessBoundaryRule rule1 = + AccessBoundaryRule.newBuilder() + .setAvailableResource(toGCSBucketResource(bucket)) + .setAvailablePermissions(Arrays.asList("inRole:roles/storage.legacyBucketReader")) + .build(); + credentialAccessBoundaryBuilder.addRule(rule1); List readConditions = readExpressions.get(bucket); AccessBoundaryRule rule = getAccessBoundaryRule( diff --git a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java index 505d8429e9e..152f520274a 100644 --- a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java +++ b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java @@ -43,6 +43,7 @@ import java.util.regex.Pattern; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.reflect.FieldUtils; +import org.apache.gravitino.Catalog; import org.apache.gravitino.NameIdentifier; import org.apache.gravitino.audit.CallerContext; import org.apache.gravitino.audit.FilesetAuditConstants; @@ -371,6 +372,8 @@ private FilesetContextPair getFilesetContext(Path virtualPath, FilesetDataOperat FilesetCatalog filesetCatalog = catalogCache.get( catalogIdent, ident -> client.loadCatalog(catalogIdent.name()).asFilesetCatalog()); + Catalog catalog = (Catalog) filesetCatalog; + Preconditions.checkArgument( filesetCatalog != null, String.format("Loaded fileset catalog: %s is null.", catalogIdent)); @@ -413,6 +416,9 @@ private FilesetContextPair getFilesetContext(Path virtualPath, FilesetDataOperat // If enable the cloud store credential, we should pass the configuration here. 
maps.put(GVFS_FILESET_IDENTIFIER, identifier.toString()); + // Should add catalog properties to the configuration + maps.putAll(catalog.properties()); + return provider.getFileSystem(filePath, maps); } catch (IOException ioe) { throw new GravitinoRuntimeException( From e8814b04de8b686e3e4c426b9950a4d066c79d9f Mon Sep 17 00:00:00 2001 From: yuqi Date: Mon, 30 Dec 2024 20:12:28 +0800 Subject: [PATCH 13/59] Polish code. --- ...dentialProvider.java => OSSCredentialProvider.java} | 7 +++---- .../apache/gravitino/oss/fs/OSSFileSystemProvider.java | 3 +-- ...edentialProvider.java => S3CredentialProvider.java} | 10 ++++------ .../apache/gravitino/s3/fs/S3FileSystemProvider.java | 2 +- .../gravitino/abs/fs/AzureSasCredentialProvider.java | 6 ------ .../apache/gravitino/gcs/fs/GCSCredentialProvider.java | 1 - 6 files changed, 9 insertions(+), 20 deletions(-) rename bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/{OSSSessionCredentialProvider.java => OSSCredentialProvider.java} (95%) rename bundles/aws/src/main/java/org/apache/gravitino/s3/fs/{S3SessionCredentialProvider.java => S3CredentialProvider.java} (93%) diff --git a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSSessionCredentialProvider.java b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialProvider.java similarity index 95% rename from bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSSessionCredentialProvider.java rename to bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialProvider.java index e2225be27fb..6048d2049fe 100644 --- a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSSessionCredentialProvider.java +++ b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialProvider.java @@ -42,19 +42,18 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class OSSSessionCredentialProvider implements CredentialsProvider { +public class OSSCredentialProvider implements CredentialsProvider { - private static final Logger LOGGER = LoggerFactory.getLogger(OSSSessionCredentialProvider.class); + private static final Logger LOGGER = LoggerFactory.getLogger(OSSCredentialProvider.class); private Credentials basicCredentials; private final String filesetIdentifier; private long expirationTime; private final GravitinoClient client; private final Configuration configuration; - public OSSSessionCredentialProvider(URI uri, Configuration conf) { + public OSSCredentialProvider(URI uri, Configuration conf) { this.filesetIdentifier = conf.get(GravitinoVirtualFileSystemConfiguration.GVFS_FILESET_IDENTIFIER); - // extra value and init Gravitino client here GravitinoVirtualFileSystem gravitinoVirtualFileSystem = new GravitinoVirtualFileSystem(); this.client = gravitinoVirtualFileSystem.initializeClient(conf); this.configuration = conf; diff --git a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java index 4c3ba0d19b8..b43e2dc7758 100644 --- a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java +++ b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java @@ -65,8 +65,7 @@ public FileSystem getFileSystem(Path path, Map config) throws IO && config.containsKey( GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_SERVER_URI_KEY)) { hadoopConfMap.put( - Constants.CREDENTIALS_PROVIDER_KEY, - OSSSessionCredentialProvider.class.getCanonicalName()); + 
Constants.CREDENTIALS_PROVIDER_KEY, OSSCredentialProvider.class.getCanonicalName()); } hadoopConfMap.forEach(configuration::set); diff --git a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3SessionCredentialProvider.java b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialProvider.java similarity index 93% rename from bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3SessionCredentialProvider.java rename to bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialProvider.java index 49c3c09b144..f844c590fbc 100644 --- a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3SessionCredentialProvider.java +++ b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialProvider.java @@ -42,9 +42,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class S3SessionCredentialProvider implements AWSCredentialsProvider { +public class S3CredentialProvider implements AWSCredentialsProvider { - private static final Logger LOGGER = LoggerFactory.getLogger(S3SessionCredentialProvider.class); + private static final Logger LOGGER = LoggerFactory.getLogger(S3CredentialProvider.class); private final GravitinoClient client; private final String filesetIdentifier; private final Configuration configuration; @@ -52,12 +52,10 @@ public class S3SessionCredentialProvider implements AWSCredentialsProvider { private AWSCredentials basicSessionCredentials; private long expirationTime; - public S3SessionCredentialProvider(final URI uri, final Configuration conf) { + public S3CredentialProvider(final URI uri, final Configuration conf) { this.filesetIdentifier = conf.get(GravitinoVirtualFileSystemConfiguration.GVFS_FILESET_IDENTIFIER); this.configuration = conf; - - // extra value and init Gravitino client here GravitinoVirtualFileSystem gravitinoVirtualFileSystem = new GravitinoVirtualFileSystem(); this.client = gravitinoVirtualFileSystem.initializeClient(conf); } @@ -86,7 +84,7 @@ public void refresh() { Fileset fileset = filesetCatalog.loadFileset(NameIdentifier.of(idents[2], idents[3])); Credential[] credentials = fileset.supportsCredentials().getCredentials(); - // Can't find any credential, use the default one. + // Can't find any credential, use the default AKSK if possible. 
if (credentials.length == 0) { LOGGER.warn("No credential found for fileset: {}, try to use static AKSK", filesetIdentifier); expirationTime = Long.MAX_VALUE; diff --git a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java index a54b828106e..5b3b17c6148 100644 --- a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java +++ b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java @@ -69,7 +69,7 @@ public FileSystem getFileSystem(Path path, Map config) throws IO if (config.containsKey(GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_SERVER_URI_KEY)) { configuration.set( - Constants.AWS_CREDENTIALS_PROVIDER, S3SessionCredentialProvider.class.getCanonicalName()); + Constants.AWS_CREDENTIALS_PROVIDER, S3CredentialProvider.class.getCanonicalName()); } // Hadoop-aws 2 does not support IAMInstanceCredentialsProvider diff --git a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialProvider.java b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialProvider.java index 5c2900ab5a1..da2f8740cea 100644 --- a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialProvider.java +++ b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialProvider.java @@ -56,10 +56,6 @@ public class AzureSasCredentialProvider implements SASTokenProvider, Configurabl private long expirationTime; - public String getSasToken() { - return sasToken; - } - public String getAzureStorageAccountName() { return azureStorageAccountName; } @@ -82,8 +78,6 @@ public Configuration getConf() { public void initialize(Configuration conf, String accountName) throws IOException { this.filesetIdentifier = conf.get(GravitinoVirtualFileSystemConfiguration.GVFS_FILESET_IDENTIFIER); - - // extra value and init Gravitino client here GravitinoVirtualFileSystem gravitinoVirtualFileSystem = new GravitinoVirtualFileSystem(); this.client = gravitinoVirtualFileSystem.initializeClient(conf); } diff --git a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialProvider.java b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialProvider.java index dfb2207b181..90c7ce2b257 100644 --- a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialProvider.java +++ b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialProvider.java @@ -95,7 +95,6 @@ public void setConf(Configuration configuration) { this.filesetIdentifier = configuration.get(GravitinoVirtualFileSystemConfiguration.GVFS_FILESET_IDENTIFIER); - // extra value and init Gravitino client here GravitinoVirtualFileSystem gravitinoVirtualFileSystem = new GravitinoVirtualFileSystem(); this.client = gravitinoVirtualFileSystem.initializeClient(configuration); } From 06b192b460a5d646d46ee6c3c6b47d1bd490de22 Mon Sep 17 00:00:00 2001 From: yuqi Date: Tue, 31 Dec 2024 17:50:35 +0800 Subject: [PATCH 14/59] fix --- bundles/aws-bundle/build.gradle.kts | 1 + bundles/azure-bundle/build.gradle.kts | 1 + .../gravitino/abs/fs/AzureFileSystemProvider.java | 2 ++ bundles/gcp-bundle/build.gradle.kts | 1 + .../gravitino/gcs/fs/GCSFileSystemProvider.java | 6 ------ .../dto/requests/model_version_link_request.py | 2 +- .../hadoop/GravitinoVirtualFileSystem.java | 12 ++++++------ .../GravitinoVirtualFileSystemGCSCredentialIT.java | 1 - .../test/GravitinoVirtualFileSystemGCSIT.java | 1 - .../credential/config/GCSCredentialConfig.java | 
2 +- 10 files changed, 13 insertions(+), 16 deletions(-) diff --git a/bundles/aws-bundle/build.gradle.kts b/bundles/aws-bundle/build.gradle.kts index 35b1e22a4f6..a5765fb0641 100644 --- a/bundles/aws-bundle/build.gradle.kts +++ b/bundles/aws-bundle/build.gradle.kts @@ -39,6 +39,7 @@ tasks.withType(ShadowJar::class.java) { relocate("org.apache.commons.lang3", "org.apache.gravitino.aws.shaded.org.apache.commons.lang3") relocate("com.google.common", "org.apache.gravitino.aws.shaded.com.google.common") relocate("com.fasterxml.jackson", "org.apache.gravitino.aws.shaded.com.fasterxml.jackson") + mergeServiceFiles() } tasks.jar { diff --git a/bundles/azure-bundle/build.gradle.kts b/bundles/azure-bundle/build.gradle.kts index 7d9e253ac8a..fd57d33e105 100644 --- a/bundles/azure-bundle/build.gradle.kts +++ b/bundles/azure-bundle/build.gradle.kts @@ -42,6 +42,7 @@ tasks.withType(ShadowJar::class.java) { relocate("com.fasterxml", "org.apache.gravitino.azure.shaded.com.fasterxml") relocate("com.google.common", "org.apache.gravitino.azure.shaded.com.google.common") relocate("org.eclipse.jetty", "org.apache.gravitino.azure.shaded.org.eclipse.jetty") + mergeServiceFiles() } tasks.jar { diff --git a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java index 7cead5b8854..20e74a2df88 100644 --- a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java +++ b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java @@ -20,6 +20,7 @@ package org.apache.gravitino.abs.fs; import static org.apache.hadoop.fs.azurebfs.constants.ConfigurationKeys.FS_AZURE_ACCOUNT_AUTH_TYPE_PROPERTY_NAME; +import static org.apache.hadoop.fs.azurebfs.constants.ConfigurationKeys.FS_AZURE_ACCOUNT_IS_HNS_ENABLED; import static org.apache.hadoop.fs.azurebfs.constants.ConfigurationKeys.FS_AZURE_SAS_TOKEN_PROVIDER_TYPE; import com.google.common.annotations.VisibleForTesting; @@ -91,6 +92,7 @@ public FileSystem getFileSystem(@Nonnull Path path, @Nonnull Map configuration.set( FS_AZURE_SAS_TOKEN_PROVIDER_TYPE + "." 
+ accountName, AzureSasCredentialProvider.class.getName()); + configuration.set(FS_AZURE_ACCOUNT_IS_HNS_ENABLED, "true"); } else if (azureSasCredentialProvider.getAzureStorageAccountKey() != null && azureSasCredentialProvider.getAzureStorageAccountName() != null) { configuration.set( diff --git a/bundles/gcp-bundle/build.gradle.kts b/bundles/gcp-bundle/build.gradle.kts index 73efaf9f22c..50300fafe05 100644 --- a/bundles/gcp-bundle/build.gradle.kts +++ b/bundles/gcp-bundle/build.gradle.kts @@ -42,6 +42,7 @@ tasks.withType(ShadowJar::class.java) { relocate("com.google.common", "org.apache.gravitino.gcp.shaded.com.google.common") relocate("com.fasterxml", "org.apache.gravitino.gcp.shaded.com.fasterxml") relocate("org.eclipse.jetty", "org.apache.gravitino.gcp.shaded.org.eclipse.jetty") + mergeServiceFiles() } tasks.jar { diff --git a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java index 376d4285649..e8c16648797 100644 --- a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java +++ b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java @@ -18,13 +18,11 @@ */ package org.apache.gravitino.gcs.fs; -import com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem; import com.google.cloud.hadoop.util.AccessTokenProvider; import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableMap; import java.io.IOException; import java.util.Map; -import org.apache.commons.lang3.StringUtils; import org.apache.gravitino.catalog.hadoop.fs.FileSystemProvider; import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; import org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystemConfiguration; @@ -58,10 +56,6 @@ public FileSystem getFileSystem(Path path, Map config) throws IO } } - if (StringUtils.isBlank(configuration.get("fs.gs.impl"))) { - configuration.set("fs.gs.impl", GoogleHadoopFileSystem.class.getName()); - } - return FileSystem.newInstance(path.toUri(), configuration); } diff --git a/clients/client-python/gravitino/dto/requests/model_version_link_request.py b/clients/client-python/gravitino/dto/requests/model_version_link_request.py index e16fa344e90..98b1c455145 100644 --- a/clients/client-python/gravitino/dto/requests/model_version_link_request.py +++ b/clients/client-python/gravitino/dto/requests/model_version_link_request.py @@ -59,7 +59,7 @@ def validate(self): for alias in self._aliases or []: if not self._is_not_blank(alias): - raise IllegalArgumentException('Alias must not be null or empty') + raise IllegalArgumentException("Alias must not be null or empty") def _is_not_blank(self, string: str) -> bool: return string is not None and string.strip() diff --git a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java index 152f520274a..29421274fe8 100644 --- a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java +++ b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java @@ -412,14 +412,14 @@ private FilesetContextPair getFilesetContext(Path virtualPath, FilesetDataOperat // https://github.com/apache/gravitino/issues/5609 resetFileSystemServiceLoader(scheme); - Map maps = getConfigMap(getConf()); - // If 
enable the cloud store credential, we should pass the configuration here. - maps.put(GVFS_FILESET_IDENTIFIER, identifier.toString()); + Map catalogProperty = catalog.properties(); + Map totalProperty = Maps.newHashMap(catalogProperty); - // Should add catalog properties to the configuration - maps.putAll(catalog.properties()); + totalProperty.putAll(getConfigMap(getConf())); + // If enable the cloud store credential, we should pass the configuration here. + totalProperty.put(GVFS_FILESET_IDENTIFIER, identifier.toString()); - return provider.getFileSystem(filePath, maps); + return provider.getFileSystem(filePath, totalProperty); } catch (IOException ioe) { throw new GravitinoRuntimeException( "Exception occurs when create new FileSystem for actual uri: %s, msg: %s", diff --git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemGCSCredentialIT.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemGCSCredentialIT.java index 32486b48c47..c2460b75f7b 100644 --- a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemGCSCredentialIT.java +++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemGCSCredentialIT.java @@ -92,7 +92,6 @@ public void startUp() throws Exception { conf.set("fs.gvfs.impl.disable.cache", "true"); conf.set("fs.gravitino.server.uri", serverUri); conf.set("fs.gravitino.client.metalake", metalakeName); - conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"); // Pass this configuration to the real file system conf.set(GCSProperties.GCS_SERVICE_ACCOUNT_JSON_PATH, SERVICE_ACCOUNT_FILE); diff --git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemGCSIT.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemGCSIT.java index 33f8f910ae3..f273708810c 100644 --- a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemGCSIT.java +++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemGCSIT.java @@ -88,7 +88,6 @@ public void startUp() throws Exception { conf.set("fs.gvfs.impl.disable.cache", "true"); conf.set("fs.gravitino.server.uri", serverUri); conf.set("fs.gravitino.client.metalake", metalakeName); - conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"); // Pass this configuration to the real file system conf.set(GCSProperties.GCS_SERVICE_ACCOUNT_JSON_PATH, SERVICE_ACCOUNT_FILE); diff --git a/core/src/main/java/org/apache/gravitino/credential/config/GCSCredentialConfig.java b/core/src/main/java/org/apache/gravitino/credential/config/GCSCredentialConfig.java index 1a2b38ef641..3bf5cfc1425 100644 --- a/core/src/main/java/org/apache/gravitino/credential/config/GCSCredentialConfig.java +++ b/core/src/main/java/org/apache/gravitino/credential/config/GCSCredentialConfig.java @@ -30,7 +30,7 @@ public class GCSCredentialConfig extends Config { @VisibleForTesting - public static final String GRAVITINO_GCS_CREDENTIAL_FILE_PATH = "gcs-credential-file-path"; + public static final String GRAVITINO_GCS_CREDENTIAL_FILE_PATH = "gcs-service-account-file"; public static final 
ConfigEntry GCS_CREDENTIAL_FILE_PATH = new ConfigBuilder(GRAVITINO_GCS_CREDENTIAL_FILE_PATH) From 440db591090ba9e6182b826c45bc730e643a6ea6 Mon Sep 17 00:00:00 2001 From: yuqi Date: Tue, 31 Dec 2024 21:57:13 +0800 Subject: [PATCH 15/59] fix --- .../oss/fs/OSSCredentialProvider.java | 21 ++++++++++++++-- .../gravitino/s3/fs/S3CredentialProvider.java | 20 ++++++++++++++- .../abs/fs/AzureSasCredentialProvider.java | 21 ++++++++++++++-- .../hadoop/GravitinoVirtualFileSystem.java | 25 +++++++++++++++++-- 4 files changed, 80 insertions(+), 7 deletions(-) diff --git a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialProvider.java b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialProvider.java index 6048d2049fe..ccf4d887750 100644 --- a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialProvider.java +++ b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialProvider.java @@ -92,8 +92,7 @@ private void refresh() { return; } - // Use the first one. - Credential credential = credentials[0]; + Credential credential = getCredential(credentials); Map credentialMap = credential.toProperties(); String accessKeyId = credentialMap.get(GRAVITINO_OSS_SESSION_ACCESS_KEY_ID); @@ -112,4 +111,22 @@ private void refresh() { expirationTime = Long.MAX_VALUE; } } + + /** + * Get the credential from the credential array. Using dynamic credential first, if not found, + * uses static credential. + * + * @param credentials The credential array. + * @return The credential. + */ + private Credential getCredential(Credential[] credentials) { + for (Credential credential : credentials) { + if (OSSTokenCredential.OSS_TOKEN_CREDENTIAL_TYPE.equals(credential.credentialType())) { + return credential; + } + } + + // Not found, use the first one. + return credentials[0]; + } } diff --git a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialProvider.java b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialProvider.java index f844c590fbc..57f6a517299 100644 --- a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialProvider.java +++ b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialProvider.java @@ -94,7 +94,7 @@ public void refresh() { return; } - Credential credential = credentials[0]; + Credential credential = getCredential(credentials); Map credentialMap = credential.toProperties(); String accessKeyId = credentialMap.get(GRAVITINO_S3_SESSION_ACCESS_KEY_ID); @@ -114,4 +114,22 @@ public void refresh() { expirationTime = Long.MAX_VALUE; } } + + /** + * Get the credential from the credential array. Using dynamic credential first, if not found, + * uses static credential. + * + * @param credentials The credential array. + * @return The credential. + */ + private Credential getCredential(Credential[] credentials) { + for (Credential credential : credentials) { + if (S3TokenCredential.S3_TOKEN_CREDENTIAL_TYPE.equals(credential.credentialType())) { + return credential; + } + } + + // Not found, use the first one. 
+    return credentials[0];
+  }
 }
diff --git a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialProvider.java b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialProvider.java
index da2f8740cea..d018cd4c03a 100644
--- a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialProvider.java
+++ b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialProvider.java
@@ -106,8 +106,7 @@ private void refresh() {
       return;
     }
 
-    // Use the first one.
-    Credential credential = credentials[0];
+    Credential credential = getCredential(credentials);
     Map credentialMap = credential.toProperties();
 
     if (ADLSTokenCredential.ADLS_TOKEN_CREDENTIAL_TYPE.equals(
@@ -123,4 +122,22 @@ private void refresh() {
       return;
     }
   }
+
+  /**
+   * Get the credential from the credential array. Using dynamic credential first, if not found,
+   * uses static credential.
+   *
+   * @param credentials The credential array.
+   * @return The credential.
+   */
+  private Credential getCredential(Credential[] credentials) {
+    for (Credential credential : credentials) {
+      if (ADLSTokenCredential.ADLS_TOKEN_CREDENTIAL_TYPE.equals(credential.credentialType())) {
+        return credential;
+      }
+    }
+
+    // Not found, use the first one.
+    return credentials[0];
+  }
 }
diff --git a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java
index 29421274fe8..cb2d4d019df 100644
--- a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java
+++ b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java
@@ -27,6 +27,7 @@
 import com.google.common.base.Preconditions;
 import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
+import com.google.common.collect.Sets;
 import com.google.common.collect.Streams;
 import com.google.common.util.concurrent.ThreadFactoryBuilder;
 import java.io.File;
@@ -36,11 +37,13 @@ import java.util.List;
 import java.util.Map;
 import java.util.ServiceLoader;
+import java.util.Set;
 import java.util.concurrent.ScheduledThreadPoolExecutor;
 import java.util.concurrent.ThreadFactory;
 import java.util.concurrent.TimeUnit;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
+import java.util.stream.Collectors;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.commons.lang3.reflect.FieldUtils;
 import org.apache.gravitino.Catalog;
 import org.apache.gravitino.NameIdentifier;
 import org.apache.gravitino.audit.CallerContext;
 import org.apache.gravitino.audit.FilesetAuditConstants;
@@ -55,6 +58,9 @@ import org.apache.gravitino.client.KerberosTokenProvider;
 import org.apache.gravitino.exceptions.GravitinoRuntimeException;
 import org.apache.gravitino.file.FilesetCatalog;
+import org.apache.gravitino.storage.AzureProperties;
+import org.apache.gravitino.storage.OSSProperties;
+import org.apache.gravitino.storage.S3Properties;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.fs.FSDataOutputStream;
@@ -94,6 +100,14 @@ public class GravitinoVirtualFileSystem extends FileSystem {
   private static final String SLASH = "/";
   private final Map fileSystemProvidersMap = Maps.newHashMap();
 
+  private static final Set CATALOG_NECESSARY_PROPERTIES_FOR_CREDENTIAL =
+      Sets.newHashSet(
+          OSSProperties.GRAVITINO_OSS_ENDPOINT,
+          S3Properties.GRAVITINO_S3_ENDPOINT,
+          AzureProperties.GRAVITINO_AZURE_STORAGE_ACCOUNT_NAME);
+
   @Override
   public void initialize(URI name, Configuration configuration) throws IOException {
     if (!name.toString().startsWith(GravitinoVirtualFileSystemConfiguration.GVFS_FILESET_PREFIX)) {
@@ -412,8 +426,15 @@ private FilesetContextPair getFilesetContext(Path virtualPath, FilesetDataOperat
       // https://github.com/apache/gravitino/issues/5609
       resetFileSystemServiceLoader(scheme);

-      Map catalogProperty = catalog.properties();
-      Map totalProperty = Maps.newHashMap(catalogProperty);
+      Map necessaryPropertyFromCatalog =
+          catalog.properties().entrySet().stream()
+              .filter(
+                  property ->
+                      CATALOG_NECESSARY_PROPERTIES_FOR_CREDENTIAL.contains(
+                          property.getKey()))
+              .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
+
+      Map totalProperty = Maps.newHashMap(necessaryPropertyFromCatalog);
       totalProperty.putAll(getConfigMap(getConf()));
       // If enable the cloud store credential, we should pass the configuration here.

From 9678513c186ac0a2966e3ffa6d7642f88bc7988e Mon Sep 17 00:00:00 2001
From: yuqi
Date: Thu, 2 Jan 2025 09:49:52 +0800
Subject: [PATCH 16/59] fix

---
 ...java => GravitinoOSSCredentialProvider.java} |  7 ++++---
 .../gravitino/oss/fs/OSSFileSystemProvider.java |  3 ++-
 ....java => GravitinoS3CredentialProvider.java} |  6 +++---
 .../gravitino/s3/fs/S3FileSystemProvider.java   |  3 ++-
 .../abs/fs/AzureFileSystemProvider.java         | 17 +++++++++--------
 ...=> GravitinoAzureSasCredentialProvider.java} |  5 +++--
 .../gravitino/gcs/fs/GCSFileSystemProvider.java |  7 ++++---
 ...java => GravitinoGCSCredentialProvider.java} |  5 +++--
 8 files changed, 30 insertions(+), 23 deletions(-)
 rename bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/{OSSCredentialProvider.java => GravitinoOSSCredentialProvider.java} (95%)
 rename bundles/aws/src/main/java/org/apache/gravitino/s3/fs/{S3CredentialProvider.java => GravitinoS3CredentialProvider.java} (95%)
 rename bundles/azure/src/main/java/org/apache/gravitino/abs/fs/{AzureSasCredentialProvider.java => GravitinoAzureSasCredentialProvider.java} (96%)
 rename bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/{GCSCredentialProvider.java => GravitinoGCSCredentialProvider.java} (95%)

diff --git a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialProvider.java b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/GravitinoOSSCredentialProvider.java
similarity index 95%
rename from bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialProvider.java
rename to bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/GravitinoOSSCredentialProvider.java
index ccf4d887750..d0a44511e8b 100644
--- a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialProvider.java
+++ b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/GravitinoOSSCredentialProvider.java
@@ -42,16 +42,17 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-public class OSSCredentialProvider implements CredentialsProvider {
+public class GravitinoOSSCredentialProvider implements CredentialsProvider {
 
-  private static final Logger LOGGER = LoggerFactory.getLogger(OSSCredentialProvider.class);
+  private static final Logger LOGGER =
+      LoggerFactory.getLogger(GravitinoOSSCredentialProvider.class);
 
   private Credentials basicCredentials;
   private final String filesetIdentifier;
   private long expirationTime;
   private final GravitinoClient client;
   private final Configuration configuration;
 
-  public OSSCredentialProvider(URI uri, Configuration
conf) { + public GravitinoOSSCredentialProvider(URI uri, Configuration conf) { this.filesetIdentifier = conf.get(GravitinoVirtualFileSystemConfiguration.GVFS_FILESET_IDENTIFIER); GravitinoVirtualFileSystem gravitinoVirtualFileSystem = new GravitinoVirtualFileSystem(); diff --git a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java index b43e2dc7758..c1ae59a897e 100644 --- a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java +++ b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java @@ -65,7 +65,8 @@ public FileSystem getFileSystem(Path path, Map config) throws IO && config.containsKey( GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_SERVER_URI_KEY)) { hadoopConfMap.put( - Constants.CREDENTIALS_PROVIDER_KEY, OSSCredentialProvider.class.getCanonicalName()); + Constants.CREDENTIALS_PROVIDER_KEY, + GravitinoOSSCredentialProvider.class.getCanonicalName()); } hadoopConfMap.forEach(configuration::set); diff --git a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialProvider.java b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/GravitinoS3CredentialProvider.java similarity index 95% rename from bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialProvider.java rename to bundles/aws/src/main/java/org/apache/gravitino/s3/fs/GravitinoS3CredentialProvider.java index 57f6a517299..d3f3105499e 100644 --- a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialProvider.java +++ b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/GravitinoS3CredentialProvider.java @@ -42,9 +42,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class S3CredentialProvider implements AWSCredentialsProvider { +public class GravitinoS3CredentialProvider implements AWSCredentialsProvider { - private static final Logger LOGGER = LoggerFactory.getLogger(S3CredentialProvider.class); + private static final Logger LOGGER = LoggerFactory.getLogger(GravitinoS3CredentialProvider.class); private final GravitinoClient client; private final String filesetIdentifier; private final Configuration configuration; @@ -52,7 +52,7 @@ public class S3CredentialProvider implements AWSCredentialsProvider { private AWSCredentials basicSessionCredentials; private long expirationTime; - public S3CredentialProvider(final URI uri, final Configuration conf) { + public GravitinoS3CredentialProvider(final URI uri, final Configuration conf) { this.filesetIdentifier = conf.get(GravitinoVirtualFileSystemConfiguration.GVFS_FILESET_IDENTIFIER); this.configuration = conf; diff --git a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java index 5b3b17c6148..3747d2c104c 100644 --- a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java +++ b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java @@ -69,7 +69,8 @@ public FileSystem getFileSystem(Path path, Map config) throws IO if (config.containsKey(GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_SERVER_URI_KEY)) { configuration.set( - Constants.AWS_CREDENTIALS_PROVIDER, S3CredentialProvider.class.getCanonicalName()); + Constants.AWS_CREDENTIALS_PROVIDER, + GravitinoS3CredentialProvider.class.getCanonicalName()); } // Hadoop-aws 2 does not support IAMInstanceCredentialsProvider diff --git 
a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java index 20e74a2df88..b9aca233620 100644 --- a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java +++ b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java @@ -78,9 +78,10 @@ public FileSystem getFileSystem(@Nonnull Path path, @Nonnull Map if (config.containsKey(GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_SERVER_URI_KEY)) { // Test whether SAS works try { - AzureSasCredentialProvider azureSasCredentialProvider = new AzureSasCredentialProvider(); - azureSasCredentialProvider.initialize(configuration, null); - String sas = azureSasCredentialProvider.getSASToken(null, null, null, null); + GravitinoAzureSasCredentialProvider gravitinoAzureSasCredentialProvider = + new GravitinoAzureSasCredentialProvider(); + gravitinoAzureSasCredentialProvider.initialize(configuration, null); + String sas = gravitinoAzureSasCredentialProvider.getSASToken(null, null, null, null); if (sas != null) { String accountName = String.format( @@ -91,15 +92,15 @@ public FileSystem getFileSystem(@Nonnull Path path, @Nonnull Map FS_AZURE_ACCOUNT_AUTH_TYPE_PROPERTY_NAME + "." + accountName, AuthType.SAS.name()); configuration.set( FS_AZURE_SAS_TOKEN_PROVIDER_TYPE + "." + accountName, - AzureSasCredentialProvider.class.getName()); + GravitinoAzureSasCredentialProvider.class.getName()); configuration.set(FS_AZURE_ACCOUNT_IS_HNS_ENABLED, "true"); - } else if (azureSasCredentialProvider.getAzureStorageAccountKey() != null - && azureSasCredentialProvider.getAzureStorageAccountName() != null) { + } else if (gravitinoAzureSasCredentialProvider.getAzureStorageAccountKey() != null + && gravitinoAzureSasCredentialProvider.getAzureStorageAccountName() != null) { configuration.set( String.format( "fs.azure.account.key.%s.dfs.core.windows.net", - azureSasCredentialProvider.getAzureStorageAccountName()), - azureSasCredentialProvider.getAzureStorageAccountKey()); + gravitinoAzureSasCredentialProvider.getAzureStorageAccountName()), + gravitinoAzureSasCredentialProvider.getAzureStorageAccountKey()); } } catch (Exception e) { // Can't use SAS, use account key and account key instead diff --git a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialProvider.java b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/GravitinoAzureSasCredentialProvider.java similarity index 96% rename from bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialProvider.java rename to bundles/azure/src/main/java/org/apache/gravitino/abs/fs/GravitinoAzureSasCredentialProvider.java index d018cd4c03a..f208dac4c56 100644 --- a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialProvider.java +++ b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/GravitinoAzureSasCredentialProvider.java @@ -39,9 +39,10 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class AzureSasCredentialProvider implements SASTokenProvider, Configurable { +public class GravitinoAzureSasCredentialProvider implements SASTokenProvider, Configurable { - private static final Logger LOGGER = LoggerFactory.getLogger(AzureSasCredentialProvider.class); + private static final Logger LOGGER = + LoggerFactory.getLogger(GravitinoAzureSasCredentialProvider.class); private Configuration configuration; diff --git 
a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java index e8c16648797..7b09f80f31e 100644 --- a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java +++ b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java @@ -46,13 +46,14 @@ public FileSystem getFileSystem(Path path, Map config) throws IO .forEach(configuration::set); if (config.containsKey(GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_SERVER_URI_KEY)) { - AccessTokenProvider accessTokenProvider = new GCSCredentialProvider(); + AccessTokenProvider accessTokenProvider = new GravitinoGCSCredentialProvider(); accessTokenProvider.setConf(configuration); - // Why is this check necessary?, if Gravitino fails to get any credentials, we fall back to + // Why is this check necessary?, if Gravitino fails to get any credentials, we fall back to // the default behavior of the GoogleHadoopFileSystem to use service account credentials. if (accessTokenProvider.getAccessToken() != null) { configuration.set( - "fs.gs.auth.access.token.provider.impl", GCSCredentialProvider.class.getName()); + "fs.gs.auth.access.token.provider.impl", + GravitinoGCSCredentialProvider.class.getName()); } } diff --git a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialProvider.java b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GravitinoGCSCredentialProvider.java similarity index 95% rename from bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialProvider.java rename to bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GravitinoGCSCredentialProvider.java index 90c7ce2b257..5c4d93b8997 100644 --- a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialProvider.java +++ b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GravitinoGCSCredentialProvider.java @@ -34,8 +34,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class GCSCredentialProvider implements AccessTokenProvider { - private static final Logger LOGGER = LoggerFactory.getLogger(GCSCredentialProvider.class); +public class GravitinoGCSCredentialProvider implements AccessTokenProvider { + private static final Logger LOGGER = + LoggerFactory.getLogger(GravitinoGCSCredentialProvider.class); private Configuration configuration; private GravitinoClient client; private String filesetIdentifier; From c4fb29a2f3589a998b6a8d8436202618340a1f72 Mon Sep 17 00:00:00 2001 From: yuqi Date: Thu, 2 Jan 2025 12:54:12 +0800 Subject: [PATCH 17/59] fix --- docs/cloud-storage-fileset-example.md | 678 ++++++++++++++++++++++++++ docs/hadoop-catalog.md | 14 +- docs/how-to-use-gvfs.md | 74 +-- 3 files changed, 721 insertions(+), 45 deletions(-) create mode 100644 docs/cloud-storage-fileset-example.md diff --git a/docs/cloud-storage-fileset-example.md b/docs/cloud-storage-fileset-example.md new file mode 100644 index 00000000000..17d6d24ff8c --- /dev/null +++ b/docs/cloud-storage-fileset-example.md @@ -0,0 +1,678 @@ +--- +title: "How to use cloud storage fileset" +slug: /how-to-use-cloud-storage-fileset +keyword: fileset S3 GCS ADLS OSS +license: "This software is licensed under the Apache License version 2." 
+---
+
+This document provides a comprehensive guide on how to use cloud storage filesets created by Gravitino. It contains the following sections:
+
+## Necessary steps in Gravitino server
+
+### Start up Gravitino server
+
+Before running the Gravitino server, you need to put the following jars into the fileset catalog classpath located at `${GRAVITINO_HOME}/catalogs/hadoop/libs`. For example, if you are using S3, you need to put `gravitino-aws-hadoop-bundle-{gravitino-version}.jar` into the fileset catalog classpath in `${GRAVITINO_HOME}/catalogs/hadoop/libs`.
+
+| Storage type | Description | Jar file | Since Version |
+|--------------|-------------|----------|---------------|
+| Local | The local file system. | (none) | 0.5.0 |
+| HDFS | HDFS file system. | (none) | 0.5.0 |
+| S3 | AWS S3. | [gravitino-aws-hadoop-bundle](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-hadoop-aws-bundle) | 0.8.0-incubating |
+| GCS | Google Cloud Storage. | [gravitino-gcp-hadoop-bundle](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-hadoop-gcp-bundle) | 0.8.0-incubating |
+| OSS | Aliyun OSS. | [gravitino-aliyun-hadoop-bundle](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-hadoop-aliyun-bundle) | 0.8.0-incubating |
+| ABS | Azure Blob Storage (ABS) or Azure Data Lake Storage (v2). | [gravitino-azure-hadoop-bundle](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-hadoop-azure-bundle) | 0.8.0-incubating |
+
+After adding the jars into the fileset catalog classpath, you can start up the Gravitino server by running the following command:
+
+```shell
+cd ${GRAVITINO_HOME}
+bin/gravitino.sh start
+```
+
+### Bundle jars
+
+Gravitino bundle jars are the jars used to access cloud storage. They are divided into two categories:
+
+- `gravitino-${aws,gcp,aliyun,azure}-bundle-{gravitino-version}.jar` are the jars that contain all the necessary dependencies to access the corresponding cloud storage. For instance, `gravitino-aws-bundle-${gravitino-version}.jar` contains all the necessary classes, including `hadoop-common` (hadoop-3.3.1) and `hadoop-aws`, to access S3 storage. They are used when there is no Hadoop environment in the runtime.
+
+- If there is already a Hadoop environment in the runtime, you can use `gravitino-${aws,gcp,aliyun,azure}-${gravitino-version}.jar`, which contains neither the cloud storage classes (like `hadoop-aws`) nor the Hadoop environment. Alternatively, you can manually add the necessary jars to the classpath. A minimal client sketch that builds on these jars is shown below.
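+
+With one of these jars on the classpath, a GVFS client needs only a handful of Hadoop options to access a fileset. The following is a minimal, illustrative sketch: the `fs.gvfs` implementation keys follow the GVFS conventions used in this documentation, and the server URI, metalake, catalog, schema, and fileset names are placeholders.
+
+```java
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+public class GvfsQuickStart {
+  public static void main(String[] args) throws Exception {
+    Configuration conf = new Configuration();
+    // Wire in the Gravitino Virtual File System implementation.
+    conf.set("fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem");
+    conf.set("fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs");
+    // Point the client at the Gravitino server and metalake.
+    conf.set("fs.gravitino.server.uri", "http://localhost:8090");
+    conf.set("fs.gravitino.client.metalake", "metalake");
+
+    // Virtual paths follow gvfs://fileset/{catalog}/{schema}/{fileset}/sub/path.
+    Path path = new Path("gvfs://fileset/catalog/schema/example_fileset/test.txt");
+    try (FileSystem gvfs = path.getFileSystem(conf)) {
+      gvfs.create(path).close();
+      System.out.println(gvfs.getFileStatus(path));
+    }
+  }
+}
+```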
+
+The following table demonstrates which jars are necessary for different cloud storage filesets:
+
+| Hadoop runtime version | S3 | GCS | OSS | ABS |
+|------------------------|----|-----|-----|-----|
+| No Hadoop environment | `gravitino-aws-bundle-${gravitino-version}.jar` | `gravitino-gcp-bundle-${gravitino-version}.jar` | `gravitino-aliyun-bundle-${gravitino-version}.jar` | `gravitino-azure-bundle-${gravitino-version}.jar` |
+| 2.x, 3.x | `gravitino-aws-${gravitino-version}.jar`, `hadoop-aws-${hadoop-version}.jar`, `aws-sdk-java-${version}` and other necessary dependencies | `gravitino-gcp-${gravitino-version}.jar`, `gcs-connector-${hadoop-version}.jar` and other necessary dependencies | `gravitino-aliyun-${gravitino-version}.jar`, `hadoop-aliyun-${hadoop-version}.jar`, `aliyun-sdk-java-${version}` and other necessary dependencies | `gravitino-azure-${gravitino-version}.jar`, `hadoop-azure-${hadoop-version}.jar`, and other necessary dependencies |
+
+You can get `hadoop-aws-${hadoop-version}.jar`, `hadoop-azure-${hadoop-version}.jar`, `hadoop-aliyun-${hadoop-version}.jar` and their related dependencies from the `${HADOOP_HOME}/share/hadoop/tools/lib/` directory.
+For `gcs-connector`, you can download it from the [GCS connector](https://github.com/GoogleCloudDataproc/hadoop-connectors/releases) releases for hadoop2 or hadoop3.
+
+If you still encounter issues, please report them to the Gravitino community by creating an issue.
+
+## Create fileset catalogs
+
+Once the Gravitino server is started, you can create the corresponding fileset catalogs with the following commands:
+
+### Create an S3 fileset catalog
+
+```shell
+curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \
+-H "Content-Type: application/json" -d '{
+  "name": "catalog",
+  "type": "FILESET",
+  "comment": "comment",
+  "provider": "hadoop",
+  "properties": {
+    "location": "s3a://bucket/root",
+    "s3-access-key-id": "access_key",
+    "s3-secret-access-key": "secret_key",
+    "s3-endpoint": "http://s3.ap-northeast-1.amazonaws.com",
+    "filesystem-providers": "s3"
+  }
+}' http://localhost:8090/api/metalakes/metalake/catalogs
+```
+
+```java
+GravitinoClient gravitinoClient = GravitinoClient
+    .builder("http://localhost:8090")
+    .withMetalake("metalake")
+    .build();
+
+Map<String, String> s3Properties = ImmutableMap.<String, String>builder()
+    .put("location", "s3a://bucket/root")
+    .put("s3-access-key-id", "access_key")
+    .put("s3-secret-access-key", "secret_key")
+    .put("s3-endpoint", "http://s3.ap-northeast-1.amazonaws.com")
+    .put("filesystem-providers", "s3")
+    .build();
+
+Catalog s3Catalog = gravitinoClient.createCatalog("catalog",
+    Type.FILESET,
+    "hadoop", // provider, Gravitino only supports "hadoop" for now.
+    "This is an S3 fileset catalog",
+    s3Properties);
+// ...
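+// Note (illustrative, based on the dynamic credential support in this patch
+// series): when the Gravitino server can vend temporary S3 tokens for this
+// catalog, GVFS clients obtain short-lived credentials through
+// GravitinoS3CredentialProvider instead of relying on the static key pair above.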
```

```python
gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake")
s3_properties = {
    "location": "s3a://bucket/root",
    "s3-access-key-id": "access_key",
    "s3-secret-access-key": "secret_key",
    "s3-endpoint": "http://s3.ap-northeast-1.amazonaws.com",
    "filesystem-providers": "s3"
}

s3_catalog = gravitino_client.create_catalog(name="catalog",
                                             type=Catalog.Type.FILESET,
                                             provider="hadoop",
                                             comment="This is an S3 fileset catalog",
                                             properties=s3_properties)
```

:::note
The value of `location` should always start with `s3a`, NOT `s3`, for AWS S3, for instance, `s3a://bucket/root`. A value like `s3://bucket/root` is not supported due to the limitation of the hadoop-aws library.
:::

### Create a GCS fileset catalog

```shell
curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \
-H "Content-Type: application/json" -d '{
  "name": "catalog",
  "type": "FILESET",
  "comment": "comment",
  "provider": "hadoop",
  "properties": {
    "location": "gs://bucket/root",
    "gcs-service-account-file": "path_of_gcs_service_account_file",
    "filesystem-providers": "gcs"
  }
}' http://localhost:8090/api/metalakes/metalake/catalogs
```

```java
GravitinoClient gravitinoClient = GravitinoClient
    .builder("http://localhost:8090")
    .withMetalake("metalake")
    .build();

Map<String, String> gcsProperties = ImmutableMap.<String, String>builder()
    .put("location", "gs://bucket/root")
    .put("gcs-service-account-file", "path_of_gcs_service_account_file")
    .put("filesystem-providers", "gcs")
    .build();

Catalog gcsCatalog = gravitinoClient.createCatalog("catalog",
    Type.FILESET,
    "hadoop", // provider, Gravitino only supports "hadoop" for now.
    "This is a GCS fileset catalog",
    gcsProperties);
// ...

```

```python
gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake")

gcs_properties = {
    "location": "gs://bucket/root",
    "gcs-service-account-file": "path_of_gcs_service_account_file",
    "filesystem-providers": "gcs"
}

gcs_catalog = gravitino_client.create_catalog(name="catalog",
                                              type=Catalog.Type.FILESET,
                                              provider="hadoop",
                                              comment="This is a GCS fileset catalog",
                                              properties=gcs_properties)
```

:::note
The prefix of a GCS location should always start with `gs`, for instance, `gs://bucket/root`.
:::

### Create an OSS fileset catalog

```shell
curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \
-H "Content-Type: application/json" -d '{
  "name": "catalog",
  "type": "FILESET",
  "comment": "comment",
  "provider": "hadoop",
  "properties": {
    "location": "oss://bucket/root",
    "oss-access-key-id": "access_key",
    "oss-secret-access-key": "secret_key",
    "oss-endpoint": "http://oss-cn-hangzhou.aliyuncs.com",
    "filesystem-providers": "oss"
  }
}' http://localhost:8090/api/metalakes/metalake/catalogs
```

```java
GravitinoClient gravitinoClient = GravitinoClient
    .builder("http://localhost:8090")
    .withMetalake("metalake")
    .build();

Map<String, String> ossProperties = ImmutableMap.<String, String>builder()
    .put("location", "oss://bucket/root")
    .put("oss-access-key-id", "access_key")
    .put("oss-secret-access-key", "secret_key")
    .put("oss-endpoint", "http://oss-cn-hangzhou.aliyuncs.com")
    .put("filesystem-providers", "oss")
    .build();

Catalog ossCatalog = gravitinoClient.createCatalog("catalog",
    Type.FILESET,
    "hadoop", // provider, Gravitino only supports "hadoop" for now.
    "This is an OSS fileset catalog",
    ossProperties);
// ...
```

```python
gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake")
oss_properties = {
    "location": "oss://bucket/root",
    "oss-access-key-id": "access_key",
    "oss-secret-access-key": "secret_key",
    "oss-endpoint": "http://oss-cn-hangzhou.aliyuncs.com",
    "filesystem-providers": "oss"
}

oss_catalog = gravitino_client.create_catalog(name="catalog",
                                              type=Catalog.Type.FILESET,
                                              provider="hadoop",
                                              comment="This is an OSS fileset catalog",
                                              properties=oss_properties)
```

### Create an ABS (Azure Blob Storage or ADLS) fileset catalog

```shell
curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \
-H "Content-Type: application/json" -d '{
  "name": "catalog",
  "type": "FILESET",
  "comment": "comment",
  "provider": "hadoop",
  "properties": {
    "location": "abfss://container/root",
    "azure-storage-account-name": "The account name of the Azure Blob Storage",
    "azure-storage-account-key": "The account key of the Azure Blob Storage",
    "filesystem-providers": "abs"
  }
}' http://localhost:8090/api/metalakes/metalake/catalogs
```

```java
GravitinoClient gravitinoClient = GravitinoClient
    .builder("http://localhost:8090")
    .withMetalake("metalake")
    .build();

Map<String, String> absProperties = ImmutableMap.<String, String>builder()
    .put("location", "abfss://container/root")
    .put("azure-storage-account-name", "The account name of the Azure Blob Storage")
    .put("azure-storage-account-key", "The account key of the Azure Blob Storage")
    .put("filesystem-providers", "abs")
    .build();

Catalog absCatalog = gravitinoClient.createCatalog("catalog",
    Type.FILESET,
    "hadoop", // provider, Gravitino only supports "hadoop" for now.
    "This is an Azure Blob Storage fileset catalog",
    absProperties);
// ...

```

```python
gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake")

abs_properties = {
    "location": "abfss://container/root",
    "azure-storage-account-name": "The account name of the Azure Blob Storage",
    "azure-storage-account-key": "The account key of the Azure Blob Storage",
    "filesystem-providers": "abs"
}

abs_catalog = gravitino_client.create_catalog(name="catalog",
                                              type=Catalog.Type.FILESET,
                                              provider="hadoop",
                                              comment="This is an Azure Blob Storage fileset catalog",
                                              properties=abs_properties)
```

:::note
The prefix of an ABS (Azure Blob Storage or ADLS (v2)) location should always start with `abfss`, NOT `abfs`, for instance, `abfss://container/root`. A value like `abfs://container/root` is not supported.
:::

## Create fileset schema

This step is the same for all cloud storage filesets. You can create a schema with the following commands:

```shell
curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \
-H "Content-Type: application/json" -d '{
  "name": "schema",
  "comment": "comment",
  "properties": {
    "location": "file:///tmp/root/schema"
  }
}' http://localhost:8090/api/metalakes/metalake/catalogs/catalog/schemas
```

```java
GravitinoClient gravitinoClient = GravitinoClient
    .builder("http://localhost:8090")
    .withMetalake("metalake")
    .build();

// Assuming you have just created a Hadoop catalog named `catalog`
Catalog catalog = gravitinoClient.loadCatalog("catalog");

SupportsSchemas supportsSchemas = catalog.asSchemas();

Map<String, String> schemaProperties = ImmutableMap.<String, String>builder()
    // Property "location" is optional; if specified, all managed filesets without
    // a specified storage location will be stored under this location.
    .put("location", "file:///tmp/root/schema")
    .build();
Schema schema = supportsSchemas.createSchema("schema",
    "This is a schema",
    schemaProperties
);
// ...
```

You can change the value of the `location` property according to the catalog you are using. Moreover, if the `location` property is set in the catalog, you can omit it in the schema.

## Create filesets

The following commands can be used to create a fileset in the schema:

```shell
curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \
-H "Content-Type: application/json" -d '{
  "name": "example_fileset",
  "comment": "This is an example fileset",
  "type": "MANAGED",
  "storageLocation": "s3a://bucket/root/schema/example_fileset",
  "properties": {
    "k1": "v1"
  }
}' http://localhost:8090/api/metalakes/metalake/catalogs/catalog/schemas/schema/filesets
```

```java
GravitinoClient gravitinoClient = GravitinoClient
    .builder("http://localhost:8090")
    .withMetalake("metalake")
    .build();

Catalog catalog = gravitinoClient.loadCatalog("catalog");
FilesetCatalog filesetCatalog = catalog.asFilesetCatalog();

Map<String, String> propertiesMap = ImmutableMap.<String, String>builder()
    .put("k1", "v1")
    .build();

filesetCatalog.createFileset(
    NameIdentifier.of("schema", "example_fileset"),
    "This is an example fileset",
    Fileset.Type.MANAGED,
    "s3a://bucket/root/schema/example_fileset",
    propertiesMap);
```

```python
gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake")

catalog: Catalog = gravitino_client.load_catalog(name="catalog")
catalog.as_fileset_catalog().create_fileset(ident=NameIdentifier.of("schema", "example_fileset"),
                                            type=Fileset.Type.MANAGED,
                                            comment="This is an example fileset",
                                            storage_location="s3a://bucket/root/schema/example_fileset",
                                            properties={"k1": "v1"})
```

Similar to the schema, the `storageLocation` is optional if you have set the `location` property in the schema or the catalog. Please change its value to the actual location where you want to store the fileset.

The example above is for an S3 fileset. For a GCS, OSS, or ABS fileset, replace the `storageLocation` with the actual location of that storage, as illustrated right below.
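For reference, assuming the same bucket and container names used in the catalog examples above, the corresponding `storageLocation` values for the other storage types would look like:

```
gs://bucket/root/schema/example_fileset
oss://bucket/root/schema/example_fileset
abfss://container/root/schema/example_fileset
```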
## Using Spark to access the fileset

The following code snippet shows how to use **PySpark 3.1.3 with a Hadoop environment (Hadoop 3.2.0)** to access the fileset:

```python
import logging
from gravitino import NameIdentifier, GravitinoClient, Catalog, Fileset, GravitinoAdminClient
from pyspark.sql import SparkSession
import os

gravitino_url = "http://localhost:8090"
metalake_name = "test"

catalog_name = "s3_catalog"
schema_name = "schema"
fileset_name = "example"

# This is for S3
os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-aws-{gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-{gravitino-version}-SNAPSHOT.jar,/path/to/hadoop-aws-3.2.0.jar,/path/to/aws-java-sdk-bundle-1.11.375.jar --master local[1] pyspark-shell"
spark = SparkSession.builder \
    .appName("s3_fileset_test") \
    .config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs") \
    .config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem") \
    .config("spark.hadoop.fs.gravitino.server.uri", gravitino_url) \
    .config("spark.hadoop.fs.gravitino.client.metalake", metalake_name) \
    .config("spark.hadoop.s3-access-key-id", os.environ["S3_ACCESS_KEY_ID"]) \
    .config("spark.hadoop.s3-secret-access-key", os.environ["S3_SECRET_ACCESS_KEY"]) \
    .config("spark.hadoop.s3-endpoint", "http://s3.ap-northeast-1.amazonaws.com") \
    .config("spark.driver.memory", "2g") \
    .config("spark.driver.port", "2048") \
    .getOrCreate()

# This is for GCS
os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-gcp-{gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-{gravitino-version}.jar,/path/to/gcs-connector-hadoop3-2.2.22-shaded.jar --master local[1] pyspark-shell"
spark = SparkSession.builder \
    .appName("gcs_fileset_test") \
    .config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs") \
    .config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem") \
    .config("spark.hadoop.fs.gravitino.server.uri", gravitino_url) \
    .config("spark.hadoop.fs.gravitino.client.metalake", metalake_name) \
    .config("spark.hadoop.gcs-service-account-file", "/path/to/gcs-service-account-file.json") \
    .config("spark.driver.memory", "2g") \
    .config("spark.driver.port", "2048") \
    .getOrCreate()

# This is for OSS
os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-aliyun-{gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-{gravitino-version}.jar,/path/to/aliyun-sdk-oss-2.8.3.jar,/path/to/hadoop-aliyun-3.2.0.jar,/path/to/jdom-1.1.jar --master local[1] pyspark-shell"
spark = SparkSession.builder \
    .appName("oss_fileset_test") \
    .config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs") \
    .config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem") \
    .config("spark.hadoop.fs.gravitino.server.uri", gravitino_url) \
    .config("spark.hadoop.fs.gravitino.client.metalake", metalake_name) \
    .config("spark.hadoop.oss-access-key-id", os.environ["OSS_ACCESS_KEY_ID"]) \
    .config("spark.hadoop.oss-secret-access-key", os.environ["OSS_SECRET_ACCESS_KEY"]) \
    .config("spark.hadoop.oss-endpoint", "https://oss-cn-shanghai.aliyuncs.com") \
    .config("spark.driver.memory", "2g") \
    .config("spark.driver.port", "2048") \
    .getOrCreate()
spark.sparkContext.setLogLevel("DEBUG")

# This is for ABS
os.environ["PYSPARK_SUBMIT_ARGS"] = \
"--jars /path/to/gravitino-azure-{gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-{gravitino-version}.jar,/path/to/hadoop-azure-3.2.0.jar,/path/to/azure-storage-7.0.0.jar,/path/to/wildfly-openssl-1.0.4.Final.jar --master local[1] pyspark-shell" +spark = SparkSession.builder + .appName("s3_fielset_test") + .config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs") + .config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem") + .config("spark.hadoop.fs.gravitino.server.uri", "http://localhost:8090") + .config("spark.hadoop.fs.gravitino.client.metalake", "test") + .config("spark.hadoop.azure-storage-account-name", "azure_account_name") + .config("spark.hadoop.azure-storage-account-key", "azure_account_name") + .config("spark.hadoop.fs.azure.skipUserGroupMetadataDuringInitialization", "true") + .config("spark.driver.memory", "2g") + .config("spark.driver.port", "2048") + .getOrCreate() + +data = [("Alice", 25), ("Bob", 30), ("Cathy", 45)] +columns = ["Name", "Age"] +spark_df = spark.createDataFrame(data, schema=columns) +gvfs_path = f"gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/people" + +spark_df.coalesce(1).write + .mode("overwrite") + .option("header", "true") + .csv(gvfs_path) + +``` + +If your Spark without Hadoop environment, you can use the following code snippet to access the fileset: + +```python +## replace the env PYSPARK_SUBMIT_ARGS variable in the code above with the following content: +### S3 +os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-aws-bundle-{gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-{gravitino-version}.jar --master local[1] pyspark-shell" +### GCS +os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-gcp-bundle-{gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-{gravitino-version}.jar, --master local[1] pyspark-shell" +### OSS +os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-aliyun-bundle-{gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-{gravitino-version}.jar, --master local[1] pyspark-shell" +#### Azure Blob Storage +os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-azure-bundle-{gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-{gravitino-version}.jar --master local[1] pyspark-shell" +``` + +:::note +**In some Spark version, Hadoop environment is needed by the driver, adding the bundle jars with '--jars' may not work, in this case, you should add the jars to the spark classpath directly.** +::: + +## Using fileset with hadoop fs command + +The following are examples of how to use the `hadoop fs` command to access the fileset in Hadoop 3.1.3. + +1. 
Add the following content to the `${HADOOP_HOME}/etc/hadoop/core-site.xml` file:

```xml
<configuration>
  <property>
    <name>fs.AbstractFileSystem.gvfs.impl</name>
    <value>org.apache.gravitino.filesystem.hadoop.Gvfs</value>
  </property>

  <property>
    <name>fs.gvfs.impl</name>
    <value>org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem</value>
  </property>

  <property>
    <name>fs.gravitino.server.uri</name>
    <value>http://192.168.50.188:8090</value>
  </property>

  <property>
    <name>fs.gravitino.client.metalake</name>
    <value>test</value>
  </property>

  <!-- The following properties are only needed for S3 filesets. -->
  <property>
    <name>s3-endpoint</name>
    <value>http://s3.ap-northeast-1.amazonaws.com</value>
  </property>
  <property>
    <name>s3-access-key-id</name>
    <value>access-key</value>
  </property>
  <property>
    <name>s3-secret-access-key</name>
    <value>secret-key</value>
  </property>

  <!-- The following properties are only needed for OSS filesets. -->
  <property>
    <name>oss-endpoint</name>
    <value>https://oss-cn-shanghai.aliyuncs.com</value>
  </property>
  <property>
    <name>oss-access-key-id</name>
    <value>access_key</value>
  </property>
  <property>
    <name>oss-secret-access-key</name>
    <value>secret_key</value>
  </property>

  <!-- The following property is only needed for GCS filesets. -->
  <property>
    <name>gcs-service-account-file</name>
    <value>/path/your-service-account-file.json</value>
  </property>

  <!-- The following properties are only needed for Azure Blob Storage filesets. -->
  <property>
    <name>azure-storage-account-name</name>
    <value>account_name</value>
  </property>
  <property>
    <name>azure-storage-account-key</name>
    <value>account_key</value>
  </property>
</configuration>
```

2. Copy the necessary jars to the `${HADOOP_HOME}/share/hadoop/common/lib` directory.

For example, if you are using S3, copy `gravitino-aws-{version}.jar` to the `${HADOOP_HOME}/share/hadoop/common/lib` directory, then copy `hadoop-aws-{version}.jar` and its related dependencies, which can be found in the `${HADOOP_HOME}/share/hadoop/tools/lib/` directory, to the same place. For simplicity, you can copy all the jars in `${HADOOP_HOME}/share/hadoop/tools/lib/` to `${HADOOP_HOME}/share/hadoop/common/lib`.

For more details, please refer to the [Bundle jars](#bundle-jars) section.

3. Run the following command to access the fileset:

```shell
hadoop fs -ls gvfs://fileset/s3_catalog/schema/example
hadoop fs -put /path/to/local/file gvfs://fileset/s3_catalog/schema/example
```

## Using fileset with pandas

The following is an example of how to use the pandas library to access the S3 fileset:

```python
import pandas as pd

storage_options = {
    "server_uri": "http://localhost:8090",
    "metalake_name": "test",
    "options": {
        "s3_access_key_id": "access_key",
        "s3_secret_access_key": "secret_key",
        "s3_endpoint": "http://s3.ap-northeast-1.amazonaws.com"
    }
}
ds = pd.read_csv(f"gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/people/part-00000-51d366e2-d5eb-448d-9109-32a96c8a14dc-c000.csv",
                 storage_options=storage_options)
ds.head()
```


diff --git a/docs/hadoop-catalog.md b/docs/hadoop-catalog.md
index 9048556ffa5..cf86fde06e4 100644
--- a/docs/hadoop-catalog.md
+++ b/docs/hadoop-catalog.md
@@ -9,9 +9,9 @@ license: "This software is licensed under the Apache License version 2."
 ## Introduction
 
 Hadoop catalog is a fileset catalog that using Hadoop Compatible File System (HCFS) to manage
-the storage location of the fileset. Currently, it supports local filesystem and HDFS. For
-object storage like S3, GCS, Azure Blob Storage and OSS, you can put the hadoop object store jar like
-`gravitino-aws-bundle-{gravitino-version}.jar` into the `$GRAVITINO_HOME/catalogs/hadoop/libs` directory to enable the support.
+the storage location of the fileset. Currently, it supports the local filesystem and HDFS. Since 0.7.0-incubating, Gravitino supports S3, GCS, OSS and Azure Blob Storage fileset through Hadoop catalog.
+
+The rest of this document will use HDFS or local file as an example to illustrate how to use the Hadoop catalog. For S3, GCS, OSS and Azure Blob Storage, the configuration is similar to HDFS, but more properties need to be set.
We will use [separate sections](./cloud-storage-fileset-example.md) to introduce how to use of S3, GCS, OSS and Azure Blob Storage. Note that Gravitino uses Hadoop 3 dependencies to build Hadoop catalog. Theoretically, it should be compatible with both Hadoop 2.x and 3.x, since Gravitino doesn't leverage any new features in @@ -50,8 +50,6 @@ Apart from the above properties, to access fileset like HDFS, S3, GCS, OSS or cu | `s3-access-key-id` | The access key of the AWS S3. | (none) | Yes if it's a S3 fileset. | 0.7.0-incubating | | `s3-secret-access-key` | The secret key of the AWS S3. | (none) | Yes if it's a S3 fileset. | 0.7.0-incubating | -At the same time, you need to place the corresponding bundle jar [`gravitino-aws-bundle-${version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-aws-bundle/) in the directory `${GRAVITINO_HOME}/catalogs/hadoop/libs`. - #### GCS fileset | Configuration item | Description | Default value | Required | Since version | @@ -60,8 +58,6 @@ At the same time, you need to place the corresponding bundle jar [`gravitino-aws | `default-filesystem-provider` | The name default filesystem providers of this Hadoop catalog if users do not specify the scheme in the URI. Default value is `builtin-local`, for GCS, if we set this value, we can omit the prefix 'gs://' in the location. | `builtin-local` | No | 0.7.0-incubating | | `gcs-service-account-file` | The path of GCS service account JSON file. | (none) | Yes if it's a GCS fileset. | 0.7.0-incubating | -In the meantime, you need to place the corresponding bundle jar [`gravitino-gcp-bundle-${version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-gcp-bundle/) in the directory `${GRAVITINO_HOME}/catalogs/hadoop/libs`. - #### OSS fileset | Configuration item | Description | Default value | Required | Since version | @@ -72,9 +68,6 @@ In the meantime, you need to place the corresponding bundle jar [`gravitino-gcp- | `oss-access-key-id` | The access key of the Aliyun OSS. | (none) | Yes if it's a OSS fileset. | 0.7.0-incubating | | `oss-secret-access-key` | The secret key of the Aliyun OSS. | (none) | Yes if it's a OSS fileset. | 0.7.0-incubating | -In the meantime, you need to place the corresponding bundle jar [`gravitino-aliyun-bundle-${version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-aliyun-bundle/) in the directory `${GRAVITINO_HOME}/catalogs/hadoop/libs`. - - #### Azure Blob Storage fileset | Configuration item | Description | Default value | Required | Since version | @@ -84,7 +77,6 @@ In the meantime, you need to place the corresponding bundle jar [`gravitino-aliy | `azure-storage-account-name ` | The account name of Azure Blob Storage. | (none) | Yes if it's a Azure Blob Storage fileset. | 0.8.0-incubating | | `azure-storage-account-key` | The account key of Azure Blob Storage. | (none) | Yes if it's a Azure Blob Storage fileset. | 0.8.0-incubating | -Similar to the above, you need to place the corresponding bundle jar [`gravitino-azure-bundle-${version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-azure-bundle/) in the directory `${GRAVITINO_HOME}/catalogs/hadoop/libs`. :::note - Gravitino contains builtin file system providers for local file system(`builtin-local`) and HDFS(`builtin-hdfs`), that is to say if `filesystem-providers` is not set, Gravitino will still support local file system and HDFS. 
Apart from that, you can set the `filesystem-providers` to support other file systems like S3, GCS, OSS or custom file system. diff --git a/docs/how-to-use-gvfs.md b/docs/how-to-use-gvfs.md index 0dbfd867a3d..9f34f45d072 100644 --- a/docs/how-to-use-gvfs.md +++ b/docs/how-to-use-gvfs.md @@ -43,7 +43,7 @@ the path mapping and convert automatically. ### Prerequisites + A Hadoop environment with HDFS running. GVFS has been tested against - Hadoop 3.1.0. It is recommended to use Hadoop 3.1.0 or later, but it should work with Hadoop 2. + Hadoop 3.3.0. It is recommended to use Hadoop 3.3.0 or later, but it should work with Hadoop 2. x. Please create an [issue](https://www.github.com/apache/gravitino/issues) if you find any compatibility issues. @@ -71,51 +71,51 @@ Apart from the above properties, to access fileset like S3, GCS, OSS and custom #### S3 fileset -| Configuration item | Description | Default value | Required | Since version | -|--------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------|--------------------------|------------------| -| `s3-endpoint` | The endpoint of the AWS S3. | (none) | Yes if it's a S3 fileset.| 0.7.0-incubating | -| `s3-access-key-id` | The access key of the AWS S3. | (none) | Yes if it's a S3 fileset.| 0.7.0-incubating | -| `s3-secret-access-key` | The secret key of the AWS S3. | (none) | Yes if it's a S3 fileset.| 0.7.0-incubating | +| Configuration item | Description | Default value | Required | Since version | +|------------------------|-------------------------------|---------------|--------------------------|------------------| +| `s3-endpoint` | The endpoint of the AWS S3. | (none) | Yes if it's a S3 fileset.| 0.7.0-incubating | +| `s3-access-key-id` | The access key of the AWS S3. | (none) | Yes if it's a S3 fileset.| 0.7.0-incubating | +| `s3-secret-access-key` | The secret key of the AWS S3. | (none) | Yes if it's a S3 fileset.| 0.7.0-incubating | At the same time, you need to add the corresponding bundle jar -1. [`gravitino-aws-bundle-${version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-aws-bundle/) in the classpath if no hadoop environment is available, or -2. [`gravitino-aws-${version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-aws/) and hadoop-aws jar and other necessary dependencies in the classpath. +1. [`gravitino-aws-bundle-${gravitino-version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-aws-bundle/) in the classpath if no Hadoop environment is available, or +2. [`gravitino-aws-${gravitino-version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-aws/) and `hadoop-aws-${hadoop-version}.jar` and other necessary dependencies (They are usually located at `${HADOOP_HOME}/share/hadoop/tools/lib`) in the classpath. #### GCS fileset -| Configuration item | Description | Default value | Required | Since version | -|--------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------|---------------------------|------------------| -| `gcs-service-account-file` | The path of GCS service account JSON file. 
| (none) | Yes if it's a GCS fileset.| 0.7.0-incubating | +| Configuration item | Description | Default value | Required | Since version | +|----------------------------|--------------------------------------------|---------------|---------------------------|------------------| +| `gcs-service-account-file` | The path of GCS service account JSON file. | (none) | Yes if it's a GCS fileset.| 0.7.0-incubating | In the meantime, you need to add the corresponding bundle jar -1. [`gravitino-gcp-bundle-${version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-gcp-bundle/) in the classpath if no hadoop environment is available, or -2. or [`gravitino-gcp-${version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-gcp/) and [gcs-connector jar](https://github.com/GoogleCloudDataproc/hadoop-connectors/releases) and other necessary dependencies in the classpath. +1. [`gravitino-gcp-bundle-${gravitino-version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-gcp-bundle/) in the classpath if no hadoop environment is available, or +2. [`gravitino-gcp-${gravitino-version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-gcp/) and [gcs-connector jar](https://github.com/GoogleCloudDataproc/hadoop-connectors/releases) and other necessary dependencies in the classpath. #### OSS fileset -| Configuration item | Description | Default value | Required | Since version | -|---------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------|---------------------------|------------------| -| `oss-endpoint` | The endpoint of the Aliyun OSS. | (none) | Yes if it's a OSS fileset.| 0.7.0-incubating | -| `oss-access-key-id` | The access key of the Aliyun OSS. | (none) | Yes if it's a OSS fileset.| 0.7.0-incubating | -| `oss-secret-access-key` | The secret key of the Aliyun OSS. | (none) | Yes if it's a OSS fileset.| 0.7.0-incubating | +| Configuration item | Description | Default value | Required | Since version | +|-------------------------|-----------------------------------|---------------|---------------------------|------------------| +| `oss-endpoint` | The endpoint of the Aliyun OSS. | (none) | Yes if it's a OSS fileset.| 0.7.0-incubating | +| `oss-access-key-id` | The access key of the Aliyun OSS. | (none) | Yes if it's a OSS fileset.| 0.7.0-incubating | +| `oss-secret-access-key` | The secret key of the Aliyun OSS. | (none) | Yes if it's a OSS fileset.| 0.7.0-incubating | In the meantime, you need to place the corresponding bundle jar -1. [`gravitino-aliyun-bundle-${version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-aliyun-bundle/) in the classpath if no hadoop environment is available, or -2. [`gravitino-aliyun-${version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-aliyun/) and hadoop-aliyun jar and other necessary dependencies in the classpath. +1. [`gravitino-aliyun-bundle-${gravitino-version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-aliyun-bundle/) in the classpath if no hadoop environment is available, or +2. [`gravitino-aliyun-${gravitino-version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-aliyun/) and `hadoop-aliyun-${hadoop-version}.jar` and other necessary dependencies (They are usually located at `${HADOOP_HOME}/share/hadoop/tools/lib`) in the classpath. 
#### Azure Blob Storage fileset -| Configuration item | Description | Default value | Required | Since version | -|-----------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------|-------------------------------------------|------------------| -| `azure-storage-account-name` | The account name of Azure Blob Storage. | (none) | Yes if it's a Azure Blob Storage fileset. | 0.8.0-incubating | -| `azure-storage-account-key` | The account key of Azure Blob Storage. | (none) | Yes if it's a Azure Blob Storage fileset. | 0.8.0-incubating | +| Configuration item | Description | Default value | Required | Since version | +|------------------------------|-----------------------------------------|---------------|-------------------------------------------|------------------| +| `azure-storage-account-name` | The account name of Azure Blob Storage. | (none) | Yes if it's a Azure Blob Storage fileset. | 0.8.0-incubating | +| `azure-storage-account-key` | The account key of Azure Blob Storage. | (none) | Yes if it's a Azure Blob Storage fileset. | 0.8.0-incubating | Similar to the above, you need to place the corresponding bundle jar -1. [`gravitino-azure-bundle-${version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-azure-bundle/) in the classpath if no hadoop environment is available, or -2. [`gravitino-azure-${version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-azure/) and hadoop-azure jar and other necessary dependencies in the classpath. +1. [`gravitino-azure-bundle-${gravitino-version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-azure-bundle/) in the classpath if no hadoop environment is available, or +2. [`gravitino-azure-${gravitino-version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-azure/) and `hadoop-azure-${hadoop-version}.jar` and other necessary dependencies (They are usually located at `${HADOOP_HOME}/share/hadoop/tools/lib) in the classpath. #### Custom fileset Since 0.7.0-incubating, users can define their own fileset type and configure the corresponding properties, for more, please refer to [Custom Fileset](./hadoop-catalog.md#how-to-custom-your-own-hcfs-file-system-fileset). @@ -146,13 +146,8 @@ You can configure these properties in two ways: ``` :::note -If you want to access the S3, GCS, OSS or custom fileset through GVFS, apart from the above properties, you need to place the corresponding bundle jars in the Hadoop environment. -For example, if you want to access the S3 fileset, you need to place -1. The aws hadoop bundle jar [`gravitino-aws-bundle-${gravitino-version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-aws-bundle/) -2. or [`gravitino-aws-${gravitino-version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-aws/), and hadoop-aws jar and other necessary dependencies - -to the classpath, it typically locates in `${HADOOP_HOME}/share/hadoop/common/lib/`). - +If you want to access the S3, GCS, OSS or custom fileset through GVFS, apart from the above properties, you need to place the corresponding bundle jars in the Hadoop environment, For bundles jar and +cloud storage fileset configuration example, please refer to [cloud storage fileset example](./cloud-storage-fileset-example.md). ::: 2. 
Configure the properties in the `core-site.xml` file of the Hadoop environment: @@ -209,6 +204,10 @@ two ways: ```shell ./gradlew :clients:filesystem-hadoop3-runtime:build -x test ``` +:::note +For cloud storage fileset, some extra steps should be added, please refer to [cloud storage fileset example](./cloud-storage-fileset-example.md). +::: + #### Via Hadoop shell command @@ -226,7 +225,6 @@ cp gravitino-filesystem-hadoop3-runtime-{version}.jar ${HADOOP_HOME}/share/hadoo # You need to ensure that the Kerberos has permission on the HDFS directory. kinit -kt your_kerberos.keytab your_kerberos@xxx.com - # 4. Copy other dependencies to the Hadoop environment if you want to access the S3 fileset via GVFS cp bundles/aws-bundle/build/libs/gravitino-aws-bundle-{version}.jar ${HADOOP_HOME}/share/hadoop/common/lib/ cp clients/filesystem-hadoop3-runtime/build/libs/gravitino-filesystem-hadoop3-runtime-{version}-SNAPSHOT.jar ${HADOOP_HOME}/share/hadoop/common/lib/ @@ -236,6 +234,8 @@ cp ${HADOOP_HOME}/share/hadoop/tools/lib/* ${HADOOP_HOME}/share/hadoop/common/li ./${HADOOP_HOME}/bin/hadoop dfs -ls gvfs://fileset/test_catalog/test_schema/test_fileset_1 ``` +Full example to access S3, GCS, OSS fileset via Hadoop shell command, please refer to [cloud storage fileset example](./cloud-storage-fileset-example.md). + #### Via Java code You can also perform operations on the files or directories managed by fileset through Java code. @@ -285,6 +285,9 @@ FileSystem fs = filesetPath.getFileSystem(conf); fs.getFileStatus(filesetPath); ``` +Full example to access S3, GCS, OSS fileset via Hadoop shell command, please refer to [cloud storage fileset example](./cloud-storage-fileset-example.md). + + #### Via Apache Spark 1. Add the GVFS runtime jar to the Spark environment. @@ -324,6 +327,7 @@ fs.getFileStatus(filesetPath); rdd.foreach(println) ``` +Full example to access S3, GCS, OSS fileset via Spark, please refer to [cloud storage fileset example](./cloud-storage-fileset-example.md). #### Via Tensorflow @@ -521,6 +525,8 @@ options = { fs = gvfs.GravitinoVirtualFileSystem(server_uri="http://localhost:8090", metalake_name="test_metalake", options=options) ``` +Full Python example to access S3, GCS, OSS fileset via GVFS, please refer to [cloud storage fileset example](./cloud-storage-fileset-example.md). + :::note Gravitino python client does not support customized filesets defined by users due to the limit of `fsspec` library. From baf42e19dd4a13dd571dd7c41452b639f5074391 Mon Sep 17 00:00:00 2001 From: yuqi Date: Fri, 3 Jan 2025 10:19:43 +0800 Subject: [PATCH 18/59] fix --- docs/cloud-storage-fileset-example.md | 29 +++++++++++++++++---------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/docs/cloud-storage-fileset-example.md b/docs/cloud-storage-fileset-example.md index 17d6d24ff8c..71e76c73a6d 100644 --- a/docs/cloud-storage-fileset-example.md +++ b/docs/cloud-storage-fileset-example.md @@ -5,7 +5,7 @@ keyword: fileset S3 GCS ADLS OSS license: "This software is licensed under the Apache License version 2." --- -This document aims to provide a comprehensive guide on how to use cloud storage fileset created by Gravitino, it usually contains the following sections: +This document aims to provide a comprehensive guide on how to use cloud storage fileset created by Gravitino, it usually contains the following sections. 
## Necessary steps in Gravitino server @@ -31,24 +31,31 @@ bin/gravitino.sh start ### Bundle jars -Gravitino bundles jars are jars that are used to access the cloud storage, they are divided into two categories: +Gravitino bundles jars are used to access the cloud storage. They are divided into two categories: - `gravitino-${aws,gcp,aliyun,azure}-bundle-{gravitino-version}.jar` are the jars that contain all the necessary dependencies to access the corresponding cloud storages. For instance, `gravitino-aws-bundle-${gravitino-version}.jar` contains the all necessary classes including `hadoop-common`(hadoop-3.3.1) and `hadoop-aws` to access the S3 storage. They are used in the scenario where there is no hadoop environment in the runtime. - If there is already hadoop environment in the runtime, you can use the `gravitino-${aws,gcp,aliyun,azure}-${gravitino-version}.jar` that does not contain the cloud storage classes (like hadoop-aws) and hadoop environment. Alternatively, you can manually add the necessary jars to the classpath. -The following table demonstrates which jars are necessary for different cloud storage filesets: +If the Hadoop environment is available, you can use the following jars to access the cloud storage fileset: -| Hadoop runtime version | S3 | GCS | OSS | ABS | -|------------------------|------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------| -| No Hadoop environment | `gravitino-aws-bundle-${gravitino-version}.jar` | `gravitino-gcp-bundle-${gravitino-version}.jar` | `gravitino-aliyun-bundle-${gravitino-version}.jar` | `gravitino-azure-bundle-${gravitino-version}.jar` | -| 2.x, 3.x | `gravitino-aws-${gravitino-version}.jar`, `hadoop-aws-${hadoop-version}.jar`, `aws-sdk-java-${version}` and other necessary dependencies | `gravitino-gcp-{gravitino-version}.jar`, `gcs-connector-${hadoop-version}`.jar, other necessary dependencies | `gravitino-aliyun-{gravitino-version}.jar`, hadoop-aliyun-{hadoop-version}.jar, aliyun-sdk-java-{version} and other necessary dependencies | `gravitino-azure-${gravitino-version}.jar`, `hadoop-azure-${hadoop-version}.jar`, and other necessary dependencies | +- S3: `gravitino-aws-${gravitino-version}.jar`, `hadoop-aws-${hadoop-version}.jar`, `aws-sdk-java-${version}` and other necessary dependencies +- GCS: `gravitino-gcp-{gravitino-version}.jar`, `gcs-connector-${hadoop-version}`.jar, other necessary dependencies +- OSS: `gravitino-aliyun-{gravitino-version}.jar`, hadoop-aliyun-{hadoop-version}.jar, aliyun-sdk-java-{version} and other necessary dependencies +- ABS: `gravitino-azure-${gravitino-version}.jar`, `hadoop-azure-${hadoop-version}.jar`, and other necessary dependencies + +If there is no Hadoop environment, you can use the following jars to access the cloud storage fileset: + +- S3: `gravitino-aws-bundle-${gravitino-version}.jar` +- GCS: `gravitino-gcp-bundle-${gravitino-version}.jar` +- OSS: `gravitino-aliyun-bundle-${gravitino-version}.jar` +- ABS: `gravitino-azure-bundle-${gravitino-version}.jar` For `hadoop-aws-${hadoop-version}.jar`, `hadoop-azure-${hadoop-version}.jar` and 
`hadoop-aliyun-${hadoop-version}.jar` and related dependencies, you can get them from `${HADOOP_HOME}/share/hadoop/tools/lib/` directory. For `gcs-connector`, you can download it from the [GCS connector](https://github.com/GoogleCloudDataproc/hadoop-connectors/releases) for hadoop2 or hadoop3. -If there still have some issues, please report it to the Gravitino community and create an issue. +If there are some issues, please consider [fill in an issue](https://github.com/apache/gravitino/issues/new/choose). ## Create fileset catalogs @@ -197,7 +204,7 @@ s3_catalog = gravitino_client.create_catalog(name="catalog", :::note -The prefix of a GCS location should always start with `gs` for instance, `gs://bucket/root`. +The prefix of a GCS location should always start with `gs`, for instance, `gs://bucket/root`. ::: ### Create an OSS fileset catalog @@ -389,7 +396,7 @@ Schema schema = supportsSchemas.createSchema("schema", -You can change the value of property `location` according to which catalog you are using, moreover, if we have set the `location` property in the catalog, we can omit the `location` property in the schema. +You can change `location` value based on the catalog you are using. If the `location` property is specified in the catalog, we can omit it in the schema. ## Create filesets @@ -562,7 +569,7 @@ os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-azure-bundle-{gra ``` :::note -**In some Spark version, Hadoop environment is needed by the driver, adding the bundle jars with '--jars' may not work, in this case, you should add the jars to the spark classpath directly.** +In some Spark version, Hadoop environment is needed by the driver, adding the bundle jars with '--jars' may not work, in this case, you should add the jars to the spark classpath directly. 
::: ## Using fileset with hadoop fs command From a1aa4d56e579af2b17c583228169baf177ec89b0 Mon Sep 17 00:00:00 2001 From: yuqi Date: Fri, 3 Jan 2025 14:13:30 +0800 Subject: [PATCH 19/59] fix --- .../apache/gravitino/s3/credential/S3TokenProvider.java | 1 - .../filesystem/hadoop/GravitinoVirtualFileSystem.java | 7 ++++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/bundles/aws/src/main/java/org/apache/gravitino/s3/credential/S3TokenProvider.java b/bundles/aws/src/main/java/org/apache/gravitino/s3/credential/S3TokenProvider.java index 0bc35b14349..56d293d046f 100644 --- a/bundles/aws/src/main/java/org/apache/gravitino/s3/credential/S3TokenProvider.java +++ b/bundles/aws/src/main/java/org/apache/gravitino/s3/credential/S3TokenProvider.java @@ -123,7 +123,6 @@ private IamPolicy createPolicy( IamStatement.builder() .effect(IamEffect.ALLOW) .addAction("s3:GetObject") - .addAction("s3:GetObjectAttributes") .addAction("s3:GetObjectVersion"); Map bucketListStatmentBuilder = new HashMap<>(); Map bucketGetLocationStatmentBuilder = new HashMap<>(); diff --git a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java index cb2d4d019df..bb34a6f3113 100644 --- a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java +++ b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java @@ -88,6 +88,8 @@ public class GravitinoVirtualFileSystem extends FileSystem { private String metalakeName; private Cache catalogCache; private ScheduledThreadPoolExecutor catalogCleanScheduler; + // Fileset name identifier and its corresponding FileSystem cache, the name identifier has + // four levels, the first level is metalake name. 
private Cache internalFileSystemCache; private ScheduledThreadPoolExecutor internalFileSystemCleanScheduler; @@ -100,7 +102,7 @@ public class GravitinoVirtualFileSystem extends FileSystem { private static final String SLASH = "/"; private final Map fileSystemProvidersMap = Maps.newHashMap(); - private static final Set CATALOG_NECESSARY_PROPERTIES_FOR_CREDENTIAL = + private static final Set CATALOG_NECESSARY_PROPERTIES_TO_KEEP = Sets.newHashSet( OSSProperties.GRAVITINO_OSS_ENDPOINT, OSSProperties.GRAVITINO_OSS_ENDPOINT, @@ -430,8 +432,7 @@ private FilesetContextPair getFilesetContext(Path virtualPath, FilesetDataOperat catalog.properties().entrySet().stream() .filter( property -> - CATALOG_NECESSARY_PROPERTIES_FOR_CREDENTIAL.contains( - property.getKey())) + CATALOG_NECESSARY_PROPERTIES_TO_KEEP.contains(property.getKey())) .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); Map totalProperty = Maps.newHashMap(necessaryPropertyFromCatalog); From f67981cb3e4cfd7b060740430fe080500891de4e Mon Sep 17 00:00:00 2001 From: yuqi Date: Fri, 3 Jan 2025 14:49:58 +0800 Subject: [PATCH 20/59] fix typo --- .../test/GravitinoVirtualFileSystemGCSCredentialIT.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemGCSCredentialIT.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemGCSCredentialIT.java index c2460b75f7b..813bf56d5d5 100644 --- a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemGCSCredentialIT.java +++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemGCSCredentialIT.java @@ -75,7 +75,7 @@ public void startUp() throws Exception { Map properties = Maps.newHashMap(); properties.put(FILESYSTEM_PROVIDERS, "gcs"); - properties.put(GCSProperties.GCS_SERVICE_ACCOUNT_JSON_PATH, SERVICE_ACCOUNT_FILE); + properties.put(GCSProperties.GRAVITINO_GCS_SERVICE_ACCOUNT_FILE, SERVICE_ACCOUNT_FILE); properties.put("gcs-credential-file-path", SERVICE_ACCOUNT_FILE); properties.put(CredentialConstants.CREDENTIAL_PROVIDERS, "gcs-token"); @@ -94,7 +94,7 @@ public void startUp() throws Exception { conf.set("fs.gravitino.client.metalake", metalakeName); // Pass this configuration to the real file system - conf.set(GCSProperties.GCS_SERVICE_ACCOUNT_JSON_PATH, SERVICE_ACCOUNT_FILE); + conf.set(GCSProperties.GRAVITINO_GCS_SERVICE_ACCOUNT_FILE, SERVICE_ACCOUNT_FILE); } @AfterAll From c0db96bb46ac4290d623131e6b042273d4511d90 Mon Sep 17 00:00:00 2001 From: yuqi Date: Fri, 3 Jan 2025 16:50:00 +0800 Subject: [PATCH 21/59] fix --- .../fs/GravitinoOSSCredentialProvider.java | 49 +++++++++++++------ .../s3/fs/GravitinoS3CredentialProvider.java | 45 +++++++++++------ .../gravitino/s3/fs/S3FileSystemProvider.java | 9 ++-- .../GravitinoAzureSasCredentialProvider.java | 45 +++++++++++------ .../fs/GravitinoGCSCredentialProvider.java | 15 +++--- 5 files changed, 105 insertions(+), 58 deletions(-) diff --git a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/GravitinoOSSCredentialProvider.java b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/GravitinoOSSCredentialProvider.java index d0a44511e8b..65701517722 100644 --- a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/GravitinoOSSCredentialProvider.java +++ 
b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/GravitinoOSSCredentialProvider.java @@ -28,10 +28,13 @@ import com.aliyun.oss.common.auth.CredentialsProvider; import com.aliyun.oss.common.auth.DefaultCredentials; import java.net.URI; +import java.util.Arrays; import java.util.Map; +import java.util.Optional; import org.apache.gravitino.NameIdentifier; import org.apache.gravitino.client.GravitinoClient; import org.apache.gravitino.credential.Credential; +import org.apache.gravitino.credential.OSSSecretKeyCredential; import org.apache.gravitino.credential.OSSTokenCredential; import org.apache.gravitino.file.Fileset; import org.apache.gravitino.file.FilesetCatalog; @@ -44,8 +47,7 @@ public class GravitinoOSSCredentialProvider implements CredentialsProvider { - private static final Logger LOGGER = - LoggerFactory.getLogger(GravitinoOSSCredentialProvider.class); + private static final Logger LOG = LoggerFactory.getLogger(GravitinoOSSCredentialProvider.class); private Credentials basicCredentials; private final String filesetIdentifier; private long expirationTime; @@ -83,8 +85,10 @@ private void refresh() { Fileset fileset = filesetCatalog.loadFileset(NameIdentifier.of(idents[2], idents[3])); Credential[] credentials = fileset.supportsCredentials().getCredentials(); - if (credentials.length == 0) { - LOGGER.warn("No credential found for fileset: {}, try to use static AKSK", filesetIdentifier); + Optional optionalCredential = getCredential(credentials); + + if (!optionalCredential.isPresent()) { + LOG.warn("No credential found for fileset: {}, try to use static AKSK", filesetIdentifier); expirationTime = Long.MAX_VALUE; this.basicCredentials = new DefaultCredentials( @@ -93,7 +97,7 @@ private void refresh() { return; } - Credential credential = getCredential(credentials); + Credential credential = optionalCredential.get(); Map credentialMap = credential.toProperties(); String accessKeyId = credentialMap.get(GRAVITINO_OSS_SESSION_ACCESS_KEY_ID); @@ -102,12 +106,12 @@ private void refresh() { if (OSSTokenCredential.OSS_TOKEN_CREDENTIAL_TYPE.equals( credentialMap.get(Credential.CREDENTIAL_TYPE))) { String sessionToken = credentialMap.get(GRAVITINO_OSS_TOKEN); - this.basicCredentials = new BasicCredentials(accessKeyId, secretAccessKey, sessionToken); + basicCredentials = new BasicCredentials(accessKeyId, secretAccessKey, sessionToken); } else { - this.basicCredentials = new DefaultCredentials(accessKeyId, secretAccessKey); + basicCredentials = new DefaultCredentials(accessKeyId, secretAccessKey); } - this.expirationTime = credential.expireTimeInMs(); + expirationTime = credential.expireTimeInMs(); if (expirationTime <= 0) { expirationTime = Long.MAX_VALUE; } @@ -118,16 +122,29 @@ private void refresh() { * uses static credential. * * @param credentials The credential array. - * @return The credential. + * @return An optional credential. */ - private Credential getCredential(Credential[] credentials) { - for (Credential credential : credentials) { - if (OSSTokenCredential.OSS_TOKEN_CREDENTIAL_TYPE.equals(credential.credentialType())) { - return credential; - } + private Optional getCredential(Credential[] credentials) { + // Use dynamic credential if found. + Optional dynamicCredential = + Arrays.stream(credentials) + .filter( + credential -> + credential + .credentialType() + .equals(OSSTokenCredential.OSS_TOKEN_CREDENTIAL_TYPE)) + .findFirst(); + if (dynamicCredential.isPresent()) { + return dynamicCredential; } - // Not found, use the first one. 
- return credentials[0]; + // If dynamic credential not found, use the static one if possible + return Arrays.stream(credentials) + .filter( + credential -> + credential + .credentialType() + .equals(OSSSecretKeyCredential.OSS_SECRET_KEY_CREDENTIAL_TYPE)) + .findFirst(); } } diff --git a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/GravitinoS3CredentialProvider.java b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/GravitinoS3CredentialProvider.java index d3f3105499e..12b3066653b 100644 --- a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/GravitinoS3CredentialProvider.java +++ b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/GravitinoS3CredentialProvider.java @@ -28,10 +28,13 @@ import com.amazonaws.auth.BasicAWSCredentials; import com.amazonaws.auth.BasicSessionCredentials; import java.net.URI; +import java.util.Arrays; import java.util.Map; +import java.util.Optional; import org.apache.gravitino.NameIdentifier; import org.apache.gravitino.client.GravitinoClient; import org.apache.gravitino.credential.Credential; +import org.apache.gravitino.credential.S3SecretKeyCredential; import org.apache.gravitino.credential.S3TokenCredential; import org.apache.gravitino.file.Fileset; import org.apache.gravitino.file.FilesetCatalog; @@ -44,7 +47,7 @@ public class GravitinoS3CredentialProvider implements AWSCredentialsProvider { - private static final Logger LOGGER = LoggerFactory.getLogger(GravitinoS3CredentialProvider.class); + private static final Logger LOG = LoggerFactory.getLogger(GravitinoS3CredentialProvider.class); private final GravitinoClient client; private final String filesetIdentifier; private final Configuration configuration; @@ -83,10 +86,11 @@ public void refresh() { Fileset fileset = filesetCatalog.loadFileset(NameIdentifier.of(idents[2], idents[3])); Credential[] credentials = fileset.supportsCredentials().getCredentials(); + Optional optionalCredential = getCredential(credentials); // Can't find any credential, use the default AKSK if possible. - if (credentials.length == 0) { - LOGGER.warn("No credential found for fileset: {}, try to use static AKSK", filesetIdentifier); + if (!optionalCredential.isPresent()) { + LOG.warn("No credential found for fileset: {}, try to use static AKSK", filesetIdentifier); expirationTime = Long.MAX_VALUE; this.basicSessionCredentials = new BasicAWSCredentials( @@ -94,7 +98,7 @@ public void refresh() { return; } - Credential credential = getCredential(credentials); + Credential credential = optionalCredential.get(); Map credentialMap = credential.toProperties(); String accessKeyId = credentialMap.get(GRAVITINO_S3_SESSION_ACCESS_KEY_ID); @@ -103,13 +107,13 @@ public void refresh() { if (S3TokenCredential.S3_TOKEN_CREDENTIAL_TYPE.equals( credentialMap.get(Credential.CREDENTIAL_TYPE))) { String sessionToken = credentialMap.get(GRAVITINO_S3_TOKEN); - this.basicSessionCredentials = + basicSessionCredentials = new BasicSessionCredentials(accessKeyId, secretAccessKey, sessionToken); } else { - this.basicSessionCredentials = new BasicAWSCredentials(accessKeyId, secretAccessKey); + basicSessionCredentials = new BasicAWSCredentials(accessKeyId, secretAccessKey); } - this.expirationTime = credential.expireTimeInMs(); + expirationTime = credential.expireTimeInMs(); if (expirationTime <= 0) { expirationTime = Long.MAX_VALUE; } @@ -120,16 +124,27 @@ public void refresh() { * uses static credential. * * @param credentials The credential array. - * @return The credential. + * @return An optional credential. 
*/ - private Credential getCredential(Credential[] credentials) { - for (Credential credential : credentials) { - if (S3TokenCredential.S3_TOKEN_CREDENTIAL_TYPE.equals(credential.credentialType())) { - return credential; - } + private Optional getCredential(Credential[] credentials) { + // Use dynamic credential if found. + Optional dynamicCredential = + Arrays.stream(credentials) + .filter( + credential -> + credential.credentialType().equals(S3TokenCredential.S3_TOKEN_CREDENTIAL_TYPE)) + .findFirst(); + if (dynamicCredential.isPresent()) { + return dynamicCredential; } - // Not found, use the first one. - return credentials[0]; + // If dynamic credential not found, use the static one + return Arrays.stream(credentials) + .filter( + credential -> + credential + .credentialType() + .equals(S3SecretKeyCredential.S3_SECRET_KEY_CREDENTIAL_TYPE)) + .findFirst(); } } diff --git a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java index 3747d2c104c..94d30d205b8 100644 --- a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java +++ b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java @@ -42,7 +42,7 @@ public class S3FileSystemProvider implements FileSystemProvider { - private static final Logger LOGGER = LoggerFactory.getLogger(S3FileSystemProvider.class); + private static final Logger LOG = LoggerFactory.getLogger(S3FileSystemProvider.class); @VisibleForTesting public static final Map GRAVITINO_KEY_TO_S3_HADOOP_KEY = @@ -67,6 +67,9 @@ public FileSystem getFileSystem(Path path, Map config) throws IO configuration.set(S3_CREDENTIAL_KEY, S3_SIMPLE_CREDENTIAL); } + // Only call from GVFS client will have this key and support GravitinoS3CredentialProvider as + // the file system provider will be used by GVFS client and Gravitino server, only GVFS client + // will have this key. 
if (config.containsKey(GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_SERVER_URI_KEY)) { configuration.set( Constants.AWS_CREDENTIALS_PROVIDER, @@ -97,12 +100,12 @@ private void checkAndSetCredentialProvider(Configuration configuration) { if (AWSCredentialsProvider.class.isAssignableFrom(c)) { validProviders.add(provider); } else { - LOGGER.warn( + LOG.warn( "Credential provider {} is not a subclass of AWSCredentialsProvider, skipping", provider); } } catch (Exception e) { - LOGGER.warn( + LOG.warn( "Credential provider {} not found in the Hadoop runtime, falling back to default", provider); configuration.set(S3_CREDENTIAL_KEY, S3_SIMPLE_CREDENTIAL); diff --git a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/GravitinoAzureSasCredentialProvider.java b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/GravitinoAzureSasCredentialProvider.java index f208dac4c56..b004b188f03 100644 --- a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/GravitinoAzureSasCredentialProvider.java +++ b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/GravitinoAzureSasCredentialProvider.java @@ -19,15 +19,18 @@ package org.apache.gravitino.abs.fs; +import static org.apache.gravitino.credential.ADLSTokenCredential.ADLS_TOKEN_CREDENTIAL_TYPE; import static org.apache.gravitino.credential.ADLSTokenCredential.GRAVITINO_ADLS_SAS_TOKEN; import static org.apache.gravitino.credential.AzureAccountKeyCredential.GRAVITINO_AZURE_STORAGE_ACCOUNT_KEY; import static org.apache.gravitino.credential.AzureAccountKeyCredential.GRAVITINO_AZURE_STORAGE_ACCOUNT_NAME; import java.io.IOException; +import java.util.Arrays; import java.util.Map; +import java.util.Optional; import org.apache.gravitino.NameIdentifier; import org.apache.gravitino.client.GravitinoClient; -import org.apache.gravitino.credential.ADLSTokenCredential; +import org.apache.gravitino.credential.AzureAccountKeyCredential; import org.apache.gravitino.credential.Credential; import org.apache.gravitino.file.Fileset; import org.apache.gravitino.file.FilesetCatalog; @@ -102,20 +105,21 @@ private void refresh() { Fileset fileset = filesetCatalog.loadFileset(NameIdentifier.of(idents[2], idents[3])); Credential[] credentials = fileset.supportsCredentials().getCredentials(); - if (credentials.length == 0) { + Optional optionalCredential = getCredential(credentials); + + if (!optionalCredential.isPresent()) { LOGGER.warn("No credentials found for fileset {}", filesetIdentifier); return; } - Credential credential = getCredential(credentials); + Credential credential = optionalCredential.get(); Map credentialMap = credential.toProperties(); - if (ADLSTokenCredential.ADLS_TOKEN_CREDENTIAL_TYPE.equals( - credentialMap.get(Credential.CREDENTIAL_TYPE))) { - this.sasToken = credentialMap.get(GRAVITINO_ADLS_SAS_TOKEN); + if (ADLS_TOKEN_CREDENTIAL_TYPE.equals(credentialMap.get(Credential.CREDENTIAL_TYPE))) { + sasToken = credentialMap.get(GRAVITINO_ADLS_SAS_TOKEN); } else { - this.azureStorageAccountName = credentialMap.get(GRAVITINO_AZURE_STORAGE_ACCOUNT_NAME); - this.azureStorageAccountKey = credentialMap.get(GRAVITINO_AZURE_STORAGE_ACCOUNT_KEY); + azureStorageAccountName = credentialMap.get(GRAVITINO_AZURE_STORAGE_ACCOUNT_NAME); + azureStorageAccountKey = credentialMap.get(GRAVITINO_AZURE_STORAGE_ACCOUNT_KEY); } this.expirationTime = credential.expireTimeInMs(); @@ -129,16 +133,25 @@ private void refresh() { * uses static credential. * * @param credentials The credential array. - * @return The credential. + * @return An optional credential. 
 */
-  private Credential getCredential(Credential[] credentials) {
-    for (Credential credential : credentials) {
-      if (ADLSTokenCredential.ADLS_TOKEN_CREDENTIAL_TYPE.equals(credential.credentialType())) {
-        return credential;
-      }
+  private Optional<Credential> getCredential(Credential[] credentials) {
+    // Use dynamic credential if found.
+    Optional<Credential> dynamicCredential =
+        Arrays.stream(credentials)
+            .filter(credential -> credential.credentialType().equals(ADLS_TOKEN_CREDENTIAL_TYPE))
+            .findFirst();
+    if (dynamicCredential.isPresent()) {
+      return dynamicCredential;
     }
-    // Not found, use the first one.
-    return credentials[0];
+    // If no dynamic credential is found, fall back to the static one.
+    return Arrays.stream(credentials)
+        .filter(
+            credential ->
+                credential
+                    .credentialType()
+                    .equals(AzureAccountKeyCredential.AZURE_ACCOUNT_KEY_CREDENTIAL_TYPE))
+        .findFirst();
   }
 }
diff --git a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GravitinoGCSCredentialProvider.java b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GravitinoGCSCredentialProvider.java
index 5c4d93b8997..c93b886ffd0 100644
--- a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GravitinoGCSCredentialProvider.java
+++ b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GravitinoGCSCredentialProvider.java
@@ -35,8 +35,7 @@ import org.slf4j.LoggerFactory;
 public class GravitinoGCSCredentialProvider implements AccessTokenProvider {
-  private static final Logger LOGGER =
-      LoggerFactory.getLogger(GravitinoGCSCredentialProvider.class);
+  private static final Logger LOG = LoggerFactory.getLogger(GravitinoGCSCredentialProvider.class);
   private Configuration configuration;
   private GravitinoClient client;
   private String filesetIdentifier;
@@ -50,7 +49,7 @@ public AccessToken getAccessToken() {
       try {
         refresh();
       } catch (IOException e) {
-        LOGGER.error("Failed to refresh the access token", e);
+        LOG.error("Failed to refresh the access token", e);
       }
     }
     return accessToken;
@@ -70,7 +69,7 @@ public void refresh() throws IOException {
     // Can't find any credential, use the default one.
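    // ("Default" here means the static service-account JSON file configured for the GCS
    // connector outside of Gravitino, as the warning below notes.)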
if (credentials.length == 0) { - LOGGER.warn( + LOG.warn( "No credential found for fileset: {}, try to use static JSON file", filesetIdentifier); return; } @@ -82,11 +81,11 @@ public void refresh() throws IOException { credentialMap.get(Credential.CREDENTIAL_TYPE))) { String sessionToken = credentialMap.get(GCSTokenCredential.GCS_TOKEN_NAME); accessToken = new AccessToken(sessionToken, expirationTime); - } - this.expirationTime = credential.expireTimeInMs(); - if (expirationTime <= 0) { - expirationTime = Long.MAX_VALUE; + expirationTime = credential.expireTimeInMs(); + if (expirationTime <= 0) { + expirationTime = Long.MAX_VALUE; + } } } From d2ba98b41320c19c7426ad81cddf6eddc0a988d6 Mon Sep 17 00:00:00 2001 From: yuqi Date: Fri, 3 Jan 2025 19:09:25 +0800 Subject: [PATCH 22/59] refactor module and create a new module `filesystem-hadoop3-common` --- bundles/aliyun/build.gradle.kts | 4 +- .../fs/GravitinoOSSCredentialProvider.java | 7 +- .../oss/fs/OSSFileSystemProvider.java | 2 +- bundles/aws/build.gradle.kts | 4 +- .../s3/fs/GravitinoS3CredentialProvider.java | 7 +- .../gravitino/s3/fs/S3FileSystemProvider.java | 2 +- bundles/azure/build.gradle.kts | 5 +- .../abs/fs/AzureFileSystemProvider.java | 2 +- .../GravitinoAzureSasCredentialProvider.java | 7 +- bundles/gcp/build.gradle.kts | 4 +- .../gcs/fs/GCSFileSystemProvider.java | 2 +- .../fs/GravitinoGCSCredentialProvider.java | 8 +- .../build.gradle.kts | 50 ++++++ ...avitinoVirtualFileSystemConfiguration.java | 2 +- .../GravitinoVirtualFileSystemUtils.java | 150 ++++++++++++++++++ clients/filesystem-hadoop3/build.gradle.kts | 4 + .../hadoop/GravitinoVirtualFileSystem.java | 129 +-------------- .../gravitino/filesystem/hadoop/Gvfs.java | 1 + .../hadoop/FileSystemTestUtils.java | 1 + .../filesystem/hadoop/TestGvfsBase.java | 1 + .../filesystem/hadoop/TestKerberosClient.java | 1 + .../filesystem/hadoop/TestOauth2Client.java | 1 + .../filesystem/hadoop/TestSimpleClient.java | 1 + settings.gradle.kts | 3 +- 24 files changed, 246 insertions(+), 152 deletions(-) create mode 100644 clients/filesystem-hadoop3-common/build.gradle.kts rename clients/{filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop => filesystem-hadoop3-common/src/main/java/org/apache/gravitino/filesystem/common}/GravitinoVirtualFileSystemConfiguration.java (98%) create mode 100644 clients/filesystem-hadoop3-common/src/main/java/org/apache/gravitino/filesystem/common/GravitinoVirtualFileSystemUtils.java diff --git a/bundles/aliyun/build.gradle.kts b/bundles/aliyun/build.gradle.kts index 5c93b8962e7..b62570072f7 100644 --- a/bundles/aliyun/build.gradle.kts +++ b/bundles/aliyun/build.gradle.kts @@ -40,7 +40,9 @@ dependencies { exclude("*") } implementation(project(":clients:client-java-runtime", configuration = "shadow")) - implementation(project(":clients:filesystem-hadoop3-runtime", configuration = "shadow")) + implementation(project(":clients:filesystem-hadoop3-common")) { + exclude("*") + } implementation(libs.aliyun.credentials.sdk) implementation(libs.commons.collections3) diff --git a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/GravitinoOSSCredentialProvider.java b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/GravitinoOSSCredentialProvider.java index 65701517722..375a69d812b 100644 --- a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/GravitinoOSSCredentialProvider.java +++ b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/GravitinoOSSCredentialProvider.java @@ -38,8 +38,8 @@ import 
org.apache.gravitino.credential.OSSTokenCredential; import org.apache.gravitino.file.Fileset; import org.apache.gravitino.file.FilesetCatalog; -import org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem; -import org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystemConfiguration; +import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemConfiguration; +import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.aliyun.oss.Constants; import org.slf4j.Logger; @@ -57,8 +57,7 @@ public class GravitinoOSSCredentialProvider implements CredentialsProvider { public GravitinoOSSCredentialProvider(URI uri, Configuration conf) { this.filesetIdentifier = conf.get(GravitinoVirtualFileSystemConfiguration.GVFS_FILESET_IDENTIFIER); - GravitinoVirtualFileSystem gravitinoVirtualFileSystem = new GravitinoVirtualFileSystem(); - this.client = gravitinoVirtualFileSystem.initializeClient(conf); + this.client = GravitinoVirtualFileSystemUtils.createClient(conf); this.configuration = conf; } diff --git a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java index c1ae59a897e..cadabd7f3bf 100644 --- a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java +++ b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java @@ -24,7 +24,7 @@ import java.util.Map; import org.apache.gravitino.catalog.hadoop.fs.FileSystemProvider; import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; -import org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystemConfiguration; +import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemConfiguration; import org.apache.gravitino.storage.OSSProperties; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; diff --git a/bundles/aws/build.gradle.kts b/bundles/aws/build.gradle.kts index 3db8c805c1c..82a709dd472 100644 --- a/bundles/aws/build.gradle.kts +++ b/bundles/aws/build.gradle.kts @@ -40,7 +40,9 @@ dependencies { exclude("*") } implementation(project(":clients:client-java-runtime", configuration = "shadow")) - implementation(project(":clients:filesystem-hadoop3-runtime", configuration = "shadow")) + implementation(project(":clients:filesystem-hadoop3-common")) { + exclude("*") + } implementation(libs.aws.iam) implementation(libs.aws.policy) diff --git a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/GravitinoS3CredentialProvider.java b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/GravitinoS3CredentialProvider.java index 12b3066653b..006c7aede77 100644 --- a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/GravitinoS3CredentialProvider.java +++ b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/GravitinoS3CredentialProvider.java @@ -38,8 +38,8 @@ import org.apache.gravitino.credential.S3TokenCredential; import org.apache.gravitino.file.Fileset; import org.apache.gravitino.file.FilesetCatalog; -import org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem; -import org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystemConfiguration; +import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemConfiguration; +import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.s3a.Constants; import 
org.slf4j.Logger; @@ -59,8 +59,7 @@ public GravitinoS3CredentialProvider(final URI uri, final Configuration conf) { this.filesetIdentifier = conf.get(GravitinoVirtualFileSystemConfiguration.GVFS_FILESET_IDENTIFIER); this.configuration = conf; - GravitinoVirtualFileSystem gravitinoVirtualFileSystem = new GravitinoVirtualFileSystem(); - this.client = gravitinoVirtualFileSystem.initializeClient(conf); + this.client = GravitinoVirtualFileSystemUtils.createClient(conf); } @Override diff --git a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java index 94d30d205b8..a564536fa05 100644 --- a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java +++ b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java @@ -30,7 +30,7 @@ import java.util.Map; import org.apache.gravitino.catalog.hadoop.fs.FileSystemProvider; import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; -import org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystemConfiguration; +import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemConfiguration; import org.apache.gravitino.storage.S3Properties; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; diff --git a/bundles/azure/build.gradle.kts b/bundles/azure/build.gradle.kts index 3d350214d6c..c64890af66a 100644 --- a/bundles/azure/build.gradle.kts +++ b/bundles/azure/build.gradle.kts @@ -39,8 +39,11 @@ dependencies { implementation(project(":catalogs:hadoop-common")) { exclude("*") } + implementation(project(":clients:client-java-runtime", configuration = "shadow")) - implementation(project(":clients:filesystem-hadoop3-runtime", configuration = "shadow")) + implementation(project(":clients:filesystem-hadoop3-common")) { + exclude("*") + } implementation(libs.azure.identity) implementation(libs.azure.storage.file.datalake) diff --git a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java index b9aca233620..99db1c4cda6 100644 --- a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java +++ b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java @@ -30,7 +30,7 @@ import javax.annotation.Nonnull; import org.apache.gravitino.catalog.hadoop.fs.FileSystemProvider; import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; -import org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystemConfiguration; +import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemConfiguration; import org.apache.gravitino.storage.AzureProperties; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; diff --git a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/GravitinoAzureSasCredentialProvider.java b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/GravitinoAzureSasCredentialProvider.java index b004b188f03..1a539145699 100644 --- a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/GravitinoAzureSasCredentialProvider.java +++ b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/GravitinoAzureSasCredentialProvider.java @@ -34,8 +34,8 @@ import org.apache.gravitino.credential.Credential; import org.apache.gravitino.file.Fileset; import org.apache.gravitino.file.FilesetCatalog; -import 
org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem; -import org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystemConfiguration; +import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemConfiguration; +import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemUtils; import org.apache.hadoop.conf.Configurable; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.azurebfs.extensions.SASTokenProvider; @@ -82,8 +82,7 @@ public Configuration getConf() { public void initialize(Configuration conf, String accountName) throws IOException { this.filesetIdentifier = conf.get(GravitinoVirtualFileSystemConfiguration.GVFS_FILESET_IDENTIFIER); - GravitinoVirtualFileSystem gravitinoVirtualFileSystem = new GravitinoVirtualFileSystem(); - this.client = gravitinoVirtualFileSystem.initializeClient(conf); + this.client = GravitinoVirtualFileSystemUtils.createClient(conf); } @Override diff --git a/bundles/gcp/build.gradle.kts b/bundles/gcp/build.gradle.kts index 94fa1562156..defb5098e72 100644 --- a/bundles/gcp/build.gradle.kts +++ b/bundles/gcp/build.gradle.kts @@ -41,7 +41,9 @@ dependencies { exclude("*") } implementation(project(":clients:client-java-runtime", configuration = "shadow")) - implementation(project(":clients:filesystem-hadoop3-runtime", configuration = "shadow")) + implementation(project(":clients:filesystem-hadoop3-common")) { + exclude("*") + } implementation(libs.commons.lang3) // runtime used diff --git a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java index 46409cca837..3cdc97f3edc 100644 --- a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java +++ b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java @@ -25,7 +25,7 @@ import java.util.Map; import org.apache.gravitino.catalog.hadoop.fs.FileSystemProvider; import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; -import org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystemConfiguration; +import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemConfiguration; import org.apache.gravitino.storage.GCSProperties; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; diff --git a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GravitinoGCSCredentialProvider.java b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GravitinoGCSCredentialProvider.java index c93b886ffd0..6c8af262ddc 100644 --- a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GravitinoGCSCredentialProvider.java +++ b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GravitinoGCSCredentialProvider.java @@ -28,8 +28,8 @@ import org.apache.gravitino.credential.GCSTokenCredential; import org.apache.gravitino.file.Fileset; import org.apache.gravitino.file.FilesetCatalog; -import org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem; -import org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystemConfiguration; +import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemConfiguration; +import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemUtils; import org.apache.hadoop.conf.Configuration; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -92,11 +92,9 @@ public void refresh() throws IOException { @Override public void setConf(Configuration configuration) { this.configuration = configuration; - 
this.filesetIdentifier = configuration.get(GravitinoVirtualFileSystemConfiguration.GVFS_FILESET_IDENTIFIER); - GravitinoVirtualFileSystem gravitinoVirtualFileSystem = new GravitinoVirtualFileSystem(); - this.client = gravitinoVirtualFileSystem.initializeClient(configuration); + this.client = GravitinoVirtualFileSystemUtils.createClient(configuration); } @Override diff --git a/clients/filesystem-hadoop3-common/build.gradle.kts b/clients/filesystem-hadoop3-common/build.gradle.kts new file mode 100644 index 00000000000..09f2fb82e90 --- /dev/null +++ b/clients/filesystem-hadoop3-common/build.gradle.kts @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +plugins { + `maven-publish` + id("java") + id("idea") +} + +dependencies { + implementation(project(":api")) { + exclude("*") + } + + implementation(project(":common")) { + exclude("*") + } + implementation(libs.commons.lang3) + implementation(libs.guava) + + compileOnly(project(":clients:client-java-runtime", configuration = "shadow")) + compileOnly(libs.hadoop3.client.api) + compileOnly(libs.hadoop3.client.runtime) + compileOnly(libs.lombok) + annotationProcessor(libs.lombok) +} + +tasks.build { + dependsOn("javadoc") +} + +tasks.clean { + delete("target") + delete("tmp") +} diff --git a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystemConfiguration.java b/clients/filesystem-hadoop3-common/src/main/java/org/apache/gravitino/filesystem/common/GravitinoVirtualFileSystemConfiguration.java similarity index 98% rename from clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystemConfiguration.java rename to clients/filesystem-hadoop3-common/src/main/java/org/apache/gravitino/filesystem/common/GravitinoVirtualFileSystemConfiguration.java index 8b83dc002b3..32d6985b822 100644 --- a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystemConfiguration.java +++ b/clients/filesystem-hadoop3-common/src/main/java/org/apache/gravitino/filesystem/common/GravitinoVirtualFileSystemConfiguration.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.gravitino.filesystem.hadoop; +package org.apache.gravitino.filesystem.common; /** Configuration class for Gravitino Virtual File System. 
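 * <p>The keys collected here are read both by the GVFS client and by the credential
 * providers in the cloud-storage bundles, which is why this class lives in the common module.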
*/ public class GravitinoVirtualFileSystemConfiguration { diff --git a/clients/filesystem-hadoop3-common/src/main/java/org/apache/gravitino/filesystem/common/GravitinoVirtualFileSystemUtils.java b/clients/filesystem-hadoop3-common/src/main/java/org/apache/gravitino/filesystem/common/GravitinoVirtualFileSystemUtils.java new file mode 100644 index 00000000000..c78d2365b82 --- /dev/null +++ b/clients/filesystem-hadoop3-common/src/main/java/org/apache/gravitino/filesystem/common/GravitinoVirtualFileSystemUtils.java @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.gravitino.filesystem.common; + +import com.google.common.base.Preconditions; +import java.io.File; +import org.apache.commons.lang3.StringUtils; +import org.apache.gravitino.client.DefaultOAuth2TokenProvider; +import org.apache.gravitino.client.GravitinoClient; +import org.apache.gravitino.client.KerberosTokenProvider; +import org.apache.hadoop.conf.Configuration; + +public class GravitinoVirtualFileSystemUtils { + + /** + * Get Gravitino client by the configuration. + * + * @param configuration The configuration for the Gravitino client. + * @return The Gravitino client. 
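+   *     <p>A minimal usage sketch (simple auth, the default; the URI and metalake name are
+   *     placeholders):
+   *     <pre>{@code
+   * Configuration conf = new Configuration();
+   * conf.set("fs.gravitino.server.uri", "http://localhost:8090");
+   * conf.set("fs.gravitino.client.metalake", "my_metalake");
+   * GravitinoClient client = GravitinoVirtualFileSystemUtils.createClient(conf);
+   * }</pre>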
+ */ + public static GravitinoClient createClient(Configuration configuration) { + // initialize the Gravitino client + String serverUri = + configuration.get(GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_SERVER_URI_KEY); + String metalakeValue = + configuration.get(GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_CLIENT_METALAKE_KEY); + Preconditions.checkArgument( + StringUtils.isNotBlank(serverUri), + "'%s' is not set in the configuration", + GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_SERVER_URI_KEY); + + String authType = + configuration.get( + GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_CLIENT_AUTH_TYPE_KEY, + GravitinoVirtualFileSystemConfiguration.SIMPLE_AUTH_TYPE); + if (authType.equalsIgnoreCase(GravitinoVirtualFileSystemConfiguration.SIMPLE_AUTH_TYPE)) { + return GravitinoClient.builder(serverUri) + .withMetalake(metalakeValue) + .withSimpleAuth() + .build(); + } else if (authType.equalsIgnoreCase( + GravitinoVirtualFileSystemConfiguration.OAUTH2_AUTH_TYPE)) { + String authServerUri = + configuration.get( + GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_CLIENT_OAUTH2_SERVER_URI_KEY); + checkAuthConfig( + GravitinoVirtualFileSystemConfiguration.OAUTH2_AUTH_TYPE, + GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_CLIENT_OAUTH2_SERVER_URI_KEY, + authServerUri); + + String credential = + configuration.get( + GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_CLIENT_OAUTH2_CREDENTIAL_KEY); + checkAuthConfig( + GravitinoVirtualFileSystemConfiguration.OAUTH2_AUTH_TYPE, + GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_CLIENT_OAUTH2_CREDENTIAL_KEY, + credential); + + String path = + configuration.get( + GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_CLIENT_OAUTH2_PATH_KEY); + checkAuthConfig( + GravitinoVirtualFileSystemConfiguration.OAUTH2_AUTH_TYPE, + GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_CLIENT_OAUTH2_PATH_KEY, + path); + + String scope = + configuration.get( + GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_CLIENT_OAUTH2_SCOPE_KEY); + checkAuthConfig( + GravitinoVirtualFileSystemConfiguration.OAUTH2_AUTH_TYPE, + GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_CLIENT_OAUTH2_SCOPE_KEY, + scope); + + DefaultOAuth2TokenProvider authDataProvider = + DefaultOAuth2TokenProvider.builder() + .withUri(authServerUri) + .withCredential(credential) + .withPath(path) + .withScope(scope) + .build(); + + return GravitinoClient.builder(serverUri) + .withMetalake(metalakeValue) + .withOAuth(authDataProvider) + .build(); + } else if (authType.equalsIgnoreCase( + GravitinoVirtualFileSystemConfiguration.KERBEROS_AUTH_TYPE)) { + String principal = + configuration.get( + GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_CLIENT_KERBEROS_PRINCIPAL_KEY); + checkAuthConfig( + GravitinoVirtualFileSystemConfiguration.KERBEROS_AUTH_TYPE, + GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_CLIENT_KERBEROS_PRINCIPAL_KEY, + principal); + String keytabFilePath = + configuration.get( + GravitinoVirtualFileSystemConfiguration + .FS_GRAVITINO_CLIENT_KERBEROS_KEYTAB_FILE_PATH_KEY); + KerberosTokenProvider authDataProvider; + if (StringUtils.isNotBlank(keytabFilePath)) { + // Using principal and keytab to create auth provider + authDataProvider = + KerberosTokenProvider.builder() + .withClientPrincipal(principal) + .withKeyTabFile(new File(keytabFilePath)) + .build(); + } else { + // Using ticket cache to create auth provider + authDataProvider = KerberosTokenProvider.builder().withClientPrincipal(principal).build(); + } + + return 
GravitinoClient.builder(serverUri) + .withMetalake(metalakeValue) + .withKerberosAuth(authDataProvider) + .build(); + } else { + throw new IllegalArgumentException( + String.format( + "Unsupported authentication type: %s for %s.", + authType, GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_CLIENT_AUTH_TYPE_KEY)); + } + } + + private static void checkAuthConfig(String authType, String configKey, String configValue) { + Preconditions.checkArgument( + StringUtils.isNotBlank(configValue), + "%s should not be null if %s is set to %s.", + configKey, + GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_CLIENT_AUTH_TYPE_KEY, + authType); + } +} diff --git a/clients/filesystem-hadoop3/build.gradle.kts b/clients/filesystem-hadoop3/build.gradle.kts index 424f6a11406..191cf794d86 100644 --- a/clients/filesystem-hadoop3/build.gradle.kts +++ b/clients/filesystem-hadoop3/build.gradle.kts @@ -28,6 +28,10 @@ dependencies { compileOnly(libs.hadoop3.client.api) compileOnly(libs.hadoop3.client.runtime) + implementation(project(":clients:filesystem-hadoop3-common")) { + exclude(group = "*") + } + implementation(project(":catalogs:catalog-common")) { exclude(group = "*") } diff --git a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java index bb34a6f3113..45c161b487f 100644 --- a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java +++ b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java @@ -18,7 +18,7 @@ */ package org.apache.gravitino.filesystem.hadoop; -import static org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystemConfiguration.GVFS_FILESET_IDENTIFIER; +import static org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemConfiguration.GVFS_FILESET_IDENTIFIER; import com.github.benmanes.caffeine.cache.Cache; import com.github.benmanes.caffeine.cache.Caffeine; @@ -30,7 +30,6 @@ import com.google.common.collect.Sets; import com.google.common.collect.Streams; import com.google.common.util.concurrent.ThreadFactoryBuilder; -import java.io.File; import java.io.IOException; import java.net.URI; import java.util.Arrays; @@ -53,11 +52,11 @@ import org.apache.gravitino.audit.FilesetDataOperation; import org.apache.gravitino.audit.InternalClientType; import org.apache.gravitino.catalog.hadoop.fs.FileSystemProvider; -import org.apache.gravitino.client.DefaultOAuth2TokenProvider; import org.apache.gravitino.client.GravitinoClient; -import org.apache.gravitino.client.KerberosTokenProvider; import org.apache.gravitino.exceptions.GravitinoRuntimeException; import org.apache.gravitino.file.FilesetCatalog; +import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemConfiguration; +import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemUtils; import org.apache.gravitino.storage.AzureProperties; import org.apache.gravitino.storage.OSSProperties; import org.apache.gravitino.storage.S3Properties; @@ -151,8 +150,7 @@ public void initialize(URI name, Configuration configuration) throws IOException "'%s' is not set in the configuration", GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_CLIENT_METALAKE_KEY); - this.client = initializeClient(configuration); - + this.client = GravitinoVirtualFileSystemUtils.createClient(configuration); // Register the default local and HDFS 
FileSystemProvider fileSystemProvidersMap.putAll(getFileSystemProviders()); @@ -212,125 +210,6 @@ private ThreadFactory newDaemonThreadFactory(String name) { return new ThreadFactoryBuilder().setDaemon(true).setNameFormat(name + "-%d").build(); } - /** - * Get Gravitino client by the configuration. - * - * @param configuration The configuration for the Gravitino client. - * @return The Gravitino client. - */ - public GravitinoClient initializeClient(Configuration configuration) { - // initialize the Gravitino client - String serverUri = - configuration.get(GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_SERVER_URI_KEY); - String metalakeValue = - configuration.get(GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_CLIENT_METALAKE_KEY); - Preconditions.checkArgument( - StringUtils.isNotBlank(serverUri), - "'%s' is not set in the configuration", - GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_SERVER_URI_KEY); - - String authType = - configuration.get( - GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_CLIENT_AUTH_TYPE_KEY, - GravitinoVirtualFileSystemConfiguration.SIMPLE_AUTH_TYPE); - if (authType.equalsIgnoreCase(GravitinoVirtualFileSystemConfiguration.SIMPLE_AUTH_TYPE)) { - return GravitinoClient.builder(serverUri) - .withMetalake(metalakeValue) - .withSimpleAuth() - .build(); - } else if (authType.equalsIgnoreCase( - GravitinoVirtualFileSystemConfiguration.OAUTH2_AUTH_TYPE)) { - String authServerUri = - configuration.get( - GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_CLIENT_OAUTH2_SERVER_URI_KEY); - checkAuthConfig( - GravitinoVirtualFileSystemConfiguration.OAUTH2_AUTH_TYPE, - GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_CLIENT_OAUTH2_SERVER_URI_KEY, - authServerUri); - - String credential = - configuration.get( - GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_CLIENT_OAUTH2_CREDENTIAL_KEY); - checkAuthConfig( - GravitinoVirtualFileSystemConfiguration.OAUTH2_AUTH_TYPE, - GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_CLIENT_OAUTH2_CREDENTIAL_KEY, - credential); - - String path = - configuration.get( - GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_CLIENT_OAUTH2_PATH_KEY); - checkAuthConfig( - GravitinoVirtualFileSystemConfiguration.OAUTH2_AUTH_TYPE, - GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_CLIENT_OAUTH2_PATH_KEY, - path); - - String scope = - configuration.get( - GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_CLIENT_OAUTH2_SCOPE_KEY); - checkAuthConfig( - GravitinoVirtualFileSystemConfiguration.OAUTH2_AUTH_TYPE, - GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_CLIENT_OAUTH2_SCOPE_KEY, - scope); - - DefaultOAuth2TokenProvider authDataProvider = - DefaultOAuth2TokenProvider.builder() - .withUri(authServerUri) - .withCredential(credential) - .withPath(path) - .withScope(scope) - .build(); - - return GravitinoClient.builder(serverUri) - .withMetalake(metalakeValue) - .withOAuth(authDataProvider) - .build(); - } else if (authType.equalsIgnoreCase( - GravitinoVirtualFileSystemConfiguration.KERBEROS_AUTH_TYPE)) { - String principal = - configuration.get( - GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_CLIENT_KERBEROS_PRINCIPAL_KEY); - checkAuthConfig( - GravitinoVirtualFileSystemConfiguration.KERBEROS_AUTH_TYPE, - GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_CLIENT_KERBEROS_PRINCIPAL_KEY, - principal); - String keytabFilePath = - configuration.get( - GravitinoVirtualFileSystemConfiguration - .FS_GRAVITINO_CLIENT_KERBEROS_KEYTAB_FILE_PATH_KEY); - KerberosTokenProvider authDataProvider; - if 
(StringUtils.isNotBlank(keytabFilePath)) { - // Using principal and keytab to create auth provider - authDataProvider = - KerberosTokenProvider.builder() - .withClientPrincipal(principal) - .withKeyTabFile(new File(keytabFilePath)) - .build(); - } else { - // Using ticket cache to create auth provider - authDataProvider = KerberosTokenProvider.builder().withClientPrincipal(principal).build(); - } - - return GravitinoClient.builder(serverUri) - .withMetalake(metalakeValue) - .withKerberosAuth(authDataProvider) - .build(); - } else { - throw new IllegalArgumentException( - String.format( - "Unsupported authentication type: %s for %s.", - authType, GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_CLIENT_AUTH_TYPE_KEY)); - } - } - - private void checkAuthConfig(String authType, String configKey, String configValue) { - Preconditions.checkArgument( - StringUtils.isNotBlank(configValue), - "%s should not be null if %s is set to %s.", - configKey, - GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_CLIENT_AUTH_TYPE_KEY, - authType); - } - private String getVirtualLocation(NameIdentifier identifier, boolean withScheme) { return String.format( "%s/%s/%s/%s", diff --git a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/Gvfs.java b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/Gvfs.java index 4d2cbf03e98..37afac71afb 100644 --- a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/Gvfs.java +++ b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/Gvfs.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.net.URI; import java.net.URISyntaxException; +import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemConfiguration; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.DelegateToFileSystem; diff --git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/FileSystemTestUtils.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/FileSystemTestUtils.java index 9c3fdf86137..7dc20c92c52 100644 --- a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/FileSystemTestUtils.java +++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/FileSystemTestUtils.java @@ -21,6 +21,7 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; import java.util.UUID; +import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemConfiguration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; diff --git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestGvfsBase.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestGvfsBase.java index 16fb4e1282c..e31c2b57e67 100644 --- a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestGvfsBase.java +++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestGvfsBase.java @@ -41,6 +41,7 @@ import java.util.concurrent.TimeUnit; import org.apache.gravitino.NameIdentifier; import org.apache.gravitino.dto.responses.FileLocationResponse; +import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemConfiguration; import org.apache.gravitino.rest.RESTUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; diff --git 
a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestKerberosClient.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestKerberosClient.java index 564b05cee72..1bf3d495c38 100644 --- a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestKerberosClient.java +++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestKerberosClient.java @@ -29,6 +29,7 @@ import java.util.List; import java.util.UUID; import org.apache.gravitino.Config; +import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemConfiguration; import org.apache.gravitino.server.authentication.KerberosAuthenticator; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; diff --git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestOauth2Client.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestOauth2Client.java index 2186f530673..c9479790757 100644 --- a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestOauth2Client.java +++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestOauth2Client.java @@ -51,6 +51,7 @@ import org.apache.gravitino.exceptions.BadRequestException; import org.apache.gravitino.exceptions.RESTException; import org.apache.gravitino.exceptions.UnauthorizedException; +import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemConfiguration; import org.apache.gravitino.json.JsonUtils; import org.apache.gravitino.rest.RESTUtils; import org.apache.gravitino.server.authentication.OAuthConfig; diff --git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestSimpleClient.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestSimpleClient.java index b88fbba16b4..fd724c35951 100644 --- a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestSimpleClient.java +++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestSimpleClient.java @@ -31,6 +31,7 @@ import org.apache.gravitino.dto.AuditDTO; import org.apache.gravitino.dto.MetalakeDTO; import org.apache.gravitino.dto.responses.MetalakeResponse; +import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemConfiguration; import org.apache.gravitino.json.JsonUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; diff --git a/settings.gradle.kts b/settings.gradle.kts index c865e14e7a2..e1bac291729 100644 --- a/settings.gradle.kts +++ b/settings.gradle.kts @@ -48,7 +48,8 @@ include( "clients:filesystem-hadoop3", "clients:filesystem-hadoop3-runtime", "clients:client-python", - "clients:cli" + "clients:cli", + "clients:filesystem-hadoop3-common" ) if (gradle.startParameter.projectProperties["enableFuse"]?.toBoolean() == true) { include("clients:filesystem-fuse") From b7eb62109c1efc7505fcd00ce8875284b17bf21d Mon Sep 17 00:00:00 2001 From: yuqi Date: Fri, 3 Jan 2025 19:35:02 +0800 Subject: [PATCH 23/59] fix --- docs/cloud-storage-fileset-example.md | 685 -------------------------- docs/hadoop-catalog-with-adls.md | 357 ++++++++++++++ docs/hadoop-catalog-with-gcs.md | 345 +++++++++++++ docs/hadoop-catalog-with-oss.md | 368 ++++++++++++++ docs/hadoop-catalog-with-s3.md | 372 ++++++++++++++ docs/hadoop-catalog.md | 4 +- docs/how-to-use-gvfs.md | 15 +- 7 files changed, 1445 insertions(+), 701 
deletions(-) delete mode 100644 docs/cloud-storage-fileset-example.md create mode 100644 docs/hadoop-catalog-with-adls.md create mode 100644 docs/hadoop-catalog-with-gcs.md create mode 100644 docs/hadoop-catalog-with-oss.md create mode 100644 docs/hadoop-catalog-with-s3.md diff --git a/docs/cloud-storage-fileset-example.md b/docs/cloud-storage-fileset-example.md deleted file mode 100644 index 71e76c73a6d..00000000000 --- a/docs/cloud-storage-fileset-example.md +++ /dev/null @@ -1,685 +0,0 @@ ---- -title: "How to use cloud storage fileset" -slug: /how-to-use-cloud-storage-fileset -keyword: fileset S3 GCS ADLS OSS -license: "This software is licensed under the Apache License version 2." ---- - -This document aims to provide a comprehensive guide on how to use cloud storage fileset created by Gravitino, it usually contains the following sections. - -## Necessary steps in Gravitino server - -### Start up Gravitino server - -Before running the Gravitino server, you need to put the following jars into the fileset catalog classpath located at `${GRAVITINO_HOME}/catalogs/hadoop/libs`. For example, if you are using S3, you need to put `gravitino-aws-hadoop-bundle-{gravitino-version}.jar` into the fileset catalog classpath in `${GRAVITINO_HOME}/catalogs/hadoop/libs`. - -| Storage type | Description | Jar file | Since Version | -|--------------|---------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------|------------------| -| Local | The local file system. | (none) | 0.5.0 | -| HDFS | HDFS file system. | (none) | 0.5.0 | -| S3 | AWS S3. | [gravitino-aws-hadoop-bundle](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-hadoop-aws-bundle) | 0.8.0-incubating | -| GCS | Google Cloud Storage. | [gravitino-gcp-hadoop-bundle](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-hadoop-gcp-bundle) | 0.8.0-incubating | -| OSS | Aliyun OSS. | [gravitino-aliyun-hadoop-bundle](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-hadoop-aliyun-bundle) | 0.8.0-incubating | -| ABS | Azure Blob Storage (aka. ABS, or Azure Data Lake Storage (v2) | [gravitino-azure-hadoop-bundle](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-hadoop-azure-bundle) | 0.8.0-incubating | - -After adding the jars into the fileset catalog classpath, you can start up the Gravitino server by running the following command: - -```shell -cd ${GRAVITINO_HOME} -bin/gravitino.sh start -``` - -### Bundle jars - -Gravitino bundles jars are used to access the cloud storage. They are divided into two categories: - -- `gravitino-${aws,gcp,aliyun,azure}-bundle-{gravitino-version}.jar` are the jars that contain all the necessary dependencies to access the corresponding cloud storages. For instance, `gravitino-aws-bundle-${gravitino-version}.jar` contains the all necessary classes including `hadoop-common`(hadoop-3.3.1) and `hadoop-aws` to access the S3 storage. -They are used in the scenario where there is no hadoop environment in the runtime. - -- If there is already hadoop environment in the runtime, you can use the `gravitino-${aws,gcp,aliyun,azure}-${gravitino-version}.jar` that does not contain the cloud storage classes (like hadoop-aws) and hadoop environment. Alternatively, you can manually add the necessary jars to the classpath. 
- -If the Hadoop environment is available, you can use the following jars to access the cloud storage fileset: - -- S3: `gravitino-aws-${gravitino-version}.jar`, `hadoop-aws-${hadoop-version}.jar`, `aws-sdk-java-${version}` and other necessary dependencies -- GCS: `gravitino-gcp-{gravitino-version}.jar`, `gcs-connector-${hadoop-version}`.jar, other necessary dependencies -- OSS: `gravitino-aliyun-{gravitino-version}.jar`, hadoop-aliyun-{hadoop-version}.jar, aliyun-sdk-java-{version} and other necessary dependencies -- ABS: `gravitino-azure-${gravitino-version}.jar`, `hadoop-azure-${hadoop-version}.jar`, and other necessary dependencies - -If there is no Hadoop environment, you can use the following jars to access the cloud storage fileset: - -- S3: `gravitino-aws-bundle-${gravitino-version}.jar` -- GCS: `gravitino-gcp-bundle-${gravitino-version}.jar` -- OSS: `gravitino-aliyun-bundle-${gravitino-version}.jar` -- ABS: `gravitino-azure-bundle-${gravitino-version}.jar` - -For `hadoop-aws-${hadoop-version}.jar`, `hadoop-azure-${hadoop-version}.jar` and `hadoop-aliyun-${hadoop-version}.jar` and related dependencies, you can get them from `${HADOOP_HOME}/share/hadoop/tools/lib/` directory. -For `gcs-connector`, you can download it from the [GCS connector](https://github.com/GoogleCloudDataproc/hadoop-connectors/releases) for hadoop2 or hadoop3. - -If there are some issues, please consider [fill in an issue](https://github.com/apache/gravitino/issues/new/choose). - -## Create fileset catalogs - -Once the Gravitino server is started, you can create the corresponding fileset by the following sentence: - - -### Create a S3 fileset catalog - - - - -```shell -curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ --H "Content-Type: application/json" -d '{ - "name": "catalog", - "type": "FILESET", - "comment": "comment", - "provider": "hadoop", - "properties": { - "location": "s3a://bucket/root", - "s3-access-key-id": "access_key", - "s3-secret-access-key": "secret_key", - "s3-endpoint": "http://s3.ap-northeast-1.amazonaws.com", - "filesystem-providers": "s3" - } -}' http://localhost:8090/api/metalakes/metalake/catalogs -``` - - - - -```java -GravitinoClient gravitinoClient = GravitinoClient - .builder("http://localhost:8090") - .withMetalake("metalake") - .build(); - -s3Properties = ImmutableMap.builder() - .put("location", "s3a://bucket/root") - .put("s3-access-key-id", "access_key") - .put("s3-secret-access-key", "secret_key") - .put("s3-endpoint", "http://s3.ap-northeast-1.amazonaws.com") - .put("filesystem-providers", "s3") - .build(); - -Catalog s3Catalog = gravitinoClient.createCatalog("catalog", - Type.FILESET, - "hadoop", // provider, Gravitino only supports "hadoop" for now. - "This is a S3 fileset catalog", - s3Properties); -// ... - -``` - - - - -```python -gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake") -s3_properties = { - "location": "s3a://bucket/root", - "s3-access-key-id": "access_key" - "s3-secret-access-key": "secret_key", - "s3-endpoint": "http://s3.ap-northeast-1.amazonaws.com" -} - -s3_catalog = gravitino_client.create_catalog(name="catalog", - type=Catalog.Type.FILESET, - provider="hadoop", - comment="This is a S3 fileset catalog", - properties=s3_properties) - -``` - - - - -:::note -The value of location should always start with `s3a` NOT `s3` for AWS S3, for instance, `s3a://bucket/root`. Value like `s3://bucket/root` is not supported due to the limitation of the hadoop-aws library. 
-::: - -### Create a GCS fileset catalog - - - - -```shell -curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ --H "Content-Type: application/json" -d '{ - "name": "catalog", - "type": "FILESET", - "comment": "comment", - "provider": "hadoop", - "properties": { - "location": "gs://bucket/root", - "gcs-service-account-file": "path_of_gcs_service_account_file", - "filesystem-providers": "gcs" - } -}' http://localhost:8090/api/metalakes/metalake/catalogs -``` - - - - -```java -GravitinoClient gravitinoClient = GravitinoClient - .builder("http://localhost:8090") - .withMetalake("metalake") - .build(); - -gcsProperties = ImmutableMap.builder() - .put("location", "gs://bucket/root") - .put("gcs-service-account-file", "path_of_gcs_service_account_file") - .put("filesystem-providers", "gcs") - .build(); - -Catalog gcsCatalog = gravitinoClient.createCatalog("catalog", - Type.FILESET, - "hadoop", // provider, Gravitino only supports "hadoop" for now. - "This is a GCS fileset catalog", - gcsProperties); -// ... - -``` - - - - -```python -gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake") - -gcs_properties = { - "location": "gcs://bucket/root", - "gcs_service_account_file": "path_of_gcs_service_account_file" -} - -s3_catalog = gravitino_client.create_catalog(name="catalog", - type=Catalog.Type.FILESET, - provider="hadoop", - comment="This is a GCS fileset catalog", - properties=gcs_properties) - -``` - - - - -:::note -The prefix of a GCS location should always start with `gs`, for instance, `gs://bucket/root`. -::: - -### Create an OSS fileset catalog - - - - -```shell -curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ --H "Content-Type: application/json" -d '{ - "name": "catalog", - "type": "FILESET", - "comment": "comment", - "provider": "hadoop", - "properties": { - "location": "oss://bucket/root", - "oss-access-key-id": "access_key", - "oss-secret-access-key": "secret_key", - "oss-endpoint": "http://oss-cn-hangzhou.aliyuncs.com", - "filesystem-providers": "oss" - } -}' http://localhost:8090/api/metalakes/metalake/catalogs -``` - - - - -```java -GravitinoClient gravitinoClient = GravitinoClient - .builder("http://localhost:8090") - .withMetalake("metalake") - .build(); - -ossProperties = ImmutableMap.builder() - .put("location", "oss://bucket/root") - .put("oss-access-key-id", "access_key") - .put("oss-secret-access-key", "secret_key") - .put("oss-endpoint", "http://oss-cn-hangzhou.aliyuncs.com") - .put("filesystem-providers", "oss") - .build(); - -Catalog ossProperties = gravitinoClient.createCatalog("catalog", - Type.FILESET, - "hadoop", // provider, Gravitino only supports "hadoop" for now. - "This is a OSS fileset catalog", - ossProperties); -// ... 
- -``` - - - - -```python -gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake") -oss_properties = { - "location": "oss://bucket/root", - "oss-access-key-id": "access_key" - "oss-secret-access-key": "secret_key", - "oss-endpoint": "http://oss-cn-hangzhou.aliyuncs.com" -} - -oss_catalog = gravitino_client.create_catalog(name="catalog", - type=Catalog.Type.FILESET, - provider="hadoop", - comment="This is a OSS fileset catalog", - properties=oss_properties) - -``` - -### Create an ABS (Azure Blob Storage or ADLS) fileset catalog - - - - -```shell -curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ --H "Content-Type: application/json" -d '{ - "name": "catalog", - "type": "FILESET", - "comment": "comment", - "provider": "hadoop", - "properties": { - "location": "abfss://container/root", - "abs-account-name": "The account name of the Azure Blob Storage", - "abs-account-key": "The account key of the Azure Blob Storage", - "filesystem-providers": "abs" - } -}' http://localhost:8090/api/metalakes/metalake/catalogs -``` - - - - -```java -GravitinoClient gravitinoClient = GravitinoClient - .builder("http://localhost:8090") - .withMetalake("metalake") - .build(); - -absProperties = ImmutableMap.builder() - .put("location", "abfss://container/root") - .put("abs-account-name", "The account name of the Azure Blob Storage") - .put("abs-account-key", "The account key of the Azure Blob Storage") - .put("filesystem-providers", "abs") - .build(); - -Catalog gcsCatalog = gravitinoClient.createCatalog("catalog", - Type.FILESET, - "hadoop", // provider, Gravitino only supports "hadoop" for now. - "This is a Azure Blob storage fileset catalog", - absProperties); -// ... - -``` - - - - -```python -gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake") - -abs_properties = { - "location": "gcs://bucket/root", - "abs_account_name": "The account name of the Azure Blob Storage", - "abs_account_key": "The account key of the Azure Blob Storage" -} - -abs_catalog = gravitino_client.create_catalog(name="catalog", - type=Catalog.Type.FILESET, - provider="hadoop", - comment="This is a Azure Blob Storage fileset catalog", - properties=abs_properties) - -``` - - - - -note::: -The prefix of an ABS (Azure Blob Storage or ADLS (v2)) location should always start with `abfss` NOT `abfs`, for instance, `abfss://container/root`. Value like `abfs://container/root` is not supported. -::: - - -## Create fileset schema - -This part is the same for all cloud storage filesets, you can create the schema by the following sentence: - - - - -```shell -curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ --H "Content-Type: application/json" -d '{ - "name": "schema", - "comment": "comment", - "properties": { - "location": "file:///tmp/root/schema" - } -}' http://localhost:8090/api/metalakes/metalake/catalogs/catalog/schemas -``` - - - - -```java -GravitinoClient gravitinoClient = GravitinoClient - .builder("http://localhost:8090") - .withMetalake("metalake") - .build(); - -// Assuming you have just created a Hadoop catalog named `catalog` -Catalog catalog = gravitinoClient.loadCatalog("catalog"); - -SupportsSchemas supportsSchemas = catalog.asSchemas(); - -Map schemaProperties = ImmutableMap.builder() - // Property "location" is optional, if specified all the managed fileset without - // specifying storage location will be stored under this location. 
- .put("location", "file:///tmp/root/schema") - .build(); -Schema schema = supportsSchemas.createSchema("schema", - "This is a schema", - schemaProperties -); -// ... -``` - - - - -You can change `location` value based on the catalog you are using. If the `location` property is specified in the catalog, we can omit it in the schema. - -## Create filesets - -The following sentences can be used to create a fileset in the schema: - - - - -```shell -curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ --H "Content-Type: application/json" -d '{ - "name": "example_fileset", - "comment": "This is an example fileset", - "type": "MANAGED", - "storageLocation": "s3a://bucket/root/schema/example_fileset", - "properties": { - "k1": "v1" - } -}' http://localhost:8090/api/metalakes/metalake/catalogs/catalog/schemas/schema/filesets -``` - - - - -```java -GravitinoClient gravitinoClient = GravitinoClient - .builder("http://localhost:8090") - .withMetalake("metalake") - .build(); - -Catalog catalog = gravitinoClient.loadCatalog("catalog"); -FilesetCatalog filesetCatalog = catalog.asFilesetCatalog(); - -Map propertiesMap = ImmutableMap.builder() - .put("k1", "v1") - .build(); - -filesetCatalog.createFileset( - NameIdentifier.of("schema", "example_fileset"), - "This is an example fileset", - Fileset.Type.MANAGED, - "s3a://bucket/root/schema/example_fileset", - propertiesMap, -); -``` - - - - -```python -gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake") - -catalog: Catalog = gravitino_client.load_catalog(name="catalog") -catalog.as_fileset_catalog().create_fileset(ident=NameIdentifier.of("schema", "example_fileset"), - type=Fileset.Type.MANAGED, - comment="This is an example fileset", - storage_location="s3a://bucket/root/schema/example_fileset", - properties={"k1": "v1"}) -``` - - - - -Similar to schema, the `storageLocation` is optional if you have set the `location` property in the schema or catalog. Please change the value of -`location` as the actual location you want to store the fileset. - -The example above is for S3 fileset, you can replace the `storageLocation` with the actual location of the GCS, OSS, or ABS fileset. 
- - -## Using Spark to access the fileset - -The following code snippet shows how to use **PySpark 3.1.3 with hadoop environment(hadoop 3.2.0)** to access the fileset: - -```python -import logging -from gravitino import NameIdentifier, GravitinoClient, Catalog, Fileset, GravitinoAdminClient -from pyspark.sql import SparkSession -import os - -gravitino_url = "http://localhost:8090" -metalake_name = "test" - -catalog_name = "s3_catalog" -schema_name = "schema" -fileset_name = "example" - -## this is for S3 -os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-aws-{gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-{gravitino-version}-SNAPSHOT.jar,/path/to/hadoop-aws-3.2.0.jar,/path/to/aws-java-sdk-bundle-1.11.375.jar --master local[1] pyspark-shell" -spark = SparkSession.builder - .appName("s3_fielset_test") - .config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs") - .config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem") - .config("spark.hadoop.fs.gravitino.server.uri", "http://localhost:8090") - .config("spark.hadoop.fs.gravitino.client.metalake", "test") - .config("spark.hadoop.s3-access-key-id", os.environ["S3_ACCESS_KEY_ID"]) - .config("spark.hadoop.s3-secret-access-key", os.environ["S3_SECRET_ACCESS_KEY"]) - .config("spark.hadoop.s3-endpoint", "http://s3.ap-northeast-1.amazonaws.com") - .config("spark.driver.memory", "2g") - .config("spark.driver.port", "2048") - .getOrCreate() - -### this is for GCS -os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-gcp-{gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-{gravitino-version}.jar,/path/to/gcs-connector-hadoop3-2.2.22-shaded.jar --master local[1] pyspark-shell" -spark = SparkSession.builder - .appName("s3_fielset_test") - .config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs") - .config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem") - .config("spark.hadoop.fs.gravitino.server.uri", "http://localhost:8090") - .config("spark.hadoop.fs.gravitino.client.metalake", "test") - .config("spark.hadoop.gcs-service-account-file", "/path/to/gcs-service-account-file.json") - .config("spark.driver.memory", "2g") - .config("spark.driver.port", "2048") - .getOrCreate() - -### this is for OSS -os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-aliyun-{gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-{gravitino-version}.jar,/path/to/aliyun-sdk-oss-2.8.3.jar,/path/to/hadoop-aliyun-3.2.0.jar,/path/to/jdom-1.1.jar --master local[1] pyspark-shell" -spark = SparkSession.builder - .appName("s3_fielset_test") - .config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs") - .config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem") - .config("spark.hadoop.fs.gravitino.server.uri", "http://localhost:8090") - .config("spark.hadoop.fs.gravitino.client.metalake", "test") - .config("spark.hadoop.oss-access-key-id", os.environ["OSS_ACCESS_KEY_ID"]) - .config("spark.hadoop.oss-secret-access-key", os.environ["S3_SECRET_ACCESS_KEY"]) - .config("spark.hadoop.oss-endpoint", "https://oss-cn-shanghai.aliyuncs.com") - .config("spark.driver.memory", "2g") - .config("spark.driver.port", "2048") - .getOrCreate() -spark.sparkContext.setLogLevel("DEBUG") - -### this is for ABS -os.environ["PYSPARK_SUBMIT_ARGS"] = 
"--jars /path/to/gravitino-azure-{gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-{gravitino-version}.jar,/path/to/hadoop-azure-3.2.0.jar,/path/to/azure-storage-7.0.0.jar,/path/to/wildfly-openssl-1.0.4.Final.jar --master local[1] pyspark-shell" -spark = SparkSession.builder - .appName("s3_fielset_test") - .config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs") - .config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem") - .config("spark.hadoop.fs.gravitino.server.uri", "http://localhost:8090") - .config("spark.hadoop.fs.gravitino.client.metalake", "test") - .config("spark.hadoop.azure-storage-account-name", "azure_account_name") - .config("spark.hadoop.azure-storage-account-key", "azure_account_name") - .config("spark.hadoop.fs.azure.skipUserGroupMetadataDuringInitialization", "true") - .config("spark.driver.memory", "2g") - .config("spark.driver.port", "2048") - .getOrCreate() - -data = [("Alice", 25), ("Bob", 30), ("Cathy", 45)] -columns = ["Name", "Age"] -spark_df = spark.createDataFrame(data, schema=columns) -gvfs_path = f"gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/people" - -spark_df.coalesce(1).write - .mode("overwrite") - .option("header", "true") - .csv(gvfs_path) - -``` - -If your Spark without Hadoop environment, you can use the following code snippet to access the fileset: - -```python -## replace the env PYSPARK_SUBMIT_ARGS variable in the code above with the following content: -### S3 -os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-aws-bundle-{gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-{gravitino-version}.jar --master local[1] pyspark-shell" -### GCS -os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-gcp-bundle-{gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-{gravitino-version}.jar, --master local[1] pyspark-shell" -### OSS -os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-aliyun-bundle-{gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-{gravitino-version}.jar, --master local[1] pyspark-shell" -#### Azure Blob Storage -os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-azure-bundle-{gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-{gravitino-version}.jar --master local[1] pyspark-shell" -``` - -:::note -In some Spark version, Hadoop environment is needed by the driver, adding the bundle jars with '--jars' may not work, in this case, you should add the jars to the spark classpath directly. -::: - -## Using fileset with hadoop fs command - -The following are examples of how to use the `hadoop fs` command to access the fileset in Hadoop 3.1.3. - -1. 
Adding the following contents to the `${HADOOP_HOME}/etc/hadoop/core-site.xml` file: - -```xml - - fs.AbstractFileSystem.gvfs.impl - org.apache.gravitino.filesystem.hadoop.Gvfs - - - - fs.gvfs.impl - org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem - - - - fs.gravitino.server.uri - http://192.168.50.188:8090 - - - - fs.gravitino.client.metalake - test - - - - - s3-endpoint - http://s3.ap-northeast-1.amazonaws.com - - - s3-access-key-id - access-key - - - s3-secret-access-key - secret-key - - - - - oss-endpoint - https://oss-cn-shanghai.aliyuncs.com - - - oss-access-key-id - access_key - - - oss-secret-access-key - secret_key - - - - - gcs-service-account-file - /path/your-service-account-file.json - - - - - azure-storage-account-name - account_name - - - azure-storage-account-key - account_key - - - -``` - -2. Copy the necessary jars to the `${HADOOP_HOME}/share/hadoop/common/lib` directory. - -Copy the corresponding jars to the `${HADOOP_HOME}/share/hadoop/common/lib` directory. For example, if you are using S3, you need to copy `gravitino-aws-{version}.jar` to the `${HADOOP_HOME}/share/hadoop/common/lib` directory. -then copy hadoop-aws-{version}.jar and related dependencies to the `${HADOOP_HOME}/share/hadoop/tools/lib/` directory. Those jars can be found in the `${HADOOP_HOME}/share/hadoop/tools/lib/` directory, for simple you can add all the jars in the `${HADOOP_HOME}/share/hadoop/tools/lib/` directory to the `${HADOOP_HOME}/share/hadoop/common/lib` directory. - -More detail, please refer to the [Bundle jars](#bundle-jars) section. - - -3. Run the following command to access the fileset: - -```shell -hadoop dfs -ls gvfs://fileset/s3_catalog/schema/example -hadoop dfs -put /path/to/local/file gvfs://fileset/s3_catalog/schema/example -``` - -### Using fileset with pandas - -The following are examples of how to use the pandas library to access the S3 fileset - -```python -import pandas as pd - -storage_options = { - "server_uri": "http://localhost:8090", - "metalake_name": "test", - "options": { - "s3_access_key_id": "access_key", - "s3_secret_access_key": "secret_key", - "s3_endpoint": "http://s3.ap-northeast-1.amazonaws.com" - } -} -ds = pd.read_csv(f"gvfs://fileset/${catalog_name}/${schema_name}/${fileset_name}/people/part-00000-51d366e2-d5eb-448d-9109-32a96c8a14dc-c000.csv", - storage_options=storage_options) -ds.head() -``` - - diff --git a/docs/hadoop-catalog-with-adls.md b/docs/hadoop-catalog-with-adls.md new file mode 100644 index 00000000000..e77d2c80465 --- /dev/null +++ b/docs/hadoop-catalog-with-adls.md @@ -0,0 +1,357 @@ +--- +title: "Hadoop catalog with ADLS" +slug: /hadoop-catalog-with-adls +date: 2025-01-03 +keyword: Hadoop catalog ADLS +license: "This software is licensed under the Apache License version 2." +--- + +This document describes how to configure a Hadoop catalog with ADLS (Azure Blob Storage). + +## Prerequisites + +In order to create a Hadoop catalog with ADLS, you need to place [`gravitino-azure-bundle-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-azure-bundle) in Gravitino Hadoop classpath located +at `${HADOOP_HOME}/share/hadoop/common/lib/`. 
After that, start the Gravitino server with the following command: + +```bash +$ bin/gravitino-server.sh start +``` + +## Create a Hadoop Catalog with ADLS in Gravitino + +### Create a catalog + +Apart from the configuration method in [Hadoop-catalog-catalog-configuration](./hadoop-catalog.md#catalog-properties), the following properties are required to configure a Hadoop catalog with ADLS: + +| Configuration item | Description | Default value | Required | Since version | +|-----------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------|--------------------------------------------|------------------| +| `filesystem-providers` | The file system providers to add. Set it to `abs` if it's an Azure Blob Storage fileset, or a comma-separated string that contains `abs` like `oss,abs,s3` to support multiple kinds of fileset including `abs`. | (none) | Yes | 0.8.0-incubating | +| `default-filesystem-provider` | The name of the default filesystem provider of this Hadoop catalog if users do not specify the scheme in the URI. The default value is `builtin-local`; for Azure Blob Storage, if we set this value, we can omit the prefix 'abfss://' in the location. | `builtin-local` | No | 0.8.0-incubating | +| `azure-storage-account-name` | The account name of Azure Blob Storage. | (none) | Yes if it's an Azure Blob Storage fileset. | 0.8.0-incubating | +| `azure-storage-account-key` | The account key of Azure Blob Storage. | (none) | Yes if it's an Azure Blob Storage fileset. | 0.8.0-incubating | + +### Create a schema + +Refer to [Schema operation](./manage-fileset-metadata-using-gravitino.md#schema-operations) for more details. + +### Create a fileset + +Refer to [Fileset operation](./manage-fileset-metadata-using-gravitino.md#fileset-operations) for more details. + + +## Using Hadoop catalog with ADLS + +### Create a Hadoop catalog/schema/fileset with ADLS + +First, you need to create a Hadoop catalog with ADLS. The following example shows how to create a Hadoop catalog with ADLS: + + + + +```shell +curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ +-H "Content-Type: application/json" -d '{ + "name": "catalog", + "type": "FILESET", + "comment": "comment", + "provider": "hadoop", + "properties": { + "location": "abfss://container@account-name.dfs.core.windows.net/path", + "azure-storage-account-name": "The account name of the Azure Blob Storage", + "azure-storage-account-key": "The account key of the Azure Blob Storage", + "filesystem-providers": "abs" + } +}' http://localhost:8090/api/metalakes/metalake/catalogs +``` + + + + +```java +GravitinoClient gravitinoClient = GravitinoClient + .builder("http://localhost:8090") + .withMetalake("metalake") + .build(); + +Map<String, String> adlsProperties = ImmutableMap.<String, String>builder() + .put("location", "abfss://container@account-name.dfs.core.windows.net/path") + .put("azure-storage-account-name", "azure storage account name") + .put("azure-storage-account-key", "azure storage account key") + .put("filesystem-providers", "abs") + .build(); + +Catalog adlsCatalog = gravitinoClient.createCatalog("catalog", + Type.FILESET, + "hadoop", // provider, Gravitino only supports "hadoop" for now. + "This is an ADLS fileset catalog", + adlsProperties); +// ...
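+// Note: `location` is expected to be an abfss URI of the form +// abfss://<container>@<account-name>.dfs.core.windows.net/<path>, as in the example above.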
+ +``` + + + + +```python +gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake") +adls_properties = { + "location": "abfss://container@account-name.dfs.core.windows.net/path", + "azure-storage-account-name": "azure storage account name", + "azure-storage-account-key": "azure storage account key", + "filesystem-providers": "abs" +} + +adls_catalog = gravitino_client.create_catalog(name="catalog", + type=Catalog.Type.FILESET, + provider="hadoop", + comment="This is an ADLS fileset catalog", + properties=adls_properties) + +``` + + + + +Then create a schema and fileset in the catalog created above. + +Use the following code to create a schema and fileset: + + + + +```shell +curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ +-H "Content-Type: application/json" -d '{ + "name": "schema", + "comment": "comment", + "properties": { + "location": "abfss://container@account-name.dfs.core.windows.net/path" + } +}' http://localhost:8090/api/metalakes/metalake/catalogs/catalog/schemas +``` + + + + +```java +// Assuming you have just created the Hadoop catalog named `catalog` above +Catalog catalog = gravitinoClient.loadCatalog("catalog"); + +SupportsSchemas supportsSchemas = catalog.asSchemas(); + +Map<String, String> schemaProperties = ImmutableMap.<String, String>builder() + .put("location", "abfss://container@account-name.dfs.core.windows.net/path") + .build(); +Schema schema = supportsSchemas.createSchema("schema", + "This is a schema", + schemaProperties +); +// ... +``` + + + + +```python +gravitino_client: GravitinoClient = GravitinoClient(uri="http://127.0.0.1:8090", metalake_name="metalake") +catalog: Catalog = gravitino_client.load_catalog(name="catalog") +catalog.as_schemas().create_schema(name="schema", + comment="This is a schema", + properties={"location": "abfss://container@account-name.dfs.core.windows.net/path"}) +``` + + + + + + + +```shell +curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ +-H "Content-Type: application/json" -d '{ + "name": "example_fileset", + "comment": "This is an example fileset", + "type": "MANAGED", + "storageLocation": "abfss://container@account-name.dfs.core.windows.net/path/example_fileset", + "properties": { + "k1": "v1" + } +}' http://localhost:8090/api/metalakes/metalake/catalogs/catalog/schemas/schema/filesets +``` + + + + +```java +GravitinoClient gravitinoClient = GravitinoClient + .builder("http://localhost:8090") + .withMetalake("metalake") + .build(); + +Catalog catalog = gravitinoClient.loadCatalog("catalog"); +FilesetCatalog filesetCatalog = catalog.asFilesetCatalog(); + +Map<String, String> propertiesMap = ImmutableMap.<String, String>builder() + .put("k1", "v1") + .build(); + +filesetCatalog.createFileset( + NameIdentifier.of("schema", "example_fileset"), + "This is an example fileset", + Fileset.Type.MANAGED, + "abfss://container@account-name.dfs.core.windows.net/path/example_fileset", + propertiesMap); +``` + + + + +```python +gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake") + +catalog: Catalog = gravitino_client.load_catalog(name="catalog") +catalog.as_fileset_catalog().create_fileset(ident=NameIdentifier.of("schema", "example_fileset"), + type=Fileset.Type.MANAGED, + comment="This is an example fileset", + storage_location="abfss://container@account-name.dfs.core.windows.net/path/example_fileset", + properties={"k1": "v1"}) +``` + + + + +## Using Spark to access the fileset + +The following code snippet shows how to use **PySpark 3.1.3 with a Hadoop environment (Hadoop 3.2.0)** to access the
fileset: + +```python +import logging +from gravitino import NameIdentifier, GravitinoClient, Catalog, Fileset, GravitinoAdminClient +from pyspark.sql import SparkSession +import os + +gravitino_url = "http://localhost:8090" +metalake_name = "test" + +catalog_name = "your_adls_catalog" +schema_name = "your_adls_schema" +fileset_name = "your_adls_fileset" + +os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-azure-{gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-{gravitino-version}.jar,/path/to/hadoop-azure-3.2.0.jar,/path/to/azure-storage-7.0.0.jar,/path/to/wildfly-openssl-1.0.4.Final.jar --master local[1] pyspark-shell" +spark = SparkSession.builder \ +.appName("adls_fileset_test") \ +.config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs") \ +.config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem") \ +.config("spark.hadoop.fs.gravitino.server.uri", "http://localhost:8090") \ +.config("spark.hadoop.fs.gravitino.client.metalake", "test") \ +.config("spark.hadoop.azure-storage-account-name", "azure_account_name") \ +.config("spark.hadoop.azure-storage-account-key", "azure_account_key") \ +.config("spark.hadoop.fs.azure.skipUserGroupMetadataDuringInitialization", "true") \ +.config("spark.driver.memory", "2g") \ +.config("spark.driver.port", "2048") \ +.getOrCreate() + +data = [("Alice", 25), ("Bob", 30), ("Cathy", 45)] +columns = ["Name", "Age"] +spark_df = spark.createDataFrame(data, schema=columns) +gvfs_path = f"gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/people" + +spark_df.coalesce(1).write \ +.mode("overwrite") \ +.option("header", "true") \ +.csv(gvfs_path) +``` + +If your Spark is deployed **without a Hadoop environment**, you can use the following code snippet to access the fileset: + +```python +## Replace the PYSPARK_SUBMIT_ARGS in the snippet above with the following; everything else stays the same + +os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-azure-bundle-{gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-{gravitino-version}.jar --master local[1] pyspark-shell" +``` + +- [`gravitino-azure-bundle-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-azure-bundle) is the Gravitino ADLS jar with the Hadoop environment and the `hadoop-azure` jar. +- [`gravitino-azure-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-azure) is the Gravitino ADLS jar without the Hadoop environment and the `hadoop-azure` jar. + +Please choose the correct jar according to your environment. + +:::note +In some Spark versions, the Hadoop environment is needed by the driver, so adding the bundle jars with `--jars` may not work. In this case, you should add the jars to the Spark classpath directly. +::: + +## Using fileset with hadoop fs command + +The following are examples of how to use the `hadoop fs` command to access the fileset in Hadoop 3.1.3. + +1. Add the following contents to the `${HADOOP_HOME}/etc/hadoop/core-site.xml` file: + +```xml +<property> + <name>fs.AbstractFileSystem.gvfs.impl</name> + <value>org.apache.gravitino.filesystem.hadoop.Gvfs</value> +</property> + +<property> + <name>fs.gvfs.impl</name> + <value>org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem</value> +</property> + +<property> + <name>fs.gravitino.server.uri</name> + <value>http://192.168.50.188:8090</value> +</property> + +<property> + <name>fs.gravitino.client.metalake</name> + <value>test</value> +</property> + +<property> + <name>azure-storage-account-name</name> + <value>account_name</value> +</property> + +<property> + <name>azure-storage-account-key</name> + <value>account_key</value> +</property> +``` + +2. Copy the necessary jars to the `${HADOOP_HOME}/share/hadoop/common/lib` directory.
+ +Copy the corresponding jars to the `${HADOOP_HOME}/share/hadoop/common/lib` directory. For ADLS, you need to copy `gravitino-azure-{version}.jar` to the `${HADOOP_HOME}/share/hadoop/common/lib` directory, +then copy `hadoop-azure-${version}.jar` and related dependencies from the `${HADOOP_HOME}/share/hadoop/tools/lib/` directory to the `${HADOOP_HOME}/share/hadoop/common/lib` directory; for simplicity, you can add all the jars in `${HADOOP_HOME}/share/hadoop/tools/lib/` to `${HADOOP_HOME}/share/hadoop/common/lib`. + + +3. Run the following command to access the fileset: + +```shell +hadoop dfs -ls gvfs://fileset/adls_catalog/schema/example +hadoop dfs -put /path/to/local/file gvfs://fileset/adls_catalog/schema/example +``` + +### Using fileset with pandas + +The following are examples of how to use the pandas library to access the ADLS fileset: + +```python +import pandas as pd + +storage_options = { + "server_uri": "http://localhost:8090", + "metalake_name": "test", + "options": { + "azure_storage_account_name": "azure_account_name", + "azure_storage_account_key": "azure_account_key" + } +} +ds = pd.read_csv(f"gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/people/part-00000-51d366e2-d5eb-448d-9109-32a96c8a14dc-c000.csv", + storage_options=storage_options) +ds.head() +``` + +## Fileset with credential + +If the catalog has been configured with credential vending, you can access an ADLS fileset without setting `azure-storage-account-name` and `azure-storage-account-key` in the properties via GVFS. More details can be found [here](./security/credential-vending.md#adls-credentials). + + + diff --git a/docs/hadoop-catalog-with-gcs.md b/docs/hadoop-catalog-with-gcs.md new file mode 100644 index 00000000000..6dc6bb1c732 --- /dev/null +++ b/docs/hadoop-catalog-with-gcs.md @@ -0,0 +1,345 @@ +--- +title: "Hadoop catalog with GCS" +slug: /hadoop-catalog-with-gcs +date: 2025-01-03 +keyword: Hadoop catalog GCS +license: "This software is licensed under the Apache License version 2." +--- + +This document describes how to configure a Hadoop catalog with GCS. + +## Prerequisites + +In order to create a Hadoop catalog with GCS, you need to place [`gravitino-gcp-bundle-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-gcp-bundle) in the Gravitino Hadoop classpath located +at `${HADOOP_HOME}/share/hadoop/common/lib/`. After that, start the Gravitino server with the following command: + +```bash +$ bin/gravitino-server.sh start +``` + +## Create a Hadoop Catalog with GCS in Gravitino + +### Create a catalog + +Apart from the configuration method in [Hadoop-catalog-catalog-configuration](./hadoop-catalog.md#catalog-properties), the following properties are required to configure a Hadoop catalog with GCS: + +| Configuration item | Description | Default value | Required | Since version | +|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------|----------------------------|------------------| +| `filesystem-providers` | The file system providers to add. Set it to `gs` if it's a GCS fileset, or a comma-separated string that contains `gs` like `gs,s3` to support multiple kinds of fileset including `gs`.
| (none) | Yes | 0.7.0-incubating | +| `default-filesystem-provider` | The name of the default filesystem provider of this Hadoop catalog if users do not specify the scheme in the URI. The default value is `builtin-local`; for GCS, if we set this value, we can omit the prefix 'gs://' in the location. | `builtin-local` | No | 0.7.0-incubating | +| `gcs-service-account-file` | The path of the GCS service account JSON file. | (none) | Yes if it's a GCS fileset. | 0.7.0-incubating | + +### Create a schema + +Refer to [Schema operation](./manage-fileset-metadata-using-gravitino.md#schema-operations) for more details. + +### Create a fileset + +Refer to [Fileset operation](./manage-fileset-metadata-using-gravitino.md#fileset-operations) for more details. + + +## Using Hadoop catalog with GCS + +### Create a Hadoop catalog/schema/fileset with GCS + +First, you need to create a Hadoop catalog with GCS. The following example shows how to create a Hadoop catalog with GCS: + + + + +```shell +curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ +-H "Content-Type: application/json" -d '{ + "name": "catalog", + "type": "FILESET", + "comment": "comment", + "provider": "hadoop", + "properties": { + "location": "gs://bucket/root", + "gcs-service-account-file": "path_of_gcs_service_account_file", + "filesystem-providers": "gcs" + } +}' http://localhost:8090/api/metalakes/metalake/catalogs +``` + + + + +```java +GravitinoClient gravitinoClient = GravitinoClient + .builder("http://localhost:8090") + .withMetalake("metalake") + .build(); + +Map<String, String> gcsProperties = ImmutableMap.<String, String>builder() + .put("location", "gs://bucket/root") + .put("gcs-service-account-file", "path_of_gcs_service_account_file") + .put("filesystem-providers", "gcs") + .build(); + +Catalog gcsCatalog = gravitinoClient.createCatalog("catalog", + Type.FILESET, + "hadoop", // provider, Gravitino only supports "hadoop" for now. + "This is a GCS fileset catalog", + gcsProperties); +// ... + +``` + + + + +```python +gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake") +gcs_properties = { + "location": "gs://bucket/root", + "gcs-service-account-file": "path_of_gcs_service_account_file" +} + +gcs_catalog = gravitino_client.create_catalog(name="catalog", + type=Catalog.Type.FILESET, + provider="hadoop", + comment="This is a GCS fileset catalog", + properties=gcs_properties) + +``` + + + + +Then create a schema and fileset in the catalog created above. + +Use the following code to create a schema and fileset: + + + + +```shell +curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ +-H "Content-Type: application/json" -d '{ + "name": "schema", + "comment": "comment", + "properties": { + "location": "gs://bucket/root/schema" + } +}' http://localhost:8090/api/metalakes/metalake/catalogs/catalog/schemas +``` + + + + +```java +// Assuming you have just created the Hadoop catalog named `catalog` above +Catalog catalog = gravitinoClient.loadCatalog("catalog"); + +SupportsSchemas supportsSchemas = catalog.asSchemas(); + +Map<String, String> schemaProperties = ImmutableMap.<String, String>builder() + .put("location", "gs://bucket/root/schema") + .build(); +Schema schema = supportsSchemas.createSchema("schema", + "This is a schema", + schemaProperties +); +// ...
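+// If the catalog above already sets `location`, the schema-level `location` property +// can be omitted and the schema is created under the catalog's location.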
+``` + + + + +```python +gravitino_client: GravitinoClient = GravitinoClient(uri="http://127.0.0.1:8090", metalake_name="metalake") +catalog: Catalog = gravitino_client.load_catalog(name="catalog") +catalog.as_schemas().create_schema(name="schema", + comment="This is a schema", + properties={"location": "gs://bucket/root/schema"}) +``` + + + + + + + +```shell +curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ +-H "Content-Type: application/json" -d '{ + "name": "example_fileset", + "comment": "This is an example fileset", + "type": "MANAGED", + "storageLocation": "gs://bucket/root/schema/example_fileset", + "properties": { + "k1": "v1" + } +}' http://localhost:8090/api/metalakes/metalake/catalogs/catalog/schemas/schema/filesets +``` + + + + +```java +GravitinoClient gravitinoClient = GravitinoClient + .builder("http://localhost:8090") + .withMetalake("metalake") + .build(); + +Catalog catalog = gravitinoClient.loadCatalog("catalog"); +FilesetCatalog filesetCatalog = catalog.asFilesetCatalog(); + +Map<String, String> propertiesMap = ImmutableMap.<String, String>builder() + .put("k1", "v1") + .build(); + +filesetCatalog.createFileset( + NameIdentifier.of("schema", "example_fileset"), + "This is an example fileset", + Fileset.Type.MANAGED, + "gs://bucket/root/schema/example_fileset", + propertiesMap); +``` + + + + +```python +gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake") + +catalog: Catalog = gravitino_client.load_catalog(name="catalog") +catalog.as_fileset_catalog().create_fileset(ident=NameIdentifier.of("schema", "example_fileset"), + type=Fileset.Type.MANAGED, + comment="This is an example fileset", + storage_location="gs://bucket/root/schema/example_fileset", + properties={"k1": "v1"}) +``` + + + + +## Using Spark to access the fileset + +The following code snippet shows how to use **PySpark 3.1.3 with a Hadoop environment (Hadoop 3.2.0)** to access the fileset: + +```python +import logging +from gravitino import NameIdentifier, GravitinoClient, Catalog, Fileset, GravitinoAdminClient +from pyspark.sql import SparkSession +import os + +gravitino_url = "http://localhost:8090" +metalake_name = "test" + +catalog_name = "your_gcs_catalog" +schema_name = "your_gcs_schema" +fileset_name = "your_gcs_fileset" + +os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-gcp-{gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-{gravitino-version}.jar,/path/to/gcs-connector-hadoop3-2.2.22-shaded.jar --master local[1] pyspark-shell" +spark = SparkSession.builder \ +.appName("gcs_fileset_test") \ +.config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs") \ +.config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem") \ +.config("spark.hadoop.fs.gravitino.server.uri", "http://localhost:8090") \ +.config("spark.hadoop.fs.gravitino.client.metalake", "test") \ +.config("spark.hadoop.gcs-service-account-file", "/path/to/gcs-service-account-file.json") \ +.config("spark.driver.memory", "2g") \ +.config("spark.driver.port", "2048") \ +.getOrCreate() + +data = [("Alice", 25), ("Bob", 30), ("Cathy", 45)] +columns = ["Name", "Age"] +spark_df = spark.createDataFrame(data, schema=columns) +gvfs_path = f"gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/people" + +spark_df.coalesce(1).write \ +.mode("overwrite") \ +.option("header", "true") \ +.csv(gvfs_path) +``` + +If your Spark is deployed **without a Hadoop environment**, you can use the following code snippet to access the fileset: +
+```python +## Replace the PYSPARK_SUBMIT_ARGS in the snippet above with the following; everything else stays the same + +os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-gcp-bundle-{gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-{gravitino-version}.jar --master local[1] pyspark-shell" +``` + +- [`gravitino-gcp-bundle-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-gcp-bundle) is the Gravitino GCS jar with the Hadoop environment and the `gcs-connector` jar. +- [`gravitino-gcp-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-gcp) is the Gravitino GCS jar without the Hadoop environment and the `gcs-connector` jar. + +Please choose the correct jar according to your environment. + +:::note +In some Spark versions, the Hadoop environment is needed by the driver, so adding the bundle jars with `--jars` may not work. In this case, you should add the jars to the Spark classpath directly. +::: + +## Using fileset with hadoop fs command + +The following are examples of how to use the `hadoop fs` command to access the fileset in Hadoop 3.1.3. + +1. Add the following contents to the `${HADOOP_HOME}/etc/hadoop/core-site.xml` file: + +```xml +<property> + <name>fs.AbstractFileSystem.gvfs.impl</name> + <value>org.apache.gravitino.filesystem.hadoop.Gvfs</value> +</property> + +<property> + <name>fs.gvfs.impl</name> + <value>org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem</value> +</property> + +<property> + <name>fs.gravitino.server.uri</name> + <value>http://192.168.50.188:8090</value> +</property> + +<property> + <name>fs.gravitino.client.metalake</name> + <value>test</value> +</property> + +<property> + <name>gcs-service-account-file</name> + <value>/path/your-service-account-file.json</value> +</property> +``` + +2. Copy the necessary jars to the `${HADOOP_HOME}/share/hadoop/common/lib` directory. + +Copy the corresponding jars to the `${HADOOP_HOME}/share/hadoop/common/lib` directory. For GCS, you need to copy `gravitino-gcp-{version}.jar` to the `${HADOOP_HOME}/share/hadoop/common/lib` directory, +then copy the GCS connector jar (for example, the `gcs-connector-hadoop3-2.2.22-shaded.jar` used in the Spark example above) and related dependencies to the `${HADOOP_HOME}/share/hadoop/common/lib` directory as well. + + +3. Run the following command to access the fileset: + +```shell +hadoop dfs -ls gvfs://fileset/gcs_catalog/schema/example +hadoop dfs -put /path/to/local/file gvfs://fileset/gcs_catalog/schema/example +``` + +## Using fileset with pandas + +The following are examples of how to use the pandas library to access the GCS fileset: + +```python +import pandas as pd + +storage_options = { + "server_uri": "http://localhost:8090", + "metalake_name": "test", + "options": { + "gcs_service_account_file": "path_of_gcs_service_account_file.json", + } +} +ds = pd.read_csv(f"gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/people/part-00000-51d366e2-d5eb-448d-9109-32a96c8a14dc-c000.csv", + storage_options=storage_options) +ds.head() +``` + + +## Fileset with credential + +If the catalog has been configured with credential vending, you can access a GCS fileset without setting `gcs-service-account-file` in the properties via GVFS. More details can be found [here](./security/credential-vending.md#gcs-credentials).
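+ +For example, a minimal sketch with the Python GVFS client (assuming credential vending is enabled on the catalog; the catalog, schema, and fileset names below are placeholders, and the `GravitinoVirtualFileSystem` constructor is the one described in [How to use GVFS](./how-to-use-gvfs.md)): + +```python +from gravitino import gvfs + +# No gcs-service-account-file is passed here; the server vends a credential instead. +options = {"auth_type": "simple"} +fs = gvfs.GravitinoVirtualFileSystem(server_uri="http://localhost:8090", + metalake_name="test", + options=options) +fs.ls("gvfs://fileset/gcs_catalog/schema/example") +```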
+ - diff --git a/docs/hadoop-catalog-with-oss.md b/docs/hadoop-catalog-with-oss.md new file mode 100644 index 00000000000..c968901e03a --- /dev/null +++ b/docs/hadoop-catalog-with-oss.md @@ -0,0 +1,368 @@ +--- +title: "Hadoop catalog with OSS" +slug: /hadoop-catalog-with-oss +date: 2025-01-03 +keyword: Hadoop catalog OSS +license: "This software is licensed under the Apache License version 2." +--- + +This document describes how to configure a Hadoop catalog with Aliyun OSS. + +## Prerequisites + +In order to create a Hadoop catalog with OSS, you need to place [`gravitino-aliyun-bundle-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-aliyun-bundle) in the Gravitino Hadoop classpath located +at `${HADOOP_HOME}/share/hadoop/common/lib/`. After that, start the Gravitino server with the following command: + +```bash +$ bin/gravitino-server.sh start +``` + +## Create a Hadoop Catalog with OSS in Gravitino + +### Create a catalog + +Apart from the configuration method in [Hadoop-catalog-catalog-configuration](./hadoop-catalog.md#catalog-properties), the following properties are required to configure a Hadoop catalog with OSS: + +| Configuration item | Description | Default value | Required | Since version | +|-------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------|-----------------------------|------------------| +| `filesystem-providers` | The file system providers to add. Set it to `oss` if it's an OSS fileset, or a comma-separated string that contains `oss` like `oss,gs,s3` to support multiple kinds of fileset including `oss`. | (none) | Yes | 0.7.0-incubating | +| `default-filesystem-provider` | The name of the default filesystem provider of this Hadoop catalog if users do not specify the scheme in the URI. The default value is `builtin-local`; for OSS, if we set this value, we can omit the prefix 'oss://' in the location. | `builtin-local` | No | 0.7.0-incubating | +| `oss-endpoint` | The endpoint of the Aliyun OSS. | (none) | Yes if it's an OSS fileset. | 0.7.0-incubating | +| `oss-access-key-id` | The access key of the Aliyun OSS. | (none) | Yes if it's an OSS fileset. | 0.7.0-incubating | +| `oss-secret-access-key` | The secret key of the Aliyun OSS. | (none) | Yes if it's an OSS fileset. | 0.7.0-incubating | + +### Create a schema + +Refer to [Schema operation](./manage-fileset-metadata-using-gravitino.md#schema-operations) for more details. + +### Create a fileset + +Refer to [Fileset operation](./manage-fileset-metadata-using-gravitino.md#fileset-operations) for more details. + + +## Using Hadoop catalog with OSS + +### Create a Hadoop catalog/schema/fileset with OSS + +First, you need to create a Hadoop catalog with OSS.
The following example shows how to create a Hadoop catalog with OSS: + + + + +```shell +curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ +-H "Content-Type: application/json" -d '{ + "name": "catalog", + "type": "FILESET", + "comment": "comment", + "provider": "hadoop", + "properties": { + "location": "oss://bucket/root", + "oss-access-key-id": "access_key", + "oss-secret-access-key": "secret_key", + "oss-endpoint": "http://oss-cn-hangzhou.aliyuncs.com", + "filesystem-providers": "oss" + } +}' http://localhost:8090/api/metalakes/metalake/catalogs +``` + + + + +```java +GravitinoClient gravitinoClient = GravitinoClient + .builder("http://localhost:8090") + .withMetalake("metalake") + .build(); + +Map<String, String> ossProperties = ImmutableMap.<String, String>builder() + .put("location", "oss://bucket/root") + .put("oss-access-key-id", "access_key") + .put("oss-secret-access-key", "secret_key") + .put("oss-endpoint", "http://oss-cn-hangzhou.aliyuncs.com") + .put("filesystem-providers", "oss") + .build(); + +Catalog ossCatalog = gravitinoClient.createCatalog("catalog", + Type.FILESET, + "hadoop", // provider, Gravitino only supports "hadoop" for now. + "This is an OSS fileset catalog", + ossProperties); +// ... + +``` + + + + +```python +gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake") +oss_properties = { + "location": "oss://bucket/root", + "oss-access-key-id": "access_key", + "oss-secret-access-key": "secret_key", + "oss-endpoint": "http://oss-cn-hangzhou.aliyuncs.com" +} + +oss_catalog = gravitino_client.create_catalog(name="catalog", + type=Catalog.Type.FILESET, + provider="hadoop", + comment="This is an OSS fileset catalog", + properties=oss_properties) + +``` + + + + +Then create a schema and fileset in the catalog created above. + +Use the following code to create a schema and fileset: + + + + +```shell +curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ +-H "Content-Type: application/json" -d '{ + "name": "schema", + "comment": "comment", + "properties": { + "location": "oss://bucket/root/schema" + } +}' http://localhost:8090/api/metalakes/metalake/catalogs/catalog/schemas +``` + + + + +```java +// Assuming you have just created the Hadoop catalog named `catalog` above +Catalog catalog = gravitinoClient.loadCatalog("catalog"); + +SupportsSchemas supportsSchemas = catalog.asSchemas(); + +Map<String, String> schemaProperties = ImmutableMap.<String, String>builder() + .put("location", "oss://bucket/root/schema") + .build(); +Schema schema = supportsSchemas.createSchema("schema", + "This is a schema", + schemaProperties +); +// ...
+``` + + + + +```python +gravitino_client: GravitinoClient = GravitinoClient(uri="http://127.0.0.1:8090", metalake_name="metalake") +catalog: Catalog = gravitino_client.load_catalog(name="catalog") +catalog.as_schemas().create_schema(name="schema", + comment="This is a schema", + properties={"location": "oss://bucket/root/schema"}) +``` + + + + + + + +```shell +curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ +-H "Content-Type: application/json" -d '{ + "name": "example_fileset", + "comment": "This is an example fileset", + "type": "MANAGED", + "storageLocation": "oss://bucket/root/schema/example_fileset", + "properties": { + "k1": "v1" + } +}' http://localhost:8090/api/metalakes/metalake/catalogs/catalog/schemas/schema/filesets +``` + + + + +```java +GravitinoClient gravitinoClient = GravitinoClient + .builder("http://localhost:8090") + .withMetalake("metalake") + .build(); + +Catalog catalog = gravitinoClient.loadCatalog("catalog"); +FilesetCatalog filesetCatalog = catalog.asFilesetCatalog(); + +Map<String, String> propertiesMap = ImmutableMap.<String, String>builder() + .put("k1", "v1") + .build(); + +filesetCatalog.createFileset( + NameIdentifier.of("schema", "example_fileset"), + "This is an example fileset", + Fileset.Type.MANAGED, + "oss://bucket/root/schema/example_fileset", + propertiesMap); +``` + + + + +```python +gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake") + +catalog: Catalog = gravitino_client.load_catalog(name="catalog") +catalog.as_fileset_catalog().create_fileset(ident=NameIdentifier.of("schema", "example_fileset"), + type=Fileset.Type.MANAGED, + comment="This is an example fileset", + storage_location="oss://bucket/root/schema/example_fileset", + properties={"k1": "v1"}) +``` + + + + +## Using Spark to access the fileset + +The following code snippet shows how to use **PySpark 3.1.3 with a Hadoop environment (Hadoop 3.2.0)** to access the fileset: + +```python +import logging +from gravitino import NameIdentifier, GravitinoClient, Catalog, Fileset, GravitinoAdminClient +from pyspark.sql import SparkSession +import os + +gravitino_url = "http://localhost:8090" +metalake_name = "test" + +catalog_name = "your_oss_catalog" +schema_name = "your_oss_schema" +fileset_name = "your_oss_fileset" + +os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-aliyun-{gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-{gravitino-version}.jar,/path/to/aliyun-sdk-oss-2.8.3.jar,/path/to/hadoop-aliyun-3.2.0.jar,/path/to/jdom-1.1.jar --master local[1] pyspark-shell" +spark = SparkSession.builder \ +.appName("oss_fileset_test") \ +.config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs") \ +.config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem") \ +.config("spark.hadoop.fs.gravitino.server.uri", "http://localhost:8090") \ +.config("spark.hadoop.fs.gravitino.client.metalake", "test") \ +.config("spark.hadoop.oss-access-key-id", os.environ["OSS_ACCESS_KEY_ID"]) \ +.config("spark.hadoop.oss-secret-access-key", os.environ["OSS_SECRET_ACCESS_KEY"]) \ +.config("spark.hadoop.oss-endpoint", "http://oss-cn-hangzhou.aliyuncs.com") \ +.config("spark.driver.memory", "2g") \ +.config("spark.driver.port", "2048") \ +.getOrCreate() + +data = [("Alice", 25), ("Bob", 30), ("Cathy", 45)] +columns = ["Name", "Age"] +spark_df = spark.createDataFrame(data, schema=columns) +gvfs_path = f"gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/people" +
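+# The gvfs path is a virtual path; Gravitino maps it to the fileset's actual storage +# location, e.g. oss://bucket/root/schema/your_oss_fileset/people for a MANAGED fileset.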
+spark_df.coalesce(1).write \ +.mode("overwrite") \ +.option("header", "true") \ +.csv(gvfs_path) +``` + +If your Spark is deployed **without a Hadoop environment**, you can use the following code snippet to access the fileset: + +```python +## Replace the PYSPARK_SUBMIT_ARGS in the snippet above with the following; everything else stays the same + +os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-aliyun-bundle-{gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-{gravitino-version}.jar --master local[1] pyspark-shell" +``` + +- [`gravitino-aliyun-bundle-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-aliyun-bundle) is the Gravitino Aliyun jar with the Hadoop environment and the `hadoop-aliyun` jar. +- [`gravitino-aliyun-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-aliyun) is the Gravitino OSS jar without the Hadoop environment and the `hadoop-aliyun` jar. + +Please choose the correct jar according to your environment. + +:::note +In some Spark versions, the Hadoop environment is needed by the driver, so adding the bundle jars with `--jars` may not work. In this case, you should add the jars to the Spark classpath directly. +::: + +## Using fileset with hadoop fs command + +The following are examples of how to use the `hadoop fs` command to access the fileset in Hadoop 3.1.3. + +1. Add the following contents to the `${HADOOP_HOME}/etc/hadoop/core-site.xml` file: + +```xml +<property> + <name>fs.AbstractFileSystem.gvfs.impl</name> + <value>org.apache.gravitino.filesystem.hadoop.Gvfs</value> +</property> + +<property> + <name>fs.gvfs.impl</name> + <value>org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem</value> +</property> + +<property> + <name>fs.gravitino.server.uri</name> + <value>http://192.168.50.188:8090</value> +</property> + +<property> + <name>fs.gravitino.client.metalake</name> + <value>test</value> +</property> + +<property> + <name>oss-endpoint</name> + <value>http://oss-cn-hangzhou.aliyuncs.com</value> +</property> + +<property> + <name>oss-access-key-id</name> + <value>access-key</value> +</property> + +<property> + <name>oss-secret-access-key</name> + <value>secret-key</value> +</property> +``` + +2. Copy the necessary jars to the `${HADOOP_HOME}/share/hadoop/common/lib` directory. + +Copy the corresponding jars to the `${HADOOP_HOME}/share/hadoop/common/lib` directory. For OSS, you need to copy `gravitino-aliyun-{version}.jar` to the `${HADOOP_HOME}/share/hadoop/common/lib` directory, +then copy `hadoop-aliyun-{version}.jar` and related dependencies from the `${HADOOP_HOME}/share/hadoop/tools/lib/` directory to the `${HADOOP_HOME}/share/hadoop/common/lib` directory; for simplicity, you can add all the jars in `${HADOOP_HOME}/share/hadoop/tools/lib/` to `${HADOOP_HOME}/share/hadoop/common/lib`. + + +3.
Run the following command to access the fileset: + +```shell +hadoop dfs -ls gvfs://fileset/oss_catalog/schema/example +hadoop dfs -put /path/to/local/file gvfs://fileset/oss_catalog/schema/example +``` + +## Using fileset with pandas + +The following are examples of how to use the pandas library to access the OSS fileset: + +```python +import pandas as pd + +storage_options = { + "server_uri": "http://localhost:8090", + "metalake_name": "test", + "options": { + "oss_access_key_id": "access_key", + "oss_secret_access_key": "secret_key", + "oss_endpoint": "http://oss-cn-hangzhou.aliyuncs.com" + } +} +ds = pd.read_csv(f"gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/people/part-00000-51d366e2-d5eb-448d-9109-32a96c8a14dc-c000.csv", + storage_options=storage_options) +ds.head() +``` + +## Fileset with credential + +If the catalog has been configured with credential vending, you can access an OSS fileset without setting `oss-access-key-id` and `oss-secret-access-key` in the properties via GVFS. More details can be found [here](./security/credential-vending.md#oss-credentials). + + + diff --git a/docs/hadoop-catalog-with-s3.md b/docs/hadoop-catalog-with-s3.md new file mode 100644 index 00000000000..260a036f9d3 --- /dev/null +++ b/docs/hadoop-catalog-with-s3.md @@ -0,0 +1,372 @@ +--- +title: "Hadoop catalog with S3" +slug: /hadoop-catalog-with-s3 +date: 2025-01-03 +keyword: Hadoop catalog S3 +license: "This software is licensed under the Apache License version 2." +--- + +This document describes how to configure a Hadoop catalog with S3. + +## Prerequisites + +In order to create a Hadoop catalog with S3, you need to place [`gravitino-aws-bundle-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-aws-bundle) in the Gravitino Hadoop classpath located +at `${HADOOP_HOME}/share/hadoop/common/lib/`. After that, start the Gravitino server with the following command: + +```bash +$ bin/gravitino-server.sh start +``` + +## Create a Hadoop Catalog with S3 in Gravitino + +### Create a catalog + +Apart from the configuration method in [Hadoop-catalog-catalog-configuration](./hadoop-catalog.md#catalog-properties), the following properties are required to configure a Hadoop catalog with S3: + +| Configuration item | Description | Default value | Required | Since version | +|-------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------|----------------------------|------------------| +| `filesystem-providers` | The file system providers to add. Set it to `s3` if it's an S3 fileset, or a comma-separated string that contains `s3` like `gs,s3` to support multiple kinds of fileset including `s3`. | (none) | Yes | 0.7.0-incubating | +| `default-filesystem-provider` | The name of the default filesystem provider of this Hadoop catalog if users do not specify the scheme in the URI. The default value is `builtin-local`; for S3, if we set this value, we can omit the prefix 's3a://' in the location. | `builtin-local` | No | 0.7.0-incubating | +| `s3-endpoint` | The endpoint of the AWS S3. | (none) | Yes if it's an S3 fileset. | 0.7.0-incubating | +| `s3-access-key-id` | The access key of the AWS S3. | (none) | Yes if it's an S3 fileset. | 0.7.0-incubating | +| `s3-secret-access-key` | The secret key of the AWS S3. | (none) | Yes if it's an S3 fileset.
| 0.7.0-incubating | + +### Create a schema + +Refer to [Schema operation](./manage-fileset-metadata-using-gravitino.md#schema-operations) for more details. + +### Create a fileset + +Refer to [Fileset operation](./manage-fileset-metadata-using-gravitino.md#fileset-operations) for more details. + + +## Using Hadoop catalog with S3 + +### Create a Hadoop catalog/schema/fileset with S3 + +First of all, you need to create a Hadoop catalog with S3. The following example shows how to create a Hadoop catalog with S3: + + + + +```shell +curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ +-H "Content-Type: application/json" -d '{ + "name": "catalog", + "type": "FILESET", + "comment": "comment", + "provider": "hadoop", + "properties": { + "location": "s3a://bucket/root", + "s3-access-key-id": "access_key", + "s3-secret-access-key": "secret_key", + "s3-endpoint": "http://s3.ap-northeast-1.amazonaws.com", + "filesystem-providers": "s3" + } +}' http://localhost:8090/api/metalakes/metalake/catalogs +``` + + + + +```java
+GravitinoClient gravitinoClient = GravitinoClient + .builder("http://localhost:8090") + .withMetalake("metalake") + .build(); + +Map<String, String> s3Properties = ImmutableMap.<String, String>builder() + .put("location", "s3a://bucket/root") + .put("s3-access-key-id", "access_key") + .put("s3-secret-access-key", "secret_key") + .put("s3-endpoint", "http://s3.ap-northeast-1.amazonaws.com") + .put("filesystem-providers", "s3") + .build(); + +Catalog s3Catalog = gravitinoClient.createCatalog("catalog", + Type.FILESET, + "hadoop", // provider, Gravitino only supports "hadoop" for now. + "This is an S3 fileset catalog", + s3Properties); +// ... + +``` + + + + +```python +gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake") +s3_properties = { + "location": "s3a://bucket/root", + "s3-access-key-id": "access_key", + "s3-secret-access-key": "secret_key", + "s3-endpoint": "http://s3.ap-northeast-1.amazonaws.com" +} + +s3_catalog = gravitino_client.create_catalog(name="catalog", + type=Catalog.Type.FILESET, + provider="hadoop", + comment="This is an S3 fileset catalog", + properties=s3_properties) + +``` + + + + +:::note +The location value should always start with `s3a`, NOT `s3`, for AWS S3; for instance, `s3a://bucket/root`. Values like `s3://bucket/root` are not supported due to a limitation of the hadoop-aws library. +::: + +Then create a schema and fileset in the catalog created above. + +Use the following code to create a schema and fileset: + + + + +```shell +curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ +-H "Content-Type: application/json" -d '{ + "name": "schema", + "comment": "comment", + "properties": { + "location": "s3a://bucket/root/schema" + } +}' http://localhost:8090/api/metalakes/metalake/catalogs/catalog/schemas +``` + + + + +```java +// Assuming you have just created the Hadoop catalog named `catalog` above +Catalog catalog = gravitinoClient.loadCatalog("catalog"); + +SupportsSchemas supportsSchemas = catalog.asSchemas(); + +Map<String, String> schemaProperties = ImmutableMap.<String, String>builder() + .put("location", "s3a://bucket/root/schema") + .build(); +Schema schema = supportsSchemas.createSchema("schema", + "This is a schema", + schemaProperties +); +// ...
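+// Reminder: schema and fileset locations must keep the `s3a://` scheme (see the note above).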
+``` + + + + +```python +gravitino_client: GravitinoClient = GravitinoClient(uri="http://127.0.0.1:8090", metalake_name="metalake") +catalog: Catalog = gravitino_client.load_catalog(name="catalog") +catalog.as_schemas().create_schema(name="schema", + comment="This is a schema", + properties={"location": "s3a://bucket/root/schema"}) +``` + + + + + + + +```shell +curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ +-H "Content-Type: application/json" -d '{ + "name": "example_fileset", + "comment": "This is an example fileset", + "type": "MANAGED", + "storageLocation": "s3a://bucket/root/schema/example_fileset", + "properties": { + "k1": "v1" + } +}' http://localhost:8090/api/metalakes/metalake/catalogs/catalog/schemas/schema/filesets +``` + + + + +```java +GravitinoClient gravitinoClient = GravitinoClient + .builder("http://localhost:8090") + .withMetalake("metalake") + .build(); + +Catalog catalog = gravitinoClient.loadCatalog("catalog"); +FilesetCatalog filesetCatalog = catalog.asFilesetCatalog(); + +Map<String, String> propertiesMap = ImmutableMap.<String, String>builder() + .put("k1", "v1") + .build(); + +filesetCatalog.createFileset( + NameIdentifier.of("schema", "example_fileset"), + "This is an example fileset", + Fileset.Type.MANAGED, + "s3a://bucket/root/schema/example_fileset", + propertiesMap); +``` + + + + +```python +gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake") + +catalog: Catalog = gravitino_client.load_catalog(name="catalog") +catalog.as_fileset_catalog().create_fileset(ident=NameIdentifier.of("schema", "example_fileset"), + type=Fileset.Type.MANAGED, + comment="This is an example fileset", + storage_location="s3a://bucket/root/schema/example_fileset", + properties={"k1": "v1"}) +``` + + + + +## Using Spark to access the fileset + +The following code snippet shows how to use **PySpark 3.1.3 with a Hadoop environment (Hadoop 3.2.0)** to access the fileset: + +```python +import logging +from gravitino import NameIdentifier, GravitinoClient, Catalog, Fileset, GravitinoAdminClient +from pyspark.sql import SparkSession +import os + +gravitino_url = "http://localhost:8090" +metalake_name = "test" + +catalog_name = "your_s3_catalog" +schema_name = "your_s3_schema" +fileset_name = "your_s3_fileset" + +os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-aws-${gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-${gravitino-version}.jar,/path/to/hadoop-aws-3.2.0.jar,/path/to/aws-java-sdk-bundle-1.11.375.jar --master local[1] pyspark-shell" +spark = SparkSession.builder \ + .appName("s3_fileset_test") \ + .config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs") \ + .config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem") \ + .config("spark.hadoop.fs.gravitino.server.uri", "http://localhost:8090") \ + .config("spark.hadoop.fs.gravitino.client.metalake", "test") \ + .config("spark.hadoop.s3-access-key-id", os.environ["S3_ACCESS_KEY_ID"]) \ + .config("spark.hadoop.s3-secret-access-key", os.environ["S3_SECRET_ACCESS_KEY"]) \ + .config("spark.hadoop.s3-endpoint", "http://s3.ap-northeast-1.amazonaws.com") \ + .config("spark.driver.memory", "2g") \ + .config("spark.driver.port", "2048") \ + .getOrCreate() + +data = [("Alice", 25), ("Bob", 30), ("Cathy", 45)] +columns = ["Name", "Age"] +spark_df = spark.createDataFrame(data, schema=columns) +gvfs_path = f"gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/people" +
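+# Writing through the gvfs path stores the files in the fileset's backing location +# (for this example, s3a://bucket/root/schema/your_s3_fileset/people).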
+spark_df.coalesce(1).write \ +.mode("overwrite") \ +.option("header", "true") \ +.csv(gvfs_path) +``` + +If your Spark is deployed **without a Hadoop environment**, you can use the following code snippet to access the fileset: + +```python +## Replace the PYSPARK_SUBMIT_ARGS in the snippet above with the following; everything else stays the same +os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-aws-bundle-${gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-${gravitino-version}.jar --master local[1] pyspark-shell" +``` + +- [`gravitino-aws-bundle-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-aws-bundle) is the Gravitino AWS jar with the Hadoop environment and the `hadoop-aws` jar. +- [`gravitino-aws-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-aws) is the Gravitino AWS jar without the Hadoop environment and the `hadoop-aws` jar. + +Please choose the correct jar according to your environment. + +:::note +In some Spark versions, the Hadoop environment is needed by the driver, so adding the bundle jars with `--jars` may not work. In this case, you should add the jars to the Spark classpath directly. +::: + +## Using fileset with hadoop fs command + +The following are examples of how to use the `hadoop fs` command to access the fileset in Hadoop 3.1.3. + +1. Add the following contents to the `${HADOOP_HOME}/etc/hadoop/core-site.xml` file: + +```xml +<property> + <name>fs.AbstractFileSystem.gvfs.impl</name> + <value>org.apache.gravitino.filesystem.hadoop.Gvfs</value> +</property> + +<property> + <name>fs.gvfs.impl</name> + <value>org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem</value> +</property> + +<property> + <name>fs.gravitino.server.uri</name> + <value>http://192.168.50.188:8090</value> +</property> + +<property> + <name>fs.gravitino.client.metalake</name> + <value>test</value> +</property> + +<property> + <name>s3-endpoint</name> + <value>http://s3.ap-northeast-1.amazonaws.com</value> +</property> + +<property> + <name>s3-access-key-id</name> + <value>access-key</value> +</property> + +<property> + <name>s3-secret-access-key</name> + <value>secret-key</value> +</property> +``` + +2. Copy the necessary jars to the `${HADOOP_HOME}/share/hadoop/common/lib` directory. + +Copy the corresponding jars to the `${HADOOP_HOME}/share/hadoop/common/lib` directory. For S3, you need to copy `gravitino-aws-{version}.jar` to the `${HADOOP_HOME}/share/hadoop/common/lib` directory, +then copy `hadoop-aws-{version}.jar` and related dependencies from the `${HADOOP_HOME}/share/hadoop/tools/lib/` directory to the `${HADOOP_HOME}/share/hadoop/common/lib` directory; for simplicity, you can add all the jars in `${HADOOP_HOME}/share/hadoop/tools/lib/` to `${HADOOP_HOME}/share/hadoop/common/lib`. + + +3.
Run the following command to access the fileset: + +```shell +hadoop dfs -ls gvfs://fileset/s3_catalog/schema/example +hadoop dfs -put /path/to/local/file gvfs://fileset/s3_catalog/schema/example +``` + +## Using fileset with pandas + +The following are examples of how to use the pandas library to access the S3 fileset + +```python +import pandas as pd + +storage_options = { + "server_uri": "http://localhost:8090", + "metalake_name": "test", + "options": { + "s3_access_key_id": "access_key", + "s3_secret_access_key": "secret_key", + "s3_endpoint": "http://s3.ap-northeast-1.amazonaws.com" + } +} +ds = pd.read_csv(f"gvfs://fileset/${catalog_name}/${schema_name}/${fileset_name}/people/part-00000-51d366e2-d5eb-448d-9109-32a96c8a14dc-c000.csv", + storage_options=storage_options) +ds.head() +``` + +## Fileset with credential + +If the catalog has been configured with credential, you can access S3 fileset without setting `s3-access-key-id` and `s3-secret-access-key` in the properties via GVFS. More detail can be seen [here](./security/credential-vending.md#s3-credentials). + + + + diff --git a/docs/hadoop-catalog.md b/docs/hadoop-catalog.md index cf86fde06e4..57da399b12c 100644 --- a/docs/hadoop-catalog.md +++ b/docs/hadoop-catalog.md @@ -9,9 +9,9 @@ license: "This software is licensed under the Apache License version 2." ## Introduction Hadoop catalog is a fileset catalog that using Hadoop Compatible File System (HCFS) to manage -the storage location of the fileset. Currently, it supports the local filesystem and HDFS. Since 0.7.0-incubating, Gravitino supports S3, GCS, OSS and Azure Blob Storage fileset through Hadoop catalog. +the storage location of the fileset. Currently, it supports the local filesystem and HDFS. Since 0.7.0-incubating, Gravitino supports [S3](hadoop-catalog-with-S3.md), [GCS](hadoop-catalog-with-gcs.md), [OSS](hadoop-catalog-with-oss.md) and [Azure Blob Storage](hadoop-catalog-with-adls.md) through Hadoop catalog. -The rest of this document will use HDFS or local file as an example to illustrate how to use the Hadoop catalog. For S3, GCS, OSS and Azure Blob Storage, the configuration is similar to HDFS, but more properties need to be set. We will use [separate sections](./cloud-storage-fileset-example.md) to introduce how to use of S3, GCS, OSS and Azure Blob Storage. +The rest of this document will use HDFS or local file as an example to illustrate how to use the Hadoop catalog. For S3, GCS, OSS and Azure Blob Storage, the configuration is similar to HDFS, please refer to the corresponding document for more details. Note that Gravitino uses Hadoop 3 dependencies to build Hadoop catalog. Theoretically, it should be compatible with both Hadoop 2.x and 3.x, since Gravitino doesn't leverage any new features in diff --git a/docs/how-to-use-gvfs.md b/docs/how-to-use-gvfs.md index a14d09794a3..6ac5079a6b3 100644 --- a/docs/how-to-use-gvfs.md +++ b/docs/how-to-use-gvfs.md @@ -146,8 +146,7 @@ You can configure these properties in two ways: ``` :::note -If you want to access the S3, GCS, OSS or custom fileset through GVFS, apart from the above properties, you need to place the corresponding bundle jars in the Hadoop environment, For bundles jar and -cloud storage fileset configuration example, please refer to [cloud storage fileset example](./cloud-storage-fileset-example.md). +If you want to access the S3, GCS, OSS or custom fileset through GVFS, apart from the above properties, you need to place the corresponding bundle jars in the Hadoop environment. ::: 2. 
Configure the properties in the `core-site.xml` file of the Hadoop environment: @@ -204,10 +203,6 @@ two ways: ```shell ./gradlew :clients:filesystem-hadoop3-runtime:build -x test ``` -:::note -For cloud storage fileset, some extra steps should be added, please refer to [cloud storage fileset example](./cloud-storage-fileset-example.md). -::: - #### Via Hadoop shell command @@ -234,8 +229,6 @@ cp ${HADOOP_HOME}/share/hadoop/tools/lib/* ${HADOOP_HOME}/share/hadoop/common/li ./${HADOOP_HOME}/bin/hadoop dfs -ls gvfs://fileset/test_catalog/test_schema/test_fileset_1 ``` -Full example to access S3, GCS, OSS fileset via Hadoop shell command, please refer to [cloud storage fileset example](./cloud-storage-fileset-example.md). - #### Via Java code You can also perform operations on the files or directories managed by fileset through Java code. @@ -285,9 +278,6 @@ FileSystem fs = filesetPath.getFileSystem(conf); fs.getFileStatus(filesetPath); ``` -Full example to access S3, GCS, OSS fileset via Hadoop shell command, please refer to [cloud storage fileset example](./cloud-storage-fileset-example.md). - - #### Via Apache Spark 1. Add the GVFS runtime jar to the Spark environment. @@ -327,8 +317,6 @@ Full example to access S3, GCS, OSS fileset via Hadoop shell command, please ref rdd.foreach(println) ``` -Full example to access S3, GCS, OSS fileset via Spark, please refer to [cloud storage fileset example](./cloud-storage-fileset-example.md). - #### Via Tensorflow For Tensorflow to support GVFS, you need to recompile the [tensorflow-io](https://github.com/tensorflow/io) module. @@ -523,7 +511,6 @@ options = { fs = gvfs.GravitinoVirtualFileSystem(server_uri="http://localhost:8090", metalake_name="test_metalake", options=options) ``` -Full Python example to access S3, GCS, OSS fileset via GVFS, please refer to [cloud storage fileset example](./cloud-storage-fileset-example.md). :::note From b34c5264a1607bf525d4053f08d853e185a15137 Mon Sep 17 00:00:00 2001 From: yuqi Date: Fri, 3 Jan 2025 20:11:59 +0800 Subject: [PATCH 24/59] Rename class of credential providers and optimize expired time. 
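
The providers now schedule a proactive refresh at 90% of the remaining
credential lifetime instead of a fixed five-minute margin. A worked sketch of
the window math (the values are illustrative):

```java
long now = System.currentTimeMillis();
long expireTimeInMs = now + 3_600_000L; // credential expires one hour from now
double EXPIRATION_TIME_FACTOR = 0.9D;
long expirationTime = now + (long) ((expireTimeInMs - now) * EXPIRATION_TIME_FACTOR);
// getCredentials() triggers a refresh once System.currentTimeMillis() >= expirationTime,
// i.e. after 54 minutes here, leaving a safety margin that scales with the token lifetime.
```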
--- ...ovider.java => OSSCredentialProvider.java} | 21 +++++---- .../oss/fs/OSSFileSystemProvider.java | 3 +- ...rovider.java => S3CredentialProvider.java} | 21 +++++---- .../gravitino/s3/fs/S3FileSystemProvider.java | 3 +- .../abs/fs/AzureFileSystemProvider.java | 17 ++++---- ...r.java => AzureSasCredentialProvider.java} | 19 ++++---- ...ovider.java => GCSCredentialProvider.java} | 43 ++++++++++++++----- .../gcs/fs/GCSFileSystemProvider.java | 5 +-- 8 files changed, 81 insertions(+), 51 deletions(-) rename bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/{GravitinoOSSCredentialProvider.java => OSSCredentialProvider.java} (90%) rename bundles/aws/src/main/java/org/apache/gravitino/s3/fs/{GravitinoS3CredentialProvider.java => S3CredentialProvider.java} (88%) rename bundles/azure/src/main/java/org/apache/gravitino/abs/fs/{GravitinoAzureSasCredentialProvider.java => AzureSasCredentialProvider.java} (89%) rename bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/{GravitinoGCSCredentialProvider.java => GCSCredentialProvider.java} (69%) diff --git a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/GravitinoOSSCredentialProvider.java b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialProvider.java similarity index 90% rename from bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/GravitinoOSSCredentialProvider.java rename to bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialProvider.java index 375a69d812b..0c21fa8eec3 100644 --- a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/GravitinoOSSCredentialProvider.java +++ b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialProvider.java @@ -45,16 +45,18 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class GravitinoOSSCredentialProvider implements CredentialsProvider { +public class OSSCredentialProvider implements CredentialsProvider { - private static final Logger LOG = LoggerFactory.getLogger(GravitinoOSSCredentialProvider.class); + private static final Logger LOG = LoggerFactory.getLogger(OSSCredentialProvider.class); private Credentials basicCredentials; private final String filesetIdentifier; - private long expirationTime; private final GravitinoClient client; private final Configuration configuration; - public GravitinoOSSCredentialProvider(URI uri, Configuration conf) { + private long expirationTime = Long.MAX_VALUE; + private static final double EXPIRATION_TIME_FACTOR = 0.9D; + + public OSSCredentialProvider(URI uri, Configuration conf) { this.filesetIdentifier = conf.get(GravitinoVirtualFileSystemConfiguration.GVFS_FILESET_IDENTIFIER); this.client = GravitinoVirtualFileSystemUtils.createClient(conf); @@ -67,7 +69,7 @@ public void setCredentials(Credentials credentials) {} @Override public Credentials getCredentials() { // If the credentials are null or about to expire, refresh the credentials. 
- if (basicCredentials == null || System.currentTimeMillis() > expirationTime - 5 * 60 * 1000) { + if (basicCredentials == null || System.currentTimeMillis() >= expirationTime) { synchronized (this) { refresh(); } @@ -110,9 +112,12 @@ private void refresh() { basicCredentials = new DefaultCredentials(accessKeyId, secretAccessKey); } - expirationTime = credential.expireTimeInMs(); - if (expirationTime <= 0) { - expirationTime = Long.MAX_VALUE; + if (credential.expireTimeInMs() > 0) { + expirationTime = + System.currentTimeMillis() + + (long) + ((credential.expireTimeInMs() - System.currentTimeMillis()) + * EXPIRATION_TIME_FACTOR); } } diff --git a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java index cadabd7f3bf..f4fa8a374fa 100644 --- a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java +++ b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java @@ -65,8 +65,7 @@ public FileSystem getFileSystem(Path path, Map config) throws IO && config.containsKey( GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_SERVER_URI_KEY)) { hadoopConfMap.put( - Constants.CREDENTIALS_PROVIDER_KEY, - GravitinoOSSCredentialProvider.class.getCanonicalName()); + Constants.CREDENTIALS_PROVIDER_KEY, OSSCredentialProvider.class.getCanonicalName()); } hadoopConfMap.forEach(configuration::set); diff --git a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/GravitinoS3CredentialProvider.java b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialProvider.java similarity index 88% rename from bundles/aws/src/main/java/org/apache/gravitino/s3/fs/GravitinoS3CredentialProvider.java rename to bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialProvider.java index 006c7aede77..c474abc83aa 100644 --- a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/GravitinoS3CredentialProvider.java +++ b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialProvider.java @@ -45,17 +45,18 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class GravitinoS3CredentialProvider implements AWSCredentialsProvider { +public class S3CredentialProvider implements AWSCredentialsProvider { - private static final Logger LOG = LoggerFactory.getLogger(GravitinoS3CredentialProvider.class); + private static final Logger LOG = LoggerFactory.getLogger(S3CredentialProvider.class); private final GravitinoClient client; private final String filesetIdentifier; private final Configuration configuration; private AWSCredentials basicSessionCredentials; - private long expirationTime; + private long expirationTime = Long.MAX_VALUE; + private static final double EXPIRATION_TIME_FACTOR = 0.9D; - public GravitinoS3CredentialProvider(final URI uri, final Configuration conf) { + public S3CredentialProvider(final URI uri, final Configuration conf) { this.filesetIdentifier = conf.get(GravitinoVirtualFileSystemConfiguration.GVFS_FILESET_IDENTIFIER); this.configuration = conf; @@ -65,8 +66,7 @@ public GravitinoS3CredentialProvider(final URI uri, final Configuration conf) { @Override public AWSCredentials getCredentials() { // Refresh credentials if they are null or about to expire in 5 minutes - if (basicSessionCredentials == null - || System.currentTimeMillis() > expirationTime - 5 * 60 * 1000) { + if (basicSessionCredentials == null || System.currentTimeMillis() >= expirationTime) { synchronized (this) { refresh(); } @@ -112,9 +112,12 
@@ public void refresh() { basicSessionCredentials = new BasicAWSCredentials(accessKeyId, secretAccessKey); } - expirationTime = credential.expireTimeInMs(); - if (expirationTime <= 0) { - expirationTime = Long.MAX_VALUE; + if (credential.expireTimeInMs() > 0) { + expirationTime = + System.currentTimeMillis() + + (long) + ((credential.expireTimeInMs() - System.currentTimeMillis()) + * EXPIRATION_TIME_FACTOR); } } diff --git a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java index a564536fa05..44a9dfb0c40 100644 --- a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java +++ b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java @@ -72,8 +72,7 @@ public FileSystem getFileSystem(Path path, Map config) throws IO // will have this key. if (config.containsKey(GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_SERVER_URI_KEY)) { configuration.set( - Constants.AWS_CREDENTIALS_PROVIDER, - GravitinoS3CredentialProvider.class.getCanonicalName()); + Constants.AWS_CREDENTIALS_PROVIDER, S3CredentialProvider.class.getCanonicalName()); } // Hadoop-aws 2 does not support IAMInstanceCredentialsProvider diff --git a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java index 99db1c4cda6..6a03361ee79 100644 --- a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java +++ b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java @@ -78,10 +78,9 @@ public FileSystem getFileSystem(@Nonnull Path path, @Nonnull Map if (config.containsKey(GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_SERVER_URI_KEY)) { // Test whether SAS works try { - GravitinoAzureSasCredentialProvider gravitinoAzureSasCredentialProvider = - new GravitinoAzureSasCredentialProvider(); - gravitinoAzureSasCredentialProvider.initialize(configuration, null); - String sas = gravitinoAzureSasCredentialProvider.getSASToken(null, null, null, null); + AzureSasCredentialProvider azureSasCredentialProvider = new AzureSasCredentialProvider(); + azureSasCredentialProvider.initialize(configuration, null); + String sas = azureSasCredentialProvider.getSASToken(null, null, null, null); if (sas != null) { String accountName = String.format( @@ -92,15 +91,15 @@ public FileSystem getFileSystem(@Nonnull Path path, @Nonnull Map FS_AZURE_ACCOUNT_AUTH_TYPE_PROPERTY_NAME + "." + accountName, AuthType.SAS.name()); configuration.set( FS_AZURE_SAS_TOKEN_PROVIDER_TYPE + "." 
+ accountName, - GravitinoAzureSasCredentialProvider.class.getName()); + AzureSasCredentialProvider.class.getName()); configuration.set(FS_AZURE_ACCOUNT_IS_HNS_ENABLED, "true"); - } else if (gravitinoAzureSasCredentialProvider.getAzureStorageAccountKey() != null - && gravitinoAzureSasCredentialProvider.getAzureStorageAccountName() != null) { + } else if (azureSasCredentialProvider.getAzureStorageAccountKey() != null + && azureSasCredentialProvider.getAzureStorageAccountName() != null) { configuration.set( String.format( "fs.azure.account.key.%s.dfs.core.windows.net", - gravitinoAzureSasCredentialProvider.getAzureStorageAccountName()), - gravitinoAzureSasCredentialProvider.getAzureStorageAccountKey()); + azureSasCredentialProvider.getAzureStorageAccountName()), + azureSasCredentialProvider.getAzureStorageAccountKey()); } } catch (Exception e) { // Can't use SAS, use account key and account key instead diff --git a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/GravitinoAzureSasCredentialProvider.java b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialProvider.java similarity index 89% rename from bundles/azure/src/main/java/org/apache/gravitino/abs/fs/GravitinoAzureSasCredentialProvider.java rename to bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialProvider.java index 1a539145699..8c63d48b749 100644 --- a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/GravitinoAzureSasCredentialProvider.java +++ b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialProvider.java @@ -42,10 +42,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class GravitinoAzureSasCredentialProvider implements SASTokenProvider, Configurable { +public class AzureSasCredentialProvider implements SASTokenProvider, Configurable { - private static final Logger LOGGER = - LoggerFactory.getLogger(GravitinoAzureSasCredentialProvider.class); + private static final Logger LOGGER = LoggerFactory.getLogger(AzureSasCredentialProvider.class); private Configuration configuration; @@ -58,7 +57,8 @@ public class GravitinoAzureSasCredentialProvider implements SASTokenProvider, Co private String azureStorageAccountName; private String azureStorageAccountKey; - private long expirationTime; + private long expirationTime = Long.MAX_VALUE; + private static final double EXPIRATION_TIME_FACTOR = 0.9D; public String getAzureStorageAccountName() { return azureStorageAccountName; @@ -88,7 +88,7 @@ public void initialize(Configuration conf, String accountName) throws IOExceptio @Override public String getSASToken(String account, String fileSystem, String path, String operation) { // Refresh credentials if they are null or about to expire in 5 minutes - if (sasToken == null || System.currentTimeMillis() > expirationTime - 5 * 60 * 1000) { + if (sasToken == null || System.currentTimeMillis() >= expirationTime) { synchronized (this) { refresh(); } @@ -121,9 +121,12 @@ private void refresh() { azureStorageAccountKey = credentialMap.get(GRAVITINO_AZURE_STORAGE_ACCOUNT_KEY); } - this.expirationTime = credential.expireTimeInMs(); - if (expirationTime <= 0) { - expirationTime = Long.MAX_VALUE; + if (credential.expireTimeInMs() > 0) { + expirationTime = + System.currentTimeMillis() + + (long) + ((credential.expireTimeInMs() - System.currentTimeMillis()) + * EXPIRATION_TIME_FACTOR); } } diff --git a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GravitinoGCSCredentialProvider.java 
b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialProvider.java similarity index 69% rename from bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GravitinoGCSCredentialProvider.java rename to bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialProvider.java index 6c8af262ddc..ad0a927aeaf 100644 --- a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GravitinoGCSCredentialProvider.java +++ b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialProvider.java @@ -21,7 +21,9 @@ import com.google.cloud.hadoop.util.AccessTokenProvider; import java.io.IOException; +import java.util.Arrays; import java.util.Map; +import java.util.Optional; import org.apache.gravitino.NameIdentifier; import org.apache.gravitino.client.GravitinoClient; import org.apache.gravitino.credential.Credential; @@ -34,18 +36,19 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class GravitinoGCSCredentialProvider implements AccessTokenProvider { - private static final Logger LOG = LoggerFactory.getLogger(GravitinoGCSCredentialProvider.class); +public class GCSCredentialProvider implements AccessTokenProvider { + private static final Logger LOG = LoggerFactory.getLogger(GCSCredentialProvider.class); private Configuration configuration; private GravitinoClient client; private String filesetIdentifier; private AccessToken accessToken; - private long expirationTime; + private long expirationTime = Long.MAX_VALUE; + private static final double EXPIRATION_TIME_FACTOR = 0.9D; @Override public AccessToken getAccessToken() { - if (accessToken == null || expirationTime < System.currentTimeMillis() + 5 * 60 * 1000) { + if (accessToken == null || System.currentTimeMillis() >= expirationTime) { try { refresh(); } catch (IOException e) { @@ -67,24 +70,28 @@ public void refresh() throws IOException { Fileset fileset = filesetCatalog.loadFileset(NameIdentifier.of(idents[2], idents[3])); Credential[] credentials = fileset.supportsCredentials().getCredentials(); + Optional optionalCredential = getCredential(credentials); // Can't find any credential, use the default one. - if (credentials.length == 0) { + if (!optionalCredential.isPresent()) { LOG.warn( "No credential found for fileset: {}, try to use static JSON file", filesetIdentifier); return; } - Credential credential = credentials[0]; + Credential credential = optionalCredential.get(); Map credentialMap = credential.toProperties(); if (GCSTokenCredential.GCS_TOKEN_CREDENTIAL_TYPE.equals( credentialMap.get(Credential.CREDENTIAL_TYPE))) { String sessionToken = credentialMap.get(GCSTokenCredential.GCS_TOKEN_NAME); - accessToken = new AccessToken(sessionToken, expirationTime); + accessToken = new AccessToken(sessionToken, credential.expireTimeInMs()); - expirationTime = credential.expireTimeInMs(); - if (expirationTime <= 0) { - expirationTime = Long.MAX_VALUE; + if (credential.expireTimeInMs() > 0) { + expirationTime = + System.currentTimeMillis() + + (long) + ((credential.expireTimeInMs() - System.currentTimeMillis()) + * EXPIRATION_TIME_FACTOR); } } } @@ -101,4 +108,20 @@ public void setConf(Configuration configuration) { public Configuration getConf() { return this.configuration; } + + /** + * Get the credential from the credential array. Using dynamic credential first, if not found, + * uses static credential. + * + * @param credentials The credential array. + * @return An optional credential. + */ + private Optional getCredential(Credential[] credentials) { + // Use dynamic credential if found. 
+ return Arrays.stream(credentials) + .filter( + credential -> + credential.credentialType().equals(GCSTokenCredential.GCS_TOKEN_CREDENTIAL_TYPE)) + .findFirst(); + } } diff --git a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java index 3cdc97f3edc..17a837f7451 100644 --- a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java +++ b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java @@ -47,14 +47,13 @@ public FileSystem getFileSystem(Path path, Map config) throws IO .forEach(configuration::set); if (config.containsKey(GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_SERVER_URI_KEY)) { - AccessTokenProvider accessTokenProvider = new GravitinoGCSCredentialProvider(); + AccessTokenProvider accessTokenProvider = new GCSCredentialProvider(); accessTokenProvider.setConf(configuration); // Why is this check necessary?, if Gravitino fails to get any credentials, we fall back to // the default behavior of the GoogleHadoopFileSystem to use service account credentials. if (accessTokenProvider.getAccessToken() != null) { configuration.set( - "fs.gs.auth.access.token.provider.impl", - GravitinoGCSCredentialProvider.class.getName()); + "fs.gs.auth.access.token.provider.impl", GCSCredentialProvider.class.getName()); } } From 1ecc3785209b4db34d20f112d0c78f680e8db37f Mon Sep 17 00:00:00 2001 From: yuqi Date: Sat, 4 Jan 2025 14:51:50 +0800 Subject: [PATCH 25/59] update the docs --- docs/hadoop-catalog-with-adls.md | 72 ++++++++++++++++++++++++++++- docs/hadoop-catalog-with-gcs.md | 72 ++++++++++++++++++++++++++++- docs/hadoop-catalog-with-oss.md | 78 +++++++++++++++++++++++++++++++- docs/hadoop-catalog-with-s3.md | 75 +++++++++++++++++++++++++++++- 4 files changed, 292 insertions(+), 5 deletions(-) diff --git a/docs/hadoop-catalog-with-adls.md b/docs/hadoop-catalog-with-adls.md index e77d2c80465..a54a3baf4ef 100644 --- a/docs/hadoop-catalog-with-adls.md +++ b/docs/hadoop-catalog-with-adls.md @@ -279,6 +279,22 @@ Please choose the correct jar according to your environment. In some Spark version, Hadoop environment is needed by the driver, adding the bundle jars with '--jars' may not work, in this case, you should add the jars to the spark classpath directly. ::: +## Using Gravitino virual file system Java client to access the fileset + +```java +Configuration conf = new Configuration(); +conf.set("fs.AbstractFileSystem.gvfs.impl","org.apache.gravitino.filesystem.hadoop.Gvfs"); +conf.set("fs.gvfs.impl","org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem"); +conf.set("fs.gravitino.server.uri","http://localhost:8090"); +conf.set("fs.gravitino.client.metalake","test_metalake"); +conf.set("azure-storage-account-name", "account_name_of_adls"); +conf.set("azure-storage-account-key", "account_key_of_adls"); +Path filesetPath = new Path("gvfs://fileset/test_catalog/test_schema/test_fileset/new_dir"); +FileSystem fs = filesetPath.getFileSystem(conf); +fs.mkdirs(filesetPath); +... +``` + ## Using fileset with hadoop fs command The following are examples of how to use the `hadoop fs` command to access the fileset in Hadoop 3.1.3. 
@@ -329,6 +345,22 @@ hadoop dfs -ls gvfs://fileset/adls_catalog/schema/example
hadoop dfs -put /path/to/local/file gvfs://fileset/adls_catalog/schema/example
```

+## Using Gravitino virtual file system Python client
+
+```python
+from gravitino import gvfs
+options = {
+    "cache_size": 20,
+    "cache_expired_time": 3600,
+    "auth_type": "simple",
+    "azure_storage_account_name": "azure_account_name",
+    "azure_storage_account_key": "azure_account_key"
+}
+fs = gvfs.GravitinoVirtualFileSystem(server_uri="http://localhost:8090", metalake_name="test_metalake", options=options)
+fs.ls("gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/")
+```
+
+
### Using fileset with pandas

The following are examples of how to use the pandas library to access the ADLS fileset
@@ -351,7 +383,45 @@ ds.head()
```

## Fileset with credential

-If the catalog has been configured with credential, you can access ADLS fileset without setting `azure-storage-account-name` and `azure-storage-account-key` in the properties via GVFS. More detail can be seen [here](./security/credential-vending.md#adls-credentials).
+Since 0.8.0-incubating, Gravitino supports credential vending for ADLS fileset. If the catalog has been configured with credential, you can access ADLS fileset without providing authentication information like `azure-storage-account-name` and `azure-storage-account-key` in the properties.
+
+### How to create an ADLS Hadoop catalog with credential enabled
+
+Apart from configuration method in [create-adls-hadoop-catalog](#catalog-a-catalog), properties needed by [adls-credential](./security/credential-vending.md#adls-credentials) should also be set to enable credential vending for ADLS fileset.
+
+### How to access ADLS fileset with credential
+
+If the catalog has been configured with credential, you can access ADLS fileset without providing authentication information via GVFS. Let's see how to access ADLS fileset with credential:
+
+GVFS Java client:
+
+```java
+Configuration conf = new Configuration();
+conf.set("fs.AbstractFileSystem.gvfs.impl","org.apache.gravitino.filesystem.hadoop.Gvfs");
+conf.set("fs.gvfs.impl","org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem");
+conf.set("fs.gravitino.server.uri","http://localhost:8090");
+conf.set("fs.gravitino.client.metalake","test_metalake");
+// No need to set azure-storage-account-name and azure-storage-account-key
+Path filesetPath = new Path("gvfs://fileset/adls_test_catalog/test_schema/test_fileset/new_dir");
+FileSystem fs = filesetPath.getFileSystem(conf);
+fs.mkdirs(filesetPath);
+...
+```
+
+Spark:
+
+```python
+spark = SparkSession.builder
+    .appName("adls_fileset_test")
+    .config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs")
+    .config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem")
+    .config("spark.hadoop.fs.gravitino.server.uri", "http://localhost:8090")
+    .config("spark.hadoop.fs.gravitino.client.metalake", "test")
+    # No need to set azure-storage-account-name and azure-storage-account-key
+    .config("spark.driver.memory", "2g")
+    .config("spark.driver.port", "2048")
+    .getOrCreate()
+```
+Python client and Hadoop command are similar to the above examples.
diff --git a/docs/hadoop-catalog-with-gcs.md b/docs/hadoop-catalog-with-gcs.md
index 6dc6bb1c732..d94473f5d16 100644
--- a/docs/hadoop-catalog-with-gcs.md
+++ b/docs/hadoop-catalog-with-gcs.md
@@ -273,6 +273,21 @@ Please choose the correct jar according to your environment.
In some Spark version, Hadoop environment is needed by the driver, adding the bundle jars with '--jars' may not work, in this case, you should add the jars to the spark classpath directly. ::: +## Using Gravitino virual file system Java client to access the fileset + +```java +Configuration conf = new Configuration(); +conf.set("fs.AbstractFileSystem.gvfs.impl","org.apache.gravitino.filesystem.hadoop.Gvfs"); +conf.set("fs.gvfs.impl","org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem"); +conf.set("fs.gravitino.server.uri","http://localhost:8090"); +conf.set("fs.gravitino.client.metalake","test_metalake"); +conf.set("gcs-service-account-file", "/path/your-service-account-file.json"); +Path filesetPath = new Path("gvfs://fileset/test_catalog/test_schema/test_fileset/new_dir"); +FileSystem fs = filesetPath.getFileSystem(conf); +fs.mkdirs(filesetPath); +... +``` + ## Using fileset with hadoop fs command The following are examples of how to use the `hadoop fs` command to access the fileset in Hadoop 3.1.3. @@ -319,6 +334,21 @@ hadoop dfs -ls gvfs://fileset/gcs_catalog/schema/example hadoop dfs -put /path/to/local/file gvfs://fileset/gcs_catalog/schema/example ``` + +## Using Gravitino virtual file system Python client + +```python +from gravitino import gvfs +options = { + "cache_size": 20, + "cache_expired_time": 3600, + "auth_type": "simple", + "gcs_service_account_file": "path_of_gcs_service_account_file.json", +} +fs = gvfs.GravitinoVirtualFileSystem(server_uri="http://localhost:8090", metalake_name="test_metalake", options=options) +fs.ls("gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/") +``` + ## Using fileset with pandas The following are examples of how to use the pandas library to access the GCS fileset @@ -338,8 +368,46 @@ ds = pd.read_csv(f"gvfs://fileset/${catalog_name}/${schema_name}/${fileset_name} ds.head() ``` - ## Fileset with credential -If the catalog has been configured with credential, you can access S3 fileset without setting `gcs-service-account-file` in the properties via GVFS. More detail can be seen [here](./security/credential-vending.md#gcs-credentials). +Since 0.8.0-incubating, Gravitino supports credential vending for GCS fileset. If the catalog has been configured with credential, you can access GCS fileset without providing authentication information like `gcs-service-account-file` in the properties. + +### How to create a GCS Hadoop catalog with credential enabled + +Apart from configuration method in [create-gcs-hadoop-catalog](#catalog-a-catalog), properties needed by [gcs-credential](./security/credential-vending.md#gcs-credentials) should also be set to enable credential vending for GCS fileset. + +### How to access GCS fileset with credential + +If the catalog has been configured with credential, you can access GCS fileset without providing authentication information via GVFS. Let's see how to access GCS fileset with credential: + +GVFS Java client: + +```java +Configuration conf = new Configuration(); +conf.set("fs.AbstractFileSystem.gvfs.impl","org.apache.gravitino.filesystem.hadoop.Gvfs"); +conf.set("fs.gvfs.impl","org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem"); +conf.set("fs.gravitino.server.uri","http://localhost:8090"); +conf.set("fs.gravitino.client.metalake","test_metalake"); +// No need to set gcs-service-account-file +Path filesetPath = new Path("gvfs://fileset/gcs_test_catalog/test_schema/test_fileset/new_dir"); +FileSystem fs = filesetPath.getFileSystem(conf); +fs.mkdirs(filesetPath); +... 
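+// Note (illustrative): with credential vending enabled, the GVFS client obtains a
+// temporary GCS access token from Gravitino at request time and renews it shortly
+// before it expires, which is why no gcs-service-account-file is configured here.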
+``` + +Spark: + +```python +spark = SparkSession.builder + .appName("gcs_fielset_test") + .config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs") + .config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem") + .config("spark.hadoop.fs.gravitino.server.uri", "http://localhost:8090") + .config("spark.hadoop.fs.gravitino.client.metalake", "test") + # No need to set gcs-service-account-file + .config("spark.driver.memory", "2g") + .config("spark.driver.port", "2048") + .getOrCreate() +``` +Python client and Hadoop command are similar to the above examples. diff --git a/docs/hadoop-catalog-with-oss.md b/docs/hadoop-catalog-with-oss.md index c968901e03a..1da09507afa 100644 --- a/docs/hadoop-catalog-with-oss.md +++ b/docs/hadoop-catalog-with-oss.md @@ -283,6 +283,24 @@ Please choose the correct jar according to your environment. In some Spark version, Hadoop environment is needed by the driver, adding the bundle jars with '--jars' may not work, in this case, you should add the jars to the spark classpath directly. ::: +## Using Gravitino virual file system Java client to access the fileset + +```java +Configuration conf = new Configuration(); +conf.set("fs.AbstractFileSystem.gvfs.impl","org.apache.gravitino.filesystem.hadoop.Gvfs"); +conf.set("fs.gvfs.impl","org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem"); +conf.set("fs.gravitino.server.uri","http://localhost:8090"); +conf.set("fs.gravitino.client.metalake","test_metalake"); +conf.set("oss-endpoint", "http://localhost:9000"); +conf.set("oss-access-key-id", "minio"); +conf.set("oss-secret-access-key", "minio123"); +Path filesetPath = new Path("gvfs://fileset/test_catalog/test_schema/test_fileset/new_dir"); +FileSystem fs = filesetPath.getFileSystem(conf); +fs.mkdirs(filesetPath); +... +``` + + ## Using fileset with hadoop fs command The following are examples of how to use the `hadoop fs` command to access the fileset in Hadoop 3.1.3. @@ -339,6 +357,25 @@ hadoop dfs -ls gvfs://fileset/oss_catalog/schema/example hadoop dfs -put /path/to/local/file gvfs://fileset/oss_catalog/schema/example ``` + +## Using Gravitino virtual file system Python client + +```python +from gravitino import gvfs +options = { + "cache_size": 20, + "cache_expired_time": 3600, + "auth_type": "simple", + "oss_endpoint": "http://localhost:9000", + "oss_access_key_id": "minio", + "oss_secret_access_key": "minio123" +} +fs = gvfs.GravitinoVirtualFileSystem(server_uri="http://localhost:8090", metalake_name="test_metalake", options=options) + +fs.ls("gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/") +``` + + ## Using fileset with pandas The following are examples of how to use the pandas library to access the OSS fileset @@ -362,7 +399,46 @@ ds.head() ## Fileset with credential -If the catalog has been configured with credential, you can access S3 fileset without setting `oss-access-key-id` and `oss-secret-access-key` in the properties via GVFS. More detail can be seen [here](./security/credential-vending.md#oss-credentials). +Since 0.8.0-incubating, Gravitino supports credential vending for OSS fileset. If the catalog has been configured with credential, you can access OSS fileset without providing authentication information like `oss-access-key-id` and `oss-secret-access-key` in the properties. 
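+
+For illustration, a catalog with credential vending enabled might be created like this via the Java client, assuming a `gravitinoClient` built as in the earlier examples; the `credential-providers` property name and its `oss-token` value are taken from the [credential vending](./security/credential-vending.md#oss-credentials) document, so treat them as an assumption and check that document for the authoritative property list (the two sections below give the details):
+
+```java
+Map<String, String> properties = ImmutableMap.<String, String>builder()
+    .put("location", "oss://bucket/root")
+    .put("oss-endpoint", "http://oss-cn-hangzhou.aliyuncs.com")
+    .put("oss-access-key-id", "access_key")
+    .put("oss-secret-access-key", "secret_key")
+    // Assumption: enables OSS credential vending, see credential-vending.md.
+    .put("credential-providers", "oss-token")
+    .build();
+Catalog ossCatalog = gravitinoClient.createCatalog(
+    "test_catalog", Type.FILESET, "hadoop", "OSS catalog with credential vending", properties);
+```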
+
+### How to create an OSS Hadoop catalog with credential enabled
+
+Apart from configuration method in [create-oss-hadoop-catalog](#catalog-a-catalog), properties needed by [oss-credential](./security/credential-vending.md#oss-credentials) should also be set to enable credential vending for OSS fileset.
+
+### How to access OSS fileset with credential
+
+If the catalog has been configured with credential, you can access OSS fileset without providing authentication information via GVFS. Let's see how to access OSS fileset with credential:
+
+GVFS Java client:
+
+```java
+Configuration conf = new Configuration();
+conf.set("fs.AbstractFileSystem.gvfs.impl","org.apache.gravitino.filesystem.hadoop.Gvfs");
+conf.set("fs.gvfs.impl","org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem");
+conf.set("fs.gravitino.server.uri","http://localhost:8090");
+conf.set("fs.gravitino.client.metalake","test_metalake");
+// No need to set oss-access-key-id and oss-secret-access-key
+Path filesetPath = new Path("gvfs://fileset/oss_test_catalog/test_schema/test_fileset/new_dir");
+FileSystem fs = filesetPath.getFileSystem(conf);
+fs.mkdirs(filesetPath);
+...
+```
+
+Spark:
+
+```python
+spark = SparkSession.builder
+    .appName("oss_fileset_test")
+    .config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs")
+    .config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem")
+    .config("spark.hadoop.fs.gravitino.server.uri", "http://localhost:8090")
+    .config("spark.hadoop.fs.gravitino.client.metalake", "test")
+    # No need to set oss-access-key-id and oss-secret-access-key
+    .config("spark.driver.memory", "2g")
+    .config("spark.driver.port", "2048")
+    .getOrCreate()
+```
+Python client and Hadoop command are similar to the above examples.
diff --git a/docs/hadoop-catalog-with-s3.md b/docs/hadoop-catalog-with-s3.md
index 260a036f9d3..e81359e172d 100644
--- a/docs/hadoop-catalog-with-s3.md
+++ b/docs/hadoop-catalog-with-s3.md
@@ -286,6 +289,25 @@ Please choose the correct jar according to your environment.
In some Spark version, Hadoop environment is needed by the driver, adding the bundle jars with '--jars' may not work, in this case, you should add the jars to the spark classpath directly.
:::

+## Using Gravitino virual file system Java client to access the fileset
+
+```java
+Configuration conf = new Configuration();
+conf.set("fs.AbstractFileSystem.gvfs.impl","org.apache.gravitino.filesystem.hadoop.Gvfs");
+conf.set("fs.gvfs.impl","org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem");
+conf.set("fs.gravitino.server.uri","http://localhost:8090");
+conf.set("fs.gravitino.client.metalake","test_metalake");
+
+conf.set("s3-endpoint", "http://localhost:9000");
+conf.set("s3-access-key-id", "minio");
+conf.set("s3-secret-access-key", "minio123");
+
+Path filesetPath = new Path("gvfs://fileset/test_catalog/test_schema/test_fileset/new_dir");
+FileSystem fs = filesetPath.getFileSystem(conf);
+fs.mkdirs(filesetPath);
+...
+```
+
## Using fileset with hadoop fs command

The following are examples of how to use the `hadoop fs` command to access the fileset in Hadoop 3.1.3.
@@ -342,6 +361,22 @@ hadoop dfs -ls gvfs://fileset/s3_catalog/schema/example
hadoop dfs -put /path/to/local/file gvfs://fileset/s3_catalog/schema/example
```

+## Using Gravitino virtual file system Python client
+
+```python
+from gravitino import gvfs
+options = {
+    "cache_size": 20,
+    "cache_expired_time": 3600,
+    "auth_type": "simple",
+    "s3_endpoint": "http://localhost:9000",
+    "s3_access_key_id": "minio",
+    "s3_secret_access_key": "minio123"
+}
+fs = gvfs.GravitinoVirtualFileSystem(server_uri="http://localhost:8090", metalake_name="test_metalake", options=options)
+fs.ls("gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/")
+```
+
## Using fileset with pandas

The following are examples of how to use the pandas library to access the S3 fileset
@@ -365,8 +400,46 @@ ds.head()
```

## Fileset with credential

-If the catalog has been configured with credential, you can access S3 fileset without setting `s3-access-key-id` and `s3-secret-access-key` in the properties via GVFS. More detail can be seen [here](./security/credential-vending.md#s3-credentials).
+Since 0.8.0-incubating, Gravitino supports credential vending for S3 fileset. If the catalog has been configured with credential, you can access S3 fileset without providing authentication information like `s3-access-key-id` and `s3-secret-access-key` in the properties.
+
+### How to create an S3 Hadoop catalog with credential enabled
+
+Apart from configuration method in [create-s3-hadoop-catalog](#catalog-a-catalog), properties needed by [s3-credential](./security/credential-vending.md#s3-credentials) should also be set to enable credential vending for S3 fileset.
+
+### How to access S3 fileset with credential
+
+If the catalog has been configured with credential, you can access S3 fileset without providing authentication information via GVFS. Let's see how to access S3 fileset with credential:
+
+GVFS Java client:
+
+```java
+Configuration conf = new Configuration();
+conf.set("fs.AbstractFileSystem.gvfs.impl","org.apache.gravitino.filesystem.hadoop.Gvfs");
+conf.set("fs.gvfs.impl","org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem");
+conf.set("fs.gravitino.server.uri","http://localhost:8090");
+conf.set("fs.gravitino.client.metalake","test_metalake");
+// No need to set s3-access-key-id and s3-secret-access-key
+Path filesetPath = new Path("gvfs://fileset/test_catalog/test_schema/test_fileset/new_dir");
+FileSystem fs = filesetPath.getFileSystem(conf);
+fs.mkdirs(filesetPath);
+...
+```
+
+Spark:
+
+```python
+spark = SparkSession.builder
+    .appName("s3_fileset_test")
+    .config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs")
+    .config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem")
+    .config("spark.hadoop.fs.gravitino.server.uri", "http://localhost:8090")
+    .config("spark.hadoop.fs.gravitino.client.metalake", "test")
+    # No need to set s3-access-key-id and s3-secret-access-key
+    .config("spark.driver.memory", "2g")
+    .config("spark.driver.port", "2048")
+    .getOrCreate()
+```
+Python client and Hadoop command are similar to the above examples.

From d232e92316735ca78d3ebb63b09bcdaf3a865437 Mon Sep 17 00:00:00 2001
From: yuqi
Date: Mon, 6 Jan 2025 08:48:23 +0800
Subject: [PATCH 26/59] polish document again.
--- docs/hadoop-catalog-with-adls.md | 14 +++++++++----- docs/hadoop-catalog-with-gcs.md | 16 ++++++++++------ docs/hadoop-catalog-with-oss.md | 14 ++++++++------ docs/hadoop-catalog-with-s3.md | 14 +++++++++----- 4 files changed, 36 insertions(+), 22 deletions(-) diff --git a/docs/hadoop-catalog-with-adls.md b/docs/hadoop-catalog-with-adls.md index a54a3baf4ef..2c4b2667ed9 100644 --- a/docs/hadoop-catalog-with-adls.md +++ b/docs/hadoop-catalog-with-adls.md @@ -19,6 +19,8 @@ $ bin/gravitino-server.sh start ## Create a Hadoop Catalog with ADLS in Gravitino +The rest of this document shows how to use the Hadoop catalog with ADLS in Gravitino with a full example. + ### Catalog a catalog Apart from configuration method in [Hadoop-catalog-catalog-configuration](./hadoop-catalog.md#catalog-properties), the following properties are required to configure a Hadoop catalog with ADLS: @@ -41,7 +43,7 @@ Refer to [Fileset operation](./manage-fileset-metadata-using-gravitino.md#filese ## Using Hadoop catalog with ADLS -### Create a Hadoop catalog/schema/file set with ADLS +### Create a Hadoop catalog/schema/fileset with ADLS First, you need to create a Hadoop catalog with ADLS. The following example shows how to create a Hadoop catalog with ADLS: @@ -220,7 +222,7 @@ catalog.as_fileset_catalog().create_fileset(ident=NameIdentifier.of("schema", "e -## Using Spark to access the fileset +### Using Spark to access the fileset The following code snippet shows how to use **PySpark 3.1.3 with Hadoop environment(Hadoop 3.2.0)** to access the fileset: @@ -279,7 +281,7 @@ Please choose the correct jar according to your environment. In some Spark version, Hadoop environment is needed by the driver, adding the bundle jars with '--jars' may not work, in this case, you should add the jars to the spark classpath directly. ::: -## Using Gravitino virual file system Java client to access the fileset +### Using Gravitino virtual file system Java client to access the fileset ```java Configuration conf = new Configuration(); @@ -295,7 +297,7 @@ fs.mkdirs(filesetPath); ... ``` -## Using fileset with hadoop fs command +### Using fileset with hadoop fs command The following are examples of how to use the `hadoop fs` command to access the fileset in Hadoop 3.1.3. @@ -345,7 +347,7 @@ hadoop dfs -ls gvfs://fileset/adls_catalog/schema/example hadoop dfs -put /path/to/local/file gvfs://fileset/adls_catalog/schema/example ``` -## Using Gravitino virtual file system Python client +### Using Gravitino virtual file system Python client ```python from gravitino import gvfs @@ -381,6 +383,8 @@ ds = pd.read_csv(f"gvfs://fileset/${catalog_name}/${schema_name}/${fileset_name} ds.head() ``` +For other use cases, please refer to the [Gravitino Virtual File System](./how-to-use-gvfs.md) document. + ## Fileset with credential Since 0.8.0-incubating, Gravitino supports credential vending for ADLS fileset. If the catalog has been configured with credential, you can access ADLS fileset without providing authentication information like `azure-storage-account-name` and `azure-storage-account-key` in the properties. diff --git a/docs/hadoop-catalog-with-gcs.md b/docs/hadoop-catalog-with-gcs.md index d94473f5d16..e2d129e8545 100644 --- a/docs/hadoop-catalog-with-gcs.md +++ b/docs/hadoop-catalog-with-gcs.md @@ -19,6 +19,9 @@ $ bin/gravitino-server.sh start ## Create a Hadoop Catalog with GCS in Gravitino +The rest of this document shows how to use the Hadoop catalog with GCS in Gravitino with a full example. 
+ + ### Catalog a catalog Apart from configuration method in [Hadoop-catalog-catalog-configuration](./hadoop-catalog.md#catalog-properties), the following properties are required to configure a Hadoop catalog with GCS: @@ -37,7 +40,6 @@ Refer to [Schema operation](./manage-fileset-metadata-using-gravitino.md#schema- Refer to [Fileset operation](./manage-fileset-metadata-using-gravitino.md#fileset-operations) for more details. - ## Using Hadoop catalog with GCS ### Create a Hadoop catalog/schema/file set with GCS @@ -216,7 +218,7 @@ catalog.as_fileset_catalog().create_fileset(ident=NameIdentifier.of("schema", "e -## Using Spark to access the fileset +### Using Spark to access the fileset The following code snippet shows how to use **PySpark 3.1.3 with Hadoop environment(Hadoop 3.2.0)** to access the fileset: @@ -273,7 +275,7 @@ Please choose the correct jar according to your environment. In some Spark version, Hadoop environment is needed by the driver, adding the bundle jars with '--jars' may not work, in this case, you should add the jars to the spark classpath directly. ::: -## Using Gravitino virual file system Java client to access the fileset +### Using Gravitino virtual file system Java client to access the fileset ```java Configuration conf = new Configuration(); @@ -288,7 +290,7 @@ fs.mkdirs(filesetPath); ... ``` -## Using fileset with hadoop fs command +### Using fileset with hadoop fs command The following are examples of how to use the `hadoop fs` command to access the fileset in Hadoop 3.1.3. @@ -335,7 +337,7 @@ hadoop dfs -put /path/to/local/file gvfs://fileset/gcs_catalog/schema/example ``` -## Using Gravitino virtual file system Python client +### Using Gravitino virtual file system Python client ```python from gravitino import gvfs @@ -349,7 +351,7 @@ fs = gvfs.GravitinoVirtualFileSystem(server_uri="http://localhost:8090", metalak fs.ls("gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/") ``` -## Using fileset with pandas +### Using fileset with pandas The following are examples of how to use the pandas library to access the GCS fileset @@ -368,6 +370,8 @@ ds = pd.read_csv(f"gvfs://fileset/${catalog_name}/${schema_name}/${fileset_name} ds.head() ``` +For other use cases, please refer to the [Gravitino Virtual File System](./how-to-use-gvfs.md) document. + ## Fileset with credential Since 0.8.0-incubating, Gravitino supports credential vending for GCS fileset. If the catalog has been configured with credential, you can access GCS fileset without providing authentication information like `gcs-service-account-file` in the properties. diff --git a/docs/hadoop-catalog-with-oss.md b/docs/hadoop-catalog-with-oss.md index 1da09507afa..326c370064d 100644 --- a/docs/hadoop-catalog-with-oss.md +++ b/docs/hadoop-catalog-with-oss.md @@ -42,6 +42,8 @@ Refer to [Fileset operation](./manage-fileset-metadata-using-gravitino.md#filese ## Using Hadoop catalog with OSS +The rest of this document shows how to use the Hadoop catalog with OSS in Gravitino with a full example. + ### Create a Hadoop catalog/schema/file set with OSS First, you need to create a Hadoop catalog with OSS. 
The following example shows how to create a Hadoop catalog with OSS: @@ -224,7 +226,7 @@ catalog.as_fileset_catalog().create_fileset(ident=NameIdentifier.of("schema", "e -## Using Spark to access the fileset +### Using Spark to access the fileset The following code snippet shows how to use **PySpark 3.1.3 with Hadoop environment(Hadoop 3.2.0)** to access the fileset: @@ -283,7 +285,7 @@ Please choose the correct jar according to your environment. In some Spark version, Hadoop environment is needed by the driver, adding the bundle jars with '--jars' may not work, in this case, you should add the jars to the spark classpath directly. ::: -## Using Gravitino virual file system Java client to access the fileset +### Using Gravitino virtual file system Java client to access the fileset ```java Configuration conf = new Configuration(); @@ -300,8 +302,7 @@ fs.mkdirs(filesetPath); ... ``` - -## Using fileset with hadoop fs command +### Using fileset with hadoop fs command The following are examples of how to use the `hadoop fs` command to access the fileset in Hadoop 3.1.3. @@ -358,7 +359,7 @@ hadoop dfs -put /path/to/local/file gvfs://fileset/oss_catalog/schema/example ``` -## Using Gravitino virtual file system Python client +### Using Gravitino virtual file system Python client ```python from gravitino import gvfs @@ -376,7 +377,7 @@ fs.ls("gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/") ``` -## Using fileset with pandas +### Using fileset with pandas The following are examples of how to use the pandas library to access the OSS fileset @@ -396,6 +397,7 @@ ds = pd.read_csv(f"gvfs://fileset/${catalog_name}/${schema_name}/${fileset_name} storage_options=storage_options) ds.head() ``` +For other use cases, please refer to the [Gravitino Virtual File System](./how-to-use-gvfs.md) document. ## Fileset with credential diff --git a/docs/hadoop-catalog-with-s3.md b/docs/hadoop-catalog-with-s3.md index e81359e172d..0c184d7f387 100644 --- a/docs/hadoop-catalog-with-s3.md +++ b/docs/hadoop-catalog-with-s3.md @@ -42,6 +42,8 @@ Refer to [Fileset operation](./manage-fileset-metadata-using-gravitino.md#filese ## Using Hadoop catalog with S3 +The rest of this document shows how to use the Hadoop catalog with S3 in Gravitino with a full example. + ### Create a Hadoop catalog/schema/file set with S3 First of all, you need to create a Hadoop catalog with S3. The following example shows how to create a Hadoop catalog with S3: @@ -228,7 +230,8 @@ catalog.as_fileset_catalog().create_fileset(ident=NameIdentifier.of("schema", "e -## Using Spark to access the fileset + +### Using Spark to access the fileset The following code snippet shows how to use **PySpark 3.1.3 with Hadoop environment(Hadoop 3.2.0)** to access the fileset: @@ -286,7 +289,7 @@ Please choose the correct jar according to your environment. In some Spark version, Hadoop environment is needed by the driver, adding the bundle jars with '--jars' may not work, in this case, you should add the jars to the spark classpath directly. ::: -## Using Gravitino virual file system Java client to access the fileset +### Using Gravitino virtual file system Java client to access the fileset ```java Configuration conf = new Configuration(); @@ -305,7 +308,7 @@ fs.mkdirs(filesetPath); ... ``` -## Using fileset with hadoop fs command +### Using fileset with hadoop fs command The following are examples of how to use the `hadoop fs` command to access the fileset in Hadoop 3.1.3. 
@@ -361,7 +364,7 @@ hadoop dfs -ls gvfs://fileset/s3_catalog/schema/example
hadoop dfs -put /path/to/local/file gvfs://fileset/s3_catalog/schema/example
```

-## Using Gravitino virtual file system Python client
+### Using Gravitino virtual file system Python client

```python
from gravitino import gvfs
@@ -377,7 +380,7 @@ fs = gvfs.GravitinoVirtualFileSystem(server_uri="http://localhost:8090", metalak
fs.ls("gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/")
```

-## Using fileset with pandas
+### Using fileset with pandas

The following are examples of how to use the pandas library to access the S3 fileset
@@ -397,6 +400,7 @@ ds = pd.read_csv(f"gvfs://fileset/${catalog_name}/${schema_name}/${fileset_name}
                 storage_options=storage_options)
ds.head()
```
+For other use cases, please refer to the [Gravitino Virtual File System](./how-to-use-gvfs.md) document.

## Fileset with credential

From fbd57ba1e864e483d646fd7c0de54477fa216884 Mon Sep 17 00:00:00 2001
From: yuqi
Date: Mon, 6 Jan 2025 09:25:34 +0800
Subject: [PATCH 27/59] Again

---
 docs/hadoop-catalog-with-adls.md | 2 ++
 docs/hadoop-catalog-with-gcs.md  | 4 +++-
 docs/hadoop-catalog-with-oss.md  | 2 ++
 docs/hadoop-catalog-with-s3.md   | 2 ++
 4 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/docs/hadoop-catalog-with-adls.md b/docs/hadoop-catalog-with-adls.md
index 2c4b2667ed9..40b4c14f767 100644
--- a/docs/hadoop-catalog-with-adls.md
+++ b/docs/hadoop-catalog-with-adls.md
@@ -297,6 +297,8 @@ fs.mkdirs(filesetPath);
...
```

+Similar to Spark configurations, you need to add ADLS bundle jars to the classpath according to your environment.
+
### Using fileset with hadoop fs command

The following are examples of how to use the `hadoop fs` command to access the fileset in Hadoop 3.1.3.
diff --git a/docs/hadoop-catalog-with-gcs.md b/docs/hadoop-catalog-with-gcs.md
index e2d129e8545..4b29db5c016 100644
--- a/docs/hadoop-catalog-with-gcs.md
+++ b/docs/hadoop-catalog-with-gcs.md
@@ -290,6 +290,8 @@ fs.mkdirs(filesetPath);
...
```

+Similar to Spark configurations, you need to add GCS bundle jars to the classpath according to your environment.
+
### Using fileset with hadoop fs command

The following are examples of how to use the `hadoop fs` command to access the fileset in Hadoop 3.1.3.
@@ -403,7 +405,7 @@ Spark:

```python
spark = SparkSession.builder
-    .appName("gcs_fielset_test")
+    .appName("gcs_fileset_test")
    .config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs")
    .config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem")
    .config("spark.hadoop.fs.gravitino.server.uri", "http://localhost:8090")
diff --git a/docs/hadoop-catalog-with-oss.md b/docs/hadoop-catalog-with-oss.md
index 326c370064d..ecc89edd901 100644
--- a/docs/hadoop-catalog-with-oss.md
+++ b/docs/hadoop-catalog-with-oss.md
@@ -302,6 +302,8 @@ fs.mkdirs(filesetPath);
...
```

+Similar to Spark configurations, you need to add OSS bundle jars to the classpath according to your environment.
+
### Using fileset with hadoop fs command

The following are examples of how to use the `hadoop fs` command to access the fileset in Hadoop 3.1.3.
diff --git a/docs/hadoop-catalog-with-s3.md b/docs/hadoop-catalog-with-s3.md
index 0c184d7f387..94cc12b3d95 100644
--- a/docs/hadoop-catalog-with-s3.md
+++ b/docs/hadoop-catalog-with-s3.md
@@ -308,6 +308,8 @@ fs.mkdirs(filesetPath);
...
```

+Similar to Spark configurations, you need to add S3 bundle jars to the classpath according to your environment.
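+
+For example, when launching a standalone Java application, the command might look like this (jar names, paths, and the `YourApp` class are illustrative; pick the bundle variant described in the Spark section above):
+
+```shell
+java -cp /path/to/gravitino-aws-bundle-${gravitino-version}.jar:/path/to/gravitino-filesystem-hadoop3-runtime-${gravitino-version}.jar:your-app.jar YourApp
+```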
+ ### Using fileset with hadoop fs command The following are examples of how to use the `hadoop fs` command to access the fileset in Hadoop 3.1.3. From 193f4670847bf520024dab0cad345ba067156c2d Mon Sep 17 00:00:00 2001 From: yuqi Date: Mon, 6 Jan 2025 15:05:18 +0800 Subject: [PATCH 28/59] resolve comments. --- bundles/aliyun/build.gradle.kts | 2 +- ...vider.java => OSSCredentialsProvider.java} | 60 +++++++++---------- .../oss/fs/OSSFileSystemProvider.java | 2 +- bundles/aws/build.gradle.kts | 2 +- ...ovider.java => S3CredentialsProvider.java} | 54 ++++++++--------- .../gravitino/s3/fs/S3FileSystemProvider.java | 2 +- bundles/azure/build.gradle.kts | 3 +- .../abs/fs/AzureFileSystemProvider.java | 16 ++--- ....java => AzureSasCredentialsProvider.java} | 53 ++++++++-------- bundles/gcp/build.gradle.kts | 2 +- ...vider.java => GCSCredentialsProvider.java} | 34 +++++++---- .../gcs/fs/GCSFileSystemProvider.java | 4 +- 12 files changed, 121 insertions(+), 113 deletions(-) rename bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/{OSSCredentialProvider.java => OSSCredentialsProvider.java} (79%) rename bundles/aws/src/main/java/org/apache/gravitino/s3/fs/{S3CredentialProvider.java => S3CredentialsProvider.java} (81%) rename bundles/azure/src/main/java/org/apache/gravitino/abs/fs/{AzureSasCredentialProvider.java => AzureSasCredentialsProvider.java} (82%) rename bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/{GCSCredentialProvider.java => GCSCredentialsProvider.java} (85%) diff --git a/bundles/aliyun/build.gradle.kts b/bundles/aliyun/build.gradle.kts index b62570072f7..88febb4103a 100644 --- a/bundles/aliyun/build.gradle.kts +++ b/bundles/aliyun/build.gradle.kts @@ -29,6 +29,7 @@ dependencies { compileOnly(project(":catalogs:catalog-common")) compileOnly(project(":catalogs:catalog-hadoop")) compileOnly(project(":core")) + compileOnly(project(":clients:client-java-runtime", configuration = "shadow")) compileOnly(libs.hadoop3.client.api) compileOnly(libs.hadoop3.client.runtime) compileOnly(libs.hadoop3.oss) @@ -39,7 +40,6 @@ dependencies { implementation(project(":catalogs:hadoop-common")) { exclude("*") } - implementation(project(":clients:client-java-runtime", configuration = "shadow")) implementation(project(":clients:filesystem-hadoop3-common")) { exclude("*") } diff --git a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialProvider.java b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialsProvider.java similarity index 79% rename from bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialProvider.java rename to bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialsProvider.java index 0c21fa8eec3..67217fbf9cb 100644 --- a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialProvider.java +++ b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialsProvider.java @@ -28,9 +28,7 @@ import com.aliyun.oss.common.auth.CredentialsProvider; import com.aliyun.oss.common.auth.DefaultCredentials; import java.net.URI; -import java.util.Arrays; import java.util.Map; -import java.util.Optional; import org.apache.gravitino.NameIdentifier; import org.apache.gravitino.client.GravitinoClient; import org.apache.gravitino.credential.Credential; @@ -45,18 +43,18 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class OSSCredentialProvider implements CredentialsProvider { +public class OSSCredentialsProvider implements CredentialsProvider { - private static final Logger LOG = 
LoggerFactory.getLogger(OSSCredentialProvider.class); + private static final Logger LOG = LoggerFactory.getLogger(OSSCredentialsProvider.class); private Credentials basicCredentials; private final String filesetIdentifier; - private final GravitinoClient client; + private GravitinoClient client; private final Configuration configuration; private long expirationTime = Long.MAX_VALUE; private static final double EXPIRATION_TIME_FACTOR = 0.9D; - public OSSCredentialProvider(URI uri, Configuration conf) { + public OSSCredentialsProvider(URI uri, Configuration conf) { this.filesetIdentifier = conf.get(GravitinoVirtualFileSystemConfiguration.GVFS_FILESET_IDENTIFIER); this.client = GravitinoVirtualFileSystemUtils.createClient(conf); @@ -71,7 +69,13 @@ public Credentials getCredentials() { // If the credentials are null or about to expire, refresh the credentials. if (basicCredentials == null || System.currentTimeMillis() >= expirationTime) { synchronized (this) { - refresh(); + try { + refresh(); + } finally { + if (null != this.client) { + this.client.close(); + } + } } } @@ -82,13 +86,14 @@ private void refresh() { String[] idents = filesetIdentifier.split("\\."); String catalog = idents[1]; + this.client = GravitinoVirtualFileSystemUtils.createClient(configuration); FilesetCatalog filesetCatalog = client.loadCatalog(catalog).asFilesetCatalog(); Fileset fileset = filesetCatalog.loadFileset(NameIdentifier.of(idents[2], idents[3])); Credential[] credentials = fileset.supportsCredentials().getCredentials(); - Optional optionalCredential = getCredential(credentials); + Credential credential = getCredential(credentials); - if (!optionalCredential.isPresent()) { + if (credential == null) { LOG.warn("No credential found for fileset: {}, try to use static AKSK", filesetIdentifier); expirationTime = Long.MAX_VALUE; this.basicCredentials = @@ -98,7 +103,6 @@ private void refresh() { return; } - Credential credential = optionalCredential.get(); Map credentialMap = credential.toProperties(); String accessKeyId = credentialMap.get(GRAVITINO_OSS_SESSION_ACCESS_KEY_ID); @@ -126,29 +130,25 @@ private void refresh() { * uses static credential. * * @param credentials The credential array. - * @return An optional credential. + * @return A credential. Null if not found. */ - private Optional getCredential(Credential[] credentials) { + private Credential getCredential(Credential[] credentials) { // Use dynamic credential if found. 
- Optional dynamicCredential = - Arrays.stream(credentials) - .filter( - credential -> - credential - .credentialType() - .equals(OSSTokenCredential.OSS_TOKEN_CREDENTIAL_TYPE)) - .findFirst(); - if (dynamicCredential.isPresent()) { - return dynamicCredential; + for (Credential credential : credentials) { + if (credential.credentialType().equals(OSSTokenCredential.OSS_TOKEN_CREDENTIAL_TYPE)) { + return credential; + } + } + + // If dynamic credential not found, use the static one + for (Credential credential : credentials) { + if (credential + .credentialType() + .equals(OSSSecretKeyCredential.OSS_SECRET_KEY_CREDENTIAL_TYPE)) { + return credential; + } } - // If dynamic credential not found, use the static one if possible - return Arrays.stream(credentials) - .filter( - credential -> - credential - .credentialType() - .equals(OSSSecretKeyCredential.OSS_SECRET_KEY_CREDENTIAL_TYPE)) - .findFirst(); + return null; } } diff --git a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java index f4fa8a374fa..073bbb6c559 100644 --- a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java +++ b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java @@ -65,7 +65,7 @@ public FileSystem getFileSystem(Path path, Map config) throws IO && config.containsKey( GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_SERVER_URI_KEY)) { hadoopConfMap.put( - Constants.CREDENTIALS_PROVIDER_KEY, OSSCredentialProvider.class.getCanonicalName()); + Constants.CREDENTIALS_PROVIDER_KEY, OSSCredentialsProvider.class.getCanonicalName()); } hadoopConfMap.forEach(configuration::set); diff --git a/bundles/aws/build.gradle.kts b/bundles/aws/build.gradle.kts index 82a709dd472..ac31bd284bd 100644 --- a/bundles/aws/build.gradle.kts +++ b/bundles/aws/build.gradle.kts @@ -29,6 +29,7 @@ dependencies { compileOnly(project(":catalogs:catalog-common")) compileOnly(project(":catalogs:catalog-hadoop")) compileOnly(project(":core")) + compileOnly(project(":clients:client-java-runtime", configuration = "shadow")) compileOnly(libs.hadoop3.aws) compileOnly(libs.hadoop3.client.api) compileOnly(libs.hadoop3.client.runtime) @@ -39,7 +40,6 @@ dependencies { implementation(project(":catalogs:hadoop-common")) { exclude("*") } - implementation(project(":clients:client-java-runtime", configuration = "shadow")) implementation(project(":clients:filesystem-hadoop3-common")) { exclude("*") } diff --git a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialProvider.java b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialsProvider.java similarity index 81% rename from bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialProvider.java rename to bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialsProvider.java index c474abc83aa..5db9aac808e 100644 --- a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialProvider.java +++ b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialsProvider.java @@ -28,9 +28,7 @@ import com.amazonaws.auth.BasicAWSCredentials; import com.amazonaws.auth.BasicSessionCredentials; import java.net.URI; -import java.util.Arrays; import java.util.Map; -import java.util.Optional; import org.apache.gravitino.NameIdentifier; import org.apache.gravitino.client.GravitinoClient; import org.apache.gravitino.credential.Credential; @@ -45,10 +43,10 @@ import org.slf4j.Logger; import 
org.slf4j.LoggerFactory; -public class S3CredentialProvider implements AWSCredentialsProvider { +public class S3CredentialsProvider implements AWSCredentialsProvider { - private static final Logger LOG = LoggerFactory.getLogger(S3CredentialProvider.class); - private final GravitinoClient client; + private static final Logger LOG = LoggerFactory.getLogger(S3CredentialsProvider.class); + private GravitinoClient client; private final String filesetIdentifier; private final Configuration configuration; @@ -56,11 +54,10 @@ public class S3CredentialProvider implements AWSCredentialsProvider { private long expirationTime = Long.MAX_VALUE; private static final double EXPIRATION_TIME_FACTOR = 0.9D; - public S3CredentialProvider(final URI uri, final Configuration conf) { + public S3CredentialsProvider(final URI uri, final Configuration conf) { this.filesetIdentifier = conf.get(GravitinoVirtualFileSystemConfiguration.GVFS_FILESET_IDENTIFIER); this.configuration = conf; - this.client = GravitinoVirtualFileSystemUtils.createClient(conf); } @Override @@ -68,7 +65,13 @@ public AWSCredentials getCredentials() { // Refresh credentials if they are null or about to expire in 5 minutes if (basicSessionCredentials == null || System.currentTimeMillis() >= expirationTime) { synchronized (this) { - refresh(); + try { + refresh(); + } finally { + if (null != this.client) { + this.client.close(); + } + } } } @@ -81,14 +84,15 @@ public void refresh() { String[] idents = filesetIdentifier.split("\\."); String catalog = idents[1]; + this.client = GravitinoVirtualFileSystemUtils.createClient(configuration); FilesetCatalog filesetCatalog = client.loadCatalog(catalog).asFilesetCatalog(); Fileset fileset = filesetCatalog.loadFileset(NameIdentifier.of(idents[2], idents[3])); Credential[] credentials = fileset.supportsCredentials().getCredentials(); - Optional optionalCredential = getCredential(credentials); + Credential credential = getCredential(credentials); // Can't find any credential, use the default AKSK if possible. - if (!optionalCredential.isPresent()) { + if (credential == null) { LOG.warn("No credential found for fileset: {}, try to use static AKSK", filesetIdentifier); expirationTime = Long.MAX_VALUE; this.basicSessionCredentials = @@ -97,7 +101,6 @@ public void refresh() { return; } - Credential credential = optionalCredential.get(); Map credentialMap = credential.toProperties(); String accessKeyId = credentialMap.get(GRAVITINO_S3_SESSION_ACCESS_KEY_ID); @@ -126,27 +129,22 @@ public void refresh() { * uses static credential. * * @param credentials The credential array. - * @return An optional credential. + * @return A credential. Null if not found. */ - private Optional getCredential(Credential[] credentials) { + private Credential getCredential(Credential[] credentials) { // Use dynamic credential if found. 
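      // Note: scanning for S3_TOKEN before S3_SECRET_KEY encodes the same priority as in
      // the OSS provider — fall back to the static AK/SK pair only when no session token
      // is available.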
- Optional dynamicCredential = - Arrays.stream(credentials) - .filter( - credential -> - credential.credentialType().equals(S3TokenCredential.S3_TOKEN_CREDENTIAL_TYPE)) - .findFirst(); - if (dynamicCredential.isPresent()) { - return dynamicCredential; + for (Credential credential : credentials) { + if (credential.credentialType().equals(S3TokenCredential.S3_TOKEN_CREDENTIAL_TYPE)) { + return credential; + } } // If dynamic credential not found, use the static one - return Arrays.stream(credentials) - .filter( - credential -> - credential - .credentialType() - .equals(S3SecretKeyCredential.S3_SECRET_KEY_CREDENTIAL_TYPE)) - .findFirst(); + for (Credential credential : credentials) { + if (credential.credentialType().equals(S3SecretKeyCredential.S3_SECRET_KEY_CREDENTIAL_TYPE)) { + return credential; + } + } + return null; } } diff --git a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java index 44a9dfb0c40..f4972d12f3d 100644 --- a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java +++ b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java @@ -72,7 +72,7 @@ public FileSystem getFileSystem(Path path, Map config) throws IO // will have this key. if (config.containsKey(GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_SERVER_URI_KEY)) { configuration.set( - Constants.AWS_CREDENTIALS_PROVIDER, S3CredentialProvider.class.getCanonicalName()); + Constants.AWS_CREDENTIALS_PROVIDER, S3CredentialsProvider.class.getCanonicalName()); } // Hadoop-aws 2 does not support IAMInstanceCredentialsProvider diff --git a/bundles/azure/build.gradle.kts b/bundles/azure/build.gradle.kts index c64890af66a..9a760132445 100644 --- a/bundles/azure/build.gradle.kts +++ b/bundles/azure/build.gradle.kts @@ -28,7 +28,7 @@ dependencies { compileOnly(project(":api")) compileOnly(project(":catalogs:catalog-hadoop")) compileOnly(project(":core")) - + compileOnly(project(":clients:client-java-runtime", configuration = "shadow")) compileOnly(libs.hadoop3.abs) compileOnly(libs.hadoop3.client.api) compileOnly(libs.hadoop3.client.runtime) @@ -40,7 +40,6 @@ dependencies { exclude("*") } - implementation(project(":clients:client-java-runtime", configuration = "shadow")) implementation(project(":clients:filesystem-hadoop3-common")) { exclude("*") } diff --git a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java index 6a03361ee79..64c269fcce6 100644 --- a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java +++ b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java @@ -78,9 +78,9 @@ public FileSystem getFileSystem(@Nonnull Path path, @Nonnull Map if (config.containsKey(GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_SERVER_URI_KEY)) { // Test whether SAS works try { - AzureSasCredentialProvider azureSasCredentialProvider = new AzureSasCredentialProvider(); - azureSasCredentialProvider.initialize(configuration, null); - String sas = azureSasCredentialProvider.getSASToken(null, null, null, null); + AzureSasCredentialsProvider azureSasCredentialsProvider = new AzureSasCredentialsProvider(); + azureSasCredentialsProvider.initialize(configuration, null); + String sas = azureSasCredentialsProvider.getSASToken(null, null, null, null); if (sas != null) { String accountName = String.format( @@ 
-91,15 +91,15 @@ public FileSystem getFileSystem(@Nonnull Path path, @Nonnull Map FS_AZURE_ACCOUNT_AUTH_TYPE_PROPERTY_NAME + "." + accountName, AuthType.SAS.name()); configuration.set( FS_AZURE_SAS_TOKEN_PROVIDER_TYPE + "." + accountName, - AzureSasCredentialProvider.class.getName()); + AzureSasCredentialsProvider.class.getName()); configuration.set(FS_AZURE_ACCOUNT_IS_HNS_ENABLED, "true"); - } else if (azureSasCredentialProvider.getAzureStorageAccountKey() != null - && azureSasCredentialProvider.getAzureStorageAccountName() != null) { + } else if (azureSasCredentialsProvider.getAzureStorageAccountKey() != null + && azureSasCredentialsProvider.getAzureStorageAccountName() != null) { configuration.set( String.format( "fs.azure.account.key.%s.dfs.core.windows.net", - azureSasCredentialProvider.getAzureStorageAccountName()), - azureSasCredentialProvider.getAzureStorageAccountKey()); + azureSasCredentialsProvider.getAzureStorageAccountName()), + azureSasCredentialsProvider.getAzureStorageAccountKey()); } } catch (Exception e) { // Can't use SAS, use account key and account key instead diff --git a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialProvider.java b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java similarity index 82% rename from bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialProvider.java rename to bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java index 8c63d48b749..8bb1f025556 100644 --- a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialProvider.java +++ b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java @@ -25,9 +25,7 @@ import static org.apache.gravitino.credential.AzureAccountKeyCredential.GRAVITINO_AZURE_STORAGE_ACCOUNT_NAME; import java.io.IOException; -import java.util.Arrays; import java.util.Map; -import java.util.Optional; import org.apache.gravitino.NameIdentifier; import org.apache.gravitino.client.GravitinoClient; import org.apache.gravitino.credential.AzureAccountKeyCredential; @@ -42,9 +40,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class AzureSasCredentialProvider implements SASTokenProvider, Configurable { +public class AzureSasCredentialsProvider implements SASTokenProvider, Configurable { - private static final Logger LOGGER = LoggerFactory.getLogger(AzureSasCredentialProvider.class); + private static final Logger LOGGER = LoggerFactory.getLogger(AzureSasCredentialsProvider.class); private Configuration configuration; @@ -82,7 +80,7 @@ public Configuration getConf() { public void initialize(Configuration conf, String accountName) throws IOException { this.filesetIdentifier = conf.get(GravitinoVirtualFileSystemConfiguration.GVFS_FILESET_IDENTIFIER); - this.client = GravitinoVirtualFileSystemUtils.createClient(conf); + this.configuration = conf; } @Override @@ -90,7 +88,13 @@ public String getSASToken(String account, String fileSystem, String path, String // Refresh credentials if they are null or about to expire in 5 minutes if (sasToken == null || System.currentTimeMillis() >= expirationTime) { synchronized (this) { - refresh(); + try { + refresh(); + } finally { + if (null != this.client) { + this.client.close(); + } + } } } return sasToken; @@ -100,20 +104,19 @@ private void refresh() { String[] idents = filesetIdentifier.split("\\."); String catalog = idents[1]; + this.client = GravitinoVirtualFileSystemUtils.createClient(configuration); 
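     // Note: the client is re-created on each refresh because getSASToken() closes it in
     // the finally block above once the refresh attempt finishes.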
FilesetCatalog filesetCatalog = client.loadCatalog(catalog).asFilesetCatalog(); Fileset fileset = filesetCatalog.loadFileset(NameIdentifier.of(idents[2], idents[3])); Credential[] credentials = fileset.supportsCredentials().getCredentials(); - Optional optionalCredential = getCredential(credentials); + Credential credential = getCredential(credentials); - if (!optionalCredential.isPresent()) { + if (credential == null) { LOGGER.warn("No credentials found for fileset {}", filesetIdentifier); return; } - Credential credential = optionalCredential.get(); Map credentialMap = credential.toProperties(); - if (ADLS_TOKEN_CREDENTIAL_TYPE.equals(credentialMap.get(Credential.CREDENTIAL_TYPE))) { sasToken = credentialMap.get(GRAVITINO_ADLS_SAS_TOKEN); } else { @@ -135,25 +138,25 @@ private void refresh() { * uses static credential. * * @param credentials The credential array. - * @return An optional credential. + * @return A credential. Null if not found. */ - private Optional getCredential(Credential[] credentials) { + private Credential getCredential(Credential[] credentials) { // Use dynamic credential if found. - Optional dynamicCredential = - Arrays.stream(credentials) - .filter(credential -> credential.credentialType().equals(ADLS_TOKEN_CREDENTIAL_TYPE)) - .findFirst(); - if (dynamicCredential.isPresent()) { - return dynamicCredential; + for (Credential credential : credentials) { + if (credential.credentialType().equals(ADLS_TOKEN_CREDENTIAL_TYPE)) { + return credential; + } } // If dynamic credential not found, use the static one - return Arrays.stream(credentials) - .filter( - credential -> - credential - .credentialType() - .equals(AzureAccountKeyCredential.AZURE_ACCOUNT_KEY_CREDENTIAL_TYPE)) - .findFirst(); + for (Credential credential : credentials) { + if (credential + .credentialType() + .equals(AzureAccountKeyCredential.AZURE_ACCOUNT_KEY_CREDENTIAL_TYPE)) { + return credential; + } + } + + return null; } } diff --git a/bundles/gcp/build.gradle.kts b/bundles/gcp/build.gradle.kts index defb5098e72..2e33922e370 100644 --- a/bundles/gcp/build.gradle.kts +++ b/bundles/gcp/build.gradle.kts @@ -29,6 +29,7 @@ dependencies { compileOnly(project(":catalogs:catalog-common")) compileOnly(project(":catalogs:catalog-hadoop")) compileOnly(project(":core")) + compileOnly(project(":clients:client-java-runtime", configuration = "shadow")) compileOnly(libs.hadoop3.client.api) compileOnly(libs.hadoop3.client.runtime) @@ -40,7 +41,6 @@ dependencies { implementation(project(":catalogs:hadoop-common")) { exclude("*") } - implementation(project(":clients:client-java-runtime", configuration = "shadow")) implementation(project(":clients:filesystem-hadoop3-common")) { exclude("*") } diff --git a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialProvider.java b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java similarity index 85% rename from bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialProvider.java rename to bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java index ad0a927aeaf..958e75307a2 100644 --- a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialProvider.java +++ b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java @@ -36,8 +36,8 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class GCSCredentialProvider implements AccessTokenProvider { - private static final Logger LOG = LoggerFactory.getLogger(GCSCredentialProvider.class); +public class 
GCSCredentialsProvider implements AccessTokenProvider { + private static final Logger LOG = LoggerFactory.getLogger(GCSCredentialsProvider.class); private Configuration configuration; private GravitinoClient client; private String filesetIdentifier; @@ -53,6 +53,10 @@ public AccessToken getAccessToken() { refresh(); } catch (IOException e) { LOG.error("Failed to refresh the access token", e); + } finally { + if (null != this.client) { + this.client.close(); + } } } return accessToken; @@ -65,20 +69,20 @@ public void refresh() throws IOException { String[] idents = filesetIdentifier.split("\\."); String catalog = idents[1]; + this.client = GravitinoVirtualFileSystemUtils.createClient(configuration); FilesetCatalog filesetCatalog = client.loadCatalog(catalog).asFilesetCatalog(); Fileset fileset = filesetCatalog.loadFileset(NameIdentifier.of(idents[2], idents[3])); Credential[] credentials = fileset.supportsCredentials().getCredentials(); - Optional optionalCredential = getCredential(credentials); + Credential credential = getCredential(credentials); // Can't find any credential, use the default one. - if (!optionalCredential.isPresent()) { + if (null == credential) { LOG.warn( "No credential found for fileset: {}, try to use static JSON file", filesetIdentifier); return; } - Credential credential = optionalCredential.get(); Map credentialMap = credential.toProperties(); if (GCSTokenCredential.GCS_TOKEN_CREDENTIAL_TYPE.equals( @@ -101,7 +105,6 @@ public void setConf(Configuration configuration) { this.configuration = configuration; this.filesetIdentifier = configuration.get(GravitinoVirtualFileSystemConfiguration.GVFS_FILESET_IDENTIFIER); - this.client = GravitinoVirtualFileSystemUtils.createClient(configuration); } @Override @@ -114,14 +117,19 @@ public Configuration getConf() { * uses static credential. * * @param credentials The credential array. - * @return An optional credential. + * @return An credential. */ - private Optional getCredential(Credential[] credentials) { + private Credential getCredential(Credential[] credentials) { // Use dynamic credential if found. - return Arrays.stream(credentials) - .filter( - credential -> - credential.credentialType().equals(GCSTokenCredential.GCS_TOKEN_CREDENTIAL_TYPE)) - .findFirst(); + Optional optionalCredential = + Arrays.stream(credentials) + .filter( + credential -> + credential + .credentialType() + .equals(GCSTokenCredential.GCS_TOKEN_CREDENTIAL_TYPE)) + .findFirst(); + + return optionalCredential.orElse(null); } } diff --git a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java index 17a837f7451..f0bf792fe18 100644 --- a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java +++ b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java @@ -47,13 +47,13 @@ public FileSystem getFileSystem(Path path, Map config) throws IO .forEach(configuration::set); if (config.containsKey(GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_SERVER_URI_KEY)) { - AccessTokenProvider accessTokenProvider = new GCSCredentialProvider(); + AccessTokenProvider accessTokenProvider = new GCSCredentialsProvider(); accessTokenProvider.setConf(configuration); // Why is this check necessary?, if Gravitino fails to get any credentials, we fall back to // the default behavior of the GoogleHadoopFileSystem to use service account credentials. 
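     // Note: probing getAccessToken() up front means a misconfigured or unreachable
     // Gravitino server degrades gracefully — the token provider is only wired into the
     // configuration when a token can actually be fetched.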
if (accessTokenProvider.getAccessToken() != null) { configuration.set( - "fs.gs.auth.access.token.provider.impl", GCSCredentialProvider.class.getName()); + "fs.gs.auth.access.token.provider.impl", GCSCredentialsProvider.class.getName()); } } From a1f0989ff53123e91c4cf7109bd5b60181ba4c04 Mon Sep 17 00:00:00 2001 From: yuqi Date: Mon, 6 Jan 2025 16:21:08 +0800 Subject: [PATCH 29/59] fix --- .../oss/fs/OSSFileSystemProvider.java | 18 +++++++++++++++--- .../gcs/fs/GCSCredentialProvider.java | 2 +- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java index f4fa8a374fa..ea5f0586e83 100644 --- a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java +++ b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java @@ -61,9 +61,7 @@ public FileSystem getFileSystem(Path path, Map config) throws IO hadoopConfMap.put(OSS_FILESYSTEM_IMPL, AliyunOSSFileSystem.class.getCanonicalName()); } - if (!hadoopConfMap.containsKey(Constants.CREDENTIALS_PROVIDER_KEY) - && config.containsKey( - GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_SERVER_URI_KEY)) { + if (shouldSetCredentialsProviderExplicitly(config)) { hadoopConfMap.put( Constants.CREDENTIALS_PROVIDER_KEY, OSSCredentialProvider.class.getCanonicalName()); } @@ -73,6 +71,20 @@ public FileSystem getFileSystem(Path path, Map config) throws IO return AliyunOSSFileSystem.newInstance(path.toUri(), configuration); } + /** + * Check if the credential provider should be set explicitly. + * + *
   * <p>
When the credential provider is not set and the server URI is set (this means the call is + * from GVFS client), we need to manually set the credential provider + * + * @param config the configuration map + * @return true if the credential provider should be set explicitly + */ + private boolean shouldSetCredentialsProviderExplicitly(Map config) { + return !config.containsKey(Constants.CREDENTIALS_PROVIDER_KEY) + && config.containsKey(GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_SERVER_URI_KEY); + } + @Override public String scheme() { return "oss"; diff --git a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialProvider.java b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialProvider.java index ad0a927aeaf..645694da3a1 100644 --- a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialProvider.java +++ b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialProvider.java @@ -106,7 +106,7 @@ public void setConf(Configuration configuration) { @Override public Configuration getConf() { - return this.configuration; + return configuration; } /** From 4fb6e798146ebcee0298ec81517572f12eec6d23 Mon Sep 17 00:00:00 2001 From: yuqi Date: Mon, 6 Jan 2025 19:57:04 +0800 Subject: [PATCH 30/59] fix --- docs/hadoop-catalog-with-adls.md | 41 +++++++++++++------------- docs/hadoop-catalog-with-gcs.md | 49 ++++++++++++++++---------------- docs/hadoop-catalog-with-oss.md | 43 ++++++++++++++-------------- docs/hadoop-catalog-with-s3.md | 36 +++++++++++------------ 4 files changed, 83 insertions(+), 86 deletions(-) diff --git a/docs/hadoop-catalog-with-adls.md b/docs/hadoop-catalog-with-adls.md index 40b4c14f767..679a3d631b0 100644 --- a/docs/hadoop-catalog-with-adls.md +++ b/docs/hadoop-catalog-with-adls.md @@ -21,7 +21,7 @@ $ bin/gravitino-server.sh start The rest of this document shows how to use the Hadoop catalog with ADLS in Gravitino with a full example. -### Catalog a catalog +### Catalog a Hadoop catalog with ADLS Apart from configuration method in [Hadoop-catalog-catalog-configuration](./hadoop-catalog.md#catalog-properties), the following properties are required to configure a Hadoop catalog with ADLS: @@ -53,7 +53,7 @@ First, you need to create a Hadoop catalog with ADLS. The following example show ```shell curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ -H "Content-Type: application/json" -d '{ - "name": "catalog", + "name": "example_catalog", "type": "FILESET", "comment": "comment", "provider": "hadoop", @@ -82,7 +82,7 @@ adlsProperties = ImmutableMap.builder() .put("filesystem-providers", "abs") .build(); -Catalog adlsCatalog = gravitinoClient.createCatalog("catalog", +Catalog adlsCatalog = gravitinoClient.createCatalog("example_catalog", Type.FILESET, "hadoop", // provider, Gravitino only supports "hadoop" for now. 
"This is a ADLS fileset catalog", @@ -102,7 +102,7 @@ adls_properties = { "azure_storage_account_key": "azure storage account key" } -adls_properties = gravitino_client.create_catalog(name="catalog", +adls_properties = gravitino_client.create_catalog(name="example_catalog", type=Catalog.Type.FILESET, provider="hadoop", comment="This is a ADLS fileset catalog", @@ -123,27 +123,26 @@ Using the following code to create a schema and fileset: ```shell curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ -H "Content-Type: application/json" -d '{ - "name": "schema", + "name": "test_schema", "comment": "comment", "properties": { "location": "abfss://container@account-name.dfs.core.windows.net/path" } -}' http://localhost:8090/api/metalakes/metalake/catalogs/catalog/schemas +}' http://localhost:8090/api/metalakes/metalake/catalogs/test_catalog/schemas ``` ```java -// Assuming you have just created a Hive catalog named `hive_catalog` -Catalog catalog = gravitinoClient.loadCatalog("hive_catalog"); +Catalog catalog = gravitinoClient.loadCatalog("test_catalog"); SupportsSchemas supportsSchemas = catalog.asSchemas(); Map schemaProperties = ImmutableMap.builder() .put("location", "abfss://container@account-name.dfs.core.windows.net/path") .build(); -Schema schema = supportsSchemas.createSchema("schema", +Schema schema = supportsSchemas.createSchema("test_schema", "This is a schema", schemaProperties ); @@ -155,8 +154,8 @@ Schema schema = supportsSchemas.createSchema("schema", ```python gravitino_client: GravitinoClient = GravitinoClient(uri="http://127.0.0.1:8090", metalake_name="metalake") -catalog: Catalog = gravitino_client.load_catalog(name="hive_catalog") -catalog.as_schemas().create_schema(name="schema", +catalog: Catalog = gravitino_client.load_catalog(name="test_catalog") +catalog.as_schemas().create_schema(name="test_schema", comment="This is a schema", properties={"location": "abfss://container@account-name.dfs.core.windows.net/path"}) ``` @@ -177,7 +176,7 @@ curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ "properties": { "k1": "v1" } -}' http://localhost:8090/api/metalakes/metalake/catalogs/catalog/schemas/schema/filesets +}' http://localhost:8090/api/metalakes/metalake/catalogs/test_catalog/schemas/test_schema/filesets ``` @@ -189,7 +188,7 @@ GravitinoClient gravitinoClient = GravitinoClient .withMetalake("metalake") .build(); -Catalog catalog = gravitinoClient.loadCatalog("catalog"); +Catalog catalog = gravitinoClient.loadCatalog("test_catalog"); FilesetCatalog filesetCatalog = catalog.asFilesetCatalog(); Map propertiesMap = ImmutableMap.builder() @@ -197,7 +196,7 @@ Map propertiesMap = ImmutableMap.builder() .build(); filesetCatalog.createFileset( - NameIdentifier.of("schema", "example_fileset"), + NameIdentifier.of("test_schema", "example_fileset"), "This is an example fileset", Fileset.Type.MANAGED, "abfss://container@account-name.dfs.core.windows.net/path/example_fileset", @@ -211,8 +210,8 @@ filesetCatalog.createFileset( ```python gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake") -catalog: Catalog = gravitino_client.load_catalog(name="catalog") -catalog.as_fileset_catalog().create_fileset(ident=NameIdentifier.of("schema", "example_fileset"), +catalog: Catalog = gravitino_client.load_catalog(name="test_catalog") +catalog.as_fileset_catalog().create_fileset(ident=NameIdentifier.of("test_schema", "example_fileset"), type=Fileset.Type.MANAGED, comment="This is an example fileset", 
storage_location="abfss://container@account-name.dfs.core.windows.net/path/example_fileset", @@ -244,7 +243,7 @@ spark = SparkSession.builder .appName("adls_fileset_test") .config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs") .config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem") -.config("spark.hadoop.fs.gravitino.server.uri", "http://localhost:8090") +.config("spark.hadoop.fs.gravitino.server.uri", "${GRAVITINO_SERVER_URL}") .config("spark.hadoop.fs.gravitino.client.metalake", "test") .config("spark.hadoop.azure-storage-account-name", "azure_account_name") .config("spark.hadoop.azure-storage-account-key", "azure_account_name") @@ -273,12 +272,12 @@ os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-azure-bundle-{gra ``` - [`gravitino-azure-bundle-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-azure-bundle) is the Gravitino ADLS jar with Hadoop environment and `hadoop-azure` jar. -- [`gravitino-azure-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-azure) is the Gravitino ADLS jar without Hadoop environment and `hadoop-azure` jar. +- [`gravitino-azure-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-azure) is a condensed version of the Gravitino ADLS bundle jar without Hadoop environment and `hadoop-azure` jar. Please choose the correct jar according to your environment. :::note -In some Spark version, Hadoop environment is needed by the driver, adding the bundle jars with '--jars' may not work, in this case, you should add the jars to the spark classpath directly. +In some Spark versions, a Hadoop environment is needed by the driver, adding the bundle jars with '--jars' may not work. If this is the case, you should add the jars to the spark CLASSPATH directly. ::: ### Using Gravitino virtual file system Java client to access the fileset @@ -299,7 +298,7 @@ fs.mkdirs(filesetPath); Similar to Spark configurations, you need to add ADLS bundle jars to the classpath according to your environment. -### Using fileset with hadoop fs command +### Accessing a fileset using the Hadoop fs command The following are examples of how to use the `hadoop fs` command to access the fileset in Hadoop 3.1.3. @@ -349,7 +348,7 @@ hadoop dfs -ls gvfs://fileset/adls_catalog/schema/example hadoop dfs -put /path/to/local/file gvfs://fileset/adls_catalog/schema/example ``` -### Using Gravitino virtual file system Python client +### Using the Gravitino virtual file system Python client to access a fileset ```python from gravitino import gvfs diff --git a/docs/hadoop-catalog-with-gcs.md b/docs/hadoop-catalog-with-gcs.md index 4b29db5c016..0e409bc4016 100644 --- a/docs/hadoop-catalog-with-gcs.md +++ b/docs/hadoop-catalog-with-gcs.md @@ -22,13 +22,13 @@ $ bin/gravitino-server.sh start The rest of this document shows how to use the Hadoop catalog with GCS in Gravitino with a full example. 
-### Catalog a catalog +### Catalog a Hadoop catalog with GCS Apart from configuration method in [Hadoop-catalog-catalog-configuration](./hadoop-catalog.md#catalog-properties), the following properties are required to configure a Hadoop catalog with GCS: | Configuration item | Description | Default value | Required | Since version | |-------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------|----------------------------|------------------| -| `filesystem-providers` | The file system providers to add. Set it to `gs` if it's a GCS fileset, a comma separated string that contains `gs` like `gs,s3` to support multiple kinds of fileset including `gs`. | (none) | Yes | 0.7.0-incubating | +| `filesystem-providers` | The file system providers to add. Set it to `gcs` if it's a GCS fileset, a comma separated string that contains `gcs` like `gcs,s3` to support multiple kinds of fileset including `gcs`. | (none) | Yes | 0.7.0-incubating | | `default-filesystem-provider` | The name default filesystem providers of this Hadoop catalog if users do not specify the scheme in the URI. Default value is `builtin-local`, for GCS, if we set this value, we can omit the prefix 'gs://' in the location. | `builtin-local` | No | 0.7.0-incubating | | `gcs-service-account-file` | The path of GCS service account JSON file. | (none) | Yes if it's a GCS fileset. | 0.7.0-incubating | @@ -52,7 +52,7 @@ First, you need to create a Hadoop catalog with GCS. The following example shows ```shell curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ -H "Content-Type: application/json" -d '{ - "name": "catalog", + "name": "test_catalog", "type": "FILESET", "comment": "comment", "provider": "hadoop", @@ -79,7 +79,7 @@ gcsProperties = ImmutableMap.builder() .put("filesystem-providers", "gcs") .build(); -Catalog gcsCatalog = gravitinoClient.createCatalog("catalog", +Catalog gcsCatalog = gravitinoClient.createCatalog("test_catalog", Type.FILESET, "hadoop", // provider, Gravitino only supports "hadoop" for now. "This is a GCS fileset catalog", @@ -98,7 +98,7 @@ gcs_properties = { "gcs-service-account-file": "path_of_gcs_service_account_file" } -gcs_properties = gravitino_client.create_catalog(name="catalog", +gcs_properties = gravitino_client.create_catalog(name="test_catalog", type=Catalog.Type.FILESET, provider="hadoop", comment="This is a GCS fileset catalog", @@ -109,7 +109,7 @@ gcs_properties = gravitino_client.create_catalog(name="catalog", -Then create a schema and fileset in the catalog created above. +Then create a schema and a fileset in the catalog created above. 
Using the following code to create a schema and fileset: @@ -119,27 +119,26 @@ Using the following code to create a schema and fileset: ```shell curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ -H "Content-Type: application/json" -d '{ - "name": "schema", + "name": "test_schema", "comment": "comment", "properties": { "location": "gs://bucket/root/schema" } -}' http://localhost:8090/api/metalakes/metalake/catalogs/catalog/schemas +}' http://localhost:8090/api/metalakes/metalake/catalogs/test_catalog/schemas ``` ```java -// Assuming you have just created a Hive catalog named `hive_catalog` -Catalog catalog = gravitinoClient.loadCatalog("hive_catalog"); +Catalog catalog = gravitinoClient.loadCatalog("test_catalog"); SupportsSchemas supportsSchemas = catalog.asSchemas(); Map schemaProperties = ImmutableMap.builder() .put("location", "gs://bucket/root/schema") .build(); -Schema schema = supportsSchemas.createSchema("schema", +Schema schema = supportsSchemas.createSchema("test_schema", "This is a schema", schemaProperties ); @@ -151,8 +150,8 @@ Schema schema = supportsSchemas.createSchema("schema", ```python gravitino_client: GravitinoClient = GravitinoClient(uri="http://127.0.0.1:8090", metalake_name="metalake") -catalog: Catalog = gravitino_client.load_catalog(name="hive_catalog") -catalog.as_schemas().create_schema(name="schema", +catalog: Catalog = gravitino_client.load_catalog(name="test_catalog") +catalog.as_schemas().create_schema(name="test_schema", comment="This is a schema", properties={"location": "gs://bucket/root/schema"}) ``` @@ -173,7 +172,7 @@ curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ "properties": { "k1": "v1" } -}' http://localhost:8090/api/metalakes/metalake/catalogs/catalog/schemas/schema/filesets +}' http://localhost:8090/api/metalakes/metalake/catalogs/test_catalog/schemas/test_schema/filesets ``` @@ -185,7 +184,7 @@ GravitinoClient gravitinoClient = GravitinoClient .withMetalake("metalake") .build(); -Catalog catalog = gravitinoClient.loadCatalog("catalog"); +Catalog catalog = gravitinoClient.loadCatalog("test_catalog"); FilesetCatalog filesetCatalog = catalog.asFilesetCatalog(); Map propertiesMap = ImmutableMap.builder() @@ -193,7 +192,7 @@ Map propertiesMap = ImmutableMap.builder() .build(); filesetCatalog.createFileset( - NameIdentifier.of("schema", "example_fileset"), + NameIdentifier.of("test_schema", "example_fileset"), "This is an example fileset", Fileset.Type.MANAGED, "gs://bucket/root/schema/example_fileset", @@ -207,8 +206,8 @@ filesetCatalog.createFileset( ```python gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake") -catalog: Catalog = gravitino_client.load_catalog(name="catalog") -catalog.as_fileset_catalog().create_fileset(ident=NameIdentifier.of("schema", "example_fileset"), +catalog: Catalog = gravitino_client.load_catalog(name="test_catalog") +catalog.as_fileset_catalog().create_fileset(ident=NameIdentifier.of("test_schema", "example_fileset"), type=Fileset.Type.MANAGED, comment="This is an example fileset", storage_location="gs://bucket/root/schema/example_fileset", @@ -240,8 +239,8 @@ spark = SparkSession.builder .appName("gcs_fielset_test") .config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs") .config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem") -.config("spark.hadoop.fs.gravitino.server.uri", "http://localhost:8090") 
-.config("spark.hadoop.fs.gravitino.client.metalake", "test") +.config("spark.hadoop.fs.gravitino.server.uri", "${GRAVITINO_SERVER_URL}") +.config("spark.hadoop.fs.gravitino.client.metalake", "test_metalake") .config("spark.hadoop.gcs-service-account-file", "/path/to/gcs-service-account-file.json") .config("spark.driver.memory", "2g") .config("spark.driver.port", "2048") @@ -266,13 +265,13 @@ If your Spark **without Hadoop environment**, you can use the following code sni os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-gcp-bundle-{gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-{gravitino-version}.jar, --master local[1] pyspark-shell" ``` -- [`gravitino-gcp-bundle-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-gcp-bundle) is the Gravitino GCS jar with Hadoop environment and `gcs-connector` jar. -- [`gravitino-gcp-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-gcp) is the Gravitino GCS jar without Hadoop environment and `gcs-connector` jar. +- [`gravitino-gcp-bundle-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-gcp-bundle) is the Gravitino GCP jar with Hadoop environment and `gcs-connector`. +- [`gravitino-gcp-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-gcp) is a condensed version of the Gravitino GCP bundle jar without Hadoop environment and `gcs-connector`. Please choose the correct jar according to your environment. :::note -In some Spark version, Hadoop environment is needed by the driver, adding the bundle jars with '--jars' may not work, in this case, you should add the jars to the spark classpath directly. +In some Spark versions, a Hadoop environment is needed by the driver, adding the bundle jars with '--jars' may not work. If this is the case, you should add the jars to the spark CLASSPATH directly. ::: ### Using Gravitino virtual file system Java client to access the fileset @@ -292,7 +291,7 @@ fs.mkdirs(filesetPath); Similar to Spark configurations, you need to add GCS bundle jars to the classpath according to your environment. -### Using fileset with hadoop fs command +### Accessing a fileset using the Hadoop fs command The following are examples of how to use the `hadoop fs` command to access the fileset in Hadoop 3.1.3. @@ -339,7 +338,7 @@ hadoop dfs -put /path/to/local/file gvfs://fileset/gcs_catalog/schema/example ``` -### Using Gravitino virtual file system Python client +### Using the Gravitino virtual file system Python client to access a fileset ```python from gravitino import gvfs diff --git a/docs/hadoop-catalog-with-oss.md b/docs/hadoop-catalog-with-oss.md index ecc89edd901..5b118836e95 100644 --- a/docs/hadoop-catalog-with-oss.md +++ b/docs/hadoop-catalog-with-oss.md @@ -44,7 +44,7 @@ Refer to [Fileset operation](./manage-fileset-metadata-using-gravitino.md#filese The rest of this document shows how to use the Hadoop catalog with OSS in Gravitino with a full example. -### Create a Hadoop catalog/schema/file set with OSS +### Create a Hadoop catalog/schema/fileset with OSS First, you need to create a Hadoop catalog with OSS. The following example shows how to create a Hadoop catalog with OSS: @@ -54,7 +54,7 @@ First, you need to create a Hadoop catalog with OSS. 
The following example shows ```shell curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ -H "Content-Type: application/json" -d '{ - "name": "catalog", + "name": "test_catalog", "type": "FILESET", "comment": "comment", "provider": "hadoop", @@ -85,7 +85,7 @@ ossProperties = ImmutableMap.builder() .put("filesystem-providers", "oss") .build(); -Catalog ossCatalog = gravitinoClient.createCatalog("catalog", +Catalog ossCatalog = gravitinoClient.createCatalog("test_catalog", Type.FILESET, "hadoop", // provider, Gravitino only supports "hadoop" for now. "This is a OSS fileset catalog", @@ -106,7 +106,7 @@ oss_properties = { "oss-endpoint": "ossProperties" } -oss_catalog = gravitino_client.create_catalog(name="catalog", +oss_catalog = gravitino_client.create_catalog(name="test_catalog", type=Catalog.Type.FILESET, provider="hadoop", comment="This is a OSS fileset catalog", @@ -117,7 +117,7 @@ oss_catalog = gravitino_client.create_catalog(name="catalog", -Then create a schema and fileset in the catalog created above. +Then create a schema and a fileset in the catalog created above. Using the following code to create a schema and fileset: @@ -127,27 +127,26 @@ Using the following code to create a schema and fileset: ```shell curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ -H "Content-Type: application/json" -d '{ - "name": "schema", + "name": "test_schema", "comment": "comment", "properties": { "location": "oss://bucket/root/schema" } -}' http://localhost:8090/api/metalakes/metalake/catalogs/catalog/schemas +}' http://localhost:8090/api/metalakes/metalake/catalogs/test_catalog/schemas ``` ```java -// Assuming you have just created a Hive catalog named `hive_catalog` -Catalog catalog = gravitinoClient.loadCatalog("hive_catalog"); +Catalog catalog = gravitinoClient.loadCatalog("test_catalog"); SupportsSchemas supportsSchemas = catalog.asSchemas(); Map schemaProperties = ImmutableMap.builder() .put("location", "oss://bucket/root/schema") .build(); -Schema schema = supportsSchemas.createSchema("schema", +Schema schema = supportsSchemas.createSchema("test_schema", "This is a schema", schemaProperties ); @@ -159,8 +158,8 @@ Schema schema = supportsSchemas.createSchema("schema", ```python gravitino_client: GravitinoClient = GravitinoClient(uri="http://127.0.0.1:8090", metalake_name="metalake") -catalog: Catalog = gravitino_client.load_catalog(name="hive_catalog") -catalog.as_schemas().create_schema(name="schema", +catalog: Catalog = gravitino_client.load_catalog(name="test_catalog") +catalog.as_schemas().create_schema(name="test_schema", comment="This is a schema", properties={"location": "oss://bucket/root/schema"}) ``` @@ -181,7 +180,7 @@ curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ "properties": { "k1": "v1" } -}' http://localhost:8090/api/metalakes/metalake/catalogs/catalog/schemas/schema/filesets +}' http://localhost:8090/api/metalakes/metalake/catalogs/test_catalog/schemas/test_schema/filesets ``` @@ -193,7 +192,7 @@ GravitinoClient gravitinoClient = GravitinoClient .withMetalake("metalake") .build(); -Catalog catalog = gravitinoClient.loadCatalog("catalog"); +Catalog catalog = gravitinoClient.loadCatalog("test_catalog"); FilesetCatalog filesetCatalog = catalog.asFilesetCatalog(); Map propertiesMap = ImmutableMap.builder() @@ -201,7 +200,7 @@ Map propertiesMap = ImmutableMap.builder() .build(); filesetCatalog.createFileset( - NameIdentifier.of("schema", "example_fileset"), + NameIdentifier.of("test_schema", "example_fileset"), "This is an example fileset", 
Fileset.Type.MANAGED, "oss://bucket/root/schema/example_fileset", @@ -215,8 +214,8 @@ filesetCatalog.createFileset( ```python gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake") -catalog: Catalog = gravitino_client.load_catalog(name="catalog") -catalog.as_fileset_catalog().create_fileset(ident=NameIdentifier.of("schema", "example_fileset"), +catalog: Catalog = gravitino_client.load_catalog(name="test_catalog") +catalog.as_fileset_catalog().create_fileset(ident=NameIdentifier.of("test_schema", "example_fileset"), type=Fileset.Type.MANAGED, comment="This is an example fileset", storage_location="oss://bucket/root/schema/example_fileset", @@ -248,7 +247,7 @@ spark = SparkSession.builder .appName("oss_fielset_test") .config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs") .config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem") -.config("spark.hadoop.fs.gravitino.server.uri", "http://localhost:8090") +.config("spark.hadoop.fs.gravitino.server.uri", "${GRAVITINO_SERVER_URL}") .config("spark.hadoop.fs.gravitino.client.metalake", "test") .config("spark.hadoop.oss-access-key-id", os.environ["OSS_ACCESS_KEY_ID"]) .config("spark.hadoop.oss-secret-access-key", os.environ["OSS_SECRET_ACCESS_KEY"]) @@ -277,12 +276,12 @@ os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-aliyun-bundle-{gr ``` - [`gravitino-aliyun-bundle-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-aliyun-bundle) is the Gravitino Aliyun jar with Hadoop environment and `hadoop-oss` jar. -- [`gravitino-aliyun-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-aliyun) is the Gravitino OSS jar without Hadoop environment and `hadoop-oss` jar. +- [`gravitino-aliyun-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-aliyun) is a condensed version of the Gravitino Aliyun bundle jar without Hadoop environment and `hadoop-aliyun` jar. Please choose the correct jar according to your environment. :::note -In some Spark version, Hadoop environment is needed by the driver, adding the bundle jars with '--jars' may not work, in this case, you should add the jars to the spark classpath directly. +In some Spark versions, a Hadoop environment is needed by the driver, adding the bundle jars with '--jars' may not work. If this is the case, you should add the jars to the spark CLASSPATH directly. ::: ### Using Gravitino virtual file system Java client to access the fileset @@ -304,7 +303,7 @@ fs.mkdirs(filesetPath); Similar to Spark configurations, you need to add OSS bundle jars to the classpath according to your environment. -### Using fileset with hadoop fs command +### Accessing a fileset using the Hadoop fs command The following are examples of how to use the `hadoop fs` command to access the fileset in Hadoop 3.1.3. 
@@ -361,7 +360,7 @@ hadoop dfs -put /path/to/local/file gvfs://fileset/oss_catalog/schema/example ``` -### Using Gravitino virtual file system Python client +### Using Gravitino virtual file system Python client to access a fileset ```python from gravitino import gvfs diff --git a/docs/hadoop-catalog-with-s3.md b/docs/hadoop-catalog-with-s3.md index 94cc12b3d95..b7cc30f26c5 100644 --- a/docs/hadoop-catalog-with-s3.md +++ b/docs/hadoop-catalog-with-s3.md @@ -19,7 +19,7 @@ $ bin/gravitino-server.sh start ## Create a Hadoop Catalog with S3 in Gravitino -### Catalog a catalog +### Catalog a Hadoop catalog with S3 Apart from configuration method in [Hadoop-catalog-catalog-configuration](./hadoop-catalog.md#catalog-properties), the following properties are required to configure a Hadoop catalog with S3: @@ -54,7 +54,7 @@ First of all, you need to create a Hadoop catalog with S3. The following example ```shell curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ -H "Content-Type: application/json" -d '{ - "name": "catalog", + "name": "test_catalog", "type": "FILESET", "comment": "comment", "provider": "hadoop", @@ -85,7 +85,7 @@ s3Properties = ImmutableMap.builder() .put("filesystem-providers", "s3") .build(); -Catalog s3Catalog = gravitinoClient.createCatalog("catalog", +Catalog s3Catalog = gravitinoClient.createCatalog("test_catalog", Type.FILESET, "hadoop", // provider, Gravitino only supports "hadoop" for now. "This is a S3 fileset catalog", @@ -106,7 +106,7 @@ s3_properties = { "s3-endpoint": "http://s3.ap-northeast-1.amazonaws.com" } -s3_catalog = gravitino_client.create_catalog(name="catalog", +s3_catalog = gravitino_client.create_catalog(name="test_catalog", type=Catalog.Type.FILESET, provider="hadoop", comment="This is a S3 fileset catalog", @@ -121,7 +121,7 @@ s3_catalog = gravitino_client.create_catalog(name="catalog", The value of location should always start with `s3a` NOT `s3` for AWS S3, for instance, `s3a://bucket/root`. Value like `s3://bucket/root` is not supported due to the limitation of the hadoop-aws library. ::: -Then create a schema and fileset in the catalog created above. +Then create a schema and a fileset in the catalog created above. 
Using the following code to create a schema and fileset: @@ -131,12 +131,12 @@ Using the following code to create a schema and fileset: ```shell curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ -H "Content-Type: application/json" -d '{ - "name": "schema", + "name": "test_schema", "comment": "comment", "properties": { "location": "s3a://bucket/root/schema" } -}' http://localhost:8090/api/metalakes/metalake/catalogs/catalog/schemas +}' http://localhost:8090/api/metalakes/metalake/catalogs/test_catalog/schemas ``` @@ -151,7 +151,7 @@ SupportsSchemas supportsSchemas = catalog.asSchemas(); Map schemaProperties = ImmutableMap.builder() .put("location", "s3a://bucket/root/schema") .build(); -Schema schema = supportsSchemas.createSchema("schema", +Schema schema = supportsSchemas.createSchema("test_schema", "This is a schema", schemaProperties ); @@ -163,8 +163,8 @@ Schema schema = supportsSchemas.createSchema("schema", ```python gravitino_client: GravitinoClient = GravitinoClient(uri="http://127.0.0.1:8090", metalake_name="metalake") -catalog: Catalog = gravitino_client.load_catalog(name="hive_catalog") -catalog.as_schemas().create_schema(name="schema", +catalog: Catalog = gravitino_client.load_catalog(name="test_catalog") +catalog.as_schemas().create_schema(name="test_schema", comment="This is a schema", properties={"location": "s3a://bucket/root/schema"}) ``` @@ -185,7 +185,7 @@ curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ "properties": { "k1": "v1" } -}' http://localhost:8090/api/metalakes/metalake/catalogs/catalog/schemas/schema/filesets +}' http://localhost:8090/api/metalakes/metalake/catalogs/test_catalog/schemas/test_schema/filesets ``` @@ -197,7 +197,7 @@ GravitinoClient gravitinoClient = GravitinoClient .withMetalake("metalake") .build(); -Catalog catalog = gravitinoClient.loadCatalog("catalog"); +Catalog catalog = gravitinoClient.loadCatalog("test_catalog"); FilesetCatalog filesetCatalog = catalog.asFilesetCatalog(); Map propertiesMap = ImmutableMap.builder() @@ -205,7 +205,7 @@ Map propertiesMap = ImmutableMap.builder() .build(); filesetCatalog.createFileset( - NameIdentifier.of("schema", "example_fileset"), + NameIdentifier.of("test_schema", "example_fileset"), "This is an example fileset", Fileset.Type.MANAGED, "s3a://bucket/root/schema/example_fileset", @@ -253,7 +253,7 @@ spark = SparkSession.builder .appName("s3_fielset_test") .config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs") .config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem") - .config("spark.hadoop.fs.gravitino.server.uri", "http://localhost:8090") + .config("spark.hadoop.fs.gravitino.server.uri", "${GRAVITINO_SERVER_URL}") .config("spark.hadoop.fs.gravitino.client.metalake", "test") .config("spark.hadoop.s3-access-key-id", os.environ["S3_ACCESS_KEY_ID"]) .config("spark.hadoop.s3-secret-access-key", os.environ["S3_SECRET_ACCESS_KEY"]) @@ -281,12 +281,12 @@ os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-aws-${gravitino-v ``` - [`gravitino-aws-bundle-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-aws-bundle) is the Gravitino AWS jar with Hadoop environment and `hadoop-aws` jar. -- [`gravitino-aws-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-aws) is the Gravitino AWS jar without Hadoop environment and `hadoop-aws` jar. 
+- [`gravitino-aws-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-aws) is a condensed version of the Gravitino AWS bundle jar without Hadoop environment and `hadoop-aws` jar. Please choose the correct jar according to your environment. :::note -In some Spark version, Hadoop environment is needed by the driver, adding the bundle jars with '--jars' may not work, in this case, you should add the jars to the spark classpath directly. +In some Spark versions, a Hadoop environment is needed by the driver, adding the bundle jars with '--jars' may not work. If this is the case, you should add the jars to the spark CLASSPATH directly. ::: ### Using Gravitino virtual file system Java client to access the fileset @@ -310,7 +310,7 @@ fs.mkdirs(filesetPath); Similar to Spark configurations, you need to add S3 bundle jars to the classpath according to your environment. -### Using fileset with hadoop fs command +### Accessing a fileset using the Hadoop fs command The following are examples of how to use the `hadoop fs` command to access the fileset in Hadoop 3.1.3. @@ -366,7 +366,7 @@ hadoop dfs -ls gvfs://fileset/s3_catalog/schema/example hadoop dfs -put /path/to/local/file gvfs://fileset/s3_catalog/schema/example ``` -### Using Gravitino virtual file system Python client +### Using the Gravitino virtual file system Python client to access a fileset ```python from gravitino import gvfs From 755a4743cb7d83bd7694d80bf58a83d59e9c9535 Mon Sep 17 00:00:00 2001 From: yuqi Date: Mon, 6 Jan 2025 19:57:25 +0800 Subject: [PATCH 31/59] fix comments. --- bundles/aliyun/build.gradle.kts | 2 +- .../oss/fs/OSSCredentialsProvider.java | 30 ++++++++---------- bundles/aws/build.gradle.kts | 2 +- .../s3/fs/S3CredentialsProvider.java | 31 ++++++++----------- .../gravitino/s3/fs/S3FileSystemProvider.java | 3 +- bundles/azure/build.gradle.kts | 2 +- .../abs/fs/AzureFileSystemProvider.java | 2 +- .../abs/fs/AzureSasCredentialsProvider.java | 20 ++++++------ bundles/gcp/build.gradle.kts | 3 +- .../gcs/fs/GCSCredentialsProvider.java | 11 ++----- .../gcs/fs/GCSFileSystemProvider.java | 7 +++-- 11 files changed, 50 insertions(+), 63 deletions(-) diff --git a/bundles/aliyun/build.gradle.kts b/bundles/aliyun/build.gradle.kts index 88febb4103a..d27e055d261 100644 --- a/bundles/aliyun/build.gradle.kts +++ b/bundles/aliyun/build.gradle.kts @@ -29,7 +29,7 @@ dependencies { compileOnly(project(":catalogs:catalog-common")) compileOnly(project(":catalogs:catalog-hadoop")) compileOnly(project(":core")) - compileOnly(project(":clients:client-java-runtime", configuration = "shadow")) + compileOnly(project(":clients:client-java")) compileOnly(libs.hadoop3.client.api) compileOnly(libs.hadoop3.client.runtime) compileOnly(libs.hadoop3.oss) diff --git a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialsProvider.java b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialsProvider.java index 67217fbf9cb..79b4862336a 100644 --- a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialsProvider.java +++ b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialsProvider.java @@ -19,16 +19,11 @@ package org.apache.gravitino.oss.fs; -import static org.apache.gravitino.credential.OSSTokenCredential.GRAVITINO_OSS_SESSION_ACCESS_KEY_ID; -import static org.apache.gravitino.credential.OSSTokenCredential.GRAVITINO_OSS_SESSION_SECRET_ACCESS_KEY; -import static org.apache.gravitino.credential.OSSTokenCredential.GRAVITINO_OSS_TOKEN; - import 
com.aliyun.oss.common.auth.BasicCredentials; import com.aliyun.oss.common.auth.Credentials; import com.aliyun.oss.common.auth.CredentialsProvider; import com.aliyun.oss.common.auth.DefaultCredentials; import java.net.URI; -import java.util.Map; import org.apache.gravitino.NameIdentifier; import org.apache.gravitino.client.GravitinoClient; import org.apache.gravitino.credential.Credential; @@ -86,7 +81,7 @@ private void refresh() { String[] idents = filesetIdentifier.split("\\."); String catalog = idents[1]; - this.client = GravitinoVirtualFileSystemUtils.createClient(configuration); + client = GravitinoVirtualFileSystemUtils.createClient(configuration); FilesetCatalog filesetCatalog = client.loadCatalog(catalog).asFilesetCatalog(); Fileset fileset = filesetCatalog.loadFileset(NameIdentifier.of(idents[2], idents[3])); @@ -103,17 +98,18 @@ private void refresh() { return; } - Map credentialMap = credential.toProperties(); - - String accessKeyId = credentialMap.get(GRAVITINO_OSS_SESSION_ACCESS_KEY_ID); - String secretAccessKey = credentialMap.get(GRAVITINO_OSS_SESSION_SECRET_ACCESS_KEY); - - if (OSSTokenCredential.OSS_TOKEN_CREDENTIAL_TYPE.equals( - credentialMap.get(Credential.CREDENTIAL_TYPE))) { - String sessionToken = credentialMap.get(GRAVITINO_OSS_TOKEN); - basicCredentials = new BasicCredentials(accessKeyId, secretAccessKey, sessionToken); - } else { - basicCredentials = new DefaultCredentials(accessKeyId, secretAccessKey); + if (credential instanceof OSSSecretKeyCredential) { + OSSSecretKeyCredential ossSecretKeyCredential = (OSSSecretKeyCredential) credential; + basicCredentials = + new DefaultCredentials( + ossSecretKeyCredential.accessKeyId(), ossSecretKeyCredential.secretAccessKey()); + } else if (credential instanceof OSSTokenCredential) { + OSSTokenCredential ossTokenCredential = (OSSTokenCredential) credential; + basicCredentials = + new BasicCredentials( + ossTokenCredential.accessKeyId(), + ossTokenCredential.secretAccessKey(), + ossTokenCredential.securityToken()); } if (credential.expireTimeInMs() > 0) { diff --git a/bundles/aws/build.gradle.kts b/bundles/aws/build.gradle.kts index ac31bd284bd..dd34406c9fb 100644 --- a/bundles/aws/build.gradle.kts +++ b/bundles/aws/build.gradle.kts @@ -29,7 +29,7 @@ dependencies { compileOnly(project(":catalogs:catalog-common")) compileOnly(project(":catalogs:catalog-hadoop")) compileOnly(project(":core")) - compileOnly(project(":clients:client-java-runtime", configuration = "shadow")) + compileOnly(project(":clients:client-java")) compileOnly(libs.hadoop3.aws) compileOnly(libs.hadoop3.client.api) compileOnly(libs.hadoop3.client.runtime) diff --git a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialsProvider.java b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialsProvider.java index 5db9aac808e..c021329dbf3 100644 --- a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialsProvider.java +++ b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialsProvider.java @@ -19,16 +19,11 @@ package org.apache.gravitino.s3.fs; -import static org.apache.gravitino.credential.S3TokenCredential.GRAVITINO_S3_SESSION_ACCESS_KEY_ID; -import static org.apache.gravitino.credential.S3TokenCredential.GRAVITINO_S3_SESSION_SECRET_ACCESS_KEY; -import static org.apache.gravitino.credential.S3TokenCredential.GRAVITINO_S3_TOKEN; - import com.amazonaws.auth.AWSCredentials; import com.amazonaws.auth.AWSCredentialsProvider; import com.amazonaws.auth.BasicAWSCredentials; import 
com.amazonaws.auth.BasicSessionCredentials; import java.net.URI; -import java.util.Map; import org.apache.gravitino.NameIdentifier; import org.apache.gravitino.client.GravitinoClient; import org.apache.gravitino.credential.Credential; @@ -62,7 +57,7 @@ public S3CredentialsProvider(final URI uri, final Configuration conf) { @Override public AWSCredentials getCredentials() { - // Refresh credentials if they are null or about to expire in 5 minutes + // Refresh credentials if they are null or about to expire. if (basicSessionCredentials == null || System.currentTimeMillis() >= expirationTime) { synchronized (this) { try { @@ -84,7 +79,7 @@ public void refresh() { String[] idents = filesetIdentifier.split("\\."); String catalog = idents[1]; - this.client = GravitinoVirtualFileSystemUtils.createClient(configuration); + client = GravitinoVirtualFileSystemUtils.createClient(configuration); FilesetCatalog filesetCatalog = client.loadCatalog(catalog).asFilesetCatalog(); Fileset fileset = filesetCatalog.loadFileset(NameIdentifier.of(idents[2], idents[3])); @@ -101,18 +96,18 @@ public void refresh() { return; } - Map credentialMap = credential.toProperties(); - - String accessKeyId = credentialMap.get(GRAVITINO_S3_SESSION_ACCESS_KEY_ID); - String secretAccessKey = credentialMap.get(GRAVITINO_S3_SESSION_SECRET_ACCESS_KEY); - - if (S3TokenCredential.S3_TOKEN_CREDENTIAL_TYPE.equals( - credentialMap.get(Credential.CREDENTIAL_TYPE))) { - String sessionToken = credentialMap.get(GRAVITINO_S3_TOKEN); + if (credential instanceof S3SecretKeyCredential) { + S3SecretKeyCredential s3SecretKeyCredential = (S3SecretKeyCredential) credential; + basicSessionCredentials = + new BasicAWSCredentials( + s3SecretKeyCredential.accessKeyId(), s3SecretKeyCredential.secretAccessKey()); + } else if (credential instanceof S3TokenCredential) { + S3TokenCredential s3TokenCredential = (S3TokenCredential) credential; basicSessionCredentials = - new BasicSessionCredentials(accessKeyId, secretAccessKey, sessionToken); - } else { - basicSessionCredentials = new BasicAWSCredentials(accessKeyId, secretAccessKey); + new BasicSessionCredentials( + s3TokenCredential.accessKeyId(), + s3TokenCredential.secretAccessKey(), + s3TokenCredential.sessionToken()); } if (credential.expireTimeInMs() > 0) { diff --git a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java index f4972d12f3d..c60f12c3122 100644 --- a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java +++ b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java @@ -70,7 +70,8 @@ public FileSystem getFileSystem(Path path, Map config) throws IO // Only call from GVFS client will have this key and support GravitinoS3CredentialProvider as // the file system provider will be used by GVFS client and Gravitino server, only GVFS client // will have this key. 
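// In other words, the presence of the Gravitino server URI key in the configuration acts as
// a marker that this file system instance is being created by the GVFS client, not by the
// Gravitino server itself.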
- if (config.containsKey(GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_SERVER_URI_KEY)) { + if (hadoopConfMap.containsKey( + GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_SERVER_URI_KEY)) { configuration.set( Constants.AWS_CREDENTIALS_PROVIDER, S3CredentialsProvider.class.getCanonicalName()); } diff --git a/bundles/azure/build.gradle.kts b/bundles/azure/build.gradle.kts index 9a760132445..34890989a48 100644 --- a/bundles/azure/build.gradle.kts +++ b/bundles/azure/build.gradle.kts @@ -28,7 +28,7 @@ dependencies { compileOnly(project(":api")) compileOnly(project(":catalogs:catalog-hadoop")) compileOnly(project(":core")) - compileOnly(project(":clients:client-java-runtime", configuration = "shadow")) + compileOnly(project(":clients:client-java")) compileOnly(libs.hadoop3.abs) compileOnly(libs.hadoop3.client.api) compileOnly(libs.hadoop3.client.runtime) diff --git a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java index 64c269fcce6..a837ff2bc50 100644 --- a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java +++ b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java @@ -74,7 +74,7 @@ public FileSystem getFileSystem(@Nonnull Path path, @Nonnull Map hadoopConfMap.forEach(configuration::set); - // Check whether this is from GVFS client. + // This is a workaround to judge whether it's from a Gravitino GVFS client. if (config.containsKey(GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_SERVER_URI_KEY)) { // Test whether SAS works try { diff --git a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java index 8bb1f025556..91a3fab4744 100644 --- a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java +++ b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java @@ -20,14 +20,11 @@ package org.apache.gravitino.abs.fs; import static org.apache.gravitino.credential.ADLSTokenCredential.ADLS_TOKEN_CREDENTIAL_TYPE; -import static org.apache.gravitino.credential.ADLSTokenCredential.GRAVITINO_ADLS_SAS_TOKEN; -import static org.apache.gravitino.credential.AzureAccountKeyCredential.GRAVITINO_AZURE_STORAGE_ACCOUNT_KEY; -import static org.apache.gravitino.credential.AzureAccountKeyCredential.GRAVITINO_AZURE_STORAGE_ACCOUNT_NAME; import java.io.IOException; -import java.util.Map; import org.apache.gravitino.NameIdentifier; import org.apache.gravitino.client.GravitinoClient; +import org.apache.gravitino.credential.ADLSTokenCredential; import org.apache.gravitino.credential.AzureAccountKeyCredential; import org.apache.gravitino.credential.Credential; import org.apache.gravitino.file.Fileset; @@ -85,7 +82,7 @@ public void initialize(Configuration conf, String accountName) throws IOExceptio @Override public String getSASToken(String account, String fileSystem, String path, String operation) { - // Refresh credentials if they are null or about to expire in 5 minutes + // Refresh credentials if they are null or about to expire. 
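// The actual refresh below runs under the object monitor, so concurrent callers do not each
// fetch a new token; a successful refresh advances expirationTime, letting later calls
// return the cached SAS token on the fast path.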
if (sasToken == null || System.currentTimeMillis() >= expirationTime) { synchronized (this) { try { @@ -116,12 +113,13 @@ private void refresh() { return; } - Map credentialMap = credential.toProperties(); - if (ADLS_TOKEN_CREDENTIAL_TYPE.equals(credentialMap.get(Credential.CREDENTIAL_TYPE))) { - sasToken = credentialMap.get(GRAVITINO_ADLS_SAS_TOKEN); - } else { - azureStorageAccountName = credentialMap.get(GRAVITINO_AZURE_STORAGE_ACCOUNT_NAME); - azureStorageAccountKey = credentialMap.get(GRAVITINO_AZURE_STORAGE_ACCOUNT_KEY); + if (credential instanceof ADLSTokenCredential) { + ADLSTokenCredential adlsTokenCredential = (ADLSTokenCredential) credential; + sasToken = adlsTokenCredential.sasToken(); + } else if (credential instanceof AzureAccountKeyCredential) { + AzureAccountKeyCredential azureAccountKeyCredential = (AzureAccountKeyCredential) credential; + azureStorageAccountName = azureAccountKeyCredential.accountName(); + azureStorageAccountKey = azureAccountKeyCredential.accountKey(); } if (credential.expireTimeInMs() > 0) { diff --git a/bundles/gcp/build.gradle.kts b/bundles/gcp/build.gradle.kts index 2e33922e370..85adbfe0725 100644 --- a/bundles/gcp/build.gradle.kts +++ b/bundles/gcp/build.gradle.kts @@ -29,11 +29,12 @@ dependencies { compileOnly(project(":catalogs:catalog-common")) compileOnly(project(":catalogs:catalog-hadoop")) compileOnly(project(":core")) - compileOnly(project(":clients:client-java-runtime", configuration = "shadow")) + compileOnly(project(":clients:client-java")) compileOnly(libs.hadoop3.client.api) compileOnly(libs.hadoop3.client.runtime) compileOnly(libs.hadoop3.gcs) + compileOnly(libs.slf4j.api) implementation(project(":catalogs:catalog-common")) { exclude("*") diff --git a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java index 958e75307a2..64cb899f729 100644 --- a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java +++ b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java @@ -22,7 +22,6 @@ import com.google.cloud.hadoop.util.AccessTokenProvider; import java.io.IOException; import java.util.Arrays; -import java.util.Map; import java.util.Optional; import org.apache.gravitino.NameIdentifier; import org.apache.gravitino.client.GravitinoClient; @@ -64,7 +63,6 @@ public AccessToken getAccessToken() { @Override public void refresh() throws IOException { - // Refresh credentials if they are null or about to expire in 5 minutes // The format of filesetIdentifier is "metalake.catalog.fileset.schema" String[] idents = filesetIdentifier.split("\\."); String catalog = idents[1]; @@ -83,12 +81,9 @@ public void refresh() throws IOException { return; } - Map credentialMap = credential.toProperties(); - - if (GCSTokenCredential.GCS_TOKEN_CREDENTIAL_TYPE.equals( - credentialMap.get(Credential.CREDENTIAL_TYPE))) { - String sessionToken = credentialMap.get(GCSTokenCredential.GCS_TOKEN_NAME); - accessToken = new AccessToken(sessionToken, credential.expireTimeInMs()); + if (credential instanceof GCSTokenCredential) { + GCSTokenCredential gcsTokenCredential = (GCSTokenCredential) credential; + accessToken = new AccessToken(gcsTokenCredential.token(), credential.expireTimeInMs()); if (credential.expireTimeInMs() > 0) { expirationTime = diff --git a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java 
b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java
index f0bf792fe18..e55d47a15d6 100644
--- a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java
+++ b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java
@@ -34,6 +34,7 @@ public class GCSFileSystemProvider implements FileSystemProvider {
 private static final String GCS_SERVICE_ACCOUNT_JSON_FILE = "fs.gs.auth.service.account.json.keyfile";
+ private static final String GCS_TOKEN_PROVIDER_IMPL = "fs.gs.auth.access.token.provider.impl";

 @VisibleForTesting
 public static final Map GRAVITINO_KEY_TO_GCS_HADOOP_KEY =
@@ -46,14 +47,14 @@ public FileSystem getFileSystem(Path path, Map config) throws IO
 FileSystemUtils.toHadoopConfigMap(config, GRAVITINO_KEY_TO_GCS_HADOOP_KEY)
 .forEach(configuration::set);

+ // This is a workaround to judge whether it's from a Gravitino GVFS client.
 if (config.containsKey(GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_SERVER_URI_KEY)) {
 AccessTokenProvider accessTokenProvider = new GCSCredentialsProvider();
 accessTokenProvider.setConf(configuration);
- // Why is this check necessary?, if Gravitino fails to get any credentials, we fall back to
+ // Why is this check necessary? If Gravitino fails to get any credentials, we fall back to
 // the default behavior of the GoogleHadoopFileSystem to use service account credentials.
 if (accessTokenProvider.getAccessToken() != null) {
- configuration.set(
- "fs.gs.auth.access.token.provider.impl", GCSCredentialsProvider.class.getName());
+ configuration.set(GCS_TOKEN_PROVIDER_IMPL, GCSCredentialsProvider.class.getName());
 }
 }

From 3bb925268f50060154fbaed7480e4ff7e358f792 Mon Sep 17 00:00:00 2001
From: yuqi
Date: Mon, 6 Jan 2025 20:35:27 +0800
Subject: [PATCH 32/59] fix

---
 bundles/aliyun/build.gradle.kts | 4 ++--
 bundles/aws/build.gradle.kts | 4 ++--
 bundles/azure/build.gradle.kts | 5 ++---
 bundles/gcp/build.gradle.kts | 4 ++--
 4 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/bundles/aliyun/build.gradle.kts b/bundles/aliyun/build.gradle.kts
index d27e055d261..64fdf9ee1a8 100644
--- a/bundles/aliyun/build.gradle.kts
+++ b/bundles/aliyun/build.gradle.kts
@@ -37,10 +37,10 @@ dependencies {
 implementation(project(":catalogs:catalog-common")) {
 exclude("*")
 }
- implementation(project(":catalogs:hadoop-common")) {
+ implementation(project(":clients:filesystem-hadoop3-common")) {
 exclude("*")
 }
- implementation(project(":clients:filesystem-hadoop3-common")) {
+ implementation(project(":catalogs:hadoop-common")) {
 exclude("*")
 }

diff --git a/bundles/aws/build.gradle.kts b/bundles/aws/build.gradle.kts
index dd34406c9fb..3c9f604eb74 100644
--- a/bundles/aws/build.gradle.kts
+++ b/bundles/aws/build.gradle.kts
@@ -37,10 +37,10 @@ dependencies {
 implementation(project(":catalogs:catalog-common")) {
 exclude("*")
 }
- implementation(project(":catalogs:hadoop-common")) {
+ implementation(project(":clients:filesystem-hadoop3-common")) {
 exclude("*")
 }
- implementation(project(":clients:filesystem-hadoop3-common")) {
+ implementation(project(":catalogs:hadoop-common")) {
 exclude("*")
 }

diff --git a/bundles/azure/build.gradle.kts b/bundles/azure/build.gradle.kts
index 34890989a48..74eb55d3c7a 100644
--- a/bundles/azure/build.gradle.kts
+++ b/bundles/azure/build.gradle.kts
@@ -36,11 +36,10 @@ dependencies {
 implementation(project(":catalogs:catalog-common")) {
 exclude("*")
 }
- implementation(project(":catalogs:hadoop-common")) {
+ implementation(project(":clients:filesystem-hadoop3-common"))
{ exclude("*") } - - implementation(project(":clients:filesystem-hadoop3-common")) { + implementation(project(":catalogs:hadoop-common")) { exclude("*") } diff --git a/bundles/gcp/build.gradle.kts b/bundles/gcp/build.gradle.kts index 85adbfe0725..aa98e74c39d 100644 --- a/bundles/gcp/build.gradle.kts +++ b/bundles/gcp/build.gradle.kts @@ -39,10 +39,10 @@ dependencies { implementation(project(":catalogs:catalog-common")) { exclude("*") } - implementation(project(":catalogs:hadoop-common")) { + implementation(project(":clients:filesystem-hadoop3-common")) { exclude("*") } - implementation(project(":clients:filesystem-hadoop3-common")) { + implementation(project(":catalogs:hadoop-common")) { exclude("*") } From 614464fec6816db8b389a2ccae87dd7d648ac554 Mon Sep 17 00:00:00 2001 From: yuqi Date: Mon, 6 Jan 2025 21:45:48 +0800 Subject: [PATCH 33/59] Optimize --- .../oss/fs/OSSCredentialsProvider.java | 6 ++---- .../gravitino/s3/fs/S3CredentialsProvider.java | 4 ++-- .../abs/fs/AzureSasCredentialsProvider.java | 8 ++------ .../gcs/fs/GCSCredentialsProvider.java | 18 ++++++------------ 4 files changed, 12 insertions(+), 24 deletions(-) diff --git a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialsProvider.java b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialsProvider.java index 79b4862336a..96d8aeb5755 100644 --- a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialsProvider.java +++ b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialsProvider.java @@ -131,16 +131,14 @@ private void refresh() { private Credential getCredential(Credential[] credentials) { // Use dynamic credential if found. for (Credential credential : credentials) { - if (credential.credentialType().equals(OSSTokenCredential.OSS_TOKEN_CREDENTIAL_TYPE)) { + if (credential instanceof OSSTokenCredential) { return credential; } } // If dynamic credential not found, use the static one for (Credential credential : credentials) { - if (credential - .credentialType() - .equals(OSSSecretKeyCredential.OSS_SECRET_KEY_CREDENTIAL_TYPE)) { + if (credential instanceof OSSSecretKeyCredential) { return credential; } } diff --git a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialsProvider.java b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialsProvider.java index c021329dbf3..b7b3494e1e3 100644 --- a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialsProvider.java +++ b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialsProvider.java @@ -129,14 +129,14 @@ public void refresh() { private Credential getCredential(Credential[] credentials) { // Use dynamic credential if found. 
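// A short-lived token credential takes precedence over the long-lived static
// secret-key credential when the server returns both kinds.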
for (Credential credential : credentials) { - if (credential.credentialType().equals(S3TokenCredential.S3_TOKEN_CREDENTIAL_TYPE)) { + if (credential instanceof S3TokenCredential) { return credential; } } // If dynamic credential not found, use the static one for (Credential credential : credentials) { - if (credential.credentialType().equals(S3SecretKeyCredential.S3_SECRET_KEY_CREDENTIAL_TYPE)) { + if (credential instanceof S3SecretKeyCredential) { return credential; } } diff --git a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java index 91a3fab4744..11d9ded6cc1 100644 --- a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java +++ b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java @@ -19,8 +19,6 @@ package org.apache.gravitino.abs.fs; -import static org.apache.gravitino.credential.ADLSTokenCredential.ADLS_TOKEN_CREDENTIAL_TYPE; - import java.io.IOException; import org.apache.gravitino.NameIdentifier; import org.apache.gravitino.client.GravitinoClient; @@ -141,16 +139,14 @@ private void refresh() { private Credential getCredential(Credential[] credentials) { // Use dynamic credential if found. for (Credential credential : credentials) { - if (credential.credentialType().equals(ADLS_TOKEN_CREDENTIAL_TYPE)) { + if (credential instanceof ADLSTokenCredential) { return credential; } } // If dynamic credential not found, use the static one for (Credential credential : credentials) { - if (credential - .credentialType() - .equals(AzureAccountKeyCredential.AZURE_ACCOUNT_KEY_CREDENTIAL_TYPE)) { + if (credential instanceof AzureAccountKeyCredential) { return credential; } } diff --git a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java index 807cfb7e812..ea238b7e577 100644 --- a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java +++ b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java @@ -21,8 +21,6 @@ import com.google.cloud.hadoop.util.AccessTokenProvider; import java.io.IOException; -import java.util.Arrays; -import java.util.Optional; import org.apache.gravitino.NameIdentifier; import org.apache.gravitino.client.GravitinoClient; import org.apache.gravitino.credential.Credential; @@ -115,16 +113,12 @@ public Configuration getConf() { * @return An credential. */ private Credential getCredential(Credential[] credentials) { - // Use dynamic credential if found. 
- Optional optionalCredential =
- Arrays.stream(credentials)
- .filter(
- credential ->
- credential
- .credentialType()
- .equals(GCSTokenCredential.GCS_TOKEN_CREDENTIAL_TYPE))
- .findFirst();
+ for (Credential credential : credentials) {
+ if (credential instanceof GCSTokenCredential) {
+ return credential;
+ }
+ }

- return optionalCredential.orElse(null);
+ return null;
 }
}

From e481c8d0217cfd990b8cae75e0ab52eb2ef73524 Mon Sep 17 00:00:00 2001
From: yuqi
Date: Mon, 6 Jan 2025 21:52:34 +0800
Subject: [PATCH 34/59] fix

---
 docs/hadoop-catalog-with-adls.md | 8 ++++----
 docs/hadoop-catalog-with-gcs.md | 8 ++++----
 docs/hadoop-catalog-with-oss.md | 8 ++++----
 docs/hadoop-catalog-with-s3.md | 8 ++++----
 4 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/docs/hadoop-catalog-with-adls.md b/docs/hadoop-catalog-with-adls.md
index 679a3d631b0..5f77153f6dc 100644
--- a/docs/hadoop-catalog-with-adls.md
+++ b/docs/hadoop-catalog-with-adls.md
@@ -17,11 +17,11 @@ at `${HADOOP_HOME}/share/hadoop/common/lib/`. After that, start Gravitino server
$ bin/gravitino-server.sh start
```

-## Create a Hadoop Catalog with ADLS in Gravitino
+## Create a Hadoop Catalog with ADLS

The rest of this document shows how to use the Hadoop catalog with ADLS in Gravitino with a full example.

-### Catalog a Hadoop catalog with ADLS
+### Catalog an ADLS Hadoop catalog

Apart from configuration method in [Hadoop-catalog-catalog-configuration](./hadoop-catalog.md#catalog-properties), the following properties are required to configure a Hadoop catalog with ADLS:

@@ -337,8 +337,8 @@ The following are examples of how to use the `hadoop fs` command to access the f

2. Copy the necessary jars to the `${HADOOP_HOME}/share/hadoop/common/lib` directory.

-Copy the corresponding jars to the `${HADOOP_HOME}/share/hadoop/common/lib` directory. For ADLS, you need to copy `gravitino-azure-{version}.jar` to the `${HADOOP_HOME}/share/hadoop/common/lib` directory.
-then copy `hadoop-azure-${version}.jar` and related dependencies to the `${HADOOP_HOME}/share/hadoop/tools/lib/` directory. Those jars can be found in the `${HADOOP_HOME}/share/hadoop/tools/lib/` directory, for simple you can add all the jars in the `${HADOOP_HOME}/share/hadoop/tools/lib/` directory to the `${HADOOP_HOME}/share/hadoop/common/lib` directory.
+For ADLS, you need to copy `gravitino-azure-{version}.jar` to the `${HADOOP_HOME}/share/hadoop/common/lib` directory,
+then copy `hadoop-azure-${version}.jar` and related dependencies to the `${HADOOP_HOME}/share/hadoop/common/lib/` directory. Those jars can be found in the `${HADOOP_HOME}/share/hadoop/tools/lib/` directory; for simplicity, you can add all the jars in the `${HADOOP_HOME}/share/hadoop/tools/lib/` directory to the `${HADOOP_HOME}/share/hadoop/common/lib` directory.

3. Run the following command to access the fileset:

diff --git a/docs/hadoop-catalog-with-gcs.md b/docs/hadoop-catalog-with-gcs.md
index 0e409bc4016..640ee5ee098 100644
--- a/docs/hadoop-catalog-with-gcs.md
+++ b/docs/hadoop-catalog-with-gcs.md
@@ -17,12 +17,12 @@ at `${HADOOP_HOME}/share/hadoop/common/lib/`. After that, start Gravitino server
$ bin/gravitino-server.sh start
```

-## Create a Hadoop Catalog with GCS in Gravitino
+## Create a Hadoop Catalog with GCS

The rest of this document shows how to use the Hadoop catalog with GCS in Gravitino with a full example.
-### Catalog a Hadoop catalog with GCS
+### Catalog a GCS Hadoop catalog

Apart from configuration method in [Hadoop-catalog-catalog-configuration](./hadoop-catalog.md#catalog-properties), the following properties are required to configure a Hadoop catalog with GCS:

@@ -326,8 +326,8 @@ The following are examples of how to use the `hadoop fs` command to access the f

2. Copy the necessary jars to the `${HADOOP_HOME}/share/hadoop/common/lib` directory.

-Copy the corresponding jars to the `${HADOOP_HOME}/share/hadoop/common/lib` directory. For GCS, you need to copy `gravitino-gcp-{version}.jar` to the `${HADOOP_HOME}/share/hadoop/common/lib` directory.
-then copy `hadoop-gcp-${version}.jar` and related dependencies to the `${HADOOP_HOME}/share/hadoop/tools/lib/` directory. Those jars can be found in the `${HADOOP_HOME}/share/hadoop/tools/lib/` directory, for simple you can add all the jars in the `${HADOOP_HOME}/share/hadoop/tools/lib/` directory to the `${HADOOP_HOME}/share/hadoop/common/lib` directory.
+For GCS, you need to copy `gravitino-gcp-{version}.jar` to the `${HADOOP_HOME}/share/hadoop/common/lib` directory.
+Then copy `hadoop-gcp-${version}.jar` and other possible dependencies to the `${HADOOP_HOME}/share/hadoop/common/lib/` directory. Those jars can be found in the `${HADOOP_HOME}/share/hadoop/tools/lib/` directory; for simplicity, you can add all the jars in the `${HADOOP_HOME}/share/hadoop/tools/lib/` directory to the `${HADOOP_HOME}/share/hadoop/common/lib` directory.

3. Run the following command to access the fileset:

diff --git a/docs/hadoop-catalog-with-oss.md b/docs/hadoop-catalog-with-oss.md
index 5b118836e95..a61c90600a3 100644
--- a/docs/hadoop-catalog-with-oss.md
+++ b/docs/hadoop-catalog-with-oss.md
@@ -17,9 +17,9 @@ at `${HADOOP_HOME}/share/hadoop/common/lib/`. After that, start Gravitino server
$ bin/gravitino-server.sh start
```

-## Create a Hadoop Catalog with OSS in Gravitino
+## Create a Hadoop Catalog with OSS

-### Catalog a catalog
+### Catalog an OSS Hadoop catalog

Apart from configuration method in [Hadoop-catalog-catalog-configuration](./hadoop-catalog.md#catalog-properties), the following properties are required to configure a Hadoop catalog with OSS:

@@ -348,8 +348,8 @@ The following are examples of how to use the `hadoop fs` command to access the f

2. Copy the necessary jars to the `${HADOOP_HOME}/share/hadoop/common/lib` directory.

-Copy the corresponding jars to the `${HADOOP_HOME}/share/hadoop/common/lib` directory. For OSS, you need to copy `gravitino-aliyun-{version}.jar` to the `${HADOOP_HOME}/share/hadoop/common/lib` directory.
-then copy hadoop-aliyun-{version}.jar and related dependencies to the `${HADOOP_HOME}/share/hadoop/tools/lib/` directory. Those jars can be found in the `${HADOOP_HOME}/share/hadoop/tools/lib/` directory, for simple you can add all the jars in the `${HADOOP_HOME}/share/hadoop/tools/lib/` directory to the `${HADOOP_HOME}/share/hadoop/common/lib` directory.
+For OSS, you need to copy `gravitino-aliyun-{version}.jar` to the `${HADOOP_HOME}/share/hadoop/common/lib` directory,
+then copy `hadoop-aliyun-{version}.jar` and related dependencies to the `${HADOOP_HOME}/share/hadoop/common/lib/` directory. Those jars can be found in the `${HADOOP_HOME}/share/hadoop/tools/lib/` directory; for simplicity, you can add all the jars in the `${HADOOP_HOME}/share/hadoop/tools/lib/` directory to the `${HADOOP_HOME}/share/hadoop/common/lib` directory.

3.
Run the following command to access the fileset:

diff --git a/docs/hadoop-catalog-with-s3.md b/docs/hadoop-catalog-with-s3.md
index b7cc30f26c5..b1a724f1f2f 100644
--- a/docs/hadoop-catalog-with-s3.md
+++ b/docs/hadoop-catalog-with-s3.md
@@ -17,9 +17,9 @@ at `${HADOOP_HOME}/share/hadoop/common/lib/`. After that, start Gravitino server
$ bin/gravitino-server.sh start
```

-## Create a Hadoop Catalog with S3 in Gravitino
+## Create a Hadoop Catalog with S3

-### Catalog a Hadoop catalog with S3
+### Catalog an S3 Hadoop catalog

Apart from configuration method in [Hadoop-catalog-catalog-configuration](./hadoop-catalog.md#catalog-properties), the following properties are required to configure a Hadoop catalog with S3:

@@ -355,8 +355,8 @@ The following are examples of how to use the `hadoop fs` command to access the f

2. Copy the necessary jars to the `${HADOOP_HOME}/share/hadoop/common/lib` directory.

-Copy the corresponding jars to the `${HADOOP_HOME}/share/hadoop/common/lib` directory. For S3, you need to copy `gravitino-aws-{version}.jar` to the `${HADOOP_HOME}/share/hadoop/common/lib` directory.
-then copy hadoop-aws-{version}.jar and related dependencies to the `${HADOOP_HOME}/share/hadoop/tools/lib/` directory. Those jars can be found in the `${HADOOP_HOME}/share/hadoop/tools/lib/` directory, for simple you can add all the jars in the `${HADOOP_HOME}/share/hadoop/tools/lib/` directory to the `${HADOOP_HOME}/share/hadoop/common/lib` directory.
+For S3, you need to copy `gravitino-aws-{version}.jar` to the `${HADOOP_HOME}/share/hadoop/common/lib` directory,
+then copy `hadoop-aws-{version}.jar` and related dependencies to the `${HADOOP_HOME}/share/hadoop/common/lib/` directory. Those jars can be found in the `${HADOOP_HOME}/share/hadoop/tools/lib/` directory; for simplicity, you can add all the jars in the `${HADOOP_HOME}/share/hadoop/tools/lib/` directory to the `${HADOOP_HOME}/share/hadoop/common/lib` directory.

3.
Run the following command to access the fileset: From 0a97fc7a07e5c424dabc46769ba72bea86f374a7 Mon Sep 17 00:00:00 2001 From: yuqi Date: Mon, 6 Jan 2025 21:59:09 +0800 Subject: [PATCH 35/59] fix --- docs/hadoop-catalog-with-adls.md | 28 ++++++++++++++-------------- docs/hadoop-catalog-with-gcs.md | 30 +++++++++++++++--------------- docs/hadoop-catalog-with-oss.md | 14 +++++++------- docs/hadoop-catalog-with-s3.md | 8 ++++---- 4 files changed, 40 insertions(+), 40 deletions(-) diff --git a/docs/hadoop-catalog-with-adls.md b/docs/hadoop-catalog-with-adls.md index 5f77153f6dc..3ef052da730 100644 --- a/docs/hadoop-catalog-with-adls.md +++ b/docs/hadoop-catalog-with-adls.md @@ -63,7 +63,7 @@ curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ "azure-storage-account-key": "The account key of the Azure Blob Storage", "filesystem-providers": "abs" } -}' http://localhost:8090/api/metalakes/metalake/catalogs +}' ${GRAVITINO_SERVER_IP:PORT}/api/metalakes/metalake/catalogs ``` @@ -71,7 +71,7 @@ curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ ```java GravitinoClient gravitinoClient = GravitinoClient - .builder("http://localhost:8090") + .builder("${GRAVITINO_SERVER_IP:PORT}") .withMetalake("metalake") .build(); @@ -95,7 +95,7 @@ Catalog adlsCatalog = gravitinoClient.createCatalog("example_catalog", ```python -gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake") +gravitino_client: GravitinoClient = GravitinoClient(uri="${GRAVITINO_SERVER_IP:PORT}", metalake_name="metalake") adls_properties = { "location": "abfss://container@account-name.dfs.core.windows.net/path", "azure_storage_account_name": "azure storage account name", @@ -128,7 +128,7 @@ curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ "properties": { "location": "abfss://container@account-name.dfs.core.windows.net/path" } -}' http://localhost:8090/api/metalakes/metalake/catalogs/test_catalog/schemas +}' ${GRAVITINO_SERVER_IP:PORT}/api/metalakes/metalake/catalogs/test_catalog/schemas ``` @@ -176,7 +176,7 @@ curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ "properties": { "k1": "v1" } -}' http://localhost:8090/api/metalakes/metalake/catalogs/test_catalog/schemas/test_schema/filesets +}' ${GRAVITINO_SERVER_IP:PORT}/api/metalakes/metalake/catalogs/test_catalog/schemas/test_schema/filesets ``` @@ -184,7 +184,7 @@ curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ ```java GravitinoClient gravitinoClient = GravitinoClient - .builder("http://localhost:8090") + .builder("${GRAVITINO_SERVER_IP:PORT}") .withMetalake("metalake") .build(); @@ -208,7 +208,7 @@ filesetCatalog.createFileset( ```python -gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake") +gravitino_client: GravitinoClient = GravitinoClient(uri="${GRAVITINO_SERVER_IP:PORT}", metalake_name="metalake") catalog: Catalog = gravitino_client.load_catalog(name="test_catalog") catalog.as_fileset_catalog().create_fileset(ident=NameIdentifier.of("test_schema", "example_fileset"), @@ -231,7 +231,7 @@ from gravitino import NameIdentifier, GravitinoClient, Catalog, Fileset, Graviti from pyspark.sql import SparkSession import os -gravitino_url = "http://localhost:8090" +gravitino_url = "${GRAVITINO_SERVER_IP:PORT}" metalake_name = "test" catalog_name = "your_adls_catalog" @@ -286,7 +286,7 @@ In some Spark versions, a Hadoop environment is needed by the driver, adding the Configuration conf = new Configuration(); 
conf.set("fs.AbstractFileSystem.gvfs.impl","org.apache.gravitino.filesystem.hadoop.Gvfs"); conf.set("fs.gvfs.impl","org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem"); -conf.set("fs.gravitino.server.uri","http://localhost:8090"); +conf.set("fs.gravitino.server.uri","${GRAVITINO_SERVER_URL}"); conf.set("fs.gravitino.client.metalake","test_metalake"); conf.set("azure-storage-account-name", "account_name_of_adls"); conf.set("azure-storage-account-key", "account_key_of_adls"); @@ -317,7 +317,7 @@ The following are examples of how to use the `hadoop fs` command to access the f fs.gravitino.server.uri - http://192.168.50.188:8090 + ${GRAVITINO_SERVER_IP:PORT} @@ -359,7 +359,7 @@ options = { "azure_storage_account_name": "azure_account_name", "azure_storage_account_key": "azure_account_key" } -fs = gvfs.GravitinoVirtualFileSystem(server_uri="http://localhost:8090", metalake_name="test_metalake", options=options) +fs = gvfs.GravitinoVirtualFileSystem(server_uri="${GRAVITINO_SERVER_IP:PORT}", metalake_name="test_metalake", options=options) fs.ls("gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/") ``` @@ -372,7 +372,7 @@ The following are examples of how to use the pandas library to access the ADLS f import pandas as pd storage_options = { - "server_uri": "http://localhost:8090", + "server_uri": "${GRAVITINO_SERVER_IP:PORT}", "metalake_name": "test", "options": { "azure_storage_account_name": "azure_account_name", @@ -404,7 +404,7 @@ GVFS Java client: Configuration conf = new Configuration(); conf.set("fs.AbstractFileSystem.gvfs.impl","org.apache.gravitino.filesystem.hadoop.Gvfs"); conf.set("fs.gvfs.impl","org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem"); -conf.set("fs.gravitino.server.uri","http://localhost:8090"); +conf.set("fs.gravitino.server.uri","${GRAVITINO_SERVER_IP:PORT}"); conf.set("fs.gravitino.client.metalake","test_metalake"); // No need to set azure-storage-account-name and azure-storage-account-name Path filesetPath = new Path("gvfs://fileset/adls_test_catalog/test_schema/test_fileset/new_dir"); @@ -420,7 +420,7 @@ spark = SparkSession.builder .appName("adls_fielset_test") .config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs") .config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem") - .config("spark.hadoop.fs.gravitino.server.uri", "http://localhost:8090") + .config("spark.hadoop.fs.gravitino.server.uri", "${GRAVITINO_SERVER_IP:PORT}") .config("spark.hadoop.fs.gravitino.client.metalake", "test") # No need to set azure-storage-account-name and azure-storage-account-name .config("spark.driver.memory", "2g") diff --git a/docs/hadoop-catalog-with-gcs.md b/docs/hadoop-catalog-with-gcs.md index 640ee5ee098..9578a952a50 100644 --- a/docs/hadoop-catalog-with-gcs.md +++ b/docs/hadoop-catalog-with-gcs.md @@ -42,7 +42,7 @@ Refer to [Fileset operation](./manage-fileset-metadata-using-gravitino.md#filese ## Using Hadoop catalog with GCS -### Create a Hadoop catalog/schema/file set with GCS +### Create a Hadoop catalog/schema/fileset with GCS First, you need to create a Hadoop catalog with GCS. 
The following example shows how to create a Hadoop catalog with GCS: @@ -61,7 +61,7 @@ curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ "gcs-service-account-file": "path_of_gcs_service_account_file", "filesystem-providers": "gcs" } -}' http://localhost:8090/api/metalakes/metalake/catalogs +}' ${GRAVITINO_SERVER_IP:PORT}/api/metalakes/metalake/catalogs ``` @@ -69,7 +69,7 @@ curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ ```java GravitinoClient gravitinoClient = GravitinoClient - .builder("http://localhost:8090") + .builder("${GRAVITINO_SERVER_IP:PORT}") .withMetalake("metalake") .build(); @@ -92,7 +92,7 @@ Catalog gcsCatalog = gravitinoClient.createCatalog("test_catalog", ```python -gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake") +gravitino_client: GravitinoClient = GravitinoClient(uri="${GRAVITINO_SERVER_IP:PORT}", metalake_name="metalake") gcs_properties = { "location": "gs://bucket/root", "gcs-service-account-file": "path_of_gcs_service_account_file" @@ -124,7 +124,7 @@ curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ "properties": { "location": "gs://bucket/root/schema" } -}' http://localhost:8090/api/metalakes/metalake/catalogs/test_catalog/schemas +}' ${GRAVITINO_SERVER_IP:PORT}/api/metalakes/metalake/catalogs/test_catalog/schemas ``` @@ -172,7 +172,7 @@ curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ "properties": { "k1": "v1" } -}' http://localhost:8090/api/metalakes/metalake/catalogs/test_catalog/schemas/test_schema/filesets +}' ${GRAVITINO_SERVER_IP:PORT}/api/metalakes/metalake/catalogs/test_catalog/schemas/test_schema/filesets ``` @@ -180,7 +180,7 @@ curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ ```java GravitinoClient gravitinoClient = GravitinoClient - .builder("http://localhost:8090") + .builder("${GRAVITINO_SERVER_IP:PORT}") .withMetalake("metalake") .build(); @@ -204,7 +204,7 @@ filesetCatalog.createFileset( ```python -gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake") +gravitino_client: GravitinoClient = GravitinoClient(uri="${GRAVITINO_SERVER_IP:PORT}", metalake_name="metalake") catalog: Catalog = gravitino_client.load_catalog(name="test_catalog") catalog.as_fileset_catalog().create_fileset(ident=NameIdentifier.of("test_schema", "example_fileset"), @@ -227,7 +227,7 @@ from gravitino import NameIdentifier, GravitinoClient, Catalog, Fileset, Graviti from pyspark.sql import SparkSession import os -gravitino_url = "http://localhost:8090" +gravitino_url = "${GRAVITINO_SERVER_IP:PORT}" metalake_name = "test" catalog_name = "your_gcs_catalog" @@ -280,7 +280,7 @@ In some Spark versions, a Hadoop environment is needed by the driver, adding the Configuration conf = new Configuration(); conf.set("fs.AbstractFileSystem.gvfs.impl","org.apache.gravitino.filesystem.hadoop.Gvfs"); conf.set("fs.gvfs.impl","org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem"); -conf.set("fs.gravitino.server.uri","http://localhost:8090"); +conf.set("fs.gravitino.server.uri","${GRAVITINO_SERVER_IP:PORT}"); conf.set("fs.gravitino.client.metalake","test_metalake"); conf.set("gcs-service-account-file", "/path/your-service-account-file.json"); Path filesetPath = new Path("gvfs://fileset/test_catalog/test_schema/test_fileset/new_dir"); @@ -310,7 +310,7 @@ The following are examples of how to use the `hadoop fs` command to access the f fs.gravitino.server.uri - http://192.168.50.188:8090 + ${GRAVITINO_SERVER_IP:PORT} 
@@ -348,7 +348,7 @@ options = {
 "auth_type": "simple",
 "gcs_service_account_file": "path_of_gcs_service_account_file.json",
}
-fs = gvfs.GravitinoVirtualFileSystem(server_uri="http://localhost:8090", metalake_name="test_metalake", options=options)
+fs = gvfs.GravitinoVirtualFileSystem(server_uri="${GRAVITINO_SERVER_IP:PORT}", metalake_name="test_metalake", options=options)

fs.ls("gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/")
```

@@ -360,7 +360,7 @@ The following are examples of how to use the pandas library to access the GCS fi
import pandas as pd

storage_options = {
- "server_uri": "http://localhost:8090",
+ "server_uri": "${GRAVITINO_SERVER_IP:PORT}",
 "metalake_name": "test",
 "options": {
 "gcs_service_account_file": "path_of_gcs_service_account_file.json",
@@ -391,7 +391,7 @@ GVFS Java client:
Configuration conf = new Configuration();
conf.set("fs.AbstractFileSystem.gvfs.impl","org.apache.gravitino.filesystem.hadoop.Gvfs");
conf.set("fs.gvfs.impl","org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem");
-conf.set("fs.gravitino.server.uri","http://localhost:8090");
+conf.set("fs.gravitino.server.uri","${GRAVITINO_SERVER_IP:PORT}");
conf.set("fs.gravitino.client.metalake","test_metalake");
// No need to set gcs-service-account-file
Path filesetPath = new Path("gvfs://fileset/gcs_test_catalog/test_schema/test_fileset/new_dir");
@@ -407,7 +407,7 @@ spark = SparkSession.builder
.appName("gcs_fileset_test")
.config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs")
.config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem")
- .config("spark.hadoop.fs.gravitino.server.uri", "http://localhost:8090")
+ .config("spark.hadoop.fs.gravitino.server.uri", "${GRAVITINO_SERVER_IP:PORT}")
.config("spark.hadoop.fs.gravitino.client.metalake", "test")
# No need to set gcs-service-account-file
.config("spark.driver.memory", "2g")
diff --git a/docs/hadoop-catalog-with-oss.md b/docs/hadoop-catalog-with-oss.md
index a61c90600a3..761501079d3 100644
--- a/docs/hadoop-catalog-with-oss.md
+++ b/docs/hadoop-catalog-with-oss.md
@@ -292,7 +292,7 @@ conf.set("fs.AbstractFileSystem.gvfs.impl","org.apache.gravitino.filesystem.hado
conf.set("fs.gvfs.impl","org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem");
conf.set("fs.gravitino.server.uri","http://localhost:8090");
conf.set("fs.gravitino.client.metalake","test_metalake");
-conf.set("oss-endpoint", "http://localhost:9000");
+conf.set("oss-endpoint", "${OSS_ENDPOINT}");
conf.set("oss-access-key-id", "minio");
conf.set("oss-secret-access-key", "minio123");
Path filesetPath = new Path("gvfs://fileset/test_catalog/test_schema/test_fileset/new_dir");
@@ -322,7 +322,7 @@ The following are examples of how to use the `hadoop fs` command to access the f

fs.gravitino.server.uri
- http://192.168.50.188:8090
+ ${GRAVITINO_SERVER_IP:PORT}

@@ -368,11 +368,11 @@ options = {
 "cache_size": 20,
 "cache_expired_time": 3600,
 "auth_type": "simple",
- "oss_endpoint": "http://localhost:9000",
+ "oss_endpoint": "${OSS_ENDPOINT}",
 "oss_access_key_id": "minio",
 "oss_secret_access_key": "minio123"
}
-fs = gvfs.GravitinoVirtualFileSystem(server_uri="http://localhost:8090", metalake_name="test_metalake", options=options)
+fs = gvfs.GravitinoVirtualFileSystem(server_uri="${GRAVITINO_SERVER_IP:PORT}", metalake_name="test_metalake", options=options)

fs.ls("gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/")
```

@@ -386,7 +386,7 @@ The
following are examples of how to use the pandas library to access the OSS fi
import pandas as pd

storage_options = {
- "server_uri": "http://localhost:8090",
+ "server_uri": "${GRAVITINO_SERVER_IP:PORT}",
 "metalake_name": "test",
 "options": {
 "oss_access_key_id": "access_key",
@@ -418,7 +418,7 @@ GVFS Java client:
Configuration conf = new Configuration();
conf.set("fs.AbstractFileSystem.gvfs.impl","org.apache.gravitino.filesystem.hadoop.Gvfs");
conf.set("fs.gvfs.impl","org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem");
-conf.set("fs.gravitino.server.uri","http://localhost:8090");
+conf.set("fs.gravitino.server.uri","${GRAVITINO_SERVER_IP:PORT}");
conf.set("fs.gravitino.client.metalake","test_metalake");
// No need to set oss-access-key-id and oss-secret-access-key
Path filesetPath = new Path("gvfs://fileset/oss_test_catalog/test_schema/test_fileset/new_dir");
@@ -434,7 +434,7 @@ spark = SparkSession.builder
.appName("oss_fielset_test")
.config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs")
.config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem")
- .config("spark.hadoop.fs.gravitino.server.uri", "http://localhost:8090")
+ .config("spark.hadoop.fs.gravitino.server.uri", "${GRAVITINO_SERVER_IP:PORT}")
.config("spark.hadoop.fs.gravitino.client.metalake", "test")
# No need to set oss-access-key-id and oss-secret-access-key
.config("spark.driver.memory", "2g")
diff --git a/docs/hadoop-catalog-with-s3.md b/docs/hadoop-catalog-with-s3.md
index b1a724f1f2f..de3c3e7fc3d 100644
--- a/docs/hadoop-catalog-with-s3.md
+++ b/docs/hadoop-catalog-with-s3.md
@@ -295,10 +295,10 @@ In some Spark versions, a Hadoop environment is needed by the driver, adding the
Configuration conf = new Configuration();
conf.set("fs.AbstractFileSystem.gvfs.impl","org.apache.gravitino.filesystem.hadoop.Gvfs");
conf.set("fs.gvfs.impl","org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem");
-conf.set("fs.gravitino.server.uri","http://localhost:8090");
+conf.set("fs.gravitino.server.uri","${GRAVITINO_SERVER_IP:PORT}");
conf.set("fs.gravitino.client.metalake","test_metalake");

-conf.set("s3-endpoint", "http://localhost:9000");
+conf.set("s3-endpoint", "${S3_ENDPOINT}");
conf.set("s3-access-key-id", "minio");
conf.set("s3-secret-access-key", "minio123");

@@ -329,7 +329,7 @@ The following are examples of how to use the `hadoop fs` command to access the f

fs.gravitino.server.uri
- http://192.168.50.188:8090
+ ${GRAVITINO_SERVER_IP:PORT}

@@ -374,7 +374,7 @@ options = {
 "cache_size": 20,
 "cache_expired_time": 3600,
 "auth_type": "simple",
- "s3_endpoint": "http://localhost:9000",
+ "s3_endpoint": "${S3_ENDPOINT}",
 "s3_access_key_id": "minio",
 "s3_secret_access_key": "minio123"
}

From 7e74fc9c63a7f8adf42e40fa45ac7e787d9b6c7d Mon Sep 17 00:00:00 2001
From: yuqi
Date: Tue, 7 Jan 2025 14:47:37 +0800
Subject: [PATCH 36/59] optimize again.
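Defer Gravitino client creation from the provider constructors to refresh(), so a client
is only built when credentials actually have to be fetched. A minimal, self-contained
sketch of the pattern (illustrative names only; the real logic lives in the
*CredentialsProvider classes changed below, and GravitinoClient is stubbed as Object):

```java
// Sketch: lazy client creation combined with expiry-driven refresh.
public class LazyRefreshSketch {
  private Object client; // placeholder for GravitinoClient
  private String token; // placeholder for the cached credential
  private long expirationTime; // epoch millis after which a refresh is needed

  public synchronized String getToken() {
    if (token == null || System.currentTimeMillis() >= expirationTime) {
      refresh();
    }
    return token;
  }

  private void refresh() {
    if (client == null) {
      // Created on first use rather than in the constructor.
      client = new Object(); // placeholder for GravitinoVirtualFileSystemUtils.createClient(conf)
    }
    token = "token-" + System.nanoTime();
    expirationTime = System.currentTimeMillis() + 60_000L; // pretend one-minute lifetime
  }
}
```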
--- .../org/apache/gravitino/oss/fs/OSSCredentialsProvider.java | 1 - .../apache/gravitino/abs/fs/AzureSasCredentialsProvider.java | 2 +- .../org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialsProvider.java b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialsProvider.java index 79b4862336a..26243b59c02 100644 --- a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialsProvider.java +++ b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialsProvider.java @@ -52,7 +52,6 @@ public class OSSCredentialsProvider implements CredentialsProvider { public OSSCredentialsProvider(URI uri, Configuration conf) { this.filesetIdentifier = conf.get(GravitinoVirtualFileSystemConfiguration.GVFS_FILESET_IDENTIFIER); - this.client = GravitinoVirtualFileSystemUtils.createClient(conf); this.configuration = conf; } diff --git a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java index 91a3fab4744..0f14248030d 100644 --- a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java +++ b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java @@ -101,7 +101,7 @@ private void refresh() { String[] idents = filesetIdentifier.split("\\."); String catalog = idents[1]; - this.client = GravitinoVirtualFileSystemUtils.createClient(configuration); + client = GravitinoVirtualFileSystemUtils.createClient(configuration); FilesetCatalog filesetCatalog = client.loadCatalog(catalog).asFilesetCatalog(); Fileset fileset = filesetCatalog.loadFileset(NameIdentifier.of(idents[2], idents[3])); diff --git a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java index 807cfb7e812..0afa9c66d0b 100644 --- a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java +++ b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java @@ -67,7 +67,7 @@ public void refresh() throws IOException { String[] idents = filesetIdentifier.split("\\."); String catalog = idents[1]; - this.client = GravitinoVirtualFileSystemUtils.createClient(configuration); + client = GravitinoVirtualFileSystemUtils.createClient(configuration); FilesetCatalog filesetCatalog = client.loadCatalog(catalog).asFilesetCatalog(); Fileset fileset = filesetCatalog.loadFileset(NameIdentifier.of(idents[2], idents[3])); From 571c1e99abc0600e5df905adee19df438d7366d1 Mon Sep 17 00:00:00 2001 From: yuqi Date: Tue, 7 Jan 2025 18:20:06 +0800 Subject: [PATCH 37/59] The new framework of credential vending in fileset. 
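The GVFS client now advertises a GravitinoFileSystemCredentialProvider implementation
through the Hadoop configuration, and the cloud bundles load it reflectively instead of
talking to Gravitino directly. A rough sketch of the loading step, assuming the configured
class has a no-arg constructor (the key name matches the code below; error handling
simplified):

```java
import org.apache.gravitino.catalog.hadoop.fs.GravitinoFileSystemCredentialProvider;
import org.apache.gravitino.credential.Credential;
import org.apache.hadoop.conf.Configuration;

public final class ProviderLoadingSketch {
  // "fs.gvfs.credential.provider" is the configuration key introduced in this patch.
  static Credential[] vendCredentials(Configuration conf) throws Exception {
    GravitinoFileSystemCredentialProvider provider =
        (GravitinoFileSystemCredentialProvider)
            Class.forName(conf.get("fs.gvfs.credential.provider"))
                .getDeclaredConstructor()
                .newInstance();
    provider.setConf(conf); // hands over GVFS settings such as server URI and metalake
    return provider.getCredentials();
  }
}
```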
---
 .../oss/fs/OSSFileSystemProvider.java | 14 ++-
 .../oss/fs/TestOSSCredentialProvider.java | 92 +++++++++++++++++++
 catalogs/hadoop-common/build.gradle.kts | 1 +
 ...GravitinoFileSystemCredentialProvider.java | 28 ++++++
 ...GravitinoFileSystemCredentialProvider.java | 62 +++++++++++++
 .../hadoop/GravitinoVirtualFileSystem.java | 19 ++++
 6 files changed, 214 insertions(+), 2 deletions(-)
 create mode 100644 bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/TestOSSCredentialProvider.java
 create mode 100644 catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/GravitinoFileSystemCredentialProvider.java
 create mode 100644 clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/DefaultGravitinoFileSystemCredentialProvider.java

diff --git a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java
index 6f6db0ae663..12c6fd35c44 100644
--- a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java
+++ b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java
@@ -61,9 +61,15 @@ public FileSystem getFileSystem(Path path, Map config) throws IO
 hadoopConfMap.put(OSS_FILESYSTEM_IMPL, AliyunOSSFileSystem.class.getCanonicalName());
 }

- if (shouldSetCredentialsProviderExplicitly(config)) {
+ // if (shouldSetCredentialsProviderExplicitly(config)) {
+ // hadoopConfMap.put(
+ // Constants.CREDENTIALS_PROVIDER_KEY,
+ // OSSCredentialsProvider.class.getCanonicalName());
+ // }
+
+ if (enableCredentialProvidedByGravitino(config)) {
 hadoopConfMap.put(
- Constants.CREDENTIALS_PROVIDER_KEY, OSSCredentialsProvider.class.getCanonicalName());
+ Constants.CREDENTIALS_PROVIDER_KEY, TestOSSCredentialProvider.class.getCanonicalName());
 }

 hadoopConfMap.forEach(configuration::set);
@@ -71,6 +77,10 @@ public FileSystem getFileSystem(Path path, Map config) throws IO
 return AliyunOSSFileSystem.newInstance(path.toUri(), configuration);
 }

+ private boolean enableCredentialProvidedByGravitino(Map config) {
+ return null != config.get("fs.gvfs.credential.provider");
+ }
+
 /**
 * Check if the credential provider should be set explicitly.
 *
diff --git a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/TestOSSCredentialProvider.java b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/TestOSSCredentialProvider.java
new file mode 100644
index 00000000000..284341c0893
--- /dev/null
+++ b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/TestOSSCredentialProvider.java
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */ + +package org.apache.gravitino.oss.fs; + +import com.aliyun.oss.common.auth.BasicCredentials; +import com.aliyun.oss.common.auth.Credentials; +import com.aliyun.oss.common.auth.CredentialsProvider; +import com.aliyun.oss.common.auth.DefaultCredentials; +import java.net.URI; +import org.apache.gravitino.catalog.hadoop.fs.GravitinoFileSystemCredentialProvider; +import org.apache.gravitino.credential.Credential; +import org.apache.gravitino.credential.OSSSecretKeyCredential; +import org.apache.gravitino.credential.OSSTokenCredential; +import org.apache.hadoop.conf.Configuration; + +public class TestOSSCredentialProvider implements CredentialsProvider { + private GravitinoFileSystemCredentialProvider gravitinoFileSystemCredentialProvider; + private Credentials basicCredentials; + private long expirationTime = Long.MAX_VALUE; + private static final double EXPIRATION_TIME_FACTOR = 0.9D; + + public TestOSSCredentialProvider(URI uri, Configuration conf) { + try { + gravitinoFileSystemCredentialProvider = + (GravitinoFileSystemCredentialProvider) + Class.forName(conf.get("fs.gvfs.credential.provider")) + .getDeclaredConstructor() + .newInstance(); + gravitinoFileSystemCredentialProvider.setConf(conf); + } catch (Exception e) { + throw new RuntimeException("Failed to create GravitinoFileSystemCredentialProvider", e); + } + } + + @Override + public void setCredentials(Credentials credentials) {} + + @Override + public Credentials getCredentials() { + if (basicCredentials == null || System.currentTimeMillis() >= expirationTime) { + Credential[] gravitinoCredentials = gravitinoFileSystemCredentialProvider.getCredentials(); + if (gravitinoCredentials.length == 0) { + throw new RuntimeException("No credentials found"); + } + + // Get dynamic credentials from Gravitino + Credential gravitinoCredential = gravitinoCredentials[0]; + + if (gravitinoCredential instanceof OSSSecretKeyCredential) { + OSSSecretKeyCredential ossSecretKeyCredential = + (OSSSecretKeyCredential) gravitinoCredential; + basicCredentials = + new DefaultCredentials( + ossSecretKeyCredential.accessKeyId(), ossSecretKeyCredential.secretAccessKey()); + } else if (gravitinoCredential instanceof OSSTokenCredential) { + OSSTokenCredential ossTokenCredential = (OSSTokenCredential) gravitinoCredential; + basicCredentials = + new BasicCredentials( + ossTokenCredential.accessKeyId(), + ossTokenCredential.secretAccessKey(), + ossTokenCredential.securityToken()); + } + + if (gravitinoCredential.expireTimeInMs() > 0) { + expirationTime = + System.currentTimeMillis() + + (long) + ((gravitinoCredential.expireTimeInMs() - System.currentTimeMillis()) + * EXPIRATION_TIME_FACTOR); + } + } + + return basicCredentials; + } +} diff --git a/catalogs/hadoop-common/build.gradle.kts b/catalogs/hadoop-common/build.gradle.kts index 566ce5986e3..09fd9f80170 100644 --- a/catalogs/hadoop-common/build.gradle.kts +++ b/catalogs/hadoop-common/build.gradle.kts @@ -23,6 +23,7 @@ plugins { // try to avoid adding extra dependencies because it is used by catalogs and connectors. 
dependencies { + implementation(project(":api")) implementation(project(":catalogs:catalog-common")) implementation(libs.commons.lang3) implementation(libs.hadoop3.client.api) diff --git a/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/GravitinoFileSystemCredentialProvider.java b/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/GravitinoFileSystemCredentialProvider.java new file mode 100644 index 00000000000..bb1a315cbd0 --- /dev/null +++ b/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/GravitinoFileSystemCredentialProvider.java @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.gravitino.catalog.hadoop.fs; + +import org.apache.gravitino.credential.Credential; +import org.apache.hadoop.conf.Configurable; + +public interface GravitinoFileSystemCredentialProvider extends Configurable { + + Credential[] getCredentials(); +} diff --git a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/DefaultGravitinoFileSystemCredentialProvider.java b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/DefaultGravitinoFileSystemCredentialProvider.java new file mode 100644 index 00000000000..257352477da --- /dev/null +++ b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/DefaultGravitinoFileSystemCredentialProvider.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.gravitino.filesystem.hadoop; + +import org.apache.gravitino.NameIdentifier; +import org.apache.gravitino.catalog.hadoop.fs.GravitinoFileSystemCredentialProvider; +import org.apache.gravitino.client.GravitinoClient; +import org.apache.gravitino.credential.Credential; +import org.apache.gravitino.file.Fileset; +import org.apache.gravitino.file.FilesetCatalog; +import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemUtils; +import org.apache.hadoop.conf.Configuration; + +public class DefaultGravitinoFileSystemCredentialProvider + implements GravitinoFileSystemCredentialProvider { + + private Configuration configuration; + + @Override + public void setConf(Configuration configuration) { + this.configuration = configuration; + } + + @Override + public Configuration getConf() { + return configuration; + } + + @Override + public Credential[] getCredentials() { + String virtualPath = configuration.get("fileset-gvfs-path"); + NameIdentifier nameIdentifier = getNameIdentifierFromVirtualPath(virtualPath); + String[] idents = nameIdentifier.namespace().levels(); + try (GravitinoClient client = GravitinoVirtualFileSystemUtils.createClient(configuration)) { + FilesetCatalog filesetCatalog = client.loadCatalog(idents[1]).asFilesetCatalog(); + Fileset fileset = filesetCatalog.loadFileset(NameIdentifier.of(idents[2], idents[3])); + return fileset.supportsCredentials().getCredentials(); + } + } + + private NameIdentifier getNameIdentifierFromVirtualPath(String gravitinoVirtualPath) { + // TODO implement this method + return null; + } +} diff --git a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java index 45c161b487f..dd71f19eaaf 100644 --- a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java +++ b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java @@ -54,6 +54,8 @@ import org.apache.gravitino.catalog.hadoop.fs.FileSystemProvider; import org.apache.gravitino.client.GravitinoClient; import org.apache.gravitino.exceptions.GravitinoRuntimeException; +import org.apache.gravitino.exceptions.NoSuchCredentialException; +import org.apache.gravitino.file.Fileset; import org.apache.gravitino.file.FilesetCatalog; import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemConfiguration; import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemUtils; @@ -320,6 +322,23 @@ private FilesetContextPair getFilesetContext(Path virtualPath, FilesetDataOperat // If enable the cloud store credential, we should pass the configuration here. totalProperty.put(GVFS_FILESET_IDENTIFIER, identifier.toString()); + Fileset fileset = + catalog + .asFilesetCatalog() + .loadFileset( + NameIdentifier.of(identifier.namespace().level(2), identifier.name())); + + try { + fileset.supportsCredentials().getCredentials(); + // it has enabled the credential provider + totalProperty.put( + "fs.gvfs.credential.provider", + DefaultGravitinoFileSystemCredentialProvider.class.getCanonicalName()); + totalProperty.put("fs.gvfs.virtual.path", virtualPathString); + } catch (NoSuchCredentialException e) { + // No credential found, do nothing. 
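For illustration, the fs.gvfs.credential.provider key wired in above is consumed on the storage side by reflective instantiation. A minimal, self-contained sketch of that hand-off, assuming only the GravitinoFileSystemCredentialProvider interface and the configuration key introduced in this series (the helper class itself is hypothetical):

import org.apache.gravitino.catalog.hadoop.fs.GravitinoFileSystemCredentialProvider;
import org.apache.hadoop.conf.Configuration;

// Hypothetical helper: instantiate the provider class named in the Hadoop
// configuration via its no-arg constructor and hand it the same
// configuration, mirroring what the storage-side providers in this series do.
final class CredentialProviderLoader {
  static GravitinoFileSystemCredentialProvider load(Configuration conf) {
    try {
      GravitinoFileSystemCredentialProvider provider =
          (GravitinoFileSystemCredentialProvider)
              Class.forName(conf.get("fs.gvfs.credential.provider"))
                  .getDeclaredConstructor()
                  .newInstance();
      provider.setConf(conf);
      return provider;
    } catch (Exception e) {
      throw new RuntimeException("Failed to create GravitinoFileSystemCredentialProvider", e);
    }
  }
}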
+ } + return provider.getFileSystem(filePath, totalProperty); } catch (IOException ioe) { throw new GravitinoRuntimeException( From e27f9cd2b86802fdc6c400a36d3ca43602abd6be Mon Sep 17 00:00:00 2001 From: yuqi Date: Tue, 7 Jan 2025 23:27:17 +0800 Subject: [PATCH 38/59] Refactor the framework of using credential in fileset --- bundles/aliyun/build.gradle.kts | 3 - .../oss/fs/OSSCredentialsProvider.java | 63 +++++-------- .../oss/fs/OSSFileSystemProvider.java | 26 +----- .../oss/fs/TestOSSCredentialProvider.java | 92 ------------------- bundles/aws/build.gradle.kts | 3 - .../s3/fs/S3CredentialsProvider.java | 64 +++++-------- .../gravitino/s3/fs/S3FileSystemProvider.java | 12 +-- bundles/azure/build.gradle.kts | 3 - .../abs/fs/AzureFileSystemProvider.java | 21 ++--- .../abs/fs/AzureSasCredentialsProvider.java | 45 ++------- bundles/gcp/build.gradle.kts | 4 - .../gcs/fs/GCSCredentialsProvider.java | 57 +++++------- .../gcs/fs/GCSFileSystemProvider.java | 18 ++-- ...GravitinoFileSystemCredentialProvider.java | 4 + .../build.gradle.kts | 50 ---------- clients/filesystem-hadoop3/build.gradle.kts | 4 - ...GravitinoFileSystemCredentialProvider.java | 30 ++++-- .../hadoop/GravitinoVirtualFileSystem.java | 50 +++++----- ...avitinoVirtualFileSystemConfiguration.java | 5 +- .../GravitinoVirtualFileSystemUtils.java | 2 +- .../gravitino/filesystem/hadoop/Gvfs.java | 1 - .../hadoop/FileSystemTestUtils.java | 1 - .../filesystem/hadoop/TestGvfsBase.java | 1 - .../filesystem/hadoop/TestKerberosClient.java | 1 - .../filesystem/hadoop/TestOauth2Client.java | 1 - .../filesystem/hadoop/TestSimpleClient.java | 1 - ...itinoVirtualFileSystemABSCredentialIT.java | 25 ++--- ...itinoVirtualFileSystemGCSCredentialIT.java | 8 +- settings.gradle.kts | 3 +- 29 files changed, 173 insertions(+), 425 deletions(-) delete mode 100644 bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/TestOSSCredentialProvider.java delete mode 100644 clients/filesystem-hadoop3-common/build.gradle.kts rename clients/{filesystem-hadoop3-common/src/main/java/org/apache/gravitino/filesystem/common => filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop}/GravitinoVirtualFileSystemConfiguration.java (95%) rename clients/{filesystem-hadoop3-common/src/main/java/org/apache/gravitino/filesystem/common => filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop}/GravitinoVirtualFileSystemUtils.java (99%) diff --git a/bundles/aliyun/build.gradle.kts b/bundles/aliyun/build.gradle.kts index 64fdf9ee1a8..472720e527e 100644 --- a/bundles/aliyun/build.gradle.kts +++ b/bundles/aliyun/build.gradle.kts @@ -37,9 +37,6 @@ dependencies { implementation(project(":catalogs:catalog-common")) { exclude("*") } - implementation(project(":clients:filesystem-hadoop3-common")) { - exclude("*") - } implementation(project(":catalogs:hadoop-common")) { exclude("*") } diff --git a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialsProvider.java b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialsProvider.java index d4ea907a29d..f3b8ffafe2f 100644 --- a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialsProvider.java +++ b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialsProvider.java @@ -24,35 +24,35 @@ import com.aliyun.oss.common.auth.CredentialsProvider; import com.aliyun.oss.common.auth.DefaultCredentials; import java.net.URI; -import org.apache.gravitino.NameIdentifier; -import org.apache.gravitino.client.GravitinoClient; +import 
org.apache.gravitino.catalog.hadoop.fs.GravitinoFileSystemCredentialProvider; import org.apache.gravitino.credential.Credential; import org.apache.gravitino.credential.OSSSecretKeyCredential; import org.apache.gravitino.credential.OSSTokenCredential; -import org.apache.gravitino.file.Fileset; -import org.apache.gravitino.file.FilesetCatalog; -import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemConfiguration; -import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemUtils; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.aliyun.oss.Constants; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; public class OSSCredentialsProvider implements CredentialsProvider { - private static final Logger LOG = LoggerFactory.getLogger(OSSCredentialsProvider.class); + private GravitinoFileSystemCredentialProvider gravitinoFileSystemCredentialProvider; private Credentials basicCredentials; - private final String filesetIdentifier; - private GravitinoClient client; - private final Configuration configuration; - private long expirationTime = Long.MAX_VALUE; private static final double EXPIRATION_TIME_FACTOR = 0.9D; public OSSCredentialsProvider(URI uri, Configuration conf) { - this.filesetIdentifier = - conf.get(GravitinoVirtualFileSystemConfiguration.GVFS_FILESET_IDENTIFIER); - this.configuration = conf; + initGvfsCredentialProvider(conf); + } + + private void initGvfsCredentialProvider(Configuration conf) { + try { + gravitinoFileSystemCredentialProvider = + (GravitinoFileSystemCredentialProvider) + Class.forName( + conf.get(GravitinoFileSystemCredentialProvider.GVFS_CREDENTIAL_PROVIDER)) + .getDeclaredConstructor() + .newInstance(); + gravitinoFileSystemCredentialProvider.setConf(conf); + } catch (Exception e) { + throw new RuntimeException("Failed to create GravitinoFileSystemCredentialProvider", e); + } } @Override @@ -60,16 +60,9 @@ public void setCredentials(Credentials credentials) {} @Override public Credentials getCredentials() { - // If the credentials are null or about to expire, refresh the credentials. 
if (basicCredentials == null || System.currentTimeMillis() >= expirationTime) { synchronized (this) { - try { - refresh(); - } finally { - if (null != this.client) { - this.client.close(); - } - } + refresh(); } } @@ -77,24 +70,10 @@ public Credentials getCredentials() { } private void refresh() { - String[] idents = filesetIdentifier.split("\\."); - String catalog = idents[1]; - - client = GravitinoVirtualFileSystemUtils.createClient(configuration); - FilesetCatalog filesetCatalog = client.loadCatalog(catalog).asFilesetCatalog(); - - Fileset fileset = filesetCatalog.loadFileset(NameIdentifier.of(idents[2], idents[3])); - Credential[] credentials = fileset.supportsCredentials().getCredentials(); - Credential credential = getCredential(credentials); - + Credential[] gravitinoCredentials = gravitinoFileSystemCredentialProvider.getCredentials(); + Credential credential = getCredential(gravitinoCredentials); if (credential == null) { - LOG.warn("No credential found for fileset: {}, try to use static AKSK", filesetIdentifier); - expirationTime = Long.MAX_VALUE; - this.basicCredentials = - new DefaultCredentials( - configuration.get(Constants.ACCESS_KEY_ID), - configuration.get(Constants.ACCESS_KEY_SECRET)); - return; + throw new RuntimeException("No suitable credential for OSS found..."); } if (credential instanceof OSSSecretKeyCredential) { diff --git a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java index 12c6fd35c44..c3c0c0d7853 100644 --- a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java +++ b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java @@ -24,7 +24,7 @@ import java.util.Map; import org.apache.gravitino.catalog.hadoop.fs.FileSystemProvider; import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; -import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemConfiguration; +import org.apache.gravitino.catalog.hadoop.fs.GravitinoFileSystemCredentialProvider; import org.apache.gravitino.storage.OSSProperties; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; @@ -61,15 +61,9 @@ public FileSystem getFileSystem(Path path, Map config) throws IO hadoopConfMap.put(OSS_FILESYSTEM_IMPL, AliyunOSSFileSystem.class.getCanonicalName()); } - // if (shouldSetCredentialsProviderExplicitly(config)) { - // hadoopConfMap.put( - // Constants.CREDENTIALS_PROVIDER_KEY, - // OSSCredentialsProvider.class.getCanonicalName()); - // } - if (enableCredentialProvidedByGravitino(config)) { hadoopConfMap.put( - Constants.CREDENTIALS_PROVIDER_KEY, TestOSSCredentialProvider.class.getCanonicalName()); + Constants.CREDENTIALS_PROVIDER_KEY, OSSCredentialsProvider.class.getCanonicalName()); } hadoopConfMap.forEach(configuration::set); @@ -78,21 +72,7 @@ public FileSystem getFileSystem(Path path, Map config) throws IO } private boolean enableCredentialProvidedByGravitino(Map config) { - return null != config.get("fs.gvfs.provider.impl"); - } - - /** - * Check if the credential provider should be set explicitly. - * - *
<p>
When the credential provider is not set and the server URI is set (this means the call is - * from GVFS client), we need to manually set the credential provider - * - * @param config the configuration map - * @return true if the credential provider should be set explicitly - */ - private boolean shouldSetCredentialsProviderExplicitly(Map config) { - return !config.containsKey(Constants.CREDENTIALS_PROVIDER_KEY) - && config.containsKey(GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_SERVER_URI_KEY); + return null != config.get(GravitinoFileSystemCredentialProvider.GVFS_CREDENTIAL_PROVIDER); } @Override diff --git a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/TestOSSCredentialProvider.java b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/TestOSSCredentialProvider.java deleted file mode 100644 index 284341c0893..00000000000 --- a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/TestOSSCredentialProvider.java +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.gravitino.oss.fs; - -import com.aliyun.oss.common.auth.BasicCredentials; -import com.aliyun.oss.common.auth.Credentials; -import com.aliyun.oss.common.auth.CredentialsProvider; -import com.aliyun.oss.common.auth.DefaultCredentials; -import java.net.URI; -import org.apache.gravitino.catalog.hadoop.fs.GravitinoFileSystemCredentialProvider; -import org.apache.gravitino.credential.Credential; -import org.apache.gravitino.credential.OSSSecretKeyCredential; -import org.apache.gravitino.credential.OSSTokenCredential; -import org.apache.hadoop.conf.Configuration; - -public class TestOSSCredentialProvider implements CredentialsProvider { - private GravitinoFileSystemCredentialProvider gravitinoFileSystemCredentialProvider; - private Credentials basicCredentials; - private long expirationTime = Long.MAX_VALUE; - private static final double EXPIRATION_TIME_FACTOR = 0.9D; - - public TestOSSCredentialProvider(URI uri, Configuration conf) { - try { - gravitinoFileSystemCredentialProvider = - (GravitinoFileSystemCredentialProvider) - Class.forName(conf.get("fs.gvfs.credential.provider")) - .getDeclaredConstructor() - .newInstance(); - gravitinoFileSystemCredentialProvider.setConf(conf); - } catch (Exception e) { - throw new RuntimeException("Failed to create GravitinoFileSystemCredentialProvider", e); - } - } - - @Override - public void setCredentials(Credentials credentials) {} - - @Override - public Credentials getCredentials() { - if (basicCredentials == null || System.currentTimeMillis() >= expirationTime) { - Credential[] gravitinoCredentials = gravitinoFileSystemCredentialProvider.getCredentials(); - if (gravitinoCredentials.length == 0) { - throw new RuntimeException("No credentials found"); - } - - // Get dynamic credentials from Gravitino - Credential gravitinoCredential = gravitinoCredentials[0]; - - if (gravitinoCredential instanceof OSSSecretKeyCredential) { - OSSSecretKeyCredential ossSecretKeyCredential = - (OSSSecretKeyCredential) gravitinoCredential; - basicCredentials = - new DefaultCredentials( - ossSecretKeyCredential.accessKeyId(), ossSecretKeyCredential.secretAccessKey()); - } else if (gravitinoCredential instanceof OSSTokenCredential) { - OSSTokenCredential ossTokenCredential = (OSSTokenCredential) gravitinoCredential; - basicCredentials = - new BasicCredentials( - ossTokenCredential.accessKeyId(), - ossTokenCredential.secretAccessKey(), - ossTokenCredential.securityToken()); - } - - if (gravitinoCredential.expireTimeInMs() > 0) { - expirationTime = - System.currentTimeMillis() - + (long) - ((gravitinoCredential.expireTimeInMs() - System.currentTimeMillis()) - * EXPIRATION_TIME_FACTOR); - } - } - - return basicCredentials; - } -} diff --git a/bundles/aws/build.gradle.kts b/bundles/aws/build.gradle.kts index 3c9f604eb74..9a1ea670eea 100644 --- a/bundles/aws/build.gradle.kts +++ b/bundles/aws/build.gradle.kts @@ -37,9 +37,6 @@ dependencies { implementation(project(":catalogs:catalog-common")) { exclude("*") } - implementation(project(":clients:filesystem-hadoop3-common")) { - exclude("*") - } implementation(project(":catalogs:hadoop-common")) { exclude("*") } diff --git a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialsProvider.java b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialsProvider.java index b7b3494e1e3..10ea1d1b6d0 100644 --- a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialsProvider.java +++ b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialsProvider.java @@ -24,35 +24,35 @@ 
import com.amazonaws.auth.BasicAWSCredentials; import com.amazonaws.auth.BasicSessionCredentials; import java.net.URI; -import org.apache.gravitino.NameIdentifier; -import org.apache.gravitino.client.GravitinoClient; +import org.apache.gravitino.catalog.hadoop.fs.GravitinoFileSystemCredentialProvider; import org.apache.gravitino.credential.Credential; import org.apache.gravitino.credential.S3SecretKeyCredential; import org.apache.gravitino.credential.S3TokenCredential; -import org.apache.gravitino.file.Fileset; -import org.apache.gravitino.file.FilesetCatalog; -import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemConfiguration; -import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemUtils; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.s3a.Constants; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; public class S3CredentialsProvider implements AWSCredentialsProvider { - - private static final Logger LOG = LoggerFactory.getLogger(S3CredentialsProvider.class); - private GravitinoClient client; - private final String filesetIdentifier; - private final Configuration configuration; + private GravitinoFileSystemCredentialProvider gravitinoFileSystemCredentialProvider; private AWSCredentials basicSessionCredentials; private long expirationTime = Long.MAX_VALUE; private static final double EXPIRATION_TIME_FACTOR = 0.9D; public S3CredentialsProvider(final URI uri, final Configuration conf) { - this.filesetIdentifier = - conf.get(GravitinoVirtualFileSystemConfiguration.GVFS_FILESET_IDENTIFIER); - this.configuration = conf; + initGvfsCredentialProvider(conf); + } + + private void initGvfsCredentialProvider(Configuration conf) { + try { + gravitinoFileSystemCredentialProvider = + (GravitinoFileSystemCredentialProvider) + Class.forName( + conf.get(GravitinoFileSystemCredentialProvider.GVFS_CREDENTIAL_PROVIDER)) + .getDeclaredConstructor() + .newInstance(); + gravitinoFileSystemCredentialProvider.setConf(conf); + } catch (Exception e) { + throw new RuntimeException("Failed to create GravitinoFileSystemCredentialProvider", e); + } } @Override @@ -60,13 +60,7 @@ public AWSCredentials getCredentials() { // Refresh credentials if they are null or about to expire. if (basicSessionCredentials == null || System.currentTimeMillis() >= expirationTime) { synchronized (this) { - try { - refresh(); - } finally { - if (null != this.client) { - this.client.close(); - } - } + refresh(); } } @@ -75,25 +69,11 @@ public AWSCredentials getCredentials() { @Override public void refresh() { - // The format of filesetIdentifier is "metalake.catalog.fileset.schema" - String[] idents = filesetIdentifier.split("\\."); - String catalog = idents[1]; + Credential[] gravitinoCredentials = gravitinoFileSystemCredentialProvider.getCredentials(); + Credential credential = getSuitableCredential(gravitinoCredentials); - client = GravitinoVirtualFileSystemUtils.createClient(configuration); - FilesetCatalog filesetCatalog = client.loadCatalog(catalog).asFilesetCatalog(); - - Fileset fileset = filesetCatalog.loadFileset(NameIdentifier.of(idents[2], idents[3])); - Credential[] credentials = fileset.supportsCredentials().getCredentials(); - Credential credential = getCredential(credentials); - - // Can't find any credential, use the default AKSK if possible. 
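As a worked example of the EXPIRATION_TIME_FACTOR arithmetic shared by these providers, the next refresh is scheduled once 90% of the credential's remaining lifetime has elapsed. A minimal sketch; the class name and the sample numbers are made up:

// Hypothetical, self-contained illustration of the early-refresh rule used
// by the credential providers in this series.
final class ExpirationMathSketch {
  private static final double EXPIRATION_TIME_FACTOR = 0.9D;

  // Next refresh time: now plus 90% of the credential's remaining lifetime.
  static long nextRefreshAt(long nowMs, long expireTimeInMs) {
    return nowMs + (long) ((expireTimeInMs - nowMs) * EXPIRATION_TIME_FACTOR);
  }

  public static void main(String[] args) {
    // A token minted at t=0 ms that expires at t=3,600,000 ms (one hour)
    // is refreshed at t=3,240,000 ms, i.e. six minutes before expiry.
    System.out.println(nextRefreshAt(0L, 3_600_000L));
  }
}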
if (credential == null) { - LOG.warn("No credential found for fileset: {}, try to use static AKSK", filesetIdentifier); - expirationTime = Long.MAX_VALUE; - this.basicSessionCredentials = - new BasicAWSCredentials( - configuration.get(Constants.ACCESS_KEY), configuration.get(Constants.SECRET_KEY)); - return; + throw new RuntimeException("No suitable credential for S3 found..."); } if (credential instanceof S3SecretKeyCredential) { @@ -126,7 +106,7 @@ public void refresh() { * @param credentials The credential array. * @return A credential. Null if not found. */ - private Credential getCredential(Credential[] credentials) { + private Credential getSuitableCredential(Credential[] credentials) { // Use dynamic credential if found. for (Credential credential : credentials) { if (credential instanceof S3TokenCredential) { diff --git a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java index c60f12c3122..3dd5feae7e3 100644 --- a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java +++ b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java @@ -30,7 +30,7 @@ import java.util.Map; import org.apache.gravitino.catalog.hadoop.fs.FileSystemProvider; import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; -import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemConfiguration; +import org.apache.gravitino.catalog.hadoop.fs.GravitinoFileSystemCredentialProvider; import org.apache.gravitino.storage.S3Properties; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; @@ -67,11 +67,7 @@ public FileSystem getFileSystem(Path path, Map config) throws IO configuration.set(S3_CREDENTIAL_KEY, S3_SIMPLE_CREDENTIAL); } - // Only call from GVFS client will have this key and support GravitinoS3CredentialProvider as - // the file system provider will be used by GVFS client and Gravitino server, only GVFS client - // will have this key. 
- if (hadoopConfMap.containsKey( - GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_SERVER_URI_KEY)) { + if (enableCredentialProvidedByGravitino(config)) { configuration.set( Constants.AWS_CREDENTIALS_PROVIDER, S3CredentialsProvider.class.getCanonicalName()); } @@ -82,6 +78,10 @@ public FileSystem getFileSystem(Path path, Map config) throws IO return S3AFileSystem.newInstance(path.toUri(), configuration); } + private boolean enableCredentialProvidedByGravitino(Map config) { + return null != config.get(GravitinoFileSystemCredentialProvider.GVFS_CREDENTIAL_PROVIDER); + } + private void checkAndSetCredentialProvider(Configuration configuration) { String provides = configuration.get(S3_CREDENTIAL_KEY); if (provides == null) { diff --git a/bundles/azure/build.gradle.kts b/bundles/azure/build.gradle.kts index 74eb55d3c7a..c6a29be27b9 100644 --- a/bundles/azure/build.gradle.kts +++ b/bundles/azure/build.gradle.kts @@ -36,9 +36,6 @@ dependencies { implementation(project(":catalogs:catalog-common")) { exclude("*") } - implementation(project(":clients:filesystem-hadoop3-common")) { - exclude("*") - } implementation(project(":catalogs:hadoop-common")) { exclude("*") } diff --git a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java index a837ff2bc50..b8cedfd9330 100644 --- a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java +++ b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java @@ -30,19 +30,15 @@ import javax.annotation.Nonnull; import org.apache.gravitino.catalog.hadoop.fs.FileSystemProvider; import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; -import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemConfiguration; +import org.apache.gravitino.catalog.hadoop.fs.GravitinoFileSystemCredentialProvider; import org.apache.gravitino.storage.AzureProperties; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.azurebfs.services.AuthType; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; public class AzureFileSystemProvider implements FileSystemProvider { - private static final Logger LOGGER = LoggerFactory.getLogger(AzureFileSystemProvider.class); - @VisibleForTesting public static final String ABS_PROVIDER_SCHEME = "abfss"; @VisibleForTesting public static final String ABS_PROVIDER_NAME = "abs"; @@ -68,15 +64,13 @@ public FileSystem getFileSystem(@Nonnull Path path, @Nonnull Map config.get(AzureProperties.GRAVITINO_AZURE_STORAGE_ACCOUNT_KEY)); } - if (!config.containsKey(ABFS_IMPL_KEY)) { + if (!hadoopConfMap.containsKey(ABFS_IMPL_KEY)) { configuration.set(ABFS_IMPL_KEY, ABFS_IMPL); } hadoopConfMap.forEach(configuration::set); - // This is a workaround to judge whether it's from a Gravitino GVFS client. 
- if (config.containsKey(GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_SERVER_URI_KEY)) { - // Test whether SAS works + if (enableCredentialProvidedByGravitino(hadoopConfMap)) { try { AzureSasCredentialsProvider azureSasCredentialsProvider = new AzureSasCredentialsProvider(); azureSasCredentialsProvider.initialize(configuration, null); @@ -102,16 +96,17 @@ public FileSystem getFileSystem(@Nonnull Path path, @Nonnull Map azureSasCredentialsProvider.getAzureStorageAccountKey()); } } catch (Exception e) { - // Can't use SAS, use account key and account key instead - LOGGER.warn( - "Failed to use SAS token and user account from credential provider, use default conf. ", - e); + throw new IOException("Failed to get SAS token from AzureSasCredentialsProvider", e); } } return FileSystem.get(path.toUri(), configuration); } + private boolean enableCredentialProvidedByGravitino(Map config) { + return null != config.get(GravitinoFileSystemCredentialProvider.GVFS_CREDENTIAL_PROVIDER); + } + @Override public String scheme() { return ABS_PROVIDER_SCHEME; diff --git a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java index c7f315382df..821489b5abd 100644 --- a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java +++ b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java @@ -20,36 +20,22 @@ package org.apache.gravitino.abs.fs; import java.io.IOException; -import org.apache.gravitino.NameIdentifier; -import org.apache.gravitino.client.GravitinoClient; +import org.apache.gravitino.catalog.hadoop.fs.GravitinoFileSystemCredentialProvider; import org.apache.gravitino.credential.ADLSTokenCredential; import org.apache.gravitino.credential.AzureAccountKeyCredential; import org.apache.gravitino.credential.Credential; -import org.apache.gravitino.file.Fileset; -import org.apache.gravitino.file.FilesetCatalog; -import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemConfiguration; -import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemUtils; import org.apache.hadoop.conf.Configurable; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.azurebfs.extensions.SASTokenProvider; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; public class AzureSasCredentialsProvider implements SASTokenProvider, Configurable { - private static final Logger LOGGER = LoggerFactory.getLogger(AzureSasCredentialsProvider.class); - private Configuration configuration; - - private String filesetIdentifier; - - private GravitinoClient client; - private String sasToken; - private String azureStorageAccountName; private String azureStorageAccountKey; + private GravitinoFileSystemCredentialProvider gravitinoFileSystemCredentialProvider; private long expirationTime = Long.MAX_VALUE; private static final double EXPIRATION_TIME_FACTOR = 0.9D; @@ -73,8 +59,6 @@ public Configuration getConf() { @Override public void initialize(Configuration conf, String accountName) throws IOException { - this.filesetIdentifier = - conf.get(GravitinoVirtualFileSystemConfiguration.GVFS_FILESET_IDENTIFIER); this.configuration = conf; } @@ -83,32 +67,17 @@ public String getSASToken(String account, String fileSystem, String path, String // Refresh credentials if they are null or about to expire. 
if (sasToken == null || System.currentTimeMillis() >= expirationTime) { synchronized (this) { - try { - refresh(); - } finally { - if (null != this.client) { - this.client.close(); - } - } + refresh(); } } return sasToken; } private void refresh() { - String[] idents = filesetIdentifier.split("\\."); - String catalog = idents[1]; - - client = GravitinoVirtualFileSystemUtils.createClient(configuration); - FilesetCatalog filesetCatalog = client.loadCatalog(catalog).asFilesetCatalog(); - Fileset fileset = filesetCatalog.loadFileset(NameIdentifier.of(idents[2], idents[3])); - - Credential[] credentials = fileset.supportsCredentials().getCredentials(); - Credential credential = getCredential(credentials); - + Credential[] gravitinoCredentials = gravitinoFileSystemCredentialProvider.getCredentials(); + Credential credential = getSuitableCredential(gravitinoCredentials); if (credential == null) { - LOGGER.warn("No credentials found for fileset {}", filesetIdentifier); - return; + throw new RuntimeException("No suitable credential for Azure found..."); } if (credential instanceof ADLSTokenCredential) { @@ -136,7 +105,7 @@ private void refresh() { * @param credentials The credential array. * @return A credential. Null if not found. */ - private Credential getCredential(Credential[] credentials) { + private Credential getSuitableCredential(Credential[] credentials) { // Use dynamic credential if found. for (Credential credential : credentials) { if (credential instanceof ADLSTokenCredential) { diff --git a/bundles/gcp/build.gradle.kts b/bundles/gcp/build.gradle.kts index aa98e74c39d..4cdcb65539e 100644 --- a/bundles/gcp/build.gradle.kts +++ b/bundles/gcp/build.gradle.kts @@ -34,14 +34,10 @@ dependencies { compileOnly(libs.hadoop3.client.api) compileOnly(libs.hadoop3.client.runtime) compileOnly(libs.hadoop3.gcs) - compileOnly(libs.slf4j.api) implementation(project(":catalogs:catalog-common")) { exclude("*") } - implementation(project(":clients:filesystem-hadoop3-common")) { - exclude("*") - } implementation(project(":catalogs:hadoop-common")) { exclude("*") } diff --git a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java index 41ec17b3a0b..ba28dd97372 100644 --- a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java +++ b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java @@ -21,23 +21,14 @@ import com.google.cloud.hadoop.util.AccessTokenProvider; import java.io.IOException; -import org.apache.gravitino.NameIdentifier; -import org.apache.gravitino.client.GravitinoClient; +import org.apache.gravitino.catalog.hadoop.fs.GravitinoFileSystemCredentialProvider; import org.apache.gravitino.credential.Credential; import org.apache.gravitino.credential.GCSTokenCredential; -import org.apache.gravitino.file.Fileset; -import org.apache.gravitino.file.FilesetCatalog; -import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemConfiguration; -import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemUtils; import org.apache.hadoop.conf.Configuration; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; public class GCSCredentialsProvider implements AccessTokenProvider { - private static final Logger LOG = LoggerFactory.getLogger(GCSCredentialsProvider.class); private Configuration configuration; - private GravitinoClient client; - private String filesetIdentifier; + private GravitinoFileSystemCredentialProvider 
gravitinoFileSystemCredentialProvider; private AccessToken accessToken; private long expirationTime = Long.MAX_VALUE; @@ -49,11 +40,7 @@ public AccessToken getAccessToken() { try { refresh(); } catch (IOException e) { - LOG.error("Failed to refresh the access token", e); - } finally { - if (null != this.client) { - this.client.close(); - } + throw new RuntimeException("Failed to refresh access token", e); } } return accessToken; @@ -61,22 +48,11 @@ public AccessToken getAccessToken() { @Override public void refresh() throws IOException { - // The format of filesetIdentifier is "metalake.catalog.fileset.schema" - String[] idents = filesetIdentifier.split("\\."); - String catalog = idents[1]; + Credential[] gravitinoCredentials = gravitinoFileSystemCredentialProvider.getCredentials(); - client = GravitinoVirtualFileSystemUtils.createClient(configuration); - FilesetCatalog filesetCatalog = client.loadCatalog(catalog).asFilesetCatalog(); - - Fileset fileset = filesetCatalog.loadFileset(NameIdentifier.of(idents[2], idents[3])); - Credential[] credentials = fileset.supportsCredentials().getCredentials(); - - Credential credential = getCredential(credentials); - // Can't find any credential, use the default one. - if (null == credential) { - LOG.warn( - "No credential found for fileset: {}, try to use static JSON file", filesetIdentifier); - return; + Credential credential = getSuitableCredential(gravitinoCredentials); + if (credential == null) { + throw new RuntimeException("No suitable credential for GCS found..."); } if (credential instanceof GCSTokenCredential) { @@ -96,8 +72,21 @@ public void refresh() throws IOException { @Override public void setConf(Configuration configuration) { this.configuration = configuration; - this.filesetIdentifier = - configuration.get(GravitinoVirtualFileSystemConfiguration.GVFS_FILESET_IDENTIFIER); + initGvfsCredentialProvider(configuration); + } + + private void initGvfsCredentialProvider(Configuration conf) { + try { + gravitinoFileSystemCredentialProvider = + (GravitinoFileSystemCredentialProvider) + Class.forName( + conf.get(GravitinoFileSystemCredentialProvider.GVFS_CREDENTIAL_PROVIDER)) + .getDeclaredConstructor() + .newInstance(); + gravitinoFileSystemCredentialProvider.setConf(conf); + } catch (Exception e) { + throw new RuntimeException("Failed to create GravitinoFileSystemCredentialProvider", e); + } } @Override @@ -112,7 +101,7 @@ public Configuration getConf() { * @param credentials The credential array. * @return A credential. 
*/ - private Credential getCredential(Credential[] credentials) { + private Credential getSuitableCredential(Credential[] credentials) { for (Credential credential : credentials) { if (credential instanceof GCSTokenCredential) { return credential; diff --git a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java index e55d47a15d6..c509ac1ca78 100644 --- a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java +++ b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java @@ -18,14 +18,13 @@ */ package org.apache.gravitino.gcs.fs; -import com.google.cloud.hadoop.util.AccessTokenProvider; import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableMap; import java.io.IOException; import java.util.Map; import org.apache.gravitino.catalog.hadoop.fs.FileSystemProvider; import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; -import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemConfiguration; +import org.apache.gravitino.catalog.hadoop.fs.GravitinoFileSystemCredentialProvider; import org.apache.gravitino.storage.GCSProperties; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; @@ -47,20 +46,17 @@ public FileSystem getFileSystem(Path path, Map config) throws IO FileSystemUtils.toHadoopConfigMap(config, GRAVITINO_KEY_TO_GCS_HADOOP_KEY) .forEach(configuration::set); - // This is a workaround to judge whether it's from a Gravitino GVFS client. - if (config.containsKey(GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_SERVER_URI_KEY)) { - AccessTokenProvider accessTokenProvider = new GCSCredentialsProvider(); - accessTokenProvider.setConf(configuration); - // Why is this check necessary?if Gravitino fails to get any credentials, we fall back to - // the default behavior of the GoogleHadoopFileSystem to use service account credentials. 
- if (accessTokenProvider.getAccessToken() != null) { - configuration.set(GCS_TOKEN_PROVIDER_IMPL, GCSCredentialsProvider.class.getName()); - } + if (enableCredentialProvidedByGravitino(config)) { + configuration.set(GCS_TOKEN_PROVIDER_IMPL, GCSCredentialsProvider.class.getName()); } return FileSystem.newInstance(path.toUri(), configuration); } + private boolean enableCredentialProvidedByGravitino(Map config) { + return null != config.get(GravitinoFileSystemCredentialProvider.GVFS_CREDENTIAL_PROVIDER); + } + @Override public String scheme() { return "gs"; diff --git a/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/GravitinoFileSystemCredentialProvider.java b/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/GravitinoFileSystemCredentialProvider.java index bb1a315cbd0..e8cd1e5b222 100644 --- a/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/GravitinoFileSystemCredentialProvider.java +++ b/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/GravitinoFileSystemCredentialProvider.java @@ -24,5 +24,9 @@ public interface GravitinoFileSystemCredentialProvider extends Configurable { + String GVFS_CREDENTIAL_PROVIDER = "fs.gvfs.credential.provider"; + + String GVFS_CREDENTIAL_PROVIDER_PATH = "fs.gvfs.virtual.path"; + Credential[] getCredentials(); } diff --git a/clients/filesystem-hadoop3-common/build.gradle.kts b/clients/filesystem-hadoop3-common/build.gradle.kts deleted file mode 100644 index 09f2fb82e90..00000000000 --- a/clients/filesystem-hadoop3-common/build.gradle.kts +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -plugins { - `maven-publish` - id("java") - id("idea") -} - -dependencies { - implementation(project(":api")) { - exclude("*") - } - - implementation(project(":common")) { - exclude("*") - } - implementation(libs.commons.lang3) - implementation(libs.guava) - - compileOnly(project(":clients:client-java-runtime", configuration = "shadow")) - compileOnly(libs.hadoop3.client.api) - compileOnly(libs.hadoop3.client.runtime) - compileOnly(libs.lombok) - annotationProcessor(libs.lombok) -} - -tasks.build { - dependsOn("javadoc") -} - -tasks.clean { - delete("target") - delete("tmp") -} diff --git a/clients/filesystem-hadoop3/build.gradle.kts b/clients/filesystem-hadoop3/build.gradle.kts index 191cf794d86..424f6a11406 100644 --- a/clients/filesystem-hadoop3/build.gradle.kts +++ b/clients/filesystem-hadoop3/build.gradle.kts @@ -28,10 +28,6 @@ dependencies { compileOnly(libs.hadoop3.client.api) compileOnly(libs.hadoop3.client.runtime) - implementation(project(":clients:filesystem-hadoop3-common")) { - exclude(group = "*") - } - implementation(project(":catalogs:catalog-common")) { exclude(group = "*") } diff --git a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/DefaultGravitinoFileSystemCredentialProvider.java b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/DefaultGravitinoFileSystemCredentialProvider.java index 257352477da..73ac2a03231 100644 --- a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/DefaultGravitinoFileSystemCredentialProvider.java +++ b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/DefaultGravitinoFileSystemCredentialProvider.java @@ -19,13 +19,16 @@ package org.apache.gravitino.filesystem.hadoop; +import com.google.common.base.Preconditions; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import org.apache.commons.lang3.StringUtils; import org.apache.gravitino.NameIdentifier; import org.apache.gravitino.catalog.hadoop.fs.GravitinoFileSystemCredentialProvider; import org.apache.gravitino.client.GravitinoClient; import org.apache.gravitino.credential.Credential; import org.apache.gravitino.file.Fileset; import org.apache.gravitino.file.FilesetCatalog; -import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemUtils; import org.apache.hadoop.conf.Configuration; public class DefaultGravitinoFileSystemCredentialProvider @@ -33,6 +36,9 @@ public class DefaultGravitinoFileSystemCredentialProvider private Configuration configuration; + private static final Pattern IDENTIFIER_PATTERN = + Pattern.compile("^(?:gvfs://fileset)?/([^/]+)/([^/]+)/([^/]+)(?>/[^/]+)*/?$"); + @Override public void setConf(Configuration configuration) { this.configuration = configuration; @@ -45,18 +51,30 @@ public Configuration getConf() { @Override public Credential[] getCredentials() { - String virtualPath = configuration.get("fileset-gvfs-path"); + String virtualPath = configuration.get(GVFS_CREDENTIAL_PROVIDER_PATH); NameIdentifier nameIdentifier = getNameIdentifierFromVirtualPath(virtualPath); String[] idents = nameIdentifier.namespace().levels(); try (GravitinoClient client = GravitinoVirtualFileSystemUtils.createClient(configuration)) { - FilesetCatalog filesetCatalog = client.loadCatalog(idents[1]).asFilesetCatalog(); - Fileset fileset = filesetCatalog.loadFileset(NameIdentifier.of(idents[2], idents[3])); + FilesetCatalog filesetCatalog = client.loadCatalog(idents[0]).asFilesetCatalog(); + Fileset fileset = + 
filesetCatalog.loadFileset(NameIdentifier.of(idents[1], nameIdentifier.name())); return fileset.supportsCredentials().getCredentials(); } } private NameIdentifier getNameIdentifierFromVirtualPath(String gravitinoVirtualPath) { - // TODO implement this method - return null; + String virtualPath = gravitinoVirtualPath; + Preconditions.checkArgument( + StringUtils.isNotBlank(virtualPath), + "URI to be extracted cannot be null or empty."); + + Matcher matcher = IDENTIFIER_PATTERN.matcher(virtualPath); + Preconditions.checkArgument( + matcher.matches() && matcher.groupCount() == 3, + "URI %s doesn't contain a valid identifier", + virtualPath); + + // The format is `catalog.schema.fileset` + return NameIdentifier.of(matcher.group(1), matcher.group(2), matcher.group(3)); } } diff --git a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java index dd71f19eaaf..da1b447769d 100644 --- a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java +++ b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java @@ -18,8 +18,6 @@ */ package org.apache.gravitino.filesystem.hadoop; -import static org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemConfiguration.GVFS_FILESET_IDENTIFIER; - import com.github.benmanes.caffeine.cache.Cache; import com.github.benmanes.caffeine.cache.Caffeine; import com.github.benmanes.caffeine.cache.Scheduler; @@ -52,13 +50,13 @@ import org.apache.gravitino.audit.FilesetDataOperation; import org.apache.gravitino.audit.InternalClientType; import org.apache.gravitino.catalog.hadoop.fs.FileSystemProvider; +import org.apache.gravitino.catalog.hadoop.fs.GravitinoFileSystemCredentialProvider; import org.apache.gravitino.client.GravitinoClient; +import org.apache.gravitino.credential.Credential; import org.apache.gravitino.exceptions.GravitinoRuntimeException; import org.apache.gravitino.exceptions.NoSuchCredentialException; import org.apache.gravitino.file.Fileset; import org.apache.gravitino.file.FilesetCatalog; -import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemConfiguration; -import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemUtils; import org.apache.gravitino.storage.AzureProperties; import org.apache.gravitino.storage.OSSProperties; import org.apache.gravitino.storage.S3Properties; @@ -270,7 +268,6 @@ private FilesetContextPair getFilesetContext(Path virtualPath, FilesetDataOperat catalogCache.get( catalogIdent, ident -> client.loadCatalog(catalogIdent.name()).asFilesetCatalog()); Catalog catalog = (Catalog) filesetCatalog; - Preconditions.checkArgument( filesetCatalog != null, String.format("Loaded fileset catalog: %s is null.", catalogIdent)); @@ -317,26 +314,18 @@ private FilesetContextPair getFilesetContext(Path virtualPath, FilesetDataOperat .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); Map totalProperty = Maps.newHashMap(necessaryPropertyFromCatalog); totalProperty.putAll(getConfigMap(getConf())); - // If enable the cloud store credential, we should pass the configuration here. 
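To make the IDENTIFIER_PATTERN introduced above concrete, a small usage sketch; the demo class and the sample path are made up:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Hypothetical demo: the pattern accepts both the fully qualified gvfs URI
// and the bare path form, capturing the catalog, schema and fileset segments.
final class IdentifierPatternDemo {
  private static final Pattern IDENTIFIER_PATTERN =
      Pattern.compile("^(?:gvfs://fileset)?/([^/]+)/([^/]+)/([^/]+)(?>/[^/]+)*/?$");

  public static void main(String[] args) {
    Matcher matcher =
        IDENTIFIER_PATTERN.matcher("gvfs://fileset/catalog1/schema1/fileset1/sub/file.txt");
    if (matcher.matches()) {
      // Prints: catalog1 schema1 fileset1
      System.out.println(matcher.group(1) + " " + matcher.group(2) + " " + matcher.group(3));
    }
  }
}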
- totalProperty.put(GVFS_FILESET_IDENTIFIER, identifier.toString()); - - Fileset fileset = - catalog - .asFilesetCatalog() - .loadFileset( - NameIdentifier.of(identifier.namespace().level(2), identifier.name())); - - try { - fileset.supportsCredentials().getCredentials(); - // it has enabled the credential provider + + boolean enableCredentialProvider = + enableGravitinoCredentialProvider(catalog, identifier); + if (enableCredentialProvider) { + // It has enabled the credential provider totalProperty.put( - "fs.gvfs.credential.provider", + GravitinoFileSystemCredentialProvider.GVFS_CREDENTIAL_PROVIDER, DefaultGravitinoFileSystemCredentialProvider.class.getCanonicalName()); - totalProperty.put("fs.gvfs.virtual.path", virtualPathString); - } catch (NoSuchCredentialException e) { - // No credential found, do nothing. + totalProperty.put( + GravitinoFileSystemCredentialProvider.GVFS_CREDENTIAL_PROVIDER_PATH, + virtualPathString); } return provider.getFileSystem(filePath, totalProperty); @@ -350,6 +339,23 @@ private FilesetContextPair getFilesetContext(Path virtualPath, FilesetDataOperat return new FilesetContextPair(new Path(actualFileLocation), fs); } + private boolean enableGravitinoCredentialProvider( + Catalog catalog, NameIdentifier filesetIdentifier) { + Fileset fileset = + catalog + .asFilesetCatalog() + .loadFileset( + NameIdentifier.of( + filesetIdentifier.namespace().level(2), filesetIdentifier.name())); + try { + Credential[] credentials = fileset.supportsCredentials().getCredentials(); + return credentials.length > 0; + } catch (NoSuchCredentialException e) { + // No credential found, do nothing. + return false; + } + } + private void resetFileSystemServiceLoader(String fsScheme) { try { Map> serviceFileSystems = diff --git a/clients/filesystem-hadoop3-common/src/main/java/org/apache/gravitino/filesystem/common/GravitinoVirtualFileSystemConfiguration.java b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystemConfiguration.java similarity index 95% rename from clients/filesystem-hadoop3-common/src/main/java/org/apache/gravitino/filesystem/common/GravitinoVirtualFileSystemConfiguration.java rename to clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystemConfiguration.java index 32d6985b822..e2bce734531 100644 --- a/clients/filesystem-hadoop3-common/src/main/java/org/apache/gravitino/filesystem/common/GravitinoVirtualFileSystemConfiguration.java +++ b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystemConfiguration.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.gravitino.filesystem.common; +package org.apache.gravitino.filesystem.hadoop; /** Configuration class for Gravitino Virtual File System. */ public class GravitinoVirtualFileSystemConfiguration { @@ -98,8 +98,5 @@ public class GravitinoVirtualFileSystemConfiguration { public static final long FS_GRAVITINO_FILESET_CACHE_EVICTION_MILLS_AFTER_ACCESS_DEFAULT = 1000L * 60 * 60; - /** The configuration key for the fileset identifier. 
*/ - public static final String GVFS_FILESET_IDENTIFIER = "fs.gvfs.fileset.identifier"; - private GravitinoVirtualFileSystemConfiguration() {} } diff --git a/clients/filesystem-hadoop3-common/src/main/java/org/apache/gravitino/filesystem/common/GravitinoVirtualFileSystemUtils.java b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystemUtils.java similarity index 99% rename from clients/filesystem-hadoop3-common/src/main/java/org/apache/gravitino/filesystem/common/GravitinoVirtualFileSystemUtils.java rename to clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystemUtils.java index c78d2365b82..8a3c4009f8b 100644 --- a/clients/filesystem-hadoop3-common/src/main/java/org/apache/gravitino/filesystem/common/GravitinoVirtualFileSystemUtils.java +++ b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystemUtils.java @@ -17,7 +17,7 @@ * under the License. */ -package org.apache.gravitino.filesystem.common; +package org.apache.gravitino.filesystem.hadoop; import com.google.common.base.Preconditions; import java.io.File; diff --git a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/Gvfs.java b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/Gvfs.java index 37afac71afb..4d2cbf03e98 100644 --- a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/Gvfs.java +++ b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/Gvfs.java @@ -21,7 +21,6 @@ import java.io.IOException; import java.net.URI; import java.net.URISyntaxException; -import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemConfiguration; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.DelegateToFileSystem; diff --git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/FileSystemTestUtils.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/FileSystemTestUtils.java index 7dc20c92c52..9c3fdf86137 100644 --- a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/FileSystemTestUtils.java +++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/FileSystemTestUtils.java @@ -21,7 +21,6 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; import java.util.UUID; -import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemConfiguration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; diff --git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestGvfsBase.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestGvfsBase.java index e31c2b57e67..16fb4e1282c 100644 --- a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestGvfsBase.java +++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestGvfsBase.java @@ -41,7 +41,6 @@ import java.util.concurrent.TimeUnit; import org.apache.gravitino.NameIdentifier; import org.apache.gravitino.dto.responses.FileLocationResponse; -import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemConfiguration; import org.apache.gravitino.rest.RESTUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; diff 
--git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestKerberosClient.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestKerberosClient.java index 1bf3d495c38..564b05cee72 100644 --- a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestKerberosClient.java +++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestKerberosClient.java @@ -29,7 +29,6 @@ import java.util.List; import java.util.UUID; import org.apache.gravitino.Config; -import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemConfiguration; import org.apache.gravitino.server.authentication.KerberosAuthenticator; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; diff --git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestOauth2Client.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestOauth2Client.java index c9479790757..2186f530673 100644 --- a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestOauth2Client.java +++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestOauth2Client.java @@ -51,7 +51,6 @@ import org.apache.gravitino.exceptions.BadRequestException; import org.apache.gravitino.exceptions.RESTException; import org.apache.gravitino.exceptions.UnauthorizedException; -import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemConfiguration; import org.apache.gravitino.json.JsonUtils; import org.apache.gravitino.rest.RESTUtils; import org.apache.gravitino.server.authentication.OAuthConfig; diff --git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestSimpleClient.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestSimpleClient.java index fd724c35951..b88fbba16b4 100644 --- a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestSimpleClient.java +++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestSimpleClient.java @@ -31,7 +31,6 @@ import org.apache.gravitino.dto.AuditDTO; import org.apache.gravitino.dto.MetalakeDTO; import org.apache.gravitino.dto.responses.MetalakeResponse; -import org.apache.gravitino.filesystem.common.GravitinoVirtualFileSystemConfiguration; import org.apache.gravitino.json.JsonUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; diff --git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemABSCredentialIT.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemABSCredentialIT.java index 3d8fd457611..de1edd06076 100644 --- a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemABSCredentialIT.java +++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemABSCredentialIT.java @@ -47,12 +47,13 @@ public class GravitinoVirtualFileSystemABSCredentialIT extends GravitinoVirtualF private static final Logger LOG = LoggerFactory.getLogger(GravitinoVirtualFileSystemABSCredentialIT.class); - public static final String ABS_ACCOUNT_NAME = System.getenv("ABS_STS_ACCOUNT_NAME"); - public static final String ABS_ACCOUNT_KEY = 
System.getenv("ABS_STS_ACCOUNT_KEY"); - public static final String ABS_CONTAINER_NAME = System.getenv("ABS_STS_CONTAINER_NAME"); - public static final String ABS_TENANT_ID = System.getenv("ABS_STS_TENANT_ID"); - public static final String ABS_CLIENT_ID = System.getenv("ABS_STS_CLIENT_ID"); - public static final String ABS_CLIENT_SECRET = System.getenv("ABS_STS_CLIENT_SECRET"); + public static final String ABS_ACCOUNT_NAME = System.getenv("ABS_ACCOUNT_NAME_FOR_CREDENTIAL"); + public static final String ABS_ACCOUNT_KEY = System.getenv("ABS_ACCOUNT_KEY_FOR_CREDENTIAL"); + public static final String ABS_CONTAINER_NAME = + System.getenv("ABS_CONTAINER_NAME_FOR_CREDENTIAL"); + public static final String ABS_TENANT_ID = System.getenv("ABS_TENANT_ID_FOR_CREDENTIAL"); + public static final String ABS_CLIENT_ID = System.getenv("ABS_CLIENT_ID_FOR_CREDENTIAL"); + public static final String ABS_CLIENT_SECRET = System.getenv("ABS_CLIENT_SECRET_FOR_CREDENTIAL"); @BeforeAll public void startIntegrationTest() { @@ -169,11 +170,11 @@ protected String genStorageLocation(String fileset) { public void testAppend() throws IOException {} private static boolean absIsConfigured() { - return StringUtils.isNotBlank(System.getenv("ABS_STS_ACCOUNT_NAME")) - && StringUtils.isNotBlank(System.getenv("ABS_STS_ACCOUNT_KEY")) - && StringUtils.isNotBlank(System.getenv("ABS_STS_CONTAINER_NAME")) - && StringUtils.isNotBlank(System.getenv("ABS_STS_TENANT_ID")) - && StringUtils.isNotBlank(System.getenv("ABS_STS_CLIENT_ID")) - && StringUtils.isNotBlank(System.getenv("ABS_STS_CLIENT_SECRET")); + return StringUtils.isNotBlank(System.getenv("ABS_ACCOUNT_NAME_FOR_CREDENTIAL")) + && StringUtils.isNotBlank(System.getenv("ABS_ACCOUNT_KEY_FOR_CREDENTIAL")) + && StringUtils.isNotBlank(System.getenv("ABS_CONTAINER_NAME_FOR_CREDENTIAL")) + && StringUtils.isNotBlank(System.getenv("ABS_TENANT_ID_FOR_CREDENTIAL")) + && StringUtils.isNotBlank(System.getenv("ABS_CLIENT_ID_FOR_CREDENTIAL")) + && StringUtils.isNotBlank(System.getenv("ABS_CLIENT_SECRET_FOR_CREDENTIAL")); } } diff --git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemGCSCredentialIT.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemGCSCredentialIT.java index 813bf56d5d5..2be451879ca 100644 --- a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemGCSCredentialIT.java +++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemGCSCredentialIT.java @@ -46,9 +46,9 @@ public class GravitinoVirtualFileSystemGCSCredentialIT extends GravitinoVirtualF private static final Logger LOG = LoggerFactory.getLogger(GravitinoVirtualFileSystemGCSCredentialIT.class); - public static final String BUCKET_NAME = System.getenv("GCS_STS_BUCKET_NAME"); + public static final String BUCKET_NAME = System.getenv("GCS_BUCKET_NAME_FOR_CREDENTIAL"); public static final String SERVICE_ACCOUNT_FILE = - System.getenv("GCS_STS_SERVICE_ACCOUNT_JSON_PATH"); + System.getenv("GCS_SERVICE_ACCOUNT_JSON_PATH_FOR_CREDENTIAL"); @BeforeAll public void startIntegrationTest() { @@ -145,7 +145,7 @@ protected String genStorageLocation(String fileset) { public void testAppend() throws IOException {} private static boolean isGCPConfigured() { - return StringUtils.isNotBlank(System.getenv("GCS_STS_SERVICE_ACCOUNT_JSON_PATH")) - && 
StringUtils.isNotBlank(System.getenv("GCS_STS_BUCKET_NAME")); + return StringUtils.isNotBlank(System.getenv("GCS_SERVICE_ACCOUNT_JSON_PATH_FOR_CREDENTIAL")) + && StringUtils.isNotBlank(System.getenv("GCS_BUCKET_NAME_FOR_CREDENTIAL")); } } diff --git a/settings.gradle.kts b/settings.gradle.kts index e1bac291729..c865e14e7a2 100644 --- a/settings.gradle.kts +++ b/settings.gradle.kts @@ -48,8 +48,7 @@ include( "clients:filesystem-hadoop3", "clients:filesystem-hadoop3-runtime", "clients:client-python", - "clients:cli", - "clients:filesystem-hadoop3-common" + "clients:cli" ) if (gradle.startParameter.projectProperties["enableFuse"]?.toBoolean() == true) { include("clients:filesystem-fuse") From bad0c87c46a974b0665f521385a88fd91ceb26b2 Mon Sep 17 00:00:00 2001 From: yuqi Date: Tue, 7 Jan 2025 23:45:24 +0800 Subject: [PATCH 39/59] fix --- bundles/aliyun/build.gradle.kts | 1 - bundles/aws/build.gradle.kts | 1 - bundles/azure/build.gradle.kts | 1 - bundles/gcp/build.gradle.kts | 1 - .../hadoop/integration/test/HadoopABSCatalogIT.java | 1 - .../hadoop/fs/GravitinoFileSystemCredentialProvider.java | 7 +++++++ .../DefaultGravitinoFileSystemCredentialProvider.java | 4 ++++ 7 files changed, 11 insertions(+), 5 deletions(-) diff --git a/bundles/aliyun/build.gradle.kts b/bundles/aliyun/build.gradle.kts index 472720e527e..9dfab9d6798 100644 --- a/bundles/aliyun/build.gradle.kts +++ b/bundles/aliyun/build.gradle.kts @@ -29,7 +29,6 @@ dependencies { compileOnly(project(":catalogs:catalog-common")) compileOnly(project(":catalogs:catalog-hadoop")) compileOnly(project(":core")) - compileOnly(project(":clients:client-java")) compileOnly(libs.hadoop3.client.api) compileOnly(libs.hadoop3.client.runtime) compileOnly(libs.hadoop3.oss) diff --git a/bundles/aws/build.gradle.kts b/bundles/aws/build.gradle.kts index 9a1ea670eea..da06c4d2cce 100644 --- a/bundles/aws/build.gradle.kts +++ b/bundles/aws/build.gradle.kts @@ -29,7 +29,6 @@ dependencies { compileOnly(project(":catalogs:catalog-common")) compileOnly(project(":catalogs:catalog-hadoop")) compileOnly(project(":core")) - compileOnly(project(":clients:client-java")) compileOnly(libs.hadoop3.aws) compileOnly(libs.hadoop3.client.api) compileOnly(libs.hadoop3.client.runtime) diff --git a/bundles/azure/build.gradle.kts b/bundles/azure/build.gradle.kts index c6a29be27b9..1cbe4856af5 100644 --- a/bundles/azure/build.gradle.kts +++ b/bundles/azure/build.gradle.kts @@ -28,7 +28,6 @@ dependencies { compileOnly(project(":api")) compileOnly(project(":catalogs:catalog-hadoop")) compileOnly(project(":core")) - compileOnly(project(":clients:client-java")) compileOnly(libs.hadoop3.abs) compileOnly(libs.hadoop3.client.api) compileOnly(libs.hadoop3.client.runtime) diff --git a/bundles/gcp/build.gradle.kts b/bundles/gcp/build.gradle.kts index 4cdcb65539e..a0bc169bc01 100644 --- a/bundles/gcp/build.gradle.kts +++ b/bundles/gcp/build.gradle.kts @@ -29,7 +29,6 @@ dependencies { compileOnly(project(":catalogs:catalog-common")) compileOnly(project(":catalogs:catalog-hadoop")) compileOnly(project(":core")) - compileOnly(project(":clients:client-java")) compileOnly(libs.hadoop3.client.api) compileOnly(libs.hadoop3.client.runtime) diff --git a/catalogs/catalog-hadoop/src/test/java/org/apache/gravitino/catalog/hadoop/integration/test/HadoopABSCatalogIT.java b/catalogs/catalog-hadoop/src/test/java/org/apache/gravitino/catalog/hadoop/integration/test/HadoopABSCatalogIT.java index 8d068f37ad4..482daba2e3c 100644 --- 
a/catalogs/catalog-hadoop/src/test/java/org/apache/gravitino/catalog/hadoop/integration/test/HadoopABSCatalogIT.java +++ b/catalogs/catalog-hadoop/src/test/java/org/apache/gravitino/catalog/hadoop/integration/test/HadoopABSCatalogIT.java @@ -140,7 +140,6 @@ public void testCreateSchemaAndFilesetWithSpecialLocation() { catalogProps.put("location", ossLocation); catalogProps.put(AzureProperties.GRAVITINO_AZURE_STORAGE_ACCOUNT_NAME, ABS_ACCOUNT_NAME); catalogProps.put(AzureProperties.GRAVITINO_AZURE_STORAGE_ACCOUNT_KEY, ABS_ACCOUNT_KEY); - catalogProps.put(FILESYSTEM_PROVIDERS, AzureFileSystemProvider.ABS_PROVIDER_NAME); Catalog localCatalog = diff --git a/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/GravitinoFileSystemCredentialProvider.java b/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/GravitinoFileSystemCredentialProvider.java index e8cd1e5b222..c70ba852b72 100644 --- a/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/GravitinoFileSystemCredentialProvider.java +++ b/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/GravitinoFileSystemCredentialProvider.java @@ -22,11 +22,18 @@ import org.apache.gravitino.credential.Credential; import org.apache.hadoop.conf.Configurable; +/** + * Interface for providing credentials for Gravitino Virtual File System. + */ public interface GravitinoFileSystemCredentialProvider extends Configurable { String GVFS_CREDENTIAL_PROVIDER = "fs.gvfs.credential.provider"; String GVFS_CREDENTIAL_PROVIDER_PATH = "fs.gvfs.virtual.path"; + /** + * Get credentials for Gravitino Virtual File System. + * @return credentials for Gravitino Virtual File System + */ Credential[] getCredentials(); } diff --git a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/DefaultGravitinoFileSystemCredentialProvider.java b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/DefaultGravitinoFileSystemCredentialProvider.java index 73ac2a03231..15ee22e564a 100644 --- a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/DefaultGravitinoFileSystemCredentialProvider.java +++ b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/DefaultGravitinoFileSystemCredentialProvider.java @@ -31,6 +31,10 @@ import org.apache.gravitino.file.FilesetCatalog; import org.apache.hadoop.conf.Configuration; +/** + * Default implementation of {@link GravitinoFileSystemCredentialProvider} which provides credentials + * for Gravitino Virtual File System. 
+ */ public class DefaultGravitinoFileSystemCredentialProvider implements GravitinoFileSystemCredentialProvider { From ca51efb8a78a926bfafc4e2cf9761cd198a37248 Mon Sep 17 00:00:00 2001 From: yuqi Date: Wed, 8 Jan 2025 00:05:56 +0800 Subject: [PATCH 40/59] fix --- .../hadoop/fs/GravitinoFileSystemCredentialProvider.java | 5 ++--- .../hadoop/DefaultGravitinoFileSystemCredentialProvider.java | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/GravitinoFileSystemCredentialProvider.java b/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/GravitinoFileSystemCredentialProvider.java index c70ba852b72..6d2d63f371e 100644 --- a/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/GravitinoFileSystemCredentialProvider.java +++ b/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/GravitinoFileSystemCredentialProvider.java @@ -22,9 +22,7 @@ import org.apache.gravitino.credential.Credential; import org.apache.hadoop.conf.Configurable; -/** - * Interface for providing credentials for Gravitino Virtual File System. - */ +/** Interface for providing credentials for Gravitino Virtual File System. */ public interface GravitinoFileSystemCredentialProvider extends Configurable { String GVFS_CREDENTIAL_PROVIDER = "fs.gvfs.credential.provider"; @@ -33,6 +31,7 @@ public interface GravitinoFileSystemCredentialProvider extends Configurable { /** * Get credentials for Gravitino Virtual File System. + * * @return credentials for Gravitino Virtual File System */ Credential[] getCredentials(); diff --git a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/DefaultGravitinoFileSystemCredentialProvider.java b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/DefaultGravitinoFileSystemCredentialProvider.java index 15ee22e564a..fd970507ceb 100644 --- a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/DefaultGravitinoFileSystemCredentialProvider.java +++ b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/DefaultGravitinoFileSystemCredentialProvider.java @@ -32,8 +32,8 @@ import org.apache.hadoop.conf.Configuration; /** - * Default implementation of {@link GravitinoFileSystemCredentialProvider} which provides credentials - * for Gravitino Virtual File System. + * Default implementation of {@link GravitinoFileSystemCredentialProvider} which provides + * credentials for Gravitino Virtual File System. 
*/ public class DefaultGravitinoFileSystemCredentialProvider implements GravitinoFileSystemCredentialProvider { From bf1290c6af71903e2cdfd91f6dcfe55dde67c725 Mon Sep 17 00:00:00 2001 From: yuqi Date: Wed, 8 Jan 2025 08:00:58 +0800 Subject: [PATCH 41/59] fix --- .../filesystem/hadoop/GravitinoVirtualFileSystemUtils.java | 1 + 1 file changed, 1 insertion(+) diff --git a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystemUtils.java b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystemUtils.java index 8a3c4009f8b..8a0d1d87433 100644 --- a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystemUtils.java +++ b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystemUtils.java @@ -27,6 +27,7 @@ import org.apache.gravitino.client.KerberosTokenProvider; import org.apache.hadoop.conf.Configuration; +/** Utility class for Gravitino Virtual File System. */ public class GravitinoVirtualFileSystemUtils { /** From a1b249f7df21507244a3cf2bbd5e910861613a33 Mon Sep 17 00:00:00 2001 From: yuqi Date: Wed, 8 Jan 2025 08:10:40 +0800 Subject: [PATCH 42/59] fix typo --- .../org/apache/gravitino/oss/fs/OSSCredentialsProvider.java | 4 ++-- .../test/GravitinoVirtualFileSystemABSCredentialIT.java | 5 +++-- .../test/GravitinoVirtualFileSystemGCSCredentialIT.java | 2 +- .../test/GravitinoVirtualFileSystemOSSCredentialIT.java | 2 +- .../test/GravitinoVirtualFileSystemS3CredentialIT.java | 2 +- 5 files changed, 8 insertions(+), 7 deletions(-) diff --git a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialsProvider.java b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialsProvider.java index f3b8ffafe2f..e16a393558b 100644 --- a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialsProvider.java +++ b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialsProvider.java @@ -71,7 +71,7 @@ public Credentials getCredentials() { private void refresh() { Credential[] gravitinoCredentials = gravitinoFileSystemCredentialProvider.getCredentials(); - Credential credential = getCredential(gravitinoCredentials); + Credential credential = getSuitableCredential(gravitinoCredentials); if (credential == null) { throw new RuntimeException("No suitable credential for OSS found..."); } @@ -106,7 +106,7 @@ private void refresh() { * @param credentials The credential array. * @return A credential. Null if not found. */ - private Credential getCredential(Credential[] credentials) { + private Credential getSuitableCredential(Credential[] credentials) { // Use dynamic credential if found. 
for (Credential credential : credentials) { if (credential instanceof OSSTokenCredential) { diff --git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemABSCredentialIT.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemABSCredentialIT.java index de1edd06076..8e537c85c73 100644 --- a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemABSCredentialIT.java +++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemABSCredentialIT.java @@ -27,6 +27,7 @@ import java.util.Collections; import java.util.Map; import org.apache.gravitino.Catalog; +import org.apache.gravitino.Catalog.Type; import org.apache.gravitino.abs.fs.AzureFileSystemProvider; import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; import org.apache.gravitino.credential.CredentialConstants; @@ -68,7 +69,7 @@ public void startUp() throws Exception { super.startIntegrationTest(); // This value can be tuned by the user, please change it accordingly. - defaultBockSize = 32 * 1024 * 1024; + defaultBlockSize = 32 * 1024 * 1024; // This value is 1 for ABS, 3 for GCS, and 1 for S3A. defaultReplication = 1; @@ -94,7 +95,7 @@ public void startUp() throws Exception { Catalog catalog = metalake.createCatalog( - catalogName, Catalog.Type.FILESET, "hadoop", "catalog comment", properties); + catalogName, Type.FILESET, "hadoop", "catalog comment", properties); Assertions.assertTrue(metalake.catalogExists(catalogName)); catalog.asSchemas().createSchema(schemaName, "schema comment", properties); diff --git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemGCSCredentialIT.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemGCSCredentialIT.java index 2be451879ca..04fbf209588 100644 --- a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemGCSCredentialIT.java +++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemGCSCredentialIT.java @@ -63,7 +63,7 @@ public void startUp() throws Exception { super.startIntegrationTest(); // This value can be tuned by the user, please change it accordingly.
- defaultBockSize = 64 * 1024 * 1024; + defaultBlockSize = 64 * 1024 * 1024; metalakeName = GravitinoITUtils.genRandomName("gvfs_it_metalake"); catalogName = GravitinoITUtils.genRandomName("catalog"); diff --git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemOSSCredentialIT.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemOSSCredentialIT.java index 6fe3938a1d3..db33533eece 100644 --- a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemOSSCredentialIT.java +++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemOSSCredentialIT.java @@ -66,7 +66,7 @@ public void startUp() throws Exception { super.startIntegrationTest(); // This value can be tuned by the user, please change it accordingly. - defaultBockSize = 64 * 1024 * 1024; + defaultBlockSize = 64 * 1024 * 1024; // The default replication factor is 1. defaultReplication = 1; diff --git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemS3CredentialIT.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemS3CredentialIT.java index e424615b79a..bb087cfc1d1 100644 --- a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemS3CredentialIT.java +++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemS3CredentialIT.java @@ -67,7 +67,7 @@ public void startUp() throws Exception { super.startIntegrationTest(); // This value can be tuned by the user, please change it accordingly. - defaultBockSize = 32 * 1024 * 1024; + defaultBlockSize = 32 * 1024 * 1024; // The value is 1 for S3 defaultReplication = 1; From a722080f6dd1ee090d4e56649cb9ae9bdfe815bc Mon Sep 17 00:00:00 2001 From: yuqi Date: Wed, 8 Jan 2025 09:15:43 +0800 Subject: [PATCH 43/59] fix the checkstyle problem.
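The SPI that the preceding commits document and polish is deliberately small: an implementation only has to accept a Hadoop `Configuration` (via `Configurable`) and return a `Credential[]`. A minimal sketch of a custom provider follows; the class is illustrative and not part of this patch series — a real provider would fetch vended credentials from Gravitino or another secret store, and would be wired in through the `fs.gvfs.credential.provider` key defined on the interface.

```java
package org.example.gvfs;

import org.apache.gravitino.catalog.hadoop.fs.GravitinoFileSystemCredentialProvider;
import org.apache.gravitino.credential.Credential;
import org.apache.hadoop.conf.Configuration;

/** Hypothetical provider that returns no credentials, forcing static-config fallback. */
public class NoopCredentialProvider implements GravitinoFileSystemCredentialProvider {

  private Configuration conf;

  @Override
  public void setConf(Configuration conf) {
    // Hadoop calls this with the GVFS configuration; a real provider would read
    // the server URI, metalake, and fileset identity from it.
    this.conf = conf;
  }

  @Override
  public Configuration getConf() {
    return conf;
  }

  @Override
  public Credential[] getCredentials() {
    // A real implementation would return vended credentials here; an empty
    // array signals that no dynamic credential is available.
    return new Credential[0];
  }
}
```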
--- .../test/GravitinoVirtualFileSystemABSCredentialIT.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemABSCredentialIT.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemABSCredentialIT.java index 8e537c85c73..ab2e484b2e9 100644 --- a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemABSCredentialIT.java +++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemABSCredentialIT.java @@ -94,8 +94,7 @@ public void startUp() throws Exception { properties.put(FILESYSTEM_PROVIDERS, AzureFileSystemProvider.ABS_PROVIDER_NAME); Catalog catalog = - metalake.createCatalog( - catalogName, Type.FILESET, "hadoop", "catalog comment", properties); + metalake.createCatalog(catalogName, Type.FILESET, "hadoop", "catalog comment", properties); Assertions.assertTrue(metalake.catalogExists(catalogName)); catalog.asSchemas().createSchema(schemaName, "schema comment", properties); From 03395c810fa27f309e4617dd22733f340f9aae1d Mon Sep 17 00:00:00 2001 From: yuqi Date: Wed, 8 Jan 2025 09:38:38 +0800 Subject: [PATCH 44/59] fix --- .../test/GravitinoVirtualFileSystemABSCredentialIT.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemABSCredentialIT.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemABSCredentialIT.java index ab2e484b2e9..2f79332e8b3 100644 --- a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemABSCredentialIT.java +++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemABSCredentialIT.java @@ -27,7 +27,6 @@ import java.util.Collections; import java.util.Map; import org.apache.gravitino.Catalog; -import org.apache.gravitino.Catalog.Type; import org.apache.gravitino.abs.fs.AzureFileSystemProvider; import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; import org.apache.gravitino.credential.CredentialConstants; @@ -94,7 +93,8 @@ public void startUp() throws Exception { properties.put(FILESYSTEM_PROVIDERS, AzureFileSystemProvider.ABS_PROVIDER_NAME); Catalog catalog = - metalake.createCatalog(catalogName, Type.FILESET, "hadoop", "catalog comment", properties); + metalake.createCatalog( + catalogName, Catalog.Type.FILESET, "hadoop", "catalog comment", properties); Assertions.assertTrue(metalake.catalogExists(catalogName)); catalog.asSchemas().createSchema(schemaName, "schema comment", properties); From 274ec3e4a7539751fc7f865fb6593cd7fc26bc10 Mon Sep 17 00:00:00 2001 From: yuqi Date: Wed, 8 Jan 2025 10:20:09 +0800 Subject: [PATCH 45/59] fix --- .../hadoop/GravitinoVirtualFileSystem.java | 15 +- docs/hadoop-catalog-with-adls.md | 432 ----------------- docs/hadoop-catalog-with-gcs.md | 418 ---------------- docs/hadoop-catalog-with-oss.md | 447 ----------------- docs/hadoop-catalog-with-s3.md | 451 ------------------ docs/hadoop-catalog.md | 9 +- docs/how-to-use-gvfs.md | 66 +-- 7 files changed, 48 insertions(+), 1790 deletions(-) delete mode 100644 
docs/hadoop-catalog-with-adls.md delete mode 100644 docs/hadoop-catalog-with-gcs.md delete mode 100644 docs/hadoop-catalog-with-oss.md delete mode 100644 docs/hadoop-catalog-with-s3.md diff --git a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java index da1b447769d..9cbbcd7bf58 100644 --- a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java +++ b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java @@ -54,7 +54,6 @@ import org.apache.gravitino.client.GravitinoClient; import org.apache.gravitino.credential.Credential; import org.apache.gravitino.exceptions.GravitinoRuntimeException; -import org.apache.gravitino.exceptions.NoSuchCredentialException; import org.apache.gravitino.file.Fileset; import org.apache.gravitino.file.FilesetCatalog; import org.apache.gravitino.storage.AzureProperties; @@ -341,16 +340,16 @@ private FilesetContextPair getFilesetContext(Path virtualPath, FilesetDataOperat private boolean enableGravitinoCredentialProvider( Catalog catalog, NameIdentifier filesetIdentifier) { - Fileset fileset = - catalog - .asFilesetCatalog() - .loadFileset( - NameIdentifier.of( - filesetIdentifier.namespace().level(2), filesetIdentifier.name())); try { + Fileset fileset = + catalog + .asFilesetCatalog() + .loadFileset( + NameIdentifier.of( + filesetIdentifier.namespace().level(2), filesetIdentifier.name())); Credential[] credentials = fileset.supportsCredentials().getCredentials(); return credentials.length > 0; - } catch (NoSuchCredentialException e) { + } catch (Exception e) { // No credential found, do nothing. return false; } diff --git a/docs/hadoop-catalog-with-adls.md b/docs/hadoop-catalog-with-adls.md deleted file mode 100644 index 3ef052da730..00000000000 --- a/docs/hadoop-catalog-with-adls.md +++ /dev/null @@ -1,432 +0,0 @@ ---- -title: "Hadoop catalog with ADLS" -slug: /hadoop-catalog-with-adls -date: 2025-01-03 -keyword: Hadoop catalog ADLS -license: "This software is licensed under the Apache License version 2." ---- - -This document describes how to configure a Hadoop catalog with ADLS (Azure Blob Storage). - -## Prerequisites - -In order to create a Hadoop catalog with ADLS, you need to place [`gravitino-azure-bundle-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-azure-bundle) in Gravitino Hadoop classpath located -at `${HADOOP_HOME}/share/hadoop/common/lib/`. After that, start Gravitino server with the following command: - -```bash -$ bin/gravitino-server.sh start -``` - -## Create a Hadoop Catalog with ADLS - -The rest of this document shows how to use the Hadoop catalog with ADLS in Gravitino with a full example. 
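In practice the prerequisite above amounts to two commands; the source path below is a placeholder for wherever the bundle jar was downloaded:

```bash
# Assumed paths; substitute your Gravitino version and installation directory.
cp /path/to/gravitino-azure-bundle-${gravitino-version}.jar ${HADOOP_HOME}/share/hadoop/common/lib/
bin/gravitino-server.sh start
```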
- -### Catalog a ADLS Hadoop catalog - -Apart from configuration method in [Hadoop-catalog-catalog-configuration](./hadoop-catalog.md#catalog-properties), the following properties are required to configure a Hadoop catalog with ADLS: - -| Configuration item | Description | Default value | Required | Since version | -|-----------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------|-------------------------------------------|------------------| -| `filesystem-providers` | The file system providers to add. Set it to `abs` if it's a Azure Blob Storage fileset, or a comma separated string that contains `abs` like `oss,abs,s3` to support multiple kinds of fileset including `abs`. | (none) | Yes | 0.8.0-incubating | -| `default-filesystem-provider` | The name default filesystem providers of this Hadoop catalog if users do not specify the scheme in the URI. Default value is `builtin-local`, for Azure Blob Storage, if we set this value, we can omit the prefix 'abfss://' in the location. | `builtin-local` | No | 0.8.0-incubating | -| `azure-storage-account-name ` | The account name of Azure Blob Storage. | (none) | Yes if it's a Azure Blob Storage fileset. | 0.8.0-incubating | -| `azure-storage-account-key` | The account key of Azure Blob Storage. | (none) | Yes if it's a Azure Blob Storage fileset. | 0.8.0-incubating | - -### Create a schema - -Refer to [Schema operation](./manage-fileset-metadata-using-gravitino.md#schema-operations) for more details. - -### Create a fileset - -Refer to [Fileset operation](./manage-fileset-metadata-using-gravitino.md#fileset-operations) for more details. - - -## Using Hadoop catalog with ADLS - -### Create a Hadoop catalog/schema/fileset with ADLS - -First, you need to create a Hadoop catalog with ADLS. The following example shows how to create a Hadoop catalog with ADLS: - - - - -```shell -curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ --H "Content-Type: application/json" -d '{ - "name": "example_catalog", - "type": "FILESET", - "comment": "comment", - "provider": "hadoop", - "properties": { - "location": "abfss://container@account-name.dfs.core.windows.net/path", - "azure-storage-account-name": "The account name of the Azure Blob Storage", - "azure-storage-account-key": "The account key of the Azure Blob Storage", - "filesystem-providers": "abs" - } -}' ${GRAVITINO_SERVER_IP:PORT}/api/metalakes/metalake/catalogs -``` - - - - -```java -GravitinoClient gravitinoClient = GravitinoClient - .builder("${GRAVITINO_SERVER_IP:PORT}") - .withMetalake("metalake") - .build(); - -adlsProperties = ImmutableMap.builder() - .put("location", "abfss://container@account-name.dfs.core.windows.net/path") - .put("azure-storage-account-name", "azure storage account name") - .put("azure-storage-account-key", "azure storage account key") - .put("filesystem-providers", "abs") - .build(); - -Catalog adlsCatalog = gravitinoClient.createCatalog("example_catalog", - Type.FILESET, - "hadoop", // provider, Gravitino only supports "hadoop" for now. - "This is a ADLS fileset catalog", - adlsProperties); -// ... 
- -``` - - - - -```python -gravitino_client: GravitinoClient = GravitinoClient(uri="${GRAVITINO_SERVER_IP:PORT}", metalake_name="metalake") -adls_properties = { - "location": "abfss://container@account-name.dfs.core.windows.net/path", - "azure_storage_account_name": "azure storage account name", - "azure_storage_account_key": "azure storage account key" -} - -adls_properties = gravitino_client.create_catalog(name="example_catalog", - type=Catalog.Type.FILESET, - provider="hadoop", - comment="This is a ADLS fileset catalog", - properties=adls_properties) - -``` - - - - -Then create a schema and fileset in the catalog created above. - -Using the following code to create a schema and fileset: - - - - -```shell -curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ --H "Content-Type: application/json" -d '{ - "name": "test_schema", - "comment": "comment", - "properties": { - "location": "abfss://container@account-name.dfs.core.windows.net/path" - } -}' ${GRAVITINO_SERVER_IP:PORT}/api/metalakes/metalake/catalogs/test_catalog/schemas -``` - - - - -```java -Catalog catalog = gravitinoClient.loadCatalog("test_catalog"); - -SupportsSchemas supportsSchemas = catalog.asSchemas(); - -Map schemaProperties = ImmutableMap.builder() - .put("location", "abfss://container@account-name.dfs.core.windows.net/path") - .build(); -Schema schema = supportsSchemas.createSchema("test_schema", - "This is a schema", - schemaProperties -); -// ... -``` - - - - -```python -gravitino_client: GravitinoClient = GravitinoClient(uri="http://127.0.0.1:8090", metalake_name="metalake") -catalog: Catalog = gravitino_client.load_catalog(name="test_catalog") -catalog.as_schemas().create_schema(name="test_schema", - comment="This is a schema", - properties={"location": "abfss://container@account-name.dfs.core.windows.net/path"}) -``` - - - - - - - -```shell -curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ --H "Content-Type: application/json" -d '{ - "name": "example_fileset", - "comment": "This is an example fileset", - "type": "MANAGED", - "storageLocation": "abfss://container@account-name.dfs.core.windows.net/path/example_fileset", - "properties": { - "k1": "v1" - } -}' ${GRAVITINO_SERVER_IP:PORT}/api/metalakes/metalake/catalogs/test_catalog/schemas/test_schema/filesets -``` - - - - -```java -GravitinoClient gravitinoClient = GravitinoClient - .builder("${GRAVITINO_SERVER_IP:PORT}") - .withMetalake("metalake") - .build(); - -Catalog catalog = gravitinoClient.loadCatalog("test_catalog"); -FilesetCatalog filesetCatalog = catalog.asFilesetCatalog(); - -Map propertiesMap = ImmutableMap.builder() - .put("k1", "v1") - .build(); - -filesetCatalog.createFileset( - NameIdentifier.of("test_schema", "example_fileset"), - "This is an example fileset", - Fileset.Type.MANAGED, - "abfss://container@account-name.dfs.core.windows.net/path/example_fileset", - propertiesMap, -); -``` - - - - -```python -gravitino_client: GravitinoClient = GravitinoClient(uri="${GRAVITINO_SERVER_IP:PORT}", metalake_name="metalake") - -catalog: Catalog = gravitino_client.load_catalog(name="test_catalog") -catalog.as_fileset_catalog().create_fileset(ident=NameIdentifier.of("test_schema", "example_fileset"), - type=Fileset.Type.MANAGED, - comment="This is an example fileset", - storage_location="abfss://container@account-name.dfs.core.windows.net/path/example_fileset", - properties={"k1": "v1"}) -``` - - - - -### Using Spark to access the fileset - -The following code snippet shows how to use **PySpark 3.1.3 with Hadoop environment(Hadoop 3.2.0)** 
to access the fileset: - -```python -import logging -from gravitino import NameIdentifier, GravitinoClient, Catalog, Fileset, GravitinoAdminClient -from pyspark.sql import SparkSession -import os - -gravitino_url = "${GRAVITINO_SERVER_IP:PORT}" -metalake_name = "test" - -catalog_name = "your_adls_catalog" -schema_name = "your_adls_schema" -fileset_name = "your_adls_fileset" - -os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-azure-{gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-{gravitino-version}.jar,/path/to/hadoop-azure-3.2.0.jar,/path/to/azure-storage-7.0.0.jar,/path/to/wildfly-openssl-1.0.4.Final.jar --master local[1] pyspark-shell" -spark = SparkSession.builder -.appName("adls_fileset_test") -.config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs") -.config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem") -.config("spark.hadoop.fs.gravitino.server.uri", "${GRAVITINO_SERVER_URL}") -.config("spark.hadoop.fs.gravitino.client.metalake", "test") -.config("spark.hadoop.azure-storage-account-name", "azure_account_name") -.config("spark.hadoop.azure-storage-account-key", "azure_account_key") -.config("spark.hadoop.fs.azure.skipUserGroupMetadataDuringInitialization", "true") -.config("spark.driver.memory", "2g") -.config("spark.driver.port", "2048") -.getOrCreate() - -data = [("Alice", 25), ("Bob", 30), ("Cathy", 45)] -columns = ["Name", "Age"] -spark_df = spark.createDataFrame(data, schema=columns) -gvfs_path = f"gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/people" - -spark_df.coalesce(1).write -.mode("overwrite") -.option("header", "true") -.csv(gvfs_path) -``` - -If your Spark runs **without a Hadoop environment**, you can use the following code snippet to access the fileset: - -```python -## Replace the PYSPARK_SUBMIT_ARGS line in the snippet above with the following; the other settings stay the same - -os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-azure-bundle-{gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-{gravitino-version}.jar --master local[1] pyspark-shell" -``` - -- [`gravitino-azure-bundle-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-azure-bundle) is the Gravitino ADLS jar with Hadoop environment and `hadoop-azure` jar. -- [`gravitino-azure-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-azure) is a condensed version of the Gravitino ADLS bundle jar without Hadoop environment and `hadoop-azure` jar. - -Please choose the correct jar according to your environment. - -:::note -In some Spark versions, a Hadoop environment is needed by the driver, so adding the bundle jars with '--jars' may not work. If this is the case, you should add the jars to the spark CLASSPATH directly.
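One way to do that, sketched with placeholder paths (the exact jar names depend on your Gravitino version), is to put the bundle jars on the driver classpath explicitly:

```shell
# Illustrative only: extend the driver classpath instead of relying on --jars.
spark-submit \
  --conf spark.driver.extraClassPath=/path/to/gravitino-azure-bundle-{gravitino-version}.jar:/path/to/gravitino-filesystem-hadoop3-runtime-{gravitino-version}.jar \
  your_job.py
```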
-::: - -### Using Gravitino virtual file system Java client to access the fileset - -```java -Configuration conf = new Configuration(); -conf.set("fs.AbstractFileSystem.gvfs.impl","org.apache.gravitino.filesystem.hadoop.Gvfs"); -conf.set("fs.gvfs.impl","org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem"); -conf.set("fs.gravitino.server.uri","${GRAVITINO_SERVER_URL}"); -conf.set("fs.gravitino.client.metalake","test_metalake"); -conf.set("azure-storage-account-name", "account_name_of_adls"); -conf.set("azure-storage-account-key", "account_key_of_adls"); -Path filesetPath = new Path("gvfs://fileset/test_catalog/test_schema/test_fileset/new_dir"); -FileSystem fs = filesetPath.getFileSystem(conf); -fs.mkdirs(filesetPath); -... -``` - -Similar to Spark configurations, you need to add ADLS bundle jars to the classpath according to your environment. - -### Accessing a fileset using the Hadoop fs command - -The following are examples of how to use the `hadoop fs` command to access the fileset in Hadoop 3.1.3. - -1. Adding the following contents to the `${HADOOP_HOME}/etc/hadoop/core-site.xml` file: - -```xml - - fs.AbstractFileSystem.gvfs.impl - org.apache.gravitino.filesystem.hadoop.Gvfs - - - - fs.gvfs.impl - org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem - - - - fs.gravitino.server.uri - ${GRAVITINO_SERVER_IP:PORT} - - - - fs.gravitino.client.metalake - test - - - - azure-storage-account-name - account_name - - - azure-storage-account-key - account_key - -``` - -2. Copy the necessary jars to the `${HADOOP_HOME}/share/hadoop/common/lib` directory. - -For ADLS, you need to copy `gravitino-azure-{version}.jar` to the `${HADOOP_HOME}/share/hadoop/common/lib` directory, -then copy `hadoop-azure-${version}.jar` and related dependencies to the `${HADOOP_HOME}/share/hadoop/tools/lib/` directory. Those jars can be found in the `${HADOOP_HOME}/share/hadoop/tools/lib/` directory, you can add all the jars in the `${HADOOP_HOME}/share/hadoop/tools/lib/` directory to the `${HADOOP_HOME}/share/hadoop/common/lib` directory. - - -3. Run the following command to access the fileset: - -```shell -hadoop dfs -ls gvfs://fileset/adls_catalog/schema/example -hadoop dfs -put /path/to/local/file gvfs://fileset/adls_catalog/schema/example -``` - -### Using the Gravitino virtual file system Python client to access a fileset - -```python -from gravitino import gvfs -options = { - "cache_size": 20, - "cache_expired_time": 3600, - "auth_type": "simple", - "azure_storage_account_name": "azure_account_name", - "azure_storage_account_key": "azure_account_key" -} -fs = gvfs.GravitinoVirtualFileSystem(server_uri="${GRAVITINO_SERVER_IP:PORT}", metalake_name="test_metalake", options=options) -fs.ls("gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/") -``` - - -### Using fileset with pandas - -The following are examples of how to use the pandas library to access the ADLS fileset - -```python -import pandas as pd - -storage_options = { - "server_uri": "${GRAVITINO_SERVER_IP:PORT}", - "metalake_name": "test", - "options": { - "azure_storage_account_name": "azure_account_name", - "azure_storage_account_key": "azure_account_key" - } -} -ds = pd.read_csv(f"gvfs://fileset/${catalog_name}/${schema_name}/${fileset_name}/people/part-00000-51d366e2-d5eb-448d-9109-32a96c8a14dc-c000.csv", - storage_options=storage_options) -ds.head() -``` - -For other use cases, please refer to the [Gravitino Virtual File System](./how-to-use-gvfs.md) document. 
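Reading data back through the Python client works the same way: since `GravitinoVirtualFileSystem` exposes the fsspec-style interface already used by `ls` above, `open` can stream a file directly. A short sketch — the part-file name is illustrative (list the directory to find the real one), and the options mirror the earlier example:

```python
from gravitino import gvfs

options = {
    "auth_type": "simple",
    "azure_storage_account_name": "azure_account_name",
    "azure_storage_account_key": "azure_account_key",
}
fs = gvfs.GravitinoVirtualFileSystem(
    server_uri="${GRAVITINO_SERVER_IP:PORT}", metalake_name="test_metalake", options=options
)

# Hypothetical CSV written by the Spark example above.
path = "gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/people"
print(fs.ls(path))  # discover the actual part-file name first
with fs.open(path + "/part-00000.csv", "rb") as f:
    print(f.read(200))
```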
- -## Fileset with credential - -Since 0.8.0-incubating, Gravitino supports credential vending for ADLS fileset. If the catalog has been configured with credentials, you can access ADLS fileset without providing authentication information like `azure-storage-account-name` and `azure-storage-account-key` in the properties. - -### How to create an ADLS Hadoop catalog with credential enabled - -Apart from the configuration method in [create-adls-hadoop-catalog](#catalog-a-catalog), properties needed by [adls-credential](./security/credential-vending.md#adls-credentials) should also be set to enable credential vending for ADLS fileset. - -### How to access ADLS fileset with credential - -If the catalog has been configured with credentials, you can access ADLS fileset without providing authentication information via GVFS. Let's see how to access ADLS fileset with credential: - -GVFS Java client: - -```java -Configuration conf = new Configuration(); -conf.set("fs.AbstractFileSystem.gvfs.impl","org.apache.gravitino.filesystem.hadoop.Gvfs"); -conf.set("fs.gvfs.impl","org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem"); -conf.set("fs.gravitino.server.uri","${GRAVITINO_SERVER_IP:PORT}"); -conf.set("fs.gravitino.client.metalake","test_metalake"); -// No need to set azure-storage-account-name and azure-storage-account-key -Path filesetPath = new Path("gvfs://fileset/adls_test_catalog/test_schema/test_fileset/new_dir"); -FileSystem fs = filesetPath.getFileSystem(conf); -fs.mkdirs(filesetPath); -... -``` - -Spark: - -```python -spark = SparkSession.builder - .appName("adls_fileset_test") - .config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs") - .config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem") - .config("spark.hadoop.fs.gravitino.server.uri", "${GRAVITINO_SERVER_IP:PORT}") - .config("spark.hadoop.fs.gravitino.client.metalake", "test") - # No need to set azure-storage-account-name and azure-storage-account-key - .config("spark.driver.memory", "2g") - .config("spark.driver.port", "2048") - .getOrCreate() -``` - -Python client and Hadoop command are similar to the above examples. - diff --git a/docs/hadoop-catalog-with-gcs.md b/docs/hadoop-catalog-with-gcs.md deleted file mode 100644 index 9578a952a50..00000000000 --- a/docs/hadoop-catalog-with-gcs.md +++ /dev/null @@ -1,418 +0,0 @@ ---- -title: "Hadoop catalog with GCS" -slug: /hadoop-catalog-with-gcs -date: 2024-01-03 -keyword: Hadoop catalog GCS -license: "This software is licensed under the Apache License version 2." ---- - -This document describes how to configure a Hadoop catalog with GCS. - -## Prerequisites - -In order to create a Hadoop catalog with GCS, you need to place [`gravitino-gcp-bundle-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-gcp-bundle) in Gravitino Hadoop classpath located -at `${HADOOP_HOME}/share/hadoop/common/lib/`. After that, start Gravitino server with the following command: - -```bash -$ bin/gravitino-server.sh start -``` - -## Create a Hadoop Catalog with GCS - -The rest of this document shows how to use the Hadoop catalog with GCS in Gravitino with a full example.
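Before creating the catalog, it can help to confirm that the server started in the prerequisites is reachable; a hypothetical smoke test against the metalake-listing endpoint, which should return JSON:

```shell
# Illustrative readiness check against the Gravitino REST API.
curl -H "Accept: application/vnd.gravitino.v1+json" \
  ${GRAVITINO_SERVER_IP:PORT}/api/metalakes
```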
- - -### Catalog a GCS Hadoop catalog - -Apart from configuration method in [Hadoop-catalog-catalog-configuration](./hadoop-catalog.md#catalog-properties), the following properties are required to configure a Hadoop catalog with GCS: - -| Configuration item | Description | Default value | Required | Since version | -|-------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------|----------------------------|------------------| -| `filesystem-providers` | The file system providers to add. Set it to `gcs` if it's a GCS fileset, a comma separated string that contains `gcs` like `gcs,s3` to support multiple kinds of fileset including `gcs`. | (none) | Yes | 0.7.0-incubating | -| `default-filesystem-provider` | The name default filesystem providers of this Hadoop catalog if users do not specify the scheme in the URI. Default value is `builtin-local`, for GCS, if we set this value, we can omit the prefix 'gs://' in the location. | `builtin-local` | No | 0.7.0-incubating | -| `gcs-service-account-file` | The path of GCS service account JSON file. | (none) | Yes if it's a GCS fileset. | 0.7.0-incubating | - -### Create a schema - -Refer to [Schema operation](./manage-fileset-metadata-using-gravitino.md#schema-operations) for more details. - -### Create a fileset - -Refer to [Fileset operation](./manage-fileset-metadata-using-gravitino.md#fileset-operations) for more details. - -## Using Hadoop catalog with GCS - -### Create a Hadoop catalog/schema/fileset with GCS - -First, you need to create a Hadoop catalog with GCS. The following example shows how to create a Hadoop catalog with GCS: - - - - -```shell -curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ --H "Content-Type: application/json" -d '{ - "name": "test_catalog", - "type": "FILESET", - "comment": "comment", - "provider": "hadoop", - "properties": { - "location": "gs://bucket/root", - "gcs-service-account-file": "path_of_gcs_service_account_file", - "filesystem-providers": "gcs" - } -}' ${GRAVITINO_SERVER_IP:PORT}/api/metalakes/metalake/catalogs -``` - - - - -```java -GravitinoClient gravitinoClient = GravitinoClient - .builder("${GRAVITINO_SERVER_IP:PORT}") - .withMetalake("metalake") - .build(); - -gcsProperties = ImmutableMap.builder() - .put("location", "gs://bucket/root") - .put("gcs-service-account-file", "path_of_gcs_service_account_file") - .put("filesystem-providers", "gcs") - .build(); - -Catalog gcsCatalog = gravitinoClient.createCatalog("test_catalog", - Type.FILESET, - "hadoop", // provider, Gravitino only supports "hadoop" for now. - "This is a GCS fileset catalog", - gcsProperties); -// ... - -``` - - - - -```python -gravitino_client: GravitinoClient = GravitinoClient(uri="${GRAVITINO_SERVER_IP:PORT}", metalake_name="metalake") -gcs_properties = { - "location": "gs://bucket/root", - "gcs-service-account-file": "path_of_gcs_service_account_file" -} - -gcs_properties = gravitino_client.create_catalog(name="test_catalog", - type=Catalog.Type.FILESET, - provider="hadoop", - comment="This is a GCS fileset catalog", - properties=gcs_properties) - -``` - - - - -Then create a schema and a fileset in the catalog created above. 
- -Using the following code to create a schema and fileset: - - - - -```shell -curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ --H "Content-Type: application/json" -d '{ - "name": "test_schema", - "comment": "comment", - "properties": { - "location": "gs://bucket/root/schema" - } -}' ${GRAVITINO_SERVER_IP:PORT}/api/metalakes/metalake/catalogs/test_catalog/schemas -``` - - - - -```java -Catalog catalog = gravitinoClient.loadCatalog("test_catalog"); - -SupportsSchemas supportsSchemas = catalog.asSchemas(); - -Map schemaProperties = ImmutableMap.builder() - .put("location", "gs://bucket/root/schema") - .build(); -Schema schema = supportsSchemas.createSchema("test_schema", - "This is a schema", - schemaProperties -); -// ... -``` - - - - -```python -gravitino_client: GravitinoClient = GravitinoClient(uri="http://127.0.0.1:8090", metalake_name="metalake") -catalog: Catalog = gravitino_client.load_catalog(name="test_catalog") -catalog.as_schemas().create_schema(name="test_schema", - comment="This is a schema", - properties={"location": "gs://bucket/root/schema"}) -``` - - - - - - - -```shell -curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \ --H "Content-Type: application/json" -d '{ - "name": "example_fileset", - "comment": "This is an example fileset", - "type": "MANAGED", - "storageLocation": "gs://bucket/root/schema/example_fileset", - "properties": { - "k1": "v1" - } -}' ${GRAVITINO_SERVER_IP:PORT}/api/metalakes/metalake/catalogs/test_catalog/schemas/test_schema/filesets -``` - - - - -```java -GravitinoClient gravitinoClient = GravitinoClient - .builder("${GRAVITINO_SERVER_IP:PORT}") - .withMetalake("metalake") - .build(); - -Catalog catalog = gravitinoClient.loadCatalog("test_catalog"); -FilesetCatalog filesetCatalog = catalog.asFilesetCatalog(); - -Map propertiesMap = ImmutableMap.builder() - .put("k1", "v1") - .build(); - -filesetCatalog.createFileset( - NameIdentifier.of("test_schema", "example_fileset"), - "This is an example fileset", - Fileset.Type.MANAGED, - "gs://bucket/root/schema/example_fileset", - propertiesMap, -); -``` - - - - -```python -gravitino_client: GravitinoClient = GravitinoClient(uri="${GRAVITINO_SERVER_IP:PORT}", metalake_name="metalake") - -catalog: Catalog = gravitino_client.load_catalog(name="test_catalog") -catalog.as_fileset_catalog().create_fileset(ident=NameIdentifier.of("test_schema", "example_fileset"), - type=Fileset.Type.MANAGED, - comment="This is an example fileset", - storage_location="gs://bucket/root/schema/example_fileset", - properties={"k1": "v1"}) -``` - - - - -### Using Spark to access the fileset - -The following code snippet shows how to use **PySpark 3.1.3 with Hadoop environment(Hadoop 3.2.0)** to access the fileset: - -```python -import logging -from gravitino import NameIdentifier, GravitinoClient, Catalog, Fileset, GravitinoAdminClient -from pyspark.sql import SparkSession -import os - -gravitino_url = "${GRAVITINO_SERVER_IP:PORT}" -metalake_name = "test" - -catalog_name = "your_gcs_catalog" -schema_name = "your_gcs_schema" -fileset_name = "your_gcs_fileset" - -os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-gcp-{gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-{gravitino-version}.jar,/path/to/gcs-connector-hadoop3-2.2.22-shaded.jar --master local[1] pyspark-shell" -spark = SparkSession.builder -.appName("gcs_fielset_test") -.config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs") -.config("spark.hadoop.fs.gvfs.impl", 
"org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem") -.config("spark.hadoop.fs.gravitino.server.uri", "${GRAVITINO_SERVER_URL}") -.config("spark.hadoop.fs.gravitino.client.metalake", "test_metalake") -.config("spark.hadoop.gcs-service-account-file", "/path/to/gcs-service-account-file.json") -.config("spark.driver.memory", "2g") -.config("spark.driver.port", "2048") -.getOrCreate() - -data = [("Alice", 25), ("Bob", 30), ("Cathy", 45)] -columns = ["Name", "Age"] -spark_df = spark.createDataFrame(data, schema=columns) -gvfs_path = f"gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/people" - -spark_df.coalesce(1).write -.mode("overwrite") -.option("header", "true") -.csv(gvfs_path) -``` - -If your Spark **without Hadoop environment**, you can use the following code snippet to access the fileset: - -```python -## Replace the following code snippet with the above code snippet with the same environment variables - -os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-gcp-bundle-{gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-{gravitino-version}.jar, --master local[1] pyspark-shell" -``` - -- [`gravitino-gcp-bundle-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-gcp-bundle) is the Gravitino GCP jar with Hadoop environment and `gcs-connector`. -- [`gravitino-gcp-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-gcp) is a condensed version of the Gravitino GCP bundle jar without Hadoop environment and `gcs-connector`. - -Please choose the correct jar according to your environment. - -:::note -In some Spark versions, a Hadoop environment is needed by the driver, adding the bundle jars with '--jars' may not work. If this is the case, you should add the jars to the spark CLASSPATH directly. -::: - -### Using Gravitino virtual file system Java client to access the fileset - -```java -Configuration conf = new Configuration(); -conf.set("fs.AbstractFileSystem.gvfs.impl","org.apache.gravitino.filesystem.hadoop.Gvfs"); -conf.set("fs.gvfs.impl","org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem"); -conf.set("fs.gravitino.server.uri","${GRAVITINO_SERVER_IP:PORT}"); -conf.set("fs.gravitino.client.metalake","test_metalake"); -conf.set("gcs-service-account-file", "/path/your-service-account-file.json"); -Path filesetPath = new Path("gvfs://fileset/test_catalog/test_schema/test_fileset/new_dir"); -FileSystem fs = filesetPath.getFileSystem(conf); -fs.mkdirs(filesetPath); -... -``` - -Similar to Spark configurations, you need to add GCS bundle jars to the classpath according to your environment. - -### Accessing a fileset using the Hadoop fs command - -The following are examples of how to use the `hadoop fs` command to access the fileset in Hadoop 3.1.3. - -1. Adding the following contents to the `${HADOOP_HOME}/etc/hadoop/core-site.xml` file: - -```xml - - fs.AbstractFileSystem.gvfs.impl - org.apache.gravitino.filesystem.hadoop.Gvfs - - - - fs.gvfs.impl - org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem - - - - fs.gravitino.server.uri - ${GRAVITINO_SERVER_IP:PORT} - - - - fs.gravitino.client.metalake - test - - - - gcs-service-account-file - /path/your-service-account-file.json - -``` - -2. Copy the necessary jars to the `${HADOOP_HOME}/share/hadoop/common/lib` directory. - -For GCS, you need to copy `gravitino-gcp-{version}.jar` to the `${HADOOP_HOME}/share/hadoop/common/lib` directory. 
-Then copy `hadoop-gcp-${version}.jar` and other possible dependencies to the `${HADOOP_HOME}/share/hadoop/tools/lib/` directory. Those jars can be found in the `${HADOOP_HOME}/share/hadoop/tools/lib/` directory, you can add all the jars in the `${HADOOP_HOME}/share/hadoop/tools/lib/` directory to the `${HADOOP_HOME}/share/hadoop/common/lib` directory. - - -3. Run the following command to access the fileset: - -```shell -hadoop dfs -ls gvfs://fileset/gcs_catalog/schema/example -hadoop dfs -put /path/to/local/file gvfs://fileset/gcs_catalog/schema/example -``` - - -### Using the Gravitino virtual file system Python client to access a fileset - -```python -from gravitino import gvfs -options = { - "cache_size": 20, - "cache_expired_time": 3600, - "auth_type": "simple", - "gcs_service_account_file": "path_of_gcs_service_account_file.json", -} -fs = gvfs.GravitinoVirtualFileSystem(server_uri="${GRAVITINO_SERVER_IP:PORT}", metalake_name="test_metalake", options=options) -fs.ls("gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/") -``` - -### Using fileset with pandas - -The following are examples of how to use the pandas library to access the GCS fileset - -```python -import pandas as pd - -storage_options = { - "server_uri": "${GRAVITINO_SERVER_IP:PORT}", - "metalake_name": "test", - "options": { - "gcs_service_account_file": "path_of_gcs_service_account_file.json", - } -} -ds = pd.read_csv(f"gvfs://fileset/${catalog_name}/${schema_name}/${fileset_name}/people/part-00000-51d366e2-d5eb-448d-9109-32a96c8a14dc-c000.csv", - storage_options=storage_options) -ds.head() -``` - -For other use cases, please refer to the [Gravitino Virtual File System](./how-to-use-gvfs.md) document. - -## Fileset with credential - -Since 0.8.0-incubating, Gravitino supports credential vending for GCS fileset. If the catalog has been configured with credential, you can access GCS fileset without providing authentication information like `gcs-service-account-file` in the properties. - -### How to create a GCS Hadoop catalog with credential enabled - -Apart from configuration method in [create-gcs-hadoop-catalog](#catalog-a-catalog), properties needed by [gcs-credential](./security/credential-vending.md#gcs-credentials) should also be set to enable credential vending for GCS fileset. - -### How to access GCS fileset with credential - -If the catalog has been configured with credential, you can access GCS fileset without providing authentication information via GVFS. Let's see how to access GCS fileset with credential: - -GVFS Java client: - -```java -Configuration conf = new Configuration(); -conf.set("fs.AbstractFileSystem.gvfs.impl","org.apache.gravitino.filesystem.hadoop.Gvfs"); -conf.set("fs.gvfs.impl","org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem"); -conf.set("fs.gravitino.server.uri","${GRAVITINO_SERVER_IP:PORT}"); -conf.set("fs.gravitino.client.metalake","test_metalake"); -// No need to set gcs-service-account-file -Path filesetPath = new Path("gvfs://fileset/gcs_test_catalog/test_schema/test_fileset/new_dir"); -FileSystem fs = filesetPath.getFileSystem(conf); -fs.mkdirs(filesetPath); -... 
-``` - -Spark: - -```python -spark = SparkSession.builder - .appName("gcs_fileset_test") - .config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs") - .config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem") - .config("spark.hadoop.fs.gravitino.server.uri", "${GRAVITINO_SERVER_IP:PORT}") - .config("spark.hadoop.fs.gravitino.client.metalake", "test") - # No need to set gcs-service-account-file - .config("spark.driver.memory", "2g") - .config("spark.driver.port", "2048") - .getOrCreate() -``` - -Python client and Hadoop command are similar to the above examples. diff --git a/docs/hadoop-catalog-with-oss.md b/docs/hadoop-catalog-with-oss.md deleted file mode 100644 index 761501079d3..00000000000 --- a/docs/hadoop-catalog-with-oss.md +++ /dev/null @@ -1,447 +0,0 @@ ---- -title: "Hadoop catalog with OSS" -slug: /hadoop-catalog-with-oss -date: 2025-01-03 -keyword: Hadoop catalog OSS -license: "This software is licensed under the Apache License version 2." ---- - -This document describes how to configure a Hadoop catalog with Aliyun OSS. - -## Prerequisites - -In order to create a Hadoop catalog with OSS, you need to place [`gravitino-aliyun-bundle-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-aliyun-bundle) in Gravitino Hadoop classpath located -at `${HADOOP_HOME}/share/hadoop/common/lib/`. After that, start Gravitino server with the following command: - -```bash -$ bin/gravitino-server.sh start -``` - -## Create a Hadoop Catalog with OSS - -### Catalog an OSS Hadoop catalog - -Apart from configuration method in [Hadoop-catalog-catalog-configuration](./hadoop-catalog.md#catalog-properties), the following properties are required to configure a Hadoop catalog with OSS: - -| Configuration item | Description | Default value | Required | Since version | -|-------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------|----------------------------|------------------| -| `filesystem-providers` | The file system providers to add. Set it to `oss` if it's a OSS fileset, or a comma separated string that contains `oss` like `oss,gs,s3` to support multiple kinds of fileset including `oss`. | (none) | Yes | 0.7.0-incubating | -| `default-filesystem-provider` | The name default filesystem providers of this Hadoop catalog if users do not specify the scheme in the URI. Default value is `builtin-local`, for OSS, if we set this value, we can omit the prefix 'oss://' in the location. | `builtin-local` | No | 0.7.0-incubating | -| `oss-endpoint` | The endpoint of the Aliyun OSS. | (none) | Yes if it's a OSS fileset. | 0.7.0-incubating | -| `oss-access-key-id` | The access key of the Aliyun OSS. | (none) | Yes if it's a OSS fileset. | 0.7.0-incubating | -| `oss-secret-access-key` | The secret key of the Aliyun OSS. | (none) | Yes if it's a OSS fileset. | 0.7.0-incubating | - -### Create a schema - -Refer to [Schema operation](./manage-fileset-metadata-using-gravitino.md#schema-operations) for more details. - -### Create a fileset - -Refer to [Fileset operation](./manage-fileset-metadata-using-gravitino.md#fileset-operations) for more details. 
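As the table above notes, setting `default-filesystem-provider` lets storage locations omit the `oss://` scheme. A hypothetical catalog `properties` fragment combining these options — all values are placeholders:

```json
{
  "filesystem-providers": "oss",
  "default-filesystem-provider": "oss",
  "location": "bucket/root",
  "oss-endpoint": "http://oss-cn-hangzhou.aliyuncs.com",
  "oss-access-key-id": "access_key",
  "oss-secret-access-key": "secret_key"
}
```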
-
-
-## Using Hadoop catalog with OSS
-
-The rest of this document shows how to use the Hadoop catalog with OSS in Gravitino with a full example.
-
-### Create a Hadoop catalog/schema/fileset with OSS
-
-First, you need to create a Hadoop catalog with OSS. The following example shows how to do so:
-
-```shell
-curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \
--H "Content-Type: application/json" -d '{
-  "name": "test_catalog",
-  "type": "FILESET",
-  "comment": "comment",
-  "provider": "hadoop",
-  "properties": {
-    "location": "oss://bucket/root",
-    "oss-access-key-id": "access_key",
-    "oss-secret-access-key": "secret_key",
-    "oss-endpoint": "http://oss-cn-hangzhou.aliyuncs.com",
-    "filesystem-providers": "oss"
-  }
-}' http://localhost:8090/api/metalakes/metalake/catalogs
-```
-
-```java
-GravitinoClient gravitinoClient = GravitinoClient
-    .builder("http://localhost:8090")
-    .withMetalake("metalake")
-    .build();
-
-Map<String, String> ossProperties = ImmutableMap.<String, String>builder()
-    .put("location", "oss://bucket/root")
-    .put("oss-access-key-id", "access_key")
-    .put("oss-secret-access-key", "secret_key")
-    .put("oss-endpoint", "http://oss-cn-hangzhou.aliyuncs.com")
-    .put("filesystem-providers", "oss")
-    .build();
-
-Catalog ossCatalog = gravitinoClient.createCatalog("test_catalog",
-    Type.FILESET,
-    "hadoop", // provider, Gravitino only supports "hadoop" for now.
-    "This is an OSS fileset catalog",
-    ossProperties);
-// ...
-```
-
-```python
-gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake")
-oss_properties = {
-    "location": "oss://bucket/root",
-    "oss-access-key-id": "access_key",
-    "oss-secret-access-key": "secret_key",
-    "oss-endpoint": "http://oss-cn-hangzhou.aliyuncs.com",
-    "filesystem-providers": "oss"
-}
-
-oss_catalog = gravitino_client.create_catalog(name="test_catalog",
-                                              type=Catalog.Type.FILESET,
-                                              provider="hadoop",
-                                              comment="This is an OSS fileset catalog",
-                                              properties=oss_properties)
-```
-
-Then create a schema and a fileset in the catalog created above.
-
-Use the following code to create a schema and a fileset:
-
-```shell
-curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \
--H "Content-Type: application/json" -d '{
-  "name": "test_schema",
-  "comment": "comment",
-  "properties": {
-    "location": "oss://bucket/root/schema"
-  }
-}' http://localhost:8090/api/metalakes/metalake/catalogs/test_catalog/schemas
-```
-
-```java
-Catalog catalog = gravitinoClient.loadCatalog("test_catalog");
-
-SupportsSchemas supportsSchemas = catalog.asSchemas();
-
-Map<String, String> schemaProperties = ImmutableMap.<String, String>builder()
-    .put("location", "oss://bucket/root/schema")
-    .build();
-Schema schema = supportsSchemas.createSchema("test_schema",
-    "This is a schema",
-    schemaProperties
-);
-// ...
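// Illustrative check (not in the original snippet; method names are an
// assumption based on the client API used above): load the schema back.
Schema loaded = supportsSchemas.loadSchema("test_schema");
System.out.println(loaded.name() + ": " + loaded.comment());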
-```
-
-```python
-gravitino_client: GravitinoClient = GravitinoClient(uri="http://127.0.0.1:8090", metalake_name="metalake")
-catalog: Catalog = gravitino_client.load_catalog(name="test_catalog")
-catalog.as_schemas().create_schema(name="test_schema",
-                                   comment="This is a schema",
-                                   properties={"location": "oss://bucket/root/schema"})
-```
-
-```shell
-curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \
--H "Content-Type: application/json" -d '{
-  "name": "example_fileset",
-  "comment": "This is an example fileset",
-  "type": "MANAGED",
-  "storageLocation": "oss://bucket/root/schema/example_fileset",
-  "properties": {
-    "k1": "v1"
-  }
-}' http://localhost:8090/api/metalakes/metalake/catalogs/test_catalog/schemas/test_schema/filesets
-```
-
-```java
-GravitinoClient gravitinoClient = GravitinoClient
-    .builder("http://localhost:8090")
-    .withMetalake("metalake")
-    .build();
-
-Catalog catalog = gravitinoClient.loadCatalog("test_catalog");
-FilesetCatalog filesetCatalog = catalog.asFilesetCatalog();
-
-Map<String, String> propertiesMap = ImmutableMap.<String, String>builder()
-    .put("k1", "v1")
-    .build();
-
-filesetCatalog.createFileset(
-    NameIdentifier.of("test_schema", "example_fileset"),
-    "This is an example fileset",
-    Fileset.Type.MANAGED,
-    "oss://bucket/root/schema/example_fileset",
-    propertiesMap);
-```
-
-```python
-gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake")
-
-catalog: Catalog = gravitino_client.load_catalog(name="test_catalog")
-catalog.as_fileset_catalog().create_fileset(ident=NameIdentifier.of("test_schema", "example_fileset"),
-                                            type=Fileset.Type.MANAGED,
-                                            comment="This is an example fileset",
-                                            storage_location="oss://bucket/root/schema/example_fileset",
-                                            properties={"k1": "v1"})
-```
-
-### Using Spark to access the fileset
-
-The following code snippet shows how to use **PySpark 3.1.3 with a Hadoop environment (Hadoop 3.2.0)** to access the fileset:
-
-```python
-import logging
-from gravitino import NameIdentifier, GravitinoClient, Catalog, Fileset, GravitinoAdminClient
-from pyspark.sql import SparkSession
-import os
-
-gravitino_url = "http://localhost:8090"
-metalake_name = "test"
-
-catalog_name = "your_oss_catalog"
-schema_name = "your_oss_schema"
-fileset_name = "your_oss_fileset"
-
-os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-aliyun-{gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-{gravitino-version}.jar,/path/to/aliyun-sdk-oss-2.8.3.jar,/path/to/hadoop-aliyun-3.2.0.jar,/path/to/jdom-1.1.jar --master local[1] pyspark-shell"
-spark = (
-    SparkSession.builder
-    .appName("oss_fileset_test")
-    .config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs")
-    .config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem")
-    .config("spark.hadoop.fs.gravitino.server.uri", "${GRAVITINO_SERVER_URL}")
-    .config("spark.hadoop.fs.gravitino.client.metalake", "test")
-    .config("spark.hadoop.oss-access-key-id", os.environ["OSS_ACCESS_KEY_ID"])
-    .config("spark.hadoop.oss-secret-access-key", os.environ["OSS_SECRET_ACCESS_KEY"])
-    .config("spark.hadoop.oss-endpoint", "http://oss-cn-hangzhou.aliyuncs.com")
-    .config("spark.driver.memory", "2g")
-    .config("spark.driver.port", "2048")
-    .getOrCreate()
-)
-
-data = [("Alice", 25), ("Bob", 30), ("Cathy", 45)]
-columns = ["Name", "Age"]
-spark_df = spark.createDataFrame(data, schema=columns)
-gvfs_path = f"gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/people"
f"gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/people" - -spark_df.coalesce(1).write -.mode("overwrite") -.option("header", "true") -.csv(gvfs_path) -``` - -If your Spark **without Hadoop environment**, you can use the following code snippet to access the fileset: - -```python -## Replace the following code snippet with the above code snippet with the same environment variables - -os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-aliyun-bundle-{gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-{gravitino-version}.jar, --master local[1] pyspark-shell" -``` - -- [`gravitino-aliyun-bundle-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-aliyun-bundle) is the Gravitino Aliyun jar with Hadoop environment and `hadoop-oss` jar. -- [`gravitino-aliyun-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-aliyun) is a condensed version of the Gravitino Aliyun bundle jar without Hadoop environment and `hadoop-aliyun` jar. - -Please choose the correct jar according to your environment. - -:::note -In some Spark versions, a Hadoop environment is needed by the driver, adding the bundle jars with '--jars' may not work. If this is the case, you should add the jars to the spark CLASSPATH directly. -::: - -### Using Gravitino virtual file system Java client to access the fileset - -```java -Configuration conf = new Configuration(); -conf.set("fs.AbstractFileSystem.gvfs.impl","org.apache.gravitino.filesystem.hadoop.Gvfs"); -conf.set("fs.gvfs.impl","org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem"); -conf.set("fs.gravitino.server.uri","http://localhost:8090"); -conf.set("fs.gravitino.client.metalake","test_metalake"); -conf.set("oss-endpoint", "${GRAVITINO_SERVER_IP:PORT}"); -conf.set("oss-access-key-id", "minio"); -conf.set("oss-secret-access-key", "minio123"); -Path filesetPath = new Path("gvfs://fileset/test_catalog/test_schema/test_fileset/new_dir"); -FileSystem fs = filesetPath.getFileSystem(conf); -fs.mkdirs(filesetPath); -... -``` - -Similar to Spark configurations, you need to add OSS bundle jars to the classpath according to your environment. - -### Accessing a fileset using the Hadoop fs command - -The following are examples of how to use the `hadoop fs` command to access the fileset in Hadoop 3.1.3. - -1. Adding the following contents to the `${HADOOP_HOME}/etc/hadoop/core-site.xml` file: - -```xml - - fs.AbstractFileSystem.gvfs.impl - org.apache.gravitino.filesystem.hadoop.Gvfs - - - - fs.gvfs.impl - org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem - - - - fs.gravitino.server.uri - ${GRAVITINO_SERVER_IP:PORT} - - - - fs.gravitino.client.metalake - test - - - - oss-endpoint - http://oss-cn-hangzhou.aliyuncs.com - - - - oss-access-key-id - access-key - - - - oss-secret-access-key - secret-key - -``` - -2. Copy the necessary jars to the `${HADOOP_HOME}/share/hadoop/common/lib` directory. - -For OSS, you need to copy `gravitino-aliyun-{version}.jar` to the `${HADOOP_HOME}/share/hadoop/common/lib` directory, -then copy hadoop-aliyun-{version}.jar and related dependencies to the `${HADOOP_HOME}/share/hadoop/tools/lib/` directory. Those jars can be found in the `${HADOOP_HOME}/share/hadoop/tools/lib/` directory, you can add all the jars in the `${HADOOP_HOME}/share/hadoop/tools/lib/` directory to the `${HADOOP_HOME}/share/hadoop/common/lib` directory. - - -3. 
-3. Run the following command to access the fileset:
-
-```shell
-hadoop fs -ls gvfs://fileset/oss_catalog/schema/example
-hadoop fs -put /path/to/local/file gvfs://fileset/oss_catalog/schema/example
-```
-
-### Using Gravitino virtual file system Python client to access a fileset
-
-```python
-from gravitino import gvfs
-options = {
-    "cache_size": 20,
-    "cache_expired_time": 3600,
-    "auth_type": "simple",
-    "oss_endpoint": "http://oss-cn-hangzhou.aliyuncs.com",
-    "oss_access_key_id": "access_key",
-    "oss_secret_access_key": "secret_key"
-}
-fs = gvfs.GravitinoVirtualFileSystem(server_uri="${GRAVITINO_SERVER_IP:PORT}", metalake_name="test_metalake", options=options)
-
-fs.ls("gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/")
-```
-
-### Using fileset with pandas
-
-The following is an example of how to use the pandas library to access the OSS fileset:
-
-```python
-import pandas as pd
-
-storage_options = {
-    "server_uri": "${GRAVITINO_SERVER_IP:PORT}",
-    "metalake_name": "test",
-    "options": {
-        "oss_access_key_id": "access_key",
-        "oss_secret_access_key": "secret_key",
-        "oss_endpoint": "http://oss-cn-hangzhou.aliyuncs.com"
-    }
-}
-ds = pd.read_csv(f"gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/people/part-00000-51d366e2-d5eb-448d-9109-32a96c8a14dc-c000.csv",
-                 storage_options=storage_options)
-ds.head()
-```
-
-For other use cases, please refer to the [Gravitino Virtual File System](./how-to-use-gvfs.md) document.
-
-## Fileset with credential
-
-Since 0.8.0-incubating, Gravitino supports credential vending for OSS filesets. If the catalog has been configured with credentials, you can access OSS filesets without providing authentication information like `oss-access-key-id` and `oss-secret-access-key` in the properties.
-
-### How to create an OSS Hadoop catalog with credential enabled
-
-Apart from the configuration method in [create-oss-hadoop-catalog](#catalog-a-catalog), the properties needed by [oss-credential](./security/credential-vending.md#oss-credentials) should also be set to enable credential vending for OSS filesets.
-
-### How to access OSS fileset with credential
-
-If the catalog has been configured with credentials, you can access OSS filesets without providing authentication information via GVFS. Let's see how to access an OSS fileset with credentials:
-
-GVFS Java client:
-
-```java
-Configuration conf = new Configuration();
-conf.set("fs.AbstractFileSystem.gvfs.impl","org.apache.gravitino.filesystem.hadoop.Gvfs");
-conf.set("fs.gvfs.impl","org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem");
-conf.set("fs.gravitino.server.uri","${GRAVITINO_SERVER_IP:PORT}");
-conf.set("fs.gravitino.client.metalake","test_metalake");
-// No need to set oss-access-key-id and oss-secret-access-key
-Path filesetPath = new Path("gvfs://fileset/oss_test_catalog/test_schema/test_fileset/new_dir");
-FileSystem fs = filesetPath.getFileSystem(conf);
-fs.mkdirs(filesetPath);
-...
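// Hedged continuation (not in the original doc): exercise the vended OSS
// credential with a write and a listing. The file name is hypothetical.
Path filePath = new Path(filesetPath, "hello.txt");
try (FSDataOutputStream out = fs.create(filePath, true)) {
  out.writeUTF("hello from gvfs");
}
for (FileStatus status : fs.listStatus(filesetPath)) {
  System.out.println(status.getPath());
}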
-```
-
-Spark:
-
-```python
-spark = (
-    SparkSession.builder
-    .appName("oss_fileset_test")
-    .config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs")
-    .config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem")
-    .config("spark.hadoop.fs.gravitino.server.uri", "${GRAVITINO_SERVER_IP:PORT}")
-    .config("spark.hadoop.fs.gravitino.client.metalake", "test")
-    # No need to set oss-access-key-id and oss-secret-access-key
-    .config("spark.driver.memory", "2g")
-    .config("spark.driver.port", "2048")
-    .getOrCreate()
-)
-```
-
-Python client and Hadoop command are similar to the above examples.
-
diff --git a/docs/hadoop-catalog-with-s3.md b/docs/hadoop-catalog-with-s3.md
deleted file mode 100644
index de3c3e7fc3d..00000000000
--- a/docs/hadoop-catalog-with-s3.md
+++ /dev/null
@@ -1,451 +0,0 @@
----
-title: "Hadoop catalog with S3"
-slug: /hadoop-catalog-with-s3
-date: 2025-01-03
-keyword: Hadoop catalog S3
-license: "This software is licensed under the Apache License version 2."
----
-
-This document describes how to configure a Hadoop catalog with S3.
-
-## Prerequisites
-
-In order to create a Hadoop catalog with S3, you need to place [`gravitino-aws-bundle-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-aws-bundle) in the Gravitino Hadoop classpath, located
-at `${HADOOP_HOME}/share/hadoop/common/lib/`. After that, start the Gravitino server with the following command:
-
-```bash
-$ bin/gravitino-server.sh start
-```
-
-## Create a Hadoop Catalog with S3
-
-### Create an S3 Hadoop catalog
-
-Apart from the configuration method in [Hadoop-catalog-catalog-configuration](./hadoop-catalog.md#catalog-properties), the following properties are required to configure a Hadoop catalog with S3:
-
-| Configuration item            | Description                                                                                                                                                                                                                       | Default value   | Required                   | Since version    |
-|-------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------|----------------------------|------------------|
-| `filesystem-providers`        | The file system providers to add. Set it to `s3` if it's an S3 fileset, or a comma-separated string that contains `s3` like `gs,s3` to support multiple kinds of fileset including `s3`.                                          | (none)          | Yes                        | 0.7.0-incubating |
-| `default-filesystem-provider` | The default filesystem provider of this Hadoop catalog, used if users do not specify the scheme in the URI. The default value is `builtin-local`; for S3, if we set this value, we can omit the prefix 's3a://' in the location.  | `builtin-local` | No                         | 0.7.0-incubating |
-| `s3-endpoint`                 | The endpoint of the AWS S3.                                                                                                                                                                                                         | (none)          | Yes if it's an S3 fileset. | 0.7.0-incubating |
-| `s3-access-key-id`            | The access key of the AWS S3.                                                                                                                                                                                                       | (none)          | Yes if it's an S3 fileset. | 0.7.0-incubating |
-| `s3-secret-access-key`        | The secret key of the AWS S3.                                                                                                                                                                                                       | (none)          | Yes if it's an S3 fileset. | 0.7.0-incubating |
-
-### Create a schema
-
-Refer to [Schema operation](./manage-fileset-metadata-using-gravitino.md#schema-operations) for more details.
-
-### Create a fileset
-
-Refer to [Fileset operation](./manage-fileset-metadata-using-gravitino.md#fileset-operations) for more details.
-
-
-## Using Hadoop catalog with S3
-
-The rest of this document shows how to use the Hadoop catalog with S3 in Gravitino with a full example.
-
-### Create a Hadoop catalog/schema/fileset with S3
-
-First, you need to create a Hadoop catalog with S3. The following example shows how to do so:
-
-```shell
-curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \
--H "Content-Type: application/json" -d '{
-  "name": "test_catalog",
-  "type": "FILESET",
-  "comment": "comment",
-  "provider": "hadoop",
-  "properties": {
-    "location": "s3a://bucket/root",
-    "s3-access-key-id": "access_key",
-    "s3-secret-access-key": "secret_key",
-    "s3-endpoint": "http://s3.ap-northeast-1.amazonaws.com",
-    "filesystem-providers": "s3"
-  }
-}' http://localhost:8090/api/metalakes/metalake/catalogs
-```
-
-```java
-GravitinoClient gravitinoClient = GravitinoClient
-    .builder("http://localhost:8090")
-    .withMetalake("metalake")
-    .build();
-
-Map<String, String> s3Properties = ImmutableMap.<String, String>builder()
-    .put("location", "s3a://bucket/root")
-    .put("s3-access-key-id", "access_key")
-    .put("s3-secret-access-key", "secret_key")
-    .put("s3-endpoint", "http://s3.ap-northeast-1.amazonaws.com")
-    .put("filesystem-providers", "s3")
-    .build();
-
-Catalog s3Catalog = gravitinoClient.createCatalog("test_catalog",
-    Type.FILESET,
-    "hadoop", // provider, Gravitino only supports "hadoop" for now.
-    "This is an S3 fileset catalog",
-    s3Properties);
-// ...
-```
-
-```python
-gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake")
-s3_properties = {
-    "location": "s3a://bucket/root",
-    "s3-access-key-id": "access_key",
-    "s3-secret-access-key": "secret_key",
-    "s3-endpoint": "http://s3.ap-northeast-1.amazonaws.com",
-    "filesystem-providers": "s3"
-}
-
-s3_catalog = gravitino_client.create_catalog(name="test_catalog",
-                                             type=Catalog.Type.FILESET,
-                                             provider="hadoop",
-                                             comment="This is an S3 fileset catalog",
-                                             properties=s3_properties)
-```
-
-:::note
-The value of location should always start with `s3a`, NOT `s3`, for AWS S3, for instance, `s3a://bucket/root`. A value like `s3://bucket/root` is not supported due to the limitation of the hadoop-aws library.
-:::
-
-Then create a schema and a fileset in the catalog created above.
-
-Use the following code to create a schema and a fileset:
-
-```shell
-curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \
--H "Content-Type: application/json" -d '{
-  "name": "test_schema",
-  "comment": "comment",
-  "properties": {
-    "location": "s3a://bucket/root/schema"
-  }
-}' http://localhost:8090/api/metalakes/metalake/catalogs/test_catalog/schemas
-```
-
-```java
-// Assuming you have just created a Hadoop catalog named `test_catalog`
-Catalog catalog = gravitinoClient.loadCatalog("test_catalog");
-
-SupportsSchemas supportsSchemas = catalog.asSchemas();
-
-Map<String, String> schemaProperties = ImmutableMap.<String, String>builder()
-    .put("location", "s3a://bucket/root/schema")
-    .build();
-Schema schema = supportsSchemas.createSchema("test_schema",
-    "This is a schema",
-    schemaProperties
-);
-// ...
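// Illustrative check (not in the original snippet; method names are an
// assumption based on the client API used above): load the schema back.
Schema loaded = supportsSchemas.loadSchema("test_schema");
System.out.println(loaded.name() + ": " + loaded.comment());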
-```
-
-```python
-gravitino_client: GravitinoClient = GravitinoClient(uri="http://127.0.0.1:8090", metalake_name="metalake")
-catalog: Catalog = gravitino_client.load_catalog(name="test_catalog")
-catalog.as_schemas().create_schema(name="test_schema",
-                                   comment="This is a schema",
-                                   properties={"location": "s3a://bucket/root/schema"})
-```
-
-```shell
-curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \
--H "Content-Type: application/json" -d '{
-  "name": "example_fileset",
-  "comment": "This is an example fileset",
-  "type": "MANAGED",
-  "storageLocation": "s3a://bucket/root/schema/example_fileset",
-  "properties": {
-    "k1": "v1"
-  }
-}' http://localhost:8090/api/metalakes/metalake/catalogs/test_catalog/schemas/test_schema/filesets
-```
-
-```java
-GravitinoClient gravitinoClient = GravitinoClient
-    .builder("http://localhost:8090")
-    .withMetalake("metalake")
-    .build();
-
-Catalog catalog = gravitinoClient.loadCatalog("test_catalog");
-FilesetCatalog filesetCatalog = catalog.asFilesetCatalog();
-
-Map<String, String> propertiesMap = ImmutableMap.<String, String>builder()
-    .put("k1", "v1")
-    .build();
-
-filesetCatalog.createFileset(
-    NameIdentifier.of("test_schema", "example_fileset"),
-    "This is an example fileset",
-    Fileset.Type.MANAGED,
-    "s3a://bucket/root/schema/example_fileset",
-    propertiesMap);
-```
-
-```python
-gravitino_client: GravitinoClient = GravitinoClient(uri="http://localhost:8090", metalake_name="metalake")
-
-catalog: Catalog = gravitino_client.load_catalog(name="test_catalog")
-catalog.as_fileset_catalog().create_fileset(ident=NameIdentifier.of("test_schema", "example_fileset"),
-                                            type=Fileset.Type.MANAGED,
-                                            comment="This is an example fileset",
-                                            storage_location="s3a://bucket/root/schema/example_fileset",
-                                            properties={"k1": "v1"})
-```
-
-### Using Spark to access the fileset
-
-The following code snippet shows how to use **PySpark 3.1.3 with a Hadoop environment (Hadoop 3.2.0)** to access the fileset:
-
-```python
-import logging
-from gravitino import NameIdentifier, GravitinoClient, Catalog, Fileset, GravitinoAdminClient
-from pyspark.sql import SparkSession
-import os
-
-gravitino_url = "http://localhost:8090"
-metalake_name = "test"
-
-catalog_name = "your_s3_catalog"
-schema_name = "your_s3_schema"
-fileset_name = "your_s3_fileset"
-
-os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-aws-${gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-${gravitino-version}-SNAPSHOT.jar,/path/to/hadoop-aws-3.2.0.jar,/path/to/aws-java-sdk-bundle-1.11.375.jar --master local[1] pyspark-shell"
-spark = (
-    SparkSession.builder
-    .appName("s3_fileset_test")
-    .config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs")
-    .config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem")
-    .config("spark.hadoop.fs.gravitino.server.uri", "${GRAVITINO_SERVER_URL}")
-    .config("spark.hadoop.fs.gravitino.client.metalake", "test")
-    .config("spark.hadoop.s3-access-key-id", os.environ["S3_ACCESS_KEY_ID"])
-    .config("spark.hadoop.s3-secret-access-key", os.environ["S3_SECRET_ACCESS_KEY"])
-    .config("spark.hadoop.s3-endpoint", "http://s3.ap-northeast-1.amazonaws.com")
-    .config("spark.driver.memory", "2g")
-    .config("spark.driver.port", "2048")
-    .getOrCreate()
-)
-
-data = [("Alice", 25), ("Bob", 30), ("Cathy", 45)]
-columns = ["Name", "Age"]
-spark_df = spark.createDataFrame(data, schema=columns)
-gvfs_path = f"gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/people"
f"gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/people" - -spark_df.coalesce(1).write -.mode("overwrite") -.option("header", "true") -.csv(gvfs_path) -``` - -If your Spark **without Hadoop environment**, you can use the following code snippet to access the fileset: - -```python -## Replace the following code snippet with the above code snippet with the same environment variables -os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /path/to/gravitino-aws-${gravitino-version}.jar,/path/to/gravitino-filesystem-hadoop3-runtime-${gravitino-version}-SNAPSHOT.jar,/path/to/hadoop-aws-3.2.0.jar,/path/to/aws-java-sdk-bundle-1.11.375.jar --master local[1] pyspark-shell" -``` - -- [`gravitino-aws-bundle-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-aws-bundle) is the Gravitino AWS jar with Hadoop environment and `hadoop-aws` jar. -- [`gravitino-aws-${gravitino-version}.jar`](https://mvnrepository.com/artifact/org.apache.gravitino/gravitino-aws) is a condensed version of the Gravitino AWS bundle jar without Hadoop environment and `hadoop-aws` jar. - -Please choose the correct jar according to your environment. - -:::note -In some Spark versions, a Hadoop environment is needed by the driver, adding the bundle jars with '--jars' may not work. If this is the case, you should add the jars to the spark CLASSPATH directly. -::: - -### Using Gravitino virtual file system Java client to access the fileset - -```java -Configuration conf = new Configuration(); -conf.set("fs.AbstractFileSystem.gvfs.impl","org.apache.gravitino.filesystem.hadoop.Gvfs"); -conf.set("fs.gvfs.impl","org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem"); -conf.set("fs.gravitino.server.uri","${GRAVITINO_SERVER_IP:PORT}"); -conf.set("fs.gravitino.client.metalake","test_metalake"); - -conf.set("s3-endpoint", "${GRAVITINO_SERVER_IP:PORT}"); -conf.set("s3-access-key-id", "minio"); -conf.set("s3-secret-access-key", "minio123"); - -Path filesetPath = new Path("gvfs://fileset/test_catalog/test_schema/test_fileset/new_dir"); -FileSystem fs = filesetPath.getFileSystem(conf); -fs.mkdirs(filesetPath); -... -``` - -Similar to Spark configurations, you need to add S3 bundle jars to the classpath according to your environment. - -### Accessing a fileset using the Hadoop fs command - -The following are examples of how to use the `hadoop fs` command to access the fileset in Hadoop 3.1.3. - -1. Adding the following contents to the `${HADOOP_HOME}/etc/hadoop/core-site.xml` file: - -```xml - - fs.AbstractFileSystem.gvfs.impl - org.apache.gravitino.filesystem.hadoop.Gvfs - - - - fs.gvfs.impl - org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem - - - - fs.gravitino.server.uri - ${GRAVITINO_SERVER_IP:PORT} - - - - fs.gravitino.client.metalake - test - - - - s3-endpoint - http://s3.ap-northeast-1.amazonaws.com - - - - s3-access-key-id - access-key - - - - s3-secret-access-key - secret-key - -``` - -2. Copy the necessary jars to the `${HADOOP_HOME}/share/hadoop/common/lib` directory. - -For S3, you need to copy `gravitino-aws-{version}.jar` to the `${HADOOP_HOME}/share/hadoop/common/lib` directoryl, -then copy hadoop-aws-{version}.jar and related dependencies to the `${HADOOP_HOME}/share/hadoop/tools/lib/` directory. Those jars can be found in the `${HADOOP_HOME}/share/hadoop/tools/lib/` directory, you can add all the jars in the `${HADOOP_HOME}/share/hadoop/tools/lib/` directory to the `${HADOOP_HOME}/share/hadoop/common/lib` directory. - - -3. 
-3. Run the following command to access the fileset:
-
-```shell
-hadoop fs -ls gvfs://fileset/s3_catalog/schema/example
-hadoop fs -put /path/to/local/file gvfs://fileset/s3_catalog/schema/example
-```
-
-### Using the Gravitino virtual file system Python client to access a fileset
-
-```python
-from gravitino import gvfs
-options = {
-    "cache_size": 20,
-    "cache_expired_time": 3600,
-    "auth_type": "simple",
-    "s3_endpoint": "${S3_ENDPOINT}",
-    "s3_access_key_id": "minio",
-    "s3_secret_access_key": "minio123"
-}
-fs = gvfs.GravitinoVirtualFileSystem(server_uri="http://localhost:8090", metalake_name="test_metalake", options=options)
-fs.ls("gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/")
-```
-
-### Using fileset with pandas
-
-The following is an example of how to use the pandas library to access the S3 fileset:
-
-```python
-import pandas as pd
-
-storage_options = {
-    "server_uri": "http://localhost:8090",
-    "metalake_name": "test",
-    "options": {
-        "s3_access_key_id": "access_key",
-        "s3_secret_access_key": "secret_key",
-        "s3_endpoint": "http://s3.ap-northeast-1.amazonaws.com"
-    }
-}
-ds = pd.read_csv(f"gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/people/part-00000-51d366e2-d5eb-448d-9109-32a96c8a14dc-c000.csv",
-                 storage_options=storage_options)
-ds.head()
-```
-
-For other use cases, please refer to the [Gravitino Virtual File System](./how-to-use-gvfs.md) document.
-
-## Fileset with credential
-
-Since 0.8.0-incubating, Gravitino supports credential vending for S3 filesets. If the catalog has been configured with credentials, you can access S3 filesets without providing authentication information like `s3-access-key-id` and `s3-secret-access-key` in the properties.
-
-### How to create an S3 Hadoop catalog with credential enabled
-
-Apart from the configuration method in [create-s3-hadoop-catalog](#catalog-a-catalog), the properties needed by [s3-credential](./security/credential-vending.md#s3-credentials) should also be set to enable credential vending for S3 filesets.
-
-### How to access S3 fileset with credential
-
-If the catalog has been configured with credentials, you can access S3 filesets without providing authentication information via GVFS. Let's see how to access an S3 fileset with credentials:
-
-GVFS Java client:
-
-```java
-Configuration conf = new Configuration();
-conf.set("fs.AbstractFileSystem.gvfs.impl","org.apache.gravitino.filesystem.hadoop.Gvfs");
-conf.set("fs.gvfs.impl","org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem");
-conf.set("fs.gravitino.server.uri","http://localhost:8090");
-conf.set("fs.gravitino.client.metalake","test_metalake");
-// No need to set s3-access-key-id and s3-secret-access-key
-Path filesetPath = new Path("gvfs://fileset/test_catalog/test_schema/test_fileset/new_dir");
-FileSystem fs = filesetPath.getFileSystem(conf);
-fs.mkdirs(filesetPath);
-...
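// Hedged continuation (not in the original doc): exercise the vended S3
// credential with a write and a listing. The file name is hypothetical.
Path filePath = new Path(filesetPath, "hello.txt");
try (FSDataOutputStream out = fs.create(filePath, true)) {
  out.writeUTF("hello from gvfs");
}
for (FileStatus status : fs.listStatus(filesetPath)) {
  System.out.println(status.getPath());
}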
-```
-
-Spark:
-
-```python
-spark = (
-    SparkSession.builder
-    .appName("s3_fileset_test")
-    .config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs")
-    .config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem")
-    .config("spark.hadoop.fs.gravitino.server.uri", "http://localhost:8090")
-    .config("spark.hadoop.fs.gravitino.client.metalake", "test")
-    # No need to set s3-access-key-id and s3-secret-access-key
-    .config("spark.driver.memory", "2g")
-    .config("spark.driver.port", "2048")
-    .getOrCreate()
-)
-```
-
-Python client and Hadoop command are similar to the above examples.
-
-
diff --git a/docs/hadoop-catalog.md b/docs/hadoop-catalog.md
index 1f9efebe0bd..99e1dd7854e 100644
--- a/docs/hadoop-catalog.md
+++ b/docs/hadoop-catalog.md
@@ -9,9 +9,9 @@ license: "This software is licensed under the Apache License version 2."
 ## Introduction
 
 Hadoop catalog is a fileset catalog that using Hadoop Compatible File System (HCFS) to manage
-the storage location of the fileset. Currently, it supports the local filesystem and HDFS. Since 0.7.0-incubating, Gravitino supports [S3](hadoop-catalog-with-S3.md), [GCS](hadoop-catalog-with-gcs.md), [OSS](hadoop-catalog-with-oss.md) and [Azure Blob Storage](hadoop-catalog-with-adls.md) through Hadoop catalog.
-
-The rest of this document will use HDFS or local file as an example to illustrate how to use the Hadoop catalog. For S3, GCS, OSS and Azure Blob Storage, the configuration is similar to HDFS, please refer to the corresponding document for more details.
+the storage location of the fileset. Currently, it supports local filesystem and HDFS. For
+object storage like S3, GCS, Azure Blob Storage and OSS, you can put the hadoop object store jar like
+`gravitino-aws-bundle-{gravitino-version}.jar` into the `$GRAVITINO_HOME/catalogs/hadoop/libs` directory to enable the support.
 
 Note that Gravitino uses Hadoop 3 dependencies to build Hadoop catalog. Theoretically, it
 should be compatible with both Hadoop 2.x and 3.x, since Gravitino doesn't leverage any new features in
@@ -57,6 +57,8 @@ Please refer to [S3 credentials](./security/credential-vending.md#s3-credentials
 At the same time, you need to place the corresponding bundle jar [`gravitino-aws-bundle-${version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-aws-bundle/)
 in the directory `${GRAVITINO_HOME}/catalogs/hadoop/libs`.
 
+#### GCS fileset
+
 | Configuration item            | Description                                                                                                                                                                            | Default value   | Required                   | Since version    |
 |-------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------|----------------------------|------------------|
 | `filesystem-providers`        | The file system providers to add. Set it to `gs` if it's a GCS fileset, a comma separated string that contains `gs` like `gs,s3` to support multiple kinds of fileset including `gs`. | (none)          | Yes                        | 0.7.0-incubating |
@@ -81,6 +83,7 @@ Please refer to [OSS credentials](./security/credential-vending.md#oss-credentia
 In the meantime, you need to place the corresponding bundle jar [`gravitino-aliyun-bundle-${version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-aliyun-bundle/)
 in the directory `${GRAVITINO_HOME}/catalogs/hadoop/libs`.
+ #### Azure Blob Storage fileset | Configuration item | Description | Default value | Required | Since version | diff --git a/docs/how-to-use-gvfs.md b/docs/how-to-use-gvfs.md index d32ad3da672..31ede3a5374 100644 --- a/docs/how-to-use-gvfs.md +++ b/docs/how-to-use-gvfs.md @@ -42,10 +42,7 @@ the path mapping and convert automatically. ### Prerequisites -+ A Hadoop environment with HDFS running. GVFS has been tested against - Hadoop 3.3.1. It is recommended to use Hadoop 3.3.1 or later, but it should work with Hadoop 2. - x. Please create an [issue](https://www.github.com/apache/gravitino/issues) if you find any - compatibility issues. ++ A Hadoop environment with HDFS or other Hadoop Compatible File System (HCFS) implementations like S3, GCS, etc. GVFS has been tested against Hadoop 3.3.1. It is recommended to use Hadoop 3.3.1 or later, but it should work with Hadoop 2.x. Please create an [issue](https://www.github.com/apache/gravitino/issues) if you find any compatibility issues. ### Configuration @@ -71,51 +68,51 @@ Apart from the above properties, to access fileset like S3, GCS, OSS and custom #### S3 fileset -| Configuration item | Description | Default value | Required | Since version | -|------------------------|-------------------------------|---------------|--------------------------|------------------| -| `s3-endpoint` | The endpoint of the AWS S3. | (none) | Yes if it's a S3 fileset.| 0.7.0-incubating | -| `s3-access-key-id` | The access key of the AWS S3. | (none) | Yes if it's a S3 fileset.| 0.7.0-incubating | -| `s3-secret-access-key` | The secret key of the AWS S3. | (none) | Yes if it's a S3 fileset.| 0.7.0-incubating | +| Configuration item | Description | Default value | Required | Since version | +|------------------------|-------------------------------|---------------|---------------------------|------------------| +| `s3-endpoint` | The endpoint of the AWS S3. | (none) | Yes if it's a S3 fileset. | 0.7.0-incubating | +| `s3-access-key-id` | The access key of the AWS S3. | (none) | Yes if it's a S3 fileset. | 0.7.0-incubating | +| `s3-secret-access-key` | The secret key of the AWS S3. | (none) | Yes if it's a S3 fileset. | 0.7.0-incubating | At the same time, you need to add the corresponding bundle jar -1. [`gravitino-aws-bundle-${gravitino-version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-aws-bundle/) in the classpath if no Hadoop environment is available, or -2. [`gravitino-aws-${gravitino-version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-aws/) and `hadoop-aws-${hadoop-version}.jar` and other necessary dependencies (They are usually located at `${HADOOP_HOME}/share/hadoop/tools/lib`) in the classpath. +1. [`gravitino-aws-bundle-${version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-aws-bundle/) in the classpath if no hadoop environment is available, or +2. [`gravitino-aws-${version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-aws/) and hadoop-aws jar and other necessary dependencies in the classpath. #### GCS fileset -| Configuration item | Description | Default value | Required | Since version | -|----------------------------|--------------------------------------------|---------------|---------------------------|------------------| -| `gcs-service-account-file` | The path of GCS service account JSON file. 
| (none) | Yes if it's a GCS fileset.| 0.7.0-incubating |
+| Configuration item         | Description                                 | Default value | Required                   | Since version    |
+|----------------------------|---------------------------------------------|---------------|----------------------------|------------------|
+| `gcs-service-account-file` | The path of GCS service account JSON file.  | (none)        | Yes if it's a GCS fileset. | 0.7.0-incubating |
 
 In the meantime, you need to add the corresponding bundle jar
-1. [`gravitino-gcp-bundle-${gravitino-version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-gcp-bundle/) in the classpath if no hadoop environment is available, or
-2. [`gravitino-gcp-${gravitino-version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-gcp/) and [gcs-connector jar](https://github.com/GoogleCloudDataproc/hadoop-connectors/releases) and other necessary dependencies in the classpath.
+1. [`gravitino-gcp-bundle-${version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-gcp-bundle/) in the classpath if no Hadoop environment is available, or
+2. [`gravitino-gcp-${version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-gcp/) and the [gcs-connector jar](https://github.com/GoogleCloudDataproc/hadoop-connectors/releases) and other necessary dependencies in the classpath.
 
 #### OSS fileset
 
-| Configuration item      | Description                       | Default value | Required                  | Since version    |
-|-------------------------|-----------------------------------|---------------|---------------------------|------------------|
-| `oss-endpoint`          | The endpoint of the Aliyun OSS.   | (none)        | Yes if it's a OSS fileset.| 0.7.0-incubating |
-| `oss-access-key-id`     | The access key of the Aliyun OSS. | (none)        | Yes if it's a OSS fileset.| 0.7.0-incubating |
-| `oss-secret-access-key` | The secret key of the Aliyun OSS. | (none)        | Yes if it's a OSS fileset.| 0.7.0-incubating |
+| Configuration item      | Description                       | Default value | Required                    | Since version    |
+|-------------------------|-----------------------------------|---------------|-----------------------------|------------------|
+| `oss-endpoint`          | The endpoint of the Aliyun OSS.   | (none)        | Yes if it's an OSS fileset. | 0.7.0-incubating |
+| `oss-access-key-id`     | The access key of the Aliyun OSS. | (none)        | Yes if it's an OSS fileset. | 0.7.0-incubating |
+| `oss-secret-access-key` | The secret key of the Aliyun OSS. | (none)        | Yes if it's an OSS fileset. | 0.7.0-incubating |
 
 In the meantime, you need to place the corresponding bundle jar
-1. [`gravitino-aliyun-bundle-${gravitino-version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-aliyun-bundle/) in the classpath if no hadoop environment is available, or
-2. [`gravitino-aliyun-${gravitino-version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-aliyun/) and `hadoop-aliyun-${hadoop-version}.jar` and other necessary dependencies (They are usually located at `${HADOOP_HOME}/share/hadoop/tools/lib`) in the classpath.
+1. [`gravitino-aliyun-bundle-${version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-aliyun-bundle/) in the classpath if no Hadoop environment is available, or
+2. [`gravitino-aliyun-${version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-aliyun/) and the hadoop-aliyun jar and other necessary dependencies in the classpath.
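For instance, a minimal Hadoop `Configuration` for accessing an OSS fileset over GVFS would combine the GVFS properties with the OSS items above; this is a hedged sketch where the endpoint and keys are placeholders:

```java
Configuration conf = new Configuration();
conf.set("fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs");
conf.set("fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem");
conf.set("fs.gravitino.server.uri", "http://localhost:8090");
conf.set("fs.gravitino.client.metalake", "test_metalake");
// OSS-specific items from the table above; values are placeholders.
conf.set("oss-endpoint", "http://oss-cn-hangzhou.aliyuncs.com");
conf.set("oss-access-key-id", "access_key");
conf.set("oss-secret-access-key", "secret_key");
```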
 #### Azure Blob Storage fileset
 
-| Configuration item           | Description                              | Default value | Required                                  | Since version    |
-|------------------------------|------------------------------------------|---------------|-------------------------------------------|------------------|
-| `azure-storage-account-name` | The account name of Azure Blob Storage.  | (none)        | Yes if it's a Azure Blob Storage fileset. | 0.8.0-incubating |
-| `azure-storage-account-key`  | The account key of Azure Blob Storage.   | (none)        | Yes if it's a Azure Blob Storage fileset. | 0.8.0-incubating |
+| Configuration item           | Description                              | Default value | Required                                   | Since version    |
+|------------------------------|------------------------------------------|---------------|--------------------------------------------|------------------|
+| `azure-storage-account-name` | The account name of Azure Blob Storage.  | (none)        | Yes if it's an Azure Blob Storage fileset. | 0.8.0-incubating |
+| `azure-storage-account-key`  | The account key of Azure Blob Storage.   | (none)        | Yes if it's an Azure Blob Storage fileset. | 0.8.0-incubating |
 
 Similar to the above, you need to place the corresponding bundle jar
-1. [`gravitino-azure-bundle-${gravitino-version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-azure-bundle/) in the classpath if no hadoop environment is available, or
-2. [`gravitino-azure-${gravitino-version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-azure/) and `hadoop-azure-${hadoop-version}.jar` and other necessary dependencies (They are usually located at `${HADOOP_HOME}/share/hadoop/tools/lib`) in the classpath.
+1. [`gravitino-azure-bundle-${version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-azure-bundle/) in the classpath if no Hadoop environment is available, or
+2. [`gravitino-azure-${version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-azure/) and the hadoop-azure jar and other necessary dependencies in the classpath.
 
 #### Custom fileset
 Since 0.7.0-incubating, users can define their own fileset type and configure the corresponding properties, for more, please refer to [Custom Fileset](./hadoop-catalog.md#how-to-custom-your-own-hcfs-file-system-fileset).
@@ -146,7 +143,13 @@ You can configure these properties in two ways:
 ```
 
 :::note
-If you want to access the S3, GCS, OSS or custom fileset through GVFS, apart from the above properties, you need to place the corresponding bundle jars in the Hadoop environment.
+If you want to access the S3, GCS, OSS or custom fileset through GVFS, apart from the above properties, you need to place the corresponding bundle jars in the Hadoop environment.
+For example, if you want to access the S3 fileset, you need to place
+1. the AWS Hadoop bundle jar [`gravitino-aws-bundle-${gravitino-version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-aws-bundle/), or
+2. [`gravitino-aws-${gravitino-version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/gravitino-aws/), plus the hadoop-aws jar and other necessary dependencies
+
+in the classpath; they are typically located in `${HADOOP_HOME}/share/hadoop/common/lib/`.
+
 :::
 
 2.
Configure the properties in the `core-site.xml` file of the Hadoop environment: @@ -220,6 +223,7 @@ cp gravitino-filesystem-hadoop3-runtime-{version}.jar ${HADOOP_HOME}/share/hadoo # You need to ensure that the Kerberos has permission on the HDFS directory. kinit -kt your_kerberos.keytab your_kerberos@xxx.com + # 4. Copy other dependencies to the Hadoop environment if you want to access the S3 fileset via GVFS cp bundles/aws-bundle/build/libs/gravitino-aws-bundle-{version}.jar ${HADOOP_HOME}/share/hadoop/common/lib/ cp clients/filesystem-hadoop3-runtime/build/libs/gravitino-filesystem-hadoop3-runtime-{version}-SNAPSHOT.jar ${HADOOP_HOME}/share/hadoop/common/lib/ @@ -317,6 +321,7 @@ fs.getFileStatus(filesetPath); rdd.foreach(println) ``` + #### Via Tensorflow For Tensorflow to support GVFS, you need to recompile the [tensorflow-io](https://github.com/tensorflow/io) module. @@ -511,7 +516,6 @@ options = { fs = gvfs.GravitinoVirtualFileSystem(server_uri="http://localhost:8090", metalake_name="test_metalake", options=options) ``` - :::note Gravitino python client does not support [customized file systems](hadoop-catalog.md#how-to-custom-your-own-hcfs-file-system-fileset) defined by users due to the limit of `fsspec` library. From a285deefe3ad2e441f92877996e55328b2c5145e Mon Sep 17 00:00:00 2001 From: yuqi Date: Wed, 8 Jan 2025 15:50:05 +0800 Subject: [PATCH 46/59] Polish the code. --- .../oss/fs/OSSCredentialsProvider.java | 17 ++------------ .../s3/fs/S3CredentialsProvider.java | 17 ++------------ .../abs/fs/AzureSasCredentialsProvider.java | 2 ++ .../gcs/fs/GCSCredentialsProvider.java | 18 +++------------ .../catalog/hadoop/fs/FileSystemUtils.java | 23 +++++++++++++++++++ .../hadoop/GravitinoVirtualFileSystem.java | 1 + 6 files changed, 33 insertions(+), 45 deletions(-) diff --git a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialsProvider.java b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialsProvider.java index e16a393558b..1a980c76c03 100644 --- a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialsProvider.java +++ b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialsProvider.java @@ -24,6 +24,7 @@ import com.aliyun.oss.common.auth.CredentialsProvider; import com.aliyun.oss.common.auth.DefaultCredentials; import java.net.URI; +import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; import org.apache.gravitino.catalog.hadoop.fs.GravitinoFileSystemCredentialProvider; import org.apache.gravitino.credential.Credential; import org.apache.gravitino.credential.OSSSecretKeyCredential; @@ -38,21 +39,7 @@ public class OSSCredentialsProvider implements CredentialsProvider { private static final double EXPIRATION_TIME_FACTOR = 0.9D; public OSSCredentialsProvider(URI uri, Configuration conf) { - initGvfsCredentialProvider(conf); - } - - private void initGvfsCredentialProvider(Configuration conf) { - try { - gravitinoFileSystemCredentialProvider = - (GravitinoFileSystemCredentialProvider) - Class.forName( - conf.get(GravitinoFileSystemCredentialProvider.GVFS_CREDENTIAL_PROVIDER)) - .getDeclaredConstructor() - .newInstance(); - gravitinoFileSystemCredentialProvider.setConf(conf); - } catch (Exception e) { - throw new RuntimeException("Failed to create GravitinoFileSystemCredentialProvider", e); - } + this.gravitinoFileSystemCredentialProvider = FileSystemUtils.getGvfsCredentialProvider(conf); } @Override diff --git a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialsProvider.java 
b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialsProvider.java index 10ea1d1b6d0..854152d30d0 100644 --- a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialsProvider.java +++ b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialsProvider.java @@ -24,6 +24,7 @@ import com.amazonaws.auth.BasicAWSCredentials; import com.amazonaws.auth.BasicSessionCredentials; import java.net.URI; +import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; import org.apache.gravitino.catalog.hadoop.fs.GravitinoFileSystemCredentialProvider; import org.apache.gravitino.credential.Credential; import org.apache.gravitino.credential.S3SecretKeyCredential; @@ -38,21 +39,7 @@ public class S3CredentialsProvider implements AWSCredentialsProvider { private static final double EXPIRATION_TIME_FACTOR = 0.9D; public S3CredentialsProvider(final URI uri, final Configuration conf) { - initGvfsCredentialProvider(conf); - } - - private void initGvfsCredentialProvider(Configuration conf) { - try { - gravitinoFileSystemCredentialProvider = - (GravitinoFileSystemCredentialProvider) - Class.forName( - conf.get(GravitinoFileSystemCredentialProvider.GVFS_CREDENTIAL_PROVIDER)) - .getDeclaredConstructor() - .newInstance(); - gravitinoFileSystemCredentialProvider.setConf(conf); - } catch (Exception e) { - throw new RuntimeException("Failed to create GravitinoFileSystemCredentialProvider", e); - } + this.gravitinoFileSystemCredentialProvider = FileSystemUtils.getGvfsCredentialProvider(conf); } @Override diff --git a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java index 821489b5abd..68ef2795b08 100644 --- a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java +++ b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java @@ -20,6 +20,7 @@ package org.apache.gravitino.abs.fs; import java.io.IOException; +import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; import org.apache.gravitino.catalog.hadoop.fs.GravitinoFileSystemCredentialProvider; import org.apache.gravitino.credential.ADLSTokenCredential; import org.apache.gravitino.credential.AzureAccountKeyCredential; @@ -60,6 +61,7 @@ public Configuration getConf() { @Override public void initialize(Configuration conf, String accountName) throws IOException { this.configuration = conf; + this.gravitinoFileSystemCredentialProvider = FileSystemUtils.getGvfsCredentialProvider(conf); } @Override diff --git a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java index ba28dd97372..802188c5039 100644 --- a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java +++ b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java @@ -21,6 +21,7 @@ import com.google.cloud.hadoop.util.AccessTokenProvider; import java.io.IOException; +import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; import org.apache.gravitino.catalog.hadoop.fs.GravitinoFileSystemCredentialProvider; import org.apache.gravitino.credential.Credential; import org.apache.gravitino.credential.GCSTokenCredential; @@ -72,21 +73,8 @@ public void refresh() throws IOException { @Override public void setConf(Configuration configuration) { this.configuration = configuration; - initGvfsCredentialProvider(configuration); - } 
- - private void initGvfsCredentialProvider(Configuration conf) { - try { - gravitinoFileSystemCredentialProvider = - (GravitinoFileSystemCredentialProvider) - Class.forName( - conf.get(GravitinoFileSystemCredentialProvider.GVFS_CREDENTIAL_PROVIDER)) - .getDeclaredConstructor() - .newInstance(); - gravitinoFileSystemCredentialProvider.setConf(conf); - } catch (Exception e) { - throw new RuntimeException("Failed to create GravitinoFileSystemCredentialProvider", e); - } + this.gravitinoFileSystemCredentialProvider = + FileSystemUtils.getGvfsCredentialProvider(configuration); } @Override diff --git a/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/FileSystemUtils.java b/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/FileSystemUtils.java index a1434e85c3e..031605dec1d 100644 --- a/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/FileSystemUtils.java +++ b/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/FileSystemUtils.java @@ -31,6 +31,7 @@ import java.util.ServiceLoader; import java.util.Set; import java.util.stream.Collectors; +import org.apache.hadoop.conf.Configuration; public class FileSystemUtils { @@ -160,4 +161,26 @@ public static Map toHadoopConfigMap( return result; } + + /** + * Get the GravitinoFileSystemCredentialProvider from the configuration. + * + * @param conf Configuration + * @return GravitinoFileSystemCredentialProvider + */ + public static GravitinoFileSystemCredentialProvider getGvfsCredentialProvider( + Configuration conf) { + try { + GravitinoFileSystemCredentialProvider gravitinoFileSystemCredentialProvider = + (GravitinoFileSystemCredentialProvider) + Class.forName( + conf.get(GravitinoFileSystemCredentialProvider.GVFS_CREDENTIAL_PROVIDER)) + .getDeclaredConstructor() + .newInstance(); + gravitinoFileSystemCredentialProvider.setConf(conf); + return gravitinoFileSystemCredentialProvider; + } catch (Exception e) { + throw new RuntimeException("Failed to create GravitinoFileSystemCredentialProvider", e); + } + } } diff --git a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java index 9cbbcd7bf58..94f4b9a975c 100644 --- a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java +++ b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java @@ -351,6 +351,7 @@ private boolean enableGravitinoCredentialProvider( return credentials.length > 0; } catch (Exception e) { // No credential found, do nothing. 
+ Logger.warn("Failed to fetch credentials from fileset: {}", filesetIdentifier, e); return false; } } From ce49fa3e3924c29612f9a0aaf30d11bf3f48c4f9 Mon Sep 17 00:00:00 2001 From: yuqi Date: Wed, 8 Jan 2025 20:16:06 +0800 Subject: [PATCH 47/59] fix --- .../oss/fs/OSSCredentialsProvider.java | 8 ++--- .../oss/fs/OSSFileSystemProvider.java | 9 ++--- .../s3/fs/S3CredentialsProvider.java | 8 ++--- .../gravitino/s3/fs/S3FileSystemProvider.java | 9 ++--- .../abs/fs/AzureFileSystemProvider.java | 8 ++--- .../abs/fs/AzureSasCredentialsProvider.java | 8 ++--- bundles/gcp/build.gradle.kts | 1 - .../gcs/fs/GCSCredentialsProvider.java | 8 ++--- .../gcs/fs/GCSFileSystemProvider.java | 9 ++--- .../catalog/hadoop/fs/CredentialUtils.java | 36 +++++++++++++++++++ .../catalog/hadoop/fs/FileSystemUtils.java | 12 +++---- ...avitinoFileSystemCredentialsProvider.java} | 4 +-- ...avitinoFileSystemCredentialsProvider.java} | 10 +++--- .../hadoop/GravitinoVirtualFileSystem.java | 19 +++++----- 14 files changed, 84 insertions(+), 65 deletions(-) create mode 100644 catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/CredentialUtils.java rename catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/{GravitinoFileSystemCredentialProvider.java => GravitinoFileSystemCredentialsProvider.java} (89%) rename clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/{DefaultGravitinoFileSystemCredentialProvider.java => DefaultGravitinoFileSystemCredentialsProvider.java} (92%) diff --git a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialsProvider.java b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialsProvider.java index 1a980c76c03..6f264759ea2 100644 --- a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialsProvider.java +++ b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialsProvider.java @@ -25,7 +25,7 @@ import com.aliyun.oss.common.auth.DefaultCredentials; import java.net.URI; import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; -import org.apache.gravitino.catalog.hadoop.fs.GravitinoFileSystemCredentialProvider; +import org.apache.gravitino.catalog.hadoop.fs.GravitinoFileSystemCredentialsProvider; import org.apache.gravitino.credential.Credential; import org.apache.gravitino.credential.OSSSecretKeyCredential; import org.apache.gravitino.credential.OSSTokenCredential; @@ -33,13 +33,13 @@ public class OSSCredentialsProvider implements CredentialsProvider { - private GravitinoFileSystemCredentialProvider gravitinoFileSystemCredentialProvider; + private GravitinoFileSystemCredentialsProvider gravitinoFileSystemCredentialsProvider; private Credentials basicCredentials; private long expirationTime = Long.MAX_VALUE; private static final double EXPIRATION_TIME_FACTOR = 0.9D; public OSSCredentialsProvider(URI uri, Configuration conf) { - this.gravitinoFileSystemCredentialProvider = FileSystemUtils.getGvfsCredentialProvider(conf); + this.gravitinoFileSystemCredentialsProvider = FileSystemUtils.getGvfsCredentialProvider(conf); } @Override @@ -57,7 +57,7 @@ public Credentials getCredentials() { } private void refresh() { - Credential[] gravitinoCredentials = gravitinoFileSystemCredentialProvider.getCredentials(); + Credential[] gravitinoCredentials = gravitinoFileSystemCredentialsProvider.getCredentials(); Credential credential = getSuitableCredential(gravitinoCredentials); if (credential == null) { throw new RuntimeException("No suitable credential for OSS found..."); 
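For reference, the following hedged sketch shows how a bundle is expected to obtain credentials through the shared helper introduced above. The class and key names come from this patch; the provider class value is an assumption based on the renamed default client-side implementation:

```java
// Sketch only: wiring assumed from FileSystemUtils.getGvfsCredentialProvider above.
Configuration conf = new Configuration();
conf.set(
    GravitinoFileSystemCredentialsProvider.GVFS_CREDENTIAL_PROVIDER,
    "org.apache.gravitino.filesystem.hadoop.DefaultGravitinoFileSystemCredentialsProvider");

// The helper reflectively instantiates the configured provider and calls setConf().
GravitinoFileSystemCredentialsProvider provider =
    FileSystemUtils.getGvfsCredentialProvider(conf);
Credential[] credentials = provider.getCredentials();
```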
diff --git a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java index c3c0c0d7853..1c0b55b4baa 100644 --- a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java +++ b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java @@ -18,13 +18,14 @@ */ package org.apache.gravitino.oss.fs; +import static org.apache.gravitino.catalog.hadoop.fs.CredentialUtils.enableGravitinoCredentialVending; + import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableMap; import java.io.IOException; import java.util.Map; import org.apache.gravitino.catalog.hadoop.fs.FileSystemProvider; import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; -import org.apache.gravitino.catalog.hadoop.fs.GravitinoFileSystemCredentialProvider; import org.apache.gravitino.storage.OSSProperties; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; @@ -61,7 +62,7 @@ public FileSystem getFileSystem(Path path, Map config) throws IO hadoopConfMap.put(OSS_FILESYSTEM_IMPL, AliyunOSSFileSystem.class.getCanonicalName()); } - if (enableCredentialProvidedByGravitino(config)) { + if (enableGravitinoCredentialVending(config)) { hadoopConfMap.put( Constants.CREDENTIALS_PROVIDER_KEY, OSSCredentialsProvider.class.getCanonicalName()); } @@ -71,10 +72,6 @@ public FileSystem getFileSystem(Path path, Map config) throws IO return AliyunOSSFileSystem.newInstance(path.toUri(), configuration); } - private boolean enableCredentialProvidedByGravitino(Map config) { - return null != config.get(GravitinoFileSystemCredentialProvider.GVFS_CREDENTIAL_PROVIDER); - } - @Override public String scheme() { return "oss"; diff --git a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialsProvider.java b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialsProvider.java index 854152d30d0..38cbc45be6c 100644 --- a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialsProvider.java +++ b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialsProvider.java @@ -25,21 +25,21 @@ import com.amazonaws.auth.BasicSessionCredentials; import java.net.URI; import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; -import org.apache.gravitino.catalog.hadoop.fs.GravitinoFileSystemCredentialProvider; +import org.apache.gravitino.catalog.hadoop.fs.GravitinoFileSystemCredentialsProvider; import org.apache.gravitino.credential.Credential; import org.apache.gravitino.credential.S3SecretKeyCredential; import org.apache.gravitino.credential.S3TokenCredential; import org.apache.hadoop.conf.Configuration; public class S3CredentialsProvider implements AWSCredentialsProvider { - private GravitinoFileSystemCredentialProvider gravitinoFileSystemCredentialProvider; + private GravitinoFileSystemCredentialsProvider gravitinoFileSystemCredentialsProvider; private AWSCredentials basicSessionCredentials; private long expirationTime = Long.MAX_VALUE; private static final double EXPIRATION_TIME_FACTOR = 0.9D; public S3CredentialsProvider(final URI uri, final Configuration conf) { - this.gravitinoFileSystemCredentialProvider = FileSystemUtils.getGvfsCredentialProvider(conf); + this.gravitinoFileSystemCredentialsProvider = FileSystemUtils.getGvfsCredentialProvider(conf); } @Override @@ -56,7 +56,7 @@ public AWSCredentials getCredentials() { @Override public void refresh() { - Credential[] 
gravitinoCredentials = gravitinoFileSystemCredentialProvider.getCredentials(); + Credential[] gravitinoCredentials = gravitinoFileSystemCredentialsProvider.getCredentials(); Credential credential = getSuitableCredential(gravitinoCredentials); if (credential == null) { diff --git a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java index 3dd5feae7e3..73e1c2edff1 100644 --- a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java +++ b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java @@ -19,6 +19,8 @@ package org.apache.gravitino.s3.fs; +import static org.apache.gravitino.catalog.hadoop.fs.CredentialUtils.enableGravitinoCredentialVending; + import com.amazonaws.auth.AWSCredentialsProvider; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Joiner; @@ -30,7 +32,6 @@ import java.util.Map; import org.apache.gravitino.catalog.hadoop.fs.FileSystemProvider; import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; -import org.apache.gravitino.catalog.hadoop.fs.GravitinoFileSystemCredentialProvider; import org.apache.gravitino.storage.S3Properties; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; @@ -67,7 +68,7 @@ public FileSystem getFileSystem(Path path, Map config) throws IO configuration.set(S3_CREDENTIAL_KEY, S3_SIMPLE_CREDENTIAL); } - if (enableCredentialProvidedByGravitino(config)) { + if (enableGravitinoCredentialVending(config)) { configuration.set( Constants.AWS_CREDENTIALS_PROVIDER, S3CredentialsProvider.class.getCanonicalName()); } @@ -78,10 +79,6 @@ public FileSystem getFileSystem(Path path, Map config) throws IO return S3AFileSystem.newInstance(path.toUri(), configuration); } - private boolean enableCredentialProvidedByGravitino(Map config) { - return null != config.get(GravitinoFileSystemCredentialProvider.GVFS_CREDENTIAL_PROVIDER); - } - private void checkAndSetCredentialProvider(Configuration configuration) { String provides = configuration.get(S3_CREDENTIAL_KEY); if (provides == null) { diff --git a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java index b8cedfd9330..292b2cee433 100644 --- a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java +++ b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java @@ -19,6 +19,7 @@ package org.apache.gravitino.abs.fs; +import static org.apache.gravitino.catalog.hadoop.fs.CredentialUtils.enableGravitinoCredentialVending; import static org.apache.hadoop.fs.azurebfs.constants.ConfigurationKeys.FS_AZURE_ACCOUNT_AUTH_TYPE_PROPERTY_NAME; import static org.apache.hadoop.fs.azurebfs.constants.ConfigurationKeys.FS_AZURE_ACCOUNT_IS_HNS_ENABLED; import static org.apache.hadoop.fs.azurebfs.constants.ConfigurationKeys.FS_AZURE_SAS_TOKEN_PROVIDER_TYPE; @@ -30,7 +31,6 @@ import javax.annotation.Nonnull; import org.apache.gravitino.catalog.hadoop.fs.FileSystemProvider; import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; -import org.apache.gravitino.catalog.hadoop.fs.GravitinoFileSystemCredentialProvider; import org.apache.gravitino.storage.AzureProperties; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; @@ -70,7 +70,7 @@ public FileSystem getFileSystem(@Nonnull Path path, @Nonnull Map 
hadoopConfMap.forEach(configuration::set); - if (enableCredentialProvidedByGravitino(hadoopConfMap)) { + if (enableGravitinoCredentialVending(hadoopConfMap)) { try { AzureSasCredentialsProvider azureSasCredentialsProvider = new AzureSasCredentialsProvider(); azureSasCredentialsProvider.initialize(configuration, null); @@ -103,10 +103,6 @@ public FileSystem getFileSystem(@Nonnull Path path, @Nonnull Map return FileSystem.get(path.toUri(), configuration); } - private boolean enableCredentialProvidedByGravitino(Map config) { - return null != config.get(GravitinoFileSystemCredentialProvider.GVFS_CREDENTIAL_PROVIDER); - } - @Override public String scheme() { return ABS_PROVIDER_SCHEME; diff --git a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java index 68ef2795b08..fd0fe450a7b 100644 --- a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java +++ b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java @@ -21,7 +21,7 @@ import java.io.IOException; import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; -import org.apache.gravitino.catalog.hadoop.fs.GravitinoFileSystemCredentialProvider; +import org.apache.gravitino.catalog.hadoop.fs.GravitinoFileSystemCredentialsProvider; import org.apache.gravitino.credential.ADLSTokenCredential; import org.apache.gravitino.credential.AzureAccountKeyCredential; import org.apache.gravitino.credential.Credential; @@ -36,7 +36,7 @@ public class AzureSasCredentialsProvider implements SASTokenProvider, Configurab private String azureStorageAccountName; private String azureStorageAccountKey; - private GravitinoFileSystemCredentialProvider gravitinoFileSystemCredentialProvider; + private GravitinoFileSystemCredentialsProvider gravitinoFileSystemCredentialsProvider; private long expirationTime = Long.MAX_VALUE; private static final double EXPIRATION_TIME_FACTOR = 0.9D; @@ -61,7 +61,7 @@ public Configuration getConf() { @Override public void initialize(Configuration conf, String accountName) throws IOException { this.configuration = conf; - this.gravitinoFileSystemCredentialProvider = FileSystemUtils.getGvfsCredentialProvider(conf); + this.gravitinoFileSystemCredentialsProvider = FileSystemUtils.getGvfsCredentialProvider(conf); } @Override @@ -76,7 +76,7 @@ public String getSASToken(String account, String fileSystem, String path, String } private void refresh() { - Credential[] gravitinoCredentials = gravitinoFileSystemCredentialProvider.getCredentials(); + Credential[] gravitinoCredentials = gravitinoFileSystemCredentialsProvider.getCredentials(); Credential credential = getSuitableCredential(gravitinoCredentials); if (credential == null) { throw new RuntimeException("No suitable credential for Azure found..."); diff --git a/bundles/gcp/build.gradle.kts b/bundles/gcp/build.gradle.kts index a0bc169bc01..7d46fde9e98 100644 --- a/bundles/gcp/build.gradle.kts +++ b/bundles/gcp/build.gradle.kts @@ -40,7 +40,6 @@ dependencies { implementation(project(":catalogs:hadoop-common")) { exclude("*") } - implementation(libs.commons.lang3) // runtime used implementation(libs.commons.logging) diff --git a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java index 802188c5039..02e87300858 100644 ---
a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java +++ b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java @@ -22,14 +22,14 @@ import com.google.cloud.hadoop.util.AccessTokenProvider; import java.io.IOException; import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; -import org.apache.gravitino.catalog.hadoop.fs.GravitinoFileSystemCredentialProvider; +import org.apache.gravitino.catalog.hadoop.fs.GravitinoFileSystemCredentialsProvider; import org.apache.gravitino.credential.Credential; import org.apache.gravitino.credential.GCSTokenCredential; import org.apache.hadoop.conf.Configuration; public class GCSCredentialsProvider implements AccessTokenProvider { private Configuration configuration; - private GravitinoFileSystemCredentialProvider gravitinoFileSystemCredentialProvider; + private GravitinoFileSystemCredentialsProvider gravitinoFileSystemCredentialsProvider; private AccessToken accessToken; private long expirationTime = Long.MAX_VALUE; @@ -49,7 +49,7 @@ public AccessToken getAccessToken() { @Override public void refresh() throws IOException { - Credential[] gravitinoCredentials = gravitinoFileSystemCredentialProvider.getCredentials(); + Credential[] gravitinoCredentials = gravitinoFileSystemCredentialsProvider.getCredentials(); Credential credential = getSuitableCredential(gravitinoCredentials); if (credential == null) { @@ -73,7 +73,7 @@ public void refresh() throws IOException { @Override public void setConf(Configuration configuration) { this.configuration = configuration; - this.gravitinoFileSystemCredentialProvider = + this.gravitinoFileSystemCredentialsProvider = FileSystemUtils.getGvfsCredentialProvider(configuration); } diff --git a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java index c509ac1ca78..c361e5302c8 100644 --- a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java +++ b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java @@ -18,13 +18,14 @@ */ package org.apache.gravitino.gcs.fs; +import static org.apache.gravitino.catalog.hadoop.fs.CredentialUtils.enableGravitinoCredentialVending; + import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableMap; import java.io.IOException; import java.util.Map; import org.apache.gravitino.catalog.hadoop.fs.FileSystemProvider; import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; -import org.apache.gravitino.catalog.hadoop.fs.GravitinoFileSystemCredentialProvider; import org.apache.gravitino.storage.GCSProperties; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; @@ -46,17 +47,13 @@ public FileSystem getFileSystem(Path path, Map config) throws IO FileSystemUtils.toHadoopConfigMap(config, GRAVITINO_KEY_TO_GCS_HADOOP_KEY) .forEach(configuration::set); - if (enableCredentialProvidedByGravitino(config)) { + if (enableGravitinoCredentialVending(config)) { configuration.set(GCS_TOKEN_PROVIDER_IMPL, GCSCredentialsProvider.class.getName()); } return FileSystem.newInstance(path.toUri(), configuration); } - private boolean enableCredentialProvidedByGravitino(Map config) { - return null != config.get(GravitinoFileSystemCredentialProvider.GVFS_CREDENTIAL_PROVIDER); - } - @Override public String scheme() { return "gs"; diff --git 
a/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/CredentialUtils.java b/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/CredentialUtils.java new file mode 100644 index 00000000000..e8d526e52d2 --- /dev/null +++ b/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/CredentialUtils.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.gravitino.catalog.hadoop.fs; + +import java.util.Map; + +/** Utility class for Gravitino Virtual File System. */ +public class CredentialUtils { + + /** + * Check if Gravitino credential vending is enabled. + * + * @param config The configuration for Gravitino Virtual File System. + * @return true if Gravitino credential vending is enabled, false otherwise. + */ + public static boolean enableGravitinoCredentialVending(Map config) { + return null != config.get(GravitinoFileSystemCredentialsProvider.GVFS_CREDENTIAL_PROVIDER); + } +} diff --git a/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/FileSystemUtils.java b/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/FileSystemUtils.java index 031605dec1d..11ecd1ee9c3 100644 --- a/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/FileSystemUtils.java +++ b/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/FileSystemUtils.java @@ -168,17 +168,17 @@ public static Map toHadoopConfigMap( * @param conf Configuration * @return GravitinoFileSystemCredentialProvider */ - public static GravitinoFileSystemCredentialProvider getGvfsCredentialProvider( + public static GravitinoFileSystemCredentialsProvider getGvfsCredentialProvider( Configuration conf) { try { - GravitinoFileSystemCredentialProvider gravitinoFileSystemCredentialProvider = - (GravitinoFileSystemCredentialProvider) + GravitinoFileSystemCredentialsProvider gravitinoFileSystemCredentialsProvider = + (GravitinoFileSystemCredentialsProvider) Class.forName( - conf.get(GravitinoFileSystemCredentialProvider.GVFS_CREDENTIAL_PROVIDER)) + conf.get(GravitinoFileSystemCredentialsProvider.GVFS_CREDENTIAL_PROVIDER)) .getDeclaredConstructor() .newInstance(); - gravitinoFileSystemCredentialProvider.setConf(conf); - return gravitinoFileSystemCredentialProvider; + gravitinoFileSystemCredentialsProvider.setConf(conf); + return gravitinoFileSystemCredentialsProvider; } catch (Exception e) { throw new RuntimeException("Failed to create GravitinoFileSystemCredentialProvider", e); } diff --git a/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/GravitinoFileSystemCredentialProvider.java 
b/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/GravitinoFileSystemCredentialsProvider.java similarity index 89% rename from catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/GravitinoFileSystemCredentialProvider.java rename to catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/GravitinoFileSystemCredentialsProvider.java index 6d2d63f371e..b53a3014d0e 100644 --- a/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/GravitinoFileSystemCredentialProvider.java +++ b/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/GravitinoFileSystemCredentialsProvider.java @@ -23,11 +23,11 @@ import org.apache.hadoop.conf.Configurable; /** Interface for providing credentials for Gravitino Virtual File System. */ -public interface GravitinoFileSystemCredentialProvider extends Configurable { +public interface GravitinoFileSystemCredentialsProvider extends Configurable { String GVFS_CREDENTIAL_PROVIDER = "fs.gvfs.credential.provider"; - String GVFS_CREDENTIAL_PROVIDER_PATH = "fs.gvfs.virtual.path"; + String GVFS_PATH = "fs.gvfs.virtual.path"; /** * Get credentials for Gravitino Virtual File System. diff --git a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/DefaultGravitinoFileSystemCredentialProvider.java b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/DefaultGravitinoFileSystemCredentialsProvider.java similarity index 92% rename from clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/DefaultGravitinoFileSystemCredentialProvider.java rename to clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/DefaultGravitinoFileSystemCredentialsProvider.java index fd970507ceb..1df8bf6062e 100644 --- a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/DefaultGravitinoFileSystemCredentialProvider.java +++ b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/DefaultGravitinoFileSystemCredentialsProvider.java @@ -24,7 +24,7 @@ import java.util.regex.Pattern; import org.apache.commons.lang3.StringUtils; import org.apache.gravitino.NameIdentifier; -import org.apache.gravitino.catalog.hadoop.fs.GravitinoFileSystemCredentialProvider; +import org.apache.gravitino.catalog.hadoop.fs.GravitinoFileSystemCredentialsProvider; import org.apache.gravitino.client.GravitinoClient; import org.apache.gravitino.credential.Credential; import org.apache.gravitino.file.Fileset; @@ -32,11 +32,11 @@ import org.apache.hadoop.conf.Configuration; /** - * Default implementation of {@link GravitinoFileSystemCredentialProvider} which provides + * Default implementation of {@link GravitinoFileSystemCredentialsProvider} which provides * credentials for Gravitino Virtual File System. 
*/ -public class DefaultGravitinoFileSystemCredentialProvider - implements GravitinoFileSystemCredentialProvider { +public class DefaultGravitinoFileSystemCredentialsProvider + implements GravitinoFileSystemCredentialsProvider { private Configuration configuration; @@ -55,7 +55,7 @@ public Configuration getConf() { @Override public Credential[] getCredentials() { - String virtualPath = configuration.get(GVFS_CREDENTIAL_PROVIDER_PATH); + String virtualPath = configuration.get(GVFS_PATH); NameIdentifier nameIdentifier = getNameIdentifierFromVirtualPath(virtualPath); String[] idents = nameIdentifier.namespace().levels(); try (GravitinoClient client = GravitinoVirtualFileSystemUtils.createClient(configuration)) { diff --git a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java index 94f4b9a975c..6946a64fd1b 100644 --- a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java +++ b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java @@ -50,7 +50,7 @@ import org.apache.gravitino.audit.FilesetDataOperation; import org.apache.gravitino.audit.InternalClientType; import org.apache.gravitino.catalog.hadoop.fs.FileSystemProvider; -import org.apache.gravitino.catalog.hadoop.fs.GravitinoFileSystemCredentialProvider; +import org.apache.gravitino.catalog.hadoop.fs.GravitinoFileSystemCredentialsProvider; import org.apache.gravitino.client.GravitinoClient; import org.apache.gravitino.credential.Credential; import org.apache.gravitino.exceptions.GravitinoRuntimeException; @@ -315,16 +315,14 @@ private FilesetContextPair getFilesetContext(Path virtualPath, FilesetDataOperat Map totalProperty = Maps.newHashMap(necessaryPropertyFromCatalog); totalProperty.putAll(getConfigMap(getConf())); - boolean enableCredentialProvider = - enableGravitinoCredentialProvider(catalog, identifier); - if (enableCredentialProvider) { - // It has enabled the credential provider + if (credentialVending(catalog, identifier)) { + // It has enabled the credential vending, so we need to set the credential + // provider totalProperty.put( - GravitinoFileSystemCredentialProvider.GVFS_CREDENTIAL_PROVIDER, - DefaultGravitinoFileSystemCredentialProvider.class.getCanonicalName()); + GravitinoFileSystemCredentialsProvider.GVFS_CREDENTIAL_PROVIDER, + DefaultGravitinoFileSystemCredentialsProvider.class.getCanonicalName()); totalProperty.put( - GravitinoFileSystemCredentialProvider.GVFS_CREDENTIAL_PROVIDER_PATH, - virtualPathString); + GravitinoFileSystemCredentialsProvider.GVFS_PATH, virtualPathString); } return provider.getFileSystem(filePath, totalProperty); @@ -338,8 +336,7 @@ private FilesetContextPair getFilesetContext(Path virtualPath, FilesetDataOperat return new FilesetContextPair(new Path(actualFileLocation), fs); } - private boolean enableGravitinoCredentialProvider( - Catalog catalog, NameIdentifier filesetIdentifier) { + private boolean credentialVending(Catalog catalog, NameIdentifier filesetIdentifier) { try { Fileset fileset = catalog From 6b848c8f5e334d67078926132e68bdce6e03667c Mon Sep 17 00:00:00 2001 From: yuqi Date: Thu, 9 Jan 2025 00:07:08 +0800 Subject: [PATCH 48/59] fix --- .../hadoop/GravitinoVirtualFileSystem.java | 10 ++--- .../filesystem/hadoop/TestGvfsBase.java | 44 +++++++++++++++++++ 2 files changed, 48 insertions(+), 6 
deletions(-) diff --git a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java index 6946a64fd1b..ffb70e4dc5b 100644 --- a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java +++ b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java @@ -103,9 +103,9 @@ public class GravitinoVirtualFileSystem extends FileSystem { private static final Set CATALOG_NECESSARY_PROPERTIES_TO_KEEP = Sets.newHashSet( OSSProperties.GRAVITINO_OSS_ENDPOINT, - OSSProperties.GRAVITINO_OSS_ENDPOINT, - S3Properties.GRAVITINO_S3_ENDPOINT, + OSSProperties.GRAVITINO_OSS_REGION, S3Properties.GRAVITINO_S3_ENDPOINT, + S3Properties.GRAVITINO_S3_REGION, AzureProperties.GRAVITINO_AZURE_STORAGE_ACCOUNT_NAME); @Override @@ -291,7 +291,7 @@ private FilesetContextPair getFilesetContext(Path virtualPath, FilesetDataOperat FileSystem fs = internalFileSystemCache.get( identifier, - str -> { + ident -> { try { FileSystemProvider provider = fileSystemProvidersMap.get(scheme); if (provider == null) { @@ -347,9 +347,7 @@ private boolean credentialVending(Catalog catalog, NameIdentifier filesetIdentif Credential[] credentials = fileset.supportsCredentials().getCredentials(); return credentials.length > 0; } catch (Exception e) { - // No credential found, do nothing. - Logger.warn("Failed to fetch credentials from fileset: {}", filesetIdentifier, e); - return false; + throw new RuntimeException(e); } } diff --git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestGvfsBase.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestGvfsBase.java index 16fb4e1282c..be30e42a4b8 100644 --- a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestGvfsBase.java +++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestGvfsBase.java @@ -27,6 +27,7 @@ import static org.junit.jupiter.api.Assertions.assertTrue; import com.fasterxml.jackson.core.JsonProcessingException; +import com.google.common.collect.ImmutableMap; import java.io.IOException; import java.net.URI; import java.net.URISyntaxException; @@ -40,7 +41,13 @@ import java.util.Objects; import java.util.concurrent.TimeUnit; import org.apache.gravitino.NameIdentifier; +import org.apache.gravitino.dto.AuditDTO; +import org.apache.gravitino.dto.credential.CredentialDTO; +import org.apache.gravitino.dto.file.FilesetDTO; +import org.apache.gravitino.dto.responses.CredentialResponse; import org.apache.gravitino.dto.responses.FileLocationResponse; +import org.apache.gravitino.dto.responses.FilesetResponse; +import org.apache.gravitino.file.Fileset; import org.apache.gravitino.rest.RESTUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; @@ -140,6 +147,7 @@ public void testFSCache() throws IOException { queryParams.put("sub_path", RESTUtils.encodeString("")); try { buildMockResource(Method.GET, locationPath, queryParams, null, fileLocationResponse, SC_OK); + buildMockResourceForCredential(filesetName, localPath.toString()); } catch (JsonProcessingException e) { throw new RuntimeException(e); } @@ -185,6 +193,7 @@ public void testInternalCache() throws IOException { try { buildMockResource( Method.GET, locationPath1, queryParams, null, 
fileLocationResponse, SC_OK); + buildMockResourceForCredential("fileset1", localPath1.toString()); } catch (JsonProcessingException e) { throw new RuntimeException(e); } @@ -228,6 +237,7 @@ public void testCreate(boolean withScheme) throws IOException { queryParams.put("sub_path", RESTUtils.encodeString("/test.txt")); try { buildMockResource(Method.GET, locationPath, queryParams, null, fileLocationResponse, SC_OK); + buildMockResourceForCredential(filesetName, localPath + "/test.txt"); } catch (JsonProcessingException e) { throw new RuntimeException(e); } @@ -280,6 +290,7 @@ public void testAppend(boolean withScheme) throws IOException { queryParams.put("sub_path", RESTUtils.encodeString("/test.txt")); try { buildMockResource(Method.GET, locationPath, queryParams, null, fileLocationResponse, SC_OK); + buildMockResourceForCredential(filesetName, localPath + "/test.txt"); } catch (JsonProcessingException e) { throw new RuntimeException(e); } @@ -313,6 +324,32 @@ public void testAppend(boolean withScheme) throws IOException { } } + private void buildMockResourceForCredential(String filesetName, String filesetLocation) + throws JsonProcessingException { + String filesetPath = + String.format( + "/api/metalakes/%s/catalogs/%s/schemas/%s/filesets/%s", + metalakeName, catalogName, schemaName, filesetName); + String credentialsPath = + String.format( + "/api/metalakes/%s/objects/fileset/%s.%s.%s/credentials", + metalakeName, catalogName, schemaName, filesetName); + FilesetResponse filesetResponse = + new FilesetResponse( + FilesetDTO.builder() + .name(filesetName) + .comment("comment") + .type(Fileset.Type.MANAGED) + .audit(AuditDTO.builder().build()) + .storageLocation(filesetLocation.toString()) + .build()); + CredentialResponse credentialResponse = new CredentialResponse(new CredentialDTO[] {}); + + buildMockResource(Method.GET, filesetPath, ImmutableMap.of(), null, filesetResponse, SC_OK); + buildMockResource( + Method.GET, credentialsPath, ImmutableMap.of(), null, credentialResponse, SC_OK); + } + @ParameterizedTest @ValueSource(booleans = {true, false}) public void testRename(boolean withScheme) throws IOException { @@ -347,6 +384,7 @@ public void testRename(boolean withScheme) throws IOException { try { buildMockResource( Method.GET, locationPath, queryParams1, null, fileLocationResponse1, SC_OK); + buildMockResourceForCredential(filesetName, localPath + "/rename_dst2"); } catch (JsonProcessingException e) { throw new RuntimeException(e); } @@ -413,6 +451,7 @@ public void testDelete(boolean withScheme) throws IOException { queryParams.put("sub_path", RESTUtils.encodeString("/test_delete")); try { buildMockResource(Method.GET, locationPath, queryParams, null, fileLocationResponse, SC_OK); + buildMockResourceForCredential(filesetName, localPath + "/test_delete"); } catch (JsonProcessingException e) { throw new RuntimeException(e); } @@ -459,6 +498,7 @@ public void testGetStatus() throws IOException { queryParams.put("sub_path", RESTUtils.encodeString("")); try { buildMockResource(Method.GET, locationPath, queryParams, null, fileLocationResponse, SC_OK); + buildMockResourceForCredential(filesetName, localPath.toString()); } catch (JsonProcessingException e) { throw new RuntimeException(e); } @@ -503,6 +543,7 @@ public void testListStatus() throws IOException { queryParams.put("sub_path", RESTUtils.encodeString("")); try { buildMockResource(Method.GET, locationPath, queryParams, null, fileLocationResponse, SC_OK); + buildMockResourceForCredential(filesetName, localPath.toString()); } catch 
(JsonProcessingException e) { throw new RuntimeException(e); } @@ -553,6 +594,7 @@ public void testMkdirs() throws IOException { queryParams.put("sub_path", RESTUtils.encodeString("/test_mkdirs")); try { buildMockResource(Method.GET, locationPath, queryParams, null, fileLocationResponse, SC_OK); + buildMockResourceForCredential(filesetName, localPath + "/test_mkdirs"); } catch (JsonProcessingException e) { throw new RuntimeException(e); } @@ -671,6 +713,7 @@ public void testGetDefaultReplications() throws IOException { queryParams.put("sub_path", RESTUtils.encodeString("")); try { buildMockResource(Method.GET, locationPath, queryParams, null, fileLocationResponse, SC_OK); + buildMockResourceForCredential(filesetName, localPath.toString()); } catch (JsonProcessingException e) { throw new RuntimeException(e); } @@ -697,6 +740,7 @@ public void testGetDefaultBlockSize() throws IOException { queryParams.put("sub_path", RESTUtils.encodeString("")); try { buildMockResource(Method.GET, locationPath, queryParams, null, fileLocationResponse, SC_OK); + buildMockResourceForCredential(filesetName, localPath.toString()); } catch (JsonProcessingException e) { throw new RuntimeException(e); } From 40422f052b25a79d17f41c21bdeb3ecea2b99e99 Mon Sep 17 00:00:00 2001 From: yuqi Date: Thu, 9 Jan 2025 11:41:31 +0800 Subject: [PATCH 49/59] Fix method naming problem and fix improper description in comments. --- .../org/apache/gravitino/oss/fs/OSSFileSystemProvider.java | 4 ++-- .../org/apache/gravitino/s3/fs/S3FileSystemProvider.java | 4 ++-- .../apache/gravitino/abs/fs/AzureFileSystemProvider.java | 4 ++-- .../org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java | 4 ++-- .../gravitino/catalog/hadoop/fs/CredentialUtils.java | 2 +- .../filesystem/hadoop/GravitinoVirtualFileSystem.java | 7 ++++--- 6 files changed, 13 insertions(+), 12 deletions(-) diff --git a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java index 1c0b55b4baa..de8f2226903 100644 --- a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java +++ b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java @@ -18,7 +18,7 @@ */ package org.apache.gravitino.oss.fs; -import static org.apache.gravitino.catalog.hadoop.fs.CredentialUtils.enableGravitinoCredentialVending; +import static org.apache.gravitino.catalog.hadoop.fs.CredentialUtils.gravitinoCredentialVendingEnabled; import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableMap; @@ -62,7 +62,7 @@ public FileSystem getFileSystem(Path path, Map config) throws IO hadoopConfMap.put(OSS_FILESYSTEM_IMPL, AliyunOSSFileSystem.class.getCanonicalName()); } - if (enableGravitinoCredentialVending(config)) { + if (gravitinoCredentialVendingEnabled(config)) { hadoopConfMap.put( Constants.CREDENTIALS_PROVIDER_KEY, OSSCredentialsProvider.class.getCanonicalName()); } diff --git a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java index 73e1c2edff1..cdc007ade2a 100644 --- a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java +++ b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java @@ -19,7 +19,7 @@ package org.apache.gravitino.s3.fs; -import static org.apache.gravitino.catalog.hadoop.fs.CredentialUtils.enableGravitinoCredentialVending; +import 
static org.apache.gravitino.catalog.hadoop.fs.CredentialUtils.gravitinoCredentialVendingEnabled; import com.amazonaws.auth.AWSCredentialsProvider; import com.google.common.annotations.VisibleForTesting; @@ -68,7 +68,7 @@ public FileSystem getFileSystem(Path path, Map config) throws IO configuration.set(S3_CREDENTIAL_KEY, S3_SIMPLE_CREDENTIAL); } - if (enableGravitinoCredentialVending(config)) { + if (gravitinoCredentialVendingEnabled(config)) { configuration.set( Constants.AWS_CREDENTIALS_PROVIDER, S3CredentialsProvider.class.getCanonicalName()); } diff --git a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java index 292b2cee433..37997aa718e 100644 --- a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java +++ b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java @@ -19,7 +19,7 @@ package org.apache.gravitino.abs.fs; -import static org.apache.gravitino.catalog.hadoop.fs.CredentialUtils.enableGravitinoCredentialVending; +import static org.apache.gravitino.catalog.hadoop.fs.CredentialUtils.gravitinoCredentialVendingEnabled; import static org.apache.hadoop.fs.azurebfs.constants.ConfigurationKeys.FS_AZURE_ACCOUNT_AUTH_TYPE_PROPERTY_NAME; import static org.apache.hadoop.fs.azurebfs.constants.ConfigurationKeys.FS_AZURE_ACCOUNT_IS_HNS_ENABLED; import static org.apache.hadoop.fs.azurebfs.constants.ConfigurationKeys.FS_AZURE_SAS_TOKEN_PROVIDER_TYPE; @@ -70,7 +70,7 @@ public FileSystem getFileSystem(@Nonnull Path path, @Nonnull Map hadoopConfMap.forEach(configuration::set); - if (enableGravitinoCredentialVending(hadoopConfMap)) { + if (gravitinoCredentialVendingEnabled(hadoopConfMap)) { try { AzureSasCredentialsProvider azureSasCredentialsProvider = new AzureSasCredentialsProvider(); azureSasCredentialsProvider.initialize(configuration, null); diff --git a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java index c361e5302c8..df35b61055e 100644 --- a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java +++ b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java @@ -18,7 +18,7 @@ */ package org.apache.gravitino.gcs.fs; -import static org.apache.gravitino.catalog.hadoop.fs.CredentialUtils.enableGravitinoCredentialVending; +import static org.apache.gravitino.catalog.hadoop.fs.CredentialUtils.gravitinoCredentialVendingEnabled; import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableMap; @@ -47,7 +47,7 @@ public FileSystem getFileSystem(Path path, Map config) throws IO FileSystemUtils.toHadoopConfigMap(config, GRAVITINO_KEY_TO_GCS_HADOOP_KEY) .forEach(configuration::set); - if (enableGravitinoCredentialVending(config)) { + if (gravitinoCredentialVendingEnabled(config)) { configuration.set(GCS_TOKEN_PROVIDER_IMPL, GCSCredentialsProvider.class.getName()); } diff --git a/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/CredentialUtils.java b/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/CredentialUtils.java index e8d526e52d2..26b2d2d248a 100644 --- a/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/CredentialUtils.java +++ b/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/CredentialUtils.java @@ -30,7 +30,7 @@ 
public class CredentialUtils { * @param config The configuration for Gravitino Virtual File System. * @return true if Gravitino credential vending is enabled, false otherwise. */ - public static boolean enableGravitinoCredentialVending(Map config) { + public static boolean gravitinoCredentialVendingEnabled(Map config) { return null != config.get(GravitinoFileSystemCredentialsProvider.GVFS_CREDENTIAL_PROVIDER); } } diff --git a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java index ffb70e4dc5b..36e235a24b1 100644 --- a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java +++ b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java @@ -315,8 +315,9 @@ private FilesetContextPair getFilesetContext(Path virtualPath, FilesetDataOperat Map totalProperty = Maps.newHashMap(necessaryPropertyFromCatalog); totalProperty.putAll(getConfigMap(getConf())); - if (credentialVending(catalog, identifier)) { - // It has enabled the credential vending, so we need to set the credential + if (credentialVendingEnabled(catalog, identifier)) { + // If the catalog has enabled the credential vending, we need to set the + // credential // provider totalProperty.put( GravitinoFileSystemCredentialsProvider.GVFS_CREDENTIAL_PROVIDER, @@ -336,7 +337,7 @@ private FilesetContextPair getFilesetContext(Path virtualPath, FilesetDataOperat return new FilesetContextPair(new Path(actualFileLocation), fs); } - private boolean credentialVending(Catalog catalog, NameIdentifier filesetIdentifier) { + private boolean credentialVendingEnabled(Catalog catalog, NameIdentifier filesetIdentifier) { try { Fileset fileset = catalog From 8c228fd5880085484f61569266c8351a5f882850 Mon Sep 17 00:00:00 2001 From: yuqi Date: Thu, 9 Jan 2025 18:00:46 +0800 Subject: [PATCH 50/59] Resolve comments --- .../oss/fs/OSSCredentialsProvider.java | 2 +- .../oss/fs/OSSFileSystemProvider.java | 26 +++++--- .../s3/fs/S3CredentialsProvider.java | 2 +- .../gravitino/s3/fs/S3FileSystemProvider.java | 19 +++++- .../abs/fs/AzureFileSystemProvider.java | 61 +++++++++---------- .../abs/fs/AzureSasCredentialsProvider.java | 16 +---- .../gcs/fs/GCSCredentialsProvider.java | 2 +- .../gcs/fs/GCSFileSystemProvider.java | 17 +++++- .../hadoop/fs/SupportsCredentialVending.java | 37 +++++++++++ .../hadoop/GravitinoVirtualFileSystem.java | 33 ++++++---- 10 files changed, 144 insertions(+), 71 deletions(-) create mode 100644 catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/SupportsCredentialVending.java diff --git a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialsProvider.java b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialsProvider.java index 6f264759ea2..13ca9a0a59b 100644 --- a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialsProvider.java +++ b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialsProvider.java @@ -93,7 +93,7 @@ private void refresh() { * @param credentials The credential array. * @return A credential. Null if not found. */ - private Credential getSuitableCredential(Credential[] credentials) { + static Credential getSuitableCredential(Credential[] credentials) { // Use dynamic credential if found. 
for (Credential credential : credentials) { if (credential instanceof OSSTokenCredential) { diff --git a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java index de8f2226903..7cc050d1444 100644 --- a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java +++ b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java @@ -18,14 +18,17 @@ */ package org.apache.gravitino.oss.fs; -import static org.apache.gravitino.catalog.hadoop.fs.CredentialUtils.gravitinoCredentialVendingEnabled; - import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Maps; import java.io.IOException; import java.util.Map; import org.apache.gravitino.catalog.hadoop.fs.FileSystemProvider; import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; +import org.apache.gravitino.catalog.hadoop.fs.SupportsCredentialVending; +import org.apache.gravitino.credential.Credential; +import org.apache.gravitino.credential.OSSSecretKeyCredential; +import org.apache.gravitino.credential.OSSTokenCredential; import org.apache.gravitino.storage.OSSProperties; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; @@ -33,7 +36,7 @@ import org.apache.hadoop.fs.aliyun.oss.AliyunOSSFileSystem; import org.apache.hadoop.fs.aliyun.oss.Constants; -public class OSSFileSystemProvider implements FileSystemProvider { +public class OSSFileSystemProvider implements FileSystemProvider, SupportsCredentialVending { private static final String OSS_FILESYSTEM_IMPL = "fs.oss.impl"; @@ -62,16 +65,23 @@ public FileSystem getFileSystem(Path path, Map config) throws IO hadoopConfMap.put(OSS_FILESYSTEM_IMPL, AliyunOSSFileSystem.class.getCanonicalName()); } - if (gravitinoCredentialVendingEnabled(config)) { - hadoopConfMap.put( - Constants.CREDENTIALS_PROVIDER_KEY, OSSCredentialsProvider.class.getCanonicalName()); - } - hadoopConfMap.forEach(configuration::set); return AliyunOSSFileSystem.newInstance(path.toUri(), configuration); } + @Override + public Map getFileSystemCredentialConf(Credential[] credentials) { + Credential credential = OSSCredentialsProvider.getSuitableCredential(credentials); + Map result = Maps.newHashMap(); + if (credential instanceof OSSSecretKeyCredential || credential instanceof OSSTokenCredential) { + result.put( + Constants.CREDENTIALS_PROVIDER_KEY, OSSCredentialsProvider.class.getCanonicalName()); + } + + return result; + } + @Override public String scheme() { return "oss"; diff --git a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialsProvider.java b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialsProvider.java index 38cbc45be6c..32bb6b74f0a 100644 --- a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialsProvider.java +++ b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialsProvider.java @@ -93,7 +93,7 @@ public void refresh() { * @param credentials The credential array. * @return A credential. Null if not found. */ - private Credential getSuitableCredential(Credential[] credentials) { + static Credential getSuitableCredential(Credential[] credentials) { // Use dynamic credential if found. 
for (Credential credential : credentials) { if (credential instanceof S3TokenCredential) { diff --git a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java index cdc007ade2a..cee654a4fb8 100644 --- a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java +++ b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java @@ -27,11 +27,16 @@ import com.google.common.base.Splitter; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Lists; +import com.google.common.collect.Maps; import java.io.IOException; import java.util.List; import java.util.Map; import org.apache.gravitino.catalog.hadoop.fs.FileSystemProvider; import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; +import org.apache.gravitino.catalog.hadoop.fs.SupportsCredentialVending; +import org.apache.gravitino.credential.Credential; +import org.apache.gravitino.credential.S3SecretKeyCredential; +import org.apache.gravitino.credential.S3TokenCredential; import org.apache.gravitino.storage.S3Properties; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; @@ -41,7 +46,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class S3FileSystemProvider implements FileSystemProvider { +public class S3FileSystemProvider implements FileSystemProvider, SupportsCredentialVending { private static final Logger LOG = LoggerFactory.getLogger(S3FileSystemProvider.class); @@ -79,6 +84,18 @@ public FileSystem getFileSystem(Path path, Map config) throws IO return S3AFileSystem.newInstance(path.toUri(), configuration); } + @Override + public Map getFileSystemCredentialConf(Credential[] credentials) { + Credential credential = S3CredentialsProvider.getSuitableCredential(credentials); + Map result = Maps.newHashMap(); + if (credential instanceof S3SecretKeyCredential || credential instanceof S3TokenCredential) { + result.put( + Constants.AWS_CREDENTIALS_PROVIDER, S3CredentialsProvider.class.getCanonicalName()); + } + + return result; + } + private void checkAndSetCredentialProvider(Configuration configuration) { String provides = configuration.get(S3_CREDENTIAL_KEY); if (provides == null) { diff --git a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java index 37997aa718e..c8a65a4aba9 100644 --- a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java +++ b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java @@ -19,25 +19,29 @@ package org.apache.gravitino.abs.fs; -import static org.apache.gravitino.catalog.hadoop.fs.CredentialUtils.gravitinoCredentialVendingEnabled; import static org.apache.hadoop.fs.azurebfs.constants.ConfigurationKeys.FS_AZURE_ACCOUNT_AUTH_TYPE_PROPERTY_NAME; import static org.apache.hadoop.fs.azurebfs.constants.ConfigurationKeys.FS_AZURE_ACCOUNT_IS_HNS_ENABLED; import static org.apache.hadoop.fs.azurebfs.constants.ConfigurationKeys.FS_AZURE_SAS_TOKEN_PROVIDER_TYPE; import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Maps; import java.io.IOException; import java.util.Map; import javax.annotation.Nonnull; import org.apache.gravitino.catalog.hadoop.fs.FileSystemProvider; import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; +import 
org.apache.gravitino.catalog.hadoop.fs.SupportsCredentialVending; +import org.apache.gravitino.credential.ADLSTokenCredential; +import org.apache.gravitino.credential.AzureAccountKeyCredential; +import org.apache.gravitino.credential.Credential; import org.apache.gravitino.storage.AzureProperties; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.azurebfs.services.AuthType; -public class AzureFileSystemProvider implements FileSystemProvider { +public class AzureFileSystemProvider implements FileSystemProvider, SupportsCredentialVending { @VisibleForTesting public static final String ABS_PROVIDER_SCHEME = "abfss"; @@ -70,37 +74,32 @@ public FileSystem getFileSystem(@Nonnull Path path, @Nonnull Map hadoopConfMap.forEach(configuration::set); - if (gravitinoCredentialVendingEnabled(hadoopConfMap)) { - try { - AzureSasCredentialsProvider azureSasCredentialsProvider = new AzureSasCredentialsProvider(); - azureSasCredentialsProvider.initialize(configuration, null); - String sas = azureSasCredentialsProvider.getSASToken(null, null, null, null); - if (sas != null) { - String accountName = - String.format( - "%s.dfs.core.windows.net", - config.get(AzureProperties.GRAVITINO_AZURE_STORAGE_ACCOUNT_NAME)); - - configuration.set( - FS_AZURE_ACCOUNT_AUTH_TYPE_PROPERTY_NAME + "." + accountName, AuthType.SAS.name()); - configuration.set( - FS_AZURE_SAS_TOKEN_PROVIDER_TYPE + "." + accountName, - AzureSasCredentialsProvider.class.getName()); - configuration.set(FS_AZURE_ACCOUNT_IS_HNS_ENABLED, "true"); - } else if (azureSasCredentialsProvider.getAzureStorageAccountKey() != null - && azureSasCredentialsProvider.getAzureStorageAccountName() != null) { - configuration.set( - String.format( - "fs.azure.account.key.%s.dfs.core.windows.net", - azureSasCredentialsProvider.getAzureStorageAccountName()), - azureSasCredentialsProvider.getAzureStorageAccountKey()); - } - } catch (Exception e) { - throw new IOException("Failed to get SAS token from AzureSasCredentialsProvider", e); - } + return FileSystem.get(path.toUri(), configuration); + } + + @Override + public Map getFileSystemCredentialConf(Credential[] credentials) { + Credential credential = AzureSasCredentialsProvider.getSuitableCredential(credentials); + Map result = Maps.newHashMap(); + if (credential instanceof ADLSTokenCredential) { + ADLSTokenCredential adlsTokenCredential = (ADLSTokenCredential) credential; + + String accountName = adlsTokenCredential.accountName(); + result.put(FS_AZURE_ACCOUNT_AUTH_TYPE_PROPERTY_NAME + "." + accountName, AuthType.SAS.name()); + result.put( + FS_AZURE_SAS_TOKEN_PROVIDER_TYPE + "." 
+ accountName, + AzureSasCredentialsProvider.class.getName()); + result.put(FS_AZURE_ACCOUNT_IS_HNS_ENABLED, "true"); + } else if (credential instanceof AzureAccountKeyCredential) { + AzureAccountKeyCredential azureAccountKeyCredential = (AzureAccountKeyCredential) credential; + result.put( + String.format( + "fs.azure.account.key.%s.dfs.core.windows.net", + azureAccountKeyCredential.accountName()), + azureAccountKeyCredential.accountKey()); } - return FileSystem.get(path.toUri(), configuration); + return result; } @Override diff --git a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java index fd0fe450a7b..0e0bbfef60f 100644 --- a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java +++ b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java @@ -33,21 +33,11 @@ public class AzureSasCredentialsProvider implements SASTokenProvider, Configurab private Configuration configuration; private String sasToken; - private String azureStorageAccountName; - private String azureStorageAccountKey; private GravitinoFileSystemCredentialsProvider gravitinoFileSystemCredentialsProvider; private long expirationTime = Long.MAX_VALUE; private static final double EXPIRATION_TIME_FACTOR = 0.9D; - public String getAzureStorageAccountName() { - return azureStorageAccountName; - } - - public String getAzureStorageAccountKey() { - return azureStorageAccountKey; - } - @Override public void setConf(Configuration configuration) { this.configuration = configuration; @@ -85,10 +75,6 @@ private void refresh() { if (credential instanceof ADLSTokenCredential) { ADLSTokenCredential adlsTokenCredential = (ADLSTokenCredential) credential; sasToken = adlsTokenCredential.sasToken(); - } else if (credential instanceof AzureAccountKeyCredential) { - AzureAccountKeyCredential azureAccountKeyCredential = (AzureAccountKeyCredential) credential; - azureStorageAccountName = azureAccountKeyCredential.accountName(); - azureStorageAccountKey = azureAccountKeyCredential.accountKey(); } if (credential.expireTimeInMs() > 0) { @@ -107,7 +93,7 @@ private void refresh() { * @param credentials The credential array. * @return A credential. Null if not found. */ - private Credential getSuitableCredential(Credential[] credentials) { + static Credential getSuitableCredential(Credential[] credentials) { // Use dynamic credential if found. for (Credential credential : credentials) { if (credential instanceof ADLSTokenCredential) { diff --git a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java index 02e87300858..0ba8f1ff863 100644 --- a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java +++ b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java @@ -89,7 +89,7 @@ public Configuration getConf() { * @param credentials The credential array. * @return A credential.
*/ - private Credential getSuitableCredential(Credential[] credentials) { + static Credential getSuitableCredential(Credential[] credentials) { for (Credential credential : credentials) { if (credential instanceof GCSTokenCredential) { return credential; diff --git a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java index df35b61055e..8078bbec5e9 100644 --- a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java +++ b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java @@ -22,16 +22,20 @@ import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Maps; import java.io.IOException; import java.util.Map; import org.apache.gravitino.catalog.hadoop.fs.FileSystemProvider; import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; +import org.apache.gravitino.catalog.hadoop.fs.SupportsCredentialVending; +import org.apache.gravitino.credential.Credential; +import org.apache.gravitino.credential.GCSTokenCredential; import org.apache.gravitino.storage.GCSProperties; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -public class GCSFileSystemProvider implements FileSystemProvider { +public class GCSFileSystemProvider implements FileSystemProvider, SupportsCredentialVending { private static final String GCS_SERVICE_ACCOUNT_JSON_FILE = "fs.gs.auth.service.account.json.keyfile"; private static final String GCS_TOKEN_PROVIDER_IMPL = "fs.gs.auth.access.token.provider.impl"; @@ -54,6 +58,17 @@ public FileSystem getFileSystem(Path path, Map config) throws IO return FileSystem.newInstance(path.toUri(), configuration); } + @Override + public Map getFileSystemCredentialConf(Credential[] credentials) { + Credential credential = GCSCredentialsProvider.getSuitableCredential(credentials); + Map result = Maps.newHashMap(); + if (credential instanceof GCSTokenCredential) { + result.put(GCS_TOKEN_PROVIDER_IMPL, GCSCredentialsProvider.class.getName()); + } + + return result; + } + @Override public String scheme() { return "gs"; diff --git a/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/SupportsCredentialVending.java b/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/SupportsCredentialVending.java new file mode 100644 index 00000000000..a9c0b688d0c --- /dev/null +++ b/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/SupportsCredentialVending.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.gravitino.catalog.hadoop.fs; + +import com.google.common.collect.ImmutableMap; +import java.util.Map; +import org.apache.gravitino.credential.Credential; + +/** Interface for file systems that support credential vending. */ +public interface SupportsCredentialVending { + /** + * Get the configuration needed for the file system credential based on the credentials. + * + * @param credentials the credentials to be used for the file system + * @return the configuration for the file system credential + */ + default Map getFileSystemCredentialConf(Credential[] credentials) { + return ImmutableMap.of(); + } +} diff --git a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java index 36e235a24b1..f721521e63d 100644 --- a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java +++ b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java @@ -51,6 +51,7 @@ import org.apache.gravitino.audit.InternalClientType; import org.apache.gravitino.catalog.hadoop.fs.FileSystemProvider; import org.apache.gravitino.catalog.hadoop.fs.GravitinoFileSystemCredentialsProvider; +import org.apache.gravitino.catalog.hadoop.fs.SupportsCredentialVending; import org.apache.gravitino.client.GravitinoClient; import org.apache.gravitino.credential.Credential; import org.apache.gravitino.exceptions.GravitinoRuntimeException; @@ -315,16 +316,8 @@ private FilesetContextPair getFilesetContext(Path virtualPath, FilesetDataOperat Map totalProperty = Maps.newHashMap(necessaryPropertyFromCatalog); totalProperty.putAll(getConfigMap(getConf())); - if (credentialVendingEnabled(catalog, identifier)) { - // If the catalog has enabled the credential vending, we need to set the - // credential - // provider - totalProperty.put( - GravitinoFileSystemCredentialsProvider.GVFS_CREDENTIAL_PROVIDER, - DefaultGravitinoFileSystemCredentialsProvider.class.getCanonicalName()); - totalProperty.put( - GravitinoFileSystemCredentialsProvider.GVFS_PATH, virtualPathString); - } + totalProperty.putAll( + getCredentialProperties(provider, catalog, identifier, virtualPathString)); return provider.getFileSystem(filePath, totalProperty); } catch (IOException ioe) { @@ -337,7 +330,12 @@ private FilesetContextPair getFilesetContext(Path virtualPath, FilesetDataOperat return new FilesetContextPair(new Path(actualFileLocation), fs); } - private boolean credentialVendingEnabled(Catalog catalog, NameIdentifier filesetIdentifier) { + private Map getCredentialProperties( + FileSystemProvider fileSystemProvider, + Catalog catalog, + NameIdentifier filesetIdentifier, + String virtualPathString) { + Map maps = Maps.newHashMap(); try { Fileset fileset = catalog @@ -346,10 +344,21 @@ private boolean credentialVendingEnabled(Catalog catalog, NameIdentifier fileset NameIdentifier.of( filesetIdentifier.namespace().level(2), filesetIdentifier.name())); Credential[] credentials = fileset.supportsCredentials().getCredentials(); - return credentials.length > 0; + if (credentials.length > 0 && fileSystemProvider instanceof SupportsCredentialVending) { + maps.put( + GravitinoFileSystemCredentialsProvider.GVFS_CREDENTIAL_PROVIDER, + DefaultGravitinoFileSystemCredentialsProvider.class.getCanonicalName()); + maps.put(GravitinoFileSystemCredentialsProvider.GVFS_PATH, 
virtualPathString); + + SupportsCredentialVending supportsCredentialVending = + (SupportsCredentialVending) fileSystemProvider; + maps.putAll(supportsCredentialVending.getFileSystemCredentialConf(credentials)); + } } catch (Exception e) { throw new RuntimeException(e); } + + return maps; } private void resetFileSystemServiceLoader(String fsScheme) { From d8ba11eb159324c985fdcc3f787e119fee3f1361 Mon Sep 17 00:00:00 2001 From: yuqi Date: Thu, 9 Jan 2025 19:09:14 +0800 Subject: [PATCH 51/59] fix minor --- .../org/apache/gravitino/s3/fs/S3FileSystemProvider.java | 7 ------- .../org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java | 6 ------ 2 files changed, 13 deletions(-) diff --git a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java index cee654a4fb8..ed445a24e1e 100644 --- a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java +++ b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java @@ -19,8 +19,6 @@ package org.apache.gravitino.s3.fs; -import static org.apache.gravitino.catalog.hadoop.fs.CredentialUtils.gravitinoCredentialVendingEnabled; - import com.amazonaws.auth.AWSCredentialsProvider; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Joiner; @@ -73,11 +71,6 @@ public FileSystem getFileSystem(Path path, Map config) throws IO configuration.set(S3_CREDENTIAL_KEY, S3_SIMPLE_CREDENTIAL); } - if (gravitinoCredentialVendingEnabled(config)) { - configuration.set( - Constants.AWS_CREDENTIALS_PROVIDER, S3CredentialsProvider.class.getCanonicalName()); - } - // Hadoop-aws 2 does not support IAMInstanceCredentialsProvider checkAndSetCredentialProvider(configuration); diff --git a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java index 8078bbec5e9..89574909948 100644 --- a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java +++ b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java @@ -18,8 +18,6 @@ */ package org.apache.gravitino.gcs.fs; -import static org.apache.gravitino.catalog.hadoop.fs.CredentialUtils.gravitinoCredentialVendingEnabled; - import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Maps; @@ -51,10 +49,6 @@ public FileSystem getFileSystem(Path path, Map config) throws IO FileSystemUtils.toHadoopConfigMap(config, GRAVITINO_KEY_TO_GCS_HADOOP_KEY) .forEach(configuration::set); - if (gravitinoCredentialVendingEnabled(config)) { - configuration.set(GCS_TOKEN_PROVIDER_IMPL, GCSCredentialsProvider.class.getName()); - } - return FileSystem.newInstance(path.toUri(), configuration); } From f5c3837694d6c305bb74bed45fe05841001b9961 Mon Sep 17 00:00:00 2001 From: yuqi Date: Thu, 9 Jan 2025 19:11:25 +0800 Subject: [PATCH 52/59] Remove unnecessary class --- .../catalog/hadoop/fs/CredentialUtils.java | 36 ------------------- 1 file changed, 36 deletions(-) delete mode 100644 catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/CredentialUtils.java diff --git a/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/CredentialUtils.java b/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/CredentialUtils.java deleted file mode 100644 index 26b2d2d248a..00000000000 --- 
a/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/CredentialUtils.java +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.gravitino.catalog.hadoop.fs; - -import java.util.Map; - -/** Utility class for Gravitino Virtual File System. */ -public class CredentialUtils { - - /** - * Check if Gravitino credential vending is enabled. - * - * @param config The configuration for Gravitino Virtual File System. - * @return true if Gravitino credential vending is enabled, false otherwise. - */ - public static boolean gravitinoCredentialVendingEnabled(Map config) { - return null != config.get(GravitinoFileSystemCredentialsProvider.GVFS_CREDENTIAL_PROVIDER); - } -} From 46ba6da3c6d398cbe49a5c046b6d03379c3686ec Mon Sep 17 00:00:00 2001 From: yuqi Date: Thu, 9 Jan 2025 21:12:46 +0800 Subject: [PATCH 53/59] fix --- .../oss/fs/OSSCredentialsProvider.java | 29 +---------- .../oss/fs/OSSFileSystemProvider.java | 2 +- .../org/apache/gravitino/oss/fs/OSSUtils.java | 52 +++++++++++++++++++ .../s3/fs/S3CredentialsProvider.java | 28 +--------- .../gravitino/s3/fs/S3FileSystemProvider.java | 2 +- .../org/apache/gravitino/s3/fs/S3Utils.java | 51 ++++++++++++++++++ .../abs/fs/AzureFileSystemProvider.java | 2 +- .../abs/fs/AzureSasCredentialsProvider.java | 30 +---------- .../gravitino/abs/fs/AzureStorageUtils.java | 43 +++++++++++++++ .../gcs/fs/GCSCredentialsProvider.java | 21 +------- .../gcs/fs/GCSFileSystemProvider.java | 2 +- .../org/apache/gravitino/gcs/fs/GCSUtils.java | 42 +++++++++++++++ ...ravitinoFileSystemCredentialsProvider.java | 2 +- ...ravitinoFileSystemCredentialsProvider.java | 33 ++---------- .../hadoop/GravitinoVirtualFileSystem.java | 28 +++++----- ...itinoVirtualFileSystemGCSCredentialIT.java | 2 +- ...itinoVirtualFileSystemOSSCredentialIT.java | 24 ++++----- ...vitinoVirtualFileSystemS3CredentialIT.java | 24 ++++----- 18 files changed, 247 insertions(+), 170 deletions(-) create mode 100644 bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSUtils.java create mode 100644 bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3Utils.java create mode 100644 bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureStorageUtils.java create mode 100644 bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSUtils.java diff --git a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialsProvider.java b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialsProvider.java index 13ca9a0a59b..ef4afe434a8 100644 --- a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialsProvider.java +++ b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSCredentialsProvider.java @@ -36,7 +36,7 @@ public class 
OSSCredentialsProvider implements CredentialsProvider { private GravitinoFileSystemCredentialsProvider gravitinoFileSystemCredentialsProvider; private Credentials basicCredentials; private long expirationTime = Long.MAX_VALUE; - private static final double EXPIRATION_TIME_FACTOR = 0.9D; + private static final double EXPIRATION_TIME_FACTOR = 0.5D; public OSSCredentialsProvider(URI uri, Configuration conf) { this.gravitinoFileSystemCredentialsProvider = FileSystemUtils.getGvfsCredentialProvider(conf); @@ -58,7 +58,7 @@ public Credentials getCredentials() { private void refresh() { Credential[] gravitinoCredentials = gravitinoFileSystemCredentialsProvider.getCredentials(); - Credential credential = getSuitableCredential(gravitinoCredentials); + Credential credential = OSSUtils.getSuitableCredential(gravitinoCredentials); if (credential == null) { throw new RuntimeException("No suitable credential for OSS found..."); } @@ -85,29 +85,4 @@ private void refresh() { * EXPIRATION_TIME_FACTOR); } } - - /** - * Get the credential from the credential array. Using dynamic credential first, if not found, - * uses static credential. - * - * @param credentials The credential array. - * @return A credential. Null if not found. - */ - static Credential getSuitableCredential(Credential[] credentials) { - // Use dynamic credential if found. - for (Credential credential : credentials) { - if (credential instanceof OSSTokenCredential) { - return credential; - } - } - - // If dynamic credential not found, use the static one - for (Credential credential : credentials) { - if (credential instanceof OSSSecretKeyCredential) { - return credential; - } - } - - return null; - } } diff --git a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java index 7cc050d1444..358e3a08c76 100644 --- a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java +++ b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSFileSystemProvider.java @@ -72,7 +72,7 @@ public FileSystem getFileSystem(Path path, Map config) throws IO @Override public Map getFileSystemCredentialConf(Credential[] credentials) { - Credential credential = OSSCredentialsProvider.getSuitableCredential(credentials); + Credential credential = OSSUtils.getSuitableCredential(credentials); Map result = Maps.newHashMap(); if (credential instanceof OSSSecretKeyCredential || credential instanceof OSSTokenCredential) { result.put( diff --git a/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSUtils.java b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSUtils.java new file mode 100644 index 00000000000..87c71de377b --- /dev/null +++ b/bundles/aliyun/src/main/java/org/apache/gravitino/oss/fs/OSSUtils.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.gravitino.oss.fs; + +import org.apache.gravitino.credential.Credential; +import org.apache.gravitino.credential.OSSSecretKeyCredential; +import org.apache.gravitino.credential.OSSTokenCredential; + +public class OSSUtils { + + /** + * Get the credential from the credential array. Using dynamic credential first, if not found, + * uses static credential. + * + * @param credentials The credential array. + * @return A credential. Null if not found. + */ + static Credential getSuitableCredential(Credential[] credentials) { + // Use dynamic credential if found. + for (Credential credential : credentials) { + if (credential instanceof OSSTokenCredential) { + return credential; + } + } + + // If dynamic credential not found, use the static one + for (Credential credential : credentials) { + if (credential instanceof OSSSecretKeyCredential) { + return credential; + } + } + + return null; + } +} diff --git a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialsProvider.java b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialsProvider.java index 32bb6b74f0a..2fc14588959 100644 --- a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialsProvider.java +++ b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3CredentialsProvider.java @@ -36,7 +36,7 @@ public class S3CredentialsProvider implements AWSCredentialsProvider { private AWSCredentials basicSessionCredentials; private long expirationTime = Long.MAX_VALUE; - private static final double EXPIRATION_TIME_FACTOR = 0.9D; + private static final double EXPIRATION_TIME_FACTOR = 0.5D; public S3CredentialsProvider(final URI uri, final Configuration conf) { this.gravitinoFileSystemCredentialsProvider = FileSystemUtils.getGvfsCredentialProvider(conf); @@ -57,7 +57,7 @@ public AWSCredentials getCredentials() { @Override public void refresh() { Credential[] gravitinoCredentials = gravitinoFileSystemCredentialsProvider.getCredentials(); - Credential credential = getSuitableCredential(gravitinoCredentials); + Credential credential = S3Utils.getSuitableCredential(gravitinoCredentials); if (credential == null) { throw new RuntimeException("No suitable credential for S3 found..."); @@ -85,28 +85,4 @@ public void refresh() { * EXPIRATION_TIME_FACTOR); } } - - /** - * Get the credential from the credential array. Using dynamic credential first, if not found, - * uses static credential. - * - * @param credentials The credential array. - * @return A credential. Null if not found. - */ - static Credential getSuitableCredential(Credential[] credentials) { - // Use dynamic credential if found. 
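The precedence encoded above is the crux of every *Utils class introduced in this series: a vended, short-lived token always wins over a static key. A minimal, self-contained sketch of that selection order (the credential types below are local stand-ins for illustration, not the Gravitino classes):

public class CredentialSelectionSketch {
  interface Credential {}

  // Stand-in for a dynamic credential such as OSSTokenCredential.
  static final class TokenCredential implements Credential {}

  // Stand-in for a static credential such as OSSSecretKeyCredential.
  static final class SecretKeyCredential implements Credential {}

  // Mirrors OSSUtils.getSuitableCredential: scan for a token first, then fall back to keys.
  static Credential pick(Credential[] credentials) {
    for (Credential credential : credentials) {
      if (credential instanceof TokenCredential) {
        return credential;
      }
    }
    for (Credential credential : credentials) {
      if (credential instanceof SecretKeyCredential) {
        return credential;
      }
    }
    return null;
  }

  public static void main(String[] args) {
    Credential[] credentials = {new SecretKeyCredential(), new TokenCredential()};
    // Prints "true": the token is chosen regardless of array order.
    System.out.println(pick(credentials) instanceof TokenCredential);
  }
}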
- for (Credential credential : credentials) { - if (credential instanceof S3TokenCredential) { - return credential; - } - } - - // If dynamic credential not found, use the static one - for (Credential credential : credentials) { - if (credential instanceof S3SecretKeyCredential) { - return credential; - } - } - return null; - } } diff --git a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java index ed445a24e1e..cbe133ed778 100644 --- a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java +++ b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3FileSystemProvider.java @@ -79,7 +79,7 @@ public FileSystem getFileSystem(Path path, Map config) throws IO @Override public Map getFileSystemCredentialConf(Credential[] credentials) { - Credential credential = S3CredentialsProvider.getSuitableCredential(credentials); + Credential credential = S3Utils.getSuitableCredential(credentials); Map result = Maps.newHashMap(); if (credential instanceof S3SecretKeyCredential || credential instanceof S3TokenCredential) { result.put( diff --git a/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3Utils.java b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3Utils.java new file mode 100644 index 00000000000..078a1180ba4 --- /dev/null +++ b/bundles/aws/src/main/java/org/apache/gravitino/s3/fs/S3Utils.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.gravitino.s3.fs; + +import org.apache.gravitino.credential.Credential; +import org.apache.gravitino.credential.S3SecretKeyCredential; +import org.apache.gravitino.credential.S3TokenCredential; + +public class S3Utils { + + /** + * Get the credential from the credential array. Using dynamic credential first, if not found, + * uses static credential. + * + * @param credentials The credential array. + * @return A credential. Null if not found. + */ + static Credential getSuitableCredential(Credential[] credentials) { + // Use dynamic credential if found. 
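The map returned by getFileSystemCredentialConf is not applied by the provider itself; the GVFS client merges it into the properties that eventually become the Hadoop Configuration. A rough sketch of that merge step, assuming only that the vended entries are plain string key/value pairs (fs.s3a.aws.credentials.provider is the standard hadoop-aws key that Constants.AWS_CREDENTIALS_PROVIDER resolves to):

import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;

public class VendedConfSketch {
  // Copy the vended entries onto a Hadoop Configuration before the FileSystem is created.
  static Configuration apply(Map<String, String> vendedConf) {
    Configuration configuration = new Configuration();
    vendedConf.forEach(configuration::set);
    return configuration;
  }

  public static void main(String[] args) {
    Map<String, String> vended = new HashMap<>();
    vended.put(
        "fs.s3a.aws.credentials.provider",
        "org.apache.gravitino.s3.fs.S3CredentialsProvider");
    // Prints the provider class the S3A connector will instantiate for credentials.
    System.out.println(apply(vended).get("fs.s3a.aws.credentials.provider"));
  }
}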
+ for (Credential credential : credentials) { + if (credential instanceof S3TokenCredential) { + return credential; + } + } + + // If dynamic credential not found, use the static one + for (Credential credential : credentials) { + if (credential instanceof S3SecretKeyCredential) { + return credential; + } + } + return null; + } +} diff --git a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java index c8a65a4aba9..866f702c2f7 100644 --- a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java +++ b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java @@ -79,7 +79,7 @@ public FileSystem getFileSystem(@Nonnull Path path, @Nonnull Map @Override public Map getFileSystemCredentialConf(Credential[] credentials) { - Credential credential = AzureSasCredentialsProvider.getSuitableCredential(credentials); + Credential credential = AzureStorageUtils.getSuitableCredential(credentials); Map result = Maps.newHashMap(); if (credential instanceof ADLSTokenCredential) { ADLSTokenCredential adlsTokenCredential = (ADLSTokenCredential) credential; diff --git a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java index 0e0bbfef60f..0677039cad7 100644 --- a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java +++ b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java @@ -23,7 +23,6 @@ import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils; import org.apache.gravitino.catalog.hadoop.fs.GravitinoFileSystemCredentialsProvider; import org.apache.gravitino.credential.ADLSTokenCredential; -import org.apache.gravitino.credential.AzureAccountKeyCredential; import org.apache.gravitino.credential.Credential; import org.apache.hadoop.conf.Configurable; import org.apache.hadoop.conf.Configuration; @@ -36,7 +35,7 @@ public class AzureSasCredentialsProvider implements SASTokenProvider, Configurab private GravitinoFileSystemCredentialsProvider gravitinoFileSystemCredentialsProvider; private long expirationTime = Long.MAX_VALUE; - private static final double EXPIRATION_TIME_FACTOR = 0.9D; + private static final double EXPIRATION_TIME_FACTOR = 0.5D; @Override public void setConf(Configuration configuration) { @@ -67,7 +66,7 @@ public String getSASToken(String account, String fileSystem, String path, String private void refresh() { Credential[] gravitinoCredentials = gravitinoFileSystemCredentialsProvider.getCredentials(); - Credential credential = getSuitableCredential(gravitinoCredentials); + Credential credential = AzureStorageUtils.getSuitableCredential(gravitinoCredentials); if (credential == null) { throw new RuntimeException("No suitable credential for OSS found..."); } @@ -85,29 +84,4 @@ private void refresh() { * EXPIRATION_TIME_FACTOR); } } - - /** - * Get the credential from the credential array. Using dynamic credential first, if not found, - * uses static credential. - * - * @param credentials The credential array. - * @return A credential. Null if not found. - */ - static Credential getSuitableCredential(Credential[] credentials) { - // Use dynamic credential if found. 
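For ADLS the vended configuration is account-scoped: getFileSystemCredentialConf switches the account's auth type to SAS and registers AzureSasCredentialsProvider as the per-account SAS token provider. A hedged sketch of the resulting settings, assuming the standard hadoop-azure key names behind FS_AZURE_ACCOUNT_AUTH_TYPE_PROPERTY_NAME and FS_AZURE_SAS_TOKEN_PROVIDER_TYPE, with a hypothetical storage account:

import org.apache.hadoop.conf.Configuration;

public class AbfsSasWiringSketch {
  public static void main(String[] args) {
    Configuration configuration = new Configuration();
    // "example" is a placeholder; the real suffix comes from ADLSTokenCredential.accountName().
    String accountName = "example.dfs.core.windows.net";
    configuration.set("fs.azure.account.auth.type." + accountName, "SAS");
    configuration.set(
        "fs.azure.sas.token.provider.type." + accountName,
        "org.apache.gravitino.abs.fs.AzureSasCredentialsProvider");
    // Prints "SAS": this account now resolves its tokens through the registered provider.
    System.out.println(configuration.get("fs.azure.account.auth.type." + accountName));
  }
}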
- for (Credential credential : credentials) { - if (credential instanceof ADLSTokenCredential) { - return credential; - } - } - - // If dynamic credential not found, use the static one - for (Credential credential : credentials) { - if (credential instanceof AzureAccountKeyCredential) { - return credential; - } - } - - return null; - } } diff --git a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureStorageUtils.java b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureStorageUtils.java new file mode 100644 index 00000000000..5a9512361d6 --- /dev/null +++ b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureStorageUtils.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.gravitino.abs.fs; + +import org.apache.gravitino.credential.ADLSTokenCredential; +import org.apache.gravitino.credential.Credential; + +public class AzureStorageUtils { + + /** + * Get the credential from the credential array. Using dynamic credential first, if not found, + * uses static credential. + * + * @param credentials The credential array. + * @return A credential. Null if not found. + */ + static Credential getSuitableCredential(Credential[] credentials) { + // Use dynamic credential if found. + for (Credential credential : credentials) { + if (credential instanceof ADLSTokenCredential) { + return credential; + } + } + return null; + } +} diff --git a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java index 0ba8f1ff863..58bd88dcd03 100644 --- a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java +++ b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java @@ -33,7 +33,7 @@ public class GCSCredentialsProvider implements AccessTokenProvider { private AccessToken accessToken; private long expirationTime = Long.MAX_VALUE; - private static final double EXPIRATION_TIME_FACTOR = 0.9D; + private static final double EXPIRATION_TIME_FACTOR = 0.5D; @Override public AccessToken getAccessToken() { @@ -51,7 +51,7 @@ public AccessToken getAccessToken() { public void refresh() throws IOException { Credential[] gravitinoCredentials = gravitinoFileSystemCredentialsProvider.getCredentials(); - Credential credential = getSuitableCredential(gravitinoCredentials); + Credential credential = GCSUtils.getSuitableCredential(gravitinoCredentials); if (credential == null) { throw new RuntimeException("No suitable credential for OSS found..."); } @@ -81,21 +81,4 @@ public void setConf(Configuration configuration) { public Configuration getConf() { return configuration; } - - /** - * Get the credential from the credential array. 
Using dynamic credential first, if not found, - * uses static credential. - * - * @param credentials The credential array. - * @return An credential. - */ - static Credential getSuitableCredential(Credential[] credentials) { - for (Credential credential : credentials) { - if (credential instanceof GCSTokenCredential) { - return credential; - } - } - - return null; - } } diff --git a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java index 89574909948..07d44fd4396 100644 --- a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java +++ b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java @@ -54,7 +54,7 @@ public FileSystem getFileSystem(Path path, Map<String, String> config) throws IO @Override public Map<String, String> getFileSystemCredentialConf(Credential[] credentials) { - Credential credential = GCSCredentialsProvider.getSuitableCredential(credentials); + Credential credential = GCSUtils.getSuitableCredential(credentials); Map<String, String> result = Maps.newHashMap(); if (credential instanceof GCSTokenCredential) { result.put(GCS_TOKEN_PROVIDER_IMPL, GCSCredentialsProvider.class.getName()); diff --git a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSUtils.java b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSUtils.java new file mode 100644 index 00000000000..361ebdc13a2 --- /dev/null +++ b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSUtils.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.gravitino.gcs.fs; + +import org.apache.gravitino.credential.Credential; +import org.apache.gravitino.credential.GCSTokenCredential; + +public class GCSUtils { + /** + * Get the credential from the credential array. Using dynamic credential first, if not found, + * uses static credential. + * + * @param credentials The credential array. + * @return A credential.
+ */ + static Credential getSuitableCredential(Credential[] credentials) { + for (Credential credential : credentials) { + if (credential instanceof GCSTokenCredential) { + return credential; + } + } + + return null; + } +} diff --git a/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/GravitinoFileSystemCredentialsProvider.java b/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/GravitinoFileSystemCredentialsProvider.java index b53a3014d0e..40c0492c7f2 100644 --- a/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/GravitinoFileSystemCredentialsProvider.java +++ b/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/GravitinoFileSystemCredentialsProvider.java @@ -27,7 +27,7 @@ public interface GravitinoFileSystemCredentialsProvider extends Configurable { String GVFS_CREDENTIAL_PROVIDER = "fs.gvfs.credential.provider"; - String GVFS_PATH = "fs.gvfs.virtual.path"; + String GVFS_NAME_IDENTIFIER = "fs.gvfs.name.identifier"; /** * Get credentials for Gravitino Virtual File System. diff --git a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/DefaultGravitinoFileSystemCredentialsProvider.java b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/DefaultGravitinoFileSystemCredentialsProvider.java index 1df8bf6062e..129a7c98e90 100644 --- a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/DefaultGravitinoFileSystemCredentialsProvider.java +++ b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/DefaultGravitinoFileSystemCredentialsProvider.java @@ -19,10 +19,7 @@ package org.apache.gravitino.filesystem.hadoop; -import com.google.common.base.Preconditions; -import java.util.regex.Matcher; import java.util.regex.Pattern; -import org.apache.commons.lang3.StringUtils; import org.apache.gravitino.NameIdentifier; import org.apache.gravitino.catalog.hadoop.fs.GravitinoFileSystemCredentialsProvider; import org.apache.gravitino.client.GravitinoClient; @@ -40,9 +37,6 @@ public class DefaultGravitinoFileSystemCredentialsProvider private Configuration configuration; - private static final Pattern IDENTIFIER_PATTERN = - Pattern.compile("^(?:gvfs://fileset)?/([^/]+)/([^/]+)/([^/]+)(?>/[^/]+)*/?$"); - @Override public void setConf(Configuration configuration) { this.configuration = configuration; @@ -55,30 +49,13 @@ public Configuration getConf() { @Override public Credential[] getCredentials() { - String virtualPath = configuration.get(GVFS_PATH); - NameIdentifier nameIdentifier = getNameIdentifierFromVirtualPath(virtualPath); - String[] idents = nameIdentifier.namespace().levels(); + // The format of name identifier is `metalake.catalog.schema.fileset` + String nameIdentifier = configuration.get(GVFS_NAME_IDENTIFIER); + String[] idents = nameIdentifier.split("\\."); try (GravitinoClient client = GravitinoVirtualFileSystemUtils.createClient(configuration)) { - FilesetCatalog filesetCatalog = client.loadCatalog(idents[0]).asFilesetCatalog(); - Fileset fileset = - filesetCatalog.loadFileset(NameIdentifier.of(idents[1], nameIdentifier.name())); + FilesetCatalog filesetCatalog = client.loadCatalog(idents[1]).asFilesetCatalog(); + Fileset fileset = filesetCatalog.loadFileset(NameIdentifier.of(idents[2], idents[3])); return fileset.supportsCredentials().getCredentials(); } } - - private NameIdentifier getNameIdentifierFromVirtualPath(String gravitinoVirtualPath) { - String virtualPath = 
gravitinoVirtualPath.toString(); - Preconditions.checkArgument( - StringUtils.isNotBlank(virtualPath), - "Uri which need be extracted cannot be null or empty."); - - Matcher matcher = IDENTIFIER_PATTERN.matcher(virtualPath); - Preconditions.checkArgument( - matcher.matches() && matcher.groupCount() == 3, - "URI %s doesn't contains valid identifier", - virtualPath); - - // The format is `catalog.schema.fileset` - return NameIdentifier.of(matcher.group(1), matcher.group(2), matcher.group(3)); - } } diff --git a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java index f721521e63d..26d248736a9 100644 --- a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java +++ b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java @@ -23,6 +23,7 @@ import com.github.benmanes.caffeine.cache.Scheduler; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableMap; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import com.google.common.collect.Sets; @@ -316,8 +317,7 @@ private FilesetContextPair getFilesetContext(Path virtualPath, FilesetDataOperat Map<String, String> totalProperty = Maps.newHashMap(necessaryPropertyFromCatalog); totalProperty.putAll(getConfigMap(getConf())); - totalProperty.putAll( - getCredentialProperties(provider, catalog, identifier, virtualPathString)); + totalProperty.putAll(getCredentialProperties(provider, catalog, identifier)); return provider.getFileSystem(filePath, totalProperty); } catch (IOException ioe) { @@ -331,11 +331,13 @@ private FilesetContextPair getFilesetContext(Path virtualPath, FilesetDataOperat } private Map<String, String> getCredentialProperties( - FileSystemProvider fileSystemProvider, - Catalog catalog, - NameIdentifier filesetIdentifier, - String virtualPathString) { - Map<String, String> maps = Maps.newHashMap(); + FileSystemProvider fileSystemProvider, Catalog catalog, NameIdentifier filesetIdentifier) { + // If the provider does not support credential vending, there is no need to add any credential properties.
+ if (!(fileSystemProvider instanceof SupportsCredentialVending)) { + return ImmutableMap.of(); + } + + ImmutableMap.Builder<String, String> mapBuilder = ImmutableMap.builder(); try { Fileset fileset = catalog @@ -344,21 +346,23 @@ private Map<String, String> getCredentialProperties( NameIdentifier.of( filesetIdentifier.namespace().level(2), filesetIdentifier.name())); Credential[] credentials = fileset.supportsCredentials().getCredentials(); - if (credentials.length > 0 && fileSystemProvider instanceof SupportsCredentialVending) { - maps.put( + if (credentials.length > 0) { + mapBuilder.put( GravitinoFileSystemCredentialsProvider.GVFS_CREDENTIAL_PROVIDER, DefaultGravitinoFileSystemCredentialsProvider.class.getCanonicalName()); - maps.put(GravitinoFileSystemCredentialsProvider.GVFS_PATH, virtualPathString); + mapBuilder.put( + GravitinoFileSystemCredentialsProvider.GVFS_NAME_IDENTIFIER, + filesetIdentifier.toString()); SupportsCredentialVending supportsCredentialVending = (SupportsCredentialVending) fileSystemProvider; - maps.putAll(supportsCredentialVending.getFileSystemCredentialConf(credentials)); + mapBuilder.putAll(supportsCredentialVending.getFileSystemCredentialConf(credentials)); } } catch (Exception e) { throw new RuntimeException(e); } - return maps; + return mapBuilder.build(); } private void resetFileSystemServiceLoader(String fsScheme) { diff --git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemGCSCredentialIT.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemGCSCredentialIT.java index 04fbf209588..b2f324aa249 100644 --- a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemGCSCredentialIT.java +++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemGCSCredentialIT.java @@ -137,7 +137,7 @@ protected Configuration convertGvfsConfigToRealFileSystemConfig(Configuration gv } protected String genStorageLocation(String fileset) { - return String.format("gs://%s/%s", BUCKET_NAME, fileset); + return String.format("gs://%s/dir1/dir2/%s/", BUCKET_NAME, fileset); } @Disabled( diff --git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemOSSCredentialIT.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemOSSCredentialIT.java index db33533eece..662e8f6e464 100644 --- a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemOSSCredentialIT.java +++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemOSSCredentialIT.java @@ -47,12 +47,12 @@ public class GravitinoVirtualFileSystemOSSCredentialIT extends GravitinoVirtualF private static final Logger LOG = LoggerFactory.getLogger(GravitinoVirtualFileSystemOSSCredentialIT.class); - public static final String BUCKET_NAME = System.getenv("OSS_STS_BUCKET_NAME"); - public static final String OSS_ACCESS_KEY = System.getenv("OSS_STS_ACCESS_KEY_ID"); - public static final String OSS_SECRET_KEY = System.getenv("OSS_STS_SECRET_ACCESS_KEY"); - public static final String OSS_ENDPOINT = System.getenv("OSS_STS_ENDPOINT"); - public static final String OSS_REGION = System.getenv("OSS_STS_REGION"); - public
static final String OSS_ROLE_ARN = System.getenv("OSS_STS_ROLE_ARN"); + public static final String BUCKET_NAME = System.getenv("OSS_BUCKET_NAME_FOR_CREDENTIAL"); + public static final String OSS_ACCESS_KEY = System.getenv("OSS_ACCESS_KEY_ID_FOR_CREDENTIAL"); + public static final String OSS_SECRET_KEY = System.getenv("OSS_SECRET_ACCESS_KEY_FOR_CREDENTIAL"); + public static final String OSS_ENDPOINT = System.getenv("OSS_ENDPOINT_FOR_CREDENTIAL"); + public static final String OSS_REGION = System.getenv("OSS_REGION_FOR_CREDENTIAL"); + public static final String OSS_ROLE_ARN = System.getenv("OSS_ROLE_ARN_FOR_CREDENTIAL"); @BeforeAll public void startIntegrationTest() { @@ -158,11 +158,11 @@ protected String genStorageLocation(String fileset) { public void testAppend() throws IOException {} protected static boolean ossIsConfigured() { - return StringUtils.isNotBlank(System.getenv("OSS_STS_ACCESS_KEY_ID")) - && StringUtils.isNotBlank(System.getenv("OSS_STS_SECRET_ACCESS_KEY")) - && StringUtils.isNotBlank(System.getenv("OSS_STS_ENDPOINT")) - && StringUtils.isNotBlank(System.getenv("OSS_STS_BUCKET_NAME")) - && StringUtils.isNotBlank(System.getenv("OSS_STS_REGION")) - && StringUtils.isNotBlank(System.getenv("OSS_STS_ROLE_ARN")); + return StringUtils.isNotBlank(System.getenv("OSS_ACCESS_KEY_ID_FOR_CREDENTIAL")) + && StringUtils.isNotBlank(System.getenv("OSS_SECRET_ACCESS_KEY_FOR_CREDENTIAL")) + && StringUtils.isNotBlank(System.getenv("OSS_ENDPOINT_FOR_CREDENTIAL")) + && StringUtils.isNotBlank(System.getenv("OSS_BUCKET_NAME_FOR_CREDENTIAL")) + && StringUtils.isNotBlank(System.getenv("OSS_REGION_FOR_CREDENTIAL")) + && StringUtils.isNotBlank(System.getenv("OSS_ROLE_ARN_FOR_CREDENTIAL")); } } diff --git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemS3CredentialIT.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemS3CredentialIT.java index bb087cfc1d1..12d5309675d 100644 --- a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemS3CredentialIT.java +++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemS3CredentialIT.java @@ -47,12 +47,12 @@ public class GravitinoVirtualFileSystemS3CredentialIT extends GravitinoVirtualFi private static final Logger LOG = LoggerFactory.getLogger(GravitinoVirtualFileSystemS3CredentialIT.class); - public static final String BUCKET_NAME = System.getenv("S3_STS_BUCKET_NAME"); - public static final String S3_ACCESS_KEY = System.getenv("S3_STS_ACCESS_KEY_ID"); - public static final String S3_SECRET_KEY = System.getenv("S3_STS_SECRET_ACCESS_KEY"); - public static final String S3_ENDPOINT = System.getenv("S3_STS_ENDPOINT"); - public static final String S3_REGION = System.getenv("S3_STS_REGION"); - public static final String S3_ROLE_ARN = System.getenv("S3_STS_ROLE_ARN"); + public static final String BUCKET_NAME = System.getenv("S3_BUCKET_NAME_FOR_CREDENTIAL"); + public static final String S3_ACCESS_KEY = System.getenv("S3_ACCESS_KEY_ID_FOR_CREDENTIAL"); + public static final String S3_SECRET_KEY = System.getenv("S3_SECRET_ACCESS_KEY_FOR_CREDENTIAL"); + public static final String S3_ENDPOINT = System.getenv("S3_ENDPOINT_FOR_CREDENTIAL"); + public static final String S3_REGION = System.getenv("S3_REGION_FOR_CREDENTIAL"); + public static final String S3_ROLE_ARN = 
System.getenv("S3_ROLE_ARN_FOR_CREDENTIAL"); @BeforeAll public void startIntegrationTest() { @@ -163,11 +163,11 @@ protected String genStorageLocation(String fileset) { public void testAppend() throws IOException {} protected static boolean s3IsConfigured() { - return StringUtils.isNotBlank(System.getenv("S3_STS_ACCESS_KEY_ID")) - && StringUtils.isNotBlank(System.getenv("S3_STS_SECRET_ACCESS_KEY")) - && StringUtils.isNotBlank(System.getenv("S3_STS_ENDPOINT")) - && StringUtils.isNotBlank(System.getenv("S3_STS_BUCKET_NAME")) - && StringUtils.isNotBlank(System.getenv("S3_STS_REGION")) - && StringUtils.isNotBlank(System.getenv("S3_STS_ROLE_ARN")); + return StringUtils.isNotBlank(System.getenv("S3_ACCESS_KEY_ID_FOR_CREDENTIAL")) + && StringUtils.isNotBlank(System.getenv("S3_SECRET_ACCESS_KEY_FOR_CREDENTIAL")) + && StringUtils.isNotBlank(System.getenv("S3_ENDPOINT_FOR_CREDENTIAL")) + && StringUtils.isNotBlank(System.getenv("S3_BUCKET_NAME_FOR_CREDENTIAL")) + && StringUtils.isNotBlank(System.getenv("S3_REGION_FOR_CREDENTIAL")) + && StringUtils.isNotBlank(System.getenv("S3_ROLE_ARN_FOR_CREDENTIAL")); } } From f646aed75c5a9429737825aaae57f81880ab5d11 Mon Sep 17 00:00:00 2001 From: yuqi Date: Thu, 9 Jan 2025 21:15:07 +0800 Subject: [PATCH 54/59] fix --- .../hadoop/DefaultGravitinoFileSystemCredentialsProvider.java | 1 - 1 file changed, 1 deletion(-) diff --git a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/DefaultGravitinoFileSystemCredentialsProvider.java b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/DefaultGravitinoFileSystemCredentialsProvider.java index 129a7c98e90..2f3278f8744 100644 --- a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/DefaultGravitinoFileSystemCredentialsProvider.java +++ b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/DefaultGravitinoFileSystemCredentialsProvider.java @@ -19,7 +19,6 @@ package org.apache.gravitino.filesystem.hadoop; -import java.util.regex.Pattern; import org.apache.gravitino.NameIdentifier; import org.apache.gravitino.catalog.hadoop.fs.GravitinoFileSystemCredentialsProvider; import org.apache.gravitino.client.GravitinoClient; From ee441f7957e33cdf67054d734d4e292910a33f15 Mon Sep 17 00:00:00 2001 From: yuqi Date: Thu, 9 Jan 2025 22:23:27 +0800 Subject: [PATCH 55/59] fix --- .../org/apache/gravitino/abs/fs/AzureFileSystemProvider.java | 5 +++-- .../java/org/apache/gravitino/abs/fs/AzureStorageUtils.java | 4 ++-- .../src/main/java/org/apache/gravitino/gcs/fs/GCSUtils.java | 4 ++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java index 866f702c2f7..3dcbb502f62 100644 --- a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java +++ b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java @@ -74,7 +74,7 @@ public FileSystem getFileSystem(@Nonnull Path path, @Nonnull Map hadoopConfMap.forEach(configuration::set); - return FileSystem.get(path.toUri(), configuration); + return FileSystem.newInstance(path.toUri(), configuration); } @Override @@ -84,7 +84,8 @@ public Map getFileSystemCredentialConf(Credential[] credentials) if (credential instanceof ADLSTokenCredential) { ADLSTokenCredential adlsTokenCredential = (ADLSTokenCredential) credential; - String accountName = 
adlsTokenCredential.accountName(); + String accountName = + String.format("%s.dfs.core.windows.net", adlsTokenCredential.accountName()); result.put(FS_AZURE_ACCOUNT_AUTH_TYPE_PROPERTY_NAME + "." + accountName, AuthType.SAS.name()); result.put( FS_AZURE_SAS_TOKEN_PROVIDER_TYPE + "." + accountName, diff --git a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureStorageUtils.java b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureStorageUtils.java index 5a9512361d6..66790c60da5 100644 --- a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureStorageUtils.java +++ b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureStorageUtils.java @@ -25,8 +25,8 @@ public class AzureStorageUtils { /** - * Get the credential from the credential array. Using dynamic credential first, if not found, - * uses static credential. + * Get the ADLS credential from the credential array. If the dynamic credential is not found, + * return null. * * @param credentials The credential array. * @return A credential. Null if not found. diff --git a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSUtils.java b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSUtils.java index 361ebdc13a2..bf71fc89f04 100644 --- a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSUtils.java +++ b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSUtils.java @@ -24,8 +24,8 @@ public class GCSUtils { /** - * Get the credential from the credential array. Using dynamic credential first, if not found, - * uses static credential. + * Get the credential from the credential array. If the dynamic credential is not found, return + * null. * * @param credentials The credential array. * @return A credential. From da98dcf53fe5371f498c3f73d4df323cb66b636e Mon Sep 17 00:00:00 2001 From: yuqi Date: Thu, 9 Jan 2025 23:26:27 +0800 Subject: [PATCH 56/59] fix --- .../test/GravitinoVirtualFileSystemGCSCredentialIT.java | 1 - 1 file changed, 1 deletion(-) diff --git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemGCSCredentialIT.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemGCSCredentialIT.java index b2f324aa249..81b352fa55c 100644 --- a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemGCSCredentialIT.java +++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemGCSCredentialIT.java @@ -76,7 +76,6 @@ public void startUp() throws Exception { Map<String, String> properties = Maps.newHashMap(); properties.put(FILESYSTEM_PROVIDERS, "gcs"); properties.put(GCSProperties.GRAVITINO_GCS_SERVICE_ACCOUNT_FILE, SERVICE_ACCOUNT_FILE); - properties.put("gcs-credential-file-path", SERVICE_ACCOUNT_FILE); properties.put(CredentialConstants.CREDENTIAL_PROVIDERS, "gcs-token"); Catalog catalog = From 41c6dfe37cf3189c0713ec4cf2636d6ad0f9300b Mon Sep 17 00:00:00 2001 From: yuqi Date: Fri, 10 Jan 2025 08:41:20 +0800 Subject: [PATCH 57/59] Fix --- .../org/apache/gravitino/abs/fs/AzureFileSystemProvider.java | 2 +- .../apache/gravitino/abs/fs/AzureSasCredentialsProvider.java | 2 +- .../java/org/apache/gravitino/abs/fs/AzureStorageUtils.java | 2 +- .../org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java | 2 +- .../java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java | 2 +-
.../gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSUtils.java | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java index 3dcbb502f62..762f7fe44e4 100644 --- a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java +++ b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java @@ -79,7 +79,7 @@ public FileSystem getFileSystem(@Nonnull Path path, @Nonnull Map @Override public Map getFileSystemCredentialConf(Credential[] credentials) { - Credential credential = AzureStorageUtils.getSuitableCredential(credentials); + Credential credential = AzureStorageUtils.getAzureStorageTokenCredential(credentials); Map result = Maps.newHashMap(); if (credential instanceof ADLSTokenCredential) { ADLSTokenCredential adlsTokenCredential = (ADLSTokenCredential) credential; diff --git a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java index 0677039cad7..183445a82eb 100644 --- a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java +++ b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java @@ -66,7 +66,7 @@ public String getSASToken(String account, String fileSystem, String path, String private void refresh() { Credential[] gravitinoCredentials = gravitinoFileSystemCredentialsProvider.getCredentials(); - Credential credential = AzureStorageUtils.getSuitableCredential(gravitinoCredentials); + Credential credential = AzureStorageUtils.getAzureStorageTokenCredential(gravitinoCredentials); if (credential == null) { throw new RuntimeException("No suitable credential for OSS found..."); } diff --git a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureStorageUtils.java b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureStorageUtils.java index 66790c60da5..a265fec31a5 100644 --- a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureStorageUtils.java +++ b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureStorageUtils.java @@ -31,7 +31,7 @@ public class AzureStorageUtils { * @param credentials The credential array. * @return A credential. Null if not found. */ - static Credential getSuitableCredential(Credential[] credentials) { + static Credential getAzureStorageTokenCredential(Credential[] credentials) { // Use dynamic credential if found. 
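All four bundle providers share the same proactive refresh schedule: with EXPIRATION_TIME_FACTOR at 0.5, the cached credential is treated as stale halfway between the time it was fetched and its expiration, so a replacement is requested well before the token actually lapses. The arithmetic, isolated as a small self-contained sketch:

public class RefreshTimingSketch {
  private static final double EXPIRATION_TIME_FACTOR = 0.5D;

  // Mirrors the expirationTime computation in the S3/OSS/GCS/ADLS credential providers.
  static long nextRefreshAt(long nowMs, long expireTimeMs) {
    return nowMs + (long) ((expireTimeMs - nowMs) * EXPIRATION_TIME_FACTOR);
  }

  public static void main(String[] args) {
    // A token valid for one hour is refreshed after 30 minutes; prints 1800000.
    System.out.println(nextRefreshAt(0L, 3_600_000L));
  }
}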
for (Credential credential : credentials) { if (credential instanceof ADLSTokenCredential) { diff --git a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java index 58bd88dcd03..c4eefeeebe0 100644 --- a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java +++ b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSCredentialsProvider.java @@ -51,7 +51,7 @@ public AccessToken getAccessToken() { public void refresh() throws IOException { Credential[] gravitinoCredentials = gravitinoFileSystemCredentialsProvider.getCredentials(); - Credential credential = GCSUtils.getSuitableCredential(gravitinoCredentials); + Credential credential = GCSUtils.getGCSTokenCredential(gravitinoCredentials); if (credential == null) { throw new RuntimeException("No suitable credential for OSS found..."); } diff --git a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java index 07d44fd4396..7ab38b2d7a9 100644 --- a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java +++ b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSFileSystemProvider.java @@ -54,7 +54,7 @@ public FileSystem getFileSystem(Path path, Map<String, String> config) throws IO @Override public Map<String, String> getFileSystemCredentialConf(Credential[] credentials) { - Credential credential = GCSUtils.getSuitableCredential(credentials); + Credential credential = GCSUtils.getGCSTokenCredential(credentials); Map<String, String> result = Maps.newHashMap(); if (credential instanceof GCSTokenCredential) { result.put(GCS_TOKEN_PROVIDER_IMPL, GCSCredentialsProvider.class.getName()); diff --git a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSUtils.java b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSUtils.java index bf71fc89f04..f8fbfd6351b 100644 --- a/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSUtils.java +++ b/bundles/gcp/src/main/java/org/apache/gravitino/gcs/fs/GCSUtils.java @@ -30,7 +30,7 @@ public class GCSUtils { * @param credentials The credential array. * @return A credential.
*/ - static Credential getSuitableCredential(Credential[] credentials) { + static Credential getGCSTokenCredential(Credential[] credentials) { for (Credential credential : credentials) { if (credential instanceof GCSTokenCredential) { return credential; From f3f5676dee0123ca0ce698801c331c9273e339aa Mon Sep 17 00:00:00 2001 From: yuqi Date: Fri, 10 Jan 2025 08:47:58 +0800 Subject: [PATCH 58/59] Fix --- .../gravitino/abs/fs/AzureFileSystemProvider.java | 2 +- .../abs/fs/AzureSasCredentialsProvider.java | 2 +- .../apache/gravitino/abs/fs/AzureStorageUtils.java | 14 +++++++++++--- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java index 762f7fe44e4..3dcbb502f62 100644 --- a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java +++ b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java @@ -79,7 +79,7 @@ public FileSystem getFileSystem(@Nonnull Path path, @Nonnull Map<String, String> @Override public Map<String, String> getFileSystemCredentialConf(Credential[] credentials) { - Credential credential = AzureStorageUtils.getAzureStorageTokenCredential(credentials); + Credential credential = AzureStorageUtils.getSuitableCredential(credentials); Map<String, String> result = Maps.newHashMap(); if (credential instanceof ADLSTokenCredential) { ADLSTokenCredential adlsTokenCredential = (ADLSTokenCredential) credential; diff --git a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java index 183445a82eb..0677039cad7 100644 --- a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java +++ b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java @@ -66,7 +66,7 @@ public String getSASToken(String account, String fileSystem, String path, String private void refresh() { Credential[] gravitinoCredentials = gravitinoFileSystemCredentialsProvider.getCredentials(); - Credential credential = AzureStorageUtils.getAzureStorageTokenCredential(gravitinoCredentials); + Credential credential = AzureStorageUtils.getSuitableCredential(gravitinoCredentials); if (credential == null) { throw new RuntimeException("No suitable credential for OSS found..."); } diff --git a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureStorageUtils.java b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureStorageUtils.java index 66790c60da5..589c314d12e 100644 --- a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureStorageUtils.java +++ b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureStorageUtils.java @@ -20,24 +20,32 @@ package org.apache.gravitino.abs.fs; import org.apache.gravitino.credential.ADLSTokenCredential; +import org.apache.gravitino.credential.AzureAccountKeyCredential; import org.apache.gravitino.credential.Credential; public class AzureStorageUtils { /** - * Get the ADLS credential from the credential array. If the dynamic credential is not found, - * return null. + * Get the ADLS credential from the credential array. Fall back to the account key credential + * if the dynamic token is not found; return null if neither is found. * * @param credentials The credential array. * @return A credential. Null if not found.
*/ - static Credential getAzureStorageTokenCredential(Credential[] credentials) { + static Credential getSuitableCredential(Credential[] credentials) { // Use dynamic credential if found. for (Credential credential : credentials) { if (credential instanceof ADLSTokenCredential) { return credential; } } + + for (Credential credential : credentials) { + if (credential instanceof AzureAccountKeyCredential) { + return credential; + } + } + return null; } } From 57db471e210b92032a62fe3803a4865fd00e868c Mon Sep 17 00:00:00 2001 From: yuqi Date: Fri, 10 Jan 2025 09:16:22 +0800 Subject: [PATCH 59/59] Fix --- .../abs/fs/AzureSasCredentialsProvider.java | 18 +++++++++--------- .../gravitino/abs/fs/AzureStorageUtils.java | 17 ++++++++++++++++- 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java index 0677039cad7..85793d3d973 100644 --- a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java +++ b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureSasCredentialsProvider.java @@ -66,22 +66,22 @@ public String getSASToken(String account, String fileSystem, String path, String private void refresh() { Credential[] gravitinoCredentials = gravitinoFileSystemCredentialsProvider.getCredentials(); - Credential credential = AzureStorageUtils.getSuitableCredential(gravitinoCredentials); + Credential credential = AzureStorageUtils.getADLSTokenCredential(gravitinoCredentials); if (credential == null) { - throw new RuntimeException("No suitable credential for OSS found..."); + throw new RuntimeException("No token credential for ADLS found..."); } if (credential instanceof ADLSTokenCredential) { ADLSTokenCredential adlsTokenCredential = (ADLSTokenCredential) credential; sasToken = adlsTokenCredential.sasToken(); - } - if (credential.expireTimeInMs() > 0) { - expirationTime = - System.currentTimeMillis() - + (long) - ((credential.expireTimeInMs() - System.currentTimeMillis()) - * EXPIRATION_TIME_FACTOR); + if (credential.expireTimeInMs() > 0) { + expirationTime = + System.currentTimeMillis() + + (long) + ((credential.expireTimeInMs() - System.currentTimeMillis()) + * EXPIRATION_TIME_FACTOR); + } } } } diff --git a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureStorageUtils.java b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureStorageUtils.java index 589c314d12e..873f61930ee 100644 --- a/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureStorageUtils.java +++ b/bundles/azure/src/main/java/org/apache/gravitino/abs/fs/AzureStorageUtils.java @@ -33,7 +33,6 @@ public class AzureStorageUtils { /** - * Get the ADLS credential from the credential array. Use the account and secret if dynamic token - * is not found, null if both are not found.
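To close the loop on the client side: the GVFS client publishes the fileset identifier under GVFS_NAME_IDENTIFIER as a single dotted string, and DefaultGravitinoFileSystemCredentialsProvider splits it to locate the fileset whose credentials it should fetch. A self-contained sketch of that parse with hypothetical names; note the implicit assumption that none of the four name parts contains a dot:

public class NameIdentifierParseSketch {
  public static void main(String[] args) {
    // The format of the name identifier is `metalake.catalog.schema.fileset`.
    String nameIdentifier = "my_metalake.my_catalog.my_schema.my_fileset";
    String[] idents = nameIdentifier.split("\\.");
    // idents[1] selects the catalog, idents[2] the schema, idents[3] the fileset,
    // matching the indices used by the provider in this series.
    System.out.printf("catalog=%s, schema=%s, fileset=%s%n", idents[1], idents[2], idents[3]);
  }
}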