From c46330151b19826981c8471be9ba3a0864c98073 Mon Sep 17 00:00:00 2001 From: David Phillips Date: Mon, 19 Aug 2024 13:20:02 -0700 Subject: [PATCH] Allow configuring endpoint for Azure FS --- .../object-storage/file-system-azure.md | 6 +++ .../filesystem/azure/AzureFileSystem.java | 15 ++++++- .../azure/AzureFileSystemConfig.java | 16 ++++++- .../azure/AzureFileSystemFactory.java | 6 ++- .../trino/filesystem/azure/AzureLocation.java | 42 +++++++++++-------- .../azure/TestAzureFileSystemConfig.java | 3 ++ .../filesystem/azure/TestAzureLocation.java | 32 +++++++------- 7 files changed, 83 insertions(+), 37 deletions(-) diff --git a/docs/src/main/sphinx/object-storage/file-system-azure.md b/docs/src/main/sphinx/object-storage/file-system-azure.md index 3e74dd2c5677..fa8da1c62ffc 100644 --- a/docs/src/main/sphinx/object-storage/file-system-azure.md +++ b/docs/src/main/sphinx/object-storage/file-system-azure.md @@ -27,6 +27,12 @@ system support: authentication used with `NONE`. Use `ACCESS_KEY` for [](azure-access-key-authentication) or and `OAUTH` for [](azure-oauth-authentication). +* - `azure.endpoint` + - Hostname suffix of the Azure storage endpoint. + Defaults to `core.windows.net` for the global Azure cloud. + Use `core.usgovcloudapi.net` for the Azure US Government cloud, + `core.cloudapi.de` for the Azure Germany cloud, + or `core.chinacloudapi.cn` for the Azure China cloud. * - `azure.read-block-size` - [Data size](prop-type-data-size) for blocks during read operations. Defaults to `4MB`. diff --git a/lib/trino-filesystem-azure/src/main/java/io/trino/filesystem/azure/AzureFileSystem.java b/lib/trino-filesystem-azure/src/main/java/io/trino/filesystem/azure/AzureFileSystem.java index 7f7d333072db..9adf20bbd4d1 100644 --- a/lib/trino-filesystem-azure/src/main/java/io/trino/filesystem/azure/AzureFileSystem.java +++ b/lib/trino-filesystem-azure/src/main/java/io/trino/filesystem/azure/AzureFileSystem.java @@ -63,6 +63,7 @@ public class AzureFileSystem private final HttpClient httpClient; private final TracingOptions tracingOptions; private final AzureAuth azureAuth; + private final String endpoint; private final int readBlockSizeBytes; private final long writeBlockSizeBytes; private final int maxWriteConcurrency; @@ -72,6 +73,7 @@ public AzureFileSystem( HttpClient httpClient, TracingOptions tracingOptions, AzureAuth azureAuth, + String endpoint, DataSize readBlockSize, DataSize writeBlockSize, int maxWriteConcurrency, @@ -80,6 +82,7 @@ public AzureFileSystem( this.httpClient = requireNonNull(httpClient, "httpClient is null"); this.tracingOptions = requireNonNull(tracingOptions, "tracingOptions is null"); this.azureAuth = requireNonNull(azureAuth, "azureAuth is null"); + this.endpoint = requireNonNull(endpoint, "endpoint is null"); this.readBlockSizeBytes = toIntExact(readBlockSize.toBytes()); this.writeBlockSizeBytes = writeBlockSize.toBytes(); checkArgument(maxWriteConcurrency >= 0, "maxWriteConcurrency is negative"); @@ -450,6 +453,14 @@ private boolean isHierarchicalNamespaceEnabled(AzureLocation location) } } + private String validatedEndpoint(AzureLocation location) + { + if (!location.endpoint().equals(endpoint)) { + throw new IllegalArgumentException("Location does not match configured Azure endpoint: " + location); + } + return location.endpoint(); + } + private BlobClient createBlobClient(AzureLocation location) { return createBlobContainerClient(location).getBlobClient(location.path()); @@ -462,7 +473,7 @@ private BlobContainerClient createBlobContainerClient(AzureLocation location) BlobContainerClientBuilder builder = new BlobContainerClientBuilder() .httpClient(httpClient) .clientOptions(new ClientOptions().setTracingOptions(tracingOptions)) - .endpoint(String.format("https://%s.blob.core.windows.net", location.account())); + .endpoint("https://%s.blob.%s".formatted(location.account(), validatedEndpoint(location))); azureAuth.setAuth(location.account(), builder); location.container().ifPresent(builder::containerName); return builder.buildClient(); @@ -475,7 +486,7 @@ private DataLakeFileSystemClient createFileSystemClient(AzureLocation location) DataLakeServiceClientBuilder builder = new DataLakeServiceClientBuilder() .httpClient(httpClient) .clientOptions(new ClientOptions().setTracingOptions(tracingOptions)) - .endpoint(String.format("https://%s.dfs.core.windows.net", location.account())); + .endpoint("https://%s.dfs.%s".formatted(location.account(), validatedEndpoint(location))); azureAuth.setAuth(location.account(), builder); DataLakeServiceClient client = builder.buildClient(); DataLakeFileSystemClient fileSystemClient = client.getFileSystemClient(location.container().orElseThrow()); diff --git a/lib/trino-filesystem-azure/src/main/java/io/trino/filesystem/azure/AzureFileSystemConfig.java b/lib/trino-filesystem-azure/src/main/java/io/trino/filesystem/azure/AzureFileSystemConfig.java index 2425974233ce..acd01a75e5da 100644 --- a/lib/trino-filesystem-azure/src/main/java/io/trino/filesystem/azure/AzureFileSystemConfig.java +++ b/lib/trino-filesystem-azure/src/main/java/io/trino/filesystem/azure/AzureFileSystemConfig.java @@ -16,6 +16,7 @@ import io.airlift.configuration.Config; import io.airlift.units.DataSize; import io.airlift.units.DataSize.Unit; +import jakarta.validation.constraints.NotEmpty; import jakarta.validation.constraints.NotNull; public class AzureFileSystemConfig @@ -28,7 +29,7 @@ public enum AuthType } private AuthType authType = AuthType.DEFAULT; - + private String endpoint = "core.windows.net"; private DataSize readBlockSize = DataSize.of(4, Unit.MEGABYTE); private DataSize writeBlockSize = DataSize.of(4, Unit.MEGABYTE); private int maxWriteConcurrency = 8; @@ -47,6 +48,19 @@ public AzureFileSystemConfig setAuthType(AuthType authType) return this; } + @NotEmpty + public String getEndpoint() + { + return endpoint; + } + + @Config("azure.endpoint") + public AzureFileSystemConfig setEndpoint(String endpoint) + { + this.endpoint = endpoint; + return this; + } + @NotNull public DataSize getReadBlockSize() { diff --git a/lib/trino-filesystem-azure/src/main/java/io/trino/filesystem/azure/AzureFileSystemFactory.java b/lib/trino-filesystem-azure/src/main/java/io/trino/filesystem/azure/AzureFileSystemFactory.java index 6a2c4b488463..2f00b9ca9423 100644 --- a/lib/trino-filesystem-azure/src/main/java/io/trino/filesystem/azure/AzureFileSystemFactory.java +++ b/lib/trino-filesystem-azure/src/main/java/io/trino/filesystem/azure/AzureFileSystemFactory.java @@ -38,6 +38,7 @@ public class AzureFileSystemFactory implements TrinoFileSystemFactory { private final AzureAuth auth; + private final String endpoint; private final DataSize readBlockSize; private final DataSize writeBlockSize; private final int maxWriteConcurrency; @@ -51,6 +52,7 @@ public AzureFileSystemFactory(OpenTelemetry openTelemetry, AzureAuth azureAuth, { this(openTelemetry, azureAuth, + config.getEndpoint(), config.getReadBlockSize(), config.getWriteBlockSize(), config.getMaxWriteConcurrency(), @@ -60,12 +62,14 @@ public AzureFileSystemFactory(OpenTelemetry openTelemetry, AzureAuth azureAuth, public AzureFileSystemFactory( OpenTelemetry openTelemetry, AzureAuth azureAuth, + String endpoint, DataSize readBlockSize, DataSize writeBlockSize, int maxWriteConcurrency, DataSize maxSingleUploadSize) { this.auth = requireNonNull(azureAuth, "azureAuth is null"); + this.endpoint = requireNonNull(endpoint, "endpoint is null"); this.readBlockSize = requireNonNull(readBlockSize, "readBlockSize is null"); this.writeBlockSize = requireNonNull(writeBlockSize, "writeBlockSize is null"); checkArgument(maxWriteConcurrency >= 0, "maxWriteConcurrency is negative"); @@ -89,7 +93,7 @@ public void destroy() @Override public TrinoFileSystem create(ConnectorIdentity identity) { - return new AzureFileSystem(httpClient, tracingOptions, auth, readBlockSize, writeBlockSize, maxWriteConcurrency, maxSingleUploadSize); + return new AzureFileSystem(httpClient, tracingOptions, auth, endpoint, readBlockSize, writeBlockSize, maxWriteConcurrency, maxSingleUploadSize); } public static HttpClient createAzureHttpClient(OkHttpClient okHttpClient, HttpClientOptions clientOptions) diff --git a/lib/trino-filesystem-azure/src/main/java/io/trino/filesystem/azure/AzureLocation.java b/lib/trino-filesystem-azure/src/main/java/io/trino/filesystem/azure/AzureLocation.java index 512bde40fc8b..aef9490fe216 100644 --- a/lib/trino-filesystem-azure/src/main/java/io/trino/filesystem/azure/AzureLocation.java +++ b/lib/trino-filesystem-azure/src/main/java/io/trino/filesystem/azure/AzureLocation.java @@ -23,7 +23,7 @@ class AzureLocation { - private static final String INVALID_LOCATION_MESSAGE = "Invalid Azure location. Expected form is 'abfs://[@].dfs.core.windows.net/': %s"; + private static final String INVALID_LOCATION_MESSAGE = "Invalid Azure location. Expected form is 'abfs://[@].dfs./': %s"; // https://learn.microsoft.com/en-us/azure/azure-resource-manager/management/resource-name-rules private static final CharMatcher CONTAINER_VALID_CHARACTERS = CharMatcher.inRange('a', 'z').or(CharMatcher.inRange('0', '9')).or(CharMatcher.is('-')); @@ -32,7 +32,16 @@ class AzureLocation private final Location location; private final String scheme; private final String account; + private final String endpoint; + /** + * Creates a new location based on the endpoint, storage account, container and blob path parsed from the location. + *

+ * Locations follow the conventions used by + * ABFS URI + * that follows the following convention + *

{@code abfs://@.dfs./}
+ */ public AzureLocation(Location location) { this.location = requireNonNull(location, "location is null"); @@ -67,8 +76,12 @@ public AzureLocation(Location location) this.location); this.account = host.substring(0, accountSplit); - // host must end with ".dfs.core.windows.net" - checkArgument(host.substring(accountSplit).equals(".dfs.core.windows.net"), INVALID_LOCATION_MESSAGE, location); + // host must contain ".dfs." before endpoint + checkArgument(host.substring(accountSplit).startsWith(".dfs."), INVALID_LOCATION_MESSAGE, location); + + // endpoint is the part after ".dfs." + this.endpoint = host.substring(accountSplit + ".dfs.".length()); + checkArgument(!endpoint.isEmpty(), INVALID_LOCATION_MESSAGE, location); // storage account is interpolated into URL host name, so perform extra checks checkArgument(STORAGE_ACCOUNT_VALID_CHARACTERS.matchesAllOf(account), @@ -76,19 +89,6 @@ public AzureLocation(Location location) location); } - /** - * Creates a new {@link AzureLocation} based on the storage account, container and blob path parsed from the location. - *

- * Locations follow the conventions used by - * ABFS URI - * that follows the following convention - *

{@code abfs://@.dfs.core.windows.net/}
- */ - public static AzureLocation from(String location) - { - return new AzureLocation(Location.of(location)); - } - public Location location() { return location; @@ -104,6 +104,11 @@ public String account() return account; } + public String endpoint() + { + return endpoint; + } + public String path() { return location.path(); @@ -126,9 +131,10 @@ public String toString() public Location baseLocation() { - return Location.of("%s://%s%s.dfs.core.windows.net/".formatted( + return Location.of("%s://%s%s.dfs.%s/".formatted( scheme, container().map(container -> container + "@").orElse(""), - account())); + account(), + endpoint)); } } diff --git a/lib/trino-filesystem-azure/src/test/java/io/trino/filesystem/azure/TestAzureFileSystemConfig.java b/lib/trino-filesystem-azure/src/test/java/io/trino/filesystem/azure/TestAzureFileSystemConfig.java index 0d514715ea4e..c45b09e75c07 100644 --- a/lib/trino-filesystem-azure/src/test/java/io/trino/filesystem/azure/TestAzureFileSystemConfig.java +++ b/lib/trino-filesystem-azure/src/test/java/io/trino/filesystem/azure/TestAzureFileSystemConfig.java @@ -32,6 +32,7 @@ void testDefaults() { assertRecordedDefaults(recordDefaults(AzureFileSystemConfig.class) .setAuthType(AuthType.DEFAULT) + .setEndpoint("core.windows.net") .setReadBlockSize(DataSize.of(4, Unit.MEGABYTE)) .setWriteBlockSize(DataSize.of(4, Unit.MEGABYTE)) .setMaxWriteConcurrency(8) @@ -43,6 +44,7 @@ public void testExplicitPropertyMappings() { Map properties = ImmutableMap.builder() .put("azure.auth-type", "oauth") + .put("azure.endpoint", "core.usgovcloudapi.net") .put("azure.read-block-size", "3MB") .put("azure.write-block-size", "5MB") .put("azure.max-write-concurrency", "7") @@ -51,6 +53,7 @@ public void testExplicitPropertyMappings() AzureFileSystemConfig expected = new AzureFileSystemConfig() .setAuthType(AuthType.OAUTH) + .setEndpoint("core.usgovcloudapi.net") .setReadBlockSize(DataSize.of(3, Unit.MEGABYTE)) .setWriteBlockSize(DataSize.of(5, Unit.MEGABYTE)) .setMaxWriteConcurrency(7) diff --git a/lib/trino-filesystem-azure/src/test/java/io/trino/filesystem/azure/TestAzureLocation.java b/lib/trino-filesystem-azure/src/test/java/io/trino/filesystem/azure/TestAzureLocation.java index d725fa339cb5..6390d8183c1a 100644 --- a/lib/trino-filesystem-azure/src/test/java/io/trino/filesystem/azure/TestAzureLocation.java +++ b/lib/trino-filesystem-azure/src/test/java/io/trino/filesystem/azure/TestAzureLocation.java @@ -26,18 +26,23 @@ class TestAzureLocation @Test void test() { - assertValid("abfs://container@account.dfs.core.windows.net/some/path/file", "account", "container", "some/path/file"); - assertValid("abfss://container@account.dfs.core.windows.net/some/path/file", "account", "container", "some/path/file", "abfss"); + assertValid("abfs://container@account.dfs.core.windows.net/some/path/file", "account", "container", "some/path/file", "abfs", "core.windows.net"); + assertValid("abfss://container@account.dfs.core.windows.net/some/path/file", "account", "container", "some/path/file", "abfss", "core.windows.net"); - assertValid("abfs://container-stuff@account.dfs.core.windows.net/some/path/file", "account", "container-stuff", "some/path/file"); - assertValid("abfs://container2@account.dfs.core.windows.net/some/path/file", "account", "container2", "some/path/file"); - assertValid("abfs://account.dfs.core.windows.net/some/path/file", "account", null, "some/path/file"); + assertValid("abfs://container-stuff@account.dfs.core.windows.net/some/path/file", "account", "container-stuff", "some/path/file", "abfs", "core.windows.net"); + assertValid("abfs://container2@account.dfs.core.windows.net/some/path/file", "account", "container2", "some/path/file", "abfs", "core.windows.net"); + assertValid("abfs://account.dfs.core.windows.net/some/path/file", "account", null, "some/path/file", "abfs", "core.windows.net"); - assertValid("abfs://container@account.dfs.core.windows.net/file", "account", "container", "file"); - assertValid("abfs://container@account0.dfs.core.windows.net///f///i///l///e///", "account0", "container", "//f///i///l///e///"); + assertValid("abfs://container@account.dfs.core.windows.net/file", "account", "container", "file", "abfs", "core.windows.net"); + assertValid("abfs://container@account0.dfs.core.windows.net///f///i///l///e///", "account0", "container", "//f///i///l///e///", "abfs", "core.windows.net"); + + // other endpoints are allowed + assertValid("abfs://container@account.dfs.core.usgovcloudapi.net/some/path/file", "account", "container", "some/path/file", "abfs", "core.usgovcloudapi.net"); + assertValid("abfss://container@account.dfs.core.usgovcloudapi.net/some/path/file", "account", "container", "some/path/file", "abfss", "core.usgovcloudapi.net"); // only abfs and abfss schemes allowed assertInvalid("https://container@account.dfs.core.windows.net/some/path/file"); + // host must have at least to labels assertInvalid("abfs://container@account/some/path/file"); assertInvalid("abfs://container@/some/path/file"); @@ -54,32 +59,29 @@ void test() assertInvalid("abfs://container-@account.dfs.core.windows.net/some/path/file"); assertInvalid("abfs://con---tainer@account.dfs.core.windows.net/some/path/file"); assertInvalid("abfs://con--tainer@account.dfs.core.windows.net/some/path/file"); + // account is only a-z and 0-9 assertInvalid("abfs://container@ac-count.dfs.core.windows.net/some/path/file"); assertInvalid("abfs://container@ac_count.dfs.core.windows.net/some/path/file"); assertInvalid("abfs://container@ac$count.dfs.core.windows.net/some/path/file"); - // host must end with .dfs.core.windows.net + + // host must contain .dfs. after account assertInvalid("abfs://container@account.example.com/some/path/file"); - // host must be just account.dfs.core.windows.net assertInvalid("abfs://container@account.fake.dfs.core.windows.net/some/path/file"); } - private static void assertValid(String uri, String expectedAccount, String expectedContainer, String expectedPath, String expectedScheme) + private static void assertValid(String uri, String expectedAccount, String expectedContainer, String expectedPath, String expectedScheme, String expectedEndpoint) { Location location = Location.of(uri); AzureLocation azureLocation = new AzureLocation(location); assertThat(azureLocation.location()).isEqualTo(location); assertThat(azureLocation.account()).isEqualTo(expectedAccount); + assertThat(azureLocation.endpoint()).isEqualTo(expectedEndpoint); assertThat(azureLocation.container()).isEqualTo(Optional.ofNullable(expectedContainer)); assertThat(azureLocation.path()).contains(expectedPath); assertThat(azureLocation.baseLocation().scheme()).isEqualTo(Optional.of(expectedScheme)); } - private static void assertValid(String uri, String expectedAccount, String expectedContainer, String expectedPath) - { - assertValid(uri, expectedAccount, expectedContainer, expectedPath, "abfs"); - } - private static void assertInvalid(String uri) { Location location = Location.of(uri);